/* This file is part of the KDE libraries Copyright (C) 1999 Lars Knoll (knoll@kde.org) Copyright (C) 2003 Dirk Mueller (mueller@kde.org) Copyright (C) 2003 Apple Computer, Inc. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ //---------------------------------------------------------------------------- // // KDE HTML Widget -- decoder for input stream #undef DECODE_DEBUG //#define DECODE_DEBUG #include #include "decoder.h" #include "guess_ja.h" using namespace khtml; #include "htmlhashes.h" #include #include #include #include #include #include #include Decoder::Decoder() { // latin1 m_codec = TQTextCodec::codecForMib(4); m_decoder = m_codec->makeDecoder(); enc = 0; m_type = DefaultEncoding; body = false; beginning = true; visualRTL = false; m_autoDetectLanguage = SemiautomaticDetection; kc = NULL; } Decoder::~Decoder() { delete m_decoder; if (kc) delete kc; } void Decoder::setEncoding(const char *_encoding, EncodingType type) { #ifdef DECODE_DEBUG kdDebug(6005) << "setEncoding " << _encoding << " " << type << endl; #endif enc = _encoding; #ifdef DECODE_DEBUG kdDebug(6005) << "old encoding is:" << m_codec->name() << endl; #endif enc = enc.lower(); #ifdef DECODE_DEBUG kdDebug(6005) << "requesting:" << enc << endl; #endif if(enc.isNull() || enc.isEmpty()) return; #ifdef APPLE_CHANGES TQTextCodec *codec = (type == EncodingFromMetaTag || type == EncodingFromXMLHeader) ? TQTextCodec::codecForNameEightBitOnly(enc) : TQTextCodec::codecForName(enc); if (codec) { enc = codec->name(); visualRTL = codec->usesVisualOrdering(); } #else if(enc == "visual") // hebrew visually ordered enc = "iso8859-8"; bool b; TQTextCodec *codec = KGlobal::charsets()->codecForName(enc, b); if (!b) codec = 0; if (type == EncodingFromMetaTag || type == EncodingFromXMLHeader) { //Sometimes the codec specified is absurd, i.e. UTF-16 despite //us decoding a meta tag as ASCII. In that case, ignore it. if (codec && (codec->mibEnum() == 1000)) //UTF16 or similar. codec = 0; } if (codec && codec->mibEnum() == 11) { //We do NOT want to use Qt's TQHebrewCodec, since it tries to reorder itself. codec = TQTextCodec::codecForName("iso8859-8-i"); // visually ordered unless one of the following if( !(enc == "iso-8859-8-i" || enc == "iso_8859-8-i" || enc == "csiso88598i" || enc == "logical") ) visualRTL = true; } #endif if( codec ) { // in case the codec didn't exist, we keep the old one (fixes some sites specifying invalid codecs) m_codec = codec; m_type = type; delete m_decoder; m_decoder = m_codec->makeDecoder(); } #ifdef DECODE_DEBUG kdDebug(6005) << "Decoder::encoding used is" << m_codec->name() << endl; #endif } const char *Decoder::encoding() const { return enc; } // Other browsers allow comments in the head section, so we need to also. // It's important not to look for tags inside the comments. static void skipComment(const char *&ptr, const char *pEnd) { const char *p = ptr; // Allow ; other browsers do. if (*p == '>') { p++; } else { while (p != pEnd) { if (*p == '-') { // This is the real end of comment, "-->". if (p[1] == '-' && p[2] == '>') { p += 3; break; } // This is the incorrect end of comment that other browsers allow, "--!>". if (p[1] == '-' && p[2] == '!' && p[3] == '>') { p += 4; break; } } p++; } } ptr = p; } // Returns the position of the encoding string. static int findXMLEncoding(const TQCString &str, int &encodingLength) { int len = str.length(); int pos = str.find("encoding"); if (pos == -1) return -1; pos += 8; // Skip spaces and stray control characters. while (pos < len && str[pos] <= ' ') ++pos; //Bail out if nothing after if (pos >= len) return -1; // Skip equals sign. if (str[pos] != '=') return -1; ++pos; // Skip spaces and stray control characters. while (pos < len && str[pos] <= ' ') ++pos; //Bail out if nothing after if (pos >= len) return -1; // Skip quotation mark. char quoteMark = str[pos]; if (quoteMark != '"' && quoteMark != '\'') return -1; ++pos; // Find the trailing quotation mark. int end = pos; while (end < len && str[end] != quoteMark) ++end; if (end >= len) return -1; encodingLength = end - pos; return pos; } TQString Decoder::decode(const char *data, int len) { // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding. int bufferLength = buffer.length(); const int maximumBOMLength = 10; if (beginning && bufferLength + len >= maximumBOMLength) { // If the user has chosen utf16 we still need to auto-detect the endianness if ((m_type != UserChosenEncoding) || (m_codec->mibEnum() == 1000)) { // Extract the first three bytes. // Handle the case where some of bytes are already in the buffer. const uchar *udata = (const uchar *)data; uchar c1 = bufferLength >= 1 ? (uchar)buffer[0] : *udata++; uchar c2 = bufferLength >= 2 ? (uchar)buffer[1] : *udata++; uchar c3 = bufferLength >= 3 ? (uchar)buffer[2] : *udata++; // Check for the BOM const char *autoDetectedEncoding; if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) { autoDetectedEncoding = "ISO-10646-UCS-2"; } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) { autoDetectedEncoding = "UTF-8"; } else if (c1 == 0x00 || c2 == 0x00) { uchar c4 = bufferLength >= 4 ? (uchar)buffer[3] : *udata++; uchar c5 = bufferLength >= 5 ? (uchar)buffer[4] : *udata++; uchar c6 = bufferLength >= 6 ? (uchar)buffer[5] : *udata++; uchar c7 = bufferLength >= 7 ? (uchar)buffer[6] : *udata++; uchar c8 = bufferLength >= 8 ? (uchar)buffer[7] : *udata++; uchar c9 = bufferLength >= 9 ? (uchar)buffer[8] : *udata++; uchar c10 = bufferLength >= 10 ? (uchar)buffer[9] : *udata++; int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0); int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0); if ((nul_count_even == 0 && nul_count_odd == 5) || (nul_count_even == 5 && nul_count_odd == 0)) autoDetectedEncoding = "ISO-10646-UCS-2"; else autoDetectedEncoding = 0; } else { autoDetectedEncoding = 0; } // If we found a BOM, use the encoding it implies. if (autoDetectedEncoding != 0) { m_type = AutoDetectedEncoding; m_codec = TQTextCodec::codecForName(autoDetectedEncoding); assert(m_codec); enc = m_codec->name(); delete m_decoder; m_decoder = m_codec->makeDecoder(); if (m_codec->mibEnum() == 1000 && c2 == 0x00) { // utf16LE, we need to put the decoder in LE mode char reverseUtf16[3] = {'\xFF', '\xFE', '\x00'}; m_decoder->toUnicode(reverseUtf16, 2); } } } beginning = false; } // this is not completely efficient, since the function might go // through the html head several times... bool lookForMetaTag = m_type == DefaultEncoding && !body; if (lookForMetaTag) { #ifdef DECODE_DEBUG kdDebug(6005) << "looking for charset definition" << endl; #endif { // extra level of braces to keep indenting matching original for better diff'ing #ifdef APPLE_CHANGES buffer.append(data, len); #else if(m_codec->mibEnum() != 1000) { // utf16 // replace '\0' by spaces, for buggy pages char *d = const_cast(data); int i = len - 1; while(i >= 0) { if(d[i] == 0) d[i] = ' '; i--; } } buffer += TQCString(data, len+1); #endif // we still don't have an encoding, and are in the head // the following tags are allowed in : // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE int invalid = 0; // invalid head tag count #ifdef APPLE_CHANGES const char *ptr = buffer.latin1(); const char *pEnd = ptr + buffer.length(); #else const char *ptr = buffer.data(); const char *pEnd = ptr + buffer.length(); #endif while(ptr != pEnd) { if(*ptr == '<') { bool end = false; ptr++; // Handle comments. if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') { ptr += 3; skipComment(ptr, pEnd); continue; } // Handle XML header, which can have encoding in it. if (ptr[0] == '?' && ptr[1] == 'x' && ptr[2] == 'm' && ptr[3] == 'l') { const char *end = ptr; while (*end != '>' && *end != '\0') end++; if (*end == '\0') break; TQCString str(ptr, end - ptr + 1); //+1 as it must include the \0 terminator int len; int pos = findXMLEncoding(str, len); if (pos != -1) { setEncoding(str.mid(pos, len), EncodingFromXMLHeader); if (m_type == EncodingFromXMLHeader) goto found; } } if(*ptr == '/') ptr++, end=true; char tmp[20]; int len = 0; while ( ((*ptr >= 'a') && (*ptr <= 'z') || (*ptr >= 'A') && (*ptr <= 'Z') || (*ptr >= '0') && (*ptr <= '9')) && len < 19 ) { tmp[len] = tolower( *ptr ); ptr++; len++; } tmp[len] = 0; int id = khtml::getTagID(tmp, len); if(end) id += ID_CLOSE_TAG; switch( id ) { case ID_META: { // found a meta tag... //ptr += 5; const char * end = ptr; while(*end != '>' && *end != '\0') end++; if ( *end == '\0' ) break; TQCString str( ptr, (end-ptr)+1); str = str.lower(); int pos = 0; //if( (pos = str.find("http-equiv", pos)) == -1) break; //if( (pos = str.find("content-type", pos)) == -1) break; while( pos < ( int ) str.length() ) { if( (pos = str.find("charset", pos)) == -1) break; pos += 7; // skip whitespace.. while( pos < (int)str.length() && str[pos] <= ' ' ) pos++; if ( pos == ( int )str.length()) break; if ( str[pos++] != '=' ) continue; while ( pos < ( int )str.length() && ( str[pos] <= ' ' ) || str[pos] == '=' || str[pos] == '"' || str[pos] == '\'') pos++; // end ? if ( pos == ( int )str.length() ) break; uint endpos = pos; while( endpos < str.length() && (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\'' && str[endpos] != ';' && str[endpos] != '>') ) endpos++; enc = str.mid(pos, endpos-pos); #ifdef DECODE_DEBUG kdDebug( 6005 ) << "Decoder: found charset: " << enc.data() << endl; #endif setEncoding(enc, EncodingFromMetaTag); if( m_type == EncodingFromMetaTag ) goto found; if ( endpos >= str.length() || str[endpos] == '/' || str[endpos] == '>' ) break; pos = endpos + 1; } } case ID_SCRIPT: case (ID_SCRIPT+ID_CLOSE_TAG): case ID_NOSCRIPT: case (ID_NOSCRIPT+ID_CLOSE_TAG): case ID_STYLE: case (ID_STYLE+ID_CLOSE_TAG): case ID_LINK: case (ID_LINK+ID_CLOSE_TAG): case ID_OBJECT: case (ID_OBJECT+ID_CLOSE_TAG): case ID_TITLE: case (ID_TITLE+ID_CLOSE_TAG): case ID_BASE: case (ID_BASE+ID_CLOSE_TAG): case ID_HTML: case ID_HEAD: case 0: case (0 + ID_CLOSE_TAG ): break; case ID_BODY: case (ID_HEAD+ID_CLOSE_TAG): body = true; #ifdef DECODE_DEBUG kdDebug( 6005 ) << "Decoder: no charset found. Id=" << id << endl; #endif goto found; default: // Invalid tag in head. Let's be a little tolerant invalid++; if (invalid > 2) { body = true; #ifdef DECODE_DEBUG kdDebug( 6005 ) << "Decoder: no charset found. Id=" << id << endl; #endif goto found; } } } else ptr++; } if (invalid > 0) { body = true; goto found; } return TQString::null; } } found: if (m_type == DefaultEncoding) { #ifdef DECODE_DEBUG kdDebug( 6005 ) << "Decoder: use auto-detect (" << strlen(data) << ")" << endl; #endif switch ( m_autoDetectLanguage) { case Decoder::Arabic: enc = automaticDetectionForArabic( (const unsigned char*) data, len ); break; case Decoder::Baltic: enc = automaticDetectionForBaltic( (const unsigned char*) data, len ); break; case Decoder::CentralEuropean: enc = automaticDetectionForCentralEuropean( (const unsigned char*) data, len ); break; case Decoder::Russian: case Decoder::Ukrainian: enc = automaticDetectionForCyrillic( (const unsigned char*) data, len, m_autoDetectLanguage ); break; case Decoder::Greek: enc = automaticDetectionForGreek( (const unsigned char*) data, len ); break; case Decoder::Hebrew: enc = automaticDetectionForHebrew( (const unsigned char*) data, len ); break; case Decoder::Japanese: enc = automaticDetectionForJapanese( (const unsigned char*) data, len ); break; case Decoder::Turkish: enc = automaticDetectionForTurkish( (const unsigned char*) data, len ); break; case Decoder::WesternEuropean: enc = automaticDetectionForWesternEuropean( (const unsigned char*) data, len ); break; case Decoder::SemiautomaticDetection: case Decoder::Chinese: case Decoder::Korean: case Decoder::Thai: case Decoder::Unicode: // huh. somethings broken in this code ### FIXME enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback. break; } #ifdef DECODE_DEBUG kdDebug( 6005 ) << "Decoder: auto detect encoding is " << enc.data() << endl; #endif if ( !enc.isEmpty() ) setEncoding( enc.data(), AutoDetectedEncoding); } // if we still haven't found an encoding latin1 will be used... // this is according to HTML4.0 specs if (!m_codec) { if(enc.isEmpty()) enc = "iso8859-1"; m_codec = TQTextCodec::codecForName(enc); // be sure not to crash if(!m_codec) { m_codec = TQTextCodec::codecForMib(4); enc = "iso8859-1"; } delete m_decoder; m_decoder = m_codec->makeDecoder(); } TQString out; if(!buffer.isEmpty() && enc != "ISO-10646-UCS-2") { out = m_decoder->toUnicode(buffer, buffer.length()); buffer = ""; } else { if(m_codec->mibEnum() != 1000) // utf16 { // ### hack for a bug in TQTextCodec. It cut's the input stream // in case there are \0 in it. ZDNET has them inside... :-( char *d = const_cast(data); int i = len - 1; while(i >= 0) { if(*(d+i) == 0) *(d+i) = ' '; i--; } } out = m_decoder->toUnicode(data, len); } return out; } TQString Decoder::flush() const { return m_decoder->toUnicode(buffer, buffer.length()); } TQCString Decoder::automaticDetectionForArabic( const unsigned char* ptr, int size ) { for ( int i = 0; i < size; ++i ) { if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3 || ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA ) || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0 || ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) { return "cp1256"; } } return "iso-8859-6"; } TQCString Decoder::automaticDetectionForBaltic( const unsigned char* ptr, int size ) { for ( int i = 0; i < size; ++i ) { if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) ) return "cp1257"; if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 ) return "iso-8859-13"; } return "iso-8859-13"; } TQCString Decoder::automaticDetectionForCentralEuropean(const unsigned char* ptr, int size ) { TQCString charset = TQCString(); for ( int i = 0; i < size; ++i ) { if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) { if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 ) return "ibm852"; if ( i + 1 > size ) return "cp1250"; else { // maybe ibm852 ? charset = "cp1250"; continue; } } if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) { if ( i + 1 > size ) return "iso-8859-2"; else { // maybe ibm852 ? if ( charset.isNull() ) charset = "iso-8859-2"; continue; } } } if ( charset.isNull() ) charset = "iso-8859-3"; return charset.data(); } TQCString Decoder::automaticDetectionForCyrillic( const unsigned char* ptr, int size, AutoDetectLanguage _language ) { int koi_st=0; int cp1251_st=0; // int koi_na=0; // int cp1251_na=0; int koi_o_capital=0; int koi_o=0; int cp1251_o_capital=0; int cp1251_o=0; int koi_a_capital=0; int koi_a=0; int cp1251_a_capital=0; int cp1251_a=0; int koi_i_capital=0; int koi_i=0; int cp1251_i_capital=0; int cp1251_i=0; int cp1251_small_range=0; int koi_small_range=0; int ibm866_small_range=0; int i; for (i=1; (i0xdf) { ++cp1251_small_range; if (ptr[i]==0xee)//small o ++cp1251_o; else if (ptr[i]==0xe0)//small a ++cp1251_a; else if (ptr[i]==0xe8)//small i ++cp1251_i; else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)//small st ++cp1251_st; else if (ptr[i]==0xef) ++koi_o_capital; else if (ptr[i]==0xe1) ++koi_a_capital; else if (ptr[i]==0xe9) ++koi_i_capital; } else if (ptr[i]>0xbf) { ++koi_small_range; if (ptr[i]==0xcf)//small o ++koi_o; else if (ptr[i]==0xc1)//small a ++koi_a; else if (ptr[i]==0xc9)//small i ++koi_i; else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)//small st ++koi_st; else if (ptr[i]==0xce) ++cp1251_o_capital; else if (ptr[i]==0xc0) ++cp1251_a_capital; else if (ptr[i]==0xc8) ++cp1251_i_capital; } else if (ptr[i]>0x9f && ptr[i]<0xaf) //first 16 letterz is 60% ++ibm866_small_range; } if (ibm866_small_range>cp1251_small_range+koi_small_range) return "ibm866"; //hehe this is a rare case :) TQCString koi_string = "koi8-u"; TQCString cp1251_string = "cp1251"; if (cp1251_st==0 && koi_st>1) return koi_string; if (koi_st==0 && cp1251_st>1) return cp1251_string; if (cp1251_st>0 && koi_st>0) { if (cp1251_st/koi_st>2) return cp1251_string; else if (koi_st/cp1251_st>2) return koi_string; } if (cp1251_a>koi_a && cp1251_o>koi_o && cp1251_i>koi_i) return cp1251_string; if (koi_a>cp1251_a && koi_o>cp1251_o && koi_i>cp1251_i) return koi_string; if (cp1251_a_capital>koi_a_capital && cp1251_o_capital>koi_o_capital && cp1251_i_capital>koi_i_capital) return cp1251_string; if (koi_a_capital>cp1251_a_capital && koi_o_capital>cp1251_o_capital && koi_i_capital>cp1251_i_capital) return koi_string; //fallback... if (cp1251_small_range>koi_small_range) return cp1251_string; else return koi_string; } TQCString Decoder::automaticDetectionForGreek( const unsigned char* ptr, int size ) { for ( int i = 0; i < size; ++i ) { if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4 || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) { return "cp1253"; } } return "iso-8859-7"; } TQCString Decoder::automaticDetectionForHebrew( const unsigned char* ptr, int size ) { for ( int i = 0; i < size; ++i ) { if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 ) || ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) { return "cp1255"; } if ( ptr[ i ] == 0xDF ) return "iso-8859-8-i"; } return "iso-8859-8-i"; } TQCString Decoder::automaticDetectionForJapanese( const unsigned char* ptr, int size ) { if (!kc) kc = new JapaneseCode(); switch ( kc->guess_jp( (const char*)ptr, size ) ) { case JapaneseCode::JIS: return "jis7"; case JapaneseCode::EUC: return "eucjp"; case JapaneseCode::SJIS: return "sjis"; case JapaneseCode::UTF8: return "utf8"; default: break; } return ""; } TQCString Decoder::automaticDetectionForTurkish( const unsigned char* ptr, int size ) { for ( int i = 0; i < size; ++i ) { if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) { return "cp1254"; } } return "iso-8859-9"; } TQCString Decoder::automaticDetectionForWesternEuropean( const unsigned char* ptr, int size ) { for ( int i = 0; i < size; ++i ) { if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) return "cp1252"; } return "iso-8859-1"; //"iso-8859-15"; Which better at default ? } // ----------------------------------------------------------------------------- #undef DECODE_DEBUG