|
|
|
/*
|
|
|
|
This file is part of the KDE libraries
|
|
|
|
|
|
|
|
Copyright (C) 1999 Lars Knoll (knoll@kde.org)
|
|
|
|
Copyright (C) 2003 Dirk Mueller (mueller@kde.org)
|
|
|
|
Copyright (C) 2003 Apple Computer, Inc.
|
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
|
|
modify it under the terms of the GNU Library General Public
|
|
|
|
License as published by the Free Software Foundation; either
|
|
|
|
version 2 of the License, or (at your option) any later version.
|
|
|
|
|
|
|
|
This library is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
Library General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU Library General Public License
|
|
|
|
along with this library; see the file COPYING.LIB. If not, write to
|
|
|
|
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
|
|
|
Boston, MA 02110-1301, USA.
|
|
|
|
*/
|
|
|
|
//----------------------------------------------------------------------------
|
|
|
|
//
|
|
|
|
// KDE HTML Widget -- decoder for input stream
|
|
|
|
|
|
|
|
#undef DECODE_DEBUG
|
|
|
|
//#define DECODE_DEBUG
|
|
|
|
|
|
|
|
#include <assert.h>
|
|
|
|
|
|
|
|
#include "decoder.h"
|
|
|
|
#include "guess_ja.h"
|
|
|
|
|
|
|
|
using namespace khtml;
|
|
|
|
|
|
|
|
#include "htmlhashes.h"
|
|
|
|
|
|
|
|
#include <tqregexp.h>
|
|
|
|
#include <tqtextcodec.h>
|
|
|
|
|
|
|
|
#include <kglobal.h>
|
|
|
|
#include <kcharsets.h>
|
|
|
|
|
|
|
|
#include <ctype.h>
|
|
|
|
#include <kdebug.h>
|
|
|
|
#include <klocale.h>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Sets up a decoder in its initial state: Latin-1 codec, no explicit
// encoding name, default encoding type and semi-automatic language detection.
Decoder::Decoder()
{
    // Plain state first; none of these assignments depend on each other.
    beginning = true;
    body = false;
    visualRTL = false;
    m_type = DefaultEncoding;
    m_autoDetectLanguage = SemiautomaticDetection;
    enc = 0;
    kc = 0; // the Japanese guesser is only allocated when first needed

    // latin1 (MIB enum 4) is the start-up codec; the decoder is derived from it.
    m_codec = TQTextCodec::codecForMib(4);
    m_decoder = m_codec->makeDecoder();
}
|
|
|
|
|
|
|
|
// Releases the text decoder and the Japanese code guesser (if one was created).
Decoder::~Decoder()
{
    // Deleting a null pointer is a no-op, so kc needs no explicit guard.
    delete kc;
    delete m_decoder;
}
|
|
|
|
|
|
|
|
void Decoder::setEncoding(const char *_encoding, EncodingType type)
|
|
|
|
{
|
|
|
|
#ifdef DECODE_DEBUG
|
|
|
|
kdDebug(6005) << "setEncoding " << _encoding << " " << type << endl;
|
|
|
|
#endif
|
|
|
|
enc = _encoding;
|
|
|
|
#ifdef DECODE_DEBUG
|
|
|
|
kdDebug(6005) << "old encoding is:" << m_codec->name() << endl;
|
|
|
|
#endif
|
|
|
|
enc = enc.lower();
|
|
|
|
#ifdef DECODE_DEBUG
|
|
|
|
kdDebug(6005) << "requesting:" << enc << endl;
|
|
|
|
#endif
|
|
|
|
if(enc.isNull() || enc.isEmpty())
|
|
|
|
return;
|
|
|
|
|
|
|
|
#ifdef APPLE_CHANGES
|
|
|
|
TQTextCodec *codec = (type == EncodingFromMetaTag || type == EncodingFromXMLHeader)
|
|
|
|
? TQTextCodec::codecForNameEightBitOnly(enc)
|
|
|
|
: TQTextCodec::codecForName(enc);
|
|
|
|
if (codec) {
|
|
|
|
enc = codec->name();
|
|
|
|
visualRTL = codec->usesVisualOrdering();
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
if(enc == "visual") // hebrew visually ordered
|
|
|
|
enc = "iso8859-8";
|
|
|
|
bool b;
|
|
|
|
TQTextCodec *codec = KGlobal::charsets()->codecForName(enc, b);
|
|
|
|
if (!b)
|
|
|
|
codec = 0;
|
|
|
|
|
|
|
|
if (type == EncodingFromMetaTag || type == EncodingFromXMLHeader) {
|
|
|
|
//Sometimes the codec specified is absurd, i.e. UTF-16 despite
|
|
|
|
//us decoding a meta tag as ASCII. In that case, ignore it.
|
|
|
|
if (codec &&
|
|
|
|
(codec->mibEnum() == 1000)) //UTF16 or similar.
|
|
|
|
codec = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (codec && codec->mibEnum() == 11) {
|
|
|
|
//We do NOT want to use Qt's TQHebrewCodec, since it tries to reorder itself.
|
|
|
|
codec = TQTextCodec::codecForName("iso8859-8-i");
|
|
|
|
|
|
|
|
// visually ordered unless one of the following
|
|
|
|
if( !(enc == "iso-8859-8-i" || enc == "iso_8859-8-i"
|
|
|
|
|| enc == "csiso88598i" || enc == "logical") )
|
|
|
|
visualRTL = true;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
if( codec ) { // in case the codec didn't exist, we keep the old one (fixes some sites specifying invalid codecs)
|
|
|
|
m_codec = codec;
|
|
|
|
m_type = type;
|
|
|
|
delete m_decoder;
|
|
|
|
m_decoder = m_codec->makeDecoder();
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef DECODE_DEBUG
|
|
|
|
kdDebug(6005) << "Decoder::encoding used is" << m_codec->name() << endl;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
const char *Decoder::encoding() const
|
|
|
|
{
|
|
|
|
return enc;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Other browsers allow comments in the head section, so we need to also.
// It's important not to look for tags inside the comments.
//
// Advances ptr past an HTML comment body. On entry ptr points just behind
// the opening "<!--"; on return it points just behind the comment
// terminator, or equals pEnd when no terminator lies inside [ptr, pEnd).
//
// ptr   in/out cursor into the buffer
// pEnd  one past the last byte that may be inspected
//
// Every look-ahead is checked against pEnd, so the function never reads at
// or beyond pEnd and never moves ptr past it, even if the buffer is not
// NUL-terminated at pEnd.
static void skipComment(const char *&ptr, const char *pEnd)
{
    const char *p = ptr;

    if (p != pEnd && *p == '>') {
        // Allow <!-->; other browsers do.
        ++p;
    } else {
        while (p != pEnd) {
            if (*p == '-') {
                // This is the real end of comment, "-->".
                if (pEnd - p >= 3 && p[1] == '-' && p[2] == '>') {
                    p += 3;
                    break;
                }
                // This is the incorrect end of comment that other browsers allow, "--!>".
                if (pEnd - p >= 4 && p[1] == '-' && p[2] == '!' && p[3] == '>') {
                    p += 4;
                    break;
                }
            }
            ++p;
        }
    }

    ptr = p;
}
|
|
|
|
|
|
|
|
// Returns the position of the encoding string.
|
|
|
|
static int findXMLEncoding(const TQCString &str, int &encodingLength)
|
|
|
|
{
|
|
|
|
int len = str.length();
|
|
|
|
|
|
|
|
int pos = str.find("encoding");
|
|
|
|
if (pos == -1)
|
|
|
|
return -1;
|
|
|
|
pos += 8;
|
|
|
|
|
|
|
|
// Skip spaces and stray control characters.
|
|
|
|
while (pos < len && str[pos] <= ' ')
|
|
|
|
++pos;
|
|
|
|
|
|
|
|
//Bail out if nothing after
|
|
|
|
if (pos >= len)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
// Skip equals sign.
|
|
|
|
if (str[pos] != '=')
|
|
|
|
return -1;
|
|
|
|
++pos;
|
|
|
|
|
|
|
|
// Skip spaces and stray control characters.
|
|
|
|
while (pos < len && str[pos] <= ' ')
|
|
|
|
++pos;
|
|
|
|
|
|
|
|
//Bail out if nothing after
|
|
|
|
if (pos >= len)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
// Skip quotation mark.
|
|
|
|
char quoteMark = str[pos];
|
|
|
|
if (quoteMark != '"' && quoteMark != '\'')
|
|
|
|
return -1;
|
|
|
|
++pos;
|
|
|
|
|
|
|
|
// Find the trailing quotation mark.
|
|
|
|
int end = pos;
|
|
|
|
while (end < len && str[end] != quoteMark)
|
|
|
|
++end;
|
|
|
|
|
|
|
|
if (end >= len)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
encodingLength = end - pos;
|
|
|
|
return pos;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Decodes one chunk of raw input to Unicode.
//
// The work happens in this order:
//  1. While still at the beginning of the stream, and once at least
//     maximumBOMLength bytes are available (buffered bytes plus this chunk),
//     look for a UTF-16/UTF-8 byte order mark or for BOM-less UTF-16
//     (strictly alternating NUL bytes) and switch the codec accordingly.
//  2. While the encoding is still the default and the body has not been
//     reached, append the chunk to `buffer` and scan the buffered head for an
//     XML declaration or a <meta ... charset=...>. When the scan ends without
//     a decision and without having seen an unknown tag, TQString::null is
//     returned and the data stays buffered for the next call.
//  3. If the encoding is still the default, run the language specific
//     auto-detection on the current chunk.
//  4. Decode the buffered bytes, or just this chunk when nothing is buffered
//     (or the encoding name is "ISO-10646-UCS-2").
//
// data  raw input bytes. NOTE: constness is cast away and NUL bytes may be
//       overwritten with spaces in place when the codec is not UTF-16.
// len   number of bytes in data
TQString Decoder::decode(const char *data, int len)
{
    // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
    int bufferLength = buffer.length();
    const int maximumBOMLength = 10;
    if (beginning && bufferLength + len >= maximumBOMLength) {
        // If the user has chosen utf16 we still need to auto-detect the endianness
        if ((m_type != UserChosenEncoding) || (m_codec->mibEnum() == 1000)) {
            // Extract the first three bytes.
            // Handle the case where some of bytes are already in the buffer.
            const uchar *udata = (const uchar *)data;
            uchar c1 = bufferLength >= 1 ? (uchar)buffer[0] : *udata++;
            uchar c2 = bufferLength >= 2 ? (uchar)buffer[1] : *udata++;
            uchar c3 = bufferLength >= 3 ? (uchar)buffer[2] : *udata++;

            // Check for the BOM
            const char *autoDetectedEncoding;
            if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) {
                autoDetectedEncoding = "ISO-10646-UCS-2";
            } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
                autoDetectedEncoding = "UTF-8";
            } else if (c1 == 0x00 || c2 == 0x00) {
                // No BOM, but a NUL among the first two bytes: look at ten
                // bytes and accept UTF-16 only if NULs strictly alternate.
                uchar c4 = bufferLength >= 4 ? (uchar)buffer[3] : *udata++;
                uchar c5 = bufferLength >= 5 ? (uchar)buffer[4] : *udata++;
                uchar c6 = bufferLength >= 6 ? (uchar)buffer[5] : *udata++;
                uchar c7 = bufferLength >= 7 ? (uchar)buffer[6] : *udata++;
                uchar c8 = bufferLength >= 8 ? (uchar)buffer[7] : *udata++;
                uchar c9 = bufferLength >= 9 ? (uchar)buffer[8] : *udata++;
                uchar c10 = bufferLength >= 10 ? (uchar)buffer[9] : *udata++;
                // Number of non-NUL bytes at even (2nd, 4th, ...) and odd positions.
                int nonNulEven = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);
                int nonNulOdd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);
                if ((nonNulEven == 0 && nonNulOdd == 5) ||
                    (nonNulEven == 5 && nonNulOdd == 0))
                    autoDetectedEncoding = "ISO-10646-UCS-2";
                else
                    autoDetectedEncoding = 0;
            } else {
                autoDetectedEncoding = 0;
            }

            // If we found a BOM, use the encoding it implies.
            if (autoDetectedEncoding != 0) {
                m_type = AutoDetectedEncoding;
                m_codec = TQTextCodec::codecForName(autoDetectedEncoding);
                assert(m_codec);
                enc = m_codec->name();
                delete m_decoder;
                m_decoder = m_codec->makeDecoder();
                if (m_codec->mibEnum() == 1000 && c2 == 0x00)
                {
                    // utf16LE, we need to put the decoder in LE mode
                    char reverseUtf16[3] = {'\xFF', '\xFE', '\x00'};
                    m_decoder->toUnicode(reverseUtf16, 2);
                }
            }
        }
        beginning = false;
    }

    // this is not completely efficient, since the function might go
    // through the html head several times...

    bool lookForMetaTag = m_type == DefaultEncoding && !body;

    if (lookForMetaTag) {
#ifdef DECODE_DEBUG
        kdDebug(6005) << "looking for charset definition" << endl;
#endif
        { // extra level of braces to keep indenting matching original for better diff'ing
#ifdef APPLE_CHANGES
            buffer.append(data, len);
#else
            if (m_codec->mibEnum() != 1000) { // utf16
                // replace '\0' by spaces, for buggy pages
                char *d = const_cast<char *>(data);
                int i = len - 1;
                while (i >= 0) {
                    if (d[i] == 0) d[i] = ' ';
                    i--;
                }
            }
            buffer += TQCString(data, len+1);
#endif
            // we still don't have an encoding, and are in the head
            // the following tags are allowed in <head>:
            // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
            int invalid = 0; // invalid head tag count
#ifdef APPLE_CHANGES
            const char *ptr = buffer.latin1();
            const char *pEnd = ptr + buffer.length();
#else
            const char *ptr = buffer.data();
            const char *pEnd = ptr + buffer.length();
#endif
            while (ptr != pEnd)
            {
                if (*ptr == '<') {
                    bool isClosingTag = false;
                    ptr++;

                    // Handle comments.
                    if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') {
                        ptr += 3;
                        skipComment(ptr, pEnd);
                        continue;
                    }

                    // Handle XML header, which can have encoding in it.
                    if (ptr[0] == '?' && ptr[1] == 'x' && ptr[2] == 'm' && ptr[3] == 'l') {
                        const char *headerEnd = ptr;
                        while (*headerEnd != '>' && *headerEnd != '\0') headerEnd++;
                        // Header not complete yet: stop scanning this round.
                        if (*headerEnd == '\0')
                            break;
                        TQCString str(ptr, headerEnd - ptr + 1); //+1 as it must include the \0 terminator
                        int encodingLength;
                        int pos = findXMLEncoding(str, encodingLength);
                        if (pos != -1) {
                            setEncoding(str.mid(pos, encodingLength), EncodingFromXMLHeader);
                            // setEncoding() only changes m_type when it accepted the codec.
                            if (m_type == EncodingFromXMLHeader)
                                goto found;
                        }
                    }

                    if (*ptr == '/') ptr++, isClosingTag = true;

                    // Collect the lower-cased tag name (at most 19 characters).
                    char tmp[20];
                    int tagLength = 0;
                    while (
                        (((*ptr >= 'a') && (*ptr <= 'z')) ||
                         ((*ptr >= 'A') && (*ptr <= 'Z')) ||
                         ((*ptr >= '0') && (*ptr <= '9')))
                        && tagLength < 19 )
                    {
                        tmp[tagLength] = tolower( *ptr );
                        ptr++;
                        tagLength++;
                    }
                    tmp[tagLength] = 0;
                    int id = khtml::getTagID(tmp, tagLength);
                    if (isClosingTag) id += ID_CLOSE_TAG;

                    switch( id ) {
                    case ID_META:
                    {
                        // found a meta tag...
                        //ptr += 5;
                        const char *tagEnd = ptr;
                        while (*tagEnd != '>' && *tagEnd != '\0') tagEnd++;
                        // Tag not complete yet: leaves the switch, scanning goes on.
                        if ( *tagEnd == '\0' ) break;
                        TQCString str( ptr, (tagEnd-ptr)+1);
                        str = str.lower();
                        int pos = 0;
                        //if( (pos = str.find("http-equiv", pos)) == -1) break;
                        //if( (pos = str.find("content-type", pos)) == -1) break;
                        while ( pos < ( int ) str.length() ) {
                            if ( (pos = str.find("charset", pos)) == -1) break;
                            pos += 7;
                            // skip whitespace..
                            while ( pos < (int)str.length() && str[pos] <= ' ' ) pos++;
                            if ( pos == ( int )str.length()) break;
                            if ( str[pos++] != '=' ) continue;
                            // Skip separators in front of the value. The length
                            // check has to guard every alternative, hence the
                            // parentheses around the whole || group.
                            while ( pos < ( int )str.length() &&
                                    ( str[pos] <= ' ' || str[pos] == '=' || str[pos] == '"' || str[pos] == '\'' ) )
                                pos++;

                            // end ?
                            if ( pos == ( int )str.length() ) break;
                            uint endpos = pos;
                            while ( endpos < str.length() &&
                                    (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
                                     && str[endpos] != ';' && str[endpos] != '>') )
                                endpos++;
                            enc = str.mid(pos, endpos-pos);
#ifdef DECODE_DEBUG
                            kdDebug( 6005 ) << "Decoder: found charset: " << enc.data() << endl;
#endif
                            setEncoding(enc, EncodingFromMetaTag);
                            if ( m_type == EncodingFromMetaTag ) goto found;

                            if ( endpos >= str.length() || str[endpos] == '/' || str[endpos] == '>' ) break;

                            pos = endpos + 1;
                        }
                    }
                    // fall through: a meta tag without usable charset is just
                    // another tag that is allowed in the head
                    case ID_SCRIPT:
                    case (ID_SCRIPT+ID_CLOSE_TAG):
                    case ID_NOSCRIPT:
                    case (ID_NOSCRIPT+ID_CLOSE_TAG):
                    case ID_STYLE:
                    case (ID_STYLE+ID_CLOSE_TAG):
                    case ID_LINK:
                    case (ID_LINK+ID_CLOSE_TAG):
                    case ID_OBJECT:
                    case (ID_OBJECT+ID_CLOSE_TAG):
                    case ID_TITLE:
                    case (ID_TITLE+ID_CLOSE_TAG):
                    case ID_BASE:
                    case (ID_BASE+ID_CLOSE_TAG):
                    case ID_HTML:
                    case ID_HEAD:
                    case 0:
                    case (0 + ID_CLOSE_TAG ):
                        break;
                    case ID_BODY:
                    case (ID_HEAD+ID_CLOSE_TAG):
                        body = true;
#ifdef DECODE_DEBUG
                        kdDebug( 6005 ) << "Decoder: no charset found. Id=" << id << endl;
#endif
                        goto found;
                    default:
                        // Invalid tag in head. Let's be a little tolerant
                        invalid++;
                        if (invalid > 2) {
                            body = true;
#ifdef DECODE_DEBUG
                            kdDebug( 6005 ) << "Decoder: no charset found. Id=" << id << endl;
#endif
                            goto found;
                        }
                    }
                }
                else
                    ptr++;
            }
            if (invalid > 0) {
                body = true;
                goto found;
            }
            // Nothing decided yet: keep the data buffered and wait for more.
            return TQString::null;
        }
    }

 found:
    if (m_type == DefaultEncoding)
    {
#ifdef DECODE_DEBUG
        kdDebug( 6005 ) << "Decoder: use auto-detect (" << strlen(data) << ")" << endl;
#endif

        // The detectors only look at the current chunk, not at `buffer`.
        switch ( m_autoDetectLanguage) {
        case Decoder::Arabic:
            enc = automaticDetectionForArabic( (const unsigned char*) data, len );
            break;
        case Decoder::Baltic:
            enc = automaticDetectionForBaltic( (const unsigned char*) data, len );
            break;
        case Decoder::CentralEuropean:
            enc = automaticDetectionForCentralEuropean( (const unsigned char*) data, len );
            break;
        case Decoder::Russian:
        case Decoder::Ukrainian:
            enc = automaticDetectionForCyrillic( (const unsigned char*) data, len, m_autoDetectLanguage );
            break;
        case Decoder::Greek:
            enc = automaticDetectionForGreek( (const unsigned char*) data, len );
            break;
        case Decoder::Hebrew:
            enc = automaticDetectionForHebrew( (const unsigned char*) data, len );
            break;
        case Decoder::Japanese:
            enc = automaticDetectionForJapanese( (const unsigned char*) data, len );
            break;
        case Decoder::Turkish:
            enc = automaticDetectionForTurkish( (const unsigned char*) data, len );
            break;
        case Decoder::WesternEuropean:
            enc = automaticDetectionForWesternEuropean( (const unsigned char*) data, len );
            break;
        case Decoder::SemiautomaticDetection:
        case Decoder::Chinese:
        case Decoder::Korean:
        case Decoder::Thai:
        case Decoder::Unicode:
            // huh. something's broken in this code ### FIXME
            enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback.
            break;
        }

#ifdef DECODE_DEBUG
        kdDebug( 6005 ) << "Decoder: auto detect encoding is " << enc.data() << endl;
#endif
        if ( !enc.isEmpty() )
            setEncoding( enc.data(), AutoDetectedEncoding);
    }


    // if we still haven't found an encoding latin1 will be used...
    // this is according to HTML4.0 specs
    if (!m_codec)
    {
        if (enc.isEmpty()) enc = "iso8859-1";
        m_codec = TQTextCodec::codecForName(enc);
        // be sure not to crash
        if (!m_codec) {
            m_codec = TQTextCodec::codecForMib(4);
            enc = "iso8859-1";
        }
        delete m_decoder;
        m_decoder = m_codec->makeDecoder();
    }
    TQString out;

    if (!buffer.isEmpty() && enc != "ISO-10646-UCS-2") {
        out = m_decoder->toUnicode(buffer, buffer.length());
        buffer = "";
    } else {
        if (m_codec->mibEnum() != 1000) // utf16
        {
            // ### hack for a bug in TQTextCodec. It cuts the input stream
            // in case there are \0 in it. ZDNET has them inside... :-(
            char *d = const_cast<char *>(data);
            int i = len - 1;
            while (i >= 0) {
                if (*(d+i) == 0) *(d+i) = ' ';
                i--;
            }
        }
        out = m_decoder->toUnicode(data, len);
    }

    return out;
}
|
|
|
|
|
|
|
|
// Decodes whatever is currently held in `buffer` with the active decoder
// and returns the result. The buffer itself is left unchanged.
TQString Decoder::flush() const
{
    const int pendingBytes = buffer.length();
    return m_decoder->toUnicode(buffer, pendingBytes);
}
|
|
|
|
|
|
|
|
// Chooses between "cp1256" and "iso-8859-6" for the given bytes.
//
// The first byte that falls into one of the ranges below selects "cp1256";
// if no such byte occurs in ptr[0..size) the answer is "iso-8859-6".
TQCString Decoder::automaticDetectionForArabic( const unsigned char* ptr, int size )
{
    for ( int idx = 0; idx < size; ++idx ) {
        const unsigned char ch = ptr[ idx ];

        const bool in80to9F = ch >= 0x80 && ch <= 0x9F;
        const bool inA1toA3 = ch >= 0xA1 && ch <= 0xA3;
        const bool inA5toAB = ch >= 0xA5 && ch <= 0xAB;
        const bool inAEtoBA = ch >= 0xAE && ch <= 0xBA;
        const bool inBCtoBE = ch >= 0xBC && ch <= 0xBE;
        const bool inDBtoDF = ch >= 0xDB && ch <= 0xDF;
        const bool fromF3   = ch >= 0xF3;

        if ( in80to9F || inA1toA3 || inA5toAB || inAEtoBA || inBCtoBE
             || ch == 0xC0 || inDBtoDF || fromF3 )
            return "cp1256";
    }

    return "iso-8859-6";
}
|
|
|
|
|
|
|
|
// Chooses between "cp1257" and "iso-8859-13" for the given bytes.
//
// Scanning stops at the first decisive byte: 0x80..0x9E selects "cp1257",
// 0xA1 or 0xA5 selects "iso-8859-13". Without any decisive byte the result
// is "iso-8859-13" as well.
TQCString Decoder::automaticDetectionForBaltic( const unsigned char* ptr, int size )
{
    for ( int idx = 0; idx < size; ++idx ) {
        const unsigned char ch = ptr[ idx ];

        if ( ch >= 0x80 && ch <= 0x9E )
            return "cp1257";

        // Same answer as the default below, so just stop looking.
        if ( ch == 0xA1 || ch == 0xA5 )
            break;
    }

    return "iso-8859-13";
}
|
|
|
|
|
|
|
|
// Chooses between "ibm852", "cp1250", "iso-8859-2" and "iso-8859-3".
//
//  - 0x81, 0x83, 0x90 or 0x98 anywhere decides for "ibm852" immediately.
//  - Any other byte in 0x80..0x9F makes "cp1250" the candidate (overriding
//    an earlier "iso-8859-2" candidate); scanning continues because a later
//    byte may still reveal ibm852.
//  - 0xA5, 0xAE, 0xBE, 0xC3, 0xD0, 0xE3 or 0xF0 makes "iso-8859-2" the
//    candidate, but only if there is no candidate yet.
//  - Without any candidate the result is "iso-8859-3".
//
// The previous implementation also had early returns guarded by
// `i + 1 > size`; inside a loop bounded by `i < size` that condition can
// never be true, so those branches were dead code and have been removed.
// The result for every input is unchanged.
TQCString Decoder::automaticDetectionForCentralEuropean(const unsigned char* ptr, int size )
{
    TQCString charset;

    for ( int i = 0; i < size; ++i ) {
        const unsigned char ch = ptr[ i ];

        if ( ch >= 0x80 && ch <= 0x9F ) {
            if ( ch == 0x81 || ch == 0x83 || ch == 0x90 || ch == 0x98 )
                return "ibm852";

            // maybe ibm852 ? keep looking
            charset = "cp1250";
            continue;
        }

        if ( ch == 0xA5 || ch == 0xAE || ch == 0xBE || ch == 0xC3
             || ch == 0xD0 || ch == 0xE3 || ch == 0xF0 ) {
            // maybe ibm852 ? keep looking, but remember the first hint
            if ( charset.isNull() )
                charset = "iso-8859-2";
        }
    }

    if ( charset.isNull() )
        charset = "iso-8859-3";

    return charset;
}
|
|
|
|
|
|
|
|
// Chooses between "ibm866", "koi8-u" and "cp1251" by counting
// characteristic bytes in ptr[1..size).
//
// Counting starts at index 1 because the "st" digraph test looks at the
// previous byte, and stops early once 1000 bytes above 0xbf were seen.
// The _language argument is not consulted by this implementation.
//
// Decision order: ibm866 range majority, "st" digraph counts, small
// a/o/i counts, capital a/o/i counts, and finally the size of the two
// byte ranges.
TQCString Decoder::automaticDetectionForCyrillic( const unsigned char* ptr, int size, AutoDetectLanguage _language )
{
    (void)_language; // unused, kept for interface compatibility

    // "st" digraph hits
    int stCp1251 = 0;
    int stKoi = 0;

    // small letters o / a / i
    int oCp1251 = 0, aCp1251 = 0, iCp1251 = 0;
    int oKoi = 0, aKoi = 0, iKoi = 0;

    // capital letters o / a / i
    int capOCp1251 = 0, capACp1251 = 0, capICp1251 = 0;
    int capOKoi = 0, capAKoi = 0, capIKoi = 0;

    // bytes per range: > 0xdf, 0xc0..0xdf and 0xa0..0xae
    int rangeCp1251 = 0;
    int rangeKoi = 0;
    int rangeIbm866 = 0;

    for ( int idx = 1; idx < size && rangeCp1251 + rangeKoi < 1000; ++idx ) {
        const unsigned char cur = ptr[ idx ];
        const unsigned char prev = ptr[ idx - 1 ];

        if ( cur > 0xdf ) {
            // Small letters in cp1251, capital letters in koi8.
            ++rangeCp1251;
            switch ( cur ) {
            case 0xee: ++oCp1251; break;                      // small o
            case 0xe0: ++aCp1251; break;                      // small a
            case 0xe8: ++iCp1251; break;                      // small i
            case 0xf2: if ( prev == 0xf1 ) ++stCp1251; break; // small st
            case 0xef: ++capOKoi; break;
            case 0xe1: ++capAKoi; break;
            case 0xe9: ++capIKoi; break;
            default: break;
            }
        } else if ( cur > 0xbf ) {
            // Small letters in koi8, capital letters in cp1251.
            ++rangeKoi;
            switch ( cur ) {
            case 0xcf: ++oKoi; break;                         // small o
            case 0xc1: ++aKoi; break;                         // small a
            case 0xc9: ++iKoi; break;                         // small i
            case 0xd4: if ( prev == 0xd3 ) ++stKoi; break;    // small st
            case 0xce: ++capOCp1251; break;
            case 0xc0: ++capACp1251; break;
            case 0xc8: ++capICp1251; break;
            default: break;
            }
        } else if ( cur > 0x9f && cur < 0xaf ) {
            // first 16 letters are 60%
            ++rangeIbm866;
        }
    }

    // hehe this is a rare case :)
    if ( rangeIbm866 > rangeCp1251 + rangeKoi )
        return "ibm866";

    const char *const koiName = "koi8-u";
    const char *const cp1251Name = "cp1251";

    // Digraph evidence on one side only.
    if ( stCp1251 == 0 && stKoi > 1 )
        return koiName;
    if ( stKoi == 0 && stCp1251 > 1 )
        return cp1251Name;

    // Digraph evidence on both sides: require a clear (integer) ratio.
    if ( stCp1251 > 0 && stKoi > 0 ) {
        if ( stCp1251 / stKoi > 2 )
            return cp1251Name;
        if ( stKoi / stCp1251 > 2 )
            return koiName;
    }

    // Small vowels must agree on all three letters.
    if ( aCp1251 > aKoi && oCp1251 > oKoi && iCp1251 > iKoi )
        return cp1251Name;
    if ( aKoi > aCp1251 && oKoi > oCp1251 && iKoi > iCp1251 )
        return koiName;

    // Capital vowels must agree on all three letters.
    if ( capACp1251 > capAKoi && capOCp1251 > capOKoi && capICp1251 > capIKoi )
        return cp1251Name;
    if ( capAKoi > capACp1251 && capOKoi > capOCp1251 && capIKoi > capICp1251 )
        return koiName;

    //fallback...
    return ( rangeCp1251 > rangeKoi ) ? cp1251Name : koiName;
}
|
|
|
|
|
|
|
|
// Chooses between "cp1253" and "iso-8859-7" for the given bytes.
//
// The first byte matching one of the values/ranges below selects "cp1253";
// otherwise the result is "iso-8859-7".
TQCString Decoder::automaticDetectionForGreek( const unsigned char* ptr, int size )
{
    for ( int idx = 0; idx < size; ++idx ) {
        const unsigned char ch = ptr[ idx ];

        const bool in82to87 = ch >= 0x82 && ch <= 0x87;
        const bool in91to97 = ch >= 0x91 && ch <= 0x97;
        const bool single = ch == 0x80 || ch == 0x89 || ch == 0x8B
                         || ch == 0x99 || ch == 0x9B
                         || ch == 0xA4 || ch == 0xA5 || ch == 0xAE;

        if ( single || in82to87 || in91to97 )
            return "cp1253";
    }

    return "iso-8859-7";
}
|
|
|
|
|
|
|
|
// Chooses between "cp1255" and "iso-8859-8-i" for the given bytes.
//
// Scanning stops at the first decisive byte: one of the values/ranges below
// selects "cp1255", 0xDF selects "iso-8859-8-i". Without any decisive byte
// the result is "iso-8859-8-i" as well.
TQCString Decoder::automaticDetectionForHebrew( const unsigned char* ptr, int size )
{
    for ( int idx = 0; idx < size; ++idx ) {
        const unsigned char ch = ptr[ idx ];

        const bool in82to89 = ch >= 0x82 && ch <= 0x89;
        const bool in91to99 = ch >= 0x91 && ch <= 0x99;
        const bool inBFtoC9 = ch >= 0xBF && ch <= 0xC9;
        const bool inCBtoD8 = ch >= 0xCB && ch <= 0xD8;
        const bool single = ch == 0x80 || ch == 0x8B || ch == 0x9B || ch == 0xA1;

        if ( single || in82to89 || in91to99 || inBFtoC9 || inCBtoD8 )
            return "cp1255";

        // Same answer as the default below, so just stop looking.
        if ( ch == 0xDF )
            break;
    }

    return "iso-8859-8-i";
}
|
|
|
|
|
|
|
|
// Asks the JapaneseCode guesser (created on first use and kept in kc) which
// encoding the bytes are in and maps its answer to a codec name:
// JIS -> "jis7", EUC -> "eucjp", SJIS -> "sjis", UTF8 -> "utf8".
// Any other answer yields an empty string.
TQCString Decoder::automaticDetectionForJapanese( const unsigned char* ptr, int size )
{
    if ( kc == 0 )
        kc = new JapaneseCode();

    const char *codecName = "";
    switch ( kc->guess_jp( (const char*)ptr, size ) ) {
    case JapaneseCode::JIS:
        codecName = "jis7";
        break;
    case JapaneseCode::EUC:
        codecName = "eucjp";
        break;
    case JapaneseCode::SJIS:
        codecName = "sjis";
        break;
    case JapaneseCode::UTF8:
        codecName = "utf8";
        break;
    default:
        break;
    }

    return codecName;
}
|
|
|
|
|
|
|
|
// Chooses between "cp1254" and "iso-8859-9" for the given bytes.
//
// The first byte equal to 0x80 or 0x9F, or inside 0x82..0x8C or 0x91..0x9C,
// selects "cp1254"; otherwise the result is "iso-8859-9".
TQCString Decoder::automaticDetectionForTurkish( const unsigned char* ptr, int size )
{
    for ( int idx = 0; idx < size; ++idx ) {
        const unsigned char ch = ptr[ idx ];

        const bool in82to8C = ch >= 0x82 && ch <= 0x8C;
        const bool in91to9C = ch >= 0x91 && ch <= 0x9C;

        if ( ch == 0x80 || ch == 0x9F || in82to8C || in91to9C )
            return "cp1254";
    }

    return "iso-8859-9";
}
|
|
|
|
|
|
|
|
// Chooses between "cp1252" and "iso-8859-1" for the given bytes.
//
// Any byte inside 0x80..0x9F selects "cp1252"; otherwise the result is
// "iso-8859-1".
TQCString Decoder::automaticDetectionForWesternEuropean( const unsigned char* ptr, int size )
{
    int idx = 0;
    while ( idx < size ) {
        const unsigned char ch = ptr[ idx ];
        if ( ch >= 0x80 && ch <= 0x9F )
            return "cp1252";
        ++idx;
    }

    return "iso-8859-1"; //"iso-8859-15"; Which better at default ?
}
|
|
|
|
|
|
|
|
|
|
|
|
// -----------------------------------------------------------------------------
|
|
|
|
#undef DECODE_DEBUG
|