You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
konversation/konversation/src/unicode.cpp

146 lines
4.1 KiB

/*
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
*/
/*
The Original Code is mozilla.org code.
See http://lxr.mozilla.org/mozilla/source/modules/rdf/src/utils.c#540
Copyright (C) 1998 Netscape Communications Corporation
Copyright (C) 2005 Ismail Donmez <ismail@kde.org>
*/
/* Masks selecting the N most significant bits of a byte. */
#define kLeft1BitMask  0x80
#define kLeft2BitsMask 0xC0
#define kLeft3BitsMask 0xE0
#define kLeft4BitsMask 0xF0
#define kLeft5BitsMask 0xF8
#define kLeft6BitsMask 0xFC
#define kLeft7BitsMask 0xFE

/* Tag bits carried by the lead byte of an N-byte sequence
   (N leading one bits; the following zero bit is checked by the
   UTF8_NBytes() predicates through a mask that is one bit wider). */
#define k2BytesLeadByte kLeft2BitsMask
#define k3BytesLeadByte kLeft3BitsMask
#define k4BytesLeadByte kLeft4BitsMask
#define k5BytesLeadByte kLeft5BitsMask
#define k6BytesLeadByte kLeft6BitsMask

/* Tag bits of a trail (continuation) byte: 10xxxxxx.
   "Trial" is a historical misspelling of "trail"; the names are kept
   because isUtf8() refers to them. */
#define kTrialByte kLeft1BitMask

/* True when c is a single-byte (7-bit) character: 0xxxxxxx. */
#define UTF8_1Byte(c) (((c) & kLeft1BitMask) == 0)

/* True when c is the lead byte of an N-byte sequence. */
#define UTF8_2Bytes(c) (((c) & kLeft3BitsMask) == k2BytesLeadByte)
#define UTF8_3Bytes(c) (((c) & kLeft4BitsMask) == k3BytesLeadByte)
#define UTF8_4Bytes(c) (((c) & kLeft5BitsMask) == k4BytesLeadByte)
#define UTF8_5Bytes(c) (((c) & kLeft6BitsMask) == k5BytesLeadByte)
#define UTF8_6Bytes(c) (((c) & kLeft7BitsMask) == k6BytesLeadByte)

/* True when c has the trail-byte form 10xxxxxx. */
#define UTF8_ValidTrialByte(c) (((c) & kLeft2BitsMask) == kTrialByte)
bool isUtf8(const TQCString& text)
{
int i;
int j;
int clen = 0;
int len = text.length();
JapaneseCode* jc = new JapaneseCode();
JapaneseCode::Type result = jc->guess_jp(text, len);
switch(result)
{
case JapaneseCode::SJIS:
case JapaneseCode::JIS:
delete jc;
return false;
default:
delete jc;
break;
}
for(i=0; i < len; i += clen)
{
if(UTF8_1Byte(text[i]))
{
clen = 1;
}
else if(UTF8_2Bytes(text[i]))
{
clen = 2;
/* No enough trail bytes */
if( (i + clen) > len)
return false;
/* 0000 0000 - 0000 007F : should encode in less bytes */
if(0 == (text[i] & 0x1E ))
return false;
}
else if(UTF8_3Bytes(text[i]))
{
clen = 3;
/* No enough trail bytes */
if( (i + clen) > len)
return false;
/* a single Surrogate should not show in 3 bytes UTF8, instead, the pair should be intepreted
as one single UCS4 char and encoded UTF8 in 4 bytes */
if((TQChar(0xED) == text[i] ) && (0xA0 == (text[i+1] & 0xA0 ) ))
return false;
/* 0000 0000 - 0000 07FF : should encode in less bytes */
if((0 == (text[i] & 0x0F )) && (0 == (text[i+1] & 0x20 ) ))
return false;
}
else if(UTF8_4Bytes(text[i]))
{
clen = 4;
/* No enough trail bytes */
if( (i + clen) > len)
return false;
/* 0000 0000 - 0000 FFFF : should encode in less bytes */
if((0 == (text[i] & 0x07 )) && (0 == (text[i+1] & 0x30 )) )
return false;
}
else if(UTF8_5Bytes(text[i]))
{
clen = 5;
/* No enough trail bytes */
if( (i + clen) > len)
return false;
/* 0000 0000 - 001F FFFF : should encode in less bytes */
if((0 == (text[i] & 0x03 )) && (0 == (text[i+1] & 0x38 )) )
return false;
}
else if(UTF8_6Bytes(text[i]))
{
clen = 6;
/* No enough trail bytes */
if( (i + clen) > len)
return false;
/* 0000 0000 - 03FF FFFF : should encode in less bytes */
if((0 == (text[i] & 0x01 )) && (0 == (text[i+1] & 0x3E )) )
return false;
}
else
{
return false;
}
for(j = 1; j<clen ;++j)
{
if(! UTF8_ValidTrialByte(text[i+j])) /* Trail bytes invalid */
return false;
}
}
return true;
}