You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
tdepim/libtdepim/qutf7codec.cpp

551 lines
14 KiB

/*
qutf7codec.cpp
A TQTextCodec for UTF-7 (rfc2152).
Copyright (c) 2001 Marc Mutz <mutz@kde.org>
See file COPYING for details
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2.0,
as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, US
As a special exception, permission is granted to use this plugin
with any version of TQt by TrollTech AS, Norway. In this case, the
use of this plugin doesn't cause the resulting executable to be
covered by the GNU General Public License.
This exception does not however invalidate any other reasons why the
executable file might be covered by the GNU General Public License.
*/
#include "qutf7codec.h"
#ifndef TQT_NO_TEXTCODEC
// Returns the MIBenum number this codec reports for UTF-7.
int TQUtf7Codec::mibEnum() const
{
  const int utf7Mib = 1012;
  return utf7Mib;
}
// Returns the MIBenum number of the strict variant: the negated UTF-7
// value. NOTE(review): presumably negative so that it cannot clash with
// the regular UTF-7 codec's number -- confirm against the codec registry.
int TQStrictUtf7Codec::mibEnum() const
{
  const int strictUtf7Mib = -1012;
  return strictUtf7Mib;
}
// Returns the name under which this codec identifies itself.
const char* TQUtf7Codec::name() const
{
  const char* const codecName = "UTF-7";
  return codecName;
}
// Returns the name under which the strict UTF-7 variant identifies itself.
const char* TQStrictUtf7Codec::name() const
{
  const char* const codecName = "X-QT-UTF-7-STRICT";
  return codecName;
}
// Returns the charset name reported for MIME purposes.
const char* TQUtf7Codec::mimeName() const
{
  const char* const mimeCharset = "UTF-7";
  return mimeCharset;
}
// Reports whether a single character can be encoded. The argument is
// ignored: this codec answers TRUE for every character.
bool TQUtf7Codec::canEncode( TQChar ) const
{
  const bool encodable = TRUE;
  return encodable;
}
// Reports whether a whole string can be encoded. The argument is
// ignored: this codec answers TRUE for every string.
bool TQUtf7Codec::canEncode( const TQString & ) const
{
  const bool encodable = TRUE;
  return encodable;
}
//
// Character-class bitmaps covering the 128 US-ASCII code points.
// Each table is 16 bytes = 128 bits; the bit for character c lives in
// byte c/8, at mask (0x80 >> (c%8)), i.e. most significant bit first
// (this is the layout isOfSet() reads). Each source line below holds
// four bytes and therefore covers 32 consecutive characters.
//

// The base64 alphabet: 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/'.
static uchar base64Set[] = {
0x00, 0x00, 0x00, 0x00, // '\0' ...
0x00, 0x11, 0xFF, 0xC0, // ' ' ... '?'
0x7F, 0xFF, 0xFF, 0xE0, // '@' ... '_'
0x7F, 0xFF, 0xFF, 0xE0 // '`' ... DEL
};
// The subset of the base64 alphabet whose 6-bit value has its two
// low-order bits clear (values 0, 4, 8, ..., 60: 'A', 'E', 'I', ...,
// '4', '8').
static uchar base64SetWithLastTwoBitsZero[] = {
0x00, 0x00, 0x00, 0x00, // '\0' ...
0x00, 0x00, 0x88, 0x80, // ' ' ... '?'
0x44, 0x44, 0x44, 0x40, // '@' ... '_'
0x11, 0x11, 0x11, 0x00 // '`' ... DEL
};
// Characters the encoder always writes literally: letters, digits and
// ' ( ) , - . / : ?
static uchar directSet[] = {
0x00, 0x00, 0x00, 0x00, // '\0' ...
0x01, 0xCF, 0xFF, 0xE1, // ' ' ... '?'
0x7F, 0xFF, 0xFF, 0xE0, // '@' ... '_'
0x7F, 0xFF, 0xFF, 0xE0 // '`' ... DEL
};
// Characters that are written literally only when the encoder is
// created with encOpt == false:
// ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
static uchar optDirectSet[] = {
0x00, 0x00, 0x00, 0x00, // '\0' ...
0x7E, 0x20, 0x00, 0x1E, // ' ' ... '?'
0x80, 0x00, 0x00, 0x17, // '@' ... '_'
0x80, 0x00, 0x00, 0x1C // '`' ... DEL
};
// Tests whether character "ch" is a member of the 128-bit character-class
// bitmap "set" (16 bytes, most significant bit first: the bit for ch is
// set[ch/8] & (0x80 >> (ch%8))).
//
// The bitmaps only describe US-ASCII, so any ch >= 128 is reported as
// "not a member" instead of indexing past the end of the 16-byte table.
//
// The parameters are spelled "unsigned char" (the type "uchar" stands for
// elsewhere in this file) and the table pointer is const, since the table
// is only read.
static inline bool isOfSet(unsigned char ch, const unsigned char* set) {
  if ( ch >= 128 )
    return false;
  return ( set[ ch / 8 ] & ( 0x80 >> ( ch % 8 ) ) ) != 0;
}
// Scans "chars" (length "len") and returns the index of the first byte at
// which the input stops looking like well-formed UTF-7, or "len" if no
// violation was found.
//
// Scanning stops at:
//  - any 8bit byte (>= 128),
//  - a '+' that is followed by neither '-' nor a base64 character,
//  - the end of a shifted sequence whose left-over bits (those after the
//    last complete 16-bit unit) are not all zero.
//
// Eight base64 characters carry exactly three 16-bit units, so 16-bit
// boundaries fall inside the 3rd and the 6th character and right after
// the 8th character of each group.
int TQUtf7Codec::heuristicContentMatch(const char* chars, int len) const
{
  // number of base64 characters consumed in the current group, modulo 8
  int stepNo = 0;
  int i;
  bool shifted = FALSE;
  bool rightAfterEscape = FALSE;
  bool onlyNullBitsSinceLastBoundary = TRUE;
  for ( i = 0; i < len ; i++ ) {
    if ((unsigned char)chars[i] >= 128) // 8bit chars not allowed.
      break;
    if (shifted) {
      if ( isOfSet(chars[i],base64Set) ) {
        // Count this character *before* classifying it: the cases below
        // are numbered by how many characters of the group have been
        // consumed including the current one. (Counting afterwards put
        // the boundary checks one character too late and made valid
        // input such as "+AGE-" fail at the closing '-'.)
        stepNo = (stepNo + 1) % 8;
        switch (stepNo) {
        case 0:
          // 8th char: the group ends exactly on a 16-bit boundary
          onlyNullBitsSinceLastBoundary = TRUE;
          break;
        case 3:
          // 3rd char: 4 bits finish a unit, the low 2 bits start the next
          onlyNullBitsSinceLastBoundary
            = isOfSet(chars[i],base64SetWithLastTwoBitsZero);
          break;
        case 6:
          // 6th char: 2 bits finish a unit, the low 4 bits start the next;
          // these are the four base64 chars whose low 4 bits are zero
          onlyNullBitsSinceLastBoundary
            = ( chars[i] == 'A' || chars[i] == 'Q' ||
                chars[i] == 'g' || chars[i] == 'w' );
          break;
        default:
          // all six bits belong to a still incomplete unit
          onlyNullBitsSinceLastBoundary
            = onlyNullBitsSinceLastBoundary && (chars[i] == 'A');
        }
        rightAfterEscape = FALSE;
      } else {
        if (rightAfterEscape && chars[i] != '-')
          break; // a '+' must be followed by '-' or a base64 char
        if (!onlyNullBitsSinceLastBoundary)
          break; // non-zero bits in the tail of the base64 encoding
        shifted = FALSE;
        stepNo = 0;
      }
    } else {
      if (chars[i] == '+') {
        shifted = TRUE;
        rightAfterEscape = TRUE;
      }
    }
  }
  return i;
}
class TQUtf7Decoder : public TQTextDecoder {
// the storage for our unicode char until it's finished
ushort uc;
// the state of the base64 decoding
// can be 0 (just finished three unicode chars)
// 1 (have the upper 6 bits of uc already)
// 2 (have the upper 12 bits of uc already)
// 3 (have the upper 2 bits of uc already)
// ..........
// 7 (have the upper 10 bits of uc already)
// => n (have the upper (n * 6) % 16 bits of uc already)
// "stepNo" cycles through all it's values every three
// unicode chars.
char stepNo;
// remembers if we are in shifted-sequence mode
bool shifted;
// remembers if we're just after the initial '+'
// of a shifted-sequence.
bool rightAfterEscape;
public:
TQUtf7Decoder() : uc(0), stepNo(0), shifted(FALSE), rightAfterEscape(FALSE)
{
}
private:
inline void resetParser()
{
uc = 0;
stepNo = 0;
shifted = FALSE;
rightAfterEscape = FALSE;
}
public:
TQString toUnicode(const char* chars, int len)
{
TQString result = "";
for (int i=0; i<len; i++) {
uchar ch = chars[i];
//
// check for 8bit char's:
//
if ( ch > 127 ) {
tqWarning("TQUtf7Decoder: 8bit char found in input. "
"Parser has been re-initialized!");
resetParser();
result += TQChar::replacement;
continue;
}
if (shifted) { // in shifted mode
//
// first, we check specialities that only occur
// right after the escaping '+':
//
if ( rightAfterEscape && ch == '-' ) {
// a "+-" sequence is a short-circuit encoding
// for just '+':
resetParser();
result += TQChar('+');
// we're already done for this "ch", so
continue;
}
//
// Here we're going to extract the bits represented by "ch":
//
ushort bits;
if ( ch >= 'A' && ch <= 'Z' ) {
bits = ch - 'A';
} else if ( ch >= 'a' && ch <= 'z' ) {
bits = ch - 'a' + 26;
} else if ( ch >= '0' && ch <= '9' ) {
bits = ch - '0' + 52;
} else if ( ch == '+' ) {
bits = 62;
} else if ( ch == '/' ) {
bits = 63;
} else {
bits = 0; // keep compiler happy
//
// ch is not of the base64 alphabet.
// Here we are going to check the sequence's validity:
//
if ( rightAfterEscape ) {
// any non-base64 char following an escaping '+'
// makes for an ill-formed sequence.
// Note that we catch (the valid) "+-" pair
// right at the beginning.
tqWarning("TQUtf7Decoder: ill-formed input: "
"non-base64 char after escaping \"+\"!");
}
// pending bits from base64 encoding must be all 0:
if (stepNo >= 1 && uc) {
tqWarning("TQUtf7Decoder: ill-formed sequence: "
"non-zero bits in shifted-sequence tail!");
}
resetParser();
// a '-' signifies the end of the shifted-sequence,
// so we just swallow it.
if ( ch == '-' )
continue;
// end of validity checking. Process ch now...
}
if ( /*still*/ shifted ) {
//
// now we're going to stuff the "bits" bit bucket into
// the right position inside "uc", emitting a resulting
// TQChar if possible.
//
switch (stepNo) {
// "bits" are the 6 msb's of uc
case 0: uc = bits << 10; break;
case 1: uc |= bits << 4; break;
// 4 bits of "bits" complete the first ushort
case 2: uc |= bits >> 2; result += TQChar(uc);
// 2 bits of "bits" make the msb's of the next ushort
uc = bits << 14; break;
case 3: uc |= bits << 8; break;
case 4: uc |= bits << 2; break;
// 2 bits of "bits" complete the second ushort
case 5: uc |= bits >> 4; result += TQChar(uc);
// 4 bits of "bits" make the msb's of the next ushort
uc = bits << 12; break;
case 6: uc |= bits << 6; break;
// these 6 bits complete the third ushort
// and also one round of 8 chars -> 3 ushort decoding
case 7: uc |= bits; result += TQChar(uc);
uc = 0; break;
default: ;
} // switch (stepNo)
// increase the step counter
stepNo++;
stepNo %= 8;
rightAfterEscape = FALSE;
// and look at the next char.
continue;
} // fi (still) shifted
} // fi shifted
//
// if control reaches here, we either weren't in a
// shifted sequence or we just left one by seeing
// a non-base64-char.
// Either way, we have to process "ch" outside
// a shifted-sequence now:
//
if ( ch == '+' ) {
// '+' is the escape char for entering a
// shifted sequence:
shifted = TRUE;
stepNo = 0;
// also, we're right at the beginning where
// special rules apply:
rightAfterEscape = TRUE;
} else {
// US-ASCII values are directly used
result += TQChar(ch);
}
}
return result;
} // toUnicode()
}; // class TQUtf7Decoder
// Creates a fresh UTF-7 decoder on the heap and returns it.
// NOTE(review): the caller presumably takes ownership -- confirm against
// the TQTextCodec contract.
TQTextDecoder* TQUtf7Codec::makeDecoder() const
{
  TQTextDecoder* decoder = new TQUtf7Decoder;
  return decoder;
}
// Stateful UTF-7 encoder.
//
// Characters are either written directly (if they are in
// "dontNeedEncodingSet") or as part of a shifted sequence: a '+' followed
// by the base64 encoding of the 16-bit units. The state is kept between
// calls to fromUnicode(), so that a shifted sequence which was open at the
// end of one call can be continued by the next one.
class TQUtf7Encoder : public TQTextEncoder {
  // bitmap (same layout as directSet) of the US-ASCII characters that
  // are written directly
  uchar dontNeedEncodingSet[16];
  // low-order bits of the last encoded unit that did not fill a whole
  // base64 character yet, already shifted into their final position
  ushort outbits;
  // number of 16-bit units put into the current base64 group (0..2);
  // three units fill exactly eight base64 characters.
  uint stepNo : 2;
  // TRUE while a shifted sequence is open within the current call
  bool shifted : 1;
  // TRUE if the previous call ended inside a shifted sequence. That
  // sequence has been flushed (zero-padded) but not terminated with
  // '-', so the next call may either continue it or has to close it.
  bool mayContinueShiftedSequence : 1;
public:
  // encOpt:  if false, the characters of optDirectSet are written
  //          directly as well; if true, they get base64-encoded.
  // encLwsp: if false, space, TAB, CR and LF are written directly;
  //          if true, they get base64-encoded.
  TQUtf7Encoder(bool encOpt, bool encLwsp)
    : outbits(0), stepNo(0),
      shifted(FALSE), mayContinueShiftedSequence(FALSE)
  {
    for ( int i = 0; i < 16 ; i++) {
      dontNeedEncodingSet[i] = directSet[i];
      if (!encOpt)
        dontNeedEncodingSet[i] |= optDirectSet[i];
    }
    if (!encLwsp) {
      dontNeedEncodingSet[' '/8] |= 0x80 >> (' '%8);
      dontNeedEncodingSet['\n'/8] |= 0x80 >> ('\n'%8);
      dontNeedEncodingSet['\r'/8] |= 0x80 >> ('\r'%8);
      dontNeedEncodingSet['\t'/8] |= 0x80 >> ('\t'%8);
    }
  }
private:
  // Maps a 6-bit value (0..63) to its base64 character.
  char toBase64( ushort u ) {
    if ( u < 26 )
      return (char)u + 'A';
    else if ( u < 52 )
      return (char)u - 26 + 'a';
    else if ( u < 62 )
      return (char)u - 52 + '0';
    else if ( u == 62 )
      return '+';
    else
      return '/';
  }
  // Appends the 16-bit unit "u" to the open shifted sequence, writing
  // all base64 characters that are completed by it to "t".
  void addToShiftedSequence(TQCString::Iterator & t, ushort u) {
    switch (stepNo) {
      // no outbits; use uppermost 6 bits of u
    case 0:
      *t++ = toBase64( u >> 10 );
      *t++ = toBase64( (u & 0x03FF /* umask top 6 bits */ ) >> 4 );
      // save 4 lowest-order bits in outbits[5..2]
      outbits = (u & 0x000F) << 2;
      break;
      // outbits available; use top two bits of u to complete
      // the previous char
    case 1:
      if (!mayContinueShiftedSequence) {
        // if mayContinue, this char has already been written
        *t++ = toBase64( outbits | ( u >> 14 ) );
      }
      *t++ = toBase64( (u & 0x3F00 /* mask top 2 bits */ ) >> 8 );
      *t++ = toBase64( (u & 0x00FC /* mask msbyte */ ) >> 2 );
      // save 2 lowest-significant bits in outbits[5..4]
      outbits = (u & 0x0003) << 4;
      break;
      // outbits available; use top four bits of u to complete
      // the previous char
    case 2:
      if (!mayContinueShiftedSequence) {
        // if mayContinue, this char has already been written
        *t++ = toBase64( outbits | ( u >> 12 ) );
      }
      *t++ = toBase64( (u & 0x0FFF) >> 6 );
      *t++ = toBase64( u & 0x003F );
      break;
    default: ;
    }
    stepNo = (stepNo + 1) % 3;
  }
  // Flushes the pending outbits (padded with zero bits) to "t".
  // Does not write the terminating '-'.
  void endShiftedSequence(TQCString::Iterator & t) {
    switch (stepNo) {
    case 1: // four outbits still to be written
    case 2: // two outbits still to be written
      *t++ = toBase64( outbits );
      break;
    case 0: // nothing to do
    default: ;
    }
    outbits = 0;
  }
  // depending on the stepNo, checks whether we can continue
  // an already ended shifted-sequence with char "u".
  // This is only possible if the topmost bits of "u" fit the
  // already written padding bits (which are all 0):
  //  - stepNo == 1: the flushed char carried 4 outbits, leaving room
  //    for the top 2 bits of u (see case 1 of addToShiftedSequence)
  //  - stepNo == 2: the flushed char carried 2 outbits, leaving room
  //    for the top 4 bits of u (see case 2 of addToShiftedSequence)
  bool continueOK( ushort u ) {
    return stepNo == 0 ||
      ( stepNo == 1 && (u & 0xC000) == 0 ) ||
      ( stepNo == 2 && (u & 0xF000) == 0 );
  }
  // Writes "ch" (which must be a directly representable US-ASCII char)
  // to "t", closing any open or pending shifted sequence first.
  void processDoesntNeedEncoding(TQCString::Iterator & t, ushort ch) {
    // doesn't need encoding
    if (shifted) {
      endShiftedSequence(t);
      // add "lead-out" to dis-ambiguate following chars:
      if (isOfSet((uchar)ch,base64Set) || ch == '-' ) {
        *t++ = '-';
      }
    } else if (mayContinueShiftedSequence) {
      // if mayContinue is set, this means the
      // shifted-sequence needs a lead-out.
      mayContinueShiftedSequence = FALSE;
      if (isOfSet((uchar)ch,base64Set) || ch == '-' ) {
        *t++ = '-';
      }
    }
    *t++ = (uchar)ch;
    shifted = FALSE;
    stepNo = 0;
  }
public:
  // Encodes the first "len_in_out" characters of "uc" and returns the
  // resulting bytes. On return, "len_in_out" holds the number of bytes
  // written (not counting the terminating '\0').
  //
  // A null "uc" requests a return to US-ASCII: if a shifted sequence
  // is still pending from an earlier call, it is terminated with '-'.
  TQCString fromUnicode(const TQString & uc, int & len_in_out)
  {
    // allocate place for worst case:
    // len/2 * (5+1) for an alternating sequence of e.g. "A\",
    // + 4 for a worst-case of another +ABC encoded char
    // + 1 for the trailing \0
    // (this also leaves room for the single '-' that may have to
    // close a pending sequence from the previous call)
    //
    int maxreslen = 3 * len_in_out + 5;
    TQCString result( maxreslen );
#if 0
    // if (len_in_out == 1) {
    cout << "\nlen_in_out: " << len_in_out
         <<"; shifted: " << (shifted ? "true" : "false")
         << ";\n" << "mayContinue: "
         << (mayContinueShiftedSequence ? "true" : "false")
         << "; stepNo: " << stepNo << ";\n"
         << "outbits: " << outbits << endl;
    // }
#endif
    // source and destination cursor
    const TQChar * s = uc.unicode();
    TQCString::Iterator t = result.data();
    if ( uc.isNull() ) {
      // return to ascii requested:
      if ( mayContinueShiftedSequence ) {
        *t++ = '-';
        // the sequence is closed now, so it must not be continued
        // (or closed a second time) by a later call:
        mayContinueShiftedSequence = FALSE;
        stepNo = 0;
        outbits = 0;
      }
    } else {
      // normal operation:
      for (int i = 0 ; i < len_in_out ;
           i++/*, checkOutBuf(result,maxreslen,t,i,len_in_out,5)*/ ) {
        ushort ch = s[i].unicode();
        //
        // first, we check whether we might get around encoding:
        //
        if ( ch < 128 ) {
          //
          // ch is usAscii, so we have a chance that we don't
          // need to encode it.
          //
          if ( isOfSet((uchar)ch,dontNeedEncodingSet) ) {
            processDoesntNeedEncoding(t,ch);
            continue;
          } else if ( ch == '+' ) {
            // '+' is the shift escape character
            if (shifted || mayContinueShiftedSequence) {
              // if we are already in shifted mode, we just
              // encode the '+', too. Compare
              // 24bits ("-+-") + some from ending the shifted-sequence
              // with 21,33 bits
              addToShiftedSequence(t,ch);
              mayContinueShiftedSequence = FALSE;
              shifted = TRUE;
            } else {
              // shortcut encoding of '+':
              *t++ = '+';
              *t++ = '-';
            }
            continue; // done
          } // else fall through to encoding
        }
        //
        // need encoding
        //
        if (!shifted && (!mayContinueShiftedSequence || !continueOK(ch) ) ) {
          if ( mayContinueShiftedSequence ) {
            // The sequence of the previous call is still open from
            // the decoder's point of view, but "ch" doesn't fit its
            // padding. '+' is itself a base64 char, so the old
            // sequence has to be closed explicitly before a new
            // one can be opened:
            *t++ = '-';
          }
          *t++ = '+';
          stepNo = 0;
        }
        addToShiftedSequence(t,ch);
        shifted = TRUE;
        mayContinueShiftedSequence = FALSE;
      }
      if ( shifted ) {
        // flush, but leave the sequence unterminated so that the
        // next call may continue it:
        endShiftedSequence(t);
        mayContinueShiftedSequence = TRUE;
      }
      shifted = FALSE;
    }
    *t = '\0';
    len_in_out = t - result.data();
#if 0
    cout << "len_in_out: " << len_in_out << "; "
         << "mayContinue: " << (mayContinueShiftedSequence ? "true" : "false")
         << "; stepNo: " << stepNo << endl;
#endif
    Q_ASSERT(len_in_out <= maxreslen-1);
    return result;
  } // fromUnicode()
}; // class TQUtf7Encoder
// Creates a UTF-7 encoder that base64-encodes neither the optional
// direct characters nor linear whitespace, and returns it.
// NOTE(review): the caller presumably takes ownership -- confirm against
// the TQTextCodec contract.
TQTextEncoder* TQUtf7Codec::makeEncoder() const
{
  TQTextEncoder* encoder = new TQUtf7Encoder( false, false );
  return encoder;
}
// Creates a UTF-7 encoder that base64-encodes the optional direct
// characters (but still writes linear whitespace directly), and
// returns it.
// NOTE(review): the caller presumably takes ownership -- confirm against
// the TQTextCodec contract.
TQTextEncoder* TQStrictUtf7Codec::makeEncoder() const
{
  TQTextEncoder* encoder = new TQUtf7Encoder( true, false );
  return encoder;
}
#endif // TQT_NO_TEXTCODEC