/********************************************************************** * * rfcdecoder.cc - handler for various rfc/mime encodings * Copyright (C) 2000 s.carstens@gmx.de * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * Send comments and bug fixes to s.carstens@gmx.de * *********************************************************************/ #include "rfcdecoder.h" #include #include #include #include #include #include #include #include // This part taken from rfc 2192 IMAP URL Scheme. C. Newman. September 1997. // adapted to QT-Toolkit by Sven Carstens 2000 static unsigned char base64chars[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; #define UNDEFINED 64 #define MAXLINE 76 /* UTF16 definitions */ #define UTF16MASK 0x03FFUL #define UTF16SHIFT 10 #define UTF16BASE 0x10000UL #define UTF16HIGHSTART 0xD800UL #define UTF16HIGHEND 0xDBFFUL #define UTF16LOSTART 0xDC00UL #define UTF16LOEND 0xDFFFUL /* Convert an IMAP mailbox to a Unicode path */ TQString rfcDecoder::fromIMAP (const TQString & inSrc) { unsigned char c, i, bitcount; unsigned long ucs4, utf16, bitbuf; unsigned char base64[256], utf8[6]; unsigned long srcPtr = 0; TQCString dst; TQCString src = inSrc.ascii (); uint srcLen = inSrc.length(); /* initialize modified base64 decoding table */ memset (base64, UNDEFINED, sizeof (base64)); for (i = 0; i < sizeof (base64chars); ++i) { base64[(int)base64chars[i]] = i; } /* loop until end of string */ while (srcPtr < srcLen) { c = src[srcPtr++]; /* deal with literal characters and &- */ if (c != '&' || src[srcPtr] == '-') { /* encode literally */ dst += c; /* skip over the '-' if this is an &- sequence */ if (c == '&') srcPtr++; } else { /* convert modified UTF-7 -> UTF-16 -> UCS-4 -> UTF-8 -> HEX */ bitbuf = 0; bitcount = 0; ucs4 = 0; while ((c = base64[(unsigned char) src[srcPtr]]) != UNDEFINED) { ++srcPtr; bitbuf = (bitbuf << 6) | c; bitcount += 6; /* enough bits for a UTF-16 character? */ if (bitcount >= 16) { bitcount -= 16; utf16 = (bitcount ? bitbuf >> bitcount : bitbuf) & 0xffff; /* convert UTF16 to UCS4 */ if (utf16 >= UTF16HIGHSTART && utf16 <= UTF16HIGHEND) { ucs4 = (utf16 - UTF16HIGHSTART) << UTF16SHIFT; continue; } else if (utf16 >= UTF16LOSTART && utf16 <= UTF16LOEND) { ucs4 += utf16 - UTF16LOSTART + UTF16BASE; } else { ucs4 = utf16; } /* convert UTF-16 range of UCS4 to UTF-8 */ if (ucs4 <= 0x7fUL) { utf8[0] = ucs4; i = 1; } else if (ucs4 <= 0x7ffUL) { utf8[0] = 0xc0 | (ucs4 >> 6); utf8[1] = 0x80 | (ucs4 & 0x3f); i = 2; } else if (ucs4 <= 0xffffUL) { utf8[0] = 0xe0 | (ucs4 >> 12); utf8[1] = 0x80 | ((ucs4 >> 6) & 0x3f); utf8[2] = 0x80 | (ucs4 & 0x3f); i = 3; } else { utf8[0] = 0xf0 | (ucs4 >> 18); utf8[1] = 0x80 | ((ucs4 >> 12) & 0x3f); utf8[2] = 0x80 | ((ucs4 >> 6) & 0x3f); utf8[3] = 0x80 | (ucs4 & 0x3f); i = 4; } /* copy it */ for (c = 0; c < i; ++c) { dst += utf8[c]; } } } /* skip over trailing '-' in modified UTF-7 encoding */ if (src[srcPtr] == '-') ++srcPtr; } } return TQString::fromUtf8 (dst.data ()); } /* replace " with \" and \ with \\ " and \ characters */ TQString rfcDecoder::quoteIMAP(const TQString &src) { uint len = src.length(); TQString result; result.reserve(2 * len); for (unsigned int i = 0; i < len; i++) { if (src[i] == '"' || src[i] == '\\') result += '\\'; result += src[i]; } //result.squeeze(); - unnecessary and slow return result; } /* Convert Unicode path to modified UTF-7 IMAP mailbox */ TQString rfcDecoder::toIMAP (const TQString & inSrc) { unsigned int utf8pos, utf8total, c, utf7mode, bitstogo, utf16flag; unsigned long ucs4, bitbuf; TQCString src = inSrc.utf8 (); TQString dst; ulong srcPtr = 0; utf7mode = 0; utf8total = 0; bitstogo = 0; utf8pos = 0; bitbuf = 0; ucs4 = 0; while (srcPtr < src.length ()) { c = (unsigned char) src[srcPtr++]; /* normal character? */ if (c >= ' ' && c <= '~') { /* switch out of UTF-7 mode */ if (utf7mode) { if (bitstogo) { dst += base64chars[(bitbuf << (6 - bitstogo)) & 0x3F]; bitstogo = 0; } dst += '-'; utf7mode = 0; } dst += c; /* encode '&' as '&-' */ if (c == '&') { dst += '-'; } continue; } /* switch to UTF-7 mode */ if (!utf7mode) { dst += '&'; utf7mode = 1; } /* Encode US-ASCII characters as themselves */ if (c < 0x80) { ucs4 = c; utf8total = 1; } else if (utf8total) { /* save UTF8 bits into UCS4 */ ucs4 = (ucs4 << 6) | (c & 0x3FUL); if (++utf8pos < utf8total) { continue; } } else { utf8pos = 1; if (c < 0xE0) { utf8total = 2; ucs4 = c & 0x1F; } else if (c < 0xF0) { utf8total = 3; ucs4 = c & 0x0F; } else { /* NOTE: can't convert UTF8 sequences longer than 4 */ utf8total = 4; ucs4 = c & 0x03; } continue; } /* loop to split ucs4 into two utf16 chars if necessary */ utf8total = 0; do { if (ucs4 >= UTF16BASE) { ucs4 -= UTF16BASE; bitbuf = (bitbuf << 16) | ((ucs4 >> UTF16SHIFT) + UTF16HIGHSTART); ucs4 = (ucs4 & UTF16MASK) + UTF16LOSTART; utf16flag = 1; } else { bitbuf = (bitbuf << 16) | ucs4; utf16flag = 0; } bitstogo += 16; /* spew out base64 */ while (bitstogo >= 6) { bitstogo -= 6; dst += base64chars[(bitstogo ? (bitbuf >> bitstogo) : bitbuf) & 0x3F]; } } while (utf16flag); } /* if in UTF-7 mode, finish in ASCII */ if (utf7mode) { if (bitstogo) { dst += base64chars[(bitbuf << (6 - bitstogo)) & 0x3F]; } dst += '-'; } return quoteIMAP(dst); } //----------------------------------------------------------------------------- TQString rfcDecoder::decodeQuoting(const TQString &aStr) { TQString result; unsigned int strLength(aStr.length()); for (unsigned int i = 0; i < strLength ; i++) { if (aStr[i] == "\\") i++; result += aStr[i]; } return result; } //----------------------------------------------------------------------------- TQTextCodec * rfcDecoder::codecForName (const TQString & _str) { if (_str.isEmpty ()) return NULL; return TQTextCodec::codecForName (_str.lower (). replace ("windows", "cp").latin1 ()); } //----------------------------------------------------------------------------- const TQString rfcDecoder::decodeRFC2047String (const TQString & _str) { TQString throw_away; return decodeRFC2047String (_str, throw_away); } //----------------------------------------------------------------------------- const TQString rfcDecoder::decodeRFC2047String (const TQString & _str, TQString & charset) { TQString throw_away; return decodeRFC2047String (_str, charset, throw_away); } //----------------------------------------------------------------------------- const TQString rfcDecoder::decodeRFC2047String (const TQString & _str, TQString & charset, TQString & language) { //do we have a rfc string if (_str.find("=?") < 0) return _str; TQCString aStr = _str.ascii (); // TQString.length() means Unicode chars TQCString result; char *pos, *beg, *end, *mid = NULL; TQCString str; char encoding = 0, ch; bool valid; const int maxLen = 200; int i; // result.truncate(aStr.length()); for (pos = aStr.data (); *pos; pos++) { if (pos[0] != '=' || pos[1] != '?') { result += *pos; continue; } beg = pos + 2; end = beg; valid = TRUE; // parse charset name for (i = 2, pos += 2; i < maxLen && (*pos != '?' && (ispunct (*pos) || isalnum (*pos))); i++) pos++; if (*pos != '?' || i < 4 || i >= maxLen) valid = FALSE; else { charset = TQCString (beg, i - 1); // -2 + 1 for the zero int pt = charset.findRev('*'); if (pt != -1) { // save language for later usage language = charset.right (charset.length () - pt - 1); // tie off language as defined in rfc2047 charset.truncate(pt); } // get encoding and check delimiting question marks encoding = toupper (pos[1]); if (pos[2] != '?' || (encoding != 'Q' && encoding != 'B' && encoding != 'q' && encoding != 'b')) valid = FALSE; pos += 3; i += 3; // kdDebug(7116) << "rfcDecoder::decodeRFC2047String - charset " << charset << " - language " << language << " - '" << pos << "'" << endl; } if (valid) { mid = pos; // search for end of encoded part while (i < maxLen && *pos && !(*pos == '?' && *(pos + 1) == '=')) { i++; pos++; } end = pos + 2; //end now points to the first char after the encoded string if (i >= maxLen || !*pos) valid = FALSE; } if (valid) { ch = *pos; *pos = '\0'; str = TQCString (mid).left ((int) (mid - pos - 1)); if (encoding == 'Q') { // decode quoted printable text for (i = str.length () - 1; i >= 0; i--) if (str[i] == '_') str[i] = ' '; // kdDebug(7116) << "rfcDecoder::decodeRFC2047String - before QP '" << str << "'" << endl; str = KCodecs::quotedPrintableDecode(str); // kdDebug(7116) << "rfcDecoder::decodeRFC2047String - after QP '" << str << "'" << endl; } else { // decode base64 text str = KCodecs::base64Decode(str); } *pos = ch; int len = str.length(); for (i = 0; i < len; i++) result += (char) (TQChar) str[i]; pos = end - 1; } else { // kdDebug(7116) << "rfcDecoder::decodeRFC2047String - invalid" << endl; //result += "=?"; //pos = beg -1; // because pos gets increased shortly afterwards pos = beg - 2; result += *pos++; result += *pos; } } if (!charset.isEmpty ()) { TQTextCodec *aCodec = codecForName (charset.ascii ()); if (aCodec) { // kdDebug(7116) << "Codec is " << aCodec->name() << endl; return aCodec->toUnicode (result); } } return result; } //----------------------------------------------------------------------------- const char especials[17] = "()<>@,;:\"/[]?.= "; const TQString rfcDecoder::encodeRFC2047String (const TQString & _str) { if (_str.isEmpty ()) return _str; const signed char *latin = reinterpret_cast(_str.latin1()), *l, *start, *stop; char hexcode; int numQuotes, i; int rptr = 0; // My stats show this number results in 12 resize() out of 73,000 int resultLen = 3 * _str.length() / 2; TQCString result(resultLen); while (*latin) { l = latin; start = latin; while (*l) { if (*l == 32) start = l + 1; if (*l < 0) break; l++; } if (*l) { numQuotes = 1; while (*l) { /* The encoded word must be limited to 75 character */ for (i = 0; i < 16; i++) if (*l == especials[i]) numQuotes++; if (*l < 0) numQuotes++; /* Stop after 58 = 75 - 17 characters or at "= 58 || *l == 60) break; l++; } if (*l) { stop = l - 1; while (stop >= start && *stop != 32) stop--; if (stop <= start) stop = l; } else stop = l; if (resultLen - rptr - 1 <= start - latin + 1 + 16 /* =?iso-88... */) { resultLen += (start - latin + 1) * 2 + 20; // more space result.resize(resultLen); } while (latin < start) { result[rptr++] = *latin; latin++; } strcpy(&result[rptr], "=?iso-8859-1?q?"); rptr += 15; if (resultLen - rptr - 1 <= 3*(stop - latin + 1)) { resultLen += (stop - latin + 1) * 4 + 20; // more space result.resize(resultLen); } while (latin < stop) // can add up to 3 chars/iteration { numQuotes = 0; for (i = 0; i < 16; i++) if (*latin == especials[i]) numQuotes = 1; if (*latin < 0) numQuotes = 1; if (numQuotes) { result[rptr++] = '='; hexcode = ((*latin & 0xF0) >> 4) + 48; if (hexcode >= 58) hexcode += 7; result[rptr++] = hexcode; hexcode = (*latin & 0x0F) + 48; if (hexcode >= 58) hexcode += 7; result[rptr++] = hexcode; } else { result[rptr++] = *latin; } latin++; } result[rptr++] = '?'; result[rptr++] = '='; } else { while (*latin) { if (rptr == resultLen - 1) { resultLen += 30; result.resize(resultLen); } result[rptr++] = *latin; latin++; } } } result[rptr] = 0; //free (latinStart); return result; } //----------------------------------------------------------------------------- const TQString rfcDecoder::encodeRFC2231String (const TQString & _str) { if (_str.isEmpty ()) return _str; signed char *latin = (signed char *) calloc (1, _str.length () + 1); char *latin_us = (char *) latin; strcpy (latin_us, _str.latin1 ()); signed char *l = latin; char hexcode; int i; bool quote; while (*l) { if (*l < 0) break; l++; } if (!*l) { free(latin); return _str.ascii (); } TQCString result; l = latin; while (*l) { quote = *l < 0; for (i = 0; i < 16; i++) if (*l == especials[i]) quote = true; if (quote) { result += "%"; hexcode = ((*l & 0xF0) >> 4) + 48; if (hexcode >= 58) hexcode += 7; result += hexcode; hexcode = (*l & 0x0F) + 48; if (hexcode >= 58) hexcode += 7; result += hexcode; } else { result += *l; } l++; } free (latin); return result; } //----------------------------------------------------------------------------- const TQString rfcDecoder::decodeRFC2231String (const TQString & _str) { int p = _str.find ('\''); //see if it is an rfc string if (p < 0) return _str; int l = _str.findRev ('\''); //second is language if (p >= l) return _str; //first is charset or empty TQString charset = _str.left (p); TQString st = _str.mid (l + 1); TQString language = _str.mid (p + 1, l - p - 1); //kdDebug(7116) << "Charset: " << charset << " Language: " << language << endl; char ch, ch2; p = 0; while (p < (int) st.length ()) { if (st.at (p) == 37) { ch = st.at (p + 1).latin1 () - 48; if (ch > 16) ch -= 7; ch2 = st.at (p + 2).latin1 () - 48; if (ch2 > 16) ch2 -= 7; st.at (p) = ch * 16 + ch2; st.remove (p + 1, 2); } p++; } return st; }