tdepim/tdeioslave/imap4/rfcdecoder.cpp

/**********************************************************************
 *
 *   rfcdecoder.cpp  - handler for various rfc/mime encodings
 *   Copyright (C) 2000 s.carstens@gmx.de
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 *   Send comments and bug fixes to s.carstens@gmx.de
 *
 *********************************************************************/
#include "rfcdecoder.h"

#include <ctype.h>
#include <sys/types.h>

#include <stdio.h>
#include <stdlib.h>

#include <tqtextcodec.h>
#include <tqbuffer.h>
#include <tqregexp.h>
#include <kmdcodec.h>

// This part taken from rfc 2192 IMAP URL Scheme. C. Newman. September 1997.
// adapted to QT-Toolkit by Sven Carstens <s.carstens@gmx.de> 2000

static unsigned char base64chars[] =
  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
#define UNDEFINED 64
#define MAXLINE  76

/* UTF16 definitions */
#define UTF16MASK       0x03FFUL
#define UTF16SHIFT      10
#define UTF16BASE       0x10000UL
#define UTF16HIGHSTART  0xD800UL
#define UTF16HIGHEND    0xDBFFUL
#define UTF16LOSTART    0xDC00UL
#define UTF16LOEND      0xDFFFUL

/* Convert an IMAP mailbox to a Unicode path
 */
TQString rfcDecoder::fromIMAP (const TQString & inSrc)
{
  unsigned char c, i, bitcount;
  unsigned long ucs4, utf16, bitbuf;
  unsigned char base64[256], utf8[6];
  unsigned long srcPtr = 0;
  TQCString dst;
  TQCString src = inSrc.ascii ();
  uint srcLen = inSrc.length();

  /* initialize modified base64 decoding table */
  memset (base64, UNDEFINED, sizeof (base64));
  for (i = 0; i < sizeof (base64chars); ++i)
  {
    base64[(int)base64chars[i]] = i;
  }

  /* loop until end of string */
  while (srcPtr < srcLen)
  {
    c = src[srcPtr++];
    /* deal with literal characters and &- */
    if (c != '&' || src[srcPtr] == '-')
    {
      /* encode literally */
      dst += c;
      /* skip over the '-' if this is an &- sequence */
      if (c == '&')
        srcPtr++;
    }
    else
    {
      /* convert modified UTF-7 -> UTF-16 -> UCS-4 -> UTF-8 -> HEX */
      bitbuf = 0;
      bitcount = 0;
      ucs4 = 0;
      while ((c = base64[(unsigned char) src[srcPtr]]) != UNDEFINED)
      {
        ++srcPtr;
        bitbuf = (bitbuf << 6) | c;
        bitcount += 6;
        /* enough bits for a UTF-16 character? */
        if (bitcount >= 16)
        {
          bitcount -= 16;
          utf16 = (bitcount ? bitbuf >> bitcount : bitbuf) & 0xffff;
          /* convert UTF16 to UCS4 */
          if (utf16 >= UTF16HIGHSTART && utf16 <= UTF16HIGHEND)
          {
            ucs4 = (utf16 - UTF16HIGHSTART) << UTF16SHIFT;
            continue;
          }
          else if (utf16 >= UTF16LOSTART && utf16 <= UTF16LOEND)
          {
            ucs4 += utf16 - UTF16LOSTART + UTF16BASE;
          }
          else
          {
            ucs4 = utf16;
          }
          /* convert UTF-16 range of UCS4 to UTF-8 */
          if (ucs4 <= 0x7fUL)
          {
            utf8[0] = ucs4;
            i = 1;
          }
          else if (ucs4 <= 0x7ffUL)
          {
            utf8[0] = 0xc0 | (ucs4 >> 6);
            utf8[1] = 0x80 | (ucs4 & 0x3f);
            i = 2;
          }
          else if (ucs4 <= 0xffffUL)
          {
            utf8[0] = 0xe0 | (ucs4 >> 12);
            utf8[1] = 0x80 | ((ucs4 >> 6) & 0x3f);
            utf8[2] = 0x80 | (ucs4 & 0x3f);
            i = 3;
          }
          else
          {
            utf8[0] = 0xf0 | (ucs4 >> 18);
            utf8[1] = 0x80 | ((ucs4 >> 12) & 0x3f);
            utf8[2] = 0x80 | ((ucs4 >> 6) & 0x3f);
            utf8[3] = 0x80 | (ucs4 & 0x3f);
            i = 4;
          }
          /* copy it */
          for (c = 0; c < i; ++c)
          {
            dst += utf8[c];
          }
        }
      }
      /* skip over trailing '-' in modified UTF-7 encoding */
      if (src[srcPtr] == '-')
        ++srcPtr;
    }
  }
  return TQString::fromUtf8 (dst.data ());
}

/* replace " with \" and \ with \\ " and \ characters */
TQString rfcDecoder::quoteIMAP(const TQString &src)
{
  uint len = src.length();
  TQString result;
  result.reserve(2 * len);
  for (unsigned int i = 0; i < len; i++)
  {
    if (src[i] == '"' || src[i] == '\\')
      result += '\\';
    result += src[i];
  }
  //result.squeeze(); - unnecessary and slow
  return result;
}

/* Convert Unicode path to modified UTF-7 IMAP mailbox
 */
TQString rfcDecoder::toIMAP (const TQString & inSrc)
{
  unsigned int utf8pos, utf8total, c, utf7mode, bitstogo, utf16flag;
  unsigned long ucs4, bitbuf;
  TQCString src = inSrc.utf8 ();
  TQString dst;

  ulong srcPtr = 0;
  utf7mode = 0;
  utf8total = 0;
  bitstogo = 0;
  utf8pos = 0;
  bitbuf = 0;
  ucs4 = 0;
  while (srcPtr < src.length ())
  {
    c = (unsigned char) src[srcPtr++];
    /* normal character? */
    if (c >= ' ' && c <= '~')
    {
      /* switch out of UTF-7 mode */
      if (utf7mode)
      {
        if (bitstogo)
        {
          dst += base64chars[(bitbuf << (6 - bitstogo)) & 0x3F];
          bitstogo = 0;
        }
        dst += '-';
        utf7mode = 0;
      }
      dst += c;
      /* encode '&' as '&-' */
      if (c == '&')
      {
        dst += '-';
      }
      continue;
    }
    /* switch to UTF-7 mode */
    if (!utf7mode)
    {
      dst += '&';
      utf7mode = 1;
    }
    /* Encode US-ASCII characters as themselves */
    if (c < 0x80)
    {
      ucs4 = c;
      utf8total = 1;
    }
    else if (utf8total)
    {
      /* save UTF8 bits into UCS4 */
      ucs4 = (ucs4 << 6) | (c & 0x3FUL);
      if (++utf8pos < utf8total)
      {
        continue;
      }
    }
    else
    {
      utf8pos = 1;
      if (c < 0xE0)
      {
        utf8total = 2;
        ucs4 = c & 0x1F;
      }
      else if (c < 0xF0)
      {
        utf8total = 3;
        ucs4 = c & 0x0F;
      }
      else
      {
        /* NOTE: can't convert UTF8 sequences longer than 4 */
        utf8total = 4;
        ucs4 = c & 0x03;
      }
      continue;
    }
    /* loop to split ucs4 into two utf16 chars if necessary */
    utf8total = 0;
    do
    {
      if (ucs4 >= UTF16BASE)
      {
        ucs4 -= UTF16BASE;
        bitbuf = (bitbuf << 16) | ((ucs4 >> UTF16SHIFT) + UTF16HIGHSTART);
        ucs4 = (ucs4 & UTF16MASK) + UTF16LOSTART;
        utf16flag = 1;
      }
      else
      {
        bitbuf = (bitbuf << 16) | ucs4;
        utf16flag = 0;
      }
      bitstogo += 16;
      /* spew out base64 */
      while (bitstogo >= 6)
      {
        bitstogo -= 6;
        dst += base64chars[(bitstogo ? (bitbuf >> bitstogo) : bitbuf) & 0x3F];
      }
    }
    while (utf16flag);
  }
  /* if in UTF-7 mode, finish in ASCII */
  if (utf7mode)
  {
    if (bitstogo)
    {
      dst += base64chars[(bitbuf << (6 - bitstogo)) & 0x3F];
    }
    dst += '-';
  }
  return quoteIMAP(dst);
}

//-----------------------------------------------------------------------------
TQString rfcDecoder::decodeQuoting(const TQString &aStr)
{
  TQString result;
  unsigned int strLength(aStr.length());
  for (unsigned int i = 0; i < strLength ; i++)
  {
    if (aStr[i] == "\\") i++;
    result += aStr[i];
  }
  return result;
}

//-----------------------------------------------------------------------------
TQTextCodec *
rfcDecoder::codecForName (const TQString & _str)
{
  if (_str.isEmpty ())
    return NULL;
  return TQTextCodec::codecForName (_str.lower ().
                                   replace ("windows", "cp").latin1 ());
}

//-----------------------------------------------------------------------------
const TQString
rfcDecoder::decodeRFC2047String (const TQString & _str)
{
  TQString throw_away;

  return decodeRFC2047String (_str, throw_away);
}

//-----------------------------------------------------------------------------
const TQString
rfcDecoder::decodeRFC2047String (const TQString & _str, TQString & charset)
{
  TQString throw_away;

  return decodeRFC2047String (_str, charset, throw_away);
}

//-----------------------------------------------------------------------------
const TQString
rfcDecoder::decodeRFC2047String (const TQString & _str, TQString & charset,
                                 TQString & language)
{
  //do we have a rfc string
  if (_str.find("=?") < 0)
    return _str;

  TQCString aStr = _str.ascii ();  // TQString.length() means Unicode chars
  TQCString result;
  char *pos, *beg, *end, *mid = NULL;
  TQCString str;
  char encoding = 0, ch;
  bool valid;
  const int maxLen = 200;
  int i;

//  result.truncate(aStr.length());
  for (pos = aStr.data (); *pos; pos++)
  {
    if (pos[0] != '=' || pos[1] != '?')
    {
      result += *pos;
      continue;
    }
    beg = pos + 2;
    end = beg;
    valid = TRUE;
    // parse charset name
    for (i = 2, pos += 2;
         i < maxLen && (*pos != '?' && (ispunct (*pos) || isalnum (*pos)));
         i++)
      pos++;
    if (*pos != '?' || i < 4 || i >= maxLen)
      valid = FALSE;
    else
    {
      charset = TQCString (beg, i - 1);  // -2 + 1 for the zero
      int pt = charset.findRev('*');
      if (pt != -1)
      {
        // save language for later usage
        language = charset.right (charset.length () - pt - 1);

        // tie off language as defined in rfc2047
        charset.truncate(pt);
      }
      // get encoding and check delimiting question marks
      encoding = toupper (pos[1]);
      if (pos[2] != '?'
          || (encoding != 'Q' && encoding != 'B' && encoding != 'q'
              && encoding != 'b'))
        valid = FALSE;
      pos += 3;
      i += 3;
//    kdDebug(7116) << "rfcDecoder::decodeRFC2047String - charset " << charset << " - language " << language << " - '" << pos << "'" << endl;
    }
    if (valid)
    {
      mid = pos;
      // search for end of encoded part
      while (i < maxLen && *pos && !(*pos == '?' && *(pos + 1) == '='))
      {
        i++;
        pos++;
      }
      end = pos + 2;            //end now points to the first char after the encoded string
      if (i >= maxLen || !*pos)
        valid = FALSE;
    }
    if (valid)
    {
      ch = *pos;
      *pos = '\0';
      str = TQCString (mid).left ((int) (mid - pos - 1));
      if (encoding == 'Q')
      {
        // decode quoted printable text
        for (i = str.length () - 1; i >= 0; i--)
          if (str[i] == '_')
            str[i] = ' ';
//    kdDebug(7116) << "rfcDecoder::decodeRFC2047String - before QP '" << str << "'" << endl;

        str = KCodecs::quotedPrintableDecode(str);
//    kdDebug(7116) << "rfcDecoder::decodeRFC2047String - after QP '" << str << "'" << endl;
      }
      else
      {
        // decode base64 text
        str = KCodecs::base64Decode(str);
      }
      *pos = ch;
      int len = str.length();
      for (i = 0; i < len; i++)
        result += (char) (TQChar) str[i];

      pos = end - 1;
    }
    else
    {
//    kdDebug(7116) << "rfcDecoder::decodeRFC2047String - invalid" << endl;
      //result += "=?";
      //pos = beg -1; // because pos gets increased shortly afterwards
      pos = beg - 2;
      result += *pos++;
      result += *pos;
    }
  }
  if (!charset.isEmpty ())
  {
    TQTextCodec *aCodec = codecForName (charset.ascii ());
    if (aCodec)
    {
//    kdDebug(7116) << "Codec is " << aCodec->name() << endl;
      return aCodec->toUnicode (result);
    }
  }
  return result;
}


//-----------------------------------------------------------------------------
const char especials[17] = "()<>@,;:\"/[]?.= ";

const TQString
rfcDecoder::encodeRFC2047String (const TQString & _str)
{
  if (_str.isEmpty ())
    return _str;
  const signed char *latin = reinterpret_cast<const signed char *>(_str.latin1()), *l, *start, *stop;
  char hexcode;
  int numQuotes, i;
  int rptr = 0;
  // My stats show this number results in 12 resize() out of 73,000
  int resultLen = 3 * _str.length() / 2;
  TQCString result(resultLen);

  while (*latin)
  {
    l = latin;
    start = latin;
    while (*l)
    {
      if (*l == 32)
        start = l + 1;
      if (*l < 0)
        break;
      l++;
    }
    if (*l)
    {
      numQuotes = 1;
      while (*l)
      {
        /* The encoded word must be limited to 75 character */
        for (i = 0; i < 16; i++)
          if (*l == especials[i])
            numQuotes++;
        if (*l < 0)
          numQuotes++;
        /* Stop after 58 = 75 - 17 characters or at "<user@host..." */
        if (l - start + 2 * numQuotes >= 58 || *l == 60)
          break;
        l++;
      }
      if (*l)
      {
        stop = l - 1;
        while (stop >= start && *stop != 32)
          stop--;
        if (stop <= start)
          stop = l;
      }
      else
        stop = l;
      if (resultLen - rptr - 1 <= start -  latin + 1 + 16 /* =?iso-88... */) {
        resultLen += (start - latin + 1) * 2 + 20; // more space
	result.resize(resultLen);
      }
      while (latin < start)
      {
        result[rptr++] = *latin;
        latin++;
      }
      strcpy(&result[rptr], "=?iso-8859-1?q?"); rptr += 15;
      if (resultLen - rptr - 1 <= 3*(stop - latin + 1)) {
        resultLen += (stop - latin + 1) * 4 + 20; // more space
	result.resize(resultLen);
      }
      while (latin < stop) // can add up to 3 chars/iteration
      {
        numQuotes = 0;
        for (i = 0; i < 16; i++)
          if (*latin == especials[i])
            numQuotes = 1;
        if (*latin < 0)
          numQuotes = 1;
        if (numQuotes)
        {
          result[rptr++] = '=';
          hexcode = ((*latin & 0xF0) >> 4) + 48;
          if (hexcode >= 58)
            hexcode += 7;
          result[rptr++] = hexcode;
          hexcode = (*latin & 0x0F) + 48;
          if (hexcode >= 58)
            hexcode += 7;
          result[rptr++] = hexcode;
        }
        else
        {
          result[rptr++] = *latin;
        }
        latin++;
      }
      result[rptr++] = '?';
      result[rptr++] = '=';
    }
    else
    {
      while (*latin)
      {
        if (rptr == resultLen - 1) {
          resultLen += 30;
          result.resize(resultLen);
        }
        result[rptr++] = *latin;
        latin++;
      }
    }
  }
  result[rptr] = 0;
  //free (latinStart);
  return result;
}


//-----------------------------------------------------------------------------
const TQString
rfcDecoder::encodeRFC2231String (const TQString & _str)
{
  if (_str.isEmpty ())
    return _str;
  signed char *latin = (signed char *) calloc (1, _str.length () + 1);
  char *latin_us = (char *) latin;
  strcpy (latin_us, _str.latin1 ());
  signed char *l = latin;
  char hexcode;
  int i;
  bool quote;
  while (*l)
  {
    if (*l < 0)
      break;
    l++;
  }
  if (!*l) {
    free(latin);
    return _str;
  }
  TQCString result;
  l = latin;
  while (*l)
  {
    quote = *l < 0;
    for (i = 0; i < 16; i++)
      if (*l == especials[i])
        quote = true;
    if (quote)
    {
      result += "%";
      hexcode = ((*l & 0xF0) >> 4) + 48;
      if (hexcode >= 58)
        hexcode += 7;
      result += hexcode;
      hexcode = (*l & 0x0F) + 48;
      if (hexcode >= 58)
        hexcode += 7;
      result += hexcode;
    }
    else
    {
      result += *l;
    }
    l++;
  }
  free (latin);
  return result;
}


//-----------------------------------------------------------------------------
const TQString
rfcDecoder::decodeRFC2231String (const TQString & _str)
{
  int p = _str.find ('\'');

  //see if it is an rfc string
  if (p < 0)
    return _str;

  int l = _str.findRev ('\'');

  //second is language
  if (p >= l)
    return _str;

  //first is charset or empty
  TQString charset = _str.left (p);
  TQString st = _str.mid (l + 1);
  TQString language = _str.mid (p + 1, l - p - 1);

  //kdDebug(7116) << "Charset: " << charset << " Language: " << language << endl;

  char ch, ch2;
  p = 0;
  while (p < (int) st.length ())
  {
    if (st.at (p) == 37)
    {
      ch = st.at (p + 1).latin1 () - 48;
      if (ch > 16)
        ch -= 7;
      ch2 = st.at (p + 2).latin1 () - 48;
      if (ch2 > 16)
        ch2 -= 7;
      st.at (p) = ch * 16 + ch2;
      st.remove (p + 1, 2);
    }
    p++;
  }
  return st;
}