tqtinterface/qtinterface/interface_tqt3/tqtextcodec.cpp

/*

Copyright (C) 2010 Timothy Pearson <kb9vqf@pearsoncomputing.net>

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Library General Public License for more details.

You should have received a copy of the GNU Library General Public License
along with this library; see the file COPYING.LIB.  If not, write to
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
Boston, MA 02110-1301, USA.

*/

#include <tqt.h>
#include <ntqtextcodec.h>

#ifdef USE_QT4

// Normalises a codec name for fuzzy comparison: keeps only the letters and
// digits of input (lower-cased) and puts a single space wherever the
// character class changes, e.g. "iso8859-1" becomes "iso 8859 1".
// Between two adjacent letters a space is inserted only when a lower-case
// letter is followed by an upper-case one. A null input gives an empty string.
static QString lettersAndNumbers( const char * input )
{
    QString normalized;

    for ( const char* p = input; p && *p; ++p ) {
        const QChar cur( *p );
        if ( cur.isLetter() || cur.isNumber() )
            normalized += cur.lower();

        // The last character has no successor, hence no boundary to mark.
        if ( !p[1] )
            continue;

        const QChar next( p[1] );
        bool boundary;
        if ( cur.isLetter() && next.isLetter() ) {
            // letter/letter: only a lower -> upper step counts as a boundary
            boundary = ( cur == cur.lower() && next == next.upper() );
        } else {
            // any other pair: boundary whenever the Unicode category differs
            boundary = ( cur.category() != next.category() );
        }
        if ( boundary )
            normalized += ' ';
    }

    // Collapse runs of separators and trim both ends.
    return normalized.simplifyWhiteSpace();
}

// Marker written into QMultiByteUnicodeTable::unicode for an entry that is a
// lead byte of a longer sequence (its value continues in the multiByte
// sub-table). The decoders in this file test the multiByte pointer rather
// than this marker.
#define CHAINED 0xffff

// One slot of a byte-indexed decoding table. Tables are allocated as arrays
// of 256 of these; a slot either carries a code point directly or points at
// a nested 256-entry table that is indexed with the next input byte.
struct QMultiByteUnicodeTable {
    // Code point for this byte; ignored when multiByte is set.
    ushort unicode;
    // Nested table for the following byte, or 0. Owned by this slot.
    QMultiByteUnicodeTable* multiByte;

    // A fresh slot maps to U+FFFD and has no nested table.
    QMultiByteUnicodeTable()
    {
        unicode = 0xfffd;
        multiByte = 0;
    }

    // Releases the nested table (recursively, through the element
    // destructors). delete [] on a null pointer is a no-op.
    ~QMultiByteUnicodeTable()
    {
        delete [] multiByte;
    }
};

// Parses one escaped byte value from a charmap line.
//
// On entry cursor points at the escape character. The character after it
// selects the radix: 'x' for hexadecimal, 'd' for decimal, anything else is
// taken as the first digit of an octal number. On return cursor points just
// past the digits that were consumed, and the value masked to 0..255 is
// returned.
//
// If cursor points at the terminating NUL, 0 is returned and cursor is left
// unchanged. If the escape character is the last character of the string,
// 0 is returned and cursor is advanced onto the terminating NUL, so callers
// always make progress and nothing is read beyond the end of the buffer.
static int getByte(char* &cursor)
{
    int byte = 0;
    if ( *cursor ) {
        if ( !cursor[1] ) {
            // Lone escape character at end of input: skip it only.
            cursor++;
        } else if ( cursor[1] == 'x' ) {
            byte = strtol(cursor+2,&cursor,16);
        } else if ( cursor[1] == 'd' ) {
            byte = strtol(cursor+2,&cursor,10);
        } else {
            // Octal has no radix letter, so the digits start right after
            // the escape character (not one character later).
            byte = strtol(cursor+1,&cursor,8);
        }
    }
    return byte&0xff;
}

class QTextCodecFromIOD;

// Decoder handed out by QTextCodecFromIOD::makeDecoder() for codecs that use
// the multi-byte table. It keeps its position in the nested lookup tables
// between calls, so a multi-byte sequence may be split across two calls to
// convertToUnicode().
class QTextCodecFromIODDecoder : public QTextDecoder {
    // Codec whose to_unicode_multiByte table is used as the top-level table.
    const QTextCodecFromIOD* codec;
    // Table that the next input byte will index: a nested table after a lead
    // byte, otherwise the codec's top-level table.
    QMultiByteUnicodeTable* mb;
public:
    // NOTE(review): the out-of-line definition of this constructor is
    // commented out further down in this file, so as visible here nothing
    // initialises codec or mb -- confirm whether it is defined elsewhere.
    QTextCodecFromIODDecoder(const QTextCodecFromIOD* c);
    //QString toUnicode(const char* chars, int len);
    // Decodes len bytes from chars and returns the characters completed so
    // far; the state argument is not used by the implementation.
    QString convertToUnicode(const char* chars, int len, int *state);
};

// Text codec whose mapping tables are built at run time from a POSIX charmap
// description read from a QIODevice.
//
// Encoding (Unicode -> bytes) uses from_unicode_page, a 256x256 table indexed
// by QChar::row()/cell(); characters that need more than one byte are stored
// as NUL-terminated strings in from_unicode_page_multiByte instead.
// Decoding (bytes -> Unicode) uses either the flat to_unicode table or, once
// a multi-byte sequence has been seen in the charmap, the nested
// to_unicode_multiByte tables.
class QTextCodecFromIOD : public QTextCodec {
    friend class QTextCodecFromIODDecoder;

    // Codec name taken from the <code_set_name> line.
    TQCString n;

    // If from_unicode_page[row()][cell()] is 0 and from_unicode_page_multiByte,
    //  use from_unicode_page_multiByte[row()][cell()] as string.
    char** from_unicode_page;
    char*** from_unicode_page_multiByte;
    // Byte emitted for characters that have no mapping.
    char unkn;

    // Only one of these is used
    ushort* to_unicode;
    QMultiByteUnicodeTable* to_unicode_multiByte;
    // Longest byte sequence seen in the charmap; sizes the encode buffer.
    int max_bytes_per_char;
    // Names collected from "<comment_char> alias ..." lines.
    TQStrList aliases;

    bool stateless() const { return !to_unicode_multiByte; }

    // Locale-independent test for [0-9A-Fa-f].
    static bool isHexDigit( char ch )
    {
        return ( ch >= '0' && ch <= '9' ) ||
               ( ch >= 'A' && ch <= 'F' ) ||
               ( ch >= 'a' && ch <= 'f' );
    }

public:
    // Parses the charmap from iod line by line. Recognised lines are
    // <code_set_name>, <escape_char>, <comment_char>, "<comment> alias NAME",
    // CHARMAP, END CHARMAP and, between the last two, mapping lines that
    // contain a <Uxxxx> code point and one or more escaped bytes.
    // If no CHARMAP line is found, ok() returns false afterwards.
    QTextCodecFromIOD(QIODevice* iod)
    {
        from_unicode_page = 0;
        to_unicode_multiByte = 0;
        to_unicode = 0;
        from_unicode_page_multiByte = 0;
        max_bytes_per_char = 1;

        const int maxlen=100;
        char line[maxlen];
        char esc='\\';
        char comm='%';
        bool incmap = FALSE;
        while (iod->readLine(line,maxlen) > 0) {
            if (0==qstrnicmp(line,"<code_set_name>",15))
                n = line+15;
            else if (0==qstrnicmp(line,"<escape_char> ",14))
                esc = line[14];
            else if (0==qstrnicmp(line,"<comment_char> ",15))
                comm = line[15];
            else if (line[0]==comm && 0==qstrnicmp(line+1," alias ",7)) {
                aliases.append(line+8);
            } else if (0==qstrnicmp(line,"CHARMAP",7)) {
                if (!from_unicode_page) {
                    from_unicode_page = new char*[256];
                    for (int i=0; i<256; i++)
                        from_unicode_page[i]=0;
                }
                // Allocate the flat table only while neither decoding table
                // exists, so a repeated CHARMAP line cannot create a second
                // table next to the multi-byte one.
                if (!to_unicode && !to_unicode_multiByte) {
                    to_unicode = new ushort[256];
                    // Unmapped bytes decode to U+FFFD, the same default that
                    // QMultiByteUnicodeTable uses, instead of to
                    // uninitialised memory.
                    for (int i=0; i<256; i++)
                        to_unicode[i] = 0xfffd;
                }
                incmap = TRUE;
            } else if (0==qstrnicmp(line,"END CHARMAP",11))
                break;
            else if (incmap) {
                char* cursor = line;
                int byte=-1,unicode=-1;
                ushort* mb_unicode=0;
                const int maxmb=8; // more -> we'll need to improve datastructures
                char mb[maxmb+1];
                int nmb=0;

                while (*cursor) {
                    if (cursor[0]=='<' && cursor[1]=='U' &&
                        isHexDigit(cursor[2]) && isHexDigit(cursor[3])) {

                        // Two leading decimal digits are accepted
                        // unconditionally (the historical rule). Tokens that
                        // start with hex letters, e.g. <U4E00>, must be
                        // closed by '>' right after the digits so that
                        // symbolic names such as <Uacute> are not mistaken
                        // for code points.
                        const bool decimalLead =
                            cursor[2] <= '9' && cursor[3] <= '9';
                        char* end = 0;
                        const long value = strtol(cursor+2,&end,16);
                        if ( decimalLead || *end == '>' ) {
                            // Values outside the BMP are not representable
                            // here; mark them as "no code point".
                            unicode = ( value >= 0 && value <= 0xffff )
                                      ? (int)value : -1;
                            cursor = end;
                        } else {
                            cursor++;
                        }

                    } else if (*cursor==esc) {

                        byte = getByte(cursor);

                        if ( *cursor == esc ) {
                            // A second escape follows immediately: this is a
                            // multi-byte sequence. Switch the decoder to the
                            // nested tables on first use.
                            if ( !to_unicode_multiByte ) {
                                to_unicode_multiByte =
                                    new QMultiByteUnicodeTable[256];
                                for (int i=0; i<256; i++) {
                                    to_unicode_multiByte[i].unicode =
                                        to_unicode[i];
                                    to_unicode_multiByte[i].multiByte = 0;
                                }
                                delete [] to_unicode;
                                to_unicode = 0;
                            }
                            QMultiByteUnicodeTable* mbut =
                                to_unicode_multiByte+byte;
                            // mb holds at most maxmb bytes plus terminator;
                            // never write past it, even if a line carries
                            // several escape sequences.
                            if ( nmb < maxmb )
                                mb[nmb++] = byte;
                            while ( nmb < maxmb && *cursor == esc ) {
                                mbut->unicode = CHAINED;
                                byte = getByte(cursor);
                                mb[nmb++] = byte;
                                if (!mbut->multiByte) {
                                    mbut->multiByte =
                                        new QMultiByteUnicodeTable[256];
                                }
                                mbut = mbut->multiByte+byte;
                                mb_unicode = & mbut->unicode;
                            }

                            if ( nmb > max_bytes_per_char )
                                max_bytes_per_char = nmb;
                        }
                    } else {
                        cursor++;
                    }
                }

                // Record the mapping only if the line supplied both a BMP
                // code point and a byte value; a line without any escaped
                // byte would otherwise index the tables with -1.
                const bool haveBytes = mb_unicode || byte >= 0;
                if (haveBytes && unicode >= 0 && unicode <= 0xffff)
                {
                    QChar ch((ushort)unicode);
                    if (!from_unicode_page[ch.row()]) {
                        from_unicode_page[ch.row()] = new char[256];
                        for (int i=0; i<256; i++)
                            from_unicode_page[ch.row()][i]=0;
                    }
                    if ( mb_unicode ) {
                        from_unicode_page[ch.row()][ch.cell()] = 0;
                        if (!from_unicode_page_multiByte) {
                            from_unicode_page_multiByte = new char**[256];
                            for (int i=0; i<256; i++)
                                from_unicode_page_multiByte[i]=0;
                        }
                        if (!from_unicode_page_multiByte[ch.row()]) {
                            from_unicode_page_multiByte[ch.row()] = new char*[256];
                            for (int i=0; i<256; i++)
                                from_unicode_page_multiByte[ch.row()][i] = 0;
                        }
                        mb[nmb] = 0;
                        char*& slot =
                            from_unicode_page_multiByte[ch.row()][ch.cell()];
                        // A code point listed twice replaces the earlier
                        // string; free it rather than leaking it.
                        delete [] slot;
                        slot = qstrdup(mb);
                        *mb_unicode = unicode;
                    } else {
                        from_unicode_page[ch.row()][ch.cell()] = (char)byte;
                        if ( to_unicode )
                            to_unicode[byte] = unicode;
                        else
                            to_unicode_multiByte[byte].unicode = unicode;
                    }
                }
            }
        }
        n = n.stripWhiteSpace();

        unkn = '?'; // ##### Might be a bad choice.
    }

    // Frees every table built by the constructor, including the top-level
    // arrays and the per-row arrays of the multi-byte encode table.
    ~QTextCodecFromIOD()
    {
        if ( from_unicode_page ) {
            for (int i=0; i<256; i++)
                delete [] from_unicode_page[i];
            delete [] from_unicode_page;
        }
        if ( from_unicode_page_multiByte ) {
            for (int i=0; i<256; i++) {
                char** row = from_unicode_page_multiByte[i];
                if ( !row )
                    continue;
                for (int j=0; j<256; j++)
                    delete [] row[j];
                delete [] row;
            }
            delete [] from_unicode_page_multiByte;
        }
        delete [] to_unicode;
        delete [] to_unicode_multiByte;
    }

    // True once a CHARMAP line has been seen, i.e. the tables exist.
    bool ok() const
    {
        return !!from_unicode_page;
    }

    // Single-byte codecs use the base-class decoder; multi-byte codecs need
    // the decoder that remembers its position between calls.
    QTextDecoder* makeDecoder() const
    {
        if ( stateless() )
            return QTextCodec::makeDecoder();
        else
            return new QTextCodecFromIODDecoder(this);
    }

    // Name from the <code_set_name> line, with surrounding whitespace removed.
    const char* qtio_name() const
    {
        return n;
    }

    int mibEnum() const
    {
        return 0; // #### Unknown.
    }

    // Content-based detection is not implemented for charmap codecs.
    int heuristicContentMatch(const char*, int) const
    {
        return 0;
    }

    // Best score of the base-class name match and a simple match against
    // every alias declared in the charmap.
    int heuristicNameMatch(const char* hint) const
    {
        int bestr = QTextCodec::heuristicNameMatch(hint);
        TQStrListIterator it(aliases);
        char* a;
        while ((a=it.current())) {
            ++it;
            int r = simpleHeuristicNameMatch(a,hint);
            if (r > bestr)
                bestr = r;
        }
        return bestr;
    }

    // Decodes len bytes from chars. With the multi-byte table, a lead byte
    // selects a nested table and produces no output by itself; an incomplete
    // sequence at the end of the input is dropped.
    QString toUnicode(const char* chars, int len) const
    {
        const uchar* uchars = (const uchar*)chars;
        QString result;
        QMultiByteUnicodeTable* multiByte=to_unicode_multiByte;
        if ( multiByte ) {
            while (len--) {
                QMultiByteUnicodeTable& mb = multiByte[*uchars];
                if ( mb.multiByte ) {
                    // Chained multi-byte
                    multiByte = mb.multiByte;
                } else {
                    result += QChar(mb.unicode);
                    multiByte=to_unicode_multiByte;
                }
                uchars++;
            }
        } else {
            while (len--)
                result += QChar(to_unicode[*uchars++]);
        }
        return result;
    }

    // Qt4-style entry point; forwards to toUnicode() and ignores state.
    QString convertToUnicode(const char* chars, int len, ConverterState *state) const
    {
        return toUnicode(chars, len);
    }

#if !defined(Q_NO_USING_KEYWORD)
    using QTextCodec::fromUnicode;
#endif
    // Encodes the first lenInOut characters of uc. On return lenInOut holds
    // the number of bytes counted for the result. Characters without a
    // mapping are written as unkn.
    // NOTE(review): a null QChar writes a 0 byte but is not counted in
    // lenInOut -- kept as found; confirm this is intended.
    TQCString fromUnicode(const QString& uc, int& lenInOut) const
    {
        if (lenInOut > (int)uc.length())
            lenInOut = uc.length();
        int rlen = lenInOut*max_bytes_per_char;
        TQCString rstr(rlen+1);
        char* cursor = rstr.data();
        char* s=0;
        int l = lenInOut;
        int lout = 0;
        for (int i=0; i<l; i++) {
            QChar ch = uc[i];
            if ( ch == QChar() ) {
                // special
                *cursor++ = 0;
            } else if ( from_unicode_page[ch.row()] &&
                from_unicode_page[ch.row()][ch.cell()] )
            {
                *cursor++ = from_unicode_page[ch.row()][ch.cell()];
                lout++;
            } else if ( from_unicode_page_multiByte &&
                      from_unicode_page_multiByte[ch.row()] &&
                      (s=from_unicode_page_multiByte[ch.row()][ch.cell()]) )
            {
                while (*s) {
                    *cursor++ = *s++;
                    lout++;
                }
            } else {
                *cursor++ = unkn;
                lout++;
            }
        }
        *cursor = 0;
        lenInOut = lout;
        return rstr;
    }

    // Qt4-style entry point; encodes exactly len characters starting at
    // charin and ignores state.
    // The QString is built explicitly with its length so that the input does
    // not have to be NUL-terminated and so that the call below can only bind
    // to the (const QString&, int&) overload of this class.
    // NOTE(review): with the using-declaration above, a call written as
    // fromUnicode(charin, len) may instead select an inherited
    // (const QChar*, int, ...) overload if the base class has one -- verify
    // against the QTextCodec header in use.
    QByteArray convertFromUnicode(const QChar *charin, int len, ConverterState *state) const
    {
        QString uc(charin, len);
        return fromUnicode(uc, len);
    }

    QByteArray name() const
    {
        return qtio_name();
    }
};

// QTextCodecFromIODDecoder::QTextCodecFromIODDecoder(const QTextCodecFromIOD* c) :
//     codec(c)
// {
//     mb = codec->to_unicode_multiByte;
// }

// Decodes len bytes starting at chars and returns the characters completed
// by them. The current table (mb) is kept between calls, so a sequence may
// continue in the next call. Entries that map to code point 0 produce no
// output. The state argument is not used.
QString QTextCodecFromIODDecoder::convertToUnicode(const char* chars, int len, int *state)
{
    QString decoded;
    const uchar* next = (const uchar*)chars;

    for ( int remaining = len; remaining-- != 0; ++next ) {
        QMultiByteUnicodeTable& entry = mb[*next];

        if ( entry.multiByte ) {
            // Lead byte: descend into the nested table for the next byte.
            mb = entry.multiByte;
            continue;
        }

        // Final byte of a sequence: emit it and restart at the top table.
        if ( entry.unicode != 0 )
            decoded += QChar(entry.unicode);
        mb = codec->to_unicode_multiByte;
    }

    return decoded;
}

#ifndef TQT_NO_CODECS
// Cannot use <pre> or \code
/*!
    Builds a codec from the POSIX2 charmap definition read from \a iod.
    The following lines are understood by the parser:

<font name="sans">
&nbsp;&nbsp;&lt;code_set_name&gt; <i>name</i></br>
&nbsp;&nbsp;&lt;escape_char&gt; <i>character</i></br>
&nbsp;&nbsp;% alias <i>alias</i></br>
&nbsp;&nbsp;CHARMAP</br>
&nbsp;&nbsp;&lt;<i>token</i>&gt; /x<i>hexbyte</i> &lt;U<i>unicode</i>&gt; ...</br>
&nbsp;&nbsp;&lt;<i>token</i>&gt; /d<i>decbyte</i> &lt;U<i>unicode</i>&gt; ...</br>
&nbsp;&nbsp;&lt;<i>token</i>&gt; /<i>octbyte</i> &lt;U<i>unicode</i>&gt; ...</br>
&nbsp;&nbsp;&lt;<i>token</i>&gt; /<i>any</i>/<i>any</i>... &lt;U<i>unicode</i>&gt; ...</br>
&nbsp;&nbsp;END CHARMAP</br>
</font>

    Returns the new QTextCodec, or 0 if the input contained no CHARMAP
    section. The name() of the result is taken from the code_set_name.
    The codec is also added to the global list of codecs (presumably by
    the QTextCodec base class; that part is not visible in this file).

    Note that a codec constructed in this way uses much more memory
    and is slower than a hand-written QTextCodec subclass, since
    tables in code are kept in memory shared by all Qt applications.

    \sa loadCharmapFile()
*/
QTextCodec* QTextCodec::loadCharmap(QIODevice* iod)
{
    QTextCodecFromIOD* codec = new QTextCodecFromIOD(iod);
    if ( codec->ok() )
        return codec;

    // Nothing usable was parsed: discard the half-built codec.
    delete codec;
    return 0;
}

/*!
    Convenience wrapper around the charmap parser: opens the file
    \a filename read-only and builds a codec from its contents.

    Returns 0 if the file cannot be opened or contains no CHARMAP
    section.
*/
QTextCodec* QTextCodec::loadCharmapFile(QString filename)
{
    QFile f(filename);
    if ( !f.open(IO_ReadOnly) )
        return 0;

    QTextCodecFromIOD* codec = new QTextCodecFromIOD(&f);
    if ( codec->ok() )
        return codec;

    // The file held no usable charmap: discard the half-built codec.
    delete codec;
    return 0;
}

/*!
    Returns a value indicating how likely it is that this decoder is
    appropriate for decoding some format that has the given name. The
    name is compared with the \a hint.

    A good match returns a positive number around the length of the
    string. A bad match is negative.

    The default implementation calls simpleHeuristicNameMatch() with
    the name of the codec, as returned by name(), and \a hint, and
    returns that score unchanged.
*/
int QTextCodec::heuristicNameMatch(const char* hint) const
{
    // name() is evaluated inside the call expression on purpose: if it
    // returns a temporary, that temporary stays alive for the whole call.
    return simpleHeuristicNameMatch(name(),hint);
}

/*!
    A simple utility function for heuristicNameMatch(): it does some
    very minor character-skipping so that almost-exact matches score
    high. \a name is the text we're matching and \a hint is used for
    the comparison.
*/
int QTextCodec::simpleHeuristicNameMatch(const char* name, const char* hint)
{
    // if they're the same, return a perfect score.
    if ( name && hint && *name && *hint && qstricmp( name, hint ) == 0 )
	return qstrlen( hint );

    // if the letters and numbers are the same, we have an "almost"
    // perfect match.
    QString h( lettersAndNumbers( hint ) );
    QString n( lettersAndNumbers( name ) );
    if ( h == n )
	return qstrlen( hint )-1;

    if ( h.stripWhiteSpace() == n.stripWhiteSpace() )
	return qstrlen( hint )-2;

    // could do some more here, but I don't think it's worth it

    return 0;
}

#endif //TQT_NO_CODECS

#endif // USE_QT4