tdelibs/tdespell2/plugins/ispell/ispell_checker.cpp

/* tdespell2 - adopted from Enchant
 * Copyright (C) 2003 Dom Lachowicz
 * Copyright (C) 2004 Zack Rusin <zack@kde.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 *
 * In addition, as a special exception, Dom Lachowicz
 * gives permission to link the code of this program with
 * non-LGPL Spelling Provider libraries (eg: a MSFT Office
 * spell checker backend) and distribute linked combinations including
 * the two.  You must obey the GNU Lesser General Public License in all
 * respects for all of the code used other than said providers.  If you modify
 * this file, you may extend this exception to your version of the
 * file, but you are not obligated to do so.  If you do not wish to
 * do so, delete this exception statement from your version.
 */

#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <string>
#include <vector>

#include "sp_spell.h"
#include "ispell_checker.h"

#include <tqmap.h>
#include <tqdir.h>
#include <tqfileinfo.h>

/***************************************************************************/

/* One row of the language table: ties a language tag to the ispell hash
 * file that serves it and to the name of the encoding used with it. */
struct str_ispell_map
{
	const char * lang;	/* language tag, e.g. "en_US" */
	const char * dict;	/* hash file name, e.g. "american.hash" */
	const char * enc;	/* encoding name, e.g. "iso-8859-1" */
};
typedef struct str_ispell_map IspellMap;

/* Directories that are searched, in array order, for ispell hash files.
 * When ISPELL_LIBDIR is defined it is the only directory listed;
 * otherwise the built-in list below is used.  The array ends with a null
 * entry, which the loops walking it use as their stop condition. */
static const char *ispell_dirs [] = {
#ifdef ISPELL_LIBDIR
	ISPELL_LIBDIR,
#else
	"/usr/" SYSTEM_LIBDIR "/ispell",
	"/usr/lib/ispell",
	"/usr/local/" SYSTEM_LIBDIR "/ispell",
	"/usr/local/lib/ispell",
	"/usr/local/share/ispell",
	"/usr/share/ispell",
	/* NOTE(review): unlike the entries above, this one has no "/ispell"
	 * suffix -- confirm that is intended. */
	"/usr/pkg/lib",
#endif
	0
};
/* Language table: { language tag, hash file name, encoding name }.
 * loadDictionaryForLanguage() looks a tag up with an exact strcmp() and
 * takes the first matching row; s_allDics() compares the hash file names
 * against the files it finds in the search directories.  A bare language
 * code ("en", "pt", ...) has its own row, so it selects whichever hash
 * file that row names. */
static const IspellMap ispell_map [] = {
	{"ca"    ,"catala.hash"         ,"iso-8859-1" },
	{"ca_ES" ,"catala.hash"         ,"iso-8859-1" },
	{"cs"    ,"czech.hash"          ,"iso-8859-2" },
	{"cs_CZ" ,"czech.hash"          ,"iso-8859-2" },
	{"da"    ,"dansk.hash"          ,"iso-8859-1" },
	{"da_DK" ,"dansk.hash"          ,"iso-8859-1" },
	{"de"    ,"deutsch.hash"        ,"iso-8859-1" },
	{"de_CH" ,"swiss.hash"          ,"iso-8859-1" },
	{"de_AT" ,"deutsch.hash"        ,"iso-8859-1" },
	{"de_DE" ,"deutsch.hash"        ,"iso-8859-1" },
	{"el"    ,"ellhnika.hash"       ,"iso-8859-7" },
	{"el_GR" ,"ellhnika.hash"       ,"iso-8859-7" },
	{"en"    ,"british.hash"        ,"iso-8859-1" },
	{"en_AU" ,"british.hash"        ,"iso-8859-1" },
	{"en_BZ" ,"british.hash"        ,"iso-8859-1" },
	{"en_CA" ,"british.hash"        ,"iso-8859-1" },
	{"en_GB" ,"british.hash"        ,"iso-8859-1" },
	{"en_IE" ,"british.hash"        ,"iso-8859-1" },
	{"en_JM" ,"british.hash"        ,"iso-8859-1" },
	{"en_NZ" ,"british.hash"        ,"iso-8859-1" },
	{"en_TT" ,"british.hash"        ,"iso-8859-1" },
	{"en_ZA" ,"british.hash"        ,"iso-8859-1" },
	{"en_ZW" ,"british.hash"        ,"iso-8859-1" },
	{"en_PH" ,"american.hash"       ,"iso-8859-1" },
	{"en_US" ,"american.hash"       ,"iso-8859-1" },
	{"eo"    ,"esperanto.hash"      ,"iso-8859-3" },
	{"es"    ,"espanol.hash"        ,"iso-8859-1" },
	{"es_AR" ,"espanol.hash"        ,"iso-8859-1" },
	{"es_BO" ,"espanol.hash"        ,"iso-8859-1" },
	{"es_CL" ,"espanol.hash"        ,"iso-8859-1" },
	{"es_CO" ,"espanol.hash"        ,"iso-8859-1" },
	{"es_CR" ,"espanol.hash"        ,"iso-8859-1" },
	{"es_DO" ,"espanol.hash"        ,"iso-8859-1" },
	{"es_EC" ,"espanol.hash"        ,"iso-8859-1" },
	{"es_ES" ,"espanol.hash"        ,"iso-8859-1" },
	{"es_GT" ,"espanol.hash"        ,"iso-8859-1" },
	{"es_HN" ,"espanol.hash"        ,"iso-8859-1" },
	{"es_MX" ,"espanol.hash"        ,"iso-8859-1" },
	{"es_NI" ,"espanol.hash"        ,"iso-8859-1" },
	{"es_PA" ,"espanol.hash"        ,"iso-8859-1" },
	{"es_PE" ,"espanol.hash"        ,"iso-8859-1" },
	{"es_PR" ,"espanol.hash"        ,"iso-8859-1" },
	{"es_PY" ,"espanol.hash"        ,"iso-8859-1" },
	{"es_SV" ,"espanol.hash"        ,"iso-8859-1" },
	{"es_UY" ,"espanol.hash"        ,"iso-8859-1" },
	{"es_VE" ,"espanol.hash"        ,"iso-8859-1" },
	{"fi"    ,"finnish.hash"        ,"iso-8859-1" },
	{"fi_FI" ,"finnish.hash"        ,"iso-8859-1" },
	{"fr"    ,"francais.hash"       ,"iso-8859-1" },
	{"fr_BE" ,"francais.hash"       ,"iso-8859-1" },
	{"fr_CA" ,"francais.hash"       ,"iso-8859-1" },
	{"fr_CH" ,"francais.hash"       ,"iso-8859-1" },
	{"fr_FR" ,"francais.hash"       ,"iso-8859-1" },
	{"fr_LU" ,"francais.hash"       ,"iso-8859-1" },
	{"fr_MC" ,"francais.hash"       ,"iso-8859-1" },
	{"hu"    ,"hungarian.hash"      ,"iso-8859-2" },
	{"hu_HU" ,"hungarian.hash"      ,"iso-8859-2" },
	{"ga"    ,"irish.hash"          ,"iso-8859-1" },
	{"ga_IE" ,"irish.hash"          ,"iso-8859-1" },
	{"gl"    ,"galician.hash"       ,"iso-8859-1" },
	{"gl_ES" ,"galician.hash"       ,"iso-8859-1" },
	{"ia"    ,"interlingua.hash"    ,"iso-8859-1" },
	{"it"    ,"italian.hash"        ,"iso-8859-1" },
	{"it_IT" ,"italian.hash"        ,"iso-8859-1" },
	{"it_CH" ,"italian.hash"        ,"iso-8859-1" },
	{"la"    ,"mlatin.hash"         ,"iso-8859-1" },
	{"la_IT" ,"mlatin.hash"         ,"iso-8859-1" },
	{"lt"    ,"lietuviu.hash"       ,"iso-8859-13" },
	{"lt_LT" ,"lietuviu.hash"       ,"iso-8859-13" },
	{"nl"    ,"nederlands.hash"     ,"iso-8859-1" },
	{"nl_NL" ,"nederlands.hash"     ,"iso-8859-1" },
	{"nl_BE" ,"nederlands.hash"     ,"iso-8859-1" },
	{"nb"    ,"norsk.hash"          ,"iso-8859-1" },
	{"nb_NO" ,"norsk.hash"          ,"iso-8859-1" },
	{"nn"    ,"nynorsk.hash"        ,"iso-8859-1" },
	{"nn_NO" ,"nynorsk.hash"        ,"iso-8859-1" },
	{"no"    ,"norsk.hash"          ,"iso-8859-1" },
	{"no_NO" ,"norsk.hash"          ,"iso-8859-1" },
	{"pl"    ,"polish.hash"         ,"iso-8859-2" },
	{"pl_PL" ,"polish.hash"         ,"iso-8859-2" },
	{"pt"    ,"brazilian.hash"      ,"iso-8859-1" },
	{"pt_BR" ,"brazilian.hash"      ,"iso-8859-1" },
	{"pt_PT" ,"portugues.hash"      ,"iso-8859-1" },
	{"ru"    ,"russian.hash"        ,"koi8-r" },
	{"ru_MD" ,"russian.hash"        ,"koi8-r" },
	{"ru_RU" ,"russian.hash"        ,"koi8-r" },
	{"sc"    ,"sardinian.hash"      ,"iso-8859-1" },
	{"sc_IT" ,"sardinian.hash"      ,"iso-8859-1" },
	{"sk"    ,"slovak.hash"         ,"iso-8859-2" },
	{"sk_SK" ,"slovak.hash"         ,"iso-8859-2" },
	{"sl"    ,"slovensko.hash"      ,"iso-8859-2" },
	{"sl_SI" ,"slovensko.hash"      ,"iso-8859-2" },
	{"sv"    ,"svenska.hash"        ,"iso-8859-1" },
	{"sv_SE" ,"svenska.hash"        ,"iso-8859-1" },
	{"uk"    ,"ukrainian.hash"      ,"koi8-u" },
	{"uk_UA" ,"ukrainian.hash"      ,"koi8-u" },
	{"yi"    ,"yiddish-yivo.hash"   ,"utf-8" }
};

/* Number of rows in ispell_map. */
static const size_t size_ispell_map = sizeof(ispell_map) / sizeof(ispell_map[0]);

/* Filled by s_allDics(): language tag -> hash file found in a search directory. */
static TQMap<TQString, TQString> ispell_dict_map;


/*
 * Look up the text codec named by inEncoding and store it in
 * m_translate_in.  A null or empty name leaves m_translate_in untouched.
 */
void
ISpellChecker::try_autodetect_charset(const char * const inEncoding)
{
	if (!inEncoding || inEncoding[0] == '\0')
		return;

	m_translate_in = TQTextCodec::codecForName(inEncoding);
}

/***************************************************************************/
/***************************************************************************/

/*
 * Construct a checker with no dictionary loaded: deftflag and
 * prefstringchar start at -1, the init flag is false, every pointer
 * member is null and both flag index tables are zero-filled.
 */
ISpellChecker::ISpellChecker()
	: deftflag(-1),
	  prefstringchar(-1),
	  m_bSuccessfulInit(false),
	  m_BC(NULL),
	  m_cd(NULL),
	  m_cl(NULL),
	  m_cm(NULL),
	  m_ho(NULL),
	  m_nd(NULL),
	  m_so(NULL),
	  m_se(NULL),
	  m_ti(NULL),
	  m_te(NULL),
	  m_hashstrings(NULL),
	  m_hashtbl(NULL),
	  m_pflaglist(NULL),
	  m_sflaglist(NULL),
	  m_chartypes(NULL),
	  m_infile(NULL),
	  m_outfile(NULL),
	  m_askfilename(NULL),
	  m_Trynum(0),
	  m_translate_in(0)
{
	/* Both index tables start out empty. */
	memset(m_pflagindex, 0, sizeof m_pflagindex);
	memset(m_sflagindex, 0, sizeof m_sflagindex);
}

#ifndef FREEP
/* free() the pointer unless it is null.  The do/while wrapper makes the
 * macro behave as a single statement, e.g. inside an unbraced if/else. */
#define FREEP(p)        do { if ((p) != 0) { free(p); } } while (0)
#endif

/*
 * Release the buffers held for the loaded dictionary.
 */
ISpellChecker::~ISpellChecker()
{
	if (m_bSuccessfulInit) {
		// only cleanup our mess if we were successfully initialized

		clearindex (m_pflagindex);
		clearindex (m_sflagindex);
	}

	// NOTE(review): m_pflaglist is not released here -- presumably it does
	// not own a separate allocation; confirm against the loader.
	FREEP(m_hashtbl);
	FREEP(m_hashstrings);
	FREEP(m_sflaglist);
	FREEP(m_chartypes);

	// Every codec stored in m_translate_in in this file comes from
	// TQTextCodec::codecForName(), which hands out codecs that are owned
	// by the toolkit and shared between callers.  Deleting one here would
	// pull it out from under any other user (and a second checker with the
	// same encoding would delete it twice), so only drop our reference.
	m_translate_in = 0;
}

bool
ISpellChecker::checkWord( const TQString& utf8Word )
{
	ichar_t iWord[INPUTWORDLEN + MAXAFFIXLEN];
	if (!m_bSuccessfulInit)
		return false;

	if (!utf8Word || utf8Word.length() >= (INPUTWORDLEN + MAXAFFIXLEN) || utf8Word.isEmpty())
		return false;

	bool retVal = false;
	TQCString out;
	if (!m_translate_in)
		return false;
	else {
		/* convert to 8bit string and null terminate */
		int len_out = utf8Word.length();

		out = m_translate_in->fromUnicode( utf8Word, len_out );
	}

	if (!strtoichar(iWord, out.data(), INPUTWORDLEN + MAXAFFIXLEN, 0))
		{
			if (good(iWord, 0, 0, 1, 0) == 1 ||
			    compoundgood(iWord, 1) == 1)
				{
					retVal = true;
				}
		}

	return retVal;
}

/*
 * Produce replacement suggestions for a word.
 *
 * Returns an empty list when no dictionary has been initialised, when no
 * codec is set, when the word is empty or has INPUTWORDLEN + MAXAFFIXLEN
 * or more characters, or when strtoichar() reports failure.  Otherwise
 * makepossibilities() is run and the m_pcount entries it leaves in
 * m_possibilities are converted back to Unicode with the same codec.
 */
TQStringList
ISpellChecker::suggestWord(const TQString& utf8Word)
{
	if (!m_bSuccessfulInit)
		return TQStringList();

	if (utf8Word.isEmpty() || utf8Word.length() >= (INPUTWORDLEN + MAXAFFIXLEN))
		return TQStringList();

	/* Without a codec the word can be neither encoded nor decoded. */
	if (!m_translate_in)
		return TQStringList();

	/* convert to 8bit string and null terminate */
	int len_out = utf8Word.length();
	TQCString out = m_translate_in->fromUnicode( utf8Word, len_out );

	ichar_t iWord[INPUTWORDLEN + MAXAFFIXLEN];
	if (strtoichar(iWord, out.data(), INPUTWORDLEN + MAXAFFIXLEN, 0))
		return TQStringList();

	makepossibilities(iWord);

	TQStringList sugg_arr;
	for (int c = 0; c < m_pcount; c++)
	{
		/* m_translate_in is known to be set here, so every candidate
		 * is decoded with it. */
		sugg_arr.append( m_translate_in->toUnicode( m_possibilities[c] ) );
	}

	return sugg_arr;
}

static void
s_buildHashNames (std::vector<std::string> & names, const char * dict)
{
	const char * tmp = 0;
	int i = 0;

	names.clear ();

	while ( (tmp = ispell_dirs[i++]) ) {
		TQCString maybeFile = TQCString( tmp ) + '/';
		maybeFile += dict;
		names.push_back( maybeFile.data() );
	}
}

static void
s_allDics()
{
	const char * tmp = 0;
	int i = 0;

	while ( (tmp = ispell_dirs[i++]) ) {
		TQDir dir( tmp );
		TQStringList lst = dir.entryList( "*.hash" );
		for ( TQStringList::Iterator it = lst.begin(); it != lst.end(); ++it ) {
			TQFileInfo info( *it );
			for (size_t i = 0; i < size_ispell_map; i++)
			{
				const IspellMap * mapping = (const IspellMap *)(&(ispell_map[i]));
				if (!strcmp (info.fileName().latin1(), mapping->dict))
				{
					ispell_dict_map.insert( mapping->lang, *it );
				}
			}
		}
	}
}

/*
 * Return the language tags recorded in ispell_dict_map, running the
 * directory scan first whenever the map is still empty.
 */
TQValueList<TQString>
ISpellChecker::allDics()
{
	const bool needsScan = ispell_dict_map.empty();
	if ( needsScan )
		s_allDics();

	return ispell_dict_map.keys();
}

/*
 * Try to initialise from the hash file szdict in each search directory
 * in turn.  Returns the first path for which linit() does not return a
 * negative value, or a null string when none succeeds.
 */
TQString
ISpellChecker::loadDictionary (const char * szdict)
{
	std::vector<std::string> candidates;
	s_buildHashNames (candidates, szdict);

	typedef std::vector<std::string>::const_iterator PathIter;
	for (PathIter path = candidates.begin(); path != candidates.end(); ++path)
		{
			const char * const fileName = path->c_str();

			/* linit() takes a non-const pointer */
			if (linit(const_cast<char*>(fileName)) < 0)
				continue;

			return fileName;
		}

	return TQString::null;
}

/*!
 * Load ispell dictionary hash file for given language.
 *
 * \param szLang -  The language tag ("en-US") we want to use
 * \return The name of the dictionary file
 */
bool
ISpellChecker::loadDictionaryForLanguage ( const char * szLang )
{
	TQString hashname;

	const char * encoding = NULL;
	const char * szFile = NULL;

	for (size_t i = 0; i < size_ispell_map; i++)
		{
			const IspellMap * mapping = (const IspellMap *)(&(ispell_map[i]));
			if (!strcmp (szLang, mapping->lang))
				{
					szFile = mapping->dict;
					encoding = mapping->enc;
					break;
				}
		}

	if (!szFile || !strlen(szFile))
		return false;

	alloc_ispell_struct();

	hashname = loadDictionary(szFile);
	if (hashname.isEmpty())
		return false;

	// one of the two above calls succeeded
	setDictionaryEncoding (hashname, encoding);

	return true;
}

/*
 * Choose the codec stored in m_translate_in and set prefstringchar from
 * the string types known to findfiletype().
 *
 * \param hashname - name of the loaded hash file (not consulted here)
 * \param encoding - encoding name from the language table; may be null
 *                   or empty, in which case the codec is chosen by
 *                   probing "utf8" and then "latin1" .. "latin15"
 */
void
ISpellChecker::setDictionaryEncoding( const TQString& hashname, const char * encoding )
{
	/* Prefer the encoding named by the language table. */
	try_autodetect_charset(encoding);

	if (m_translate_in)
		{
			/* We still have to setup prefstringchar*/
			prefstringchar = findfiletype("utf8", 1, deftflag < 0 ? &deftflag
						      : static_cast<int *>(NULL));

			/* Fall back to "latin1" .. "latin15".  The name has to be
			 * formatted: adding an int to the literal "latin" only
			 * offsets the pointer and never yields "latinN". */
			for (int n1 = 1; prefstringchar < 0 && n1 <= 15; n1++)
				{
					const TQString teststring = TQString("latin%1").arg(n1);
					prefstringchar = findfiletype(teststring.latin1(), 1,
								      deftflag < 0 ? &deftflag : static_cast<int *>(NULL));
				}

			return; /* success */
		}

	/* No codec from the table: test for UTF-8 first */
	prefstringchar = findfiletype("utf8", 1, deftflag < 0 ? &deftflag : static_cast<int *>(NULL));
	if (prefstringchar >= 0)
		{
			m_translate_in = TQTextCodec::codecForName("utf8");
		}

	if (m_translate_in)
		return; /* success */

	/* Look for "altstringtype" names from latin1 to latin15 */
	for (int n1 = 1; n1 <= 15; n1++)
		{
			const TQString teststring = TQString("latin%1").arg(n1);
			prefstringchar = findfiletype(teststring.latin1(), 1,
						      deftflag < 0 ? &deftflag : static_cast<int *>(NULL));
			if (prefstringchar >= 0)
				{
					//FIXME: latin1 might be wrong
					m_translate_in = TQTextCodec::codecForName( teststring.latin1() );
					break;
				}
		}

	/* If nothing found, use latin1 */
	if (!m_translate_in)
		{
			m_translate_in = TQTextCodec::codecForName("latin1");
		}
}

/*
 * Load the dictionary for a language tag, retrying with the part before
 * the last underscore ("en_US" => "en") when the full tag fails.
 *
 * On success the checker is marked as initialised and m_defdupchar is
 * set from prefstringchar (0 when prefstringchar is negative).
 *
 * \param szLang - language tag; a null pointer yields false
 * \return true when a dictionary was loaded, false otherwise
 */
bool
ISpellChecker::requestDictionary(const char *szLang)
{
	/* std::string cannot be constructed from a null pointer */
	if (!szLang)
		return false;

	if (!loadDictionaryForLanguage (szLang))
		{
			// handle a shortened version of the language tag: en_US => en
			const std::string full_tag (szLang);
			const std::string::size_type uscore_pos = full_tag.rfind ('_');

			if (uscore_pos == std::string::npos)
				return false;

			const std::string shortened_dict = full_tag.substr(0, uscore_pos);
			if (!loadDictionaryForLanguage (shortened_dict.c_str()))
				return false;
		}

	m_bSuccessfulInit = true;

	m_defdupchar = (prefstringchar < 0) ? 0 : prefstringchar;

	return true;
}