You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
510 lines
14 KiB
510 lines
14 KiB
/* vim: set sw=8: -*- Mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/* tdespell2 - adopted from Enchant
 * Copyright (C) 2003 Dom Lachowicz
 * Copyright (C) 2004 Zack Rusin <zack@kde.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 *
 * In addition, as a special exception, Dom Lachowicz
 * gives permission to link the code of this program with
 * non-LGPL Spelling Provider libraries (eg: a MSFT Office
 * spell checker backend) and distribute linked combinations including
 * the two. You must obey the GNU Lesser General Public License in all
 * respects for all of the code used other than said providers. If you modify
 * this file, you may extend this exception to your version of the
 * file, but you are not obligated to do so. If you do not wish to
 * do so, delete this exception statement from your version.
 */
|
|
|
|
#include <config.h>
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include "sp_spell.h"
|
|
#include "ispell_checker.h"
|
|
|
|
#include <tqmap.h>
|
|
#include <tqdir.h>
|
|
#include <tqfileinfo.h>
|
|
|
|
/***************************************************************************/
|
|
|
|
/*
 * One row of the language table: a language tag, the ispell hash file
 * used for it and the encoding name associated with that hash file.
 */
struct str_ispell_map
{
	const char * lang;	/* language tag, e.g. "en_US" */
	const char * dict;	/* hash file name, e.g. "american.hash" */
	const char * enc;	/* encoding name, e.g. "iso-8859-1" */
};
typedef struct str_ispell_map IspellMap;
|
|
|
|
static const char *ispell_dirs [] = {
|
|
"/usr/" SYSTEM_LIBDIR "/ispell",
|
|
"/usr/lib/ispell",
|
|
"/usr/local/" SYSTEM_LIBDIR "/ispell",
|
|
"/usr/local/lib/ispell",
|
|
"/usr/local/share/ispell",
|
|
"/usr/share/ispell",
|
|
"/usr/pkg/lib",
|
|
0
|
|
};
|
|
/*
 * Language tag -> { hash file, encoding } table.
 * Lookups in this file scan it front to back, so the row order is kept.
 */
static const IspellMap ispell_map [] = {
	/* Catalan, Czech, Danish */
	{ "ca",    "catala.hash",       "iso-8859-1"  },
	{ "ca_ES", "catala.hash",       "iso-8859-1"  },
	{ "cs",    "czech.hash",        "iso-8859-2"  },
	{ "cs_CZ", "czech.hash",        "iso-8859-2"  },
	{ "da",    "dansk.hash",        "iso-8859-1"  },
	{ "da_DK", "dansk.hash",        "iso-8859-1"  },
	/* German */
	{ "de",    "deutsch.hash",      "iso-8859-1"  },
	{ "de_CH", "swiss.hash",        "iso-8859-1"  },
	{ "de_AT", "deutsch.hash",      "iso-8859-1"  },
	{ "de_DE", "deutsch.hash",      "iso-8859-1"  },
	/* Greek */
	{ "el",    "ellhnika.hash",     "iso-8859-7"  },
	{ "el_GR", "ellhnika.hash",     "iso-8859-7"  },
	/* English */
	{ "en",    "british.hash",      "iso-8859-1"  },
	{ "en_AU", "british.hash",      "iso-8859-1"  },
	{ "en_BZ", "british.hash",      "iso-8859-1"  },
	{ "en_CA", "british.hash",      "iso-8859-1"  },
	{ "en_GB", "british.hash",      "iso-8859-1"  },
	{ "en_IE", "british.hash",      "iso-8859-1"  },
	{ "en_JM", "british.hash",      "iso-8859-1"  },
	{ "en_NZ", "british.hash",      "iso-8859-1"  },
	{ "en_TT", "british.hash",      "iso-8859-1"  },
	{ "en_ZA", "british.hash",      "iso-8859-1"  },
	{ "en_ZW", "british.hash",      "iso-8859-1"  },
	{ "en_PH", "american.hash",     "iso-8859-1"  },
	{ "en_US", "american.hash",     "iso-8859-1"  },
	/* Esperanto */
	{ "eo",    "esperanto.hash",    "iso-8859-3"  },
	/* Spanish */
	{ "es",    "espanol.hash",      "iso-8859-1"  },
	{ "es_AR", "espanol.hash",      "iso-8859-1"  },
	{ "es_BO", "espanol.hash",      "iso-8859-1"  },
	{ "es_CL", "espanol.hash",      "iso-8859-1"  },
	{ "es_CO", "espanol.hash",      "iso-8859-1"  },
	{ "es_CR", "espanol.hash",      "iso-8859-1"  },
	{ "es_DO", "espanol.hash",      "iso-8859-1"  },
	{ "es_EC", "espanol.hash",      "iso-8859-1"  },
	{ "es_ES", "espanol.hash",      "iso-8859-1"  },
	{ "es_GT", "espanol.hash",      "iso-8859-1"  },
	{ "es_HN", "espanol.hash",      "iso-8859-1"  },
	{ "es_MX", "espanol.hash",      "iso-8859-1"  },
	{ "es_NI", "espanol.hash",      "iso-8859-1"  },
	{ "es_PA", "espanol.hash",      "iso-8859-1"  },
	{ "es_PE", "espanol.hash",      "iso-8859-1"  },
	{ "es_PR", "espanol.hash",      "iso-8859-1"  },
	{ "es_PY", "espanol.hash",      "iso-8859-1"  },
	{ "es_SV", "espanol.hash",      "iso-8859-1"  },
	{ "es_UY", "espanol.hash",      "iso-8859-1"  },
	{ "es_VE", "espanol.hash",      "iso-8859-1"  },
	/* Finnish */
	{ "fi",    "finnish.hash",      "iso-8859-1"  },
	{ "fi_FI", "finnish.hash",      "iso-8859-1"  },
	/* French */
	{ "fr",    "francais.hash",     "iso-8859-1"  },
	{ "fr_BE", "francais.hash",     "iso-8859-1"  },
	{ "fr_CA", "francais.hash",     "iso-8859-1"  },
	{ "fr_CH", "francais.hash",     "iso-8859-1"  },
	{ "fr_FR", "francais.hash",     "iso-8859-1"  },
	{ "fr_LU", "francais.hash",     "iso-8859-1"  },
	{ "fr_MC", "francais.hash",     "iso-8859-1"  },
	/* Hungarian, Irish, Galician, Interlingua */
	{ "hu",    "hungarian.hash",    "iso-8859-2"  },
	{ "hu_HU", "hungarian.hash",    "iso-8859-2"  },
	{ "ga",    "irish.hash",        "iso-8859-1"  },
	{ "ga_IE", "irish.hash",        "iso-8859-1"  },
	{ "gl",    "galician.hash",     "iso-8859-1"  },
	{ "gl_ES", "galician.hash",     "iso-8859-1"  },
	{ "ia",    "interlingua.hash",  "iso-8859-1"  },
	/* Italian, Latin, Lithuanian */
	{ "it",    "italian.hash",      "iso-8859-1"  },
	{ "it_IT", "italian.hash",      "iso-8859-1"  },
	{ "it_CH", "italian.hash",      "iso-8859-1"  },
	{ "la",    "mlatin.hash",       "iso-8859-1"  },
	{ "la_IT", "mlatin.hash",       "iso-8859-1"  },
	{ "lt",    "lietuviu.hash",     "iso-8859-13" },
	{ "lt_LT", "lietuviu.hash",     "iso-8859-13" },
	/* Dutch */
	{ "nl",    "nederlands.hash",   "iso-8859-1"  },
	{ "nl_NL", "nederlands.hash",   "iso-8859-1"  },
	{ "nl_BE", "nederlands.hash",   "iso-8859-1"  },
	/* Norwegian */
	{ "nb",    "norsk.hash",        "iso-8859-1"  },
	{ "nb_NO", "norsk.hash",        "iso-8859-1"  },
	{ "nn",    "nynorsk.hash",      "iso-8859-1"  },
	{ "nn_NO", "nynorsk.hash",      "iso-8859-1"  },
	{ "no",    "norsk.hash",        "iso-8859-1"  },
	{ "no_NO", "norsk.hash",        "iso-8859-1"  },
	/* Polish, Portuguese */
	{ "pl",    "polish.hash",       "iso-8859-2"  },
	{ "pl_PL", "polish.hash",       "iso-8859-2"  },
	{ "pt",    "brazilian.hash",    "iso-8859-1"  },
	{ "pt_BR", "brazilian.hash",    "iso-8859-1"  },
	{ "pt_PT", "portugues.hash",    "iso-8859-1"  },
	/* Russian */
	{ "ru",    "russian.hash",      "koi8-r"      },
	{ "ru_MD", "russian.hash",      "koi8-r"      },
	{ "ru_RU", "russian.hash",      "koi8-r"      },
	/* Sardinian, Slovak, Slovenian, Swedish */
	{ "sc",    "sardinian.hash",    "iso-8859-1"  },
	{ "sc_IT", "sardinian.hash",    "iso-8859-1"  },
	{ "sk",    "slovak.hash",       "iso-8859-2"  },
	{ "sk_SK", "slovak.hash",       "iso-8859-2"  },
	{ "sl",    "slovensko.hash",    "iso-8859-2"  },
	{ "sl_SI", "slovensko.hash",    "iso-8859-2"  },
	{ "sv",    "svenska.hash",      "iso-8859-1"  },
	{ "sv_SE", "svenska.hash",      "iso-8859-1"  },
	/* Ukrainian, Yiddish */
	{ "uk",    "ukrainian.hash",    "koi8-u"      },
	{ "uk_UA", "ukrainian.hash",    "koi8-u"      },
	{ "yi",    "yiddish-yivo.hash", "utf-8"       }
};
|
|
|
|
static const size_t size_ispell_map = ( sizeof(ispell_map) / sizeof((ispell_map)[0]) );
|
|
static TQMap<TQString, TQString> ispell_dict_map;
|
|
|
|
|
|
void
|
|
ISpellChecker::try_autodetect_charset(const char * const inEncoding)
|
|
{
|
|
if (inEncoding && strlen(inEncoding))
|
|
{
|
|
m_translate_in = TQTextCodec::codecForName(inEncoding);
|
|
}
|
|
}
|
|
|
|
/***************************************************************************/
|
|
/***************************************************************************/
|
|
|
|
/*
 * Construct a checker with no dictionary loaded: deftflag and
 * prefstringchar start at -1, m_bSuccessfulInit is false, every pointer
 * member listed below is NULL and both flag indexes are zero-filled.
 */
ISpellChecker::ISpellChecker()
	: deftflag(-1), prefstringchar(-1),
	  m_bSuccessfulInit(false),
	  m_BC(NULL), m_cd(NULL), m_cl(NULL), m_cm(NULL), m_ho(NULL),
	  m_nd(NULL), m_so(NULL), m_se(NULL), m_ti(NULL), m_te(NULL),
	  m_hashstrings(NULL), m_hashtbl(NULL),
	  m_pflaglist(NULL), m_sflaglist(NULL),
	  m_chartypes(NULL),
	  m_infile(NULL), m_outfile(NULL), m_askfilename(NULL),
	  m_Trynum(0),
	  m_translate_in(0)
{
	memset(m_pflagindex, 0, sizeof(m_pflagindex));
	memset(m_sflagindex, 0, sizeof(m_sflagindex));
}
|
|
|
|
/* Release a malloc()ed pointer; the free() call is skipped for NULL. */
#ifndef FREEP
#define FREEP(p)			\
	do {				\
		if (p)			\
			free(p);	\
	} while (0)
#endif
|
|
|
|
/*
 * Tear down the checker.
 *
 * The flag indexes are only cleared when a dictionary was successfully
 * loaded; the malloc()ed tables are released unconditionally (FREEP skips
 * NULL pointers).
 *
 * m_translate_in is only ever assigned from TQTextCodec::codecForName() in
 * the code visible here.  Codecs handed out by that call remain owned by
 * the toolkit and may be shared with other users, so the pointer is simply
 * dropped instead of being deleted.
 *
 * NOTE(review): m_pflaglist is not freed here - presumably it aliases the
 * m_sflaglist allocation; confirm against the hash loader.
 */
ISpellChecker::~ISpellChecker()
{
	if (m_bSuccessfulInit) {
		// only cleanup our mess if we were successfully initialized
		clearindex (m_pflagindex);
		clearindex (m_sflagindex);
	}

	FREEP(m_hashtbl);
	FREEP(m_hashstrings);
	FREEP(m_sflaglist);
	FREEP(m_chartypes);

	// not ours to delete: the codec belongs to TQTextCodec's registry
	m_translate_in = 0;
}
|
|
|
|
bool
|
|
ISpellChecker::checkWord( const TQString& utf8Word )
|
|
{
|
|
ichar_t iWord[INPUTWORDLEN + MAXAFFIXLEN];
|
|
if (!m_bSuccessfulInit)
|
|
return false;
|
|
|
|
if (!utf8Word || utf8Word.length() >= (INPUTWORDLEN + MAXAFFIXLEN) || utf8Word.isEmpty())
|
|
return false;
|
|
|
|
bool retVal = false;
|
|
TQCString out;
|
|
if (!m_translate_in)
|
|
return false;
|
|
else {
|
|
/* convert to 8bit string and null terminate */
|
|
int len_out = utf8Word.length();
|
|
|
|
out = m_translate_in->fromUnicode( utf8Word, len_out );
|
|
}
|
|
|
|
if (!strtoichar(iWord, out.data(), INPUTWORDLEN + MAXAFFIXLEN, 0))
|
|
{
|
|
if (good(iWord, 0, 0, 1, 0) == 1 ||
|
|
compoundgood(iWord, 1) == 1)
|
|
{
|
|
retVal = true;
|
|
}
|
|
}
|
|
|
|
return retVal;
|
|
}
|
|
|
|
/*
 * Build the list of suggestions for a word.
 *
 * Returns an empty list when no dictionary has been loaded, when the word
 * is empty or has INPUTWORDLEN + MAXAFFIXLEN or more characters, when no
 * input codec is set, or when strtoichar() returns non-zero for the
 * encoded word.  Otherwise makepossibilities() is run and the first
 * m_pcount entries of m_possibilities are decoded with m_translate_in
 * and returned in order.
 *
 * NOTE: despite its name, utf8Word is a TQString; it is converted to the
 * dictionary's 8bit encoding through m_translate_in before the lookup.
 */
TQStringList
ISpellChecker::suggestWord(const TQString& utf8Word)
{
	if (!m_bSuccessfulInit)
		return TQStringList();

	if (utf8Word.isEmpty() || utf8Word.length() >= (INPUTWORDLEN + MAXAFFIXLEN))
		return TQStringList();

	if (!m_translate_in)
		return TQStringList();

	/* convert to 8bit string and null terminate */
	int len_out = utf8Word.length();
	TQCString out = m_translate_in->fromUnicode( utf8Word, len_out );

	ichar_t iWord[INPUTWORDLEN + MAXAFFIXLEN];
	if (strtoichar(iWord, out.data(), INPUTWORDLEN + MAXAFFIXLEN, 0))
		return TQStringList();

	makepossibilities(iWord);

	/*
	 * m_translate_in is known to be non-null from the guard above, so
	 * every candidate is decoded with it (no codec-less fallback needed).
	 */
	TQStringList sugg_arr;
	for (int c = 0; c < m_pcount; c++)
	{
		sugg_arr.append( m_translate_in->toUnicode( m_possibilities[c] ) );
	}

	return sugg_arr;
}
|
|
|
|
static void
|
|
s_buildHashNames (std::vector<std::string> & names, const char * dict)
|
|
{
|
|
const char * tmp = 0;
|
|
int i = 0;
|
|
|
|
names.clear ();
|
|
|
|
while ( (tmp = ispell_dirs[i++]) ) {
|
|
TQCString maybeFile = TQCString( tmp ) + '/';
|
|
maybeFile += dict;
|
|
names.push_back( maybeFile.data() );
|
|
}
|
|
}
|
|
|
|
static void
|
|
s_allDics()
|
|
{
|
|
const char * tmp = 0;
|
|
int i = 0;
|
|
|
|
while ( (tmp = ispell_dirs[i++]) ) {
|
|
TQDir dir( tmp );
|
|
TQStringList lst = dir.entryList( "*.hash" );
|
|
for ( TQStringList::Iterator it = lst.begin(); it != lst.end(); ++it ) {
|
|
TQFileInfo info( *it );
|
|
for (size_t i = 0; i < size_ispell_map; i++)
|
|
{
|
|
const IspellMap * mapping = (const IspellMap *)(&(ispell_map[i]));
|
|
if (!strcmp (info.fileName().latin1(), mapping->dict))
|
|
{
|
|
ispell_dict_map.insert( mapping->lang, *it );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
 * Return the language tags for which a hash file was found.
 * The directories are (re)scanned whenever ispell_dict_map is still empty.
 */
TQValueList<TQString>
ISpellChecker::allDics()
{
	const bool needScan = ispell_dict_map.empty();
	if ( needScan ) {
		s_allDics();
	}

	return ispell_dict_map.keys();
}
|
|
|
|
/*
 * Try to initialise ispell from `szdict` in each known ispell directory.
 *
 * Returns the first candidate path for which linit() returns a
 * non-negative value, or TQString::null when none does.
 */
TQString
ISpellChecker::loadDictionary (const char * szdict)
{
	std::vector<std::string> candidates;
	s_buildHashNames (candidates, szdict);

	std::vector<std::string>::const_iterator path = candidates.begin();
	for (; path != candidates.end(); ++path)
	{
		if (linit(const_cast<char*>(path->c_str())) < 0)
			continue;	/* not loadable from here, try the next directory */

		return path->c_str();
	}

	return TQString::null;
}
|
|
|
|
/*!
|
|
* Load ispell dictionary hash file for given language.
|
|
*
|
|
* \param szLang - The language tag ("en-US") we want to use
|
|
* \return The name of the dictionary file
|
|
*/
|
|
bool
|
|
ISpellChecker::loadDictionaryForLanguage ( const char * szLang )
|
|
{
|
|
TQString hashname;
|
|
|
|
const char * encoding = NULL;
|
|
const char * szFile = NULL;
|
|
|
|
for (size_t i = 0; i < size_ispell_map; i++)
|
|
{
|
|
const IspellMap * mapping = (const IspellMap *)(&(ispell_map[i]));
|
|
if (!strcmp (szLang, mapping->lang))
|
|
{
|
|
szFile = mapping->dict;
|
|
encoding = mapping->enc;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!szFile || !strlen(szFile))
|
|
return false;
|
|
|
|
alloc_ispell_struct();
|
|
|
|
hashname = loadDictionary(szFile);
|
|
if (hashname.isEmpty())
|
|
return false;
|
|
|
|
// one of the two above calls succeeded
|
|
setDictionaryEncoding (hashname, encoding);
|
|
|
|
return true;
|
|
}
|
|
|
|
void
|
|
ISpellChecker::setDictionaryEncoding( const TQString& hashname, const char * encoding )
|
|
{
|
|
/* Get Hash encoding from XML file. This should always work! */
|
|
try_autodetect_charset(encoding);
|
|
|
|
if (m_translate_in)
|
|
{
|
|
/* We still have to setup prefstringchar*/
|
|
prefstringchar = findfiletype("utf8", 1, deftflag < 0 ? &deftflag
|
|
: static_cast<int *>(NULL));
|
|
|
|
if (prefstringchar < 0)
|
|
{
|
|
std::string teststring;
|
|
for(int n1 = 1; n1 <= 15; n1++)
|
|
{
|
|
teststring = "latin" + n1;
|
|
prefstringchar = findfiletype(teststring.c_str(), 1,
|
|
deftflag < 0 ? &deftflag : static_cast<int *>(NULL));
|
|
if (prefstringchar >= 0)
|
|
break;
|
|
}
|
|
}
|
|
|
|
return; /* success */
|
|
}
|
|
|
|
/* Test for UTF-8 first */
|
|
prefstringchar = findfiletype("utf8", 1, deftflag < 0 ? &deftflag : static_cast<int *>(NULL));
|
|
if (prefstringchar >= 0)
|
|
{
|
|
m_translate_in = TQTextCodec::codecForName("utf8");
|
|
}
|
|
|
|
if (m_translate_in)
|
|
return; /* success */
|
|
|
|
/* Test for "latinN" */
|
|
if (!m_translate_in)
|
|
{
|
|
/* Look for "altstringtype" names from latin1 to latin15 */
|
|
for(int n1 = 1; n1 <= 15; n1++)
|
|
{
|
|
TQString teststring = TQString("latin%1").arg(n1);
|
|
prefstringchar = findfiletype(teststring.latin1(), 1,
|
|
deftflag < 0 ? &deftflag : static_cast<int *>(NULL));
|
|
if (prefstringchar >= 0)
|
|
{
|
|
//FIXME: latin1 might be wrong
|
|
m_translate_in = TQTextCodec::codecForName( teststring.latin1() );
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* If nothing found, use latin1 */
|
|
if (!m_translate_in)
|
|
{
|
|
m_translate_in = TQTextCodec::codecForName("latin1");
|
|
}
|
|
}
|
|
|
|
bool
|
|
ISpellChecker::requestDictionary(const char *szLang)
|
|
{
|
|
if (!loadDictionaryForLanguage (szLang))
|
|
{
|
|
// handle a shortened version of the language tag: en_US => en
|
|
std::string shortened_dict (szLang);
|
|
size_t uscore_pos;
|
|
|
|
if ((uscore_pos = shortened_dict.rfind ('_')) != ((size_t)-1)) {
|
|
shortened_dict = shortened_dict.substr(0, uscore_pos);
|
|
if (!loadDictionaryForLanguage (shortened_dict.c_str()))
|
|
return false;
|
|
} else
|
|
return false;
|
|
}
|
|
|
|
m_bSuccessfulInit = true;
|
|
|
|
if (prefstringchar < 0)
|
|
m_defdupchar = 0;
|
|
else
|
|
m_defdupchar = prefstringchar;
|
|
|
|
return true;
|
|
}
|