You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
tdeedu/kiten/dict.cpp

877 lines
18 KiB

/**
This file is part of Kiten, a KDE Japanese Reference Tool...
Copyright (C) 2001 Jason Katz-Brown <jason@katzbrown.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
USA
**/
#include <kdebug.h>
#include <klocale.h>
#include <kmessagebox.h>
#include <kprocess.h>
#include <kstandarddirs.h>
#include <tqfileinfo.h>
#include <tqregexp.h>
#include <tqtextcodec.h>
#include "dict.h"
#include <iostream>
#include <cassert>
#include <sys/mman.h>
#include <stdio.h>
namespace
{
void msgerr(const TQString &msg, const TQString &dict = TQString())
{
TQString output = msg;
if (!dict.isNull())
output = msg.tqarg(dict);
KMessageBox::error(0, output);
}
}
using namespace Dict;
TextType Dict::textType(const TQString &text)
{
ushort first = text.tqat(0).tqunicode();
if (first < 0x3000)
return Text_Latin;
// else if (first < 0x3040) // CJK Symbols and Punctuation
// return Text_Kana;
// else if (first < 0x30A0) // Hiragana
// return Text_Kana;
else if (first < 0x3100) // Katakana
return Text_Kana;
else /*if (first >= 0x3400 && first < 0x4DC0)*/ // CJK Unified Ideographs Extension A
return Text_Kanji;
}
File::File(TQString path, TQString n)
: myName(n)
, dictFile(path)
, dictPtr((const unsigned char *)MAP_FAILED)
, indexFile(KGlobal::dirs()->saveLocation("data", "kiten/xjdx/", true) + TQFileInfo(path).baseName() + ".xjdx")
, indexPtr((const uint32_t *)MAP_FAILED)
, valid(false)
{
bool forceUpdate = false;
bool indexFileExists = indexFile.exists();
if (indexFileExists)
{
// ### change this if need be!!
const int indexFileVersion = 14;
// this up-to-date code from xjdservcomm.c
// we need to check if the index needs to
// remade
int dictionaryLength;
TQFile dictionary(path);
dictionaryLength = dictionary.size();
dictionaryLength++;
//kdDebug() << "dictionaryLength = " << dictionaryLength << endl;
int32_t testWord[1];
fread(&testWord[0], sizeof(int32_t), 1, fopen(indexFile.name().latin1(), "rb"));
//kdDebug() << "testWord[0] = " << testWord[0] << endl;
if (testWord[0] != (dictionaryLength + indexFileVersion))
forceUpdate = true;
}
if (!indexFileExists || forceUpdate)
{
//kdDebug() << "creating " << indexFile.name() << endl;
// find the index generator executable
KProcess proc;
proc << KStandardDirs::findExe("kitengen") << path << indexFile.name();
// TODO: put up a status dialog and event loop instead of blocking
proc.start(KProcess::Block, KProcess::NoCommunication);
}
if (!dictFile.open(IO_ReadOnly))
{
msgerr(i18n("Could not open dictionary %1."), path);
return;
}
dictPtr = (const unsigned char *)mmap(0, dictFile.size(), PROT_READ, MAP_SHARED, dictFile.handle(), 0);
if (dictPtr == (unsigned char*) MAP_FAILED)
{
msgerr(i18n("Memory error when loading dictionary %1."), path);
return;
}
if (!indexFile.open(IO_ReadOnly))
{
msgerr(i18n("Could not open index for dictionary %1."), path);
return;
}
indexPtr = (const uint32_t*)mmap(0, indexFile.size(), PROT_READ, MAP_SHARED, indexFile.handle(), 0);
if (indexPtr == (uint32_t*) MAP_FAILED)
{
msgerr(i18n("Memory error when loading dictionary %1's index file."), path);
return;
}
valid = true;
}
File::~File(void)
{
if (dictPtr != (unsigned char*) MAP_FAILED)
munmap((char *)dictPtr, dictFile.size());
dictFile.close();
if (indexPtr != (uint32_t*) MAP_FAILED)
munmap((char *)indexPtr, indexFile.size());
indexFile.close();
}
TQString File::name(void)
{
return myName;
}
Array<const unsigned char> File::dict(void)
{
assert(valid);
return Array<const unsigned char>(dictPtr, dictFile.size());
}
Array<const uint32_t> File::index(void)
{
assert(valid);
return Array<const uint32_t>(indexPtr, indexFile.size());
}
int File::dictLength(void)
{
return dictFile.size();
}
int File::indexLength(void)
{
return indexFile.size();
}
bool File::isValid(void)
{
return valid;
}
// returns specified character from a dictionary
unsigned char File::lookup(unsigned i, int offset)
{
uint32_t pos = indexPtr[i] + offset - 1;
if (pos > dictFile.size()) return 10;
return dictPtr[pos];
}
TQCString File::lookup(unsigned i)
{
uint32_t start = indexPtr[i] - 1;
uint32_t pos = start;
const unsigned size = dictFile.size();
// get the whole word
while(pos <= size && dictPtr[pos] != 0 && dictPtr[pos] != 0x0a)
++pos;
// put the word in the TQCString
TQCString retval((const char *)(dictPtr + start), pos - start);
// tack on a null
char null = 0;
retval.append(&null);
// and away we go
return retval;
}
// And last, Index itself is the API presented to the rest of Kiten
Index::Index()
: TQObject()
{
dictFiles.setAutoDelete(true);
kanjiDictFiles.setAutoDelete(true);
}
Index::~Index()
{
}
void Index::setDictList(const TQStringList &list, const TQStringList &names)
{
loadDictList(dictFiles, list, names);
}
void Index::setKanjiDictList(const TQStringList &list, const TQStringList &names)
{
loadDictList(kanjiDictFiles, list, names);
}
void Index::loadDictList(TQPtrList<File> &fileList, const TQStringList &dictList, const TQStringList &dictNameList)
{
fileList.clear();
// check if we have a dict
if (dictList.size() < 1)
{
msgerr(i18n("No dictionaries in list!"));
return;
}
TQStringList::ConstIterator it;
TQStringList::ConstIterator dictIt;
for (it = dictList.begin(), dictIt = dictNameList.begin(); it != dictList.end(); ++it, ++dictIt)
{
File *f = new File(*it, *dictIt);
// our ugly substitute for exceptions
if (f->isValid())
fileList.append(f);
else
delete f;
}
}
TQStringList Index::doSearch(File &file, const TQString &text)
{
// Do a binary search to find an entry that matches text
TQTextCodec &codec = *TQTextCodec::codecForName("eucJP");
TQCString eucString = codec.fromUnicode(text);
TQString prevResult;
Array<const uint32_t> index = file.index();
Array<const unsigned char> dict = file.dict();
int lo = 0;
int hi = index.size() - 1;
unsigned cur;
int comp = 0;
do
{
cur = (hi + lo) / 2;
comp = stringCompare(file, cur, eucString);
if (comp < 0)
hi = cur - 1;
else if (comp > 0)
lo = cur + 1;
}
while(hi >= lo && comp != 0 && !(hi == 0 && lo == 0));
TQStringList results;
// A match?
if (comp == 0)
{
// wheel back to make sure we get the first matching entry
while(cur - 1 && 0 == stringCompare(file, cur - 1, eucString))
--cur;
// output every matching entry
while(cur < index.size() && 0 == stringCompare(file, cur, eucString))
{
// because the index doesn't point
// to the start of the line, find the
// start of the line:
int i = 0;
while(file.lookup(cur, i - 1) != 0x0a) --i;
TQByteArray bytes(0);
while(file.lookup(cur, i) != 0x0a) // get to end of our line
{
const char eucchar = file.lookup(cur, i);
bytes.resize(bytes.size() + 1);
bytes[bytes.size() - 1] = eucchar;
++i;
}
TQString result = codec.toUnicode(bytes) + TQString("\n");
if (prevResult != result)
{
results.append(result);
prevResult = result;
}
++cur;
}
}
// return all the entries found, or null if no match
return results;
}
SearchResult Index::scanResults(TQRegExp regexp, TQStringList results, bool common)
{
unsigned int num = 0;
unsigned int fullNum = 0;
SearchResult ret;
//ret.results = results; //not here..
for (TQStringList::Iterator itr = results.begin(); itr != results.end(); ++itr)
{
if ((*itr).left(5) == "DICT " || (*itr).left(8) == "HEADING ")
{
ret.list.append(parse(*itr));
continue;
}
int found = regexp.search(*itr);
if (found >= 0)
{
++fullNum;
if ((*itr).find(TQString("(P)")) >= 0 || !common)
{
// we append HERE, so we get the exact
// results we have in ret.list
ret.results.append(*itr);
ret.list.append(parse(*itr));
++num;
}
}
}
ret.count = num;
ret.outOf = fullNum;
ret.common = common;
return ret;
}
SearchResult Index::search(TQRegExp regexp, const TQString &text, bool common)
{
TQStringList results;
for (TQPtrListIterator<File> file(dictFiles); *file; ++file)
{
results.append(TQString("DICT ") + (*file)->name());
results += doSearch(**file, text);
}
SearchResult res = scanResults(regexp, results, common);
res.text = text;
return res;
}
SearchResult Index::scanKanjiResults(TQRegExp regexp, TQStringList results, bool common)
{
unsigned int num = 0;
unsigned int fullNum = 0;
const bool jmyCount = false; // don't count JinMeiYou as common
SearchResult ret;
ret.results = results;
for (TQStringList::Iterator itr = results.begin(); itr != results.end(); ++itr)
{
if ((*itr).left(5) == "DICT " || (*itr).left(8) == "HEADING ")
{
ret.list.append(kanjiParse(*itr));
continue;
}
int found = regexp.search(*itr);
if (found >= 0)
{
++fullNum;
// common entries have G[1-8] (jouyou)
TQRegExp comregexp(jmyCount ? "G[1-9]" : "G[1-8]");
if ((*itr).find(comregexp) >= 0 || !common)
{
ret.list.append(kanjiParse(*itr));
++num;
}
}
}
ret.count = num;
ret.outOf = fullNum;
ret.common = common;
return ret;
}
SearchResult Index::searchKanji(TQRegExp regexp, const TQString &text, bool common)
{
TQStringList results;
for (TQPtrListIterator<File> file(kanjiDictFiles); *file; ++file)
{
results.append(TQString("DICT ") + (*file)->name());
results += doSearch(**file, text);
}
SearchResult res = scanKanjiResults(regexp, results, common);
res.text = text;
return res;
}
SearchResult Index::searchPrevious(TQRegExp regexp, const TQString &text, SearchResult list, bool common)
{
SearchResult res;
if (firstEntry(list).extendedKanjiInfo())
res = scanKanjiResults(regexp, list.results, common);
else
res = scanResults(regexp, list.results, common);
res.text = text;
return res;
}
TQRegExp Dict::Index::createRegExp(SearchType type, const TQString &text, DictionaryType dictionaryType, bool caseSensitive)
{
TQString regExp;
switch (type)
{
case Search_Beginning:
switch (textType(text))
{
case Dict::Text_Latin:
regExp = "\\W%1";
break;
case Dict::Text_Kana:
if (dictionaryType == Kanjidict)
regExp = "\\W%1";
else // edict
regExp = "\\[%1";
break;
case Dict::Text_Kanji:
regExp = "^%1";
}
break;
case Search_FullWord:
switch (textType(text))
{
case Dict::Text_Latin:
regExp = "\\W%1\\W";
break;
case Dict::Text_Kana:
if (dictionaryType == Kanjidict)
regExp = " %1 ";
else // edict
regExp = "\\[%1\\]";
break;
case Dict::Text_Kanji:
regExp = "^%1\\W";
}
break;
case Search_Anywhere:
regExp = "%1";
}
return TQRegExp(regExp.tqarg(text), caseSensitive);
}
int Index::stringCompare(File &file, int index, TQCString str)
{
return eucStringCompare(file.lookup(index), str);
}
// effectively does a strnicmp on two "strings"
// except it will make katakana and hiragana match (EUC A4 & A5)
int Dict::eucStringCompare(const char *str, const char *str2)
{
for (unsigned i = 0; ; ++i)
{
unsigned char c = static_cast<unsigned char>(str[i]);
unsigned char c2 = static_cast<unsigned char>(str2[i]);
if ((c2 == '\0') || (c == '\0'))
return 0;
if ((i % 2) == 0)
{
if (c2 == 0xA5)
c2 = 0xA4;
if (c == 0xA5)
c = 0xA4;
}
if ((c2 >= 'A') && (c2 <= 'Z')) c2 |= 0x20; /*fix ucase*/
if ((c >= 'A') && (c <= 'Z')) c |= 0x20;
if (c2 != c)
return (int)c2 - (int)c;
}
return 0;
}
bool Dict::isEUC(unsigned char c)
{
return (c & 0x80);
}
Entry Dict::parse(const TQString &raw)
{
unsigned int length = raw.length();
if (raw.left(5) == "DICT ")
return Entry(raw.right(length - 5));
if (raw.left(8) == "HEADING ")
return Entry(raw.right(length - 8), true);
TQString reading;
TQString kanji;
TQStringList meanings;
TQString curmeaning;
bool firstmeaning = true;
TQCString parsemode("kanji");
unsigned int i;
for (i = 0; i < length; i++)
{
TQChar ichar(raw.at(i));
if (ichar == '[')
{
parsemode = "reading";
}
else if (ichar == ']')
{
// do nothing
}
else if (ichar == '/')
{
if (!firstmeaning)
{
meanings.append(curmeaning);
curmeaning = "";
}
else
{
firstmeaning = false;
parsemode = "meaning";
}
}
else if (ichar == ' ')
{
if (parsemode == "meaning") // only one that needs the space
curmeaning += ' ';
}
else if (parsemode == "kanji")
{
kanji += ichar;
}
else if (parsemode == "meaning")
{
curmeaning += ichar;
}
else if (parsemode == "reading")
{
reading += ichar;
}
}
return (Entry(kanji, reading, meanings));
}
Entry Dict::kanjiParse(const TQString &raw)
{
unsigned int length = raw.length();
if (raw.left(5) == "DICT ")
return Entry(raw.right(length - 5));
if (raw.left(8) == "HEADING ")
return Entry(raw.right(length - 8), true);
TQStringList readings;
TQString kanji;
TQStringList meanings;
TQString curmeaning;
TQString curreading;
TQString strfreq;
TQString strgrade;
TQString strstrokes;
TQString strmiscount = "";
bool prevwasspace = true;
TQChar detailname;
TQCString parsemode("kanji");
// if there are two S entries, second is common miscount
bool strokesset = false;
unsigned int i;
TQChar ichar;
for (i = 0; i < length; i++)
{
ichar = raw.at(i);
if (ichar == ' ')
{
if (parsemode == "reading")
{
readings.append(curreading);
curreading = "";
}
else if (parsemode == "kanji")
{
parsemode = "misc";
}
else if (parsemode == "detail")
{
if (detailname == 'S')
strokesset = true;
parsemode = "misc";
}
else if (parsemode == "meaning")
{
curmeaning += ichar;
}
prevwasspace = true;
}
else if (ichar == '{')
{
parsemode = "meaning";
}
else if (ichar == '}')
{
meanings.append(curmeaning);
curmeaning = "";
}
else if (parsemode == "detail")
{
if (detailname == 'G')
{
strgrade += ichar;
}
else if (detailname == 'F')
{
strfreq += ichar;
}
else if (detailname == 'S')
{
if (strokesset)
strmiscount += ichar;
else
strstrokes += ichar;
}
prevwasspace = false;
}
else if (parsemode == "kanji")
{
kanji += ichar;
}
else if (parsemode == "meaning")
{
curmeaning += ichar;
}
else if (parsemode == "reading")
{
curreading += ichar;
}
else if (parsemode == "misc" && prevwasspace)
{
if (TQRegExp("[A-Za-z0-9]").search(TQString(ichar)) >= 0)
// is non-japanese?
{
detailname = ichar;
parsemode = "detail";
}
else
{
curreading = TQString(ichar);
parsemode = "reading";
}
}
}
return (Entry(kanji, readings, meanings, strgrade.toUInt(), strfreq.toUInt(), strstrokes.toUInt(), strmiscount.toUInt()));
}
TQString Dict::prettyMeaning(TQStringList Meanings)
{
TQString meanings;
TQStringList::Iterator it;
for (it = Meanings.begin(); it != Meanings.end(); ++it)
meanings.append((*it).stripWhiteSpace()).append("; ");
meanings.truncate(meanings.length() - 2);
return meanings;
}
TQString Dict::prettyKanjiReading(TQStringList Readings)
{
TQStringList::Iterator it;
TQString html;
for (it = Readings.begin(); it != Readings.end(); ++it)
{
if ((*it) == "T1")
html += i18n("In names: ");
else
{
if ((*it) == "T2")
html += i18n("As radical: ");
else
{
html += (*it).stripWhiteSpace();
html += ", ";
}
}
}
html.truncate(html.length() - 2); // get rid of last ,
return html;
}
Dict::Entry Dict::firstEntry(Dict::SearchResult result)
{
for (TQValueListIterator<Dict::Entry> it = result.list.begin(); it != result.list.end(); ++it)
{
if ((*it).dictName() == "__NOTSET" && (*it).header() == "__NOTSET")
return (*it);
}
return Dict::Entry("__NOTHING");
}
TQString Dict::firstEntryText(Dict::SearchResult result)
{
for (TQStringList::Iterator it = result.results.begin(); it != result.results.end(); ++it)
{
if ((*it).left(5) != "DICT " && (*it).left(7) != "HEADER ")
return (*it);
}
return TQString("NONE ");
}
///////////////////////////////////////////////////////////////
Entry::Entry(const TQString & kanji, const TQString & reading, const TQStringList &meanings)
: DictName(TQString::tqfromLatin1("__NOTSET"))
, Header(TQString::tqfromLatin1("__NOTSET"))
, Meanings(meanings)
, Kanji(kanji)
, KanaOnly(reading.isEmpty())
, Readings(KanaOnly ? kanji : reading)
, ExtendedKanjiInfo(false)
, Grade(0)
, Strokes(0)
, Miscount(0)
, Freq(0)
{
}
Entry::Entry(const TQString &kanji, TQStringList &readings, TQStringList &meanings, unsigned int grade, unsigned int freq, unsigned int strokes, unsigned int miscount)
: DictName(TQString::tqfromLatin1("__NOTSET"))
, Header(TQString::tqfromLatin1("__NOTSET"))
, Meanings(meanings)
, Kanji(kanji)
, KanaOnly(false)
, Readings(readings)
, ExtendedKanjiInfo(true)
, Grade(grade)
, Strokes(strokes)
, Miscount(miscount)
, Freq(freq)
{
}
Entry::Entry(const TQString &dictname)
: KanaOnly(true)
, ExtendedKanjiInfo(false)
{
DictName = dictname;
}
Entry::Entry(const TQString &headername, bool)
: DictName(TQString::tqfromLatin1("__NOTSET"))
, Header(headername)
, KanaOnly(true)
, ExtendedKanjiInfo(false)
{
}
TQString Entry::dictName()
{
return DictName;
}
TQString Entry::header()
{
return Header;
}
bool Entry::kanaOnly()
{
return KanaOnly;
}
TQString Entry::kanji()
{
return Kanji;
}
TQStringList Entry::readings()
{
return Readings;
}
TQString Entry::firstReading()
{
return *Readings.at(0);
}
TQStringList Entry::meanings()
{
return Meanings;
}
unsigned int Entry::grade()
{
return Grade;
}
unsigned int Entry::freq()
{
return Freq;
}
unsigned int Entry::miscount()
{
return Miscount;
}
unsigned int Entry::strokes()
{
return Strokes;
}
bool Entry::extendedKanjiInfo()
{
return ExtendedKanjiInfo;
}
#include "dict.moc"