/** This file is part of Kiten, a KDE Japanese Reference Tool... Copyright (C) 2001 Jason Katz-Brown This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA **/ #include #include #include #include #include #include #include #include #include "dict.h" #include #include #include #include namespace { void msgerr(const TQString &msg, const TQString &dict = TQString()) { TQString output = msg; if (!dict.isNull()) output = msg.arg(dict); KMessageBox::error(0, output); } } using namespace Dict; TextType Dict::textType(const TQString &text) { ushort first = text.at(0).unicode(); if (first < 0x3000) return Text_Latin; // else if (first < 0x3040) // CJK Symbols and Punctuation // return Text_Kana; // else if (first < 0x30A0) // Hiragana // return Text_Kana; else if (first < 0x3100) // Katakana return Text_Kana; else /*if (first >= 0x3400 && first < 0x4DC0)*/ // CJK Unified Ideographs Extension A return Text_Kanji; } File::File(TQString path, TQString n) : myName(n) , dictFile(path) , dictPtr((const unsigned char *)MAP_FAILED) , indexFile(TDEGlobal::dirs()->saveLocation("data", "kiten/xjdx/", true) + TQFileInfo(path).baseName() + ".xjdx") , indexPtr((const uint32_t *)MAP_FAILED) , valid(false) { bool forceUpdate = false; bool indexFileExists = indexFile.exists(); if (indexFileExists) { // ### change this if need be!! const int indexFileVersion = 14; // this up-to-date code from xjdservcomm.c // we need to check if the index needs to // remade int dictionaryLength; TQFile dictionary(path); dictionaryLength = dictionary.size(); dictionaryLength++; //kdDebug() << "dictionaryLength = " << dictionaryLength << endl; int32_t testWord[1]; fread(&testWord[0], sizeof(int32_t), 1, fopen(indexFile.name().latin1(), "rb")); //kdDebug() << "testWord[0] = " << testWord[0] << endl; if (testWord[0] != (dictionaryLength + indexFileVersion)) forceUpdate = true; } if (!indexFileExists || forceUpdate) { //kdDebug() << "creating " << indexFile.name() << endl; // find the index generator executable TDEProcess proc; proc << TDEStandardDirs::findExe("kitengen") << path << indexFile.name(); // TODO: put up a status dialog and event loop instead of blocking proc.start(TDEProcess::Block, TDEProcess::NoCommunication); } if (!dictFile.open(IO_ReadOnly)) { msgerr(i18n("Could not open dictionary %1."), path); return; } dictPtr = (const unsigned char *)mmap(0, dictFile.size(), PROT_READ, MAP_SHARED, dictFile.handle(), 0); if (dictPtr == (unsigned char*) MAP_FAILED) { msgerr(i18n("Memory error when loading dictionary %1."), path); return; } if (!indexFile.open(IO_ReadOnly)) { msgerr(i18n("Could not open index for dictionary %1."), path); return; } indexPtr = (const uint32_t*)mmap(0, indexFile.size(), PROT_READ, MAP_SHARED, indexFile.handle(), 0); if (indexPtr == (uint32_t*) MAP_FAILED) { msgerr(i18n("Memory error when loading dictionary %1's index file."), path); return; } valid = true; } File::~File(void) { if (dictPtr != (unsigned char*) MAP_FAILED) munmap((char *)dictPtr, dictFile.size()); dictFile.close(); if (indexPtr != (uint32_t*) MAP_FAILED) munmap((char *)indexPtr, indexFile.size()); indexFile.close(); } TQString File::name(void) { return myName; } Array File::dict(void) { assert(valid); return Array(dictPtr, dictFile.size()); } Array File::index(void) { assert(valid); return Array(indexPtr, indexFile.size()); } int File::dictLength(void) { return dictFile.size(); } int File::indexLength(void) { return indexFile.size(); } bool File::isValid(void) { return valid; } // returns specified character from a dictionary unsigned char File::lookup(unsigned i, int offset) { uint32_t pos = indexPtr[i] + offset - 1; if (pos > dictFile.size()) return 10; return dictPtr[pos]; } TQCString File::lookup(unsigned i) { uint32_t start = indexPtr[i] - 1; uint32_t pos = start; const unsigned size = dictFile.size(); // get the whole word while(pos <= size && dictPtr[pos] != 0 && dictPtr[pos] != 0x0a) ++pos; // put the word in the TQCString TQCString retval((const char *)(dictPtr + start), pos - start); // tack on a null char null = 0; retval.append(&null); // and away we go return retval; } // And last, Index itself is the API presented to the rest of Kiten Index::Index() : TQObject() { dictFiles.setAutoDelete(true); kanjiDictFiles.setAutoDelete(true); } Index::~Index() { } void Index::setDictList(const TQStringList &list, const TQStringList &names) { loadDictList(dictFiles, list, names); } void Index::setKanjiDictList(const TQStringList &list, const TQStringList &names) { loadDictList(kanjiDictFiles, list, names); } void Index::loadDictList(TQPtrList &fileList, const TQStringList &dictList, const TQStringList &dictNameList) { fileList.clear(); // check if we have a dict if (dictList.size() < 1) { msgerr(i18n("No dictionaries in list!")); return; } TQStringList::ConstIterator it; TQStringList::ConstIterator dictIt; for (it = dictList.begin(), dictIt = dictNameList.begin(); it != dictList.end(); ++it, ++dictIt) { File *f = new File(*it, *dictIt); // our ugly substitute for exceptions if (f->isValid()) fileList.append(f); else delete f; } } TQStringList Index::doSearch(File &file, const TQString &text) { // Do a binary search to find an entry that matches text TQTextCodec &codec = *TQTextCodec::codecForName("eucJP"); TQCString eucString = codec.fromUnicode(text); TQString prevResult; Array index = file.index(); Array dict = file.dict(); int lo = 0; int hi = index.size() - 1; unsigned cur; int comp = 0; do { cur = (hi + lo) / 2; comp = stringCompare(file, cur, eucString); if (comp < 0) hi = cur - 1; else if (comp > 0) lo = cur + 1; } while(hi >= lo && comp != 0 && !(hi == 0 && lo == 0)); TQStringList results; // A match? if (comp == 0) { // wheel back to make sure we get the first matching entry while(cur - 1 && 0 == stringCompare(file, cur - 1, eucString)) --cur; // output every matching entry while(cur < index.size() && 0 == stringCompare(file, cur, eucString)) { // because the index doesn't point // to the start of the line, find the // start of the line: int i = 0; while(file.lookup(cur, i - 1) != 0x0a) --i; TQByteArray bytes(0); while(file.lookup(cur, i) != 0x0a) // get to end of our line { const char eucchar = file.lookup(cur, i); bytes.resize(bytes.size() + 1); bytes[bytes.size() - 1] = eucchar; ++i; } TQString result = codec.toUnicode(bytes) + TQString("\n"); if (prevResult != result) { results.append(result); prevResult = result; } ++cur; } } // return all the entries found, or null if no match return results; } SearchResult Index::scanResults(TQRegExp regexp, TQStringList results, bool common) { unsigned int num = 0; unsigned int fullNum = 0; SearchResult ret; //ret.results = results; //not here.. for (TQStringList::Iterator itr = results.begin(); itr != results.end(); ++itr) { if ((*itr).left(5) == "DICT " || (*itr).left(8) == "HEADING ") { ret.list.append(parse(*itr)); continue; } int found = regexp.search(*itr); if (found >= 0) { ++fullNum; if ((*itr).find(TQString("(P)")) >= 0 || !common) { // we append HERE, so we get the exact // results we have in ret.list ret.results.append(*itr); ret.list.append(parse(*itr)); ++num; } } } ret.count = num; ret.outOf = fullNum; ret.common = common; return ret; } SearchResult Index::search(TQRegExp regexp, const TQString &text, bool common) { TQStringList results; for (TQPtrListIterator file(dictFiles); *file; ++file) { results.append(TQString("DICT ") + (*file)->name()); results += doSearch(**file, text); } SearchResult res = scanResults(regexp, results, common); res.text = text; return res; } SearchResult Index::scanKanjiResults(TQRegExp regexp, TQStringList results, bool common) { unsigned int num = 0; unsigned int fullNum = 0; const bool jmyCount = false; // don't count JinMeiYou as common SearchResult ret; ret.results = results; for (TQStringList::Iterator itr = results.begin(); itr != results.end(); ++itr) { if ((*itr).left(5) == "DICT " || (*itr).left(8) == "HEADING ") { ret.list.append(kanjiParse(*itr)); continue; } int found = regexp.search(*itr); if (found >= 0) { ++fullNum; // common entries have G[1-8] (jouyou) TQRegExp comregexp(jmyCount ? "G[1-9]" : "G[1-8]"); if ((*itr).find(comregexp) >= 0 || !common) { ret.list.append(kanjiParse(*itr)); ++num; } } } ret.count = num; ret.outOf = fullNum; ret.common = common; return ret; } SearchResult Index::searchKanji(TQRegExp regexp, const TQString &text, bool common) { TQStringList results; for (TQPtrListIterator file(kanjiDictFiles); *file; ++file) { results.append(TQString("DICT ") + (*file)->name()); results += doSearch(**file, text); } SearchResult res = scanKanjiResults(regexp, results, common); res.text = text; return res; } SearchResult Index::searchPrevious(TQRegExp regexp, const TQString &text, SearchResult list, bool common) { SearchResult res; if (firstEntry(list).extendedKanjiInfo()) res = scanKanjiResults(regexp, list.results, common); else res = scanResults(regexp, list.results, common); res.text = text; return res; } TQRegExp Dict::Index::createRegExp(SearchType type, const TQString &text, DictionaryType dictionaryType, bool caseSensitive) { TQString regExp; switch (type) { case Search_Beginning: switch (textType(text)) { case Dict::Text_Latin: regExp = "\\W%1"; break; case Dict::Text_Kana: if (dictionaryType == Kanjidict) regExp = "\\W%1"; else // edict regExp = "\\[%1"; break; case Dict::Text_Kanji: regExp = "^%1"; } break; case Search_FullWord: switch (textType(text)) { case Dict::Text_Latin: regExp = "\\W%1\\W"; break; case Dict::Text_Kana: if (dictionaryType == Kanjidict) regExp = " %1 "; else // edict regExp = "\\[%1\\]"; break; case Dict::Text_Kanji: regExp = "^%1\\W"; } break; case Search_Anywhere: regExp = "%1"; } return TQRegExp(regExp.arg(text), caseSensitive); } int Index::stringCompare(File &file, int index, TQCString str) { return eucStringCompare(file.lookup(index), str); } // effectively does a strnicmp on two "strings" // except it will make katakana and hiragana match (EUC A4 & A5) int Dict::eucStringCompare(const char *str, const char *str2) { for (unsigned i = 0; ; ++i) { unsigned char c = static_cast(str[i]); unsigned char c2 = static_cast(str2[i]); if ((c2 == '\0') || (c == '\0')) return 0; if ((i % 2) == 0) { if (c2 == 0xA5) c2 = 0xA4; if (c == 0xA5) c = 0xA4; } if ((c2 >= 'A') && (c2 <= 'Z')) c2 |= 0x20; /*fix ucase*/ if ((c >= 'A') && (c <= 'Z')) c |= 0x20; if (c2 != c) return (int)c2 - (int)c; } return 0; } bool Dict::isEUC(unsigned char c) { return (c & 0x80); } Entry Dict::parse(const TQString &raw) { unsigned int length = raw.length(); if (raw.left(5) == "DICT ") return Entry(raw.right(length - 5)); if (raw.left(8) == "HEADING ") return Entry(raw.right(length - 8), true); TQString reading; TQString kanji; TQStringList meanings; TQString curmeaning; bool firstmeaning = true; TQCString parsemode("kanji"); unsigned int i; for (i = 0; i < length; i++) { TQChar ichar(raw.at(i)); if (ichar == '[') { parsemode = "reading"; } else if (ichar == ']') { // do nothing } else if (ichar == '/') { if (!firstmeaning) { meanings.append(curmeaning); curmeaning = ""; } else { firstmeaning = false; parsemode = "meaning"; } } else if (ichar == ' ') { if (parsemode == "meaning") // only one that needs the space curmeaning += ' '; } else if (parsemode == "kanji") { kanji += ichar; } else if (parsemode == "meaning") { curmeaning += ichar; } else if (parsemode == "reading") { reading += ichar; } } return (Entry(kanji, reading, meanings)); } Entry Dict::kanjiParse(const TQString &raw) { unsigned int length = raw.length(); if (raw.left(5) == "DICT ") return Entry(raw.right(length - 5)); if (raw.left(8) == "HEADING ") return Entry(raw.right(length - 8), true); TQStringList readings; TQString kanji; TQStringList meanings; TQString curmeaning; TQString curreading; TQString strfreq; TQString strgrade; TQString strstrokes; TQString strmiscount = ""; bool prevwasspace = true; TQChar detailname; TQCString parsemode("kanji"); // if there are two S entries, second is common miscount bool strokesset = false; unsigned int i; TQChar ichar; for (i = 0; i < length; i++) { ichar = raw.at(i); if (ichar == ' ') { if (parsemode == "reading") { readings.append(curreading); curreading = ""; } else if (parsemode == "kanji") { parsemode = "misc"; } else if (parsemode == "detail") { if (detailname == 'S') strokesset = true; parsemode = "misc"; } else if (parsemode == "meaning") { curmeaning += ichar; } prevwasspace = true; } else if (ichar == '{') { parsemode = "meaning"; } else if (ichar == '}') { meanings.append(curmeaning); curmeaning = ""; } else if (parsemode == "detail") { if (detailname == 'G') { strgrade += ichar; } else if (detailname == 'F') { strfreq += ichar; } else if (detailname == 'S') { if (strokesset) strmiscount += ichar; else strstrokes += ichar; } prevwasspace = false; } else if (parsemode == "kanji") { kanji += ichar; } else if (parsemode == "meaning") { curmeaning += ichar; } else if (parsemode == "reading") { curreading += ichar; } else if (parsemode == "misc" && prevwasspace) { if (TQRegExp("[A-Za-z0-9]").search(TQString(ichar)) >= 0) // is non-japanese? { detailname = ichar; parsemode = "detail"; } else { curreading = TQString(ichar); parsemode = "reading"; } } } return (Entry(kanji, readings, meanings, strgrade.toUInt(), strfreq.toUInt(), strstrokes.toUInt(), strmiscount.toUInt())); } TQString Dict::prettyMeaning(TQStringList Meanings) { TQString meanings; TQStringList::Iterator it; for (it = Meanings.begin(); it != Meanings.end(); ++it) meanings.append((*it).stripWhiteSpace()).append("; "); meanings.truncate(meanings.length() - 2); return meanings; } TQString Dict::prettyKanjiReading(TQStringList Readings) { TQStringList::Iterator it; TQString html; for (it = Readings.begin(); it != Readings.end(); ++it) { if ((*it) == "T1") html += i18n("In names: "); else { if ((*it) == "T2") html += i18n("As radical: "); else { html += (*it).stripWhiteSpace(); html += ", "; } } } html.truncate(html.length() - 2); // get rid of last , return html; } Dict::Entry Dict::firstEntry(Dict::SearchResult result) { for (TQValueListIterator it = result.list.begin(); it != result.list.end(); ++it) { if ((*it).dictName() == "__NOTSET" && (*it).header() == "__NOTSET") return (*it); } return Dict::Entry("__NOTHING"); } TQString Dict::firstEntryText(Dict::SearchResult result) { for (TQStringList::Iterator it = result.results.begin(); it != result.results.end(); ++it) { if ((*it).left(5) != "DICT " && (*it).left(7) != "HEADER ") return (*it); } return TQString("NONE "); } /////////////////////////////////////////////////////////////// Entry::Entry(const TQString & kanji, const TQString & reading, const TQStringList &meanings) : DictName(TQString::fromLatin1("__NOTSET")) , Header(TQString::fromLatin1("__NOTSET")) , Meanings(meanings) , Kanji(kanji) , KanaOnly(reading.isEmpty()) , Readings(KanaOnly ? kanji : reading) , ExtendedKanjiInfo(false) , Grade(0) , Strokes(0) , Miscount(0) , Freq(0) { } Entry::Entry(const TQString &kanji, TQStringList &readings, TQStringList &meanings, unsigned int grade, unsigned int freq, unsigned int strokes, unsigned int miscount) : DictName(TQString::fromLatin1("__NOTSET")) , Header(TQString::fromLatin1("__NOTSET")) , Meanings(meanings) , Kanji(kanji) , KanaOnly(false) , Readings(readings) , ExtendedKanjiInfo(true) , Grade(grade) , Strokes(strokes) , Miscount(miscount) , Freq(freq) { } Entry::Entry(const TQString &dictname) : KanaOnly(true) , ExtendedKanjiInfo(false) { DictName = dictname; } Entry::Entry(const TQString &headername, bool) : DictName(TQString::fromLatin1("__NOTSET")) , Header(headername) , KanaOnly(true) , ExtendedKanjiInfo(false) { } TQString Entry::dictName() { return DictName; } TQString Entry::header() { return Header; } bool Entry::kanaOnly() { return KanaOnly; } TQString Entry::kanji() { return Kanji; } TQStringList Entry::readings() { return Readings; } TQString Entry::firstReading() { return *Readings.at(0); } TQStringList Entry::meanings() { return Meanings; } unsigned int Entry::grade() { return Grade; } unsigned int Entry::freq() { return Freq; } unsigned int Entry::miscount() { return Miscount; } unsigned int Entry::strokes() { return Strokes; } bool Entry::extendedKanjiInfo() { return ExtendedKanjiInfo; } #include "dict.moc"