You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
442 lines
9.0 KiB
442 lines
9.0 KiB
3 years ago
|
//
|
||
|
// EndingsDB.cc
|
||
|
//
|
||
|
// EndingsDB: Implementation of the private endings database
|
||
|
//
|
||
|
// Part of the ht://Dig package <http://www.htdig.org/>
|
||
|
// Copyright (c) 1995-2004 The ht://Dig Group
|
||
|
// For copyright details, see the file COPYING in your distribution
|
||
|
// or the GNU Library General Public License (LGPL) version 2 or later
|
||
|
// <http://www.gnu.org/copyleft/lgpl.html>
|
||
|
//
|
||
|
// $Id: EndingsDB.cc,v 1.17 2004/05/28 13:15:20 lha Exp $
|
||
|
//
|
||
|
|
||
|
#ifdef HAVE_CONFIG_H
|
||
|
#include "htconfig.h"
|
||
|
#endif /* HAVE_CONFIG_H */
|
||
|
|
||
|
#include <fcntl.h>
|
||
|
|
||
|
#include "Endings.h"
|
||
|
#include "htfuzzy.h"
|
||
|
#include "SuffixEntry.h"
|
||
|
#include "Dictionary.h"
|
||
|
#include "List.h"
|
||
|
#include "HtConfiguration.h"
|
||
|
|
||
|
#include "filecopy.h"
|
||
|
|
||
|
// This is an attempt to get around compatibility problems
|
||
|
// with the included regex
|
||
|
#ifdef _MSC_VER /* _WIN32 */
|
||
|
#include "regex_win32.h"
|
||
|
#else
|
||
|
# ifdef USE_RX
|
||
|
# include <rxposix.h>
|
||
|
# else // Use regex
|
||
|
# ifdef HAVE_BROKEN_REGEX
|
||
|
# include <regex.h>
|
||
|
# else // include regex code and header
|
||
|
# include "gregex.h"
|
||
|
# endif
|
||
|
# endif
|
||
|
#endif //_MSC_VER /* _WIN32 */
|
||
|
|
||
|
#include <stdio.h>
|
||
|
#include <stdlib.h>
|
||
|
#include <sys/stat.h>
|
||
|
|
||
|
#ifdef HAVE_STD
|
||
|
#include <fstream>
|
||
|
#ifdef HAVE_NAMESPACES
|
||
|
using namespace std;
|
||
|
#endif
|
||
|
#else
|
||
|
#include <fstream.h>
|
||
|
#endif /* HAVE_STD */
|
||
|
|
||
|
//*****************************************************************************
|
||
|
//
|
||
|
int
|
||
|
Endings::createDB(const HtConfiguration &config)
|
||
|
{
|
||
|
Dictionary rules;
|
||
|
String tmpdir = getenv("TMPDIR");
|
||
|
String word2root, root2word;
|
||
|
|
||
|
#if defined(LIBHTDIG) || defined(LIBHTDIGPHP) || defined(_MSC_VER) //WIN32
|
||
|
int ret = -1;
|
||
|
char * source = NULL;
|
||
|
char * dest = NULL;
|
||
|
#endif
|
||
|
|
||
|
if (tmpdir.length())
|
||
|
{
|
||
|
word2root = tmpdir;
|
||
|
root2word = tmpdir;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
word2root = "/tmp";
|
||
|
root2word = "/tmp";
|
||
|
}
|
||
|
|
||
|
word2root << "/word2root.db";
|
||
|
root2word << "/root2word.db";
|
||
|
|
||
|
if (debug)
|
||
|
cout << "htfuzzy/endings: Reading rules\n";
|
||
|
|
||
|
if (readRules(rules, config["endings_affix_file"]) == NOTOK)
|
||
|
return NOTOK;
|
||
|
|
||
|
if (debug)
|
||
|
cout << "htfuzzy/endings: Creating databases\n";
|
||
|
|
||
|
if (createRoot(rules, word2root, root2word,
|
||
|
config["endings_dictionary"]) == NOTOK)
|
||
|
return NOTOK;
|
||
|
|
||
|
//
|
||
|
// Since we used files in TMPDIR for our temporary databases, we need
|
||
|
// to now move them to the correct location as defined in the config
|
||
|
// database.
|
||
|
//
|
||
|
|
||
|
#if defined(LIBHTDIG) || defined(LIBHTDIGPHP) || defined(_MSC_VER) //WIN32
|
||
|
|
||
|
//Uses file_copy function - works on Unix/Linux & WinNT
|
||
|
source = root2word.get();
|
||
|
dest = (char *)config["endings_root2word_db"].get();
|
||
|
|
||
|
//Attempt rename, if fail attempt copy & delete.
|
||
|
ret = rename(source, dest);
|
||
|
if (ret < 0)
|
||
|
{
|
||
|
ret = file_copy(source, dest, FILECOPY_OVERWRITE_ON);
|
||
|
if (ret == TRUE)
|
||
|
unlink(source);
|
||
|
else
|
||
|
return NOTOK;
|
||
|
}
|
||
|
|
||
|
source = word2root.get();
|
||
|
dest = (char *)config["endings_word2root_db"].get();
|
||
|
|
||
|
//Attempt rename, if fail attempt copy & delete.
|
||
|
ret = rename(source, dest);
|
||
|
if (ret < 0)
|
||
|
{
|
||
|
ret = file_copy(source, dest, FILECOPY_OVERWRITE_ON);
|
||
|
if (ret == TRUE)
|
||
|
unlink(source);
|
||
|
else
|
||
|
return NOTOK;
|
||
|
}
|
||
|
|
||
|
#else //This code uses a system call - Phase this out
|
||
|
|
||
|
struct stat stat_buf;
|
||
|
String mv("mv"); // assume it's in the PATH if predefined setting fails
|
||
|
if ((stat(MV, &stat_buf) != -1) && S_ISREG(stat_buf.st_mode))
|
||
|
mv = MV;
|
||
|
system(form("%s %s %s;%s %s %s",
|
||
|
mv.get(), root2word.get(), config["endings_root2word_db"].get(),
|
||
|
mv.get(), word2root.get(), config["endings_word2root_db"].get()));
|
||
|
|
||
|
#endif
|
||
|
|
||
|
return OK;
|
||
|
|
||
|
}
|
||
|
|
||
|
|
||
|
//*****************************************************************************
|
||
|
int
|
||
|
Endings::readRules(Dictionary &rules, const String& rulesFile)
|
||
|
{
|
||
|
FILE *fl = fopen(rulesFile, "r");
|
||
|
|
||
|
if (fl == NULL)
|
||
|
return NOTOK;
|
||
|
|
||
|
int inSuffixes = 0;
|
||
|
char currentSuffix[2] = " ";
|
||
|
char *p;
|
||
|
char input[1024];
|
||
|
String line;
|
||
|
|
||
|
while (fgets(input, sizeof(input), fl))
|
||
|
{
|
||
|
if (input[0] == '\n' || input[0] == '#')
|
||
|
continue;
|
||
|
|
||
|
if (mystrncasecmp(input, "suffixes", 8) == 0)
|
||
|
{
|
||
|
inSuffixes = 1;
|
||
|
continue;
|
||
|
}
|
||
|
else if (mystrncasecmp(input, "prefixes", 8) == 0)
|
||
|
{
|
||
|
inSuffixes = 0;
|
||
|
continue;
|
||
|
}
|
||
|
if (!inSuffixes)
|
||
|
continue;
|
||
|
|
||
|
if (mystrncasecmp(input, "flag ", 5) == 0)
|
||
|
{
|
||
|
p = input + 5;
|
||
|
while (*p == '*' || *p == ' ' || *p == '\t')
|
||
|
p++;
|
||
|
currentSuffix[0] = *p;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
line << input;
|
||
|
line.chop("\r\n");
|
||
|
if (line.indexOf('>') > 0)
|
||
|
{
|
||
|
List *list;
|
||
|
SuffixEntry *se = new SuffixEntry(line);
|
||
|
|
||
|
if (rules.Exists(currentSuffix))
|
||
|
{
|
||
|
list = (List *) rules[currentSuffix];
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
list = new List;
|
||
|
rules.Add(currentSuffix, list);
|
||
|
}
|
||
|
list->Add(se);
|
||
|
line = 0;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
fclose(fl);
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
|
||
|
//*****************************************************************************
|
||
|
int
|
||
|
Endings::createRoot(Dictionary &rules, char *word2root, char *root2word, const String& dictFile)
|
||
|
{
|
||
|
FILE *fl = fopen(dictFile, "r");
|
||
|
if (fl == NULL)
|
||
|
return NOTOK;
|
||
|
|
||
|
Database *w2r = Database::getDatabaseInstance(DB_BTREE);
|
||
|
Database *r2w = Database::getDatabaseInstance(DB_BTREE);
|
||
|
|
||
|
w2r->OpenReadWrite(word2root, 0664);
|
||
|
r2w->OpenReadWrite(root2word, 0664);
|
||
|
|
||
|
char input[1024];
|
||
|
char *p;
|
||
|
String words;
|
||
|
String word;
|
||
|
List wordList;
|
||
|
int count = 0;
|
||
|
String data;
|
||
|
|
||
|
while (fgets(input, sizeof(input), fl))
|
||
|
{
|
||
|
if ((count % 100) == 0 && debug == 1)
|
||
|
{
|
||
|
cout << "htfuzzy/endings: words: " << count << '\n';
|
||
|
cout.flush();
|
||
|
}
|
||
|
count++;
|
||
|
|
||
|
p = strchr(input, '/');
|
||
|
if (p == NULL)
|
||
|
continue; // Only words that have legal endings are used
|
||
|
|
||
|
*p++ = '\0';
|
||
|
|
||
|
mungeWord(input, word);
|
||
|
expandWord(words, wordList, rules, word, p);
|
||
|
|
||
|
if (debug > 1)
|
||
|
cout << "htfuzzy/endings: " << word << " --> " << words << endl;
|
||
|
|
||
|
//
|
||
|
// Store the root mapped to the list of expanded words.
|
||
|
//
|
||
|
r2w->Put(word, words);
|
||
|
|
||
|
//
|
||
|
// For each of the expanded words, build a map to its root.
|
||
|
//
|
||
|
for (int i = 0; i < wordList.Count(); i++)
|
||
|
{
|
||
|
//
|
||
|
// Append to existing record if there is one.
|
||
|
//
|
||
|
data = "";
|
||
|
if (w2r->Get(*(String *)wordList[i], data) == OK)
|
||
|
data << ' ';
|
||
|
data << word;
|
||
|
w2r->Put(*(String *)wordList[i], data);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (debug == 1)
|
||
|
cout << endl;
|
||
|
|
||
|
fclose(fl);
|
||
|
w2r->Close();
|
||
|
r2w->Close();
|
||
|
delete w2r;
|
||
|
delete r2w;
|
||
|
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
|
||
|
//*****************************************************************************
|
||
|
// Convert a word from the dictionary format into something we can actually
|
||
|
// use. This means that the word will be converted to lowercase and that
|
||
|
// any accents will be combined into single characters.
|
||
|
//
|
||
|
void
|
||
|
Endings::mungeWord(char *input, String &word)
|
||
|
{
|
||
|
char *p = input + 1;
|
||
|
|
||
|
word = 0;
|
||
|
while (*input)
|
||
|
{
|
||
|
p = input + 1;
|
||
|
switch (*p)
|
||
|
{
|
||
|
case '"': // The previous character needs to get an umlaut
|
||
|
switch (*input)
|
||
|
{
|
||
|
case 'a':
|
||
|
case 'A':
|
||
|
word << char(228);
|
||
|
input += 2;
|
||
|
continue;
|
||
|
break;
|
||
|
case 'e':
|
||
|
case 'E':
|
||
|
word << char(235);
|
||
|
input += 2;
|
||
|
continue;
|
||
|
break;
|
||
|
case 'i':
|
||
|
case 'I':
|
||
|
word << char(239);
|
||
|
input += 2;
|
||
|
continue;
|
||
|
break;
|
||
|
case 'o':
|
||
|
case 'O':
|
||
|
word << char(246);
|
||
|
input += 2;
|
||
|
continue;
|
||
|
break;
|
||
|
case 'u':
|
||
|
case 'U':
|
||
|
word << char(252);
|
||
|
input += 2;
|
||
|
continue;
|
||
|
break;
|
||
|
}
|
||
|
break;
|
||
|
|
||
|
case 'S': // See if the previous character needs to be an sz
|
||
|
if (*input == 's')
|
||
|
{
|
||
|
word << char(223);
|
||
|
input += 2;
|
||
|
continue;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
word << *input;
|
||
|
}
|
||
|
break;
|
||
|
|
||
|
default:
|
||
|
word << *input;
|
||
|
break;
|
||
|
}
|
||
|
input++;
|
||
|
}
|
||
|
word.lowercase();
|
||
|
}
|
||
|
|
||
|
|
||
|
//*****************************************************************************
|
||
|
void
|
||
|
Endings::expandWord(String &words, List &wordList,
|
||
|
Dictionary &rules, char *word, char *suffixes)
|
||
|
{
|
||
|
char suffix[2] = " ";
|
||
|
String root;
|
||
|
SuffixEntry *entry;
|
||
|
List *suffixRules;
|
||
|
char *p;
|
||
|
String rule;
|
||
|
|
||
|
words = 0;
|
||
|
wordList.Destroy();
|
||
|
|
||
|
while (*suffixes > ' ')
|
||
|
{
|
||
|
suffix[0] = *suffixes++;
|
||
|
if (!rules.Exists(suffix))
|
||
|
continue;
|
||
|
|
||
|
suffixRules = (List *) rules[suffix];
|
||
|
for (int i = 0; i < suffixRules->Count(); i++)
|
||
|
{
|
||
|
entry = (SuffixEntry *) (*suffixRules)[i];
|
||
|
root = word;
|
||
|
regex_t reg;
|
||
|
rule = entry->rule;
|
||
|
if (strchr((char*)rule, '\''))
|
||
|
continue;
|
||
|
if (debug > 2)
|
||
|
cout << "Applying regex '" << entry->expression << "' to " << word << endl;
|
||
|
regcomp(®, (char*)entry->expression, REG_ICASE | REG_NOSUB | REG_EXTENDED);
|
||
|
if (regexec(®, word, 0, NULL, 0) == 0)
|
||
|
{
|
||
|
//
|
||
|
// Matched
|
||
|
//
|
||
|
if (rule[0] == '-')
|
||
|
{
|
||
|
//
|
||
|
// We need to remove something...
|
||
|
//
|
||
|
p = strchr((char*)rule, ',');
|
||
|
if (p)
|
||
|
{
|
||
|
*p++ = '\0';
|
||
|
root.chop((int)strlen(rule.get()) - 1);
|
||
|
root << p;
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
root << rule;
|
||
|
}
|
||
|
root.lowercase();
|
||
|
if (debug > 2)
|
||
|
cout << word << " with " << rule << " --> '" << root << "'\n";
|
||
|
wordList.Add(new String(root));
|
||
|
words << root << ' ';
|
||
|
}
|
||
|
regfree(®);
|
||
|
}
|
||
|
}
|
||
|
words.chop(1);
|
||
|
}
|