You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
226 lines
4.9 KiB
226 lines
4.9 KiB
3 years ago
|
//
|
||
|
// Synonym.cc
|
||
|
//
|
||
|
// Synonym: A fuzzy matching algorithm to create a database of related words
|
||
|
// (or misspellings) that should be searched together.
|
||
|
//
|
||
|
// Part of the ht://Dig package <http://www.htdig.org/>
|
||
|
// Copyright (c) 1995-2004 The ht://Dig Group
|
||
|
// For copyright details, see the file COPYING in your distribution
|
||
|
// or the GNU Library General Public License (LGPL) version 2 or later
|
||
|
// <http://www.gnu.org/copyleft/lgpl.html>
|
||
|
//
|
||
|
// $Id: Synonym.cc,v 1.16 2004/05/28 13:15:20 lha Exp $
|
||
|
//
|
||
|
|
||
|
#ifdef HAVE_CONFIG_H
|
||
|
#include "htconfig.h"
|
||
|
#endif /* HAVE_CONFIG_H */
|
||
|
|
||
|
#include <fcntl.h>
|
||
|
|
||
|
#include "Synonym.h"
|
||
|
#include "htfuzzy.h"
|
||
|
#include "List.h"
|
||
|
#include "StringList.h"
|
||
|
#include "HtConfiguration.h"
|
||
|
|
||
|
#include "filecopy.h"
|
||
|
|
||
|
#include <stdio.h>
|
||
|
#include <stdlib.h>
|
||
|
#include <sys/stat.h>
|
||
|
|
||
|
#ifdef HAVE_STD
|
||
|
#include <fstream>
|
||
|
#ifdef HAVE_NAMESPACES
|
||
|
using namespace std;
|
||
|
#endif
|
||
|
#else
|
||
|
#include <fstream.h>
|
||
|
#endif /* HAVE_STD */
|
||
|
|
||
|
//*****************************************************************************
|
||
|
// Construct the synonym matcher on top of the Fuzzy base, handing it the
// configuration.  No database is attached at this point; db stays null
// until openIndex() assigns one.
Synonym::Synonym(const HtConfiguration& config_arg)
    : Fuzzy(config_arg)
{
    db = 0;             // nothing opened yet
    name = "synonyms";
}
|
||
|
|
||
|
|
||
|
//*****************************************************************************
|
||
|
// Release the database handle, if one is attached: close it, free it and
// reset the pointer.
Synonym::~Synonym()
{
    if (!db)
        return;         // nothing was ever opened

    db->Close();
    delete db;
    db = 0;
}
|
||
|
|
||
|
|
||
|
//*****************************************************************************
|
||
|
int
|
||
|
Synonym::createDB(const HtConfiguration &config)
|
||
|
{
|
||
|
String tmpdir = getenv("TMPDIR");
|
||
|
String dbFile;
|
||
|
|
||
|
#if defined(LIBHTDIG) || defined(LIBHTDIGPHP) || defined(_MSC_VER) //WIN32
|
||
|
int ret = -1;
|
||
|
char * source = NULL;
|
||
|
char * dest = NULL;
|
||
|
#endif
|
||
|
|
||
|
if (tmpdir.length())
|
||
|
dbFile = tmpdir;
|
||
|
else
|
||
|
dbFile = "/tmp";
|
||
|
|
||
|
dbFile << "/synonyms.db";
|
||
|
|
||
|
char input[1000];
|
||
|
FILE *fl;
|
||
|
|
||
|
const String sourceFile = config["synonym_dictionary"];
|
||
|
|
||
|
fl = fopen(sourceFile, "r");
|
||
|
if (fl == NULL)
|
||
|
{
|
||
|
cout << "htfuzzy/synonyms: unable to open " << sourceFile << endl;
|
||
|
cout << "htfuzzy/synonyms: Use the 'synonym_dictionary' attribute\n";
|
||
|
cout << "htfuzzy/synonyms: to specify the file that contains the synonyms\n";
|
||
|
return NOTOK;
|
||
|
}
|
||
|
|
||
|
Database *db = Database::getDatabaseInstance(DB_HASH);
|
||
|
|
||
|
if (db->OpenReadWrite(dbFile.get(), 0664) == NOTOK)
|
||
|
{
|
||
|
delete db;
|
||
|
db = 0;
|
||
|
return NOTOK;
|
||
|
}
|
||
|
|
||
|
String data;
|
||
|
String word;
|
||
|
int count = 0;
|
||
|
while (fgets(input, sizeof(input), fl))
|
||
|
{
|
||
|
StringList sl(input, " \t\r\n");
|
||
|
if (sl.Count() < 2)
|
||
|
{ // Avoid segfault caused by calling Database::Put()
|
||
|
if (debug) // with negative length for data field
|
||
|
{
|
||
|
cout<<"htfuzzy/synonyms: Rejected line with less than 2 words: "
|
||
|
<< input << endl;
|
||
|
cout.flush();
|
||
|
}
|
||
|
continue;
|
||
|
}
|
||
|
for (int i = 0; i < sl.Count(); i++)
|
||
|
{
|
||
|
data = 0;
|
||
|
for (int j = 0; j < sl.Count(); j++)
|
||
|
{
|
||
|
if (i != j)
|
||
|
data << sl[j] << ' ';
|
||
|
}
|
||
|
word = sl[i];
|
||
|
word.lowercase();
|
||
|
data.lowercase();
|
||
|
db->Put(word, String(data.get(), data.length() - 1));
|
||
|
if (debug && (count % 10) == 0)
|
||
|
{
|
||
|
cout << "htfuzzy/synonyms: " << count << ' ' << word << "\n";
|
||
|
cout.flush();
|
||
|
}
|
||
|
count++;
|
||
|
}
|
||
|
}
|
||
|
fclose(fl);
|
||
|
db->Close();
|
||
|
delete db;
|
||
|
|
||
|
#if defined(LIBHTDIG) || defined(LIBHTDIGPHP) || defined(_MSC_VER) //WIN32
|
||
|
|
||
|
//Uses file_copy function - works on Unix/Linux & WinNT
|
||
|
source = dbFile.get();
|
||
|
dest = (char *)config["synonym_db"].get();
|
||
|
|
||
|
//Attempt rename, if fail attempt copy & delete.
|
||
|
ret = rename(source, dest);
|
||
|
if (ret < 0)
|
||
|
{
|
||
|
ret = file_copy(source, dest, FILECOPY_OVERWRITE_ON);
|
||
|
if (ret == TRUE)
|
||
|
unlink(source);
|
||
|
else
|
||
|
return NOTOK;
|
||
|
}
|
||
|
|
||
|
if (debug)
|
||
|
{
|
||
|
cout << "htfuzzy/synonyms: " << count << ' ' << word << "\n";
|
||
|
cout << "htfuzzy/synonyms: Done.\n";
|
||
|
}
|
||
|
|
||
|
#else //This code uses a system call - Phase this out
|
||
|
|
||
|
struct stat stat_buf;
|
||
|
String mv("mv"); // assume it's in the PATH if predefined setting fails
|
||
|
if ((stat(MV, &stat_buf) != -1) && S_ISREG(stat_buf.st_mode))
|
||
|
mv = MV;
|
||
|
system(form("%s %s %s",
|
||
|
mv.get(), dbFile.get(), config["synonym_db"].get()));
|
||
|
|
||
|
#endif
|
||
|
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
|
||
|
//*****************************************************************************
|
||
|
int
|
||
|
Synonym::openIndex()
|
||
|
{
|
||
|
const String dbFile = config["synonym_db"];
|
||
|
|
||
|
if (db)
|
||
|
{
|
||
|
db->Close();
|
||
|
delete db;
|
||
|
db = 0;
|
||
|
}
|
||
|
db = Database::getDatabaseInstance(DB_HASH);
|
||
|
if (db->OpenRead(dbFile) == NOTOK)
|
||
|
{
|
||
|
delete db;
|
||
|
db = 0;
|
||
|
return NOTOK;
|
||
|
}
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
|
||
|
//*****************************************************************************
|
||
|
void
|
||
|
Synonym::getWords(char *originalWord, List &words)
|
||
|
{
|
||
|
String data;
|
||
|
String stripped = originalWord;
|
||
|
HtStripPunctuation(stripped);
|
||
|
|
||
|
if (db && db->Get(stripped, data) == OK)
|
||
|
{
|
||
|
char *token = strtok(data.get(), " ");
|
||
|
while (token)
|
||
|
{
|
||
|
words.Add(new String(token));
|
||
|
token = strtok(0, " ");
|
||
|
}
|
||
|
}
|
||
|
}
|