You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
169 lines
4.6 KiB
169 lines
4.6 KiB
//
|
|
// Accents.cc
|
|
//
|
|
// Accents: A fuzzy matching algorithm by Robert Marchand, to treat all
|
|
// ISO-8859-1 accented letters as equivalent to their unaccented
|
|
// counterparts.
|
|
//
|
|
// Part of the ht://Dig package <http://www.htdig.org/>
|
|
// Copyright (c) 2000-2004 The ht://Dig Group
|
|
// For copyright details, see the file COPYING in your distribution
|
|
// or the GNU Library General Public License (LGPL) version 2 or later
|
|
// <http://www.gnu.org/copyleft/lgpl.html>
|
|
//
|
|
// $Id: Accents.cc,v 1.5 2004/05/28 13:15:19 lha Exp $
|
|
//
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "htconfig.h"
|
|
#endif /* HAVE_CONFIG_H */
|
|
|
|
#include "Configuration.h"
|
|
#include "Accents.h"
|
|
#include "Dictionary.h"
|
|
#include <ctype.h>
|
|
|
|
#ifdef HAVE_STD
|
|
#include <fstream>
|
|
#ifdef HAVE_NAMESPACES
|
|
using namespace std;
|
|
#endif
|
|
#else
|
|
#include <fstream.h>
|
|
#endif /* HAVE_STD */
|
|
|
|
extern int debug;
|
|
|
|
/*-------------------------------------------------------------------.
| Added by Robert Marchand to allow proper handling of ISO-LATIN     |
| (derived from code by Pierre Rosa)                                 |
`-------------------------------------------------------------------*/
|
|
|
|
/*--------------------------------------------------.
|
|
| table iso-latin1 "minusculisee" et "de-accentuee" |
|
|
`--------------------------------------------------*/
|
|
|
|
static char MinusculeISOLAT1[256] = {
|
|
0, 1, 2, 3, 4, 5, 6, 7,
|
|
8, 9, 10, 11, 12, 13, 14, 15,
|
|
16, 17, 18, 19, 20, 21, 22, 23,
|
|
24, 25, 26, 27, 28, 29, 30, 31,
|
|
32, 33, 34, 35, 36, 37, 38, 39,
|
|
40, 41, 42, 43, 44, 45, 46, 47,
|
|
48, 49, 50, 51, 52, 53, 54, 55,
|
|
56, 57, 58, 59, 60, 61, 62, 63,
|
|
64, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
|
|
'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
|
|
'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
|
|
'x', 'y', 'z', 91, 92, 93, 94, 95,
|
|
96, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
|
|
'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
|
|
'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
|
|
'x', 'y', 'z', 123, 124, 125, 126, 127,
|
|
128, 129, 130, 131, 132, 133, 134, 135,
|
|
136, 137, 138, 139, 140, 141, 142, 143,
|
|
144, 145, 146, 147, 148, 149, 150, 151,
|
|
152, 153, 154, 155, 156, 157, 158, 159,
|
|
160, 161, 162, 163, 164, 165, 166, 167,
|
|
168, 168, 170, 171, 172, 173, 174, 175,
|
|
176, 177, 178, 179, 180, 181, 182, 183,
|
|
184, 185, 186, 187, 188, 189, 190, 191,
|
|
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
|
|
'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
|
|
208, 'n', 'o', 'o', 'o', 'o', 'o', 'o',
|
|
'o', 'u', 'u', 'u', 'u', 'y', 222, 223,
|
|
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
|
|
'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
|
|
240, 'n', 'o', 'o', 'o', 'o', 'o', 'o',
|
|
'o', 'u', 'u', 'u', 'u', 'y', 254, 255};
|
|
|
|
|
|
//*****************************************************************************
|
|
// Accents::Accents(const HtConfiguration& config_arg)
|
|
//
|
|
Accents::Accents(const HtConfiguration& config_arg) :
|
|
Fuzzy(config_arg)
|
|
{
|
|
name = "accents";
|
|
}
|
|
|
|
|
|
//*****************************************************************************
|
|
// Accents::~Accents()
|
|
//
|
|
Accents::~Accents()
|
|
{
|
|
}
|
|
|
|
//*****************************************************************************
|
|
// void Accents::generateKey(char *word, String &key)
|
|
//
|
|
void
|
|
Accents::generateKey(char *word, String &key)
|
|
{
|
|
static int maximum_word_length = config.Value("maximum_word_length", 12);
|
|
|
|
if (!word || !*word)
|
|
return;
|
|
|
|
String temp(word);
|
|
if (temp.length() > maximum_word_length)
|
|
temp.chop(temp.length()-maximum_word_length);
|
|
word = temp.get();
|
|
key = '0';
|
|
while (*word) {
|
|
key << MinusculeISOLAT1[ (unsigned char) *word++ ];
|
|
}
|
|
}
|
|
|
|
|
|
//*****************************************************************************
|
|
// void Accents::addWord(char *word)
|
|
//
|
|
void
|
|
Accents::addWord(char *word)
|
|
{
|
|
if (!dict)
|
|
{
|
|
dict = new Dictionary;
|
|
}
|
|
|
|
String key;
|
|
generateKey(word, key);
|
|
|
|
// Do not add fuzzy key as a word, will be added at search time.
|
|
if (mystrcasecmp(word, key.get()) == 0)
|
|
return;
|
|
|
|
String *s = (String *) dict->Find(key);
|
|
if (s)
|
|
{
|
|
// if (mystrcasestr(s->get(), word) != 0)
|
|
(*s) << ' ' << word;
|
|
}
|
|
else
|
|
{
|
|
dict->Add(key, new String(word));
|
|
}
|
|
}
|
|
|
|
|
|
//*****************************************************************************
|
|
// void Accents::getWords(char *word, List &words)
|
|
//
|
|
void
|
|
Accents::getWords(char *word, List &words)
|
|
{
|
|
|
|
if (!word || !*word)
|
|
return;
|
|
|
|
Fuzzy::getWords(word, words);
|
|
|
|
// fuzzy key itself is always searched.
|
|
String fuzzyKey;
|
|
generateKey(word, fuzzyKey);
|
|
if (mystrcasecmp(fuzzyKey.get(), word) != 0)
|
|
words.Add(new String(fuzzyKey));
|
|
}
|