You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
501 lines
12 KiB
501 lines
12 KiB
15 years ago
|
/* LibHnj is dual licensed under LGPL and MPL. Boilerplate for both
|
||
|
* licenses follows.
|
||
|
*/
|
||
|
|
||
|
/* LibHnj - a library for high quality hyphenation and justification
|
||
|
* Copyright (C) 1998 Raph Levien,
|
||
|
* (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
|
||
|
* (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
|
||
|
*
|
||
|
* This library is free software; you can redistribute it and/or
|
||
|
* modify it under the terms of the GNU Library General Public
|
||
|
* License as published by the Free Software Foundation; either
|
||
|
* version 2 of the License, or (at your option) any later version.
|
||
|
*
|
||
|
* This library is distributed in the hope that it will be useful,
|
||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
* Library General Public License for more details.
|
||
|
*
|
||
|
* You should have received a copy of the GNU Library General Public
|
||
|
* License along with this library; if not, write to the
|
||
|
* Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
||
|
* Boston, MA 02110-1301 USA.
|
||
|
*/
|
||
|
|
||
|
/*
|
||
|
* The contents of this file are subject to the Mozilla Public License
|
||
|
* Version 1.0 (the "MPL"); you may not use this file except in
|
||
|
* compliance with the MPL. You may obtain a copy of the MPL at
|
||
|
* http://www.mozilla.org/MPL/
|
||
|
*
|
||
|
* Software distributed under the MPL is distributed on an "AS IS" basis,
|
||
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
|
||
|
* for the specific language governing rights and limitations under the
|
||
|
* MPL.
|
||
|
*
|
||
|
*/
|
||
|
#include <stdlib.h> /* for NULL, malloc */
|
||
|
#include <stdio.h> /* for fprintf */
|
||
|
#include <string.h> /* for strdup */
|
||
|
|
||
|
#ifdef UNX
|
||
|
#include <unistd.h> /* for exit */
|
||
|
#endif
|
||
|
|
||
|
#define noVERBOSE
|
||
|
|
||
|
#include "hnjalloc.h"
|
||
|
#include "hyphen.h"
|
||
|
|
||
|
static char *
|
||
|
hnj_strdup (const char *s)
|
||
|
{
|
||
|
char *new;
|
||
|
int l;
|
||
|
|
||
|
l = strlen (s);
|
||
|
new = hnj_malloc (l + 1);
|
||
|
memcpy (new, s, l);
|
||
|
new[l] = 0;
|
||
|
return new;
|
||
|
}
|
||
|
|
||
|
/* a little bit of a hash table implementation. This simply maps strings
|
||
|
to state numbers */
|
||
|
|
||
|
typedef struct _HashTab HashTab;
|
||
|
typedef struct _HashEntry HashEntry;
|
||
|
|
||
|
/* A cheap, but effective, hack. */
|
||
|
#define HASH_SIZE 31627
|
||
|
|
||
|
struct _HashTab {
|
||
|
HashEntry *entries[HASH_SIZE];
|
||
|
};
|
||
|
|
||
|
struct _HashEntry {
|
||
|
HashEntry *next;
|
||
|
char *key;
|
||
|
int val;
|
||
|
};
|
||
|
|
||
|
/* a char* hash function from ASU - adapted from Gtk+ */
|
||
|
static unsigned int
|
||
|
hnj_string_hash (const char *s)
|
||
|
{
|
||
|
const char *p;
|
||
|
unsigned int h=0, g;
|
||
|
|
||
|
for(p = s; *p != '\0'; p += 1) {
|
||
|
h = ( h << 4 ) + *p;
|
||
|
if ( ( g = h & 0xf0000000 ) ) {
|
||
|
h = h ^ (g >> 24);
|
||
|
h = h ^ g;
|
||
|
}
|
||
|
}
|
||
|
return h /* % M */;
|
||
|
}
|
||
|
|
||
|
static HashTab *
|
||
|
hnj_hash_new (void)
|
||
|
{
|
||
|
HashTab *hashtab;
|
||
|
int i;
|
||
|
|
||
|
hashtab = hnj_malloc (sizeof(HashTab));
|
||
|
for (i = 0; i < HASH_SIZE; i++)
|
||
|
hashtab->entries[i] = NULL;
|
||
|
|
||
|
return hashtab;
|
||
|
}
|
||
|
|
||
|
static void
|
||
|
hnj_hash_free (HashTab *hashtab)
|
||
|
{
|
||
|
int i;
|
||
|
HashEntry *e, *next;
|
||
|
|
||
|
for (i = 0; i < HASH_SIZE; i++)
|
||
|
for (e = hashtab->entries[i]; e; e = next)
|
||
|
{
|
||
|
next = e->next;
|
||
|
hnj_free (e->key);
|
||
|
hnj_free (e);
|
||
|
}
|
||
|
|
||
|
hnj_free (hashtab);
|
||
|
}
|
||
|
|
||
|
/* assumes that key is not already present! */
|
||
|
static void
|
||
|
hnj_hash_insert (HashTab *hashtab, const char *key, int val)
|
||
|
{
|
||
|
int i;
|
||
|
HashEntry *e;
|
||
|
|
||
|
i = hnj_string_hash (key) % HASH_SIZE;
|
||
|
e = hnj_malloc (sizeof(HashEntry));
|
||
|
e->next = hashtab->entries[i];
|
||
|
e->key = hnj_strdup (key);
|
||
|
e->val = val;
|
||
|
hashtab->entries[i] = e;
|
||
|
}
|
||
|
|
||
|
/* return val if found, otherwise -1 */
|
||
|
static int
|
||
|
hnj_hash_lookup (HashTab *hashtab, const char *key)
|
||
|
{
|
||
|
int i;
|
||
|
HashEntry *e;
|
||
|
|
||
|
i = hnj_string_hash (key) % HASH_SIZE;
|
||
|
for (e = hashtab->entries[i]; e; e = e->next)
|
||
|
if (!strcmp (key, e->key))
|
||
|
return e->val;
|
||
|
return -1;
|
||
|
}
|
||
|
|
||
|
/* Get the state number, allocating a new state if necessary. */
|
||
|
static int
|
||
|
hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
|
||
|
{
|
||
|
int state_num;
|
||
|
|
||
|
state_num = hnj_hash_lookup (hashtab, string);
|
||
|
|
||
|
if (state_num >= 0)
|
||
|
return state_num;
|
||
|
|
||
|
hnj_hash_insert (hashtab, string, dict->num_states);
|
||
|
/* predicate is true if dict->num_states is a power of two */
|
||
|
if (!(dict->num_states & (dict->num_states - 1)))
|
||
|
{
|
||
|
dict->states = hnj_realloc (dict->states,
|
||
|
(dict->num_states << 1) *
|
||
|
sizeof(HyphenState));
|
||
|
}
|
||
|
dict->states[dict->num_states].match = NULL;
|
||
|
dict->states[dict->num_states].fallback_state = -1;
|
||
|
dict->states[dict->num_states].num_trans = 0;
|
||
|
dict->states[dict->num_states].trans = NULL;
|
||
|
return dict->num_states++;
|
||
|
}
|
||
|
|
||
|
/* add a transition from state1 to state2 through ch - assumes that the
|
||
|
transition does not already exist */
|
||
|
static void
|
||
|
hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
|
||
|
{
|
||
|
int num_trans;
|
||
|
|
||
|
num_trans = dict->states[state1].num_trans;
|
||
|
if (num_trans == 0)
|
||
|
{
|
||
|
dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans));
|
||
|
}
|
||
|
else if (!(num_trans & (num_trans - 1)))
|
||
|
{
|
||
|
dict->states[state1].trans = hnj_realloc (dict->states[state1].trans,
|
||
|
(num_trans << 1) *
|
||
|
sizeof(HyphenTrans));
|
||
|
}
|
||
|
dict->states[state1].trans[num_trans].ch = ch;
|
||
|
dict->states[state1].trans[num_trans].new_state = state2;
|
||
|
dict->states[state1].num_trans++;
|
||
|
}
|
||
|
|
||
|
#ifdef VERBOSE
|
||
|
HashTab *global;
|
||
|
|
||
|
static char *
|
||
|
get_state_str (int state)
|
||
|
{
|
||
|
int i;
|
||
|
HashEntry *e;
|
||
|
|
||
|
for (i = 0; i < HASH_SIZE; i++)
|
||
|
for (e = global->entries[i]; e; e = e->next)
|
||
|
if (e->val == state)
|
||
|
return e->key;
|
||
|
return NULL;
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
HyphenDict *
|
||
|
hnj_hyphen_load (const char *fn)
|
||
|
{
|
||
|
HyphenDict *dict;
|
||
|
HashTab *hashtab;
|
||
|
FILE *f;
|
||
|
char buf[80];
|
||
|
char word[80];
|
||
|
char pattern[80];
|
||
|
int state_num, last_state;
|
||
|
int i, j;
|
||
|
char ch;
|
||
|
int found;
|
||
|
HashEntry *e;
|
||
|
|
||
|
f = fopen (fn, "r");
|
||
|
if (f == NULL)
|
||
|
return NULL;
|
||
|
|
||
|
hashtab = hnj_hash_new ();
|
||
|
#ifdef VERBOSE
|
||
|
global = hashtab;
|
||
|
#endif
|
||
|
hnj_hash_insert (hashtab, "", 0);
|
||
|
|
||
|
dict = hnj_malloc (sizeof(HyphenDict));
|
||
|
dict->num_states = 1;
|
||
|
dict->states = hnj_malloc (sizeof(HyphenState));
|
||
|
dict->states[0].match = NULL;
|
||
|
dict->states[0].fallback_state = -1;
|
||
|
dict->states[0].num_trans = 0;
|
||
|
dict->states[0].trans = NULL;
|
||
|
|
||
|
/* read in character set info */
|
||
|
for (i=0;i<MAX_NAME;i++) dict->cset[i]= 0;
|
||
|
fgets(dict->cset, sizeof(dict->cset),f);
|
||
|
for (i=0;i<MAX_NAME;i++)
|
||
|
if ((dict->cset[i] == '\r') || (dict->cset[i] == '\n'))
|
||
|
dict->cset[i] = 0;
|
||
|
|
||
|
while (fgets (buf, sizeof(buf), f) != NULL)
|
||
|
{
|
||
|
if (buf[0] != '%')
|
||
|
{
|
||
|
j = 0;
|
||
|
pattern[j] = '0';
|
||
|
for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
|
||
|
{
|
||
|
if (buf[i] >= '0' && buf[i] <= '9')
|
||
|
pattern[j] = buf[i];
|
||
|
else
|
||
|
{
|
||
|
word[j] = buf[i];
|
||
|
pattern[++j] = '0';
|
||
|
}
|
||
|
}
|
||
|
word[j] = '\0';
|
||
|
pattern[j + 1] = '\0';
|
||
|
|
||
|
/* Optimize away leading zeroes */
|
||
|
for (i = 0; pattern[i] == '0'; i++);
|
||
|
|
||
|
#ifdef VERBOSE
|
||
|
printf ("word %s pattern %s, j = %d\n", word, pattern + i, j);
|
||
|
#endif
|
||
|
found = hnj_hash_lookup (hashtab, word);
|
||
|
state_num = hnj_get_state (dict, hashtab, word);
|
||
|
dict->states[state_num].match = hnj_strdup (pattern + i);
|
||
|
|
||
|
/* now, put in the prefix transitions */
|
||
|
for (; found < 0 ;j--)
|
||
|
{
|
||
|
last_state = state_num;
|
||
|
ch = word[j - 1];
|
||
|
word[j - 1] = '\0';
|
||
|
found = hnj_hash_lookup (hashtab, word);
|
||
|
state_num = hnj_get_state (dict, hashtab, word);
|
||
|
hnj_add_trans (dict, state_num, last_state, ch);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* Could do unioning of matches here (instead of the preprocessor script).
|
||
|
If we did, the pseudocode would look something like this:
|
||
|
|
||
|
foreach state in the hash table
|
||
|
foreach i = [1..length(state) - 1]
|
||
|
state to check is substr (state, i)
|
||
|
look it up
|
||
|
if found, and if there is a match, union the match in.
|
||
|
|
||
|
It's also possible to avoid the quadratic blowup by doing the
|
||
|
search in order of increasing state string sizes - then you
|
||
|
can break the loop after finding the first match.
|
||
|
|
||
|
This step should be optional in any case - if there is a
|
||
|
preprocessed rule table, it's always faster to use that.
|
||
|
|
||
|
*/
|
||
|
|
||
|
/* put in the fallback states */
|
||
|
for (i = 0; i < HASH_SIZE; i++)
|
||
|
for (e = hashtab->entries[i]; e; e = e->next)
|
||
|
if ( *(e->key) ) /* not for the empty key entry */
|
||
|
{
|
||
|
for (j = 1; 1; j++)
|
||
|
{
|
||
|
state_num = hnj_hash_lookup (hashtab, e->key + j);
|
||
|
if (state_num >= 0)
|
||
|
break;
|
||
|
}
|
||
|
/*KBH: FIXME state 0 fallback_state should always be -1? */
|
||
|
if (e->val)
|
||
|
dict->states[e->val].fallback_state = state_num;
|
||
|
}
|
||
|
#ifdef VERBOSE
|
||
|
for (i = 0; i < HASH_SIZE; i++)
|
||
|
for (e = hashtab->entries[i]; e; e = e->next)
|
||
|
{
|
||
|
printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
|
||
|
dict->states[e->val].fallback_state);
|
||
|
for (j = 0; j < dict->states[e->val].num_trans; j++)
|
||
|
printf (" %c->%d\n", dict->states[e->val].trans[j].ch,
|
||
|
dict->states[e->val].trans[j].new_state);
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
#ifndef VERBOSE
|
||
|
hnj_hash_free (hashtab);
|
||
|
#endif
|
||
|
|
||
|
return dict;
|
||
|
}
|
||
|
|
||
|
void hnj_hyphen_free (HyphenDict *dict)
|
||
|
{
|
||
|
int state_num;
|
||
|
HyphenState *hstate;
|
||
|
|
||
|
for (state_num = 0; state_num < dict->num_states; state_num++)
|
||
|
{
|
||
|
hstate = &dict->states[state_num];
|
||
|
if (hstate->match)
|
||
|
hnj_free (hstate->match);
|
||
|
if (hstate->trans)
|
||
|
hnj_free (hstate->trans);
|
||
|
}
|
||
|
|
||
|
hnj_free (dict->states);
|
||
|
|
||
|
hnj_free (dict);
|
||
|
}
|
||
|
|
||
|
#define MAX_WORD 256
|
||
|
|
||
|
int hnj_hyphen_hyphenate (HyphenDict *dict,
|
||
|
const char *word, int word_size,
|
||
|
char *hyphens)
|
||
|
{
|
||
|
char prep_word_buf[MAX_WORD];
|
||
|
char *prep_word;
|
||
|
int i, j, k;
|
||
|
int state;
|
||
|
char ch;
|
||
|
HyphenState *hstate;
|
||
|
char *match;
|
||
|
int offset;
|
||
|
|
||
|
if (word_size + 3 < MAX_WORD)
|
||
|
prep_word = prep_word_buf;
|
||
|
else
|
||
|
prep_word = hnj_malloc (word_size + 3);
|
||
|
|
||
|
j = 0;
|
||
|
prep_word[j++] = '.';
|
||
|
|
||
|
for (i = 0; i < word_size; i++)
|
||
|
prep_word[j++] = word[i];
|
||
|
|
||
|
for (i = 0; i < j; i++)
|
||
|
hyphens[i] = '0';
|
||
|
|
||
|
prep_word[j++] = '.';
|
||
|
|
||
|
prep_word[j] = '\0';
|
||
|
#ifdef VERBOSE
|
||
|
printf ("prep_word = %s\n", prep_word);
|
||
|
#endif
|
||
|
|
||
|
/* now, run the finite state machine */
|
||
|
state = 0;
|
||
|
for (i = 0; i < j; i++)
|
||
|
{
|
||
|
ch = prep_word[i];
|
||
|
for (;;)
|
||
|
{
|
||
|
|
||
|
if (state == -1) {
|
||
|
/*return 1;*/
|
||
|
/*KBH: FIXME shouldn't this be as follows?*/
|
||
|
state = 0;
|
||
|
goto try_next_letter;
|
||
|
}
|
||
|
|
||
|
#ifdef VERBOSE
|
||
|
char *state_str;
|
||
|
state_str = get_state_str (state);
|
||
|
|
||
|
for (k = 0; k < i - strlen (state_str); k++)
|
||
|
putchar (' ');
|
||
|
printf ("%s", state_str);
|
||
|
#endif
|
||
|
|
||
|
hstate = &dict->states[state];
|
||
|
for (k = 0; k < hstate->num_trans; k++)
|
||
|
if (hstate->trans[k].ch == ch)
|
||
|
{
|
||
|
state = hstate->trans[k].new_state;
|
||
|
goto found_state;
|
||
|
}
|
||
|
state = hstate->fallback_state;
|
||
|
#ifdef VERBOSE
|
||
|
printf (" falling back, fallback_state %d\n", state);
|
||
|
#endif
|
||
|
}
|
||
|
found_state:
|
||
|
#ifdef VERBOSE
|
||
|
printf ("found state %d\n",state);
|
||
|
#endif
|
||
|
/* Additional optimization is possible here - especially,
|
||
|
elimination of trailing zeroes from the match. Leading zeroes
|
||
|
have already been optimized. */
|
||
|
match = dict->states[state].match;
|
||
|
if (match)
|
||
|
{
|
||
|
offset = i + 1 - strlen (match);
|
||
|
#ifdef VERBOSE
|
||
|
for (k = 0; k < offset; k++)
|
||
|
putchar (' ');
|
||
|
printf ("%s\n", match);
|
||
|
#endif
|
||
|
/* This is a linear search because I tried a binary search and
|
||
|
found it to be just a teeny bit slower. */
|
||
|
for (k = 0; match[k] && offset+k < word_size+1 ; k++)
|
||
|
if (hyphens[offset + k] < match[k])
|
||
|
hyphens[offset + k] = match[k];
|
||
|
}
|
||
|
|
||
|
/* KBH: we need this to make sure we keep looking in a word
|
||
|
for patterns even if the current character is not known in state 0
|
||
|
since patterns for hyphenation may occur anywhere in the word*/
|
||
|
try_next_letter: ;
|
||
|
|
||
|
}
|
||
|
#ifdef VERBOSE
|
||
|
for (i = 0; i < j; i++)
|
||
|
putchar (hyphens[i]);
|
||
|
putchar ('\n');
|
||
|
#endif
|
||
|
|
||
|
for (i = 0; i < j - 4; i++)
|
||
|
#if 0
|
||
|
if (hyphens[i + 1] & 1)
|
||
|
hyphens[i] = '-';
|
||
|
#else
|
||
|
hyphens[i] = hyphens[i + 1];
|
||
|
#endif
|
||
|
hyphens[0] = '0';
|
||
|
for (; i < word_size; i++)
|
||
|
hyphens[i] = '0';
|
||
|
hyphens[word_size] = '\0';
|
||
|
|
||
|
if (prep_word != prep_word_buf)
|
||
|
hnj_free (prep_word);
|
||
|
return 0;
|
||
|
}
|