You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
tdeedu/kiten/xjdxgen.c

424 lines
10 KiB

/**************************************************************************
* X J D X G E N
* Author: Jim Breen
* Index (.xjdx) generator program from XJDIC
*
* V2.3 - indexes JIS X 0212 (3-byte EUC) kanji
***************************************************************************/
/* This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 1, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
/* Changed: ignore all rc stuff. use args 1 and 2 for input/output file.
-- jason */
#include <config.h>
#include <inttypes.h>
#include <sys/stat.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#ifdef HAVE_STDINT_H
#include <stdint.h>
#endif
#include "xjdic.h"
/* Boolean values used for the parser state flags and returned by alphaoreuc(). */
#define TRUE 1
#define FALSE 0
/* Tag character: main() accepts it as the start of a word and, for a saved
   word that begins with it, emits a second index entry one byte further on. */
#define SPTAG '@'
#define EXLIM 100 /* number of slots in exlist[] and exlens[] */
#define TOKENLIM 40 /* size in bytes of the token buffer in main() */
unsigned char *db; /* in-memory dictionary image; main() reads the file in at db+1 and stores a newline (10) at db[0] and after the data */
unsigned char ENVname[50]; /* in the code shown here, referenced only by the commented-out xjdicrc() */
unsigned char *dicenv; /* in the code shown here, referenced only by the commented-out xjdicrc() */
struct stat buf; /* stat() result for the dictionary file */
uint32_t dbyte; /* number of bytes of db the parser scans (file size + 1) */
uint32_t *jindex; /* index table: slot 0 receives diclen+jiver, slots 1.. hold offsets into db */
uint32_t indptr,llone; /* indptr: jindex slot currently being filled; llone: set to 1 and passed to jqsort() as the first slot */
const char *ctl_file = ".xjdicrc"; /* in the code shown here, referenced only by the commented-out xjdicrc() */
const char *Dname; /* input dictionary path, taken from argv[1] */
const char *JDXname; /* output index path, taken from argv[2] */
unsigned char exlist[EXLIM][11]; /* list of words to be excluded */
int excount,exlens[EXLIM]; /* excount: index of the last exlist entry; exlens[]: length of each entry. Only the commented-out xjdicrc() fills these, so they keep their zero initial values. */
int jiver = 14; /*The last time the index structure changed was Version1.4*/
/*====== prototypes=================================================*/
int stringcomp(unsigned char *s1, unsigned char *s2); /* case-folding compare of the first strlen(s1) bytes; 0 on match, 1 otherwise */
void jqsort(int32_t i, int32_t j); /* sorts jindex[i..j] in place, ordering by Kstrcmp() */
int Kstrcmp(uint32_t lhs, uint32_t rhs); /* compares the dictionary text found at two offsets into db */
void xjdicrc(); /* NOTE(review): the only definition visible here is commented out */
int alphaoreuc(unsigned char x); /* TRUE for ASCII letters, digits and bytes with the high bit set */
/*
 * Compare the first strlen(s1) bytes of s1 with s2, ignoring ASCII case.
 *
 * Folding follows the file's historical rule: every byte below 0x60 has bit
 * 0x20 set before the comparison, so 'A'-'Z' match 'a'-'z' (and, as a side
 * effect, '@' matches '`').
 *
 * s1, s2: NUL-terminated byte strings; neither is modified.
 * Returns 0 when s2 starts with s1 under that folding (an empty s1 therefore
 * always matches) and 1 otherwise.  Callers that need full equality must
 * compare the lengths themselves.
 */
int stringcomp(unsigned char *s1, unsigned char *s2)
{
    size_t len = strlen((const char *)s1);
    size_t i;

    for (i = 0; i < len; i++) {
        unsigned char c1 = s1[i];
        unsigned char c2 = s2[i];

        /* s2 ended before s1: report a mismatch now.  Without this check the
           terminator would be folded to 0x20, match a space in s1, and the
           loop would go on to read past the end of s2. */
        if (c2 == '\0') {
            return 1;
        }
        if (c1 < 0x60) {
            c1 |= 0x20;
        }
        if (c2 < 0x60) {
            c2 |= 0x20;
        }
        if (c1 != c2) {
            return 1;
        }
    }
    return 0;
}
extern char *getenv(const char *name);
/*====function to Load Dictionary and load/create index table=======*/
int main(int argc, char *argv[])
{
FILE *fp,*fopen();
uint32_t possav,schi,diclen,indlen;
int i,inwd,cstrp,saving,isc,nodread;
unsigned char c;
unsigned char currstr[TOKENLIM];
printf("\nNOTE: running this program by itself is never necessary. Kiten will run it automatically.\n");
printf("\nXJDXGEN V2.3 Index Table Generator for XJDIC. \n Copyright J.W. Breen, 1998\n");
if (argc < 3)
{
printf("\nUSAGE: kitengen input output.xjdx\n");
exit(2);
}
Dname = argv[1];
JDXname = argv[2];
printf("Commandline request to use files %s and %s \n", Dname, JDXname);
inwd = FALSE;
indptr = 1;
llone = 1;
if(stat(Dname, &buf) != 0)
{
perror(NULL);
printf("Cannot stat: %s \n",Dname);
exit(1);
}
diclen = buf.st_size;
printf("\nWARNING!! This program may take a long time to run .....\n");
puts ("\nLoading Dictionary file. Please wait.....\n");
fp=fopen(Dname,"rb");
if (fp==NULL )
{
printf("\nCannot open dictionary file\n");
exit(1);
}
db = (unsigned char *)malloc((diclen+100) * sizeof(unsigned char));
if(db == NULL)
{
fprintf(stderr,"malloc() for dictionary failed.\n");
fclose(fp);
exit(1);
}
nodread = diclen/1024;
dbyte = fread((unsigned char *)db+1, 1024, nodread, fp);
nodread = diclen % 1024;
dbyte = fread((unsigned char *)(db+(diclen/1024)*1024)+1, nodread,1, fp);
fclose(fp);
diclen++;
dbyte = diclen;
db[diclen] = 10;
db[0] = 10;
printf("Dictionary size: %d bytes.\n",dbyte);
indlen = (diclen * 3)/4;
jindex = (uint32_t *)malloc(indlen);
if(jindex == NULL)
{
fprintf(stderr,"malloc() for index table failed.\n");
exit(1);
}
printf("Parsing.... \n");
/*this is the dictionary parser. It places an entry in jindex for every
kana/kanji string and every alphabetic string it finds which is >=3
characters and is not on the "exclude" list */
indptr = 1;
saving = FALSE;
cstrp = 0;
for (schi =0; schi < dbyte; schi++) /* scan whole dictionary */
{
c = db[schi];
if (inwd)
{
if ((alphaoreuc(c))||(c == '-')||(c == '.')||((c >= '0') && (c <= '9')))
{
currstr[cstrp] = c;
if(cstrp < TOKENLIM-1) cstrp++;
}
else
{
currstr[cstrp] = '\0';
inwd = FALSE;
if ((strlen(currstr) <= 2) && (currstr[0] < 127))saving = FALSE;
if ((strlen(currstr) == 2) && (currstr[1] <= '9'))saving = TRUE;
if (saving && (currstr[0] > 127))
{
possav = jindex[indptr];
indptr++;
if (indptr > indlen/sizeof(int32_t))
{
printf("Index table overflow. Dictionary too etarge?\n");
exit(1);
}
/* generate index for *every* kanji in key */
i = 2;
if (currstr[0] == 0x8f) i++;
for ( ; i < strlen(currstr); i+=2)
{
if((currstr[i] >= 0xb0) || (currstr[i] == 0x8f))
{
jindex[indptr] = possav+i;
indptr++;
if (indptr > indlen/sizeof(int32_t))
{
printf("Index table overflow. Dictionary too large?\n");
exit(1);
}
}
if (currstr[i] == 0x8f) i++;
}
}
if (saving && (currstr[0] < 127))
{
indptr++;
if (indptr > indlen/sizeof(int32_t))
{
printf("Index table overflow. Dictionary too large?\n");
exit(1);
}
/* If this is non-Japanese, and has a 'SPTAGn' tag, generate two indices */
if ( currstr[0] == SPTAG)
{
jindex[indptr] = jindex[indptr-1]+1;
strcpy(currstr,currstr+1);
indptr++;
if (indptr > indlen/sizeof(int32_t))
{
printf("Index table overflow. Dictionary too large?\n");
exit(1);
}
}
if (currstr[0] < 128)
{
for (isc = 0; isc <= excount; isc++)
{
if (( (uint) exlens[isc] == strlen(currstr)) &&
(stringcomp(currstr,exlist[isc]) == 0) )
{
indptr--;
break;
}
}
}
}
}
}
else
{
if (alphaoreuc(c) || c == SPTAG)
{
inwd = TRUE;
jindex[indptr] = schi;
cstrp = 1;
currstr[0] = c;
currstr[1] = '\0';
saving = TRUE;
}
}
}
indptr--;
printf("Index entries: %d \nSorting (this is slow)......\n",indptr);
jqsort(llone,indptr);
printf("Sorted\nWriting index file ....\n");
fp = fopen(JDXname,"wb");
if (fp==NULL )
{
printf("\nCannot open %s output file\n",JDXname);
exit(1);
}
jindex[0] = diclen+jiver;
fwrite(jindex,sizeof(int32_t),indptr+1,fp);
fclose(fp);
return 0;
}
/*======function to sort jindex table====================*/
/*
 * Sort jindex[lhs..rhs] (inclusive) in place into the order defined by
 * Kstrcmp().  A range with lhs >= rhs is left untouched.
 *
 * The partitioning is unchanged: the middle element is used as pivot and
 * every element comparing strictly less than it is moved to the front.
 * Only the smaller partition is handled by a recursive call; the larger
 * one is handled by the enclosing loop.  That keeps the recursion depth
 * logarithmic in the range size even when the range holds long runs of
 * equal keys, which all end up on the same side of the pivot.
 */
void jqsort(int32_t lhs, int32_t rhs)
{
    while (lhs < rhs) {
        int32_t i, last, midp;
        uint32_t temp;

        /* Swap ( lhs , midpoint ); written so the sum cannot overflow. */
        midp = lhs + (rhs - lhs) / 2;
        temp = jindex[lhs];
        jindex[lhs] = jindex[midp];
        jindex[midp] = temp;

        last = lhs;
        for (i = lhs + 1; i <= rhs; i++) {
            if (Kstrcmp(jindex[i], jindex[lhs]) < 0) {
                /* Swap(++last,i);*/
                last++;
                temp = jindex[i];
                jindex[i] = jindex[last];
                jindex[last] = temp;
            }
        }

        /* Swap (lhs,last); the pivot is now in its final position. */
        temp = jindex[lhs];
        jindex[lhs] = jindex[last];
        jindex[last] = temp;

        if (last - lhs < rhs - last) {
            jqsort(lhs, last - 1);
            lhs = last + 1;
        } else {
            jqsort(last + 1, rhs);
            rhs = last - 1;
        }
    }
}
/*=====string comparison used by jqsort==========================*/
/*
 * Compare the dictionary text at db[lhs] with the text at db[rhs].
 *
 * At most 20 bytes are examined.  Before each pair of bytes is compared:
 *   - at even offsets, 0xA5 is replaced by 0xA4, which is how katakana and
 *     hiragana (EUC A4 & A5) are made to match;
 *   - 'A'-'Z' are folded to lower case.
 *
 * Returns the difference of the first pair of bytes that differ after
 * folding (negative when lhs sorts first), or 0 when all 20 pairs agree.
 */
int Kstrcmp(uint32_t lhs, uint32_t rhs)
{
    int pos;

    for (pos = 0; pos < 20; pos++) {
        int left = db[lhs + pos];
        int right = db[rhs + pos];

        if (!(pos & 1)) {
            left = (left == 0xA5) ? 0xA4 : left;
            right = (right == 0xA5) ? 0xA4 : right;
        }
        if (left >= 'A' && left <= 'Z') {
            left += 'a' - 'A';
        }
        if (right >= 'A' && right <= 'Z') {
            right += 'a' - 'A';
        }
        if (left != right) {
            return left - right;
        }
    }
    return 0;
}
/*=====xjdicrc - access and analyze "xjdicrc" file (if any)==============*/
/*
void xjdicrc()
{
unsigned char xjdicdir[PATH_MAX],rcstr[80],*rcwd;
int iex;
FILE *fm,*fopen();
iex = 0;
xjdicdir[0] = '\0';
dicenv = (unsigned char *)getenv("XJDIC");
if (!dicenv) dicenv = (unsigned char *)DEFAULT_DICDIR;
if (strlen(dicenv) <= 2)
{
dicenv = (unsigned char *)getcwd(ENVname,sizeof(ENVname));
if (dicenv == NULL)
{
printf("Cannot extract working directory!\n");
exit(1);
}
}
else
{
strncpy(ENVname,dicenv,sizeof(ENVname));
}
ENVname[sizeof(ENVname)-1] = '\0';
xjdicdir[sizeof(xjdicdir)-1] = '\0';
if (strlen(ENVname) > 2)
{
strncpy(xjdicdir,ENVname, sizeof(xjdicdir)-1);
strncat(xjdicdir,"/", sizeof(xjdicdir)-1-strlen(xjdicdir));
}
else
{
strncpy(xjdicdir,(unsigned char *)getenv("HOME"), sizeof(xjdicdir)-1);
strncat(xjdicdir,"/", sizeof(xjdicdir)-1-strlen(xjdicdir));
}
strncat(xjdicdir, ctl_file, sizeof(xjdicdir)-1-strlen(xjdicdir));
fm = fopen(xjdicdir,"r");
if (fm == NULL)
{
// Weird code --waba
strncat(xjdicdir, ctl_file, sizeof(xjdicdir)-1-strlen(xjdicdir));
fm = fopen(xjdicdir,"r");
}
if (fm != NULL)
{
while(fgets(rcstr,79,fm) != NULL)
{
rcwd = (unsigned char *)strtok(rcstr," \t");
if( stringcomp((unsigned char *)"exlist",rcwd) == 0)
{
while (TRUE)
{
rcwd = (unsigned char *)strtok(NULL," \t\f\r\n");
if (rcwd == NULL) break;
strncpy(exlist[iex],rcwd, sizeof(exlist[iex]));
exlist[iex][sizeof(exlist[iex])-1] = '\0';
exlens[iex] = strlen(rcwd);
if (iex < EXLIM) iex++;
}
excount = iex-1;
continue;
}
}
}
if (fm == NULL)
{
printf("No control file detected!\n");
return;
}
else
{
fclose(fm);
return;
}
}
*/
/*=======function to test a character for alpha or kana/kanji====*/
/*
 * Classify one byte for the dictionary scanner.
 *
 * Returns TRUE for an ASCII letter, an ASCII digit, or any byte with the
 * high bit set (the test this file uses for kana/kanji bytes); returns
 * FALSE for everything else.
 */
int alphaoreuc(unsigned char x)
{
    const int is_upper = (x >= 'A') && (x <= 'Z');
    const int is_lower = (x >= 'a') && (x <= 'z');
    const int is_digit = (x >= '0') && (x <= '9');
    const int is_high = (x & 0x80) != 0;

    return (is_upper || is_lower || is_digit || is_high) ? TRUE : FALSE;
}