You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
tdegraphics/kviewshell/plugins/djvu/libdjvu/GUnicode.cpp

791 lines
20 KiB

//C- -*- C++ -*-
//C- -------------------------------------------------------------------
//C- DjVuLibre-3.5
//C- Copyright (c) 2002 Leon Bottou and Yann Le Cun.
//C- Copyright (c) 2001 AT&T
//C-
//C- This software is subject to, and may be distributed under, the
//C- GNU General Public License, Version 2. The license should have
//C- accompanied the software or you may obtain a copy of the license
//C- from the Free Software Foundation at http://www.fsf.org .
//C-
//C- This program is distributed in the hope that it will be useful,
//C- but WITHOUT ANY WARRANTY; without even the implied warranty of
//C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//C- GNU General Public License for more details.
//C-
//C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library
//C- distributed by Lizardtech Software. On July 19th 2002, Lizardtech
//C- Software authorized us to replace the original DjVu(r) Reference
//C- Library notice by the following text (see doc/lizard2002.djvu):
//C-
//C- ------------------------------------------------------------------
//C- | DjVu (r) Reference Library (v. 3.5)
//C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved.
//C- | The DjVu Reference Library is protected by U.S. Pat. No.
//C- | 6,058,214 and patents pending.
//C- |
//C- | This software is subject to, and may be distributed under, the
//C- | GNU General Public License, Version 2. The license should have
//C- | accompanied the software or you may obtain a copy of the license
//C- | from the Free Software Foundation at http://www.fsf.org .
//C- |
//C- | The computer code originally released by LizardTech under this
//C- | license and unmodified by other parties is deemed "the LIZARDTECH
//C- | ORIGINAL CODE." Subject to any third party intellectual property
//C- | claims, LizardTech grants recipient a worldwide, royalty-free,
//C- | non-exclusive license to make, use, sell, or otherwise dispose of
//C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the
//C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU
//C- | General Public License. This grant only confers the right to
//C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to
//C- | the extent such infringement is reasonably necessary to enable
//C- | recipient to make, have made, practice, sell, or otherwise dispose
//C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to
//C- | any greater extent that may be necessary to utilize further
//C- | modifications or combinations.
//C- |
//C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY
//C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
//C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF
//C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
//C- +------------------------------------------------------------------
//
// $Id: GUnicode.cpp,v 1.11 2003/11/07 22:08:21 leonb Exp $
// $Name: release_3_5_15 $
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#if NEED_GNUG_PRAGMAS
# pragma implementation
#endif
#include "GString.h"
#if HAS_ICONV
#include <iconv.h>
#endif
#ifdef HAVE_NAMESPACES
namespace DJVU {
# ifdef NOT_DEFINED // Just to fool emacs c++ mode
}
#endif
#endif
static unsigned char nill=0;
static void const *
checkmarks(void const * const xbuf,
unsigned int &bufsize,
GStringRep::EncodeType &rep)
{
unsigned char const *buf=(unsigned char const *)xbuf;
if(bufsize >= 2 || (xbuf && !bufsize && rep != GStringRep::XOTHER))
{
const unsigned int s=(((unsigned int)buf[0])<<8)+(unsigned int)buf[1];
switch(s)
{
case 0:
if((bufsize>=4)||(!bufsize && rep == GStringRep::XUCS4BE)
||(!bufsize && rep == GStringRep::XUCS4_2143))
{
const unsigned int s=(((unsigned int)buf[2])<<8)+(unsigned int)buf[3];
if(s == 0xfeff)
{
rep=GStringRep::XUCS4BE;
buf+=4;
}else if(s == 0xfffe)
{
rep=GStringRep::XUCS4_2143;
buf+=4;
}
}
break;
case 0xfffe:
if(((bufsize>=4)||(!bufsize && rep == GStringRep::XUCS4LE))
&& !((unsigned char *)buf)[2] && !((unsigned char *)buf)[3])
{
rep=GStringRep::XUCS4LE;
buf+=4;
}else
{
rep=GStringRep::XUTF16LE;
buf+=2;
}
break;
case 0xfeff:
if(((bufsize>=4)||(!bufsize && rep == GStringRep::XUCS4_3412))
&& !((unsigned char *)buf)[2] && !((unsigned char *)buf)[3])
{
rep=GStringRep::XUCS4_3412;
buf+=4;
}else
{
rep=GStringRep::XUTF16LE;
buf+=2;
}
break;
case 0xefbb:
if(((bufsize>=3)||(!bufsize && GStringRep::XUTF8 == rep))&&(buf[2] == 0xbf))
{
rep=GStringRep::XUTF8;
buf+=3;
}
break;
default:
break;
}
}
if(buf != xbuf)
{
if(bufsize)
{
const size_t s=(size_t)xbuf-(size_t)buf;
if(bufsize> s)
{
bufsize-=s;
}else
{
bufsize=0;
buf=(const unsigned char *)&nill;
}
}
}
return buf;
}
class GStringRep::Unicode : public GStringRep::UTF8
{
public:
GP<GStringRep> encoding;
EncodeType encodetype;
void *remainder;
GPBufferBase gremainder;
public:
Unicode(void);
/// virtual destructor.
virtual ~Unicode();
static GP<GStringRep> create(const unsigned int sz);
static GP<GStringRep> create(void const * const buf, unsigned int bufsize,
const EncodeType, const GP<GStringRep> &encoding);
static GP<GStringRep> create( void const * const buf,
unsigned int size, const EncodeType encodetype );
static GP<GStringRep> create( void const * const buf,
const unsigned int size, GP<GStringRep> encoding );
static GP<GStringRep> create( void const * const buf,
const unsigned int size, const GP<Unicode> &remainder );
protected:
virtual void set_remainder( void const * const buf, const unsigned int size,
const EncodeType encodetype );
virtual void set_remainder( void const * const buf, const unsigned int size,
const GP<GStringRep> &encoding );
virtual void set_remainder( const GP<Unicode> &remainder );
virtual GP<Unicode> get_remainder(void) const;
};
// static unsigned long UTF8toUCS4(unsigned char const *&,void const * const);
static unsigned long xUTF16toUCS4(unsigned short const *&s,void const * const);
static unsigned long UTF16BEtoUCS4(unsigned char const *&s,void const * const);
static unsigned long UTF16LEtoUCS4(unsigned char const *&s,void const * const);
static unsigned long UCS4BEtoUCS4(unsigned char const *&s,void const * const);
static unsigned long UCS4LEtoUCS4(unsigned char const *&s,void const * const);
static unsigned long UCS4_3412toUCS4(unsigned char const *&s,void const * const);
static unsigned long UCS4_2143toUCS4(unsigned char const *&s,void const * const);
GP<GStringRep>
GStringRep::Unicode::create(const unsigned int sz)
{
GP<GStringRep> gaddr;
if (sz > 0)
{
GStringRep *addr;
gaddr=(addr=new GStringRep::Unicode);
addr->data=(char *)(::operator new(sz+1));
addr->size = sz;
addr->data[sz] = 0;
}
return gaddr;
}
GStringRep::Unicode::Unicode(void)
: encodetype(XUTF8), gremainder(remainder,0,1) {}
GStringRep::Unicode::~Unicode() {}
GP<GStringRep>
GStringRep::Unicode::create(
void const * const xbuf,
unsigned int bufsize,
const EncodeType t,
const GP<GStringRep> &encoding)
{
return (encoding->size)
?create(xbuf,bufsize,encoding)
:create(xbuf,bufsize,t);
}
GP<GStringRep>
GStringRep::Unicode::create(
void const * const xbuf,
const unsigned int bufsize,
const GP<Unicode> &xremainder )
{
Unicode *r=xremainder;
GP<GStringRep> retval;
if(r)
{
const int s=r->gremainder;
if(xbuf && bufsize)
{
if(s)
{
void *buf;
GPBufferBase gbuf(buf,s+bufsize,1);
memcpy(buf,r->remainder,s);
memcpy((void *)((size_t)buf+s),xbuf,bufsize);
retval=((r->encoding)
?create(buf,s+bufsize,r->encoding)
:create(buf,s+bufsize,r->encodetype));
}else
{
retval=((r->encoding)
?create(xbuf,bufsize,r->encoding)
:create(xbuf,bufsize,r->encodetype));
}
}else if(s)
{
void *buf;
GPBufferBase gbuf(buf,s,1);
memcpy(buf,r->remainder,s);
retval=((r->encoding)
?create(buf,s,r->encoding)
:create(buf,s,r->encodetype));
}else
{
retval=((r->encoding)
?create(0,0,r->encoding)
:create(0,0,r->encodetype));
}
}else
{
retval=create(xbuf,bufsize,XUTF8);
}
return retval;
}
#if HAS_ICONV
/* This template works around incompatible iconv protoypes */
template<typename _T> inline size_t
iconv_adaptor(size_t(*iconv_func)(iconv_t, _T, size_t *, char**, size_t*),
iconv_t cd, char **inbuf, size_t *inbytesleft,
char **outbuf, size_t *outbytesleft)
{
return iconv_func (cd, (_T)inbuf, inbytesleft, outbuf, outbytesleft);
}
#endif
GP<GStringRep>
GStringRep::Unicode::create(
void const * const xbuf,
unsigned int bufsize,
GP<GStringRep> encoding)
{
GP<GStringRep> retval;
GStringRep *e=encoding;
if(e)
{
e=(encoding=e->upcase());
}
if(!e || !e->size)
{
retval=create(xbuf,bufsize,XOTHER);
}else if(!e->cmp("UTF8") || !e->cmp("UTF-8"))
{
retval=create(xbuf,bufsize,XUTF8);
}else if(!e->cmp("UTF16")|| !e->cmp("UTF-16")
|| !e->cmp("UCS2") || !e->cmp("UCS2"))
{
retval=create(xbuf,bufsize,XUTF16);
}else if(!e->cmp("UCS4") || !e->cmp("UCS-4"))
{
retval=create(xbuf,bufsize,XUCS4);
}else
{
#if HAS_ICONV
EncodeType t=XOTHER;
void const * const buf=checkmarks(xbuf,bufsize,t);
if(t != XOTHER)
{
retval=create(xbuf,bufsize,t);
}else if(buf && bufsize)
{
unsigned char const *eptr=(unsigned char *)buf;
unsigned int j=0;
for(j=0;(j<bufsize)&&*eptr;j++,eptr++)
EMPTY_LOOP;
if (j)
{
unsigned char const *ptr=(unsigned char *)buf;
if(e)
{
iconv_t cv=iconv_open("UTF-8",(const char *)e);
if(cv == (iconv_t)(-1))
{
const int i=e->search('-');
if(i>=0)
{
cv=iconv_open("UTF-8",e->data+i+1);
}
}
if(cv == (iconv_t)(-1))
{
retval=create(0,0,XOTHER);
}else
{
size_t ptrleft=(eptr-ptr);
char *utf8buf;
size_t pleft=6*ptrleft+1;
GPBuffer<char> gutf8buf(utf8buf,pleft);
char *p=utf8buf;
unsigned char const *last=ptr;
for(;iconv_adaptor(iconv, cv, (char**)&ptr, &ptrleft, &p, &pleft);last=ptr)
EMPTY_LOOP;
iconv_close(cv);
retval=create(utf8buf,(size_t)last-(size_t)buf,t);
retval->set_remainder(last,(size_t)eptr-(size_t)last,e);
}
}
}else
{
retval=create(0,0,XOTHER);
retval->set_remainder(0,0,e);
}
}
#else
retval=create(xbuf,bufsize,XOTHER);
#endif
}
return retval;
}
GP<GStringRep>
GStringRep::Unicode::create(
void const * const xbuf,
unsigned int bufsize,
EncodeType t)
{
GP<GStringRep> gretval;
GStringRep *retval=0;
void const * const buf=checkmarks(xbuf,bufsize,t);
if(buf && bufsize)
{
unsigned char const *eptr=(unsigned char *)buf;
unsigned int maxutf8size=0;
void const* const xeptr=(void const *)((size_t)eptr+bufsize);
switch(t)
{
case XUCS4:
case XUCS4BE:
case XUCS4LE:
case XUCS4_2143:
case XUCS4_3412:
{
for(unsigned long w;
(eptr<xeptr)&&(w=*(unsigned long const *)eptr);
eptr+=sizeof(unsigned long))
{
maxutf8size+=(w>0x7f)?6:1;
}
break;
}
case XUTF16:
case XUTF16BE:
case XUTF16LE:
{
for(unsigned short w;
(eptr<xeptr)&&(w=*(unsigned short const *)eptr);
eptr+=sizeof(unsigned short))
{
maxutf8size+=3;
}
break;
}
case XUTF8:
for(;(eptr<xeptr)&&*eptr;maxutf8size++,eptr++)
EMPTY_LOOP;
break;
case XEBCDIC:
for(;(eptr<xeptr)&&*eptr;eptr++)
{
maxutf8size+=(*eptr>0x7f)?2:1;
}
break;
default:
break;
}
unsigned char *utf8buf=0;
GPBuffer<unsigned char> gutf8buf(utf8buf,maxutf8size+1);
utf8buf[0]=0;
if (maxutf8size)
{
unsigned char *optr=utf8buf;
int len=0;
unsigned char const *iptr=(unsigned char *)buf;
unsigned long w;
switch(t)
{
case XUCS4:
for(;
(iptr<eptr)&&(w=*(unsigned long const *)iptr);
len++,iptr+=sizeof(unsigned long const))
{
optr=UCS4toUTF8(w,optr);
}
break;
case XUCS4BE:
for(;(w=UCS4BEtoUCS4(iptr,eptr));len++)
{
optr=UCS4toUTF8(w,optr);
}
break;
case XUCS4LE:
for(;(w=UCS4LEtoUCS4(iptr,eptr));len++)
{
optr=UCS4toUTF8(w,optr);
}
break;
case XUCS4_2143:
for(;(w=UCS4_2143toUCS4(iptr,eptr));len++)
{
optr=UCS4toUTF8(w,optr);
}
break;
case XUCS4_3412:
for(;(w=UCS4_3412toUCS4(iptr,eptr));len++)
{
optr=UCS4toUTF8(w,optr);
}
break;
case XUTF16:
for(;
(w=xUTF16toUCS4((unsigned short const*&)iptr,eptr));
len++)
{
optr=UCS4toUTF8(w,optr);
}
break;
case XUTF16BE:
for(;(w=UTF16BEtoUCS4(iptr,eptr));len++)
{
optr=UCS4toUTF8(w,optr);
}
break;
case XUTF16LE:
for(;(w=UTF16LEtoUCS4(iptr,eptr));len++)
{
optr=UCS4toUTF8(w,optr);
}
break;
case XUTF8:
for(;(w=UTF8toUCS4(iptr,eptr));len++)
{
optr=UCS4toUTF8(w,optr);
}
break;
case XEBCDIC:
for(;(iptr<eptr)&&(w=*iptr++);len++)
{
optr=UCS4toUTF8(w,optr);
}
break;
default:
break;
}
const unsigned int size=(size_t)optr-(size_t)utf8buf;
if(size)
{
retval=(gretval=GStringRep::Unicode::create(size));
memcpy(retval->data,utf8buf,size);
}else
{
retval=(gretval=GStringRep::Unicode::create(1));
retval->size=size;
}
retval->data[size]=0;
gutf8buf.resize(0);
const size_t s=(size_t)eptr-(size_t)iptr;
retval->set_remainder(iptr,s,t);
}
}
if(!retval)
{
retval=(gretval=GStringRep::Unicode::create(1));
retval->data[0]=0;
retval->size=0;
retval->set_remainder(0,0,t);
}
return gretval;
}
static unsigned long
xUTF16toUCS4(unsigned short const *&s,void const * const eptr)
{
unsigned long U=0;
unsigned short const * const r=s+1;
if(r <= eptr)
{
unsigned long const W1=s[0];
if((W1<0xD800)||(W1>0xDFFF))
{
if((U=W1))
{
s=r;
}
}else if(W1<=0xDBFF)
{
unsigned short const * const rr=r+1;
if(rr <= eptr)
{
unsigned long const W2=s[1];
if(((W2>=0xDC00)||(W2<=0xDFFF))&&((U=(0x1000+((W1&0x3ff)<<10))|(W2&0x3ff))))
{
s=rr;
}else
{
U=(unsigned int)(-1)-W1;
s=r;
}
}
}
}
return U;
}
static unsigned long
UTF16BEtoUCS4(unsigned char const *&s,void const * const eptr)
{
unsigned long U=0;
unsigned char const * const r=s+2;
if(r <= eptr)
{
unsigned long const C1MSB=s[0];
if((C1MSB<0xD8)||(C1MSB>0xDF))
{
if((U=((C1MSB<<8)|((unsigned long)s[1]))))
{
s=r;
}
}else if(C1MSB<=0xDB)
{
unsigned char const * const rr=r+2;
if(rr <= eptr)
{
unsigned long const C2MSB=s[2];
if((C2MSB>=0xDC)||(C2MSB<=0xDF))
{
U=0x10000+((unsigned long)s[1]<<10)+(unsigned long)s[3]
+(((C1MSB<<18)|(C2MSB<<8))&0xc0300);
s=rr;
}else
{
U=(unsigned int)(-1)-((C1MSB<<8)|((unsigned long)s[1]));
s=r;
}
}
}
}
return U;
}
static unsigned long
UTF16LEtoUCS4(unsigned char const *&s,void const * const eptr)
{
unsigned long U=0;
unsigned char const * const r=s+2;
if(r <= eptr)
{
unsigned long const C1MSB=s[1];
if((C1MSB<0xD8)||(C1MSB>0xDF))
{
if((U=((C1MSB<<8)|((unsigned long)s[0]))))
{
s=r;
}
}else if(C1MSB<=0xDB)
{
unsigned char const * const rr=r+2;
if(rr <= eptr)
{
unsigned long const C2MSB=s[3];
if((C2MSB>=0xDC)||(C2MSB<=0xDF))
{
U=0x10000+((unsigned long)s[0]<<10)+(unsigned long)s[2]
+(((C1MSB<<18)|(C2MSB<<8))&0xc0300);
s=rr;
}else
{
U=(unsigned int)(-1)-((C1MSB<<8)|((unsigned long)s[1]));
s=r;
}
}
}
}
return U;
}
static unsigned long
UCS4BEtoUCS4(unsigned char const *&s,void const * const eptr)
{
unsigned long U=0;
unsigned char const * const r=s+4;
if(r<=eptr)
{
U=(((((((unsigned long)s[0]<<8)|(unsigned long)s[1])<<8)|(unsigned long)s[2])<<8)|(unsigned long)s[3]);
if(U)
{
s=r;
}
}
return U;
}
static unsigned long
UCS4LEtoUCS4(unsigned char const *&s,void const * const eptr)
{
unsigned long U=0;
unsigned char const * const r=s+4;
if(r<=eptr)
{
U=(((((((unsigned long)s[3]<<8)|(unsigned long)s[2])<<8)|(unsigned long)s[1])<<8)|(unsigned long)s[0]);
if(U)
{
s=r;
}
}
return U;
}
static unsigned long
UCS4_2143toUCS4(unsigned char const *&s,void const * const eptr)
{
unsigned long U=0;
unsigned char const * const r=s+4;
if(r<=eptr)
{
U=(((((((unsigned long)s[1]<<8)|(unsigned long)s[0])<<8)|(unsigned long)s[3])<<8)|(unsigned long)s[2]);
if(U)
{
s=r;
}
}
return U;
}
static unsigned long
UCS4_3412toUCS4(unsigned char const *&s,void const * const eptr)
{
unsigned long U=0;
unsigned char const * const r=s+4;
if(r<=eptr)
{
U=(((((((unsigned long)s[2]<<8)|(unsigned long)s[3])<<8)|(unsigned long)s[0])<<8)|(unsigned long)s[1]);
if(U)
{
s=r;
}
}
return U;
}
void
GStringRep::Unicode::set_remainder( void const * const buf,
const unsigned int size, const EncodeType xencodetype )
{
gremainder.resize(size,1);
if(size)
memcpy(remainder,buf,size);
encodetype=xencodetype;
encoding=0;
}
void
GStringRep::Unicode::set_remainder( void const * const buf,
const unsigned int size, const GP<GStringRep> &xencoding )
{
gremainder.resize(size,1);
if(size)
memcpy(remainder,buf,size);
encoding=xencoding;
encodetype=XOTHER;
}
void
GStringRep::Unicode::set_remainder( const GP<GStringRep::Unicode> &xremainder )
{
if(xremainder)
{
const int size=xremainder->gremainder;
gremainder.resize(size,1);
if(size)
memcpy(remainder,xremainder->remainder,size);
encodetype=xremainder->encodetype;
}else
{
gremainder.resize(0,1);
encodetype=XUTF8;
}
}
GP<GStringRep::Unicode>
GStringRep::Unicode::get_remainder( void ) const
{
return const_cast<GStringRep::Unicode *>(this);
}
GUTF8String
GUTF8String::create(void const * const buf,const unsigned int size,
const EncodeType encodetype, const GUTF8String &encoding)
{
return encoding.length()
?create(buf,size,encodetype)
:create(buf,size,encoding);
}
GUTF8String
GUTF8String::create( void const * const buf,
unsigned int size, const EncodeType encodetype )
{
GUTF8String retval;
retval.init(GStringRep::Unicode::create(buf,size,encodetype));
return retval;
}
GUTF8String
GUTF8String::create( void const * const buf,
const unsigned int size, const GP<GStringRep::Unicode> &remainder)
{
GUTF8String retval;
retval.init(GStringRep::Unicode::create(buf,size,remainder));
return retval;
}
GUTF8String
GUTF8String::create( void const * const buf,
const unsigned int size, const GUTF8String &encoding )
{
GUTF8String retval;
retval.init(GStringRep::Unicode::create(buf,size,encoding ));
return retval;
}
#ifdef HAVE_NAMESPACES
}
# ifndef NOT_USING_DJVU_NAMESPACE
using namespace DJVU;
# endif
#endif