You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
276 lines
11 KiB
276 lines
11 KiB
//C- -*- C++ -*-
|
|
//C- -------------------------------------------------------------------
|
|
//C- DjVuLibre-3.5
|
|
//C- Copyright (c) 2002 Leon Bottou and Yann Le Cun.
|
|
//C- Copyright (c) 2001 AT&T
|
|
//C-
|
|
//C- This software is subject to, and may be distributed under, the
|
|
//C- GNU General Public License, Version 2. The license should have
|
|
//C- accompanied the software or you may obtain a copy of the license
|
|
//C- from the Free Software Foundation at http://www.fsf.org .
|
|
//C-
|
|
//C- This program is distributed in the hope that it will be useful,
|
|
//C- but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
//C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
//C- GNU General Public License for more details.
|
|
//C-
|
|
//C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library
|
|
//C- distributed by Lizardtech Software. On July 19th 2002, Lizardtech
|
|
//C- Software authorized us to replace the original DjVu(r) Reference
|
|
//C- Library notice by the following text (see doc/lizard2002.djvu):
|
|
//C-
|
|
//C- ------------------------------------------------------------------
|
|
//C- | DjVu (r) Reference Library (v. 3.5)
|
|
//C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved.
|
|
//C- | The DjVu Reference Library is protected by U.S. Pat. No.
|
|
//C- | 6,058,214 and patents pending.
|
|
//C- |
|
|
//C- | This software is subject to, and may be distributed under, the
|
|
//C- | GNU General Public License, Version 2. The license should have
|
|
//C- | accompanied the software or you may obtain a copy of the license
|
|
//C- | from the Free Software Foundation at http://www.fsf.org .
|
|
//C- |
|
|
//C- | The computer code originally released by LizardTech under this
|
|
//C- | license and unmodified by other parties is deemed "the LIZARDTECH
|
|
//C- | ORIGINAL CODE." Subject to any third party intellectual property
|
|
//C- | claims, LizardTech grants recipient a worldwide, royalty-free,
|
|
//C- | non-exclusive license to make, use, sell, or otherwise dispose of
|
|
//C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the
|
|
//C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU
|
|
//C- | General Public License. This grant only confers the right to
|
|
//C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to
|
|
//C- | the extent such infringement is reasonably necessary to enable
|
|
//C- | recipient to make, have made, practice, sell, or otherwise dispose
|
|
//C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to
|
|
//C- | any greater extent that may be necessary to utilize further
|
|
//C- | modifications or combinations.
|
|
//C- |
|
|
//C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY
|
|
//C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
|
//C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF
|
|
//C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
|
|
//C- +------------------------------------------------------------------
|
|
//
|
|
// $Id: BSByteStream.h,v 1.8 2003/11/07 22:08:20 leonb Exp $
|
|
// $Name: release_3_5_15 $
|
|
|
|
#ifndef _BSBYTESTREAM_H
|
|
#define _BSBYTESTREAM_H
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "config.h"
|
|
#endif
|
|
#if NEED_GNUG_PRAGMAS
|
|
# pragma interface
|
|
#endif
|
|
|
|
/** @name BSByteStream.h
|
|
|
|
Files #"BSByteStream.h"# and #"BSByteStream.cpp"# implement a very compact
|
|
general purpose compressor based on the Burrows-Wheeler transform. The
|
|
utility program \Ref{bzz} provides a front-end for this class. Although
|
|
this compression model is not currently used in DjVu files, it may be used
|
|
in the future for encoding textual data chunks.
|
|
|
|
{\bf Algorithms} --- The Burrows-Wheeler transform (also named Block-Sorting)
|
|
is performed using a combination of the Karp-Miller-Rosenberg and the
|
|
Bentley-Sedgewick algorithms. This is comparable to (Sadakane, DCC 98)
|
|
with a slightly more flexible ranking scheme. Symbols are then ordered
|
|
according to a running estimate of their occurrence frequencies. The
|
|
symbol ranks are then coded using a simple fixed tree and the
|
|
\Ref{ZPCodec} binary adaptive coder.
|
|
|
|
{\bf Performances} --- The basic algorithm is mostly similar to those
|
|
implemented in well known compressors like #bzip# or #bzip2#
|
|
(\URL{http://www.muraroa.demon.co.uk}). The adaptive binary coder however
|
|
generates small differences. The adaptation noise may cost up to 5\% in
|
|
file size, but this penalty is usually offset by the benefits of
|
|
adaptation. This is good when processing large and highly structured
|
|
files like spreadsheet files. Compression and decompression speed is
|
|
about twice slower than #bzip2# but the sorting algorithms is more
|
|
robust. Unlike #bzip2# (as of August 1998), this code can compress half a
|
|
megabyte of "abababab...." in bounded time.
|
|
|
|
Here are some comparative results (in bits per character) obtained on the
|
|
Canterbury Corpus (\URL{http://corpus.canterbury.ac.nz}) as of August
|
|
1998. The BSByteStream performance on the single spreadsheet file #Excl#
|
|
moves #bzz#'s weighted average ahead of much more sophisticated methods,
|
|
like Suzanne Bunton's #fsmxBest# system
|
|
\URL{http://corpus.canterbury.ac.nz/methodinfo/fsmx.html}. This result
|
|
will not last very long.
|
|
|
|
{\footnotesize
|
|
\begin{tabular}{lccccccccccccc}
|
|
& text & fax & Csrc & Excl & SPRC & tech
|
|
& poem & html & lisp & man & play & Weighted & Average \\
|
|
compress
|
|
& 3.27 & 0.97 & 3.56 & 2.41 & 4.21 & 3.06
|
|
& 3.38 & 3.68 & 3.90 & 4.43 & 3.51
|
|
& 2.55 & 3.31 \\
|
|
gzip -9
|
|
& 2.85 & 0.82 & 2.24 & 1.63 & 2.67 & 2.71
|
|
& 3.23 & 2.59 & 2.65 & 3.31 & 3.12
|
|
& 2.08 & 2.53 \\
|
|
bzip2 -9
|
|
& 2.27 & 0.78 & 2.18 & 1.01 & 2.70 & 2.02
|
|
& 2.42 & 2.48 & 2.79 & 3.33 & 2.53
|
|
& 1.54 & 2.23 \\
|
|
ppmd
|
|
& 2.31 & 0.99 & 2.11 & 1.08 & 2.68 & 2.19
|
|
& 2.48 & 2.38 & 2.43 & 3.00 & 2.53
|
|
& 1.65 & 2.20 \\
|
|
fsmx
|
|
& {\bf 2.10} & 0.79 & {\bf 1.89} & 1.48 & {\bf 2.52} & {\bf 1.84}
|
|
& {\bf 2.21} & {\bf 2.24} & {\bf 2.29} & {\bf 2.91} & {\bf 2.35}
|
|
& 1.63 & {\bf 2.06} \\
|
|
{\bf bzz}
|
|
& 2.25 & {\bf 0.76} & 2.13 & {\bf 0.78} & 2.67 & 2.00
|
|
& 2.40 & 2.52 & 2.60 & 3.19 & 2.52
|
|
& {\bf 1.44} & 2.16
|
|
\end{tabular}
|
|
}
|
|
|
|
Note that the DjVu people have several entries in this table. Program
|
|
#compress# was written some time ago by Joe Orost
|
|
(\URL{http://www.research.att.com/info/orost}). The #ppmc# method, (a
|
|
precursor of #ppmd#) was created by Paul Howard
|
|
(\URL{http://www.research.att.com/info/pgh}). The #bzz# program is just
|
|
below your eyes.
|
|
|
|
@author
|
|
L\'eon Bottou <leonb@research.att.com> -- Initial implementation\\
|
|
Andrei Erofeev <eaf@geocities.com> -- Improved Block Sorting algorithm.
|
|
@memo
|
|
Simple Burrows-Wheeler general purpose compressor.
|
|
@version
|
|
#$Id: BSByteStream.h,v 1.8 2003/11/07 22:08:20 leonb Exp $# */
|
|
//@{
|
|
|
|
|
|
#include "ByteStream.h"
|
|
#include "GException.h"
|
|
#include "ZPCodec.h"
|
|
|
|
|
|
#ifdef HAVE_NAMESPACES
|
|
namespace DJVU {
|
|
# ifdef NOT_DEFINED // Just to fool emacs c++ mode
|
|
}
|
|
#endif
|
|
#endif
|
|
|
|
|
|
/** Performs bzz compression/decompression.
|
|
|
|
Class #BSByteStream# defines a \Ref{ByteStream} which transparently
|
|
performs the BZZ compression/decompression. The constructor of class
|
|
\Ref{BSByteStream} takes another \Ref{ByteStream} as argument. Any data
|
|
written to the BSByteStream is compressed and written to this second
|
|
ByteStream. Any data read from the BSByteStream is internally generated by
|
|
decompressing data read from the second ByteStream.
|
|
|
|
Program \Ref{bzz} demonstrates how to use this class. All the hard work
|
|
is achieved by a simple ByteStream to ByteStream copy, as shown below.
|
|
\begin{verbatim}
|
|
GP<ByteStream> in=ByteStream::create(infile,"rb");
|
|
GP<ByteStream> out=ByteStream::create(outfile,"wb");
|
|
if (encoding) {
|
|
BSByteStream bsb(out, blocksize);
|
|
bsb.copy(*in);
|
|
} else {
|
|
BSByteStream bsb(in);
|
|
out->copy(bsb);
|
|
}
|
|
\end{verbatim}
|
|
Due to the block oriented nature of the Burrows-Wheeler transform, there
|
|
is a very significant latency between the data input and the data output.
|
|
You can use function #flush# to force data output at the expense of
|
|
compression efficiency.
|
|
|
|
You should never directly access a ByteStream object connected to a valid
|
|
BSByteStream object. The ByteStream object can be accessed again after the
|
|
destruction of the BSByteStream object. Note that the encoder always
|
|
flushes its internal buffers and writes a few final code bytes when the
|
|
BSByteStream object is destroyed. Note also that the decoder often reads
|
|
a few bytes beyond the last code byte written by the encoder. This lag
|
|
means that you must reposition the ByteStream after the destruction of the
|
|
BSByteStream object and before re-using the ByteStream object (see
|
|
\Ref{IFFByteStream}.)
|
|
*/
|
|
class BSByteStream : public ByteStream
|
|
{
|
|
public:
|
|
// Limits on block sizes
|
|
enum { MINBLOCK=10, MAXBLOCK=4096 };
|
|
|
|
// Sorting tresholds
|
|
enum { FREQMAX=4, CTXIDS=3 };
|
|
|
|
class Decode;
|
|
class Encode;
|
|
protected:
|
|
BSByteStream(GP<ByteStream> bs);
|
|
|
|
public:
|
|
/** Creates a BSByteStream.
|
|
The BSByteStream will be used for decompressing data.
|
|
\begin{description}
|
|
\item[Decompression]
|
|
The BSByteStream is created and the decompressor initializes. Chunks of
|
|
data will be read from ByteStream #bs# and decompressed into an internal
|
|
buffer. Function #read# can be used to access the decompressed data.
|
|
\end{description} */
|
|
static GP<ByteStream> create(GP<ByteStream> bs);
|
|
|
|
/** Constructs a BSByteStream.
|
|
The BSByteStream will be used for compressing data.
|
|
\begin{description}
|
|
\item[Compression]
|
|
Set #blocksize# to a positive number smaller than 4096 to
|
|
initialize the compressor. Data written to the BSByteStream will be
|
|
accumulated into an internal buffer. The buffered data will be
|
|
compressed and written to ByteStream #bs# whenever the buffer sizes
|
|
reaches the maximum value specified by argument #blocksize# (in
|
|
kilobytes). Using a larger block size usually increases the compression
|
|
ratio at the expense of computation time. There is no need however to
|
|
specify a block size larger than the total number of bytes to compress.
|
|
Setting #blocksize# to #1024# is a good starting point. A minimal block
|
|
size of 10 is silently enforced.
|
|
\end{description} */
|
|
static GP<ByteStream> create(GP<ByteStream> bs, const int blocksize);
|
|
|
|
// ByteStream Interface
|
|
~BSByteStream();
|
|
virtual long tell(void) const;
|
|
virtual void flush(void) = 0;
|
|
protected:
|
|
// Data
|
|
long offset;
|
|
int bptr;
|
|
unsigned int blocksize;
|
|
int size;
|
|
ByteStream *bs;
|
|
GP<ByteStream> gbs;
|
|
unsigned char *data;
|
|
GPBuffer<unsigned char> gdata;
|
|
// Coder
|
|
GP<ZPCodec> gzp;
|
|
BitContext ctx[300];
|
|
private:
|
|
// Cancel C++ default stuff
|
|
BSByteStream(const BSByteStream &);
|
|
BSByteStream & operator=(const BSByteStream &);
|
|
BSByteStream(ByteStream *);
|
|
BSByteStream(ByteStream *, int);
|
|
};
|
|
|
|
//@}
|
|
|
|
|
|
#ifdef HAVE_NAMESPACES
|
|
}
|
|
# ifndef NOT_USING_DJVU_NAMESPACE
|
|
using namespace DJVU;
|
|
# endif
|
|
#endif
|
|
#endif
|