You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
406 lines
9.3 KiB
406 lines
9.3 KiB
15 years ago
|
// based on: MakeDoc, version 2
|
||
|
// I only took the tBuf class from there and adapted it.
|
||
|
//
|
||
|
// Compresses text files into a format that is ready to export to a Pilot
|
||
|
// and work with Rick Bram's PilotDOC reader.
|
||
|
// Copyright (C) Reinhold Kainhofer, 2002
|
||
|
// Copyright (C) Pat Beirne, 2000
|
||
|
//
|
||
|
// Original file (makedoc9.cpp) copyright by:
|
||
|
// Copyright (C) Pat Beirne, 2000.
|
||
|
// Distributable under the GNU General Public License Version 2 or later.
|
||
|
//
|
||
|
// ver 0.6 enforce 31 char limit on database names
|
||
|
// ver 0.7 change header and record0 to structs
|
||
|
// ver 2.0 added category control on the command line
|
||
|
// changed extensions from .prc to .pdb
|
||
|
|
||
|
/*
|
||
|
** This program is free software; you can redistribute it and/or modify
|
||
|
** it under the terms of the GNU General Public License as published by
|
||
|
** the Free Software Foundation; either version 2 of the License, or
|
||
|
** (at your option) any later version.
|
||
|
**
|
||
|
** This program is distributed in the hope that it will be useful,
|
||
|
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
** GNU General Public License for more details.
|
||
|
**
|
||
|
** You should have received a copy of the GNU General Public License
|
||
|
** along with this program in a file called COPYING; if not, write to
|
||
|
** the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
||
|
** MA 02110-1301, USA.
|
||
|
*/
|
||
|
|
||
|
|
||
|
#include <stdio.h>
|
||
|
#include <stdlib.h>
|
||
|
#include <string.h>
|
||
|
|
||
|
#include <iostream>
|
||
|
|
||
|
|
||
|
#include "makedoc9.h"
|
||
|
|
||
|
|
||
|
|
||
|
//
|
||
|
// Issue()
|
||
|
//
|
||
|
// action: handle the details of writing a single
|
||
|
// character to the compressed stream
|
||
|
//
|
||
|
unsigned
|
||
|
tBuf::Issue(byte src, int &bSpace)
|
||
|
{
|
||
|
unsigned int iDest = len;
|
||
|
byte *dest = buf;
|
||
|
|
||
|
// TODO: which of the if parts should really be included???
|
||
|
#if 0
|
||
|
// modified version of issue
|
||
|
// just issue the char
|
||
|
if (src >= 0x80 || src <= 8)
|
||
|
dest[iDest++] = 1;
|
||
|
dest[iDest++] = src;
|
||
|
|
||
|
#else
|
||
|
// if there is an outstanding space char, see if
|
||
|
// we can squeeze it in with an ASCII char
|
||
|
if (bSpace)
|
||
|
{
|
||
|
if (src >= 0x40 && src <= 0x7F)
|
||
|
dest[iDest++] = src ^ 0x80;
|
||
|
else
|
||
|
{
|
||
|
// couldn't squeeze it in, so issue the space char by itself
|
||
|
// most chars go out simple, except the range 1...8,0x80...0xFF
|
||
|
dest[iDest++] = ' ';
|
||
|
if (src < 0x80 && (src == 0 || src > 8))
|
||
|
dest[iDest++] = src;
|
||
|
else
|
||
|
dest[iDest++] = 1, dest[iDest++] = src;
|
||
|
}
|
||
|
// knock down the space flag
|
||
|
bSpace = 0;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
// check for a space char
|
||
|
if (src == ' ')
|
||
|
bSpace = 1;
|
||
|
else
|
||
|
{
|
||
|
if (src < 0x80 && (src == 0 || src > 8))
|
||
|
dest[iDest++] = src;
|
||
|
else
|
||
|
dest[iDest++] = 1, dest[iDest++] = src;
|
||
|
|
||
|
}
|
||
|
}
|
||
|
#endif
|
||
|
len = iDest;
|
||
|
return iDest;
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// Compress
|
||
|
//
|
||
|
// params: none
|
||
|
//
|
||
|
// action: takes the given buffer,
|
||
|
// and compresses
|
||
|
// the original data down into a second buffer
|
||
|
//
|
||
|
// comment: This version make heavy use of walking pointers.
|
||
|
//
|
||
|
unsigned tBuf::Compress()
|
||
|
{
|
||
|
if (!buf)
|
||
|
return 0;
|
||
|
if (isCompressed) {
|
||
|
// cout<<"Buffer is already compressed!"<<endl;
|
||
|
return len;
|
||
|
// } else {
|
||
|
// cout<<" Compressing buffer!!!"<<endl;
|
||
|
}
|
||
|
|
||
|
unsigned int i;
|
||
|
|
||
|
// run through the input buffer
|
||
|
byte *pBuffer; // points to the input buffer
|
||
|
byte *pHit; // points to a walking test hit; works upwards on successive matches
|
||
|
byte *pPrevHit; // previous value of pHit; also, start of next test
|
||
|
byte *pTestHead; // current test string
|
||
|
byte *pTestTail; // current walking pointer; one past the current test buffer
|
||
|
byte *pEnd; // 1 past the end of the input buffer
|
||
|
|
||
|
pHit = pPrevHit = pTestHead = pBuffer = buf;
|
||
|
pTestTail = pTestHead + 1;
|
||
|
pEnd = buf + len; // should point to a 0!
|
||
|
|
||
|
// make a dest buffer and reassign the local buffer
|
||
|
buf = new byte[6000];
|
||
|
len = 0; // used to walk through the output buffer
|
||
|
|
||
|
// loop, absorbing one more char from the input buffer on each pass
|
||
|
for (; pTestHead != pEnd; pTestTail++)
|
||
|
{
|
||
|
// if we already have 10 char match, don't bother scanning again for the 11th (wasted time)
|
||
|
if (pTestTail - pTestHead != (1 << COUNT_BITS) + 3)
|
||
|
{
|
||
|
// scan in the previous data for a match
|
||
|
// terminate the test string (and the matcher string, as well!) in a 0
|
||
|
byte tmp = *pTestTail;
|
||
|
|
||
|
*pTestTail = 0;
|
||
|
pHit = (byte *) strstr((const char *) pPrevHit,
|
||
|
(const char *) pTestHead);
|
||
|
*pTestTail = tmp; // restore the char
|
||
|
}
|
||
|
|
||
|
// on a mismatch or end of buffer, issued codes
|
||
|
if (pHit == pTestHead
|
||
|
|| pTestTail - pTestHead > (1 << COUNT_BITS) + 2
|
||
|
|| pTestTail == pEnd)
|
||
|
{
|
||
|
// issue the codes
|
||
|
// first, check for short runs
|
||
|
if (pTestTail - pTestHead < 4)
|
||
|
{
|
||
|
if (pTestHead[0] > 0x7F || pTestHead[0] <= 8)
|
||
|
buf[len++] = 1;
|
||
|
buf[len++] = pTestHead[0];
|
||
|
pTestHead++;
|
||
|
}
|
||
|
// for longer runs, issue a run-code
|
||
|
else
|
||
|
{
|
||
|
unsigned int dist = pTestHead - pPrevHit;
|
||
|
unsigned int compound =
|
||
|
(dist << COUNT_BITS) + pTestTail - pTestHead - 4;
|
||
|
|
||
|
//if (dist>=(1<<DISP_BITS)) printf("\n!! error dist overflow");
|
||
|
//if (pTestTail-pTestHead-4>7) printf("\n!! error len overflow");
|
||
|
|
||
|
buf[len++] = 0x80 + (compound >> 8);
|
||
|
buf[len++] = compound & 0xFF;
|
||
|
//printf("\nissuing code for sequence len %d <%c%c%c>",pTestTail-pTestHead-1,pTestHead[0],pTestHead[1],pTestHead[2]);
|
||
|
//printf("\n <%x%x>",pOut[-2],pOut[-1]);
|
||
|
// and start again
|
||
|
pTestHead = pTestTail - 1;
|
||
|
}
|
||
|
// start the search again
|
||
|
pPrevHit = pBuffer;
|
||
|
// within range
|
||
|
if (pTestHead - pPrevHit > ((1 << DISP_BITS) - 1))
|
||
|
pPrevHit = pTestHead - ((1 << DISP_BITS) - 1);
|
||
|
}
|
||
|
// got a match
|
||
|
else
|
||
|
{
|
||
|
pPrevHit = pHit;
|
||
|
}
|
||
|
// when we get to the end of the buffer, don't inc past the end
|
||
|
// this forces the residue chars out one at a time
|
||
|
if (pTestTail == pEnd)
|
||
|
pTestTail--;
|
||
|
}
|
||
|
|
||
|
|
||
|
// final scan to merge consecutive high chars together
|
||
|
// and merge space chars
|
||
|
unsigned int k;
|
||
|
|
||
|
for (i = k = 0; i < len; i++, k++)
|
||
|
{
|
||
|
buf[k] = buf[i];
|
||
|
// skip the run-length codes
|
||
|
if (buf[k] >= 0x80 && buf[k] < 0xC0)
|
||
|
buf[++k] = buf[++i];
|
||
|
// if we hit a high char marker, look ahead for another
|
||
|
// and merge multiples together
|
||
|
else if (buf[k] == 1)
|
||
|
{
|
||
|
buf[k + 1] = buf[i + 1];
|
||
|
while (i + 2 < len && buf[i + 2] == 1 && buf[k] < 8)
|
||
|
{
|
||
|
buf[k]++;
|
||
|
buf[k + buf[k]] = buf[i + 3];
|
||
|
i += 2;
|
||
|
}
|
||
|
k += buf[k];
|
||
|
i++;
|
||
|
}
|
||
|
else if (buf[k] == ' ' && i < len - 1 && buf[i + 1] <= 0x7F
|
||
|
&& buf[i + 1] >= 0x40)
|
||
|
buf[k] = 0x80 | buf[++i];
|
||
|
}
|
||
|
|
||
|
// delete original buffer
|
||
|
delete[]pBuffer;
|
||
|
len = k;
|
||
|
|
||
|
isCompressed = true;
|
||
|
return k;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
Decompress
|
||
|
|
||
|
params: none
|
||
|
|
||
|
action: make a new buffer
|
||
|
run through the source data
|
||
|
check the 4 cases:
|
||
|
0,9...7F represent self
|
||
|
1...8 escape n chars
|
||
|
80...bf reference earlier run
|
||
|
c0...ff space+ASCII
|
||
|
|
||
|
*/
|
||
|
unsigned tBuf::Decompress()
|
||
|
{
|
||
|
if (!buf)
|
||
|
return 0;
|
||
|
if (!isCompressed) {
|
||
|
// cout<<"Buffer already uncompressed. Doing nothing"<<endl;
|
||
|
return len;
|
||
|
// } else {
|
||
|
// cout<<"Decompressing buffer"<<endl;
|
||
|
}
|
||
|
|
||
|
// we "know" that all decompresses fit within 4096, right?
|
||
|
byte *pOut = new byte[6000];
|
||
|
byte *in_buf = buf;
|
||
|
byte *out_buf = pOut;
|
||
|
|
||
|
unsigned int i, j;
|
||
|
|
||
|
for (j = i = 0; j < len;)
|
||
|
{
|
||
|
unsigned int c;
|
||
|
|
||
|
// take a char from the input buffer
|
||
|
c = in_buf[j++];
|
||
|
|
||
|
// separate the char into zones: 0, 1...8, 9...0x7F, 0x80...0xBF, 0xC0...0xFF
|
||
|
|
||
|
// codes 1...8 mean copy that many bytes; for accented chars & binary
|
||
|
if (c > 0 && c < 9)
|
||
|
while (c--)
|
||
|
out_buf[i++] = in_buf[j++];
|
||
|
|
||
|
// codes 0, 9...0x7F represent themselves
|
||
|
else if (c < 0x80)
|
||
|
out_buf[i++] = c;
|
||
|
|
||
|
// codes 0xC0...0xFF represent "space + ascii char"
|
||
|
else if (c >= 0xC0)
|
||
|
out_buf[i++] = ' ', out_buf[i++] = c ^ 0x80;
|
||
|
|
||
|
// codes 0x80...0xBf represent sequences
|
||
|
else
|
||
|
{
|
||
|
int m, n;
|
||
|
|
||
|
c <<= 8;
|
||
|
c += in_buf[j++];
|
||
|
m = (c & 0x3FFF) >> COUNT_BITS;
|
||
|
n = c & ((1 << COUNT_BITS) - 1);
|
||
|
n += 3;
|
||
|
while (n--)
|
||
|
{
|
||
|
out_buf[i] = out_buf[i - m];
|
||
|
i++;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
out_buf[i++]='\0';
|
||
|
out_buf[i++]='\0';
|
||
|
delete[]buf;
|
||
|
buf = pOut;
|
||
|
len = i;
|
||
|
|
||
|
isCompressed = false;
|
||
|
return i;
|
||
|
}
|
||
|
|
||
|
unsigned tBuf::DuplicateCR()
|
||
|
{
|
||
|
if (!buf)
|
||
|
return 0;
|
||
|
byte *pBuf = new byte[2 * len];
|
||
|
|
||
|
unsigned int k, j;
|
||
|
|
||
|
for (j = k = 0; j < len; j++, k++)
|
||
|
{
|
||
|
pBuf[k] = buf[j];
|
||
|
if (pBuf[k] == 0x0A)
|
||
|
pBuf[k++] = 0x0D, pBuf[k] = 0x0A;
|
||
|
}
|
||
|
delete[]buf;
|
||
|
buf = pBuf;
|
||
|
len = k;
|
||
|
return k;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
// this nasty little beast removes really low ASCII and 0's
|
||
|
// and handles the CR problem
|
||
|
//
|
||
|
// if a cr appears before a lf, then remove the cr
|
||
|
// if a cr appears in isolation, change to a lf
|
||
|
unsigned tBuf::RemoveBinary()
|
||
|
{
|
||
|
if (!buf)
|
||
|
return 0;
|
||
|
byte *in_buf = buf;
|
||
|
byte *out_buf = new byte[len];
|
||
|
|
||
|
unsigned int k, j;
|
||
|
|
||
|
for (j = k = 0; j < len; j++, k++)
|
||
|
{
|
||
|
// copy each byte
|
||
|
out_buf[k] = in_buf[j];
|
||
|
|
||
|
// throw away really low ASCII
|
||
|
if (( /*out_buf[k]>=0 && */ out_buf[k] < 9))
|
||
|
k--;
|
||
|
|
||
|
// for CR
|
||
|
if (out_buf[k] == 0x0D)
|
||
|
{
|
||
|
// if next is LF, then drop it
|
||
|
if (j < len - 1 && in_buf[j + 1] == 0x0A)
|
||
|
k--;
|
||
|
else // turn it into a LF
|
||
|
out_buf[k] = 0x0A;
|
||
|
}
|
||
|
}
|
||
|
delete[]buf;
|
||
|
buf = out_buf;
|
||
|
len = k;
|
||
|
return k;
|
||
|
}
|
||
|
|
||
|
void tBuf::setText(const byte * text, unsigned txtlen, bool txtcomp)
|
||
|
{
|
||
|
if (buf)
|
||
|
delete[]buf;
|
||
|
buf = 0L;
|
||
|
|
||
|
if (txtlen <= 0)
|
||
|
txtlen = strlen((const char *) text);
|
||
|
len = txtlen;
|
||
|
buf = new byte[len];
|
||
|
|
||
|
memcpy(buf, text, len*sizeof(char));
|
||
|
// strncpy((char *) buf, (const char *) text, len);
|
||
|
isCompressed = txtcomp;
|
||
|
// cout<<"Setting text, compressed="<<txtcomp<<endl;
|
||
|
}
|