// kpilot/conduits/docconduit/makedoc9.cc

// based on: MakeDoc, version 2
// I only took the tBuf class from there and adapted it.
//
// Compresses text files into a format that is ready to export to a Pilot
// and work with Rick Bram's PilotDOC reader.
// Copyright (C) Reinhold Kainhofer, 2002
// Copyright (C) Pat Beirne, 2000
//
// Original file (makedoc9.cpp) copyright by:
// Copyright (C) Pat Beirne, 2000.
// Distributable under the GNU General Public License Version 2 or later.
//
// ver 0.6 enforce 31 char limit on database names
// ver 0.7 change header and record0 to structs
// ver 2.0 added category control on the command line
//              changed extensions from .prc to .pdb

/*
** This program is free software; you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation; either version 2 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program in a file called COPYING; if not, write to
** the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
** MA 02110-1301, USA.
*/


#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <algorithm>
#include <iostream>


#include "makedoc9.h"


//
// Issue()
//
// action: append a single source character to the compressed stream
//              held in buf, starting at offset len
//
// params: src     the character to encode
//         bSpace  in/out flag; non-zero while a space has been seen but
//                 not yet written. A pending space is folded into the
//                 next character when that character lies in
//                 0x40...0x7F, otherwise it is written out on its own.
//
// returns: the new length of the stream (also stored in len)
//
// comment: no bounds checking is done here; one call may write up to
//              three bytes (space, escape marker, character).
//
// TODO (from the original author): a simpler variant that never merges
// spaces used to sit next to this one under "#if 0"; it is still open
// which of the two should really be used.
//
unsigned
 tBuf::Issue(byte src, int &bSpace)
{
	unsigned int pos = len;
	byte *out = buf;

	// 0 and 9...0x7F go out as themselves; 1...8 and 0x80...0xFF need
	// a leading escape marker (1)
	const bool isPlain = (src < 0x80) && (src == 0 || src > 8);
	const bool spacePending = (bSpace != 0);

	if (spacePending && src >= 0x40 && src <= 0x7F)
	{
		// squeeze the outstanding space and the ASCII char into one byte
		out[pos++] = src ^ 0x80;
		bSpace = 0;
	}
	else if (!spacePending && src == ' ')
	{
		// hold the space back; it may merge with the next char
		bSpace = 1;
	}
	else
	{
		if (spacePending)
		{
			// couldn't squeeze it in, so the space goes out by itself
			out[pos++] = ' ';
			bSpace = 0;
		}
		if (!isPlain)
			out[pos++] = 1;
		out[pos++] = src;
	}

	len = pos;
	return pos;
}

//
// Compress
//
// params:      none
//
// returns:     the length of the compressed data; 0 if there is no buffer,
//              or the unchanged length if the buffer is already flagged
//              as compressed
//
// action:      takes the given buffer and compresses the original data
//              down into a second, freshly allocated buffer, which then
//              replaces the original (the original is freed).
//
//              Pass 1 walks the input and emits, per step, either one
//              literal (preceded by the escape marker 1 when the byte is
//              0...8 or above 0x7F) or a two-byte code referring back to
//              an earlier occurrence of the same run.
//              Pass 2 walks the output in place, merging consecutive
//              escaped bytes under one count byte and folding a space
//              into a following 0x40...0x7F character.
//
// comment:     This version makes heavy use of walking pointers.
//              Earlier occurrences are located with std::search on the raw
//              byte range, so the input is never modified, nothing at or
//              beyond the end of the input is touched, and 0 bytes inside
//              the data are treated like any other byte.
//
unsigned tBuf::Compress()
{
	if (!buf)
		return 0;
	if (isCompressed)
		return len;

	// longest test string that is still scanned for: one char more than
	// the longest run a single run-code can describe
	const int maxTestLen = (1 << COUNT_BITS) + 3;
	// furthest distance back that the search is allowed to start
	const int maxDist = (1 << DISP_BITS) - 1;

	// Pass 1 writes at most two bytes (escape marker + byte, or one
	// run-code) per step and every step consumes at least one input byte;
	// pass 2 never grows the data. So 2 * len bytes always suffice. The
	// historical capacity of 6000 is kept as a lower bound.
	const unsigned int minOutSize = 6000;
	const unsigned int worstCase = 2 * len + 1;
	const unsigned int outSize = (worstCase > minOutSize) ? worstCase : minOutSize;

	byte *pBuffer = buf;                // start of the input buffer (freed at the end)
	byte *pEnd = pBuffer + len;         // 1 past the end of the input buffer
	byte *pTestHead = pBuffer;          // start of the current test string
	byte *pTestTail = pTestHead + 1;    // one past the end of the current test string
	byte *pPrevHit = pBuffer;           // previous value of pHit; also, start of next search
	byte *pHit = pBuffer;               // where the current test string was found

	// make a dest buffer and reassign the local buffer
	buf = new byte[outSize];
	len = 0;                            // used to walk through the output buffer

	// loop, absorbing one more char from the input buffer on each pass
	for (; pTestHead != pEnd; pTestTail++)
	{
		// if we already have a maximum-length match, don't bother
		// scanning again for one more char (wasted time)
		if (pTestTail - pTestHead != maxTestLen)
		{
			// Look for the first occurrence of [pTestHead, pTestTail)
			// at or after pPrevHit. The test string itself lies inside
			// the searched range, so the search always succeeds; a hit
			// at pTestHead means "no earlier occurrence".
			pHit = std::search(pPrevHit, pTestTail, pTestHead, pTestTail);
		}

		// on a mismatch, a full-length run or the end of the buffer,
		// issue codes
		if (pHit == pTestHead
			|| pTestTail - pTestHead > maxTestLen - 1
			|| pTestTail == pEnd)
		{
			// first, check for short runs: issue a single literal
			if (pTestTail - pTestHead < 4)
			{
				if (pTestHead[0] > 0x7F || pTestHead[0] <= 8)
					buf[len++] = 1;
				buf[len++] = pTestHead[0];
				pTestHead++;
			}
			// for longer runs, issue a run-code: distance back to the
			// previous hit in the high bits, (run length - 3) in the
			// low COUNT_BITS bits
			else
			{
				const unsigned int dist =
					static_cast<unsigned int>(pTestHead - pPrevHit);
				const unsigned int extra =
					static_cast<unsigned int>(pTestTail - pTestHead - 4);
				const unsigned int compound = (dist << COUNT_BITS) + extra;

				buf[len++] = 0x80 + (compound >> 8);
				buf[len++] = compound & 0xFF;
				// and start again; the last char of the test string was
				// not part of the run
				pTestHead = pTestTail - 1;
			}
			// start the search again from the beginning of the input,
			// but stay within the reachable distance
			pPrevHit = pBuffer;
			if (pTestHead - pPrevHit > maxDist)
				pPrevHit = pTestHead - maxDist;
		}
		// got a match
		else
		{
			pPrevHit = pHit;
		}
		// when we get to the end of the buffer, don't inc past the end
		// this forces the residue chars out one at a time
		if (pTestTail == pEnd)
			pTestTail--;
	}


	// final scan to merge consecutive high chars together
	// and merge space chars; i reads, k writes, k never overtakes i
	unsigned int i, k;

	for (i = k = 0; i < len; i++, k++)
	{
		buf[k] = buf[i];
		// skip the run-length codes (copy their second byte untouched)
		if (buf[k] >= 0x80 && buf[k] < 0xC0)
			buf[++k] = buf[++i];
		// if we hit a high char marker, look ahead for another
		// and merge multiples together; buf[k] becomes the count (max 8)
		else if (buf[k] == 1)
		{
			buf[k + 1] = buf[i + 1];
			while (i + 2 < len && buf[i + 2] == 1 && buf[k] < 8)
			{
				buf[k]++;
				buf[k + buf[k]] = buf[i + 3];
				i += 2;
			}
			k += buf[k];
			i++;
		}
		// a space followed by 0x40...0x7F collapses into one byte
		else if (buf[k] == ' ' && i + 1 < len && buf[i + 1] <= 0x7F
			&& buf[i + 1] >= 0x40)
			buf[k] = 0x80 | buf[++i];
	}

	// delete original buffer
	delete[]pBuffer;
	len = k;

	isCompressed = true;
	return k;
}

/*
	Decompress

	params:	none

	returns: the new length of the buffer. Two 0 bytes are appended after
					the decoded data and are included in that length.
					0 if there is no buffer; the unchanged length if the
					buffer is not flagged as compressed.

	action: make a new buffer
					run through the source data
					check the 4 cases:
						0,9...7F represent self
						1...8		escape n chars
						80...bf reference earlier run
						c0...ff	space+ASCII
					then replace the old buffer with the new one

	comment: the output buffer is sized for the worst possible expansion
					of the input, reads never go past the end of the input,
					and decoding stops early at a back-reference that points
					outside the data decoded so far (corrupt input).

*/
unsigned tBuf::Decompress()
{
	if (!buf)
		return 0;
	if (!isCompressed)
		return len;

	// Worst case: a two-byte sequence code expands to at most
	// (1 << COUNT_BITS) + 2 bytes, any other input byte to at most two
	// bytes. Budget every pair of input bytes for the larger of the two,
	// plus room for the two trailing 0 bytes. The historical capacity of
	// 6000 is kept as a lower bound.
	const unsigned int maxRun = (1 << COUNT_BITS) + 2;
	const unsigned int perPair = (maxRun > 4) ? maxRun : 4;
	const unsigned int worstCase = (len / 2 + 1) * perPair + 2;
	const unsigned int minOutSize = 6000;
	const unsigned int outSize = (worstCase > minOutSize) ? worstCase : minOutSize;

	byte *pOut = new byte[outSize];
	byte *in_buf = buf;
	byte *out_buf = pOut;

	unsigned int i = 0;		// write position in out_buf
	unsigned int j = 0;		// read position in in_buf

	while (j < len)
	{
		// take a char from the input buffer
		unsigned int c = in_buf[j++];

		// separate the char into zones: 0, 1...8, 9...0x7F, 0x80...0xBF, 0xC0...0xFF

		// codes 1...8 mean copy that many bytes; for accented chars & binary
		if (c > 0 && c < 9)
		{
			// stop copying if the input ends before the count is used up
			while (c-- && j < len)
				out_buf[i++] = in_buf[j++];
		}

		// codes 0, 9...0x7F represent themselves
		else if (c < 0x80)
			out_buf[i++] = c;

		// codes 0xC0...0xFF represent "space + ascii char"
		else if (c >= 0xC0)
		{
			out_buf[i++] = ' ';
			out_buf[i++] = c ^ 0x80;
		}

		// codes 0x80...0xBf represent sequences
		else
		{
			// the second byte of the code is missing: truncated input
			if (j >= len)
				break;

			c <<= 8;
			c += in_buf[j++];
			// distance back into the output, and number of bytes to copy
			const unsigned int m = (c & 0x3FFF) >> COUNT_BITS;
			unsigned int n = (c & ((1 << COUNT_BITS) - 1)) + 3;

			// a distance of 0 or one reaching before the start of the
			// output cannot come from valid data
			if (m == 0 || m > i)
				break;

			// byte-by-byte on purpose: source and destination may overlap
			while (n--)
			{
				out_buf[i] = out_buf[i - m];
				i++;
			}
		}
	}
	out_buf[i++]='\0';
	out_buf[i++]='\0';
	delete[]buf;
	buf = pOut;
	len = i;

	isCompressed = false;
	return i;
}

unsigned tBuf::DuplicateCR()
{
	if (!buf)
		return 0;
	byte *pBuf = new byte[2 * len];

	unsigned int k, j;

	for (j = k = 0; j < len; j++, k++)
	{
		pBuf[k] = buf[j];
		if (pBuf[k] == 0x0A)
			pBuf[k++] = 0x0D, pBuf[k] = 0x0A;
	}
	delete[]buf;
	buf = pBuf;
	len = k;
	return k;
}


// this nasty little beast removes really low ASCII and 0's
// and handles the CR problem
//
// if a cr appears before a lf, then remove the cr
// if a cr appears in isolation, change to a lf
unsigned tBuf::RemoveBinary()
{
	if (!buf)
		return 0;
	byte *in_buf = buf;
	byte *out_buf = new byte[len];

	unsigned int k, j;

	for (j = k = 0; j < len; j++, k++)
	{
		// copy each byte
		out_buf[k] = in_buf[j];

		// throw away really low ASCII
		if (( /*out_buf[k]>=0 && */ out_buf[k] < 9))
			k--;

		// for CR
		if (out_buf[k] == 0x0D)
		{
			// if next is LF, then drop it
			if (j < len - 1 && in_buf[j + 1] == 0x0A)
				k--;
			else						  // turn it into a LF
				out_buf[k] = 0x0A;
		}
	}
	delete[]buf;
	buf = out_buf;
	len = k;
	return k;
}

void tBuf::setText(const byte * text, unsigned txtlen, bool txtcomp)
{
	if (buf)
		delete[]buf;
	buf = 0L;

	if (txtlen <= 0)
		txtlen = strlen((const char *) text);
	len = txtlen;
	buf = new byte[len];

	memcpy(buf, text, len*sizeof(char));
//	strncpy((char *) buf, (const char *) text, len);
	isCompressed = txtcomp;
//	cout<<"Setting text, compressed="<<txtcomp<<endl;
}