You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
509 lines
15 KiB
C++
509 lines
15 KiB
C++
//
|
|
// WordDBPage.h
|
|
//
|
|
// WordDBPage: Implements specific compression scheme for
|
|
// Berkeley DB pages containing WordReferences objects.
|
|
//
|
|
// Part of the ht://Dig package <http://www.htdig.org/>
|
|
// Copyright (c) 1999-2004 The ht://Dig Group
|
|
// For copyright details, see the file COPYING in your distribution
|
|
// or the GNU Library General Public License (LGPL) version 2 or later
|
|
// <http://www.gnu.org/copyleft/lgpl.html>
|
|
//
|
|
// $Id: WordDBPage.h,v 1.8 2004/05/28 13:15:26 lha Exp $
|
|
//
|
|
//
|
|
// Access to Berkeley DB internal
|
|
//
|
|
|
|
#ifndef _WordDBPage_h_
|
|
#define _WordDBPage_h_
|
|
|
|
extern "C"
|
|
{
|
|
#include "db_int.h"
|
|
#include "shqueue.h"
|
|
#include "db_shash.h"
|
|
#include "mp.h"
|
|
#include "db_page.h"
|
|
#include "common_ext.h"
|
|
}
|
|
|
|
#include "WordDBCompress.h"
|
|
#include "WordBitCompress.h"
|
|
#include "WordRecord.h"
|
|
#include "WordKey.h"
|
|
|
|
|
|
#define WORD_ALIGN_TO(v,a) ( (v)%(a) ? (v+((a)-(v)%(a))) : v)
|
|
#define NBITS_KEYLEN 16
|
|
#define NBITS_DATALEN 16
|
|
|
|
// ***********************************************
|
|
// *************** WordDBRecord *****************
|
|
// ***********************************************
|
|
|
|
// WordRecord with added functionalities to help with compression/decompression
|
|
class WordDBRecord : public WordRecord
|
|
{
|
|
public:
|
|
|
|
// retreive WordRecord data/stats from coded numbers
|
|
void set_decompress(unsigned int **data,int *indexes,int i,int pdata,int pstat0,int pstat1)
|
|
{
|
|
if(i>=indexes[pstat0])
|
|
{// were pas the end of coded stats, so this can't be a stat
|
|
type=DefaultType();
|
|
if(type==WORD_RECORD_DATA){info.data=data[pdata][i-indexes[pstat0]];}
|
|
else{info.data=0;}
|
|
}
|
|
else
|
|
{// this is a stat
|
|
type=WORD_RECORD_STATS;
|
|
info.stats.noccurrence=data[pstat0][i];
|
|
info.stats.ndoc =data[pstat1][i];
|
|
}
|
|
}
|
|
WordDBRecord():WordRecord(){;}
|
|
WordDBRecord(byte *dat,int len,int rectyp):WordRecord()
|
|
{
|
|
type=(rectyp ? DefaultType() : WORD_RECORD_STATS);
|
|
Unpack(String((char *)dat,len));
|
|
}
|
|
WordDBRecord(BKEYDATA *ndata,int rectyp):WordRecord()
|
|
{// typ: 0->stat 1->data
|
|
type=(rectyp ? DefaultType() : WORD_RECORD_STATS);
|
|
Unpack(String((char *)ndata->data,ndata->len));
|
|
}
|
|
};
|
|
|
|
|
|
// ***********************************************
|
|
// **************** WordDBKey *****************
|
|
// ***********************************************
|
|
|
|
// WordKey with added functionalities to help with compression/decompression
|
|
class WordDBKey : public WordKey
|
|
{
|
|
BKEYDATA *key;
|
|
public:
|
|
|
|
int RecType(){return (GetWord()[0]!=1 ? 1 :0);}
|
|
WordDBKey():WordKey()
|
|
{
|
|
key=NULL;
|
|
}
|
|
WordDBKey(BKEYDATA *nkey):WordKey()
|
|
{
|
|
key=nkey;
|
|
Unpack(String((char *)key->data,key->len));
|
|
}
|
|
int is_null()
|
|
{
|
|
errr("UNUSED");
|
|
if(GetWord().length()==0)
|
|
{
|
|
for(int j=1;j<NFields();j++)
|
|
{if(Get(j)!=0){errr("WordDBKey::is_null key has 0 len word but is not null");}}
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
WordDBKey(BINTERNAL *nkey):WordKey()
|
|
{
|
|
key=NULL;
|
|
if(nkey->len==0)
|
|
{
|
|
;// errr("WordDBKey::WordDBKey(BINTERNAL) : nkey->len==0");
|
|
}
|
|
else{Unpack(String((char *)nkey->data,nkey->len));}
|
|
}
|
|
WordDBKey(byte *data,int len):WordKey()
|
|
{
|
|
key=NULL;
|
|
if(!data || !len){errr("WordDBKey::WordDBKey(data,len) !data || !len");}
|
|
Unpack(String((char *)data,len));
|
|
}
|
|
};
|
|
|
|
|
|
// ***********************************************
|
|
// **************** WordDBPage *****************
|
|
// ***********************************************
|
|
|
|
// encapsulation of Berkeley DB BTREE page.
|
|
// this one knows how to compress/decompress itself
|
|
class WordDBPage
|
|
{
|
|
public:
|
|
int n; // number of entries
|
|
int nk; // number of keys
|
|
int type; // for now 3(btreeinternal) && 5(leave:normal case) are allowed
|
|
int pgsz;
|
|
|
|
PAGE *pg; // pointer to BerkeleyDB BTREE page structure
|
|
|
|
// assert this page is a leave
|
|
void isleave()
|
|
{
|
|
if(type!=P_LBTREE){errr("WordDBPage::isleave: trying leave specific on non leave");}
|
|
}
|
|
|
|
// assert this page is an internal (non-leave) page
|
|
void isintern()
|
|
{
|
|
if(type!=P_IBTREE){errr("WordDBPage::isintern: trying btreeinternal specific on non btreeinternal page type");}
|
|
|
|
}
|
|
|
|
// get the i'th key stored in this page
|
|
WordDBKey get_WordDBKey(int i)
|
|
{
|
|
if(type==P_LBTREE){return(WordDBKey(key(i)));}
|
|
else
|
|
if(type==P_IBTREE){return(WordDBKey(btikey(i)));}
|
|
else
|
|
{errr("WordDBPage:get_WordDBKey: bad page type");}
|
|
return WordDBKey();
|
|
}
|
|
|
|
// ******************* Accessors to packed entries ****************
|
|
|
|
// get the i'th key stored in this (internal==nonleave) page. (ptr to packed)
|
|
BINTERNAL *btikey(int i)
|
|
{
|
|
if(i<0 || i>=pg->entries){printf("btikey:%d\n",i);errr("WordDBPage::btikey out iof bounds");}
|
|
isintern();return(GET_BINTERNAL(pg,i ));
|
|
}
|
|
// get the i'th entry stored in this (nonleave) page. (ptr to packed)
|
|
// an entry can either be a key or a data entry
|
|
BKEYDATA *entry (int i)
|
|
{
|
|
if(i<0 || i>=pg->entries){printf("entry:%d\n",i);errr("WordDBPage::entry out iof bounds");}
|
|
isleave(); return(GET_BKEYDATA (pg,i ));
|
|
}
|
|
// get the i'th key stored in this (leave) page. (ptr to packed)
|
|
BKEYDATA *key (int i)
|
|
{
|
|
if(i<0 || 2*i>=pg->entries){printf("key:%d\n",i);errr("WordDBPage::key out iof bounds");}
|
|
isleave(); return(GET_BKEYDATA (pg,i*2 ));
|
|
}
|
|
// get the i'th data stored in this (leave) page. (ptr to packed)
|
|
BKEYDATA *data (int i)
|
|
{
|
|
if(i<0 || 2*i+1>=pg->entries){printf("data:%d\n",i);errr("WordDBPage::data out iof bounds");}
|
|
isleave(); return(GET_BKEYDATA (pg,i*2+1));
|
|
}
|
|
|
|
|
|
// ********************* Inserting entries into a page ***************
|
|
|
|
int insert_pos; // offset in page of last inserted entry
|
|
int insert_indx; // index of next entry to be inserted
|
|
|
|
int e_offset(int i) {return((int)(pg->inp[i]));}
|
|
|
|
// allocate space (in the db page) for adding an entry to this page
|
|
void *alloc_entry(int size)
|
|
{
|
|
size=WORD_ALIGN_TO(size,4);
|
|
int inp_pos=((byte *)&(pg->inp[insert_indx]))-(byte *)pg;
|
|
insert_pos-=size;
|
|
if(insert_pos<=inp_pos)
|
|
{
|
|
show();
|
|
printf("alloc_entry: allocating size:%4d entrynum:insert_indx:%4d at:insert_pos:%4d\n",size,insert_indx,insert_pos);
|
|
errr("WordDBPage::alloc_entry: PAGE OVERFLOW");
|
|
}
|
|
pg->inp[insert_indx++]=insert_pos;
|
|
return((void *)((byte *)pg+insert_pos));
|
|
}
|
|
|
|
|
|
// add a data entry to this page
|
|
void insert_data(WordDBRecord &wrec)
|
|
{
|
|
isleave();
|
|
if(!(insert_indx%2)){errr("WordDBPage::insert_data data must be an odd number!");}
|
|
String prec;
|
|
wrec.Pack(prec);
|
|
int len=prec.length();
|
|
int size=len+(sizeof(BKEYDATA)-1);
|
|
|
|
BKEYDATA *dat=(BKEYDATA *)alloc_entry(size);
|
|
dat->len=len;
|
|
dat->type=1;//!!!!!!!!!!!!!
|
|
memcpy((void *)dat->data,(void *)(char *)prec,len);
|
|
}
|
|
// add a key entry to this page
|
|
void insert_key(WordDBKey &ky)
|
|
{
|
|
isleave();
|
|
if(insert_indx%2){errr("WordDBPage::insert_key key must be an even number!");}
|
|
String pkey;
|
|
ky.Pack(pkey);
|
|
int keylen=pkey.length();
|
|
int size=keylen+(sizeof(BKEYDATA)-1);
|
|
BKEYDATA *bky=(BKEYDATA *)alloc_entry(size);
|
|
bky->len=keylen;
|
|
bky->type=1;// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
memcpy((void *)bky->data,(void *)(char *)pkey,keylen);
|
|
}
|
|
// add a key entry to this internal page
|
|
void insert_btikey(WordDBKey &ky,BINTERNAL &bti,int empty=0)
|
|
{
|
|
isintern();
|
|
int keylen=0;
|
|
String pkey;
|
|
if(!empty)
|
|
{
|
|
ky.Pack(pkey);
|
|
keylen=pkey.length();
|
|
}
|
|
int size=keylen+((byte *)&(bti.data))-((byte *)&bti);// pos of data field in BINTERNAL
|
|
if(empty)
|
|
{
|
|
if(verbose){printf("WordDBPage::insert_btikey: empty : BINTERNAL:%d datapos:%d keylen:%d size:%d alligned to:%d\n",(int)sizeof(BINTERNAL),
|
|
(int)(((byte *)&(bti.data))-((byte *)&bti)),
|
|
keylen,size,WORD_ALIGN_TO(size,4));}
|
|
}
|
|
|
|
BINTERNAL *btik=(BINTERNAL *)alloc_entry(size);
|
|
btik->len =(empty ? 0 : keylen);
|
|
btik->type=1;// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
btik->pgno =bti.pgno;
|
|
btik->nrecs=bti.nrecs;
|
|
if(!empty){memcpy((void *)btik->data,(void *)(char *)pkey,keylen);}
|
|
// else
|
|
// {btik->data[0]=0;}// just to avoid uninit memory read
|
|
}
|
|
int entry_struct_size()
|
|
{
|
|
return(type==P_IBTREE ? sizeof(BINTERNAL) : sizeof(BKEYDATA ) )-1;
|
|
}
|
|
int entry_size(int i)
|
|
{
|
|
return entry_struct_size() + (type==P_IBTREE ? btikey(i)->len : key(i)->len );
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ************** Comrpession/Uncompression ***************************
|
|
|
|
// The compression functions
|
|
void Compress_extract_vals_wordiffs(int *nums,int *nums_pos,int nnums,HtVector_byte &wordiffs);
|
|
void Compress_show_extracted(int *nums,int *nums_pos,int nnums,HtVector_byte &wordiffs);
|
|
void Compress_vals(Compressor &out,int *nums,int *nums_pos,int nnums);
|
|
void Compress_vals_changed_flags(Compressor &out,unsigned int *cflags,int n);
|
|
void Compress_header(Compressor &out);
|
|
int Compress_main(Compressor &out);
|
|
Compressor *Compress(int debug=0, DB_CMPR_INFO *cmprInfo=NULL);
|
|
|
|
// The uncompression functions
|
|
int Uncompress(Compressor *pin,int debug=0, DB_CMPR_INFO *cmprInfo=NULL);
|
|
int Uncompress_main(Compressor *pin);
|
|
void Uncompress_vals_chaged_flags(Compressor &in,unsigned int **pcflags,int *pn);
|
|
int Uncompress_header(Compressor &in);
|
|
void Uncompress_rebuild(unsigned int **rnums,int *rnum_sizes,int nnums,byte *rworddiffs,int nrworddiffs);
|
|
void Uncompress_show_rebuild(unsigned int **rnums,int *rnum_sizes,int nnums,byte *rworddiffs,int nrworddiffs);
|
|
|
|
int TestCompress(int debuglevel);
|
|
int Compare(WordDBPage &other);
|
|
|
|
// the following functions are use to compress/uncompress
|
|
// keys/data directly
|
|
// This is necesary for the first key/data elements of the page
|
|
void compress_key(Compressor &out,int i)
|
|
{
|
|
if(type==P_IBTREE)
|
|
{
|
|
int len=btikey(i)->len;
|
|
out.put_uint(len,NBITS_KEYLEN,label_str("seperatekey_len",i));
|
|
if(verbose){printf("WordDBPage::compress_key:compress(typ3):%d ::: sizeof(BINTERNAL):%d\n",len,(int)sizeof(BINTERNAL));}
|
|
out.put_uint(btikey(i)->len ,sizeof(btikey(i)->len )*8,label_str("seperatekey_bti_len" ,i));
|
|
out.put_uint(btikey(i)->type ,sizeof(btikey(i)->type )*8,label_str("seperatekey_bti_type" ,i));
|
|
out.put_uint(btikey(i)->pgno ,sizeof(btikey(i)->pgno )*8,label_str("seperatekey_bti_pgno" ,i));
|
|
out.put_uint(btikey(i)->nrecs,sizeof(btikey(i)->nrecs)*8,label_str("seperatekey_bti_nrecs",i));
|
|
if(len){out.put_zone((byte *)btikey(i)->data,8*len,label_str("seperatekey_btidata",i));}
|
|
}
|
|
else
|
|
{
|
|
int len=key(i)->len;
|
|
out.put_uint(len,NBITS_KEYLEN,label_str("seperatekey_len",i));
|
|
if(verbose){printf("WordDBPage::compress_key: compress(typ5):%d\n",len);}
|
|
out.put_zone((byte *)key(i)->data,8*len,label_str("seperatekey_data",i));
|
|
}
|
|
}
|
|
void compress_data(Compressor &out,int i)
|
|
{
|
|
int len=data(i)->len;
|
|
out.put_uint(len,NBITS_DATALEN,label_str("seperatedata_len",i));
|
|
if(verbose){printf("WordDBPage::compress_data: compressdata(typ5):%d\n",len);}
|
|
out.put_zone((byte *)data(i)->data,8*len,label_str("seperatedata_data",i));
|
|
}
|
|
WordDBKey uncompress_key(Compressor &in,int i)
|
|
{
|
|
WordDBKey res;
|
|
int len=in.get_uint(NBITS_KEYLEN,label_str("seperatekey_len",i));
|
|
if(verbose){printf("WordDBPage::uncompress_key: seperatekey:len:%d\n",len);}
|
|
|
|
if(type==P_IBTREE)
|
|
{
|
|
if(len==0 && i!=0){errr("WordDBPage::uncompress_key: keylen=0 && i!=0");}
|
|
BINTERNAL bti;
|
|
bti.len =in.get_uint(sizeof(bti.len )*8,label_str("seperatekey_bti_len" ,i));
|
|
bti.type =in.get_uint(sizeof(bti.type )*8,label_str("seperatekey_bti_type" ,i));
|
|
bti.pgno =in.get_uint(sizeof(bti.pgno )*8,label_str("seperatekey_bti_pgno" ,i));
|
|
bti.nrecs=in.get_uint(sizeof(bti.nrecs)*8,label_str("seperatekey_bti_nrecs",i));
|
|
if(len!=bti.len){errr("WordDBPage::uncompress_key: incoherence: len!=bti.len");}
|
|
if(len)
|
|
{
|
|
byte *gotdata=new byte[len];
|
|
CHECK_MEM(gotdata);
|
|
in.get_zone(gotdata,8*len,label_str("seperatekey_btidata",i));
|
|
res=WordDBKey(gotdata,len);
|
|
delete [] gotdata;
|
|
}
|
|
insert_btikey(res,bti,(len==0 ? 1:0));
|
|
}
|
|
else
|
|
{
|
|
byte *gotdata=new byte[len];
|
|
CHECK_MEM(gotdata);
|
|
in.get_zone(gotdata,8*len,label_str("seperatekey_data",i));
|
|
res=WordDBKey(gotdata,len);
|
|
insert_key(res);
|
|
delete [] gotdata;
|
|
}
|
|
return res;
|
|
}
|
|
WordDBRecord uncompress_data(Compressor &in,int i,int rectyp)
|
|
{
|
|
WordDBRecord res;
|
|
int len=in.get_uint(NBITS_DATALEN,label_str("seperatedata_len",i));
|
|
if(verbose)printf("uncompressdata:len:%d\n",len);
|
|
byte *gotdata=new byte[len];
|
|
CHECK_MEM(gotdata);
|
|
in.get_zone(gotdata,8*len,label_str("seperatedata_data",i));
|
|
res=WordDBRecord(gotdata,len,rectyp);
|
|
insert_data(res);
|
|
delete [] gotdata;
|
|
return res;
|
|
}
|
|
|
|
|
|
// exctracted numerical fields
|
|
|
|
const char* number_field_label(int j)
|
|
{
|
|
if(j>0 && j<WordKey::NFields()){return (char *)(WordKey::Info()->sort[j].name);}
|
|
if( j==CNFLAGS )return "CNFLAGS " ;
|
|
if( j==CNDATASTATS0 )return "CNDATASTATS0 " ;
|
|
if( j==CNDATASTATS1 )return "CNDATASTATS1 " ;
|
|
if( j==CNDATADATA )return "CNDATADATA " ;
|
|
if( j==CNBTIPGNO )return "CNBTIPGNO " ;
|
|
if( j==CNBTINRECS )return "CNBTINRECS " ;
|
|
if( j==CNWORDDIFFPOS )return "CNWORDDIFFPOS" ;
|
|
if( j==CNWORDDIFFLEN )return "CNWORDDIFFLEN" ;
|
|
return "BADFIELD";
|
|
}
|
|
// positions of different fileds in
|
|
// number arrays that are extracted
|
|
int CNFLAGS ;// FLAGS: which key-fields have changed
|
|
int CNFIELDS ;// first numerical field
|
|
int CNDATASTATS0 ;// word record - stats element 0
|
|
int CNDATASTATS1 ;// word record - stats element 1
|
|
int CNDATADATA ;// word record - data
|
|
int CNBTIPGNO ;// internal page: page pointed at by node
|
|
int CNBTINRECS ;// internal page: ??
|
|
int CNWORDDIFFPOS ;// position of first caracter that changed in word
|
|
int CNWORDDIFFLEN ;// number of chars that changed in word
|
|
int nnums ;
|
|
|
|
|
|
// ************** DEBUGING/BENCHMARKING ***************
|
|
void show();
|
|
int verbose;
|
|
int debug;
|
|
|
|
|
|
// ************** Initialization/Destruction *****************
|
|
|
|
// initialize when header is valid
|
|
void init()
|
|
{
|
|
type=pg->type;
|
|
n=pg->entries;
|
|
nk=(type==P_LBTREE ? n/2 : n);
|
|
insert_pos=pgsz;
|
|
insert_indx=0;
|
|
}
|
|
|
|
void init0()
|
|
{
|
|
CNFLAGS =0;
|
|
CNFIELDS =1;
|
|
CNDATASTATS0 = WordKey::NFields() ;
|
|
CNDATASTATS1 = WordKey::NFields() + 1;
|
|
CNDATADATA = WordKey::NFields() + 2;
|
|
CNBTIPGNO = WordKey::NFields() + 3;
|
|
CNBTINRECS = WordKey::NFields() + 4;
|
|
CNWORDDIFFPOS = WordKey::NFields() + 5;
|
|
CNWORDDIFFLEN = WordKey::NFields() + 6;
|
|
nnums=(CNWORDDIFFLEN+1);
|
|
|
|
pg=NULL;
|
|
pgsz=0;
|
|
n=0;
|
|
nk=0;
|
|
type=-1;
|
|
verbose=0;
|
|
debug=0;
|
|
insert_pos=pgsz;
|
|
insert_indx=0;
|
|
}
|
|
|
|
// db page was created here, destroy it
|
|
void delete_page()
|
|
{
|
|
if(!pg){errr("WordDBPage::delete_page: pg==NULL");}
|
|
delete [] pg;
|
|
pg=NULL;
|
|
}
|
|
// unlink db page from this encapsulation
|
|
void unset_page()
|
|
{
|
|
if(!pg){errr("WordDBPage::unset_page: pg==NULL");}
|
|
pg=NULL;
|
|
}
|
|
// the DB page must be unset or deleted
|
|
// before destroying this encapsulation
|
|
~WordDBPage()
|
|
{
|
|
if(pg){errr("WordDBPage::~WordDBPage: page not empty");}
|
|
}
|
|
WordDBPage(int npgsz)
|
|
{
|
|
init0();
|
|
pgsz=npgsz;
|
|
pg=(PAGE *)(new byte[pgsz]);
|
|
CHECK_MEM(pg);
|
|
insert_pos=pgsz;
|
|
insert_indx=0;
|
|
}
|
|
WordDBPage(const u_int8_t* buff,int buff_length)
|
|
{
|
|
init0();
|
|
pg=(PAGE *)buff;
|
|
pgsz=buff_length;
|
|
insert_pos=pgsz;
|
|
insert_indx=0;
|
|
init();
|
|
}
|
|
};
|
|
|
|
|
|
#endif// _WordDBPage_h_
|