You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
kchmviewer/lib/libchmfile/libchmfileimpl.h

289 lines
10 KiB

/***************************************************************************
* Copyright (C) 2004-2007 by Georgy Yunaev, gyunaev@ulduzsoft.com *
* Portions Copyright (C) 2003 Razvan Cojocaru <razvanco@gmx.net> *
* Please do not use email address above for bug reports; see *
* the README file *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
***************************************************************************/
#if USE_BUILTIN_CHMLIB
#include "chm_lib.h"
#else
#include <chm_lib.h>
#endif
#include "libchmfile.h"
#include "libchmtocimage.h"
#include <sys/types.h> /* for u_int{32,64}_t */
//! Keeps the intermediate search result: a vector of 64-bit offsets plus
//! the title and URL offsets of one match.
class LCHMSearchProgressResult
{
    public:
        //! Creates an empty result. Both offsets are zero-initialized so that a
        //! default-constructed element (for example, one created by the vector
        //! that stores these results) never carries indeterminate values.
        inline LCHMSearchProgressResult() : titleoff(0), urloff(0) {}

        //! Creates a result with title offset \a t and URL offset \a u.
        //! The offsets vector starts out empty.
        inline LCHMSearchProgressResult( u_int32_t t, u_int32_t u ) : titleoff(t), urloff(u) {}

        //! Offsets collected for this result (presumably the word offsets kept
        //! for phrase searches -- confirm against searchWord/ProcessWLC).
        TQValueVector<u_int64_t>    offsets;

        //! Title offset of the match.
        u_int32_t                   titleoff;

        //! URL offset of the match.
        u_int32_t                   urloff;
};
//! An array that keeps the intermediate search results
typedef QT34VECTOR<LCHMSearchProgressResult> LCHMSearchProgressResults;
//! CHM file processor: the implementation class that backs the LCHMFile members.
class LCHMFileImpl
{
    public:
        LCHMFileImpl();
        ~LCHMFileImpl();

        // NOTE(review): the class keeps a raw chmFile pointer and declares a
        // destructor, but no copy constructor / assignment operator is declared
        // here. Confirm that instances are never copied.

        // ----- Implementations for LCHMFile members -----

        bool        loadFile( const TQString& archiveName );
        void        closeAll();

        //! Returns m_title passed through encodeWithCurrentCodec().
        TQString    title() const
        {
            return encodeWithCurrentCodec( m_title );
        }

        //! Returns m_home passed through encodeWithCurrentCodec().
        TQString    homeUrl() const
        {
            return encodeWithCurrentCodec( m_home );
        }

        bool        getFileContentAsString( TQString * str, const TQString& url, bool internal_encoding = false );
        bool        getFileContentAsBinary( TQByteArray * data, const TQString& url ) const;
        bool        getFileSize( unsigned int * size, const TQString& url );
        bool        enumerateFiles( TQStringList * files );
        TQString    getTopicByUrl ( const TQString& url ) const;
        const TQPixmap * getBookIconPixmap( unsigned int imagenum );
        bool        setCurrentEncoding( const LCHMTextEncoding * encoding );

        //! Parses the HHC or HHS file and fills the \a data array, either as
        //! the context (\a asIndex is false) or as the index (\a asIndex is true).
        bool        parseFileAndFillArray (const TQString& file, QT34VECTOR< LCHMParsedEntry > * data, bool asIndex );

        /*!
         * \brief Fast search using the $FIftiMain file in the .chm.
         * \param word The text we're looking for.
         * \param wholeWords Are we looking for whole words only?
         * \param titlesOnly Are we looking for titles only?
         * \param results Container of intermediate search results, passed by
         *        reference. NOTE(review): the earlier comment called this a
         *        string-string hashmap (URL -> page title), which does not match
         *        the LCHMSearchProgressResults type; confirm the contract.
         * \param phrase_search Indicates that word offset information should be kept.
         * \return true if the search found something, false otherwise.
         */
        bool        searchWord( const TQString& word,
                                bool wholeWords,
                                bool titlesOnly,
                                LCHMSearchProgressResults& results,
                                bool phrase_search );

        /*!
         * \brief Finalizes the search: resolves the matches and generates the results list.
         * \param tempres Temporary search results from searchWord().
         * \param results String list that receives the results. NOTE(review): the
         *        earlier comment described a URL -> page title hashmap, but the
         *        parameter is a TQStringList; confirm the element format.
         * \param limit_results Presumably the maximum number of results to
         *        produce (defaults to 500) -- confirm against the implementation.
         */
        void        getSearchResults( const LCHMSearchProgressResults& tempres,
                                      TQStringList * results,
                                      unsigned int limit_results = 500 );

        //! Looks up \a fileName in the archive.
        bool        ResolveObject( const TQString& fileName, chmUnitInfo *ui ) const;

        //! Retrieves an uncompressed chunk of a file in the .chm.
        size_t      RetrieveObject(const chmUnitInfo *ui, unsigned char *buffer, LONGUINT64 fileOffset, LONGINT64 bufferSize) const;

        //! Runs the UTF-8 form of \a str through the currently selected text
        //! codec. When no codec is selected the string is returned unchanged.
        inline TQString encodeWithCurrentCodec (const TQString& str) const
        {
            if ( !m_textCodec )
                return str;

            return m_textCodec->toUnicode (str.utf8());
        }

        //! Runs the C string \a str through the currently selected text codec.
        //! When no codec is selected the string is converted to TQString as-is.
        inline TQString encodeWithCurrentCodec (const char * str) const
        {
            if ( !m_textCodec )
                return TQString( str );

            return m_textCodec->toUnicode (str);
        }

        //! Same as encodeWithCurrentCodec(), but uses the codec selected for
        //! the internal (special) files. Returns \a str unchanged when that
        //! codec is not set.
        inline TQString encodeInternalWithCurrentCodec (const TQString& str) const
        {
            if ( !m_textCodecForSpecialFiles )
                return str;

            return m_textCodecForSpecialFiles->toUnicode (str.utf8());
        }

        //! C string flavour of encodeInternalWithCurrentCodec(): decodes with
        //! the special-files codec, or converts as-is when that codec is not set.
        inline TQString encodeInternalWithCurrentCodec (const char * str) const
        {
            if ( !m_textCodecForSpecialFiles )
                return TQString( str );

            return m_textCodecForSpecialFiles->toUnicode (str);
        }

        //! Helper. Translates a Win32 font charset description into an encoding
        //! name. NOTE(review): the earlier comment said the target was "generic
        //! wxWidgets" encodings, which looks like a leftover -- confirm.
        const char * GetFontEncFromCharSet (const TQString& font) const;

        //! Helper. Returns the $FIftiMain offset of the leaf node, or 0.
        u_int32_t   GetLeafNodeOffset(const TQString& text,
                                      u_int32_t initalOffset,
                                      u_int32_t buffSize,
                                      u_int16_t treeDepth );

        //! Helper. Processes the word location code (WLC) entries while searching.
        bool        ProcessWLC(u_int64_t wlc_count,
                               u_int64_t wlc_size,
                               u_int32_t wlc_offset,
                               unsigned char ds,
                               unsigned char dr,
                               unsigned char cs,
                               unsigned char cr,
                               unsigned char ls,
                               unsigned char lr,
                               LCHMSearchProgressResults& results,
                               bool phrase_search );

        //! Looks up as much information as possible from #WINDOWS/#STRINGS.
        bool        getInfoFromWindows();

        //! Looks up as much information as possible from #SYSTEM.
        bool        getInfoFromSystem();

        //! Fills the topic-url map.
        void        fillTopicsUrlMap();

        //! Sets up the text codec by \a name.
        void        setupTextCodec (const char * name);

        //! Guesses the text encoding in use from m_detectedLCID and m_font,
        //! and sets up m_textCodec.
        bool        guessTextEncoding ();

        //! Changes the current CHM encoding for internal files and texts.
        //! The encoding is either a simple TQt codepage, or a pair such as
        //! CP1251/KOI8, which sets the encodings separately for text (first)
        //! and internal files (second).
        bool        changeFileEncoding( const char *qtencoding );

        //! Converts the search word so that it has an appropriate encoding.
        TQCString   convertSearchWord ( const TQString &src );

        /*!
         * Helper procedure in TOC parsing. Decodes the string between the quotes
         * (first or last), optionally decoding HTML entities like &iacute;
         */
        int         findStringInQuotes (const TQString& tag, int offset, TQString& value, bool firstquote, bool decodeentities );

        /*!
         * Decodes Unicode HTML entities according to the current encoding.
         */
        TQString    decodeEntity (const TQString& entity );

        /*!
         * \brief Returns the list of all available text encodings.
         * \return A pointer to the beginning of the text encoding table. The table
         *         can be enumerated until language == 0, which marks its end.
         *
         * \ingroup encoding
         */
        static const LCHMTextEncoding * getTextEncodingTable();

        /*!
         * \brief Looks up an encoding by LCID.
         * \param lcid LCID to look up.
         * \return A pointer to the encoding structure.
         *
         * \ingroup encoding
         */
        static const LCHMTextEncoding * lookupByLCID( short lcid );

        /*!
         * \brief Gets the encoding index.
         * \param enc Encoding.
         * \return An index into the encoding table; getTextEncodingTable() + i
         *         yields the encoding.
         *
         * \ingroup encoding
         */
        static int  getEncodingIndex( const LCHMTextEncoding * enc);

        /*!
         * Normalizes a path for searching in the internal arrays.
         */
        TQString    normalizeUrl (const TQString& path ) const;

        // ----- Members -----

        //! Pointer to the chmlib structure
        chmFile *   m_chmFile;

        //! Name of the opened file
        TQString    m_filename;

        //! Home URL, taken from the CHM file
        TQString    m_home;

        //! Context tree filename, taken from the CHM file
        TQString    m_topicsFile;

        //! Index filename, taken from the CHM file
        TQString    m_indexFile;

        //! CHM title, taken from the CHM file
        TQString    m_title;

        // ----- Localization stuff -----

        //! LCID from the CHM file, used in encoding detection
        short       m_detectedLCID;

        //! Font charset from the CHM file, used in encoding detection
        TQString    m_font;

        //! Chosen text codec; consulted by encodeWithCurrentCodec()
        TQTextCodec * m_textCodec;

        //! Text codec for the internal (special) files; consulted by
        //! encodeInternalWithCurrentCodec()
        TQTextCodec * m_textCodecForSpecialFiles;

        //! Current encoding
        const LCHMTextEncoding * m_currentEncoding;

        //! Map used to decode HTML entities like &acute; based on the current encoding
        TQMap<TQString, TQString> m_entityDecodeMap;

        //! TRUE if /#TOPICS, /#STRINGS, /#URLTBL and /#URLSTR are resolved,
        //! and the members below are valid
        bool        m_lookupTablesValid;

        //! Unit info for /#TOPICS
        chmUnitInfo m_chmTOPICS;

        //! Unit info for /#STRINGS
        chmUnitInfo m_chmSTRINGS;

        //! Unit info for /#URLTBL
        chmUnitInfo m_chmURLTBL;

        //! Unit info for /#URLSTR
        chmUnitInfo m_chmURLSTR;

        //! Indicates whether the built-in search is available. This is true only
        //! when m_lookupTablesValid is TRUE and m_chmFIftiMain is resolved.
        bool        m_searchAvailable;

        //! Unit info for /$FIftiMain
        chmUnitInfo m_chmFIftiMain;

        //! Storage for the book TOC icon images
        LCHMTocImageKeeper m_imagesKeeper;

        //! Map url -> topic
        TQMap< TQString, TQString > m_url2topics;
};