You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
167 lines
4.5 KiB
167 lines
4.5 KiB
/*
|
|
This file was taken from the KDE 4.x libraries and backported to TQt 3.
|
|
|
|
Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
|
|
Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net)
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Library General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2 of the License, or (at your option) any later version.
|
|
|
|
This library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Library General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Library General Public License
|
|
along with this library; see the file COPYING.LIB. If not, write to
|
|
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
|
Boston, MA 02110-1301, USA.
|
|
|
|
*/
|
|
#ifndef ENCODINGDETECTOR_H
|
|
#define ENCODINGDETECTOR_H
|
|
|
|
#include <tqstring.h>
|
|
|
|
// Forward declarations: this header only uses these types through pointers
// (constructor argument, decoder() return value and the private d member),
// so their full definitions are not needed here.
class TQTextCodec;
class TQTextDecoder;
class EncodingDetectorPrivate;
|
|
|
|
/**
|
|
* @short Provides encoding detection capabilities.
|
|
*
|
|
* Searches for encoding declaration inside raw data -- meta and xml tags.
|
|
* In the case it can't find it, uses heuristics for specified language.
|
|
*
|
|
* If it finds unicode BOM marks, it changes encoding regardless of what the user has told
|
|
*
|
|
* Intended lifetime of the object: one instance per document.
|
|
*
|
|
* Typical use:
|
|
* \code
|
|
* TQByteArray data;
|
|
* ...
|
|
* EncodingDetector detector;
|
|
* detector.setAutoDetectLanguage(EncodingDetector::Cyrillic);
|
|
* TQString out=detector.decode(data);
|
|
* \endcode
|
|
*
|
|
*
|
|
* Do not mix decode() with decodeWithBuffering()
|
|
*
|
|
* @short Guess encoding of char array
|
|
*
|
|
*/
|
|
class EncodingDetector
|
|
{
|
|
public:
|
|
enum EncodingChoiceSource
|
|
{
|
|
DefaultEncoding,
|
|
AutoDetectedEncoding,
|
|
BOM,
|
|
EncodingFromXMLHeader,
|
|
EncodingFromMetaTag,
|
|
EncodingFromHTTPHeader,
|
|
UserChosenEncoding
|
|
};
|
|
|
|
enum AutoDetectScript
|
|
{
|
|
None,
|
|
SemiautomaticDetection,
|
|
Arabic,
|
|
Baltic,
|
|
CentralEuropean,
|
|
ChineseSimplified,
|
|
ChineseTraditional,
|
|
Cyrillic,
|
|
Greek,
|
|
Hebrew,
|
|
Japanese,
|
|
Korean,
|
|
NorthernSaami,
|
|
SouthEasternEurope,
|
|
Thai,
|
|
Turkish,
|
|
Unicode,
|
|
WesternEuropean
|
|
};
|
|
|
|
/**
|
|
* Default codec is latin1 (as html spec says), EncodingChoiceSource is default, AutoDetectScript=Semiautomatic
|
|
*/
|
|
EncodingDetector();
|
|
|
|
/**
|
|
* Allows to set Default codec, EncodingChoiceSource, AutoDetectScript
|
|
*/
|
|
EncodingDetector(TQTextCodec* codec, EncodingChoiceSource source, AutoDetectScript script=None);
|
|
~EncodingDetector();
|
|
|
|
//const TQTextCodec* codec() const;
|
|
|
|
/**
|
|
* @returns true if specified encoding was recognized
|
|
*/
|
|
bool setEncoding(const char *encoding, EncodingChoiceSource type);
|
|
|
|
/**
|
|
* Convenience method.
|
|
* @returns mime name of detected encoding
|
|
*/
|
|
const char* encoding() const;
|
|
|
|
bool visuallyOrdered() const;
|
|
|
|
// void setAutoDetectLanguage( const TQString& );
|
|
// const TQString& autoDetectLanguage() const;
|
|
|
|
void setAutoDetectLanguage( AutoDetectScript );
|
|
AutoDetectScript autoDetectLanguage() const;
|
|
|
|
EncodingChoiceSource encodingChoiceSource() const;
|
|
|
|
/**
|
|
* Analyze text data.
|
|
* @returns true if there was enough data for accurate detection
|
|
*/
|
|
bool analyze( const char *data, int len );
|
|
|
|
/**
|
|
* Analyze text data.
|
|
* @returns true if there was enough data for accurate detection
|
|
*/
|
|
bool analyze( const TQByteArray &data );
|
|
|
|
/**
|
|
* Takes lang name _after_ it were i18n()'ed
|
|
*/
|
|
static AutoDetectScript scriptForName(const TQString& lang);
|
|
static TQString nameForScript(AutoDetectScript);
|
|
static AutoDetectScript scriptForLanguageCode(const TQString &lang);
|
|
static bool hasAutoDetectionForScript(AutoDetectScript);
|
|
|
|
protected:
|
|
/**
|
|
* Check if we are really utf8. Taken from kate
|
|
*
|
|
* @returns true if current encoding is utf8 and the text cannot be in this encoding
|
|
*
|
|
* Please somebody read http://de.wikipedia.org/wiki/UTF-8 and check this code...
|
|
*/
|
|
bool errorsIfUtf8 (const char* data, int length);
|
|
|
|
/**
|
|
* @returns TQTextDecoder for detected encoding
|
|
*/
|
|
TQTextDecoder* decoder();
|
|
|
|
private:
|
|
EncodingDetectorPrivate* const d;
|
|
};
|
|
|
|
#endif
|