|
|
|
/*
|
|
|
|
kmime_charfreq.cpp
|
|
|
|
|
|
|
|
KMime, the KDE internet mail/usenet news message library.
|
|
|
|
Copyright (c) 2001-2002 Marc Mutz <mutz@kde.org>
|
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
|
|
it under the terms of the GNU General Public License as published by
|
|
|
|
the Free Software Foundation; version 2 of the License.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
along with this program; if not, write to the Free Software Foundation,
|
|
|
|
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "kmime_charfreq.h"
|
|
|
|
|
|
|
|
namespace KMime {
|
|
|
|
|
|
|
|
/**
  Creates the statistics for the bytes held in @p buf.

  All counters start at zero, lineMin starts at 0xffffffff and both flags
  start out false. An empty array is not scanned at all, so those initial
  values are what the object reports in that case.
*/
CharFreq::CharFreq( const TQByteArray & buf )
  : NUL( 0 ),
    CTL( 0 ),
    CR( 0 ),
    LF( 0 ),
    CRLF( 0 ),
    printable( 0 ),
    eightBit( 0 ),
    total( 0 ),
    lineMin( 0xffffffff ),
    lineMax( 0 ),
    mTrailingWS( false ),
    mLeadingFrom( false )
{
  // nothing to analyse: keep the freshly initialised values
  if ( buf.isEmpty() )
    return;

  count( buf.data(), buf.size() );
}
|
|
|
|
|
|
|
|
/**
  Creates the statistics for the @p len bytes starting at @p buf.

  All counters start at zero, lineMin starts at 0xffffffff and both flags
  start out false. A null pointer or a zero length skips the scan, so those
  initial values are what the object reports in that case.
*/
CharFreq::CharFreq( const char * buf, size_t len )
  : NUL( 0 ),
    CTL( 0 ),
    CR( 0 ),
    LF( 0 ),
    CRLF( 0 ),
    printable( 0 ),
    eightBit( 0 ),
    total( 0 ),
    lineMin( 0xffffffff ),
    lineMax( 0 ),
    mTrailingWS( false ),
    mLeadingFrom( false )
{
  // nothing to analyse: keep the freshly initialised values
  if ( !buf || len == 0 )
    return;

  count( buf, len );
}
|
|
|
|
|
|
|
|
/** Returns true if @p ch is a horizontal tab or a space, false otherwise. */
static inline bool isWS( char ch ) {
  switch ( ch ) {
  case '\t':
  case ' ':
    return true;
  default:
    return false;
  }
}
|
|
|
|
|
|
|
|
void CharFreq::count( const char * it, size_t len ) {
|
|
|
|
|
|
|
|
const char * end = it + len;
|
|
|
|
uint currentLineLength = 0;
|
|
|
|
// initialize the prevChar with LF so that From_ detection works w/o
|
|
|
|
// special-casing:
|
|
|
|
char prevChar = '\n';
|
|
|
|
char prevPrevChar = 0;
|
|
|
|
|
|
|
|
for ( ; it != end ; ++it ) {
|
|
|
|
++currentLineLength;
|
|
|
|
switch ( *it ) {
|
|
|
|
case '\0': ++NUL; break;
|
|
|
|
case '\r': ++CR; break;
|
|
|
|
case '\n': ++LF;
|
|
|
|
if ( prevChar == '\r' ) { --currentLineLength; ++CRLF; }
|
|
|
|
if ( currentLineLength >= lineMax ) lineMax = currentLineLength-1;
|
|
|
|
if ( currentLineLength <= lineMin ) lineMin = currentLineLength-1;
|
|
|
|
if ( !mTrailingWS )
|
|
|
|
if ( isWS( prevChar ) || ( prevChar == '\r' && isWS( prevPrevChar ) ) )
|
|
|
|
mTrailingWS = true;
|
|
|
|
currentLineLength = 0;
|
|
|
|
break;
|
|
|
|
case 'F': // check for lines starting with From_ if not found already:
|
|
|
|
if ( !mLeadingFrom )
|
|
|
|
if ( prevChar == '\n' && end - it >= 5 && !tqstrncmp( "From ", it, 5 ) )
|
|
|
|
mLeadingFrom = true;
|
|
|
|
++printable;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
{
|
|
|
|
uchar c = *it;
|
|
|
|
if ( (c == '\t') || ((c >= ' ') && (c <= '~')) )
|
|
|
|
++printable;
|
|
|
|
else if ( (c == 127) || (c < ' ') )
|
|
|
|
++CTL;
|
|
|
|
else
|
|
|
|
++eightBit;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
prevPrevChar = prevChar;
|
|
|
|
prevChar = *it;
|
|
|
|
}
|
|
|
|
|
|
|
|
// consider the length of the last line
|
|
|
|
if ( currentLineLength >= lineMax ) lineMax = currentLineLength;
|
|
|
|
if ( currentLineLength <= lineMin ) lineMin = currentLineLength;
|
|
|
|
|
|
|
|
// check whether the last character is tab or space
|
|
|
|
if ( isWS( prevChar ) )
|
|
|
|
mTrailingWS = true;
|
|
|
|
|
|
|
|
total = len;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool CharFreq::isEightBitData() const {
|
|
|
|
return type() == EightBitData;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool CharFreq::isEightBitText() const {
|
|
|
|
return type() == EightBitText;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool CharFreq::isSevenBitData() const {
|
|
|
|
return type() == SevenBitData;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool CharFreq::isSevenBitText() const {
|
|
|
|
return type() == SevenBitText;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool CharFreq::hasTrailingWhitespace() const {
|
|
|
|
return mTrailingWS;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool CharFreq::hasLeadingFrom() const {
|
|
|
|
return mLeadingFrom;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
  Classifies the scanned buffer from the collected counters.

  - Binary: at least one NUL byte was seen.
  - "Data" (EightBitData / SevenBitData): no NUL, but the longest line
    exceeds 988 characters, or some CR is not part of a CRLF pair, or more
    than 20% of all bytes are control codes.
  - "Text" (EightBitText / SevenBitText): none of the above.

  The EightBit variants are chosen when at least one byte above 127 was
  counted, the SevenBit variants otherwise.

  NOTE(review): the threshold used here is 988, whereas the comment this
  replaces spoke of "lines > 998 chars". Whether 988 is a deliberate safety
  margin or a typo cannot be told from this file; the code value is kept.
*/
CharFreq::Type CharFreq::type() const {
#if 0
  tqDebug( "Total: %d; NUL: %d; CTL: %d;\n"
           "CR: %d; LF: %d; CRLF: %d;\n"
           "lineMin: %d; lineMax: %d;\n"
           "printable: %d; eightBit: %d;\n"
           "trailing whitespace: %s;\n"
           "leading 'From ': %s;\n",
           total, NUL, CTL, CR, LF, CRLF, lineMin, lineMax,
           printable, eightBit,
           mTrailingWS ? "yes" : "no" , mLeadingFrom ? "yes" : "no" );
#endif
  // any NUL byte makes the content binary, regardless of everything else
  if ( NUL )
    return Binary;

  // The same three criteria separate "data" from "text" for both the 7bit
  // and the 8bit case: overlong lines, stray CRs, too many control codes.
  const bool looksLikeData = ( lineMax > 988 )
                          || ( CR != CRLF )
                          || ( controlCodesRatio() > 0.2 );

  if ( eightBit )
    return looksLikeData ? EightBitData : EightBitText;

  // neither NULs nor 8bit characters
  return looksLikeData ? SevenBitData : SevenBitText;
}
|
|
|
|
|
|
|
|
float CharFreq::printableRatio() const {
|
|
|
|
if ( total ) return float(printable) / float(total);
|
|
|
|
else return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
float CharFreq::controlCodesRatio() const {
|
|
|
|
if ( total ) return float(CTL) / float(total);
|
|
|
|
else return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace KMime
|
|
|
|
|
|
|
|
|