You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
667 lines
17 KiB
667 lines
17 KiB
/* -*- c++ -*-
|
|
parser/lexer.cpp
|
|
|
|
This file is part of KSieve,
|
|
the KDE internet mail/usenet news message filtering library.
|
|
Copyright (c) 2002-2003 Marc Mutz <mutz@kde.org>
|
|
|
|
KSieve is free software; you can redistribute it and/or modify it
|
|
under the terms of the GNU General Public License, version 2, as
|
|
published by the Free Software Foundation.
|
|
|
|
KSieve is distributed in the hope that it will be useful, but
|
|
WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
In addition, as a special exception, the copyright holders give
|
|
permission to link the code of this program with any edition of
|
|
the TQt library by Trolltech AS, Norway (or with modified versions
|
|
of TQt that use the same license as TQt), and distribute linked
|
|
combinations including the two. You must obey the GNU General
|
|
Public License in all respects for all of the code used other than
|
|
TQt. If you modify this file, you may extend this exception to
|
|
your version of the file, but you are not obligated to do so. If
|
|
you do not wish to do so, delete this exception statement from
|
|
your version.
|
|
*/
|
|
|
|
#include <config.h>
|
|
|
|
#include <ksieve/lexer.h>
|
|
#include <impl/lexer.h>
|
|
|
|
#include <impl/utf8validator.h>
|
|
#include <ksieve/error.h>
|
|
|
|
#include <tqstring.h>
|
|
#include <tqstringlist.h>
|
|
#include <tqtextcodec.h>
|
|
|
|
#include <memory> // std::auto_ptr
|
|
|
|
#include <assert.h>
|
|
#include <ctype.h> // isdigit
|
|
|
|
// STR_DIM(x) evaluates to sizeof(x) - 1. In this file it is applied to
// string literals such as "text:", where that is the number of characters
// without the terminating NUL. Any previous definition is dropped first so
// that this one always wins.
#ifdef STR_DIM
|
|
# undef STR_DIM
|
|
#endif
|
|
#define STR_DIM(x) (sizeof(x) - 1)
|
|
|
|
namespace KSieve {
|
|
|
|
//
|
|
//
|
|
// Lexer Bridge implementation
|
|
//
|
|
//
|
|
|
|
// Creates the lexer facade. All work is delegated to a heap-allocated
// Impl that is constructed from the same arguments: the character range
// [scursor, send) and the option flags.
Lexer::Lexer( const char * scursor, const char * send, int options )
  : i( new Impl( scursor, send, options ) )
{
}
|
|
|
|
// Destroys the implementation object and clears the pointer to it.
Lexer::~Lexer() {
  Impl * const doomed = i;
  i = 0;
  delete doomed;
}
|
|
|
|
// Forwards to Impl::ignoreComments(); the implementation must exist.
bool Lexer::ignoreComments() const {
  assert( i );
  const bool ignoring = i->ignoreComments();
  return ignoring;
}
|
|
|
|
// Forwards to Impl::error() and hands back the very same reference.
const Error & Lexer::error() const {
  assert( i );
  const Error & current = i->error();
  return current;
}
|
|
|
|
// Forwards to Impl::atEnd(); the implementation must exist.
bool Lexer::atEnd() const {
  assert( i );
  const bool finished = i->atEnd();
  return finished;
}
|
|
|
|
// Forwards to Impl::column(); the implementation must exist.
int Lexer::column() const {
  assert( i );
  const int col = i->column();
  return col;
}
|
|
|
|
// Forwards to Impl::line(); the implementation must exist.
int Lexer::line() const {
  assert( i );
  const int currentLine = i->line();
  return currentLine;
}
|
|
|
|
void Lexer::save() {
|
|
assert( i );
|
|
i->save();
|
|
}
|
|
|
|
void Lexer::restore() {
|
|
assert( i );
|
|
i->restore();
|
|
}
|
|
|
|
// Forwards to Impl::nextToken(): the token text is delivered through
// 'result', the token type is the return value.
Lexer::Token Lexer::nextToken( TQString & result ) {
  assert( i );
  const Token token = i->nextToken( result );
  return token;
}
|
|
|
|
} // namespace KSieve
|
|
|
|
|
|
// 128-bit membership table for identifier text: a-z, A-Z, 0-9 and '_'.
// Layout (shared with the other tables below): byte N covers the
// characters 8*N ... 8*N+7, most significant bit first.
static const unsigned char iTextMap[16] = {
  // 0x00 ... 0x1F (CTLs): none
  0x00, 0x00, 0x00, 0x00,
  // SP ... '?': '0'-'7' (0xFF), '8' and '9' (0xC0)
  0x00, 0x00, 0xFF, 0xC0,
  // '@' ... '_': 'A'-'Z' and '_' (lowest bit of the last byte)
  0x7F, 0xFF, 0xFF, 0xE1,
  // '`' ... DEL: 'a'-'z'
  0x7F, 0xFF, 0xFF, 0xE0
};
|
|
|
|
// 128-bit membership table for delimiters: SP, HT, CR, LF, {}[]();,#/
// Byte N covers the characters 8*N ... 8*N+7, most significant bit first.
// The bracket rows are 0x14 (bits for offsets 3 and 5 only): 0x16 would
// additionally mark '^' resp. '~' as delimiters, which neither the list
// above nor illegalMap (where '^' and '~' are illegal) intends.
// ### exclude '['? Why would one want to write identifier["foo"]?
static const unsigned char delimMap[16] = {
  0x00, 0x64, 0x00, 0x00, // CTLs:        HT, LF, CR
  0x90, 0xC9, 0x00, 0x10, // SP ... '?':  SP, #(),/;
  0x00, 0x00, 0x00, 0x14, // '@' ... '_': []
  0x00, 0x00, 0x00, 0x14  // '`' ... DEL: {}
};
|
|
|
|
// 128-bit membership table for characters that are illegal in a script:
// everything except iText, the delimiters and the three characters
// '"', '*' and ':'.
// Byte N covers the characters 8*N ... 8*N+7, most significant bit first.
// DEL (0x7F) is not flagged in the table itself.
static const unsigned char illegalMap[16] = {
  // 0x00 ... 0x1F: all CTLs except HT, LF, CR
  0xFF, 0x9B, 0xFF, 0xFF,
  // SP ... '?': ! $ % & ' + - . < = > ?
  0x4F, 0x16, 0x00, 0x0F,
  // '@' ... '_': @ \ ^
  0x80, 0x00, 0x00, 0x0A,
  // '`' ... DEL: ` | ~
  0x80, 0x00, 0x00, 0x0A
};
|
|
|
|
// Tests whether the 7-bit character 'ch' is a member of the bit set 'map'.
// Byte ch/8 holds the flag, with the most significant bit standing for the
// first character of that group of eight. 'ch' must be below 128.
static inline bool isOfSet( const unsigned char map[16], unsigned char ch ) {
  assert( ch < 128 );
  const unsigned int byteIndex = ch >> 3;       // ch / 8
  const unsigned int mask = 0x80u >> ( ch & 7 ); // bit for ch % 8, MSB first
  return ( map[ byteIndex ] & mask ) != 0;
}
|
|
|
|
static inline bool isIText( unsigned char ch ) {
|
|
return ch <= 'z' && isOfSet( iTextMap, ch );
|
|
}
|
|
|
|
static inline bool isDelim( unsigned char ch ) {
|
|
return ch <= '}' && isOfSet( delimMap, ch );
|
|
}
|
|
|
|
static inline bool isIllegal( unsigned char ch ) {
|
|
return ch >= '~' || isOfSet( illegalMap, ch );
|
|
}
|
|
|
|
// True if the high bit of 'ch' is set, i.e. the value is negative when
// viewed as a signed char.
static inline bool is8Bit( signed char ch ) {
  return ( static_cast<unsigned char>( ch ) & 0x80 ) != 0;
}
|
|
|
|
// Returns 's' without one trailing line terminator: a final CRLF loses
// two characters, a final bare LF loses one, anything else is returned
// unshortened.
static TQString removeCRLF( const TQString & s ) {
  int chop = 0; // number of characters to cut off at the end
  if ( s.endsWith( "\r\n" ) )
    chop = 2;
  else if ( s.endsWith( "\n" ) )
    chop = 1;

  return s.left( s.length() - chop );
}
|
|
|
|
// Undoes dot-stuffing: a line starting with ".." loses its first dot,
// every other line is returned as is.
static TQString removeDotStuff( const TQString & s ) {
  if ( s.startsWith( ".." ) )
    return s.mid( 1 );
  return s;
}
|
|
|
|
namespace KSieve {
|
|
|
|
//
|
|
//
|
|
// Lexer Implementation
|
|
//
|
|
//
|
|
|
|
// Sets up lexing of the range [scursor, send). If exactly one of the two
// pointers is null, both the state and the end are initialised from the
// other one; with a null pointer involved the lexer is expected to be at
// its end right away. The two option bits IgnoreComments and
// IgnoreLineFeeds are extracted from 'options'.
Lexer::Impl::Impl( const char * scursor, const char * send, int options )
  : mState( scursor ? scursor : send ),
    mEnd( send ? send : scursor ),
    mIgnoreComments( options & IgnoreComments ),
    mIgnoreLF( options & IgnoreLineFeeds )
{
  assert( ( scursor && send ) || atEnd() );
}
|
|
|
|
// Scans the next token and returns its type; the token text (if any) is
// stored in 'result', which is reset first. Must not be called at the end
// of the input.
//
// Leading whitespace is skipped first (including comments if
// ignoreComments() is set). If that crossed line boundaries and line feeds
// are not ignored, a LineFeeds token carrying the number of lines is
// returned instead of the token that follows. None is returned if skipping
// failed, if the input is exhausted, or if the next character cannot start
// any token.
//
// The return values of the parseXxx() helpers are not inspected here: the
// token type is returned regardless of whether parsing succeeded.
Lexer::Token Lexer::Impl::nextToken( TQString & result ) {
  assert( !atEnd() );
  result = TQString();
  //clearErrors();

  const int oldLine = line();

  const bool eatingWSSucceeded = ignoreComments() ? eatCWS() : eatWS() ;

  if ( !ignoreLineFeeds() && oldLine != line() ) {
    result.setNum( line() - oldLine ); // return number of linefeeds encountered
    return LineFeeds;
  }

  if ( !eatingWSSucceeded )
    return None;

  if ( atEnd() )
    return None;

  switch ( *mState.cursor ) {
  case '#': // HashComment
    assert( !ignoreComments() );
    ++mState.cursor;
    if ( !atEnd() )
      parseHashComment( result, true );
    return HashComment;
  case '/': // BracketComment
    assert( !ignoreComments() );
    ++mState.cursor; // eat slash
    if ( atEnd() || *mState.cursor != '*' ) {
      makeError( Error::SlashWithoutAsterisk );
      return BracketComment;
    }
    ++mState.cursor; // eat asterisk
    if ( atEnd() ) {
      makeError( Error::UnfinishedBracketComment );
      return BracketComment;
    }
    parseBracketComment( result, true );
    return BracketComment;
  case ':': // Tag
    ++mState.cursor;
    if ( atEnd() ) {
      makeError( Error::UnexpectedCharacter, line(), column() - 1 );
      return Tag;
    }
    if ( !isIText( *mState.cursor ) ) {
      makeIllegalCharError( *mState.cursor );
      return Tag;
    }
    parseTag( result );
    return Tag;
  case '"': // QuotedString
    ++mState.cursor;
    parseQuotedString( result );
    return QuotedString;
  case '{':
  case '}':
  case '[':
  case ']':
  case '(':
  case ')':
  case ';':
  case ',': // Special
    result = *mState.cursor++;
    return Special;
  case '0':
  case '1':
  case '2':
  case '3':
  case '4':
  case '5':
  case '6':
  case '7':
  case '8':
  case '9': // Number
    parseNumber( result );
    return Number;
  case 't': // maybe MultiLineString, else Identifier
    // Only compare against "text:" if that many characters are left at
    // all; otherwise the comparison could read beyond the end of a buffer
    // that is not NUL-terminated.
    if ( charsLeft() >= static_cast<int>( STR_DIM("text:") ) &&
         _strnicmp( mState.cursor, "text:", STR_DIM("text:") ) == 0 ) {
      // MultiLineString
      mState.cursor += STR_DIM("text:");
      parseMultiLine( result );
      // ### FIXME: There can be a hash-comment between "text:"
      // and CRLF! That should be preserved somehow...
      return MultiLineString;
    }
    // else fall through:
  default: // Identifier (first must not be 0-9, and can't (caught by Number above))
    if ( !isIText( *mState.cursor ) ) {
      makeError( Error::IllegalCharacter );
      return None;
    }
    parseIdentifier( result );
    return Identifier;
  }
}
|
|
|
|
bool Lexer::Impl::eatWS() {
|
|
while ( !atEnd() )
|
|
switch ( *mState.cursor ) {
|
|
case '\r':
|
|
case '\n':
|
|
if ( !eatCRLF() )
|
|
return false;
|
|
break;
|
|
case ' ':
|
|
case '\t':
|
|
++mState.cursor;
|
|
break;
|
|
default:
|
|
return true;
|
|
}
|
|
|
|
// at end:
|
|
return true;
|
|
}
|
|
|
|
bool Lexer::Impl::eatCRLF() {
|
|
assert( !atEnd() );
|
|
assert( *mState.cursor == '\n' || *mState.cursor == '\r' );
|
|
|
|
if ( *mState.cursor == '\r' ) {
|
|
++mState.cursor;
|
|
if ( atEnd() || *mState.cursor != '\n' ) {
|
|
// CR w/o LF -> error
|
|
makeError( Error::CRWithoutLF );
|
|
return false;
|
|
} else {
|
|
// good CRLF
|
|
newLine();
|
|
return true;
|
|
}
|
|
} else /* *mState.cursor == '\n' */ {
|
|
// good, LF only
|
|
newLine();
|
|
return true;
|
|
}
|
|
}
|
|
|
|
|
|
// Parses the remainder of a hash comment; the cursor must be right behind
// the '#'.
//
//   hash-comment := "#" *CHAR-NOT-CRLF CRLF
//
// The comment runs up to the next line terminator (which is consumed) or
// to the end of the input. Its text, without the terminator, is appended
// to 'result' if 'reallySave' is set. Returns false on a malformed line
// terminator or if the comment text is not valid UTF-8.
//
// There is deliberately no early return for "short" comments: the former
// check 'commentEnd == commentStart' matched comments of exactly one
// character (not empty ones), so their text was dropped and their line
// terminator left unconsumed.
bool Lexer::Impl::parseHashComment( TQString & result, bool reallySave ) {
  // check that the caller plays by the rules:
  assert( *(mState.cursor-1) == '#' );

  const char * const commentStart = mState.cursor;

  // find next CRLF:
  while ( !atEnd() && *mState.cursor != '\n' && *mState.cursor != '\r' )
    ++mState.cursor;

  // measured before the line terminator is eaten, so it is not included:
  const int commentLength = mState.cursor - commentStart;

  if ( !atEnd() && !eatCRLF() )
    return false;

  if ( commentLength > 0 ) {
    if ( !isValidUtf8( commentStart, commentLength ) ) {
      makeError( Error::InvalidUTF8 );
      return false;
    }
    if ( reallySave )
      result += TQString::fromUtf8( commentStart, commentLength );
  }
  return true;
}
|
|
|
|
// Parses the remainder of a bracket comment; the cursor must be right
// behind the opening "/*".
//
//   bracket-comment := "/*" *(CHAR-NOT-STAR / ("*" CHAR-NOT-SLASH )) "*/"
//
// On success the cursor is behind the closing "*/" and, if 'reallySave' is
// set, the comment text (with all CRs removed) has been appended to
// 'result'. Returns false with Error::UnfinishedBracketComment (positioned
// at the opening "/*") if the input ends before the comment is closed, and
// with Error::InvalidUTF8 if the comment text is not valid UTF-8.
bool Lexer::Impl::parseBracketComment( TQString & result, bool reallySave ) {
  // check that caller plays by the rules:
  assert( *(mState.cursor-2) == '/' );
  assert( *(mState.cursor-1) == '*' );

  const char * const commentStart = mState.cursor;
  const int commentCol = column() - 2;
  const int commentLine = line();

  // find the closing "*/":
  for ( ;; ) {
    if ( !skipTo( '*' ) ) {
      if ( !error() )
        makeError( Error::UnfinishedBracketComment, commentLine, commentCol );
      return false;
    }
    ++mState.cursor; // eat '*'
    // Check for the end _before_ looking at the next character, so that an
    // asterisk as the very last input character does not cause a read
    // beyond the end of the buffer.
    if ( atEnd() ) {
      makeError( Error::UnfinishedBracketComment, commentLine, commentCol );
      return false;
    }
    if ( *mState.cursor == '/' )
      break;
  }

  assert( *mState.cursor == '/' );

  // the cursor is on the '/', so the '*' in front of it is excluded, too:
  const int commentLength = mState.cursor - commentStart - 1;
  if ( commentLength > 0 ) {
    if ( !isValidUtf8( commentStart, commentLength ) ) {
      makeError( Error::InvalidUTF8 );
      return false;
    }
    if ( reallySave ) {
      TQString tmp = TQString::fromUtf8( commentStart, commentLength );
      result += tmp.remove( '\r' ); // get rid of CR in CRLF pairs
    }
  }

  ++mState.cursor; // eat '/'
  return true;
}
|
|
|
|
// Parses a comment of either kind starting at the cursor.
//
//   comment := hash-comment / bracket-comment
//
// Returns false without setting an error if the cursor is not on '#' or
// '/'. A '/' that is not followed by '*' yields Error::IllegalCharacter.
bool Lexer::Impl::parseComment( TQString & result, bool reallySave ) {
  const char first = *mState.cursor;

  if ( first == '#' ) {
    ++mState.cursor; // eat '#'
    return parseHashComment( result, reallySave );
  }

  if ( first != '/' )
    return false; // don't set an error here - there was no comment

  if ( charsLeft() >= 2 && mState.cursor[1] == '*' ) {
    mState.cursor += 2; // eat "/*"
    return parseBracketComment( result, reallySave );
  }

  makeError( Error::IllegalCharacter );
  return false;
}
|
|
|
|
bool Lexer::Impl::eatCWS() {
|
|
// white-space := 1*(SP / CRLF / HTAB / comment )
|
|
|
|
while ( !atEnd() ) {
|
|
switch( *mState.cursor ) {
|
|
case ' ':
|
|
case '\t': // SP / HTAB
|
|
++mState.cursor;
|
|
break;;
|
|
case '\n':
|
|
case '\r': // CRLF
|
|
if ( !eatCRLF() )
|
|
return false;
|
|
break;
|
|
case '#':
|
|
case '/': // comments
|
|
{
|
|
TQString dummy;
|
|
if ( !parseComment( dummy ) )
|
|
return false;
|
|
}
|
|
break;
|
|
default:
|
|
return true;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Parses an identifier starting at the cursor and appends it to 'result'.
//
//   identifier := (ALPHA / "_") *(ALPHA / DIGIT / "_")
//
// The cursor must be on an iText character. A leading digit yields
// Error::NoLeadingDigits. The identifier has to be followed by a delimiter
// or the end of input, otherwise an illegal/unexpected character error is
// raised; the identifier text has been appended to 'result' by then.
bool Lexer::Impl::parseIdentifier( TQString & result ) {
  assert( isIText( *mState.cursor ) );

  const char * const identifierStart = mState.cursor;

  // first char:
  if ( isdigit( *mState.cursor ) ) { // no digits for the first
    makeError( Error::NoLeadingDigits );
    return false;
  }

  // rest of identifier chars ( now digits are allowed ):
  ++mState.cursor;
  while ( !atEnd() && isIText( *mState.cursor ) )
    ++mState.cursor;

  const int identifierLength = mState.cursor - identifierStart;

  // Can use the fast fromLatin1 here, since identifiers are always
  // in the us-ascii subset:
  result += TQString::fromLatin1( identifierStart, identifierLength );

  if ( !atEnd() && !isDelim( *mState.cursor ) ) {
    makeIllegalCharError( *mState.cursor );
    return false;
  }

  return true;
}
|
|
|
|
// Parses the identifier part of a tag; the cursor must be right behind
// the ':' and on an iText character.
//
//   tag := ":" identifier
//
// The tag name (without the colon) is appended to 'result'; the return
// value is that of parseIdentifier().
bool Lexer::Impl::parseTag( TQString & result ) {
  // check that the caller plays by the rules:
  assert( *(mState.cursor-1) == ':' );
  assert( !atEnd() );
  assert( isIText( *mState.cursor ) );

  const bool identifierOk = parseIdentifier( result );
  return identifierOk;
}
|
|
|
|
// Parses a number starting at the cursor and appends its text (digits
// plus optional quantifier) to 'result'.
//
//   number     := 1*DIGIT [QUANTIFIER]
//   QUANTIFIER := "K" / "M" / "G"
//
// The quantifier is accepted in either case. The number has to be followed
// by a delimiter or the end of input; anything else raises an illegal
// character error and returns false.
//
// Characters are converted to unsigned char before being handed to
// isdigit(): passing a negative value (any 8-bit character where char is
// signed) is undefined behaviour.
bool Lexer::Impl::parseNumber( TQString & result ) {
  assert( isdigit( static_cast<unsigned char>( *mState.cursor ) ) );

  while ( !atEnd() && isdigit( static_cast<unsigned char>( *mState.cursor ) ) )
    result += *mState.cursor++;

  if ( atEnd() || isDelim( *mState.cursor ) )
    return true;

  switch ( *mState.cursor ) {
  case 'G':
  case 'g':
  case 'M':
  case 'm':
  case 'K':
  case 'k':
    result += *mState.cursor++;
    break;
  default:
    makeIllegalCharError();
    return false;
  }

  // quantifier found. Check for delimiter:
  if ( atEnd() || isDelim( *mState.cursor ) )
    return true;
  makeIllegalCharError();
  return false;
}
|
|
|
|
// Parses a multi-line string; the cursor must be right behind "text:".
//
//   multi-line          := "text:" *(SP / HTAB) (hash-comment / CRLF)
//                          *(multi-line-literal / multi-line-dotstuff)
//                          "." CRLF
//   multi-line-literal  := [CHAR-NOT-DOT *CHAR-NOT-CRLF] CRLF
//   multi-line-dotstuff := "." 1*CHAR-NOT-CRLF CRLF
//   ;; A line containing only "." ends the multi-line.
//   ;; Remove a leading '.' if followed by another '.'.
//
// On success 'result' holds the lines joined with "\n", with dot-stuffing
// removed and without the terminating "." line. Returns false (with an
// error set) if something other than whitespace, a hash comment or a line
// terminator follows "text:", if a line is not valid UTF-8, or if the
// input ends before the terminating "." line.
//
// The terminator is detected on the raw line, before dot-stuffing is
// removed. Looking at the stored lines afterwards is not reliable: a
// stuffed ".." line is stored as "." and would be mistaken for the
// terminator if it happened to be the last line of the input.
bool Lexer::Impl::parseMultiLine( TQString & result ) {
  assert( _strnicmp( mState.cursor - 5, "text:", STR_DIM("text:") ) == 0 );

  const int mlBeginLine = line();
  const int mlBeginCol = column() - 5;

  // skip the rest of the "text:" line:
  while ( !atEnd() ) {
    switch ( *mState.cursor ) {
    case ' ':
    case '\t':
      ++mState.cursor;
      break;
    case '#':
      {
        ++mState.cursor;
        TQString dummy;
        if ( !parseHashComment( dummy ) )
          return false;
        goto MultiLineStart; // break from switch _and_ while
      }
    case '\n':
    case '\r':
      if ( !eatCRLF() ) return false;
      goto MultiLineStart; // break from switch _and_ while
    default:
      makeError( Error::NonCWSAfterTextColon );
      return false;
    }
  }

 MultiLineStart:
  if ( atEnd() ) {
    makeError( Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol );
    return false;
  }

  // Now, collect the single lines until one with only a single dot is found:
  TQStringList lines;
  bool terminatorFound = false;
  while ( !atEnd() ) {
    const char * const oldBeginOfLine = beginOfLine();
    if ( !skipToCRLF() )
      return false;
    const int lineLength = mState.cursor - oldBeginOfLine;
    if ( lineLength <= 0 ) {
      lines.push_back( TQString() );
      continue;
    }
    if ( !isValidUtf8( oldBeginOfLine, lineLength ) ) {
      makeError( Error::InvalidUTF8 );
      return false;
    }
    const TQString text = removeCRLF( TQString::fromUtf8( oldBeginOfLine, lineLength ) );
    if ( text == "." ) {
      // the lone dot ends the string and is not part of it
      terminatorFound = true;
      break;
    }
    lines.push_back( removeDotStuff( text ) );
  }

  if ( !terminatorFound ) {
    makeError( Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol );
    return false;
  }

  result = lines.join("\n");
  return true;
}
|
|
|
|
// Parses the remainder of a quoted string; the cursor must be right
// behind the opening '"'.
//
//   quoted-string := DQUOTE *CHAR DQUOTE
//
// The decoded content is appended to 'result': line terminators become a
// single '\n', a backslash makes the following character literal, and
// runs of 8-bit characters are decoded as UTF-8. Returns true with the
// cursor behind the closing '"'. Returns false on a malformed line
// terminator, on invalid UTF-8, or - with
// Error::PrematureEndOfQuotedString positioned at the opening quote - if
// the input ends before the closing quote.
bool Lexer::Impl::parseQuotedString( TQString & result ) {
  // check that caller plays by the rules:
  assert( *(mState.cursor-1) == '"' );

  const int qsBeginCol = column() - 1;
  const int qsBeginLine = line();

  const TQTextCodec * const codec = TQTextCodec::codecForMib( 106 ); // UTF-8
  assert( codec );
  const std::auto_ptr<TQTextDecoder> dec( codec->makeDecoder() );
  assert( dec.get() );

  while ( !atEnd() ) {
    const char ch = *mState.cursor;

    if ( ch == '"' ) { // closing quote
      ++mState.cursor;
      return true;
    }

    if ( ch == '\r' || ch == '\n' ) {
      if ( !eatCRLF() )
        return false;
      result += '\n';
      continue;
    }

    if ( ch == '\\' ) {
      ++mState.cursor; // eat the backslash
      if ( atEnd() )
        break; // nothing left to escape -> premature end
      // else the escaped character is handled like any other one below
    }

    if ( !is8Bit( *mState.cursor ) ) {
      result += *mState.cursor++;
      continue;
    }

    // probably UTF-8
    const char * const eightBitBegin = mState.cursor;
    skipTo8BitEnd();
    const int eightBitLen = mState.cursor - eightBitBegin;
    assert( eightBitLen > 0 );
    if ( !isValidUtf8( eightBitBegin, eightBitLen ) ) {
      assert( column() >= eightBitLen );
      makeError( Error::InvalidUTF8, line(), column() - eightBitLen );
      return false;
    }
    result += dec->toUnicode( eightBitBegin, eightBitLen );
  }

  makeError( Error::PrematureEndOfQuotedString, qsBeginLine, qsBeginCol );
  return false;
}
|
|
|
|
void Lexer::Impl::makeIllegalCharError( char ch ) {
|
|
makeError( isIllegal( ch ) ? Error::IllegalCharacter : Error::UnexpectedCharacter );
|
|
}
|
|
|
|
} // namespace KSieve
|