/***************************************************************************
* Copyright ( C ) 2004 - 2009 by Thomas Fischer *
* fischer @ unix - ag . uni - kl . de *
* *
* This program is free software ; you can redistribute it and / or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation ; either version 2 of the License , or *
* ( at your option ) any later version . *
* *
* This program is distributed in the hope that it will be useful , *
* but WITHOUT ANY WARRANTY ; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the *
* GNU General Public License for more details . *
* *
* You should have received a copy of the GNU General Public License *
* along with this program ; if not , write to the *
* Free Software Foundation , Inc . , *
* 59 Temple Place - Suite 330 , Boston , MA 02111 - 1307 , USA . *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
# include <tqiodevice.h>
# include <tqregexp.h>
# include <tqapplication.h>
# include "file.h"
# include "comment.h"
# include "macro.h"
# include "preamble.h"
# include "entry.h"
# include "element.h"
# include "encoderlatex.h"
# include "value.h"
# include "fileimporterbibtex.h"
/** Function-like maximum of two values. Being a macro, the argument expression that is selected as the result is evaluated a second time, so arguments should be free of side effects. */
# define max(a,b) ((a)<(b)?(b):(a))
namespace BibTeX
{
/** Characters that readSimpleString() accepts as part of a simple string in addition to letters and digits. */
const TQString extraAlphaNumChars = TQString ( " ?'`-_:.+/$ \\ \" & " ) ;
/**
 * Pattern for opening and closing "a" and "pre" markup tags; load() replaces every match in the raw text.
 * NOTE(review): the second constructor argument (false) presumably disables case sensitivity -- confirm against TQRegExp.
 */
const TQRegExp htmlRegExp = TQRegExp ( " </?(a|pre)[^>]*> " , false ) ;
/**
 * Creates an importer.
 * @param personFirstNameFirst stored and later handed to every Person / PersonContainer created by readValue()
 * @param encoding name of the source encoding that load() converts from
 */
FileImporterBibTeX::FileImporterBibTeX(bool personFirstNameFirst, TQString encoding)
    : FileImporter(),
      m_personFirstNameFirst(personFirstNameFirst),
      m_currentChar(' '),
      m_ignoreComments(FALSE),
      m_lineBufferSize(4096),
      m_encoding(encoding)
{
    /** no text stream exists until load() creates one */
    m_textStream = NULL;
    /** raw line buffer that load() fills line by line; released in the destructor */
    m_lineBuffer = new char[m_lineBufferSize];
    cancelFlag = FALSE;
}
/** Releases the raw line buffer that the constructor allocated with new[]. */
FileImporterBibTeX::~FileImporterBibTeX()
{
    delete[] m_lineBuffer;
}
/**
 * Returns the number of bytes occupied by a UTF-8 byte-order mark at the
 * start of the NUL-terminated string @p text: 3 if it begins with the exact
 * byte sequence EF BB BF, otherwise 0.
 */
static int utf8BomLength(const char *text)
{
    const unsigned char *bytes = (const unsigned char *) text;
    /** the && chain stops at the first mismatch, so the terminating NUL of a short string is never passed */
    if (bytes[0] == 0xef && bytes[1] == 0xbb && bytes[2] == 0xbf)
        return 3;
    return 0;
}

/**
 * Reads a complete BibTeX document from @p iodevice.
 *
 * The device is read line by line; every line is converted to UTF-8 with
 * iconv (the source encoding may be switched by a leading
 * "x-kbibtex-encoding" comment, see evaluateParameterComments()), stripped
 * of a UTF-8 byte-order mark and collected. The collected text is cleaned
 * from HTML tags, LaTeX-decoded and then parsed element by element.
 *
 * The whole operation runs with m_mutex held. Progress is reported through
 * the progress() signal and the application's event loop is processed
 * regularly, which allows cancel() to be triggered while loading.
 *
 * @param iodevice device to read the raw bytes from
 * @return a newly allocated File holding the parsed elements, or NULL if
 *         the text could not be decoded or loading was cancelled
 */
File *FileImporterBibTeX::load(TQIODevice *iodevice)
{
    m_mutex.lock();
    cancelFlag = FALSE;

    TQString rawText;
    /** "latex" is not an iconv encoding; such files are read as UTF-8 */
    const char *encodingFrom = m_encoding == "latex" ? "utf-8" : m_encoding.ascii();
    /**
     * The handle may be invalid at this point. That is tolerated until the
     * first line has been inspected, because a parameter comment in the file
     * may still replace it with a usable one.
     */
    iconv_t iconvHandle = iconv_open("utf-8", encodingFrom);

    /** one input byte may expand to several UTF-8 bytes, hence the larger output buffer */
    const size_t convertedSize = (size_t) m_lineBufferSize * 4;
    char *convertedLine = new char[convertedSize];
    int len;
    bool encodingOk = true;
    while (encodingOk && iodevice->isReadable() && (len = iodevice->readLine(m_lineBuffer, m_lineBufferSize)) > 0)
    {
        evaluateParameterComments(iconvHandle, m_lineBuffer);
        if (iconvHandle == (iconv_t)(-1))
        {
            /** never call iconv() with an invalid conversion descriptor */
            tqDebug("No usable iconv conversion available (initial source encoding %s)", encodingFrom);
            encodingOk = false;
            break;
        }

        char *raw = m_lineBuffer;
        char *enc = convertedLine;
        size_t rawLen = (size_t) len;
        /** announce the real capacity of the output buffer, keeping one byte for the terminating NUL */
        size_t encLen = convertedSize - 1;
        const size_t result = iconv(iconvHandle, &raw, &rawLen, &enc, &encLen);

        tqApp->processEvents();

        if (result != 0)
        {
            /** iconv advanced raw up to the offending input byte; show some context around it */
            const int failPos = (int)(raw - m_lineBuffer);
            const int contextStart = failPos > 15 ? failPos - 15 : 0;
            TQString problematic = TQString(m_lineBuffer).mid(contextStart, 30);
            if (problematic.isEmpty())
                problematic = TQString(m_lineBuffer);
            tqDebug("iconv resulted in error code %i for source encoding %s, maybe file is in different encoding? Problem is somewhere here: \"%s\"", (int) result, encodingFrom, problematic.latin1());
            encodingOk = false;
            break;
        }
        if (rawLen > 0)
        {
            tqDebug("iconv could not convert complete string, only %i out of %i chars", (int)(len - (int) rawLen), len);
            encodingOk = false;
            break;
        }
        /** iconv does not terminate its output; enc points just behind the last converted byte */
        enc[0] = '\0';

        /** remove leading UTF-8 byte-order mark (BOM) */
        const int offset = utf8BomLength(convertedLine);

        rawText.append(TQString::fromUtf8(convertedLine + offset));
    }
    if (iconvHandle != (iconv_t)(-1))
        iconv_close(iconvHandle);
    delete[] convertedLine;

    if (!encodingOk)
    {
        tqDebug("Decoding failed, cannot load file. Please fix encoding manually.");
        m_mutex.unlock();
        return NULL;
    }

    /** Cleaning up code coming from DBLP */
    rawText = rawText.replace(htmlRegExp, "");
    rawText = EncoderLaTeX::currentEncoderLaTeX()->decode(rawText);
    unescapeLaTeXChars(rawText);

    m_textStream = new TQTextStream(rawText, IO_ReadOnly);
    m_textStream->setEncoding(TQTextStream::UnicodeUTF8);
    /** reset the scanner state so that a previous load() cannot leak into this one */
    m_currentLineNumber = 0;
    m_posIntCurrentLine = 0;
    m_currentLine = "";
    m_currentChar = ' ';

    File *result = new File();
    TQIODevice *streamDevice = m_textStream->device();
    while (!cancelFlag && !m_textStream->atEnd())
    {
        emit progress(streamDevice->at(), streamDevice->size());
        tqApp->processEvents();

        Element *element = nextElement();
        if (element != NULL)
        {
            /** comments are dropped only if the caller asked for it via setIgnoreComments() */
            Comment *comment = dynamic_cast<Comment *>(element);
            if (!m_ignoreComments || comment == NULL)
                result->appendElement(element);
            else
                delete element;
        }
        tqApp->processEvents();
    }
    emit progress(streamDevice->size(), streamDevice->size());

    if (cancelFlag)
    {
        tqDebug("Loading file has been canceled");
        delete result;
        result = NULL;
    }

    delete m_textStream;
    /** do not leave a dangling pointer behind */
    m_textStream = NULL;

    m_mutex.unlock();
    return result;
}
/**
 * Heuristic check whether a piece of text looks like BibTeX code.
 * The text is passed through the current LaTeX decoder and then searched
 * with the regular expression below (an at sign, a word and a braced body).
 * @param rawText text to inspect
 * @return true if the pattern is found at any position of the decoded text
 */
bool FileImporterBibTeX : : guessCanDecode ( const TQString & rawText )
{
TQString text = EncoderLaTeX : : currentEncoderLaTeX ( ) - > decode ( rawText ) ;
return text . find ( TQRegExp ( " @ \\ w+ \\ {.+ \\ } " ) ) > = 0 ;
}
void FileImporterBibTeX : : setIgnoreComments ( bool ignoreComments )
{
m_ignoreComments = ignoreComments ;
}
void FileImporterBibTeX : : cancel ( )
{
cancelFlag = TRUE ;
}
/**
 * Parses the next top-level element from the text stream.
 * After an at sign the element type is read and dispatched (case-insensitively)
 * to the comment, macro ("string") or preamble reader; any other non-empty
 * type is read as a regular entry. Text that does not start with a known
 * token is collected as a plain comment.
 * @return the parsed element, or NULL at end of input or if the element could not be parsed
 */
Element * FileImporterBibTeX : : nextElement ( )
{
Token token = nextToken ( ) ;
if ( token = = tAt )
{
TQString elementType = readSimpleString ( ) ;
if ( elementType . lower ( ) = = " comment " )
return readCommentElement ( ) ;
else if ( elementType . lower ( ) = = " string " )
return readMacroElement ( ) ;
else if ( elementType . lower ( ) = = " preamble " )
return readPreambleElement ( ) ;
else if ( ! elementType . isEmpty ( ) )
return readEntryElement ( elementType ) ;
else
{
tqDebug ( " ElementType is empty " ) ;
return NULL ;
}
}
else if ( token = = tUnknown )
{
tqDebug ( " Unknown token near line %i, treating as comment " , m_currentLineNumber ) ;
return readPlainCommentElement ( ) ;
}
/** any other token is unexpected at top level; only the end of input is silently accepted */
if ( token ! = tEOF )
tqDebug ( " Don't know how to parse next token near line %i: %s " , m_currentLineNumber , tokenidToString ( token ) . latin1 ( ) ) ;
return NULL ;
}
/**
 * Reads a bracketed comment element. Characters are skipped until an opening
 * bracket (curly or round) or the end of the stream is reached; the bracketed
 * text that follows becomes the comment's content.
 * NOTE(review): the second constructor argument (TRUE here, FALSE in
 * readPlainCommentElement()) presumably marks the bracketed comment form -- confirm against Comment.
 * @return a newly allocated Comment
 */
Comment * FileImporterBibTeX : : readCommentElement ( )
{
while ( m_currentChar ! = ' { ' & & m_currentChar ! = ' ( ' & & ! m_textStream - > atEnd ( ) )
m_currentChar = nextChar ( ) ;
return new Comment ( readBracketString ( m_currentChar ) , TRUE ) ;
}
/**
 * Reads free text outside of any element as a plain comment. The comment
 * starts with the current character followed by the rest of the current line.
 * Further lines are meant to be appended while the next character is neither
 * an at sign nor white space and the stream has not ended.
 * NOTE(review): directly after readLine() the current line counts as
 * exhausted, so nextChar() returns a space; the loop condition then looks
 * false on its first evaluation -- confirm whether the loop body can ever run.
 * @return a newly allocated Comment holding the collected text
 */
Comment * FileImporterBibTeX : : readPlainCommentElement ( )
{
TQString result = m_currentChar ;
result + = readLine ( ) ;
m_currentChar = nextChar ( ) ;
while ( ! m_textStream - > atEnd ( ) & & m_currentChar ! = ' @ ' & & ! m_currentChar . isSpace ( ) )
{
result . append ( ' \n ' ) . append ( m_currentChar ) ;
m_currentChar = nextChar ( ) ;
result . append ( readLine ( ) ) ;
m_currentChar = nextChar ( ) ;
}
return new Comment ( result , FALSE ) ;
}
/**
 * Reads a macro ("string") element: an opening bracket, a key, an assign
 * symbol and one or more value parts joined by the concatenation token.
 * @return a newly allocated Macro, or NULL if the opening bracket or the assign symbol is missing
 */
Macro * FileImporterBibTeX : : readMacroElement ( )
{
/** skip everything up to the opening bracket */
Token token = nextToken ( ) ;
while ( token ! = tBracketOpen )
{
if ( token = = tEOF )
{
tqDebug ( " Error in parsing unknown macro (near line %i): Opening curly brace ({) expected " , m_currentLineNumber ) ;
return NULL ;
}
token = nextToken ( ) ;
}
TQString key = readSimpleString ( ) ;
if ( nextToken ( ) ! = tAssign )
{
tqDebug ( " Error in parsing macro '%s' (near line %i): Assign symbol (=) expected " , key . latin1 ( ) , m_currentLineNumber ) ;
return NULL ;
}
Macro * macro = new Macro ( key ) ;
/** each value part is either a reference to another macro (unquoted) or plain text; parts continue while a concatenation token follows */
do
{
bool isStringKey = FALSE ;
TQString text = readString ( isStringKey ) . replace ( TQRegExp ( " \\ s+ " ) , " " ) ;
if ( isStringKey )
macro - > value ( ) - > items . append ( new MacroKey ( text ) ) ;
else
macro - > value ( ) - > items . append ( new BibTeX : : PlainText ( text ) ) ;
token = nextToken ( ) ;
}
while ( token = = tDoublecross ) ;
return macro ;
}
/**
 * Reads a preamble element: an opening bracket followed by one or more
 * value parts joined by the concatenation token.
 * @return a newly allocated Preamble, or NULL if no opening bracket is found before the end of input
 */
Preamble * FileImporterBibTeX : : readPreambleElement ( )
{
/** skip everything up to the opening bracket */
Token token = nextToken ( ) ;
while ( token ! = tBracketOpen )
{
if ( token = = tEOF )
{
tqDebug ( " Error in parsing unknown preamble (near line %i): Opening curly brace ({) expected " , m_currentLineNumber ) ;
return NULL ;
}
token = nextToken ( ) ;
}
Preamble * preamble = new Preamble ( ) ;
/** unquoted parts are stored as macro references, everything else as plain text */
do
{
bool isStringKey = FALSE ;
TQString text = readString ( isStringKey ) . replace ( TQRegExp ( " \\ s+ " ) , " " ) ;
if ( isStringKey )
preamble - > value ( ) - > items . append ( new MacroKey ( text ) ) ;
else
preamble - > value ( ) - > items . append ( new BibTeX : : PlainText ( text ) ) ;
token = nextToken ( ) ;
}
while ( token = = tDoublecross ) ;
return preamble ;
}
/**
 * Reads a regular entry: an opening bracket, the entry key and a
 * comma-separated list of "name = value" fields up to the closing bracket.
 * @param typeString entry type as read after the at sign; passed on to the Entry constructor
 * @return a newly allocated Entry, or NULL if the opening bracket is missing
 *         or a comma or assign symbol is not found where expected
 */
Entry * FileImporterBibTeX : : readEntryElement ( const TQString & typeString )
{
/** skip everything up to the opening bracket */
Token token = nextToken ( ) ;
while ( token ! = tBracketOpen )
{
if ( token = = tEOF )
{
tqDebug ( " Error in parsing unknown entry (near line %i): Opening curly brace ({) expected " , m_currentLineNumber ) ;
return NULL ;
}
token = nextToken ( ) ;
}
TQString key = readSimpleString ( ) ;
Entry * entry = new Entry ( typeString , key ) ;
token = nextToken ( ) ;
/** one iteration per field; the loop is left through break or return only */
do
{
if ( token = = tBracketClose | | token = = tEOF )
break ;
else if ( token ! = tComma )
{
tqDebug ( " Error in parsing entry '%s' (near line %i): Comma symbol (,) expected but got 0x%x (token %s) " , key . latin1 ( ) , m_currentLineNumber , m_currentChar . unicode ( ) , tokenidToString ( token ) . latin1 ( ) ) ;
delete entry ;
return NULL ;
}
TQString fieldTypeName = readSimpleString ( ) ;
token = nextToken ( ) ;
if ( fieldTypeName = = TQString : : null | | token = = tBracketClose )
{
// entry is buggy, but we still accept it
break ;
}
else if ( token ! = tAssign )
{
tqDebug ( " Error in parsing entry '%s' (near line %i): Assign symbol (=) expected after field name '%s' " , key . latin1 ( ) , m_currentLineNumber , fieldTypeName . latin1 ( ) ) ;
delete entry ;
return NULL ;
}
/** check for duplicate fields: a repeated field name gets the first unused number (1, 2, ...) appended */
if ( entry - > getField ( fieldTypeName ) ! = NULL )
{
int i = 1 ;
TQString appendix = TQString : : number ( i ) ;
while ( entry - > getField ( fieldTypeName + appendix ) ! = NULL )
{
+ + i ;
appendix = TQString : : number ( i ) ;
}
fieldTypeName + = appendix ;
}
EntryField * entryField = new EntryField ( fieldTypeName ) ;
/** readValue() returns the first token after the value, which drives the next iteration */
token = readValue ( entryField - > value ( ) , entryField - > fieldType ( ) ) ;
entry - > addField ( entryField ) ;
}
while ( TRUE ) ;
return entry ;
}
/**
 * Classifies the next non-space character of the input as a token.
 * Leading white space is skipped. If the character is one of the known
 * single-character tokens, it is consumed (the following character becomes
 * current); otherwise the current character is left untouched.
 * @return tEOF if the stream is at its end, the matching token for a known
 *         character, or tUnknown for any other character
 */
FileImporterBibTeX : : Token FileImporterBibTeX : : nextToken ( )
{
if ( m_textStream - > atEnd ( ) )
return tEOF ;
Token curToken = tUnknown ;
while ( ( m_currentChar . isSpace ( ) | | m_currentChar = = ' \t ' ) & & ! m_textStream - > atEnd ( ) )
m_currentChar = nextChar ( ) ;
switch ( m_currentChar . latin1 ( ) )
{
case ' @ ' :
curToken = tAt ;
break ;
/** curly and round brackets are treated alike */
case ' { ' :
case ' ( ' :
curToken = tBracketOpen ;
break ;
case ' } ' :
case ' ) ' :
curToken = tBracketClose ;
break ;
case ' , ' :
curToken = tComma ;
break ;
case ' = ' :
curToken = tAssign ;
break ;
case ' # ' :
curToken = tDoublecross ;
break ;
default :
if ( m_textStream - > atEnd ( ) )
curToken = tEOF ;
}
/** only recognized tokens consume their character */
if ( curToken ! = tUnknown & & curToken ! = tEOF )
m_currentChar = nextChar ( ) ;
return curToken ;
}
/**
 * Reads one value part after skipping leading white space. Depending on the
 * current character the part is read as a bracketed string, a quoted string
 * or a simple (unquoted) string.
 * @param isStringKey output: set to TRUE only if the part was read as a simple string, FALSE otherwise
 * @return the text that was read, without the enclosing brackets or quotes
 */
TQString FileImporterBibTeX : : readString ( bool & isStringKey )
{
while ( m_currentChar . isSpace ( ) )
m_currentChar = nextChar ( ) ;
isStringKey = FALSE ;
switch ( m_currentChar . latin1 ( ) )
{
case ' { ' :
case ' ( ' :
return readBracketString ( m_currentChar ) ;
case ' " ' :
return readQuotedString ( ) ;
default :
isStringKey = TRUE ;
return readSimpleString ( ) ;
}
}
/**
 * Reads an unquoted string such as an element type, key or field name.
 * Leading white space is skipped.
 * @param until if not the NUL character, every character up to (but not
 *        including) this character is collected; otherwise reading stops at
 *        white space or one of the structural characters listed below.
 *        NOTE(review): callers omit this argument, so a default is declared
 *        elsewhere -- presumably NUL, confirm in the header.
 * @return the collected characters
 */
TQString FileImporterBibTeX : : readSimpleString ( TQChar until )
{
TQString result ;
while ( m_currentChar . isSpace ( ) )
m_currentChar = nextChar ( ) ;
/** the first character is taken only if it is a letter, a digit or one of the extra accepted characters */
if ( m_currentChar . isLetterOrNumber ( ) | | extraAlphaNumChars . contains ( m_currentChar ) )
{
result . append ( m_currentChar ) ;
m_currentChar = nextChar ( ) ;
}
while ( ! m_textStream - > atEnd ( ) )
{
if ( until ! = ' \0 ' )
{
if ( m_currentChar ! = until )
result . append ( m_currentChar ) ;
else
break ;
}
else
/** no explicit terminator: accept word characters, stop at structural characters or white space */
if ( m_currentChar . isLetterOrNumber ( ) | | extraAlphaNumChars . contains ( m_currentChar ) )
result . append ( m_currentChar ) ;
else if ( m_currentChar = = " , " | | m_currentChar = = " ( " | | m_currentChar = = " ) " || m_currentChar == " { " || m_currentChar == " } " || m_currentChar == " = " || m_currentChar == " # " || m_currentChar == " @ " || m_currentChar.isSpace() )
break ;
else
{
/** any other character is reported and skipped, reading continues */
tqDebug ( " Unknown letter or number: 0x%x " , m_currentChar . unicode ( ) ) ;
// break;
}
m_currentChar = nextChar ( ) ;
}
return result ;
}
/**
 * Reads a string enclosed in double quotes; the current character is
 * expected to be the opening quote. A double quote that directly follows a
 * backslash does not end the string and is kept in the result.
 * @return the text between the quotes
 */
TQString FileImporterBibTeX : : readQuotedString ( )
{
TQString result ;
/** lastChar is needed to recognize escaped quotes */
TQChar lastChar = m_currentChar ;
m_currentChar = nextChar ( ) ;
while ( ! m_textStream - > atEnd ( ) )
{
if ( m_currentChar ! = ' " ' | | lastChar = = ' \\ ' )
result . append ( m_currentChar ) ;
else
break ;
lastChar = m_currentChar ;
m_currentChar = nextChar ( ) ;
}
/** read character after closing " */
m_currentChar = nextChar ( ) ;
return result ;
}
/**
 * Hands out the not yet consumed remainder of the current line and marks
 * the line as fully consumed.
 * @return the text from the current position to the end of the current line
 */
TQString FileImporterBibTeX::readLine()
{
    const TQString remainder = m_currentLine.mid(m_posIntCurrentLine);
    /**
     * Placing the position beyond the line's end makes nextChar() regard the line as exhausted.
     * NOTE(review): the reason for an offset of 2 rather than 0 is not evident from this file.
     */
    m_posIntCurrentLine = m_currentLine.length() + 2;
    return remainder;
}
/**
 * Reads a bracketed string including nested brackets of the same kind; the
 * current character is expected to be the opening bracket.
 * @param openingBracket the opening bracket character; a round bracket is
 *        matched by a round closing bracket, anything else by a curly closing bracket
 * @return the text between the outermost brackets (inner brackets are kept)
 */
TQString FileImporterBibTeX : : readBracketString ( const TQChar openingBracket )
{
TQString result ;
TQChar closingBracket = ' } ' ;
if ( openingBracket = = ' ( ' )
closingBracket = ' ) ' ;
/** nesting depth; starts at 1 for the opening bracket that is skipped next */
int counter = 1 ;
m_currentChar = nextChar ( ) ;
while ( ! m_textStream - > atEnd ( ) )
{
if ( m_currentChar = = openingBracket )
counter + + ;
else if ( m_currentChar = = closingBracket )
counter - - ;
/** depth 0 means the matching closing bracket was reached; it is not part of the result */
if ( counter = = 0 )
break ;
else
result . append ( m_currentChar ) ;
m_currentChar = nextChar ( ) ;
}
/** move past the closing bracket */
m_currentChar = nextChar ( ) ;
return result ;
}
/**
 * Reads a field value consisting of one or more parts joined by the
 * concatenation token and appends an item per part to @p value.
 * How a part is stored depends on the field type: keywords become a
 * KeywordContainer, authors and editors a PersonContainer, everything else
 * a MacroKey (unquoted part) or PlainText.
 * @param value receives the parsed items
 * @param fieldType type of the field the value belongs to
 * @return the first token following the value
 */
FileImporterBibTeX : : Token FileImporterBibTeX : : readValue ( Value * value , EntryField : : FieldType fieldType )
{
Token token = tUnknown ;
do
{
bool isStringKey = FALSE ;
TQString text = readString ( isStringKey ) . replace ( TQRegExp ( " \\ s+ " ) , " " ) ;
switch ( fieldType )
{
case EntryField : : ftKeywords :
{
/** a macro reference in a keywords field is reported and dropped */
if ( isStringKey )
tqDebug ( " WARNING: Cannot handle keywords that are macros " ) ;
else
value - > items . append ( new KeywordContainer ( text ) ) ;
}
break ;
case EntryField : : ftAuthor :
case EntryField : : ftEditor :
{
/** a macro reference in an author or editor field is reported and dropped */
if ( isStringKey )
tqDebug ( " WARNING: Cannot handle authors/editors that are macros " ) ;
else
{
TQStringList persons ;
splitPersons ( text , persons ) ;
PersonContainer * container = new PersonContainer ( m_personFirstNameFirst ) ;
for ( TQStringList : : ConstIterator pit = persons . constBegin ( ) ; pit ! = persons . constEnd ( ) ; + + pit )
container - > persons . append ( new Person ( * pit , m_personFirstNameFirst ) ) ;
value - > items . append ( container ) ;
}
}
break ;
case EntryField : : ftPages :
/** page ranges: dash sequences matched by the expression are replaced by the character U+2013 */
text . replace ( TQRegExp ( " \\ s*--? \\ s* " ) , TQChar ( 0x2013 ) ) ;
/** no break here: control continues into the default branch, which stores the text */
default :
{
if ( isStringKey )
value - > items . append ( new MacroKey ( text ) ) ;
else
value - > items . append ( new BibTeX : : PlainText ( text ) ) ;
}
}
token = nextToken ( ) ;
}
while ( token = = tDoublecross ) ;
return token ;
}
/**
 * Replaces backslash-escaped ampersands by plain ampersands.
 * @param text string that is modified in place
 */
void FileImporterBibTeX : : unescapeLaTeXChars ( TQString & text )
{
text . replace ( " \\ & " , " & " ) ;
}
void FileImporterBibTeX : : splitPersons ( const TQString & text , TQStringList & persons )
{
TQStringList wordList ;
TQString word ;
int bracketCounter = 0 ;
for ( unsigned int pos = 0 ; pos < text . length ( ) ; + + pos )
{
if ( text [ pos ] = = ' { ' )
+ + bracketCounter ;
else if ( text [ pos ] = = ' } ' )
- - bracketCounter ;
if ( text [ pos ] = = ' ' | | text [ pos ] = = ' \n ' | | text [ pos ] = = ' \r ' )
{
if ( word = = " and " & & bracketCounter = = 0 )
{
persons . append ( wordList . join ( " " ) ) ;
wordList . clear ( ) ;
}
else if ( ! word . isEmpty ( ) )
wordList . append ( word ) ;
word = " " ;
}
else
word . append ( text [ pos ] ) ;
}
wordList . append ( word ) ;
persons . append ( wordList . join ( " " ) ) ;
}
void FileImporterBibTeX : : evaluateParameterComments ( iconv_t & iconvHandle , const char * cline )
{
/** simple preliminary checks before expensive conversion to TQString */
if ( cline [ 0 ] = = ' @ ' & & cline [ 1 ] = = ' c ' )
{
TQString line = TQString ( cline ) . lower ( ) ;
/** check if this file requests a special encoding */
if ( line . startsWith ( " @comment{x-kbibtex-encoding= " ) & & line . endsWith ( " } \n " ) )
{
TQString newEncoding = line . mid ( 28 , line . length ( ) - 30 ) ;
tqDebug ( " x-kbibtex-encoding=<%s> " , newEncoding . latin1 ( ) ) ;
if ( newEncoding = = " latex " ) newEncoding = " utf-8 " ;
iconv_close ( iconvHandle ) ;
iconvHandle = iconv_open ( " utf-8 " , newEncoding . append ( ' \0 ' ) . ascii ( ) ) ;
}
}
}
/**
 * Delivers the next character of the input and advances the read position.
 * When the current line had already been used up on entry, the next
 * non-empty line is fetched from the text stream and a single blank is
 * returned in place of the line break, without consuming a character of
 * the new line.
 * @return the next character, a blank at a line transition, or a null
 *         TQChar if no character is available
 */
TQChar FileImporterBibTeX::nextChar()
{
    /** must be evaluated before refilling, since refilling resets the position */
    const bool lineWasExhausted = m_posIntCurrentLine >= m_currentLine.length();

    /** fetch lines until one offers unread characters or the stream ends; empty lines are skipped */
    for (;;)
    {
        const bool nothingLeft = m_posIntCurrentLine >= m_currentLine.length() || m_currentLine.isEmpty() || m_currentLine.isNull();
        if (!nothingLeft || m_textStream->atEnd())
            break;

        m_currentLine = m_textStream->readLine();
        m_posIntCurrentLine = 0;
        ++m_currentLineNumber;
    }

    if (lineWasExhausted)
        return TQChar(' ');

    if (m_posIntCurrentLine < m_currentLine.length())
    {
        const TQChar current = m_currentLine[m_posIntCurrentLine];
        ++m_posIntCurrentLine;
        return current;
    }

    return TQChar();
}
/**
 * Translates a token id into a readable name for use in debug messages.
 * @param token token id to translate
 * @return the name of the token; a placeholder text for values without a dedicated case
 */
TQString FileImporterBibTeX : : tokenidToString ( Token token )
{
switch ( token )
{
case tAt : return TQString ( " At " ) ;
case tBracketClose : return TQString ( " BracketClose " ) ;
case tBracketOpen : return TQString ( " BracketOpen " ) ;
case tAlphaNumText : return TQString ( " AlphaNumText " ) ;
case tAssign : return TQString ( " Assign " ) ;
case tComma : return TQString ( " Comma " ) ;
case tDoublecross : return TQString ( " Doublecross " ) ;
case tEOF : return TQString ( " EOF " ) ;
case tUnknown : return TQString ( " Unknown " ) ;
default : return TQString ( " <Unknown> " ) ;
}
}
}