You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
kbibtex/src/fileimporterbibtex.cpp

659 lines
22 KiB

/***************************************************************************
* Copyright (C) 2004-2009 by Thomas Fischer *
* fischer@unix-ag.uni-kl.de *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
***************************************************************************/
#include <ntqiodevice.h>
#include <ntqregexp.h>
#include <ntqapplication.h>
#include <file.h>
#include <comment.h>
#include <macro.h>
#include <preamble.h>
#include <entry.h>
#include <element.h>
#include <encoderlatex.h>
#include <value.h>
#include "fileimporterbibtex.h"
/** Expands to the larger of a and b. Being a macro, the selected argument is evaluated twice, so do not pass expressions with side effects. */
#define max(a,b) ((a)<(b)?(b):(a))
namespace BibTeX
{
/** Characters which readSimpleString() accepts in addition to letters and digits. */
const TQString extraAlphaNumChars( "?'`-_:.+/$\\\"&" );
/** Matches opening and closing "a" and "pre" HTML tags (case-insensitive); load() removes every match from the raw text. */
const TQRegExp htmlRegExp( "</?(a|pre)[^>]*>", false );
/**
 * Create an importer that has not read any data yet.
 *
 * @param personFirstNameFirst stored and later passed on to the PersonContainer
 *        and Person objects created for author and editor fields
 * @param encoding name of the source encoding; load() hands it to iconv
 *        unless it equals "latex"
 */
FileImporterBibTeX::FileImporterBibTeX( bool personFirstNameFirst, TQString encoding )
        : FileImporter(),
        m_personFirstNameFirst( personFirstNameFirst ),
        m_currentChar( ' ' ),
        m_ignoreComments( FALSE ),
        m_lineBufferSize( 4096 ),
        m_encoding( encoding )
{
    /** the text stream is only created inside load() */
    m_textStream = NULL;
    /** buffer which load() fills with one raw input line at a time */
    m_lineBuffer = new char[m_lineBufferSize];
    cancelFlag = FALSE;
}
/** Free the raw line buffer that the constructor allocated. */
FileImporterBibTeX::~FileImporterBibTeX()
{
    delete[] m_lineBuffer;
}
/**
 * Read BibTeX data from a device and parse it into a File object.
 *
 * The input is read line by line, converted to UTF-8 with iconv and
 * collected in one string. Afterwards HTML tags matched by htmlRegExp are
 * removed, LaTeX sequences are decoded and the text is split into elements
 * by repeatedly calling nextElement(). Comment elements are dropped if
 * m_ignoreComments is set. The whole operation runs with m_mutex locked.
 *
 * @param iodevice device to read the raw data from
 * @return a newly allocated File, or NULL if the input could not be
 *         converted to UTF-8 or if loading was cancelled
 */
File* FileImporterBibTeX::load( TQIODevice *iodevice )
{
    m_mutex.lock();
    cancelFlag = FALSE;

    TQString rawText;
    /** "latex" is not passed to iconv; such data is read as UTF-8 */
    const char *encodingFrom = m_encoding == "latex" ? "utf-8" : m_encoding.ascii();
    const iconv_t invalidHandle = ( iconv_t )( -1 );
    iconv_t iconvHandle = iconv_open( "utf-8", encodingFrom );

    /**
     * The converted text may be longer than the raw line, so the whole
     * fourfold buffer is offered to iconv. One additional byte is reserved
     * for the terminating zero.
     */
    const size_t convertedCapacity = ( size_t )m_lineBufferSize * 4;
    char *convertedLine = new char[convertedCapacity + 1];

    int len;
    bool encodingOk = true;
    while ( encodingOk && iodevice->isReadable() && ( len = iodevice->readLine( m_lineBuffer, m_lineBufferSize ) ) > 0 )
    {
        /** the line may request a different source encoding, which replaces iconvHandle */
        evaluateParameterComments( iconvHandle, m_lineBuffer );

        if ( iconvHandle == invalidHandle )
        {
            /** never hand an invalid conversion descriptor to iconv */
            tqDebug( "No usable iconv conversion to utf-8 available (initial source encoding was %s)", encodingFrom );
            encodingOk = false;
            break;
        }

        char *raw = m_lineBuffer;
        char *enc = convertedLine;
        size_t encLen = convertedCapacity, rawLen = ( size_t )len;
        size_t result = iconv( iconvHandle, &raw, &rawLen, &enc, &encLen );

        tqApp->processEvents();

        if ( result != 0 )
        {
            /** show about 30 characters around the position where iconv stopped reading */
            const int stoppedAt = ( int )( raw - m_lineBuffer );
            TQString problematic = TQString( m_lineBuffer ).mid( stoppedAt > 15 ? stoppedAt - 15 : 0, 30 );
            if ( problematic.isEmpty() ) problematic = TQString( m_lineBuffer );
            tqDebug( "iconv resulted in error code %i for source encoding %s, maybe file is in different encoding? Problem is somewhere here: \"%s\"", ( int )result, encodingFrom, problematic.latin1() );
            encodingOk = false;
            break;
        }
        if ( rawLen > 0 )
        {
            tqDebug( "iconv could not convert complete string, only %i out of %i chars", ( int )( len - rawLen ), len );
            encodingOk = false;
            break;
        }
        enc[0] = '\0';

        /** remove leading UTF-8 byte-order mark (BOM), i.e. exactly the byte sequence EF BB BF */
        int offset = 0;
        while (( unsigned char )convertedLine[offset] == 0xef && ( unsigned char )convertedLine[offset + 1] == 0xbb && ( unsigned char )convertedLine[offset + 2] == 0xbf )
            offset += 3;

        TQString line = TQString::fromUtf8( convertedLine + offset );
        rawText.append( line );
    }

    if ( iconvHandle != invalidHandle )
        iconv_close( iconvHandle );
    delete[] convertedLine;

    if ( !encodingOk )
    {
        tqDebug( "Decoding failed, cannot load file. Please fix encoding manually." );
        m_mutex.unlock();
        return NULL;
    }

    /** Cleaning up code coming from DBLP */
    rawText = rawText.replace( htmlRegExp, "" );
    rawText = EncoderLaTeX::currentEncoderLaTeX() ->decode( rawText );
    unescapeLaTeXChars( rawText );

    m_textStream = new TQTextStream( rawText, IO_ReadOnly );
    m_textStream->setEncoding( TQTextStream::UnicodeUTF8 );
    m_currentLineNumber = 0;
    m_posIntCurrentLine = 0;
    m_currentLine = "";
    /** start from the same state as a fresh importer, not with the last character of a previous load */
    m_currentChar = TQChar( ' ' );

    File *result = new File();
    TQIODevice *streamDevice = m_textStream->device();
    while ( !cancelFlag && !m_textStream->atEnd() )
    {
        emit progress( streamDevice->at(), streamDevice->size() );
        tqApp->processEvents();

        Element * element = nextElement();
        if ( element != NULL )
        {
            Comment *comment = dynamic_cast<Comment*>( element );
            if ( !m_ignoreComments || comment == NULL )
                result->appendElement( element );
            else
                delete element;
        }

        tqApp->processEvents();
    }
    emit progress( streamDevice->size(), streamDevice->size() );

    if ( cancelFlag )
    {
        tqDebug( "Loading file has been canceled" );
        delete result;
        result = NULL;
    }

    delete m_textStream;
    /** do not keep a pointer to the deleted stream */
    m_textStream = NULL;

    m_mutex.unlock();
    return result;
}
bool FileImporterBibTeX::guessCanDecode( const TQString & rawText )
{
TQString text = EncoderLaTeX::currentEncoderLaTeX() ->decode( rawText );
return text.find( TQRegExp( "@\\w+\\{.+\\}" ) ) >= 0;
}
/**
 * Choose whether load() keeps or discards comment elements.
 *
 * @param ignoreComments if true, elements of type Comment are deleted
 *        instead of being added to the loaded File
 */
void FileImporterBibTeX::setIgnoreComments( bool ignoreComments )
{
    m_ignoreComments = ignoreComments;
}
/** Raise the cancel flag; load() checks it before parsing each element and returns NULL once it is set. */
void FileImporterBibTeX::cancel()
{
    cancelFlag = TRUE;
}
/**
 * Read the next element from the text stream.
 *
 * An "@" token starts a comment, string (macro), preamble or entry,
 * depending on the type name following it (compared case-insensitively).
 * An unknown token is treated as the start of a plain comment.
 *
 * @return the element read, or NULL if nothing could be parsed
 */
Element *FileImporterBibTeX::nextElement()
{
    const Token token = nextToken();

    if ( token == tUnknown )
    {
        tqDebug( "Unknown token near line %i, treating as comment", m_currentLineNumber );
        return readPlainCommentElement();
    }

    if ( token != tAt )
    {
        /** reaching the end of the data is not worth a message */
        if ( token != tEOF )
            tqDebug( "Don't know how to parse next token near line %i: %s", m_currentLineNumber, tokenidToString( token ).latin1() );
        return NULL;
    }

    const TQString elementType = readSimpleString();
    const TQString loweredType = elementType.lower();

    if ( loweredType == "comment" )
        return readCommentElement();
    if ( loweredType == "string" )
        return readMacroElement();
    if ( loweredType == "preamble" )
        return readPreambleElement();

    if ( elementType.isEmpty() )
    {
        tqDebug( "ElementType is empty" );
        return NULL;
    }

    return readEntryElement( elementType );
}
/**
 * Read the body of an "@comment" element.
 *
 * Characters are skipped until an opening curly brace or parenthesis is
 * the current character or the stream is at its end; the bracketed text
 * following it becomes the comment's content.
 *
 * @return a newly allocated Comment, created with TRUE as second argument
 */
Comment *FileImporterBibTeX::readCommentElement()
{
    for ( ;; )
    {
        const bool atOpeningBracket = m_currentChar == '{' || m_currentChar == '(';
        if ( atOpeningBracket || m_textStream->atEnd() )
            break;
        m_currentChar = nextChar();
    }

    const TQString content = readBracketString( m_currentChar );
    return new Comment( content, TRUE );
}
/**
 * Read text that does not belong to any "@" element as a plain comment.
 *
 * The comment starts with the current character and the remainder of the
 * current line. Further lines are added as long as the character fetched
 * after a line is neither "@" nor whitespace and the stream is not at its
 * end.
 *
 * @return a newly allocated Comment, created with FALSE as second argument
 */
Comment *FileImporterBibTeX::readPlainCommentElement()
{
    TQString commentText( m_currentChar );
    commentText += readLine();
    m_currentChar = nextChar();

    for ( ;; )
    {
        if ( m_textStream->atEnd() || m_currentChar == '@' || m_currentChar.isSpace() )
            break;

        commentText += '\n';
        commentText += m_currentChar;
        m_currentChar = nextChar();
        commentText += readLine();
        m_currentChar = nextChar();
    }

    return new Comment( commentText, FALSE );
}
/**
 * Read an "@string" element, i.e. a macro definition of the form
 * key = value, where the value may consist of several parts joined
 * by "#".
 *
 * @return a newly allocated Macro, or NULL if no opening bracket or no
 *         assign symbol was found
 */
Macro *FileImporterBibTeX::readMacroElement()
{
    /** move forward to the opening bracket */
    Token token;
    while (( token = nextToken() ) != tBracketOpen )
    {
        if ( token == tEOF )
        {
            tqDebug( "Error in parsing unknown macro (near line %i): Opening curly brace ({) expected", m_currentLineNumber );
            return NULL;
        }
    }

    TQString key = readSimpleString();
    if ( nextToken() != tAssign )
    {
        tqDebug( "Error in parsing macro '%s' (near line %i): Assign symbol (=) expected", key.latin1(), m_currentLineNumber );
        return NULL;
    }

    Macro *macro = new Macro( key );
    const TQRegExp whitespaceRun( "\\s+" );
    do
    {
        bool isStringKey = FALSE;
        TQString text = readString( isStringKey );
        /** collapse each run of whitespace into a single blank */
        text.replace( whitespaceRun, " " );

        if ( isStringKey )
            macro->value()->items.append( new MacroKey( text ) );
        else
            macro->value()->items.append( new BibTeX::PlainText( text ) );
    }
    while ( nextToken() == tDoublecross );

    return macro;
}
/**
 * Read an "@preamble" element whose value may consist of several parts
 * joined by "#".
 *
 * @return a newly allocated Preamble, or NULL if the data ended before an
 *         opening bracket was found
 */
Preamble *FileImporterBibTeX::readPreambleElement()
{
    /** move forward to the opening bracket */
    Token token;
    while (( token = nextToken() ) != tBracketOpen )
    {
        if ( token == tEOF )
        {
            tqDebug( "Error in parsing unknown preamble (near line %i): Opening curly brace ({) expected", m_currentLineNumber );
            return NULL;
        }
    }

    Preamble *preamble = new Preamble( );
    const TQRegExp whitespaceRun( "\\s+" );
    do
    {
        bool isStringKey = FALSE;
        TQString text = readString( isStringKey );
        /** collapse each run of whitespace into a single blank */
        text.replace( whitespaceRun, " " );

        if ( isStringKey )
            preamble->value()->items.append( new MacroKey( text ) );
        else
            preamble->value()->items.append( new BibTeX::PlainText( text ) );
    }
    while ( nextToken() == tDoublecross );

    return preamble;
}
/**
 * Read a regular entry: its key followed by comma-separated
 * "field = value" pairs up to the closing bracket.
 *
 * If a field name occurs more than once, a number is appended to the name
 * of the later occurrences so that no field is lost.
 *
 * @param typeString type of the entry as read after the "@" symbol
 * @return a newly allocated Entry, or NULL if the data ended before an
 *         opening bracket or if a comma or assign symbol was missing
 */
Entry *FileImporterBibTeX::readEntryElement( const TQString& typeString )
{
    /** move forward to the opening bracket */
    Token token;
    while (( token = nextToken() ) != tBracketOpen )
    {
        if ( token == tEOF )
        {
            tqDebug( "Error in parsing unknown entry (near line %i): Opening curly brace ({) expected", m_currentLineNumber );
            return NULL;
        }
    }

    TQString key = readSimpleString();
    Entry *entry = new Entry( typeString, key );

    token = nextToken();
    for ( ;; )
    {
        if ( token == tBracketClose || token == tEOF )
            break;

        if ( token != tComma )
        {
            tqDebug( "Error in parsing entry '%s' (near line %i): Comma symbol (,) expected but got 0x%x (token %s)", key.latin1(), m_currentLineNumber, m_currentChar.unicode(), tokenidToString( token ).latin1() );
            delete entry;
            return NULL;
        }

        TQString fieldTypeName = readSimpleString();
        token = nextToken();

        /** entry is buggy (e.g. comma in front of the closing bracket), but we still accept it */
        if ( fieldTypeName == TQString::null || token == tBracketClose )
            break;

        if ( token != tAssign )
        {
            tqDebug( "Error in parsing entry '%s' (near line %i): Assign symbol (=) expected after field name '%s'", key.latin1(), m_currentLineNumber, fieldTypeName.latin1() );
            delete entry;
            return NULL;
        }

        /** check for duplicate fields and pick the first unused numbered name */
        if ( entry->getField( fieldTypeName ) != NULL )
        {
            int suffix = 1;
            while ( entry->getField( fieldTypeName + TQString::number( suffix ) ) != NULL )
                ++suffix;
            fieldTypeName += TQString::number( suffix );
        }

        EntryField *entryField = new EntryField( fieldTypeName );
        token = readValue( entryField->value(), entryField->fieldType() );
        entry->addField( entryField );
    }

    return entry;
}
/**
 * Classify the current character as a token.
 *
 * Whitespace in front of the token is skipped. If one of the structural
 * symbols (@ { ( } ) , = #) is recognized, the character following it
 * becomes the current character; otherwise the current character is left
 * untouched.
 *
 * @return tEOF if the stream is at its end, the token belonging to the
 *         symbol if one was recognized, tUnknown in all other cases
 */
FileImporterBibTeX::Token FileImporterBibTeX::nextToken()
{
    if ( m_textStream->atEnd() )
        return tEOF;

    while (( m_currentChar.isSpace() || m_currentChar == '\t' ) && !m_textStream->atEnd() )
        m_currentChar = nextChar();

    Token found;
    const char symbol = m_currentChar.latin1();
    if ( symbol == '@' )
        found = tAt;
    else if ( symbol == '{' || symbol == '(' )
        found = tBracketOpen;
    else if ( symbol == '}' || symbol == ')' )
        found = tBracketClose;
    else if ( symbol == ',' )
        found = tComma;
    else if ( symbol == '=' )
        found = tAssign;
    else if ( symbol == '#' )
        found = tDoublecross;
    else
        found = m_textStream->atEnd() ? tEOF : tUnknown;

    /** only recognized symbols are consumed */
    const bool symbolRecognized = found != tUnknown && found != tEOF;
    if ( symbolRecognized )
        m_currentChar = nextChar();

    return found;
}
/**
 * Read one value part: bracketed text, quoted text or a simple string.
 *
 * @param isStringKey set to TRUE if the text was read as a simple string
 *        (neither bracketed nor quoted), to FALSE otherwise
 * @return the text read, without the surrounding brackets or quotes
 */
TQString FileImporterBibTeX::readString( bool &isStringKey )
{
    while ( m_currentChar.isSpace() )
    {
        /**
         * Once the stream is at its end and the current line is used up,
         * nextChar() returns a blank on every call. Stop here instead of
         * skipping blanks forever and report an empty simple string.
         */
        if ( m_textStream->atEnd() && m_posIntCurrentLine >= m_currentLine.length() )
        {
            isStringKey = TRUE;
            return TQString();
        }
        m_currentChar = nextChar();
    }

    isStringKey = FALSE;
    switch ( m_currentChar.latin1() )
    {
    case '{':
    case '(':
        return readBracketString( m_currentChar );
    case '"':
        return readQuotedString();
    default:
        isStringKey = TRUE;
        return readSimpleString();
    }
}
/**
 * Read an unquoted string such as an element type, an entry key, a field
 * name or a macro key.
 *
 * Leading whitespace is skipped. Without a stop character, reading ends
 * at the first structural symbol (, ( ) { } = # @) or whitespace;
 * characters that are neither letters, digits nor contained in
 * extraAlphaNumChars are reported and left out.
 *
 * @param until if different from '\0', every character up to (but not
 *        including) this character is taken over instead
 * @return the text read, possibly empty
 */
TQString FileImporterBibTeX::readSimpleString( TQChar until )
{
    TQString result;

    /**
     * Skip leading whitespace. Once the stream is at its end and the
     * current line is used up, nextChar() returns a blank on every call,
     * so the loop has to stop there instead of running forever.
     */
    while ( m_currentChar.isSpace() && !( m_textStream->atEnd() && m_posIntCurrentLine >= m_currentLine.length() ) )
        m_currentChar = nextChar();

    if ( m_currentChar.isLetterOrNumber() || extraAlphaNumChars.contains( m_currentChar ) )
    {
        result.append( m_currentChar );
        m_currentChar = nextChar();
    }

    while ( !m_textStream->atEnd() )
    {
        if ( until != '\0' )
        {
            if ( m_currentChar == until )
                break;
            result.append( m_currentChar );
        }
        else if ( m_currentChar.isLetterOrNumber() || extraAlphaNumChars.contains( m_currentChar ) )
            result.append( m_currentChar );
        else if ( m_currentChar == ',' || m_currentChar == '(' || m_currentChar == ')' || m_currentChar == '{' || m_currentChar == '}' || m_currentChar == '=' || m_currentChar == '#' || m_currentChar == '@' || m_currentChar.isSpace() )
            break;
        else
        {
            /** unexpected characters are reported and skipped, reading continues */
            tqDebug( "Unknown letter or number: 0x%x", m_currentChar.unicode() );
        }

        m_currentChar = nextChar();
    }

    return result;
}
/**
 * Read text enclosed in double quotes; the current character has to be
 * the opening quote. A quote directly preceded by a backslash does not
 * end the text.
 *
 * @return the text between the quotes
 */
TQString FileImporterBibTeX::readQuotedString()
{
    TQString content;
    TQChar previous = m_currentChar;

    for ( m_currentChar = nextChar(); !m_textStream->atEnd(); m_currentChar = nextChar() )
    {
        const bool closingQuote = m_currentChar == '"' && previous != '\\';
        if ( closingQuote )
            break;

        content.append( m_currentChar );
        previous = m_currentChar;
    }

    /** read character after closing " */
    m_currentChar = nextChar();

    return content;
}
/**
 * Take the unread remainder of the current line.
 *
 * The read position is moved beyond the end of the line, which makes the
 * following nextChar() call fetch a new line.
 *
 * @return the part of the current line starting at the read position
 */
TQString FileImporterBibTeX::readLine()
{
    const TQString remainder = m_currentLine.mid( m_posIntCurrentLine );
    m_posIntCurrentLine = m_currentLine.length() + 2;
    return remainder;
}
/**
 * Read text enclosed in brackets, keeping track of nested brackets of the
 * same kind.
 *
 * @param openingBracket the opening bracket; the text ends at ")" if this
 *        is "(", at "}" otherwise
 * @return the text between the outermost brackets, nested brackets included
 */
TQString FileImporterBibTeX::readBracketString( const TQChar openingBracket )
{
    const TQChar closingBracket( openingBracket == '(' ? ')' : '}' );

    TQString content;
    int depth = 1;
    for ( m_currentChar = nextChar(); !m_textStream->atEnd(); m_currentChar = nextChar() )
    {
        if ( m_currentChar == openingBracket )
            ++depth;
        else if ( m_currentChar == closingBracket )
            --depth;

        /** the bracket matching the outermost opening bracket is not part of the text */
        if ( depth == 0 )
            break;

        content.append( m_currentChar );
    }

    /** read character after the closing bracket */
    m_currentChar = nextChar();

    return content;
}
/**
 * Read the value of a field, which may consist of several parts joined
 * by "#", and append one item per part to a Value object.
 *
 * Keyword fields get a KeywordContainer, author and editor fields a
 * PersonContainer; parts of these fields given as macro keys are skipped
 * with a warning. All other parts become MacroKey or PlainText items. In
 * page fields, a single or double dash including surrounding whitespace is
 * replaced by the character U+2013.
 *
 * @param value object the items are appended to
 * @param fieldType type of the field the value belongs to
 * @return the first token following the value
 */
FileImporterBibTeX::Token FileImporterBibTeX::readValue( Value *value, EntryField::FieldType fieldType )
{
    const TQRegExp whitespaceRun( "\\s+" );
    Token token = tUnknown;

    do
    {
        bool isStringKey = FALSE;
        TQString text = readString( isStringKey );
        /** collapse each run of whitespace into a single blank */
        text.replace( whitespaceRun, " " );

        if ( fieldType == EntryField::ftKeywords )
        {
            if ( isStringKey )
                tqDebug( "WARNING: Cannot handle keywords that are macros" );
            else
                value->items.append( new KeywordContainer( text ) );
        }
        else if ( fieldType == EntryField::ftAuthor || fieldType == EntryField::ftEditor )
        {
            if ( isStringKey )
                tqDebug( "WARNING: Cannot handle authors/editors that are macros" );
            else
            {
                TQStringList persons;
                splitPersons( text, persons );

                PersonContainer *container = new PersonContainer( m_personFirstNameFirst );
                for ( TQStringList::ConstIterator pit = persons.constBegin(); pit != persons.constEnd(); ++pit )
                    container->persons.append( new Person( *pit, m_personFirstNameFirst ) );
                value->items.append( container );
            }
        }
        else
        {
            /** page fields are stored like any other field once the dash is replaced */
            if ( fieldType == EntryField::ftPages )
                text.replace( TQRegExp( "\\s*--?\\s*" ), TQChar( 0x2013 ) );

            if ( isStringKey )
                value->items.append( new MacroKey( text ) );
            else
                value->items.append( new BibTeX::PlainText( text ) );
        }

        token = nextToken();
    }
    while ( token == tDoublecross );

    return token;
}
/**
 * Replace every backslash-escaped ampersand by a plain ampersand.
 *
 * @param text text to modify in place
 */
void FileImporterBibTeX::unescapeLaTeXChars( TQString &text )
{
    const TQString escapedAmpersand( "\\&" );
    text.replace( escapedAmpersand, "&" );
}
/**
 * Split a list of names at the word "and".
 *
 * Words are separated by blanks, line feeds or carriage returns. An "and"
 * inside curly braces does not separate names. Words of one name are
 * joined by single blanks; a trailing separator at the end of the text
 * does not leave a blank at the end of the last name.
 *
 * @param text text containing one or more names
 * @param persons list the names are appended to; one name is always
 *        appended for the text following the last "and", even if it is
 *        empty
 */
void FileImporterBibTeX::splitPersons( const TQString& text, TQStringList &persons )
{
    TQStringList wordList;
    TQString word;
    int bracketCounter = 0;

    for ( unsigned int pos = 0; pos < text.length(); ++pos )
    {
        if ( text[pos] == '{' )
            ++bracketCounter;
        else if ( text[pos] == '}' )
            --bracketCounter;

        if ( text[pos] == ' ' || text[pos] == '\n' || text[pos] == '\r' )
        {
            if ( word == "and" && bracketCounter == 0 )
            {
                persons.append( wordList.join( " " ) );
                wordList.clear();
            }
            else if ( !word.isEmpty() )
                wordList.append( word );

            word = "";
        }
        else
            word.append( text[pos] );
    }

    /** as inside the loop, empty words are not part of a name */
    if ( !word.isEmpty() )
        wordList.append( word );

    persons.append( wordList.join( " " ) );
}
void FileImporterBibTeX::evaluateParameterComments( iconv_t &iconvHandle, const char *cline )
{
/** simple preliminary checks before expensive conversion to TQString */
if ( cline[0] == '@' && cline[1] == 'c' )
{
TQString line = TQString( cline ).lower();
/** check if this file requests a special encoding */
if ( line.startsWith( "@comment{x-kbibtex-encoding=" ) && line.endsWith( "}\n" ) )
{
TQString newEncoding = line.mid( 28, line.length() - 30 );
tqDebug( "x-kbibtex-encoding=<%s>", newEncoding.latin1() );
if ( newEncoding == "latex" ) newEncoding = "utf-8";
iconv_close( iconvHandle );
iconvHandle = iconv_open( "utf-8", newEncoding.append( '\0' ).ascii() );
}
}
}
/**
 * Fetch the next character from the text stream.
 *
 * If the current line was already used up when this function is called,
 * following lines are read until a non-empty line is available or the
 * stream is at its end, and a blank is returned to stand for the end of
 * the line. Otherwise the character at the read position is returned and
 * the read position advances by one.
 *
 * @return the next character, or a blank at the end of a line
 */
TQChar FileImporterBibTeX::nextChar()
{
    const bool lineWasUsedUp = m_posIntCurrentLine >= m_currentLine.length();

    for ( ;; )
    {
        const bool needsNewLine = m_posIntCurrentLine >= m_currentLine.length() || m_currentLine.isEmpty() || m_currentLine.isNull();
        if ( !needsNewLine || m_textStream->atEnd() )
            break;

        m_currentLine = m_textStream->readLine();
        m_posIntCurrentLine = 0;
        ++m_currentLineNumber;
    }

    if ( lineWasUsedUp )
        return TQChar( ' ' );

    if ( m_posIntCurrentLine >= m_currentLine.length() )
        return TQChar();

    const TQChar current = m_currentLine[m_posIntCurrentLine];
    ++m_posIntCurrentLine;
    return current;
}
/**
 * Get a token's name for use in debug messages.
 *
 * @param token token to name
 * @return the token's name without the leading "t", or "<Unknown>" for a
 *         value not handled here
 */
TQString FileImporterBibTeX::tokenidToString( Token token )
{
    const char *name = "<Unknown>";

    switch ( token )
    {
    case tAt:
        name = "At";
        break;
    case tBracketClose:
        name = "BracketClose";
        break;
    case tBracketOpen:
        name = "BracketOpen";
        break;
    case tAlphaNumText:
        name = "AlphaNumText";
        break;
    case tAssign:
        name = "Assign";
        break;
    case tComma:
        name = "Comma";
        break;
    case tDoublecross:
        name = "Doublecross";
        break;
    case tEOF:
        name = "EOF";
        break;
    case tUnknown:
        name = "Unknown";
        break;
    default:
        break;
    }

    return TQString( name );
}
}