You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
kbibtex/src/webqueryciteseerx.cpp

319 lines
11 KiB

/***************************************************************************
* Copyright (C) 2008 by Jacob Kanev <j_kanev@arcor.de>, *
* Thomas Fischer <fischer@unix-ag.uni-kl.de> *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
***************************************************************************/
#include <ntqfile.h>
#include <ntqregexp.h>
#include <ntqbuffer.h>
#include <ntqspinbox.h>
#include <tdelocale.h>
#include <klineedit.h>
#include <tdemessagebox.h>
#include <kurl.h>
#include <kdebug.h>
#include <fileimporterbibtex.h>
#include <encoderxml.h>
#include <settings.h>
#include "webqueryciteseerx.h"
using BibTeX::Value;
using BibTeX::Entry;
using BibTeX::EntryField;
namespace KBibTeX
{
//_______________________________________________________________________________________________________________
// Widget constructor: calls init() and pre-fills the query line edit with the
// search term stored in the settings under the key "CiteSeerX"
WebQueryCiteSeerXWidget::WebQueryCiteSeerXWidget( TQWidget *parent, const char *name )
        : WebQueryWidget( parent, name )
{
    init();

    // fetch the stored search term; a null string is normalized to an empty one
    TQString storedQuery = Settings::self()->getWebQueryDefault( "CiteSeerX" );
    if ( storedQuery.isNull() )
        storedQuery = "";

    // show the term and notify the text-changed handler about it
    lineEditQuery->setText( storedQuery );
    slotTextChanged( storedQuery, true );
}
//_______________________________________________________________________________________________________________
// Construct the query object: remembers the CiteSeerX host name, zeroes the
// hit counter and creates the GUI widget as a child of parent
WebQueryCiteSeerX::WebQueryCiteSeerX( TQWidget* parent )
        : WebQuery( parent ), m_citeSeerXServer( "citeseerx.ist.psu.edu" )
{
    // parseSummaryPage() increments and compares this counter before any other
    // code assigns it, so it has to start from a defined value
    m_receivedHits = 0;

    m_widget = new WebQueryCiteSeerXWidget( parent );
}
//_______________________________________________________________________________________________________________
// Destructor: releases the widget that was allocated in the constructor
WebQueryCiteSeerX::~WebQueryCiteSeerX()
{
    delete m_widget;
}
//_______________________________________________________________________________________________________________
// Translated name of this search engine
TQString WebQueryCiteSeerX::title()
{
    const TQString caption = i18n( "CiteSeerX" );
    return caption;
}
//_______________________________________________________________________________________________________________
// Translated disclaimer text for this search engine
TQString WebQueryCiteSeerX::disclaimer()
{
    const TQString text = i18n( "About CiteSeerX" );
    return text;
}
//_______________________________________________________________________________________________________________
// Address of the web page belonging to the disclaimer text
TQString WebQueryCiteSeerX::disclaimerURL()
{
    return TQString( "http://citeseerx.ist.psu.edu/about/site" );
}
//_______________________________________________________________________________________________________________
// Accessor for the GUI widget created in the constructor
WebQueryWidget *WebQueryCiteSeerX::widget()
{
    return m_widget;
}
//_______________________________________________________________________________________________________________
// Cancel handler: drops every request that is still waiting in the queue,
// so nextJob() finds nothing left to start
void WebQueryCiteSeerX::cancelQuery()
{
    m_queryQueue.clear();
}
//_______________________________________________________________________________________________________________
// main function -- collects all queries for one search
void WebQueryCiteSeerX::query()
{
// store CiteSeerX as future default
WebQuery::query();
Settings *settings = Settings::self();
settings->setWebQueryDefault( "CiteSeerX", m_widget->lineEditQuery->text() );
// read number of desired results from GUI
m_queryQueue.clear();
m_desiredHits = m_widget->spinBoxMaxHits->value();
// one for each entry, and one for each page of 10 links
setNumStages( m_desiredHits + ( m_desiredHits / 10 + 1 ) );
// prepare search term
TQString searchTerm = m_widget->lineEditQuery->text().stripWhiteSpace().replace( '$', "" );
TQStringList queryWords = TQStringList::split( TQRegExp( "\\s+" ), searchTerm );
if ( searchTerm.isEmpty() || queryWords.size() == 0 )
{
setEndSearch( WebQuery::statusInvalidQuery );
return;
}
// build query from search term
TQString query;
for ( uint i = 0; i < queryWords.size(); ++i )
{
if ( i ) query += " AND ";
query += queryWords[i];
}
query = query.replace( "%", "%25" ).replace( "+", "%2B" ).replace( " ", "%20" ).replace( "#", "%23" ).replace( "&", "%26" ).replace( "?", "%3F" );
// schedule jobs
DataRequest dr;
dr.url = KURL( TQString( "http://citeseerx.ist.psu.edu/search?q=" ).append( query ).append( "&submit=Search&sort=rel" ) );
dr.parser = &WebQueryCiteSeerX::parseSummaryPage;
m_queryQueue.push_back( dr );
// start job queue
nextJob();
}
//_______________________________________________________________________________________________________________
// process results from current job
void WebQueryCiteSeerX::parseSummaryPage( const TQString& data )
{
// regexp. for finding paper entries (example: href="/viewdoc/summary;jsessionid=12345ABCD?doi=10.1.1.108.9937")
TQRegExp paperXpr( "href=\"(/viewdoc/summary[^?]*\\?doi=[^\"]+)\"" );
// count paper results and schedule single paper URLs
for ( int p = paperXpr.search( data ); p >= 0; p = paperXpr.search( data, p + paperXpr.matchedLength() ) )
{
if ( ++m_receivedHits > m_desiredHits )
break;
DataRequest dr;
dr.url = KURL( TQString( "http://" ) + m_citeSeerXServer + paperXpr.cap( 1 ) );
dr.parser = &WebQueryCiteSeerX::parsePaperPage;
m_queryQueue.push_back( dr );
}
// if we haven't reached the desired number of hits, schedule the next summary page
TQRegExp nextSummaryXpr( "<a href=\"([^\"]+)\">Next 10" );
if ( m_receivedHits < m_desiredHits )
if ( nextSummaryXpr.search( data ) >= 0 )
{
DataRequest dr;
dr.url = KURL( TQString( "http://" ) + m_citeSeerXServer + nextSummaryXpr.cap( 1 ).replace( "&amp;", "&" ) );
dr.parser = &WebQueryCiteSeerX::parseSummaryPage;
m_queryQueue.push_back( dr );
}
}
//_______________________________________________________________________________________________________________
// parser for the page of one single paper
// Looks for the BibTeX record embedded in data, builds an entry from its type,
// id and selected fields plus the abstract, and emits foundEntry() for it.
// Pages without a BibTeX record are skipped, so no entry with empty type and
// id is published.
void WebQueryCiteSeerX::parsePaperPage( const TQString& data )
{
    // find type and id: @XXX{ YYY,
    TQRegExp typeIdXpr( "@(.*)\\{(.*)," );
    typeIdXpr.setMinimal( true );
    if ( typeIdXpr.search( data ) < 0 )
        return;

    const TQString typeStr = typeIdXpr.cap( 1 );
    const TQString id = typeIdXpr.cap( 2 );

    // create entry
    Entry *entry = new BibTeX::Entry( typeStr, id );

    // find abstract: <..>Abstract:</..> <..> XXX </..>
    parseForSingleExpression( "<[^<]+>Abstract:</[^<]+>\\s*<[^<]+>([^<]+)</[^<]+>", data, entry, BibTeX::EntryField::ftAbstract );
    // find title: title = {XXX}
    parseForSingleExpression( "title = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftTitle );
    // find author: author = {XXX}
    parseForSingleExpression( "author = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftAuthor );
    // find year: year = {XXX}
    parseForSingleExpression( "year = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftYear );
    // find journal: journal = {XXX}
    parseForSingleExpression( "journal = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftJournal );
    // find pages: pages = {XXX}
    parseForSingleExpression( "pages = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftPages );

    // publish what we've found
    emit foundEntry( entry, false );
}
//_______________________________________________________________________________________________________________
// extract one field from a page and attach it to an entry
// description is a regular expression whose first capture group holds the
// field's text; if it matches somewhere in data, a field of the given type
// with that text is added to entry. Without a match, entry stays untouched.
void WebQueryCiteSeerX::parseForSingleExpression( TQString description, const TQString &data, Entry *entry, BibTeX::EntryField::FieldType type )
{
    TQRegExp fieldXpr( description );

    // search() reports -1 when the expression does not match
    if ( fieldXpr.search( data ) == -1 )
        return;

    EntryField *newField = new EntryField( type );
    newField->setValue( new Value( fieldXpr.cap( 1 ), false ) );
    entry->addField( newField );
}
//_______________________________________________________________________________________________________________
// slot connected to a job's result signal (see nextJob)
// Advances the progress display, hands the downloaded text to the parser
// selected for this request, and then starts the next request. The parser is
// skipped if the job is missing, reported an error, is not a stored transfer
// job, or the search has been aborted.
void WebQueryCiteSeerX::getData( TDEIO::Job *job )
{
    // advance GUI progress bar
    enterNextStage();

    if ( job && !job->error() && !m_aborted )
    {
        // only stored transfer jobs carry the downloaded data; the cast yields
        // a null pointer for any other job type, which must not be dereferenced
        TDEIO::StoredTransferJob *storedJob = dynamic_cast<TDEIO::StoredTransferJob*>( job );
        if ( storedJob )
        {
            // copy the raw bytes into a buffer and read them back as text
            TQBuffer data;
            data.open( IO_WriteOnly );
            data.writeBlock( storedJob->data() );
            data.close();
            data.open( IO_ReadOnly );
            TQTextStream ts( &data );
            TQString result = ts.read();
            data.close();

            // hand the read data over to the parser
            ( this->*m_currentParser )( result );
        }
    }

    // proceed
    nextJob();
}
//_______________________________________________________________________________________________________________
// call the next job
void WebQueryCiteSeerX::nextJob()
{
// no more requests: finished
if ( !m_queryQueue.size() )
{
setEndSearch( WebQuery::statusSuccess );
m_receivedHits = 0;
}
// else: take the next request from queue and start it
else if ( !m_aborted )
{
m_currentParser = m_queryQueue.front().parser;
TDEIO::Job *job = TDEIO::storedGet( m_queryQueue.front().url, FALSE, FALSE );
m_queryQueue.pop_front();
connect( job, SIGNAL( result( TDEIO::Job * ) ), this, SLOT( getData( TDEIO::Job * ) ) );
}
}
}
#include "webqueryciteseerx.moc"