|
|
|
/***************************************************************************
|
|
|
|
copyright : (C) 2007 by Robby Stephenson
|
|
|
|
email : robby@periapsis.org
|
|
|
|
***************************************************************************/
|
|
|
|
|
|
|
|
/***************************************************************************
|
|
|
|
* *
|
|
|
|
* This program is free software; you can redistribute it and/or modify *
|
|
|
|
* it under the terms of version 2 of the GNU General Public License as *
|
|
|
|
* published by the Free Software Foundation; *
|
|
|
|
* *
|
|
|
|
***************************************************************************/
|
|
|
|
|
|
|
|
#include "pdfimporter.h"
|
|
|
|
#include "tellicoimporter.h"
|
|
|
|
#include "xslthandler.h"
|
|
|
|
#include "../collections/bibtexcollection.h"
|
|
|
|
#include "../xmphandler.h"
|
|
|
|
#include "../filehandler.h"
|
|
|
|
#include "../imagefactory.h"
|
|
|
|
#include "../tellico_kernel.h"
|
|
|
|
#include "../fetch/fetchmanager.h"
|
|
|
|
#include "../fetch/crossreffetcher.h"
|
|
|
|
#include "../tellico_utils.h"
|
|
|
|
#include "../progressmanager.h"
|
|
|
|
#include "../core/netaccess.h"
|
|
|
|
#include "../tellico_debug.h"
|
|
|
|
|
|
|
|
#include <kstandarddirs.h>
|
|
|
|
#include <tdemessagebox.h>
|
|
|
|
|
|
|
|
#include <config.h>
|
|
|
|
#ifdef HAVE_POPPLER
|
|
|
|
#include <poppler-qt.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
namespace {
  // Size argument handed to NetAccess::filePreview() when a preview image is
  // generated for an imported PDF (presumably an edge length in pixels).
  // Names in an unnamed namespace are already local to this file.
  const int PDF_FILE_PREVIEW_SIZE = 196;
}
|
|
|
|
|
|
|
|
using Tellico::Import::PDFImporter;
|
|
|
|
|
|
|
|
// Passes the url list on to the Importer base class; a freshly created
// importer has not been cancelled.
PDFImporter::PDFImporter(const KURL::List& urls_)
    : Importer(urls_)
    , m_cancelled(false) {
}
|
|
|
|
|
|
|
|
bool PDFImporter::canImport(int type_) const {
|
|
|
|
return type_ == Data::Collection::Bibtex;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * Builds a collection holding one entry for every PDF url given to the importer.
 *
 * For each url the following steps are performed:
 *  - the XMP packet is extracted and converted through xmp2tellico.xsl,
 *  - if poppler is available, the document info (Title, Author, Keywords) and
 *    the text of the first page are read; the text is searched for a DOI and
 *    an arXiv identifier,
 *  - the "url" and "entry-type" fields are set and a file preview is stored
 *    in an image field.
 *
 * Afterwards, entries are updated through the DOI and arXiv update fetchers
 * when such identifiers were found, and entries that still have no title get
 * the file name of their url as title.
 *
 * @return the collection, or a null pointer when the stylesheet can not be
 *         located or is invalid, when the import was cancelled, or when no
 *         file could be read at all
 */
Tellico::Data::CollPtr PDFImporter::collection() {
  TQString xsltfile = ::locate("appdata", TQString::fromLatin1("xmp2tellico.xsl"));
  if(xsltfile.isEmpty()) {
    kdWarning() << "PDFImporter::collection() - can not locate xmp2tellico.xsl" << endl;
    return 0;
  }

  ProgressItem& item = ProgressManager::self()->newProgressItem(this, progressLabel(), true);
  item.setTotalSteps(urls().count());
  connect(&item, TQT_SIGNAL(signalCancelled(ProgressItem*)), TQT_SLOT(slotCancel()));
  ProgressItem::Done done(this);
  const bool showProgress = options() & ImportProgress;

  KURL xsltUrl;
  xsltUrl.setPath(xsltfile);

  XSLTHandler xsltHandler(xsltUrl);
  if(!xsltHandler.isValid()) {
    kdWarning() << "PDFImporter::collection() - invalid xslt in xmp2tellico.xsl" << endl;
    return 0;
  }

  // remember whether any entry carries an identifier worth looking up later
  bool hasDOI = false;
  bool hasArxiv = false;

  // index of the url being handled, used for the progress report
  uint j = 0;

  Data::CollPtr coll;
  XMPHandler xmpHandler;
  KURL::List list = urls();
  for(KURL::List::Iterator it = list.begin(); it != list.end() && !m_cancelled; ++it, ++j) {
    FileHandler::FileRef* ref = FileHandler::fileRef(*it);
    if(!ref) {
      continue;
    }

    Data::CollPtr newColl;
    Data::EntryPtr entry;

    TQString xmp = xmpHandler.extractXMP(ref->fileName());
    // myDebug() << xmp << endl;
    if(xmp.isEmpty()) {
      setStatusMessage(i18n("Tellico was unable to read any metadata from the PDF file."));
    } else {
      setStatusMessage(TQString());

      Import::TellicoImporter importer(xsltHandler.applyStylesheet(xmp));
      newColl = importer.collection();
      if(!newColl || newColl->entryCount() == 0) {
        kdWarning() << "PDFImporter::collection() - no collection found" << endl;
        setStatusMessage(i18n("Tellico was unable to read any metadata from the PDF file."));
      } else {
        entry = newColl->entries().front();
        hasDOI |= !entry->field(TQString::fromLatin1("doi")).isEmpty();
      }
    }

    // without usable XMP data, start from an empty collection and entry
    if(!newColl) {
      newColl = new Data::BibtexCollection(true);
    }
    if(!entry) {
      entry = new Data::Entry(newColl);
      newColl->addEntries(entry);
    }

#ifdef HAVE_POPPLER

    // now load from poppler
    Poppler::Document* doc = Poppler::Document::load(ref->fileName());
    if(doc && !doc->isLocked()) {
      // now the question is, do we overwrite XMP data with Poppler data?
      // for now, let's say yes conditionally
      TQString s = doc->getInfo(TQString::fromLatin1("Title")).simplifyWhiteSpace();
      if(!s.isEmpty()) {
        entry->setField(TQString::fromLatin1("title"), s);
      }
      // author could be separated by commas, "and" or whatever
      // we're not going to overwrite it
      if(entry->field(TQString::fromLatin1("author")).isEmpty()) {
        // "and" only counts as a separator when it is a word of its own,
        // otherwise names such as "Alexander" or "Sandberg" would be split
        TQRegExp rx(TQString::fromLatin1("\\s*(\\band\\b|,|;)\\s*"));
        TQStringList authors = TQStringList::split(rx, doc->getInfo(TQString::fromLatin1("Author")).simplifyWhiteSpace());
        entry->setField(TQString::fromLatin1("author"), authors.join(TQString::fromLatin1("; ")));
      }
      s = doc->getInfo(TQString::fromLatin1("Keywords")).simplifyWhiteSpace();
      if(!s.isEmpty()) {
        // keywords are also separated by semi-colons in poppler
        entry->setField(TQString::fromLatin1("keyword"), s);
      }

      // now parse the first page text and try to guess
      Poppler::Page* page = doc->getPage(0);
      if(page) {
        // a null rectangle means get all text on page
        TQString text = page->getText(Poppler::Rectangle());
        // borrowed from Referencer
        TQRegExp rx(TQString::fromLatin1("(?:"
                                         "(?:[Dd][Oo][Ii]:? *)"
                                         "|"
                                         "(?:[Dd]igital *[Oo]bject *[Ii]dentifier:? *)"
                                         ")"
                                         "("
                                         "[^\\.\\s]+"
                                         "\\."
                                         "[^\\/\\s]+"
                                         "\\/"
                                         "[^\\s]+"
                                         ")"));
        if(rx.search(text) > -1) {
          TQString doi = rx.cap(1);
          myDebug() << "PDFImporter::collection() - in PDF file, found DOI: " << doi << endl;
          entry->setField(TQString::fromLatin1("doi"), doi);
          hasDOI = true;
        }
        rx = TQRegExp(TQString::fromLatin1("arXiv:"
                                           "("
                                           "[^\\/\\s]+"
                                           "[\\/\\.]"
                                           "[^\\s]+"
                                           ")"));
        if(rx.search(text) > -1) {
          TQString arxiv = rx.cap(1);
          myDebug() << "PDFImporter::collection() - in PDF file, found arxiv: " << arxiv << endl;
          // the arxiv field is created on demand if the collection lacks it
          if(entry->collection()->fieldByName(TQString::fromLatin1("arxiv")) == 0) {
            Data::FieldPtr field = new Data::Field(TQString::fromLatin1("arxiv"), i18n("arXiv ID"));
            field->setCategory(i18n("Publishing"));
            entry->collection()->addField(field);
          }
          entry->setField(TQString::fromLatin1("arxiv"), arxiv);
          hasArxiv = true;
        }

        delete page;
      }
    } else {
      myDebug() << "PDFImporter::collection() - unable to read PDF info (poppler)" << endl;
    }
    delete doc;
#endif

    entry->setField(TQString::fromLatin1("url"), (*it).url());
    // always an article?
    entry->setField(TQString::fromLatin1("entry-type"), TQString::fromLatin1("article"));

    TQPixmap pix = NetAccess::filePreview(ref->fileName(), PDF_FILE_PREVIEW_SIZE);
    delete ref; // removes temp file

    if(!pix.isNull()) {
      // is png best option?
      TQString id = ImageFactory::addImage(pix, TQString::fromLatin1("PNG"));
      if(!id.isEmpty()) {
        // prefer a "cover" field, then any existing image field,
        // and only create a new "cover" field as a last resort
        Data::FieldPtr field = newColl->fieldByName(TQString::fromLatin1("cover"));
        if(!field && !newColl->imageFields().isEmpty()) {
          field = newColl->imageFields().front();
        } else if(!field) {
          field = new Data::Field(TQString::fromLatin1("cover"), i18n("Front Cover"), Data::Field::Image);
          newColl->addField(field);
        }
        entry->setField(field, id);
      }
    }

    // the first collection becomes the result, later ones are merged into it
    if(coll) {
      coll->addEntries(newColl->entries());
    } else {
      coll = newColl;
    }

    if(showProgress) {
      ProgressManager::self()->setProgress(this, j);
      kapp->processEvents();
    }
  }

  if(m_cancelled) {
    return 0;
  }

  // coll is still null when the url list was empty or none of the files
  // could be opened; everything below dereferences it
  if(!coll) {
    return 0;
  }

  if(hasDOI) {
    myDebug() << "looking for DOI" << endl;
    Fetch::FetcherVec vec = Fetch::Manager::self()->createUpdateFetchers(coll->type(), Fetch::DOI);
    if(vec.isEmpty()) {
      GUI::CursorSaver cs(TQt::arrowCursor);
      KMessageBox::information(Kernel::self()->widget(),
                               i18n("Tellico is able to download information about entries with a DOI from "
                                    "CrossRef.org. However, you must create an CrossRef account and add a new "
                                    "data source with your account information."),
                               TQString(),
                               TQString::fromLatin1("CrossRefSourceNeeded"));
    } else {
      Data::EntryVec entries = coll->entries();
      for(Fetch::FetcherVec::Iterator fetcher = vec.begin(); fetcher != vec.end(); ++fetcher) {
        for(Data::EntryVecIt entry = entries.begin(); entry != entries.end(); ++entry) {
          fetcher->updateEntrySynchronous(entry);
        }
      }
    }
  }

  if(m_cancelled) {
    return 0;
  }

  if(hasArxiv) {
    Data::EntryVec entries = coll->entries();
    Fetch::FetcherVec vec = Fetch::Manager::self()->createUpdateFetchers(coll->type(), Fetch::ArxivID);
    for(Fetch::FetcherVec::Iterator fetcher = vec.begin(); fetcher != vec.end(); ++fetcher) {
      for(Data::EntryVecIt entry = entries.begin(); entry != entries.end(); ++entry) {
        fetcher->updateEntrySynchronous(entry);
      }
    }
  }

  // finally
  Data::EntryVec entries = coll->entries();
  for(Data::EntryVecIt entry = entries.begin(); entry != entries.end(); ++entry) {
    if(entry->title().isEmpty()) {
      // use file name
      KURL u = entry->field(TQString::fromLatin1("url"));
      entry->setField(TQString::fromLatin1("title"), u.fileName());
    }
  }

  if(m_cancelled) {
    return 0;
  }
  return coll;
}
|
|
|
|
|
|
|
|
void PDFImporter::slotCancel() {
|
|
|
|
m_cancelled = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
#include "pdfimporter.moc"
|