You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
tellico/src/fetch/entrezfetcher.cpp

499 lines
16 KiB

/***************************************************************************
copyright : (C) 2005-2006 by Robby Stephenson
email : robby@periapsis.org
***************************************************************************/
/***************************************************************************
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of version 2 of the GNU General Public License as *
* published by the Free Software Foundation; *
* *
***************************************************************************/
#include "entrezfetcher.h"
#include "../tellico_kernel.h"
#include "../latin1literal.h"
#include "../collection.h"
#include "../entry.h"
#include "../filehandler.h"
#include "../translators/xslthandler.h"
#include "../translators/tellicoimporter.h"
#include "../tellico_debug.h"
#include <klocale.h>
#include <kconfig.h>
#include <kstandarddirs.h>
#include <kio/job.h>
#include <tqdom.h>
#include <tqlabel.h>
#include <tqlayout.h>
#include <tqfile.h>
//#define ENTREZ_TEST
namespace {
  // Upper bound on the number of document summaries requested per esummary call.
  static const int ENTREZ_MAX_RETURNS_TOTAL = 25;
  // Root of the NCBI E-utilities. NCBI only serves the E-utilities over HTTPS,
  // so the secure scheme is used here; each CGI name below is appended to it.
  static const char* ENTREZ_BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/";
  // CGI used by search() to run the query.
  static const char* ENTREZ_SEARCH_CGI = "esearch.fcgi";
  // CGI used by doSummary() to retrieve the document summaries.
  static const char* ENTREZ_SUMMARY_CGI = "esummary.fcgi";
  // CGI used by fetchEntry() to download a full record.
  static const char* ENTREZ_FETCH_CGI = "efetch.fcgi";
  // CGI used by fetchEntry() to look up an external link for a record.
  static const char* ENTREZ_LINK_CGI = "elink.fcgi";
  // Database queried when the configuration does not name one.
  static const char* ENTREZ_DEFAULT_DATABASE = "pubmed";
}
using Tellico::Fetch::EntrezFetcher;
// Builds an idle fetcher: no stylesheet handler loaded yet, the state machine
// parked at Begin, and no search in progress.
EntrezFetcher::EntrezFetcher(TQObject* parent_, const char* name_)
    : Fetcher(parent_, name_)
    , m_xsltHandler(0)
    , m_step(Begin)
    , m_started(false) {
}
// Releases the stylesheet handler. It is allocated with new in
// initXSLTHandler() and has no other owner, so it must be freed here.
EntrezFetcher::~EntrezFetcher() {
  delete m_xsltHandler; // deleting a null pointer is a no-op
  m_xsltHandler = 0;
}
// Translated name under which this source is presented when no custom name is set.
TQString EntrezFetcher::defaultName() {
  TQString label = i18n("Entrez Database");
  return label;
}
// Name of this source: the configured name, or the default one when none is set.
TQString EntrezFetcher::source() const {
  if(m_name.isEmpty()) {
    return defaultName();
  }
  return m_name;
}
// Only bibliographic (Bibtex) collections can be filled from this source.
bool EntrezFetcher::canFetch(int type) const {
  const bool isBibliography = (type == Data::Collection::Bibtex);
  return isBibliography;
}
void EntrezFetcher::readConfigHook(const KConfigGroup& config_) {
TQString s = config_.readEntry("Database", TQString::fromLatin1(ENTREZ_DEFAULT_DATABASE)); // default to pubmed
if(!s.isEmpty()) {
m_dbname = s;
}
m_fields = config_.readListEntry("Custom Fields");
}
// Starts a new search for value_, restricted according to key_.
// Resets the paging state, builds an esearch request and launches it
// asynchronously; incoming bytes are collected by slotData() and the finished
// reply is dispatched by slotComplete().
// Calls stop() and returns without issuing a request when the current
// collection is not a bibliography or when key_ is not handled below.
void EntrezFetcher::search(FetchKey key_, const TQString& value_) {
// mark as started first, since stop() does nothing unless m_started is set
m_started = true;
// paging restarts at the first record; the total is unknown until the reply is parsed
m_start = 1;
m_total = -1;
// only search if current collection is a bibliography
if(!canFetch(Kernel::self()->collectionType())) {
myDebug() << "EntrezFetcher::search() - collection type mismatch, stopping" << endl;
stop();
return;
}
// no database configured, so fall back to the default one
if(m_dbname.isEmpty()) {
m_dbname = TQString::fromLatin1(ENTREZ_DEFAULT_DATABASE);
}
#ifdef ENTREZ_TEST
// test builds read a canned reply from a local file instead of querying the server
KURL u = KURL::fromPathOrURL(TQString::fromLatin1("/home/robby/esearch.xml"));
#else
KURL u(TQString::fromLatin1(ENTREZ_BASE_URL));
u.addPath(TQString::fromLatin1(ENTREZ_SEARCH_CGI));
u.addQueryItem(TQString::fromLatin1("tool"), TQString::fromLatin1("Tellico"));
u.addQueryItem(TQString::fromLatin1("retmode"), TQString::fromLatin1("xml"));
// usehistory makes the reply carry the QueryKey and WebEnv values that doSummary() sends back
u.addQueryItem(TQString::fromLatin1("usehistory"), TQString::fromLatin1("y"));
u.addQueryItem(TQString::fromLatin1("retmax"), TQString::fromLatin1("1")); // we're just getting the count
u.addQueryItem(TQString::fromLatin1("db"), m_dbname);
u.addQueryItem(TQString::fromLatin1("term"), value_);
// restrict the term to a single field where the key calls for it
switch(key_) {
case Title:
u.addQueryItem(TQString::fromLatin1("field"), TQString::fromLatin1("titl"));
break;
case Person:
u.addQueryItem(TQString::fromLatin1("field"), TQString::fromLatin1("auth"));
break;
case Keyword:
// for Tellico Keyword searches basically mean search for any field matching
// u.addQueryItem(TQString::fromLatin1("field"), TQString::fromLatin1("word"));
break;
case PubmedID:
u.addQueryItem(TQString::fromLatin1("field"), TQString::fromLatin1("pmid"));
break;
// DOI shares the Raw handling: value_ is appended verbatim to the query string,
// in addition to the "term" item already added above
case DOI:
case Raw:
u.setQuery(u.query() + '&' + value_);
break;
default:
kdWarning() << "EntrezFetcher::search() - FetchKey not supported" << endl;
stop();
return;
}
#endif
// tells slotComplete() to hand the reply to searchResults()
m_step = Search;
// myLog() << "EntrezFetcher::doSearch() - url: " << u.url() << endl;
m_job = KIO::get(u, false, false);
connect(m_job, TQT_SIGNAL(data(KIO::Job*, const TQByteArray&)),
TQT_SLOT(slotData(KIO::Job*, const TQByteArray&)));
connect(m_job, TQT_SIGNAL(result(KIO::Job*)),
TQT_SLOT(slotComplete(KIO::Job*)));
}
// Requests the next batch of summaries for the search already in progress,
// reusing the paging state left behind by the previous batch.
void EntrezFetcher::continueSearch() {
// must be set before the request is made: stop() does nothing unless m_started is set
m_started = true;
doSummary();
}
// Ends the current search, if any: kills a pending job, discards buffered
// data, resets the state machine and emits signalDone().
// Does nothing when no search has been started.
void EntrezFetcher::stop() {
  if(m_started) {
    if(m_job) {
      m_job->kill();
      m_job = 0;
    }
    m_data.truncate(0);
    m_step = Begin;
    m_started = false;
    emit signalDone(this);
  }
}
// Appends a chunk of downloaded bytes to the reply buffer m_data.
void EntrezFetcher::slotData(KIO::Job*, const TQByteArray& data_) {
  TQDataStream appender(m_data, IO_WriteOnly | IO_Append);
  const char* bytes = data_.data();
  const uint length = data_.size();
  appender.writeRawBytes(bytes, length);
}
// Called when a transfer finishes. Reports job errors, stops on an empty
// reply, and otherwise hands the buffered data to the parser that matches
// the current step.
void EntrezFetcher::slotComplete(KIO::Job* job_) {
  // the transfer is over, so there is no reason to keep the job pointer around
  m_job = 0;

  if(job_->error()) {
    job_->showErrorDialog(Kernel::self()->widget());
    stop();
    return;
  }

  if(m_data.isEmpty()) {
    myDebug() << "EntrezFetcher::slotComplete() - no data" << endl;
    stop();
    return;
  }

#if 0
  kdWarning() << "Remove debug from entrezfetcher.cpp: " << __LINE__ << endl;
  TQFile f(TQString::fromLatin1("/tmp/test.xml"));
  if(f.open(IO_WriteOnly)) {
    TQTextStream t(&f);
    t.setEncoding(TQTextStream::UnicodeUTF8);
    t << TQCString(m_data, m_data.size()+1);
  }
  f.close();
#endif

  if(m_step == Search) {
    searchResults();
  } else if(m_step == Summary) {
    summaryResults();
  } else {
    // Begin, Fetch and anything else: no reply is expected in these steps
    myLog() << "EntrezFetcher::slotComplete() - wrong step = " << m_step << endl;
    stop();
  }
}
void EntrezFetcher::searchResults() {
TQDomDocument dom;
if(!dom.setContent(m_data, false)) {
kdWarning() << "EntrezFetcher::searchResults() - server did not return valid XML." << endl;
stop();
return;
}
// find Count, QueryKey, and WebEnv elements
int count = 0;
for(TQDomNode n = dom.documentElement().firstChild(); !n.isNull(); n = n.nextSibling()) {
TQDomElement e = n.toElement();
if(e.isNull()) {
continue;
}
if(e.tagName() == Latin1Literal("Count")) {
m_total = e.text().toInt();
++count;
} else if(e.tagName() == Latin1Literal("QueryKey")) {
m_queryKey = e.text();
++count;
} else if(e.tagName() == Latin1Literal("WebEnv")) {
m_webEnv = e.text();
++count;
}
if(count >= 3) {
break; // found them all
}
}
m_data.truncate(0);
doSummary();
}
void EntrezFetcher::doSummary() {
#ifdef ENTREZ_TEST
KURL u = KURL::fromPathOrURL(TQString::fromLatin1("/home/robby/esummary.xml"));
#else
KURL u(TQString::fromLatin1(ENTREZ_BASE_URL));
u.addPath(TQString::fromLatin1(ENTREZ_SUMMARY_CGI));
u.addQueryItem(TQString::fromLatin1("tool"), TQString::fromLatin1("Tellico"));
u.addQueryItem(TQString::fromLatin1("retmode"), TQString::fromLatin1("xml"));
u.addQueryItem(TQString::fromLatin1("retstart"), TQString::number(m_start));
u.addQueryItem(TQString::fromLatin1("retmax"), TQString::number(TQMIN(m_total-m_start-1, ENTREZ_MAX_RETURNS_TOTAL)));
u.addQueryItem(TQString::fromLatin1("usehistory"), TQString::fromLatin1("y"));
u.addQueryItem(TQString::fromLatin1("db"), m_dbname);
u.addQueryItem(TQString::fromLatin1("query_key"), m_queryKey);
u.addQueryItem(TQString::fromLatin1("WebEnv"), m_webEnv);
#endif
m_step = Summary;
// myLog() << "EntrezFetcher::searchResults() - url: " << u.url() << endl;
m_job = KIO::get(u, false, false);
connect(m_job, TQT_SIGNAL(data(KIO::Job*, const TQByteArray&)),
TQT_SLOT(slotData(KIO::Job*, const TQByteArray&)));
connect(m_job, TQT_SIGNAL(result(KIO::Job*)),
TQT_SLOT(slotComplete(KIO::Job*)));
}
void EntrezFetcher::summaryResults() {
TQDomDocument dom;
if(!dom.setContent(m_data, false)) {
kdWarning() << "EntrezFetcher::summaryResults() - server did not return valid XML." << endl;
stop();
return;
}
// top child is eSummaryResult
// all tqchildren are DocSum
for(TQDomNode n = dom.documentElement().firstChild(); !n.isNull(); n = n.nextSibling()) {
TQDomElement e = n.toElement();
if(e.isNull() || e.tagName() != Latin1Literal("DocSum")) {
continue;
}
TQDomNodeList nodes = e.elementsByTagName(TQString::fromLatin1("Id"));
if(nodes.count() == 0) {
myDebug() << "EntrezFetcher::summaryResults() - no Id elements" << endl;
continue;
}
int id = nodes.item(0).toElement().text().toInt();
TQString title, pubdate, authors;
nodes = e.elementsByTagName(TQString::fromLatin1("Item"));
for(uint j = 0; j < nodes.count(); ++j) {
if(nodes.item(j).toElement().attribute(TQString::fromLatin1("Name")) == Latin1Literal("Title")) {
title = nodes.item(j).toElement().text();
} else if(nodes.item(j).toElement().attribute(TQString::fromLatin1("Name")) == Latin1Literal("PubDate")) {
pubdate = nodes.item(j).toElement().text();
} else if(nodes.item(j).toElement().attribute(TQString::fromLatin1("Name")) == Latin1Literal("AuthorList")) {
TQStringList list;
for(TQDomNode aNode = nodes.item(j).firstChild(); !aNode.isNull(); aNode = aNode.nextSibling()) {
// lazy, assume all tqchildren Items are authors
if(aNode.nodeName() == Latin1Literal("Item")) {
list << aNode.toElement().text();
}
}
authors = list.join(TQString::fromLatin1("; "));
}
if(!title.isEmpty() && !pubdate.isEmpty() && !authors.isEmpty()) {
break; // done now
}
}
SearchResult* r = new SearchResult(this, title, pubdate + '/' + authors, TQString());
m_matches.insert(r->uid, id);
emit signalResultFound(r);
}
m_start = m_matches.count() + 1;
m_hasMoreResults = m_start <= m_total;
stop(); // done searching
}
// Returns the full entry for the search result identified by uid_.
// A previously fetched entry is returned from m_entries. Otherwise the record
// is downloaded with efetch, converted through the pubmed2tellico stylesheet,
// imported, optionally given a "url" field obtained via elink, and stored in
// m_entries before being returned.
// Returns 0 when uid_ is unknown, the stylesheet cannot be loaded, the
// download yields no text, or the import produces no usable collection.
Tellico::Data::EntryPtr EntrezFetcher::fetchEntry(uint uid_) {
// if we already grabbed this one, then just pull it out of the dict
// NOTE(review): if m_entries is a map type, operator[] may insert a null entry for an unknown uid - confirm
Data::EntryPtr entry = m_entries[uid_];
if(entry) {
return entry;
}
// the uid does not belong to any result emitted by summaryResults()
if(!m_matches.contains(uid_)) {
return 0;
}
// the stylesheet handler is created lazily, on the first fetch
if(!m_xsltHandler) {
initXSLTHandler();
if(!m_xsltHandler) { // probably an error somewhere in the stylesheet loading
stop();
return 0;
}
}
// record id stored for this result by summaryResults()
int id = m_matches[uid_];
#ifdef ENTREZ_TEST
// test builds read a canned record from a local file instead of querying the server
KURL u = KURL::fromPathOrURL(TQString::fromLatin1("/home/robby/pubmed.xml"));
#else
KURL u(TQString::fromLatin1(ENTREZ_BASE_URL));
u.addPath(TQString::fromLatin1(ENTREZ_FETCH_CGI));
u.addQueryItem(TQString::fromLatin1("tool"), TQString::fromLatin1("Tellico"));
u.addQueryItem(TQString::fromLatin1("retmode"), TQString::fromLatin1("xml"));
u.addQueryItem(TQString::fromLatin1("rettype"), TQString::fromLatin1("abstract"));
u.addQueryItem(TQString::fromLatin1("db"), m_dbname);
u.addQueryItem(TQString::fromLatin1("id"), TQString::number(id));
#endif
// now it's synchronous, and we know that it's utf8
TQString xmlOutput = FileHandler::readTextFile(u, false /*quiet*/, true /*utf8*/);
if(xmlOutput.isEmpty()) {
kdWarning() << "EntrezFetcher::fetchEntry() - unable to download " << u << endl;
return 0;
}
#if 0
kdWarning() << "EntrezFetcher::fetchEntry() - turn me off!" << endl;
TQFile f1(TQString::fromLatin1("/tmp/test-entry.xml"));
if(f1.open(IO_WriteOnly)) {
TQTextStream t(&f1);
t.setEncoding(TQTextStream::UnicodeUTF8);
t << xmlOutput;
}
f1.close();
#endif
// transform the downloaded XML with the stylesheet and import the result
TQString str = m_xsltHandler->applyStylesheet(xmlOutput);
Import::TellicoImporter imp(str);
Data::CollPtr coll = imp.collection();
if(!coll) {
kdWarning() << "EntrezFetcher::fetchEntry() - invalid collection" << endl;
return 0;
}
if(coll->entryCount() == 0) {
myDebug() << "EntrezFetcher::fetchEntry() - no entries in collection" << endl;
return 0;
} else if(coll->entryCount() > 1) {
myDebug() << "EntrezFetcher::fetchEntry() - collection has multiple entries, taking first one" << endl;
}
Data::EntryPtr e = coll->entries().front();
// try to get a link, but only if necessary
if(m_fields.contains(TQString::fromLatin1("url"))) {
KURL link(TQString::fromLatin1(ENTREZ_BASE_URL));
link.addPath(TQString::fromLatin1(ENTREZ_LINK_CGI));
link.addQueryItem(TQString::fromLatin1("tool"), TQString::fromLatin1("Tellico"));
link.addQueryItem(TQString::fromLatin1("cmd"), TQString::fromLatin1("llinks"));
link.addQueryItem(TQString::fromLatin1("db"), m_dbname);
link.addQueryItem(TQString::fromLatin1("dbfrom"), m_dbname);
link.addQueryItem(TQString::fromLatin1("id"), TQString::number(id));
TQDomDocument linkDom = FileHandler::readXMLFile(link, false /* namespace */, true /* quiet */);
// need eLinkResult/LinkSet/IdUrlList/IdUrlSet/ObjUrl/Url
TQDomNode linkNode = linkDom.namedItem(TQString::fromLatin1("eLinkResult"))
.namedItem(TQString::fromLatin1("LinkSet"))
.namedItem(TQString::fromLatin1("IdUrlList"))
.namedItem(TQString::fromLatin1("IdUrlSet"))
.namedItem(TQString::fromLatin1("ObjUrl"))
.namedItem(TQString::fromLatin1("Url"));
if(!linkNode.isNull()) {
// note: this TQString u shadows the KURL u declared above
TQString u = linkNode.toElement().text();
// myDebug() << u << endl;
if(!u.isEmpty()) {
// the imported collection may lack a url field, so create one before setting it
if(!coll->hasField(TQString::fromLatin1("url"))) {
Data::FieldPtr field = new Data::Field(TQString::fromLatin1("url"), i18n("URL"), Data::Field::URL);
field->setCategory(i18n("Miscellaneous"));
coll->addField(field);
}
e->setField(TQString::fromLatin1("url"), u);
}
}
}
// drop every optional field that is not listed in m_fields
const StringMap customFields = EntrezFetcher::customFields();
for(StringMap::ConstIterator it = customFields.begin(); it != customFields.end(); ++it) {
if(!m_fields.contains(it.key())) {
coll->removeField(it.key());
}
}
// remember the entry so that a repeated request for this uid returns it directly
m_entries.insert(uid_, e);
return e;
}
void EntrezFetcher::initXSLTHandler() {
TQString xsltfile = locate("appdata", TQString::fromLatin1("pubmed2tellico.xsl"));
if(xsltfile.isEmpty()) {
kdWarning() << "EntrezFetcher::initXSLTHandler() - can not locate pubmed2tellico.xsl." << endl;
return;
}
KURL u;
u.setPath(xsltfile);
if(!m_xsltHandler) {
m_xsltHandler = new XSLTHandler(u);
}
if(!m_xsltHandler->isValid()) {
kdWarning() << "EntrezFetcher::initXSLTHandler() - error in pubmed2tellico.xsl." << endl;
delete m_xsltHandler;
m_xsltHandler = 0;
return;
}
}
// Starts a search that refreshes entry_, using the first non-empty field among
// pmid, doi and title (in that order). Emits signalDone() straight away when
// none of them carries a value.
void EntrezFetcher::updateEntry(Data::EntryPtr entry_) {
//  myDebug() << "EntrezFetcher::updateEntry()" << endl;
  struct Lookup {
    const char* field;
    FetchKey key;
  };
  // ordered from most to least specific identifier
  const Lookup lookups[] = {
    { "pmid",  PubmedID },
    { "doi",   DOI },
    { "title", Title }
  };
  const int lookupCount = sizeof(lookups) / sizeof(lookups[0]);

  for(int i = 0; i < lookupCount; ++i) {
    const TQString value = entry_->field(TQString::fromLatin1(lookups[i].field));
    if(!value.isEmpty()) {
      search(lookups[i].key, value);
      return;
    }
  }

  myDebug() << "EntrezFetcher::updateEntry() - insufficient info to search" << endl;
  emit signalDone(this); // must always be emitted when no search is started
}
// Creates the configuration widget for this fetcher as a child of parent_.
Tellico::Fetch::ConfigWidget* EntrezFetcher::configWidget(TQWidget* parent_) const {
  EntrezFetcher::ConfigWidget* widget = new EntrezFetcher::ConfigWidget(parent_, this);
  return widget;
}
// Builds the configuration page: a notice that there are no source-specific
// options, followed by the optional-fields selector. When fetcher_ is given,
// its currently selected fields are passed to the selector.
EntrezFetcher::ConfigWidget::ConfigWidget(TQWidget* parent_, const EntrezFetcher* fetcher_/*=0*/)
    : Fetch::ConfigWidget(parent_) {
  TQVBoxLayout* layout = new TQVBoxLayout(optionsWidget());
  TQLabel* notice = new TQLabel(i18n("This source has no options."), optionsWidget());
  layout->addWidget(notice);
  layout->addStretch();

  // the additional-fields widget goes last
  TQStringList selected;
  if(fetcher_) {
    selected = fetcher_->m_fields;
  }
  addFieldsWidget(EntrezFetcher::customFields(), selected);
}
// Writes the field selection to config_ and then clears the modified flag.
void EntrezFetcher::ConfigWidget::saveConfig(KConfigGroup& config_) {
saveFieldsConfig(config_);
slotSetModified(false);
}
// The preferred name of a source configured through this widget is the fetcher's default name.
TQString EntrezFetcher::ConfigWidget::preferredName() const {
  TQString name = EntrezFetcher::defaultName();
  return name;
}
// static
// Optional fields this source can provide, keyed by field name and mapped to
// the translated label.
Tellico::StringMap EntrezFetcher::customFields() {
  StringMap fields;
  fields.insert(TQString::fromLatin1("institution"), i18n("Institution"));
  fields.insert(TQString::fromLatin1("abstract"), i18n("Abstract"));
  fields.insert(TQString::fromLatin1("url"), i18n("URL"));
  return fields;
}
#include "entrezfetcher.moc"