You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
tellico/src/translators/risimporter.cpp

337 lines
14 KiB

/***************************************************************************
copyright : (C) 2004-2006 by Robby Stephenson
email : robby@periapsis.org
***************************************************************************/
/***************************************************************************
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of version 2 of the GNU General Public License as *
* published by the Free Software Foundation; *
* *
***************************************************************************/
#include "risimporter.h"
#include "../collections/bibtexcollection.h"
#include "../document.h"
#include "../entry.h"
#include "../field.h"
#include "../latin1literal.h"
#include "../progressmanager.h"
#include "../filehandler.h"
#include "../isbnvalidator.h"
#include "../tellico_debug.h"
#include <tdeapplication.h>
#include <tqdict.h>
#include <tqregexp.h>
#include <tqmap.h>
using Tellico::Import::RISImporter;
// RIS two-letter tag -> default field name; allocated on first use by initTagMap()
TQMap<TQString, TQString>* RISImporter::s_tagMap = 0;
// RIS reference type (the value of a TY line) -> entry-type name; allocated on first use by initTypeMap()
TQMap<TQString, TQString>* RISImporter::s_typeMap = 0;
// static
void RISImporter::initTagMap() {
if(!s_tagMap) {
s_tagMap = new TQMap<TQString, TQString>();
// BT is special and is handled separately
s_tagMap->insert(TQString::fromLatin1("TY"), TQString::fromLatin1("entry-type"));
s_tagMap->insert(TQString::fromLatin1("ID"), TQString::fromLatin1("bibtex-key"));
s_tagMap->insert(TQString::fromLatin1("T1"), TQString::fromLatin1("title"));
s_tagMap->insert(TQString::fromLatin1("TI"), TQString::fromLatin1("title"));
s_tagMap->insert(TQString::fromLatin1("T2"), TQString::fromLatin1("booktitle"));
s_tagMap->insert(TQString::fromLatin1("A1"), TQString::fromLatin1("author"));
s_tagMap->insert(TQString::fromLatin1("AU"), TQString::fromLatin1("author"));
s_tagMap->insert(TQString::fromLatin1("ED"), TQString::fromLatin1("editor"));
s_tagMap->insert(TQString::fromLatin1("YR"), TQString::fromLatin1("year"));
s_tagMap->insert(TQString::fromLatin1("PY"), TQString::fromLatin1("year"));
s_tagMap->insert(TQString::fromLatin1("N1"), TQString::fromLatin1("note"));
s_tagMap->insert(TQString::fromLatin1("AB"), TQString::fromLatin1("abstract")); // should be note?
s_tagMap->insert(TQString::fromLatin1("N2"), TQString::fromLatin1("abstract"));
s_tagMap->insert(TQString::fromLatin1("KW"), TQString::fromLatin1("keyword"));
s_tagMap->insert(TQString::fromLatin1("JF"), TQString::fromLatin1("journal"));
s_tagMap->insert(TQString::fromLatin1("JO"), TQString::fromLatin1("journal"));
s_tagMap->insert(TQString::fromLatin1("JA"), TQString::fromLatin1("journal"));
s_tagMap->insert(TQString::fromLatin1("VL"), TQString::fromLatin1("volume"));
s_tagMap->insert(TQString::fromLatin1("IS"), TQString::fromLatin1("number"));
s_tagMap->insert(TQString::fromLatin1("PB"), TQString::fromLatin1("publisher"));
s_tagMap->insert(TQString::fromLatin1("SN"), TQString::fromLatin1("isbn"));
s_tagMap->insert(TQString::fromLatin1("AD"), TQString::fromLatin1("address"));
s_tagMap->insert(TQString::fromLatin1("CY"), TQString::fromLatin1("address"));
s_tagMap->insert(TQString::fromLatin1("UR"), TQString::fromLatin1("url"));
s_tagMap->insert(TQString::fromLatin1("L1"), TQString::fromLatin1("pdf"));
s_tagMap->insert(TQString::fromLatin1("T3"), TQString::fromLatin1("series"));
s_tagMap->insert(TQString::fromLatin1("EP"), TQString::fromLatin1("pages"));
}
}
// static
void RISImporter::initTypeMap() {
if(!s_typeMap) {
s_typeMap = new TQMap<TQString, TQString>();
// leave capitalized, except for bibtex types
s_typeMap->insert(TQString::fromLatin1("ABST"), TQString::fromLatin1("Abstract"));
s_typeMap->insert(TQString::fromLatin1("ADVS"), TQString::fromLatin1("Audiovisual material"));
s_typeMap->insert(TQString::fromLatin1("ART"), TQString::fromLatin1("Art Work"));
s_typeMap->insert(TQString::fromLatin1("BILL"), TQString::fromLatin1("Bill/Resolution"));
s_typeMap->insert(TQString::fromLatin1("BOOK"), TQString::fromLatin1("book")); // bibtex
s_typeMap->insert(TQString::fromLatin1("CASE"), TQString::fromLatin1("Case"));
s_typeMap->insert(TQString::fromLatin1("CHAP"), TQString::fromLatin1("inbook")); // == "inbook" ?
s_typeMap->insert(TQString::fromLatin1("COMP"), TQString::fromLatin1("Computer program"));
s_typeMap->insert(TQString::fromLatin1("CONF"), TQString::fromLatin1("inproceedings")); // == "conference" ?
s_typeMap->insert(TQString::fromLatin1("CTLG"), TQString::fromLatin1("Catalog"));
s_typeMap->insert(TQString::fromLatin1("DATA"), TQString::fromLatin1("Data file"));
s_typeMap->insert(TQString::fromLatin1("ELEC"), TQString::fromLatin1("Electronic Citation"));
s_typeMap->insert(TQString::fromLatin1("GEN"), TQString::fromLatin1("Generic"));
s_typeMap->insert(TQString::fromLatin1("HEAR"), TQString::fromLatin1("Hearing"));
s_typeMap->insert(TQString::fromLatin1("ICOMM"), TQString::fromLatin1("Internet Communication"));
s_typeMap->insert(TQString::fromLatin1("INPR"), TQString::fromLatin1("In Press"));
s_typeMap->insert(TQString::fromLatin1("JFULL"), TQString::fromLatin1("Journal (full)")); // = "periodical" ?
s_typeMap->insert(TQString::fromLatin1("JOUR"), TQString::fromLatin1("article")); // "Journal"
s_typeMap->insert(TQString::fromLatin1("MAP"), TQString::fromLatin1("Map"));
s_typeMap->insert(TQString::fromLatin1("MGZN"), TQString::fromLatin1("article")); // bibtex
s_typeMap->insert(TQString::fromLatin1("MPCT"), TQString::fromLatin1("Motion picture"));
s_typeMap->insert(TQString::fromLatin1("MUSIC"), TQString::fromLatin1("Music score"));
s_typeMap->insert(TQString::fromLatin1("NEWS"), TQString::fromLatin1("Newspaper"));
s_typeMap->insert(TQString::fromLatin1("PAMP"), TQString::fromLatin1("Pamphlet")); // = "booklet" ?
s_typeMap->insert(TQString::fromLatin1("PAT"), TQString::fromLatin1("Patent"));
s_typeMap->insert(TQString::fromLatin1("PCOMM"), TQString::fromLatin1("Personal communication"));
s_typeMap->insert(TQString::fromLatin1("RPRT"), TQString::fromLatin1("Report")); // = "techreport" ?
s_typeMap->insert(TQString::fromLatin1("SER"), TQString::fromLatin1("Serial (BookMonograph)"));
s_typeMap->insert(TQString::fromLatin1("SLIDE"), TQString::fromLatin1("Slide"));
s_typeMap->insert(TQString::fromLatin1("SOUND"), TQString::fromLatin1("Sound recording"));
s_typeMap->insert(TQString::fromLatin1("STAT"), TQString::fromLatin1("Statute"));
s_typeMap->insert(TQString::fromLatin1("THES"), TQString::fromLatin1("phdthesis")); // "mastersthesis" ?
s_typeMap->insert(TQString::fromLatin1("UNBILL"), TQString::fromLatin1("Unenacted bill/resolution"));
s_typeMap->insert(TQString::fromLatin1("UNPB"), TQString::fromLatin1("unpublished")); // bibtex
s_typeMap->insert(TQString::fromLatin1("VIDEO"), TQString::fromLatin1("Video recording"));
}
}
// Stores the URLs to import via the base Importer and makes sure the shared
// tag and type lookup tables exist; no file is read until collection() is called.
RISImporter::RISImporter(const KURL::List& urls_)
    : Tellico::Import::Importer(urls_)
    , m_coll(0)
    , m_cancelled(false) {
  // both calls do nothing once their table has been allocated
  initTagMap();
  initTypeMap();
}
// Returns true only when the given collection type is Data::Collection::Bibtex.
bool RISImporter::canImport(int type) const {
  const bool isBibtex = (type == Data::Collection::Bibtex);
  return isBibtex;
}
// Builds a new bibtex collection from every URL handed to the importer.
// A collection built by an earlier call is returned unchanged.
// Returns a null pointer when the user cancels the import.
Tellico::Data::CollPtr RISImporter::collection() {
  if(m_coll) {
    return m_coll;
  }

  m_coll = new Data::BibtexCollection(true);

  const TQString risKey = TQString::fromLatin1("ris");
  // fields that carry an explicit "ris" property, keyed by that RIS tag
  TQDict<Data::Field> fieldsByRisTag;

  // any field of the currently open collection that has a "ris" property
  // must also be present in the new collection, with the same property
  Data::CollPtr docColl = Data::Document::self()->collection();
  Data::FieldVec docFields = docColl->fields();
  for(Data::FieldVec::Iterator fIt = docFields.begin(); fIt != docFields.end(); ++fIt) {
    TQString risTag = fIt->property(risKey);
    if(!risTag.isEmpty()) {
      // reuse a field of the same name in the new collection, otherwise copy it over
      Data::FieldPtr target = m_coll->fieldByName(fIt->name());
      if(!target) {
        target = new Data::Field(*fIt);
        m_coll->addField(target);
      }
      target->setProperty(risKey, risTag);
      fieldsByRisTag.insert(risTag, target);
    }
  }

  // each URL gets 100 progress steps
  ProgressItem& item = ProgressManager::self()->newProgressItem(this, progressLabel(), true);
  item.setTotalSteps(urls().count() * 100);
  connect(&item, TQT_SIGNAL(signalCancelled(ProgressItem*)), TQT_SLOT(slotCancel()));
  ProgressItem::Done done(this);

  KURL::List sources = urls();
  int index = 0;
  for(KURL::List::ConstIterator uIt = sources.begin(); uIt != sources.end() && !m_cancelled; ++uIt, ++index) {
    readURL(*uIt, index, fieldsByRisTag);
  }

  // a cancelled import discards whatever was read so far
  if(m_cancelled) {
    m_coll = 0;
  }
  return m_coll;
}
void RISImporter::readURL(const KURL& url_, int n, const TQDict<Data::Field>& risFields_) {
TQString str = FileHandler::readTextFile(url_);
if(str.isEmpty()) {
return;
}
ISBNValidator isbnval(this);
TQTextIStream t(&str);
const uint length = str.length();
const uint stepSize = TQMAX(s_stepSize, length/100);
const bool showProgress = options() & ImportProgress;
bool needToAddFinal = false;
TQString sp, ep;
uint j = 0;
Data::EntryPtr entry = new Data::Entry(m_coll);
// technically, the spec requires a space immediately after the hyphen
// however, at least one website (Springer) outputs RIS with no space after the final "ER -"
// so just strip the white space later
// also be gracious and allow any amount of space before hyphen
TQRegExp rx(TQString::fromLatin1("^(\\w\\w)\\s+-(.*)$"));
TQString currLine, nextLine;
for(currLine = t.readLine(); !m_cancelled && !currLine.isNull(); currLine = nextLine, j += currLine.length()) {
nextLine = t.readLine();
rx.search(currLine);
TQString tag = rx.cap(1);
TQString value = rx.cap(2).stripWhiteSpace();
if(tag.isEmpty()) {
continue;
}
// myDebug() << tag << ": " << value << endl;
// if the next line is not empty and does not match start regexp, append to value
while(!nextLine.isEmpty() && nextLine.find(rx) == -1) {
value += nextLine.stripWhiteSpace();
nextLine = t.readLine();
}
// every entry ends with "ER"
if(tag == Latin1Literal("ER")) {
m_coll->addEntries(entry);
entry = new Data::Entry(m_coll);
needToAddFinal = false;
continue;
} else if(tag == Latin1Literal("TY") && s_typeMap->contains(value)) {
// for entry-type, switch it to normalized type name
value = (*s_typeMap)[value];
} else if(tag == Latin1Literal("SN")) {
// test for valid isbn, sometimes the issn gets stuck here
int pos = 0;
if(isbnval.validate(value, pos) != ISBNValidator::Acceptable) {
continue;
}
} else if(tag == Latin1Literal("SP")) {
sp = value;
if(!ep.isEmpty()) {
value = sp + '-' + ep;
tag = TQString::fromLatin1("EP");
sp = TQString();
ep = TQString();
} else {
// nothing else to do
continue;
}
} else if(tag == Latin1Literal("EP")) {
ep = value;
if(!sp.isEmpty()) {
value = sp + '-' + ep;
sp = TQString();
ep = TQString();
} else {
continue;
}
} else if(tag == Latin1Literal("YR") || tag == Latin1Literal("PY")) { // for now, just grab the year
value = value.section('/', 0, 0);
}
// the lookup scheme is:
// 1. any field has an RIS property that matches the tag name
// 2. default field mapping tag -> field name
Data::FieldPtr f = risFields_.find(tag);
if(!f) {
// special case for BT
// primary title for books, secondary for everything else
if(tag == Latin1Literal("BT")) {
if(entry->field(TQString::fromLatin1("entry-type")) == Latin1Literal("book")) {
f = m_coll->fieldByName(TQString::fromLatin1("title"));
} else {
f = m_coll->fieldByName(TQString::fromLatin1("booktitle"));
}
} else {
f = fieldByTag(tag);
}
}
if(!f) {
continue;
}
needToAddFinal = true;
// harmless for non-choice fields
// for entry-type, want it in lower case
f->addAllowed(value);
// if the field can have multiple values, append current values to new value
if((f->flags() & Data::Field::AllowMultiple) && !entry->field(f->name()).isEmpty()) {
value.prepend(entry->field(f->name()) + TQString::fromLatin1("; "));
}
entry->setField(f, value);
if(showProgress && j%stepSize == 0) {
ProgressManager::self()->setProgress(this, n*100 + 100*j/length);
kapp->processEvents();
}
}
if(needToAddFinal) {
m_coll->addEntries(entry);
}
}
// Returns the field of m_coll that an RIS tag maps to by default.
//
// When the tag has a default mapping and the mapped field exists in m_coll,
// that field gets its "ris" property set to the tag and is returned.
// Otherwise, for the "L1" tag only, a new URL field named "pdf" is created,
// added to m_coll and returned.
// Returns a null pointer for every other tag.
Tellico::Data::FieldPtr RISImporter::fieldByTag(const TQString& tag_) {
  Data::FieldPtr f = 0;

  // check with contains() first: operator[] on the non-const shared map would
  // insert an empty entry for every unknown tag that comes along
  if(s_tagMap->contains(tag_)) {
    const TQString fieldTag = (*s_tagMap)[tag_];
    if(!fieldTag.isEmpty()) {
      f = m_coll->fieldByName(fieldTag);
      if(f) {
        f->setProperty(TQString::fromLatin1("ris"), tag_);
        return f;
      }
    }
  }

  // add non-default fields if not already there
  if(tag_ == Latin1Literal("L1")) {
    f = new Data::Field(TQString::fromLatin1("pdf"), i18n("PDF"), Data::Field::URL);
    f->setProperty(TQString::fromLatin1("ris"), TQString::fromLatin1("L1"));
    f->setCategory(i18n("Miscellaneous"));
    // only add a field that was actually created, never a null pointer
    m_coll->addField(f);
  }
  return f;
}
// Slot connected in collection() to the progress item's signalCancelled().
// It only raises the flag; the loops in collection() and readURL() test
// m_cancelled in their conditions and stop on their next iteration.
void RISImporter::slotCancel() {
m_cancelled = true;
}
bool RISImporter::maybeRIS(const KURL& url_) {
TQString text = FileHandler::readTextFile(url_, true /*quiet*/);
if(text.isEmpty()) {
return false;
}
// bare bones check, strip white space at beginning
// and then first text line must be valid RIS
TQTextIStream t(&text);
TQRegExp rx(TQString::fromLatin1("^(\\w\\w)\\s+-(.*)$"));
TQString currLine;
for(currLine = t.readLine(); !currLine.isNull(); currLine = t.readLine()) {
if(currLine.stripWhiteSpace().isEmpty()) {
continue;
}
break;
}
return rx.exactMatch(currLine);
}
#include "risimporter.moc"