You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
kbibtex/src/findduplicates.cpp

445 lines
17 KiB

/***************************************************************************
* Copyright (C) 2004-2009 by Thomas Fischer *
* fischer@unix-ag.uni-kl.de *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
***************************************************************************/
#include <math.h>
#include <tqstring.h>
#include <tqstringlist.h>
#include <tqregexp.h>
#include <tqapplication.h>
#include <kdebug.h>
#include <tdelocale.h>
#include <tdemessagebox.h>
#include <kprogress.h>
#include "element.h"
#include "entry.h"
#include "file.h"
#include "preamble.h"
#include "macro.h"
#include "findduplicates.h"
/* Function-like helper macros used by the distance computations in this file.
 * Being textual macros, the argument that is selected is evaluated a second
 * time, so only side-effect free expressions should be passed. */
#define max(a,b) ((a)>(b)?(a):(b))
#define min(a,b) ((a)<(b)?(a):(b))
namespace KBibTeX
{
/** Factor by which the weighted floating point distances are multiplied
 * before they are stored as unsigned integers (see entryDistance,
 * macroDistance and preambleDistance). */
const unsigned int FindDuplicates::maxDistance = 0xffffff;
/**
 * Search a BibTeX file for duplicate elements.
 * Computes all pairwise distances while showing a cancellable progress
 * dialog and, unless the user cancelled, groups elements whose distance
 * does not exceed the sensitivity threshold.
 * @param result list the found groups of duplicates are appended to
 * @param sensitivity distance threshold passed on to buildClique
 * @param file BibTeX file to search in
 * @param parent parent widget for the progress dialog
 */
FindDuplicates::FindDuplicates( DuplicateCliqueList &result, unsigned int sensitivity, BibTeX::File *file, TQWidget *parent )
        : TQObject( NULL, NULL ), m_doCancel( false )
{
    /** fewer than two elements cannot contain a duplicate pair */
    if ( file->count() < 2 )
        return;

    /** one slot for each unordered pair of elements */
    int pairCount = file->count() * ( file->count() - 1 ) / 2;
    unsigned int *pairDistances = new unsigned int[pairCount];
    /** preset every slot with all bits set; pairs that never get a distance
     * assigned (e.g. elements of different kinds) keep this value */
    for ( int slot = 0; slot < pairCount; ++slot )
        pairDistances[slot] = ~0U;

    TQMap<BibTeX::Element*, int> elementIndices;

    TQApplication::setOverrideCursor( TQt::waitCursor );
    KProgressDialog *progressDialog = new KProgressDialog( parent, NULL, i18n( "Find Duplicates" ), i18n( "Searching for duplicates..." ), true );
    connect( progressDialog, SIGNAL( cancelClicked() ), this, SLOT( slotCancel() ) );
    progressDialog->progressBar()->setTotalSteps( pairCount );

    determineDistances( file, pairDistances, elementIndices, progressDialog );
    /** not every pair is compared, so force the bar to its end position */
    progressDialog->progressBar()->setValue( pairCount );

    /** grouping is skipped if the user cancelled the search */
    if ( m_doCancel == false )
        buildClique( result, file, pairDistances, elementIndices, sensitivity );

    delete progressDialog;
    delete[] pairDistances;
    TQApplication::restoreOverrideCursor();
}
/**
 * Compute the distances between all pairs of elements of the same kind
 * (entry/entry, macro/macro, preamble/preamble) within a BibTeX file.
 * Each computed distance is stored in the distance vector at the position
 * given by arrayOffset; slots of pairs of different kinds are not touched.
 * The loops stop early once the cancel flag has been set.
 * @param file BibTeX file
 * @param distVector inter-element distance vector (will be written)
 * @param mapElementToIndex map from elements to indices (will be written)
 * @param progDlg progress dialog to write status information to
 */
void FindDuplicates::determineDistances( BibTeX::File *file, unsigned int *distVector, TQMap<BibTeX::Element*, int> &mapElementToIndex, KProgressDialog *progDlg )
{
    int stepsDone = 0;
    int row = 0;
    for ( BibTeX::File::ElementList::ConstIterator outer = file->constBegin(); !m_doCancel && outer != file->constEnd(); ++outer, ++row )
    {
        /** case 1: current element is an entry, compare with all later entries */
        if ( BibTeX::Entry *entryA = dynamic_cast<BibTeX::Entry*>( *outer ) )
        {
            mapElementToIndex.insert( entryA, row );
            BibTeX::File::ElementList::ConstIterator inner = outer;
            ++inner;
            for ( int col = row + 1; !m_doCancel && inner != file->constEnd(); ++inner, ++col )
            {
                BibTeX::Entry *entryB = dynamic_cast<BibTeX::Entry*>( *inner );
                if ( entryB != NULL )
                {
                    unsigned int dist = entryDistance( entryA, entryB );
                    distVector[arrayOffset( row, col )] = dist;
                    progDlg->progressBar()->setValue( ++stepsDone );
                    /** keep the user interface responsive (allows cancelling) */
                    tqApp->processEvents();
                }
            }
            continue;
        }

        /** case 2: current element is a macro, compare with all later macros */
        if ( BibTeX::Macro *macroA = dynamic_cast<BibTeX::Macro*>( *outer ) )
        {
            mapElementToIndex.insert( macroA, row );
            BibTeX::File::ElementList::ConstIterator inner = outer;
            ++inner;
            for ( int col = row + 1; !m_doCancel && inner != file->constEnd(); ++inner, ++col )
            {
                BibTeX::Macro *macroB = dynamic_cast<BibTeX::Macro*>( *inner );
                if ( macroB != NULL )
                {
                    distVector[arrayOffset( row, col )] = macroDistance( macroA, macroB );
                    progDlg->progressBar()->setValue( ++stepsDone );
                    tqApp->processEvents();
                }
            }
            continue;
        }

        /** case 3: current element is a preamble, compare with all later preambles */
        if ( BibTeX::Preamble *preambleA = dynamic_cast<BibTeX::Preamble*>( *outer ) )
        {
            mapElementToIndex.insert( preambleA, row );
            BibTeX::File::ElementList::ConstIterator inner = outer;
            ++inner;
            for ( int col = row + 1; !m_doCancel && inner != file->constEnd(); ++inner, ++col )
            {
                BibTeX::Preamble *preambleB = dynamic_cast<BibTeX::Preamble*>( *inner );
                if ( preambleB != NULL )
                {
                    distVector[arrayOffset( row, col )] = preambleDistance( preambleA, preambleB );
                    progDlg->progressBar()->setValue( ++stepsDone );
                    tqApp->processEvents();
                }
            }
        }
        /** any other kind of element is ignored */
    }
}
/**
 * Build a list of groups ("cliques") of BibTeX elements with a distance
 * below or equal to the sensitivity threshold. Groups are grown
 * transitively: starting with a not yet used element, every not yet used
 * element whose stored distance to the element currently taken from the
 * queue is within the threshold joins the same group. Only groups with
 * more than one member are added to the cliqueList parameter.
 * @param cliqueList List of cliques found in this function
 * @param file BibTeX file
 * @param distVector inter-element distance vector
 * @param mapElementToIndex map from elements to indices
 * @param sensitivity sensitivity threshold value
 */
void FindDuplicates::buildClique( DuplicateCliqueList &cliqueList, BibTeX::File *file, unsigned int *distVector, TQMap<BibTeX::Element*, int> &mapElementToIndex, unsigned int sensitivity )
{
    /** one flag per element: set as soon as the element was put into a group */
    int usedLen = file->count();
    bool* used = new bool[usedLen];
    memset( used, false, sizeof( bool ) * usedLen );

    TQValueList<BibTeX::Element*> queue;
    for ( BibTeX::File::ElementList::ConstIterator it1 = file->constBegin(); it1 != file->constEnd(); ++it1 )
    {
        /** current element must be either entry, preamble, or macro */
        BibTeX::Element *elem1 = dynamic_cast<BibTeX::Entry*>( *it1 );
        if ( elem1 == NULL )
            elem1 = dynamic_cast<BibTeX::Macro*>( *it1 );
        if ( elem1 == NULL )
            elem1 = dynamic_cast<BibTeX::Preamble*>( *it1 );
        /** skip element otherwise or if already used */
        if ( elem1 == NULL || used[mapElementToIndex[elem1]] ) continue;

        DuplicateClique clique;
        queue.clear();
        queue.append( elem1 );
        used[mapElementToIndex[elem1]] = true;

        while ( !queue.isEmpty() )
        {
            elem1 = *( queue.begin() );
            queue.remove( queue.begin() );
            int curIndex = mapElementToIndex[elem1];
            clique.append( elem1 );

            for ( BibTeX::File::ElementList::ConstIterator it2 = file->constBegin(); it2 != file->constEnd(); ++it2 )
            {
                /** current element must be either entry, preamble, or macro */
                BibTeX::Element *elem2 = dynamic_cast<BibTeX::Entry*>( *it2 );
                if ( elem2 == NULL )
                    elem2 = dynamic_cast<BibTeX::Macro*>( *it2 );
                if ( elem2 == NULL )
                    elem2 = dynamic_cast<BibTeX::Preamble*>( *it2 );
                /** skip elements of any other kind */
                if ( elem2 == NULL )
                    continue;

                /** look up the index only after the element has been resolved;
                 * indexing the map with a NULL key would insert a bogus entry */
                int otherIndex = mapElementToIndex[elem2];
                /** skip if already used; this also covers elem2 == elem1, so
                 * arrayOffset is never called with two equal indices */
                if ( used[otherIndex] )
                    continue;

                unsigned int distance = distVector[arrayOffset( curIndex, otherIndex )];
                if ( distance <= sensitivity )
                {
                    queue.append( elem2 );
                    used[otherIndex] = true;
                }
            }
        }

        /** a single element is not a duplicate of anything */
        if ( clique.size() > 1 )
            cliqueList.append( clique );
    }
    delete[] used;
}
/**
 * Distance between two BibTeX entries, scaled by maxDistance.
 * The result is a weighted sum of the title distance (60%), the distance
 * between the lists of authors' last names (30%) and a year term (10%).
 * The year term is the squared difference of both years divided by 100,
 * limited to 1.0.
 */
unsigned int FindDuplicates::entryDistance( BibTeX::Entry *entryA, BibTeX::Entry *entryB )
{
    const double titlePart = levenshteinDistance( extractTitle( entryA ), extractTitle( entryB ) );
    const double authorPart = levenshteinDistance( authorsLastName( entryA ), authorsLastName( entryB ) );

    const double yearGap = extractYear( entryA ) - extractYear( entryB );
    double yearPart = yearGap * yearGap / 100.0;
    if ( yearPart > 1.0 )
        yearPart = 1.0;

    const double weighted = titlePart * 0.6 + authorPart * 0.3 + yearPart * 0.1;
    return static_cast<unsigned int>( maxDistance * weighted );
}
/**
 * Distance between two BibTeX macros, scaled by maxDistance.
 * The distance of the macros' keys contributes 70%, the distance of
 * their values 30%.
 */
unsigned int FindDuplicates::macroDistance( BibTeX::Macro *macroA, BibTeX::Macro *macroB )
{
    const double keyPart = levenshteinDistance( extractMacroKey( macroA ), extractMacroKey( macroB ) );
    const double valuePart = levenshteinDistance( extractMacroValue( macroA ), extractMacroValue( macroB ) );
    const double weighted = keyPart * 0.7 + valuePart * 0.3;
    return static_cast<unsigned int>( maxDistance * weighted );
}
/**
 * Distance between two BibTeX preambles, scaled by maxDistance.
 * Only the text of both preambles' values is compared.
 */
unsigned int FindDuplicates::preambleDistance( BibTeX::Preamble *preambleA, BibTeX::Preamble *preambleB )
{
    const double textPart = levenshteinDistance( preambleA->value()->text(), preambleB->value()->text() );
    return static_cast<unsigned int>( maxDistance * textPart );
}
/**
 * Destructor. Nothing to release here: the constructor frees the distance
 * vector and the progress dialog itself before it returns.
 */
FindDuplicates::~FindDuplicates()
{
// nothing
}
/**
* Determine the Levenshtein distance between two sentences,
* where each sentence is in a string (not split into single words).
* See also http://en.wikipedia.org/wiki/Levenshtein_distance
* @param s first sentence
* @param t second sentence
* @return distance between both sentences
*/
double FindDuplicates::levenshteinDistance( const TQString &s, const TQString &t )
{
const TQRegExp nonWordRegExp( "[^a-zA-Z']+" );
if ( s == TQString::null || t == TQString::null ) return 1.0;
return levenshteinDistance( TQStringList::split( nonWordRegExp, s ), TQStringList::split( nonWordRegExp, t ) );
}
/**
* Determine the Levenshtein distance between two words.
* See also http://en.wikipedia.org/wiki/Levenshtein_distance
* @param s first word
* @param t second word
* @return distance between both words
*/
double FindDuplicates::levenshteinDistanceWord( const TQString &s, const TQString &t )
{
TQString mys = s.lower(), myt = t.lower();
int m = s.length(), n = t.length();
if ( m < 1 && n < 1 ) return 0.0;
if ( m < 1 || n < 1 ) return 1.0;
int **d = new int*[m+1];
for ( int i = 0; i <= m; ++i ) {d[i] = new int[n+1]; d[i][0] = i;}
for ( int i = 0; i <= n; ++i ) d[0][i] = i;
for ( int i = 1; i <= m;++i )
for ( int j = 1; j <= n;++j )
{
d[i][j] = d[i-1][j] + 1;
int c = d[i][j-1] + 1;
if ( c < d[i][j] ) d[i][j] = c;
c = d[i-1][j-1] + ( mys[i-1] == myt[j-1] ? 0 : 1 );
if ( c < d[i][j] ) d[i][j] = c;
}
double result = d[m][n];
for ( int i = 0; i <= m; ++i ) delete[] d[i];
delete [] d;
result = result / ( double )max( m, n );
result *= result;
return result;
}
/**
* Determine the Levenshtein distance between two sentences (list of words).
* See also http://en.wikipedia.org/wiki/Levenshtein_distance
* @param s first sentence
* @param t second sentence
* @return distance between both sentences
*/
double FindDuplicates::levenshteinDistance( const TQStringList &s, const TQStringList &t )
{
int m = s.size(), n = t.size();
if ( m < 1 && n < 1 ) return 0.0;
if ( m < 1 || n < 1 ) return 1.0;
double **d = new double*[m+1];
for ( int i = 0; i <= m; ++i ) {d[i] = new double[n+1]; d[i][0] = i;}
for ( int i = 0; i <= n; ++i ) d[0][i] = i;
for ( int i = 1; i <= m;++i )
for ( int j = 1; j <= n;++j )
{
d[i][j] = d[i-1][j] + 1;
double c = d[i][j-1] + 1;
if ( c < d[i][j] ) d[i][j] = c;
c = d[i-1][j-1] + levenshteinDistanceWord( s[i-1], t[j-1] );
if ( c < d[i][j] ) d[i][j] = c;
}
double result = d[m][n];
for ( int i = 0; i <= m; ++i ) delete[] d[i];
delete [] d;
result = result / ( double )max( m, n );
return result;
}
/**
 * Linearize a two-dimensional triangle matrix: map an unordered pair of
 * different indices to a position in a one-dimensional array.
 * The order of both arguments does not matter.
 * @param a first index
 * @param b second index
 * @return position for the pair, or -1 if both indices are equal
 */
int FindDuplicates::arrayOffset( int a, int b )
{
    /** the diagonal is not part of the triangle */
    if ( a == b )
        return -1;

    const int smaller = b < a ? b : a;
    const int larger = b < a ? a : b;
    return larger * ( larger - 1 ) / 2 + smaller;
}
/**
 * Determine title for a given entry.
 * @param entry entry to get the title from
 * @return text of the first value item of the entry's title field, or a
 *         null string if there is no title field or it holds no item
 */
TQString FindDuplicates::extractTitle( BibTeX::Entry *entry )
{
    /** look up the field holding the title */
    BibTeX::EntryField *titleField = entry->getField( BibTeX::EntryField::ftTitle );
    if ( titleField == NULL )
        return TQString::null;

    /** only the first value item is taken into account */
    if ( titleField->value()->items.isEmpty() )
        return TQString::null;
    BibTeX::ValueItem *firstItem = titleField->value()->items.first();
    if ( firstItem == NULL )
        return TQString::null;

    return firstItem->text(); // TODO: Perform some postprocessing?
}
/**
 * Determine list of authors' last names for a given entry.
 * @param entry entry to get the authors from
 * @return last names of all persons found in the first value item of the
 *         author field; empty if there is no author field, the field
 *         holds no item, or the first item is not a person container
 */
TQStringList FindDuplicates::authorsLastName( BibTeX::Entry *entry )
{
    TQStringList lastNames;

    /** look up the field holding the authors */
    BibTeX::EntryField *authorField = entry->getField( BibTeX::EntryField::ftAuthor );
    if ( authorField == NULL )
        return lastNames;

    /** the first value item is expected to be the container of persons */
    BibTeX::ValueItem *firstItem = NULL;
    if ( !authorField->value()->items.isEmpty() )
        firstItem = authorField->value()->items.first();
    BibTeX::PersonContainer *authors = dynamic_cast<BibTeX::PersonContainer*>( firstItem );
    if ( authors == NULL || authors->persons.isEmpty() )
        return lastNames;

    /** collect the last name of every person in the container */
    TQValueList<BibTeX::Person*>::ConstIterator person = authors->persons.begin();
    while ( person != authors->persons.end() )
    {
        lastNames.append(( *person )->lastName() );
        ++person;
    }
    return lastNames;
}
/**
 * Determine year for a given entry.
 * @param entry entry to get the year from
 * @return year parsed from the text of the first value item of the year
 *         field, or -1 if there is no year field, the field holds no
 *         item, or the text is not an integer number
 */
int FindDuplicates::extractYear( BibTeX::Entry *entry )
{
    /** look up the field holding the year */
    BibTeX::EntryField *yearField = entry->getField( BibTeX::EntryField::ftYear );
    if ( yearField == NULL )
        return -1;

    /** only the first value item is taken into account */
    BibTeX::ValueItem *firstItem = NULL;
    if ( !yearField->value()->items.isEmpty() )
        firstItem = yearField->value()->items.first();
    if ( firstItem == NULL )
        return -1;

    /** convert the item's text into a number */
    bool parsed = FALSE;
    const int year = TQString( firstItem->text() ).toInt( &parsed );
    return parsed ? year : -1;
}
/**
 * Determine key from a given macro.
 * @param macro macro to get the key from
 * @return the macro's key
 */
TQString FindDuplicates::extractMacroKey( BibTeX::Macro *macro )
{
    TQString macroKey = macro->key();
    return macroKey;
}
/**
 * Determine value from a given macro.
 * @param macro macro to get the value from
 * @return text of the macro's value
 */
TQString FindDuplicates::extractMacroValue( BibTeX::Macro *macro )
{
    TQString macroText = macro->value()->text();
    return macroText;
}
/**
 * Slot connected to the progress dialog's cancelClicked() signal.
 * Sets the cancel flag, which ends the loops in determineDistances and
 * makes the constructor skip building the groups of duplicates.
 */
void FindDuplicates::slotCancel()
{
m_doCancel = true;
}
}
#include "findduplicates.moc"