|
|
|
/***************************************************************************
|
|
|
|
* Copyright (C) 2004-2009 by Thomas Fischer *
|
|
|
|
* fischer@unix-ag.uni-kl.de *
|
|
|
|
* *
|
|
|
|
* This program is free software; you can redistribute it and/or modify *
|
|
|
|
* it under the terms of the GNU General Public License as published by *
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or *
|
|
|
|
* (at your option) any later version. *
|
|
|
|
* *
|
|
|
|
* This program is distributed in the hope that it will be useful, *
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
|
|
|
|
* GNU General Public License for more details. *
|
|
|
|
* *
|
|
|
|
* You should have received a copy of the GNU General Public License *
|
|
|
|
* along with this program; if not, write to the *
|
|
|
|
* Free Software Foundation, Inc., *
|
|
|
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
|
|
|
|
***************************************************************************/
|
|
|
|
#include <math.h>
|
|
|
|
|
|
|
|
#include <tqstring.h>
|
|
|
|
#include <tqstringlist.h>
|
|
|
|
#include <tqregexp.h>
|
|
|
|
#include <tqapplication.h>
|
|
|
|
|
|
|
|
#include <kdebug.h>
|
|
|
|
#include <tdelocale.h>
|
|
|
|
#include <tdemessagebox.h>
|
|
|
|
#include <kprogress.h>
|
|
|
|
|
|
|
|
#include "element.h"
|
|
|
|
#include "entry.h"
|
|
|
|
#include "file.h"
|
|
|
|
#include "preamble.h"
|
|
|
|
#include "macro.h"
|
|
|
|
#include "findduplicates.h"
|
|
|
|
|
|
|
|
#define max(a,b) ((a)>(b)?(a):(b))
|
|
|
|
#define min(a,b) ((a)<(b)?(a):(b))
|
|
|
|
|
|
|
|
namespace KBibTeX
|
|
|
|
{
|
|
|
|
const unsigned int FindDuplicates::maxDistance = 0xffffff;
|
|
|
|
|
|
|
|
FindDuplicates::FindDuplicates( DuplicateCliqueList &result, unsigned int sensitivity, BibTeX::File *file, TQWidget *parent )
|
|
|
|
: TQObject( NULL, NULL ), m_doCancel( false )
|
|
|
|
{
|
|
|
|
if ( file->count() < 2 )
|
|
|
|
return;
|
|
|
|
|
|
|
|
int len = file->count() * ( file->count() - 1 ) / 2;
|
|
|
|
unsigned int *distVector = new unsigned int[len];
|
|
|
|
memset( distVector, 0xff, sizeof( unsigned int )*len );
|
|
|
|
TQMap<BibTeX::Element*, int> mapElementToIndex;
|
|
|
|
|
|
|
|
TQApplication::setOverrideCursor( TQt::waitCursor );
|
|
|
|
KProgressDialog *progDlg = new KProgressDialog( parent, NULL, i18n( "Find Duplicates" ), i18n( "Searching for duplicates..." ), true );
|
|
|
|
connect( progDlg, SIGNAL( cancelClicked() ), this, SLOT( slotCancel() ) );
|
|
|
|
progDlg->progressBar()->setTotalSteps( len );
|
|
|
|
|
|
|
|
determineDistances( file, distVector, mapElementToIndex, progDlg );
|
|
|
|
progDlg->progressBar()->setValue( len );
|
|
|
|
|
|
|
|
if ( !m_doCancel )
|
|
|
|
buildClique( result, file, distVector, mapElementToIndex, sensitivity );
|
|
|
|
|
|
|
|
delete progDlg;
|
|
|
|
delete[] distVector;
|
|
|
|
TQApplication::restoreOverrideCursor();
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Determine the distance between elements either from two different BibTeX
|
|
|
|
* files (merging operation) or within the same file (find duplicates).
|
|
|
|
* Inter-element distances will be written into the distance vector.
|
|
|
|
* @param file BibTeX file
|
|
|
|
* @param distVector inter-element distance vector
|
|
|
|
* @param mapElementToIndex map from elements to indices (will be written)
|
|
|
|
* @param progDlg progress dialog to write status information to
|
|
|
|
*/
|
|
|
|
void FindDuplicates::determineDistances( BibTeX::File *file, unsigned int *distVector, TQMap<BibTeX::Element*, int> &mapElementToIndex, KProgressDialog *progDlg )
|
|
|
|
{
|
|
|
|
int progress = 0, i = 0;
|
|
|
|
for ( BibTeX::File::ElementList::ConstIterator it1 = file->constBegin(); !m_doCancel && it1 != file->constEnd(); ++it1, ++i )
|
|
|
|
{
|
|
|
|
BibTeX::Entry *entryA = dynamic_cast<BibTeX::Entry*>( *it1 );
|
|
|
|
if ( entryA != NULL )
|
|
|
|
{
|
|
|
|
mapElementToIndex.insert( entryA, i );
|
|
|
|
|
|
|
|
int j = i + 1;
|
|
|
|
for ( BibTeX::File::ElementList::ConstIterator it2 = ++BibTeX::File::ElementList::ConstIterator( it1 ); !m_doCancel && it2 != file->constEnd(); ++it2, ++j )
|
|
|
|
{
|
|
|
|
BibTeX::Entry *entryB = dynamic_cast<BibTeX::Entry*>( *it2 );
|
|
|
|
if ( entryB == NULL ) continue;
|
|
|
|
|
|
|
|
unsigned int d = entryDistance( entryA, entryB );
|
|
|
|
distVector[arrayOffset( i, j )] = d;
|
|
|
|
|
|
|
|
progDlg->progressBar()->setValue( ++progress );
|
|
|
|
tqApp->processEvents();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
BibTeX::Macro *macroA = dynamic_cast<BibTeX::Macro*>( *it1 );
|
|
|
|
if ( macroA != NULL )
|
|
|
|
{
|
|
|
|
mapElementToIndex.insert( macroA, i );
|
|
|
|
|
|
|
|
int j = i + 1;
|
|
|
|
for ( BibTeX::File::ElementList::ConstIterator it2 = ++BibTeX::File::ElementList::ConstIterator( it1 ); !m_doCancel && it2 != file->constEnd(); ++it2, ++j )
|
|
|
|
{
|
|
|
|
BibTeX::Macro *macroB = dynamic_cast<BibTeX::Macro*>( *it2 );
|
|
|
|
if ( macroB == NULL ) continue;
|
|
|
|
|
|
|
|
distVector[arrayOffset( i, j )] = macroDistance( macroA, macroB );
|
|
|
|
|
|
|
|
progDlg->progressBar()->setValue( ++progress );
|
|
|
|
tqApp->processEvents();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
BibTeX::Preamble *preambleA = dynamic_cast<BibTeX::Preamble*>( *it1 );
|
|
|
|
if ( preambleA != NULL )
|
|
|
|
{
|
|
|
|
mapElementToIndex.insert( preambleA, i );
|
|
|
|
|
|
|
|
int j = i + 1;
|
|
|
|
for ( BibTeX::File::ElementList::ConstIterator it2 = ++BibTeX::File::ElementList::ConstIterator( it1 ); !m_doCancel && it2 != file->constEnd(); ++it2, ++j )
|
|
|
|
{
|
|
|
|
BibTeX::Preamble *preambleB = dynamic_cast<BibTeX::Preamble*>( *it2 );
|
|
|
|
if ( preambleB == NULL ) continue;
|
|
|
|
|
|
|
|
distVector[arrayOffset( i, j )] = preambleDistance( preambleA, preambleB );
|
|
|
|
|
|
|
|
progDlg->progressBar()->setValue( ++progress );
|
|
|
|
tqApp->processEvents();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Build a list of clique of BibTeX elements with a distance below the
|
|
|
|
* sensitivity threshold. The list of cliques is added to the cliqueList
|
|
|
|
* parameter.
|
|
|
|
* @param cliqueList List of cliques found in this function
|
|
|
|
* @param file BibTeX file
|
|
|
|
* @param distVector inter-element distance vector
|
|
|
|
* @param mapElementToIndex map from elements to indices
|
|
|
|
* @param sensitivity sensitivity threshold value
|
|
|
|
*/
|
|
|
|
void FindDuplicates::buildClique( DuplicateCliqueList &cliqueList, BibTeX::File *file, unsigned int *distVector, TQMap<BibTeX::Element*, int> &mapElementToIndex, unsigned int sensitivity )
|
|
|
|
{
|
|
|
|
int usedLen = file->count();
|
|
|
|
bool* used = new bool[usedLen];
|
|
|
|
memset( used, false, sizeof( bool ) * usedLen );
|
|
|
|
TQValueList<BibTeX::Element*> queue;
|
|
|
|
for ( BibTeX::File::ElementList::ConstIterator it1 = file->constBegin(); it1 != file->constEnd(); ++it1 )
|
|
|
|
{
|
|
|
|
/** current element must be either entry, preamble, or macro */
|
|
|
|
BibTeX::Element *elem1 = dynamic_cast<BibTeX::Entry*>( *it1 );
|
|
|
|
if ( elem1 == NULL )
|
|
|
|
elem1 = dynamic_cast<BibTeX::Macro*>( *it1 );
|
|
|
|
if ( elem1 == NULL )
|
|
|
|
elem1 = dynamic_cast<BibTeX::Preamble*>( *it1 );
|
|
|
|
/** skip element otherwise or if already used */
|
|
|
|
if ( elem1 == NULL || used[mapElementToIndex[elem1]] ) continue;
|
|
|
|
|
|
|
|
DuplicateClique clique;
|
|
|
|
|
|
|
|
queue.clear();
|
|
|
|
queue.append( elem1 );
|
|
|
|
used[mapElementToIndex[elem1]] = true;
|
|
|
|
|
|
|
|
while ( !queue.isEmpty() )
|
|
|
|
{
|
|
|
|
elem1 = *( queue.begin() );
|
|
|
|
queue.remove( queue.begin() );
|
|
|
|
int curIndex = mapElementToIndex[elem1];
|
|
|
|
clique.append( elem1 );
|
|
|
|
|
|
|
|
for ( BibTeX::File::ElementList::ConstIterator it2 = file->constBegin(); it2 != file->constEnd(); ++it2 )
|
|
|
|
{
|
|
|
|
/** current element must be either entry, preamble, or macro */
|
|
|
|
BibTeX::Element *elem2 = dynamic_cast<BibTeX::Entry*>( *it2 );
|
|
|
|
int otherIndex=mapElementToIndex[elem2];
|
|
|
|
if ( elem2 == NULL )
|
|
|
|
elem2 = dynamic_cast<BibTeX::Macro*>( *it2 );
|
|
|
|
if ( elem2 == NULL )
|
|
|
|
elem2 = dynamic_cast<BibTeX::Preamble*>( *it2 );
|
|
|
|
/** skip element otherwise or if already used */
|
|
|
|
if ( elem2 == NULL || used[( otherIndex = mapElementToIndex[elem2] )] )
|
|
|
|
continue;
|
|
|
|
|
|
|
|
unsigned int distance = distVector[arrayOffset( curIndex, otherIndex )];
|
|
|
|
|
|
|
|
if ( distance <= sensitivity )
|
|
|
|
{
|
|
|
|
queue.append( elem2 );
|
|
|
|
used[otherIndex ] = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( clique.size() > 1 )
|
|
|
|
cliqueList.append( clique );
|
|
|
|
}
|
|
|
|
delete[] used;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Distance between two BibTeX entries, scaled by maxDistance.
|
|
|
|
*/
|
|
|
|
unsigned int FindDuplicates::entryDistance( BibTeX::Entry *entryA, BibTeX::Entry *entryB )
|
|
|
|
{
|
|
|
|
double titleValue = levenshteinDistance( extractTitle( entryA ), extractTitle( entryB ) );
|
|
|
|
double authorValue = levenshteinDistance( authorsLastName( entryA ), authorsLastName( entryB ) );
|
|
|
|
double yearValue = extractYear( entryA ) - extractYear( entryB );
|
|
|
|
yearValue = min( 1.0, yearValue * yearValue / 100.0 );
|
|
|
|
unsigned int distance = ( unsigned int )( maxDistance * ( titleValue * 0.6 + authorValue * 0.3 + yearValue * 0.1 ) );
|
|
|
|
|
|
|
|
return distance;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Distance between two BibTeX macros, scaled by maxDistance.
|
|
|
|
*/
|
|
|
|
unsigned int FindDuplicates::macroDistance( BibTeX::Macro *macroA, BibTeX::Macro *macroB )
|
|
|
|
{
|
|
|
|
double keyValue = levenshteinDistance( extractMacroKey( macroA ), extractMacroKey( macroB ) );
|
|
|
|
double valueValue = levenshteinDistance( extractMacroValue( macroA ), extractMacroValue( macroB ) );
|
|
|
|
unsigned int distance = ( unsigned int )( maxDistance * ( keyValue * 0.7 + valueValue * 0.3 ) );
|
|
|
|
|
|
|
|
return distance;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned int FindDuplicates::preambleDistance( BibTeX::Preamble *preambleA, BibTeX::Preamble *preambleB )
|
|
|
|
{
|
|
|
|
return ( unsigned int )( maxDistance * levenshteinDistance( preambleA->value()->text(), preambleB->value()->text() ) );
|
|
|
|
}
|
|
|
|
|
|
|
|
FindDuplicates::~FindDuplicates()
|
|
|
|
{
|
|
|
|
// nothing
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Determine the Levenshtein distance between two sentences,
|
|
|
|
* where each sentence is in a string (not split into single words).
|
|
|
|
* See also http://en.wikipedia.org/wiki/Levenshtein_distance
|
|
|
|
* @param s first sentence
|
|
|
|
* @param t second sentence
|
|
|
|
* @return distance between both sentences
|
|
|
|
*/
|
|
|
|
double FindDuplicates::levenshteinDistance( const TQString &s, const TQString &t )
|
|
|
|
{
|
|
|
|
const TQRegExp nonWordRegExp( "[^a-zA-Z']+" );
|
|
|
|
if ( s == TQString::null || t == TQString::null ) return 1.0;
|
|
|
|
return levenshteinDistance( TQStringList::split( nonWordRegExp, s ), TQStringList::split( nonWordRegExp, t ) );
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Determine the Levenshtein distance between two words.
|
|
|
|
* See also http://en.wikipedia.org/wiki/Levenshtein_distance
|
|
|
|
* @param s first word
|
|
|
|
* @param t second word
|
|
|
|
* @return distance between both words
|
|
|
|
*/
|
|
|
|
double FindDuplicates::levenshteinDistanceWord( const TQString &s, const TQString &t )
|
|
|
|
{
|
|
|
|
TQString mys = s.lower(), myt = t.lower();
|
|
|
|
int m = s.length(), n = t.length();
|
|
|
|
if ( m < 1 && n < 1 ) return 0.0;
|
|
|
|
if ( m < 1 || n < 1 ) return 1.0;
|
|
|
|
|
|
|
|
int **d = new int*[m+1];
|
|
|
|
for ( int i = 0; i <= m; ++i ) {d[i] = new int[n+1]; d[i][0] = i;}
|
|
|
|
for ( int i = 0; i <= n; ++i ) d[0][i] = i;
|
|
|
|
|
|
|
|
for ( int i = 1; i <= m;++i )
|
|
|
|
for ( int j = 1; j <= n;++j )
|
|
|
|
{
|
|
|
|
d[i][j] = d[i-1][j] + 1;
|
|
|
|
int c = d[i][j-1] + 1;
|
|
|
|
if ( c < d[i][j] ) d[i][j] = c;
|
|
|
|
c = d[i-1][j-1] + ( mys[i-1] == myt[j-1] ? 0 : 1 );
|
|
|
|
if ( c < d[i][j] ) d[i][j] = c;
|
|
|
|
}
|
|
|
|
|
|
|
|
double result = d[m][n];
|
|
|
|
for ( int i = 0; i <= m; ++i ) delete[] d[i];
|
|
|
|
delete [] d;
|
|
|
|
|
|
|
|
result = result / ( double )max( m, n );
|
|
|
|
result *= result;
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Determine the Levenshtein distance between two sentences (list of words).
|
|
|
|
* See also http://en.wikipedia.org/wiki/Levenshtein_distance
|
|
|
|
* @param s first sentence
|
|
|
|
* @param t second sentence
|
|
|
|
* @return distance between both sentences
|
|
|
|
*/
|
|
|
|
double FindDuplicates::levenshteinDistance( const TQStringList &s, const TQStringList &t )
|
|
|
|
{
|
|
|
|
int m = s.size(), n = t.size();
|
|
|
|
if ( m < 1 && n < 1 ) return 0.0;
|
|
|
|
if ( m < 1 || n < 1 ) return 1.0;
|
|
|
|
|
|
|
|
double **d = new double*[m+1];
|
|
|
|
for ( int i = 0; i <= m; ++i ) {d[i] = new double[n+1]; d[i][0] = i;}
|
|
|
|
for ( int i = 0; i <= n; ++i ) d[0][i] = i;
|
|
|
|
|
|
|
|
for ( int i = 1; i <= m;++i )
|
|
|
|
for ( int j = 1; j <= n;++j )
|
|
|
|
{
|
|
|
|
d[i][j] = d[i-1][j] + 1;
|
|
|
|
double c = d[i][j-1] + 1;
|
|
|
|
if ( c < d[i][j] ) d[i][j] = c;
|
|
|
|
c = d[i-1][j-1] + levenshteinDistanceWord( s[i-1], t[j-1] );
|
|
|
|
if ( c < d[i][j] ) d[i][j] = c;
|
|
|
|
}
|
|
|
|
|
|
|
|
double result = d[m][n];
|
|
|
|
for ( int i = 0; i <= m; ++i ) delete[] d[i];
|
|
|
|
delete [] d;
|
|
|
|
|
|
|
|
result = result / ( double )max( m, n );
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Linearize a two-dimensional triangle matrix
|
|
|
|
*/
|
|
|
|
int FindDuplicates::arrayOffset( int a, int b )
|
|
|
|
{
|
|
|
|
if ( a == b )
|
|
|
|
return -1;
|
|
|
|
else if ( b < a )
|
|
|
|
{
|
|
|
|
int swap = a;
|
|
|
|
a = b;
|
|
|
|
b = swap;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ( b * ( b - 1 ) / 2 + a );
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Determine title for a given entry
|
|
|
|
*/
|
|
|
|
TQString FindDuplicates::extractTitle( BibTeX::Entry *entry )
|
|
|
|
{
|
|
|
|
/** retrieve field holding title information for entry */
|
|
|
|
BibTeX::EntryField *field = entry->getField( BibTeX::EntryField::ftTitle );
|
|
|
|
if ( field == NULL )
|
|
|
|
return TQString::null; /** no title field available */
|
|
|
|
|
|
|
|
/** *fetch value item holding title */
|
|
|
|
BibTeX::ValueItem *valueItem = field->value()->items.isEmpty() ? NULL : field->value()->items.first();
|
|
|
|
if ( valueItem == NULL )
|
|
|
|
return TQString::null; /** no value item found or is empty */
|
|
|
|
|
|
|
|
return valueItem->text(); // TODO: Perform some postprocessing?
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Determine list of authors for a given entry
|
|
|
|
*/
|
|
|
|
TQStringList FindDuplicates::authorsLastName( BibTeX::Entry *entry )
|
|
|
|
{
|
|
|
|
TQStringList result;
|
|
|
|
|
|
|
|
/** retrieve field holding authors information for entry */
|
|
|
|
BibTeX::EntryField *field = entry->getField( BibTeX::EntryField::ftAuthor );
|
|
|
|
if ( field == NULL )
|
|
|
|
return result; /** no author field available */
|
|
|
|
|
|
|
|
/** fetch container holding list of author names */
|
|
|
|
BibTeX::PersonContainer *personContainer = field != NULL ? dynamic_cast<BibTeX::PersonContainer*>( field->value()->items.isEmpty() ? NULL : field->value()->items.first() ) : NULL;
|
|
|
|
if ( personContainer == NULL || personContainer->persons.isEmpty() )
|
|
|
|
return result; /** container not found or is empty */
|
|
|
|
|
|
|
|
/** iterate through container and fetch each author's last name */
|
|
|
|
for ( TQValueList<BibTeX::Person*>::ConstIterator it = personContainer->persons.begin(); it != personContainer->persons.end(); ++it )
|
|
|
|
result.append(( *it )->lastName() );
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Determine year for a given entry
|
|
|
|
*/
|
|
|
|
int FindDuplicates::extractYear( BibTeX::Entry *entry )
|
|
|
|
{
|
|
|
|
/** retrieve field holding year information for entry */
|
|
|
|
BibTeX::EntryField *field = entry->getField( BibTeX::EntryField::ftYear );
|
|
|
|
if ( field == NULL )
|
|
|
|
return -1; /** no year field available */
|
|
|
|
|
|
|
|
/** *fetch value item holding year */
|
|
|
|
BibTeX::ValueItem *valueItem = field != NULL ? ( field->value()->items.isEmpty() ? NULL : field->value()->items.first() ) : NULL;
|
|
|
|
if ( valueItem == NULL )
|
|
|
|
return -1; /** no value item found or is empty */
|
|
|
|
|
|
|
|
/** parse value item's text */
|
|
|
|
bool ok = FALSE;
|
|
|
|
int year = TQString( valueItem->text() ).toInt( &ok );
|
|
|
|
if ( !ok ) year = -1;
|
|
|
|
|
|
|
|
return year;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Determine key from a given macro
|
|
|
|
*/
|
|
|
|
TQString FindDuplicates::extractMacroKey( BibTeX::Macro *macro )
|
|
|
|
{
|
|
|
|
return macro->key();
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Determine key from a given macro
|
|
|
|
*/
|
|
|
|
TQString FindDuplicates::extractMacroValue( BibTeX::Macro *macro )
|
|
|
|
{
|
|
|
|
return macro->value()->text();
|
|
|
|
}
|
|
|
|
|
|
|
|
void FindDuplicates::slotCancel()
|
|
|
|
{
|
|
|
|
m_doCancel = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#include "findduplicates.moc"
|