You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
tdepim/indexlib/tests/tokenizer-test.cpp

70 lines
2.3 KiB

#include <boost/test/unit_test.hpp>
#include "tokenizer.h"
#include <cassert>
using namespace ::boost::unit_test;
namespace indexlib { namespace tests { namespace tokenizer_test {
using indexlib::detail::tokenizer;
using indexlib::detail::get_tokenizer;
void simple() {
std::unique_ptr<tokenizer> tokenizer = get_tokenizer( "latin-1:european" );
assert(tokenizer);
std::vector<std::string> tokens = tokenizer->string_to_words( "one ,as, ''#`:ThReE, <20><><EFBFBD><EFBFBD><EFBFBD>" );
std::vector<std::string> expected;
expected.push_back( "ONE" );
expected.push_back( "AS" );
expected.push_back( "THREE" );
expected.push_back( "AAACE" );
std::sort( tokens.begin(), tokens.end() );
std::sort( expected.begin(), expected.end() );
BOOST_CHECK_EQUAL( expected.size(), tokens.size() );
for ( int i = 0; i < expected.size() && i < tokens.size(); ++i ) {
BOOST_CHECK_EQUAL( expected[ i ], tokens[ i ] );
}
}
void with_newlines() {
std::unique_ptr<tokenizer> tokenizer = get_tokenizer( "latin-1:european" );
assert(tokenizer);
std::vector<std::string> tokens = tokenizer->string_to_words( "one\ntwo\nthree" );
std::vector<std::string> expected;
expected.push_back( "ONE" );
expected.push_back( "TWO" );
expected.push_back( "THREE" );
std::sort( tokens.begin(), tokens.end() );
std::sort( expected.begin(), expected.end() );
BOOST_CHECK_EQUAL( expected.size(), tokens.size() );
for ( int i = 0; i < expected.size() && i < tokens.size(); ++i ) {
BOOST_CHECK_EQUAL( expected.at( i ), tokens.at( i ) );
}
}
void with_numbers() {
std::unique_ptr<tokenizer> tokenizer = get_tokenizer( "latin-1:european" );
assert(tokenizer);
std::vector<std::string> tokens = tokenizer->string_to_words( "one 012 123 four" );
std::vector<std::string> expected;
expected.push_back( "ONE" );
expected.push_back( "012" );
expected.push_back( "123" );
expected.push_back( "FOUR" );
std::sort( tokens.begin(), tokens.end() );
std::sort( expected.begin(), expected.end() );
BOOST_CHECK_EQUAL( expected.size(), tokens.size() );
for ( int i = 0; i < expected.size() && i < tokens.size(); ++i ) {
BOOST_CHECK_EQUAL( expected.at( i ), tokens.at( i ) );
}
}
/**
 * Creates the "Tokenizer tests" suite and registers the three tokenizer
 * test cases defined in this file: simple, with_newlines and with_numbers.
 *
 * @return pointer to the newly created suite.
 */
test_suite* get_suite() {
    test_suite* const suite = BOOST_TEST_SUITE( "Tokenizer tests" );

    // Registration order is kept: simple, then newlines, then numbers.
    suite->add( BOOST_TEST_CASE( &simple ) );
    suite->add( BOOST_TEST_CASE( &with_newlines ) );
    suite->add( BOOST_TEST_CASE( &with_numbers ) );

    return suite;
}
}}} //namespaces