From 37830ca9a16449ed145da79c80e7a37b465e4816 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Kim=20Nguy=E1=BB=85n?= Date: Wed, 29 Feb 2012 11:03:13 +0100 Subject: [PATCH] Add index testing program. Fixes bug in hashtable construction. --- Makefile | 5 +- createIndex.cpp | 110 ++++++++++++++++++++++++++++++++++++++++++++ swcsa/buildFacade.c | 1 - swcsa/utils/hash.c | 8 +++- 4 files changed, 120 insertions(+), 4 deletions(-) create mode 100644 createIndex.cpp diff --git a/Makefile b/Makefile index f6d781c..bb8d60d 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ dcover_obs = dcover/difference_cover.o TextCollection_obs = TextCollection.o TextCollectionBuilder.o FMIndexBuilder.o RLCSABuilder.o FMIndex.o Tools.o \ TextStorage.o Query.o EditDistance.o ResultSet.o -TCDebug_obs = bittree.o rbtree.o dynFMI.o +TCDebug_obs = bittree.o rbtree.o dynFMI.o TEXTCOLLECTION_A=libTextCollection.a @@ -25,6 +25,9 @@ testTextCollection: testTextCollection.o $(TextCollection_obs) $(LIBS) $(TCDebug timeTextCollection: timeTextCollection.o $(TextCollection_obs) $(LIBS) $(TCDebug_obs) $(CC) -o timeTextCollection timeTextCollection.o $(TextCollection_obs) $(TCDebug_obs) +createIndex: createIndex.o $(TextCollection_obs) $(LIBS) + $(CC) -o createIndex createIndex.o $(TextCollection_obs) $(LIBS) + test2dRange: test2dRange.o ${LIBCDSA} $(CC) -o test2dRange test2dRange.o ${LIBCDSA} diff --git a/createIndex.cpp b/createIndex.cpp new file mode 100644 index 0000000..6226f5d --- /dev/null +++ b/createIndex.cpp @@ -0,0 +1,110 @@ +// Test driver for text collection +#include +#include +#include +using std::cout; +using std::endl; +using std::cin; +using std::string; +#include +#include +#include +#include + +static struct timeval t1; +static struct timeval t2; + + +#define STARTTIMER() (gettimeofday(&t1,NULL)) +#define STOPTIMER() (gettimeofday(&t2,NULL)) +#define GETTIME() (((t2.tv_sec - t1.tv_sec) * 1000000.0 + (t2.tv_usec - t1.tv_usec))/1000.0) + +void * last_brk = NULL; + +long int get_mem(){ + void * current_brk = sbrk(0); + long int mem = ((long int) current_brk ) - ((long int) last_brk); + //last_brk = current_brk; + return (mem/1024/1024); +} + +#include "TextCollectionBuilder.h" +using SXSI::TextCollection; +using SXSI::TextCollectionBuilder; + + + +int main(int argc, char**argv) +{ + string str = string(""); + unsigned int text_size = 0; + unsigned int max_str = 0; + unsigned int num_str = 0; + double time; + FILE* file; + TextCollection * tc; + + TextCollectionBuilder *tcb = + TextCollectionBuilder::create(64, TextCollectionBuilder::index_type_swcsa); + + STARTTIMER(); + last_brk= sbrk(0); + + std::cerr << "Filling collection\n"; + + while (not(cin.eof()) ){ + std::getline(cin, str); // Read line by line. + if (str.compare("------") != 0 ){ + if (!str.empty()) + tcb->InsertText((unsigned char*) str.c_str()); + + + if (num_str % 10000 == 0){ + STOPTIMER(); + time = GETTIME(); + std::cerr << "Added " << num_str << " strings in " + << time << " ms, max_mem=" << get_mem() << "\n"; + std::cerr.flush(); + //STARTTIMER(); + }; + + num_str++; + if (max_str < str.size()) + max_str = str.size(); + text_size += str.size(); + str.clear(); + } + else + str.clear(); + }; + + std::cerr << "Number of bytes inserted : " << text_size << "\n"; + + std::cerr << "Calling InitTextCollection() for sf=64: "; + STARTTIMER(); + tc = tcb->InitTextCollection(); + STOPTIMER(); + time = GETTIME(); + std::cerr << time << "ms, max_mem = " << get_mem() << "\n" ; + delete tcb; + tcb = NULL; + + file = fopen("index_64.tc","w+"); + std::cerr << "Saving to index_64.tc "; + STARTTIMER(); + tc->Save(file,"index_64.tc"); + STOPTIMER(); + time = GETTIME(); + std::cerr << time << "ms, max_mem = " << get_mem() << "\n" ; + fclose(file); + delete tc; + tc = NULL; + std::cerr << "Freeing memory : max_mem = " << get_mem() << "\n"; + std::cerr << "Loading TextCollection from saved file ... "; + file = fopen("index_64.tc","r"); + tc = TextCollection::Load(file, "index_64.tc", TextCollection::index_mode_default); + std::cerr << "ok\n"; + delete tc; + + return 0; +} diff --git a/swcsa/buildFacade.c b/swcsa/buildFacade.c index 4cd28fe..a9c4e38 100755 --- a/swcsa/buildFacade.c +++ b/swcsa/buildFacade.c @@ -597,7 +597,6 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) { //Stimation (Using Heap's law) of the number of different "meaningful" words. //sizeNValue=N_value; if(bytesFile<5000000) bytesFile = 5000000; - sizeNValue = (unsigned long) floor(3.9* pow(bytesFile,0.70) ); // Inicializes the arrays used to detect if a char is valid or not. diff --git a/swcsa/utils/hash.c b/swcsa/utils/hash.c index a8aa4f7..a5be2d3 100755 --- a/swcsa/utils/hash.c +++ b/swcsa/utils/hash.c @@ -47,8 +47,12 @@ t_hash initialize_hash (unsigned long sizeVoc) { unsigned long i; h = (t_hash) malloc(sizeof(struct hashStr)); - h->SIZE_HASH = (unsigned long) (OCUP_HASH * sizeVoc); - h->SIZE_HASH = NearestPrime(h->SIZE_HASH); + unsigned long m = 6 * sizeVoc; + h->SIZE_HASH = sizeVoc; + do { + h->SIZE_HASH = NearestPrime(h->SIZE_HASH); + } while (h->SIZE_HASH < m); + h->hash = (t_hashNode *) malloc(h->SIZE_HASH * sizeof(t_hashNode)); h->NumElem = 0; -- 2.17.1