X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=timeTextCollection.cpp;h=74c9615a0c326f0337900c0cca3614bdc4de65e1;hb=9abbea1aba3d81f1eccd84a92d1857e46a1b3ba2;hp=dbe16646e810026c0e455595928053a143f28cc7;hpb=95fded64d90aca79f7179f147a5184e9ba3176af;p=SXSI%2FTextCollection.git diff --git a/timeTextCollection.cpp b/timeTextCollection.cpp index dbe1664..74c9615 100644 --- a/timeTextCollection.cpp +++ b/timeTextCollection.cpp @@ -8,99 +8,206 @@ using std::cin; using std::string; #include #include +#include +#include +static struct timeval t1; +static struct timeval t2; -#include "TextCollection.h" + +#define STARTTIMER() (gettimeofday(&t1,NULL)) +#define STOPTIMER() (gettimeofday(&t2,NULL)) +#define GETTIME() (((t2.tv_sec - t1.tv_sec) * 1000000.0 + (t2.tv_usec - t1.tv_usec))/1000.0) + +void * last_brk = NULL; + +long int get_mem(){ + void * current_brk = sbrk(0); + long int mem = ((long int) current_brk ) - ((long int) last_brk); + //last_brk = current_brk; + return (mem/1024/1024); +} + +#include "TextCollectionBuilder.h" using SXSI::TextCollection; +using SXSI::TextCollectionBuilder; +string words[] = { "Bakst", + "ruminants", "morphine", "AUSTRALIA", + "molecule" ,"brain", "human", "blood","from", + "with", " in", "the", "of", + "a", + "\n" }; -int main(int argc, char**argv) -{ - string str; - string buffer; - unsigned int max_str = 0; - unsigned int num_str = 0; - struct timeval t1; - struct timeval t2; - double time; - string words[] = { "abcd", "abc", "mirrors", "attires", "mature", - "rescue", "such", "embrace", "shipping", "ae", - "preventions", "ab", "fe", "w" }; - - - - TextCollection *csa = TextCollection::InitTextCollection(64); +unsigned int NWORDS = 15; - gettimeofday(&t1,NULL); - std::cerr << "Filling collection\n"; - while (not(cin.eof()) && num_str < 100000 ){ - getline(cin,str); // Read line by line. - if (str.compare("----------") == 0){ - csa->InsertText((unsigned char*) buffer.c_str()); - - if (num_str % 10000 == 0){ - gettimeofday(&t2,NULL); - time = ((t2.tv_sec - t1.tv_sec) * 1000000.0 - + (t2.tv_usec - t1.tv_usec))/1000.0; - std::cerr << "Added " << num_str << " strings in " - << time << " ms\n"; - gettimeofday(&t1,NULL); - }; - - num_str++; - if (max_str < buffer.size()) - max_str = buffer.size(); - buffer.clear(); - - } - else - buffer.append(str); - }; - std::cerr << "Calling MakeStatic()\n"; - csa->MakeStatic(); - std::cerr << "Statistics: " << num_str << " strings, " << max_str << " = max length\n"; +void time_tc(TextCollection *tc){ + double time; int count; bool is; TextCollection::document_result res; - for (int i = 0; i < 14; i++){ - gettimeofday(&t1,NULL); - is = csa->IsContains((unsigned char*) words[i].c_str()); - gettimeofday(&t2,NULL); - time = ((t2.tv_sec - t1.tv_sec) * 1000000.0 - + (t2.tv_usec - t1.tv_usec))/1000.0; - - std::cerr << is << ", " << time << ", "; - - - gettimeofday(&t1,NULL); - count = csa->Count((unsigned char*) words[i].c_str()); - gettimeofday(&t2,NULL); - time = ((t2.tv_sec - t1.tv_sec) * 1000000.0 - + (t2.tv_usec - t1.tv_usec))/1000.0; + for (unsigned int i = 0; i < NWORDS; i++){ + std::cerr << "\"" << words[i] << "\": "; + STARTTIMER(); + is = tc->IsContains((unsigned char*) words[i].c_str()); + STOPTIMER(); + time = GETTIME(); + + std::cerr << is << ", " << time << ", "; + + + STARTTIMER(); + count = tc->Count((unsigned char*) words[i].c_str()); + STOPTIMER(); + time = GETTIME(); + + std::cerr << count << ", " << time << ", "; + + + STARTTIMER(); + count = tc->CountContains((unsigned char*) words[i].c_str()); + STOPTIMER(); + time = GETTIME(); + + std::cerr << count << ", " << time << ", "; + + + STARTTIMER(); + res = tc->Contains((unsigned char*) words[i].c_str()); + STOPTIMER(); + time = GETTIME(); + + std::cerr << time << ", max_mem = " << get_mem() << "\n" ; + }; +} - std::cerr << count << ", " << time << ", "; +int main(int argc, char**argv) +{ + string * str = new string("Foo"); + string * buffer = new string("Foo"); + unsigned int text_size = 0; + unsigned int max_str = 0; + unsigned int num_str = 0; + double time; + FILE* file; + TextCollection * tc; + - gettimeofday(&t1,NULL); - count = csa->CountContains((unsigned char*) words[i].c_str()); - gettimeofday(&t2,NULL); - time = ((t2.tv_sec - t1.tv_sec) * 1000000.0 - + (t2.tv_usec - t1.tv_usec))/1000.0; + TextCollectionBuilder *tcb64 = new TextCollectionBuilder(64); + TextCollectionBuilder *tcb2 = new TextCollectionBuilder(5); - std::cerr << count << ", " << time << ", "; - + STARTTIMER(); + last_brk= sbrk(0); - gettimeofday(&t1,NULL); - res = csa->Contains((unsigned char*) words[i].c_str()); - gettimeofday(&t2,NULL); - time = ((t2.tv_sec - t1.tv_sec) * 1000000.0 - + (t2.tv_usec - t1.tv_usec))/1000.0; - - std::cerr << time << "\n"; - + std::cerr << "Filling collection\n"; + while (not(cin.eof()) ){ + std::getline(cin, *str ); // Read line by line. + str->append("\n"); + if (str->compare("----------\n") == 0 ){ + tcb64->InsertText((unsigned char*) buffer->c_str()); + tcb2->InsertText((unsigned char*) buffer->c_str()); + + if (num_str % 10000 == 0){ + STOPTIMER(); + time = GETTIME(); + std::cerr << "Added " << num_str << " strings in " + << time << " ms, max_mem=" << get_mem() << "\n"; + std::cerr.flush(); + //STARTTIMER(); + }; + + num_str++; + if (max_str < buffer->size()) + max_str = buffer->size(); + text_size += buffer->size(); + buffer->clear(); + } + else + buffer->append(*str); }; + delete str; + delete buffer; + buffer = NULL; + str = NULL; + + std::cerr << "Freeing text buffers : max_mem = " << get_mem() << "\n"; + + std::cerr << "Number of bytes inserted : " << text_size << "\n"; + + std::cerr << "Calling InitTextCollection() for sf=64: "; + STARTTIMER(); + tc = tcb64->InitTextCollection(); + STOPTIMER(); + time = GETTIME(); + std::cerr << time << "ms, max_mem = " << get_mem() << "\n" ; + delete tcb64; + tcb64 = NULL; + + file = fopen("index_64.tc","w+"); + std::cerr << "Saving to index_64.tc "; + STARTTIMER(); + tc->Save(file); + STOPTIMER(); + time = GETTIME(); + std::cerr << time << "ms, max_mem = " << get_mem() << "\n" ; + fclose(file); + delete tc; + tc = NULL; + std::cerr << "Freeing memory : max_mem = " << get_mem() << "\n"; + + std::cerr << "Calling InitTextCollection() for sf=5: "; + STARTTIMER(); + tc = tcb2->InitTextCollection(); + STOPTIMER(); + time = GETTIME(); + std::cerr << time << "ms, max_mem = " << get_mem() << "\n" ; + free(tcb2); + tcb2=NULL; + + + file = fopen("index_05.tc","w+"); + std::cerr << "Saving to index_05.tc "; + STARTTIMER(); + tc->Save(file); + STOPTIMER(); + time = GETTIME(); + std::cerr << time << "ms, max_mem = " << get_mem() << "\n"; + fclose(file); + delete tc; + tc = NULL; + std::cerr << "Freeing memory : max_mem = " << get_mem() << "\n"; + std::cerr << "Statistics: " << num_str << " strings, " << max_str << " = max length\n"; + + std::cerr << "Loading sf=5 TextCollection "; + STARTTIMER(); + file = fopen("index_05.tc","r"); + tc = TextCollection::Load(file,5); // sample rate is not used. + STOPTIMER(); + time = GETTIME(); + std::cerr << time << "ms, max_mem = " << get_mem() << "\n" ; + fclose(file); + std::cerr << "-----------------\nSampling rate 5\n"; + time_tc(tc); + delete tc; + tc = NULL; + std::cerr << "Freeing memory : max_mem = " << get_mem() << "\n"; + + std::cerr << "Loading sf=64 TextCollection "; + STARTTIMER(); + file = fopen("index_64.tc","r"); + tc = TextCollection::Load(file,64); + STOPTIMER(); + time = GETTIME(); + std::cerr << time << "ms, max_mem = " << get_mem() << "\n" ; + fclose(file); + std::cerr << "-----------------\nSampling rate 64\n"; + time_tc(tc); + delete tc; + tc = NULL; + std::cerr << "Freeing memory : max_mem = " << get_mem() << "\n"; return 0; }