X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=timeTextCollection.cpp;h=74c9615a0c326f0337900c0cca3614bdc4de65e1;hb=47be601752d3cd9d24c831a16621cd6d4ced6670;hp=91c940456858e2cd9e3d184251a3cfdc31b78b95;hpb=7eaf8bbcb18ab4a481905d219ab0b9e8286c75b2;p=SXSI%2FTextCollection.git diff --git a/timeTextCollection.cpp b/timeTextCollection.cpp index 91c9404..74c9615 100644 --- a/timeTextCollection.cpp +++ b/timeTextCollection.cpp @@ -8,104 +8,206 @@ using std::cin; using std::string; #include #include +#include +#include static struct timeval t1; static struct timeval t2; + #define STARTTIMER() (gettimeofday(&t1,NULL)) #define STOPTIMER() (gettimeofday(&t2,NULL)) #define GETTIME() (((t2.tv_sec - t1.tv_sec) * 1000000.0 + (t2.tv_usec - t1.tv_usec))/1000.0) - +void * last_brk = NULL; + +long int get_mem(){ + void * current_brk = sbrk(0); + long int mem = ((long int) current_brk ) - ((long int) last_brk); + //last_brk = current_brk; + return (mem/1024/1024); +} -#include "TextCollection.h" +#include "TextCollectionBuilder.h" using SXSI::TextCollection; +using SXSI::TextCollectionBuilder; +string words[] = { "Bakst", + "ruminants", "morphine", "AUSTRALIA", + "molecule" ,"brain", "human", "blood","from", + "with", " in", "the", "of", + "a", + "\n" }; + + +unsigned int NWORDS = 15; + + +void time_tc(TextCollection *tc){ + double time; + int count; + bool is; + TextCollection::document_result res; + for (unsigned int i = 0; i < NWORDS; i++){ + std::cerr << "\"" << words[i] << "\": "; + STARTTIMER(); + is = tc->IsContains((unsigned char*) words[i].c_str()); + STOPTIMER(); + time = GETTIME(); + + std::cerr << is << ", " << time << ", "; + + + STARTTIMER(); + count = tc->Count((unsigned char*) words[i].c_str()); + STOPTIMER(); + time = GETTIME(); + + std::cerr << count << ", " << time << ", "; + + + STARTTIMER(); + count = tc->CountContains((unsigned char*) words[i].c_str()); + STOPTIMER(); + time = GETTIME(); + + std::cerr << count << ", " << time << ", "; + + + STARTTIMER(); + res = tc->Contains((unsigned char*) words[i].c_str()); + STOPTIMER(); + time = GETTIME(); + + std::cerr << time << ", max_mem = " << get_mem() << "\n" ; + }; +} int main(int argc, char**argv) { - string str; - string buffer; + string * str = new string("Foo"); + string * buffer = new string("Foo"); + unsigned int text_size = 0; unsigned int max_str = 0; unsigned int num_str = 0; double time; + FILE* file; + TextCollection * tc; + - string words[] = { "abcd", "abc", "mirrors", "attires", "mature", - "rescue", "such", "embrace", "shipping", "ae", - "preventions", "ab", "fe", "w" }; - - - - TextCollection *csa = TextCollection::InitTextCollection(64); - + TextCollectionBuilder *tcb64 = new TextCollectionBuilder(64); + TextCollectionBuilder *tcb2 = new TextCollectionBuilder(5); STARTTIMER(); - std::cerr << "Filling collection\n"; - // read only 100000 strings - while (not(cin.eof()) && num_str < 100000 ){ - getline(cin,str); // Read line by line. - if (str.compare("----------") == 0){ - csa->InsertText((unsigned char*) buffer.c_str()); - - if (num_str % 10000 == 0){ - STOPTIMER(); - time = GETTIME(); - std::cerr << "Added " << num_str << " strings in " - << time << " ms\n"; - STARTTIMER(); - }; - - num_str++; - if (max_str < buffer.size()) - max_str = buffer.size(); - buffer.clear(); - - } - else - buffer.append(str); - }; - std::cerr << "Calling MakeStatic()\n"; - - csa->MakeStatic(); - - std::cerr << "Statistics: " << num_str << " strings, " << max_str << " = max length\n"; - int count; - bool is; - TextCollection::document_result res; - for (unsigned int i = 0; i < (sizeof(words)/sizeof(char*)) ; i++){ - - STARTTIMER(); - is = csa->IsContains((unsigned char*) words[i].c_str()); - STOPTIMER(); - time = GETTIME(); + last_brk= sbrk(0); - std::cerr << is << ", " << time << ", "; + std::cerr << "Filling collection\n"; + while (not(cin.eof()) ){ + std::getline(cin, *str ); // Read line by line. + str->append("\n"); + + if (str->compare("----------\n") == 0 ){ + tcb64->InsertText((unsigned char*) buffer->c_str()); + tcb2->InsertText((unsigned char*) buffer->c_str()); + + if (num_str % 10000 == 0){ + STOPTIMER(); + time = GETTIME(); + std::cerr << "Added " << num_str << " strings in " + << time << " ms, max_mem=" << get_mem() << "\n"; + std::cerr.flush(); + //STARTTIMER(); + }; + + num_str++; + if (max_str < buffer->size()) + max_str = buffer->size(); + text_size += buffer->size(); + buffer->clear(); + } + else + buffer->append(*str); + }; + delete str; + delete buffer; + buffer = NULL; + str = NULL; - STARTTIMER(); - count = csa->Count((unsigned char*) words[i].c_str()); - STOPTIMER(); - time = GETTIME(); + std::cerr << "Freeing text buffers : max_mem = " << get_mem() << "\n"; - std::cerr << count << ", " << time << ", "; + std::cerr << "Number of bytes inserted : " << text_size << "\n"; + std::cerr << "Calling InitTextCollection() for sf=64: "; + STARTTIMER(); + tc = tcb64->InitTextCollection(); + STOPTIMER(); + time = GETTIME(); + std::cerr << time << "ms, max_mem = " << get_mem() << "\n" ; + delete tcb64; + tcb64 = NULL; + + file = fopen("index_64.tc","w+"); + std::cerr << "Saving to index_64.tc "; + STARTTIMER(); + tc->Save(file); + STOPTIMER(); + time = GETTIME(); + std::cerr << time << "ms, max_mem = " << get_mem() << "\n" ; + fclose(file); + delete tc; + tc = NULL; + std::cerr << "Freeing memory : max_mem = " << get_mem() << "\n"; + + std::cerr << "Calling InitTextCollection() for sf=5: "; + STARTTIMER(); + tc = tcb2->InitTextCollection(); + STOPTIMER(); + time = GETTIME(); + std::cerr << time << "ms, max_mem = " << get_mem() << "\n" ; + free(tcb2); + tcb2=NULL; - STARTTIMER(); - count = csa->CountContains((unsigned char*) words[i].c_str()); - STOPTIMER(); - time = GETTIME(); - std::cerr << count << ", " << time << ", "; - + file = fopen("index_05.tc","w+"); + std::cerr << "Saving to index_05.tc "; + STARTTIMER(); + tc->Save(file); + STOPTIMER(); + time = GETTIME(); + std::cerr << time << "ms, max_mem = " << get_mem() << "\n"; + fclose(file); + delete tc; + tc = NULL; + std::cerr << "Freeing memory : max_mem = " << get_mem() << "\n"; + std::cerr << "Statistics: " << num_str << " strings, " << max_str << " = max length\n"; - STARTTIMER(); - res = csa->Contains((unsigned char*) words[i].c_str()); - STOPTIMER(); - time = GETTIME(); - - std::cerr << time << "\n"; - - - }; + std::cerr << "Loading sf=5 TextCollection "; + STARTTIMER(); + file = fopen("index_05.tc","r"); + tc = TextCollection::Load(file,5); // sample rate is not used. + STOPTIMER(); + time = GETTIME(); + std::cerr << time << "ms, max_mem = " << get_mem() << "\n" ; + fclose(file); + std::cerr << "-----------------\nSampling rate 5\n"; + time_tc(tc); + delete tc; + tc = NULL; + std::cerr << "Freeing memory : max_mem = " << get_mem() << "\n"; + + std::cerr << "Loading sf=64 TextCollection "; + STARTTIMER(); + file = fopen("index_64.tc","r"); + tc = TextCollection::Load(file,64); + STOPTIMER(); + time = GETTIME(); + std::cerr << time << "ms, max_mem = " << get_mem() << "\n" ; + fclose(file); + std::cerr << "-----------------\nSampling rate 64\n"; + time_tc(tc); + delete tc; + tc = NULL; + std::cerr << "Freeing memory : max_mem = " << get_mem() << "\n"; return 0; }