X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=timeTextCollection.cpp;h=40abfac55d6383513f5e78e7517856b93d111db9;hb=32cae827732576f372f4af31e9e6eff6ee89070a;hp=47d19760ce7119da70ebd5e91dbde12abf6c6cf4;hpb=9db8f89e9dbf9e1d61a12d0e6094c4db0b7111e4;p=SXSI%2FTextCollection.git diff --git a/timeTextCollection.cpp b/timeTextCollection.cpp index 47d1976..40abfac 100644 --- a/timeTextCollection.cpp +++ b/timeTextCollection.cpp @@ -8,6 +8,7 @@ using std::cin; using std::string; #include #include +#include static struct timeval t1; static struct timeval t2; @@ -22,91 +23,137 @@ static struct timeval t2; using SXSI::TextCollection; using SXSI::TextCollectionBuilder; + int main(int argc, char**argv) { string str; string buffer; + unsigned int text_size = 0; unsigned int max_str = 0; unsigned int num_str = 0; double time; - string words[] = { "abcd", "abc", "mirrors", "attires", "mature", - "rescue", "such", "embrace", "shipping", "ae", - "preventions", "ab", "fe", "w" }; - - - - TextCollectionBuilder *tcb = new TextCollectionBuilder(64); + string words[] = { "Bakst", + "ruminants", "morphine", "AUSTRALIA","molecule" ,"brain", "human", "blood","from", "with", " in", "the", "of", + "a", + "\n" }; + unsigned int NWORDS = 15; + TextCollectionBuilder *tcb64 = new TextCollectionBuilder(64); + TextCollectionBuilder *tcb2 = new TextCollectionBuilder(5); STARTTIMER(); std::cerr << "Filling collection\n"; - // read only 100000 strings - while (not(cin.eof()) && num_str < 100000 ){ - getline(cin,str); // Read line by line. - if (str.compare("----------") == 0){ - tcb->InsertText((unsigned char*) buffer.c_str()); - - if (num_str % 10000 == 0){ - STOPTIMER(); - time = GETTIME(); - std::cerr << "Added " << num_str << " strings in " - << time << " ms\n"; - STARTTIMER(); - }; - num_str++; - if (max_str < buffer.size()) - max_str = buffer.size(); - buffer.clear(); - } - else - buffer.append(str); + while (not(cin.eof()) ){ + getline(cin,str); // Read line by line. + str.append("\n"); + + if (str.compare("----------\n") == 0 ){ + tcb64->InsertText((unsigned char*) buffer.c_str()); + tcb2->InsertText((unsigned char*) buffer.c_str()); + + if (num_str % 10000 == 0){ + STOPTIMER(); + time = GETTIME(); + std::cerr << "Added " << num_str << " strings in " + << time << " ms\n"; + std::cerr.flush(); + //STARTTIMER(); + }; + + num_str++; + if (max_str < buffer.size()) + max_str = buffer.size(); + text_size += buffer.size(); + buffer.clear(); + } + else + buffer.append(str); }; - std::cerr << "Calling MakeStatic()\n"; + std::cerr << "Number of bytes inserted : " << text_size << "\n"; + std::cerr << "Calling InitTextCollection() for sf=64: "; + STARTTIMER(); + TextCollection *tc64 = tcb64->InitTextCollection(); + STOPTIMER(); + time = GETTIME(); + std::cerr << time << "ms\n"; + std::cerr << "Calling InitTextCollection() for sf=5: "; + STARTTIMER(); + TextCollection *tc2 = tcb2->InitTextCollection(); + STOPTIMER(); + time = GETTIME(); + std::cerr << time << "ms\n"; + FILE* file; + + file = fopen("index_64.tc","w+"); + std::cerr << "Saving to index_64.tc "; + STARTTIMER(); + tc64->Save(file); + STOPTIMER(); + time = GETTIME(); + std::cerr << time << "ms\n"; + fclose(file); + + file = fopen("index_05.tc","w+"); + std::cerr << "Saving to index_05.tc "; + STARTTIMER(); + tc2->Save(file); + STOPTIMER(); + time = GETTIME(); + std::cerr << time << "ms\n"; + fclose(file); - TextCollection *tc = tcb->InitTextCollection(); std::cerr << "Statistics: " << num_str << " strings, " << max_str << " = max length\n"; int count; bool is; TextCollection::document_result res; - for (unsigned int i = 0; i < (sizeof(words)/sizeof(char*)) ; i++){ - - STARTTIMER(); - is = tc->IsContains((unsigned char*) words[i].c_str()); - STOPTIMER(); - time = GETTIME(); - - std::cerr << is << ", " << time << ", "; - - - STARTTIMER(); - count = tc->Count((unsigned char*) words[i].c_str()); - STOPTIMER(); - time = GETTIME(); - - std::cerr << count << ", " << time << ", "; - - - STARTTIMER(); - count = tc->CountContains((unsigned char*) words[i].c_str()); - STOPTIMER(); - time = GETTIME(); - - std::cerr << count << ", " << time << ", "; - - - STARTTIMER(); - res = tc->Contains((unsigned char*) words[i].c_str()); - STOPTIMER(); - time = GETTIME(); - - std::cerr << time << "\n"; + TextCollection *tc; + tc = tc64; + std::cerr << "Sampling rate 64\n"; + for (unsigned int num = 0; num < 2; num ++){ + for (unsigned int i = 0; i < NWORDS ; i++){ + + std::cerr << "\"" << words[i] << ": "; + STARTTIMER(); + is = tc->IsContains((unsigned char*) words[i].c_str()); + STOPTIMER(); + time = GETTIME(); + + std::cerr << is << ", " << time << ", "; + + + STARTTIMER(); + count = tc->Count((unsigned char*) words[i].c_str()); + STOPTIMER(); + time = GETTIME(); + + std::cerr << count << ", " << time << ", "; + + + STARTTIMER(); + count = tc->CountContains((unsigned char*) words[i].c_str()); + STOPTIMER(); + time = GETTIME(); + + std::cerr << count << ", " << time << ", "; + + STARTTIMER(); + res = tc->Contains((unsigned char*) words[i].c_str()); + STOPTIMER(); + time = GETTIME(); + + std::cerr << time << "\n"; + + + }; + tc = tc2; + std::cerr << "---------------------------\n"; + std::cerr << "Sampling rate 5\n"; }; - return 0; }