using std::string;
#include <sys/time.h>
#include <time.h>
+#include <stdio.h>
static struct timeval t1;
static struct timeval t2;
using SXSI::TextCollection;
using SXSI::TextCollectionBuilder;
+ // Test driver: reads "----------"-delimited texts from cin, builds two text collections (builder arguments 64 and 5), saves both to disk, then times four query calls per probe word.
int main(int argc, char**argv)
{
string str;
string buffer;
+ unsigned int text_size = 0; // running total of buffer.size() over all inserted texts
unsigned int max_str = 0;
unsigned int num_str = 0;
double time;
- string words[] = { "abcd", "abc", "mirrors", "attires", "mature",
- "rescue", "such", "embrace", "shipping", "ae",
- "preventions", "ab", "fe", "w" };
-
-
-
- TextCollectionBuilder *tcb = new TextCollectionBuilder(64);
+ string words[] = { "Bakst", // probe patterns handed to the query calls in the loops below
+ "ruminants", "morphine", "AUSTRALIA","molecule" ,"brain", "human", "blood","from", "with", " in", "the", "of",
+ "a",
+ "\n" };
+ unsigned int NWORDS = 15; // loop bound for the query pass; must be kept equal to the number of entries in words[]
+ TextCollectionBuilder *tcb64 = new TextCollectionBuilder(64); // builder constructed with 64 (logged below as "sf=64")
+ TextCollectionBuilder *tcb2 = new TextCollectionBuilder(5); // builder constructed with 5 (logged below as "sf=5"); NOTE(review): the name says 2, the argument is 5
STARTTIMER();
std::cerr << "Filling collection\n";
- // read only 100000 strings
- while (not(cin.eof()) && num_str < 100000 ){
- getline(cin,str); // Read line by line.
- if (str.compare("----------") == 0){
- tcb->InsertText((unsigned char*) buffer.c_str());
-
- if (num_str % 10000 == 0){
- STOPTIMER();
- time = GETTIME();
- std::cerr << "Added " << num_str << " strings in "
- << time << " ms\n";
- STARTTIMER();
- };
- num_str++;
- if (max_str < buffer.size())
- max_str = buffer.size();
- buffer.clear();
- }
- else
- buffer.append(str);
+ while (not(cin.eof()) ){ // NOTE(review): eof() is tested before the read, so a failed final getline still runs the body once
+ getline(cin,str); // Read line by line.
+ str.append("\n"); // put back the newline that getline strips, so inserted texts keep their line breaks
+ // A line that is exactly "----------" ends the current text; every other line is accumulated in buffer.
+ if (str.compare("----------\n") == 0 ){
+ tcb64->InsertText((unsigned char*) buffer.c_str()); // the same text is inserted into both builders
+ tcb2->InsertText((unsigned char*) buffer.c_str());
+ // Progress report every 10000 texts (this also fires for the first text, when num_str is 0).
+ if (num_str % 10000 == 0){
+ STOPTIMER();
+ time = GETTIME();
+ std::cerr << "Added " << num_str << " strings in "
+ << time << " ms\n";
+ std::cerr.flush();
+ //STARTTIMER(); // restart left disabled; presumably this makes the reported time cumulative -- confirm against the timer macros
+ };
+
+ num_str++;
+ if (max_str < buffer.size())
+ max_str = buffer.size();
+ text_size += buffer.size(); // NOTE(review): unsigned int accumulator; wraps if the total exceeds UINT_MAX bytes
+ buffer.clear();
+ }
+ else
+ buffer.append(str); // str already carries its trailing "\n"
};
- std::cerr << "Calling MakeStatic()\n";
+ std::cerr << "Number of bytes inserted : " << text_size << "\n";
+ std::cerr << "Calling InitTextCollection() for sf=64: ";
+ STARTTIMER();
+ TextCollection *tc64 = tcb64->InitTextCollection(); // timed: collection obtained from the 64 builder
+ STOPTIMER();
+ time = GETTIME();
+ std::cerr << time << "ms\n";
+ std::cerr << "Calling InitTextCollection() for sf=5: ";
+ STARTTIMER();
+ TextCollection *tc2 = tcb2->InitTextCollection(); // timed: collection obtained from the 5 builder
+ STOPTIMER();
+ time = GETTIME();
+ std::cerr << time << "ms\n";
+ FILE* file; // reused for both output files
+ // NOTE(review): neither fopen result below is checked before it is passed to Save().
+ file = fopen("index_64.tc","w+");
+ std::cerr << "Saving to index_64.tc ";
+ STARTTIMER();
+ tc64->Save(file);
+ STOPTIMER();
+ time = GETTIME();
+ std::cerr << time << "ms\n";
+ fclose(file);
+
+ file = fopen("index_05.tc","w+");
+ std::cerr << "Saving to index_05.tc ";
+ STARTTIMER();
+ tc2->Save(file);
+ STOPTIMER();
+ time = GETTIME();
+ std::cerr << time << "ms\n";
+ fclose(file);
- TextCollection *tc = tcb->InitTextCollection();
std::cerr << "Statistics: " << num_str << " strings, " << max_str << " = max length\n";
int count;
bool is;
TextCollection::document_result res;
- for (unsigned int i = 0; i < (sizeof(words)/sizeof(char*)) ; i++){
-
- STARTTIMER();
- is = tc->IsContains((unsigned char*) words[i].c_str());
- STOPTIMER();
- time = GETTIME();
-
- std::cerr << is << ", " << time << ", ";
-
-
- STARTTIMER();
- count = tc->Count((unsigned char*) words[i].c_str());
- STOPTIMER();
- time = GETTIME();
-
- std::cerr << count << ", " << time << ", ";
-
-
- STARTTIMER();
- count = tc->CountContains((unsigned char*) words[i].c_str());
- STOPTIMER();
- time = GETTIME();
-
- std::cerr << count << ", " << time << ", ";
-
-
- STARTTIMER();
- res = tc->Contains((unsigned char*) words[i].c_str());
- STOPTIMER();
- time = GETTIME();
-
- std::cerr << time << "\n";
+ TextCollection *tc; // collection queried in the current pass
+ tc = tc64; // first pass queries the 64 collection
+ std::cerr << "Sampling rate 64\n";
+ for (unsigned int num = 0; num < 2; num ++){ // two passes: tc64 first, then tc2
+ for (unsigned int i = 0; i < NWORDS ; i++){
+ // Each call is timed on its own; row layout: IsContains result, time, Count result, time, CountContains result, time, Contains time.
+ std::cerr << "\"" << words[i] << ": "; // NOTE(review): an opening quote is printed but never closed
+ STARTTIMER();
+ is = tc->IsContains((unsigned char*) words[i].c_str());
+ STOPTIMER();
+ time = GETTIME();
+
+ std::cerr << is << ", " << time << ", ";
+
+
+ STARTTIMER();
+ count = tc->Count((unsigned char*) words[i].c_str());
+ STOPTIMER();
+ time = GETTIME();
+
+ std::cerr << count << ", " << time << ", ";
+
+
+ STARTTIMER();
+ count = tc->CountContains((unsigned char*) words[i].c_str());
+ STOPTIMER();
+ time = GETTIME();
+
+ std::cerr << count << ", " << time << ", ";
+
+ STARTTIMER();
+ res = tc->Contains((unsigned char*) words[i].c_str()); // the result is stored but not printed; only the elapsed time is reported
+ STOPTIMER();
+ time = GETTIME();
+
+ std::cerr << time << "\n";
+
+
+ };
+ tc = tc2; // switch to the 5 collection for the second pass
+ std::cerr << "---------------------------\n";
+ std::cerr << "Sampling rate 5\n"; // NOTE(review): also printed after the second pass, when no further queries follow
};
-
return 0;
}