X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=testTextCollection.cpp;h=a19d2b20fc2dbddc539d1d1883f37b176469fb81;hb=HEAD;hp=59b6b43c11f2d64ef305605e0ecdf67c2b3fb912;hpb=9c741e645d2c51f59b7cc40bf9ca0a91e1ed51d4;p=SXSI%2FTextCollection.git diff --git a/testTextCollection.cpp b/testTextCollection.cpp index 59b6b43..a19d2b2 100644 --- a/testTextCollection.cpp +++ b/testTextCollection.cpp @@ -7,8 +7,9 @@ using std::endl; using std::cin; using std::string; -#include "TextCollection.h" +#include "TextCollectionBuilder.h" #include "HeapProfiler.h" +using SXSI::TextCollectionBuilder; using SXSI::TextCollection; void printDocumentResult(TextCollection::document_result dr) @@ -39,20 +40,23 @@ int main() int i = 0 ,j = 0; int heap_base = HeapProfiler::GetHeapConsumption(); std::cerr << "Initial heap usage : " << heap_base << "\n"; - TextCollection *csa = TextCollection::InitTextCollection(5); // Avoid small samplerates ;) + TextCollectionBuilder *tcb = TextCollectionBuilder::create(5); heap_base = HeapProfiler::GetHeapConsumption (); std::cerr << "Heap usage after InitTextCollection : " << heap_base << "\n"; - + Tools::StartTimer(); while (not(cin.eof())){ - getline(cin,str); // Read line by line. -// cin >> str; // Read word by word. + getline(cin,str); // Read line by line. +// cin >> str; // Read word by word. data = (uchar *) str.c_str(); - csa->InsertText(data); + if (str.size() == 0) + continue; + + tcb->InsertText(data); i++; j+= str.size(); str.clear(); - if ( i % 1000 == 0) { + if ( i % 100000 == 0) { std::cerr << "Inserted : " << i << " strings\n"; std::cerr << "Number of bytes inserted : " << j << "b \n"; std::cerr << "Heap usage used for strings: " << HeapProfiler::GetHeapConsumption() - heap_base @@ -62,26 +66,43 @@ int main() }; }; +/**/ + //the whole file as 20 strings: + /* uchar *temp = Tools::GetFileContents("data/english.100MB", 0); + ulong n = strlen((char *)temp); + std::cout << "n = " << n << std::endl; + ulong offset = n/40; + uchar *it = temp; + for (i = 0; i < 5; ++i) + { + it[offset] = '\0'; + tcb->InsertText(it); + std::cout << "inserted " << strlen((char *)it) << " bytes." << std::endl; + it += offset +1; + } + it -= offset+1; -/* the whole file as one string: - uchar *temp = Tools::GetFileContents("data.txt", 0); - csa->InsertText(temp); + if (it > temp + n) + std::cout << "over bounds" << std::endl; delete [] temp;*/ + std::cerr << "Creating new text collection with " << i << " strings (total " << j/1024 << " kb)\n"; + std::cerr << "max heap usage: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; + HeapProfiler::ResetMaxHeapConsumption(); std::cerr << "Before MakeStatic() [press enter]\n"; - std::cin >> kbd; + //std::cin >> kbd; // This will print the maximum mem usage during construction time: - std::cerr << "max heap usage: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; - csa->MakeStatic(); + TextCollection* tc = tcb->InitTextCollection();//SXSI::TextStorage::TYPE_LZ_INDEX); + delete tcb; tcb = 0; std::cerr << "After MakeStatic() [press enter]\n"; // This will print the maximum mem usage during MakeStatic(): std::cerr << "max heap usage: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; - std::cin >> kbd; + //std::cin >> kbd; std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; - delete csa; + delete tc; std::cerr << "After Delete [press enter]\n"; - std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; - std::cin >> kbd; + std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption() << " bytes" << std::endl; + //std::cin >> kbd; return 0; }