{
// Save file version info
-const uchar TCImplementation::versionFlag = 3;
+const uchar TCImplementation::versionFlag = 4;
/**
* Constructor inits an empty dynamic FM-index.
*/
TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_, unsigned numberOfTexts_, ulong maxTextLength_)
: n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
- suffixDocId(0), numberOfTexts(numberOfTexts_), numberOfAllTexts(numberOfTexts_), maxTextLength(maxTextLength_), endmarkerDocId(0),
- emptyTextRank(0)
+ suffixDocId(0), numberOfTexts(numberOfTexts_), maxTextLength(maxTextLength_), Doc(0)
{
-
- // Bitvector of empty texts, FIXME Remove?
- {
- //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <<std::endl;
- ulong *tempB = new ulong[numberOfTexts/W + 1];
- for (ulong i = 0; i < numberOfTexts/W + 1; ++i)
- tempB[i] = 0;
-// for (std::set<unsigned>::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it)
-// Tools::SetField(tempB, 1, (*it), 1);
- emptyTextRank = new BSGAP(tempB, numberOfTexts, true);
- }
makewavelet(bwt); // Deletes bwt!
bwt = 0;
bool TCImplementation::EmptyText(DocId k) const
{
assert(k < (DocId)numberOfTexts);
- if (emptyTextRank->IsBitSet(k))
- return true;
- return false;
+ return false; // Empty texts are not indexed
}
uchar* TCImplementation::GetText(DocId k) const
{
assert(k < (DocId)numberOfTexts);
-
- ulong textRank = 0;
- if (emptyTextRank->IsBitSet(k, &textRank))
- {
- uchar* result = new uchar[1];
- result[0] = 0;
- return result;
- }
- // Map to non-empty text
- k -= textRank; //emptyTextRank->rank(k);
-
TextPosition i = k;
string result;
assert(i <= j);
ulong textRank = 0;
- if (emptyTextRank->IsBitSet(k, &textRank))
- {
- uchar* result = new uchar[1];
- result[0] = 0;
- return result;
- }
-
- // Map to non-empty text
- k -= textRank; // emptyTextRank->rank(k);
// Start position of k'th text
ulong start = (*textStartPos)[k];
{
TextPosition m = std::strlen((char *)pattern);
if (m == 0)
- {
- if (numberOfAllTexts - numberOfTexts > 0u)
- return true; // Empty texts exists
return false; // No empty texts exists
- }
TextPosition sp = 0, ep = 0;
// Match including end-marker
{
TextPosition m = std::strlen((char *)pattern);
if (m == 0)
- {
- if (numberOfAllTexts - numberOfTexts > 0u)
- return true; // Empty texts exists
return false; // No empty texts exists
- }
TextPosition sp = 0, ep = 0;
// Match including end-marker
{
TextPosition m = strlen((char *)pattern);
if (m == 0)
- return numberOfAllTexts;
+ return numberOfTexts;
TextPosition sp = 0, ep = 0;
Search(pattern, m, &sp, &ep);
{
TextPosition m = strlen((char *)pattern);
if (m == 0)
- return numberOfAllTexts;
+ return numberOfTexts;
TextPosition sp = 0, ep = 0;
Search(pattern, m, &sp, &ep);
{
TextPosition m = strlen((char *)pattern);
if (m == 0)
- return numberOfAllTexts;
+ return numberOfTexts;
TextPosition sp = 0, ep = 0;
// Search with end-marker
{
TextPosition m = strlen((char *)pattern);
if (m == 0)
- return numberOfAllTexts;
+ return numberOfTexts;
TextPosition sp = 0, ep = 0;
// Search with end-marker
{
TextPosition m = strlen((char const *)pattern);
if (m == 0)
- return numberOfAllTexts - numberOfTexts; // Empty texts.
+ return 0; // No empty texts.
TextPosition sp = 0, ep = 0;
// Match including end-marker
{
TextPosition m = strlen((char const *)pattern);
if (m == 0)
- return numberOfAllTexts - numberOfTexts; // Empty texts.
+ return 0; // No empty texts.
TextPosition sp = 0, ep = 0;
// Match including end-marker
{
TextPosition m = strlen((char const *)pattern);
if (m == 0)
- return numberOfAllTexts; // Total number of texts.
+ return numberOfTexts; // Total number of texts.
// Here counting is as slow as fetching the result set
// because we have to filter out occ's that fall within same document.
{
TextPosition m = strlen((char const *)pattern);
if (m == 0)
- return numberOfAllTexts; // Total number of texts.
+ return numberOfTexts; // Total number of texts.
// Here counting is as slow as fetching the result set
// because we have to filter out occ's that fall within same document.
{
TextPosition m = strlen((char const *)pattern);
if (m == 0)
- return numberOfAllTexts - numberOfTexts; // Empty texts.
+ return 0; // No empty texts.
TextPosition sp = 0, ep = 0;
SearchLessThan(pattern, m, &sp, &ep);
{
TextPosition m = strlen((char const *)pattern);
if (m == 0)
- return numberOfAllTexts - numberOfTexts; // Empty texts.
+ return 0; // No empty texts.
TextPosition sp = 0, ep = 0;
SearchLessThan(pattern, m, &sp, &ep);
TextPosition sp = 0, ep = 0;
Search(pattern, m, &sp, &ep);
-
- TextCollection::document_result result;
- // Report end-markers in result interval
- unsigned resultSize = CountEndmarkers(sp, ep);
- if (resultSize == 0)
- return result;
-
- result.reserve(resultSize); // Try to avoid reallocation.
-
// Iterate through end-markers in [sp,ep]:
- unsigned i = 0;
- if (sp > 0)
- i = alphabetrank->rank(0, sp - 1);
- while (resultSize)
- {
- // End-marker we found belongs to the "preceeding" doc in the collection
- DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- result.push_back(docId);
-
- -- resultSize;
- ++ i;
- }
-
- return result;
+ return EnumerateEndmarkers(sp, ep);
}
TextCollection::document_result TCImplementation::Prefix(uchar const * pattern, DocId begin, DocId end) const
TextPosition sp = 0, ep = 0;
Search(pattern, m, &sp, &ep);
- TextCollection::document_result result;
-
- // Report end-markers in result interval
- unsigned resultSize = CountEndmarkers(sp, ep);
- if (resultSize == 0)
- return result;
-
- result.reserve(resultSize); // Try to avoid reallocation.
-
- // Iterate through end-markers in [sp,ep] and [begin, end]:
- unsigned i = 0;
- if (sp > 0)
- i = alphabetrank->rank(0, sp - 1);
- while (resultSize)
- {
- // End-marker we found belongs to the "preceeding" doc in the collection
- DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- if (docId >= begin && docId <= end)
- result.push_back(docId);
-
- -- resultSize;
- ++ i;
- }
-
- return result;
+ // Return end-markers in [sp,ep] and [begin, end]:
+ return EnumerateEndmarkers(sp, ep, begin, end);
}
TextCollection::document_result TCImplementation::Suffix(uchar const * pattern) const
{
// Rank among the end-markers in BWT
unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
-
- // End-marker that we found belongs to the "preceeding" doc in collection:
- DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- result.push_back(docId);
+ result.push_back(Doc->access(endmarkerRank));
}
else // Sampled position
{
DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
result.push_back(docId);
}
}
{
// Rank among the end-markers in BWT
unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
-
- // End-marker that we found belongs to the "preceeding" doc in collection:
- DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- result.push_back(docId);
+ result.push_back(Doc->access(endmarkerRank));
}
else // Sampled position
{
DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
result.push_back(docId);
}
}
// Match including end-marker
Search(pattern, m+1, &sp, &ep);
- TextCollection::document_result result;
-
// Report end-markers in result interval
- unsigned resultSize = CountEndmarkers(sp, ep);
- if (resultSize == 0)
- return result;
-
- result.reserve(resultSize); // Try to avoid reallocation.
-
- unsigned i = 0;
- if (sp > 0)
- i = alphabetrank->rank(0, sp - 1);
- while (resultSize)
- {
- // End-marker we found belongs to the "previous" doc in the collection
- DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- result.push_back(docId);
-
- -- resultSize;
- ++ i;
- }
-
- return result;
+ return EnumerateEndmarkers(sp, ep);
}
TextCollection::document_result TCImplementation::Equal(uchar const *pattern, DocId begin, DocId end) const
// Match including end-marker
Search(pattern, m+1, &sp, &ep, begin, end);
- TextCollection::document_result result;
-
// Report end-markers in result interval
- unsigned resultSize = CountEndmarkers(sp, ep);
- if (resultSize == 0)
- return result;
-
- result.reserve(resultSize); // Try to avoid reallocation.
-
- unsigned i = 0;
- if (sp > 0)
- i = alphabetrank->rank(0, sp - 1);
- while (resultSize)
- {
- // End-marker we found belongs to the "previous" doc in the collection
- DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- result.push_back(docId); // already within [begin, end]
-
- -- resultSize;
- ++ i;
- }
-
- return result;
+ return EnumerateEndmarkers(sp, ep, begin, end);
}
{
// Rank among the end-markers in BWT
unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
-
- // End-marker that we found belongs to the "preceeding" doc in collection:
- DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
- resultSet.insert(docId);
+ resultSet.insert(Doc->access(endmarkerRank));
}
else
{
// Convert std::set to std::vector
TextCollection::document_result result(resultSet.begin(), resultSet.end());
- // Map to doc ID's
- for (document_result::iterator it = result.begin(); it != result.end(); ++it)
- *it = emptyTextRank->select0(*it+1);
return result;
}
{
// Rank among the end-markers in BWT
unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
-
- // End-marker that we found belongs to the "preceeding" doc in collection:
- DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
+ DocId docId = Doc->access(endmarkerRank);
if (docId >= begin && docId <= end)
resultSet.insert(docId);
}
// Convert std::set to std::vector
TextCollection::document_result result(resultSet.begin(), resultSet.end());
- // Map to doc ID's
- for (document_result::iterator it = result.begin(); it != result.end(); ++it)
- *it = emptyTextRank->select0(*it+1);
return result;
}
TextPosition sp = 0, ep = 0;
SearchLessThan(pattern, m, &sp, &ep);
- TextCollection::document_result result;
-
// Report end-markers in result interval
- unsigned resultSize = CountEndmarkers(sp, ep);
- if (resultSize == 0)
- return result;
-
- result.reserve(resultSize); // Try to avoid reallocation.
-
- // Iterate through end-markers in [sp,ep]:
- unsigned i = 0;
- if (sp > 0)
- i = alphabetrank->rank(0, sp - 1);
- while (resultSize)
- {
- // End-marker we found belongs to the "preceeding" doc in the collection
- DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- result.push_back(docId);
-
- -- resultSize;
- ++ i;
- }
-
- return result;
+ return EnumerateEndmarkers(sp, ep);
}
TextCollection::document_result TCImplementation::LessThan(uchar const * pattern, DocId begin, DocId end) const
TextPosition sp = 0, ep = 0;
SearchLessThan(pattern, m, &sp, &ep);
- TextCollection::document_result result;
-
- // Report end-markers in result interval
- unsigned resultSize = CountEndmarkers(sp, ep);
- if (resultSize == 0)
- return result;
-
- result.reserve(resultSize); // Try to avoid reallocation.
-
// Iterate through end-markers in [sp,ep] and [begin, end]:
- unsigned i = 0;
- if (sp > 0)
- i = alphabetrank->rank(0, sp - 1);
- while (resultSize)
- {
- // End-marker we found belongs to the "preceeding" doc in the collection
- DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- if (docId >= begin && docId <= end)
- result.push_back(docId);
-
- -- resultSize;
- ++ i;
- }
-
- return result;
+ return EnumerateEndmarkers(sp, ep, begin, end);
}
/**
{
// Rank among the end-markers in BWT
unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
-
- // End-marker that we found belongs to the "preceeding" doc in collection:
- DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
+ DocId docId = Doc->access(endmarkerRank);
result.push_back(make_pair(docId, dist));
}
else
DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
// textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
result.push_back(make_pair(docId, textPos));
}
}
unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
// End-marker that we found belongs to the "preceeding" doc in collection:
- DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
+ DocId docId = Doc->access(endmarkerRank);
if (docId >= begin && docId <= end)
result.push_back(make_pair(docId, dist));
}
DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
// textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
if (docId >= begin && docId <= end)
result.push_back(make_pair(docId, textPos));
}
BlockArray *suffixes;
BlockArray *suffixDocId;
unsigned numberOfTexts;
- unsigned numberOfAllTexts;
ulong maxTextLength;
- BlockArray *endmarkerDocId;
- BSGAP *emptyTextRank;
+ static_sequence *docId;
*/
void TCImplementation::Save(FILE *file) const
{
if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
throw std::runtime_error("TCImplementation::Save(): file write error (numberOfTexts).");
- if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
- throw std::runtime_error("TCImplementation::Save(): file write error (numberOfAllTexts).");
if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
throw std::runtime_error("TCImplementation::Save(): file write error (maxTextLength).");
- endmarkerDocId->Save(file);
- emptyTextRank->Save(file);
+ Doc->save(file);
fflush(file);
}
*/
TCImplementation::TCImplementation(FILE *file, unsigned samplerate_)
: n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
- suffixDocId(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0),
- emptyTextRank(0)
+ suffixDocId(0), numberOfTexts(0), maxTextLength(0), Doc(0)
{
uchar verFlag = 0;
if (std::fread(&verFlag, 1, 1, file) != 1)
if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
throw std::runtime_error("TCImplementation::Load(): file read error (numberOfTexts).");
- if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
- throw std::runtime_error("TCImplementation::Load(): file read error (numberOfAllTexts).");
if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength).");
- endmarkerDocId = new BlockArray(file);
- emptyTextRank = new BSGAP(file);
+ Doc = static_sequence::load(file);
// FIXME Construct data structures with new samplerate
//maketables();
*/
-// FIXME Use 2D-search structure
-unsigned TCImplementation::CountEndmarkers(TextPosition sp, TextPosition ep, DocId begin, DocId end) const
-{
- if (sp > ep)
- return 0;
-
- ulong ranksp = 0;
- if (sp != 0)
- ranksp = alphabetrank->rank(0, sp - 1);
-
- unsigned resultSize = alphabetrank->rank(0, ep) - ranksp;
- if (resultSize == 0)
- return 0;
- // Count end-markers in result interval and within [begin, end]
- unsigned count = 0;
- unsigned i = ranksp;
- while (resultSize)
- {
- // End-marker we found belongs to the "previous" doc in the collection
- DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- if (docId >= begin && docId <= end)
- ++ count;
-
- -- resultSize;
- ++ i;
- }
- return count;
-}
-
ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
{
// use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
TCImplementation::~TCImplementation() {
-#ifdef CSA_TEST_BWT
- delete dynFMI;
-#endif
delete alphabetrank;
delete sampled;
delete suffixes;
delete suffixDocId;
-
- delete endmarkerDocId;
- delete emptyTextRank;
+ delete Doc;
}
void TCImplementation::makewavelet(uchar *bwt)
BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n);
// Mapping from end-markers to doc ID's:
- endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
+ BlockArray *endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
ulong *sampledpositions = new ulong[n/W+1];
for (ulong i=0;i<n/W+1;i++)
delete positions;
// delete textLength;
delete textStartPos;
+
+
+ uint *tmp = new uint[numberOfTexts]; // FIXME Silly...
+// cout << "Doc: ";
+ for (unsigned i = 0; i < numberOfTexts; ++i)
+ {
+ tmp[i] = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+ // cout << tmp[i] << ", ";
+ }
+// cout << endl;
+ delete endmarkerDocId;
+ alphabet_mapper * am = new alphabet_mapper_none();
+ static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
+ wt_coder * wtc = new wt_coder_binary(tmp, numberOfTexts, am);
+ Doc = new static_sequence_wvtree(tmp, numberOfTexts, wtc, bmb, am);
+ delete bmb;
+ delete [] tmp;
+
+ /* document_result res = Doc->access(1, 2, 0, 1);
+ cout << "result: ";
+ for (document_result::iterator it = res.begin(); it != res.end(); ++it)
+ cout << *it << ", ";
+ cout << endl;*/
}