}
-////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////f///////////////////////////////
// Class CSA
/**
*/
CSA::CSA(unsigned samplerate)
: n(0), alphabetrank(0), sampled(0), suffixes(0), positions(0),
- codetable(0), dynFMI(0)
+ codetable(0), dynFMI(0), numberOfTexts(0), maxTextLength(0), endmarkerDocId(0),
+ textLength(0), textStartPos(0)
{
this->samplerate = samplerate;
temp[i] = i+1;
temp[255] = 0;
dynFMI = new DynFMI(temp, 1, 255, false);
+
+ /* Debug code: take char distribution from data.txt.
+ uchar *temp = Tools::GetFileContents("data.txt", 0);
+ dynFMI = new DynFMI(temp,1790000, strlen((char *)temp), false);
+ delete [] temp;*/
}
/**
void CSA::InsertText(uchar const * text)
{
// Sanity check:
- assert(dynFMI != 0);
+ assert(dynFMI != 0);
TextPosition m = std::strlen((char *)text) + 1;
- // Store text length and starting position
- textLength.push_back(make_pair(m, n));
+ if (m > maxTextLength)
+ maxTextLength = m; // Store length of the longest text seen so far.
+
this->n += m;
+ this->numberOfTexts ++;
dynFMI->addText(text, m);
}
}
uchar *bwt = dynFMI->getBWT();
- /*printf("1234567890123456789\n");
+ /*printf("123456789012345678901234567890123456789\n");
for (TextPosition i = 0; i < n; i ++)
if (bwt[i] < 50)
printf("%d", (int)bwt[i]);
else
printf("%c", bwt[i]);
- printf("\n");
- */
+ printf("\n");*/
+
/* for (TextPosition i = 1; i <= n; i ++)
{
}*/
// Sanity check
- assert(textLength.size() == dynFMI->getCollectionSize());
+ assert(numberOfTexts == dynFMI->getCollectionSize());
delete dynFMI;
dynFMI = 0;
- makewavelet(bwt);
+ makewavelet(bwt);
// Calculate BWT end-marker position (of last inserted text)
+ // and the length of the first text (counter l):
ulong i = 0;
+ ulong l = 1;
while (bwt[i] != '\0')
{
uchar c = bwt[i];
i = C[c]+alphabetrank->rank(c, i)-1;
+ ++l;
}
bwtEndPos = i;
//printf("bwtEndPos = %lu\n", bwtEndPos);
delete [] bwt;
+ // Build up arrays for text length and starting positions
+ textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength));
+ textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
+ (*textLength)[0] = l;
+ (*textStartPos)[0] = 0; // Rest of the values are updated in CSA::maketables().
+
// Make sampling tables
maketables();
}
bool CSA::EmptyText(DocId k) const
{
- assert(k < (DocId)textLength.size());
- return (1 == textLength[k].first);
+ assert(k < (DocId)numberOfTexts);
+ return (1 == (*textLength)[k]);
}
uchar* CSA::GetText(DocId k) const
{
- assert(k < (DocId)textLength.size());
+ assert(k < (DocId)numberOfTexts);
- uchar* result = new uchar[textLength[k].first];
+ uchar* result = new uchar[(*textLength)[k]];
TextPosition i = k;
- TextPosition j = textLength[k].first-1;
+ TextPosition j = (*textLength)[k] - 1;
result[j] = '\0';
uchar c = alphabetrank->charAtPos(i);
uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
{
- assert(k < (DocId)textLength.size());
- assert(j < textLength[k].first);
+ assert(k < (DocId)numberOfTexts);
+ assert(j < (*textLength)[k]);
assert(i <= j);
// Start position of k'th text
- ulong start = textLength[k].second;
+ ulong start = (*textStartPos)[k];
return Substring(i + start, j-i+1);
}
-/**
+/******************************************************************
* Existential queries
*/
bool CSA::IsPrefix(uchar const * pattern) const
{
TextPosition m = strlen((char *)pattern);
if (m == 0)
- return textLength.size();
+ return true;
TextPosition sp = 0, ep = 0;
Search(pattern, m, &sp, &ep);
// Check for end-marker(s) in result interval
- map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
- it = endmarkers.lower_bound(sp);
- if (it != endmarkers.end() && it->first <= ep)
+ if (CountEndmarkers(sp, ep))
return true;
return false;
}
bool CSA::IsEqual(uchar const *pattern) const
{
TextPosition m = std::strlen((char *)pattern);
- if (m == 0)
- return textLength.size();
+ //if (m == 0)
+ // return false; // FIXME Check for empty texts!
TextPosition sp = 0, ep = 0;
// Match including end-marker
Search(pattern, m+1, &sp, &ep);
// Check for end-marker(s) in result interval
- map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
- it = endmarkers.lower_bound(sp);
- if (it != endmarkers.end() && it->first <= ep)
+ if (CountEndmarkers(sp, ep))
return true;
return false;
}
bool CSA::IsLessThan(uchar const*) const
{
// TODO
+ assert(0);
return false;
}
-/**
+/******************************************************************
* Counting queries
*/
unsigned CSA::CountPrefix(uchar const * pattern) const
{
TextPosition m = strlen((char *)pattern);
if (m == 0)
- return textLength.size();
+ return numberOfTexts;
TextPosition sp = 0, ep = 0;
Search(pattern, m, &sp, &ep);
// Count end-markers in result interval
- map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
- it = endmarkers.lower_bound(sp);
- unsigned count = 0;
- while (it != endmarkers.end() && it->first <= ep)
- {
- ++ count;
- ++ it;
- }
-
- return count;
+ return CountEndmarkers(sp, ep);
}
unsigned CSA::CountSuffix(uchar const * pattern) const
{
TextPosition m = strlen((char *)pattern);
if (m == 0)
- return textLength.size();
+ return numberOfTexts;
TextPosition sp = 0, ep = 0;
// Search with end-marker
unsigned CSA::CountEqual(uchar const *pattern) const
{
TextPosition m = strlen((char const *)pattern);
- if (m == 0)
- return textLength.size();
+// if (m == 0)
+// return ; // FIXME Test for empty texts.
TextPosition sp = 0, ep = 0;
// Match including end-marker
Search(pattern, m+1, &sp, &ep);
// Count end-markers in result interval
- map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
- it = endmarkers.lower_bound(sp);
- unsigned count = 0;
- while (it != endmarkers.end() && it->first <= ep)
- {
- ++ count;
- ++ it;
- }
-
- return count;
+ return CountEndmarkers(sp, ep);
}
unsigned CSA::CountContains(uchar const * pattern) const
unsigned CSA::CountLessThan(uchar const * pattern) const
{
// TODO
+ assert(0);
return 0;
}
TextCollection::document_result result;
// Report end-markers in result interval
- map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
- it = endmarkers.lower_bound(sp);
- unsigned count = 0;
- while (it != endmarkers.end() && it->first <= ep)
+ unsigned resultSize = CountEndmarkers(sp, ep);
+ if (resultSize == 0)
+ return result;
+
+ unsigned i = 0;
+ if (sp > 0)
+ i = alphabetrank->rank(0, sp - 1);
+ while (resultSize)
{
- // End-marker we found belongs to the "previous" doc in the collection
- DocId docId = ((it->second).first + 1) % textLength.size();
+ // End-marker we found belongs to the "preceeding" doc in the collection
+ DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
result.push_back(docId);
- ++ count;
- ++ it;
+ -- resultSize;
+ ++ i;
}
-
+
return result;
}
}
if (c == '\0')
{
- // map::operator[] is not const, using find() instead:
- pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
- // End-marker that we found belongs to the "previous" doc in collection:
- DocId docId = (endm.first + 1) % textLength.size();
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
+
+ // End-marker that we found belongs to the "preceeding" doc in collection:
+ DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
result.push_back(docId);
}
- else
+ else // Sampled position
{
- TextPosition textPos = suffixes[sampled->rank(i)-1];
+ TextPosition textPos = (*suffixes)[sampled->rank(i)-1];
result.push_back(DocIdAtTextPos(textPos));
}
}
TextCollection::document_result result;
// Report end-markers in result interval
- map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
- it = endmarkers.lower_bound(sp);
- unsigned count = 0;
- while (it != endmarkers.end() && it->first <= ep)
+ unsigned resultSize = CountEndmarkers(sp, ep);
+ if (resultSize == 0)
+ return result;
+
+ unsigned i = 0;
+ if (sp > 0)
+ i = alphabetrank->rank(0, sp - 1);
+ while (resultSize)
{
- // End-marker that we found belongs to the "previous" doc in collection:
- DocId docId = ((it->second).first + 1) % textLength.size();
+ // End-marker we found belongs to the "previous" doc in the collection
+ DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
result.push_back(docId);
- ++ count;
- ++ it;
+ -- resultSize;
+ ++ i;
}
-
+
return result;
}
}
if (c == '\0')
{
- // map::operator[] is not const, using find() instead:
- pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
- // End-marker we found belongs to the "previous" doc in collection
- DocId docId = (endm.first + 1) % textLength.size();
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
+
+ // End-marker that we found belongs to the "preceeding" doc in collection:
+ DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
resultSet.insert(docId);
}
else
{
- TextPosition textPos = suffixes[sampled->rank(i)-1];
+ TextPosition textPos = (*suffixes)[sampled->rank(i)-1];
resultSet.insert(DocIdAtTextPos(textPos));
}
}
TextCollection::document_result CSA::LessThan(uchar const * pattern) const
{
// TODO
+ assert(0);
return document_result();
}
}
if (c == '\0')
{
- // map::operator[] is not const, using find() instead:
- pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
- // End-marker we found belongs to the "previous" doc in the collection:
- DocId docId = (endm.first+1)%textLength.size();
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
+
+ // End-marker that we found belongs to the "preceeding" doc in collection:
+ DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
result.push_back(make_pair(docId, dist));
}
else
{
- TextPosition textPos = suffixes[sampled->rank(i)-1] + dist;
+ TextPosition textPos = (*suffixes)[sampled->rank(i)-1] + dist;
// Read document identifier and its starting position in text order
DocId docId = DocIdAtTextPos(textPos);
- result.push_back(make_pair(docId, textPos - textLength[docId].second));
+ result.push_back(make_pair(docId, textPos - (*textStartPos)[docId]));
}
}
assert(i < n);
DocId a = 0;
- DocId b = textLength.size() - 1;
+ DocId b = numberOfTexts - 1;
while (a < b)
{
DocId c = a + (b - a)/2;
- if (textLength[c].second > i)
+ if ((*textStartPos)[c] > i)
b = c - 1;
- else if (textLength[c+1].second > i)
+ else if ((*textStartPos)[c+1] > i)
return c;
else
a = c + 1;
}
- assert(a < (DocId)textLength.size());
- assert(i > textLength[a].second);
- assert(i < (a == (DocId)textLength.size() - 1 ? n : textLength[a+1].second));
+ assert(a < (DocId)numberOfTexts);
+ assert(i > (*textStartPos)[a]);
+ assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
return a;
}
+/**
+ * Count end-markers in given interval
+ */
+unsigned CSA::CountEndmarkers(TextPosition sp, TextPosition ep) const
+{
+ if (sp > ep)
+ return 0;
+
+ ulong ranksp = 0;
+ if (sp != 0)
+ ranksp = alphabetrank->rank(0, sp - 1);
+
+ return alphabetrank->rank(0, ep) - ranksp;
+}
+
/**
* Save index to a file handle
*
* <ulong bwt> end marker position in BWT;
* <uchar *> BWT string of length n;
* <unsigned r> number of texts;
+ * <ulong max> length of longest text;
* <vector textLength>
* array of <ulong, ulong> pairs.
*
* TODO: Save the data structures instead of BWT sequence?
+ * TODO: Don't save textLength and textStartPos arrays.
*/
void CSA::Save(FILE *file) const
{
throw std::runtime_error("CSA::Save(): file write error (bwt sequence).");
}
- unsigned r = textLength.size();
+ unsigned r = numberOfTexts;
if (std::fwrite(&r, sizeof(unsigned), 1, file) != 1)
throw std::runtime_error("CSA::Save(): file write error (r).");
+
+ if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("CSA::Save(): file write error (maxTextLength).");
- for (r = 0; r < textLength.size(); ++ r)
+ for (r = 0; r < numberOfTexts; ++ r)
{
- if (std::fwrite(&(textLength[r].first), sizeof(TextPosition), 1, file) != 1)
+ if (std::fwrite(&((*textLength)[r]), sizeof(TextPosition), 1, file) != 1)
throw std::runtime_error("CSA::Save(): file write error (text length).");
- if (std::fwrite(&(textLength[r].second), sizeof(TextPosition), 1, file) != 1)
+ if (std::fwrite(&((*textStartPos)[r]), sizeof(TextPosition), 1, file) != 1)
throw std::runtime_error("CSA::Save(): file write error (text start).");
}
}
delete [] positions; positions = 0;
delete [] codetable; codetable = 0;
- endmarkers.clear();
- textLength.clear();
+ delete endmarkerDocId; endmarkerDocId = 0;
+ delete textLength; textLength = 0;
+ delete textStartPos; textStartPos = 0;
+ this->maxTextLength = 0;
+ this->numberOfTexts = 0;
this->samplerate = samplerate;
this->n = 0;
unsigned r = 0;
if (std::fread(&r, sizeof(unsigned), 1, file) != 1)
throw std::runtime_error("CSA::Load(): file read error (r).");
+
+ if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("CSA::Load(): file read error (maxTextLength).");
+ // Build up arrays for text length and starting positions
+ textLength = new BlockArray(r, Tools::CeilLog2(maxTextLength));
+ textStartPos = new BlockArray(r, Tools::CeilLog2(this->n));
+
while (r > 0)
{
TextPosition length = 0, start = 0;
if (std::fread(&start, sizeof(TextPosition), 1, file) != 1)
throw std::runtime_error("CSA::Load(): file read error (text start).");
- textLength.push_back(make_pair(length, start));
+ (*textLength)[numberOfTexts] = length;
+ (*textStartPos)[numberOfTexts] = start;
+ ++numberOfTexts;
--r;
}
// Construct data structures
makewavelet(bwt);
delete [] bwt;
- maketables();
+ maketables(); // FIXME: this will redo text length tables
}
}
else
{
- j = positions[k/samplerate+1];
+ j = (*positions)[k/samplerate+1];
//cout << samplerate << ' ' << j << '\n';
}
int c = alphabetrank->charAtPos(j);
if (c == '\0')
{
- // map::operator[] is not const, using find() instead:
- pair<DocId, TextPosition> endm = (endmarkers.find(j))->second;
- j = endm.first; // LF-mapping for end-marker
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = alphabetrank->rank(0, j) - 1;
+ j = (*endmarkerDocId)[endmarkerRank]; // LF-mapping for end-marker
}
else
j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
}
else
{
- j = positions[i/samplerate+1];
+ j = (*positions)[i/samplerate+1];
//cout << samplerate << ' ' << j << '\n';
}
int c = alphabetrank->charAtPos(i);
if (c == '\0')
{
- // map::operator[] is not const, using find() instead:
- pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
- return endm.second + dist; // returns text position.
+ // End-markers are sampled
+ unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
+ DocId docId = (*endmarkerDocId)[endmarkerRank];
+ return (*textStartPos)[docId] + dist;
}
i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
++dist;
}
- return suffixes[sampled->rank(i)-1]+dist;
+ return (*suffixes)[sampled->rank(i)-1]+dist;
}
CSA::~CSA() {
delete dynFMI;
delete alphabetrank;
delete sampled;
- delete [] suffixes;
- delete [] positions;
+ delete suffixes;
+ delete positions;
delete [] codetable;
+
+ delete endmarkerDocId;
+ delete textLength;
+ delete textStartPos;
}
void CSA::makewavelet(uchar *bwt)
if (C[i]>0) {max = i; break;}
// Print frequencies
-/* for(i = 0; i < 256; i++)
+ /*for(i = 0; i < 256; i++)
if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]);
fflush(stdout);*/
ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
ulong *sampledpositions = new ulong[n/W+1];
- suffixes = new ulong[sampleLength];
- positions = new ulong[sampleLength];
-
+ unsigned ceilLog2n = Tools::CeilLog2(n);
+ suffixes = new BlockArray(sampleLength, ceilLog2n);
+ positions = new BlockArray(sampleLength, ceilLog2n);
+
+ // Mapping from end-markers to doc ID's:
+ endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
+
ulong i,j=0;
for (i=0;i<n/W+1;i++)
sampledpositions[i]=0lu;
ulong x,p=bwtEndPos;
- DocId textId = textLength.size();
+ // Keeping track of text position of end-markers seen
+ ulong posOfSuccEndmarker = n;
+ DocId textId = numberOfTexts;
ulong ulongmax = 0;
ulongmax--;
- //positions:
+ //positions:
for (i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
// i substitutes SA->GetPos(i)
x=(i==n-1)?0:i+1;
if (x % samplerate == 0) {
Tools::SetField(sampledpositions,1,p,1);
- positions[x/samplerate] = p;
+ (*positions)[x/samplerate] = p;
}
uchar c = alphabetrank->charAtPos(p);
if (c == '\0')
{
- // Record the order of end-markers in BWT:
--textId;
- endmarkers[p] = make_pair(textId, (TextPosition)x);
+
+ // Record the order of end-markers in BWT:
+ ulong endmarkerRank = alphabetrank->rank(0, p) - 1;
+ (*endmarkerDocId)[endmarkerRank] = textId;
+
+ // Store text length and text start position:
+ if (textId < (DocId)numberOfTexts - 1)
+ {
+ (*textLength)[textId + 1] = posOfSuccEndmarker - x;
+ (*textStartPos)[textId + 1] = x; // x is the position of end-marker.
+ posOfSuccEndmarker = x;
+ }
+
// LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
p = textId; // Correct LF-mapping to the last char of the previous text.
}
p = C[c]+alphabetrank->rank(c, p)-1;
}
assert(textId == 0);
- /*for (map<ulong, pair<DocId, ulong> >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it)
+
+ /*i = 0;
+ for (map<ulong, pair<DocId, ulong> >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it, ++i)
{
- printf("endm[%u] = %lu (text pos: %lu)\n", (it->second).first, it->first, (it->second).second);
+ int docc = (*endmarkerDocId)[i];
+ ulong poss = (*endmarkerPos)[i];
+ printf("endm[%u] = %lu (text pos: %lu) (recorded: %d, %lu)\n", (it->second).first, it->first, (it->second).second, docc, poss);
}*/
-
+/*
+ for (i = 0; i < numberOfTexts; ++ i)
+ {
+ //std::cout << "textlength = " << dynTextLength[i].first << " vs " << (*textLength)[i] << ", textStartPos = " << dynTextLength[i].second << " vs " << (*textStartPos)[i] << std::endl;
+ assert(dynTextLength[i].first == (*textLength)[i]);
+ assert(dynTextLength[i].second == (*textStartPos)[i]);
+ }
+*/
sampled = new BitRank(sampledpositions,n,true);
//suffixes:
for(i=0; i<sampleLength; i++) {
- j = sampled->rank(positions[i]);
+ j = sampled->rank((*positions)[i]);
if (j==0) j=sampleLength;
- suffixes[ j-1] = (i*samplerate==n)?0:i*samplerate;
+ (*suffixes)[j-1] = (i*samplerate==n)?0:i*samplerate;
}
}
-uchar * CSA::LoadFromFile(const char *filename)
-{
- uchar *s;
- std::ifstream file (filename, std::ios::in|std::ios::binary);
- if (file.is_open())
- {
- std::cerr << "Loading CSA from file: " << filename << std::endl;
- file.read((char *)&bwtEndPos, sizeof(TextPosition));
- s = new uchar[n];
- for (ulong offset = 0; offset < n; offset ++)
- file.read((char *)(s + offset), sizeof(char));
- file.close();
- }
- else
- {
- std::cerr << "Unable to open file " << filename << std::endl;
- exit(1);
- }
- return s;
-}
-
-void CSA::SaveToFile(const char *filename, uchar *bwt)
-{
- std::ofstream file (filename, std::ios::out|std::ios::binary|std::ios::trunc);
- if (file.is_open())
- {
- std::cerr << "Writing CSA to file: " << filename << std::endl;
- file.write((char *)&bwtEndPos, sizeof(TextPosition));
- std::cerr << "Writing BWT of " << n << " bytes." << std::endl;
- for (ulong offset = 0; offset < n; offset ++)
- file.write((char *)(bwt + offset), sizeof(char));
- file.close();
- }
- else
- {
- std::cerr << "Unable to open file " << filename << std::endl;
- exit(1);
- }
-}
-
-/*uchar * CSA::BWT(uchar *text)
-{
- uchar *s;
-
- DynFMI *wt = new DynFMI((uchar *) text, n);
- s = wt->getBWT();
- for (ulong i=0;i<n;i++)
- if (s[i]==0u) {
- bwtEndPos = i; // TODO: better solution ?
- i=n;
- }
-
- delete wt;
- return s;
- }*/
-
CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
{
TCodeEntry *result = new TCodeEntry[ 256 ];