+/******************************************************************************
+ * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
+ * *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU Lesser General Public License as published *
+ * by the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU Lesser General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU Lesser General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ *****************************************************************************/
+#include "CSA.h"
+#include "TextCollection.h"
+#include <iostream>
+#include <queue>
+#include <iomanip>
+#include <set>
+#include <vector>
+#include <utility>
+#include <cassert>
+#include <cstring> // For strlen()
+using std::vector;
+using std::pair;
+using std::make_pair;
+using std::map;
+
+using SXSI::TextCollection;
+
+
+////////////////////////////////////////////////////////////////////////////
+// Class CSA::THuffAlphabetRank
+
+CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
+ left = NULL;
+ right = NULL;
+ bitrank = NULL;
+ ch = s[0];
+ leaf = false;
+ this->codetable = codetable;
+
+ bool *B = new bool[n];
+ TextPosition sum=0,i;
+ /*
+ for (i=0; i< n; i++) {
+ printf("%c:", (char)((int)s[i]-128));
+ for (r=0;r<codetable[(int)s[i]].bits;r++)
+ if (codetable[(int)s[i]].code & (1u <<r))
+ printf("1");
+ else printf("0");
+ printf("\n");
+ }
+ printf("\n");
+ if (level > 100) return;*/
+ for (i=0;i<n;i++)
+ if (codetable[(int)s[i]].code & (1u << level)) {
+ B[i] = true;
+ sum++;
+ }
+ else B[i] = false;
+ if (sum==0 || sum==n) {
+ delete [] B;
+ leaf = true;
+ return;
+ }
+ uchar *sfirst, *ssecond;
+ //if (n-sum > 0)
+ sfirst = new uchar[n-sum];
+ //if (sum > 0)
+ ssecond = new uchar[sum];
+ unsigned j=0,k=0;
+ for (i=0;i<n;i++)
+ if (B[i]) ssecond[k++] = s[i];
+ else sfirst[j++] = s[i];
+ ulong *Binbits = new ulong[n/W+1];
+ for (i=0;i<n;i++)
+ Tools::SetField(Binbits,1,i,B[i]);
+ delete [] B;
+ bitrank = new BitRank(Binbits,n,true);
+ //if (j > 0) {
+ left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
+ delete [] sfirst;
+ //}
+ //if (k>0) {
+ right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
+ delete [] ssecond;
+ //}
+}
+
+
+bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
+ // testing that the code works correctly
+ int C[256];
+ unsigned i,j;
+ bool correct=true;
+ for (j=0;j<256;j++)
+ C[j] = 0;
+ for (i=0;i<n;i++) {
+ C[(int)s[i]]++;
+ if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
+ correct = false;
+ printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
+ }
+ }
+ return correct;
+}
+
+CSA::THuffAlphabetRank::~THuffAlphabetRank() {
+ if (left!=NULL) delete left;
+ if (right!=NULL) delete right;
+ if (bitrank!=NULL)
+ delete bitrank;
+}
+
+
+////////////////////////////////////////////////////////////////////////////
+// Class CSA
+
+/**
+ * Constructor inits an empty dynamic FM-index.
+ * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
+ */
+CSA::CSA(unsigned samplerate)
+ : n(0), alphabetrank(0), sampled(0), suffixes(0), positions(0),
+ codetable(0), dynFMI(0)
+{
+ this->samplerate = samplerate;
+
+ // FIXME TODO : DynFMI needs distribution of characters before hand
+ // This will create fully balanced wavelet tree for all chars [0, 255].
+ uchar temp[256];
+ for (unsigned i = 0; i < 255; ++i)
+ temp[i] = i+1;
+ temp[255] = 0;
+ dynFMI = new DynFMI(temp, 1, 255, false);
+}
+
+/**
+ * Insert new text
+ *
+ * Given text must include end-marker.
+ * Text identifiers are numbered starting from 0.
+ */
+void CSA::InsertText(uchar const * text)
+{
+ // Sanity check:
+ assert(dynFMI != 0);
+
+ TextPosition m = std::strlen((char *)text) + 1;
+ // Store text length and starting position
+ textLength.push_back(make_pair(m, n));
+ this->n += m;
+ dynFMI->addText(text, m);
+}
+
+void CSA::MakeStatic()
+{
+ // Sanity check:
+ if (dynFMI == 0)
+ {
+ std::cerr << "Data structure is already static (dynFMI == 0)." << std::endl;
+ std::exit(0);
+ }
+
+ uchar *bwt = dynFMI->getBWT();
+ /*printf("1234567890123456789\n");
+ for (TextPosition i = 0; i < n; i ++)
+ if (bwt[i] < 50)
+ printf("%d", (int)bwt[i]);
+ else
+ printf("%c", bwt[i]);
+ printf("\n");
+ */
+
+/* for (TextPosition i = 1; i <= n; i ++)
+ {
+ printf("LF[%lu, %c] = %lu\n", i, (*dynFMI)[i], dynFMI->LFmapping(i));
+ }*/
+
+ // Sanity check
+ assert(textLength.size() == dynFMI->getCollectionSize());
+
+ delete dynFMI;
+ dynFMI = 0;
+
+ ulong i, min = 0,
+ max;
+ for (i=0;i<256;i++)
+ C[i]=0;
+ for (i=0;i<n;++i)
+ C[(int)bwt[i]]++;
+ for (i=0;i<256;i++)
+ if (C[i]>0) {min = i; break;}
+ for (i=255;i>=min;--i)
+ if (C[i]>0) {max = i; break;}
+
+ // Print frequencies
+/* for(i = 0; i < 256; i++)
+ if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]);
+ fflush(stdout);*/
+
+ ulong prev=C[0], temp;
+ C[0]=0;
+ for (i=1;i<256;i++) {
+ temp = C[i];
+ C[i]=C[i-1]+prev;
+ prev = temp;
+ }
+ this->codetable = node::makecodetable(bwt,n);
+ alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
+ //if (alphabetrank->Test(bwt,n)) printf("alphabetrank ok\n");
+
+/* for (i = 0; i < n; ++i)
+ {
+ uchar c = alphabetrank->charAtPos(i);
+ TextPosition j = C[c]+alphabetrank->rank(c, i)-1;
+ printf("LF[%lu] = %lu\n", i, j);
+ }*/
+
+ // Calculate BWT end-marker position (of last inserted text)
+ i = 0;
+ while (bwt[i] != '\0')
+ {
+ uchar c = alphabetrank->charAtPos(i);
+ i = C[c]+alphabetrank->rank(c, i)-1;
+ }
+ bwtEndPos = i;
+ //printf("bwtEndPos = %lu\n", bwtEndPos);
+
+ delete [] bwt;
+
+ // Make sampling tables
+ maketables();
+ // to avoid running out of unsigned, the sizes are computed in specific order (large/small)*small
+ // |class CSA| +256*|TCodeEntry|+|C[]|+|suffixes[]+positions[]|+...
+ //printf("FMindex takes %d B\n",
+ // 6*W/8+256*3*W/8+256*W/8+ (2*n/(samplerate*8))*W+sampled->SpaceRequirementInBits()/8+alphabetrank->SpaceRequirementInBits()/8+W/8);
+}
+
+
+uchar* CSA::GetText(DocId k) const
+{
+ assert(k < (DocId)textLength.size());
+
+ uchar* result = new uchar[textLength[k].first];
+ TextPosition i = k;
+ TextPosition j = textLength[k].first-1;
+ result[j] = '\0';
+
+ uchar c = alphabetrank->charAtPos(i);
+ while (c != '\0')
+ {
+ --j;
+ result[j] = c;
+ i = C[c]+alphabetrank->rank(c,i)-1;
+
+ c = alphabetrank->charAtPos(i); // "next" char.
+ }
+ assert(j == 0);
+ return result;
+}
+
+uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
+{
+ assert(k < (DocId)textLength.size());
+ assert(j < textLength[k].first);
+ assert(i <= j);
+
+ // Start position of k'th text
+ ulong start = textLength[k].second;
+
+ return Substring(i + start, j-i+1);
+}
+
+/**
+ * Existential queries
+ */
+bool CSA::IsPrefix(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return textLength.size();
+
+ TextPosition sp = 0, ep = 0;
+ Search(pattern, m, &sp, &ep);
+
+ // Check for end-marker(s) in result interval
+ map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
+ it = endmarkers.lower_bound(sp);
+ if (it != endmarkers.end() && it->first <= ep)
+ return true;
+ return false;
+}
+
+bool CSA::IsSuffix(uchar const *pattern) const
+{
+ // Here counting is as fast as isSuffix():
+ if (CountSuffix(pattern) > 0)
+ return true;
+ return false;
+}
+
+bool CSA::IsEqual(uchar const *pattern) const
+{
+ TextPosition m = std::strlen((char *)pattern);
+ if (m == 0)
+ return textLength.size();
+
+ TextPosition sp = 0, ep = 0;
+ // Match including end-marker
+ Search(pattern, m+1, &sp, &ep);
+
+ // Check for end-marker(s) in result interval
+ map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
+ it = endmarkers.lower_bound(sp);
+ if (it != endmarkers.end() && it->first <= ep)
+ return true;
+ return false;
+}
+
+bool CSA::IsContains(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return true;
+
+ TextPosition sp = 0, ep = 0;
+ // Just check if pattern exists somewhere
+ ulong count = Search(pattern, m, &sp, &ep);
+
+ if (count > 0)
+ return true;
+ return false;
+}
+
+bool CSA::IsLessThan(uchar const*) const
+{
+ // TODO
+ return false;
+}
+
+/**
+ * Counting queries
+ */
+unsigned CSA::CountPrefix(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return textLength.size();
+
+ TextPosition sp = 0, ep = 0;
+ Search(pattern, m, &sp, &ep);
+
+ // Count end-markers in result interval
+ map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
+ it = endmarkers.lower_bound(sp);
+ unsigned count = 0;
+ while (it != endmarkers.end() && it->first <= ep)
+ {
+ ++ count;
+ ++ it;
+ }
+
+ return count;
+}
+
+unsigned CSA::CountSuffix(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return textLength.size();
+
+ TextPosition sp = 0, ep = 0;
+ // Search with end-marker
+ unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
+
+ return count;
+}
+
+unsigned CSA::CountEqual(uchar const *pattern) const
+{
+ TextPosition m = strlen((char const *)pattern);
+ if (m == 0)
+ return textLength.size();
+
+ TextPosition sp = 0, ep = 0;
+ // Match including end-marker
+ Search(pattern, m+1, &sp, &ep);
+
+ // Count end-markers in result interval
+ map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
+ it = endmarkers.lower_bound(sp);
+ unsigned count = 0;
+ while (it != endmarkers.end() && it->first <= ep)
+ {
+ ++ count;
+ ++ it;
+ }
+
+ return count;
+}
+
+unsigned CSA::CountContains(uchar const * pattern) const
+{
+ // Here counting is as slow as fetching the result set
+ // because we would have to filter out occ's that fall within same document.
+ TextCollection::document_result result = Contains(pattern);
+ return result.size();
+}
+
+unsigned CSA::CountLessThan(uchar const * pattern) const
+{
+ // TODO
+ return 0;
+}
+
+/**
+ * Document reporting queries
+ */
+TextCollection::document_result CSA::Prefix(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::document_result();
+
+ TextPosition sp = 0, ep = 0;
+ Search(pattern, m, &sp, &ep);
+
+ TextCollection::document_result result;
+
+ // Report end-markers in result interval
+ map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
+ it = endmarkers.lower_bound(sp);
+ unsigned count = 0;
+ while (it != endmarkers.end() && it->first <= ep)
+ {
+ // End-marker we found belongs to the "previous" doc in the collection
+ DocId docId = ((it->second).first + 1) % textLength.size();
+ result.push_back(docId);
+ ++ count;
+ ++ it;
+ }
+
+ return result;
+}
+
+TextCollection::document_result CSA::Suffix(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::document_result();
+
+ TextPosition sp = 0, ep = 0;
+ // Search with end-marker
+ Search(pattern, m+1, &sp, &ep);
+
+ TextCollection::document_result result;
+
+ // Check each occurrence
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ uchar c = alphabetrank->charAtPos(i);
+ while (c != '\0' && !sampled->IsBitSet(i))
+ {
+ i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
+ c = alphabetrank->charAtPos(i);
+ }
+ if (c == '\0')
+ {
+ // map::operator[] is not const, using find() instead:
+ pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
+ // End-marker that we found belongs to the "previous" doc in collection:
+ DocId docId = (endm.first + 1) % textLength.size();
+ result.push_back(docId);
+ }
+ else
+ {
+ TextPosition textPos = suffixes[sampled->rank(i)-1];
+ result.push_back(DocIdAtTextPos(textPos));
+ }
+ }
+
+ return result;
+}
+
+TextCollection::document_result CSA::Equal(uchar const *pattern) const
+{
+ TextPosition m = strlen((char const *)pattern);
+ if (m == 0)
+ return TextCollection::document_result();
+
+ TextPosition sp = 0, ep = 0;
+ // Match including end-marker
+ Search(pattern, m+1, &sp, &ep);
+
+ TextCollection::document_result result;
+
+ // Report end-markers in result interval
+ map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
+ it = endmarkers.lower_bound(sp);
+ unsigned count = 0;
+ while (it != endmarkers.end() && it->first <= ep)
+ {
+ // End-marker that we found belongs to the "previous" doc in collection:
+ DocId docId = ((it->second).first + 1) % textLength.size();
+ result.push_back(docId);
+ ++ count;
+ ++ it;
+ }
+
+ return result;
+}
+
+
+TextCollection::document_result CSA::Contains(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::document_result();
+
+ TextPosition sp = 0, ep = 0;
+ // Search all occurrences
+ Search(pattern, m, &sp, &ep);
+
+ // We want unique document indentifiers, using std::set to collect them
+ std::set<DocId> resultSet;
+
+ // Check each occurrence
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ uchar c = alphabetrank->charAtPos(i);
+ while (c != '\0' && !sampled->IsBitSet(i))
+ {
+ i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
+ c = alphabetrank->charAtPos(i);
+ }
+ if (c == '\0')
+ {
+ // map::operator[] is not const, using find() instead:
+ pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
+ // End-marker we found belongs to the "previous" doc in collection
+ DocId docId = (endm.first + 1) % textLength.size();
+ resultSet.insert(docId);
+ }
+ else
+ {
+ TextPosition textPos = suffixes[sampled->rank(i)-1];
+ resultSet.insert(DocIdAtTextPos(textPos));
+ }
+ }
+
+ // Convert std::set to std::vector (TODO better way to construct result vector?)
+ TextCollection::document_result result;
+ for (std::set<DocId>::iterator it = resultSet.begin(); it != resultSet.end(); ++it)
+ result.push_back(*it);
+ return result;
+}
+
+TextCollection::document_result CSA::LessThan(uchar const * pattern) const
+{
+ // TODO
+ return document_result();
+}
+
+/**
+ * Full result set queries
+ */
+TextCollection::full_result CSA::FullContains(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return full_result();
+
+ TextPosition sp = 0, ep = 0;
+ // Search all occurrences
+ Search(pattern, m, &sp, &ep);
+
+ full_result result;
+
+ // Report each occurrence
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ TextPosition dist = 0;
+ uchar c = alphabetrank->charAtPos(i);
+ while (c != '\0' && !sampled->IsBitSet(i))
+ {
+ i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
+ c = alphabetrank->charAtPos(i);
+ ++ dist;
+ }
+ if (c == '\0')
+ {
+ // map::operator[] is not const, using find() instead:
+ pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
+ // End-marker we found belongs to the "previous" doc in the collection:
+ DocId docId = (endm.first+1)%textLength.size();
+ result.push_back(make_pair(docId, dist));
+ }
+ else
+ {
+ TextPosition textPos = suffixes[sampled->rank(i)-1] + dist;
+ // Read document identifier and its starting position in text order
+ DocId docId = DocIdAtTextPos(textPos);
+ result.push_back(make_pair(docId, textPos - textLength[docId].second));
+ }
+ }
+
+ return result;
+}
+
+
+/**
+ * Finds document identifier for given text position
+ *
+ * Starting text position of the document is stored into second parameter.
+ * Binary searching on text starting positions.
+ */
+TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const
+{
+ assert(i < n);
+
+ DocId a = 0;
+ DocId b = textLength.size() - 1;
+ while (a < b)
+ {
+ DocId c = a + (b - a)/2;
+ if (textLength[c].second > i)
+ b = c - 1;
+ else if (textLength[c+1].second > i)
+ return c;
+ else
+ a = c + 1;
+ }
+
+ assert(a < (DocId)textLength.size());
+ assert(i > textLength[a].second);
+ assert(i < (a == (DocId)textLength.size() - 1 ? n : textLength[a+1].second));
+ return a;
+}
+
+void CSA::Load(FILE *filename, unsigned samplerate)
+{
+ // TODO
+
+}
+
+void CSA::Save(FILE *filename)
+{
+ // TODO
+}
+
+
+/**
+ * Rest of the functions follow...
+ */
+
+uchar * CSA::Substring(TextPosition i, TextPosition l) const
+{
+ uchar *result = new uchar[l + 1];
+ if (l == 0)
+ {
+ result[0] = 0u;
+ return result;
+ }
+
+ TextPosition dist;
+ TextPosition k = i + l - 1;
+ // Check for end of the string
+ if (k > n - 1)
+ {
+ l -= k - n + 1;
+ k = n - 1;
+ }
+
+ TextPosition skip = samplerate - k % samplerate - 1;
+ TextPosition j;
+ if (k / samplerate + 1 >= n / samplerate)
+ {
+ j = bwtEndPos;
+ skip = n - k - 1;
+ }
+ else
+ {
+ j = positions[k/samplerate+1];
+ //cout << samplerate << ' ' << j << '\n';
+ }
+
+ for (dist = 0; dist < skip + l; dist++)
+ {
+ int c = alphabetrank->charAtPos(j);
+ if (c == '\0')
+ {
+ // map::operator[] is not const, using find() instead:
+ pair<DocId, TextPosition> endm = (endmarkers.find(j))->second;
+ j = endm.first; // LF-mapping for end-marker
+ }
+ else
+ j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
+ if (dist >= skip)
+ result[l + skip - dist - 1] = c;
+ }
+ result[l] = 0u;
+ return result;
+}
+
+ /*TextPosition CSA::inverse(TextPosition i)
+{
+ TextPosition skip = samplerate - i % samplerate;
+ TextPosition j;
+ if (i / samplerate + 1 >= n / samplerate)
+ {
+ j = bwtEndPos;
+ skip = n - i;
+ }
+ else
+ {
+ j = positions[i/samplerate+1];
+ //cout << samplerate << ' ' << j << '\n';
+ }
+
+ while (skip > 0)
+ {
+ int c = alphabetrank->charAtPos(j);
+ j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
+ skip --;
+ }
+ return j;
+ }*/
+
+ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const {
+ // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
+ int c = (int)pattern[m-1];
+ TextPosition i=m-1;
+ TextPosition sp = C[c];
+ TextPosition ep = C[c+1]-1;
+ while (sp<=ep && i>=1)
+ {
+// printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
+ c = (int)pattern[--i];
+ sp = C[c]+alphabetrank->rank(c,sp-1);
+ ep = C[c]+alphabetrank->rank(c,ep)-1;
+ }
+ *spResult = sp;
+ *epResult = ep;
+ if (sp<=ep)
+ return ep - sp + 1;
+ else
+ return 0;
+ }
+
+ /*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep)
+{
+ sp = C[(int)c]+alphabetrank->rank(c,sp-1);
+ ep = C[(int)c]+alphabetrank->rank(c,ep)-1;
+
+ if (sp<=ep)
+ return ep - sp + 1;
+ else
+ return 0;
+ }*/
+
+CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(samplerate log \sigma)
+{
+ TextPosition dist=0;
+ while (!sampled->IsBitSet(i))
+ {
+ int c = alphabetrank->charAtPos(i);
+ if (c == '\0')
+ {
+ // map::operator[] is not const, using find() instead:
+ pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
+ return endm.second + dist; // returns text position.
+ }
+ i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
+ ++dist;
+ }
+
+ return suffixes[sampled->rank(i)-1]+dist;
+}
+
+CSA::~CSA() {
+ delete alphabetrank;
+ delete sampled;
+ delete [] suffixes;
+ delete [] positions;
+ delete [] codetable;
+}
+
+
+void CSA::maketables()
+{
+ ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
+
+ ulong *sampledpositions = new ulong[n/W+1];
+ suffixes = new ulong[sampleLength];
+ positions = new ulong[sampleLength];
+
+ ulong i,j=0;
+ for (i=0;i<n/W+1;i++)
+ sampledpositions[i]=0lu;
+
+ ulong x,p=bwtEndPos;
+ DocId textId = textLength.size();
+ ulong ulongmax = 0;
+ ulongmax--;
+
+ //positions:
+ for (i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
+ // i substitutes SA->GetPos(i)
+ x=(i==n-1)?0:i+1;
+
+ if (x % samplerate == 0) {
+ Tools::SetField(sampledpositions,1,p,1);
+ positions[x/samplerate] = p;
+ }
+
+ uchar c = alphabetrank->charAtPos(p);
+ if (c == '\0')
+ {
+ // Record the order of end-markers in BWT:
+ --textId;
+ endmarkers[p] = make_pair(textId, (TextPosition)x);
+ // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
+ p = textId; // Correct LF-mapping to the last char of the previous text.
+ }
+ else
+ p = C[c]+alphabetrank->rank(c, p)-1;
+ }
+ assert(textId == 0);
+ /*for (map<ulong, pair<DocId, ulong> >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it)
+ {
+ printf("endm[%u] = %lu (text pos: %lu)\n", (it->second).first, it->first, (it->second).second);
+ }*/
+
+ sampled = new BitRank(sampledpositions,n,true);
+
+ //suffixes:
+ for(i=0; i<sampleLength; i++) {
+ j = sampled->rank(positions[i]);
+ if (j==0) j=sampleLength;
+ suffixes[ j-1] = (i*samplerate==n)?0:i*samplerate;
+ }
+}
+
+uchar * CSA::LoadFromFile(const char *filename)
+{
+ uchar *s;
+ std::ifstream file (filename, std::ios::in|std::ios::binary);
+ if (file.is_open())
+ {
+ std::cerr << "Loading CSA from file: " << filename << std::endl;
+ file.read((char *)&bwtEndPos, sizeof(TextPosition));
+ s = new uchar[n];
+ for (ulong offset = 0; offset < n; offset ++)
+ file.read((char *)(s + offset), sizeof(char));
+ file.close();
+ }
+ else
+ {
+ std::cerr << "Unable to open file " << filename << std::endl;
+ exit(1);
+ }
+ return s;
+}
+
+void CSA::SaveToFile(const char *filename, uchar *bwt)
+{
+ std::ofstream file (filename, std::ios::out|std::ios::binary|std::ios::trunc);
+ if (file.is_open())
+ {
+ std::cerr << "Writing CSA to file: " << filename << std::endl;
+ file.write((char *)&bwtEndPos, sizeof(TextPosition));
+ std::cerr << "Writing BWT of " << n << " bytes." << std::endl;
+ for (ulong offset = 0; offset < n; offset ++)
+ file.write((char *)(bwt + offset), sizeof(char));
+ file.close();
+ }
+ else
+ {
+ std::cerr << "Unable to open file " << filename << std::endl;
+ exit(1);
+ }
+}
+
+/*uchar * CSA::BWT(uchar *text)
+{
+ uchar *s;
+
+ DynFMI *wt = new DynFMI((uchar *) text, n);
+ s = wt->getBWT();
+ for (ulong i=0;i<n;i++)
+ if (s[i]==0u) {
+ bwtEndPos = i; // TODO: better solution ?
+ i=n;
+ }
+
+ delete wt;
+ return s;
+ }*/
+
+CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
+{
+ TCodeEntry *result = new TCodeEntry[ 256 ];
+
+ count_chars( text, n, result );
+ std::priority_queue< node, std::vector< node >, std::greater<node> > q;
+//
+// First I push all the leaf nodes into the queue
+//
+ for ( unsigned int i = 0 ; i < 256 ; i++ )
+ if ( result[ i ].count )
+ q.push(node( i, result[ i ].count ) );
+//
+// This loop removes the two smallest nodes from the
+// queue. It creates a new internal node that has
+// those two nodes as children. The new internal node
+// is then inserted into the priority queue. When there
+// is only one node in the priority queue, the tree
+// is complete.
+//
+
+ while ( q.size() > 1 ) {
+ node *child0 = new node( q.top() );
+ q.pop();
+ node *child1 = new node( q.top() );
+ q.pop();
+ q.push( node( child0, child1 ) );
+ }
+//
+// Now I compute and return the codetable
+//
+ q.top().maketable(0u,0u, result);
+ q.pop();
+ return result;
+}
+
+
+void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
+{
+ if ( child0 )
+ {
+ child0->maketable( SetBit(code,bits,0), bits+1, codetable );
+ child1->maketable( SetBit(code,bits,1), bits+1, codetable );
+ delete child0;
+ delete child1;
+ }
+ else
+ {
+ codetable[value].code = code;
+ codetable[value].bits = bits;
+ }
+}
+
+void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
+{
+ ulong i;
+ for (i = 0 ; i < 256 ; i++ )
+ counts[ i ].count = 0;
+ for (i=0; i<n; i++)
+ counts[(int)text[i]].count++;
+}
+
+unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
+ return x | (bit << pos);
+}
+