+++ /dev/null
-/******************************************************************************
- * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
- * *
- * *
- * This program is free software; you can redistribute it and/or modify *
- * it under the terms of the GNU Lesser General Public License as published *
- * by the Free Software Foundation; either version 2 of the License, or *
- * (at your option) any later version. *
- * *
- * This program is distributed in the hope that it will be useful, *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
- * GNU Lesser General Public License for more details. *
- * *
- * You should have received a copy of the GNU Lesser General Public License *
- * along with this program; if not, write to the *
- * Free Software Foundation, Inc., *
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
- *****************************************************************************/
-#include "CSA.h"
-#include "TextCollection.h"
-
-// Conflicts with std::min and std::max
-#undef min
-#undef max
-#include "dcover/bwt.hpp"
-
-//#include "HeapProfiler.h" // FIXME remove
-
-#include <iostream>
-#include <queue>
-#include <set>
-#include <vector>
-#include <utility>
-#include <stdexcept>
-#include <cassert>
-#include <cstring> // For strlen()
-using std::vector;
-using std::pair;
-using std::make_pair;
-using std::map;
-
-namespace SXSI
-{
-
-// Save file version info
-const uchar CSA::versionFlag = 2;
-
-////////////////////////////////////////////////////////////////////////////
-// Class CSA::THuffAlphabetRank
-// FIXME Unused code
-CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
- left = NULL;
- right = NULL;
- bitrank = NULL;
- ch = s[0];
- leaf = false;
- this->codetable = codetable;
-
- bool *B = new bool[n];
- TextPosition sum=0,i;
- /*
- for (i=0; i< n; i++) {
- printf("%c:", (char)((int)s[i]-128));
- for (r=0;r<codetable[(int)s[i]].bits;r++)
- if (codetable[(int)s[i]].code & (1u <<r))
- printf("1");
- else printf("0");
- printf("\n");
- }
- printf("\n");
- if (level > 100) return;*/
- for (i=0;i<n;i++)
- if (codetable[(int)s[i]].code & (1u << level)) {
- B[i] = true;
- sum++;
- }
- else B[i] = false;
- if (sum==0 || sum==n) {
- delete [] B;
- leaf = true;
- return;
- }
- uchar *sfirst, *ssecond;
- //if (n-sum > 0)
- sfirst = new uchar[n-sum];
- //if (sum > 0)
- ssecond = new uchar[sum];
- unsigned j=0,k=0;
- for (i=0;i<n;i++)
- if (B[i]) ssecond[k++] = s[i];
- else sfirst[j++] = s[i];
- ulong *Binbits = new ulong[n/W+1];
- for (i=0;i<n;i++)
- Tools::SetField(Binbits,1,i,B[i]);
- delete [] B;
- bitrank = new BitRank(Binbits,n,true);
- //if (j > 0) {
- left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
- delete [] sfirst;
- //}
- //if (k>0) {
- right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
- delete [] ssecond;
- //}
-}
-
-
-bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
- // testing that the code works correctly
- int C[256];
- unsigned i,j;
- bool correct=true;
- for (j=0;j<256;j++)
- C[j] = 0;
- for (i=0;i<n;i++) {
- C[(int)s[i]]++;
- if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
- correct = false;
- printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
- }
- }
- return correct;
-}
-
-CSA::THuffAlphabetRank::~THuffAlphabetRank() {
- if (left!=NULL) delete left;
- if (right!=NULL) delete right;
- if (bitrank!=NULL)
- delete bitrank;
-}
-
-
-////////////////////////////////////////////////////////////////////////////
-// Class CSA
-
-/**
- * Constructor inits an empty dynamic FM-index.
- * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
- */
-CSA::CSA(unsigned samplerate)
- : n(0), samplerate(0), alphabetrank(0), codetable(0), sampled(0), suffixes(0),
- suffixDocId(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0),
- emptyTextRank(0)
-{
- this->samplerate = samplerate;
-
-#ifdef CSA_TEST_BWT
- // FIXME TODO : DynFMI needs distribution of characters before hand
- // This will create fully balanced wavelet tree for all chars [0, 255].
- uchar temp[256];
- for (unsigned i = 0; i < 255; ++i)
- temp[i] = i+1;
- temp[255] = 0;
- dynFMI = new DynFMI(temp, 1, 255, false);
-
- /* Debug code: take char distribution from data.txt.
- uchar *temp = Tools::GetFileContents("data.txt", 0);
- dynFMI = new DynFMI(temp,1790000, strlen((char *)temp), false);
- delete [] temp;*/
-#endif
-}
-
-/**
- * Insert new text
- *
- * Given text must include end-marker.
- * Text identifiers are numbered starting from 0.
- */
-void CSA::InsertText(uchar const * text)
-{
- // Sanity check:
-#ifdef CSA_TEST_BWT
- assert(dynFMI != 0);
-#endif
-
- TextPosition m = std::strlen((char *)text) + 1;
- if (m > maxTextLength)
- maxTextLength = m; // Store length of the longest text seen so far.
-
- if (m > 1)
- {
- this->n += m;
- this->numberOfTexts ++;
- this->numberOfAllTexts ++;
-#ifdef CSA_TEST_BWT
- dynFMI->addText(text, m);
-#endif
-
- texts.append((char *) text);
- texts.append(1, '\0');
- }
- else
- {
- emptyTextId.insert(numberOfAllTexts); // FIXME Using too much space here
- this->numberOfAllTexts ++;
- }
-}
-
-void CSA::MakeStatic()
-{
- // Sanity check:
-#ifdef CSA_TEST_BWT
- if (dynFMI == 0)
- throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0).");
-#endif
-
- if (texts.size() == 0) // Empty collection
- {
- ++n;
- ++numberOfTexts;
- texts.append(1, '\0');
- }
-
- // Bitvector of empty texts, FIXME Remove?
- {
- //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <<std::endl;
- ulong *tempB = new ulong[numberOfAllTexts/W + 1];
- for (ulong i = 0; i < numberOfAllTexts/W + 1; ++i)
- tempB[i] = 0;
- for (std::set<unsigned>::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it)
- Tools::SetField(tempB, 1, (*it), 1);
- emptyTextId.clear();
- emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true);
- }
-
- texts.reserve(0); // Release extra capacity
-
-/* FILE *fp = fopen("texts.txt", "wb");
- std::cout << "Wrote " << fwrite(texts.c_str(), 1, n, fp) << " bytes into texts.txt." << std::endl;
- fclose(fp);*/
-
- uchar *bwt = new uchar[n];
- {
- // More succinct solution via StringIterator, see below.
-/* unsigned* maptexts = new unsigned[n+1];
- // Map text chars to [0..255+numberOfTexts]
- unsigned count = 0;
- for (ulong i = 0; i < n; ++i)
- if (texts[i] == 0)
- maptexts[i] = ++count; // endmarkers \in [1..numberOfTexts]
- else {
- uchar c = (uchar)texts[i];
- maptexts[i] = (unsigned)c + numberOfTexts;
- }
- maptexts[n] = '\0';
- assert(count == numberOfTexts);
-
- std::cout << "maptext: ";
- for (ulong i = 0; i <= n; ++i)
- std::cout << maptexts[i] << ", ";
- std::cout << std::endl;*/
-
- // Mark endmarker positions into bitvector
- ulong * endmarkers = new ulong[n/W+1];
- for (ulong i = 0; i < n/W+1; ++i)
- endmarkers[i] = 0;
- for (ulong i = 0; i < n; ++i)
- if (texts[i] == 0)
- Tools::SetField(endmarkers, 1, i, 1);
- BitRank* br = new BitRank(endmarkers, n, true);
- assert(numberOfTexts == br->rank(n-1));
-
- // Build iterators, FIXME clean up iterator construction.
- StringIterator itBegin((uchar const *)texts.c_str(), (uchar const *)texts.c_str(), n, numberOfTexts, br);
- StringIterator itEnd((uchar const *)texts.c_str() + n,(uchar const *)texts.c_str(), n, numberOfTexts, br);
-
- bwtEndPos = (ulong)compute_bwt(itBegin, itEnd, //&maptexts[0], &maptexts[n],
- &bwt[0], numberOfTexts);
-
- bwt[--bwtEndPos] = '\0';
- texts.erase();
- texts.reserve(0); // Release capacity
- delete br; // bitvector of endmarkers
- } // End of bw transform
-// std::cerr << "Time after BWT: " << Tools::GetTime() << std::endl;
-
-/* fp = fopen("bwt.txt", "wb");
- std::cout << "Wrote " << fwrite(bwt, 1, n, fp) << " bytes into bwt.txt." << std::endl;
- fclose(fp);*/
-
-
-#ifdef CSA_TEST_BWT
- {
- uchar *bwtTest = dynFMI->getBWT();
- printf("123456789012345678901234567890123456789\n");
- for (TextPosition i = 0; i < n && i < 100; i ++)
- if (bwt[i] < 50)
- printf("%d", (int)bwt[i]);
- else
- printf("%c", bwt[i]);
- printf("\n");
- for (TextPosition i = 0; i < n && i < 100; i ++)
- if (bwtTest[i] < 50)
- printf("%d", (int)bwtTest[i]);
- else
- printf("%c", bwtTest[i]);
- printf("\n");
-
- // Sanity check
- assert(numberOfTexts == dynFMI->getCollectionSize());
-
- delete dynFMI;
- dynFMI = 0;
- for (ulong i = 0; i < n; ++i)
- if (bwt[i] != bwtTest[i])
- {
- std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " << (unsigned)bwtTest[i] << std::endl;
- assert(0);
- }
- delete [] bwtTest;
- }
-#endif // CSA_TEST_BWT
-
- makewavelet(bwt); // Deletes bwt!
- bwt = 0;
-
- // Make sampling tables
- maketables();
-}
-
-bool CSA::EmptyText(DocId k) const
-{
- assert(k < (DocId)numberOfTexts);
- if (emptyTextRank->IsBitSet(k))
- return true;
- return false;
-}
-
-uchar* CSA::GetText(DocId k) const
-{
- assert(k < (DocId)numberOfTexts);
-
- ulong textRank = 0;
- if (emptyTextRank->IsBitSet(k, &textRank))
- {
- uchar* result = new uchar[1];
- result[0] = 0;
- return result;
- }
- // Map to non-empty text
- k -= textRank; //emptyTextRank->rank(k);
-
- TextPosition i = k;
-
- string result;
- // Reserve average string length to avoid reallocs
- result.reserve(n/numberOfTexts);
-
- uchar c = alphabetrank->access(i);
- while (c != '\0')
- {
- result.push_back(c);
- i = C[c]+alphabetrank->rank(c,i)-1;
-
- c = alphabetrank->access(i); // "next" char.
- }
-
- // Convert to uchar (FIXME return string?)
- i = result.size();
- uchar* res = new uchar[i+1];
- res[i] = '\0';
- for (ulong j = 0; j < i; ++j)
- res[i-j-1] = result[j];
- return res;
-}
-/*
- * Not supported
-uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
-{
- assert(k < (DocId)numberOfTexts);
- assert(j < (*textLength)[k]);
- assert(i <= j);
-
- ulong textRank = 0;
- if (emptyTextRank->IsBitSet(k, &textRank))
- {
- uchar* result = new uchar[1];
- result[0] = 0;
- return result;
- }
-
- // Map to non-empty text
- k -= textRank; // emptyTextRank->rank(k);
-
- // Start position of k'th text
- ulong start = (*textStartPos)[k];
-
- return Substring(i + start, j-i+1);
- }*/
-
-/******************************************************************
- * Existential queries
- */
-bool CSA::IsPrefix(uchar const * pattern) const
-{
- TextPosition m = strlen((char *)pattern);
- if (m == 0)
- return true;
-
- TextPosition sp = 0, ep = 0;
- Search(pattern, m, &sp, &ep);
-
- // Check for end-marker(s) in result interval
- if (CountEndmarkers(sp, ep))
- return true;
- return false;
-}
-
-bool CSA::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
-{
- TextPosition m = strlen((char *)pattern);
- if (m == 0)
- return true;
-
- TextPosition sp = 0, ep = 0;
- Search(pattern, m, &sp, &ep);
-
- // Check for end-marker(s) in result interval
- if (CountEndmarkers(sp, ep, begin, end))
- return true;
- return false;
-}
-
-
-bool CSA::IsSuffix(uchar const *pattern) const
-{
- // Here counting is as fast as IsSuffix():
- if (CountSuffix(pattern) > 0)
- return true;
- return false;
-}
-
-bool CSA::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
-{
- // Here counting is as fast as IsSuffix():
- if (CountSuffix(pattern, begin, end) > 0)
- return true;
- return false;
-}
-
-bool CSA::IsEqual(uchar const *pattern) const
-{
- TextPosition m = std::strlen((char *)pattern);
- if (m == 0)
- {
- if (numberOfAllTexts - numberOfTexts > 0u)
- return true; // Empty texts exists
- return false; // No empty texts exists
- }
-
- TextPosition sp = 0, ep = 0;
- // Match including end-marker
- Search(pattern, m+1, &sp, &ep);
-
- // Check for end-marker(s) in result interval
- if (CountEndmarkers(sp, ep))
- return true;
- return false;
-}
-
-bool CSA::IsEqual(uchar const *pattern, DocId begin, DocId end) const
-{
- TextPosition m = std::strlen((char *)pattern);
- if (m == 0)
- {
- if (numberOfAllTexts - numberOfTexts > 0u)
- return true; // Empty texts exists
- return false; // No empty texts exists
- }
-
- TextPosition sp = 0, ep = 0;
- // Match including end-marker
- Search(pattern, m+1, &sp, &ep, begin, end);
-
- // Check for end-marker(s) in result interval
- if (CountEndmarkers(sp, ep))
- return true;
- return false;
-}
-
-bool CSA::IsContains(uchar const * pattern) const
-{
- TextPosition m = strlen((char *)pattern);
- if (m == 0)
- return true;
-
- TextPosition sp = 0, ep = 0;
- // Just check if pattern exists somewhere
- ulong count = Search(pattern, m, &sp, &ep);
-
- if (count > 0)
- return true;
- return false;
-}
-
-bool CSA::IsContains(uchar const * pattern, DocId begin, DocId end) const
-{
- // Here counting is as fast as existential querying
- if (CountContains(pattern, begin, end) > 0)
- return true; // FIXME No need to filter result set
- return false;
-}
-
-bool CSA::IsLessThan(uchar const * pattern) const
-{
- if (CountLessThan(pattern) > 0)
- return true;
- return false;
-}
-
-bool CSA::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
-{
- if (CountLessThan(pattern, begin, end) > 0)
- return true;
- return false;
-}
-
-/******************************************************************
- * Counting queries
- */
-ulong CSA::Count(uchar const * pattern) const
-{
- TextPosition m = strlen((char *)pattern);
- if (m == 0)
- return 0;
-
- TextPosition sp = 0, ep = 0;
- unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
- return count;
-}
-
-unsigned CSA::CountPrefix(uchar const * pattern) const
-{
- TextPosition m = strlen((char *)pattern);
- if (m == 0)
- return numberOfAllTexts;
-
- TextPosition sp = 0, ep = 0;
- Search(pattern, m, &sp, &ep);
-
- // Count end-markers in result interval
- return CountEndmarkers(sp, ep);
-}
-
-unsigned CSA::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
-{
- TextPosition m = strlen((char *)pattern);
- if (m == 0)
- return numberOfAllTexts;
-
- TextPosition sp = 0, ep = 0;
- Search(pattern, m, &sp, &ep);
-
- // Count end-markers in result interval
- return CountEndmarkers(sp, ep, begin, end);
-}
-
-unsigned CSA::CountSuffix(uchar const * pattern) const
-{
- TextPosition m = strlen((char *)pattern);
- if (m == 0)
- return numberOfAllTexts;
-
- TextPosition sp = 0, ep = 0;
- // Search with end-marker
- unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
-
- return count;
-}
-
-unsigned CSA::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
-{
- TextPosition m = strlen((char *)pattern);
- if (m == 0)
- return numberOfAllTexts;
-
- TextPosition sp = 0, ep = 0;
- // Search with end-marker
- unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
-
- return count;
-}
-
-unsigned CSA::CountEqual(uchar const *pattern) const
-{
- TextPosition m = strlen((char const *)pattern);
- if (m == 0)
- return numberOfAllTexts - numberOfTexts; // Empty texts.
-
- TextPosition sp = 0, ep = 0;
- // Match including end-marker
- Search(pattern, m+1, &sp, &ep);
-
- // Count end-markers in result interval
- return CountEndmarkers(sp, ep);
-}
-
-unsigned CSA::CountEqual(uchar const *pattern, DocId begin, DocId end) const
-{
- TextPosition m = strlen((char const *)pattern);
- if (m == 0)
- return numberOfAllTexts - numberOfTexts; // Empty texts.
-
- TextPosition sp = 0, ep = 0;
- // Match including end-marker
- Search(pattern, m+1, &sp, &ep, begin, end);
-
- // Count end-markers in result interval
- return CountEndmarkers(sp, ep); // Already within [begin, end]
-}
-
-unsigned CSA::CountContains(uchar const * pattern) const
-{
- TextPosition m = strlen((char const *)pattern);
- if (m == 0)
- return numberOfAllTexts; // Total number of texts.
-
- // Here counting is as slow as fetching the result set
- // because we have to filter out occ's that fall within same document.
- TextCollection::document_result result = Contains(pattern);
- return result.size();
-}
-
-unsigned CSA::CountContains(uchar const * pattern, DocId begin, DocId end) const
-{
- TextPosition m = strlen((char const *)pattern);
- if (m == 0)
- return numberOfAllTexts; // Total number of texts.
-
- // Here counting is as slow as fetching the result set
- // because we have to filter out occ's that fall within same document.
- TextCollection::document_result result = Contains(pattern, begin, end);
- return result.size();
-}
-
-// Less than or equal
-unsigned CSA::CountLessThan(uchar const * pattern) const
-{
- TextPosition m = strlen((char const *)pattern);
- if (m == 0)
- return numberOfAllTexts - numberOfTexts; // Empty texts.
-
- TextPosition sp = 0, ep = 0;
- SearchLessThan(pattern, m, &sp, &ep);
-
- // Count end-markers in result interval
- return CountEndmarkers(sp, ep);
-}
-
-unsigned CSA::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
-{
- TextPosition m = strlen((char const *)pattern);
- if (m == 0)
- return numberOfAllTexts - numberOfTexts; // Empty texts.
-
- TextPosition sp = 0, ep = 0;
- SearchLessThan(pattern, m, &sp, &ep);
-
- // Count end-markers in result interval
- return CountEndmarkers(sp, ep, begin, end);
-}
-
-/**
- * Document reporting queries
- */
-TextCollection::document_result CSA::Prefix(uchar const * pattern) const
-{
- TextPosition m = strlen((char *)pattern);
- if (m == 0)
- return TextCollection::document_result(); // FIXME Should return all 1...k
-
- TextPosition sp = 0, ep = 0;
- Search(pattern, m, &sp, &ep);
-
- TextCollection::document_result result;
-
- // Report end-markers in result interval
- unsigned resultSize = CountEndmarkers(sp, ep);
- if (resultSize == 0)
- return result;
-
- result.reserve(resultSize); // Try to avoid reallocation.
-
- // Iterate through end-markers in [sp,ep]:
- unsigned i = 0;
- if (sp > 0)
- i = alphabetrank->rank(0, sp - 1);
- while (resultSize)
- {
- // End-marker we found belongs to the "preceeding" doc in the collection
- DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- result.push_back(docId);
-
- -- resultSize;
- ++ i;
- }
-
- return result;
-}
-
-TextCollection::document_result CSA::Prefix(uchar const * pattern, DocId begin, DocId end) const
-{
- TextPosition m = strlen((char *)pattern);
- if (m == 0)
- return TextCollection::document_result(); // FIXME Should return all 1...k
-
- TextPosition sp = 0, ep = 0;
- Search(pattern, m, &sp, &ep);
-
- TextCollection::document_result result;
-
- // Report end-markers in result interval
- unsigned resultSize = CountEndmarkers(sp, ep);
- if (resultSize == 0)
- return result;
-
- result.reserve(resultSize); // Try to avoid reallocation.
-
- // Iterate through end-markers in [sp,ep] and [begin, end]:
- unsigned i = 0;
- if (sp > 0)
- i = alphabetrank->rank(0, sp - 1);
- while (resultSize)
- {
- // End-marker we found belongs to the "preceeding" doc in the collection
- DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- if (docId >= begin && docId <= end)
- result.push_back(docId);
-
- -- resultSize;
- ++ i;
- }
-
- return result;
-}
-
-TextCollection::document_result CSA::Suffix(uchar const * pattern) const
-{
- TextPosition m = strlen((char *)pattern);
- if (m == 0)
- return TextCollection::document_result(); // FIXME Should return all 1...k
-
- TextPosition sp = 0, ep = 0;
- // Search with end-marker
- Search(pattern, m+1, &sp, &ep);
-
- TextCollection::document_result result;
- result.reserve(ep-sp+1); // Try to avoid reallocation.
-
- ulong sampled_rank_i = 0;
- // Check each occurrence
- for (; sp <= ep; ++sp)
- {
- TextPosition i = sp;
-
- uchar c = alphabetrank->access(i);
- while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
- {
- i = C[c]+alphabetrank->rank(c,i)-1;
- c = alphabetrank->access(i);
- }
- // Assert: c == '\0' OR sampled->IsBitSet(i)
-
- if (c == '\0')
- {
- // Rank among the end-markers in BWT
- unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
-
- // End-marker that we found belongs to the "preceeding" doc in collection:
- DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- result.push_back(docId);
- }
- else // Sampled position
- {
- DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- result.push_back(docId);
- }
- }
-
- return result;
-}
-
-TextCollection::document_result CSA::Suffix(uchar const * pattern, DocId begin, DocId end) const
-{
- TextPosition m = strlen((char *)pattern);
- if (m == 0)
- return TextCollection::document_result(); // FIXME Should return all 1...k
-
- TextPosition sp = 0, ep = 0;
- // Search with end-marker
- Search(pattern, m+1, &sp, &ep, begin, end);
-
- TextCollection::document_result result;
- result.reserve(ep-sp+1); // Try to avoid reallocation.
-
- ulong sampled_rank_i = 0;
- // Check each occurrence, already within [begin, end]
- for (; sp <= ep; ++sp)
- {
- TextPosition i = sp;
-
- uchar c = alphabetrank->access(i);
- while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
- {
- i = C[c]+alphabetrank->rank(c,i)-1;
- c = alphabetrank->access(i);
- }
- // Assert: c == '\0' OR sampled->IsBitSet(i)
-
- if (c == '\0')
- {
- // Rank among the end-markers in BWT
- unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
-
- // End-marker that we found belongs to the "preceeding" doc in collection:
- DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- result.push_back(docId);
- }
- else // Sampled position
- {
- DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- result.push_back(docId);
- }
- }
-
- return result;
-}
-
-
-TextCollection::document_result CSA::Equal(uchar const *pattern) const
-{
- TextPosition m = strlen((char const *)pattern);
- if (m == 0)
- return TextCollection::document_result(); // FIXME Should return all empty texts
-
- TextPosition sp = 0, ep = 0;
- // Match including end-marker
- Search(pattern, m+1, &sp, &ep);
-
- TextCollection::document_result result;
-
- // Report end-markers in result interval
- unsigned resultSize = CountEndmarkers(sp, ep);
- if (resultSize == 0)
- return result;
-
- result.reserve(resultSize); // Try to avoid reallocation.
-
- unsigned i = 0;
- if (sp > 0)
- i = alphabetrank->rank(0, sp - 1);
- while (resultSize)
- {
- // End-marker we found belongs to the "previous" doc in the collection
- DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- result.push_back(docId);
-
- -- resultSize;
- ++ i;
- }
-
- return result;
-}
-
-TextCollection::document_result CSA::Equal(uchar const *pattern, DocId begin, DocId end) const
-{
- TextPosition m = strlen((char const *)pattern);
- if (m == 0)
- return TextCollection::document_result(); // FIXME Should return all empty texts
-
- TextPosition sp = 0, ep = 0;
- // Match including end-marker
- Search(pattern, m+1, &sp, &ep, begin, end);
-
- TextCollection::document_result result;
-
- // Report end-markers in result interval
- unsigned resultSize = CountEndmarkers(sp, ep);
- if (resultSize == 0)
- return result;
-
- result.reserve(resultSize); // Try to avoid reallocation.
-
- unsigned i = 0;
- if (sp > 0)
- i = alphabetrank->rank(0, sp - 1);
- while (resultSize)
- {
- // End-marker we found belongs to the "previous" doc in the collection
- DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- result.push_back(docId); // already within [begin, end]
-
- -- resultSize;
- ++ i;
- }
-
- return result;
-}
-
-
-TextCollection::document_result CSA::Contains(uchar const * pattern) const
-{
- TextPosition m = strlen((char *)pattern);
- if (m == 0)
- return TextCollection::document_result();
-
- TextPosition sp = 0, ep = 0;
- // Search all occurrences
- Search(pattern, m, &sp, &ep);
-
- // We want unique document indentifiers, using std::set to collect them
- std::set<DocId> resultSet;
-
- ulong sampled_rank_i = 0;
- // Check each occurrence
- for (; sp <= ep; ++sp)
- {
- TextPosition i = sp;
- uchar c = alphabetrank->access(i);
- while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
- {
- i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
- c = alphabetrank->access(i);
- }
- if (c == '\0')
- {
- // Rank among the end-markers in BWT
- unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
-
- // End-marker that we found belongs to the "preceeding" doc in collection:
- DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
- resultSet.insert(docId);
- }
- else
- {
- DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
- assert((unsigned)di < numberOfTexts);
- resultSet.insert(di);
- }
- }
-
- // Convert std::set to std::vector
- TextCollection::document_result result(resultSet.begin(), resultSet.end());
- // Map to doc ID's
- for (document_result::iterator it = result.begin(); it != result.end(); ++it)
- *it = emptyTextRank->select0(*it+1);
- return result;
-}
-
-TextCollection::document_result CSA::Contains(uchar const * pattern, DocId begin, DocId end) const
-{
- TextPosition m = strlen((char *)pattern);
- if (m == 0)
- return TextCollection::document_result();
-
- TextPosition sp = 0, ep = 0;
- // Search all occurrences
- Search(pattern, m, &sp, &ep);
-
- // We want unique document indentifiers, using std::set to collect them
- std::set<DocId> resultSet;
-
- ulong sampled_rank_i = 0;
- // Check each occurrence
- for (; sp <= ep; ++sp)
- {
- TextPosition i = sp;
- uchar c = alphabetrank->access(i);
- while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
- {
- i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
- c = alphabetrank->access(i);
- }
- if (c == '\0')
- {
- // Rank among the end-markers in BWT
- unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
-
- // End-marker that we found belongs to the "preceeding" doc in collection:
- DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
- if (docId >= begin && docId <= end)
- resultSet.insert(docId);
- }
- else
- {
- DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
- assert((unsigned)docId < numberOfTexts);
- if (docId >= begin && docId <= end)
- resultSet.insert(docId);
- }
- }
-
- // Convert std::set to std::vector
- TextCollection::document_result result(resultSet.begin(), resultSet.end());
- // Map to doc ID's
- for (document_result::iterator it = result.begin(); it != result.end(); ++it)
- *it = emptyTextRank->select0(*it+1);
- return result;
-}
-
-TextCollection::document_result CSA::LessThan(uchar const * pattern) const
-{
- TextPosition m = strlen((char *)pattern);
- if (m == 0)
- return TextCollection::document_result(); // empty result set
-
- TextPosition sp = 0, ep = 0;
- SearchLessThan(pattern, m, &sp, &ep);
-
- TextCollection::document_result result;
-
- // Report end-markers in result interval
- unsigned resultSize = CountEndmarkers(sp, ep);
- if (resultSize == 0)
- return result;
-
- result.reserve(resultSize); // Try to avoid reallocation.
-
- // Iterate through end-markers in [sp,ep]:
- unsigned i = 0;
- if (sp > 0)
- i = alphabetrank->rank(0, sp - 1);
- while (resultSize)
- {
- // End-marker we found belongs to the "preceeding" doc in the collection
- DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- result.push_back(docId);
-
- -- resultSize;
- ++ i;
- }
-
- return result;
-}
-
-TextCollection::document_result CSA::LessThan(uchar const * pattern, DocId begin, DocId end) const
-{
- TextPosition m = strlen((char *)pattern);
- if (m == 0)
- return TextCollection::document_result(); // empty result set
-
- TextPosition sp = 0, ep = 0;
- SearchLessThan(pattern, m, &sp, &ep);
-
- TextCollection::document_result result;
-
- // Report end-markers in result interval
- unsigned resultSize = CountEndmarkers(sp, ep);
- if (resultSize == 0)
- return result;
-
- result.reserve(resultSize); // Try to avoid reallocation.
-
- // Iterate through end-markers in [sp,ep] and [begin, end]:
- unsigned i = 0;
- if (sp > 0)
- i = alphabetrank->rank(0, sp - 1);
- while (resultSize)
- {
- // End-marker we found belongs to the "preceeding" doc in the collection
- DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- if (docId >= begin && docId <= end)
- result.push_back(docId);
-
- -- resultSize;
- ++ i;
- }
-
- return result;
-}
-
-/**
- * Full result set queries
- */
-TextCollection::full_result CSA::FullContains(uchar const * pattern) const
-{
- TextPosition m = strlen((char *)pattern);
- if (m == 0)
- return full_result(); // FIXME Throw exception?
-
- TextPosition sp = 0, ep = 0;
- // Search all occurrences
- Search(pattern, m, &sp, &ep);
-
- full_result result;
- result.reserve(ep-sp+1); // Try to avoid reallocation.
-
- ulong sampled_rank_i = 0;
- // Report each occurrence
- for (; sp <= ep; ++sp)
- {
- TextPosition i = sp;
- TextPosition dist = 0;
- uchar c = alphabetrank->access(i);
- while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
- {
- i = C[c]+alphabetrank->rank(c,i)-1;
- c = alphabetrank->access(i);
- ++ dist;
- }
- if (c == '\0')
- {
- // Rank among the end-markers in BWT
- unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
-
- // End-marker that we found belongs to the "preceeding" doc in collection:
- DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- result.push_back(make_pair(docId, dist));
- }
- else
- {
- TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
- DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
-// textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
-
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- result.push_back(make_pair(docId, textPos));
- }
- }
-
- return result;
-}
-
-TextCollection::full_result CSA::FullContains(uchar const * pattern, DocId begin, DocId end) const
-{
- TextPosition m = strlen((char *)pattern);
- if (m == 0)
- return full_result(); // FIXME Throw exception?
-
- TextPosition sp = 0, ep = 0;
- // Search all occurrences
- Search(pattern, m, &sp, &ep);
-
- full_result result;
- result.reserve(ep-sp+1); // Try to avoid reallocation.
-
- ulong sampled_rank_i = 0;
- // Report each occurrence
- for (; sp <= ep; ++sp)
- {
- TextPosition i = sp;
- TextPosition dist = 0;
- uchar c = alphabetrank->access(i);
- while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
- {
- i = C[c]+alphabetrank->rank(c,i)-1;
- c = alphabetrank->access(i);
- ++ dist;
- }
- if (c == '\0')
- {
- // Rank among the end-markers in BWT
- unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
-
- // End-marker that we found belongs to the "preceeding" doc in collection:
- DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- if (docId >= begin && docId <= end)
- result.push_back(make_pair(docId, dist));
- }
- else
- {
- TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
- DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
-// textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
-
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- if (docId >= begin && docId <= end)
- result.push_back(make_pair(docId, textPos));
- }
- }
-
- return result;
-}
-
-
-/**
- * Save index to a file handle
- *
- * Throws a std::runtime_error exception on i/o error.
- * First byte that is saved represents the version number of the save file.
- * In version 2 files, the data fields are:
- * uchar versionFlag;
- TextPosition n;
- unsigned samplerate;
- unsigned C[256];
- TextPosition bwtEndPos;
- static_sequence * alphabetrank;
- BSGAP *sampled;
- BlockArray *suffixes;
- BlockArray *suffixDocId;
- unsigned numberOfTexts;
- unsigned numberOfAllTexts;
- ulong maxTextLength;
- BlockArray *endmarkerDocId;
- BSGAP *emptyTextRank;
- */
-void CSA::Save(FILE *file) const
-{
- // Saving version info:
- if (std::fwrite(&versionFlag, 1, 1, file) != 1)
- throw std::runtime_error("CSA::Save(): file write error (version flag).");
-
- if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
- throw std::runtime_error("CSA::Save(): file write error (n).");
- if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
- throw std::runtime_error("CSA::Save(): file write error (samplerate).");
-
- for(ulong i = 0; i < 256; ++i)
- if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
- throw std::runtime_error("CSA::Save(): file write error (C table).");
-
- if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
- throw std::runtime_error("CSA::Save(): file write error (bwt end position).");
-
- alphabetrank->save(file);
- sampled->Save(file);
- suffixes->Save(file);
- suffixDocId->Save(file);
-
- if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
- throw std::runtime_error("CSA::Save(): file write error (numberOfTexts).");
- if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
- throw std::runtime_error("CSA::Save(): file write error (numberOfAllTexts).");
- if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
- throw std::runtime_error("CSA::Save(): file write error (maxTextLength).");
-
- endmarkerDocId->Save(file);
- emptyTextRank->Save(file);
- fflush(file);
-}
-
-
-/**
- * Load index from a file handle
- *
- * Throws a std::runtime_error exception on i/o error.
- * For more info, see CSA::Save().
- */
-void CSA::Load(FILE *file, unsigned samplerate)
-{
- // Delete everything
-#ifdef CSA_TEST_BWT
- delete dynFMI; dynFMI = 0;
-#endif
- delete alphabetrank; alphabetrank = 0;
- delete sampled; sampled = 0;
- delete suffixes; suffixes = 0;
- delete suffixDocId; suffixDocId = 0;
- delete [] codetable; codetable = 0;
-
- delete endmarkerDocId; endmarkerDocId = 0;
- delete emptyTextRank; emptyTextRank = 0;
-
- this->maxTextLength = 0;
- this->numberOfTexts = 0;
- this->numberOfAllTexts = 0;
- this->samplerate = samplerate;
- this->n = 0;
-
- uchar verFlag = 0;
- if (std::fread(&verFlag, 1, 1, file) != 1)
- throw std::runtime_error("CSA::Load(): file read error (version flag).");
- if (verFlag != CSA::versionFlag)
- throw std::runtime_error("CSA::Load(): invalid save file version.");
-
- if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
- throw std::runtime_error("CSA::Load(): file read error (n).");
- if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
- throw std::runtime_error("CSA::Load(): file read error (samplerate).");
-// FIXME samplerate can not be changed during load.
-// if (this->samplerate == 0)
-// this->samplerate = samplerate;
-
- for(ulong i = 0; i < 256; ++i)
- if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
- throw std::runtime_error("CSA::Load(): file read error (C table).");
-
- if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
- throw std::runtime_error("CSA::Load(): file read error (bwt end position).");
-
- alphabetrank = static_sequence::load(file);
- sampled = new BSGAP(file);
- suffixes = new BlockArray(file);
- suffixDocId = new BlockArray(file);
-
- if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
- throw std::runtime_error("CSA::Load(): file read error (numberOfTexts).");
- if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
- throw std::runtime_error("CSA::Load(): file read error (numberOfAllTexts).");
- if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
- throw std::runtime_error("CSA::Load(): file read error (maxTextLength).");
-
- endmarkerDocId = new BlockArray(file);
- emptyTextRank = new BSGAP(file);
-
- // FIXME Construct data structures with new samplerate
- //maketables();
-}
-
-
-
-/**
- * Rest of the functions follow...
- */
-
-
-// FIXME Use 2D-search structure
-unsigned CSA::CountEndmarkers(TextPosition sp, TextPosition ep, DocId begin, DocId end) const
-{
- if (sp > ep)
- return 0;
-
- ulong ranksp = 0;
- if (sp != 0)
- ranksp = alphabetrank->rank(0, sp - 1);
-
- unsigned resultSize = alphabetrank->rank(0, ep) - ranksp;
- if (resultSize == 0)
- return 0;
-
- // Count end-markers in result interval and within [begin, end]
- unsigned count = 0;
- unsigned i = ranksp;
- while (resultSize)
- {
- // End-marker we found belongs to the "previous" doc in the collection
- DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
- // Map to doc ID:
- docId = emptyTextRank->select0(docId+1);
- if (docId >= begin && docId <= end)
- ++ count;
-
- -- resultSize;
- ++ i;
- }
- return count;
-}
-
-ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
-{
- // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
- int c = (int)pattern[m-1];
- TextPosition i=m-1;
- TextPosition sp = C[c];
- TextPosition ep = C[c+1]-1;
- while (sp<=ep && i>=1)
- {
-// printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
- c = (int)pattern[--i];
- sp = C[c]+alphabetrank->rank(c,sp-1);
- ep = C[c]+alphabetrank->rank(c,ep)-1;
- }
- *spResult = sp;
- *epResult = ep;
- if (sp<=ep)
- return ep - sp + 1;
- else
- return 0;
-}
-
-ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const
-{
- // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
- int c = (int)pattern[m-1];
- assert(c == 0); // Start from endmarkers
- TextPosition i=m-1;
- TextPosition sp = begin;
- TextPosition ep = end;
- while (sp<=ep && i>=1)
- {
-// printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
- c = (int)pattern[--i];
- sp = C[c]+alphabetrank->rank(c,sp-1);
- ep = C[c]+alphabetrank->rank(c,ep)-1;
- }
- *spResult = sp;
- *epResult = ep;
- if (sp<=ep)
- return ep - sp + 1;
- else
- return 0;
-}
-
-
-ulong CSA::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
-{
- // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
- uint c = (int)pattern[m-1];
- TextPosition i=m-1;
- TextPosition sp = 1;
- TextPosition ep = C[c+1]-1;
- while (sp<=ep && i>=1)
- {
-// printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
- c = (int)pattern[--i];
- uint result = alphabetrank->rank(c,ep);
- if (result == ~0u)
- ep = 0;
- else
- ep = C[c]+result-1;
- }
- *spResult = sp;
- *epResult = ep;
- if (sp<=ep)
- return ep - sp + 1;
- else
- return 0;
-}
-
-
-CSA::~CSA() {
-#ifdef CSA_TEST_BWT
- delete dynFMI;
-#endif
- delete alphabetrank;
- delete sampled;
- delete suffixes;
- delete suffixDocId;
- delete [] codetable; // FIXME remove
-
- delete endmarkerDocId;
- delete emptyTextRank;
-}
-
-void CSA::makewavelet(uchar *bwt)
-{
- ulong i, min = 0,
- max;
- for (i=0;i<256;i++)
- C[i]=0;
- for (i=0;i<n;++i)
- C[(int)bwt[i]]++;
- for (i=0;i<256;i++)
- if (C[i]>0) {min = i; break;}
- for (i=255;i>=min;--i)
- if (C[i]>0) {max = i; break;}
-
- ulong prev=C[0], temp;
- C[0]=0;
- for (i=1;i<256;i++) {
- temp = C[i];
- C[i]=C[i-1]+prev;
- prev = temp;
- }
-// this->codetable = node::makecodetable(bwt,n);
-// alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
-// delete [] bwt;
- //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
-// std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
-// std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
-
-// HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove
-
- alphabet_mapper * am = new alphabet_mapper_none();
- static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate?
-// static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
- wt_coder * wtc = new wt_coder_binary(bwt,n,am);
- alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
- delete bmb;
- bwt = 0; // already deleted
-
-// std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
-// std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
-}
-
-void CSA::maketables()
-{
- // Calculate BWT end-marker position (of last inserted text)
- // Note: bwtEndPos already set!
- // and the length of the first text (counter l):
-#ifdef CSA_TEST_BWT
- {
- ulong i = 0;
- ulong l = 1;
- uint alphabetrank_i_tmp = 0;
- uchar c = alphabetrank->access(i, alphabetrank_i_tmp);
- while (c != '\0')
- {
- i = C[c]+alphabetrank_i_tmp-1;
- ++l;
- c = alphabetrank->access(i, alphabetrank_i_tmp);
- }
-
- if (i != bwtEndPos) // compare result
- {
- std::cout << "i = " << i << ", bwtendpos = " << bwtEndPos << std::endl;
- exit(0);
- }
- }
-#endif
-
- // Build up arrays for text length and starting positions
- // FIXME Temp, remove
- //BlockArray* textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength));
- BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
- //(*textLength)[0] = l;
- (*textStartPos)[0] = 0;
-
- // Construct samples
- ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
- unsigned ceilLog2n = Tools::CeilLog2(n);
- BlockArray* positions = new BlockArray(sampleLength, ceilLog2n);
- BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n);
-
- // Mapping from end-markers to doc ID's:
- endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
-
- ulong *sampledpositions = new ulong[n/W+1];
- for (ulong i=0;i<n/W+1;i++)
- sampledpositions[i]=0lu;
-
- ulong x,p=bwtEndPos;
- ulong sampleCount = 0;
- // Keeping track of text position of prev. end-marker seen
- ulong posOfSuccEndmarker = n;
- DocId textId = numberOfTexts;
- ulong ulongmax = 0;
- ulongmax--;
- uint alphabetrank_i_tmp =0;
-
- //positions:
- for (ulong i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
- // i substitutes SA->GetPos(i)
- x=(i==n-1)?0:i+1;
-
- if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
- Tools::SetField(sampledpositions,1,p,1);
- (*positions)[sampleCount] = p;
- (*tmpSuffix)[sampleCount] = x; // FIXME remove
- sampleCount ++;
- }
-
- uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
- if (c == '\0')
- {
- --textId;
-
- // Record the order of end-markers in BWT:
- ulong endmarkerRank = alphabetrank_i_tmp - 1;
- (*endmarkerDocId)[endmarkerRank] = textId;
-
- // Store text length and text start position:
- if (textId < (DocId)numberOfTexts - 1)
- {
- //(*textLength)[textId + 1] = posOfSuccEndmarker - x;
- (*textStartPos)[textId + 1] = x; // x is the position of end-marker.
- posOfSuccEndmarker = x;
- }
-
- // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
- p = textId; // Correct LF-mapping to the last char of the previous text.
- }
- else
- p = C[c]+alphabetrank_i_tmp-1;
- }
- assert(textId == 0);
-
- sampled = new BSGAP(sampledpositions,n,true);
- sampleLength = sampled->rank(n-1);
- assert(sampleCount == sampleLength);
-
- // Suffixes store an offset from the text start position
- suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
- suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
-
- for(ulong i=0; i<sampleLength; i++) {
- assert((*positions)[i] < n);
- ulong j = sampled->rank((*positions)[i]);
- if (j==0) j=sampleLength;
- TextPosition textPos = (*tmpSuffix)[i];
- (*suffixDocId)[j-1] = DocIdAtTextPos(textStartPos, textPos);
-
- assert((unsigned)DocIdAtTextPos(textStartPos, textPos) < numberOfTexts);
- assert((*suffixDocId)[j-1] < numberOfTexts);
- // calculate offset from text start:
- (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]];
- }
- // FIXME Temp, remove
- delete tmpSuffix;
- delete positions;
-// delete textLength;
- delete textStartPos;
-}
-
-
-/**
- * Finds document identifier for given text position
- *
- * Starting text position of the document is stored into second parameter.
- * Binary searching on text starting positions.
- */
-TextCollection::DocId CSA::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
-{
- assert(i < n);
-
- DocId a = 0;
- DocId b = numberOfTexts - 1;
- while (a < b)
- {
- DocId c = a + (b - a)/2;
- if ((*textStartPos)[c] > i)
- b = c - 1;
- else if ((*textStartPos)[c+1] > i)
- return c;
- else
- a = c + 1;
- }
-
- assert(a < (DocId)numberOfTexts);
- assert(i >= (*textStartPos)[a]);
- assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
- return a;
-}
-
-CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
-{
- TCodeEntry *result = new TCodeEntry[ 256 ];
-
- count_chars( text, n, result );
- std::priority_queue< node, std::vector< node >, std::greater<node> > q;
-//
-// First I push all the leaf nodes into the queue
-//
- for ( unsigned int i = 0 ; i < 256 ; i++ )
- if ( result[ i ].count )
- q.push(node( i, result[ i ].count ) );
-//
-// This loop removes the two smallest nodes from the
-// queue. It creates a new internal node that has
-// those two nodes as children. The new internal node
-// is then inserted into the priority queue. When there
-// is only one node in the priority queue, the tree
-// is complete.
-//
-
- while ( q.size() > 1 ) {
- node *child0 = new node( q.top() );
- q.pop();
- node *child1 = new node( q.top() );
- q.pop();
- q.push( node( child0, child1 ) );
- }
-//
-// Now I compute and return the codetable
-//
- q.top().maketable(0u,0u, result);
- q.pop();
- return result;
-}
-
-
-void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
-{
- if ( child0 )
- {
- child0->maketable( SetBit(code,bits,0), bits+1, codetable );
- child1->maketable( SetBit(code,bits,1), bits+1, codetable );
- delete child0;
- delete child1;
- }
- else
- {
- codetable[value].code = code;
- codetable[value].bits = bits;
- }
-}
-
-void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
-{
- ulong i;
- for (i = 0 ; i < 256 ; i++ )
- counts[ i ].count = 0;
- for (i=0; i<n; i++)
- counts[(int)text[i]].count++;
-}
-
-unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
- return x | (bit << pos);
-}
-
-} // namespace SXSI
-
+++ /dev/null
-/******************************************************************************
- * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
- * *
- * *
- * This program is free software; you can redistribute it and/or modify *
- * it under the terms of the GNU Lesser General Public License as published *
- * by the Free Software Foundation; either version 2 of the License, or *
- * (at your option) any later version. *
- * *
- * This program is distributed in the hope that it will be useful, *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
- * GNU Lesser General Public License for more details. *
- * *
- * You should have received a copy of the GNU Lesser General Public License *
- * along with this program; if not, write to the *
- * Free Software Foundation, Inc., *
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
- *****************************************************************************/
-
-#ifndef _CSA_H_
-#define _CSA_H_
-#include "dynFMI.h"
-#include "BitRank.h"
-#include "TextCollection.h"
-#include "BlockArray.h"
-#include "RLWaveletTree.h"
-#include "StringIterator.h"
-#include <set>
-#include <vector>
-
-// Include from XMLTree/libcds
-#include <basics.h> // Defines W == 32
-#include <static_bitsequence.h>
-#include <alphabet_mapper.h>
-#include <static_sequence.h>
-
-// Re-define word size to ulong:
-#undef W
-#if __WORDSIZE == 64
-# define W 64
-#else
-# define W 32
-#endif
-#undef bitset
-
-namespace SXSI
-{
-
-// Un-comment to compare BWT against a BWT generated from class dynFMI:
-//#define CSA_TEST_BWT
-
-/**
- * Implementation of the TextCollection interface
- *
- */
-class CSA : public SXSI::TextCollection {
-public:
- /**
- * Constructor with (default) samplerate
- */
- explicit CSA(unsigned);
- ~CSA();
- /**
- * Following functions implement the interface described in TextCollection.h.
- * For details/documentation, see TextCollection.h.
- */
- void InsertText(uchar const *);
- void MakeStatic();
- bool EmptyText(DocId) const;
- uchar* GetText(DocId) const;
- /**
- * Next method is not supported:
- * Supporting GetText for some substring [i,j]
- * would require more space.
- */
-// uchar* GetText(DocId, TextPosition, TextPosition) const;
-
- bool IsPrefix(uchar const *) const;
- bool IsSuffix(uchar const *) const;
- bool IsEqual(uchar const *) const;
- bool IsContains(uchar const *) const;
- bool IsLessThan(uchar const *) const;
-
- bool IsPrefix(uchar const *, DocId, DocId) const;
- bool IsSuffix(uchar const *, DocId, DocId) const;
- bool IsEqual(uchar const *, DocId, DocId) const;
- bool IsContains(uchar const *, DocId, DocId) const;
- bool IsLessThan(uchar const *, DocId, DocId) const;
-
- ulong Count(uchar const *) const;
- unsigned CountPrefix(uchar const *) const;
- unsigned CountSuffix(uchar const *) const;
- unsigned CountEqual(uchar const *) const;
- unsigned CountContains(uchar const *) const;
- unsigned CountLessThan(const unsigned char*) const;
-
- unsigned CountPrefix(uchar const *, DocId, DocId) const;
- unsigned CountSuffix(uchar const *, DocId, DocId) const;
- unsigned CountEqual(uchar const *, DocId, DocId) const;
- unsigned CountContains(uchar const *, DocId, DocId) const;
- unsigned CountLessThan(uchar const *, DocId, DocId) const;
-
- // Definition of document_result is inherited from SXSI::TextCollection.
- document_result Prefix(uchar const *) const;
- document_result Suffix(uchar const *) const;
- document_result Equal(uchar const *) const;
- document_result Contains(uchar const *) const;
- document_result LessThan(uchar const *) const;
-
- document_result Prefix(uchar const *, DocId, DocId) const;
- document_result Suffix(uchar const *, DocId, DocId) const;
- document_result Equal(uchar const *, DocId, DocId) const;
- document_result Contains(uchar const *, DocId, DocId) const;
- document_result LessThan(uchar const *, DocId, DocId) const;
-
- // Definition of full_result is inherited from SXSI::TextCollection.
- full_result FullContains(uchar const *) const;
- full_result FullContains(uchar const *, DocId, DocId) const;
-
- // Index from/to disk
- void Load(FILE *, unsigned);
- void Save(FILE *) const;
-
-
- // FIXME Remove
- void deleteWT()
- {
- delete alphabetrank;
- alphabetrank = 0;
- delete [] codetable;
- codetable = 0;
- }
- void deleteSamples()
- {
- delete sampled;
- sampled =0;
- delete suffixes;
- suffixes = 0;
- delete suffixDocId;
- suffixDocId = 0;
- }
- void deleteEndmarker()
- {
- delete endmarkerDocId;
- endmarkerDocId = 0;
- }
-
-private:
- // FIXME Unused code
- class TCodeEntry {
- public:
- unsigned count;
- unsigned bits;
- unsigned code;
- TCodeEntry() {count=0;bits=0;code=0u;};
- };
-
-
- // FIXME Unused code
- class THuffAlphabetRank {
- // using fixed 0...255 alphabet
- private:
- BitRank *bitrank;
- THuffAlphabetRank *left;
- THuffAlphabetRank *right;
- TCodeEntry *codetable;
- uchar ch;
- bool leaf;
- public:
- THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned);
- THuffAlphabetRank(FILE *);
- ~THuffAlphabetRank();
-
- void Save(FILE *);
- bool Test(uchar *, TextPosition);
-
- inline TextPosition rank(int c, TextPosition i) const { // returns the number of characters c before and including position i
- THuffAlphabetRank const * temp=this;
- if (codetable[c].count == 0) return 0;
- unsigned level = 0;
- unsigned code = codetable[c].code;
- while (!temp->leaf) {
- if ((code & (1u<<level)) == 0) {
- i = i-temp->bitrank->rank(i);
- temp = temp->left;
- }
- else {
- i = temp->bitrank->rank(i)-1;
- temp = temp->right;
- }
- ++level;
- }
- return i+1;
- };
- inline bool IsCharAtPos(int c, TextPosition i) const {
- THuffAlphabetRank const * temp=this;
- if (codetable[c].count == 0) return false;
- unsigned level = 0;
- unsigned code = codetable[c].code;
- while (!temp->leaf) {
- if ((code & (1u<<level))==0) {
- if (temp->bitrank->IsBitSet(i)) return false;
- i = i-temp->bitrank->rank(i);
- temp = temp->left;
- }
- else {
- if (!temp->bitrank->IsBitSet(i)) return false;
- i = temp->bitrank->rank(i)-1;
- temp = temp->right;
- }
- ++level;
- }
- return true;
- }
- inline uchar access(TextPosition i) const {
- THuffAlphabetRank const * temp=this;
- while (!temp->leaf) {
- if (temp->bitrank->IsBitSet(i)) {
- i = temp->bitrank->rank(i)-1;
- temp = temp->right;
- }
- else {
- i = i-temp->bitrank->rank(i);
- temp = temp->left;
- }
- }
- return temp->ch;
- }
-
- inline uchar charAtPos(ulong i, TextPosition *rank) const {
- THuffAlphabetRank const * temp=this;
- while (!temp->leaf) {
- if (temp->bitrank->IsBitSet(i)) {
- i = temp->bitrank->rank(i)-1;
- temp = temp->right;
- } else {
- i = i-temp->bitrank->rank(i);
- temp = temp->left;
- }
- }
- (*rank)=i;
- return temp->ch;
- }
- };
-
- // FIXME Unused code
- class node {
- private:
- unsigned weight;
- uchar value;
- node *child0;
- node *child1;
-
- void maketable( unsigned code, unsigned bits, TCodeEntry *codetable ) const;
- static void count_chars(uchar *, TextPosition , TCodeEntry *);
- static unsigned SetBit(unsigned , unsigned , unsigned );
- public:
- node( unsigned char c = 0, unsigned i = 0 ) {
- value = c;
- weight = i;
- child0 = 0;
- child1 = 0;
- }
-
- node( node* c0, node *c1 ) {
- value = 0;
- weight = c0->weight + c1->weight;
- child0 = c0;
- child1 = c1;
- }
-
-
- bool operator>( const node &a ) const {
- return weight > a.weight;
- }
-
- static TCodeEntry *makecodetable(uchar *, TextPosition);
- };
-
- static const uchar versionFlag;
- TextPosition n;
- unsigned samplerate;
- unsigned C[256];
- TextPosition bwtEndPos;
-// THuffAlphabetRank *alphabetrank;
- // RLWaveletTree *alphabetrank;
- static_sequence * alphabetrank;
-
-#ifdef CSA_TEST_BWT
- DynFMI *dynFMI;
-#endif
- TCodeEntry *codetable;
-
- // Sample structures for texts longer than samplerate
- BSGAP *sampled;
- BlockArray *suffixes;
- BlockArray *suffixDocId;
-
- // Total number of texts in the collection
- unsigned numberOfTexts;
- // Total number of texts including empty texts
- unsigned numberOfAllTexts;
- // Length of the longest text
- ulong maxTextLength;
-
- // Array of document id's in the order of end-markers in BWT
- // Access by endmarkerDocId[rank_$(L, p) - 1].
- BlockArray *endmarkerDocId;
-
- // FIXME Replace with a more succinct data structure
- // Note: Empty texts are already handled inside XMLTree class.
- std::set<unsigned> emptyTextId;
- BSGAP *emptyTextRank;
-
- // FIXME A better solution?
- std::string texts;
-
- // Following are not part of the public API
- uchar * BWT(uchar *);
- void makewavelet(uchar *);
- void maketables();
- DocId DocIdAtTextPos(BlockArray*, TextPosition) const;
- ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
- ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const;
- ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
-// TextPosition Inverse(TextPosition) const;
-// TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
-// TextPosition Psi(TextPosition) const;
-// uchar * Substring(TextPosition, TextPosition) const;
-// TextPosition Lookup(TextPosition) const;
-
- /**
- * Count end-markers in given interval
- */
- inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
- {
- if (sp > ep)
- return 0;
-
- ulong ranksp = 0;
- if (sp != 0)
- ranksp = alphabetrank->rank(0, sp - 1);
-
- return alphabetrank->rank(0, ep) - ranksp;
- }
- unsigned CountEndmarkers(TextPosition, TextPosition, DocId, DocId) const;
-}; // class CSA
-
-} // namespace SXSI
-
-#endif
unsigned long HeapProfiler::GetMaxHeapConsumption() {
return maxConsumption;
}
+void HeapProfiler::ResetMaxHeapConsumption() {
+ maxConsumption = 0;
+}
unsigned long HeapProfiler::GetHeapConsumption() {
return consumption;
{
public:
static unsigned long GetMaxHeapConsumption();
+ static void ResetMaxHeapConsumption();
static unsigned long GetHeapConsumption();
// Prototypes for hooks
--- /dev/null
+/******************************************************************************
+ * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
+ * *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU Lesser General Public License as published *
+ * by the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU Lesser General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU Lesser General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ *****************************************************************************/
+#include "TCImplementation.h"
+
+//#include "HeapProfiler.h" // FIXME remove
+
+#include <iostream>
+#include <map>
+#include <set>
+#include <vector>
+#include <utility>
+#include <stdexcept>
+#include <cassert>
+#include <cstring> // For strlen()
+using std::vector;
+using std::pair;
+using std::make_pair;
+using std::map;
+
+namespace SXSI
+{
+
+// Save file version info
+const uchar TCImplementation::versionFlag = 3;
+
+/**
+ * Constructor inits an empty dynamic FM-index.
+ * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
+ */
+TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_, unsigned numberOfTexts_, ulong maxTextLength_)
+ : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
+ suffixDocId(0), numberOfTexts(numberOfTexts_), numberOfAllTexts(numberOfTexts_), maxTextLength(maxTextLength_), endmarkerDocId(0),
+ emptyTextRank(0)
+{
+
+ // Bitvector of empty texts, FIXME Remove?
+ {
+ //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <<std::endl;
+ ulong *tempB = new ulong[numberOfTexts/W + 1];
+ for (ulong i = 0; i < numberOfTexts/W + 1; ++i)
+ tempB[i] = 0;
+// for (std::set<unsigned>::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it)
+// Tools::SetField(tempB, 1, (*it), 1);
+ emptyTextRank = new BSGAP(tempB, numberOfTexts, true);
+ }
+ makewavelet(bwt); // Deletes bwt!
+ bwt = 0;
+
+ // Make sampling tables
+ maketables();
+}
+
+bool TCImplementation::EmptyText(DocId k) const
+{
+ assert(k < (DocId)numberOfTexts);
+ if (emptyTextRank->IsBitSet(k))
+ return true;
+ return false;
+}
+
+uchar* TCImplementation::GetText(DocId k) const
+{
+ assert(k < (DocId)numberOfTexts);
+
+ ulong textRank = 0;
+ if (emptyTextRank->IsBitSet(k, &textRank))
+ {
+ uchar* result = new uchar[1];
+ result[0] = 0;
+ return result;
+ }
+ // Map to non-empty text
+ k -= textRank; //emptyTextRank->rank(k);
+
+ TextPosition i = k;
+
+ string result;
+ // Reserve average string length to avoid reallocs
+ result.reserve(n/numberOfTexts);
+
+ uchar c = alphabetrank->access(i);
+ while (c != '\0')
+ {
+ result.push_back(c);
+ i = C[c]+alphabetrank->rank(c,i)-1;
+
+ c = alphabetrank->access(i); // "next" char.
+ }
+
+ // Convert to uchar (FIXME return string?)
+ i = result.size();
+ uchar* res = new uchar[i+1];
+ res[i] = '\0';
+ for (ulong j = 0; j < i; ++j)
+ res[i-j-1] = result[j];
+ return res;
+}
+/*
+ * Not supported
+uchar* TCImplementation::GetText(DocId k, TextPosition i, TextPosition j) const
+{
+ assert(k < (DocId)numberOfTexts);
+ assert(j < (*textLength)[k]);
+ assert(i <= j);
+
+ ulong textRank = 0;
+ if (emptyTextRank->IsBitSet(k, &textRank))
+ {
+ uchar* result = new uchar[1];
+ result[0] = 0;
+ return result;
+ }
+
+ // Map to non-empty text
+ k -= textRank; // emptyTextRank->rank(k);
+
+ // Start position of k'th text
+ ulong start = (*textStartPos)[k];
+
+ return Substring(i + start, j-i+1);
+ }*/
+
+/******************************************************************
+ * Existential queries
+ */
+bool TCImplementation::IsPrefix(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return true;
+
+ TextPosition sp = 0, ep = 0;
+ Search(pattern, m, &sp, &ep);
+
+ // Check for end-marker(s) in result interval
+ if (CountEndmarkers(sp, ep))
+ return true;
+ return false;
+}
+
+bool TCImplementation::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return true;
+
+ TextPosition sp = 0, ep = 0;
+ Search(pattern, m, &sp, &ep);
+
+ // Check for end-marker(s) in result interval
+ if (CountEndmarkers(sp, ep, begin, end))
+ return true;
+ return false;
+}
+
+
+bool TCImplementation::IsSuffix(uchar const *pattern) const
+{
+ // Here counting is as fast as IsSuffix():
+ if (CountSuffix(pattern) > 0)
+ return true;
+ return false;
+}
+
+bool TCImplementation::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
+{
+ // Here counting is as fast as IsSuffix():
+ if (CountSuffix(pattern, begin, end) > 0)
+ return true;
+ return false;
+}
+
+bool TCImplementation::IsEqual(uchar const *pattern) const
+{
+ TextPosition m = std::strlen((char *)pattern);
+ if (m == 0)
+ {
+ if (numberOfAllTexts - numberOfTexts > 0u)
+ return true; // Empty texts exists
+ return false; // No empty texts exists
+ }
+
+ TextPosition sp = 0, ep = 0;
+ // Match including end-marker
+ Search(pattern, m+1, &sp, &ep);
+
+ // Check for end-marker(s) in result interval
+ if (CountEndmarkers(sp, ep))
+ return true;
+ return false;
+}
+
+bool TCImplementation::IsEqual(uchar const *pattern, DocId begin, DocId end) const
+{
+ TextPosition m = std::strlen((char *)pattern);
+ if (m == 0)
+ {
+ if (numberOfAllTexts - numberOfTexts > 0u)
+ return true; // Empty texts exists
+ return false; // No empty texts exists
+ }
+
+ TextPosition sp = 0, ep = 0;
+ // Match including end-marker
+ Search(pattern, m+1, &sp, &ep, begin, end);
+
+ // Check for end-marker(s) in result interval
+ if (CountEndmarkers(sp, ep))
+ return true;
+ return false;
+}
+
+bool TCImplementation::IsContains(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return true;
+
+ TextPosition sp = 0, ep = 0;
+ // Just check if pattern exists somewhere
+ ulong count = Search(pattern, m, &sp, &ep);
+
+ if (count > 0)
+ return true;
+ return false;
+}
+
+bool TCImplementation::IsContains(uchar const * pattern, DocId begin, DocId end) const
+{
+ // Here counting is as fast as existential querying
+ if (CountContains(pattern, begin, end) > 0)
+ return true; // FIXME No need to filter result set
+ return false;
+}
+
+bool TCImplementation::IsLessThan(uchar const * pattern) const
+{
+ if (CountLessThan(pattern) > 0)
+ return true;
+ return false;
+}
+
+bool TCImplementation::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
+{
+ if (CountLessThan(pattern, begin, end) > 0)
+ return true;
+ return false;
+}
+
+/******************************************************************
+ * Counting queries
+ */
+ulong TCImplementation::Count(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return 0;
+
+ TextPosition sp = 0, ep = 0;
+ unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
+ return count;
+}
+
+unsigned TCImplementation::CountPrefix(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return numberOfAllTexts;
+
+ TextPosition sp = 0, ep = 0;
+ Search(pattern, m, &sp, &ep);
+
+ // Count end-markers in result interval
+ return CountEndmarkers(sp, ep);
+}
+
+unsigned TCImplementation::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return numberOfAllTexts;
+
+ TextPosition sp = 0, ep = 0;
+ Search(pattern, m, &sp, &ep);
+
+ // Count end-markers in result interval
+ return CountEndmarkers(sp, ep, begin, end);
+}
+
+unsigned TCImplementation::CountSuffix(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return numberOfAllTexts;
+
+ TextPosition sp = 0, ep = 0;
+ // Search with end-marker
+ unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
+
+ return count;
+}
+
+unsigned TCImplementation::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return numberOfAllTexts;
+
+ TextPosition sp = 0, ep = 0;
+ // Search with end-marker
+ unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
+
+ return count;
+}
+
+unsigned TCImplementation::CountEqual(uchar const *pattern) const
+{
+ TextPosition m = strlen((char const *)pattern);
+ if (m == 0)
+ return numberOfAllTexts - numberOfTexts; // Empty texts.
+
+ TextPosition sp = 0, ep = 0;
+ // Match including end-marker
+ Search(pattern, m+1, &sp, &ep);
+
+ // Count end-markers in result interval
+ return CountEndmarkers(sp, ep);
+}
+
+unsigned TCImplementation::CountEqual(uchar const *pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char const *)pattern);
+ if (m == 0)
+ return numberOfAllTexts - numberOfTexts; // Empty texts.
+
+ TextPosition sp = 0, ep = 0;
+ // Match including end-marker
+ Search(pattern, m+1, &sp, &ep, begin, end);
+
+ // Count end-markers in result interval
+ return CountEndmarkers(sp, ep); // Already within [begin, end]
+}
+
+unsigned TCImplementation::CountContains(uchar const * pattern) const
+{
+ TextPosition m = strlen((char const *)pattern);
+ if (m == 0)
+ return numberOfAllTexts; // Total number of texts.
+
+ // Here counting is as slow as fetching the result set
+ // because we have to filter out occ's that fall within same document.
+ TextCollection::document_result result = Contains(pattern);
+ return result.size();
+}
+
+unsigned TCImplementation::CountContains(uchar const * pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char const *)pattern);
+ if (m == 0)
+ return numberOfAllTexts; // Total number of texts.
+
+ // Here counting is as slow as fetching the result set
+ // because we have to filter out occ's that fall within same document.
+ TextCollection::document_result result = Contains(pattern, begin, end);
+ return result.size();
+}
+
+// Less than or equal
+unsigned TCImplementation::CountLessThan(uchar const * pattern) const
+{
+ TextPosition m = strlen((char const *)pattern);
+ if (m == 0)
+ return numberOfAllTexts - numberOfTexts; // Empty texts.
+
+ TextPosition sp = 0, ep = 0;
+ SearchLessThan(pattern, m, &sp, &ep);
+
+ // Count end-markers in result interval
+ return CountEndmarkers(sp, ep);
+}
+
+unsigned TCImplementation::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char const *)pattern);
+ if (m == 0)
+ return numberOfAllTexts - numberOfTexts; // Empty texts.
+
+ TextPosition sp = 0, ep = 0;
+ SearchLessThan(pattern, m, &sp, &ep);
+
+ // Count end-markers in result interval
+ return CountEndmarkers(sp, ep, begin, end);
+}
+
+/**
+ * Document reporting queries
+ */
+TextCollection::document_result TCImplementation::Prefix(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::document_result(); // FIXME Should return all 1...k
+
+ TextPosition sp = 0, ep = 0;
+ Search(pattern, m, &sp, &ep);
+
+ TextCollection::document_result result;
+
+ // Report end-markers in result interval
+ unsigned resultSize = CountEndmarkers(sp, ep);
+ if (resultSize == 0)
+ return result;
+
+ result.reserve(resultSize); // Try to avoid reallocation.
+
+ // Iterate through end-markers in [sp,ep]:
+ unsigned i = 0;
+ if (sp > 0)
+ i = alphabetrank->rank(0, sp - 1);
+ while (resultSize)
+ {
+ // End-marker we found belongs to the "preceeding" doc in the collection
+ DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ result.push_back(docId);
+
+ -- resultSize;
+ ++ i;
+ }
+
+ return result;
+}
+
+TextCollection::document_result TCImplementation::Prefix(uchar const * pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::document_result(); // FIXME Should return all 1...k
+
+ TextPosition sp = 0, ep = 0;
+ Search(pattern, m, &sp, &ep);
+
+ TextCollection::document_result result;
+
+ // Report end-markers in result interval
+ unsigned resultSize = CountEndmarkers(sp, ep);
+ if (resultSize == 0)
+ return result;
+
+ result.reserve(resultSize); // Try to avoid reallocation.
+
+ // Iterate through end-markers in [sp,ep] and [begin, end]:
+ unsigned i = 0;
+ if (sp > 0)
+ i = alphabetrank->rank(0, sp - 1);
+ while (resultSize)
+ {
+ // End-marker we found belongs to the "preceeding" doc in the collection
+ DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ if (docId >= begin && docId <= end)
+ result.push_back(docId);
+
+ -- resultSize;
+ ++ i;
+ }
+
+ return result;
+}
+
+TextCollection::document_result TCImplementation::Suffix(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::document_result(); // FIXME Should return all 1...k
+
+ TextPosition sp = 0, ep = 0;
+ // Search with end-marker
+ Search(pattern, m+1, &sp, &ep);
+
+ TextCollection::document_result result;
+ result.reserve(ep-sp+1); // Try to avoid reallocation.
+
+ ulong sampled_rank_i = 0;
+ // Check each occurrence
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+
+ uchar c = alphabetrank->access(i);
+ while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
+ {
+ i = C[c]+alphabetrank->rank(c,i)-1;
+ c = alphabetrank->access(i);
+ }
+ // Assert: c == '\0' OR sampled->IsBitSet(i)
+
+ if (c == '\0')
+ {
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
+
+ // End-marker that we found belongs to the "preceeding" doc in collection:
+ DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ result.push_back(docId);
+ }
+ else // Sampled position
+ {
+ DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ result.push_back(docId);
+ }
+ }
+
+ return result;
+}
+
+TextCollection::document_result TCImplementation::Suffix(uchar const * pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::document_result(); // FIXME Should return all 1...k
+
+ TextPosition sp = 0, ep = 0;
+ // Search with end-marker
+ Search(pattern, m+1, &sp, &ep, begin, end);
+
+ TextCollection::document_result result;
+ result.reserve(ep-sp+1); // Try to avoid reallocation.
+
+ ulong sampled_rank_i = 0;
+ // Check each occurrence, already within [begin, end]
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+
+ uchar c = alphabetrank->access(i);
+ while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
+ {
+ i = C[c]+alphabetrank->rank(c,i)-1;
+ c = alphabetrank->access(i);
+ }
+ // Assert: c == '\0' OR sampled->IsBitSet(i)
+
+ if (c == '\0')
+ {
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
+
+ // End-marker that we found belongs to the "preceeding" doc in collection:
+ DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ result.push_back(docId);
+ }
+ else // Sampled position
+ {
+ DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ result.push_back(docId);
+ }
+ }
+
+ return result;
+}
+
+
+TextCollection::document_result TCImplementation::Equal(uchar const *pattern) const
+{
+ TextPosition m = strlen((char const *)pattern);
+ if (m == 0)
+ return TextCollection::document_result(); // FIXME Should return all empty texts
+
+ TextPosition sp = 0, ep = 0;
+ // Match including end-marker
+ Search(pattern, m+1, &sp, &ep);
+
+ TextCollection::document_result result;
+
+ // Report end-markers in result interval
+ unsigned resultSize = CountEndmarkers(sp, ep);
+ if (resultSize == 0)
+ return result;
+
+ result.reserve(resultSize); // Try to avoid reallocation.
+
+ unsigned i = 0;
+ if (sp > 0)
+ i = alphabetrank->rank(0, sp - 1);
+ while (resultSize)
+ {
+ // End-marker we found belongs to the "previous" doc in the collection
+ DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ result.push_back(docId);
+
+ -- resultSize;
+ ++ i;
+ }
+
+ return result;
+}
+
+TextCollection::document_result TCImplementation::Equal(uchar const *pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char const *)pattern);
+ if (m == 0)
+ return TextCollection::document_result(); // FIXME Should return all empty texts
+
+ TextPosition sp = 0, ep = 0;
+ // Match including end-marker
+ Search(pattern, m+1, &sp, &ep, begin, end);
+
+ TextCollection::document_result result;
+
+ // Report end-markers in result interval
+ unsigned resultSize = CountEndmarkers(sp, ep);
+ if (resultSize == 0)
+ return result;
+
+ result.reserve(resultSize); // Try to avoid reallocation.
+
+ unsigned i = 0;
+ if (sp > 0)
+ i = alphabetrank->rank(0, sp - 1);
+ while (resultSize)
+ {
+ // End-marker we found belongs to the "previous" doc in the collection
+ DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ result.push_back(docId); // already within [begin, end]
+
+ -- resultSize;
+ ++ i;
+ }
+
+ return result;
+}
+
+
+TextCollection::document_result TCImplementation::Contains(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::document_result();
+
+ TextPosition sp = 0, ep = 0;
+ // Search all occurrences
+ Search(pattern, m, &sp, &ep);
+
+ // We want unique document indentifiers, using std::set to collect them
+ std::set<DocId> resultSet;
+
+ ulong sampled_rank_i = 0;
+ // Check each occurrence
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ uchar c = alphabetrank->access(i);
+ while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
+ {
+ i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
+ c = alphabetrank->access(i);
+ }
+ if (c == '\0')
+ {
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
+
+ // End-marker that we found belongs to the "preceeding" doc in collection:
+ DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
+ resultSet.insert(docId);
+ }
+ else
+ {
+ DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
+ assert((unsigned)di < numberOfTexts);
+ resultSet.insert(di);
+ }
+ }
+
+ // Convert std::set to std::vector
+ TextCollection::document_result result(resultSet.begin(), resultSet.end());
+ // Map to doc ID's
+ for (document_result::iterator it = result.begin(); it != result.end(); ++it)
+ *it = emptyTextRank->select0(*it+1);
+ return result;
+}
+
+TextCollection::document_result TCImplementation::Contains(uchar const * pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::document_result();
+
+ TextPosition sp = 0, ep = 0;
+ // Search all occurrences
+ Search(pattern, m, &sp, &ep);
+
+ // We want unique document indentifiers, using std::set to collect them
+ std::set<DocId> resultSet;
+
+ ulong sampled_rank_i = 0;
+ // Check each occurrence
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ uchar c = alphabetrank->access(i);
+ while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
+ {
+ i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
+ c = alphabetrank->access(i);
+ }
+ if (c == '\0')
+ {
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
+
+ // End-marker that we found belongs to the "preceeding" doc in collection:
+ DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
+ if (docId >= begin && docId <= end)
+ resultSet.insert(docId);
+ }
+ else
+ {
+ DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
+ assert((unsigned)docId < numberOfTexts);
+ if (docId >= begin && docId <= end)
+ resultSet.insert(docId);
+ }
+ }
+
+ // Convert std::set to std::vector
+ TextCollection::document_result result(resultSet.begin(), resultSet.end());
+ // Map to doc ID's
+ for (document_result::iterator it = result.begin(); it != result.end(); ++it)
+ *it = emptyTextRank->select0(*it+1);
+ return result;
+}
+
+TextCollection::document_result TCImplementation::LessThan(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::document_result(); // empty result set
+
+ TextPosition sp = 0, ep = 0;
+ SearchLessThan(pattern, m, &sp, &ep);
+
+ TextCollection::document_result result;
+
+ // Report end-markers in result interval
+ unsigned resultSize = CountEndmarkers(sp, ep);
+ if (resultSize == 0)
+ return result;
+
+ result.reserve(resultSize); // Try to avoid reallocation.
+
+ // Iterate through end-markers in [sp,ep]:
+ unsigned i = 0;
+ if (sp > 0)
+ i = alphabetrank->rank(0, sp - 1);
+ while (resultSize)
+ {
+ // End-marker we found belongs to the "preceeding" doc in the collection
+ DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ result.push_back(docId);
+
+ -- resultSize;
+ ++ i;
+ }
+
+ return result;
+}
+
+TextCollection::document_result TCImplementation::LessThan(uchar const * pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::document_result(); // empty result set
+
+ TextPosition sp = 0, ep = 0;
+ SearchLessThan(pattern, m, &sp, &ep);
+
+ TextCollection::document_result result;
+
+ // Report end-markers in result interval
+ unsigned resultSize = CountEndmarkers(sp, ep);
+ if (resultSize == 0)
+ return result;
+
+ result.reserve(resultSize); // Try to avoid reallocation.
+
+ // Iterate through end-markers in [sp,ep] and [begin, end]:
+ unsigned i = 0;
+ if (sp > 0)
+ i = alphabetrank->rank(0, sp - 1);
+ while (resultSize)
+ {
+ // End-marker we found belongs to the "preceeding" doc in the collection
+ DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ if (docId >= begin && docId <= end)
+ result.push_back(docId);
+
+ -- resultSize;
+ ++ i;
+ }
+
+ return result;
+}
+
+/**
+ * Full result set queries
+ */
+TextCollection::full_result TCImplementation::FullContains(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return full_result(); // FIXME Throw exception?
+
+ TextPosition sp = 0, ep = 0;
+ // Search all occurrences
+ Search(pattern, m, &sp, &ep);
+
+ full_result result;
+ result.reserve(ep-sp+1); // Try to avoid reallocation.
+
+ ulong sampled_rank_i = 0;
+ // Report each occurrence
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ TextPosition dist = 0;
+ uchar c = alphabetrank->access(i);
+ while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
+ {
+ i = C[c]+alphabetrank->rank(c,i)-1;
+ c = alphabetrank->access(i);
+ ++ dist;
+ }
+ if (c == '\0')
+ {
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
+
+ // End-marker that we found belongs to the "preceeding" doc in collection:
+ DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ result.push_back(make_pair(docId, dist));
+ }
+ else
+ {
+ TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
+ DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
+// textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
+
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ result.push_back(make_pair(docId, textPos));
+ }
+ }
+
+ return result;
+}
+
+TextCollection::full_result TCImplementation::FullContains(uchar const * pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return full_result(); // FIXME Throw exception?
+
+ TextPosition sp = 0, ep = 0;
+ // Search all occurrences
+ Search(pattern, m, &sp, &ep);
+
+ full_result result;
+ result.reserve(ep-sp+1); // Try to avoid reallocation.
+
+ ulong sampled_rank_i = 0;
+ // Report each occurrence
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ TextPosition dist = 0;
+ uchar c = alphabetrank->access(i);
+ while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
+ {
+ i = C[c]+alphabetrank->rank(c,i)-1;
+ c = alphabetrank->access(i);
+ ++ dist;
+ }
+ if (c == '\0')
+ {
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
+
+ // End-marker that we found belongs to the "preceeding" doc in collection:
+ DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ if (docId >= begin && docId <= end)
+ result.push_back(make_pair(docId, dist));
+ }
+ else
+ {
+ TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
+ DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
+// textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
+
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ if (docId >= begin && docId <= end)
+ result.push_back(make_pair(docId, textPos));
+ }
+ }
+
+ return result;
+}
+
+
+/**
+ * Save index to a file handle
+ *
+ * Throws a std::runtime_error exception on i/o error.
+ * First byte that is saved represents the version number of the save file.
+ * In version 2 files, the data fields are:
+ * uchar versionFlag;
+ TextPosition n;
+ unsigned samplerate;
+ unsigned C[256];
+ TextPosition bwtEndPos;
+ static_sequence * alphabetrank;
+ BSGAP *sampled;
+ BlockArray *suffixes;
+ BlockArray *suffixDocId;
+ unsigned numberOfTexts;
+ unsigned numberOfAllTexts;
+ ulong maxTextLength;
+ BlockArray *endmarkerDocId;
+ BSGAP *emptyTextRank;
+ */
+void TCImplementation::Save(FILE *file) const
+{
+ // Saving version info:
+ if (std::fwrite(&versionFlag, 1, 1, file) != 1)
+ throw std::runtime_error("TCImplementation::Save(): file write error (version flag).");
+
+ if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
+ throw std::runtime_error("TCImplementation::Save(): file write error (n).");
+ if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
+ throw std::runtime_error("TCImplementation::Save(): file write error (samplerate).");
+
+ for(ulong i = 0; i < 256; ++i)
+ if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
+ throw std::runtime_error("TCImplementation::Save(): file write error (C table).");
+
+ if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
+ throw std::runtime_error("TCImplementation::Save(): file write error (bwt end position).");
+
+ alphabetrank->save(file);
+ sampled->Save(file);
+ suffixes->Save(file);
+ suffixDocId->Save(file);
+
+ if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
+ throw std::runtime_error("TCImplementation::Save(): file write error (numberOfTexts).");
+ if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
+ throw std::runtime_error("TCImplementation::Save(): file write error (numberOfAllTexts).");
+ if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("TCImplementation::Save(): file write error (maxTextLength).");
+
+ endmarkerDocId->Save(file);
+ emptyTextRank->Save(file);
+ fflush(file);
+}
+
+
+/**
+ * Load index from a file handle
+ *
+ * Throws a std::runtime_error exception on i/o error.
+ * For more info, see TCImplementation::Save().
+ */
+TCImplementation::TCImplementation(FILE *file, unsigned samplerate_)
+ : n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
+ suffixDocId(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0),
+ emptyTextRank(0)
+{
+ uchar verFlag = 0;
+ if (std::fread(&verFlag, 1, 1, file) != 1)
+ throw std::runtime_error("TCImplementation::Load(): file read error (version flag).");
+ if (verFlag != TCImplementation::versionFlag)
+ throw std::runtime_error("TCImplementation::Load(): invalid save file version.");
+
+ if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
+ throw std::runtime_error("TCImplementation::Load(): file read error (n).");
+ if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
+ throw std::runtime_error("TCImplementation::Load(): file read error (samplerate).");
+// FIXME samplerate can not be changed during load.
+// if (this->samplerate == 0)
+// this->samplerate = samplerate;
+
+ for(ulong i = 0; i < 256; ++i)
+ if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
+ throw std::runtime_error("TCImplementation::Load(): file read error (C table).");
+
+ if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
+ throw std::runtime_error("TCImplementation::Load(): file read error (bwt end position).");
+
+ alphabetrank = static_sequence::load(file);
+ sampled = new BSGAP(file);
+ suffixes = new BlockArray(file);
+ suffixDocId = new BlockArray(file);
+
+ if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
+ throw std::runtime_error("TCImplementation::Load(): file read error (numberOfTexts).");
+ if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
+ throw std::runtime_error("TCImplementation::Load(): file read error (numberOfAllTexts).");
+ if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength).");
+
+ endmarkerDocId = new BlockArray(file);
+ emptyTextRank = new BSGAP(file);
+
+ // FIXME Construct data structures with new samplerate
+ //maketables();
+}
+
+
+
+/**
+ * Rest of the functions follow...
+ */
+
+
+// FIXME Use 2D-search structure
+unsigned TCImplementation::CountEndmarkers(TextPosition sp, TextPosition ep, DocId begin, DocId end) const
+{
+ if (sp > ep)
+ return 0;
+
+ ulong ranksp = 0;
+ if (sp != 0)
+ ranksp = alphabetrank->rank(0, sp - 1);
+
+ unsigned resultSize = alphabetrank->rank(0, ep) - ranksp;
+ if (resultSize == 0)
+ return 0;
+
+ // Count end-markers in result interval and within [begin, end]
+ unsigned count = 0;
+ unsigned i = ranksp;
+ while (resultSize)
+ {
+ // End-marker we found belongs to the "previous" doc in the collection
+ DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ if (docId >= begin && docId <= end)
+ ++ count;
+
+ -- resultSize;
+ ++ i;
+ }
+ return count;
+}
+
+ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
+{
+ // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
+ int c = (int)pattern[m-1];
+ TextPosition i=m-1;
+ TextPosition sp = C[c];
+ TextPosition ep = C[c+1]-1;
+ while (sp<=ep && i>=1)
+ {
+// printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
+ c = (int)pattern[--i];
+ sp = C[c]+alphabetrank->rank(c,sp-1);
+ ep = C[c]+alphabetrank->rank(c,ep)-1;
+ }
+ *spResult = sp;
+ *epResult = ep;
+ if (sp<=ep)
+ return ep - sp + 1;
+ else
+ return 0;
+}
+
+ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const
+{
+ // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
+ int c = (int)pattern[m-1];
+ assert(c == 0); // Start from endmarkers
+ TextPosition i=m-1;
+ TextPosition sp = begin;
+ TextPosition ep = end;
+ while (sp<=ep && i>=1)
+ {
+// printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
+ c = (int)pattern[--i];
+ sp = C[c]+alphabetrank->rank(c,sp-1);
+ ep = C[c]+alphabetrank->rank(c,ep)-1;
+ }
+ *spResult = sp;
+ *epResult = ep;
+ if (sp<=ep)
+ return ep - sp + 1;
+ else
+ return 0;
+}
+
+
+ulong TCImplementation::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
+{
+ // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
+ uint c = (int)pattern[m-1];
+ TextPosition i=m-1;
+ TextPosition sp = 1;
+ TextPosition ep = C[c+1]-1;
+ while (sp<=ep && i>=1)
+ {
+// printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
+ c = (int)pattern[--i];
+ uint result = alphabetrank->rank(c,ep);
+ if (result == ~0u)
+ ep = 0;
+ else
+ ep = C[c]+result-1;
+ }
+ *spResult = sp;
+ *epResult = ep;
+ if (sp<=ep)
+ return ep - sp + 1;
+ else
+ return 0;
+}
+
+
+TCImplementation::~TCImplementation() {
+#ifdef CSA_TEST_BWT
+ delete dynFMI;
+#endif
+ delete alphabetrank;
+ delete sampled;
+ delete suffixes;
+ delete suffixDocId;
+
+ delete endmarkerDocId;
+ delete emptyTextRank;
+}
+
+void TCImplementation::makewavelet(uchar *bwt)
+{
+ ulong i, min = 0,
+ max;
+ for (i=0;i<256;i++)
+ C[i]=0;
+ for (i=0;i<n;++i)
+ C[(int)bwt[i]]++;
+ for (i=0;i<256;i++)
+ if (C[i]>0) {min = i; break;}
+ for (i=255;i>=min;--i)
+ if (C[i]>0) {max = i; break;}
+
+ ulong prev=C[0], temp;
+ C[0]=0;
+ for (i=1;i<256;i++) {
+ temp = C[i];
+ C[i]=C[i-1]+prev;
+ prev = temp;
+ }
+// this->codetable = node::makecodetable(bwt,n);
+// alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
+// delete [] bwt;
+ //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
+// std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
+// std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
+
+// HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove
+
+ alphabet_mapper * am = new alphabet_mapper_none();
+ static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate?
+// static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
+ wt_coder * wtc = new wt_coder_binary(bwt,n,am);
+ alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
+ delete bmb;
+ bwt = 0; // already deleted
+
+// std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
+// std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
+}
+
+void TCImplementation::maketables()
+{
+ // Calculate BWT end-marker position (of last inserted text)
+ {
+ ulong i = 0;
+ uint alphabetrank_i_tmp = 0;
+ uchar c = alphabetrank->access(i, alphabetrank_i_tmp);
+ while (c != '\0')
+ {
+ i = C[c]+alphabetrank_i_tmp-1;
+ c = alphabetrank->access(i, alphabetrank_i_tmp);
+ }
+
+ this->bwtEndPos = i;
+ }
+
+ // Build up arrays for text length and starting positions
+ // FIXME Temp, remove
+ //BlockArray* textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength));
+ BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
+ //(*textLength)[0] = l;
+ (*textStartPos)[0] = 0;
+
+ // Construct samples
+ ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
+ unsigned ceilLog2n = Tools::CeilLog2(n);
+ BlockArray* positions = new BlockArray(sampleLength, ceilLog2n);
+ BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n);
+
+ // Mapping from end-markers to doc ID's:
+ endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
+
+ ulong *sampledpositions = new ulong[n/W+1];
+ for (ulong i=0;i<n/W+1;i++)
+ sampledpositions[i]=0lu;
+
+ ulong x,p=bwtEndPos;
+ ulong sampleCount = 0;
+ // Keeping track of text position of prev. end-marker seen
+ ulong posOfSuccEndmarker = n;
+ DocId textId = numberOfTexts;
+ ulong ulongmax = 0;
+ ulongmax--;
+ uint alphabetrank_i_tmp =0;
+
+ //positions:
+ for (ulong i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
+ // i substitutes SA->GetPos(i)
+ x=(i==n-1)?0:i+1;
+
+ if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
+ Tools::SetField(sampledpositions,1,p,1);
+ (*positions)[sampleCount] = p;
+ (*tmpSuffix)[sampleCount] = x; // FIXME remove
+ sampleCount ++;
+ }
+
+ uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
+ if (c == '\0')
+ {
+ --textId;
+
+ // Record the order of end-markers in BWT:
+ ulong endmarkerRank = alphabetrank_i_tmp - 1;
+ (*endmarkerDocId)[endmarkerRank] = textId;
+
+ // Store text length and text start position:
+ if (textId < (DocId)numberOfTexts - 1)
+ {
+ //(*textLength)[textId + 1] = posOfSuccEndmarker - x;
+ (*textStartPos)[textId + 1] = x; // x is the position of end-marker.
+ posOfSuccEndmarker = x;
+ }
+
+ // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
+ p = textId; // Correct LF-mapping to the last char of the previous text.
+ }
+ else
+ p = C[c]+alphabetrank_i_tmp-1;
+ }
+ assert(textId == 0);
+
+ sampled = new BSGAP(sampledpositions,n,true);
+ sampleLength = sampled->rank(n-1);
+ assert(sampleCount == sampleLength);
+
+ // Suffixes store an offset from the text start position
+ suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
+ suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
+
+ for(ulong i=0; i<sampleLength; i++) {
+ assert((*positions)[i] < n);
+ ulong j = sampled->rank((*positions)[i]);
+ if (j==0) j=sampleLength;
+ TextPosition textPos = (*tmpSuffix)[i];
+ (*suffixDocId)[j-1] = DocIdAtTextPos(textStartPos, textPos);
+
+ assert((unsigned)DocIdAtTextPos(textStartPos, textPos) < numberOfTexts);
+ assert((*suffixDocId)[j-1] < numberOfTexts);
+ // calculate offset from text start:
+ (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]];
+ }
+ // FIXME Temp, remove
+ delete tmpSuffix;
+ delete positions;
+// delete textLength;
+ delete textStartPos;
+}
+
+
+/**
+ * Finds document identifier for given text position
+ *
+ * Starting text position of the document is stored into second parameter.
+ * Binary searching on text starting positions.
+ */
+TextCollection::DocId TCImplementation::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
+{
+ assert(i < n);
+
+ DocId a = 0;
+ DocId b = numberOfTexts - 1;
+ while (a < b)
+ {
+ DocId c = a + (b - a)/2;
+ if ((*textStartPos)[c] > i)
+ b = c - 1;
+ else if ((*textStartPos)[c+1] > i)
+ return c;
+ else
+ a = c + 1;
+ }
+
+ assert(a < (DocId)numberOfTexts);
+ assert(i >= (*textStartPos)[a]);
+ assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
+ return a;
+}
+
+
+} // namespace SXSI
+
--- /dev/null
+/******************************************************************************
+ * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
+ * *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU Lesser General Public License as published *
+ * by the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU Lesser General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU Lesser General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ *****************************************************************************/
+
+#ifndef _TCImplementation_H_
+#define _TCImplementation_H_
+#include "BitRank.h"
+#include "TextCollection.h"
+#include "BlockArray.h"
+#include "BSGAP.h"
+
+// Include from XMLTree/libcds
+#include <basics.h> // Defines W == 32
+#include <static_bitsequence.h>
+#include <alphabet_mapper.h>
+#include <static_sequence.h>
+
+// Re-define word size to ulong:
+#undef W
+#if __WORDSIZE == 64
+# define W 64
+#else
+# define W 32
+#endif
+#undef bitset
+
+namespace SXSI
+{
+
+// Un-comment to compare BWT against a BWT generated from class dynFMI:
+//#define CSA_TEST_BWT
+
+/**
+ * Implementation of the TextCollection interface
+ *
+ */
+class TCImplementation : public SXSI::TextCollection {
+public:
+ TCImplementation(uchar *, ulong, unsigned, unsigned, ulong);
+ ~TCImplementation();
+
+ bool EmptyText(DocId) const;
+ uchar* GetText(DocId) const;
+ /**
+ * Next method is not supported:
+ * Supporting GetText for some substring [i,j]
+ * would require more space.
+ */
+// uchar* GetText(DocId, TextPosition, TextPosition) const;
+
+ bool IsPrefix(uchar const *) const;
+ bool IsSuffix(uchar const *) const;
+ bool IsEqual(uchar const *) const;
+ bool IsContains(uchar const *) const;
+ bool IsLessThan(uchar const *) const;
+
+ bool IsPrefix(uchar const *, DocId, DocId) const;
+ bool IsSuffix(uchar const *, DocId, DocId) const;
+ bool IsEqual(uchar const *, DocId, DocId) const;
+ bool IsContains(uchar const *, DocId, DocId) const;
+ bool IsLessThan(uchar const *, DocId, DocId) const;
+
+ ulong Count(uchar const *) const;
+ unsigned CountPrefix(uchar const *) const;
+ unsigned CountSuffix(uchar const *) const;
+ unsigned CountEqual(uchar const *) const;
+ unsigned CountContains(uchar const *) const;
+ unsigned CountLessThan(const unsigned char*) const;
+
+ unsigned CountPrefix(uchar const *, DocId, DocId) const;
+ unsigned CountSuffix(uchar const *, DocId, DocId) const;
+ unsigned CountEqual(uchar const *, DocId, DocId) const;
+ unsigned CountContains(uchar const *, DocId, DocId) const;
+ unsigned CountLessThan(uchar const *, DocId, DocId) const;
+
+ // Definition of document_result is inherited from SXSI::TextCollection.
+ document_result Prefix(uchar const *) const;
+ document_result Suffix(uchar const *) const;
+ document_result Equal(uchar const *) const;
+ document_result Contains(uchar const *) const;
+ document_result LessThan(uchar const *) const;
+
+ document_result Prefix(uchar const *, DocId, DocId) const;
+ document_result Suffix(uchar const *, DocId, DocId) const;
+ document_result Equal(uchar const *, DocId, DocId) const;
+ document_result Contains(uchar const *, DocId, DocId) const;
+ document_result LessThan(uchar const *, DocId, DocId) const;
+
+ // Definition of full_result is inherited from SXSI::TextCollection.
+ full_result FullContains(uchar const *) const;
+ full_result FullContains(uchar const *, DocId, DocId) const;
+
+ // Index from/to disk
+ TCImplementation(FILE *, unsigned);
+ void Save(FILE *) const;
+
+private:
+ static const uchar versionFlag;
+ TextPosition n;
+ unsigned samplerate;
+ unsigned C[256];
+ TextPosition bwtEndPos;
+ static_sequence * alphabetrank;
+
+ // Sample structures for texts longer than samplerate
+ BSGAP *sampled; // FIXME Replace with RRR02
+ BlockArray *suffixes;
+ BlockArray *suffixDocId;
+
+ // Total number of texts in the collection
+ unsigned numberOfTexts;
+ // Total number of texts including empty texts
+ unsigned numberOfAllTexts;
+ // Length of the longest text
+ ulong maxTextLength;
+
+ // Array of document id's in the order of end-markers in BWT
+ // Access by endmarkerDocId[rank_$(L, p) - 1].
+ BlockArray *endmarkerDocId;
+
+ // FIXME Replace with a more succinct data structure
+ // Note: Empty texts are already handled inside XMLTree class.
+ BSGAP *emptyTextRank; // FIXME Remove
+
+ // Following are not part of the public API
+ uchar * BWT(uchar *);
+ void makewavelet(uchar *);
+ void maketables();
+ DocId DocIdAtTextPos(BlockArray*, TextPosition) const;
+ ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
+ ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const;
+ ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
+
+ /**
+ * Count end-markers in given interval
+ */
+ inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
+ {
+ if (sp > ep)
+ return 0;
+
+ ulong ranksp = 0;
+ if (sp != 0)
+ ranksp = alphabetrank->rank(0, sp - 1);
+
+ return alphabetrank->rank(0, ep) - ranksp;
+ }
+ unsigned CountEndmarkers(TextPosition, TextPosition, DocId, DocId) const;
+}; // class TCImplementation
+
+} // namespace SXSI
+
+#endif
#include "TextCollection.h"
-#include "CSA.h"
+#include "TCImplementation.h"
namespace SXSI
{
/**
- * Init text collection
+ * Init text collection from a file
*
- * See CSA.h for more details.
+ * See TCImplementation.h for more details.
*/
- TextCollection * TextCollection::InitTextCollection(unsigned samplerate)
+ TextCollection * TextCollection::Load(FILE *fp, unsigned samplerate)
{
- TextCollection *result = new CSA(samplerate);
+ TextCollection *result = new TCImplementation(fp, samplerate);
return result;
}
}
namespace SXSI
{
+
/**
* General interface for a text collection
*
// Type for text position (FIXME ulong or long?)
typedef ulong TextPosition;
- /**
- * Init an instance of a text collection object
- *
- * Returns a pointer to an object implementing this interface.
- */
- static TextCollection * InitTextCollection(unsigned samplerate = TEXTCOLLECTION_DEFAULT_SAMPLERATE);
/**
* Load from a file
*
* Throws an exception if std::fread() fails.
*
*/
- virtual void Load(FILE *, unsigned samplerate = 0) = 0;
+ static TextCollection* Load(FILE *, unsigned samplerate = 0);
+
/**
* Save data structure into a file
*
* Throws an exception if std::fwrite() fails.
*/
virtual void Save(FILE *) const = 0;
+
/**
* Virtual destructor
*/
virtual ~TextCollection() { };
- /**
- * Insert text
- *
- * Must be a zero-terminated string from alphabet [1,255].
- * Can not be called after makeStatic().
- * The i'th text insertion gets an identifier value i-1.
- * In other words, document identifiers start from 0.
- */
- virtual void InsertText(uchar const *) = 0;
- /**
- * Make static
- *
- * Convert to a static collection; reduces space and time complexities.
- * New texts can not be inserted after this operation.
- */
- virtual void MakeStatic() = 0;
-
+
/**
- tests if the string pointed to by DocId is empty
- */
-
+ * Tests if the string pointed to by DocId is empty
+ */
virtual bool EmptyText(DocId) const = 0;
/**
virtual full_result FullContains(uchar const *, DocId, DocId) const = 0;
protected:
- // Protected constructor; call the static function InitTextCollection().
+ // Protected constructor; use TextCollectionBuilder
TextCollection() { };
// No copy constructor or assignment
--- /dev/null
+#include "incbwt/rlcsa_builder.h"
+#include "TextCollectionBuilder.h"
+
+// Un-comment next line to run a comparison of resulting BWT
+//#define TCB_TEST_BWT
+
+#ifdef TCB_TEST_BWT
+#include "dynFMI.h"
+#endif
+
+#include "TCImplementation.h"
+
+namespace SXSI
+{
+
+struct TCBuilderRep
+{
+ unsigned samplerate;
+ CSA::RLCSABuilder * sa;
+
+ ulong n;
+ // Total number of texts in the collection
+ unsigned numberOfTexts;
+ // Length of the longest text
+ ulong maxTextLength;
+
+#ifdef TCB_TEST_BWT
+ DynFMI *dynFMI;
+#endif
+};
+
+/**
+ * Init text collection
+ *
+ * See CSA.h for more details.
+ */
+TextCollectionBuilder::TextCollectionBuilder(unsigned samplerate)
+ : p_(new struct TCBuilderRep())
+{
+ p_->n = 0;
+ p_->samplerate = samplerate;
+ p_->numberOfTexts = 0;
+
+ // Current params: 8 bytes, 15 MB, no samples
+ p_->sa = new CSA::RLCSABuilder(8, 0, 15 * 1024 * 1024);
+ assert(p_->sa->isOk());
+
+#ifdef TCB_TEST_BWT
+ uchar temp[256];
+ for (unsigned i = 0; i < 255; ++i)
+ temp[i] = i+1;
+ temp[255] = 0;
+ p_->dynFMI = new DynFMI(temp, 1, 255, false);
+#endif
+}
+
+TextCollectionBuilder::~TextCollectionBuilder()
+{
+#ifdef TCB_TEST_BWT
+ delete p_->dynFMI;
+#endif
+
+ delete p_->sa;
+ delete p_;
+}
+
+void TextCollectionBuilder::InsertText(uchar const * text)
+{
+ TextCollection::TextPosition m = std::strlen((char *)text) + 1;
+ if (m > p_->maxTextLength)
+ p_->maxTextLength = m; // Store length of the longest text seen so far.
+
+ if (m > 1)
+ {
+ p_->n += m;
+ p_->numberOfTexts ++;
+
+#ifdef TCB_TEST_BWT
+ p_->dynFMI->addText(text, m);
+#endif
+
+ p_->sa->insertSequence((char*)text, m-1, 0);
+ assert(p_->sa->isOk());
+ }
+ else
+ {
+ // FIXME indexing empty texts
+ std::cerr << "TextCollectionBuilder::InsertText() error: can not index empty texts!" << std::endl;
+ exit(1);
+ }
+}
+
+
+TextCollection * TextCollectionBuilder::InitTextCollection()
+{
+ uchar * bwt = 0;
+ CSA::usint length = 0;
+ if (p_->numberOfTexts == 0)
+ {
+ p_->numberOfTexts ++; // Add one empty text
+ bwt = new uchar[2];
+ bwt[0] = '\0';
+ bwt[1] = '\0';
+ length = 1;
+ p_->maxTextLength = 1;
+ }
+ else
+ {
+ bwt = (uchar *)p_->sa->getBWT(length);
+ delete p_->sa;
+ p_->sa = 0;
+
+ assert(length == p_->n);
+
+#ifdef TCB_TEST_BWT
+ {
+ uchar *bwtTest = p_->dynFMI->getBWT();
+ printf("123456789012345678901234567890123456789\n");
+ for (ulong i = 0; i < p_->n && i < 100; i ++)
+ if (bwt[i] < 50)
+ printf("%d", (int)bwt[i]);
+ else
+ printf("%c", bwt[i]);
+ printf("\n");
+ for (ulong i = 0; i < p_->n && i < 100; i ++)
+ if (bwtTest[i] < 50)
+ printf("%d", (int)bwtTest[i]);
+ else
+ printf("%c", bwtTest[i]);
+ printf("\n");
+
+ // Sanity check
+ assert(p_->numberOfTexts == p_->dynFMI->getCollectionSize());
+
+ delete p_->dynFMI;
+ p_->dynFMI = 0;
+ for (ulong i = 0; i < p_->n; ++i)
+ if (bwt[i] != bwtTest[i])
+ {
+ std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " << (unsigned)bwtTest[i] << std::endl;
+ assert(0);
+ }
+ delete [] bwtTest;
+ }
+#endif // TCB_TEST_BWT
+ }
+
+ TextCollection *result = new TCImplementation(bwt, (ulong)length, p_->samplerate, p_->numberOfTexts, p_->maxTextLength);
+ return result;
+}
+
+
+} // namespace SXSI
--- /dev/null
+/******************************************************************************
+ * Copyright (C) 2009 by Niko Valimaki <nvalimak@cs.helsinki.fi> *
+ * Text collection interface for an in-memory XQuery/XPath engine *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU Lesser General Public License as published *
+ * by the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU Lesser General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU Lesser General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
+ ******************************************************************************/
+
+#ifndef _SXSI_TextCollectionBuilder_h_
+#define _SXSI_TextCollectionBuilder_h_
+
+#include "TextCollection.h"
+#include "Tools.h" // Defines ulong and uchar.
+#include <vector>
+#include <utility> // Defines std::pair.
+
+namespace SXSI
+{
+ struct TCBuilderRep; // Pimpl
+
+ /**
+ * Build an instance of the TextCollection class.
+ */
+ class TextCollectionBuilder
+ {
+ public:
+ explicit TextCollectionBuilder(unsigned);
+ ~TextCollectionBuilder();
+
+ /**
+ * Insert text
+ *
+ * Must be a zero-terminated string from alphabet [1,255].
+ * Can not be called after makeStatic().
+ * The i'th text insertion gets an identifier value i-1.
+ * In other words, document identifiers start from 0.
+ */
+ void InsertText(uchar const *);
+ /**
+ * Make static
+ *
+ * Convert to a static collection; reduces space and time complexities.
+ * New texts can not be inserted after this operation.
+ */
+ TextCollection * InitTextCollection();
+
+ private:
+ struct TCBuilderRep * p_;
+
+ // No copy constructor or assignment
+ TextCollectionBuilder();
+ TextCollectionBuilder(TextCollectionBuilder const&);
+ TextCollectionBuilder& operator = (TextCollectionBuilder const&);
+ };
+}
+#endif
BSGAP.o: BSGAP.cpp BSGAP.h Tools.h BlockArray.h
BitRank.o: BitRank.cpp BitRank.h BlockArray.h Tools.h
-CSA.o: CSA.cpp CSA.h dynFMI.h bittree.h rbtree.h Tools.h BitRank.h \
- BlockArray.h TextCollection.h RLWaveletTree.h GapEncode.h BSGAP.h
GapEncode.o: GapEncode.cpp GapEncode.h Tools.h BlockArray.h BSGAP.h
HeapProfiler.o: HeapProfiler.cpp HeapProfiler.h
RLWaveletTree.o: RLWaveletTree.cpp RLWaveletTree.h GapEncode.h Tools.h \
BlockArray.h BSGAP.h BitRank.h
-TextCollection.o: TextCollection.cpp TextCollection.h Tools.h CSA.h \
- dynFMI.h bittree.h rbtree.h BitRank.h BlockArray.h RLWaveletTree.h \
- GapEncode.h BSGAP.h
+TCImplementation.o: TCImplementation.cpp TCImplementation.h BitRank.h \
+ BlockArray.h Tools.h TextCollection.h BSGAP.h
+TextCollection.o: TextCollection.cpp TextCollection.h Tools.h \
+ TCImplementation.h BitRank.h BlockArray.h BSGAP.h
+TextCollectionBuilder.o: TextCollectionBuilder.cpp \
+ TextCollectionBuilder.h TextCollection.h Tools.h incbwt/rlcsa_builder.h \
+ incbwt/rlcsa.h incbwt/bits/vectors.h incbwt/bits/deltavector.h \
+ incbwt/bits/bitvector.h incbwt/bits/../misc/definitions.h \
+ incbwt/bits/bitbuffer.h incbwt/bits/rlevector.h incbwt/sasamples.h \
+ incbwt/misc/definitions.h incbwt/bits/bitbuffer.h \
+ incbwt/bits/deltavector.h incbwt/misc/parameters.h \
+ incbwt/misc/definitions.h TCImplementation.h BitRank.h BlockArray.h \
+ BSGAP.h dynFMI.h bittree.h rbtree.h
Tools.o: Tools.cpp Tools.h
bittree.o: bittree.cpp bittree.h rbtree.h Tools.h
dynFMI.o: dynFMI.cpp dynFMI.h bittree.h rbtree.h Tools.h
rbtree.o: rbtree.cpp rbtree.h
-testTextCollection.o: testTextCollection.cpp HeapProfiler.h \
- TextCollection.h Tools.h
+testTextCollection.o: testTextCollection.cpp TextCollectionBuilder.h \
+ TextCollection.h Tools.h HeapProfiler.h
+testTextCollection2.o: testTextCollection2.cpp TextCollection.h Tools.h \
+ HeapProfiler.h
+testTextCollection3.o: testTextCollection3.cpp TextCollection.h Tools.h
+testTextCollection4.o: testTextCollection4.cpp TCImplementation.h \
+ BitRank.h BlockArray.h Tools.h TextCollection.h BSGAP.h HeapProfiler.h
testTextCollection5.o: testTextCollection5.cpp HeapProfiler.h \
TextCollection.h Tools.h
+timeTextCollection.o: timeTextCollection.cpp TextCollection.h Tools.h
--- /dev/null
+CC = g++
+CPPFLAGS = -Wall -O3
+OBJS = rlcsa.o rlcsa_builder.o sasamples.o bits/bitvector.o bits/deltavector.o bits/rlevector.o bits/vectors.o misc/parameters.o misc/utils.o qsufsort/qsufsort.o
+
+
+all: library
+
+library: $(OBJS)
+ ar rc rlcsa.a $(OBJS)
+
+depend:
+ g++ -MM *.cpp bits/*.cpp misc/*.cpp qsufsort/*.c > dependencies.mk
+
+update:
+ cp ../rlcsa.* ../rlcsa_builder.* ../sasamples.* .
+ cp ../bits/* bits/
+ cp ../misc/* misc/
+ cp ../qsufsort/* qsufsort/
+
+clean:
+ rm -f rlcsa.a
+ rm -f $(OBJS)
+
+
+include dependencies.mk
--- /dev/null
+#ifndef BITBUFFER_H
+#define BITBUFFER_H
+
+#include <algorithm>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+
+#include "../misc/definitions.h"
+
+
+namespace CSA
+{
+
+
+template<class Data>
+class GenericBitBuffer
+{
+ public:
+ GenericBitBuffer(usint words);
+ GenericBitBuffer(usint _items, usint item_size);
+ GenericBitBuffer(std::ifstream& file, usint words);
+ GenericBitBuffer(std::ifstream& file, usint _items, usint item_size);
+ GenericBitBuffer(Data* buffer, usint words);
+ GenericBitBuffer(Data* buffer, usint _items, usint item_size);
+ ~GenericBitBuffer();
+
+ void writeBuffer(std::ofstream& file, bool erase = true);
+ void readBuffer(std::ifstream& file, usint words, bool erase = true);
+ void setBuffer(Data* buffer, usint words);
+
+ // We assume we are already using a buffer not owned by this.
+ inline void moveBuffer(Data* buffer)
+ {
+ this->data = buffer;
+ this->pos = 0;
+ this->bits = DATA_BITS;
+ this->current = 0;
+ }
+
+//--------------------------------------------------------------------------
+
+ inline void reset(bool erase)
+ {
+ this->pos = 0;
+ this->bits = DATA_BITS;
+ this->current = 0;
+ if(erase)
+ {
+ memset(this->data, 0, this->size * sizeof(Data));
+ }
+ }
+
+ inline void skipBits(usint count)
+ {
+ if(count < this->bits)
+ {
+ this->bits -= count;
+ return;
+ }
+
+ count -= this->bits;
+ this->pos += 1 + count / DATA_BITS;
+ this->bits = DATA_BITS - count % DATA_BITS;
+ }
+
+ inline void rewind(usint count)
+ {
+ this->bits += count;
+ if(this->bits > DATA_BITS)
+ {
+ usint back = (this->bits - 1) / DATA_BITS;
+ this->pos -= back;
+ this->bits -= back * DATA_BITS;
+ }
+ }
+
+//--------------------------------------------------------------------------
+
+ inline usint bitsLeft()
+ {
+ return this->bits + DATA_BITS * (this->size - this->pos - 1);
+ }
+
+ inline void writeBits(usint value, usint count)
+ {
+ while(count >= this->bits)
+ {
+ count -= this->bits;
+ this->data[this->pos] |= GET(LOWER(value, count), this->bits);
+ this->pos++; this->bits = DATA_BITS;
+ }
+ if(count > 0)
+ {
+ this->bits -= count;
+ this->data[this->pos] |= HIGHER(GET(value, count), this->bits);
+ }
+ }
+
+ // Returns nonzero if bit is 1
+ inline usint readBit()
+ {
+ this->bits--;
+ usint bit = this->data[this->pos] & ((usint)1 << this->bits);
+
+ if(this->bits == 0) { this->pos++; this->bits = DATA_BITS; }
+
+ return bit;
+ }
+
+ inline usint readBits(usint count)
+ {
+ usint value = 0;
+
+ while(count >= this->bits)
+ {
+ count -= this->bits;
+ value |= HIGHER(GET(this->data[this->pos], this->bits), count);
+ this->pos++; this->bits = DATA_BITS;
+ }
+ if(count > 0)
+ {
+ this->bits -= count;
+ value |= GET(LOWER(this->data[this->pos], this->bits), count);
+ }
+
+ return value;
+ }
+
+//--------------------------------------------------------------------------
+
+ /*
+ These operations work on fixed-size items.
+ */
+
+ inline usint getItemSize()
+ {
+ return this->item_bits;
+ }
+
+/*
+ inline void setItemSize(usint new_size)
+ {
+ if(new_size == 0) { return; }
+ this->item_bits = new_size;
+ }
+*/
+
+ inline void goToItem(usint item)
+ {
+ usint b = item * this->item_bits;
+ this->pos = b / DATA_BITS;
+ this->bits = DATA_BITS - b % DATA_BITS;
+ this->current = item;
+ }
+
+ inline usint readItem()
+ {
+ this->current++;
+ return this->readBits(this->item_bits);
+ }
+
+ inline usint readItem(usint item)
+ {
+ this->goToItem(item);
+ return this->readItem();
+ }
+
+ inline usint readFirstItem()
+ {
+ return this->readItem(0);
+ }
+
+ inline bool hasNextItem()
+ {
+ return (this->current < this->items);
+ }
+
+ inline void writeItem(usint item)
+ {
+ this->writeBits(item, this->item_bits);
+ this->current++;
+ }
+
+ inline void skipItem()
+ {
+ this->skipBits(this->item_bits);
+ this->current++;
+ }
+
+//--------------------------------------------------------------------------
+
+ /*
+ Delta coding for positive integers
+ */
+
+ inline bool canDeltaCode(usint value)
+ {
+ return this->deltaCodeLength(value) <= this->bitsLeft();
+ }
+
+ inline usint deltaCodeLength(usint value)
+ {
+ usint len = length(value);
+ usint llen = length(len);
+ return (len + llen + llen - 2);
+ }
+
+ // This version returns false if there is no space left for the encoding.
+ inline bool writeDeltaCode(usint value)
+ {
+ usint len = length(value);
+ usint llen = length(len);
+
+ if(len + llen + llen - 2 > this->bitsLeft()) { return false; }
+
+ // this->writeBits(0, llen - 1); // Now included in the next writeBits()
+ this->writeBits(len, llen + llen - 1);
+ this->writeBits(value, len - 1);
+ return true;
+ }
+
+ // This version assumes the code fits into the buffer.
+ inline void writeDeltaCodeDirect(usint value)
+ {
+ usint len = length(value);
+ usint llen = length(len);
+
+ // this->writeBits(0, llen - 1); // Now included in the next writeBits()
+ this->writeBits(len, llen + llen - 1);
+ this->writeBits(value, len - 1);
+ }
+
+ // We assume the code fits into usint:
+ // 32-bit: value < 2^24
+ // 64-bit: value < 2^54
+ inline void writeDeltaCodeFast(usint value)
+ {
+ usint len = length(value);
+
+ value ^= ((usint)1 << (len - 1));
+ this->writeBits((len << (len - 1)) | value, len + 2 * length(len) - 2);
+ }
+
+ inline usint readDeltaCode()
+ {
+ usint len = 0;
+ while(this->readBit() == 0) { len++; }
+
+ usint temp = (((usint)1 << len) | this->readBits(len)) - 1;
+ temp = ((usint)1 << temp) | this->readBits(temp);
+ return temp;
+ }
+
+//--------------------------------------------------------------------------
+
+ /*
+ Gamma coding for positive integers
+ */
+
+ inline bool canGammaCode(usint value)
+ {
+ return this->gammaCodeLength(value) <= this->bitsLeft();
+ }
+
+ inline usint gammaCodeLength(usint value)
+ {
+ return 2 * length(value) - 1;
+ }
+
+ // This version returns false if there is no space left for the encoding.
+ inline bool writeGammaCode(usint value)
+ {
+ usint len = length(value);
+
+ if(len > this->bitsLeft()) { return false; }
+
+ this->writeBits(0, len - 1);
+ this->writeBits(value, len);
+ return true;
+ }
+
+ // This version assumes the code fits into the buffer.
+ inline void writeGammaCodeDirect(usint value)
+ {
+ usint len = length(value);
+
+ this->writeBits(0, len - 1);
+ this->writeBits(value, len);
+ }
+
+ // We assume the code fits into usint:
+ // 32-bit: value < 2^16
+ // 64-bit: value < 2^32
+ inline void writeGammaCodeFast(usint value)
+ {
+ this->writeBits(value, this->gammaCodeLength(value));
+ }
+
+ inline usint readGammaCode()
+ {
+ usint len = 1;
+ while(this->readBit() == 0) { len++; }
+ return ((usint)1 << len) | this->readBits(len);
+ }
+
+//--------------------------------------------------------------------------
+
+ usint reportSize();
+
+//--------------------------------------------------------------------------
+
+ private:
+ Data* data;
+ usint size, pos, bits;
+ usint item_bits, items, current;
+ bool free_buffer;
+
+ const static usint DATA_BITS = sizeof(Data) * CHAR_BIT;
+
+ inline static usint bitsToData(usint _bits) { return (_bits + DATA_BITS - 1) / DATA_BITS; }
+};
+
+
+typedef GenericBitBuffer<uchar> BitBuffer;
+typedef GenericBitBuffer<usint> FastBitBuffer;
+
+
+//--------------------------------------------------------------------------
+
+
+template<class Data>
+GenericBitBuffer<Data>::GenericBitBuffer(usint words) :
+ size(words),
+ pos(0),
+ bits(DATA_BITS),
+ item_bits(1),
+ items(0),
+ current(0),
+ free_buffer(true)
+{
+ this->data = new Data[words];
+ memset(this->data, 0, words * sizeof(Data));
+}
+
+template<class Data>
+GenericBitBuffer<Data>::GenericBitBuffer(usint _items, usint item_size) :
+ pos(0),
+ bits(DATA_BITS),
+ item_bits(item_size),
+ items(_items),
+ current(0),
+ free_buffer(true)
+{
+ this->size = bitsToData(items * item_size);
+ this->data = new Data[this->size];
+ memset(this->data, 0, this->size * sizeof(Data));
+}
+
+template<class Data>
+GenericBitBuffer<Data>::GenericBitBuffer(std::ifstream& file, usint words) :
+ size(words),
+ pos(0),
+ bits(DATA_BITS),
+ item_bits(1),
+ items(0),
+ current(0),
+ free_buffer(true)
+{
+ this->data = new Data[words];
+ memset(this->data, 0, words * sizeof(Data));
+ file.read((char*)this->data, words * sizeof(Data));
+}
+
+template<class Data>
+GenericBitBuffer<Data>::GenericBitBuffer(std::ifstream& file, usint _items, usint item_size) :
+ size(BITS_TO_WORDS(_items * item_size)),
+ pos(0),
+ bits(DATA_BITS),
+ item_bits(item_size),
+ items(_items),
+ current(0),
+ free_buffer(true)
+{
+ this->data = new Data[this->size];
+ memset(this->data, 0, this->size * sizeof(Data));
+ file.read((char*)this->data, this->size * sizeof(Data));
+}
+
+template<class Data>
+GenericBitBuffer<Data>::GenericBitBuffer(Data* buffer, usint words) :
+ size(words),
+ pos(0),
+ bits(DATA_BITS),
+ item_bits(1),
+ items(0),
+ current(0),
+ free_buffer(false)
+{
+ this->data = buffer;
+}
+
+template<class Data>
+GenericBitBuffer<Data>::GenericBitBuffer(Data* buffer, usint _items, usint item_size) :
+ size(BITS_TO_WORDS(_items * item_size)),
+ pos(0),
+ bits(DATA_BITS),
+ item_bits(item_size),
+ items(_items),
+ current(0),
+ free_buffer(false)
+{
+ this->data = buffer;
+}
+
+template<class Data>
+GenericBitBuffer<Data>::~GenericBitBuffer()
+{
+ if(this->free_buffer)
+ {
+ delete[] this->data;
+ }
+}
+
+//--------------------------------------------------------------------------
+
+template<class Data>
+void
+GenericBitBuffer<Data>::writeBuffer(std::ofstream& file, bool erase)
+{
+ file.write((char*)this->data, this->size * sizeof(Data));
+ this->reset(erase);
+}
+
+template<class Data>
+void
+GenericBitBuffer<Data>::readBuffer(std::ifstream& file, usint words, bool erase)
+{
+ if(words > this->size || !(this->free_buffer))
+ {
+ if(this->free_buffer)
+ {
+ delete[] this->data;
+ }
+ this->size = words;
+ this->data = new Data[words];
+ this->free_buffer = true;
+ }
+
+ this->reset(erase);
+ file.read((char*)this->data, words * sizeof(Data));
+}
+
+template<class Data>
+void
+GenericBitBuffer<Data>::setBuffer(Data* buffer, usint words)
+{
+ if(this->free_buffer)
+ {
+ delete[] this->data;
+ this->free_buffer = false;
+ }
+
+ this->data = buffer;
+ this->size = words;
+ this->reset(false);
+}
+
+//--------------------------------------------------------------------------
+
+template<class Data>
+usint
+GenericBitBuffer<Data>::reportSize()
+{
+ usint bytes = sizeof(*this);
+ if(this->free_buffer) { bytes += this->size * sizeof(Data); }
+ return bytes;
+}
+
+//--------------------------------------------------------------------------
+
+
+} // namespace CSA
+
+
+#endif // BITBUFFER_H
--- /dev/null
+#include <cstdlib>
+
+#include "bitvector.h"
+
+
+namespace CSA
+{
+
+
+BitVector::BitVector(std::ifstream& file) :
+ rank_index(0), select_index(0)
+{
+ file.read((char*)&(this->size), sizeof(this->size));
+ file.read((char*)&(this->items), sizeof(this->items));
+ file.read((char*)&(this->number_of_blocks), sizeof(this->number_of_blocks));
+ file.read((char*)&(this->block_size), sizeof(this->block_size));
+
+ this->array = new usint[this->block_size * this->number_of_blocks];
+ file.read((char*)(this->array), this->block_size * this->number_of_blocks * sizeof(usint));
+ this->buffer = new FastBitBuffer(this->array, this->block_size);
+
+ this->integer_bits = length(this->size);
+ this->samples = new FastBitBuffer(file, 2 * (this->number_of_blocks + 1), this->integer_bits);
+
+ this->indexForRank();
+ this->indexForSelect();
+}
+
+BitVector::BitVector(VectorEncoder& encoder, usint universe_size) :
+ size(universe_size), items(encoder.items),
+ block_size(encoder.block_size),
+ number_of_blocks(encoder.blocks),
+ rank_index(0), select_index(0)
+{
+ this->array = new usint[this->block_size * this->number_of_blocks];
+ this->buffer = new FastBitBuffer(this->array, this->block_size);
+
+ this->integer_bits = length(this->size);
+ this->samples = new FastBitBuffer(2 * (this->number_of_blocks + 1), this->integer_bits);
+
+ // Copy & linearize the array.
+ usint pos = 0;
+ for(std::list<usint*>::iterator iter = encoder.array_blocks.begin(); iter != encoder.array_blocks.end(); iter++)
+ {
+ memcpy(this->array + pos, *iter, encoder.superblock_bytes);
+ pos += encoder.block_size * encoder.blocks_in_superblock;
+ }
+ memcpy(this->array + pos, encoder.array, encoder.current_blocks * encoder.block_size * sizeof(usint));
+
+ // Compress the samples.
+ for(std::list<usint*>::iterator iter = encoder.sample_blocks.begin(); iter != encoder.sample_blocks.end(); iter++)
+ {
+ usint* buf = *iter;
+ for(usint i = 0; i < 2 * encoder.samples_in_superblock; i++)
+ {
+ this->samples->writeItem(buf[i]);
+ }
+ }
+ for(usint i = 0; i < 2 * encoder.current_samples; i++)
+ {
+ this->samples->writeItem(encoder.samples[i]);
+ }
+ this->samples->writeItem(this->items);
+ this->samples->writeItem(this->size);
+
+ this->indexForRank();
+ this->indexForSelect();
+}
+
+BitVector::~BitVector()
+{
+ delete[] this->array;
+ delete this->buffer;
+ delete this->samples;
+ delete this->rank_index;
+ delete this->select_index;
+}
+
+void
+BitVector::writeTo(std::ofstream& file)
+{
+ file.write((char*)&(this->size), sizeof(this->size));
+ file.write((char*)&(this->items), sizeof(this->items));
+ file.write((char*)&(this->number_of_blocks), sizeof(this->number_of_blocks));
+ file.write((char*)&(this->block_size), sizeof(this->block_size));
+ file.write((char*)(this->array), this->block_size * this->number_of_blocks * sizeof(usint));
+ this->samples->writeBuffer(file, false);
+}
+
+//--------------------------------------------------------------------------
+
+usint
+BitVector::reportSize()
+{
+ usint bytes = this->buffer->reportSize();
+ bytes += this->block_size * this->number_of_blocks * sizeof(usint);
+ bytes += this->samples->reportSize();
+ bytes += this->rank_index->reportSize();
+ bytes += this->select_index->reportSize();
+ return bytes;
+}
+
+//--------------------------------------------------------------------------
+
+usint
+BitVector::sampleForIndex(usint index)
+{
+ usint low = this->select_index->readItem(index / this->select_rate);
+ usint high = this->select_index->readItem(index / this->select_rate + 1);
+
+ this->samples->goToItem(2 * low + 2);
+ for(; low < high; low++)
+ {
+ if(this->samples->readItem() > index) { return low; }
+ this->samples->skipItem();
+ }
+
+ return low;
+}
+
+usint
+BitVector::sampleForValue(usint value)
+{
+ usint low = this->rank_index->readItem(value / this->rank_rate);
+ usint high = this->rank_index->readItem(value / this->rank_rate + 1);
+
+ this->samples->goToItem(2 * low + 3);
+ for(; low < high; low++)
+ {
+ if(this->samples->readItem() > value) { return low; }
+ this->samples->skipItem();
+ }
+
+ return low;
+}
+
+//--------------------------------------------------------------------------
+
+void
+BitVector::indexForRank()
+{
+ delete this->rank_index;
+
+ usint value_samples = (this->number_of_blocks + BitVector::INDEX_RATE - 1) / BitVector::INDEX_RATE;
+ this->rank_rate = (this->size + value_samples - 1) / value_samples;
+ value_samples = (this->size + this->rank_rate - 1) / this->rank_rate + 1;
+ this->rank_index = new FastBitBuffer(value_samples, length(this->number_of_blocks - 1));
+
+ usint current = 0, pointer = 0;
+ this->samples->goToItem(2);
+ while(this->samples->hasNextItem())
+ {
+ this->samples->skipItem();
+ usint limit = this->samples->readItem();
+ while(current < limit)
+ {
+ this->rank_index->writeItem(pointer);
+ current += this->rank_rate;
+ }
+ pointer++;
+ }
+ this->rank_index->writeItem(this->number_of_blocks - 1);
+}
+
+void
+BitVector::indexForSelect()
+{
+ delete this->select_index;
+
+ usint index_samples = (this->number_of_blocks + BitVector::INDEX_RATE - 1) / BitVector::INDEX_RATE;
+ this->select_rate = (this->items + index_samples - 1) / index_samples;
+ index_samples = (this->items + this->select_rate - 1) / this->select_rate + 1;
+ this->select_index = new FastBitBuffer(index_samples, length(this->number_of_blocks - 1));
+
+ usint current = 0, pointer = 0;
+ this->samples->goToItem(2);
+ while(this->samples->hasNextItem())
+ {
+ usint limit = this->samples->readItem();
+ this->samples->skipItem();
+ while(current < limit)
+ {
+ this->select_index->writeItem(pointer);
+ current += this->select_rate;
+ }
+ pointer++;
+ }
+ this->select_index->writeItem(this->number_of_blocks - 1);
+}
+
+//--------------------------------------------------------------------------
+
+VectorEncoder::VectorEncoder(usint block_bytes, usint superblock_size) :
+ size(0), items(0), blocks(0),
+ block_size(BYTES_TO_WORDS(block_bytes)),
+ superblock_bytes(superblock_size)
+{
+ this->array = new usint[this->superblock_bytes / sizeof(usint)];
+ memset(this->array, 0, this->superblock_bytes);
+ this->blocks_in_superblock = this->superblock_bytes / (sizeof(usint) * this->block_size);
+ this->current_blocks = 0;
+
+ this->samples = new usint[this->superblock_bytes / sizeof(usint)];
+ this->samples_in_superblock = this->superblock_bytes / (2 * sizeof(usint));
+ this->current_samples = 0;
+
+ this->buffer = new FastBitBuffer(this->array, this->block_size);
+}
+
+VectorEncoder::~VectorEncoder()
+{
+ delete[] this->array;
+
+ delete this->buffer;
+ for(std::list<usint*>::iterator iter = this->array_blocks.begin(); iter != this->array_blocks.end(); iter++)
+ {
+ delete[] *iter;
+ }
+
+ delete[] this->samples;
+ for(std::list<usint*>::iterator iter = this->sample_blocks.begin(); iter != this->sample_blocks.end(); iter++)
+ {
+ delete[] *iter;
+ }
+}
+
+void
+VectorEncoder::addNewBlock()
+{
+ this->blocks++;
+ this->current_blocks++;
+ this->current_samples++;
+
+ // Do we need a new superblock for the block?
+ if(this->current_blocks > this->blocks_in_superblock)
+ {
+ this->array_blocks.push_back(this->array);
+ this->array = new usint[this->superblock_bytes / sizeof(usint)];
+ memset(this->array, 0, this->superblock_bytes);
+ this->current_blocks = 1;
+ }
+ this->buffer->moveBuffer(this->array + (this->block_size * (this->current_blocks - 1)));
+
+ // Do we need a new superblock for the sample?
+ if(this->current_samples > this->samples_in_superblock)
+ {
+ this->sample_blocks.push_back(this->samples);
+ this->samples = new usint[this->superblock_bytes / sizeof(usint)];
+ this->current_samples = 1;
+ }
+ this->samples[2 * this->current_samples - 2] = this->items - 1;
+ this->samples[2 * this->current_samples - 1] = this->size - 1;
+}
+
+void
+VectorEncoder::setFirstBit(usint value)
+{
+ this->samples[0] = 0;
+ this->samples[1] = value;
+
+ this->size = value + 1;
+ this->items = 1;
+ this->blocks = 1;
+
+ this->current_blocks = 1;
+ this->current_samples = 1;
+}
+
+
+} // namespace CSA
--- /dev/null
+#ifndef BITVECTOR_H
+#define BITVECTOR_H
+
+#include <fstream>
+#include <list>
+
+#include "../misc/definitions.h"
+#include "bitbuffer.h"
+
+
+namespace CSA
+{
+
+
+/*
+ This class provides the core functionality for encoding a bit vector.
+*/
+
+class VectorEncoder
+{
+ public:
+ const static usint SUPERBLOCK_SIZE = MEGABYTE;
+
+ // We assume superblock size is divisible by block and sample size.
+ VectorEncoder(usint block_bytes, usint superblock_size = SUPERBLOCK_SIZE);
+ ~VectorEncoder();
+
+/*
+ This should be implemented in any inherited class.
+
+ void setBit(usint value); // Values must be in increasing order.
+*/
+
+ void addNewBlock();
+ void setFirstBit(usint value);
+
+ usint size, items, blocks;
+ usint block_size, superblock_bytes;
+
+ FastBitBuffer* buffer;
+
+ std::list<usint*> array_blocks;
+ usint* array;
+ usint blocks_in_superblock, current_blocks;
+
+ std::list<usint*> sample_blocks;
+ usint* samples;
+ usint samples_in_superblock, current_samples;
+};
+
+
+/*
+ This class provides the core functionality for a bit vector.
+*/
+
+class BitVector
+{
+ public:
+ const static usint INDEX_RATE = 5;
+
+ BitVector(std::ifstream& file);
+ BitVector(VectorEncoder& encoder, usint universe_size);
+ ~BitVector();
+
+ void writeTo(std::ofstream& file);
+
+//--------------------------------------------------------------------------
+
+/*
+ These should be implemented in any actual bit vector.
+
+ // rank is not iterator-safe
+ // regular: \sum_{i = 0}^{value} V[i]
+ // at_least: \sum_{i = 0}^{value - 1} V[i] + 1
+ usint rank(usint value, bool at_least = false);
+
+ usint select(usint index); // \min value: \sum_{i = 0}^{value} V[i] = index + 1
+ usint selectNext();
+
+ pair_type valueAfter(usint value); // (\min i >= value: V[i] = 1, rank(i) - 1)
+ pair_type nextValue();
+
+ // These versions of select return (value, length_of_run).
+ // max_length is an upper bound for the length of the run returned.
+ // V[value] is not included in the length of the run
+ // These functions are not greedy: the actual length of the run can be more than reported.
+ // This can happen even if max_length was not reached.
+ // length_of_run is actually the number of extra items returned past value
+
+ pair_type selectRun(usint index, usint max_length);
+ pair_type selectNextRun(usint max_length);
+
+ // isSet is not iterator-safe
+ bool isSet(usint value); // V[value]
+*/
+
+ inline bool hasNext()
+ {
+ return (this->sample.first + cur < this->items - 1);
+ }
+
+//--------------------------------------------------------------------------
+
+ inline usint getSize() { return this->size; }
+ inline usint getNumberOfItems() { return this->items; }
+ inline usint getBlockSize() { return this->block_size; }
+
+ // This returns only the sizes of dynamically allocated structures.
+ usint reportSize();
+
+ protected:
+ usint size, items;
+
+ FastBitBuffer* buffer;
+ usint* array;
+ usint block_size;
+ usint number_of_blocks;
+
+ /*
+ Each sample is (rank(i) - 1, i) where V[i] = 1.
+ Number of samples is number_of_blocks + 1.
+ */
+ FastBitBuffer* samples;
+ usint integer_bits;
+
+ FastBitBuffer* rank_index;
+ usint rank_rate;
+
+ FastBitBuffer* select_index;
+ usint select_rate;
+
+ // Iterator data. These are enough for a gap-encoded vector.
+ usint block;
+ pair_type sample;
+ usint cur, val; // cur == 0 is the sample
+ usint block_items;
+
+ /*
+ These functions return the sample corresponding to the block the given
+ index/value might be found in. Parameters are assumed to be valid.
+ */
+ usint sampleForIndex(usint index);
+ usint sampleForValue(usint value);
+
+ inline void getSample(usint sample_number)
+ {
+ this->block = sample_number;
+ this->samples->goToItem(2 * sample_number);
+ this->sample.first = this->samples->readItem();
+ this->sample.second = this->samples->readItem();
+ this->cur = 0;
+ this->val = this->sample.second;
+ this->block_items = this->samples->readItem() - this->sample.first - 1;
+ this->buffer->moveBuffer(this->array + (this->block * this->block_size));
+ }
+
+ /*
+ These functions build a higher level index for faster rank/select queries.
+ The index consists of about (number of samples) / INDEX_RATE pointers.
+ The bit vector cannot be used without the index.
+ */
+ void indexForRank();
+ void indexForSelect();
+};
+
+
+} // namespace CSA
+
+
+#endif // BITVECTOR_H
--- /dev/null
+#include <cstdlib>
+
+#include "deltavector.h"
+
+
+namespace CSA
+{
+
+
+DeltaVector::DeltaVector(std::ifstream& file) :
+ BitVector(file)
+{
+}
+
+DeltaVector::DeltaVector(DeltaEncoder& encoder, usint universe_size) :
+ BitVector(encoder, universe_size)
+{
+}
+
+DeltaVector::~DeltaVector()
+{
+}
+
+//--------------------------------------------------------------------------
+
+usint
+DeltaVector::rank(usint value, bool at_least)
+{
+ if(value >= this->size) { return this->items; }
+ this->getSample(this->sampleForValue(value));
+
+ while(this->cur < this->block_items && this->val < value)
+ {
+ this->val += this->buffer->readDeltaCode();
+ this->cur++;
+ }
+
+ usint idx = this->sample.first + this->cur + 1;
+ if(!at_least && this->val > value) { idx--; }
+ if(at_least && this->val < value) { this->getSample(this->block + 1); }
+ return idx;
+}
+
+usint
+DeltaVector::select(usint index)
+{
+ if(index >= this->items) { return this->size; }
+ this->getSample(this->sampleForIndex(index));
+
+ usint lim = index - this->sample.first;
+ for(; this->cur < lim; this->cur++)
+ {
+ this->val += this->buffer->readDeltaCode();
+ }
+
+ return this->val;
+}
+
+usint
+DeltaVector::selectNext()
+{
+ if(this->cur >= this->block_items)
+ {
+ this->getSample(this->block + 1);
+ return this->val;
+ }
+
+ this->cur++;
+ this->val += this->buffer->readDeltaCode();
+ return this->val;
+}
+
+pair_type
+DeltaVector::valueAfter(usint value)
+{
+ if(value >= this->size) { return pair_type(this->size, this->items); }
+ this->getSample(this->sampleForValue(value));
+
+ while(this->cur < this->block_items && this->val < value)
+ {
+ this->val += this->buffer->readDeltaCode();
+ this->cur++;
+ }
+ if(this->val < value)
+ {
+ this->getSample(this->block + 1);
+ }
+
+ return pair_type(this->val, this->sample.first + this->cur);
+}
+
+pair_type
+DeltaVector::nextValue()
+{
+ if(this->cur >= this->block_items)
+ {
+ this->getSample(this->block + 1);
+ return pair_type(this->val, this->sample.first);
+ }
+
+ this->cur++;
+ this->val += this->buffer->readDeltaCode();
+ return pair_type(this->val, this->sample.first + this->cur);
+}
+
+pair_type
+DeltaVector::selectRun(usint index, usint max_length)
+{
+ return pair_type(this->select(index), 0);
+}
+
+pair_type
+DeltaVector::selectNextRun(usint max_length)
+{
+ return pair_type(this->selectNext(), 0);
+}
+
+bool
+DeltaVector::isSet(usint value)
+{
+ if(value >= this->size) { return false; }
+ this->getSample(this->sampleForValue(value));
+
+ while(this->cur < this->block_items && this->val < value)
+ {
+ this->val += this->buffer->readDeltaCode();
+ this->cur++;
+ }
+
+ return (this->val == value);
+}
+
+//--------------------------------------------------------------------------
+
+usint
+DeltaVector::reportSize()
+{
+ usint bytes = sizeof(*this);
+ bytes += BitVector::reportSize();
+ return bytes;
+}
+
+//--------------------------------------------------------------------------
+
+DeltaEncoder::DeltaEncoder(usint block_bytes, usint superblock_size) :
+ VectorEncoder(block_bytes, superblock_size)
+{
+}
+
+DeltaEncoder::~DeltaEncoder()
+{
+}
+
+void
+DeltaEncoder::setBit(usint value)
+{
+ if(this->items == 0)
+ {
+ this->setFirstBit(value);
+ return;
+ }
+ if(value < this->size) { return; }
+
+ usint diff = value + 1 - this->size;
+ this->size = value + 1;
+ this->items++;
+ if(this->buffer->writeDeltaCode(diff)) { return; }
+
+ // Didn't fit into the block. A new sample & block required.
+ this->addNewBlock();
+}
+
+
+} // namespace CSA
--- /dev/null
+#ifndef DELTAVECTOR_H
+#define DELTAVECTOR_H
+
+#include <fstream>
+
+#include "bitvector.h"
+
+
+namespace CSA
+{
+
+
+/*
+ This class is used to construct a DeltaVector.
+*/
+
+class DeltaEncoder : public VectorEncoder
+{
+ public:
+ DeltaEncoder(usint block_bytes, usint superblock_size = VectorEncoder::SUPERBLOCK_SIZE);
+ ~DeltaEncoder();
+
+ void setBit(usint value);
+};
+
+
+/*
+ This is a gap-encoded bit vector using delta coding.
+*/
+
+class DeltaVector : public BitVector
+{
+ public:
+ DeltaVector(std::ifstream& file);
+ DeltaVector(DeltaEncoder& encoder, usint universe_size);
+ ~DeltaVector();
+
+//--------------------------------------------------------------------------
+
+ usint rank(usint value, bool at_least = false);
+
+ usint select(usint index);
+ usint selectNext();
+
+ pair_type valueAfter(usint value);
+ pair_type nextValue();
+
+ pair_type selectRun(usint index, usint max_length);
+ pair_type selectNextRun(usint max_length);
+
+ bool isSet(usint value);
+
+//--------------------------------------------------------------------------
+
+ usint reportSize();
+};
+
+
+} // namespace CSA
+
+
+#endif // DELTAVECTOR_H
--- /dev/null
+#include <cstdlib>
+
+#include "rlevector.h"
+#include "../misc/utils.h"
+
+
+namespace CSA
+{
+
+
+RLEVector::RLEVector(std::ifstream& file) :
+ BitVector(file)
+{
+}
+
+RLEVector::RLEVector(RLEEncoder& encoder, usint universe_size) :
+ BitVector(encoder, universe_size)
+{
+}
+
+RLEVector::~RLEVector()
+{
+}
+
+//--------------------------------------------------------------------------
+
+usint
+RLEVector::rank(usint value, bool at_least)
+{
+ if(value >= this->size) { return this->items; }
+
+ this->valueLoop(value);
+
+ usint idx = this->sample.first + this->cur + 1;
+ if(!at_least && this->val > value)
+ {
+ idx--;
+ }
+ if(at_least && this->val < value)
+ {
+ this->getSample(this->block + 1);
+ this->run = 0;
+ idx = this->sample.first + this->cur + 1;
+ }
+ return idx;
+}
+
+usint
+RLEVector::select(usint index)
+{
+ if(index >= this->items) { return this->size; }
+ this->getSample(this->sampleForIndex(index));
+ this->run = 0;
+
+ usint lim = index - this->sample.first;
+ while(this->cur < lim)
+ {
+ this->val += this->buffer->readDeltaCode();
+ usint temp = this->buffer->readDeltaCode();
+ this->val += temp - 1;
+ this->cur += temp;
+ }
+ if(this->cur > lim)
+ {
+ this->run = this->cur - lim;
+ this->cur -= this->run;
+ this->val -= this->run;
+ }
+
+ return this->val;
+}
+
+usint
+RLEVector::selectNext()
+{
+ if(this->cur >= this->block_items)
+ {
+ this->getSample(this->block + 1);
+ this->run = 0;
+ return this->val;
+ }
+
+ this->cur++;
+ if(this->run > 0)
+ {
+ this->val++;
+ this->run--;
+ }
+ else
+ {
+ this->val += this->buffer->readDeltaCode();
+ this->run = this->buffer->readDeltaCode() - 1;
+ }
+
+ return this->val;
+}
+
+pair_type
+RLEVector::valueAfter(usint value)
+{
+ if(value >= this->size) { return pair_type(this->size, this->items); }
+
+ this->valueLoop(value);
+
+ if(this->val < value)
+ {
+ this->getSample(this->block + 1);
+ this->run = 0;
+ }
+
+ return pair_type(this->val, this->sample.first + this->cur);
+}
+
+pair_type
+RLEVector::nextValue()
+{
+ if(this->cur >= this->block_items)
+ {
+ this->getSample(this->block + 1);
+ this->run = 0;
+ return pair_type(this->val, this->sample.first);
+ }
+
+ this->cur++;
+ if(this->run > 0)
+ {
+ this->val++;
+ this->run--;
+ }
+ else
+ {
+ this->val += this->buffer->readDeltaCode();
+ this->run = this->buffer->readDeltaCode() - 1;
+ }
+
+ return pair_type(this->val, this->sample.first + this->cur);
+}
+
+pair_type
+RLEVector::selectRun(usint index, usint max_length)
+{
+ usint value = this->select(index);
+
+ usint len = std::min(max_length, this->run);
+ this->run -= len; this->cur += len; this->val += len;
+
+ return pair_type(value, len);
+}
+
+pair_type
+RLEVector::selectNextRun(usint max_length)
+{
+ usint value = this->selectNext();
+
+ usint len = std::min(max_length, this->run);
+ this->run -= len; this->cur += len; this->val += len;
+
+ return pair_type(value, len);
+}
+
+bool
+RLEVector::isSet(usint value)
+{
+ if(value >= this->size) { return false; }
+
+ this->valueLoop(value);
+
+ return (this->val == value);
+}
+
+//--------------------------------------------------------------------------
+
+usint
+RLEVector::reportSize()
+{
+ usint bytes = sizeof(*this);
+ bytes += BitVector::reportSize();
+ return bytes;
+}
+
+//--------------------------------------------------------------------------
+
+RLEEncoder::RLEEncoder(usint block_bytes, usint superblock_size) :
+ VectorEncoder(block_bytes, superblock_size)
+{
+}
+
+RLEEncoder::~RLEEncoder()
+{
+}
+
+void
+RLEEncoder::setRun(usint start, usint len)
+{
+ if(this->items == 0)
+ {
+ this->setFirstBit(start);
+ if(len > 1)
+ {
+ this->RLEncode(1, len - 1);
+ }
+ return;
+ }
+ if(start < this->size || len == 0) { return; }
+
+ // Write as much into the buffer as possible.
+ usint diff = start + 1 - this->size;
+ usint free_bits = this->buffer->bitsLeft();
+ usint code_bits = this->buffer->deltaCodeLength(diff);
+ if(free_bits > code_bits) // At least a part of the run fits into the block.
+ {
+ free_bits -= code_bits;
+ usint run_bits = this->buffer->deltaCodeLength(len);
+ if(run_bits <= free_bits)
+ {
+ this->RLEncode(diff, len);
+ return;
+ }
+
+ // Encode as much as possible and let the rest spill.
+ usint llen = 1;
+ while(llen + 2 * length(llen + 1) - 1 <= free_bits) { llen++; }
+ llen = ((usint)1 << llen) - 1;
+
+ this->RLEncode(diff, llen);
+ len -= llen;
+
+ // A new sample will be added.
+ this->size++;
+ this->items++;
+ }
+ else
+ {
+ this->size = start + 1;
+ this->items++;
+ }
+
+ // Didn't fit into the block. A new sample & block required.
+ this->addNewBlock();
+ if(len > 1)
+ {
+ this->RLEncode(1, len - 1);
+ }
+}
+
+
+} // namespace CSA
--- /dev/null
+#ifndef RLEVECTOR_H
+#define RLEVECTOR_H
+
+#include <fstream>
+
+#include "bitvector.h"
+
+
+namespace CSA
+{
+
+
+/*
+ This class is used to construct a RLEVector.
+*/
+
+class RLEEncoder : public VectorEncoder
+{
+ public:
+ RLEEncoder(usint block_bytes, usint superblock_size = VectorEncoder::SUPERBLOCK_SIZE);
+ ~RLEEncoder();
+
+// void setBit(usint value);
+ void setRun(usint start, usint len);
+
+ inline void RLEncode(usint diff, usint len)
+ {
+ this->size += diff + len - 1;
+ this->items += len;
+ this->buffer->writeDeltaCode(diff);
+ this->buffer->writeDeltaCode(len);
+ }
+};
+
+
+/*
+ This is a run-length encoded bit vector using delta coding.
+*/
+
+class RLEVector : public BitVector
+{
+ public:
+ RLEVector(std::ifstream& file);
+ RLEVector(RLEEncoder& encoder, usint universe_size);
+ ~RLEVector();
+
+//--------------------------------------------------------------------------
+
+ usint rank(usint value, bool at_least = false);
+
+ usint select(usint index);
+ usint selectNext();
+
+ pair_type valueAfter(usint value);
+ pair_type nextValue();
+
+ pair_type selectRun(usint index, usint max_length);
+ pair_type selectNextRun(usint max_length);
+
+ bool isSet(usint value);
+
+//--------------------------------------------------------------------------
+
+ usint reportSize();
+
+ protected:
+ usint run;
+
+ inline void valueLoop(usint value)
+ {
+ this->getSample(this->sampleForValue(value));
+ this->run = 0;
+
+ if(this->val >= value) { return; }
+ while(this->cur < this->block_items)
+ {
+ this->val += this->buffer->readDeltaCode();
+ this->cur++;
+ this->run = this->buffer->readDeltaCode() - 1;
+ if(this->val >= value) { break; }
+
+ this->cur += this->run;
+ this->val += this->run;
+ if(this->val >= value)
+ {
+ this->run = this->val - value;
+ this->val = value;
+ this->cur -= this->run;
+ break;
+ }
+ this->run = 0;
+ }
+ }
+};
+
+
+} // namespace CSA
+
+
+#endif // RLEVECTOR_H
--- /dev/null
+#include <algorithm>
+#include <cstdlib>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+
+#include "vectors.h"
+#include "../misc/utils.h"
+
+
+namespace CSA
+{
+
+
+inline void
+handleOne(RLEEncoder& encoder, pair_type& run, usint position)
+{
+ if(run.second == 0)
+ {
+ run.first = position;
+ run.second = 1;
+ return;
+ }
+ if(position == run.first + run.second)
+ {
+ run.second++;
+ return;
+ }
+ encoder.setRun(run.first, run.second);
+ run.first = position;
+ run.second = 1;
+}
+
+inline void
+handleRun(RLEEncoder& encoder, pair_type& run, pair_type& next, usint limit)
+{
+ if(run.second == 0)
+ {
+ run.first = next.first;
+ run.second = std::min(limit - run.first, next.second);
+ next.first += run.second;
+ next.second -= run.second;
+ return;
+ }
+
+ if(next.first == run.first + run.second)
+ {
+ usint cont = std::min(limit - next.first, next.second);
+ run.second += cont;
+ next.first += cont;
+ next.second -= cont;
+ return;
+ }
+
+ encoder.setRun(run.first, run.second);
+ run.first = next.first;
+ run.second = std::min(limit - run.first, next.second);;
+ next.first += run.second;
+ next.second -= run.second;
+}
+
+RLEVector*
+mergeVectors(RLEVector* first, RLEVector* second, usint* positions, usint n, usint size, usint block_size)
+{
+ if((first == 0 && second == 0) || positions == 0) { return 0; }
+
+ pair_type first_run;
+ bool first_finished;
+ if(first == 0)
+ {
+ first_run = pair_type(size, 0);
+ first_finished = true;
+ }
+ else
+ {
+ first_run = first->selectRun(0, size);
+ first_run.second++;
+ first_finished = false;
+ }
+
+ usint second_bit;
+ if(second == 0)
+ {
+ second_bit = n;
+ }
+ else
+ {
+ second_bit = second->select(0);
+ }
+
+ RLEEncoder encoder(block_size);
+ pair_type run = pair_type(size, 0);
+ for(usint i = 0; i < n; i++, first_run.first++)
+ {
+ while(!first_finished && first_run.first < positions[i])
+ {
+ handleRun(encoder, run, first_run, positions[i]);
+ if(first_run.second == 0)
+ {
+ if(first->hasNext())
+ {
+ first_run = first->selectNextRun(size);
+ first_run.first += i;
+ first_run.second++;
+ }
+ else
+ {
+ first_finished = true;
+ }
+ }
+ }
+
+ if(i == second_bit) // positions[i] is one
+ {
+ handleOne(encoder, run, positions[i]);
+ second_bit = second->selectNext();
+ }
+ else // positions[i] is zero
+ {
+ if(run.second != 0)
+ {
+ encoder.setRun(run.first, run.second);
+ run.second = 0;
+ }
+ }
+ }
+
+ while(!first_finished)
+ {
+ handleRun(encoder, run, first_run, size);
+ if(first->hasNext())
+ {
+ first_run = first->selectNextRun(size);
+ first_run.first += n;
+ first_run.second++;
+ }
+ else { break; }
+ }
+
+ if(run.second != 0)
+ {
+ encoder.setRun(run.first, run.second);
+ }
+
+ delete first; delete second;
+ return new RLEVector(encoder, size);
+}
+
+//--------------------------------------------------------------------------
+
+DeltaVector*
+mergeVectors(DeltaVector* first, DeltaVector* second, usint* positions, usint n, usint size, usint block_size)
+{
+ if((first == 0 && second == 0) || positions == 0) { return 0; }
+
+ usint first_bit;
+ bool first_finished;
+ if(first == 0)
+ {
+ first_bit = 0;
+ first_finished = true;
+ }
+ else
+ {
+ first_bit = first->select(0);
+ first_finished = false;
+ }
+
+ usint second_bit;
+ if(second == 0)
+ {
+ second_bit = n;
+ }
+ else
+ {
+ second_bit = second->select(0);
+ }
+
+ DeltaEncoder encoder(block_size);
+ for(usint i = 0; i < n; i++, first_bit++)
+ {
+ while(!first_finished && first_bit < positions[i])
+ {
+ encoder.setBit(first_bit);
+ if(first->hasNext())
+ {
+ first_bit = first->selectNext() + i;
+ }
+ else
+ {
+ first_finished = true;
+ }
+ }
+
+ if(i == second_bit) // positions[i] is one
+ {
+ encoder.setBit(positions[i]);
+ second_bit = second->selectNext();
+ }
+ }
+
+ while(!first_finished)
+ {
+ encoder.setBit(first_bit);
+ if(!first->hasNext()) { break; }
+ first_bit = first->selectNext() + n;
+ }
+
+ delete first; delete second;
+ return new DeltaVector(encoder, size);
+}
+
+
+} // namespace CSA
--- /dev/null
+#ifndef VECTORS_H
+#define VECTORS_H
+
+#include "deltavector.h"
+#include "rlevector.h"
+
+
+namespace CSA
+{
+
+
+/*
+ These functions merge two vectors using marked positions.
+ The original vectors are deleted.
+*/
+
+RLEVector* mergeVectors(RLEVector* first, RLEVector* second, usint* positions, usint n, usint size, usint block_size);
+
+DeltaVector* mergeVectors(DeltaVector* first, DeltaVector* second, usint* positions, usint n, usint size, usint block_size);
+
+
+} // namespace CSA
+
+
+#endif // VECTORS_H
--- /dev/null
+rlcsa.o: rlcsa.cpp rlcsa.h bits/vectors.h bits/deltavector.h \
+ bits/bitvector.h bits/../misc/definitions.h bits/bitbuffer.h \
+ bits/rlevector.h sasamples.h misc/definitions.h bits/bitbuffer.h \
+ bits/deltavector.h misc/parameters.h misc/definitions.h misc/utils.h \
+ qsufsort/qsufsort.h qsufsort/../misc/definitions.h
+rlcsa_builder.o: rlcsa_builder.cpp rlcsa_builder.h rlcsa.h bits/vectors.h \
+ bits/deltavector.h bits/bitvector.h bits/../misc/definitions.h \
+ bits/bitbuffer.h bits/rlevector.h sasamples.h misc/definitions.h \
+ bits/bitbuffer.h bits/deltavector.h misc/parameters.h \
+ misc/definitions.h
+sasamples.o: sasamples.cpp sasamples.h misc/definitions.h \
+ bits/bitbuffer.h bits/../misc/definitions.h bits/deltavector.h \
+ bits/bitvector.h bits/bitbuffer.h misc/utils.h misc/definitions.h
+bitvector.o: bits/bitvector.cpp bits/bitvector.h \
+ bits/../misc/definitions.h bits/bitbuffer.h
+deltavector.o: bits/deltavector.cpp bits/deltavector.h bits/bitvector.h \
+ bits/../misc/definitions.h bits/bitbuffer.h
+rlevector.o: bits/rlevector.cpp bits/rlevector.h bits/bitvector.h \
+ bits/../misc/definitions.h bits/bitbuffer.h bits/../misc/utils.h \
+ bits/../misc/definitions.h
+vectors.o: bits/vectors.cpp bits/vectors.h bits/deltavector.h \
+ bits/bitvector.h bits/../misc/definitions.h bits/bitbuffer.h \
+ bits/rlevector.h bits/../misc/utils.h bits/../misc/definitions.h
+parameters.o: misc/parameters.cpp misc/parameters.h misc/definitions.h
+utils.o: misc/utils.cpp misc/utils.h misc/definitions.h
+qsufsort.o: qsufsort/qsufsort.c qsufsort/qsufsort.h \
+ qsufsort/../misc/definitions.h
--- /dev/null
+#ifndef DEFINITIONS_H
+#define DEFINITIONS_H
+
+#include <algorithm>
+#include <climits>
+
+
+namespace CSA
+{
+
+
+#ifdef MASSIVE_DATA_RLCSA
+typedef unsigned long usint;
+typedef signed long sint;
+#else
+typedef unsigned int usint;
+typedef signed int sint;
+#endif
+
+
+typedef unsigned char uchar;
+typedef std::pair<usint, usint> pair_type;
+
+
+inline usint length(usint n)
+{
+ usint b = 0;
+ while(n > 0) { b++; n >>= 1; }
+ return b;
+}
+
+inline bool isEmpty(const pair_type& data)
+{
+ return (data.first > data.second);
+}
+
+inline usint length(const pair_type& data)
+{
+ return data.second + 1 - data.first;
+}
+
+
+const usint CHARS = ((usint)1 << CHAR_BIT);
+const usint MEGABYTE = 1048576;
+const usint MILLION = 1000000;
+const usint WORD_BITS = CHAR_BIT * sizeof(usint);
+const usint WORD_MAX = ~((usint)0);
+
+const pair_type EMPTY_PAIR = pair_type(1, 0);
+
+
+// Previous GET was broken when BITS == WORD_BITS
+// Current version works for usints and less
+//#define GET(FIELD, BITS) ((FIELD) & ((1 << (BITS)) - 1))
+#define GET(FIELD, BITS) ((FIELD) & (WORD_MAX >> (WORD_BITS - (BITS))))
+#define LOWER(FIELD, N) ((FIELD) >> (N))
+#define HIGHER(FIELD, N) ((FIELD) << (N))
+
+#define BITS_TO_BYTES(BITS) (((BITS) + CHAR_BIT - 1) / CHAR_BIT)
+#define BYTES_TO_WORDS(BYTES) (((BYTES) + sizeof(usint) - 1) / sizeof(usint))
+#define BITS_TO_WORDS(BITS) (((BITS) + WORD_BITS - 1) / WORD_BITS)
+
+
+} // namespace CSA
+
+
+#endif
--- /dev/null
+#include <cstdlib>
+#include <iostream>
+#include <map>
+
+#include "parameters.h"
+
+
+namespace CSA
+{
+
+
+Parameters::Parameters()
+{
+}
+
+Parameters::~Parameters()
+{
+}
+
+bool
+Parameters::contains(const std::string& key)
+{
+ return (this->parameters.find(key) != this->parameters.end());
+}
+
+usint
+Parameters::get(const std::string& key)
+{
+ std::map<std::string, usint>::iterator iter = this->parameters.find(key);
+ if(iter == this->parameters.end()) { return 0; }
+ return iter->second;
+}
+
+usint
+Parameters::get(const parameter_type& param)
+{
+ return this->get(param.first);
+}
+
+void
+Parameters::set(const std::string& key, usint value)
+{
+ this->parameters[key] = value;
+}
+
+void
+Parameters::set(const parameter_type& param)
+{
+ this->set(param.first, param.second);
+}
+
+void
+Parameters::read(std::ifstream& file)
+{
+ while(file)
+ {
+ std::string key;
+ std::string c;
+ usint value;
+
+ file >> key >> c >> value;
+ if(c == "=") { this->parameters[key] = value; }
+ }
+}
+
+void Parameters::read(const std::string& file_name)
+{
+ std::ifstream file(file_name.c_str(), std::ios_base::binary);
+ if(!file)
+ {
+ std::cerr << "Cannot open parameter file " << file_name << " for reading!" << std::endl;
+ return;
+ }
+ this->read(file);
+ file.close();
+}
+
+void
+Parameters::print()
+{
+ this->write(std::cout);
+ std::cout << std::endl;
+}
+
+void
+Parameters::write(std::ostream& stream)
+{
+ for(std::map<std::string, usint>::iterator iter = this->parameters.begin(); iter != this->parameters.end(); iter++)
+ {
+ stream << iter->first << " = " << iter->second << std::endl;
+ }
+}
+
+void
+Parameters::write(const std::string& file_name)
+{
+ std::ofstream file(file_name.c_str(), std::ios_base::binary);
+ if(!file)
+ {
+ std::cerr << "Cannot open parameter file " << file_name << " for writing!" << std::endl;
+ return;
+ }
+ this->write(file);
+ file.close();
+}
+
+
+} // namespace CSA
--- /dev/null
+#ifndef PARAMETERS_H
+#define PARAMETERS_H
+
+#include <fstream>
+#include <map>
+
+#include "definitions.h"
+
+
+namespace CSA
+{
+
+
+typedef std::pair<std::string, usint> parameter_type;
+
+
+class Parameters
+{
+ public:
+ Parameters();
+ ~Parameters();
+
+ bool contains(const std::string& key);
+ usint get(const std::string& key);
+ usint get(const parameter_type& param);
+ void set(const std::string& key, usint value);
+ void set(const parameter_type& param);
+
+ void read(std::ifstream& file);
+ void read(const std::string& file_name);
+ void print();
+ void write(std::ostream& stream);
+ void write(const std::string& file_name);
+
+ private:
+ std::map<std::string, usint> parameters;
+};
+
+
+} // namespace CSA
+
+
+#endif
--- /dev/null
+#include "utils.h"
+
+
+namespace CSA
+{
+
+
+std::streamoff
+fileSize(std::ifstream& file)
+{
+ std::streamoff curr = file.tellg();
+
+ file.seekg(0, std::ios::end);
+ std::streamoff size = file.tellg();
+ file.seekg(0, std::ios::beg);
+ size -= file.tellg();
+
+ file.seekg(curr, std::ios::beg);
+ return size;
+}
+
+std::streamoff
+fileSize(std::ofstream& file)
+{
+ std::streamoff curr = file.tellp();
+
+ file.seekp(0, std::ios::end);
+ std::streamoff size = file.tellp();
+ file.seekp(0, std::ios::beg);
+ size -= file.tellp();
+
+ file.seekp(curr, std::ios::beg);
+ return size;
+}
+
+std::ostream&
+operator<<(std::ostream& stream, pair_type data)
+{
+ return stream << "(" << data.first << ", " << data.second << ")";
+}
+
+void
+readRows(std::ifstream& file, std::list<std::string>& rows, bool skipEmptyRows)
+{
+ while(file)
+ {
+ std::string buf;
+ std::getline(file, buf);
+ if(skipEmptyRows && buf.length() == 0) { continue; }
+ rows.push_back(buf);
+ }
+}
+
+
+} // namespace CSA
--- /dev/null
+#ifndef UTILS_H
+#define UTILS_H
+
+#include <fstream>
+#include <list>
+
+#include "definitions.h"
+
+
+namespace CSA
+{
+
+
+std::streamoff fileSize(std::ifstream& file);
+std::streamoff fileSize(std::ofstream& file);
+
+
+std::ostream& operator<<(std::ostream& stream, pair_type data);
+
+
+void readRows(std::ifstream& file, std::list<std::string>& rows, bool skipEmptyRows);
+
+
+} // namespace CSA
+
+
+#endif // UTILS_H
--- /dev/null
+/* qsufsort.c
+ Copyright 1999, N. Jesper Larsson, all rights reserved.
+
+ This file contains an implementation of the algorithm presented in "Faster
+ Suffix Sorting" by N. Jesper Larsson (jesper@@cs.lth.se) and Kunihiko
+ Sadakane (sada@@is.s.u-tokyo.ac.jp).
+
+ This software may be used freely for any purpose. However, when distributed,
+ the original source must be clearly stated, and, when the source code is
+ distributed, the copyright notice must be retained and any alterations in
+ the code must be clearly marked. No warranty is given regarding the quality
+ of this software.*/
+
+/*
+ Replaced int -> sint for massive data support and moved into namespace CSA.
+
+ - Jouni Siren 2009-03-16
+*/
+
+#include <climits>
+#include "qsufsort.h"
+
+
+namespace CSA
+{
+
+
+static sint *I, /* group array, ultimately suffix array.*/
+ *V, /* inverse array, ultimately inverse of I.*/
+ r, /* number of symbols aggregated by transform.*/
+ h; /* length of already-sorted prefixes.*/
+
+#define KEY(p) (V[*(p)+(h)])
+#define SWAP(p, q) (tmp=*(p), *(p)=*(q), *(q)=tmp)
+#define MED3(a, b, c) (KEY(a)<KEY(b) ? (KEY(b)<KEY(c) ? b : KEY(a)<KEY(c) ? c : (a)) : (KEY(b)>KEY(c) ? b : KEY(a)>KEY(c) ? c : (a)))
+
+/* Subroutine for select_sort_split and sort_split. Sets group numbers for a
+ group whose lowest position in I is pl and highest position is pm.*/
+
+static void update_group(sint *pl, sint *pm)
+{
+ sint g;
+
+ g=pm-I; /* group number.*/
+ V[*pl]=g; /* update group number of first position.*/
+ if (pl==pm)
+ *pl=-1; /* one element, sorted group.*/
+ else
+ do /* more than one element, unsorted group.*/
+ V[*++pl]=g; /* update group numbers.*/
+ while (pl<pm);
+}
+
+/* Quadratic sorting method to use for small subarrays. To be able to update
+ group numbers consistently, a variant of selection sorting is used.*/
+
+static void select_sort_split(sint *p, sint n) {
+ sint *pa, *pb, *pi, *pn;
+ sint f, v, tmp;
+
+ pa=p; /* pa is start of group being picked out.*/
+ pn=p+n-1; /* pn is last position of subarray.*/
+ while (pa<pn) {
+ for (pi=pb=pa+1, f=KEY(pa); pi<=pn; ++pi)
+ if ((v=KEY(pi))<f) {
+ f=v; /* f is smallest key found.*/
+ SWAP(pi, pa); /* place smallest element at beginning.*/
+ pb=pa+1; /* pb is position for elements equal to f.*/
+ } else if (v==f) { /* if equal to smallest key.*/
+ SWAP(pi, pb); /* place next to other smallest elements.*/
+ ++pb;
+ }
+ update_group(pa, pb-1); /* update group values for new group.*/
+ pa=pb; /* continue sorting rest of the subarray.*/
+ }
+ if (pa==pn) { /* check if last part is single element.*/
+ V[*pa]=pa-I;
+ *pa=-1; /* sorted group.*/
+ }
+}
+
+/* Subroutine for sort_split, algorithm by Bentley & McIlroy.*/
+
+static sint choose_pivot(sint *p, sint n) {
+ sint *pl, *pm, *pn;
+ sint s;
+
+ pm=p+(n>>1); /* small arrays, middle element.*/
+ if (n>7) {
+ pl=p;
+ pn=p+n-1;
+ if (n>40) { /* big arrays, pseudomedian of 9.*/
+ s=n>>3;
+ pl=MED3(pl, pl+s, pl+s+s);
+ pm=MED3(pm-s, pm, pm+s);
+ pn=MED3(pn-s-s, pn-s, pn);
+ }
+ pm=MED3(pl, pm, pn); /* midsize arrays, median of 3.*/
+ }
+ return KEY(pm);
+}
+
+/* Sorting routine called for each unsorted group. Sorts the array of integers
+ (suffix numbers) of length n starting at p. The algorithm is a ternary-split
+ quicksort taken from Bentley & McIlroy, "Engineering a Sort Function",
+ Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This
+ function is based on Program 7.*/
+
+static void sort_split(sint *p, sint n)
+{
+ sint *pa, *pb, *pc, *pd, *pl, *pm, *pn;
+ sint f, v, s, t, tmp;
+
+ if (n<7) { /* multi-selection sort smallest arrays.*/
+ select_sort_split(p, n);
+ return;
+ }
+
+ v=choose_pivot(p, n);
+ pa=pb=p;
+ pc=pd=p+n-1;
+ while (1) { /* split-end partition.*/
+ while (pb<=pc && (f=KEY(pb))<=v) {
+ if (f==v) {
+ SWAP(pa, pb);
+ ++pa;
+ }
+ ++pb;
+ }
+ while (pc>=pb && (f=KEY(pc))>=v) {
+ if (f==v) {
+ SWAP(pc, pd);
+ --pd;
+ }
+ --pc;
+ }
+ if (pb>pc)
+ break;
+ SWAP(pb, pc);
+ ++pb;
+ --pc;
+ }
+ pn=p+n;
+ if ((s=pa-p)>(t=pb-pa))
+ s=t;
+ for (pl=p, pm=pb-s; s; --s, ++pl, ++pm)
+ SWAP(pl, pm);
+ if ((s=pd-pc)>(t=pn-pd-1))
+ s=t;
+ for (pl=pb, pm=pn-s; s; --s, ++pl, ++pm)
+ SWAP(pl, pm);
+
+ s=pb-pa;
+ t=pd-pc;
+ if (s>0)
+ sort_split(p, s);
+ update_group(p+s, p+n-t-1);
+ if (t>0)
+ sort_split(p+n-t, t);
+}
+
+/* Bucketsort for first iteration.
+
+ Input: x[0...n-1] holds integers in the range 1...k-1, all of which appear
+ at least once. x[n] is 0. (This is the corresponding output of transform.) k
+ must be at most n+1. p is array of size n+1 whose contents are disregarded.
+
+ Output: x is V and p is I after the initial sorting stage of the refined
+ suffix sorting algorithm.*/
+
+static void bucketsort(sint *x, sint *p, sint n, sint k)
+{
+ sint *pi, i, c, d, g;
+
+ for (pi=p; pi<p+k; ++pi)
+ *pi=-1; /* mark linked lists empty.*/
+ for (i=0; i<=n; ++i) {
+ x[i]=p[c=x[i]]; /* insert in linked list.*/
+ p[c]=i;
+ }
+ for (pi=p+k-1, i=n; pi>=p; --pi) {
+ d=x[c=*pi]; /* c is position, d is next in list.*/
+ x[c]=g=i; /* last position equals group number.*/
+ if (d>=0) { /* if more than one element in group.*/
+ p[i--]=c; /* p is permutation for the sorted x.*/
+ do {
+ d=x[c=d]; /* next in linked list.*/
+ x[c]=g; /* group number in x.*/
+ p[i--]=c; /* permutation in p.*/
+ } while (d>=0);
+ } else
+ p[i--]=-1; /* one element, sorted group.*/
+ }
+}
+
+/* Transforms the alphabet of x by attempting to aggregate several symbols into
+ one, while preserving the suffix order of x. The alphabet may also be
+ compacted, so that x on output comprises all integers of the new alphabet
+ with no skipped numbers.
+
+ Input: x is an array of size n+1 whose first n elements are positive
+ integers in the range l...k-1. p is array of size n+1, used for temporary
+ storage. q controls aggregation and compaction by defining the maximum value
+ for any symbol during transformation: q must be at least k-l; if q<=n,
+ compaction is guaranteed; if k-l>n, compaction is never done; if q is
+ INT_MAX, the maximum number of symbols are aggregated into one.
+
+ Output: Returns an integer j in the range 1...q representing the size of the
+ new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is
+ set to the number of old symbols grouped into one. Only x[n] is 0.*/
+
+static sint transform(sint *x, sint *p, sint n, sint k, sint l, sint q)
+{
+ sint b, c, d, e, i, j, m, s;
+ sint *pi, *pj;
+
+ for (s=0, i=k-l; i; i>>=1)
+ ++s; /* s is number of bits in old symbol.*/
+ e=INT_MAX>>s; /* e is for overflow checking.*/
+ for (b=d=r=0; r<n && d<=e && (c=d<<s|(k-l))<=q; ++r) {
+ b=b<<s|(x[r]-l+1); /* b is start of x in chunk alphabet.*/
+ d=c; /* d is max symbol in chunk alphabet.*/
+ }
+ m=(1<<(r-1)*s)-1; /* m masks off top old symbol from chunk.*/
+ x[n]=l-1; /* emulate zero terminator.*/
+ if (d<=n) { /* if bucketing possible, compact alphabet.*/
+ for (pi=p; pi<=p+d; ++pi)
+ *pi=0; /* zero transformation table.*/
+ for (pi=x+r, c=b; pi<=x+n; ++pi) {
+ p[c]=1; /* mark used chunk symbol.*/
+ c=(c&m)<<s|(*pi-l+1); /* shift in next old symbol in chunk.*/
+ }
+ for (i=1; i<r; ++i) { /* handle last r-1 positions.*/
+ p[c]=1; /* mark used chunk symbol.*/
+ c=(c&m)<<s; /* shift in next old symbol in chunk.*/
+ }
+ for (pi=p, j=1; pi<=p+d; ++pi)
+ if (*pi)
+ *pi=j++; /* j is new alphabet size.*/
+ for (pi=x, pj=x+r, c=b; pj<=x+n; ++pi, ++pj) {
+ *pi=p[c]; /* transform to new alphabet.*/
+ c=(c&m)<<s|(*pj-l+1); /* shift in next old symbol in chunk.*/
+ }
+ while (pi<x+n) { /* handle last r-1 positions.*/
+ *pi++=p[c]; /* transform to new alphabet.*/
+ c=(c&m)<<s; /* shift right-end zero in chunk.*/
+ }
+ } else { /* bucketing not possible, don't compact.*/
+ for (pi=x, pj=x+r, c=b; pj<=x+n; ++pi, ++pj) {
+ *pi=c; /* transform to new alphabet.*/
+ c=(c&m)<<s|(*pj-l+1); /* shift in next old symbol in chunk.*/
+ }
+ while (pi<x+n) { /* handle last r-1 positions.*/
+ *pi++=c; /* transform to new alphabet.*/
+ c=(c&m)<<s; /* shift right-end zero in chunk.*/
+ }
+ j=d+1; /* new alphabet size.*/
+ }
+ x[n]=0; /* end-of-string symbol is zero.*/
+ return j; /* return new alphabet size.*/
+}
+
+/* Makes suffix array p of x. x becomes inverse of p. p and x are both of size
+ n+1. Contents of x[0...n-1] are integers in the range l...k-1. Original
+ contents of x[n] is disregarded, the n-th symbol being regarded as
+ end-of-string smaller than all other symbols.*/
+
+void suffixsort(sint *x, sint *p, sint n, sint k, sint l)
+{
+ sint *pi, *pk;
+ sint i, j, s, sl;
+
+ V=x; /* set global values.*/
+ I=p;
+
+ if (n>=k-l) { /* if bucketing possible,*/
+ j=transform(V, I, n, k, l, n);
+ bucketsort(V, I, n, j); /* bucketsort on first r positions.*/
+ } else {
+ transform(V, I, n, k, l, INT_MAX);
+ for (i=0; i<=n; ++i)
+ I[i]=i; /* initialize I with suffix numbers.*/
+ h=0;
+ sort_split(I, n+1); /* quicksort on first r positions.*/
+ }
+ h=r; /* number of symbols aggregated by transform.*/
+
+ while (*I>=-n) {
+ pi=I; /* pi is first position of group.*/
+ sl=0; /* sl is negated length of sorted groups.*/
+ do {
+ if ((s=*pi)<0) {
+ pi-=s; /* skip over sorted group.*/
+ sl+=s; /* add negated length to sl.*/
+ } else {
+ if (sl) {
+ *(pi+sl)=sl; /* combine sorted groups before pi.*/
+ sl=0;
+ }
+ pk=I+V[s]+1; /* pk-1 is last position of unsorted group.*/
+ sort_split(pi, pk-pi);
+ pi=pk; /* next group.*/
+ }
+ } while (pi<=I+n);
+ if (sl) /* if the array ends with a sorted group.*/
+ *(pi+sl)=sl; /* combine sorted groups at end of I.*/
+ h=2*h; /* double sorted-depth.*/
+ }
+
+ for (i=0; i<=n; ++i) /* reconstruct suffix array from inverse.*/
+ I[V[i]]=i;
+}
+
+
+} // namespace CSA
--- /dev/null
+#ifndef _QSUFSORT_H_
+#define _QSUFSORT_H_
+
+#include "../misc/definitions.h"
+
+
+namespace CSA
+{
+
+
+void suffixsort(sint *, sint *, sint, sint, sint);
+
+
+} // namespace CSA
+
+
+#endif
--- /dev/null
+#include <algorithm>
+#include <cstdlib>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+
+#include "rlcsa.h"
+#include "misc/utils.h"
+#include "qsufsort/qsufsort.h"
+
+
+namespace CSA
+{
+
+
+RLCSA::RLCSA(const std::string& base_name, bool print) :
+ ok(false),
+ sa_samples(0),
+ end_points(0)
+{
+ for(usint c = 0; c < CHARS; c++) { this->array[c] = 0; }
+
+ std::string array_name = base_name + ARRAY_EXTENSION;
+ std::ifstream array_file(array_name.c_str(), std::ios_base::binary);
+ if(!array_file)
+ {
+ std::cerr << "RLCSA: Error opening Psi array file!" << std::endl;
+ return;
+ }
+
+ usint distribution[CHARS];
+ array_file.read((char*)distribution, CHARS * sizeof(usint));
+ this->buildCharIndexes(distribution);
+
+ Parameters parameters;
+ parameters.read(base_name + PARAMETERS_EXTENSION);
+ for(usint c = 0; c < CHARS; c++)
+ {
+ if(!isEmpty(this->index_ranges[c])) { this->array[c] = new RLEVector(array_file); }
+ }
+
+ this->end_points = new DeltaVector(array_file);
+ this->number_of_sequences = this->end_points->getNumberOfItems();
+
+ array_file.read((char*)&(this->sample_rate), sizeof(this->sample_rate));
+ array_file.close();
+
+ this->support_locate = parameters.get(SUPPORT_LOCATE);
+ this->support_display = parameters.get(SUPPORT_DISPLAY);
+
+ if(this->support_locate || this->support_display)
+ {
+ std::string sa_sample_name = base_name + SA_SAMPLES_EXTENSION;
+ std::ifstream sa_sample_file(sa_sample_name.c_str(), std::ios_base::binary);
+ if(!sa_sample_file)
+ {
+ std::cerr << "RLCSA: Error opening suffix array sample file!" << std::endl;
+ return;
+ }
+ this->sa_samples = new SASamples(sa_sample_file, this->sample_rate);
+ sa_sample_file.close();
+ }
+
+ if(print) { parameters.print(); }
+
+ this->ok = true;
+}
+
+RLCSA::RLCSA(uchar* data, usint bytes, usint block_size, usint sa_sample_rate, bool multiple_sequences, bool delete_data) :
+ ok(false),
+ sa_samples(0),
+ sample_rate(sa_sample_rate),
+ end_points(0)
+{
+ for(usint c = 0; c < CHARS; c++) { this->array[c] = 0; }
+
+ if(!data || bytes == 0)
+ {
+ std::cerr << "RLCSA: No input data given!" << std::endl;
+ if(delete_data) { delete[] data; }
+ return;
+ }
+ if(block_size < 2 * sizeof(usint) || block_size % sizeof(usint) != 0)
+ {
+ std::cerr << "RLCSA: Block size must be a multiple of " << sizeof(usint) << " bytes!" << std::endl;
+ if(delete_data) { delete[] data; }
+ return;
+ }
+
+
+ // Do we store SA samples?
+ if(this->sample_rate > 0)
+ {
+ this->support_locate = this->support_display = true;
+ }
+ else
+ {
+ this->support_locate = this->support_display = false;
+ this->sample_rate = 1;
+ }
+
+
+ // Determine the number of sequences and mark their end points.
+ if(multiple_sequences)
+ {
+ DeltaEncoder endings(RLCSA::ENDPOINT_BLOCK_SIZE);
+ this->number_of_sequences = 0;
+ usint marker = 0;
+ usint padding = 0, chars_encountered = 0;
+
+ for(usint i = 0; i < bytes; i++)
+ {
+ if(data[i] == 0)
+ {
+ if(i == marker) { break; } // Empty sequence.
+ this->number_of_sequences++;
+ marker = i + 1;
+
+ usint pos = chars_encountered + padding - 1;
+ endings.setBit(pos);
+ padding = ((pos + this->sample_rate) / this->sample_rate) * this->sample_rate - chars_encountered;
+ }
+ else
+ {
+ chars_encountered++;
+ }
+ }
+
+ if(this->number_of_sequences == 0 || marker != bytes)
+ {
+ std::cerr << "RLCSA: Collection must consist of 0-terminated nonempty sequences!" << std::endl;
+ if(delete_data) { delete[] data; }
+ return;
+ }
+ this->end_points = new DeltaVector(endings, chars_encountered + padding);
+ }
+ else
+ {
+ this->number_of_sequences = 1;
+ DeltaEncoder endings(RLCSA::ENDPOINT_BLOCK_SIZE, RLCSA::ENDPOINT_BLOCK_SIZE);
+ usint pos = bytes - 1;
+ endings.setBit(pos);
+ usint padding = ((pos + this->sample_rate) / this->sample_rate) * this->sample_rate - bytes;
+ this->end_points = new DeltaVector(endings, bytes + padding);
+ }
+
+
+ // Build character tables etc.
+ usint distribution[CHARS];
+ sint low = CHARS, high = 0;
+ for(usint c = 0; c < CHARS; c++) { distribution[c] = 0; }
+ for(usint i = 0; i < bytes; i++)
+ {
+ if(data[i] < low) { low = data[i]; }
+ if(data[i] > high) { high = data[i]; }
+ distribution[(usint)data[i]]++;
+ }
+ if(multiple_sequences)
+ {
+ distribution[0] = 0;
+ low = 0;
+ high = high + this->number_of_sequences;
+ }
+ this->buildCharIndexes(distribution);
+
+
+ // Build suffix array.
+ sint* inverse = new sint[bytes + 1];
+ if(multiple_sequences)
+ {
+ sint zeros = 0;
+ for(usint i = 0; i < bytes; i++)
+ {
+ if(data[i] == 0)
+ {
+ inverse[i] = zeros;
+ zeros++;
+ }
+ else
+ {
+ inverse[i] = (sint)data[i] + this->number_of_sequences;
+ }
+ }
+ }
+ else
+ {
+ for(usint i = 0; i < bytes; i++) { inverse[i] = (sint)data[i]; }
+ }
+ if(delete_data) { delete[] data; }
+ sint* sa = new sint[bytes + 1];
+ suffixsort(inverse, sa, bytes, high + 1, low);
+
+
+ // Sample SA.
+ usint incr = (multiple_sequences ? this->number_of_sequences + 1 : 1); // No e_of_s markers in SA.
+ if(this->support_locate || this->support_display)
+ {
+ if(multiple_sequences)
+ {
+ std::cout << "We shouldn't be here!" << std::endl;
+ // Real SA starts at sa + incr.
+ // for each sequence
+ // find starting position
+ // sample relative to starting position
+ // sort samples to SA order
+ }
+ else
+ {
+ this->sa_samples = new SASamples((usint*)(sa + incr), this->data_size, this->sample_rate);
+ }
+ }
+
+
+ // Build Psi.
+ for(usint i = 0; i <= bytes; i++)
+ {
+ sa[i] = inverse[(sa[i] + 1) % (bytes + 1)];
+ }
+ delete[] inverse;
+
+
+ // Build RLCSA.
+ usint decr = (multiple_sequences ? 1 : 0); // Ignore the last e_of_s marker if multiple sequences.
+ for(usint c = 0; c < CHARS; c++)
+ {
+ if(distribution[c] == 0) { this->array[c] = 0; continue; }
+
+ usint* curr = (usint*)(sa + index_ranges[c].first + incr);
+ usint* limit = (usint*)(sa + index_ranges[c].second + incr + 1);
+ RLEEncoder encoder(block_size);
+ pair_type run(*curr, 1);
+ curr++;
+
+ for(; curr < limit; curr++)
+ {
+ if(*curr == run.first + run.second) { run.second++; }
+ else
+ {
+ encoder.setRun(run.first - decr, run.second);
+ run = pair_type(*curr, 1);
+ }
+ }
+ encoder.setRun(run.first - decr, run.second);
+
+ this->array[c] = new RLEVector(encoder, this->data_size + incr - decr);
+ }
+ delete[] sa;
+
+
+ this->ok = true;
+}
+
+RLCSA::RLCSA(RLCSA& index, RLCSA& increment, usint* positions, usint block_size) :
+ ok(false),
+ sa_samples(0),
+ end_points(0)
+{
+ for(usint c = 0; c < CHARS; c++) { this->array[c] = 0; }
+
+ if(!index.isOk() || !increment.isOk())
+ {
+ return; // Fail silently. Actual error has already been reported.
+ }
+ if(positions == 0)
+ {
+ std::cerr << "RLCSA: Positions for insertions not available!" << std::endl;
+ return;
+ }
+ if(index.sample_rate != increment.sample_rate)
+ {
+ std::cerr << "RLCSA: Cannot combine indexes with different sample rates!" << std::endl;
+ std::cout << "Index: " << index.sample_rate << ", increment: " << increment.sample_rate << std::endl;
+ return;
+ }
+
+ if(index.sa_samples == 0 || increment.sa_samples == 0)
+ {
+ this->support_locate = this->support_display = false;
+ }
+ else
+ {
+ this->support_locate = this->support_display = true;
+ }
+
+
+ // Build character tables etc.
+ usint distribution[CHARS];
+ for(usint c = 0; c < CHARS; c++)
+ {
+ distribution[c] = length(index.index_ranges[c]) + length(increment.index_ranges[c]);
+ }
+ this->buildCharIndexes(distribution);
+ this->sample_rate = index.sample_rate;
+
+
+ // Combine end points of sequences.
+ this->number_of_sequences = index.number_of_sequences + increment.number_of_sequences;
+ DeltaEncoder* endings = new DeltaEncoder(RLCSA::ENDPOINT_BLOCK_SIZE);
+
+ endings->setBit(index.end_points->select(0));
+ for(usint i = 1; i < index.number_of_sequences; i++)
+ {
+ endings->setBit(index.end_points->selectNext());
+ }
+ usint sum = index.end_points->getSize();
+ delete index.end_points; index.end_points = 0;
+
+ endings->setBit(sum + increment.end_points->select(0));
+ for(usint i = 1; i < increment.number_of_sequences; i++)
+ {
+ endings->setBit(sum + increment.end_points->selectNext());
+ }
+ sum += increment.end_points->getSize();
+ delete increment.end_points; increment.end_points = 0;
+
+ this->end_points = new DeltaVector(*endings, sum);
+ delete endings;
+
+
+ // Combine Psi.
+ usint psi_size = this->data_size + this->number_of_sequences;
+ for(usint c = 0; c < CHARS; c++)
+ {
+ if(distribution[c] == 0) { this->array[c] = 0; continue; }
+ this->array[c] = mergeVectors(index.array[c], increment.array[c], positions, increment.data_size + increment.number_of_sequences, psi_size, block_size);
+ index.array[c] = 0;
+ increment.array[c] = 0;
+
+ if(this->array[c] == 0)
+ {
+ std::cerr << "RLCSA: Merge failed for vectors " << c << "!" << std::endl;
+ return;
+ }
+ }
+
+
+ // Combine suffix array samples.
+ if(this->support_locate || this->support_display)
+ {
+ positions += increment.number_of_sequences;
+ for(usint i = 0; i < increment.data_size; i++)
+ {
+ positions[i] -= this->number_of_sequences;
+ }
+ this->sa_samples = new SASamples(*(index.sa_samples), *(increment.sa_samples), positions, increment.data_size);
+ }
+
+ this->ok = true;
+}
+
+RLCSA::~RLCSA()
+{
+ for(usint c = 0; c < CHARS; c++) { delete this->array[c]; }
+ delete this->sa_samples;
+ delete this->end_points;
+}
+
+void
+RLCSA::writeTo(const std::string& base_name)
+{
+ std::string array_name = base_name + ARRAY_EXTENSION;
+ std::ofstream array_file(array_name.c_str(), std::ios_base::binary);
+ if(!array_file)
+ {
+ std::cerr << "RLCSA: Error creating Psi array file!" << std::endl;
+ return;
+ }
+
+ usint distribution[CHARS];
+ for(usint c = 0; c < CHARS; c++)
+ {
+ distribution[c] = length(this->index_ranges[c]);
+ }
+ array_file.write((char*)distribution, CHARS * sizeof(usint));
+ for(usint c = 0; c < CHARS; c++)
+ {
+ if(this->array[c] != 0)
+ {
+ this->array[c]->writeTo(array_file);
+ }
+ }
+
+ this->end_points->writeTo(array_file);
+ array_file.write((char*)&(this->sample_rate), sizeof(this->sample_rate));
+ array_file.close();
+
+ if(this->support_locate || this->support_display)
+ {
+ std::string sa_sample_name = base_name + SA_SAMPLES_EXTENSION;
+ std::ofstream sa_sample_file(sa_sample_name.c_str(), std::ios_base::binary);
+ if(!sa_sample_file)
+ {
+ std::cerr << "RLCSA: Error creating suffix array sample file!" << std::endl;
+ return;
+ }
+
+ this->sa_samples->writeTo(sa_sample_file);
+ sa_sample_file.close();
+ }
+}
+
+bool
+RLCSA::isOk()
+{
+ return this->ok;
+}
+
+//--------------------------------------------------------------------------
+
+pair_type
+RLCSA::count(const std::string& pattern)
+{
+ if(pattern.length() == 0) { return pair_type(0, this->data_size - 1); }
+
+ pair_type index_range = this->index_ranges[(usint)*(pattern.rbegin())];
+ if(isEmpty(index_range)) { return index_range; }
+ index_range.first += this->number_of_sequences;
+ index_range.second += this->number_of_sequences;
+
+ for(std::string::const_reverse_iterator iter = ++pattern.rbegin(); iter != pattern.rend(); iter++)
+ {
+ RLEVector* vector = this->array[(usint)*iter];
+ usint start = this->index_ranges[(usint)*iter].first;
+
+ index_range.first = start + vector->rank(index_range.first, true) - 1 + this->number_of_sequences;
+ index_range.second = start + vector->rank(index_range.second) - 1 + this->number_of_sequences;
+
+ if(isEmpty(index_range)) { return index_range; }
+ }
+
+ // Suffix array indexes are 0-based.
+ index_range.first -= this->number_of_sequences;
+ index_range.second -= this->number_of_sequences;
+
+ return index_range;
+}
+
+void
+RLCSA::reportPositions(uchar* data, usint length, usint* positions)
+{
+ if(data == 0 || length == 0 || positions == 0) { return; }
+
+ usint current = this->number_of_sequences - 1;
+ positions[length] = current; // "immediately after current"
+ for(sint i = (sint)(length - 1); i >= 0; i--)
+ {
+// positions[i] = current; // "immediately after current"
+ usint c = (usint)data[i];
+ if(array[c])
+ {
+ current = this->index_ranges[c].first + this->array[c]->rank(current) - 1 + this->number_of_sequences;
+ }
+ else
+ {
+ if(c < this->text_chars[0]) // No previous characters either.
+ {
+ current = this->number_of_sequences - 1;
+ }
+ else
+ {
+ current = this->index_ranges[c].first - 1 + this->number_of_sequences;
+ }
+ }
+ positions[i] = current; // "immediately after current"
+ }
+}
+
+//--------------------------------------------------------------------------
+
+LocateItem*
+RLCSA::locate(pair_type range)
+{
+ if(!(this->support_locate) || isEmpty(range) || range.second >= this->data_size) { return 0; }
+
+ LocateItem* data = new LocateItem[range.second + 1 - range.first];
+ if(!data) { return 0; }
+ this->locateUnsafe(range, data);
+
+ return data;
+}
+
+LocateItem*
+RLCSA::locate(pair_type range, LocateItem* data)
+{
+ if(!(this->support_locate) || isEmpty(range) || range.second >= this->data_size || data == 0) { return 0; }
+ this->locateUnsafe(range, data);
+ return data;
+}
+
+void
+RLCSA::locateUnsafe(pair_type range, LocateItem* data)
+{
+ usint items = range.second + 1 - range.first;
+ for(usint i = 0, j = range.first; i < items; i++, j++)
+ {
+ data[i].value = j + this->number_of_sequences;
+ data[i].offset = 0;
+ data[i].found = false;
+ }
+
+ bool found = false;
+ while(!found)
+ {
+ found = true;
+ pair_type run = EMPTY_PAIR;
+ for(usint i = 0; i < items; i++)
+ {
+ if(data[i].found)
+ {
+ continue; // The run might continue after this.
+ }
+ else if(isEmpty(run))
+ {
+ run = pair_type(i, i);
+ }
+ else if(data[i].value - data[run.first].value == i - run.first)
+ {
+ run.second = i;
+ }
+ else
+ {
+ found &= this->processRun(run, data);
+ run = pair_type(i, i);
+ }
+ }
+ if(!isEmpty(run)) { found &= this->processRun(run, data); }
+ }
+}
+
+bool
+RLCSA::processRun(pair_type run, LocateItem* data)
+{
+ bool found = true;
+ usint run_start = 0, run_left = 0;
+ pair_type next_sample = pair_type(0, 0);
+
+ for(usint i = run.first; i <= run.second; i++)
+ {
+ if(data[i].found)
+ {
+ if(run_left > 0) { run_left--; }
+ continue;
+ }
+ if(data[i].value < this->number_of_sequences) // Implicit sample here.
+ {
+ data[i].value = this->end_points->select(data[i].value) + 1 - data[i].offset;
+ data[i].found = true;
+ if(run_left > 0) { run_left--; }
+ continue;
+ }
+ if(next_sample.first < data[i].value) // Need another sample.
+ {
+ next_sample = this->sa_samples->getFirstSampleAfter(data[i].value - this->number_of_sequences);
+ next_sample.first += this->number_of_sequences;
+ }
+ if(data[i].value < next_sample.first) // No sample found for current position.
+ {
+ if(run_left > 0)
+ {
+ data[i].value = data[run_start].value + i - run_start;
+ run_left--;
+ }
+ else
+ {
+ pair_type value = this->psi(data[i].value - this->number_of_sequences, run.second - i);
+ data[i].value = value.first;
+ run_left = value.second;
+ run_start = i;
+ }
+ data[i].offset++;
+ found = false;
+ }
+ else // Sampled position found.
+ {
+ data[i].value = this->sa_samples->getSample(next_sample.second) - data[i].offset;
+ data[i].found = true;
+ if(run_left > 0) { run_left--; }
+ }
+ }
+ return found;
+}
+
+usint
+RLCSA::locate(usint index)
+{
+ if(!(this->support_locate) || index >= this->data_size) { return this->data_size; }
+
+ usint offset = 0;
+ index += this->number_of_sequences;
+ while(true)
+ {
+ if(index < this->number_of_sequences) // Implicit sample here
+ {
+ return this->end_points->select(index) + 1 - offset;
+ }
+ pair_type next_sample = this->sa_samples->getFirstSampleAfter(index - this->number_of_sequences);
+ next_sample.first += this->number_of_sequences;
+ if(next_sample.first == index)
+ {
+ return this->sa_samples->getSample(next_sample.second) - offset;
+ }
+ index = this->psi(index - this->number_of_sequences);
+ offset++;
+ }
+}
+
+//--------------------------------------------------------------------------
+
+uchar*
+RLCSA::display(usint sequence, pair_type range)
+{
+ if(!(this->support_display) || isEmpty(range)) { return 0; }
+
+ pair_type seq_range = this->getSequence(sequence);
+ if(isEmpty(seq_range)) { return 0; }
+
+ range.first += seq_range.first; range.second += seq_range.first;
+ if(range.second > seq_range.second) { return 0; }
+
+ uchar* data = new uchar[range.second + 1 - range.first];
+ if(!data) { return 0; }
+ this->displayUnsafe(range, data);
+
+ return data;
+}
+
+uchar*
+RLCSA::display(usint sequence, pair_type range, uchar* data)
+{
+ if(!(this->support_display) || isEmpty(range) || data == 0) { return 0; }
+
+ pair_type seq_range = this->getSequence(sequence);
+ if(isEmpty(seq_range)) { return 0; }
+
+ range.first += seq_range.first; range.second += seq_range.first;
+ if(range.second > seq_range.second) { return 0; }
+
+ this->displayUnsafe(range, data);
+ return data;
+}
+
+void
+RLCSA::displayUnsafe(pair_type range, uchar* data)
+{
+ usint i = range.first - range.first % this->sa_samples->getSampleRate();
+
+ usint pos = this->sa_samples->inverseSA(i);
+
+ for(; i < range.first; i++)
+ {
+ pos = this->psi(pos) - this->number_of_sequences;
+ }
+ for(; i <= range.second; i++)
+ {
+ data[i - range.first] = this->getCharacter(pos);
+ pos = this->psi(pos) - this->number_of_sequences;
+ }
+}
+
+//--------------------------------------------------------------------------
+
+void
+RLCSA::decompressInto(std::ofstream& psi_file)
+{
+ for(usint c = 0; c < CHARS; c++)
+ {
+ if(!(this->array[c])) { continue; }
+ usint value = this->array[c]->select(0);
+ psi_file.write((char*)&(value), sizeof(value));
+ while(this->array[c]->hasNext())
+ {
+ value = this->array[c]->selectNext();
+ psi_file.write((char*)&value, sizeof(value));
+ }
+ }
+}
+
+uchar*
+RLCSA::readBWT()
+{
+ usint n = this->data_size + this->number_of_sequences;
+
+ uchar* bwt = new uchar[n];
+ memset(bwt, 0, n);
+
+ for(usint c = 0; c < CHARS; c++)
+ {
+ RLEVector* vector = this->array[c];
+ if(vector != 0)
+ {
+ bwt[vector->select(0)] = c;
+ while(vector->hasNext()) { bwt[vector->selectNext()] = c; }
+ }
+ }
+
+ return bwt;
+}
+
+//--------------------------------------------------------------------------
+
+usint
+RLCSA::psi(usint index)
+{
+ if(index >= this->data_size)
+ {
+ return this->data_size + this->number_of_sequences;
+ }
+
+ usint c = this->getCharacter(index);
+ return this->array[c]->select(index - this->index_ranges[c].first);
+}
+
+pair_type
+RLCSA::psi(usint index, usint max_length)
+{
+ if(index >= this->data_size)
+ {
+ return pair_type(this->data_size + this->number_of_sequences, 0);
+ }
+
+ usint c = this->getCharacter(index);
+ return this->array[c]->selectRun(index - this->index_ranges[c].first, max_length);
+}
+
+//--------------------------------------------------------------------------
+
+pair_type
+RLCSA::getSequence(usint number)
+{
+ if(number >= this->number_of_sequences) { return EMPTY_PAIR; }
+
+ pair_type result;
+ if(number == 0)
+ {
+ result.first = 0;
+ result.second = this->end_points->select(number);
+ }
+ else
+ {
+ if(this->sa_samples == 0) { return EMPTY_PAIR; }
+ usint d = this->sa_samples->getSampleRate();
+ result.first = d * ((this->end_points->select(number - 1) / d) + 1);
+ result.second = this->end_points->selectNext();
+ }
+
+ return result;
+}
+
+usint
+RLCSA::reportSize(bool print)
+{
+ usint bytes = 0, temp = 0;
+
+ for(usint c = 0; c < CHARS; c++)
+ {
+ if(this->array[c]) { temp += this->array[c]->reportSize(); }
+ }
+ if(print) { std::cout << "Psi array: " << temp << std::endl; }
+ bytes += temp;
+
+ if(this->support_locate || this->support_display)
+ {
+ temp = this->sa_samples->reportSize();
+ if(print) { std::cout << "SA samples: " << temp << std::endl; }
+ bytes += temp;
+ }
+
+ temp = sizeof(*this) + this->end_points->reportSize();
+ if(print) { std::cout << "RLCSA overhead: " << temp << std::endl; }
+ bytes += temp;
+
+ if(print)
+ {
+ std::cout << "Total size: " << bytes << std::endl;
+ std::cout << std::endl;
+ }
+
+ return bytes;
+}
+
+//--------------------------------------------------------------------------
+
+void
+RLCSA::buildCharIndexes(usint* distribution)
+{
+ this->data_size = buildRanges(distribution, this->index_ranges);
+
+ usint i = 0, c = 0;
+ for(; c < CHARS; c++)
+ {
+ if(!isEmpty(this->index_ranges[c]))
+ {
+ this->text_chars[i] = c;
+ i++;
+ }
+ }
+ this->chars = i;
+
+ this->index_rate = std::max((this->data_size + CHARS - 1) / CHARS, (usint)1);
+ usint current = 0;
+
+ for(c = 0, i = 0; c < this->chars; c++)
+ {
+ pair_type range = this->index_ranges[this->text_chars[c]];
+ while(current <= range.second)
+ {
+ this->index_pointers[i] = c;
+ current += this->index_rate;
+ i++;
+ }
+ }
+}
+
+usint
+buildRanges(usint* distribution, pair_type* index_ranges)
+{
+ if(distribution == 0 || index_ranges == 0) { return 0; }
+
+ usint sum = 0;
+ for(usint c = 0; c < CHARS; c++)
+ {
+ if(distribution[c] == 0)
+ {
+ if(sum == 0) { index_ranges[c] = EMPTY_PAIR; }
+ else { index_ranges[c] = pair_type(sum, sum - 1); }
+ }
+ else
+ {
+ index_ranges[c].first = sum;
+ sum += distribution[c];
+ index_ranges[c].second = sum - 1;
+ }
+ }
+
+ return sum;
+}
+
+
+} // namespace CSA
--- /dev/null
+#ifndef RLCSA_H
+#define RLCSA_H
+
+#include <fstream>
+
+#include "bits/vectors.h"
+#include "sasamples.h"
+#include "misc/parameters.h"
+
+
+namespace CSA
+{
+
+
+const std::string SA_EXTENSION = ".sa";
+const std::string PSI_EXTENSION = ".psi";
+const std::string ARRAY_EXTENSION = ".rlcsa.array";
+const std::string SA_SAMPLES_EXTENSION = ".rlcsa.sa_samples";
+const std::string PARAMETERS_EXTENSION = ".rlcsa.parameters";
+
+
+const parameter_type RLCSA_BLOCK_SIZE = parameter_type("RLCSA_BLOCK_SIZE", 32);
+const parameter_type SAMPLE_RATE = parameter_type("SAMPLE_RATE", 512);
+const parameter_type SUPPORT_LOCATE = parameter_type("SUPPORT_LOCATE", 0);
+const parameter_type SUPPORT_DISPLAY = parameter_type("SUPPORT_DISPLAY", 0);
+
+
+struct LocateItem
+{
+ usint value;
+ usint offset;
+ bool found;
+};
+
+
+
+class RLCSA
+{
+ public:
+ const static usint ENDPOINT_BLOCK_SIZE = 16;
+
+ RLCSA(const std::string& base_name, bool print = false);
+
+ /*
+ If multiple_sequences is true, each 0 is treated as a end of sequence marker.
+ There must be nonzero characters between the 0s. data[bytes - 1] must also be 0.
+ */
+ RLCSA(uchar* data, usint bytes, usint block_size, usint sa_sample_rate, bool multiple_sequences, bool delete_data);
+
+ // Destroys contents of index and increment.
+ RLCSA(RLCSA& index, RLCSA& increment, usint* positions, usint block_size);
+ ~RLCSA();
+
+ void writeTo(const std::string& base_name);
+
+ bool isOk();
+
+ // Returns the closed interval of suffix array containing the matches.
+ pair_type count(const std::string& pattern);
+
+ void reportPositions(uchar* data, usint length, usint* positions);
+
+ // Returns SA[range]. User must free the buffer. Latter version uses buffer provided by the user.
+ LocateItem* locate(pair_type range);
+ LocateItem* locate(pair_type range, LocateItem* data);
+
+ // Returns SA[index].
+ usint locate(usint index);
+
+ // Returns Ti[range]. User must free the buffer. Latter version uses buffer provided by the user.
+ uchar* display(usint sequence, pair_type range);
+ uchar* display(usint sequence, pair_type range, uchar* data);
+
+ // Writes the Psi array into given file. End of sequence markers are not written.
+ void decompressInto(std::ofstream& psi_file);
+
+ // Returns the BWT of the collection including end of sequence markers.
+ uchar* readBWT();
+
+ // Return value includes the implicit end of sequence markers. To get suffix array indexes,
+ // subtract getNumberOfSequences() from the value.
+ usint psi(usint index);
+ pair_type psi(usint index, usint max_length); // This version returns a run.
+
+ inline bool supportsLocate() { return this->support_locate; }
+ inline bool supportsDisplay() { return this->support_display; }
+ inline usint getSize() { return this->data_size; }
+ inline usint getNumberOfSequences() { return this->number_of_sequences; }
+
+ pair_type getSequence(usint number);
+
+ // Returns the size of the data structure.
+ usint reportSize(bool print = false);
+
+ private:
+ bool ok;
+
+ RLEVector* array[CHARS];
+ SASamples* sa_samples;
+
+ pair_type index_ranges[CHARS];
+ usint data_size;
+
+ usint text_chars[CHARS]; // which characters are present in the text
+ usint chars;
+
+ usint index_pointers[CHARS]; // which of the above is at i * index_rate
+ usint index_rate;
+
+ bool support_locate, support_display;
+ usint sample_rate;
+
+ usint number_of_sequences;
+ DeltaVector* end_points;
+
+ void locateUnsafe(pair_type range, LocateItem* data);
+ bool processRun(pair_type run, LocateItem* data);
+ void displayUnsafe(pair_type range, uchar* data);
+
+ inline usint getCharacter(usint pos)
+ {
+ usint* curr = &(this->text_chars[this->index_pointers[pos / this->index_rate]]);
+ while(pos > this->index_ranges[*curr].second) { curr++; }
+ return *curr;
+ }
+
+ void buildCharIndexes(usint* distribution);
+};
+
+
+// Returns the total number of characters.
+usint buildRanges(usint* distribution, pair_type* index_ranges);
+
+
+} // namespace CSA
+
+
+#endif // RLCSA_H
--- /dev/null
+#include <iostream>
+
+#include "rlcsa_builder.h"
+
+
+namespace CSA
+{
+
+
+RLCSABuilder::RLCSABuilder(usint _block_size, usint _sample_rate, usint _buffer_size) :
+ block_size(_block_size), sample_rate(_sample_rate), buffer_size(_buffer_size),
+ buffer(0)
+{
+ this->reset();
+}
+
+RLCSABuilder::~RLCSABuilder()
+{
+ delete this->index;
+ delete[] this->buffer;
+}
+
+//--------------------------------------------------------------------------
+
+void
+RLCSABuilder::insertSequence(char* sequence, usint length, bool delete_sequence)
+{
+ if(sequence == 0 || length == 0 || !this->ok)
+ {
+ if(delete_sequence) { delete[] sequence; }
+ return;
+ }
+
+ if(this->buffer == 0)
+ {
+ clock_t start = clock();
+ RLCSA* temp = new RLCSA((uchar*)sequence, length, this->block_size, this->sample_rate, false, false);
+ this->build_time += clock() - start;
+ this->addRLCSA(temp, (uchar*)sequence, length + 1, delete_sequence);
+ return;
+ }
+
+ if(this->buffer_size - this->chars > length)
+ {
+ memcpy(this->buffer + this->chars, sequence, length);
+ if(delete_sequence) { delete[] sequence; }
+ this->chars += length;
+ this->buffer[this->chars] = 0;
+ this->chars++;
+ }
+ else
+ {
+ this->flush();
+ this->buffer = new uchar[this->buffer_size];
+ if(length >= this->buffer_size - 1)
+ {
+ clock_t start = clock();
+ RLCSA* temp = new RLCSA((uchar*)sequence, length, this->block_size, this->sample_rate, false, false);
+ this->build_time += clock() - start;
+ this->addRLCSA(temp, (uchar*)sequence, length + 1, delete_sequence);
+ }
+ else
+ {
+ memcpy(this->buffer + this->chars, sequence, length);
+ if(delete_sequence) { delete[] sequence; }
+ this->chars += length;
+ this->buffer[this->chars] = 0;
+ this->chars++;
+ }
+ }
+}
+
+RLCSA*
+RLCSABuilder::getRLCSA()
+{
+ if(this->chars > 0) { this->flush(); }
+
+ RLCSA* temp = this->index;
+ this->reset();
+
+ return temp;
+}
+
+char*
+RLCSABuilder::getBWT(usint& length)
+{
+ if(this->chars > 0)
+ {
+ this->flush();
+ if(this->buffer_size > 0) { this->buffer = new uchar[this->buffer_size]; }
+ }
+
+ if(this->index == 0 || !(this->ok))
+ {
+ length = 0;
+ return 0;
+ }
+
+ length = this->index->getSize() + this->index->getNumberOfSequences();
+ return (char*)(this->index->readBWT());
+}
+
+bool
+RLCSABuilder::isOk()
+{
+ return this->ok;
+}
+
+double
+RLCSABuilder::getBuildTime()
+{
+ return this->build_time / (double)CLOCKS_PER_SEC;
+}
+
+double
+RLCSABuilder::getSearchTime()
+{
+ return this->search_time / (double)CLOCKS_PER_SEC;
+}
+
+double
+RLCSABuilder::getMergeTime()
+{
+ return this->merge_time / (double)CLOCKS_PER_SEC;
+}
+
+//--------------------------------------------------------------------------
+
+void
+RLCSABuilder::flush()
+{
+ clock_t start = clock();
+ RLCSA* temp = new RLCSA(this->buffer, this->chars, this->block_size, this->sample_rate, true, (this->index == 0));
+ this->build_time += clock() - start;
+ this->addRLCSA(temp, this->buffer, this->chars, true);
+ this->buffer = 0; this->chars = 0;
+}
+
+void
+RLCSABuilder::addRLCSA(RLCSA* increment, uchar* sequence, usint length, bool delete_sequence)
+{
+ if(this->index != 0)
+ {
+ clock_t start = clock();
+
+ usint* positions = new usint[length];
+ usint begin = 0;
+ for(usint i = 0; i < length - 1; i++)
+ {
+ if(sequence[i] == 0)
+ {
+ this->index->reportPositions(&(sequence[begin]), i - begin, &(positions[begin]));
+ begin = i + 1;
+ }
+ }
+ this->index->reportPositions(&(sequence[begin]), length - 1 - begin, &(positions[begin]));
+
+ std::sort(positions, positions + length);
+ for(usint i = 0; i < length; i++)
+ {
+ positions[i] += i + 1; // +1 because the insertion will be after positions[i]
+ }
+ if(delete_sequence) { delete[] sequence; }
+
+ clock_t mark = clock();
+ this->search_time += mark - start;
+
+ RLCSA* merged = new RLCSA(*(this->index), *increment, positions, this->block_size);
+ delete[] positions;
+ delete this->index;
+ delete increment;
+ this->index = merged;
+
+ this->merge_time += clock() - mark;
+ }
+ else
+ {
+ this->index = increment;
+ }
+
+ this->ok &= this->index->isOk();
+}
+
+void
+RLCSABuilder::reset()
+{
+ this->index = 0;
+
+ if(this->buffer_size != 0)
+ {
+ this->buffer = new uchar[this->buffer_size];
+ }
+ this->chars = 0;
+
+ this->ok = true;
+}
+
+
+} // namespace CSA
--- /dev/null
+#ifndef RLCSA_BUILDER_H
+#define RLCSA_BUILDER_H
+
+#include <cstdlib>
+#include "rlcsa.h"
+
+
+namespace CSA
+{
+
+
+class RLCSABuilder
+{
+ public:
+ RLCSABuilder(usint _block_size, usint _sample_rate, usint _buffer_size);
+ ~RLCSABuilder();
+
+ void insertSequence(char* sequence, usint length, bool delete_sequence);
+
+ // User must free the index. Builder no longer contains it.
+ RLCSA* getRLCSA();
+
+ // User must free the BWT. length becomes the length of BWT.
+ char* getBWT(usint& length);
+
+ bool isOk();
+
+ // These times are not reset with the rest of the builder.
+ double getBuildTime();
+ double getSearchTime();
+ double getMergeTime();
+
+ private:
+ RLCSA* index;
+
+ usint block_size;
+ usint sample_rate;
+ usint buffer_size;
+
+ uchar* buffer;
+ usint chars;
+
+ bool ok;
+
+ clock_t build_time;
+ clock_t search_time;
+ clock_t merge_time;
+
+ void flush();
+ void addRLCSA(RLCSA* increment, uchar* sequence, usint length, bool delete_sequence);
+ void reset();
+};
+
+
+} // namespace CSA
+
+
+#endif // RLCSA_BUILDER_H
--- /dev/null
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+
+#include "sasamples.h"
+#include "misc/utils.h"
+
+
+namespace CSA
+{
+
+
+SASamples::SASamples(std::ifstream& sample_file, usint sample_rate) :
+ rate(sample_rate)
+{
+ this->indexes = new DeltaVector(sample_file);
+
+ this->size = indexes->getSize();
+ this->items = indexes->getNumberOfItems();
+ this->integer_bits = length(this->items - 1);
+
+ this->samples = new FastBitBuffer(sample_file, this->items, this->integer_bits);
+ this->buildInverseSamples();
+}
+
+SASamples::SASamples(usint* array, usint data_size, usint sample_rate) :
+ rate(sample_rate),
+ size(data_size)
+{
+ this->items = (this->size - 1) / this->rate + 1;
+ this->integer_bits = length(this->items - 1);
+
+ this->samples = new FastBitBuffer(this->items, this->integer_bits);
+ DeltaEncoder encoder(SASamples::BLOCK_SIZE);
+ for(usint i = 0; i < this->size; i++)
+ {
+ if(array[i] % rate == 0)
+ {
+ encoder.setBit(i);
+ this->samples->writeItem(array[i] / sample_rate);
+ }
+ }
+ this->indexes = new DeltaVector(encoder, this->size);
+
+ this->buildInverseSamples();
+}
+
+SASamples::SASamples(SASamples& index, SASamples& increment, usint* positions, usint number_of_positions) :
+ rate(index.rate),
+ size(index.size + increment.size),
+ items(index.items + increment.items)
+{
+ this->mergeSamples(index, increment, positions, number_of_positions);
+
+ index.indexes = 0;
+ index.samples = 0;
+ index.inverse_samples = 0;
+ increment.indexes = 0;
+ increment.samples = 0;
+ increment.inverse_samples = 0;
+
+ this->buildInverseSamples();
+}
+
+SASamples::~SASamples()
+{
+ delete this->indexes;
+ delete this->samples;
+ delete this->inverse_samples;
+}
+
+void
+SASamples::writeTo(std::ofstream& sample_file)
+{
+ this->indexes->writeTo(sample_file);
+ this->samples->writeBuffer(sample_file, false);
+}
+
+//--------------------------------------------------------------------------
+
+usint
+SASamples::inverseSA(usint value)
+{
+ if(value >= this->size) { return this->size; }
+ usint i = this->inverse_samples->readItem(value / this->rate);
+ return this->indexes->select(i);
+}
+
+pair_type
+SASamples::getFirstSampleAfter(usint index)
+{
+ if(index >= this->size) { return pair_type(this->size, this->size); }
+
+ return this->indexes->valueAfter(index);
+}
+
+//--------------------------------------------------------------------------
+
+usint
+SASamples::reportSize()
+{
+ usint bytes = sizeof(*this) - sizeof(this->indexes);
+ bytes += this->indexes->reportSize();
+ bytes += this->samples->reportSize();
+ bytes += this->inverse_samples->reportSize();
+ return bytes;
+}
+
+//--------------------------------------------------------------------------
+
+void
+SASamples::buildInverseSamples()
+{
+ this->inverse_samples = new FastBitBuffer(this->items, this->integer_bits);
+ this->samples->goToItem(0);
+ for(usint i = 0; i < this->items; i++)
+ {
+ this->inverse_samples->goToItem(this->samples->readItem());
+ this->inverse_samples->writeItem(i);
+ }
+}
+
+
+//--------------------------------------------------------------------------
+
+
+void
+SASamples::mergeSamples(SASamples& index, SASamples& increment, usint* positions, usint n)
+{
+ DeltaVector* first = index.indexes;
+ DeltaVector* second = increment.indexes;
+ FastBitBuffer* first_samples = index.samples;
+ FastBitBuffer* second_samples = increment.samples;
+
+ usint first_bit = first->select(0);
+ bool first_finished = false;
+ usint second_bit = second->select(0);
+ usint sum = index.items;
+ first_samples->goToItem(0);
+ second_samples->goToItem(0);
+
+ DeltaEncoder encoder(SASamples::BLOCK_SIZE);
+ this->integer_bits = length(this->items - 1);
+ this->samples = new FastBitBuffer(this->items, this->integer_bits);
+ for(usint i = 0; i < n; i++, first_bit++)
+ {
+ while(!first_finished && first_bit < positions[i])
+ {
+ encoder.setBit(first_bit);
+ this->samples->writeItem(first_samples->readItem());
+ if(first->hasNext())
+ {
+ first_bit = first->selectNext() + i;
+ }
+ else
+ {
+ first_finished = true;
+ }
+ }
+
+ if(i == second_bit) // positions[i] is one
+ {
+ encoder.setBit(positions[i]);
+ this->samples->writeItem(second_samples->readItem() + sum);
+ second_bit = second->selectNext();
+ }
+ }
+
+ while(!first_finished)
+ {
+ encoder.setBit(first_bit);
+ this->samples->writeItem(first_samples->readItem());
+ if(!first->hasNext()) { break; }
+ first_bit = first->selectNext() + n;
+ }
+
+ delete index.indexes;
+ delete index.samples;
+ delete index.inverse_samples;
+ delete increment.indexes;
+ delete increment.samples;
+ delete increment.inverse_samples;
+
+ this->indexes = new DeltaVector(encoder, size);
+}
+
+
+} // namespace CSA
--- /dev/null
+#ifndef SASAMPLES_H
+#define SASAMPLES_H
+
+#include <fstream>
+
+#include "misc/definitions.h"
+#include "bits/bitbuffer.h"
+#include "bits/deltavector.h"
+
+
+namespace CSA
+{
+
+
+class SASamples
+{
+ public:
+ const static usint BLOCK_SIZE = 16;
+
+ SASamples(std::ifstream& sample_file, usint sample_rate);
+ SASamples(usint* array, usint data_size, usint sample_rate);
+ ~SASamples();
+
+ // Destroys contents of index and increment.
+ // We assume index and increment have same sample rate.
+ SASamples(SASamples& index, SASamples& increment, usint* positions, usint number_of_positions);
+
+ void writeTo(std::ofstream& sample_file);
+
+ // Returns i such that SA[i] = value.
+ // If SA[i] is not sampled, returns the next sampled value. (Don't try!)
+ // Value is actual 0-based suffix array value.
+ // Returns size if value is too large.
+ usint inverseSA(usint value);
+
+ // Returns the value of ith sample in suffix array order.
+ inline usint getSample(usint i)
+ {
+ return std::min(this->samples->readItem(i) * this->rate, this->size - 1);
+ }
+
+ // Returns (ind, sample number) where ind >= index or (size, ???).
+ pair_type getFirstSampleAfter(usint index);
+
+ inline usint getSampleRate() { return this->rate; }
+ inline usint getNumberOfSamples() { return this->items; }
+
+ usint reportSize();
+
+ private:
+ usint integer_bits;
+ usint rate, size, items;
+
+ DeltaVector* indexes;
+
+ FastBitBuffer* samples;
+ FastBitBuffer* inverse_samples;
+
+ void buildInverseSamples();
+
+ // Note: contents of original samples are deleted.
+ void mergeSamples(SASamples& index, SASamples& increment, usint* positions, usint n);
+};
+
+
+} // namespace CSA
+
+
+#endif // SASAMPLES_H
dcover_obs = dcover/difference_cover.o
-testTextCollection_obs = testTextCollection.o TextCollection.o CSA.o Tools.o BitRank.o bittree.o rbtree.o dynFMI.o RLWaveletTree.o GapEncode.o BSGAP.o ${LIBCDSA} ${dcover_obs}
+TextCollection_obs = TextCollection.o TextCollectionBuilder.o TCImplementation.o Tools.o BitRank.o \
+ BSGAP.o incbwt/rlcsa.a ${LIBCDSA} ${dcover_obs}
+TCDebug_obs = bittree.o rbtree.o dynFMI.o
-timeTextCollection_obs = timeTextCollection.o TextCollection.o CSA.o Tools.o BitRank.o bittree.o rbtree.o dynFMI.o RLWaveletTree.o GapEncode.o BSGAP.o ${LIBCDSA} ${dcover_obs}
+all: testTextCollection
-all: $(testTextCollection_obs) testTextCollection
+testTextCollection: testTextCollection.o $(TextCollection_obs) $(TCDebug_obs) HeapProfiler.o
+ $(CC) -o testTextCollection testTextCollection.o $(TextCollection_obs) $(TCDebug_obs) HeapProfiler.o
-testTextCollection: $(testTextCollection_obs) HeapProfiler.o
- $(CC) -o testTextCollection $(testTextCollection_obs) HeapProfiler.o
+timeTextCollection: timeTextCollection.o $(TextCollection_obs) $(TCDebug_obs)
+ $(CC) -o timeTextCollection timeTextCollection.o $(TextCollection_obs) $(TCDebug_obs)
+
+incbwt/rlcsa.a:
+ @make -C incbwt
-timeTextCollection: $(timeTextCollection_obs)
- $(CC) -o timeTextCollection $(timeTextCollection_obs)
clean:
+ @make clean -C incbwt
rm -f core *.o *~ testTextCollection timeTextCollection dcover/*.o dcover/*~
depend:
- $(CC) -MM *.cpp *.c > dependencies.mk
+ @make depend -C incbwt
+ $(CC) -MM *.cpp > dependencies.mk
include dependencies.mk
using std::cin;
using std::string;
-#include "TextCollection.h"
+#include "TextCollectionBuilder.h"
#include "HeapProfiler.h"
+using SXSI::TextCollectionBuilder;
using SXSI::TextCollection;
void printDocumentResult(TextCollection::document_result dr)
int i = 0 ,j = 0;
int heap_base = HeapProfiler::GetHeapConsumption();
std::cerr << "Initial heap usage : " << heap_base << "\n";
- TextCollection *csa = TextCollection::InitTextCollection(5); // Avoid small samplerates ;)
+ TextCollectionBuilder *tcb = new TextCollectionBuilder(32);
heap_base = HeapProfiler::GetHeapConsumption ();
std::cerr << "Heap usage after InitTextCollection : " << heap_base << "\n";
-
+ Tools::StartTimer();
while (not(cin.eof())){
getline(cin,str); // Read line by line.
// cin >> str; // Read word by word.
data = (uchar *) str.c_str();
- csa->InsertText(data);
+ if (str.size() == 0)
+ continue;
+
+ tcb->InsertText(data);
i++;
j+= str.size();
str.clear();
- if ( i % 1000 == 0) {
+ if ( i % 100000 == 0) {
std::cerr << "Inserted : " << i << " strings\n";
std::cerr << "Number of bytes inserted : " << j << "b \n";
std::cerr << "Heap usage used for strings: " << HeapProfiler::GetHeapConsumption() - heap_base
};
};
+/**/
+ //the whole file as 20 strings:
+ /* uchar *temp = Tools::GetFileContents("data/english.100MB", 0);
+ ulong n = strlen((char *)temp);
+ std::cout << "n = " << n << std::endl;
+ ulong offset = n/40;
+ uchar *it = temp;
+ for (i = 0; i < 5; ++i)
+ {
+ it[offset] = '\0';
+ tcb->InsertText(it);
+ std::cout << "inserted " << strlen((char *)it) << " bytes." << std::endl;
+ it += offset +1;
+ }
+ it -= offset+1;
-/* the whole file as one string:
- uchar *temp = Tools::GetFileContents("data.txt", 0);
- csa->InsertText(temp);
+ if (it > temp + n)
+ std::cout << "over bounds" << std::endl;
delete [] temp;*/
+
+ HeapProfiler::ResetMaxHeapConsumption();
std::cerr << "Creating new text collection with " << i << " strings (total " << j/1024 << " kb)\n";
std::cerr << "Before MakeStatic() [press enter]\n";
- std::cin >> kbd;
+ //std::cin >> kbd;
// This will print the maximum mem usage during construction time:
std::cerr << "max heap usage: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
- csa->MakeStatic();
+ TextCollection* tc = tcb->InitTextCollection();
+ delete tcb; tcb = 0;
std::cerr << "After MakeStatic() [press enter]\n";
// This will print the maximum mem usage during MakeStatic():
std::cerr << "max heap usage: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
- std::cin >> kbd;
+ //std::cin >> kbd;
std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
- delete csa;
+ delete tc;
std::cerr << "After Delete [press enter]\n";
- std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
- std::cin >> kbd;
+ std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption() << " bytes" << std::endl;
+ //std::cin >> kbd;
return 0;
}
-#include "TextCollection.h"
+#include "TextCollectionBuilder.h"
using SXSI::TextCollection;
+using SXSI::TextCollectionBuilder;
int main(int argc, char**argv)
{
- TextCollection *csa = TextCollection::InitTextCollection(64);
+ TextCollectionBuilder *tcb = new TextCollectionBuilder(64);
STARTTIMER();
while (not(cin.eof()) && num_str < 100000 ){
getline(cin,str); // Read line by line.
if (str.compare("----------") == 0){
- csa->InsertText((unsigned char*) buffer.c_str());
+ tcb->InsertText((unsigned char*) buffer.c_str());
if (num_str % 10000 == 0){
STOPTIMER();
};
std::cerr << "Calling MakeStatic()\n";
- csa->MakeStatic();
+ TextCollection *tc = tcb->InitTextCollection();
std::cerr << "Statistics: " << num_str << " strings, " << max_str << " = max length\n";
int count;
for (unsigned int i = 0; i < (sizeof(words)/sizeof(char*)) ; i++){
STARTTIMER();
- is = csa->IsContains((unsigned char*) words[i].c_str());
+ is = tc->IsContains((unsigned char*) words[i].c_str());
STOPTIMER();
time = GETTIME();
STARTTIMER();
- count = csa->Count((unsigned char*) words[i].c_str());
+ count = tc->Count((unsigned char*) words[i].c_str());
STOPTIMER();
time = GETTIME();
STARTTIMER();
- count = csa->CountContains((unsigned char*) words[i].c_str());
+ count = tc->CountContains((unsigned char*) words[i].c_str());
STOPTIMER();
time = GETTIME();
STARTTIMER();
- res = csa->Contains((unsigned char*) words[i].c_str());
+ res = tc->Contains((unsigned char*) words[i].c_str());
STOPTIMER();
time = GETTIME();