+++ /dev/null
-/******************************************************************************
- * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
- * *
- * *
- * This program is free software; you can redistribute it and/or modify *
- * it under the terms of the GNU Lesser General Public License as published *
- * by the Free Software Foundation; either version 2 of the License, or *
- * (at your option) any later version. *
- * *
- * This program is distributed in the hope that it will be useful, *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
- * GNU Lesser General Public License for more details. *
- * *
- * You should have received a copy of the GNU Lesser General Public License *
- * along with this program; if not, write to the *
- * Free Software Foundation, Inc., *
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
- *****************************************************************************/
-
-#ifndef _CSA_H_
-#define _CSA_H_
-#include "dynFMI.h"
-#include "BitRank.h"
-#include "TextCollection.h"
-#include "BlockArray.h"
-#include "RLWaveletTree.h"
-#include "StringIterator.h"
-#include <set>
-#include <vector>
-
-// Include from XMLTree/libcds
-#include <basics.h> // Defines W == 32
-#include <static_bitsequence.h>
-#include <alphabet_mapper.h>
-#include <static_sequence.h>
-
-// Re-define word size to ulong:
-#undef W
-#if __WORDSIZE == 64
-# define W 64
-#else
-# define W 32
-#endif
-#undef bitset
-
-namespace SXSI
-{
-
-// Un-comment to compare BWT against a BWT generated from class dynFMI:
-//#define CSA_TEST_BWT
-
-/**
- * Implementation of the TextCollection interface
- *
- */
-class CSA : public SXSI::TextCollection {
-public:
- /**
- * Constructor with (default) samplerate
- */
- explicit CSA(unsigned);
- ~CSA();
- /**
- * Following functions implement the interface described in TextCollection.h.
- * For details/documentation, see TextCollection.h.
- */
- void InsertText(uchar const *);
- void MakeStatic();
- bool EmptyText(DocId) const;
- uchar* GetText(DocId) const;
- /**
- * Next method is not supported:
- * Supporting GetText for some substring [i,j]
- * would require more space.
- */
-// uchar* GetText(DocId, TextPosition, TextPosition) const;
-
- bool IsPrefix(uchar const *) const;
- bool IsSuffix(uchar const *) const;
- bool IsEqual(uchar const *) const;
- bool IsContains(uchar const *) const;
- bool IsLessThan(uchar const *) const;
-
- bool IsPrefix(uchar const *, DocId, DocId) const;
- bool IsSuffix(uchar const *, DocId, DocId) const;
- bool IsEqual(uchar const *, DocId, DocId) const;
- bool IsContains(uchar const *, DocId, DocId) const;
- bool IsLessThan(uchar const *, DocId, DocId) const;
-
- ulong Count(uchar const *) const;
- unsigned CountPrefix(uchar const *) const;
- unsigned CountSuffix(uchar const *) const;
- unsigned CountEqual(uchar const *) const;
- unsigned CountContains(uchar const *) const;
- unsigned CountLessThan(const unsigned char*) const;
-
- unsigned CountPrefix(uchar const *, DocId, DocId) const;
- unsigned CountSuffix(uchar const *, DocId, DocId) const;
- unsigned CountEqual(uchar const *, DocId, DocId) const;
- unsigned CountContains(uchar const *, DocId, DocId) const;
- unsigned CountLessThan(uchar const *, DocId, DocId) const;
-
- // Definition of document_result is inherited from SXSI::TextCollection.
- document_result Prefix(uchar const *) const;
- document_result Suffix(uchar const *) const;
- document_result Equal(uchar const *) const;
- document_result Contains(uchar const *) const;
- document_result LessThan(uchar const *) const;
-
- document_result Prefix(uchar const *, DocId, DocId) const;
- document_result Suffix(uchar const *, DocId, DocId) const;
- document_result Equal(uchar const *, DocId, DocId) const;
- document_result Contains(uchar const *, DocId, DocId) const;
- document_result LessThan(uchar const *, DocId, DocId) const;
-
- // Definition of full_result is inherited from SXSI::TextCollection.
- full_result FullContains(uchar const *) const;
- full_result FullContains(uchar const *, DocId, DocId) const;
-
- // Index from/to disk
- void Load(FILE *, unsigned);
- void Save(FILE *) const;
-
-
- // FIXME Remove
- void deleteWT()
- {
- delete alphabetrank;
- alphabetrank = 0;
- delete [] codetable;
- codetable = 0;
- }
- void deleteSamples()
- {
- delete sampled;
- sampled =0;
- delete suffixes;
- suffixes = 0;
- delete suffixDocId;
- suffixDocId = 0;
- }
- void deleteEndmarker()
- {
- delete endmarkerDocId;
- endmarkerDocId = 0;
- }
-
-private:
- // FIXME Unused code
- class TCodeEntry {
- public:
- unsigned count;
- unsigned bits;
- unsigned code;
- TCodeEntry() {count=0;bits=0;code=0u;};
- };
-
-
- // FIXME Unused code
- class THuffAlphabetRank {
- // using fixed 0...255 alphabet
- private:
- BitRank *bitrank;
- THuffAlphabetRank *left;
- THuffAlphabetRank *right;
- TCodeEntry *codetable;
- uchar ch;
- bool leaf;
- public:
- THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned);
- THuffAlphabetRank(FILE *);
- ~THuffAlphabetRank();
-
- void Save(FILE *);
- bool Test(uchar *, TextPosition);
-
- inline TextPosition rank(int c, TextPosition i) const { // returns the number of characters c before and including position i
- THuffAlphabetRank const * temp=this;
- if (codetable[c].count == 0) return 0;
- unsigned level = 0;
- unsigned code = codetable[c].code;
- while (!temp->leaf) {
- if ((code & (1u<<level)) == 0) {
- i = i-temp->bitrank->rank(i);
- temp = temp->left;
- }
- else {
- i = temp->bitrank->rank(i)-1;
- temp = temp->right;
- }
- ++level;
- }
- return i+1;
- };
- inline bool IsCharAtPos(int c, TextPosition i) const {
- THuffAlphabetRank const * temp=this;
- if (codetable[c].count == 0) return false;
- unsigned level = 0;
- unsigned code = codetable[c].code;
- while (!temp->leaf) {
- if ((code & (1u<<level))==0) {
- if (temp->bitrank->IsBitSet(i)) return false;
- i = i-temp->bitrank->rank(i);
- temp = temp->left;
- }
- else {
- if (!temp->bitrank->IsBitSet(i)) return false;
- i = temp->bitrank->rank(i)-1;
- temp = temp->right;
- }
- ++level;
- }
- return true;
- }
- inline uchar access(TextPosition i) const {
- THuffAlphabetRank const * temp=this;
- while (!temp->leaf) {
- if (temp->bitrank->IsBitSet(i)) {
- i = temp->bitrank->rank(i)-1;
- temp = temp->right;
- }
- else {
- i = i-temp->bitrank->rank(i);
- temp = temp->left;
- }
- }
- return temp->ch;
- }
-
- inline uchar charAtPos(ulong i, TextPosition *rank) const {
- THuffAlphabetRank const * temp=this;
- while (!temp->leaf) {
- if (temp->bitrank->IsBitSet(i)) {
- i = temp->bitrank->rank(i)-1;
- temp = temp->right;
- } else {
- i = i-temp->bitrank->rank(i);
- temp = temp->left;
- }
- }
- (*rank)=i;
- return temp->ch;
- }
- };
-
- // FIXME Unused code
- class node {
- private:
- unsigned weight;
- uchar value;
- node *child0;
- node *child1;
-
- void maketable( unsigned code, unsigned bits, TCodeEntry *codetable ) const;
- static void count_chars(uchar *, TextPosition , TCodeEntry *);
- static unsigned SetBit(unsigned , unsigned , unsigned );
- public:
- node( unsigned char c = 0, unsigned i = 0 ) {
- value = c;
- weight = i;
- child0 = 0;
- child1 = 0;
- }
-
- node( node* c0, node *c1 ) {
- value = 0;
- weight = c0->weight + c1->weight;
- child0 = c0;
- child1 = c1;
- }
-
-
- bool operator>( const node &a ) const {
- return weight > a.weight;
- }
-
- static TCodeEntry *makecodetable(uchar *, TextPosition);
- };
-
- static const uchar versionFlag;
- TextPosition n;
- unsigned samplerate;
- unsigned C[256];
- TextPosition bwtEndPos;
-// THuffAlphabetRank *alphabetrank;
- // RLWaveletTree *alphabetrank;
- static_sequence * alphabetrank;
-
-#ifdef CSA_TEST_BWT
- DynFMI *dynFMI;
-#endif
- TCodeEntry *codetable;
-
- // Sample structures for texts longer than samplerate
- BSGAP *sampled;
- BlockArray *suffixes;
- BlockArray *suffixDocId;
-
- // Total number of texts in the collection
- unsigned numberOfTexts;
- // Total number of texts including empty texts
- unsigned numberOfAllTexts;
- // Length of the longest text
- ulong maxTextLength;
-
- // Array of document id's in the order of end-markers in BWT
- // Access by endmarkerDocId[rank_$(L, p) - 1].
- BlockArray *endmarkerDocId;
-
- // FIXME Replace with a more succinct data structure
- // Note: Empty texts are already handled inside XMLTree class.
- std::set<unsigned> emptyTextId;
- BSGAP *emptyTextRank;
-
- // FIXME A better solution?
- std::string texts;
-
- // Following are not part of the public API
- uchar * BWT(uchar *);
- void makewavelet(uchar *);
- void maketables();
- DocId DocIdAtTextPos(BlockArray*, TextPosition) const;
- ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
- ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const;
- ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
-// TextPosition Inverse(TextPosition) const;
-// TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
-// TextPosition Psi(TextPosition) const;
-// uchar * Substring(TextPosition, TextPosition) const;
-// TextPosition Lookup(TextPosition) const;
-
- /**
- * Count end-markers in given interval
- */
- inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
- {
- if (sp > ep)
- return 0;
-
- ulong ranksp = 0;
- if (sp != 0)
- ranksp = alphabetrank->rank(0, sp - 1);
-
- return alphabetrank->rank(0, ep) - ranksp;
- }
- unsigned CountEndmarkers(TextPosition, TextPosition, DocId, DocId) const;
-}; // class CSA
-
-} // namespace SXSI
-
-#endif