TCImplementation.h

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *                                                                            *
   5  *   This program is free software; you can redistribute it and/or modify     *
   6  *   it under the terms of the GNU Lesser General Public License as published *
   7  *   by the Free Software Foundation; either version 2 of the License, or     *
   8  *   (at your option) any later version.                                      *
   9  *                                                                            *
  10  *   This program is distributed in the hope that it will be useful,          *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  13  *   GNU Lesser General Public License for more details.                      *
  14  *                                                                            *
  15  *   You should have received a copy of the GNU Lesser General Public License *
  16  *   along with this program; if not, write to the                            *
  17  *   Free Software Foundation, Inc.,                                          *
  18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  19  *****************************************************************************/
  20
  21 #ifndef _TCImplementation_H_
  22 #define _TCImplementation_H_
  23
  24 #include "incbwt/bits/deltavector.h"
  25
  26 #include "BitRank.h"
  27 #include "TextCollection.h"
  28 #include "BlockArray.h"
  29
  30 // Include  from XMLTree/libcds
  31 #include <basics.h> // Defines W == 32
  32 #include <static_bitsequence.h>
  33 #include <alphabet_mapper.h>
  34 #include <static_sequence.h>
  35 #include <static_sequence_wvtree_noptrs.h>
  36
  37 // Re-define word size to ulong:
  38 #undef W
  39 #if __WORDSIZE == 64
  40 #   define W 64
  41 #else
  42 #   define W 32
  43 #endif
  44 #undef bitset
  45 #undef bitget
  46
  47
  48 #include "TextStorage.h"
  49 #include "ArrayDoc.h"
  50 #include <set>
  51 #include <string>
  52
  53 namespace SXSI
  54 {
  55
  56
  57 /**
  58  * Implementation of the TextCollection interface
  59  *
  60  */
  61 class TCImplementation : public SXSI::TextCollection {
  62 public:
  63     TCImplementation(uchar *, ulong, unsigned, unsigned, ulong, ulong,
  64                      CSA::DeltaVector &, const std::string &, char);
  65     ~TCImplementation();
  66
  67     bool EmptyText(DocId) const;
  68
  69     /**
  70      * Extracting one text.
  71      *
  72      * Call DeleteText() for each pointer returned by GetText()
  73      * to avoid possible memory leaks.
  74      */
  75     uchar * GetText(DocId) const;
  76     void DeleteText(uchar *text) const
  77     { textStorage->DeleteText(text); }
  78
  79     /**
  80      * Returns a pointer to the beginning of texts i, i+1, ..., j.
  81      * Texts are separated by a '\0' byte.
  82      *
  83      * Call DeleteText() for each pointer returned by GetText()
  84      * to avoid possible memory leaks.
  85      */
  86     uchar * GetText(DocId i, DocId j) const
  87     { return textStorage->GetText(i, j); }
  88
  89     /**
  90      * Returns a substring of given text ID.
  91      *
  92      * FIXME This may be reimplemented via TextStorage.
  93      */
  94 //    uchar* GetText(DocId, TextPosition, TextPosition) const;
  95
  96     bool IsPrefix(uchar const *) const;
  97     bool IsSuffix(uchar const *) const;
  98     bool IsEqual(uchar const *) const;
  99     bool IsContains(uchar const *) const;
 100     bool IsLessThan(uchar const *) const;
 101
 102     bool IsPrefix(uchar const *, DocId, DocId) const;
 103     bool IsSuffix(uchar const *, DocId, DocId) const;
 104     bool IsEqual(uchar const *, DocId, DocId) const;
 105     bool IsContains(uchar const *, DocId, DocId) const;
 106     bool IsLessThan(uchar const *, DocId, DocId) const;
 107
 108     ulong Count(uchar const *) const;
 109     unsigned CountPrefix(uchar const *) const;
 110     unsigned CountSuffix(uchar const *) const;
 111     unsigned CountEqual(uchar const *) const;
 112     unsigned CountContains(uchar const *) const;
 113     unsigned CountLessThan(const unsigned char*) const;
 114
 115     unsigned CountPrefix(uchar const *, DocId, DocId) const;
 116     unsigned CountSuffix(uchar const *, DocId, DocId) const;
 117     unsigned CountEqual(uchar const *, DocId, DocId) const;
 118     unsigned CountContains(uchar const *, DocId, DocId) const;
 119     unsigned CountLessThan(uchar const *, DocId, DocId) const;
 120
 121     // Definition of document_result is inherited from SXSI::TextCollection.
 122     document_result Prefix(uchar const *) const;
 123     document_result Suffix(uchar const *) const;
 124     document_result Equal(uchar const *) const;
 125     document_result Contains(uchar const *) const;
 126     document_result LessThan(uchar const *) const;
 127     document_result KMismaches(uchar const *, unsigned) const;
 128     document_result KErrors(uchar const *, unsigned) const;
 129
 130     document_result Prefix(uchar const *, DocId, DocId) const;
 131     document_result Suffix(uchar const *, DocId, DocId) const;
 132     document_result Equal(uchar const *, DocId, DocId) const;
 133     document_result Contains(uchar const *, DocId, DocId) const;
 134     document_result LessThan(uchar const *, DocId, DocId) const;
 135
 136     // Definition of full_result is inherited from SXSI::TextCollection.
 137     full_result FullContains(uchar const *) const;
 138     full_result FullContains(uchar const *, DocId, DocId) const;
 139     full_result FullKMismatches(uchar const *, unsigned) const;
 140     full_result FullKErrors(uchar const *, unsigned) const;
 141
 142     // Index from/to disk
 143     TCImplementation(FILE *, unsigned);
 144     void Save(FILE *) const;
 145
 146 private:
 147     typedef std::vector<std::pair<ulong, ulong> > suffix_range_vector;
 148
 149     static const uchar versionFlag;
 150     TextPosition n;
 151     unsigned samplerate;
 152     unsigned C[256];
 153     TextPosition bwtEndPos;
 154     static_sequence * alphabetrank;
 155
 156     // Sample structures for texts longer than samplerate
 157     static_bitsequence * sampled;
 158     BlockArray *suffixes;
 159     BlockArray *suffixDocId;
 160
 161     // Total number of texts in the collection
 162     unsigned numberOfTexts;
 163     // Length of the longest text
 164     ulong maxTextLength;
 165
 166     // Array of document id's in the order of end-markers in BWT
 167 //    static_sequence *Doc;
 168     ArrayDoc *Doc;
 169
 170     // Text storage for fast extraction
 171     TextStorage * textStorage;
 172
 173     // Following methods are not part of the public API
 174     uchar * BWT(uchar *);
 175     void makewavelet(uchar *);
 176     void maketables(ulong, char, CSA::DeltaVector &, const std::string &);
 177     DocId DocIdAtTextPos(BlockArray*, TextPosition) const;
 178     ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
 179     ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const;
 180     ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
 181     ulong searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const;
 182     ulong kmismatches(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned) const;
 183     ulong kerrors(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned, ulong const *, ulong) const;
 184     /**
 185      * Count end-markers in given interval
 186      */
 187     inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
 188     {
 189         if (sp > ep)
 190             return 0;
 191
 192         ulong ranksp = 0;
 193         if (sp != 0)
 194             ranksp = alphabetrank->rank(0, sp - 1);
 195
 196         return alphabetrank->rank(0, ep) - ranksp;
 197     }
 198
 199     /**
 200      * Count end-markers in given interval and
 201      * within docIds [min,max]
 202      */
 203     inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const
 204     {
 205         if (sp != 0)
 206             sp = alphabetrank->rank(0, sp - 1);
 207         ep = alphabetrank->rank(0, ep);
 208         if (ep == 0)
 209             return 0;
 210
 211         return Doc->count(sp, ep-1, min, max);
 212     }
 213
 214     /**
 215      * Enumerate all end-markers in given interval
 216      */
 217     inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep) const
 218     {
 219         if (sp != 0)
 220             sp = alphabetrank->rank(0, sp - 1);
 221         ep = alphabetrank->rank(0, ep);
 222         if (ep == 0)
 223             return document_result();
 224
 225         return Doc->accessAll(sp, ep-1);
 226     }
 227
 228     /**
 229      * Enumerate end-markers in given interval and
 230      * within docIds [min,max]
 231      */
 232     inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const
 233     {
 234         if (sp != 0)
 235             sp = alphabetrank->rank(0, sp - 1);
 236         ep = alphabetrank->rank(0, ep);
 237         if (ep == 0)
 238             return document_result();
 239
 240         return Doc->access(sp, ep-1, min, max);
 241     }
 242
 243     /**
 244      * Enumerate documents in given interval [sp, ep]
 245      */
 246     inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep) const
 247     {
 248         // We want unique document indentifiers, using std::set to collect them
 249         // FIXME use unordered_set?
 250         uint tmp_rank_c = 0; // Cache rank value of c.
 251         for (; sp <= ep; ++sp)
 252         {
 253             TextPosition i = sp;
 254             uchar c = alphabetrank->access(i, tmp_rank_c);
 255             while (c != '\0' && !sampled->access(i))
 256             {
 257                 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
 258                 c = alphabetrank->access(i, tmp_rank_c);
 259             }
 260             if (c == '\0')
 261             {
 262                 // Rank among the end-markers in BWT
 263                 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
 264                 resultSet.insert(Doc->access(endmarkerRank));
 265             }
 266             else
 267             {
 268                 DocId di = (*suffixDocId)[sampled->rank1(i)-1];
 269                 assert((unsigned)di < numberOfTexts);
 270                 resultSet.insert(di);
 271             }
 272         }
 273     }
 274
 275     /**
 276      * Enumerate documents in given interval [sp, ep]
 277      * and within [begin, end]
 278      */
 279     inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
 280     {
 281         // We want unique document indentifiers, using std::set to collect them
 282         uint tmp_rank_c = 0; // Cache rank value of c.
 283         for (; sp <= ep; ++sp)
 284         {
 285             TextPosition i = sp;
 286             uchar c = alphabetrank->access(i, tmp_rank_c);
 287             while (c != '\0' && !sampled->access(i))
 288             {
 289                 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
 290                 c = alphabetrank->access(i, tmp_rank_c);
 291             }
 292             if (c == '\0')
 293             {
 294                 // Rank among the end-markers in BWT
 295                 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
 296                 DocId docId = Doc->access(endmarkerRank);
 297                 if (docId >= begin && docId <= end)
 298                     resultSet.insert(docId);
 299             }
 300             else
 301             {
 302                 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 303                 assert((unsigned)docId < numberOfTexts);
 304                 if (docId >= begin && docId <= end)
 305                     resultSet.insert(docId);
 306             }
 307         }
 308     }
 309
 310     /**
 311      * Enumerate document+position pairs (full_result) of
 312      * each suffix in given interval.
 313      */
 314     inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep) const
 315     {
 316         uint tmp_rank_c = 0; // Cache rank value of c.
 317         for (; sp <= ep; ++sp)
 318         {
 319             TextPosition i = sp;
 320             TextPosition dist = 0;
 321             uchar c = alphabetrank->access(i, tmp_rank_c);
 322             while (c != '\0' && !sampled->access(i))
 323             {
 324                 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
 325                 c = alphabetrank->access(i, tmp_rank_c);
 326                 ++ dist;
 327             }
 328             if (c == '\0')
 329             {
 330                 // Rank among the end-markers in BWT
 331                 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
 332                 DocId docId = Doc->access(endmarkerRank);
 333                 result.push_back(std::make_pair(docId, dist));
 334             }
 335             else
 336             {
 337                 TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
 338                 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 339
 340                 result.push_back(std::make_pair(docId, textPos));
 341             }
 342         }
 343     }
 344
 345     /**
 346      * Enumerate document+position pairs (full_result) of
 347      * each suffix in given interval and within [begin, end].
 348      */
 349     inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
 350     {
 351         uint tmp_rank_c = 0; // Cache rank value of c.
 352         for (; sp <= ep; ++sp)
 353         {
 354             TextPosition i = sp;
 355             TextPosition dist = 0;
 356             uchar c = alphabetrank->access(i, tmp_rank_c);
 357             while (c != '\0' && !sampled->access(i))
 358             {
 359                 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
 360                 c = alphabetrank->access(i, tmp_rank_c);
 361                 ++ dist;
 362             }
 363             if (c == '\0')
 364             {
 365                 // Rank among the end-markers in BWT
 366                 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
 367                 DocId docId = Doc->access(endmarkerRank);
 368                 if (docId >= begin && docId <= end)
 369                   result.push_back(std::make_pair(docId, dist));
 370             }
 371             else
 372             {
 373                 TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
 374                 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 375
 376                 if (docId >= begin && docId <= end)
 377                   result.push_back(std::make_pair(docId, textPos));
 378             }
 379         }
 380     }
 381
 382 }; // class TCImplementation
 383
 384 } // namespace SXSI
 385
 386 #endif