FMIndex.h

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *   FMIndex implementation for the TextCollection interface                  *
   5  *                                                                            *
   6  *   This program is free software; you can redistribute it and/or modify     *
   7  *   it under the terms of the GNU Lesser General Public License as published *
   8  *   by the Free Software Foundation; either version 2 of the License, or     *
   9  *   (at your option) any later version.                                      *
  10  *                                                                            *
  11  *   This program is distributed in the hope that it will be useful,          *
  12  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  13  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  14  *   GNU Lesser General Public License for more details.                      *
  15  *                                                                            *
  16  *   You should have received a copy of the GNU Lesser General Public License *
  17  *   along with this program; if not, write to the                            *
  18  *   Free Software Foundation, Inc.,                                          *
  19  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  20  *****************************************************************************/
  21
  22 #ifndef _FMIndex_H_
  23 #define _FMIndex_H_
  24
  25 #include "incbwt/bits/deltavector.h"
  26
  27 #include "BitRank.h"
  28 #include "TextCollection.h"
  29 #include "BlockArray.h"
  30
  31 // Include  from XMLTree/libcds
  32 #include <basics.h> // Defines W == 32
  33 #include <static_bitsequence.h>
  34 #include <alphabet_mapper.h>
  35 #include <static_sequence.h>
  36 #include <static_sequence_wvtree_noptrs.h>
  37
  38 // Re-define word size to ulong:
  39 #undef W
  40 #if __WORDSIZE == 64
  41 #   define W 64
  42 #else
  43 #   define W 32
  44 #endif
  45 #undef bitset
  46 #undef bitget
  47
  48
  49 #include "TextStorage.h"
  50 #include "ArrayDoc.h"
  51 #include <set>
  52 #include <string>
  53
  54 namespace SXSI
  55 {
  56
  57
  58 /**
  59  * Implementation of the TextCollection interface
  60  *
  61  */
  62 class FMIndex : public SXSI::TextCollection {
  63 public:
  64     FMIndex(uchar *, ulong, unsigned, unsigned, ulong, ulong,
  65             CSA::DeltaVector &, const std::string &, char);
  66     ~FMIndex();
  67
  68     bool EmptyText(DocId) const;
  69
  70     /**
  71      * Extracting one text.
  72      *
  73      * Call DeleteText() for each pointer returned by GetText()
  74      * to avoid possible memory leaks.
  75      */
  76     uchar * GetText(DocId) const;
  77     void DeleteText(uchar *text) const
  78     { textStorage->DeleteText(text); }
  79
  80     /**
  81      * Returns a pointer to the beginning of texts i, i+1, ..., j.
  82      * Texts are separated by a '\0' byte.
  83      *
  84      * Call DeleteText() for each pointer returned by GetText()
  85      * to avoid possible memory leaks.
  86      */
  87     uchar * GetText(DocId i, DocId j) const
  88     { return textStorage->GetText(i, j); }
  89
  90     /**
  91      * Returns a substring of given text ID.
  92      *
  93      * FIXME This may be reimplemented via TextStorage.
  94      */
  95 //    uchar* GetText(DocId, TextPosition, TextPosition) const;
  96
  97     bool IsPrefix(uchar const *) const;
  98     bool IsSuffix(uchar const *) const;
  99     bool IsEqual(uchar const *) const;
 100     bool IsContains(uchar const *) const;
 101     bool IsLessThan(uchar const *) const;
 102
 103     bool IsPrefix(uchar const *, DocId, DocId) const;
 104     bool IsSuffix(uchar const *, DocId, DocId) const;
 105     bool IsEqual(uchar const *, DocId, DocId) const;
 106     bool IsContains(uchar const *, DocId, DocId) const;
 107     bool IsLessThan(uchar const *, DocId, DocId) const;
 108
 109     ulong Count(uchar const *) const;
 110     unsigned CountPrefix(uchar const *) const;
 111     unsigned CountSuffix(uchar const *) const;
 112     unsigned CountEqual(uchar const *) const;
 113     unsigned CountContains(uchar const *) const;
 114     unsigned CountLessThan(const unsigned char*) const;
 115
 116     unsigned CountPrefix(uchar const *, DocId, DocId) const;
 117     unsigned CountSuffix(uchar const *, DocId, DocId) const;
 118     unsigned CountEqual(uchar const *, DocId, DocId) const;
 119     unsigned CountContains(uchar const *, DocId, DocId) const;
 120     unsigned CountLessThan(uchar const *, DocId, DocId) const;
 121
 122     // Definition of document_result is inherited from SXSI::TextCollection.
 123     document_result Prefix(uchar const *) const;
 124     document_result Suffix(uchar const *) const;
 125     document_result Equal(uchar const *) const;
 126     document_result Contains(uchar const *) const;
 127     document_result LessThan(uchar const *) const;
 128     document_result KMismaches(uchar const *, unsigned) const;
 129     document_result KErrors(uchar const *, unsigned) const;
 130
 131     document_result Prefix(uchar const *, DocId, DocId) const;
 132     document_result Suffix(uchar const *, DocId, DocId) const;
 133     document_result Equal(uchar const *, DocId, DocId) const;
 134     document_result Contains(uchar const *, DocId, DocId) const;
 135     document_result LessThan(uchar const *, DocId, DocId) const;
 136
 137     // Definition of full_result is inherited from SXSI::TextCollection.
 138     full_result FullContains(uchar const *) const;
 139     full_result FullContains(uchar const *, DocId, DocId) const;
 140     full_result FullKMismatches(uchar const *, unsigned) const;
 141     full_result FullKErrors(uchar const *, unsigned) const;
 142
 143     // Index from/to disk
 144     FMIndex(FILE *, index_mode_t, unsigned);
 145     void Save(FILE *, char const *) const;
 146
 147 private:
 148     typedef std::vector<std::pair<ulong, ulong> > suffix_range_vector;
 149
 150     static const uchar versionFlag;
 151     TextPosition n;
 152     unsigned samplerate;
 153     unsigned C[256];
 154     TextPosition bwtEndPos;
 155     static_sequence * alphabetrank;
 156
 157     // Sample structures for texts longer than samplerate
 158     static_bitsequence * sampled;
 159     BlockArray *suffixes;
 160     BlockArray *suffixDocId;
 161
 162     // Total number of texts in the collection
 163     unsigned numberOfTexts;
 164     // Length of the longest text
 165     ulong maxTextLength;
 166
 167     // Array of document id's in the order of end-markers in BWT
 168 //    static_sequence *Doc;
 169     ArrayDoc *Doc;
 170
 171     // Text storage for fast extraction
 172     TextStorage * textStorage;
 173
 174     // Following methods are not part of the public API
 175     uchar * BWT(uchar *);
 176     void makewavelet(uchar *);
 177     void maketables(ulong, char, CSA::DeltaVector::Iterator &, const std::string &);
 178     DocId DocIdAtTextPos(BlockArray*, TextPosition) const;
 179     ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
 180     ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const;
 181     ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
 182     ulong searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const;
 183     ulong kmismatches(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned) const;
 184     ulong kerrors(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned, ulong const *, ulong) const;
 185     /**
 186      * Count end-markers in given interval
 187      */
 188     inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
 189     {
 190         if (sp > ep)
 191             return 0;
 192
 193         ulong ranksp = 0;
 194         if (sp != 0)
 195             ranksp = alphabetrank->rank(0, sp - 1);
 196
 197         return alphabetrank->rank(0, ep) - ranksp;
 198     }
 199
 200     /**
 201      * Count end-markers in given interval and
 202      * within docIds [min,max]
 203      */
 204     inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const
 205     {
 206         if (sp != 0)
 207             sp = alphabetrank->rank(0, sp - 1);
 208         ep = alphabetrank->rank(0, ep);
 209         if (ep == 0)
 210             return 0;
 211
 212         return Doc->count(sp, ep-1, min, max);
 213     }
 214
 215     /**
 216      * Enumerate all end-markers in given interval
 217      */
 218     inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep) const
 219     {
 220         if (sp != 0)
 221             sp = alphabetrank->rank(0, sp - 1);
 222         ep = alphabetrank->rank(0, ep);
 223         if (ep == 0)
 224             return document_result();
 225
 226         return Doc->accessAll(sp, ep-1);
 227     }
 228
 229     /**
 230      * Enumerate end-markers in given interval and
 231      * within docIds [min,max]
 232      */
 233     inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const
 234     {
 235         if (sp != 0)
 236             sp = alphabetrank->rank(0, sp - 1);
 237         ep = alphabetrank->rank(0, ep);
 238         if (ep == 0)
 239             return document_result();
 240
 241         return Doc->access(sp, ep-1, min, max);
 242     }
 243
 244     /**
 245      * Enumerate documents in given interval [sp, ep]
 246      */
 247     inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep) const
 248     {
 249         // We want unique document indentifiers, using std::set to collect them
 250         // FIXME use unordered_set?
 251         uint tmp_rank_c = 0; // Cache rank value of c.
 252         for (; sp <= ep; ++sp)
 253         {
 254             TextPosition i = sp;
 255             uchar c = alphabetrank->access(i, tmp_rank_c);
 256             while (c != '\0' && !sampled->access(i))
 257             {
 258                 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
 259                 c = alphabetrank->access(i, tmp_rank_c);
 260             }
 261             if (c == '\0')
 262             {
 263                 // Rank among the end-markers in BWT
 264                 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
 265                 resultSet.insert(Doc->access(endmarkerRank));
 266             }
 267             else
 268             {
 269                 DocId di = (*suffixDocId)[sampled->rank1(i)-1];
 270                 assert((unsigned)di < numberOfTexts);
 271                 resultSet.insert(di);
 272             }
 273         }
 274     }
 275
 276     /**
 277      * Enumerate documents in given interval [sp, ep]
 278      * and within [begin, end]
 279      */
 280     inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
 281     {
 282         // We want unique document indentifiers, using std::set to collect them
 283         uint tmp_rank_c = 0; // Cache rank value of c.
 284         for (; sp <= ep; ++sp)
 285         {
 286             TextPosition i = sp;
 287             uchar c = alphabetrank->access(i, tmp_rank_c);
 288             while (c != '\0' && !sampled->access(i))
 289             {
 290                 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
 291                 c = alphabetrank->access(i, tmp_rank_c);
 292             }
 293             if (c == '\0')
 294             {
 295                 // Rank among the end-markers in BWT
 296                 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
 297                 DocId docId = Doc->access(endmarkerRank);
 298                 if (docId >= begin && docId <= end)
 299                     resultSet.insert(docId);
 300             }
 301             else
 302             {
 303                 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 304                 assert((unsigned)docId < numberOfTexts);
 305                 if (docId >= begin && docId <= end)
 306                     resultSet.insert(docId);
 307             }
 308         }
 309     }
 310
 311     /**
 312      * Enumerate document+position pairs (full_result) of
 313      * each suffix in given interval.
 314      */
 315     inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep) const
 316     {
 317         uint tmp_rank_c = 0; // Cache rank value of c.
 318         for (; sp <= ep; ++sp)
 319         {
 320             TextPosition i = sp;
 321             TextPosition dist = 0;
 322             uchar c = alphabetrank->access(i, tmp_rank_c);
 323             while (c != '\0' && !sampled->access(i))
 324             {
 325                 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
 326                 c = alphabetrank->access(i, tmp_rank_c);
 327                 ++ dist;
 328             }
 329             if (c == '\0')
 330             {
 331                 // Rank among the end-markers in BWT
 332                 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
 333                 DocId docId = Doc->access(endmarkerRank);
 334                 result.push_back(std::make_pair(docId, dist));
 335             }
 336             else
 337             {
 338                 TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
 339                 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 340
 341                 result.push_back(std::make_pair(docId, textPos));
 342             }
 343         }
 344     }
 345
 346     /**
 347      * Enumerate document+position pairs (full_result) of
 348      * each suffix in given interval and within [begin, end].
 349      */
 350     inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
 351     {
 352         uint tmp_rank_c = 0; // Cache rank value of c.
 353         for (; sp <= ep; ++sp)
 354         {
 355             TextPosition i = sp;
 356             TextPosition dist = 0;
 357             uchar c = alphabetrank->access(i, tmp_rank_c);
 358             while (c != '\0' && !sampled->access(i))
 359             {
 360                 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
 361                 c = alphabetrank->access(i, tmp_rank_c);
 362                 ++ dist;
 363             }
 364             if (c == '\0')
 365             {
 366                 // Rank among the end-markers in BWT
 367                 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
 368                 DocId docId = Doc->access(endmarkerRank);
 369                 if (docId >= begin && docId <= end)
 370                   result.push_back(std::make_pair(docId, dist));
 371             }
 372             else
 373             {
 374                 TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
 375                 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 376
 377                 if (docId >= begin && docId <= end)
 378                   result.push_back(std::make_pair(docId, textPos));
 379             }
 380         }
 381     }
 382
 383 }; // class FMIndex
 384
 385 } // namespace SXSI
 386
 387 #endif