TCImplementation.h

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *                                                                            *
   5  *   This program is free software; you can redistribute it and/or modify     *
   6  *   it under the terms of the GNU Lesser General Public License as published *
   7  *   by the Free Software Foundation; either version 2 of the License, or     *
   8  *   (at your option) any later version.                                      *
   9  *                                                                            *
  10  *   This program is distributed in the hope that it will be useful,          *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  13  *   GNU Lesser General Public License for more details.                      *
  14  *                                                                            *
  15  *   You should have received a copy of the GNU Lesser General Public License *
  16  *   along with this program; if not, write to the                            *
  17  *   Free Software Foundation, Inc.,                                          *
  18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  19  *****************************************************************************/
  20
  21 #ifndef _TCImplementation_H_
  22 #define _TCImplementation_H_
  23 #include "BitRank.h"
  24 #include "TextCollection.h"
  25 #include "BlockArray.h"
  26
  27 // Include  from XMLTree/libcds
  28 #include <basics.h> // Defines W == 32
  29 #include <static_bitsequence.h>
  30 #include <alphabet_mapper.h>
  31 #include <static_sequence.h>
  32 #include <static_sequence_wvtree_noptrs.h>
  33
  34 // Re-define word size to ulong:
  35 #undef W
  36 #if __WORDSIZE == 64
  37 #   define W 64
  38 #else
  39 #   define W 32
  40 #endif
  41 #undef bitset
  42 #undef bitget
  43
  44 #include "TextStorage.h"
  45 #include "ArrayDoc.h"
  46 #include <set>
  47
  48 namespace SXSI
  49 {
  50
  51
  52 /**
  53  * Implementation of the TextCollection interface
  54  *
  55  */
  56 class TCImplementation : public SXSI::TextCollection {
  57 public:
  58     TCImplementation(uchar *, ulong, unsigned, unsigned, ulong, ulong, char);
  59     ~TCImplementation();
  60
  61     bool EmptyText(DocId) const;
  62
  63     /**
  64      * Extracting one text.
  65      *
  66      * Call DeleteText() for each pointer returned by GetText()
  67      * to avoid possible memory leaks.
  68      */
  69     uchar * GetText(DocId) const;
  70     void DeleteText(uchar *text) const
  71     { textStorage->DeleteText(text); }
  72
  73     /**
  74      * Returns a pointer to the beginning of texts i, i+1, ..., j.
  75      * Texts are separated by a '\0' byte.
  76      *
  77      * Call DeleteText() for each pointer returned by GetText()
  78      * to avoid possible memory leaks.
  79      */
  80     uchar * GetText(DocId i, DocId j) const
  81     { return textStorage->GetText(i, j); }
  82
  83     /**
  84      * Returns a substring of given text ID.
  85      *
  86      * FIXME This may be reimplemented via TextStorage.
  87      */
  88 //    uchar* GetText(DocId, TextPosition, TextPosition) const;
  89
  90     bool IsPrefix(uchar const *) const;
  91     bool IsSuffix(uchar const *) const;
  92     bool IsEqual(uchar const *) const;
  93     bool IsContains(uchar const *) const;
  94     bool IsLessThan(uchar const *) const;
  95
  96     bool IsPrefix(uchar const *, DocId, DocId) const;
  97     bool IsSuffix(uchar const *, DocId, DocId) const;
  98     bool IsEqual(uchar const *, DocId, DocId) const;
  99     bool IsContains(uchar const *, DocId, DocId) const;
 100     bool IsLessThan(uchar const *, DocId, DocId) const;
 101
 102     ulong Count(uchar const *) const;
 103     unsigned CountPrefix(uchar const *) const;
 104     unsigned CountSuffix(uchar const *) const;
 105     unsigned CountEqual(uchar const *) const;
 106     unsigned CountContains(uchar const *) const;
 107     unsigned CountLessThan(const unsigned char*) const;
 108
 109     unsigned CountPrefix(uchar const *, DocId, DocId) const;
 110     unsigned CountSuffix(uchar const *, DocId, DocId) const;
 111     unsigned CountEqual(uchar const *, DocId, DocId) const;
 112     unsigned CountContains(uchar const *, DocId, DocId) const;
 113     unsigned CountLessThan(uchar const *, DocId, DocId) const;
 114
 115     // Definition of document_result is inherited from SXSI::TextCollection.
 116     document_result Prefix(uchar const *) const;
 117     document_result Suffix(uchar const *) const;
 118     document_result Equal(uchar const *) const;
 119     document_result Contains(uchar const *) const;
 120     document_result LessThan(uchar const *) const;
 121     document_result KMismaches(uchar const *, unsigned) const;
 122     document_result KErrors(uchar const *, unsigned) const;
 123
 124     document_result Prefix(uchar const *, DocId, DocId) const;
 125     document_result Suffix(uchar const *, DocId, DocId) const;
 126     document_result Equal(uchar const *, DocId, DocId) const;
 127     document_result Contains(uchar const *, DocId, DocId) const;
 128     document_result LessThan(uchar const *, DocId, DocId) const;
 129
 130     // Definition of full_result is inherited from SXSI::TextCollection.
 131     full_result FullContains(uchar const *) const;
 132     full_result FullContains(uchar const *, DocId, DocId) const;
 133     full_result FullKMismatches(uchar const *, unsigned) const;
 134     full_result FullKErrors(uchar const *, unsigned) const;
 135
 136     // Index from/to disk
 137     TCImplementation(FILE *, unsigned);
 138     void Save(FILE *) const;
 139
 140 private:
 141     typedef std::vector<std::pair<ulong, ulong> > suffix_range_vector;
 142
 143     static const uchar versionFlag;
 144     TextPosition n;
 145     unsigned samplerate;
 146     unsigned C[256];
 147     TextPosition bwtEndPos;
 148     static_sequence * alphabetrank;
 149
 150     // Sample structures for texts longer than samplerate
 151     static_bitsequence * sampled;
 152     BlockArray *suffixes;
 153     BlockArray *suffixDocId;
 154
 155     // Total number of texts in the collection
 156     unsigned numberOfTexts;
 157     // Length of the longest text
 158     ulong maxTextLength;
 159
 160     // Array of document id's in the order of end-markers in BWT
 161 //    static_sequence *Doc;
 162     ArrayDoc *Doc;
 163
 164     // Text storage for fast extraction
 165     TextStorage * textStorage;
 166
 167     // Following methods are not part of the public API
 168     uchar * BWT(uchar *);
 169     void makewavelet(uchar *);
 170     void maketables(ulong, char);
 171     DocId DocIdAtTextPos(BlockArray*, TextPosition) const;
 172     ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
 173     ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const;
 174     ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
 175     ulong searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const;
 176     ulong kmismatches(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned) const;
 177     ulong kerrors(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned, ulong const *, ulong) const;
 178     /**
 179      * Count end-markers in given interval
 180      */
 181     inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
 182     {
 183         if (sp > ep)
 184             return 0;
 185
 186         ulong ranksp = 0;
 187         if (sp != 0)
 188             ranksp = alphabetrank->rank(0, sp - 1);
 189
 190         return alphabetrank->rank(0, ep) - ranksp;
 191     }
 192
 193     /**
 194      * Count end-markers in given interval and
 195      * within docIds [min,max]
 196      */
 197     inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const
 198     {
 199         if (sp != 0)
 200             sp = alphabetrank->rank(0, sp - 1);
 201         ep = alphabetrank->rank(0, ep);
 202         if (ep == 0)
 203             return 0;
 204
 205         return Doc->count(sp, ep-1, min, max);
 206     }
 207
 208     /**
 209      * Enumerate all end-markers in given interval
 210      */
 211     inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep) const
 212     {
 213         if (sp != 0)
 214             sp = alphabetrank->rank(0, sp - 1);
 215         ep = alphabetrank->rank(0, ep);
 216         if (ep == 0)
 217             return document_result();
 218
 219         return Doc->accessAll(sp, ep-1);
 220     }
 221
 222     /**
 223      * Enumerate end-markers in given interval and
 224      * within docIds [min,max]
 225      */
 226     inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const
 227     {
 228         if (sp != 0)
 229             sp = alphabetrank->rank(0, sp - 1);
 230         ep = alphabetrank->rank(0, ep);
 231         if (ep == 0)
 232             return document_result();
 233
 234         return Doc->access(sp, ep-1, min, max);
 235     }
 236
 237     /**
 238      * Enumerate documents in given interval [sp, ep]
 239      */
 240     inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep) const
 241     {
 242         // We want unique document indentifiers, using std::set to collect them
 243         // FIXME use unordered_set?
 244         uint tmp_rank_c = 0; // Cache rank value of c.
 245         for (; sp <= ep; ++sp)
 246         {
 247             TextPosition i = sp;
 248             uchar c = alphabetrank->access(i, tmp_rank_c);
 249             while (c != '\0' && !sampled->access(i))
 250             {
 251                 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
 252                 c = alphabetrank->access(i, tmp_rank_c);
 253             }
 254             if (c == '\0')
 255             {
 256                 // Rank among the end-markers in BWT
 257                 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
 258                 resultSet.insert(Doc->access(endmarkerRank));
 259             }
 260             else
 261             {
 262                 DocId di = (*suffixDocId)[sampled->rank1(i)-1];
 263                 assert((unsigned)di < numberOfTexts);
 264                 resultSet.insert(di);
 265             }
 266         }
 267     }
 268
 269     /**
 270      * Enumerate documents in given interval [sp, ep]
 271      * and within [begin, end]
 272      */
 273     inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
 274     {
 275         // We want unique document indentifiers, using std::set to collect them
 276         uint tmp_rank_c = 0; // Cache rank value of c.
 277         for (; sp <= ep; ++sp)
 278         {
 279             TextPosition i = sp;
 280             uchar c = alphabetrank->access(i, tmp_rank_c);
 281             while (c != '\0' && !sampled->access(i))
 282             {
 283                 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
 284                 c = alphabetrank->access(i, tmp_rank_c);
 285             }
 286             if (c == '\0')
 287             {
 288                 // Rank among the end-markers in BWT
 289                 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
 290                 DocId docId = Doc->access(endmarkerRank);
 291                 if (docId >= begin && docId <= end)
 292                     resultSet.insert(docId);
 293             }
 294             else
 295             {
 296                 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 297                 assert((unsigned)docId < numberOfTexts);
 298                 if (docId >= begin && docId <= end)
 299                     resultSet.insert(docId);
 300             }
 301         }
 302     }
 303
 304     /**
 305      * Enumerate document+position pairs (full_result) of
 306      * each suffix in given interval.
 307      */
 308     inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep) const
 309     {
 310         uint tmp_rank_c = 0; // Cache rank value of c.
 311         for (; sp <= ep; ++sp)
 312         {
 313             TextPosition i = sp;
 314             TextPosition dist = 0;
 315             uchar c = alphabetrank->access(i, tmp_rank_c);
 316             while (c != '\0' && !sampled->access(i))
 317             {
 318                 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
 319                 c = alphabetrank->access(i, tmp_rank_c);
 320                 ++ dist;
 321             }
 322             if (c == '\0')
 323             {
 324                 // Rank among the end-markers in BWT
 325                 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
 326                 DocId docId = Doc->access(endmarkerRank);
 327                 result.push_back(make_pair(docId, dist));
 328             }
 329             else
 330             {
 331                 TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
 332                 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 333
 334                 result.push_back(make_pair(docId, textPos));
 335             }
 336         }
 337     }
 338
 339     /**
 340      * Enumerate document+position pairs (full_result) of
 341      * each suffix in given interval and within [begin, end].
 342      */
 343     inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
 344     {
 345         uint tmp_rank_c = 0; // Cache rank value of c.
 346         for (; sp <= ep; ++sp)
 347         {
 348             TextPosition i = sp;
 349             TextPosition dist = 0;
 350             uchar c = alphabetrank->access(i, tmp_rank_c);
 351             while (c != '\0' && !sampled->access(i))
 352             {
 353                 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
 354                 c = alphabetrank->access(i, tmp_rank_c);
 355                 ++ dist;
 356             }
 357             if (c == '\0')
 358             {
 359                 // Rank among the end-markers in BWT
 360                 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
 361                 DocId docId = Doc->access(endmarkerRank);
 362                 if (docId >= begin && docId <= end)
 363                     result.push_back(make_pair(docId, dist));
 364             }
 365             else
 366             {
 367                 TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
 368                 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 369
 370                 if (docId >= begin && docId <= end)
 371                     result.push_back(make_pair(docId, textPos));
 372             }
 373         }
 374     }
 375
 376 }; // class TCImplementation
 377
 378 } // namespace SXSI
 379
 380 #endif