TCImplementation.h

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *                                                                            *
   5  *   This program is free software; you can redistribute it and/or modify     *
   6  *   it under the terms of the GNU Lesser General Public License as published *
   7  *   by the Free Software Foundation; either version 2 of the License, or     *
   8  *   (at your option) any later version.                                      *
   9  *                                                                            *
  10  *   This program is distributed in the hope that it will be useful,          *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  13  *   GNU Lesser General Public License for more details.                      *
  14  *                                                                            *
  15  *   You should have received a copy of the GNU Lesser General Public License *
  16  *   along with this program; if not, write to the                            *
  17  *   Free Software Foundation, Inc.,                                          *
  18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  19  *****************************************************************************/
  20
  21 #ifndef _TCImplementation_H_
  22 #define _TCImplementation_H_
  23 #include "BitRank.h"
  24 #include "TextCollection.h"
  25 #include "BlockArray.h"
  26
  27 // Include  from XMLTree/libcds
  28 #include <basics.h> // Defines W == 32
  29 #include <static_bitsequence.h>
  30 #include <alphabet_mapper.h>
  31 #include <static_sequence.h>
  32 #include <static_sequence_wvtree_noptrs.h>
  33
  34 // Re-define word size to ulong:
  35 #undef W
  36 #if __WORDSIZE == 64
  37 #   define W 64
  38 #else
  39 #   define W 32
  40 #endif
  41 #undef bitset
  42
  43 #include "TextStorage.h"
  44 #include <set>
  45
  46 namespace SXSI
  47 {
  48
  49
  50 /**
  51  * Implementation of the TextCollection interface
  52  *
  53  */
  54 class TCImplementation : public SXSI::TextCollection {
  55 public:
  56     TCImplementation(uchar *, ulong, unsigned, unsigned, ulong, ulong);
  57     ~TCImplementation();
  58
  59     bool EmptyText(DocId) const;
  60
  61     /**
  62      * Returns a pointer to the original text.
  63      *
  64      * Do *not* try to free the array.
  65      * (However, this implementation is suspect to change.)
  66      *
  67      * See TextStorage.h for details.
  68      */
  69     uchar * GetText(DocId) const;
  70
  71     /**
  72      * Returns a pointer to the beginning of texts i, i+1, ..., j.
  73      * Texts are separated by a '\0' byte.
  74      *
  75      * Do *not* try to free the array.
  76      * (However, this implementation is suspect to change.)
  77      *
  78      * See TextStorage.h for details.
  79      */
  80     uchar * GetText(DocId i, DocId j) const;
  81
  82     /**
  83      * Returns a substring of given text ID.
  84      *
  85      * FIXME This may be reimplemented via TextStorage.
  86      */
  87 //    uchar* GetText(DocId, TextPosition, TextPosition) const;
  88
  89     bool IsPrefix(uchar const *) const;
  90     bool IsSuffix(uchar const *) const;
  91     bool IsEqual(uchar const *) const;
  92     bool IsContains(uchar const *) const;
  93     bool IsLessThan(uchar const *) const;
  94
  95     bool IsPrefix(uchar const *, DocId, DocId) const;
  96     bool IsSuffix(uchar const *, DocId, DocId) const;
  97     bool IsEqual(uchar const *, DocId, DocId) const;
  98     bool IsContains(uchar const *, DocId, DocId) const;
  99     bool IsLessThan(uchar const *, DocId, DocId) const;
 100
 101     ulong Count(uchar const *) const;
 102     unsigned CountPrefix(uchar const *) const;
 103     unsigned CountSuffix(uchar const *) const;
 104     unsigned CountEqual(uchar const *) const;
 105     unsigned CountContains(uchar const *) const;
 106     unsigned CountLessThan(const unsigned char*) const;
 107
 108     unsigned CountPrefix(uchar const *, DocId, DocId) const;
 109     unsigned CountSuffix(uchar const *, DocId, DocId) const;
 110     unsigned CountEqual(uchar const *, DocId, DocId) const;
 111     unsigned CountContains(uchar const *, DocId, DocId) const;
 112     unsigned CountLessThan(uchar const *, DocId, DocId) const;
 113
 114     // Definition of document_result is inherited from SXSI::TextCollection.
 115     document_result Prefix(uchar const *) const;
 116     document_result Suffix(uchar const *) const;
 117     document_result Equal(uchar const *) const;
 118     document_result Contains(uchar const *) const;
 119     document_result LessThan(uchar const *) const;
 120     document_result Kmismaches(uchar const *, unsigned) const;
 121     document_result Kerrors(uchar const *, unsigned) const;
 122
 123     document_result Prefix(uchar const *, DocId, DocId) const;
 124     document_result Suffix(uchar const *, DocId, DocId) const;
 125     document_result Equal(uchar const *, DocId, DocId) const;
 126     document_result Contains(uchar const *, DocId, DocId) const;
 127     document_result LessThan(uchar const *, DocId, DocId) const;
 128
 129     // Definition of full_result is inherited from SXSI::TextCollection.
 130     full_result FullContains(uchar const *) const;
 131     full_result FullContains(uchar const *, DocId, DocId) const;
 132     full_result FullKmismatches(uchar const *, unsigned) const;
 133     full_result FullKerrors(uchar const *, unsigned) const;
 134
 135     // Index from/to disk
 136     TCImplementation(FILE *, unsigned);
 137     void Save(FILE *) const;
 138
 139 private:
 140     typedef std::vector<std::pair<ulong, ulong> > suffix_range_vector;
 141
 142     static const uchar versionFlag;
 143     TextPosition n;
 144     unsigned samplerate;
 145     unsigned C[256];
 146     TextPosition bwtEndPos;
 147     static_sequence * alphabetrank;
 148
 149     // Sample structures for texts longer than samplerate
 150     static_bitsequence * sampled;
 151     BlockArray *suffixes;
 152     BlockArray *suffixDocId;
 153
 154     // Total number of texts in the collection
 155     unsigned numberOfTexts;
 156     // Length of the longest text
 157     ulong maxTextLength;
 158
 159     // Array of document id's in the order of end-markers in BWT
 160     static_sequence *Doc;
 161
 162     // Text storage for fast extraction
 163     TextStorage * textStorage;
 164
 165     // Following methods are not part of the public API
 166     uchar * BWT(uchar *);
 167     void makewavelet(uchar *);
 168     void maketables(ulong);
 169     DocId DocIdAtTextPos(BlockArray*, TextPosition) const;
 170     ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
 171     ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const;
 172     ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
 173     ulong searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const;
 174     ulong kmismatches(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned) const;
 175     ulong kerrors(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned, ulong const *, ulong) const;
 176     /**
 177      * Count end-markers in given interval
 178      */
 179     inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
 180     {
 181         if (sp > ep)
 182             return 0;
 183
 184         ulong ranksp = 0;
 185         if (sp != 0)
 186             ranksp = alphabetrank->rank(0, sp - 1);
 187
 188         return alphabetrank->rank(0, ep) - ranksp;
 189     }
 190
 191     /**
 192      * Count end-markers in given interval and
 193      * within docIds [min,max]
 194      */
 195     inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const
 196     {
 197         if (sp != 0)
 198             sp = alphabetrank->rank(0, sp - 1);
 199         ep = alphabetrank->rank(0, ep);
 200         if (ep == 0)
 201             return 0;
 202
 203         return Doc->count(sp, ep-1, min, max);
 204     }
 205
 206     /**
 207      * Enumerate all end-markers in given interval
 208      */
 209     inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep) const
 210     {
 211         if (sp != 0)
 212             sp = alphabetrank->rank(0, sp - 1);
 213         ep = alphabetrank->rank(0, ep);
 214         if (ep == 0)
 215             return document_result();
 216
 217         return Doc->accessAll(sp, ep-1);
 218     }
 219
 220     /**
 221      * Enumerate end-markers in given interval and
 222      * within docIds [min,max]
 223      */
 224     inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const
 225     {
 226         if (sp != 0)
 227             sp = alphabetrank->rank(0, sp - 1);
 228         ep = alphabetrank->rank(0, ep);
 229         if (ep == 0)
 230             return document_result();
 231
 232         return Doc->access(sp, ep-1, min, max);
 233     }
 234
 235     /**
 236      * Enumerate documents in given interval [sp, ep]
 237      */
 238     inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep) const
 239     {
 240         // We want unique document indentifiers, using std::set to collect them
 241         // FIXME use unordered_set?
 242         uint tmp_rank_c = 0; // Cache rank value of c.
 243         for (; sp <= ep; ++sp)
 244         {
 245             TextPosition i = sp;
 246             uchar c = alphabetrank->access(i, tmp_rank_c);
 247             while (c != '\0' && !sampled->access(i))
 248             {
 249                 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
 250                 c = alphabetrank->access(i, tmp_rank_c);
 251             }
 252             if (c == '\0')
 253             {
 254                 // Rank among the end-markers in BWT
 255                 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
 256                 resultSet.insert(Doc->access(endmarkerRank));
 257             }
 258             else
 259             {
 260                 DocId di = (*suffixDocId)[sampled->rank1(i)-1];
 261                 assert((unsigned)di < numberOfTexts);
 262                 resultSet.insert(di);
 263             }
 264         }
 265     }
 266
 267     /**
 268      * Enumerate documents in given interval [sp, ep]
 269      * and within [begin, end]
 270      */
 271     inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
 272     {
 273         // We want unique document indentifiers, using std::set to collect them
 274         uint tmp_rank_c = 0; // Cache rank value of c.
 275         for (; sp <= ep; ++sp)
 276         {
 277             TextPosition i = sp;
 278             uchar c = alphabetrank->access(i, tmp_rank_c);
 279             while (c != '\0' && !sampled->access(i))
 280             {
 281                 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
 282                 c = alphabetrank->access(i, tmp_rank_c);
 283             }
 284             if (c == '\0')
 285             {
 286                 // Rank among the end-markers in BWT
 287                 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
 288                 DocId docId = Doc->access(endmarkerRank);
 289                 if (docId >= begin && docId <= end)
 290                     resultSet.insert(docId);
 291             }
 292             else
 293             {
 294                 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 295                 assert((unsigned)docId < numberOfTexts);
 296                 if (docId >= begin && docId <= end)
 297                     resultSet.insert(docId);
 298             }
 299         }
 300     }
 301
 302     /**
 303      * Enumerate document+position pairs (full_result) of
 304      * each suffix in given interval.
 305      */
 306     inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep) const
 307     {
 308         uint tmp_rank_c = 0; // Cache rank value of c.
 309         for (; sp <= ep; ++sp)
 310         {
 311             TextPosition i = sp;
 312             TextPosition dist = 0;
 313             uchar c = alphabetrank->access(i, tmp_rank_c);
 314             while (c != '\0' && !sampled->access(i))
 315             {
 316                 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
 317                 c = alphabetrank->access(i, tmp_rank_c);
 318                 ++ dist;
 319             }
 320             if (c == '\0')
 321             {
 322                 // Rank among the end-markers in BWT
 323                 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
 324                 DocId docId = Doc->access(endmarkerRank);
 325                 result.push_back(make_pair(docId, dist));
 326             }
 327             else
 328             {
 329                 TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
 330                 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 331
 332                 result.push_back(make_pair(docId, textPos));
 333             }
 334         }
 335     }
 336
 337     /**
 338      * Enumerate document+position pairs (full_result) of
 339      * each suffix in given interval and within [begin, end].
 340      */
 341     inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
 342     {
 343         uint tmp_rank_c = 0; // Cache rank value of c.
 344         for (; sp <= ep; ++sp)
 345         {
 346             TextPosition i = sp;
 347             TextPosition dist = 0;
 348             uchar c = alphabetrank->access(i, tmp_rank_c);
 349             while (c != '\0' && !sampled->access(i))
 350             {
 351                 i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
 352                 c = alphabetrank->access(i, tmp_rank_c);
 353                 ++ dist;
 354             }
 355             if (c == '\0')
 356             {
 357                 // Rank among the end-markers in BWT
 358                 unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
 359                 DocId docId = Doc->access(endmarkerRank);
 360                 if (docId >= begin && docId <= end)
 361                     result.push_back(make_pair(docId, dist));
 362             }
 363             else
 364             {
 365                 TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
 366                 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 367
 368                 if (docId >= begin && docId <= end)
 369                     result.push_back(make_pair(docId, textPos));
 370             }
 371         }
 372     }
 373
 374 }; // class TCImplementation
 375
 376 } // namespace SXSI
 377
 378 #endif