X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=TextCollectionBuilder.h;h=422bd224a5b7d4922f7c619918864a5a484143fb;hb=e4d0188a098767c4950ba8936b754922343cbe8b;hp=bcb3672515b613179f7cb58b580837720ff0b0cc;hpb=ee8a3e526fe7f39cdc075263824faf6c17389297;p=SXSI%2FTextCollection.git diff --git a/TextCollectionBuilder.h b/TextCollectionBuilder.h index bcb3672..422bd22 100644 --- a/TextCollectionBuilder.h +++ b/TextCollectionBuilder.h @@ -22,23 +22,35 @@ #define _SXSI_TextCollectionBuilder_h_ #include "TextCollection.h" -#include "Tools.h" // Defines ulong and uchar. -#include -#include // Defines std::pair. -#include // Defines std::strlen, added by Kim +#include "TextStorage.h" +#include "Tools.h" + +// Default samplerate for suffix array samples +#define TEXTCOLLECTION_DEFAULT_SAMPLERATE 32 + +// Default input length, used to calculate the buffer size. +#define TEXTCOLLECTION_DEFAULT_INPUT_LENGTH (150 * 1024 * 1024) + namespace SXSI { - struct TCBuilderRep; // Pimpl - /** - * Build an instance of the TextCollection class. + * Builder for an instance of the TextCollection class. */ class TextCollectionBuilder { public: - explicit TextCollectionBuilder(unsigned); - ~TextCollectionBuilder(); + // Index type defaults to FM-index. + // SWCSA can be used for natural language inputs. + // NB: Current SWCSA uses a lot of memory during construction! + enum index_type_t { index_type_default, index_type_swcsa, index_type_rlcsa }; + + static TextCollectionBuilder* create(unsigned samplerate = TEXTCOLLECTION_DEFAULT_SAMPLERATE, + index_type_t type = index_type_default, + ulong estimatedInputLength = TEXTCOLLECTION_DEFAULT_INPUT_LENGTH); + + + virtual ~TextCollectionBuilder() { }; /** * Insert text @@ -47,23 +59,33 @@ namespace SXSI * Can not be called after makeStatic(). * The i'th text insertion gets an identifier value i-1. * In other words, document identifiers start from 0. + * + * Second parameter tells if the text will be added to the + * index also. If false, text is added only to the TextCollection + * and can not be searched for. */ - void InsertText(uchar const *); + virtual void InsertText(uchar const *, bool index = true) = 0; /** * Make static * - * Convert to a static collection; reduces space and time complexities. + * Convert to a static collection. * New texts can not be inserted after this operation. + * + * TextStorage type defaults to TYPE_PLAIN_TEXT, another + * possible type is TYPE_LZ_INDEX. */ - TextCollection * InitTextCollection(); + virtual TextCollection * InitTextCollection(char type = TextStorage::TYPE_PLAIN_TEXT) = 0; - private: - struct TCBuilderRep * p_; + protected: + // Protected constructor; use the static method TextCollectionBuilder::create() + TextCollectionBuilder() { }; + private: // No copy constructor or assignment - TextCollectionBuilder(); TextCollectionBuilder(TextCollectionBuilder const&); TextCollectionBuilder& operator = (TextCollectionBuilder const&); }; + } + #endif