X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=TextCollectionBuilder.h;h=422bd224a5b7d4922f7c619918864a5a484143fb;hb=898f6e5c6b7223f4753b7ccb7939809ee5f53aae;hp=1eb0275550fb05145972dfd0ad5399cf3400cdd0;hpb=e36f9fc04d80b572987d00d6a6ae15af08ad1dbb;p=SXSI%2FTextCollection.git diff --git a/TextCollectionBuilder.h b/TextCollectionBuilder.h index 1eb0275..422bd22 100644 --- a/TextCollectionBuilder.h +++ b/TextCollectionBuilder.h @@ -23,16 +23,10 @@ #include "TextCollection.h" #include "TextStorage.h" -#include "Tools.h" // Defines ulong and uchar. -#include -#include // Defines std::pair. -#include // Defines std::strlen, added by Kim - -// Un-comment to compare BWT against a BWT generated from class dynFMI: -//#define TCB_TEST_BWT +#include "Tools.h" // Default samplerate for suffix array samples -#define TEXTCOLLECTION_DEFAULT_SAMPLERATE 64 +#define TEXTCOLLECTION_DEFAULT_SAMPLERATE 32 // Default input length, used to calculate the buffer size. #define TEXTCOLLECTION_DEFAULT_INPUT_LENGTH (150 * 1024 * 1024) @@ -40,17 +34,23 @@ namespace SXSI { - struct TCBuilderRep; // Pimpl - /** - * Build an instance of the TextCollection class. + * Builder for an instance of the TextCollection class. */ class TextCollectionBuilder { public: - explicit TextCollectionBuilder(unsigned samplerate = TEXTCOLLECTION_DEFAULT_SAMPLERATE, - ulong estimatedInputLength = TEXTCOLLECTION_DEFAULT_INPUT_LENGTH); - ~TextCollectionBuilder(); + // Index type defaults to FM-index. + // SWCSA can be used for natural language inputs. + // NB: Current SWCSA uses a lot of memory during construction! + enum index_type_t { index_type_default, index_type_swcsa, index_type_rlcsa }; + + static TextCollectionBuilder* create(unsigned samplerate = TEXTCOLLECTION_DEFAULT_SAMPLERATE, + index_type_t type = index_type_default, + ulong estimatedInputLength = TEXTCOLLECTION_DEFAULT_INPUT_LENGTH); + + + virtual ~TextCollectionBuilder() { }; /** * Insert text @@ -59,8 +59,12 @@ namespace SXSI * Can not be called after makeStatic(). * The i'th text insertion gets an identifier value i-1. * In other words, document identifiers start from 0. + * + * Second parameter tells if the text will be added to the + * index also. If false, text is added only to the TextCollection + * and can not be searched for. */ - void InsertText(uchar const *); + virtual void InsertText(uchar const *, bool index = true) = 0; /** * Make static * @@ -70,14 +74,18 @@ namespace SXSI * TextStorage type defaults to TYPE_PLAIN_TEXT, another * possible type is TYPE_LZ_INDEX. */ - TextCollection * InitTextCollection(char type = TextStorage::TYPE_PLAIN_TEXT); + virtual TextCollection * InitTextCollection(char type = TextStorage::TYPE_PLAIN_TEXT) = 0; - private: - struct TCBuilderRep * p_; + protected: + // Protected constructor; use the static method TextCollectionBuilder::create() + TextCollectionBuilder() { }; + private: // No copy constructor or assignment TextCollectionBuilder(TextCollectionBuilder const&); TextCollectionBuilder& operator = (TextCollectionBuilder const&); }; + } + #endif