X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=TextCollectionBuilder.h;h=6b3819a2ead0ff2aba0fa7ac9f3f350bd9ac556c;hb=2dba1ef94915ba5c1a06c3756d6dd0333605a5f7;hp=13734d22f74ec09f8c18cb2c04b39992dd131f89;hpb=f58cd5c3504095d721eb4e21bfb0a70a0a4928a2;p=SXSI%2FTextCollection.git diff --git a/TextCollectionBuilder.h b/TextCollectionBuilder.h index 13734d2..6b3819a 100644 --- a/TextCollectionBuilder.h +++ b/TextCollectionBuilder.h @@ -22,7 +22,10 @@ #define _SXSI_TextCollectionBuilder_h_ #include "TextCollection.h" +#include "TextStorage.h" #include "Tools.h" // Defines ulong and uchar. + +#include #include #include // Defines std::pair. #include // Defines std::strlen, added by Kim @@ -33,6 +36,8 @@ // Default samplerate for suffix array samples #define TEXTCOLLECTION_DEFAULT_SAMPLERATE 64 +// Default input length, used to calculate the buffer size. +#define TEXTCOLLECTION_DEFAULT_INPUT_LENGTH (150 * 1024 * 1024) namespace SXSI @@ -45,7 +50,8 @@ namespace SXSI class TextCollectionBuilder { public: - explicit TextCollectionBuilder(unsigned samplerate = TEXTCOLLECTION_DEFAULT_SAMPLERATE); + explicit TextCollectionBuilder(unsigned samplerate = TEXTCOLLECTION_DEFAULT_SAMPLERATE, + ulong estimatedInputLength = TEXTCOLLECTION_DEFAULT_INPUT_LENGTH); ~TextCollectionBuilder(); /** @@ -55,17 +61,25 @@ namespace SXSI * Can not be called after makeStatic(). * The i'th text insertion gets an identifier value i-1. * In other words, document identifiers start from 0. + * + * Second parameter tells if the text will be added to the + * index also. If false, text is added only to the TextCollection + * and can not be searched for. */ - void InsertText(uchar const *); + void InsertText(uchar const *, bool index = true); /** * Make static * - * Convert to a static collection; reduces space and time complexities. + * Convert to a static collection. * New texts can not be inserted after this operation. + * + * TextStorage type defaults to TYPE_PLAIN_TEXT, another + * possible type is TYPE_LZ_INDEX. */ - TextCollection * InitTextCollection(); + TextCollection * InitTextCollection(char type = TextStorage::TYPE_PLAIN_TEXT); private: + // Using Pimpl idiom to hide RLCSA implementation. struct TCBuilderRep * p_; // No copy constructor or assignment