#define _SXSI_TextCollectionBuilder_h_
#include "TextCollection.h"
+#include "TextStorage.h"
#include "Tools.h" // Defines ulong and uchar.
+
+#include <string>
#include <vector>
#include <utility> // Defines std::pair.
#include <cstring> // Defines std::strlen, added by Kim
// Default samplerate for suffix array samples
#define TEXTCOLLECTION_DEFAULT_SAMPLERATE 64
+// Default input length, used to calculate the buffer size.
+#define TEXTCOLLECTION_DEFAULT_INPUT_LENGTH (150 * 1024 * 1024)
namespace SXSI
class TextCollectionBuilder
{
public:
- explicit TextCollectionBuilder(unsigned samplerate = TEXTCOLLECTION_DEFAULT_SAMPLERATE);
+ explicit TextCollectionBuilder(unsigned samplerate = TEXTCOLLECTION_DEFAULT_SAMPLERATE, // samplerate for suffix array samples
+ ulong estimatedInputLength = TEXTCOLLECTION_DEFAULT_INPUT_LENGTH); // estimated input length, used to calculate the buffer size
~TextCollectionBuilder();
/**
* Can not be called after makeStatic().
+ * NOTE(review): "makeStatic()" presumably refers to
+ * InitTextCollection() -- confirm and update the name.
* The i'th text insertion gets an identifier value i-1.
* In other words, document identifiers start from 0.
+ *
+ * The second parameter controls whether the text is also added
+ * to the index. If false, the text is added only to the
+ * TextCollection and cannot be searched for.
*/
- void InsertText(uchar const *);
+ void InsertText(uchar const *, bool index = true);
/**
* Make static
*
- * Convert to a static collection; reduces space and time complexities.
+ * Convert to a static collection.
* New texts can not be inserted after this operation.
+ *
+ * The TextStorage type defaults to TextStorage::TYPE_PLAIN_TEXT;
+ * another possible type is TYPE_LZ_INDEX.
*/
- TextCollection * InitTextCollection();
+ TextCollection * InitTextCollection(char type = TextStorage::TYPE_PLAIN_TEXT);
private:
+ // Using Pimpl idiom to hide RLCSA implementation.
struct TCBuilderRep * p_;
// No copy constructor or assignment