#include <vector>
#include <utility> // Defines std::pair.
-// Default samplerate for suffix array samples
-#define TEXTCOLLECTION_DEFAULT_SAMPLERATE 64
-
namespace SXSI
{
+
/**
* General interface for a text collection
*
// Type for text position (FIXME ulong or long?)
typedef ulong TextPosition;
- /**
- * Init an instance of a text collection object
- *
- * Returns a pointer to an object implementing this interface.
- */
- static TextCollection * InitTextCollection(unsigned samplerate = TEXTCOLLECTION_DEFAULT_SAMPLERATE);
+ // mode flag: Default includes both the index and "naive" text
+ enum index_mode_t { index_mode_default, index_mode_text_only }; // index_mode_index_only
+
/**
* Load from a file
*
+ * The second parameter is a prefix to be used for multiple
+ * files. (SWCSAWrapper uses multiple save files!)
+ *
* A new samplerate can be given; otherwise the one specified in the save file is used.
- * Note: This is not a static method; call InitTextCollection() first to get the object handle.
*
* Throws an exception if std::fread() fails.
*
*/
- virtual void Load(FILE *, unsigned samplerate = 0) = 0;
+ static TextCollection* Load(FILE *, char const *, index_mode_t = index_mode_default, unsigned samplerate = 0);
+
/**
* Save data structure into a file
- *
+ *
+ * The second parameter is a prefix to be used for multiple
+ * files. (SWCSAWrapper uses multiple save files!)
+ *
* Throws an exception if std::fwrite() fails.
*/
- virtual void Save(FILE *) const = 0;
+ virtual void Save(FILE *, char const *) const = 0;
+
/**
* Virtual destructor
*/
virtual ~TextCollection() { };
- /**
- * Insert text
- *
- * Must be a zero-terminated string from alphabet [1,255].
- * Can not be called after makeStatic().
- * The i'th text insertion gets an identifier value i-1.
- * In other words, document identifiers start from 0.
- */
- virtual void InsertText(uchar const *) = 0;
- /**
- * Make static
- *
- * Convert to a static collection; reduces space and time complexities.
- * New texts can not be inserted after this operation.
- */
- virtual void MakeStatic() = 0;
-
+
/**
- tests if the string pointed to by DocId is empty
- */
-
+ * Tests if the string pointed to by DocId is empty
+ */
virtual bool EmptyText(DocId) const = 0;
/**
*
* Returns the i'th text in the collection.
* The numbering starts from 0.
+ *
+ * Call DeleteText() for each pointer returned by GetText()
+ * to avoid possible memory leaks.
*/
virtual uchar* GetText(DocId) const = 0;
+ virtual void DeleteText(uchar *text) const = 0; // Call this on each pointer returned by GetText() to avoid possible memory leaks.
+
+ /**
+ * Returns a pointer to the beginning of texts i, i+1, ..., j.
+ * Texts are separated by a '\0' byte.
+ *
+ * Call DeleteText() for each pointer returned by GetText()
+ * to avoid possible memory leaks.
+ */
+ virtual uchar * GetText(DocId i, DocId j) const = 0;
+
/**
* Returns substring [i, j] of k'th text
*
*
* Note: Do we need this?
* Forward iterator would be really inefficient compared to
- * getText(k) and getText(k, i, j).
+ * getText(k).
*
* TODO Define and implement const_reverse_iterator.
*/
virtual bool IsLessThan(uchar const *) const = 0;
/**
- * Counting queries
- *
+ * Existential queries for given DocId interval.
+ */
+ virtual bool IsPrefix(uchar const *, DocId, DocId) const = 0;
+ virtual bool IsSuffix(uchar const *, DocId, DocId) const = 0;
+ virtual bool IsEqual(uchar const *, DocId, DocId) const = 0;
+ virtual bool IsContains(uchar const *, DocId, DocId) const = 0;
+ virtual bool IsLessThan(uchar const *, DocId, DocId) const = 0;
+
+ /**
+ * Counting queries
+ * Result is the number of occurrences.
+ */
+ virtual ulong Count(uchar const *) const = 0;
+ /**
+ * More counting queries
* Result is the number of documents.
*/
virtual unsigned CountPrefix(uchar const *) const = 0;
virtual unsigned CountContains(uchar const *) const = 0;
virtual unsigned CountLessThan(uchar const *) const = 0;
+ /**
+ * Counting queries for given DocId interval
+ */
+ virtual unsigned CountPrefix(uchar const *, DocId, DocId) const = 0;
+ virtual unsigned CountSuffix(uchar const *, DocId, DocId) const = 0;
+ virtual unsigned CountEqual(uchar const *, DocId, DocId) const = 0;
+ virtual unsigned CountContains(uchar const *, DocId, DocId) const = 0;
+ virtual unsigned CountLessThan(uchar const *, DocId, DocId) const = 0;
+
/**
* Document reporting queries
*
virtual document_result Equal(uchar const *) const = 0;
virtual document_result Contains(uchar const *) const = 0;
virtual document_result LessThan(uchar const *) const = 0;
+ virtual document_result KMismaches(uchar const *, unsigned) const = 0; // NOTE(review): spelled KMismaches here but FullKMismatches in the full reporting queries; a rename changes the interface, so confirm first. Presumably the unsigned argument is the allowed number of mismatches -- TODO confirm.
+ virtual document_result KErrors(uchar const *, unsigned) const = 0; // Presumably the unsigned argument is the allowed number of errors -- TODO confirm.
+
+ /**
+ * Document reporting queries for given DocId interval.
+ */
+ virtual document_result Prefix(uchar const *, DocId, DocId) const = 0;
+ virtual document_result Suffix(uchar const *, DocId, DocId) const = 0;
+ virtual document_result Equal(uchar const *, DocId, DocId) const = 0;
+ virtual document_result Contains(uchar const *, DocId, DocId) const = 0;
+ virtual document_result LessThan(uchar const *, DocId, DocId) const = 0;
/**
* Full reporting queries
// Data type for results
typedef std::vector<std::pair<DocId, TextPosition> > full_result;
virtual full_result FullContains(uchar const *) const = 0;
+ // Full reporting query for given DocId interval
+ virtual full_result FullContains(uchar const *, DocId, DocId) const = 0;
+ virtual full_result FullKMismatches(uchar const *, unsigned) const = 0; // Presumably the unsigned argument is the allowed number of mismatches -- TODO confirm.
+ virtual full_result FullKErrors(uchar const *, unsigned) const = 0; // Presumably the unsigned argument is the allowed number of errors -- TODO confirm.
+
+ /**
+  * Length query stub.
+  *
+  * This default implementation computes nothing: it writes an
+  * unsupported-operation message to std::cerr and then terminates
+  * the process through std::exit(2). The message tells the caller
+  * to use RLCSA instead.
+  */
+ virtual TextPosition getLength() const
+ {
+ std::cerr << "TextCollection::getLength() is unsupported! Use RLCSA instead." << std::endl;
+ std::exit(2); // Ends the program with exit status 2.
+ return 0; // Not reached, because std::exit does not return; gives the non-void function a return statement.
+ }
+ /**
+  * LF query stub.
+  *
+  * Both arguments (c and i) are ignored by this default
+  * implementation: it writes an unsupported-operation message to
+  * std::cerr and then terminates the process through std::exit(2).
+  * The message tells the caller to use RLCSA instead.
+  *
+  * NOTE(review): the name suggests the LF-mapping step of a
+  * BWT-based index -- confirm against the RLCSA implementation.
+  */
+ virtual TextPosition LF(uchar c, TextPosition i) const
+ {
+ std::cerr << "TextCollection::LF() is unsupported! Use RLCSA instead." << std::endl;
+ std::exit(2); // Ends the program with exit status 2.
+ return 0; // Not reached, because std::exit does not return; gives the non-void function a return statement.
+ }
+ /**
+  * Suffix extraction stub.
+  *
+  * Both arguments (pos and l) are ignored by this default
+  * implementation: it writes an unsupported-operation message to
+  * std::cerr and then terminates the process through std::exit(2).
+  * The message tells the caller to use RLCSA instead.
+  */
+ virtual uchar* getSuffix(TextPosition pos, unsigned l) const
+ {
+ std::cerr << "TextCollection::getSuffix() is unsupported! Use RLCSA instead." << std::endl;
+ std::exit(2); // Ends the program with exit status 2.
+ return 0; // Not reached, because std::exit does not return; the literal 0 is a null pointer here.
+ }
+ /**
+  * Document lookup stub.
+  *
+  * The argument i is ignored by this default implementation: it
+  * writes an unsupported-operation message to std::cerr and then
+  * terminates the process through std::exit(2). The message tells
+  * the caller to use RLCSA instead.
+  */
+ virtual DocId getDoc(TextPosition i) const
+ {
+ std::cerr << "TextCollection::getDoc() is unsupported! Use RLCSA instead." << std::endl;
+ std::exit(2); // Ends the program with exit status 2.
+ return 0; // Not reached, because std::exit does not return; gives the non-void function a return statement.
+ }
+
- /**
- *Debug
- *
- */
- virtual TextPosition Lookup(TextPosition) const = 0;
protected:
- // Protected constructor; call the static function InitTextCollection().
+ // Protected constructor; use TextCollectionBuilder
TextCollection() { };
+// index_mode_t indexMode;
+
// No copy constructor or assignment
TextCollection(TextCollection const&);
TextCollection& operator = (TextCollection const&);