#include <vector>
#include <utility> // Defines std::pair.
-// Default samplerate for suffix array samples
-#define TEXTCOLLECTION_DEFAULT_SAMPLERATE 64
-
namespace SXSI
{
+
/**
* General interface for a text collection
*
// Type for a position inside a text (FIXME: decide between ulong and long)
typedef ulong TextPosition;
- /**
- * Init an instance of a text collection object
- *
- * Returns a pointer to an object implementing this interface.
- */
- static TextCollection * InitTextCollection(unsigned samplerate = TEXTCOLLECTION_DEFAULT_SAMPLERATE);
/**
* Load from a file
*
+ * Static function returning a TextCollection pointer; presumably the
+ * collection read from the given FILE (e.g. one written by Save()).
+ * NOTE(review): ownership of the returned pointer and the meaning of
+ * samplerate == 0 (the default) are not stated here -- confirm against
+ * the implementation.
+ *
* Throws an exception if std::fread() fails.
*
*/
- virtual void Load(FILE *, unsigned samplerate = 0) = 0;
+ static TextCollection* Load(FILE *, unsigned samplerate = 0);
+
/**
* Save data structure into a file
*
+ * Takes the FILE to write to. Declared const and pure virtual: each
+ * concrete collection class supplies its own implementation.
+ *
* Throws an exception if std::fwrite() fails.
*/
virtual void Save(FILE *) const = 0;
+
/**
* Virtual destructor
+ *
+ * Virtual so that an implementation can be destroyed correctly
+ * through a pointer to this interface.
*/
virtual ~TextCollection() { };
- /**
- * Insert text
- *
- * Must be a zero-terminated string from alphabet [1,255].
- * Can not be called after makeStatic().
- * The i'th text insertion gets an identifier value i-1.
- * In other words, document identifiers start from 0.
- */
- virtual void InsertText(uchar const *) = 0;
- /**
- * Make static
- *
- * Convert to a static collection; reduces space and time complexities.
- * New texts can not be inserted after this operation.
- */
- virtual void MakeStatic() = 0;
-
+
/**
- tests if the string pointed to by DocId is empty
- */
-
+ * Tests whether the text identified by DocId is empty
+ */
virtual bool EmptyText(DocId) const = 0;
/**
* Get a single text
*
* Returns the i'th text in the collection, i being the given DocId.
* The numbering starts from 0.
+ *
+ * Pass each pointer returned by GetText() to DeleteText()
+ * to avoid possible memory leaks.
*/
virtual uchar* GetText(DocId) const = 0;
+ virtual void DeleteText(uchar *text) const = 0; // To be called on each pointer returned by GetText() to avoid memory leaks
+
+ /**
+ * Returns a pointer to the beginning of texts i, i+1, ..., j
+ * (both i and j included).
+ * Consecutive texts are separated by a '\0' byte.
+ * NOTE(review): presumably i <= j is required -- confirm.
+ *
+ * Pass each pointer returned by GetText() to DeleteText()
+ * to avoid possible memory leaks.
+ */
+ virtual uchar * GetText(DocId i, DocId j) const = 0;
+
/**
* Returns substring [i, j] of k'th text
*
// Is there a text that is lexicographically less than the given string?
virtual bool IsLessThan(uchar const *) const = 0;
+ /**
+ * Existential queries for given DocId interval.
+ *
+ * Each function takes a pattern string and two DocIds and returns a bool.
+ * NOTE(review): presumably the result is true iff some text whose
+ * identifier lies between the two DocIds satisfies the relation named
+ * by the function; whether the bounds are inclusive is not stated
+ * here -- confirm.
+ */
+ virtual bool IsPrefix(uchar const *, DocId, DocId) const = 0;
+ virtual bool IsSuffix(uchar const *, DocId, DocId) const = 0;
+ virtual bool IsEqual(uchar const *, DocId, DocId) const = 0;
+ virtual bool IsContains(uchar const *, DocId, DocId) const = 0;
+ virtual bool IsLessThan(uchar const *, DocId, DocId) const = 0;
+
/**
* Counting queries
* Result is the number of occurrences.
virtual unsigned CountContains(uchar const *) const = 0;
virtual unsigned CountLessThan(uchar const *) const = 0;
+ /**
+ * Counting queries for given DocId interval
+ *
+ * Each function takes a pattern string and two DocIds and returns an
+ * unsigned count.
+ * NOTE(review): presumably only texts whose identifier lies between
+ * the two DocIds are counted; whether the bounds are inclusive is not
+ * stated here -- confirm.
+ */
+ virtual unsigned CountPrefix(uchar const *, DocId, DocId) const = 0;
+ virtual unsigned CountSuffix(uchar const *, DocId, DocId) const = 0;
+ virtual unsigned CountEqual(uchar const *, DocId, DocId) const = 0;
+ virtual unsigned CountContains(uchar const *, DocId, DocId) const = 0;
+ virtual unsigned CountLessThan(uchar const *, DocId, DocId) const = 0;
+
/**
* Document reporting queries
*
virtual document_result Equal(uchar const *) const = 0;
virtual document_result Contains(uchar const *) const = 0;
virtual document_result LessThan(uchar const *) const = 0;
+ virtual document_result KMismaches(uchar const *, unsigned) const = 0; // NOTE(review): spelled "KMismaches", unlike FullKMismatches -- confirm; the unsigned is presumably the mismatch bound k
+ virtual document_result KErrors(uchar const *, unsigned) const = 0; // NOTE(review): the unsigned is presumably the error bound k -- confirm
+
+ /**
+ * Document reporting queries for given DocId interval.
+ *
+ * Each function takes a pattern string and two DocIds and returns a
+ * document_result.
+ * NOTE(review): presumably only texts whose identifier lies between
+ * the two DocIds are reported; whether the bounds are inclusive is not
+ * stated here -- confirm.
+ */
+ virtual document_result Prefix(uchar const *, DocId, DocId) const = 0;
+ virtual document_result Suffix(uchar const *, DocId, DocId) const = 0;
+ virtual document_result Equal(uchar const *, DocId, DocId) const = 0;
+ virtual document_result Contains(uchar const *, DocId, DocId) const = 0;
+ virtual document_result LessThan(uchar const *, DocId, DocId) const = 0;
/**
* Full reporting queries
// Data type for results: a vector of (DocId, TextPosition) pairs
typedef std::vector<std::pair<DocId, TextPosition> > full_result;
virtual full_result FullContains(uchar const *) const = 0; // variant without a DocId interval
+ // Full reporting query for a given DocId interval (NOTE(review): inclusiveness of the bounds is not stated -- confirm)
+ virtual full_result FullContains(uchar const *, DocId, DocId) const = 0;
+
+ virtual full_result FullKMismatches(uchar const *, unsigned) const = 0; // NOTE(review): the unsigned is presumably the mismatch bound k -- confirm
+ virtual full_result FullKErrors(uchar const *, unsigned) const = 0; // NOTE(review): the unsigned is presumably the error bound k -- confirm
protected:
- // Protected constructor; call the static function InitTextCollection().
+ // Protected constructor: not for direct use; obtain instances through TextCollectionBuilder
TextCollection() { };
// No copy constructor or assignment