X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=TextCollection.h;h=98272c1a4d572124286b2dcb0c46b2049704cc3b;hb=18a3c1f1b87744c78fde76f96c3e9cf43e137b36;hp=6a4eb1cc4252382c8092840c6f64a0d6c4e23702;hpb=56d77631b00f8622c211db6721cb087a5d29f0e2;p=SXSI%2FTextCollection.git diff --git a/TextCollection.h b/TextCollection.h index 6a4eb1c..98272c1 100644 --- a/TextCollection.h +++ b/TextCollection.h @@ -25,11 +25,9 @@ #include #include // Defines std::pair. -// Default samplerate for suffix array samples -#define TEXTCOLLECTION_DEFAULT_SAMPLERATE 64 - namespace SXSI { + /** * General interface for a text collection * @@ -44,53 +42,40 @@ namespace SXSI // Type for text position (FIXME ulong or long?) typedef ulong TextPosition; - /** - * Init an instance of a text collection object - * - * Returns a pointer to an object implementing this interface. - */ - static TextCollection * InitTextCollection(unsigned samplerate = TEXTCOLLECTION_DEFAULT_SAMPLERATE); + // mode flag: Default includes both the index and "naive" text + enum index_mode_t { index_mode_default, index_mode_text_only }; // index_mode_index_only + /** * Load from a file * + * The second parameter is a prefix to be used for multiple + * files. (SWCSAWrapper uses multiple save files!) + * * New samplerate can be given, otherwise will use the one specified in the save file! - * Note: This is not a static method; call InitTextCollection() first to get the object handle. * * Throws an exception if std::fread() fails. * */ - virtual void Load(FILE *, unsigned samplerate = 0) = 0; + static TextCollection* Load(FILE *, char const *, index_mode_t = index_mode_default, unsigned samplerate = 0); + /** * Save data structure into a file - * + * + * The second parameter is a prefix to be used for multiple + * files. (SWCSAWrapper uses multiple save files!) + * * Throws an exception if std::fwrite() fails. */ - virtual void Save(FILE *) const = 0; + virtual void Save(FILE *, char const *) const = 0; + /** * Virtual destructor */ virtual ~TextCollection() { }; - /** - * Insert text - * - * Must be a zero-terminated string from alphabet [1,255]. - * Can not be called after makeStatic(). - * The i'th text insertion gets an identifier value i-1. - * In other words, document identifiers start from 0. - */ - virtual void InsertText(uchar const *) = 0; - /** - * Make static - * - * Convert to a static collection; reduces space and time complexities. - * New texts can not be inserted after this operation. - */ - virtual void MakeStatic() = 0; - + /** - tests if the string pointed to by DocId is empty - */ - + * Tests if the string pointed to by DocId is empty + */ virtual bool EmptyText(DocId) const = 0; /** @@ -98,20 +83,34 @@ namespace SXSI * * Returns the i'th text in the collection. * The numbering starts from 0. + * + * Call DeleteText() for each pointer returned by GetText() + * to avoid possible memory leaks. */ virtual uchar* GetText(DocId) const = 0; + virtual void DeleteText(uchar *text) const = 0; + + /** + * Returns a pointer to the beginning of texts i, i+1, ..., j. + * Texts are separated by a '\0' byte. + * + * Call DeleteText() for each pointer returned by GetText() + * to avoid possible memory leaks. + */ + virtual uchar * GetText(DocId i, DocId j) const = 0; + /** * Returns substring [i, j] of k'th text * * Note: Parameters i and j are text positions inside the k'th text. */ - virtual uchar* GetText(DocId, TextPosition, TextPosition) const = 0; +// virtual uchar* GetText(DocId, TextPosition, TextPosition) const = 0; /** * Returns backwards (reverse) iterator to the end of i'th text * * Note: Do we need this? * Forward iterator would be really in-efficient compared to - * getText(k) and getText(k, i, j). + * getText(k). * * TODO Define and implement const_reverse_iterator. */ @@ -132,8 +131,21 @@ namespace SXSI virtual bool IsLessThan(uchar const *) const = 0; /** - * Counting queries - * + * Existential queries for given DocId interval. + */ + virtual bool IsPrefix(uchar const *, DocId, DocId) const = 0; + virtual bool IsSuffix(uchar const *, DocId, DocId) const = 0; + virtual bool IsEqual(uchar const *, DocId, DocId) const = 0; + virtual bool IsContains(uchar const *, DocId, DocId) const = 0; + virtual bool IsLessThan(uchar const *, DocId, DocId) const = 0; + + /** + * Counting queries + * Result is the number of occurrences. + */ + virtual ulong Count(uchar const *) const = 0; + /** + * More counting queries * Result is the number of documents. */ virtual unsigned CountPrefix(uchar const *) const = 0; @@ -142,6 +154,15 @@ namespace SXSI virtual unsigned CountContains(uchar const *) const = 0; virtual unsigned CountLessThan(uchar const *) const = 0; + /** + * Counting queries for given DocId interval + */ + virtual unsigned CountPrefix(uchar const *, DocId, DocId) const = 0; + virtual unsigned CountSuffix(uchar const *, DocId, DocId) const = 0; + virtual unsigned CountEqual(uchar const *, DocId, DocId) const = 0; + virtual unsigned CountContains(uchar const *, DocId, DocId) const = 0; + virtual unsigned CountLessThan(uchar const *, DocId, DocId) const = 0; + /** * Document reporting queries * @@ -154,6 +175,17 @@ namespace SXSI virtual document_result Equal(uchar const *) const = 0; virtual document_result Contains(uchar const *) const = 0; virtual document_result LessThan(uchar const *) const = 0; + virtual document_result KMismaches(uchar const *, unsigned) const = 0; + virtual document_result KErrors(uchar const *, unsigned) const = 0; + + /** + * Document reporting queries for given DocId interval. + */ + virtual document_result Prefix(uchar const *, DocId, DocId) const = 0; + virtual document_result Suffix(uchar const *, DocId, DocId) const = 0; + virtual document_result Equal(uchar const *, DocId, DocId) const = 0; + virtual document_result Contains(uchar const *, DocId, DocId) const = 0; + virtual document_result LessThan(uchar const *, DocId, DocId) const = 0; /** * Full reporting queries @@ -163,11 +195,48 @@ namespace SXSI // Data type for results typedef std::vector > full_result; virtual full_result FullContains(uchar const *) const = 0; + // Full reporting query for given DocId interval + virtual full_result FullContains(uchar const *, DocId, DocId) const = 0; + + virtual full_result FullKMismatches(uchar const *, unsigned) const = 0; + virtual full_result FullKErrors(uchar const *, unsigned) const = 0; + + + virtual TextPosition getLength() const + { + std::cerr << "TextCollection::getLength() is unsupported! Use RLCSA instead." << std::endl; + std::exit(2); + return 0; + } + + virtual TextPosition LF(uchar c, TextPosition i) const + { + std::cerr << "TextCollection::LF() is unsupported! Use RLCSA instead." << std::endl; + std::exit(2); + return 0; + } + + virtual uchar* getSuffix(TextPosition pos, unsigned l) const + { + std::cerr << "TextCollection::getSuffix() is unsupported! Use RLCSA instead." << std::endl; + std::exit(2); + return 0; + } + + virtual DocId getDoc(TextPosition i) const + { + std::cerr << "TextCollection::getDoc() is unsupported! Use RLCSA instead." << std::endl; + std::exit(2); + return 0; + } + protected: - // Protected constructor; call the static function InitTextCollection(). + // Protected constructor; use TextCollectionBuilder TextCollection() { }; +// index_mode_t indexMode; + // No copy constructor or assignment TextCollection(TextCollection const&); TextCollection& operator = (TextCollection const&);