From ccc18959e93d8986c0ce81209865a0a2b5f42be6 Mon Sep 17 00:00:00 2001 From: nvalimak Date: Mon, 1 Dec 2008 13:06:13 +0000 Subject: [PATCH] Added TextCollection::Save() and TextCollection::Load() functionality git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@22 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- CSA.cpp | 200 ++++++++++++++++++++++++++++++++--------- CSA.h | 5 +- TextCollection.h | 6 +- dynFMI.h | 2 +- testTextCollection.cpp | 5 ++ 5 files changed, 169 insertions(+), 49 deletions(-) diff --git a/CSA.cpp b/CSA.cpp index a15c733..754dfa8 100644 --- a/CSA.cpp +++ b/CSA.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include // For strlen() using std::vector; @@ -190,58 +191,22 @@ void CSA::MakeStatic() delete dynFMI; dynFMI = 0; - ulong i, min = 0, - max; - for (i=0;i<256;i++) - C[i]=0; - for (i=0;i0) {min = i; break;} - for (i=255;i>=min;--i) - if (C[i]>0) {max = i; break;} - - // Print frequencies -/* for(i = 0; i < 256; i++) - if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]); - fflush(stdout);*/ - - ulong prev=C[0], temp; - C[0]=0; - for (i=1;i<256;i++) { - temp = C[i]; - C[i]=C[i-1]+prev; - prev = temp; - } - this->codetable = node::makecodetable(bwt,n); - alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0); - //if (alphabetrank->Test(bwt,n)) printf("alphabetrank ok\n"); - -/* for (i = 0; i < n; ++i) - { - uchar c = alphabetrank->charAtPos(i); - TextPosition j = C[c]+alphabetrank->rank(c, i)-1; - printf("LF[%lu] = %lu\n", i, j); - }*/ + makewavelet(bwt); // Calculate BWT end-marker position (of last inserted text) - i = 0; + ulong i = 0; while (bwt[i] != '\0') { - uchar c = alphabetrank->charAtPos(i); + uchar c = bwt[i]; i = C[c]+alphabetrank->rank(c, i)-1; } bwtEndPos = i; //printf("bwtEndPos = %lu\n", bwtEndPos); - + delete [] bwt; // Make sampling tables maketables(); - // to avoid running out of unsigned, the sizes are computed in specific order (large/small)*small - // |class CSA| +256*|TCodeEntry|+|C[]|+|suffixes[]+positions[]|+... - //printf("FMindex takes %d B\n", - // 6*W/8+256*3*W/8+256*W/8+ (2*n/(samplerate*8))*W+sampled->SpaceRequirementInBits()/8+alphabetrank->SpaceRequirementInBits()/8+W/8); } @@ -648,18 +613,128 @@ TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const return a; } -void CSA::Load(FILE *filename, unsigned samplerate) +/** + * Save index to a file handle + * + * Throws a std::runtime_error exception on i/o error. + * First byte that is saved represents the version number of the save file. + * In version 1 files, the data fields are: + * version info; + * samplerate; + * length of the BWT; + * end marker position in BWT; + * BWT string of length n; + * number of texts; + * + * array of pairs. + * + * TODO: Save the data structures instead of BWT sequence? + */ +void CSA::Save(FILE *file) const { - // TODO + // Saving version 1 data: + uchar versionFlag = 1; + if (std::fwrite(&versionFlag, 1, 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (version flag)."); + if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (samplerate)."); + + if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (n)."); + + if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (bwt end position)."); + + for (ulong offset = 0; offset < n; offset ++) + { + uchar c = alphabetrank->charAtPos(offset); + if (std::fwrite(&c, sizeof(uchar), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (bwt sequence)."); + } + + unsigned r = textLength.size(); + if (std::fwrite(&r, sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (r)."); + + for (r = 0; r < textLength.size(); ++ r) + { + if (std::fwrite(&(textLength[r].first), sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (text length)."); + if (std::fwrite(&(textLength[r].second), sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (text start)."); + } } -void CSA::Save(FILE *filename) + +/** + * Load index from a file handle + * + * Throws a std::runtime_error exception on i/o error. + * For more info, see CSA::Save(). + */ +void CSA::Load(FILE *file, unsigned samplerate) { - // TODO + // Delete everything + delete dynFMI; dynFMI = 0; + delete alphabetrank; alphabetrank = 0; + delete sampled; sampled = 0; + delete [] suffixes; suffixes = 0; + delete [] positions; positions = 0; + delete [] codetable; codetable = 0; + + endmarkers.clear(); + textLength.clear(); + + this->samplerate = samplerate; + this->n = 0; + + uchar versionFlag = 0; + if (std::fread(&versionFlag, 1, 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (version flag)."); + if (versionFlag != 1) + throw std::runtime_error("CSA::Load(): invalid start byte."); + + if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (samplerate)."); + if (this->samplerate == 0) + this->samplerate = samplerate; + + if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (n)."); + + if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (bwt end position)."); + + uchar *bwt = new uchar[n]; + for (ulong offset = 0; offset < n; offset ++) + if (std::fread((bwt + offset), sizeof(uchar), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (bwt sequence)."); + + unsigned r = 0; + if (std::fread(&r, sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (r)."); + + while (r > 0) + { + TextPosition length = 0, start = 0; + if (std::fread(&length, sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (text length)."); + if (std::fread(&start, sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (text start)."); + + textLength.push_back(make_pair(length, start)); + --r; + } + + // Construct data structures + makewavelet(bwt); + delete [] bwt; + maketables(); } + /** * Rest of the functions follow... */ @@ -789,6 +864,7 @@ CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(sample } CSA::~CSA() { + delete dynFMI; delete alphabetrank; delete sampled; delete [] suffixes; @@ -796,6 +872,42 @@ CSA::~CSA() { delete [] codetable; } +void CSA::makewavelet(uchar *bwt) +{ + ulong i, min = 0, + max; + for (i=0;i<256;i++) + C[i]=0; + for (i=0;i0) {min = i; break;} + for (i=255;i>=min;--i) + if (C[i]>0) {max = i; break;} + + // Print frequencies +/* for(i = 0; i < 256; i++) + if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]); + fflush(stdout);*/ + + ulong prev=C[0], temp; + C[0]=0; + for (i=1;i<256;i++) { + temp = C[i]; + C[i]=C[i-1]+prev; + prev = temp; + } + this->codetable = node::makecodetable(bwt,n); + alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0); + //if (alphabetrank->Test(bwt,n)) printf("alphabetrank ok\n"); + +/* for (i = 0; i < n; ++i) + { + uchar c = alphabetrank->charAtPos(i); + TextPosition j = C[c]+alphabetrank->rank(c, i)-1; + printf("LF[%lu] = %lu\n", i, j); + }*/ +} void CSA::maketables() { diff --git a/CSA.h b/CSA.h index 1ecddc6..9f6c500 100644 --- a/CSA.h +++ b/CSA.h @@ -80,9 +80,9 @@ public: // full_result is inherited from SXSI::TextCollection. full_result FullContains(uchar const *) const; - // TODO implement: + // Index from/to disk void Load(FILE *, unsigned); - void Save(FILE *); + void Save(FILE *) const; private: class TCodeEntry { @@ -216,6 +216,7 @@ private: uchar * BWT(uchar *); uchar * LoadFromFile(const char *); void SaveToFile(const char *, uchar *); + void makewavelet(uchar *); void maketables(); // Following are not part of the public API diff --git a/TextCollection.h b/TextCollection.h index e7e36e2..e7f0d4b 100644 --- a/TextCollection.h +++ b/TextCollection.h @@ -56,14 +56,16 @@ namespace SXSI * New samplerate can be given, otherwise will use the one specified in the save file! * Note: This is not a static method; call InitTextCollection() first to get the object handle. * - * Throws an exception if something goes wrong (unlikely since we are passing a file descriptor). + * Throws an exception if std::fread() fails. * */ virtual void Load(FILE *, unsigned samplerate = 0) = 0; /** * Save data structure into a file + * + * Throws an exception if std::fwrite() fails. */ - virtual void Save(FILE *) = 0; + virtual void Save(FILE *) const = 0; /** * Virtual destructor */ diff --git a/dynFMI.h b/dynFMI.h index 41ef992..a57ee92 100644 --- a/dynFMI.h +++ b/dynFMI.h @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #ifndef SAMPLE diff --git a/testTextCollection.cpp b/testTextCollection.cpp index d005b8f..d89a328 100644 --- a/testTextCollection.cpp +++ b/testTextCollection.cpp @@ -37,6 +37,8 @@ int main() csa->InsertText(text); csa->MakeStatic(); +// FILE *pFile = fopen ( "mysave.txt" , "rb" ); +// csa->Load(pFile); text = csa->GetText(0); cout << "Text 0: \"" << text << "\"" << endl; @@ -65,5 +67,8 @@ int main() fr = csa->FullContains((uchar *)"ab"); printFullResult(fr); +// FILE *pFile2 = fopen ( "mysave.txt" , "wb" ); +// csa->Save(pFile2); + delete csa; } -- 2.17.1