1 /***************************************************************************
2 * Copyright (C) 2006 by Wolfgang Gerlach *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU General Public License as published by *
7 * the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU General Public License for more details. *
15 * You should have received a copy of the GNU General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
19 ***************************************************************************/
38 using std::priority_queue;
43 // -------- DynFMI --------
46 deleteDynFMINodes(root);
48 delete[] leaves; //free(leaves) // ???;
50 delete SampledSAPositionsIndicator;
59 void DynFMI::deleteDynFMINodes(WaveletNode *n){
60 if (n->right) deleteDynFMINodes(n->right);
61 if (n->left) deleteDynFMINodes(n->left);
65 DynFMI::DynFMI(uchar *text, ulong x, ulong n, bool del){
66 initEmptyDynFMI(text, x, n);
67 if (del) delete [] text;
70 DynFMI::DynFMI(char *readfile){
72 std::ifstream file(readfile);
75 cerr << "error reading file: " << readfile << endl;
79 file.seekg(0, ios::end);
80 ulong n = file.tellg();
81 file.seekg(0, ios::beg);
83 uchar *text = new uchar[n];
96 initEmptyDynFMI(text, 1, n);
103 void DynFMI::iterateReset(){
105 recursiveIterateResetWaveletNode(root);
108 void DynFMI::recursiveIterateResetWaveletNode(WaveletNode *w){
109 w->bittree->iterateReset();
111 if (w->left) recursiveIterateResetWaveletNode(w->left);
112 if (w->right) recursiveIterateResetWaveletNode(w->right);
116 bool DynFMI::iterateNext(){
118 return !(iterate > getSize());
121 uchar DynFMI::iterateGetSymbol(){
124 WaveletNode *walk = root;
129 bit = walk->bittree->iterateGetBit(); // TODO improve
130 i=walk->bittree->iterateGetRank(bit);
132 walk->bittree->iterateNext();
136 if (walk->right == 0) return walk->c1;
139 if (walk->left == 0) return walk->c0;
149 uchar* DynFMI::getBWT(){
150 ulong n = root->bittree->getPositions();
152 uchar *text = new uchar[n];
156 //for (ulong i=1; i <= root->bittree->getPositions(); i++)
157 // text[i-1]=(*this)[i];
164 text[i] = iterateGetSymbol();
165 data = iterateNext();
173 std::ostream& DynFMI::getBWTStream(std::ostream& stream){
174 ulong n = root->bittree->getPositions();
176 //uchar *text = new uchar[n];
180 //for (ulong i=1; i <= root->bittree->getPositions(); i++)
181 // text[i-1]=(*this)[i];
188 //text[i] = iterateGetSymbol();
189 stream << iterateGetSymbol();
190 data = iterateNext();
198 //void DynFMI::printDynFMIContent(ostream& stream){
200 // for (ulong i=1; i<=getSize(); i++)
208 void DynFMI::deleteLeaves(WaveletNode *node){
214 deleteLeaves(node->left);
219 deleteLeaves(node->right);
223 // is a leaf, delete it!
225 if (node==node->parent->left) node->parent->left=0;
226 else node->parent->right=0;
232 void DynFMI::makeCodes(ulong code, int bits, WaveletNode *node){
234 if (node == node->left) {
235 cout << "makeCodes: autsch" << endl;
241 makeCodes(code | (0 << bits), bits+1, node->left);
242 makeCodes(code | (1 << bits), bits+1, node->right);
244 codes[node->c0] = code;
245 codelengths[node->c0] = bits+1;
249 void DynFMI::appendBVTrees(WaveletNode *node){
250 node->bittree = new BVTree();
252 if (node->left) appendBVTrees(node->left);
253 if (node->right) appendBVTrees(node->right);
256 void DynFMI::initEmptyDynFMI(uchar *text, ulong x, ulong n){
257 // pointers to the leaves for select
258 leaves = (WaveletNode**) new WaveletNode*[256];
259 for(int j=0; j<256; j++) leaves[j]=0;
263 for(i=0; i < n; i++) {
265 cerr << "Error: unsigned char 0 is reserved." << endl;
269 if (leaves[text[i]]==0) {
270 leaves[text[i]] = new WaveletNode(text[i]);
272 leaves[text[i]]->weight++;
275 // separation symbol:
276 leaves[0] = new WaveletNode((uchar)0);
280 priority_queue< WaveletNode*, vector<WaveletNode*>, std::greater<WaveletNode*> > q;
283 for(int j=255; j>=0; j--){
285 q.push( (leaves[j]) );
291 // creates huffman shape:
292 while (q.size() > 1) {
294 WaveletNode *left = q.top();
297 WaveletNode *right = q.top();
300 q.push( new WaveletNode(left, right) );
308 makeCodes(0,0, root); // writes codes and codelengths
311 // merge leaves (one leaf represent two characters!)
312 for(int j=0; j<256; j++){
316 if (leaves[j]->parent->left==leaves[j]) {
317 leaves[j]->parent->c0=j;
319 leaves[j]->parent->c1=j;
321 leaves[j]=leaves[j]->parent; // merge
328 // array C needed for backwards search
329 for(int j=0; j<256+256; j++) C[j] = 0;
332 this->SampledSAPositionsIndicator = new BVTree();
333 this->sampledSATree = new SampledSATree();
336 this->pos = new Pos(sampleInterval);
337 this->handle = new Handle();
339 pos->handle=this->handle;
340 handle->pos=this->pos;
344 void DynFMI::append(uchar *text, ulong length){
346 ulong start = root->bittree->getPositions();
347 for (i = start; i<start+length; i++)
348 insert(text[i-start], i+1);
352 void DynFMI::append(uchar c){
353 insert(c, root->bittree->getPositions()+1);
356 void DynFMI::insert(uchar c, ulong i){
358 if (codelengths[c]==0) {
359 cerr << "error: Symbol \"" << c << "\" (" << (int)c << ") is not in the code table!" << endl;;
365 ulong code = codes[c];
368 WaveletNode *walk = root;
372 bit = ((code & (1u << level)) != 0);
374 walk->bittree->insertBit(bit,i); // TODO improve
375 i=walk->bittree->rank(bit, i);
389 j=binaryTree_parent(j);
395 void DynFMI::deleteSymbol(ulong i){
396 WaveletNode *walk = root;
401 // original slow version:
402 //bit = (*walk->bittree)[i];
404 //i=walk->bittree->rank(bit, i);
405 //walk->bittree->deleteBit(old_i);
408 walk->bittree->deleteBit(i);
409 bit=walk->bittree->getLastBitDeleted();
410 i=walk->bittree->getLastRank();
413 if (walk->right == 0) {
419 if (walk->left == 0) {
432 j=binaryTree_parent(j);
440 uchar DynFMI::operator[](ulong i){
441 WaveletNode *walk = root;
446 bit = (*walk->bittree)[i]; //TODO improve by reducing
447 i=walk->bittree->rank(bit, i);
450 if (walk->right == 0) return walk->c1;
453 if (walk->left == 0) return walk->c0;
462 ulong DynFMI::rank(uchar c, ulong i){
466 ulong code = codes[c];
470 WaveletNode *walk = root;
474 bit = ((code & (1u << level)) != 0);
476 i=walk->bittree->rank(bit, i);
478 if (walk->right == 0) return i;
481 if (walk->left == 0) return i;
488 cerr << "error: DynFMI::rank: left while loop" << endl;
493 ulong DynFMI::select(uchar c, ulong i){
495 WaveletNode *walk = leaves[c];
497 bool bit = (walk->c1==c);
499 while (walk->parent) {
500 i=walk->bittree->select(bit, i);
502 bit = (walk == walk->parent->right);
506 i=walk->bittree->select(bit, i);
512 ulong DynFMI::count(uchar *pattern){
514 ulong i = strlen((char*)pattern);
517 ulong ep = getSize();
519 while ((sp <= ep) && (i >=1)) {
521 sp=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, sp-1) + 1;
522 ep=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,ep);
530 ulong* DynFMI::locate(uchar *pattern){
532 ulong i = strlen((char*)pattern);
535 ulong ep = getSize();
537 while ((sp <= ep) && (i >=1)) {
539 sp=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, sp-1) + 1;
540 ep=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,ep);
545 ulong numberOfMatches = ep-sp +1;
546 ulong *matches = new ulong[2*numberOfMatches + 1];
547 matches[0] = numberOfMatches;
551 cout << "----------------------" << endl;
552 for (ulong j=sp; j <= ep; j++) {
556 while (!(*SampledSAPositionsIndicator)[i]) {
559 i= (ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, i);
564 sampledSATree->getSample(SampledSAPositionsIndicator->rank(true, i));
566 //cout << "found: (" << sampledSATree->handle << ", " << sampledSATree->sampledSAValue + diff << ")"<< endl;
567 matches[k++] = sampledSATree->handle;
568 matches[k++] = sampledSATree->sampledSAValue + diff;
575 // size must include endmarker!
576 ulong DynFMI::addText(uchar const * str, ulong n){
583 ulong key = pos->appendText(n);
585 // Adding an empty text (end-marker)
589 //if (del) delete [] str;
593 insert(str[n-2],i); // insert second last character, corresponds to suffix of length 1
595 sample = (n-1 % sampleInterval == 0);
596 SampledSAPositionsIndicator->insertBit(sample, i);
597 if (sample) sampledSATree->insertSample(n-1, key, SampledSAPositionsIndicator->rank(true,i));
599 for (ulong t=n-2; t > 0; t--) {
600 i= 1+getNumberOfSymbolsSmallerThan(str[t]) + rank(str[t],i);
603 sample = ((t) % sampleInterval == 0);
604 SampledSAPositionsIndicator->insertBit(sample, i);
606 if (sample) sampledSATree->insertSample(t, key, SampledSAPositionsIndicator->rank(true,i));
611 i= 1+ getNumberOfSymbolsSmallerThan(str[0]) + rank(str[0],i);
614 SampledSAPositionsIndicator->insertBit(true, i);
615 sampledSATree->insertSample(1, key, SampledSAPositionsIndicator->rank(true,i));
617 // if (del) delete [] str;
623 ulong DynFMI::addTextFromFile(char* filename, ulong n){
631 std::ifstream str(filename, ios::binary);
634 cerr << "error reading file: " << filename << endl;
642 ulong buf_len; //1048576; //10MB
646 if (BUFFER==0) buf_len = n;
647 else buf_len = BUFFER;
649 buffer = new char[buf_len];
650 buf_pos = (n-2) / buf_len;
651 str.seekg(buf_pos*buf_len);
652 str.read(buffer, buf_len);
653 str.clear(std::ios_base::goodbit);
656 ulong key = pos->appendText(n);
661 c=buffer[(n-2)%buf_len];
662 insert(c,i); // insert second last character, corresponds to suffix of length 1
664 sample = (n-1 % sampleInterval == 0);
665 SampledSAPositionsIndicator->insertBit(sample, i);
666 if (sample) sampledSATree->insertSample(n-1, key, SampledSAPositionsIndicator->rank(true,i));
668 if ((n-2)%buf_len == 0) {
671 str.seekg(buf_pos*buf_len);
672 str.read(buffer, buf_len);
674 for (ulong t=n-2; t > 0; t--) {
676 i= 1+getNumberOfSymbolsSmallerThan(c) + rank(c,i);
678 c=buffer[(t-1)%buf_len];
682 if (((t-1)%buf_len == 0) && ((t-1)!=0)) {
685 cerr << "buf_pos too small" << endl;
690 str.seekg(buf_pos*buf_len);
691 str.read(buffer, buf_len);
695 sample = ((t) % sampleInterval == 0);
696 SampledSAPositionsIndicator->insertBit(sample, i);
698 if (sample) sampledSATree->insertSample(t, key, SampledSAPositionsIndicator->rank(true,i));
702 i= 1+ getNumberOfSymbolsSmallerThan(c) + rank(c,i);
706 SampledSAPositionsIndicator->insertBit(true, i);
707 sampledSATree->insertSample(1, key, SampledSAPositionsIndicator->rank(true,i));
716 uchar* DynFMI::retrieveText(ulong key){
717 cout << "key: " << key << endl;
719 // TODO access two times, too much
720 ulong i=handle->getPos(key); //=bwtEnd;
725 ulong n=pos->getTextSize(i);
727 text = new uchar[n]; // last byte 0 for cout
732 for (ulong t=n-2; t < n; t--) {
734 text[t]=(c==0?'#':c);
736 i= 0+ getNumberOfSymbolsSmallerThan(c) + rank(c,i);
740 for (ulong i=0; i< n-1 ; i++) if (text[i]==0) text[i]='-';
748 bool DynFMI::deleteText(ulong key){
749 // TODO access two times, can do better
750 ulong i=handle->getPos(key); //=bwtEnd;
751 if (i == 0) return false;
753 ulong n=pos->getTextSize(i);
754 ulong *text = new ulong[n];
757 for (ulong t=n-1; t <n; t--) {
760 i= 0+ getNumberOfSymbolsSmallerThan(c) + rank(c,i); // TODO improve by lastrank!
764 std::sort(text, text + n);
767 for (i=n-1; i<n; i--) {
768 this->deleteSymbol(text[i]);
770 SampledSAPositionsIndicator->deleteBit(text[i]);
775 handle->deleteKey(key);
781 ulong DynFMI::getNumberOfSymbolsSmallerThan(uchar c){
785 if (binaryTree_isRightChild(j))
786 r += C[binaryTree_left(binaryTree_parent(j))];
788 j=binaryTree_parent(j);
796 void DynFMI::printDynFMIContent(std::ostream& stream){
798 for (ulong i=1; i<=getSize(); i++)