1 /***************************************************************************
2 * Copyright (C) 2006 by Wolfgang Gerlach *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU General Public License as published by *
7 * the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU General Public License for more details. *
15 * You should have received a copy of the GNU General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
19 ***************************************************************************/
38 using std::priority_queue;
43 // -------- DynFMI --------
46 deleteDynFMINodes(root);
48 delete[] leaves; //free(leaves) // ???;
50 delete SampledSAPositionsIndicator;
59 void DynFMI::deleteDynFMINodes(WaveletNode *n){
60 if (n->right) deleteDynFMINodes(n->right);
61 if (n->left) deleteDynFMINodes(n->left);
65 DynFMI::DynFMI(uchar *text, ulong x, ulong n, bool del){
67 initEmptyDynFMI(text, x, n);
68 if (del) delete [] text;
71 DynFMI::DynFMI(char *readfile){
73 std::ifstream file(readfile);
76 cerr << "error reading file: " << readfile << endl;
80 file.seekg(0, ios::end);
81 ulong n = file.tellg();
82 file.seekg(0, ios::beg);
84 uchar *text = new uchar[n];
97 initEmptyDynFMI(text, 1, n);
104 void DynFMI::iterateReset(){
106 recursiveIterateResetWaveletNode(root);
109 void DynFMI::recursiveIterateResetWaveletNode(WaveletNode *w){
110 w->bittree->iterateReset();
112 if (w->left) recursiveIterateResetWaveletNode(w->left);
113 if (w->right) recursiveIterateResetWaveletNode(w->right);
117 bool DynFMI::iterateNext(){
119 return !(iterate > getSize());
122 uchar DynFMI::iterateGetSymbol(){
125 WaveletNode *walk = root;
130 bit = walk->bittree->iterateGetBit(); // TODO improve
131 i=walk->bittree->iterateGetRank(bit);
133 walk->bittree->iterateNext();
137 if (walk->right == 0) return walk->c1;
140 if (walk->left == 0) return walk->c0;
150 uchar* DynFMI::getBWT(){
151 ulong n = root->bittree->getPositions();
153 uchar *text = new uchar[n];
157 //for (ulong i=1; i <= root->bittree->getPositions(); i++)
158 // text[i-1]=(*this)[i];
165 text[i] = iterateGetSymbol();
166 data = iterateNext();
174 std::ostream& DynFMI::getBWTStream(std::ostream& stream){
175 ulong n = root->bittree->getPositions();
177 //uchar *text = new uchar[n];
181 //for (ulong i=1; i <= root->bittree->getPositions(); i++)
182 // text[i-1]=(*this)[i];
189 //text[i] = iterateGetSymbol();
190 stream << iterateGetSymbol();
191 data = iterateNext();
199 //void DynFMI::printDynFMIContent(ostream& stream){
201 // for (ulong i=1; i<=getSize(); i++)
209 void DynFMI::deleteLeaves(WaveletNode *node){
215 deleteLeaves(node->left);
220 deleteLeaves(node->right);
224 // is a leaf, delete it!
226 if (node==node->parent->left) node->parent->left=0;
227 else node->parent->right=0;
233 void DynFMI::makeCodes(ulong code, int bits, WaveletNode *node){
235 if (node == node->left) {
236 cout << "makeCodes: autsch" << endl;
242 makeCodes(code | (0 << bits), bits+1, node->left);
243 makeCodes(code | (1 << bits), bits+1, node->right);
245 codes[node->c0] = code;
246 codelengths[node->c0] = bits+1;
250 void DynFMI::appendBVTrees(WaveletNode *node){
251 node->bittree = new BVTree();
253 if (node->left) appendBVTrees(node->left);
254 if (node->right) appendBVTrees(node->right);
257 void DynFMI::initEmptyDynFMI(uchar *text, ulong x, ulong n){
258 // pointers to the leaves for select
259 leaves = (WaveletNode**) new WaveletNode*[256];
260 for(int j=0; j<256; j++) leaves[j]=0;
264 for(i=0; i < n; i++) {
266 cerr << "Error: unsigned char 0 is reserved." << endl;
270 if (leaves[text[i]]==0) {
271 leaves[text[i]] = new WaveletNode(text[i]);
273 leaves[text[i]]->weight++;
276 // separation symbol:
277 leaves[0] = new WaveletNode((uchar)0);
281 priority_queue< WaveletNode*, vector<WaveletNode*>, std::greater<WaveletNode*> > q;
284 for(int j=255; j>=0; j--){
286 q.push( (leaves[j]) );
292 // creates huffman shape:
293 while (q.size() > 1) {
295 WaveletNode *left = q.top();
298 WaveletNode *right = q.top();
301 q.push( new WaveletNode(left, right) );
309 makeCodes(0,0, root); // writes codes and codelengths
312 // merge leaves (one leaf represent two characters!)
313 for(int j=0; j<256; j++){
317 if (leaves[j]->parent->left==leaves[j]) {
318 leaves[j]->parent->c0=j;
320 leaves[j]->parent->c1=j;
322 leaves[j]=leaves[j]->parent; // merge
329 // array C needed for backwards search
330 for(int j=0; j<256+256; j++) C[j] = 0;
333 this->SampledSAPositionsIndicator = new BVTree();
334 this->sampledSATree = new SampledSATree();
337 //this->pos = new Pos(sampleInterval);
338 //this->handle = new Handle();
340 //pos->handle=this->handle;
341 //handle->pos=this->pos;
345 void DynFMI::append(uchar *text, ulong length){
347 ulong start = root->bittree->getPositions();
348 for (i = start; i<start+length; i++)
349 insert(text[i-start], i+1);
353 void DynFMI::append(uchar c){
354 insert(c, root->bittree->getPositions()+1);
357 void DynFMI::insert(uchar c, ulong i){
359 if (codelengths[c]==0) {
360 cerr << "error: Symbol \"" << c << "\" (" << (int)c << ") is not in the code table!" << endl;;
366 ulong code = codes[c];
369 WaveletNode *walk = root;
373 bit = ((code & (1u << level)) != 0);
375 walk->bittree->insertBit(bit,i); // TODO improve
376 i=walk->bittree->rank(bit, i);
390 j=binaryTree_parent(j);
396 void DynFMI::deleteSymbol(ulong i){
397 WaveletNode *walk = root;
402 // original slow version:
403 //bit = (*walk->bittree)[i];
405 //i=walk->bittree->rank(bit, i);
406 //walk->bittree->deleteBit(old_i);
409 walk->bittree->deleteBit(i);
410 bit=walk->bittree->getLastBitDeleted();
411 i=walk->bittree->getLastRank();
414 if (walk->right == 0) {
420 if (walk->left == 0) {
433 j=binaryTree_parent(j);
441 uchar DynFMI::operator[](ulong i){
442 WaveletNode *walk = root;
447 bit = (*walk->bittree)[i]; //TODO improve by reducing
448 i=walk->bittree->rank(bit, i);
451 if (walk->right == 0) return walk->c1;
454 if (walk->left == 0) return walk->c0;
463 ulong DynFMI::rank(uchar c, ulong i){
467 ulong code = codes[c];
471 WaveletNode *walk = root;
475 bit = ((code & (1u << level)) != 0);
477 i=walk->bittree->rank(bit, i);
479 if (walk->right == 0) return i;
482 if (walk->left == 0) return i;
489 cerr << "error: DynFMI::rank: left while loop" << endl;
494 ulong DynFMI::select(uchar c, ulong i){
496 WaveletNode *walk = leaves[c];
498 bool bit = (walk->c1==c);
500 while (walk->parent) {
501 i=walk->bittree->select(bit, i);
503 bit = (walk == walk->parent->right);
507 i=walk->bittree->select(bit, i);
513 ulong DynFMI::count(uchar *pattern){
515 ulong i = strlen((char*)pattern);
518 ulong ep = getSize();
520 while ((sp <= ep) && (i >=1)) {
522 sp=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, sp-1) + 1;
523 ep=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,ep);
531 ulong* DynFMI::locate(uchar *pattern){
533 ulong i = strlen((char*)pattern);
536 ulong ep = getSize();
538 while ((sp <= ep) && (i >=1)) {
540 sp=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, sp-1) + 1;
541 ep=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,ep);
546 ulong numberOfMatches = ep-sp +1;
547 ulong *matches = new ulong[2*numberOfMatches + 1];
548 matches[0] = numberOfMatches;
552 cout << "----------------------" << endl;
553 for (ulong j=sp; j <= ep; j++) {
557 while (!(*SampledSAPositionsIndicator)[i]) {
560 i= (ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, i);
565 sampledSATree->getSample(SampledSAPositionsIndicator->rank(true, i));
567 //cout << "found: (" << sampledSATree->handle << ", " << sampledSATree->sampledSAValue + diff << ")"<< endl;
568 matches[k++] = sampledSATree->handle;
569 matches[k++] = sampledSATree->sampledSAValue + diff;
576 // size must include endmarker!
577 void DynFMI::addText(uchar const * str, ulong n){
582 i = numberOfTexts + 1;
586 // Adding an empty text (end-marker)
590 //if (del) delete [] str;
594 insert(str[n-2],i); // insert second last character, corresponds to suffix of length 1
596 sample = (n-1 % sampleInterval == 0);
597 SampledSAPositionsIndicator->insertBit(sample, i);
598 if (sample) sampledSATree->insertSample(n-1, key, SampledSAPositionsIndicator->rank(true,i));
600 for (ulong t=n-2; t > 0; t--) {
601 i= 1+getNumberOfSymbolsSmallerThan(str[t]) + rank(str[t],i);
604 sample = ((t) % sampleInterval == 0);
605 SampledSAPositionsIndicator->insertBit(sample, i);
607 if (sample) sampledSATree->insertSample(t, key, SampledSAPositionsIndicator->rank(true,i));
612 i= 1+ getNumberOfSymbolsSmallerThan(str[0]) + rank(str[0],i);
615 SampledSAPositionsIndicator->insertBit(true, i);
616 sampledSATree->insertSample(1, key, SampledSAPositionsIndicator->rank(true,i));
618 // if (del) delete [] str;
624 /*ulong DynFMI::addTextFromFile(char* filename, ulong n){
632 std::ifstream str(filename, ios::binary);
635 cerr << "error reading file: " << filename << endl;
643 ulong buf_len; //1048576; //10MB
647 if (BUFFER==0) buf_len = n;
648 else buf_len = BUFFER;
650 buffer = new char[buf_len];
651 buf_pos = (n-2) / buf_len;
652 str.seekg(buf_pos*buf_len);
653 str.read(buffer, buf_len);
654 str.clear(std::ios_base::goodbit);
657 ulong key = pos->appendText(n);
662 c=buffer[(n-2)%buf_len];
663 insert(c,i); // insert second last character, corresponds to suffix of length 1
665 sample = (n-1 % sampleInterval == 0);
666 SampledSAPositionsIndicator->insertBit(sample, i);
667 if (sample) sampledSATree->insertSample(n-1, key, SampledSAPositionsIndicator->rank(true,i));
669 if ((n-2)%buf_len == 0) {
672 str.seekg(buf_pos*buf_len);
673 str.read(buffer, buf_len);
675 for (ulong t=n-2; t > 0; t--) {
677 i= 1+getNumberOfSymbolsSmallerThan(c) + rank(c,i);
679 c=buffer[(t-1)%buf_len];
683 if (((t-1)%buf_len == 0) && ((t-1)!=0)) {
686 cerr << "buf_pos too small" << endl;
691 str.seekg(buf_pos*buf_len);
692 str.read(buffer, buf_len);
696 sample = ((t) % sampleInterval == 0);
697 SampledSAPositionsIndicator->insertBit(sample, i);
699 if (sample) sampledSATree->insertSample(t, key, SampledSAPositionsIndicator->rank(true,i));
703 i= 1+ getNumberOfSymbolsSmallerThan(c) + rank(c,i);
707 SampledSAPositionsIndicator->insertBit(true, i);
708 sampledSATree->insertSample(1, key, SampledSAPositionsIndicator->rank(true,i));
717 uchar* DynFMI::retrieveText(ulong key){
718 cout << "key: " << key << endl;
720 // TODO access two times, too much
721 ulong i=handle->getPos(key); //=bwtEnd;
726 ulong n=pos->getTextSize(i);
728 text = new uchar[n]; // last byte 0 for cout
733 for (ulong t=n-2; t < n; t--) {
735 text[t]=(c==0?'#':c);
737 i= 0+ getNumberOfSymbolsSmallerThan(c) + rank(c,i);
741 for (ulong i=0; i< n-1 ; i++) if (text[i]==0) text[i]='-';
749 bool DynFMI::deleteText(ulong key){
750 // TODO access two times, can do better
751 ulong i=handle->getPos(key); //=bwtEnd;
752 if (i == 0) return false;
754 ulong n=pos->getTextSize(i);
755 ulong *text = new ulong[n];
758 for (ulong t=n-1; t <n; t--) {
761 i= 0+ getNumberOfSymbolsSmallerThan(c) + rank(c,i); // TODO improve by lastrank!
765 std::sort(text, text + n);
768 for (i=n-1; i<n; i--) {
769 this->deleteSymbol(text[i]);
771 SampledSAPositionsIndicator->deleteBit(text[i]);
776 handle->deleteKey(key);
782 ulong DynFMI::getNumberOfSymbolsSmallerThan(uchar c){
786 if (binaryTree_isRightChild(j))
787 r += C[binaryTree_left(binaryTree_parent(j))];
789 j=binaryTree_parent(j);
797 void DynFMI::printDynFMIContent(std::ostream& stream){
799 for (ulong i=1; i<=getSize(); i++)