1 /***************************************************************************
2 * Copyright (C) 2006 by Wolfgang Gerlach *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU General Public License as published by *
7 * the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU General Public License for more details. *
15 * You should have received a copy of the GNU General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
19 ***************************************************************************/
37 using std::priority_queue;
42 // -------- DynFMI --------
45 deleteDynFMINodes(root);
47 delete[] leaves; //free(leaves) // ???;
49 delete SampledSAPositionsIndicator;
58 void DynFMI::deleteDynFMINodes(WaveletNode *n){
59 if (n->right) deleteDynFMINodes(n->right);
60 if (n->left) deleteDynFMINodes(n->left);
64 DynFMI::DynFMI(uchar *text, ulong x, ulong n, bool del){
65 initEmptyDynFMI(text, x, n);
66 if (del) delete [] text;
69 DynFMI::DynFMI(char *readfile){
71 std::ifstream file(readfile);
74 cerr << "error reading file: " << readfile << endl;
78 file.seekg(0, ios::end);
79 ulong n = file.tellg();
80 file.seekg(0, ios::beg);
82 uchar *text = new uchar[n];
95 initEmptyDynFMI(text, 1, n);
102 void DynFMI::iterateReset(){
104 recursiveIterateResetWaveletNode(root);
107 void DynFMI::recursiveIterateResetWaveletNode(WaveletNode *w){
108 w->bittree->iterateReset();
110 if (w->left) recursiveIterateResetWaveletNode(w->left);
111 if (w->right) recursiveIterateResetWaveletNode(w->right);
115 bool DynFMI::iterateNext(){
117 return !(iterate > getSize());
120 uchar DynFMI::iterateGetSymbol(){
123 WaveletNode *walk = root;
128 bit = walk->bittree->iterateGetBit(); // TODO improve
129 i=walk->bittree->iterateGetRank(bit);
131 walk->bittree->iterateNext();
135 if (walk->right == 0) return walk->c1;
138 if (walk->left == 0) return walk->c0;
148 uchar* DynFMI::getBWT(){
149 ulong n = root->bittree->getPositions();
151 uchar *text = new uchar[n];
155 //for (ulong i=1; i <= root->bittree->getPositions(); i++)
156 // text[i-1]=(*this)[i];
163 text[i] = iterateGetSymbol();
164 data = iterateNext();
172 std::ostream& DynFMI::getBWTStream(std::ostream& stream){
173 ulong n = root->bittree->getPositions();
175 //uchar *text = new uchar[n];
179 //for (ulong i=1; i <= root->bittree->getPositions(); i++)
180 // text[i-1]=(*this)[i];
187 //text[i] = iterateGetSymbol();
188 stream << iterateGetSymbol();
189 data = iterateNext();
197 //void DynFMI::printDynFMIContent(ostream& stream){
199 // for (ulong i=1; i<=getSize(); i++)
207 void DynFMI::deleteLeaves(WaveletNode *node){
213 deleteLeaves(node->left);
218 deleteLeaves(node->right);
222 // is a leaf, delete it!
224 if (node==node->parent->left) node->parent->left=0;
225 else node->parent->right=0;
231 void DynFMI::makeCodes(ulong code, int bits, WaveletNode *node){
233 if (node == node->left) {
234 cout << "makeCodes: autsch" << endl;
240 makeCodes(code | (0 << bits), bits+1, node->left);
241 makeCodes(code | (1 << bits), bits+1, node->right);
243 codes[node->c0] = code;
244 codelengths[node->c0] = bits+1;
248 void DynFMI::appendBVTrees(WaveletNode *node){
249 node->bittree = new BVTree();
251 if (node->left) appendBVTrees(node->left);
252 if (node->right) appendBVTrees(node->right);
255 void DynFMI::initEmptyDynFMI(uchar *text, ulong x, ulong n){
256 // pointers to the leaves for select
257 leaves = (WaveletNode**) new WaveletNode*[256];
258 for(int j=0; j<256; j++) leaves[j]=0;
262 for(i=0; i < n; i++) {
264 cerr << "Error: unsigned char 0 is reserved." << endl;
268 if (leaves[text[i]]==0) {
269 leaves[text[i]] = new WaveletNode(text[i]);
271 leaves[text[i]]->weight++;
274 // separation symbol:
275 leaves[0] = new WaveletNode((uchar)0);
279 priority_queue< WaveletNode*, vector<WaveletNode*>, std::greater<WaveletNode*> > q;
282 for(int j=255; j>=0; j--){
284 q.push( (leaves[j]) );
290 // creates huffman shape:
291 while (q.size() > 1) {
293 WaveletNode *left = q.top();
296 WaveletNode *right = q.top();
299 q.push( new WaveletNode(left, right) );
307 makeCodes(0,0, root); // writes codes and codelengths
310 // merge leaves (one leaf represent two characters!)
311 for(int j=0; j<256; j++){
315 if (leaves[j]->parent->left==leaves[j]) {
316 leaves[j]->parent->c0=j;
318 leaves[j]->parent->c1=j;
320 leaves[j]=leaves[j]->parent; // merge
327 // array C needed for backwards search
328 for(int j=0; j<256+256; j++) C[j] = 0;
331 this->SampledSAPositionsIndicator = new BVTree();
332 this->sampledSATree = new SampledSATree();
335 this->pos = new Pos(sampleInterval);
336 this->handle = new Handle();
338 pos->handle=this->handle;
339 handle->pos=this->pos;
343 void DynFMI::append(uchar *text, ulong length){
345 ulong start = root->bittree->getPositions();
346 for (i = start; i<start+length; i++)
347 insert(text[i-start], i+1);
351 void DynFMI::append(uchar c){
352 insert(c, root->bittree->getPositions()+1);
355 void DynFMI::insert(uchar c, ulong i){
357 if (codelengths[c]==0) {
358 cerr << "error: Symbol \"" << c << "\" (" << (int)c << ") is not in the code table!" << endl;;
364 ulong code = codes[c];
367 WaveletNode *walk = root;
371 bit = ((code & (1u << level)) != 0);
373 walk->bittree->insertBit(bit,i); // TODO improve
374 i=walk->bittree->rank(bit, i);
388 j=binaryTree_parent(j);
394 void DynFMI::deleteSymbol(ulong i){
395 WaveletNode *walk = root;
400 // original slow version:
401 //bit = (*walk->bittree)[i];
403 //i=walk->bittree->rank(bit, i);
404 //walk->bittree->deleteBit(old_i);
407 walk->bittree->deleteBit(i);
408 bit=walk->bittree->getLastBitDeleted();
409 i=walk->bittree->getLastRank();
412 if (walk->right == 0) {
418 if (walk->left == 0) {
431 j=binaryTree_parent(j);
439 uchar DynFMI::operator[](ulong i){
440 WaveletNode *walk = root;
445 bit = (*walk->bittree)[i]; //TODO improve by reducing
446 i=walk->bittree->rank(bit, i);
449 if (walk->right == 0) return walk->c1;
452 if (walk->left == 0) return walk->c0;
461 ulong DynFMI::rank(uchar c, ulong i){
465 ulong code = codes[c];
469 WaveletNode *walk = root;
473 bit = ((code & (1u << level)) != 0);
475 i=walk->bittree->rank(bit, i);
477 if (walk->right == 0) return i;
480 if (walk->left == 0) return i;
487 cerr << "error: DynFMI::rank: left while loop" << endl;
492 ulong DynFMI::select(uchar c, ulong i){
494 WaveletNode *walk = leaves[c];
496 bool bit = (walk->c1==c);
498 while (walk->parent) {
499 i=walk->bittree->select(bit, i);
501 bit = (walk == walk->parent->right);
505 i=walk->bittree->select(bit, i);
511 ulong DynFMI::count(uchar *pattern){
513 ulong i = strlen((char*)pattern);
516 ulong ep = getSize();
518 while ((sp <= ep) && (i >=1)) {
520 sp=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, sp-1) + 1;
521 ep=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,ep);
529 ulong* DynFMI::locate(uchar *pattern){
531 ulong i = strlen((char*)pattern);
534 ulong ep = getSize();
536 while ((sp <= ep) && (i >=1)) {
538 sp=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, sp-1) + 1;
539 ep=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,ep);
544 ulong numberOfMatches = ep-sp +1;
545 ulong *matches = new ulong[2*numberOfMatches + 1];
546 matches[0] = numberOfMatches;
550 cout << "----------------------" << endl;
551 for (ulong j=sp; j <= ep; j++) {
555 while (!(*SampledSAPositionsIndicator)[i]) {
558 i= (ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, i);
563 sampledSATree->getSample(SampledSAPositionsIndicator->rank(true, i));
565 //cout << "found: (" << sampledSATree->handle << ", " << sampledSATree->sampledSAValue + diff << ")"<< endl;
566 matches[k++] = sampledSATree->handle;
567 matches[k++] = sampledSATree->sampledSAValue + diff;
574 // size must include endmarker!
575 ulong DynFMI::addText(uchar const * str, ulong n){
582 ulong key = pos->appendText(n);
584 // Adding an empty text (end-marker)
588 //if (del) delete [] str;
592 insert(str[n-2],i); // insert second last character, corresponds to suffix of length 1
594 sample = (n-1 % sampleInterval == 0);
595 SampledSAPositionsIndicator->insertBit(sample, i);
596 if (sample) sampledSATree->insertSample(n-1, key, SampledSAPositionsIndicator->rank(true,i));
598 for (ulong t=n-2; t > 0; t--) {
599 i= 1+getNumberOfSymbolsSmallerThan(str[t]) + rank(str[t],i);
602 sample = ((t) % sampleInterval == 0);
603 SampledSAPositionsIndicator->insertBit(sample, i);
605 if (sample) sampledSATree->insertSample(t, key, SampledSAPositionsIndicator->rank(true,i));
610 i= 1+ getNumberOfSymbolsSmallerThan(str[0]) + rank(str[0],i);
613 SampledSAPositionsIndicator->insertBit(true, i);
614 sampledSATree->insertSample(1, key, SampledSAPositionsIndicator->rank(true,i));
616 // if (del) delete [] str;
622 ulong DynFMI::addTextFromFile(char* filename, ulong n){
630 std::ifstream str(filename, ios::binary);
633 cerr << "error reading file: " << filename << endl;
641 ulong buf_len; //1048576; //10MB
645 if (BUFFER==0) buf_len = n;
646 else buf_len = BUFFER;
648 buffer = new char[buf_len];
649 buf_pos = (n-2) / buf_len;
650 str.seekg(buf_pos*buf_len);
651 str.read(buffer, buf_len);
652 str.clear(std::ios_base::goodbit);
655 ulong key = pos->appendText(n);
660 c=buffer[(n-2)%buf_len];
661 insert(c,i); // insert second last character, corresponds to suffix of length 1
663 sample = (n-1 % sampleInterval == 0);
664 SampledSAPositionsIndicator->insertBit(sample, i);
665 if (sample) sampledSATree->insertSample(n-1, key, SampledSAPositionsIndicator->rank(true,i));
667 if ((n-2)%buf_len == 0) {
670 str.seekg(buf_pos*buf_len);
671 str.read(buffer, buf_len);
673 for (ulong t=n-2; t > 0; t--) {
675 i= 1+getNumberOfSymbolsSmallerThan(c) + rank(c,i);
677 c=buffer[(t-1)%buf_len];
681 if (((t-1)%buf_len == 0) && ((t-1)!=0)) {
684 cerr << "buf_pos too small" << endl;
689 str.seekg(buf_pos*buf_len);
690 str.read(buffer, buf_len);
694 sample = ((t) % sampleInterval == 0);
695 SampledSAPositionsIndicator->insertBit(sample, i);
697 if (sample) sampledSATree->insertSample(t, key, SampledSAPositionsIndicator->rank(true,i));
701 i= 1+ getNumberOfSymbolsSmallerThan(c) + rank(c,i);
705 SampledSAPositionsIndicator->insertBit(true, i);
706 sampledSATree->insertSample(1, key, SampledSAPositionsIndicator->rank(true,i));
715 uchar* DynFMI::retrieveText(ulong key){
716 cout << "key: " << key << endl;
718 // TODO access two times, too much
719 ulong i=handle->getPos(key); //=bwtEnd;
724 ulong n=pos->getTextSize(i);
726 text = new uchar[n]; // last byte 0 for cout
731 for (ulong t=n-2; t < n; t--) {
733 text[t]=(c==0?'#':c);
735 i= 0+ getNumberOfSymbolsSmallerThan(c) + rank(c,i);
739 for (ulong i=0; i< n-1 ; i++) if (text[i]==0) text[i]='-';
747 bool DynFMI::deleteText(ulong key){
748 // TODO access two times, can do better
749 ulong i=handle->getPos(key); //=bwtEnd;
750 if (i == 0) return false;
752 ulong n=pos->getTextSize(i);
753 ulong *text = new ulong[n];
756 for (ulong t=n-1; t <n; t--) {
759 i= 0+ getNumberOfSymbolsSmallerThan(c) + rank(c,i); // TODO improve by lastrank!
763 std::sort(text, text + n);
766 for (i=n-1; i<n; i--) {
767 this->deleteSymbol(text[i]);
769 SampledSAPositionsIndicator->deleteBit(text[i]);
774 handle->deleteKey(key);
780 ulong DynFMI::getNumberOfSymbolsSmallerThan(uchar c){
784 if (binaryTree_isRightChild(j))
785 r += C[binaryTree_left(binaryTree_parent(j))];
787 j=binaryTree_parent(j);
795 void DynFMI::printDynFMIContent(std::ostream& stream){
797 for (ulong i=1; i<=getSize(); i++)