Allow arbitrary SWCSA index size
authorKim Nguyễn <kn@lri.fr>
Wed, 29 Feb 2012 00:33:25 +0000 (01:33 +0100)
committerKim Nguyễn <kn@lri.fr>
Wed, 29 Feb 2012 00:33:25 +0000 (01:33 +0100)
swcsa/buildFacade.c
swcsa/utils/hash.c

index b581059..4cd28fe 100755 (executable)
@@ -4,26 +4,26 @@
 
 /** Building the index */
 
-    /* Creates index from text[0..length-1]. Note that the index is an 
-      opaque data type. Any build option must be passed in string 
-      build_options, whose syntax depends on the index. The index must 
-      always work with some default parameters if build_options is NULL. 
+    /* Creates index from text[0..length-1]. Note that the index is an
+      opaque data type. Any build option must be passed in string
+      build_options, whose syntax depends on the index. The index must
+      always work with some default parameters if build_options is NULL.
       The returned index is ready to be queried. */
 int build_index (uchar *text, ulong length, char *build_options, void **index) {
        int returnvalue;
 
      printf("\n parameters: \"%s\"\n",build_options); fflush(stderr);
-        
+
     returnvalue = build_WCSA (text, length, build_options, index);
-    if (!returnvalue) 
+
+    if (!returnvalue)
        returnvalue = build_iCSA (build_options,*index);
-                  
-    return returnvalue;  
+
+    return returnvalue;
 }
 
 
-/**  Saves index on disk by using single or multiple files, having 
+/**  Saves index on disk by using single or multiple files, having
        proper extensions. */
 int save_index (void *index, char *filename) {
 
@@ -35,7 +35,7 @@ int save_index (void *index, char *filename) {
        int file;
        char c;
 
-       printf("\n Saving structures to disk: %s.*",filename);                  
+       printf("\n Saving structures to disk: %s.*",filename);
        outfilename = (char *)malloc(sizeof(char)*(strlen(basename)+10));
 
        /**File with some constants (bSize and tohSize); */
@@ -47,11 +47,11 @@ int save_index (void *index, char *filename) {
                if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
                        printf("Cannot open file %s\n", outfilename);
                        exit(0);
-               }       
-               write(file, &(wcsa->sourceTextSize), sizeof(uint));                     
-               write(file, &(wcsa->seSize), sizeof(uint));                     
-               close(file);            
-       }       
+               }
+               write(file, &(wcsa->sourceTextSize), sizeof(uint));
+               write(file, &(wcsa->seSize), sizeof(uint));
+               close(file);
+       }
 
        /** The Words in the vocabulary of words  (sorted alphabetically)*/
        {       strcpy(outfilename, basename);
@@ -68,26 +68,26 @@ int save_index (void *index, char *filename) {
                write(file, &n, sizeof(uint));
                write(file, &elemSize, sizeof(uint));
                write(file, &(wcsa->wordsData.wordsZoneMem.size), sizeof(uint));
-               
+
                //the number of canonical words
                write(file, (char *)wcsa->wordsData.wordsZoneMem.zone, wcsa->wordsData.wordsZoneMem.size * sizeof(byte));
                write(file, (char *)wcsa->wordsData.words, ((((n+1)* (elemSize))+W-1) /W) * (sizeof(uint)) );
 
                close(file);
-       }       
-                       
+       }
+
        free(outfilename);
 
        if (wcsa->myicsa) {
                /******** saves index on integers (bottom) ******/
                //Storing the CSA
                //storeStructsCSA(wcsa->myicsa,basename);
-               saveIntIndex((void *) wcsa->myicsa, basename);          
-       }       
-       
+               saveIntIndex((void *) wcsa->myicsa, basename);
+       }
+
        if (wcsa->se) {
                saveSEfile(basename,wcsa->se, wcsa->seSize+1);
-               //free(wcsa->se);               
+               //free(wcsa->se);
        }
 
        return 0;
@@ -95,7 +95,7 @@ int save_index (void *index, char *filename) {
 
 
 
-    /**  Loads index from one or more file(s) named filename, possibly 
+    /**  Loads index from one or more file(s) named filename, possibly
       adding the proper extensions. */
 int load_index(char *filename, void **index){
        twcsa *wcsa;
@@ -109,9 +109,9 @@ int free_index(void *index){
        twcsa *wcsa=(twcsa *) index;
        ulong size;
        index_size(index,&size);
-       printf("\n[destroying index] ...Freed %lu bytes... RAM", size); 
-       
-       
+       printf("\n[destroying index] ...Freed %lu bytes... RAM", size);
+
+
        //frees the array SE.
        if (wcsa->se)
                free (wcsa->se);
@@ -125,8 +125,8 @@ int free_index(void *index){
        //the words.
        free (wcsa->wordsData.wordsZoneMem.zone);
        free (wcsa->wordsData.words); /** huge!! */
-       
-       //the pointer to wcsa.          
+
+       //the pointer to wcsa.
        free(wcsa);
        return 0;
 }
@@ -137,46 +137,46 @@ int index_size(void *index, ulong *size) {
        twcsa *wcsa=(twcsa *)index;
        uint n= wcsa->n;
        *size=0;
-       *size += sizeof(twcsa); 
+       *size += sizeof(twcsa);
 
        totaltmp=0;  //words
        totaltmp += ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));  //the pointers
        totaltmp += wcsa->wordsData.wordsZoneMem.size * sizeof(byte); //the characters of the words.
        *size += totaltmp;
-       
+
        if (wcsa->myicsa) {
                uint nbytes;
                int err = sizeIntIndex((void *) wcsa->myicsa, &nbytes);
                *size += nbytes;
-               //*size += CSA_size(wcsa->myicsa);              
+               //*size += CSA_size(wcsa->myicsa);
        }
 
-       return 0;       
+       return 0;
 }
 
 
 /** Querying the index =============================================================*/
-       
-       /* Writes in numocc the number of occurrences of the substring 
+
+       /* Writes in numocc the number of occurrences of the substring
           pattern[0..length-1] found in the text indexed by index. */
 int count (void *index, uchar *pattern, ulong length, ulong *numocc){
        uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
        uint integerPatternSize;
-       ulong l,r;      
+       ulong l,r;
 
        twcsa *wcsa=(twcsa *) index;
-       parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);     
+       parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);
        if (!integerPatternSize) {*numocc=0; return 0;} //not found
-       
+
        //*numocc = countCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &l, &r);
        int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, numocc,  &l, &r);
        return 0;
 }
 
-       /* Writes in numocc the number of occurrences of the substring 
+       /* Writes in numocc the number of occurrences of the substring
          pattern[0..length-1] in the text indexed by index. It also allocates
-         occ (which must be freed by the caller) and writes the locations of 
-         the numocc occurrences in occ, in arbitrary order.  */          
+         occ (which must be freed by the caller) and writes the locations of
+         the numocc occurrences in occ, in arbitrary order.  */
 int locate(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc){
        return 99;
 }
@@ -196,31 +196,31 @@ int length (void *index, ulong *length) {
 
 
 /** ***********************************************************************************
-  * Accessing the indexed text  
+  * Accessing the indexed text
   * ***********************************************************************************/
 
 
-       /**  Allocates snippet (which must be freed by the caller) and writes 
-         the substring text[from..to] into it. Returns in snippet_length the 
-         length of the text snippet actually extracted (that could be less 
+       /**  Allocates snippet (which must be freed by the caller) and writes
+         the substring text[from..to] into it. Returns in snippet_length the
+         length of the text snippet actually extracted (that could be less
          than to-from+1 if to is larger than the text size).                      */
-int extract (void *index, ulong from, ulong to, uchar **snippet, ulong *snippet_length) {              
+int extract (void *index, ulong from, ulong to, uchar **snippet, ulong *snippet_length) {
     twcsa *wcsa=(twcsa *) index;
-       return 99;    
+       return 99;
 }
 
-  /** Displays the text (snippet) surrounding any occurrence of the 
-    substring pattern[0..length-1] within the text indexed by index. 
-    The snippet must include numc characters before and after the 
-    pattern occurrence, totalizing length+2*numc characters, or less if 
-    the text boundaries are reached. Writes in numocc the number of 
-    occurrences, and allocates the arrays snippet_text and 
-    snippet_lengths (which must be freed by the caller). The first is a 
-    character array of numocc*(length+2*numc) characters, with a new 
-    snippet starting at every multiple of length+2*numc. The second 
+  /** Displays the text (snippet) surrounding any occurrence of the
+    substring pattern[0..length-1] within the text indexed by index.
+    The snippet must include numc characters before and after the
+    pattern occurrence, totalizing length+2*numc characters, or less if
+    the text boundaries are reached. Writes in numocc the number of
+    occurrences, and allocates the arrays snippet_text and
+    snippet_lengths (which must be freed by the caller). The first is a
+    character array of numocc*(length+2*numc) characters, with a new
+    snippet starting at every multiple of length+2*numc. The second
     gives the real length of each of the numocc snippets. */
 
-int display (void *index, uchar *pattern, ulong length, ulong numc, 
+int display (void *index, uchar *pattern, ulong length, ulong numc,
         ulong *numocc, uchar **snippet_text, ulong **snippet_lengths) {
        return 99;
 }
@@ -229,74 +229,74 @@ int display (void *index, uchar *pattern, ulong length, ulong numc,
 
 /** ***********************************************************************************
   * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord
-  * ***********************************************************************************/  
-       /* Writes in numocc the number of occurrences of the substring 
+  * ***********************************************************************************/
+       /* Writes in numocc the number of occurrences of the substring
          pattern[0..length-1] in the text indexed by index. It also allocates
-         occ (which must be freed by the caller) and writes the locations of 
+         occ (which must be freed by the caller) and writes the locations of
          the numocc occurrences in occ, in arbitrary order. These occurrences
          refer to the offsets in TOH where the caller could start a display
          operation. So locateWord implies synchronization using B.
          Moreover, positions occ[numocc.. 2*numocc-1] is set with the rank in SE of the
          words whose codes begin in TOH in the positions in occ[0... numocc-1]
          ** Parameter kbefore sets locateWord not to obtain the offset in TOH of the
-            searched word, but the offset in TOH of k-before words before.       
-       */        
-         
+            searched word, but the offset in TOH of k-before words before.
+       */
+
 int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore){
        uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
        uint integerPatternSize;
-       ulong occurrences,l,r;  
+       ulong occurrences,l,r;
        twcsa *wcsa=(twcsa *) index;
-       
-       parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);     
-       if (!integerPatternSize) {*numocc=0; return 0;} //not found 
+
+       parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);
+       if (!integerPatternSize) {*numocc=0; return 0;} //not found
 
        ulong *seOffsets;
-  
+
        //obtains the indexes in vector SE where the pattern appears.
        //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences);
        int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences);
-               
+
        *numocc = occurrences;
-       
+
        if (!occurrences) {(*occ)=NULL;return 0;}
 
        (*occ) = (ulong *)seOffsets;
        return 0;
-}  
-
-
-  /** Displays the text (snippet) surrounding any occurrence of the 
-    substring pattern[0..length-1] within the text indexed by index. 
-    The snippet must include numc characters before and after the 
-    pattern occurrence, totalizing length+2*numc characters, or less if 
-    the text boundaries are reached. Writes in numocc the number of 
-    occurrences, and allocates the arrays snippet_text and 
-    snippet_lengths (which must be freed by the caller). The first is a 
-    character array of numocc*(length+2*numc) characters, with a new 
-    snippet starting at every multiple of length+2*numc. The second 
+}
+
+
+  /** Displays the text (snippet) surrounding any occurrence of the
+    substring pattern[0..length-1] within the text indexed by index.
+    The snippet must include numc characters before and after the
+    pattern occurrence, totalizing length+2*numc characters, or less if
+    the text boundaries are reached. Writes in numocc the number of
+    occurrences, and allocates the arrays snippet_text and
+    snippet_lengths (which must be freed by the caller). The first is a
+    character array of numocc*(length+2*numc) characters, with a new
+    snippet starting at every multiple of length+2*numc. The second
     gives the real length of each of the numocc snippets. */
 
- int displayWords (void *index, uchar *pattern, ulong length, ulong numc, 
+ int displayWords (void *index, uchar *pattern, ulong length, ulong numc,
          ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore) {
-               
+
     /** actually extracts upto length + 2*numc chars, starting extraction kbefore
-     *  words before the occurrence **/        
+     *  words before the occurrence **/
 
        ulong *indexesInSE;
        ulong occurrences;
        uint bytesPerSnippet;
        byte *text_aux;
        twcsa *wcsa=(twcsa *) index;
-       
-       locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);    
-       (*numocc) = occurrences;   
+
+       locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);
+       (*numocc) = occurrences;
 
        if (!occurrences) {
                *snippet_text =NULL;
                *snippet_lengths =NULL;
                return 0;
-       }       
+       }
 
        bytesPerSnippet = length+2*numc;
 //     bytesPerSnippet = 2*numc;
@@ -307,90 +307,90 @@ int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *nu
 
  //    fprintf(stderr,"\n occs found = %7d for pattern %s",*numocc, pattern);
  //    fflush(stderr);
-       
-       text_aux=*snippet_text;         
+
+       text_aux=*snippet_text;
        {
                uint i, j, tmplen;
                uint ptr, maxptr;
                byte *src, *dst;
                uint snippetLen;
                uint posSEValue,indexSE;
-                       
+
                for (i=0;i<occurrences;i++) {
                                uint prevValid=0;
                                uint endSnippet =0;
-       
-                               /** decodes words from there */                 
-                               snippetLen=0;                                                   
+
+                               /** decodes words from there */
+                               snippetLen=0;
                                indexSE = indexesInSE[i];
-                               indexSE = (indexSE > kbefore) ? indexSE-kbefore : 0;                     
+                               indexSE = (indexSE > kbefore) ? indexSE-kbefore : 0;
 
-                               dst = text_aux;                                 
+                               dst = text_aux;
                                while ((!endSnippet) && (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
 
-                                       //posSEValue =displayCSA(wcsa->myicsa,indexSE); 
+                                       //posSEValue =displayCSA(wcsa->myicsa,indexSE);
                                        int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
-                                       
+
                                        {//obtains pointer to the ith word
                                                uint offtmp;
-                                               uint ith = posSEValue -1;  // !! 
+                                               uint ith = posSEValue -1;  // !!
                                                tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
                                                offtmp = bitread (wcsa->wordsData.words, ( ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
                                                tmplen -=offtmp;  //the lenght of the ith word.
 
                                                src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
-                                       }                                       
+                                       }
 
                                        if (_Valid[*src]) {
                                                if (prevValid){
-                                                        *dst++ =' ';   
+                                                        *dst++ =' ';
                                                         snippetLen++;
                                                         if  (snippetLen==bytesPerSnippet) break;  //end of snippet (ends in BLANK_SPACE)
                                                }
                                                prevValid =1;   //for the next iteration
                                        }
-                                       else prevValid=0;               
-                                                                                                                                                                                                       
+                                       else prevValid=0;
+
                                        indexSE++;
-                                                                       
+
                                        /* at the end ?? */
                                        if  ((tmplen+snippetLen)>=bytesPerSnippet) {
                                                tmplen =(bytesPerSnippet - snippetLen);
                                                endSnippet=1; //so while loop ends;
-                                       }       
-                                       
+                                       }
+
                                        for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
-                                       snippetLen +=tmplen;                            
+                                       snippetLen +=tmplen;
                                }//while
-       
+
                                text_aux += bytesPerSnippet;
-                               (*snippet_lengths)[i] = snippetLen;                                             
+                               (*snippet_lengths)[i] = snippetLen;
                        }       //for
-                                               
+
                        if (occurrences) free(indexesInSE);
                }
-               return 0;               
+               return 0;
 }
 
-/** simulates extration of text process, but do not actually returns anything at all 
+/** simulates extration of text process, but do not actually returns anything at all
    Extracts upto <=2K words from K=wordsbefore words before each occurrence of a pattern.
    Less than 2K words can be extracted if more than numc characters have been already obtained.
    Does nothing else... does not return the text */
 
 int  displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc) {
-       
+
        ulong *indexesInSE;
        ulong occurrences;
        byte *text_aux;
+
        twcsa *wcsa=(twcsa *) index;
-       
-       locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);    
+
+       locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);
+
        if (!occurrences) {
                return 0;
-       }       
-       
+       }
+
        ulong maxsnippetLen  = maxnumc;
        ulong extractedbytes = 0;
 
@@ -402,81 +402,81 @@ int  displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint
                byte *src, *dst;
                uint snippetLen;
                uint posSEValue,indexSE;
-               
+
                uint numWordsToExtract = 2 * wordsbefore;
                uint z;
                //printf("\n occurrences... = %lu",occurrences);
-                       
+
                for (i=0;i<occurrences;i++) {
                                uint prevValid=0;
                                uint endSnippet =0;
 
-                               /** decodes words from there */                 
-                               snippetLen=0;                                                   
+                               /** decodes words from there */
+                               snippetLen=0;
                                indexSE = indexesInSE[i];
-                               indexSE = (indexSE > wordsbefore) ? indexSE-wordsbefore : 0;                     
+                               indexSE = (indexSE > wordsbefore) ? indexSE-wordsbefore : 0;
 
-                               dst = text_aux; 
-                               z=0;                            
+                               dst = text_aux;
+                               z=0;
                                while ((z<numWordsToExtract) && (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
 
-                                       //posSEValue =displayCSA(wcsa->myicsa,indexSE); 
-                                       int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue); 
-                               
+                                       //posSEValue =displayCSA(wcsa->myicsa,indexSE);
+                                       int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
+
                                        {//obtains pointer to the ith word
                                                uint offtmp;
-                                               uint ith = posSEValue -1;  // !! 
+                                               uint ith = posSEValue -1;  // !!
                                                tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
                                                offtmp = bitread (wcsa->wordsData.words, ( ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
                                                tmplen -=offtmp;  //the lenght of the ith word.
 
                                                src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
-                                       }                                       
-       
+                                       }
+
                                        if (_Valid[*src]) {
                                                        if (prevValid){
-                                                                *dst++ =' ';   
+                                                                *dst++ =' ';
                                                                 snippetLen++;
                                                                 if  (snippetLen==maxsnippetLen) break;  //end of snippet (ends in BLANK_SPACE)
                                                                }
                                                        prevValid =1;   //for the next iteration
                                        }
-                                       else prevValid=0;               
-                                                                                                                                                                                                       
+                                       else prevValid=0;
+
                                        indexSE++;
-                                                                       
+
                                        /* at the end ?? */
                                        if  ((tmplen+snippetLen)>=maxsnippetLen) {
                                                        break;
-                                       }       
-                               
-                                       //fprintf(stderr,"\ntmplen = %d ",tmplen); fflush(stderr);                                                      
+                                       }
+
+                                       //fprintf(stderr,"\ntmplen = %d ",tmplen); fflush(stderr);
                                        for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
-                                       snippetLen +=tmplen;    
-                                       z++;                    
+                                       snippetLen +=tmplen;
+                                       z++;
                                }//while
 
                                extractedbytes += snippetLen;
-                                       
+
                        }       //for
-                                       
+
                        if (occurrences) free(indexesInSE);
                }
                if (text_aux) free (text_aux);
-               return extractedbytes; 
+               return extractedbytes;
 }
 
 
 
 /**  Allocates text (which must be freed by the caller) and recovers the
   the substring of text starting from the "fromword"-th word up to the
-  "toWord"-th words. Returns in text the text, and in "text_lenght" the 
-  length of the text  actually extracted. Text is allocated. 
+  "toWord"-th words. Returns in text the text, and in "text_lenght" the
+  length of the text  actually extracted. Text is allocated.
   Actually extracts SE[fromWord .. toWord) ... not the last word.    */
 
-int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text, 
+int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text,
         ulong *text_length){
-               
+
        twcsa *wcsa=(twcsa *) index;
     uint initTextLen=10000;
     uint avgWordLen =7;
@@ -488,31 +488,31 @@ int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text,
 
        uint buffBytes = 1000;
        uint leng=0; //curr pos in buffer that was occupied.
-       
+
        if (toWord > wcsa->seSize) toWord = wcsa->seSize;
        if (fromWord >= wcsa->seSize) fromWord = wcsa->seSize-1;
        if (buffBytes < ( (toWord-fromWord)* avgWordLen)) buffBytes = ((toWord-fromWord)* avgWordLen);
-               
+
        buff = (uchar *) malloc (buffBytes * sizeof(char));
        if (!buff) return 1; //out of memory.
        dst = buff;
-       
+
        register uint indexSE=fromWord;
        uint posSEValue=0;
        register uint ith;
-               
+
        while  ( (indexSE < toWord) ){ /** extracting words (if not at the end) */
 
-               int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);  
-               
+               int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
+
                {//obtains pointer to the ith word
                        uint offtmp;
-                       ith= posSEValue -1;  // !! 
+                       ith= posSEValue -1;  // !!
                        tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
                        offtmp = bitread (wcsa->wordsData.words, (ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
                        tmplen -=offtmp;  //the lenght of the ith word.
                        src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
-               }                                       
+               }
 
                if ( buffBytes < (leng + tmplen+1) ) {
                        buffBytes *=2;
@@ -523,15 +523,15 @@ int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text,
 
                if (_Valid[*src]) {
                        if (prevValid){
-                               *dst++ =' ';                                    
+                               *dst++ =' ';
                                leng  += 1;
                        }
                        prevValid =1;   //for the next iteration
                }
-               else prevValid=0;               
-                                                                                                                                                                               
+               else prevValid=0;
+
                indexSE++;
-                                                                                       
+
                for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
                leng +=tmplen;
        }//while
@@ -549,12 +549,12 @@ int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text,
     ***********************************************************************************/
 
        /**------------------------------------------------------------------
-       Compares two slots (alphanumericaly). For qsort of canonical words 
-       ------------------------------------------------------------------ */                      
+       Compares two slots (alphanumericaly). For qsort of canonical words
+       ------------------------------------------------------------------ */
        int qSortWordsCompareAlpha(const void *arg1, const void *arg2) {
                tposInHT *a1 = (tposInHT *) arg1;
                tposInHT *a2 = (tposInHT *) arg2;
-               return strcmp((char*)a1->word, (char *)a2->word);       
+               return strcmp((char*)a1->word, (char *)a2->word);
        }
 
 /**
@@ -562,15 +562,15 @@ int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text,
   */
 
 int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
-       
+
        unsigned long zeroNode;  //number of different canonical words.
-       
-       t_hash hash;            // the hash table to store both variants and canonical words.                                   
-       tposInHT *posInHT;      // structure for canonicals and variants+huffmans 
 
-       uint sourceTextSize;  
-       uint seSize=0;  //it's size == "numberOfValidWords". 
+       t_hash hash;            // the hash table to store both variants and canonical words.
+       tposInHT *posInHT;      // structure for canonicals and variants+huffmans
+
+       uint sourceTextSize;
+
+       uint seSize=0;  //it's size == "numberOfValidWords".
        uint *SE;       //Integers vector. (represents the rank of the valid words in the source text).
 
        uint totallenWords=0; //The numberOfBytes that occupy canonical words (their ascii version) in memory
@@ -578,16 +578,16 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
 
        ulong bytesFile,bytesFileReal;
        long sizeNValue;
+       unsigned long size_posInHT;
+       /* used during first pass */
 
-       /* used during first pass */    
-                       
        ulong addrInTH;
-       
-       byte* inputBuffer = text;       
+
+       byte* inputBuffer = text;
        bytesFileReal= bytesFile = length;
 
        sourceTextSize=length;
-       
+
        /** Initializes WCSA structure*/
        twcsa *wcsa;
        wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
@@ -597,34 +597,35 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
        //Stimation (Using Heap's law) of the number of different "meaningful" words.
        //sizeNValue=N_value;
        if(bytesFile<5000000) bytesFile = 5000000;
-       sizeNValue = (unsigned long) floor(3.9* pow(bytesFile,0.60) );
-       
+
+       sizeNValue = (unsigned long) floor(3.9* pow(bytesFile,0.70) );
 
        // Inicializes the arrays used to detect if a char is valid or not.
        StartValid();
        // Inicializes the arrays used translated a char into lowercase.
        StartToLow();
-               
-       
+
+
        // **********************************************************************************
-       //STARTING THE FIRST PASS. 
+       //STARTING THE FIRST PASS.
        // **********************************************************************************
        printf("\nSTARTING THE FIRST PASS...");
-       
-       posInHT = (tposInHT *) malloc(sizeof(tposInHT) * sizeNValue); 
+
+       posInHT = (tposInHT *) malloc(sizeof(tposInHT) * sizeNValue);
+       size_posInHT = sizeNValue;
        hash = initialize_hash (sizeNValue); //hash to cointain both the parsed words
-       
-       //-----------------------------------------------------------------     
+
+       //-----------------------------------------------------------------
        //1st pass (processing the file)
-       { 
+       {
             byte *pbeg,*pend,*wordstart,*aWord;
             register ulong size;
             register uint i;
-            
+
             pbeg = inputBuffer;
             pend = inputBuffer+bytesFileReal;
-            
-            while (pbeg <pend) 
+
+            while (pbeg <pend)
             {
                 if (*pbeg == 0)
                 {
@@ -632,15 +633,15 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
                     exit(1);
                 }
 
-                //parsing either a word or separator.                  
+                //parsing either a word or separator.
                 size=0;
                 wordstart = pbeg;
                 if (_Valid[*pbeg]) {   //alphanumerical data
                     while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
-                }                  
+                }
                 else
                 {
-                    if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word                                 
+                    if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
                         while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
                     }
                     else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
@@ -656,10 +657,10 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
                                 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
                             }//else {  // a "separator word"
                         }//else ... not a unique BLANK AT THE END.
-                    }//else ... starting by a BLANK... 
+                    }//else ... starting by a BLANK...
                 }
-                
-                if (pbeg < pend && *pbeg == 0) 
+
+                if (pbeg < pend && *pbeg == 0)
                     pbeg ++;    // Skip the 0-bytes
 
                 if (size == 0)
@@ -669,43 +670,47 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
                 }
 
                 //The parsed word/separator is  is "wordstart", and its length is "size"...
-                aWord=wordstart;                               
-                
-                //Processement done for each word word                 
-                i = inHashTable(hash,aWord, size, &addrInTH );                         
+                aWord=wordstart;
+
+                //Processement done for each word word
+                i = inHashTable(hash,aWord, size, &addrInTH );
                 if (!i){
                     insertElement (hash,aWord, size, &addrInTH);
+                   if (zeroNode >= size_posInHT){
+                     size_posInHT *= 2;
+                     posInHT = (tposInHT*) realloc(posInHT, size_posInHT * sizeof(tposInHT));
+                   }
                     posInHT[zeroNode].slot=addrInTH;
                     posInHT[zeroNode].word=hash->hash[addrInTH].word;
                     hash->hash[addrInTH].posInVoc = zeroNode;
                     zeroNode++;
-                    totallenWords += size +1;                  // +1 due to the '\0' char...           
+                    totallenWords += size +1;                  // +1 due to the '\0' char...
                    //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size);
-                }                                                      
+                }
                 seSize ++;
-                
+
             }//while pbeg<pend
-            
+
             fprintf(stderr,"\n1st pass ends: TOTAL distinct words: %lu totalWords = %u",zeroNode,seSize);
-            
+
         }//1st pass ends
-        
+
 
        // **********************************************************************************
        // END OF 1ST PASS
        // **********************************************************************************
 
        // Sorting the words alphanumerically (over posInHT)
-       {       register unsigned long i,j;     
+       {       register unsigned long i,j;
                //sorting canonical words ...
-               qsort(posInHT, zeroNode, sizeof(tposInHT), qSortWordsCompareAlpha);             
-               
+               qsort(posInHT, zeroNode, sizeof(tposInHT), qSortWordsCompareAlpha);
+
                //setting in hash the new positions of the  words in the hash table
-               for (i=0;i<zeroNode;i++) {      
-                       hash->hash[posInHT[i].slot].posInVoc = i;       
+               for (i=0;i<zeroNode;i++) {
+                       hash->hash[posInHT[i].slot].posInVoc = i;
                }
-       }       
-       
+       }
+
        // INITIALIZING structures for the 2nd pass ......................................
        {
                SE  = (uint *) malloc ((seSize+1)*sizeof (uint));
@@ -718,7 +723,7 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
 
        printf("\nSTARTING THE SECOND PASS... ");
        //2nd pass (processing the file)
-       { 
+       {
             byte *pbeg,*pend,*wordstart,*aWord;
             register ulong size;
             register uint i;
@@ -727,22 +732,22 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
 
             pbeg = inputBuffer;
             pend = inputBuffer+bytesFileReal;
-                               
-            while (pbeg <pend) {  
+
+            while (pbeg <pend) {
                 if (*pbeg == 0)
                 {
                     fprintf(stderr, "buildFacade.c 2nd pass: assert failed, *pbeg == 0\n");
                     exit(1);
                 }
-                       
-                //parsing either a word or separator.                  
+
+                //parsing either a word or separator.
                 size=0;
                 wordstart = pbeg;
                 if (_Valid[*pbeg]) {   //alphanumerical data
                     while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
-                }                  
+                }
                 else {
-                    if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word                                 
+                    if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
                         while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
                     }
                     else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
@@ -758,10 +763,10 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
                                 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg]  && *pbeg != 0)) {size++;pbeg++;}
                             }//else {  // a "separator word"
                         }//else ... not a unique BLANK AT THE END.
-                    }//else ... starting by a BLANK... 
+                    }//else ... starting by a BLANK...
                 }
-                    
-                if (pbeg < pend && *pbeg == 0) 
+
+                if (pbeg < pend && *pbeg == 0)
                     pbeg ++;    // Skip the 0-bytes
 
                 if (size == 0)
@@ -771,57 +776,57 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
                 }
 
                 //The parsed word/separator is  is "wordstart", and its length is "size"...
-                aWord=wordstart;                                       
-                    
-                //Processement done for each word word                 
-                i = inHashTable(hash,aWord, size, &addrInTH );                         
-                    
+                aWord=wordstart;
+
+                //Processement done for each word word
+                i = inHashTable(hash,aWord, size, &addrInTH );
+
                 SE[countValidWords]=hash->hash[addrInTH].posInVoc+1;  // !!!!
-                countValidWords++;             
-                    
+                countValidWords++;
+
             }// while pbeg<pend
-               
+
             SE[countValidWords] = 0;
             fprintf(stderr,"\n2nd pass ends: TOTAL distinct words: %lu totalWords = %lu",zeroNode,countValidWords);
-                       
+
        }//2nd pass ends
-       
+
        // **********************************************************************************
        // END OF 2ND PASS
        // **********************************************************************************
-       
+
        //freeing the source text (it is no longer needed).
-       delete [] inputBuffer; //the text       
+       free(inputBuffer); //the text   
 
        /** Now Setting the data of the index **/
        wcsa->n = zeroNode;
        wcsa->sourceTextSize = sourceTextSize;
        wcsa->seSize = seSize;
-       
+
        // Creating the words of the vocabulary...
-       {  
+       {
        /** copying the words into WCSA. */
                uint *tmpOffsets = (uint *) malloc (sizeof(uint) * (zeroNode  +1) );  //1 extra uint (to point to the virtual "zeroNode+1" ^th word.
                uint tmpOffset =0;
-                
+
                byte *zoneMem,*src;
                uint i;
-                                                                
+
                //Moving data from posInHT to WCSA structure
-               //wcsa->wordsData = (twords *) malloc(sizeof(twords) * zeroNode); 
+               //wcsa->wordsData = (twords *) malloc(sizeof(twords) * zeroNode);
                wcsa->wordsData.wordsZoneMem.size = totallenWords - zeroNode; //without '\0' bytes (end-tag).
-               wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc ( wcsa->wordsData.wordsZoneMem.size * sizeof(byte) );  
-               zoneMem = wcsa->wordsData.wordsZoneMem.zone;                                    
+               wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc ( wcsa->wordsData.wordsZoneMem.size * sizeof(byte) );
+               zoneMem = wcsa->wordsData.wordsZoneMem.zone;
                for(i = 0; i < zeroNode; i++) {
                        src = posInHT[i].word;                           //copying the canonical word
                        //wcsa->wordsData.words[i].word = zoneMem;   //setting the pointer
                        tmpOffsets[i]=tmpOffset;  //offset in zoneMem
                        while (*src) {*zoneMem++ = *src++; tmpOffset++;}  //moving data until '\0'
                        //*zoneMem='\0'; zoneMem++;            //copies also the '\0'
-                       
+
                }
                tmpOffsets[zeroNode]=tmpOffset; //setting pointer to the "virtual" word {zeroNode+1}^{th}
-               
+
                //kbit encoding of the offsets
                uint elemSize = _bits(tmpOffset);
                wcsa->wordsData.elemSize = elemSize;
@@ -832,15 +837,15 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
                tmpOffset=0;
                for (i=0; i<=zeroNode; i++) {  //setting "zeroNode+1" offsets
                                bitwrite(wcsa->wordsData.words, tmpOffset, elemSize, tmpOffsets[i]);
-                               tmpOffset+=elemSize; 
+                               tmpOffset+=elemSize;
                }
-                               
+
                //////////// CHECKS IT WORKED. old !!!!
        //              { uint kk;
        //                      tmpOffset=0;
        //                      for (i=0; i<zeroNode; i++) {  //setting "zeroNode+1" offsets
        //                                      kk=bitread(wcsa->wordsData.words, i* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
-       //                                      tmpOffset+=elemSize; 
+       //                                      tmpOffset+=elemSize;
        //                                      if (kk != tmpOffsets[i]) {fprintf(stderr,"\n @@@@@@@@ DISTINTOS OFFSETS "); break;}
        //                                      else fprintf(stderr,"\n iguales, %d, %d  :: <<%s>>len=%d",kk,i, posInHT[i].word, strlen((char*)posInHT[i].word));
        //                      }
@@ -849,42 +854,42 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
        //              { uint len1, len, tmplen, len2;
        //                      uint i,p;
        //                      byte *wcsaWord, *src;
-       //                      
+       //
        //                      for (p=0;p<zeroNode;p++) {
        //                              {//preparing for strcompL
        //                                      len    = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p+1)), wcsa->wordsData.elemSize);
        //                                      tmplen = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p))  , wcsa->wordsData.elemSize);
-       //                      
+       //
        //                              //fprintf(stderr,"\n  :: off[%d]= %d  -  off [%d] = %d  ==> %d",p+1,len,p,tmplen,len-tmplen);
-       //                      
+       //
        //                              len2 =len-tmplen;
        //                                      wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
-       //                              }               
-       //                              
-       //                              src = posInHT[p].word;  
+       //                              }
+       //
+       //                              src = posInHT[p].word;
        //                              len1 = strlen((char *)src);
-       //                              
+       //
        //                              if (strcompL(src,wcsaWord,len1,len2) != 0) {
-       //                                      fprintf(stderr,"\n %6d DISTINTOS !! ===len1 %d,len %d===== <<",p,len1,len2);printWord(src,len1); 
+       //                                      fprintf(stderr,"\n %6d DISTINTOS !! ===len1 %d,len %d===== <<",p,len1,len2);printWord(src,len1);
        //                                      fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>");
        //                                      exit(0);
        //                              }
        //                              else {
-       //                                      fprintf(stderr,"\n %6d ======len1 %d,len2 %d===== <<",p,len1,len2);printWord(src,len1); 
+       //                                      fprintf(stderr,"\n %6d ======len1 %d,len2 %d===== <<",p,len1,len2);printWord(src,len1);
        //                                      fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>");
-       //                              }                                       
+       //                              }
        //                      }
-       //                              
+       //
        //              }
-       //      
-       
+       //
+
        /**-----------*/
        //frees memory from hash table and posInHT structures.
                free(tmpOffsets);
                destroy_hash(hash);
                free(posInHT);
        }
-       
+
        /** ******* creates the self-index on ints (bottom layer) ==> see build_icsa *********/
 /**
        #ifdef CSA_ON
@@ -895,10 +900,10 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
                myicsa = createIntegerCSA(&SE,seSize+1,build_options);
                wcsa->myicsa= myicsa;
                total = CSA_size(myicsa);
-               
+
                free(SE);  //SE is no longer needed, (it is indexed by the iCSA)
                printf("\n\t**** [iCSA built on %d words. Size = %ld bytes... RAM",seSize,total);
-       }       
+       }
        #endif
 */
 
@@ -906,8 +911,8 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
                wcsa->se = SE;
                wcsa->myicsa = NULL;
        //#endif
-       
-       printf("\n\t ** Building done! **\n");  
+
+       printf("\n\t ** Building done! **\n");
        printf("\n Process finished!\n");
 
        *index = wcsa;
@@ -924,25 +929,25 @@ int build_iCSA (char *build_options, void *index)
                uint total;
                fprintf(stderr,"\n **** CREATING CSA-bottom-layer *****");
                void *bottomIntIndex;
-               int err =  buildIntIndex(wcsa->se,wcsa->seSize+1, build_options,(void **)&bottomIntIndex);                              
+               int err =  buildIntIndex(wcsa->se,wcsa->seSize+1, build_options,(void **)&bottomIntIndex);
                wcsa->myicsa = bottomIntIndex;
 
                //total = CSA_size(wcsa->myicsa);
                err = sizeIntIndex((void *) wcsa->myicsa, &total);
 
                printf("\n\t**** [iCSA built on %d words. Size = %u bytes... RAM",wcsa->seSize,total);
-       }               
+       }
        return 0;
 }
 
 /** ********************************************************************
   * Loading from disk
-  **********************************************************************/ 
+  **********************************************************************/
 
-/**-----------------------------------------------------------------  
- * LoadWCSA.                                                       
- * Loads all the data structures of WCSA (included the icsa)     
- ----------------------------------------------------------------- */ 
+/**-----------------------------------------------------------------
+ * LoadWCSA.
+ * Loads all the data structures of WCSA (included the icsa)
+ ----------------------------------------------------------------- */
 
 twcsa *loadWCSA(char *filename) {
        twcsa *wcsa;
@@ -950,13 +955,13 @@ twcsa *loadWCSA(char *filename) {
        StartValid();
        // Inicializes the arrays used translated a char into lowercase.
        StartToLow();
-       
+
        wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
        wcsa->n=0;
 
        int err = loadIntIndex(filename, (void **)&wcsa->myicsa);
 
-       loadStructs(wcsa,filename);                                                                       
+       loadStructs(wcsa,filename);
 
        return wcsa;
 }
@@ -964,7 +969,7 @@ twcsa *loadWCSA(char *filename) {
 /** ------------------------------------------------------------------
  * LoadStructs.
  *     Reads files and loads all the data needed for searcherFacade
- ----------------------------------------------------------------- */ 
+ ----------------------------------------------------------------- */
  void loadStructs(twcsa *wcsa, char *basename) {
        uint i,j;
        char *filename;
@@ -972,28 +977,28 @@ twcsa *loadWCSA(char *filename) {
        uint sizeFile;
        char c;
        uint n;
-               
+
        filename = (char *)malloc(sizeof(char)*(strlen(basename)+10));
        fprintf(stderr,"Loading Index from file %s.*\n", basename);
-       
+
        //** SOME CONSTANTS: sourceTextSize
        {       strcpy(filename, basename);
                strcat(filename, ".");
                strcat(filename, CONSTANTS_FILE_EXT);
-               
+
                if( (file = open(filename, O_RDONLY)) < 0) {
                        printf("Cannot open file %s\n", filename);
                        exit(0);
                }
-                               
-               read(file, &(wcsa->sourceTextSize), sizeof(uint));      
-               read(file, &(wcsa->seSize), sizeof(uint));      
+
+               read(file, &(wcsa->sourceTextSize), sizeof(uint));
+               read(file, &(wcsa->seSize), sizeof(uint));
                close(file);
-       }               
-       
+       }
+
        /** File with the words from the vocabulary (sorted alphabetically) */
        {       byte *zoneMem;
-               
+
                strcpy(filename, basename);
                strcat(filename, ".");
                strcat(filename, VOCABULARY_WORDS_FILE_EXT);
@@ -1003,26 +1008,26 @@ twcsa *loadWCSA(char *filename) {
                        printf("Cannot open file %s\n", filename);
                        exit(0);
                }
-               
+
                //the number of canonical words
                read(file, &n, sizeof(uint));
                wcsa->n = n;
                read(file, &(wcsa->wordsData.elemSize), (sizeof(uint)));
                read(file, &(wcsa->wordsData.wordsZoneMem.size), (sizeof(uint)));
-               
-               //allocating the memory needed for all words and reading them   //(ascii) << no \0 chars are needed>>.  
-               wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc(wcsa->wordsData.wordsZoneMem.size * sizeof(byte));                  
+
+               //allocating the memory needed for all words and reading them   //(ascii) << no \0 chars are needed>>.
+               wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc(wcsa->wordsData.wordsZoneMem.size * sizeof(byte));
                read(file, (wcsa->wordsData.wordsZoneMem.zone),    wcsa->wordsData.wordsZoneMem.size * sizeof(byte) );
-       
+
                //reading the offsets of the words (kbitArray that points to offsets in zoneMem of words.
-               wcsa->wordsData.words = (uint *) malloc (((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * sizeof(uint));        
+               wcsa->wordsData.words = (uint *) malloc (((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * sizeof(uint));
                wcsa->wordsData.words[ ((((n+1)*(wcsa->wordsData.elemSize))+W-1) /W)   -1 ] =0000;
                read(file, (wcsa->wordsData.words),     ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)));
-               
+
 
                close(file);
-       }       
-       wcsa->se= NULL; 
+       }
+       wcsa->se= NULL;
        free(filename);
 }
 
@@ -1040,26 +1045,26 @@ twcsa *loadWCSA(char *filename) {
 
 /*------------------------------------------------------------------
  * Given a text pattern translates it into a list of integers (corresponding to the
- * canonical words associated to the valid words in the text pattern) 
+ * canonical words associated to the valid words in the text pattern)
  ------------------------------------------------------------------*/
 void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *integerPattern, uint *sizeIntegers) {
-       
+
        byte *pbeg,*pend,*wordstart,*aWord;
        register unsigned long size;
-       uint index =0; 
-                       
-       pbeg = textPattern; 
+       uint index =0;
+
+       pbeg = textPattern;
        pend = pbeg + patLen;
-                                                       
-       while (pbeg <pend) {  
-               //parsing either a word or separator.                   
+
+       while (pbeg <pend) {
+               //parsing either a word or separator.
                size=0;
                wordstart = pbeg;
                if (_Valid[*pbeg]) {   //alphanumerical data
                        while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
-         }                 
+         }
                else {
-                       if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word                                      
+                       if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
                                while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
                        }
                        else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
@@ -1075,29 +1080,29 @@ void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *in
                                                while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
                                        }//else {  // a "separator word"
                                }//else ... not a unique BLANK AT THE END.
-                       }//else ... starting by a BLANK... 
+                       }//else ... starting by a BLANK...
                }
-                                               
+
                //The parsed word is "aWord", and its length is "size"...
-               aWord=wordstart;        
-               
+               aWord=wordstart;
+
                // Binary search on the canonical words (wordsData)
                {
                        uint len, tmplen;
                        uchar *wcsaWord;
-                       register uint min,max,p;        
+                       register uint min,max,p;
                        min = 0;
                        max = (wcsa->n) - 1;
                        while(min < max) {
                                p = (min+max)/2;
-                               
+
                                {//preparing for strcompL
                                        len    = bitread (wcsa->wordsData.words, (p+1)* wcsa->wordsData.elemSize , wcsa->wordsData.elemSize);
                                        tmplen = bitread (wcsa->wordsData.words, (p )* wcsa->wordsData.elemSize  , wcsa->wordsData.elemSize);
                                        len -=tmplen;
                                        wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
                                }
-                               
+
                                //if(strncmp((char*)aWord, (char*)wcsa->wordsData[p].word,size) > 0) min = p+1;
                                if(strcompL(aWord, wcsaWord, size, len) > 0) min = p+1;
                                else max = p;
@@ -1107,9 +1112,9 @@ void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *in
                                //                                      fprintf(stderr,"\n Patron = <<%s>>, curposWord= %d ==><<",aWord,p);
                                //                                      printWord(wcsaWord,len); fprintf(stderr,">> len =%d",len);
                                //                              }
-                                       
+
                        }
-                       
+
                        {//preparing for strcompL
                                len    = bitread (wcsa->wordsData.words, (min+1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
                                tmplen = bitread (wcsa->wordsData.words, ( min )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
@@ -1122,19 +1127,19 @@ void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *in
                                integerPattern[index++] = min +1 ;  //<--
                        }
                        else {*sizeIntegers = 0; return;}  // a valid word that does not appear in the source text.
-                       
-               }                                                                                                               
+
+               }
        }// end while
-       *sizeIntegers = index;  
-       
+       *sizeIntegers = index;
+
        //      //shows the parsed words:
        //      {uint i;
        //              printf("\n\n >>%s>> HA SIDO PARSEADO COMO:",textPattern);
        //              for (i=0; i<index;i++) {
        //                              printf("<<%s>>",wcsa->wordsData[integerPattern[i] -1].word);
        //              }
-       //              
-       //      }       
+       //
+       //      }
 }
 
 
@@ -1148,66 +1153,66 @@ int countTextOcurrences(twcsa *wcsa, byte *textPattern) {
        ulong left, right;
        uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
        uint integerPatternSize, min, max;
-       
+
        uint lenpat = strlen((char*)textPattern);
-       parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize); 
+       parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize);
        if (!integerPatternSize) return -1;
-               
-//     #ifdef DEBUG_ON         
+
+//     #ifdef DEBUG_ON
 //             uint i;
 //             printf("\n %d Integers to search for:",integerPatternSize );
 //             for (i=0;i<integerPatternSize;i++) {
-//                     printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word);                       
-//             }       
+//                     printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word);
+//             }
 //             printf("\n");
 //     #endif
 
        ulong numocc;
        int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, &numocc,  &left, &right);
-       return numocc;  
+       return numocc;
 
 }
 
 
        /** ------------------------------------------------------------------
-        * locateTextOcurrences: 
+        * locateTextOcurrences:
         * Returns the offsets of the source text where a word/phrase appears
         * Returns also the number of occurrences.
         *------------------------------------------------------------------ */
 uint *locateTextOcurrences(twcsa *wcsa, byte *textPattern, int *numberOccurrences) {
        uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
        uint integerPatternSize, min, max;
-       
+
        uint lenpat = strlen((char*)textPattern);
-       parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize);         
+       parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize);
        if (!integerPatternSize) {*numberOccurrences = -1; return NULL;}
-               
-//     #ifdef DEBUG_ON         
+
+//     #ifdef DEBUG_ON
 //             uint i;
 //             printf("\n %d Integers to search for:",integerPatternSize );
 //             for (i=0;i<integerPatternSize;i++) {
-//                     printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word);                       
-//             }       
+//                     printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word);
+//             }
 //             printf("\n");
 //     #endif
-               
+
        ulong occurrences, left, right;
        ulong *seOffsets;
        ulong *sourceOffsets;
-       
+
        //obtains the indexes in vector SE where the pattern appears.
        //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences);
        int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences);
-       
+
        //sourceOffsets = (uint *) malloc (sizeof(uint)*occurrences);
-       
+
        sourceOffsets=seOffsets;
        //obtains the offsets in the source text of the pattern (sourceOffsets)
        locateFacade(wcsa, (uint *)sourceOffsets, (uint *)seOffsets,occurrences);
 
-       #ifdef DEBUG_ON         
+       #ifdef DEBUG_ON
                fprintf(stderr,"\n*** %s appears in the source text in positions:\n\t",textPattern);
-               for (i=0;i<occurrences;i++) 
+               for (i=0;i<occurrences;i++)
                        fprintf(stderr,"[%u]",sourceOffsets[i]);
                fflush(stderr);
        #endif
@@ -1227,10 +1232,10 @@ int displayTextOcurrences(twcsa *wcsa, byte *textPattern, uint radixDisplay) {
 }
 
        /** ------------------------------------------------------------------
-        * Locate Facade: 
+        * Locate Facade:
         * For given sePositions, returns the sourceTextPositions
         * where the those valid-words in se[sePositions[i]] occurr.
-        *------------------------------------------------------------------*/ 
+        *------------------------------------------------------------------*/
 int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint number) {
        return 99;  //not implemented: function not available for this index
 }
@@ -1243,7 +1248,7 @@ int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint
        * Precondition: offsetIni >=0;
        ------------------------------------------------------------------*/
  int displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length, byte *dstptr) {
-       return 99;  //not implemented: function not available for this index            
+       return 99;  //not implemented: function not available for this index
 }
 
 
@@ -1255,15 +1260,15 @@ int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint
         ------------------------------------------------------------------*/
 byte *displayFacadeMalloc (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length) {
        byte *dstptr=NULL;         //not implemented: function not available
-       return dstptr;          
+       return dstptr;
 }
 
 
-       /** ------------------------------------------------------------------ 
+       /** ------------------------------------------------------------------
         * LOCATEALLandDISPLAY:
         * Displays the text around an occurrence of the searched word in the source text.
         * Assuming that $p$ is that position --> shows only chars in [p_radix-1,p_radix]
-        ------------------------------------------------------------------*/ 
+        ------------------------------------------------------------------*/
 int locateAllAndDisplay (twcsa *wcsa, uint *sePositions, uint number, int radix) {
 return 99;   //not implemented: function not available for this index
 
@@ -1273,13 +1278,13 @@ return 99;   //not implemented: function not available for this index
        /** ------------------------------------------------------------------
         * recovers the source text by calling display(0,fileSize);
         * ------------------------------------------------------------------ */
-void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {         
+void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {
 
        int start;int end;
        byte *cc;
        char *filename = (char *) malloc (strlen( basename)+ strlen(ext)+1);
        ulong length;
-       
+
        strcpy( filename,  basename);
        strcat( filename, ext);
        filename[strlen( basename)+ strlen(ext)]='\0';
@@ -1289,45 +1294,45 @@ void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextS
        unlink( filename);
        salida = fopen( filename,"w");
        start=0; end = sourceTextSize-1;
-       
+
        cc = (byte *) malloc (sourceTextSize* sizeof(uchar));
 
        {
                uint i, j;//, tmplen;
-               uint prevValid;
+               uint prevValid = 0;
                //uint ptr, maxptr;
                byte *src, *dst;
                uint leng =0;
                uint tmplen =0;
-               
+
                uint indexSE=0;
                uint posSEValue=0;
-       
+
                dst=cc;
                while  ( (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
 
                        int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
-                       
+
                        {//obtains pointer to the ith word
                                uint offtmp;
-                               uint ith = posSEValue -1;  // !! 
+                               uint ith = posSEValue -1;  // !!
                                tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
                                offtmp = bitread (wcsa->wordsData.words, ( ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
                                tmplen -=offtmp;  //the lenght of the ith word.
                                src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
-                       }                                       
+                       }
 
                        if (_Valid[*src]) {
                                if (prevValid){
-                                        *dst++ =' ';                                           
+                                        *dst++ =' ';
                                        leng +=1;
                                }
                                prevValid =1;   //for the next iteration
                        }
-                       else prevValid=0;               
-                                                                                                                                                                                       
+                       else prevValid=0;
+
                        indexSE++;
-                                                                                               
+
                        for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
                        leng +=tmplen;
                }//while
@@ -1344,13 +1349,13 @@ void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextS
 }
 
                //recovers the source text by calling extract Words.
-void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {         
+void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {
 
        int start;int end; int error;
        char *filename = (char *) malloc (strlen( basename)+ strlen(ext)+1);
        byte *cc;
        ulong length;
-       
+
        strcpy( filename,  basename);
        strcat( filename, ext);
        filename[strlen( basename)+ strlen(ext)]='\0';
@@ -1360,7 +1365,7 @@ void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextS
        unlink( filename);
        salida = fopen( filename,"w");
        start=0; end = wcsa->seSize;
-       
+
        error = extractWords((void *) wcsa, start, end, &cc, &length);
        if (error) {fprintf(stderr,"\n error during recoverSourceText2"); exit(0);}
 
@@ -1369,7 +1374,7 @@ void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextS
        fclose(salida);
 
        free(cc);
-       free(filename);         
+       free(filename);
 }
 
 /** *******************************************************************************
@@ -1384,20 +1389,20 @@ int printInfo(void *index) {
        uint n;
 
        twcsa *wcsa = (twcsa *) index;
-       
+
        unsigned long indexSize;
        uint intIndexSize, presentationSize;
        int err;
-       
+
        err = index_size(index, &indexSize);
        if (err!=0) return err;
-       err = sizeIntIndex(wcsa->myicsa, &intIndexSize);        
+       err = sizeIntIndex(wcsa->myicsa, &intIndexSize);
        if (err!=0) return err;
-       
+
        presentationSize = indexSize - intIndexSize;
-       
-               printf("\n ===================================================:");              
-               printf("\n Summary of Presentation layer:");            
+
+               printf("\n ===================================================:");
+               printf("\n Summary of Presentation layer:");
                printf("\n   Number of valid words (SEsize) = %u",wcsa->seSize);
                printf("\n   Number of different words = %ld",wcsa->n);
                printf("\n   WCSA structure = %lu bytes", sizeof(twcsa));
@@ -1407,14 +1412,14 @@ int printInfo(void *index) {
                uint totalwords = totalasciizone + totalpointers;
 
                printf("\n   Size Of words structure (%d bytes):",totalwords);
-               printf("\n      [     pointers = %d bytes || AsciiZone = %d bytes", totalpointers,      totalasciizone);        
-                               
-               printf("\n\n Total = **  %u bytes (in RAM) **",presentationSize);               
+               printf("\n      [     pointers = %d bytes || AsciiZone = %d bytes", totalpointers,      totalasciizone);
+
+               printf("\n\n Total = **  %u bytes (in RAM) **",presentationSize);
                //printf("\n\n @@ Summary of self-index on Integers:");
                err = printInfoIntIndex(wcsa->myicsa, " ");
                if (err!=0) return err;
-                       
-               printf("\n ===================================================:");              
+
+               printf("\n ===================================================:");
                printf("\n");
                return 0;
        }
@@ -1422,7 +1427,7 @@ int printInfo(void *index) {
 /**------------------------------------------------------------------
  * structsSize.
  *     Counts the memory amount needed by the Facade (Presentation Layer).
- * skipping the stop_words hash table 
+ * skipping the stop_words hash table
  ----------------------------------------------------------------- */
 uint structsSizeMem(twcsa *wcsa) {
        return 0;   //not implemented: function not available for this index.
@@ -1448,7 +1453,7 @@ int saveSEfile (char *basename, uint *v, uint n) {
                exit(0);
        }
 
-       write(file, v, sizeof(uint) * n ); 
+       write(file, v, sizeof(uint) * n );
        close(file);
 }
 
@@ -1471,56 +1476,56 @@ double getTime2 (void)
 
 
 
-/**------------------------------------------------------------------ 
+/**------------------------------------------------------------------
   *  MAIN PROGRAM.
   *------------------------------------------------------------------ */
 #ifdef FACADEWITHMAIN
        int main(int argc, char* argv[])
        {
-               
-                       
-               
+
+
+
                char *infile, *outbasename, *stopwordsfile;     // Name of in/out files
                byte *inputBuffer;
                ulong finsize;
-       
+
                int f_in;
                void *Index;
 
-               
+
                printf("\n*Word-based iCSA: A word-based CSA");
                printf("\n*CopyRight (c) 2008 [LBD & G.N.]\n\n");
-       
+
                // Reads input parameters from command line.
                if(argc < 3) {
                        printf("Use: %s <in file> <out basename> \n", argv[0]);
                        exit(0);
                }
-       
+
                // Reads params (input file, output basename, and stopwords file)
                infile = argv[1];
                outbasename = argv[2];
                stopwordsfile = argv[3];
-               
+
                finsize= fileSize(infile);
-               
+
                if (! finsize) {
                        printf( "\nFILE EMPTY OR FILE NOT FOUND %s !!\nSkipping processement ...\n",infile);
                        exit(0);
-               }       
-       
+               }
+
                // Opening the input text file.
                if( (f_in = open(infile, O_RDONLY)) < 0) {
                        printf("Cannot read file %s\n", infile);
                        exit(0);
-               }       
+               }
                inputBuffer = (byte *) malloc(finsize *sizeof(byte));// +1);
-               read (f_in,inputBuffer,finsize);        
-               close (f_in);   
-               
-               
+               read (f_in,inputBuffer,finsize);
+               close (f_in);
+
+
        {
-               //printf("\n parametros <<%s>>\n\n",stopwordsfile);     
+               //printf("\n parametros <<%s>>\n\n",stopwordsfile);
                build_index (inputBuffer, finsize, stopwordsfile, &Index);  /** building the index */
 
 //             /** recovering the source text from the index */
@@ -1530,39 +1535,39 @@ double getTime2 (void)
                                ulong size;
                                get_length(Index, &size);
                                char extension[10]= ".source";
-                               
+
                                //recoverSourceText1((twcsa*) Index, outbasename,extension, size);
                                strcat(extension,"2");
                                recoverSourceText2((twcsa*) Index, outbasename,extension,size);
-                               end = getTime2();       
-                               fprintf(stderr, "\nRecovering source file time: %.3f secs\n", end-start );      
+                               end = getTime2();
+                               fprintf(stderr, "\nRecovering source file time: %.3f secs\n", end-start );
                        }
-//             
+//
                // DISPLAYING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
                        {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
                         int error = 0;
                        ulong numocc,numc, length, i, *snippet_len, tot_numcharext = 0, numpatt;
                        uchar *pattern, *snippet_text;
-                                
+
                                 pattern = textPattern;
                         printf("\nSEARCH TEST for DISPLAY (pizzachili interface)\n");
-                               while(1) {      
+                               while(1) {
                                        printf("Intro string: ");
                                        fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
                                        if (!strcmp((char*)textPattern,"\n") ) break;
                                         textPattern[strlen((char*)textPattern)-1] = '\0';
-               
+
                                        length = strlen( (char*)textPattern);
                                        numc=50;
-         
-//                                     error = display (Index, textPattern, length, numc, &numocc, 
+
+//                                     error = display (Index, textPattern, length, numc, &numocc,
 //                                                              &snippet_text, &snippet_len);
-                                       error = displayWords (Index, textPattern, length, numc, &numocc, 
+                                       error = displayWords (Index, textPattern, length, numc, &numocc,
                                                                 &snippet_text, &snippet_len,1);
-                                       
+
                                        if (error){ fprintf(stderr, "%s\n", "Hubo un error durante display");exit(0);}
-               
-                                               fprintf(stderr,"\n acabou display");fflush(stderr);                     
+
+                                               fprintf(stderr,"\n acabou display");fflush(stderr);
                                        {//show the results
                                                ulong j, len = length + 2*numc;
                                            char blank = '\0';
@@ -1570,7 +1575,7 @@ double getTime2 (void)
                                                fprintf(stderr,"\n pattern = %s",pattern);fflush(stderr);
                                                fprintf(stderr,"\n numocc = %d",numocc);fflush(stderr);
                                                fprintf(stderr,"\n snippet len = %d",len);fflush(stderr);
-                                               fprintf(stderr,"\n =========");fflush(stderr);          
+                                               fprintf(stderr,"\n =========");fflush(stderr);
                                                for (i = 0; i < numocc; i++){
                                                        fprintf(stderr,"\n[%2d][len=%3d]<<",i+1,snippet_len[i]);fflush(stderr);
                                                        fwrite(snippet_text+len*i,sizeof(uchar),snippet_len[i],stderr);fflush(stderr);
@@ -1578,21 +1583,21 @@ double getTime2 (void)
                                                }
                                        }
                                        numpatt--;
-                                       
+
                                        for(i=0; i<numocc; i++) {
                                                tot_numcharext += snippet_len[i];
                                        }
-                                               
+
                                        if (numocc) {
                                                free (snippet_len);
                                                free (snippet_text);
                                        }
-                                       
+
                                        printf("Ocurrences = %d\n", numocc);
                                        if (!strcmp((char*)textPattern,"\n") ) break;
                                }
                        }
-       
+
 //
 //
 //     // SEARCHING FOR A TEXT PATTERN (word/phrase).
@@ -1602,28 +1607,28 @@ double getTime2 (void)
 //      uint *occs;
 //      int i;
 //      printf("\nSEARCH TEST for LOCATE\n");
-//             while(1) {      
+//             while(1) {
 //                     printf("Intro string: ");
 //                     fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
 //                     len = strlen((char*)textPattern);
 //                     if (!strcmp((char*)textPattern,"\n") ) break;
 //                     textPattern[len-1] = '\0';
 //                     len --;
-//                     
+//
 //                     //occs = locateTextOcurrences(wcsa,textPattern,&occ);
 //                     // locate(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ);
 //                       locateWord(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ, 0);
-//                     
+//
 //                     printf("\n*** %s occurs %d times: In the source text in positions:\n\t",textPattern,occ);
-//                     for (i=0;i<occ;i++) 
+//                     for (i=0;i<occ;i++)
 //                             printf("[%u]",occs[i]);
-//                     fflush(stderr); 
-//                     free(occs);             
-//                     
+//                     fflush(stderr);
+//                     free(occs);
+//
 //                     if (!strcmp((char*)textPattern,"\n") ) break;
 //             }
-//     }       
-//     
+//     }
+//
 //
 
                // COUNTING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
@@ -1632,15 +1637,15 @@ double getTime2 (void)
                 int occ;
                 int len;
                 printf("\nSEARCH TEST for COUNT.\n");
-                       while(1) {      
+                       while(1) {
                                printf("Intro string: ");
                                fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
                                len = strlen((char*)textPattern);
                                if (!strcmp((char*)textPattern,"\n") ) break;
                                textPattern[len-1] = '\0';
                                len --;
-                               
-                               count(Index, textPattern, len, (ulong *)&occ);                  
+
+                               count(Index, textPattern, len, (ulong *)&occ);
                                //occ = countTextOcurrences(wcsa,textPattern);
                                printf("Ocurrences = %d\n", occ);
                        }
@@ -1653,17 +1658,17 @@ double getTime2 (void)
                save_index (Index, outbasename);
 
                /** tells the mem used by the index */
-               ulong indexsize;                
+               ulong indexsize;
                index_size(Index, &indexsize);
                fprintf(stderr,"Index occupied %d bytes, 2 extra mallocs = %d",indexsize,2* sizeof(uint));
 
-               /** freeing the index */                                        
-               free_index(Index);                              
-                        
+               /** freeing the index */
+               free_index(Index);
+
        }
 }
-       
+
 #endif
 
 
-       
+
index 8e77870..a8aa4f7 100755 (executable)
@@ -1,12 +1,12 @@
 
-/* DYNAMIC END-TAGGED DENSE CODE. -- 
-A dynamic word-based byte oriented compressor for text files based on 
+/* DYNAMIC END-TAGGED DENSE CODE. --
+A dynamic word-based byte oriented compressor for text files based on
 dynamic End-Tagged Dense Code.
 
-Brisaboa, N. R., Faria, A., Navarro, G., Param, J. R. 
-Simple, Fast, and Efficient Natural Language Adaptive Compression. 
-11th International Symposium on String Processing and Information Retrieval (SPIRE'04) - LNCS 3246. A. Apostolico, M. Melucci (Ed.), pp. 230-241. 
-Padova (Italia), 2004. 
+Brisaboa, N. R., Faria, A., Navarro, G., Param, J. R.
+Simple, Fast, and Efficient Natural Language Adaptive Compression.
+11th International Symposium on String Processing and Information Retrieval (SPIRE'04) - LNCS 3246. A. Apostolico, M. Melucci (Ed.), pp. 230-241.
+Padova (Italia), 2004.
 
 Copyright (C) 2005 Antonio Faria.
 
@@ -31,7 +31,7 @@ A Corua. Campus de Elvia s/n. Spain  fari@udc.es
 /*-----------------------------------------------------------------------
  Hash: Definition of HashTable class (Linear Hash)
  ------------------------------------------------------------------------*/
+
 #include "hash.h"
 
 #include <stdio.h>
@@ -61,9 +61,8 @@ t_hash initialize_hash (unsigned long sizeVoc) {
                h->hash[i].len = 0;
                h->hash[i].posInVoc = 0;
        }
-       printf("\nHash Table initilized with: %lu elements\n",h->SIZE_HASH);
 
-       return h;       
+       return h;
 }
 
 
@@ -74,7 +73,6 @@ unsigned long NearestPrime(unsigned long n)
 {
     long position;  /* the prime number being sought */
     long index;
-
     for (position = n; ; position++)
     {
         // checks if those values from 2 to $\sqrt{m}$ can be factors of $m$ */
@@ -109,12 +107,22 @@ unsigned long hashFunction (const unsigned char *aWord, unsigned int len, unsign
     //for(c=*aWord; (aWord++)<last ;  )
     {
        c=*(aWord++);
+
                //c=*aWord;
-               h ^= ( (h << 5) + c + (h >> 2) );
+       h ^= ( (h << 5) + c + (h >> 2) );
     }
+
     return((unsigned int)((h&0x7fffffff) % sizeHash));
 }
 
+unsigned long newhashFunction(const unsigned char *aWord, unsigned int len, unsigned long sizeHash)
+{
+  unsigned int h = 0;
+  unsigned int i = 0;
+  for(i = 0; i  < len; i ++)
+    h = ((unsigned int) aWord[i]) + (h << 6) + (h << 16) - h;
+  return (h&0x7fffffff) % sizeHash;
+}
 
 /*-----------------------------------------------------------------------
   compares two strings
@@ -127,12 +135,12 @@ unsigned long hashFunction (const unsigned char *aWord, unsigned int len, unsign
 
 
 
- int strcomp(const unsigned char *s1, const unsigned char *s2, register unsigned long ws1, unsigned long ws2) {         
+ int strcomp(const unsigned char *s1, const unsigned char *s2, register unsigned long ws1, unsigned long ws2) {
         if (ws1 !=ws2) {
                        return -1;
         }
-        
-        {  register unsigned long iters;           
+
+        {  register unsigned long iters;
                 iters=1;
            while( iters<ws1 && *s1 == *s2 )
            {
@@ -146,11 +154,11 @@ unsigned long hashFunction (const unsigned char *aWord, unsigned int len, unsign
 }
 
 inline int strcomp3(const unsigned char *s1, const unsigned char *s2, unsigned long ws1, unsigned long ws2) {
-        
+
         if (ws1 !=ws2) {
                        return -1;
         }
-        
+
         {  register unsigned long iters;
            register unsigned long end;
            end = MIN(ws1,ws2);
@@ -185,7 +193,7 @@ inline int strcomp3(const unsigned char *s1, const unsigned char *s2, unsigned l
 //}
 
 //int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) {
-//     register ulong iters = 1;           
+//     register ulong iters = 1;
 //
 //     while( iters<ws1 && *s1 == *s2 ){
 //             s1++;
@@ -196,18 +204,18 @@ inline int strcomp3(const unsigned char *s1, const unsigned char *s2, unsigned l
 //     if (ws1 == iters) {
 //                     if (ws2 == ws1)
 //                             return 0;  // s1 equals to s2
-//                     else 
+//                     else
 //                             return -1;   // s1 < s2
 //     }
 //     else { //(ws1 > iters) so s1 != s2
 //             return( *s1-*s2);
-//     }        
+//     }
 //}
 
 
 //permits to compare 2 strings of len ws1 and ws2 that do not end in '\0'
 int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) {
-       register ulong iters = 0;           
+       register ulong iters = 0;
 
        while( iters<ws1 && iters<ws2 && *s1 == *s2 ){
                s1++;
@@ -218,37 +226,77 @@ int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong
        if (ws1 == iters) {
                        if (ws2 == ws1)
                                return 0;  // s1 equals to s2
-                       else 
+                       else
                                return -1;   // w1 < w2  and  *s1_i == *s2_i for i=0 to iters-1
        }
-       else 
+       else
        if (ws2 == iters) {
                        if (ws2 == ws1)
                                return 0;  // s1 equals to s2
-                       else 
+                       else
                                return +1;   // w2 < w1  and  *ws1 is '\0'
        }
        else { //*s1 != *s2
                return (*s1-*s2);
-       }       
-       
+       }
+
 }
-       
+
 
 /*-----------------------------------------------------------------------
  Insert a new word in a position of the hashTable (position previously computed)
  ---------------------------------------------------------------------*/
+
+void insertAux(t_hash h, unsigned char *aWord, unsigned long len, unsigned long pos)
+{
+  unsigned long addr = 0;
+  unsigned long dummy = search(h, aWord, len, &addr);
+  h->hash[addr].word = aWord;
+  h->hash[addr].len = len;
+  h->hash[addr].posInVoc = pos;
+  h->NumElem++;
+}
+
+void resize(t_hash h){
+  unsigned long oldsize = h->SIZE_HASH;
+  unsigned long newsize = NearestPrime(h->SIZE_HASH * 2);
+  t_hashNode* oldh = h->hash;
+  unsigned long l = newsize * sizeof(t_hashNode);
+  t_hashNode* newh = (t_hashNode *) malloc(newsize * sizeof(t_hashNode));
+  unsigned long i;
+  fprintf(stderr, "Resizing hashtable from %lu to %lu (%lu bytes)\n", oldsize, newsize, l);
+  h->hash = newh;
+  h->SIZE_HASH = newsize;
+  for (i = 0; i < h->SIZE_HASH; i++)    {
+    h->hash[i].word = NULL;
+    h->hash[i].len = 0;
+    h->hash[i].posInVoc = 0;
+  }
+  h->NumElem = 0;
+  for(i = 0; i < oldsize; i++)
+    if (oldh[i].word != NULL){
+      insertAux(h, oldh[i].word, oldh[i].len, oldh[i].posInVoc);
+      oldh[i].word = NULL;
+      oldh[i].len = 0;
+      oldh[i].posInVoc=0;
+    }
+  //free(oldh);
+
+}
+
+
 void insertElement (t_hash h, const unsigned char *aWord, register unsigned long len, register unsigned long *addr) {
     //fprintf(stderr,"\n Entra en la funcin [%s], [%ld]",aWord, len);
 
-       if(h->NumElem >= h->SIZE_HASH -1)   //loses 1 slot, but ensures that "search function" does not enter an infinity loop
+       if(h->NumElem >= h->SIZE_HASH/2)
        {
-               printf("\n\nHash table full!! Change size and recompile !\n");
-               exit(1);
-       }
+        
+        resize(h);
 
-    getMemoryBlock(h->_memMgr,( byte **)&(h->hash[*addr].word),len+1);
-    //fprintf(stderr,"\n tras obter memoria");
+       }
+       //h->hash[*addr].word = (unsigned char*) malloc((len+1) * sizeof(unsigned char));
+       getMemoryBlock(h->_memMgr,( byte **)&(h->hash[*addr].word),len+1);
+       //fprintf(stderr,"\n tras obter memoria");
 
        strncpy ((char *) h->hash[*addr].word, (char *)aWord, len);
 
@@ -258,7 +306,7 @@ void insertElement (t_hash h, const unsigned char *aWord, register unsigned long
        h->NumElem++;
 
        //fprintf(stderr,"\n####inserted word [%s] ",h->hash[*addr].word);
-       
+
        //return *addr;
 }
 
@@ -273,23 +321,24 @@ unsigned long search (t_hash h, const unsigned char *aWord, register unsigned le
                                                                 unsigned long *returnedAddr){
 
        register unsigned long addr, Saddr;
-       
+
        //fprintf(stderr,"\n searching for  [%s], [%d], sizehash= %ld",aWord,len,h->SIZE_HASH);
        addr = hashFunction(aWord,len, h->SIZE_HASH);
        Saddr = addr;
 
        t_hashNode *hash;
        hash = h->hash;
-       
-       while((hash[addr].word  != NULL)&&((strcomp(hash[addr].word, aWord,  hash[addr].len, len)) != 0))  {    
+
+       while((hash[addr].word  != NULL)&&((strcomp(hash[addr].word, aWord,  hash[addr].len, len)) != 0))  {
                //fprintf(stderr,"\nComprueba [%s], [%d]",hash[addr].word,strcomp(hash[addr].word, aWord, len));
                addr = ((addr + JUMP) % h->SIZE_HASH);
        }
 
        *returnedAddr = addr;
-       
+
        if(hash[addr].word  == NULL) {
-               return h->NumElem;                              //Word was not found
+         //      fprintf(stderr, "word:'%.*s', hash %lu\n", len, aWord, Saddr);
+         return h->NumElem;                            //Word was not found
        }
        else {
                return h->hash[addr].posInVoc; //Word was found in this position in the vocabulary
@@ -302,7 +351,7 @@ unsigned long search (t_hash h, const unsigned char *aWord, register unsigned le
   Tells if a word appears or not in the hash table.
  -----------------------------------------------------------------------*/
 unsigned long inHashTable (t_hash h, const unsigned char *aWord, register unsigned len, unsigned long *returnedAddr){
-       
+
        unsigned long searched;
        unsigned long nothing;
        searched = search(h,aWord,len,&nothing);
@@ -315,12 +364,15 @@ unsigned long inHashTable (t_hash h, const unsigned char *aWord, register unsign
  ------------------------------------------------------------------ */
 void destroy_hash (t_hash hash){
        unsigned long mem=0;
-       mem += sizeof(struct hashStr) + hash->SIZE_HASH * sizeof(t_hashNode);   
+       unsigned long i = 0;
+       mem = sizeof(struct hashStr) + hash->SIZE_HASH * sizeof(t_hashNode);
+       printf("\n[destroying hash table]...Freed %ld bytes... RAM\n", mem);
+       fflush(stdout);
+       
        free(hash->hash);
-       destroyMemoryManager(hash->_memMgr); //frees words and variants 
-//     free(hash->_memMgr);
-       free(hash);     
-       printf("\n[destroying hash table]...Freed %ld bytes... RAM", mem);      
+       destroyMemoryManager(hash->_memMgr); //frees words and variants
+///    free(hash->_memMgr);
+       free(hash);
 }
 
 
@@ -340,48 +392,48 @@ int main(int argc, char* argv[])
        byte * w;
        unsigned int size;
        unsigned long i,addrInTH;
-       
+
        t_hash hash;
-       
+
        _memMgr=createMemoryManager();
 
        hash = initialize_hash (2);
-       
+
        w=a;
        i = search (hash,w, strlen(w), &addrInTH );
        insertElement (hash, w, strlen(w), &addrInTH);
        fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
-       fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);   
+       fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
 
        w=b;
        i = search (hash,w, strlen(w), &addrInTH );
        insertElement (hash, w, strlen(w), &addrInTH);
        fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
-       fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);   
+       fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
 
        w=c;
        i = search (hash,w, strlen(w), &addrInTH );
        insertElement (hash, w, strlen(w), &addrInTH);
        fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
-       fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);   
+       fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
 
        w=d;
        i = search (hash,w, strlen(w), &addrInTH );
        insertElement (hash, w, strlen(w), &addrInTH);
        fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
-       fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);   
+       fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
 
 //     w=e;
 //     i = search (hash,w, strlen(w), &addrInTH );
 //     insertElement (hash, w, strlen(w), &addrInTH);
 //     fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
-//     fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);   
+//     fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
 
 //     w=f;
 //     i = search (hash,w, strlen(w), &addrInTH );
 //     insertElement (hash, w, strlen(w), &addrInTH);
 //     fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
-//     fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);   
+//     fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
 
        fprintf(stderr,"\n in: %s  hash ? = %d",a,inHashTable(hash,a,strlen(a)) );
        fprintf(stderr,"\n in: %s  hash ? = %d",e, inHashTable(hash,e,strlen(e)) );