Various fixes:
[SXSI/TextCollection.git] / swcsa / buildFacade.c
index 768e943..b581059 100755 (executable)
@@ -482,7 +482,7 @@ int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text,
     uint avgWordLen =7;
 
        uint i, j;//, tmplen;
-       uint prevValid;
+       uint prevValid = 0;
        byte *src, *dst, *buff;
        uint tmplen =0;
 
@@ -617,62 +617,79 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
        //-----------------------------------------------------------------     
        //1st pass (processing the file)
        { 
-               byte *pbeg,*pend,*wordstart,*aWord;
-               register ulong size;
-               register uint i;
-
-               pbeg = inputBuffer;
-               pend = inputBuffer+bytesFileReal;
-                               
-               while (pbeg <pend) {  
-                       
-                       //parsing either a word or separator.                   
-                       size=0;
-                       wordstart = pbeg;
-                       if (_Valid[*pbeg]) {   //alphanumerical data
-                               while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
-                   }               
-                       else {
-                               if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word                                      
-                                       while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
-                               }
-                               else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
-                                       pbeg++;
-                                       if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
-                                       else {
-                                               if (_Valid [*pbeg] ) {
-                                                       wordstart = pbeg;   //So skipping 1 blank character
-                                                       while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
-                                               }
-                                               else {   // a "separator word" ...
-                                                       size++; //the prev BLANK...
-                                                       while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
-                                               }//else {  // a "separator word"
-                                       }//else ... not a unique BLANK AT THE END.
-                               }//else ... starting by a BLANK... 
-                       }
-
-                       //The parsed word/separator is  is "wordstart", and its length is "size"...
-                       aWord=wordstart;                                
-
-                       //Processement done for each word word                  
-                       i = inHashTable(hash,aWord, size, &addrInTH );                          
-                       if (!i){
-                               insertElement (hash,aWord, size, &addrInTH);
-                               posInHT[zeroNode].slot=addrInTH;
-                               posInHT[zeroNode].word=hash->hash[addrInTH].word;
-                               hash->hash[addrInTH].posInVoc = zeroNode;
-                               zeroNode++;
-                               totallenWords += size +1;                       // +1 due to the '\0' char...           
-                               //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size);                           
-                       }                                                       
-                       seSize ++;
-               }//while pbeg<pend
-
-               fprintf(stderr,"\n1st pass ends: TOTAL distinct words: %lu totalWords = %u",zeroNode,seSize);
-
-       }//1st pass ends
-
+            byte *pbeg,*pend,*wordstart,*aWord;
+            register ulong size;
+            register uint i;
+            
+            pbeg = inputBuffer;
+            pend = inputBuffer+bytesFileReal;
+            
+            while (pbeg <pend) 
+            {
+                if (*pbeg == 0)
+                {
+                    fprintf(stderr, "buildFacade.c: assert failed, *pbeg == 0\n");
+                    exit(1);
+                }
+
+                //parsing either a word or separator.                  
+                size=0;
+                wordstart = pbeg;
+                if (_Valid[*pbeg]) {   //alphanumerical data
+                    while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+                }                  
+                else
+                {
+                    if (*pbeg != ' ') { //a separator NOT starting with ' ' comes, so it is a new word
+                        while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
+                    }
+                    else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
+                        pbeg++;
+                        if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
+                        else {
+                            if (_Valid [*pbeg] ) {
+                                wordstart = pbeg;   //So skipping 1 blank character
+                                while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+                            }
+                            else {   // a "separator word" ...
+                                size++; //the prev BLANK...
+                                while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
+                            }//else {  // a "separator word"
+                        }//else ... not a unique BLANK AT THE END.
+                    }//else ... starting by a BLANK... 
+                }
+                
+                if (pbeg < pend && *pbeg == 0) 
+                    pbeg ++;    // Skip the 0-bytes
+
+                if (size == 0)
+                {
+                    fprintf(stderr, "buildFacade.c: assert failed, size == 0\n");
+                    exit(1);
+                }
+
+                //The parsed word/separator starts at "wordstart", and its length is "size"...
+                aWord=wordstart;                               
+                
+                //Processing done for each word
+                i = inHashTable(hash,aWord, size, &addrInTH );                         
+                if (!i){
+                    insertElement (hash,aWord, size, &addrInTH);
+                    posInHT[zeroNode].slot=addrInTH;
+                    posInHT[zeroNode].word=hash->hash[addrInTH].word;
+                    hash->hash[addrInTH].posInVoc = zeroNode;
+                    zeroNode++;
+                    totallenWords += size +1;                  // +1 due to the '\0' char...           
+                   //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size);
+                }                                                      
+                seSize ++;
+                
+            }//while pbeg<pend
+            
+            fprintf(stderr,"\n1st pass ends: TOTAL distinct words: %lu totalWords = %u",zeroNode,seSize);
+            
+        }//1st pass ends
+        
 
        // **********************************************************************************
        // END OF 1ST PASS
@@ -702,56 +719,70 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
        printf("\nSTARTING THE SECOND PASS... ");
        //2nd pass (processing the file)
        { 
-               byte *pbeg,*pend,*wordstart,*aWord;
-               register ulong size;
-               register uint i;
-               register ulong countValidWords = 0;
+            byte *pbeg,*pend,*wordstart,*aWord;
+            register ulong size;
+            register uint i;
+            register ulong countValidWords = 0;
 
 
-               pbeg = inputBuffer;
-               pend = inputBuffer+bytesFileReal;
+            pbeg = inputBuffer;
+            pend = inputBuffer+bytesFileReal;
                                
-               while (pbeg <pend) {  
+            while (pbeg <pend) {  
+                if (*pbeg == 0)
+                {
+                    fprintf(stderr, "buildFacade.c 2nd pass: assert failed, *pbeg == 0\n");
+                    exit(1);
+                }
                        
-                       //parsing either a word or separator.                   
-                       size=0;
-                       wordstart = pbeg;
-                       if (_Valid[*pbeg]) {   //alphanumerical data
-                               while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
-                 }                 
-                       else {
-                               if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word                                      
-                                       while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
-                               }
-                               else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
-                                       pbeg++;
-                                       if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
-                                       else {
-                                               if (_Valid [*pbeg] ) {
-                                                       wordstart = pbeg;   //So skipping 1 blank character
-                                                       while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
-                                               }
-                                               else {   // a "separator word" ...
-                                                       size++; //the prev BLANK...
-                                                       while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
-                                               }//else {  // a "separator word"
-                                       }//else ... not a unique BLANK AT THE END.
-                               }//else ... starting by a BLANK... 
-                       }
-
-                       //The parsed word/separator is  is "wordstart", and its length is "size"...
-                       aWord=wordstart;                                        
-
-                       //Processement done for each word word                  
-                       i = inHashTable(hash,aWord, size, &addrInTH );                          
-
-                       SE[countValidWords]=hash->hash[addrInTH].posInVoc+1;  // !!!!
-                       countValidWords++;              
-
-               }// while pbeg<pend
+                //parsing either a word or separator.                  
+                size=0;
+                wordstart = pbeg;
+                if (_Valid[*pbeg]) {   //alphanumerical data
+                    while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+                }                  
+                else {
+                    if (*pbeg != ' ') { //a separator NOT starting with ' ' comes, so it is a new word
+                        while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
+                    }
+                    else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
+                        pbeg++;
+                        if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
+                        else {
+                            if (_Valid [*pbeg] ) {
+                                wordstart = pbeg;   //So skipping 1 blank character
+                                while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+                            }
+                            else {   // a "separator word" ...
+                                size++; //the prev BLANK...
+                                while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg]  && *pbeg != 0)) {size++;pbeg++;}
+                            }//else {  // a "separator word"
+                        }//else ... not a unique BLANK AT THE END.
+                    }//else ... starting by a BLANK... 
+                }
+                    
+                if (pbeg < pend && *pbeg == 0) 
+                    pbeg ++;    // Skip the 0-bytes
+
+                if (size == 0)
+                {
+                    fprintf(stderr, "buildFacade.c 2nd pass: assert failed, size == 0\n");
+                    exit(1);
+                }
+
+                //The parsed word/separator starts at "wordstart", and its length is "size"...
+                aWord=wordstart;                                       
+                    
+                //Processing done for each word
+                i = inHashTable(hash,aWord, size, &addrInTH );                         
+                    
+                SE[countValidWords]=hash->hash[addrInTH].posInVoc+1;  // !!!!
+                countValidWords++;             
+                    
+            }// while pbeg<pend
                
-               SE[countValidWords] = 0;
-               fprintf(stderr,"\n2nd pass ends: TOTAL distinct words: %lu totalWords = %lu",zeroNode,countValidWords);
+            SE[countValidWords] = 0;
+            fprintf(stderr,"\n2nd pass ends: TOTAL distinct words: %lu totalWords = %lu",zeroNode,countValidWords);
                        
        }//2nd pass ends
        
@@ -760,7 +791,7 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
        // **********************************************************************************
        
        //freeing the source text (it is no longer needed).
-       free(inputBuffer); //the text   
+       delete [] inputBuffer; //the text       
 
        /** Now Setting the data of the index **/
        wcsa->n = zeroNode;
@@ -792,7 +823,7 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
                tmpOffsets[zeroNode]=tmpOffset; //setting pointer to the "virtual" word {zeroNode+1}^{th}
                
                //kbit encoding of the offsets
-               uint elemSize = bits(tmpOffset);
+               uint elemSize = _bits(tmpOffset);
                wcsa->wordsData.elemSize = elemSize;
                wcsa->wordsData.words = (uint *) malloc (((((zeroNode +1)*elemSize)+W-1) /W) * sizeof(uint));  //with 1 extra slot !.
                wcsa->wordsData.words[((((zeroNode +1)*elemSize)+W-1) /W)   -1 ] =0000;
@@ -1369,7 +1400,7 @@ int printInfo(void *index) {
                printf("\n Summary of Presentation layer:");            
                printf("\n   Number of valid words (SEsize) = %u",wcsa->seSize);
                printf("\n   Number of different words = %ld",wcsa->n);
-               printf("\n   WCSA structure = %d bytes", sizeof(twcsa));
+               printf("\n   WCSA structure = %lu bytes", sizeof(twcsa));
 
                uint totalpointers = ((((wcsa->n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));
                uint totalasciizone = wcsa->wordsData.wordsZoneMem.size * sizeof(byte) ;