- byte *pbeg,*pend,*wordstart,*aWord;
- register ulong size;
- register uint i;
-
- pbeg = inputBuffer;
- pend = inputBuffer+bytesFileReal;
-
- while (pbeg <pend) {
-
- //parsing either a word or separator.
- size=0;
- wordstart = pbeg;
- if (_Valid[*pbeg]) { //alphanumerical data
- while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
- }
- else {
- if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
- while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
- }
- else { //a SPACE comes, so we have to test if next character is alphanumerical or not
- pbeg++;
- if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
- else {
- if (_Valid [*pbeg] ) {
- wordstart = pbeg; //So skipping 1 blank character
- while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
- }
- else { // a "separator word" ...
- size++; //the prev BLANK...
- while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
- }//else { // a "separator word"
- }//else ... not a unique BLANK AT THE END.
- }//else ... starting by a BLANK...
- }
-
- //The parsed word/separator is is "wordstart", and its length is "size"...
- aWord=wordstart;
-
- //Processement done for each word word
- i = inHashTable(hash,aWord, size, &addrInTH );
- if (!i){
- insertElement (hash,aWord, size, &addrInTH);
- posInHT[zeroNode].slot=addrInTH;
- posInHT[zeroNode].word=hash->hash[addrInTH].word;
- hash->hash[addrInTH].posInVoc = zeroNode;
- zeroNode++;
- totallenWords += size +1; // +1 due to the '\0' char...
- //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size);
- }
- seSize ++;
- }//while pbeg<pend
-
- fprintf(stderr,"\n1st pass ends: TOTAL distinct words: %lu totalWords = %u",zeroNode,seSize);
-
- }//1st pass ends
-
+ byte *pbeg,*pend,*wordstart,*aWord;
+ register ulong size;
+ register uint i;
+
+ pbeg = inputBuffer;
+ pend = inputBuffer+bytesFileReal;
+
+ while (pbeg <pend)
+ {
+ if (*pbeg == 0)
+ {
+ fprintf(stderr, "buildFacade.c: assert failed, *pbeg == 0\n");
+ exit(1);
+ }
+
+ //parsing either a word or separator.
+ size=0;
+ wordstart = pbeg;
+ if (_Valid[*pbeg]) { //alphanumerical data
+ while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+ }
+ else
+ {
+ if (*pbeg != ' ') { //a separator not starting with ' ' comes, so it is a new word
+ while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
+ }
+ else { //a SPACE comes, so we have to test if next character is alphanumerical or not
+ pbeg++;
+ if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
+ else {
+ if (_Valid [*pbeg] ) {
+ wordstart = pbeg; //So skipping 1 blank character
+ while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+ }
+ else { // a "separator word" ...
+ size++; //the prev BLANK...
+ while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
+ }//else { // a "separator word"
+ }//else ... not a unique BLANK AT THE END.
+ }//else ... starting by a BLANK...
+ }
+
+ if (pbeg < pend && *pbeg == 0)
+ pbeg ++; // Skip a single 0-byte; a second consecutive one trips the assert at the top of the loop
+
+ if (size == 0)
+ {
+ fprintf(stderr, "buildFacade.c: assert failed, size == 0\n");
+ exit(1);
+ }
+
+ //The parsed word/separator starts at "wordstart", and its length is "size"...
+ aWord=wordstart;
+
+ //Processing done for each parsed word
+ i = inHashTable(hash,aWord, size, &addrInTH );
+ if (!i){
+ insertElement (hash,aWord, size, &addrInTH);
+ posInHT[zeroNode].slot=addrInTH;
+ posInHT[zeroNode].word=hash->hash[addrInTH].word;
+ hash->hash[addrInTH].posInVoc = zeroNode;
+ zeroNode++;
+ totallenWords += size +1; // +1 due to the '\0' char...
+ //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size);
+ }
+ seSize ++;
+
+ }//while pbeg<pend
+
+ fprintf(stderr,"\n1st pass ends: TOTAL distinct words: %lu totalWords = %u",zeroNode,seSize);
+
+ }//1st pass ends
+