uint avgWordLen =7;
uint i, j;//, tmplen;
- uint prevValid;
+ uint prevValid = 0;
byte *src, *dst, *buff;
uint tmplen =0;
//-----------------------------------------------------------------
//1st pass (processing the file)
{
- byte *pbeg,*pend,*wordstart,*aWord;
- register ulong size;
- register uint i;
-
- pbeg = inputBuffer;
- pend = inputBuffer+bytesFileReal;
-
- while (pbeg <pend) {
-
- //parsing either a word or separator.
- size=0;
- wordstart = pbeg;
- if (_Valid[*pbeg]) { //alphanumerical data
- while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
- }
- else {
- if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
- while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
- }
- else { //a SPACE comes, so we have to test if next character is alphanumerical or not
- pbeg++;
- if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
- else {
- if (_Valid [*pbeg] ) {
- wordstart = pbeg; //So skipping 1 blank character
- while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
- }
- else { // a "separator word" ...
- size++; //the prev BLANK...
- while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
- }//else { // a "separator word"
- }//else ... not a unique BLANK AT THE END.
- }//else ... starting by a BLANK...
- }
-
- //The parsed word/separator is is "wordstart", and its length is "size"...
- aWord=wordstart;
-
- //Processement done for each word word
- i = inHashTable(hash,aWord, size, &addrInTH );
- if (!i){
- insertElement (hash,aWord, size, &addrInTH);
- posInHT[zeroNode].slot=addrInTH;
- posInHT[zeroNode].word=hash->hash[addrInTH].word;
- hash->hash[addrInTH].posInVoc = zeroNode;
- zeroNode++;
- totallenWords += size +1; // +1 due to the '\0' char...
- //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size);
- }
- seSize ++;
- }//while pbeg<pend
-
- fprintf(stderr,"\n1st pass ends: TOTAL distinct words: %lu totalWords = %u",zeroNode,seSize);
-
- }//1st pass ends
-
+ byte *pbeg,*pend,*wordstart,*aWord;
+ register ulong size;
+ register uint i;
+
+ pbeg = inputBuffer;
+ pend = inputBuffer+bytesFileReal;
+
+ while (pbeg <pend)
+ {
+ if (*pbeg == 0)
+ {
+ fprintf(stderr, "buildFacade.c: assert failed, *pbeg == 0\n");
+ exit(1);
+ }
+
+ //parsing either a word or separator.
+ size=0;
+ wordstart = pbeg;
+ if (_Valid[*pbeg]) { //alphanumerical data
+ while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+ }
+ else
+ {
+ if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
+ while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
+ }
+ else { //a SPACE comes, so we have to test if next character is alphanumerical or not
+ pbeg++;
+ if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
+ else {
+ if (_Valid [*pbeg] ) {
+ wordstart = pbeg; //So skipping 1 blank character
+ while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+ }
+ else { // a "separator word" ...
+ size++; //the prev BLANK...
+ while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
+ }//else { // a "separator word"
+ }//else ... not a unique BLANK AT THE END.
+ }//else ... starting by a BLANK...
+ }
+
+ if (pbeg < pend && *pbeg == 0)
+ pbeg ++; // Skip the 0-bytes
+
+ if (size == 0)
+ {
+ fprintf(stderr, "buildFacade.c: assert failed, size == 0\n");
+ exit(1);
+ }
+
+ //The parsed word/separator starts at "wordstart", and its length is "size"...
+ aWord=wordstart;
+
+ //Processing done for each parsed word
+ i = inHashTable(hash,aWord, size, &addrInTH );
+ if (!i){
+ insertElement (hash,aWord, size, &addrInTH);
+ posInHT[zeroNode].slot=addrInTH;
+ posInHT[zeroNode].word=hash->hash[addrInTH].word;
+ hash->hash[addrInTH].posInVoc = zeroNode;
+ zeroNode++;
+ totallenWords += size +1; // +1 due to the '\0' char...
+ //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size);
+ }
+ seSize ++;
+
+ }//while pbeg<pend
+
+ fprintf(stderr,"\n1st pass ends: TOTAL distinct words: %lu totalWords = %u",zeroNode,seSize);
+
+ }//1st pass ends
+
// **********************************************************************************
// END OF 1ST PASS
printf("\nSTARTING THE SECOND PASS... ");
//2nd pass (processing the file)
{
- byte *pbeg,*pend,*wordstart,*aWord;
- register ulong size;
- register uint i;
- register ulong countValidWords = 0;
+ byte *pbeg,*pend,*wordstart,*aWord;
+ register ulong size;
+ register uint i;
+ register ulong countValidWords = 0;
- pbeg = inputBuffer;
- pend = inputBuffer+bytesFileReal;
+ pbeg = inputBuffer;
+ pend = inputBuffer+bytesFileReal;
- while (pbeg <pend) {
+ while (pbeg <pend) {
+ if (*pbeg == 0)
+ {
+ fprintf(stderr, "buildFacade.c 2nd pass: assert failed, *pbeg == 0\n");
+ exit(1);
+ }
- //parsing either a word or separator.
- size=0;
- wordstart = pbeg;
- if (_Valid[*pbeg]) { //alphanumerical data
- while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
- }
- else {
- if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
- while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
- }
- else { //a SPACE comes, so we have to test if next character is alphanumerical or not
- pbeg++;
- if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
- else {
- if (_Valid [*pbeg] ) {
- wordstart = pbeg; //So skipping 1 blank character
- while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
- }
- else { // a "separator word" ...
- size++; //the prev BLANK...
- while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
- }//else { // a "separator word"
- }//else ... not a unique BLANK AT THE END.
- }//else ... starting by a BLANK...
- }
-
- //The parsed word/separator is is "wordstart", and its length is "size"...
- aWord=wordstart;
-
- //Processement done for each word word
- i = inHashTable(hash,aWord, size, &addrInTH );
-
- SE[countValidWords]=hash->hash[addrInTH].posInVoc+1; // !!!!
- countValidWords++;
-
- }// while pbeg<pend
+ //parsing either a word or separator.
+ size=0;
+ wordstart = pbeg;
+ if (_Valid[*pbeg]) { //alphanumerical data
+ while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+ }
+ else {
+ if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
+ while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
+ }
+ else { //a SPACE comes, so we have to test if next character is alphanumerical or not
+ pbeg++;
+ if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
+ else {
+ if (_Valid [*pbeg] ) {
+ wordstart = pbeg; //So skipping 1 blank character
+ while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+ }
+ else { // a "separator word" ...
+ size++; //the prev BLANK...
+ while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
+ }//else { // a "separator word"
+ }//else ... not a unique BLANK AT THE END.
+ }//else ... starting by a BLANK...
+ }
+
+ if (pbeg < pend && *pbeg == 0)
+ pbeg ++; // Skip the 0-bytes
+
+ if (size == 0)
+ {
+ fprintf(stderr, "buildFacade.c 2nd pass: assert failed, size == 0\n");
+ exit(1);
+ }
+
+ //The parsed word/separator starts at "wordstart", and its length is "size"...
+ aWord=wordstart;
+
+ //Processing done for each parsed word
+ i = inHashTable(hash,aWord, size, &addrInTH );
+
+ SE[countValidWords]=hash->hash[addrInTH].posInVoc+1; // !!!!
+ countValidWords++;
+
+ }// while pbeg<pend
- SE[countValidWords] = 0;
- fprintf(stderr,"\n2nd pass ends: TOTAL distinct words: %lu totalWords = %lu",zeroNode,countValidWords);
+ SE[countValidWords] = 0;
+ fprintf(stderr,"\n2nd pass ends: TOTAL distinct words: %lu totalWords = %lu",zeroNode,countValidWords);
}//2nd pass ends
// **********************************************************************************
//freeing the source text (it is no longer needed).
- free(inputBuffer); //the text
+ delete [] inputBuffer; //the text
/** Now Setting the data of the index **/
wcsa->n = zeroNode;
tmpOffsets[zeroNode]=tmpOffset; //setting pointer to the "virtual" word {zeroNode+1}^{th}
//kbit encoding of the offsets
- uint elemSize = bits(tmpOffset);
+ uint elemSize = _bits(tmpOffset);
wcsa->wordsData.elemSize = elemSize;
wcsa->wordsData.words = (uint *) malloc (((((zeroNode +1)*elemSize)+W-1) /W) * sizeof(uint)); //with 1 extra slot !.
wcsa->wordsData.words[((((zeroNode +1)*elemSize)+W-1) /W) -1 ] =0000;
printf("\n Summary of Presentation layer:");
printf("\n Number of valid words (SEsize) = %u",wcsa->seSize);
printf("\n Number of different words = %ld",wcsa->n);
- printf("\n WCSA structure = %d bytes", sizeof(twcsa));
+ printf("\n WCSA structure = %lu bytes", sizeof(twcsa));
uint totalpointers = ((((wcsa->n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));
uint totalasciizone = wcsa->wordsData.wordsZoneMem.size * sizeof(byte) ;