uint avgWordLen =7;
uint i, j;//, tmplen;
- uint prevValid;
+ uint prevValid = 0;
byte *src, *dst, *buff;
uint tmplen =0;
//-----------------------------------------------------------------
//1st pass (processing the file)
{
- byte *pbeg,*pend,*wordstart,*aWord;
- register ulong size;
- register uint i;
-
- pbeg = inputBuffer;
- pend = inputBuffer+bytesFileReal;
-
- while (pbeg <pend) {
-
- //parsing either a word or separator.
- size=0;
- wordstart = pbeg;
- if (_Valid[*pbeg]) { //alphanumerical data
- while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
- }
- else {
- if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
- while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
- }
- else { //a SPACE comes, so we have to test if next character is alphanumerical or not
- pbeg++;
- if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
- else {
- if (_Valid [*pbeg] ) {
- wordstart = pbeg; //So skipping 1 blank character
- while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
- }
- else { // a "separator word" ...
- size++; //the prev BLANK...
- while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
- }//else { // a "separator word"
- }//else ... not a unique BLANK AT THE END.
- }//else ... starting by a BLANK...
- }
-
- //The parsed word/separator is is "wordstart", and its length is "size"...
- aWord=wordstart;
-
- //Processement done for each word word
- i = inHashTable(hash,aWord, size, &addrInTH );
- if (!i){
- insertElement (hash,aWord, size, &addrInTH);
- posInHT[zeroNode].slot=addrInTH;
- posInHT[zeroNode].word=hash->hash[addrInTH].word;
- hash->hash[addrInTH].posInVoc = zeroNode;
- zeroNode++;
- totallenWords += size +1; // +1 due to the '\0' char...
- //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size);
- }
- seSize ++;
- }//while pbeg<pend
-
- fprintf(stderr,"\n1st pass ends: TOTAL distinct words: %lu totalWords = %u",zeroNode,seSize);
-
- }//1st pass ends
-
+ byte *pbeg,*pend,*wordstart,*aWord;
+ register ulong size;
+ register uint i;
+
+ pbeg = inputBuffer;
+ pend = inputBuffer+bytesFileReal;
+
+ while (pbeg <pend)
+ {
+ if (*pbeg == 0)
+ {
+ fprintf(stderr, "buildFacade.c: assert failed, *pbeg == 0\n");
+ exit(1);
+ }
+
+ //parsing either a word or separator.
+ size=0;
+ wordstart = pbeg;
+ if (_Valid[*pbeg]) { //alphanumerical data
+ while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+ }
+ else
+ {
+ if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
+ while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
+ }
+ else { //a SPACE comes, so we have to test if next character is alphanumerical or not
+ pbeg++;
+ if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
+ else {
+ if (_Valid [*pbeg] ) {
+ wordstart = pbeg; //So skipping 1 blank character
+ while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+ }
+ else { // a "separator word" ...
+ size++; //the prev BLANK...
+ while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
+ }//else { // a "separator word"
+ }//else ... not a unique BLANK AT THE END.
+ }//else ... starting by a BLANK...
+ }
+
+ if (pbeg < pend && *pbeg == 0)
+ pbeg ++; // Skip the 0-bytes
+
+ if (size == 0)
+ {
+ fprintf(stderr, "buildFacade.c: assert failed, size == 0\n");
+ exit(1);
+ }
+
+ //The parsed word/separator starts at "wordstart", and its length is "size"...
+ aWord=wordstart;
+
+ //Processing done for each word
+ i = inHashTable(hash,aWord, size, &addrInTH );
+ if (!i){
+ insertElement (hash,aWord, size, &addrInTH);
+ posInHT[zeroNode].slot=addrInTH;
+ posInHT[zeroNode].word=hash->hash[addrInTH].word;
+ hash->hash[addrInTH].posInVoc = zeroNode;
+ zeroNode++;
+ totallenWords += size +1; // +1 due to the '\0' char...
+ //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size);
+ }
+ seSize ++;
+
+ }//while pbeg<pend
+
+ fprintf(stderr,"\n1st pass ends: TOTAL distinct words: %lu totalWords = %u",zeroNode,seSize);
+
+ }//1st pass ends
+
// **********************************************************************************
// END OF 1ST PASS
printf("\nSTARTING THE SECOND PASS... ");
//2nd pass (processing the file)
{
- byte *pbeg,*pend,*wordstart,*aWord;
- register ulong size;
- register uint i;
- register ulong countValidWords = 0;
+ byte *pbeg,*pend,*wordstart,*aWord;
+ register ulong size;
+ register uint i;
+ register ulong countValidWords = 0;
- pbeg = inputBuffer;
- pend = inputBuffer+bytesFileReal;
+ pbeg = inputBuffer;
+ pend = inputBuffer+bytesFileReal;
- while (pbeg <pend) {
+ while (pbeg <pend) {
+ if (*pbeg == 0)
+ {
+ fprintf(stderr, "buildFacade.c 2nd pass: assert failed, *pbeg == 0\n");
+ exit(1);
+ }
- //parsing either a word or separator.
- size=0;
- wordstart = pbeg;
- if (_Valid[*pbeg]) { //alphanumerical data
- while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
- }
- else {
- if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
- while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
- }
- else { //a SPACE comes, so we have to test if next character is alphanumerical or not
- pbeg++;
- if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
- else {
- if (_Valid [*pbeg] ) {
- wordstart = pbeg; //So skipping 1 blank character
- while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
- }
- else { // a "separator word" ...
- size++; //the prev BLANK...
- while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
- }//else { // a "separator word"
- }//else ... not a unique BLANK AT THE END.
- }//else ... starting by a BLANK...
- }
-
- //The parsed word/separator is is "wordstart", and its length is "size"...
- aWord=wordstart;
-
- //Processement done for each word word
- i = inHashTable(hash,aWord, size, &addrInTH );
-
- SE[countValidWords]=hash->hash[addrInTH].posInVoc+1; // !!!!
- countValidWords++;
-
- }// while pbeg<pend
+ //parsing either a word or separator.
+ size=0;
+ wordstart = pbeg;
+ if (_Valid[*pbeg]) { //alphanumerical data
+ while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+ }
+ else {
+ if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
+ while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
+ }
+ else { //a SPACE comes, so we have to test if next character is alphanumerical or not
+ pbeg++;
+ if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
+ else {
+ if (_Valid [*pbeg] ) {
+ wordstart = pbeg; //So skipping 1 blank character
+ while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+ }
+ else { // a "separator word" ...
+ size++; //the prev BLANK...
+ while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
+ }//else { // a "separator word"
+ }//else ... not a unique BLANK AT THE END.
+ }//else ... starting by a BLANK...
+ }
+
+ if (pbeg < pend && *pbeg == 0)
+ pbeg ++; // Skip the 0-bytes
+
+ if (size == 0)
+ {
+ fprintf(stderr, "buildFacade.c 2nd pass: assert failed, size == 0\n");
+ exit(1);
+ }
+
+ //The parsed word/separator starts at "wordstart", and its length is "size"...
+ aWord=wordstart;
+
+ //Processing done for each word
+ i = inHashTable(hash,aWord, size, &addrInTH );
+
+ SE[countValidWords]=hash->hash[addrInTH].posInVoc+1; // !!!!
+ countValidWords++;
+
+ }// while pbeg<pend
- SE[countValidWords] = 0;
- fprintf(stderr,"\n2nd pass ends: TOTAL distinct words: %lu totalWords = %lu",zeroNode,countValidWords);
+ SE[countValidWords] = 0;
+ fprintf(stderr,"\n2nd pass ends: TOTAL distinct words: %lu totalWords = %lu",zeroNode,countValidWords);
}//2nd pass ends
// **********************************************************************************
//freeing the source text (it is no longer needed).
- free(inputBuffer); //the text
+ delete [] inputBuffer; //the text
/** Now Setting the data of the index **/
wcsa->n = zeroNode;
printf("\n Summary of Presentation layer:");
printf("\n Number of valid words (SEsize) = %u",wcsa->seSize);
printf("\n Number of different words = %ld",wcsa->n);
- printf("\n WCSA structure = %d bytes", sizeof(twcsa));
+ printf("\n WCSA structure = %lu bytes", sizeof(twcsa));
uint totalpointers = ((((wcsa->n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));
uint totalasciizone = wcsa->wordsData.wordsZoneMem.size * sizeof(byte) ;
/* Writes in numocc the number of occurrences of the substring
pattern[0..length-1] found in the text indexed by index. */
-//int count (void *index, uchar *pattern, ulong length, ulong *numocc);
-//
-// /* Writes in numocc the number of occurrences of the substring
-// pattern[0..length-1] in the text indexed by index. It also allocates
-// occ (which must be freed by the caller) and writes the locations of
-// the numocc occurrences in occ, in arbitrary order. */
-//
-//int locate (void *index, uchar *pattern, ulong length, ulong **occ,
-// ulong *numocc);
-//
-// /* Gives the length of the text indexed */
-//
-//int get_length(void *index, ulong *length);
-//
-///* Accessing the indexed text */
-//
-// /* Allocates snippet (which must be freed by the caller) and writes
-// the substring text[from..to] into it. Returns in snippet_length the
-// length of the text snippet actually extracted (that could be less
-// than to-from+1 if to is larger than the text size). */
-//
-//int extract (void *index, ulong from, ulong to, uchar **snippet,
-// ulong *snippet_length);
-//
-// /* Displays the text (snippet) surrounding any occurrence of the
-// substring pattern[0..length-1] within the text indexed by index.
-// The snippet must include numc characters before and after the
-// pattern occurrence, totalizing length+2*numc characters, or less if
-// the text boundaries are reached. Writes in numocc the number of
-// occurrences, and allocates the arrays snippet_text and
-// snippet_lengths (which must be freed by the caller). The first is a
-// character array of numocc*(length+2*numc) characters, with a new
-// snippet starting at every multiple of length+2*numc. The second
-// gives the real length of each of the numocc snippets. */
-//
-//int display (void *index, uchar *pattern, ulong length, ulong numc,
-// ulong *numocc, uchar **snippet_text, ulong **snippet_lengths);
+int count (void *index, uchar *pattern, ulong length, ulong *numocc);
+
+ /* Writes in numocc the number of occurrences of the substring
+ pattern[0..length-1] in the text indexed by index. It also allocates
+ occ (which must be freed by the caller) and writes the locations of
+ the numocc occurrences in occ, in arbitrary order. */
+
+int locate (void *index, uchar *pattern, ulong length, ulong **occ,
+ ulong *numocc);
+
+ /* Gives the length of the text indexed */
+
+int get_length(void *index, ulong *length);
+
+/* Accessing the indexed text */
+
+ /* Allocates snippet (which must be freed by the caller) and writes
+ the substring text[from..to] into it. Returns in snippet_length the
+ length of the text snippet actually extracted (that could be less
+ than to-from+1 if to is larger than the text size). */
+
+int extract (void *index, ulong from, ulong to, uchar **snippet,
+ ulong *snippet_length);
+
+ /* Displays the text (snippet) surrounding any occurrence of the
+ substring pattern[0..length-1] within the text indexed by index.
+ The snippet must include numc characters before and after the
+ pattern occurrence, totalizing length+2*numc characters, or less if
+ the text boundaries are reached. Writes in numocc the number of
+ occurrences, and allocates the arrays snippet_text and
+ snippet_lengths (which must be freed by the caller). The first is a
+ character array of numocc*(length+2*numc) characters, with a new
+ snippet starting at every multiple of length+2*numc. The second
+ gives the real length of each of the numocc snippets. */
+
+int display (void *index, uchar *pattern, ulong length, ulong numc,
+ ulong *numocc, uchar **snippet_text, ulong **snippet_lengths);
/* Obtains the length of the text indexed by index. */
int length (void *index, ulong *length);
+ /* Shows summary info of the index */
+
+int printInfo(void *index);
+
+
+
+/** ***********************************************************************************
+ * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord
+ * ***********************************************************************************/
+ /** Writes in numocc the number of occurrences of the substring
+ pattern[0..length-1] in the text indexed by index. It also allocates
+ occ (which must be freed by the caller) and writes the locations of
+ the numocc occurrences in occ, in arbitrary order. These occurrences
+ refer to the offsets in TOH where the caller could start a display
+ operation. So locateWord implies synchronization using B.
+ ** Parameter kbefore makes locateWord return, instead of the offset in TOH of the
+ searched word, the offset in TOH of the word located kbefore words before it.
+ */
+
+int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore);
+
+ /** Displays the text (snippet) surrounding any occurrence of the
+ substring pattern[0..length-1] within the text indexed by index.
+ The snippet must include numc characters before and after the
+ pattern occurrence, totalizing length+2*numc characters, or less if
+ the text boundaries are reached. Writes in numocc the number of
+ occurrences, and allocates the arrays snippet_text and
+ snippet_lengths (which must be freed by the caller). The first is a
+ character array of numocc*(length+2*numc) characters, with a new
+ snippet starting at every multiple of length+2*numc. The second
+ gives the real length of each of the numocc snippets. */
+
+ int displayWords (void *index, uchar *pattern, ulong length, ulong numc,
+ ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore);
+
+
+/** Simulates the text extraction process, but does not actually return anything.
+ Extracts up to 2K words, starting K=wordsbefore words before each occurrence of a pattern.
+ Fewer than 2K words may be extracted if more than maxnumc characters have already been obtained.
+ Does nothing else; the text is not returned. */
+
+int displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc);
+
+
+
+/** Allocates text (which must be freed by the caller) and recovers the
+ substring of text starting from the "fromWord"-th word up to the
+ "toWord"-th word. Returns the text in "text", and in "text_length" the
+ length of the text actually extracted. Text is allocated.
+ Actually extracts SE[fromWord .. toWord), i.e. not the last word. */
+
+int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text,
+ ulong *text_length);