1 #include "buildFacade.h"
2 #include "utils/errors.c"
5 /** Building the index */
7 /* Creates index from text[0..length-1]. Note that the index is an
8 opaque data type. Any build option must be passed in string
9 build_options, whose syntax depends on the index. The index must
10 always work with some default parameters if build_options is NULL.
11 The returned index is ready to be queried. */
12 int build_index (uchar *text, ulong length, char *build_options, void **index) {
/* Echoes the option string.
   NOTE(review): the message is written to stdout while stderr is the stream
   being flushed, and build_options is formatted with %s although the contract
   in the header comment allows it to be NULL -- confirm both are intended. */
15 printf("\n parameters: \"%s\"\n",build_options); fflush(stderr);
/* Stage 1: word-level structures (vocabulary and integer sequence) are built
   from the text. */
17 returnvalue = build_WCSA (text, length, build_options, index);
/* Stage 2: the integer self-index is built for the structure held in *index.
   NOTE(review): this assignment replaces the stage-1 result; verify that the
   first value is examined before it is overwritten. */
20 returnvalue = build_iCSA (build_options,*index);
26 /** Saves index on disk by using single or multiple files, having
28 int save_index (void *index, char *filename) {
30 char *basename = filename;
31 twcsa *wcsa=(twcsa *) index;
38 printf("\n Saving structures to disk: %s.*",filename);
/* Scratch buffer for "<basename>.<ext>": the 10 spare bytes have to hold the
   dot, the extension and the terminating NUL.
   NOTE(review): assumes every *_FILE_EXT is at most 8 characters long, and the
   malloc result is not checked on the lines shown here -- TODO confirm. */
39 outfilename = (char *)malloc(sizeof(char)*(strlen(basename)+10));
41 /** File with some constants (sourceTextSize and seSize). */
43 strcpy(outfilename, basename);
44 strcat(outfilename, ".");
45 strcat(outfilename, CONSTANTS_FILE_EXT);
/* NOTE(review): O_CREAT is used without O_TRUNC, so a pre-existing longer file
   would keep its old trailing bytes -- confirm this is acceptable. */
47 if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
48 printf("Cannot open file %s\n", outfilename);
/* Each field is stored as sizeof(uint) bytes; the results of write() are not
   inspected on these lines. */
51 write(file, &(wcsa->sourceTextSize), sizeof(uint));
52 write(file, &(wcsa->seSize), sizeof(uint));
56 /** The words of the vocabulary (sorted alphabetically) */
57 { strcpy(outfilename, basename);
58 strcat(outfilename, ".");
59 strcat(outfilename, VOCABULARY_WORDS_FILE_EXT);
61 if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
62 printf("Cannot open file %s\n", outfilename);
67 uint elemSize = wcsa->wordsData.elemSize;
/* Header of the vocabulary file: n, the width (in bits) of each packed offset,
   and the number of bytes of word text. loadStructs reads them back in this
   same order. */
68 write(file, &n, sizeof(uint));
69 write(file, &elemSize, sizeof(uint));
70 write(file, &(wcsa->wordsData.wordsZoneMem.size), sizeof(uint));
72 //the number of canonical words
/* Raw characters of all the words, stored back to back. */
73 write(file, (char *)wcsa->wordsData.wordsZoneMem.zone, wcsa->wordsData.wordsZoneMem.size * sizeof(byte));
/* Bit-packed offsets: (n+1) entries of elemSize bits each, rounded up to whole
   W-bit units (W is presumably the bit width of a uint -- TODO confirm). */
74 write(file, (char *)wcsa->wordsData.words, ((((n+1)* (elemSize))+W-1) /W) * (sizeof(uint)) );
82 /******** saves index on integers (bottom) ******/
84 //storeStructsCSA(wcsa->myicsa,basename);
85 saveIntIndex((void *) wcsa->myicsa, basename);
/* The integer sequence is saved with seSize+1 entries, i.e. one more than the
   number of words. */
89 saveSEfile(basename,wcsa->se, wcsa->seSize+1);
98 /** Loads index from one or more file(s) named filename, possibly
99 adding the proper extensions. */
100 int load_index(char *filename, void **index){
/* loadWCSA allocates the structure and fills it from the files derived from
   filename. */
102 wcsa = loadWCSA (filename);
/* The loaded structure is handed back to the caller as an opaque pointer. */
103 (*index) = (void *) wcsa;
107 /** Frees the memory occupied by index. */
108 int free_index(void *index){
109 twcsa *wcsa=(twcsa *) index;
/* The footprint is measured first so that it can be reported; note that the
   "Freed" message is printed before the memory below is actually released. */
111 index_size(index,&size);
112 printf("\n[destroying index] ...Freed %lu bytes... RAM", size);
115 //frees the array SE.
121 //destroyStructsCSA(wcsa->myicsa);
/* Releases the integer self-index (bottom layer). */
122 int err = freeIntIndex((void *) wcsa->myicsa);
/* Vocabulary storage: first the word characters, then the bit-packed offsets. */
126 free (wcsa->wordsData.wordsZoneMem.zone);
127 free (wcsa->wordsData.words); /** huge!! */
129 //the pointer to wcsa.
134 /** Gives the memory occupied by index in bytes. */
135 int index_size(void *index, ulong *size) {
137 twcsa *wcsa=(twcsa *)index;
/* NOTE(review): the result is accumulated with +=; confirm that *size is set
   to zero beforehand (free_index passes a local variable here). */
140 *size += sizeof(twcsa);
/* Vocabulary cost: the bit-packed offsets ((n+1) entries of elemSize bits,
   rounded up to whole W-bit units) plus the raw bytes of the words. */
143 totaltmp += ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)); //the pointers
144 totaltmp += wcsa->wordsData.wordsZoneMem.size * sizeof(byte); //the characters of the words.
/* The size of the integer self-index is obtained from the bottom layer. */
149 int err = sizeIntIndex((void *) wcsa->myicsa, &nbytes);
151 //*size += CSA_size(wcsa->myicsa);
158 /** Querying the index =============================================================*/
160 /* Writes in numocc the number of occurrences of the substring
161 pattern[0..length-1] found in the text indexed by index. */
162 int count (void *index, uchar *pattern, ulong length, ulong *numocc){
/* Fixed-capacity buffer receiving one integer per word of the pattern.
   NOTE(review): no capacity is passed to parseTextIntoIntegers; confirm that
   it cannot store more than MAX_INTEGER_PATTERN_SIZE entries. */
163 uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
164 uint integerPatternSize;
167 twcsa *wcsa=(twcsa *) index;
/* The text pattern is translated into vocabulary identifiers; a size of 0
   means the pattern cannot occur in the indexed text. */
168 parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);
169 if (!integerPatternSize) {*numocc=0; return 0;} //not found
171 //*numocc = countCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &l, &r);
/* The integer pattern is counted in the bottom-layer index; l and r are
   additional outputs of that call. */
172 int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, numocc, &l, &r);
176 /* Writes in numocc the number of occurrences of the substring
177 pattern[0..length-1] in the text indexed by index. It also allocates
178 occ (which must be freed by the caller) and writes the locations of
179 the numocc occurrences in occ, in arbitrary order. */
180 int locate(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc){
184 /* Gives the length of the text indexed */
185 int get_length(void *index, ulong *length) {
186 twcsa *wcsa=(twcsa *) index;
/* The size of the source text is kept in the structure (it is set at build
   time and stored in the constants file), so no computation is needed. */
187 *length = wcsa->sourceTextSize;
191 /** Obtains the length of the text indexed by index. */
193 int length (void *index, ulong *length) {
/* Thin alias: the work is delegated to get_length. */
194 return (get_length(index,length));
198 /** ***********************************************************************************
199 * Accessing the indexed text
200 * ***********************************************************************************/
203 /** Allocates snippet (which must be freed by the caller) and writes
204 the substring text[from..to] into it. Returns in snippet_length the
205 length of the text snippet actually extracted (that could be less
206 than to-from+1 if to is larger than the text size). */
207 int extract (void *index, ulong from, ulong to, uchar **snippet, ulong *snippet_length) {
208 twcsa *wcsa=(twcsa *) index;
212 /** Displays the text (snippet) surrounding any occurrence of the
213 substring pattern[0..length-1] within the text indexed by index.
214 The snippet must include numc characters before and after the
215 pattern occurrence, totalizing length+2*numc characters, or less if
216 the text boundaries are reached. Writes in numocc the number of
217 occurrences, and allocates the arrays snippet_text and
218 snippet_lengths (which must be freed by the caller). The first is a
219 character array of numocc*(length+2*numc) characters, with a new
220 snippet starting at every multiple of length+2*numc. The second
221 gives the real length of each of the numocc snippets. */
223 int display (void *index, uchar *pattern, ulong length, ulong numc,
224 ulong *numocc, uchar **snippet_text, ulong **snippet_lengths) {
230 /** ***********************************************************************************
231 * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord
232 * ***********************************************************************************/
233 /* Writes in numocc the number of occurrences of the substring
234 pattern[0..length-1] in the text indexed by index. It also allocates
235 occ (which must be freed by the caller) and writes the locations of
236 the numocc occurrences in occ, in arbitrary order. These occurrences
237 refer to the offsets in TOH where the caller could start a display
238 operation. So locateWord implies synchronization using B.
239 Moreover, positions occ[numocc.. 2*numocc-1] are set with the rank in SE of the
240 words whose codes begin in TOH in the positions in occ[0... numocc-1]
241 ** Parameter kbefore sets locateWord not to obtain the offset in TOH of the
242 searched word, but the offset in TOH of k-before words before.
245 int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore){
/* NOTE(review): kbefore is not referenced on any of the lines shown here. */
/* Fixed-capacity buffer receiving one integer per word of the pattern. */
246 uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
247 uint integerPatternSize;
248 ulong occurrences,l,r;
249 twcsa *wcsa=(twcsa *) index;
/* The text pattern is translated into vocabulary identifiers; a size of 0
   means the pattern cannot occur in the indexed text. */
251 parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);
252 if (!integerPatternSize) {*numocc=0; return 0;} //not found
256 //obtains the indexes in vector SE where the pattern appears.
257 //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences);
258 int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences);
260 *numocc = occurrences;
262 if (!occurrences) {(*occ)=NULL;return 0;}
/* The array produced by the bottom layer is handed over to the caller as-is.
   NOTE(review): the declaration of seOffsets is not visible here; if its
   element type is not ulong this cast changes how the entries are read --
   TODO confirm. */
264 (*occ) = (ulong *)seOffsets;
269 /** Displays the text (snippet) surrounding any occurrence of the
270 substring pattern[0..length-1] within the text indexed by index.
271 The snippet must include numc characters before and after the
272 pattern occurrence, totalizing length+2*numc characters, or less if
273 the text boundaries are reached. Writes in numocc the number of
274 occurrences, and allocates the arrays snippet_text and
275 snippet_lengths (which must be freed by the caller). The first is a
276 character array of numocc*(length+2*numc) characters, with a new
277 snippet starting at every multiple of length+2*numc. The second
278 gives the real length of each of the numocc snippets. */
280 int displayWords (void *index, uchar *pattern, ulong length, ulong numc,
281 ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore) {
283 /** Actually extracts up to length + 2*numc chars, starting the extraction kbefore
284 * words before the occurrence **/
288 uint bytesPerSnippet;
290 twcsa *wcsa=(twcsa *) index;
/* Occurrences are requested with kbefore = 0; the backward shift is applied
   per occurrence inside the loop below. */
292 locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);
293 (*numocc) = occurrences;
297 *snippet_lengths =NULL;
/* NOTE(review): length and numc are ulong but their sum is stored in a uint. */
301 bytesPerSnippet = length+2*numc;
302 // bytesPerSnippet = 2*numc;
/* One length per occurrence, and one fixed-size slot of bytesPerSnippet bytes
   per occurrence for the text.
   NOTE(review): both failure returns below exit immediately, without freeing
   indexesInSE; the second one also leaves *snippet_lengths allocated. */
303 *snippet_lengths = (ulong *) malloc((*numocc)*sizeof(ulong));
304 if (!(*snippet_lengths)) return 1;
305 *snippet_text = (uchar *) malloc((*numocc)*(bytesPerSnippet)*sizeof(uchar) +1) ; //(the last "1" is for '\0');
306 if (!(*snippet_text)) return 1;
308 // fprintf(stderr,"\n occs found = %7d for pattern %s",*numocc, pattern);
/* text_aux walks over the output buffer, one slot per occurrence. */
311 text_aux=*snippet_text;
317 uint posSEValue,indexSE;
319 for (i=0;i<occurrences;i++) {
323 /** decodes words from there */
/* Starting position in SE: kbefore words before the occurrence, clamped at 0. */
325 indexSE = indexesInSE[i];
326 indexSE = (indexSE > kbefore) ? indexSE-kbefore : 0;
329 while ((!endSnippet) && (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
331 //posSEValue =displayCSA(wcsa->myicsa,indexSE);
/* Reads the value stored at position indexSE of the integer sequence. */
332 int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
334 {//obtains pointer to the ith word
/* SE stores the vocabulary rank plus one, hence the -1. */
336 uint ith = posSEValue -1; // !!
/* Offsets ith and ith+1 of the bit-packed array delimit the word inside the
   zone of characters; their difference is the length of the word. */
337 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
338 offtmp = bitread (wcsa->wordsData.words, ( ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
339 tmplen -=offtmp; //the length of the ith word.
341 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
348 if (snippetLen==bytesPerSnippet) break; //end of snippet (ends in BLANK_SPACE)
350 prevValid =1; //for the next iteration
/* The word is truncated if it does not fit in what is left of the slot, and
   the loop is flagged to stop. */
357 if ((tmplen+snippetLen)>=bytesPerSnippet) {
358 tmplen =(bytesPerSnippet - snippetLen);
359 endSnippet=1; //so while loop ends;
362 for (j=0;j<tmplen;j++) {*dst++ = *src++;} //copies word to the output buffer
/* Moves on to the slot of the next occurrence and records the real length of
   the snippet just produced. */
366 text_aux += bytesPerSnippet;
367 (*snippet_lengths)[i] = snippetLen;
370 if (occurrences) free(indexesInSE);
375 /** Simulates the text extraction process, but does not actually return any text.
376 Extracts up to 2K words, starting K=wordsbefore words before each occurrence of a pattern.
377 Fewer than 2K words can be extracted if more than numc characters have already been obtained.
378 Does nothing else... does not return the text */
380 int displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc) {
386 twcsa *wcsa=(twcsa *) index;
/* Occurrences are requested with kbefore = 0; the backward shift of
   wordsbefore words is applied per occurrence inside the loop below. */
388 locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);
394 ulong maxsnippetLen = maxnumc;
395 ulong extractedbytes = 0;
/* Scratch buffer holding a single snippet at a time; it is released before
   returning and never handed to the caller.
   NOTE(review): the malloc result is not checked on the lines shown here. */
397 text_aux = (byte *) malloc (maxsnippetLen+1);
404 uint posSEValue,indexSE;
/* At most 2*wordsbefore words are decoded per occurrence. */
406 uint numWordsToExtract = 2 * wordsbefore;
408 //printf("\n occurrences... = %lu",occurrences);
410 for (i=0;i<occurrences;i++) {
414 /** decodes words from there */
/* Starting position in SE: wordsbefore words before the occurrence, clamped at 0. */
416 indexSE = indexesInSE[i];
417 indexSE = (indexSE > wordsbefore) ? indexSE-wordsbefore : 0;
421 while ((z<numWordsToExtract) && (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
423 //posSEValue =displayCSA(wcsa->myicsa,indexSE);
/* Reads the value stored at position indexSE of the integer sequence. */
424 int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
426 {//obtains pointer to the ith word
/* SE stores the vocabulary rank plus one, hence the -1. */
428 uint ith = posSEValue -1; // !!
/* Offsets ith and ith+1 of the bit-packed array delimit the word inside the
   zone of characters; their difference is the length of the word. */
429 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
430 offtmp = bitread (wcsa->wordsData.words, ( ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
431 tmplen -=offtmp; //the length of the ith word.
433 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
440 if (snippetLen==maxsnippetLen) break; //end of snippet (ends in BLANK_SPACE)
442 prevValid =1; //for the next iteration
449 if ((tmplen+snippetLen)>=maxsnippetLen) {
453 //fprintf(stderr,"\ntmplen = %d ",tmplen); fflush(stderr);
454 for (j=0;j<tmplen;j++) {*dst++ = *src++;} //copies word to the output buffer
/* Only the number of decoded bytes is kept; the text itself is discarded. */
459 extractedbytes += snippetLen;
463 if (occurrences) free(indexesInSE);
465 if (text_aux) free (text_aux);
/* NOTE(review): extractedbytes is a ulong but the function returns int, so a
   large total is narrowed on return. */
466 return extractedbytes;
471 /** Allocates text (which must be freed by the caller) and recovers the
472 substring of the text starting from the "fromWord"-th word up to the
473 "toWord"-th word. Returns in text the text, and in "text_lenght" the
474 length of the text actually extracted. Text is allocated.
475 Actually extracts SE[fromWord .. toWord) ... not the last word. */
477 int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text,
480 twcsa *wcsa=(twcsa *) index;
481 uint initTextLen=10000;
484 uint i, j;//, tmplen;
486 byte *src, *dst, *buff;
489 uint buffBytes = 1000;
490 uint leng=0; //curr pos in buffer that was occupied.
/* Both bounds are clamped to the number of words in SE. */
492 if (toWord > wcsa->seSize) toWord = wcsa->seSize;
493 if (fromWord >= wcsa->seSize) fromWord = wcsa->seSize-1;
/* Initial capacity: at least 1000 bytes, or the estimated size of the range.
   NOTE(review): toWord-fromWord is an unsigned subtraction, so it wraps if
   fromWord is greater than toWord; the declaration of avgWordLen is not
   visible here. */
494 if (buffBytes < ( (toWord-fromWord)* avgWordLen)) buffBytes = ((toWord-fromWord)* avgWordLen);
496 buff = (uchar *) malloc (buffBytes * sizeof(char));
497 if (!buff) return 1; //out of memory.
/* NOTE(review): fromWord is a ulong stored into a uint cursor. */
500 register uint indexSE=fromWord;
504 while ( (indexSE < toWord) ){ /** extracting words (if not at the end) */
/* Reads the value stored at position indexSE of the integer sequence. */
506 int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
508 {//obtains pointer to the ith word
/* SE stores the vocabulary rank plus one, hence the -1. */
510 ith= posSEValue -1; // !!
/* Offsets ith and ith+1 of the bit-packed array delimit the word inside the
   zone of characters; their difference is the length of the word. */
511 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
512 offtmp = bitread (wcsa->wordsData.words, (ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
513 tmplen -=offtmp; //the length of the ith word.
514 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
/* The buffer is grown when the word (plus one extra byte) does not fit.
   NOTE(review): realloc's result overwrites buff directly, so on failure the
   previous buffer is no longer reachable when 1 is returned. */
517 if ( buffBytes < (leng + tmplen+1) ) {
519 buff = (uchar*) realloc(buff, buffBytes);
520 if (!buff) return 1; //out of memory.
529 prevValid =1; //for the next iteration
535 for (j=0;j<tmplen;j++) {*dst++ = *src++;} //copies word to the output buffer
547 /** ***********************************************************************************
548 CONSTRUCTION OF THE INDEX WCSA
549 ***********************************************************************************/
551 /**------------------------------------------------------------------
552 Compares two slots (alphanumerically). For qsort of canonical words
553 ------------------------------------------------------------------ */
554 int qSortWordsCompareAlpha(const void *arg1, const void *arg2) {
/* qsort hands over pointers to two tposInHT entries. */
555 tposInHT *a1 = (tposInHT *) arg1;
556 tposInHT *a2 = (tposInHT *) arg2;
/* The order is the byte-wise strcmp order of the two words, which therefore
   have to be NUL-terminated strings. */
557 return strcmp((char*)a1->word, (char *)a2->word);
561 * BUILDS THE WCSA INDEX
564 int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
/* Overview of the visible steps: a first pass over the text collects the
   distinct words in a hash table; the words are sorted; a second pass
   translates the text into the integer sequence SE (rank of each word plus
   one, followed by a final 0); finally the vocabulary is packed into the
   twcsa structure and the integer self-index is created. */
566 unsigned long zeroNode; //number of different canonical words.
568 t_hash hash; // the hash table to store both variants and canonical words.
569 tposInHT *posInHT; // structure for canonicals and variants+huffmans
573 uint seSize=0; //its size == "numberOfValidWords".
574 uint *SE; //Integers vector. (represents the rank of the valid words in the source text).
576 uint totallenWords=0; //The numberOfBytes that occupy canonical words (their ascii version) in memory
579 ulong bytesFile,bytesFileReal;
581 unsigned long size_posInHT;
582 /* used during first pass */
/* inputBuffer is an alias of the caller's text, not a copy. */
586 byte* inputBuffer = text;
587 bytesFileReal= bytesFile = length;
589 sourceTextSize=length;
591 /** Initializes WCSA structure*/
/* NOTE(review): the malloc result is not checked on the lines shown here. */
593 wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
597 //Estimation (using Heaps' law) of the number of different "meaningful" words.
598 //sizeNValue=N_value;
/* The estimate is computed on at least 5,000,000 bytes, so that short texts
   still get a reasonably large table. bytesFileReal keeps the true length. */
599 if(bytesFile<5000000) bytesFile = 5000000;
600 sizeNValue = (unsigned long) floor(3.9* pow(bytesFile,0.70) );
602 // Initializes the arrays used to detect if a char is valid or not.
604 // Initializes the arrays used to translate a char into lowercase.
608 // **********************************************************************************
609 //STARTING THE FIRST PASS.
610 // **********************************************************************************
611 printf("\nSTARTING THE FIRST PASS...");
/* posInHT gets one entry per distinct word; it starts with the estimated
   capacity and is enlarged on demand below. */
613 posInHT = (tposInHT *) malloc(sizeof(tposInHT) * sizeNValue);
614 size_posInHT = sizeNValue;
615 hash = initialize_hash (sizeNValue); //hash to contain both the parsed words
617 //-----------------------------------------------------------------
618 //1st pass (processing the file)
620 byte *pbeg,*pend,*wordstart,*aWord;
625 pend = inputBuffer+bytesFileReal;
631 fprintf(stderr, "buildFacade.c: assert failed, *pbeg == 0\n");
635 //parsing either a word or separator.
/* A word is a run of _Valid characters (at most MAX_SIZE_OF_WORD); a separator
   is a run of non-valid, non-zero characters (at most MAX_SIZE_OF_GAP). */
638 if (_Valid[*pbeg]) { //alphanumerical data
639 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
643 if (*pbeg != ' ') { //a separator that does not start with ' ': it is a token of its own
644 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
646 else { //a SPACE comes, so we have to test if next character is alphanumerical or not
648 if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
650 if (_Valid [*pbeg] ) {
651 wordstart = pbeg; //So skipping 1 blank character
652 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
654 else { // a "separator word" ...
655 size++; //the prev BLANK...
656 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
657 }//else { // a "separator word"
658 }//else ... not a unique BLANK AT THE END.
659 }//else ... starting by a BLANK...
662 if (pbeg < pend && *pbeg == 0)
663 pbeg ++; // Skip the 0-bytes
667 fprintf(stderr, "buildFacade.c: assert failed, size == 0\n");
671 //The parsed word/separator is in "wordstart", and its length is "size"...
674 //Processing done for each word
675 i = inHashTable(hash,aWord, size, &addrInTH );
677 insertElement (hash,aWord, size, &addrInTH);
/* posInHT is enlarged when it is full.
   NOTE(review): realloc's result overwrites posInHT directly and is not
   checked on the lines shown here. */
678 if (zeroNode >= size_posInHT){
680 posInHT = (tposInHT*) realloc(posInHT, size_posInHT * sizeof(tposInHT));
/* Cross links between the new posInHT entry and its slot in the hash table. */
682 posInHT[zeroNode].slot=addrInTH;
683 posInHT[zeroNode].word=hash->hash[addrInTH].word;
684 hash->hash[addrInTH].posInVoc = zeroNode;
686 totallenWords += size +1; // +1 due to the '\0' char...
687 //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size);
693 fprintf(stderr,"\n1st pass ends: TOTAL distinct words: %lu totalWords = %u",zeroNode,seSize);
698 // **********************************************************************************
700 // **********************************************************************************
702 // Sorting the words alphanumerically (over posInHT)
703 { register unsigned long i,j;
704 //sorting canonical words ...
705 qsort(posInHT, zeroNode, sizeof(tposInHT), qSortWordsCompareAlpha);
707 //setting in hash the new positions of the words in the hash table
/* After sorting, posInVoc of every hash slot is the alphabetical rank of its word. */
708 for (i=0;i<zeroNode;i++) {
709 hash->hash[posInHT[i].slot].posInVoc = i;
713 // INITIALIZING structures for the 2nd pass ......................................
/* One extra entry is reserved for the 0 written after the last word. */
715 SE = (uint *) malloc ((seSize+1)*sizeof (uint));
719 // **********************************************************************************
720 // STARTING THE SECOND PASS.
721 // **********************************************************************************/
723 printf("\nSTARTING THE SECOND PASS... ");
724 //2nd pass (processing the file)
726 byte *pbeg,*pend,*wordstart,*aWord;
729 register ulong countValidWords = 0;
733 pend = inputBuffer+bytesFileReal;
738 fprintf(stderr, "buildFacade.c 2nd pass: assert failed, *pbeg == 0\n");
742 //parsing either a word or separator.
/* Same tokenization as in the first pass. */
745 if (_Valid[*pbeg]) { //alphanumerical data
746 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
749 if (*pbeg != ' ') { //a separator that does not start with ' ': it is a token of its own
750 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
752 else { //a SPACE comes, so we have to test if next character is alphanumerical or not
754 if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
756 if (_Valid [*pbeg] ) {
757 wordstart = pbeg; //So skipping 1 blank character
758 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
760 else { // a "separator word" ...
761 size++; //the prev BLANK...
762 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
763 }//else { // a "separator word"
764 }//else ... not a unique BLANK AT THE END.
765 }//else ... starting by a BLANK...
768 if (pbeg < pend && *pbeg == 0)
769 pbeg ++; // Skip the 0-bytes
773 fprintf(stderr, "buildFacade.c 2nd pass: assert failed, size == 0\n");
777 //The parsed word/separator is in "wordstart", and its length is "size"...
780 //Processing done for each word
781 i = inHashTable(hash,aWord, size, &addrInTH );
/* Each word is stored as its alphabetical rank plus one, so that the value 0
   stays free for the final entry written below. */
783 SE[countValidWords]=hash->hash[addrInTH].posInVoc+1; // !!!!
788 SE[countValidWords] = 0;
789 fprintf(stderr,"\n2nd pass ends: TOTAL distinct words: %lu totalWords = %lu",zeroNode,countValidWords);
793 // **********************************************************************************
795 // **********************************************************************************
797 //freeing the source text (it is no longer needed).
/* NOTE(review): inputBuffer is the caller's text pointer, so this releases the
   caller's buffer; callers must neither use nor free text after the build. */
798 free(inputBuffer); //the text
800 /** Now Setting the data of the index **/
802 wcsa->sourceTextSize = sourceTextSize;
803 wcsa->seSize = seSize;
805 // Creating the words of the vocabulary...
807 /** copying the words into WCSA. */
808 uint *tmpOffsets = (uint *) malloc (sizeof(uint) * (zeroNode +1) ); //1 extra uint (to point to the virtual "zeroNode+1" ^th word.
814 //Moving data from posInHT to WCSA structure
815 //wcsa->wordsData = (twords *) malloc(sizeof(twords) * zeroNode);
/* totallenWords counted one terminator per word, so subtracting zeroNode gives
   the bytes of the words alone. */
816 wcsa->wordsData.wordsZoneMem.size = totallenWords - zeroNode; //without '\0' bytes (end-tag).
817 wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc ( wcsa->wordsData.wordsZoneMem.size * sizeof(byte) );
818 zoneMem = wcsa->wordsData.wordsZoneMem.zone;
/* The words are copied back to back in sorted order; tmpOffsets[i] records
   where the i-th word starts. */
819 for(i = 0; i < zeroNode; i++) {
820 src = posInHT[i].word; //copying the canonical word
821 //wcsa->wordsData.words[i].word = zoneMem; //setting the pointer
822 tmpOffsets[i]=tmpOffset; //offset in zoneMem
823 while (*src) {*zoneMem++ = *src++; tmpOffset++;} //moving data until '\0'
824 //*zoneMem='\0'; zoneMem++; //copies also the '\0'
827 tmpOffsets[zeroNode]=tmpOffset; //setting pointer to the "virtual" word {zeroNode+1}^{th}
829 //kbit encoding of the offsets
/* elemSize is the width used for every packed offset; _bits presumably returns
   the number of bits needed to represent the largest offset -- TODO confirm. */
830 uint elemSize = _bits(tmpOffset);
831 wcsa->wordsData.elemSize = elemSize;
832 wcsa->wordsData.words = (uint *) malloc (((((zeroNode +1)*elemSize)+W-1) /W) * sizeof(uint)); //with 1 extra slot !.
/* The last unit of the packed array is cleared before the offsets are written
   (0000 is just the value zero). */
833 wcsa->wordsData.words[((((zeroNode +1)*elemSize)+W-1) /W) -1 ] =0000;
834 // fprintf(stderr,"\n ElemSize = %d, maxOffset = %d",elemSize,tmpOffset);
/* NOTE(review): tmpOffset is reused here as the bit position for bitwrite; it
   is presumably reset to 0 and advanced by elemSize on lines not shown. */
837 for (i=0; i<=zeroNode; i++) { //setting "zeroNode+1" offsets
838 bitwrite(wcsa->wordsData.words, tmpOffset, elemSize, tmpOffsets[i]);
842 //////////// CHECKS IT WORKED. old !!!!
845 // for (i=0; i<zeroNode; i++) { //setting "zeroNode+1" offsets
846 // kk=bitread(wcsa->wordsData.words, i* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
847 // tmpOffset+=elemSize;
848 // if (kk != tmpOffsets[i]) {fprintf(stderr,"\n @@@@@@@@ DISTINTOS OFFSETS "); break;}
849 // else fprintf(stderr,"\n iguales, %d, %d :: <<%s>>len=%d",kk,i, posInHT[i].word, strlen((char*)posInHT[i].word));
853 // { uint len1, len, tmplen, len2;
855 // byte *wcsaWord, *src;
857 // for (p=0;p<zeroNode;p++) {
858 // {//preparing for strcompL
859 // len = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p+1)), wcsa->wordsData.elemSize);
860 // tmplen = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p)) , wcsa->wordsData.elemSize);
862 // //fprintf(stderr,"\n :: off[%d]= %d - off [%d] = %d ==> %d",p+1,len,p,tmplen,len-tmplen);
865 // wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
868 // src = posInHT[p].word;
869 // len1 = strlen((char *)src);
871 // if (strcompL(src,wcsaWord,len1,len2) != 0) {
872 // fprintf(stderr,"\n %6d DISTINTOS !! ===len1 %d,len %d===== <<",p,len1,len2);printWord(src,len1);
873 // fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>");
877 // fprintf(stderr,"\n %6d ======len1 %d,len2 %d===== <<",p,len1,len2);printWord(src,len1);
878 // fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>");
886 //frees memory from hash table and posInHT structures.
892 /** ******* creates the self-index on ints (bottom layer) ==> see build_icsa *********/
897 fprintf(stderr,"\n **** CREATING CSA from Edu's Code *****");
/* The integer index is built over the seSize words plus the final 0 entry. */
899 myicsa = createIntegerCSA(&SE,seSize+1,build_options);
900 wcsa->myicsa= myicsa;
901 total = CSA_size(myicsa);
903 free(SE); //SE is no longer needed, (it is indexed by the iCSA)
/* NOTE(review): seSize is a uint printed with %d; the declaration of total is
   not visible, so %ld should be checked against its type. */
904 printf("\n\t**** [iCSA built on %d words. Size = %ld bytes... RAM",seSize,total);
914 printf("\n\t ** Building done! **\n");
915 printf("\n Process finished!\n");
/* Builds the integer self-index (bottom layer) for an already populated twcsa
   and stores it in the structure's myicsa field. */
922 int build_iCSA (char *build_options, void *index)
924 twcsa *wcsa = (twcsa *) index;
925 /********* creates the self-index on ints (bottom layer) *********/
926 //creating CSA from Edu's code...
929 fprintf(stderr,"\n **** CREATING CSA-bottom-layer *****");
930 void *bottomIntIndex;
/* The integer sequence is indexed with seSize+1 entries, i.e. one more than
   the number of words. */
931 int err = buildIntIndex(wcsa->se,wcsa->seSize+1, build_options,(void **)&bottomIntIndex);
932 wcsa->myicsa = bottomIntIndex;
934 //total = CSA_size(wcsa->myicsa);
/* NOTE(review): err from buildIntIndex is replaced here; verify that it is
   examined before being overwritten. */
935 err = sizeIntIndex((void *) wcsa->myicsa, &total);
/* NOTE(review): seSize is printed with %d and total with %u; the declaration
   of total is not visible, so the specifier should be checked against it. */
937 printf("\n\t**** [iCSA built on %d words. Size = %u bytes... RAM",wcsa->seSize,total);
942 /** ********************************************************************
944 **********************************************************************/
946 /**-----------------------------------------------------------------
948 * Loads all the data structures of WCSA (included the icsa)
949 ----------------------------------------------------------------- */
951 twcsa *loadWCSA(char *filename) {
953 // Initializes the arrays used to detect if a char is valid or not.
955 // Initializes the arrays used to translate a char into lowercase.
/* NOTE(review): the malloc result is not checked on the lines shown here. */
958 wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
/* First the integer self-index (bottom layer) is loaded into myicsa ... */
961 int err = loadIntIndex(filename, (void **)&wcsa->myicsa);
/* ... then the constants and the vocabulary. */
963 loadStructs(wcsa,filename);
968 /** ------------------------------------------------------------------
970 * Reads files and loads all the data needed for searcherFacade
971 ----------------------------------------------------------------- */
972 void loadStructs(twcsa *wcsa, char *basename) {
/* Scratch buffer for "<basename>.<ext>": the 10 spare bytes have to hold the
   dot, the extension and the terminating NUL.
   NOTE(review): assumes every *_FILE_EXT is at most 8 characters long. */
980 filename = (char *)malloc(sizeof(char)*(strlen(basename)+10));
981 fprintf(stderr,"Loading Index from file %s.*\n", basename);
983 //** SOME CONSTANTS: sourceTextSize and seSize
984 { strcpy(filename, basename);
985 strcat(filename, ".");
986 strcat(filename, CONSTANTS_FILE_EXT);
988 if( (file = open(filename, O_RDONLY)) < 0) {
989 printf("Cannot open file %s\n", filename);
/* The two fields are read in the order in which save_index writes them; the
   results of read() are not inspected on these lines. */
993 read(file, &(wcsa->sourceTextSize), sizeof(uint));
994 read(file, &(wcsa->seSize), sizeof(uint));
998 /** File with the words from the vocabulary (sorted alphabetically) */
1001 strcpy(filename, basename);
1002 strcat(filename, ".");
1003 strcat(filename, VOCABULARY_WORDS_FILE_EXT);
1004 //sizeFile= fileSize(filename)-sizeof(uint);
1006 if( (file = open(filename, O_RDONLY)) < 0) {
1007 printf("Cannot open file %s\n", filename);
1011 //the number of canonical words
/* NOTE(review): n is read into a local here while parseTextIntoIntegers uses
   wcsa->n; the assignment to wcsa->n is presumably on a line not shown. */
1012 read(file, &n, sizeof(uint));
/* Width (in bits) of each packed offset, then the number of bytes of word text. */
1014 read(file, &(wcsa->wordsData.elemSize), (sizeof(uint)));
1015 read(file, &(wcsa->wordsData.wordsZoneMem.size), (sizeof(uint)));
1017 //allocating the memory needed for all words and reading them //(ascii) << no \0 chars are needed>>.
1018 wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc(wcsa->wordsData.wordsZoneMem.size * sizeof(byte));
1019 read(file, (wcsa->wordsData.wordsZoneMem.zone), wcsa->wordsData.wordsZoneMem.size * sizeof(byte) );
1021 //reading the offsets of the words (kbitArray that points to offsets in zoneMem of words).
/* (n+1) offsets of elemSize bits each, rounded up to whole W-bit units; the
   last unit is cleared before the array is read from the file. */
1022 wcsa->wordsData.words = (uint *) malloc (((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * sizeof(uint));
1023 wcsa->wordsData.words[ ((((n+1)*(wcsa->wordsData.elemSize))+W-1) /W) -1 ] =0000;
1024 read(file, (wcsa->wordsData.words), ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)));
1036 /** ****************************************************************
1037 * Querying the index WCSA
1038 * ***************************************************************/
1039 ///////////////////////////////////////////////////////////////////////////////////////
1040 // FUNCTIONS NEEDED FOR SEARCHING A PATTERN //
1041 ///////////////////////////////////////////////////////////////////////////////////////
1045 /*------------------------------------------------------------------
1046 * Given a text pattern translates it into a list of integers (corresponding to the
1047 * canonical words associated to the valid words in the text pattern)
1048 ------------------------------------------------------------------*/
1049 void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *integerPattern, uint *sizeIntegers) {
/* Each token of the pattern is looked up in the sorted vocabulary by binary
   search; its rank plus one is appended to integerPattern. If some token is
   not in the vocabulary, *sizeIntegers is set to 0 and the function returns.
   NOTE(review): no capacity for integerPattern is received, and no bound check
   on the write index is visible on the lines shown here. */
1051 byte *pbeg,*pend,*wordstart,*aWord;
1052 register unsigned long size;
1056 pend = pbeg + patLen;
1058 while (pbeg <pend) {
1059 //parsing either a word or separator.
/* Tokenization follows the one used at construction time, except that 0 bytes
   are not treated specially in the separator loops here. */
1062 if (_Valid[*pbeg]) { //alphanumerical data
1063 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
1066 if (*pbeg != ' ') { //a separator that does not start with ' ': it is a token of its own
1067 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
1069 else { //a SPACE comes, so we have to test if next character is alphanumerical or not
1071 if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
1073 if (_Valid [*pbeg] ) {
1074 wordstart = pbeg; //So skipping 1 blank character
1075 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
1077 else { // a "separator word" ...
1078 size++; //the prev BLANK...
1079 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
1080 }//else { // a "separator word"
1081 }//else ... not a unique BLANK AT THE END.
1082 }//else ... starting by a BLANK...
1085 //The parsed word is "aWord", and its length is "size"...
1088 // Binary search on the canonical words (wordsData)
1092 register uint min,max,p;
1094 max = (wcsa->n) - 1;
1098 {//preparing for strcompL
/* The two reads give offsets p+1 and p of the bit-packed array; wcsaWord
   points to the start of the p-th word inside the zone of characters. */
1099 len = bitread (wcsa->wordsData.words, (p+1)* wcsa->wordsData.elemSize , wcsa->wordsData.elemSize);
1100 tmplen = bitread (wcsa->wordsData.words, (p )* wcsa->wordsData.elemSize , wcsa->wordsData.elemSize);
1102 wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
1105 //if(strncmp((char*)aWord, (char*)wcsa->wordsData[p].word,size) > 0) min = p+1;
/* The search continues in the upper half when the token sorts after word p. */
1106 if(strcompL(aWord, wcsaWord, size, len) > 0) min = p+1;
1110 // { //SHOW PROGRESS
1111 // fprintf(stderr,"\n Patron = <<%s>>, curposWord= %d ==><<",aWord,p);
1112 // printWord(wcsaWord,len); fprintf(stderr,">> len =%d",len);
1117 {//preparing for strcompL
1118 len = bitread (wcsa->wordsData.words, (min+1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1119 tmplen = bitread (wcsa->wordsData.words, ( min )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1121 wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
1124 // if(!strncmp((char*)aWord, (char*)wcsa->wordsData[min].word, size)) {
/* The candidate at position min is accepted only if it equals the token. */
1125 if(!strcompL(aWord, wcsaWord, size, len)) {
/* Rank plus one: the same encoding that the construction uses for SE. */
1126 integerPattern[index++] = min +1 ; //<--
1128 else {*sizeIntegers = 0; return;} // a valid word that does not appear in the source text.
1132 *sizeIntegers = index;
1134 // //shows the parsed words:
1136 // printf("\n\n >>%s>> HA SIDO PARSEADO COMO:",textPattern);
1137 // for (i=0; i<index;i++) {
1138 // printf("<<%s>>",wcsa->wordsData[integerPattern[i] -1].word);
1148 /** ------------------------------------------------------------------
1149 * Counts the occurrences of a given text pattern (word or phrase).
 * The pattern is parsed into a sequence of integer ids with
 * parseTextIntoIntegers(), and that sequence is passed to countIntIndex().
 * Returns -1 when the parser yields an empty id sequence.
 * NOTE(review): presumably the occurrence count (numocc) is returned on
 * the success path -- TODO confirm.
1150 *------------------------------------------------------------------ */
1151 int countTextOcurrences(twcsa *wcsa, byte *textPattern) {
1153 uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
1154 uint integerPatternSize, min, max; // NOTE(review): min/max look unused here -- confirm and drop
1156 uint lenpat = strlen((char*)textPattern);
1157 parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize);
1158 if (!integerPatternSize) return -1; // the parser produced no ids, so there is nothing to search for
1162 // printf("\n %d Integers to search for:",integerPatternSize );
1163 // for (i=0;i<integerPatternSize;i++) {
1164 // printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word);
1170 int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, &numocc, &left, &right); // results come back through numocc, left and right; NOTE(review): check that err is handled
1176 /** ------------------------------------------------------------------
1177 * locateTextOcurrences:
1178 * Returns the positions reported for a word/phrase and stores the number
1179 * of occurrences in *numberOccurrences.
 * The pattern is parsed into integer ids with parseTextIntoIntegers() and
 * located with locateIntIndex(); the resulting array is then handed to
 * locateFacade() and returned to the caller.
 * When the parser yields no ids, *numberOccurrences is set to -1 and NULL
 * is returned.
 * NOTE(review): the returned array is the one filled in by
 * locateIntIndex(); presumably the caller must free it -- TODO confirm.
1180 *------------------------------------------------------------------ */
1181 uint *locateTextOcurrences(twcsa *wcsa, byte *textPattern, int *numberOccurrences) {
1182 uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
1183 uint integerPatternSize, min, max; // NOTE(review): min/max look unused here -- confirm and drop
1185 uint lenpat = strlen((char*)textPattern);
1186 parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize);
1187 if (!integerPatternSize) {*numberOccurrences = -1; return NULL;} // the parser produced no ids
1191 // printf("\n %d Integers to search for:",integerPatternSize );
1192 // for (i=0;i<integerPatternSize;i++) {
1193 // printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word);
1198 ulong occurrences, left, right;
1200 ulong *sourceOffsets;
1202 //obtains the indexes in vector SE where the pattern appears.
1203 //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences);
1204 int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences); // NOTE(review): check that err is handled
1206 //sourceOffsets = (uint *) malloc (sizeof(uint)*occurrences);
1208 sourceOffsets=seOffsets; // no separate output buffer: sourceOffsets aliases seOffsets
1209 //converts, in place, the SE positions into offsets in the source text (sourceOffsets)
1210 locateFacade(wcsa, (uint *)sourceOffsets, (uint *)seOffsets,occurrences); // NOTE(review): input and output are the same buffer, viewed as uint* although declared ulong*; the return code is ignored
1213 fprintf(stderr,"\n*** %s appears in the source text in positions:\n\t",textPattern);
1214 for (i=0;i<occurrences;i++)
1215 fprintf(stderr,"[%u]",sourceOffsets[i]); // NOTE(review): %u is used for a ulong element -- mismatch if ulong is wider than unsigned int
1219 *numberOccurrences = occurrences; // ulong narrowed to int
1220 return (uint *) sourceOffsets; // NOTE(review): ulong* reinterpreted as uint* -- confirm that callers expect this layout
1224 /** ------------------------------------------------------------------
1225 * displayTextOcurrences:
1226 * Intended to show on stdout the text around the occurrences of a
1227 * word/phrase, and to return the number of occurrences.
 * Not implemented in this index: the arguments are ignored and the
 * function always returns 99.
1228 *------------------------------------------------------------------ */
1229 int displayTextOcurrences(twcsa *wcsa, byte *textPattern, uint radixDisplay) {
1230 return 99; //not implemented: function not available
1233 /** ------------------------------------------------------------------
1235 * For the given sePositions, intended to store in sourceTextPositions the
1236 * source-text positions where the valid words se[sePositions[i]] occur.
 * Not implemented for this index: the arguments are ignored, neither array
 * is written, and the function always returns 99.
1237 *------------------------------------------------------------------*/
1238 int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint number) {
1239 return 99; //not implemented: function not available for this index
1243 /** ------------------------------------------------------------------
1245 * Intended to return the substring of the source text that goes from a
1246 * starting offset to a final offset. It allocates no memory: the caller supplies "dstptr".
1247 * Precondition: offsetIni >=0;
 * Not implemented for this index: the arguments are ignored, *length and
 * dstptr are not written, and the function always returns 99.
1248 ------------------------------------------------------------------*/
1249 int displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length, byte *dstptr) {
1250 return 99; //not implemented: function not available for this index
1254 /**------------------------------------------------------------------
1255 * DISPLAYFacadeMalloc:
1256 * Intended to return the substring of the source text that goes from a
1257 * starting offset to a final offset, allocating the memory for it.
1258 * NOT CURRENTLY USED
 * Not implemented: dstptr is only initialised to NULL.
 * NOTE(review): presumably that NULL is what gets returned and *length is
 * left untouched -- TODO confirm.
1259 ------------------------------------------------------------------*/
1260 byte *displayFacadeMalloc (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length) {
1261 byte *dstptr=NULL; //not implemented: function not available
1266 /** ------------------------------------------------------------------
1267 * LOCATEALLandDISPLAY:
1268 * Intended to display the text around each occurrence of the searched word in the source text.
1269 * Assuming that $p$ is that position --> shows only chars in [p_radix-1,p_radix]
 * NOTE(review): the interval above looks garbled; presumably it means the
 * characters within "radix" positions of p -- TODO confirm.
 * Not implemented for this index: the arguments are ignored and the
 * function always returns 99.
1270 ------------------------------------------------------------------*/
1271 int locateAllAndDisplay (twcsa *wcsa, uint *sePositions, uint number, int radix) {
1272 return 99; //not implemented: function not available for this index
1277 /** ------------------------------------------------------------------
1278 * Recovers the source text word by word and writes it to the file
 * named basename+ext: while indexSE < wcsa->seSize, the word id stored at
 * that SE position is fetched with displayIntIndex(), the matching word of
 * the vocabulary is located in wordsZoneMem.zone, and its bytes are copied
 * to an output buffer of sourceTextSize bytes that is finally written out
 * with fwrite().
1279 * ------------------------------------------------------------------ */
1280 void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {
1284 char *filename = (char *) malloc (strlen( basename)+ strlen(ext)+1); // room for both parts plus the terminator; NOTE(review): result is not checked for NULL
1287 strcpy( filename, basename);
1288 strcat( filename, ext);
1289 filename[strlen( basename)+ strlen(ext)]='\0'; // redundant: strcat already terminated the string
1290 fprintf(stderr,"\n uncompressing text into file -->%s ",filename);fflush(stderr);
1294 salida = fopen( filename,"w"); // NOTE(review): result is not checked for NULL before the fwrite below
1295 start=0; end = sourceTextSize-1;
1297 cc = (byte *) malloc (sourceTextSize* sizeof(uchar)); // output buffer for the recovered text; NOTE(review): result is not checked for NULL
1300 uint i, j;//, tmplen;
1311 while ( (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
1313 int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue); // reads the value stored at position indexSE into posSEValue; NOTE(review): check that err is handled
1315 {//obtains pointer to the ith word
1317 uint ith = posSEValue -1; // presumably the ids held in SE are 1-based -- TODO confirm
1318 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); // field ith+1: used as the offset where the next word starts
1319 offtmp = bitread (wcsa->wordsData.words, ( ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); // field ith: used as the offset where this word starts
1320 tmplen -=offtmp; //the length of the ith word.
1321 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp; // first byte of the ith word inside the words zone
1329 prevValid =1; //for the next iteration
1335 for (j=0;j<tmplen;j++) {*dst++ = *src++;} //copies word to the output buffer
1339 fprintf(stderr,"\n sourceTextSize = %d, len = %d",sourceTextSize,leng); // NOTE(review): sourceTextSize is a uint printed with %d
1340 fwrite(cc,sizeof(byte),leng,salida); // dumps the leng recovered bytes to the output file
1350 //recovers the source text by calling extractWords.
// Builds the output file name as basename+ext, asks extractWords() for the
// text with start = 0 and end = wcsa->seSize, and writes the buffer it
// returns (cc, of "length" bytes) to that file.
// If extractWords() reports an error, a message is printed to stderr and
// the process exits.
1351 void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {
1353 int start;int end; int error;
1354 char *filename = (char *) malloc (strlen( basename)+ strlen(ext)+1); // room for both parts plus the terminator; NOTE(review): result is not checked for NULL
1358 strcpy( filename, basename);
1359 strcat( filename, ext);
1360 filename[strlen( basename)+ strlen(ext)]='\0'; // redundant: strcat already terminated the string
1361 fprintf(stderr,"\n uncompressing text into file -->%s ",filename);fflush(stderr);
1365 salida = fopen( filename,"w"); // NOTE(review): result is not checked for NULL before the fwrite below
1366 start=0; end = wcsa->seSize; // the uint seSize is stored in an int
1368 error = extractWords((void *) wcsa, start, end, &cc, &length); // cc and length are filled in by extractWords
1369 if (error) {fprintf(stderr,"\n error during recoverSourceText2"); exit(0);} // NOTE(review): exits with status 0 even though this is a failure path
1371 fprintf(stderr,"\n sourceTextSize = %d, len = %ld",sourceTextSize,length); // NOTE(review): sourceTextSize is a uint printed with %d
1372 fwrite(cc,sizeof(byte),length,salida);
1379 /** *******************************************************************************
1380 * Showing some statistics and info of the index
 *
 * printInfoReduced: placeholder with no statements; the wcsa argument is
 * not used.
1381 * *******************************************************************************/
1382 void printInfoReduced(twcsa *wcsa) {
1383 //not implemented: function not available
1386 /* Shows summary info of the index.
 * Prints to stdout a summary of the presentation layer, whose size is
 * computed as the total reported by index_size() minus the size reported
 * by sizeIntIndex(), and then calls printInfoIntIndex() for the integer
 * index. A non-zero code from any of those three calls is returned at once.
 * NOTE(review): presumably returns 0 on success -- TODO confirm. */
1387 int printInfo(void *index) {
1390 twcsa *wcsa = (twcsa *) index;
1392 unsigned long indexSize;
1393 uint intIndexSize, presentationSize;
1396 err = index_size(index, &indexSize);
1397 if (err!=0) return err;
1398 err = sizeIntIndex(wcsa->myicsa, &intIndexSize);
1399 if (err!=0) return err;
1401 presentationSize = indexSize - intIndexSize; // whatever is not the integer index is counted as presentation layer
1403 printf("\n ===================================================:");
1404 printf("\n Summary of Presentation layer:");
1405 printf("\n Number of valid words (SEsize) = %u",wcsa->seSize);
1406 printf("\n Number of different words = %ld",wcsa->n); // NOTE(review): confirm that the type of wcsa->n matches %ld
1407 printf("\n WCSA structure = %lu bytes", sizeof(twcsa)); // NOTE(review): sizeof yields size_t, for which %zu is the matching specifier
1409 uint totalpointers = ((((wcsa->n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)); // (n+1) fields of elemSize each, rounded up to whole units of W, times sizeof(uint); assumes W is the number of bits per uint -- TODO confirm
1410 uint totalasciizone = wcsa->wordsData.wordsZoneMem.size * sizeof(byte) ;
1411 uint totalwords = totalasciizone + totalpointers;
1413 printf("\n Size Of words structure (%d bytes):",totalwords); // NOTE(review): uint printed with %d
1414 printf("\n [ pointers = %d bytes || AsciiZone = %d bytes", totalpointers, totalasciizone); // NOTE(review): uint printed with %d; the opening '[' is never closed in the output
1416 printf("\n\n Total = ** %u bytes (in RAM) **",presentationSize);
1417 //printf("\n\n @@ Summary of self-index on Integers:");
1418 err = printInfoIntIndex(wcsa->myicsa, " ");
1419 if (err!=0) return err;
1421 printf("\n ===================================================:");
1426 /**------------------------------------------------------------------
1428 * Intended to count the amount of memory needed by the Facade (Presentation Layer),
1429 * skipping the stop_words hash table.
 * Not implemented for this index: the argument is ignored and the
 * function always returns 0.
1430 ----------------------------------------------------------------- */
1431 uint structsSizeMem(twcsa *wcsa) {
1432 return 0; //not implemented: function not available for this index.
1436 /** for debugging: writes characters of str to stderr, one at a time.
 * NOTE(review): presumably exactly len characters are printed, so str does
 * not need to be NUL-terminated -- TODO confirm against the loop bounds. **/
1437 void printWord(uchar *str, uint len) {
1440 fprintf(stderr,"%c",str[i]);
1444 /** saves the content of the file SE (ids of the source words):
 * writes the n uints of v to the file "<basename>.<SE_FILE_EXT>", after
 * removing any previous file with that name.
 * NOTE(review): presumably returns 0 on success -- TODO confirm. **/
1445 int saveSEfile (char *basename, uint *v, uint n) {
1446 char outfilename[255]; // NOTE(review): fixed size, and the sprintf below is unbounded, so a long basename would overflow it
1448 sprintf(outfilename,"%s.%s",basename,SE_FILE_EXT);
1449 unlink(outfilename); // removes any previous copy before the file is created again
1450 if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
1451 printf("Cannot open file %s\n", outfilename);
1455 write(file, v, sizeof(uint) * n ); // NOTE(review): the result of write() is ignored, so a short or failed write goes unnoticed
/** Returns the CPU time, in seconds, that getrusage(RUSAGE_SELF) reports
 * for the calling process: user time plus system time.
 * NOTE(review): the result of getrusage() is not checked. */
1461 double getTime2 (void)
1463 double usertime, systime;
1464 struct rusage usage;
1466 getrusage (RUSAGE_SELF, &usage);
1468 usertime = (double) usage.ru_utime.tv_sec +
1469 (double) usage.ru_utime.tv_usec / 1000000.0; // microseconds converted to seconds
1470 systime = (double) usage.ru_stime.tv_sec +
1471 (double) usage.ru_stime.tv_usec / 1000000.0; // microseconds converted to seconds
1473 return (usertime + systime);
1478 /**------------------------------------------------------------------
1480 *------------------------------------------------------------------ */
1481 #ifdef FACADEWITHMAIN
1482 int main(int argc, char* argv[])
1487 char *infile, *outbasename, *stopwordsfile; // Name of in/out files
1495 printf("\n*Word-based iCSA: A word-based CSA");
1496 printf("\n*CopyRight (c) 2008 [LBD & G.N.]\n\n");
1498 // Reads input parameters from command line.
1500 printf("Use: %s <in file> <out basename> \n", argv[0]);
1504 // Reads params (input file, output basename, and stopwords file)
1506 outbasename = argv[2];
1507 stopwordsfile = argv[3];
1509 finsize= fileSize(infile);
1512 printf( "\nFILE EMPTY OR FILE NOT FOUND %s !!\nSkipping processement ...\n",infile);
1516 // Opening the input text file.
1517 if( (f_in = open(infile, O_RDONLY)) < 0) {
1518 printf("Cannot read file %s\n", infile);
1521 inputBuffer = (byte *) malloc(finsize *sizeof(byte));// +1);
1522 read (f_in,inputBuffer,finsize);
1527 //printf("\n parametros <<%s>>\n\n",stopwordsfile);
1528 build_index (inputBuffer, finsize, stopwordsfile, &Index); /** building the index */
1530 // /** recovering the source text from the index */
1535 get_length(Index, &size);
1536 char extension[10]= ".source";
1538 //recoverSourceText1((twcsa*) Index, outbasename,extension, size);
1539 strcat(extension,"2");
1540 recoverSourceText2((twcsa*) Index, outbasename,extension,size);
1542 fprintf(stderr, "\nRecovering source file time: %.3f secs\n", end-start );
1545 // DISPLAYING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
1546 {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1548 ulong numocc,numc, length, i, *snippet_len, tot_numcharext = 0, numpatt;
1549 uchar *pattern, *snippet_text;
1551 pattern = textPattern;
1552 printf("\nSEARCH TEST for DISPLAY (pizzachili interface)\n");
1554 printf("Intro string: ");
1555 fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1556 if (!strcmp((char*)textPattern,"\n") ) break;
1557 textPattern[strlen((char*)textPattern)-1] = '\0';
1559 length = strlen( (char*)textPattern);
1562 // error = display (Index, textPattern, length, numc, &numocc,
1563 // &snippet_text, &snippet_len);
1564 error = displayWords (Index, textPattern, length, numc, &numocc,
1565 &snippet_text, &snippet_len,1);
1567 if (error){ fprintf(stderr, "%s\n", "Hubo un error durante display");exit(0);}
1569 fprintf(stderr,"\n acabou display");fflush(stderr);
1571 ulong j, len = length + 2*numc;
1573 fprintf(stderr,"\n length = %d",length);
1574 fprintf(stderr,"\n pattern = %s",pattern);fflush(stderr);
1575 fprintf(stderr,"\n numocc = %d",numocc);fflush(stderr);
1576 fprintf(stderr,"\n snippet len = %d",len);fflush(stderr);
1577 fprintf(stderr,"\n =========");fflush(stderr);
1578 for (i = 0; i < numocc; i++){
1579 fprintf(stderr,"\n[%2d][len=%3d]<<",i+1,snippet_len[i]);fflush(stderr);
1580 fwrite(snippet_text+len*i,sizeof(uchar),snippet_len[i],stderr);fflush(stderr);
1581 fprintf(stderr,">>");fflush(stderr);
1586 for(i=0; i<numocc; i++) {
1587 tot_numcharext += snippet_len[i];
1592 free (snippet_text);
1595 printf("Ocurrences = %d\n", numocc);
1596 if (!strcmp((char*)textPattern,"\n") ) break;
1602 // // SEARCHING FOR A TEXT PATTERN (word/phrase).
1603 // {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1608 // printf("\nSEARCH TEST for LOCATE\n");
1610 // printf("Intro string: ");
1611 // fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1612 // len = strlen((char*)textPattern);
1613 // if (!strcmp((char*)textPattern,"\n") ) break;
1614 // textPattern[len-1] = '\0';
1617 // //occs = locateTextOcurrences(wcsa,textPattern,&occ);
1618 // // locate(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ);
1619 // locateWord(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ, 0);
1621 // printf("\n*** %s occurs %d times: In the source text in positions:\n\t",textPattern,occ);
1622 // for (i=0;i<occ;i++)
1623 // printf("[%u]",occs[i]);
1627 // if (!strcmp((char*)textPattern,"\n") ) break;
1633 // COUNTING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
1635 {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1638 printf("\nSEARCH TEST for COUNT.\n");
1640 printf("Intro string: ");
1641 fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1642 len = strlen((char*)textPattern);
1643 if (!strcmp((char*)textPattern,"\n") ) break;
1644 textPattern[len-1] = '\0';
1647 count(Index, textPattern, len, (ulong *)&occ);
1648 //occ = countTextOcurrences(wcsa,textPattern);
1649 printf("Ocurrences = %d\n", occ);
1652 printf("\n END COUNTING OCCURRENCES OF PATTERNS. ...\n");
1656 /** saving the index to disk*/
1657 save_index (Index, outbasename);
1659 /** tells the mem used by the index */
1661 index_size(Index, &indexsize);
1662 fprintf(stderr,"Index occupied %d bytes, 2 extra mallocs = %d",indexsize,2* sizeof(uint));
1664 /** freeing the index */