1 #include "buildFacade.h"
2 #include "utils/errors.c"
5 /** Building the index */
7 /* Creates index from text[0..length-1]. Note that the index is an
8 opaque data type. Any build option must be passed in string
9 build_options, whose syntax depends on the index. The index must
10 always work with some default parameters if build_options is NULL.
11 The returned index is ready to be queried. */
12 int build_index (uchar *text, ulong length, char *build_options, void **index) {
// NOTE(review): the message is written to stdout but stderr is the stream flushed, and
// build_options is formatted with %s although the contract allows it to be NULL -- confirm.
15 printf("\n parameters: \"%s\"\n",build_options); fflush(stderr);
// Stage 1: build the word-level WCSA structures from the text (see build_WCSA).
17 returnvalue = build_WCSA (text, length, build_options, index);
// Stage 2: build the integer self-index (bottom layer) for the same index object (see build_iCSA).
20 returnvalue = build_iCSA (build_options,*index);
26 /** Saves index on disk by using single or multiple files, having
28 int save_index (void *index, char *filename) {
// filename is used as a base name: each file written below is named basename + "." + an extension.
30 char *basename = filename;
31 twcsa *wcsa=(twcsa *) index;
38 printf("\n Saving structures to disk: %s.*",filename);
// Room for basename + "." + extension + NUL: the +10 leaves 8 characters for the extension.
// NOTE(review): confirm every *_FILE_EXT used below fits in that space.
39 outfilename = (char *)malloc(sizeof(char)*(strlen(basename)+10));
41 /** File with some constants (sourceTextSize and seSize). */
43 strcpy(outfilename, basename);
44 strcat(outfilename, ".");
45 strcat(outfilename, CONSTANTS_FILE_EXT);
// NOTE(review): opened without O_TRUNC, so a longer pre-existing file would keep its old tail -- confirm.
47 if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
48 printf("Cannot open file %s\n", outfilename);
// NOTE(review): the results of the write() calls in this function are discarded.
51 write(file, &(wcsa->sourceTextSize), sizeof(uint));
52 write(file, &(wcsa->seSize), sizeof(uint));
56 /** The Words in the vocabulary of words (sorted alphabetically)*/
57 { strcpy(outfilename, basename);
58 strcat(outfilename, ".");
59 strcat(outfilename, VOCABULARY_WORDS_FILE_EXT);
// NOTE(review): same O_CREAT-without-O_TRUNC pattern as above.
61 if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
62 printf("Cannot open file %s\n", outfilename);
67 uint elemSize = wcsa->wordsData.elemSize;
// Header of the vocabulary file: n, the bits per offset entry (elemSize), and the byte size of the words zone.
68 write(file, &n, sizeof(uint));
69 write(file, &elemSize, sizeof(uint));
70 write(file, &(wcsa->wordsData.wordsZoneMem.size), sizeof(uint));
72 //the characters of all the words, followed by the bit-packed array holding n+1 offsets
73 write(file, (char *)wcsa->wordsData.wordsZoneMem.zone, wcsa->wordsData.wordsZoneMem.size * sizeof(byte));
74 write(file, (char *)wcsa->wordsData.words, ((((n+1)* (elemSize))+W-1) /W) * (sizeof(uint)) );
82 /******** saves index on integers (bottom) ******/
84 //storeStructsCSA(wcsa->myicsa,basename);
85 saveIntIndex((void *) wcsa->myicsa, basename);
// The SE array is saved with seSize+1 entries (one more than seSize).
89 saveSEfile(basename,wcsa->se, wcsa->seSize+1);
98 /** Loads index from one or more file(s) named filename, possibly
99 adding the proper extensions. */
100 int load_index(char *filename, void **index){
// Delegates the actual loading to loadWCSA and hands the resulting twcsa back
// through the opaque index pointer.
102 wcsa = loadWCSA (filename);
103 (*index) = (void *) wcsa;
107 /** Frees the memory occupied by index. */
108 int free_index(void *index){
109 twcsa *wcsa=(twcsa *) index;
// The size is queried before anything is released; it is only used for the message below.
111 index_size(index,&size);
112 printf("\n[destroying index] ...Freed %lu bytes... RAM", size);
115 //frees the array SE.
121 //destroyStructsCSA(wcsa->myicsa);
// Releases the bottom-layer integer index.
122 int err = freeIntIndex((void *) wcsa->myicsa);
// Releases the vocabulary: the characters of the words and the bit-packed offsets array.
126 free (wcsa->wordsData.wordsZoneMem.zone);
127 free (wcsa->wordsData.words); /** huge!! */
129 //the pointer to wcsa.
134 /** Gives the memory occupied by index in bytes. */
135 int index_size(void *index, ulong *size) {
137 twcsa *wcsa=(twcsa *)index;
// The twcsa structure itself.
140 *size += sizeof(twcsa);
// The vocabulary: the bit-packed offsets array (same size formula used when it is saved and loaded)
// plus the bytes of the words zone.
143 totaltmp += ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)); //the pointers
144 totaltmp += wcsa->wordsData.wordsZoneMem.size * sizeof(byte); //the characters of the words.
// The size of the bottom-layer integer index is obtained from sizeIntIndex.
149 int err = sizeIntIndex((void *) wcsa->myicsa, &nbytes);
151 //*size += CSA_size(wcsa->myicsa);
158 /** Querying the index =============================================================*/
160 /* Writes in numocc the number of occurrences of the substring
161 pattern[0..length-1] found in the text indexed by index. */
162 int count (void *index, uchar *pattern, ulong length, ulong *numocc){
163 uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
164 uint integerPatternSize;
167 twcsa *wcsa=(twcsa *) index;
// Translate the text pattern into the integer ids of its words. A resulting size of 0
// (set by parseTextIntoIntegers when a word is not in the vocabulary) is reported as 0 occurrences.
168 parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);
169 if (!integerPatternSize) {*numocc=0; return 0;} //not found
171 //*numocc = countCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &l, &r);
// The integer sequence is counted in the bottom-layer index; the count is written straight into numocc.
172 int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, numocc, &l, &r);
176 /* Writes in numocc the number of occurrences of the substring
177 pattern[0..length-1] in the text indexed by index. It also allocates
178 occ (which must be freed by the caller) and writes the locations of
179 the numocc occurrences in occ, in arbitrary order. */
180 int locate(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc){
184 /* Gives the length of the text indexed */
185 int get_length(void *index, ulong *length) {
186 twcsa *wcsa=(twcsa *) index;
// sourceTextSize is set from the text length in build_WCSA, or read back from the
// constants file in loadStructs.
187 *length = wcsa->sourceTextSize;
191 /** Obtains the length of the text indexed by index. */
193 int length (void *index, ulong *length) {
// Thin alias: forwards to get_length and returns its result unchanged.
194 return (get_length(index,length));
198 /** ***********************************************************************************
199 * Accessing the indexed text
200 * ***********************************************************************************/
203 /** Allocates snippet (which must be freed by the caller) and writes
204 the substring text[from..to] into it. Returns in snippet_length the
205 length of the text snippet actually extracted (that could be less
206 than to-from+1 if to is larger than the text size). */
207 int extract (void *index, ulong from, ulong to, uchar **snippet, ulong *snippet_length) {
208 twcsa *wcsa=(twcsa *) index;
212 /** Displays the text (snippet) surrounding any occurrence of the
213 substring pattern[0..length-1] within the text indexed by index.
214 The snippet must include numc characters before and after the
215 pattern occurrence, totalizing length+2*numc characters, or less if
216 the text boundaries are reached. Writes in numocc the number of
217 occurrences, and allocates the arrays snippet_text and
218 snippet_lengths (which must be freed by the caller). The first is a
219 character array of numocc*(length+2*numc) characters, with a new
220 snippet starting at every multiple of length+2*numc. The second
221 gives the real length of each of the numocc snippets. */
223 int display (void *index, uchar *pattern, ulong length, ulong numc,
224 ulong *numocc, uchar **snippet_text, ulong **snippet_lengths) {
230 /** ***********************************************************************************
231 * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord
232 * ***********************************************************************************/
233 /* Writes in numocc the number of occurrences of the substring
234 pattern[0..length-1] in the text indexed by index. It also allocates
235 occ (which must be freed by the caller) and writes the locations of
236 the numocc occurrences in occ, in arbitrary order. These occurrences
237 refer to the offsets in TOH where the caller could start a display
238 operation. So locateWord implies synchronization using B.
239 Moreover, positions occ[numocc.. 2*numocc-1] is set with the rank in SE of the
240 words whose codes begin in TOH in the positions in occ[0... numocc-1]
241 ** Parameter kbefore sets locateWord not to obtain the offset in TOH of the
242 searched word, but the offset in TOH of k-before words before.
245 int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore){
246 uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
247 uint integerPatternSize;
248 ulong occurrences,l,r;
249 twcsa *wcsa=(twcsa *) index;
// Translate the text pattern into the integer ids of its words.
251 parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);
// NOTE(review): on this early return *occ is left untouched, whereas the zero-occurrences
// path further down sets it to NULL -- confirm callers do not read *occ in this case.
252 if (!integerPatternSize) {*numocc=0; return 0;} //not found
256 //obtains the indexes in vector SE where the pattern appears.
257 //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences);
258 int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences);
260 *numocc = occurrences;
262 if (!occurrences) {(*occ)=NULL;return 0;}
// The array produced by locateIntIndex is handed to the caller as-is (only cast).
264 (*occ) = (ulong *)seOffsets;
269 /** Displays the text (snippet) surrounding any occurrence of the
270 substring pattern[0..length-1] within the text indexed by index.
271 The snippet must include numc characters before and after the
272 pattern occurrence, totalizing length+2*numc characters, or less if
273 the text boundaries are reached. Writes in numocc the number of
274 occurrences, and allocates the arrays snippet_text and
275 snippet_lengths (which must be freed by the caller). The first is a
276 character array of numocc*(length+2*numc) characters, with a new
277 snippet starting at every multiple of length+2*numc. The second
278 gives the real length of each of the numocc snippets. */
280 int displayWords (void *index, uchar *pattern, ulong length, ulong numc,
281 ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore) {
283 /** actually extracts up to length + 2*numc chars, starting extraction kbefore
284 * words before the occurrence **/
288 uint bytesPerSnippet;
290 twcsa *wcsa=(twcsa *) index;
// Positions in SE of the occurrences. kbefore is not forwarded here (0 is passed);
// it is applied below, when indexSE is moved back.
292 locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);
293 (*numocc) = occurrences;
297 *snippet_lengths =NULL;
// Every snippet gets a fixed-size slot of length+2*numc bytes in the output buffer.
301 bytesPerSnippet = length+2*numc;
302 // bytesPerSnippet = 2*numc;
303 *snippet_lengths = (ulong *) malloc((*numocc)*sizeof(ulong));
304 if (!(*snippet_lengths)) return 1;
305 *snippet_text = (uchar *) malloc((*numocc)*(bytesPerSnippet)*sizeof(uchar) +1) ; //(the last "1" is for '\0');
// NOTE(review): this return does not release *snippet_lengths (allocated just above),
// and neither early return releases indexesInSE -- confirm whether that is acceptable.
306 if (!(*snippet_text)) return 1;
308 // fprintf(stderr,"\n occs found = %7d for pattern %s",*numocc, pattern);
311 text_aux=*snippet_text;
317 uint posSEValue,indexSE;
319 for (i=0;i<occurrences;i++) {
323 /** decodes words from there */
325 indexSE = indexesInSE[i];
// Move back kbefore words, clamping at position 0 of SE.
326 indexSE = (indexSE > kbefore) ? indexSE-kbefore : 0;
329 while ((!endSnippet) && (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
331 //posSEValue =displayCSA(wcsa->myicsa,indexSE);
332 int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
334 {//obtains pointer to the ith word
// The values stored in SE are vocabulary positions plus 1 (see build_WCSA), hence the -1.
336 uint ith = posSEValue -1; // !!
// Word ith spans the bytes between offset(ith) and offset(ith+1) of the words zone.
337 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
338 offtmp = bitread (wcsa->wordsData.words, ( ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
339 tmplen -=offtmp; //the length of the ith word.
341 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
348 if (snippetLen==bytesPerSnippet) break; //end of snippet (ends in BLANK_SPACE)
350 prevValid =1; //for the next iteration
// The last word is cut so that the snippet never exceeds bytesPerSnippet bytes.
357 if ((tmplen+snippetLen)>=bytesPerSnippet) {
358 tmplen =(bytesPerSnippet - snippetLen);
359 endSnippet=1; //so while loop ends;
362 for (j=0;j<tmplen;j++) {*dst++ = *src++;} //copies word to the output buffer
// Advance to the slot of the next snippet and record the real length of this one.
366 text_aux += bytesPerSnippet;
367 (*snippet_lengths)[i] = snippetLen;
370 if (occurrences) free(indexesInSE);
375 /** Simulates the text-extraction process without returning the extracted text.
376 Extracts up to 2K words, starting K=wordsbefore words before each occurrence of a pattern.
377 Fewer than 2K words can be extracted if the maxnumc character limit is reached first.
378 Does nothing else: the text itself is not returned. */
380 int displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc) {
386 twcsa *wcsa=(twcsa *) index;
// Positions in SE of the occurrences (0 is passed as kbefore; wordsbefore is applied below).
388 locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);
394 ulong maxsnippetLen = maxnumc;
395 ulong extractedbytes = 0;
// Scratch buffer holding one snippet at a time; it is released before returning.
397 text_aux = (byte *) malloc (maxsnippetLen+1);
404 uint posSEValue,indexSE;
// At most 2*wordsbefore words are decoded per occurrence.
406 uint numWordsToExtract = 2 * wordsbefore;
408 //printf("\n occurrences... = %lu",occurrences);
410 for (i=0;i<occurrences;i++) {
414 /** decodes words from there */
416 indexSE = indexesInSE[i];
// Move back wordsbefore words, clamping at position 0 of SE.
417 indexSE = (indexSE > wordsbefore) ? indexSE-wordsbefore : 0;
421 while ((z<numWordsToExtract) && (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
423 //posSEValue =displayCSA(wcsa->myicsa,indexSE);
424 int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
426 {//obtains pointer to the ith word
// The values stored in SE are vocabulary positions plus 1 (see build_WCSA), hence the -1.
428 uint ith = posSEValue -1; // !!
// Word ith spans the bytes between offset(ith) and offset(ith+1) of the words zone.
429 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
430 offtmp = bitread (wcsa->wordsData.words, ( ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
431 tmplen -=offtmp; //the length of the ith word.
433 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
440 if (snippetLen==maxsnippetLen) break; //end of snippet (ends in BLANK_SPACE)
442 prevValid =1; //for the next iteration
449 if ((tmplen+snippetLen)>=maxsnippetLen) {
453 //fprintf(stderr,"\ntmplen = %d ",tmplen); fflush(stderr);
454 for (j=0;j<tmplen;j++) {*dst++ = *src++;} //copies word to the output buffer
// Accumulates the number of bytes decoded over all occurrences.
459 extractedbytes += snippetLen;
463 if (occurrences) free(indexesInSE);
465 if (text_aux) free (text_aux);
// NOTE(review): the ulong total is returned through the int return type of this function.
466 return extractedbytes;
471 /** Allocates text (which must be freed by the caller) and recovers
472 the substring of text starting from the "fromWord"-th word up to the
473 "toWord"-th word. Returns the text in "text", and in "text_lenght" the
474 length of the text actually extracted. Text is allocated.
475 Actually extracts SE[fromWord .. toWord) ... not the last word. */
477 int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text,
480 twcsa *wcsa=(twcsa *) index;
481 uint initTextLen=10000;
484 uint i, j;//, tmplen;
486 byte *src, *dst, *buff;
489 uint buffBytes = 1000;
490 uint leng=0; //curr pos in buffer that was occupied.
// Clamp the requested range to the words available in SE.
// NOTE(review): assuming ulong is unsigned, (toWord-fromWord) below wraps around if
// fromWord > toWord after clamping -- confirm callers guarantee fromWord <= toWord.
492 if (toWord > wcsa->seSize) toWord = wcsa->seSize;
493 if (fromWord >= wcsa->seSize) fromWord = wcsa->seSize-1;
// Initial buffer size: at least 1000 bytes, or an estimate based on avgWordLen bytes per word.
494 if (buffBytes < ( (toWord-fromWord)* avgWordLen)) buffBytes = ((toWord-fromWord)* avgWordLen);
496 buff = (uchar *) malloc (buffBytes * sizeof(char));
497 if (!buff) return 1; //out of memory.
500 register uint indexSE=fromWord;
504 while ( (indexSE < toWord) ){ /** extracting words (if not at the end) */
506 int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
508 {//obtains pointer to the ith word
// The values stored in SE are vocabulary positions plus 1 (see build_WCSA), hence the -1.
510 ith= posSEValue -1; // !!
// Word ith spans the bytes between offset(ith) and offset(ith+1) of the words zone.
511 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
512 offtmp = bitread (wcsa->wordsData.words, (ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
513 tmplen -=offtmp; //the length of the ith word.
514 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
// Grow the buffer when the next word (plus one extra byte) would not fit.
517 if ( buffBytes < (leng + tmplen+1) ) {
// NOTE(review): the result of realloc overwrites buff directly, so the previous
// buffer can no longer be released if realloc fails.
519 buff = (uchar*) realloc(buff, buffBytes);
520 if (!buff) return 1; //out of memory.
529 prevValid =1; //for the next iteration
535 for (j=0;j<tmplen;j++) {*dst++ = *src++;} //copies word to the output buffer
547 /** ***********************************************************************************
548 CONSTRUCTION OF THE INDEX WCSA
549 ***********************************************************************************/
551 /**------------------------------------------------------------------
552 Compares two slots (alphanumerically). For qsort of canonical words
553 ------------------------------------------------------------------ */
554 int qSortWordsCompareAlpha(const void *arg1, const void *arg2) {
// qsort comparator: both arguments are treated as tposInHT entries and ordered by
// strcmp on their word fields, so the words are expected to be NUL-terminated.
555 tposInHT *a1 = (tposInHT *) arg1;
556 tposInHT *a2 = (tposInHT *) arg2;
557 return strcmp((char*)a1->word, (char *)a2->word);
561 * BUILDS THE WCSA INDEX
564 int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
// Two passes over the text: the first collects the distinct words in a hash table, the
// second writes into SE, for every parsed word, its rank in the sorted vocabulary plus 1.
566 unsigned long zeroNode; //number of different canonical words.
568 t_hash hash; // the hash table to store both variants and canonical words.
569 tposInHT *posInHT; // structure for canonicals and variants+huffmans
573 uint seSize=0; //its size == "numberOfValidWords".
574 uint *SE; //Integers vector. (represents the rank of the valid words in the source text).
576 uint totallenWords=0; //The numberOfBytes that occupy canonical words (their ascii version) in memory
579 ulong bytesFile,bytesFileReal;
582 /* used during first pass */
// inputBuffer aliases the text supplied by the caller (no copy is made here).
586 byte* inputBuffer = text;
587 bytesFileReal= bytesFile = length;
589 sourceTextSize=length;
591 /** Initializes WCSA structure*/
593 wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
597 //Estimation (using Heaps' law) of the number of different "meaningful" words.
598 //sizeNValue=N_value;
// Sizes below 5000000 bytes are raised to that value before the estimate is computed;
// bytesFileReal keeps the real length for the parsing loops.
599 if(bytesFile<5000000) bytesFile = 5000000;
600 sizeNValue = (unsigned long) floor(3.9* pow(bytesFile,0.60) );
603 // Initializes the arrays used to detect if a char is valid or not.
605 // Initializes the arrays used to translate a char into lowercase.
609 // **********************************************************************************
610 //STARTING THE FIRST PASS.
611 // **********************************************************************************
612 printf("\nSTARTING THE FIRST PASS...");
// NOTE(review): posInHT holds sizeNValue entries (the estimate above) -- confirm that the
// number of distinct words cannot exceed it.
614 posInHT = (tposInHT *) malloc(sizeof(tposInHT) * sizeNValue);
615 hash = initialize_hash (sizeNValue); //hash to contain both the parsed words
617 //-----------------------------------------------------------------
618 //1st pass (processing the file)
620 byte *pbeg,*pend,*wordstart,*aWord;
625 pend = inputBuffer+bytesFileReal;
631 fprintf(stderr, "buildFacade.c: assert failed, *pbeg == 0\n");
635 //parsing either a word or separator.
638 if (_Valid[*pbeg]) { //alphanumerical data
639 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
643 if (*pbeg != ' ') { //a separator that does not start with ' ', so it is a new word
644 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
646 else { //a SPACE comes, so we have to test if next character is alphanumerical or not
648 if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
650 if (_Valid [*pbeg] ) {
651 wordstart = pbeg; //So skipping 1 blank character
652 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
654 else { // a "separator word" ...
655 size++; //the prev BLANK...
656 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
657 }//else { // a "separator word"
658 }//else ... not a unique BLANK AT THE END.
659 }//else ... starting by a BLANK...
662 if (pbeg < pend && *pbeg == 0)
663 pbeg ++; // Skip the 0-bytes
667 fprintf(stderr, "buildFacade.c: assert failed, size == 0\n");
671 //The parsed word/separator starts at "wordstart", and its length is "size"...
674 //Processing done for each word
675 i = inHashTable(hash,aWord, size, &addrInTH );
// A word inserted here is recorded in posInHT, and the hash slot remembers its position.
677 insertElement (hash,aWord, size, &addrInTH);
678 posInHT[zeroNode].slot=addrInTH;
679 posInHT[zeroNode].word=hash->hash[addrInTH].word;
680 hash->hash[addrInTH].posInVoc = zeroNode;
682 totallenWords += size +1; // +1 due to the '\0' char...
683 //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size);
689 fprintf(stderr,"\n1st pass ends: TOTAL distinct words: %lu totalWords = %u",zeroNode,seSize);
694 // **********************************************************************************
696 // **********************************************************************************
698 // Sorting the words alphanumerically (over posInHT)
699 { register unsigned long i,j;
700 //sorting canonical words ...
701 qsort(posInHT, zeroNode, sizeof(tposInHT), qSortWordsCompareAlpha);
703 //setting in hash the new positions of the words in the hash table
704 for (i=0;i<zeroNode;i++) {
705 hash->hash[posInHT[i].slot].posInVoc = i;
709 // INITIALIZING structures for the 2nd pass ......................................
// One entry more than seSize: the extra entry receives the final 0 written after the second pass.
711 SE = (uint *) malloc ((seSize+1)*sizeof (uint));
715 // **********************************************************************************
716 // STARTING THE SECOND PASS.
717 // **********************************************************************************/
719 printf("\nSTARTING THE SECOND PASS... ");
720 //2nd pass (processing the file)
722 byte *pbeg,*pend,*wordstart,*aWord;
725 register ulong countValidWords = 0;
729 pend = inputBuffer+bytesFileReal;
734 fprintf(stderr, "buildFacade.c 2nd pass: assert failed, *pbeg == 0\n");
738 //parsing either a word or separator.
741 if (_Valid[*pbeg]) { //alphanumerical data
742 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
745 if (*pbeg != ' ') { //a separator that does not start with ' ', so it is a new word
746 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
748 else { //a SPACE comes, so we have to test if next character is alphanumerical or not
750 if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
752 if (_Valid [*pbeg] ) {
753 wordstart = pbeg; //So skipping 1 blank character
754 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
756 else { // a "separator word" ...
757 size++; //the prev BLANK...
758 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
759 }//else { // a "separator word"
760 }//else ... not a unique BLANK AT THE END.
761 }//else ... starting by a BLANK...
764 if (pbeg < pend && *pbeg == 0)
765 pbeg ++; // Skip the 0-bytes
769 fprintf(stderr, "buildFacade.c 2nd pass: assert failed, size == 0\n");
773 //The parsed word/separator starts at "wordstart", and its length is "size"...
776 //Processing done for each word
777 i = inHashTable(hash,aWord, size, &addrInTH );
// Stored values are vocabulary positions plus 1, so that 0 never denotes a word;
// 0 is the value written after the last word below.
779 SE[countValidWords]=hash->hash[addrInTH].posInVoc+1; // !!!!
784 SE[countValidWords] = 0;
785 fprintf(stderr,"\n2nd pass ends: TOTAL distinct words: %lu totalWords = %lu",zeroNode,countValidWords);
789 // **********************************************************************************
791 // **********************************************************************************
793 //freeing the source text (it is no longer needed).
// NOTE(review): this is C++ delete[] applied to the buffer supplied by the caller
// (inputBuffer aliases text) -- confirm the caller allocated it with new[] and does not reuse it.
794 delete [] inputBuffer; //the text
796 /** Now Setting the data of the index **/
798 wcsa->sourceTextSize = sourceTextSize;
799 wcsa->seSize = seSize;
801 // Creating the words of the vocabulary...
803 /** copying the words into WCSA. */
804 uint *tmpOffsets = (uint *) malloc (sizeof(uint) * (zeroNode +1) ); //1 extra uint (to point to the virtual "zeroNode+1" ^th word.
810 //Moving data from posInHT to WCSA structure
811 //wcsa->wordsData = (twords *) malloc(sizeof(twords) * zeroNode);
// totallenWords counted size+1 bytes per distinct word, so subtracting zeroNode removes the terminators.
812 wcsa->wordsData.wordsZoneMem.size = totallenWords - zeroNode; //without '\0' bytes (end-tag).
813 wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc ( wcsa->wordsData.wordsZoneMem.size * sizeof(byte) );
814 zoneMem = wcsa->wordsData.wordsZoneMem.zone;
// Words are concatenated in sorted order without terminators; tmpOffsets[i] records where word i starts.
815 for(i = 0; i < zeroNode; i++) {
816 src = posInHT[i].word; //copying the canonical word
817 //wcsa->wordsData.words[i].word = zoneMem; //setting the pointer
818 tmpOffsets[i]=tmpOffset; //offset in zoneMem
819 while (*src) {*zoneMem++ = *src++; tmpOffset++;} //moving data until '\0'
820 //*zoneMem='\0'; zoneMem++; //copies also the '\0'
823 tmpOffsets[zeroNode]=tmpOffset; //setting pointer to the "virtual" word {zeroNode+1}^{th}
825 //kbit encoding of the offsets
// elemSize is derived from the largest offset; presumably bits() gives the bits needed to represent it.
826 uint elemSize = bits(tmpOffset);
827 wcsa->wordsData.elemSize = elemSize;
828 wcsa->wordsData.words = (uint *) malloc (((((zeroNode +1)*elemSize)+W-1) /W) * sizeof(uint)); //with 1 extra slot !.
// The last uint of the packed array is cleared before the offsets are written into it.
829 wcsa->wordsData.words[((((zeroNode +1)*elemSize)+W-1) /W) -1 ] =0000;
830 // fprintf(stderr,"\n ElemSize = %d, maxOffset = %d",elemSize,tmpOffset);
// tmpOffset is passed to bitwrite in the position where the bitread calls in this file pass a bit position.
833 for (i=0; i<=zeroNode; i++) { //setting "zeroNode+1" offsets
834 bitwrite(wcsa->wordsData.words, tmpOffset, elemSize, tmpOffsets[i]);
838 //////////// CHECKS IT WORKED. old !!!!
841 // for (i=0; i<zeroNode; i++) { //setting "zeroNode+1" offsets
842 // kk=bitread(wcsa->wordsData.words, i* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
843 // tmpOffset+=elemSize;
844 // if (kk != tmpOffsets[i]) {fprintf(stderr,"\n @@@@@@@@ DISTINTOS OFFSETS "); break;}
845 // else fprintf(stderr,"\n iguales, %d, %d :: <<%s>>len=%d",kk,i, posInHT[i].word, strlen((char*)posInHT[i].word));
849 // { uint len1, len, tmplen, len2;
851 // byte *wcsaWord, *src;
853 // for (p=0;p<zeroNode;p++) {
854 // {//preparing for strcompL
855 // len = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p+1)), wcsa->wordsData.elemSize);
856 // tmplen = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p)) , wcsa->wordsData.elemSize);
858 // //fprintf(stderr,"\n :: off[%d]= %d - off [%d] = %d ==> %d",p+1,len,p,tmplen,len-tmplen);
861 // wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
864 // src = posInHT[p].word;
865 // len1 = strlen((char *)src);
867 // if (strcompL(src,wcsaWord,len1,len2) != 0) {
868 // fprintf(stderr,"\n %6d DISTINTOS !! ===len1 %d,len %d===== <<",p,len1,len2);printWord(src,len1);
869 // fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>");
873 // fprintf(stderr,"\n %6d ======len1 %d,len2 %d===== <<",p,len1,len2);printWord(src,len1);
874 // fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>");
882 //frees memory from hash table and posInHT structures.
888 /** ******* creates the self-index on ints (bottom layer) ==> see build_icsa *********/
893 fprintf(stderr,"\n **** CREATING CSA from Edu's Code *****");
// The integer index is built over SE including its final 0 entry (seSize+1 values).
895 myicsa = createIntegerCSA(&SE,seSize+1,build_options);
896 wcsa->myicsa= myicsa;
897 total = CSA_size(myicsa);
899 free(SE); //SE is no longer needed, (it is indexed by the iCSA)
// NOTE(review): seSize is declared uint but printed with %d.
900 printf("\n\t**** [iCSA built on %d words. Size = %ld bytes... RAM",seSize,total);
910 printf("\n\t ** Building done! **\n");
911 printf("\n Process finished!\n");
918 int build_iCSA (char *build_options, void *index)
920 twcsa *wcsa = (twcsa *) index;
921 /********* creates the self-index on ints (bottom layer) *********/
922 //creating CSA from Edu's code...
925 fprintf(stderr,"\n **** CREATING CSA-bottom-layer *****");
926 void *bottomIntIndex;
// The bottom-layer index is built over wcsa->se using seSize+1 entries; the resulting
// handle is then stored in the wcsa structure.
927 int err = buildIntIndex(wcsa->se,wcsa->seSize+1, build_options,(void **)&bottomIntIndex);
928 wcsa->myicsa = bottomIntIndex;
930 //total = CSA_size(wcsa->myicsa);
// The size of the new index is queried only to report it in the message below.
931 err = sizeIntIndex((void *) wcsa->myicsa, &total);
933 printf("\n\t**** [iCSA built on %d words. Size = %u bytes... RAM",wcsa->seSize,total);
938 /** ********************************************************************
940 **********************************************************************/
942 /**-----------------------------------------------------------------
944 * Loads all the data structures of WCSA (included the icsa)
945 ----------------------------------------------------------------- */
947 twcsa *loadWCSA(char *filename) {
949 // Initializes the arrays used to detect if a char is valid or not.
951 // Initializes the arrays used to translate a char into lowercase.
954 wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
// First the bottom-layer integer index is loaded into wcsa->myicsa, then loadStructs
// reads the constants and the vocabulary.
957 int err = loadIntIndex(filename, (void **)&wcsa->myicsa);
959 loadStructs(wcsa,filename);
964 /** ------------------------------------------------------------------
966 * Reads files and loads all the data needed for searcherFacade
967 ----------------------------------------------------------------- */
968 void loadStructs(twcsa *wcsa, char *basename) {
// Room for basename + "." + extension + NUL: the +10 leaves 8 characters for the extension.
976 filename = (char *)malloc(sizeof(char)*(strlen(basename)+10));
977 fprintf(stderr,"Loading Index from file %s.*\n", basename);
979 //** SOME CONSTANTS: sourceTextSize and seSize
980 { strcpy(filename, basename);
981 strcat(filename, ".");
982 strcat(filename, CONSTANTS_FILE_EXT);
984 if( (file = open(filename, O_RDONLY)) < 0) {
985 printf("Cannot open file %s\n", filename);
// Fields are read in the same order in which save_index writes them.
// NOTE(review): the results of the read() calls in this function are discarded.
989 read(file, &(wcsa->sourceTextSize), sizeof(uint));
990 read(file, &(wcsa->seSize), sizeof(uint));
994 /** File with the words from the vocabulary (sorted alphabetically) */
997 strcpy(filename, basename);
998 strcat(filename, ".");
999 strcat(filename, VOCABULARY_WORDS_FILE_EXT);
1000 //sizeFile= fileSize(filename)-sizeof(uint);
1002 if( (file = open(filename, O_RDONLY)) < 0) {
1003 printf("Cannot open file %s\n", filename);
1007 //the number of canonical words
1008 read(file, &n, sizeof(uint));
// Then the bits per offset entry and the byte size of the words zone.
1010 read(file, &(wcsa->wordsData.elemSize), (sizeof(uint)));
1011 read(file, &(wcsa->wordsData.wordsZoneMem.size), (sizeof(uint)));
1013 //allocating the memory needed for all words and reading them //(ascii) << no \0 chars are needed>>.
1014 wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc(wcsa->wordsData.wordsZoneMem.size * sizeof(byte));
1015 read(file, (wcsa->wordsData.wordsZoneMem.zone), wcsa->wordsData.wordsZoneMem.size * sizeof(byte) );
1017 //reading the offsets of the words (kbitArray that points to offsets in zoneMem of words.
1018 wcsa->wordsData.words = (uint *) malloc (((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * sizeof(uint));
// The last uint is cleared first; the read below covers the whole array, that uint included.
1019 wcsa->wordsData.words[ ((((n+1)*(wcsa->wordsData.elemSize))+W-1) /W) -1 ] =0000;
1020 read(file, (wcsa->wordsData.words), ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)));
1032 /** ****************************************************************
1033 * Querying the index WCSA
1034 * ***************************************************************/
1035 ///////////////////////////////////////////////////////////////////////////////////////
1036 // FUNCTIONS NEEDED FOR SEARCHING A PATTERN //
1037 ///////////////////////////////////////////////////////////////////////////////////////
1041 /*------------------------------------------------------------------
1042 * Given a text pattern translates it into a list of integers (corresponding to the
1043 * canonical words associated to the valid words in the text pattern)
1044 ------------------------------------------------------------------*/
1045 void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *integerPattern, uint *sizeIntegers) {
// Splits textPattern into words/separators and looks each one up in the sorted vocabulary.
// *sizeIntegers receives the number of ids written, or 0 as soon as one word is not found.
1047 byte *pbeg,*pend,*wordstart,*aWord;
1048 register unsigned long size;
1052 pend = pbeg + patLen;
1054 while (pbeg <pend) {
1055 //parsing either a word or separator.
1058 if (_Valid[*pbeg]) { //alphanumerical data
1059 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
1062 if (*pbeg != ' ') { //a separator that does not start with ' ', so it is a new word
1063 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
1065 else { //a SPACE comes, so we have to test if next character is alphanumerical or not
1067 if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
1069 if (_Valid [*pbeg] ) {
1070 wordstart = pbeg; //So skipping 1 blank character
1071 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
1073 else { // a "separator word" ...
1074 size++; //the prev BLANK...
1075 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
1076 }//else { // a "separator word"
1077 }//else ... not a unique BLANK AT THE END.
1078 }//else ... starting by a BLANK...
1081 //The parsed word is "aWord", and its length is "size"...
1084 // Binary search on the canonical words (wordsData)
1088 register uint min,max,p;
1090 max = (wcsa->n) - 1;
1094 {//preparing for strcompL
// Offsets of entries p+1 and p in the packed array; the word p starts at the second one.
1095 len = bitread (wcsa->wordsData.words, (p+1)* wcsa->wordsData.elemSize , wcsa->wordsData.elemSize);
1096 tmplen = bitread (wcsa->wordsData.words, (p )* wcsa->wordsData.elemSize , wcsa->wordsData.elemSize);
1098 wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
1101 //if(strncmp((char*)aWord, (char*)wcsa->wordsData[p].word,size) > 0) min = p+1;
// The parsed word sorts after the word at p: continue the search from p+1.
1102 if(strcompL(aWord, wcsaWord, size, len) > 0) min = p+1;
1106 // { //SHOW PROGRESS
1107 // fprintf(stderr,"\n Patron = <<%s>>, curposWord= %d ==><<",aWord,p);
1108 // printWord(wcsaWord,len); fprintf(stderr,">> len =%d",len);
1113 {//preparing for strcompL
1114 len = bitread (wcsa->wordsData.words, (min+1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1115 tmplen = bitread (wcsa->wordsData.words, ( min )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1117 wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
1120 // if(!strncmp((char*)aWord, (char*)wcsa->wordsData[min].word, size)) {
// The candidate at position min is compared once more; on a match its id (position + 1,
// the same convention used for the values stored in SE) is appended to the output.
// NOTE(review): confirm index cannot exceed the capacity of integerPattern
// (callers in this file pass arrays of MAX_INTEGER_PATTERN_SIZE elements).
1121 if(!strcompL(aWord, wcsaWord, size, len)) {
1122 integerPattern[index++] = min +1 ; //<--
1124 else {*sizeIntegers = 0; return;} // a valid word that does not appear in the source text.
1128 *sizeIntegers = index;
1130 // //shows the parsed words:
1132 // printf("\n\n >>%s>> HA SIDO PARSEADO COMO:",textPattern);
1133 // for (i=0; i<index;i++) {
1134 // printf("<<%s>>",wcsa->wordsData[integerPattern[i] -1].word);
1144 /** ------------------------------------------------------------------
1145 * Returns the number of occurrences of a given text pattern.
 *
 * The NUL-terminated pattern is first translated into a sequence of
 * integers by parseTextIntoIntegers(); that sequence is then handed to
 * countIntIndex() on wcsa->myicsa.
 *
 * Returns -1 when the parser reports a sequence of size 0 (for instance
 * parseTextIntoIntegers() sets the size to 0 when a parsed word does not
 * appear in the source text).
 *
 * NOTE(review): the declarations of numocc/left/right and the final
 * return statement are not visible in this excerpt -- confirm that the
 * count written by countIntIndex() is what is returned.
1146 *------------------------------------------------------------------ */
1147 int countTextOcurrences(twcsa *wcsa, byte *textPattern) {
1149 uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
1150 uint integerPatternSize, min, max; // NOTE(review): min and max are not used in the visible lines
// Translate the textual pattern into the integer sequence to search for.
1152 uint lenpat = strlen((char*)textPattern);
1153 parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize);
// An empty integer sequence cannot be searched: report -1 to the caller.
1154 if (!integerPatternSize) return -1;
// (disabled debug trace listing the integers that will be searched)
1158 // printf("\n %d Integers to search for:",integerPatternSize );
1159 // for (i=0;i<integerPatternSize;i++) {
1160 // printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word);
// Count on the integer index; numocc, left and right are output arguments.
// NOTE(review): err is not checked in the visible lines -- confirm.
1166 int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, &numocc, &left, &right);
1172 /** ------------------------------------------------------------------
1173 * locateTextOcurrences:
1174 * Returns the offsets of the source text where a word/phrase appears
1175 * Returns also the number of occurrences.
 *
 * The pattern is translated into integers by parseTextIntoIntegers() and
 * located with locateIntIndex() on wcsa->myicsa. The buffer filled in by
 * locateIntIndex() is passed to locateFacade() and then returned to the
 * caller; the number of occurrences is stored in *numberOccurrences.
 *
 * When the parser reports a sequence of size 0, *numberOccurrences is set
 * to -1 and NULL is returned.
 *
 * NOTE(review): in this file locateFacade() only returns 99 and does not
 * touch its arguments, so the returned values are whatever
 * locateIntIndex() produced -- confirm this is intended.
 * NOTE(review): the buffer is declared as ulong* but returned as uint*;
 * confirm both typedefs have the same width on the target platform.
1176 *------------------------------------------------------------------ */
1177 uint *locateTextOcurrences(twcsa *wcsa, byte *textPattern, int *numberOccurrences) {
1178 uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
1179 uint integerPatternSize, min, max; // NOTE(review): min and max are not used in the visible lines
// Translate the textual pattern into the integer sequence to search for.
1181 uint lenpat = strlen((char*)textPattern);
1182 parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize);
// Nothing to search for: signal it with -1 occurrences and a NULL result.
1183 if (!integerPatternSize) {*numberOccurrences = -1; return NULL;}
// (disabled debug trace listing the integers that will be searched)
1187 // printf("\n %d Integers to search for:",integerPatternSize );
1188 // for (i=0;i<integerPatternSize;i++) {
1189 // printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word);
1194 ulong occurrences, left, right;
1196 ulong *sourceOffsets;
1198 //obtains the indexes in vector SE where the pattern appears.
1199 //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences);
// NOTE(review): err is not checked in the visible lines -- confirm.
1200 int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences);
1202 //sourceOffsets = (uint *) malloc (sizeof(uint)*occurrences);
// No separate output buffer is allocated: the SE offsets buffer is reused
// as the destination, so input and output of locateFacade() alias.
1204 sourceOffsets=seOffsets;
1205 //obtains the offsets in the source text of the pattern (sourceOffsets)
1206 locateFacade(wcsa, (uint *)sourceOffsets, (uint *)seOffsets,occurrences);
// Trace of the located positions on stderr.
// NOTE(review): sourceOffsets[i] is a ulong printed with %u -- confirm the format.
1209 fprintf(stderr,"\n*** %s appears in the source text in positions:\n\t",textPattern);
1210 for (i=0;i<occurrences;i++)
1211 fprintf(stderr,"[%u]",sourceOffsets[i]);
// Hand the count and the (reused) buffer back to the caller.
1215 *numberOccurrences = occurrences;
1216 return (uint *) sourceOffsets;
1220 /** ------------------------------------------------------------------
1221 * displayTextOcurrences:
1222 * Intended to show, in stdout, the text around the occurrences of a
1223 * word/phrase and to return the number of occurrences.
 * Not implemented for this index: the first statement of the body
 * returns the constant 99 and none of the parameters is used.
1224 *------------------------------------------------------------------ */
1225 int displayTextOcurrences(twcsa *wcsa, byte *textPattern, uint radixDisplay) {
1226 return 99; // not implemented: function not available
1229 /** ------------------------------------------------------------------
1231 * For given sePositions, is meant to return the sourceTextPositions
1232 * where those valid words in se[sePositions[i]] occur.
 * Not implemented for this index: the first statement of the body
 * returns the constant 99 and neither array is read or written.
1233 *------------------------------------------------------------------*/
1234 int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint number) {
1235 return 99; // not implemented: function not available for this index
1239 /** ------------------------------------------------------------------
1241 * Is meant to return the substring from a starting offset to a final
1242 * offset in the source text. It does not allocate any memory: the caller
 * supplies the destination buffer "dstptr".
1243 * Precondition: offsetIni >=0;
 * Not implemented for this index: the first statement of the body
 * returns the constant 99; *length and dstptr are left untouched.
1244 ------------------------------------------------------------------*/
1245 int displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length, byte *dstptr) {
1246 return 99; // not implemented: function not available for this index
1250 /**------------------------------------------------------------------
1251 * DISPLAYFacadeMalloc:
1252 * Is meant to return the substring from a starting offset to a final
1253 * offset in the source text, in newly allocated memory.
1254 * NOT CURRENTLY USED
 * Not implemented for this index: the only visible statement initialises
 * the result pointer to NULL.
 * NOTE(review): the return statement is not visible in this excerpt --
 * presumably dstptr (NULL) is returned; confirm.
1255 ------------------------------------------------------------------*/
1256 byte *displayFacadeMalloc (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length) {
1257 byte *dstptr=NULL; // not implemented: function not available
1262 /** ------------------------------------------------------------------
1263 * LOCATEALLandDISPLAY:
1264 * Is meant to display the text around an occurrence of the searched word
1265 * in the source text. Assuming that $p$ is that position, it would show
 * only the chars in [p-radix, p+radix] (NOTE(review): the original note
 * read "[p_radix-1,p_radix]"; the exact interval cannot be confirmed from
 * this file).
 * Not implemented for this index: the first statement of the body
 * returns the constant 99.
1266 ------------------------------------------------------------------*/
1267 int locateAllAndDisplay (twcsa *wcsa, uint *sePositions, uint number, int radix) {
1268 return 99; // not implemented: function not available for this index
1273 /** ------------------------------------------------------------------
1274 * Recovers the source text and writes it to the file <basename><ext>.
 * (The original note said this is done "by calling display(0,fileSize)";
 * the visible code instead walks the SE positions one by one.)
 *
 * For every position below wcsa->seSize the visible code fetches a value
 * with displayIntIndex(), subtracts one to get a word number, locates the
 * bytes of that word in wordsData and copies them into an output buffer of
 * sourceTextSize bytes, which is finally written with fwrite().
 *
 * NOTE(review): several declarations, the loop bookkeeping and the
 * clean-up code are not visible in this excerpt.
1275 * ------------------------------------------------------------------ */
1276 void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {
// Room for basename + ext + the terminating NUL.
1280 char *filename = (char *) malloc (strlen( basename)+ strlen(ext)+1);
1283 strcpy( filename, basename);
1284 strcat( filename, ext);
// strcat() already terminates the string; this re-writes the same NUL.
1285 filename[strlen( basename)+ strlen(ext)]='\0';
1286 fprintf(stderr,"\n uncompressing text into file -->%s ",filename);fflush(stderr);
// NOTE(review): the result of fopen() is not checked in the visible lines.
1290 salida = fopen( filename,"w");
1291 start=0; end = sourceTextSize-1;
// Output buffer sized to the original text length given by the caller.
1293 cc = (byte *) malloc (sourceTextSize* sizeof(uchar));
1296 uint i, j;//, tmplen;
1307 while ( (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
// Value stored at position indexSE of the integer index.
1309 int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
1311 {//obtains pointer to the ith word
// The stored value is one more than the word number (the parser stores min+1).
1313 uint ith = posSEValue -1; // !!
// Fields ith and ith+1 of wordsData.words (elemSize bits each) are read as
// the start offsets of word ith and of the following word in the zone.
1314 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1315 offtmp = bitread (wcsa->wordsData.words, ( ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1316 tmplen -=offtmp; //the length of the ith word.
1317 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
1325 prevValid =1; //for the next iteration
1331 for (j=0;j<tmplen;j++) {*dst++ = *src++;} //copies word to the output buffer
// NOTE(review): sourceTextSize is a uint printed with %d; the type of leng is not visible.
1335 fprintf(stderr,"\n sourceTextSize = %d, len = %d",sourceTextSize,leng);
// Dump the leng recovered bytes to the output file.
1336 fwrite(cc,sizeof(byte),leng,salida);
1346 // Recovers the source text by calling extractWords() and writes it to the
// file <basename><ext>.
// The whole range [0, wcsa->seSize] is requested in a single call; the buffer
// and length produced by extractWords() are written with fwrite().
// On a non-zero result from extractWords() a message is printed to stderr and
// the process is terminated with exit(0).
// NOTE(review): exit(0) reports success to the parent process even though an
// error occurred -- confirm this is intended.
// NOTE(review): the declarations of salida, cc and length and the clean-up
// code are not visible in this excerpt.
1347 void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {
1349 int start;int end; int error;
// Room for basename + ext + the terminating NUL.
1350 char *filename = (char *) malloc (strlen( basename)+ strlen(ext)+1);
1354 strcpy( filename, basename);
1355 strcat( filename, ext);
// strcat() already terminates the string; this re-writes the same NUL.
1356 filename[strlen( basename)+ strlen(ext)]='\0';
1357 fprintf(stderr,"\n uncompressing text into file -->%s ",filename);fflush(stderr);
// NOTE(review): the result of fopen() is not checked in the visible lines.
1361 salida = fopen( filename,"w");
1362 start=0; end = wcsa->seSize;
// cc and length are output arguments filled in by extractWords().
1364 error = extractWords((void *) wcsa, start, end, &cc, &length);
1365 if (error) {fprintf(stderr,"\n error during recoverSourceText2"); exit(0);}
// NOTE(review): sourceTextSize is a uint printed with %d.
1367 fprintf(stderr,"\n sourceTextSize = %d, len = %ld",sourceTextSize,length);
1368 fwrite(cc,sizeof(byte),length,salida);
1375 /** *******************************************************************************
1376 * Showing some statistics and info of the index
1377 * *******************************************************************************/
// Reduced summary of the index. Not implemented for this index: no statement
// is visible in the body.
1378 void printInfoReduced(twcsa *wcsa) {
1379 // not implemented: function not available
1382 /* Shows summary info of the index on stdout.
 *
 * The total size is obtained from index_size() and the size of the integer
 * index from sizeIntIndex(); their difference is reported as the size of
 * the presentation layer. The size of the words structure is also broken
 * down, and printInfoIntIndex() is then called for the integer index.
 *
 * A non-zero result from index_size(), sizeIntIndex() or
 * printInfoIntIndex() is returned to the caller immediately.
 * NOTE(review): the declaration of err and the final return statement are
 * not visible in this excerpt. */
1383 int printInfo(void *index) {
1386 twcsa *wcsa = (twcsa *) index;
1388 unsigned long indexSize;
1389 uint intIndexSize, presentationSize;
1392 err = index_size(index, &indexSize);
1393 if (err!=0) return err;
1394 err = sizeIntIndex(wcsa->myicsa, &intIndexSize);
1395 if (err!=0) return err;
// Whatever is not the integer index is attributed to the presentation layer.
// NOTE(review): indexSize is an unsigned long narrowed into a uint here.
1397 presentationSize = indexSize - intIndexSize;
1399 printf("\n ===================================================:");
1400 printf("\n Summary of Presentation layer:");
1401 printf("\n Number of valid words (SEsize) = %u",wcsa->seSize);
// NOTE(review): %ld is used for wcsa->n; its declared type is not visible here.
1402 printf("\n Number of different words = %ld",wcsa->n);
1403 printf("\n WCSA structure = %lu bytes", sizeof(twcsa));
// (n+1) fields of elemSize bits each, rounded up to a whole number of W-bit
// units, at sizeof(uint) bytes per unit -- assumes W is the bit width of a
// uint; TODO confirm.
1405 uint totalpointers = ((((wcsa->n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));
// Bytes taken by the zone that holds the characters of the words.
1406 uint totalasciizone = wcsa->wordsData.wordsZoneMem.size * sizeof(byte) ;
1407 uint totalwords = totalasciizone + totalpointers;
// NOTE(review): the three values below are uint but printed with %d.
1409 printf("\n Size Of words structure (%d bytes):",totalwords);
1410 printf("\n [ pointers = %d bytes || AsciiZone = %d bytes", totalpointers, totalasciizone);
1412 printf("\n\n Total = ** %u bytes (in RAM) **",presentationSize);
1413 //printf("\n\n @@ Summary of self-index on Integers:");
// Let the integer index print its own summary.
1414 err = printInfoIntIndex(wcsa->myicsa, " ");
1415 if (err!=0) return err;
1417 printf("\n ===================================================:");
1422 /**------------------------------------------------------------------
1424 * Is meant to count the amount of memory needed by the Facade
1425 * (Presentation Layer), skipping the stop_words hash table.
 * Not implemented for this index: the first statement of the body
 * returns 0.
1426 ----------------------------------------------------------------- */
1427 uint structsSizeMem(twcsa *wcsa) {
1428 return 0; // not implemented: function not available for this index.
1432 /** for debugging: writes characters of str to stderr, one at a time.
 * NOTE(review): the loop header is not visible in this excerpt --
 * presumably i runs over [0, len), so str need not be NUL-terminated;
 * confirm. **/
1433 void printWord(uchar *str, uint len) {
1436 fprintf(stderr,"%c",str[i]);
1440 /** saves the content of the file SE (ids of the source words).
 * Writes the n uints stored in v to the file "<basename>.<SE_FILE_EXT>".
 * Any previous file with that name is removed first.
 * NOTE(review): the declaration of file, the error return, the close()
 * call and the final return value are not visible in this excerpt. **/
1441 int saveSEfile (char *basename, uint *v, uint n) {
1442 char outfilename[255];
// NOTE(review): sprintf() is unbounded; a basename longer than the 255-byte
// buffer allows would overflow outfilename.
1444 sprintf(outfilename,"%s.%s",basename,SE_FILE_EXT);
// Remove any stale file so the new one starts empty (open() uses no O_TRUNC).
1445 unlink(outfilename);
// Created with read/write/execute permission for owner and group.
1446 if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
1447 printf("Cannot open file %s\n", outfilename);
// Raw dump of the array.
// NOTE(review): the result of write() is not checked in the visible lines.
1451 write(file, v, sizeof(uint) * n );
// Returns the CPU time, in seconds, reported by getrusage(RUSAGE_SELF):
// the sum of the user time (ru_utime) and the system time (ru_stime).
// NOTE(review): the result of getrusage() is not checked in the visible lines.
1457 double getTime2 (void)
1459 double usertime, systime;
1460 struct rusage usage;
1462 getrusage (RUSAGE_SELF, &usage);
// Each timeval is converted to seconds: whole seconds plus microseconds / 1e6.
1464 usertime = (double) usage.ru_utime.tv_sec +
1465 (double) usage.ru_utime.tv_usec / 1000000.0;
1466 systime = (double) usage.ru_stime.tv_sec +
1467 (double) usage.ru_stime.tv_usec / 1000000.0;
1469 return (usertime + systime);
1474 /**------------------------------------------------------------------
1476 *------------------------------------------------------------------ */
1477 #ifdef FACADEWITHMAIN
1478 int main(int argc, char* argv[])
1483 char *infile, *outbasename, *stopwordsfile; // Name of in/out files
1491 printf("\n*Word-based iCSA: A word-based CSA");
1492 printf("\n*CopyRight (c) 2008 [LBD & G.N.]\n\n");
1494 // Reads input parameters from command line.
1496 printf("Use: %s <in file> <out basename> \n", argv[0]);
1500 // Reads params (input file, output basename, and stopwords file)
1502 outbasename = argv[2];
1503 stopwordsfile = argv[3];
1505 finsize= fileSize(infile);
1508 printf( "\nFILE EMPTY OR FILE NOT FOUND %s !!\nSkipping processement ...\n",infile);
1512 // Opening the input text file.
1513 if( (f_in = open(infile, O_RDONLY)) < 0) {
1514 printf("Cannot read file %s\n", infile);
1517 inputBuffer = (byte *) malloc(finsize *sizeof(byte));// +1);
1518 read (f_in,inputBuffer,finsize);
1523 //printf("\n parametros <<%s>>\n\n",stopwordsfile);
1524 build_index (inputBuffer, finsize, stopwordsfile, &Index); /** building the index */
1526 // /** recovering the source text from the index */
1531 get_length(Index, &size);
1532 char extension[10]= ".source";
1534 //recoverSourceText1((twcsa*) Index, outbasename,extension, size);
1535 strcat(extension,"2");
1536 recoverSourceText2((twcsa*) Index, outbasename,extension,size);
1538 fprintf(stderr, "\nRecovering source file time: %.3f secs\n", end-start );
1541 // DISPLAYING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
1542 {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1544 ulong numocc,numc, length, i, *snippet_len, tot_numcharext = 0, numpatt;
1545 uchar *pattern, *snippet_text;
1547 pattern = textPattern;
1548 printf("\nSEARCH TEST for DISPLAY (pizzachili interface)\n");
1550 printf("Intro string: ");
1551 fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1552 if (!strcmp((char*)textPattern,"\n") ) break;
1553 textPattern[strlen((char*)textPattern)-1] = '\0';
1555 length = strlen( (char*)textPattern);
1558 // error = display (Index, textPattern, length, numc, &numocc,
1559 // &snippet_text, &snippet_len);
1560 error = displayWords (Index, textPattern, length, numc, &numocc,
1561 &snippet_text, &snippet_len,1);
1563 if (error){ fprintf(stderr, "%s\n", "Hubo un error durante display");exit(0);}
1565 fprintf(stderr,"\n acabou display");fflush(stderr);
1567 ulong j, len = length + 2*numc;
1569 fprintf(stderr,"\n length = %d",length);
1570 fprintf(stderr,"\n pattern = %s",pattern);fflush(stderr);
1571 fprintf(stderr,"\n numocc = %d",numocc);fflush(stderr);
1572 fprintf(stderr,"\n snippet len = %d",len);fflush(stderr);
1573 fprintf(stderr,"\n =========");fflush(stderr);
1574 for (i = 0; i < numocc; i++){
1575 fprintf(stderr,"\n[%2d][len=%3d]<<",i+1,snippet_len[i]);fflush(stderr);
1576 fwrite(snippet_text+len*i,sizeof(uchar),snippet_len[i],stderr);fflush(stderr);
1577 fprintf(stderr,">>");fflush(stderr);
1582 for(i=0; i<numocc; i++) {
1583 tot_numcharext += snippet_len[i];
1588 free (snippet_text);
1591 printf("Ocurrences = %d\n", numocc);
1592 if (!strcmp((char*)textPattern,"\n") ) break;
1598 // // SEARCHING FOR A TEXT PATTERN (word/phrase).
1599 // {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1604 // printf("\nSEARCH TEST for LOCATE\n");
1606 // printf("Intro string: ");
1607 // fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1608 // len = strlen((char*)textPattern);
1609 // if (!strcmp((char*)textPattern,"\n") ) break;
1610 // textPattern[len-1] = '\0';
1613 // //occs = locateTextOcurrences(wcsa,textPattern,&occ);
1614 // // locate(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ);
1615 // locateWord(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ, 0);
1617 // printf("\n*** %s occurs %d times: In the source text in positions:\n\t",textPattern,occ);
1618 // for (i=0;i<occ;i++)
1619 // printf("[%u]",occs[i]);
1623 // if (!strcmp((char*)textPattern,"\n") ) break;
1629 // COUNTING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
1631 {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1634 printf("\nSEARCH TEST for COUNT.\n");
1636 printf("Intro string: ");
1637 fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1638 len = strlen((char*)textPattern);
1639 if (!strcmp((char*)textPattern,"\n") ) break;
1640 textPattern[len-1] = '\0';
1643 count(Index, textPattern, len, (ulong *)&occ);
1644 //occ = countTextOcurrences(wcsa,textPattern);
1645 printf("Ocurrences = %d\n", occ);
1648 printf("\n END COUNTING OCCURRENCES OF PATTERNS. ...\n");
1652 /** saving the index to disk*/
1653 save_index (Index, outbasename);
1655 /** tells the mem used by the index */
1657 index_size(Index, &indexsize);
1658 fprintf(stderr,"Index occupied %d bytes, 2 extra mallocs = %d",indexsize,2* sizeof(uint));
1660 /** freeing the index */