1 #include "buildFacade.h"
2 #include "utils/errors.c"
5 /** Building the index */
7 /* Creates index from text[0..length-1]. Note that the index is an
8 opaque data type. Any build option must be passed in string
9 build_options, whose syntax depends on the index. The index must
10 always work with some default parameters if build_options is NULL.
11 The returned index is ready to be queried. */
// Public build entry point: runs the word-level construction (build_WCSA)
// and then the integer-index construction (build_iCSA) on the structure
// that build_WCSA handed back through index.
12 int build_index (uchar *text, ulong length, char *build_options, void **index) {
// Echoes the option string. NOTE(review): the message goes to stdout while
// the flush targets stderr, and build_options is documented as possibly
// NULL, which is then passed to a %s conversion -- confirm both are intended.
15 printf("\n parameters: \"%s\"\n",build_options); fflush(stderr);
// Stage 1: parse the text and fill *index with the word-level structure.
17 returnvalue = build_WCSA (text, length, build_options, index);
// Stage 2: build the bottom integer index inside that same structure.
// NOTE(review): this assignment replaces the stage 1 result -- confirm the
// first result is checked before being overwritten.
20 returnvalue = build_iCSA (build_options,*index);
26 /** Saves index on disk by using single or multiple files, having proper extensions. */
// Writes the index to several files that share the prefix filename:
// a constants file, a vocabulary file, the integer index (saveIntIndex)
// and the SE array (saveSEfile).
28 int save_index (void *index, char *filename) {
30 char *basename = filename;
31 twcsa *wcsa=(twcsa *) index;
38 printf("\n Saving structures to disk: %s.*",filename);
// Scratch buffer for prefix + dot + extension + NUL. NOTE(review): the +10
// assumes every extension macro is at most 8 characters, and the malloc
// result is used without a visible NULL check -- confirm.
39 outfilename = (char *)malloc(sizeof(char)*(strlen(basename)+10));
41 /**File with some constants (sourceTextSize and seSize); */
43 strcpy(outfilename, basename);
44 strcat(outfilename, ".");
45 strcat(outfilename, CONSTANTS_FILE_EXT);
// NOTE(review): O_CREAT without O_TRUNC leaves the tail of a longer
// pre-existing file in place -- confirm this is acceptable.
47 if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
48 printf("Cannot open file %s\n", outfilename);
// Each field is stored as sizeof(uint) raw bytes in host byte order.
// NOTE(review): write() results are ignored, so short writes go unnoticed;
// this also presumes both fields are uint-sized -- confirm.
51 write(file, &(wcsa->sourceTextSize), sizeof(uint));
52 write(file, &(wcsa->seSize), sizeof(uint));
56 /** The Words in the vocabulary of words (sorted alphabetically)*/
57 { strcpy(outfilename, basename);
58 strcat(outfilename, ".");
59 strcat(outfilename, VOCABULARY_WORDS_FILE_EXT);
61 if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
62 printf("Cannot open file %s\n", outfilename);
// Vocabulary header: word count n, bits per offset, byte size of the text zone.
67 uint elemSize = wcsa->wordsData.elemSize;
68 write(file, &n, sizeof(uint));
69 write(file, &elemSize, sizeof(uint));
70 write(file, &(wcsa->wordsData.wordsZoneMem.size), sizeof(uint));
72 //the number of canonical words
// Payload 1: the concatenated characters of all words.
73 write(file, (char *)wcsa->wordsData.wordsZoneMem.zone, wcsa->wordsData.wordsZoneMem.size * sizeof(byte));
// Payload 2: the packed offsets array, n+1 entries of elemSize bits each,
// rounded up to whole uint words (W is presumably the bit width of a
// uint -- confirm).
74 write(file, (char *)wcsa->wordsData.words, ((((n+1)* (elemSize))+W-1) /W) * (sizeof(uint)) );
82 /******** saves index on integers (bottom) ******/
84 //storeStructsCSA(wcsa->myicsa,basename);
85 saveIntIndex((void *) wcsa->myicsa, basename);
// SE is saved with seSize+1 entries, i.e. one more than the word count.
89 saveSEfile(basename,wcsa->se, wcsa->seSize+1);
98 /** Loads index from one or more file(s) named filename, possibly
99 adding the proper extensions. */
// Loads a previously saved index through loadWCSA and returns it to the
// caller as an opaque pointer.
100 int load_index(char *filename, void **index){
102 wcsa = loadWCSA (filename);
// NOTE(review): the pointer is published as-is; no NULL check on the
// loadWCSA result is visible at this point -- confirm.
103 (*index) = (void *) wcsa;
107 /** Frees the memory occupied by index. */
// Releases the index: the bottom integer index via freeIntIndex and the
// two vocabulary arrays via free.
108 int free_index(void *index){
109 twcsa *wcsa=(twcsa *) index;
// The size is measured first, so the message reports the amount about to
// be released (it is printed before any free call runs).
111 index_size(index,&size);
112 printf("\n[destroying index] ...Freed %lu bytes... RAM", size);
115 //frees the array SE.
121 //destroyStructsCSA(wcsa->myicsa);
122 int err = freeIntIndex((void *) wcsa->myicsa);
// Vocabulary storage: the character zone and the packed offsets array.
126 free (wcsa->wordsData.wordsZoneMem.zone);
127 free (wcsa->wordsData.words); /** huge!! */
129 //the pointer to wcsa.
134 /** Gives the memory occupied by index in bytes. */
// Reports the memory footprint of the index: the twcsa struct itself, the
// vocabulary arrays and whatever sizeIntIndex reports for the bottom index.
135 int index_size(void *index, ulong *size) {
137 twcsa *wcsa=(twcsa *)index;
// NOTE(review): this accumulates into the caller-provided counter, so the
// counter must have been zeroed beforehand -- confirm.
140 *size += sizeof(twcsa);
// Same rounding formula as used when the offsets array is allocated:
// n+1 entries of elemSize bits, rounded up to whole uint words.
143 totaltmp += ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)); //the pointers
144 totaltmp += wcsa->wordsData.wordsZoneMem.size * sizeof(byte); //the characters of the words.
// Size of the bottom integer index, returned through nbytes.
149 int err = sizeIntIndex((void *) wcsa->myicsa, &nbytes);
151 //*size += CSA_size(wcsa->myicsa);
158 /** Querying the index =============================================================*/
160 /* Writes in numocc the number of occurrences of the substring
161 pattern[0..length-1] found in the text indexed by index. */
// Counts occurrences of a text pattern: the pattern is first translated
// into vocabulary ids, then the id sequence is counted in the integer index.
162 int count (void *index, uchar *pattern, ulong length, ulong *numocc){
163 uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
164 uint integerPatternSize;
167 twcsa *wcsa=(twcsa *) index;
// NOTE(review): length is ulong here but the parser takes a uint length,
// so very long patterns would be truncated -- confirm.
168 parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);
// A zero-length id sequence means some word of the pattern is not in the
// vocabulary, so the answer is zero occurrences.
169 if (!integerPatternSize) {*numocc=0; return 0;} //not found
171 //*numocc = countCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &l, &r);
172 int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, numocc, &l, &r);
176 /* Writes in numocc the number of occurrences of the substring
177 pattern[0..length-1] in the text indexed by index. It also allocates
178 occ (which must be freed by the caller) and writes the locations of
179 the numocc occurrences in occ, in arbitrary order. */
180 int locate(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc){
184 /* Gives the length of the text indexed */
// Returns, through length, the sourceTextSize value stored in the index.
185 int get_length(void *index, ulong *length) {
186 twcsa *wcsa=(twcsa *) index;
187 *length = wcsa->sourceTextSize;
191 /** Obtains the length of the text indexed by index. */
// Thin alias: forwards both arguments to get_length and returns its result.
193 int length (void *index, ulong *length) {
194 return (get_length(index,length));
198 /** ***********************************************************************************
199 * Accessing the indexed text
200 * ***********************************************************************************/
203 /** Allocates snippet (which must be freed by the caller) and writes
204 the substring text[from..to] into it. Returns in snippet_length the
205 length of the text snippet actually extracted (that could be less
206 than to-from+1 if to is larger than the text size). */
207 int extract (void *index, ulong from, ulong to, uchar **snippet, ulong *snippet_length) {
208 twcsa *wcsa=(twcsa *) index;
212 /** Displays the text (snippet) surrounding any occurrence of the
213 substring pattern[0..length-1] within the text indexed by index.
214 The snippet must include numc characters before and after the
215 pattern occurrence, totalizing length+2*numc characters, or less if
216 the text boundaries are reached. Writes in numocc the number of
217 occurrences, and allocates the arrays snippet_text and
218 snippet_lengths (which must be freed by the caller). The first is a
219 character array of numocc*(length+2*numc) characters, with a new
220 snippet starting at every multiple of length+2*numc. The second
221 gives the real length of each of the numocc snippets. */
223 int display (void *index, uchar *pattern, ulong length, ulong numc,
224 ulong *numocc, uchar **snippet_text, ulong **snippet_lengths) {
230 /** ***********************************************************************************
231 * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord
232 * ***********************************************************************************/
233 /* Writes in numocc the number of occurrences of the substring
234 pattern[0..length-1] in the text indexed by index. It also allocates
235 occ (which must be freed by the caller) and writes the locations of
236 the numocc occurrences in occ, in arbitrary order. These occurrences
237 refer to the offsets in TOH where the caller could start a display
238 operation. So locateWord implies synchronization using B.
239 Moreover, positions occ[numocc.. 2*numocc-1] are set to the rank in SE of the
240 words whose codes begin in TOH at the positions in occ[0... numocc-1].
241 ** Parameter kbefore makes locateWord return, instead of the offset in TOH of the
242 searched word, the offset in TOH of the word located kbefore words earlier. */
// Locates a text pattern at word granularity: translates the pattern into
// vocabulary ids and asks the integer index for the positions of that id
// sequence. The positions array produced by locateIntIndex is handed to the
// caller through occ.
// NOTE(review): kbefore is not referenced by any statement shown here -- confirm.
245 int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore){
246 uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
247 uint integerPatternSize;
248 ulong occurrences,l,r;
249 twcsa *wcsa=(twcsa *) index;
251 parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);
// NOTE(review): on this early return *occ is left untouched, unlike the
// zero-occurrences path below which sets it to NULL -- confirm callers cope.
252 if (!integerPatternSize) {*numocc=0; return 0;} //not found
256 //obtains the indexes in vector SE where the pattern appears.
257 //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences);
258 int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences);
260 *numocc = occurrences;
262 if (!occurrences) {(*occ)=NULL;return 0;}
// Ownership of the positions array passes to the caller here.
264 (*occ) = (ulong *)seOffsets;
269 /** Displays the text (snippet) surrounding any occurrence of the
270 substring pattern[0..length-1] within the text indexed by index.
271 The snippet must include numc characters before and after the
272 pattern occurrence, totalizing length+2*numc characters, or less if
273 the text boundaries are reached. Writes in numocc the number of
274 occurrences, and allocates the arrays snippet_text and
275 snippet_lengths (which must be freed by the caller). The first is a
276 character array of numocc*(length+2*numc) characters, with a new
277 snippet starting at every multiple of length+2*numc. The second
278 gives the real length of each of the numocc snippets. */
// Builds one fixed-width snippet per occurrence of the pattern. Each snippet
// is decoded word by word from the integer index, starting kbefore words
// before the occurrence, and is cut at length+2*numc bytes.
280 int displayWords (void *index, uchar *pattern, ulong length, ulong numc,
281 ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore) {
283 /** actually extracts upto length + 2*numc chars, starting extraction kbefore
284 * words before the occurrence **/
288 uint bytesPerSnippet;
290 twcsa *wcsa=(twcsa *) index;
// locateWord is called with 0 as its last argument; the kbefore shift is
// applied locally further down. Its return value is not inspected.
292 locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);
293 (*numocc) = occurrences;
297 *snippet_lengths =NULL;
// NOTE(review): length and numc are ulong but the sum is stored in a uint,
// so large values would be truncated -- confirm.
301 bytesPerSnippet = length+2*numc;
302 // bytesPerSnippet = 2*numc;
303 *snippet_lengths = (ulong *) malloc((*numocc)*sizeof(ulong));
// NOTE(review): the two failure returns below do not visibly release
// indexesInSE, and the second one does not release snippet_lengths -- confirm.
304 if (!(*snippet_lengths)) return 1;
305 *snippet_text = (uchar *) malloc((*numocc)*(bytesPerSnippet)*sizeof(uchar) +1) ; //(the last "1" is for '\0');
306 if (!(*snippet_text)) return 1;
308 // fprintf(stderr,"\n occs found = %7d for pattern %s",*numocc, pattern);
// text_aux marks the start of the slot for the current snippet.
311 text_aux=*snippet_text;
317 uint posSEValue,indexSE;
319 for (i=0;i<occurrences;i++) {
323 /** decodes words from there */
// Start kbefore words earlier, clamped at the first word.
325 indexSE = indexesInSE[i];
326 indexSE = (indexSE > kbefore) ? indexSE-kbefore : 0;
329 while ((!endSnippet) && (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
331 //posSEValue =displayCSA(wcsa->myicsa,indexSE);
332 int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
334 {//obtains pointer to the ith word
// Stored values are vocabulary rank + 1, hence the -1. The word occupies
// the bytes between two consecutive packed offsets.
336 uint ith = posSEValue -1; // !!
337 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
338 offtmp = bitread (wcsa->wordsData.words, ( ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
339 tmplen -=offtmp; //the length of the ith word.
341 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
348 if (snippetLen==bytesPerSnippet) break; //end of snippet (ends in BLANK_SPACE)
350 prevValid =1; //for the next iteration
// Clip the last word so the snippet never exceeds its slot.
357 if ((tmplen+snippetLen)>=bytesPerSnippet) {
358 tmplen =(bytesPerSnippet - snippetLen);
359 endSnippet=1; //so while loop ends;
362 for (j=0;j<tmplen;j++) {*dst++ = *src++;} //copies word to the output buffer
// Advance to the next fixed-width slot and record the real snippet length.
366 text_aux += bytesPerSnippet;
367 (*snippet_lengths)[i] = snippetLen;
370 if (occurrences) free(indexesInSE);
375 /** Simulates the text extraction process without handing any text back.
376 Extracts up to 2K words, starting K=wordsbefore words before each occurrence of a pattern.
377 Fewer than 2K words may be extracted if maxnumc characters have already been obtained.
378 Does nothing else: the text is discarded and only the number of extracted bytes is returned. */
// Decodes the words around every occurrence of the pattern into a scratch
// buffer that is then discarded; only the total number of decoded bytes is
// returned.
// NOTE(review): the total is a ulong returned through an int -- confirm the
// narrowing is acceptable.
380 int displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc) {
386 twcsa *wcsa=(twcsa *) index;
// locateWord is called with 0 as its last argument; the wordsbefore shift
// is applied locally further down. Its return value is not inspected.
388 locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);
394 ulong maxsnippetLen = maxnumc;
395 ulong extractedbytes = 0;
// One scratch buffer, reused for every occurrence.
397 text_aux = (byte *) malloc (maxsnippetLen+1);
404 uint posSEValue,indexSE;
// At most wordsbefore words on each side of the occurrence.
406 uint numWordsToExtract = 2 * wordsbefore;
408 //printf("\n occurrences... = %lu",occurrences);
410 for (i=0;i<occurrences;i++) {
414 /** decodes words from there */
// Start wordsbefore words earlier, clamped at the first word.
416 indexSE = indexesInSE[i];
417 indexSE = (indexSE > wordsbefore) ? indexSE-wordsbefore : 0;
421 while ((z<numWordsToExtract) && (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
423 //posSEValue =displayCSA(wcsa->myicsa,indexSE);
424 int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
426 {//obtains pointer to the ith word
// Stored values are vocabulary rank + 1, hence the -1. The word occupies
// the bytes between two consecutive packed offsets.
428 uint ith = posSEValue -1; // !!
429 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
430 offtmp = bitread (wcsa->wordsData.words, ( ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
431 tmplen -=offtmp; //the length of the ith word.
433 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
440 if (snippetLen==maxsnippetLen) break; //end of snippet (ends in BLANK_SPACE)
442 prevValid =1; //for the next iteration
449 if ((tmplen+snippetLen)>=maxsnippetLen) {
453 //fprintf(stderr,"\ntmplen = %d ",tmplen); fflush(stderr);
454 for (j=0;j<tmplen;j++) {*dst++ = *src++;} //copies word to the output buffer
459 extractedbytes += snippetLen;
463 if (occurrences) free(indexesInSE);
// The guard is redundant: free accepts a NULL pointer.
465 if (text_aux) free (text_aux);
466 return extractedbytes;
471 /** Allocates text (which must be freed by the caller) and recovers
472 the substring of the text starting at the "fromWord"-th word and ending at the
473 "toWord"-th word. Returns the text in "text", and in "text_lenght" the
474 length of the text actually extracted. Text is allocated.
475 Actually extracts SE[fromWord .. toWord), i.e. the last word is excluded. */
// Decodes the words at positions fromWord up to (but excluding) toWord into
// a growable heap buffer.
477 int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text,
480 twcsa *wcsa=(twcsa *) index;
481 uint initTextLen=10000;
484 uint i, j;//, tmplen;
486 byte *src, *dst, *buff;
489 uint buffBytes = 1000;
490 uint leng=0; //curr pos in buffer that was occupied.
// Clamp the requested range to the number of indexed words.
// NOTE(review): seSize-1 wraps around when seSize is 0, and toWord-fromWord
// wraps if fromWord ends up above toWord -- confirm callers prevent both.
492 if (toWord > wcsa->seSize) toWord = wcsa->seSize;
493 if (fromWord >= wcsa->seSize) fromWord = wcsa->seSize-1;
// Initial capacity: at least 1000 bytes, or the word count times avgWordLen.
494 if (buffBytes < ( (toWord-fromWord)* avgWordLen)) buffBytes = ((toWord-fromWord)* avgWordLen);
496 buff = (uchar *) malloc (buffBytes * sizeof(char));
497 if (!buff) return 1; //out of memory.
500 register uint indexSE=fromWord;
504 while ( (indexSE < toWord) ){ /** extracting words (if not at the end) */
506 int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
508 {//obtains pointer to the ith word
// Stored values are vocabulary rank + 1, hence the -1. The word occupies
// the bytes between two consecutive packed offsets.
510 ith= posSEValue -1; // !!
511 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
512 offtmp = bitread (wcsa->wordsData.words, (ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
513 tmplen -=offtmp; //the length of the ith word.
514 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
// Grow the buffer when the next word (plus one spare byte) would not fit.
517 if ( buffBytes < (leng + tmplen+1) ) {
// NOTE(review): assigning the realloc result straight to buff loses the
// original block if realloc fails, so the early return below leaks it.
519 buff = (uchar*) realloc(buff, buffBytes);
520 if (!buff) return 1; //out of memory.
529 prevValid =1; //for the next iteration
535 for (j=0;j<tmplen;j++) {*dst++ = *src++;} //copies word to the output buffer
547 /** ***********************************************************************************
548 CONSTRUCTION OF THE INDEX WCSA
549 ***********************************************************************************/
551 /**------------------------------------------------------------------
552 Compares two slots (alphanumerically). For qsort of canonical words
553 ------------------------------------------------------------------ */
// qsort comparator over tposInHT entries: orders them by their word field
// using strcmp, so the words are treated as NUL-terminated strings.
554 int qSortWordsCompareAlpha(const void *arg1, const void *arg2) {
// The casts drop the const qualifier; the entries are only read.
555 tposInHT *a1 = (tposInHT *) arg1;
556 tposInHT *a2 = (tposInHT *) arg2;
557 return strcmp((char*)a1->word, (char *)a2->word);
561 /* BUILDS THE WCSA INDEX */
// Builds the word-level part of the index in two passes over the text.
// Pass 1 tokenizes the text and collects the distinct tokens in a hash
// table; the tokens are then sorted. Pass 2 tokenizes again and writes, for
// every token, its sorted rank + 1 into the integer array SE, which is
// closed with a 0. Finally the vocabulary is packed into wcsa->wordsData and
// an integer index is created over SE.
// NOTE(review): the function frees the text buffer it receives (see the
// free of inputBuffer below), so the caller must not use or free text
// afterwards -- confirm this ownership transfer is documented for callers.
564 int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
566 unsigned long zeroNode; //number of different canonical words.
568 t_hash hash; // the hash table to store both variants and canonical words.
569 tposInHT *posInHT; // structure for canonicals and variants+huffmans
573 uint seSize=0; //it's size == "numberOfValidWords".
574 uint *SE; //Integers vector. (represents the rank of the valid words in the source text).
576 uint totallenWords=0; //The numberOfBytes that occupy canonical words (their ascii version) in memory
579 ulong bytesFile,bytesFileReal;
582 /* used during first pass */
// inputBuffer aliases the caller buffer; nothing is copied.
586 byte* inputBuffer = text;
587 bytesFileReal= bytesFile = length;
589 sourceTextSize=length;
591 /** Initializes WCSA structure*/
593 wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
597 //Estimation (Using Heap's law) of the number of different "meaningful" words.
598 //sizeNValue=N_value;
// The estimate is 3.9 * bytes^0.60, computed on at least 5000000 bytes so
// that small inputs still get a roomy table. bytesFileReal keeps the true
// length for the parsing loops.
599 if(bytesFile<5000000) bytesFile = 5000000;
600 sizeNValue = (unsigned long) floor(3.9* pow(bytesFile,0.60) );
603 // Initializes the arrays used to detect if a char is valid or not.
605 // Initializes the arrays used translated a char into lowercase.
609 // **********************************************************************************
610 //STARTING THE FIRST PASS.
611 // **********************************************************************************
612 printf("\nSTARTING THE FIRST PASS...");
// NOTE(review): posInHT holds sizeNValue entries, which is only an
// estimate; no guard against zeroNode exceeding it is visible -- confirm.
614 posInHT = (tposInHT *) malloc(sizeof(tposInHT) * sizeNValue);
615 hash = initialize_hash (sizeNValue); //hash to contain both the parsed words
617 //-----------------------------------------------------------------
618 //1st pass (processing the file)
620 byte *pbeg,*pend,*wordstart,*aWord;
625 pend = inputBuffer+bytesFileReal;
629 //parsing either a word or separator.
// A token is either a run of _Valid characters (at most MAX_SIZE_OF_WORD)
// or a run of other characters (at most MAX_SIZE_OF_GAP). A single blank
// directly in front of a word is skipped instead of becoming a token.
632 if (_Valid[*pbeg]) { //alphanumerical data
633 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
636 if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
637 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
639 else { //a SPACE comes, so we have to test if next character is alphanumerical or not
641 if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
643 if (_Valid [*pbeg] ) {
644 wordstart = pbeg; //So skipping 1 blank character
645 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
647 else { // a "separator word" ...
648 size++; //the prev BLANK...
649 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
650 }//else { // a "separator word"
651 }//else ... not a unique BLANK AT THE END.
652 }//else ... starting by a BLANK...
655 //The parsed word/separator is is "wordstart", and its length is "size"...
658 //Processement done for each word word
659 i = inHashTable(hash,aWord, size, &addrInTH );
// Registers a token in the hash and in posInHT and assigns it the next
// vocabulary slot (presumably only when the lookup above missed -- confirm).
661 insertElement (hash,aWord, size, &addrInTH);
662 posInHT[zeroNode].slot=addrInTH;
663 posInHT[zeroNode].word=hash->hash[addrInTH].word;
664 hash->hash[addrInTH].posInVoc = zeroNode;
666 totallenWords += size +1; // +1 due to the '\0' char...
667 //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size);
672 fprintf(stderr,"\n1st pass ends: TOTAL distinct words: %lu totalWords = %u",zeroNode,seSize);
677 // **********************************************************************************
679 // **********************************************************************************
681 // Sorting the words alphanumerically (over posInHT)
682 { register unsigned long i,j;
683 //sorting canonical words ...
684 qsort(posInHT, zeroNode, sizeof(tposInHT), qSortWordsCompareAlpha);
686 //setting in hash the new positions of the words in the hash table
// After sorting, each hash slot learns the sorted rank of its word.
687 for (i=0;i<zeroNode;i++) {
688 hash->hash[posInHT[i].slot].posInVoc = i;
692 // INITIALIZING structures for the 2nd pass ......................................
// One extra entry for the terminating 0 written after the second pass.
694 SE = (uint *) malloc ((seSize+1)*sizeof (uint));
698 // **********************************************************************************
699 // STARTING THE SECOND PASS.
700 // **********************************************************************************/
702 printf("\nSTARTING THE SECOND PASS... ");
703 //2nd pass (processing the file)
705 byte *pbeg,*pend,*wordstart,*aWord;
708 register ulong countValidWords = 0;
712 pend = inputBuffer+bytesFileReal;
716 //parsing either a word or separator.
// Same tokenizer as in the first pass, so both passes see the same tokens.
719 if (_Valid[*pbeg]) { //alphanumerical data
720 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
723 if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
724 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
726 else { //a SPACE comes, so we have to test if next character is alphanumerical or not
728 if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
730 if (_Valid [*pbeg] ) {
731 wordstart = pbeg; //So skipping 1 blank character
732 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
734 else { // a "separator word" ...
735 size++; //the prev BLANK...
736 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
737 }//else { // a "separator word"
738 }//else ... not a unique BLANK AT THE END.
739 }//else ... starting by a BLANK...
742 //The parsed word/separator is is "wordstart", and its length is "size"...
745 //Processement done for each word word
746 i = inHashTable(hash,aWord, size, &addrInTH );
// Ids are rank + 1 so that the value 0 stays free for the terminator.
748 SE[countValidWords]=hash->hash[addrInTH].posInVoc+1; // !!!!
// Terminator entry after the last token id.
753 SE[countValidWords] = 0;
754 fprintf(stderr,"\n2nd pass ends: TOTAL distinct words: %lu totalWords = %lu",zeroNode,countValidWords);
758 // **********************************************************************************
760 // **********************************************************************************
762 //freeing the source text (it is no longer needed).
// This releases the buffer that was passed in as the text argument.
763 free(inputBuffer); //the text
765 /** Now Setting the data of the index **/
767 wcsa->sourceTextSize = sourceTextSize;
768 wcsa->seSize = seSize;
770 // Creating the words of the vocabulary...
772 /** copying the words into WCSA. */
773 uint *tmpOffsets = (uint *) malloc (sizeof(uint) * (zeroNode +1) ); //1 extra uint (to point to the virtual "zeroNode+1" ^th word.
779 //Moving data from posInHT to WCSA structure
780 //wcsa->wordsData = (twords *) malloc(sizeof(twords) * zeroNode);
// totallenWords counted one terminator per distinct word, so subtracting
// zeroNode leaves the bytes of the characters alone.
781 wcsa->wordsData.wordsZoneMem.size = totallenWords - zeroNode; //without '\0' bytes (end-tag).
782 wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc ( wcsa->wordsData.wordsZoneMem.size * sizeof(byte) );
783 zoneMem = wcsa->wordsData.wordsZoneMem.zone;
// Words are concatenated in sorted order without separators; the start
// offset of each one is remembered in tmpOffsets.
784 for(i = 0; i < zeroNode; i++) {
785 src = posInHT[i].word; //copying the canonical word
786 //wcsa->wordsData.words[i].word = zoneMem; //setting the pointer
787 tmpOffsets[i]=tmpOffset; //offset in zoneMem
// The copy stops at the first zero byte, so it relies on the hash storing
// each word NUL-terminated.
788 while (*src) {*zoneMem++ = *src++; tmpOffset++;} //moving data until '\0'
789 //*zoneMem='\0'; zoneMem++; //copies also the '\0'
// The extra final offset lets the length of the last word be computed as
// a difference of two consecutive offsets.
792 tmpOffsets[zeroNode]=tmpOffset; //setting pointer to the "virtual" word {zeroNode+1}^{th}
794 //kbit encoding of the offsets
// presumably bits() gives the number of bits needed for the largest
// offset -- confirm against its definition.
795 uint elemSize = bits(tmpOffset);
796 wcsa->wordsData.elemSize = elemSize;
797 wcsa->wordsData.words = (uint *) malloc (((((zeroNode +1)*elemSize)+W-1) /W) * sizeof(uint)); //with 1 extra slot !.
// Clears the last uint of the packed array so its padding bits are defined.
798 wcsa->wordsData.words[((((zeroNode +1)*elemSize)+W-1) /W) -1 ] =0000;
799 // fprintf(stderr,"\n ElemSize = %d, maxOffset = %d",elemSize,tmpOffset);
// NOTE(review): tmpOffset is used as the bit position here, so it must be
// reset and advanced by elemSize around this call -- confirm.
802 for (i=0; i<=zeroNode; i++) { //setting "zeroNode+1" offsets
803 bitwrite(wcsa->wordsData.words, tmpOffset, elemSize, tmpOffsets[i]);
807 //////////// CHECKS IT WORKED. old !!!!
810 // for (i=0; i<zeroNode; i++) { //setting "zeroNode+1" offsets
811 // kk=bitread(wcsa->wordsData.words, i* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
812 // tmpOffset+=elemSize;
813 // if (kk != tmpOffsets[i]) {fprintf(stderr,"\n @@@@@@@@ DISTINTOS OFFSETS "); break;}
814 // else fprintf(stderr,"\n iguales, %d, %d :: <<%s>>len=%d",kk,i, posInHT[i].word, strlen((char*)posInHT[i].word));
818 // { uint len1, len, tmplen, len2;
820 // byte *wcsaWord, *src;
822 // for (p=0;p<zeroNode;p++) {
823 // {//preparing for strcompL
824 // len = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p+1)), wcsa->wordsData.elemSize);
825 // tmplen = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p)) , wcsa->wordsData.elemSize);
827 // //fprintf(stderr,"\n :: off[%d]= %d - off [%d] = %d ==> %d",p+1,len,p,tmplen,len-tmplen);
830 // wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
833 // src = posInHT[p].word;
834 // len1 = strlen((char *)src);
836 // if (strcompL(src,wcsaWord,len1,len2) != 0) {
837 // fprintf(stderr,"\n %6d DISTINTOS !! ===len1 %d,len %d===== <<",p,len1,len2);printWord(src,len1);
838 // fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>");
842 // fprintf(stderr,"\n %6d ======len1 %d,len2 %d===== <<",p,len1,len2);printWord(src,len1);
843 // fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>");
851 //frees memory from hash table and posInHT structures.
857 /** ******* creates the self-index on ints (bottom layer) ==> see build_icsa *********/
862 fprintf(stderr,"\n **** CREATING CSA from Edu's Code *****");
// The integer index is built over seSize+1 entries, terminator included.
864 myicsa = createIntegerCSA(&SE,seSize+1,build_options);
865 wcsa->myicsa= myicsa;
866 total = CSA_size(myicsa);
868 free(SE); //SE is no longer needed, (it is indexed by the iCSA)
// NOTE(review): seSize is a uint but is printed with a signed conversion.
869 printf("\n\t**** [iCSA built on %d words. Size = %ld bytes... RAM",seSize,total);
879 printf("\n\t ** Building done! **\n");
880 printf("\n Process finished!\n");
// Builds the bottom integer index from the se array stored in the given
// structure and attaches it as myicsa, then reports its size.
887 int build_iCSA (char *build_options, void *index)
889 twcsa *wcsa = (twcsa *) index;
890 /********* creates the self-index on ints (bottom layer) *********/
891 //creating CSA from Edu's code...
894 fprintf(stderr,"\n **** CREATING CSA-bottom-layer *****");
895 void *bottomIntIndex;
// seSize+1 entries are indexed, i.e. one more than the word count.
// NOTE(review): this reads wcsa->se, so that field must still hold a live
// array at this point -- confirm against build_WCSA.
896 int err = buildIntIndex(wcsa->se,wcsa->seSize+1, build_options,(void **)&bottomIntIndex);
897 wcsa->myicsa = bottomIntIndex;
899 //total = CSA_size(wcsa->myicsa);
// err is reused for the size query; the build result is replaced here.
900 err = sizeIntIndex((void *) wcsa->myicsa, &total);
// NOTE(review): seSize is printed with a signed conversion, and the
// conversion used for total should match its declared type -- confirm.
902 printf("\n\t**** [iCSA built on %d words. Size = %u bytes... RAM",wcsa->seSize,total);
907 /** ********************************************************************
909 **********************************************************************/
911 /**-----------------------------------------------------------------
913 * Loads all the data structures of WCSA (included the icsa)
914 ----------------------------------------------------------------- */
// Allocates a twcsa, loads the bottom integer index into its myicsa field
// and then loads the remaining structures through loadStructs.
916 twcsa *loadWCSA(char *filename) {
918 // Initializes the arrays used to detect if a char is valid or not.
920 // Initializes the arrays used translated a char into lowercase.
// NOTE(review): the allocation result is dereferenced right below without
// a visible NULL check -- confirm.
923 wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
926 int err = loadIntIndex(filename, (void **)&wcsa->myicsa);
928 loadStructs(wcsa,filename);
933 /** ------------------------------------------------------------------
935 * Reads files and loads all the data needed for searcherFacade
936 ----------------------------------------------------------------- */
// Reads back the constants file and the vocabulary file into wcsa, using
// the same field order and sizes that the save routine writes.
937 void loadStructs(twcsa *wcsa, char *basename) {
// Scratch buffer for prefix + dot + extension + NUL. NOTE(review): the +10
// assumes every extension macro is at most 8 characters -- confirm.
945 filename = (char *)malloc(sizeof(char)*(strlen(basename)+10));
946 fprintf(stderr,"Loading Index from file %s.*\n", basename);
948 //** SOME CONSTANTS: sourceTextSize
949 { strcpy(filename, basename);
950 strcat(filename, ".");
951 strcat(filename, CONSTANTS_FILE_EXT);
953 if( (file = open(filename, O_RDONLY)) < 0) {
954 printf("Cannot open file %s\n", filename);
// NOTE(review): read() results are ignored throughout, so a truncated
// file would leave fields partially filled -- confirm.
958 read(file, &(wcsa->sourceTextSize), sizeof(uint));
959 read(file, &(wcsa->seSize), sizeof(uint));
963 /** File with the words from the vocabulary (sorted alphabetically) */
966 strcpy(filename, basename);
967 strcat(filename, ".");
968 strcat(filename, VOCABULARY_WORDS_FILE_EXT);
969 //sizeFile= fileSize(filename)-sizeof(uint);
971 if( (file = open(filename, O_RDONLY)) < 0) {
972 printf("Cannot open file %s\n", filename);
976 //the number of canonical words
// Header: word count n, bits per offset, byte size of the text zone.
977 read(file, &n, sizeof(uint));
979 read(file, &(wcsa->wordsData.elemSize), (sizeof(uint)));
980 read(file, &(wcsa->wordsData.wordsZoneMem.size), (sizeof(uint)));
982 //allocating the memory needed for all words and reading them //(ascii) << no \0 chars are needed>>.
// The zone size comes straight from the file and is not validated here.
983 wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc(wcsa->wordsData.wordsZoneMem.size * sizeof(byte));
984 read(file, (wcsa->wordsData.wordsZoneMem.zone), wcsa->wordsData.wordsZoneMem.size * sizeof(byte) );
986 //reading the offsets of the words (kbitArray that points to offsets in zoneMem of words.
// Packed offsets: n+1 entries of elemSize bits, rounded up to whole uints.
987 wcsa->wordsData.words = (uint *) malloc (((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * sizeof(uint));
// The last uint is cleared first; the read that follows then fills the
// whole array with the stored data.
988 wcsa->wordsData.words[ ((((n+1)*(wcsa->wordsData.elemSize))+W-1) /W) -1 ] =0000;
989 read(file, (wcsa->wordsData.words), ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)));
1001 /** ****************************************************************
1002 * Querying the index WCSA
1003 * ***************************************************************/
1004 ///////////////////////////////////////////////////////////////////////////////////////
1005 // FUNCTIONS NEEDED FOR SEARCHING A PATTERN //
1006 ///////////////////////////////////////////////////////////////////////////////////////
1010 /*------------------------------------------------------------------
1011 * Given a text pattern translates it into a list of integers (corresponding to the
1012 * canonical words associated to the valid words in the text pattern)
1013 ------------------------------------------------------------------*/
// Translates a text pattern into a sequence of vocabulary ids. The pattern
// is tokenized into words and separators, each token is looked up in the
// sorted vocabulary by binary search, and its rank + 1 is appended to
// integerPattern. If any token is missing from the vocabulary the result
// length is set to 0.
1014 void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *integerPattern, uint *sizeIntegers) {
1016 byte *pbeg,*pend,*wordstart,*aWord;
1017 register unsigned long size;
1021 pend = pbeg + patLen;
1023 while (pbeg <pend) {
1024 //parsing either a word or separator.
// A token is either a run of _Valid characters (at most MAX_SIZE_OF_WORD)
// or a run of other characters (at most MAX_SIZE_OF_GAP). A single blank
// directly in front of a word is skipped instead of becoming a token.
1027 if (_Valid[*pbeg]) { //alphanumerical data
1028 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
1031 if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
1032 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
1034 else { //a SPACE comes, so we have to test if next character is alphanumerical or not
1036 if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
1038 if (_Valid [*pbeg] ) {
1039 wordstart = pbeg; //So skipping 1 blank character
1040 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
1042 else { // a "separator word" ...
1043 size++; //the prev BLANK...
1044 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
1045 }//else { // a "separator word"
1046 }//else ... not a unique BLANK AT THE END.
1047 }//else ... starting by a BLANK...
1050 //The parsed word is "aWord", and its length is "size"...
1053 // Binary search on the canonical words (wordsData)
1057 register uint min,max,p;
// NOTE(review): n-1 wraps around for an empty vocabulary -- confirm that
// n is never 0 here.
1059 max = (wcsa->n) - 1;
1063 {//preparing for strcompL
// Reads the two offsets that delimit the p-th word in the character zone.
1064 len = bitread (wcsa->wordsData.words, (p+1)* wcsa->wordsData.elemSize , wcsa->wordsData.elemSize);
1065 tmplen = bitread (wcsa->wordsData.words, (p )* wcsa->wordsData.elemSize , wcsa->wordsData.elemSize);
1067 wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
1070 //if(strncmp((char*)aWord, (char*)wcsa->wordsData[p].word,size) > 0) min = p+1;
// The token sorts after the probed word: continue in the upper half.
1071 if(strcompL(aWord, wcsaWord, size, len) > 0) min = p+1;
1075 // { //SHOW PROGRESS
1076 // fprintf(stderr,"\n Patron = <<%s>>, curposWord= %d ==><<",aWord,p);
1077 // printWord(wcsaWord,len); fprintf(stderr,">> len =%d",len);
1082 {//preparing for strcompL
// Final check on the candidate position left by the search.
1083 len = bitread (wcsa->wordsData.words, (min+1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1084 tmplen = bitread (wcsa->wordsData.words, ( min )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1086 wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
1089 // if(!strncmp((char*)aWord, (char*)wcsa->wordsData[min].word, size)) {
1090 if(!strcompL(aWord, wcsaWord, size, len)) {
// Ids are rank + 1. NOTE(review): no bound on index is visible, while
// callers pass arrays of MAX_INTEGER_PATTERN_SIZE entries -- confirm long
// patterns cannot overflow them.
1091 integerPattern[index++] = min +1 ; //<--
1093 else {*sizeIntegers = 0; return;} // a valid word that does not appear in the source text.
1097 *sizeIntegers = index;
1099 // //shows the parsed words:
1101 // printf("\n\n >>%s>> HA SIDO PARSEADO COMO:",textPattern);
1102 // for (i=0; i<index;i++) {
1103 // printf("<<%s>>",wcsa->wordsData[integerPattern[i] -1].word);
1113 /** ------------------------------------------------------------------
1114 * Returns the number of occurrences of a given text pattern
* The pattern (read up to its terminating NUL, see strlen below) is turned
* into a sequence of word ids by parseTextIntoIntegers(); that sequence is
* then handed to countIntIndex() on the integer index wcsa->myicsa.
* Returns -1 when the parsing step produces no ids.
* NOTE(review): the declarations of numocc/left/right and the final return
* statement are not among the lines shown here.
1115 *------------------------------------------------------------------ */
1116 int countTextOcurrences(twcsa *wcsa, byte *textPattern) {
1118 uint integerPatterns[MAX_INTEGER_PATTERN_SIZE]; // word ids produced by parsing the pattern
1119 uint integerPatternSize, min, max; // NOTE(review): min and max are not used in the visible lines
1121 uint lenpat = strlen((char*)textPattern); // textPattern must be NUL-terminated
1122 parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize);
1123 if (!integerPatternSize) return -1; // nothing to search for (the parser sets the size to 0 e.g. for a word missing from the vocabulary)
1127 // printf("\n %d Integers to search for:",integerPatternSize );
1128 // for (i=0;i<integerPatternSize;i++) {
1129 // printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word);
1135 int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, &numocc, &left, &right); // numocc, left and right are passed by address, i.e. filled by the integer index
1141 /** ------------------------------------------------------------------
1142 * locateTextOcurrences:
1143 * Returns the offsets of the source text where a word/phrase appears
1144 * Returns also the number of occurrences.
* The NUL-terminated pattern is parsed into word ids; locateIntIndex() then
* provides a buffer of positions plus their count, and locateFacade() is
* called with that same buffer as both its input and its output.
* When parsing yields no ids, *numberOccurrences is set to -1 and NULL is
* returned; otherwise *numberOccurrences receives the count and the buffer
* is returned.
* NOTE(review): the buffer comes from locateIntIndex(); presumably the
* caller must free it -- confirm.
1145 *------------------------------------------------------------------ */
1146 uint *locateTextOcurrences(twcsa *wcsa, byte *textPattern, int *numberOccurrences) {
1147 uint integerPatterns[MAX_INTEGER_PATTERN_SIZE]; // word ids produced by parsing the pattern
1148 uint integerPatternSize, min, max; // NOTE(review): min and max are not used in the visible lines
1150 uint lenpat = strlen((char*)textPattern); // textPattern must be NUL-terminated
1151 parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize);
1152 if (!integerPatternSize) {*numberOccurrences = -1; return NULL;} // empty parse result: report -1 and no buffer
1156 // printf("\n %d Integers to search for:",integerPatternSize );
1157 // for (i=0;i<integerPatternSize;i++) {
1158 // printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word);
1163 ulong occurrences, left, right; // NOTE(review): left and right are not used in the visible lines
1165 ulong *sourceOffsets; // NOTE(review): declared ulong* but cast to uint* below and on return -- confirm the element width
1167 //obtains the indexes in vector SE where the pattern appears.
1168 //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences);
1169 int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences); // NOTE(review): err is not checked in the visible lines
1171 //sourceOffsets = (uint *) malloc (sizeof(uint)*occurrences);
1173 sourceOffsets=seOffsets; // no second buffer: the translation below is meant to happen in place
1174 //obtains the offsets in the source text of the pattern (sourceOffsets)
1175 locateFacade(wcsa, (uint *)sourceOffsets, (uint *)seOffsets,occurrences); // NOTE(review): the locateFacade shown in this file only returns 99, so the buffer would keep the values written by locateIntIndex
1178 fprintf(stderr,"\n*** %s appears in the source text in positions:\n\t",textPattern);
1179 for (i=0;i<occurrences;i++)
1180 fprintf(stderr,"[%u]",sourceOffsets[i]); // NOTE(review): %u is used with an element of a ulong array
1184 *numberOccurrences = occurrences; // ulong count narrowed to int
1185 return (uint *) sourceOffsets;
1189 /** ------------------------------------------------------------------
1190 * displayTextOcurrences:
1191 * Shows in stdout, the text around the occurrences of a word/phrase
1192 * Returns also the number of occurrences.
* NOTE: the lines above describe the intended behaviour only. The body
* shown here is a stub that ignores its arguments and returns 99.
1193 *------------------------------------------------------------------ */
1194 int displayTextOcurrences(twcsa *wcsa, byte *textPattern, uint radixDisplay) {
1195 return 99; //not implemented: function not available
1198 /** ------------------------------------------------------------------
1200 * For given sePositions, returns the sourceTextPositions
1201 * where those valid words in se[sePositions[i]] occur.
* NOTE: the body shown here is a stub; it leaves both arrays untouched and
* returns 99.
1202 *------------------------------------------------------------------*/
1203 int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint number) {
1204 return 99; //not implemented: function not available for this index
1208 /** ------------------------------------------------------------------
1210 * Returns the subString from a starting offset to a final offset
1211 * in the source text. It does not allocate any memory, receives "dstptr"
1212 * Precondition: offsetIni >=0;
* NOTE: the body shown here is a stub; it writes nothing to *length or
* dstptr and returns 99.
1213 ------------------------------------------------------------------*/
1214 int displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length, byte *dstptr) {
1215 return 99; //not implemented: function not available for this index
1219 /**------------------------------------------------------------------
1220 * DISPLAYFacadeMalloc:
1221 * Returns the subString from a starting offset to a final offset
1222 * in the source text. It allocates Memory !!
1223 * NOT CURRENTLY USED
* NOTE: in the lines shown here nothing is allocated; the only visible
* statement initialises the result pointer to NULL.
* NOTE(review): the return statement is not among the visible lines.
1224 ------------------------------------------------------------------*/
1225 byte *displayFacadeMalloc (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length) {
1226 byte *dstptr=NULL; //not implemented: function not available
1231 /** ------------------------------------------------------------------
1232 * LOCATEALLandDISPLAY:
1233 * Displays the text around an occurrence of the searched word in the source text.
1234 * Assuming that $p$ is that position --> shows only chars in [p_radix-1,p_radix]
* NOTE: the body shown here is a stub that ignores its arguments and
* returns 99.
1235 ------------------------------------------------------------------*/
1236 int locateAllAndDisplay (twcsa *wcsa, uint *sePositions, uint number, int radix) {
1237 return 99; //not implemented: function not available for this index
1242 /** ------------------------------------------------------------------
1243 * recovers the source text by calling display(0,fileSize);
* In the lines shown here the text is rebuilt word by word: for each
* position indexSE below wcsa->seSize, displayIntIndex() yields a word id,
* the word's bytes are located in the vocabulary zone and copied to an
* output buffer, which is finally written to the file basename+ext.
* NOTE(review): several statements of this function (declarations, the
* advance of indexSE, separator handling, cleanup) are not visible here.
1244 * ------------------------------------------------------------------ */
1245 void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {
1249 char *filename = (char *) malloc (strlen( basename)+ strlen(ext)+1); // room for basename + ext + terminating NUL
1252 strcpy( filename, basename);
1253 strcat( filename, ext);
1254 filename[strlen( basename)+ strlen(ext)]='\0'; // strcat has already terminated the string; this rewrites the same NUL
1255 fprintf(stderr,"\n uncompressing text into file -->%s ",filename);fflush(stderr);
1259 salida = fopen( filename,"w"); // NOTE(review): no check of the fopen result is visible
1260 start=0; end = sourceTextSize-1;
1262 cc = (byte *) malloc (sourceTextSize* sizeof(uchar)); // output buffer sized to the whole source text
1265 uint i, j;//, tmplen;
1276 while ( (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
1278 int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue); // fetches the value stored at position indexSE of the integer index
1280 {//obtains pointer to the ith word
1282 uint ith = posSEValue -1; // !! converts the fetched value to a 0-based vocabulary slot (presumably ids are stored 1-based, as the pattern parser emits min+1)
1283 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); // start offset of the following word
1284 offtmp = bitread (wcsa->wordsData.words, ( ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); // start offset of this word
1285 tmplen -=offtmp; //the length of the ith word.
1286 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp; // first byte of the word inside the vocabulary zone
1294 prevValid =1; //for the next iteration
1300 for (j=0;j<tmplen;j++) {*dst++ = *src++;} //copies word to the output buffer
1304 fprintf(stderr,"\n sourceTextSize = %d, len = %d",sourceTextSize,leng); // NOTE(review): %d is used for the uint sourceTextSize
1305 fwrite(cc,sizeof(byte),leng,salida); // dumps leng bytes of the rebuilt text to the output file
1315 //recovers the source text by calling extract Words.
// A single extractWords() call over the range [0, wcsa->seSize] produces a
// buffer (cc) and its length; that buffer is written to the file basename+ext.
// NOTE(review): the declarations of salida/cc/length and any cleanup
// (fclose/free) are not among the visible lines.
1316 void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {
1318 int start;int end; int error;
1319 char *filename = (char *) malloc (strlen( basename)+ strlen(ext)+1); // room for basename + ext + terminating NUL
1323 strcpy( filename, basename);
1324 strcat( filename, ext);
1325 filename[strlen( basename)+ strlen(ext)]='\0'; // strcat has already terminated the string; this rewrites the same NUL
1326 fprintf(stderr,"\n uncompressing text into file -->%s ",filename);fflush(stderr);
1330 salida = fopen( filename,"w"); // NOTE(review): no check of the fopen result is visible
1331 start=0; end = wcsa->seSize; // word range passed to extractWords
1333 error = extractWords((void *) wcsa, start, end, &cc, &length); // cc and length are passed by address, i.e. filled by extractWords
1334 if (error) {fprintf(stderr,"\n error during recoverSourceText2"); exit(0);} // NOTE(review): terminates the process with status 0 even though this is an error path
1336 fprintf(stderr,"\n sourceTextSize = %d, len = %ld",sourceTextSize,length); // sourceTextSize is only used for this report
1337 fwrite(cc,sizeof(byte),length,salida); // writes the extracted text to the output file
1344 /** *******************************************************************************
1345 * Showing some statistics and info of the index
1346 * *******************************************************************************/
1347 void printInfoReduced(twcsa *wcsa) { // placeholder: no statement of the body is visible besides the note below
1348 //not implemented: function not available
1351 /* Shows summary info of the index */
/* Prints to stdout the sizes of the presentation layer (word vocabulary and
 * the twcsa structure) and then delegates to printInfoIntIndex() for the
 * integer index. Any non-zero code from index_size(), sizeIntIndex() or
 * printInfoIntIndex() is returned immediately.
 * NOTE(review): the declaration of err and the final return statement are
 * not among the visible lines. */
1352 int printInfo(void *index) {
1355 twcsa *wcsa = (twcsa *) index; // the opaque handle is a twcsa
1357 unsigned long indexSize;
1358 uint intIndexSize, presentationSize;
1361 err = index_size(index, &indexSize); // size of the whole index
1362 if (err!=0) return err;
1363 err = sizeIntIndex(wcsa->myicsa, &intIndexSize); // size of the integer index alone
1364 if (err!=0) return err;
1366 presentationSize = indexSize - intIndexSize; // what remains is attributed to the presentation layer
1368 printf("\n ===================================================:");
1369 printf("\n Summary of Presentation layer:");
1370 printf("\n Number of valid words (SEsize) = %u",wcsa->seSize);
1371 printf("\n Number of different words = %ld",wcsa->n);
1372 printf("\n WCSA structure = %d bytes", sizeof(twcsa)); // NOTE(review): sizeof yields size_t but the format is %d
1374 uint totalpointers = ((((wcsa->n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)); // n+1 entries of elemSize each, rounded up to whole W-sized units (W is presumably the bit width of a uint -- confirm)
1375 uint totalasciizone = wcsa->wordsData.wordsZoneMem.size * sizeof(byte) ; // bytes of the concatenated word text
1376 uint totalwords = totalasciizone + totalpointers;
1378 printf("\n Size Of words structure (%d bytes):",totalwords); // NOTE(review): %d with a uint argument
1379 printf("\n [ pointers = %d bytes || AsciiZone = %d bytes", totalpointers, totalasciizone);
1381 printf("\n\n Total = ** %u bytes (in RAM) **",presentationSize);
1382 //printf("\n\n @@ Summary of self-index on Integers:");
1383 err = printInfoIntIndex(wcsa->myicsa, " "); // the integer index prints its own summary
1384 if (err!=0) return err;
1386 printf("\n ===================================================:");
1391 /**------------------------------------------------------------------
1393 * Counts the memory amount needed by the Facade (Presentation Layer).
1394 * skipping the stop_words hash table
* NOTE: the body shown here is a stub; it ignores wcsa and always
* returns 0.
1395 ----------------------------------------------------------------- */
1396 uint structsSizeMem(twcsa *wcsa) {
1397 return 0; //not implemented: function not available for this index.
1401 /** for debugging **/
/* Writes bytes of str to stderr as characters.
 * NOTE(review): the loop that drives i (presumably from 0 to len-1) is not
 * among the visible lines. */
1402 void printWord(uchar *str, uint len) {
1405 fprintf(stderr,"%c",str[i]); // one byte of the word, printed as a character
1409 /** saves the content of the file SE (ids of the source words) **/
/* Writes the n uint values of v, as raw bytes, to the file
 * "<basename>.<SE_FILE_EXT>". An existing file with that name is removed
 * first.
 * NOTE(review): the declaration of file, the failure branch after the
 * message, the close of the descriptor and the return value are not among
 * the visible lines. */
1410 int saveSEfile (char *basename, uint *v, uint n) {
1411 char outfilename[255]; // NOTE(review): fixed size; the sprintf below does not bound the length of basename
1413 sprintf(outfilename,"%s.%s",basename,SE_FILE_EXT);
1414 unlink(outfilename); // drop any previous file so the new one starts empty
1415 if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) { // created with rwx for user and group
1416 printf("Cannot open file %s\n", outfilename);
1420 write(file, v, sizeof(uint) * n ); // NOTE(review): the result of write is not checked
/* Returns the CPU time consumed so far by the calling process, in seconds:
 * the sum of the user and system times reported by getrusage(RUSAGE_SELF). */
1426 double getTime2 (void)
1428 double usertime, systime;
1429 struct rusage usage;
1431 getrusage (RUSAGE_SELF, &usage); // NOTE(review): the return value is not checked
1433 usertime = (double) usage.ru_utime.tv_sec +
1434 (double) usage.ru_utime.tv_usec / 1000000.0; // microseconds converted to seconds
1435 systime = (double) usage.ru_stime.tv_sec +
1436 (double) usage.ru_stime.tv_usec / 1000000.0; // microseconds converted to seconds
1438 return (usertime + systime);
1443 /**------------------------------------------------------------------
1445 *------------------------------------------------------------------ */
1446 #ifdef FACADEWITHMAIN
1447 int main(int argc, char* argv[])
1452 char *infile, *outbasename, *stopwordsfile; // Name of in/out files
1460 printf("\n*Word-based iCSA: A word-based CSA");
1461 printf("\n*CopyRight (c) 2008 [LBD & G.N.]\n\n");
1463 // Reads input parameters from command line.
1465 printf("Use: %s <in file> <out basename> \n", argv[0]);
1469 // Reads params (input file, output basename, and stopwords file)
1471 outbasename = argv[2];
1472 stopwordsfile = argv[3];
1474 finsize= fileSize(infile);
1477 printf( "\nFILE EMPTY OR FILE NOT FOUND %s !!\nSkipping processement ...\n",infile);
1481 // Opening the input text file.
1482 if( (f_in = open(infile, O_RDONLY)) < 0) {
1483 printf("Cannot read file %s\n", infile);
1486 inputBuffer = (byte *) malloc(finsize *sizeof(byte));// +1);
1487 read (f_in,inputBuffer,finsize);
1492 //printf("\n parametros <<%s>>\n\n",stopwordsfile);
1493 build_index (inputBuffer, finsize, stopwordsfile, &Index); /** building the index */
1495 // /** recovering the source text from the index */
1500 get_length(Index, &size);
1501 char extension[10]= ".source";
1503 //recoverSourceText1((twcsa*) Index, outbasename,extension, size);
1504 strcat(extension,"2");
1505 recoverSourceText2((twcsa*) Index, outbasename,extension,size);
1507 fprintf(stderr, "\nRecovering source file time: %.3f secs\n", end-start );
1510 // DISPLAYING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
1511 {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1513 ulong numocc,numc, length, i, *snippet_len, tot_numcharext = 0, numpatt;
1514 uchar *pattern, *snippet_text;
1516 pattern = textPattern;
1517 printf("\nSEARCH TEST for DISPLAY (pizzachili interface)\n");
1519 printf("Intro string: ");
1520 fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1521 if (!strcmp((char*)textPattern,"\n") ) break;
1522 textPattern[strlen((char*)textPattern)-1] = '\0';
1524 length = strlen( (char*)textPattern);
1527 // error = display (Index, textPattern, length, numc, &numocc,
1528 // &snippet_text, &snippet_len);
1529 error = displayWords (Index, textPattern, length, numc, &numocc,
1530 &snippet_text, &snippet_len,1);
1532 if (error){ fprintf(stderr, "%s\n", "Hubo un error durante display");exit(0);}
1534 fprintf(stderr,"\n acabou display");fflush(stderr);
1536 ulong j, len = length + 2*numc;
1538 fprintf(stderr,"\n length = %d",length);
1539 fprintf(stderr,"\n pattern = %s",pattern);fflush(stderr);
1540 fprintf(stderr,"\n numocc = %d",numocc);fflush(stderr);
1541 fprintf(stderr,"\n snippet len = %d",len);fflush(stderr);
1542 fprintf(stderr,"\n =========");fflush(stderr);
1543 for (i = 0; i < numocc; i++){
1544 fprintf(stderr,"\n[%2d][len=%3d]<<",i+1,snippet_len[i]);fflush(stderr);
1545 fwrite(snippet_text+len*i,sizeof(uchar),snippet_len[i],stderr);fflush(stderr);
1546 fprintf(stderr,">>");fflush(stderr);
1551 for(i=0; i<numocc; i++) {
1552 tot_numcharext += snippet_len[i];
1557 free (snippet_text);
1560 printf("Ocurrences = %d\n", numocc);
1561 if (!strcmp((char*)textPattern,"\n") ) break;
1567 // // SEARCHING FOR A TEXT PATTERN (word/phrase).
1568 // {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1573 // printf("\nSEARCH TEST for LOCATE\n");
1575 // printf("Intro string: ");
1576 // fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1577 // len = strlen((char*)textPattern);
1578 // if (!strcmp((char*)textPattern,"\n") ) break;
1579 // textPattern[len-1] = '\0';
1582 // //occs = locateTextOcurrences(wcsa,textPattern,&occ);
1583 // // locate(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ);
1584 // locateWord(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ, 0);
1586 // printf("\n*** %s occurs %d times: In the source text in positions:\n\t",textPattern,occ);
1587 // for (i=0;i<occ;i++)
1588 // printf("[%u]",occs[i]);
1592 // if (!strcmp((char*)textPattern,"\n") ) break;
1598 // COUNTING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
1600 {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1603 printf("\nSEARCH TEST for COUNT.\n");
1605 printf("Intro string: ");
1606 fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1607 len = strlen((char*)textPattern);
1608 if (!strcmp((char*)textPattern,"\n") ) break;
1609 textPattern[len-1] = '\0';
1612 count(Index, textPattern, len, (ulong *)&occ);
1613 //occ = countTextOcurrences(wcsa,textPattern);
1614 printf("Ocurrences = %d\n", occ);
1617 printf("\n END COUNTING OCCURRENCES OF PATTERNS. ...\n");
1621 /** saving the index to disk*/
1622 save_index (Index, outbasename);
1624 /** tells the mem used by the index */
1626 index_size(Index, &indexsize);
1627 fprintf(stderr,"Index occupied %d bytes, 2 extra mallocs = %d",indexsize,2* sizeof(uint));
1629 /** freeing the index */