1 #include "buildFacade.h"
2 #include "utils/errors.c"
5 /** Building the index */
7 /* Creates index from text[0..length-1]. Note that the index is an
8 opaque data type. Any build option must be passed in string
9 build_options, whose syntax depends on the index. The index must
10 always work with some default parameters if build_options is NULL.
11 The returned index is ready to be queried. */
// Two-step build: build_WCSA receives the text and the index out-parameter,
// then build_iCSA is run on *index with the same build_options. The status of
// each step is stored in returnvalue.
// NOTE(review): build_options goes straight into a %s conversion although the
// contract above allows NULL; passing NULL to %s is undefined behaviour.
// NOTE(review): the message is written with printf (stdout) but the stream
// flushed right after it is stderr.
12 int build_index (uchar *text, ulong length, char *build_options, void **index) {
15 printf("\n parameters: \"%s\"\n",build_options); fflush(stderr);
17 returnvalue = build_WCSA (text, length, build_options, index);
20 returnvalue = build_iCSA (build_options,*index);
26 /** Saves index on disk by using single or multiple files, having
28 int save_index (void *index, char *filename) {
// Writes several files that share filename as their base name: a constants
// file (sourceTextSize, seSize), a vocabulary file, and whatever saveIntIndex
// and saveSEfile store for the integer index and for wcsa->se.
30 char *basename = filename;
31 twcsa *wcsa=(twcsa *) index;
38 printf("\n Saving structures to disk: %s.*",filename);
// room for basename, the dot, the extension and the terminator;
// assumes every file extension macro expands to at most 8 characters - TODO confirm
39 outfilename = (char *)malloc(sizeof(char)*(strlen(basename)+10));
41 /**File with some constants (sourceTextSize and seSize) */
43 strcpy(outfilename, basename);
44 strcat(outfilename, ".");
45 strcat(outfilename, CONSTANTS_FILE_EXT);
// NOTE(review): O_CREAT is used without O_TRUNC - if the file already exists
// and is longer than what is written now, its old tail survives.
47 if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
48 printf("Cannot open file %s\n", outfilename);
// NOTE(review): the results of write() are never checked. Both fields are
// written as sizeof(uint) bytes, the same width loadStructs reads back.
51 write(file, &(wcsa->sourceTextSize), sizeof(uint));
52 write(file, &(wcsa->seSize), sizeof(uint));
56 /** The Words in the vocabulary of words (sorted alphabetically)*/
57 { strcpy(outfilename, basename);
58 strcat(outfilename, ".");
59 strcat(outfilename, VOCABULARY_WORDS_FILE_EXT);
61 if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
62 printf("Cannot open file %s\n", outfilename);
67 uint elemSize = wcsa->wordsData.elemSize;
//header of the vocabulary file: n, bits per offset, size of the characters zone
68 write(file, &n, sizeof(uint));
69 write(file, &elemSize, sizeof(uint));
70 write(file, &(wcsa->wordsData.wordsZoneMem.size), sizeof(uint));
72 //the characters of the words, followed by the packed array of their offsets
73 write(file, (char *)wcsa->wordsData.wordsZoneMem.zone, wcsa->wordsData.wordsZoneMem.size * sizeof(byte));
74 write(file, (char *)wcsa->wordsData.words, ((((n+1)* (elemSize))+W-1) /W) * (sizeof(uint)) );
82 /******** saves index on integers (bottom) ******/
84 //storeStructsCSA(wcsa->myicsa,basename);
85 saveIntIndex((void *) wcsa->myicsa, basename);
//seSize+1 entries: the same count that build_iCSA passes to buildIntIndex
89 saveSEfile(basename,wcsa->se, wcsa->seSize+1);
98 /** Loads index from one or more file(s) named filename, possibly
99 adding the proper extensions. */
// Delegates the work to loadWCSA and publishes the resulting twcsa through
// *index as an opaque pointer.
100 int load_index(char *filename, void **index){
102 wcsa = loadWCSA (filename);
103 (*index) = (void *) wcsa;
107 /** Frees the memory occupied by index. */
// Prints the size obtained from index_size, then releases the integer index
// through freeIntIndex and the two vocabulary arrays of the twcsa.
108 int free_index(void *index){
109 twcsa *wcsa=(twcsa *) index;
//the size is queried only for the message printed below
111 index_size(index,&size);
112 printf("\n[destroying index] ...Freed %lu bytes... RAM", size);
115 //frees the array SE.
121 //destroyStructsCSA(wcsa->myicsa);
122 int err = freeIntIndex((void *) wcsa->myicsa);
//the vocabulary: the characters of the words and their packed offsets
126 free (wcsa->wordsData.wordsZoneMem.zone);
127 free (wcsa->wordsData.words); /** huge!! */
129 //the pointer to wcsa.
134 /** Gives the memory occupied by index in bytes. */
// Terms that are added up here: the twcsa struct itself, the vocabulary
// (packed offsets plus word characters, accumulated in totaltmp) and the size
// that sizeIntIndex reports for the integer index through nbytes.
135 int index_size(void *index, ulong *size) {
137 twcsa *wcsa=(twcsa *)index;
140 *size += sizeof(twcsa);
143 totaltmp += ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)); //the offsets: (n+1) fields of elemSize bits, rounded up to a multiple of W bits and counted as uints
144 totaltmp += wcsa->wordsData.wordsZoneMem.size * sizeof(byte); //the characters of the words.
149 int err = sizeIntIndex((void *) wcsa->myicsa, &nbytes);
151 //*size += CSA_size(wcsa->myicsa);
158 /** Querying the index =============================================================*/
160 /* Writes in numocc the number of occurrences of the substring
161 pattern[0..length-1] found in the text indexed by index. */
// The pattern is first translated into a sequence of integers by
// parseTextIntoIntegers; countIntIndex then counts that sequence in the
// integer index and writes the result into numocc.
162 int count (void *index, uchar *pattern, ulong length, ulong *numocc){
163 uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
164 uint integerPatternSize;
167 twcsa *wcsa=(twcsa *) index;
168 parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);
169 if (!integerPatternSize) {*numocc=0; return 0;} //no integers produced: reported as zero occurrences
171 //*numocc = countCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &l, &r);
172 int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, numocc, &l, &r);
176 /* Writes in numocc the number of occurrences of the substring
177 pattern[0..length-1] in the text indexed by index. It also allocates
178 occ (which must be freed by the caller) and writes the locations of
179 the numocc occurrences in occ, in arbitrary order. */
180 int locate(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc){
184 /* Gives the length of the text indexed */
// Copies wcsa->sourceTextSize into *length; that field is set from the text
// length in build_WCSA and read back from the constants file in loadStructs.
185 int get_length(void *index, ulong *length) {
186 twcsa *wcsa=(twcsa *) index;
187 *length = wcsa->sourceTextSize;
191 /** Obtains the length of the text indexed by index. */
// Thin alias: forwards both arguments to get_length and returns its status.
193 int length (void *index, ulong *length) {
194 return (get_length(index,length));
198 /** ***********************************************************************************
199 * Accessing the indexed text
200 * ***********************************************************************************/
203 /** Allocates snippet (which must be freed by the caller) and writes
204 the substring text[from..to] into it. Returns in snippet_length the
205 length of the text snippet actually extracted (that could be less
206 than to-from+1 if to is larger than the text size). */
207 int extract (void *index, ulong from, ulong to, uchar **snippet, ulong *snippet_length) {
208 twcsa *wcsa=(twcsa *) index;
212 /** Displays the text (snippet) surrounding any occurrence of the
213 substring pattern[0..length-1] within the text indexed by index.
214 The snippet must include numc characters before and after the
215 pattern occurrence, totalizing length+2*numc characters, or less if
216 the text boundaries are reached. Writes in numocc the number of
217 occurrences, and allocates the arrays snippet_text and
218 snippet_lengths (which must be freed by the caller). The first is a
219 character array of numocc*(length+2*numc) characters, with a new
220 snippet starting at every multiple of length+2*numc. The second
221 gives the real length of each of the numocc snippets. */
223 int display (void *index, uchar *pattern, ulong length, ulong numc,
224 ulong *numocc, uchar **snippet_text, ulong **snippet_lengths) {
230 /** ***********************************************************************************
231 * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord
232 * ***********************************************************************************/
233 /* Writes in numocc the number of occurrences of the substring
234 pattern[0..length-1] in the text indexed by index. It also allocates
235 occ (which must be freed by the caller) and writes the locations of
236 the numocc occurrences in occ, in arbitrary order. These occurrences
237 refer to the offsets in TOH where the caller could start a display
238 operation. So locateWord implies synchronization using B.
239 Moreover, positions occ[numocc.. 2*numocc-1] is set with the rank in SE of the
240 words whose codes begin in TOH in the positions in occ[0... numocc-1]
241 ** Parameter kbefore sets locateWord not to obtain the offset in TOH of the
242 searched word, but the offset in TOH of k-before words before.
245 int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore){
// Translates the pattern with parseTextIntoIntegers and asks locateIntIndex
// for the positions of that integer sequence; the array it returns is handed
// to the caller through *occ and the number of hits through *numocc.
// NOTE(review): kbefore does not seem to be used in this function, and neither
// TOH nor B from the description above appear in the code - the description
// may be stale; confirm.
// NOTE(review): when the pattern yields no integers *occ is left untouched,
// whereas the zero-occurrences path further down sets it to NULL.
246 uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
247 uint integerPatternSize;
248 ulong occurrences,l,r;
249 twcsa *wcsa=(twcsa *) index;
251 parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);
252 if (!integerPatternSize) {*numocc=0; return 0;} //not found
256 //obtains the indexes in vector SE where the pattern appears.
257 //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences);
258 int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences);
260 *numocc = occurrences;
262 if (!occurrences) {(*occ)=NULL;return 0;}
//ownership of the array filled in by locateIntIndex passes to the caller
264 (*occ) = (ulong *)seOffsets;
269 /** Displays the text (snippet) surrounding any occurrence of the
270 substring pattern[0..length-1] within the text indexed by index.
271 The snippet must include numc characters before and after the
272 pattern occurrence, totalizing length+2*numc characters, or less if
273 the text boundaries are reached. Writes in numocc the number of
274 occurrences, and allocates the arrays snippet_text and
275 snippet_lengths (which must be freed by the caller). The first is a
276 character array of numocc*(length+2*numc) characters, with a new
277 snippet starting at every multiple of length+2*numc. The second
278 gives the real length of each of the numocc snippets. */
// Word-based variant: occurrences are obtained as positions in SE through
// locateWord, and every snippet is rebuilt word by word from the vocabulary,
// starting kbefore positions before the occurrence.
// NOTE(review): the early returns on allocation failure do not release
// indexesInSE (nor *snippet_lengths in the second case).
280 int displayWords (void *index, uchar *pattern, ulong length, ulong numc,
281 ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore) {
283 /** actually extracts upto length + 2*numc chars, starting extraction kbefore
284 * words before the occurrence **/
288 uint bytesPerSnippet;
290 twcsa *wcsa=(twcsa *) index;
//locateWord is always called with kbefore 0; kbefore is applied below on the SE positions
292 locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);
293 (*numocc) = occurrences;
297 *snippet_lengths =NULL;
//fixed capacity reserved for each snippet in snippet_text
301 bytesPerSnippet = length+2*numc;
302 // bytesPerSnippet = 2*numc;
303 *snippet_lengths = (ulong *) malloc((*numocc)*sizeof(ulong));
304 if (!(*snippet_lengths)) return 1;
305 *snippet_text = (uchar *) malloc((*numocc)*(bytesPerSnippet)*sizeof(uchar) +1) ; //(the last "1" is for '\0');
306 if (!(*snippet_text)) return 1;
308 // fprintf(stderr,"\n occs found = %7d for pattern %s",*numocc, pattern);
311 text_aux=*snippet_text;
317 uint posSEValue,indexSE;
319 for (i=0;i<occurrences;i++) {
323 /** decodes words from there */
325 indexSE = indexesInSE[i];
//step kbefore positions back, clamped at position 0
326 indexSE = (indexSE > kbefore) ? indexSE-kbefore : 0;
329 while ((!endSnippet) && (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
331 //posSEValue =displayCSA(wcsa->myicsa,indexSE);
332 int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
334 {//obtains pointer to the ith word
336 uint ith = posSEValue -1; // !! presumably undoes the +1 applied when SE was filled in build_WCSA
//offsets of entries ith+1 and ith in the packed array; their difference is the word length
337 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
338 offtmp = bitread (wcsa->wordsData.words, ( ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
339 tmplen -=offtmp; //the length of the ith word.
341 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
348 if (snippetLen==bytesPerSnippet) break; //end of snippet (ends in BLANK_SPACE)
350 prevValid =1; //for the next iteration
//the word that would reach the capacity is cut so the snippet stops at bytesPerSnippet
357 if ((tmplen+snippetLen)>=bytesPerSnippet) {
358 tmplen =(bytesPerSnippet - snippetLen);
359 endSnippet=1; //so while loop ends;
362 for (j=0;j<tmplen;j++) {*dst++ = *src++;} //copies word to the output buffer
//snippets are laid out at a fixed stride of bytesPerSnippet bytes
366 text_aux += bytesPerSnippet;
367 (*snippet_lengths)[i] = snippetLen;
370 if (occurrences) free(indexesInSE);
375 /** Simulates the text-extraction process without returning the text.
376 Extracts up to 2K words, starting K=wordsbefore words before each occurrence of a pattern.
377 Fewer than 2K words can be extracted if more than numc characters have already been obtained.
378 Does nothing else: only the number of extracted bytes is returned. */
// NOTE(review): extractedbytes is a ulong but is returned through an int.
380 int displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc) {
386 twcsa *wcsa=(twcsa *) index;
388 locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);
394 ulong maxsnippetLen = maxnumc;
395 ulong extractedbytes = 0;
//a single scratch buffer of maxsnippetLen+1 bytes, released before returning
397 text_aux = (byte *) malloc (maxsnippetLen+1);
404 uint posSEValue,indexSE;
406 uint numWordsToExtract = 2 * wordsbefore;
408 //printf("\n occurrences... = %lu",occurrences);
410 for (i=0;i<occurrences;i++) {
414 /** decodes words from there */
416 indexSE = indexesInSE[i];
//step wordsbefore positions back, clamped at position 0
417 indexSE = (indexSE > wordsbefore) ? indexSE-wordsbefore : 0;
421 while ((z<numWordsToExtract) && (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
423 //posSEValue =displayCSA(wcsa->myicsa,indexSE);
424 int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
426 {//obtains pointer to the ith word
428 uint ith = posSEValue -1; // !! presumably undoes the +1 applied when SE was filled in build_WCSA
//offsets of entries ith+1 and ith in the packed array; their difference is the word length
429 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
430 offtmp = bitread (wcsa->wordsData.words, ( ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
431 tmplen -=offtmp; //the length of the ith word.
433 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
440 if (snippetLen==maxsnippetLen) break; //end of snippet (ends in BLANK_SPACE)
442 prevValid =1; //for the next iteration
449 if ((tmplen+snippetLen)>=maxsnippetLen) {
453 //fprintf(stderr,"\ntmplen = %d ",tmplen); fflush(stderr);
454 for (j=0;j<tmplen;j++) {*dst++ = *src++;} //copies word to the output buffer
//the only result kept for each occurrence is the length of its snippet
459 extractedbytes += snippetLen;
463 if (occurrences) free(indexesInSE);
465 if (text_aux) free (text_aux);
466 return extractedbytes;
471 /** Allocates text (which must be freed by the caller) and recovers
472 the substring of text starting from the "fromword"-th word up to the
473 "toWord"-th words. Returns in text the text, and in "text_lenght" the
474 length of the text actually extracted. Text is allocated.
475 Actually extracts SE[fromWord .. toWord) ... not the last word. */
// NOTE(review): fromWord and toWord are unsigned; if fromWord ends up above
// toWord after the clamping below, (toWord-fromWord) wraps around to a huge
// value in the buffer-size estimate.
477 int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text,
480 twcsa *wcsa=(twcsa *) index;
481 uint initTextLen=10000;
484 uint i, j;//, tmplen;
486 byte *src, *dst, *buff;
489 uint buffBytes = 1000;
490 uint leng=0; //curr pos in buffer that was occupied.
//clamp the requested range to the positions available in SE
492 if (toWord > wcsa->seSize) toWord = wcsa->seSize;
493 if (fromWord >= wcsa->seSize) fromWord = wcsa->seSize-1;
//initial capacity: at least 1000 bytes, or the number of words times avgWordLen
494 if (buffBytes < ( (toWord-fromWord)* avgWordLen)) buffBytes = ((toWord-fromWord)* avgWordLen);
496 buff = (uchar *) malloc (buffBytes * sizeof(char));
497 if (!buff) return 1; //out of memory.
500 register uint indexSE=fromWord;
504 while ( (indexSE < toWord) ){ /** extracting words (if not at the end) */
506 int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
508 {//obtains pointer to the ith word
510 ith= posSEValue -1; // !! presumably undoes the +1 applied when SE was filled in build_WCSA
//offsets of entries ith+1 and ith in the packed array; their difference is the word length
511 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
512 offtmp = bitread (wcsa->wordsData.words, (ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
513 tmplen -=offtmp; //the length of the ith word.
514 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
//grow the buffer when the next word (plus one extra byte) would not fit
517 if ( buffBytes < (leng + tmplen+1) ) {
// NOTE(review): the realloc result overwrites buff, so the old buffer is
// leaked if the call fails.
519 buff = (uchar*) realloc(buff, buffBytes);
520 if (!buff) return 1; //out of memory.
529 prevValid =1; //for the next iteration
535 for (j=0;j<tmplen;j++) {*dst++ = *src++;} //copies word to the output buffer
547 /** ***********************************************************************************
548 CONSTRUCTION OF THE INDEX WCSA
549 ***********************************************************************************/
551 /**------------------------------------------------------------------
552 Compares two tposInHT slots by their word field using strcmp.
553 Used as the qsort comparator for the canonical words.
553 ------------------------------------------------------------------ */
// The casts discard the const qualifier of the arguments; the slots are only read.
554 int qSortWordsCompareAlpha(const void *arg1, const void *arg2) {
555 tposInHT *a1 = (tposInHT *) arg1;
556 tposInHT *a2 = (tposInHT *) arg2;
557 return strcmp((char*)a1->word, (char *)a2->word);
561 * BUILDS THE WCSA INDEX
// Builds the word-level part of the index from text[0..length-1]:
//  - first pass: parses the text into words and separators and collects the
//    distinct ones in a hash table and in posInHT;
//  - the distinct words are sorted with qsort and their new positions are
//    written back into the hash table;
//  - second pass: parses the text again and fills SE with the position of
//    every parsed word in the sorted vocabulary, plus one;
//  - the vocabulary is copied into wcsa->wordsData as a characters zone plus
//    a packed array of offsets, and an integer CSA is created over SE.
// NOTE(review): inputBuffer aliases the text argument and is passed to free()
// after the second pass, so the caller must hand over a heap buffer and must
// not use or free it afterwards.
564 int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
566 unsigned long zeroNode; //number of different canonical words.
568 t_hash hash; // the hash table to store both variants and canonical words.
569 tposInHT *posInHT; // structure for canonicals and variants+huffmans
573 uint seSize=0; //its size == "numberOfValidWords".
574 uint *SE; //Integers vector. (represents the rank of the valid words in the source text).
576 uint totallenWords=0; //The numberOfBytes that occupy canonical words (their ascii version) in memory
579 ulong bytesFile,bytesFileReal;
581 unsigned long size_posInHT;
582 /* used during first pass */
586 byte* inputBuffer = text;
587 bytesFileReal= bytesFile = length;
589 sourceTextSize=length;
591 /** Initializes WCSA structure*/
593 wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
597 //Estimation (using Heaps' law) of the number of different "meaningful" words.
598 //sizeNValue=N_value;
//the estimate never uses fewer than 5000000 bytes; bytesFileReal keeps the real length
599 if(bytesFile<5000000) bytesFile = 5000000;
601 sizeNValue = (unsigned long) floor(3.9* pow(bytesFile,0.70) );
603 // Initializes the arrays used to detect if a char is valid or not.
605 // Initializes the arrays used to translate a char into lowercase.
609 // **********************************************************************************
610 //STARTING THE FIRST PASS.
611 // **********************************************************************************
612 printf("\nSTARTING THE FIRST PASS...");
614 posInHT = (tposInHT *) malloc(sizeof(tposInHT) * sizeNValue);
615 size_posInHT = sizeNValue;
616 hash = initialize_hash (sizeNValue); //hash to contain the parsed words
618 //-----------------------------------------------------------------
619 //1st pass (processing the file)
621 byte *pbeg,*pend,*wordstart,*aWord;
626 pend = inputBuffer+bytesFileReal;
632 fprintf(stderr, "buildFacade.c: assert failed, *pbeg == 0\n");
636 //parsing either a word or separator.
639 if (_Valid[*pbeg]) { //alphanumerical data
640 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
644 if (*pbeg != ' ') { //a separator that does not start with ' ': it is taken as a word on its own
645 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
647 else { //a SPACE comes, so we have to test if next character is alphanumerical or not
649 if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
651 if (_Valid [*pbeg] ) {
652 wordstart = pbeg; //So skipping 1 blank character
653 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
655 else { // a "separator word" ...
656 size++; //the prev BLANK...
657 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
658 }//else { // a "separator word"
659 }//else ... not a unique BLANK AT THE END.
660 }//else ... starting by a BLANK...
663 if (pbeg < pend && *pbeg == 0)
664 pbeg ++; // Skip the 0-bytes
668 fprintf(stderr, "buildFacade.c: assert failed, size == 0\n");
672 //The parsed word/separator is "wordstart", and its length is "size"...
675 //Processing done for each word
676 i = inHashTable(hash,aWord, size, &addrInTH );
678 insertElement (hash,aWord, size, &addrInTH);
//posInHT is grown when more distinct words show up than were estimated
// NOTE(review): the realloc result is assigned straight to posInHT and no
// check of it is visible here.
679 if (zeroNode >= size_posInHT){
681 posInHT = (tposInHT*) realloc(posInHT, size_posInHT * sizeof(tposInHT));
//register the new word: its slot in the hash table, its text, and its provisional position
683 posInHT[zeroNode].slot=addrInTH;
684 posInHT[zeroNode].word=hash->hash[addrInTH].word;
685 hash->hash[addrInTH].posInVoc = zeroNode;
687 totallenWords += size +1; // +1 due to the '\0' char...
688 //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size);
694 fprintf(stderr,"\n1st pass ends: TOTAL distinct words: %lu totalWords = %u",zeroNode,seSize);
699 // **********************************************************************************
701 // **********************************************************************************
703 // Sorting the words alphanumerically (over posInHT)
704 { register unsigned long i,j;
705 //sorting canonical words ...
706 qsort(posInHT, zeroNode, sizeof(tposInHT), qSortWordsCompareAlpha);
708 //setting in hash the new positions of the words in the hash table
709 for (i=0;i<zeroNode;i++) {
710 hash->hash[posInHT[i].slot].posInVoc = i;
714 // INITIALIZING structures for the 2nd pass ......................................
//one extra slot: a terminating 0 is stored after the last word (see the end of the second pass)
716 SE = (uint *) malloc ((seSize+1)*sizeof (uint));
720 // **********************************************************************************
721 // STARTING THE SECOND PASS.
722 // **********************************************************************************/
724 printf("\nSTARTING THE SECOND PASS... ");
725 //2nd pass (processing the file)
727 byte *pbeg,*pend,*wordstart,*aWord;
730 register ulong countValidWords = 0;
734 pend = inputBuffer+bytesFileReal;
739 fprintf(stderr, "buildFacade.c 2nd pass: assert failed, *pbeg == 0\n");
743 //parsing either a word or separator.
746 if (_Valid[*pbeg]) { //alphanumerical data
747 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
750 if (*pbeg != ' ') { //a separator that does not start with ' ': it is taken as a word on its own
751 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
753 else { //a SPACE comes, so we have to test if next character is alphanumerical or not
755 if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
757 if (_Valid [*pbeg] ) {
758 wordstart = pbeg; //So skipping 1 blank character
759 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
761 else { // a "separator word" ...
762 size++; //the prev BLANK...
763 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
764 }//else { // a "separator word"
765 }//else ... not a unique BLANK AT THE END.
766 }//else ... starting by a BLANK...
769 if (pbeg < pend && *pbeg == 0)
770 pbeg ++; // Skip the 0-bytes
774 fprintf(stderr, "buildFacade.c 2nd pass: assert failed, size == 0\n");
778 //The parsed word/separator is "wordstart", and its length is "size"...
781 //Processing done for each word
782 i = inHashTable(hash,aWord, size, &addrInTH );
784 SE[countValidWords]=hash->hash[addrInTH].posInVoc+1; // !!!! positions are stored shifted by one; 0 is kept for the terminator below
789 SE[countValidWords] = 0;
790 fprintf(stderr,"\n2nd pass ends: TOTAL distinct words: %lu totalWords = %lu",zeroNode,countValidWords);
794 // **********************************************************************************
796 // **********************************************************************************
798 //freeing the source text (it is no longer needed).
799 free(inputBuffer); //the text
801 /** Now Setting the data of the index **/
803 wcsa->sourceTextSize = sourceTextSize;
804 wcsa->seSize = seSize;
806 // Creating the words of the vocabulary...
808 /** copying the words into WCSA. */
809 uint *tmpOffsets = (uint *) malloc (sizeof(uint) * (zeroNode +1) ); //1 extra uint (to point to the virtual "zeroNode+1" ^th word.
815 //Moving data from posInHT to WCSA structure
816 //wcsa->wordsData = (twords *) malloc(sizeof(twords) * zeroNode);
817 wcsa->wordsData.wordsZoneMem.size = totallenWords - zeroNode; //without '\0' bytes (end-tag).
818 wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc ( wcsa->wordsData.wordsZoneMem.size * sizeof(byte) );
819 zoneMem = wcsa->wordsData.wordsZoneMem.zone;
820 for(i = 0; i < zeroNode; i++) {
821 src = posInHT[i].word; //copying the canonical word
822 //wcsa->wordsData.words[i].word = zoneMem; //setting the pointer
823 tmpOffsets[i]=tmpOffset; //offset in zoneMem
824 while (*src) {*zoneMem++ = *src++; tmpOffset++;} //moving data until '\0'
825 //*zoneMem='\0'; zoneMem++; //copies also the '\0'
828 tmpOffsets[zeroNode]=tmpOffset; //setting pointer to the "virtual" word {zeroNode+1}^{th}
830 //kbit encoding of the offsets
//elemSize is derived from the largest offset, so every offset is stored with that many bits
831 uint elemSize = _bits(tmpOffset);
832 wcsa->wordsData.elemSize = elemSize;
833 wcsa->wordsData.words = (uint *) malloc (((((zeroNode +1)*elemSize)+W-1) /W) * sizeof(uint)); //with 1 extra slot !.
//the last uint is zeroed before the offsets are written, presumably so its padding bits are defined
834 wcsa->wordsData.words[((((zeroNode +1)*elemSize)+W-1) /W) -1 ] =0000;
835 // fprintf(stderr,"\n ElemSize = %d, maxOffset = %d",elemSize,tmpOffset);
838 for (i=0; i<=zeroNode; i++) { //setting "zeroNode+1" offsets
839 bitwrite(wcsa->wordsData.words, tmpOffset, elemSize, tmpOffsets[i]);
843 //////////// CHECKS IT WORKED. old !!!!
846 // for (i=0; i<zeroNode; i++) { //setting "zeroNode+1" offsets
847 // kk=bitread(wcsa->wordsData.words, i* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
848 // tmpOffset+=elemSize;
849 // if (kk != tmpOffsets[i]) {fprintf(stderr,"\n @@@@@@@@ DISTINTOS OFFSETS "); break;}
850 // else fprintf(stderr,"\n iguales, %d, %d :: <<%s>>len=%d",kk,i, posInHT[i].word, strlen((char*)posInHT[i].word));
854 // { uint len1, len, tmplen, len2;
856 // byte *wcsaWord, *src;
858 // for (p=0;p<zeroNode;p++) {
859 // {//preparing for strcompL
860 // len = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p+1)), wcsa->wordsData.elemSize);
861 // tmplen = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p)) , wcsa->wordsData.elemSize);
863 // //fprintf(stderr,"\n :: off[%d]= %d - off [%d] = %d ==> %d",p+1,len,p,tmplen,len-tmplen);
866 // wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
869 // src = posInHT[p].word;
870 // len1 = strlen((char *)src);
872 // if (strcompL(src,wcsaWord,len1,len2) != 0) {
873 // fprintf(stderr,"\n %6d DISTINTOS !! ===len1 %d,len %d===== <<",p,len1,len2);printWord(src,len1);
874 // fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>");
878 // fprintf(stderr,"\n %6d ======len1 %d,len2 %d===== <<",p,len1,len2);printWord(src,len1);
879 // fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>");
887 //frees memory from hash table and posInHT structures.
893 /** ******* creates the self-index on ints (bottom layer) ==> see build_icsa *********/
898 fprintf(stderr,"\n **** CREATING CSA from Edu's Code *****");
//seSize+1 entries are indexed: the words plus the terminating 0
900 myicsa = createIntegerCSA(&SE,seSize+1,build_options);
901 wcsa->myicsa= myicsa;
902 total = CSA_size(myicsa);
904 free(SE); //SE is no longer needed, (it is indexed by the iCSA)
// NOTE(review): seSize is a uint but is printed with %d; also confirm that the
// declared type of total matches %ld.
905 printf("\n\t**** [iCSA built on %d words. Size = %ld bytes... RAM",seSize,total);
915 printf("\n\t ** Building done! **\n");
916 printf("\n Process finished!\n");
// Builds the integer index (bottom layer) over wcsa->se with buildIntIndex,
// stores it in wcsa->myicsa and prints the size reported by sizeIntIndex.
// NOTE(review): confirm that the declared type of total matches both the
// out-parameter of sizeIntIndex and the %u conversion used below; seSize is
// printed with %d.
923 int build_iCSA (char *build_options, void *index)
925 twcsa *wcsa = (twcsa *) index;
926 /********* creates the self-index on ints (bottom layer) *********/
927 //creating CSA from Edu's code...
930 fprintf(stderr,"\n **** CREATING CSA-bottom-layer *****");
931 void *bottomIntIndex;
//seSize+1 entries are handed over, the same count that save_index passes to saveSEfile
932 int err = buildIntIndex(wcsa->se,wcsa->seSize+1, build_options,(void **)&bottomIntIndex);
933 wcsa->myicsa = bottomIntIndex;
935 //total = CSA_size(wcsa->myicsa);
936 err = sizeIntIndex((void *) wcsa->myicsa, &total);
938 printf("\n\t**** [iCSA built on %d words. Size = %u bytes... RAM",wcsa->seSize,total);
943 /** ********************************************************************
945 **********************************************************************/
947 /**-----------------------------------------------------------------
949 * Loads all the data structures of WCSA (including the icsa)
950 ----------------------------------------------------------------- */
// Allocates a twcsa, loads the integer index into wcsa->myicsa with
// loadIntIndex and then the remaining structures with loadStructs; both use
// filename as the base name.
952 twcsa *loadWCSA(char *filename) {
954 // Initializes the arrays used to detect if a char is valid or not.
956 // Initializes the arrays used to translate a char into lowercase.
959 wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
962 int err = loadIntIndex(filename, (void **)&wcsa->myicsa);
964 loadStructs(wcsa,filename);
969 /** ------------------------------------------------------------------
971 * Reads files and loads all the data needed for searcherFacade
972 ----------------------------------------------------------------- */
// Counterpart of save_index for the constants file and the vocabulary file:
// the fields are read in the same order and with the same widths that
// save_index writes them.
// NOTE(review): the results of read() are never checked, so a short or
// truncated file goes unnoticed.
973 void loadStructs(twcsa *wcsa, char *basename) {
//room for basename, the dot, the extension and the terminator;
//assumes every file extension macro expands to at most 8 characters - TODO confirm
981 filename = (char *)malloc(sizeof(char)*(strlen(basename)+10));
982 fprintf(stderr,"Loading Index from file %s.*\n", basename);
984 //** SOME CONSTANTS: sourceTextSize and seSize
985 { strcpy(filename, basename);
986 strcat(filename, ".");
987 strcat(filename, CONSTANTS_FILE_EXT);
989 if( (file = open(filename, O_RDONLY)) < 0) {
990 printf("Cannot open file %s\n", filename);
994 read(file, &(wcsa->sourceTextSize), sizeof(uint));
995 read(file, &(wcsa->seSize), sizeof(uint));
999 /** File with the words from the vocabulary (sorted alphabetically) */
1002 strcpy(filename, basename);
1003 strcat(filename, ".");
1004 strcat(filename, VOCABULARY_WORDS_FILE_EXT);
1005 //sizeFile= fileSize(filename)-sizeof(uint);
1007 if( (file = open(filename, O_RDONLY)) < 0) {
1008 printf("Cannot open file %s\n", filename);
1012 //the number of canonical words
1013 read(file, &n, sizeof(uint));
//bits per offset, then the size in bytes of the characters zone
1015 read(file, &(wcsa->wordsData.elemSize), (sizeof(uint)));
1016 read(file, &(wcsa->wordsData.wordsZoneMem.size), (sizeof(uint)));
1018 //allocating the memory needed for all words and reading them //(ascii) << no \0 chars are needed>>.
1019 wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc(wcsa->wordsData.wordsZoneMem.size * sizeof(byte));
1020 read(file, (wcsa->wordsData.wordsZoneMem.zone), wcsa->wordsData.wordsZoneMem.size * sizeof(byte) );
1022 //reading the offsets of the words (kbitArray that points to offsets in zoneMem of words.
1023 wcsa->wordsData.words = (uint *) malloc (((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * sizeof(uint));
//the last uint is zeroed first; the read below then covers the whole array
1024 wcsa->wordsData.words[ ((((n+1)*(wcsa->wordsData.elemSize))+W-1) /W) -1 ] =0000;
1025 read(file, (wcsa->wordsData.words), ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)));
1037 /** ****************************************************************
1038 * Querying the index WCSA
1039 * ***************************************************************/
1040 ///////////////////////////////////////////////////////////////////////////////////////
1041 // FUNCTIONS NEEDED FOR SEARCHING A PATTERN //
1042 ///////////////////////////////////////////////////////////////////////////////////////
1046 /*------------------------------------------------------------------
1047 * Given a text pattern translates it into a list of integers (corresponding to the
1048 * canonical words associated to the valid words in the text pattern)
1049 ------------------------------------------------------------------*/
// The pattern is split into words and separators with the same rules used in
// build_WCSA, except that 0-bytes get no special treatment here. Every token
// is looked up by binary search in the vocabulary; a hit appends its position
// plus one to integerPattern. If a token is not in the vocabulary,
// *sizeIntegers is set to 0 and the function returns at once.
// NOTE(review): no check against the capacity of integerPattern is visible
// (callers pass arrays of MAX_INTEGER_PATTERN_SIZE) - confirm one exists.
// NOTE(review): max = n - 1 wraps around if the vocabulary is empty.
1050 void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *integerPattern, uint *sizeIntegers) {
1052 byte *pbeg,*pend,*wordstart,*aWord;
1053 register unsigned long size;
1057 pend = pbeg + patLen;
1059 while (pbeg <pend) {
1060 //parsing either a word or separator.
1063 if (_Valid[*pbeg]) { //alphanumerical data
1064 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
1067 if (*pbeg != ' ') { //a separator that does not start with ' ': it is taken as a word on its own
1068 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
1070 else { //a SPACE comes, so we have to test if next character is alphanumerical or not
1072 if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
1074 if (_Valid [*pbeg] ) {
1075 wordstart = pbeg; //So skipping 1 blank character
1076 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
1078 else { // a "separator word" ...
1079 size++; //the prev BLANK...
1080 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
1081 }//else { // a "separator word"
1082 }//else ... not a unique BLANK AT THE END.
1083 }//else ... starting by a BLANK...
1086 //The parsed word is "aWord", and its length is "size"...
1089 // Binary search on the canonical words (wordsData)
1093 register uint min,max,p;
1095 max = (wcsa->n) - 1;
1099 {//preparing for strcompL
//entries p+1 and p of the packed offsets array; tmplen is where word p starts in the zone
1100 len = bitread (wcsa->wordsData.words, (p+1)* wcsa->wordsData.elemSize , wcsa->wordsData.elemSize);
1101 tmplen = bitread (wcsa->wordsData.words, (p )* wcsa->wordsData.elemSize , wcsa->wordsData.elemSize);
1103 wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
1106 //if(strncmp((char*)aWord, (char*)wcsa->wordsData[p].word,size) > 0) min = p+1;
1107 if(strcompL(aWord, wcsaWord, size, len) > 0) min = p+1;
1111 // { //SHOW PROGRESS
1112 // fprintf(stderr,"\n Patron = <<%s>>, curposWord= %d ==><<",aWord,p);
1113 // printWord(wcsaWord,len); fprintf(stderr,">> len =%d",len);
//the search leaves its candidate in min; it is compared once more to confirm an exact match
1118 {//preparing for strcompL
1119 len = bitread (wcsa->wordsData.words, (min+1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1120 tmplen = bitread (wcsa->wordsData.words, ( min )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1122 wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
1125 // if(!strncmp((char*)aWord, (char*)wcsa->wordsData[min].word, size)) {
1126 if(!strcompL(aWord, wcsaWord, size, len)) {
1127 integerPattern[index++] = min +1 ; //<-- same +1 shift that build_WCSA applies when filling SE
1129 else {*sizeIntegers = 0; return;} // a valid word that does not appear in the source text.
1133 *sizeIntegers = index;
1135 // //shows the parsed words:
1137 // printf("\n\n >>%s>> HA SIDO PARSEADO COMO:",textPattern);
1138 // for (i=0; i<index;i++) {
1139 // printf("<<%s>>",wcsa->wordsData[integerPattern[i] -1].word);
1149 /** ------------------------------------------------------------------
1150 * Returns the number of occurrences of a given text pattern
 *
 * textPattern must be NUL-terminated (its length is taken with strlen).
 * It is converted into a sequence of integers by parseTextIntoIntegers()
 * and that sequence is passed to countIntIndex() on wcsa->myicsa.
 * Returns -1 when the parser reports an empty integer sequence.
 * NOTE(review): the statements that consume "err" and produce the
 * final return value are not visible in this listing -- confirm them.
1151 *------------------------------------------------------------------ */
1152 int countTextOcurrences(twcsa *wcsa, byte *textPattern) {
// Receives the integer form of the pattern.
1154 uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
// NOTE(review): min and max are not used by any visible statement.
1155 uint integerPatternSize, min, max;
// strlen() returns size_t; the value is narrowed to uint here.
1157 uint lenpat = strlen((char*)textPattern);
1158 parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize);
// Nothing to look up: report -1 to the caller.
1159 if (!integerPatternSize) return -1;
1163 // printf("\n %d Integers to search for:",integerPatternSize );
1164 // for (i=0;i<integerPatternSize;i++) {
1165 // printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word);
// numocc, left and right are out-parameters of countIntIndex(); their
// declarations are not visible in this listing.
1171 int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, &numocc, &left, &right);
1177 /** ------------------------------------------------------------------
1178 * locateTextOcurrences:
1179 * Returns the offsets of the source text where a word/phrase appears
1180 * Returns also the number of occurrences.
 *
 * textPattern must be NUL-terminated. It is parsed into integers and
 * looked up with locateIntIndex(); the array produced by that call is
 * passed through locateFacade() and then returned.
 * When the parser yields an empty sequence, *numberOccurrences is set
 * to -1 and NULL is returned.
 * NOTE(review): locateFacade() in this file only returns 99 and does not
 * touch its arrays, so the returned values are whatever locateIntIndex()
 * stored in seOffsets.
 * NOTE(review): who frees the returned array is not visible here.
1181 *------------------------------------------------------------------ */
1182 uint *locateTextOcurrences(twcsa *wcsa, byte *textPattern, int *numberOccurrences) {
1183 uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
// NOTE(review): min and max are not used by any visible statement.
1184 uint integerPatternSize, min, max;
// strlen() returns size_t; the value is narrowed to uint here.
1186 uint lenpat = strlen((char*)textPattern);
1187 parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize);
// Empty integer pattern: signal it through both outputs.
1188 if (!integerPatternSize) {*numberOccurrences = -1; return NULL;}
1192 // printf("\n %d Integers to search for:",integerPatternSize );
1193 // for (i=0;i<integerPatternSize;i++) {
1194 // printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word);
// NOTE(review): left and right are not used by any visible statement.
1199 ulong occurrences, left, right;
1201 ulong *sourceOffsets;
1203 //obtains the indexes in vector SE where the pattern appears.
1204 //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences);
// seOffsets and occurrences are filled by locateIntIndex(); the
// declaration of seOffsets and the handling of err are not visible here.
1205 int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences);
1207 //sourceOffsets = (uint *) malloc (sizeof(uint)*occurrences);
// No second array is allocated: the result aliases seOffsets.
1209 sourceOffsets=seOffsets;
1210 //obtains the offsets in the source text of the pattern (sourceOffsets)
// NOTE(review): sourceOffsets is declared ulong* but is cast to uint* here
// and on return; if ulong is wider than uint the elements are misread --
// confirm the typedefs.
1211 locateFacade(wcsa, (uint *)sourceOffsets, (uint *)seOffsets,occurrences);
1214 fprintf(stderr,"\n*** %s appears in the source text in positions:\n\t",textPattern);
1215 for (i=0;i<occurrences;i++)
// NOTE(review): "%u" is used with an element of a ulong array.
1216 fprintf(stderr,"[%u]",sourceOffsets[i]);
// ulong is narrowed to int when stored for the caller.
1220 *numberOccurrences = occurrences;
1221 return (uint *) sourceOffsets;
1225 /** ------------------------------------------------------------------
1226 * displayTextOcurrences:
1227 * Intended to show in stdout the text around the occurrences of a
1228 * word/phrase and to return the number of occurrences.
 * Currently a stub: no argument is used and the constant 99 is returned.
1229 *------------------------------------------------------------------ */
1230 int displayTextOcurrences(twcsa *wcsa, byte *textPattern, uint radixDisplay) {
1231 return 99; //not implemented: function not available
1234 /** ------------------------------------------------------------------
1236 * Intended contract: for the given sePositions, return in
1237 * sourceTextPositions the source-text positions where the valid words
 * se[sePositions[i]] occur.
 * Currently a stub: neither array is read or written and the constant 99
 * is returned.
1238 *------------------------------------------------------------------*/
1239 int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint number) {
1240 return 99; //not implemented: function not available for this index
1244 /** ------------------------------------------------------------------
1246 * Intended contract: return the substring of the source text from a
1247 * starting offset to a final offset. It does not allocate any memory;
 * the caller supplies the destination buffer "dstptr".
1248 * Precondition: offsetIni >=0;
 * Currently a stub: length and dstptr are left untouched and the
 * constant 99 is returned.
1249 ------------------------------------------------------------------*/
1250 int displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length, byte *dstptr) {
1251 return 99; //not implemented: function not available for this index
1255 /**------------------------------------------------------------------
1256 * DISPLAYFacadeMalloc:
1257 * Intended contract: return the substring of the source text from a
1258 * starting offset to a final offset, in newly allocated memory.
1259 * NOT CURRENTLY USED
 * Currently a stub: the only visible statement initialises a NULL
 * pointer. NOTE(review): the return statement is not visible in this
 * listing -- presumably dstptr (NULL) is returned; confirm.
1260 ------------------------------------------------------------------*/
1261 byte *displayFacadeMalloc (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length) {
1262 byte *dstptr=NULL; //not implemented: function not available
1267 /** ------------------------------------------------------------------
1268 * LOCATEALLandDISPLAY:
1269 * Intended to display the text around each occurrence of the searched
1270 * word in the source text; "radix" presumably bounds how many characters
 * around the occurrence position are shown (TODO confirm).
 * Currently a stub: no argument is used and the constant 99 is returned.
1271 ------------------------------------------------------------------*/
1272 int locateAllAndDisplay (twcsa *wcsa, uint *sePositions, uint number, int radix) {
1273 return 99; //not implemented: function not available for this index
1278 /** ------------------------------------------------------------------
1279 * Recovers the source text into the file "<basename><ext>".
 *
 * The visible code walks the positions 0 .. wcsa->seSize-1, reads the
 * value stored at each position with displayIntIndex(), copies the
 * corresponding vocabulary word into a buffer of sourceTextSize bytes,
 * and finally writes "leng" bytes of that buffer to the output file.
 * NOTE(review): the earlier banner text said this works "by calling
 * display(0,fileSize)"; no such call appears in the visible lines.
 * NOTE(review): several declarations and the handling of separators
 * between words are not visible in this listing.
1280 * ------------------------------------------------------------------ */
1281 void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {
// Room for both strings plus the terminating NUL. The malloc result is
// not checked in the visible lines.
1285 char *filename = (char *) malloc (strlen( basename)+ strlen(ext)+1);
1288 strcpy( filename, basename);
1289 strcat( filename, ext);
// strcat() has already terminated the string; this store is redundant.
1290 filename[strlen( basename)+ strlen(ext)]='\0';
1291 fprintf(stderr,"\n uncompressing text into file -->%s ",filename);fflush(stderr);
// NOTE(review): no check of the fopen() result is visible.
1295 salida = fopen( filename,"w");
1296 start=0; end = sourceTextSize-1;
// Output buffer sized to the original text length; result not checked
// in the visible lines.
1298 cc = (byte *) malloc (sourceTextSize* sizeof(uchar));
1301 uint i, j;//, tmplen;
1312 while ( (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
// posSEValue receives the value stored at position indexSE.
1314 int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
1316 {//obtains pointer to the ith word
// The stored value is decremented to obtain the slot in the words table
// (presumably stored ids are 1-based -- confirm).
1318 uint ith = posSEValue -1; // !!
// Entries ith and ith+1 of the bit-packed table are read; their
// difference is used as the word length and entry ith as its offset.
1319 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1320 offtmp = bitread (wcsa->wordsData.words, ( ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1321 tmplen -=offtmp; //the length of the ith word.
1322 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
1330 prevValid =1; //for the next iteration
1336 for (j=0;j<tmplen;j++) {*dst++ = *src++;} //copies word to the output buffer
// sourceTextSize is uint but printed with "%d"; the type of leng is not
// visible here.
1340 fprintf(stderr,"\n sourceTextSize = %d, len = %d",sourceTextSize,leng);
// NOTE(review): the fwrite() result is not checked in the visible lines.
1341 fwrite(cc,sizeof(byte),leng,salida);
1351 //recovers the source text by calling extract Words.
// The whole range [0, wcsa->seSize] is requested from extractWords() and
// the returned buffer is written to the file "<basename><ext>".
// NOTE(review): the declarations of salida, cc and length, and any
// fclose()/free() calls, are not visible in this listing.
1352 void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {
1354 int start;int end; int error;
// Room for both strings plus the terminating NUL. The malloc result is
// not checked in the visible lines.
1355 char *filename = (char *) malloc (strlen( basename)+ strlen(ext)+1);
1359 strcpy( filename, basename);
1360 strcat( filename, ext);
// strcat() has already terminated the string; this store is redundant.
1361 filename[strlen( basename)+ strlen(ext)]='\0';
1362 fprintf(stderr,"\n uncompressing text into file -->%s ",filename);fflush(stderr);
// NOTE(review): no check of the fopen() result is visible.
1366 salida = fopen( filename,"w");
// wcsa->seSize (unsigned) is stored in a signed int.
1367 start=0; end = wcsa->seSize;
// cc and length are out-parameters filled by extractWords().
1369 error = extractWords((void *) wcsa, start, end, &cc, &length);
// NOTE(review): exit(0) reports success to the parent process even
// though this is the error path.
1370 if (error) {fprintf(stderr,"\n error during recoverSourceText2"); exit(0);}
// sourceTextSize is uint but printed with "%d".
1372 fprintf(stderr,"\n sourceTextSize = %d, len = %ld",sourceTextSize,length);
// NOTE(review): the fwrite() result is not checked in the visible lines.
1373 fwrite(cc,sizeof(byte),length,salida);
1380 /** *******************************************************************************
1381 * Showing some statistics and info of the index
 *
 * printInfoReduced is currently an empty stub: it has no visible
 * statements and does not use wcsa.
1382 * *******************************************************************************/
1383 void printInfoReduced(twcsa *wcsa) {
1384 //not implemented: function not available
1387 /* Shows summary info of the index */
// Prints to stdout a summary of the presentation layer (word counts and
// the size of the words structure) followed by the summary produced by
// printInfoIntIndex() for wcsa->myicsa.
// Returns the error code of index_size(), sizeIntIndex() or
// printInfoIntIndex() as soon as one of them is non-zero.
// NOTE(review): the declaration of err and the final return statement
// are not visible in this listing.
1388 int printInfo(void *index) {
1391 twcsa *wcsa = (twcsa *) index;
1393 unsigned long indexSize;
1394 uint intIndexSize, presentationSize;
1397 err = index_size(index, &indexSize);
1398 if (err!=0) return err;
1399 err = sizeIntIndex(wcsa->myicsa, &intIndexSize);
1400 if (err!=0) return err;
// Presentation layer = whole index minus the integer self-index.
// The unsigned long difference is narrowed to uint.
1402 presentationSize = indexSize - intIndexSize;
1404 printf("\n ===================================================:");
1405 printf("\n Summary of Presentation layer:");
1406 printf("\n Number of valid words (SEsize) = %u",wcsa->seSize);
// NOTE(review): "%ld" assumes wcsa->n is a long; its declaration is not
// visible here -- confirm.
1407 printf("\n Number of different words = %ld",wcsa->n);
// sizeof yields size_t; "%zu" is the portable conversion for it.
1408 printf("\n WCSA structure = %lu bytes", sizeof(twcsa));
// (n+1) entries of elemSize bits each, rounded up to whole W-bit units
// and multiplied by sizeof(uint). Presumably W is the number of bits in a
// uint -- confirm.
1410 uint totalpointers = ((((wcsa->n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));
1411 uint totalasciizone = wcsa->wordsData.wordsZoneMem.size * sizeof(byte) ;
1412 uint totalwords = totalasciizone + totalpointers;
// The three values below are uint but printed with "%d".
1414 printf("\n Size Of words structure (%d bytes):",totalwords);
1415 printf("\n [ pointers = %d bytes || AsciiZone = %d bytes", totalpointers, totalasciizone);
1417 printf("\n\n Total = ** %u bytes (in RAM) **",presentationSize);
1418 //printf("\n\n @@ Summary of self-index on Integers:");
1419 err = printInfoIntIndex(wcsa->myicsa, " ");
1420 if (err!=0) return err;
1422 printf("\n ===================================================:");
1427 /**------------------------------------------------------------------
1429 * Intended to count the amount of memory needed by the Facade
1430 * (Presentation Layer), skipping the stop_words hash table.
 * Currently a stub: wcsa is not used and 0 is always returned.
1431 ----------------------------------------------------------------- */
1432 uint structsSizeMem(twcsa *wcsa) {
1433 return 0; //not implemented: function not available for this index.
1437 /** for debugging **/
// Writes characters of str to stderr one at a time with "%c".
// NOTE(review): the declaration of i and the loop header are not visible
// in this listing; presumably i runs over 0 .. len-1 -- confirm.
1438 void printWord(uchar *str, uint len) {
1441 fprintf(stderr,"%c",str[i]);
1445 /** saves the content of the file SE (ids of the source words) **/
// Writes the n uint values of v, as raw bytes, to the file
// "<basename>.<SE_FILE_EXT>". Any existing file of that name is unlinked
// first and the file is then created.
// NOTE(review): the declaration of file, the rest of the error branch,
// the close() call and the return value are not visible in this listing.
1446 int saveSEfile (char *basename, uint *v, uint n) {
1447 char outfilename[255];
// NOTE(review): sprintf() is unbounded; a basename longer than the buffer
// allows would overflow outfilename.
1449 sprintf(outfilename,"%s.%s",basename,SE_FILE_EXT);
// Remove any previous file; the result of unlink() is ignored.
1450 unlink(outfilename);
// Created with read/write/execute permission for owner and group.
1451 if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
1452 printf("Cannot open file %s\n", outfilename);
// NOTE(review): the result of write() is not checked, so a short or
// failed write goes unnoticed.
1456 write(file, v, sizeof(uint) * n );
/**
 * Returns the CPU time consumed so far by the calling process, in seconds.
 *
 * The value is the sum of the user and system times reported by
 * getrusage(RUSAGE_SELF, ...). Callers take the difference of two calls
 * to time a section of code.
 *
 * @return user + system CPU time in seconds, or 0.0 if getrusage() fails.
 */
double getTime2 (void)
{
	double usertime, systime;
	/* Zero-initialised so that no indeterminate value is ever read. */
	struct rusage usage = {0};

	/* The original ignored this result and used the struct regardless. */
	if (getrusage (RUSAGE_SELF, &usage) != 0) {
		return 0.0;
	}

	usertime = (double) usage.ru_utime.tv_sec +
		(double) usage.ru_utime.tv_usec / 1000000.0;
	systime = (double) usage.ru_stime.tv_sec +
		(double) usage.ru_stime.tv_usec / 1000000.0;

	return (usertime + systime);
}
1479 /**------------------------------------------------------------------
1481 *------------------------------------------------------------------ */
1482 #ifdef FACADEWITHMAIN
1483 int main(int argc, char* argv[])
1488 char *infile, *outbasename, *stopwordsfile; // Name of in/out files
1496 printf("\n*Word-based iCSA: A word-based CSA");
1497 printf("\n*CopyRight (c) 2008 [LBD & G.N.]\n\n");
1499 // Reads input parameters from command line.
1501 printf("Use: %s <in file> <out basename> \n", argv[0]);
1505 // Reads params (input file, output basename, and stopwords file)
1507 outbasename = argv[2];
1508 stopwordsfile = argv[3];
1510 finsize= fileSize(infile);
1513 printf( "\nFILE EMPTY OR FILE NOT FOUND %s !!\nSkipping processement ...\n",infile);
1517 // Opening the input text file.
1518 if( (f_in = open(infile, O_RDONLY)) < 0) {
1519 printf("Cannot read file %s\n", infile);
1522 inputBuffer = (byte *) malloc(finsize *sizeof(byte));// +1);
1523 read (f_in,inputBuffer,finsize);
1528 //printf("\n parametros <<%s>>\n\n",stopwordsfile);
1529 build_index (inputBuffer, finsize, stopwordsfile, &Index); /** building the index */
1531 // /** recovering the source text from the index */
1536 get_length(Index, &size);
1537 char extension[10]= ".source";
1539 //recoverSourceText1((twcsa*) Index, outbasename,extension, size);
1540 strcat(extension,"2");
1541 recoverSourceText2((twcsa*) Index, outbasename,extension,size);
1543 fprintf(stderr, "\nRecovering source file time: %.3f secs\n", end-start );
1546 // DISPLAYING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
1547 {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1549 ulong numocc,numc, length, i, *snippet_len, tot_numcharext = 0, numpatt;
1550 uchar *pattern, *snippet_text;
1552 pattern = textPattern;
1553 printf("\nSEARCH TEST for DISPLAY (pizzachili interface)\n");
1555 printf("Intro string: ");
1556 fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1557 if (!strcmp((char*)textPattern,"\n") ) break;
1558 textPattern[strlen((char*)textPattern)-1] = '\0';
1560 length = strlen( (char*)textPattern);
1563 // error = display (Index, textPattern, length, numc, &numocc,
1564 // &snippet_text, &snippet_len);
1565 error = displayWords (Index, textPattern, length, numc, &numocc,
1566 &snippet_text, &snippet_len,1);
1568 if (error){ fprintf(stderr, "%s\n", "Hubo un error durante display");exit(0);}
1570 fprintf(stderr,"\n acabou display");fflush(stderr);
1572 ulong j, len = length + 2*numc;
1574 fprintf(stderr,"\n length = %d",length);
1575 fprintf(stderr,"\n pattern = %s",pattern);fflush(stderr);
1576 fprintf(stderr,"\n numocc = %d",numocc);fflush(stderr);
1577 fprintf(stderr,"\n snippet len = %d",len);fflush(stderr);
1578 fprintf(stderr,"\n =========");fflush(stderr);
1579 for (i = 0; i < numocc; i++){
1580 fprintf(stderr,"\n[%2d][len=%3d]<<",i+1,snippet_len[i]);fflush(stderr);
1581 fwrite(snippet_text+len*i,sizeof(uchar),snippet_len[i],stderr);fflush(stderr);
1582 fprintf(stderr,">>");fflush(stderr);
1587 for(i=0; i<numocc; i++) {
1588 tot_numcharext += snippet_len[i];
1593 free (snippet_text);
1596 printf("Ocurrences = %d\n", numocc);
1597 if (!strcmp((char*)textPattern,"\n") ) break;
1603 // // SEARCHING FOR A TEXT PATTERN (word/phrase).
1604 // {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1609 // printf("\nSEARCH TEST for LOCATE\n");
1611 // printf("Intro string: ");
1612 // fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1613 // len = strlen((char*)textPattern);
1614 // if (!strcmp((char*)textPattern,"\n") ) break;
1615 // textPattern[len-1] = '\0';
1618 // //occs = locateTextOcurrences(wcsa,textPattern,&occ);
1619 // // locate(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ);
1620 // locateWord(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ, 0);
1622 // printf("\n*** %s occurs %d times: In the source text in positions:\n\t",textPattern,occ);
1623 // for (i=0;i<occ;i++)
1624 // printf("[%u]",occs[i]);
1628 // if (!strcmp((char*)textPattern,"\n") ) break;
1634 // COUNTING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
1636 {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1639 printf("\nSEARCH TEST for COUNT.\n");
1641 printf("Intro string: ");
1642 fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1643 len = strlen((char*)textPattern);
1644 if (!strcmp((char*)textPattern,"\n") ) break;
1645 textPattern[len-1] = '\0';
1648 count(Index, textPattern, len, (ulong *)&occ);
1649 //occ = countTextOcurrences(wcsa,textPattern);
1650 printf("Ocurrences = %d\n", occ);
1653 printf("\n END COUNTING OCCURRENCES OF PATTERNS. ...\n");
1657 /** saving the index to disk*/
1658 save_index (Index, outbasename);
1660 /** tells the mem used by the index */
1662 index_size(Index, &indexsize);
1663 fprintf(stderr,"Index occupied %d bytes, 2 extra mallocs = %d",indexsize,2* sizeof(uint));
1665 /** freeing the index */