swcsa/buildFacade.c

   1 #include "buildFacade.h"
   2 #include "utils/errors.c"
   3
   4
   5 /** Building the index */
   6
   7     /* Creates index from text[0..length-1]. Note that the index is an
   8       opaque data type. Any build option must be passed in string
   9       build_options, whose syntax depends on the index. The index must
  10       always work with some default parameters if build_options is NULL.
  11       The returned index is ready to be queried. */
  12 int build_index (uchar *text, ulong length, char *build_options, void **index) {
  13         int returnvalue;
  14
  15      printf("\n parameters: \"%s\"\n",build_options); fflush(stderr);
  16
  17     returnvalue = build_WCSA (text, length, build_options, index);
  18
  19     if (!returnvalue)
  20         returnvalue = build_iCSA (build_options,*index);
  21
  22     return returnvalue;
  23 }
  24
  25
  26 /**  Saves index on disk by using single or multiple files, having
  27         proper extensions. */
  28 int save_index (void *index, char *filename) {
  29
  30         char *basename = filename;
  31         twcsa *wcsa=(twcsa *) index;
  32
  33         uint i,j;
  34         char *outfilename;
  35         int file;
  36         char c;
  37
  38         printf("\n Saving structures to disk: %s.*",filename);
  39         outfilename = (char *)malloc(sizeof(char)*(strlen(basename)+10));
  40
  41         /**File with some constants (bSize and tohSize); */
  42         { uint number;
  43                 strcpy(outfilename, basename);
  44                 strcat(outfilename, ".");
  45                 strcat(outfilename, CONSTANTS_FILE_EXT);
  46                 unlink(outfilename);
  47                 if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
  48                         printf("Cannot open file %s\n", outfilename);
  49                         exit(0);
  50                 }
  51                 write(file, &(wcsa->sourceTextSize), sizeof(uint));
  52                 write(file, &(wcsa->seSize), sizeof(uint));
  53                 close(file);
  54         }
  55
  56         /** The Words in the vocabulary of words  (sorted alphabetically)*/
  57         {       strcpy(outfilename, basename);
  58                 strcat(outfilename, ".");
  59                 strcat(outfilename, VOCABULARY_WORDS_FILE_EXT);
  60                 unlink(outfilename);
  61                 if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
  62                         printf("Cannot open file %s\n", outfilename);
  63                         exit(0);
  64                 }
  65
  66                 uint n = wcsa->n;
  67                 uint elemSize = wcsa->wordsData.elemSize;
  68                 write(file, &n, sizeof(uint));
  69                 write(file, &elemSize, sizeof(uint));
  70                 write(file, &(wcsa->wordsData.wordsZoneMem.size), sizeof(uint));
  71
  72                 //the number of canonical words
  73                 write(file, (char *)wcsa->wordsData.wordsZoneMem.zone, wcsa->wordsData.wordsZoneMem.size * sizeof(byte));
  74                 write(file, (char *)wcsa->wordsData.words, ((((n+1)* (elemSize))+W-1) /W) * (sizeof(uint)) );
  75
  76                 close(file);
  77         }
  78
  79         free(outfilename);
  80
  81         if (wcsa->myicsa) {
  82                 /******** saves index on integers (bottom) ******/
  83                 //Storing the CSA
  84                 //storeStructsCSA(wcsa->myicsa,basename);
  85                 saveIntIndex((void *) wcsa->myicsa, basename);
  86         }
  87
  88         if (wcsa->se) {
  89                 saveSEfile(basename,wcsa->se, wcsa->seSize+1);
  90                 //free(wcsa->se);
  91         }
  92
  93         return 0;
  94 }
  95
  96
  97
  98     /**  Loads index from one or more file(s) named filename, possibly
  99       adding the proper extensions. */
 100 int load_index(char *filename, void **index){
 101         twcsa *wcsa;
 102         wcsa = loadWCSA (filename);
 103         (*index) = (void *) wcsa;
 104         return 0;
 105 }
 106
 107         /** Frees the memory occupied by index. */
 108 int free_index(void *index){
 109         twcsa *wcsa=(twcsa *) index;
 110         ulong size;
 111         index_size(index,&size);
 112         printf("\n[destroying index] ...Freed %lu bytes... RAM", size);
 113
 114
 115         //frees the array SE.
 116         if (wcsa->se)
 117                 free (wcsa->se);
 118
 119         //the iCSA.
 120         if (wcsa->myicsa) {
 121                 //destroyStructsCSA(wcsa->myicsa);
 122                 int err = freeIntIndex((void *) wcsa->myicsa);
 123         }
 124
 125         //the words.
 126         free (wcsa->wordsData.wordsZoneMem.zone);
 127         free (wcsa->wordsData.words); /** huge!! */
 128
 129         //the pointer to wcsa.
 130         free(wcsa);
 131         return 0;
 132 }
 133
 134         /** Gives the memory occupied by index in bytes. */
 135 int index_size(void *index, ulong *size) {
 136         ulong totaltmp;
 137         twcsa *wcsa=(twcsa *)index;
 138         uint n= wcsa->n;
 139         *size=0;
 140         *size += sizeof(twcsa);
 141
 142         totaltmp=0;  //words
 143         totaltmp += ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));  //the pointers
 144         totaltmp += wcsa->wordsData.wordsZoneMem.size * sizeof(byte); //the characters of the words.
 145         *size += totaltmp;
 146
 147         if (wcsa->myicsa) {
 148                 uint nbytes;
 149                 int err = sizeIntIndex((void *) wcsa->myicsa, &nbytes);
 150                 *size += nbytes;
 151                 //*size += CSA_size(wcsa->myicsa);
 152         }
 153
 154         return 0;
 155 }
 156
 157
 158 /** Querying the index =============================================================*/
 159
 160         /* Writes in numocc the number of occurrences of the substring
 161            pattern[0..length-1] found in the text indexed by index. */
 162 int count (void *index, uchar *pattern, ulong length, ulong *numocc){
 163         uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
 164         uint integerPatternSize;
 165         ulong l,r;
 166
 167         twcsa *wcsa=(twcsa *) index;
 168         parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);
 169         if (!integerPatternSize) {*numocc=0; return 0;} //not found
 170
 171         //*numocc = countCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &l, &r);
 172         int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, numocc,  &l, &r);
 173         return 0;
 174 }
 175
 176         /* Writes in numocc the number of occurrences of the substring
 177           pattern[0..length-1] in the text indexed by index. It also allocates
 178           occ (which must be freed by the caller) and writes the locations of
 179           the numocc occurrences in occ, in arbitrary order.  */
 180 int locate(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc){
 181         return 99;
 182 }
 183
 184    /* Gives the length of the text indexed */
 185 int get_length(void *index, ulong *length) {
 186         twcsa *wcsa=(twcsa *) index;
 187         *length = wcsa->sourceTextSize;
 188         return 0;
 189 }
 190
 191     /**  Obtains the length of the text indexed by index. */
 192
 193 int length (void *index, ulong *length) {
 194         return (get_length(index,length));
 195 }
 196
 197
 198 /** ***********************************************************************************
 199   * Accessing the indexed text
 200   * ***********************************************************************************/
 201
 202
 203         /**  Allocates snippet (which must be freed by the caller) and writes
 204           the substring text[from..to] into it. Returns in snippet_length the
 205           length of the text snippet actually extracted (that could be less
 206           than to-from+1 if to is larger than the text size).                      */
 207 int extract (void *index, ulong from, ulong to, uchar **snippet, ulong *snippet_length) {
 208     twcsa *wcsa=(twcsa *) index;
 209         return 99;
 210 }
 211
 212   /** Displays the text (snippet) surrounding any occurrence of the
 213     substring pattern[0..length-1] within the text indexed by index.
 214     The snippet must include numc characters before and after the
 215     pattern occurrence, totalizing length+2*numc characters, or less if
 216     the text boundaries are reached. Writes in numocc the number of
 217     occurrences, and allocates the arrays snippet_text and
 218     snippet_lengths (which must be freed by the caller). The first is a
 219     character array of numocc*(length+2*numc) characters, with a new
 220     snippet starting at every multiple of length+2*numc. The second
 221     gives the real length of each of the numocc snippets. */
 222
 223 int display (void *index, uchar *pattern, ulong length, ulong numc,
 224         ulong *numocc, uchar **snippet_text, ulong **snippet_lengths) {
 225         return 99;
 226 }
 227
 228
 229
 230 /** ***********************************************************************************
 231   * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord
 232   * ***********************************************************************************/
 233         /* Writes in numocc the number of occurrences of the substring
 234           pattern[0..length-1] in the text indexed by index. It also allocates
 235           occ (which must be freed by the caller) and writes the locations of
 236           the numocc occurrences in occ, in arbitrary order. These occurrences
 237           refer to the offsets in TOH where the caller could start a display
 238           operation. So locateWord implies synchronization using B.
 239           Moreover, positions occ[numocc.. 2*numocc-1] is set with the rank in SE of the
 240           words whose codes begin in TOH in the positions in occ[0... numocc-1]
 241           ** Parameter kbefore sets locateWord not to obtain the offset in TOH of the
 242              searched word, but the offset in TOH of k-before words before.
 243         */
 244
 245 int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore){
 246         uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
 247         uint integerPatternSize;
 248         ulong occurrences,l,r;
 249         twcsa *wcsa=(twcsa *) index;
 250
 251         parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);
 252         if (!integerPatternSize) {*numocc=0; return 0;} //not found
 253
 254         ulong *seOffsets;
 255
 256         //obtains the indexes in vector SE where the pattern appears.
 257         //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences);
 258         int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences);
 259
 260         *numocc = occurrences;
 261
 262         if (!occurrences) {(*occ)=NULL;return 0;}
 263
 264         (*occ) = (ulong *)seOffsets;
 265         return 0;
 266 }
 267
 268
 269   /** Displays the text (snippet) surrounding any occurrence of the
 270     substring pattern[0..length-1] within the text indexed by index.
 271     The snippet must include numc characters before and after the
 272     pattern occurrence, totalizing length+2*numc characters, or less if
 273     the text boundaries are reached. Writes in numocc the number of
 274     occurrences, and allocates the arrays snippet_text and
 275     snippet_lengths (which must be freed by the caller). The first is a
 276     character array of numocc*(length+2*numc) characters, with a new
 277     snippet starting at every multiple of length+2*numc. The second
 278     gives the real length of each of the numocc snippets. */
 279
 280  int displayWords (void *index, uchar *pattern, ulong length, ulong numc,
 281          ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore) {
 282
 283     /** actually extracts upto length + 2*numc chars, starting extraction kbefore
 284      *  words before the occurrence **/
 285
 286         ulong *indexesInSE;
 287         ulong occurrences;
 288         uint bytesPerSnippet;
 289         byte *text_aux;
 290         twcsa *wcsa=(twcsa *) index;
 291
 292         locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);
 293         (*numocc) = occurrences;
 294
 295         if (!occurrences) {
 296                 *snippet_text =NULL;
 297                 *snippet_lengths =NULL;
 298                 return 0;
 299         }
 300
 301         bytesPerSnippet = length+2*numc;
 302 //      bytesPerSnippet = 2*numc;
 303         *snippet_lengths = (ulong *) malloc((*numocc)*sizeof(ulong));
 304         if (!(*snippet_lengths)) return 1;
 305         *snippet_text = (uchar *) malloc((*numocc)*(bytesPerSnippet)*sizeof(uchar) +1) ;  //(the last "1" is for '\0');
 306         if (!(*snippet_text)) return 1;
 307
 308  //     fprintf(stderr,"\n occs found = %7d for pattern %s",*numocc, pattern);
 309  //     fflush(stderr);
 310
 311         text_aux=*snippet_text;
 312         {
 313                 uint i, j, tmplen;
 314                 uint ptr, maxptr;
 315                 byte *src, *dst;
 316                 uint snippetLen;
 317                 uint posSEValue,indexSE;
 318
 319                 for (i=0;i<occurrences;i++) {
 320                                 uint prevValid=0;
 321                                 uint endSnippet =0;
 322
 323                                 /** decodes words from there */
 324                                 snippetLen=0;
 325                                 indexSE = indexesInSE[i];
 326                                 indexSE = (indexSE > kbefore) ? indexSE-kbefore : 0;
 327
 328                                 dst = text_aux;
 329                                 while ((!endSnippet) && (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
 330
 331                                         //posSEValue =displayCSA(wcsa->myicsa,indexSE);
 332                                         int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
 333
 334                                         {//obtains pointer to the ith word
 335                                                 uint offtmp;
 336                                                 uint ith = posSEValue -1;  // !!
 337                                                 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 338                                                 offtmp = bitread (wcsa->wordsData.words, ( ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 339                                                 tmplen -=offtmp;  //the lenght of the ith word.
 340
 341                                                 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
 342                                         }
 343
 344                                         if (_Valid[*src]) {
 345                                                 if (prevValid){
 346                                                          *dst++ =' ';
 347                                                          snippetLen++;
 348                                                          if  (snippetLen==bytesPerSnippet) break;  //end of snippet (ends in BLANK_SPACE)
 349                                                 }
 350                                                 prevValid =1;   //for the next iteration
 351                                         }
 352                                         else prevValid=0;
 353
 354                                         indexSE++;
 355
 356                                         /* at the end ?? */
 357                                         if  ((tmplen+snippetLen)>=bytesPerSnippet) {
 358                                                 tmplen =(bytesPerSnippet - snippetLen);
 359                                                 endSnippet=1; //so while loop ends;
 360                                         }
 361
 362                                         for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
 363                                         snippetLen +=tmplen;
 364                                 }//while
 365
 366                                 text_aux += bytesPerSnippet;
 367                                 (*snippet_lengths)[i] = snippetLen;
 368                         }       //for
 369
 370                         if (occurrences) free(indexesInSE);
 371                 }
 372                 return 0;
 373 }
 374
 375 /** simulates extration of text process, but do not actually returns anything at all
 376    Extracts upto <=2K words from K=wordsbefore words before each occurrence of a pattern.
 377    Less than 2K words can be extracted if more than numc characters have been already obtained.
 378    Does nothing else... does not return the text */
 379
 380 int  displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc) {
 381
 382         ulong *indexesInSE;
 383         ulong occurrences;
 384         byte *text_aux;
 385
 386         twcsa *wcsa=(twcsa *) index;
 387
 388         locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);
 389
 390         if (!occurrences) {
 391                 return 0;
 392         }
 393
 394         ulong maxsnippetLen  = maxnumc;
 395         ulong extractedbytes = 0;
 396
 397         text_aux = (byte *) malloc (maxsnippetLen+1);
 398
 399         {
 400                 uint i, j, tmplen;
 401                 uint ptr, maxptr;
 402                 byte *src, *dst;
 403                 uint snippetLen;
 404                 uint posSEValue,indexSE;
 405
 406                 uint numWordsToExtract = 2 * wordsbefore;
 407                 uint z;
 408                 //printf("\n occurrences... = %lu",occurrences);
 409
 410                 for (i=0;i<occurrences;i++) {
 411                                 uint prevValid=0;
 412                                 uint endSnippet =0;
 413
 414                                 /** decodes words from there */
 415                                 snippetLen=0;
 416                                 indexSE = indexesInSE[i];
 417                                 indexSE = (indexSE > wordsbefore) ? indexSE-wordsbefore : 0;
 418
 419                                 dst = text_aux;
 420                                 z=0;
 421                                 while ((z<numWordsToExtract) && (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
 422
 423                                         //posSEValue =displayCSA(wcsa->myicsa,indexSE);
 424                                         int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
 425
 426                                         {//obtains pointer to the ith word
 427                                                 uint offtmp;
 428                                                 uint ith = posSEValue -1;  // !!
 429                                                 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 430                                                 offtmp = bitread (wcsa->wordsData.words, ( ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 431                                                 tmplen -=offtmp;  //the lenght of the ith word.
 432
 433                                                 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
 434                                         }
 435
 436                                         if (_Valid[*src]) {
 437                                                         if (prevValid){
 438                                                                  *dst++ =' ';
 439                                                                  snippetLen++;
 440                                                                  if  (snippetLen==maxsnippetLen) break;  //end of snippet (ends in BLANK_SPACE)
 441                                                                 }
 442                                                         prevValid =1;   //for the next iteration
 443                                         }
 444                                         else prevValid=0;
 445
 446                                         indexSE++;
 447
 448                                         /* at the end ?? */
 449                                         if  ((tmplen+snippetLen)>=maxsnippetLen) {
 450                                                         break;
 451                                         }
 452
 453                                         //fprintf(stderr,"\ntmplen = %d ",tmplen); fflush(stderr);
 454                                         for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
 455                                         snippetLen +=tmplen;
 456                                         z++;
 457                                 }//while
 458
 459                                 extractedbytes += snippetLen;
 460
 461                         }       //for
 462
 463                         if (occurrences) free(indexesInSE);
 464                 }
 465                 if (text_aux) free (text_aux);
 466                 return extractedbytes;
 467 }
 468
 469
 470
 471 /**  Allocates text (which must be freed by the caller) and recovers the
 472   the substring of text starting from the "fromword"-th word up to the
 473   "toWord"-th words. Returns in text the text, and in "text_lenght" the
 474   length of the text  actually extracted. Text is allocated.
 475   Actually extracts SE[fromWord .. toWord) ... not the last word.    */
 476
 477 int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text,
 478         ulong *text_length){
 479
 480         twcsa *wcsa=(twcsa *) index;
 481     uint initTextLen=10000;
 482     uint avgWordLen =7;
 483
 484         uint i, j;//, tmplen;
 485         uint prevValid = 0;
 486         byte *src, *dst, *buff;
 487         uint tmplen =0;
 488
 489         uint buffBytes = 1000;
 490         uint leng=0; //curr pos in buffer that was occupied.
 491
 492         if (toWord > wcsa->seSize) toWord = wcsa->seSize;
 493         if (fromWord >= wcsa->seSize) fromWord = wcsa->seSize-1;
 494         if (buffBytes < ( (toWord-fromWord)* avgWordLen)) buffBytes = ((toWord-fromWord)* avgWordLen);
 495
 496         buff = (uchar *) malloc (buffBytes * sizeof(char));
 497         if (!buff) return 1; //out of memory.
 498         dst = buff;
 499
 500         register uint indexSE=fromWord;
 501         uint posSEValue=0;
 502         register uint ith;
 503
 504         while  ( (indexSE < toWord) ){ /** extracting words (if not at the end) */
 505
 506                 int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
 507
 508                 {//obtains pointer to the ith word
 509                         uint offtmp;
 510                         ith= posSEValue -1;  // !!
 511                         tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 512                         offtmp = bitread (wcsa->wordsData.words, (ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 513                         tmplen -=offtmp;  //the lenght of the ith word.
 514                         src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
 515                 }
 516
 517                 if ( buffBytes < (leng + tmplen+1) ) {
 518                         buffBytes *=2;
 519                         buff = (uchar*) realloc(buff, buffBytes);
 520                         if (!buff) return 1; //out of memory.
 521                         dst = buff + leng;
 522                 }
 523
 524                 if (_Valid[*src]) {
 525                         if (prevValid){
 526                                 *dst++ =' ';
 527                                 leng  += 1;
 528                         }
 529                         prevValid =1;   //for the next iteration
 530                 }
 531                 else prevValid=0;
 532
 533                 indexSE++;
 534
 535                 for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
 536                 leng +=tmplen;
 537         }//while
 538
 539         *text_length =leng;
 540         *dst='\0';
 541         *text = buff;
 542         return 0;
 543 }
 544
 545
 546
 547 /** ***********************************************************************************
 548          CONSTRUCTION OF THE INDEX WCSA
 549     ***********************************************************************************/
 550
 551         /**------------------------------------------------------------------
 552         Compares two slots (alphanumericaly). For qsort of canonical words
 553         ------------------------------------------------------------------ */
 554         int qSortWordsCompareAlpha(const void *arg1, const void *arg2) {
 555                 tposInHT *a1 = (tposInHT *) arg1;
 556                 tposInHT *a2 = (tposInHT *) arg2;
 557                 return strcmp((char*)a1->word, (char *)a2->word);
 558         }
 559
 560 /**
 561   * BUILDS THE WCSA INDEX
 562   */
 563
 564 int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
 565
 566         unsigned long zeroNode;  //number of different canonical words.
 567
 568         t_hash hash;            // the hash table to store both variants and canonical words.
 569         tposInHT *posInHT;      // structure for canonicals and variants+huffmans
 570
 571         uint sourceTextSize;
 572
 573         uint seSize=0;  //it's size == "numberOfValidWords".
 574         uint *SE;       //Integers vector. (represents the rank of the valid words in the source text).
 575
 576         uint totallenWords=0; //The numberOfBytes that occupy canonical words (their ascii version) in memory
 577
 578
 579         ulong bytesFile,bytesFileReal;
 580         long sizeNValue;
 581         unsigned long size_posInHT;
 582         /* used during first pass */
 583
 584         ulong addrInTH;
 585
 586         byte* inputBuffer = text;
 587         bytesFileReal= bytesFile = length;
 588
 589         sourceTextSize=length;
 590
 591         /** Initializes WCSA structure*/
 592         twcsa *wcsa;
 593         wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
 594         zeroNode=0;
 595         /**      */
 596
 597         //Stimation (Using Heap's law) of the number of different "meaningful" words.
 598         //sizeNValue=N_value;
 599         if(bytesFile<5000000) bytesFile = 5000000;
 600
 601         sizeNValue = (unsigned long) floor(3.9* pow(bytesFile,0.70) );
 602
 603         // Inicializes the arrays used to detect if a char is valid or not.
 604         StartValid();
 605         // Inicializes the arrays used translated a char into lowercase.
 606         StartToLow();
 607
 608
 609         // **********************************************************************************
 610         //STARTING THE FIRST PASS.
 611         // **********************************************************************************
 612         printf("\nSTARTING THE FIRST PASS...");
 613
 614         posInHT = (tposInHT *) malloc(sizeof(tposInHT) * sizeNValue);
 615         size_posInHT = sizeNValue;
 616         hash = initialize_hash (sizeNValue); //hash to cointain both the parsed words
 617
 618         //-----------------------------------------------------------------
 619         //1st pass (processing the file)
 620         {
 621             byte *pbeg,*pend,*wordstart,*aWord;
 622             register ulong size;
 623             register uint i;
 624
 625             pbeg = inputBuffer;
 626             pend = inputBuffer+bytesFileReal;
 627
 628             while (pbeg <pend)
 629             {
 630                 if (*pbeg == 0)
 631                 {
 632                     fprintf(stderr, "buildFacade.c: assert failed, *pbeg == 0\n");
 633                     exit(1);
 634                 }
 635
 636                 //parsing either a word or separator.
 637                 size=0;
 638                 wordstart = pbeg;
 639                 if (_Valid[*pbeg]) {   //alphanumerical data
 640                     while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
 641                 }
 642                 else
 643                 {
 644                     if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
 645                         while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
 646                     }
 647                     else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
 648                         pbeg++;
 649                         if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
 650                         else {
 651                             if (_Valid [*pbeg] ) {
 652                                 wordstart = pbeg;   //So skipping 1 blank character
 653                                 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
 654                             }
 655                             else {   // a "separator word" ...
 656                                 size++; //the prev BLANK...
 657                                 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
 658                             }//else {  // a "separator word"
 659                         }//else ... not a unique BLANK AT THE END.
 660                     }//else ... starting by a BLANK...
 661                 }
 662
 663                 if (pbeg < pend && *pbeg == 0)
 664                     pbeg ++;    // Skip the 0-bytes
 665
 666                 if (size == 0)
 667                 {
 668                     fprintf(stderr, "buildFacade.c: assert failed, size == 0\n");
 669                     exit(1);
 670                 }
 671
 672                 //The parsed word/separator is  is "wordstart", and its length is "size"...
 673                 aWord=wordstart;
 674
 675                 //Processement done for each word word
 676                 i = inHashTable(hash,aWord, size, &addrInTH );
 677                 if (!i){
 678                     insertElement (hash,aWord, size, &addrInTH);
 679                     if (zeroNode >= size_posInHT){
 680                       size_posInHT *= 2;
 681                       posInHT = (tposInHT*) realloc(posInHT, size_posInHT * sizeof(tposInHT));
 682                     }
 683                     posInHT[zeroNode].slot=addrInTH;
 684                     posInHT[zeroNode].word=hash->hash[addrInTH].word;
 685                     hash->hash[addrInTH].posInVoc = zeroNode;
 686                     zeroNode++;
 687                     totallenWords += size +1;                   // +1 due to the '\0' char...
 688                    //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size);
 689                 }
 690                 seSize ++;
 691
 692             }//while pbeg<pend
 693
 694             fprintf(stderr,"\n1st pass ends: TOTAL distinct words: %lu totalWords = %u",zeroNode,seSize);
 695
 696         }//1st pass ends
 697
 698
 699         // **********************************************************************************
 700         // END OF 1ST PASS
 701         // **********************************************************************************
 702
 703         // Sorting the words alphanumerically (over posInHT)
 704         {       register unsigned long i,j;
 705                 //sorting canonical words ...
 706                 qsort(posInHT, zeroNode, sizeof(tposInHT), qSortWordsCompareAlpha);
 707
 708                 //setting in hash the new positions of the  words in the hash table
 709                 for (i=0;i<zeroNode;i++) {
 710                         hash->hash[posInHT[i].slot].posInVoc = i;
 711                 }
 712         }
 713
 714         // INITIALIZING structures for the 2nd pass ......................................
 715         {
 716                 SE  = (uint *) malloc ((seSize+1)*sizeof (uint));
 717         }
 718
 719
 720         // **********************************************************************************
 721         //  STARTING THE SECOND PASS.
 722         // **********************************************************************************/
 723
 724         printf("\nSTARTING THE SECOND PASS... ");
 725         //2nd pass (processing the file)
 726         {
 727             byte *pbeg,*pend,*wordstart,*aWord;
 728             register ulong size;
 729             register uint i;
 730             register ulong countValidWords = 0;
 731
 732
 733             pbeg = inputBuffer;
 734             pend = inputBuffer+bytesFileReal;
 735
 736             while (pbeg <pend) {
 737                 if (*pbeg == 0)
 738                 {
 739                     fprintf(stderr, "buildFacade.c 2nd pass: assert failed, *pbeg == 0\n");
 740                     exit(1);
 741                 }
 742
 743                 //parsing either a word or separator.
 744                 size=0;
 745                 wordstart = pbeg;
 746                 if (_Valid[*pbeg]) {   //alphanumerical data
 747                     while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
 748                 }
 749                 else {
 750                     if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
 751                         while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
 752                     }
 753                     else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
 754                         pbeg++;
 755                         if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
 756                         else {
 757                             if (_Valid [*pbeg] ) {
 758                                 wordstart = pbeg;   //So skipping 1 blank character
 759                                 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
 760                             }
 761                             else {   // a "separator word" ...
 762                                 size++; //the prev BLANK...
 763                                 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg]  && *pbeg != 0)) {size++;pbeg++;}
 764                             }//else {  // a "separator word"
 765                         }//else ... not a unique BLANK AT THE END.
 766                     }//else ... starting by a BLANK...
 767                 }
 768
 769                 if (pbeg < pend && *pbeg == 0)
 770                     pbeg ++;    // Skip the 0-bytes
 771
 772                 if (size == 0)
 773                 {
 774                     fprintf(stderr, "buildFacade.c 2nd pass: assert failed, size == 0\n");
 775                     exit(1);
 776                 }
 777
 778                 //The parsed word/separator is  is "wordstart", and its length is "size"...
 779                 aWord=wordstart;
 780
 781                 //Processement done for each word word
 782                 i = inHashTable(hash,aWord, size, &addrInTH );
 783
 784                 SE[countValidWords]=hash->hash[addrInTH].posInVoc+1;  // !!!!
 785                 countValidWords++;
 786
 787             }// while pbeg<pend
 788
 789             SE[countValidWords] = 0;
 790             fprintf(stderr,"\n2nd pass ends: TOTAL distinct words: %lu totalWords = %lu",zeroNode,countValidWords);
 791
 792         }//2nd pass ends
 793
 794         // **********************************************************************************
 795         // END OF 2ND PASS
 796         // **********************************************************************************
 797
 798         //freeing the source text (it is no longer needed).
 799         free(inputBuffer); //the text
 800
 801         /** Now Setting the data of the index **/
 802         wcsa->n = zeroNode;
 803         wcsa->sourceTextSize = sourceTextSize;
 804         wcsa->seSize = seSize;
 805
 806         // Creating the words of the vocabulary...
 807         {
 808         /** copying the words into WCSA. */
 809                 uint *tmpOffsets = (uint *) malloc (sizeof(uint) * (zeroNode  +1) );  //1 extra uint (to point to the virtual "zeroNode+1" ^th word.
 810                 uint tmpOffset =0;
 811
 812                 byte *zoneMem,*src;
 813                 uint i;
 814
 815                 //Moving data from posInHT to WCSA structure
 816                 //wcsa->wordsData = (twords *) malloc(sizeof(twords) * zeroNode);
 817                 wcsa->wordsData.wordsZoneMem.size = totallenWords - zeroNode; //without '\0' bytes (end-tag).
 818                 wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc ( wcsa->wordsData.wordsZoneMem.size * sizeof(byte) );
 819                 zoneMem = wcsa->wordsData.wordsZoneMem.zone;
 820                 for(i = 0; i < zeroNode; i++) {
 821                         src = posInHT[i].word;                           //copying the canonical word
 822                         //wcsa->wordsData.words[i].word = zoneMem;   //setting the pointer
 823                         tmpOffsets[i]=tmpOffset;  //offset in zoneMem
 824                         while (*src) {*zoneMem++ = *src++; tmpOffset++;}  //moving data until '\0'
 825                         //*zoneMem='\0'; zoneMem++;            //copies also the '\0'
 826
 827                 }
 828                 tmpOffsets[zeroNode]=tmpOffset; //setting pointer to the "virtual" word {zeroNode+1}^{th}
 829
 830                 //kbit encoding of the offsets
 831                 uint elemSize = _bits(tmpOffset);
 832                 wcsa->wordsData.elemSize = elemSize;
 833                 wcsa->wordsData.words = (uint *) malloc (((((zeroNode +1)*elemSize)+W-1) /W) * sizeof(uint));  //with 1 extra slot !.
 834                 wcsa->wordsData.words[((((zeroNode +1)*elemSize)+W-1) /W)   -1 ] =0000;
 835                 //              fprintf(stderr,"\n ElemSize = %d, maxOffset = %d",elemSize,tmpOffset);
 836
 837                 tmpOffset=0;
 838                 for (i=0; i<=zeroNode; i++) {  //setting "zeroNode+1" offsets
 839                                 bitwrite(wcsa->wordsData.words, tmpOffset, elemSize, tmpOffsets[i]);
 840                                 tmpOffset+=elemSize;
 841                 }
 842
 843                 //////////// CHECKS IT WORKED. old !!!!
 844         //              { uint kk;
 845         //                      tmpOffset=0;
 846         //                      for (i=0; i<zeroNode; i++) {  //setting "zeroNode+1" offsets
 847         //                                      kk=bitread(wcsa->wordsData.words, i* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 848         //                                      tmpOffset+=elemSize;
 849         //                                      if (kk != tmpOffsets[i]) {fprintf(stderr,"\n @@@@@@@@ DISTINTOS OFFSETS "); break;}
 850         //                                      else fprintf(stderr,"\n iguales, %d, %d  :: <<%s>>len=%d",kk,i, posInHT[i].word, strlen((char*)posInHT[i].word));
 851         //                      }
 852         //              }
 853         //
 854         //              { uint len1, len, tmplen, len2;
 855         //                      uint i,p;
 856         //                      byte *wcsaWord, *src;
 857         //
 858         //                      for (p=0;p<zeroNode;p++) {
 859         //                              {//preparing for strcompL
 860         //                                      len    = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p+1)), wcsa->wordsData.elemSize);
 861         //                                      tmplen = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p))  , wcsa->wordsData.elemSize);
 862         //
 863         //                              //fprintf(stderr,"\n  :: off[%d]= %d  -  off [%d] = %d  ==> %d",p+1,len,p,tmplen,len-tmplen);
 864         //
 865         //                              len2 =len-tmplen;
 866         //                                      wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
 867         //                              }
 868         //
 869         //                              src = posInHT[p].word;
 870         //                              len1 = strlen((char *)src);
 871         //
 872         //                              if (strcompL(src,wcsaWord,len1,len2) != 0) {
 873         //                                      fprintf(stderr,"\n %6d DISTINTOS !! ===len1 %d,len %d===== <<",p,len1,len2);printWord(src,len1);
 874         //                                      fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>");
 875         //                                      exit(0);
 876         //                              }
 877         //                              else {
 878         //                                      fprintf(stderr,"\n %6d ======len1 %d,len2 %d===== <<",p,len1,len2);printWord(src,len1);
 879         //                                      fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>");
 880         //                              }
 881         //                      }
 882         //
 883         //              }
 884         //
 885
 886         /**-----------*/
 887         //frees memory from hash table and posInHT structures.
 888                 free(tmpOffsets);
 889                 destroy_hash(hash);
 890                 free(posInHT);
 891         }
 892
 893         /** ******* creates the self-index on ints (bottom layer) ==> see build_icsa *********/
 894 /**
 895         #ifdef CSA_ON
 896         {
 897                 uint total;
 898                 fprintf(stderr,"\n **** CREATING CSA from Edu's Code *****");
 899                 ticsa *myicsa;
 900                 myicsa = createIntegerCSA(&SE,seSize+1,build_options);
 901                 wcsa->myicsa= myicsa;
 902                 total = CSA_size(myicsa);
 903
 904                 free(SE);  //SE is no longer needed, (it is indexed by the iCSA)
 905                 printf("\n\t**** [iCSA built on %d words. Size = %ld bytes... RAM",seSize,total);
 906         }
 907         #endif
 908 */
 909
 910         //#ifndef CSA_ON
 911                 wcsa->se = SE;
 912                 wcsa->myicsa = NULL;
 913         //#endif
 914
 915         printf("\n\t ** Building done! **\n");
 916         printf("\n Process finished!\n");
 917
 918         *index = wcsa;
 919         return 0;
 920 }
 921
 922
 923 int build_iCSA (char *build_options, void *index)
 924 {
 925         twcsa *wcsa = (twcsa *) index;
 926         /********* creates the self-index on ints (bottom layer) *********/
 927         //creating CSA from Edu's code...
 928         {
 929                 uint total;
 930                 fprintf(stderr,"\n **** CREATING CSA-bottom-layer *****");
 931                 void *bottomIntIndex;
 932                 int err =  buildIntIndex(wcsa->se,wcsa->seSize+1, build_options,(void **)&bottomIntIndex);
 933                 wcsa->myicsa = bottomIntIndex;
 934
 935                 //total = CSA_size(wcsa->myicsa);
 936                 err = sizeIntIndex((void *) wcsa->myicsa, &total);
 937
 938                 printf("\n\t**** [iCSA built on %d words. Size = %u bytes... RAM",wcsa->seSize,total);
 939         }
 940         return 0;
 941 }
 942
 943 /** ********************************************************************
 944   * Loading from disk
 945   **********************************************************************/
 946
 947 /**-----------------------------------------------------------------
 948  * LoadWCSA.
 949  * Loads all the data structures of WCSA (included the icsa)
 950  ----------------------------------------------------------------- */
 951
 952 twcsa *loadWCSA(char *filename) {
 953         twcsa *wcsa;
 954         // Inicializes the arrays used to detect if a char is valid or not.
 955         StartValid();
 956         // Inicializes the arrays used translated a char into lowercase.
 957         StartToLow();
 958
 959         wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
 960         wcsa->n=0;
 961
 962         int err = loadIntIndex(filename, (void **)&wcsa->myicsa);
 963
 964         loadStructs(wcsa,filename);
 965
 966         return wcsa;
 967 }
 968
 969 /** ------------------------------------------------------------------
 970  * LoadStructs.
 971  *      Reads files and loads all the data needed for searcherFacade
 972  ----------------------------------------------------------------- */
 973  void loadStructs(twcsa *wcsa, char *basename) {
 974         uint i,j;
 975         char *filename;
 976         int file;
 977         uint sizeFile;
 978         char c;
 979         uint n;
 980
 981         filename = (char *)malloc(sizeof(char)*(strlen(basename)+10));
 982         fprintf(stderr,"Loading Index from file %s.*\n", basename);
 983
 984         //** SOME CONSTANTS: sourceTextSize
 985         {       strcpy(filename, basename);
 986                 strcat(filename, ".");
 987                 strcat(filename, CONSTANTS_FILE_EXT);
 988
 989                 if( (file = open(filename, O_RDONLY)) < 0) {
 990                         printf("Cannot open file %s\n", filename);
 991                         exit(0);
 992                 }
 993
 994                 read(file, &(wcsa->sourceTextSize), sizeof(uint));
 995                 read(file, &(wcsa->seSize), sizeof(uint));
 996                 close(file);
 997         }
 998
 999         /** File with the words from the vocabulary (sorted alphabetically) */
1000         {       byte *zoneMem;
1001
1002                 strcpy(filename, basename);
1003                 strcat(filename, ".");
1004                 strcat(filename, VOCABULARY_WORDS_FILE_EXT);
1005                 //sizeFile= fileSize(filename)-sizeof(uint);
1006
1007                 if( (file = open(filename, O_RDONLY)) < 0) {
1008                         printf("Cannot open file %s\n", filename);
1009                         exit(0);
1010                 }
1011
1012                 //the number of canonical words
1013                 read(file, &n, sizeof(uint));
1014                 wcsa->n = n;
1015                 read(file, &(wcsa->wordsData.elemSize), (sizeof(uint)));
1016                 read(file, &(wcsa->wordsData.wordsZoneMem.size), (sizeof(uint)));
1017
1018                 //allocating the memory needed for all words and reading them   //(ascii) << no \0 chars are needed>>.
1019                 wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc(wcsa->wordsData.wordsZoneMem.size * sizeof(byte));
1020                 read(file, (wcsa->wordsData.wordsZoneMem.zone),    wcsa->wordsData.wordsZoneMem.size * sizeof(byte) );
1021
1022                 //reading the offsets of the words (kbitArray that points to offsets in zoneMem of words.
1023                 wcsa->wordsData.words = (uint *) malloc (((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * sizeof(uint));
1024                 wcsa->wordsData.words[ ((((n+1)*(wcsa->wordsData.elemSize))+W-1) /W)   -1 ] =0000;
1025                 read(file, (wcsa->wordsData.words),     ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)));
1026
1027
1028                 close(file);
1029         }
1030         wcsa->se= NULL;
1031         free(filename);
1032 }
1033
1034
1035
1036
1037 /** ****************************************************************
1038   * Querying the index WCSA
1039   * ***************************************************************/
1040 ///////////////////////////////////////////////////////////////////////////////////////
1041 //                                         FUNCTIONS NEEDED FOR SEARCHING A PATTERN                                      //
1042 ///////////////////////////////////////////////////////////////////////////////////////
1043
1044
1045
1046 /*------------------------------------------------------------------
1047  * Given a text pattern translates it into a list of integers (corresponding to the
1048  * canonical words associated to the valid words in the text pattern)
1049  ------------------------------------------------------------------*/
1050 void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *integerPattern, uint *sizeIntegers) {
1051
1052         byte *pbeg,*pend,*wordstart,*aWord;
1053         register unsigned long size;
1054         uint index =0;
1055
1056         pbeg = textPattern;
1057         pend = pbeg + patLen;
1058
1059         while (pbeg <pend) {
1060                 //parsing either a word or separator.
1061                 size=0;
1062                 wordstart = pbeg;
1063                 if (_Valid[*pbeg]) {   //alphanumerical data
1064                         while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
1065           }
1066                 else {
1067                         if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
1068                                 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
1069                         }
1070                         else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
1071                                 pbeg++;
1072                                 if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
1073                                 else {
1074                                         if (_Valid [*pbeg] ) {
1075                                                 wordstart = pbeg;   //So skipping 1 blank character
1076                                                 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
1077                                         }
1078                                         else {   // a "separator word" ...
1079                                                 size++; //the prev BLANK...
1080                                                 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
1081                                         }//else {  // a "separator word"
1082                                 }//else ... not a unique BLANK AT THE END.
1083                         }//else ... starting by a BLANK...
1084                 }
1085
1086                 //The parsed word is "aWord", and its length is "size"...
1087                 aWord=wordstart;
1088
1089                 // Binary search on the canonical words (wordsData)
1090                 {
1091                         uint len, tmplen;
1092                         uchar *wcsaWord;
1093                         register uint min,max,p;
1094                         min = 0;
1095                         max = (wcsa->n) - 1;
1096                         while(min < max) {
1097                                 p = (min+max)/2;
1098
1099                                 {//preparing for strcompL
1100                                         len    = bitread (wcsa->wordsData.words, (p+1)* wcsa->wordsData.elemSize , wcsa->wordsData.elemSize);
1101                                         tmplen = bitread (wcsa->wordsData.words, (p )* wcsa->wordsData.elemSize  , wcsa->wordsData.elemSize);
1102                                         len -=tmplen;
1103                                         wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
1104                                 }
1105
1106                                 //if(strncmp((char*)aWord, (char*)wcsa->wordsData[p].word,size) > 0) min = p+1;
1107                                 if(strcompL(aWord, wcsaWord, size, len) > 0) min = p+1;
1108                                 else max = p;
1109
1110
1111                                 //                              { //SHOW PROGRESS
1112                                 //                                      fprintf(stderr,"\n Patron = <<%s>>, curposWord= %d ==><<",aWord,p);
1113                                 //                                      printWord(wcsaWord,len); fprintf(stderr,">> len =%d",len);
1114                                 //                              }
1115
1116                         }
1117
1118                         {//preparing for strcompL
1119                                 len    = bitread (wcsa->wordsData.words, (min+1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1120                                 tmplen = bitread (wcsa->wordsData.words, ( min )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1121                                 len -=tmplen;
1122                                         wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
1123                         }
1124
1125                         //      if(!strncmp((char*)aWord, (char*)wcsa->wordsData[min].word, size)) {
1126                         if(!strcompL(aWord, wcsaWord, size, len)) {
1127                                 integerPattern[index++] = min +1 ;  //<--
1128                         }
1129                         else {*sizeIntegers = 0; return;}  // a valid word that does not appear in the source text.
1130
1131                 }
1132         }// end while
1133         *sizeIntegers = index;
1134
1135         //      //shows the parsed words:
1136         //      {uint i;
1137         //              printf("\n\n >>%s>> HA SIDO PARSEADO COMO:",textPattern);
1138         //              for (i=0; i<index;i++) {
1139         //                              printf("<<%s>>",wcsa->wordsData[integerPattern[i] -1].word);
1140         //              }
1141         //
1142         //      }
1143 }
1144
1145
1146
1147
1148
1149         /** ------------------------------------------------------------------
1150          * Returns the number of occurrences of a given text pattern
1151          *------------------------------------------------------------------ */
1152 int countTextOcurrences(twcsa *wcsa, byte *textPattern) {
1153         ulong left, right;
1154         uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
1155         uint integerPatternSize, min, max;
1156
1157         uint lenpat = strlen((char*)textPattern);
1158         parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize);
1159         if (!integerPatternSize) return -1;
1160
1161 //      #ifdef DEBUG_ON
1162 //              uint i;
1163 //              printf("\n %d Integers to search for:",integerPatternSize );
1164 //              for (i=0;i<integerPatternSize;i++) {
1165 //                      printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word);
1166 //              }
1167 //              printf("\n");
1168 //      #endif
1169
1170         ulong numocc;
1171         int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, &numocc,  &left, &right);
1172         return numocc;
1173
1174 }
1175
1176
1177         /** ------------------------------------------------------------------
1178          * locateTextOcurrences:
1179          * Returns the offsets of the source text where a word/phrase appears
1180          * Returns also the number of occurrences.
1181          *------------------------------------------------------------------ */
1182 uint *locateTextOcurrences(twcsa *wcsa, byte *textPattern, int *numberOccurrences) {
1183         uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
1184         uint integerPatternSize, min, max;
1185
1186         uint lenpat = strlen((char*)textPattern);
1187         parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize);
1188         if (!integerPatternSize) {*numberOccurrences = -1; return NULL;}
1189
1190 //      #ifdef DEBUG_ON
1191 //              uint i;
1192 //              printf("\n %d Integers to search for:",integerPatternSize );
1193 //              for (i=0;i<integerPatternSize;i++) {
1194 //                      printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word);
1195 //              }
1196 //              printf("\n");
1197 //      #endif
1198
1199         ulong occurrences, left, right;
1200         ulong *seOffsets;
1201         ulong *sourceOffsets;
1202
1203         //obtains the indexes in vector SE where the pattern appears.
1204         //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences);
1205         int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences);
1206
1207         //sourceOffsets = (uint *) malloc (sizeof(uint)*occurrences);
1208
1209         sourceOffsets=seOffsets;
1210         //obtains the offsets in the source text of the pattern (sourceOffsets)
1211         locateFacade(wcsa, (uint *)sourceOffsets, (uint *)seOffsets,occurrences);
1212
1213         #ifdef DEBUG_ON
1214                 fprintf(stderr,"\n*** %s appears in the source text in positions:\n\t",textPattern);
1215                 for (i=0;i<occurrences;i++)
1216                         fprintf(stderr,"[%u]",sourceOffsets[i]);
1217                 fflush(stderr);
1218         #endif
1219
1220         *numberOccurrences = occurrences;
1221         return (uint *) sourceOffsets;
1222 }
1223
1224
1225         /** ------------------------------------------------------------------
1226          * displayTextOcurrences:
1227          * Shows in stdout, the text around the occurrences of a word/phrase
1228          * Returns also the number of occurrences.
1229          *------------------------------------------------------------------ */
1230 int displayTextOcurrences(twcsa *wcsa, byte *textPattern, uint radixDisplay) {
1231         return 99;  //not implemented: function not available
1232 }
1233
1234         /** ------------------------------------------------------------------
1235          * Locate Facade:
1236          * For given sePositions, returns the sourceTextPositions
1237          * where the those valid-words in se[sePositions[i]] occurr.
1238          *------------------------------------------------------------------*/
1239 int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint number) {
1240         return 99;  //not implemented: function not available for this index
1241 }
1242
1243
1244 /** ------------------------------------------------------------------
1245         * DISPLAYFACADE:
1246         * Returns the subString from a starting offset to a final offset
1247         * in the source text. It does not allocate any memory, receives "dstptr"
1248         * Precondition: offsetIni >=0;
1249         ------------------------------------------------------------------*/
1250  int displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length, byte *dstptr) {
1251         return 99;  //not implemented: function not available for this index
1252 }
1253
1254
1255         /**------------------------------------------------------------------
1256          * DISPLAYFacadeMalloc:
1257          * Returns the subString from a starting offset to a final offset
1258          * in the source text. It allocates Memory !!
1259          * NOT CURRENTLY USED
1260          ------------------------------------------------------------------*/
1261 byte *displayFacadeMalloc (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length) {
1262         byte *dstptr=NULL;         //not implemented: function not available
1263         return dstptr;
1264 }
1265
1266
1267         /** ------------------------------------------------------------------
1268          * LOCATEALLandDISPLAY:
1269          * Displays the text around an occurrence of the searched word in the source text.
1270          * Assuming that $p$ is that position --> shows only chars in [p_radix-1,p_radix]
1271          ------------------------------------------------------------------*/
1272 int locateAllAndDisplay (twcsa *wcsa, uint *sePositions, uint number, int radix) {
1273 return 99;   //not implemented: function not available for this index
1274
1275 }
1276
1277
1278         /** ------------------------------------------------------------------
1279          * recovers the source text by calling display(0,fileSize);
1280          * ------------------------------------------------------------------ */
1281 void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {
1282
1283         int start;int end;
1284         byte *cc;
1285         char *filename = (char *) malloc (strlen( basename)+ strlen(ext)+1);
1286         ulong length;
1287
1288         strcpy( filename,  basename);
1289         strcat( filename, ext);
1290         filename[strlen( basename)+ strlen(ext)]='\0';
1291         fprintf(stderr,"\n uncompressing text into file -->%s ",filename);fflush(stderr);
1292
1293         FILE *salida;
1294         unlink( filename);
1295         salida = fopen( filename,"w");
1296         start=0; end = sourceTextSize-1;
1297
1298         cc = (byte *) malloc (sourceTextSize* sizeof(uchar));
1299
1300         {
1301                 uint i, j;//, tmplen;
1302                 uint prevValid = 0;
1303                 //uint ptr, maxptr;
1304                 byte *src, *dst;
1305                 uint leng =0;
1306                 uint tmplen =0;
1307
1308                 uint indexSE=0;
1309                 uint posSEValue=0;
1310
1311                 dst=cc;
1312                 while  ( (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
1313
1314                         int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
1315
1316                         {//obtains pointer to the ith word
1317                                 uint offtmp;
1318                                 uint ith = posSEValue -1;  // !!
1319                                 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1320                                 offtmp = bitread (wcsa->wordsData.words, ( ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1321                                 tmplen -=offtmp;  //the lenght of the ith word.
1322                                 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
1323                         }
1324
1325                         if (_Valid[*src]) {
1326                                 if (prevValid){
1327                                          *dst++ =' ';
1328                                         leng +=1;
1329                                 }
1330                                 prevValid =1;   //for the next iteration
1331                         }
1332                         else prevValid=0;
1333
1334                         indexSE++;
1335
1336                         for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
1337                         leng +=tmplen;
1338                 }//while
1339
1340         fprintf(stderr,"\n sourceTextSize = %d, len = %d",sourceTextSize,leng);
1341         fwrite(cc,sizeof(byte),leng,salida);
1342         fclose(salida);
1343
1344         free(cc);
1345         free(filename);
1346 }
1347
1348
1349 }
1350
1351                 //recovers the source text by calling extract Words.
1352 void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {
1353
1354         int start;int end; int error;
1355         char *filename = (char *) malloc (strlen( basename)+ strlen(ext)+1);
1356         byte *cc;
1357         ulong length;
1358
1359         strcpy( filename,  basename);
1360         strcat( filename, ext);
1361         filename[strlen( basename)+ strlen(ext)]='\0';
1362         fprintf(stderr,"\n uncompressing text into file -->%s ",filename);fflush(stderr);
1363
1364         FILE *salida;
1365         unlink( filename);
1366         salida = fopen( filename,"w");
1367         start=0; end = wcsa->seSize;
1368
1369         error = extractWords((void *) wcsa, start, end, &cc, &length);
1370         if (error) {fprintf(stderr,"\n error during recoverSourceText2"); exit(0);}
1371
1372         fprintf(stderr,"\n sourceTextSize = %d, len = %ld",sourceTextSize,length);
1373         fwrite(cc,sizeof(byte),length,salida);
1374         fclose(salida);
1375
1376         free(cc);
1377         free(filename);
1378 }
1379
1380 /** *******************************************************************************
1381   * Showing some statistics and info of the index
1382   * *******************************************************************************/
1383 void printInfoReduced(twcsa *wcsa) {
1384           //not implemented: function not available
1385 }
1386
1387                 /* Shows summary info of the index */
1388 int printInfo(void *index) {
1389         uint n;
1390
1391         twcsa *wcsa = (twcsa *) index;
1392
1393         unsigned long indexSize;
1394         uint intIndexSize, presentationSize;
1395         int err;
1396
1397         err = index_size(index, &indexSize);
1398         if (err!=0) return err;
1399         err = sizeIntIndex(wcsa->myicsa, &intIndexSize);
1400         if (err!=0) return err;
1401
1402         presentationSize = indexSize - intIndexSize;
1403
1404                 printf("\n ===================================================:");
1405                 printf("\n Summary of Presentation layer:");
1406                 printf("\n   Number of valid words (SEsize) = %u",wcsa->seSize);
1407                 printf("\n   Number of different words = %ld",wcsa->n);
1408                 printf("\n   WCSA structure = %lu bytes", sizeof(twcsa));
1409
1410                 uint totalpointers = ((((wcsa->n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));
1411                 uint totalasciizone = wcsa->wordsData.wordsZoneMem.size * sizeof(byte) ;
1412                 uint totalwords = totalasciizone + totalpointers;
1413
1414                 printf("\n   Size Of words structure (%d bytes):",totalwords);
1415                 printf("\n      [     pointers = %d bytes || AsciiZone = %d bytes", totalpointers,      totalasciizone);
1416
1417                 printf("\n\n Total = **  %u bytes (in RAM) **",presentationSize);
1418                 //printf("\n\n @@ Summary of self-index on Integers:");
1419                 err = printInfoIntIndex(wcsa->myicsa, " ");
1420                 if (err!=0) return err;
1421
1422                 printf("\n ===================================================:");
1423                 printf("\n");
1424                 return 0;
1425         }
1426
1427 /**------------------------------------------------------------------
1428  * structsSize.
1429  *      Counts the memory amount needed by the Facade (Presentation Layer).
1430  * skipping the stop_words hash table
1431  ----------------------------------------------------------------- */
1432 uint structsSizeMem(twcsa *wcsa) {
1433         return 0;   //not implemented: function not available for this index.
1434 }
1435
1436
1437 /** for debugging **/
1438 void printWord(uchar *str, uint len) {
1439                 uint i;
1440                 for (i=0;i<len;i++)
1441                         fprintf(stderr,"%c",str[i]);
1442 }
1443
1444
1445         /** saves the content of the file SE (ids of the source words) **/
1446 int saveSEfile (char *basename, uint *v, uint n) {
1447         char outfilename[255];
1448         int file;
1449         sprintf(outfilename,"%s.%s",basename,SE_FILE_EXT);
1450         unlink(outfilename);
1451         if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
1452                 printf("Cannot open file %s\n", outfilename);
1453                 exit(0);
1454         }
1455
1456         write(file, v, sizeof(uint) * n );
1457         close(file);
1458 }
1459
1460
1461
1462 double getTime2 (void)
1463 {
1464         double usertime, systime;
1465         struct rusage usage;
1466
1467         getrusage (RUSAGE_SELF, &usage);
1468
1469         usertime = (double) usage.ru_utime.tv_sec +
1470                 (double) usage.ru_utime.tv_usec / 1000000.0;
1471         systime = (double) usage.ru_stime.tv_sec +
1472                 (double) usage.ru_stime.tv_usec / 1000000.0;
1473
1474         return (usertime + systime);
1475 }
1476
1477
1478
1479 /**------------------------------------------------------------------
1480   *  MAIN PROGRAM.
1481   *------------------------------------------------------------------ */
1482 #ifdef FACADEWITHMAIN
1483         int main(int argc, char* argv[])
1484         {
1485
1486
1487
1488                 char *infile, *outbasename, *stopwordsfile;     // Name of in/out files
1489                 byte *inputBuffer;
1490                 ulong finsize;
1491
1492                 int f_in;
1493                 void *Index;
1494
1495
1496                 printf("\n*Word-based iCSA: A word-based CSA");
1497                 printf("\n*CopyRight (c) 2008 [LBD & G.N.]\n\n");
1498
1499                 // Reads input parameters from command line.
1500                 if(argc < 3) {
1501                         printf("Use: %s <in file> <out basename> \n", argv[0]);
1502                         exit(0);
1503                 }
1504
1505                 // Reads params (input file, output basename, and stopwords file)
1506                 infile = argv[1];
1507                 outbasename = argv[2];
1508                 stopwordsfile = argv[3];
1509
1510                 finsize= fileSize(infile);
1511
1512                 if (! finsize) {
1513                         printf( "\nFILE EMPTY OR FILE NOT FOUND %s !!\nSkipping processement ...\n",infile);
1514                         exit(0);
1515                 }
1516
1517                 // Opening the input text file.
1518                 if( (f_in = open(infile, O_RDONLY)) < 0) {
1519                         printf("Cannot read file %s\n", infile);
1520                         exit(0);
1521                 }
1522                 inputBuffer = (byte *) malloc(finsize *sizeof(byte));// +1);
1523                 read (f_in,inputBuffer,finsize);
1524                 close (f_in);
1525
1526
1527         {
1528                 //printf("\n parametros <<%s>>\n\n",stopwordsfile);
1529                 build_index (inputBuffer, finsize, stopwordsfile, &Index);  /** building the index */
1530
1531 //              /** recovering the source text from the index */
1532                         {
1533                                 double start, end;
1534                                 start = getTime2();
1535                                 ulong size;
1536                                 get_length(Index, &size);
1537                                 char extension[10]= ".source";
1538
1539                                 //recoverSourceText1((twcsa*) Index, outbasename,extension, size);
1540                                 strcat(extension,"2");
1541                                 recoverSourceText2((twcsa*) Index, outbasename,extension,size);
1542                                 end = getTime2();
1543                                 fprintf(stderr, "\nRecovering source file time: %.3f secs\n", end-start );
1544                         }
1545 //
1546                 // DISPLAYING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
1547                         {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1548                          int error = 0;
1549                         ulong numocc,numc, length, i, *snippet_len, tot_numcharext = 0, numpatt;
1550                         uchar *pattern, *snippet_text;
1551
1552                                  pattern = textPattern;
1553                          printf("\nSEARCH TEST for DISPLAY (pizzachili interface)\n");
1554                                 while(1) {
1555                                         printf("Intro string: ");
1556                                         fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1557                                         if (!strcmp((char*)textPattern,"\n") ) break;
1558                                          textPattern[strlen((char*)textPattern)-1] = '\0';
1559
1560                                         length = strlen( (char*)textPattern);
1561                                         numc=50;
1562
1563 //                                      error = display (Index, textPattern, length, numc, &numocc,
1564 //                                                               &snippet_text, &snippet_len);
1565                                         error = displayWords (Index, textPattern, length, numc, &numocc,
1566                                                                  &snippet_text, &snippet_len,1);
1567
1568                                         if (error){ fprintf(stderr, "%s\n", "Hubo un error durante display");exit(0);}
1569
1570                                                 fprintf(stderr,"\n acabou display");fflush(stderr);
1571                                         {//show the results
1572                                                 ulong j, len = length + 2*numc;
1573                                             char blank = '\0';
1574                                                 fprintf(stderr,"\n length = %d",length);
1575                                                 fprintf(stderr,"\n pattern = %s",pattern);fflush(stderr);
1576                                                 fprintf(stderr,"\n numocc = %d",numocc);fflush(stderr);
1577                                                 fprintf(stderr,"\n snippet len = %d",len);fflush(stderr);
1578                                                 fprintf(stderr,"\n =========");fflush(stderr);
1579                                                 for (i = 0; i < numocc; i++){
1580                                                         fprintf(stderr,"\n[%2d][len=%3d]<<",i+1,snippet_len[i]);fflush(stderr);
1581                                                         fwrite(snippet_text+len*i,sizeof(uchar),snippet_len[i],stderr);fflush(stderr);
1582                                                         fprintf(stderr,">>");fflush(stderr);
1583                                                 }
1584                                         }
1585                                         numpatt--;
1586
1587                                         for(i=0; i<numocc; i++) {
1588                                                 tot_numcharext += snippet_len[i];
1589                                         }
1590
1591                                         if (numocc) {
1592                                                 free (snippet_len);
1593                                                 free (snippet_text);
1594                                         }
1595
1596                                         printf("Ocurrences = %d\n", numocc);
1597                                         if (!strcmp((char*)textPattern,"\n") ) break;
1598                                 }
1599                         }
1600
1601 //
1602 //
1603 //      // SEARCHING FOR A TEXT PATTERN (word/phrase).
1604 //      {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1605 //       int occ;
1606 //       int len;
1607 //       uint *occs;
1608 //       int i;
1609 //       printf("\nSEARCH TEST for LOCATE\n");
1610 //              while(1) {
1611 //                      printf("Intro string: ");
1612 //                      fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1613 //                      len = strlen((char*)textPattern);
1614 //                      if (!strcmp((char*)textPattern,"\n") ) break;
1615 //                      textPattern[len-1] = '\0';
1616 //                      len --;
1617 //
1618 //                      //occs = locateTextOcurrences(wcsa,textPattern,&occ);
1619 //                      // locate(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ);
1620 //                        locateWord(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ, 0);
1621 //
1622 //                      printf("\n*** %s occurs %d times: In the source text in positions:\n\t",textPattern,occ);
1623 //                      for (i=0;i<occ;i++)
1624 //                              printf("[%u]",occs[i]);
1625 //                      fflush(stderr);
1626 //                      free(occs);
1627 //
1628 //                      if (!strcmp((char*)textPattern,"\n") ) break;
1629 //              }
1630 //      }
1631 //
1632 //
1633
1634                 // COUNTING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
1635                 /*
1636                 {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1637                  int occ;
1638                  int len;
1639                  printf("\nSEARCH TEST for COUNT.\n");
1640                         while(1) {
1641                                 printf("Intro string: ");
1642                                 fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1643                                 len = strlen((char*)textPattern);
1644                                 if (!strcmp((char*)textPattern,"\n") ) break;
1645                                 textPattern[len-1] = '\0';
1646                                 len --;
1647
1648                                 count(Index, textPattern, len, (ulong *)&occ);
1649                                 //occ = countTextOcurrences(wcsa,textPattern);
1650                                 printf("Ocurrences = %d\n", occ);
1651                         }
1652                 }
1653                 printf("\n END COUNTING OCCURRENCES OF PATTERNS. ...\n");
1654                 //exit(0);
1655                 */
1656
1657                 /** saving the index to disk*/
1658                 save_index (Index, outbasename);
1659
1660                 /** tells the mem used by the index */
1661                 ulong indexsize;
1662                 index_size(Index, &indexsize);
1663                 fprintf(stderr,"Index occupied %d bytes, 2 extra mallocs = %d",indexsize,2* sizeof(uint));
1664
1665                 /** freeing the index */
1666                 free_index(Index);
1667
1668         }
1669 }
1670
1671 #endif
1672
1673
1674