swcsa/buildFacade.c

   1 #include "buildFacade.h"
   2 #include "utils/errors.c"
   3
   4
   5 /** Building the index */
   6
   7     /* Creates index from text[0..length-1]. Note that the index is an
   8       opaque data type. Any build option must be passed in string
   9       build_options, whose syntax depends on the index. The index must
  10       always work with some default parameters if build_options is NULL.
  11       The returned index is ready to be queried. */
  12 int build_index (uchar *text, ulong length, char *build_options, void **index) {
  13         int returnvalue;
  14
  15      printf("\n parameters: \"%s\"\n",build_options); fflush(stderr);
  16
  17     returnvalue = build_WCSA (text, length, build_options, index);
  18
  19     if (!returnvalue)
  20         returnvalue = build_iCSA (build_options,*index);
  21
  22     return returnvalue;
  23 }
  24
  25
  26 /**  Saves index on disk by using single or multiple files, having
  27         proper extensions. */
  28 int save_index (void *index, char *filename) {
  29
  30         char *basename = filename;
  31         twcsa *wcsa=(twcsa *) index;
  32
  33         uint i,j;
  34         char *outfilename;
  35         int file;
  36         char c;
  37
  38         printf("\n Saving structures to disk: %s.*",filename);
  39         outfilename = (char *)malloc(sizeof(char)*(strlen(basename)+10));
  40
  41         /**File with some constants (bSize and tohSize); */
  42         { uint number;
  43                 strcpy(outfilename, basename);
  44                 strcat(outfilename, ".");
  45                 strcat(outfilename, CONSTANTS_FILE_EXT);
  46                 unlink(outfilename);
  47                 if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
  48                         printf("Cannot open file %s\n", outfilename);
  49                         exit(0);
  50                 }
  51                 write(file, &(wcsa->sourceTextSize), sizeof(uint));
  52                 write(file, &(wcsa->seSize), sizeof(uint));
  53                 close(file);
  54         }
  55
  56         /** The Words in the vocabulary of words  (sorted alphabetically)*/
  57         {       strcpy(outfilename, basename);
  58                 strcat(outfilename, ".");
  59                 strcat(outfilename, VOCABULARY_WORDS_FILE_EXT);
  60                 unlink(outfilename);
  61                 if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
  62                         printf("Cannot open file %s\n", outfilename);
  63                         exit(0);
  64                 }
  65
  66                 uint n = wcsa->n;
  67                 uint elemSize = wcsa->wordsData.elemSize;
  68                 write(file, &n, sizeof(uint));
  69                 write(file, &elemSize, sizeof(uint));
  70                 write(file, &(wcsa->wordsData.wordsZoneMem.size), sizeof(uint));
  71
  72                 //the number of canonical words
  73                 write(file, (char *)wcsa->wordsData.wordsZoneMem.zone, wcsa->wordsData.wordsZoneMem.size * sizeof(byte));
  74                 write(file, (char *)wcsa->wordsData.words, ((((n+1)* (elemSize))+W-1) /W) * (sizeof(uint)) );
  75
  76                 close(file);
  77         }
  78
  79         free(outfilename);
  80
  81         if (wcsa->myicsa) {
  82                 /******** saves index on integers (bottom) ******/
  83                 //Storing the CSA
  84                 //storeStructsCSA(wcsa->myicsa,basename);
  85                 saveIntIndex((void *) wcsa->myicsa, basename);
  86         }
  87
  88         if (wcsa->se) {
  89                 saveSEfile(basename,wcsa->se, wcsa->seSize+1);
  90                 //free(wcsa->se);
  91         }
  92
  93         return 0;
  94 }
  95
  96
  97
  98     /**  Loads index from one or more file(s) named filename, possibly
  99       adding the proper extensions. */
 100 int load_index(char *filename, void **index){
 101         twcsa *wcsa;
 102         wcsa = loadWCSA (filename);
 103         (*index) = (void *) wcsa;
 104         return 0;
 105 }
 106
 107         /** Frees the memory occupied by index. */
 108 int free_index(void *index){
 109         twcsa *wcsa=(twcsa *) index;
 110         ulong size;
 111         index_size(index,&size);
 112         printf("\n[destroying index] ...Freed %lu bytes... RAM", size);
 113
 114
 115         //frees the array SE.
 116         if (wcsa->se)
 117                 free (wcsa->se);
 118
 119         //the iCSA.
 120         if (wcsa->myicsa) {
 121                 //destroyStructsCSA(wcsa->myicsa);
 122                 int err = freeIntIndex((void *) wcsa->myicsa);
 123         }
 124
 125         //the words.
 126         free (wcsa->wordsData.wordsZoneMem.zone);
 127         free (wcsa->wordsData.words); /** huge!! */
 128
 129         //the pointer to wcsa.
 130         free(wcsa);
 131         return 0;
 132 }
 133
 134         /** Gives the memory occupied by index in bytes. */
 135 int index_size(void *index, ulong *size) {
 136         ulong totaltmp;
 137         twcsa *wcsa=(twcsa *)index;
 138         uint n= wcsa->n;
 139         *size=0;
 140         *size += sizeof(twcsa);
 141
 142         totaltmp=0;  //words
 143         totaltmp += ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));  //the pointers
 144         totaltmp += wcsa->wordsData.wordsZoneMem.size * sizeof(byte); //the characters of the words.
 145         *size += totaltmp;
 146
 147         if (wcsa->myicsa) {
 148                 uint nbytes;
 149                 int err = sizeIntIndex((void *) wcsa->myicsa, &nbytes);
 150                 *size += nbytes;
 151                 //*size += CSA_size(wcsa->myicsa);
 152         }
 153
 154         return 0;
 155 }
 156
 157
 158 /** Querying the index =============================================================*/
 159
 160         /* Writes in numocc the number of occurrences of the substring
 161            pattern[0..length-1] found in the text indexed by index. */
 162 int count (void *index, uchar *pattern, ulong length, ulong *numocc){
 163         uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
 164         uint integerPatternSize;
 165         ulong l,r;
 166
 167         twcsa *wcsa=(twcsa *) index;
 168         parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);
 169         if (!integerPatternSize) {*numocc=0; return 0;} //not found
 170
 171         //*numocc = countCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &l, &r);
 172         int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, numocc,  &l, &r);
 173         return 0;
 174 }
 175
 176         /* Writes in numocc the number of occurrences of the substring
 177           pattern[0..length-1] in the text indexed by index. It also allocates
 178           occ (which must be freed by the caller) and writes the locations of
 179           the numocc occurrences in occ, in arbitrary order.  */
 180 int locate(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc){
 181         return 99;
 182 }
 183
 184    /* Gives the length of the text indexed */
 185 int get_length(void *index, ulong *length) {
 186         twcsa *wcsa=(twcsa *) index;
 187         *length = wcsa->sourceTextSize;
 188         return 0;
 189 }
 190
 191     /**  Obtains the length of the text indexed by index. */
 192
 193 int length (void *index, ulong *length) {
 194         return (get_length(index,length));
 195 }
 196
 197
 198 /** ***********************************************************************************
 199   * Accessing the indexed text
 200   * ***********************************************************************************/
 201
 202
 203         /**  Allocates snippet (which must be freed by the caller) and writes
 204           the substring text[from..to] into it. Returns in snippet_length the
 205           length of the text snippet actually extracted (that could be less
 206           than to-from+1 if to is larger than the text size).                      */
 207 int extract (void *index, ulong from, ulong to, uchar **snippet, ulong *snippet_length) {
 208     twcsa *wcsa=(twcsa *) index;
 209         return 99;
 210 }
 211
 212   /** Displays the text (snippet) surrounding any occurrence of the
 213     substring pattern[0..length-1] within the text indexed by index.
 214     The snippet must include numc characters before and after the
 215     pattern occurrence, totalizing length+2*numc characters, or less if
 216     the text boundaries are reached. Writes in numocc the number of
 217     occurrences, and allocates the arrays snippet_text and
 218     snippet_lengths (which must be freed by the caller). The first is a
 219     character array of numocc*(length+2*numc) characters, with a new
 220     snippet starting at every multiple of length+2*numc. The second
 221     gives the real length of each of the numocc snippets. */
 222
 223 int display (void *index, uchar *pattern, ulong length, ulong numc,
 224         ulong *numocc, uchar **snippet_text, ulong **snippet_lengths) {
 225         return 99;
 226 }
 227
 228
 229
 230 /** ***********************************************************************************
 231   * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord
 232   * ***********************************************************************************/
 233         /* Writes in numocc the number of occurrences of the substring
 234           pattern[0..length-1] in the text indexed by index. It also allocates
 235           occ (which must be freed by the caller) and writes the locations of
 236           the numocc occurrences in occ, in arbitrary order. These occurrences
 237           refer to the offsets in TOH where the caller could start a display
 238           operation. So locateWord implies synchronization using B.
 239           Moreover, positions occ[numocc.. 2*numocc-1] is set with the rank in SE of the
 240           words whose codes begin in TOH in the positions in occ[0... numocc-1]
 241           ** Parameter kbefore sets locateWord not to obtain the offset in TOH of the
 242              searched word, but the offset in TOH of k-before words before.
 243         */
 244
 245 int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore){
 246         uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
 247         uint integerPatternSize;
 248         ulong occurrences,l,r;
 249         twcsa *wcsa=(twcsa *) index;
 250
 251         parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);
 252         if (!integerPatternSize) {*numocc=0; return 0;} //not found
 253
 254         ulong *seOffsets;
 255
 256         //obtains the indexes in vector SE where the pattern appears.
 257         //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences);
 258         int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences);
 259
 260         *numocc = occurrences;
 261
 262         if (!occurrences) {(*occ)=NULL;return 0;}
 263
 264         (*occ) = (ulong *)seOffsets;
 265         return 0;
 266 }
 267
 268
 269   /** Displays the text (snippet) surrounding any occurrence of the
 270     substring pattern[0..length-1] within the text indexed by index.
 271     The snippet must include numc characters before and after the
 272     pattern occurrence, totalizing length+2*numc characters, or less if
 273     the text boundaries are reached. Writes in numocc the number of
 274     occurrences, and allocates the arrays snippet_text and
 275     snippet_lengths (which must be freed by the caller). The first is a
 276     character array of numocc*(length+2*numc) characters, with a new
 277     snippet starting at every multiple of length+2*numc. The second
 278     gives the real length of each of the numocc snippets. */
 279
 280  int displayWords (void *index, uchar *pattern, ulong length, ulong numc,
 281          ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore) {
 282
 283     /** actually extracts upto length + 2*numc chars, starting extraction kbefore
 284      *  words before the occurrence **/
 285
 286         ulong *indexesInSE;
 287         ulong occurrences;
 288         uint bytesPerSnippet;
 289         byte *text_aux;
 290         twcsa *wcsa=(twcsa *) index;
 291
 292         locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);
 293         (*numocc) = occurrences;
 294
 295         if (!occurrences) {
 296                 *snippet_text =NULL;
 297                 *snippet_lengths =NULL;
 298                 return 0;
 299         }
 300
 301         bytesPerSnippet = length+2*numc;
 302 //      bytesPerSnippet = 2*numc;
 303         *snippet_lengths = (ulong *) malloc((*numocc)*sizeof(ulong));
 304         if (!(*snippet_lengths)) return 1;
 305         *snippet_text = (uchar *) malloc((*numocc)*(bytesPerSnippet)*sizeof(uchar) +1) ;  //(the last "1" is for '\0');
 306         if (!(*snippet_text)) return 1;
 307
 308  //     fprintf(stderr,"\n occs found = %7d for pattern %s",*numocc, pattern);
 309  //     fflush(stderr);
 310
 311         text_aux=*snippet_text;
 312         {
 313                 uint i, j, tmplen;
 314                 uint ptr, maxptr;
 315                 byte *src, *dst;
 316                 uint snippetLen;
 317                 uint posSEValue,indexSE;
 318
 319                 for (i=0;i<occurrences;i++) {
 320                                 uint prevValid=0;
 321                                 uint endSnippet =0;
 322
 323                                 /** decodes words from there */
 324                                 snippetLen=0;
 325                                 indexSE = indexesInSE[i];
 326                                 indexSE = (indexSE > kbefore) ? indexSE-kbefore : 0;
 327
 328                                 dst = text_aux;
 329                                 while ((!endSnippet) && (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
 330
 331                                         //posSEValue =displayCSA(wcsa->myicsa,indexSE);
 332                                         int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
 333
 334                                         {//obtains pointer to the ith word
 335                                                 uint offtmp;
 336                                                 uint ith = posSEValue -1;  // !!
 337                                                 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 338                                                 offtmp = bitread (wcsa->wordsData.words, ( ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 339                                                 tmplen -=offtmp;  //the lenght of the ith word.
 340
 341                                                 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
 342                                         }
 343
 344                                         if (_Valid[*src]) {
 345                                                 if (prevValid){
 346                                                          *dst++ =' ';
 347                                                          snippetLen++;
 348                                                          if  (snippetLen==bytesPerSnippet) break;  //end of snippet (ends in BLANK_SPACE)
 349                                                 }
 350                                                 prevValid =1;   //for the next iteration
 351                                         }
 352                                         else prevValid=0;
 353
 354                                         indexSE++;
 355
 356                                         /* at the end ?? */
 357                                         if  ((tmplen+snippetLen)>=bytesPerSnippet) {
 358                                                 tmplen =(bytesPerSnippet - snippetLen);
 359                                                 endSnippet=1; //so while loop ends;
 360                                         }
 361
 362                                         for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
 363                                         snippetLen +=tmplen;
 364                                 }//while
 365
 366                                 text_aux += bytesPerSnippet;
 367                                 (*snippet_lengths)[i] = snippetLen;
 368                         }       //for
 369
 370                         if (occurrences) free(indexesInSE);
 371                 }
 372                 return 0;
 373 }
 374
 375 /** simulates extration of text process, but do not actually returns anything at all
 376    Extracts upto <=2K words from K=wordsbefore words before each occurrence of a pattern.
 377    Less than 2K words can be extracted if more than numc characters have been already obtained.
 378    Does nothing else... does not return the text */
 379
 380 int  displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc) {
 381
 382         ulong *indexesInSE;
 383         ulong occurrences;
 384         byte *text_aux;
 385
 386         twcsa *wcsa=(twcsa *) index;
 387
 388         locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);
 389
 390         if (!occurrences) {
 391                 return 0;
 392         }
 393
 394         ulong maxsnippetLen  = maxnumc;
 395         ulong extractedbytes = 0;
 396
 397         text_aux = (byte *) malloc (maxsnippetLen+1);
 398
 399         {
 400                 uint i, j, tmplen;
 401                 uint ptr, maxptr;
 402                 byte *src, *dst;
 403                 uint snippetLen;
 404                 uint posSEValue,indexSE;
 405
 406                 uint numWordsToExtract = 2 * wordsbefore;
 407                 uint z;
 408                 //printf("\n occurrences... = %lu",occurrences);
 409
 410                 for (i=0;i<occurrences;i++) {
 411                                 uint prevValid=0;
 412                                 uint endSnippet =0;
 413
 414                                 /** decodes words from there */
 415                                 snippetLen=0;
 416                                 indexSE = indexesInSE[i];
 417                                 indexSE = (indexSE > wordsbefore) ? indexSE-wordsbefore : 0;
 418
 419                                 dst = text_aux;
 420                                 z=0;
 421                                 while ((z<numWordsToExtract) && (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
 422
 423                                         //posSEValue =displayCSA(wcsa->myicsa,indexSE);
 424                                         int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
 425
 426                                         {//obtains pointer to the ith word
 427                                                 uint offtmp;
 428                                                 uint ith = posSEValue -1;  // !!
 429                                                 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 430                                                 offtmp = bitread (wcsa->wordsData.words, ( ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 431                                                 tmplen -=offtmp;  //the lenght of the ith word.
 432
 433                                                 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
 434                                         }
 435
 436                                         if (_Valid[*src]) {
 437                                                         if (prevValid){
 438                                                                  *dst++ =' ';
 439                                                                  snippetLen++;
 440                                                                  if  (snippetLen==maxsnippetLen) break;  //end of snippet (ends in BLANK_SPACE)
 441                                                                 }
 442                                                         prevValid =1;   //for the next iteration
 443                                         }
 444                                         else prevValid=0;
 445
 446                                         indexSE++;
 447
 448                                         /* at the end ?? */
 449                                         if  ((tmplen+snippetLen)>=maxsnippetLen) {
 450                                                         break;
 451                                         }
 452
 453                                         //fprintf(stderr,"\ntmplen = %d ",tmplen); fflush(stderr);
 454                                         for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
 455                                         snippetLen +=tmplen;
 456                                         z++;
 457                                 }//while
 458
 459                                 extractedbytes += snippetLen;
 460
 461                         }       //for
 462
 463                         if (occurrences) free(indexesInSE);
 464                 }
 465                 if (text_aux) free (text_aux);
 466                 return extractedbytes;
 467 }
 468
 469
 470
 471 /**  Allocates text (which must be freed by the caller) and recovers the
 472   the substring of text starting from the "fromword"-th word up to the
 473   "toWord"-th words. Returns in text the text, and in "text_lenght" the
 474   length of the text  actually extracted. Text is allocated.
 475   Actually extracts SE[fromWord .. toWord) ... not the last word.    */
 476
 477 int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text,
 478         ulong *text_length){
 479
 480         twcsa *wcsa=(twcsa *) index;
 481     uint initTextLen=10000;
 482     uint avgWordLen =7;
 483
 484         uint i, j;//, tmplen;
 485         uint prevValid = 0;
 486         byte *src, *dst, *buff;
 487         uint tmplen =0;
 488
 489         uint buffBytes = 1000;
 490         uint leng=0; //curr pos in buffer that was occupied.
 491
 492         if (toWord > wcsa->seSize) toWord = wcsa->seSize;
 493         if (fromWord >= wcsa->seSize) fromWord = wcsa->seSize-1;
 494         if (buffBytes < ( (toWord-fromWord)* avgWordLen)) buffBytes = ((toWord-fromWord)* avgWordLen);
 495
 496         buff = (uchar *) malloc (buffBytes * sizeof(char));
 497         if (!buff) return 1; //out of memory.
 498         dst = buff;
 499
 500         register uint indexSE=fromWord;
 501         uint posSEValue=0;
 502         register uint ith;
 503
 504         while  ( (indexSE < toWord) ){ /** extracting words (if not at the end) */
 505
 506                 int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
 507
 508                 {//obtains pointer to the ith word
 509                         uint offtmp;
 510                         ith= posSEValue -1;  // !!
 511                         tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 512                         offtmp = bitread (wcsa->wordsData.words, (ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 513                         tmplen -=offtmp;  //the lenght of the ith word.
 514                         src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
 515                 }
 516
 517                 if ( buffBytes < (leng + tmplen+1) ) {
 518                         buffBytes *=2;
 519                         buff = (uchar*) realloc(buff, buffBytes);
 520                         if (!buff) return 1; //out of memory.
 521                         dst = buff + leng;
 522                 }
 523
 524                 if (_Valid[*src]) {
 525                         if (prevValid){
 526                                 *dst++ =' ';
 527                                 leng  += 1;
 528                         }
 529                         prevValid =1;   //for the next iteration
 530                 }
 531                 else prevValid=0;
 532
 533                 indexSE++;
 534
 535                 for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
 536                 leng +=tmplen;
 537         }//while
 538
 539         *text_length =leng;
 540         *dst='\0';
 541         *text = buff;
 542         return 0;
 543 }
 544
 545
 546
 547 /** ***********************************************************************************
 548          CONSTRUCTION OF THE INDEX WCSA
 549     ***********************************************************************************/
 550
 551         /**------------------------------------------------------------------
 552         Compares two slots (alphanumericaly). For qsort of canonical words
 553         ------------------------------------------------------------------ */
 554         int qSortWordsCompareAlpha(const void *arg1, const void *arg2) {
 555                 tposInHT *a1 = (tposInHT *) arg1;
 556                 tposInHT *a2 = (tposInHT *) arg2;
 557                 return strcmp((char*)a1->word, (char *)a2->word);
 558         }
 559
 560 /**
 561   * BUILDS THE WCSA INDEX
 562   */
 563
 564 int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
 565
 566         unsigned long zeroNode;  //number of different canonical words.
 567
 568         t_hash hash;            // the hash table to store both variants and canonical words.
 569         tposInHT *posInHT;      // structure for canonicals and variants+huffmans
 570
 571         uint sourceTextSize;
 572
 573         uint seSize=0;  //it's size == "numberOfValidWords".
 574         uint *SE;       //Integers vector. (represents the rank of the valid words in the source text).
 575
 576         uint totallenWords=0; //The numberOfBytes that occupy canonical words (their ascii version) in memory
 577
 578
 579         ulong bytesFile,bytesFileReal;
 580         long sizeNValue;
 581         unsigned long size_posInHT;
 582         /* used during first pass */
 583
 584         ulong addrInTH;
 585
 586         byte* inputBuffer = text;
 587         bytesFileReal= bytesFile = length;
 588
 589         sourceTextSize=length;
 590
 591         /** Initializes WCSA structure*/
 592         twcsa *wcsa;
 593         wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
 594         zeroNode=0;
 595         /**      */
 596
 597         //Stimation (Using Heap's law) of the number of different "meaningful" words.
 598         //sizeNValue=N_value;
 599         if(bytesFile<5000000) bytesFile = 5000000;
 600         sizeNValue = (unsigned long) floor(3.9* pow(bytesFile,0.70) );
 601
 602         // Inicializes the arrays used to detect if a char is valid or not.
 603         StartValid();
 604         // Inicializes the arrays used translated a char into lowercase.
 605         StartToLow();
 606
 607
 608         // **********************************************************************************
 609         //STARTING THE FIRST PASS.
 610         // **********************************************************************************
 611         printf("\nSTARTING THE FIRST PASS...");
 612
 613         posInHT = (tposInHT *) malloc(sizeof(tposInHT) * sizeNValue);
 614         size_posInHT = sizeNValue;
 615         hash = initialize_hash (sizeNValue); //hash to cointain both the parsed words
 616
 617         //-----------------------------------------------------------------
 618         //1st pass (processing the file)
 619         {
 620             byte *pbeg,*pend,*wordstart,*aWord;
 621             register ulong size;
 622             register uint i;
 623
 624             pbeg = inputBuffer;
 625             pend = inputBuffer+bytesFileReal;
 626
 627             while (pbeg <pend)
 628             {
 629                 if (*pbeg == 0)
 630                 {
 631                     fprintf(stderr, "buildFacade.c: assert failed, *pbeg == 0\n");
 632                     exit(1);
 633                 }
 634
 635                 //parsing either a word or separator.
 636                 size=0;
 637                 wordstart = pbeg;
 638                 if (_Valid[*pbeg]) {   //alphanumerical data
 639                     while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
 640                 }
 641                 else
 642                 {
 643                     if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
 644                         while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
 645                     }
 646                     else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
 647                         pbeg++;
 648                         if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
 649                         else {
 650                             if (_Valid [*pbeg] ) {
 651                                 wordstart = pbeg;   //So skipping 1 blank character
 652                                 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
 653                             }
 654                             else {   // a "separator word" ...
 655                                 size++; //the prev BLANK...
 656                                 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
 657                             }//else {  // a "separator word"
 658                         }//else ... not a unique BLANK AT THE END.
 659                     }//else ... starting by a BLANK...
 660                 }
 661
 662                 if (pbeg < pend && *pbeg == 0)
 663                     pbeg ++;    // Skip the 0-bytes
 664
 665                 if (size == 0)
 666                 {
 667                     fprintf(stderr, "buildFacade.c: assert failed, size == 0\n");
 668                     exit(1);
 669                 }
 670
 671                 //The parsed word/separator is  is "wordstart", and its length is "size"...
 672                 aWord=wordstart;
 673
 674                 //Processement done for each word word
 675                 i = inHashTable(hash,aWord, size, &addrInTH );
 676                 if (!i){
 677                     insertElement (hash,aWord, size, &addrInTH);
 678                     if (zeroNode >= size_posInHT){
 679                       size_posInHT *= 2;
 680                       posInHT = (tposInHT*) realloc(posInHT, size_posInHT * sizeof(tposInHT));
 681                     }
 682                     posInHT[zeroNode].slot=addrInTH;
 683                     posInHT[zeroNode].word=hash->hash[addrInTH].word;
 684                     hash->hash[addrInTH].posInVoc = zeroNode;
 685                     zeroNode++;
 686                     totallenWords += size +1;                   // +1 due to the '\0' char...
 687                    //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size);
 688                 }
 689                 seSize ++;
 690
 691             }//while pbeg<pend
 692
 693             fprintf(stderr,"\n1st pass ends: TOTAL distinct words: %lu totalWords = %u",zeroNode,seSize);
 694
 695         }//1st pass ends
 696
 697
 698         // **********************************************************************************
 699         // END OF 1ST PASS
 700         // **********************************************************************************
 701
 702         // Sorting the words alphanumerically (over posInHT)
 703         {       register unsigned long i,j;
 704                 //sorting canonical words ...
 705                 qsort(posInHT, zeroNode, sizeof(tposInHT), qSortWordsCompareAlpha);
 706
 707                 //setting in hash the new positions of the  words in the hash table
 708                 for (i=0;i<zeroNode;i++) {
 709                         hash->hash[posInHT[i].slot].posInVoc = i;
 710                 }
 711         }
 712
 713         // INITIALIZING structures for the 2nd pass ......................................
 714         {
 715                 SE  = (uint *) malloc ((seSize+1)*sizeof (uint));
 716         }
 717
 718
 719         // **********************************************************************************
 720         //  STARTING THE SECOND PASS.
 721         // **********************************************************************************/
 722
 723         printf("\nSTARTING THE SECOND PASS... ");
 724         //2nd pass (processing the file)
 725         {
 726             byte *pbeg,*pend,*wordstart,*aWord;
 727             register ulong size;
 728             register uint i;
 729             register ulong countValidWords = 0;
 730
 731
 732             pbeg = inputBuffer;
 733             pend = inputBuffer+bytesFileReal;
 734
 735             while (pbeg <pend) {
 736                 if (*pbeg == 0)
 737                 {
 738                     fprintf(stderr, "buildFacade.c 2nd pass: assert failed, *pbeg == 0\n");
 739                     exit(1);
 740                 }
 741
 742                 //parsing either a word or separator.
 743                 size=0;
 744                 wordstart = pbeg;
 745                 if (_Valid[*pbeg]) {   //alphanumerical data
 746                     while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
 747                 }
 748                 else {
 749                     if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
 750                         while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
 751                     }
 752                     else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
 753                         pbeg++;
 754                         if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
 755                         else {
 756                             if (_Valid [*pbeg] ) {
 757                                 wordstart = pbeg;   //So skipping 1 blank character
 758                                 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
 759                             }
 760                             else {   // a "separator word" ...
 761                                 size++; //the prev BLANK...
 762                                 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg]  && *pbeg != 0)) {size++;pbeg++;}
 763                             }//else {  // a "separator word"
 764                         }//else ... not a unique BLANK AT THE END.
 765                     }//else ... starting by a BLANK...
 766                 }
 767
 768                 if (pbeg < pend && *pbeg == 0)
 769                     pbeg ++;    // Skip the 0-bytes
 770
 771                 if (size == 0)
 772                 {
 773                     fprintf(stderr, "buildFacade.c 2nd pass: assert failed, size == 0\n");
 774                     exit(1);
 775                 }
 776
 777                 //The parsed word/separator is  is "wordstart", and its length is "size"...
 778                 aWord=wordstart;
 779
 780                 //Processement done for each word word
 781                 i = inHashTable(hash,aWord, size, &addrInTH );
 782
 783                 SE[countValidWords]=hash->hash[addrInTH].posInVoc+1;  // !!!!
 784                 countValidWords++;
 785
 786             }// while pbeg<pend
 787
 788             SE[countValidWords] = 0;
 789             fprintf(stderr,"\n2nd pass ends: TOTAL distinct words: %lu totalWords = %lu",zeroNode,countValidWords);
 790
 791         }//2nd pass ends
 792
 793         // **********************************************************************************
 794         // END OF 2ND PASS
 795         // **********************************************************************************
 796
 797         //freeing the source text (it is no longer needed).
 798         free(inputBuffer); //the text
 799
 800         /** Now Setting the data of the index **/
 801         wcsa->n = zeroNode;
 802         wcsa->sourceTextSize = sourceTextSize;
 803         wcsa->seSize = seSize;
 804
 805         // Creating the words of the vocabulary...
 806         {
 807         /** copying the words into WCSA. */
 808                 uint *tmpOffsets = (uint *) malloc (sizeof(uint) * (zeroNode  +1) );  //1 extra uint (to point to the virtual "zeroNode+1" ^th word.
 809                 uint tmpOffset =0;
 810
 811                 byte *zoneMem,*src;
 812                 uint i;
 813
 814                 //Moving data from posInHT to WCSA structure
 815                 //wcsa->wordsData = (twords *) malloc(sizeof(twords) * zeroNode);
 816                 wcsa->wordsData.wordsZoneMem.size = totallenWords - zeroNode; //without '\0' bytes (end-tag).
 817                 wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc ( wcsa->wordsData.wordsZoneMem.size * sizeof(byte) );
 818                 zoneMem = wcsa->wordsData.wordsZoneMem.zone;
 819                 for(i = 0; i < zeroNode; i++) {
 820                         src = posInHT[i].word;                           //copying the canonical word
 821                         //wcsa->wordsData.words[i].word = zoneMem;   //setting the pointer
 822                         tmpOffsets[i]=tmpOffset;  //offset in zoneMem
 823                         while (*src) {*zoneMem++ = *src++; tmpOffset++;}  //moving data until '\0'
 824                         //*zoneMem='\0'; zoneMem++;            //copies also the '\0'
 825
 826                 }
 827                 tmpOffsets[zeroNode]=tmpOffset; //setting pointer to the "virtual" word {zeroNode+1}^{th}
 828
 829                 //kbit encoding of the offsets
 830                 uint elemSize = _bits(tmpOffset);
 831                 wcsa->wordsData.elemSize = elemSize;
 832                 wcsa->wordsData.words = (uint *) malloc (((((zeroNode +1)*elemSize)+W-1) /W) * sizeof(uint));  //with 1 extra slot !.
 833                 wcsa->wordsData.words[((((zeroNode +1)*elemSize)+W-1) /W)   -1 ] =0000;
 834                 //              fprintf(stderr,"\n ElemSize = %d, maxOffset = %d",elemSize,tmpOffset);
 835
 836                 tmpOffset=0;
 837                 for (i=0; i<=zeroNode; i++) {  //setting "zeroNode+1" offsets
 838                                 bitwrite(wcsa->wordsData.words, tmpOffset, elemSize, tmpOffsets[i]);
 839                                 tmpOffset+=elemSize;
 840                 }
 841
 842                 //////////// CHECKS IT WORKED. old !!!!
 843         //              { uint kk;
 844         //                      tmpOffset=0;
 845         //                      for (i=0; i<zeroNode; i++) {  //setting "zeroNode+1" offsets
 846         //                                      kk=bitread(wcsa->wordsData.words, i* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 847         //                                      tmpOffset+=elemSize;
 848         //                                      if (kk != tmpOffsets[i]) {fprintf(stderr,"\n @@@@@@@@ DISTINTOS OFFSETS "); break;}
 849         //                                      else fprintf(stderr,"\n iguales, %d, %d  :: <<%s>>len=%d",kk,i, posInHT[i].word, strlen((char*)posInHT[i].word));
 850         //                      }
 851         //              }
 852         //
 853         //              { uint len1, len, tmplen, len2;
 854         //                      uint i,p;
 855         //                      byte *wcsaWord, *src;
 856         //
 857         //                      for (p=0;p<zeroNode;p++) {
 858         //                              {//preparing for strcompL
 859         //                                      len    = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p+1)), wcsa->wordsData.elemSize);
 860         //                                      tmplen = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p))  , wcsa->wordsData.elemSize);
 861         //
 862         //                              //fprintf(stderr,"\n  :: off[%d]= %d  -  off [%d] = %d  ==> %d",p+1,len,p,tmplen,len-tmplen);
 863         //
 864         //                              len2 =len-tmplen;
 865         //                                      wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
 866         //                              }
 867         //
 868         //                              src = posInHT[p].word;
 869         //                              len1 = strlen((char *)src);
 870         //
 871         //                              if (strcompL(src,wcsaWord,len1,len2) != 0) {
 872         //                                      fprintf(stderr,"\n %6d DISTINTOS !! ===len1 %d,len %d===== <<",p,len1,len2);printWord(src,len1);
 873         //                                      fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>");
 874         //                                      exit(0);
 875         //                              }
 876         //                              else {
 877         //                                      fprintf(stderr,"\n %6d ======len1 %d,len2 %d===== <<",p,len1,len2);printWord(src,len1);
 878         //                                      fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>");
 879         //                              }
 880         //                      }
 881         //
 882         //              }
 883         //
 884
 885         /**-----------*/
 886         //frees memory from hash table and posInHT structures.
 887                 free(tmpOffsets);
 888                 destroy_hash(hash);
 889                 free(posInHT);
 890         }
 891
 892         /** ******* creates the self-index on ints (bottom layer) ==> see build_icsa *********/
 893 /**
 894         #ifdef CSA_ON
 895         {
 896                 uint total;
 897                 fprintf(stderr,"\n **** CREATING CSA from Edu's Code *****");
 898                 ticsa *myicsa;
 899                 myicsa = createIntegerCSA(&SE,seSize+1,build_options);
 900                 wcsa->myicsa= myicsa;
 901                 total = CSA_size(myicsa);
 902
 903                 free(SE);  //SE is no longer needed, (it is indexed by the iCSA)
 904                 printf("\n\t**** [iCSA built on %d words. Size = %ld bytes... RAM",seSize,total);
 905         }
 906         #endif
 907 */
 908
 909         //#ifndef CSA_ON
 910                 wcsa->se = SE;
 911                 wcsa->myicsa = NULL;
 912         //#endif
 913
 914         printf("\n\t ** Building done! **\n");
 915         printf("\n Process finished!\n");
 916
 917         *index = wcsa;
 918         return 0;
 919 }
 920
 921
 922 int build_iCSA (char *build_options, void *index)
 923 {
 924         twcsa *wcsa = (twcsa *) index;
 925         /********* creates the self-index on ints (bottom layer) *********/
 926         //creating CSA from Edu's code...
 927         {
 928                 uint total;
 929                 fprintf(stderr,"\n **** CREATING CSA-bottom-layer *****");
 930                 void *bottomIntIndex;
 931                 int err =  buildIntIndex(wcsa->se,wcsa->seSize+1, build_options,(void **)&bottomIntIndex);
 932                 wcsa->myicsa = bottomIntIndex;
 933
 934                 //total = CSA_size(wcsa->myicsa);
 935                 err = sizeIntIndex((void *) wcsa->myicsa, &total);
 936
 937                 printf("\n\t**** [iCSA built on %d words. Size = %u bytes... RAM",wcsa->seSize,total);
 938         }
 939         return 0;
 940 }
 941
 942 /** ********************************************************************
 943   * Loading from disk
 944   **********************************************************************/
 945
 946 /**-----------------------------------------------------------------
 947  * LoadWCSA.
 948  * Loads all the data structures of WCSA (included the icsa)
 949  ----------------------------------------------------------------- */
 950
 951 twcsa *loadWCSA(char *filename) {
 952         twcsa *wcsa;
 953         // Inicializes the arrays used to detect if a char is valid or not.
 954         StartValid();
 955         // Inicializes the arrays used translated a char into lowercase.
 956         StartToLow();
 957
 958         wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
 959         wcsa->n=0;
 960
 961         int err = loadIntIndex(filename, (void **)&wcsa->myicsa);
 962
 963         loadStructs(wcsa,filename);
 964
 965         return wcsa;
 966 }
 967
 968 /** ------------------------------------------------------------------
 969  * LoadStructs.
 970  *      Reads files and loads all the data needed for searcherFacade
 971  ----------------------------------------------------------------- */
 972  void loadStructs(twcsa *wcsa, char *basename) {
 973         uint i,j;
 974         char *filename;
 975         int file;
 976         uint sizeFile;
 977         char c;
 978         uint n;
 979
 980         filename = (char *)malloc(sizeof(char)*(strlen(basename)+10));
 981         fprintf(stderr,"Loading Index from file %s.*\n", basename);
 982
 983         //** SOME CONSTANTS: sourceTextSize
 984         {       strcpy(filename, basename);
 985                 strcat(filename, ".");
 986                 strcat(filename, CONSTANTS_FILE_EXT);
 987
 988                 if( (file = open(filename, O_RDONLY)) < 0) {
 989                         printf("Cannot open file %s\n", filename);
 990                         exit(0);
 991                 }
 992
 993                 read(file, &(wcsa->sourceTextSize), sizeof(uint));
 994                 read(file, &(wcsa->seSize), sizeof(uint));
 995                 close(file);
 996         }
 997
 998         /** File with the words from the vocabulary (sorted alphabetically) */
 999         {       byte *zoneMem;
1000
1001                 strcpy(filename, basename);
1002                 strcat(filename, ".");
1003                 strcat(filename, VOCABULARY_WORDS_FILE_EXT);
1004                 //sizeFile= fileSize(filename)-sizeof(uint);
1005
1006                 if( (file = open(filename, O_RDONLY)) < 0) {
1007                         printf("Cannot open file %s\n", filename);
1008                         exit(0);
1009                 }
1010
1011                 //the number of canonical words
1012                 read(file, &n, sizeof(uint));
1013                 wcsa->n = n;
1014                 read(file, &(wcsa->wordsData.elemSize), (sizeof(uint)));
1015                 read(file, &(wcsa->wordsData.wordsZoneMem.size), (sizeof(uint)));
1016
1017                 //allocating the memory needed for all words and reading them   //(ascii) << no \0 chars are needed>>.
1018                 wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc(wcsa->wordsData.wordsZoneMem.size * sizeof(byte));
1019                 read(file, (wcsa->wordsData.wordsZoneMem.zone),    wcsa->wordsData.wordsZoneMem.size * sizeof(byte) );
1020
1021                 //reading the offsets of the words (kbitArray that points to offsets in zoneMem of words.
1022                 wcsa->wordsData.words = (uint *) malloc (((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * sizeof(uint));
1023                 wcsa->wordsData.words[ ((((n+1)*(wcsa->wordsData.elemSize))+W-1) /W)   -1 ] =0000;
1024                 read(file, (wcsa->wordsData.words),     ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)));
1025
1026
1027                 close(file);
1028         }
1029         wcsa->se= NULL;
1030         free(filename);
1031 }
1032
1033
1034
1035
1036 /** ****************************************************************
1037   * Querying the index WCSA
1038   * ***************************************************************/
1039 ///////////////////////////////////////////////////////////////////////////////////////
1040 //                                         FUNCTIONS NEEDED FOR SEARCHING A PATTERN                                      //
1041 ///////////////////////////////////////////////////////////////////////////////////////
1042
1043
1044
1045 /*------------------------------------------------------------------
1046  * Given a text pattern translates it into a list of integers (corresponding to the
1047  * canonical words associated to the valid words in the text pattern)
1048  ------------------------------------------------------------------*/
1049 void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *integerPattern, uint *sizeIntegers) {
1050
1051         byte *pbeg,*pend,*wordstart,*aWord;
1052         register unsigned long size;
1053         uint index =0;
1054
1055         pbeg = textPattern;
1056         pend = pbeg + patLen;
1057
1058         while (pbeg <pend) {
1059                 //parsing either a word or separator.
1060                 size=0;
1061                 wordstart = pbeg;
1062                 if (_Valid[*pbeg]) {   //alphanumerical data
1063                         while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
1064           }
1065                 else {
1066                         if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
1067                                 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
1068                         }
1069                         else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
1070                                 pbeg++;
1071                                 if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
1072                                 else {
1073                                         if (_Valid [*pbeg] ) {
1074                                                 wordstart = pbeg;   //So skipping 1 blank character
1075                                                 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
1076                                         }
1077                                         else {   // a "separator word" ...
1078                                                 size++; //the prev BLANK...
1079                                                 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
1080                                         }//else {  // a "separator word"
1081                                 }//else ... not a unique BLANK AT THE END.
1082                         }//else ... starting by a BLANK...
1083                 }
1084
1085                 //The parsed word is "aWord", and its length is "size"...
1086                 aWord=wordstart;
1087
1088                 // Binary search on the canonical words (wordsData)
1089                 {
1090                         uint len, tmplen;
1091                         uchar *wcsaWord;
1092                         register uint min,max,p;
1093                         min = 0;
1094                         max = (wcsa->n) - 1;
1095                         while(min < max) {
1096                                 p = (min+max)/2;
1097
1098                                 {//preparing for strcompL
1099                                         len    = bitread (wcsa->wordsData.words, (p+1)* wcsa->wordsData.elemSize , wcsa->wordsData.elemSize);
1100                                         tmplen = bitread (wcsa->wordsData.words, (p )* wcsa->wordsData.elemSize  , wcsa->wordsData.elemSize);
1101                                         len -=tmplen;
1102                                         wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
1103                                 }
1104
1105                                 //if(strncmp((char*)aWord, (char*)wcsa->wordsData[p].word,size) > 0) min = p+1;
1106                                 if(strcompL(aWord, wcsaWord, size, len) > 0) min = p+1;
1107                                 else max = p;
1108
1109
1110                                 //                              { //SHOW PROGRESS
1111                                 //                                      fprintf(stderr,"\n Patron = <<%s>>, curposWord= %d ==><<",aWord,p);
1112                                 //                                      printWord(wcsaWord,len); fprintf(stderr,">> len =%d",len);
1113                                 //                              }
1114
1115                         }
1116
1117                         {//preparing for strcompL
1118                                 len    = bitread (wcsa->wordsData.words, (min+1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1119                                 tmplen = bitread (wcsa->wordsData.words, ( min )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1120                                 len -=tmplen;
1121                                         wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
1122                         }
1123
1124                         //      if(!strncmp((char*)aWord, (char*)wcsa->wordsData[min].word, size)) {
1125                         if(!strcompL(aWord, wcsaWord, size, len)) {
1126                                 integerPattern[index++] = min +1 ;  //<--
1127                         }
1128                         else {*sizeIntegers = 0; return;}  // a valid word that does not appear in the source text.
1129
1130                 }
1131         }// end while
1132         *sizeIntegers = index;
1133
1134         //      //shows the parsed words:
1135         //      {uint i;
1136         //              printf("\n\n >>%s>> HA SIDO PARSEADO COMO:",textPattern);
1137         //              for (i=0; i<index;i++) {
1138         //                              printf("<<%s>>",wcsa->wordsData[integerPattern[i] -1].word);
1139         //              }
1140         //
1141         //      }
1142 }
1143
1144
1145
1146
1147
1148         /** ------------------------------------------------------------------
1149          * Returns the number of occurrences of a given text pattern
1150          *------------------------------------------------------------------ */
1151 int countTextOcurrences(twcsa *wcsa, byte *textPattern) {
1152         ulong left, right;
1153         uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
1154         uint integerPatternSize, min, max;
1155
1156         uint lenpat = strlen((char*)textPattern);
1157         parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize);
1158         if (!integerPatternSize) return -1;
1159
1160 //      #ifdef DEBUG_ON
1161 //              uint i;
1162 //              printf("\n %d Integers to search for:",integerPatternSize );
1163 //              for (i=0;i<integerPatternSize;i++) {
1164 //                      printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word);
1165 //              }
1166 //              printf("\n");
1167 //      #endif
1168
1169         ulong numocc;
1170         int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, &numocc,  &left, &right);
1171         return numocc;
1172
1173 }
1174
1175
1176         /** ------------------------------------------------------------------
1177          * locateTextOcurrences:
1178          * Returns the offsets of the source text where a word/phrase appears
1179          * Returns also the number of occurrences.
1180          *------------------------------------------------------------------ */
1181 uint *locateTextOcurrences(twcsa *wcsa, byte *textPattern, int *numberOccurrences) {
1182         uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
1183         uint integerPatternSize, min, max;
1184
1185         uint lenpat = strlen((char*)textPattern);
1186         parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize);
1187         if (!integerPatternSize) {*numberOccurrences = -1; return NULL;}
1188
1189 //      #ifdef DEBUG_ON
1190 //              uint i;
1191 //              printf("\n %d Integers to search for:",integerPatternSize );
1192 //              for (i=0;i<integerPatternSize;i++) {
1193 //                      printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word);
1194 //              }
1195 //              printf("\n");
1196 //      #endif
1197
1198         ulong occurrences, left, right;
1199         ulong *seOffsets;
1200         ulong *sourceOffsets;
1201
1202         //obtains the indexes in vector SE where the pattern appears.
1203         //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences);
1204         int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences);
1205
1206         //sourceOffsets = (uint *) malloc (sizeof(uint)*occurrences);
1207
1208         sourceOffsets=seOffsets;
1209         //obtains the offsets in the source text of the pattern (sourceOffsets)
1210         locateFacade(wcsa, (uint *)sourceOffsets, (uint *)seOffsets,occurrences);
1211
1212         #ifdef DEBUG_ON
1213                 fprintf(stderr,"\n*** %s appears in the source text in positions:\n\t",textPattern);
1214                 for (i=0;i<occurrences;i++)
1215                         fprintf(stderr,"[%u]",sourceOffsets[i]);
1216                 fflush(stderr);
1217         #endif
1218
1219         *numberOccurrences = occurrences;
1220         return (uint *) sourceOffsets;
1221 }
1222
1223
1224         /** ------------------------------------------------------------------
1225          * displayTextOcurrences:
1226          * Shows in stdout, the text around the occurrences of a word/phrase
1227          * Returns also the number of occurrences.
1228          *------------------------------------------------------------------ */
1229 int displayTextOcurrences(twcsa *wcsa, byte *textPattern, uint radixDisplay) {
1230         return 99;  //not implemented: function not available
1231 }
1232
1233         /** ------------------------------------------------------------------
1234          * Locate Facade:
1235          * For given sePositions, returns the sourceTextPositions
1236          * where the those valid-words in se[sePositions[i]] occurr.
1237          *------------------------------------------------------------------*/
1238 int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint number) {
1239         return 99;  //not implemented: function not available for this index
1240 }
1241
1242
1243 /** ------------------------------------------------------------------
1244         * DISPLAYFACADE:
1245         * Returns the subString from a starting offset to a final offset
1246         * in the source text. It does not allocate any memory, receives "dstptr"
1247         * Precondition: offsetIni >=0;
1248         ------------------------------------------------------------------*/
1249  int displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length, byte *dstptr) {
1250         return 99;  //not implemented: function not available for this index
1251 }
1252
1253
1254         /**------------------------------------------------------------------
1255          * DISPLAYFacadeMalloc:
1256          * Returns the subString from a starting offset to a final offset
1257          * in the source text. It allocates Memory !!
1258          * NOT CURRENTLY USED
1259          ------------------------------------------------------------------*/
1260 byte *displayFacadeMalloc (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length) {
1261         byte *dstptr=NULL;         //not implemented: function not available
1262         return dstptr;
1263 }
1264
1265
1266         /** ------------------------------------------------------------------
1267          * LOCATEALLandDISPLAY:
1268          * Displays the text around an occurrence of the searched word in the source text.
1269          * Assuming that $p$ is that position --> shows only chars in [p_radix-1,p_radix]
1270          ------------------------------------------------------------------*/
1271 int locateAllAndDisplay (twcsa *wcsa, uint *sePositions, uint number, int radix) {
1272 return 99;   //not implemented: function not available for this index
1273
1274 }
1275
1276
1277         /** ------------------------------------------------------------------
1278          * recovers the source text by calling display(0,fileSize);
1279          * ------------------------------------------------------------------ */
1280 void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {
1281
1282         int start;int end;
1283         byte *cc;
1284         char *filename = (char *) malloc (strlen( basename)+ strlen(ext)+1);
1285         ulong length;
1286
1287         strcpy( filename,  basename);
1288         strcat( filename, ext);
1289         filename[strlen( basename)+ strlen(ext)]='\0';
1290         fprintf(stderr,"\n uncompressing text into file -->%s ",filename);fflush(stderr);
1291
1292         FILE *salida;
1293         unlink( filename);
1294         salida = fopen( filename,"w");
1295         start=0; end = sourceTextSize-1;
1296
1297         cc = (byte *) malloc (sourceTextSize* sizeof(uchar));
1298
1299         {
1300                 uint i, j;//, tmplen;
1301                 uint prevValid = 0;
1302                 //uint ptr, maxptr;
1303                 byte *src, *dst;
1304                 uint leng =0;
1305                 uint tmplen =0;
1306
1307                 uint indexSE=0;
1308                 uint posSEValue=0;
1309
1310                 dst=cc;
1311                 while  ( (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
1312
1313                         int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
1314
1315                         {//obtains pointer to the ith word
1316                                 uint offtmp;
1317                                 uint ith = posSEValue -1;  // !!
1318                                 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1319                                 offtmp = bitread (wcsa->wordsData.words, ( ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1320                                 tmplen -=offtmp;  //the lenght of the ith word.
1321                                 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
1322                         }
1323
1324                         if (_Valid[*src]) {
1325                                 if (prevValid){
1326                                          *dst++ =' ';
1327                                         leng +=1;
1328                                 }
1329                                 prevValid =1;   //for the next iteration
1330                         }
1331                         else prevValid=0;
1332
1333                         indexSE++;
1334
1335                         for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
1336                         leng +=tmplen;
1337                 }//while
1338
1339         fprintf(stderr,"\n sourceTextSize = %d, len = %d",sourceTextSize,leng);
1340         fwrite(cc,sizeof(byte),leng,salida);
1341         fclose(salida);
1342
1343         free(cc);
1344         free(filename);
1345 }
1346
1347
1348 }
1349
1350                 //recovers the source text by calling extract Words.
1351 void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {
1352
1353         int start;int end; int error;
1354         char *filename = (char *) malloc (strlen( basename)+ strlen(ext)+1);
1355         byte *cc;
1356         ulong length;
1357
1358         strcpy( filename,  basename);
1359         strcat( filename, ext);
1360         filename[strlen( basename)+ strlen(ext)]='\0';
1361         fprintf(stderr,"\n uncompressing text into file -->%s ",filename);fflush(stderr);
1362
1363         FILE *salida;
1364         unlink( filename);
1365         salida = fopen( filename,"w");
1366         start=0; end = wcsa->seSize;
1367
1368         error = extractWords((void *) wcsa, start, end, &cc, &length);
1369         if (error) {fprintf(stderr,"\n error during recoverSourceText2"); exit(0);}
1370
1371         fprintf(stderr,"\n sourceTextSize = %d, len = %ld",sourceTextSize,length);
1372         fwrite(cc,sizeof(byte),length,salida);
1373         fclose(salida);
1374
1375         free(cc);
1376         free(filename);
1377 }
1378
1379 /** *******************************************************************************
1380   * Showing some statistics and info of the index
1381   * *******************************************************************************/
1382 void printInfoReduced(twcsa *wcsa) {
1383           //not implemented: function not available
1384 }
1385
1386                 /* Shows summary info of the index */
1387 int printInfo(void *index) {
1388         uint n;
1389
1390         twcsa *wcsa = (twcsa *) index;
1391
1392         unsigned long indexSize;
1393         uint intIndexSize, presentationSize;
1394         int err;
1395
1396         err = index_size(index, &indexSize);
1397         if (err!=0) return err;
1398         err = sizeIntIndex(wcsa->myicsa, &intIndexSize);
1399         if (err!=0) return err;
1400
1401         presentationSize = indexSize - intIndexSize;
1402
1403                 printf("\n ===================================================:");
1404                 printf("\n Summary of Presentation layer:");
1405                 printf("\n   Number of valid words (SEsize) = %u",wcsa->seSize);
1406                 printf("\n   Number of different words = %ld",wcsa->n);
1407                 printf("\n   WCSA structure = %lu bytes", sizeof(twcsa));
1408
1409                 uint totalpointers = ((((wcsa->n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));
1410                 uint totalasciizone = wcsa->wordsData.wordsZoneMem.size * sizeof(byte) ;
1411                 uint totalwords = totalasciizone + totalpointers;
1412
1413                 printf("\n   Size Of words structure (%d bytes):",totalwords);
1414                 printf("\n      [     pointers = %d bytes || AsciiZone = %d bytes", totalpointers,      totalasciizone);
1415
1416                 printf("\n\n Total = **  %u bytes (in RAM) **",presentationSize);
1417                 //printf("\n\n @@ Summary of self-index on Integers:");
1418                 err = printInfoIntIndex(wcsa->myicsa, " ");
1419                 if (err!=0) return err;
1420
1421                 printf("\n ===================================================:");
1422                 printf("\n");
1423                 return 0;
1424         }
1425
1426 /**------------------------------------------------------------------
1427  * structsSize.
1428  *      Counts the memory amount needed by the Facade (Presentation Layer).
1429  * skipping the stop_words hash table
1430  ----------------------------------------------------------------- */
1431 uint structsSizeMem(twcsa *wcsa) {
1432         return 0;   //not implemented: function not available for this index.
1433 }
1434
1435
1436 /** for debugging **/
1437 void printWord(uchar *str, uint len) {
1438                 uint i;
1439                 for (i=0;i<len;i++)
1440                         fprintf(stderr,"%c",str[i]);
1441 }
1442
1443
1444         /** saves the content of the file SE (ids of the source words) **/
1445 int saveSEfile (char *basename, uint *v, uint n) {
1446         char outfilename[255];
1447         int file;
1448         sprintf(outfilename,"%s.%s",basename,SE_FILE_EXT);
1449         unlink(outfilename);
1450         if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
1451                 printf("Cannot open file %s\n", outfilename);
1452                 exit(0);
1453         }
1454
1455         write(file, v, sizeof(uint) * n );
1456         close(file);
1457 }
1458
1459
1460
1461 double getTime2 (void)
1462 {
1463         double usertime, systime;
1464         struct rusage usage;
1465
1466         getrusage (RUSAGE_SELF, &usage);
1467
1468         usertime = (double) usage.ru_utime.tv_sec +
1469                 (double) usage.ru_utime.tv_usec / 1000000.0;
1470         systime = (double) usage.ru_stime.tv_sec +
1471                 (double) usage.ru_stime.tv_usec / 1000000.0;
1472
1473         return (usertime + systime);
1474 }
1475
1476
1477
1478 /**------------------------------------------------------------------
1479   *  MAIN PROGRAM.
1480   *------------------------------------------------------------------ */
1481 #ifdef FACADEWITHMAIN
1482         int main(int argc, char* argv[])
1483         {
1484
1485
1486
1487                 char *infile, *outbasename, *stopwordsfile;     // Name of in/out files
1488                 byte *inputBuffer;
1489                 ulong finsize;
1490
1491                 int f_in;
1492                 void *Index;
1493
1494
1495                 printf("\n*Word-based iCSA: A word-based CSA");
1496                 printf("\n*CopyRight (c) 2008 [LBD & G.N.]\n\n");
1497
1498                 // Reads input parameters from command line.
1499                 if(argc < 3) {
1500                         printf("Use: %s <in file> <out basename> \n", argv[0]);
1501                         exit(0);
1502                 }
1503
1504                 // Reads params (input file, output basename, and stopwords file)
1505                 infile = argv[1];
1506                 outbasename = argv[2];
1507                 stopwordsfile = argv[3];
1508
1509                 finsize= fileSize(infile);
1510
1511                 if (! finsize) {
1512                         printf( "\nFILE EMPTY OR FILE NOT FOUND %s !!\nSkipping processement ...\n",infile);
1513                         exit(0);
1514                 }
1515
1516                 // Opening the input text file.
1517                 if( (f_in = open(infile, O_RDONLY)) < 0) {
1518                         printf("Cannot read file %s\n", infile);
1519                         exit(0);
1520                 }
1521                 inputBuffer = (byte *) malloc(finsize *sizeof(byte));// +1);
1522                 read (f_in,inputBuffer,finsize);
1523                 close (f_in);
1524
1525
1526         {
1527                 //printf("\n parametros <<%s>>\n\n",stopwordsfile);
1528                 build_index (inputBuffer, finsize, stopwordsfile, &Index);  /** building the index */
1529
1530 //              /** recovering the source text from the index */
1531                         {
1532                                 double start, end;
1533                                 start = getTime2();
1534                                 ulong size;
1535                                 get_length(Index, &size);
1536                                 char extension[10]= ".source";
1537
1538                                 //recoverSourceText1((twcsa*) Index, outbasename,extension, size);
1539                                 strcat(extension,"2");
1540                                 recoverSourceText2((twcsa*) Index, outbasename,extension,size);
1541                                 end = getTime2();
1542                                 fprintf(stderr, "\nRecovering source file time: %.3f secs\n", end-start );
1543                         }
1544 //
1545                 // DISPLAYING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
1546                         {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1547                          int error = 0;
1548                         ulong numocc,numc, length, i, *snippet_len, tot_numcharext = 0, numpatt;
1549                         uchar *pattern, *snippet_text;
1550
1551                                  pattern = textPattern;
1552                          printf("\nSEARCH TEST for DISPLAY (pizzachili interface)\n");
1553                                 while(1) {
1554                                         printf("Intro string: ");
1555                                         fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1556                                         if (!strcmp((char*)textPattern,"\n") ) break;
1557                                          textPattern[strlen((char*)textPattern)-1] = '\0';
1558
1559                                         length = strlen( (char*)textPattern);
1560                                         numc=50;
1561
1562 //                                      error = display (Index, textPattern, length, numc, &numocc,
1563 //                                                               &snippet_text, &snippet_len);
1564                                         error = displayWords (Index, textPattern, length, numc, &numocc,
1565                                                                  &snippet_text, &snippet_len,1);
1566
1567                                         if (error){ fprintf(stderr, "%s\n", "Hubo un error durante display");exit(0);}
1568
1569                                                 fprintf(stderr,"\n acabou display");fflush(stderr);
1570                                         {//show the results
1571                                                 ulong j, len = length + 2*numc;
1572                                             char blank = '\0';
1573                                                 fprintf(stderr,"\n length = %d",length);
1574                                                 fprintf(stderr,"\n pattern = %s",pattern);fflush(stderr);
1575                                                 fprintf(stderr,"\n numocc = %d",numocc);fflush(stderr);
1576                                                 fprintf(stderr,"\n snippet len = %d",len);fflush(stderr);
1577                                                 fprintf(stderr,"\n =========");fflush(stderr);
1578                                                 for (i = 0; i < numocc; i++){
1579                                                         fprintf(stderr,"\n[%2d][len=%3d]<<",i+1,snippet_len[i]);fflush(stderr);
1580                                                         fwrite(snippet_text+len*i,sizeof(uchar),snippet_len[i],stderr);fflush(stderr);
1581                                                         fprintf(stderr,">>");fflush(stderr);
1582                                                 }
1583                                         }
1584                                         numpatt--;
1585
1586                                         for(i=0; i<numocc; i++) {
1587                                                 tot_numcharext += snippet_len[i];
1588                                         }
1589
1590                                         if (numocc) {
1591                                                 free (snippet_len);
1592                                                 free (snippet_text);
1593                                         }
1594
1595                                         printf("Ocurrences = %d\n", numocc);
1596                                         if (!strcmp((char*)textPattern,"\n") ) break;
1597                                 }
1598                         }
1599
1600 //
1601 //
1602 //      // SEARCHING FOR A TEXT PATTERN (word/phrase).
1603 //      {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1604 //       int occ;
1605 //       int len;
1606 //       uint *occs;
1607 //       int i;
1608 //       printf("\nSEARCH TEST for LOCATE\n");
1609 //              while(1) {
1610 //                      printf("Intro string: ");
1611 //                      fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1612 //                      len = strlen((char*)textPattern);
1613 //                      if (!strcmp((char*)textPattern,"\n") ) break;
1614 //                      textPattern[len-1] = '\0';
1615 //                      len --;
1616 //
1617 //                      //occs = locateTextOcurrences(wcsa,textPattern,&occ);
1618 //                      // locate(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ);
1619 //                        locateWord(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ, 0);
1620 //
1621 //                      printf("\n*** %s occurs %d times: In the source text in positions:\n\t",textPattern,occ);
1622 //                      for (i=0;i<occ;i++)
1623 //                              printf("[%u]",occs[i]);
1624 //                      fflush(stderr);
1625 //                      free(occs);
1626 //
1627 //                      if (!strcmp((char*)textPattern,"\n") ) break;
1628 //              }
1629 //      }
1630 //
1631 //
1632
1633                 // COUNTING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
1634                 /*
1635                 {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1636                  int occ;
1637                  int len;
1638                  printf("\nSEARCH TEST for COUNT.\n");
1639                         while(1) {
1640                                 printf("Intro string: ");
1641                                 fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1642                                 len = strlen((char*)textPattern);
1643                                 if (!strcmp((char*)textPattern,"\n") ) break;
1644                                 textPattern[len-1] = '\0';
1645                                 len --;
1646
1647                                 count(Index, textPattern, len, (ulong *)&occ);
1648                                 //occ = countTextOcurrences(wcsa,textPattern);
1649                                 printf("Ocurrences = %d\n", occ);
1650                         }
1651                 }
1652                 printf("\n END COUNTING OCCURRENCES OF PATTERNS. ...\n");
1653                 //exit(0);
1654                 */
1655
1656                 /** saving the index to disk*/
1657                 save_index (Index, outbasename);
1658
1659                 /** tells the mem used by the index */
1660                 ulong indexsize;
1661                 index_size(Index, &indexsize);
1662                 fprintf(stderr,"Index occupied %d bytes, 2 extra mallocs = %d",indexsize,2* sizeof(uint));
1663
1664                 /** freeing the index */
1665                 free_index(Index);
1666
1667         }
1668 }
1669
1670 #endif
1671
1672
1673