swcsa/buildFacade.c

   1 #include "buildFacade.h"
   2 #include "utils/errors.c"
   3
   4
   5 /** Building the index */
   6
   7     /* Creates index from text[0..length-1]. Note that the index is an
   8       opaque data type. Any build option must be passed in string
   9       build_options, whose syntax depends on the index. The index must
  10       always work with some default parameters if build_options is NULL.
  11       The returned index is ready to be queried. */
  12 int build_index (uchar *text, ulong length, char *build_options, void **index) {
  13         int returnvalue;
  14
  15      printf("\n parameters: \"%s\"\n",build_options); fflush(stderr);
  16
  17     returnvalue = build_WCSA (text, length, build_options, index);
  18
  19     if (!returnvalue)
  20         returnvalue = build_iCSA (build_options,*index);
  21
  22     return returnvalue;
  23 }
  24
  25
  26 /**  Saves index on disk by using single or multiple files, having
  27         proper extensions. */
  28 int save_index (void *index, char *filename) {
  29
  30         char *basename = filename;
  31         twcsa *wcsa=(twcsa *) index;
  32
  33         uint i,j;
  34         char *outfilename;
  35         int file;
  36         char c;
  37
  38         printf("\n Saving structures to disk: %s.*",filename);
  39         outfilename = (char *)malloc(sizeof(char)*(strlen(basename)+10));
  40
  41         /**File with some constants (bSize and tohSize); */
  42         { uint number;
  43                 strcpy(outfilename, basename);
  44                 strcat(outfilename, ".");
  45                 strcat(outfilename, CONSTANTS_FILE_EXT);
  46                 unlink(outfilename);
  47                 if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
  48                         printf("Cannot open file %s\n", outfilename);
  49                         exit(0);
  50                 }
  51                 write(file, &(wcsa->sourceTextSize), sizeof(uint));
  52                 write(file, &(wcsa->seSize), sizeof(uint));
  53                 close(file);
  54         }
  55
  56         /** The Words in the vocabulary of words  (sorted alphabetically)*/
  57         {       strcpy(outfilename, basename);
  58                 strcat(outfilename, ".");
  59                 strcat(outfilename, VOCABULARY_WORDS_FILE_EXT);
  60                 unlink(outfilename);
  61                 if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
  62                         printf("Cannot open file %s\n", outfilename);
  63                         exit(0);
  64                 }
  65
  66                 uint n = wcsa->n;
  67                 uint elemSize = wcsa->wordsData.elemSize;
  68                 write(file, &n, sizeof(uint));
  69                 write(file, &elemSize, sizeof(uint));
  70                 write(file, &(wcsa->wordsData.wordsZoneMem.size), sizeof(uint));
  71
  72                 //the number of canonical words
  73                 write(file, (char *)wcsa->wordsData.wordsZoneMem.zone, wcsa->wordsData.wordsZoneMem.size * sizeof(byte));
  74                 write(file, (char *)wcsa->wordsData.words, ((((n+1)* (elemSize))+W-1) /W) * (sizeof(uint)) );
  75
  76                 close(file);
  77         }
  78
  79         free(outfilename);
  80
  81         if (wcsa->myicsa) {
  82                 /******** saves index on integers (bottom) ******/
  83                 //Storing the CSA
  84                 //storeStructsCSA(wcsa->myicsa,basename);
  85                 saveIntIndex((void *) wcsa->myicsa, basename);
  86         }
  87
  88         if (wcsa->se) {
  89                 saveSEfile(basename,wcsa->se, wcsa->seSize+1);
  90                 //free(wcsa->se);
  91         }
  92
  93         return 0;
  94 }
  95
  96
  97
  98     /**  Loads index from one or more file(s) named filename, possibly
  99       adding the proper extensions. */
 100 int load_index(char *filename, void **index){
 101         twcsa *wcsa;
 102         wcsa = loadWCSA (filename);
 103         (*index) = (void *) wcsa;
 104         return 0;
 105 }
 106
 107         /** Frees the memory occupied by index. */
 108 int free_index(void *index){
 109         twcsa *wcsa=(twcsa *) index;
 110         ulong size;
 111         index_size(index,&size);
 112         printf("\n[destroying index] ...Freed %lu bytes... RAM", size);
 113
 114
 115         //frees the array SE.
 116         if (wcsa->se)
 117                 free (wcsa->se);
 118
 119         //the iCSA.
 120         if (wcsa->myicsa) {
 121                 //destroyStructsCSA(wcsa->myicsa);
 122                 int err = freeIntIndex((void *) wcsa->myicsa);
 123         }
 124
 125         //the words.
 126         free (wcsa->wordsData.wordsZoneMem.zone);
 127         free (wcsa->wordsData.words); /** huge!! */
 128
 129         //the pointer to wcsa.
 130         free(wcsa);
 131         return 0;
 132 }
 133
 134         /** Gives the memory occupied by index in bytes. */
 135 int index_size(void *index, ulong *size) {
 136         ulong totaltmp;
 137         twcsa *wcsa=(twcsa *)index;
 138         uint n= wcsa->n;
 139         *size=0;
 140         *size += sizeof(twcsa);
 141
 142         totaltmp=0;  //words
 143         totaltmp += ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));  //the pointers
 144         totaltmp += wcsa->wordsData.wordsZoneMem.size * sizeof(byte); //the characters of the words.
 145         *size += totaltmp;
 146
 147         if (wcsa->myicsa) {
 148                 uint nbytes;
 149                 int err = sizeIntIndex((void *) wcsa->myicsa, &nbytes);
 150                 *size += nbytes;
 151                 //*size += CSA_size(wcsa->myicsa);
 152         }
 153
 154         return 0;
 155 }
 156
 157
 158 /** Querying the index =============================================================*/
 159
 160         /* Writes in numocc the number of occurrences of the substring
 161            pattern[0..length-1] found in the text indexed by index. */
 162 int count (void *index, uchar *pattern, ulong length, ulong *numocc){
 163         uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
 164         uint integerPatternSize;
 165         ulong l,r;
 166
 167         twcsa *wcsa=(twcsa *) index;
 168         parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);
 169         if (!integerPatternSize) {*numocc=0; return 0;} //not found
 170
 171         //*numocc = countCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &l, &r);
 172         int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, numocc,  &l, &r);
 173         return 0;
 174 }
 175
 176         /* Writes in numocc the number of occurrences of the substring
 177           pattern[0..length-1] in the text indexed by index. It also allocates
 178           occ (which must be freed by the caller) and writes the locations of
 179           the numocc occurrences in occ, in arbitrary order.  */
 180 int locate(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc){
 181         return 99;
 182 }
 183
 184    /* Gives the length of the text indexed */
 185 int get_length(void *index, ulong *length) {
 186         twcsa *wcsa=(twcsa *) index;
 187         *length = wcsa->sourceTextSize;
 188         return 0;
 189 }
 190
 191     /**  Obtains the length of the text indexed by index. */
 192
 193 int length (void *index, ulong *length) {
 194         return (get_length(index,length));
 195 }
 196
 197
 198 /** ***********************************************************************************
 199   * Accessing the indexed text
 200   * ***********************************************************************************/
 201
 202
 203         /**  Allocates snippet (which must be freed by the caller) and writes
 204           the substring text[from..to] into it. Returns in snippet_length the
 205           length of the text snippet actually extracted (that could be less
 206           than to-from+1 if to is larger than the text size).                      */
 207 int extract (void *index, ulong from, ulong to, uchar **snippet, ulong *snippet_length) {
 208     twcsa *wcsa=(twcsa *) index;
 209         return 99;
 210 }
 211
 212   /** Displays the text (snippet) surrounding any occurrence of the
 213     substring pattern[0..length-1] within the text indexed by index.
 214     The snippet must include numc characters before and after the
 215     pattern occurrence, totalizing length+2*numc characters, or less if
 216     the text boundaries are reached. Writes in numocc the number of
 217     occurrences, and allocates the arrays snippet_text and
 218     snippet_lengths (which must be freed by the caller). The first is a
 219     character array of numocc*(length+2*numc) characters, with a new
 220     snippet starting at every multiple of length+2*numc. The second
 221     gives the real length of each of the numocc snippets. */
 222
 223 int display (void *index, uchar *pattern, ulong length, ulong numc,
 224         ulong *numocc, uchar **snippet_text, ulong **snippet_lengths) {
 225         return 99;
 226 }
 227
 228
 229
 230 /** ***********************************************************************************
 231   * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord
 232   * ***********************************************************************************/
 233         /* Writes in numocc the number of occurrences of the substring
 234           pattern[0..length-1] in the text indexed by index. It also allocates
 235           occ (which must be freed by the caller) and writes the locations of
 236           the numocc occurrences in occ, in arbitrary order. These occurrences
 237           refer to the offsets in TOH where the caller could start a display
 238           operation. So locateWord implies synchronization using B.
 239           Moreover, positions occ[numocc.. 2*numocc-1] is set with the rank in SE of the
 240           words whose codes begin in TOH in the positions in occ[0... numocc-1]
 241           ** Parameter kbefore sets locateWord not to obtain the offset in TOH of the
 242              searched word, but the offset in TOH of k-before words before.
 243         */
 244
 245 int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore){
 246         uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
 247         uint integerPatternSize;
 248         ulong occurrences,l,r;
 249         twcsa *wcsa=(twcsa *) index;
 250
 251         parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);
 252         if (!integerPatternSize) {*numocc=0; return 0;} //not found
 253
 254         ulong *seOffsets;
 255
 256         //obtains the indexes in vector SE where the pattern appears.
 257         //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences);
 258         int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences);
 259
 260         *numocc = occurrences;
 261
 262         if (!occurrences) {(*occ)=NULL;return 0;}
 263
 264         (*occ) = (ulong *)seOffsets;
 265         return 0;
 266 }
 267
 268
 269   /** Displays the text (snippet) surrounding any occurrence of the
 270     substring pattern[0..length-1] within the text indexed by index.
 271     The snippet must include numc characters before and after the
 272     pattern occurrence, totalizing length+2*numc characters, or less if
 273     the text boundaries are reached. Writes in numocc the number of
 274     occurrences, and allocates the arrays snippet_text and
 275     snippet_lengths (which must be freed by the caller). The first is a
 276     character array of numocc*(length+2*numc) characters, with a new
 277     snippet starting at every multiple of length+2*numc. The second
 278     gives the real length of each of the numocc snippets. */
 279
 280  int displayWords (void *index, uchar *pattern, ulong length, ulong numc,
 281          ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore) {
 282
 283     /** actually extracts upto length + 2*numc chars, starting extraction kbefore
 284      *  words before the occurrence **/
 285
 286         ulong *indexesInSE;
 287         ulong occurrences;
 288         uint bytesPerSnippet;
 289         byte *text_aux;
 290         twcsa *wcsa=(twcsa *) index;
 291
 292         locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);
 293         (*numocc) = occurrences;
 294
 295         if (!occurrences) {
 296                 *snippet_text =NULL;
 297                 *snippet_lengths =NULL;
 298                 return 0;
 299         }
 300
 301         bytesPerSnippet = length+2*numc;
 302 //      bytesPerSnippet = 2*numc;
 303         *snippet_lengths = (ulong *) malloc((*numocc)*sizeof(ulong));
 304         if (!(*snippet_lengths)) return 1;
 305         *snippet_text = (uchar *) malloc((*numocc)*(bytesPerSnippet)*sizeof(uchar) +1) ;  //(the last "1" is for '\0');
 306         if (!(*snippet_text)) return 1;
 307
 308  //     fprintf(stderr,"\n occs found = %7d for pattern %s",*numocc, pattern);
 309  //     fflush(stderr);
 310
 311         text_aux=*snippet_text;
 312         {
 313                 uint i, j, tmplen;
 314                 uint ptr, maxptr;
 315                 byte *src, *dst;
 316                 uint snippetLen;
 317                 uint posSEValue,indexSE;
 318
 319                 for (i=0;i<occurrences;i++) {
 320                                 uint prevValid=0;
 321                                 uint endSnippet =0;
 322
 323                                 /** decodes words from there */
 324                                 snippetLen=0;
 325                                 indexSE = indexesInSE[i];
 326                                 indexSE = (indexSE > kbefore) ? indexSE-kbefore : 0;
 327
 328                                 dst = text_aux;
 329                                 while ((!endSnippet) && (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
 330
 331                                         //posSEValue =displayCSA(wcsa->myicsa,indexSE);
 332                                         int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
 333
 334                                         {//obtains pointer to the ith word
 335                                                 uint offtmp;
 336                                                 uint ith = posSEValue -1;  // !!
 337                                                 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 338                                                 offtmp = bitread (wcsa->wordsData.words, ( ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 339                                                 tmplen -=offtmp;  //the lenght of the ith word.
 340
 341                                                 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
 342                                         }
 343
 344                                         if (_Valid[*src]) {
 345                                                 if (prevValid){
 346                                                          *dst++ =' ';
 347                                                          snippetLen++;
 348                                                          if  (snippetLen==bytesPerSnippet) break;  //end of snippet (ends in BLANK_SPACE)
 349                                                 }
 350                                                 prevValid =1;   //for the next iteration
 351                                         }
 352                                         else prevValid=0;
 353
 354                                         indexSE++;
 355
 356                                         /* at the end ?? */
 357                                         if  ((tmplen+snippetLen)>=bytesPerSnippet) {
 358                                                 tmplen =(bytesPerSnippet - snippetLen);
 359                                                 endSnippet=1; //so while loop ends;
 360                                         }
 361
 362                                         for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
 363                                         snippetLen +=tmplen;
 364                                 }//while
 365
 366                                 text_aux += bytesPerSnippet;
 367                                 (*snippet_lengths)[i] = snippetLen;
 368                         }       //for
 369
 370                         if (occurrences) free(indexesInSE);
 371                 }
 372                 return 0;
 373 }
 374
 375 /** simulates extration of text process, but do not actually returns anything at all
 376    Extracts upto <=2K words from K=wordsbefore words before each occurrence of a pattern.
 377    Less than 2K words can be extracted if more than numc characters have been already obtained.
 378    Does nothing else... does not return the text */
 379
 380 int  displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc) {
 381
 382         ulong *indexesInSE;
 383         ulong occurrences;
 384         byte *text_aux;
 385
 386         twcsa *wcsa=(twcsa *) index;
 387
 388         locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);
 389
 390         if (!occurrences) {
 391                 return 0;
 392         }
 393
 394         ulong maxsnippetLen  = maxnumc;
 395         ulong extractedbytes = 0;
 396
 397         text_aux = (byte *) malloc (maxsnippetLen+1);
 398
 399         {
 400                 uint i, j, tmplen;
 401                 uint ptr, maxptr;
 402                 byte *src, *dst;
 403                 uint snippetLen;
 404                 uint posSEValue,indexSE;
 405
 406                 uint numWordsToExtract = 2 * wordsbefore;
 407                 uint z;
 408                 //printf("\n occurrences... = %lu",occurrences);
 409
 410                 for (i=0;i<occurrences;i++) {
 411                                 uint prevValid=0;
 412                                 uint endSnippet =0;
 413
 414                                 /** decodes words from there */
 415                                 snippetLen=0;
 416                                 indexSE = indexesInSE[i];
 417                                 indexSE = (indexSE > wordsbefore) ? indexSE-wordsbefore : 0;
 418
 419                                 dst = text_aux;
 420                                 z=0;
 421                                 while ((z<numWordsToExtract) && (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
 422
 423                                         //posSEValue =displayCSA(wcsa->myicsa,indexSE);
 424                                         int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
 425
 426                                         {//obtains pointer to the ith word
 427                                                 uint offtmp;
 428                                                 uint ith = posSEValue -1;  // !!
 429                                                 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 430                                                 offtmp = bitread (wcsa->wordsData.words, ( ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 431                                                 tmplen -=offtmp;  //the lenght of the ith word.
 432
 433                                                 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
 434                                         }
 435
 436                                         if (_Valid[*src]) {
 437                                                         if (prevValid){
 438                                                                  *dst++ =' ';
 439                                                                  snippetLen++;
 440                                                                  if  (snippetLen==maxsnippetLen) break;  //end of snippet (ends in BLANK_SPACE)
 441                                                                 }
 442                                                         prevValid =1;   //for the next iteration
 443                                         }
 444                                         else prevValid=0;
 445
 446                                         indexSE++;
 447
 448                                         /* at the end ?? */
 449                                         if  ((tmplen+snippetLen)>=maxsnippetLen) {
 450                                                         break;
 451                                         }
 452
 453                                         //fprintf(stderr,"\ntmplen = %d ",tmplen); fflush(stderr);
 454                                         for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
 455                                         snippetLen +=tmplen;
 456                                         z++;
 457                                 }//while
 458
 459                                 extractedbytes += snippetLen;
 460
 461                         }       //for
 462
 463                         if (occurrences) free(indexesInSE);
 464                 }
 465                 if (text_aux) free (text_aux);
 466                 return extractedbytes;
 467 }
 468
 469
 470
 471 /**  Allocates text (which must be freed by the caller) and recovers the
 472   the substring of text starting from the "fromword"-th word up to the
 473   "toWord"-th words. Returns in text the text, and in "text_lenght" the
 474   length of the text  actually extracted. Text is allocated.
 475   Actually extracts SE[fromWord .. toWord) ... not the last word.    */
 476
 477 int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text,
 478         ulong *text_length){
 479
 480         twcsa *wcsa=(twcsa *) index;
 481     uint initTextLen=10000;
 482     uint avgWordLen =7;
 483
 484         uint i, j;//, tmplen;
 485         uint prevValid = 0;
 486         byte *src, *dst, *buff;
 487         uint tmplen =0;
 488
 489         uint buffBytes = 1000;
 490         uint leng=0; //curr pos in buffer that was occupied.
 491
 492         if (toWord > wcsa->seSize) toWord = wcsa->seSize;
 493         if (fromWord >= wcsa->seSize) fromWord = wcsa->seSize-1;
 494         if (buffBytes < ( (toWord-fromWord)* avgWordLen)) buffBytes = ((toWord-fromWord)* avgWordLen);
 495
 496         buff = (uchar *) malloc (buffBytes * sizeof(char));
 497         if (!buff) return 1; //out of memory.
 498         dst = buff;
 499
 500         register uint indexSE=fromWord;
 501         uint posSEValue=0;
 502         register uint ith;
 503
 504         while  ( (indexSE < toWord) ){ /** extracting words (if not at the end) */
 505
 506                 int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
 507
 508                 {//obtains pointer to the ith word
 509                         uint offtmp;
 510                         ith= posSEValue -1;  // !!
 511                         tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 512                         offtmp = bitread (wcsa->wordsData.words, (ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 513                         tmplen -=offtmp;  //the lenght of the ith word.
 514                         src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
 515                 }
 516
 517                 if ( buffBytes < (leng + tmplen+1) ) {
 518                         buffBytes *=2;
 519                         buff = (uchar*) realloc(buff, buffBytes);
 520                         if (!buff) return 1; //out of memory.
 521                         dst = buff + leng;
 522                 }
 523
 524                 if (_Valid[*src]) {
 525                         if (prevValid){
 526                                 *dst++ =' ';
 527                                 leng  += 1;
 528                         }
 529                         prevValid =1;   //for the next iteration
 530                 }
 531                 else prevValid=0;
 532
 533                 indexSE++;
 534
 535                 for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
 536                 leng +=tmplen;
 537         }//while
 538
 539         *text_length =leng;
 540         *dst='\0';
 541         *text = buff;
 542         return 0;
 543 }
 544
 545
 546
 547 /** ***********************************************************************************
 548          CONSTRUCTION OF THE INDEX WCSA
 549     ***********************************************************************************/
 550
 551         /**------------------------------------------------------------------
 552         Compares two slots (alphanumericaly). For qsort of canonical words
 553         ------------------------------------------------------------------ */
 554         int qSortWordsCompareAlpha(const void *arg1, const void *arg2) {
 555                 tposInHT *a1 = (tposInHT *) arg1;
 556                 tposInHT *a2 = (tposInHT *) arg2;
 557                 return strcmp((char*)a1->word, (char *)a2->word);
 558         }
 559
 560 /**
 561   * BUILDS THE WCSA INDEX
 562   */
 563
 564 int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
 565
 566         unsigned long zeroNode;  //number of different canonical words.
 567
 568         t_hash hash;            // the hash table to store both variants and canonical words.
 569         tposInHT *posInHT;      // structure for canonicals and variants+huffmans
 570
 571         uint sourceTextSize;
 572
 573         uint seSize=0;  //it's size == "numberOfValidWords".
 574         uint *SE;       //Integers vector. (represents the rank of the valid words in the source text).
 575
 576         uint totallenWords=0; //The numberOfBytes that occupy canonical words (their ascii version) in memory
 577
 578
 579         ulong bytesFile,bytesFileReal;
 580         long sizeNValue;
 581
 582         /* used during first pass */
 583
 584         ulong addrInTH;
 585
 586         byte* inputBuffer = text;
 587         bytesFileReal= bytesFile = length;
 588
 589         sourceTextSize=length;
 590
 591         /** Initializes WCSA structure*/
 592         twcsa *wcsa;
 593         wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
 594         zeroNode=0;
 595         /**      */
 596
 597         //Stimation (Using Heap's law) of the number of different "meaningful" words.
 598         //sizeNValue=N_value;
 599         if(bytesFile<5000000) bytesFile = 5000000;
 600         sizeNValue = (unsigned long) floor(3.9* pow(bytesFile,0.60) );
 601
 602
 603         // Inicializes the arrays used to detect if a char is valid or not.
 604         StartValid();
 605         // Inicializes the arrays used translated a char into lowercase.
 606         StartToLow();
 607
 608
 609         // **********************************************************************************
 610         //STARTING THE FIRST PASS.
 611         // **********************************************************************************
 612         printf("\nSTARTING THE FIRST PASS...");
 613
 614         posInHT = (tposInHT *) malloc(sizeof(tposInHT) * sizeNValue);
 615         hash = initialize_hash (sizeNValue); //hash to cointain both the parsed words
 616
 617         //-----------------------------------------------------------------
 618         //1st pass (processing the file)
 619         {
 620             byte *pbeg,*pend,*wordstart,*aWord;
 621             register ulong size;
 622             register uint i;
 623
 624             pbeg = inputBuffer;
 625             pend = inputBuffer+bytesFileReal;
 626
 627             while (pbeg <pend)
 628             {
 629                 if (*pbeg == 0)
 630                 {
 631                     fprintf(stderr, "buildFacade.c: assert failed, *pbeg == 0\n");
 632                     exit(1);
 633                 }
 634
 635                 //parsing either a word or separator.
 636                 size=0;
 637                 wordstart = pbeg;
 638                 if (_Valid[*pbeg]) {   //alphanumerical data
 639                     while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
 640                 }
 641                 else
 642                 {
 643                     if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
 644                         while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
 645                     }
 646                     else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
 647                         pbeg++;
 648                         if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
 649                         else {
 650                             if (_Valid [*pbeg] ) {
 651                                 wordstart = pbeg;   //So skipping 1 blank character
 652                                 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
 653                             }
 654                             else {   // a "separator word" ...
 655                                 size++; //the prev BLANK...
 656                                 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
 657                             }//else {  // a "separator word"
 658                         }//else ... not a unique BLANK AT THE END.
 659                     }//else ... starting by a BLANK...
 660                 }
 661
 662                 if (pbeg < pend && *pbeg == 0)
 663                     pbeg ++;    // Skip the 0-bytes
 664
 665                 if (size == 0)
 666                 {
 667                     fprintf(stderr, "buildFacade.c: assert failed, size == 0\n");
 668                     exit(1);
 669                 }
 670
 671                 //The parsed word/separator is  is "wordstart", and its length is "size"...
 672                 aWord=wordstart;
 673
 674                 //Processement done for each word word
 675                 i = inHashTable(hash,aWord, size, &addrInTH );
 676                 if (!i){
 677                     insertElement (hash,aWord, size, &addrInTH);
 678                     posInHT[zeroNode].slot=addrInTH;
 679                     posInHT[zeroNode].word=hash->hash[addrInTH].word;
 680                     hash->hash[addrInTH].posInVoc = zeroNode;
 681                     zeroNode++;
 682                     totallenWords += size +1;                   // +1 due to the '\0' char...
 683                    //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size);
 684                 }
 685                 seSize ++;
 686
 687             }//while pbeg<pend
 688
 689             fprintf(stderr,"\n1st pass ends: TOTAL distinct words: %lu totalWords = %u",zeroNode,seSize);
 690
 691         }//1st pass ends
 692
 693
 694         // **********************************************************************************
 695         // END OF 1ST PASS
 696         // **********************************************************************************
 697
 698         // Sorting the words alphanumerically (over posInHT)
 699         {       register unsigned long i,j;
 700                 //sorting canonical words ...
 701                 qsort(posInHT, zeroNode, sizeof(tposInHT), qSortWordsCompareAlpha);
 702
 703                 //setting in hash the new positions of the  words in the hash table
 704                 for (i=0;i<zeroNode;i++) {
 705                         hash->hash[posInHT[i].slot].posInVoc = i;
 706                 }
 707         }
 708
 709         // INITIALIZING structures for the 2nd pass ......................................
 710         {
 711                 SE  = (uint *) malloc ((seSize+1)*sizeof (uint));
 712         }
 713
 714
 715         // **********************************************************************************
 716         //  STARTING THE SECOND PASS.
 717         // **********************************************************************************/
 718
 719         printf("\nSTARTING THE SECOND PASS... ");
 720         //2nd pass (processing the file)
 721         {
 722             byte *pbeg,*pend,*wordstart,*aWord;
 723             register ulong size;
 724             register uint i;
 725             register ulong countValidWords = 0;
 726
 727
 728             pbeg = inputBuffer;
 729             pend = inputBuffer+bytesFileReal;
 730
 731             while (pbeg <pend) {
 732                 if (*pbeg == 0)
 733                 {
 734                     fprintf(stderr, "buildFacade.c 2nd pass: assert failed, *pbeg == 0\n");
 735                     exit(1);
 736                 }
 737
 738                 //parsing either a word or separator.
 739                 size=0;
 740                 wordstart = pbeg;
 741                 if (_Valid[*pbeg]) {   //alphanumerical data
 742                     while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
 743                 }
 744                 else {
 745                     if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
 746                         while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
 747                     }
 748                     else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
 749                         pbeg++;
 750                         if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
 751                         else {
 752                             if (_Valid [*pbeg] ) {
 753                                 wordstart = pbeg;   //So skipping 1 blank character
 754                                 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
 755                             }
 756                             else {   // a "separator word" ...
 757                                 size++; //the prev BLANK...
 758                                 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg]  && *pbeg != 0)) {size++;pbeg++;}
 759                             }//else {  // a "separator word"
 760                         }//else ... not a unique BLANK AT THE END.
 761                     }//else ... starting by a BLANK...
 762                 }
 763
 764                 if (pbeg < pend && *pbeg == 0)
 765                     pbeg ++;    // Skip the 0-bytes
 766
 767                 if (size == 0)
 768                 {
 769                     fprintf(stderr, "buildFacade.c 2nd pass: assert failed, size == 0\n");
 770                     exit(1);
 771                 }
 772
 773                 //The parsed word/separator is  is "wordstart", and its length is "size"...
 774                 aWord=wordstart;
 775
 776                 //Processement done for each word word
 777                 i = inHashTable(hash,aWord, size, &addrInTH );
 778
 779                 SE[countValidWords]=hash->hash[addrInTH].posInVoc+1;  // !!!!
 780                 countValidWords++;
 781
 782             }// while pbeg<pend
 783
 784             SE[countValidWords] = 0;
 785             fprintf(stderr,"\n2nd pass ends: TOTAL distinct words: %lu totalWords = %lu",zeroNode,countValidWords);
 786
 787         }//2nd pass ends
 788
 789         // **********************************************************************************
 790         // END OF 2ND PASS
 791         // **********************************************************************************
 792
 793         //freeing the source text (it is no longer needed).
 794         delete [] inputBuffer; //the text
 795
 796         /** Now Setting the data of the index **/
 797         wcsa->n = zeroNode;
 798         wcsa->sourceTextSize = sourceTextSize;
 799         wcsa->seSize = seSize;
 800
 801         // Creating the words of the vocabulary...
 802         {
 803         /** copying the words into WCSA. */
 804                 uint *tmpOffsets = (uint *) malloc (sizeof(uint) * (zeroNode  +1) );  //1 extra uint (to point to the virtual "zeroNode+1" ^th word.
 805                 uint tmpOffset =0;
 806
 807                 byte *zoneMem,*src;
 808                 uint i;
 809
 810                 //Moving data from posInHT to WCSA structure
 811                 //wcsa->wordsData = (twords *) malloc(sizeof(twords) * zeroNode);
 812                 wcsa->wordsData.wordsZoneMem.size = totallenWords - zeroNode; //without '\0' bytes (end-tag).
 813                 wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc ( wcsa->wordsData.wordsZoneMem.size * sizeof(byte) );
 814                 zoneMem = wcsa->wordsData.wordsZoneMem.zone;
 815                 for(i = 0; i < zeroNode; i++) {
 816                         src = posInHT[i].word;                           //copying the canonical word
 817                         //wcsa->wordsData.words[i].word = zoneMem;   //setting the pointer
 818                         tmpOffsets[i]=tmpOffset;  //offset in zoneMem
 819                         while (*src) {*zoneMem++ = *src++; tmpOffset++;}  //moving data until '\0'
 820                         //*zoneMem='\0'; zoneMem++;            //copies also the '\0'
 821
 822                 }
 823                 tmpOffsets[zeroNode]=tmpOffset; //setting pointer to the "virtual" word {zeroNode+1}^{th}
 824
 825                 //kbit encoding of the offsets
 826                 uint elemSize = _bits(tmpOffset);
 827                 wcsa->wordsData.elemSize = elemSize;
 828                 wcsa->wordsData.words = (uint *) malloc (((((zeroNode +1)*elemSize)+W-1) /W) * sizeof(uint));  //with 1 extra slot !.
 829                 wcsa->wordsData.words[((((zeroNode +1)*elemSize)+W-1) /W)   -1 ] =0000;
 830                 //              fprintf(stderr,"\n ElemSize = %d, maxOffset = %d",elemSize,tmpOffset);
 831
 832                 tmpOffset=0;
 833                 for (i=0; i<=zeroNode; i++) {  //setting "zeroNode+1" offsets
 834                                 bitwrite(wcsa->wordsData.words, tmpOffset, elemSize, tmpOffsets[i]);
 835                                 tmpOffset+=elemSize;
 836                 }
 837
 838                 //////////// CHECKS IT WORKED. old !!!!
 839         //              { uint kk;
 840         //                      tmpOffset=0;
 841         //                      for (i=0; i<zeroNode; i++) {  //setting "zeroNode+1" offsets
 842         //                                      kk=bitread(wcsa->wordsData.words, i* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 843         //                                      tmpOffset+=elemSize;
 844         //                                      if (kk != tmpOffsets[i]) {fprintf(stderr,"\n @@@@@@@@ DISTINTOS OFFSETS "); break;}
 845         //                                      else fprintf(stderr,"\n iguales, %d, %d  :: <<%s>>len=%d",kk,i, posInHT[i].word, strlen((char*)posInHT[i].word));
 846         //                      }
 847         //              }
 848         //
 849         //              { uint len1, len, tmplen, len2;
 850         //                      uint i,p;
 851         //                      byte *wcsaWord, *src;
 852         //
 853         //                      for (p=0;p<zeroNode;p++) {
 854         //                              {//preparing for strcompL
 855         //                                      len    = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p+1)), wcsa->wordsData.elemSize);
 856         //                                      tmplen = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p))  , wcsa->wordsData.elemSize);
 857         //
 858         //                              //fprintf(stderr,"\n  :: off[%d]= %d  -  off [%d] = %d  ==> %d",p+1,len,p,tmplen,len-tmplen);
 859         //
 860         //                              len2 =len-tmplen;
 861         //                                      wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
 862         //                              }
 863         //
 864         //                              src = posInHT[p].word;
 865         //                              len1 = strlen((char *)src);
 866         //
 867         //                              if (strcompL(src,wcsaWord,len1,len2) != 0) {
 868         //                                      fprintf(stderr,"\n %6d DISTINTOS !! ===len1 %d,len %d===== <<",p,len1,len2);printWord(src,len1);
 869         //                                      fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>");
 870         //                                      exit(0);
 871         //                              }
 872         //                              else {
 873         //                                      fprintf(stderr,"\n %6d ======len1 %d,len2 %d===== <<",p,len1,len2);printWord(src,len1);
 874         //                                      fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>");
 875         //                              }
 876         //                      }
 877         //
 878         //              }
 879         //
 880
 881         /**-----------*/
 882         //frees memory from hash table and posInHT structures.
 883                 free(tmpOffsets);
 884                 destroy_hash(hash);
 885                 free(posInHT);
 886         }
 887
 888         /** ******* creates the self-index on ints (bottom layer) ==> see build_icsa *********/
 889 /**
 890         #ifdef CSA_ON
 891         {
 892                 uint total;
 893                 fprintf(stderr,"\n **** CREATING CSA from Edu's Code *****");
 894                 ticsa *myicsa;
 895                 myicsa = createIntegerCSA(&SE,seSize+1,build_options);
 896                 wcsa->myicsa= myicsa;
 897                 total = CSA_size(myicsa);
 898
 899                 free(SE);  //SE is no longer needed, (it is indexed by the iCSA)
 900                 printf("\n\t**** [iCSA built on %d words. Size = %ld bytes... RAM",seSize,total);
 901         }
 902         #endif
 903 */
 904
 905         //#ifndef CSA_ON
 906                 wcsa->se = SE;
 907                 wcsa->myicsa = NULL;
 908         //#endif
 909
 910         printf("\n\t ** Building done! **\n");
 911         printf("\n Process finished!\n");
 912
 913         *index = wcsa;
 914         return 0;
 915 }
 916
 917
 918 int build_iCSA (char *build_options, void *index)
 919 {
 920         twcsa *wcsa = (twcsa *) index;
 921         /********* creates the self-index on ints (bottom layer) *********/
 922         //creating CSA from Edu's code...
 923         {
 924                 uint total;
 925                 fprintf(stderr,"\n **** CREATING CSA-bottom-layer *****");
 926                 void *bottomIntIndex;
 927                 int err =  buildIntIndex(wcsa->se,wcsa->seSize+1, build_options,(void **)&bottomIntIndex);
 928                 wcsa->myicsa = bottomIntIndex;
 929
 930                 //total = CSA_size(wcsa->myicsa);
 931                 err = sizeIntIndex((void *) wcsa->myicsa, &total);
 932
 933                 printf("\n\t**** [iCSA built on %d words. Size = %u bytes... RAM",wcsa->seSize,total);
 934         }
 935         return 0;
 936 }
 937
 938 /** ********************************************************************
 939   * Loading from disk
 940   **********************************************************************/
 941
 942 /**-----------------------------------------------------------------
 943  * LoadWCSA.
 944  * Loads all the data structures of WCSA (included the icsa)
 945  ----------------------------------------------------------------- */
 946
 947 twcsa *loadWCSA(char *filename) {
 948         twcsa *wcsa;
 949         // Inicializes the arrays used to detect if a char is valid or not.
 950         StartValid();
 951         // Inicializes the arrays used translated a char into lowercase.
 952         StartToLow();
 953
 954         wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
 955         wcsa->n=0;
 956
 957         int err = loadIntIndex(filename, (void **)&wcsa->myicsa);
 958
 959         loadStructs(wcsa,filename);
 960
 961         return wcsa;
 962 }
 963
 964 /** ------------------------------------------------------------------
 965  * LoadStructs.
 966  *      Reads files and loads all the data needed for searcherFacade
 967  ----------------------------------------------------------------- */
 968  void loadStructs(twcsa *wcsa, char *basename) {
 969         uint i,j;
 970         char *filename;
 971         int file;
 972         uint sizeFile;
 973         char c;
 974         uint n;
 975
 976         filename = (char *)malloc(sizeof(char)*(strlen(basename)+10));
 977         fprintf(stderr,"Loading Index from file %s.*\n", basename);
 978
 979         //** SOME CONSTANTS: sourceTextSize
 980         {       strcpy(filename, basename);
 981                 strcat(filename, ".");
 982                 strcat(filename, CONSTANTS_FILE_EXT);
 983
 984                 if( (file = open(filename, O_RDONLY)) < 0) {
 985                         printf("Cannot open file %s\n", filename);
 986                         exit(0);
 987                 }
 988
 989                 read(file, &(wcsa->sourceTextSize), sizeof(uint));
 990                 read(file, &(wcsa->seSize), sizeof(uint));
 991                 close(file);
 992         }
 993
 994         /** File with the words from the vocabulary (sorted alphabetically) */
 995         {       byte *zoneMem;
 996
 997                 strcpy(filename, basename);
 998                 strcat(filename, ".");
 999                 strcat(filename, VOCABULARY_WORDS_FILE_EXT);
1000                 //sizeFile= fileSize(filename)-sizeof(uint);
1001
1002                 if( (file = open(filename, O_RDONLY)) < 0) {
1003                         printf("Cannot open file %s\n", filename);
1004                         exit(0);
1005                 }
1006
1007                 //the number of canonical words
1008                 read(file, &n, sizeof(uint));
1009                 wcsa->n = n;
1010                 read(file, &(wcsa->wordsData.elemSize), (sizeof(uint)));
1011                 read(file, &(wcsa->wordsData.wordsZoneMem.size), (sizeof(uint)));
1012
1013                 //allocating the memory needed for all words and reading them   //(ascii) << no \0 chars are needed>>.
1014                 wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc(wcsa->wordsData.wordsZoneMem.size * sizeof(byte));
1015                 read(file, (wcsa->wordsData.wordsZoneMem.zone),    wcsa->wordsData.wordsZoneMem.size * sizeof(byte) );
1016
1017                 //reading the offsets of the words (kbitArray that points to offsets in zoneMem of words.
1018                 wcsa->wordsData.words = (uint *) malloc (((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * sizeof(uint));
1019                 wcsa->wordsData.words[ ((((n+1)*(wcsa->wordsData.elemSize))+W-1) /W)   -1 ] =0000;
1020                 read(file, (wcsa->wordsData.words),     ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)));
1021
1022
1023                 close(file);
1024         }
1025         wcsa->se= NULL;
1026         free(filename);
1027 }
1028
1029
1030
1031
1032 /** ****************************************************************
1033   * Querying the index WCSA
1034   * ***************************************************************/
1035 ///////////////////////////////////////////////////////////////////////////////////////
1036 //                                         FUNCTIONS NEEDED FOR SEARCHING A PATTERN                                      //
1037 ///////////////////////////////////////////////////////////////////////////////////////
1038
1039
1040
1041 /*------------------------------------------------------------------
1042  * Given a text pattern translates it into a list of integers (corresponding to the
1043  * canonical words associated to the valid words in the text pattern)
1044  ------------------------------------------------------------------*/
1045 void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *integerPattern, uint *sizeIntegers) {
1046
1047         byte *pbeg,*pend,*wordstart,*aWord;
1048         register unsigned long size;
1049         uint index =0;
1050
1051         pbeg = textPattern;
1052         pend = pbeg + patLen;
1053
1054         while (pbeg <pend) {
1055                 //parsing either a word or separator.
1056                 size=0;
1057                 wordstart = pbeg;
1058                 if (_Valid[*pbeg]) {   //alphanumerical data
1059                         while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
1060           }
1061                 else {
1062                         if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
1063                                 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
1064                         }
1065                         else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
1066                                 pbeg++;
1067                                 if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
1068                                 else {
1069                                         if (_Valid [*pbeg] ) {
1070                                                 wordstart = pbeg;   //So skipping 1 blank character
1071                                                 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
1072                                         }
1073                                         else {   // a "separator word" ...
1074                                                 size++; //the prev BLANK...
1075                                                 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
1076                                         }//else {  // a "separator word"
1077                                 }//else ... not a unique BLANK AT THE END.
1078                         }//else ... starting by a BLANK...
1079                 }
1080
1081                 //The parsed word is "aWord", and its length is "size"...
1082                 aWord=wordstart;
1083
1084                 // Binary search on the canonical words (wordsData)
1085                 {
1086                         uint len, tmplen;
1087                         uchar *wcsaWord;
1088                         register uint min,max,p;
1089                         min = 0;
1090                         max = (wcsa->n) - 1;
1091                         while(min < max) {
1092                                 p = (min+max)/2;
1093
1094                                 {//preparing for strcompL
1095                                         len    = bitread (wcsa->wordsData.words, (p+1)* wcsa->wordsData.elemSize , wcsa->wordsData.elemSize);
1096                                         tmplen = bitread (wcsa->wordsData.words, (p )* wcsa->wordsData.elemSize  , wcsa->wordsData.elemSize);
1097                                         len -=tmplen;
1098                                         wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
1099                                 }
1100
1101                                 //if(strncmp((char*)aWord, (char*)wcsa->wordsData[p].word,size) > 0) min = p+1;
1102                                 if(strcompL(aWord, wcsaWord, size, len) > 0) min = p+1;
1103                                 else max = p;
1104
1105
1106                                 //                              { //SHOW PROGRESS
1107                                 //                                      fprintf(stderr,"\n Patron = <<%s>>, curposWord= %d ==><<",aWord,p);
1108                                 //                                      printWord(wcsaWord,len); fprintf(stderr,">> len =%d",len);
1109                                 //                              }
1110
1111                         }
1112
1113                         {//preparing for strcompL
1114                                 len    = bitread (wcsa->wordsData.words, (min+1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1115                                 tmplen = bitread (wcsa->wordsData.words, ( min )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1116                                 len -=tmplen;
1117                                         wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
1118                         }
1119
1120                         //      if(!strncmp((char*)aWord, (char*)wcsa->wordsData[min].word, size)) {
1121                         if(!strcompL(aWord, wcsaWord, size, len)) {
1122                                 integerPattern[index++] = min +1 ;  //<--
1123                         }
1124                         else {*sizeIntegers = 0; return;}  // a valid word that does not appear in the source text.
1125
1126                 }
1127         }// end while
1128         *sizeIntegers = index;
1129
1130         //      //shows the parsed words:
1131         //      {uint i;
1132         //              printf("\n\n >>%s>> HA SIDO PARSEADO COMO:",textPattern);
1133         //              for (i=0; i<index;i++) {
1134         //                              printf("<<%s>>",wcsa->wordsData[integerPattern[i] -1].word);
1135         //              }
1136         //
1137         //      }
1138 }
1139
1140
1141
1142
1143
1144         /** ------------------------------------------------------------------
1145          * Returns the number of occurrences of a given text pattern
1146          *------------------------------------------------------------------ */
1147 int countTextOcurrences(twcsa *wcsa, byte *textPattern) {
1148         ulong left, right;
1149         uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
1150         uint integerPatternSize, min, max;
1151
1152         uint lenpat = strlen((char*)textPattern);
1153         parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize);
1154         if (!integerPatternSize) return -1;
1155
1156 //      #ifdef DEBUG_ON
1157 //              uint i;
1158 //              printf("\n %d Integers to search for:",integerPatternSize );
1159 //              for (i=0;i<integerPatternSize;i++) {
1160 //                      printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word);
1161 //              }
1162 //              printf("\n");
1163 //      #endif
1164
1165         ulong numocc;
1166         int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, &numocc,  &left, &right);
1167         return numocc;
1168
1169 }
1170
1171
1172         /** ------------------------------------------------------------------
1173          * locateTextOcurrences:
1174          * Returns the offsets of the source text where a word/phrase appears
1175          * Returns also the number of occurrences.
1176          *------------------------------------------------------------------ */
1177 uint *locateTextOcurrences(twcsa *wcsa, byte *textPattern, int *numberOccurrences) {
1178         uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
1179         uint integerPatternSize, min, max;
1180
1181         uint lenpat = strlen((char*)textPattern);
1182         parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize);
1183         if (!integerPatternSize) {*numberOccurrences = -1; return NULL;}
1184
1185 //      #ifdef DEBUG_ON
1186 //              uint i;
1187 //              printf("\n %d Integers to search for:",integerPatternSize );
1188 //              for (i=0;i<integerPatternSize;i++) {
1189 //                      printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word);
1190 //              }
1191 //              printf("\n");
1192 //      #endif
1193
1194         ulong occurrences, left, right;
1195         ulong *seOffsets;
1196         ulong *sourceOffsets;
1197
1198         //obtains the indexes in vector SE where the pattern appears.
1199         //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences);
1200         int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences);
1201
1202         //sourceOffsets = (uint *) malloc (sizeof(uint)*occurrences);
1203
1204         sourceOffsets=seOffsets;
1205         //obtains the offsets in the source text of the pattern (sourceOffsets)
1206         locateFacade(wcsa, (uint *)sourceOffsets, (uint *)seOffsets,occurrences);
1207
1208         #ifdef DEBUG_ON
1209                 fprintf(stderr,"\n*** %s appears in the source text in positions:\n\t",textPattern);
1210                 for (i=0;i<occurrences;i++)
1211                         fprintf(stderr,"[%u]",sourceOffsets[i]);
1212                 fflush(stderr);
1213         #endif
1214
1215         *numberOccurrences = occurrences;
1216         return (uint *) sourceOffsets;
1217 }
1218
1219
1220         /** ------------------------------------------------------------------
1221          * displayTextOcurrences:
1222          * Shows in stdout, the text around the occurrences of a word/phrase
1223          * Returns also the number of occurrences.
1224          *------------------------------------------------------------------ */
1225 int displayTextOcurrences(twcsa *wcsa, byte *textPattern, uint radixDisplay) {
1226         return 99;  //not implemented: function not available
1227 }
1228
1229         /** ------------------------------------------------------------------
1230          * Locate Facade:
1231          * For given sePositions, returns the sourceTextPositions
1232          * where the those valid-words in se[sePositions[i]] occurr.
1233          *------------------------------------------------------------------*/
1234 int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint number) {
1235         return 99;  //not implemented: function not available for this index
1236 }
1237
1238
1239 /** ------------------------------------------------------------------
1240         * DISPLAYFACADE:
1241         * Returns the subString from a starting offset to a final offset
1242         * in the source text. It does not allocate any memory, receives "dstptr"
1243         * Precondition: offsetIni >=0;
1244         ------------------------------------------------------------------*/
1245  int displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length, byte *dstptr) {
1246         return 99;  //not implemented: function not available for this index
1247 }
1248
1249
1250         /**------------------------------------------------------------------
1251          * DISPLAYFacadeMalloc:
1252          * Returns the subString from a starting offset to a final offset
1253          * in the source text. It allocates Memory !!
1254          * NOT CURRENTLY USED
1255          ------------------------------------------------------------------*/
1256 byte *displayFacadeMalloc (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length) {
1257         byte *dstptr=NULL;         //not implemented: function not available
1258         return dstptr;
1259 }
1260
1261
1262         /** ------------------------------------------------------------------
1263          * LOCATEALLandDISPLAY:
1264          * Displays the text around an occurrence of the searched word in the source text.
1265          * Assuming that $p$ is that position --> shows only chars in [p_radix-1,p_radix]
1266          ------------------------------------------------------------------*/
1267 int locateAllAndDisplay (twcsa *wcsa, uint *sePositions, uint number, int radix) {
1268 return 99;   //not implemented: function not available for this index
1269
1270 }
1271
1272
1273         /** ------------------------------------------------------------------
1274          * recovers the source text by calling display(0,fileSize);
1275          * ------------------------------------------------------------------ */
1276 void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {
1277
1278         int start;int end;
1279         byte *cc;
1280         char *filename = (char *) malloc (strlen( basename)+ strlen(ext)+1);
1281         ulong length;
1282
1283         strcpy( filename,  basename);
1284         strcat( filename, ext);
1285         filename[strlen( basename)+ strlen(ext)]='\0';
1286         fprintf(stderr,"\n uncompressing text into file -->%s ",filename);fflush(stderr);
1287
1288         FILE *salida;
1289         unlink( filename);
1290         salida = fopen( filename,"w");
1291         start=0; end = sourceTextSize-1;
1292
1293         cc = (byte *) malloc (sourceTextSize* sizeof(uchar));
1294
1295         {
1296                 uint i, j;//, tmplen;
1297                 uint prevValid;
1298                 //uint ptr, maxptr;
1299                 byte *src, *dst;
1300                 uint leng =0;
1301                 uint tmplen =0;
1302
1303                 uint indexSE=0;
1304                 uint posSEValue=0;
1305
1306                 dst=cc;
1307                 while  ( (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
1308
1309                         int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
1310
1311                         {//obtains pointer to the ith word
1312                                 uint offtmp;
1313                                 uint ith = posSEValue -1;  // !!
1314                                 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1315                                 offtmp = bitread (wcsa->wordsData.words, ( ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1316                                 tmplen -=offtmp;  //the lenght of the ith word.
1317                                 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
1318                         }
1319
1320                         if (_Valid[*src]) {
1321                                 if (prevValid){
1322                                          *dst++ =' ';
1323                                         leng +=1;
1324                                 }
1325                                 prevValid =1;   //for the next iteration
1326                         }
1327                         else prevValid=0;
1328
1329                         indexSE++;
1330
1331                         for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
1332                         leng +=tmplen;
1333                 }//while
1334
1335         fprintf(stderr,"\n sourceTextSize = %d, len = %d",sourceTextSize,leng);
1336         fwrite(cc,sizeof(byte),leng,salida);
1337         fclose(salida);
1338
1339         free(cc);
1340         free(filename);
1341 }
1342
1343
1344 }
1345
1346                 //recovers the source text by calling extract Words.
1347 void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {
1348
1349         int start;int end; int error;
1350         char *filename = (char *) malloc (strlen( basename)+ strlen(ext)+1);
1351         byte *cc;
1352         ulong length;
1353
1354         strcpy( filename,  basename);
1355         strcat( filename, ext);
1356         filename[strlen( basename)+ strlen(ext)]='\0';
1357         fprintf(stderr,"\n uncompressing text into file -->%s ",filename);fflush(stderr);
1358
1359         FILE *salida;
1360         unlink( filename);
1361         salida = fopen( filename,"w");
1362         start=0; end = wcsa->seSize;
1363
1364         error = extractWords((void *) wcsa, start, end, &cc, &length);
1365         if (error) {fprintf(stderr,"\n error during recoverSourceText2"); exit(0);}
1366
1367         fprintf(stderr,"\n sourceTextSize = %d, len = %ld",sourceTextSize,length);
1368         fwrite(cc,sizeof(byte),length,salida);
1369         fclose(salida);
1370
1371         free(cc);
1372         free(filename);
1373 }
1374
1375 /** *******************************************************************************
1376   * Showing some statistics and info of the index
1377   * *******************************************************************************/
1378 void printInfoReduced(twcsa *wcsa) {
1379           //not implemented: function not available
1380 }
1381
1382                 /* Shows summary info of the index */
1383 int printInfo(void *index) {
1384         uint n;
1385
1386         twcsa *wcsa = (twcsa *) index;
1387
1388         unsigned long indexSize;
1389         uint intIndexSize, presentationSize;
1390         int err;
1391
1392         err = index_size(index, &indexSize);
1393         if (err!=0) return err;
1394         err = sizeIntIndex(wcsa->myicsa, &intIndexSize);
1395         if (err!=0) return err;
1396
1397         presentationSize = indexSize - intIndexSize;
1398
1399                 printf("\n ===================================================:");
1400                 printf("\n Summary of Presentation layer:");
1401                 printf("\n   Number of valid words (SEsize) = %u",wcsa->seSize);
1402                 printf("\n   Number of different words = %ld",wcsa->n);
1403                 printf("\n   WCSA structure = %lu bytes", sizeof(twcsa));
1404
1405                 uint totalpointers = ((((wcsa->n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));
1406                 uint totalasciizone = wcsa->wordsData.wordsZoneMem.size * sizeof(byte) ;
1407                 uint totalwords = totalasciizone + totalpointers;
1408
1409                 printf("\n   Size Of words structure (%d bytes):",totalwords);
1410                 printf("\n      [     pointers = %d bytes || AsciiZone = %d bytes", totalpointers,      totalasciizone);
1411
1412                 printf("\n\n Total = **  %u bytes (in RAM) **",presentationSize);
1413                 //printf("\n\n @@ Summary of self-index on Integers:");
1414                 err = printInfoIntIndex(wcsa->myicsa, " ");
1415                 if (err!=0) return err;
1416
1417                 printf("\n ===================================================:");
1418                 printf("\n");
1419                 return 0;
1420         }
1421
1422 /**------------------------------------------------------------------
1423  * structsSize.
1424  *      Counts the memory amount needed by the Facade (Presentation Layer).
1425  * skipping the stop_words hash table
1426  ----------------------------------------------------------------- */
1427 uint structsSizeMem(twcsa *wcsa) {
1428         return 0;   //not implemented: function not available for this index.
1429 }
1430
1431
1432 /** for debugging **/
1433 void printWord(uchar *str, uint len) {
1434                 uint i;
1435                 for (i=0;i<len;i++)
1436                         fprintf(stderr,"%c",str[i]);
1437 }
1438
1439
1440         /** saves the content of the file SE (ids of the source words) **/
1441 int saveSEfile (char *basename, uint *v, uint n) {
1442         char outfilename[255];
1443         int file;
1444         sprintf(outfilename,"%s.%s",basename,SE_FILE_EXT);
1445         unlink(outfilename);
1446         if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
1447                 printf("Cannot open file %s\n", outfilename);
1448                 exit(0);
1449         }
1450
1451         write(file, v, sizeof(uint) * n );
1452         close(file);
1453 }
1454
1455
1456
/** Returns the CPU time consumed so far by the calling process, in seconds:
  * the user time plus the system time reported by getrusage(RUSAGE_SELF). */
double getTime2 (void)
{
        struct rusage ru;
        double userSecs;
        double sysSecs;

        getrusage (RUSAGE_SELF, &ru);

        //each timeval is turned into seconds: whole part + microseconds.
        userSecs = (double) ru.ru_utime.tv_sec;
        userSecs += (double) ru.ru_utime.tv_usec / 1000000.0;

        sysSecs = (double) ru.ru_stime.tv_sec;
        sysSecs += (double) ru.ru_stime.tv_usec / 1000000.0;

        return userSecs + sysSecs;
}
1471
1472
1473
1474 /**------------------------------------------------------------------
1475   *  MAIN PROGRAM.
1476   *------------------------------------------------------------------ */
1477 #ifdef FACADEWITHMAIN
1478         int main(int argc, char* argv[])
1479         {
1480
1481
1482
1483                 char *infile, *outbasename, *stopwordsfile;     // Name of in/out files
1484                 byte *inputBuffer;
1485                 ulong finsize;
1486
1487                 int f_in;
1488                 void *Index;
1489
1490
1491                 printf("\n*Word-based iCSA: A word-based CSA");
1492                 printf("\n*CopyRight (c) 2008 [LBD & G.N.]\n\n");
1493
1494                 // Reads input parameters from command line.
1495                 if(argc < 3) {
1496                         printf("Use: %s <in file> <out basename> \n", argv[0]);
1497                         exit(0);
1498                 }
1499
1500                 // Reads params (input file, output basename, and stopwords file)
1501                 infile = argv[1];
1502                 outbasename = argv[2];
1503                 stopwordsfile = argv[3];
1504
1505                 finsize= fileSize(infile);
1506
1507                 if (! finsize) {
1508                         printf( "\nFILE EMPTY OR FILE NOT FOUND %s !!\nSkipping processement ...\n",infile);
1509                         exit(0);
1510                 }
1511
1512                 // Opening the input text file.
1513                 if( (f_in = open(infile, O_RDONLY)) < 0) {
1514                         printf("Cannot read file %s\n", infile);
1515                         exit(0);
1516                 }
1517                 inputBuffer = (byte *) malloc(finsize *sizeof(byte));// +1);
1518                 read (f_in,inputBuffer,finsize);
1519                 close (f_in);
1520
1521
1522         {
1523                 //printf("\n parametros <<%s>>\n\n",stopwordsfile);
1524                 build_index (inputBuffer, finsize, stopwordsfile, &Index);  /** building the index */
1525
1526 //              /** recovering the source text from the index */
1527                         {
1528                                 double start, end;
1529                                 start = getTime2();
1530                                 ulong size;
1531                                 get_length(Index, &size);
1532                                 char extension[10]= ".source";
1533
1534                                 //recoverSourceText1((twcsa*) Index, outbasename,extension, size);
1535                                 strcat(extension,"2");
1536                                 recoverSourceText2((twcsa*) Index, outbasename,extension,size);
1537                                 end = getTime2();
1538                                 fprintf(stderr, "\nRecovering source file time: %.3f secs\n", end-start );
1539                         }
1540 //
1541                 // DISPLAYING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
1542                         {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1543                          int error = 0;
1544                         ulong numocc,numc, length, i, *snippet_len, tot_numcharext = 0, numpatt;
1545                         uchar *pattern, *snippet_text;
1546
1547                                  pattern = textPattern;
1548                          printf("\nSEARCH TEST for DISPLAY (pizzachili interface)\n");
1549                                 while(1) {
1550                                         printf("Intro string: ");
1551                                         fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1552                                         if (!strcmp((char*)textPattern,"\n") ) break;
1553                                          textPattern[strlen((char*)textPattern)-1] = '\0';
1554
1555                                         length = strlen( (char*)textPattern);
1556                                         numc=50;
1557
1558 //                                      error = display (Index, textPattern, length, numc, &numocc,
1559 //                                                               &snippet_text, &snippet_len);
1560                                         error = displayWords (Index, textPattern, length, numc, &numocc,
1561                                                                  &snippet_text, &snippet_len,1);
1562
1563                                         if (error){ fprintf(stderr, "%s\n", "Hubo un error durante display");exit(0);}
1564
1565                                                 fprintf(stderr,"\n acabou display");fflush(stderr);
1566                                         {//show the results
1567                                                 ulong j, len = length + 2*numc;
1568                                             char blank = '\0';
1569                                                 fprintf(stderr,"\n length = %d",length);
1570                                                 fprintf(stderr,"\n pattern = %s",pattern);fflush(stderr);
1571                                                 fprintf(stderr,"\n numocc = %d",numocc);fflush(stderr);
1572                                                 fprintf(stderr,"\n snippet len = %d",len);fflush(stderr);
1573                                                 fprintf(stderr,"\n =========");fflush(stderr);
1574                                                 for (i = 0; i < numocc; i++){
1575                                                         fprintf(stderr,"\n[%2d][len=%3d]<<",i+1,snippet_len[i]);fflush(stderr);
1576                                                         fwrite(snippet_text+len*i,sizeof(uchar),snippet_len[i],stderr);fflush(stderr);
1577                                                         fprintf(stderr,">>");fflush(stderr);
1578                                                 }
1579                                         }
1580                                         numpatt--;
1581
1582                                         for(i=0; i<numocc; i++) {
1583                                                 tot_numcharext += snippet_len[i];
1584                                         }
1585
1586                                         if (numocc) {
1587                                                 free (snippet_len);
1588                                                 free (snippet_text);
1589                                         }
1590
1591                                         printf("Ocurrences = %d\n", numocc);
1592                                         if (!strcmp((char*)textPattern,"\n") ) break;
1593                                 }
1594                         }
1595
1596 //
1597 //
1598 //      // SEARCHING FOR A TEXT PATTERN (word/phrase).
1599 //      {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1600 //       int occ;
1601 //       int len;
1602 //       uint *occs;
1603 //       int i;
1604 //       printf("\nSEARCH TEST for LOCATE\n");
1605 //              while(1) {
1606 //                      printf("Intro string: ");
1607 //                      fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1608 //                      len = strlen((char*)textPattern);
1609 //                      if (!strcmp((char*)textPattern,"\n") ) break;
1610 //                      textPattern[len-1] = '\0';
1611 //                      len --;
1612 //
1613 //                      //occs = locateTextOcurrences(wcsa,textPattern,&occ);
1614 //                      // locate(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ);
1615 //                        locateWord(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ, 0);
1616 //
1617 //                      printf("\n*** %s occurs %d times: In the source text in positions:\n\t",textPattern,occ);
1618 //                      for (i=0;i<occ;i++)
1619 //                              printf("[%u]",occs[i]);
1620 //                      fflush(stderr);
1621 //                      free(occs);
1622 //
1623 //                      if (!strcmp((char*)textPattern,"\n") ) break;
1624 //              }
1625 //      }
1626 //
1627 //
1628
1629                 // COUNTING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
1630                 /*
1631                 {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1632                  int occ;
1633                  int len;
1634                  printf("\nSEARCH TEST for COUNT.\n");
1635                         while(1) {
1636                                 printf("Intro string: ");
1637                                 fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1638                                 len = strlen((char*)textPattern);
1639                                 if (!strcmp((char*)textPattern,"\n") ) break;
1640                                 textPattern[len-1] = '\0';
1641                                 len --;
1642
1643                                 count(Index, textPattern, len, (ulong *)&occ);
1644                                 //occ = countTextOcurrences(wcsa,textPattern);
1645                                 printf("Ocurrences = %d\n", occ);
1646                         }
1647                 }
1648                 printf("\n END COUNTING OCCURRENCES OF PATTERNS. ...\n");
1649                 //exit(0);
1650                 */
1651
1652                 /** saving the index to disk*/
1653                 save_index (Index, outbasename);
1654
1655                 /** tells the mem used by the index */
1656                 ulong indexsize;
1657                 index_size(Index, &indexsize);
1658                 fprintf(stderr,"Index occupied %d bytes, 2 extra mallocs = %d",indexsize,2* sizeof(uint));
1659
1660                 /** freeing the index */
1661                 free_index(Index);
1662
1663         }
1664 }
1665
1666 #endif
1667
1668
1669