swcsa/buildFacade.c

   1 #include "buildFacade.h"
   2 #include "utils/errors.c"
   3
   4
   5 /** Building the index */
   6
   7     /* Creates index from text[0..length-1]. Note that the index is an
   8       opaque data type. Any build option must be passed in string
   9       build_options, whose syntax depends on the index. The index must
  10       always work with some default parameters if build_options is NULL.
  11       The returned index is ready to be queried. */
  12 int build_index (uchar *text, ulong length, char *build_options, void **index) {
  13         int returnvalue;
  14
  15      printf("\n parameters: \"%s\"\n",build_options); fflush(stderr);
  16
  17     returnvalue = build_WCSA (text, length, build_options, index);
  18
  19     if (!returnvalue)
  20         returnvalue = build_iCSA (build_options,*index);
  21
  22     return returnvalue;
  23 }
  24
  25
  26 /**  Saves index on disk by using single or multiple files, having
  27         proper extensions. */
  28 int save_index (void *index, char *filename) {
  29
  30         char *basename = filename;
  31         twcsa *wcsa=(twcsa *) index;
  32
  33         uint i,j;
  34         char *outfilename;
  35         int file;
  36         char c;
  37
  38         printf("\n Saving structures to disk: %s.*",filename);
  39         outfilename = (char *)malloc(sizeof(char)*(strlen(basename)+10));
  40
  41         /**File with some constants (bSize and tohSize); */
  42         { uint number;
  43                 strcpy(outfilename, basename);
  44                 strcat(outfilename, ".");
  45                 strcat(outfilename, CONSTANTS_FILE_EXT);
  46                 unlink(outfilename);
  47                 if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
  48                         printf("Cannot open file %s\n", outfilename);
  49                         exit(0);
  50                 }
  51                 write(file, &(wcsa->sourceTextSize), sizeof(uint));
  52                 write(file, &(wcsa->seSize), sizeof(uint));
  53                 close(file);
  54         }
  55
  56         /** The Words in the vocabulary of words  (sorted alphabetically)*/
  57         {       strcpy(outfilename, basename);
  58                 strcat(outfilename, ".");
  59                 strcat(outfilename, VOCABULARY_WORDS_FILE_EXT);
  60                 unlink(outfilename);
  61                 if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
  62                         printf("Cannot open file %s\n", outfilename);
  63                         exit(0);
  64                 }
  65
  66                 uint n = wcsa->n;
  67                 uint elemSize = wcsa->wordsData.elemSize;
  68                 write(file, &n, sizeof(uint));
  69                 write(file, &elemSize, sizeof(uint));
  70                 write(file, &(wcsa->wordsData.wordsZoneMem.size), sizeof(uint));
  71
  72                 //the number of canonical words
  73                 write(file, (char *)wcsa->wordsData.wordsZoneMem.zone, wcsa->wordsData.wordsZoneMem.size * sizeof(byte));
  74                 write(file, (char *)wcsa->wordsData.words, ((((n+1)* (elemSize))+W-1) /W) * (sizeof(uint)) );
  75
  76                 close(file);
  77         }
  78
  79         free(outfilename);
  80
  81         if (wcsa->myicsa) {
  82                 /******** saves index on integers (bottom) ******/
  83                 //Storing the CSA
  84                 //storeStructsCSA(wcsa->myicsa,basename);
  85                 saveIntIndex((void *) wcsa->myicsa, basename);
  86         }
  87
  88         if (wcsa->se) {
  89                 saveSEfile(basename,wcsa->se, wcsa->seSize+1);
  90                 //free(wcsa->se);
  91         }
  92
  93         return 0;
  94 }
  95
  96
  97
  98     /**  Loads index from one or more file(s) named filename, possibly
  99       adding the proper extensions. */
 100 int load_index(char *filename, void **index){
 101         twcsa *wcsa;
 102         wcsa = loadWCSA (filename);
 103         (*index) = (void *) wcsa;
 104         return 0;
 105 }
 106
 107         /** Frees the memory occupied by index. */
 108 int free_index(void *index){
 109         twcsa *wcsa=(twcsa *) index;
 110         ulong size;
 111         index_size(index,&size);
 112         printf("\n[destroying index] ...Freed %lu bytes... RAM", size);
 113
 114
 115         //frees the array SE.
 116         if (wcsa->se)
 117                 free (wcsa->se);
 118
 119         //the iCSA.
 120         if (wcsa->myicsa) {
 121                 //destroyStructsCSA(wcsa->myicsa);
 122                 int err = freeIntIndex((void *) wcsa->myicsa);
 123         }
 124
 125         //the words.
 126         free (wcsa->wordsData.wordsZoneMem.zone);
 127         free (wcsa->wordsData.words); /** huge!! */
 128
 129         //the pointer to wcsa.
 130         free(wcsa);
 131         return 0;
 132 }
 133
 134         /** Gives the memory occupied by index in bytes. */
 135 int index_size(void *index, ulong *size) {
 136         ulong totaltmp;
 137         twcsa *wcsa=(twcsa *)index;
 138         uint n= wcsa->n;
 139         *size=0;
 140         *size += sizeof(twcsa);
 141
 142         totaltmp=0;  //words
 143         totaltmp += ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));  //the pointers
 144         totaltmp += wcsa->wordsData.wordsZoneMem.size * sizeof(byte); //the characters of the words.
 145         *size += totaltmp;
 146
 147         if (wcsa->myicsa) {
 148                 uint nbytes;
 149                 int err = sizeIntIndex((void *) wcsa->myicsa, &nbytes);
 150                 *size += nbytes;
 151                 //*size += CSA_size(wcsa->myicsa);
 152         }
 153
 154         return 0;
 155 }
 156
 157
 158 /** Querying the index =============================================================*/
 159
 160         /* Writes in numocc the number of occurrences of the substring
 161            pattern[0..length-1] found in the text indexed by index. */
 162 int count (void *index, uchar *pattern, ulong length, ulong *numocc){
 163         uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
 164         uint integerPatternSize;
 165         ulong l,r;
 166
 167         twcsa *wcsa=(twcsa *) index;
 168         parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);
 169         if (!integerPatternSize) {*numocc=0; return 0;} //not found
 170
 171         //*numocc = countCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &l, &r);
 172         int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, numocc,  &l, &r);
 173         return 0;
 174 }
 175
 176         /* Writes in numocc the number of occurrences of the substring
 177           pattern[0..length-1] in the text indexed by index. It also allocates
 178           occ (which must be freed by the caller) and writes the locations of
 179           the numocc occurrences in occ, in arbitrary order.  */
 180 int locate(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc){
 181         return 99;
 182 }
 183
 184    /* Gives the length of the text indexed */
 185 int get_length(void *index, ulong *length) {
 186         twcsa *wcsa=(twcsa *) index;
 187         *length = wcsa->sourceTextSize;
 188         return 0;
 189 }
 190
 191     /**  Obtains the length of the text indexed by index. */
 192
 193 int length (void *index, ulong *length) {
 194         return (get_length(index,length));
 195 }
 196
 197
 198 /** ***********************************************************************************
 199   * Accessing the indexed text
 200   * ***********************************************************************************/
 201
 202
 203         /**  Allocates snippet (which must be freed by the caller) and writes
 204           the substring text[from..to] into it. Returns in snippet_length the
 205           length of the text snippet actually extracted (that could be less
 206           than to-from+1 if to is larger than the text size).                      */
 207 int extract (void *index, ulong from, ulong to, uchar **snippet, ulong *snippet_length) {
 208     twcsa *wcsa=(twcsa *) index;
 209         return 99;
 210 }
 211
 212   /** Displays the text (snippet) surrounding any occurrence of the
 213     substring pattern[0..length-1] within the text indexed by index.
 214     The snippet must include numc characters before and after the
 215     pattern occurrence, totalizing length+2*numc characters, or less if
 216     the text boundaries are reached. Writes in numocc the number of
 217     occurrences, and allocates the arrays snippet_text and
 218     snippet_lengths (which must be freed by the caller). The first is a
 219     character array of numocc*(length+2*numc) characters, with a new
 220     snippet starting at every multiple of length+2*numc. The second
 221     gives the real length of each of the numocc snippets. */
 222
 223 int display (void *index, uchar *pattern, ulong length, ulong numc,
 224         ulong *numocc, uchar **snippet_text, ulong **snippet_lengths) {
 225         return 99;
 226 }
 227
 228
 229
 230 /** ***********************************************************************************
 231   * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord
 232   * ***********************************************************************************/
 233         /* Writes in numocc the number of occurrences of the substring
 234           pattern[0..length-1] in the text indexed by index. It also allocates
 235           occ (which must be freed by the caller) and writes the locations of
 236           the numocc occurrences in occ, in arbitrary order. These occurrences
 237           refer to the offsets in TOH where the caller could start a display
 238           operation. So locateWord implies synchronization using B.
 239           Moreover, positions occ[numocc.. 2*numocc-1] is set with the rank in SE of the
 240           words whose codes begin in TOH in the positions in occ[0... numocc-1]
 241           ** Parameter kbefore sets locateWord not to obtain the offset in TOH of the
 242              searched word, but the offset in TOH of k-before words before.
 243         */
 244
 245 int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore){
 246         uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
 247         uint integerPatternSize;
 248         ulong occurrences,l,r;
 249         twcsa *wcsa=(twcsa *) index;
 250
 251         parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);
 252         if (!integerPatternSize) {*numocc=0; return 0;} //not found
 253
 254         ulong *seOffsets;
 255
 256         //obtains the indexes in vector SE where the pattern appears.
 257         //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences);
 258         int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences);
 259
 260         *numocc = occurrences;
 261
 262         if (!occurrences) {(*occ)=NULL;return 0;}
 263
 264         (*occ) = (ulong *)seOffsets;
 265         return 0;
 266 }
 267
 268
 269   /** Displays the text (snippet) surrounding any occurrence of the
 270     substring pattern[0..length-1] within the text indexed by index.
 271     The snippet must include numc characters before and after the
 272     pattern occurrence, totalizing length+2*numc characters, or less if
 273     the text boundaries are reached. Writes in numocc the number of
 274     occurrences, and allocates the arrays snippet_text and
 275     snippet_lengths (which must be freed by the caller). The first is a
 276     character array of numocc*(length+2*numc) characters, with a new
 277     snippet starting at every multiple of length+2*numc. The second
 278     gives the real length of each of the numocc snippets. */
 279
 280  int displayWords (void *index, uchar *pattern, ulong length, ulong numc,
 281          ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore) {
 282
 283     /** actually extracts upto length + 2*numc chars, starting extraction kbefore
 284      *  words before the occurrence **/
 285
 286         ulong *indexesInSE;
 287         ulong occurrences;
 288         uint bytesPerSnippet;
 289         byte *text_aux;
 290         twcsa *wcsa=(twcsa *) index;
 291
 292         locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);
 293         (*numocc) = occurrences;
 294
 295         if (!occurrences) {
 296                 *snippet_text =NULL;
 297                 *snippet_lengths =NULL;
 298                 return 0;
 299         }
 300
 301         bytesPerSnippet = length+2*numc;
 302 //      bytesPerSnippet = 2*numc;
 303         *snippet_lengths = (ulong *) malloc((*numocc)*sizeof(ulong));
 304         if (!(*snippet_lengths)) return 1;
 305         *snippet_text = (uchar *) malloc((*numocc)*(bytesPerSnippet)*sizeof(uchar) +1) ;  //(the last "1" is for '\0');
 306         if (!(*snippet_text)) return 1;
 307
 308  //     fprintf(stderr,"\n occs found = %7d for pattern %s",*numocc, pattern);
 309  //     fflush(stderr);
 310
 311         text_aux=*snippet_text;
 312         {
 313                 uint i, j, tmplen;
 314                 uint ptr, maxptr;
 315                 byte *src, *dst;
 316                 uint snippetLen;
 317                 uint posSEValue,indexSE;
 318
 319                 for (i=0;i<occurrences;i++) {
 320                                 uint prevValid=0;
 321                                 uint endSnippet =0;
 322
 323                                 /** decodes words from there */
 324                                 snippetLen=0;
 325                                 indexSE = indexesInSE[i];
 326                                 indexSE = (indexSE > kbefore) ? indexSE-kbefore : 0;
 327
 328                                 dst = text_aux;
 329                                 while ((!endSnippet) && (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
 330
 331                                         //posSEValue =displayCSA(wcsa->myicsa,indexSE);
 332                                         int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
 333
 334                                         {//obtains pointer to the ith word
 335                                                 uint offtmp;
 336                                                 uint ith = posSEValue -1;  // !!
 337                                                 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 338                                                 offtmp = bitread (wcsa->wordsData.words, ( ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 339                                                 tmplen -=offtmp;  //the lenght of the ith word.
 340
 341                                                 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
 342                                         }
 343
 344                                         if (_Valid[*src]) {
 345                                                 if (prevValid){
 346                                                          *dst++ =' ';
 347                                                          snippetLen++;
 348                                                          if  (snippetLen==bytesPerSnippet) break;  //end of snippet (ends in BLANK_SPACE)
 349                                                 }
 350                                                 prevValid =1;   //for the next iteration
 351                                         }
 352                                         else prevValid=0;
 353
 354                                         indexSE++;
 355
 356                                         /* at the end ?? */
 357                                         if  ((tmplen+snippetLen)>=bytesPerSnippet) {
 358                                                 tmplen =(bytesPerSnippet - snippetLen);
 359                                                 endSnippet=1; //so while loop ends;
 360                                         }
 361
 362                                         for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
 363                                         snippetLen +=tmplen;
 364                                 }//while
 365
 366                                 text_aux += bytesPerSnippet;
 367                                 (*snippet_lengths)[i] = snippetLen;
 368                         }       //for
 369
 370                         if (occurrences) free(indexesInSE);
 371                 }
 372                 return 0;
 373 }
 374
 375 /** simulates extration of text process, but do not actually returns anything at all
 376    Extracts upto <=2K words from K=wordsbefore words before each occurrence of a pattern.
 377    Less than 2K words can be extracted if more than numc characters have been already obtained.
 378    Does nothing else... does not return the text */
 379
 380 int  displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc) {
 381
 382         ulong *indexesInSE;
 383         ulong occurrences;
 384         byte *text_aux;
 385
 386         twcsa *wcsa=(twcsa *) index;
 387
 388         locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);
 389
 390         if (!occurrences) {
 391                 return 0;
 392         }
 393
 394         ulong maxsnippetLen  = maxnumc;
 395         ulong extractedbytes = 0;
 396
 397         text_aux = (byte *) malloc (maxsnippetLen+1);
 398
 399         {
 400                 uint i, j, tmplen;
 401                 uint ptr, maxptr;
 402                 byte *src, *dst;
 403                 uint snippetLen;
 404                 uint posSEValue,indexSE;
 405
 406                 uint numWordsToExtract = 2 * wordsbefore;
 407                 uint z;
 408                 //printf("\n occurrences... = %lu",occurrences);
 409
 410                 for (i=0;i<occurrences;i++) {
 411                                 uint prevValid=0;
 412                                 uint endSnippet =0;
 413
 414                                 /** decodes words from there */
 415                                 snippetLen=0;
 416                                 indexSE = indexesInSE[i];
 417                                 indexSE = (indexSE > wordsbefore) ? indexSE-wordsbefore : 0;
 418
 419                                 dst = text_aux;
 420                                 z=0;
 421                                 while ((z<numWordsToExtract) && (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
 422
 423                                         //posSEValue =displayCSA(wcsa->myicsa,indexSE);
 424                                         int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
 425
 426                                         {//obtains pointer to the ith word
 427                                                 uint offtmp;
 428                                                 uint ith = posSEValue -1;  // !!
 429                                                 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 430                                                 offtmp = bitread (wcsa->wordsData.words, ( ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 431                                                 tmplen -=offtmp;  //the lenght of the ith word.
 432
 433                                                 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
 434                                         }
 435
 436                                         if (_Valid[*src]) {
 437                                                         if (prevValid){
 438                                                                  *dst++ =' ';
 439                                                                  snippetLen++;
 440                                                                  if  (snippetLen==maxsnippetLen) break;  //end of snippet (ends in BLANK_SPACE)
 441                                                                 }
 442                                                         prevValid =1;   //for the next iteration
 443                                         }
 444                                         else prevValid=0;
 445
 446                                         indexSE++;
 447
 448                                         /* at the end ?? */
 449                                         if  ((tmplen+snippetLen)>=maxsnippetLen) {
 450                                                         break;
 451                                         }
 452
 453                                         //fprintf(stderr,"\ntmplen = %d ",tmplen); fflush(stderr);
 454                                         for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
 455                                         snippetLen +=tmplen;
 456                                         z++;
 457                                 }//while
 458
 459                                 extractedbytes += snippetLen;
 460
 461                         }       //for
 462
 463                         if (occurrences) free(indexesInSE);
 464                 }
 465                 if (text_aux) free (text_aux);
 466                 return extractedbytes;
 467 }
 468
 469
 470
 471 /**  Allocates text (which must be freed by the caller) and recovers the
 472   the substring of text starting from the "fromword"-th word up to the
 473   "toWord"-th words. Returns in text the text, and in "text_lenght" the
 474   length of the text  actually extracted. Text is allocated.
 475   Actually extracts SE[fromWord .. toWord) ... not the last word.    */
 476
 477 int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text,
 478         ulong *text_length){
 479
 480         twcsa *wcsa=(twcsa *) index;
 481     uint initTextLen=10000;
 482     uint avgWordLen =7;
 483
 484         uint i, j;//, tmplen;
 485         uint prevValid;
 486         byte *src, *dst, *buff;
 487         uint tmplen =0;
 488
 489         uint buffBytes = 1000;
 490         uint leng=0; //curr pos in buffer that was occupied.
 491
 492         if (toWord > wcsa->seSize) toWord = wcsa->seSize;
 493         if (fromWord >= wcsa->seSize) fromWord = wcsa->seSize-1;
 494         if (buffBytes < ( (toWord-fromWord)* avgWordLen)) buffBytes = ((toWord-fromWord)* avgWordLen);
 495
 496         buff = (uchar *) malloc (buffBytes * sizeof(char));
 497         if (!buff) return 1; //out of memory.
 498         dst = buff;
 499
 500         register uint indexSE=fromWord;
 501         uint posSEValue=0;
 502         register uint ith;
 503
 504         while  ( (indexSE < toWord) ){ /** extracting words (if not at the end) */
 505
 506                 int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
 507
 508                 {//obtains pointer to the ith word
 509                         uint offtmp;
 510                         ith= posSEValue -1;  // !!
 511                         tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 512                         offtmp = bitread (wcsa->wordsData.words, (ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 513                         tmplen -=offtmp;  //the lenght of the ith word.
 514                         src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
 515                 }
 516
 517                 if ( buffBytes < (leng + tmplen+1) ) {
 518                         buffBytes *=2;
 519                         buff = (uchar*) realloc(buff, buffBytes);
 520                         if (!buff) return 1; //out of memory.
 521                         dst = buff + leng;
 522                 }
 523
 524                 if (_Valid[*src]) {
 525                         if (prevValid){
 526                                 *dst++ =' ';
 527                                 leng  += 1;
 528                         }
 529                         prevValid =1;   //for the next iteration
 530                 }
 531                 else prevValid=0;
 532
 533                 indexSE++;
 534
 535                 for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
 536                 leng +=tmplen;
 537         }//while
 538
 539         *text_length =leng;
 540         *dst='\0';
 541         *text = buff;
 542         return 0;
 543 }
 544
 545
 546
 547 /** ***********************************************************************************
 548          CONSTRUCTION OF THE INDEX WCSA
 549     ***********************************************************************************/
 550
 551         /**------------------------------------------------------------------
 552         Compares two slots (alphanumericaly). For qsort of canonical words
 553         ------------------------------------------------------------------ */
 554         int qSortWordsCompareAlpha(const void *arg1, const void *arg2) {
 555                 tposInHT *a1 = (tposInHT *) arg1;
 556                 tposInHT *a2 = (tposInHT *) arg2;
 557                 return strcmp((char*)a1->word, (char *)a2->word);
 558         }
 559
 560 /**
 561   * BUILDS THE WCSA INDEX
 562   */
 563
 564 int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
 565
 566         unsigned long zeroNode;  //number of different canonical words.
 567
 568         t_hash hash;            // the hash table to store both variants and canonical words.
 569         tposInHT *posInHT;      // structure for canonicals and variants+huffmans
 570
 571         uint sourceTextSize;
 572
 573         uint seSize=0;  //it's size == "numberOfValidWords".
 574         uint *SE;       //Integers vector. (represents the rank of the valid words in the source text).
 575
 576         uint totallenWords=0; //The numberOfBytes that occupy canonical words (their ascii version) in memory
 577
 578
 579         ulong bytesFile,bytesFileReal;
 580         long sizeNValue;
 581
 582         /* used during first pass */
 583
 584         ulong addrInTH;
 585
 586         byte* inputBuffer = text;
 587         bytesFileReal= bytesFile = length;
 588
 589         sourceTextSize=length;
 590
 591         /** Initializes WCSA structure*/
 592         twcsa *wcsa;
 593         wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
 594         zeroNode=0;
 595         /**      */
 596
 597         //Stimation (Using Heap's law) of the number of different "meaningful" words.
 598         //sizeNValue=N_value;
 599         if(bytesFile<5000000) bytesFile = 5000000;
 600         sizeNValue = (unsigned long) floor(3.9* pow(bytesFile,0.60) );
 601
 602
 603         // Inicializes the arrays used to detect if a char is valid or not.
 604         StartValid();
 605         // Inicializes the arrays used translated a char into lowercase.
 606         StartToLow();
 607
 608
 609         // **********************************************************************************
 610         //STARTING THE FIRST PASS.
 611         // **********************************************************************************
 612         printf("\nSTARTING THE FIRST PASS...");
 613
 614         posInHT = (tposInHT *) malloc(sizeof(tposInHT) * sizeNValue);
 615         hash = initialize_hash (sizeNValue); //hash to cointain both the parsed words
 616
 617         //-----------------------------------------------------------------
 618         //1st pass (processing the file)
 619         {
 620                 byte *pbeg,*pend,*wordstart,*aWord;
 621                 register ulong size;
 622                 register uint i;
 623
 624                 pbeg = inputBuffer;
 625                 pend = inputBuffer+bytesFileReal;
 626
 627                 while (pbeg <pend) {
 628
 629                         //parsing either a word or separator.
 630                         size=0;
 631                         wordstart = pbeg;
 632                         if (_Valid[*pbeg]) {   //alphanumerical data
 633                                 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
 634                     }
 635                         else {
 636                                 if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
 637                                         while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
 638                                 }
 639                                 else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
 640                                         pbeg++;
 641                                         if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
 642                                         else {
 643                                                 if (_Valid [*pbeg] ) {
 644                                                         wordstart = pbeg;   //So skipping 1 blank character
 645                                                         while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
 646                                                 }
 647                                                 else {   // a "separator word" ...
 648                                                         size++; //the prev BLANK...
 649                                                         while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
 650                                                 }//else {  // a "separator word"
 651                                         }//else ... not a unique BLANK AT THE END.
 652                                 }//else ... starting by a BLANK...
 653                         }
 654
 655                         //The parsed word/separator is  is "wordstart", and its length is "size"...
 656                         aWord=wordstart;
 657
 658                         //Processement done for each word word
 659                         i = inHashTable(hash,aWord, size, &addrInTH );
 660                         if (!i){
 661                                 insertElement (hash,aWord, size, &addrInTH);
 662                                 posInHT[zeroNode].slot=addrInTH;
 663                                 posInHT[zeroNode].word=hash->hash[addrInTH].word;
 664                                 hash->hash[addrInTH].posInVoc = zeroNode;
 665                                 zeroNode++;
 666                                 totallenWords += size +1;                       // +1 due to the '\0' char...
 667                                 //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size);
 668                         }
 669                         seSize ++;
 670                 }//while pbeg<pend
 671
 672                 fprintf(stderr,"\n1st pass ends: TOTAL distinct words: %lu totalWords = %u",zeroNode,seSize);
 673
 674         }//1st pass ends
 675
 676
 677         // **********************************************************************************
 678         // END OF 1ST PASS
 679         // **********************************************************************************
 680
 681         // Sorting the words alphanumerically (over posInHT)
 682         {       register unsigned long i,j;
 683                 //sorting canonical words ...
 684                 qsort(posInHT, zeroNode, sizeof(tposInHT), qSortWordsCompareAlpha);
 685
 686                 //setting in hash the new positions of the  words in the hash table
 687                 for (i=0;i<zeroNode;i++) {
 688                         hash->hash[posInHT[i].slot].posInVoc = i;
 689                 }
 690         }
 691
 692         // INITIALIZING structures for the 2nd pass ......................................
 693         {
 694                 SE  = (uint *) malloc ((seSize+1)*sizeof (uint));
 695         }
 696
 697
 698         // **********************************************************************************
 699         //  STARTING THE SECOND PASS.
 700         // **********************************************************************************/
 701
 702         printf("\nSTARTING THE SECOND PASS... ");
 703         //2nd pass (processing the file)
 704         {
 705                 byte *pbeg,*pend,*wordstart,*aWord;
 706                 register ulong size;
 707                 register uint i;
 708                 register ulong countValidWords = 0;
 709
 710
 711                 pbeg = inputBuffer;
 712                 pend = inputBuffer+bytesFileReal;
 713
 714                 while (pbeg <pend) {
 715
 716                         //parsing either a word or separator.
 717                         size=0;
 718                         wordstart = pbeg;
 719                         if (_Valid[*pbeg]) {   //alphanumerical data
 720                                 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
 721                   }
 722                         else {
 723                                 if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
 724                                         while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
 725                                 }
 726                                 else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
 727                                         pbeg++;
 728                                         if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
 729                                         else {
 730                                                 if (_Valid [*pbeg] ) {
 731                                                         wordstart = pbeg;   //So skipping 1 blank character
 732                                                         while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
 733                                                 }
 734                                                 else {   // a "separator word" ...
 735                                                         size++; //the prev BLANK...
 736                                                         while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
 737                                                 }//else {  // a "separator word"
 738                                         }//else ... not a unique BLANK AT THE END.
 739                                 }//else ... starting by a BLANK...
 740                         }
 741
 742                         //The parsed word/separator is  is "wordstart", and its length is "size"...
 743                         aWord=wordstart;
 744
 745                         //Processement done for each word word
 746                         i = inHashTable(hash,aWord, size, &addrInTH );
 747
 748                         SE[countValidWords]=hash->hash[addrInTH].posInVoc+1;  // !!!!
 749                         countValidWords++;
 750
 751                 }// while pbeg<pend
 752
 753                 SE[countValidWords] = 0;
 754                 fprintf(stderr,"\n2nd pass ends: TOTAL distinct words: %lu totalWords = %lu",zeroNode,countValidWords);
 755
 756         }//2nd pass ends
 757
 758         // **********************************************************************************
 759         // END OF 2ND PASS
 760         // **********************************************************************************
 761
 762         //freeing the source text (it is no longer needed).
 763         free(inputBuffer); //the text
 764
 765         /** Now Setting the data of the index **/
 766         wcsa->n = zeroNode;
 767         wcsa->sourceTextSize = sourceTextSize;
 768         wcsa->seSize = seSize;
 769
 770         // Creating the words of the vocabulary...
 771         {
 772         /** copying the words into WCSA. */
 773                 uint *tmpOffsets = (uint *) malloc (sizeof(uint) * (zeroNode  +1) );  //1 extra uint (to point to the virtual "zeroNode+1" ^th word.
 774                 uint tmpOffset =0;
 775
 776                 byte *zoneMem,*src;
 777                 uint i;
 778
 779                 //Moving data from posInHT to WCSA structure
 780                 //wcsa->wordsData = (twords *) malloc(sizeof(twords) * zeroNode);
 781                 wcsa->wordsData.wordsZoneMem.size = totallenWords - zeroNode; //without '\0' bytes (end-tag).
 782                 wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc ( wcsa->wordsData.wordsZoneMem.size * sizeof(byte) );
 783                 zoneMem = wcsa->wordsData.wordsZoneMem.zone;
 784                 for(i = 0; i < zeroNode; i++) {
 785                         src = posInHT[i].word;                           //copying the canonical word
 786                         //wcsa->wordsData.words[i].word = zoneMem;   //setting the pointer
 787                         tmpOffsets[i]=tmpOffset;  //offset in zoneMem
 788                         while (*src) {*zoneMem++ = *src++; tmpOffset++;}  //moving data until '\0'
 789                         //*zoneMem='\0'; zoneMem++;            //copies also the '\0'
 790
 791                 }
 792                 tmpOffsets[zeroNode]=tmpOffset; //setting pointer to the "virtual" word {zeroNode+1}^{th}
 793
 794                 //kbit encoding of the offsets
 795                 uint elemSize = bits(tmpOffset);
 796                 wcsa->wordsData.elemSize = elemSize;
 797                 wcsa->wordsData.words = (uint *) malloc (((((zeroNode +1)*elemSize)+W-1) /W) * sizeof(uint));  //with 1 extra slot !.
 798                 wcsa->wordsData.words[((((zeroNode +1)*elemSize)+W-1) /W)   -1 ] =0000;
 799                 //              fprintf(stderr,"\n ElemSize = %d, maxOffset = %d",elemSize,tmpOffset);
 800
 801                 tmpOffset=0;
 802                 for (i=0; i<=zeroNode; i++) {  //setting "zeroNode+1" offsets
 803                                 bitwrite(wcsa->wordsData.words, tmpOffset, elemSize, tmpOffsets[i]);
 804                                 tmpOffset+=elemSize;
 805                 }
 806
 807                 //////////// CHECKS IT WORKED. old !!!!
 808         //              { uint kk;
 809         //                      tmpOffset=0;
 810         //                      for (i=0; i<zeroNode; i++) {  //setting "zeroNode+1" offsets
 811         //                                      kk=bitread(wcsa->wordsData.words, i* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
 812         //                                      tmpOffset+=elemSize;
 813         //                                      if (kk != tmpOffsets[i]) {fprintf(stderr,"\n @@@@@@@@ DISTINTOS OFFSETS "); break;}
 814         //                                      else fprintf(stderr,"\n iguales, %d, %d  :: <<%s>>len=%d",kk,i, posInHT[i].word, strlen((char*)posInHT[i].word));
 815         //                      }
 816         //              }
 817         //
 818         //              { uint len1, len, tmplen, len2;
 819         //                      uint i,p;
 820         //                      byte *wcsaWord, *src;
 821         //
 822         //                      for (p=0;p<zeroNode;p++) {
 823         //                              {//preparing for strcompL
 824         //                                      len    = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p+1)), wcsa->wordsData.elemSize);
 825         //                                      tmplen = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p))  , wcsa->wordsData.elemSize);
 826         //
 827         //                              //fprintf(stderr,"\n  :: off[%d]= %d  -  off [%d] = %d  ==> %d",p+1,len,p,tmplen,len-tmplen);
 828         //
 829         //                              len2 =len-tmplen;
 830         //                                      wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
 831         //                              }
 832         //
 833         //                              src = posInHT[p].word;
 834         //                              len1 = strlen((char *)src);
 835         //
 836         //                              if (strcompL(src,wcsaWord,len1,len2) != 0) {
 837         //                                      fprintf(stderr,"\n %6d DISTINTOS !! ===len1 %d,len %d===== <<",p,len1,len2);printWord(src,len1);
 838         //                                      fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>");
 839         //                                      exit(0);
 840         //                              }
 841         //                              else {
 842         //                                      fprintf(stderr,"\n %6d ======len1 %d,len2 %d===== <<",p,len1,len2);printWord(src,len1);
 843         //                                      fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>");
 844         //                              }
 845         //                      }
 846         //
 847         //              }
 848         //
 849
 850         /**-----------*/
 851         //frees memory from hash table and posInHT structures.
 852                 free(tmpOffsets);
 853                 destroy_hash(hash);
 854                 free(posInHT);
 855         }
 856
 857         /** ******* creates the self-index on ints (bottom layer) ==> see build_icsa *********/
 858 /**
 859         #ifdef CSA_ON
 860         {
 861                 uint total;
 862                 fprintf(stderr,"\n **** CREATING CSA from Edu's Code *****");
 863                 ticsa *myicsa;
 864                 myicsa = createIntegerCSA(&SE,seSize+1,build_options);
 865                 wcsa->myicsa= myicsa;
 866                 total = CSA_size(myicsa);
 867
 868                 free(SE);  //SE is no longer needed, (it is indexed by the iCSA)
 869                 printf("\n\t**** [iCSA built on %d words. Size = %ld bytes... RAM",seSize,total);
 870         }
 871         #endif
 872 */
 873
 874         //#ifndef CSA_ON
 875                 wcsa->se = SE;
 876                 wcsa->myicsa = NULL;
 877         //#endif
 878
 879         printf("\n\t ** Building done! **\n");
 880         printf("\n Process finished!\n");
 881
 882         *index = wcsa;
 883         return 0;
 884 }
 885
 886
 887 int build_iCSA (char *build_options, void *index)
 888 {
 889         twcsa *wcsa = (twcsa *) index;
 890         /********* creates the self-index on ints (bottom layer) *********/
 891         //creating CSA from Edu's code...
 892         {
 893                 uint total;
 894                 fprintf(stderr,"\n **** CREATING CSA-bottom-layer *****");
 895                 void *bottomIntIndex;
 896                 int err =  buildIntIndex(wcsa->se,wcsa->seSize+1, build_options,(void **)&bottomIntIndex);
 897                 wcsa->myicsa = bottomIntIndex;
 898
 899                 //total = CSA_size(wcsa->myicsa);
 900                 err = sizeIntIndex((void *) wcsa->myicsa, &total);
 901
 902                 printf("\n\t**** [iCSA built on %d words. Size = %u bytes... RAM",wcsa->seSize,total);
 903         }
 904         return 0;
 905 }
 906
 907 /** ********************************************************************
 908   * Loading from disk
 909   **********************************************************************/
 910
 911 /**-----------------------------------------------------------------
 912  * LoadWCSA.
 913  * Loads all the data structures of WCSA (included the icsa)
 914  ----------------------------------------------------------------- */
 915
 916 twcsa *loadWCSA(char *filename) {
 917         twcsa *wcsa;
 918         // Inicializes the arrays used to detect if a char is valid or not.
 919         StartValid();
 920         // Inicializes the arrays used translated a char into lowercase.
 921         StartToLow();
 922
 923         wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
 924         wcsa->n=0;
 925
 926         int err = loadIntIndex(filename, (void **)&wcsa->myicsa);
 927
 928         loadStructs(wcsa,filename);
 929
 930         return wcsa;
 931 }
 932
 933 /** ------------------------------------------------------------------
 934  * LoadStructs.
 935  *      Reads files and loads all the data needed for searcherFacade
 936  ----------------------------------------------------------------- */
 937  void loadStructs(twcsa *wcsa, char *basename) {
 938         uint i,j;
 939         char *filename;
 940         int file;
 941         uint sizeFile;
 942         char c;
 943         uint n;
 944
 945         filename = (char *)malloc(sizeof(char)*(strlen(basename)+10));
 946         fprintf(stderr,"Loading Index from file %s.*\n", basename);
 947
 948         //** SOME CONSTANTS: sourceTextSize
 949         {       strcpy(filename, basename);
 950                 strcat(filename, ".");
 951                 strcat(filename, CONSTANTS_FILE_EXT);
 952
 953                 if( (file = open(filename, O_RDONLY)) < 0) {
 954                         printf("Cannot open file %s\n", filename);
 955                         exit(0);
 956                 }
 957
 958                 read(file, &(wcsa->sourceTextSize), sizeof(uint));
 959                 read(file, &(wcsa->seSize), sizeof(uint));
 960                 close(file);
 961         }
 962
 963         /** File with the words from the vocabulary (sorted alphabetically) */
 964         {       byte *zoneMem;
 965
 966                 strcpy(filename, basename);
 967                 strcat(filename, ".");
 968                 strcat(filename, VOCABULARY_WORDS_FILE_EXT);
 969                 //sizeFile= fileSize(filename)-sizeof(uint);
 970
 971                 if( (file = open(filename, O_RDONLY)) < 0) {
 972                         printf("Cannot open file %s\n", filename);
 973                         exit(0);
 974                 }
 975
 976                 //the number of canonical words
 977                 read(file, &n, sizeof(uint));
 978                 wcsa->n = n;
 979                 read(file, &(wcsa->wordsData.elemSize), (sizeof(uint)));
 980                 read(file, &(wcsa->wordsData.wordsZoneMem.size), (sizeof(uint)));
 981
 982                 //allocating the memory needed for all words and reading them   //(ascii) << no \0 chars are needed>>.
 983                 wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc(wcsa->wordsData.wordsZoneMem.size * sizeof(byte));
 984                 read(file, (wcsa->wordsData.wordsZoneMem.zone),    wcsa->wordsData.wordsZoneMem.size * sizeof(byte) );
 985
 986                 //reading the offsets of the words (kbitArray that points to offsets in zoneMem of words.
 987                 wcsa->wordsData.words = (uint *) malloc (((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * sizeof(uint));
 988                 wcsa->wordsData.words[ ((((n+1)*(wcsa->wordsData.elemSize))+W-1) /W)   -1 ] =0000;
 989                 read(file, (wcsa->wordsData.words),     ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)));
 990
 991
 992                 close(file);
 993         }
 994         wcsa->se= NULL;
 995         free(filename);
 996 }
 997
 998
 999
1000
1001 /** ****************************************************************
1002   * Querying the index WCSA
1003   * ***************************************************************/
1004 ///////////////////////////////////////////////////////////////////////////////////////
1005 //                                         FUNCTIONS NEEDED FOR SEARCHING A PATTERN                                      //
1006 ///////////////////////////////////////////////////////////////////////////////////////
1007
1008
1009
1010 /*------------------------------------------------------------------
1011  * Given a text pattern translates it into a list of integers (corresponding to the
1012  * canonical words associated to the valid words in the text pattern)
1013  ------------------------------------------------------------------*/
1014 void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *integerPattern, uint *sizeIntegers) {
1015
1016         byte *pbeg,*pend,*wordstart,*aWord;
1017         register unsigned long size;
1018         uint index =0;
1019
1020         pbeg = textPattern;
1021         pend = pbeg + patLen;
1022
1023         while (pbeg <pend) {
1024                 //parsing either a word or separator.
1025                 size=0;
1026                 wordstart = pbeg;
1027                 if (_Valid[*pbeg]) {   //alphanumerical data
1028                         while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
1029           }
1030                 else {
1031                         if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
1032                                 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
1033                         }
1034                         else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
1035                                 pbeg++;
1036                                 if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
1037                                 else {
1038                                         if (_Valid [*pbeg] ) {
1039                                                 wordstart = pbeg;   //So skipping 1 blank character
1040                                                 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
1041                                         }
1042                                         else {   // a "separator word" ...
1043                                                 size++; //the prev BLANK...
1044                                                 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
1045                                         }//else {  // a "separator word"
1046                                 }//else ... not a unique BLANK AT THE END.
1047                         }//else ... starting by a BLANK...
1048                 }
1049
1050                 //The parsed word is "aWord", and its length is "size"...
1051                 aWord=wordstart;
1052
1053                 // Binary search on the canonical words (wordsData)
1054                 {
1055                         uint len, tmplen;
1056                         uchar *wcsaWord;
1057                         register uint min,max,p;
1058                         min = 0;
1059                         max = (wcsa->n) - 1;
1060                         while(min < max) {
1061                                 p = (min+max)/2;
1062
1063                                 {//preparing for strcompL
1064                                         len    = bitread (wcsa->wordsData.words, (p+1)* wcsa->wordsData.elemSize , wcsa->wordsData.elemSize);
1065                                         tmplen = bitread (wcsa->wordsData.words, (p )* wcsa->wordsData.elemSize  , wcsa->wordsData.elemSize);
1066                                         len -=tmplen;
1067                                         wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
1068                                 }
1069
1070                                 //if(strncmp((char*)aWord, (char*)wcsa->wordsData[p].word,size) > 0) min = p+1;
1071                                 if(strcompL(aWord, wcsaWord, size, len) > 0) min = p+1;
1072                                 else max = p;
1073
1074
1075                                 //                              { //SHOW PROGRESS
1076                                 //                                      fprintf(stderr,"\n Patron = <<%s>>, curposWord= %d ==><<",aWord,p);
1077                                 //                                      printWord(wcsaWord,len); fprintf(stderr,">> len =%d",len);
1078                                 //                              }
1079
1080                         }
1081
1082                         {//preparing for strcompL
1083                                 len    = bitread (wcsa->wordsData.words, (min+1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1084                                 tmplen = bitread (wcsa->wordsData.words, ( min )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1085                                 len -=tmplen;
1086                                         wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
1087                         }
1088
1089                         //      if(!strncmp((char*)aWord, (char*)wcsa->wordsData[min].word, size)) {
1090                         if(!strcompL(aWord, wcsaWord, size, len)) {
1091                                 integerPattern[index++] = min +1 ;  //<--
1092                         }
1093                         else {*sizeIntegers = 0; return;}  // a valid word that does not appear in the source text.
1094
1095                 }
1096         }// end while
1097         *sizeIntegers = index;
1098
1099         //      //shows the parsed words:
1100         //      {uint i;
1101         //              printf("\n\n >>%s>> HA SIDO PARSEADO COMO:",textPattern);
1102         //              for (i=0; i<index;i++) {
1103         //                              printf("<<%s>>",wcsa->wordsData[integerPattern[i] -1].word);
1104         //              }
1105         //
1106         //      }
1107 }
1108
1109
1110
1111
1112
1113         /** ------------------------------------------------------------------
1114          * Returns the number of occurrences of a given text pattern
1115          *------------------------------------------------------------------ */
1116 int countTextOcurrences(twcsa *wcsa, byte *textPattern) {
1117         ulong left, right;
1118         uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
1119         uint integerPatternSize, min, max;
1120
1121         uint lenpat = strlen((char*)textPattern);
1122         parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize);
1123         if (!integerPatternSize) return -1;
1124
1125 //      #ifdef DEBUG_ON
1126 //              uint i;
1127 //              printf("\n %d Integers to search for:",integerPatternSize );
1128 //              for (i=0;i<integerPatternSize;i++) {
1129 //                      printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word);
1130 //              }
1131 //              printf("\n");
1132 //      #endif
1133
1134         ulong numocc;
1135         int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, &numocc,  &left, &right);
1136         return numocc;
1137
1138 }
1139
1140
1141         /** ------------------------------------------------------------------
1142          * locateTextOcurrences:
1143          * Returns the offsets of the source text where a word/phrase appears
1144          * Returns also the number of occurrences.
1145          *------------------------------------------------------------------ */
1146 uint *locateTextOcurrences(twcsa *wcsa, byte *textPattern, int *numberOccurrences) {
1147         uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
1148         uint integerPatternSize, min, max;
1149
1150         uint lenpat = strlen((char*)textPattern);
1151         parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize);
1152         if (!integerPatternSize) {*numberOccurrences = -1; return NULL;}
1153
1154 //      #ifdef DEBUG_ON
1155 //              uint i;
1156 //              printf("\n %d Integers to search for:",integerPatternSize );
1157 //              for (i=0;i<integerPatternSize;i++) {
1158 //                      printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word);
1159 //              }
1160 //              printf("\n");
1161 //      #endif
1162
1163         ulong occurrences, left, right;
1164         ulong *seOffsets;
1165         ulong *sourceOffsets;
1166
1167         //obtains the indexes in vector SE where the pattern appears.
1168         //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences);
1169         int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences);
1170
1171         //sourceOffsets = (uint *) malloc (sizeof(uint)*occurrences);
1172
1173         sourceOffsets=seOffsets;
1174         //obtains the offsets in the source text of the pattern (sourceOffsets)
1175         locateFacade(wcsa, (uint *)sourceOffsets, (uint *)seOffsets,occurrences);
1176
1177         #ifdef DEBUG_ON
1178                 fprintf(stderr,"\n*** %s appears in the source text in positions:\n\t",textPattern);
1179                 for (i=0;i<occurrences;i++)
1180                         fprintf(stderr,"[%u]",sourceOffsets[i]);
1181                 fflush(stderr);
1182         #endif
1183
1184         *numberOccurrences = occurrences;
1185         return (uint *) sourceOffsets;
1186 }
1187
1188
1189         /** ------------------------------------------------------------------
1190          * displayTextOcurrences:
1191          * Shows in stdout, the text around the occurrences of a word/phrase
1192          * Returns also the number of occurrences.
1193          *------------------------------------------------------------------ */
1194 int displayTextOcurrences(twcsa *wcsa, byte *textPattern, uint radixDisplay) {
1195         return 99;  //not implemented: function not available
1196 }
1197
1198         /** ------------------------------------------------------------------
1199          * Locate Facade:
1200          * For given sePositions, returns the sourceTextPositions
1201          * where the those valid-words in se[sePositions[i]] occurr.
1202          *------------------------------------------------------------------*/
1203 int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint number) {
1204         return 99;  //not implemented: function not available for this index
1205 }
1206
1207
1208 /** ------------------------------------------------------------------
1209         * DISPLAYFACADE:
1210         * Returns the subString from a starting offset to a final offset
1211         * in the source text. It does not allocate any memory, receives "dstptr"
1212         * Precondition: offsetIni >=0;
1213         ------------------------------------------------------------------*/
1214  int displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length, byte *dstptr) {
1215         return 99;  //not implemented: function not available for this index
1216 }
1217
1218
1219         /**------------------------------------------------------------------
1220          * DISPLAYFacadeMalloc:
1221          * Returns the subString from a starting offset to a final offset
1222          * in the source text. It allocates Memory !!
1223          * NOT CURRENTLY USED
1224          ------------------------------------------------------------------*/
1225 byte *displayFacadeMalloc (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length) {
1226         byte *dstptr=NULL;         //not implemented: function not available
1227         return dstptr;
1228 }
1229
1230
1231         /** ------------------------------------------------------------------
1232          * LOCATEALLandDISPLAY:
1233          * Displays the text around an occurrence of the searched word in the source text.
1234          * Assuming that $p$ is that position --> shows only chars in [p_radix-1,p_radix]
1235          ------------------------------------------------------------------*/
1236 int locateAllAndDisplay (twcsa *wcsa, uint *sePositions, uint number, int radix) {
1237 return 99;   //not implemented: function not available for this index
1238
1239 }
1240
1241
1242         /** ------------------------------------------------------------------
1243          * recovers the source text by calling display(0,fileSize);
1244          * ------------------------------------------------------------------ */
1245 void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {
1246
1247         int start;int end;
1248         byte *cc;
1249         char *filename = (char *) malloc (strlen( basename)+ strlen(ext)+1);
1250         ulong length;
1251
1252         strcpy( filename,  basename);
1253         strcat( filename, ext);
1254         filename[strlen( basename)+ strlen(ext)]='\0';
1255         fprintf(stderr,"\n uncompressing text into file -->%s ",filename);fflush(stderr);
1256
1257         FILE *salida;
1258         unlink( filename);
1259         salida = fopen( filename,"w");
1260         start=0; end = sourceTextSize-1;
1261
1262         cc = (byte *) malloc (sourceTextSize* sizeof(uchar));
1263
1264         {
1265                 uint i, j;//, tmplen;
1266                 uint prevValid;
1267                 //uint ptr, maxptr;
1268                 byte *src, *dst;
1269                 uint leng =0;
1270                 uint tmplen =0;
1271
1272                 uint indexSE=0;
1273                 uint posSEValue=0;
1274
1275                 dst=cc;
1276                 while  ( (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
1277
1278                         int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
1279
1280                         {//obtains pointer to the ith word
1281                                 uint offtmp;
1282                                 uint ith = posSEValue -1;  // !!
1283                                 tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1284                                 offtmp = bitread (wcsa->wordsData.words, ( ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
1285                                 tmplen -=offtmp;  //the lenght of the ith word.
1286                                 src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
1287                         }
1288
1289                         if (_Valid[*src]) {
1290                                 if (prevValid){
1291                                          *dst++ =' ';
1292                                         leng +=1;
1293                                 }
1294                                 prevValid =1;   //for the next iteration
1295                         }
1296                         else prevValid=0;
1297
1298                         indexSE++;
1299
1300                         for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
1301                         leng +=tmplen;
1302                 }//while
1303
1304         fprintf(stderr,"\n sourceTextSize = %d, len = %d",sourceTextSize,leng);
1305         fwrite(cc,sizeof(byte),leng,salida);
1306         fclose(salida);
1307
1308         free(cc);
1309         free(filename);
1310 }
1311
1312
1313 }
1314
1315                 //recovers the source text by calling extract Words.
1316 void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {
1317
1318         int start;int end; int error;
1319         char *filename = (char *) malloc (strlen( basename)+ strlen(ext)+1);
1320         byte *cc;
1321         ulong length;
1322
1323         strcpy( filename,  basename);
1324         strcat( filename, ext);
1325         filename[strlen( basename)+ strlen(ext)]='\0';
1326         fprintf(stderr,"\n uncompressing text into file -->%s ",filename);fflush(stderr);
1327
1328         FILE *salida;
1329         unlink( filename);
1330         salida = fopen( filename,"w");
1331         start=0; end = wcsa->seSize;
1332
1333         error = extractWords((void *) wcsa, start, end, &cc, &length);
1334         if (error) {fprintf(stderr,"\n error during recoverSourceText2"); exit(0);}
1335
1336         fprintf(stderr,"\n sourceTextSize = %d, len = %ld",sourceTextSize,length);
1337         fwrite(cc,sizeof(byte),length,salida);
1338         fclose(salida);
1339
1340         free(cc);
1341         free(filename);
1342 }
1343
1344 /** *******************************************************************************
1345   * Showing some statistics and info of the index
1346   * *******************************************************************************/
1347 void printInfoReduced(twcsa *wcsa) {
1348           //not implemented: function not available
1349 }
1350
1351                 /* Shows summary info of the index */
1352 int printInfo(void *index) {
1353         uint n;
1354
1355         twcsa *wcsa = (twcsa *) index;
1356
1357         unsigned long indexSize;
1358         uint intIndexSize, presentationSize;
1359         int err;
1360
1361         err = index_size(index, &indexSize);
1362         if (err!=0) return err;
1363         err = sizeIntIndex(wcsa->myicsa, &intIndexSize);
1364         if (err!=0) return err;
1365
1366         presentationSize = indexSize - intIndexSize;
1367
1368                 printf("\n ===================================================:");
1369                 printf("\n Summary of Presentation layer:");
1370                 printf("\n   Number of valid words (SEsize) = %u",wcsa->seSize);
1371                 printf("\n   Number of different words = %ld",wcsa->n);
1372                 printf("\n   WCSA structure = %d bytes", sizeof(twcsa));
1373
1374                 uint totalpointers = ((((wcsa->n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));
1375                 uint totalasciizone = wcsa->wordsData.wordsZoneMem.size * sizeof(byte) ;
1376                 uint totalwords = totalasciizone + totalpointers;
1377
1378                 printf("\n   Size Of words structure (%d bytes):",totalwords);
1379                 printf("\n      [     pointers = %d bytes || AsciiZone = %d bytes", totalpointers,      totalasciizone);
1380
1381                 printf("\n\n Total = **  %u bytes (in RAM) **",presentationSize);
1382                 //printf("\n\n @@ Summary of self-index on Integers:");
1383                 err = printInfoIntIndex(wcsa->myicsa, " ");
1384                 if (err!=0) return err;
1385
1386                 printf("\n ===================================================:");
1387                 printf("\n");
1388                 return 0;
1389         }
1390
1391 /**------------------------------------------------------------------
1392  * structsSize.
1393  *      Counts the memory amount needed by the Facade (Presentation Layer).
1394  * skipping the stop_words hash table
1395  ----------------------------------------------------------------- */
1396 uint structsSizeMem(twcsa *wcsa) {
1397         return 0;   //not implemented: function not available for this index.
1398 }
1399
1400
1401 /** for debugging **/
1402 void printWord(uchar *str, uint len) {
1403                 uint i;
1404                 for (i=0;i<len;i++)
1405                         fprintf(stderr,"%c",str[i]);
1406 }
1407
1408
1409         /** saves the content of the file SE (ids of the source words) **/
1410 int saveSEfile (char *basename, uint *v, uint n) {
1411         char outfilename[255];
1412         int file;
1413         sprintf(outfilename,"%s.%s",basename,SE_FILE_EXT);
1414         unlink(outfilename);
1415         if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
1416                 printf("Cannot open file %s\n", outfilename);
1417                 exit(0);
1418         }
1419
1420         write(file, v, sizeof(uint) * n );
1421         close(file);
1422 }
1423
1424
1425
1426 double getTime2 (void)
1427 {
1428         double usertime, systime;
1429         struct rusage usage;
1430
1431         getrusage (RUSAGE_SELF, &usage);
1432
1433         usertime = (double) usage.ru_utime.tv_sec +
1434                 (double) usage.ru_utime.tv_usec / 1000000.0;
1435         systime = (double) usage.ru_stime.tv_sec +
1436                 (double) usage.ru_stime.tv_usec / 1000000.0;
1437
1438         return (usertime + systime);
1439 }
1440
1441
1442
1443 /**------------------------------------------------------------------
1444   *  MAIN PROGRAM.
1445   *------------------------------------------------------------------ */
1446 #ifdef FACADEWITHMAIN
1447         int main(int argc, char* argv[])
1448         {
1449
1450
1451
1452                 char *infile, *outbasename, *stopwordsfile;     // Name of in/out files
1453                 byte *inputBuffer;
1454                 ulong finsize;
1455
1456                 int f_in;
1457                 void *Index;
1458
1459
1460                 printf("\n*Word-based iCSA: A word-based CSA");
1461                 printf("\n*CopyRight (c) 2008 [LBD & G.N.]\n\n");
1462
1463                 // Reads input parameters from command line.
1464                 if(argc < 3) {
1465                         printf("Use: %s <in file> <out basename> \n", argv[0]);
1466                         exit(0);
1467                 }
1468
1469                 // Reads params (input file, output basename, and stopwords file)
1470                 infile = argv[1];
1471                 outbasename = argv[2];
1472                 stopwordsfile = argv[3];
1473
1474                 finsize= fileSize(infile);
1475
1476                 if (! finsize) {
1477                         printf( "\nFILE EMPTY OR FILE NOT FOUND %s !!\nSkipping processement ...\n",infile);
1478                         exit(0);
1479                 }
1480
1481                 // Opening the input text file.
1482                 if( (f_in = open(infile, O_RDONLY)) < 0) {
1483                         printf("Cannot read file %s\n", infile);
1484                         exit(0);
1485                 }
1486                 inputBuffer = (byte *) malloc(finsize *sizeof(byte));// +1);
1487                 read (f_in,inputBuffer,finsize);
1488                 close (f_in);
1489
1490
1491         {
1492                 //printf("\n parametros <<%s>>\n\n",stopwordsfile);
1493                 build_index (inputBuffer, finsize, stopwordsfile, &Index);  /** building the index */
1494
1495 //              /** recovering the source text from the index */
1496                         {
1497                                 double start, end;
1498                                 start = getTime2();
1499                                 ulong size;
1500                                 get_length(Index, &size);
1501                                 char extension[10]= ".source";
1502
1503                                 //recoverSourceText1((twcsa*) Index, outbasename,extension, size);
1504                                 strcat(extension,"2");
1505                                 recoverSourceText2((twcsa*) Index, outbasename,extension,size);
1506                                 end = getTime2();
1507                                 fprintf(stderr, "\nRecovering source file time: %.3f secs\n", end-start );
1508                         }
1509 //
1510                 // DISPLAYING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
1511                         {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1512                          int error = 0;
1513                         ulong numocc,numc, length, i, *snippet_len, tot_numcharext = 0, numpatt;
1514                         uchar *pattern, *snippet_text;
1515
1516                                  pattern = textPattern;
1517                          printf("\nSEARCH TEST for DISPLAY (pizzachili interface)\n");
1518                                 while(1) {
1519                                         printf("Intro string: ");
1520                                         fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1521                                         if (!strcmp((char*)textPattern,"\n") ) break;
1522                                          textPattern[strlen((char*)textPattern)-1] = '\0';
1523
1524                                         length = strlen( (char*)textPattern);
1525                                         numc=50;
1526
1527 //                                      error = display (Index, textPattern, length, numc, &numocc,
1528 //                                                               &snippet_text, &snippet_len);
1529                                         error = displayWords (Index, textPattern, length, numc, &numocc,
1530                                                                  &snippet_text, &snippet_len,1);
1531
1532                                         if (error){ fprintf(stderr, "%s\n", "Hubo un error durante display");exit(0);}
1533
1534                                                 fprintf(stderr,"\n acabou display");fflush(stderr);
1535                                         {//show the results
1536                                                 ulong j, len = length + 2*numc;
1537                                             char blank = '\0';
1538                                                 fprintf(stderr,"\n length = %d",length);
1539                                                 fprintf(stderr,"\n pattern = %s",pattern);fflush(stderr);
1540                                                 fprintf(stderr,"\n numocc = %d",numocc);fflush(stderr);
1541                                                 fprintf(stderr,"\n snippet len = %d",len);fflush(stderr);
1542                                                 fprintf(stderr,"\n =========");fflush(stderr);
1543                                                 for (i = 0; i < numocc; i++){
1544                                                         fprintf(stderr,"\n[%2d][len=%3d]<<",i+1,snippet_len[i]);fflush(stderr);
1545                                                         fwrite(snippet_text+len*i,sizeof(uchar),snippet_len[i],stderr);fflush(stderr);
1546                                                         fprintf(stderr,">>");fflush(stderr);
1547                                                 }
1548                                         }
1549                                         numpatt--;
1550
1551                                         for(i=0; i<numocc; i++) {
1552                                                 tot_numcharext += snippet_len[i];
1553                                         }
1554
1555                                         if (numocc) {
1556                                                 free (snippet_len);
1557                                                 free (snippet_text);
1558                                         }
1559
1560                                         printf("Ocurrences = %d\n", numocc);
1561                                         if (!strcmp((char*)textPattern,"\n") ) break;
1562                                 }
1563                         }
1564
1565 //
1566 //
1567 //      // SEARCHING FOR A TEXT PATTERN (word/phrase).
1568 //      {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1569 //       int occ;
1570 //       int len;
1571 //       uint *occs;
1572 //       int i;
1573 //       printf("\nSEARCH TEST for LOCATE\n");
1574 //              while(1) {
1575 //                      printf("Intro string: ");
1576 //                      fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1577 //                      len = strlen((char*)textPattern);
1578 //                      if (!strcmp((char*)textPattern,"\n") ) break;
1579 //                      textPattern[len-1] = '\0';
1580 //                      len --;
1581 //
1582 //                      //occs = locateTextOcurrences(wcsa,textPattern,&occ);
1583 //                      // locate(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ);
1584 //                        locateWord(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ, 0);
1585 //
1586 //                      printf("\n*** %s occurs %d times: In the source text in positions:\n\t",textPattern,occ);
1587 //                      for (i=0;i<occ;i++)
1588 //                              printf("[%u]",occs[i]);
1589 //                      fflush(stderr);
1590 //                      free(occs);
1591 //
1592 //                      if (!strcmp((char*)textPattern,"\n") ) break;
1593 //              }
1594 //      }
1595 //
1596 //
1597
1598                 // COUNTING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
1599                 /*
1600                 {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
1601                  int occ;
1602                  int len;
1603                  printf("\nSEARCH TEST for COUNT.\n");
1604                         while(1) {
1605                                 printf("Intro string: ");
1606                                 fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
1607                                 len = strlen((char*)textPattern);
1608                                 if (!strcmp((char*)textPattern,"\n") ) break;
1609                                 textPattern[len-1] = '\0';
1610                                 len --;
1611
1612                                 count(Index, textPattern, len, (ulong *)&occ);
1613                                 //occ = countTextOcurrences(wcsa,textPattern);
1614                                 printf("Ocurrences = %d\n", occ);
1615                         }
1616                 }
1617                 printf("\n END COUNTING OCCURRENCES OF PATTERNS. ...\n");
1618                 //exit(0);
1619                 */
1620
1621                 /** saving the index to disk*/
1622                 save_index (Index, outbasename);
1623
1624                 /** tells the mem used by the index */
1625                 ulong indexsize;
1626                 index_size(Index, &indexsize);
1627                 fprintf(stderr,"Index occupied %d bytes, 2 extra mallocs = %d",indexsize,2* sizeof(uint));
1628
1629                 /** freeing the index */
1630                 free_index(Index);
1631
1632         }
1633 }
1634
1635 #endif
1636
1637
1638