3 // Global para que funcione a funcion de comparacion do quicksort
7 int suffixCmp(const void *arg1, const void *arg2) {
13 while(intVector[a] == intVector[b]) { a++; b++; }
14 return (intVector[a] - intVector[b]);
20 /* **NO REVISADO TAMAÑO FILE.
21 int buildIntIndexFromFile (char *filename, char *build_options,void **index) {
25 //sprintf(filename, "%s.%s", basename,SE_FILE_EXT);
27 if( (file = open(filename, O_RDONLY)) < 0) {
28 printf("Cannot open file %s\n", filename);
31 if( fstat(file, &f_stat) < 0) {
32 printf("Cannot read information from file %s\n", filename); exit(0);
34 uint sizeFile = (f_stat.st_size)/sizeof(uint);
36 uint *se = (uint *) malloc (sizeFile);
37 uint seSize = sizeFile / sizeof(uint);
38 read(file, se, sizeFile); //the samples
40 return (buildIntIndex(se,seSize,build_options,index));
44 //ticsa *createIntegerCSA(uint **aintVector, uint textSize, char *build_options) {
45 int buildIntIndex (uint *aintVector, uint n, char *build_options, void **index ){
47 intVector = aintVector; //global variable
49 myicsa = (ticsa *) malloc (sizeof (ticsa));
50 uint *Psi, *SAI, *C, vocSize;
54 parametersCSA(myicsa, build_options);
56 nsHUFF=myicsa->tempNSHUFF;
58 // Almacenamos o valor dalguns parametros
59 myicsa->suffixArraySize = textSize;
60 myicsa->D = (uint*) malloc (sizeof(uint) * ((textSize+31)/32));
62 myicsa->samplesASize = (textSize + myicsa->T_A - 1) / myicsa->T_A + 1;
63 myicsa->samplesA = (uint *)malloc(sizeof(int) * myicsa->samplesASize);
64 myicsa->BA = (uint*) malloc (sizeof(uint) * ((textSize+31)/32));
65 myicsa->samplesAInvSize = (textSize + myicsa->T_AInv - 1) / myicsa->T_AInv;
66 myicsa->samplesAInv = (uint *)malloc(sizeof(int) * myicsa->samplesAInvSize);
68 // Reservamos espacio para os vectores
69 Psi = (uint *) malloc (sizeof(uint) * textSize);
71 // CONSTRUIMOS A FUNCION C
73 for(i=0;i<textSize;i++) if(intVector[i]>vocSize) vocSize = intVector[i];
74 C = (uint *) malloc(sizeof(uint) * (vocSize + 1)); // Para contar o 0 terminador
75 for(i=0;i<vocSize;i++) C[i] = 0;
76 for(i=0;i<textSize;i++) C[intVector[i]]++;
77 for (i=0,j=0;i<=vocSize;i++) {
81 for(i=vocSize;i>0;i--) C[i] = C[i-1];
84 // Construimos o array de sufixos (en Psi) - con quicksort
85 printf("\n\t *BUILDING THE SUFFIX ARRAY over %d integers... (with qsort)", textSize);fflush(stdout);
86 for(i=0; i<textSize; i++) Psi[i]=i;
88 qsort(Psi, textSize, sizeof(uint), suffixCmp);
91 printf("\n\t ...... ended.");
93 // CONSTRUIMOS A INVERSA DO ARRAY DE SUFIXOS
94 SAI = (uint *) malloc (sizeof(uint) * (textSize + 1)); // +1 para repetir na ultima posición. Evitamos un if
95 for(i=0;i<textSize;i++) SAI[Psi[i]] = i;
96 SAI[textSize] = SAI[0];
98 // ALMACENAMOS AS MOSTRAS DO ARRAY DE SUFIXOS
99 for(i=0;i<((textSize+31)/32);i++) myicsa->BA[i] = 0;
100 for(i=0; i<textSize; i+=myicsa->T_A) bitset(myicsa->BA, SAI[i]);
101 bitset(myicsa->BA, SAI[textSize-1]); // A ultima posicion sempre muestreada
102 //printf("TextSize = %d\n", textSize);
103 myicsa->bBA = createBitmap(myicsa->BA, textSize);
104 for(i=0,j=0; i<textSize; i++) if(bitget(myicsa->BA, i)) myicsa->samplesA[j++] = Psi[i];
106 // ALMACENAMOS AS MOSTRAS DA INVERSA DO ARRAY DE SUFIXOS
107 for(i=0,j=0;i<textSize;i+=myicsa->T_AInv) myicsa->samplesAInv[j++] = SAI[i];
109 // CONSTRUIMOS E COMPRIMIMOS PSI
110 printf("\n\t Creating compressed Psi...");
111 for(i=0;i<textSize;i++) Psi[i] = SAI[Psi[i]+1];
113 #ifdef PSI_HUFFMANRLE
114 myicsa->hcPsi = huffmanCompressPsi(Psi,textSize,myicsa->T_Psi,nsHUFF);
117 myicsa->gcPsi = gonzaloCompressPsi(Psi,textSize,myicsa->T_Psi,nsHUFF);
119 #ifdef PSI_DELTACODES
120 myicsa->dcPsi = deltaCompressPsi(Psi,textSize,myicsa->T_Psi);
125 for(i=0;i<((textSize+31)/32);i++) myicsa->D[i] = 0;
126 for(i=0;i<=vocSize;i++) bitset(myicsa->D, C[i]);
127 myicsa->bD = createBitmap(myicsa->D,textSize);
130 // VARIABLE GLOBAL QUE ALMACENA O ESTADO DOS DISPLAYS (IMPORTANTE PARA DISPLAY SECUENCIAL)
131 // Almacena a última posición do array de sufixos que mostramos con display ou displayNext
132 // Se nos piden un displayNext, aplicamos PSI sobre esta posición e obtemos a seguinte,
133 // coa que podemos obter o símbolo pedido, e actualizamos displayState
134 myicsa->displayCSAState = 0;
135 myicsa->displayCSAPrevPosition = -2; //needed by DisplayCSA(position)
137 aintVector = intVector;
138 // Liberamos o espacion non necesario
146 //Returns number of elements in the indexed sequence of integers
147 int sourceLenIntIndex(void *index, uint *numInts){
148 ticsa *myicsa = (ticsa *) index;
149 *numInts= myicsa->suffixArraySize;
150 return 0; //no error;
153 int saveIntIndex(void *index, char *pathname) {
154 //void storeStructsCSA(ticsa *myicsa, char *basename) {
156 ticsa *myicsa = (ticsa *) index;
157 char *basename=pathname;
162 // Reservamos espacio para o nome do ficheiro
163 filename = (char *)malloc(sizeof(char)*MAX_FILENAME_LENGTH);
165 // Ficheiro co n�mero de elementos indexados (enteiros do texto orixinal)
166 strcpy(filename, basename);
167 strcat(filename, ".");
168 strcat(filename, NUMBER_OF_ELEMENTS_FILE_EXT);
170 if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
171 printf("Cannot open file %s\n", filename);
174 write(file, &(myicsa->suffixArraySize), sizeof(int));
177 strcpy(filename, basename);
178 strcat(filename, ".");
179 strcat(filename, PSI_COMPRESSED_FILE_EXT);
181 #ifdef PSI_HUFFMANRLE
182 storeHuffmanCompressedPsi(&(myicsa->hcPsi), filename);
185 storeGonzaloCompressedPsi(&(myicsa->gcPsi), filename);
187 #ifdef PSI_DELTACODES
188 storeDeltaCompressedPsi(&(myicsa->dcPsi), filename);
191 // Ficheiro co vector de bits D
192 strcpy(filename, basename);
193 strcat(filename, ".");
194 strcat(filename, D_FILE_EXT);
196 if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
197 printf("Cannot open file %s\n", filename);
200 write(file, myicsa->D, sizeof(int)*((myicsa->suffixArraySize+31)/32));
203 // Directorio de rank para D
204 // Almacenamos o n�mero de superbloques seguido dos superbloques
205 // E logo o n�mero de bloques seguido dos bloques
206 strcpy(filename, basename);
207 strcat(filename, ".");
208 strcat(filename, D_RANK_DIRECTORY_FILE_EXT);
209 saveBitmap(filename,myicsa->bD);
211 // Ficheiro coas mostras de A
212 strcpy(filename, basename);
213 strcat(filename, ".");
214 strcat(filename, SAMPLES_A_FILE_EXT);
216 if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
217 printf("Cannot open file %s\n", filename);
220 write(file, myicsa->samplesA, sizeof(int) * (myicsa->samplesASize));
223 // Ficheiro co vector BA (marca as posicions de A muestreadas)
224 strcpy(filename, basename);
225 strcat(filename, ".");
226 strcat(filename, BA_FILE_EXT);
228 if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
229 printf("Cannot open file %s\n", filename);
232 write(file, myicsa->BA, sizeof(int)*((myicsa->suffixArraySize+31)/32));
235 // Directorio de rank para BA
236 strcpy(filename, basename);
237 strcat(filename, ".");
238 strcat(filename, BA_RANK_DIRECTORY_FILE_EXT);
239 saveBitmap(filename, myicsa->bBA);
241 // Ficheiro coas mostras de A inversa
242 strcpy(filename, basename);
243 strcat(filename, ".");
244 strcat(filename, SAMPLES_A_INV_FILE_EXT);
246 if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
247 printf("Cannot open file %s\n", filename);
250 write(file, myicsa->samplesAInv, sizeof(int) * (myicsa->samplesAInvSize));
253 // Ficheiro co periodo de muestreo de A e A inversa
254 strcpy(filename, basename);
255 strcat(filename, ".");
256 strcat(filename, SAMPLING_PERIOD_A_FILE_EXT);
258 if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
259 printf("Cannot open file %s\n", filename);
262 write(file, &(myicsa->T_A), sizeof(int));
263 write(file, &(myicsa->T_AInv), sizeof(int));
265 write(file, &(myicsa->psiSearchFactorJump), sizeof(uint));
269 return 0; //no error.
272 //Returns the size (in bytes) of the index over the sequence of integers.
273 //uint CSA_size(ticsa *myicsa) {
274 int sizeIntIndex(void *index, uint *numBytes) {
275 ticsa *myicsa = (ticsa *) index;
277 size +=(sizeof (ticsa) * 1);
278 size += sizeof(uint)*((myicsa->suffixArraySize+31)/32) ; //D vector
279 size += myicsa->bD->mem_usage;
280 size += sizeof(uint) * myicsa->samplesASize ; // samples A
281 size += sizeof(uint) * myicsa->samplesAInvSize ; // samples A^{-1}
282 size += sizeof(uint)*((myicsa->suffixArraySize+31)/32) ; //BA vector
283 size += myicsa->bBA->mem_usage;
284 #ifdef PSI_HUFFMANRLE
285 size +=myicsa->hcPsi.totalMem;
288 size +=myicsa->gcPsi.totalMem;
290 #ifdef PSI_DELTACODES
291 size +=myicsa->dcPsi.totalMem;
294 return 0; //no error.
298 //ticsa *loadCSA(char *basename) {
299 int loadIntIndex(char *pathname, void **index){
301 char *basename=pathname;
308 uint suffixArraySize;
311 myicsa = (ticsa *) malloc (sizeof (ticsa) * 1);
314 // VARIABLE GLOBAL QUE ALMACENA O ESTADO DOS DISPLAYS (IMPORTANTE PARA DISPLAY SECUENCIAL)
315 // Almacena a �ltima posici�n do array de sufixos que mostramos con display ou displayNext
316 // Se nos piden un displayNext, aplicamos PSI sobre esta posici�n e obtemos a seguinte,
317 // coa que podemos obter o s�mbolo pedido, e actualizamos displayState
318 myicsa->displayCSAState = 0;
319 myicsa->displayCSAPrevPosition = -2; //needed by DisplayCSA(position)
321 // Reservamos espacio para o nome do ficheiro
322 filename = (char *)malloc(sizeof(char)*MAX_FILENAME_LENGTH);
324 // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA O NUMERO DE ELEMENTOS INDEXADOS
325 strcpy(filename, basename);
326 strcat(filename, ".");
327 strcat(filename, NUMBER_OF_ELEMENTS_FILE_EXT);
328 if( (file = open(filename, O_RDONLY)) < 0) {
329 printf("Cannot read file %s\n", filename);exit(0);
331 read(file, &suffixArraySize, sizeof(uint));
332 myicsa->suffixArraySize = suffixArraySize;
333 printf("Number of indexed elements (suffix array size) = %d\n", suffixArraySize);
335 // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA PSI COMPRIMIDA
336 strcpy(filename, basename);
337 strcat(filename, ".");
338 strcat(filename, PSI_COMPRESSED_FILE_EXT);
339 #ifdef PSI_HUFFMANRLE
340 myicsa->hcPsi = loadHuffmanCompressedPsi(filename);
343 myicsa->gcPsi = loadGonzaloCompressedPsi(filename);
345 #ifdef PSI_DELTACODES
346 myicsa->dcPsi = loadDeltaCompressedPsi(filename);
349 // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA D
350 strcpy(filename, basename);
351 strcat(filename, ".");
352 strcat(filename, D_FILE_EXT);
353 if( (file = open(filename, O_RDONLY)) < 0) {
354 printf("Cannot read file %s\n", filename); exit(0);
356 myicsa->D = (uint *) malloc (sizeof(uint)*((suffixArraySize+31)/32));
357 read(file, myicsa->D, sizeof(uint)*((suffixArraySize+31)/32));
358 printf("Bit vector D loaded (%d bits)\n", suffixArraySize);
360 // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA O DIRECTORIO DE RANK1 PARA D
361 strcpy(filename, basename);
362 strcat(filename, ".");
363 strcat(filename, D_RANK_DIRECTORY_FILE_EXT);
364 myicsa->bD = loadBitmap(filename,myicsa->D,suffixArraySize);
366 ns = myicsa->bD->sSize;
367 nb = myicsa->bD->bSize;
368 myicsa->bD->data = myicsa->D;
369 printf("Rank1 Directory for D loaded (%d superblocks, %d blocks)\n", ns, nb);
372 // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA SAMPLES A
373 strcpy(filename, basename);
374 strcat(filename, ".");
375 strcat(filename, SAMPLES_A_FILE_EXT);
376 if( (file = open(filename, O_RDONLY)) < 0) {
377 printf("Cannot read file %s\n", filename); exit(0);
379 if( fstat(file, &f_stat) < 0) {
380 printf("Cannot read information from file %s\n", filename); exit(0);
382 myicsa->samplesASize = (f_stat.st_size)/sizeof(uint);
383 myicsa->samplesA = (uint *)malloc(sizeof(uint) * myicsa->samplesASize);
384 read(file, myicsa->samplesA, sizeof(uint) * myicsa->samplesASize);
385 printf("Suffix array samples loaded (%d samples)\n", myicsa->samplesASize);
387 // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA BA
388 strcpy(filename, basename);
389 strcat(filename, ".");
390 strcat(filename, BA_FILE_EXT);
391 if( (file = open(filename, O_RDONLY)) < 0) {
392 printf("Cannot read file %s\n", filename); exit(0);
394 myicsa->BA = (uint *) malloc (sizeof(uint)*((suffixArraySize+31)/32));
395 read(file, myicsa->BA, sizeof(uint)*((suffixArraySize+31)/32));
396 printf("Bit vector BA loaded (%d bits)\n", suffixArraySize);
398 // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA O DIRECTORIO DE RANK1 PARA BA
399 strcpy(filename, basename);
400 strcat(filename, ".");
401 strcat(filename, BA_RANK_DIRECTORY_FILE_EXT);
402 myicsa->bBA = loadBitmap(filename,myicsa->BA,suffixArraySize);
404 ns = myicsa->bBA->sSize;
405 nb = myicsa->bBA->bSize;
406 myicsa->bBA->data = myicsa->BA;
407 printf("Rank1 Directory for BA loaded (%d superblocks, %d blocks)\n", ns, nb);
410 // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA SAMPLES A INVERSA
411 strcpy(filename, basename);
412 strcat(filename, ".");
413 strcat(filename, SAMPLES_A_INV_FILE_EXT);
414 if( (file = open(filename, O_RDONLY)) < 0) {
415 printf("Cannot read file %s\n", filename); exit(0);
417 if( fstat(file, &f_stat) < 0) {
418 printf("Cannot read information from file %s\n", filename); exit(0);
420 myicsa->samplesAInvSize = (f_stat.st_size)/(sizeof(uint));
421 myicsa->samplesAInv = (uint *)malloc(sizeof(uint) * myicsa->samplesAInvSize);
422 read(file, myicsa->samplesAInv, sizeof(uint) * myicsa->samplesAInvSize);
423 printf("Suffix array inverse samples loaded (%d samples)\n", myicsa->samplesAInvSize);
425 // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA O PERIODO DE MUESTREO DO ARRAY DE SUFIXOS E DA INVERSA
426 strcpy(filename, basename);
427 strcat(filename, ".");
428 strcat(filename, SAMPLING_PERIOD_A_FILE_EXT);
429 if( (file = open(filename, O_RDONLY)) < 0) {
430 printf("Cannot read file %s\n", filename); exit(0);
432 read(file, &(myicsa->T_A), sizeof(uint));
433 read(file, &(myicsa->T_AInv), sizeof(uint));
434 printf("Sampling A Period T = %d, Sampling A inv Period TInv = %d\n", myicsa->T_A, myicsa->T_AInv);
436 read(file, &(myicsa->psiSearchFactorJump), sizeof(uint));
437 printf("Psi Bin Search Factor-Jump is = %d\n", myicsa->psiSearchFactorJump);
448 //uint destroyStructsCSA(ticsa *myicsa) {
449 int freeIntIndex(void *index) {
450 ticsa *myicsa = (ticsa *) index;
451 // Liberamos o espacio reservado
453 if (!myicsa) return 0;
455 uint total=0, totaltmp=0;
457 uint nbytes;sizeIntIndex(index, &nbytes);
459 total +=(sizeof (ticsa) * 1);;
461 #ifdef PSI_HUFFMANRLE
462 total+= totaltmp = myicsa->hcPsi.totalMem;
463 destroyHuffmanCompressedPsi(&(myicsa->hcPsi));
466 total+= totaltmp = myicsa->gcPsi.totalMem;
467 destroyGonzaloCompressedPsi(&(myicsa->gcPsi));
469 #ifdef PSI_DELTACODES
470 total+= totaltmp = myicsa->dcPsi.totalMem;
471 destroyDeltaCodesCompressedPsi(&(myicsa->dcPsi));
473 printf("\n\t[destroying SA: compressed PSI structure] ...Freed %u bytes... RAM",totaltmp);
475 free(myicsa->D); total+= totaltmp = (sizeof(uint)*((myicsa->suffixArraySize+31)/32));
476 printf("\n\t[destroying SA: D vector] ...Freed %u bytes... RAM",totaltmp);
477 free(myicsa->samplesA); total+= totaltmp = (sizeof(uint) * myicsa->samplesASize);
478 printf("\n\t[destroying Samples A: A ] ...Freed %u bytes... RAM",totaltmp);
479 free(myicsa->samplesAInv); total+= totaltmp = (sizeof(uint) * myicsa->samplesAInvSize);
480 printf("\n\t[destroying Samples AInv: A ] ...Freed %u bytes... RAM",totaltmp);
481 printf("\n\t[destroying rank bit D ] ...Freed %u bytes... RAM",myicsa->bD->mem_usage);
482 free(myicsa->BA); total+= totaltmp = (sizeof(uint)*((myicsa->suffixArraySize+31)/32));
483 printf("\n\t[destroying SA: BA vector] ...Freed %u bytes... RAM",totaltmp);
485 total += myicsa->bD->mem_usage;
486 destroyBitmap(myicsa->bD);
487 total += myicsa->bBA->mem_usage;
488 destroyBitmap(myicsa->bBA);
490 printf("\n\t**** [the whole iCSA ocuppied ... %u bytes... RAM",total);
491 printf("\n\t**** iCSA size = %d bytes ",nbytes);
496 return 0; //no error.
499 // Shows detailed summary info of the self-index (memory usage of each structure)
500 int printInfoIntIndex(void *index, const char tab[]) {
501 ticsa *myicsa = (ticsa *) index;
502 if (!myicsa) return 0;
504 uint structure, totalpsi, totalD, totalBD, totalSA,totalSAinv, totalBA,totalBBA;
506 structure=sizeof(ticsa);
508 #ifdef PSI_HUFFMANRLE
509 totalpsi = myicsa->hcPsi.totalMem;
512 totalpsi = myicsa->gcPsi.totalMem;
514 #ifdef PSI_DELTACODES
515 totalpsi = myicsa->dcPsi.totalMem;
518 totalD = (sizeof(uint)*((myicsa->suffixArraySize+31)/32));
519 totalBD = myicsa->bD->mem_usage;
520 totalSA = (sizeof(uint) * myicsa->samplesASize);
521 totalSAinv = (sizeof(uint) * myicsa->samplesAInvSize);
522 totalBA = (sizeof(uint)*((myicsa->suffixArraySize+31)/32));
523 totalBBA = myicsa->bBA->mem_usage;
525 uint nbytes; sizeIntIndex(index, &nbytes); //whole self-index
527 printf("\n ===================================================:");
528 printf("\n%sSummary Self-index on integers (icsa) layer:",tab);
529 printf("\n%s icsa structure = %d bytes",tab, structure);
530 printf("\n%s psi = %8d bytes",tab, totalpsi);
531 printf("\n%s D (bitmap) = %8d bytes",tab, totalD);
532 printf("\n%s rank for D = %8d bytes",tab, totalBD);
533 printf("\n%s SA(sampled) = %8d bytes",tab, totalSA);
534 printf("\n%s SAinv(samp) = %8d bytes",tab, totalSAinv);
535 printf("\n%s BA (bitmap) = %8d bytes",tab, totalBA);
536 printf("\n%s rank for BA = %8d bytes",tab, totalBBA);
537 printf("\n%sTotal = ** %9d bytes (in RAM) ** ",tab, nbytes);
540 return 0; //no error.
546 // BUSCA BINARIA SOBRE MOSTRAS + 2 BUSCAS EXPONENCIAIS + 2 BUSCAS BINARIAS
547 // 1º Busca binaria sobre o array de sufixos, elexindo como pivote un múltiplo de bin_search_psi_skip_interval (que orixinalmente foi pensado para igualar co valor de Psi).
548 // 2º Esta busca pode deterse por:
549 // a) O pivote repítese entre dúas iteracións -> As ocorrencias están entre o pivote e a seguinte mostra (pivote + bin_search_psi_skip_interval) -> facemos dúas buscas binarias
550 // b) O pivote é unha ocorrencia do patrón -> Faise unha busca exponencial sobre mostras hacia a esquerda e outra hacia a dereita, ata atopar a unha mostra á esquerda e outra
551 // á dereita do intervalo de ocorrencias. Entre cada unha destas e a inmediatamente anterior da busca exponencial, faise unha busca binaria para atopar os extremos do intervalo.
553 int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right){
554 //unsigned int countCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right) {
556 uint patternSize = length;
557 ticsa *myicsa = (ticsa *) index;
559 register unsigned long l, r, i;
560 register long comp, p, previousP;
561 //register unsigned int l, r, i;
562 //register int comp, p, previousP;
563 register uint bin_search_psi_skip_interval = myicsa->psiSearchFactorJump;
565 //fprintf(stderr,"\n psiSearchFactor = %d",myicsa->psiSearchFactorJump);
568 r = (myicsa->suffixArraySize+bin_search_psi_skip_interval-2)/bin_search_psi_skip_interval*bin_search_psi_skip_interval;
569 p = ((l+r)/2)/bin_search_psi_skip_interval * bin_search_psi_skip_interval;
572 // BUSCA BINARIA SOBRE MOSTRAS
573 while( (p != previousP) && (comp = SadCSACompare(myicsa, pattern, patternSize, p)) ) {
577 p = ((l+r)/2)/bin_search_psi_skip_interval*bin_search_psi_skip_interval;
582 // BUSCA BINARIA ENTRE O PIVOTE E A SEGUINTE MOSTRA
584 r = previousP+bin_search_psi_skip_interval;
585 if(r > myicsa->suffixArraySize) r = myicsa->suffixArraySize - 1;
588 if(SadCSACompare(myicsa, pattern, patternSize, p) <= 0) r = p;
592 if(SadCSACompare(myicsa, pattern, patternSize, r)) {
597 return 0; //no error.
602 r = previousP+bin_search_psi_skip_interval;
603 if(r > myicsa->suffixArraySize) r = myicsa->suffixArraySize - 1;
606 if(SadCSACompare(myicsa, pattern, patternSize, p) >= 0) l = p;
613 previousP = p; // En previousP poñemos o p atopado na busca sobre as mostras
615 // BUSCA EXPONENCIAL HACIA ATRÁS
617 p -= bin_search_psi_skip_interval;
618 while(p>0 && !SadCSACompare(myicsa, pattern, patternSize, p)) {
620 p = previousP-(i*bin_search_psi_skip_interval);
622 // Busca binaria entre as duas ultimas mostras da exponencial
623 if(previousP > i*bin_search_psi_skip_interval) l = previousP-(i*bin_search_psi_skip_interval);
626 r = previousP-(i*bin_search_psi_skip_interval);
629 if(SadCSACompare(myicsa, pattern, patternSize, p) <= 0) r = p;
634 // BUSCA EXPONENCIAL HACIA ADIANTE
636 p = previousP+bin_search_psi_skip_interval;
637 while(p<myicsa->suffixArraySize && !SadCSACompare(myicsa, pattern, patternSize, p)) {
639 p = previousP+(i*bin_search_psi_skip_interval);
641 // Busca binaria entre as duas ultimas mostras da exponencial
642 if(p < myicsa->suffixArraySize) r = previousP+(i*bin_search_psi_skip_interval);
643 else r = myicsa->suffixArraySize-1;
645 l = previousP+(i*bin_search_psi_skip_interval);
648 if(SadCSACompare(myicsa, pattern, patternSize, p) >= 0) l = p;
654 //return *right-*left+1;
655 *numocc = (uint) *right-*left+1;
659 // Version inicial de busca binaria
660 unsigned int countCSABin(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right) {
661 register ulong l, r, p;
664 r = myicsa->suffixArraySize-1;
668 if(SadCSACompare(myicsa, pattern, patternSize, p) <= 0) r = p;
672 // SE SON DISTINTOS O PATRON NON APARECE NO TEXTO E DEVOLVEMOS 0
673 if(SadCSACompare(myicsa, pattern, patternSize, r)) {
679 // Almacenamos o limite esquerdo
682 // SE SON IGUALES (O PATRON APARECE NO TEXTO), BUSCAMOS AGORA O LIMITE DEREITO, QUE ALMACENAREMOS EN right
683 // NOTA: INICIAMOS A BUSQUEDA A PARTIR DO ESQUERDO...
685 r = myicsa->suffixArraySize-1;
689 if(SadCSACompare(myicsa, pattern, patternSize, p) >= 0) l = p;
693 // Gardamos o limite dereito
696 return (uint) *right-*left+1;
699 int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong *numocc) {
700 //unsigned int *locateCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *occ) {
702 ticsa *myicsa = (ticsa *) index;
703 uint patternSize = length;
706 register ulong left, right;
708 //occurrences = countCSA(myicsa, pattern, patternSize, &left, &right);
710 err = countIntIndex((void *) myicsa, pattern, patternSize, &occurrences, &left, &right);
711 *numocc = occurrences;
714 register ulong idx = 0;
715 positions = (ulong*) malloc(sizeof(ulong) * occurrences);
716 while(left<=right) positions[idx++] = A(myicsa,left++);
724 return 0; //no error, but no occurrences.
729 // Devolve o enteiro do texto que ocupa a posicion dada,
730 // e fixa o estado para poder seguir obtendo os seguintes enteiros con displayNext();
732 int displayIntIndex(void *index, ulong position, uint *value){
733 //uint displayCSA(ticsa *myicsa, uint position) {
734 ticsa *myicsa = (ticsa *) index;
735 if (position == (myicsa->displayCSAPrevPosition +1)) {
736 myicsa->displayCSAPrevPosition = position;
737 //return displayCSANext(myicsa);
738 *value = displayCSANext(myicsa);
741 myicsa->displayCSAPrevPosition = position;
742 //return displayCSAFirst(myicsa, position);
743 *value = displayCSAFirst(myicsa, position);
749 /**********************************************************************/
751 // Devolve o enteiro do texto que ocupa a posicion dada, e fixa o estado
752 // para poder seguir obtendo os seguintes enteiros con displayNext();
753 uint displayCSAFirst(ticsa *myicsa, uint position) {
755 register uint positionAux, index;
756 register uint T_AInv = myicsa->T_AInv;
758 positionAux = myicsa->samplesAInv[position / T_AInv];
759 for(index = 0; index < position%T_AInv; index++) {
760 #ifdef PSI_HUFFMANRLE
761 positionAux=getHuffmanPsiValue(&(myicsa->hcPsi),positionAux);
764 positionAux=getGonzaloPsiValue(&(myicsa->gcPsi),positionAux);
766 #ifdef PSI_DELTACODES
767 positionAux=getDeltaPsiValue(&(myicsa->dcPsi),positionAux);
771 // Fijamos a variable global para a chamada a displayNext
772 myicsa->displayCSAState = positionAux;
774 // return rank1(D, Dir, positionAux) - 1;
775 return rank(myicsa->bD, positionAux) - 1;
779 // Devolve o seguinte enteiro do texto (a partir do estado)
780 unsigned int displayCSANext(ticsa *myicsa) {
781 #ifdef PSI_HUFFMANRLE
782 myicsa->displayCSAState=getHuffmanPsiValue(&(myicsa->hcPsi),myicsa->displayCSAState);
785 myicsa->displayCSAState = getGonzaloPsiValue(&(myicsa->gcPsi),myicsa->displayCSAState);
787 #ifdef PSI_DELTACODES
788 myicsa->displayCSAState = getDeltaPsiValue(&(myicsa->dcPsi),myicsa->displayCSAState);
790 return rank(myicsa->bD, myicsa->displayCSAState) - 1;
794 // Mostra as estructuras creadas
795 void showStructsCSA(ticsa *myicsa) {
799 // ESTRUCTURAS PARA CSA
800 printf("Basic CSA structures:\n\n");
802 // VALORES DA FUNCI�N PSI (decodificando)
803 printf("\tPSI: (Sampling period = %d)\n", myicsa->T_Psi);
804 for(index=0; index < myicsa->suffixArraySize; index++)
805 #ifdef PSI_HUFFMANRLE
806 printf("\tPsi[%d] = %d\n", index, getHuffmanPsiValue(&(myicsa->hcPsi),index));
809 printf("\tPsi[%d] = %d\n", index, getGonzaloPsiValue(&(myicsa->gcPsi),index));
811 #ifdef PSI_DELTACODES
812 printf("\tPsi[%d] = %d\n", index, getDeltaPsiValue(&(myicsa->dcPsi),index));
816 // VECTOR D DE SADAKANE CO DIRECTORIO DE RANK ASOCIADO
818 showBitVector(myicsa->D, myicsa->suffixArraySize);
819 printf("\n\nSuperbloques de D:\n");
822 ns = myicsa->bD->sSize;
823 nb= myicsa->bD->bSize;
824 for(index=0; index<ns; index++) {
825 //printf("\tDs[%d] = %d\n", index, Dir.Ds[index]);
826 printf("\tDs[%d] = %d\n", index, myicsa->bD->sdata[index]);
828 printf("\nBloques de D:\n");
830 for(index=0; index<nb; index++) {
831 //printf("\tDb[%d] = %d\n", index, Dir.Db[index]);
832 printf("\tDb[%d] = %d\n", index, myicsa->bD->bdata[index]);
836 // ESTRUCTURAS PARA ACCEDER O ARRAY DE SUFIXOS E A SUA INVERSA
837 printf("Suffix Array Sampling Structures: (Sampling period = %d)\n", myicsa->T_A);
838 printf("\tSuffix Array Samples:\n");
839 for(index=0; index < myicsa->samplesASize; index++)
840 printf("\tSamplesA[%d] = %d\n", index, myicsa->samplesA[index]);
842 printf("\tInverse Suffix Array Samples:\n");
843 for(index=0; index < myicsa->samplesASize; index++)
844 printf("\tSamplesAInv[%d] = %d\n", index, myicsa->samplesAInv[index]);
850 // Comparacion de Sadakane entre un patron (pattern) y el sufijo en la posicion p del array de sufijos
851 // IMPORTANTE EVITAR ULTIMA CHAMADA A PSI
852 int SadCSACompare(ticsa *myicsa, uint *pattern, uint patternSize, uint p) {
854 register unsigned int j, i, currentInteger, diff;
860 currentInteger = rank(myicsa->bD, i) - 1;
861 diff = pattern[j++] - currentInteger;
862 if(diff) return diff;
863 if(j == patternSize) return 0;
865 #ifdef PSI_HUFFMANRLE
866 i=getHuffmanPsiValue(&(myicsa->hcPsi),i);
869 i=getGonzaloPsiValue(&(myicsa->gcPsi),i);
871 #ifdef PSI_DELTACODES
872 i=getDeltaPsiValue(&(myicsa->dcPsi),i);
879 // Acceso a array de sufixos A
880 inline uint A(ticsa *myicsa, uint position) {
882 register uint timesPsi, sampleValue;
883 register uint T_A = myicsa->T_A;
885 uint proba = position;
888 while(!bitget(myicsa->BA, position)) {
890 #ifdef PSI_HUFFMANRLE
891 position=getHuffmanPsiValue(&(myicsa->hcPsi),position);
894 position=getGonzaloPsiValue(&(myicsa->gcPsi),position);
896 #ifdef PSI_DELTACODES
897 position=getDeltaPsiValue(&(myicsa->dcPsi),position);
902 sampleValue = myicsa->samplesA[rank(myicsa->bBA, position)-1];
904 return sampleValue - timesPsi;
909 // Acceso 'a inversa do array de sufixos
910 inline uint inverseA(ticsa *myicsa, uint offset) {
912 register uint index, inverseValue;
913 register uint T_AInv = myicsa->T_AInv;
915 inverseValue = myicsa->samplesAInv[offset/T_AInv];
916 for(index=0; index<(offset%T_AInv); index++)
917 #ifdef PSI_HUFFMANRLE
918 inverseValue=getHuffmanPsiValue(&(myicsa->hcPsi),inverseValue);
921 inverseValue = getGonzaloPsiValue(&(myicsa->gcPsi),inverseValue);
923 #ifdef PSI_DELTACODES
924 inverseValue = getDeltaPsiValue(&(myicsa->dcPsi),inverseValue);
930 // Initializes the parameters of the index.
931 uint parametersCSA(ticsa *myicsa, char *build_options){
932 char delimiters[] = " =;";
933 int j,num_parameters;
935 int ssA,ssAinv,ssPsi,nsHuff,psiSearchFactor;
937 ssA = DEFAULT_A_SAMPLING_PERIOD;
938 ssAinv = DEFAULT_A_INV_SAMPLING_PERIOD;
939 ssPsi = DEFAULT_PSI_SAMPLING_PERIOD;
940 nsHuff = DEFAULT_nsHUFF;
941 psiSearchFactor = DEFAULT_PSI_BINARY_SEARCH_FACTOR;
943 if (build_options != NULL) {
944 parse_parameters(build_options,&num_parameters, ¶meters, delimiters);
945 for (j=0; j<num_parameters;j++) {
947 if ((strcmp(parameters[j], "sA") == 0 ) && (j < num_parameters-1) ) {
948 ssA=atoi(parameters[j+1]);
950 if ((strcmp(parameters[j], "sAinv") == 0 ) && (j < num_parameters-1) ) {
951 ssAinv=atoi(parameters[j+1]);
953 if ((strcmp(parameters[j], "sPsi") == 0 ) && (j < num_parameters-1) ) {
954 ssPsi=atoi(parameters[j+1]);
956 if ((strcmp(parameters[j], "nsHuff") == 0 ) && (j < num_parameters-1) ) {
957 nsHuff=atoi(parameters[j+1]);
960 if ((strcmp(parameters[j], "psiSF") == 0 ) && (j < num_parameters-1) ) {
961 psiSearchFactor=atoi(parameters[j+1]);
965 free_parameters(num_parameters, ¶meters);
969 myicsa->T_AInv = ssAinv;
970 myicsa->T_Psi = ssPsi;
971 myicsa->tempNSHUFF = nsHuff;
972 myicsa->psiSearchFactorJump = psiSearchFactor * ssPsi;
974 printf("\n\t parameters for iCSA: sampleA=%d, sampleAinv=%d, samplePsi=%d",ssA,ssAinv,ssPsi);
975 printf("\n\t : nsHuff=%d, psiSearchFactor = %d --> jump = %d", nsHuff,psiSearchFactor, myicsa->psiSearchFactorJump);