X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=swcsa%2FintIndex%2Ficsa.c;h=26db36bde9e2cbcebe035d89c999d2603fd55930;hb=HEAD;hp=eec4e3479dccaf1356789afdf8c777f2e3140fc8;hpb=102e33b134075765e6d4e0c38bc1307568ce5602;p=SXSI%2FTextCollection.git diff --git a/swcsa/intIndex/icsa.c b/swcsa/intIndex/icsa.c old mode 100644 new mode 100755 index eec4e34..26db36b --- a/swcsa/intIndex/icsa.c +++ b/swcsa/intIndex/icsa.c @@ -1,50 +1,338 @@ +/* icsa.c + * Copyright (C) 2011, Antonio Fariña and Eduardo Rodriguez, all rights reserved. + * + * icsa.c: Implementation of the interface "../intIndex/interfaceIntIndex.h" + * that permits to represent a sequence of uint32 integers with an iCSA: + * An integer-oriented Compressed Suffix Array. + * Such representation will be handled as a "ticsa" data structure, that + * the WCSA self-index will use (as an opaque type) to + * create/save/load/search/recover/getSize of the representation of the + * original sequence. + * Suffix sorting is done via Larson/Sadakane suffix sort algorithm + * from the char-based csa. + * + * Ref: Larsson, N. J. and Sadakane, K. 2007. Faster suffix sorting. Theoretical + * Computer Science 387, 3, 258–272. + * + * + * See more details in: + * Antonio Fariña, Nieves Brisaboa, Gonzalo Navarro, Francisco Claude, Ángeles Places, + * and Eduardo Rodríguez. Word-based Self-Indexes for Natural Language Text. ACM + * Transactions on Information Systems (TOIS), 2012. + * http://vios.dc.fi.udc.es/indexing/wsi/publications.html + * http://www.dcc.uchile.cl/~gnavarro/ps/tois11.pdf + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include #include "icsa.h" -// Global para que funcione a funcion de comparacion do quicksort -uint *intVector; - -// Para o quicksort -int suffixCmp(const void *arg1, const void *arg2) { +#define KEY(p) (V[*(p)+(h)]) +#define SWAP(p, q) (tmp=*(p), *(p)=*(q), *(q)=tmp) +#define MED3(a, b, c) (KEY(a)KEY(c) ? (b) : KEY(a)>KEY(c) ? (c) : (a))) + +static int *I, /* group array, ultimately suffix array.*/ + *V, /* inverse array, ultimately inverse of I.*/ + r, /* number of symbols aggregated by transform.*/ + h; /* length of already-sorted prefixes.*/ + +/* Subroutine for select_sort_split and sort_split. Sets group numbers for a + group whose lowest position in I is pl and highest position is pm.*/ + +static void update_group(int *pl, int *pm) +{ + int g; + + g=pm-I; /* group number.*/ + V[*pl]=g; /* update group number of first position.*/ + if (pl==pm) + *pl=-1; /* one element, sorted group.*/ + else + do /* more than one element, unsorted group.*/ + V[*++pl]=g; /* update group numbers.*/ + while (pl>1); /* small arrays, middle element.*/ + if (n>7) { + pl=p; + pn=p+n-1; + if (n>40) { /* big arrays, pseudomedian of 9.*/ + s=n>>3; + pl=MED3(pl, pl+s, pl+s+s); + pm=MED3(pm-s, pm, pm+s); + pn=MED3(pn-s-s, pn-s, pn); + } + pm=MED3(pl, pm, pn); /* midsize arrays, median of 3.*/ + } + return KEY(pm); +} +/* Sorting routine called for each unsorted group. Sorts the array of integers + (suffix numbers) of length n starting at p. The algorithm is a ternary-split + quicksort taken from Bentley & McIlroy, "Engineering a Sort Function", + Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This + function is based on Program 7.*/ + +static void sort_split(int *p, int n) +{ + int *pa, *pb, *pc, *pd, *pl, *pm, *pn; + int f, v, s, t, tmp; + + if (n<7) { /* multi-selection sort smallest arrays.*/ + select_sort_split(p, n); + return; + } + + v=choose_pivot(p, n); + pa=pb=p; + pc=pd=p+n-1; + while (1) { /* split-end partition.*/ + while (pb<=pc && (f=KEY(pb))<=v) { + if (f==v) { + SWAP(pa, pb); + ++pa; + } + ++pb; + } + while (pc>=pb && (f=KEY(pc))>=v) { + if (f==v) { + SWAP(pc, pd); + --pd; + } + --pc; + } + if (pb>pc) + break; + SWAP(pb, pc); + ++pb; + --pc; + } + pn=p+n; + if ((s=pa-p)>(t=pb-pa)) + s=t; + for (pl=p, pm=pb-s; s; --s, ++pl, ++pm) + SWAP(pl, pm); + if ((s=pd-pc)>(t=pn-pd-1)) + s=t; + for (pl=pb, pm=pn-s; s; --s, ++pl, ++pm) + SWAP(pl, pm); + + s=pb-pa; + t=pd-pc; + if (s>0) + sort_split(p, s); + update_group(p+s, p+n-t-1); + if (t>0) + sort_split(p+n-t, t); } +/* Bucketsort for first iteration. + + Input: x[0...n-1] holds integers in the range 1...k-1, all of which appear + at least once. x[n] is 0. (This is the corresponding output of transform.) k + must be at most n+1. p is array of size n+1 whose contents are disregarded. + + Output: x is V and p is I after the initial sorting stage of the refined + suffix sorting algorithm.*/ + +static void bucketsort(int *x, int *p, int n, int k) +{ + int *pi, i, c, d, g; + + for (pi=p; pi=p; --pi) { + d=x[c=*pi]; /* c is position, d is next in list.*/ + x[c]=g=i; /* last position equals group number.*/ + if (d>=0) { /* if more than one element in group.*/ + p[i--]=c; /* p is permutation for the sorted x.*/ + do { + d=x[c=d]; /* next in linked list.*/ + x[c]=g; /* group number in x.*/ + p[i--]=c; /* permutation in p.*/ + } while (d>=0); + } else + p[i--]=-1; /* one element, sorted group.*/ + } +} +/* Transforms the alphabet of x by attempting to aggregate several symbols into + one, while preserving the suffix order of x. The alphabet may also be + compacted, so that x on output comprises all integers of the new alphabet + with no skipped numbers. + + Input: x is an array of size n+1 whose first n elements are positive + integers in the range l...k-1. p is array of size n+1, used for temporary + storage. q controls aggregation and compaction by defining the maximum value + for any symbol during transformation: q must be at least k-l; if q<=n, + compaction is guaranteed; if k-l>n, compaction is never done; if q is + INT_MAX, the maximum number of symbols are aggregated into one. + + Output: Returns an integer j in the range 1...q representing the size of the + new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is + set to the number of old symbols grouped into one. Only x[n] is 0.*/ + +static int transform(int *x, int *p, int n, int k, int l, int q) +{ + int b, c, d, e, i, j, m, s; + int *pi, *pj; + + for (s=0, i=k-l; i; i>>=1) + ++s; /* s is number of bits in old symbol.*/ + e=INT_MAX>>s; /* e is for overflow checking.*/ + for (b=d=r=0; r=k-l) { /* if bucketing possible,*/ + j=transform(V, I, n, k, l, n); + bucketsort(V, I, n, j); /* bucketsort on first r positions.*/ + } else { + transform(V, I, n, k, l, INT_MAX); + for (i=0; i<=n; ++i) + I[i]=i; /* initialize I with suffix numbers.*/ + h=0; + sort_split(I, n+1); /* quicksort on first r positions.*/ + } + h=r; /* number of symbols aggregated by transform.*/ + + while (*I>=-n) { + pi=I; /* pi is first position of group.*/ + sl=0; /* sl is negated length of sorted groups.*/ + do { + if ((s=*pi)<0) { + pi-=s; /* skip over sorted group.*/ + sl+=s; /* add negated length to sl.*/ + } else { + if (sl) { + *(pi+sl)=sl; /* combine sorted groups before pi.*/ + sl=0; + } + pk=I+V[s]+1; /* pk-1 is last position of unsorted group.*/ + sort_split(pi, pk-pi); + pi=pk; /* next group.*/ + } + } while (pi<=I+n); + if (sl) /* if the array ends with a sorted group.*/ + *(pi+sl)=sl; /* combine sorted groups at end of I.*/ + h=2*h; /* double sorted-depth.*/ + } + + for (i=0; i<=n; ++i) /* reconstruct suffix array from inverse.*/ + I[V[i]]=i; } -*/ -//ticsa *createIntegerCSA(uint **aintVector, uint textSize, char *build_options) { -int buildIntIndex (uint *aintVector, uint n, char *build_options, void **index ){ - uint textSize=n; - intVector = aintVector; //global variable + +//ticsa *createIntegerCSA(uint **intVector, uint textSize, char *build_options) { +int buildIntIndex (uint *intVector, uint n, char *build_options, void **index ){ + uint textSize=n; ticsa *myicsa; myicsa = (ticsa *) malloc (sizeof (ticsa)); uint *Psi, *SAI, *C, vocSize; @@ -52,22 +340,22 @@ int buildIntIndex (uint *aintVector, uint n, char *build_options, void **index ) uint nsHUFF; parametersCSA(myicsa, build_options); - + nsHUFF=myicsa->tempNSHUFF; - + // Almacenamos o valor dalguns parametros + myicsa->suffixArraySize = textSize; - myicsa->D = (uint*) malloc (sizeof(uint) * ((textSize+31)/32)); - - myicsa->samplesASize = (textSize + myicsa->T_A - 1) / myicsa->T_A + 1; - myicsa->samplesA = (uint *)malloc(sizeof(int) * myicsa->samplesASize); + myicsa->D = (uint*) malloc (sizeof(uint) * ((textSize+31)/32)); + + myicsa->samplesASize = (textSize + myicsa->T_A - 1) / myicsa->T_A + 1; //--> última pos siempre sampleada! +// myicsa->samplesASize = (textSize + myicsa->T_A - 1) / myicsa->T_A ;//+ 1; + myicsa->samplesA = (uint *)malloc(sizeof(uint) * myicsa->samplesASize); + myicsa->samplesA[myicsa->samplesASize-1] = 0000; myicsa->BA = (uint*) malloc (sizeof(uint) * ((textSize+31)/32)); myicsa->samplesAInvSize = (textSize + myicsa->T_AInv - 1) / myicsa->T_AInv; myicsa->samplesAInv = (uint *)malloc(sizeof(int) * myicsa->samplesAInvSize); - // Reservamos espacio para os vectores - Psi = (uint *) malloc (sizeof(uint) * textSize); - // CONSTRUIMOS A FUNCION C vocSize = 0; for(i=0;ivocSize) vocSize = intVector[i]; @@ -81,20 +369,36 @@ int buildIntIndex (uint *aintVector, uint n, char *build_options, void **index ) for(i=vocSize;i>0;i--) C[i] = C[i-1]; C[0] = 0; - // Construimos o array de sufixos (en Psi) - con quicksort - printf("\n\t *BUILDING THE SUFFIX ARRAY over %d integers... (with qsort)", textSize);fflush(stdout); - for(i=0; iBA[i] = 0; for(i=0; iT_A) bitset(myicsa->BA, SAI[i]); @@ -102,27 +406,35 @@ int buildIntIndex (uint *aintVector, uint n, char *build_options, void **index ) //printf("TextSize = %d\n", textSize); myicsa->bBA = createBitmap(myicsa->BA, textSize); for(i=0,j=0; iBA, i)) myicsa->samplesA[j++] = Psi[i]; - + // ALMACENAMOS AS MOSTRAS DA INVERSA DO ARRAY DE SUFIXOS for(i=0,j=0;iT_AInv) myicsa->samplesAInv[j++] = SAI[i]; - + // CONSTRUIMOS E COMPRIMIMOS PSI printf("\n\t Creating compressed Psi..."); for(i=0;ihcPsi = huffmanCompressPsi(Psi,textSize,myicsa->T_Psi,nsHUFF); - #endif - #ifdef PSI_GONZALO + #endif + #ifdef PSI_GONZALO myicsa->gcPsi = gonzaloCompressPsi(Psi,textSize,myicsa->T_Psi,nsHUFF); - #endif + #endif #ifdef PSI_DELTACODES - myicsa->dcPsi = deltaCompressPsi(Psi,textSize,myicsa->T_Psi); + myicsa->dcPsi = deltaCompressPsi(Psi,textSize,myicsa->T_Psi); #endif - free(Psi); - + + free(Psi); + // Contruimos D - for(i=0;i<((textSize+31)/32);i++) myicsa->D[i] = 0; + for(i=0;i<((textSize+31)/32);i++) myicsa->D[i] = 0; for(i=0;i<=vocSize;i++) bitset(myicsa->D, C[i]); myicsa->bD = createBitmap(myicsa->D,textSize); free(C); @@ -133,9 +445,9 @@ int buildIntIndex (uint *aintVector, uint n, char *build_options, void **index ) // coa que podemos obter o símbolo pedido, e actualizamos displayState myicsa->displayCSAState = 0; myicsa->displayCSAPrevPosition = -2; //needed by DisplayCSA(position) - - aintVector = intVector; - // Liberamos o espacion non necesario + + //aintVector = intVector; + // Liberamos o espacio non necesario *index = myicsa; //return (myicsa); @@ -152,16 +464,16 @@ int sourceLenIntIndex(void *index, uint *numInts){ int saveIntIndex(void *index, char *pathname) { //void storeStructsCSA(ticsa *myicsa, char *basename) { - - ticsa *myicsa = (ticsa *) index; + + ticsa *myicsa = (ticsa *) index; char *basename=pathname; char *filename; int file; - + // Reservamos espacio para o nome do ficheiro filename = (char *)malloc(sizeof(char)*MAX_FILENAME_LENGTH); - + // Ficheiro co n�mero de elementos indexados (enteiros do texto orixinal) strcpy(filename, basename); strcat(filename, "."); @@ -172,26 +484,26 @@ int saveIntIndex(void *index, char *pathname) { exit(0); } write(file, &(myicsa->suffixArraySize), sizeof(int)); - close(file); + close(file); strcpy(filename, basename); strcat(filename, "."); - strcat(filename, PSI_COMPRESSED_FILE_EXT); + strcat(filename, PSI_COMPRESSED_FILE_EXT); #ifdef PSI_HUFFMANRLE - storeHuffmanCompressedPsi(&(myicsa->hcPsi), filename); - #endif + storeHuffmanCompressedPsi(&(myicsa->hcPsi), filename); + #endif #ifdef PSI_GONZALO - storeGonzaloCompressedPsi(&(myicsa->gcPsi), filename); + storeGonzaloCompressedPsi(&(myicsa->gcPsi), filename); #endif #ifdef PSI_DELTACODES storeDeltaCompressedPsi(&(myicsa->dcPsi), filename); #endif - + // Ficheiro co vector de bits D strcpy(filename, basename); strcat(filename, "."); - strcat(filename, D_FILE_EXT); + strcat(filename, D_FILE_EXT); unlink(filename); if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) { printf("Cannot open file %s\n", filename); @@ -207,11 +519,11 @@ int saveIntIndex(void *index, char *pathname) { strcat(filename, "."); strcat(filename, D_RANK_DIRECTORY_FILE_EXT); saveBitmap(filename,myicsa->bD); - + // Ficheiro coas mostras de A strcpy(filename, basename); strcat(filename, "."); - strcat(filename, SAMPLES_A_FILE_EXT); + strcat(filename, SAMPLES_A_FILE_EXT); unlink(filename); if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) { printf("Cannot open file %s\n", filename); @@ -223,7 +535,7 @@ int saveIntIndex(void *index, char *pathname) { // Ficheiro co vector BA (marca as posicions de A muestreadas) strcpy(filename, basename); strcat(filename, "."); - strcat(filename, BA_FILE_EXT); + strcat(filename, BA_FILE_EXT); unlink(filename); if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) { printf("Cannot open file %s\n", filename); @@ -237,7 +549,7 @@ int saveIntIndex(void *index, char *pathname) { strcat(filename, "."); strcat(filename, BA_RANK_DIRECTORY_FILE_EXT); saveBitmap(filename, myicsa->bBA); - + // Ficheiro coas mostras de A inversa strcpy(filename, basename); strcat(filename, "."); @@ -248,7 +560,7 @@ int saveIntIndex(void *index, char *pathname) { exit(0); } write(file, myicsa->samplesAInv, sizeof(int) * (myicsa->samplesAInvSize)); - close(file); + close(file); // Ficheiro co periodo de muestreo de A e A inversa strcpy(filename, basename); @@ -261,11 +573,11 @@ int saveIntIndex(void *index, char *pathname) { } write(file, &(myicsa->T_A), sizeof(int)); write(file, &(myicsa->T_AInv), sizeof(int)); - + write(file, &(myicsa->psiSearchFactorJump), sizeof(uint)); close(file); - free(filename); + free(filename); return 0; //no error. } @@ -282,14 +594,14 @@ int sizeIntIndex(void *index, uint *numBytes) { size += sizeof(uint)*((myicsa->suffixArraySize+31)/32) ; //BA vector size += myicsa->bBA->mem_usage; #ifdef PSI_HUFFMANRLE - size +=myicsa->hcPsi.totalMem; - #endif + size +=myicsa->hcPsi.totalMem; + #endif #ifdef PSI_GONZALO - size +=myicsa->gcPsi.totalMem; + size +=myicsa->gcPsi.totalMem; #endif #ifdef PSI_DELTACODES size +=myicsa->dcPsi.totalMem; - #endif + #endif *numBytes = size; return 0; //no error. } @@ -304,20 +616,20 @@ int loadIntIndex(char *pathname, void **index){ uint length; char c; char *word; - struct stat f_stat; + struct stat f_stat; uint suffixArraySize; ticsa *myicsa; myicsa = (ticsa *) malloc (sizeof (ticsa) * 1); - - + + // VARIABLE GLOBAL QUE ALMACENA O ESTADO DOS DISPLAYS (IMPORTANTE PARA DISPLAY SECUENCIAL) // Almacena a �ltima posici�n do array de sufixos que mostramos con display ou displayNext // Se nos piden un displayNext, aplicamos PSI sobre esta posici�n e obtemos a seguinte, // coa que podemos obter o s�mbolo pedido, e actualizamos displayState myicsa->displayCSAState = 0; myicsa->displayCSAPrevPosition = -2; //needed by DisplayCSA(position) - + // Reservamos espacio para o nome do ficheiro filename = (char *)malloc(sizeof(char)*MAX_FILENAME_LENGTH); @@ -325,34 +637,34 @@ int loadIntIndex(char *pathname, void **index){ strcpy(filename, basename); strcat(filename, "."); strcat(filename, NUMBER_OF_ELEMENTS_FILE_EXT); - if( (file = open(filename, O_RDONLY)) < 0) { + if( (file = open(filename, O_RDONLY)) < 0) { printf("Cannot read file %s\n", filename);exit(0); - } - read(file, &suffixArraySize, sizeof(uint)); + } + read(file, &suffixArraySize, sizeof(uint)); myicsa->suffixArraySize = suffixArraySize; printf("Number of indexed elements (suffix array size) = %d\n", suffixArraySize); - - // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA PSI COMPRIMIDA + + // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA PSI COMPRIMIDA strcpy(filename, basename); strcat(filename, "."); - strcat(filename, PSI_COMPRESSED_FILE_EXT); + strcat(filename, PSI_COMPRESSED_FILE_EXT); #ifdef PSI_HUFFMANRLE - myicsa->hcPsi = loadHuffmanCompressedPsi(filename); - #endif + myicsa->hcPsi = loadHuffmanCompressedPsi(filename); + #endif #ifdef PSI_GONZALO - myicsa->gcPsi = loadGonzaloCompressedPsi(filename); + myicsa->gcPsi = loadGonzaloCompressedPsi(filename); #endif #ifdef PSI_DELTACODES myicsa->dcPsi = loadDeltaCompressedPsi(filename); - #endif - + #endif + // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA D strcpy(filename, basename); strcat(filename, "."); strcat(filename, D_FILE_EXT); if( (file = open(filename, O_RDONLY)) < 0) { printf("Cannot read file %s\n", filename); exit(0); - } + } myicsa->D = (uint *) malloc (sizeof(uint)*((suffixArraySize+31)/32)); read(file, myicsa->D, sizeof(uint)*((suffixArraySize+31)/32)); printf("Bit vector D loaded (%d bits)\n", suffixArraySize); @@ -360,15 +672,15 @@ int loadIntIndex(char *pathname, void **index){ // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA O DIRECTORIO DE RANK1 PARA D strcpy(filename, basename); strcat(filename, "."); - strcat(filename, D_RANK_DIRECTORY_FILE_EXT); + strcat(filename, D_RANK_DIRECTORY_FILE_EXT); myicsa->bD = loadBitmap(filename,myicsa->D,suffixArraySize); - { uint ns, nb; + { uint ns, nb; ns = myicsa->bD->sSize; nb = myicsa->bD->bSize; myicsa->bD->data = myicsa->D; - printf("Rank1 Directory for D loaded (%d superblocks, %d blocks)\n", ns, nb); + printf("Rank1 Directory for D loaded (%d superblocks, %d blocks)\n", ns, nb); } - + // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA SAMPLES A strcpy(filename, basename); strcat(filename, "."); @@ -378,19 +690,19 @@ int loadIntIndex(char *pathname, void **index){ } if( fstat(file, &f_stat) < 0) { printf("Cannot read information from file %s\n", filename); exit(0); - } + } myicsa->samplesASize = (f_stat.st_size)/sizeof(uint); myicsa->samplesA = (uint *)malloc(sizeof(uint) * myicsa->samplesASize); - read(file, myicsa->samplesA, sizeof(uint) * myicsa->samplesASize); - printf("Suffix array samples loaded (%d samples)\n", myicsa->samplesASize); - + read(file, myicsa->samplesA, sizeof(uint) * myicsa->samplesASize); + printf("Suffix array samples loaded (%d samples)\n", myicsa->samplesASize); + // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA BA strcpy(filename, basename); strcat(filename, "."); strcat(filename, BA_FILE_EXT); if( (file = open(filename, O_RDONLY)) < 0) { printf("Cannot read file %s\n", filename); exit(0); - } + } myicsa->BA = (uint *) malloc (sizeof(uint)*((suffixArraySize+31)/32)); read(file, myicsa->BA, sizeof(uint)*((suffixArraySize+31)/32)); printf("Bit vector BA loaded (%d bits)\n", suffixArraySize); @@ -398,13 +710,13 @@ int loadIntIndex(char *pathname, void **index){ // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA O DIRECTORIO DE RANK1 PARA BA strcpy(filename, basename); strcat(filename, "."); - strcat(filename, BA_RANK_DIRECTORY_FILE_EXT); + strcat(filename, BA_RANK_DIRECTORY_FILE_EXT); myicsa->bBA = loadBitmap(filename,myicsa->BA,suffixArraySize); - { uint ns, nb; + { uint ns, nb; ns = myicsa->bBA->sSize; nb = myicsa->bBA->bSize; myicsa->bBA->data = myicsa->BA; - printf("Rank1 Directory for BA loaded (%d superblocks, %d blocks)\n", ns, nb); + printf("Rank1 Directory for BA loaded (%d superblocks, %d blocks)\n", ns, nb); } // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA SAMPLES A INVERSA @@ -416,26 +728,26 @@ int loadIntIndex(char *pathname, void **index){ } if( fstat(file, &f_stat) < 0) { printf("Cannot read information from file %s\n", filename); exit(0); - } + } myicsa->samplesAInvSize = (f_stat.st_size)/(sizeof(uint)); myicsa->samplesAInv = (uint *)malloc(sizeof(uint) * myicsa->samplesAInvSize); - read(file, myicsa->samplesAInv, sizeof(uint) * myicsa->samplesAInvSize); + read(file, myicsa->samplesAInv, sizeof(uint) * myicsa->samplesAInvSize); printf("Suffix array inverse samples loaded (%d samples)\n", myicsa->samplesAInvSize); - + // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA O PERIODO DE MUESTREO DO ARRAY DE SUFIXOS E DA INVERSA strcpy(filename, basename); strcat(filename, "."); strcat(filename, SAMPLING_PERIOD_A_FILE_EXT); if( (file = open(filename, O_RDONLY)) < 0) { printf("Cannot read file %s\n", filename); exit(0); - } + } read(file, &(myicsa->T_A), sizeof(uint)); - read(file, &(myicsa->T_AInv), sizeof(uint)); - printf("Sampling A Period T = %d, Sampling A inv Period TInv = %d\n", myicsa->T_A, myicsa->T_AInv); - + read(file, &(myicsa->T_AInv), sizeof(uint)); + printf("Sampling A Period T = %d, Sampling A inv Period TInv = %d\n", myicsa->T_A, myicsa->T_AInv); + read(file, &(myicsa->psiSearchFactorJump), sizeof(uint)); - printf("Psi Bin Search Factor-Jump is = %d\n", myicsa->psiSearchFactorJump); - + printf("Psi Bin Search Factor-Jump is = %d\n", myicsa->psiSearchFactorJump); + close(file); free(filename); @@ -443,21 +755,21 @@ int loadIntIndex(char *pathname, void **index){ *index = myicsa; return 0; } - -//uint destroyStructsCSA(ticsa *myicsa) { -int freeIntIndex(void *index) { + +//uint destroyStructsCSA(ticsa *myicsa) { +int freeIntIndex(void *index) { ticsa *myicsa = (ticsa *) index; // Liberamos o espacio reservado - + if (!myicsa) return 0; - - uint total=0, totaltmp=0; - + + size_t total=0, totaltmp=0; + uint nbytes;sizeIntIndex(index, &nbytes); - + total +=(sizeof (ticsa) * 1);; - + #ifdef PSI_HUFFMANRLE total+= totaltmp = myicsa->hcPsi.totalMem; destroyHuffmanCompressedPsi(&(myicsa->hcPsi)); @@ -468,31 +780,31 @@ int freeIntIndex(void *index) { #endif #ifdef PSI_DELTACODES total+= totaltmp = myicsa->dcPsi.totalMem; - destroyDeltaCodesCompressedPsi(&(myicsa->dcPsi)); + destroyDeltaCodesCompressedPsi(&(myicsa->dcPsi)); #endif - printf("\n\t[destroying SA: compressed PSI structure] ...Freed %u bytes... RAM",totaltmp); - - free(myicsa->D); total+= totaltmp = (sizeof(uint)*((myicsa->suffixArraySize+31)/32)); - printf("\n\t[destroying SA: D vector] ...Freed %u bytes... RAM",totaltmp); - free(myicsa->samplesA); total+= totaltmp = (sizeof(uint) * myicsa->samplesASize); - printf("\n\t[destroying Samples A: A ] ...Freed %u bytes... RAM",totaltmp); - free(myicsa->samplesAInv); total+= totaltmp = (sizeof(uint) * myicsa->samplesAInvSize); - printf("\n\t[destroying Samples AInv: A ] ...Freed %u bytes... RAM",totaltmp); - printf("\n\t[destroying rank bit D ] ...Freed %u bytes... RAM",myicsa->bD->mem_usage); - free(myicsa->BA); total+= totaltmp = (sizeof(uint)*((myicsa->suffixArraySize+31)/32)); - printf("\n\t[destroying SA: BA vector] ...Freed %u bytes... RAM",totaltmp); - + printf("\n\t[destroying SA: compressed PSI structure] ...Freed %zu bytes... RAM", totaltmp); + + free(myicsa->D); total+= totaltmp = (sizeof(uint)*((myicsa->suffixArraySize+31)/32)); + printf("\n\t[destroying SA: D vector] ...Freed %zu bytes... RAM",totaltmp); + free(myicsa->samplesA); total+= totaltmp = (sizeof(uint) * myicsa->samplesASize); + printf("\n\t[destroying Samples A: A ] ...Freed %zu bytes... RAM",totaltmp); + free(myicsa->samplesAInv); total+= totaltmp = (sizeof(uint) * myicsa->samplesAInvSize); + printf("\n\t[destroying Samples AInv: A ] ...Freed %zubytes... RAM",totaltmp); + printf("\n\t[destroying rank bit D ] ...Freed %zu bytes... RAM", (size_t)myicsa->bD->mem_usage); + free(myicsa->BA); total+= totaltmp = (sizeof(uint)*((myicsa->suffixArraySize+31)/32)); + printf("\n\t[destroying SA: BA vector] ...Freed %zu bytes... RAM",totaltmp); + total += myicsa->bD->mem_usage; - destroyBitmap(myicsa->bD); + destroyBitmap(myicsa->bD); total += myicsa->bBA->mem_usage; destroyBitmap(myicsa->bBA); - - printf("\n\t**** [the whole iCSA ocuppied ... %u bytes... RAM",total); - printf("\n\t**** iCSA size = %d bytes ",nbytes); + + printf("\n\t**** [the whole iCSA ocuppied ... %zu bytes... RAM",total); + printf("\n\t**** iCSA size = %zu bytes ", (size_t) nbytes); printf("\n"); - + free(myicsa); - + return 0; //no error. } @@ -500,11 +812,11 @@ int freeIntIndex(void *index) { int printInfoIntIndex(void *index, const char tab[]) { ticsa *myicsa = (ticsa *) index; if (!myicsa) return 0; - + uint structure, totalpsi, totalD, totalBD, totalSA,totalSAinv, totalBA,totalBBA; - + structure=sizeof(ticsa); - + #ifdef PSI_HUFFMANRLE totalpsi = myicsa->hcPsi.totalMem; #endif @@ -517,15 +829,15 @@ int printInfoIntIndex(void *index, const char tab[]) { totalD = (sizeof(uint)*((myicsa->suffixArraySize+31)/32)); totalBD = myicsa->bD->mem_usage; - totalSA = (sizeof(uint) * myicsa->samplesASize); - totalSAinv = (sizeof(uint) * myicsa->samplesAInvSize); + totalSA = (sizeof(uint) * myicsa->samplesASize); + totalSAinv = (sizeof(uint) * myicsa->samplesAInvSize); totalBA = (sizeof(uint)*((myicsa->suffixArraySize+31)/32)); totalBBA = myicsa->bBA->mem_usage; - + uint nbytes; sizeIntIndex(index, &nbytes); //whole self-index - - printf("\n ===================================================:"); - printf("\n%sSummary Self-index on integers (icsa) layer:",tab); + + printf("\n ===================================================:"); + printf("\n%sSummary Self-index on integers (icsa) layer:",tab); printf("\n%s icsa structure = %d bytes",tab, structure); printf("\n%s psi = %8d bytes",tab, totalpsi); printf("\n%s D (bitmap) = %8d bytes",tab, totalD); @@ -536,7 +848,7 @@ int printInfoIntIndex(void *index, const char tab[]) { printf("\n%s rank for BA = %8d bytes",tab, totalBBA); printf("\n%sTotal = ** %9d bytes (in RAM) ** ",tab, nbytes); printf("\n"); - + return 0; //no error. } @@ -550,25 +862,25 @@ int printInfoIntIndex(void *index, const char tab[]) { // b) O pivote é unha ocorrencia do patrón -> Faise unha busca exponencial sobre mostras hacia a esquerda e outra hacia a dereita, ata atopar a unha mostra á esquerda e outra // á dereita do intervalo de ocorrencias. Entre cada unha destas e a inmediatamente anterior da busca exponencial, faise unha busca binaria para atopar os extremos do intervalo. -int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right){ +int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right){ //unsigned int countCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right) { uint patternSize = length; ticsa *myicsa = (ticsa *) index; - + register unsigned long l, r, i; register long comp, p, previousP; //register unsigned int l, r, i; //register int comp, p, previousP; register uint bin_search_psi_skip_interval = myicsa->psiSearchFactorJump; - + //fprintf(stderr,"\n psiSearchFactor = %d",myicsa->psiSearchFactorJump); - - l = 0; + + l = 0; r = (myicsa->suffixArraySize+bin_search_psi_skip_interval-2)/bin_search_psi_skip_interval*bin_search_psi_skip_interval; p = ((l+r)/2)/bin_search_psi_skip_interval * bin_search_psi_skip_interval; previousP = 0; - + // BUSCA BINARIA SOBRE MOSTRAS while( (p != previousP) && (comp = SadCSACompare(myicsa, pattern, patternSize, p)) ) { if(comp > 0) l = p; @@ -578,13 +890,13 @@ int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong } if(p==previousP) { - + // BUSCA BINARIA ENTRE O PIVOTE E A SEGUINTE MOSTRA - l = previousP; + l = previousP; r = previousP+bin_search_psi_skip_interval; if(r > myicsa->suffixArraySize) r = myicsa->suffixArraySize - 1; while(l < r) { - p = (l+r)/2; + p = (l+r)/2; if(SadCSACompare(myicsa, pattern, patternSize, p) <= 0) r = p; else l = p+1; } @@ -597,21 +909,21 @@ int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong return 0; //no error. } *left = r; - - l = previousP; + + l = previousP; r = previousP+bin_search_psi_skip_interval; if(r > myicsa->suffixArraySize) r = myicsa->suffixArraySize - 1; while(l < r) { p = (l+r+1)/2; if(SadCSACompare(myicsa, pattern, patternSize, p) >= 0) l = p; - else r = p-1; + else r = p-1; } *right = l; - + } else { - + previousP = p; // En previousP poñemos o p atopado na busca sobre as mostras - + // BUSCA EXPONENCIAL HACIA ATRÁS i = 1; p -= bin_search_psi_skip_interval; @@ -622,49 +934,49 @@ int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong // Busca binaria entre as duas ultimas mostras da exponencial if(previousP > i*bin_search_psi_skip_interval) l = previousP-(i*bin_search_psi_skip_interval); else l=0; - i>>=1; + i>>=1; r = previousP-(i*bin_search_psi_skip_interval); while(l < r) { - p = (l+r)/2; + p = (l+r)/2; if(SadCSACompare(myicsa, pattern, patternSize, p) <= 0) r = p; else l = p+1; } *left = r; - + // BUSCA EXPONENCIAL HACIA ADIANTE i = 1; p = previousP+bin_search_psi_skip_interval; while(psuffixArraySize && !SadCSACompare(myicsa, pattern, patternSize, p)) { i<<=1; - p = previousP+(i*bin_search_psi_skip_interval); + p = previousP+(i*bin_search_psi_skip_interval); } // Busca binaria entre as duas ultimas mostras da exponencial if(p < myicsa->suffixArraySize) r = previousP+(i*bin_search_psi_skip_interval); else r = myicsa->suffixArraySize-1; - i>>=1; + i>>=1; l = previousP+(i*bin_search_psi_skip_interval); while(l < r) { p = (l+r+1)/2; if(SadCSACompare(myicsa, pattern, patternSize, p) >= 0) l = p; - else r = p-1; + else r = p-1; } - *right = l; + *right = l; } //return *right-*left+1; *numocc = (uint) *right-*left+1; - return 0; //no error + return 0; //no error } // Version inicial de busca binaria unsigned int countCSABin(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right) { register ulong l, r, p; - l = 0; + l = 0; r = myicsa->suffixArraySize-1; while(l < r) { - p = (l+r)/2; + p = (l+r)/2; if(SadCSACompare(myicsa, pattern, patternSize, p) <= 0) r = p; else l = p+1; } @@ -675,36 +987,36 @@ unsigned int countCSABin(ticsa *myicsa, uint *pattern, uint patternSize, uint *l *right = r; return 0; } - + // Almacenamos o limite esquerdo *left = r; // SE SON IGUALES (O PATRON APARECE NO TEXTO), BUSCAMOS AGORA O LIMITE DEREITO, QUE ALMACENAREMOS EN right // NOTA: INICIAMOS A BUSQUEDA A PARTIR DO ESQUERDO... - l = r; + l = r; r = myicsa->suffixArraySize-1; - + while(l < r) { p = (l+r+1)/2; if(SadCSACompare(myicsa, pattern, patternSize, p) >= 0) l = p; - else r = p-1; + else r = p-1; } - + // Gardamos o limite dereito *right = l; - - return (uint) *right-*left+1; + + return (uint) *right-*left+1; } int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong *numocc) { //unsigned int *locateCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *occ) { - + ticsa *myicsa = (ticsa *) index; uint patternSize = length; ulong *positions; ulong occurrences; register ulong left, right; - + //occurrences = countCSA(myicsa, pattern, patternSize, &left, &right); int err; err = countIntIndex((void *) myicsa, pattern, patternSize, &occurrences, &left, &right); @@ -713,16 +1025,16 @@ int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong * if (occurrences) { register ulong idx = 0; positions = (ulong*) malloc(sizeof(ulong) * occurrences); - while(left<=right) positions[idx++] = A(myicsa,left++); - + while(left<=right) positions[idx++] = A(myicsa,left++); + *occ = positions; - return 0; + return 0; //return positions; } - + (*occ)=NULL; return 0; //no error, but no occurrences. - + //return NULL; } @@ -748,46 +1060,46 @@ int displayIntIndex(void *index, ulong position, uint *value){ /**********************************************************************/ -// Devolve o enteiro do texto que ocupa a posicion dada, e fixa o estado +// Devolve o enteiro do texto que ocupa a posicion dada, e fixa o estado // para poder seguir obtendo os seguintes enteiros con displayNext(); uint displayCSAFirst(ticsa *myicsa, uint position) { register uint positionAux, index; register uint T_AInv = myicsa->T_AInv; - + positionAux = myicsa->samplesAInv[position / T_AInv]; for(index = 0; index < position%T_AInv; index++) { #ifdef PSI_HUFFMANRLE positionAux=getHuffmanPsiValue(&(myicsa->hcPsi),positionAux); - #endif + #endif #ifdef PSI_GONZALO positionAux=getGonzaloPsiValue(&(myicsa->gcPsi),positionAux); #endif #ifdef PSI_DELTACODES positionAux=getDeltaPsiValue(&(myicsa->dcPsi),positionAux); - #endif + #endif } - + // Fijamos a variable global para a chamada a displayNext myicsa->displayCSAState = positionAux; - + // return rank1(D, Dir, positionAux) - 1; return rank(myicsa->bD, positionAux) - 1; } // Devolve o seguinte enteiro do texto (a partir do estado) -unsigned int displayCSANext(ticsa *myicsa) { +unsigned int displayCSANext(ticsa *myicsa) { #ifdef PSI_HUFFMANRLE myicsa->displayCSAState=getHuffmanPsiValue(&(myicsa->hcPsi),myicsa->displayCSAState); - #endif + #endif #ifdef PSI_GONZALO myicsa->displayCSAState = getGonzaloPsiValue(&(myicsa->gcPsi),myicsa->displayCSAState); #endif #ifdef PSI_DELTACODES myicsa->displayCSAState = getDeltaPsiValue(&(myicsa->dcPsi),myicsa->displayCSAState); - #endif - return rank(myicsa->bD, myicsa->displayCSAState) - 1; + #endif + return rank(myicsa->bD, myicsa->displayCSAState) - 1; } @@ -798,10 +1110,10 @@ void showStructsCSA(ticsa *myicsa) { // ESTRUCTURAS PARA CSA printf("Basic CSA structures:\n\n"); - + // VALORES DA FUNCI�N PSI (decodificando) printf("\tPSI: (Sampling period = %d)\n", myicsa->T_Psi); - for(index=0; index < myicsa->suffixArraySize; index++) + for(index=0; index < myicsa->suffixArraySize; index++) #ifdef PSI_HUFFMANRLE printf("\tPsi[%d] = %d\n", index, getHuffmanPsiValue(&(myicsa->hcPsi),index)); #endif @@ -809,10 +1121,10 @@ void showStructsCSA(ticsa *myicsa) { printf("\tPsi[%d] = %d\n", index, getGonzaloPsiValue(&(myicsa->gcPsi),index)); #endif #ifdef PSI_DELTACODES - printf("\tPsi[%d] = %d\n", index, getDeltaPsiValue(&(myicsa->dcPsi),index)); - #endif + printf("\tPsi[%d] = %d\n", index, getDeltaPsiValue(&(myicsa->dcPsi),index)); + #endif printf("\n"); - + // VECTOR D DE SADAKANE CO DIRECTORIO DE RANK ASOCIADO printf("\tD = "); showBitVector(myicsa->D, myicsa->suffixArraySize); @@ -826,24 +1138,24 @@ void showStructsCSA(ticsa *myicsa) { printf("\tDs[%d] = %d\n", index, myicsa->bD->sdata[index]); } printf("\nBloques de D:\n"); - + for(index=0; indexbD->bdata[index]); - } + printf("\tDb[%d] = %d\n", index, myicsa->bD->bdata[index]); + } printf("\n\n"); } // ESTRUCTURAS PARA ACCEDER O ARRAY DE SUFIXOS E A SUA INVERSA printf("Suffix Array Sampling Structures: (Sampling period = %d)\n", myicsa->T_A); printf("\tSuffix Array Samples:\n"); - for(index=0; index < myicsa->samplesASize; index++) + for(index=0; index < myicsa->samplesASize; index++) printf("\tSamplesA[%d] = %d\n", index, myicsa->samplesA[index]); - printf("\n"); + printf("\n"); printf("\tInverse Suffix Array Samples:\n"); - for(index=0; index < myicsa->samplesASize; index++) + for(index=0; index < myicsa->samplesASize; index++) printf("\tSamplesAInv[%d] = %d\n", index, myicsa->samplesAInv[index]); printf("\n"); - + } @@ -852,27 +1164,27 @@ void showStructsCSA(ticsa *myicsa) { int SadCSACompare(ticsa *myicsa, uint *pattern, uint patternSize, uint p) { register unsigned int j, i, currentInteger, diff; - + i = p; j = 0; - + while(1) { currentInteger = rank(myicsa->bD, i) - 1; diff = pattern[j++] - currentInteger; if(diff) return diff; if(j == patternSize) return 0; - else + else #ifdef PSI_HUFFMANRLE i=getHuffmanPsiValue(&(myicsa->hcPsi),i); #endif #ifdef PSI_GONZALO i=getGonzaloPsiValue(&(myicsa->gcPsi),i); - #endif + #endif #ifdef PSI_DELTACODES i=getDeltaPsiValue(&(myicsa->dcPsi),i); #endif } - + } @@ -881,15 +1193,15 @@ inline uint A(ticsa *myicsa, uint position) { register uint timesPsi, sampleValue; register uint T_A = myicsa->T_A; - + uint proba = position; - + timesPsi = 0; while(!bitget(myicsa->BA, position)) { - + #ifdef PSI_HUFFMANRLE position=getHuffmanPsiValue(&(myicsa->hcPsi),position); - #endif + #endif #ifdef PSI_GONZALO position=getGonzaloPsiValue(&(myicsa->gcPsi),position); #endif @@ -897,10 +1209,10 @@ inline uint A(ticsa *myicsa, uint position) { position=getDeltaPsiValue(&(myicsa->dcPsi),position); #endif timesPsi++; - + } sampleValue = myicsa->samplesA[rank(myicsa->bBA, position)-1]; - + return sampleValue - timesPsi; } @@ -911,65 +1223,65 @@ inline uint inverseA(ticsa *myicsa, uint offset) { register uint index, inverseValue; register uint T_AInv = myicsa->T_AInv; - + inverseValue = myicsa->samplesAInv[offset/T_AInv]; - for(index=0; index<(offset%T_AInv); index++) + for(index=0; index<(offset%T_AInv); index++) #ifdef PSI_HUFFMANRLE inverseValue=getHuffmanPsiValue(&(myicsa->hcPsi),inverseValue); - #endif + #endif #ifdef PSI_GONZALO inverseValue = getGonzaloPsiValue(&(myicsa->gcPsi),inverseValue); - #endif + #endif #ifdef PSI_DELTACODES inverseValue = getDeltaPsiValue(&(myicsa->dcPsi),inverseValue); #endif return inverseValue; - + } // Initializes the parameters of the index. -uint parametersCSA(ticsa *myicsa, char *build_options){ +uint parametersCSA(ticsa *myicsa, char *build_options){ char delimiters[] = " =;"; int j,num_parameters; char ** parameters; int ssA,ssAinv,ssPsi,nsHuff,psiSearchFactor; - + ssA = DEFAULT_A_SAMPLING_PERIOD; ssAinv = DEFAULT_A_INV_SAMPLING_PERIOD; ssPsi = DEFAULT_PSI_SAMPLING_PERIOD; nsHuff = DEFAULT_nsHUFF; psiSearchFactor = DEFAULT_PSI_BINARY_SEARCH_FACTOR; - + if (build_options != NULL) { parse_parameters(build_options,&num_parameters, ¶meters, delimiters); for (j=0; jT_A = ssA; myicsa->T_AInv = ssAinv; myicsa->T_Psi = ssPsi; myicsa->tempNSHUFF = nsHuff; - myicsa->psiSearchFactorJump = psiSearchFactor * ssPsi; + myicsa->psiSearchFactorJump = psiSearchFactor * ssPsi; printf("\n\t parameters for iCSA: sampleA=%d, sampleAinv=%d, samplePsi=%d",ssA,ssAinv,ssPsi); printf("\n\t : nsHuff=%d, psiSearchFactor = %d --> jump = %d", nsHuff,psiSearchFactor, myicsa->psiSearchFactorJump);