From: Kim Nguyễn Date: Thu, 1 Mar 2012 13:18:13 +0000 (+0100) Subject: Update swcs with larson-sadakane suffix array construction method. X-Git-Url: http://git.nguyen.vg/gitweb/?p=SXSI%2FTextCollection.git;a=commitdiff_plain;h=6ba83dea468496a31eb57bdbac9b257a5a191a26 Update swcs with larson-sadakane suffix array construction method. --- diff --git a/Makefile b/Makefile index bb8d60d..156142d 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ HIDE=@ CC = g++ LIBCDSPATH = ../libcds -CPPFLAGS = -Wall -ansi -g -I$(LIBCDSPATH)/includes/ -O3 -DNDEBUG -Wno-deprecated-declarations +CPPFLAGS = -Wall -ansi -g -I$(LIBCDSPATH)/includes/ -O3 -DNDEBUG -Wno-deprecated-declarations -flto #CPPFLAGS = -Wall -ansi -g -I$(LIBCDSPATH)/includes/ LIBCDSA = $(LIBCDSPATH)/lib/libcds.a LIBRLCSA = incbwt/rlcsa.a diff --git a/swcsa/intIndex/Makefile b/swcsa/intIndex/Makefile index 8f02954..0585c6e 100755 --- a/swcsa/intIndex/Makefile +++ b/swcsa/intIndex/Makefile @@ -1,12 +1,13 @@ SRCDIR = . SRCDIRCSA = . +SRCDIRCSAUTILS = ../intIndexUtils SRCDIRUTILS = ../utils CC = g++ ifndef CFLAGS ##possibly already defined by the "main Makefile". ##CFLAGS = -O9 -m32 - CFLAGS = -O9 -m32 - ##CFLAGS = -g -m32 -O0 + ##CFLAGS = -O9 -m32 + CFLAGS = -g -m32 -O0 endif LIBINTINDEX = icsa.a @@ -15,41 +16,41 @@ all: intIndex cleanO intIndex: icsa.o parameters.o basics.o huff.o bitmap.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o - ar rc $(LIBINTINDEX) icsa.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o + ar rc $(LIBINTINDEX) icsa.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o #not including "parameters.o basics.o bitmap.o huff.o" as they are included by wcsa #they are already included into the library by the presentation layer. -icsa.o: parameters.o basics.o bitmap.o huff.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o - $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSA)/icsa.c +icsa.o: parameters.o basics.o bitmap.o huff.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o + $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSA)/icsa.c psiHuffmanRLE.o: huff.o - $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSA)/psiHuffmanRLE.c + $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSAUTILS)/psiHuffmanRLE.c psiDeltaCode.o: - $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSA)/psiDeltaCode.c + $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSAUTILS)/psiDeltaCode.c psiGonzalo.o: huff.o - $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSA)/psiGonzalo.c + $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSAUTILS)/psiGonzalo.c -parameters.o: - $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/parameters.c - - -huff.o: +parameters.o: + $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/parameters.c + + +huff.o: $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/huff.c -basics.o: +basics.o: $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/basics.c -bitmap.o: +bitmap.o: $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/bitmap.c -cleanO: +cleanO: rm -f *.o - + clean: - rm -rf *~ *% *.o core *.bak icsa.a + rm -rf *~ *% *.o core *.bak icsa.a tar: tar czvf icsa.tar.gz Makefile diff --git a/swcsa/intIndex/icsa.c b/swcsa/intIndex/icsa.c old mode 100644 new mode 100755 index eec4e34..c126f9a --- a/swcsa/intIndex/icsa.c +++ b/swcsa/intIndex/icsa.c @@ -1,50 +1,338 @@ +/* icsa.c + * Copyright (C) 2011, Antonio Fariña and Eduardo Rodriguez, all rights reserved. + * + * icsa.c: Implementation of the interface "../intIndex/interfaceIntIndex.h" + * that permits to represent a sequence of uint32 integers with an iCSA: + * An integer-oriented Compressed Suffix Array. + * Such representation will be handled as a "ticsa" data structure, that + * the WCSA self-index will use (as an opaque type) to + * create/save/load/search/recover/getSize of the representation of the + * original sequence. + * Suffix sorting is done via Larson/Sadakane suffix sort algorithm + * from the char-based csa. + * + * Ref: Larsson, N. J. and Sadakane, K. 2007. Faster suffix sorting. Theoretical + * Computer Science 387, 3, 258–272. + * + * + * See more details in: + * Antonio Fariña, Nieves Brisaboa, Gonzalo Navarro, Francisco Claude, Ángeles Places, + * and Eduardo Rodríguez. Word-based Self-Indexes for Natural Language Text. ACM + * Transactions on Information Systems (TOIS), 2012. + * http://vios.dc.fi.udc.es/indexing/wsi/publications.html + * http://www.dcc.uchile.cl/~gnavarro/ps/tois11.pdf + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include #include "icsa.h" -// Global para que funcione a funcion de comparacion do quicksort -uint *intVector; - -// Para o quicksort -int suffixCmp(const void *arg1, const void *arg2) { - - register uint a,b; - a=*((uint *) arg1); - b=*((uint *) arg2); +#define KEY(p) (V[*(p)+(h)]) +#define SWAP(p, q) (tmp=*(p), *(p)=*(q), *(q)=tmp) +#define MED3(a, b, c) (KEY(a)KEY(c) ? (b) : KEY(a)>KEY(c) ? (c) : (a))) + +static int *I, /* group array, ultimately suffix array.*/ + *V, /* inverse array, ultimately inverse of I.*/ + r, /* number of symbols aggregated by transform.*/ + h; /* length of already-sorted prefixes.*/ + +/* Subroutine for select_sort_split and sort_split. Sets group numbers for a + group whose lowest position in I is pl and highest position is pm.*/ + +static void update_group(int *pl, int *pm) +{ + int g; + + g=pm-I; /* group number.*/ + V[*pl]=g; /* update group number of first position.*/ + if (pl==pm) + *pl=-1; /* one element, sorted group.*/ + else + do /* more than one element, unsorted group.*/ + V[*++pl]=g; /* update group numbers.*/ + while (pl>1); /* small arrays, middle element.*/ + if (n>7) { + pl=p; + pn=p+n-1; + if (n>40) { /* big arrays, pseudomedian of 9.*/ + s=n>>3; + pl=MED3(pl, pl+s, pl+s+s); + pm=MED3(pm-s, pm, pm+s); + pn=MED3(pn-s-s, pn-s, pn); + } + pm=MED3(pl, pm, pn); /* midsize arrays, median of 3.*/ + } + return KEY(pm); } +/* Sorting routine called for each unsorted group. Sorts the array of integers + (suffix numbers) of length n starting at p. The algorithm is a ternary-split + quicksort taken from Bentley & McIlroy, "Engineering a Sort Function", + Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This + function is based on Program 7.*/ + +static void sort_split(int *p, int n) +{ + int *pa, *pb, *pc, *pd, *pl, *pm, *pn; + int f, v, s, t, tmp; + + if (n<7) { /* multi-selection sort smallest arrays.*/ + select_sort_split(p, n); + return; + } + + v=choose_pivot(p, n); + pa=pb=p; + pc=pd=p+n-1; + while (1) { /* split-end partition.*/ + while (pb<=pc && (f=KEY(pb))<=v) { + if (f==v) { + SWAP(pa, pb); + ++pa; + } + ++pb; + } + while (pc>=pb && (f=KEY(pc))>=v) { + if (f==v) { + SWAP(pc, pd); + --pd; + } + --pc; + } + if (pb>pc) + break; + SWAP(pb, pc); + ++pb; + --pc; + } + pn=p+n; + if ((s=pa-p)>(t=pb-pa)) + s=t; + for (pl=p, pm=pb-s; s; --s, ++pl, ++pm) + SWAP(pl, pm); + if ((s=pd-pc)>(t=pn-pd-1)) + s=t; + for (pl=pb, pm=pn-s; s; --s, ++pl, ++pm) + SWAP(pl, pm); + + s=pb-pa; + t=pd-pc; + if (s>0) + sort_split(p, s); + update_group(p+s, p+n-t-1); + if (t>0) + sort_split(p+n-t, t); +} +/* Bucketsort for first iteration. + + Input: x[0...n-1] holds integers in the range 1...k-1, all of which appear + at least once. x[n] is 0. (This is the corresponding output of transform.) k + must be at most n+1. p is array of size n+1 whose contents are disregarded. + + Output: x is V and p is I after the initial sorting stage of the refined + suffix sorting algorithm.*/ + +static void bucketsort(int *x, int *p, int n, int k) +{ + int *pi, i, c, d, g; + + for (pi=p; pi=p; --pi) { + d=x[c=*pi]; /* c is position, d is next in list.*/ + x[c]=g=i; /* last position equals group number.*/ + if (d>=0) { /* if more than one element in group.*/ + p[i--]=c; /* p is permutation for the sorted x.*/ + do { + d=x[c=d]; /* next in linked list.*/ + x[c]=g; /* group number in x.*/ + p[i--]=c; /* permutation in p.*/ + } while (d>=0); + } else + p[i--]=-1; /* one element, sorted group.*/ + } +} -/* **NO REVISADO TAMAÑO FILE. -int buildIntIndexFromFile (char *filename, char *build_options,void **index) { - //char filename[255]; - int file; - struct stat f_stat; - //sprintf(filename, "%s.%s", basename,SE_FILE_EXT); - - if( (file = open(filename, O_RDONLY)) < 0) { - printf("Cannot open file %s\n", filename); - exit(0); - } - if( fstat(file, &f_stat) < 0) { - printf("Cannot read information from file %s\n", filename); exit(0); - } - uint sizeFile = (f_stat.st_size)/sizeof(uint); +/* Transforms the alphabet of x by attempting to aggregate several symbols into + one, while preserving the suffix order of x. The alphabet may also be + compacted, so that x on output comprises all integers of the new alphabet + with no skipped numbers. + + Input: x is an array of size n+1 whose first n elements are positive + integers in the range l...k-1. p is array of size n+1, used for temporary + storage. q controls aggregation and compaction by defining the maximum value + for any symbol during transformation: q must be at least k-l; if q<=n, + compaction is guaranteed; if k-l>n, compaction is never done; if q is + INT_MAX, the maximum number of symbols are aggregated into one. + + Output: Returns an integer j in the range 1...q representing the size of the + new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is + set to the number of old symbols grouped into one. Only x[n] is 0.*/ + +static int transform(int *x, int *p, int n, int k, int l, int q) +{ + int b, c, d, e, i, j, m, s; + int *pi, *pj; + + for (s=0, i=k-l; i; i>>=1) + ++s; /* s is number of bits in old symbol.*/ + e=INT_MAX>>s; /* e is for overflow checking.*/ + for (b=d=r=0; r=k-l) { /* if bucketing possible,*/ + j=transform(V, I, n, k, l, n); + bucketsort(V, I, n, j); /* bucketsort on first r positions.*/ + } else { + transform(V, I, n, k, l, INT_MAX); + for (i=0; i<=n; ++i) + I[i]=i; /* initialize I with suffix numbers.*/ + h=0; + sort_split(I, n+1); /* quicksort on first r positions.*/ + } + h=r; /* number of symbols aggregated by transform.*/ + + while (*I>=-n) { + pi=I; /* pi is first position of group.*/ + sl=0; /* sl is negated length of sorted groups.*/ + do { + if ((s=*pi)<0) { + pi-=s; /* skip over sorted group.*/ + sl+=s; /* add negated length to sl.*/ + } else { + if (sl) { + *(pi+sl)=sl; /* combine sorted groups before pi.*/ + sl=0; + } + pk=I+V[s]+1; /* pk-1 is last position of unsorted group.*/ + sort_split(pi, pk-pi); + pi=pk; /* next group.*/ + } + } while (pi<=I+n); + if (sl) /* if the array ends with a sorted group.*/ + *(pi+sl)=sl; /* combine sorted groups at end of I.*/ + h=2*h; /* double sorted-depth.*/ + } + + for (i=0; i<=n; ++i) /* reconstruct suffix array from inverse.*/ + I[V[i]]=i; } -*/ -//ticsa *createIntegerCSA(uint **aintVector, uint textSize, char *build_options) { -int buildIntIndex (uint *aintVector, uint n, char *build_options, void **index ){ - uint textSize=n; - intVector = aintVector; //global variable + +//ticsa *createIntegerCSA(uint **intVector, uint textSize, char *build_options) { +int buildIntIndex (uint *intVector, uint n, char *build_options, void **index ){ + uint textSize=n; ticsa *myicsa; myicsa = (ticsa *) malloc (sizeof (ticsa)); uint *Psi, *SAI, *C, vocSize; @@ -56,18 +344,18 @@ int buildIntIndex (uint *aintVector, uint n, char *build_options, void **index ) nsHUFF=myicsa->tempNSHUFF; // Almacenamos o valor dalguns parametros + myicsa->suffixArraySize = textSize; myicsa->D = (uint*) malloc (sizeof(uint) * ((textSize+31)/32)); - myicsa->samplesASize = (textSize + myicsa->T_A - 1) / myicsa->T_A + 1; - myicsa->samplesA = (uint *)malloc(sizeof(int) * myicsa->samplesASize); + myicsa->samplesASize = (textSize + myicsa->T_A - 1) / myicsa->T_A + 1; //--> última pos siempre sampleada! +// myicsa->samplesASize = (textSize + myicsa->T_A - 1) / myicsa->T_A ;//+ 1; + myicsa->samplesA = (uint *)malloc(sizeof(uint) * myicsa->samplesASize); + myicsa->samplesA[myicsa->samplesASize-1] = 0000; myicsa->BA = (uint*) malloc (sizeof(uint) * ((textSize+31)/32)); myicsa->samplesAInvSize = (textSize + myicsa->T_AInv - 1) / myicsa->T_AInv; myicsa->samplesAInv = (uint *)malloc(sizeof(int) * myicsa->samplesAInvSize); - // Reservamos espacio para os vectores - Psi = (uint *) malloc (sizeof(uint) * textSize); - // CONSTRUIMOS A FUNCION C vocSize = 0; for(i=0;ivocSize) vocSize = intVector[i]; @@ -81,19 +369,35 @@ int buildIntIndex (uint *aintVector, uint n, char *build_options, void **index ) for(i=vocSize;i>0;i--) C[i] = C[i-1]; C[0] = 0; - // Construimos o array de sufixos (en Psi) - con quicksort - printf("\n\t *BUILDING THE SUFFIX ARRAY over %d integers... (with qsort)", textSize);fflush(stdout); - for(i=0; iBA[i] = 0; @@ -109,7 +413,14 @@ int buildIntIndex (uint *aintVector, uint n, char *build_options, void **index ) // CONSTRUIMOS E COMPRIMIMOS PSI printf("\n\t Creating compressed Psi..."); for(i=0;ihcPsi = huffmanCompressPsi(Psi,textSize,myicsa->T_Psi,nsHUFF); #endif @@ -119,13 +430,14 @@ int buildIntIndex (uint *aintVector, uint n, char *build_options, void **index ) #ifdef PSI_DELTACODES myicsa->dcPsi = deltaCompressPsi(Psi,textSize,myicsa->T_Psi); #endif + free(Psi); - + // Contruimos D for(i=0;i<((textSize+31)/32);i++) myicsa->D[i] = 0; for(i=0;i<=vocSize;i++) bitset(myicsa->D, C[i]); myicsa->bD = createBitmap(myicsa->D,textSize); - free(C); + free(C); // VARIABLE GLOBAL QUE ALMACENA O ESTADO DOS DISPLAYS (IMPORTANTE PARA DISPLAY SECUENCIAL) // Almacena a última posición do array de sufixos que mostramos con display ou displayNext @@ -134,8 +446,8 @@ int buildIntIndex (uint *aintVector, uint n, char *build_options, void **index ) myicsa->displayCSAState = 0; myicsa->displayCSAPrevPosition = -2; //needed by DisplayCSA(position) - aintVector = intVector; - // Liberamos o espacion non necesario + //aintVector = intVector; + // Liberamos o espacio non necesario *index = myicsa; //return (myicsa); @@ -452,7 +764,7 @@ int freeIntIndex(void *index) { if (!myicsa) return 0; - uint total=0, totaltmp=0; + size_t total=0, totaltmp=0; uint nbytes;sizeIntIndex(index, &nbytes); @@ -470,25 +782,25 @@ int freeIntIndex(void *index) { total+= totaltmp = myicsa->dcPsi.totalMem; destroyDeltaCodesCompressedPsi(&(myicsa->dcPsi)); #endif - printf("\n\t[destroying SA: compressed PSI structure] ...Freed %u bytes... RAM",totaltmp); + printf("\n\t[destroying SA: compressed PSI structure] ...Freed %zu bytes... RAM", totaltmp); free(myicsa->D); total+= totaltmp = (sizeof(uint)*((myicsa->suffixArraySize+31)/32)); - printf("\n\t[destroying SA: D vector] ...Freed %u bytes... RAM",totaltmp); + printf("\n\t[destroying SA: D vector] ...Freed %zu bytes... RAM",totaltmp); free(myicsa->samplesA); total+= totaltmp = (sizeof(uint) * myicsa->samplesASize); - printf("\n\t[destroying Samples A: A ] ...Freed %u bytes... RAM",totaltmp); + printf("\n\t[destroying Samples A: A ] ...Freed %zu bytes... RAM",totaltmp); free(myicsa->samplesAInv); total+= totaltmp = (sizeof(uint) * myicsa->samplesAInvSize); - printf("\n\t[destroying Samples AInv: A ] ...Freed %u bytes... RAM",totaltmp); - printf("\n\t[destroying rank bit D ] ...Freed %u bytes... RAM",myicsa->bD->mem_usage); + printf("\n\t[destroying Samples AInv: A ] ...Freed %zubytes... RAM",totaltmp); + printf("\n\t[destroying rank bit D ] ...Freed %zu bytes... RAM", (size_t)myicsa->bD->mem_usage); free(myicsa->BA); total+= totaltmp = (sizeof(uint)*((myicsa->suffixArraySize+31)/32)); - printf("\n\t[destroying SA: BA vector] ...Freed %u bytes... RAM",totaltmp); + printf("\n\t[destroying SA: BA vector] ...Freed %zu bytes... RAM",totaltmp); total += myicsa->bD->mem_usage; destroyBitmap(myicsa->bD); total += myicsa->bBA->mem_usage; destroyBitmap(myicsa->bBA); - printf("\n\t**** [the whole iCSA ocuppied ... %u bytes... RAM",total); - printf("\n\t**** iCSA size = %d bytes ",nbytes); + printf("\n\t**** [the whole iCSA ocuppied ... %zu bytes... RAM",total); + printf("\n\t**** iCSA size = %zu bytes ", (size_t) nbytes); printf("\n"); free(myicsa); diff --git a/swcsa/intIndex/icsa.h b/swcsa/intIndex/icsa.h old mode 100644 new mode 100755 index 523859d..6d4dc3d --- a/swcsa/intIndex/icsa.h +++ b/swcsa/intIndex/icsa.h @@ -1 +1 @@ -//#include #include #include #include #include #include "defValues.h" #include "../utils/bitmap.h" #include "../utils/huff.h" #include "../utils/parameters.h" #ifdef PSI_HUFFMANRLE #include "psiHuffmanRLE.h" #endif #ifdef PSI_GONZALO #include "psiGonzalo.h" #endif #ifdef PSI_DELTACODES #include "psiDeltaCode.h" #endif typedef struct { uint suffixArraySize; uint T_Psi; uint *D; bitmap bD; uint T_A; uint T_AInv; uint *samplesA; uint samplesASize; uint *BA; bitmap bBA; uint *samplesAInv; uint samplesAInvSize; uint displayCSAState; int displayCSAPrevPosition; #ifdef PSI_HUFFMANRLE HuffmanCompressedPsi hcPsi; #endif #ifdef PSI_GONZALO GonzaloCompressedPsi gcPsi; #endif #ifdef PSI_DELTACODES DeltaCompressedPsi dcPsi; #endif //only needed during "parse_parameters". uint tempNSHUFF; uint psiSearchFactorJump; //factor of the T_Psi value. } ticsa; // FUNCTION PROTOTYPES: BUILDING THE INDEX //Creates the ICSA int buildIntIndex (uint *intVector, uint n, char *build_options, void **index ); //ticsa *createIntegerCSA (uint **aintVector, uint SAsize, char *build_options); //Returns number of elements in the indexed sequence of integers int sourceLenIntIndex(void *index, uint *numInts); //Save the index to disk int saveIntIndex(void *index, char *pathname); //void storeStructsCSA(ticsa *myicsa, char *basename); // Loads the index from disk. int loadIntIndex(char *pathname, void **index); //ticsa *loadCSA(char *basename); // Frees memory int freeIntIndex(void *index); //uint destroyStructsCSA(ticsa *myicsa); //Returns the size (in bytes) of the index over the sequence of integers. int sizeIntIndex(void *index, uint *numBytes); //uint CSA_size(ticsa *myicsa); // Shows detailed summary info of the self-index (memory usage of each structure) int printInfoIntIndex(void *index, const char tab[]); //Number of occurrences of the pattern, and the interval [left,right] in the suffix array. int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right); //uint countCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right); // Exponential search //uint countCSABin(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right); // Binary search // Returns an array with integers corresponding offsets to the occurrences of the pattern, // as well as the number of occurrences int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong *numocc); //uint *locateCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *occ); //Returns the value of the source (array of integers) at a given offset. // (that is, the element "position" from the original array of uints) int displayIntIndex(void *index, ulong position, uint *value); //uint displayCSA(ticsa *myicsa, uint position); /* Private function prototypes ********************************************/ uint parametersCSA(ticsa *myicsa, char *build_options); uint displayCSAFirst(ticsa *myicsa, uint position); uint displayCSANext(ticsa *myicsa); int SadCSACompare(ticsa *myicsa, uint *pattern, uint patternSize, uint p); uint A(ticsa *myicsa, uint position); uint inverseA(ticsa *myicsa, uint offset); void showStructsCSA(ticsa *myicsa); // For Debugging \ No newline at end of file +/* icsa.h * Copyright (C) 2011, Antonio Fariña and Eduardo Rodriguez, all rights reserved. * * icsa.h: Definition of the functions of the interface "../intIndex/interfaceIntIndex.h" * that permits to represent a sequence of uint32 integers with an iCSA: * An integer-oriented Compressed Suffix Array. * Such representation will be handled as a "ticsa" data structure, that * the WCSA self-index will use (as an opaque type) to * create/save/load/search/recover/getSize of the representation of the * original sequence. * Suffix sorting is done via Larson/Sadakane suffix sort algorithm * from the char-based csa. * * Ref: Larsson, N. J. and Sadakane, K. 2007. Faster suffix sorting. Theoretical * Computer Science 387, 3, 258–272. * * See more details in: * Antonio Fariña, Nieves Brisaboa, Gonzalo Navarro, Francisco Claude, Ángeles Places, * and Eduardo Rodríguez. Word-based Self-Indexes for Natural Language Text. ACM * Transactions on Information Systems (TOIS), 2012. * http://vios.dc.fi.udc.es/indexing/wsi/publications.html * http://www.dcc.uchile.cl/~gnavarro/ps/tois11.pdf * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include #include #include #include #include "../intIndexUtils/defValues.h" #include "../utils/bitmap.h" #include "../utils/huff.h" #include "../utils/parameters.h" #ifdef PSI_HUFFMANRLE #include "../intIndexUtils/psiHuffmanRLE.h" #endif #ifdef PSI_GONZALO #include "../intIndexUtils/psiGonzalo.h" #endif #ifdef PSI_DELTACODES #include "../intIndexUtils/psiDeltaCode.h" #endif typedef struct { uint suffixArraySize; uint T_Psi; uint *D; bitmap bD; uint T_A; uint T_AInv; uint *samplesA; uint samplesASize; uint *BA; bitmap bBA; uint *samplesAInv; uint samplesAInvSize; uint displayCSAState; long displayCSAPrevPosition; #ifdef PSI_HUFFMANRLE HuffmanCompressedPsi hcPsi; #endif #ifdef PSI_GONZALO GonzaloCompressedPsi gcPsi; #endif #ifdef PSI_DELTACODES DeltaCompressedPsi dcPsi; #endif //only needed during "parse_parameters". uint tempNSHUFF; uint psiSearchFactorJump; //factor of the T_Psi value. } ticsa; // FUNCTION PROTOTYPES: BUILDING THE INDEX //Creates the ICSA int buildIntIndex (uint *intVector, uint n, char *build_options, void **index ); //ticsa *createIntegerCSA (uint **aintVector, uint SAsize, char *build_options); //Returns number of elements in the indexed sequence of integers int sourceLenIntIndex(void *index, uint *numInts); //Save the index to disk int saveIntIndex(void *index, char *pathname); //void storeStructsCSA(ticsa *myicsa, char *basename); // Loads the index from disk. int loadIntIndex(char *pathname, void **index); //ticsa *loadCSA(char *basename); // Frees memory int freeIntIndex(void *index); //uint destroyStructsCSA(ticsa *myicsa); //Returns the size (in bytes) of the index over the sequence of integers. int sizeIntIndex(void *index, uint *numBytes); //uint CSA_size(ticsa *myicsa); // Shows detailed summary info of the self-index (memory usage of each structure) int printInfoIntIndex(void *index, const char tab[]); //Number of occurrences of the pattern, and the interval [left,right] in the suffix array. int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right); //uint countCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right); // Exponential search //uint countCSABin(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right); // Binary search // Returns an array with integers corresponding offsets to the occurrences of the pattern, // as well as the number of occurrences int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong *numocc); //uint *locateCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *occ); //Returns the value of the source (array of integers) at a given offset. // (that is, the element "position" from the original array of uints) int displayIntIndex(void *index, ulong position, uint *value); //uint displayCSA(ticsa *myicsa, uint position); /* Private function prototypes ********************************************/ uint parametersCSA(ticsa *myicsa, char *build_options); uint displayCSAFirst(ticsa *myicsa, uint position); uint displayCSANext(ticsa *myicsa); int SadCSACompare(ticsa *myicsa, uint *pattern, uint patternSize, uint p); uint A(ticsa *myicsa, uint position); uint inverseA(ticsa *myicsa, uint offset); void showStructsCSA(ticsa *myicsa); // For Debugging \ No newline at end of file diff --git a/swcsa/intIndexUtils b/swcsa/intIndexUtils new file mode 120000 index 0000000..245a84f --- /dev/null +++ b/swcsa/intIndexUtils @@ -0,0 +1 @@ +intIndex/ \ No newline at end of file