From: Kim Nguyễn Date: Thu, 1 Mar 2012 22:48:21 +0000 (+0100) Subject: Debug swcsa X-Git-Url: http://git.nguyen.vg/gitweb/?p=SXSI%2FTextCollection.git;a=commitdiff_plain;h=898f6e5c6b7223f4753b7ccb7939809ee5f53aae Debug swcsa --- diff --git a/swcsa/buildFacade.h b/swcsa/buildFacade.h index 05bd708..57be6c6 100755 --- a/swcsa/buildFacade.h +++ b/swcsa/buildFacade.h @@ -1,7 +1,8 @@ /* only for getTime() */ + #include #include - +#include "utils/basics.h" #include "utils/valstring.h" #include "utils/defValues.h" diff --git a/swcsa/intIndex/Makefile b/swcsa/intIndex/Makefile index 0585c6e..0be5d66 100755 --- a/swcsa/intIndex/Makefile +++ b/swcsa/intIndex/Makefile @@ -1,12 +1,12 @@ SRCDIR = . SRCDIRCSA = . -SRCDIRCSAUTILS = ../intIndexUtils +SRCDIRCSAUTILS = . SRCDIRUTILS = ../utils CC = g++ ifndef CFLAGS ##possibly already defined by the "main Makefile". ##CFLAGS = -O9 -m32 - ##CFLAGS = -O9 -m32 + ##CFLAGS = -O9 -m32 CFLAGS = -g -m32 -O0 endif @@ -16,41 +16,41 @@ all: intIndex cleanO intIndex: icsa.o parameters.o basics.o huff.o bitmap.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o - ar rc $(LIBINTINDEX) icsa.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o + ar rc $(LIBINTINDEX) icsa.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o #not including "parameters.o basics.o bitmap.o huff.o" as they are included by wcsa #they are already included into the library by the presentation layer. -icsa.o: parameters.o basics.o bitmap.o huff.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o - $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSA)/icsa.c +icsa.o: parameters.o basics.o bitmap.o huff.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o + $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSA)/icsa.c psiHuffmanRLE.o: huff.o - $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSAUTILS)/psiHuffmanRLE.c + $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSAUTILS)/psiHuffmanRLE.c psiDeltaCode.o: - $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSAUTILS)/psiDeltaCode.c + $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSAUTILS)/psiDeltaCode.c psiGonzalo.o: huff.o - $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSAUTILS)/psiGonzalo.c + $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSAUTILS)/psiGonzalo.c -parameters.o: - $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/parameters.c - - -huff.o: +parameters.o: + $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/parameters.c + + +huff.o: $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/huff.c -basics.o: +basics.o: $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/basics.c -bitmap.o: +bitmap.o: $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/bitmap.c -cleanO: +cleanO: rm -f *.o - + clean: - rm -rf *~ *% *.o core *.bak icsa.a + rm -rf *~ *% *.o core *.bak icsa.a tar: tar czvf icsa.tar.gz Makefile diff --git a/swcsa/intIndex/icsa.c b/swcsa/intIndex/icsa.c index c126f9a..26db36b 100755 --- a/swcsa/intIndex/icsa.c +++ b/swcsa/intIndex/icsa.c @@ -2,26 +2,26 @@ * Copyright (C) 2011, Antonio Fariña and Eduardo Rodriguez, all rights reserved. * * icsa.c: Implementation of the interface "../intIndex/interfaceIntIndex.h" - * that permits to represent a sequence of uint32 integers with an iCSA: + * that permits to represent a sequence of uint32 integers with an iCSA: * An integer-oriented Compressed Suffix Array. * Such representation will be handled as a "ticsa" data structure, that - * the WCSA self-index will use (as an opaque type) to - * create/save/load/search/recover/getSize of the representation of the + * the WCSA self-index will use (as an opaque type) to + * create/save/load/search/recover/getSize of the representation of the * original sequence. - * Suffix sorting is done via Larson/Sadakane suffix sort algorithm + * Suffix sorting is done via Larson/Sadakane suffix sort algorithm * from the char-based csa. - * - * Ref: Larsson, N. J. and Sadakane, K. 2007. Faster suffix sorting. Theoretical + * + * Ref: Larsson, N. J. and Sadakane, K. 2007. Faster suffix sorting. Theoretical * Computer Science 387, 3, 258–272. - * - * + * + * * See more details in: - * Antonio Fariña, Nieves Brisaboa, Gonzalo Navarro, Francisco Claude, Ángeles Places, - * and Eduardo Rodríguez. Word-based Self-Indexes for Natural Language Text. ACM - * Transactions on Information Systems (TOIS), 2012. + * Antonio Fariña, Nieves Brisaboa, Gonzalo Navarro, Francisco Claude, Ángeles Places, + * and Eduardo Rodríguez. Word-based Self-Indexes for Natural Language Text. ACM + * Transactions on Information Systems (TOIS), 2012. * http://vios.dc.fi.udc.es/indexing/wsi/publications.html - * http://www.dcc.uchile.cl/~gnavarro/ps/tois11.pdf - * + * http://www.dcc.uchile.cl/~gnavarro/ps/tois11.pdf + * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either @@ -46,12 +46,12 @@ #define MED3(a, b, c) (KEY(a)KEY(c) ? (b) : KEY(a)>KEY(c) ? (c) : (a))) - + static int *I, /* group array, ultimately suffix array.*/ *V, /* inverse array, ultimately inverse of I.*/ r, /* number of symbols aggregated by transform.*/ h; /* length of already-sorted prefixes.*/ - + /* Subroutine for select_sort_split and sort_split. Sets group numbers for a group whose lowest position in I is pl and highest position is pm.*/ @@ -102,7 +102,7 @@ static void select_sort_split(int *p, int n) { static int choose_pivot(int *p, int n) { int *pl, *pm, *pn; int s; - + pm=p+(n>>1); /* small arrays, middle element.*/ if (n>7) { pl=p; @@ -185,7 +185,7 @@ static void sort_split(int *p, int n) Output: x is V and p is I after the initial sorting stage of the refined suffix sorting algorithm.*/ - + static void bucketsort(int *x, int *p, int n, int k) { int *pi, i, c, d, g; @@ -222,7 +222,7 @@ static void bucketsort(int *x, int *p, int n, int k) for any symbol during transformation: q must be at least k-l; if q<=n, compaction is guaranteed; if k-l>n, compaction is never done; if q is INT_MAX, the maximum number of symbols are aggregated into one. - + Output: Returns an integer j in the range 1...q representing the size of the new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is set to the number of old symbols grouped into one. Only x[n] is 0.*/ @@ -231,7 +231,7 @@ static int transform(int *x, int *p, int n, int k, int l, int q) { int b, c, d, e, i, j, m, s; int *pi, *pj; - + for (s=0, i=k-l; i; i>>=1) ++s; /* s is number of bits in old symbol.*/ e=INT_MAX>>s; /* e is for overflow checking.*/ @@ -290,7 +290,7 @@ void suffixsort(int *x, int *p, int n, int k, int l) V=x; /* set global values.*/ I=p; - + if (n>=k-l) { /* if bucketing possible,*/ j=transform(V, I, n, k, l, n); bucketsort(V, I, n, j); /* bucketsort on first r positions.*/ @@ -302,7 +302,7 @@ void suffixsort(int *x, int *p, int n, int k, int l) sort_split(I, n+1); /* quicksort on first r positions.*/ } h=r; /* number of symbols aggregated by transform.*/ - + while (*I>=-n) { pi=I; /* pi is first position of group.*/ sl=0; /* sl is negated length of sorted groups.*/ @@ -340,14 +340,14 @@ int buildIntIndex (uint *intVector, uint n, char *build_options, void **index ){ uint nsHUFF; parametersCSA(myicsa, build_options); - + nsHUFF=myicsa->tempNSHUFF; - + // Almacenamos o valor dalguns parametros - + myicsa->suffixArraySize = textSize; - myicsa->D = (uint*) malloc (sizeof(uint) * ((textSize+31)/32)); - + myicsa->D = (uint*) malloc (sizeof(uint) * ((textSize+31)/32)); + myicsa->samplesASize = (textSize + myicsa->T_A - 1) / myicsa->T_A + 1; //--> última pos siempre sampleada! // myicsa->samplesASize = (textSize + myicsa->T_A - 1) / myicsa->T_A ;//+ 1; myicsa->samplesA = (uint *)malloc(sizeof(uint) * myicsa->samplesASize); @@ -372,13 +372,13 @@ int buildIntIndex (uint *intVector, uint n, char *build_options, void **index ){ // Reservamos espacio para os array de sufijos Psi = (uint *) malloc (sizeof(uint) * (textSize+1)); printf("\n\t *BUILDING THE SUFFIX ARRAY over %d integers... (with larsson-sadakane)", textSize);fflush(stdout); - + /* Makes suffix array p of x. x becomes inverse of p. p and x are both of size n+1. Contents of x[0...n-1] are integers in the range l...k-1. Original contents of x[n] is disregarded, the n-th symbol being regarded as end-of-string smaller than all other symbols.*/ //void suffixsort(int *x, int *p, int n, int k, int l) - + //suffixsort((int *)intVector, (int *)Psi, n, vocSize+1, 1); suffixsort((int *)intVector, (int *)Psi, n-1, vocSize+1, 1); //*Psi = *Psi++; // Con esto temos o array de sufixos desde a primeira posición (sen o terminador) @@ -390,12 +390,12 @@ int buildIntIndex (uint *intVector, uint n, char *build_options, void **index ){ //SAI = (uint *) malloc (sizeof(uint) * (textSize + 1)); // +1 para repetir na ultima posición. Evitamos un if //for(i=0;ibBA = createBitmap(myicsa->BA, textSize); for(i=0,j=0; iBA, i)) myicsa->samplesA[j++] = Psi[i]; - + // ALMACENAMOS AS MOSTRAS DA INVERSA DO ARRAY DE SUFIXOS for(i=0,j=0;iT_AInv) myicsa->samplesAInv[j++] = SAI[i]; - + // CONSTRUIMOS E COMPRIMIMOS PSI printf("\n\t Creating compressed Psi..."); for(i=0;ihcPsi = huffmanCompressPsi(Psi,textSize,myicsa->T_Psi,nsHUFF); - #endif - #ifdef PSI_GONZALO + #endif + #ifdef PSI_GONZALO myicsa->gcPsi = gonzaloCompressPsi(Psi,textSize,myicsa->T_Psi,nsHUFF); - #endif + #endif #ifdef PSI_DELTACODES - myicsa->dcPsi = deltaCompressPsi(Psi,textSize,myicsa->T_Psi); + myicsa->dcPsi = deltaCompressPsi(Psi,textSize,myicsa->T_Psi); #endif - - free(Psi); - + + free(Psi); + // Contruimos D - for(i=0;i<((textSize+31)/32);i++) myicsa->D[i] = 0; + for(i=0;i<((textSize+31)/32);i++) myicsa->D[i] = 0; for(i=0;i<=vocSize;i++) bitset(myicsa->D, C[i]); myicsa->bD = createBitmap(myicsa->D,textSize); - free(C); + free(C); // VARIABLE GLOBAL QUE ALMACENA O ESTADO DOS DISPLAYS (IMPORTANTE PARA DISPLAY SECUENCIAL) // Almacena a última posición do array de sufixos que mostramos con display ou displayNext @@ -445,7 +445,7 @@ int buildIntIndex (uint *intVector, uint n, char *build_options, void **index ){ // coa que podemos obter o símbolo pedido, e actualizamos displayState myicsa->displayCSAState = 0; myicsa->displayCSAPrevPosition = -2; //needed by DisplayCSA(position) - + //aintVector = intVector; // Liberamos o espacio non necesario @@ -464,16 +464,16 @@ int sourceLenIntIndex(void *index, uint *numInts){ int saveIntIndex(void *index, char *pathname) { //void storeStructsCSA(ticsa *myicsa, char *basename) { - - ticsa *myicsa = (ticsa *) index; + + ticsa *myicsa = (ticsa *) index; char *basename=pathname; char *filename; int file; - + // Reservamos espacio para o nome do ficheiro filename = (char *)malloc(sizeof(char)*MAX_FILENAME_LENGTH); - + // Ficheiro co n�mero de elementos indexados (enteiros do texto orixinal) strcpy(filename, basename); strcat(filename, "."); @@ -484,26 +484,26 @@ int saveIntIndex(void *index, char *pathname) { exit(0); } write(file, &(myicsa->suffixArraySize), sizeof(int)); - close(file); + close(file); strcpy(filename, basename); strcat(filename, "."); - strcat(filename, PSI_COMPRESSED_FILE_EXT); + strcat(filename, PSI_COMPRESSED_FILE_EXT); #ifdef PSI_HUFFMANRLE - storeHuffmanCompressedPsi(&(myicsa->hcPsi), filename); - #endif + storeHuffmanCompressedPsi(&(myicsa->hcPsi), filename); + #endif #ifdef PSI_GONZALO - storeGonzaloCompressedPsi(&(myicsa->gcPsi), filename); + storeGonzaloCompressedPsi(&(myicsa->gcPsi), filename); #endif #ifdef PSI_DELTACODES storeDeltaCompressedPsi(&(myicsa->dcPsi), filename); #endif - + // Ficheiro co vector de bits D strcpy(filename, basename); strcat(filename, "."); - strcat(filename, D_FILE_EXT); + strcat(filename, D_FILE_EXT); unlink(filename); if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) { printf("Cannot open file %s\n", filename); @@ -519,11 +519,11 @@ int saveIntIndex(void *index, char *pathname) { strcat(filename, "."); strcat(filename, D_RANK_DIRECTORY_FILE_EXT); saveBitmap(filename,myicsa->bD); - + // Ficheiro coas mostras de A strcpy(filename, basename); strcat(filename, "."); - strcat(filename, SAMPLES_A_FILE_EXT); + strcat(filename, SAMPLES_A_FILE_EXT); unlink(filename); if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) { printf("Cannot open file %s\n", filename); @@ -535,7 +535,7 @@ int saveIntIndex(void *index, char *pathname) { // Ficheiro co vector BA (marca as posicions de A muestreadas) strcpy(filename, basename); strcat(filename, "."); - strcat(filename, BA_FILE_EXT); + strcat(filename, BA_FILE_EXT); unlink(filename); if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) { printf("Cannot open file %s\n", filename); @@ -549,7 +549,7 @@ int saveIntIndex(void *index, char *pathname) { strcat(filename, "."); strcat(filename, BA_RANK_DIRECTORY_FILE_EXT); saveBitmap(filename, myicsa->bBA); - + // Ficheiro coas mostras de A inversa strcpy(filename, basename); strcat(filename, "."); @@ -560,7 +560,7 @@ int saveIntIndex(void *index, char *pathname) { exit(0); } write(file, myicsa->samplesAInv, sizeof(int) * (myicsa->samplesAInvSize)); - close(file); + close(file); // Ficheiro co periodo de muestreo de A e A inversa strcpy(filename, basename); @@ -573,11 +573,11 @@ int saveIntIndex(void *index, char *pathname) { } write(file, &(myicsa->T_A), sizeof(int)); write(file, &(myicsa->T_AInv), sizeof(int)); - + write(file, &(myicsa->psiSearchFactorJump), sizeof(uint)); close(file); - free(filename); + free(filename); return 0; //no error. } @@ -594,14 +594,14 @@ int sizeIntIndex(void *index, uint *numBytes) { size += sizeof(uint)*((myicsa->suffixArraySize+31)/32) ; //BA vector size += myicsa->bBA->mem_usage; #ifdef PSI_HUFFMANRLE - size +=myicsa->hcPsi.totalMem; - #endif + size +=myicsa->hcPsi.totalMem; + #endif #ifdef PSI_GONZALO - size +=myicsa->gcPsi.totalMem; + size +=myicsa->gcPsi.totalMem; #endif #ifdef PSI_DELTACODES size +=myicsa->dcPsi.totalMem; - #endif + #endif *numBytes = size; return 0; //no error. } @@ -616,20 +616,20 @@ int loadIntIndex(char *pathname, void **index){ uint length; char c; char *word; - struct stat f_stat; + struct stat f_stat; uint suffixArraySize; ticsa *myicsa; myicsa = (ticsa *) malloc (sizeof (ticsa) * 1); - - + + // VARIABLE GLOBAL QUE ALMACENA O ESTADO DOS DISPLAYS (IMPORTANTE PARA DISPLAY SECUENCIAL) // Almacena a �ltima posici�n do array de sufixos que mostramos con display ou displayNext // Se nos piden un displayNext, aplicamos PSI sobre esta posici�n e obtemos a seguinte, // coa que podemos obter o s�mbolo pedido, e actualizamos displayState myicsa->displayCSAState = 0; myicsa->displayCSAPrevPosition = -2; //needed by DisplayCSA(position) - + // Reservamos espacio para o nome do ficheiro filename = (char *)malloc(sizeof(char)*MAX_FILENAME_LENGTH); @@ -637,34 +637,34 @@ int loadIntIndex(char *pathname, void **index){ strcpy(filename, basename); strcat(filename, "."); strcat(filename, NUMBER_OF_ELEMENTS_FILE_EXT); - if( (file = open(filename, O_RDONLY)) < 0) { + if( (file = open(filename, O_RDONLY)) < 0) { printf("Cannot read file %s\n", filename);exit(0); - } - read(file, &suffixArraySize, sizeof(uint)); + } + read(file, &suffixArraySize, sizeof(uint)); myicsa->suffixArraySize = suffixArraySize; printf("Number of indexed elements (suffix array size) = %d\n", suffixArraySize); - - // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA PSI COMPRIMIDA + + // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA PSI COMPRIMIDA strcpy(filename, basename); strcat(filename, "."); - strcat(filename, PSI_COMPRESSED_FILE_EXT); + strcat(filename, PSI_COMPRESSED_FILE_EXT); #ifdef PSI_HUFFMANRLE - myicsa->hcPsi = loadHuffmanCompressedPsi(filename); - #endif + myicsa->hcPsi = loadHuffmanCompressedPsi(filename); + #endif #ifdef PSI_GONZALO - myicsa->gcPsi = loadGonzaloCompressedPsi(filename); + myicsa->gcPsi = loadGonzaloCompressedPsi(filename); #endif #ifdef PSI_DELTACODES myicsa->dcPsi = loadDeltaCompressedPsi(filename); - #endif - + #endif + // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA D strcpy(filename, basename); strcat(filename, "."); strcat(filename, D_FILE_EXT); if( (file = open(filename, O_RDONLY)) < 0) { printf("Cannot read file %s\n", filename); exit(0); - } + } myicsa->D = (uint *) malloc (sizeof(uint)*((suffixArraySize+31)/32)); read(file, myicsa->D, sizeof(uint)*((suffixArraySize+31)/32)); printf("Bit vector D loaded (%d bits)\n", suffixArraySize); @@ -672,15 +672,15 @@ int loadIntIndex(char *pathname, void **index){ // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA O DIRECTORIO DE RANK1 PARA D strcpy(filename, basename); strcat(filename, "."); - strcat(filename, D_RANK_DIRECTORY_FILE_EXT); + strcat(filename, D_RANK_DIRECTORY_FILE_EXT); myicsa->bD = loadBitmap(filename,myicsa->D,suffixArraySize); - { uint ns, nb; + { uint ns, nb; ns = myicsa->bD->sSize; nb = myicsa->bD->bSize; myicsa->bD->data = myicsa->D; - printf("Rank1 Directory for D loaded (%d superblocks, %d blocks)\n", ns, nb); + printf("Rank1 Directory for D loaded (%d superblocks, %d blocks)\n", ns, nb); } - + // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA SAMPLES A strcpy(filename, basename); strcat(filename, "."); @@ -690,19 +690,19 @@ int loadIntIndex(char *pathname, void **index){ } if( fstat(file, &f_stat) < 0) { printf("Cannot read information from file %s\n", filename); exit(0); - } + } myicsa->samplesASize = (f_stat.st_size)/sizeof(uint); myicsa->samplesA = (uint *)malloc(sizeof(uint) * myicsa->samplesASize); - read(file, myicsa->samplesA, sizeof(uint) * myicsa->samplesASize); - printf("Suffix array samples loaded (%d samples)\n", myicsa->samplesASize); - + read(file, myicsa->samplesA, sizeof(uint) * myicsa->samplesASize); + printf("Suffix array samples loaded (%d samples)\n", myicsa->samplesASize); + // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA BA strcpy(filename, basename); strcat(filename, "."); strcat(filename, BA_FILE_EXT); if( (file = open(filename, O_RDONLY)) < 0) { printf("Cannot read file %s\n", filename); exit(0); - } + } myicsa->BA = (uint *) malloc (sizeof(uint)*((suffixArraySize+31)/32)); read(file, myicsa->BA, sizeof(uint)*((suffixArraySize+31)/32)); printf("Bit vector BA loaded (%d bits)\n", suffixArraySize); @@ -710,13 +710,13 @@ int loadIntIndex(char *pathname, void **index){ // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA O DIRECTORIO DE RANK1 PARA BA strcpy(filename, basename); strcat(filename, "."); - strcat(filename, BA_RANK_DIRECTORY_FILE_EXT); + strcat(filename, BA_RANK_DIRECTORY_FILE_EXT); myicsa->bBA = loadBitmap(filename,myicsa->BA,suffixArraySize); - { uint ns, nb; + { uint ns, nb; ns = myicsa->bBA->sSize; nb = myicsa->bBA->bSize; myicsa->bBA->data = myicsa->BA; - printf("Rank1 Directory for BA loaded (%d superblocks, %d blocks)\n", ns, nb); + printf("Rank1 Directory for BA loaded (%d superblocks, %d blocks)\n", ns, nb); } // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA SAMPLES A INVERSA @@ -728,26 +728,26 @@ int loadIntIndex(char *pathname, void **index){ } if( fstat(file, &f_stat) < 0) { printf("Cannot read information from file %s\n", filename); exit(0); - } + } myicsa->samplesAInvSize = (f_stat.st_size)/(sizeof(uint)); myicsa->samplesAInv = (uint *)malloc(sizeof(uint) * myicsa->samplesAInvSize); - read(file, myicsa->samplesAInv, sizeof(uint) * myicsa->samplesAInvSize); + read(file, myicsa->samplesAInv, sizeof(uint) * myicsa->samplesAInvSize); printf("Suffix array inverse samples loaded (%d samples)\n", myicsa->samplesAInvSize); - + // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA O PERIODO DE MUESTREO DO ARRAY DE SUFIXOS E DA INVERSA strcpy(filename, basename); strcat(filename, "."); strcat(filename, SAMPLING_PERIOD_A_FILE_EXT); if( (file = open(filename, O_RDONLY)) < 0) { printf("Cannot read file %s\n", filename); exit(0); - } + } read(file, &(myicsa->T_A), sizeof(uint)); - read(file, &(myicsa->T_AInv), sizeof(uint)); - printf("Sampling A Period T = %d, Sampling A inv Period TInv = %d\n", myicsa->T_A, myicsa->T_AInv); - + read(file, &(myicsa->T_AInv), sizeof(uint)); + printf("Sampling A Period T = %d, Sampling A inv Period TInv = %d\n", myicsa->T_A, myicsa->T_AInv); + read(file, &(myicsa->psiSearchFactorJump), sizeof(uint)); - printf("Psi Bin Search Factor-Jump is = %d\n", myicsa->psiSearchFactorJump); - + printf("Psi Bin Search Factor-Jump is = %d\n", myicsa->psiSearchFactorJump); + close(file); free(filename); @@ -755,21 +755,21 @@ int loadIntIndex(char *pathname, void **index){ *index = myicsa; return 0; } - -//uint destroyStructsCSA(ticsa *myicsa) { -int freeIntIndex(void *index) { + +//uint destroyStructsCSA(ticsa *myicsa) { +int freeIntIndex(void *index) { ticsa *myicsa = (ticsa *) index; // Liberamos o espacio reservado - + if (!myicsa) return 0; - + size_t total=0, totaltmp=0; - + uint nbytes;sizeIntIndex(index, &nbytes); - + total +=(sizeof (ticsa) * 1);; - + #ifdef PSI_HUFFMANRLE total+= totaltmp = myicsa->hcPsi.totalMem; destroyHuffmanCompressedPsi(&(myicsa->hcPsi)); @@ -780,31 +780,31 @@ int freeIntIndex(void *index) { #endif #ifdef PSI_DELTACODES total+= totaltmp = myicsa->dcPsi.totalMem; - destroyDeltaCodesCompressedPsi(&(myicsa->dcPsi)); + destroyDeltaCodesCompressedPsi(&(myicsa->dcPsi)); #endif printf("\n\t[destroying SA: compressed PSI structure] ...Freed %zu bytes... RAM", totaltmp); - - free(myicsa->D); total+= totaltmp = (sizeof(uint)*((myicsa->suffixArraySize+31)/32)); + + free(myicsa->D); total+= totaltmp = (sizeof(uint)*((myicsa->suffixArraySize+31)/32)); printf("\n\t[destroying SA: D vector] ...Freed %zu bytes... RAM",totaltmp); - free(myicsa->samplesA); total+= totaltmp = (sizeof(uint) * myicsa->samplesASize); + free(myicsa->samplesA); total+= totaltmp = (sizeof(uint) * myicsa->samplesASize); printf("\n\t[destroying Samples A: A ] ...Freed %zu bytes... RAM",totaltmp); - free(myicsa->samplesAInv); total+= totaltmp = (sizeof(uint) * myicsa->samplesAInvSize); - printf("\n\t[destroying Samples AInv: A ] ...Freed %zubytes... RAM",totaltmp); + free(myicsa->samplesAInv); total+= totaltmp = (sizeof(uint) * myicsa->samplesAInvSize); + printf("\n\t[destroying Samples AInv: A ] ...Freed %zubytes... RAM",totaltmp); printf("\n\t[destroying rank bit D ] ...Freed %zu bytes... RAM", (size_t)myicsa->bD->mem_usage); - free(myicsa->BA); total+= totaltmp = (sizeof(uint)*((myicsa->suffixArraySize+31)/32)); + free(myicsa->BA); total+= totaltmp = (sizeof(uint)*((myicsa->suffixArraySize+31)/32)); printf("\n\t[destroying SA: BA vector] ...Freed %zu bytes... RAM",totaltmp); - + total += myicsa->bD->mem_usage; - destroyBitmap(myicsa->bD); + destroyBitmap(myicsa->bD); total += myicsa->bBA->mem_usage; destroyBitmap(myicsa->bBA); - + printf("\n\t**** [the whole iCSA ocuppied ... %zu bytes... RAM",total); printf("\n\t**** iCSA size = %zu bytes ", (size_t) nbytes); printf("\n"); - + free(myicsa); - + return 0; //no error. } @@ -812,11 +812,11 @@ int freeIntIndex(void *index) { int printInfoIntIndex(void *index, const char tab[]) { ticsa *myicsa = (ticsa *) index; if (!myicsa) return 0; - + uint structure, totalpsi, totalD, totalBD, totalSA,totalSAinv, totalBA,totalBBA; - + structure=sizeof(ticsa); - + #ifdef PSI_HUFFMANRLE totalpsi = myicsa->hcPsi.totalMem; #endif @@ -829,15 +829,15 @@ int printInfoIntIndex(void *index, const char tab[]) { totalD = (sizeof(uint)*((myicsa->suffixArraySize+31)/32)); totalBD = myicsa->bD->mem_usage; - totalSA = (sizeof(uint) * myicsa->samplesASize); - totalSAinv = (sizeof(uint) * myicsa->samplesAInvSize); + totalSA = (sizeof(uint) * myicsa->samplesASize); + totalSAinv = (sizeof(uint) * myicsa->samplesAInvSize); totalBA = (sizeof(uint)*((myicsa->suffixArraySize+31)/32)); totalBBA = myicsa->bBA->mem_usage; - + uint nbytes; sizeIntIndex(index, &nbytes); //whole self-index - - printf("\n ===================================================:"); - printf("\n%sSummary Self-index on integers (icsa) layer:",tab); + + printf("\n ===================================================:"); + printf("\n%sSummary Self-index on integers (icsa) layer:",tab); printf("\n%s icsa structure = %d bytes",tab, structure); printf("\n%s psi = %8d bytes",tab, totalpsi); printf("\n%s D (bitmap) = %8d bytes",tab, totalD); @@ -848,7 +848,7 @@ int printInfoIntIndex(void *index, const char tab[]) { printf("\n%s rank for BA = %8d bytes",tab, totalBBA); printf("\n%sTotal = ** %9d bytes (in RAM) ** ",tab, nbytes); printf("\n"); - + return 0; //no error. } @@ -862,25 +862,25 @@ int printInfoIntIndex(void *index, const char tab[]) { // b) O pivote é unha ocorrencia do patrón -> Faise unha busca exponencial sobre mostras hacia a esquerda e outra hacia a dereita, ata atopar a unha mostra á esquerda e outra // á dereita do intervalo de ocorrencias. Entre cada unha destas e a inmediatamente anterior da busca exponencial, faise unha busca binaria para atopar os extremos do intervalo. -int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right){ +int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right){ //unsigned int countCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right) { uint patternSize = length; ticsa *myicsa = (ticsa *) index; - + register unsigned long l, r, i; register long comp, p, previousP; //register unsigned int l, r, i; //register int comp, p, previousP; register uint bin_search_psi_skip_interval = myicsa->psiSearchFactorJump; - + //fprintf(stderr,"\n psiSearchFactor = %d",myicsa->psiSearchFactorJump); - - l = 0; + + l = 0; r = (myicsa->suffixArraySize+bin_search_psi_skip_interval-2)/bin_search_psi_skip_interval*bin_search_psi_skip_interval; p = ((l+r)/2)/bin_search_psi_skip_interval * bin_search_psi_skip_interval; previousP = 0; - + // BUSCA BINARIA SOBRE MOSTRAS while( (p != previousP) && (comp = SadCSACompare(myicsa, pattern, patternSize, p)) ) { if(comp > 0) l = p; @@ -890,13 +890,13 @@ int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong } if(p==previousP) { - + // BUSCA BINARIA ENTRE O PIVOTE E A SEGUINTE MOSTRA - l = previousP; + l = previousP; r = previousP+bin_search_psi_skip_interval; if(r > myicsa->suffixArraySize) r = myicsa->suffixArraySize - 1; while(l < r) { - p = (l+r)/2; + p = (l+r)/2; if(SadCSACompare(myicsa, pattern, patternSize, p) <= 0) r = p; else l = p+1; } @@ -909,21 +909,21 @@ int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong return 0; //no error. } *left = r; - - l = previousP; + + l = previousP; r = previousP+bin_search_psi_skip_interval; if(r > myicsa->suffixArraySize) r = myicsa->suffixArraySize - 1; while(l < r) { p = (l+r+1)/2; if(SadCSACompare(myicsa, pattern, patternSize, p) >= 0) l = p; - else r = p-1; + else r = p-1; } *right = l; - + } else { - + previousP = p; // En previousP poñemos o p atopado na busca sobre as mostras - + // BUSCA EXPONENCIAL HACIA ATRÁS i = 1; p -= bin_search_psi_skip_interval; @@ -934,49 +934,49 @@ int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong // Busca binaria entre as duas ultimas mostras da exponencial if(previousP > i*bin_search_psi_skip_interval) l = previousP-(i*bin_search_psi_skip_interval); else l=0; - i>>=1; + i>>=1; r = previousP-(i*bin_search_psi_skip_interval); while(l < r) { - p = (l+r)/2; + p = (l+r)/2; if(SadCSACompare(myicsa, pattern, patternSize, p) <= 0) r = p; else l = p+1; } *left = r; - + // BUSCA EXPONENCIAL HACIA ADIANTE i = 1; p = previousP+bin_search_psi_skip_interval; while(psuffixArraySize && !SadCSACompare(myicsa, pattern, patternSize, p)) { i<<=1; - p = previousP+(i*bin_search_psi_skip_interval); + p = previousP+(i*bin_search_psi_skip_interval); } // Busca binaria entre as duas ultimas mostras da exponencial if(p < myicsa->suffixArraySize) r = previousP+(i*bin_search_psi_skip_interval); else r = myicsa->suffixArraySize-1; - i>>=1; + i>>=1; l = previousP+(i*bin_search_psi_skip_interval); while(l < r) { p = (l+r+1)/2; if(SadCSACompare(myicsa, pattern, patternSize, p) >= 0) l = p; - else r = p-1; + else r = p-1; } - *right = l; + *right = l; } //return *right-*left+1; *numocc = (uint) *right-*left+1; - return 0; //no error + return 0; //no error } // Version inicial de busca binaria unsigned int countCSABin(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right) { register ulong l, r, p; - l = 0; + l = 0; r = myicsa->suffixArraySize-1; while(l < r) { - p = (l+r)/2; + p = (l+r)/2; if(SadCSACompare(myicsa, pattern, patternSize, p) <= 0) r = p; else l = p+1; } @@ -987,36 +987,36 @@ unsigned int countCSABin(ticsa *myicsa, uint *pattern, uint patternSize, uint *l *right = r; return 0; } - + // Almacenamos o limite esquerdo *left = r; // SE SON IGUALES (O PATRON APARECE NO TEXTO), BUSCAMOS AGORA O LIMITE DEREITO, QUE ALMACENAREMOS EN right // NOTA: INICIAMOS A BUSQUEDA A PARTIR DO ESQUERDO... - l = r; + l = r; r = myicsa->suffixArraySize-1; - + while(l < r) { p = (l+r+1)/2; if(SadCSACompare(myicsa, pattern, patternSize, p) >= 0) l = p; - else r = p-1; + else r = p-1; } - + // Gardamos o limite dereito *right = l; - - return (uint) *right-*left+1; + + return (uint) *right-*left+1; } int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong *numocc) { //unsigned int *locateCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *occ) { - + ticsa *myicsa = (ticsa *) index; uint patternSize = length; ulong *positions; ulong occurrences; register ulong left, right; - + //occurrences = countCSA(myicsa, pattern, patternSize, &left, &right); int err; err = countIntIndex((void *) myicsa, pattern, patternSize, &occurrences, &left, &right); @@ -1025,16 +1025,16 @@ int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong * if (occurrences) { register ulong idx = 0; positions = (ulong*) malloc(sizeof(ulong) * occurrences); - while(left<=right) positions[idx++] = A(myicsa,left++); - + while(left<=right) positions[idx++] = A(myicsa,left++); + *occ = positions; - return 0; + return 0; //return positions; } - + (*occ)=NULL; return 0; //no error, but no occurrences. - + //return NULL; } @@ -1060,46 +1060,46 @@ int displayIntIndex(void *index, ulong position, uint *value){ /**********************************************************************/ -// Devolve o enteiro do texto que ocupa a posicion dada, e fixa o estado +// Devolve o enteiro do texto que ocupa a posicion dada, e fixa o estado // para poder seguir obtendo os seguintes enteiros con displayNext(); uint displayCSAFirst(ticsa *myicsa, uint position) { register uint positionAux, index; register uint T_AInv = myicsa->T_AInv; - + positionAux = myicsa->samplesAInv[position / T_AInv]; for(index = 0; index < position%T_AInv; index++) { #ifdef PSI_HUFFMANRLE positionAux=getHuffmanPsiValue(&(myicsa->hcPsi),positionAux); - #endif + #endif #ifdef PSI_GONZALO positionAux=getGonzaloPsiValue(&(myicsa->gcPsi),positionAux); #endif #ifdef PSI_DELTACODES positionAux=getDeltaPsiValue(&(myicsa->dcPsi),positionAux); - #endif + #endif } - + // Fijamos a variable global para a chamada a displayNext myicsa->displayCSAState = positionAux; - + // return rank1(D, Dir, positionAux) - 1; return rank(myicsa->bD, positionAux) - 1; } // Devolve o seguinte enteiro do texto (a partir do estado) -unsigned int displayCSANext(ticsa *myicsa) { +unsigned int displayCSANext(ticsa *myicsa) { #ifdef PSI_HUFFMANRLE myicsa->displayCSAState=getHuffmanPsiValue(&(myicsa->hcPsi),myicsa->displayCSAState); - #endif + #endif #ifdef PSI_GONZALO myicsa->displayCSAState = getGonzaloPsiValue(&(myicsa->gcPsi),myicsa->displayCSAState); #endif #ifdef PSI_DELTACODES myicsa->displayCSAState = getDeltaPsiValue(&(myicsa->dcPsi),myicsa->displayCSAState); - #endif - return rank(myicsa->bD, myicsa->displayCSAState) - 1; + #endif + return rank(myicsa->bD, myicsa->displayCSAState) - 1; } @@ -1110,10 +1110,10 @@ void showStructsCSA(ticsa *myicsa) { // ESTRUCTURAS PARA CSA printf("Basic CSA structures:\n\n"); - + // VALORES DA FUNCI�N PSI (decodificando) printf("\tPSI: (Sampling period = %d)\n", myicsa->T_Psi); - for(index=0; index < myicsa->suffixArraySize; index++) + for(index=0; index < myicsa->suffixArraySize; index++) #ifdef PSI_HUFFMANRLE printf("\tPsi[%d] = %d\n", index, getHuffmanPsiValue(&(myicsa->hcPsi),index)); #endif @@ -1121,10 +1121,10 @@ void showStructsCSA(ticsa *myicsa) { printf("\tPsi[%d] = %d\n", index, getGonzaloPsiValue(&(myicsa->gcPsi),index)); #endif #ifdef PSI_DELTACODES - printf("\tPsi[%d] = %d\n", index, getDeltaPsiValue(&(myicsa->dcPsi),index)); - #endif + printf("\tPsi[%d] = %d\n", index, getDeltaPsiValue(&(myicsa->dcPsi),index)); + #endif printf("\n"); - + // VECTOR D DE SADAKANE CO DIRECTORIO DE RANK ASOCIADO printf("\tD = "); showBitVector(myicsa->D, myicsa->suffixArraySize); @@ -1138,24 +1138,24 @@ void showStructsCSA(ticsa *myicsa) { printf("\tDs[%d] = %d\n", index, myicsa->bD->sdata[index]); } printf("\nBloques de D:\n"); - + for(index=0; indexbD->bdata[index]); - } + printf("\tDb[%d] = %d\n", index, myicsa->bD->bdata[index]); + } printf("\n\n"); } // ESTRUCTURAS PARA ACCEDER O ARRAY DE SUFIXOS E A SUA INVERSA printf("Suffix Array Sampling Structures: (Sampling period = %d)\n", myicsa->T_A); printf("\tSuffix Array Samples:\n"); - for(index=0; index < myicsa->samplesASize; index++) + for(index=0; index < myicsa->samplesASize; index++) printf("\tSamplesA[%d] = %d\n", index, myicsa->samplesA[index]); - printf("\n"); + printf("\n"); printf("\tInverse Suffix Array Samples:\n"); - for(index=0; index < myicsa->samplesASize; index++) + for(index=0; index < myicsa->samplesASize; index++) printf("\tSamplesAInv[%d] = %d\n", index, myicsa->samplesAInv[index]); printf("\n"); - + } @@ -1164,27 +1164,27 @@ void showStructsCSA(ticsa *myicsa) { int SadCSACompare(ticsa *myicsa, uint *pattern, uint patternSize, uint p) { register unsigned int j, i, currentInteger, diff; - + i = p; j = 0; - + while(1) { currentInteger = rank(myicsa->bD, i) - 1; diff = pattern[j++] - currentInteger; if(diff) return diff; if(j == patternSize) return 0; - else + else #ifdef PSI_HUFFMANRLE i=getHuffmanPsiValue(&(myicsa->hcPsi),i); #endif #ifdef PSI_GONZALO i=getGonzaloPsiValue(&(myicsa->gcPsi),i); - #endif + #endif #ifdef PSI_DELTACODES i=getDeltaPsiValue(&(myicsa->dcPsi),i); #endif } - + } @@ -1193,15 +1193,15 @@ inline uint A(ticsa *myicsa, uint position) { register uint timesPsi, sampleValue; register uint T_A = myicsa->T_A; - + uint proba = position; - + timesPsi = 0; while(!bitget(myicsa->BA, position)) { - + #ifdef PSI_HUFFMANRLE position=getHuffmanPsiValue(&(myicsa->hcPsi),position); - #endif + #endif #ifdef PSI_GONZALO position=getGonzaloPsiValue(&(myicsa->gcPsi),position); #endif @@ -1209,10 +1209,10 @@ inline uint A(ticsa *myicsa, uint position) { position=getDeltaPsiValue(&(myicsa->dcPsi),position); #endif timesPsi++; - + } sampleValue = myicsa->samplesA[rank(myicsa->bBA, position)-1]; - + return sampleValue - timesPsi; } @@ -1223,65 +1223,65 @@ inline uint inverseA(ticsa *myicsa, uint offset) { register uint index, inverseValue; register uint T_AInv = myicsa->T_AInv; - + inverseValue = myicsa->samplesAInv[offset/T_AInv]; - for(index=0; index<(offset%T_AInv); index++) + for(index=0; index<(offset%T_AInv); index++) #ifdef PSI_HUFFMANRLE inverseValue=getHuffmanPsiValue(&(myicsa->hcPsi),inverseValue); - #endif + #endif #ifdef PSI_GONZALO inverseValue = getGonzaloPsiValue(&(myicsa->gcPsi),inverseValue); - #endif + #endif #ifdef PSI_DELTACODES inverseValue = getDeltaPsiValue(&(myicsa->dcPsi),inverseValue); #endif return inverseValue; - + } // Initializes the parameters of the index. -uint parametersCSA(ticsa *myicsa, char *build_options){ +uint parametersCSA(ticsa *myicsa, char *build_options){ char delimiters[] = " =;"; int j,num_parameters; char ** parameters; int ssA,ssAinv,ssPsi,nsHuff,psiSearchFactor; - + ssA = DEFAULT_A_SAMPLING_PERIOD; ssAinv = DEFAULT_A_INV_SAMPLING_PERIOD; ssPsi = DEFAULT_PSI_SAMPLING_PERIOD; nsHuff = DEFAULT_nsHUFF; psiSearchFactor = DEFAULT_PSI_BINARY_SEARCH_FACTOR; - + if (build_options != NULL) { parse_parameters(build_options,&num_parameters, ¶meters, delimiters); for (j=0; jT_A = ssA; myicsa->T_AInv = ssAinv; myicsa->T_Psi = ssPsi; myicsa->tempNSHUFF = nsHuff; - myicsa->psiSearchFactorJump = psiSearchFactor * ssPsi; + myicsa->psiSearchFactorJump = psiSearchFactor * ssPsi; printf("\n\t parameters for iCSA: sampleA=%d, sampleAinv=%d, samplePsi=%d",ssA,ssAinv,ssPsi); printf("\n\t : nsHuff=%d, psiSearchFactor = %d --> jump = %d", nsHuff,psiSearchFactor, myicsa->psiSearchFactorJump); diff --git a/swcsa/intIndex/icsa.h b/swcsa/intIndex/icsa.h index 6d4dc3d..6114a35 100755 --- a/swcsa/intIndex/icsa.h +++ b/swcsa/intIndex/icsa.h @@ -1 +1 @@ -/* icsa.h * Copyright (C) 2011, Antonio Fariña and Eduardo Rodriguez, all rights reserved. * * icsa.h: Definition of the functions of the interface "../intIndex/interfaceIntIndex.h" * that permits to represent a sequence of uint32 integers with an iCSA: * An integer-oriented Compressed Suffix Array. * Such representation will be handled as a "ticsa" data structure, that * the WCSA self-index will use (as an opaque type) to * create/save/load/search/recover/getSize of the representation of the * original sequence. * Suffix sorting is done via Larson/Sadakane suffix sort algorithm * from the char-based csa. * * Ref: Larsson, N. J. and Sadakane, K. 2007. Faster suffix sorting. Theoretical * Computer Science 387, 3, 258–272. * * See more details in: * Antonio Fariña, Nieves Brisaboa, Gonzalo Navarro, Francisco Claude, Ángeles Places, * and Eduardo Rodríguez. Word-based Self-Indexes for Natural Language Text. ACM * Transactions on Information Systems (TOIS), 2012. * http://vios.dc.fi.udc.es/indexing/wsi/publications.html * http://www.dcc.uchile.cl/~gnavarro/ps/tois11.pdf * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include #include #include #include #include "../intIndexUtils/defValues.h" #include "../utils/bitmap.h" #include "../utils/huff.h" #include "../utils/parameters.h" #ifdef PSI_HUFFMANRLE #include "../intIndexUtils/psiHuffmanRLE.h" #endif #ifdef PSI_GONZALO #include "../intIndexUtils/psiGonzalo.h" #endif #ifdef PSI_DELTACODES #include "../intIndexUtils/psiDeltaCode.h" #endif typedef struct { uint suffixArraySize; uint T_Psi; uint *D; bitmap bD; uint T_A; uint T_AInv; uint *samplesA; uint samplesASize; uint *BA; bitmap bBA; uint *samplesAInv; uint samplesAInvSize; uint displayCSAState; long displayCSAPrevPosition; #ifdef PSI_HUFFMANRLE HuffmanCompressedPsi hcPsi; #endif #ifdef PSI_GONZALO GonzaloCompressedPsi gcPsi; #endif #ifdef PSI_DELTACODES DeltaCompressedPsi dcPsi; #endif //only needed during "parse_parameters". uint tempNSHUFF; uint psiSearchFactorJump; //factor of the T_Psi value. } ticsa; // FUNCTION PROTOTYPES: BUILDING THE INDEX //Creates the ICSA int buildIntIndex (uint *intVector, uint n, char *build_options, void **index ); //ticsa *createIntegerCSA (uint **aintVector, uint SAsize, char *build_options); //Returns number of elements in the indexed sequence of integers int sourceLenIntIndex(void *index, uint *numInts); //Save the index to disk int saveIntIndex(void *index, char *pathname); //void storeStructsCSA(ticsa *myicsa, char *basename); // Loads the index from disk. int loadIntIndex(char *pathname, void **index); //ticsa *loadCSA(char *basename); // Frees memory int freeIntIndex(void *index); //uint destroyStructsCSA(ticsa *myicsa); //Returns the size (in bytes) of the index over the sequence of integers. int sizeIntIndex(void *index, uint *numBytes); //uint CSA_size(ticsa *myicsa); // Shows detailed summary info of the self-index (memory usage of each structure) int printInfoIntIndex(void *index, const char tab[]); //Number of occurrences of the pattern, and the interval [left,right] in the suffix array. int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right); //uint countCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right); // Exponential search //uint countCSABin(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right); // Binary search // Returns an array with integers corresponding offsets to the occurrences of the pattern, // as well as the number of occurrences int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong *numocc); //uint *locateCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *occ); //Returns the value of the source (array of integers) at a given offset. // (that is, the element "position" from the original array of uints) int displayIntIndex(void *index, ulong position, uint *value); //uint displayCSA(ticsa *myicsa, uint position); /* Private function prototypes ********************************************/ uint parametersCSA(ticsa *myicsa, char *build_options); uint displayCSAFirst(ticsa *myicsa, uint position); uint displayCSANext(ticsa *myicsa); int SadCSACompare(ticsa *myicsa, uint *pattern, uint patternSize, uint p); uint A(ticsa *myicsa, uint position); uint inverseA(ticsa *myicsa, uint offset); void showStructsCSA(ticsa *myicsa); // For Debugging \ No newline at end of file +/* icsa.h * Copyright (C) 2011, Antonio Fariña and Eduardo Rodriguez, all rights reserved. * * icsa.h: Definition of the functions of the interface "../intIndex/interfaceIntIndex.h" * that permits to represent a sequence of uint32 integers with an iCSA: * An integer-oriented Compressed Suffix Array. * Such representation will be handled as a "ticsa" data structure, that * the WCSA self-index will use (as an opaque type) to * create/save/load/search/recover/getSize of the representation of the * original sequence. * Suffix sorting is done via Larson/Sadakane suffix sort algorithm * from the char-based csa. * * Ref: Larsson, N. J. and Sadakane, K. 2007. Faster suffix sorting. Theoretical * Computer Science 387, 3, 258–272. * * See more details in: * Antonio Fariña, Nieves Brisaboa, Gonzalo Navarro, Francisco Claude, Ángeles Places, * and Eduardo Rodríguez. Word-based Self-Indexes for Natural Language Text. ACM * Transactions on Information Systems (TOIS), 2012. * http://vios.dc.fi.udc.es/indexing/wsi/publications.html * http://www.dcc.uchile.cl/~gnavarro/ps/tois11.pdf * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include #include #include #include #include "../intIndex/defValues.h" #include "../utils/bitmap.h" #include "../utils/huff.h" #include "../utils/parameters.h" #include "../utils/basics.h" #ifdef PSI_HUFFMANRLE #include "../intIndex/psiHuffmanRLE.h" #endif #ifdef PSI_GONZALO #include "../intIndex/psiGonzalo.h" #endif #ifdef PSI_DELTACODES #include "../intIndex/psiDeltaCode.h" #endif typedef struct { uint suffixArraySize; uint T_Psi; uint *D; bitmap bD; uint T_A; uint T_AInv; uint *samplesA; uint samplesASize; uint *BA; bitmap bBA; uint *samplesAInv; uint samplesAInvSize; uint displayCSAState; long displayCSAPrevPosition; #ifdef PSI_HUFFMANRLE HuffmanCompressedPsi hcPsi; #endif #ifdef PSI_GONZALO GonzaloCompressedPsi gcPsi; #endif #ifdef PSI_DELTACODES DeltaCompressedPsi dcPsi; #endif //only needed during "parse_parameters". uint tempNSHUFF; uint psiSearchFactorJump; //factor of the T_Psi value. } ticsa; // FUNCTION PROTOTYPES: BUILDING THE INDEX //Creates the ICSA int buildIntIndex (uint *intVector, uint n, char *build_options, void **index ); //ticsa *createIntegerCSA (uint **aintVector, uint SAsize, char *build_options); //Returns number of elements in the indexed sequence of integers int sourceLenIntIndex(void *index, uint *numInts); //Save the index to disk int saveIntIndex(void *index, char *pathname); //void storeStructsCSA(ticsa *myicsa, char *basename); // Loads the index from disk. int loadIntIndex(char *pathname, void **index); //ticsa *loadCSA(char *basename); // Frees memory int freeIntIndex(void *index); //uint destroyStructsCSA(ticsa *myicsa); //Returns the size (in bytes) of the index over the sequence of integers. int sizeIntIndex(void *index, uint *numBytes); //uint CSA_size(ticsa *myicsa); // Shows detailed summary info of the self-index (memory usage of each structure) int printInfoIntIndex(void *index, const char tab[]); //Number of occurrences of the pattern, and the interval [left,right] in the suffix array. int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right); //uint countCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right); // Exponential search //uint countCSABin(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right); // Binary search // Returns an array with integers corresponding offsets to the occurrences of the pattern, // as well as the number of occurrences int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong *numocc); //uint *locateCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *occ); //Returns the value of the source (array of integers) at a given offset. // (that is, the element "position" from the original array of uints) int displayIntIndex(void *index, ulong position, uint *value); //uint displayCSA(ticsa *myicsa, uint position); /* Private function prototypes ********************************************/ uint parametersCSA(ticsa *myicsa, char *build_options); uint displayCSAFirst(ticsa *myicsa, uint position); uint displayCSANext(ticsa *myicsa); int SadCSACompare(ticsa *myicsa, uint *pattern, uint patternSize, uint p); uint A(ticsa *myicsa, uint position); uint inverseA(ticsa *myicsa, uint offset); void showStructsCSA(ticsa *myicsa); // For Debugging \ No newline at end of file diff --git a/swcsa/intIndex/psiDeltaCode.h b/swcsa/intIndex/psiDeltaCode.h index 1f955c8..b3c277a 100644 --- a/swcsa/intIndex/psiDeltaCode.h +++ b/swcsa/intIndex/psiDeltaCode.h @@ -1,6 +1,6 @@ #include #include -#include +//#include #include #include @@ -11,8 +11,8 @@ typedef struct { unsigned int T; unsigned int negativeGap; unsigned int deltaCodesSize; // En palabras - unsigned int *deltaCodes; - unsigned int numberOfSamples; + unsigned int *deltaCodes; + unsigned int numberOfSamples; unsigned int *samples; unsigned int *pointers; unsigned int totalMem; // the size in bytes used; @@ -23,9 +23,9 @@ typedef struct { DeltaCompressedPsi deltaCompressPsi(unsigned int *Psi, unsigned int psiSize, unsigned int T); int getDeltaPsiValue(DeltaCompressedPsi *cPsi, unsigned int position); void storeDeltaCompressedPsi(DeltaCompressedPsi *compressedPsi, char *filename); -DeltaCompressedPsi loadDeltaCompressedPsi(char *filename); +DeltaCompressedPsi loadDeltaCompressedPsi(char *filename); void destroyDeltaCodesCompressedPsi(DeltaCompressedPsi *compressedPsi); - + // IMPLEMENTACI�N DAS FUNCI�NS //void destroyDeltaCodesCompressedPsi(DeltaCompressedPsi *compressedPsi) { @@ -34,54 +34,54 @@ void destroyDeltaCodesCompressedPsi(DeltaCompressedPsi *compressedPsi); // free(compressedPsi->pointers); //} // -// +// //DeltaCompressedPsi deltaCompressPsi(unsigned int *Psi, unsigned int psiSize, unsigned int T) { // // DeltaCompressedPsi cPsi; -// +// // int numberOfSamples; -// register int diff, deltaCodesPos; +// register int diff, deltaCodesPos; // register unsigned int k, p, aux, diffpositive, code, index; // unsigned int samplesIndex, codeLenght, currentInput, wordsDeltaCodes, totalSize; // unsigned int *deltaCodes; // unsigned int *samples; // unsigned int *pointers; -// +// // // Auxiliar para deltaCodes (estimamos como espacio maximo o do array de sufixos) -// unsigned int *deltaCodesAux; -// +// unsigned int *deltaCodesAux; +// // // Calculamos o mellor valor para negativeGap <= 64 // unsigned int negativeGap; // register unsigned int maxNegativeBits = 0; // k = psiSize; // while(k) { // k >>= 1; -// maxNegativeBits++; +// maxNegativeBits++; // } // if(maxNegativeBits<=26) negativeGap = 64; -// else negativeGap = 1<<(32-maxNegativeBits); -// +// else negativeGap = 1<<(32-maxNegativeBits); +// // // Reservamos espacio para as estructuras // numberOfSamples = (psiSize + T - 1) / T; // samples = (unsigned int *)malloc(sizeof(int)*numberOfSamples); // pointers = (unsigned int *)malloc(sizeof(int)*numberOfSamples); -// +// // deltaCodesAux = (unsigned int *)malloc(sizeof(int)*psiSize); -// for(index=0; index0) diffpositive = (negativeGap*diff-1)/(negativeGap-1); // else diffpositive = -negativeGap*diff; -// +// // k = 0; // aux = diffpositive; // while(aux) { @@ -94,48 +94,48 @@ void destroyDeltaCodesCompressedPsi(DeltaCompressedPsi *compressedPsi); // aux >>= 1; // p++; // } -// +// // code = diffpositive & ((1<<(k-1))-1); -// codeLenght = 2*p+k-2; +// codeLenght = 2*p+k-2; // // // Primeiro metemos os p-1 0's iniciais // deltaCodesPos += p-1; -// +// // // Agora metemos os p bits de k -// if( ((deltaCodesPos%32) + p) > 32 ) { +// if( ((deltaCodesPos%32) + p) > 32 ) { // deltaCodesAux[deltaCodesPos/32] |= (k>>((deltaCodesPos%32)+p-32)); -// deltaCodesAux[deltaCodesPos/32+1] = (k<<(64-(deltaCodesPos%32)-p)); +// deltaCodesAux[deltaCodesPos/32+1] = (k<<(64-(deltaCodesPos%32)-p)); // } else { -// deltaCodesAux[deltaCodesPos/32] |= (k<<(32-p-(deltaCodesPos%32))); +// deltaCodesAux[deltaCodesPos/32] |= (k<<(32-p-(deltaCodesPos%32))); // } // deltaCodesPos += p; -// +// // // Por �ltimo metemos os k-1 bits de code (sen o 1 inicial) -// if( ((deltaCodesPos%32) + (k-1)) > 32 ) { +// if( ((deltaCodesPos%32) + (k-1)) > 32 ) { // deltaCodesAux[deltaCodesPos/32] |= (code>>((deltaCodesPos%32)+(k-1)-32)); -// deltaCodesAux[deltaCodesPos/32+1] = (code<<(64-(deltaCodesPos%32)-(k-1))); +// deltaCodesAux[deltaCodesPos/32+1] = (code<<(64-(deltaCodesPos%32)-(k-1))); // } else { -// deltaCodesAux[deltaCodesPos/32] |= (code<<(32-(k-1)-(deltaCodesPos%32))); +// deltaCodesAux[deltaCodesPos/32] |= (code<<(32-(k-1)-(deltaCodesPos%32))); // } // deltaCodesPos += k-1; -// +// // } else { // samples[samplesIndex] = Psi[index]; -// pointers[samplesIndex++] = deltaCodesPos; -// currentInput = Psi[index]; -// } +// pointers[samplesIndex++] = deltaCodesPos; +// currentInput = Psi[index]; +// } // // } -// +// // // Ahora que xa sabemos o espacio necesario para os deltaCodes, reservamolo e liberamos a estructura auxiliar // wordsDeltaCodes = (deltaCodesPos+31)/32; // deltaCodes = (unsigned int *)malloc(sizeof(int)*wordsDeltaCodes); // for(index=0;indexsamples[position/cPsi->T]; // pointer = cPsi->pointers[position/cPsi->T]; -// +// // // Calculamos o numero de codigos a decodificar a partir da mostra -// toDecode = position % cPsi->T; -// +// toDecode = position % cPsi->T; +// // while(toDecode--) { -// +// // // Collemos o n�mero ceros iniciais // // Po�emos o inicio do c�digo nun enteiro (code) alineado a esquerda // // Non importa que non colla todo o c�digo, pero si temos asegurado que // // colle p e k (k<=32 (6bits), p<=5bits) // code = (cPsi->deltaCodes[pointer/32] << (pointer%32)) | // ((pointer%32 != 0) * (cPsi->deltaCodes[pointer/32+1] >> (32-(pointer%32)))); -// +// // //Ahora contamos o n�mero de ceros (p) que hai nas posicions da esquerda de code // p = 1; // while(!(code & 0x80000000)) { // code <<= 1; // p++; // } -// +// // // Ahora calculamos o numero de digitos da representacion binaria do codigo (k) -// k = code >> (32-p); -// +// k = code >> (32-p); +// // // Actualizamos o punteiro global sobre deltaCodes // pointer += 2*p-1; -// +// // // Po�emos a representacion binaria do codigo nun enteiro (code) alineado a esquerda // code = (cPsi->deltaCodes[pointer/32] << (pointer%32)) | // ((pointer%32 != 0) * (cPsi->deltaCodes[pointer/32+1] >> (32-(pointer%32)))); // code = ((code >> 1) | 0x80000000) >> (32-k); // pointer += k-1; -// +// // // Bixecci�n // if(code % cPsi->negativeGap) result += (code - (code/cPsi->negativeGap)); // else result -= code/cPsi->negativeGap; -// +// // } -// +// // return result; -// +// //} // // @@ -217,12 +217,12 @@ void destroyDeltaCodesCompressedPsi(DeltaCompressedPsi *compressedPsi); // write(file, compressedPsi->samples, compressedPsi->numberOfSamples*sizeof(int)); // write(file, compressedPsi->pointers, compressedPsi->numberOfSamples*sizeof(int)); // close(file); -// +// //} // // //DeltaCompressedPsi loadDeltaCompressedPsi(char *filename) { -// +// // DeltaCompressedPsi compressedPsi; // // int file; @@ -231,18 +231,18 @@ void destroyDeltaCodesCompressedPsi(DeltaCompressedPsi *compressedPsi); // printf("Cannot read file %s\n", filename); // exit(0); // } -// read(file, &(compressedPsi.T), sizeof(int)); +// read(file, &(compressedPsi.T), sizeof(int)); // read(file, &(compressedPsi.negativeGap), sizeof(int)); // read(file, &(compressedPsi.deltaCodesSize), sizeof(int)); // compressedPsi.deltaCodes = (unsigned int *)malloc(compressedPsi.deltaCodesSize*sizeof(int)); // read(file, compressedPsi.deltaCodes, compressedPsi.deltaCodesSize*sizeof(int)); -// read(file, &(compressedPsi.numberOfSamples), sizeof(int)); +// read(file, &(compressedPsi.numberOfSamples), sizeof(int)); // compressedPsi.samples = (unsigned int *)malloc(compressedPsi.numberOfSamples*sizeof(int)); // compressedPsi.pointers = (unsigned int *)malloc(compressedPsi.numberOfSamples*sizeof(int)); // read(file, compressedPsi.samples, compressedPsi.numberOfSamples*sizeof(int)); -// read(file, compressedPsi.pointers, compressedPsi.numberOfSamples*sizeof(int)); +// read(file, compressedPsi.pointers, compressedPsi.numberOfSamples*sizeof(int)); // close(file); -// +// // return compressedPsi; // //} diff --git a/swcsa/intIndex/psiGonzalo.h b/swcsa/intIndex/psiGonzalo.h index 95693bd..3009fb2 100644 --- a/swcsa/intIndex/psiGonzalo.h +++ b/swcsa/intIndex/psiGonzalo.h @@ -1,6 +1,6 @@ #include #include -#include +//#include #include "../utils/huff.h" // ESTRUCTURA QUE REPRESENTA A FUNCION PSI COMPRIMIDA diff --git a/swcsa/intIndex/psiHuffmanRLE.h b/swcsa/intIndex/psiHuffmanRLE.h index 8757c73..b66db05 100644 --- a/swcsa/intIndex/psiHuffmanRLE.h +++ b/swcsa/intIndex/psiHuffmanRLE.h @@ -1,6 +1,6 @@ #include #include -#include +//#include #include "../utils/huff.h" /* diff --git a/swcsa/intIndexUtils b/swcsa/intIndexUtils deleted file mode 120000 index 245a84f..0000000 --- a/swcsa/intIndexUtils +++ /dev/null @@ -1 +0,0 @@ -intIndex/ \ No newline at end of file diff --git a/swcsa/utils/MemoryManager.h b/swcsa/utils/MemoryManager.h index fb51e63..7d26572 100755 --- a/swcsa/utils/MemoryManager.h +++ b/swcsa/utils/MemoryManager.h @@ -11,16 +11,16 @@ #ifndef MEMORYMANAGERINCLUDED #define MEMORYMANAGERINCLUDED // only used for hashTable of stopwords - + #include #include #include #include -#include +//#include #ifndef byte #define byte unsigned char -#endif +#endif #define LARGE_BLOCK_SIZE 1024*256 // Size of the blocks of memory that will be allocated #define MAX_BLOCKS 2048 // Maximum number of blocks of size LARGE_BLOCK_SIZE that @@ -37,7 +37,7 @@ } ; typedef struct sMem *MemoryManager; - + MemoryManager createMemoryManager(void); void destroyMemoryManager (MemoryManager mm); diff --git a/swcsa/utils/basics.c b/swcsa/utils/basics.c index c9cb275..437355e 100755 --- a/swcsa/utils/basics.c +++ b/swcsa/utils/basics.c @@ -8,37 +8,54 @@ extern "C" { #include #include #include +#define ALLOC_WARN_LIMIT (40*1024*1024) // Memory management -static void *Malloc (size_t n) +void *Malloc_ (size_t n, size_t l, char * file) { void *p; - if (n == 0) return NULL; + + if (n > ALLOC_WARN_LIMIT) + { + fprintf(stderr, "\nWarning: allocating %lu bytes, file:%s, line: %lu\n", + n, file, l); + }; + p = (void*) malloc (n); - if (p == NULL) - { fprintf (stderr,"Could not allocate %i bytes\n",n); - exit(1); + if (p == NULL && n > 0) + { + fprintf (stderr,"Could not allocate %lu bytes, file: %s, line: %lu\n",n,file,l); + exit(1); } return p; } -static void Free (void *p) +void Free_ (void *p, size_t l, char * file) { if (p) free (p); + else { + fprintf (stderr,"Double free, file: %s, line: %lu\n",file,l); + exit(1); + } } -static void *Realloc (void *p, size_t n) - - { if (p == NULL) return Malloc (n); - if (n == 0) { Free(p); return NULL; } - p = (void*) realloc (p,n); - if (p == NULL) - { fprintf (stderr,"Could not allocate %i bytes\n",n); - exit(1); - } - return p; - } +void *Realloc_ (void *p, size_t n, size_t l, char * file) +{ + if (n > ALLOC_WARN_LIMIT) + { + fprintf(stderr, "\nWarning: allocating %lu bytes, file:%s, line: %lu\n", + n, file, l); + }; + + p = (void*) realloc (p,n); + if (p == NULL) + { + fprintf (stderr,"Could not re-allocate %lu bytes, file: %s, line: %lu\n",n,file,l); + exit(1); + } + return p; +} #include "basics.h" diff --git a/swcsa/utils/basics.h b/swcsa/utils/basics.h index 01741b5..c9089cc 100755 --- a/swcsa/utils/basics.h +++ b/swcsa/utils/basics.h @@ -1,7 +1,7 @@ // Basics - + #ifndef BASICSINCLUDED #define BASICSINCLUDED @@ -9,7 +9,7 @@ extern "C" { #endif - // Includes + // Includes #include #include @@ -23,13 +23,13 @@ extern "C" { // Memory management - //#define malloc(n) Malloc(n) - //#define free(p) Free(p) - //#define realloc(p,n) Realloc(p,n) +#define malloc(n) Malloc_(n,__LINE__,__FILE__) +#define free(p) Free_(p,__LINE__,__FILE__) +#define realloc(p,n) Realloc_(p,n,__LINE__,__FILE__) - //static void *Malloc (size_t n); - //static void Free (void *p); - //static void *Realloc (void *p, size_t n); +void *Malloc_ (size_t n, size_t l, char * file); +void Free_ (void *p, size_t l, char * file); +void *Realloc_ (void *p, size_t n, size_t l, char * file); // Data types diff --git a/swcsa/utils/hash.h b/swcsa/utils/hash.h index d691c7a..cbad3a7 100755 --- a/swcsa/utils/hash.h +++ b/swcsa/utils/hash.h @@ -38,7 +38,7 @@ A Coru #include #include #include -#include +//#include #include "MemoryManager.h"