LIBCDSA = $(LIBCDSPATH)/lib/libcds.a
LIBRLCSA = incbwt/rlcsa.a
LIBLZTRIE = lzindex/lztrie.a
+LIBSWCSA = swcsa/swcsa.a
dcover_obs = dcover/difference_cover.o
TextCollection_obs = TextCollection.o TextCollectionBuilder.o TCImplementation.o Tools.o BitRank.o \
- TextStorage.o ${LIBRLCSA} ${LIBCDSA} ${LIBLZTRIE}
+ TextStorage.o ${LIBRLCSA} ${LIBCDSA} ${LIBLZTRIE} ${LIBSWCSA}
TCDebug_obs = bittree.o rbtree.o dynFMI.o
all: testTextCollection
lzindex/lztrie.a:
@make -C lzindex
+# Build the word-based CSA library through its own makefile.  $(MAKE) is used
+# instead of a literal "make" so that options such as -j and -n reach the
+# sub-make.
+swcsa/swcsa.a:
+	@$(MAKE) -C swcsa
+
clean:
@make clean -C incbwt
@make clean -C lzindex
+ @make clean -C swcsa
rm -f core *.o *~ testTextCollection timeTextCollection dcover/*.o dcover/*~
+# Remove only the build products of this directory; unlike "clean" it does
+# not descend into the sub-library directories.  Declared phony so a file
+# named "shallow_clean" cannot mask the command.
+.PHONY: shallow_clean
+shallow_clean:
+	rm -f core *.o *~ testTextCollection timeTextCollection
+
depend:
@make depend -C incbwt
$(CC) -MM *.cpp > dependencies.mk
--- /dev/null
+# Directory holding the utility sources compiled by the *.o rules below.
+SRCDIRUTILS = utils
+# Directory of the integer self-index; it is built by its own makefile.
+SRCDIRCSA = intIndex
+# Compiler used for every object in this makefile.
+CC = g++
+
+# If you have trouble with make, e.g.:
+# /usr/bin/ld: skipping incompatible /usr/lib/gcc/x86_64-linux-gnu/4.4.3/libstdc++.a when searching for -lstdc++
+# ...
+# try adding
+# ln -s /usr/lib32/libstdc++.so.6.0.13 libstdc++.so
+#
+# The filename libstdc++.so.6.0.13 is probably different
+# but any from /usr/lib32 is fine.
+# CFLAGS is exported so the sub-make run in $(SRCDIRCSA) receives the same
+# flags.  The commented-out lines are alternative profiling/debug settings.
+export CFLAGS = -O9 -m32 -L. -D_FORTIFY_SOURCE=0
+#export CFLAGS = -O0 -m32 -pg
+#export CFLAGS = -g -m32 -O0
+
+# Name of the final archive produced by the "wcsa" target.
+LIBINDEX = swcsa.a
+# Name of the archive produced in $(SRCDIRCSA) and copied into this directory.
+LIBINTINDEX = icsa.a
+
+# Default goal: rebuild the library from scratch and then drop the object
+# files.  The three prerequisites only make sense in the order listed, so
+# parallel execution of this makefile's targets is switched off; "all" is a
+# command, not a file, hence phony.
+.NOTPARALLEL:
+.PHONY: all
+all: clean wcsa cleanO
+
+# Objects added to the archive, in the order they are handed to ar.
+WCSA_OBJS = parameters.o buildFacade.o hash.o valstring.o MemoryManager.o \
+            basics.o bitmap.o huffDec.o huff.o fileInfo.o
+
+# Adds this directory's objects to the archive that intIndexPackage copied
+# here, then renames the result to the final library name.
+wcsa: intIndexPackage $(WCSA_OBJS)
+	ar rc $(LIBINTINDEX) $(WCSA_OBJS)
+	mv $(LIBINTINDEX) $(LIBINDEX)
+
+################# SELF INDEX ON INTEGERS ##############################
+# Runs the makefile of the integer index in its own directory and copies the
+# archive it produces into the current directory.
+intIndexPackage:
+	$(MAKE) -w -C $(SRCDIRCSA)
+	@echo "[copying the int-index lib into current dir]"
+	@cp $(SRCDIRCSA)/$(LIBINTINDEX) ./
+
+
+####################### UTILS MODULES #################################
+
+# Objects compiled from the C sources kept in $(SRCDIRUTILS).
+UTILS_OBJS = parameters.o fileInfo.o hash.o MemoryManager.o valstring.o \
+             huff.o huffDec.o basics.o bitmap.o
+
+# One static pattern rule replaces the nine copy-pasted rules.  Listing the
+# source file as a prerequisite makes an object rebuild when its .c file
+# changes, and fail loudly when the source is missing.  $* is the stem
+# (the object name without ".o").
+$(UTILS_OBJS): %.o: $(SRCDIRUTILS)/%.c
+	$(CC) $(CFLAGS) -c $(SRCDIRUTILS)/$*.c -o $@
+
+# Kept from the original rules: MemoryManager.o is brought up to date
+# before hash.o.
+hash.o: MemoryManager.o
+
+
+############################ CLEANING #################################
+
+# Both targets are commands, not files.
+.PHONY: cleanO clean
+
+# Removes the object files left in this directory.
+cleanO:
+	rm -f *.o
+
+# Cleans the integer-index directory through its own makefile, then removes
+# local objects, archives, core files and editor backups.
+clean:
+	cd $(SRCDIRCSA) && $(MAKE) clean -w
+	rm -rf *~ *% *.o core *.bak $(LIBINTINDEX) $(LIBINDEX)
+
--- /dev/null
+#include "buildFacade.h"
+
+/**------------------------------------------------------------------
+ * MAIN PROGRAM.
+ *
+ * Usage: <prog> <in file> <out basename> [build_options]
+ *
+ * Reads the whole input file into memory, builds the index over it,
+ * saves the index under <out basename>, reports its size, recovers the
+ * source text twice (timing each run) and then runs two interactive
+ * loops on stdin: one that displays a snippet for every occurrence of
+ * a pattern and one that only locates the pattern.  Each loop ends on
+ * an empty line or on end of input.
+ *------------------------------------------------------------------ */
+
+int main(int argc, char* argv[])
+{
+	char *infile, *outbasename, *stopwordsfile;	// Name of in/out files
+	byte *inputBuffer;
+	ulong finsize;
+	int f_in;
+	void *Index;
+
+	printf("\n*Presentation level for CSA (simple WCSA)");
+	printf("\n*CopyRight (c) 2007 [LBD & G.N.]\n\n");
+
+	// Reads input parameters from command line.
+	if (argc < 3) {
+		printf("Use: %s <in file> <out basename> [build_options]\n", argv[0]);
+		exit(0);
+	}
+
+	// Reads params (input file, output basename, and build options).
+	infile = argv[1];
+	outbasename = argv[2];
+	stopwordsfile = argv[3];	// argv[argc] is NULL, so this is NULL when no options were given
+
+	finsize = fileSize(infile);
+
+	if (!finsize) {
+		printf( "\nFILE EMPTY OR FILE NOT FOUND %s !!\nSkipping processement ...\n",infile);
+		exit(0);
+	}
+
+	// Opening the input text file.
+	if ((f_in = open(infile, O_RDONLY)) < 0) {
+		printf("Cannot read file %s\n", infile);
+		exit(0);
+	}
+
+	inputBuffer = (byte *) malloc(finsize * sizeof(byte));
+	if (!inputBuffer) {
+		printf("Cannot allocate %lu bytes for file %s\n", finsize, infile);
+		close(f_in);
+		exit(1);
+	}
+
+	{	// read() may deliver fewer bytes than requested: loop until the whole file is in.
+		ulong done = 0;
+		while (done < finsize) {
+			ssize_t got = read(f_in, inputBuffer + done, finsize - done);
+			if (got <= 0) {
+				printf("Cannot read file %s\n", infile);
+				close(f_in);
+				exit(1);
+			}
+			done += (ulong) got;
+		}
+	}
+	close(f_in);
+
+	/** building the index */
+	if (build_index(inputBuffer, finsize, stopwordsfile, &Index)) {
+		fprintf(stderr, "Index construction failed\n");
+		exit(1);
+	}
+
+	/** saving the index to disk */
+	save_index(Index, outbasename);
+	fprintf(stderr,"Index saved !! ");
+
+	/** tells the mem used by the index */
+	{
+		ulong indexsize;
+		index_size(Index, &indexsize);
+		fprintf(stderr,"Index occupied %lu bytes, 2 extra mallocs = %lu",
+			indexsize, (ulong) (2 * sizeof(uint)));
+	}
+
+	/** recovering the source text from the index */
+	{
+		double start, end;
+		ulong size;
+		char ext1[10] = ".source";
+		char ext2[10] = ".source2";
+
+		start = getTime2();
+		get_length(Index, &size);
+
+		fprintf(stderr, "\nRecovering source file "); fflush(stderr);
+		recoverSourceText1((twcsa*) Index, outbasename, ext1, size);
+		end = getTime2();
+		fprintf(stderr, " time: %.3f secs\n", end-start );
+
+		start = end;
+		fprintf(stderr, "\nRecovering source file "); fflush(stderr);
+		recoverSourceText2((twcsa*) Index, outbasename, ext2, size);
+		end = getTime2();
+		fprintf(stderr, " time: %.3f secs\n", end-start );
+	}
+
+	// DISPLAYING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
+	{
+		unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
+		int error = 0;
+		ulong numocc, numc, length, i, *snippet_len;
+		uchar *pattern, *snippet_text;
+
+		pattern = textPattern;
+		printf("\nSEARCH TEST for DISPLAY (pizzachili interface)\n");
+		while (1) {
+			printf("Intro string: ");
+			// End of input leaves the loop instead of re-using the previous pattern forever.
+			if (!fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin)) break;
+			if (!strcmp((char*)textPattern,"\n") ) break;
+
+			// Drop the trailing newline only when there is one.
+			length = strlen((char*)textPattern);
+			if (length > 0 && textPattern[length-1] == '\n')
+				textPattern[--length] = '\0';
+
+			numc = 50;
+
+			error = displayWords (Index, textPattern, length, numc, &numocc,
+					&snippet_text, &snippet_len, 1);
+
+			if (error){ fprintf(stderr, "%s\n", "Hubo un error durante display");exit(0);}
+
+			fprintf(stderr,"\n acabou display");fflush(stderr);
+			{	//show the results
+				ulong len = length + 2*numc;	// distance between two snippets in snippet_text
+				fprintf(stderr,"\n length = %lu",length);
+				fprintf(stderr,"\n pattern = %s",pattern);fflush(stderr);
+				fprintf(stderr,"\n numocc = %lu",numocc);fflush(stderr);
+				fprintf(stderr,"\n snippet len = %lu",len);fflush(stderr);
+				fprintf(stderr,"\n =========");fflush(stderr);
+				for (i = 0; i < numocc; i++){
+					fprintf(stderr,"\n[%2lu][len=%3lu]<<",i+1,snippet_len[i]);fflush(stderr);
+					fwrite(snippet_text+len*i,sizeof(uchar),snippet_len[i],stderr);fflush(stderr);
+					fprintf(stderr,">>");fflush(stderr);
+				}
+			}
+
+			if (numocc) {
+				free (snippet_len);
+				free (snippet_text);
+			}
+
+			printf("Ocurrences = %lu\n", numocc);
+		}
+	}
+
+	// SEARCHING FOR A TEXT PATTERN (word/phrase).
+	{
+		unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
+		// locateWord stores ulong values, so the receivers must be ulong too.
+		ulong occ;
+		ulong *occs;
+		ulong len;
+
+		printf("\nSEARCH TEST for LOCATE\n");
+		while (1) {
+			printf("Intro string: ");
+			if (!fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin)) break;
+			if (!strcmp((char*)textPattern,"\n") ) break;
+
+			len = strlen((char*)textPattern);
+			if (len > 0 && textPattern[len-1] == '\n')
+				textPattern[--len] = '\0';
+
+			occ = 0;
+			occs = NULL;
+			locateWord(Index, textPattern, len, &occs, &occ, 0);
+
+			printf("\n*** %s occurs %lu times: In the source text in positions:\n\t",textPattern,occ);
+			if (occ > 0) free(occs);
+		}
+	}
+
+	/** freeing the index */
+	free_index(Index);
+
+	return 0;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+//{
+// bG = createBitmap (B,len);
+// bE = createBitmapEdu (B,len);
+// //fprintf(stderr,"\n RANK DE GONZALO DA %u, de EDU DA %u\n",rank(bG,33),rank1Edu(bE,33));
+// //fprintf(stderr,"\n SELECT1(2) DE GONZALO DA %u, de EDU DA %u\n",bselect(bG,4),bselect(bE,4));
+//
+// showBitVector(bitvector,34);
+//}
+
+
+
+/*
+
+ //USING A HASH TABLE
+ // {
+ // char a[20]="beginnings";char b[20]="HOSTIAS";
+ // char *w;
+ // int i;
+ // w=a;
+ // i = inHashTable(stopwordshash,w, strlen(w), &addrInTH );
+ // if (!i) insertElement (stopwordshash, w, strlen(w), &addrInTH);
+ // else stopwordshash->hash[addrInTH].freq++;
+ // //fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
+ // //fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, stopwordshash->hash[addrInTH].word, stopwordshash->hash[addrInTH].freq, stopwordshash->hash[addrInTH].posInVoc);
+ // }
+
+
+
+ /// ENCODING THE separators ...
+{
+ freeHuff(gapsHuffman);
+ uint i;
+ uint *bitvector;
+ uint bitvectorSize;
+ uint ptr;
+ bitmap bG,bE;
+ uint len;
+ len = 1000; //number of bits
+ bitvector = (uint *) malloc ((len/32 +1)* sizeof(uint));
+
+ byte texto[100] = "####@?*";
+ uint freqs[256];
+
+ //fprintf(stderr,"\n este es el texto a codificar: %s",texto);
+ for (i=0;i<256;i++) freqs[i]=0;
+ for (i=0;i<strlen(texto);i++) freqs[texto[i]]++;
+ gapsHuffman = createHuff (freqs,255,UNSORTED);
+
+ ptr=0;
+ for (i=0;i<strlen(texto);i++) {
+ //fprintf(stderr,"\n ENCODING seprators !!\n");
+ //fprintf(stderr,"%d. \n",ptr=encodeHuff(gapsHuffman, texto[i],bitvector,ptr) );
+ }
+
+ prepareToDecode(&(gapsHuffman));
+ bitvectorSize = ptr;
+ showBitVector(bitvector,bitvectorSize);
+ uint pos;
+ //fprintf(stderr,"\n DECODING !!\n");
+ ptr=0;
+ while (ptr < bitvectorSize) {
+ ptr=decodeHuff (gapsHuffman, &pos, bitvector, ptr);
+ //fprintf(stderr,"\n DECODING pos is %ld!!\n",pos);
+ //fprintf(stderr,"%c. \n",pos);
+ }
+ exit(0);
+}
+
+/// ENCODING THE CANONICAL WORDS ...
+{ uint i;
+ uint *bitvector;
+ uint bitvectorSize;
+ uint ptr;
+ bitmap bG,bE;
+ uint len;
+ len = 1000; //number of bits
+ bitvector = (uint *) malloc ((len/32 +1)* sizeof(uint));
+
+
+ ptr=0;
+ //fprintf(stderr,"\n ENCODING VARIANTS !!\n");
+ //fprintf(stderr,"%d. \n",ptr=encodeHuff(posInHT[0].huffman, 0,bitvector,ptr) );
+ //fprintf(stderr,"%d. \n",ptr=encodeHuff(posInHT[0].huffman, 1,bitvector,ptr) );
+ //fprintf(stderr,"%d. \n",ptr=encodeHuff(posInHT[0].huffman, 2,bitvector,ptr) );
+ //fprintf(stderr,"%d. \n",ptr=encodeHuff(posInHT[0].huffman, 3,bitvector,ptr) );
+ //fprintf(stderr,"%d. \n",ptr=encodeHuff(posInHT[0].huffman, 4,bitvector,ptr) );
+ //fprintf(stderr,"%d. \n",ptr=encodeHuff(posInHT[0].huffman, 4,bitvector,ptr) );
+
+ // FILE *f;
+ // f = fopen("huff","w");
+ // saveHuff(posInHT[0].huffman,f);
+ // fclose(f);
+ //
+ // f = fopen("huff","r");
+ // posInHT[0].huffman = loadHuff (f,0);
+ // fclose(f);
+ //
+
+ prepareToDecode(&(posInHT[0].huffman));
+
+ FILE *f;
+ f = fopen("huff","w");
+ saveHuffAfterDecode(posInHT[0].huffman,f);
+ fclose(f);
+
+ f = fopen("huff","r");
+ //posInHT[0].huffman = loadHuffAfterDecode(f,0);
+ loadHuffAfterDecode2 (&(posInHT[0].huffman),f,0);
+ fclose(f);
+
+
+
+ bitvectorSize = ptr;
+ showBitVector(bitvector,bitvectorSize);
+ uint pos;
+ //fprintf(stderr,"\n DECODING !!\n");
+ ptr=0;
+ while (ptr < bitvectorSize) {
+ ptr=decodeHuff (posInHT[0].huffman, &pos, bitvector, ptr);
+ //fprintf(stderr,"\n DECODING pos is %ld!!\n",pos);
+ //fprintf(stderr,"%s. \n",posInHT[0].variants[pos]);
+ }
+ exit(0);
+}
+
+
+
+{ uint i;
+ uint *bitvector;
+ bitmap bG,bE;
+ uint len;
+ len = 101; //number of bits
+ bitvector = (uint *) malloc ((len/32 +1)* sizeof(uint));
+ //bitvector[0]=0;
+ //bitvector[1]=0;
+ bitzero (bitvector,0,101-1);
+ for (i=0; i<len;i++) setBit (bitvector,len,i,0);
+
+
+
+ bitset(bitvector,1);
+ bitset(bitvector,10);
+ bitset(bitvector,12);
+ //activateBit(bitvector,1);
+ //activateBit(bitvector,10);
+
+ bG = createBitmap (bitvector,len);
+ bE = createBitmapEdu (bitvector,len);
+ //fprintf(stderr,"\n RANK DE GONZALO DA %u, de EDU DA %u\n",rank(bG,33),rank1Edu(bE,33));
+ //fprintf(stderr,"\n SELECT1(2) DE GONZALO DA %u, de EDU DA %u\n",bselect(bG,4),bselect(bE,4));
+
+ showBitVector(bitvector,34);
+}
+*/
+
--- /dev/null
+#include "buildFacade.h"
+#include "utils/errors.c"
+
+
+/** Building the index */
+
+	/* Creates index from text[0..length-1]. Note that the index is an
+	   opaque data type. Any build option must be passed in string
+	   build_options, whose syntax depends on the index. The index must
+	   always work with some default parameters if build_options is NULL.
+	   The returned index is ready to be queried.
+	   Returns the value of build_WCSA when it is non-zero; otherwise the
+	   value returned by build_iCSA. */
+int build_index (uchar *text, ulong length, char *build_options, void **index) {
+	int returnvalue;
+
+	/* build_options may legitimately be NULL; never hand NULL to "%s". */
+	printf("\n parameters: \"%s\"\n", build_options ? build_options : "(null)");
+	fflush(stdout);		/* the message above goes to stdout, so that is the stream to flush */
+
+	returnvalue = build_WCSA (text, length, build_options, index);
+
+	/* the integer index is only built when the first stage reported 0 */
+	if (!returnvalue)
+		returnvalue = build_iCSA (build_options,*index);
+
+	return returnvalue;
+}
+
+
+/** Saves index on disk by using single or multiple files, having
+	proper extensions.
+	Writes <filename>.CONSTANTS_FILE_EXT (source text size and SE size)
+	and <filename>.VOCABULARY_WORDS_FILE_EXT (the vocabulary), then
+	calls saveIntIndex and saveSEfile for the parts that are present.
+	Returns 0 on completion and 1 if the file-name buffer cannot be
+	allocated; exits the process if an output file cannot be opened. */
+int save_index (void *index, char *filename) {
+
+	char *basename = filename;
+	twcsa *wcsa=(twcsa *) index;
+
+	char *outfilename;
+	size_t extlen, extlen2;
+	int file;
+
+	printf("\n Saving structures to disk: %s.*",filename);
+
+	/* Room for basename + '.' + the longest extension used below + '\0',
+	   so the buffer cannot overflow whatever the extension macros hold. */
+	extlen = strlen(CONSTANTS_FILE_EXT);
+	extlen2 = strlen(VOCABULARY_WORDS_FILE_EXT);
+	if (extlen2 > extlen) extlen = extlen2;
+	outfilename = (char *)malloc(sizeof(char)*(strlen(basename) + 1 + extlen + 1));
+	if (!outfilename) return 1;	//out of memory.
+
+	/** File with some constants (sourceTextSize and seSize) */
+	{
+		strcpy(outfilename, basename);
+		strcat(outfilename, ".");
+		strcat(outfilename, CONSTANTS_FILE_EXT);
+		unlink(outfilename);
+		if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
+			printf("Cannot open file %s\n", outfilename);
+			exit(0);
+		}
+		write(file, &(wcsa->sourceTextSize), sizeof(uint));
+		write(file, &(wcsa->seSize), sizeof(uint));
+		close(file);
+	}
+
+	/** The Words in the vocabulary of words (sorted alphabetically)*/
+	{
+		strcpy(outfilename, basename);
+		strcat(outfilename, ".");
+		strcat(outfilename, VOCABULARY_WORDS_FILE_EXT);
+		unlink(outfilename);
+		if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
+			printf("Cannot open file %s\n", outfilename);
+			exit(0);
+		}
+
+		uint n = wcsa->n;
+		uint elemSize = wcsa->wordsData.elemSize;
+		write(file, &n, sizeof(uint));
+		write(file, &elemSize, sizeof(uint));
+		write(file, &(wcsa->wordsData.wordsZoneMem.size), sizeof(uint));
+
+		//the characters of the words, followed by the bit-packed array of n+1 offsets
+		write(file, (char *)wcsa->wordsData.wordsZoneMem.zone, wcsa->wordsData.wordsZoneMem.size * sizeof(byte));
+		write(file, (char *)wcsa->wordsData.words, ((((n+1)* (elemSize))+W-1) /W) * (sizeof(uint)) );
+
+		close(file);
+	}
+
+	free(outfilename);
+
+	if (wcsa->myicsa) {
+		/******** saves index on integers (bottom) ******/
+		saveIntIndex((void *) wcsa->myicsa, basename);
+	}
+
+	if (wcsa->se) {
+		saveSEfile(basename,wcsa->se, wcsa->seSize+1);
+	}
+
+	return 0;
+}
+
+
+
+	/** Loads index from one or more file(s) named filename, possibly
+	    adding the proper extensions.  The pointer returned by loadWCSA
+	    is stored in *index; the function itself always returns 0. */
+int load_index(char *filename, void **index){
+	*index = (void *) loadWCSA(filename);
+	return 0;
+}
+
+	/** Frees the memory occupied by index.  Prints the size reported by
+	    index_size first, then releases SE, the integer index, the
+	    vocabulary and finally the structure itself.  Always returns 0. */
+int free_index(void *index){
+	twcsa *self = (twcsa *) index;
+	ulong bytes;
+
+	index_size(index, &bytes);
+	printf("\n[destroying index] ...Freed %lu bytes... RAM", bytes);
+
+	//the array SE, when present.
+	if (self->se) {
+		free(self->se);
+	}
+
+	//the iCSA, when present.
+	if (self->myicsa) {
+		freeIntIndex((void *) self->myicsa);
+	}
+
+	//the vocabulary: characters of the words and the offsets array.
+	free(self->wordsData.wordsZoneMem.zone);
+	free(self->wordsData.words);	/** huge!! */
+
+	//the wcsa structure itself.
+	free(self);
+	return 0;
+}
+
+	/** Gives the memory occupied by index in bytes: the twcsa structure,
+	    the vocabulary (offsets array plus characters of the words) and,
+	    when present, the size reported by sizeIntIndex.  Always returns 0. */
+int index_size(void *index, ulong *size) {
+	ulong totaltmp;
+	twcsa *wcsa=(twcsa *)index;
+	uint n= wcsa->n;
+	*size=0;
+	*size += sizeof(twcsa);
+
+	totaltmp=0;	//words
+	totaltmp += ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));	//the pointers
+	totaltmp += wcsa->wordsData.wordsZoneMem.size * sizeof(byte);	//the characters of the words.
+	*size += totaltmp;
+
+	if (wcsa->myicsa) {
+		/* starts at 0 so that nothing undefined is added should
+		   sizeIntIndex return without writing its result */
+		uint nbytes = 0;
+		sizeIntIndex((void *) wcsa->myicsa, &nbytes);
+		*size += nbytes;
+	}
+
+	return 0;
+}
+
+
+/** Querying the index =============================================================*/
+
+	/* Writes in numocc the number of occurrences of the substring
+	   pattern[0..length-1] found in the text indexed by index.
+	   The pattern is first translated into integers; if that yields an
+	   empty sequence, numocc is set to 0.  Always returns 0. */
+int count (void *index, uchar *pattern, ulong length, ulong *numocc){
+	twcsa *self = (twcsa *) index;
+	uint codes[MAX_INTEGER_PATTERN_SIZE];
+	uint ncodes;
+	ulong left, right;
+
+	parseTextIntoIntegers(self, pattern, length, codes, &ncodes);
+
+	if (ncodes == 0) {	//not found
+		*numocc = 0;
+		return 0;
+	}
+
+	countIntIndex((void *) self->myicsa, codes, ncodes, numocc, &left, &right);
+	return 0;
+}
+
+ /* Interface contract: writes in numocc the number of occurrences of the
+ substring pattern[0..length-1] in the text indexed by index, allocates
+ occ (to be freed by the caller) and writes the locations of the numocc
+ occurrences in occ, in arbitrary order.
+ NOT IMPLEMENTED here: the body ignores every argument, writes nothing
+ to occ or numocc and always returns 99. */
+int locate(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc){
+ return 99;
+}
+
+	/* Gives the length of the text indexed: copies the sourceTextSize
+	   field of the index into *length and returns 0. */
+int get_length(void *index, ulong *length) {
+	*length = ((twcsa *) index)->sourceTextSize;
+	return 0;
+}
+
+	/** Obtains the length of the text indexed by index.
+	    Forwards to get_length and returns its result. */
+
+int length (void *index, ulong *length) {
+	int status = get_length(index, length);
+	return status;
+}
+
+
+/** ***********************************************************************************
+ * Accessing the indexed text
+ * ***********************************************************************************/
+
+
+ /** Interface contract: allocates snippet (to be freed by the caller) and
+ writes the substring text[from..to] into it, returning in
+ snippet_length the length of the text actually extracted (possibly
+ less than to-from+1 if to is larger than the text size).
+ NOT IMPLEMENTED here: the body writes nothing to snippet or
+ snippet_length and always returns 99. */
+int extract (void *index, ulong from, ulong to, uchar **snippet, ulong *snippet_length) {
+ twcsa *wcsa=(twcsa *) index;
+ return 99;
+}
+
+ /** Interface contract: displays the text (snippet) surrounding any
+ occurrence of the substring pattern[0..length-1] within the text
+ indexed by index. The snippet must include numc characters before and
+ after the pattern occurrence, totalizing length+2*numc characters, or
+ less if the text boundaries are reached. Writes in numocc the number
+ of occurrences, and allocates the arrays snippet_text and
+ snippet_lengths (which must be freed by the caller). The first is a
+ character array of numocc*(length+2*numc) characters, with a new
+ snippet starting at every multiple of length+2*numc. The second
+ gives the real length of each of the numocc snippets.
+ NOT IMPLEMENTED here: the body writes to none of the output
+ parameters and always returns 99. */
+
+int display (void *index, uchar *pattern, ulong length, ulong numc,
+ ulong *numocc, uchar **snippet_text, ulong **snippet_lengths) {
+ return 99;
+}
+
+
+
+/** ***********************************************************************************
+ * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord
+ * ***********************************************************************************/
+	/* Writes in numocc the number of occurrences of the word pattern
+	   pattern[0..length-1] in the text indexed by index, and stores in
+	   *occ the array filled in by locateIntIndex for the integer version
+	   of the pattern (to be freed by the caller).
+	   When the pattern cannot be translated into integers, or no
+	   occurrence is reported, *numocc is 0 and *occ is NULL.
+	   NOTE(review): kbefore is not used by this function; callers apply
+	   the "k words before" shift themselves.
+	   Always returns 0. */
+
+int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore){
+	uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
+	uint integerPatternSize;
+	ulong occurrences = 0;		// stays 0 if locateIntIndex writes nothing
+	ulong *seOffsets = NULL;
+	twcsa *wcsa=(twcsa *) index;
+
+	(void) kbefore;
+
+	parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);
+	if (!integerPatternSize) {	//not found
+		*occ = NULL;	// never leave the caller's pointer undefined
+		*numocc = 0;
+		return 0;
+	}
+
+	//obtains the indexes in vector SE where the pattern appears.
+	locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences);
+
+	*numocc = occurrences;
+
+	if (!occurrences) {(*occ)=NULL;return 0;}
+
+	(*occ) = (ulong *)seOffsets;
+	return 0;
+}
+
+
+	/** Displays the text (snippet) around every occurrence of the word
+	    pattern pattern[0..length-1] within the text indexed by index.
+	    For each occurrence, extraction starts kbefore words before the
+	    occurrence and copies up to length+2*numc characters. Writes in
+	    numocc the number of occurrences, and allocates the arrays
+	    snippet_text and snippet_lengths (which must be freed by the
+	    caller). The first is a character array of numocc*(length+2*numc)
+	    characters (plus one spare byte), with a new snippet starting at
+	    every multiple of length+2*numc. The second gives the real length
+	    of each of the numocc snippets.
+	    Returns 0 on success. Returns 1 if memory runs out; in that case
+	    nothing is left allocated, *numocc is 0 and both output pointers
+	    are NULL. */
+
+int displayWords (void *index, uchar *pattern, ulong length, ulong numc,
+		ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore) {
+
+	ulong *indexesInSE;
+	ulong occurrences;
+	uint bytesPerSnippet;
+	byte *text_aux;
+	twcsa *wcsa=(twcsa *) index;
+
+	locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);
+	(*numocc) = occurrences;
+
+	if (!occurrences) {
+		*snippet_text =NULL;
+		*snippet_lengths =NULL;
+		return 0;
+	}
+
+	bytesPerSnippet = length+2*numc;
+	*snippet_lengths = (ulong *) malloc((*numocc)*sizeof(ulong));
+	if (!(*snippet_lengths)) {
+		// out of memory: release what locateWord handed over
+		free(indexesInSE);
+		*snippet_text = NULL;
+		*numocc = 0;
+		return 1;
+	}
+	*snippet_text = (uchar *) malloc((*numocc)*(bytesPerSnippet)*sizeof(uchar) +1) ;	//(the last "1" is for '\0');
+	if (!(*snippet_text)) {
+		free(*snippet_lengths);
+		*snippet_lengths = NULL;
+		free(indexesInSE);
+		*numocc = 0;
+		return 1;
+	}
+
+	text_aux=*snippet_text;
+	{
+		uint i, j, tmplen;
+		byte *src, *dst;
+		uint snippetLen;
+		uint posSEValue,indexSE;
+
+		for (i=0;i<occurrences;i++) {
+			uint prevValid=0;	// 1 when the previously copied word started with a _Valid char
+			uint endSnippet =0;
+
+			/** decodes words from there */
+			snippetLen=0;
+			indexSE = indexesInSE[i];
+			indexSE = (indexSE > kbefore) ? indexSE-kbefore : 0;
+
+			dst = text_aux;
+			while ((!endSnippet) && (indexSE < wcsa->seSize) ){	/** extracting words (if not at the end) */
+
+				displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
+
+				{//obtains pointer to the ith word
+					uint offtmp;
+					uint ith = posSEValue -1;	// !!
+					tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
+					offtmp = bitread (wcsa->wordsData.words, ( ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
+					tmplen -=offtmp;	//the length of the ith word.
+
+					src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
+				}
+
+				// a single blank is re-inserted between two consecutive valid words
+				if (_Valid[*src]) {
+					if (prevValid){
+						*dst++ =' ';
+						snippetLen++;
+						if (snippetLen==bytesPerSnippet) break;	//end of snippet (ends in BLANK_SPACE)
+					}
+					prevValid =1;	//for the next iteration
+				}
+				else prevValid=0;
+
+				indexSE++;
+
+				/* at the end ?? then copy only what still fits */
+				if ((tmplen+snippetLen)>=bytesPerSnippet) {
+					tmplen =(bytesPerSnippet - snippetLen);
+					endSnippet=1;	//so while loop ends;
+				}
+
+				for (j=0;j<tmplen;j++) {*dst++ = *src++;}	//copies word to the output buffer
+				snippetLen +=tmplen;
+			}//while
+
+			text_aux += bytesPerSnippet;
+			(*snippet_lengths)[i] = snippetLen;
+		} //for
+
+		free(indexesInSE);	// occurrences is non-zero here
+	}
+	return 0;
+}
+
+/** Simulates the text extraction process without returning the text.
+	For each occurrence of the pattern, decodes up to 2*wordsbefore words
+	starting wordsbefore words before the occurrence, stopping early once
+	maxnumc characters would be reached. The decoded text goes into a
+	scratch buffer that is discarded.
+	Returns the total number of bytes decoded over all occurrences
+	(0 when there is no occurrence or the scratch buffer cannot be
+	allocated). */
+
+int displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc) {
+
+	ulong *indexesInSE;
+	ulong occurrences;
+	byte *text_aux;
+
+	twcsa *wcsa=(twcsa *) index;
+
+	locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);
+
+	if (!occurrences) {
+		return 0;
+	}
+
+	ulong maxsnippetLen = maxnumc;
+	ulong extractedbytes = 0;
+
+	text_aux = (byte *) malloc (maxsnippetLen+1);
+	if (!text_aux) {
+		// no scratch buffer: nothing can be decoded
+		free(indexesInSE);
+		return 0;
+	}
+
+	{
+		uint i, j, tmplen;
+		byte *src, *dst;
+		uint snippetLen;
+		uint posSEValue,indexSE;
+
+		uint numWordsToExtract = 2 * wordsbefore;
+		uint z;		// words decoded so far for the current occurrence
+
+		for (i=0;i<occurrences;i++) {
+			uint prevValid=0;
+
+			/** decodes words from there */
+			snippetLen=0;
+			indexSE = indexesInSE[i];
+			indexSE = (indexSE > wordsbefore) ? indexSE-wordsbefore : 0;
+
+			dst = text_aux;
+			z=0;
+			while ((z<numWordsToExtract) && (indexSE < wcsa->seSize) ){	/** extracting words (if not at the end) */
+
+				displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
+
+				{//obtains pointer to the ith word
+					uint offtmp;
+					uint ith = posSEValue -1;	// !!
+					tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
+					offtmp = bitread (wcsa->wordsData.words, ( ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
+					tmplen -=offtmp;	//the length of the ith word.
+
+					src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
+				}
+
+				// a single blank is re-inserted between two consecutive valid words
+				if (_Valid[*src]) {
+					if (prevValid){
+						*dst++ =' ';
+						snippetLen++;
+						if (snippetLen==maxsnippetLen) break;	//end of snippet (ends in BLANK_SPACE)
+					}
+					prevValid =1;	//for the next iteration
+				}
+				else prevValid=0;
+
+				indexSE++;
+
+				/* at the end ?? the word that would not fit is not copied */
+				if ((tmplen+snippetLen)>=maxsnippetLen) {
+					break;
+				}
+
+				for (j=0;j<tmplen;j++) {*dst++ = *src++;}	//copies word to the output buffer
+				snippetLen +=tmplen;
+				z++;
+			}//while
+
+			extractedbytes += snippetLen;
+
+		} //for
+
+		free(indexesInSE);	// occurrences is non-zero here
+	}
+	free (text_aux);
+	return extractedbytes;
+}
+
+
+
+/** Allocates text (which must be freed by the caller) and recovers the
+	substring of text starting from the "fromWord"-th word up to the
+	"toWord"-th word. Returns in text the NUL-terminated text, and in
+	"text_length" the length of the text actually extracted.
+	Actually extracts SE[fromWord .. toWord) ... not the last word.
+	toWord is clamped to seSize and an out-of-range fromWord to seSize-1;
+	an empty range (fromWord >= toWord) yields an empty string.
+	Returns 0 on success and 1 when memory runs out (nothing is left
+	allocated and text / text_length are not written in that case). */
+
+int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text,
+		ulong *text_length){
+
+	twcsa *wcsa=(twcsa *) index;
+	uint avgWordLen =7;	// used only to size the first allocation
+
+	uint j;
+	uint prevValid = 0;	// no word copied yet, so no blank before the first one
+	byte *src, *dst, *buff;
+	uint tmplen =0;
+
+	uint buffBytes = 1000;
+	uint leng=0;	//curr pos in buffer that was occupied.
+	ulong numWords;
+
+	if (toWord > wcsa->seSize) toWord = wcsa->seSize;
+	if (fromWord >= wcsa->seSize) fromWord = wcsa->seSize-1;
+
+	// guards the subtraction: an empty or reversed range counts as zero words
+	numWords = (toWord > fromWord) ? (toWord - fromWord) : 0;
+	if (buffBytes < (numWords * avgWordLen)) buffBytes = (numWords * avgWordLen);
+
+	buff = (byte *) malloc (buffBytes * sizeof(char));
+	if (!buff) return 1;	//out of memory.
+	dst = buff;
+
+	uint indexSE=fromWord;
+	uint posSEValue=0;
+	uint ith;
+
+	while ( (indexSE < toWord) ){	/** extracting words (if not at the end) */
+
+		displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
+
+		{//obtains pointer to the ith word
+			uint offtmp;
+			ith= posSEValue -1;	// !!
+			tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
+			offtmp = bitread (wcsa->wordsData.words, (ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
+			tmplen -=offtmp;	//the length of the ith word.
+			src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
+		}
+
+		// Worst case for this word: one blank + tmplen bytes, and the final
+		// '\0' must still fit afterwards.
+		if ( buffBytes < (leng + tmplen + 2) ) {
+			byte *bigger;
+			do { buffBytes *=2; } while ( buffBytes < (leng + tmplen + 2) );
+			bigger = (byte *) realloc(buff, buffBytes);
+			if (!bigger) {	//out of memory: the old block is still ours to free.
+				free(buff);
+				return 1;
+			}
+			buff = bigger;
+			dst = buff + leng;
+		}
+
+		// a single blank is re-inserted between two consecutive valid words
+		if (_Valid[*src]) {
+			if (prevValid){
+				*dst++ =' ';
+				leng += 1;
+			}
+			prevValid =1;	//for the next iteration
+		}
+		else prevValid=0;
+
+		indexSE++;
+
+		for (j=0;j<tmplen;j++) {*dst++ = *src++;}	//copies word to the output buffer
+		leng +=tmplen;
+	}//while
+
+	*text_length =leng;
+	*dst='\0';
+	*text = buff;
+	return 0;
+}
+
+
+
+/** ***********************************************************************************
+ CONSTRUCTION OF THE INDEX WCSA
+ ***********************************************************************************/
+
+	/**------------------------------------------------------------------
+	 qsort comparator over tposInHT slots: orders two slots by the
+	 strcmp result of their "word" fields.
+	 ------------------------------------------------------------------ */
+int qSortWordsCompareAlpha(const void *arg1, const void *arg2) {
+	const tposInHT *lhs = (const tposInHT *) arg1;
+	const tposInHT *rhs = (const tposInHT *) arg2;
+	const char *lhsWord = (const char *) lhs->word;
+	const char *rhsWord = (const char *) rhs->word;
+
+	return strcmp(lhsWord, rhsWord);
+}
+
+/**
+ * BUILDS THE WCSA INDEX
+ */
+
+int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
+
+ unsigned long zeroNode; //number of different canonical words.
+
+ t_hash hash; // the hash table to store both variants and canonical words.
+ tposInHT *posInHT; // structure for canonicals and variants+huffmans
+
+ uint sourceTextSize;
+
+ uint seSize=0; //it's size == "numberOfValidWords".
+ uint *SE; //Integers vector. (represents the rank of the valid words in the source text).
+
+ uint totallenWords=0; //The numberOfBytes that occupy canonical words (their ascii version) in memory
+
+
+ ulong bytesFile,bytesFileReal;
+ long sizeNValue;
+
+ /* used during first pass */
+
+ ulong addrInTH;
+
+ byte* inputBuffer = text;
+ bytesFileReal= bytesFile = length;
+
+ sourceTextSize=length;
+
+ /** Initializes WCSA structure*/
+ twcsa *wcsa;
+ wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
+ zeroNode=0;
+ /** */
+
+ //Stimation (Using Heap's law) of the number of different "meaningful" words.
+ //sizeNValue=N_value;
+ if(bytesFile<5000000) bytesFile = 5000000;
+ sizeNValue = (unsigned long) floor(3.9* pow(bytesFile,0.60) );
+
+
+ // Inicializes the arrays used to detect if a char is valid or not.
+ StartValid();
+ // Inicializes the arrays used translated a char into lowercase.
+ StartToLow();
+
+
+ // **********************************************************************************
+ //STARTING THE FIRST PASS.
+ // **********************************************************************************
+ printf("\nSTARTING THE FIRST PASS...");
+
+ posInHT = (tposInHT *) malloc(sizeof(tposInHT) * sizeNValue);
+ hash = initialize_hash (sizeNValue); //hash to cointain both the parsed words
+
+ //-----------------------------------------------------------------
+ //1st pass (processing the file)
+ {
+ byte *pbeg,*pend,*wordstart,*aWord;
+ register ulong size;
+ register uint i;
+
+ pbeg = inputBuffer;
+ pend = inputBuffer+bytesFileReal;
+
+ while (pbeg <pend) {
+
+ //parsing either a word or separator.
+ size=0;
+ wordstart = pbeg;
+ if (_Valid[*pbeg]) { //alphanumerical data
+ while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+ }
+ else {
+ if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
+ while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
+ }
+ else { //a SPACE comes, so we have to test if next character is alphanumerical or not
+ pbeg++;
+ if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
+ else {
+ if (_Valid [*pbeg] ) {
+ wordstart = pbeg; //So skipping 1 blank character
+ while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+ }
+ else { // a "separator word" ...
+ size++; //the prev BLANK...
+ while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
+ }//else { // a "separator word"
+ }//else ... not a unique BLANK AT THE END.
+ }//else ... starting by a BLANK...
+ }
+
+ //The parsed word/separator is is "wordstart", and its length is "size"...
+ aWord=wordstart;
+
+ //Processement done for each word word
+ i = inHashTable(hash,aWord, size, &addrInTH );
+ if (!i){
+ insertElement (hash,aWord, size, &addrInTH);
+ posInHT[zeroNode].slot=addrInTH;
+ posInHT[zeroNode].word=hash->hash[addrInTH].word;
+ hash->hash[addrInTH].posInVoc = zeroNode;
+ zeroNode++;
+ totallenWords += size +1; // +1 due to the '\0' char...
+ //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size);
+ }
+ seSize ++;
+ }//while pbeg<pend
+
+ fprintf(stderr,"\n1st pass ends: TOTAL distinct words: %lu totalWords = %u",zeroNode,seSize);
+
+ }//1st pass ends
+
+
+ // **********************************************************************************
+ // END OF 1ST PASS
+ // **********************************************************************************
+
+ // Sorting the words alphanumerically (over posInHT)
+ { register unsigned long i,j;
+ //sorting canonical words ...
+ qsort(posInHT, zeroNode, sizeof(tposInHT), qSortWordsCompareAlpha);
+
+ //setting in hash the new positions of the words in the hash table
+ for (i=0;i<zeroNode;i++) {
+ hash->hash[posInHT[i].slot].posInVoc = i;
+ }
+ }
+
+ // INITIALIZING structures for the 2nd pass ......................................
+ {
+ SE = (uint *) malloc ((seSize+1)*sizeof (uint));
+ }
+
+
+ // **********************************************************************************
+ // STARTING THE SECOND PASS.
+ // **********************************************************************************/
+
+ printf("\nSTARTING THE SECOND PASS... ");
+ //2nd pass (processing the file)
+ {
+ byte *pbeg,*pend,*wordstart,*aWord;
+ register ulong size;
+ register uint i;
+ register ulong countValidWords = 0;
+
+
+ pbeg = inputBuffer;
+ pend = inputBuffer+bytesFileReal;
+
+ while (pbeg <pend) {
+
+ //parsing either a word or separator.
+ size=0;
+ wordstart = pbeg;
+ if (_Valid[*pbeg]) { //alphanumerical data
+ while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+ }
+ else {
+ if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
+ while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
+ }
+ else { //a SPACE comes, so we have to test if next character is alphanumerical or not
+ pbeg++;
+ if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
+ else {
+ if (_Valid [*pbeg] ) {
+ wordstart = pbeg; //So skipping 1 blank character
+ while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+ }
+ else { // a "separator word" ...
+ size++; //the prev BLANK...
+ while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
+ }//else { // a "separator word"
+ }//else ... not a unique BLANK AT THE END.
+ }//else ... starting by a BLANK...
+ }
+
+ //The parsed word/separator is is "wordstart", and its length is "size"...
+ aWord=wordstart;
+
+ //Processement done for each word word
+ i = inHashTable(hash,aWord, size, &addrInTH );
+
+ SE[countValidWords]=hash->hash[addrInTH].posInVoc+1; // !!!!
+ countValidWords++;
+
+ }// while pbeg<pend
+
+ SE[countValidWords] = 0;
+ fprintf(stderr,"\n2nd pass ends: TOTAL distinct words: %lu totalWords = %lu",zeroNode,countValidWords);
+
+ }//2nd pass ends
+
+ // **********************************************************************************
+ // END OF 2ND PASS
+ // **********************************************************************************
+
+ //freeing the source text (it is no longer needed).
+ free(inputBuffer); //the text
+
+ /** Now Setting the data of the index **/
+ wcsa->n = zeroNode;
+ wcsa->sourceTextSize = sourceTextSize;
+ wcsa->seSize = seSize;
+
+ // Creating the words of the vocabulary...
+ {
+ /** copying the words into WCSA. */
+ uint *tmpOffsets = (uint *) malloc (sizeof(uint) * (zeroNode +1) ); //1 extra uint (to point to the virtual "zeroNode+1" ^th word.
+ uint tmpOffset =0;
+
+ byte *zoneMem,*src;
+ uint i;
+
+ //Moving data from posInHT to WCSA structure
+ //wcsa->wordsData = (twords *) malloc(sizeof(twords) * zeroNode);
+ wcsa->wordsData.wordsZoneMem.size = totallenWords - zeroNode; //without '\0' bytes (end-tag).
+ wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc ( wcsa->wordsData.wordsZoneMem.size * sizeof(byte) );
+ zoneMem = wcsa->wordsData.wordsZoneMem.zone;
+ for(i = 0; i < zeroNode; i++) {
+ src = posInHT[i].word; //copying the canonical word
+ //wcsa->wordsData.words[i].word = zoneMem; //setting the pointer
+ tmpOffsets[i]=tmpOffset; //offset in zoneMem
+ while (*src) {*zoneMem++ = *src++; tmpOffset++;} //moving data until '\0'
+ //*zoneMem='\0'; zoneMem++; //copies also the '\0'
+
+ }
+ tmpOffsets[zeroNode]=tmpOffset; //setting pointer to the "virtual" word {zeroNode+1}^{th}
+
+ //kbit encoding of the offsets
+ uint elemSize = bits(tmpOffset);
+ wcsa->wordsData.elemSize = elemSize;
+ wcsa->wordsData.words = (uint *) malloc (((((zeroNode +1)*elemSize)+W-1) /W) * sizeof(uint)); //with 1 extra slot !.
+ wcsa->wordsData.words[((((zeroNode +1)*elemSize)+W-1) /W) -1 ] =0000;
+ // fprintf(stderr,"\n ElemSize = %d, maxOffset = %d",elemSize,tmpOffset);
+
+ tmpOffset=0;
+ for (i=0; i<=zeroNode; i++) { //setting "zeroNode+1" offsets
+ bitwrite(wcsa->wordsData.words, tmpOffset, elemSize, tmpOffsets[i]);
+ tmpOffset+=elemSize;
+ }
+
+ //////////// CHECKS IT WORKED. old !!!!
+ // { uint kk;
+ // tmpOffset=0;
+ // for (i=0; i<zeroNode; i++) { //setting "zeroNode+1" offsets
+ // kk=bitread(wcsa->wordsData.words, i* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
+ // tmpOffset+=elemSize;
+ // if (kk != tmpOffsets[i]) {fprintf(stderr,"\n @@@@@@@@ DISTINTOS OFFSETS "); break;}
+ // else fprintf(stderr,"\n iguales, %d, %d :: <<%s>>len=%d",kk,i, posInHT[i].word, strlen((char*)posInHT[i].word));
+ // }
+ // }
+ //
+ // { uint len1, len, tmplen, len2;
+ // uint i,p;
+ // byte *wcsaWord, *src;
+ //
+ // for (p=0;p<zeroNode;p++) {
+ // {//preparing for strcompL
+ // len = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p+1)), wcsa->wordsData.elemSize);
+ // tmplen = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p)) , wcsa->wordsData.elemSize);
+ //
+ // //fprintf(stderr,"\n :: off[%d]= %d - off [%d] = %d ==> %d",p+1,len,p,tmplen,len-tmplen);
+ //
+ // len2 =len-tmplen;
+ // wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
+ // }
+ //
+ // src = posInHT[p].word;
+ // len1 = strlen((char *)src);
+ //
+ // if (strcompL(src,wcsaWord,len1,len2) != 0) {
+ // fprintf(stderr,"\n %6d DISTINTOS !! ===len1 %d,len %d===== <<",p,len1,len2);printWord(src,len1);
+ // fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>");
+ // exit(0);
+ // }
+ // else {
+ // fprintf(stderr,"\n %6d ======len1 %d,len2 %d===== <<",p,len1,len2);printWord(src,len1);
+ // fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>");
+ // }
+ // }
+ //
+ // }
+ //
+
+ /**-----------*/
+ //frees memory from hash table and posInHT structures.
+ free(tmpOffsets);
+ destroy_hash(hash);
+ free(posInHT);
+ }
+
+ /** ******* creates the self-index on ints (bottom layer) ==> see build_icsa *********/
+/**
+ #ifdef CSA_ON
+ {
+ uint total;
+ fprintf(stderr,"\n **** CREATING CSA from Edu's Code *****");
+ ticsa *myicsa;
+ myicsa = createIntegerCSA(&SE,seSize+1,build_options);
+ wcsa->myicsa= myicsa;
+ total = CSA_size(myicsa);
+
+ free(SE); //SE is no longer needed, (it is indexed by the iCSA)
+ printf("\n\t**** [iCSA built on %d words. Size = %ld bytes... RAM",seSize,total);
+ }
+ #endif
+*/
+
+ //#ifndef CSA_ON
+ wcsa->se = SE;
+ wcsa->myicsa = NULL;
+ //#endif
+
+ printf("\n\t ** Building done! **\n");
+ printf("\n Process finished!\n");
+
+ *index = wcsa;
+ return 0;
+}
+
+
+/** ------------------------------------------------------------------
+ * build_iCSA:
+ *   Builds the self-index on integers (bottom layer) from the id
+ *   sequence wcsa->se (wcsa->seSize+1 entries are handed over) and
+ *   stores the resulting handle in wcsa->myicsa.
+ *   build_options is forwarded untouched to buildIntIndex.
+ *   Returns 0 on success; otherwise the non-zero code reported by
+ *   buildIntIndex or sizeIntIndex (assumes the int-index interface
+ *   signals failure with a non-zero value, as printInfo does).
+ *------------------------------------------------------------------ */
+int build_iCSA (char *build_options, void *index)
+{
+	twcsa *wcsa = (twcsa *) index;
+	void *bottomIntIndex = NULL;
+	uint total = 0;
+	int err;
+
+	/********* creates the self-index on ints (bottom layer) *********/
+	fprintf(stderr,"\n **** CREATING CSA-bottom-layer *****");
+
+	err = buildIntIndex(wcsa->se,wcsa->seSize+1, build_options,(void **)&bottomIntIndex);
+	if (err) return err;	//never publish a handle that was not built
+	wcsa->myicsa = bottomIntIndex;
+
+	err = sizeIntIndex((void *) wcsa->myicsa, &total);
+	if (err) return err;
+
+	printf("\n\t**** [iCSA built on %u words. Size = %u bytes... RAM",wcsa->seSize,total);
+	return 0;
+}
+
+/** ********************************************************************
+ * Loading from disk
+ **********************************************************************/
+
+/**-----------------------------------------------------------------
+ * loadWCSA.
+ *   Allocates a twcsa, loads the integer index through loadIntIndex
+ *   and then the vocabulary/constants through loadStructs.
+ *   filename is the base name handed to both loaders.
+ *   Returns the loaded index, or NULL if the structure cannot be
+ *   allocated or loadIntIndex reports an error (assumes a non-zero
+ *   return value means failure, as in the rest of this file).
+ ----------------------------------------------------------------- */
+
+twcsa *loadWCSA(char *filename) {
+	twcsa *wcsa;
+	int err;
+
+	// Initializes the arrays used to detect if a char is valid or not.
+	StartValid();
+	// Initializes the arrays used to translate a char into lowercase.
+	StartToLow();
+
+	wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
+	if (wcsa == NULL) {
+		fprintf(stderr,"Cannot allocate memory for the index\n");
+		return NULL;
+	}
+	wcsa->n=0;
+	wcsa->myicsa = NULL;	//defined value even if the loader fails early
+
+	err = loadIntIndex(filename, (void **)&wcsa->myicsa);
+	if (err) {
+		fprintf(stderr,"Cannot load the integer index from %s (error %d)\n", filename, err);
+		free(wcsa);
+		return NULL;
+	}
+
+	loadStructs(wcsa,filename);
+
+	return wcsa;
+}
+
+/** ------------------------------------------------------------------
+ * LoadStructs.
+ *   Reads "<basename>.<CONSTANTS_FILE_EXT>" (sourceTextSize, seSize)
+ *   and "<basename>.<VOCABULARY_WORDS_FILE_EXT>" (number of words,
+ *   bits per offset, size of the words zone, the words zone itself and
+ *   the packed array of offsets) into wcsa. wcsa->se is set to NULL.
+ *   Terminates the process if a file cannot be opened or read, or if
+ *   memory cannot be allocated.
+ ----------------------------------------------------------------- */
+
+/* Reads exactly nbytes from file into dst; aborts the program on a short read. */
+static void loadStructs_read(int file, void *dst, size_t nbytes, const char *filename) {
+	byte *out = (byte *) dst;
+	while (nbytes > 0) {
+		ssize_t got = read(file, out, nbytes);
+		if (got <= 0) {
+			printf("Cannot read file %s\n", filename);
+			exit(1);
+		}
+		out += got;
+		nbytes -= (size_t) got;
+	}
+}
+
+ void loadStructs(twcsa *wcsa, char *basename) {
+	char *filename;
+	int file;
+	uint n;
+	size_t extLen, vocExtLen, nwords;
+
+	//room for "<basename>" + "." + the longest extension + '\0'
+	extLen = strlen(CONSTANTS_FILE_EXT);
+	vocExtLen = strlen(VOCABULARY_WORDS_FILE_EXT);
+	if (vocExtLen > extLen) extLen = vocExtLen;
+	filename = (char *)malloc(sizeof(char)*(strlen(basename)+1+extLen+1));
+	if (filename == NULL) {
+		printf("Cannot allocate memory to load %s\n", basename);
+		exit(1);
+	}
+	fprintf(stderr,"Loading Index from file %s.*\n", basename);
+
+	//** SOME CONSTANTS: sourceTextSize
+	{	strcpy(filename, basename);
+		strcat(filename, ".");
+		strcat(filename, CONSTANTS_FILE_EXT);
+
+		if( (file = open(filename, O_RDONLY)) < 0) {
+			printf("Cannot open file %s\n", filename);
+			exit(0);	//legacy exit status, kept as it was
+		}
+
+		loadStructs_read(file, &(wcsa->sourceTextSize), sizeof(uint), filename);
+		loadStructs_read(file, &(wcsa->seSize), sizeof(uint), filename);
+		close(file);
+	}
+
+	/** File with the words from the vocabulary (sorted alphabetically) */
+	{
+		strcpy(filename, basename);
+		strcat(filename, ".");
+		strcat(filename, VOCABULARY_WORDS_FILE_EXT);
+
+		if( (file = open(filename, O_RDONLY)) < 0) {
+			printf("Cannot open file %s\n", filename);
+			exit(0);	//legacy exit status, kept as it was
+		}
+
+		//the number of canonical words
+		loadStructs_read(file, &n, sizeof(uint), filename);
+		wcsa->n = n;
+		loadStructs_read(file, &(wcsa->wordsData.elemSize), (sizeof(uint)), filename);
+		loadStructs_read(file, &(wcsa->wordsData.wordsZoneMem.size), (sizeof(uint)), filename);
+
+		//allocating the memory needed for all words and reading them //(ascii) << no \0 chars are needed>>.
+		wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc(wcsa->wordsData.wordsZoneMem.size * sizeof(byte));
+		if (wcsa->wordsData.wordsZoneMem.zone == NULL && wcsa->wordsData.wordsZoneMem.size > 0) {
+			printf("Cannot allocate memory to load %s\n", filename);
+			exit(1);
+		}
+		loadStructs_read(file, (wcsa->wordsData.wordsZoneMem.zone), wcsa->wordsData.wordsZoneMem.size * sizeof(byte), filename);
+
+		//reading the offsets of the words (kbitArray that points to offsets in zoneMem of words).
+		nwords = ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W);
+		wcsa->wordsData.words = (uint *) malloc (nwords * sizeof(uint));
+		if (wcsa->wordsData.words == NULL && nwords > 0) {
+			printf("Cannot allocate memory to load %s\n", filename);
+			exit(1);
+		}
+		if (nwords > 0) wcsa->wordsData.words[nwords -1] =0000;	//clears the padding bits of the last cell
+		loadStructs_read(file, (wcsa->wordsData.words), nwords * (sizeof(uint)), filename);
+
+		close(file);
+	}
+	wcsa->se= NULL;
+	free(filename);
+}
+
+
+
+
+/** ****************************************************************
+ * Querying the index WCSA
+ * ***************************************************************/
+///////////////////////////////////////////////////////////////////////////////////////
+// FUNCTIONS NEEDED FOR SEARCHING A PATTERN //
+///////////////////////////////////////////////////////////////////////////////////////
+
+
+
+/*------------------------------------------------------------------
+ * parseTextIntoIntegers:
+ *   Splits textPattern[0..patLen) into tokens and stores in
+ *   integerPattern the 1-based rank in the vocabulary of each token.
+ *   A token is either a run of _Valid characters (at most
+ *   MAX_SIZE_OF_WORD) or a run of non-valid characters (at most
+ *   MAX_SIZE_OF_GAP); a single blank that immediately precedes a valid
+ *   character is skipped and not made part of any token.
+ *   On return *sizeIntegers holds the number of ids written, or 0 if
+ *   the pattern is empty or some token is not in the vocabulary.
+ *   No bound is checked on integerPattern: the caller must provide
+ *   room for every token of the pattern.
+ ------------------------------------------------------------------*/
+void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *integerPattern, uint *sizeIntegers) {
+
+	byte *pbeg,*pend,*wordstart,*aWord;
+	register unsigned long size;
+	uint index =0;
+
+	//With an empty vocabulary no token can match; leaving here also keeps
+	//"n - 1" below from wrapping around to a huge upper bound.
+	if (wcsa->n == 0) {*sizeIntegers = 0; return;}
+
+	pbeg = textPattern;
+	pend = pbeg + patLen;
+
+	while (pbeg <pend) {
+		//parsing either a word or separator.
+		size=0;
+		wordstart = pbeg;
+		if (_Valid[*pbeg]) {   //alphanumerical data
+			while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+		}
+		else {
+			if (*pbeg != ' ') { //a separator that does not start with a blank: it is a new token
+				while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
+			}
+			else {  //a SPACE comes, so we have to test if next character is alphanumerical or not
+				pbeg++;
+				if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the pattern.
+				else {
+					if (_Valid [*pbeg] ) {
+						wordstart = pbeg;   //So skipping 1 blank character
+						while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+					}
+					else {   // a "separator word" ...
+						size++; //the prev BLANK...
+						while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] )) {size++;pbeg++;}
+					}//else {  // a "separator word"
+				}//else ... not a unique BLANK AT THE END.
+			}//else ... starting by a BLANK...
+		}
+
+		//The parsed token starts at "aWord", and its length is "size"...
+		aWord=wordstart;
+
+		// Binary search on the canonical words (wordsData)
+		{
+			uint len, tmplen;
+			uchar *wcsaWord;
+			register uint min,max,p;
+			min = 0;
+			max = (wcsa->n) - 1;
+			while(min < max) {
+				p = (min+max)/2;
+
+				{//preparing for strcompL: word p spans offsets [off[p], off[p+1]) of the words zone
+					len    = bitread (wcsa->wordsData.words, (p+1)* wcsa->wordsData.elemSize , wcsa->wordsData.elemSize);
+					tmplen = bitread (wcsa->wordsData.words, (p  )* wcsa->wordsData.elemSize , wcsa->wordsData.elemSize);
+					len -=tmplen;
+					wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
+				}
+
+				if(strcompL(aWord, wcsaWord, size, len) > 0) min = p+1;
+				else max = p;
+			}
+
+			{//preparing for strcompL
+				len    = bitread (wcsa->wordsData.words, (min+1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
+				tmplen = bitread (wcsa->wordsData.words, ( min )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
+				len -=tmplen;
+				wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
+			}
+
+			if(!strcompL(aWord, wcsaWord, size, len)) {
+				integerPattern[index++] = min +1 ;  //ids are 1-based
+			}
+			else {*sizeIntegers = 0; return;}  // a valid word that does not appear in the source text.
+
+		}
+	}// end while
+	*sizeIntegers = index;
+}
+
+
+
+
+
+ /** ------------------------------------------------------------------
+    * countTextOcurrences:
+    *   Parses the zero-terminated textPattern into vocabulary ids and
+    *   asks the integer index how many times that id sequence occurs.
+    *   Returns the number of occurrences, or -1 if the pattern is empty,
+    *   contains a token that is not in the vocabulary, or the integer
+    *   index reports an error.
+    *   NOTE(review): the pattern must not produce more than
+    *   MAX_INTEGER_PATTERN_SIZE tokens; parseTextIntoIntegers does not
+    *   check that bound.
+    *------------------------------------------------------------------ */
+int countTextOcurrences(twcsa *wcsa, byte *textPattern) {
+	ulong left, right;
+	ulong numocc = 0;
+	uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
+	uint integerPatternSize;
+	int err;
+
+	uint lenpat = strlen((char*)textPattern);
+	parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize);
+	if (!integerPatternSize) return -1;
+
+	err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, &numocc, &left, &right);
+	if (err) return -1;	//numocc is not trustworthy when the query failed
+	return (int) numocc;
+
+}
+
+
+ /** ------------------------------------------------------------------
+    * locateTextOcurrences:
+    *   Parses the zero-terminated textPattern into vocabulary ids, asks
+    *   the integer index for the positions of that id sequence and hands
+    *   them to locateFacade (which, in this file, is a stub that leaves
+    *   the array untouched).
+    *   Returns the array produced by locateIntIndex (reinterpreted as
+    *   uint *) and stores the number of occurrences in *numberOccurrences.
+    *   Returns NULL and sets *numberOccurrences to -1 if the pattern is
+    *   empty, contains an unknown token, or the integer index reports an
+    *   error.
+    *   NOTE(review): presumably the caller owns the returned array;
+    *   confirm against locateIntIndex.
+    *------------------------------------------------------------------ */
+uint *locateTextOcurrences(twcsa *wcsa, byte *textPattern, int *numberOccurrences) {
+	uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
+	uint integerPatternSize;
+	ulong occurrences = 0;
+	ulong *seOffsets = NULL;
+	ulong *sourceOffsets;
+	int err;
+
+	uint lenpat = strlen((char*)textPattern);
+	parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize);
+	if (!integerPatternSize) {*numberOccurrences = -1; return NULL;}
+
+	//obtains the indexes in vector SE where the pattern appears.
+	err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences);
+	if (err) {*numberOccurrences = -1; return NULL;}
+
+	sourceOffsets=seOffsets;	//positions are translated in place
+	//obtains the offsets in the source text of the pattern (sourceOffsets)
+	locateFacade(wcsa, (uint *)sourceOffsets, (uint *)seOffsets,occurrences);
+
+	#ifdef DEBUG_ON
+	{	ulong i;
+		fprintf(stderr,"\n*** %s appears in the source text in positions:\n\t",textPattern);
+		for (i=0;i<occurrences;i++)
+			fprintf(stderr,"[%lu]",sourceOffsets[i]);
+		fflush(stderr);
+	}
+	#endif
+
+	*numberOccurrences = occurrences;
+	return (uint *) sourceOffsets;
+}
+
+
+ /** ------------------------------------------------------------------
+    * displayTextOcurrences:
+    *   Not implemented for this index: the arguments are ignored and
+    *   the constant 99 is always returned.
+    *------------------------------------------------------------------ */
+int displayTextOcurrences(twcsa *wcsa, byte *textPattern, uint radixDisplay) {
+	const int notAvailable = 99;	//not implemented: function not available
+	return notAvailable;
+}
+
+ /** ------------------------------------------------------------------
+    * Locate Facade:
+    *   Not implemented for this index: neither sourceTextPositions nor
+    *   sePositions is read or written, and 99 is always returned.
+    *------------------------------------------------------------------*/
+int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint number) {
+	const int notAvailable = 99;	//not implemented: function not available for this index
+	return notAvailable;
+}
+
+
+/** ------------------------------------------------------------------
+	* DISPLAYFACADE:
+	*   Not implemented for this index: dstptr and *length are left
+	*   untouched and 99 is always returned.
+	------------------------------------------------------------------*/
+ int displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length, byte *dstptr) {
+	const int notAvailable = 99;	//not implemented: function not available for this index
+	return notAvailable;
+}
+
+
+ /**------------------------------------------------------------------
+	* DISPLAYFacadeMalloc:
+	*   Not implemented for this index: nothing is allocated, *length
+	*   is left untouched and NULL is always returned.
+	------------------------------------------------------------------*/
+byte *displayFacadeMalloc (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length) {
+	return (byte *) NULL;	//not implemented: function not available
+}
+
+
+ /** ------------------------------------------------------------------
+	* LOCATEALLandDISPLAY:
+	*   Not implemented for this index: the arguments are ignored and
+	*   99 is always returned.
+	------------------------------------------------------------------*/
+int locateAllAndDisplay (twcsa *wcsa, uint *sePositions, uint number, int radix) {
+	const int notAvailable = 99;	//not implemented: function not available for this index
+	return notAvailable;
+}
+
+
+ /** ------------------------------------------------------------------
+	* recoverSourceText1:
+	*   Rebuilds the source text into the file "<basename><ext>" by
+	*   reading the wcsa->seSize ids one at a time with displayIntIndex
+	*   and copying the matching vocabulary word to an output buffer of
+	*   sourceTextSize bytes. A blank is inserted between two
+	*   consecutive words that both start with a _Valid character.
+	*   Extraction stops early (and what was decoded so far is written)
+	*   if displayIntIndex reports an error or the buffer would overflow.
+	* ------------------------------------------------------------------ */
+void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {
+
+	char *filename;
+	FILE *salida;
+	byte *cc, *src, *dst;
+	uint j;
+	uint prevValid = 0;	//nothing emitted yet, so no blank before the first word
+	uint leng = 0;
+	uint tmplen = 0;
+	uint indexSE = 0;
+	uint posSEValue = 0;
+
+	filename = (char *) malloc (strlen( basename)+ strlen(ext)+1);
+	if (filename == NULL) {
+		fprintf(stderr,"\n Cannot allocate memory for the output file name");
+		return;
+	}
+	strcpy( filename, basename);
+	strcat( filename, ext);
+	fprintf(stderr,"\n uncompressing text into file -->%s ",filename);fflush(stderr);
+
+	unlink( filename);
+	salida = fopen( filename,"w");
+	if (salida == NULL) {
+		fprintf(stderr,"\n Cannot open file %s",filename);
+		free(filename);
+		return;
+	}
+
+	cc = (byte *) malloc (sourceTextSize* sizeof(uchar));
+	if (cc == NULL && sourceTextSize > 0) {
+		fprintf(stderr,"\n Cannot allocate %u bytes for the recovered text",sourceTextSize);
+		fclose(salida);
+		free(filename);
+		return;
+	}
+
+	dst=cc;
+	while (indexSE < wcsa->seSize) {   /** extracting words (if not at the end) */
+		uint offtmp, ith, blank;
+		int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
+		if (err) {
+			fprintf(stderr,"\n error during recoverSourceText1");
+			break;
+		}
+
+		//obtains pointer to the ith word: it spans offsets [off[ith], off[ith+1]) of the words zone
+		ith = posSEValue -1;  // ids are 1-based
+		tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
+		offtmp = bitread (wcsa->wordsData.words, ( ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
+		tmplen -=offtmp;  //the length of the ith word.
+		src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
+
+		//a blank is needed only between two consecutive valid words
+		blank = (_Valid[*src] && prevValid) ? 1 : 0;
+
+		//never write past the sourceTextSize bytes that were allocated
+		if ((ulong) leng + blank + tmplen > (ulong) sourceTextSize) {
+			fprintf(stderr,"\n recovered text does not fit in %u bytes",sourceTextSize);
+			break;
+		}
+
+		if (blank) {
+			*dst++ =' ';
+			leng +=1;
+		}
+		prevValid = _Valid[*src] ? 1 : 0;   //for the next iteration
+
+		indexSE++;
+
+		for (j=0;j<tmplen;j++) {*dst++ = *src++;}  //copies word to the output buffer
+		leng +=tmplen;
+	}//while
+
+	fprintf(stderr,"\n sourceTextSize = %u, len = %u",sourceTextSize,leng);
+	fwrite(cc,sizeof(byte),leng,salida);
+	fclose(salida);
+
+	free(cc);
+	free(filename);
+}
+
+ /** ------------------------------------------------------------------
+	* recoverSourceText2:
+	*   Rebuilds the source text into the file "<basename><ext>" with a
+	*   single call to extractWords over the id range [0, wcsa->seSize].
+	*   sourceTextSize is only reported in the progress message.
+	*   Returns without writing anything if the output file cannot be
+	*   created; terminates the process if extractWords fails.
+	* ------------------------------------------------------------------ */
+void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {
+
+	int start;int end; int error;
+	char *filename;
+	byte *cc;
+	ulong length;
+	FILE *salida;
+
+	filename = (char *) malloc (strlen( basename)+ strlen(ext)+1);
+	if (filename == NULL) {
+		fprintf(stderr,"\n Cannot allocate memory for the output file name");
+		return;
+	}
+	strcpy( filename, basename);
+	strcat( filename, ext);
+	fprintf(stderr,"\n uncompressing text into file -->%s ",filename);fflush(stderr);
+
+	unlink( filename);
+	salida = fopen( filename,"w");
+	if (salida == NULL) {	//fwrite/fclose on a NULL stream would crash
+		fprintf(stderr,"\n Cannot open file %s",filename);
+		free(filename);
+		return;
+	}
+	start=0; end = wcsa->seSize;
+
+	error = extractWords((void *) wcsa, start, end, &cc, &length);
+	if (error) {fprintf(stderr,"\n error during recoverSourceText2"); exit(0);}
+
+	fprintf(stderr,"\n sourceTextSize = %u, len = %lu",sourceTextSize,length);
+	fwrite(cc,sizeof(byte),length,salida);
+	fclose(salida);
+
+	free(cc);
+	free(filename);
+}
+
+/** *******************************************************************************
+ * Showing some statistics and info of the index
+ * *******************************************************************************/
+/* Not implemented for this index: prints nothing and ignores wcsa. */
+void printInfoReduced(twcsa *wcsa) {
+	return;	//not implemented: function not available
+}
+
+ /** ------------------------------------------------------------------
+	* printInfo:
+	*   Prints to stdout a summary of the presentation layer (number of
+	*   words, size of the vocabulary structures) followed by the report
+	*   of the integer index (printInfoIntIndex).
+	*   The size attributed to the presentation layer is the result of
+	*   index_size minus the result of sizeIntIndex.
+	*   Returns 0 on success, or the first non-zero code returned by
+	*   index_size, sizeIntIndex or printInfoIntIndex.
+	* ------------------------------------------------------------------ */
+int printInfo(void *index) {
+	twcsa *wcsa = (twcsa *) index;
+
+	unsigned long indexSize;
+	uint intIndexSize, presentationSize;
+	int err;
+
+	err = index_size(index, &indexSize);
+	if (err!=0) return err;
+	err = sizeIntIndex(wcsa->myicsa, &intIndexSize);
+	if (err!=0) return err;
+
+	presentationSize = indexSize - intIndexSize;
+
+	printf("\n ===================================================:");
+	printf("\n Summary of Presentation layer:");
+	printf("\n Number of valid words (SEsize) = %u",wcsa->seSize);
+	printf("\n Number of different words = %lu",wcsa->n);
+	//sizeof yields a size_t: cast it so that it matches the conversion used
+	printf("\n WCSA structure = %lu bytes", (unsigned long) sizeof(twcsa));
+
+	uint totalpointers = ((((wcsa->n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));
+	uint totalasciizone = wcsa->wordsData.wordsZoneMem.size * sizeof(byte) ;
+	uint totalwords = totalasciizone + totalpointers;
+
+	printf("\n Size Of words structure (%u bytes):",totalwords);
+	printf("\n [ pointers = %u bytes || AsciiZone = %u bytes", totalpointers, totalasciizone);
+
+	printf("\n\n Total = ** %u bytes (in RAM) **",presentationSize);
+	err = printInfoIntIndex(wcsa->myicsa, " ");
+	if (err!=0) return err;
+
+	printf("\n ===================================================:");
+	printf("\n");
+	return 0;
+ }
+
+/**------------------------------------------------------------------
+ * structsSizeMem.
+ *   Not implemented for this index: wcsa is ignored and 0 is always
+ *   returned.
+ ----------------------------------------------------------------- */
+uint structsSizeMem(twcsa *wcsa) {
+	const uint notAvailable = 0;	//not implemented: function not available for this index.
+	return notAvailable;
+}
+
+
+/** Debugging aid: writes the len bytes starting at str to stderr (no terminator is needed). **/
+void printWord(uchar *str, uint len) {
+	uchar *cursor = str;
+	uchar *stop = str + len;
+	while (cursor < stop) {
+		fprintf(stderr,"%c",*cursor);
+		cursor++;
+	}
+}
+
+
+ /** ------------------------------------------------------------------
+	* saveSEfile:
+	*   Writes the n integers of v (ids of the source words) to the file
+	*   "<basename>.<SE_FILE_EXT>", replacing any previous file.
+	*   Returns 0 on success and -1 if the file name does not fit in the
+	*   local buffer or the data cannot be written completely.
+	*   Terminates the process if the file cannot be created.
+	* ------------------------------------------------------------------ */
+int saveSEfile (char *basename, uint *v, uint n) {
+	char outfilename[255];
+	int file;
+	int nameLen;
+	const byte *src = (const byte *) v;
+	size_t pending = sizeof(uint) * (size_t) n;
+
+	//bounded formatting: a long basename must not overflow the buffer
+	nameLen = snprintf(outfilename,sizeof(outfilename),"%s.%s",basename,SE_FILE_EXT);
+	if (nameLen < 0 || (size_t) nameLen >= sizeof(outfilename)) {
+		printf("File name too long: %s.%s\n", basename, SE_FILE_EXT);
+		return -1;
+	}
+
+	unlink(outfilename);
+	if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
+		printf("Cannot open file %s\n", outfilename);
+		exit(0);
+	}
+
+	//write() may store fewer bytes than requested, so insist until done
+	while (pending > 0) {
+		ssize_t done = write(file, src, pending);
+		if (done <= 0) {
+			printf("Cannot write file %s\n", outfilename);
+			close(file);
+			return -1;
+		}
+		src += done;
+		pending -= (size_t) done;
+	}
+	close(file);
+	return 0;
+}
+
+
+
+/* Returns the user plus system CPU time, in seconds, that getrusage
+ * reports for RUSAGE_SELF. */
+double getTime2 (void)
+{
+	struct rusage ru;
+	double userSecs, sysSecs;
+
+	getrusage (RUSAGE_SELF, &ru);
+
+	//seconds plus the microsecond remainder, for each of the two clocks
+	userSecs = (double) ru.ru_utime.tv_sec + (double) ru.ru_utime.tv_usec / 1000000.0;
+	sysSecs = (double) ru.ru_stime.tv_sec + (double) ru.ru_stime.tv_usec / 1000000.0;
+
+	return (userSecs + sysSecs);
+}
+
+
+
+/**------------------------------------------------------------------
+  *  MAIN PROGRAM (test driver, compiled only when FACADEWITHMAIN is defined).
+  *  Usage: <program> <in file> <out basename> [build options]
+  *  Loads the input file, builds the index, recovers the source text
+  *  into "<out basename>.source2", runs an interactive display test
+  *  (an empty line or end of input finishes it), saves the index and
+  *  reports its size.
+  *------------------------------------------------------------------ */
+#ifdef FACADEWITHMAIN
+	int main(int argc, char* argv[])
+	{
+		char *infile, *outbasename, *stopwordsfile;	// Name of in/out files
+		byte *inputBuffer;
+		ulong finsize;
+
+		int f_in;
+		void *Index;
+
+		printf("\n*Word-based iCSA: A word-based CSA");
+		printf("\n*CopyRight (c) 2008 [LBD & G.N.]\n\n");
+
+		// Reads input parameters from command line.
+		if(argc < 3) {
+			printf("Use: %s <in file> <out basename> \n", argv[0]);
+			exit(0);
+		}
+
+		// Reads params (input file, output basename, and the optional third argument)
+		infile = argv[1];
+		outbasename = argv[2];
+		stopwordsfile = argv[3];	//NULL when only two arguments are given (argv[argc] is NULL)
+
+		finsize= fileSize(infile);
+
+		if (! finsize) {
+			printf( "\nFILE EMPTY OR FILE NOT FOUND %s !!\nSkipping processement ...\n",infile);
+			exit(0);
+		}
+
+		// Opening the input text file.
+		if( (f_in = open(infile, O_RDONLY)) < 0) {
+			printf("Cannot read file %s\n", infile);
+			exit(0);
+		}
+		inputBuffer = (byte *) malloc(finsize *sizeof(byte));
+		if (inputBuffer == NULL) {
+			printf("Cannot read file %s\n", infile);
+			close (f_in);
+			exit(1);
+		}
+		{	//the whole file is needed: insist until every byte has been read
+			ulong loaded = 0;
+			while (loaded < finsize) {
+				ssize_t got = read (f_in,inputBuffer+loaded,finsize-loaded);
+				if (got <= 0) {
+					printf("Cannot read file %s\n", infile);
+					close (f_in);
+					exit(1);
+				}
+				loaded += (ulong) got;
+			}
+		}
+		close (f_in);
+
+		{
+			/** building the index */
+			if (build_index (inputBuffer, finsize, stopwordsfile, &Index)) {
+				fprintf(stderr, "\nerror during build_index\n");
+				exit(1);
+			}
+
+			/** recovering the source text from the index */
+			{
+				double start, end;
+				start = getTime2();
+				ulong size;
+				get_length(Index, &size);
+				char extension[10]= ".source";
+
+				strcat(extension,"2");
+				recoverSourceText2((twcsa*) Index, outbasename,extension,size);
+				end = getTime2();
+				fprintf(stderr, "\nRecovering source file time: %.3f secs\n", end-start );
+			}
+
+			// DISPLAYING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
+			{unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
+			 int error = 0;
+			 ulong numocc,numc, length, i, *snippet_len;
+			 uchar *pattern, *snippet_text;
+
+			 pattern = textPattern;
+			 printf("\nSEARCH TEST for DISPLAY (pizzachili interface)\n");
+			 while(1) {
+				printf("Intro string: ");
+				//end of input (or a read error) finishes the test instead of looping forever
+				if (fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin) == NULL) break;
+				if (!strcmp((char*)textPattern,"\n") ) break;
+
+				//drops the trailing newline, if the line fitted in the buffer
+				length = strlen( (char*)textPattern);
+				if (length > 0 && textPattern[length-1] == '\n') {
+					textPattern[length-1] = '\0';
+					length--;
+				}
+				numc=50;
+
+				error =  displayWords (Index, textPattern, length, numc, &numocc,
+									   &snippet_text, &snippet_len,1);
+
+				if (error){ fprintf(stderr, "%s\n", "Hubo un error durante display");exit(0);}
+
+				fprintf(stderr,"\n acabou display");fflush(stderr);
+				{//show the results
+					ulong len = length + 2*numc;	//stride between two snippets
+					fprintf(stderr,"\n length = %lu",length);
+					fprintf(stderr,"\n pattern = %s",pattern);fflush(stderr);
+					fprintf(stderr,"\n numocc = %lu",numocc);fflush(stderr);
+					fprintf(stderr,"\n snippet len = %lu",len);fflush(stderr);
+					fprintf(stderr,"\n =========");fflush(stderr);
+					for (i = 0; i < numocc; i++){
+						fprintf(stderr,"\n[%2lu][len=%3lu]<<",i+1,snippet_len[i]);fflush(stderr);
+						fwrite(snippet_text+len*i,sizeof(uchar),snippet_len[i],stderr);fflush(stderr);
+						fprintf(stderr,">>");fflush(stderr);
+					}
+				}
+
+				if (numocc) {
+					free (snippet_len);
+					free (snippet_text);
+				}
+
+				printf("Ocurrences = %lu\n", numocc);
+			 }
+			}
+
+			/** saving the index to disk*/
+			save_index (Index, outbasename);
+
+			/** tells the mem used by the index */
+			ulong indexsize;
+			index_size(Index, &indexsize);
+			fprintf(stderr,"Index occupied %lu bytes, 2 extra mallocs = %lu",indexsize,(unsigned long) (2* sizeof(uint)));
+
+			/** freeing the index */
+			free_index(Index);
+
+		}
+		return 0;
+	}
+
+#endif
+
+
+
--- /dev/null
+/* only for getTime() */
+#include <sys/time.h>
+#include <sys/resource.h>
+
+
+#include "utils/valstring.h"
+#include "utils/defValues.h"
+#include "utils/MemoryManager.h"
+#include "utils/fileInfo.h"
+
+#include "utils/hash.h"
+
+#include "utils/huff.h"
+//#include "utils/errors.c"
+#include "utils/parameters.h"
+
+//from SEARCHER FACADE
+#include "utils/huffDec.h"
+//#include "icsa/icsa.h"
+
+#include "intIndex/interfaceIntIndex.h"
+
+/* Short aliases for the unsigned integer types used by this header.
+   Each alias is a macro and is only defined when no macro with the same
+   name is already defined. */
+#ifndef uchar
+#define uchar unsigned char
+#endif
+#ifndef uint
+#define uint unsigned int
+#endif
+#ifndef ulong
+#define ulong unsigned long
+#endif
+
+/* STRLEN(str,len): sets len to the number of bytes that precede the first
+   zero byte of str. Expands to a braced block that declares a local named
+   `ptr`; the arguments are not parenthesized, so pass plain variables. */
+#define STRLEN(str,len) \
+{len=0; \
+ byte *ptr = str; \
+ while(*ptr++) len++; \
+}
+
+/* ADDLEN(str,len): adds to len (without resetting it) the number of bytes
+   that precede the first zero byte of str. Expands to a braced block that
+   declares a local named `ptr`. */
+#define ADDLEN(str,len) \
+{byte *ptr = str; \
+ while(*ptr++) len++; \
+}
+
+/** Some data types used **ONLY**during construction process */
+
+// Words, both the canonical words and their variants
+ /* Associates a word with the hash-table slot of its canonical word. */
+ typedef struct {
+ unsigned long slot; // the position in the hash table of the canonical word
+ byte *word; // the word itself; kept here to make alphanumerical sorting easier
+ } tposInHT;
+
+
+ /* A large block of memory used to load a file into memory. */
+ typedef struct SzoneMem { //a large block of memory to load a file into mem.
+ byte *zone; //block of mem.
+ uint size; //number of bytes
+ } tZoneMem;
+
+ // words dataStructure.
+ /* Vocabulary storage: the canonical words plus the pointers into them. */
+ typedef struct {
+ uint *words; // presumably the packed pointers into wordsZoneMem -- TODO confirm
+ uint elemSize; //the size (in bits) of each pointer.
+ tZoneMem wordsZoneMem; // a block of memory where the canonical words are loaded (from file).
+
+ } twords;
+
+
+/** Some data types used during searches */
+
+
+
+
+ /**the WCSA index structures... */
+ /* Top-level WCSA index: the vocabulary, a few sizes and the opaque
+    integer self-index built over the word sequence. */
+ typedef struct {
+
+ /**valid words */
+ twords wordsData; /* vocabulary (words) of the index */
+
+ ulong n; /* number of different words. */
+ uint seSize; /* number of words in the source text */
+
+ uint sourceTextSize; /*the size of the source text in bytes*/
+
+ //ticsa *myicsa; //the WiCSA on SE words
+ void *myicsa; //the WiCSA on SE words (kept as an opaque pointer)
+
+ //#ifndef CSA_ON
+ uint *se; // presumably the sequence of word ids the self-index is built on -- TODO confirm
+ //#endif
+
+ }twcsa;
+
+
+/** ******************************************************************************
+ * Interface (from pizza chili) for using the WCSA index
+*********************************************************************************/
+
+/* Error management */
+
+ /* Returns a string describing the error associated with error number
+ e. The string must not be freed, and it will be overwritten with
+ subsequent calls. */
+
+char *error_index (int e);
+
+/* Building the index */
+
+ /* Creates index from text[0..length-1]. Note that the index is an
+ opaque data type. Any build option must be passed in string
+ build_options, whose syntax depends on the index. The index must
+ always work with some default parameters if build_options is NULL.
+ The returned index is ready to be queried. */
+
+int build_index (uchar *text, ulong length, char *build_options, void **index);
+
+ /* Saves index on disk by using single or multiple files, having
+ proper extensions. */
+
+int save_index (void *index, char *filename);
+
+ /* Loads index from one or more file(s) named filename, possibly
+ adding the proper extensions. */
+
+int load_index (char *filename, void **index);
+
+ /* Frees the memory occupied by index. */
+
+int free_index (void *index);
+
+ /* Gives the memory occupied by index in bytes. */
+
+int index_size(void *index, ulong *size);
+
+/* Querying the index */
+
+ /* Writes in numocc the number of occurrences of the substring
+ pattern[0..length-1] found in the text indexed by index. */
+
+int count (void *index, uchar *pattern, ulong length, ulong *numocc);
+
+ /* Gives the length of the text indexed */
+
+int get_length(void *index, ulong *length);
+
+/* Accessing the indexed text */
+
+ /* Writes in numocc the number of occurrences of the substring
+ pattern[0..length-1] in the text indexed by index. It also allocates
+ occ (which must be freed by the caller) and writes the locations of
+ the numocc occurrences in occ, in arbitrary order. */
+
+int locate (void *index, uchar *pattern, ulong length, ulong **occ,
+ ulong *numocc);
+
+ /* Allocates snippet (which must be freed by the caller) and writes
+ the substring text[from..to] into it. Returns in snippet_length the
+ length of the text snippet actually extracted (that could be less
+ than to-from+1 if to is larger than the text size). */
+
+int extract (void *index, ulong from, ulong to, uchar **snippet,
+ ulong *snippet_length);
+
+ /* Displays the text (snippet) surrounding any occurrence of the
+ substring pattern[0..length-1] within the text indexed by index.
+ The snippet must include numc characters before and after the
+ pattern occurrence, totalizing length+2*numc characters, or less if
+ the text boundaries are reached. Writes in numocc the number of
+ occurrences, and allocates the arrays snippet_text and
+ snippet_lengths (which must be freed by the caller). The first is a
+ character array of numocc*(length+2*numc) characters, with a new
+ snippet starting at every multiple of length+2*numc. The second
+ gives the real length of each of the numocc snippets. */
+
+int display (void *index, uchar *pattern, ulong length, ulong numc,
+ ulong *numocc, uchar **snippet_text, ulong **snippet_lengths);
+
+ /* Obtains the length of the text indexed by index. */
+
+int length (void *index, ulong *length);
+
+ /* Shows summary info of the index */
+int printInfo(void *index);
+
+/** *******************************************************************************************/
+/** Building part of the index ****************************************************************/
+
+int build_WCSA (uchar *text, ulong length, char *build_options, void **index);
+int build_iCSA (char *build_options, void *index);
+
+
+
+/** *******************************************************************************************/
+/** Search part of the index ******************************************************************/
+// Declarations of some PUBLIC function prototypes.
+
+ //loading/freeing the data structures into memory.
+
+ void loadStructs(twcsa *wcsa, char *basename);
+ twcsa *loadWCSA(char *filename);
+
+ //returns the source text from given [offsetIni, offsetFin] offsets.
+ //byte *displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin);
+ byte *displayFacadeMalloc (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length);
+ int displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length, byte *dstptr);
+
+ //locate all the ocurrences of a word/phrase
+ int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint number);
+
+ //show text around the occurrences of a word.
+ int locateAllAndDisplay (twcsa *wcsa, uint *sePositions, uint number, int radix);
+
+ //recovers the source text by calling display (either only once or "len" times)
+ void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
+ void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
+
+ //***Searching for a TEXT pattern ...
+
+ //extracts the ids of the valid words of a "plain text".
+ void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *integerPattern, uint *sizeIntegers) ;
+
+ //counts the occurrences of a given text pattern.
+ int countTextOcurrences(twcsa *wcsa, byte *textPattern);
+
+ //returns the offsets (in the source text) where a given text pattern appears.
+ uint *locateTextOcurrences(twcsa *wcsa, byte *textPattern, int *numberOccurrences);
+
+ //shows a snippet with the text around the ocurrences of a pattern.
+ int displayTextOcurrences(twcsa *wcsa, byte *textPattern, uint radixDisplay);
+
+
+/** ***********************************************************************************
+ * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord
+ * ***********************************************************************************/
+ /** Writes in numocc the number of occurrences of the substring
+ pattern[0..length-1] in the text indexed by index. It also allocates
+ occ (which must be freed by the caller) and writes the locations of
+ the numocc occurrences in occ, in arbitrary order. These occurrences
+ refer to the offsets in TOH where the caller could start a display
+ operation. So locateWord implies synchronization using B.
+ ** Parameter kbefore sets locateWord not to obtain the offset in TOH of the
+ searched word, but the offset in TOH of k-before words before.
+ */
+
+int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore);
+
+ /** Displays the text (snippet) surrounding any occurrence of the
+ substring pattern[0..length-1] within the text indexed by index.
+ The snippet must include numc characters before and after the
+ pattern occurrence, totalizing length+2*numc characters, or less if
+ the text boundaries are reached. Writes in numocc the number of
+ occurrences, and allocates the arrays snippet_text and
+ snippet_lengths (which must be freed by the caller). The first is a
+ character array of numocc*(length+2*numc) characters, with a new
+ snippet starting at every multiple of length+2*numc. The second
+ gives the real length of each of the numocc snippets. */
+
+ int displayWords (void *index, uchar *pattern, ulong length, ulong numc,
+ ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore);
+
+
+/** Simulates the text-extraction process, but does not actually return anything.
+ Extracts up to 2K words, starting K=wordsbefore words before each occurrence of a pattern.
+ Fewer than 2K words may be extracted if more than numc characters have already been obtained.
+ Does nothing else; the text is not returned. */
+
+ int displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc);
+
+
+/** Allocates text (which must be freed by the caller) and recovers the
+ substring of the text starting from the "fromWord"-th word up to the
+ "toWord"-th word. Returns the text in "text", and in "text_length" the
+ length of the text actually extracted. Text is allocated.
+ Actually extracts SE[fromWord .. toWord), i.e. not the last word. */
+
+int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text,
+ ulong *text_length);
+
+
+
+
+ //recovers the source text by calling display (either only once or "len" times)
+ void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
+ void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
+
+
+// Definitions of PRIVATE functions
+
+ //Auxiliary functions
+
+ uint structsSizeDisk(twcsa *wcsa);
+ uint structsSizeMem(twcsa *wcsa);
+ void printInfoReduced(twcsa *wcsa);
+ int saveSEfile (char *basename, uint *v, uint n);
+ double getTime2 (void);
+
+
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "buildFacade.h"
+
+/* only for getTime() */
+#include <sys/time.h>
+#include <sys/resource.h>
+
+/* IFERROR(error): when error is non-zero, prints the message returned by
+   error_index(error) to stderr and terminates the process with status 1. */
+#define IFERROR(error) {{if (error) { fprintf(stderr, "%s\n", error_index(error)); exit(1); }}}
+
+int loadSEfile (char *basename, uint **v, uint *n);
+void print_usage(char *);
+double getTime(void);
+
+/*
+ * Builds the self-index on integers for the sequence loaded by loadSEfile()
+ * from argv[1], and saves it using argv[1] as base name. Any further
+ * arguments are joined with single blanks and passed to buildIntIndex() as
+ * its build options. Timing and size figures are printed to stdout, and the
+ * program exits with status 0 on success (IFERROR exits with 1 on failure).
+ */
+int main(int argc, char *argv[]) {
+
+    char *basenamefile;
+    char *params = NULL;
+    void *index = NULL; uint index_len;
+    int error, i;
+    double start, end;
+    uint *se;
+    uint seSize;
+
+    if (argc < 2) print_usage(argv[0]);   /* print_usage() exits */
+    if (argc > 2) {
+        /* One extra byte per option for the separating blank; the blank
+           after the last option is turned into the terminating '\0'. */
+        size_t nchars = (size_t) (argc-2);
+        size_t len;
+        for(i=2;i<argc;i++)
+            nchars += strlen(argv[i]);
+        params = (char *) malloc((nchars+1)*sizeof(char));
+        if (params == NULL) {
+            fprintf(stderr, "Cannot allocate memory for the build parameters\n");
+            exit(1);
+        }
+        params[nchars] = '\0';
+        nchars = 0;
+        for(i=2;i<argc;i++) {
+            len = strlen(argv[i]);
+            memcpy(params+nchars,argv[i],len);
+            params[nchars+len] = ' ';
+            nchars += len+1;
+        }
+        params[nchars-1] = '\0';
+    }
+
+    basenamefile = argv[1];
+
+    start = getTime();
+
+    error = loadSEfile (basenamefile, &se, &seSize);
+    IFERROR(error);
+
+    error = buildIntIndex(se,seSize, params,(void **)&index);
+    IFERROR(error);
+
+    /* Without an index there is nothing to save, measure or free. */
+    if (index == NULL) {
+        fprintf(stderr, "Index construction returned no index\n");
+        exit(1);
+    }
+
+    error = saveIntIndex(index, basenamefile);
+    IFERROR(error);
+    end = getTime();
+
+    error = sizeIntIndex(index, &index_len);
+    IFERROR(error);
+
+    printf("\n\n\t Freeing memory...");
+
+    error = freeIntIndex(index);
+    IFERROR(error);
+
+    free(se);
+    free(params);
+
+    /* seSize*sizeof(uint) has type size_t: cast it so that the conversion
+       specifier matches the argument on every platform. */
+    fprintf(stdout,"\n\tBuilding time (**building self-index on ints: %.3f secs", end-start );
+    fprintf(stdout,"\n\t ## Input: %lu bytes --> Output (pres_layer) %u bytes.",
+            (unsigned long) (seSize*sizeof(uint)), index_len);
+    fprintf(stdout,"\n\t ## Overall compression --> %.3f%% (%.3f bits per char).\n\n",
+            (100.0*index_len)/(seSize*sizeof(uint)), (index_len*8.0)/(seSize*sizeof(uint)));
+
+    exit(0);
+}
+
+
+/*
+ * Loads the integer sequence stored in "<basename>.<SE_FILE_EXT>".
+ *
+ * On success returns 0, stores in *v a malloc'ed array holding the whole
+ * file (to be freed by the caller) and in *n the number of uints it holds.
+ * On failure prints a message to stdout and returns -1; *v and *n are then
+ * left untouched.
+ */
+int loadSEfile (char *basename, uint **v, uint *n){
+    char filename[255];
+    int file;
+    int nameLen;
+
+    /* Bounded formatting: a long basename must not overflow filename[]. */
+    nameLen = snprintf(filename, sizeof(filename), "%s.%s", basename, SE_FILE_EXT);
+    if (nameLen < 0 || (size_t) nameLen >= sizeof(filename)) {
+        printf("File name too long: %s.%s\n", basename, SE_FILE_EXT);
+        return -1;
+    }
+
+    uint sizeFile = fileSize(filename);
+
+    /* sizeFile is unsigned, so zero is the only value "<= 0" could match. */
+    if( sizeFile == 0) {
+        printf("Cannot read information from file %s\n", filename); return -1;
+    }
+
+    if( (file = open(filename, O_RDONLY)) < 0) {
+        printf("Cannot open file %s\n", filename);
+        return -1;
+    }
+
+    uint *se = (uint *) malloc (sizeFile);
+    if (se == NULL) {
+        printf("Cannot allocate %u bytes for file %s\n", sizeFile, filename);
+        close(file);
+        return -1;
+    }
+
+    /* read() may return fewer bytes than requested: keep reading until the
+       whole file is in memory, and fail instead of indexing garbage. */
+    uint done = 0;
+    while (done < sizeFile) {
+        ssize_t got = read(file, (char *) se + done, sizeFile - done);
+        if (got <= 0) {
+            printf("Cannot read information from file %s\n", filename);
+            free(se);
+            close(file);
+            return -1;
+        }
+        done += (uint) got;
+    }
+    close(file);
+
+    *v=se;
+    *n=sizeFile / sizeof(uint);
+    return 0;
+}
+
+
+
+/* Returns the CPU time (user + system), in seconds, consumed so far by the
+   calling process, as reported by getrusage(RUSAGE_SELF). */
+double
+getTime (void)
+{
+    struct rusage ru;
+    double secs[2];
+
+    getrusage (RUSAGE_SELF, &ru);
+
+    /* secs[0] = user time, secs[1] = system time */
+    secs[0] = (double) ru.ru_utime.tv_sec + (double) ru.ru_utime.tv_usec / 1000000.0;
+    secs[1] = (double) ru.ru_stime.tv_sec + (double) ru.ru_stime.tv_usec / 1000000.0;
+
+    return (secs[0] + secs[1]);
+}
+
+/* Prints the command-line help to stderr and exits with status 1. */
+void print_usage(char * progname) {
+    static const char *const details[] = {
+        "\nIt builds the index on Integer for the sequence in <index file>.se,\n",
+        "storing it in [<index file>.[*]]; Any additional <parameters> \n",
+        "will be passed to the construction function.\n",
+        "At the end, the program sends to the standard error \n",
+        "performance measures on time to build it.\n\n"
+    };
+    size_t k;
+
+    fprintf(stderr, "Usage: %s <index file> [<parameters>]\n", progname);
+    for (k = 0; k < sizeof(details)/sizeof(details[0]); k++)
+        fputs(details[k], stderr);
+    exit(1);
+}
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "buildFacade.h"
+
+/* only for getTime() */
+#include <sys/time.h>
+#include <sys/resource.h>
+
+/* IFERROR(error): when error is non-zero, prints the message returned by
+   error_index(error) to stderr and terminates the process with status 1. */
+#define IFERROR(error) {{if (error) { fprintf(stderr, "%s\n", error_index(error)); exit(1); }}}
+
+int read_file(char *filename, uchar **textt, ulong *length);
+void print_usage(char *);
+double getTime(void);
+
+/*
+ * Builds the WCSA index for the text in argv[1] and saves it under argv[2].
+ * Any further arguments are joined with single blanks and passed to
+ * build_WCSA() as its build options. Timing and size figures are printed
+ * to stdout, and the program exits with status 0 on success (IFERROR exits
+ * with 1 on failure).
+ */
+int main(int argc, char *argv[]) {
+
+    char *infile, *outfile;
+    uchar *text;
+    char *params = NULL;
+    ulong text_len;
+    ulong index_len;
+    void *index;
+    int error, i;
+    double start, end;
+
+    if (argc < 3) print_usage(argv[0]);   /* print_usage() exits */
+    if (argc > 3) {
+        /* One extra byte per option for the separating blank; the blank
+           after the last option is turned into the terminating '\0'. */
+        size_t nchars = (size_t) (argc-3);
+        size_t len;
+        for(i=3;i<argc;i++)
+            nchars += strlen(argv[i]);
+        params = (char *) malloc((nchars+1)*sizeof(char));
+        if (params == NULL) {
+            fprintf(stderr, "Cannot allocate memory for the build parameters\n");
+            exit(1);
+        }
+        params[nchars] = '\0';
+        nchars = 0;
+        for(i=3;i<argc;i++) {
+            len = strlen(argv[i]);
+            memcpy(params+nchars,argv[i],len);
+            params[nchars+len] = ' ';
+            nchars += len+1;
+        }
+        params[nchars-1] = '\0';
+    }
+
+    infile = argv[1];
+    outfile = argv[2];
+
+    start = getTime();
+    error = read_file(infile, &text, &text_len);
+    IFERROR(error);
+
+    /* NOTE(review): text is never freed here; whether build_WCSA() takes
+       ownership of the buffer is not visible from this file -- confirm. */
+    error = build_WCSA (text, text_len, params, &index);
+    IFERROR(error);
+
+    error = save_index(index, outfile);
+    IFERROR(error);
+    end = getTime();
+
+    /* Check the result so that index_len is never printed uninitialized. */
+    error = index_size(index, &index_len);
+    IFERROR(error);
+
+    error = free_index(index);
+    IFERROR(error);
+
+    free(params);
+
+    fprintf(stdout,"\n\n\t ## Building time (**parsing into integers + present_layer: %.3f secs", end-start );
+    fprintf(stdout,"\n\t ## Input: %lu bytes --> Output (pres_layer) %lu bytes.", text_len, index_len);
+    fprintf(stdout,"\n\t ## Overall compression --> %.2f%% (%.2f bits per char).\n\n",
+            (100.0*index_len)/text_len, (index_len*8.0)/text_len);
+
+    exit(0);
+}
+
+/*
+ * Reads the whole file `filename` into a freshly allocated buffer.
+ *
+ * On success returns 0, stores the buffer in *textt (to be freed by the
+ * caller) and its size in bytes in *length. On any failure returns 1 and
+ * releases whatever had been acquired; *textt is then left untouched.
+ */
+int read_file(char *filename, uchar **textt, ulong *length) {
+
+    uchar *text;
+    unsigned long t;
+    long fsize;
+    FILE *infile;
+
+    infile = fopen(filename, "rb"); // b is for binary: required by DOS
+    if(infile == NULL) return 1;
+
+    /* store input file length (ftell reports failure as a negative value) */
+    if(fseek(infile,0,SEEK_END) !=0 ) { fclose(infile); return 1; }
+    fsize = ftell(infile);
+    if(fsize < 0) { fclose(infile); return 1; }
+    *length = (ulong) fsize;
+
+    /* alloc memory for the text; ask for at least one byte because
+       malloc(0) may legitimately return NULL for an empty file */
+    text = (uchar *) malloc((*length ? *length : 1)*sizeof(*text));
+    if(text == NULL) { fclose(infile); return 1; }
+
+    /* read text in one sweep */
+    rewind(infile);
+    t = fread(text, sizeof(*text), (size_t) *length, infile);
+    if(t!=*length) {
+        free(text);
+        fclose(infile);
+        return 1;
+    }
+    *textt = text;
+    fclose(infile);
+
+    return 0;
+}
+
+/* Returns the CPU time (user + system), in seconds, consumed so far by the
+   calling process, as reported by getrusage(RUSAGE_SELF). */
+double
+getTime (void)
+{
+    struct rusage ru;
+    double user_secs;
+    double sys_secs;
+
+    getrusage (RUSAGE_SELF, &ru);
+
+    user_secs = (double) ru.ru_utime.tv_sec;
+    user_secs = user_secs + (double) ru.ru_utime.tv_usec / 1000000.0;
+
+    sys_secs = (double) ru.ru_stime.tv_sec;
+    sys_secs = sys_secs + (double) ru.ru_stime.tv_usec / 1000000.0;
+
+    return (user_secs + sys_secs);
+}
+
+/* Prints the command-line help to stderr and exits with status 1. */
+void print_usage(char * progname) {
+    static const char *const details[] = {
+        "\nIt builds the index for the text in file <source file>,\n",
+        "storing it in <index file>. Any additional <parameters> \n",
+        "will be passed to the construction function.\n",
+        "At the end, the program sends to the standard error \n",
+        "performance measures on time to build the index.\n\n"
+    };
+    size_t k;
+
+    fprintf(stderr, "Usage: %s <source file> <index file> [<parameters>]\n", progname);
+    for (k = 0; k < sizeof(details)/sizeof(details[0]); k++)
+        fputs(details[k], stderr);
+    exit(1);
+}
--- /dev/null
+/*
+ * Run Queries
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include "buildFacade.h"
+
+/* only for getTime() */
+#include <sys/time.h>
+#include <sys/resource.h>
+
+/* IFERROR(error): when error is non-zero, prints the message returned by
+   error_index(error) to stderr and terminates the process with status 1. */
+#define IFERROR(error) {{if (error) { fprintf(stderr, "%s\n", error_index(error)); exit(1); }}}
+
+
+/* local headers */
+
+double getTime (void);
+void usage(char * progname);
+
+static void *Index; /* opauque data type */
+static ulong Index_size, Text_length;
+static double Load_time;
+
+
+
+/*
+ * Usage: <program> <index file>
+ *
+ * Loads the index, prints the load time, the summary produced by
+ * printInfo() and the size / compression figures, then frees the index.
+ * Returns 0 on success; exits with status 1 on a usage or index error.
+ */
+int main (int argc, char *argv[])
+{
+    int error = 0;
+    char *filename;
+
+    if (argc != 2) {
+        usage(argv[0]);
+        exit (1);
+    }
+
+    filename = argv[1];
+
+    printf("\n Stats of index: %s\n",argv[1]);
+
+    Load_time = getTime ();
+    error = load_index (filename, &Index);
+    IFERROR (error);
+    Load_time = getTime () - Load_time;
+    fprintf (stderr, "\tLoad index time = %.2f secs\n", Load_time);
+
+    error = index_size(Index, &Index_size);
+    IFERROR (error);
+
+    /* The text length is queried once and reused for every figure below. */
+    error = get_length(Index, &Text_length);
+    IFERROR (error);
+
+    error = printInfo(Index);
+    IFERROR(error);
+
+    error = free_index(Index);
+    IFERROR(error);
+
+    fprintf(stdout,"\t===============================================\n");
+    fprintf(stdout,"\tInput: %lu bytes (text) --> Output %lu bytes (wcsa).\n", Text_length, Index_size);
+    fprintf(stderr,"\tIndex size = %lu Kb\n", Index_size/1024);
+    fprintf(stdout,"\tOverall compression --> %.2f%% (%.2f bits per char).\n",
+            (100.0*Index_size)/Text_length, (Index_size*8.0)/Text_length);
+    fprintf(stdout,"\t===============================================\n");
+
+    return 0;
+}
+
+/* Returns the user CPU time, in seconds, consumed so far by the calling
+   process, as reported by getrusage(RUSAGE_SELF). System time is not
+   included in the returned value. */
+double
+getTime (void)
+{
+    struct rusage ru;
+
+    getrusage (RUSAGE_SELF, &ru);
+
+    return ((double) ru.ru_utime.tv_sec +
+            (double) ru.ru_utime.tv_usec / 1000000.0);
+}
+
+
+/* Prints a short description and the command-line syntax to stderr. */
+void usage(char * progname) {
+    fputs("\nThe program loads <index> and shows some stats on it\n", stderr);
+    fprintf(stderr, "Usage: %s <index> \n", progname);
+}
--- /dev/null
+SRCDIR = .
+SRCDIRCSA = .
+SRCDIRUTILS = ../utils
+CC = g++
+
+# CFLAGS is possibly already defined (exported) by the "main Makefile";
+# this default only applies when it is not.
+ifndef CFLAGS
+  CFLAGS = -O9 -m32
+  ##CFLAGS = -g -m32 -O0
+endif
+
+LIBINTINDEX = icsa.a
+
+# None of these targets names a file produced by its recipe.
+.PHONY: all intIndex cleanO clean tar
+
+# Build the library and then drop the objects. The removal is part of this
+# recipe (instead of a cleanO prerequisite) so that it can never run before
+# or while the archive is being created, e.g. under "make -j".
+all: intIndex
+	rm -f *.o
+
+# Only the CSA-specific objects are archived here. parameters.o, basics.o,
+# bitmap.o and huff.o are not included because they are already added to the
+# library by the presentation layer (wcsa).
+intIndex: icsa.o parameters.o basics.o huff.o bitmap.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o
+	$(AR) rc $(LIBINTINDEX) icsa.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o
+
+# Every object lists its source as first prerequisite, so that editing the
+# source triggers a rebuild; objects are written to the current directory.
+icsa.o: $(SRCDIR)/$(SRCDIRCSA)/icsa.c parameters.o basics.o bitmap.o huff.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o
+	$(CC) $(CFLAGS) -c $<
+
+psiHuffmanRLE.o: $(SRCDIR)/$(SRCDIRCSA)/psiHuffmanRLE.c huff.o
+	$(CC) $(CFLAGS) -c $<
+
+psiDeltaCode.o: $(SRCDIR)/$(SRCDIRCSA)/psiDeltaCode.c
+	$(CC) $(CFLAGS) -c $<
+
+psiGonzalo.o: $(SRCDIR)/$(SRCDIRCSA)/psiGonzalo.c huff.o
+	$(CC) $(CFLAGS) -c $<
+
+parameters.o: $(SRCDIR)/$(SRCDIRUTILS)/parameters.c
+	$(CC) $(CFLAGS) -c $<
+
+
+huff.o: $(SRCDIR)/$(SRCDIRUTILS)/huff.c
+	$(CC) $(CFLAGS) -c $<
+
+basics.o: $(SRCDIR)/$(SRCDIRUTILS)/basics.c
+	$(CC) $(CFLAGS) -c $<
+
+bitmap.o: $(SRCDIR)/$(SRCDIRUTILS)/bitmap.c
+	$(CC) $(CFLAGS) -c $<
+
+
+cleanO:
+	rm -f *.o
+
+clean:
+	rm -rf *~ *% *.o core *.bak $(LIBINTINDEX)
+
+tar:
+	tar czvf icsa.tar.gz Makefile
--- /dev/null
+#ifndef DEFVALUESSAD
+#define DEFVALUESSAD
+
+// CONFIGURATION OF SADAKANE'S CSA FOR INTEGER TEXTS
+
+// Tunable parameters
+#define DEFAULT_A_SAMPLING_PERIOD 256 // Sampling period of A
+#define DEFAULT_A_INV_SAMPLING_PERIOD 256 // Sampling period of the inverse of A
+#define DEFAULT_PSI_SAMPLING_PERIOD 256 // Sampling period of the Psi function
+
+/*********************************/
+//#define PSI_HUFFMANRLE // Uses Lbd's implementation of Psi. Improvement over Gonzalo's (in compression and speed)
+// Parenthesized so that the value stays 16384 inside any larger expression.
+#define DEFAULT_nsHUFF (16*1024) // huffman limit on Psi. it can be optimized for space
+
+#define PSI_DELTACODES // Uses Delta codes for increments of Psi. Faster but compresses less than the others
+/*********************************/
+
+
+#define MAX_FILENAME_LENGTH 256 // Maximum length of a file name
+
+// Extensions of the files that are created
+#define NUMBER_OF_ELEMENTS_FILE_EXT "CSA.noe" // Number of elements (size of A, Psi and D)
+
+#define D_FILE_EXT "CSA.Dbv" // Sadakane's bit vector D
+#define D_RANK_DIRECTORY_FILE_EXT "CSA.Drd" // The rank directory over D
+
+#define SAMPLES_A_FILE_EXT "CSA.sA" // Samples of the suffix array
+#define BA_FILE_EXT "CSA.BAbv" // The bit vector BA (marks the sampled positions of A)
+#define BA_RANK_DIRECTORY_FILE_EXT "CSA.BArd" // The rank directory over BA
+
+#define SAMPLES_A_INV_FILE_EXT "CSA.sAI" // Samples of the inverse of the suffix array
+#define SAMPLING_PERIOD_A_FILE_EXT "CSA.sTA" // Sampling periods of the suffix array and of its inverse
+
+#define PSI_COMPRESSED_FILE_EXT "CSA.psi" // Delta codes of Psi
+
+#define DEFAULT_PSI_BINARY_SEARCH_FACTOR 2 // Sampling period used in the binary search
+//#define BINARY_SEARCH_INTERVAL 128 // Sampling period used in the binary search
+
+#endif
--- /dev/null
+#include "icsa.h"
+
+// Global so that the qsort comparison function can reach the sequence being sorted
+uint *intVector;
+
+/*
+ * qsort() comparison function: orders two suffixes of intVector.
+ *
+ * arg1 and arg2 point to uint start positions inside intVector. Returns a
+ * negative value, zero or a positive value when the suffix starting at
+ * *arg1 is smaller than, the same as or greater than the one at *arg2.
+ *
+ * The scan has no bound check: two different positions are compared until
+ * their symbols differ. NOTE(review): this relies on the sequence being
+ * built so that distinct suffixes always differ inside the array (e.g. a
+ * unique terminator) -- confirm against the callers.
+ */
+int suffixCmp(const void *arg1, const void *arg2) {
+
+    uint a = *((const uint *) arg1);
+    uint b = *((const uint *) arg2);
+
+    /* qsort() is allowed to compare an element with itself; without this
+       test the loop below would never find a difference. */
+    if (a == b) return 0;
+
+    while(intVector[a] == intVector[b]) { a++; b++; }
+
+    /* Explicit comparison: the difference of two unsigned values does not
+       fit in an int and would yield the wrong sign for distant symbols. */
+    if (intVector[a] < intVector[b]) return -1;
+    return 1;
+
+}
+
+
+
+/* **NO REVISADO TAMAÑO FILE.
+int buildIntIndexFromFile (char *filename, char *build_options,void **index) {
+ //char filename[255];
+ int file;
+ struct stat f_stat;
+ //sprintf(filename, "%s.%s", basename,SE_FILE_EXT);
+
+ if( (file = open(filename, O_RDONLY)) < 0) {
+ printf("Cannot open file %s\n", filename);
+ exit(0);
+ }
+ if( fstat(file, &f_stat) < 0) {
+ printf("Cannot read information from file %s\n", filename); exit(0);
+ }
+ uint sizeFile = (f_stat.st_size)/sizeof(uint);
+
+ uint *se = (uint *) malloc (sizeFile);
+ uint seSize = sizeFile / sizeof(uint);
+ read(file, se, sizeFile); //the samples
+ close(file);
+ return (buildIntIndex(se,seSize,build_options,index));
+}
+*/
+
+//ticsa *createIntegerCSA(uint **aintVector, uint textSize, char *build_options) {
+/*
+ * Builds the compressed suffix array (ticsa) over the n integers stored in
+ * aintVector and returns it through *index. build_options is handed to
+ * parametersCSA(), which sets the sampling parameters read below.
+ *
+ * The suffix array is obtained with qsort() and suffixCmp(), which reads
+ * the sequence through the global intVector. The input array is neither
+ * modified nor freed. Always returns 0 (no error).
+ */
+int buildIntIndex (uint *aintVector, uint n, char *build_options, void **index ){
+    uint textSize=n;
+    intVector = aintVector; //global variable
+    ticsa *myicsa;
+    myicsa = (ticsa *) malloc (sizeof (ticsa));
+    uint *Psi, *SAI, *C, vocSize;
+    uint i, j;
+    uint nsHUFF;
+
+    parametersCSA(myicsa, build_options);
+
+    nsHUFF=myicsa->tempNSHUFF;
+
+    // Store the value of some parameters and allocate the index arrays
+    myicsa->suffixArraySize = textSize;
+    myicsa->D = (uint*) malloc (sizeof(uint) * ((textSize+31)/32));
+
+    myicsa->samplesASize = (textSize + myicsa->T_A - 1) / myicsa->T_A + 1;
+    myicsa->samplesA = (uint *)malloc(sizeof(int) * myicsa->samplesASize);
+    myicsa->BA = (uint*) malloc (sizeof(uint) * ((textSize+31)/32));
+    myicsa->samplesAInvSize = (textSize + myicsa->T_AInv - 1) / myicsa->T_AInv;
+    myicsa->samplesAInv = (uint *)malloc(sizeof(int) * myicsa->samplesAInvSize);
+
+    // Working space: holds the suffix array first and Psi afterwards
+    Psi = (uint *) malloc (sizeof(uint) * textSize);
+
+    // BUILD THE C FUNCTION: C[s] = number of symbols smaller than s
+    vocSize = 0;
+    for(i=0;i<textSize;i++) if(intVector[i]>vocSize) vocSize = intVector[i];
+    C = (uint *) malloc(sizeof(uint) * (vocSize + 1)); // +1 to count the 0 terminator
+    // Clear all vocSize+1 counters: the largest symbol is counted in
+    // C[vocSize], which must not be read uninitialized.
+    for(i=0;i<=vocSize;i++) C[i] = 0;
+    for(i=0;i<textSize;i++) C[intVector[i]]++;
+    for (i=0,j=0;i<=vocSize;i++) {
+        j = j + C[i];
+        C[i] = j;
+    }
+    for(i=vocSize;i>0;i--) C[i] = C[i-1];
+    C[0] = 0;
+
+    // Build the suffix array (in Psi) - with quicksort
+    printf("\n\t *BUILDING THE SUFFIX ARRAY over %u integers... (with qsort)", textSize);fflush(stdout);
+    for(i=0; i<textSize; i++) Psi[i]=i;
+
+    qsort(Psi, textSize, sizeof(uint), suffixCmp);
+
+
+    printf("\n\t ...... ended.");
+
+    // BUILD THE INVERSE OF THE SUFFIX ARRAY
+    SAI = (uint *) malloc (sizeof(uint) * (textSize + 1)); // +1 so that the entry after the last one repeats SAI[0]; avoids an if
+    for(i=0;i<textSize;i++) SAI[Psi[i]] = i;
+    SAI[textSize] = SAI[0];
+
+    // STORE THE SAMPLES OF THE SUFFIX ARRAY
+    for(i=0;i<((textSize+31)/32);i++) myicsa->BA[i] = 0;
+    for(i=0; i<textSize; i+=myicsa->T_A) bitset(myicsa->BA, SAI[i]);
+    bitset(myicsa->BA, SAI[textSize-1]); // The last position is always sampled
+    myicsa->bBA = createBitmap(myicsa->BA, textSize);
+    for(i=0,j=0; i<textSize; i++) if(bitget(myicsa->BA, i)) myicsa->samplesA[j++] = Psi[i];
+
+    // STORE THE SAMPLES OF THE INVERSE OF THE SUFFIX ARRAY
+    for(i=0,j=0;i<textSize;i+=myicsa->T_AInv) myicsa->samplesAInv[j++] = SAI[i];
+
+    // BUILD AND COMPRESS PSI: Psi[i] = SAI[SA[i]+1]
+    printf("\n\t Creating compressed Psi...");
+    for(i=0;i<textSize;i++) Psi[i] = SAI[Psi[i]+1];
+    free(SAI);
+    #ifdef PSI_HUFFMANRLE
+    myicsa->hcPsi = huffmanCompressPsi(Psi,textSize,myicsa->T_Psi,nsHUFF);
+    #endif
+    #ifdef PSI_GONZALO
+    myicsa->gcPsi = gonzaloCompressPsi(Psi,textSize,myicsa->T_Psi,nsHUFF);
+    #endif
+    #ifdef PSI_DELTACODES
+    myicsa->dcPsi = deltaCompressPsi(Psi,textSize,myicsa->T_Psi);
+    #endif
+    free(Psi);
+
+    // Build D: one bit set at every C[s]
+    for(i=0;i<((textSize+31)/32);i++) myicsa->D[i] = 0;
+    for(i=0;i<=vocSize;i++) bitset(myicsa->D, C[i]);
+    myicsa->bD = createBitmap(myicsa->D,textSize);
+    free(C);
+
+    // STATE OF THE DISPLAYS (IMPORTANT FOR SEQUENTIAL DISPLAY)
+    // Stores the last suffix array position shown with display or displayNext.
+    // On a displayNext request, PSI is applied to this position to obtain the
+    // next one, from which the requested symbol is obtained, and the state
+    // is updated.
+    myicsa->displayCSAState = 0;
+    myicsa->displayCSAPrevPosition = -2; //needed by DisplayCSA(position)
+
+    *index = myicsa;
+    return 0;
+}
+
+
+//Stores in *numInts the number of elements of the indexed sequence of
+//integers (the suffix array size). Always returns 0 (no error).
+int sourceLenIntIndex(void *index, uint *numInts){
+    *numInts = ((ticsa *) index)->suffixArraySize;
+    return 0;
+}
+
+/* Returns "<base>.<ext>" in a freshly allocated buffer sized for that exact
+   name (to be freed by the caller), so long paths cannot overflow it. */
+static char *icsaFileName(const char *base, const char *ext) {
+    size_t len = strlen(base) + 1 + strlen(ext) + 1;
+    char *name = (char *) malloc(len);
+    if (name == NULL) {
+        printf("Cannot allocate memory for a file name\n");
+        exit(0);
+    }
+    snprintf(name, len, "%s.%s", base, ext);
+    return name;
+}
+
+/* Removes any previous "<base>.<ext>" and creates it again for writing
+   (rwx for user and group). Returns the open descriptor; exits on failure. */
+static int icsaCreateFile(const char *base, const char *ext) {
+    char *name = icsaFileName(base, ext);
+    int fd;
+    unlink(name);
+    if( (fd = open(name, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
+        printf("Cannot open file %s\n", name);
+        exit(0);
+    }
+    free(name);
+    return fd;
+}
+
+/*
+ * Saves the integer self-index to disk as a set of files named
+ * "<pathname>.<extension>", one per structure. Returns 0 (no error); the
+ * process exits when a file cannot be created.
+ *
+ * NOTE(review): as in the original code, the results of write() are not
+ * checked and failures exit with status 0 -- confirm whether callers
+ * depend on that before changing it.
+ */
+int saveIntIndex(void *index, char *pathname) {
+
+    ticsa *myicsa = (ticsa *) index;
+    char *basename=pathname;
+
+    char *filename;
+    int file;
+
+    // Number of indexed elements (integers of the original text)
+    file = icsaCreateFile(basename, NUMBER_OF_ELEMENTS_FILE_EXT);
+    write(file, &(myicsa->suffixArraySize), sizeof(int));
+    close(file);
+
+    // Compressed Psi
+    filename = icsaFileName(basename, PSI_COMPRESSED_FILE_EXT);
+    #ifdef PSI_HUFFMANRLE
+    storeHuffmanCompressedPsi(&(myicsa->hcPsi), filename);
+    #endif
+    #ifdef PSI_GONZALO
+    storeGonzaloCompressedPsi(&(myicsa->gcPsi), filename);
+    #endif
+    #ifdef PSI_DELTACODES
+    storeDeltaCompressedPsi(&(myicsa->dcPsi), filename);
+    #endif
+    free(filename);
+
+    // Bit vector D
+    file = icsaCreateFile(basename, D_FILE_EXT);
+    write(file, myicsa->D, sizeof(int)*((myicsa->suffixArraySize+31)/32));
+    close(file);
+
+    // Rank directory for D. Per the original note: the number of
+    // superblocks followed by the superblocks, then the number of blocks
+    // followed by the blocks.
+    filename = icsaFileName(basename, D_RANK_DIRECTORY_FILE_EXT);
+    saveBitmap(filename,myicsa->bD);
+    free(filename);
+
+    // Samples of A
+    file = icsaCreateFile(basename, SAMPLES_A_FILE_EXT);
+    write(file, myicsa->samplesA, sizeof(int) * (myicsa->samplesASize));
+    close(file);
+
+    // Bit vector BA (marks the sampled positions of A)
+    file = icsaCreateFile(basename, BA_FILE_EXT);
+    write(file, myicsa->BA, sizeof(int)*((myicsa->suffixArraySize+31)/32));
+    close(file);
+
+    // Rank directory for BA
+    filename = icsaFileName(basename, BA_RANK_DIRECTORY_FILE_EXT);
+    saveBitmap(filename, myicsa->bBA);
+    free(filename);
+
+    // Samples of the inverse of A
+    file = icsaCreateFile(basename, SAMPLES_A_INV_FILE_EXT);
+    write(file, myicsa->samplesAInv, sizeof(int) * (myicsa->samplesAInvSize));
+    close(file);
+
+    // Sampling periods of A and of its inverse, then the Psi search factor
+    file = icsaCreateFile(basename, SAMPLING_PERIOD_A_FILE_EXT);
+    write(file, &(myicsa->T_A), sizeof(int));
+    write(file, &(myicsa->T_AInv), sizeof(int));
+
+    write(file, &(myicsa->psiSearchFactorJump), sizeof(uint));
+
+    close(file);
+    return 0; //no error.
+}
+
+//Stores in *numBytes the size (in bytes) of the index over the sequence of
+//integers: the ticsa record, the D and BA bit vectors with their rank
+//structures, both sample arrays and the compressed Psi.
+//Always returns 0 (no error).
+int sizeIntIndex(void *index, uint *numBytes) {
+    ticsa *csa = (ticsa *) index;
+    uint total = 0;
+
+    total = total + (sizeof (ticsa) * 1);
+    total = total + sizeof(uint)*((csa->suffixArraySize+31)/32); //D vector
+    total = total + csa->bD->mem_usage;
+    total = total + sizeof(uint) * csa->samplesASize; // samples A
+    total = total + sizeof(uint) * csa->samplesAInvSize; // samples A^{-1}
+    total = total + sizeof(uint)*((csa->suffixArraySize+31)/32); //BA vector
+    total = total + csa->bBA->mem_usage;
+    #ifdef PSI_HUFFMANRLE
+    total = total + csa->hcPsi.totalMem;
+    #endif
+    #ifdef PSI_GONZALO
+    total = total + csa->gcPsi.totalMem;
+    #endif
+    #ifdef PSI_DELTACODES
+    total = total + csa->dcPsi.totalMem;
+    #endif
+
+    *numBytes = total;
+    return 0;
+}
+
+
+// Writes "<basename>.<ext>" into filename, a buffer of MAX_FILENAME_LENGTH
+// bytes. The name is truncated instead of overflowing the buffer.
+static void icsaIndexFilename(char *filename, const char *basename, const char *ext) {
+    snprintf(filename, MAX_FILENAME_LENGTH, "%s.%s", basename, ext);
+}
+
+// Builds the name "<basename>.<ext>" and opens that file read-only.
+// Terminates the process (as the rest of this file does) if it cannot be opened.
+static int icsaOpenForReading(char *filename, const char *basename, const char *ext) {
+    int fd;
+    icsaIndexFilename(filename, basename, ext);
+    fd = open(filename, O_RDONLY);
+    if (fd < 0) {
+        printf("Cannot read file %s\n", filename); exit(0);
+    }
+    return fd;
+}
+
+// Reads exactly nbytes from fd into buffer, looping over short reads.
+// Terminates the process if the file ends early or read() fails.
+static void icsaReadFully(int fd, void *buffer, size_t nbytes, const char *filename) {
+    char *dst = (char *) buffer;
+    size_t done = 0;
+    while (done < nbytes) {
+        long got = read(fd, dst + done, nbytes - done);
+        if (got <= 0) {
+            printf("Error reading file %s\n", filename); exit(0);
+        }
+        done += (size_t) got;
+    }
+}
+
+// malloc wrapper that terminates the process when the allocation fails.
+// A NULL result for a zero-byte request is not treated as a failure.
+static void *icsaAllocOrDie(size_t nbytes) {
+    void *mem = malloc(nbytes);
+    if (mem == NULL && nbytes != 0) {
+        printf("Cannot allocate %lu bytes\n", (unsigned long) nbytes); exit(0);
+    }
+    return mem;
+}
+
+// Loads the self-index on integers from the set of files "<pathname>.<ext>".
+//   pathname : common prefix of the index files.
+//   index    : output, receives the newly allocated ticsa.
+// Returns 0 on success. Any I/O or allocation failure prints a message and
+// terminates the process with exit(0), matching the existing error handling
+// in this file.
+//
+// Every descriptor is closed as soon as its data has been read (the previous
+// version only closed the last one), and every read is checked for length.
+int loadIntIndex(char *pathname, void **index){
+
+    char *basename = pathname;
+    char *filename;
+    int file;
+    struct stat f_stat;
+    uint suffixArraySize;
+    size_t bitVectorBytes;
+    ticsa *myicsa;
+
+    // calloc so that fields not stored on disk (e.g. T_Psi, tempNSHUFF)
+    // start at zero instead of holding indeterminate values.
+    myicsa = (ticsa *) calloc(1, sizeof(ticsa));
+    if (myicsa == NULL) {
+        printf("Cannot allocate %lu bytes\n", (unsigned long) sizeof(ticsa)); exit(0);
+    }
+
+    // State used by sequential display: last suffix-array position shown by
+    // displayCSAFirst/displayCSANext, and the last text position requested.
+    myicsa->displayCSAState = 0;
+    myicsa->displayCSAPrevPosition = -2;  //needed by DisplayCSA(position)
+
+    // Buffer for the file names
+    filename = (char *) icsaAllocOrDie(sizeof(char) * MAX_FILENAME_LENGTH);
+
+    // Number of indexed elements
+    file = icsaOpenForReading(filename, basename, NUMBER_OF_ELEMENTS_FILE_EXT);
+    icsaReadFully(file, &suffixArraySize, sizeof(uint), filename);
+    close(file);
+    myicsa->suffixArraySize = suffixArraySize;
+    printf("Number of indexed elements (suffix array size) = %u\n", suffixArraySize);
+
+    // D and BA both hold one bit per suffix-array entry.
+    bitVectorBytes = sizeof(uint) * ((suffixArraySize + 31) / 32);
+
+    // Compressed Psi (the loader opens the file itself)
+    icsaIndexFilename(filename, basename, PSI_COMPRESSED_FILE_EXT);
+    #ifdef PSI_HUFFMANRLE
+    myicsa->hcPsi = loadHuffmanCompressedPsi(filename);
+    #endif
+    #ifdef PSI_GONZALO
+    myicsa->gcPsi = loadGonzaloCompressedPsi(filename);
+    #endif
+    #ifdef PSI_DELTACODES
+    myicsa->dcPsi = loadDeltaCompressedPsi(filename);
+    #endif
+
+    // Bit vector D
+    file = icsaOpenForReading(filename, basename, D_FILE_EXT);
+    myicsa->D = (uint *) icsaAllocOrDie(bitVectorBytes);
+    icsaReadFully(file, myicsa->D, bitVectorBytes, filename);
+    close(file);
+    printf("Bit vector D loaded (%u bits)\n", suffixArraySize);
+
+    // Rank directory for D
+    icsaIndexFilename(filename, basename, D_RANK_DIRECTORY_FILE_EXT);
+    myicsa->bD = loadBitmap(filename, myicsa->D, suffixArraySize);
+    {   uint ns, nb;
+        ns = myicsa->bD->sSize;
+        nb = myicsa->bD->bSize;
+        myicsa->bD->data = myicsa->D;
+        printf("Rank1 Directory for D loaded (%u superblocks, %u blocks)\n", ns, nb);
+    }
+
+    // Samples of A; the number of samples is derived from the file size
+    file = icsaOpenForReading(filename, basename, SAMPLES_A_FILE_EXT);
+    if( fstat(file, &f_stat) < 0) {
+        printf("Cannot read information from file %s\n", filename); exit(0);
+    }
+    myicsa->samplesASize = (f_stat.st_size)/sizeof(uint);
+    myicsa->samplesA = (uint *) icsaAllocOrDie(sizeof(uint) * myicsa->samplesASize);
+    icsaReadFully(file, myicsa->samplesA, sizeof(uint) * myicsa->samplesASize, filename);
+    close(file);
+    printf("Suffix array samples loaded (%u samples)\n", myicsa->samplesASize);
+
+    // Bit vector BA
+    file = icsaOpenForReading(filename, basename, BA_FILE_EXT);
+    myicsa->BA = (uint *) icsaAllocOrDie(bitVectorBytes);
+    icsaReadFully(file, myicsa->BA, bitVectorBytes, filename);
+    close(file);
+    printf("Bit vector BA loaded (%u bits)\n", suffixArraySize);
+
+    // Rank directory for BA
+    icsaIndexFilename(filename, basename, BA_RANK_DIRECTORY_FILE_EXT);
+    myicsa->bBA = loadBitmap(filename, myicsa->BA, suffixArraySize);
+    {   uint ns, nb;
+        ns = myicsa->bBA->sSize;
+        nb = myicsa->bBA->bSize;
+        myicsa->bBA->data = myicsa->BA;
+        printf("Rank1 Directory for BA loaded (%u superblocks, %u blocks)\n", ns, nb);
+    }
+
+    // Samples of the inverse of A; count derived from the file size
+    file = icsaOpenForReading(filename, basename, SAMPLES_A_INV_FILE_EXT);
+    if( fstat(file, &f_stat) < 0) {
+        printf("Cannot read information from file %s\n", filename); exit(0);
+    }
+    myicsa->samplesAInvSize = (f_stat.st_size)/(sizeof(uint));
+    myicsa->samplesAInv = (uint *) icsaAllocOrDie(sizeof(uint) * myicsa->samplesAInvSize);
+    icsaReadFully(file, myicsa->samplesAInv, sizeof(uint) * myicsa->samplesAInvSize, filename);
+    close(file);
+    printf("Suffix array inverse samples loaded (%u samples)\n", myicsa->samplesAInvSize);
+
+    // Sampling periods of A and A inverse, followed by the Psi search jump
+    file = icsaOpenForReading(filename, basename, SAMPLING_PERIOD_A_FILE_EXT);
+    icsaReadFully(file, &(myicsa->T_A), sizeof(uint), filename);
+    icsaReadFully(file, &(myicsa->T_AInv), sizeof(uint), filename);
+    printf("Sampling A Period T = %u, Sampling A inv Period TInv = %u\n", myicsa->T_A, myicsa->T_AInv);
+
+    icsaReadFully(file, &(myicsa->psiSearchFactorJump), sizeof(uint), filename);
+    printf("Psi Bin Search Factor-Jump is = %u\n", myicsa->psiSearchFactorJump);
+
+    close(file);
+    free(filename);
+
+    *index = myicsa;
+    return 0;
+}
+
+
+// Releases every structure owned by the self-index and the ticsa itself,
+// printing how many bytes each component occupied.
+//   index : opaque handle (cast to ticsa); a NULL handle is a no-op.
+// Always returns 0 (no error).
+int freeIntIndex(void *index) {
+    ticsa *icsa = (ticsa *) index;
+    uint reportedSize;       // size according to sizeIntIndex()
+    uint accumulated;        // running sum of the components released here
+    uint chunk = 0;          // size of the component just released
+    uint bitVectorBytes;     // size shared by the D and BA bit vectors
+
+    if (icsa == NULL) return 0;
+
+    // Must be queried while the structures are still alive.
+    sizeIntIndex(index, &reportedSize);
+
+    bitVectorBytes = sizeof(uint) * ((icsa->suffixArraySize + 31) / 32);
+    accumulated = sizeof(ticsa);
+
+    #ifdef PSI_HUFFMANRLE
+    chunk = icsa->hcPsi.totalMem;
+    accumulated += chunk;
+    destroyHuffmanCompressedPsi(&(icsa->hcPsi));
+    #endif
+    #ifdef PSI_GONZALO
+    chunk = icsa->gcPsi.totalMem;
+    accumulated += chunk;
+    destroyGonzaloCompressedPsi(&(icsa->gcPsi));
+    #endif
+    #ifdef PSI_DELTACODES
+    chunk = icsa->dcPsi.totalMem;
+    accumulated += chunk;
+    destroyDeltaCodesCompressedPsi(&(icsa->dcPsi));
+    #endif
+    printf("\n\t[destroying SA: compressed PSI structure] ...Freed %u bytes... RAM",chunk);
+
+    free(icsa->D);
+    chunk = bitVectorBytes;
+    accumulated += chunk;
+    printf("\n\t[destroying SA: D vector] ...Freed %u bytes... RAM",chunk);
+
+    free(icsa->samplesA);
+    chunk = sizeof(uint) * icsa->samplesASize;
+    accumulated += chunk;
+    printf("\n\t[destroying Samples A: A ] ...Freed %u bytes... RAM",chunk);
+
+    free(icsa->samplesAInv);
+    chunk = sizeof(uint) * icsa->samplesAInvSize;
+    accumulated += chunk;
+    printf("\n\t[destroying Samples AInv: A ] ...Freed %u bytes... RAM",chunk);
+
+    // The rank directory of D is reported here but destroyed further below.
+    printf("\n\t[destroying rank bit D ] ...Freed %u bytes... RAM",icsa->bD->mem_usage);
+
+    free(icsa->BA);
+    chunk = bitVectorBytes;
+    accumulated += chunk;
+    printf("\n\t[destroying SA: BA vector] ...Freed %u bytes... RAM",chunk);
+
+    // Read mem_usage before each directory is destroyed.
+    accumulated += icsa->bD->mem_usage;
+    destroyBitmap(icsa->bD);
+    accumulated += icsa->bBA->mem_usage;
+    destroyBitmap(icsa->bBA);
+
+    printf("\n\t**** [the whole iCSA ocuppied ... %u bytes... RAM",accumulated);
+    printf("\n\t**** iCSA size = %d bytes ",reportedSize);
+    printf("\n");
+
+    free(icsa);
+
+    return 0;
+}
+
+// Prints a per-structure memory summary of the self-index on integers.
+//   index : opaque handle (cast to ticsa); nothing is printed when it is NULL.
+//   tab   : prefix written at the start of every summary line.
+// Always returns 0 (no error).
+int printInfoIntIndex(void *index, const char tab[]) {
+    ticsa *icsa = (ticsa *) index;
+    uint structBytes, psiBytes, bitVectorBytes;
+    uint rankDBytes, rankBABytes, samplesBytes, samplesInvBytes, wholeIndexBytes;
+
+    if (icsa == NULL) return 0;
+
+    structBytes = sizeof(ticsa);
+
+    // Compressed Psi; the representation is selected at compile time.
+    #ifdef PSI_HUFFMANRLE
+    psiBytes = icsa->hcPsi.totalMem;
+    #endif
+    #ifdef PSI_GONZALO
+    psiBytes = icsa->gcPsi.totalMem;
+    #endif
+    #ifdef PSI_DELTACODES
+    psiBytes = icsa->dcPsi.totalMem;
+    #endif
+
+    // D and BA are bit vectors of identical length.
+    bitVectorBytes = sizeof(uint) * ((icsa->suffixArraySize + 31) / 32);
+    rankDBytes = icsa->bD->mem_usage;
+    rankBABytes = icsa->bBA->mem_usage;
+    samplesBytes = sizeof(uint) * icsa->samplesASize;
+    samplesInvBytes = sizeof(uint) * icsa->samplesAInvSize;
+
+    sizeIntIndex(index, &wholeIndexBytes);   //whole self-index
+
+    printf("\n ===================================================:");
+    printf("\n%sSummary Self-index on integers (icsa) layer:",tab);
+    printf("\n%s icsa structure = %d bytes",tab, structBytes);
+    printf("\n%s psi = %8d bytes",tab, psiBytes);
+    printf("\n%s D (bitmap) = %8d bytes",tab, bitVectorBytes);
+    printf("\n%s rank for D = %8d bytes",tab, rankDBytes);
+    printf("\n%s SA(sampled) = %8d bytes",tab, samplesBytes);
+    printf("\n%s SAinv(samp) = %8d bytes",tab, samplesInvBytes);
+    printf("\n%s BA (bitmap) = %8d bytes",tab, bitVectorBytes);
+    printf("\n%s rank for BA = %8d bytes",tab, rankBABytes);
+    printf("\n%sTotal = ** %9d bytes (in RAM) ** ",tab, wholeIndexBytes);
+    printf("\n");
+
+    return 0;
+}
+
+
+// CSA OPERATIONS
+
+// Counts the occurrences of a pattern and reports its suffix-array interval.
+//
+// Strategy: binary search over sampled positions, followed by either two
+// binary searches or two exponential searches plus two binary searches.
+//  1. Binary-search the suffix array, always choosing as pivot a multiple of
+//     bin_search_psi_skip_interval (taken from myicsa->psiSearchFactorJump).
+//  2. That search stops for one of two reasons:
+//     a) The pivot repeats between two iterations: any occurrence lies between
+//        the pivot and the next sampled position (pivot + interval), so two
+//        binary searches inside that window locate both ends.
+//     b) The pivot is itself an occurrence: an exponential search over sampled
+//        positions runs to the left and to the right until it leaves the run of
+//        occurrences; a binary search between the last two probes on each side
+//        then finds the exact end of the interval.
+//
+//   index          : opaque handle (cast to ticsa).
+//   pattern,length : sequence of integers to search for.
+//   numocc         : output, number of occurrences (0 when the pattern is absent).
+//   left,right     : output, interval [left,right] of the suffix array; when the
+//                    pattern is absent both receive the position where the
+//                    search ended.
+// Always returns 0 (no error).
+int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right){
+
+    uint patternSize = length;
+    ticsa *myicsa = (ticsa *) index;
+
+    register unsigned long l, r, i;
+    register long comp, p, previousP;
+    register uint bin_search_psi_skip_interval = myicsa->psiSearchFactorJump;
+
+    l = 0;
+    r = (myicsa->suffixArraySize+bin_search_psi_skip_interval-2)/bin_search_psi_skip_interval*bin_search_psi_skip_interval;
+    p = ((l+r)/2)/bin_search_psi_skip_interval * bin_search_psi_skip_interval;
+    previousP = 0;
+
+    // BINARY SEARCH OVER SAMPLED POSITIONS
+    while( (p != previousP) && (comp = SadCSACompare(myicsa, pattern, patternSize, p)) ) {
+        if(comp > 0) l = p;
+        else r = p;
+        previousP = p;
+        p = ((l+r)/2)/bin_search_psi_skip_interval*bin_search_psi_skip_interval;
+    }
+
+    if(p==previousP) {
+
+        // BINARY SEARCH BETWEEN THE PIVOT AND THE NEXT SAMPLED POSITION
+        l = previousP;
+        r = previousP+bin_search_psi_skip_interval;
+        // Clamp with >= : position suffixArraySize itself is already out of
+        // range, and the previous '>' test let it reach SadCSACompare.
+        if(r >= myicsa->suffixArraySize) r = myicsa->suffixArraySize - 1;
+        while(l < r) {
+            p = (l+r)/2;
+            if(SadCSACompare(myicsa, pattern, patternSize, p) <= 0) r = p;
+            else l = p+1;
+        }
+
+        // The leftmost candidate does not match: the pattern does not occur.
+        if(SadCSACompare(myicsa, pattern, patternSize, r)) {
+            *left = l;
+            *right = r;
+            *numocc = 0;
+            return 0; //no error.
+        }
+        *left = r;
+
+        // Same window again, now looking for the rightmost match.
+        l = previousP;
+        r = previousP+bin_search_psi_skip_interval;
+        if(r >= myicsa->suffixArraySize) r = myicsa->suffixArraySize - 1;
+        while(l < r) {
+            p = (l+r+1)/2;
+            if(SadCSACompare(myicsa, pattern, patternSize, p) >= 0) l = p;
+            else r = p-1;
+        }
+        *right = l;
+
+    } else {
+
+        previousP = p;  // previousP keeps the matching pivot found over the samples
+
+        // EXPONENTIAL SEARCH BACKWARDS
+        i = 1;
+        p -= bin_search_psi_skip_interval;
+        while(p>0 && !SadCSACompare(myicsa, pattern, patternSize, p)) {
+            i<<=1;
+            p = previousP-(i*bin_search_psi_skip_interval);
+        }
+        // Binary search between the last two probes of the exponential search
+        if(previousP > i*bin_search_psi_skip_interval) l = previousP-(i*bin_search_psi_skip_interval);
+        else l=0;
+        i>>=1;
+        r = previousP-(i*bin_search_psi_skip_interval);
+        while(l < r) {
+            p = (l+r)/2;
+            if(SadCSACompare(myicsa, pattern, patternSize, p) <= 0) r = p;
+            else l = p+1;
+        }
+        *left = r;
+
+        // EXPONENTIAL SEARCH FORWARDS
+        i = 1;
+        p = previousP+bin_search_psi_skip_interval;
+        while(p<myicsa->suffixArraySize && !SadCSACompare(myicsa, pattern, patternSize, p)) {
+            i<<=1;
+            p = previousP+(i*bin_search_psi_skip_interval);
+        }
+        // Binary search between the last two probes of the exponential search
+        if(p < myicsa->suffixArraySize) r = previousP+(i*bin_search_psi_skip_interval);
+        else r = myicsa->suffixArraySize-1;
+        i>>=1;
+        l = previousP+(i*bin_search_psi_skip_interval);
+        while(l < r) {
+            p = (l+r+1)/2;
+            if(SadCSACompare(myicsa, pattern, patternSize, p) >= 0) l = p;
+            else r = p-1;
+        }
+        *right = l;
+    }
+
+    *numocc = (uint) *right-*left+1;
+    return 0; //no error
+}
+
+// Original plain binary-search version of the count operation.
+// Finds the suffix-array interval [*left,*right] whose suffixes start with the
+// pattern and returns the number of occurrences. When the pattern is absent
+// it returns 0 and both *left and *right hold the position where the search
+// ended.
+unsigned int countCSABin(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right) {
+    register ulong lo, hi, mid;
+    ulong lastPosition = myicsa->suffixArraySize-1;
+
+    // Lower bound: first position whose suffix is not smaller than the pattern.
+    for(lo = 0, hi = lastPosition; lo < hi; ) {
+        mid = (lo+hi)/2;
+        if(SadCSACompare(myicsa, pattern, patternSize, mid) > 0) lo = mid+1;
+        else hi = mid;
+    }
+
+    // If that suffix differs from the pattern, the pattern is not in the text.
+    if(SadCSACompare(myicsa, pattern, patternSize, hi) != 0) {
+        *left = lo;
+        *right = hi;
+        return 0;
+    }
+
+    *left = hi;
+
+    // Upper bound: last matching position, searched from the left limit onwards.
+    for(lo = hi, hi = lastPosition; lo < hi; ) {
+        mid = (lo+hi+1)/2;
+        if(SadCSACompare(myicsa, pattern, patternSize, mid) < 0) hi = mid-1;
+        else lo = mid;
+    }
+
+    *right = lo;
+
+    return (uint) *right-*left+1;
+}
+
+// Locates every occurrence of a pattern in the indexed sequence.
+//   index          : opaque handle (cast to ticsa).
+//   pattern,length : sequence of integers to search for.
+//   occ            : output, malloc'ed array with one position per occurrence
+//                    (the caller frees it); NULL when there are none.
+//   numocc         : output, number of entries stored in *occ.
+// Returns 0 on success (including "no occurrences"), the error code of
+// countIntIndex if counting fails, or 1 if the result array cannot be
+// allocated. On any failure *occ is NULL and *numocc is 0.
+int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong *numocc) {
+
+    ticsa *myicsa = (ticsa *) index;
+    ulong *positions;
+    ulong occurrences = 0;
+    // Not 'register': their addresses are passed to countIntIndex.
+    ulong left = 0, right = 0;
+    ulong idx;
+    int err;
+
+    *occ = NULL;
+    *numocc = 0;
+
+    err = countIntIndex((void *) myicsa, pattern, length, &occurrences, &left, &right);
+    if (err) return err;
+
+    if (occurrences == 0) return 0;  //no error, but no occurrences.
+
+    positions = (ulong*) malloc(sizeof(ulong) * occurrences);
+    if (positions == NULL) return 1;
+
+    // Resolve each suffix-array position of the interval into a text position.
+    // The loop is bounded by the allocation size, not by the interval ends.
+    for (idx = 0; idx < occurrences; idx++) positions[idx] = A(myicsa, left + idx);
+
+    *occ = positions;
+    *numocc = occurrences;
+    return 0;
+}
+
+// Retrieves the integer of the source sequence stored at the given position
+// and leaves the display state ready so that the following position can be
+// obtained with a single displayCSANext() step.
+//   index    : opaque handle (cast to ticsa).
+//   position : offset inside the original array of integers.
+//   value    : output, the integer found at that offset.
+// Always returns 0 (no error).
+int displayIntIndex(void *index, ulong position, uint *value){
+    ticsa *icsa = (ticsa *) index;
+
+    // A request for the position right after the previous one continues from
+    // the saved state; anything else restarts from the inverse samples.
+    int continuesPrevious = (position == (icsa->displayCSAPrevPosition +1));
+
+    icsa->displayCSAPrevPosition = position;
+    if (continuesPrevious)
+        *value = displayCSANext(icsa);
+    else
+        *value = displayCSAFirst(icsa, position);
+
+    return 0;
+}
+
+
+/**********************************************************************/
+
+// Returns the integer of the text stored at the given position and records
+// the reached suffix-array position in displayCSAState, so that the following
+// integers can be obtained with displayCSANext().
+uint displayCSAFirst(ticsa *myicsa, uint position) {
+
+    register uint period = myicsa->T_AInv;
+    // Start at the closest inverse sample at or before the position ...
+    register uint saPosition = myicsa->samplesAInv[position / period];
+    // ... and apply Psi once per remaining step.
+    register uint stepsLeft = position % period;
+
+    while(stepsLeft--) {
+        #ifdef PSI_HUFFMANRLE
+        saPosition = getHuffmanPsiValue(&(myicsa->hcPsi),saPosition);
+        #endif
+        #ifdef PSI_GONZALO
+        saPosition = getGonzaloPsiValue(&(myicsa->gcPsi),saPosition);
+        #endif
+        #ifdef PSI_DELTACODES
+        saPosition = getDeltaPsiValue(&(myicsa->dcPsi),saPosition);
+        #endif
+    }
+
+    // Saved for the next call to displayCSANext
+    myicsa->displayCSAState = saPosition;
+
+    return rank(myicsa->bD, saPosition) - 1;
+}
+
+
+// Returns the next integer of the text, continuing from the state saved in
+// displayCSAState: applies Psi once to that state, stores the new state, and
+// maps it to a symbol through the rank directory of D.
+unsigned int displayCSANext(ticsa *myicsa) {
+    uint state = myicsa->displayCSAState;
+
+    #ifdef PSI_HUFFMANRLE
+    state = getHuffmanPsiValue(&(myicsa->hcPsi),state);
+    #endif
+    #ifdef PSI_GONZALO
+    state = getGonzaloPsiValue(&(myicsa->gcPsi),state);
+    #endif
+    #ifdef PSI_DELTACODES
+    state = getDeltaPsiValue(&(myicsa->dcPsi),state);
+    #endif
+
+    myicsa->displayCSAState = state;
+    return rank(myicsa->bD, state) - 1;
+}
+
+
+// Debugging aid: dumps the contents of the CSA structures to stdout
+// (decoded Psi values, bit vector D with its rank directory, and the samples
+// of the suffix array and of its inverse).
+void showStructsCSA(ticsa *myicsa) {
+
+    unsigned int index;
+
+    // BASIC CSA STRUCTURES
+    printf("Basic CSA structures:\n\n");
+
+    // Values of the Psi function (decoded one by one)
+    printf("\tPSI: (Sampling period = %d)\n", myicsa->T_Psi);
+    for(index=0; index < myicsa->suffixArraySize; index++) {
+        #ifdef PSI_HUFFMANRLE
+        printf("\tPsi[%d] = %d\n", index, getHuffmanPsiValue(&(myicsa->hcPsi),index));
+        #endif
+        #ifdef PSI_GONZALO
+        printf("\tPsi[%d] = %d\n", index, getGonzaloPsiValue(&(myicsa->gcPsi),index));
+        #endif
+        #ifdef PSI_DELTACODES
+        printf("\tPsi[%d] = %d\n", index, getDeltaPsiValue(&(myicsa->dcPsi),index));
+        #endif
+    }
+    printf("\n");
+
+    // Bit vector D and its rank directory
+    printf("\tD = ");
+    showBitVector(myicsa->D, myicsa->suffixArraySize);
+    printf("\n\nSuperbloques de D:\n");
+    {   uint ns;
+        uint nb;
+        ns = myicsa->bD->sSize;
+        nb = myicsa->bD->bSize;
+        for(index=0; index<ns; index++) {
+            printf("\tDs[%d] = %d\n", index, myicsa->bD->sdata[index]);
+        }
+        printf("\nBloques de D:\n");
+
+        for(index=0; index<nb; index++) {
+            printf("\tDb[%d] = %d\n", index, myicsa->bD->bdata[index]);
+        }
+        printf("\n\n");
+    }
+
+    // Structures used to access the suffix array and its inverse
+    printf("Suffix Array Sampling Structures: (Sampling period = %d)\n", myicsa->T_A);
+    printf("\tSuffix Array Samples:\n");
+    for(index=0; index < myicsa->samplesASize; index++)
+        printf("\tSamplesA[%d] = %d\n", index, myicsa->samplesA[index]);
+    printf("\n");
+    printf("\tInverse Suffix Array Samples:\n");
+    // samplesAInv has its own length; it was previously walked with
+    // samplesASize, which reads out of bounds or truncates the dump whenever
+    // the two sample counts differ.
+    for(index=0; index < myicsa->samplesAInvSize; index++)
+        printf("\tSamplesAInv[%d] = %d\n", index, myicsa->samplesAInv[index]);
+    printf("\n");
+
+}
+
+
+// Sadakane-style comparison between a pattern and the suffix found at
+// position p of the suffix array.
+//   pattern, patternSize : integers to compare; patternSize must be >= 1
+//                          because pattern[0] is always read.
+//   p                    : suffix-array position of the suffix.
+// Returns 0 if the suffix starts with the pattern, a positive value if the
+// pattern is greater than the suffix, and a negative value if it is smaller.
+//
+// The result is now exactly -1, 0 or 1. The previous version returned the
+// unsigned difference of the two integers converted to int, whose sign is
+// wrong whenever the values differ by 2^31 or more.
+// The last call to Psi is skipped: once the final pattern symbol has been
+// compared there is no need to advance the suffix any further.
+int SadCSACompare(ticsa *myicsa, uint *pattern, uint patternSize, uint p) {
+
+    register unsigned int j, i, currentInteger;
+
+    i = p;
+    j = 0;
+
+    while(1) {
+        // Symbol at the start of the suffix that begins at position i
+        currentInteger = rank(myicsa->bD, i) - 1;
+        if(pattern[j] != currentInteger)
+            return (pattern[j] > currentInteger) ? 1 : -1;
+        if(++j == patternSize) return 0;
+
+        // Move on to the suffix that starts one symbol later in the text
+        #ifdef PSI_HUFFMANRLE
+        i=getHuffmanPsiValue(&(myicsa->hcPsi),i);
+        #endif
+        #ifdef PSI_GONZALO
+        i=getGonzaloPsiValue(&(myicsa->gcPsi),i);
+        #endif
+        #ifdef PSI_DELTACODES
+        i=getDeltaPsiValue(&(myicsa->dcPsi),i);
+        #endif
+    }
+
+}
+
+
+// Access to the suffix array A: returns A[position].
+// Psi is applied until a position marked in BA (a sampled one) is reached;
+// the stored sample minus the number of Psi steps taken is the answer.
+inline uint A(ticsa *myicsa, uint position) {
+
+    register uint psiSteps;
+
+    for(psiSteps = 0; !bitget(myicsa->BA, position); psiSteps++) {
+        #ifdef PSI_HUFFMANRLE
+        position=getHuffmanPsiValue(&(myicsa->hcPsi),position);
+        #endif
+        #ifdef PSI_GONZALO
+        position=getGonzaloPsiValue(&(myicsa->gcPsi),position);
+        #endif
+        #ifdef PSI_DELTACODES
+        position=getDeltaPsiValue(&(myicsa->dcPsi),position);
+        #endif
+    }
+
+    // rank over BA gives the index of the sample stored for this position
+    return myicsa->samplesA[rank(myicsa->bBA, position)-1] - psiSteps;
+
+}
+
+
+// Access to the inverse of the suffix array: returns the suffix-array
+// position of the suffix that starts at the given text offset.
+// Starts from the closest inverse sample at or before the offset and applies
+// Psi once per remaining step.
+// NOTE(review): written for builds with exactly one PSI_* backend defined.
+inline uint inverseA(ticsa *myicsa, uint offset) {
+
+    register uint period = myicsa->T_AInv;
+    register uint saPosition = myicsa->samplesAInv[offset/period];
+    register uint stepsLeft = offset%period;
+
+    while(stepsLeft--) {
+        #ifdef PSI_HUFFMANRLE
+        saPosition = getHuffmanPsiValue(&(myicsa->hcPsi),saPosition);
+        #endif
+        #ifdef PSI_GONZALO
+        saPosition = getGonzaloPsiValue(&(myicsa->gcPsi),saPosition);
+        #endif
+        #ifdef PSI_DELTACODES
+        saPosition = getDeltaPsiValue(&(myicsa->dcPsi),saPosition);
+        #endif
+    }
+    return saPosition;
+
+}
+
+// Initializes the build parameters of the index.
+//   myicsa        : index whose T_A, T_AInv, T_Psi, tempNSHUFF and
+//                   psiSearchFactorJump fields are set.
+//   build_options : string of "key=value" pairs separated by ' ', '=' or ';'
+//                   (may be NULL, in which case the defaults are used).
+//                   Recognized keys: sA, sAinv, sPsi, nsHuff (multiplied by
+//                   1024), psiSF.
+// Returns 0. The function is declared uint but previously had no return
+// statement at all.
+uint parametersCSA(ticsa *myicsa, char *build_options){
+    char delimiters[] = " =;";
+    int j,num_parameters;
+    char ** parameters;
+    int ssA,ssAinv,ssPsi,nsHuff,psiSearchFactor;
+
+    ssA = DEFAULT_A_SAMPLING_PERIOD;
+    ssAinv = DEFAULT_A_INV_SAMPLING_PERIOD;
+    ssPsi = DEFAULT_PSI_SAMPLING_PERIOD;
+    nsHuff = DEFAULT_nsHUFF;
+    psiSearchFactor = DEFAULT_PSI_BINARY_SEARCH_FACTOR;
+
+    if (build_options != NULL) {
+        parse_parameters(build_options,&num_parameters, &parameters, delimiters);
+
+        // Tokens come in (key, value) pairs; a trailing key without a value
+        // is ignored.
+        for (j=0; j+1<num_parameters; j+=2) {
+            const char *key = parameters[j];
+            const char *val = parameters[j+1];
+
+            if (strcmp(key, "sA") == 0) {
+                ssA=atoi(val);
+            }
+            else if (strcmp(key, "sAinv") == 0) {
+                ssAinv=atoi(val);
+            }
+            else if (strcmp(key, "sPsi") == 0) {
+                ssPsi=atoi(val);
+            }
+            else if (strcmp(key, "nsHuff") == 0) {
+                nsHuff=atoi(val);
+                nsHuff *=1024;
+            }
+            else if (strcmp(key, "psiSF") == 0) {
+                psiSearchFactor=atoi(val);
+            }
+        }
+        free_parameters(num_parameters, &parameters);
+    }
+
+    myicsa->T_A = ssA;
+    myicsa->T_AInv = ssAinv;
+    myicsa->T_Psi = ssPsi;
+    myicsa->tempNSHUFF = nsHuff;
+    // The jump used by the sampled binary search is a multiple of the Psi
+    // sampling period.
+    myicsa->psiSearchFactorJump = psiSearchFactor * ssPsi;
+
+    printf("\n\t parameters for iCSA: sampleA=%d, sampleAinv=%d, samplePsi=%d",ssA,ssAinv,ssPsi);
+    printf("\n\t : nsHuff=%d, psiSearchFactor = %d --> jump = %d", nsHuff,psiSearchFactor, myicsa->psiSearchFactorJump);
+
+    return 0;
+}
--- /dev/null
+#include <stdio.h>\r#include <fcntl.h>\r#include <sys/stat.h>\r#include <time.h>\r#include <sys/time.h>\r\r#include "defValues.h"\r#include "../utils/bitmap.h"\r#include "../utils/huff.h"\r#include "../utils/parameters.h"\r\r#ifdef PSI_HUFFMANRLE\r #include "psiHuffmanRLE.h"\r#endif\r\r#ifdef PSI_GONZALO\r #include "psiGonzalo.h"\r#endif\r\r#ifdef PSI_DELTACODES\r #include "psiDeltaCode.h"\r#endif\r\r\rtypedef struct {\r uint suffixArraySize;\r uint T_Psi;\r uint *D;\r bitmap bD;\r uint T_A;\r uint T_AInv;\r uint *samplesA;\r uint samplesASize;\r uint *BA;\r bitmap bBA; \r uint *samplesAInv;\r uint samplesAInvSize;\r uint displayCSAState;\r int displayCSAPrevPosition;\r #ifdef PSI_HUFFMANRLE \r HuffmanCompressedPsi hcPsi;\r #endif \r #ifdef PSI_GONZALO\r GonzaloCompressedPsi gcPsi;\r #endif\r #ifdef PSI_DELTACODES\r DeltaCompressedPsi dcPsi;\r #endif\r \r //only needed during "parse_parameters".\r uint tempNSHUFF;\r uint psiSearchFactorJump; //factor of the T_Psi value.\r} ticsa; \r\r \r// FUNCTION PROTOTYPES: BUILDING THE INDEX\r\r//Creates the ICSA \r\r int buildIntIndex (uint *intVector, uint n, char *build_options, void **index ); //ticsa *createIntegerCSA (uint **aintVector, uint SAsize, char *build_options);\r\r//Returns number of elements in the indexed sequence of integers\r int sourceLenIntIndex(void *index, uint *numInts);\r\r//Save the index to disk\r int saveIntIndex(void *index, char *pathname); //void storeStructsCSA(ticsa *myicsa, char *basename);\r\r// Loads the index from disk.\r int loadIntIndex(char *pathname, void **index); //ticsa *loadCSA(char *basename);\r\r// Frees memory \r int freeIntIndex(void *index); //uint destroyStructsCSA(ticsa *myicsa);\r\r//Returns the size (in bytes) of the index over the sequence of integers.\r int sizeIntIndex(void *index, uint *numBytes); //uint CSA_size(ticsa *myicsa); \r\r // Shows detailed summary info of the self-index (memory usage of each structure)\rint printInfoIntIndex(void *index, const char 
tab[]);\r\r//Number of occurrences of the pattern, and the interval [left,right] in the suffix array.\r int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right);\r //uint countCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right); // Exponential search\r //uint countCSABin(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right); // Binary search\r\r// Returns an array with integers corresponding offsets to the occurrences of the pattern, \r// as well as the number of occurrences\r int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong *numocc);\r //uint *locateCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *occ);\r\r//Returns the value of the source (array of integers) at a given offset.\r// (that is, the element "position" from the original array of uints)\r int displayIntIndex(void *index, ulong position, uint *value);\r //uint displayCSA(ticsa *myicsa, uint position); \r\r\r/* Private function prototypes ********************************************/\ruint parametersCSA(ticsa *myicsa, char *build_options);\r\ruint displayCSAFirst(ticsa *myicsa, uint position);\ruint displayCSANext(ticsa *myicsa);\rint SadCSACompare(ticsa *myicsa, uint *pattern, uint patternSize, uint p);\ruint A(ticsa *myicsa, uint position);\ruint inverseA(ticsa *myicsa, uint offset);\r\rvoid showStructsCSA(ticsa *myicsa); // For Debugging\r\r
\ No newline at end of file
--- /dev/null
+
+// FUNCTION PROTOTYPES: SELF-INDEX ON INTEGERS.
+
+int buildIntIndex (uint *intVector, uint n, char *build_options, void **index );
+
+ //Saves the index to disk
+int saveIntIndex(void *index, char *pathname);
+
+ //Returns number of elements in the indexed sequence of integers
+int sourceLenIntIndex(void *index, uint *numInts);
+
+ //Loads the index from disk.
+int loadIntIndex(char *pathname, void **index);
+
+ //Frees the memory allocated to the int_index
+int freeIntIndex(void *index);
+
+ //Returns the size (in bytes) of the index over the sequence of integers.
+int sizeIntIndex(void *index, uint *numBytes);
+
+ // Shows detailed summary info of the self-index (memory usage of each structure)
+int printInfoIntIndex(void *index, const char tab[]);
+
+ //Number of occurrences of the pattern, and the interval [left,right] in the suffix array
+int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right);
+
+ //returns an array with integers corresponding offsets to the occurrences of the pattern,
+ //as well as the number of occurrences
+int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong *numocc);
+
+ //Returns the value of the source (array of integers) at a given offset.
+ // (that is, the element "position" from the original array of uints)
+int displayIntIndex(void *index, ulong position, uint *value);
+
+
--- /dev/null
+#include "psiDeltaCode.h"
+
+// Frees the three arrays owned by a delta-compressed Psi (sample pointers,
+// sample values and the packed codes). The structure itself is not freed.
+void destroyDeltaCodesCompressedPsi(DeltaCompressedPsi *compressedPsi) {
+    free(compressedPsi->pointers);
+    free(compressedPsi->samples);
+    free(compressedPsi->deltaCodes);
+}
+
+
+// Compresses the Psi array with sampling plus variable-length codes.
+//   Psi     : array of psiSize values to compress.
+//   psiSize : number of entries in Psi.
+//   T       : sampling period; every T-th entry (index % T == 0) is stored
+//             verbatim together with the bit offset where its codes start.
+// Entries between samples are stored as the difference to the previous entry,
+// mapped to a non-negative integer and written as: p-1 zero bits, then k in
+// p bits, then the k-1 low bits of the value (k = bit length of the value,
+// p = bit length of k). This layout matches Elias-delta coding.
+// Returns a DeltaCompressedPsi owning three malloc'ed arrays (deltaCodes,
+// samples, pointers).
+DeltaCompressedPsi deltaCompressPsi(unsigned int *Psi, unsigned int psiSize, unsigned int T) {
+
+ DeltaCompressedPsi cPsi;
+
+ int numberOfSamples;
+ register int diff, deltaCodesPos;
+ register unsigned int k, p, aux, diffpositive, code, index;
+ unsigned int samplesIndex, codeLenght, currentInput, wordsDeltaCodes, totalSize;
+ unsigned int *deltaCodes;
+ unsigned int *samples;
+ unsigned int *pointers;
+
+ // Scratch buffer for the codes (its size is estimated as one word per Psi entry)
+ // NOTE(review): nothing below checks that the emitted bits fit in it.
+ unsigned int *deltaCodesAux;
+
+ // Choose the value of negativeGap (at most 64)
+ unsigned int negativeGap;
+ register unsigned int maxNegativeBits = 0;
+ // maxNegativeBits = number of bits needed to represent psiSize
+ k = psiSize;
+ while(k) {
+ k >>= 1;
+ maxNegativeBits++;
+ }
+ if(maxNegativeBits<=26) negativeGap = 64;
+ else negativeGap = 1<<(32-maxNegativeBits);
+
+ // Allocate the structures
+ numberOfSamples = (psiSize + T - 1) / T;
+ samples = (unsigned int *)malloc(sizeof(int)*numberOfSamples);
+ pointers = (unsigned int *)malloc(sizeof(int)*numberOfSamples);
+
+ // Zero-initialised because the codes are OR-ed into it bit by bit
+ deltaCodesAux = (unsigned int *)malloc(sizeof(int)*psiSize);
+ for(index=0; index<psiSize; index++) deltaCodesAux[index] = 0;
+
+ samplesIndex = 0;
+ deltaCodesPos = 0;
+ for(index=0; index<psiSize; index++) {
+
+ if(index % T) {
+
+ // Non-sampled entry: encode the difference to the previous entry
+ diff = Psi[index] - currentInput;
+ currentInput = Psi[index];
+
+ // Map the signed difference to a non-negative integer: negative
+ // differences become multiples of negativeGap, positive ones use the
+ // remaining values (getDeltaPsiValue tests code % negativeGap to undo this)
+ if(diff>0) diffpositive = (negativeGap*diff-1)/(negativeGap-1);
+ else diffpositive = -negativeGap*diff;
+
+ // k = number of bits of diffpositive
+ k = 0;
+ aux = diffpositive;
+ while(aux) {
+ aux >>= 1;
+ k++;
+ }
+ // p = number of bits of k
+ aux = k;
+ p = 0;
+ while(aux) {
+ aux >>= 1;
+ p++;
+ }
+
+ // code = diffpositive without its leading 1 bit
+ code = diffpositive & ((1<<(k-1))-1);
+ codeLenght = 2*p+k-2;
+
+ // First the p-1 leading zeros (the buffer is already zeroed, so just skip them)
+ deltaCodesPos += p-1;
+
+ // Then the p bits of k; the first branch handles a field that straddles
+ // two words (high part OR-ed into this word, low part stored in the next)
+ if( ((deltaCodesPos%32) + p) > 32 ) {
+ deltaCodesAux[deltaCodesPos/32] |= (k>>((deltaCodesPos%32)+p-32));
+ deltaCodesAux[deltaCodesPos/32+1] = (k<<(64-(deltaCodesPos%32)-p));
+ } else {
+ deltaCodesAux[deltaCodesPos/32] |= (k<<(32-p-(deltaCodesPos%32)));
+ }
+ deltaCodesPos += p;
+
+ // Finally the k-1 bits of code (without the leading 1), with the same
+ // word-straddling handling
+ if( ((deltaCodesPos%32) + (k-1)) > 32 ) {
+ deltaCodesAux[deltaCodesPos/32] |= (code>>((deltaCodesPos%32)+(k-1)-32));
+ deltaCodesAux[deltaCodesPos/32+1] = (code<<(64-(deltaCodesPos%32)-(k-1)));
+ } else {
+ deltaCodesAux[deltaCodesPos/32] |= (code<<(32-(k-1)-(deltaCodesPos%32)));
+ }
+ deltaCodesPos += k-1;
+
+ } else {
+ // Sampled entry: keep the value itself and the bit offset of the codes that follow
+ samples[samplesIndex] = Psi[index];
+ pointers[samplesIndex++] = deltaCodesPos;
+ currentInput = Psi[index];
+ }
+
+ }
+
+ // The space needed for the codes is now known: allocate it, copy the codes
+ // and release the scratch buffer
+ wordsDeltaCodes = (deltaCodesPos+31)/32;
+ deltaCodes = (unsigned int *)malloc(sizeof(int)*wordsDeltaCodes);
+ for(index=0;index<wordsDeltaCodes;index++) deltaCodes[index] = deltaCodesAux[index];
+ free(deltaCodesAux);
+
+ // codes + samples + pointers + four int-sized fields
+ totalSize = sizeof(int)*wordsDeltaCodes + 2*sizeof(int)*numberOfSamples + 4*sizeof(int);
+ printf("\n\tCompressed Psi size = %d bytes\n", totalSize);
+
+ // Fill in cPsi and return it
+ cPsi.T = T;
+ cPsi.negativeGap = negativeGap;
+ cPsi.deltaCodesSize = wordsDeltaCodes;
+ cPsi.deltaCodes = deltaCodes;
+ cPsi.numberOfSamples = numberOfSamples;
+ cPsi.samples = samples;
+ cPsi.pointers = pointers;
+ cPsi.totalMem = totalSize;
+
+ return cPsi;
+
+}
+
+
+// Returns Psi[position] from the delta-compressed representation.
+// Starts from the sample at or before the position (position / T) and decodes
+// position % T codes, adding each decoded difference to the running value.
+int getDeltaPsiValue(DeltaCompressedPsi *cPsi, unsigned int position) {
+
+ int result;
+ register unsigned int code, aux, pointerAux, mask, pointer, toDecode, p, k;
+
+ // Take the sample immediately at or below the position, and its pointer into the array of codes
+ // pointer = absolute bit offset into deltaCodes
+ result = cPsi->samples[position/cPsi->T];
+ pointer = cPsi->pointers[position/cPsi->T];
+
+ // Number of codes to decode starting from the sample
+ toDecode = position % cPsi->T;
+
+ while(toDecode--) {
+
+ // Read the leading zeros:
+ // load the start of the code into an integer (code), left-aligned.
+ // It does not matter if the whole code does not fit, but p and k are
+ // guaranteed to fit (k<=32 (6 bits), p<=5 bits).
+ // The multiplication by (pointer%32 != 0) drops the second word when the
+ // window is word-aligned, where the shift count would be 32.
+ code = (cPsi->deltaCodes[pointer/32] << (pointer%32)) |
+ ((pointer%32 != 0) * (cPsi->deltaCodes[pointer/32+1] >> (32-(pointer%32))));
+
+ // Count the zeros at the left of code; p ends up as that count plus one
+ p = 1;
+ while(!(code & 0x80000000)) {
+ code <<= 1;
+ p++;
+ }
+
+ // The next p bits hold k, the number of binary digits of the encoded value
+ k = code >> (32-p);
+
+ // Advance the bit pointer past the p-1 zeros and the p bits of k
+ pointer += 2*p-1;
+
+ // Load the payload into code, left-aligned, then shift right by one and
+ // set the top bit to restore the implicit leading 1 before keeping the
+ // top k bits
+ code = (cPsi->deltaCodes[pointer/32] << (pointer%32)) |
+ ((pointer%32 != 0) * (cPsi->deltaCodes[pointer/32+1] >> (32-(pointer%32))));
+ code = ((code >> 1) | 0x80000000) >> (32-k);
+ pointer += k-1;
+
+ // Bijection back to a signed difference: multiples of negativeGap are
+ // negative differences, every other value is a positive one
+ if(code % cPsi->negativeGap) result += (code - (code/cPsi->negativeGap));
+ else result -= code/cPsi->negativeGap;
+
+ }
+
+ return result;
+
+}
+
+
+/*
+ * Writes exactly `bytes` bytes from `data` to `file`, resuming after short
+ * writes. On failure prints a message naming `filename` and terminates the
+ * process with a failure status.
+ */
+static void writeDeltaPsiField(int file, const void *data, size_t bytes, const char *filename) {
+
+	const char *cursor = (const char *)data;
+
+	while(bytes > 0) {
+		ssize_t done = write(file, cursor, bytes);
+		if(done <= 0) {
+			printf("Cannot write file %s\n", filename);
+			exit(EXIT_FAILURE);
+		}
+		cursor += done;
+		bytes -= (size_t)done;
+	}
+
+}
+
+
+/*
+ * Serialises a DeltaCompressedPsi to `filename`.
+ *
+ * Layout, in order: T, negativeGap, deltaCodesSize, the deltaCodes words,
+ * numberOfSamples, the samples array, the pointers array and totalMem.
+ *
+ * The file is created if needed and truncated if it already exists, so no
+ * bytes of a previous, longer file are left behind. Any I/O error terminates
+ * the process with a failure status.
+ */
+void storeDeltaCompressedPsi(DeltaCompressedPsi *compressedPsi, char *filename) {
+
+	int file;
+
+	if( (file = open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0700)) < 0) {
+		printf("Cannot open file %s\n", filename);
+		exit(EXIT_FAILURE);
+	}
+
+	writeDeltaPsiField(file, &(compressedPsi->T), sizeof(int), filename);
+	writeDeltaPsiField(file, &(compressedPsi->negativeGap), sizeof(int), filename);
+	writeDeltaPsiField(file, &(compressedPsi->deltaCodesSize), sizeof(int), filename);
+	writeDeltaPsiField(file, compressedPsi->deltaCodes, compressedPsi->deltaCodesSize*sizeof(int), filename);
+	writeDeltaPsiField(file, &(compressedPsi->numberOfSamples), sizeof(int), filename);
+	writeDeltaPsiField(file, compressedPsi->samples, compressedPsi->numberOfSamples*sizeof(int), filename);
+	writeDeltaPsiField(file, compressedPsi->pointers, compressedPsi->numberOfSamples*sizeof(int), filename);
+	writeDeltaPsiField(file, &(compressedPsi->totalMem), sizeof(int), filename);
+
+	// close() can report a write error that was deferred by the kernel
+	if(close(file) < 0) {
+		printf("Cannot write file %s\n", filename);
+		exit(EXIT_FAILURE);
+	}
+
+}
+
+
+/*
+ * Reads exactly `bytes` bytes from `file` into `data`, resuming after short
+ * reads. Returns 1 on success and 0 on error or premature end of file.
+ */
+static int readDeltaPsiField(int file, void *data, size_t bytes) {
+
+	char *cursor = (char *)data;
+
+	while(bytes > 0) {
+		ssize_t got = read(file, cursor, bytes);
+		if(got <= 0) return 0;
+		cursor += got;
+		bytes -= (size_t)got;
+	}
+	return 1;
+
+}
+
+
+/* Reports an unreadable or truncated file and terminates with a failure status. */
+static void deltaPsiLoadFailed(const char *filename) {
+	printf("Cannot read file %s\n", filename);
+	exit(EXIT_FAILURE);
+}
+
+
+/*
+ * Loads a DeltaCompressedPsi previously written by storeDeltaCompressedPsi.
+ *
+ * Fields are read in this order: T, negativeGap, deltaCodesSize, the
+ * deltaCodes words, numberOfSamples, the samples array, the pointers array
+ * and totalMem. The three arrays are allocated with malloc and are owned by
+ * the returned structure.
+ *
+ * A file that cannot be opened, is truncated, or requires more memory than
+ * can be allocated terminates the process with a failure status. The
+ * trailing totalMem field is optional: when it is absent it is recomputed
+ * with the same formula deltaCompressPsi uses.
+ */
+DeltaCompressedPsi loadDeltaCompressedPsi(char *filename) {
+
+	DeltaCompressedPsi compressedPsi;
+	size_t codeBytes, sampleBytes;
+	int file;
+
+	if( (file = open(filename, O_RDONLY)) < 0) {
+		printf("Cannot read file %s\n", filename);
+		exit(EXIT_FAILURE);
+	}
+
+	if(!readDeltaPsiField(file, &(compressedPsi.T), sizeof(int)) ||
+	   !readDeltaPsiField(file, &(compressedPsi.negativeGap), sizeof(int)) ||
+	   !readDeltaPsiField(file, &(compressedPsi.deltaCodesSize), sizeof(int)))
+		deltaPsiLoadFailed(filename);
+
+	// Reject sizes whose byte count would overflow size_t
+	if(compressedPsi.deltaCodesSize > ((size_t)-1)/sizeof(int)) deltaPsiLoadFailed(filename);
+	codeBytes = (size_t)compressedPsi.deltaCodesSize*sizeof(int);
+
+	compressedPsi.deltaCodes = (unsigned int *)malloc(codeBytes);
+	if(codeBytes > 0 && compressedPsi.deltaCodes == NULL) deltaPsiLoadFailed(filename);
+	if(!readDeltaPsiField(file, compressedPsi.deltaCodes, codeBytes)) deltaPsiLoadFailed(filename);
+
+	if(!readDeltaPsiField(file, &(compressedPsi.numberOfSamples), sizeof(int)))
+		deltaPsiLoadFailed(filename);
+	if(compressedPsi.numberOfSamples > ((size_t)-1)/sizeof(int)) deltaPsiLoadFailed(filename);
+	sampleBytes = (size_t)compressedPsi.numberOfSamples*sizeof(int);
+
+	compressedPsi.samples = (unsigned int *)malloc(sampleBytes);
+	compressedPsi.pointers = (unsigned int *)malloc(sampleBytes);
+	if(sampleBytes > 0 && (compressedPsi.samples == NULL || compressedPsi.pointers == NULL))
+		deltaPsiLoadFailed(filename);
+	if(!readDeltaPsiField(file, compressedPsi.samples, sampleBytes) ||
+	   !readDeltaPsiField(file, compressedPsi.pointers, sampleBytes))
+		deltaPsiLoadFailed(filename);
+
+	// totalMem is the last field; tolerate files that end before it
+	if(!readDeltaPsiField(file, &(compressedPsi.totalMem), sizeof(int)))
+		compressedPsi.totalMem = (unsigned int)(codeBytes + 2*sampleBytes + 4*sizeof(int));
+
+	close(file);
+
+	return compressedPsi;
+
+}
--- /dev/null
+#include <stdlib.h>
+#include <stdio.h>
+#include <malloc.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+
+
+
+// Sampled, delta-coded representation of the Psi array.
+typedef struct {
+ unsigned int T; // sampling period: position/T selects the sample, position%T codes follow it
+ unsigned int negativeGap; // decoded codes that are multiples of this value stand for negative differences
+ unsigned int deltaCodesSize; // length of deltaCodes, in 32-bit words
+ unsigned int *deltaCodes; // bit stream holding the encoded differences
+ unsigned int numberOfSamples; // number of entries in samples and in pointers
+ unsigned int *samples; // absolute Psi value stored for each sampled position
+ unsigned int *pointers; // for each sample, absolute bit position in deltaCodes of the codes that follow it
+ unsigned int totalMem; // the size in bytes used;
+} DeltaCompressedPsi;
+
+
+// FUNCTION PROTOTYPES
+// Builds the compressed structure from Psi (psiSize entries) with sampling period T.
+DeltaCompressedPsi deltaCompressPsi(unsigned int *Psi, unsigned int psiSize, unsigned int T);
+// Returns the value of Psi at the given position, decoded from cPsi.
+int getDeltaPsiValue(DeltaCompressedPsi *cPsi, unsigned int position);
+// Writes compressedPsi to the file named filename.
+void storeDeltaCompressedPsi(DeltaCompressedPsi *compressedPsi, char *filename);
+// Reads a structure back from the file named filename.
+DeltaCompressedPsi loadDeltaCompressedPsi(char *filename);
+// Presumably releases the arrays owned by compressedPsi (definition not visible here; confirm).
+void destroyDeltaCodesCompressedPsi(DeltaCompressedPsi *compressedPsi);
+
+// FUNCTION IMPLEMENTATIONS (the copy below is commented out)
+
+//void destroyDeltaCodesCompressedPsi(DeltaCompressedPsi *compressedPsi) {
+// free(compressedPsi->deltaCodes);
+// free(compressedPsi->samples);
+// free(compressedPsi->pointers);
+//}
+//
+//
+//DeltaCompressedPsi deltaCompressPsi(unsigned int *Psi, unsigned int psiSize, unsigned int T) {
+//
+// DeltaCompressedPsi cPsi;
+//
+// int numberOfSamples;
+// register int diff, deltaCodesPos;
+// register unsigned int k, p, aux, diffpositive, code, index;
+// unsigned int samplesIndex, codeLenght, currentInput, wordsDeltaCodes, totalSize;
+// unsigned int *deltaCodes;
+// unsigned int *samples;
+// unsigned int *pointers;
+//
+// // Auxiliar para deltaCodes (estimamos como espacio maximo o do array de sufixos)
+// unsigned int *deltaCodesAux;
+//
+// // Calculamos o mellor valor para negativeGap <= 64
+// unsigned int negativeGap;
+// register unsigned int maxNegativeBits = 0;
+// k = psiSize;
+// while(k) {
+// k >>= 1;
+// maxNegativeBits++;
+// }
+// if(maxNegativeBits<=26) negativeGap = 64;
+// else negativeGap = 1<<(32-maxNegativeBits);
+//
+// // Reservamos espacio para as estructuras
+// numberOfSamples = (psiSize + T - 1) / T;
+// samples = (unsigned int *)malloc(sizeof(int)*numberOfSamples);
+// pointers = (unsigned int *)malloc(sizeof(int)*numberOfSamples);
+//
+// deltaCodesAux = (unsigned int *)malloc(sizeof(int)*psiSize);
+// for(index=0; index<psiSize; index++) deltaCodesAux[index] = 0;
+//
+// samplesIndex = 0;
+// deltaCodesPos = 0;
+// for(index=0; index<psiSize; index++) {
+//
+// if(index % T) {
+//
+// diff = Psi[index] - currentInput;
+// currentInput = Psi[index];
+//
+// // Calculamos o codigo correspondente
+// if(diff>0) diffpositive = (negativeGap*diff-1)/(negativeGap-1);
+// else diffpositive = -negativeGap*diff;
+//
+// k = 0;
+// aux = diffpositive;
+// while(aux) {
+// aux >>= 1;
+// k++;
+// }
+// aux = k;
+// p = 0;
+// while(aux) {
+// aux >>= 1;
+// p++;
+// }
+//
+// code = diffpositive & ((1<<(k-1))-1);
+// codeLenght = 2*p+k-2;
+//
+// // Primeiro metemos os p-1 0's iniciais
+// deltaCodesPos += p-1;
+//
+// // Agora metemos os p bits de k
+// if( ((deltaCodesPos%32) + p) > 32 ) {
+// deltaCodesAux[deltaCodesPos/32] |= (k>>((deltaCodesPos%32)+p-32));
+// deltaCodesAux[deltaCodesPos/32+1] = (k<<(64-(deltaCodesPos%32)-p));
+// } else {
+// deltaCodesAux[deltaCodesPos/32] |= (k<<(32-p-(deltaCodesPos%32)));
+// }
+// deltaCodesPos += p;
+//
+// // Por �ltimo metemos os k-1 bits de code (sen o 1 inicial)
+// if( ((deltaCodesPos%32) + (k-1)) > 32 ) {
+// deltaCodesAux[deltaCodesPos/32] |= (code>>((deltaCodesPos%32)+(k-1)-32));
+// deltaCodesAux[deltaCodesPos/32+1] = (code<<(64-(deltaCodesPos%32)-(k-1)));
+// } else {
+// deltaCodesAux[deltaCodesPos/32] |= (code<<(32-(k-1)-(deltaCodesPos%32)));
+// }
+// deltaCodesPos += k-1;
+//
+// } else {
+// samples[samplesIndex] = Psi[index];
+// pointers[samplesIndex++] = deltaCodesPos;
+// currentInput = Psi[index];
+// }
+//
+// }
+//
+// // Ahora que xa sabemos o espacio necesario para os deltaCodes, reservamolo e liberamos a estructura auxiliar
+// wordsDeltaCodes = (deltaCodesPos+31)/32;
+// deltaCodes = (unsigned int *)malloc(sizeof(int)*wordsDeltaCodes);
+// for(index=0;index<wordsDeltaCodes;index++) deltaCodes[index] = deltaCodesAux[index];
+// free(deltaCodesAux);
+//
+// totalSize = sizeof(int)*wordsDeltaCodes + 2*sizeof(int)*numberOfSamples + 4*sizeof(int);
+// printf("Compressed Psi size = %d bytes\n", totalSize);
+//
+// // Asignamos os valores a cPsi e devolvemolo
+// cPsi.T = T;
+// cPsi.negativeGap = negativeGap;
+// cPsi.deltaCodesSize = wordsDeltaCodes;
+// cPsi.deltaCodes = deltaCodes;
+// cPsi.numberOfSamples = numberOfSamples;
+// cPsi.samples = samples;
+// cPsi.pointers = pointers;
+// return cPsi;
+//
+//}
+//
+//
+//int getDeltaPsiValue(DeltaCompressedPsi *cPsi, unsigned int position) {
+//
+// int result;
+// register unsigned int code, aux, pointerAux, mask, pointer, toDecode, p, k;
+//
+// // Collemos a mostra inmediatamente inferior, e o punteiro o array de codigos
+// // pointer = punteiro absoluto sobre deltaCodes
+// result = cPsi->samples[position/cPsi->T];
+// pointer = cPsi->pointers[position/cPsi->T];
+//
+// // Calculamos o numero de codigos a decodificar a partir da mostra
+// toDecode = position % cPsi->T;
+//
+// while(toDecode--) {
+//
+// // Collemos o n�mero ceros iniciais
+// // Po�emos o inicio do c�digo nun enteiro (code) alineado a esquerda
+// // Non importa que non colla todo o c�digo, pero si temos asegurado que
+// // colle p e k (k<=32 (6bits), p<=5bits)
+// code = (cPsi->deltaCodes[pointer/32] << (pointer%32)) |
+// ((pointer%32 != 0) * (cPsi->deltaCodes[pointer/32+1] >> (32-(pointer%32))));
+//
+// //Ahora contamos o n�mero de ceros (p) que hai nas posicions da esquerda de code
+// p = 1;
+// while(!(code & 0x80000000)) {
+// code <<= 1;
+// p++;
+// }
+//
+// // Ahora calculamos o numero de digitos da representacion binaria do codigo (k)
+// k = code >> (32-p);
+//
+// // Actualizamos o punteiro global sobre deltaCodes
+// pointer += 2*p-1;
+//
+// // Po�emos a representacion binaria do codigo nun enteiro (code) alineado a esquerda
+// code = (cPsi->deltaCodes[pointer/32] << (pointer%32)) |
+// ((pointer%32 != 0) * (cPsi->deltaCodes[pointer/32+1] >> (32-(pointer%32))));
+// code = ((code >> 1) | 0x80000000) >> (32-k);
+// pointer += k-1;
+//
+// // Bixecci�n
+// if(code % cPsi->negativeGap) result += (code - (code/cPsi->negativeGap));
+// else result -= code/cPsi->negativeGap;
+//
+// }
+//
+// return result;
+//
+//}
+//
+//
+//void storeDeltaCompressedPsi(DeltaCompressedPsi *compressedPsi, char *filename) {
+//
+// int file;
+//
+// if( (file = open(filename, O_WRONLY|O_CREAT, 0700)) < 0) {
+// printf("Cannot open file %s\n", filename);
+// exit(0);
+// }
+// write(file, &(compressedPsi->T), sizeof(int));
+// write(file, &(compressedPsi->negativeGap), sizeof(int));
+// write(file, &(compressedPsi->deltaCodesSize), sizeof(int));
+// write(file, compressedPsi->deltaCodes, compressedPsi->deltaCodesSize*sizeof(int));
+// write(file, &(compressedPsi->numberOfSamples), sizeof(int));
+// write(file, compressedPsi->samples, compressedPsi->numberOfSamples*sizeof(int));
+// write(file, compressedPsi->pointers, compressedPsi->numberOfSamples*sizeof(int));
+// close(file);
+//
+//}
+//
+//
+//DeltaCompressedPsi loadDeltaCompressedPsi(char *filename) {
+//
+// DeltaCompressedPsi compressedPsi;
+//
+// int file;
+//
+// if( (file = open(filename, O_RDONLY)) < 0) {
+// printf("Cannot read file %s\n", filename);
+// exit(0);
+// }
+// read(file, &(compressedPsi.T), sizeof(int));
+// read(file, &(compressedPsi.negativeGap), sizeof(int));
+// read(file, &(compressedPsi.deltaCodesSize), sizeof(int));
+// compressedPsi.deltaCodes = (unsigned int *)malloc(compressedPsi.deltaCodesSize*sizeof(int));
+// read(file, compressedPsi.deltaCodes, compressedPsi.deltaCodesSize*sizeof(int));
+// read(file, &(compressedPsi.numberOfSamples), sizeof(int));
+// compressedPsi.samples = (unsigned int *)malloc(compressedPsi.numberOfSamples*sizeof(int));
+// compressedPsi.pointers = (unsigned int *)malloc(compressedPsi.numberOfSamples*sizeof(int));
+// read(file, compressedPsi.samples, compressedPsi.numberOfSamples*sizeof(int));
+// read(file, compressedPsi.pointers, compressedPsi.numberOfSamples*sizeof(int));
+// close(file);
+//
+// return compressedPsi;
+//
+//}
--- /dev/null
+
+#include "psiGonzalo.h"
+
+
+// FUNCTION IMPLEMENTATIONS
+
+/*
+ * Releases everything owned by a GonzaloCompressedPsi: both Huffman
+ * structures (through freeHuff) and the two bit arrays. The structure
+ * itself is not freed.
+ */
+void destroyGonzaloCompressedPsi(GonzaloCompressedPsi *compressedPsi) {
+
+	GonzaloCompressedPsi *psi = compressedPsi;
+
+	// Huffman structures first: run lengths, then differences
+	freeHuff(psi->Hlen);
+	freeHuff(psi->Hacc);
+
+	// Encoded stream and per-sample bit offsets
+	free(psi->cPsi);
+	free(psi->bposS);
+}
+
+
+/*
+ * Compresses Psi with sampling, run-length encoding of +1 differences and
+ * two Huffman codes.
+ *
+ * Psi      values to compress.
+ * psiSize  stored as `links`; both passes visit indices 0..psiSize inclusive.
+ *          NOTE(review): this means Psi must provide psiSize+1 readable
+ *          entries; confirm against the callers.
+ * T        sampling period: every index that is a multiple of T stores its
+ *          absolute value and the bit offset at which it was written.
+ * HUFF     size of the difference alphabet. Symbol 0 is an escape followed by
+ *          an absolute value, symbol 1 announces a run of +1 differences and
+ *          symbols 2..HUFF-1 stand for the difference itself.
+ *
+ * Returns the compressed structure by value. Its cPsi and bposS buffers are
+ * allocated with malloc. The total size is also printed to stdout.
+ */
+GonzaloCompressedPsi gonzaloCompressPsi(uint *Psi, uint psiSize, uint T, uint HUFF) {
+
+	GonzaloCompressedPsi compressedPsi;
+
+	uint i;
+	uint oi = 0;	// index at which the current run of +1 differences started
+	int ok,k;	// ok: difference seen at the previous index (0 after a sample); k: current one
+	uint _cptr;
+
+	uint *_cPsi;
+	uint *_bposS;
+
+	uint links = psiSize;
+	uint samplen = T;
+	uint _bplen;
+	uint pslen;
+	uint totexc;
+
+	uint *acc,*lacc;
+	THuff Hacc, Hlen;
+
+	uint totalSize;
+
+	// Build the two Huffman codes: one for the differences (acc) and one for
+	// the run lengths (lacc). acc and lacc are scratch frequency vectors that
+	// are released before returning.
+	acc = (uint *)malloc (HUFF*sizeof(uint));
+	lacc = (uint *)malloc ((samplen-1)*sizeof(uint));
+	for (k=0;k<HUFF;k++) acc[k]=0;
+	for (k=0;k<samplen-1;k++) lacc[k]=0;
+
+	// First pass: count symbol frequencies
+	ok = 0;
+	k = Psi[0];
+	for (i=0;i<=links;i++) {
+		if ((k == 1) && (i % samplen)) {
+			// Inside a run; remember where it started
+			if (ok != 1) oi = i;
+		}
+		else {
+			if (ok == 1) {
+				// A run just ended: one run symbol plus its length (minus one)
+				acc[1]++;
+				lacc[i-oi-1]++;
+			}
+			if (i % samplen) {
+				if ((k < 1) || (k >= HUFF)) acc[0]++;
+				else acc[k]++;
+			}
+		}
+		ok = (i % samplen) ? k : 0;
+		// The difference computed on the last iteration is never used, so do
+		// not read Psi[links+1]
+		if (i < links) k = Psi[i+1]-Psi[i];
+	}
+
+	// Account for a run that reaches the end of the sequence
+	if (ok == 1) {
+		acc[1]++;
+		lacc[i-oi-1]++;
+	}
+
+	Hacc = createHuff (acc,HUFF-1, UNSORTED);
+	Hlen = createHuff (lacc,samplen-2, UNSORTED);
+	totexc = acc[0];
+	pslen = bits(psiSize+1);
+	_bplen = bits(Hacc.total+Hlen.total+(1+links/samplen+totexc)*pslen);
+	_bposS = (uint *)malloc ((((1+links/samplen)*_bplen+W-1)/W)*sizeof(uint));
+	_cPsi = (uint *)malloc (((Hacc.total+Hlen.total+(1+links/samplen+totexc)*pslen+W-1)/W)*sizeof(uint));
+
+	// Second pass: emit the encoded stream following the same traversal
+	_cptr = 0;
+	ok = 0;
+	k = Psi[0];
+
+	for (i=0;i<=links;i++) {
+
+		if ((k == 1) && (i % samplen)) {
+			if (ok != 1) oi = i;
+		}
+		else {
+			if (ok == 1) {
+				_cptr = encodeHuff (Hacc,1,_cPsi,_cptr);
+				_cptr = encodeHuff(Hlen,i-oi-1,_cPsi,_cptr);
+			}
+			if (i % samplen) {
+				if ((k > 1) && (k < HUFF)) _cptr = encodeHuff (Hacc,k,_cPsi,_cptr);
+				else {
+					// Escape: symbol 0 followed by the absolute value
+					_cptr = encodeHuff (Hacc,0,_cPsi,_cptr);
+					bitwrite (_cPsi,_cptr,pslen,Psi[i]);
+					_cptr += pslen;
+				}
+			}
+			else {
+				// Sample: record where it starts, then store the absolute value
+				bitwrite (_bposS,(i/samplen)*_bplen,_bplen,_cptr);
+				bitwrite (_cPsi,_cptr,pslen,Psi[i]);
+				_cptr += pslen;
+			}
+		}
+		ok = (i % samplen) ? k : 0;
+		if (i < links) k = Psi[i+1]-Psi[i];
+	}
+
+	if (ok == 1) {
+		_cptr = encodeHuff (Hacc,1,_cPsi,_cptr);
+		_cptr = encodeHuff(Hlen,i-oi-1,_cPsi,_cptr);
+	}
+
+	// Compute the total space used
+	totalSize = (((1+links/samplen)*_bplen+W-1)/W)*sizeof(uint) +
+		((Hacc.total+Hlen.total+(1+links/samplen+totexc)*pslen+W-1)/W)*sizeof(uint) +
+		5*sizeof(int) + sizeHuff(Hacc) + sizeHuff(Hlen);
+	printf("\n\tCompressed Psi size = %u bytes\n", totalSize);
+
+	// Required before decoding
+	prepareToDecode(&Hacc);
+	prepareToDecode(&Hlen);
+
+	// Fill in the result
+	compressedPsi.links = psiSize;
+	compressedPsi.totexc = totexc;
+	compressedPsi.cPsi = _cPsi;
+	compressedPsi.samplen = samplen;
+	compressedPsi.bposS = _bposS;
+	compressedPsi.bplen = _bplen;
+	compressedPsi.pslen = pslen;
+	compressedPsi.Hacc = Hacc;
+	compressedPsi.Hlen = Hlen;
+	compressedPsi.totalMem = totalSize;
+
+	free(acc);
+	free(lacc);
+
+	return compressedPsi;
+}
+
+
+
+/*
+ * Decodes Psi[position] from a GonzaloCompressedPsi.
+ *
+ * Reads the absolute value stored for the sample block that contains
+ * position, then decodes symbols until position is reached: symbol 0 is
+ * followed by a new absolute value, symbol 1 by a run length, and any other
+ * symbol is added to the current value.
+ */
+int getGonzaloPsiValue(GonzaloCompressedPsi *compressedPsi, unsigned int position) {
+
+	uint *stream = compressedPsi->cPsi;
+	uint *offsets = compressedPsi->bposS;
+	uint samplen = compressedPsi->samplen;
+	uint bplen = compressedPsi->bplen;
+	uint pslen = compressedPsi->pslen;
+	THuff *Hacc = &compressedPsi->Hacc;
+	THuff *Hlen = &compressedPsi->Hlen;
+
+	uint block = position/samplen;
+	uint current = block*samplen;	// index whose value is held in val
+	uint head, val, dval, rlen;
+
+	// Jump to the sample and read its absolute value
+	head = bitread(offsets,block*bplen,bplen);
+	val = bitread(stream,head,pslen);
+	head += pslen;
+
+	while (current < position) {
+
+		head = decodeHuff(Hacc,&dval,stream,head);
+
+		switch (dval) {
+
+		case 0:
+			// Escape: an absolute value follows
+			val = bitread(stream,head,pslen);
+			head += pslen;
+			current++;
+			break;
+
+		case 1:
+			// Run of +1 differences; the stored length is one less
+			head = decodeHuff(Hlen,&rlen,stream,head);
+			rlen++;
+			if (current + rlen >= position) return val + (position-current);
+			val += rlen;
+			current += rlen;
+			break;
+
+		default:
+			// Plain difference
+			val += dval;
+			current++;
+			break;
+		}
+
+	}
+
+	return val;
+
+}
+
+
+/*
+ * Writes exactly `bytes` bytes from `data` to `file`, resuming after short
+ * writes. On failure prints a message naming `filename` and terminates the
+ * process with a failure status.
+ */
+static void writeGonzaloPsiField(int file, const void *data, size_t bytes, const char *filename) {
+
+	const char *cursor = (const char *)data;
+
+	while (bytes > 0) {
+		ssize_t done = write(file, cursor, bytes);
+		if (done <= 0) {
+			printf("Cannot write file %s\n", filename);
+			exit(EXIT_FAILURE);
+		}
+		cursor += done;
+		bytes -= (size_t)done;
+	}
+
+}
+
+
+/*
+ * Serialises a GonzaloCompressedPsi to `filename`.
+ *
+ * Layout, in order: links, totexc, samplen, bplen, pslen; then for each of
+ * Hacc and Hlen the fields max, lim, depth, s.symb, num and fst; then the
+ * bposS array, the cPsi array and totalMem.
+ * The `total` field of the Huffman structures is not written.
+ *
+ * The file is created if needed and truncated if it already exists, so no
+ * bytes of a previous, longer file are left behind. Any I/O error terminates
+ * the process with a failure status.
+ */
+void storeGonzaloCompressedPsi(GonzaloCompressedPsi *compressedPsi, char *filename) {
+
+	int file;
+	THuff Hacc;
+	THuff Hlen;
+
+	if( (file = open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0700)) < 0) {
+		printf("Cannot open file %s\n", filename);
+		exit(EXIT_FAILURE);
+	}
+
+	// Local copies of the Huffman structures
+	Hacc = compressedPsi->Hacc;
+	Hlen = compressedPsi->Hlen;
+
+	writeGonzaloPsiField(file, &(compressedPsi->links), sizeof(int), filename);
+	writeGonzaloPsiField(file, &(compressedPsi->totexc), sizeof(int), filename);
+	writeGonzaloPsiField(file, &(compressedPsi->samplen), sizeof(int), filename);
+	writeGonzaloPsiField(file, &(compressedPsi->bplen), sizeof(int), filename);
+	writeGonzaloPsiField(file, &(compressedPsi->pslen), sizeof(int), filename);
+
+	// Huffman code of the differences
+	writeGonzaloPsiField(file, &Hacc.max, sizeof(int), filename);
+	writeGonzaloPsiField(file, &Hacc.lim, sizeof(int), filename);
+	writeGonzaloPsiField(file, &Hacc.depth, sizeof(int), filename);
+	writeGonzaloPsiField(file, Hacc.s.symb, (Hacc.lim+1)*sizeof(int), filename);
+	writeGonzaloPsiField(file, Hacc.num, (Hacc.depth+1)*sizeof(int), filename);
+	writeGonzaloPsiField(file, Hacc.fst, (Hacc.depth+1)*sizeof(int), filename);
+
+	// Huffman code of the run lengths
+	writeGonzaloPsiField(file, &Hlen.max, sizeof(int), filename);
+	writeGonzaloPsiField(file, &Hlen.lim, sizeof(int), filename);
+	writeGonzaloPsiField(file, &Hlen.depth, sizeof(int), filename);
+	writeGonzaloPsiField(file, Hlen.s.symb, (Hlen.lim+1)*sizeof(int), filename);
+	writeGonzaloPsiField(file, Hlen.num, (Hlen.depth+1)*sizeof(int), filename);
+	writeGonzaloPsiField(file, Hlen.fst, (Hlen.depth+1)*sizeof(int), filename);
+
+	// Per-sample bit offsets, then the encoded stream
+	writeGonzaloPsiField(file, compressedPsi->bposS, ((compressedPsi->bplen*(1+compressedPsi->links/compressedPsi->samplen)+W-1)/W)*sizeof(uint), filename);
+	writeGonzaloPsiField(file, compressedPsi->cPsi, ((Hacc.total+Hlen.total+(1+compressedPsi->links/compressedPsi->samplen+compressedPsi->totexc)*compressedPsi->pslen+W-1)/W)*sizeof(int), filename);
+
+	writeGonzaloPsiField(file, &(compressedPsi->totalMem), sizeof(int), filename);
+
+	// close() can report a write error that was deferred by the kernel
+	if (close(file) < 0) {
+		printf("Cannot write file %s\n", filename);
+		exit(EXIT_FAILURE);
+	}
+
+}
+
+
+/*
+ * Reads exactly `bytes` bytes from `file` into `data`, resuming after short
+ * reads. Returns 1 on success and 0 on error or premature end of file.
+ */
+static int readGonzaloPsiField(int file, void *data, size_t bytes) {
+
+	char *cursor = (char *)data;
+
+	while (bytes > 0) {
+		ssize_t got = read(file, cursor, bytes);
+		if (got <= 0) return 0;
+		cursor += got;
+		bytes -= (size_t)got;
+	}
+	return 1;
+
+}
+
+
+/* Reports an unreadable or truncated file and terminates with a failure status. */
+static void gonzaloPsiLoadFailed(const char *filename) {
+	printf("Cannot read file %s\n", filename);
+	exit(EXIT_FAILURE);
+}
+
+
+/*
+ * Reads one Huffman structure: max, lim, depth, then the s.symb, num and fst
+ * arrays, which are allocated here. The `total` field is not present in the
+ * file and is set to 0.
+ */
+static void loadGonzaloPsiHuff(int file, THuff *tree, const char *filename) {
+
+	size_t symbBytes, levelBytes;
+
+	if (!readGonzaloPsiField(file, &tree->max, sizeof(int)) ||
+	    !readGonzaloPsiField(file, &tree->lim, sizeof(int)) ||
+	    !readGonzaloPsiField(file, &tree->depth, sizeof(int)))
+		gonzaloPsiLoadFailed(filename);
+
+	symbBytes = (tree->lim+1)*sizeof(int);
+	levelBytes = (tree->depth+1)*sizeof(int);
+
+	tree->s.symb = (unsigned int *) malloc(symbBytes);
+	tree->num = (unsigned int *) malloc(levelBytes);
+	tree->fst = (unsigned int *) malloc(levelBytes);
+	if ((symbBytes > 0 && tree->s.symb == NULL) ||
+	    (levelBytes > 0 && (tree->num == NULL || tree->fst == NULL)))
+		gonzaloPsiLoadFailed(filename);
+
+	if (!readGonzaloPsiField(file, tree->s.symb, symbBytes) ||
+	    !readGonzaloPsiField(file, tree->num, levelBytes) ||
+	    !readGonzaloPsiField(file, tree->fst, levelBytes))
+		gonzaloPsiLoadFailed(filename);
+
+	tree->total = 0;
+
+}
+
+
+/*
+ * Loads a GonzaloCompressedPsi previously written by
+ * storeGonzaloCompressedPsi.
+ *
+ * Fields are read in this order: links, totexc, samplen, bplen, pslen, the
+ * Hacc structure, the Hlen structure, the bposS array, the cPsi array and
+ * totalMem. All arrays are allocated with malloc and owned by the result.
+ *
+ * The length of cPsi cannot be recomputed from the stored fields because the
+ * Huffman `total` values are not in the file, so it is taken from the file
+ * size: everything between the end of bposS and the trailing totalMem word.
+ * This assumes the file ends right after totalMem.
+ *
+ * A file that cannot be opened, is truncated, or requires more memory than
+ * can be allocated terminates the process with a failure status.
+ */
+GonzaloCompressedPsi loadGonzaloCompressedPsi(char *filename) {
+
+	GonzaloCompressedPsi compressedPsi;
+
+	THuff Hacc;
+	THuff Hlen;
+
+	size_t bposBytes, cPsiBytes;
+	off_t here, end;
+	int file;
+
+	if( (file = open(filename, O_RDONLY)) < 0) {
+		printf("Cannot read file %s\n", filename);
+		exit(EXIT_FAILURE);
+	}
+
+	if (!readGonzaloPsiField(file, &(compressedPsi.links), sizeof(int)) ||
+	    !readGonzaloPsiField(file, &(compressedPsi.totexc), sizeof(int)) ||
+	    !readGonzaloPsiField(file, &(compressedPsi.samplen), sizeof(int)) ||
+	    !readGonzaloPsiField(file, &(compressedPsi.bplen), sizeof(int)) ||
+	    !readGonzaloPsiField(file, &(compressedPsi.pslen), sizeof(int)))
+		gonzaloPsiLoadFailed(filename);
+
+	// samplen is used as a divisor below
+	if (compressedPsi.samplen == 0) gonzaloPsiLoadFailed(filename);
+
+	// Huffman code of the differences, then the one of the run lengths
+	loadGonzaloPsiHuff(file, &Hacc, filename);
+	compressedPsi.Hacc = Hacc;
+	loadGonzaloPsiHuff(file, &Hlen, filename);
+	compressedPsi.Hlen = Hlen;
+
+	// Per-sample bit offsets
+	bposBytes = ((compressedPsi.bplen*(1+compressedPsi.links/compressedPsi.samplen)+W-1)/W)*sizeof(uint);
+	compressedPsi.bposS = (uint *) malloc (bposBytes);
+	if (bposBytes > 0 && compressedPsi.bposS == NULL) gonzaloPsiLoadFailed(filename);
+	if (!readGonzaloPsiField(file, compressedPsi.bposS, bposBytes)) gonzaloPsiLoadFailed(filename);
+
+	// Encoded stream: whatever remains before the trailing totalMem word
+	here = lseek(file, 0, SEEK_CUR);
+	end = lseek(file, 0, SEEK_END);
+	if (here < 0 || end < 0 || end - here < (off_t)sizeof(int) ||
+	    lseek(file, here, SEEK_SET) < 0)
+		gonzaloPsiLoadFailed(filename);
+	cPsiBytes = (size_t)(end - here) - sizeof(int);
+
+	compressedPsi.cPsi = (uint *) malloc (cPsiBytes);
+	if (cPsiBytes > 0 && compressedPsi.cPsi == NULL) gonzaloPsiLoadFailed(filename);
+	if (!readGonzaloPsiField(file, compressedPsi.cPsi, cPsiBytes)) gonzaloPsiLoadFailed(filename);
+
+	if (!readGonzaloPsiField(file, &(compressedPsi.totalMem), sizeof(int)))
+		gonzaloPsiLoadFailed(filename);
+
+	close(file);
+
+	return compressedPsi;
+
+}
--- /dev/null
+#include <stdlib.h>
+#include <stdio.h>
+#include <malloc.h>
+#include "../utils/huff.h"
+
+// STRUCTURE REPRESENTING THE COMPRESSED PSI FUNCTION
+typedef struct {
+ uint links; // psiSize as passed to gonzaloCompressPsi
+ uint totexc; // number of escape symbols (symbol 0) emitted by the encoder
+ uint samplen; // sampling period (T as passed to gonzaloCompressPsi)
+ uint bplen; // width in bits of each entry of bposS
+ uint pslen; // width in bits of each absolute Psi value stored in cPsi
+ uint *cPsi; // encoded bit stream
+ uint *bposS; // packed array: for each sample, bit position of its value inside cPsi
+ THuff Hacc; // Huffman code of the difference symbols
+ THuff Hlen; // Huffman code of the run lengths
+ unsigned int totalMem; // the size in bytes used;
+} GonzaloCompressedPsi;
+
+
+// FUNCTION PROTOTYPES
+// Builds the compressed structure from Psi with sampling period T and a difference alphabet of HUFF symbols.
+GonzaloCompressedPsi gonzaloCompressPsi(uint *Psi, uint psiSize, uint T, uint HUFF);
+// Returns the value of Psi at the given position, decoded from compressedPsi.
+int getGonzaloPsiValue(GonzaloCompressedPsi *compressedPsi, unsigned int position);
+// Writes compressedPsi to the file named filename.
+void storeGonzaloCompressedPsi(GonzaloCompressedPsi *compressedPsi, char *filename);
+// Reads a structure back from the file named filename.
+GonzaloCompressedPsi loadGonzaloCompressedPsi(char *filename);
+// Frees the Huffman structures and the cPsi and bposS arrays owned by compressedPsi.
+void destroyGonzaloCompressedPsi(GonzaloCompressedPsi *compressedPsi);
+
+
+//// FUNCTION IMPLEMENTATIONS (the copy below is commented out)
+//
+//void destroyGonzaloCompressedPsi(GonzaloCompressedPsi *compressedPsi) {
+//
+// //free(compressedPsi->Hlen.s.spos);
+// //free(compressedPsi->Hacc.s.spos);
+// freeHuff(compressedPsi->Hlen);
+// freeHuff(compressedPsi->Hacc);
+// free(compressedPsi->cPsi);
+// free(compressedPsi->bposS);
+//}
+//
+//
+//GonzaloCompressedPsi gonzaloCompressPsi(uint *Psi, uint psiSize, uint T, uint HUFF) {
+//
+// GonzaloCompressedPsi compressedPsi;
+//
+// register uint i;
+// uint oi,j;
+// int ok,k;
+// register uint _cptr;
+//
+// uint *_cPsi;
+// uint *_bposS;
+//
+// uint links = psiSize;
+// uint samplen = T;
+// uint _bplen;
+// uint pslen;
+// uint totexc;
+//
+// uint *acc,*lacc;
+// THuff Hacc, Hlen;
+//
+// uint totalSize;
+//
+// // Construe os arboles de huffman, o dos valores directos
+// // e o das lonxitudes dos runs. Usa como vectores auxiliares de frecuencias
+// // a acc e lacc, que finalmente libera.
+// acc = (uint *)malloc (HUFF*sizeof(uint));
+// lacc = (uint *)malloc ((samplen-1)*sizeof(uint));
+// for (k=0;k<HUFF;k++) acc[k]=0;
+// for (k=0;k<samplen-1;k++) lacc[k]=0;
+//
+// ok = 0;
+// k = Psi[0];
+// for (i=0;i<=links;i++) {
+// if ((k == 1) && (i % samplen)) { if (ok != 1) oi = i; }
+// else {
+// if (ok == 1) {
+// acc[1]++;
+// lacc[i-oi-1]++;
+// }
+// if (i % samplen)
+// if ((k < 1) || (k >= HUFF)) acc[0]++;
+// else acc[k]++;
+// }
+// ok = (i % samplen) ? k : 0;
+// k = Psi[i+1]-Psi[i];
+// }
+//
+// if (ok == 1) {
+// acc[1]++;
+// lacc[i-oi-1]++;
+// }
+//
+// Hacc = createHuff (acc,HUFF-1, UNSORTED);
+// Hlen = createHuff (lacc,samplen-2, UNSORTED);
+// totexc = acc[0];
+// pslen = bits(psiSize+1);
+// _bplen = bits(Hacc.total+Hlen.total+(1+links/samplen+totexc)*pslen);
+// _bposS = (uint *)malloc ((((1+links/samplen)*_bplen+W-1)/W)*sizeof(uint));
+// _cPsi = (uint *)malloc (((Hacc.total+Hlen.total+(1+links/samplen+totexc)*pslen+W-1)/W)*sizeof(uint));
+//
+// _cptr = 0;
+// ok = 0;
+// k = Psi[0];
+//
+// for (i=0;i<=links;i++) {
+//
+// if ((k == 1) && (i % samplen)) { if (ok != 1) oi = i; }
+// else {
+// if (ok == 1) {
+// _cptr = encodeHuff (Hacc,1,_cPsi,_cptr);
+// _cptr = encodeHuff(Hlen,i-oi-1,_cPsi,_cptr);
+// }
+// if (i % samplen) {
+// if ((k > 1) && (k < HUFF)) _cptr = encodeHuff (Hacc,k,_cPsi,_cptr);
+// else {
+// _cptr = encodeHuff (Hacc,0,_cPsi,_cptr);
+// bitwrite (_cPsi,_cptr,pslen,Psi[i]);
+// _cptr += pslen;
+// }
+// }
+// else {
+// bitwrite (_bposS,(i/samplen)*_bplen,_bplen,_cptr);
+// bitwrite (_cPsi,_cptr,pslen,Psi[i]);
+// _cptr += pslen;
+// }
+// }
+// ok = (i % samplen) ? k : 0;
+// k = Psi[i+1]-Psi[i];
+// }
+//
+// if (ok == 1) {
+// _cptr = encodeHuff (Hacc,1,_cPsi,_cptr);
+// _cptr = encodeHuff(Hlen,i-oi-1,_cPsi,_cptr);
+// }
+//
+// // Calculamos o espacio total
+// totalSize = (((1+links/samplen)*_bplen+W-1)/W)*sizeof(uint) +
+// ((Hacc.total+Hlen.total+(1+links/samplen+totexc)*pslen+W-1)/W)*sizeof(uint) +
+// 5*sizeof(int) + sizeHuff(Hacc) + sizeHuff(Hlen);
+// printf("Compressed Psi size = %d bytes\n", totalSize);
+//
+// // Necesario antes de decodificar
+// prepareToDecode(&Hacc);
+// prepareToDecode(&Hlen);
+//
+// // Asignamos os valores e devolvemos psi comprimido
+// compressedPsi.links = psiSize;
+// compressedPsi.totexc = totexc;
+// compressedPsi.cPsi = _cPsi;
+// compressedPsi.samplen = samplen;
+// compressedPsi.bposS = _bposS;
+// compressedPsi.bplen = _bplen;
+// compressedPsi.pslen = pslen;
+// compressedPsi.Hacc = Hacc;
+// compressedPsi.Hlen = Hlen;
+//
+// free(acc);
+// free(lacc);
+//
+// return compressedPsi;
+//}
+//
+//
+//
+//int getGonzaloPsiValue(GonzaloCompressedPsi *compressedPsi, unsigned int position) {
+//
+// uint *cPsi = compressedPsi->cPsi;
+// uint samplen = compressedPsi->samplen;
+// uint *bposS = compressedPsi->bposS;
+// uint bplen = compressedPsi->bplen;
+// uint pslen = compressedPsi->pslen;
+// THuff *Hacc = &compressedPsi->Hacc;
+// THuff *Hlen = &compressedPsi->Hlen;
+//
+// uint sampj,cptr,val,dval,rlen,head,hlen;
+//
+// sampj = (position/samplen)*samplen;
+// cptr = bitread(bposS,(sampj/samplen)*bplen,bplen);
+// head = cptr;
+// val = bitread(cPsi,head,pslen);
+// head += pslen;
+//
+// while (sampj < position) {
+//
+// head = decodeHuff(Hacc,&dval,cPsi,head);
+//
+// if (dval == 0) {
+//
+// val = bitread(cPsi,head,pslen);
+// head += pslen;
+// sampj++;
+// }
+// else
+// if (dval == 1) {
+// head = decodeHuff(Hlen,&rlen,cPsi,head);
+// rlen++;
+// if (sampj + rlen >= position) return val + (position-sampj);
+// val += rlen;
+// sampj += rlen;
+// }
+// else {
+// val += dval;
+// sampj++;
+// }
+//
+// }
+//
+// return val;
+//
+//}
+//
+//
+//void storeGonzaloCompressedPsi(GonzaloCompressedPsi *compressedPsi, char *filename) {
+//
+// int file;
+// THuff Hacc;
+// THuff Hlen;
+//
+// if( (file = open(filename, O_WRONLY|O_CREAT, 0700)) < 0) {
+// printf("Cannot open file %s\n", filename);
+// exit(0);
+// }
+//
+// // Copias locales dos arboles de HUFFMAN
+// Hacc = compressedPsi->Hacc;
+// Hlen = compressedPsi->Hlen;
+//
+// write(file, &(compressedPsi->links), sizeof(int));
+// write(file, &(compressedPsi->totexc), sizeof(int));
+// write(file, &(compressedPsi->samplen), sizeof(int));
+// write(file, &(compressedPsi->bplen), sizeof(int));
+// write(file, &(compressedPsi->pslen), sizeof(int));
+// // Almacenar o arbol de huffman principal
+// write(file, &Hacc.max, sizeof(int));
+// write(file, &Hacc.lim, sizeof(int));
+// write(file, &Hacc.depth, sizeof(int));
+//// write(file, Hacc.s.spos, (Hacc.lim+1)*sizeof(int));
+// write(file, Hacc.s.symb, (Hacc.lim+1)*sizeof(int));
+// write(file, Hacc.num, (Hacc.depth+1)*sizeof(int));
+// write(file, Hacc.fst, (Hacc.depth+1)*sizeof(int));
+// // Fin de almacenar o arbol de huffman principal
+// // Almacenar o arbol de huffman das lonxitudes dos runs
+// write(file, &Hlen.max, sizeof(int));
+// write(file, &Hlen.lim, sizeof(int));
+// write(file, &Hlen.depth, sizeof(int));
+//// write(file, Hlen.s.spos, (Hlen.lim+1)*sizeof(int));
+// write(file, Hlen.s.symb, (Hlen.lim+1)*sizeof(int));
+// write(file, Hlen.num, (Hlen.depth+1)*sizeof(int));
+// write(file, Hlen.fst, (Hlen.depth+1)*sizeof(int));
+// // Fin de almacenar o arbol de huffman das lonxitudes dos runs
+// write(file, compressedPsi->bposS, ((compressedPsi->bplen*(1+compressedPsi->links/compressedPsi->samplen)+W-1)/W)*sizeof(uint));
+// write(file, compressedPsi->cPsi, ((Hacc.total+Hlen.total+(1+compressedPsi->links/compressedPsi->samplen+compressedPsi->totexc)*compressedPsi->pslen+W-1)/W)*sizeof(int));
+//
+// close(file);
+//
+//}
+//
+//
+//GonzaloCompressedPsi loadGonzaloCompressedPsi(char *filename) {
+//
+// GonzaloCompressedPsi compressedPsi;
+//
+// THuff Hacc;
+// THuff Hlen;
+//
+// int file;
+//
+// if( (file = open(filename, O_RDONLY)) < 0) {
+// printf("Cannot read file %s\n", filename);
+// exit(0);
+// }
+// read(file, &(compressedPsi.links), sizeof(int));
+// read(file, &(compressedPsi.totexc), sizeof(int));
+// read(file, &(compressedPsi.samplen), sizeof(int));
+// read(file, &(compressedPsi.bplen), sizeof(int));
+// read(file, &(compressedPsi.pslen), sizeof(int));
+// // Cargamos o arbol de Huffman principal
+// read(file, &Hacc.max, sizeof(int));
+// read(file, &Hacc.lim, sizeof(int));
+// read(file, &Hacc.depth, sizeof(int));
+// //Hacc.s.spos = (unsigned int *) malloc((Hacc.lim+1)*sizeof(int));
+// Hacc.s.symb = (unsigned int *) malloc((Hacc.lim+1)*sizeof(int));
+// Hacc.num = (unsigned int *) malloc((Hacc.depth+1)*sizeof(int));
+// Hacc.fst = (unsigned int *) malloc((Hacc.depth+1)*sizeof(int));
+// //read(file, Hacc.s.spos, (Hacc.lim+1)*sizeof(int));
+// read(file, Hacc.s.symb, (Hacc.lim+1)*sizeof(int));
+// read(file, Hacc.num, (Hacc.depth+1)*sizeof(int));
+// read(file, Hacc.fst, (Hacc.depth+1)*sizeof(int));
+// compressedPsi.Hacc = Hacc;
+// // Fin da carga do arbol de Huffman principal
+// // Cargamos o arbol de Huffman coas lonxitudes dos runs
+// read(file, &Hlen.max, sizeof(int));
+// read(file, &Hlen.lim, sizeof(int));
+// read(file, &Hlen.depth, sizeof(int));
+// //Hlen.s.spos = (unsigned int *) malloc((Hlen.lim+1)*sizeof(int));
+// Hlen.s.symb = (unsigned int *) malloc((Hlen.lim+1)*sizeof(int));
+// Hlen.num = (unsigned int *) malloc((Hlen.depth+1)*sizeof(int));
+// Hlen.fst = (unsigned int *) malloc((Hlen.depth+1)*sizeof(int));
+// //read(file, Hlen.s.spos, (Hlen.lim+1)*sizeof(int));
+// read(file, Hlen.s.symb, (Hlen.lim+1)*sizeof(int));
+// read(file, Hlen.num, (Hlen.depth+1)*sizeof(int));
+// read(file, Hlen.fst, (Hlen.depth+1)*sizeof(int));
+// compressedPsi.Hlen = Hlen;
+// // Fin da carga do arbol de Huffman coas lonxitudes dos runs
+// compressedPsi.bposS = (uint *) malloc (((compressedPsi.bplen*(1+compressedPsi.links/compressedPsi.samplen)+W-1)/W)*sizeof(uint));
+// read(file, compressedPsi.bposS, ((compressedPsi.bplen*(1+compressedPsi.links/compressedPsi.samplen)+W-1)/W)*sizeof(uint));
+// compressedPsi.cPsi = (uint *) malloc (((Hacc.total+Hlen.total+(1+compressedPsi.links/compressedPsi.samplen+compressedPsi.totexc)*compressedPsi.pslen+W-1)/W)*sizeof(uint));
+// read(file, compressedPsi.cPsi, ((Hacc.total+Hlen.total+(1+compressedPsi.links/compressedPsi.samplen+compressedPsi.totexc)*compressedPsi.pslen+W-1)/W)*sizeof(uint));
+//
+// close(file);
+//
+// return compressedPsi;
+//
+//}
--- /dev/null
+#include "psiHuffmanRLE.h"
+
+// FUNCTION IMPLEMENTATIONS
+
+/*
+ * Releases everything owned by a HuffmanCompressedPsi: the Huffman structure
+ * (through freeHuff) and the samples, samplePointers and stream arrays.
+ * The structure itself is not freed.
+ */
+void destroyHuffmanCompressedPsi(HuffmanCompressedPsi *compressedPsi) {
+
+	HuffmanCompressedPsi *psi = compressedPsi;
+
+	freeHuff(psi->diffsHT);
+
+	free(psi->samples);
+	free(psi->samplePointers);
+	free(psi->stream);
+}
+
+/*
+ * Allocates `count` zero-initialised 32-bit words. At least one word is
+ * always allocated so the result is never NULL, even for count == 0.
+ * Prints an error and terminates the process if the allocation fails.
+ */
+static unsigned int *psiAllocZeroedWords(unsigned int count) {
+	unsigned int *words = (unsigned int *) calloc(count ? count : 1, sizeof(unsigned int));
+	if (words == NULL) {
+		fprintf(stderr, "Could not allocate %u words for compressed Psi\n", count);
+		exit(1);
+	}
+	return words;
+}
+
+/*
+ * Builds the Huffman + run-length compressed representation of Psi.
+ *
+ *   Psi     : the original Psi function (psiSize entries)
+ *   psiSize : number of entries in Psi
+ *   T       : sampling period; every T-th entry is stored verbatim as a sample
+ *   nS      : number of symbols of the Huffman alphabet. The top 64 symbols
+ *             are escapes (32 for negative differences, 32 for large ones)
+ *             and the T symbols below them encode run lengths.
+ *
+ * Returns a HuffmanCompressedPsi that owns newly allocated samples,
+ * samplePointers and stream arrays plus the Huffman tree; release it with
+ * destroyHuffmanCompressedPsi().
+ *
+ * NOTE(review): nS is presumably expected to be at least 64 + T; a smaller
+ * value makes the unsigned range boundaries below wrap around - confirm
+ * against callers.
+ */
+HuffmanCompressedPsi huffmanCompressPsi(unsigned int *Psi, unsigned int psiSize, unsigned int T, unsigned int nS) {
+
+	HuffmanCompressedPsi cPsi;
+
+	int absolute_value;
+	register unsigned int index, ptr, samplesPtr, samplePointersPtr;
+	unsigned int runLenght, binaryLenght;
+
+	int *diffs;
+	unsigned int *huffmanDst;
+
+	// Pieces of the compressed function (assigned to cPsi at the end)
+	THuff diffsHT;
+	unsigned int numberOfSamples;
+	unsigned int *samples;
+	unsigned int sampleSize;
+	unsigned int *samplePointers;
+	unsigned int pointerSize;
+	unsigned int *stream;
+	unsigned int streamSize;
+
+	// Sizes, in 32-bit words, of the three bit arrays
+	unsigned int samplesWords, pointersWords, streamWords;
+
+	// Boundaries of the symbol ranges inside the frequency vector
+	unsigned int runLenghtStart = nS - 64 - T; 	// Start of the run-length symbols
+	unsigned int negStart = nS - 64;			// Start of the negative escapes
+	unsigned int bigStart = nS - 32;			// Start of the large-value escapes (>= runLenghtStart)
+
+	// For statistics
+	unsigned int totalSize;
+
+	// Frequency table of the symbols, all counters starting at zero
+	huffmanDst = psiAllocZeroedWords(nS);
+
+	// Differences between consecutive Psi values
+	diffs = (int *) psiAllocZeroedWords(psiSize);
+	diffs[0] = 0;
+	for(index=1; index<psiSize; index++)
+		diffs[index] = Psi[index] - Psi[index-1];
+
+	// First pass: compute the frequency distribution of the symbols
+	runLenght = 0;
+	for(index=0; index<psiSize; index++) {
+
+		if(index%T) {
+
+			if(diffs[index]==1) {
+				runLenght++;
+			} else {	// Not inside a run
+				if(runLenght) {
+					huffmanDst[runLenght+runLenghtStart]++;
+					runLenght = 0;
+				}
+				if(diffs[index]>1 && diffs[index]<runLenghtStart)
+					huffmanDst[diffs[index]]++;
+				else
+					if(diffs[index]<0) {	// Negative value
+						absolute_value = -diffs[index];
+						binaryLenght = bits(absolute_value);
+						huffmanDst[binaryLenght+negStart-1]++;
+					} else {				// Large value (>= runLenghtStart)
+						absolute_value = diffs[index];
+						binaryLenght = bits(absolute_value);
+						huffmanDst[binaryLenght+bigStart-1]++;
+					}
+			}
+
+		} else { // A sample position breaks any run in progress
+			if(runLenght) {
+				huffmanDst[runLenght+runLenghtStart]++;
+				runLenght = 0;
+			}
+		}
+
+	}
+
+	if(runLenght) huffmanDst[runLenght+runLenghtStart]++;
+
+	// Build the Huffman tree
+	diffsHT = createHuff(huffmanDst,nS-1,UNSORTED);
+
+	// Total number of bits of the Huffman + RLE sequence: the Huffman codes
+	// plus the explicit binary payload that follows every escape symbol
+	streamSize = diffsHT.total;
+	for(index=negStart;index<bigStart;index++)
+		streamSize += huffmanDst[index]*(index-negStart+1);	// Negatives
+	for(index=bigStart;index<nS;index++)
+		streamSize += huffmanDst[index]*(index-bigStart+1);		// Large values
+
+	// Number of samples and bits used by each sample and each pointer
+	numberOfSamples = (psiSize+T-1)/T;
+	sampleSize = bits(psiSize);
+	pointerSize = bits(streamSize);
+
+	// Allocate the sequence, the samples and the pointers. The arrays are
+	// fully zeroed because bitwrite() reads the destination word before
+	// updating it, and so that unused trailing bits are deterministic.
+	samplesWords = (numberOfSamples*sampleSize+31)/32;
+	pointersWords = (numberOfSamples*pointerSize+31)/32;
+	streamWords = (streamSize+31)/32;
+	samples = psiAllocZeroedWords(samplesWords);
+	samplePointers = psiAllocZeroedWords(pointersWords);
+	stream = psiAllocZeroedWords(streamWords);
+
+	// Second pass: compress sequentially, keeping a bit pointer into stream
+	ptr = 0;
+	samplesPtr = 0;
+	samplePointersPtr = 0;
+	runLenght = 0;
+	for(index=0; index<psiSize; index++) {
+
+		if(index%T) {
+
+			if(diffs[index]==1) {
+				runLenght++;
+			} else {	// Not inside a run
+				if(runLenght) {
+					ptr = encodeHuff(diffsHT,runLenght+runLenghtStart,stream,ptr);
+					runLenght = 0;
+				}
+				if(diffs[index]>1 && diffs[index]<runLenghtStart) {
+					ptr = encodeHuff(diffsHT,diffs[index],stream,ptr);
+				}
+				else
+					if(diffs[index]<0) {	// Negative value
+						absolute_value = -diffs[index];
+						binaryLenght = bits(absolute_value);
+						ptr = encodeHuff(diffsHT,binaryLenght+negStart-1,stream,ptr);
+						bitwrite(stream,ptr,binaryLenght,absolute_value);
+						ptr += binaryLenght;
+					} else {				// Large value (>= runLenghtStart)
+						absolute_value = diffs[index];
+						binaryLenght = bits(absolute_value);
+						ptr = encodeHuff(diffsHT,binaryLenght+bigStart-1,stream,ptr);
+						bitwrite(stream,ptr,binaryLenght,absolute_value);
+						ptr += binaryLenght;
+					}
+			}
+
+		} else { // A sample position breaks any run in progress
+			if(runLenght) {
+				ptr = encodeHuff(diffsHT,runLenght+runLenghtStart,stream,ptr);
+				runLenght = 0;
+			}
+			// Store the absolute Psi value and where its block starts in stream
+			bitwrite(samples,samplesPtr,sampleSize,Psi[index]);
+			samplesPtr += sampleSize;
+			bitwrite(samplePointers,samplePointersPtr,pointerSize,ptr);
+			samplePointersPtr += pointerSize;
+		}
+
+	}
+
+	if(runLenght) {
+		ptr = encodeHuff(diffsHT,runLenght+runLenghtStart,stream,ptr);
+	}
+
+	// Report the space used
+	totalSize = sizeof(HuffmanCompressedPsi) +
+		sizeof(int)*samplesWords +
+		sizeof(int)*pointersWords +
+		sizeof(int)*streamWords + sizeHuff(diffsHT);
+
+	printf("\n\t Compressed Psi size = %u bytes, with %u different symbols.", totalSize, nS);
+
+	// Required before decoding
+	prepareToDecode(&diffsHT);
+
+	// Fill in cPsi and return it
+	cPsi.T = T;
+	cPsi.diffsHT = diffsHT;
+	cPsi.nS = nS;
+	cPsi.numberOfSamples = numberOfSamples;
+	cPsi.samples = samples;
+	cPsi.sampleSize = sampleSize;
+	cPsi.samplePointers = samplePointers;
+	cPsi.pointerSize = pointerSize;
+	cPsi.stream = stream;
+	cPsi.streamSize = streamSize;
+	cPsi.totalMem = totalSize;
+
+	//frees resources not needed in advance
+	free(diffs);
+	free(huffmanDst);
+
+	//returns the data structure that holds the compressed psi.
+	return cPsi;
+}
+
+
+/*
+ * Returns Psi[position] from the compressed representation.
+ *
+ * Reads the sample that precedes `position` and its pointer into the
+ * stream, then decodes symbols and accumulates them until `position` is
+ * reached. Symbols fall in four ranges: direct increments, run lengths,
+ * negative escapes and large-value escapes; both escapes are followed in
+ * the stream by the absolute value written in binary.
+ */
+unsigned int getHuffmanPsiValue(HuffmanCompressedPsi *cPsi, unsigned int position) {
+
+	unsigned int runLenghtStart = cPsi->nS - 64 - cPsi->T;
+	unsigned int negStart = cPsi->nS - 64;
+	unsigned int bigStart = cPsi->nS - 32;
+
+	unsigned int block = position / cPsi->T;     /* which sample to start from   */
+	unsigned int value = bitread(cPsi->samples, block*cPsi->sampleSize, cPsi->sampleSize);
+	unsigned int cursor = bitread(cPsi->samplePointers, block*cPsi->pointerSize, cPsi->pointerSize);
+	unsigned int target = position % cPsi->T;    /* entries to advance past it   */
+	unsigned int done = 0;                       /* entries advanced so far      */
+	unsigned int symbol, width, run, magnitude;
+
+	while (done < target) {
+
+		cursor = decodeHuff(&cPsi->diffsHT, &symbol, cPsi->stream, cursor);
+
+		if (symbol < runLenghtStart) {           /* direct increment */
+			value += symbol;
+			done++;
+			continue;
+		}
+
+		if (symbol < negStart) {                 /* run of +1 increments */
+			run = symbol - runLenghtStart;
+			if (done + run >= target)
+				return value + target - done;    /* target lies inside the run */
+			value += run;
+			done += run;
+			continue;
+		}
+
+		if (symbol < bigStart) {                 /* negative difference */
+			width = symbol - negStart + 1;
+			magnitude = bitread(cPsi->stream, cursor, width);
+			cursor += width;
+			value -= magnitude;
+		} else {                                 /* large positive difference */
+			width = symbol - bigStart + 1;
+			magnitude = bitread(cPsi->stream, cursor, width);
+			cursor += width;
+			value += magnitude;
+		}
+		done++;
+	}
+
+	return value;
+
+}
+
+
+/*
+ * Writes exactly `len` bytes from `buf` to `fd`, retrying after short
+ * writes. Prints an error and terminates the process if the data cannot
+ * be written.
+ */
+static void psiWriteAll(int fd, const void *buf, size_t len, const char *filename) {
+	const char *cursor = (const char *) buf;
+	while (len > 0) {
+		ssize_t written = write(fd, cursor, len);
+		if (written <= 0) {
+			fprintf(stderr, "Cannot write file %s\n", filename);
+			exit(1);
+		}
+		cursor += written;
+		len -= (size_t) written;
+	}
+}
+
+/*
+ * Saves a compressed Psi to `filename`.
+ *
+ * Layout, in order: T, the Huffman tree (max, lim, depth, s.symb, num,
+ * fst), nS, numberOfSamples, sampleSize, samples, pointerSize,
+ * samplePointers, streamSize, stream and totalMem. This is the order
+ * loadHuffmanCompressedPsi() reads back.
+ *
+ * The file is created if needed and truncated if it already exists, so no
+ * bytes of a previous, longer file survive. Any failure to open or write
+ * terminates the process with a non-zero status.
+ */
+void storeHuffmanCompressedPsi(HuffmanCompressedPsi *compressedPsi, char *filename) {
+
+	int file;
+	THuff H;
+	unsigned int samplesWords, pointersWords, streamWords;
+
+	if( (file = open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0700)) < 0) {
+		printf("Cannot open file %s\n", filename);
+		exit(1);
+	}
+
+	// Sizes, in 32-bit words, of the three bit arrays
+	samplesWords = (compressedPsi->numberOfSamples*compressedPsi->sampleSize+31)/32;
+	pointersWords = (compressedPsi->numberOfSamples*compressedPsi->pointerSize+31)/32;
+	streamWords = (compressedPsi->streamSize+31)/32;
+
+	psiWriteAll(file, &(compressedPsi->T), sizeof(int), filename);
+
+	// Huffman tree (only s.symb is stored for H.s)
+	H = compressedPsi->diffsHT;
+	psiWriteAll(file, &H.max, sizeof(int), filename);
+	psiWriteAll(file, &H.lim, sizeof(int), filename);
+	psiWriteAll(file, &H.depth, sizeof(int), filename);
+	psiWriteAll(file, H.s.symb, (H.lim+1)*sizeof(int), filename);
+	psiWriteAll(file, H.num, (H.depth+1)*sizeof(int), filename);
+	psiWriteAll(file, H.fst, (H.depth+1)*sizeof(int), filename);
+	// End of the Huffman tree
+
+	psiWriteAll(file, &(compressedPsi->nS), sizeof(int), filename);
+	psiWriteAll(file, &(compressedPsi->numberOfSamples), sizeof(int), filename);
+	psiWriteAll(file, &(compressedPsi->sampleSize), sizeof(int), filename);
+	psiWriteAll(file, compressedPsi->samples, samplesWords*sizeof(int), filename);
+	psiWriteAll(file, &(compressedPsi->pointerSize), sizeof(int), filename);
+	psiWriteAll(file, compressedPsi->samplePointers, pointersWords*sizeof(int), filename);
+	psiWriteAll(file, &(compressedPsi->streamSize), sizeof(int), filename);
+	psiWriteAll(file, compressedPsi->stream, streamWords*sizeof(int), filename);
+	psiWriteAll(file, &(compressedPsi->totalMem), sizeof(int), filename);
+
+	close(file);
+
+}
+
+
+/*
+ * Reads exactly `len` bytes from `fd` into `buf`, retrying after short
+ * reads. Prints an error and terminates the process on a read error or
+ * when the file ends early.
+ */
+static void psiReadAll(int fd, void *buf, size_t len, const char *filename) {
+	char *cursor = (char *) buf;
+	while (len > 0) {
+		ssize_t got = read(fd, cursor, len);
+		if (got <= 0) {
+			fprintf(stderr, "Cannot read file %s (truncated or unreadable)\n", filename);
+			exit(1);
+		}
+		cursor += got;
+		len -= (size_t) got;
+	}
+}
+
+/*
+ * Allocates `bytes` bytes (at least one, so the result is never NULL) and
+ * terminates the process if the allocation fails.
+ */
+static void *psiLoadAlloc(size_t bytes) {
+	void *mem = malloc(bytes ? bytes : 1);
+	if (mem == NULL) {
+		fprintf(stderr, "Could not allocate %lu bytes\n", (unsigned long) bytes);
+		exit(1);
+	}
+	return mem;
+}
+
+/*
+ * Loads a compressed Psi previously written by storeHuffmanCompressedPsi().
+ *
+ * Fields are read in the order they were stored: T, the Huffman tree (max,
+ * lim, depth, s.symb, num, fst), nS, numberOfSamples, sampleSize, samples,
+ * pointerSize, samplePointers, streamSize, stream and totalMem. The arrays
+ * are freshly allocated and owned by the returned structure.
+ *
+ * Any failure to open the file, read the expected number of bytes or
+ * allocate memory terminates the process with a non-zero status.
+ *
+ * NOTE(review): only the THuff members read below are initialised; any
+ * other member of the tree keeps an indeterminate value - confirm that
+ * nothing uses them after a load.
+ */
+HuffmanCompressedPsi loadHuffmanCompressedPsi(char *filename) {
+
+	HuffmanCompressedPsi compressedPsi;
+
+	THuff H;
+
+	int file;
+
+	unsigned int samplesWords, pointersWords, streamWords;
+
+	if( (file = open(filename, O_RDONLY)) < 0) {
+		printf("Cannot read file %s\n", filename);
+		exit(1);
+	}
+	psiReadAll(file, &(compressedPsi.T), sizeof(int), filename);
+
+	// Huffman tree
+	psiReadAll(file, &H.max, sizeof(int), filename);
+	psiReadAll(file, &H.lim, sizeof(int), filename);
+	psiReadAll(file, &H.depth, sizeof(int), filename);
+	H.s.symb = (unsigned int *) psiLoadAlloc((H.lim+1)*sizeof(int));
+	H.num = (unsigned int *) psiLoadAlloc((H.depth+1)*sizeof(int));
+	H.fst = (unsigned int *) psiLoadAlloc((H.depth+1)*sizeof(int));
+
+	psiReadAll(file, H.s.symb, (H.lim+1)*sizeof(int), filename);
+	psiReadAll(file, H.num, (H.depth+1)*sizeof(int), filename);
+	psiReadAll(file, H.fst, (H.depth+1)*sizeof(int), filename);
+	compressedPsi.diffsHT = H;
+	// End of the Huffman tree
+
+	psiReadAll(file, &(compressedPsi.nS), sizeof(int), filename);
+	psiReadAll(file, &(compressedPsi.numberOfSamples), sizeof(int), filename);
+
+	psiReadAll(file, &(compressedPsi.sampleSize), sizeof(int), filename);
+	samplesWords = (compressedPsi.numberOfSamples*compressedPsi.sampleSize+31)/32;
+	compressedPsi.samples = (unsigned int *) psiLoadAlloc(samplesWords*sizeof(int));
+	psiReadAll(file, compressedPsi.samples, samplesWords*sizeof(int), filename);
+
+	psiReadAll(file, &(compressedPsi.pointerSize), sizeof(int), filename);
+	pointersWords = (compressedPsi.numberOfSamples*compressedPsi.pointerSize+31)/32;
+	compressedPsi.samplePointers = (unsigned int *) psiLoadAlloc(pointersWords*sizeof(int));
+	psiReadAll(file, compressedPsi.samplePointers, pointersWords*sizeof(int), filename);
+
+	psiReadAll(file, &(compressedPsi.streamSize), sizeof(int), filename);
+	streamWords = (compressedPsi.streamSize+31)/32;
+	compressedPsi.stream = (unsigned int *) psiLoadAlloc(streamWords*sizeof(int));
+	psiReadAll(file, compressedPsi.stream, streamWords*sizeof(int), filename);
+
+	psiReadAll(file, &(compressedPsi.totalMem), sizeof(int), filename);
+
+	close(file);
+
+	return compressedPsi;
+
+}
--- /dev/null
+#include <stdlib.h>
+#include <stdio.h>
+#include <malloc.h>
+#include "../utils/huff.h"
+
+/*
+Compression of PSI using differential (incremental) coding plus RLE for the runs.
+
+Huffman codes are used, and four groups of symbols are distinguished.
+
+G1: Frequent increments: from 2 up to Total - 32 negatives - 32 large values - (maximum run length == sampling period - 1)
+G2: Symbols meaning that there is a run, together with its length (sampling-period symbols)
+G3: Negative numbers (32 escape symbols, each representing the length of the binary representation of the absolute value of the negative number)
+G4: Large numbers, greater than those in G1 (32 escape symbols, again representing the length of the binary representation of the value)
+
+G1 symbols are obtained directly after Huffman decoding.
+G2 symbols are expanded into a run of the length obtained after Huffman decoding.
+G3 symbols are followed by the binary representation of the absolute value of the number.
+G4 symbols are followed by the binary representation of the number.
+*/
+
+// COMPRESSED PSI STRUCTURE
+// Holds everything needed to decode Psi: the sampling period, the Huffman
+// tree, the sampled values, the pointers from each sample into the stream,
+// and the Huffman + RLE encoded stream itself.
+typedef struct {
+	unsigned int T;					// Sampling period of PSI
+	THuff diffsHT;					// Huffman tree (encodes stream)
+	unsigned int nS;				// Number of symbols for Huffman
+	unsigned int numberOfSamples;	// Number of sampled Psi values
+	unsigned int sampleSize;		// Bits used by each sample
+	unsigned int *samples;			// Array of samples
+	unsigned int pointerSize;		// Bits used by each pointer
+	unsigned int *samplePointers;	// Pointers from the samples into stream
+	unsigned int streamSize;		// Bits used by stream
+	unsigned int *stream;			// Huffman + RLE sequence
+	unsigned int totalMem;			// the size in bytes used;
+} HuffmanCompressedPsi;
+
+
+// FUNCTION PROTOTYPES
+
+// Builds the compressed Psi structures:
+//
+//	Psi: the original Psi function
+//	psiSize: number of elements of Psi
+//	T: sampling period in Psi
+//	nS: number of symbols that will be used in the Huffman tree
+//
+// Returns a HuffmanCompressedPsi structure that represents the compressed Psi
+HuffmanCompressedPsi huffmanCompressPsi(unsigned int *Psi, unsigned int psiSize, unsigned int T, unsigned int nS);
+
+// Gets one value of Psi
+//
+//	cPsi: the structure that represents the compressed Psi
+//	position: the position whose Psi value is wanted
+unsigned int getHuffmanPsiValue(HuffmanCompressedPsi *cPsi, unsigned int position);
+
+// Saves the compressed Psi to the file `filename` / loads it back from it.
+void storeHuffmanCompressedPsi(HuffmanCompressedPsi *compressedPsi, char *filename);
+HuffmanCompressedPsi loadHuffmanCompressedPsi(char *filename);
+
+//frees the memory used.
+void destroyHuffmanCompressedPsi(HuffmanCompressedPsi *compressedPsi);
+
+
+//
+//// IMPLEMENTACIÓN DAS FUNCIÓNS
+//
+//void destroyHuffmanCompressedPsi(HuffmanCompressedPsi *compressedPsi) {
+// freeHuff(compressedPsi->diffsHT);
+// free(compressedPsi->samples);
+// free(compressedPsi->samplePointers);
+// free (compressedPsi->stream);
+//}
+//
+//HuffmanCompressedPsi huffmanCompressPsi(unsigned int *Psi, unsigned int psiSize, unsigned int T, unsigned int nS) {
+//
+// HuffmanCompressedPsi cPsi;
+//
+// int absolute_value;
+// register unsigned int index, ptr, samplesPtr, samplePointersPtr;
+// unsigned int runLenght, binaryLenght;
+//
+// int *diffs;
+// unsigned int *huffmanDst;
+//
+// // Estructuras da funcion comprimida (para logo asignar)
+// // Tamén se podian almacenar directamente
+// THuff diffsHT;
+// unsigned int numberOfSamples;
+// unsigned int *samples;
+// unsigned int sampleSize;
+// unsigned int *samplePointers;
+// unsigned int pointerSize;
+// unsigned int *stream;
+// unsigned int streamSize;
+//
+// // Variables que marcan os intervalos dentro do vector de frecuencias
+// unsigned int runLenghtStart = nS - 64 - T; // Inicio das Runs
+// unsigned int negStart = nS - 64; // Inicio dos Negativos
+// unsigned int bigStart = nS - 32; // Inicio dos Grandes (>runLenghtStart)
+//
+// // Para estadistica
+// unsigned int totalSize;
+//
+// // Reservamos espacio para a distribución de valores de Psi
+// huffmanDst = (unsigned int *)malloc(sizeof(int)*nS);
+// for(index=0;index<nS;index++) huffmanDst[index]=0;
+//
+// // Inicializamos diferencias
+// diffs = (int *)malloc(sizeof(int)*psiSize);
+// diffs[0] = 0;
+// for(index=1; index<psiSize; index++)
+// diffs[index] = Psi[index] - Psi[index-1];
+//
+// // Calculamos a distribucion de frecuencias
+// runLenght = 0;
+// for(index=0; index<psiSize; index++) {
+//
+// if(index%T) {
+//
+// if(diffs[index]==1) {
+// runLenght++;
+// } else { // Non estamos nun run
+// if(runLenght) {
+// huffmanDst[runLenght+runLenghtStart]++;
+// runLenght = 0;
+// }
+// if(diffs[index]>1 && diffs[index]<runLenghtStart)
+// huffmanDst[diffs[index]]++;
+// else
+// if(diffs[index]<0) { // Valor negativo
+// absolute_value = -diffs[index];
+// binaryLenght = bits(absolute_value);
+// huffmanDst[binaryLenght+negStart-1]++;
+// } else { // Valor grande >= 128
+// absolute_value = diffs[index];
+// binaryLenght = bits(absolute_value);
+// huffmanDst[binaryLenght+bigStart-1]++;
+// }
+// }
+//
+// } else { // Rompemos o run porque atopamos unha mostra
+// if(runLenght) {
+// huffmanDst[runLenght+runLenghtStart]++;
+// runLenght = 0;
+// }
+// }
+//
+// }
+//
+// if(runLenght) huffmanDst[runLenght+runLenghtStart]++;
+//
+// // Creamos o arbol de Huffman
+// diffsHT = createHuff(huffmanDst,nS-1,UNSORTED);
+//
+// // Calculamos o espacio total ocupado pola secuencia Huffman + RLE
+// streamSize = diffsHT.total;
+// for(index=negStart;index<bigStart;index++)
+// streamSize += huffmanDst[index]*(index-negStart+1); // Negativos
+// for(index=bigStart;index<nS;index++)
+// streamSize += huffmanDst[index]*(index-bigStart+1); // Grandes
+//
+// // Calculamos o numero de mostras e o espacio ocupado por cada mostra e por cada punteiro
+// numberOfSamples = (psiSize+T-1)/T;
+// sampleSize = bits(psiSize);
+// pointerSize = bits(streamSize);
+//
+// // Reservamos espacio para a secuencia e para as mostras e punteiros
+// samples = (unsigned int *)malloc(sizeof(int)*((numberOfSamples*sampleSize+31)/32));
+// samplePointers = (unsigned int *)malloc(sizeof(int)*((numberOfSamples*pointerSize+31)/32));
+// stream = (unsigned int *)malloc(sizeof(int)*((streamSize+31)/32));
+//
+// // Comprimimos secuencialmente (haberá que levar un punteiro desde o inicio)
+// ptr = 0;
+// samplesPtr = 0;
+// samplePointersPtr = 0;
+// runLenght = 0;
+// for(index=0; index<psiSize; index++) {
+//
+// if(index%T) {
+//
+// if(diffs[index]==1) {
+// runLenght++;
+// } else { // Non estamos nun run
+// if(runLenght) {
+// ptr = encodeHuff(diffsHT,runLenght+runLenghtStart,stream,ptr);
+// runLenght = 0;
+// }
+// if(diffs[index]>1 && diffs[index]<runLenghtStart) {
+// ptr = encodeHuff(diffsHT,diffs[index],stream,ptr);
+// }
+// else
+// if(diffs[index]<0) { // Valor negativo
+// absolute_value = -diffs[index];
+// binaryLenght = bits(absolute_value);
+// ptr = encodeHuff(diffsHT,binaryLenght+negStart-1,stream,ptr);
+// bitwrite(stream,ptr,binaryLenght,absolute_value);
+// ptr += binaryLenght;
+// } else { // Valor grande >= 128
+// absolute_value = diffs[index];
+// binaryLenght = bits(absolute_value);
+// ptr = encodeHuff(diffsHT,binaryLenght+bigStart-1,stream,ptr);
+// bitwrite(stream,ptr,binaryLenght,absolute_value);
+// ptr += binaryLenght;
+// }
+// }
+//
+// } else { // Rompemos o run porque atopamos unha mostra
+// if(runLenght) {
+// ptr = encodeHuff(diffsHT,runLenght+runLenghtStart,stream,ptr);
+// runLenght = 0;
+// }
+// bitwrite(samples,samplesPtr,sampleSize,Psi[index]);
+// samplesPtr += sampleSize;
+// bitwrite(samplePointers,samplePointersPtr,pointerSize,ptr);
+// samplePointersPtr += pointerSize;
+// }
+//
+// }
+//
+// if(runLenght) {
+// ptr = encodeHuff(diffsHT,runLenght+runLenghtStart,stream,ptr);
+// }
+//
+// // Amosamos o espacio ocupado
+// totalSize = sizeof(int)*((numberOfSamples*sampleSize+31)/32) +
+// sizeof(int)*((numberOfSamples*pointerSize+31)/32) +
+// sizeof(int)*((streamSize+31)/32) + sizeHuff(diffsHT) +
+// 6*sizeof(int);
+// printf("Compressed Psi size = %d bytes\n", totalSize);
+//
+// // Necesario antes de decodificar
+// prepareToDecode(&diffsHT);
+//
+// // Asignamos os valores a cPsi e devolvemolo
+// cPsi.T = T;
+// cPsi.diffsHT = diffsHT;
+// cPsi.nS = nS;
+// cPsi.numberOfSamples = numberOfSamples;
+// cPsi.samples = samples;
+// cPsi.sampleSize = sampleSize;
+// cPsi.samplePointers = samplePointers;
+// cPsi.pointerSize = pointerSize;
+// cPsi.stream = stream;
+// cPsi.streamSize = streamSize;
+//
+// //frees resources not needed in advance
+// free(diffs);
+// free(huffmanDst);
+//
+// //returns the data structure that holds the compressed psi.
+// return cPsi;
+//}
+//
+//
+//unsigned int getHuffmanPsiValue(HuffmanCompressedPsi *cPsi, unsigned int position) {
+//
+// register unsigned int index;
+// unsigned int sampleIndex, ptr, psiValue, huffmanCode, positionsSinceSample;
+// unsigned int absolute_value, binaryLenght, runLenght;
+//
+// unsigned int runLenghtStart = cPsi->nS - 64 - cPsi->T;
+// unsigned int negStart = cPsi->nS - 64;
+// unsigned int bigStart = cPsi->nS - 32;
+//
+// sampleIndex = position / cPsi->T;
+// psiValue = bitread(cPsi->samples,sampleIndex*cPsi->sampleSize,cPsi->sampleSize);
+// ptr = bitread(cPsi->samplePointers,sampleIndex*cPsi->pointerSize,cPsi->pointerSize);
+//
+// positionsSinceSample = position%cPsi->T;
+//
+// for(index=0;index<positionsSinceSample;index++) {
+//
+// ptr = decodeHuff(&cPsi->diffsHT,&huffmanCode,cPsi->stream,ptr);
+//
+// if(huffmanCode < runLenghtStart) { // Incremento directo
+// psiValue += huffmanCode;
+// }
+// else
+// if(huffmanCode < negStart) { // Estamos nun run
+// runLenght = huffmanCode - runLenghtStart;
+// if(index+runLenght>=positionsSinceSample)
+// return psiValue+positionsSinceSample-index;
+// else {
+// psiValue += runLenght;
+// index += runLenght-1;
+// }
+// }
+// else
+// if(huffmanCode < bigStart) { // Negativo
+// binaryLenght = huffmanCode-negStart+1;
+// absolute_value = bitread(cPsi->stream,ptr,binaryLenght);
+// ptr += binaryLenght;
+// psiValue -= absolute_value;
+// }
+// else { // Grande
+// binaryLenght = huffmanCode-bigStart+1;
+// absolute_value = bitread(cPsi->stream,ptr,binaryLenght);
+// ptr += binaryLenght;
+// psiValue += absolute_value;
+// }
+//
+// }
+//
+// return psiValue;
+//
+//}
+//
+//
+//void storeHuffmanCompressedPsi(HuffmanCompressedPsi *compressedPsi, char *filename) {
+//
+// int file;
+// THuff H;
+//
+// if( (file = open(filename, O_WRONLY|O_CREAT, 0700)) < 0) {
+// printf("Cannot open file %s\n", filename);
+// exit(0);
+// }
+// write(file, &(compressedPsi->T), sizeof(int));
+// // Almacenar o arbol de huffman
+// H = compressedPsi->diffsHT;
+// write(file, &H.max, sizeof(int));
+// write(file, &H.lim, sizeof(int));
+// write(file, &H.depth, sizeof(int));
+//// write(file, H.s.spos, (H.lim+1)*sizeof(int));
+// write(file, H.s.symb, (H.lim+1)*sizeof(int));
+// write(file, H.num, (H.depth+1)*sizeof(int));
+// write(file, H.fst, (H.depth+1)*sizeof(int));
+// // Fin de almacenar o arbol de huffman
+// write(file, &(compressedPsi->nS), sizeof(int));
+// write(file, &(compressedPsi->numberOfSamples), sizeof(int));
+// write(file, &(compressedPsi->sampleSize), sizeof(int));
+// write(file, compressedPsi->samples, ((compressedPsi->numberOfSamples*compressedPsi->sampleSize+31)/32)*sizeof(int));
+// write(file, &(compressedPsi->pointerSize), sizeof(int));
+// write(file, compressedPsi->samplePointers, ((compressedPsi->numberOfSamples*compressedPsi->pointerSize+31)/32)*sizeof(int));
+// write(file, &(compressedPsi->streamSize), sizeof(int));
+// write(file, compressedPsi->stream, ((compressedPsi->streamSize+31)/32)*sizeof(int));
+//
+// close(file);
+//
+//}
+//
+//
+//HuffmanCompressedPsi loadHuffmanCompressedPsi(char *filename) {
+//
+// HuffmanCompressedPsi compressedPsi;
+//
+// THuff H;
+//
+// int file;
+//
+// if( (file = open(filename, O_RDONLY)) < 0) {
+// printf("Cannot read file %s\n", filename);
+// exit(0);
+// }
+// read(file, &(compressedPsi.T), sizeof(int));
+// // Cargamos o arbol de Huffman
+// read(file, &H.max, sizeof(int));
+// read(file, &H.lim, sizeof(int));
+// read(file, &H.depth, sizeof(int));
+// //H.s.spos = (unsigned int *) malloc((H.lim+1)*sizeof(int));
+// //H.s.spos =H.s.symb = (unsigned int *) malloc((H.lim+1)*sizeof(int));
+// H.s.symb = (unsigned int *) malloc((H.lim+1)*sizeof(int));
+// H.num = (unsigned int *) malloc((H.depth+1)*sizeof(int));
+// H.fst = (unsigned int *) malloc((H.depth+1)*sizeof(int));
+//
+// //read(file, H.s.spos, (H.lim+1)*sizeof(int));
+// fprintf(stderr," \n read %d spos bytes\n", (H.lim+1)*sizeof(int));
+// read(file, H.s.symb, (H.lim+1)*sizeof(int));
+//
+// read(file, H.num, (H.depth+1)*sizeof(int));
+// read(file, H.fst, (H.depth+1)*sizeof(int));
+// compressedPsi.diffsHT = H;
+// // Fin da carga do arbol de Huffman
+// read(file, &(compressedPsi.nS), sizeof(int));
+// read(file, &(compressedPsi.numberOfSamples), sizeof(int));
+// read(file, &(compressedPsi.sampleSize), sizeof(int));
+// compressedPsi.samples = (unsigned int *)malloc(((compressedPsi.numberOfSamples*compressedPsi.sampleSize+31)/32)*sizeof(int));
+// read(file, compressedPsi.samples, ((compressedPsi.numberOfSamples*compressedPsi.sampleSize+31)/32)*sizeof(int));
+// read(file, &(compressedPsi.pointerSize), sizeof(int));
+// compressedPsi.samplePointers = (unsigned int *)malloc(((compressedPsi.numberOfSamples*compressedPsi.pointerSize+31)/32)*sizeof(int));
+// read(file, compressedPsi.samplePointers, ((compressedPsi.numberOfSamples*compressedPsi.pointerSize+31)/32)*sizeof(int));
+// read(file, &(compressedPsi.streamSize), sizeof(int));
+// compressedPsi.stream = (unsigned int *)malloc(((compressedPsi.streamSize+31)/32)*sizeof(int));
+// read(file, compressedPsi.stream, ((compressedPsi.streamSize+31)/32)*sizeof(int));
+//
+// close(file);
+//
+// return compressedPsi;
+//
+//}
--- /dev/null
+
+/* General interface for using the compressed index libraries */
+
+#ifndef uchar
+#define uchar unsigned char
+#endif
+#ifndef uint
+#define uint unsigned int
+#endif
+#ifndef ulong
+#define ulong unsigned long
+#endif
+
+/* Error management */
+
+ /* Returns a string describing the error associated with error number
+ e. The string must not be freed, and it will be overwritten with
+ subsequent calls. */
+
+char *error_index (int e);
+
+/* Building the index */
+
+ /* Creates index from text[0..length-1]. Note that the index is an
+ opaque data type. Any build option must be passed in string
+ build_options, whose syntax depends on the index. The index must
+ always work with some default parameters if build_options is NULL.
+ The returned index is ready to be queried. */
+
+int build_index (uchar *text, ulong length, char *build_options, void **index);
+
+ /* Saves index on disk by using single or multiple files, having
+ proper extensions. */
+
+int save_index (void *index, char *filename);
+
+ /* Loads index from one or more file(s) named filename, possibly
+ adding the proper extensions. */
+
+int load_index (char *filename, void **index);
+
+ /* Frees the memory occupied by index. */
+
+int free_index (void *index);
+
+ /* Gives the memory occupied by index in bytes. */
+
+int index_size(void *index, ulong *size);
+
+/* Querying the index */
+
+ /* Writes in numocc the number of occurrences of the substring
+ pattern[0..length-1] found in the text indexed by index. */
+
+//int count (void *index, uchar *pattern, ulong length, ulong *numocc);
+//
+// /* Writes in numocc the number of occurrences of the substring
+// pattern[0..length-1] in the text indexed by index. It also allocates
+// occ (which must be freed by the caller) and writes the locations of
+// the numocc occurrences in occ, in arbitrary order. */
+//
+//int locate (void *index, uchar *pattern, ulong length, ulong **occ,
+// ulong *numocc);
+//
+// /* Gives the length of the text indexed */
+//
+//int get_length(void *index, ulong *length);
+//
+///* Accessing the indexed text */
+//
+// /* Allocates snippet (which must be freed by the caller) and writes
+// the substring text[from..to] into it. Returns in snippet_length the
+// length of the text snippet actually extracted (that could be less
+// than to-from+1 if to is larger than the text size). */
+//
+//int extract (void *index, ulong from, ulong to, uchar **snippet,
+// ulong *snippet_length);
+//
+// /* Displays the text (snippet) surrounding any occurrence of the
+// substring pattern[0..length-1] within the text indexed by index.
+// The snippet must include numc characters before and after the
+// pattern occurrence, totalizing length+2*numc characters, or less if
+// the text boundaries are reached. Writes in numocc the number of
+// occurrences, and allocates the arrays snippet_text and
+// snippet_lengths (which must be freed by the caller). The first is a
+// character array of numocc*(length+2*numc) characters, with a new
+// snippet starting at every multiple of length+2*numc. The second
+// gives the real length of each of the numocc snippets. */
+//
+//int display (void *index, uchar *pattern, ulong length, ulong numc,
+// ulong *numocc, uchar **snippet_text, ulong **snippet_lengths);
+
+ /* Obtains the length of the text indexed by index. */
+
+int length (void *index, ulong *length);
+
--- /dev/null
+\r
+/*-----------------------------------------------------------------------\r
+ File : MemoryManager.cpp\r
+ Function : Reserves large blocks of memory and gives pointers to small\r
+ portions of that block when requested.\r
+ This improves performance since a unique "LARGE ALLOCATION"\r
+ of memory is needed (a unique call to malloc).\r
+ It is also responsible of freeing memory.\r
+ Last change: 10/03/2004\r
+ Purpose : Improve hash performance.\r
+ ------------------------------------------------------------------------*/\r
+#include "MemoryManager.h"\r
+\r
+\r
+/*------------------------------------------------------------------\r
+ Constructor method\r
+ ------------------------------------------------------------------ */\r
+/*
+ * Creates a memory manager and allocates its first large block.
+ * Returns the new manager; terminates the process if the manager itself
+ * cannot be allocated.
+ */
+MemoryManager createMemoryManager(void) {
+	MemoryManager mm;
+	mm = (MemoryManager) malloc (sizeof(struct sMem));
+	if (mm == NULL) {
+		/* Without this check the assignments below would dereference NULL. */
+		fprintf(stderr, "\nERROR...\nUnable to allocate enough memory. Exitting...\n");
+		exit(1);
+	}
+	mm->currentBlock=0;
+	createNewMemoryBlock(mm);
+	return mm;
+}
+\r
+/*------------------------------------------------------------------\r
+ Destructor method\r
+ ------------------------------------------------------------------ */\r
+/*
+ * Frees every block held by the manager (indices 0..currentBlock), prints
+ * how many bytes were released, and finally frees the manager itself.
+ */
+void destroyMemoryManager (MemoryManager mm){
+	unsigned int blk = 0;
+
+	while (blk <= mm->currentBlock) {
+		free(mm->BLOCKS[blk]);
+		blk++;
+	}
+
+	printf("\n[destroying MemManager] ...Freed %u bytes... RAM", LARGE_BLOCK_SIZE* (mm->currentBlock+1));
+	free(mm);
+}
+\r
+/*------------------------------------------------------------------\r
+ createNewMemoryBlock method\r
+ Allocates a new memory block of size "LARGE_BLOCK_SIZE" and adds it to\r
+ vector BLOCKS\r
+ ------------------------------------------------------------------ */\r
+\r
+/*
+ * Allocates a block of LARGE_BLOCK_SIZE bytes, stores it in
+ * BLOCKS[currentBlock] and makes it the block that subsequent requests
+ * are served from. Terminates the process with a non-zero status when
+ * the BLOCKS table is already full or when the allocation fails.
+ */
+void createNewMemoryBlock (MemoryManager mm) {
+	/* BLOCKS has MAX_BLOCKS slots; writing past it would corrupt the manager. */
+	if (mm->currentBlock >= MAX_BLOCKS) {
+		fprintf(stderr, "\nERROR...\nMemory manager exhausted (%u blocks). Exitting...\n", (unsigned int) MAX_BLOCKS);
+		exit(1);
+	}
+
+	mm->BLOCKS[mm->currentBlock] = (byte *) malloc (LARGE_BLOCK_SIZE);
+
+	if (mm->BLOCKS[mm->currentBlock] == NULL) {
+		fprintf(stderr, "\nERROR...\nUnable to allocate enough memory. Exitting...\n");
+		exit(1);
+	}
+
+	mm->remainderBytes = LARGE_BLOCK_SIZE;
+	mm->availableByte = mm->BLOCKS[mm->currentBlock]; //points to the beginning of the block
+}
+\r
+\r
+/*------------------------------------------------------------------\r
+ getBlock method\r
+ returns a pointer to a free block of memory of size "size"\r
+ ------------------------------------------------------------------ */\r
+/*
+ * Stores in *dst a pointer to `size` unused bytes taken from the current
+ * large block; a new large block is started when the current one has
+ * fewer than `size` bytes left (the remainder of the old block is left
+ * unused).
+ *
+ * A request larger than LARGE_BLOCK_SIZE can never be satisfied by one
+ * block, so it terminates the process instead of handing out a pointer
+ * that runs past the end of the block.
+ */
+void getMemoryBlock (MemoryManager mm, byte **dst, const unsigned int size) {
+	if (size > (LARGE_BLOCK_SIZE)) {
+		fprintf(stderr, "\nERROR...\nRequested %u bytes, more than one memory block can hold. Exitting...\n", size);
+		exit(1);
+	}
+
+	if (mm->remainderBytes < size) {
+		mm->currentBlock++;
+		createNewMemoryBlock(mm);
+	}
+
+	*dst = mm->availableByte;		//points to a free size-block
+	mm->remainderBytes -= (size);
+	mm->availableByte += (size);
+}
+\r
+\r
+/*------------------------------------------------------------------\r
+ main, to make unit proofs\r
+ ------------------------------------------------------------------ */\r
+/*\r
+int main(int argc, char* argv[])\r
+{ byte *word, *word2;\r
+ unsigned int size;\r
+int i;\r
+ MemoryManager memMgr;\r
+ memMgr=createMemoryManager();\r
+\r
+ size = 100;\r
+ getMemoryBlock(memMgr,&word,size);\r
+\r
+ fprintf(stderr,"pasei getblock\n");\r
+ strcpy((char *)(word), "01234567890123456789012345678901234567890123456789");\r
+ word[50]='\0';\r
+ fprintf(stderr,"pasei strcpy \n");\r
+ fprintf(stderr,"\n%s",word);\r
+ getMemoryBlock(memMgr,&word2,size);\r
+ strcpy((char *)(word2), "soy la word 2");\r
+\r
+ for (i=0;i<100000;i++) {\r
+ size = 89;\r
+ getMemoryBlock(memMgr,&word,size);\r
+ }\r
+\r
+ fprintf(stderr,"\n final %s",word2);\r
+\r
+ destroyMemoryManager(memMgr);\r
+\r
+} */\r
+\r
--- /dev/null
+/*-----------------------------------------------------------------------\r
+ File : MemoryManager.h\r
+ Function : Reserves large blocks of memory and gives pointers to small\r
+ portions of that block when requested.\r
+ This improves performance since a unique "LARGE ALLOCATION"\r
+ of memory is needed (a unique call to malloc).\r
+ It is also responsible of freeing memory.\r
+ Last change: 10/03/2004\r
+ Purpose : Improve hash performance.\r
+ ------------------------------------------------------------------------*/\r
+\r
+#ifndef MEMORYMANAGERINCLUDED\r
+#define MEMORYMANAGERINCLUDED // only used for hashTable of stopwords\r
+ \r
+#include <string.h>\r
+#include <stdlib.h>\r
+#include <math.h>\r
+#include <stdio.h>\r
+#include <malloc.h>\r
+\r
+#ifndef byte\r
+ #define byte unsigned char\r
+#endif \r
+\r
+#define LARGE_BLOCK_SIZE (1024*256)	// Size in bytes of each block of memory that will be allocated.
+									// Parenthesised so the macro expands safely inside larger
+									// expressions (e.g. x / LARGE_BLOCK_SIZE).
+#define MAX_BLOCKS 2048				// Maximum number of blocks of size LARGE_BLOCK_SIZE that
+									// can be allocated
+
+	/*
+	 * Definition of structure MemoryManager
+	 */
+	struct sMem {
+		byte *BLOCKS[MAX_BLOCKS];		//array of blocks of size LARGE_BLOCK_SIZE
+		unsigned int currentBlock;		//currentBlock in the array of blocks
+		unsigned long remainderBytes;	//number of bytes not yet assigned in BLOCKS[currentBlock]
+		byte *availableByte;			//pointer to next byte not yet assigned
+	} ;
+
+	// A MemoryManager is handled through a pointer to its struct sMem.
+	typedef struct sMem *MemoryManager;
+
+
+	// Creates a manager with its first block already allocated.
+	MemoryManager createMemoryManager(void);
+	// Frees all blocks and the manager itself.
+	void destroyMemoryManager (MemoryManager mm);
+	// Stores in *dst a pointer to `size` unused bytes.
+	void getMemoryBlock (MemoryManager mm, byte **dst, const unsigned int size);
+	// Allocates BLOCKS[currentBlock] and makes it the active block.
+	void createNewMemoryBlock (MemoryManager mm);
+\r
+#endif\r
--- /dev/null
+
+// Basics
+
+// #include "basics.h" included later to avoid macro recursion for malloc
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+ // Memory management
+
+  /* Checked allocation: returns NULL for a zero-byte request, otherwise a
+     block of n bytes. Prints a message to stderr and terminates the
+     process with status 1 when the allocation fails. */
+  void *Malloc (int n)
+
+   { void *block = NULL;
+
+     if (n != 0)
+        { block = (void*) malloc (n);
+          if (!block)
+             { fprintf (stderr,"Could not allocate %i bytes\n",n);
+               exit(1);
+             }
+        }
+     return block;
+   }
+
+  /* Releases a block obtained from Malloc/Realloc; a NULL pointer is ignored. */
+  void Free (void *p)
+
+   { if (p == NULL) return;
+     free (p);
+   }
+
+  /* Checked reallocation.
+       p == NULL  -> behaves like Malloc(n)
+       n == 0     -> releases p and returns NULL
+       otherwise  -> resizes p to n bytes; on failure prints a message to
+                     stderr and terminates the process with status 1. */
+  void *Realloc (void *p, int n)
+
+   { void *resized;
+
+     if (p == NULL)
+        return Malloc (n);
+
+     if (n == 0)
+        { Free(p);
+          return NULL;
+        }
+
+     resized = (void*) realloc (p,n);
+     if (!resized)
+        { fprintf (stderr,"Could not allocate %i bytes\n",n);
+          exit(1);
+        }
+     return resized;
+   }
+
+#include "basics.h"
+
+ // bits needed to represent a number between 0 and n
+
+/* Number of bits needed to write n in binary: 0 for n == 0, otherwise the
+   index of the highest set bit plus one. */
+uint bits (uint n)
+
+   { uint width;
+     for (width = 0; n != 0; n >>= 1)
+        width++;
+     return width;
+   }
+
+ // returns e[p..p+len-1], assuming len <= W
+
+/* Reads the len-bit field that starts at bit p of the bit array e
+   (len <= W). The field may straddle two consecutive words; the second
+   word is only touched when the field actually reaches into it. */
+uint bitread (uint *e, uint p, uint len)
+
+   { uint *word  = e + p/W;        /* word holding the first bit   */
+     uint  shift = p % W;          /* offset of the field in *word */
+     uint  answ  = *word >> shift;
+
+     if (len == W)
+        { if (shift) answ |= word[1] << (W-shift);
+          return answ;
+        }
+
+     if (shift+len > W) answ |= word[1] << (W-shift);
+     answ &= (1<<len)-1;           /* drop everything above the field */
+     return answ;
+   }
+
+
+ // writes e[p..p+len-1] = s, len <= W
+
+/* Stores the len low bits of s into the field e[p..p+len-1] (len <= W).
+   Bits of the array outside that field are preserved. The field may
+   straddle two consecutive words. s is expected to fit in len bits.
+
+   Fix: in the full-word case the first word used to be updated with "|=",
+   which OR-ed the new value into the old contents instead of replacing
+   them. Masks are now built from unsigned constants so that shifting by
+   31 does not overflow a signed int. */
+void bitwrite (register uint *e, register uint p,
+               register uint len, register uint s)
+
+   { e += p/W; p %= W;
+     if (len == W)
+        { /* keep the p bits below the field, replace the rest of the word */
+          *e = (*e & ((1u<<p)-1)) | (s << p);
+          if (!p) return;                 /* aligned: a single word suffices */
+          e++;
+          /* the p highest bits of s land in the low bits of the next word */
+          *e = (*e & ~((1u<<p)-1)) | (s >> (W-p));
+        }
+     else { if (p+len <= W)
+               { /* field lies entirely inside one word */
+                 *e = (*e & ~(((1u<<len)-1)<<p)) | (s << p);
+                 return;
+               }
+            /* field straddles two words: low part first ... */
+            *e = (*e & ((1u<<p)-1)) | (s << p);
+            e++; len -= W-p;
+            /* ... then the remaining len bits at the bottom of the next word */
+            *e = (*e & ~((1u<<len)-1)) | (s >> (W-p));
+          }
+   }
+ // writes e[p..p+len-1] = 0
+
+/* Clears the len bits e[p..p+len-1]; every other bit is left untouched.
+
+   Fix: when the range reached the end of its first word the code cleared
+   the bits *below* p (inverted mask) and subtracted p instead of the W-p
+   bits actually consumed, which could underflow len and make the word
+   loop run over memory far beyond the range. */
+void bitzero (register uint *e, register uint p,
+              register uint len)
+
+   { e += p/W; p %= W;
+     if (p+len >= W)
+        { *e &= (1u<<p)-1;        /* keep only the p bits below the range */
+          len -= W-p;             /* W-p bits of the range are now done   */
+          e++; p = 0;
+        }
+     while (len >= W)             /* whole words in the middle */
+        { *e++ = 0;
+          len -= W;
+        }
+     if (len > 0)                 /* partial last (or only) word */
+        *e &= ~(((1u<<len)-1)<<p);
+   }
--- /dev/null
+
+
+ // Basics
+
+#ifndef BASICSINCLUDED
+#define BASICSINCLUDED
+
+ // Includes
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/times.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+ // Memory management
+
+#define malloc(n) Malloc(n)
+#define free(p) Free(p)
+#define realloc(p,n) Realloc(p,n)
+
+void *Malloc (int n);
+void Free (void *p);
+void *Realloc (void *p, int n);
+
+ // Data types
+
+#ifndef byte
+ #define byte unsigned char
+#endif
+
+//typedef unsigned char byte;
+// typedef unsigned int uint;
+
+//typedef int bool;
+//#define true 1
+//#define false 0
+
+#define max(x,y) ((x)>(y)?(x):(y))
+#define min(x,y) ((x)<(y)?(x):(y))
+
+ // Bitstream management
+
+//#define W (8*sizeof(uint))
+#define W (32)
+
+ // bits needed to represent a number between 0 and n
+uint bits (uint n);
+ // returns e[p..p+len-1], assuming len <= W
+uint bitread (uint *e, uint p, uint len);
+ // writes e[p..p+len-1] = s, assuming len <= W
+void bitwrite (uint *e, uint p, uint len, uint s);
+ // writes e[p..p+len-1] = 0, no assumption on len
+
+ /**/ //FARI. WITH ASSUMPTION ON LEN, OR IT CRASHES
+ //NOt WORKING UPON THE LIMIT OF THE STARTING uint.
+void bitzero (uint *e, uint p, uint len);
+ // reads bit p from e
+#define bitget(e,p) (((e)[(p)/W] >> ((p)%W)) & 1)
+ // sets bit p in e
+#define bitset(e,p) ((e)[(p)/W] |= (1<<((p)%W)))
+ // cleans bit p in e
+#define bitclean(e,p) ((e)[(p)/W] &= ~(1<<((p)%W)))
+
+
+
+/* bitRead and bitWrite as MACROS */
+ // returns e[p..p+len-1], assuming len <= W
+ //mybitread (uint returned value, uint *e, uint p, uint len)
+ //
+ // Macro counterpart of bitread(): the result is assigned to answ, which
+ // must therefore be an assignable uint.
+ // Caveats visible in the expansion:
+ //  - p is modified in place (it ends up holding p % W), so it must be a
+ //    variable whose original value the caller no longer needs;
+ //  - v is copied into a block-local pointer named e, so v itself is not
+ //    advanced, but an argument expression that mentions a caller variable
+ //    called e would refer to the macro's local instead;
+ //  - the arguments are not parenthesized, so pass plain variables or
+ //    constants rather than compound expressions.
+#define mybitread(answ, v, p, len) \
+ { uint *e ; \
+ e=v;\
+ e += p/W; p %= W; \
+ answ = *e >> p; \
+ if (len == W) \
+ { if (p) answ |= (*(e+1)) << (W-p); \
+ } \
+ else { if (p+len > W) answ |= (*(e+1)) << (W-p); \
+ answ &= (1<<len)-1; \
+ } \
+ }
+
+
+ // writes e[p..p+len-1] = s, len <= W
+ //void bitwrite (uint *e, uint p, uint len, uint s)
+ // Macro counterpart of bitwrite(): stores the len low bits of s into
+ // v[p..p+len-1] (len <= W), preserving all other bits.
+ // Caveats visible in the expansion:
+ //  - p is modified in place (p % W), and len is reduced when the field
+ //    straddles two words, so both must be scratch variables;
+ //  - v is copied into a block-local pointer named e;
+ //  - the arguments are not parenthesized: pass plain variables/constants.
+ // Fix: the full-word case used "*e |= ...", which OR-ed the new value
+ // into the previous contents instead of overwriting them.
+#define mybitwrite(v, p, len, s) \
+ { uint *e ; \
+ e=v; \
+ e += p/W; p %= W; \
+ if (len == W) \
+ { *e = (*e & ((1<<p)-1)) | (s << p); \
+ if (p) { \
+ e++; \
+ *e = (*e & ~((1<<p)-1)) | (s >> (W-p)); \
+ } \
+ } \
+ else { if (p+len <= W) \
+ { *e = (*e & ~(((1<<len)-1)<<p)) | (s << p); \
+ } \
+ else { \
+ *e = (*e & ((1<<p)-1)) | (s << p); \
+ e++; len -= W-p; \
+ *e = (*e & ~((1<<len)-1)) | (s >> (W-p)); \
+ } \
+ } \
+ }
+
+#endif
--- /dev/null
+
+// Implements operations over a bitmap
+
+#include "bitmap.h"
+
+
+ // In theory, we should have superblocks of size s=log^2 n divided into
+ // blocks of size b=(log n)/2. This takes
+ // O(n log n / log^2 n + n log log n / log n + log n sqrt n log log n) bits
+ // In practice, we can have any s and b, and the needed amount of bits is
+ // (n/s) log n + (n/b) log s + b 2^b log b bits
+ // Optimizing it turns out that s should be exactly s = b log n
+ // Optimizing b is more difficult but could be done numerically.
+ // However, the exponential table does no more than popcounting, so why not
+ // setting up a popcount algorithm tailored to the computer register size,
+ // defining that size as b, and proceeding.
+
+//unsigned char OnesInByte[] =
+const unsigned char popc[] = //number of ones in one byte value [0..255].
+{
+0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
+1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
+1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
+2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
+3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8,
+};
+
+/* Counts the bits set in a 32-bit word by summing the table entry of each
+   of its four bytes. */
+uint popcount (register uint x)
+
+   { uint ones = 0;
+     int  byteNo;
+
+     for (byteNo = 0; byteNo < 4; byteNo++)
+        { ones += popc[x & 0xFF];
+          x >>= 8;
+        }
+     return ones;
+   }
+
+
+/******************************************************************/
+// FUNCIONS DE EDU ...
+/******************************************************************/
+/*
+ Creates a bitmap and structures to rank and select
+*/
+
+//bitmap createBitmapEdu (uint *string, uint n){ return createBitmap(string,n);}
+
+/* Builds the rank/select directory for the n-bit bitstring "string".
+
+   The bitstring is referenced, not copied (B->data = string). Two counter
+   arrays are allocated:
+     sdata[k] : number of 1s before superblock k (a superblock is 8 words,
+                i.e. 256 bits);
+     bdata[k] : number of 1s between the start of the enclosing superblock
+                and word k.
+   n/32+1 block counters and n/256+1 superblock counters are kept, so that
+   the entry indexed by position n itself exists.
+
+   Fix: the loop used to read string[n/32] unconditionally. When n is a
+   multiple of 32 that word lies past the ceil(n/32) words that hold the
+   bitstring, so the read was out of bounds and its contents were added to
+   B->pop. Words past the bitstring are now treated as zero.
+
+   Returns the newly allocated descriptor. */
+bitmap createBitmap (uint *string, uint n){
+	bitmap B;
+
+	unsigned int nb;
+	unsigned int ns;
+	unsigned int words;     /* words that actually hold bits of the bitstring */
+	unsigned int countB, countS, blockIndex, superblockIndex;
+	register unsigned int block;
+
+	B =(struct sbitmap *) malloc (sizeof(struct sbitmap));
+	B->data = string;
+	B->n = n;
+	ns = (n/256)+1;
+	nb = (n/32)+1;
+	words = (n/32) + ((n%32) ? 1 : 0);
+
+	B->bSize = nb;
+	B->sSize = ns;
+	B->bdata =(byte *)malloc(nb*sizeof(byte));
+	B->sdata = (uint*)malloc(ns*sizeof(uint));
+
+	B->mem_usage = (ns*sizeof(uint)) + (nb*sizeof(byte)) + (sizeof(struct sbitmap));
+
+	/* Now build the block and superblock counters */
+	blockIndex = 0;
+	superblockIndex = 0;
+	countB = 0;
+	countS = 0;
+
+	while(blockIndex < nb) {
+
+		if(!(blockIndex%8)) {
+			/* a new superblock starts: fold the finished one into countS */
+			countS += countB;
+			B->sdata[superblockIndex++] = countS;
+			countB = 0;
+		}
+
+		B->bdata[blockIndex] = countB;
+		/* never read past the last word of the bitstring */
+		block = (blockIndex < words) ? string[blockIndex] : 0;
+		blockIndex++;
+
+		countB += popcount(block);
+	}
+
+	B->pop = countS+countB;
+
+	return B;
+}
+
+
+/*
+  Number of 1s in the range [0,position], position included.
+  Any position greater than B->n yields the total number of 1s (B->pop).
+*/
+uint rank(bitmap B, unsigned int position) {
+	unsigned int wordIdx;
+	unsigned int shifted;
+	unsigned int inWord;
+
+	if (position > B->n) return B->pop;
+
+	wordIdx = position/32;
+	/* push the bits above "position" out of the word, so that only
+	   bits 0..position%32 of it remain to be counted */
+	shifted = B->data[wordIdx] << (31-position%32);
+
+	inWord = popc[shifted & 0xff] + popc[(shifted>>8) & 0xff] +
+	         popc[(shifted>>16) & 0xff] + popc[shifted>>24];
+
+	return B->sdata[position/256] + B->bdata[wordIdx] + inWord;
+}
+
+
+/********************************************************************************************/
+/**************************************************************************************/
+
+/* Binary search over data[0..size-1], assumed non-decreasing: returns the
+   largest index whose entry is smaller than val, or 0 when no entry
+   (beyond index 0) qualifies. */
+static uint binsearch (uint *data, uint size, uint val)
+
+   { uint lo = 0;
+     uint hi = size;
+
+     while (lo+1 < hi)
+        { uint mid = (lo+hi)/2;
+          if (data[mid] < val) lo = mid;
+          else hi = mid;
+        }
+     return lo;
+   }
+
+/* select: returns the position of the j-th 1 in the bitmap, or B->n when
+   j exceeds the number of 1s (B->pop).
+   The search narrows down in four steps: superblock (binary search on
+   sdata), word inside the superblock (linear scan of bdata), byte inside
+   the word (popc table), bit inside the byte.
+   NOTE(review): j looks 1-based; j == 0 is not treated specially and ends
+   in "return pos-1" with pos == 0 -- confirm callers never pass 0. */
+uint bselect (bitmap B, uint j)
+
+   { uint spos,bpos,pos,word,x;
+     byte *blk;
+     if (j > B->pop) return B->n;
+     /* superblock: last one whose accumulated count is still below j */
+     spos = binsearch(B->sdata,(B->n+256-1)/256,j);
+
+     //fprintf(stderr,"\n SPOS IS %d, and B->sdata[pos] = %d",spos,B->sdata[spos]);
+     j -= B->sdata[spos];     /* j is now relative to the superblock */
+     pos = spos<<8;           /* first bit of the superblock (256 bits each) */
+     blk = B->bdata + (pos>>5);  /* block counters of this superblock */
+     bpos = 0;
+
+     //while ((bpos < (1<<3)-1) && (blk[bpos+1] < j)) bpos++;
+     /* advance while the next word still starts below the j-th 1; the first
+        test keeps the scan from going past the last word of the bitmap,
+        the second one keeps it inside the 8 words of the superblock */
+     while ( ((spos*8+bpos) < ((B->n-1)/W)) && (bpos < (1<<3)-1) && (blk[bpos+1] < j)) bpos++;
+
+
+     //fprintf(stderr,"\n BPOS = %d",bpos);
+     pos += bpos<<5;
+     word = B->data[pos>>5];
+     j -= blk[bpos];          /* j is now relative to the selected word */
+     //fprintf(stderr,"\n pos>>5 = %d ... pasou XXX con word = %d, and j= %d",pos>>5,word,j);
+     /* skip whole bytes that do not contain the j-th 1 */
+     while (1)
+        { x = popc[word & ((1<<8)-1)];
+          //fprintf(stderr,"\n word = %u popc vale %u",word & ((1<<8)-1),x);
+          if (j <= x) break;
+          j -= x; pos += 8;
+          word >>= 8;
+
+        }
+
+     /* walk the remaining bits one at a time; pos ends one past the target */
+     while (j) { if (word & 1) j--; word >>= 1; pos++; }
+
+     // fprintf(stderr,"\n\nBSELECT::: FINAL POSITION = %u",pos-1);
+     return pos-1;
+
+   }
+
+
+// Releases the block and superblock counters and the bitmap descriptor.
+// The bitstream itself (B->data) is left untouched: it is not freed here.
+void destroyBitmap (bitmap B)
+
+   { free (B->sdata);
+     free (B->bdata);
+     //free (B->data);
+     free (B);
+   }
+
+
+// Writes the first vectorSize bits of V to stderr as a run of 0/1 digits,
+// lowest position first, with no separator or trailing newline.
+void showBitVector(uint * V, int vectorSize) {
+	uint bitIndex;
+	for (bitIndex=0; bitIndex<vectorSize; bitIndex++)
+		fprintf(stderr,"%d",bitget(V,bitIndex));
+}
+
+/* Stores the rank/select directory of b in "filename" (any previous file
+   of that name is removed first). Layout, in order: sSize, the sSize
+   superblock counters, bSize, the bSize block counters, pop, n.
+   The bitstream itself (b->data) is not written.
+   If the file cannot be created a message is printed and the process
+   exits. */
+void saveBitmap (char *filename, bitmap b) {
+	int fd;
+
+	unlink(filename);
+	fd = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU);
+	if (fd < 0) {
+		printf("Cannot open file %s\n", filename);
+		exit(0);
+	}
+
+	/* superblock counters */
+	write(fd, &(b->sSize), sizeof(uint));
+	write(fd, b->sdata, sizeof(int) * (b->sSize));
+
+	/* block counters */
+	write(fd, &(b->bSize), sizeof(uint));
+	write(fd, b->bdata, sizeof(byte) * (b->bSize));
+
+	/* totals */
+	write(fd, &(b->pop), sizeof(uint));
+	write(fd, &(b->n), sizeof(uint));
+
+	close(fd);
+}
+
+/* Loads the rank/select directory written by saveBitmap from "filename"
+   and attaches it to the bitstream "string" (stored in the data pointer,
+   not copied). n is the expected number of bits: if the value stored in
+   the file differs, a message is printed and the process exits. The same
+   happens when the file cannot be opened.
+*/
+bitmap loadBitmap (char *filename, uint *string, uint n) {
+	bitmap loaded;
+	int fd;
+
+	fd = open(filename, O_RDONLY);
+	if (fd < 0) {
+		printf("Cannot read file %s\n", filename);
+		exit(0);
+	}
+
+	loaded = (struct sbitmap *) malloc (sizeof(struct sbitmap));
+	loaded->data = string;
+
+	/* superblock counters */
+	read(fd, &(loaded->sSize), sizeof(uint));
+	loaded->sdata = (uint *) malloc(sizeof(uint) * loaded->sSize);
+	read(fd, loaded->sdata, sizeof(uint) * loaded->sSize);
+
+	/* block counters */
+	read(fd, &(loaded->bSize), sizeof(uint));
+	loaded->bdata = (byte *) malloc(sizeof(byte) * loaded->bSize);
+	read(fd, loaded->bdata, sizeof(byte) * loaded->bSize);
+
+	/* totals */
+	read(fd, &(loaded->pop), sizeof(uint));
+	read(fd, &(loaded->n), sizeof(uint));
+	close(fd);
+
+	loaded->mem_usage = (sizeof(uint) * loaded->sSize) + (sizeof(byte) * loaded->bSize) + (sizeof(struct sbitmap));
+
+	if (n != loaded->n) {
+		printf("\n LoadBitmap failed: %u distinto de %u",n,loaded->n);
+		exit(0);
+	}
+	return loaded;
+}
+
+
+/********************************************************************************************/
+/********************************************************************************************/
+
+
+
+
+ // creates a bitmap structure from a bitstring, which is shared
+
+/* Alternative directory builder (kept next to createBitmap).
+   Uses ceil(n/256) superblocks of 256/W words each; the bitstring is
+   referenced, not copied. sdata[k] holds the number of 1s before
+   superblock k and bdata[w] the number of 1s between the start of the
+   enclosing superblock and word w. */
+bitmap createBitmapGONZA (uint *string, uint n)
+//bitmap createBitmap (uint *string, uint n)
+
+   { bitmap B;
+     uint sb,blk;          /* superblock / block-in-superblock indices */
+     uint total,inSuper;   /* 1s seen so far / inside current superblock */
+     uint next;            /* next word (block counter) to fill */
+     uint nb,ns,words;
+
+     B = (struct sbitmap *) malloc (sizeof(struct sbitmap));
+     B->data = string;
+     B->n = n;
+
+     words = (n+W-1)/W;
+     ns = (n+256-1)/256;
+     nb = 256/W;
+
+     B->bSize = ns*nb;
+     B->bdata = (byte *) malloc (ns*nb*sizeof(byte));
+     B->sSize = ns;
+     B->sdata = (uint *)malloc (ns*sizeof(int));
+
+     B->mem_usage = (ns*sizeof(int)) + (ns*nb*sizeof(byte)) + (sizeof(struct sbitmap));
+#ifdef INDEXREPORT
+     printf (" Bitmap over %i bits took %i bits\n", n,n+ns*nb*8+ns*32);
+#endif
+
+     total = 0;
+     next = 0;
+     for (sb = 0; sb < ns; sb++)
+        { B->sdata[sb] = total;
+          inSuper = 0;
+          /* stop early inside the last superblock once all words are used */
+          for (blk = 0; blk < nb && next != words; blk++)
+             { B->bdata[next++] = inSuper;
+               inSuper += popcount(*string++);
+             }
+          total += inSuper;
+        }
+     B->pop = total;
+
+     return B;
+   }
+
+ // rank(i): how many 1's are there before position i, not included
+
+//uint rank (bitmap B, uint i)
+/* Alternative rank: the argument is incremented first, so the value
+   returned counts the 1s in positions [0,i] of the caller's i. Once the
+   incremented position exceeds B->n the total B->pop is returned. */
+uint rankGONZA (bitmap B, uint i)
+
+   { uint wordIdx;
+     uint below;
+
+     i++;
+     if (i > B->n) return B->pop;
+
+     wordIdx = i>>5;
+     /* 1s of the current word strictly below bit i */
+     below = popcount (B->data[wordIdx] & ((1<<(i&0x1F))-1));
+     return B->sdata[i>>8] + B->bdata[wordIdx] + below;
+   }
+
+
+
+
+
+
+
+
+
+
+
+
--- /dev/null
+
+// Implements operations over a bitmap
+
+#ifndef BITMAPINCLUDED
+#define BITMAPINCLUDED
+
+#include "basics.h"
+
+typedef struct sbitmap
+ { uint *data;
+ uint n; // # of bits
+ uint pop; // # bits set
+ uint *sdata; // superblock counters
+ uint sSize; // size of sdata vector
+ byte *bdata; // block counters
+ uint bSize; // size of bdata vector
+ uint mem_usage;
+ } *bitmap;
+
+
+ // creates a bitmap structure over a bitstring; the bitstring is referenced (not copied) and is not freed by destroyBitmap
+bitmap createBitmap (uint *string, uint n);
+ // rank(i): how many 1's are there in positions [0..i], position i included
+uint rank (bitmap B, uint i);
+ // select(i): position of i-th 1
+uint bselect (bitmap B, uint i);
+ // destroys the bitmap (counters and descriptor); the original bitstream is NOT freed
+void destroyBitmap (bitmap B);
+ // popcounts 1's in x
+uint popcount (register uint x);
+
+void saveBitmap (char *filename, bitmap b);
+bitmap loadBitmap (char *filename, uint *string, uint n);
+
+
+
+////EDU'S functions included here.
+//bitmap createBitmapEdu (uint *string, uint n);
+//uint popcountEdu (register uint x); //which is identical to popcount.
+//uint rank1Edu(bitmap B, unsigned int position);
+//unsigned int isActiveBit(uint *V, uint position);
+void showBitVector(uint * V, int vectorSize);
+
+#endif
+
+
--- /dev/null
+#ifndef DEFVALUES_INCLUDED
+#define DEFVALUES_INCLUDED
+
+//#define MAX_LEN_VOCABULARY 3000000 //number of different words --> (number of variants)
+#define MAX_MEANINGFUL_WORDS 100000000 //number of words (non separators)
+#define MAX_SIZE_OF_WORD 1000000 //255 //size of word
+#define MAX_SIZE_OF_GAP 1000000 //1000 //size of separator
+ //#define MAX_STOPW0RDS_SIZE 255 //size of stopword
+#define MAX_SIZE_OF_ANY 1000000 //1000
+
+
+#define DEFAULT_SUFFIX_ARRAY_SIZE 50000000
+
+#define DEFAULT_SAMPLE_PERIOD_Z 32
+#define DEFAULT_SAMPLE_PERIOD_B 32
+
+
+
+
+//for queries
+#define MAX_INTEGER_PATTERN_SIZE 20
+#define MAX_TEXT_PATTERN_SIZE 100 //maximum number of valid words in a searched pattern
+
+#ifndef DEBUG_ON
+ // #define DEBUG_ON
+#endif
+
+
+
+#define byte unsigned char
+
+
+#define CSA_ON //generates the CSA or only "presentation layer"
+//#define WRITE_SE_FILE //outputs to a file the array of integers indexed.
+
+// Extensions of created files
+
+#define VOCABULARY_WORDS_FILE_EXT "words"
+#define SE_FILE_EXT "se"
+#define CONSTANTS_FILE_EXT "cte"
+
+#endif
--- /dev/null
+
+/*////////////////
+//Error handling//
+////////////////*/
+/*
+char *error_index(int e){
+ switch(e) {
+ case 0: return "No error"; break;
+ case 1: return "Out of memory"; break;
+ case 2: return "The text must end with a \\0"; break;
+ case 5: return "You can't free the text if you don't copy it"; break;
+ case 20: return "Cannot create files"; break;
+ case 21: return "Error writing the index"; break;
+ case 22: return "Error writing the index"; break;
+ case 23: return "Cannot open index; break";
+ case 24: return "Cannot open text; break";
+ case 25: return "Error reading the index"; break;
+ case 26: return "Error reading the index"; break;
+ case 27: return "Error reading the text"; break;
+ case 28: return "Error reading the text"; break;
+ case 99: return "Not implemented"; break;
+ default: return "Unknown error";
+ }
+}
+*/
+
+
+char *error_index(int e){
+static char err[100];
+ switch(e) {
+ case 0: strcpy(err, "No error"); break;
+ case 1: strcpy(err, "Out of memory"); break;
+ case 2: strcpy(err, "The text must end with a \\0"); break;
+ case 5: strcpy(err, "You can't free the text if you don't copy it"); break;
+ case 20: strcpy(err, "Cannot create files"); break;
+ case 21: strcpy(err, "Error writing the index"); break;
+ case 22: strcpy(err, "Error writing the index"); break;
+ case 23: strcpy(err, "Cannot open index; break");
+ case 24: strcpy(err, "Cannot open text; break");
+ case 25: strcpy(err, "Error reading the index"); break;
+ case 26: strcpy(err, "Error reading the index"); break;
+ case 27: strcpy(err, "Error reading the text"); break;
+ case 28: strcpy(err, "Error reading the text"); break;
+ case 99: strcpy(err, "Not implemented"); break;
+ default: strcpy(err, "Unknown error");
+ }
+ return err;
+}
--- /dev/null
+
+#include "fileInfo.h"
+
+/* Returns the size in bytes of the file "filename", or 0 when the file
+   cannot be opened or its size cannot be determined.
+
+   Fix: the whence argument was the magic number 2 (now SEEK_END), and a
+   failing fseek/ftell (-1) used to be converted into a huge unsigned
+   size. */
+unsigned long fileSize (char *filename){
+	FILE *fpText;
+	long end;
+	unsigned long fsize;
+
+	fsize = 0;
+	fpText = fopen(filename,"rb");
+	if (fpText) {
+		if (fseek(fpText,0,SEEK_END) == 0) {
+			end = ftell(fpText);
+			if (end > 0) fsize = (unsigned long) end;
+		}
+		fclose(fpText);
+	}
+	return fsize;
+}
+
+/* Copies the contents of infile into outfile (outfile is removed and
+   created again). If either file cannot be opened a message is printed
+   and the process exits.
+
+   Fixes:
+    - fopen() reports failure with NULL; the previous "< 0" tests on the
+      returned pointers never detected it;
+    - the copy no longer loads the whole file into one unchecked malloc'ed
+      buffer: it streams through a fixed-size buffer and writes exactly the
+      number of bytes that were read. */
+void copyFile (char *infile, char *outfile){
+	FILE *in, *out;
+	char chunk[64*1024];
+	size_t got;
+
+	if ( (in = fopen(infile,"rb")) == NULL) {
+		printf("Cannot open file %s\n", infile); exit(0);
+	}
+
+	unlink(outfile);
+	if( (out = fopen(outfile, "wb")) == NULL) {
+		printf("Cannot open file %s\n", outfile);
+		fclose(in);
+		exit(0);
+	}
+
+	/* fread returns 0 at end of file (or on error), which ends the copy */
+	while ( (got = fread(chunk,sizeof(char),sizeof(chunk),in)) > 0) {
+		fwrite(chunk,sizeof(char),got,out);
+	}
+
+	fclose(in);
+	fclose(out);
+}
+
--- /dev/null
+#ifndef FILE_INFO_INCLUDED
+#define FILE_INFO_INCLUDED
+
+#include <stdio.h>
+#include <stdlib.h>
+ #include <unistd.h>
+
+
+
+ /*------------------------------------------------------------------
+ Obtains the size of a file.
+ ------------------------------------------------------------------ */
+ unsigned long fileSize (char *filename);
+
+ /* copies from infile to outfile */
+ void copyFile (char *infile, char *outfile);
+
+#endif
+
--- /dev/null
+
+/* DYNAMIC END-TAGGED DENSE CODE. --
+A dynamic word-based byte oriented compressor for text files based on
+dynamic End-Tagged Dense Code.
+
+Brisaboa, N. R., Fariña, A., Navarro, G., Paramá, J. R.
+Simple, Fast, and Efficient Natural Language Adaptive Compression.
+11th International Symposium on String Processing and Information Retrieval (SPIRE'04) - LNCS 3246. A. Apostolico, M. Melucci (Ed.), pp. 230-241.
+Padova (Italia), 2004.
+
+Copyright (C) 2005 Antonio Fariña.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+Author's contact: Antonio Fariña, Dept. of Computer Science, University of
+A Coruña. Campus de Elviña s/n. Spain fari@udc.es
+*/
+
+/*-----------------------------------------------------------------------
+ Hash: Definition of HashTable class (Linear Hash)
+ ------------------------------------------------------------------------*/
+
+#include "hash.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+
+/*-----------------------------------------------------------------
+ Initialization of the data structures used by the hash table.
+
+ sizeVoc is the expected number of words. The table gets
+ NearestPrime(OCUP_HASH * sizeVoc) slots, all of them empty (word == NULL),
+ plus a memory manager from which the stored words are carved.
+ Returns the new table.
+
+ Fix: the results of both malloc calls were used unchecked; an allocation
+ failure now prints a message and terminates instead of dereferencing
+ NULL.
+ ---------------------------------------------------------------- */
+t_hash initialize_hash (unsigned long sizeVoc) {
+	t_hash h;
+	unsigned long i;
+
+	h = (t_hash) malloc(sizeof(struct hashStr));
+	if (h == NULL) {
+		fprintf(stderr,"Could not allocate %lu bytes\n",(unsigned long) sizeof(struct hashStr));
+		exit(1);
+	}
+
+	h->SIZE_HASH = (unsigned long) (OCUP_HASH * sizeVoc);
+	h->SIZE_HASH = NearestPrime(h->SIZE_HASH);
+	h->hash = (t_hashNode *) malloc(h->SIZE_HASH * sizeof(t_hashNode));
+	if (h->hash == NULL) {
+		fprintf(stderr,"Could not allocate %lu bytes\n",(unsigned long) (h->SIZE_HASH * sizeof(t_hashNode)));
+		exit(1);
+	}
+	h->NumElem = 0;
+
+	//creates the memory manager that is used to reserve small pieces of memory (for words)
+	h->_memMgr=createMemoryManager();
+
+	/* every slot starts empty */
+	for (i = 0; i < h->SIZE_HASH; i++) {
+		h->hash[i].word = NULL;
+		h->hash[i].len = 0;
+		h->hash[i].posInVoc = 0;
+	}
+	printf("\nHash Table initilized with: %lu elements\n",h->SIZE_HASH);
+
+	return h;
+}
+
+
+/*------------------------------------------------------------------
+ Returns the first value >= n accepted by the trial-division test below.
+ A candidate is accepted when no divisor in 2..floor(sqrt(candidate))
+ divides it and the first divisor past that range does not divide it
+ either; as a consequence 2 is skipped (3 is returned for n == 2) and 1 is
+ returned for n <= 1.
+ ---------------------------------------------------------------- */
+unsigned long NearestPrime(unsigned long n)
+{
+	long candidate = n;
+
+	for (;;)
+	{
+		long limit   = (long) sqrt((double) candidate);
+		long divisor = 2;
+
+		/* stop at the first divisor found, or just past the square root */
+		while (divisor <= limit && candidate % divisor != 0)
+			divisor++;
+
+		if (candidate % divisor != 0)
+			return candidate;   /* no factor in range: accepted */
+
+		candidate++;
+	}
+}
+
+
+
+/*-----------------------------------------------------------------------
+ Hashes the len bytes of "aWord" and returns the slot (in 0..sizeHash-1)
+ where the word should go if no collision occurs. Each byte is read through
+ a plain char before being mixed into the running value, which starts at
+ SEED.
+ ---------------------------------------------------------------------*/
+unsigned long hashFunction (const unsigned char *aWord, unsigned int len, unsigned long sizeHash)
+{
+	register unsigned int h = SEED;
+	unsigned int i;
+
+	for (i = 0; i < len; i++)
+	{
+		char c = aWord[i];
+		h ^= ( (h << 5) + c + (h >> 2) );
+	}
+	/* clear the top bit before reducing to the table size */
+	return((unsigned int)((h&0x7fffffff) % sizeHash));
+}
+
+
+/*-----------------------------------------------------------------------
+ compares two strings
+ ---------------------------------------------------------------------*/
+
+/* Author J. Zobel, April 2001.
+ http://www.seg.rmit.edu.au/code/zwh-ipl/
+ Permission to use this code is freely granted, provided that this
+ statement is retained. */
+
+
+
+ /* Equality-oriented comparison of two length-delimited strings.
+    Different lengths: returns -1 without looking at the bytes.
+    Same length: walks both strings while the bytes match, never moving
+    past the last byte, and returns the difference of the bytes where it
+    stopped (0 when the strings are equal). Byte 0 is read even when the
+    length is 0. */
+ int strcomp(const unsigned char *s1, const unsigned char *s2, register unsigned long ws1, unsigned long ws2) {
+	 register unsigned long i;
+
+	 if (ws1 != ws2)
+		 return -1;
+
+	 /* i + 1 < ws1 keeps i on or before the last byte */
+	 for (i = 0; i + 1 < ws1 && s1[i] == s2[i]; i++)
+		 ;
+
+	 return( s1[i]-s2[i] );
+}
+
+/* Variant of strcomp: returns -1 when the lengths differ; otherwise
+   compares at most MIN(ws1,ws2) bytes and returns the difference of the
+   bytes at the position where the walk stopped (0 when equal). */
+inline int strcomp3(const unsigned char *s1, const unsigned char *s2, unsigned long ws1, unsigned long ws2) {
+	register unsigned long end;
+	register unsigned long i;
+
+	if (ws1 != ws2)
+		return -1;
+
+	end = MIN(ws1,ws2);
+
+	/* i + 1 < end keeps i on or before the last compared byte */
+	for (i = 0; i + 1 < end && s1[i] == s2[i]; i++)
+		;
+
+	return( s1[i]-s2[i] );
+}
+
+
+/* Author J. Zobel, April 2001.
+ http://www.seg.rmit.edu.au/code/zwh-ipl/
+ Permission to use this code is freely granted, provided that this
+ statement is retained. */
+
+//int strcomp(const unsigned char *s1, const unsigned char *s2, unsigned long ws) {
+// register unsigned long i;
+// i=0;
+// while( i < ws-1 && *s1 == *s2 )*/
+// {
+// s1++;
+// s2++;
+// i++;
+// }
+// return( *s1-*s2 );
+//}
+
+//int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) {
+// register ulong iters = 1;
+//
+// while( iters<ws1 && *s1 == *s2 ){
+// s1++;
+// s2++;
+// iters++;
+// }
+// // so iters == ws1 OR *s1 != *s2
+// if (ws1 == iters) {
+// if (ws2 == ws1)
+// return 0; // s1 equals to s2
+// else
+// return -1; // s1 < s2
+// }
+// else { //(ws1 > iters) so s1 != s2
+// return( *s1-*s2);
+// }
+//}
+
+
+// Ordering comparison of two strings of lengths ws1 and ws2 that need not
+// end in '\0'. Returns 0 when they are identical, -1 when s1 is a proper
+// prefix of s2, +1 when s2 is a proper prefix of s1, and otherwise the
+// difference between the first pair of bytes that differ.
+int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) {
+	register ulong common = 0;   /* length of the shared prefix */
+
+	while( common<ws1 && common<ws2 && s1[common] == s2[common] )
+		common++;
+
+	if (common == ws1)           /* s1 exhausted */
+		return (ws2 == ws1) ? 0 : -1;
+
+	if (common == ws2)           /* s2 exhausted, s1 is longer */
+		return +1;
+
+	return (s1[common]-s2[common]);
+}
+
+
+/*-----------------------------------------------------------------------
+ Stores a new word in slot *addr of the hash table (the slot is the one
+ previously computed by the search function). The word is copied into
+ memory taken from the table's memory manager and is '\0'-terminated; its
+ position in the vocabulary is the number of elements stored so far.
+ Terminates the process when the table is (almost) full.
+ ---------------------------------------------------------------------*/
+void insertElement (t_hash h, const unsigned char *aWord, register unsigned long len, register unsigned long *addr) {
+	t_hashNode *slot;
+
+	//one slot is always kept free, which ensures that the "search function" cannot loop forever
+	if(h->NumElem >= h->SIZE_HASH -1)
+	{
+		printf("\n\nHash table full!! Change size and recompile !\n");
+		exit(1);
+	}
+
+	slot = &(h->hash[*addr]);
+
+	getMemoryBlock(h->_memMgr,( byte **)&(slot->word),len+1);
+	strncpy ((char *) slot->word, (char *)aWord, len);
+	slot->word[len]='\0';
+
+	slot->len = len;
+	slot->posInVoc = h->NumElem;
+	h->NumElem++;
+}
+
+/*-----------------------------------------------------------------------
+ Looks a word up in the hash table.
+ Returns its position in the vocabulary when it is present; otherwise
+ returns the number of elements stored so far, i.e. the position the word
+ would get if it were inserted next.
+ *returnedAddr receives the slot where the word was found, or the empty
+ slot where it should be placed by the next insert call.
+ Collisions are resolved by jumping JUMP slots ahead, modulo the table
+ size.
+ -----------------------------------------------------------------------*/
+unsigned long search (t_hash h, const unsigned char *aWord, register unsigned len,
+								 unsigned long *returnedAddr){
+
+	t_hashNode *table = h->hash;
+	register unsigned long addr = hashFunction(aWord,len, h->SIZE_HASH);
+
+	for (;;) {
+		if (table[addr].word == NULL)
+			break;      /* empty slot: the word is not in the table */
+		if (strcomp(table[addr].word, aWord, table[addr].len, len) == 0)
+			break;      /* found */
+		addr = ((addr + JUMP) % h->SIZE_HASH);
+	}
+
+	*returnedAddr = addr;
+
+	return (table[addr].word == NULL) ? h->NumElem : table[addr].posInVoc;
+}
+
+
+
+/*-----------------------------------------------------------------------
+ Tells whether a word appears in the hash table.
+ Returns non-zero when it does, 0 otherwise. *returnedAddr receives the
+ slot reported by search(): where the word is, or where it would be
+ inserted.
+
+ Fix: the address-of argument had been mangled into "¬hing" (an HTML
+ entity substitution of "&not"); it is "&nothing" again.
+ -----------------------------------------------------------------------*/
+unsigned long inHashTable (t_hash h, const unsigned char *aWord, register unsigned len, unsigned long *returnedAddr){
+
+	unsigned long searched;
+	unsigned long nothing;
+	searched = search(h,aWord,len,&nothing);
+	*returnedAddr=nothing;
+	/* search() answers NumElem exactly when the word is absent */
+	return (searched < (h->NumElem) );
+}
+
+/*------------------------------------------------------------------
+ Destructor: releases the slot array, the memory manager that holds the
+ stored words, and the table descriptor, then reports the size of the
+ descriptor plus the slot array.
+ ------------------------------------------------------------------ */
+void destroy_hash (t_hash hash){
+	unsigned long mem = sizeof(struct hashStr) + hash->SIZE_HASH * sizeof(t_hashNode);
+
+	free(hash->hash);
+	destroyMemoryManager(hash->_memMgr);   //frees words and variants
+	free(hash);
+
+	printf("\n[destroying hash table]...Freed %ld bytes... RAM", mem);
+}
+
+
+
+
+/*------------------------------------------------------------------
+ main, to make unit proofs
+ ------------------------------------------------------------------ */
+/*
+int main(int argc, char* argv[])
+{ byte a[10]= "word1";
+ byte b[10]= "word2";
+ byte c[10]= "word3";
+ byte d[10]= "word4";
+ byte e[10]= "word5";
+ byte f[10]= "word6";
+ byte * w;
+ unsigned int size;
+ unsigned long i,addrInTH;
+
+ t_hash hash;
+
+ _memMgr=createMemoryManager();
+
+ hash = initialize_hash (2);
+
+ w=a;
+ i = search (hash,w, strlen(w), &addrInTH );
+ insertElement (hash, w, strlen(w), &addrInTH);
+ fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
+ fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
+
+ w=b;
+ i = search (hash,w, strlen(w), &addrInTH );
+ insertElement (hash, w, strlen(w), &addrInTH);
+ fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
+ fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
+
+ w=c;
+ i = search (hash,w, strlen(w), &addrInTH );
+ insertElement (hash, w, strlen(w), &addrInTH);
+ fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
+ fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
+
+ w=d;
+ i = search (hash,w, strlen(w), &addrInTH );
+ insertElement (hash, w, strlen(w), &addrInTH);
+ fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
+ fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
+
+// w=e;
+// i = search (hash,w, strlen(w), &addrInTH );
+// insertElement (hash, w, strlen(w), &addrInTH);
+// fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
+// fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
+
+// w=f;
+// i = search (hash,w, strlen(w), &addrInTH );
+// insertElement (hash, w, strlen(w), &addrInTH);
+// fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
+// fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
+
+ fprintf(stderr,"\n in: %s hash ? = %d",a,inHashTable(hash,a,strlen(a)) );
+ fprintf(stderr,"\n in: %s hash ? = %d",e, inHashTable(hash,e,strlen(e)) );
+ fprintf(stderr,"\n in: %s hash ? = %d",b, inHashTable(hash,b,strlen(b)) );
+
+ destroy_hash(hash);
+ destroyMemoryManager(_memMgr);
+ printf("\n\n");
+}
+*/
--- /dev/null
+\r
+/* DYNAMIC END-TAGGED DENSE CODE. -- \r
+A dynamic word-based byte oriented compressor for text files based on \r
+dynamic End-Tagged Dense Code.\r
+\r
+Brisaboa, N. R., Fariña, A., Navarro, G., Paramá, J. R. \r
+Simple, Fast, and Efficient Natural Language Adaptive Compression. \r
+11th International Symposium on String Processing and Information Retrieval (SPIRE'04) - LNCS 3246. A. Apostolico, M. Melucci (Ed.), pp. 230-241. \r
+Padova (Italia), 2004. \r
+\r
+Copyright (C) 2005 Antonio Fariña.\r
+\r
+This program is free software; you can redistribute it and/or\r
+modify it under the terms of the GNU General Public License\r
+as published by the Free Software Foundation; either version 2\r
+of the License, or (at your option) any later version.\r
+\r
+This program is distributed in the hope that it will be useful,\r
+but WITHOUT ANY WARRANTY; without even the implied warranty of\r
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\r
+GNU General Public License for more details.\r
+\r
+You should have received a copy of the GNU General Public License\r
+along with this program; if not, write to the Free Software\r
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.\r
+\r
+Author's contact: Antonio Fariña, Dept. of Computer Science, University of\r
+A Coruña. Campus de Elviña s/n. Spain fari@udc.es\r
+*/\r
+\r
+\r
+/*-----------------------------------------------------------------------\r
+ Hash: Definition of HashTable class (Linear Hash)\r
+ ------------------------------------------------------------------------*/\r
+#ifndef HASH_INCLUDED\r
+#define HASH_INCLUDED\r
+ \r
+#include <string.h>\r
+#include <stdlib.h>\r
+#include <math.h>\r
+#include <malloc.h>\r
+\r
+#include "MemoryManager.h"\r
+\r
+#define JUMP 101 //jump done when a collision appears\r
+#define OCUP_HASH 1.5 //index of occupation of the hash table\r
+#define SMALL_PRIME 1009 // a small prime number, used to compute a hash function\r
+#define SEED 1159241\r
+/* Type definitions */\r
+\r
+/* Smaller of a and b. The expansion and both arguments are parenthesized so
+   the macro composes safely inside larger expressions (the previous form
+   turned 2*MIN(x,y) into (2*(x<y)) ? x : y). Each argument may still be
+   evaluated twice, so avoid arguments with side effects. */
+#define MIN(a,b) (((a) < (b)) ? (a) : (b))
+\r
+/* One slot of the hash table (see t_hashNode *hash in struct hashStr). */
+struct hashNode {
+ unsigned char *word; /* the word kept in this slot -- presumably owned by the table's MemoryManager; verify in hash.c */
+ unsigned long len; /* presumably the length of word in bytes -- TODO confirm */
+ unsigned long posInVoc; //position of the canonical word in vector posInHT
+};
+typedef struct hashNode t_hashNode;
+\r
+/* The hash table itself; client code handles it through the pointer type t_hash. */
+struct hashStr {
+ t_hashNode *hash; /* the slots in the hash table */
+ unsigned long SIZE_HASH; /* # entries in the hash table */
+ unsigned long NumElem; /* # elements already added to the hash table*/
+ MemoryManager _memMgr; /* Holds dynamic memory allocation for words. */ 
+};
+
+typedef struct hashStr *t_hash;
+\r
+// private:\r
+\r
+ unsigned long NearestPrime (unsigned long n);\r
+ unsigned long hashFunction (const unsigned char *aWord, unsigned int len, unsigned long sizeHash);\r
+\r
+// public:\r
+ \r
+ t_hash initialize_hash (unsigned long sizeVoc);\r
+ \r
+ void insertElement (t_hash h, const unsigned char *aWord, register unsigned long len,\r
+ register unsigned long *addr);\r
+ unsigned long search (t_hash h, const unsigned char *aWord, register unsigned len,\r
+ unsigned long *returnedAddr);\r
+ unsigned long inHashTable (t_hash h, const unsigned char *aWord, register unsigned len,\r
+ unsigned long *returnedAddr);\r
+ void destroy_hash (t_hash hash);\r
+ \r
+ int strcomp(const unsigned char *s1, const unsigned char *s2, register unsigned long ws1, unsigned long ws2);\r
+\r
+ int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2);\r
+\r
+\r
+#endif\r
--- /dev/null
+
+// implements canonical Huffman
+
+#include "huff.h"
+
+// Node of the temporary tree used by createHuff while the code is being built.
+typedef struct
+ { uint freq; // weight of the node (symbol frequency, or sum of the children for internal nodes)
+ uint symb; // symbol value; only assigned for leaves
+ union
+ { int prev; // while building: index of the following node in the pending list, -1 at its end
+ int depth; // after setdepths: depth of the leaf, i.e. its code length
+ } h;
+ int ch1,ch2; // indices of the two children; ch1 == -1 marks a leaf
+ } Ttree;
+
+// Sorts tree[lo..up] in place by non-decreasing freq.
+// Quicksort with tree[lo] as pivot: entries with a larger freq end up to the
+// right of the pivot. Only the smaller partition is handled recursively; the
+// larger one is processed by the enclosing while loop.
+static void sort (Ttree *tree, int lo, int up)
+
+ { int i, j;
+ Ttree temp;
+ while (up>lo)
+ { i = lo;
+ j = up;
+ temp = tree[lo];
+ // partition around temp; slot i is the "hole" the pivot is finally put into
+ while (i<j)
+ { while (tree[j].freq > temp.freq) j--;
+ tree[i] = tree[j];
+ while (i<j && tree[i].freq <= temp.freq) i++;
+ tree[j] = tree[i];
+ }
+ tree[i] = temp;
+ // recurse on the shorter side, iterate on the longer one
+ if (i-lo < up-i) { sort(tree,lo,i-1); lo = i+1; }
+ else { sort(tree,i+1,up); up = i-1; }
+ }
+ }
+
+// Walks the subtree rooted at tree[node] and records in every leaf
+// (a node whose ch1 is -1) its depth, where the starting node has depth `depth`.
+static void setdepths (Ttree *tree, uint node, int depth)
+{
+    Ttree *cur = &tree[node];
+
+    if (cur->ch1 != -1) {
+        // internal node: both children are one level deeper
+        setdepths(tree, cur->ch1, depth + 1);
+        setdepths(tree, cur->ch2, depth + 1);
+        return;
+    }
+    cur->h.depth = depth;   // leaf
+}
+
+
+// Builds a canonical Huffman code for the symbols 0..lim.
+//   freq   : frequency of each symbol; entries 0..lim are read. Symbols whose
+//            frequency is zero receive no code.
+//   lim    : largest symbol value.
+//   sorted : when non-zero the internal sort by frequency is skipped, i.e. the
+//            caller states that the symbols are already ordered as sort()
+//            would leave them (non-decreasing frequency).
+// Returns a THuff prepared for encoding: max, lim, depth, total, s.spos and
+// num are filled in. fst is set to NULL because the decoding table is only
+// built later (see prepareToDecode); this also keeps freeHuff() well defined
+// on the returned value.
+// When lim == 0 an artificial second symbol (value 1, frequency 0) is added
+// so that the single real symbol still gets a one-bit code; H.max is then 1.
+// NOTE(review): if lim > 0 and every frequency is zero no leaf exists and the
+// construction below is not meaningful -- callers are assumed never to do so.
+// The results of malloc/realloc are not checked.
+THuff createHuff (uint *freq, uint lim, uint sorted)
+{
+    THuff H;
+    int i,j,d;
+    Ttree *tree;
+    int ptr,last,fre;
+    int onlyOneSymbol = (lim == 0);
+
+    H.max = lim;
+    H.total = 0;
+    H.fst = NULL;   // no decoding table yet; NULL makes a later free() harmless
+
+    // Room for all leaves plus the internal nodes.
+    if (onlyOneSymbol) {
+        // root, the real symbol (pos 0) and the artificial zero-frequency symbol (pos 1)
+        tree = (Ttree *)malloc(3*sizeof(Ttree));
+        H.max = lim+1;
+    }
+    else
+        tree = (Ttree *)malloc((2*(lim+1)-1)*sizeof(Ttree));
+
+    // remove zero frequencies: only symbols that occur become leaves
+    j = 0;
+    for (i=0;i<=lim;i++)
+       { if (freq[i]>0)
+            { tree[j].freq = freq[i];
+              tree[j].symb = i;
+              j++;
+            }
+       }
+
+    if (onlyOneSymbol) {
+        tree[j].freq = 0;
+        tree[j++].symb = 1;   // note: j is incremented here
+    }
+
+    H.lim = lim = j-1;
+
+    // now run Huffman algorithm
+    if (!sorted) sort (tree,0,lim);
+
+    for (i=0;i<=lim;i++) {
+        tree[i].h.prev = i+1;
+        tree[i].ch1 = tree[i].ch2 = -1;
+    }
+    tree[lim].h.prev = -1;
+    // last = next node to process, ptr = search point, fre = next free cell
+    // leaves are in 0..lim ordered by non-decreasing freq (see sort)
+    // internal nodes are in lim+1.. 2*lim, created in incr. fre order
+    last=0; ptr = 0; fre = lim+1;
+    for (i=0;i<lim;i++)
+       { tree[fre].ch1 = last;
+         last = tree[last].h.prev;
+         tree[fre].ch2 = last;
+         tree[fre].freq = tree[tree[fre].ch1].freq+tree[tree[fre].ch2].freq;
+         // insert the new node into the pending list, keeping it ordered by freq
+         while ((tree[ptr].h.prev != -1) &&
+                (tree[tree[ptr].h.prev].freq <= tree[fre].freq))
+             ptr = tree[ptr].h.prev;
+         tree[fre].h.prev = tree[ptr].h.prev;
+         tree[ptr].h.prev = fre;
+         last = tree[last].h.prev;
+         fre++;
+       }
+    // now assign depths recursively (the root is the last node created)
+    setdepths (tree,2*lim,0);
+    H.s.spos = (uint *)malloc ((H.max+1)*sizeof(uint));
+    for (i=0;i<=H.max;i++) H.s.spos[i] = ~0;
+    H.num = (uint *)malloc ((lim+1)*sizeof(uint)); // max possible depth
+    d=0;
+    for (i=lim;i>=0;i--)
+       { H.s.spos[tree[i].symb] = i;
+         while (tree[i].h.depth > d)
+            { H.num[d] = i+1; d++; }
+       }
+    H.num[d] = 0;
+    H.depth = d;
+    for (d=H.depth;d>0;d--) H.num[d] = H.num[d-1] - H.num[d];
+    H.num[0] = (lim == 0);
+    H.num = (uint *) realloc(H.num,(H.depth+1)*sizeof(uint));
+
+    // Total encoded length in bits. Every leaf still holds its own frequency,
+    // so it is taken from the tree instead of indexing freq[] by symbol: the
+    // artificial symbol of the single-symbol case has no entry in freq[] and
+    // contributes 0 here.
+    for (i=0;i<=lim;i++)
+        H.total += tree[i].freq * tree[i].h.depth;
+
+    free (tree);
+
+    return H;
+}
+
+// Appends the codeword of symb to stream, starting at bit position ptr, and
+// returns the bit position that follows the codeword.
+// H must be in encoding form (s.spos and num as produced by createHuff).
+int encodeHuff (THuff H, uint symb, uint *stream, uint ptr)
+{
+    uint rank = H.s.spos[symb];   // position of the symbol in the sorted order
+    uint codeword = 0;
+    int  len = H.depth;
+
+    // Walk from the deepest level upwards until the level holding the symbol.
+    for ( ; rank >= H.num[len]; len--) {
+        codeword = (codeword + H.num[len]) >> 1;
+        rank -= H.num[len];
+    }
+    codeword += rank;
+
+    // A code longer than a machine word starts with len-W zero bits.
+    if (len > W) {
+        bitzero(stream,ptr,len-W);
+        ptr += len-W;
+        len = W;
+    }
+
+    // Emit the remaining bits, most significant first.
+    while (len-- != 0) {
+        if (((codeword >> len) & 1) != 0) {
+            bitset(stream,ptr);
+        }
+        else {
+            bitclean(stream,ptr);
+        }
+        ptr++;
+    }
+    return ptr;
+}
+
+// Writes to stderr, as a string of '0'/'1' characters, the codeword that H
+// (in encoding form) assigns to symb. Codes longer than W bits are reported
+// and only their last W bits are shown.
+void printCodeHuff (THuff H, uint symb)
+{
+    uint rank = H.s.spos[symb];
+    uint codeword = 0;
+    int  len = H.depth;
+
+    for ( ; rank >= H.num[len]; len--) {
+        codeword = (codeword + H.num[len]) >> 1;
+        rank -= H.num[len];
+    }
+    codeword += rank;
+
+    if (len > W) {
+        fprintf(stderr,"code larger than W");
+        len = W;
+    }
+
+    // show the code, most significant bit first
+    while (len-- != 0)
+        fprintf(stderr, ((codeword >> len) & 1) ? "1" : "0");
+}
+
+
+
+// Decodes one codeword from stream, starting at bit position ptr, using the
+// decoding tables of H (fst, num, s.symb). The decoded symbol is stored in
+// *symb and the bit position following the codeword is returned.
+int decodeHuff (THuff *H, uint *symb, uint *stream, uint ptr)
+{
+    uint acc = 0;    // bits read so far, seen as a number
+    int  level = 0;  // number of bits read so far
+
+    // Keep reading while the value is below the first code of this length.
+    for ( ; acc < H->fst[level]; level++, ptr++)
+        acc = (acc << 1) | bitget(stream,ptr);
+
+    *symb = H->s.symb[H->num[level]+acc-H->fst[level]];
+    return ptr;
+}
+
+
+
+
+/* { uint pos; // This "improved" code is actually slower!
+ int d;
+ uint wrd,off;
+ stream += ptr/W;
+ off = ptr & (W-1);
+ wrd = *stream >> off;
+ pos = 0;
+ d = 0;
+ while (pos < H.fst[d])
+ { pos = (pos << 1) | (wrd & 1);
+ d++; wrd >>= 1; off++;
+ if (off == W) { wrd = *++stream; off = 0; }
+ }
+ *symb = H.s.symb[H.num[d]+pos-H.fst[d]];
+ return ptr+d;
+ }
+*/
+// Writes H (in encoding form) to f: max, lim, depth, the lim+1 symbols in
+// sorted order, and the depth+1 entries of num.
+// The results of malloc and fwrite are not checked.
+void saveHuff (THuff H, FILE *f)
+{
+    uint nSymbols = H.lim+1;
+    uint *bySortedPos = (uint *)malloc(nSymbols*sizeof(uint));
+    uint s;
+
+    // invert spos: for every symbol that has a code, store it at its position
+    for (s = 0; s <= H.max; s++) {
+        if (H.s.spos[s] == ~0) continue;
+        bySortedPos[H.s.spos[s]] = s;
+    }
+
+    fwrite (&H.max,sizeof(uint),1,f);
+    fwrite (&H.lim,sizeof(uint),1,f);
+    fwrite (&H.depth,sizeof(uint),1,f);
+    fwrite (bySortedPos,sizeof(uint),nSymbols,f);
+    fwrite (H.num,sizeof(uint),H.depth+1,f);
+
+    free (bySortedPos);
+}
+
+// Size in bytes accounted for H in memory: 4 words, lim+1 words for the
+// symbol table and depth+1 words for each of two per-level tables.
+uint sizeHuff (THuff H)
+{
+    uint symbolWords = H.lim+1;
+    uint levelWords  = 2*(H.depth+1);
+    uint words = 4 + symbolWords + levelWords;
+
+    return words*sizeof(uint);
+}
+
+// Size in bytes reported for H on disk: one THuff plus lim+1 and depth+1
+// words of table data.
+uint sizeHuffDisk (THuff H)
+{
+    uint tableWords = (H.lim+1)+(H.depth+1);
+
+    return ( sizeof(THuff) + tableWords*sizeof(uint) );
+}
+
+// Releases the three arrays owned by H: the symbol table (s.spos, which
+// shares its storage with s.symb), num and fst.
+void freeHuff (THuff H)
+{
+    free (H.s.spos);
+    free (H.num);
+    free (H.fst);
+}
+
+
+// Reads from f a Huffman code in the layout written by saveHuff().
+//   enc != 0 : the result is prepared for encoding (s.spos is built, num holds
+//              the number of codes of each length, fst is NULL).
+//   enc == 0 : the result is prepared for decoding (s.symb holds the symbols
+//              in sorted order, fst the first code of each length and num the
+//              first position of each length).
+// total is not stored in the file and is returned as 0.
+// fst is always initialized so that freeHuff() is safe in both modes.
+// NOTE(review): the results of fread and malloc are not checked; a truncated
+// file leaves the returned structure unusable.
+THuff loadHuff (FILE *f, int enc) //enc (0/1)-> do you only want to perform encoding ??
+{
+    THuff H;
+    uint *symb;
+    int i,d,dold,dact;
+
+    H.total = 0;
+    H.fst = NULL;
+
+    fread (&H.max,sizeof(uint),1,f);
+    fread (&H.lim,sizeof(uint),1,f);
+    fread (&H.depth,sizeof(uint),1,f);
+    symb = (uint *) malloc ((H.lim+1)*sizeof(uint));
+    fread (symb,sizeof(uint),H.lim+1,f);
+    if (enc)
+       { // encoder: invert the symbol list into a symbol -> position table
+         H.s.spos = (uint *) malloc ((H.max+1)*sizeof(uint));
+         for (i=0;i<=H.max;i++) H.s.spos[i] = ~0;
+         for (i=0;i<=H.lim;i++) H.s.spos[symb[i]] = i;
+         free (symb);
+       }
+    else H.s.symb = symb;   // decoder: keep the list as it is
+
+    H.num = (uint *) malloc ((H.depth+1)*sizeof(uint));
+    fread (H.num,sizeof(uint),H.depth+1,f);
+    if (!enc)
+       { // turn the per-length counts into first codes (fst) and first positions (num)
+         H.fst = (uint *) malloc ((H.depth+1)*sizeof(uint));
+         H.fst[H.depth] = 0; dold = 0;
+         for (d=H.depth-1;d>=0;d--)
+            { dact = H.num[d+1];
+              H.fst[d] = (H.fst[d+1]+dact) >> 1;
+              H.num[d+1] = dold;
+              dold += dact;
+            }
+         H.num[0] = dold;
+       }
+    return H;
+}
+
+
+
+
+
+/***************************************************************/
+// Converts H, in place, from encoding form to decoding form:
+//  - the symbol -> position table (s.spos) is overwritten, in the same
+//    storage, by the list of symbols in sorted order (s.symb);
+//  - fst is allocated and filled with the first code of each length;
+//  - num is rewritten from "codes per length" to "first position per length".
+void prepareToDecode(THuff *H)
+{
+    uint *ordered = (uint *) malloc((H->lim+1)*sizeof(uint));
+    int k, level, seen, count;
+
+    // build the inverse table in scratch memory first: s.spos and s.symb
+    // are the same array
+    for (k = 0; k <= H->max; k++) {
+        if (H->s.spos[k] == ~0) continue;
+        ordered[H->s.spos[k]] = k;
+    }
+    for (k = 0; k <= H->lim; k++)
+        H->s.symb[k] = ordered[k];
+
+    H->fst = (uint *)malloc ((H->depth+1)*sizeof(uint));
+    H->fst[H->depth] = 0;
+    seen = 0;
+    for (level = H->depth-1; level >= 0; level--) {
+        count = H->num[level+1];
+        H->fst[level] = (H->fst[level+1]+count) >> 1;
+        H->num[level+1] = seen;
+        seen += count;
+    }
+    H->num[0] = seen;
+
+    free (ordered);
+}
+
+
+////////////
+// Writes H, already in decoding form, to f: max, lim, depth, the lim+1
+// entries of s.symb and then the depth+1 entries of fst and of num.
+// The results of fwrite are not checked.
+void saveHuffAfterDecode (THuff H, FILE *f)
+{
+    uint nSymbols = H.lim+1;
+    uint nLevels  = H.depth+1;
+
+    fwrite (&H.max,sizeof(uint),1,f);
+    fwrite (&H.lim,sizeof(uint),1,f);
+    fwrite (&H.depth,sizeof(uint),1,f);
+
+    fwrite (H.s.symb,sizeof(uint),nSymbols,f);
+    fwrite (H.fst,sizeof(uint),nLevels,f);
+    fwrite (H.num,sizeof(uint),nLevels,f);
+}
+
+
+// Placeholder kept for interface compatibility: the loading code of this
+// variant is disabled, so nothing is read from f and enc is ignored.
+// The result is now fully zero-initialized (all counters 0, all pointers
+// NULL) instead of being returned with indeterminate contents, which makes
+// it safe to inspect or to pass to freeHuff().
+// Use loadHuffAfterDecode2() to actually read the data written by
+// saveHuffAfterDecode().
+THuff loadHuffAfterDecode (FILE *f, int enc) //enc (0/1)-> do you only want to perform encoding ??
+{
+    THuff H = {0};
+
+    (void) f;
+    (void) enc;
+    return H;
+}
+
+
+
+// Fills *H from f with the layout written by saveHuffAfterDecode(): max, lim,
+// depth, lim+1 symbols, then depth+1 entries of fst and of num. The three
+// arrays are allocated here. enc is not used; H->total is left untouched.
+// The results of fread and malloc are not checked.
+void loadHuffAfterDecode2 (THuff *H, FILE *f, int enc) //enc (0/1)-> do you only want to perform encoding ??
+{
+    uint nSymbols, nLevels;
+
+    fread (&H->max,sizeof(uint),1,f);
+    fread (&H->lim,sizeof(uint),1,f);
+    fread (&H->depth,sizeof(uint),1,f);
+    nSymbols = H->lim+1;
+    nLevels  = H->depth+1;
+
+    H->s.symb = (uint *) malloc (nSymbols*sizeof(uint));
+    fread (H->s.symb,sizeof(uint),nSymbols,f);
+
+    H->fst = (uint *) malloc (nLevels*sizeof(uint));
+    fread (H->fst,sizeof(uint),nLevels,f);
+
+    H->num = (uint *) malloc (nLevels*sizeof(uint));
+    fread (H->num,sizeof(uint),nLevels,f);
+}
+
+
+
+
--- /dev/null
+
+// implements canonical Huffman
+
+#ifndef HUFFINCLUDED
+#define HUFFINCLUDED
+
+#include "basics.h"
+#define SORTED 1
+#define UNSORTED 0
+
+// Canonical Huffman code. The same structure is used in two forms: for
+// encoding (s.spos valid, num = number of codes of each length) and for
+// decoding (s.symb valid, num = first position of each length, fst valid).
+// s.spos and s.symb are a union, so only one of them is meaningful at a time.
+typedef struct
+ { uint max,lim; // maximum symbol (0..max), same excluding zero freqs
+ uint depth; // max symbol length
+ union
+ { uint *spos; // symbol positions after sorting by decr freq (enc)
+ uint *symb; // symbols sorted by freq (dec)
+ } s;
+ uint *num; // first pos of each length (dec), number of each length (enc)
+ uint *fst; // first code (numeric) of each length (dec)
+ uint total; // total length to achieve, in bits
+ } THuff;
+
+ // Creates Huffman encoder given symbols 0..lim with frequencies
+ // freq[i], ready for compression
+ // sorted --> are the symbols already sorted ?
+
+THuff createHuff (uint *freq, uint lim, uint sorted);
+
+ // Encodes symb using H, over stream[ptr...lim] (ptr and lim are
+ // bit positions of stream). Returns the new ptr.
+
+int encodeHuff (THuff H, uint symb, uint *stream, uint ptr);
+
+ // Decodes *symb using H, over stream[ptr...lim] (ptr and lim are
+ // bit positions of stream). Returns the new ptr.
+
+int decodeHuff (THuff *H, uint *symb, uint *stream, uint ptr);
+
+ //Prepares a Huffman tree for decoding (changes in spos & symb)
+
+void prepareToDecode(THuff *H);
+
+ // Writes H in file f
+
+void saveHuff (THuff H, FILE *f);
+
+ // Size of H written on file
+
+uint sizeHuffDisk (THuff H);
+
+ //Size of H in memory
+uint sizeHuff (THuff H);
+
+ // Frees H
+
+void freeHuff (THuff H);
+
+ // Loads H from file f, prepared for encoding or decoding depending
+ // on enc
+
+THuff loadHuff (FILE *f, int enc);
+
+// Macro form of decodeHuff: decodes one codeword starting at bit position ptr
+// of stream using the decoding tables of H (a THuff *). ptr is advanced past
+// the codeword and the decoded symbol, looked up in H->s.symb, is stored in
+// symbol. ptr and symbol must be lvalues.
+// Every argument is parenthesized in the expansion so that expressions such
+// as &h can be passed. The expansion is a braced block that declares locals
+// named pos and d, so the arguments must not use those names.
+
+#define decodeNormalHuffMacro(H, symbol, stream, ptr) \
+ { uint pos; \
+   int d; \
+   pos = 0; \
+   d = 0; \
+   while (pos < (H)->fst[d]) \
+     { pos = (pos << 1) | bitget((stream),(ptr)); \
+       (ptr)++; d++; \
+     } \
+   (symbol) = ((H)->s.symb[ (H)->num[d] + pos - (H)->fst[d] ]); \
+ }
+
+
+#endif
+
--- /dev/null
+
+// implements canonical Huffman
+
+#include "huffDec.h"
+
+
+//THuff createHuff (uint *fst, uint *num, uint depth) {
+// THuff H;
+// H.fst = fst;
+// H.num = num;
+// H.depth = depth;
+// return H;
+//}
+
+
+// Writes to stderr, as a string of '0'/'1' characters, the codeword computed
+// for position symb from H.num and H.depth. Codes longer than W bits are
+// reported and only their last W bits are shown.
+void printCodeHuffDec (THuffDec H, uint symb)
+{
+    uint rank = symb;   // the symbol is already its position (no spos table here)
+    uint codeword = 0;
+    int  len = H.depth;
+
+    for ( ; rank >= H.num[len]; len--) {
+        codeword = (codeword + H.num[len]) >> 1;
+        rank -= H.num[len];
+    }
+    codeword += rank;
+
+    if (len > W) {
+        fprintf(stderr,"code larger than W");
+        len = W;
+    }
+
+    // show the code, most significant bit first
+    while (len-- != 0)
+        fprintf(stderr, ((codeword >> len) & 1) ? "1" : "0");
+}
+
+
+
+// Decodes one codeword from stream, starting at bit position ptr. The rank of
+// the decoded symbol in the sorted vector of symbols is stored in *symb and
+// the bit position following the codeword is returned.
+int decodeHuffDec (THuffDec *H, uint *symb, uint *stream, uint ptr)
+{
+    uint acc = 0;    // bits read so far, seen as a number
+    int  level = 0;  // number of bits read so far
+
+    // Keep reading while the value is below the first code of this length.
+    for ( ; acc < H->fst[level]; level++, ptr++)
+        acc = (acc << 1) | bitget(stream,ptr);
+
+    *symb = H->num[level]+acc-H->fst[level];
+    return ptr;
+}
+
+
+
+// //Decodes a code starting in position ptr from stream. Returns the ranking in the
+// //vector of symbols.
+// int decodeHuffDecVariantWord (uint *fst , uint *symb, uint *stream, uint ptr, uint depth)
+// { uint pos;
+// int d;
+// pos = 0;
+// d = 0;
+// while ((d< depth) && (pos < fst[d*2]) )
+// { pos = (pos << 1) | bitget(stream,ptr);
+// ptr++; d++;
+// }
+// if (depth==d)
+// *symb = pos;
+// else
+// *symb =
+// pos -
+// fst[d*2] +
+// fst[d*2+1];
+// //(*symb) = (depth==d) ? pos : fst[d*2+1] + pos - fst[d*2];
+//
+// return ptr;
+// }
+
+
+
+// Bytes accounted for a THuffDec: one word plus depth+1 words for each of
+// the two per-level tables (fst and num).
+uint sizeHuffDec (THuffDec H)
+{
+    uint words = 1 + 2*(H.depth+1);
+
+    return words*sizeof(uint);
+}
+
+
+// Releases the two tables owned by H.
+void freeHuffDec (THuffDec H)
+{
+    free (H.fst);
+    free (H.num);
+}
+
+
+// Placeholder kept for interface compatibility: the loading code of this
+// variant is disabled, so nothing is read from f and enc is ignored.
+// The result is now fully zero-initialized (depth 0, num and fst NULL)
+// instead of being returned with indeterminate contents, which makes it safe
+// to inspect or to pass to freeHuffDec().
+// Use loadHuffDecAfterDecode2() to actually read a THuffDec from a file.
+THuffDec loadHuffDecAfterDecode (FILE *f, int enc) //enc (0/1)-> do you only want to perform encoding ??
+{
+    THuffDec H = {0};
+
+    (void) f;
+    (void) enc;
+    return H;
+}
+
+
+////THuffDec loadHuffDec (FILE *f, int enc)
+////
+//// { THuffDec H;
+//// uint *symb;
+//// uint *num;
+//// int i,d,dold,dact;
+//// fread (&H.max,sizeof(uint),1,f);
+//// fread (&H.lim,sizeof(uint),1,f);
+//// fread (&H.depth,sizeof(uint),1,f);
+//// symb = malloc ((H.lim+1)*sizeof(uint));
+//// fread (symb,sizeof(uint),H.lim+1,f);
+//// if (enc)
+//// { H.s.spos = malloc ((H.max+1)*sizeof(uint));
+//// for (i=0;i<=H.max;i++) H.s.spos[i] = ~0;
+//// for (i=0;i<=H.lim;i++) H.s.spos[symb[i]] = i;
+//// free (symb);
+//// }
+//// else H.s.symb = symb;
+//// H.num = malloc ((H.depth+1)*sizeof(uint));
+//// fread (H.num,sizeof(uint),H.depth+1,f);
+//// if (!enc)
+//// { H.fst = malloc ((H.depth+1)*sizeof(uint));
+//// H.fst[H.depth] = 0; dold = 0;
+//// for (d=H.depth-1;d>=0;d--)
+//// { dact = H.num[d+1];
+//// H.fst[d] = (H.fst[d+1]+dact) >> 1;
+//// H.num[d+1] = dold;
+//// dold += dact;
+//// }
+//// H.num[0] = dold;
+//// }
+//// return H;
+//// }
+
+
+// Fills *H from f: depth, then depth+1 entries of fst and depth+1 entries of
+// num. Both arrays are allocated here. enc is not used.
+// The results of fread and malloc are not checked.
+void loadHuffDecAfterDecode2 (THuffDec *H, FILE *f, int enc) //enc (0/1)-> do you only want to perform encoding ??
+{
+    uint nLevels;
+
+    fread (&H->depth,sizeof(uint),1,f);
+    nLevels = H->depth+1;
+
+    H->fst = (uint *) malloc (nLevels*sizeof(uint));
+    fread (H->fst,sizeof(uint),nLevels,f);
+
+    H->num = (uint *) malloc (nLevels*sizeof(uint));
+    fread (H->num,sizeof(uint),nLevels,f);
+}
+
+
+
+
+
+
+
+
--- /dev/null
+
+// implements canonical Huffman !! Just for decoding when symbols were sorted before creating huffman
+
+#ifndef HUFFDECINCLUDED
+#define HUFFDECINCLUDED
+
+#include "basics.h"
+#define SORTED 1
+#define UNSORTED 0
+
+// Decode-only canonical Huffman tables: there is no symbol table, decoding
+// yields the rank of the symbol (see decodeHuffDec).
+typedef struct
+ { //uint lim;
+ uint depth; // max symbol length
+ uint *num; // first pos of each length (dec), number of each length (enc)
+ uint *fst; // first code (numeric) of each length (dec)
+ } THuffDec;
+
+
+//typedef struct
+// { uint max,lim; // maximum symbol (0..max), same excluding zero freqs
+// uint depth; // max symbol length
+// union
+// { uint *spos; // symbol positions after sorting by decr freq (enc)
+// uint *symb; // symbols sorted by freq (dec)
+// } s;
+// uint *num; // first pos of each length (dec), number of each length (enc)
+// uint *fst; // first code (numeric) of each length (dec)
+// uint total; // total length to achieve, in bits
+// } THuff;
+
+
+ // Decodes *symb using H, over stream[ptr...lim] (ptr and lim are
+ // bit positions of stream). Returns the new ptr.
+int decodeHuffDec (THuffDec *H, uint *symb, uint *stream, uint ptr);
+
+ // Writes H in file f
+void saveHuffDec (THuffDec H, FILE *f);
+
+ // Frees H
+void freeHuffDec (THuffDec H);
+
+ // the number of bytes used by HuffDecman struct.
+uint sizeHuffDec (THuffDec H);
+
+ // Loads H from file f, prepared for encoding or decoding depending
+ // on enc
+
+THuffDec loadHuffDec (FILE *f, int enc);
+
+
+//Macro form of decodeHuffDec: decodes a code starting in position ptr from
+//stream using H (a THuffDec *). ptr is advanced past the code and symb
+//receives the ranking in the vector of symbols. ptr and symb must be lvalues.
+//Every argument is parenthesized in the expansion so that expressions such as
+//&h can be passed. The expansion is a braced block declaring locals named pos
+//and d, so the arguments must not use those names.
+//NOTE(review): the fflush(stdout) looks like debugging residue; it is kept
+//because removing it could change how callers' output is interleaved.
+#define decodeHuffDecMacro(H, symb, stream, ptr) \
+ { uint pos; \
+   int d; \
+   pos = 0; \
+   d = 0; \
+   while (pos < (H)->fst[d]) \
+     { pos = (pos << 1) | bitget((stream),(ptr)); \
+       (ptr)++; d++; \
+     } \
+   fflush(stdout); \
+   (symb) = (H)->num[d]+pos-(H)->fst[d]; \
+ }
+
+#endif
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+//#define decodeHuffDecMacroVALIDWORDS decodeHuffDecMacro
+
+//Decodes a code starting in position ptr from stream. Returns (in symb) the
+//ranking in the vector of symbols; ptr is advanced past the code.
+//Saves space, as the last level of num[] and fst[] is not needed,
+// at the expense of an extra test (the last line).
+//Because that last level is not stored, d < depth is tested BEFORE fst[d] is
+//read: the previous order of the two conditions read fst[depth], one entry
+//past the stored levels, whenever a code reached the maximum length.
+//Arguments are parenthesized in the expansion; ptr and symb must be lvalues
+//and the arguments must not be named pos or d.
+//NOTE(review): the fflush(stdout) looks like debugging residue; it is kept
+//because removing it could change how callers' output is interleaved.
+#define decodeHuffDecMacroVALIDWORDS(H, symb, stream, ptr, depth) \
+ { uint pos; \
+   int d; \
+   pos = 0; \
+   d = 0; \
+   while ((d < (depth)) && (pos < (H)->fst[d])) \
+     { pos = (pos << 1) | bitget((stream),(ptr)); \
+       (ptr)++; d++; \
+     } \
+   fflush(stdout); \
+   (symb) = ((depth)==d) ? pos : (H)->num[d]+pos-(H)->fst[d]; \
+ }
+
+
+//Decodes a code starting in position ptr from stream, using separate fst[]
+//and num[] arrays. Returns (in symb) the ranking in the vector of symbols;
+//ptr is advanced past the code.
+//A code of maximum length (d == depth) is its own ranking, so level depth of
+//fst[]/num[] is never needed: d < depth is tested BEFORE fst[d] is read (the
+//previous order of the two conditions read fst[depth]).
+//Arguments are parenthesized in the expansion; ptr and symb must be lvalues
+//and the arguments must not be named pos or d.
+//NOTE(review): the fflush(stdout) looks like debugging residue; it is kept
+//because removing it could change how callers' output is interleaved.
+#define decodeHuffDecMacroVariantWordxx(fst,num, symb, stream, ptr, depth) \
+ { uint pos; \
+   int d; \
+   pos = 0; \
+   d = 0; \
+   while ((d < (depth)) && (pos < (fst)[d])) \
+     { pos = (pos << 1) | bitget((stream),(ptr)); \
+       (ptr)++; d++; \
+     } \
+   fflush(stdout); \
+   (symb) = ((depth)==d) ? pos : (num)[d] + pos - (fst)[d]; \
+ }
+
+
+
+//Decodes a code starting in position ptr from stream. Returns the ranking in the
+//vector of symbols.
+// #define decodeHuffDecMacroVariantWord(fst,symb, stream, ptr, depth) \
+// { uint pos; \
+// int d; \
+// pos = 0; \
+// d = 0; \
+// while ((d< depth) && (pos < fst[d*2]) ) \
+// { pos = (pos << 1) | bitget(stream,ptr); \
+// ptr++; d++; \
+// } \
+// symb = (depth==d) ? pos : fst[d*2+1] + pos - fst[d*2]; \
+// }
+
+
+/** Decodes a variant of a word from a stream of compressed bits, starting in the ptr-th bit.
+ fstnum interleaves the two tables as [fst|num|fst|num|fst|num]: for level d of the
+ bucket that starts at index "offbucket", fst is at fstnum[offbucket + d*2] and
+ num at fstnum[offbucket + d*2 + 1].
+ On exit ptr has been advanced past the code and symb holds the decoded value:
+ the bits read themselves when the code has the maximum length (d == depth),
+ num + pos - fst of the level reached otherwise.
+ d < depth is tested before fstnum is read, so level "depth" is never accessed.
+ The expansion declares locals named pos and d; arguments are not parenthesized.
+ */
+#define decodeHuffDecMacroVariantWordPos(fstnum, offbucket, symb, stream, ptr, depth) \
+ { uint pos; \
+ register uint d; \
+ pos = 0; \
+ d = 0; \
+ while ((d< depth) && (pos < fstnum[offbucket + d*2]) ) \
+ { pos = (pos << 1) | bitget(stream,ptr); \
+ ptr++; d++; \
+ } \
+ symb = (depth==d) ? pos : fstnum[offbucket+d*2+1] + pos - fstnum[offbucket+d*2]; \
+ }
+/* Same decoding as decodeHuffDecMacroVariantWordPos, but the bucket is looked
+   up from HV and idCanonical: HV.offsetNumAndFst[idCanonical] is the index of
+   the bucket inside HV.zoneNumFst, and the distance to the next entry of
+   HV.offsetNumAndFst, divided by 2 (one fst and one num per level), gives the
+   number of levels of that bucket. HV is accessed with '.', so it must be a
+   structure, not a pointer. ptr is advanced past the code; symb receives the
+   decoded value. */
+#define decodeHuffDecMacroVariantWordPos2(HV, symb, stream, ptr, idCanonical) \
+ { uint pos; \
+ register uint d; \
+ uint *offfstnum = HV.offsetNumAndFst; \
+ uint *fstnum = HV.zoneNumFst; \
+ register uint offbucket = offfstnum[idCanonical]; \
+ register uint depth = (offfstnum[idCanonical+1] - offbucket)/2; \
+ pos = 0; \
+ d = 0; \
+ while ((d< depth) && (pos < fstnum[offbucket + d*2]) ) \
+ { pos = (pos << 1) | bitget(stream,ptr); \
+ ptr++; d++; \
+ } \
+ symb = (depth==d) ? pos : fstnum[offbucket+d*2+1] + pos - fstnum[offbucket+d*2]; \
+ }
+
+
+ //fst[0] = zone[0 ];
+ //fst[1] = zone[1*2];
+ //fst[i] = zone[i*2];
+
+ //num[0] = zone[0 +1];
+ //num[1] = zone[1*2+1];
+ //num[i] = zone[i*2+1];
+
+
+
+//#define decodeHuffDecMacroVariantWordPos2bits(HV, symb, stream, ptr, idCanonical) \
+// { uint pos; \
+// register uint d; \
+// uint sizeBuckbits= HV.sizeBuckbits; \
+// uint dirbElemSize = HV.dirbElemSize; \
+// uint sizeFstbits = HV.sizeFstbits; \
+// uint sizeNumbits = HV.sizeNumbits; \
+// uint *dirb = HV.Dirb; \
+// uint *zonefstnum = HV.zoneMem; \
+// register uint offbucket; \
+// offbucket = bitread (dirb, idCanonical*dirbElemSize, dirbElemSize); \
+// register uint depth; \
+// depth = bitread (dirb, (idCanonical+1)*dirbElemSize, dirbElemSize); \
+// depth = (depth - offbucket)/sizeBuckbits; \
+// pos = 0; \
+// d = 0; \
+// register uint currfst; \
+// currfst = bitread (zonefstnum, (offbucket + d*sizeBuckbits), sizeFstbits); \
+// while ((pos < currfst) ) \
+// { pos = (pos << 1) | bitget(stream,ptr); \
+// ptr++; d++; \
+// if (d<depth) \
+// currfst = bitread (zonefstnum, (offbucket + d*sizeBuckbits), sizeFstbits); \
+// else break; \
+// } \
+// if (depth==d) \
+// symb=pos; \
+// else { \
+// uint currNum; \
+// currNum = bitread (zonefstnum, (offbucket + d*sizeBuckbits+ sizeFstbits), sizeNumbits); \
+// symb = currNum + pos - currfst; \
+// } \
+// }
+
+
+
+// int decodeHuffDecVariantWord (uint *fst , uint *symb, uint *stream, uint ptr, uint depth);
+
+
--- /dev/null
+ #include "kBitArray.h"\r
+ \r
+/*-----------------------------------------------------------------
+ Initialization of the kBitArray
+
+ Creates an array of `size` elements of `elemSize` bits each, packed
+ into ((size*elemSize)+W-1)/W words of type uint.
+ Only the last word is cleared (so that its unused padding bits are
+ defined); the remaining contents are not initialized.
+ Returns the new array, or NULL if memory could not be obtained.
+ An array that needs no storage (size*elemSize == 0) is valid: the
+ previous code wrote to data[-1] in that case.
+ A short report is printed on stdout.
+ ---------------------------------------------------------------- */
+ t_kBitArray create_kBitArray (uint size, uint elemSize) {
+     uint bitsNeeded;
+     t_kBitArray V;
+
+     V = (t_kBitArray) malloc (sizeof(struct akbitArr));
+     if (V == NULL) return NULL;
+
+     bitsNeeded = size * elemSize;
+     V->totalInts = ((bitsNeeded+W-1)/W);
+     V->data = (uint *) malloc((V->totalInts) * sizeof(uint));
+     if (V->totalInts > 0) {
+         if (V->data == NULL) {
+             free(V);
+             return NULL;
+         }
+         V->data[V->totalInts-1]=0;   /** avoids valgrind to blame*/
+     }
+     V->size = size;
+     V->elemSize= elemSize;
+     /* size is unsigned, so it is printed with %u */
+     printf("\nKbitVector[0,%u), of elemSize = %u initialized\n",V->size,V->elemSize);
+     printf("\ntotalInts = %u, size = %u elemSize = %u\n",V->totalInts,V->size,V->elemSize);
+     return V;
+}
+\r
+/* Returns element i of V, read with the mybitread macro from the elemSize
+   bits that start at bit i*elemSize of V->data. */
+uint getKBitArray(t_kBitArray V, register uint i) {
+    uint bitsPerElem = V->elemSize;
+    uint firstBit = i * bitsPerElem;
+    uint value;
+
+    mybitread(value, V->data, firstBit, bitsPerElem);
+    return value;
+}
+\r
+/* Same as getKBitArray, but reading through bitread() instead of the
+   mybitread macro. */
+uint getKBitArraySinMacro(t_kBitArray V, register uint i) {
+    uint bitsPerElem = V->elemSize;
+    uint firstBit = i * bitsPerElem;
+
+    return bitread(V->data, firstBit, bitsPerElem);
+}
+\r
+/* Stores value as element i of V, written with the mybitwrite macro into the
+   elemSize bits that start at bit i*elemSize of V->data. */
+void setKBitArray(t_kBitArray V, uint i, uint value){
+    uint bitsPerElem = V->elemSize;
+    uint firstBit = i * bitsPerElem;
+
+    mybitwrite(V->data, firstBit, bitsPerElem, value);
+}
+\r
+/* Same as setKBitArray, but writing through bitwrite() instead of the
+   mybitwrite macro. */
+void setKBitArraySinMacro(t_kBitArray V, uint i, uint value){
+    uint bitsPerElem = V->elemSize;
+    uint firstBit = i * bitsPerElem;
+
+    bitwrite(V->data, firstBit, bitsPerElem, value);
+}
+\r
+/*-----------------------------------------------------------------\r
+ freeing resources\r
+ ---------------------------------------------------------------- */ \r
+/* Frees the data words and the descriptor of kBitArray, then reports on
+   stdout how many bytes the data words occupied. */
+void destroy_kBitArray (t_kBitArray kBitArray) {
+    /* must be computed before the descriptor is released */
+    uint freedBytes = (kBitArray->totalInts) * sizeof(uint);
+
+    free(kBitArray->data);
+    free(kBitArray);
+    printf("\n[destroying a Kbit array of size table]...Freed %u bytes... RAM", freedBytes);
+}
+\r
+\r
--- /dev/null
+#ifndef KBITARRAY_INCLUDED
+#define KBITARRAY_INCLUDED
+/* Include guard: struct akbitArr is defined here, so including this header
+   twice in one translation unit would otherwise be a redefinition error. */
+
+#include "basics.h"
+
+/* Array of fixed-width elements packed into words of type uint. */
+struct akbitArr {
+ uint *data; /* uint * contains space for the array of kbit elements */
+ uint size; /* number of kbitElements */
+ uint elemSize; /* number of bits of each element*/
+ uint totalInts; /* number of ints used */
+};
+
+typedef struct akbitArr *t_kBitArray;
+
+ /*********/
+ /* creation / element access / destruction */
+ t_kBitArray create_kBitArray (uint size, uint elemSize);
+ uint getKBitArray(t_kBitArray V,uint i);
+ void setKBitArray(t_kBitArray V, uint i, uint value);
+ void destroy_kBitArray (t_kBitArray kBitArray);
+
+ /* variants defined in kBitArray.c that go through bitread()/bitwrite()
+    instead of the mybitread/mybitwrite macros */
+ uint getKBitArraySinMacro(t_kBitArray V, uint i);
+ void setKBitArraySinMacro(t_kBitArray V, uint i, uint value);
+
+ /* macro access; arguments are those of mybitread: (answ, v, p, len) */
+ #define getKBitArrayMacro mybitread
+
+#endif /* KBITARRAY_INCLUDED */
+\r
+\r
+\r
+\r
--- /dev/null
+
+#include "parameters.h"
+
+/***********************************************************************************/
+/*** FUNCTIONS USED FOR PARSING PARAMETERS FROM COMMAND LINE ***********************/
+/* Three function to variables to manage parameters */
+ /* Tells whether c is one of the characters of the NUL-terminated string
+    delimiters. The terminating NUL itself never matches. */
+ bool is_delimeter(char *delimiters, char c) {
+     int remaining = strlen(delimiters);
+     char *cursor = delimiters;
+
+     while (remaining-- > 0) {
+         if (*cursor++ == c)
+             return true;
+     }
+     return false;
+}
+
+
+ /* Splits options into the tokens separated by any character of delimiters.
+    *parameters receives a newly allocated array with one newly allocated,
+    NUL-terminated copy of each token, and *num_parameters the number of
+    tokens. Runs of delimiters produce no empty tokens. The result is meant
+    to be released with free_parameters(). malloc results are not checked. */
+ void parse_parameters(char *options, int *num_parameters, char ***parameters, char *delimiters) {
+     int total = strlen(options);
+     int pos, start;
+     int count = 0;   /* tokens found by the first pass */
+     int slot = 0;    /* next free entry of the result */
+     int tokLen;
+     char *copy;
+
+     /* first pass: count the tokens */
+     for (pos = 0; pos < total; ) {
+         while ((pos < total) && is_delimeter(delimiters,options[pos])) pos++;
+         start = pos;
+         while ((pos < total) && !is_delimeter(delimiters,options[pos])) pos++;
+         if (pos != start) count++;
+     }
+
+     (*parameters) = (char **) malloc(count*sizeof(char * ));
+
+     /* second pass: copy every token */
+     for (pos = 0; pos < total; ) {
+         while ((pos < total) && is_delimeter(delimiters,options[pos])) pos++;
+         start = pos;
+         while ((pos < total) && !is_delimeter(delimiters,options[pos])) pos++;
+         if (pos == start) continue;   /* only trailing delimiters were left */
+
+         tokLen = pos - start;
+         (*parameters)[slot] = (char *) malloc((tokLen+1)*sizeof(char));
+         copy = (*parameters)[slot];
+         strncpy(copy, options+start, tokLen);
+         copy[tokLen] = '\0';
+         slot++;
+     }
+
+     *num_parameters = count;
+}
+
+ /* Releases the num_parameters strings of *parameters and then the array
+    itself. *parameters is left unchanged (dangling) afterwards. */
+ void free_parameters(int num_parameters,char ***parameters) {
+     int k = 0;
+
+     while (k < num_parameters) {
+         free((*parameters)[k]);
+         k++;
+     }
+     free(*parameters);
+}
--- /dev/null
+#include <string.h>
+#include <stdlib.h>
+/***********************************************************************************/
+/*** FUNCTIONS USED FOR PARSING PARAMETERS FROM COMMAND LINE ***********************/
+
+#ifndef PARAMETERS_INCLUDED
+#define PARAMETERS_INCLUDED
+ bool is_delimeter(char *delimiters, char c) ;
+ void parse_parameters(char *options, int *num_parameters, char ***parameters, char *delimiters);
+ void free_parameters(int num_parameters,char ***parameters);
+#endif
--- /dev/null
+#include "valstring.h"
+#include <stdio.h>
+
+unsigned char _Valid[256];
+unsigned char _Invalid[256];
+
+unsigned char _toLow[256];
+
+
+/* Default character classification, used unless ValidCh was already defined. */
+#ifndef ValidCh
+
+ #define ValidCh(ch) (isalnum(ch)) /* Validity test */
+ #define InvalidCh(ch) (!ValidCh(ch))
+
+#endif
+
+
+
+
+/* Fills the _Valid / _Invalid lookup tables:
+    - codes 0..127 are valid when ValidCh() accepts them, invalid otherwise;
+    - codes 128..255 are invalid, except the accented letters listed below;
+    - code 0 is neither valid nor invalid. */
+void StartValid() {
+    /* Special characters (accented vowels, u-diaeresis, n-tilde) that are
+       accepted as valid, by character code. */
+    static const unsigned char accented[] = {
+        241, 209,                 /* n-tilde, N-tilde */
+        225, 233, 237, 243, 250,  /* a, e, i, o, u with acute accent */
+        193, 201, 205, 211, 218,  /* A, E, I, O, U with acute accent */
+        252, 220                  /* u-diaeresis, U-diaeresis */
+    };
+    unsigned code, k;
+
+    for (code = 0; code < 256; code++) {
+        /* ValidCh is only consulted for the 7-bit range */
+        if ((code < 128) && ValidCh(code)) {
+            _Valid[code] = 1;
+            _Invalid[code] = 0;
+        }
+        else {
+            _Valid[code] = 0;
+            _Invalid[code] = 1;
+        }
+    }
+
+    _Valid[0] = _Invalid[0] = 0;
+
+    for (k = 0; k < sizeof(accented)/sizeof(accented[0]); k++) {
+        _Valid[accented[k]] = 1;
+        _Invalid[accented[k]] = 0;
+    }
+}
+
+
+
+
+/* Fills _toLow, the table that folds a character to its lower-case,
+   unaccented form:
+    - 'A'..'Z' map to 'a'..'z';
+    - characters that are not marked in _Valid map to themselves;
+    - valid accented letters map to their base vowel ('a','e','i','o','u'),
+      N-tilde (209) maps to n-tilde (241), 138/154 map to 's' and
+      159/221/253/255 map to 'y';
+    - any other valid character maps to itself.
+   The table reads _Valid, so StartValid() has to be called first.
+   Code 200 (E with grave accent) has been added to the 'e' group, which
+   listed 201..203 and 232..235 but skipped it. */
+void StartToLow() {
+    int i;
+    unsigned char c;
+    for (i=0;i<256;i++) {
+        c=i;
+        if( (c >= 'A') && (c <= 'Z') ){
+            _toLow[i]= c+ 'a'-'A';
+        }
+        else if (!_Valid[c]) {
+            _toLow[i]=c;
+        }
+        else {
+            switch(c) {
+                case 192: case 193: case 194: case 195: case 196: case 197: case 224: case 225:
+                case 226: case 227: case 228: case 229:
+                    c = 97; break;      /* 'a' */
+
+                case 200: case 201: case 202: case 203: case 232: case 233: case 234: case 235:
+                    c = 101; break;     /* 'e' */
+
+                case 204: case 205: case 206: case 207: case 236: case 237: case 238: case 239:
+                    c = 105; break;     /* 'i' */
+
+                case 210: case 211: case 212: case 213: case 214: case 242: case 243: case 244:
+                case 245: case 246:
+                    c = 111; break;     /* 'o' */
+
+                case 217: case 218: case 219: case 220: case 249: case 250: case 251: case 252:
+                    c = 117; break;     /* 'u' */
+
+                case 209: c = 241; break;   /* N-tilde -> n-tilde */
+
+                case 138: case 154: c = 115; break;   /* 's' */
+
+                case 159: case 221: case 253: case 255:
+                    c = 121; break;     /* 'y' */
+            }
+            _toLow[i]=c;
+        }
+    }
+}
--- /dev/null
+#include <ctype.h>
+
+/* Default character classification, used unless ValidCh was already defined. */
+#ifndef ValidCh
+
+ #define ValidCh(ch) (isalnum(ch)) /* Validity test */
+ #define InvalidCh(ch) (!ValidCh(ch))
+
+#endif
+
+
+extern unsigned char _Valid[256];
+extern unsigned char _Invalid[256];
+
+extern unsigned char _toLow[256];
+
+void StartValid();
+void StartToLow();