Added simple WCSA

author nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>

Sat, 23 Oct 2010 12:32:46 +0000 (12:32 +0000)

committer nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>

Sat, 23 Oct 2010 12:32:46 +0000 (12:32 +0000)
author nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
Sat, 23 Oct 2010 12:32:46 +0000 (12:32 +0000)
committer nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
Sat, 23 Oct 2010 12:32:46 +0000 (12:32 +0000)
diff --git a/makefile b/makefile

index 046043b..f12c965 100644 (file)
--- a/makefile
+++ b/makefile
@@ -4,11 +4,12 @@ CPPFLAGS = -Wall -ansi -g -I$(LIBCDSPATH)/includes/ -O3 -DNDEBUG
  LIBCDSA = $(LIBCDSPATH)/lib/libcds.a
  LIBRLCSA = incbwt/rlcsa.a
  LIBLZTRIE = lzindex/lztrie.a
+LIBSWCSA = swcsa/swcsa.a
  
  dcover_obs = dcover/difference_cover.o
  
  TextCollection_obs = TextCollection.o TextCollectionBuilder.o TCImplementation.o Tools.o BitRank.o \
-                     TextStorage.o ${LIBRLCSA} ${LIBCDSA} ${LIBLZTRIE}
+                     TextStorage.o ${LIBRLCSA} ${LIBCDSA} ${LIBLZTRIE} ${LIBSWCSA}
  TCDebug_obs =  bittree.o rbtree.o dynFMI.o 
  
  all: testTextCollection
@@ -28,11 +29,18 @@ incbwt/rlcsa.a:
  lzindex/lztrie.a:
         @make -C lzindex
  
+swcsa/swcsa.a:
+       @make -C swcsa
+
  clean:
         @make clean -C incbwt
         @make clean -C lzindex
+       @make clean -C swcsa
         rm -f core *.o *~ testTextCollection timeTextCollection dcover/*.o dcover/*~
  
+shallow_clean:
+       rm -f core *.o *~ testTextCollection timeTextCollection
+
  depend:
         @make depend -C incbwt
         $(CC) -MM *.cpp > dependencies.mk
diff --git a/swcsa/Makefile b/swcsa/Makefile

new file mode 100644 (file)

index 0000000..1d731af
--- /dev/null
+++ b/swcsa/Makefile
@@ -0,0 +1,75 @@
+SRCDIRUTILS = utils
+SRCDIRCSA   = intIndex
+CC          = g++
+
+# If you have trouble with make, e.g.:
+#    /usr/bin/ld: skipping incompatible /usr/lib/gcc/x86_64-linux-gnu/4.4.3/libstdc++.a when searching for -lstdc++
+#    ...
+# try adding
+#    ln -s /usr/lib32/libstdc++.so.6.0.13 libstdc++.so
+#
+# The filename libstdc++.so.6.0.13 is probably different
+# but any from /usr/lib32 is fine.
+export CFLAGS  = -O9 -m32 -L. -D_FORTIFY_SOURCE=0
+#export CFLAGS      = -O0 -m32 -pg
+#export CFLAGS      = -g -m32 -O0
+
+LIBINDEX       = swcsa.a
+LIBINTINDEX    = icsa.a
+
+all: clean wcsa cleanO
+
+wcsa: intIndexPackage buildFacade.o parameters.o hash.o valstring.o MemoryManager.o basics.o \
+               bitmap.o huffDec.o huff.o fileInfo.o
+       ar rc $(LIBINTINDEX) parameters.o buildFacade.o hash.o valstring.o MemoryManager.o basics.o \
+               bitmap.o huffDec.o huff.o fileInfo.o 
+       mv $(LIBINTINDEX) $(LIBINDEX)
+
+################# SELF INDEX ON INTEGERS ##############################
+intIndexPackage:
+       cd  $(SRCDIRCSA) && $(MAKE) -w
+       @echo "[copying the int-index lib into current dir]"
+       @cp $(SRCDIRCSA)/$(LIBINTINDEX) .
+
+
+####################### UTILS MODULES #################################
+
+parameters.o: 
+       $(CC) $(CFLAGS) -c $(SRCDIRUTILS)/parameters.c 
+
+fileInfo.o:  
+       $(CC) $(CFLAGS) -c $(SRCDIRUTILS)/fileInfo.c
+
+hash.o: MemoryManager.o
+       $(CC) $(CFLAGS) -c $(SRCDIRUTILS)/hash.c
+
+
+MemoryManager.o:
+       $(CC) $(CFLAGS) -c $(SRCDIRUTILS)/MemoryManager.c
+
+valstring.o:  
+       $(CC) $(CFLAGS) -c $(SRCDIRUTILS)/valstring.c   
+
+huff.o: 
+       $(CC) $(CFLAGS) -c $(SRCDIRUTILS)/huff.c
+
+huffDec.o: 
+       $(CC) $(CFLAGS) -c $(SRCDIRUTILS)/huffDec.c
+
+
+basics.o: 
+       $(CC) $(CFLAGS) -c $(SRCDIRUTILS)/basics.c
+
+bitmap.o: 
+       $(CC) $(CFLAGS) -c $(SRCDIRUTILS)/bitmap.c
+
+
+############################ CLEANING #################################
+
+cleanO: 
+       rm -f *.o
+       
+clean:
+       cd  $(SRCDIRCSA) && $(MAKE) clean -w
+       rm -rf *~ *% *.o core *.bak $(LIBINTINDEX) $(LIBINDEX)
+
diff --git a/swcsa/buildAll.c b/swcsa/buildAll.c

new file mode 100644 (file)

index 0000000..b114d3a
--- /dev/null
+++ b/swcsa/buildAll.c
@@ -0,0 +1,365 @@
+#include "buildFacade.h"
+
+/**------------------------------------------------------------------ 
+  *  MAIN PROGRAM.
+  *------------------------------------------------------------------ */
+
+       int main(int argc, char* argv[])
+       {
+                               
+               char *infile, *outbasename, *stopwordsfile;     // Name of in/out files
+               byte *inputBuffer;
+               ulong finsize;
+       
+               int f_in;
+               void *Index;
+
+               
+               printf("\n*Presentation level for CSA (simple WCSA)");
+               printf("\n*CopyRight (c) 2007 [LBD & G.N.]\n\n");
+       
+               // Reads input parameters from command line.
+               if(argc < 3) {
+                       printf("Use: %s <in file> <out basename> [build_options]\n", argv[0]);
+                       exit(0);
+               }
+       
+               // Reads params (input file, output basename, and stopwords file)
+               infile = argv[1];
+               outbasename = argv[2];
+               stopwordsfile = argv[3];
+               
+               finsize= fileSize(infile);
+               
+               if (! finsize) {
+                       printf( "\nFILE EMPTY OR FILE NOT FOUND %s !!\nSkipping processement ...\n",infile);
+                       exit(0);
+               }       
+       
+               // Opening the input text file.
+               if( (f_in = open(infile, O_RDONLY)) < 0) {
+                       printf("Cannot read file %s\n", infile);
+                       exit(0);
+               }       
+               inputBuffer = (byte *) malloc(finsize *sizeof(byte));// +1);
+               read (f_in,inputBuffer,finsize);        
+               close (f_in);   
+               
+               
+       {
+               //printf("\n parametros <<%s>>\n\n",stopwordsfile);     
+               //      build_WCSA (inputBuffer, finsize, stopwordsfile, NULL,outbasename);                                                             
+               build_index (inputBuffer, finsize, stopwordsfile, &Index);  /** building the index */
+
+
+               /** saving the index to disk*/
+               
+               save_index (Index, outbasename);                
+               fprintf(stderr,"Index saved !! ");
+
+               /** tells the mem used by the index */
+               ulong indexsize;                
+               index_size(Index, &indexsize);
+               fprintf(stderr,"Index occupied %d bytes, 2 extra mallocs = %d",indexsize,2* sizeof(uint));
+
+
+               /** recovering the source text from the index */
+                       {
+                               double start, end;
+                               start = getTime2();
+                               ulong size;
+                               get_length(Index, &size);
+
+                               fprintf(stderr, "\nRecovering source file ");   fflush(stderr);
+                               char ext1[10]=".source";
+                               recoverSourceText1((twcsa*) Index, outbasename,ext1, size);
+                               end = getTime2();       
+                               fprintf(stderr, " time: %.3f secs\n", end-start );      
+
+                               start=end;
+                               char ext2[10]=".source2";
+                               fprintf(stderr, "\nRecovering source file ");   fflush(stderr);
+                               recoverSourceText2((twcsa*) Index, outbasename,ext2,size);
+                               end = getTime2();       
+                               fprintf(stderr, " time: %.3f secs\n", end-start );      
+                               //fprintf(stderr, "\nRecovering source file time: %.3f secs\n", end-start );    
+                       }
+               
+               // DISPLAYING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
+                       {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
+                        int error = 0;
+                       ulong numocc,numc, length, i, *snippet_len, tot_numcharext = 0, numpatt;
+                       uchar *pattern, *snippet_text;
+                                
+                                pattern = textPattern;
+                        printf("\nSEARCH TEST for DISPLAY (pizzachili interface)\n");
+                               while(1) {      
+                                       printf("Intro string: ");
+                                       fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
+                                       if (!strcmp((char*)textPattern,"\n") ) break;
+                                        textPattern[strlen((char*)textPattern)-1] = '\0';
+               
+                                       length = strlen( (char*)textPattern);
+                                       numc=50;
+         
+//                                     error = display (Index, textPattern, length, numc, &numocc, 
+//                                                              &snippet_text, &snippet_len);
+                                       error = displayWords (Index, textPattern, length, numc, &numocc, 
+                                                                &snippet_text, &snippet_len,1);
+                                       
+                                       if (error){ fprintf(stderr, "%s\n", "Hubo un error durante display");exit(0);}
+               
+                                               fprintf(stderr,"\n acabou display");fflush(stderr);                     
+                                       {//show the results
+                                               ulong j, len = length + 2*numc;
+                                           char blank = '\0';
+                                               fprintf(stderr,"\n length = %d",length);
+                                               fprintf(stderr,"\n pattern = %s",pattern);fflush(stderr);
+                                               fprintf(stderr,"\n numocc = %d",numocc);fflush(stderr);
+                                               fprintf(stderr,"\n snippet len = %d",len);fflush(stderr);
+                                               fprintf(stderr,"\n =========");fflush(stderr);          
+                                               for (i = 0; i < numocc; i++){
+                                                       fprintf(stderr,"\n[%2d][len=%3d]<<",i+1,snippet_len[i]);fflush(stderr);
+                                                       fwrite(snippet_text+len*i,sizeof(uchar),snippet_len[i],stderr);fflush(stderr);
+                                                       fprintf(stderr,">>");fflush(stderr);
+                                               }
+                                       }
+                                       numpatt--;
+                                       
+                                       for(i=0; i<numocc; i++) {
+                                               tot_numcharext += snippet_len[i];
+                                       }
+                                               
+                                       if (numocc) {
+                                               free (snippet_len);
+                                               free (snippet_text);
+                                       }
+                                       
+                                       printf("Ocurrences = %d\n", numocc);
+                                       if (!strcmp((char*)textPattern,"\n") ) break;
+                               }
+                       }
+       
+
+
+                       // SEARCHING FOR A TEXT PATTERN (word/phrase).
+                       {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
+                        int occ;
+                        int len;
+                        uint *occs;
+                        int i;
+                        printf("\nSEARCH TEST for LOCATE\n");
+                               while(1) {      
+                                       printf("Intro string: ");
+                                       fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
+                                       len = strlen((char*)textPattern);
+                                       if (!strcmp((char*)textPattern,"\n") ) break;
+                                       textPattern[len-1] = '\0';
+                                       len --;
+                                       
+                                       //occs = locateTextOcurrences(wcsa,textPattern,&occ);
+                                       // locate(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ);
+                                         locateWord(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ, 0);
+                                       
+                                       printf("\n*** %s occurs %d times: In the source text in positions:\n\t",textPattern,occ);
+               /*                      for (i=0;i<occ;i++) 
+                                               printf("[%u]",occs[i]);
+                                       fflush(stderr);  */
+                                       if (occ >0) free(occs);         
+                                       
+                                       if (!strcmp((char*)textPattern,"\n") ) break;
+                               }
+                       }       
+       
+
+
+
+               /** freeing the index */                                        
+               free_index(Index);                              
+                        
+       }
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+//{    
+//     bG = createBitmap (B,len);
+//     bE = createBitmapEdu (B,len);
+//     //fprintf(stderr,"\n RANK DE GONZALO DA %u, de EDU DA %u\n",rank(bG,33),rank1Edu(bE,33));
+//     //fprintf(stderr,"\n SELECT1(2) DE GONZALO DA %u, de EDU DA %u\n",bselect(bG,4),bselect(bE,4));
+//     
+//     showBitVector(bitvector,34);    
+//}
+
+
+
+/*     
+
+       //USING A HASH TABLE
+       //      {
+       //      char a[20]="beginnings";char b[20]="HOSTIAS";
+       //      char *w;        
+       //      int i;
+       //      w=a;
+       //      i = inHashTable(stopwordshash,w, strlen(w), &addrInTH );
+       //      if (!i) insertElement (stopwordshash, w, strlen(w), &addrInTH);
+       //      else stopwordshash->hash[addrInTH].freq++;
+       //      //fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
+       //      //fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, stopwordshash->hash[addrInTH].word, stopwordshash->hash[addrInTH].freq, stopwordshash->hash[addrInTH].posInVoc);      
+       //  }
+
+
+
+       /// ENCODING THE separators  ...
+{      
+       freeHuff(gapsHuffman);
+       uint i;
+       uint *bitvector;
+       uint bitvectorSize;
+       uint ptr;
+       bitmap bG,bE;
+       uint len;
+       len = 1000; //number of bits
+       bitvector = (uint *) malloc ((len/32 +1)* sizeof(uint));
+       
+       byte texto[100] = "####@?*";
+       uint freqs[256];
+       
+       //fprintf(stderr,"\n este es el texto a codificar: %s",texto);
+       for (i=0;i<256;i++) freqs[i]=0;
+       for (i=0;i<strlen(texto);i++) freqs[texto[i]]++;
+       gapsHuffman = createHuff (freqs,255,UNSORTED);
+               
+       ptr=0;
+       for (i=0;i<strlen(texto);i++) {
+               //fprintf(stderr,"\n ENCODING seprators !!\n");
+               //fprintf(stderr,"%d. \n",ptr=encodeHuff(gapsHuffman, texto[i],bitvector,ptr) );                
+       }       
+
+       prepareToDecode(&(gapsHuffman));
+       bitvectorSize = ptr;
+       showBitVector(bitvector,bitvectorSize); 
+       uint pos;
+       //fprintf(stderr,"\n DECODING !!\n");
+       ptr=0;
+       while (ptr < bitvectorSize) {
+               ptr=decodeHuff (gapsHuffman, &pos, bitvector, ptr);
+               //fprintf(stderr,"\n DECODING pos is %ld!!\n",pos);
+               //fprintf(stderr,"%c. \n",pos);
+       }
+       exit(0);        
+}                                                                                                                      
+       
+/// ENCODING THE CANONICAL WORDS ...
+{      uint i;
+       uint *bitvector;
+       uint bitvectorSize;
+       uint ptr;
+       bitmap bG,bE;
+       uint len;
+       len = 1000; //number of bits
+       bitvector = (uint *) malloc ((len/32 +1)* sizeof(uint));
+       
+       
+       ptr=0;
+       //fprintf(stderr,"\n ENCODING VARIANTS !!\n");
+       //fprintf(stderr,"%d. \n",ptr=encodeHuff(posInHT[0].huffman, 0,bitvector,ptr) );
+       //fprintf(stderr,"%d. \n",ptr=encodeHuff(posInHT[0].huffman, 1,bitvector,ptr) );
+       //fprintf(stderr,"%d. \n",ptr=encodeHuff(posInHT[0].huffman, 2,bitvector,ptr) );
+       //fprintf(stderr,"%d. \n",ptr=encodeHuff(posInHT[0].huffman, 3,bitvector,ptr) );
+       //fprintf(stderr,"%d. \n",ptr=encodeHuff(posInHT[0].huffman, 4,bitvector,ptr) );
+       //fprintf(stderr,"%d. \n",ptr=encodeHuff(posInHT[0].huffman, 4,bitvector,ptr) );
+                       
+       //      FILE *f;
+       //      f = fopen("huff","w");
+       //      saveHuff(posInHT[0].huffman,f);
+       //      fclose(f);
+       //      
+       //      f = fopen("huff","r");
+       //      posInHT[0].huffman = loadHuff (f,0);
+       //      fclose(f);
+       //      
+       
+       prepareToDecode(&(posInHT[0].huffman));
+
+       FILE *f;
+       f = fopen("huff","w");
+       saveHuffAfterDecode(posInHT[0].huffman,f);
+       fclose(f);
+       
+       f = fopen("huff","r");
+       //posInHT[0].huffman = loadHuffAfterDecode(f,0);
+       loadHuffAfterDecode2 (&(posInHT[0].huffman),f,0);
+       fclose(f);
+       
+
+
+       bitvectorSize = ptr;
+       showBitVector(bitvector,bitvectorSize); 
+       uint pos;
+       //fprintf(stderr,"\n DECODING !!\n");
+       ptr=0;
+       while (ptr < bitvectorSize) {
+               ptr=decodeHuff (posInHT[0].huffman, &pos, bitvector, ptr);
+               //fprintf(stderr,"\n DECODING pos is %ld!!\n",pos);
+               //fprintf(stderr,"%s. \n",posInHT[0].variants[pos]);
+       }
+       exit(0);
+}      
+       
+       
+                       
+{      uint i;
+       uint *bitvector;
+       bitmap bG,bE;
+       uint len;
+       len = 101; //number of bits
+       bitvector = (uint *) malloc ((len/32 +1)* sizeof(uint));
+       //bitvector[0]=0;
+       //bitvector[1]=0;
+       bitzero (bitvector,0,101-1);
+       for (i=0; i<len;i++) setBit (bitvector,len,i,0);
+        
+
+
+       bitset(bitvector,1); 
+       bitset(bitvector,10);
+       bitset(bitvector,12);
+       //activateBit(bitvector,1); 
+       //activateBit(bitvector,10); 
+       
+       bG = createBitmap (bitvector,len);
+       bE = createBitmapEdu (bitvector,len);
+       //fprintf(stderr,"\n RANK DE GONZALO DA %u, de EDU DA %u\n",rank(bG,33),rank1Edu(bE,33));
+       //fprintf(stderr,"\n SELECT1(2) DE GONZALO DA %u, de EDU DA %u\n",bselect(bG,4),bselect(bE,4));
+       
+       showBitVector(bitvector,34);
+}      
+*/             
+       
diff --git a/swcsa/buildFacade.c b/swcsa/buildFacade.c

new file mode 100755 (executable)

index 0000000..768e943
--- /dev/null
+++ b/swcsa/buildFacade.c
@@ -0,0 +1,1638 @@
+#include "buildFacade.h"
+#include "utils/errors.c"
+
+
+/** Building the index */
+
+    /* Creates index from text[0..length-1]. Note that the index is an 
+      opaque data type. Any build option must be passed in string 
+      build_options, whose syntax depends on the index. The index must 
+      always work with some default parameters if build_options is NULL. 
+      The returned index is ready to be queried. */
+int build_index (uchar *text, ulong length, char *build_options, void **index) {
+       int returnvalue;
+
+     printf("\n parameters: \"%s\"\n",build_options); fflush(stderr);
+        
+    returnvalue = build_WCSA (text, length, build_options, index);
+ 
+    if (!returnvalue) 
+       returnvalue = build_iCSA (build_options,*index);
+                  
+    return returnvalue;  
+}
+
+
+/**  Saves index on disk by using single or multiple files, having 
+       proper extensions. */
+int save_index (void *index, char *filename) {
+
+       char *basename = filename;
+       twcsa *wcsa=(twcsa *) index;
+
+       uint i,j;
+       char *outfilename;
+       int file;
+       char c;
+
+       printf("\n Saving structures to disk: %s.*",filename);                  
+       outfilename = (char *)malloc(sizeof(char)*(strlen(basename)+10));
+
+       /**File with some constants (bSize and tohSize); */
+       { uint number;
+               strcpy(outfilename, basename);
+               strcat(outfilename, ".");
+               strcat(outfilename, CONSTANTS_FILE_EXT);
+               unlink(outfilename);
+               if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
+                       printf("Cannot open file %s\n", outfilename);
+                       exit(0);
+               }       
+               write(file, &(wcsa->sourceTextSize), sizeof(uint));                     
+               write(file, &(wcsa->seSize), sizeof(uint));                     
+               close(file);            
+       }       
+
+       /** The Words in the vocabulary of words  (sorted alphabetically)*/
+       {       strcpy(outfilename, basename);
+               strcat(outfilename, ".");
+               strcat(outfilename, VOCABULARY_WORDS_FILE_EXT);
+               unlink(outfilename);
+               if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
+                       printf("Cannot open file %s\n", outfilename);
+                       exit(0);
+               }
+
+               uint n = wcsa->n;
+               uint elemSize = wcsa->wordsData.elemSize;
+               write(file, &n, sizeof(uint));
+               write(file, &elemSize, sizeof(uint));
+               write(file, &(wcsa->wordsData.wordsZoneMem.size), sizeof(uint));
+               
+               //the number of canonical words
+               write(file, (char *)wcsa->wordsData.wordsZoneMem.zone, wcsa->wordsData.wordsZoneMem.size * sizeof(byte));
+               write(file, (char *)wcsa->wordsData.words, ((((n+1)* (elemSize))+W-1) /W) * (sizeof(uint)) );
+
+               close(file);
+       }       
+                       
+       free(outfilename);
+
+       if (wcsa->myicsa) {
+               /******** saves index on integers (bottom) ******/
+               //Storing the CSA
+               //storeStructsCSA(wcsa->myicsa,basename);
+               saveIntIndex((void *) wcsa->myicsa, basename);          
+       }       
+       
+       if (wcsa->se) {
+               saveSEfile(basename,wcsa->se, wcsa->seSize+1);
+               //free(wcsa->se);               
+       }
+
+       return 0;
+}
+
+
+
+    /**  Loads index from one or more file(s) named filename, possibly 
+      adding the proper extensions. */
+int load_index(char *filename, void **index){
+       twcsa *wcsa;
+       wcsa = loadWCSA (filename);
+       (*index) = (void *) wcsa;
+       return 0;
+}
+
+       /** Frees the memory occupied by index. */
+int free_index(void *index){
+       twcsa *wcsa=(twcsa *) index;
+       ulong size;
+       index_size(index,&size);
+       printf("\n[destroying index] ...Freed %lu bytes... RAM", size); 
+       
+       
+       //frees the array SE.
+       if (wcsa->se)
+               free (wcsa->se);
+
+       //the iCSA.
+       if (wcsa->myicsa) {
+               //destroyStructsCSA(wcsa->myicsa);
+               int err = freeIntIndex((void *) wcsa->myicsa);
+       }
+
+       //the words.
+       free (wcsa->wordsData.wordsZoneMem.zone);
+       free (wcsa->wordsData.words); /** huge!! */
+       
+       //the pointer to wcsa.          
+       free(wcsa);
+       return 0;
+}
+
+       /** Gives the memory occupied by index in bytes. */
+int index_size(void *index, ulong *size) {
+       ulong totaltmp;
+       twcsa *wcsa=(twcsa *)index;
+       uint n= wcsa->n;
+       *size=0;
+       *size += sizeof(twcsa); 
+
+       totaltmp=0;  //words
+       totaltmp += ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));  //the pointers
+       totaltmp += wcsa->wordsData.wordsZoneMem.size * sizeof(byte); //the characters of the words.
+       *size += totaltmp;
+       
+       if (wcsa->myicsa) {
+               uint nbytes;
+               int err = sizeIntIndex((void *) wcsa->myicsa, &nbytes);
+               *size += nbytes;
+               //*size += CSA_size(wcsa->myicsa);              
+       }
+
+       return 0;       
+}
+
+
+/** Querying the index =============================================================*/
+       
+       /* Writes in numocc the number of occurrences of the substring 
+          pattern[0..length-1] found in the text indexed by index. */
+int count (void *index, uchar *pattern, ulong length, ulong *numocc){
+       uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
+       uint integerPatternSize;
+       ulong l,r;      
+
+       twcsa *wcsa=(twcsa *) index;
+       parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);     
+       if (!integerPatternSize) {*numocc=0; return 0;} //not found
+       
+       //*numocc = countCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &l, &r);
+       int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, numocc,  &l, &r);
+       return 0;
+}
+
+       /* Writes in numocc the number of occurrences of the substring 
+         pattern[0..length-1] in the text indexed by index. It also allocates
+         occ (which must be freed by the caller) and writes the locations of 
+         the numocc occurrences in occ, in arbitrary order.  */          
+int locate(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc){
+       return 99;
+}
+
+   /* Gives the length of the text indexed */
+int get_length(void *index, ulong *length) {
+       twcsa *wcsa=(twcsa *) index;
+       *length = wcsa->sourceTextSize;
+       return 0;
+}
+
+    /**  Obtains the length of the text indexed by index. */
+
+int length (void *index, ulong *length) {
+       return (get_length(index,length));
+}
+
+
+/** ***********************************************************************************
+  * Accessing the indexed text  
+  * ***********************************************************************************/
+
+
+       /**  Allocates snippet (which must be freed by the caller) and writes 
+         the substring text[from..to] into it. Returns in snippet_length the 
+         length of the text snippet actually extracted (that could be less 
+         than to-from+1 if to is larger than the text size).                      */
+int extract (void *index, ulong from, ulong to, uchar **snippet, ulong *snippet_length) {              
+    twcsa *wcsa=(twcsa *) index;
+       return 99;    
+}
+
+  /** Displays the text (snippet) surrounding any occurrence of the 
+    substring pattern[0..length-1] within the text indexed by index. 
+    The snippet must include numc characters before and after the 
+    pattern occurrence, totalizing length+2*numc characters, or less if 
+    the text boundaries are reached. Writes in numocc the number of 
+    occurrences, and allocates the arrays snippet_text and 
+    snippet_lengths (which must be freed by the caller). The first is a 
+    character array of numocc*(length+2*numc) characters, with a new 
+    snippet starting at every multiple of length+2*numc. The second 
+    gives the real length of each of the numocc snippets. */
+
+int display (void *index, uchar *pattern, ulong length, ulong numc, 
+        ulong *numocc, uchar **snippet_text, ulong **snippet_lengths) {
+       return 99;
+}
+
+
+
+/** ***********************************************************************************
+  * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord
+  * ***********************************************************************************/  
+       /* Writes in numocc the number of occurrences of the substring 
+         pattern[0..length-1] in the text indexed by index. It also allocates
+         occ (which must be freed by the caller) and writes the locations of 
+         the numocc occurrences in occ, in arbitrary order. These occurrences
+         refer to the offsets in TOH where the caller could start a display
+         operation. So locateWord implies synchronization using B.
+         Moreover, positions occ[numocc.. 2*numocc-1] is set with the rank in SE of the
+         words whose codes begin in TOH in the positions in occ[0... numocc-1]
+         ** Parameter kbefore sets locateWord not to obtain the offset in TOH of the
+            searched word, but the offset in TOH of k-before words before.       
+       */        
+         
+int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore){
+       uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
+       uint integerPatternSize;
+       ulong occurrences,l,r;  
+       twcsa *wcsa=(twcsa *) index;
+       
+       parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize);     
+       if (!integerPatternSize) {*numocc=0; return 0;} //not found 
+
+       ulong *seOffsets;
+  
+       //obtains the indexes in vector SE where the pattern appears.
+       //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences);
+       int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences);
+               
+       *numocc = occurrences;
+       
+       if (!occurrences) {(*occ)=NULL;return 0;}
+
+       (*occ) = (ulong *)seOffsets;
+       return 0;
+}  
+
+
+  /** Displays the text (snippet) surrounding any occurrence of the 
+    substring pattern[0..length-1] within the text indexed by index. 
+    The snippet must include numc characters before and after the 
+    pattern occurrence, totalizing length+2*numc characters, or less if 
+    the text boundaries are reached. Writes in numocc the number of 
+    occurrences, and allocates the arrays snippet_text and 
+    snippet_lengths (which must be freed by the caller). The first is a 
+    character array of numocc*(length+2*numc) characters, with a new 
+    snippet starting at every multiple of length+2*numc. The second 
+    gives the real length of each of the numocc snippets. */
+
+ int displayWords (void *index, uchar *pattern, ulong length, ulong numc, 
+         ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore) {
+               
+    /** actually extracts upto length + 2*numc chars, starting extraction kbefore
+     *  words before the occurrence **/        
+
+       ulong *indexesInSE;
+       ulong occurrences;
+       uint bytesPerSnippet;
+       byte *text_aux;
+       twcsa *wcsa=(twcsa *) index;
+       
+       locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);    
+       (*numocc) = occurrences;   
+
+       if (!occurrences) {
+               *snippet_text =NULL;
+               *snippet_lengths =NULL;
+               return 0;
+       }       
+
+       bytesPerSnippet = length+2*numc;
+//     bytesPerSnippet = 2*numc;
+       *snippet_lengths = (ulong *) malloc((*numocc)*sizeof(ulong));
+       if (!(*snippet_lengths)) return 1;
+       *snippet_text = (uchar *) malloc((*numocc)*(bytesPerSnippet)*sizeof(uchar) +1) ;  //(the last "1" is for '\0');
+       if (!(*snippet_text)) return 1;
+
+ //    fprintf(stderr,"\n occs found = %7d for pattern %s",*numocc, pattern);
+ //    fflush(stderr);
+       
+       text_aux=*snippet_text;         
+       {
+               uint i, j, tmplen;
+               uint ptr, maxptr;
+               byte *src, *dst;
+               uint snippetLen;
+               uint posSEValue,indexSE;
+                       
+               for (i=0;i<occurrences;i++) {
+                               uint prevValid=0;
+                               uint endSnippet =0;
+       
+                               /** decodes words from there */                 
+                               snippetLen=0;                                                   
+                               indexSE = indexesInSE[i];
+                               indexSE = (indexSE > kbefore) ? indexSE-kbefore : 0;                     
+
+                               dst = text_aux;                                 
+                               while ((!endSnippet) && (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
+
+                                       //posSEValue =displayCSA(wcsa->myicsa,indexSE); 
+                                       int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
+                                       
+                                       {//obtains pointer to the ith word
+                                               uint offtmp;
+                                               uint ith = posSEValue -1;  // !! 
+                                               tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
+                                               offtmp = bitread (wcsa->wordsData.words, ( ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
+                                               tmplen -=offtmp;  //the lenght of the ith word.
+
+                                               src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
+                                       }                                       
+
+                                       if (_Valid[*src]) {
+                                               if (prevValid){
+                                                        *dst++ =' ';   
+                                                        snippetLen++;
+                                                        if  (snippetLen==bytesPerSnippet) break;  //end of snippet (ends in BLANK_SPACE)
+                                               }
+                                               prevValid =1;   //for the next iteration
+                                       }
+                                       else prevValid=0;               
+                                                                                                                                                                                                       
+                                       indexSE++;
+                                                                       
+                                       /* at the end ?? */
+                                       if  ((tmplen+snippetLen)>=bytesPerSnippet) {
+                                               tmplen =(bytesPerSnippet - snippetLen);
+                                               endSnippet=1; //so while loop ends;
+                                       }       
+                                       
+                                       for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
+                                       snippetLen +=tmplen;                            
+                               }//while
+       
+                               text_aux += bytesPerSnippet;
+                               (*snippet_lengths)[i] = snippetLen;                                             
+                       }       //for
+                                               
+                       if (occurrences) free(indexesInSE);
+               }
+               return 0;               
+}
+
+/** simulates extration of text process, but do not actually returns anything at all 
+   Extracts upto <=2K words from K=wordsbefore words before each occurrence of a pattern.
+   Less than 2K words can be extracted if more than numc characters have been already obtained.
+   Does nothing else... does not return the text */
+
+int  displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc) {
+       
+       ulong *indexesInSE;
+       ulong occurrences;
+       byte *text_aux;
+ 
+       twcsa *wcsa=(twcsa *) index;
+       
+       locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0);    
+ 
+       if (!occurrences) {
+               return 0;
+       }       
+       
+       ulong maxsnippetLen  = maxnumc;
+       ulong extractedbytes = 0;
+
+       text_aux = (byte *) malloc (maxsnippetLen+1);
+
+       {
+               uint i, j, tmplen;
+               uint ptr, maxptr;
+               byte *src, *dst;
+               uint snippetLen;
+               uint posSEValue,indexSE;
+               
+               uint numWordsToExtract = 2 * wordsbefore;
+               uint z;
+               //printf("\n occurrences... = %lu",occurrences);
+                       
+               for (i=0;i<occurrences;i++) {
+                               uint prevValid=0;
+                               uint endSnippet =0;
+
+                               /** decodes words from there */                 
+                               snippetLen=0;                                                   
+                               indexSE = indexesInSE[i];
+                               indexSE = (indexSE > wordsbefore) ? indexSE-wordsbefore : 0;                     
+
+                               dst = text_aux; 
+                               z=0;                            
+                               while ((z<numWordsToExtract) && (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
+
+                                       //posSEValue =displayCSA(wcsa->myicsa,indexSE); 
+                                       int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue); 
+                               
+                                       {//obtains pointer to the ith word
+                                               uint offtmp;
+                                               uint ith = posSEValue -1;  // !! 
+                                               tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
+                                               offtmp = bitread (wcsa->wordsData.words, ( ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
+                                               tmplen -=offtmp;  //the lenght of the ith word.
+
+                                               src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
+                                       }                                       
+       
+                                       if (_Valid[*src]) {
+                                                       if (prevValid){
+                                                                *dst++ =' ';   
+                                                                snippetLen++;
+                                                                if  (snippetLen==maxsnippetLen) break;  //end of snippet (ends in BLANK_SPACE)
+                                                               }
+                                                       prevValid =1;   //for the next iteration
+                                       }
+                                       else prevValid=0;               
+                                                                                                                                                                                                       
+                                       indexSE++;
+                                                                       
+                                       /* at the end ?? */
+                                       if  ((tmplen+snippetLen)>=maxsnippetLen) {
+                                                       break;
+                                       }       
+                               
+                                       //fprintf(stderr,"\ntmplen = %d ",tmplen); fflush(stderr);                                                      
+                                       for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
+                                       snippetLen +=tmplen;    
+                                       z++;                    
+                               }//while
+
+                               extractedbytes += snippetLen;
+                                       
+                       }       //for
+                                       
+                       if (occurrences) free(indexesInSE);
+               }
+               if (text_aux) free (text_aux);
+               return extractedbytes; 
+}
+
+
+
+/**  Allocates text (which must be freed by the caller) and recovers the
+  the substring of text starting from the "fromword"-th word up to the
+  "toWord"-th words. Returns in text the text, and in "text_lenght" the 
+  length of the text  actually extracted. Text is allocated. 
+  Actually extracts SE[fromWord .. toWord) ... not the last word.    */
+
+int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text, 
+        ulong *text_length){
+               
+       twcsa *wcsa=(twcsa *) index;
+    uint initTextLen=10000;
+    uint avgWordLen =7;
+
+       uint i, j;//, tmplen;
+       uint prevValid;
+       byte *src, *dst, *buff;
+       uint tmplen =0;
+
+       uint buffBytes = 1000;
+       uint leng=0; //curr pos in buffer that was occupied.
+       
+       if (toWord > wcsa->seSize) toWord = wcsa->seSize;
+       if (fromWord >= wcsa->seSize) fromWord = wcsa->seSize-1;
+       if (buffBytes < ( (toWord-fromWord)* avgWordLen)) buffBytes = ((toWord-fromWord)* avgWordLen);
+               
+       buff = (uchar *) malloc (buffBytes * sizeof(char));
+       if (!buff) return 1; //out of memory.
+       dst = buff;
+       
+       register uint indexSE=fromWord;
+       uint posSEValue=0;
+       register uint ith;
+               
+       while  ( (indexSE < toWord) ){ /** extracting words (if not at the end) */
+
+               int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);  
+               
+               {//obtains pointer to the ith word
+                       uint offtmp;
+                       ith= posSEValue -1;  // !! 
+                       tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
+                       offtmp = bitread (wcsa->wordsData.words, (ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
+                       tmplen -=offtmp;  //the lenght of the ith word.
+                       src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
+               }                                       
+
+               if ( buffBytes < (leng + tmplen+1) ) {
+                       buffBytes *=2;
+                       buff = (uchar*) realloc(buff, buffBytes);
+                       if (!buff) return 1; //out of memory.
+                       dst = buff + leng;
+               }
+
+               if (_Valid[*src]) {
+                       if (prevValid){
+                               *dst++ =' ';                                    
+                               leng  += 1;
+                       }
+                       prevValid =1;   //for the next iteration
+               }
+               else prevValid=0;               
+                                                                                                                                                                               
+               indexSE++;
+                                                                                       
+               for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
+               leng +=tmplen;
+       }//while
+
+       *text_length =leng;
+       *dst='\0';
+       *text = buff;
+       return 0;
+}
+
+
+
+/** ***********************************************************************************
+        CONSTRUCTION OF THE INDEX WCSA
+    ***********************************************************************************/
+
+       /**------------------------------------------------------------------
+       Compares two slots (alphanumericaly). For qsort of canonical words 
+       ------------------------------------------------------------------ */                      
+       int qSortWordsCompareAlpha(const void *arg1, const void *arg2) {
+               tposInHT *a1 = (tposInHT *) arg1;
+               tposInHT *a2 = (tposInHT *) arg2;
+               return strcmp((char*)a1->word, (char *)a2->word);       
+       }
+
+/**
+  * BUILDS THE WCSA INDEX
+  */
+
+int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
+       
+       unsigned long zeroNode;  //number of different canonical words.
+       
+       t_hash hash;            // the hash table to store both variants and canonical words.                                   
+       tposInHT *posInHT;      // structure for canonicals and variants+huffmans 
+
+       uint sourceTextSize;  
+ 
+       uint seSize=0;  //it's size == "numberOfValidWords". 
+       uint *SE;       //Integers vector. (represents the rank of the valid words in the source text).
+
+       uint totallenWords=0; //The numberOfBytes that occupy canonical words (their ascii version) in memory
+
+
+       ulong bytesFile,bytesFileReal;
+       long sizeNValue;
+
+       /* used during first pass */    
+                       
+       ulong addrInTH;
+       
+       byte* inputBuffer = text;       
+       bytesFileReal= bytesFile = length;
+
+       sourceTextSize=length;
+       
+       /** Initializes WCSA structure*/
+       twcsa *wcsa;
+       wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
+       zeroNode=0;
+       /**      */
+
+       //Stimation (Using Heap's law) of the number of different "meaningful" words.
+       //sizeNValue=N_value;
+       if(bytesFile<5000000) bytesFile = 5000000;
+       sizeNValue = (unsigned long) floor(3.9* pow(bytesFile,0.60) );
+       
+
+       // Inicializes the arrays used to detect if a char is valid or not.
+       StartValid();
+       // Inicializes the arrays used translated a char into lowercase.
+       StartToLow();
+               
+       
+       // **********************************************************************************
+       //STARTING THE FIRST PASS. 
+       // **********************************************************************************
+       printf("\nSTARTING THE FIRST PASS...");
+       
+       posInHT = (tposInHT *) malloc(sizeof(tposInHT) * sizeNValue); 
+       hash = initialize_hash (sizeNValue); //hash to cointain both the parsed words
+       
+       //-----------------------------------------------------------------     
+       //1st pass (processing the file)
+       { 
+               byte *pbeg,*pend,*wordstart,*aWord;
+               register ulong size;
+               register uint i;
+
+               pbeg = inputBuffer;
+               pend = inputBuffer+bytesFileReal;
+                               
+               while (pbeg <pend) {  
+                       
+                       //parsing either a word or separator.                   
+                       size=0;
+                       wordstart = pbeg;
+                       if (_Valid[*pbeg]) {   //alphanumerical data
+                               while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+                   }               
+                       else {
+                               if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word                                      
+                                       while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
+                               }
+                               else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
+                                       pbeg++;
+                                       if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
+                                       else {
+                                               if (_Valid [*pbeg] ) {
+                                                       wordstart = pbeg;   //So skipping 1 blank character
+                                                       while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+                                               }
+                                               else {   // a "separator word" ...
+                                                       size++; //the prev BLANK...
+                                                       while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
+                                               }//else {  // a "separator word"
+                                       }//else ... not a unique BLANK AT THE END.
+                               }//else ... starting by a BLANK... 
+                       }
+
+                       //The parsed word/separator is  is "wordstart", and its length is "size"...
+                       aWord=wordstart;                                
+
+                       //Processement done for each word word                  
+                       i = inHashTable(hash,aWord, size, &addrInTH );                          
+                       if (!i){
+                               insertElement (hash,aWord, size, &addrInTH);
+                               posInHT[zeroNode].slot=addrInTH;
+                               posInHT[zeroNode].word=hash->hash[addrInTH].word;
+                               hash->hash[addrInTH].posInVoc = zeroNode;
+                               zeroNode++;
+                               totallenWords += size +1;                       // +1 due to the '\0' char...           
+                               //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size);                           
+                       }                                                       
+                       seSize ++;
+               }//while pbeg<pend
+
+               fprintf(stderr,"\n1st pass ends: TOTAL distinct words: %lu totalWords = %u",zeroNode,seSize);
+
+       }//1st pass ends
+
+
+       // **********************************************************************************
+       // END OF 1ST PASS
+       // **********************************************************************************
+
+       // Sorting the words alphanumerically (over posInHT)
+       {       register unsigned long i,j;     
+               //sorting canonical words ...
+               qsort(posInHT, zeroNode, sizeof(tposInHT), qSortWordsCompareAlpha);             
+               
+               //setting in hash the new positions of the  words in the hash table
+               for (i=0;i<zeroNode;i++) {      
+                       hash->hash[posInHT[i].slot].posInVoc = i;       
+               }
+       }       
+       
+       // INITIALIZING structures for the 2nd pass ......................................
+       {
+               SE  = (uint *) malloc ((seSize+1)*sizeof (uint));
+       }
+
+
+       // **********************************************************************************
+       //  STARTING THE SECOND PASS.
+       // **********************************************************************************/
+
+       printf("\nSTARTING THE SECOND PASS... ");
+       //2nd pass (processing the file)
+       { 
+               byte *pbeg,*pend,*wordstart,*aWord;
+               register ulong size;
+               register uint i;
+               register ulong countValidWords = 0;
+
+
+               pbeg = inputBuffer;
+               pend = inputBuffer+bytesFileReal;
+                               
+               while (pbeg <pend) {  
+                       
+                       //parsing either a word or separator.                   
+                       size=0;
+                       wordstart = pbeg;
+                       if (_Valid[*pbeg]) {   //alphanumerical data
+                               while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+                 }                 
+                       else {
+                               if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word                                      
+                                       while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
+                               }
+                               else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
+                                       pbeg++;
+                                       if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
+                                       else {
+                                               if (_Valid [*pbeg] ) {
+                                                       wordstart = pbeg;   //So skipping 1 blank character
+                                                       while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+                                               }
+                                               else {   // a "separator word" ...
+                                                       size++; //the prev BLANK...
+                                                       while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
+                                               }//else {  // a "separator word"
+                                       }//else ... not a unique BLANK AT THE END.
+                               }//else ... starting by a BLANK... 
+                       }
+
+                       //The parsed word/separator is  is "wordstart", and its length is "size"...
+                       aWord=wordstart;                                        
+
+                       //Processement done for each word word                  
+                       i = inHashTable(hash,aWord, size, &addrInTH );                          
+
+                       SE[countValidWords]=hash->hash[addrInTH].posInVoc+1;  // !!!!
+                       countValidWords++;              
+
+               }// while pbeg<pend
+               
+               SE[countValidWords] = 0;
+               fprintf(stderr,"\n2nd pass ends: TOTAL distinct words: %lu totalWords = %lu",zeroNode,countValidWords);
+                       
+       }//2nd pass ends
+       
+       // **********************************************************************************
+       // END OF 2ND PASS
+       // **********************************************************************************
+       
+       //freeing the source text (it is no longer needed).
+       free(inputBuffer); //the text   
+
+       /** Now Setting the data of the index **/
+       wcsa->n = zeroNode;
+       wcsa->sourceTextSize = sourceTextSize;
+       wcsa->seSize = seSize;
+       
+       // Creating the words of the vocabulary...
+       {  
+       /** copying the words into WCSA. */
+               uint *tmpOffsets = (uint *) malloc (sizeof(uint) * (zeroNode  +1) );  //1 extra uint (to point to the virtual "zeroNode+1" ^th word.
+               uint tmpOffset =0;
+                
+               byte *zoneMem,*src;
+               uint i;
+                                                                
+               //Moving data from posInHT to WCSA structure
+               //wcsa->wordsData = (twords *) malloc(sizeof(twords) * zeroNode); 
+               wcsa->wordsData.wordsZoneMem.size = totallenWords - zeroNode; //without '\0' bytes (end-tag).
+               wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc ( wcsa->wordsData.wordsZoneMem.size * sizeof(byte) );  
+               zoneMem = wcsa->wordsData.wordsZoneMem.zone;                                    
+               for(i = 0; i < zeroNode; i++) {
+                       src = posInHT[i].word;                           //copying the canonical word
+                       //wcsa->wordsData.words[i].word = zoneMem;   //setting the pointer
+                       tmpOffsets[i]=tmpOffset;  //offset in zoneMem
+                       while (*src) {*zoneMem++ = *src++; tmpOffset++;}  //moving data until '\0'
+                       //*zoneMem='\0'; zoneMem++;            //copies also the '\0'
+                       
+               }
+               tmpOffsets[zeroNode]=tmpOffset; //setting pointer to the "virtual" word {zeroNode+1}^{th}
+               
+               //kbit encoding of the offsets
+               uint elemSize = bits(tmpOffset);
+               wcsa->wordsData.elemSize = elemSize;
+               wcsa->wordsData.words = (uint *) malloc (((((zeroNode +1)*elemSize)+W-1) /W) * sizeof(uint));  //with 1 extra slot !.
+               wcsa->wordsData.words[((((zeroNode +1)*elemSize)+W-1) /W)   -1 ] =0000;
+               //              fprintf(stderr,"\n ElemSize = %d, maxOffset = %d",elemSize,tmpOffset);
+
+               tmpOffset=0;
+               for (i=0; i<=zeroNode; i++) {  //setting "zeroNode+1" offsets
+                               bitwrite(wcsa->wordsData.words, tmpOffset, elemSize, tmpOffsets[i]);
+                               tmpOffset+=elemSize; 
+               }
+                               
+               //////////// CHECKS IT WORKED. old !!!!
+       //              { uint kk;
+       //                      tmpOffset=0;
+       //                      for (i=0; i<zeroNode; i++) {  //setting "zeroNode+1" offsets
+       //                                      kk=bitread(wcsa->wordsData.words, i* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
+       //                                      tmpOffset+=elemSize; 
+       //                                      if (kk != tmpOffsets[i]) {fprintf(stderr,"\n @@@@@@@@ DISTINTOS OFFSETS "); break;}
+       //                                      else fprintf(stderr,"\n iguales, %d, %d  :: <<%s>>len=%d",kk,i, posInHT[i].word, strlen((char*)posInHT[i].word));
+       //                      }
+       //              }
+       //
+       //              { uint len1, len, tmplen, len2;
+       //                      uint i,p;
+       //                      byte *wcsaWord, *src;
+       //                      
+       //                      for (p=0;p<zeroNode;p++) {
+       //                              {//preparing for strcompL
+       //                                      len    = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p+1)), wcsa->wordsData.elemSize);
+       //                                      tmplen = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p))  , wcsa->wordsData.elemSize);
+       //                      
+       //                              //fprintf(stderr,"\n  :: off[%d]= %d  -  off [%d] = %d  ==> %d",p+1,len,p,tmplen,len-tmplen);
+       //                      
+       //                              len2 =len-tmplen;
+       //                                      wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
+       //                              }               
+       //                              
+       //                              src = posInHT[p].word;  
+       //                              len1 = strlen((char *)src);
+       //                              
+       //                              if (strcompL(src,wcsaWord,len1,len2) != 0) {
+       //                                      fprintf(stderr,"\n %6d DISTINTOS !! ===len1 %d,len %d===== <<",p,len1,len2);printWord(src,len1); 
+       //                                      fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>");
+       //                                      exit(0);
+       //                              }
+       //                              else {
+       //                                      fprintf(stderr,"\n %6d ======len1 %d,len2 %d===== <<",p,len1,len2);printWord(src,len1); 
+       //                                      fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>");
+       //                              }                                       
+       //                      }
+       //                              
+       //              }
+       //      
+       
+       /**-----------*/
+       //frees memory from hash table and posInHT structures.
+               free(tmpOffsets);
+               destroy_hash(hash);
+               free(posInHT);
+       }
+       
+       /** ******* creates the self-index on ints (bottom layer) ==> see build_icsa *********/
+/**
+       #ifdef CSA_ON
+       {
+               uint total;
+               fprintf(stderr,"\n **** CREATING CSA from Edu's Code *****");
+               ticsa *myicsa;
+               myicsa = createIntegerCSA(&SE,seSize+1,build_options);
+               wcsa->myicsa= myicsa;
+               total = CSA_size(myicsa);
+               
+               free(SE);  //SE is no longer needed, (it is indexed by the iCSA)
+               printf("\n\t**** [iCSA built on %d words. Size = %ld bytes... RAM",seSize,total);
+       }       
+       #endif
+*/
+
+       //#ifndef CSA_ON
+               wcsa->se = SE;
+               wcsa->myicsa = NULL;
+       //#endif
+       
+       printf("\n\t ** Building done! **\n");  
+       printf("\n Process finished!\n");
+
+       *index = wcsa;
+       return 0;
+}
+
+
+int build_iCSA (char *build_options, void *index)
+{
+       twcsa *wcsa = (twcsa *) index;
+       /********* creates the self-index on ints (bottom layer) *********/
+       //creating CSA from Edu's code...
+       {
+               uint total;
+               fprintf(stderr,"\n **** CREATING CSA-bottom-layer *****");
+               void *bottomIntIndex;
+               int err =  buildIntIndex(wcsa->se,wcsa->seSize+1, build_options,(void **)&bottomIntIndex);                              
+               wcsa->myicsa = bottomIntIndex;
+
+               //total = CSA_size(wcsa->myicsa);
+               err = sizeIntIndex((void *) wcsa->myicsa, &total);
+
+               printf("\n\t**** [iCSA built on %d words. Size = %u bytes... RAM",wcsa->seSize,total);
+       }               
+       return 0;
+}
+
+/** ********************************************************************
+  * Loading from disk
+  **********************************************************************/ 
+
+/**-----------------------------------------------------------------  
+ * LoadWCSA.                                                       
+ * Loads all the data structures of WCSA (included the icsa)     
+ ----------------------------------------------------------------- */ 
+
+twcsa *loadWCSA(char *filename) {
+       twcsa *wcsa;
+       // Inicializes the arrays used to detect if a char is valid or not.
+       StartValid();
+       // Inicializes the arrays used translated a char into lowercase.
+       StartToLow();
+       
+       wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
+       wcsa->n=0;
+
+       int err = loadIntIndex(filename, (void **)&wcsa->myicsa);
+
+       loadStructs(wcsa,filename);                                                                       
+
+       return wcsa;
+}
+
+/** ------------------------------------------------------------------
+ * LoadStructs.
+ *     Reads files and loads all the data needed for searcherFacade
+ ----------------------------------------------------------------- */ 
+ void loadStructs(twcsa *wcsa, char *basename) {
+       uint i,j;
+       char *filename;
+       int file;
+       uint sizeFile;
+       char c;
+       uint n;
+               
+       filename = (char *)malloc(sizeof(char)*(strlen(basename)+10));
+       fprintf(stderr,"Loading Index from file %s.*\n", basename);
+       
+       //** SOME CONSTANTS: sourceTextSize
+       {       strcpy(filename, basename);
+               strcat(filename, ".");
+               strcat(filename, CONSTANTS_FILE_EXT);
+               
+               if( (file = open(filename, O_RDONLY)) < 0) {
+                       printf("Cannot open file %s\n", filename);
+                       exit(0);
+               }
+                               
+               read(file, &(wcsa->sourceTextSize), sizeof(uint));      
+               read(file, &(wcsa->seSize), sizeof(uint));      
+               close(file);
+       }               
+       
+       /** File with the words from the vocabulary (sorted alphabetically) */
+       {       byte *zoneMem;
+               
+               strcpy(filename, basename);
+               strcat(filename, ".");
+               strcat(filename, VOCABULARY_WORDS_FILE_EXT);
+               //sizeFile= fileSize(filename)-sizeof(uint);
+
+               if( (file = open(filename, O_RDONLY)) < 0) {
+                       printf("Cannot open file %s\n", filename);
+                       exit(0);
+               }
+               
+               //the number of canonical words
+               read(file, &n, sizeof(uint));
+               wcsa->n = n;
+               read(file, &(wcsa->wordsData.elemSize), (sizeof(uint)));
+               read(file, &(wcsa->wordsData.wordsZoneMem.size), (sizeof(uint)));
+               
+               //allocating the memory needed for all words and reading them   //(ascii) << no \0 chars are needed>>.  
+               wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc(wcsa->wordsData.wordsZoneMem.size * sizeof(byte));                  
+               read(file, (wcsa->wordsData.wordsZoneMem.zone),    wcsa->wordsData.wordsZoneMem.size * sizeof(byte) );
+       
+               //reading the offsets of the words (kbitArray that points to offsets in zoneMem of words.
+               wcsa->wordsData.words = (uint *) malloc (((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * sizeof(uint));        
+               wcsa->wordsData.words[ ((((n+1)*(wcsa->wordsData.elemSize))+W-1) /W)   -1 ] =0000;
+               read(file, (wcsa->wordsData.words),     ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)));
+               
+
+               close(file);
+       }       
+       wcsa->se= NULL; 
+       free(filename);
+}
+
+
+
+
+/** ****************************************************************
+  * Querying the index WCSA
+  * ***************************************************************/
+///////////////////////////////////////////////////////////////////////////////////////
+//                                        FUNCTIONS NEEDED FOR SEARCHING A PATTERN                                      //
+///////////////////////////////////////////////////////////////////////////////////////
+
+
+
+/*------------------------------------------------------------------
+ * Given a text pattern translates it into a list of integers (corresponding to the
+ * canonical words associated to the valid words in the text pattern) 
+ ------------------------------------------------------------------*/
+void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *integerPattern, uint *sizeIntegers) {
+       
+       byte *pbeg,*pend,*wordstart,*aWord;
+       register unsigned long size;
+       uint index =0; 
+                       
+       pbeg = textPattern; 
+       pend = pbeg + patLen;
+                                                       
+       while (pbeg <pend) {  
+               //parsing either a word or separator.                   
+               size=0;
+               wordstart = pbeg;
+               if (_Valid[*pbeg]) {   //alphanumerical data
+                       while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+         }                 
+               else {
+                       if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word                                      
+                               while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
+                       }
+                       else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
+                               pbeg++;
+                               if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
+                               else {
+                                       if (_Valid [*pbeg] ) {
+                                               wordstart = pbeg;   //So skipping 1 blank character
+                                               while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+                                       }
+                                       else {   // a "separator word" ...
+                                               size++; //the prev BLANK...
+                                               while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
+                                       }//else {  // a "separator word"
+                               }//else ... not a unique BLANK AT THE END.
+                       }//else ... starting by a BLANK... 
+               }
+                                               
+               //The parsed word is "aWord", and its length is "size"...
+               aWord=wordstart;        
+               
+               // Binary search on the canonical words (wordsData)
+               {
+                       uint len, tmplen;
+                       uchar *wcsaWord;
+                       register uint min,max,p;        
+                       min = 0;
+                       max = (wcsa->n) - 1;
+                       while(min < max) {
+                               p = (min+max)/2;
+                               
+                               {//preparing for strcompL
+                                       len    = bitread (wcsa->wordsData.words, (p+1)* wcsa->wordsData.elemSize , wcsa->wordsData.elemSize);
+                                       tmplen = bitread (wcsa->wordsData.words, (p )* wcsa->wordsData.elemSize  , wcsa->wordsData.elemSize);
+                                       len -=tmplen;
+                                       wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
+                               }
+                               
+                               //if(strncmp((char*)aWord, (char*)wcsa->wordsData[p].word,size) > 0) min = p+1;
+                               if(strcompL(aWord, wcsaWord, size, len) > 0) min = p+1;
+                               else max = p;
+
+
+                               //                              { //SHOW PROGRESS
+                               //                                      fprintf(stderr,"\n Patron = <<%s>>, curposWord= %d ==><<",aWord,p);
+                               //                                      printWord(wcsaWord,len); fprintf(stderr,">> len =%d",len);
+                               //                              }
+                                       
+                       }
+                       
+                       {//preparing for strcompL
+                               len    = bitread (wcsa->wordsData.words, (min+1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
+                               tmplen = bitread (wcsa->wordsData.words, ( min )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
+                               len -=tmplen;
+                                       wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen;
+                       }
+
+                       //      if(!strncmp((char*)aWord, (char*)wcsa->wordsData[min].word, size)) {
+                       if(!strcompL(aWord, wcsaWord, size, len)) {
+                               integerPattern[index++] = min +1 ;  //<--
+                       }
+                       else {*sizeIntegers = 0; return;}  // a valid word that does not appear in the source text.
+                       
+               }                                                                                                               
+       }// end while
+       *sizeIntegers = index;  
+       
+       //      //shows the parsed words:
+       //      {uint i;
+       //              printf("\n\n >>%s>> HA SIDO PARSEADO COMO:",textPattern);
+       //              for (i=0; i<index;i++) {
+       //                              printf("<<%s>>",wcsa->wordsData[integerPattern[i] -1].word);
+       //              }
+       //              
+       //      }       
+}
+
+
+
+
+
+       /** ------------------------------------------------------------------
+        * Returns the number of occurrences of a given text pattern
+        *------------------------------------------------------------------ */
+int countTextOcurrences(twcsa *wcsa, byte *textPattern) {
+       ulong left, right;
+       uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
+       uint integerPatternSize, min, max;
+       
+       uint lenpat = strlen((char*)textPattern);
+       parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize); 
+       if (!integerPatternSize) return -1;
+               
+//     #ifdef DEBUG_ON         
+//             uint i;
+//             printf("\n %d Integers to search for:",integerPatternSize );
+//             for (i=0;i<integerPatternSize;i++) {
+//                     printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word);                       
+//             }       
+//             printf("\n");
+//     #endif
+
+       ulong numocc;
+       int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, &numocc,  &left, &right);
+       return numocc;  
+
+}
+
+
+       /** ------------------------------------------------------------------
+        * locateTextOcurrences: 
+        * Returns the offsets of the source text where a word/phrase appears
+        * Returns also the number of occurrences.
+        *------------------------------------------------------------------ */
+uint *locateTextOcurrences(twcsa *wcsa, byte *textPattern, int *numberOccurrences) {
+       uint integerPatterns[MAX_INTEGER_PATTERN_SIZE];
+       uint integerPatternSize, min, max;
+       
+       uint lenpat = strlen((char*)textPattern);
+       parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize);         
+       if (!integerPatternSize) {*numberOccurrences = -1; return NULL;}
+               
+//     #ifdef DEBUG_ON         
+//             uint i;
+//             printf("\n %d Integers to search for:",integerPatternSize );
+//             for (i=0;i<integerPatternSize;i++) {
+//                     printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word);                       
+//             }       
+//             printf("\n");
+//     #endif
+               
+       ulong occurrences, left, right;
+       ulong *seOffsets;
+       ulong *sourceOffsets;
+       
+       //obtains the indexes in vector SE where the pattern appears.
+       //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences);
+       int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences);
+       
+       //sourceOffsets = (uint *) malloc (sizeof(uint)*occurrences);
+       
+       sourceOffsets=seOffsets;
+       //obtains the offsets in the source text of the pattern (sourceOffsets)
+       locateFacade(wcsa, (uint *)sourceOffsets, (uint *)seOffsets,occurrences);
+
+       #ifdef DEBUG_ON         
+               fprintf(stderr,"\n*** %s appears in the source text in positions:\n\t",textPattern);
+               for (i=0;i<occurrences;i++) 
+                       fprintf(stderr,"[%u]",sourceOffsets[i]);
+               fflush(stderr);
+       #endif
+
+       *numberOccurrences = occurrences;
+       return (uint *) sourceOffsets;
+}
+
+
+       /** ------------------------------------------------------------------
+        * displayTextOcurrences:
+        * Shows in stdout, the text around the occurrences of a word/phrase
+        * Returns also the number of occurrences.
+        *------------------------------------------------------------------ */
+int displayTextOcurrences(twcsa *wcsa, byte *textPattern, uint radixDisplay) {
+       return 99;  //not implemented: function not available
+}
+
+       /** ------------------------------------------------------------------
+        * Locate Facade: 
+        * For given sePositions, returns the sourceTextPositions
+        * where the those valid-words in se[sePositions[i]] occurr.
+        *------------------------------------------------------------------*/ 
+int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint number) {
+       return 99;  //not implemented: function not available for this index
+}
+
+
+/** ------------------------------------------------------------------
+       * DISPLAYFACADE:
+       * Returns the subString from a starting offset to a final offset
+       * in the source text. It does not allocate any memory, receives "dstptr"
+       * Precondition: offsetIni >=0;
+       ------------------------------------------------------------------*/
+ int displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length, byte *dstptr) {
+       return 99;  //not implemented: function not available for this index            
+}
+
+
+       /**------------------------------------------------------------------
+        * DISPLAYFacadeMalloc:
+        * Returns the subString from a starting offset to a final offset
+        * in the source text. It allocates Memory !!
+        * NOT CURRENTLY USED
+        ------------------------------------------------------------------*/
+byte *displayFacadeMalloc (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length) {
+       byte *dstptr=NULL;         //not implemented: function not available
+       return dstptr;          
+}
+
+
+       /** ------------------------------------------------------------------ 
+        * LOCATEALLandDISPLAY:
+        * Displays the text around an occurrence of the searched word in the source text.
+        * Assuming that $p$ is that position --> shows only chars in [p_radix-1,p_radix]
+        ------------------------------------------------------------------*/ 
+int locateAllAndDisplay (twcsa *wcsa, uint *sePositions, uint number, int radix) {
+return 99;   //not implemented: function not available for this index
+
+}
+
+
+       /** ------------------------------------------------------------------
+        * recovers the source text by calling display(0,fileSize);
+        * ------------------------------------------------------------------ */
+void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {         
+
+       int start;int end;
+       byte *cc;
+       char *filename = (char *) malloc (strlen( basename)+ strlen(ext)+1);
+       ulong length;
+       
+       strcpy( filename,  basename);
+       strcat( filename, ext);
+       filename[strlen( basename)+ strlen(ext)]='\0';
+       fprintf(stderr,"\n uncompressing text into file -->%s ",filename);fflush(stderr);
+
+       FILE *salida;
+       unlink( filename);
+       salida = fopen( filename,"w");
+       start=0; end = sourceTextSize-1;
+       
+       cc = (byte *) malloc (sourceTextSize* sizeof(uchar));
+
+       {
+               uint i, j;//, tmplen;
+               uint prevValid;
+               //uint ptr, maxptr;
+               byte *src, *dst;
+               uint leng =0;
+               uint tmplen =0;
+               
+               uint indexSE=0;
+               uint posSEValue=0;
+       
+               dst=cc;
+               while  ( (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */
+
+                       int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue);
+                       
+                       {//obtains pointer to the ith word
+                               uint offtmp;
+                               uint ith = posSEValue -1;  // !! 
+                               tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
+                               offtmp = bitread (wcsa->wordsData.words, ( ith  )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize);
+                               tmplen -=offtmp;  //the lenght of the ith word.
+                               src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp;
+                       }                                       
+
+                       if (_Valid[*src]) {
+                               if (prevValid){
+                                        *dst++ =' ';                                           
+                                       leng +=1;
+                               }
+                               prevValid =1;   //for the next iteration
+                       }
+                       else prevValid=0;               
+                                                                                                                                                                                       
+                       indexSE++;
+                                                                                               
+                       for (j=0;j<tmplen;j++) {*dst++ = *src++;}         //copies word to the output buffer
+                       leng +=tmplen;
+               }//while
+
+       fprintf(stderr,"\n sourceTextSize = %d, len = %d",sourceTextSize,leng);
+       fwrite(cc,sizeof(byte),leng,salida);
+       fclose(salida);
+
+       free(cc);
+       free(filename);
+}
+
+
+}
+
+               //recovers the source text by calling extract Words.
+void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) {         
+
+       int start;int end; int error;
+       char *filename = (char *) malloc (strlen( basename)+ strlen(ext)+1);
+       byte *cc;
+       ulong length;
+       
+       strcpy( filename,  basename);
+       strcat( filename, ext);
+       filename[strlen( basename)+ strlen(ext)]='\0';
+       fprintf(stderr,"\n uncompressing text into file -->%s ",filename);fflush(stderr);
+
+       FILE *salida;
+       unlink( filename);
+       salida = fopen( filename,"w");
+       start=0; end = wcsa->seSize;
+       
+       error = extractWords((void *) wcsa, start, end, &cc, &length);
+       if (error) {fprintf(stderr,"\n error during recoverSourceText2"); exit(0);}
+
+       fprintf(stderr,"\n sourceTextSize = %d, len = %ld",sourceTextSize,length);
+       fwrite(cc,sizeof(byte),length,salida);
+       fclose(salida);
+
+       free(cc);
+       free(filename);         
+}
+
+/** *******************************************************************************
+  * Showing some statistics and info of the index
+  * *******************************************************************************/
+void printInfoReduced(twcsa *wcsa) {
+         //not implemented: function not available
+}
+
+               /* Shows summary info of the index */
+int printInfo(void *index) {
+       uint n;
+
+       twcsa *wcsa = (twcsa *) index;
+       
+       unsigned long indexSize;
+       uint intIndexSize, presentationSize;
+       int err;
+       
+       err = index_size(index, &indexSize);
+       if (err!=0) return err;
+       err = sizeIntIndex(wcsa->myicsa, &intIndexSize);        
+       if (err!=0) return err;
+       
+       presentationSize = indexSize - intIndexSize;
+       
+               printf("\n ===================================================:");              
+               printf("\n Summary of Presentation layer:");            
+               printf("\n   Number of valid words (SEsize) = %u",wcsa->seSize);
+               printf("\n   Number of different words = %ld",wcsa->n);
+               printf("\n   WCSA structure = %d bytes", sizeof(twcsa));
+
+               uint totalpointers = ((((wcsa->n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));
+               uint totalasciizone = wcsa->wordsData.wordsZoneMem.size * sizeof(byte) ;
+               uint totalwords = totalasciizone + totalpointers;
+
+               printf("\n   Size Of words structure (%d bytes):",totalwords);
+               printf("\n      [     pointers = %d bytes || AsciiZone = %d bytes", totalpointers,      totalasciizone);        
+                               
+               printf("\n\n Total = **  %u bytes (in RAM) **",presentationSize);               
+               //printf("\n\n @@ Summary of self-index on Integers:");
+               err = printInfoIntIndex(wcsa->myicsa, " ");
+               if (err!=0) return err;
+                       
+               printf("\n ===================================================:");              
+               printf("\n");
+               return 0;
+       }
+
+/**------------------------------------------------------------------
+ * structsSize.
+ *     Counts the memory amount needed by the Facade (Presentation Layer).
+ * skipping the stop_words hash table 
+ ----------------------------------------------------------------- */
+uint structsSizeMem(twcsa *wcsa) {
+       return 0;   //not implemented: function not available for this index.
+}
+
+
+/** for debugging **/
+void printWord(uchar *str, uint len) {
+               uint i;
+               for (i=0;i<len;i++)
+                       fprintf(stderr,"%c",str[i]);
+}
+
+
+       /** saves the content of the file SE (ids of the source words) **/
+int saveSEfile (char *basename, uint *v, uint n) {
+       char outfilename[255];
+       int file;
+       sprintf(outfilename,"%s.%s",basename,SE_FILE_EXT);
+       unlink(outfilename);
+       if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
+               printf("Cannot open file %s\n", outfilename);
+               exit(0);
+       }
+
+       write(file, v, sizeof(uint) * n ); 
+       close(file);
+}
+
+
+
+double getTime2 (void)
+{
+       double usertime, systime;
+       struct rusage usage;
+
+       getrusage (RUSAGE_SELF, &usage);
+
+       usertime = (double) usage.ru_utime.tv_sec +
+               (double) usage.ru_utime.tv_usec / 1000000.0;
+       systime = (double) usage.ru_stime.tv_sec +
+               (double) usage.ru_stime.tv_usec / 1000000.0;
+
+       return (usertime + systime);
+}
+
+
+
+/**------------------------------------------------------------------ 
+  *  MAIN PROGRAM.
+  *------------------------------------------------------------------ */
+#ifdef FACADEWITHMAIN
+       int main(int argc, char* argv[])
+       {
+               
+                       
+               
+               char *infile, *outbasename, *stopwordsfile;     // Name of in/out files
+               byte *inputBuffer;
+               ulong finsize;
+       
+               int f_in;
+               void *Index;
+
+               
+               printf("\n*Word-based iCSA: A word-based CSA");
+               printf("\n*CopyRight (c) 2008 [LBD & G.N.]\n\n");
+       
+               // Reads input parameters from command line.
+               if(argc < 3) {
+                       printf("Use: %s <in file> <out basename> \n", argv[0]);
+                       exit(0);
+               }
+       
+               // Reads params (input file, output basename, and stopwords file)
+               infile = argv[1];
+               outbasename = argv[2];
+               stopwordsfile = argv[3];
+               
+               finsize= fileSize(infile);
+               
+               if (! finsize) {
+                       printf( "\nFILE EMPTY OR FILE NOT FOUND %s !!\nSkipping processement ...\n",infile);
+                       exit(0);
+               }       
+       
+               // Opening the input text file.
+               if( (f_in = open(infile, O_RDONLY)) < 0) {
+                       printf("Cannot read file %s\n", infile);
+                       exit(0);
+               }       
+               inputBuffer = (byte *) malloc(finsize *sizeof(byte));// +1);
+               read (f_in,inputBuffer,finsize);        
+               close (f_in);   
+               
+               
+       {
+               //printf("\n parametros <<%s>>\n\n",stopwordsfile);     
+               build_index (inputBuffer, finsize, stopwordsfile, &Index);  /** building the index */
+
+//             /** recovering the source text from the index */
+                       {
+                               double start, end;
+                               start = getTime2();
+                               ulong size;
+                               get_length(Index, &size);
+                               char extension[10]= ".source";
+                               
+                               //recoverSourceText1((twcsa*) Index, outbasename,extension, size);
+                               strcat(extension,"2");
+                               recoverSourceText2((twcsa*) Index, outbasename,extension,size);
+                               end = getTime2();       
+                               fprintf(stderr, "\nRecovering source file time: %.3f secs\n", end-start );      
+                       }
+//             
+               // DISPLAYING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
+                       {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
+                        int error = 0;
+                       ulong numocc,numc, length, i, *snippet_len, tot_numcharext = 0, numpatt;
+                       uchar *pattern, *snippet_text;
+                                
+                                pattern = textPattern;
+                        printf("\nSEARCH TEST for DISPLAY (pizzachili interface)\n");
+                               while(1) {      
+                                       printf("Intro string: ");
+                                       fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
+                                       if (!strcmp((char*)textPattern,"\n") ) break;
+                                        textPattern[strlen((char*)textPattern)-1] = '\0';
+               
+                                       length = strlen( (char*)textPattern);
+                                       numc=50;
+         
+//                                     error = display (Index, textPattern, length, numc, &numocc, 
+//                                                              &snippet_text, &snippet_len);
+                                       error = displayWords (Index, textPattern, length, numc, &numocc, 
+                                                                &snippet_text, &snippet_len,1);
+                                       
+                                       if (error){ fprintf(stderr, "%s\n", "Hubo un error durante display");exit(0);}
+               
+                                               fprintf(stderr,"\n acabou display");fflush(stderr);                     
+                                       {//show the results
+                                               ulong j, len = length + 2*numc;
+                                           char blank = '\0';
+                                               fprintf(stderr,"\n length = %d",length);
+                                               fprintf(stderr,"\n pattern = %s",pattern);fflush(stderr);
+                                               fprintf(stderr,"\n numocc = %d",numocc);fflush(stderr);
+                                               fprintf(stderr,"\n snippet len = %d",len);fflush(stderr);
+                                               fprintf(stderr,"\n =========");fflush(stderr);          
+                                               for (i = 0; i < numocc; i++){
+                                                       fprintf(stderr,"\n[%2d][len=%3d]<<",i+1,snippet_len[i]);fflush(stderr);
+                                                       fwrite(snippet_text+len*i,sizeof(uchar),snippet_len[i],stderr);fflush(stderr);
+                                                       fprintf(stderr,">>");fflush(stderr);
+                                               }
+                                       }
+                                       numpatt--;
+                                       
+                                       for(i=0; i<numocc; i++) {
+                                               tot_numcharext += snippet_len[i];
+                                       }
+                                               
+                                       if (numocc) {
+                                               free (snippet_len);
+                                               free (snippet_text);
+                                       }
+                                       
+                                       printf("Ocurrences = %d\n", numocc);
+                                       if (!strcmp((char*)textPattern,"\n") ) break;
+                               }
+                       }
+       
+//
+//
+//     // SEARCHING FOR A TEXT PATTERN (word/phrase).
+//     {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
+//      int occ;
+//      int len;
+//      uint *occs;
+//      int i;
+//      printf("\nSEARCH TEST for LOCATE\n");
+//             while(1) {      
+//                     printf("Intro string: ");
+//                     fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
+//                     len = strlen((char*)textPattern);
+//                     if (!strcmp((char*)textPattern,"\n") ) break;
+//                     textPattern[len-1] = '\0';
+//                     len --;
+//                     
+//                     //occs = locateTextOcurrences(wcsa,textPattern,&occ);
+//                     // locate(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ);
+//                       locateWord(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ, 0);
+//                     
+//                     printf("\n*** %s occurs %d times: In the source text in positions:\n\t",textPattern,occ);
+//                     for (i=0;i<occ;i++) 
+//                             printf("[%u]",occs[i]);
+//                     fflush(stderr); 
+//                     free(occs);             
+//                     
+//                     if (!strcmp((char*)textPattern,"\n") ) break;
+//             }
+//     }       
+//     
+//
+
+               // COUNTING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
+               /*
+               {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
+                int occ;
+                int len;
+                printf("\nSEARCH TEST for COUNT.\n");
+                       while(1) {      
+                               printf("Intro string: ");
+                               fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
+                               len = strlen((char*)textPattern);
+                               if (!strcmp((char*)textPattern,"\n") ) break;
+                               textPattern[len-1] = '\0';
+                               len --;
+                               
+                               count(Index, textPattern, len, (ulong *)&occ);                  
+                               //occ = countTextOcurrences(wcsa,textPattern);
+                               printf("Ocurrences = %d\n", occ);
+                       }
+               }
+               printf("\n END COUNTING OCCURRENCES OF PATTERNS. ...\n");
+               //exit(0);
+               */
+
+               /** saving the index to disk*/
+               save_index (Index, outbasename);
+
+               /** tells the mem used by the index */
+               ulong indexsize;                
+               index_size(Index, &indexsize);
+               fprintf(stderr,"Index occupied %d bytes, 2 extra mallocs = %d",indexsize,2* sizeof(uint));
+
+               /** freeing the index */                                        
+               free_index(Index);                              
+                        
+       }
+}
+       
+#endif
+
+
+       
diff --git a/swcsa/buildFacade.h b/swcsa/buildFacade.h

new file mode 100755 (executable)

index 0000000..05bd708
--- /dev/null
+++ b/swcsa/buildFacade.h
@@ -0,0 +1,295 @@
+/* only for getTime() */
+#include <sys/time.h>
+#include <sys/resource.h>
+
+
+#include "utils/valstring.h"
+#include "utils/defValues.h"
+#include "utils/MemoryManager.h"
+#include "utils/fileInfo.h"
+
+#include "utils/hash.h"
+
+#include "utils/huff.h"
+//#include "utils/errors.c"
+#include "utils/parameters.h"
+
+//from SEARCHER FACADE 
+#include "utils/huffDec.h"
+//#include "icsa/icsa.h"
+
+#include "intIndex/interfaceIntIndex.h"
+
+#ifndef uchar
+#define uchar unsigned char
+#endif
+#ifndef uint
+#define uint  unsigned int
+#endif
+#ifndef ulong
+#define ulong unsigned long
+#endif
+
+#define STRLEN(str,len) \
+{len=0; \
+ byte *ptr = str; \
+ while(*ptr++) len++; \
+}
+
+#define ADDLEN(str,len) \
+{byte *ptr = str; \
+       while(*ptr++) len++; \
+}
+
+/** Some data types used **ONLY**during construction process */
+       
+// Words, both the canonical words and their variants
+       typedef struct {
+               unsigned long slot;   // the position in the hash table of the canonical word
+               byte *word; //makes alphanumerical sorting easier...
+       } tposInHT;
+       
+       
+       typedef struct SzoneMem {        //a large block of memory to load a file into mem.
+               byte *zone;  //block of mem.
+               uint size;  //number of bytes            
+       } tZoneMem;
+
+       //  words dataStructure.        
+       typedef struct {
+               uint *words;
+               uint elemSize;  //the size (in bits) of each pointer.
+               tZoneMem wordsZoneMem; // a block of memory where the canonical words are loaded (from file).
+
+       } twords;
+
+
+/** Some data types used during searches */
+
+       
+
+
+       /**the WCSA index structures... */
+       typedef struct {
+                       
+               /**valid words */
+               twords wordsData;               /* vocabulary (words) of the index */
+
+               ulong n;                                /* number of different words. */                
+               uint seSize;                    /* number of words in the source text */ 
+
+               uint sourceTextSize;    /*the size of the source text in bytes*/
+
+               //ticsa *myicsa; //the WiCSA on SE words
+               void *myicsa; //the WiCSA on SE words
+       
+               //#ifndef CSA_ON
+               uint *se;
+               //#endif
+
+       }twcsa;
+
+
+/** ******************************************************************************
+    * Interface (from pizza chili) for using the WCSA index
+*********************************************************************************/
+
+/* Error management */
+
+        /* Returns a string describing the error associated with error number
+          e. The string must not be freed, and it will be overwritten with
+          subsequent calls. */
+
+char *error_index (int e);
+
+/* Building the index */
+
+        /* Creates index from text[0..length-1]. Note that the index is an 
+          opaque data type. Any build option must be passed in string 
+          build_options, whose syntax depends on the index. The index must 
+          always work with some default parameters if build_options is NULL. 
+          The returned index is ready to be queried. */
+
+int build_index (uchar *text, ulong length, char *build_options, void **index);
+
+        /*  Saves index on disk by using single or multiple files, having 
+          proper extensions. */
+
+int save_index (void *index, char *filename);
+
+        /*  Loads index from one or more file(s) named filename, possibly 
+          adding the proper extensions. */
+
+int load_index (char *filename, void **index);
+
+        /* Frees the memory occupied by index. */
+
+int free_index (void *index);
+
+        /* Gives the memory occupied by index in bytes. */
+
+int index_size(void *index, ulong *size);
+
+/* Querying the index */
+
+        /* Writes in numocc the number of occurrences of the substring 
+          pattern[0..length-1] found in the text indexed by index. */
+
+int count (void *index, uchar *pattern, ulong length, ulong *numocc);
+
+        /* Gives the length of the text indexed */
+
+int get_length(void *index, ulong *length);
+
+/* Accessing the indexed text  */
+
+        /* Writes in numocc the number of occurrences of the substring 
+          pattern[0..length-1] in the text indexed by index. It also allocates
+          occ (which must be freed by the caller) and writes the locations of 
+          the numocc occurrences in occ, in arbitrary order.  */
+
+int locate (void *index, uchar *pattern, ulong length, ulong **occ, 
+        ulong *numocc);
+
+        /*  Allocates snippet (which must be freed by the caller) and writes 
+          the substring text[from..to] into it. Returns in snippet_length the 
+          length of the text snippet actually extracted (that could be less 
+          than to-from+1 if to is larger than the text size). */
+
+int extract (void *index, ulong from, ulong to, uchar **snippet, 
+        ulong *snippet_length);
+
+        /* Displays the text (snippet) surrounding any occurrence of the 
+          substring pattern[0..length-1] within the text indexed by index. 
+          The snippet must include numc characters before and after the 
+          pattern occurrence, totalizing length+2*numc characters, or less if 
+          the text boundaries are reached. Writes in numocc the number of 
+          occurrences, and allocates the arrays snippet_text and 
+          snippet_lengths (which must be freed by the caller). The first is a 
+          character array of numocc*(length+2*numc) characters, with a new 
+          snippet starting at every multiple of length+2*numc. The second 
+          gives the real length of each of the numocc snippets. */
+
+int display (void *index, uchar *pattern, ulong length, ulong numc, 
+        ulong *numocc, uchar **snippet_text, ulong **snippet_lengths);
+
+        /*  Obtains the length of the text indexed by index. */
+
+int length (void *index, ulong *length);
+
+               /* Shows summary info of the index */
+int printInfo(void *index);
+
+/** *******************************************************************************************/
+/** Building part of the index ****************************************************************/
+
+int build_WCSA (uchar *text, ulong length, char *build_options, void **index);
+int build_iCSA (char  *build_options, void *index); 
+
+
+
+/** *******************************************************************************************/
+/** Search part of the index ******************************************************************/
+// Definitions of some PUBLIC function prototipes.
+
+               //loading/freeing the data structures into memory.
+       
+    void loadStructs(twcsa *wcsa, char *basename);     
+       twcsa *loadWCSA(char *filename);        
+       
+               //returns the source text from given [offsetIni, offsetFin] offsets.
+       //byte *displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin);
+       byte *displayFacadeMalloc (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length);  
+       int displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length, byte *dstptr);
+
+               //locate all the ocurrences of a word/phrase 
+       int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint number);
+       
+               //show text around the occurrences of a word. 
+       int locateAllAndDisplay (twcsa *wcsa, uint *sePositions, uint number, int radix);
+
+               //recovers the source text by calling display (either only once or "len" times)
+       void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
+       void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
+
+       //***Searching for a TEXT pattern ...
+       
+               //extracts the ids of the valid words of a "plain text".
+       void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *integerPattern, uint *sizeIntegers) ;
+               
+               //counts the occurrences of a given text pattern.
+       int countTextOcurrences(twcsa *wcsa, byte *textPattern);
+       
+               //returns the offsets (to the source text) where of a given text pattern appears.       
+       uint *locateTextOcurrences(twcsa *wcsa, byte *textPattern, int *numberOccurrences);
+       
+               //shows a snippet with the text around the ocurrences of a pattern.     
+       int displayTextOcurrences(twcsa *wcsa, byte *textPattern, uint radixDisplay);
+
+
+/** ***********************************************************************************
+  * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord
+  * ***********************************************************************************/  
+       /** Writes in numocc the number of occurrences of the substring 
+         pattern[0..length-1] in the text indexed by index. It also allocates
+         occ (which must be freed by the caller) and writes the locations of 
+         the numocc occurrences in occ, in arbitrary order. These occurrences
+         refer to the offsets in TOH where the caller could start a display
+         operation. So locateWord implies synchronization using B.
+         ** Parameter kbefore sets locateWord not to obtain the offset in TOH of the
+            searched word, but the offset in TOH of k-before words before.       
+       */        
+         
+int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore);
+
+  /** Displays the text (snippet) surrounding any occurrence of the 
+    substring pattern[0..length-1] within the text indexed by index. 
+    The snippet must include numc characters before and after the 
+    pattern occurrence, totalizing length+2*numc characters, or less if 
+    the text boundaries are reached. Writes in numocc the number of 
+    occurrences, and allocates the arrays snippet_text and 
+    snippet_lengths (which must be freed by the caller). The first is a 
+    character array of numocc*(length+2*numc) characters, with a new 
+    snippet starting at every multiple of length+2*numc. The second 
+    gives the real length of each of the numocc snippets. */
+
+ int displayWords (void *index, uchar *pattern, ulong length, ulong numc, 
+         ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore);
+
+
+/** simulates extration of text process, but do not actually returns anything at all 
+   Extracts upto <=2K words from K=wordsbefore words before each occurrence of a pattern.
+   Less than 2K words can be extracted if more than numc characters have been already obtained.
+   Do nothing else... do not return the text */
+   
+       int  displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc);
+
+
+/**  Allocates text (which must be freed by the caller) and recovers the
+  the substring of text starting from the "fromword"-th word up to the
+  "toWord"-th words. Returns in text the text, and in "text_lenght" the 
+  length of the text  actually extracted. Text is allocated. 
+  Actually extracts SE[fromWord .. toWord) ... not the last word.    */
+
+int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text, 
+        ulong *text_length);
+
+
+
+
+               //recovers the source text by calling display (either only once or "len" times)
+       void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
+       void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
+
+
+// Definitions of PRIVATE functions
+
+       //Auxiliary functions
+
+       uint structsSizeDisk(twcsa *wcsa);
+       uint structsSizeMem(twcsa *wcsa);
+       void printInfoReduced(twcsa *wcsa); 
+       int saveSEfile (char *basename, uint *v, uint n);
+       double getTime2 (void); 
+
+
diff --git a/swcsa/buildIntIndex.c b/swcsa/buildIntIndex.c

new file mode 100644 (file)

index 0000000..3622a4b
--- /dev/null
+++ b/swcsa/buildIntIndex.c
@@ -0,0 +1,138 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "buildFacade.h"
+
+/* only for getTime() */
+#include <sys/time.h>
+#include <sys/resource.h>
+ 
+/* macro to detect and notify errors  */
+#define IFERROR(error) {{if (error) { fprintf(stderr, "%s\n", error_index(error)); exit(1); }}}
+
+int loadSEfile (char *basename, uint **v, uint *n);
+void print_usage(char *);
+double getTime(void);
+
+int main(int argc, char *argv[]) {
+
+       char *basenamefile;
+       char *params = NULL;
+       void *index; uint index_len;
+       int error, i;
+       double start, end;
+       uint *se;
+       uint seSize;
+
+    if (argc < 2) print_usage(argv[0]);
+       if (argc > 2) { 
+               int nchars, len;
+               nchars = argc-2;
+               for(i=1;i<argc;i++)
+                       nchars += strlen(argv[i]);
+               params = (char *) malloc((nchars+1)*sizeof(char));
+               params[nchars] = '\0';
+               nchars = 0;
+               for(i=2;i<argc;i++) {
+                       len = strlen(argv[i]);
+                       strncpy(params+nchars,argv[i],len);
+                       params[nchars+len] = ' ';
+                       nchars += len+1;
+               }
+               params[nchars-1] = '\0';
+       }
+
+       basenamefile = argv[1];
+
+       start = getTime();
+
+       error = loadSEfile (basenamefile, &se, &seSize);
+       IFERROR(error);
+               
+    //fprintf(stderr,"\n parameters (stopwordsFilename): \"%s\"\n",params); fflush(stderr);
+    
+       error =  buildIntIndex(se,seSize, params,(void **)&index);                              
+       IFERROR(error);
+
+       if (index) {
+               error = saveIntIndex(index, basenamefile);              
+       }       
+               
+       IFERROR(error);
+       end = getTime();        
+       
+       error = sizeIntIndex(index, &index_len);
+       IFERROR(error);
+
+       printf("\n\n\t Freeing memory...");
+               
+       error = freeIntIndex(index);
+       IFERROR(error);
+
+       free(se); 
+       free(params);
+       
+       fprintf(stdout,"\n\tBuilding time (**building self-index on ints: %.3f secs", end-start );
+       fprintf(stdout,"\n\t ## Input: %u bytes --> Output (pres_layer) %u bytes.", seSize*sizeof(uint), index_len);
+       fprintf(stdout,"\n\t ## Overall compression --> %.3f%% (%.3f bits per char).\n\n",
+                       (100.0*index_len)/(seSize*sizeof(uint)), (index_len*8.0)/(seSize*sizeof(uint)));
+
+       exit(0);
+}
+
+
+int loadSEfile (char *basename, uint **v, uint *n){
+       char filename[255];
+       int file; 
+       sprintf(filename,"%s.%s",basename,SE_FILE_EXT);
+       
+       uint sizeFile = fileSize(filename);
+       
+       if( sizeFile <= 0) {
+               printf("Cannot read information from file %s\n", filename);     return -1;
+       }       
+       
+       if( (file = open(filename, O_RDONLY)) < 0) {
+               printf("Cannot open file %s\n", filename);
+               return -1;
+       }
+
+       uint *se = (uint *) malloc (sizeFile);
+       uint seSize = sizeFile / sizeof(uint);
+       read(file, se, sizeFile);               //the samples
+       close(file);
+       *v=se;
+       *n=seSize;      
+       return 0;       
+}
+
+
+
+double
+getTime (void)
+{
+
+       double usertime, systime;
+       struct rusage usage;
+
+       getrusage (RUSAGE_SELF, &usage);
+
+       usertime = (double) usage.ru_utime.tv_sec +
+               (double) usage.ru_utime.tv_usec / 1000000.0;
+
+       systime = (double) usage.ru_stime.tv_sec +
+               (double) usage.ru_stime.tv_usec / 1000000.0;
+
+       return (usertime + systime);
+
+}
+
+void print_usage(char * progname) {
+       fprintf(stderr, "Usage: %s <index file> [<parameters>]\n", progname);
+       fprintf(stderr, "\nIt builds the index on Integer for the sequence in <index file>.se,\n");
+       fprintf(stderr, "storing it in [<index file>.[*]]; Any additional <parameters> \n");
+       fprintf(stderr, "will be passed to the construction function.\n");
+       fprintf(stderr, "At the end, the program sends to the standard error \n");
+       fprintf(stderr, "performance measures on time to build it.\n\n");
+       exit(1);
+}
diff --git a/swcsa/buildParser.c b/swcsa/buildParser.c

new file mode 100644 (file)

index 0000000..b5ad256
--- /dev/null
+++ b/swcsa/buildParser.c
@@ -0,0 +1,138 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "buildFacade.h"
+
+/* only for getTime() */
+#include <sys/time.h>
+#include <sys/resource.h>
+ 
+/* macro to detect and notify errors  */
+#define IFERROR(error) {{if (error) { fprintf(stderr, "%s\n", error_index(error)); exit(1); }}}
+
+int read_file(char *filename, uchar **textt, ulong *length);
+void print_usage(char *);
+double getTime(void);
+
+int main(int argc, char *argv[]) {
+
+       char *infile, *outfile;
+    uchar *text;
+       char *params = NULL;
+       ulong text_len;
+       void *index;
+       int error, i;
+       double start, end;
+
+    if (argc < 3) print_usage(argv[0]);
+       if (argc > 3) { 
+               int nchars, len;
+               nchars = argc-3;
+               for(i=2;i<argc;i++)
+                       nchars += strlen(argv[i]);
+               params = (char *) malloc((nchars+1)*sizeof(char));
+               params[nchars] = '\0';
+               nchars = 0;
+               for(i=3;i<argc;i++) {
+                       len = strlen(argv[i]);
+                       strncpy(params+nchars,argv[i],len);
+                       params[nchars+len] = ' ';
+                       nchars += len+1;
+               }
+               params[nchars-1] = '\0';
+       }
+
+       infile = argv[1];
+       outfile = argv[2];
+
+       start = getTime();
+       error = read_file(infile, &text, &text_len);
+       IFERROR(error);
+
+       //error = build_index(text, text_len, params, &index);
+
+    //fprintf(stderr,"\n parameters (stopwordsFilename): \"%s\"\n",params); fflush(stderr);
+    error = build_WCSA (text, text_len, params, &index);
+    //returnvalue = build_iCSA (params,*index);
+                       
+       IFERROR(error);
+
+       error = save_index(index, outfile);
+       IFERROR(error);
+       end = getTime();        
+
+       
+       ulong index_len;
+       index_size(index, &index_len);
+
+       error = free_index(index);
+       IFERROR(error);
+
+       free(params);
+       
+       fprintf(stdout,"\n\n\t ## Building time (**parsing into integers + present_layer: %.3f secs", end-start );
+       fprintf(stdout,"\n\t ## Input: %lu bytes --> Output (pres_layer) %lu bytes.", text_len, index_len);
+       fprintf(stdout,"\n\t ## Overall compression --> %.2f%% (%.2f bits per char).\n\n",
+                       (100.0*index_len)/text_len, (index_len*8.0)/text_len);
+
+       exit(0);
+}
+
+/* 
+  Opens and reads a text file 
+*/
+int read_file(char *filename, uchar **textt, ulong *length) {
+
+  uchar *text;
+  unsigned long t; 
+  FILE *infile;
+  
+  infile = fopen(filename, "rb"); // b is for binary: required by DOS
+  if(infile == NULL) return 1;
+  
+  /* store input file length */
+  if(fseek(infile,0,SEEK_END) !=0 ) return 1;
+  *length = ftell(infile);
+  
+  /* alloc memory for text (the overshoot is for suffix sorting) */
+  text = (uchar *) malloc((*length)*sizeof(*text)); 
+  if(text == NULL) return 1;  
+  
+  /* read text in one sweep */
+  rewind(infile);
+  t = fread(text, sizeof(*text), (size_t) *length, infile);
+  if(t!=*length) return 1;
+  *textt = text;
+  fclose(infile);
+  
+  return 0;
+}
+
+double
+getTime (void)
+{
+
+       double usertime, systime;
+       struct rusage usage;
+
+       getrusage (RUSAGE_SELF, &usage);
+
+       usertime = (double) usage.ru_utime.tv_sec +
+               (double) usage.ru_utime.tv_usec / 1000000.0;
+
+       systime = (double) usage.ru_stime.tv_sec +
+               (double) usage.ru_stime.tv_usec / 1000000.0;
+
+       return (usertime + systime);
+
+}
+
+void print_usage(char * progname) {
+       fprintf(stderr, "Usage: %s <source file> <index file> [<parameters>]\n", progname);
+       fprintf(stderr, "\nIt builds the index for the text in file <source file>,\n");
+       fprintf(stderr, "storing it in <index file>. Any additional <parameters> \n");
+       fprintf(stderr, "will be passed to the construction function.\n");
+       fprintf(stderr, "At the end, the program sends to the standard error \n");
+       fprintf(stderr, "performance measures on time to build the index.\n\n");
+       exit(1);
+}
diff --git a/swcsa/buildStats.c b/swcsa/buildStats.c

new file mode 100644 (file)

index 0000000..4339bbd
--- /dev/null
+++ b/swcsa/buildStats.c
@@ -0,0 +1,106 @@
+/*
+ * Run Queries
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include "buildFacade.h"
+
+/* only for getTime() */
+#include <sys/time.h>
+#include <sys/resource.h>
+
+/* macro to detect and to notify errors */
+#define IFERROR(error) {{if (error) { fprintf(stderr, "%s\n", error_index(error)); exit(1); }}}
+
+
+/* local headers */
+
+double getTime (void);
+void usage(char * progname);
+               
+static void *Index;     /* opauque data type */
+static ulong Index_size, Text_length;
+static double Load_time;
+
+
+
+/*
+ * Temporary usage: run_queries <index file> <type> [length] [V]
+ */
+int main (int argc, char *argv[])
+{
+       int error = 0;
+       char *filename;
+       char querytype;
+       
+       if (argc != 2)  {
+               usage(argv[0]); 
+               exit (1);
+       }
+       
+       filename = argv[1];
+
+       printf("\n Stats of index: %s\n",argv[1]);
+       
+       Load_time = getTime ();
+       error = load_index (filename, &Index);
+       IFERROR (error);
+       Load_time = getTime () - Load_time;
+       fprintf (stderr, "\tLoad index time = %.2f secs\n", Load_time);
+
+       error = index_size(Index, &Index_size);
+       IFERROR (error);
+
+       error = get_length(Index, &Text_length);
+       IFERROR (error);
+       
+       
+       ulong text_len;
+       error = get_length(Index, &text_len);
+       IFERROR (error);
+       
+       error = printInfo(Index);
+       IFERROR(error);         
+       
+       error = free_index(Index);
+       IFERROR(error);
+       
+
+
+       fprintf(stdout,"\t===============================================\n");
+       fprintf(stdout,"\tInput: %lu bytes (text) --> Output %lu bytes (wcsa).\n", text_len, Index_size);
+       fprintf(stderr,"\tIndex size = %lu Kb\n", Index_size/1024);
+       fprintf(stdout,"\tOverall compression --> %.2f%% (%.2f bits per char).\n",
+                       (100.0*Index_size)/text_len, (Index_size*8.0)/text_len);
+       fprintf(stdout,"\t===============================================\n");
+
+
+       return 0;
+}
+
+double
+getTime (void)
+{
+       double usertime, systime;
+       struct rusage usage;
+
+       getrusage (RUSAGE_SELF, &usage);
+
+       usertime = (double) usage.ru_utime.tv_sec +
+               (double) usage.ru_utime.tv_usec / 1000000.0;
+
+       systime = (double) usage.ru_stime.tv_sec +
+               (double) usage.ru_stime.tv_usec / 1000000.0;
+
+       //return (usertime + systime);
+       return (usertime );
+}
+
+
+void usage(char * progname) {  
+       fprintf(stderr, "\nThe program loads <index> and shows some stats on it\n");
+       fprintf(stderr, "Usage:  %s <index> \n", progname);
+}
diff --git a/swcsa/intIndex/Makefile b/swcsa/intIndex/Makefile

new file mode 100755 (executable)

index 0000000..3fb9c0c
--- /dev/null
+++ b/swcsa/intIndex/Makefile
@@ -0,0 +1,55 @@
+SRCDIR = .
+SRCDIRCSA = .
+SRCDIRUTILS = ../utils
+CC          = g++
+
+ifndef CFLAGS  ##possibly already defined by the "main Makefile".
+       ##CFLAGS = -O9 -m32
+       CFLAGS      = -O9 -m32 
+       ##CFLAGS      = -g -m32 -O0
+endif
+
+LIBINTINDEX    = icsa.a
+
+all: intIndex cleanO
+
+
+intIndex: icsa.o parameters.o basics.o huff.o bitmap.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o
+       ar rc $(LIBINTINDEX) icsa.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o  
+                       #not including "parameters.o basics.o bitmap.o huff.o" as they are included by wcsa
+                       #they are already included into the library by the presentation layer.
+
+icsa.o: parameters.o basics.o bitmap.o huff.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o 
+        $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSA)/icsa.c 
+
+psiHuffmanRLE.o: huff.o
+       $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSA)/psiHuffmanRLE.c 
+
+psiDeltaCode.o:
+       $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSA)/psiDeltaCode.c 
+
+psiGonzalo.o: huff.o
+       $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSA)/psiGonzalo.c 
+
+parameters.o: 
+       $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/parameters.c 
+       
+       
+huff.o: 
+       $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/huff.c
+
+basics.o: 
+       $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/basics.c
+
+bitmap.o: 
+       $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/bitmap.c
+
+
+cleanO: 
+       rm -f *.o
+       
+clean:
+       rm -rf *~ *% *.o core *.bak icsa.a 
+
+tar:
+       tar czvf icsa.tar.gz  Makefile
diff --git a/swcsa/intIndex/defValues.h b/swcsa/intIndex/defValues.h

new file mode 100644 (file)

index 0000000..e19f339
--- /dev/null
+++ b/swcsa/intIndex/defValues.h
@@ -0,0 +1,39 @@
+#ifndef DEFVALUESSAD
+#define DEFVALUESSAD
+
+// CONFIGURACI�NS DO CSA DE SADAKANE PARA TEXTOS DE ENTEIROS
+
+// Parametros configurables
+#define DEFAULT_A_SAMPLING_PERIOD 256          // Periodo de muestreo de A
+#define DEFAULT_A_INV_SAMPLING_PERIOD 256              // Periodo de muestreo de inversa A
+#define DEFAULT_PSI_SAMPLING_PERIOD 256                // Periodo de muestreo da funcion PSI
+
+/*********************************/
+//#define PSI_HUFFMANRLE                                       // Uses Lbd's implementation of Psi. Improvement over Gonzalo's (in compression and speed)
+#define DEFAULT_nsHUFF  16*1024                // huffman limit on Psi. it can be optimized for space
+
+#define PSI_DELTACODES                         // Uses Delta codes for increments of Psi. Faster but compresses less than the others
+/*********************************/
+
+
+#define MAX_FILENAME_LENGTH 256                                                // Lonxitude maxima do nome do ficheiro
+
+// Extensions dos ficheiros creados
+#define NUMBER_OF_ELEMENTS_FILE_EXT "CSA.noe"                  // Numero de elementos (tama�o de A, Psi e D)
+
+#define D_FILE_EXT "CSA.Dbv"                                                   // O vector D de Sadakane
+#define D_RANK_DIRECTORY_FILE_EXT "CSA.Drd"                            // O directorio para Rank sobre D
+
+#define SAMPLES_A_FILE_EXT "CSA.sA"                                            // Mostras do array de sufixos
+#define BA_FILE_EXT "CSA.BAbv"                                                 // O vector D de Sadakane
+#define BA_RANK_DIRECTORY_FILE_EXT "CSA.BArd"                  // O directorio para Rank sobre D
+
+#define SAMPLES_A_INV_FILE_EXT "CSA.sAI"                               // Mostras da inversa do array de sufixos
+#define SAMPLING_PERIOD_A_FILE_EXT "CSA.sTA"                   // Periodo de muestreo do array de sufixos e da inversa
+
+#define PSI_COMPRESSED_FILE_EXT "CSA.psi"                              // COdigos delta de PSI
+
+#define DEFAULT_PSI_BINARY_SEARCH_FACTOR 2                             // Periodo de muestreo na busca binaria
+//#define BINARY_SEARCH_INTERVAL 128                                   // Periodo de muestreo na busca binaria
+
+#endif
diff --git a/swcsa/intIndex/icsa.c b/swcsa/intIndex/icsa.c

new file mode 100644 (file)

index 0000000..eec4e34
--- /dev/null
+++ b/swcsa/intIndex/icsa.c
@@ -0,0 +1,976 @@
+#include "icsa.h"
+
+// Global para que funcione a funcion de comparacion do quicksort
+uint *intVector;
+
+// Para o quicksort
+int suffixCmp(const void *arg1, const void *arg2) {
+
+       register uint a,b;
+       a=*((uint *) arg1);
+       b=*((uint *) arg2);
+
+       while(intVector[a] == intVector[b]) { a++; b++; }
+       return (intVector[a] - intVector[b]);
+
+}
+
+
+
+/* **NO REVISADO TAMAÑO FILE.
+int buildIntIndexFromFile (char *filename, char *build_options,void **index) {
+       //char filename[255];
+       int file;
+       struct stat f_stat;     
+       //sprintf(filename, "%s.%s", basename,SE_FILE_EXT);
+       
+       if( (file = open(filename, O_RDONLY)) < 0) {
+               printf("Cannot open file %s\n", filename);
+               exit(0);
+       }
+       if( fstat(file, &f_stat) < 0) {
+               printf("Cannot read information from file %s\n", filename);     exit(0);
+       }       
+       uint sizeFile = (f_stat.st_size)/sizeof(uint);
+
+       uint *se = (uint *) malloc (sizeFile);
+       uint seSize = sizeFile / sizeof(uint);
+       read(file, se,  sizeFile);              //the samples
+       close(file);    
+       return (buildIntIndex(se,seSize,build_options,index));
+}
+*/
+
+//ticsa *createIntegerCSA(uint **aintVector, uint textSize, char *build_options) {
+int buildIntIndex (uint *aintVector, uint n, char *build_options, void **index ){
+       uint textSize=n;        
+       intVector = aintVector;  //global variable
+       ticsa *myicsa;
+       myicsa = (ticsa *) malloc (sizeof (ticsa));
+       uint *Psi, *SAI, *C, vocSize;
+       register uint i, j;
+       uint nsHUFF;
+
+       parametersCSA(myicsa, build_options);
+       
+       nsHUFF=myicsa->tempNSHUFF;
+       
+       // Almacenamos o valor dalguns parametros
+       myicsa->suffixArraySize = textSize;
+       myicsa->D = (uint*) malloc (sizeof(uint) * ((textSize+31)/32)); 
+       
+       myicsa->samplesASize = (textSize + myicsa->T_A - 1) / myicsa->T_A + 1;
+       myicsa->samplesA = (uint *)malloc(sizeof(int) * myicsa->samplesASize);
+       myicsa->BA = (uint*) malloc (sizeof(uint) * ((textSize+31)/32));
+       myicsa->samplesAInvSize = (textSize + myicsa->T_AInv - 1) / myicsa->T_AInv;
+       myicsa->samplesAInv = (uint *)malloc(sizeof(int) * myicsa->samplesAInvSize);
+
+       // Reservamos espacio para os vectores
+       Psi = (uint *) malloc (sizeof(uint) * textSize);
+
+       // CONSTRUIMOS A FUNCION C
+       vocSize = 0;
+       for(i=0;i<textSize;i++) if(intVector[i]>vocSize) vocSize = intVector[i];
+       C = (uint *) malloc(sizeof(uint) * (vocSize + 1));      // Para contar o 0 terminador
+       for(i=0;i<vocSize;i++) C[i] = 0;
+       for(i=0;i<textSize;i++) C[intVector[i]]++;
+       for (i=0,j=0;i<=vocSize;i++) {
+               j = j + C[i];
+               C[i] = j;
+       }
+       for(i=vocSize;i>0;i--) C[i] = C[i-1];
+       C[0] = 0;
+
+       // Construimos o array de sufixos (en Psi) - con quicksort
+       printf("\n\t *BUILDING THE SUFFIX ARRAY over %d integers... (with qsort)", textSize);fflush(stdout);
+       for(i=0; i<textSize; i++) Psi[i]=i;
+       
+       qsort(Psi, textSize, sizeof(uint), suffixCmp);
+       
+       
+       printf("\n\t ...... ended.");
+
+       // CONSTRUIMOS A INVERSA DO ARRAY DE SUFIXOS
+       SAI = (uint *) malloc (sizeof(uint) * (textSize + 1));  // +1 para repetir na ultima posición. Evitamos un if
+       for(i=0;i<textSize;i++) SAI[Psi[i]] = i;
+       SAI[textSize] = SAI[0];
+
+       // ALMACENAMOS AS MOSTRAS DO ARRAY DE SUFIXOS
+       for(i=0;i<((textSize+31)/32);i++) myicsa->BA[i] = 0;
+       for(i=0; i<textSize; i+=myicsa->T_A) bitset(myicsa->BA, SAI[i]);
+       bitset(myicsa->BA, SAI[textSize-1]);                    // A ultima posicion sempre muestreada
+       //printf("TextSize = %d\n", textSize);
+       myicsa->bBA = createBitmap(myicsa->BA, textSize);
+       for(i=0,j=0; i<textSize; i++) if(bitget(myicsa->BA, i)) myicsa->samplesA[j++] = Psi[i];
+       
+       // ALMACENAMOS AS MOSTRAS DA INVERSA DO ARRAY DE SUFIXOS
+       for(i=0,j=0;i<textSize;i+=myicsa->T_AInv) myicsa->samplesAInv[j++] = SAI[i];
+       
+       // CONSTRUIMOS E COMPRIMIMOS PSI
+       printf("\n\t Creating compressed Psi...");
+       for(i=0;i<textSize;i++) Psi[i] = SAI[Psi[i]+1];
+       free(SAI);
+       #ifdef PSI_HUFFMANRLE   
+       myicsa->hcPsi = huffmanCompressPsi(Psi,textSize,myicsa->T_Psi,nsHUFF);
+       #endif                          
+       #ifdef PSI_GONZALO      
+       myicsa->gcPsi = gonzaloCompressPsi(Psi,textSize,myicsa->T_Psi,nsHUFF);
+       #endif                  
+       #ifdef PSI_DELTACODES
+       myicsa->dcPsi = deltaCompressPsi(Psi,textSize,myicsa->T_Psi);           
+       #endif
+       free(Psi);      
+       
+       // Contruimos D
+       for(i=0;i<((textSize+31)/32);i++) myicsa->D[i] = 0;     
+       for(i=0;i<=vocSize;i++) bitset(myicsa->D, C[i]);
+       myicsa->bD = createBitmap(myicsa->D,textSize);
+       free(C);
+
+       // VARIABLE GLOBAL QUE ALMACENA O ESTADO DOS DISPLAYS (IMPORTANTE PARA DISPLAY SECUENCIAL)
+       // Almacena a última posición do array de sufixos que mostramos con display ou displayNext
+       // Se nos piden un displayNext, aplicamos PSI sobre esta posición e obtemos a seguinte,
+       // coa que podemos obter o símbolo pedido, e actualizamos displayState
+       myicsa->displayCSAState = 0;
+       myicsa->displayCSAPrevPosition = -2;  //needed by DisplayCSA(position)
+       
+       aintVector = intVector;
+       // Liberamos o espacion non necesario
+
+       *index = myicsa;
+       //return (myicsa);
+       return 0;
+}
+
+
+//Returns number of elements in the indexed sequence of integers
+int sourceLenIntIndex(void *index, uint *numInts){
+       ticsa *myicsa = (ticsa *) index;
+       *numInts= myicsa->suffixArraySize;
+       return 0; //no error;
+}
+
+int saveIntIndex(void *index, char *pathname) {
+//void storeStructsCSA(ticsa *myicsa, char *basename) {
+  
+       ticsa *myicsa = (ticsa *) index; 
+       char *basename=pathname;
+
+       char *filename;
+       int file;
+       
+       // Reservamos espacio para o nome do ficheiro
+       filename = (char *)malloc(sizeof(char)*MAX_FILENAME_LENGTH);
+               
+       // Ficheiro co n�mero de elementos indexados (enteiros do texto orixinal)
+       strcpy(filename, basename);
+       strcat(filename, ".");
+       strcat(filename, NUMBER_OF_ELEMENTS_FILE_EXT);
+       unlink(filename);
+       if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
+               printf("Cannot open file %s\n", filename);
+               exit(0);
+       }
+       write(file, &(myicsa->suffixArraySize), sizeof(int));
+       close(file);            
+
+       strcpy(filename, basename);
+       strcat(filename, ".");
+       strcat(filename, PSI_COMPRESSED_FILE_EXT);      
+
+       #ifdef PSI_HUFFMANRLE
+               storeHuffmanCompressedPsi(&(myicsa->hcPsi), filename);  
+       #endif  
+       #ifdef PSI_GONZALO
+               storeGonzaloCompressedPsi(&(myicsa->gcPsi), filename);  
+       #endif
+       #ifdef PSI_DELTACODES
+               storeDeltaCompressedPsi(&(myicsa->dcPsi), filename);
+       #endif
+       
+       // Ficheiro co vector de bits D
+       strcpy(filename, basename);
+       strcat(filename, ".");
+       strcat(filename, D_FILE_EXT);  
+       unlink(filename);
+       if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
+               printf("Cannot open file %s\n", filename);
+               exit(0);
+       }
+       write(file, myicsa->D, sizeof(int)*((myicsa->suffixArraySize+31)/32));
+       close(file);
+
+       // Directorio de rank para D
+       // Almacenamos o n�mero de superbloques seguido dos superbloques
+       // E logo o n�mero de bloques seguido dos bloques
+       strcpy(filename, basename);
+       strcat(filename, ".");
+       strcat(filename, D_RANK_DIRECTORY_FILE_EXT);
+       saveBitmap(filename,myicsa->bD);
+       
+       // Ficheiro coas mostras de A
+       strcpy(filename, basename);
+       strcat(filename, ".");
+       strcat(filename, SAMPLES_A_FILE_EXT); 
+       unlink(filename);
+       if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
+               printf("Cannot open file %s\n", filename);
+               exit(0);
+       }
+       write(file, myicsa->samplesA, sizeof(int) * (myicsa->samplesASize));
+       close(file);
+
+       // Ficheiro co vector BA (marca as posicions de A muestreadas)
+       strcpy(filename, basename);
+       strcat(filename, ".");
+       strcat(filename, BA_FILE_EXT);  
+       unlink(filename);
+       if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
+               printf("Cannot open file %s\n", filename);
+               exit(0);
+       }
+       write(file, myicsa->BA, sizeof(int)*((myicsa->suffixArraySize+31)/32));
+       close(file);
+
+       // Directorio de rank para BA
+       strcpy(filename, basename);
+       strcat(filename, ".");
+       strcat(filename, BA_RANK_DIRECTORY_FILE_EXT);
+       saveBitmap(filename, myicsa->bBA);
+       
+       // Ficheiro coas mostras de A inversa
+       strcpy(filename, basename);
+       strcat(filename, ".");
+       strcat(filename, SAMPLES_A_INV_FILE_EXT);
+       unlink(filename);
+       if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
+               printf("Cannot open file %s\n", filename);
+               exit(0);
+       }
+       write(file, myicsa->samplesAInv, sizeof(int) * (myicsa->samplesAInvSize));
+       close(file);    
+
+       // Ficheiro co periodo de muestreo de A e A inversa
+       strcpy(filename, basename);
+       strcat(filename, ".");
+       strcat(filename, SAMPLING_PERIOD_A_FILE_EXT);
+       unlink(filename);
+       if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
+               printf("Cannot open file %s\n", filename);
+               exit(0);
+       }
+       write(file, &(myicsa->T_A), sizeof(int));
+       write(file, &(myicsa->T_AInv), sizeof(int));
+       
+       write(file, &(myicsa->psiSearchFactorJump), sizeof(uint));
+
+       close(file);
+       free(filename); 
+       return 0; //no error.
+}
+
+//Returns the size (in bytes) of the index over the sequence of integers.
+//uint CSA_size(ticsa *myicsa) {
+int sizeIntIndex(void *index, uint *numBytes) {
+       ticsa *myicsa = (ticsa *) index;
+       uint size = 0;
+       size +=(sizeof (ticsa) * 1);
+       size += sizeof(uint)*((myicsa->suffixArraySize+31)/32) ;  //D vector
+       size += myicsa->bD->mem_usage;
+       size += sizeof(uint) * myicsa->samplesASize ;  // samples A
+       size += sizeof(uint) * myicsa->samplesAInvSize ;  // samples A^{-1}
+       size += sizeof(uint)*((myicsa->suffixArraySize+31)/32) ;  //BA vector
+       size += myicsa->bBA->mem_usage;
+       #ifdef PSI_HUFFMANRLE
+               size +=myicsa->hcPsi.totalMem;          
+       #endif  
+       #ifdef PSI_GONZALO
+               size +=myicsa->gcPsi.totalMem;          
+       #endif
+       #ifdef PSI_DELTACODES
+               size +=myicsa->dcPsi.totalMem;
+       #endif  
+       *numBytes = size;
+       return 0; //no error.
+}
+
+
+//ticsa *loadCSA(char *basename) {
+int loadIntIndex(char *pathname, void **index){
+
+       char *basename=pathname;
+       char *filename;
+       int file;
+       uint length;
+       char c;
+       char *word;
+       struct stat f_stat;     
+       uint suffixArraySize;
+
+       ticsa *myicsa;
+       myicsa = (ticsa *) malloc (sizeof (ticsa) * 1);
+       
+       
+       // VARIABLE GLOBAL QUE ALMACENA O ESTADO DOS DISPLAYS (IMPORTANTE PARA DISPLAY SECUENCIAL)
+       // Almacena a �ltima posici�n do array de sufixos que mostramos con display ou displayNext
+       // Se nos piden un displayNext, aplicamos PSI sobre esta posici�n e obtemos a seguinte,
+       // coa que podemos obter o s�mbolo pedido, e actualizamos displayState
+       myicsa->displayCSAState = 0;
+       myicsa->displayCSAPrevPosition = -2;  //needed by DisplayCSA(position)
+       
+       // Reservamos espacio para o nome do ficheiro
+       filename = (char *)malloc(sizeof(char)*MAX_FILENAME_LENGTH);
+
+       // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA O NUMERO DE ELEMENTOS INDEXADOS
+       strcpy(filename, basename);
+       strcat(filename, ".");
+       strcat(filename, NUMBER_OF_ELEMENTS_FILE_EXT);
+       if( (file = open(filename, O_RDONLY)) < 0) { 
+               printf("Cannot read file %s\n", filename);exit(0);
+       }       
+       read(file, &suffixArraySize, sizeof(uint));             
+       myicsa->suffixArraySize = suffixArraySize;
+       printf("Number of indexed elements (suffix array size) = %d\n", suffixArraySize);
+       
+       // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA PSI COMPRIMIDA      
+       strcpy(filename, basename);
+       strcat(filename, ".");
+       strcat(filename, PSI_COMPRESSED_FILE_EXT);              
+       #ifdef PSI_HUFFMANRLE
+               myicsa->hcPsi = loadHuffmanCompressedPsi(filename);             
+       #endif  
+       #ifdef PSI_GONZALO
+               myicsa->gcPsi = loadGonzaloCompressedPsi(filename);             
+       #endif
+       #ifdef PSI_DELTACODES
+               myicsa->dcPsi = loadDeltaCompressedPsi(filename);
+       #endif   
+       
+       // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA D
+       strcpy(filename, basename);
+       strcat(filename, ".");
+       strcat(filename, D_FILE_EXT);
+       if( (file = open(filename, O_RDONLY)) < 0) {
+               printf("Cannot read file %s\n", filename); exit(0);
+       }       
+       myicsa->D = (uint *) malloc (sizeof(uint)*((suffixArraySize+31)/32));
+       read(file, myicsa->D, sizeof(uint)*((suffixArraySize+31)/32));
+       printf("Bit vector D loaded (%d bits)\n", suffixArraySize);
+
+       // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA O DIRECTORIO DE RANK1 PARA D
+       strcpy(filename, basename);
+       strcat(filename, ".");
+       strcat(filename, D_RANK_DIRECTORY_FILE_EXT);                            
+       myicsa->bD = loadBitmap(filename,myicsa->D,suffixArraySize);
+       {       uint ns, nb;            
+               ns = myicsa->bD->sSize;
+               nb = myicsa->bD->bSize;
+               myicsa->bD->data = myicsa->D;
+               printf("Rank1 Directory for D loaded (%d superblocks, %d blocks)\n", ns, nb);   
+       }
+       
+       // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA SAMPLES A
+       strcpy(filename, basename);
+       strcat(filename, ".");
+       strcat(filename, SAMPLES_A_FILE_EXT);
+       if( (file = open(filename, O_RDONLY)) < 0) {
+               printf("Cannot read file %s\n", filename); exit(0);
+       }
+       if( fstat(file, &f_stat) < 0) {
+               printf("Cannot read information from file %s\n", filename);     exit(0);
+       }       
+       myicsa->samplesASize = (f_stat.st_size)/sizeof(uint);
+       myicsa->samplesA = (uint *)malloc(sizeof(uint) * myicsa->samplesASize);
+       read(file, myicsa->samplesA, sizeof(uint) * myicsa->samplesASize);              
+       printf("Suffix array samples loaded (%d samples)\n", myicsa->samplesASize);     
+       
+       // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA BA
+       strcpy(filename, basename);
+       strcat(filename, ".");
+       strcat(filename, BA_FILE_EXT);
+       if( (file = open(filename, O_RDONLY)) < 0) {
+               printf("Cannot read file %s\n", filename); exit(0);
+       }       
+       myicsa->BA = (uint *) malloc (sizeof(uint)*((suffixArraySize+31)/32));
+       read(file, myicsa->BA, sizeof(uint)*((suffixArraySize+31)/32));
+       printf("Bit vector BA loaded (%d bits)\n", suffixArraySize);
+
+       // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA O DIRECTORIO DE RANK1 PARA BA
+       strcpy(filename, basename);
+       strcat(filename, ".");
+       strcat(filename, BA_RANK_DIRECTORY_FILE_EXT);                           
+       myicsa->bBA = loadBitmap(filename,myicsa->BA,suffixArraySize);
+       {       uint ns, nb;            
+               ns = myicsa->bBA->sSize;
+               nb = myicsa->bBA->bSize;
+               myicsa->bBA->data = myicsa->BA;
+               printf("Rank1 Directory for BA loaded (%d superblocks, %d blocks)\n", ns, nb);  
+       }
+
+       // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA SAMPLES A INVERSA
+       strcpy(filename, basename);
+       strcat(filename, ".");
+       strcat(filename, SAMPLES_A_INV_FILE_EXT);
+       if( (file = open(filename, O_RDONLY)) < 0) {
+               printf("Cannot read file %s\n", filename); exit(0);
+       }
+       if( fstat(file, &f_stat) < 0) {
+               printf("Cannot read information from file %s\n", filename);     exit(0);
+       }       
+       myicsa->samplesAInvSize = (f_stat.st_size)/(sizeof(uint));
+       myicsa->samplesAInv = (uint *)malloc(sizeof(uint) * myicsa->samplesAInvSize);
+       read(file, myicsa->samplesAInv, sizeof(uint) * myicsa->samplesAInvSize);                
+       printf("Suffix array inverse samples loaded (%d samples)\n", myicsa->samplesAInvSize);
+       
+       // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA O PERIODO DE MUESTREO DO ARRAY DE SUFIXOS E DA INVERSA
+       strcpy(filename, basename);
+       strcat(filename, ".");
+       strcat(filename, SAMPLING_PERIOD_A_FILE_EXT);
+       if( (file = open(filename, O_RDONLY)) < 0) {
+               printf("Cannot read file %s\n", filename); exit(0);
+       }       
+       read(file, &(myicsa->T_A), sizeof(uint));
+       read(file, &(myicsa->T_AInv), sizeof(uint));    
+       printf("Sampling A Period T = %d, Sampling A inv Period TInv = %d\n", myicsa->T_A, myicsa->T_AInv);     
+       
+       read(file, &(myicsa->psiSearchFactorJump), sizeof(uint));
+       printf("Psi Bin Search Factor-Jump is = %d\n", myicsa->psiSearchFactorJump);    
+       
+       close(file);
+       free(filename);
+
+       //return myicsa;
+       *index = myicsa;
+       return 0;
+}
+       
+
+//uint destroyStructsCSA(ticsa *myicsa) {      
+int freeIntIndex(void *index) {        
+       ticsa *myicsa = (ticsa *) index;
+               // Liberamos o espacio reservado
+               
+       if (!myicsa) return 0;
+       
+       uint total=0, totaltmp=0;
+       
+       uint nbytes;sizeIntIndex(index, &nbytes);
+               
+       total +=(sizeof (ticsa) * 1);;
+       
+       #ifdef PSI_HUFFMANRLE
+               total+= totaltmp = myicsa->hcPsi.totalMem;
+               destroyHuffmanCompressedPsi(&(myicsa->hcPsi));
+       #endif
+       #ifdef PSI_GONZALO
+               total+= totaltmp = myicsa->gcPsi.totalMem;
+               destroyGonzaloCompressedPsi(&(myicsa->gcPsi));
+       #endif
+       #ifdef PSI_DELTACODES
+               total+= totaltmp = myicsa->dcPsi.totalMem;
+               destroyDeltaCodesCompressedPsi(&(myicsa->dcPsi));       
+       #endif
+       printf("\n\t[destroying  SA: compressed PSI structure] ...Freed %u bytes... RAM",totaltmp);
+       
+       free(myicsa->D);                        total+= totaltmp =  (sizeof(uint)*((myicsa->suffixArraySize+31)/32)); 
+                                                       printf("\n\t[destroying  SA: D vector] ...Freed %u bytes... RAM",totaltmp);
+       free(myicsa->samplesA);         total+= totaltmp = (sizeof(uint) * myicsa->samplesASize); 
+                                                       printf("\n\t[destroying  Samples A: A   ] ...Freed %u bytes... RAM",totaltmp);
+       free(myicsa->samplesAInv);      total+= totaltmp =  (sizeof(uint) * myicsa->samplesAInvSize); 
+                                                       printf("\n\t[destroying  Samples AInv: A   ] ...Freed %u bytes... RAM",totaltmp);                                                       
+                                               printf("\n\t[destroying  rank bit D   ] ...Freed %u bytes... RAM",myicsa->bD->mem_usage);
+       free(myicsa->BA);                       total+= totaltmp =  (sizeof(uint)*((myicsa->suffixArraySize+31)/32)); 
+                                                       printf("\n\t[destroying  SA: BA vector] ...Freed %u bytes... RAM",totaltmp);
+                                                               
+                                                               total += myicsa->bD->mem_usage;
+       destroyBitmap(myicsa->bD);                      
+                                                               total += myicsa->bBA->mem_usage;
+       destroyBitmap(myicsa->bBA);
+                                                               
+       printf("\n\t**** [the whole iCSA ocuppied ... %u bytes... RAM",total);
+       printf("\n\t**** iCSA size = %d bytes ",nbytes);
+       printf("\n");
+       
+       free(myicsa);
+       
+       return 0; //no error.
+}
+
+       // Shows detailed summary info of the self-index (memory usage of each structure)
+int printInfoIntIndex(void *index, const char tab[]) {
+       ticsa *myicsa = (ticsa *) index;
+       if (!myicsa) return 0;
+       
+       uint structure, totalpsi, totalD, totalBD, totalSA,totalSAinv, totalBA,totalBBA;
+       
+       structure=sizeof(ticsa);
+       
+       #ifdef PSI_HUFFMANRLE
+               totalpsi = myicsa->hcPsi.totalMem;
+       #endif
+       #ifdef PSI_GONZALO
+               totalpsi = myicsa->gcPsi.totalMem;
+       #endif
+       #ifdef PSI_DELTACODES
+               totalpsi = myicsa->dcPsi.totalMem;
+       #endif
+
+       totalD   = (sizeof(uint)*((myicsa->suffixArraySize+31)/32));
+       totalBD  = myicsa->bD->mem_usage;
+       totalSA  = (sizeof(uint) * myicsa->samplesASize); 
+       totalSAinv = (sizeof(uint) * myicsa->samplesAInvSize); 
+       totalBA  = (sizeof(uint)*((myicsa->suffixArraySize+31)/32));
+       totalBBA = myicsa->bBA->mem_usage;
+       
+       uint nbytes; sizeIntIndex(index, &nbytes); //whole self-index
+       
+       printf("\n ===================================================:");              
+       printf("\n%sSummary Self-index on integers (icsa) layer:",tab);         
+       printf("\n%s   icsa structure = %d bytes",tab, structure);
+       printf("\n%s   psi         = %8d bytes",tab, totalpsi);
+       printf("\n%s   D (bitmap)  = %8d bytes",tab, totalD);
+       printf("\n%s   rank for D  = %8d bytes",tab, totalBD);
+       printf("\n%s   SA(sampled) = %8d bytes",tab, totalSA);
+       printf("\n%s   SAinv(samp) = %8d bytes",tab, totalSAinv);
+       printf("\n%s   BA (bitmap) = %8d bytes",tab, totalBA);
+       printf("\n%s   rank for BA = %8d bytes",tab, totalBBA);
+       printf("\n%sTotal = ** %9d bytes (in RAM) ** ",tab, nbytes);
+       printf("\n");
+       
+       return 0; //no error.
+}
+
+
+// OPERACIONS DO CSA
+
+// BUSCA BINARIA SOBRE MOSTRAS + 2 BUSCAS EXPONENCIAIS + 2 BUSCAS BINARIAS
+// 1º Busca binaria sobre o array de sufixos, elexindo como pivote un múltiplo de bin_search_psi_skip_interval (que orixinalmente foi pensado para igualar co valor de Psi).
+// 2º Esta busca pode deterse por:
+//     a) O pivote repítese entre dúas iteracións -> As ocorrencias están entre o pivote e a seguinte mostra (pivote + bin_search_psi_skip_interval) -> facemos dúas buscas binarias
+//     b) O pivote é unha ocorrencia do patrón -> Faise unha busca exponencial sobre mostras hacia a esquerda e outra hacia a dereita, ata atopar a unha mostra á esquerda e outra
+//     á dereita do intervalo de ocorrencias. Entre cada unha destas e a inmediatamente anterior da busca exponencial, faise unha busca binaria para atopar os extremos do intervalo.
+
+int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right){  
+       //unsigned int countCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right) {
+
+       uint patternSize = length;
+       ticsa *myicsa = (ticsa *) index;
+       
+       register unsigned long l, r, i;
+       register long comp, p, previousP;
+       //register unsigned int l, r, i;
+       //register int comp, p, previousP;
+       register uint bin_search_psi_skip_interval = myicsa->psiSearchFactorJump;
+       
+       //fprintf(stderr,"\n psiSearchFactor = %d",myicsa->psiSearchFactorJump);
+       
+       l = 0; 
+       r = (myicsa->suffixArraySize+bin_search_psi_skip_interval-2)/bin_search_psi_skip_interval*bin_search_psi_skip_interval;
+       p = ((l+r)/2)/bin_search_psi_skip_interval * bin_search_psi_skip_interval;
+       previousP = 0;
+       
+       // BUSCA BINARIA SOBRE MOSTRAS
+       while( (p != previousP) && (comp = SadCSACompare(myicsa, pattern, patternSize, p)) ) {
+               if(comp > 0) l = p;
+               else r = p;
+               previousP = p;
+               p = ((l+r)/2)/bin_search_psi_skip_interval*bin_search_psi_skip_interval;
+       }
+
+       if(p==previousP) {
+       
+               // BUSCA BINARIA ENTRE O PIVOTE E A SEGUINTE MOSTRA
+               l = previousP; 
+               r = previousP+bin_search_psi_skip_interval;
+               if(r > myicsa->suffixArraySize) r = myicsa->suffixArraySize - 1;
+               while(l < r) {
+                       p = (l+r)/2; 
+                       if(SadCSACompare(myicsa, pattern, patternSize, p) <= 0) r = p;
+                       else l = p+1;
+               }
+
+               if(SadCSACompare(myicsa, pattern, patternSize, r)) {
+                       *left = l;
+                       *right = r;
+                       //return 0;
+                       *numocc = 0;
+                       return 0; //no error.
+               }
+               *left = r;
+       
+               l = previousP; 
+               r = previousP+bin_search_psi_skip_interval;
+               if(r > myicsa->suffixArraySize) r = myicsa->suffixArraySize - 1;
+               while(l < r) {
+                       p = (l+r+1)/2;
+                       if(SadCSACompare(myicsa, pattern, patternSize, p) >= 0) l = p;
+                       else r = p-1;   
+               }
+               *right = l;
+               
+       } else {
+               
+               previousP = p;  // En previousP poñemos o p atopado na busca sobre as mostras
+               
+               // BUSCA EXPONENCIAL HACIA ATRÁS
+               i = 1;
+               p -= bin_search_psi_skip_interval;
+               while(p>0 && !SadCSACompare(myicsa, pattern, patternSize, p)) {
+                       i<<=1;
+                       p = previousP-(i*bin_search_psi_skip_interval);
+               }
+               // Busca binaria entre as duas ultimas mostras da exponencial
+               if(previousP > i*bin_search_psi_skip_interval) l = previousP-(i*bin_search_psi_skip_interval);
+               else l=0;
+               i>>=1;          
+               r = previousP-(i*bin_search_psi_skip_interval);
+               while(l < r) {
+                       p = (l+r)/2; 
+                       if(SadCSACompare(myicsa, pattern, patternSize, p) <= 0) r = p;
+                       else l = p+1;
+               }
+               *left = r;
+               
+               // BUSCA EXPONENCIAL HACIA ADIANTE
+               i = 1;
+               p = previousP+bin_search_psi_skip_interval;
+               while(p<myicsa->suffixArraySize && !SadCSACompare(myicsa, pattern, patternSize, p)) {
+                       i<<=1;
+                       p = previousP+(i*bin_search_psi_skip_interval);         
+               }
+               // Busca binaria entre as duas ultimas mostras da exponencial
+               if(p < myicsa->suffixArraySize) r = previousP+(i*bin_search_psi_skip_interval);
+               else r = myicsa->suffixArraySize-1;
+               i>>=1;          
+               l = previousP+(i*bin_search_psi_skip_interval);
+               while(l < r) {
+                       p = (l+r+1)/2;
+                       if(SadCSACompare(myicsa, pattern, patternSize, p) >= 0) l = p;
+                       else r = p-1;   
+               }
+               *right = l;     
+       }
+
+       //return *right-*left+1;
+       *numocc = (uint) *right-*left+1;
+       return 0; //no error            
+}
+
+// Version inicial de busca binaria
+unsigned int countCSABin(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right) {
+       register ulong l, r, p;
+
+       l = 0; 
+       r = myicsa->suffixArraySize-1;
+
+       while(l < r) {
+               p = (l+r)/2; 
+               if(SadCSACompare(myicsa, pattern, patternSize, p) <= 0) r = p;
+               else l = p+1;
+       }
+
+       // SE SON DISTINTOS O PATRON NON APARECE NO TEXTO E DEVOLVEMOS 0
+       if(SadCSACompare(myicsa, pattern, patternSize, r)) {
+               *left = l;
+               *right = r;
+               return 0;
+       }
+       
+       // Almacenamos o limite esquerdo
+       *left = r;
+
+       // SE SON IGUALES (O PATRON APARECE NO TEXTO), BUSCAMOS AGORA O LIMITE DEREITO, QUE ALMACENAREMOS EN right
+       // NOTA: INICIAMOS A BUSQUEDA A PARTIR DO ESQUERDO...
+       l = r; 
+       r = myicsa->suffixArraySize-1;
+       
+       while(l < r) {
+               p = (l+r+1)/2;
+               if(SadCSACompare(myicsa, pattern, patternSize, p) >= 0) l = p;
+               else r = p-1;   
+       }
+       
+       // Gardamos o limite dereito
+       *right = l;
+       
+       return (uint) *right-*left+1;   
+}
+
+int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong *numocc) {
+       //unsigned int *locateCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *occ) {
+       
+       ticsa *myicsa = (ticsa *) index;
+       uint patternSize = length;
+       ulong *positions;
+       ulong occurrences;
+       register ulong left, right;
+       
+               //occurrences = countCSA(myicsa, pattern, patternSize, &left, &right);
+       int err;
+       err = countIntIndex((void *) myicsa, pattern, patternSize, &occurrences, &left, &right);
+       *numocc = occurrences;
+
+       if (occurrences) {
+               register ulong idx = 0;
+               positions = (ulong*) malloc(sizeof(ulong) * occurrences);
+               while(left<=right) positions[idx++] = A(myicsa,left++); 
+               
+               *occ = positions;
+               return 0;       
+               //return positions;
+       }
+       
+       (*occ)=NULL;
+       return 0; //no error, but no occurrences.
+       
+       //return NULL;
+}
+
+// Devolve o enteiro do texto que ocupa a posicion dada,
+// e fixa o estado para poder seguir obtendo os seguintes enteiros con displayNext();
+
+int displayIntIndex(void *index, ulong position, uint *value){
+       //uint displayCSA(ticsa *myicsa, uint position) {
+       ticsa *myicsa = (ticsa *) index;
+       if (position == (myicsa->displayCSAPrevPosition +1)) {
+               myicsa->displayCSAPrevPosition = position;
+               //return displayCSANext(myicsa);
+               *value = displayCSANext(myicsa);
+       }
+       else {
+               myicsa->displayCSAPrevPosition = position;
+               //return displayCSAFirst(myicsa, position);
+               *value = displayCSAFirst(myicsa, position);
+       }
+       return 0; //no error
+}
+
+
+/**********************************************************************/
+
+// Devolve o enteiro do texto que ocupa a posicion dada, e fixa o estado 
+// para poder seguir obtendo os seguintes enteiros con displayNext();
+uint displayCSAFirst(ticsa *myicsa, uint position) {
+
+       register uint positionAux, index;
+       register uint T_AInv = myicsa->T_AInv;
+       
+       positionAux = myicsa->samplesAInv[position / T_AInv];
+       for(index = 0; index < position%T_AInv; index++) {
+               #ifdef PSI_HUFFMANRLE
+                       positionAux=getHuffmanPsiValue(&(myicsa->hcPsi),positionAux);
+               #endif                   
+               #ifdef PSI_GONZALO
+                       positionAux=getGonzaloPsiValue(&(myicsa->gcPsi),positionAux);
+               #endif
+               #ifdef PSI_DELTACODES
+                       positionAux=getDeltaPsiValue(&(myicsa->dcPsi),positionAux);
+               #endif          
+       }
+       
+       // Fijamos a variable global para a chamada a displayNext
+       myicsa->displayCSAState = positionAux;
+       
+       //      return rank1(D, Dir, positionAux) - 1;
+       return rank(myicsa->bD, positionAux) - 1;
+}
+
+
+// Devolve o seguinte enteiro do texto (a partir do estado)
+unsigned int displayCSANext(ticsa *myicsa) {   
+       #ifdef PSI_HUFFMANRLE
+               myicsa->displayCSAState=getHuffmanPsiValue(&(myicsa->hcPsi),myicsa->displayCSAState);
+       #endif           
+       #ifdef PSI_GONZALO
+               myicsa->displayCSAState = getGonzaloPsiValue(&(myicsa->gcPsi),myicsa->displayCSAState);
+       #endif
+       #ifdef PSI_DELTACODES
+               myicsa->displayCSAState = getDeltaPsiValue(&(myicsa->dcPsi),myicsa->displayCSAState);
+       #endif  
+       return rank(myicsa->bD, myicsa->displayCSAState) - 1;   
+}
+
+
+// Mostra as estructuras creadas
+void showStructsCSA(ticsa *myicsa) {
+
+       unsigned int index;
+
+       // ESTRUCTURAS PARA CSA
+       printf("Basic CSA structures:\n\n");
+       
+       // VALORES DA FUNCI�N PSI (decodificando)
+       printf("\tPSI: (Sampling period = %d)\n", myicsa->T_Psi);
+       for(index=0; index < myicsa->suffixArraySize; index++)   
+               #ifdef PSI_HUFFMANRLE
+                       printf("\tPsi[%d] = %d\n", index, getHuffmanPsiValue(&(myicsa->hcPsi),index));
+               #endif
+               #ifdef PSI_GONZALO
+                       printf("\tPsi[%d] = %d\n", index, getGonzaloPsiValue(&(myicsa->gcPsi),index));
+               #endif
+               #ifdef PSI_DELTACODES
+                       printf("\tPsi[%d] = %d\n", index, getDeltaPsiValue(&(myicsa->dcPsi),index));            
+               #endif                          
+       printf("\n");
+       
+       // VECTOR D DE SADAKANE CO DIRECTORIO DE RANK ASOCIADO
+       printf("\tD = ");
+       showBitVector(myicsa->D, myicsa->suffixArraySize);
+       printf("\n\nSuperbloques de D:\n");
+       {       uint ns;
+               uint nb;
+               ns = myicsa->bD->sSize;
+               nb= myicsa->bD->bSize;
+               for(index=0; index<ns; index++) {
+                       //printf("\tDs[%d] = %d\n", index, Dir.Ds[index]);
+                       printf("\tDs[%d] = %d\n", index, myicsa->bD->sdata[index]);
+               }
+               printf("\nBloques de D:\n");
+               
+               for(index=0; index<nb; index++) {
+                       //printf("\tDb[%d] = %d\n", index, Dir.Db[index]);
+                       printf("\tDb[%d] = %d\n", index, myicsa->bD->bdata[index]);             
+               }       
+               printf("\n\n");
+       }
+       // ESTRUCTURAS PARA ACCEDER O ARRAY DE SUFIXOS E A SUA INVERSA
+       printf("Suffix Array Sampling Structures: (Sampling period = %d)\n", myicsa->T_A);
+       printf("\tSuffix Array Samples:\n");
+       for(index=0; index < myicsa->samplesASize; index++) 
+               printf("\tSamplesA[%d] = %d\n", index, myicsa->samplesA[index]);
+       printf("\n");   
+       printf("\tInverse Suffix Array Samples:\n");
+       for(index=0; index < myicsa->samplesASize; index++) 
+               printf("\tSamplesAInv[%d] = %d\n", index, myicsa->samplesAInv[index]);
+       printf("\n");
+                       
+}
+
+
+// Comparacion de Sadakane entre un patron (pattern) y el sufijo en la posicion p del array de sufijos
+// IMPORTANTE EVITAR ULTIMA CHAMADA A PSI
+int SadCSACompare(ticsa *myicsa, uint *pattern, uint patternSize, uint p) {
+
+       register unsigned int j, i, currentInteger, diff;
+       
+       i = p;
+       j = 0;
+       
+       while(1) {
+               currentInteger = rank(myicsa->bD, i) - 1;
+               diff = pattern[j++] - currentInteger;
+               if(diff) return diff;
+               if(j == patternSize) return 0;
+               else 
+                       #ifdef PSI_HUFFMANRLE
+                               i=getHuffmanPsiValue(&(myicsa->hcPsi),i);
+                       #endif
+                       #ifdef PSI_GONZALO
+                               i=getGonzaloPsiValue(&(myicsa->gcPsi),i);
+                       #endif          
+                       #ifdef PSI_DELTACODES
+                               i=getDeltaPsiValue(&(myicsa->dcPsi),i);
+                       #endif
+       }
+       
+}
+
+
+// Acceso a array de sufixos A
+inline uint A(ticsa *myicsa, uint position) {
+
+       register uint timesPsi, sampleValue;
+       register uint T_A = myicsa->T_A;
+       
+       uint proba = position;
+       
+       timesPsi = 0;
+       while(!bitget(myicsa->BA, position)) {
+       
+               #ifdef PSI_HUFFMANRLE
+                       position=getHuffmanPsiValue(&(myicsa->hcPsi),position);
+               #endif           
+               #ifdef PSI_GONZALO
+                       position=getGonzaloPsiValue(&(myicsa->gcPsi),position);
+               #endif
+               #ifdef PSI_DELTACODES
+                       position=getDeltaPsiValue(&(myicsa->dcPsi),position);
+               #endif
+               timesPsi++;
+               
+       }
+       sampleValue = myicsa->samplesA[rank(myicsa->bBA, position)-1];
+       
+       return sampleValue - timesPsi;
+
+}
+
+
+// Acceso 'a inversa do array de sufixos
+inline uint inverseA(ticsa *myicsa, uint offset) {
+
+       register uint index, inverseValue;
+       register uint T_AInv = myicsa->T_AInv;
+       
+       inverseValue = myicsa->samplesAInv[offset/T_AInv];
+       for(index=0; index<(offset%T_AInv); index++) 
+               #ifdef PSI_HUFFMANRLE
+                       inverseValue=getHuffmanPsiValue(&(myicsa->hcPsi),inverseValue);
+               #endif          
+               #ifdef PSI_GONZALO
+                  inverseValue = getGonzaloPsiValue(&(myicsa->gcPsi),inverseValue);
+               #endif  
+               #ifdef PSI_DELTACODES
+                       inverseValue = getDeltaPsiValue(&(myicsa->dcPsi),inverseValue);
+               #endif
+       return inverseValue;
+       
+}
+
+// Initializes the parameters of the index.
+uint parametersCSA(ticsa *myicsa, char *build_options){ 
+       char delimiters[] = " =;";
+       int j,num_parameters;
+       char ** parameters;
+       int ssA,ssAinv,ssPsi,nsHuff,psiSearchFactor;
+       
+       ssA    = DEFAULT_A_SAMPLING_PERIOD;
+       ssAinv = DEFAULT_A_INV_SAMPLING_PERIOD;
+       ssPsi  = DEFAULT_PSI_SAMPLING_PERIOD;
+       nsHuff = DEFAULT_nsHUFF;
+       psiSearchFactor = DEFAULT_PSI_BINARY_SEARCH_FACTOR;
+       
+       if (build_options != NULL) {
+       parse_parameters(build_options,&num_parameters, &parameters, delimiters);
+       for (j=0; j<num_parameters;j++) {
+         
+               if ((strcmp(parameters[j], "sA") == 0 ) && (j < num_parameters-1) ) {
+                       ssA=atoi(parameters[j+1]);                      
+               } 
+               if ((strcmp(parameters[j], "sAinv") == 0 ) && (j < num_parameters-1) ) {
+                       ssAinv=atoi(parameters[j+1]);                   
+               }       
+               if ((strcmp(parameters[j], "sPsi") == 0 ) && (j < num_parameters-1) ) {
+                       ssPsi=atoi(parameters[j+1]);                    
+               }       
+               if ((strcmp(parameters[j], "nsHuff") == 0 ) && (j < num_parameters-1) ) {
+                       nsHuff=atoi(parameters[j+1]);
+                       nsHuff *=1024;                  
+               } 
+               if ((strcmp(parameters[j], "psiSF") == 0 ) && (j < num_parameters-1) ) {
+                       psiSearchFactor=atoi(parameters[j+1]);
+               }                       
+               j++;
+       }
+       free_parameters(num_parameters, &parameters);
+       }               
+
+       myicsa->T_A = ssA;
+       myicsa->T_AInv = ssAinv;
+       myicsa->T_Psi = ssPsi;
+       myicsa->tempNSHUFF = nsHuff;
+       myicsa->psiSearchFactorJump = psiSearchFactor * ssPsi;  
+
+       printf("\n\t parameters for iCSA: sampleA=%d, sampleAinv=%d, samplePsi=%d",ssA,ssAinv,ssPsi);
+       printf("\n\t              : nsHuff=%d, psiSearchFactor = %d --> jump = %d", nsHuff,psiSearchFactor, myicsa->psiSearchFactorJump);
+}
diff --git a/swcsa/intIndex/icsa.h b/swcsa/intIndex/icsa.h

new file mode 100644 (file)

index 0000000..102832c
--- /dev/null
+++ b/swcsa/intIndex/icsa.h
@@ -0,0 +1 @@
+#include <stdio.h>\r#include <fcntl.h>\r#include <sys/stat.h>\r#include <time.h>\r#include <sys/time.h>\r\r#include "defValues.h"\r#include "../utils/bitmap.h"\r#include "../utils/huff.h"\r#include "../utils/parameters.h"\r\r#ifdef PSI_HUFFMANRLE\r   #include "psiHuffmanRLE.h"\r#endif\r\r#ifdef PSI_GONZALO\r  #include "psiGonzalo.h"\r#endif\r\r#ifdef PSI_DELTACODES\r  #include "psiDeltaCode.h"\r#endif\r\r\rtypedef struct {\r    uint suffixArraySize;\r  uint T_Psi;\r    uint *D;\r       bitmap bD;\r     uint T_A;\r      uint T_AInv;\r   uint *samplesA;\r        uint samplesASize;\r     uint *BA;\r      bitmap bBA;     \r       uint *samplesAInv;\r     uint samplesAInvSize;\r  uint displayCSAState;\r  int displayCSAPrevPosition;\r    #ifdef PSI_HUFFMANRLE   \r       HuffmanCompressedPsi hcPsi;\r    #endif  \r       #ifdef PSI_GONZALO\r     GonzaloCompressedPsi gcPsi;\r    #endif\r #ifdef PSI_DELTACODES\r  DeltaCompressedPsi dcPsi;\r      #endif\r \r       //only needed during "parse_parameters".\r       uint tempNSHUFF;\r       uint psiSearchFactorJump;  //factor of the T_Psi value.\r} ticsa;        \r\r      \r// FUNCTION PROTOTYPES: BUILDING THE INDEX\r\r//Creates the ICSA \r\r      int buildIntIndex (uint *intVector, uint n, char *build_options, void **index ); //ticsa *createIntegerCSA (uint **aintVector, uint SAsize, char *build_options);\r\r//Returns number of elements in the indexed sequence of integers\r    int sourceLenIntIndex(void *index, uint *numInts);\r\r//Save the index to disk\r   int saveIntIndex(void *index, char *pathname); //void storeStructsCSA(ticsa *myicsa, char *basename);\r\r// Loads the index from disk.\r   int loadIntIndex(char *pathname, void **index);  //ticsa *loadCSA(char *basename);\r\r//  Frees memory    \r       int freeIntIndex(void *index); //uint destroyStructsCSA(ticsa *myicsa);\r\r//Returns the size (in bytes) of the index over the sequence of integers.\r     int sizeIntIndex(void *index, uint *numBytes); //uint CSA_size(ticsa *myicsa);  \r\r      // Shows detailed summary info of the self-index (memory usage of each structure)\rint printInfoIntIndex(void *index, const char tab[]);\r\r//Number of occurrences of the pattern, and the interval [left,right] in the suffix array.\r    int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right);\r                  //uint countCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right);               // Exponential search\r                  //uint countCSABin(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right);    // Binary search\r\r// Returns an array with integers corresponding offsets to the occurrences of the pattern, \r// as well as the number of occurrences\r  int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong *numocc);\r               //uint *locateCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *occ);\r\r//Returns the value of the source (array of integers) at a given offset.\r// (that is, the element "position" from the original array of uints)\r  int displayIntIndex(void *index, ulong position, uint *value);\r         //uint displayCSA(ticsa *myicsa, uint position);        \r\r\r/* Private function prototypes ********************************************/\ruint parametersCSA(ticsa *myicsa, char *build_options);\r\ruint displayCSAFirst(ticsa *myicsa, uint position);\ruint displayCSANext(ticsa *myicsa);\rint  SadCSACompare(ticsa *myicsa, uint *pattern, uint patternSize, uint p);\ruint A(ticsa *myicsa, uint position);\ruint inverseA(ticsa *myicsa, uint offset);\r\rvoid showStructsCSA(ticsa *myicsa);              // For Debugging\r\r
+\ No newline at end of file
diff --git a/swcsa/intIndex/interfaceIntIndex.h b/swcsa/intIndex/interfaceIntIndex.h

new file mode 100644 (file)

index 0000000..5000f18
--- /dev/null
+++ b/swcsa/intIndex/interfaceIntIndex.h
@@ -0,0 +1,35 @@
+
+// FUNCTION PROTOTYPES: SELF-INDEX ON INTEGERS.
+
+int buildIntIndex (uint *intVector, uint n, char *build_options, void **index );
+               
+       //Saves the index to disk
+int saveIntIndex(void *index, char *pathname);
+
+       //Returns number of elements in the indexed sequence of integers
+int sourceLenIntIndex(void *index, uint *numInts);
+
+       //Loads the index from disk.
+int loadIntIndex(char *pathname, void **index);
+       
+       //Frees the memory allocated to the int_index
+int freeIntIndex(void *index);
+
+       //Returns the size (in bytes) of the index over the sequence of integers.
+int sizeIntIndex(void *index, uint *numBytes);
+
+       // Shows detailed summary info of the self-index (memory usage of each structure)
+int printInfoIntIndex(void *index, const char tab[]);
+       
+       //Number of occurrences of the pattern, and the interval [left,right] in the suffix array
+int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right);
+       
+       //returns an array with integers corresponding offsets to the occurrences of the pattern, 
+       //as well as the number of occurrences
+int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong *numocc);
+       
+       //Returns the value of the source (array of integers) at a given offset.
+       // (that is, the element "position" from the original array of uints)
+int displayIntIndex(void *index, ulong position, uint *value);
+
+
diff --git a/swcsa/intIndex/psiDeltaCode.c b/swcsa/intIndex/psiDeltaCode.c

new file mode 100644 (file)

index 0000000..04a4f91
--- /dev/null
+++ b/swcsa/intIndex/psiDeltaCode.c
@@ -0,0 +1,226 @@
+#include "psiDeltaCode.h"
+
+void destroyDeltaCodesCompressedPsi(DeltaCompressedPsi *compressedPsi) {
+       free(compressedPsi->deltaCodes);
+       free(compressedPsi->samples);
+       free(compressedPsi->pointers);
+}
+
+                       
+DeltaCompressedPsi deltaCompressPsi(unsigned int *Psi, unsigned int psiSize, unsigned int T) {
+
+       DeltaCompressedPsi cPsi;
+       
+       int numberOfSamples;
+       register int diff, deltaCodesPos;                                
+       register unsigned int k, p, aux, diffpositive, code, index;
+       unsigned int samplesIndex, codeLenght, currentInput, wordsDeltaCodes, totalSize;
+       unsigned int *deltaCodes;
+       unsigned int *samples;
+       unsigned int *pointers;
+       
+       // Auxiliar para deltaCodes (estimamos como espacio maximo o do array de sufixos)
+       unsigned int *deltaCodesAux;                                     
+                        
+       // Calculamos o mellor valor para negativeGap <= 64
+       unsigned int negativeGap;
+       register unsigned int maxNegativeBits = 0;
+       k = psiSize;
+       while(k) {
+               k >>= 1;
+               maxNegativeBits++;              
+       }
+       if(maxNegativeBits<=26) negativeGap = 64;
+       else negativeGap = 1<<(32-maxNegativeBits);                              
+       
+       // Reservamos espacio para as estructuras
+       numberOfSamples = (psiSize + T - 1) / T;
+       samples = (unsigned int *)malloc(sizeof(int)*numberOfSamples);
+       pointers = (unsigned int *)malloc(sizeof(int)*numberOfSamples);
+       
+       deltaCodesAux = (unsigned int *)malloc(sizeof(int)*psiSize);
+       for(index=0; index<psiSize; index++) deltaCodesAux[index] = 0;           
+       
+       samplesIndex = 0;
+       deltaCodesPos = 0;
+       for(index=0; index<psiSize; index++) {
+
+               if(index % T) {
+                       
+                       diff = Psi[index] - currentInput;
+                       currentInput = Psi[index];
+                       
+                       // Calculamos o codigo correspondente
+                       if(diff>0) diffpositive = (negativeGap*diff-1)/(negativeGap-1);
+                       else diffpositive = -negativeGap*diff;
+       
+                       k = 0;
+                       aux = diffpositive;
+                       while(aux) {
+                               aux >>= 1;
+                               k++;
+                       }
+                       aux = k;
+                       p = 0;
+                       while(aux) {
+                               aux >>= 1;
+                               p++;
+                       }
+                       
+                       code = diffpositive & ((1<<(k-1))-1);
+                       codeLenght = 2*p+k-2;           
+
+                       // Primeiro metemos os p-1 0's iniciais
+                       deltaCodesPos += p-1;
+                       
+                       // Agora metemos os p bits de k
+                       if( ((deltaCodesPos%32) + p) > 32 ) {   
+                               deltaCodesAux[deltaCodesPos/32] |= (k>>((deltaCodesPos%32)+p-32));
+                               deltaCodesAux[deltaCodesPos/32+1] = (k<<(64-(deltaCodesPos%32)-p));     
+                       } else {
+                               deltaCodesAux[deltaCodesPos/32] |= (k<<(32-p-(deltaCodesPos%32)));                              
+                       }
+                       deltaCodesPos += p;
+                       
+                       // Por �ltimo metemos os k-1 bits de code (sen o 1 inicial)
+                       if( ((deltaCodesPos%32) + (k-1)) > 32 ) {       
+                               deltaCodesAux[deltaCodesPos/32] |= (code>>((deltaCodesPos%32)+(k-1)-32));
+                               deltaCodesAux[deltaCodesPos/32+1] = (code<<(64-(deltaCodesPos%32)-(k-1)));      
+                       } else {
+                               deltaCodesAux[deltaCodesPos/32] |= (code<<(32-(k-1)-(deltaCodesPos%32)));                               
+                       }
+                       deltaCodesPos += k-1;
+               
+               } else {
+                       samples[samplesIndex] = Psi[index];
+                       pointers[samplesIndex++] = deltaCodesPos;                       
+                       currentInput = Psi[index]; 
+               }       
+
+       }
+       
+       // Ahora que xa sabemos o espacio necesario para os deltaCodes, reservamolo e liberamos a estructura auxiliar
+       wordsDeltaCodes = (deltaCodesPos+31)/32;
+       deltaCodes = (unsigned int *)malloc(sizeof(int)*wordsDeltaCodes);
+       for(index=0;index<wordsDeltaCodes;index++) deltaCodes[index] = deltaCodesAux[index];
+       free(deltaCodesAux);
+       
+       totalSize = sizeof(int)*wordsDeltaCodes + 2*sizeof(int)*numberOfSamples + 4*sizeof(int);
+       printf("\n\tCompressed Psi size = %d bytes\n", totalSize);
+       
+       // Asignamos os valores a cPsi e devolvemolo
+       cPsi.T = T;
+       cPsi.negativeGap = negativeGap;
+       cPsi.deltaCodesSize = wordsDeltaCodes;
+       cPsi.deltaCodes = deltaCodes;
+       cPsi.numberOfSamples = numberOfSamples;
+       cPsi.samples = samples;
+       cPsi.pointers = pointers;
+       cPsi.totalMem = totalSize;
+
+       return cPsi;
+       
+}
+
+
+int getDeltaPsiValue(DeltaCompressedPsi *cPsi, unsigned int position) {
+       
+       int result;
+       register unsigned int code, aux, pointerAux, mask, pointer, toDecode, p, k;
+       
+       // Collemos a mostra inmediatamente inferior, e o punteiro o array de codigos
+       // pointer = punteiro absoluto sobre deltaCodes
+       result = cPsi->samples[position/cPsi->T];
+       pointer = cPsi->pointers[position/cPsi->T];
+       
+       // Calculamos o numero de codigos a decodificar a partir da mostra
+       toDecode = position % cPsi->T;  
+       
+       while(toDecode--) {
+               
+               // Collemos o n�mero ceros iniciais
+               // Po�emos o inicio do c�digo nun enteiro (code) alineado a esquerda
+               // Non importa que non colla todo o c�digo, pero si temos asegurado que
+               // colle p e k (k<=32 (6bits), p<=5bits)
+               code = (cPsi->deltaCodes[pointer/32] << (pointer%32)) |
+                          ((pointer%32 != 0) * (cPsi->deltaCodes[pointer/32+1] >> (32-(pointer%32))));
+               
+               //Ahora contamos o n�mero de ceros (p) que hai nas posicions da esquerda de code
+               p = 1;
+               while(!(code & 0x80000000)) {
+                       code <<= 1;
+                       p++;
+               }
+               
+               // Ahora calculamos o numero de digitos da representacion binaria do codigo (k)
+               k = code >> (32-p);             
+               
+               // Actualizamos o punteiro global sobre deltaCodes
+               pointer += 2*p-1;
+               
+               // Po�emos a representacion binaria do codigo nun enteiro (code) alineado a esquerda
+               code = (cPsi->deltaCodes[pointer/32] << (pointer%32)) |
+                          ((pointer%32 != 0) * (cPsi->deltaCodes[pointer/32+1] >> (32-(pointer%32))));
+               code = ((code >> 1) | 0x80000000) >> (32-k);
+               pointer += k-1;
+               
+               // Bixecci�n
+               if(code % cPsi->negativeGap) result += (code - (code/cPsi->negativeGap));
+               else result -= code/cPsi->negativeGap;
+                       
+       }
+       
+       return result;
+       
+}
+
+
+void storeDeltaCompressedPsi(DeltaCompressedPsi *compressedPsi, char *filename) {
+
+       int file;
+
+       if( (file = open(filename, O_WRONLY|O_CREAT, 0700)) < 0) {
+               printf("Cannot open file %s\n", filename);
+               exit(0);
+       }
+       write(file, &(compressedPsi->T), sizeof(int));
+       write(file, &(compressedPsi->negativeGap), sizeof(int));
+       write(file, &(compressedPsi->deltaCodesSize), sizeof(int));
+       write(file, compressedPsi->deltaCodes, compressedPsi->deltaCodesSize*sizeof(int));
+       write(file, &(compressedPsi->numberOfSamples), sizeof(int));
+       write(file,     compressedPsi->samples, compressedPsi->numberOfSamples*sizeof(int));
+       write(file,     compressedPsi->pointers, compressedPsi->numberOfSamples*sizeof(int));
+       write(file, &(compressedPsi->totalMem), sizeof(int));
+
+       close(file);
+               
+}
+
+
+DeltaCompressedPsi loadDeltaCompressedPsi(char *filename) {
+       
+       DeltaCompressedPsi compressedPsi;
+
+       int file;
+
+       if( (file = open(filename, O_RDONLY)) < 0) {
+               printf("Cannot read file %s\n", filename);
+               exit(0);
+       }
+       read(file, &(compressedPsi.T), sizeof(int)); 
+       read(file, &(compressedPsi.negativeGap), sizeof(int));
+       read(file, &(compressedPsi.deltaCodesSize), sizeof(int));
+       compressedPsi.deltaCodes = (unsigned int *)malloc(compressedPsi.deltaCodesSize*sizeof(int));
+       read(file, compressedPsi.deltaCodes, compressedPsi.deltaCodesSize*sizeof(int));
+       read(file, &(compressedPsi.numberOfSamples), sizeof(int));      
+       compressedPsi.samples = (unsigned int *)malloc(compressedPsi.numberOfSamples*sizeof(int));
+       compressedPsi.pointers = (unsigned int *)malloc(compressedPsi.numberOfSamples*sizeof(int));
+       read(file, compressedPsi.samples, compressedPsi.numberOfSamples*sizeof(int));
+       read(file, compressedPsi.pointers, compressedPsi.numberOfSamples*sizeof(int));          
+       read(file, &(compressedPsi.totalMem), sizeof(int));
+
+       close(file);
+               
+       return compressedPsi;
+
+}
diff --git a/swcsa/intIndex/psiDeltaCode.h b/swcsa/intIndex/psiDeltaCode.h

new file mode 100644 (file)

index 0000000..1f955c8
--- /dev/null
+++ b/swcsa/intIndex/psiDeltaCode.h
@@ -0,0 +1,248 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <malloc.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+
+
+
+typedef struct {
+       unsigned int T;
+       unsigned int negativeGap;
+       unsigned int deltaCodesSize;    // En palabras
+       unsigned int *deltaCodes;                                       
+       unsigned int numberOfSamples;                           
+       unsigned int *samples;
+       unsigned int *pointers;
+       unsigned int totalMem;                  // the size in bytes used;
+} DeltaCompressedPsi;
+
+
+// PROTOTIPOS DE FUNCI�NS
+DeltaCompressedPsi deltaCompressPsi(unsigned int *Psi, unsigned int psiSize, unsigned int T);
+int getDeltaPsiValue(DeltaCompressedPsi *cPsi, unsigned int position);
+void storeDeltaCompressedPsi(DeltaCompressedPsi *compressedPsi, char *filename);
+DeltaCompressedPsi loadDeltaCompressedPsi(char *filename);             
+void destroyDeltaCodesCompressedPsi(DeltaCompressedPsi *compressedPsi);
+       
+// IMPLEMENTACI�N DAS FUNCI�NS
+
+//void destroyDeltaCodesCompressedPsi(DeltaCompressedPsi *compressedPsi) {
+//     free(compressedPsi->deltaCodes);
+//     free(compressedPsi->samples);
+//     free(compressedPsi->pointers);
+//}
+//
+//                     
+//DeltaCompressedPsi deltaCompressPsi(unsigned int *Psi, unsigned int psiSize, unsigned int T) {
+//
+//     DeltaCompressedPsi cPsi;
+//     
+//     int numberOfSamples;
+//     register int diff, deltaCodesPos;                                
+//     register unsigned int k, p, aux, diffpositive, code, index;
+//     unsigned int samplesIndex, codeLenght, currentInput, wordsDeltaCodes, totalSize;
+//     unsigned int *deltaCodes;
+//     unsigned int *samples;
+//     unsigned int *pointers;
+//     
+//     // Auxiliar para deltaCodes (estimamos como espacio maximo o do array de sufixos)
+//     unsigned int *deltaCodesAux;                                     
+//                      
+//     // Calculamos o mellor valor para negativeGap <= 64
+//     unsigned int negativeGap;
+//     register unsigned int maxNegativeBits = 0;
+//     k = psiSize;
+//     while(k) {
+//             k >>= 1;
+//             maxNegativeBits++;              
+//     }
+//     if(maxNegativeBits<=26) negativeGap = 64;
+//     else negativeGap = 1<<(32-maxNegativeBits);                              
+//     
+//     // Reservamos espacio para as estructuras
+//     numberOfSamples = (psiSize + T - 1) / T;
+//     samples = (unsigned int *)malloc(sizeof(int)*numberOfSamples);
+//     pointers = (unsigned int *)malloc(sizeof(int)*numberOfSamples);
+//     
+//     deltaCodesAux = (unsigned int *)malloc(sizeof(int)*psiSize);
+//     for(index=0; index<psiSize; index++) deltaCodesAux[index] = 0;           
+//     
+//     samplesIndex = 0;
+//     deltaCodesPos = 0;
+//     for(index=0; index<psiSize; index++) {
+//
+//             if(index % T) {
+//                     
+//                     diff = Psi[index] - currentInput;
+//                     currentInput = Psi[index];
+//                     
+//                     // Calculamos o codigo correspondente
+//                     if(diff>0) diffpositive = (negativeGap*diff-1)/(negativeGap-1);
+//                     else diffpositive = -negativeGap*diff;
+//     
+//                     k = 0;
+//                     aux = diffpositive;
+//                     while(aux) {
+//                             aux >>= 1;
+//                             k++;
+//                     }
+//                     aux = k;
+//                     p = 0;
+//                     while(aux) {
+//                             aux >>= 1;
+//                             p++;
+//                     }
+//                     
+//                     code = diffpositive & ((1<<(k-1))-1);
+//                     codeLenght = 2*p+k-2;           
+//
+//                     // Primeiro metemos os p-1 0's iniciais
+//                     deltaCodesPos += p-1;
+//                     
+//                     // Agora metemos os p bits de k
+//                     if( ((deltaCodesPos%32) + p) > 32 ) {   
+//                             deltaCodesAux[deltaCodesPos/32] |= (k>>((deltaCodesPos%32)+p-32));
+//                             deltaCodesAux[deltaCodesPos/32+1] = (k<<(64-(deltaCodesPos%32)-p));     
+//                     } else {
+//                             deltaCodesAux[deltaCodesPos/32] |= (k<<(32-p-(deltaCodesPos%32)));                              
+//                     }
+//                     deltaCodesPos += p;
+//                     
+//                     // Por �ltimo metemos os k-1 bits de code (sen o 1 inicial)
+//                     if( ((deltaCodesPos%32) + (k-1)) > 32 ) {       
+//                             deltaCodesAux[deltaCodesPos/32] |= (code>>((deltaCodesPos%32)+(k-1)-32));
+//                             deltaCodesAux[deltaCodesPos/32+1] = (code<<(64-(deltaCodesPos%32)-(k-1)));      
+//                     } else {
+//                             deltaCodesAux[deltaCodesPos/32] |= (code<<(32-(k-1)-(deltaCodesPos%32)));                               
+//                     }
+//                     deltaCodesPos += k-1;
+//             
+//             } else {
+//                     samples[samplesIndex] = Psi[index];
+//                     pointers[samplesIndex++] = deltaCodesPos;                       
+//                     currentInput = Psi[index]; 
+//             }       
+//
+//     }
+//     
+//     // Ahora que xa sabemos o espacio necesario para os deltaCodes, reservamolo e liberamos a estructura auxiliar
+//     wordsDeltaCodes = (deltaCodesPos+31)/32;
+//     deltaCodes = (unsigned int *)malloc(sizeof(int)*wordsDeltaCodes);
+//     for(index=0;index<wordsDeltaCodes;index++) deltaCodes[index] = deltaCodesAux[index];
+//     free(deltaCodesAux);
+//     
+//     totalSize = sizeof(int)*wordsDeltaCodes + 2*sizeof(int)*numberOfSamples + 4*sizeof(int);
+//     printf("Compressed Psi size = %d bytes\n", totalSize);
+//     
+//     // Asignamos os valores a cPsi e devolvemolo
+//     cPsi.T = T;
+//     cPsi.negativeGap = negativeGap;
+//     cPsi.deltaCodesSize = wordsDeltaCodes;
+//     cPsi.deltaCodes = deltaCodes;
+//     cPsi.numberOfSamples = numberOfSamples;
+//     cPsi.samples = samples;
+//     cPsi.pointers = pointers;
+//     return cPsi;
+//     
+//}
+//
+//
+//int getDeltaPsiValue(DeltaCompressedPsi *cPsi, unsigned int position) {
+//     
+//     int result;
+//     register unsigned int code, aux, pointerAux, mask, pointer, toDecode, p, k;
+//     
+//     // Collemos a mostra inmediatamente inferior, e o punteiro o array de codigos
+//     // pointer = punteiro absoluto sobre deltaCodes
+//     result = cPsi->samples[position/cPsi->T];
+//     pointer = cPsi->pointers[position/cPsi->T];
+//     
+//     // Calculamos o numero de codigos a decodificar a partir da mostra
+//     toDecode = position % cPsi->T;  
+//     
+//     while(toDecode--) {
+//             
+//             // Collemos o n�mero ceros iniciais
+//             // Po�emos o inicio do c�digo nun enteiro (code) alineado a esquerda
+//             // Non importa que non colla todo o c�digo, pero si temos asegurado que
+//             // colle p e k (k<=32 (6bits), p<=5bits)
+//             code = (cPsi->deltaCodes[pointer/32] << (pointer%32)) |
+//                        ((pointer%32 != 0) * (cPsi->deltaCodes[pointer/32+1] >> (32-(pointer%32))));
+//             
+//             //Ahora contamos o n�mero de ceros (p) que hai nas posicions da esquerda de code
+//             p = 1;
+//             while(!(code & 0x80000000)) {
+//                     code <<= 1;
+//                     p++;
+//             }
+//             
+//             // Ahora calculamos o numero de digitos da representacion binaria do codigo (k)
+//             k = code >> (32-p);             
+//             
+//             // Actualizamos o punteiro global sobre deltaCodes
+//             pointer += 2*p-1;
+//             
+//             // Po�emos a representacion binaria do codigo nun enteiro (code) alineado a esquerda
+//             code = (cPsi->deltaCodes[pointer/32] << (pointer%32)) |
+//                        ((pointer%32 != 0) * (cPsi->deltaCodes[pointer/32+1] >> (32-(pointer%32))));
+//             code = ((code >> 1) | 0x80000000) >> (32-k);
+//             pointer += k-1;
+//             
+//             // Bixecci�n
+//             if(code % cPsi->negativeGap) result += (code - (code/cPsi->negativeGap));
+//             else result -= code/cPsi->negativeGap;
+//                     
+//     }
+//     
+//     return result;
+//     
+//}
+//
+//
+//void storeDeltaCompressedPsi(DeltaCompressedPsi *compressedPsi, char *filename) {
+//
+//     int file;
+//
+//     if( (file = open(filename, O_WRONLY|O_CREAT, 0700)) < 0) {
+//             printf("Cannot open file %s\n", filename);
+//             exit(0);
+//     }
+//     write(file, &(compressedPsi->T), sizeof(int));
+//     write(file, &(compressedPsi->negativeGap), sizeof(int));
+//     write(file, &(compressedPsi->deltaCodesSize), sizeof(int));
+//     write(file, compressedPsi->deltaCodes, compressedPsi->deltaCodesSize*sizeof(int));
+//     write(file, &(compressedPsi->numberOfSamples), sizeof(int));
+//     write(file,     compressedPsi->samples, compressedPsi->numberOfSamples*sizeof(int));
+//     write(file,     compressedPsi->pointers, compressedPsi->numberOfSamples*sizeof(int));
+//     close(file);
+//             
+//}
+//
+//
+//DeltaCompressedPsi loadDeltaCompressedPsi(char *filename) {
+//     
+//     DeltaCompressedPsi compressedPsi;
+//
+//     int file;
+//
+//     if( (file = open(filename, O_RDONLY)) < 0) {
+//             printf("Cannot read file %s\n", filename);
+//             exit(0);
+//     }
+//     read(file, &(compressedPsi.T), sizeof(int)); 
+//     read(file, &(compressedPsi.negativeGap), sizeof(int));
+//     read(file, &(compressedPsi.deltaCodesSize), sizeof(int));
+//     compressedPsi.deltaCodes = (unsigned int *)malloc(compressedPsi.deltaCodesSize*sizeof(int));
+//     read(file, compressedPsi.deltaCodes, compressedPsi.deltaCodesSize*sizeof(int));
+//     read(file, &(compressedPsi.numberOfSamples), sizeof(int));      
+//     compressedPsi.samples = (unsigned int *)malloc(compressedPsi.numberOfSamples*sizeof(int));
+//     compressedPsi.pointers = (unsigned int *)malloc(compressedPsi.numberOfSamples*sizeof(int));
+//     read(file, compressedPsi.samples, compressedPsi.numberOfSamples*sizeof(int));
+//     read(file, compressedPsi.pointers, compressedPsi.numberOfSamples*sizeof(int));          
+//     close(file);
+//             
+//     return compressedPsi;
+//
+//}
diff --git a/swcsa/intIndex/psiGonzalo.c b/swcsa/intIndex/psiGonzalo.c

new file mode 100644 (file)

index 0000000..42449a4
--- /dev/null
+++ b/swcsa/intIndex/psiGonzalo.c
@@ -0,0 +1,296 @@
+
+#include "psiGonzalo.h"
+
+       
+// IMPLEMENTACI�N DAS FUNCI�NS
+
+void destroyGonzaloCompressedPsi(GonzaloCompressedPsi *compressedPsi) {
+       
+       //free(compressedPsi->Hlen.s.spos);
+       //free(compressedPsi->Hacc.s.spos);             
+       freeHuff(compressedPsi->Hlen);
+       freeHuff(compressedPsi->Hacc);  
+       free(compressedPsi->cPsi);
+       free(compressedPsi->bposS);
+}
+
+
+GonzaloCompressedPsi gonzaloCompressPsi(uint *Psi, uint psiSize, uint T, uint HUFF) {
+       
+       GonzaloCompressedPsi compressedPsi;
+       
+       register uint i;
+       uint oi,j;
+       int ok,k;
+       register uint _cptr;
+
+       uint *_cPsi;
+       uint *_bposS;
+               
+       uint links = psiSize;
+       uint samplen = T;               
+       uint _bplen;
+       uint pslen;     
+       uint totexc;
+               
+       uint *acc,*lacc;
+       THuff Hacc, Hlen;
+       
+       uint totalSize;
+       
+       // Construe os arboles de huffman, o dos valores directos
+       // e o das lonxitudes dos runs. Usa como vectores auxiliares de frecuencias
+       // a acc e lacc, que finalmente libera.
+       acc = (uint *)malloc (HUFF*sizeof(uint));
+       lacc = (uint *)malloc ((samplen-1)*sizeof(uint));
+       for (k=0;k<HUFF;k++) acc[k]=0;
+       for (k=0;k<samplen-1;k++) lacc[k]=0;
+       
+       ok = 0; 
+       k = Psi[0];
+       for (i=0;i<=links;i++) { 
+               if ((k == 1) && (i % samplen)) { if (ok != 1) oi = i; }
+               else { 
+                       if (ok == 1) { 
+                               acc[1]++;
+                           lacc[i-oi-1]++;
+                       }
+                   if (i % samplen) 
+                               if ((k < 1) || (k >= HUFF)) acc[0]++;
+                       else acc[k]++;
+               }
+           ok = (i % samplen) ? k : 0;
+               k = Psi[i+1]-Psi[i];
+       }
+           
+       if (ok == 1) { 
+               acc[1]++; 
+               lacc[i-oi-1]++;
+       }
+       
+       Hacc = createHuff (acc,HUFF-1, UNSORTED);
+       Hlen = createHuff (lacc,samplen-2, UNSORTED);
+       totexc = acc[0];
+       pslen = bits(psiSize+1);
+       _bplen = bits(Hacc.total+Hlen.total+(1+links/samplen+totexc)*pslen);
+       _bposS = (uint *)malloc ((((1+links/samplen)*_bplen+W-1)/W)*sizeof(uint));
+       _cPsi  = (uint *)malloc (((Hacc.total+Hlen.total+(1+links/samplen+totexc)*pslen+W-1)/W)*sizeof(uint));  
+       
+       _cptr = 0; 
+       ok = 0; 
+       k = Psi[0];
+       
+       for (i=0;i<=links;i++) { 
+               
+               if ((k == 1) && (i % samplen)) { if (ok != 1) oi = i; }
+               else { 
+                       if (ok == 1) { 
+                               _cptr = encodeHuff (Hacc,1,_cPsi,_cptr);
+                           _cptr = encodeHuff(Hlen,i-oi-1,_cPsi,_cptr);
+                       }
+                       if (i % samplen) { 
+                               if ((k > 1) && (k < HUFF)) _cptr = encodeHuff (Hacc,k,_cPsi,_cptr);
+                       else {
+                                       _cptr = encodeHuff (Hacc,0,_cPsi,_cptr);
+                               bitwrite (_cPsi,_cptr,pslen,Psi[i]);
+                                       _cptr += pslen;
+                               }
+                       }
+                       else { 
+                               bitwrite (_bposS,(i/samplen)*_bplen,_bplen,_cptr);
+                           bitwrite (_cPsi,_cptr,pslen,Psi[i]);
+                           _cptr += pslen;
+                       }
+               }
+               ok = (i % samplen) ? k : 0;
+               k = Psi[i+1]-Psi[i];
+       }
+               
+       if (ok == 1) { 
+               _cptr = encodeHuff (Hacc,1,_cPsi,_cptr);
+               _cptr = encodeHuff(Hlen,i-oi-1,_cPsi,_cptr);
+       }
+       
+       // Calculamos o espacio total
+       totalSize = (((1+links/samplen)*_bplen+W-1)/W)*sizeof(uint) +
+               ((Hacc.total+Hlen.total+(1+links/samplen+totexc)*pslen+W-1)/W)*sizeof(uint) +
+               5*sizeof(int) + sizeHuff(Hacc) + sizeHuff(Hlen);
+       printf("\n\tCompressed Psi size = %d bytes\n", totalSize);
+       
+       // Necesario antes de decodificar
+       prepareToDecode(&Hacc);
+       prepareToDecode(&Hlen);
+       
+       // Asignamos os valores e devolvemos psi comprimido
+       compressedPsi.links = psiSize;
+       compressedPsi.totexc = totexc;
+       compressedPsi.cPsi = _cPsi;
+       compressedPsi.samplen = samplen;
+       compressedPsi.bposS = _bposS;
+       compressedPsi.bplen = _bplen;
+       compressedPsi.pslen = pslen;
+       compressedPsi.Hacc = Hacc;
+       compressedPsi.Hlen = Hlen;
+       compressedPsi.totalMem = totalSize;
+       
+       free(acc); 
+       free(lacc);
+       
+       return compressedPsi;   
+}
+
+
+
+int getGonzaloPsiValue(GonzaloCompressedPsi *compressedPsi, unsigned int position) {
+
+       uint *cPsi = compressedPsi->cPsi;
+       uint samplen = compressedPsi->samplen;
+       uint *bposS = compressedPsi->bposS;
+       uint bplen = compressedPsi->bplen;
+       uint pslen = compressedPsi->pslen;
+       THuff *Hacc = &compressedPsi->Hacc;
+       THuff *Hlen = &compressedPsi->Hlen;
+       
+       uint sampj,cptr,val,dval,rlen,head,hlen;
+       
+       sampj = (position/samplen)*samplen;
+       cptr = bitread(bposS,(sampj/samplen)*bplen,bplen);
+       head = cptr;
+       val = bitread(cPsi,head,pslen);
+       head += pslen;
+                       
+       while (sampj < position) {
+               
+               head = decodeHuff(Hacc,&dval,cPsi,head);
+               
+               if (dval == 0) { 
+                       
+                       val = bitread(cPsi,head,pslen);
+                       head += pslen;
+                       sampj++;
+               }
+               else 
+                       if (dval == 1) {
+                               head = decodeHuff(Hlen,&rlen,cPsi,head);
+                               rlen++;
+                       if (sampj + rlen >= position) return val + (position-sampj);
+                       val += rlen;
+                       sampj += rlen;
+                       }
+                       else { 
+                               val += dval;
+                       sampj++;
+                       }
+                       
+       }
+           
+       return val;
+       
+}
+
+
+void storeGonzaloCompressedPsi(GonzaloCompressedPsi *compressedPsi, char *filename) {
+       
+       int file;
+       THuff Hacc;
+       THuff Hlen;
+       
+       if( (file = open(filename, O_WRONLY|O_CREAT, 0700)) < 0) {
+               printf("Cannot open file %s\n", filename);
+               exit(0);
+       }
+               
+       // Copias locales dos arboles de HUFFMAN
+       Hacc = compressedPsi->Hacc;     
+       Hlen = compressedPsi->Hlen;
+       
+       write(file, &(compressedPsi->links), sizeof(int));
+       write(file, &(compressedPsi->totexc), sizeof(int));     
+       write(file, &(compressedPsi->samplen), sizeof(int));    
+       write(file, &(compressedPsi->bplen), sizeof(int));      
+       write(file, &(compressedPsi->pslen), sizeof(int));
+       // Almacenar o arbol de huffman principal
+       write(file, &Hacc.max, sizeof(int));
+       write(file, &Hacc.lim, sizeof(int));
+       write(file, &Hacc.depth, sizeof(int));
+//     write(file, Hacc.s.spos, (Hacc.lim+1)*sizeof(int));
+       write(file, Hacc.s.symb, (Hacc.lim+1)*sizeof(int));     
+       write(file, Hacc.num, (Hacc.depth+1)*sizeof(int));
+       write(file, Hacc.fst, (Hacc.depth+1)*sizeof(int));
+       // Fin de almacenar o arbol de huffman principal
+       // Almacenar o arbol de huffman das lonxitudes dos runs
+       write(file, &Hlen.max, sizeof(int));
+       write(file, &Hlen.lim, sizeof(int));
+       write(file, &Hlen.depth, sizeof(int));
+//     write(file, Hlen.s.spos, (Hlen.lim+1)*sizeof(int));
+       write(file, Hlen.s.symb, (Hlen.lim+1)*sizeof(int));     
+       write(file, Hlen.num, (Hlen.depth+1)*sizeof(int));
+       write(file, Hlen.fst, (Hlen.depth+1)*sizeof(int));
+       // Fin de almacenar o arbol de huffman das lonxitudes dos runs
+       write(file,     compressedPsi->bposS, ((compressedPsi->bplen*(1+compressedPsi->links/compressedPsi->samplen)+W-1)/W)*sizeof(uint));             
+       write(file,     compressedPsi->cPsi, ((Hacc.total+Hlen.total+(1+compressedPsi->links/compressedPsi->samplen+compressedPsi->totexc)*compressedPsi->pslen+W-1)/W)*sizeof(int));
+
+       write(file, &(compressedPsi->totalMem), sizeof(int));
+       
+       close(file);
+       
+}
+
+
+GonzaloCompressedPsi loadGonzaloCompressedPsi(char *filename) {
+       
+       GonzaloCompressedPsi compressedPsi;
+
+       THuff Hacc;
+       THuff Hlen;
+     
+       int file;
+       
+       if( (file = open(filename, O_RDONLY)) < 0) {
+               printf("Cannot read file %s\n", filename);
+               exit(0);
+       }
+       read(file, &(compressedPsi.links), sizeof(int));
+       read(file, &(compressedPsi.totexc), sizeof(int));
+       read(file, &(compressedPsi.samplen), sizeof(int));
+       read(file, &(compressedPsi.bplen), sizeof(int));
+       read(file, &(compressedPsi.pslen), sizeof(int));
+       // Cargamos o arbol de Huffman principal
+       read(file, &Hacc.max, sizeof(int));
+       read(file, &Hacc.lim, sizeof(int));
+       read(file, &Hacc.depth, sizeof(int));
+       //Hacc.s.spos = (unsigned int *) malloc((Hacc.lim+1)*sizeof(int));
+       Hacc.s.symb = (unsigned int *) malloc((Hacc.lim+1)*sizeof(int));
+       Hacc.num = (unsigned int *) malloc((Hacc.depth+1)*sizeof(int)); 
+       Hacc.fst = (unsigned int *) malloc((Hacc.depth+1)*sizeof(int)); 
+       //read(file, Hacc.s.spos, (Hacc.lim+1)*sizeof(int));
+       read(file, Hacc.s.symb, (Hacc.lim+1)*sizeof(int));      
+       read(file, Hacc.num, (Hacc.depth+1)*sizeof(int));
+       read(file, Hacc.fst, (Hacc.depth+1)*sizeof(int));       
+       compressedPsi.Hacc = Hacc;
+       // Fin da carga do arbol de Huffman     principal
+       // Cargamos o arbol de Huffman coas lonxitudes dos runs
+       read(file, &Hlen.max, sizeof(int));
+       read(file, &Hlen.lim, sizeof(int));
+       read(file, &Hlen.depth, sizeof(int));
+       //Hlen.s.spos = (unsigned int *) malloc((Hlen.lim+1)*sizeof(int));
+       Hlen.s.symb = (unsigned int *) malloc((Hlen.lim+1)*sizeof(int));
+       Hlen.num = (unsigned int *) malloc((Hlen.depth+1)*sizeof(int)); 
+       Hlen.fst = (unsigned int *) malloc((Hlen.depth+1)*sizeof(int)); 
+       //read(file, Hlen.s.spos, (Hlen.lim+1)*sizeof(int));
+       read(file, Hlen.s.symb, (Hlen.lim+1)*sizeof(int));      
+       read(file, Hlen.num, (Hlen.depth+1)*sizeof(int));
+       read(file, Hlen.fst, (Hlen.depth+1)*sizeof(int));       
+       compressedPsi.Hlen = Hlen;
+       // Fin da carga do arbol de Huffman     coas lonxitudes dos runs
+       compressedPsi.bposS = (uint *) malloc (((compressedPsi.bplen*(1+compressedPsi.links/compressedPsi.samplen)+W-1)/W)*sizeof(uint));
+       read(file, compressedPsi.bposS, ((compressedPsi.bplen*(1+compressedPsi.links/compressedPsi.samplen)+W-1)/W)*sizeof(uint));      
+       compressedPsi.cPsi = (uint *) malloc (((Hacc.total+Hlen.total+(1+compressedPsi.links/compressedPsi.samplen+compressedPsi.totexc)*compressedPsi.pslen+W-1)/W)*sizeof(uint));
+       read(file, compressedPsi.cPsi, ((Hacc.total+Hlen.total+(1+compressedPsi.links/compressedPsi.samplen+compressedPsi.totexc)*compressedPsi.pslen+W-1)/W)*sizeof(uint));
+
+       read(file, &(compressedPsi.totalMem), sizeof(int));     
+       close(file);
+       
+       return compressedPsi;
+       
+}
diff --git a/swcsa/intIndex/psiGonzalo.h b/swcsa/intIndex/psiGonzalo.h

new file mode 100644 (file)

index 0000000..95693bd
--- /dev/null
+++ b/swcsa/intIndex/psiGonzalo.h
@@ -0,0 +1,317 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <malloc.h>
+#include "../utils/huff.h"
+
+// ESTRUCTURA QUE REPRESENTA A FUNCION PSI COMPRIMIDA
+typedef struct {
+       uint links;
+       uint totexc;
+       uint samplen;
+       uint bplen;
+       uint pslen;
+       uint *cPsi;  
+       uint *bposS;
+       THuff Hacc;
+       THuff Hlen;
+       unsigned int totalMem;                  // the size in bytes used;
+} GonzaloCompressedPsi;
+
+
+// PROTOTIPOS DE FUNCIONS
+GonzaloCompressedPsi gonzaloCompressPsi(uint *Psi, uint psiSize, uint T, uint HUFF);
+int getGonzaloPsiValue(GonzaloCompressedPsi *compressedPsi, unsigned int position);
+void storeGonzaloCompressedPsi(GonzaloCompressedPsi *compressedPsi, char *filename);
+GonzaloCompressedPsi loadGonzaloCompressedPsi(char *filename);
+//frees the memory used.       
+void destroyGonzaloCompressedPsi(GonzaloCompressedPsi *compressedPsi);
+
+       
+//// IMPLEMENTACI�N DAS FUNCI�NS
+//
+//void destroyGonzaloCompressedPsi(GonzaloCompressedPsi *compressedPsi) {
+//     
+//     //free(compressedPsi->Hlen.s.spos);
+//     //free(compressedPsi->Hacc.s.spos);             
+//     freeHuff(compressedPsi->Hlen);
+//     freeHuff(compressedPsi->Hacc);  
+//     free(compressedPsi->cPsi);
+//     free(compressedPsi->bposS);
+//}
+//
+//
+//GonzaloCompressedPsi gonzaloCompressPsi(uint *Psi, uint psiSize, uint T, uint HUFF) {
+//     
+//     GonzaloCompressedPsi compressedPsi;
+//     
+//     register uint i;
+//     uint oi,j;
+//     int ok,k;
+//     register uint _cptr;
+//
+//     uint *_cPsi;
+//     uint *_bposS;
+//             
+//     uint links = psiSize;
+//     uint samplen = T;               
+//     uint _bplen;
+//     uint pslen;     
+//     uint totexc;
+//             
+//     uint *acc,*lacc;
+//     THuff Hacc, Hlen;
+//     
+//     uint totalSize;
+//     
+//     // Construe os arboles de huffman, o dos valores directos
+//     // e o das lonxitudes dos runs. Usa como vectores auxiliares de frecuencias
+//     // a acc e lacc, que finalmente libera.
+//     acc = (uint *)malloc (HUFF*sizeof(uint));
+//     lacc = (uint *)malloc ((samplen-1)*sizeof(uint));
+//     for (k=0;k<HUFF;k++) acc[k]=0;
+//     for (k=0;k<samplen-1;k++) lacc[k]=0;
+//     
+//     ok = 0; 
+//     k = Psi[0];
+//     for (i=0;i<=links;i++) { 
+//             if ((k == 1) && (i % samplen)) { if (ok != 1) oi = i; }
+//             else { 
+//                     if (ok == 1) { 
+//                             acc[1]++;
+//                         lacc[i-oi-1]++;
+//                     }
+//                 if (i % samplen) 
+//                             if ((k < 1) || (k >= HUFF)) acc[0]++;
+//                     else acc[k]++;
+//             }
+//         ok = (i % samplen) ? k : 0;
+//             k = Psi[i+1]-Psi[i];
+//     }
+//         
+//     if (ok == 1) { 
+//             acc[1]++; 
+//             lacc[i-oi-1]++;
+//     }
+//     
+//     Hacc = createHuff (acc,HUFF-1, UNSORTED);
+//     Hlen = createHuff (lacc,samplen-2, UNSORTED);
+//     totexc = acc[0];
+//     pslen = bits(psiSize+1);
+//     _bplen = bits(Hacc.total+Hlen.total+(1+links/samplen+totexc)*pslen);
+//     _bposS = (uint *)malloc ((((1+links/samplen)*_bplen+W-1)/W)*sizeof(uint));
+//     _cPsi  = (uint *)malloc (((Hacc.total+Hlen.total+(1+links/samplen+totexc)*pslen+W-1)/W)*sizeof(uint));  
+//     
+//     _cptr = 0; 
+//     ok = 0; 
+//     k = Psi[0];
+//     
+//     for (i=0;i<=links;i++) { 
+//             
+//             if ((k == 1) && (i % samplen)) { if (ok != 1) oi = i; }
+//             else { 
+//                     if (ok == 1) { 
+//                             _cptr = encodeHuff (Hacc,1,_cPsi,_cptr);
+//                         _cptr = encodeHuff(Hlen,i-oi-1,_cPsi,_cptr);
+//                     }
+//                     if (i % samplen) { 
+//                             if ((k > 1) && (k < HUFF)) _cptr = encodeHuff (Hacc,k,_cPsi,_cptr);
+//                     else {
+//                                     _cptr = encodeHuff (Hacc,0,_cPsi,_cptr);
+//                             bitwrite (_cPsi,_cptr,pslen,Psi[i]);
+//                                     _cptr += pslen;
+//                             }
+//                     }
+//                     else { 
+//                             bitwrite (_bposS,(i/samplen)*_bplen,_bplen,_cptr);
+//                         bitwrite (_cPsi,_cptr,pslen,Psi[i]);
+//                         _cptr += pslen;
+//                     }
+//             }
+//             ok = (i % samplen) ? k : 0;
+//             k = Psi[i+1]-Psi[i];
+//     }
+//             
+//     if (ok == 1) { 
+//             _cptr = encodeHuff (Hacc,1,_cPsi,_cptr);
+//             _cptr = encodeHuff(Hlen,i-oi-1,_cPsi,_cptr);
+//     }
+//     
+//     // Calculamos o espacio total
+//     totalSize = (((1+links/samplen)*_bplen+W-1)/W)*sizeof(uint) +
+//             ((Hacc.total+Hlen.total+(1+links/samplen+totexc)*pslen+W-1)/W)*sizeof(uint) +
+//             5*sizeof(int) + sizeHuff(Hacc) + sizeHuff(Hlen);
+//     printf("Compressed Psi size = %d bytes\n", totalSize);
+//     
+//     // Necesario antes de decodificar
+//     prepareToDecode(&Hacc);
+//     prepareToDecode(&Hlen);
+//     
+//     // Asignamos os valores e devolvemos psi comprimido
+//     compressedPsi.links = psiSize;
+//     compressedPsi.totexc = totexc;
+//     compressedPsi.cPsi = _cPsi;
+//     compressedPsi.samplen = samplen;
+//     compressedPsi.bposS = _bposS;
+//     compressedPsi.bplen = _bplen;
+//     compressedPsi.pslen = pslen;
+//     compressedPsi.Hacc = Hacc;
+//     compressedPsi.Hlen = Hlen;
+//     
+//     free(acc); 
+//     free(lacc);
+//     
+//     return compressedPsi;   
+//}
+//
+//
+//
+//int getGonzaloPsiValue(GonzaloCompressedPsi *compressedPsi, unsigned int position) {
+//
+//     uint *cPsi = compressedPsi->cPsi;
+//     uint samplen = compressedPsi->samplen;
+//     uint *bposS = compressedPsi->bposS;
+//     uint bplen = compressedPsi->bplen;
+//     uint pslen = compressedPsi->pslen;
+//     THuff *Hacc = &compressedPsi->Hacc;
+//     THuff *Hlen = &compressedPsi->Hlen;
+//     
+//     uint sampj,cptr,val,dval,rlen,head,hlen;
+//     
+//     sampj = (position/samplen)*samplen;
+//     cptr = bitread(bposS,(sampj/samplen)*bplen,bplen);
+//     head = cptr;
+//     val = bitread(cPsi,head,pslen);
+//     head += pslen;
+//                     
+//     while (sampj < position) {
+//             
+//             head = decodeHuff(Hacc,&dval,cPsi,head);
+//             
+//             if (dval == 0) { 
+//                     
+//                     val = bitread(cPsi,head,pslen);
+//                     head += pslen;
+//                     sampj++;
+//             }
+//             else 
+//                     if (dval == 1) {
+//                             head = decodeHuff(Hlen,&rlen,cPsi,head);
+//                             rlen++;
+//                     if (sampj + rlen >= position) return val + (position-sampj);
+//                     val += rlen;
+//                     sampj += rlen;
+//                     }
+//                     else { 
+//                             val += dval;
+//                     sampj++;
+//                     }
+//                     
+//     }
+//         
+//     return val;
+//     
+//}
+//
+//
+//void storeGonzaloCompressedPsi(GonzaloCompressedPsi *compressedPsi, char *filename) {
+//     
+//     int file;
+//     THuff Hacc;
+//     THuff Hlen;
+//     
+//     if( (file = open(filename, O_WRONLY|O_CREAT, 0700)) < 0) {
+//             printf("Cannot open file %s\n", filename);
+//             exit(0);
+//     }
+//             
+//     // Copias locales dos arboles de HUFFMAN
+//     Hacc = compressedPsi->Hacc;     
+//     Hlen = compressedPsi->Hlen;
+//     
+//     write(file, &(compressedPsi->links), sizeof(int));
+//     write(file, &(compressedPsi->totexc), sizeof(int));     
+//     write(file, &(compressedPsi->samplen), sizeof(int));    
+//     write(file, &(compressedPsi->bplen), sizeof(int));      
+//     write(file, &(compressedPsi->pslen), sizeof(int));
+//     // Almacenar o arbol de huffman principal
+//     write(file, &Hacc.max, sizeof(int));
+//     write(file, &Hacc.lim, sizeof(int));
+//     write(file, &Hacc.depth, sizeof(int));
+////   write(file, Hacc.s.spos, (Hacc.lim+1)*sizeof(int));
+//     write(file, Hacc.s.symb, (Hacc.lim+1)*sizeof(int));     
+//     write(file, Hacc.num, (Hacc.depth+1)*sizeof(int));
+//     write(file, Hacc.fst, (Hacc.depth+1)*sizeof(int));
+//     // Fin de almacenar o arbol de huffman principal
+//     // Almacenar o arbol de huffman das lonxitudes dos runs
+//     write(file, &Hlen.max, sizeof(int));
+//     write(file, &Hlen.lim, sizeof(int));
+//     write(file, &Hlen.depth, sizeof(int));
+////   write(file, Hlen.s.spos, (Hlen.lim+1)*sizeof(int));
+//     write(file, Hlen.s.symb, (Hlen.lim+1)*sizeof(int));     
+//     write(file, Hlen.num, (Hlen.depth+1)*sizeof(int));
+//     write(file, Hlen.fst, (Hlen.depth+1)*sizeof(int));
+//     // Fin de almacenar o arbol de huffman das lonxitudes dos runs
+//     write(file,     compressedPsi->bposS, ((compressedPsi->bplen*(1+compressedPsi->links/compressedPsi->samplen)+W-1)/W)*sizeof(uint));             
+//     write(file,     compressedPsi->cPsi, ((Hacc.total+Hlen.total+(1+compressedPsi->links/compressedPsi->samplen+compressedPsi->totexc)*compressedPsi->pslen+W-1)/W)*sizeof(int));
+//     
+//     close(file);
+//     
+//}
+//
+//
+//GonzaloCompressedPsi loadGonzaloCompressedPsi(char *filename) {
+//     
+//     GonzaloCompressedPsi compressedPsi;
+//
+//     THuff Hacc;
+//     THuff Hlen;
+//     
+//     int file;
+//     
+//     if( (file = open(filename, O_RDONLY)) < 0) {
+//             printf("Cannot read file %s\n", filename);
+//             exit(0);
+//     }
+//     read(file, &(compressedPsi.links), sizeof(int));
+//     read(file, &(compressedPsi.totexc), sizeof(int));
+//     read(file, &(compressedPsi.samplen), sizeof(int));
+//     read(file, &(compressedPsi.bplen), sizeof(int));
+//     read(file, &(compressedPsi.pslen), sizeof(int));
+//     // Cargamos o arbol de Huffman principal
+//     read(file, &Hacc.max, sizeof(int));
+//     read(file, &Hacc.lim, sizeof(int));
+//     read(file, &Hacc.depth, sizeof(int));
+//     //Hacc.s.spos = (unsigned int *) malloc((Hacc.lim+1)*sizeof(int));
+//     Hacc.s.symb = (unsigned int *) malloc((Hacc.lim+1)*sizeof(int));
+//     Hacc.num = (unsigned int *) malloc((Hacc.depth+1)*sizeof(int)); 
+//     Hacc.fst = (unsigned int *) malloc((Hacc.depth+1)*sizeof(int)); 
+//     //read(file, Hacc.s.spos, (Hacc.lim+1)*sizeof(int));
+//     read(file, Hacc.s.symb, (Hacc.lim+1)*sizeof(int));      
+//     read(file, Hacc.num, (Hacc.depth+1)*sizeof(int));
+//     read(file, Hacc.fst, (Hacc.depth+1)*sizeof(int));       
+//     compressedPsi.Hacc = Hacc;
+//     // Fin da carga do arbol de Huffman     principal
+//     // Cargamos o arbol de Huffman coas lonxitudes dos runs
+//     read(file, &Hlen.max, sizeof(int));
+//     read(file, &Hlen.lim, sizeof(int));
+//     read(file, &Hlen.depth, sizeof(int));
+//     //Hlen.s.spos = (unsigned int *) malloc((Hlen.lim+1)*sizeof(int));
+//     Hlen.s.symb = (unsigned int *) malloc((Hlen.lim+1)*sizeof(int));
+//     Hlen.num = (unsigned int *) malloc((Hlen.depth+1)*sizeof(int)); 
+//     Hlen.fst = (unsigned int *) malloc((Hlen.depth+1)*sizeof(int)); 
+//     //read(file, Hlen.s.spos, (Hlen.lim+1)*sizeof(int));
+//     read(file, Hlen.s.symb, (Hlen.lim+1)*sizeof(int));      
+//     read(file, Hlen.num, (Hlen.depth+1)*sizeof(int));
+//     read(file, Hlen.fst, (Hlen.depth+1)*sizeof(int));       
+//     compressedPsi.Hlen = Hlen;
+//     // Fin da carga do arbol de Huffman     coas lonxitudes dos runs
+//     compressedPsi.bposS = (uint *) malloc (((compressedPsi.bplen*(1+compressedPsi.links/compressedPsi.samplen)+W-1)/W)*sizeof(uint));
+//     read(file, compressedPsi.bposS, ((compressedPsi.bplen*(1+compressedPsi.links/compressedPsi.samplen)+W-1)/W)*sizeof(uint));      
+//     compressedPsi.cPsi = (uint *) malloc (((Hacc.total+Hlen.total+(1+compressedPsi.links/compressedPsi.samplen+compressedPsi.totexc)*compressedPsi.pslen+W-1)/W)*sizeof(uint));
+//     read(file, compressedPsi.cPsi, ((Hacc.total+Hlen.total+(1+compressedPsi.links/compressedPsi.samplen+compressedPsi.totexc)*compressedPsi.pslen+W-1)/W)*sizeof(uint));
+//     
+//     close(file);
+//     
+//     return compressedPsi;
+//     
+//}
diff --git a/swcsa/intIndex/psiHuffmanRLE.c b/swcsa/intIndex/psiHuffmanRLE.c

new file mode 100644 (file)

index 0000000..5e36869
--- /dev/null
+++ b/swcsa/intIndex/psiHuffmanRLE.c
@@ -0,0 +1,335 @@
+#include "psiHuffmanRLE.h"
+
+// IMPLEMENTACION DAS FUNCIONS
+
+void destroyHuffmanCompressedPsi(HuffmanCompressedPsi *compressedPsi) {
+       freeHuff(compressedPsi->diffsHT);
+       free(compressedPsi->samples);
+       free(compressedPsi->samplePointers);
+       free (compressedPsi->stream);
+}
+
+HuffmanCompressedPsi huffmanCompressPsi(unsigned int *Psi, unsigned int psiSize, unsigned int T, unsigned int nS) {
+       
+       HuffmanCompressedPsi cPsi;
+       
+       int absolute_value;
+       register unsigned int index, ptr, samplesPtr, samplePointersPtr;
+       unsigned int runLenght, binaryLenght;
+       
+       int *diffs;     
+       unsigned int *huffmanDst;
+       
+       // Estructuras da funcion comprimida (para logo asignar)
+       // Tam�n se podian almacenar directamente
+       THuff diffsHT;
+       unsigned int numberOfSamples;
+       unsigned int *samples;
+       unsigned int sampleSize;
+       unsigned int *samplePointers;
+       unsigned int pointerSize;
+       unsigned int *stream;
+       unsigned int streamSize;
+       
+       // Variables que marcan os intervalos dentro do vector de frecuencias
+       unsigned int runLenghtStart = nS - 64 - T;      // Inicio das Runs
+       unsigned int negStart = nS - 64;                        // Inicio dos Negativos
+       unsigned int bigStart = nS - 32;                        // Inicio dos Grandes (>runLenghtStart)
+       
+       // Para estadistica
+       unsigned int totalSize;
+       
+       // Reservamos espacio para a distribuci�n de valores de Psi
+       huffmanDst = (unsigned int *)malloc(sizeof(int)*nS);
+       for(index=0;index<nS;index++) huffmanDst[index]=0;
+       
+       // Inicializamos diferencias    
+       diffs = (int *)malloc(sizeof(int)*psiSize);     
+       diffs[0] = 0;
+       for(index=1; index<psiSize; index++) 
+               diffs[index] = Psi[index] - Psi[index-1];
+       
+       // Calculamos a distribucion de frecuencias
+       runLenght = 0;
+       for(index=0; index<psiSize; index++) {
+               
+               if(index%T) {
+                       
+                       if(diffs[index]==1) {
+                               runLenght++;
+                       } else {        // Non estamos nun run
+                               if(runLenght) {
+                                       huffmanDst[runLenght+runLenghtStart]++;
+                                       runLenght = 0;
+                               }
+                               if(diffs[index]>1 && diffs[index]<runLenghtStart) 
+                                       huffmanDst[diffs[index]]++;
+                               else
+                                       if(diffs[index]<0) {    // Valor negativo
+                                               absolute_value = -diffs[index];
+                                               binaryLenght = bits(absolute_value);
+                                               huffmanDst[binaryLenght+negStart-1]++;
+                                       } else {                                // Valor grande >= 128
+                                               absolute_value = diffs[index];
+                                               binaryLenght = bits(absolute_value);
+                                               huffmanDst[binaryLenght+bigStart-1]++;
+                                       }
+                       }
+                       
+               } else { // Rompemos o run porque atopamos unha mostra
+                       if(runLenght) {
+                               huffmanDst[runLenght+runLenghtStart]++;
+                               runLenght = 0;
+                       }
+               }
+               
+       }
+       
+       if(runLenght) huffmanDst[runLenght+runLenghtStart]++;
+       
+       // Creamos o arbol de Huffman
+       diffsHT = createHuff(huffmanDst,nS-1,UNSORTED);
+       
+       // Calculamos o espacio total ocupado pola secuencia Huffman + RLE
+       streamSize = diffsHT.total;
+       for(index=negStart;index<bigStart;index++) 
+               streamSize += huffmanDst[index]*(index-negStart+1);     // Negativos
+       for(index=bigStart;index<nS;index++) 
+               streamSize += huffmanDst[index]*(index-bigStart+1);             // Grandes      
+       
+       // Calculamos o numero de mostras e o espacio ocupado por cada mostra e por cada punteiro
+       numberOfSamples = (psiSize+T-1)/T;      
+       sampleSize = bits(psiSize);
+       pointerSize = bits(streamSize); 
+
+       // Reservamos espacio para a secuencia e para as mostras e punteiros
+       samples = (unsigned int *)malloc(sizeof(uint)*((numberOfSamples*sampleSize+31)/32));    
+               samples[((numberOfSamples*sampleSize+31)/32)-1] =0000; //initialized only to avoid valgrind warnings
+       samplePointers = (unsigned int *)malloc(sizeof(int)*((numberOfSamples*pointerSize+31)/32));
+               samplePointers[((numberOfSamples*pointerSize+31)/32)-1] = 0000;  //initialized only to avoid valgrind warnings
+       stream = (unsigned int *)malloc(sizeof(int)*((streamSize+31)/32));
+               stream[((streamSize+31)/32)-1]=0000;//initialized only to avoid valgrind warnings
+       
+       // Comprimimos secuencialmente (haber� que levar un punteiro desde o inicio)
+       ptr = 0;
+       samplesPtr = 0;
+       samplePointersPtr = 0;
+       runLenght = 0;
+       for(index=0; index<psiSize; index++) {
+               
+               if(index%T) {
+                       
+                       if(diffs[index]==1) {
+                               runLenght++;
+                       } else {        // Non estamos nun run
+                               if(runLenght) {
+                                       ptr = encodeHuff(diffsHT,runLenght+runLenghtStart,stream,ptr);
+                                       runLenght = 0;
+                               }
+                               if(diffs[index]>1 && diffs[index]<runLenghtStart) {                             
+                                       ptr = encodeHuff(diffsHT,diffs[index],stream,ptr);      
+                               }       
+                               else
+                                       if(diffs[index]<0) {    // Valor negativo
+                                               absolute_value = -diffs[index];
+                                               binaryLenght = bits(absolute_value);
+                                               ptr = encodeHuff(diffsHT,binaryLenght+negStart-1,stream,ptr);
+                                               bitwrite(stream,ptr,binaryLenght,absolute_value);
+                                               ptr += binaryLenght;                                            
+                                       } else {                                // Valor grande >= 128
+                                               absolute_value = diffs[index];
+                                               binaryLenght = bits(absolute_value);                                    
+                                               ptr = encodeHuff(diffsHT,binaryLenght+bigStart-1,stream,ptr);
+                                               bitwrite(stream,ptr,binaryLenght,absolute_value);                                               
+                                               ptr += binaryLenght;
+                                       }
+                       }
+                       
+               } else { // Rompemos o run porque atopamos unha mostra
+                       if(runLenght) {                         
+                               ptr = encodeHuff(diffsHT,runLenght+runLenghtStart,stream,ptr);
+                               runLenght = 0;
+                       }
+                       bitwrite(samples,samplesPtr,sampleSize,Psi[index]);
+                       samplesPtr += sampleSize;
+                       bitwrite(samplePointers,samplePointersPtr,pointerSize,ptr);
+                       samplePointersPtr += pointerSize;
+               }
+               
+       }
+       
+       if(runLenght) { 
+               ptr = encodeHuff(diffsHT,runLenght+runLenghtStart,stream,ptr);
+       }
+       
+       // Amosamos o espacio ocupado
+       totalSize = sizeof(HuffmanCompressedPsi) + 
+               sizeof(int)*((numberOfSamples*sampleSize+31)/32) + 
+               sizeof(int)*((numberOfSamples*pointerSize+31)/32) +
+               sizeof(int)*((streamSize+31)/32) + sizeHuff(diffsHT);
+
+       printf("\n\t Compressed Psi size = %d bytes, with %d different symbols.", totalSize, nS);
+       
+       // Necesario antes de decodificar
+       prepareToDecode(&diffsHT);
+       
+       // Asignamos os valores a cPsi e devolvemolo
+       cPsi.T = T;
+       cPsi.diffsHT = diffsHT;
+       cPsi.nS = nS;
+       cPsi.numberOfSamples = numberOfSamples;
+       cPsi.samples = samples;
+       cPsi.sampleSize = sampleSize;
+       cPsi.samplePointers = samplePointers;
+       cPsi.pointerSize = pointerSize;
+       cPsi.stream = stream;
+       cPsi.streamSize = streamSize;
+       cPsi.totalMem = totalSize;
+       
+       //frees resources not needed in advance
+       free(diffs);
+       free(huffmanDst);
+       
+       //returns the data structure that holds the compressed psi.
+       return cPsi;    
+}
+
+
+unsigned int getHuffmanPsiValue(HuffmanCompressedPsi *cPsi, unsigned int position) {
+       
+       register unsigned int index;
+       unsigned int sampleIndex, ptr, psiValue, huffmanCode, positionsSinceSample; 
+       unsigned int absolute_value, binaryLenght, runLenght;
+       
+       unsigned int runLenghtStart = cPsi->nS - 64 - cPsi->T;
+       unsigned int negStart = cPsi->nS - 64;
+       unsigned int bigStart = cPsi->nS - 32;  
+       
+       sampleIndex = position / cPsi->T;
+       psiValue = bitread(cPsi->samples,sampleIndex*cPsi->sampleSize,cPsi->sampleSize);
+       ptr = bitread(cPsi->samplePointers,sampleIndex*cPsi->pointerSize,cPsi->pointerSize);
+       
+       positionsSinceSample = position%cPsi->T;
+       
+       for(index=0;index<positionsSinceSample;index++) {
+               
+               ptr = decodeHuff(&cPsi->diffsHT,&huffmanCode,cPsi->stream,ptr);
+               
+               if(huffmanCode < runLenghtStart) {      // Incremento directo
+                       psiValue += huffmanCode;
+               }       
+               else 
+                       if(huffmanCode < negStart) {    // Estamos nun run
+                               runLenght = huffmanCode - runLenghtStart;
+                               if(index+runLenght>=positionsSinceSample)
+                                       return psiValue+positionsSinceSample-index;
+                               else {
+                                       psiValue += runLenght;
+                                       index += runLenght-1;
+                               }
+                       }
+                       else
+                               if(huffmanCode < bigStart) {    // Negativo
+                                       binaryLenght = huffmanCode-negStart+1;
+                                       absolute_value = bitread(cPsi->stream,ptr,binaryLenght);
+                                       ptr += binaryLenght;
+                                       psiValue -= absolute_value;     
+                               }
+                               else {  // Grande
+                                       binaryLenght = huffmanCode-bigStart+1;
+                                       absolute_value = bitread(cPsi->stream,ptr,binaryLenght);
+                                       ptr += binaryLenght;
+                                       psiValue += absolute_value;                              
+                               }
+                               
+       }
+       
+       return psiValue;
+
+}
+
+
+void storeHuffmanCompressedPsi(HuffmanCompressedPsi *compressedPsi, char *filename) {
+
+       int file;
+       THuff H;
+
+       if( (file = open(filename, O_WRONLY|O_CREAT, 0700)) < 0) {
+               printf("Cannot open file %s\n", filename);
+               exit(0);
+       }
+       write(file, &(compressedPsi->T), sizeof(int));
+       // Almacenar o arbol de huffman
+       H = compressedPsi->diffsHT;
+       write(file, &H.max, sizeof(int));
+       write(file, &H.lim, sizeof(int));
+       write(file, &H.depth, sizeof(int));
+//     write(file, H.s.spos, (H.lim+1)*sizeof(int));
+       write(file, H.s.symb, (H.lim+1)*sizeof(int));   
+       write(file, H.num, (H.depth+1)*sizeof(int));
+       write(file, H.fst, (H.depth+1)*sizeof(int));
+       // Fin de almacenar o arbol de huffman
+       write(file, &(compressedPsi->nS), sizeof(int));
+       write(file, &(compressedPsi->numberOfSamples), sizeof(int));
+       write(file, &(compressedPsi->sampleSize), sizeof(int));
+       write(file,     compressedPsi->samples, ((compressedPsi->numberOfSamples*compressedPsi->sampleSize+31)/32)*sizeof(int));        
+       write(file, &(compressedPsi->pointerSize), sizeof(int));
+       write(file,     compressedPsi->samplePointers, ((compressedPsi->numberOfSamples*compressedPsi->pointerSize+31)/32)*sizeof(int));
+       write(file, &(compressedPsi->streamSize), sizeof(int));
+       write(file,     compressedPsi->stream, ((compressedPsi->streamSize+31)/32)*sizeof(int));
+       write(file, &(compressedPsi->totalMem), sizeof(int));
+       
+       close(file);    
+
+}
+
+
+HuffmanCompressedPsi loadHuffmanCompressedPsi(char *filename) {
+       
+       HuffmanCompressedPsi compressedPsi;
+
+       THuff H;
+     
+       int file;
+
+       if( (file = open(filename, O_RDONLY)) < 0) {
+               printf("Cannot read file %s\n", filename);
+               exit(0);
+       }
+       read(file, &(compressedPsi.T), sizeof(int));
+       // Cargamos o arbol de Huffman
+       read(file, &H.max, sizeof(int));
+       read(file, &H.lim, sizeof(int));
+       read(file, &H.depth, sizeof(int));
+       //H.s.spos = (unsigned int *) malloc((H.lim+1)*sizeof(int));
+       //H.s.spos =H.s.symb = (unsigned int *) malloc((H.lim+1)*sizeof(int));
+       H.s.symb = (unsigned int *) malloc((H.lim+1)*sizeof(int));
+       H.num = (unsigned int *) malloc((H.depth+1)*sizeof(int));       
+       H.fst = (unsigned int *) malloc((H.depth+1)*sizeof(int));       
+
+       //read(file, H.s.spos, (H.lim+1)*sizeof(int));
+       //fprintf(stderr," \n read %d spos bytes\n", (H.lim+1)*sizeof(int));
+       read(file, H.s.symb, (H.lim+1)*sizeof(int));    
+
+       read(file, H.num, (H.depth+1)*sizeof(int));
+       read(file, H.fst, (H.depth+1)*sizeof(int));     
+       compressedPsi.diffsHT = H;
+       // Fin da carga do arbol de Huffman
+       read(file, &(compressedPsi.nS), sizeof(int));
+       read(file, &(compressedPsi.numberOfSamples), sizeof(int));      
+       read(file, &(compressedPsi.sampleSize), sizeof(int));
+       compressedPsi.samples = (unsigned int *)malloc(((compressedPsi.numberOfSamples*compressedPsi.sampleSize+31)/32)*sizeof(int));
+       read(file, compressedPsi.samples, ((compressedPsi.numberOfSamples*compressedPsi.sampleSize+31)/32)*sizeof(int));
+       read(file, &(compressedPsi.pointerSize), sizeof(int));
+       compressedPsi.samplePointers = (unsigned int *)malloc(((compressedPsi.numberOfSamples*compressedPsi.pointerSize+31)/32)*sizeof(int));
+       read(file, compressedPsi.samplePointers, ((compressedPsi.numberOfSamples*compressedPsi.pointerSize+31)/32)*sizeof(int));
+       read(file, &(compressedPsi.streamSize), sizeof(int));
+       compressedPsi.stream = (unsigned int *)malloc(((compressedPsi.streamSize+31)/32)*sizeof(int));
+       read(file, compressedPsi.stream, ((compressedPsi.streamSize+31)/32)*sizeof(int));
+       read(file, &(compressedPsi.totalMem), sizeof(int));
+       
+       close(file);
+               
+       return compressedPsi;
+
+}
diff --git a/swcsa/intIndex/psiHuffmanRLE.h b/swcsa/intIndex/psiHuffmanRLE.h

new file mode 100644 (file)

index 0000000..8757c73
--- /dev/null
+++ b/swcsa/intIndex/psiHuffmanRLE.h
@@ -0,0 +1,389 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <malloc.h>
+#include "../utils/huff.h"
+
+/*
+Compresion de PSI utilizando codificación incremental e RLE para os runs.
+
+Utilizamos códigos Huffman, entre os que distinguimos 4 grupos.
+
+G1: Incrementos frecuentes: Entre 2 e Total - 32 negativos - 32 grandes - (máxima lonxitude dun Run == periodo de muestreo - 1))
+G2: Código que representa que hai un run e a súa lonxitude (periodo de muestreo códigos)
+G3: Numeros negativos (32 caracteres de escape, que representan a lonxitude da representación binaria do valor absoluto do negativo)
+G4: Numeros grandes, maiores que os de G1 (32 caracteres de escape representando novamente a lonxitude da representación binaria do seu valor)
+
+Os de G1 obtéñense directamente, tras decodificar Huffman.
+Os de G2 transfórmanse nunha run da lonxitude obtida tras decodificar con Huffman.
+Os de G3 van seguidos da representación binaria do valor absoluto do seu número.
+Os de G4 van seguidos da súa representación binaria.
+*/
+
+// ESTRUCTURA DE PSI COMPRIMIDA
+typedef struct {
+       unsigned int T;                                 // Periodo de muestreo de PSI
+       THuff diffsHT;                                  // Arbol de Huffman (codifica stream)
+       unsigned int nS;                                // Numero de simbolos para Huffman
+       unsigned int numberOfSamples;
+       unsigned int sampleSize;                // Bits que ocupa cada mostra
+       unsigned int *samples;                  // Vector de mostras
+       unsigned int pointerSize;               // Bits que ocupa cada punteiro 
+       unsigned int *samplePointers;   // Punteiros das mostras a stream
+       unsigned int streamSize;                // Bits que ocupa stream        
+       unsigned int *stream;                   // Secuencia Huffman + RLE
+       unsigned int totalMem;                  // the size in bytes used;
+} HuffmanCompressedPsi;
+
+
+// PROTOTIPOS DE FUNCIÓNS
+
+//     Crea as estructuras de Psi comprimida:
+//     
+//     Psi: Funcion Psi original
+//     psiSize: Numero de elementos de Psi
+//     T: Periodo de muestreo en Psi
+//     nS: Numero de simbolos que se utilizaran no arbol de Huffman
+//
+//     Devolve unha estructura CompressedPSI que representa a Psi comprimida
+HuffmanCompressedPsi huffmanCompressPsi(unsigned int *Psi, unsigned int psiSize, unsigned int T, unsigned int nS);
+
+//     Obtén un valor de Psi
+//
+//     cPsi: A estructura que representa a Psi comprimida
+//     position: A posicion da que queremos obter o valor de Psi
+unsigned int getHuffmanPsiValue(HuffmanCompressedPsi *cPsi, unsigned int position);
+
+void storeHuffmanCompressedPsi(HuffmanCompressedPsi *compressedPsi, char *filename);
+HuffmanCompressedPsi loadHuffmanCompressedPsi(char *filename);
+
+//frees the memory used.       
+void destroyHuffmanCompressedPsi(HuffmanCompressedPsi *compressedPsi);
+       
+       
+//     
+//// IMPLEMENTACIÓN DAS FUNCIÓNS
+//
+//void destroyHuffmanCompressedPsi(HuffmanCompressedPsi *compressedPsi) {
+//     freeHuff(compressedPsi->diffsHT);
+//     free(compressedPsi->samples);
+//     free(compressedPsi->samplePointers);
+//     free (compressedPsi->stream);
+//}
+//
+//HuffmanCompressedPsi huffmanCompressPsi(unsigned int *Psi, unsigned int psiSize, unsigned int T, unsigned int nS) {
+//     
+//     HuffmanCompressedPsi cPsi;
+//     
+//     int absolute_value;
+//     register unsigned int index, ptr, samplesPtr, samplePointersPtr;
+//     unsigned int runLenght, binaryLenght;
+//     
+//     int *diffs;     
+//     unsigned int *huffmanDst;
+//     
+//     // Estructuras da funcion comprimida (para logo asignar)
+//     // Tamén se podian almacenar directamente
+//     THuff diffsHT;
+//     unsigned int numberOfSamples;
+//     unsigned int *samples;
+//     unsigned int sampleSize;
+//     unsigned int *samplePointers;
+//     unsigned int pointerSize;
+//     unsigned int *stream;
+//     unsigned int streamSize;
+//     
+//     // Variables que marcan os intervalos dentro do vector de frecuencias
+//     unsigned int runLenghtStart = nS - 64 - T;      // Inicio das Runs
+//     unsigned int negStart = nS - 64;                        // Inicio dos Negativos
+//     unsigned int bigStart = nS - 32;                        // Inicio dos Grandes (>runLenghtStart)
+//     
+//     // Para estadistica
+//     unsigned int totalSize;
+//     
+//     // Reservamos espacio para a distribución de valores de Psi
+//     huffmanDst = (unsigned int *)malloc(sizeof(int)*nS);
+//     for(index=0;index<nS;index++) huffmanDst[index]=0;
+//     
+//     // Inicializamos diferencias    
+//     diffs = (int *)malloc(sizeof(int)*psiSize);     
+//     diffs[0] = 0;
+//     for(index=1; index<psiSize; index++) 
+//             diffs[index] = Psi[index] - Psi[index-1];
+//     
+//     // Calculamos a distribucion de frecuencias
+//     runLenght = 0;
+//     for(index=0; index<psiSize; index++) {
+//             
+//             if(index%T) {
+//                     
+//                     if(diffs[index]==1) {
+//                             runLenght++;
+//                     } else {        // Non estamos nun run
+//                             if(runLenght) {
+//                                     huffmanDst[runLenght+runLenghtStart]++;
+//                                     runLenght = 0;
+//                             }
+//                             if(diffs[index]>1 && diffs[index]<runLenghtStart) 
+//                                     huffmanDst[diffs[index]]++;
+//                             else
+//                                     if(diffs[index]<0) {    // Valor negativo
+//                                             absolute_value = -diffs[index];
+//                                             binaryLenght = bits(absolute_value);
+//                                             huffmanDst[binaryLenght+negStart-1]++;
+//                                     } else {                                // Valor grande >= 128
+//                                             absolute_value = diffs[index];
+//                                             binaryLenght = bits(absolute_value);
+//                                             huffmanDst[binaryLenght+bigStart-1]++;
+//                                     }
+//                     }
+//                     
+//             } else { // Rompemos o run porque atopamos unha mostra
+//                     if(runLenght) {
+//                             huffmanDst[runLenght+runLenghtStart]++;
+//                             runLenght = 0;
+//                     }
+//             }
+//             
+//     }
+//     
+//     if(runLenght) huffmanDst[runLenght+runLenghtStart]++;
+//     
+//     // Creamos o arbol de Huffman
+//     diffsHT = createHuff(huffmanDst,nS-1,UNSORTED);
+//     
+//     // Calculamos o espacio total ocupado pola secuencia Huffman + RLE
+//     streamSize = diffsHT.total;
+//     for(index=negStart;index<bigStart;index++) 
+//             streamSize += huffmanDst[index]*(index-negStart+1);     // Negativos
+//     for(index=bigStart;index<nS;index++) 
+//             streamSize += huffmanDst[index]*(index-bigStart+1);             // Grandes      
+//     
+//     // Calculamos o numero de mostras e o espacio ocupado por cada mostra e por cada punteiro
+//     numberOfSamples = (psiSize+T-1)/T;      
+//     sampleSize = bits(psiSize);
+//     pointerSize = bits(streamSize); 
+//
+//     // Reservamos espacio para a secuencia e para as mostras e punteiros
+//     samples = (unsigned int *)malloc(sizeof(int)*((numberOfSamples*sampleSize+31)/32));     
+//     samplePointers = (unsigned int *)malloc(sizeof(int)*((numberOfSamples*pointerSize+31)/32));
+//     stream = (unsigned int *)malloc(sizeof(int)*((streamSize+31)/32));
+//     
+//     // Comprimimos secuencialmente (haberá que levar un punteiro desde o inicio)
+//     ptr = 0;
+//     samplesPtr = 0;
+//     samplePointersPtr = 0;
+//     runLenght = 0;
+//     for(index=0; index<psiSize; index++) {
+//             
+//             if(index%T) {
+//                     
+//                     if(diffs[index]==1) {
+//                             runLenght++;
+//                     } else {        // Non estamos nun run
+//                             if(runLenght) {
+//                                     ptr = encodeHuff(diffsHT,runLenght+runLenghtStart,stream,ptr);
+//                                     runLenght = 0;
+//                             }
+//                             if(diffs[index]>1 && diffs[index]<runLenghtStart) {                             
+//                                     ptr = encodeHuff(diffsHT,diffs[index],stream,ptr);      
+//                             }       
+//                             else
+//                                     if(diffs[index]<0) {    // Valor negativo
+//                                             absolute_value = -diffs[index];
+//                                             binaryLenght = bits(absolute_value);
+//                                             ptr = encodeHuff(diffsHT,binaryLenght+negStart-1,stream,ptr);
+//                                             bitwrite(stream,ptr,binaryLenght,absolute_value);
+//                                             ptr += binaryLenght;                                            
+//                                     } else {                                // Valor grande >= 128
+//                                             absolute_value = diffs[index];
+//                                             binaryLenght = bits(absolute_value);                                    
+//                                             ptr = encodeHuff(diffsHT,binaryLenght+bigStart-1,stream,ptr);
+//                                             bitwrite(stream,ptr,binaryLenght,absolute_value);                                               
+//                                             ptr += binaryLenght;
+//                                     }
+//                     }
+//                     
+//             } else { // Rompemos o run porque atopamos unha mostra
+//                     if(runLenght) {                         
+//                             ptr = encodeHuff(diffsHT,runLenght+runLenghtStart,stream,ptr);
+//                             runLenght = 0;
+//                     }
+//                     bitwrite(samples,samplesPtr,sampleSize,Psi[index]);
+//                     samplesPtr += sampleSize;
+//                     bitwrite(samplePointers,samplePointersPtr,pointerSize,ptr);
+//                     samplePointersPtr += pointerSize;
+//             }
+//             
+//     }
+//     
+//     if(runLenght) { 
+//             ptr = encodeHuff(diffsHT,runLenght+runLenghtStart,stream,ptr);
+//     }
+//     
+//     // Amosamos o espacio ocupado
+//     totalSize = sizeof(int)*((numberOfSamples*sampleSize+31)/32) + 
+//             sizeof(int)*((numberOfSamples*pointerSize+31)/32) +
+//             sizeof(int)*((streamSize+31)/32) + sizeHuff(diffsHT) +
+//             6*sizeof(int);
+//     printf("Compressed Psi size = %d bytes\n", totalSize);
+//     
+//     // Necesario antes de decodificar
+//     prepareToDecode(&diffsHT);
+//     
+//     // Asignamos os valores a cPsi e devolvemolo
+//     cPsi.T = T;
+//     cPsi.diffsHT = diffsHT;
+//     cPsi.nS = nS;
+//     cPsi.numberOfSamples = numberOfSamples;
+//     cPsi.samples = samples;
+//     cPsi.sampleSize = sampleSize;
+//     cPsi.samplePointers = samplePointers;
+//     cPsi.pointerSize = pointerSize;
+//     cPsi.stream = stream;
+//     cPsi.streamSize = streamSize;
+//     
+//     //frees resources not needed in advance
+//     free(diffs);
+//     free(huffmanDst);
+//     
+//     //returns the data structure that holds the compressed psi.
+//     return cPsi;    
+//}
+//
+//
+//unsigned int getHuffmanPsiValue(HuffmanCompressedPsi *cPsi, unsigned int position) {
+//     
+//     register unsigned int index;
+//     unsigned int sampleIndex, ptr, psiValue, huffmanCode, positionsSinceSample; 
+//     unsigned int absolute_value, binaryLenght, runLenght;
+//     
+//     unsigned int runLenghtStart = cPsi->nS - 64 - cPsi->T;
+//     unsigned int negStart = cPsi->nS - 64;
+//     unsigned int bigStart = cPsi->nS - 32;  
+//     
+//     sampleIndex = position / cPsi->T;
+//     psiValue = bitread(cPsi->samples,sampleIndex*cPsi->sampleSize,cPsi->sampleSize);
+//     ptr = bitread(cPsi->samplePointers,sampleIndex*cPsi->pointerSize,cPsi->pointerSize);
+//     
+//     positionsSinceSample = position%cPsi->T;
+//     
+//     for(index=0;index<positionsSinceSample;index++) {
+//             
+//             ptr = decodeHuff(&cPsi->diffsHT,&huffmanCode,cPsi->stream,ptr);
+//             
+//             if(huffmanCode < runLenghtStart) {      // Incremento directo
+//                     psiValue += huffmanCode;
+//             }       
+//             else 
+//                     if(huffmanCode < negStart) {    // Estamos nun run
+//                             runLenght = huffmanCode - runLenghtStart;
+//                             if(index+runLenght>=positionsSinceSample)
+//                                     return psiValue+positionsSinceSample-index;
+//                             else {
+//                                     psiValue += runLenght;
+//                                     index += runLenght-1;
+//                             }
+//                     }
+//                     else
+//                             if(huffmanCode < bigStart) {    // Negativo
+//                                     binaryLenght = huffmanCode-negStart+1;
+//                                     absolute_value = bitread(cPsi->stream,ptr,binaryLenght);
+//                                     ptr += binaryLenght;
+//                                     psiValue -= absolute_value;     
+//                             }
+//                             else {  // Grande
+//                                     binaryLenght = huffmanCode-bigStart+1;
+//                                     absolute_value = bitread(cPsi->stream,ptr,binaryLenght);
+//                                     ptr += binaryLenght;
+//                                     psiValue += absolute_value;                              
+//                             }
+//                             
+//     }
+//     
+//     return psiValue;
+//
+//}
+//
+//
+//void storeHuffmanCompressedPsi(HuffmanCompressedPsi *compressedPsi, char *filename) {
+//
+//     int file;
+//     THuff H;
+//
+//     if( (file = open(filename, O_WRONLY|O_CREAT, 0700)) < 0) {
+//             printf("Cannot open file %s\n", filename);
+//             exit(0);
+//     }
+//     write(file, &(compressedPsi->T), sizeof(int));
+//     // Almacenar o arbol de huffman
+//     H = compressedPsi->diffsHT;
+//     write(file, &H.max, sizeof(int));
+//     write(file, &H.lim, sizeof(int));
+//     write(file, &H.depth, sizeof(int));
+////   write(file, H.s.spos, (H.lim+1)*sizeof(int));
+//     write(file, H.s.symb, (H.lim+1)*sizeof(int));   
+//     write(file, H.num, (H.depth+1)*sizeof(int));
+//     write(file, H.fst, (H.depth+1)*sizeof(int));
+//     // Fin de almacenar o arbol de huffman
+//     write(file, &(compressedPsi->nS), sizeof(int));
+//     write(file, &(compressedPsi->numberOfSamples), sizeof(int));
+//     write(file, &(compressedPsi->sampleSize), sizeof(int));
+//     write(file,     compressedPsi->samples, ((compressedPsi->numberOfSamples*compressedPsi->sampleSize+31)/32)*sizeof(int));        
+//     write(file, &(compressedPsi->pointerSize), sizeof(int));
+//     write(file,     compressedPsi->samplePointers, ((compressedPsi->numberOfSamples*compressedPsi->pointerSize+31)/32)*sizeof(int));
+//     write(file, &(compressedPsi->streamSize), sizeof(int));
+//     write(file,     compressedPsi->stream, ((compressedPsi->streamSize+31)/32)*sizeof(int));
+//     
+//     close(file);    
+//
+//}
+//
+//
+//HuffmanCompressedPsi loadHuffmanCompressedPsi(char *filename) {
+//     
+//     HuffmanCompressedPsi compressedPsi;
+//
+//     THuff H;
+//     
+//     int file;
+//
+//     if( (file = open(filename, O_RDONLY)) < 0) {
+//             printf("Cannot read file %s\n", filename);
+//             exit(0);
+//     }
+//     read(file, &(compressedPsi.T), sizeof(int));
+//     // Cargamos o arbol de Huffman
+//     read(file, &H.max, sizeof(int));
+//     read(file, &H.lim, sizeof(int));
+//     read(file, &H.depth, sizeof(int));
+//     //H.s.spos = (unsigned int *) malloc((H.lim+1)*sizeof(int));
+//     //H.s.spos =H.s.symb = (unsigned int *) malloc((H.lim+1)*sizeof(int));
+//     H.s.symb = (unsigned int *) malloc((H.lim+1)*sizeof(int));
+//     H.num = (unsigned int *) malloc((H.depth+1)*sizeof(int));       
+//     H.fst = (unsigned int *) malloc((H.depth+1)*sizeof(int));       
+//
+//     //read(file, H.s.spos, (H.lim+1)*sizeof(int));
+//     fprintf(stderr," \n read %d spos bytes\n", (H.lim+1)*sizeof(int));
+//     read(file, H.s.symb, (H.lim+1)*sizeof(int));    
+//
+//     read(file, H.num, (H.depth+1)*sizeof(int));
+//     read(file, H.fst, (H.depth+1)*sizeof(int));     
+//     compressedPsi.diffsHT = H;
+//     // Fin da carga do arbol de Huffman
+//     read(file, &(compressedPsi.nS), sizeof(int));
+//     read(file, &(compressedPsi.numberOfSamples), sizeof(int));      
+//     read(file, &(compressedPsi.sampleSize), sizeof(int));
+//     compressedPsi.samples = (unsigned int *)malloc(((compressedPsi.numberOfSamples*compressedPsi.sampleSize+31)/32)*sizeof(int));
+//     read(file, compressedPsi.samples, ((compressedPsi.numberOfSamples*compressedPsi.sampleSize+31)/32)*sizeof(int));
+//     read(file, &(compressedPsi.pointerSize), sizeof(int));
+//     compressedPsi.samplePointers = (unsigned int *)malloc(((compressedPsi.numberOfSamples*compressedPsi.pointerSize+31)/32)*sizeof(int));
+//     read(file, compressedPsi.samplePointers, ((compressedPsi.numberOfSamples*compressedPsi.pointerSize+31)/32)*sizeof(int));
+//     read(file, &(compressedPsi.streamSize), sizeof(int));
+//     compressedPsi.stream = (unsigned int *)malloc(((compressedPsi.streamSize+31)/32)*sizeof(int));
+//     read(file, compressedPsi.stream, ((compressedPsi.streamSize+31)/32)*sizeof(int));
+//     
+//     close(file);
+//             
+//     return compressedPsi;
+//
+//}
diff --git a/swcsa/interface.h b/swcsa/interface.h

new file mode 100755 (executable)

index 0000000..938bd43
--- /dev/null
+++ b/swcsa/interface.h
@@ -0,0 +1,96 @@
+
+/* General interface for using the compressed index libraries */
+
+#ifndef uchar
+#define uchar unsigned char
+#endif
+#ifndef uint
+#define uint  unsigned int
+#endif
+#ifndef ulong
+#define ulong unsigned long
+#endif
+
+/* Error management */
+
+        /* Returns a string describing the error associated with error number
+          e. The string must not be freed, and it will be overwritten with
+          subsequent calls. */
+
+char *error_index (int e);
+
+/* Building the index */
+
+        /* Creates index from text[0..length-1]. Note that the index is an 
+          opaque data type. Any build option must be passed in string 
+          build_options, whose syntax depends on the index. The index must 
+          always work with some default parameters if build_options is NULL. 
+          The returned index is ready to be queried. */
+
+int build_index (uchar *text, ulong length, char *build_options, void **index);
+
+        /*  Saves index on disk by using single or multiple files, having 
+          proper extensions. */
+
+int save_index (void *index, char *filename);
+
+        /*  Loads index from one or more file(s) named filename, possibly 
+          adding the proper extensions. */
+
+int load_index (char *filename, void **index);
+
+        /* Frees the memory occupied by index. */
+
+int free_index (void *index);
+
+        /* Gives the memory occupied by index in bytes. */
+
+int index_size(void *index, ulong *size);
+
+/* Querying the index */
+
+        /* Writes in numocc the number of occurrences of the substring 
+          pattern[0..length-1] found in the text indexed by index. */
+
+//int count (void *index, uchar *pattern, ulong length, ulong *numocc);
+//
+//        /* Writes in numocc the number of occurrences of the substring 
+//          pattern[0..length-1] in the text indexed by index. It also allocates
+//          occ (which must be freed by the caller) and writes the locations of 
+//          the numocc occurrences in occ, in arbitrary order.  */
+//
+//int locate (void *index, uchar *pattern, ulong length, ulong **occ, 
+//        ulong *numocc);
+//
+//        /* Gives the length of the text indexed */
+//
+//int get_length(void *index, ulong *length);
+//
+///* Accessing the indexed text  */
+//
+//        /*  Allocates snippet (which must be freed by the caller) and writes 
+//          the substring text[from..to] into it. Returns in snippet_length the 
+//          length of the text snippet actually extracted (that could be less 
+//          than to-from+1 if to is larger than the text size). */
+//
+//int extract (void *index, ulong from, ulong to, uchar **snippet, 
+//        ulong *snippet_length);
+//
+//        /* Displays the text (snippet) surrounding any occurrence of the 
+//          substring pattern[0..length-1] within the text indexed by index. 
+//          The snippet must include numc characters before and after the 
+//          pattern occurrence, totalizing length+2*numc characters, or less if 
+//          the text boundaries are reached. Writes in numocc the number of 
+//          occurrences, and allocates the arrays snippet_text and 
+//          snippet_lengths (which must be freed by the caller). The first is a 
+//          character array of numocc*(length+2*numc) characters, with a new 
+//          snippet starting at every multiple of length+2*numc. The second 
+//          gives the real length of each of the numocc snippets. */
+//
+//int display (void *index, uchar *pattern, ulong length, ulong numc, 
+//        ulong *numocc, uchar **snippet_text, ulong **snippet_lengths);
+
+        /*  Obtains the length of the text indexed by index. */
+
+int length (void *index, ulong *length);
+
diff --git a/swcsa/utils/MemoryManager.c b/swcsa/utils/MemoryManager.c

new file mode 100755 (executable)

index 0000000..e50f5b7
--- /dev/null
+++ b/swcsa/utils/MemoryManager.c
@@ -0,0 +1,105 @@
+\r
+/*-----------------------------------------------------------------------\r
+ File       : MemoryManager.cpp\r
+ Function   : Reserves large blocks of memory and gives pointers to small\r
+              portions of that block when requested.\r
+              This improves performance since a unique "LARGE ALLOCATION"\r
+              of memory is needed (a unique call to malloc).\r
+              It is also responsible of freeing memory.\r
+ Last change: 10/03/2004\r
+ Purpose    : Improve hash performance.\r
+ ------------------------------------------------------------------------*/\r
+#include "MemoryManager.h"\r
+\r
+\r
+/*------------------------------------------------------------------\r
+ Constructor method\r
+ ------------------------------------------------------------------ */\r
+MemoryManager createMemoryManager(void) {\r
+       MemoryManager mm;\r
+       mm = (MemoryManager) malloc (sizeof(struct sMem));\r
+       mm->currentBlock=0;\r
+       createNewMemoryBlock(mm);       \r
+       return mm;\r
+}\r
+\r
+/*------------------------------------------------------------------\r
+ Destructor method\r
+ ------------------------------------------------------------------ */\r
+void destroyMemoryManager (MemoryManager mm){\r
+       register int i;\r
+       for (i=0; i<=mm->currentBlock;i++) free(mm->BLOCKS[i]);\r
+       printf("\n[destroying MemManager] ...Freed %u bytes... RAM", LARGE_BLOCK_SIZE* (mm->currentBlock+1));\r
+       free(mm);\r
+\r
+}\r
+\r
+/*------------------------------------------------------------------\r
+ createNewMemoryBlock method\r
+ Allocates a new memory block of size "LARGE_BLOCK_SIZE" and adds it to\r
+ vector BLOCKS\r
+ ------------------------------------------------------------------ */\r
+\r
+void createNewMemoryBlock (MemoryManager mm) {\r
+       mm->BLOCKS[mm->currentBlock] = (byte *) malloc (LARGE_BLOCK_SIZE);\r
+\r
+       if (mm->BLOCKS[mm->currentBlock] == NULL) {\r
+               fprintf(stderr, "\nERROR...\nUnable to allocate enough memory. Exitting...\n");\r
+               exit(0);\r
+       }\r
+\r
+       mm->remainderBytes = LARGE_BLOCK_SIZE;\r
+       mm->availableByte       = mm->BLOCKS[mm->currentBlock];  //points to the begining of the block\r
+}\r
+\r
+\r
+/*------------------------------------------------------------------\r
+ getBlock method\r
+ returns a pointer to a free block of memory of size "size"\r
+ ------------------------------------------------------------------ */\r
+void getMemoryBlock (MemoryManager mm, byte **dst, const unsigned int size) {\r
+       if (mm->remainderBytes < size)  {\r
+               mm->currentBlock++;\r
+               createNewMemoryBlock(mm);\r
+               //fprintf(stderr,"\new memory block");\r
+       }\r
+\r
+       *dst = mm->availableByte;                       //points to a free size-block\r
+       mm->remainderBytes -= (size);\r
+       mm->availableByte  += (size);\r
+}\r
+\r
+\r
+/*------------------------------------------------------------------\r
+ main, to make unit proofs\r
+ ------------------------------------------------------------------ */\r
+/*\r
+int main(int argc, char* argv[])\r
+{  byte *word, *word2;\r
+       unsigned int size;\r
+int i;\r
+       MemoryManager memMgr;\r
+       memMgr=createMemoryManager();\r
+\r
+       size = 100;\r
+       getMemoryBlock(memMgr,&word,size);\r
+\r
+       fprintf(stderr,"pasei getblock\n");\r
+       strcpy((char *)(word), "01234567890123456789012345678901234567890123456789");\r
+       word[50]='\0';\r
+       fprintf(stderr,"pasei strcpy \n");\r
+       fprintf(stderr,"\n%s",word);\r
+       getMemoryBlock(memMgr,&word2,size);\r
+       strcpy((char *)(word2), "soy la word 2");\r
+\r
+       for (i=0;i<100000;i++) {\r
+               size = 89;\r
+               getMemoryBlock(memMgr,&word,size);\r
+       }\r
+\r
+       fprintf(stderr,"\n  final %s",word2);\r
+\r
+       destroyMemoryManager(memMgr);\r
+\r
+} */\r
+\r
diff --git a/swcsa/utils/MemoryManager.h b/swcsa/utils/MemoryManager.h

new file mode 100755 (executable)

index 0000000..fb51e63
--- /dev/null
+++ b/swcsa/utils/MemoryManager.h
@@ -0,0 +1,47 @@
+/*-----------------------------------------------------------------------\r
+ File       : MemoryManager.h\r
+ Function   : Reserves large blocks of memory and gives pointers to small\r
+              portions of that block when requested.\r
+              This improves performance since a unique "LARGE ALLOCATION"\r
+              of memory is needed (a unique call to malloc).\r
+              It is also responsible of freeing memory.\r
+ Last change: 10/03/2004\r
+ Purpose    : Improve hash performance.\r
+ ------------------------------------------------------------------------*/\r
+\r
+#ifndef MEMORYMANAGERINCLUDED\r
+#define  MEMORYMANAGERINCLUDED      // only used for hashTable of stopwords\r
+       \r
+#include <string.h>\r
+#include <stdlib.h>\r
+#include <math.h>\r
+#include <stdio.h>\r
+#include <malloc.h>\r
+\r
+#ifndef byte\r
+       #define byte unsigned char\r
+#endif \r
+\r
+#define LARGE_BLOCK_SIZE 1024*256                      // Size of the blocks of memory that will be allocated\r
+#define MAX_BLOCKS 2048                                                // Maximum number of blocks of size LARGE_BLOCK_SIZE that\r
+                                                                                       // can be allocated\r
+\r
+       /*\r
+        * Definition of structure MemoryManager\r
+        */\r
+       struct sMem {\r
+               byte *BLOCKS[MAX_BLOCKS];                       //array of blocks of size LARGE_BLOCK_SIZE\r
+               unsigned int  currentBlock;                     //currentBlock in the array of blocks\r
+               unsigned long remainderBytes;           //number of bytes not yet assigned in BLOCKS[currentBlock]\r
+               byte *availableByte;                            //pointer to next byte not yet assigned\r
+       } ;\r
+\r
+       typedef struct sMem *MemoryManager;\r
+       \r
+\r
+       MemoryManager createMemoryManager(void);\r
+       void destroyMemoryManager (MemoryManager mm);\r
+       void getMemoryBlock (MemoryManager mm, byte **dst, const unsigned int size);\r
+       void createNewMemoryBlock (MemoryManager mm);\r
+\r
+#endif\r
diff --git a/swcsa/utils/basics.c b/swcsa/utils/basics.c

new file mode 100755 (executable)

index 0000000..f213ed6
--- /dev/null
+++ b/swcsa/utils/basics.c
@@ -0,0 +1,107 @@
+
+// Basics
+
+// #include "basics.h" included later to avoid macro recursion for malloc
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+               // Memory management
+       
+       void *Malloc (int n)
+       
+          { void *p;
+            if (n == 0) return NULL;
+            p = (void*) malloc (n);
+            if (p == NULL)
+               { fprintf (stderr,"Could not allocate %i bytes\n",n);
+                 exit(1);
+               }
+            return p;
+          }
+       
+       void Free (void *p)
+       
+          { if (p) free (p); 
+          }
+       
+       void *Realloc (void *p, int n)
+       
+          { if (p == NULL) return Malloc (n);
+            if (n == 0) { Free(p); return NULL; }
+            p = (void*) realloc (p,n);
+            if (p == NULL)
+               { fprintf (stderr,"Could not allocate %i bytes\n",n);
+                 exit(1);
+               }
+            return p;
+          }
+
+#include "basics.h"
+
+        // bits needed to represent a number between 0 and n
+
+uint bits (uint n)
+
+   { uint b = 0;
+     while (n)
+       { b++; n >>= 1; }
+     return b;
+   }
+
+        // returns e[p..p+len-1], assuming len <= W
+
+uint bitread (uint *e, uint p, uint len)
+
+   { uint answ;
+     e += p/W; p %= W;
+     answ = *e >> p;
+     if (len == W)
+         { if (p) answ |= (*(e+1)) << (W-p);
+         }
+     else { if (p+len > W) answ |= (*(e+1)) << (W-p);
+            answ &= (1<<len)-1;
+         }
+     return answ;
+   }
+
+
+       // writes e[p..p+len-1] = s, len <= W
+
+void bitwrite (register uint *e, register uint p, 
+              register uint len, register uint s)
+
+   { e += p/W; p %= W;
+     if (len == W)
+         { *e |= (*e & ((1<<p)-1)) | (s << p);
+            if (!p) return;
+            e++;
+            *e = (*e & ~((1<<p)-1)) | (s >> (W-p));
+         }
+     else { if (p+len <= W)
+              { *e = (*e & ~(((1<<len)-1)<<p)) | (s << p);
+                return;
+              }
+           *e = (*e & ((1<<p)-1)) | (s << p);
+            e++; len -= W-p;
+            *e = (*e & ~((1<<len)-1)) | (s >> (W-p));
+         }
+   }
+       // writes e[p..p+len-1] = 0
+
+void bitzero (register uint *e, register uint p, 
+              register uint len)
+
+   { e += p/W; p %= W;
+     if (p+len >= W)
+       { *e &= ~((1<<p)-1);
+         len -= p;
+         e++; p = 0;
+       }
+     while (len >= W)
+       { *e++ = 0;
+         len -= W;
+       }
+     if (len > 0)
+       *e &= ~(((1<<len)-1)<<p);
+   }
diff --git a/swcsa/utils/basics.h b/swcsa/utils/basics.h

new file mode 100755 (executable)

index 0000000..da91efb
--- /dev/null
+++ b/swcsa/utils/basics.h
@@ -0,0 +1,112 @@
+
+
+ // Basics
+ 
+#ifndef BASICSINCLUDED
+#define BASICSINCLUDED
+
+  // Includes 
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/times.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+  // Memory management
+
+#define malloc(n) Malloc(n)
+#define free(p) Free(p)
+#define realloc(p,n) Realloc(p,n)
+
+void *Malloc (int n);
+void Free (void *p);
+void *Realloc (void *p, int n);
+
+  // Data types
+
+#ifndef byte
+       #define byte unsigned char
+#endif 
+
+//typedef unsigned char byte;
+// typedef unsigned int uint;
+
+//typedef int bool;
+//#define true 1
+//#define false 0
+
+#define max(x,y) ((x)>(y)?(x):(y))
+#define min(x,y) ((x)<(y)?(x):(y))
+
+  // Bitstream management
+
+//#define W (8*sizeof(uint))
+#define W (32)
+
+       // bits needed to represent a number between 0 and n
+uint bits (uint n);
+        // returns e[p..p+len-1], assuming len <= W
+uint bitread (uint *e, uint p, uint len);
+        // writes e[p..p+len-1] = s, assuming len <= W
+void bitwrite (uint *e, uint p, uint len, uint s);
+    // writes e[p..p+len-1] = 0, no assumption on len
+        
+    /**/ //FARI. WITH ASSUMPTION ON LEN, OR IT CRASHES 
+         //NOt WORKING UPON THE LIMIT OF THE STARTING uint.
+void bitzero (uint *e, uint p, uint len);
+       // reads bit p from e
+#define bitget(e,p) (((e)[(p)/W] >> ((p)%W)) & 1)
+       // sets bit p in e
+#define bitset(e,p) ((e)[(p)/W] |= (1<<((p)%W)))
+       // cleans bit p in e
+#define bitclean(e,p) ((e)[(p)/W] &= ~(1<<((p)%W)))
+
+
+
+/* bitRead and bitWrite as MACROS */
+       // returns e[p..p+len-1], assuming len <= W
+       //mybitread (uint returned value, uint *e, uint p, uint len)
+#define mybitread(answ, v, p, len) \
+   { uint *e ; \
+        e=v;\
+     e += p/W; p %= W; \
+     answ = *e >> p; \
+     if (len == W) \
+         { if (p) answ |= (*(e+1)) << (W-p); \
+         } \
+     else { if (p+len > W) answ |= (*(e+1)) << (W-p); \
+            answ &= (1<<len)-1; \
+         } \
+   }
+
+
+       // writes e[p..p+len-1] = s, len <= W
+       //void bitwrite (uint *e, uint p, uint len, uint s)
+#define mybitwrite(v, p, len, s) \
+   { uint *e ; \
+        e=v; \
+    e += p/W; p %= W; \
+     if (len == W) \
+         { *e |= (*e & ((1<<p)-1)) | (s << p); \
+            if (p) { \
+               e++; \
+               *e = (*e & ~((1<<p)-1)) | (s >> (W-p)); \
+               } \
+         } \
+     else { if (p+len <= W) \
+              { *e = (*e & ~(((1<<len)-1)<<p)) | (s << p); \
+              } \
+              else { \
+               *e = (*e & ((1<<p)-1)) | (s << p); \
+            e++; len -= W-p; \
+            *e = (*e & ~((1<<len)-1)) | (s >> (W-p)); \
+           } \
+         } \
+   } 
+
+#endif
diff --git a/swcsa/utils/bitmap.c b/swcsa/utils/bitmap.c

new file mode 100755 (executable)

index 0000000..92ac4c0
--- /dev/null
+++ b/swcsa/utils/bitmap.c
@@ -0,0 +1,316 @@
+
+// Implements operations over a bitmap
+
+#include "bitmap.h"
+
+
+  // In theory, we should have superblocks of size s=log^2 n divided into
+  // blocks of size b=(log n)/2. This takes 
+  // O(n log n / log^2 n + n log log n / log n + log n sqrt n log log n) bits
+  // In practice, we can have any s and b, and the needed amount of bits is
+  // (n/s) log n + (n/b) log s + b 2^b log b bits
+  // Optimizing it turns out that s should be exactly s = b log n
+  // Optimizing b is more difficult but could be done numerically.
+  // However, the exponential table does no more than popcounting, so why not
+  // setting up a popcount algorithm tailored to the computer register size,
+  // defining that size as b, and proceeding.
+
+//unsigned char OnesInByte[] = 
+const unsigned char popc[] =   //number of ones in one byte value [0..255].
+{
+0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
+1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
+1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
+2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
+3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8,
+};
+
+uint popcount (register uint x)
+
+   { return popc[x&0xFF] + popc[(x>>8)&0xFF] + popc[(x>>16)&0xFF] + popc[x>>24];
+   }
+
+  
+/******************************************************************/
+// FUNCIONS DE EDU ...
+/******************************************************************/
+/* 
+       Creates a bitmap and structures to rank and select 
+*/
+
+//bitmap createBitmapEdu (uint *string, uint n){  return createBitmap(string,n);}
+
+bitmap createBitmap (uint *string, uint n){
+    bitmap B;
+    
+       unsigned int nb;
+       unsigned int ns;
+       unsigned int countB, countS, blockIndex, superblockIndex;
+    register unsigned int block;
+
+       B =(struct sbitmap *) malloc (sizeof(struct sbitmap));
+       B->data = string;
+       B->n = n; 
+       ns = (n/256)+1;
+       nb = (n/32)+1;
+
+    B->bSize = nb;
+    B->sSize = ns;
+       B->bdata =(byte *)malloc(nb*sizeof(byte));  //  Db = (unsigned char *)malloc(nb*sizeof(unsigned char));
+       B->sdata = (uint*)malloc(ns*sizeof(uint));  //  Ds = (unsigned int *)malloc(ns*sizeof(unsigned int));
+
+       B->mem_usage = (ns*sizeof(uint)) + (nb*sizeof(byte)) + (sizeof(struct sbitmap));
+       /* Ahora construimos los bloques */
+    blockIndex = 0;
+    superblockIndex = 0;
+    countB = 0;
+    countS = 0;
+
+    while(blockIndex < nb) {
+       
+       if(!(blockIndex%8)) {
+          countS += countB;
+          B->sdata[superblockIndex++] = countS;
+          countB = 0;
+       }
+       
+       B->bdata[blockIndex] = countB;
+          block = string[blockIndex++];
+          
+          countB += popcount(block);
+    }
+
+       B->pop = countS+countB;
+       
+//     {int i;     //fprintf(stderr,"\n");
+//     for (i=0;i<ns;i++) {//fprintf(stderr,"%d ",B->sdata[i]);
+//             }
+//     //fprintf(stderr,"\n");
+//     for (i=0;i<8;i++) {//fprintf(stderr,"%d ",B->bdata[i]);
+//             }
+//     }
+    return B;
+}      
+
+
+/*
+  Number of 1s in range [0,posicion]
+*/
+//uint rank1Edu(bitmap B, unsigned int position) {
+//uint rank1Edu(bitmap B, unsigned int position) { return rank(B,position);}
+uint rank(bitmap B, unsigned int position) {
+    register unsigned int block;    
+    if (position > B->n) return B->pop;    
+       //position -=1;
+       
+       block = B->data[position/32] << (31-position%32);
+
+    return B->sdata[position/256] + B->bdata[position/32] + 
+           popc[block & 0xff] + popc[(block>>8) & 0xff] +
+           popc[(block>>16) & 0xff] + popc[block>>24];
+}   
+
+
+/********************************************************************************************/
+/**************************************************************************************/
+
+static uint binsearch (uint *data, uint size, uint val)
+
+   { uint i,j,m;
+     i = 0; j = size;
+     while (i+1 < j)
+       { m = (i+j)/2;
+         if (data[m] >= val) j = m;
+         else i = m;
+       }
+     return i;
+   }
+
+uint bselect (bitmap B, uint j)
+
+   { uint spos,bpos,pos,word,x;
+     byte *blk;
+     if (j > B->pop) return B->n;
+     spos = binsearch(B->sdata,(B->n+256-1)/256,j);
+     
+     //fprintf(stderr,"\n SPOS IS %d, and B->sdata[pos] = %d",spos,B->sdata[spos]);
+     j -= B->sdata[spos];
+     pos = spos<<8;
+     blk = B->bdata + (pos>>5);
+     bpos = 0;
+    
+    //while ((bpos < (1<<3)-1) && (blk[bpos+1] < j)) bpos++;
+    while ( ((spos*8+bpos) < ((B->n-1)/W)) && (bpos < (1<<3)-1) && (blk[bpos+1] < j)) bpos++;
+     
+    
+      //fprintf(stderr,"\n BPOS  = %d",bpos);
+     pos += bpos<<5;
+     word = B->data[pos>>5];
+     j -= blk[bpos];
+     //fprintf(stderr,"\n pos>>5 = %d ... pasou XXX con word = %d, and j= %d",pos>>5,word,j);
+     while (1) 
+    { x = popc[word & ((1<<8)-1)]; 
+       //fprintf(stderr,"\n word = %u popc vale %u",word & ((1<<8)-1),x);
+         if (j <= x) break;
+         j -= x; pos += 8;
+         word >>= 8;
+         
+       }
+       
+     while (j) { if (word & 1) j--; word >>= 1; pos++; }
+
+     //        fprintf(stderr,"\n\nBSELECT::: POSICIÓN FINAL = %u",pos-1);
+     return pos-1;
+     
+   }
+
+
+// destroys the bitmap, freeing the original bitstream
+void destroyBitmap (bitmap B)
+
+   { //free (B->data);
+     free (B->bdata);
+     free (B->sdata);
+     free (B);
+   }
+   
+   
+// Prints the bit vector
+void showBitVector(uint * V, int vectorSize) {
+     uint bitIndex=0;
+     while(bitIndex<vectorSize) {
+        fprintf(stderr,"%d",bitget(V,bitIndex));
+        bitIndex++;
+     }  
+}   
+  
+void saveBitmap (char *filename, bitmap b) {
+       int file;
+       unlink(filename);
+       if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) {
+               printf("Cannot open file %s\n", filename);
+               exit(0);
+       }
+       write(file, &(b->sSize), sizeof(uint));
+       write(file, b->sdata, sizeof(int) * (b->sSize));
+       write(file, &(b->bSize), sizeof(uint));
+       write(file, b->bdata, sizeof(byte) * (b->bSize));
+
+       write(file, &(b->pop), sizeof(uint));
+       write(file, &(b->n), sizeof(uint));
+       close(file);            
+}
+
+/* loads the Rank structures from disk, and sets Bitmap->data ptr to "string"
+*/
+bitmap loadBitmap (char *filename, uint *string, uint n) {  
+       bitmap B;
+       int file;
+       
+       if( (file = open(filename, O_RDONLY)) < 0) {
+               printf("Cannot read file %s\n", filename);
+               exit(0);
+       }               
+       
+       B = (struct sbitmap *) malloc (sizeof(struct sbitmap));
+       B->data = string;
+       
+       read(file, &(B->sSize), sizeof(uint));
+       B->sdata = (uint *) malloc(sizeof(uint) * B->sSize);
+       read(file, B->sdata, sizeof(uint) * B->sSize);
+
+       read(file, &(B->bSize), sizeof(uint));
+       B->bdata = (byte *) malloc(sizeof(byte) * B->bSize);
+       read(file, B->bdata, sizeof(byte) * B->bSize);  
+       
+       read(file, &(B->pop), sizeof(uint));
+       read(file, &(B->n), sizeof(uint));      
+       close(file);
+       B->mem_usage = (sizeof(uint) * B->sSize) + (sizeof(byte) * B->bSize) + (sizeof(struct sbitmap));
+       
+       if (n != B->n) {printf("\n LoadBitmap failed: %u distinto de %u",n,B->n); exit(0);}
+       return B;
+               
+} 
+   
+   
+/********************************************************************************************/
+/********************************************************************************************/
+
+
+
+
+       // creates a bitmap structure from a bitstring, which is shared
+
+bitmap createBitmapGONZA (uint *string, uint n)
+//bitmap createBitmap (uint *string, uint n)
+
+   { bitmap B;
+     uint i,j,pop,bpop,pos;
+     uint s,nb,ns,words;
+     B = (struct sbitmap *) malloc (sizeof(struct sbitmap));
+     B->data = string;
+     
+   
+        B->n = n; words = (n+W-1)/W;
+     ns = (n+256-1)/256; nb = 256/W; // adjustments          
+     
+     B->bSize = ns*nb;
+     B->bdata = (byte *) malloc (ns*nb*sizeof(byte));
+     B->sSize = ns;
+     B->sdata = (uint *)malloc (ns*sizeof(int));
+
+        B->mem_usage = (ns*sizeof(int)) + (ns*nb*sizeof(byte)) + (sizeof(struct sbitmap));
+#ifdef INDEXREPORT
+     printf ("     Bitmap over %i bits took %i bits\n", n,n+ns*nb*8+ns*32);
+#endif 
+         //fprintf (stderr,"     Bitmap over %i bits took %i bits\n", n,n+ns*nb*8+ns*32);
+     pop = 0; pos = 0;
+     for (i=0;i<ns;i++)
+       { bpop = 0;
+         B->sdata[i] = pop;
+         for (j=0;j<nb;j++)
+            { if (pos == words) break;
+              B->bdata[pos++] = bpop;
+              bpop += popcount(*string++);
+            }
+         pop += bpop;
+       }
+     B->pop = pop;
+     
+               //     //fprintf(stderr,"\n");
+               //     for (i=0;i<ns;i++) {//fprintf(stderr,"%d ",B->sdata[i]);
+               //      }
+               //     //fprintf(stderr,"\n");
+               //     for (i=0;i<ns*nb;i++) {//fprintf(stderr,"%d ",B->bdata[i]);
+               //      }
+                    
+     return B;
+   }
+
+       // rank(i): how many 1's are there before position i, not included
+
+//uint rank (bitmap B, uint i)
+uint rankGONZA (bitmap B, uint i)
+
+   { 
+       i++;
+       if (i > B->n) return B->pop;
+       return B->sdata[i>>8] + B->bdata[i>>5] +
+           popcount (B->data[i>>5] & ((1<<(i&0x1F))-1));
+   }
+
+
+   
+   
+   
+   
+   
+
+
+
+               
+
diff --git a/swcsa/utils/bitmap.h b/swcsa/utils/bitmap.h

new file mode 100755 (executable)

index 0000000..2c61ed0
--- /dev/null
+++ b/swcsa/utils/bitmap.h
@@ -0,0 +1,46 @@
+
+// Implements operations over a bitmap
+
+#ifndef BITMAPINCLUDED
+#define BITMAPINCLUDED
+
+#include "basics.h"
+
+typedef struct sbitmap
+   { uint *data;
+     uint n;        // # of bits
+     uint pop;     // # bits set
+     uint *sdata;   // superblock counters
+     uint sSize;       //              size of sdata vector
+     byte *bdata;   // block counters
+     uint bSize;       //     size of bdata vector
+     uint mem_usage;
+   } *bitmap;
+
+
+       // creates a bitmap structure from a bitstring, which gets owned
+bitmap createBitmap (uint *string, uint n);
+       // rank(i): how many 1's are there before position i, not included
+uint rank (bitmap B, uint i);
+       // select(i): position of i-th 1
+uint bselect (bitmap B, uint i);
+       // destroys the bitmap, freeing the original bitstream
+void destroyBitmap (bitmap B);
+       // popcounts 1's in x
+uint popcount (register uint x);
+
+void saveBitmap (char *filename, bitmap b);
+bitmap loadBitmap (char *filename, uint *string, uint n);
+
+
+
+////EDU'S functions included here.
+//bitmap createBitmapEdu (uint *string, uint n);
+//uint popcountEdu (register uint x);     //which is identical to popcount.
+//uint rank1Edu(bitmap B, unsigned int position);
+//unsigned int isActiveBit(uint *V, uint position);
+void showBitVector(uint * V, int vectorSize);
+
+#endif
+
+
diff --git a/swcsa/utils/defValues.h b/swcsa/utils/defValues.h

new file mode 100755 (executable)

index 0000000..a09122d
--- /dev/null
+++ b/swcsa/utils/defValues.h
@@ -0,0 +1,42 @@
+#ifndef DEFVALUES_INCLUDED
+#define DEFVALUES_INCLUDED
+
+//#define MAX_LEN_VOCABULARY 3000000    //number of different words --> (number of variants)
+#define MAX_MEANINGFUL_WORDS 100000000  //number of words (non separators)
+#define MAX_SIZE_OF_WORD 1000000 //255  //size of word
+#define MAX_SIZE_OF_GAP 1000000 //1000  //size of separator
+       //#define MAX_STOPW0RDS_SIZE 255        //size of stopword 
+#define MAX_SIZE_OF_ANY 1000000 //1000
+
+
+#define DEFAULT_SUFFIX_ARRAY_SIZE 50000000
+
+#define DEFAULT_SAMPLE_PERIOD_Z 32
+#define DEFAULT_SAMPLE_PERIOD_B 32
+
+
+
+
+//for queries
+#define MAX_INTEGER_PATTERN_SIZE 20
+#define MAX_TEXT_PATTERN_SIZE 100   //maximum number of valid words in a searched pattern
+
+#ifndef DEBUG_ON
+       // #define DEBUG_ON
+#endif
+
+
+
+#define byte unsigned char
+
+
+#define CSA_ON                         //generates the CSA or only "presentation layer"
+//#define WRITE_SE_FILE     //outputs to a file the array of integers indexed.
+
+// Extensions of created  files
+
+#define VOCABULARY_WORDS_FILE_EXT "words"
+#define SE_FILE_EXT "se"
+#define CONSTANTS_FILE_EXT "cte"
+
+#endif
diff --git a/swcsa/utils/errors.c b/swcsa/utils/errors.c

new file mode 100755 (executable)

index 0000000..42886ff
--- /dev/null
+++ b/swcsa/utils/errors.c
@@ -0,0 +1,48 @@
+
+/*////////////////
+//Error handling//
+////////////////*/
+/*
+char *error_index(int e){
+  switch(e) {
+    case 0:  return "No error"; break;
+    case 1:  return "Out of memory"; break;
+    case 2:  return "The text must end with a \\0"; break;
+    case 5:  return "You can't free the text if you don't copy it"; break;
+    case 20: return "Cannot create files"; break;
+    case 21: return "Error writing the index"; break;
+    case 22: return "Error writing the index"; break;
+    case 23: return "Cannot open index; break";
+    case 24: return "Cannot open text; break";
+    case 25: return "Error reading the index"; break;
+    case 26: return "Error reading the index"; break;
+    case 27: return "Error reading the text"; break;
+    case 28: return "Error reading the text"; break;
+    case 99: return "Not implemented"; break;
+    default: return "Unknown error";
+  }
+}
+*/
+
+
+char *error_index(int e){
+static char err[100];
+  switch(e) {
+    case 0:  strcpy(err, "No error"); break;
+    case 1:  strcpy(err, "Out of memory"); break;
+    case 2:  strcpy(err, "The text must end with a \\0"); break;
+    case 5:  strcpy(err, "You can't free the text if you don't copy it"); break;
+    case 20: strcpy(err, "Cannot create files"); break;
+    case 21: strcpy(err, "Error writing the index"); break;
+    case 22: strcpy(err, "Error writing the index"); break;
+    case 23: strcpy(err, "Cannot open index; break");
+    case 24: strcpy(err, "Cannot open text; break");
+    case 25: strcpy(err, "Error reading the index"); break;
+    case 26: strcpy(err, "Error reading the index"); break;
+    case 27: strcpy(err, "Error reading the text"); break;
+    case 28: strcpy(err, "Error reading the text"); break;
+    case 99: strcpy(err, "Not implemented"); break;
+    default: strcpy(err,  "Unknown error");
+  }
+  return err;
+}
diff --git a/swcsa/utils/fileInfo.c b/swcsa/utils/fileInfo.c

new file mode 100755 (executable)

index 0000000..3f9d413
--- /dev/null
+++ b/swcsa/utils/fileInfo.c
@@ -0,0 +1,44 @@
+
+#include "fileInfo.h"
+
+unsigned long fileSize (char *filename){
+       FILE *fpText;
+       unsigned long fsize;
+       fpText = fopen(filename,"rb");
+       fsize=0;
+       if (fpText) {
+               fseek(fpText,0,2);
+               fsize= ftell(fpText);
+               fclose(fpText);
+               ////fprintf(stderr,"fileSize = %ld",fsize);
+       }
+       return fsize;
+}
+
+/*copies from infile to outfile */
+void copyFile (char *infile, char *outfile){
+       FILE *in, *out;
+       unsigned long fsize;
+       
+       if ( (in = fopen(infile,"rb")) <0) {
+               printf("Cannot open file %s\n", infile); exit(0);
+       }       
+
+       unlink(outfile);
+       if( (out = fopen(outfile, "w")) < 0) {
+               printf("Cannot open file %s\n", outfile);
+               exit(0);
+       }       
+
+       fsize=fileSize(infile);
+       if (fsize) {
+               char *buff = (char *) malloc(sizeof(char)*fsize);
+               if (fread(buff,sizeof(char),fsize,in)) {
+                       fwrite(buff,sizeof(char),fsize,out);                    
+               }               
+               free(buff);
+       }                       
+       fclose(in);
+       fclose(out);    
+}
+
diff --git a/swcsa/utils/fileInfo.h b/swcsa/utils/fileInfo.h

new file mode 100755 (executable)

index 0000000..603b4f2
--- /dev/null
+++ b/swcsa/utils/fileInfo.h
@@ -0,0 +1,19 @@
+#ifndef FILE_INFO_INCLUDED
+#define FILE_INFO_INCLUDED
+
+#include <stdio.h>
+#include <stdlib.h>
+ #include <unistd.h>
+
+
+
+       /*------------------------------------------------------------------
+        Obtains the size of a file.
+       ------------------------------------------------------------------ */
+       unsigned long fileSize (char *filename);
+       
+       /* copies from infile to outfile */
+       void copyFile (char *infile, char *outfile);
+
+#endif
+
diff --git a/swcsa/utils/hash.c b/swcsa/utils/hash.c

new file mode 100755 (executable)

index 0000000..8e77870
--- /dev/null
+++ b/swcsa/utils/hash.c
@@ -0,0 +1,394 @@
+
+/* DYNAMIC END-TAGGED DENSE CODE. -- 
+A dynamic word-based byte oriented compressor for text files based on 
+dynamic End-Tagged Dense Code.
+
+Brisaboa, N. R., Faria, A., Navarro, G., Param, J. R. 
+Simple, Fast, and Efficient Natural Language Adaptive Compression. 
+11th International Symposium on String Processing and Information Retrieval (SPIRE'04) - LNCS 3246. A. Apostolico, M. Melucci (Ed.), pp. 230-241. 
+Padova (Italia), 2004. 
+
+Copyright (C) 2005 Antonio Faria.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+
+Author's contact: Antonio Faria, Dept. of Computer Science, University of
+A Corua. Campus de Elvia s/n. Spain  fari@udc.es
+*/
+
+/*-----------------------------------------------------------------------
+ Hash: Definition of HashTable class (Linear Hash)
+ ------------------------------------------------------------------------*/
+ 
+#include "hash.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+
+/*-----------------------------------------------------------------
+ Initilization of data structures used by the hashTable
+ ---------------------------------------------------------------- */
+t_hash initialize_hash (unsigned long sizeVoc) {
+       t_hash h;
+       unsigned long i;
+
+       h = (t_hash) malloc(sizeof(struct hashStr));
+       h->SIZE_HASH = (unsigned long) (OCUP_HASH * sizeVoc);
+       h->SIZE_HASH = NearestPrime(h->SIZE_HASH);
+       h->hash = (t_hashNode *) malloc(h->SIZE_HASH * sizeof(t_hashNode));
+       h->NumElem = 0;
+
+       //creates the memory manager that is used to reserve small pieces of memory (for words)
+       h->_memMgr=createMemoryManager();
+
+
+       for (i = 0; i < h->SIZE_HASH; i++)    {
+               h->hash[i].word = NULL;
+               h->hash[i].len = 0;
+               h->hash[i].posInVoc = 0;
+       }
+       printf("\nHash Table initilized with: %lu elements\n",h->SIZE_HASH);
+
+       return h;       
+}
+
+
+/*------------------------------------------------------------------
+ Find the nearest prime number over n.
+ ---------------------------------------------------------------- */
+unsigned long NearestPrime(unsigned long n)
+{
+    long position;  /* the prime number being sought */
+    long index;
+
+    for (position = n; ; position++)
+    {
+        // checks if those values from 2 to $\sqrt{m}$ can be factors of $m$ */
+        for (index = 2; index <= (long) sqrt((double) position) && position % index != 0; index++) ;
+
+        if (position % index != 0)  /* No factors in that range, therefore a prime number was found */
+        {
+            break;
+        }
+    }
+    return position;
+}
+
+
+
+/*-----------------------------------------------------------------------
+ Computes a hash function over a string "aWord", it returns the position
+ where "aWord" should go in the hash table if no collision ocurrs.
+ ---------------------------------------------------------------------*/
+unsigned long hashFunction (const unsigned char *aWord, unsigned int len, unsigned long sizeHash)
+{
+    char       c;
+    register unsigned int h;
+    register unsigned long last;
+    last=((unsigned long) aWord) +len -1;
+
+
+    h = SEED;
+    //last=aWord+len;
+
+    for( ; ((unsigned long) aWord <=last ) ; )
+    //for(c=*aWord; (aWord++)<last ;  )
+    {
+       c=*(aWord++);
+               //c=*aWord;
+               h ^= ( (h << 5) + c + (h >> 2) );
+    }
+    return((unsigned int)((h&0x7fffffff) % sizeHash));
+}
+
+
+/*-----------------------------------------------------------------------
+  compares two strings
+ ---------------------------------------------------------------------*/
+
+/* Author J. Zobel, April 2001.
+       http://www.seg.rmit.edu.au/code/zwh-ipl/
+   Permission to use this code is freely granted, provided that this
+   statement is retained. */
+
+
+
+ int strcomp(const unsigned char *s1, const unsigned char *s2, register unsigned long ws1, unsigned long ws2) {         
+        if (ws1 !=ws2) {
+                       return -1;
+        }
+        
+        {  register unsigned long iters;           
+                iters=1;
+           while( iters<ws1 && *s1 == *s2 )
+           {
+                       s1++;
+                       s2++;
+                       iters++;
+           }
+           //fprintf(stderr,"\nDevuelve [%d]",*s1-*s2);
+           return( *s1-*s2 );
+        }
+}
+
+inline int strcomp3(const unsigned char *s1, const unsigned char *s2, unsigned long ws1, unsigned long ws2) {
+        
+        if (ws1 !=ws2) {
+                       return -1;
+        }
+        
+        {  register unsigned long iters;
+           register unsigned long end;
+           end = MIN(ws1,ws2);
+                iters=1;
+           while( iters<end && *s1 == *s2 )
+           {
+                       s1++;
+                       s2++;
+                       iters++;
+           }
+           //fprintf(stderr,"\nDevuelve [%d]",*s1-*s2);
+           return( *s1-*s2 );
+        }
+}
+
+
+/* Author J. Zobel, April 2001.
+       http://www.seg.rmit.edu.au/code/zwh-ipl/
+   Permission to use this code is freely granted, provided that this
+   statement is retained. */
+
+//int strcomp(const unsigned char *s1, const unsigned char *s2, unsigned long ws) {
+//    register unsigned long i;
+//    i=0;
+//    while( i < ws-1 && *s1 == *s2 )*/
+//    {
+//                     s1++;
+//                     s2++;
+//                     i++;
+//    }
+//    return( *s1-*s2 );
+//}
+
+//int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) {
+//     register ulong iters = 1;           
+//
+//     while( iters<ws1 && *s1 == *s2 ){
+//             s1++;
+//             s2++;
+//             iters++;
+//     }
+//       // so     iters == ws1   OR   *s1 != *s2
+//     if (ws1 == iters) {
+//                     if (ws2 == ws1)
+//                             return 0;  // s1 equals to s2
+//                     else 
+//                             return -1;   // s1 < s2
+//     }
+//     else { //(ws1 > iters) so s1 != s2
+//             return( *s1-*s2);
+//     }        
+//}
+
+
+//permits to compare 2 strings of len ws1 and ws2 that do not end in '\0'
+int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) {
+       register ulong iters = 0;           
+
+       while( iters<ws1 && iters<ws2 && *s1 == *s2 ){
+               s1++;
+               s2++;
+               iters++;
+       }
+
+       if (ws1 == iters) {
+                       if (ws2 == ws1)
+                               return 0;  // s1 equals to s2
+                       else 
+                               return -1;   // w1 < w2  and  *s1_i == *s2_i for i=0 to iters-1
+       }
+       else 
+       if (ws2 == iters) {
+                       if (ws2 == ws1)
+                               return 0;  // s1 equals to s2
+                       else 
+                               return +1;   // w2 < w1  and  *ws1 is '\0'
+       }
+       else { //*s1 != *s2
+               return (*s1-*s2);
+       }       
+       
+}
+       
+
+/*-----------------------------------------------------------------------
+ Insert a new word in a position of the hashTable (position previously computed)
+ ---------------------------------------------------------------------*/
+void insertElement (t_hash h, const unsigned char *aWord, register unsigned long len, register unsigned long *addr) {
+    //fprintf(stderr,"\n Entra en la funcin [%s], [%ld]",aWord, len);
+
+       if(h->NumElem >= h->SIZE_HASH -1)   //loses 1 slot, but ensures that "search function" does not enter an infinity loop
+       {
+               printf("\n\nHash table full!! Change size and recompile !\n");
+               exit(1);
+       }
+
+    getMemoryBlock(h->_memMgr,( byte **)&(h->hash[*addr].word),len+1);
+    //fprintf(stderr,"\n tras obter memoria");
+
+       strncpy ((char *) h->hash[*addr].word, (char *)aWord, len);
+
+       h->hash[*addr].word[len]='\0';
+       h->hash[*addr].len =len;
+       h->hash[*addr].posInVoc = h->NumElem;
+       h->NumElem++;
+
+       //fprintf(stderr,"\n####inserted word [%s] ",h->hash[*addr].word);
+       
+       //return *addr;
+}
+
+/*-----------------------------------------------------------------------
+ Search for a word in the hash table and returns its position in the
+ vocabulary. It returns the next "available" position in the vocabulary,
+ if the word is not in the hash table. That is: a "0-node" position.
+ It also returns -using attribute returnedAddr- the position where word
+ was found (or where it should go if it was inserted in next "insert call".
+ -----------------------------------------------------------------------*/
+unsigned long search (t_hash h, const unsigned char *aWord, register unsigned len,
+                                                                unsigned long *returnedAddr){
+
+       register unsigned long addr, Saddr;
+       
+       //fprintf(stderr,"\n searching for  [%s], [%d], sizehash= %ld",aWord,len,h->SIZE_HASH);
+       addr = hashFunction(aWord,len, h->SIZE_HASH);
+       Saddr = addr;
+
+       t_hashNode *hash;
+       hash = h->hash;
+       
+       while((hash[addr].word  != NULL)&&((strcomp(hash[addr].word, aWord,  hash[addr].len, len)) != 0))  {    
+               //fprintf(stderr,"\nComprueba [%s], [%d]",hash[addr].word,strcomp(hash[addr].word, aWord, len));
+               addr = ((addr + JUMP) % h->SIZE_HASH);
+       }
+
+       *returnedAddr = addr;
+       
+       if(hash[addr].word  == NULL) {
+               return h->NumElem;                              //Word was not found
+       }
+       else {
+               return h->hash[addr].posInVoc; //Word was found in this position in the vocabulary
+       }
+}
+
+
+
+/*-----------------------------------------------------------------------
+  Tells if a word appears or not in the hash table.
+ -----------------------------------------------------------------------*/
+unsigned long inHashTable (t_hash h, const unsigned char *aWord, register unsigned len, unsigned long *returnedAddr){
+       
+       unsigned long searched;
+       unsigned long nothing;
+       searched = search(h,aWord,len,&nothing);
+       *returnedAddr=nothing;
+       return (searched < (h->NumElem) );
+}
+
+/*------------------------------------------------------------------
+ Destructor method
+ ------------------------------------------------------------------ */
+void destroy_hash (t_hash hash){
+       unsigned long mem=0;
+       mem += sizeof(struct hashStr) + hash->SIZE_HASH * sizeof(t_hashNode);   
+       free(hash->hash);
+       destroyMemoryManager(hash->_memMgr); //frees words and variants 
+//     free(hash->_memMgr);
+       free(hash);     
+       printf("\n[destroying hash table]...Freed %ld bytes... RAM", mem);      
+}
+
+
+
+
+/*------------------------------------------------------------------
+ main, to make unit proofs
+ ------------------------------------------------------------------ */
+/*
+int main(int argc, char* argv[])
+{   byte a[10]= "word1";
+       byte b[10]= "word2";
+       byte c[10]= "word3";
+       byte d[10]= "word4";
+       byte e[10]= "word5";
+       byte f[10]= "word6";
+       byte * w;
+       unsigned int size;
+       unsigned long i,addrInTH;
+       
+       t_hash hash;
+       
+       _memMgr=createMemoryManager();
+
+       hash = initialize_hash (2);
+       
+       w=a;
+       i = search (hash,w, strlen(w), &addrInTH );
+       insertElement (hash, w, strlen(w), &addrInTH);
+       fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
+       fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);   
+
+       w=b;
+       i = search (hash,w, strlen(w), &addrInTH );
+       insertElement (hash, w, strlen(w), &addrInTH);
+       fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
+       fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);   
+
+       w=c;
+       i = search (hash,w, strlen(w), &addrInTH );
+       insertElement (hash, w, strlen(w), &addrInTH);
+       fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
+       fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);   
+
+       w=d;
+       i = search (hash,w, strlen(w), &addrInTH );
+       insertElement (hash, w, strlen(w), &addrInTH);
+       fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
+       fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);   
+
+//     w=e;
+//     i = search (hash,w, strlen(w), &addrInTH );
+//     insertElement (hash, w, strlen(w), &addrInTH);
+//     fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
+//     fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);   
+
+//     w=f;
+//     i = search (hash,w, strlen(w), &addrInTH );
+//     insertElement (hash, w, strlen(w), &addrInTH);
+//     fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
+//     fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);   
+
+       fprintf(stderr,"\n in: %s  hash ? = %d",a,inHashTable(hash,a,strlen(a)) );
+       fprintf(stderr,"\n in: %s  hash ? = %d",e, inHashTable(hash,e,strlen(e)) );
+       fprintf(stderr,"\n in: %s  hash ? = %d",b, inHashTable(hash,b,strlen(b)) );
+
+       destroy_hash(hash);
+       destroyMemoryManager(_memMgr);
+       printf("\n\n");
+}
+*/
diff --git a/swcsa/utils/hash.h b/swcsa/utils/hash.h

new file mode 100755 (executable)

index 0000000..d691c7a
--- /dev/null
+++ b/swcsa/utils/hash.h
@@ -0,0 +1,91 @@
+\r
+/* DYNAMIC END-TAGGED DENSE CODE. -- \r
+A dynamic word-based byte oriented compressor for text files based on \r
+dynamic End-Tagged Dense Code.\r
+\r
+Brisaboa, N. R., Fariña, A., Navarro, G., Paramá, J. R. \r
+Simple, Fast, and Efficient Natural Language Adaptive Compression. \r
+11th International Symposium on String Processing and Information Retrieval (SPIRE'04) - LNCS 3246. A. Apostolico, M. Melucci (Ed.), pp. 230-241. \r
+Padova (Italia), 2004. \r
+\r
+Copyright (C) 2005 Antonio Fariña.\r
+\r
+This program is free software; you can redistribute it and/or\r
+modify it under the terms of the GNU General Public License\r
+as published by the Free Software Foundation; either version 2\r
+of the License, or (at your option) any later version.\r
+\r
+This program is distributed in the hope that it will be useful,\r
+but WITHOUT ANY WARRANTY; without even the implied warranty of\r
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\r
+GNU General Public License for more details.\r
+\r
+You should have received a copy of the GNU General Public License\r
+along with this program; if not, write to the Free Software\r
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.\r
+\r
+Author's contact: Antonio Fariña, Dept. of Computer Science, University of\r
+A Coruña. Campus de Elviña s/n. Spain  fari@udc.es\r
+*/\r
+\r
+\r
+/*-----------------------------------------------------------------------\r
+ Hash: Definition of HashTable class (Linear Hash)\r
+ ------------------------------------------------------------------------*/\r
+#ifndef HASH_INCLUDED\r
+#define HASH_INCLUDED\r
+ \r
+#include <string.h>\r
+#include <stdlib.h>\r
+#include <math.h>\r
+#include <malloc.h>\r
+\r
+#include "MemoryManager.h"\r
+\r
+#define JUMP 101                //jump done when a collision appears\r
+#define OCUP_HASH 1.5           //index of occupation of the hash table\r
+#define SMALL_PRIME 1009 // a small prime number, used to compute a hash function\r
+#define SEED   1159241\r
+/* Type definitions */\r
+\r
+#define MIN(a,b) (a < b) ? a : b\r
+\r
+struct hashNode {\r
+         unsigned char *word;\r
+         unsigned long len;\r
+         unsigned long posInVoc;         //positon of the canonical word in vector posInHT\r
+};\r
+typedef struct hashNode t_hashNode;\r
+\r
+struct hashStr {\r
+       t_hashNode *hash;                 /* the slots in the hash table        */\r
+       unsigned long SIZE_HASH;   /* # entries in the hash table    */\r
+       unsigned long NumElem;    /* # elements already added to the hash table*/\r
+       MemoryManager _memMgr;    /* Holds dynamic memory allocation for words. */      \r
+};\r
+\r
+typedef struct hashStr *t_hash;\r
+\r
+// private:\r
+\r
+       unsigned long NearestPrime (unsigned long n);\r
+       unsigned long hashFunction (const unsigned char *aWord, unsigned int len, unsigned long sizeHash);\r
+\r
+// public:\r
+       \r
+       t_hash initialize_hash (unsigned long sizeVoc);\r
+       \r
+       void insertElement (t_hash h, const unsigned char *aWord, register unsigned long len,\r
+                                                             register unsigned long *addr);\r
+       unsigned long search (t_hash h, const unsigned char *aWord, register unsigned len,\r
+                                                                 unsigned long *returnedAddr);\r
+       unsigned long inHashTable (t_hash h, const unsigned char *aWord, register unsigned len,\r
+                                                                 unsigned long *returnedAddr);\r
+       void destroy_hash (t_hash hash);\r
+       \r
+       int strcomp(const unsigned char *s1, const unsigned char *s2, register unsigned long ws1, unsigned long ws2);\r
+\r
+       int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2);\r
+\r
+\r
+#endif\r
diff --git a/swcsa/utils/huff.c b/swcsa/utils/huff.c

new file mode 100755 (executable)

index 0000000..b45a484
--- /dev/null
+++ b/swcsa/utils/huff.c
@@ -0,0 +1,396 @@
+
+// implements canonical Huffman 
+
+#include "huff.h"
+
+typedef struct
+   { uint freq;
+     uint symb;
+     union
+       { int prev;
+        int depth;
+       } h;
+     int ch1,ch2;
+   } Ttree;
+  
+static void sort (Ttree *tree, int lo, int up)
+
+   { int i, j;
+     Ttree temp;
+     while (up>lo) 
+        { i = lo;
+         j = up;
+         temp = tree[lo]; 
+         while (i<j) 
+            { while (tree[j].freq > temp.freq) j--;
+              tree[i] = tree[j]; 
+              while (i<j && tree[i].freq <= temp.freq) i++;
+              tree[j] = tree[i];
+            }
+         tree[i] = temp;
+         if (i-lo < up-i) { sort(tree,lo,i-1); lo = i+1; }
+         else { sort(tree,i+1,up); up = i-1; }
+       }
+   }
+
+static void setdepths (Ttree *tree, uint node, int depth)
+
+   { if (tree[node].ch1 == -1) // leaf
+       { tree[node].h.depth = depth; 
+         return;
+       }
+     setdepths (tree,tree[node].ch1,depth+1);
+     setdepths (tree,tree[node].ch2,depth+1);
+   }
+
+
+THuff createHuff (uint *freq, uint lim, uint sorted)
+
+   { THuff H;
+     int i,j,d;
+     Ttree *tree;
+     int ptr,last,fre;
+         // remove zero frequencies
+     H.max = lim;
+     H.total = 0;
+     
+     int onlyOneSymbol = (lim ==0);
+
+       //     tree = malloc((2*(lim+1)-1)*sizeof(Ttree));
+               /*** BEG FARI... para solucionar caso en que s�lo hubiese un �nico s�mbolo a codificar ****/
+                if (onlyOneSymbol) {
+                       tree = (Ttree *)malloc(3*sizeof(Ttree));  //root, the valid node (pos 0) and the "zeroNode" (pos 1)
+                       H.max =  lim+1;
+               }
+                else 
+                       tree =  (Ttree *)malloc((2*(lim+1)-1)*sizeof(Ttree));           
+       /*** END FARI... para solucionar caso en que s�lo hubiese un �nico s�mbolo a codificar ****/      
+     
+     //fprintf(stderr,"\n **** CALLED CREATE_HUFF WITH  lim = %ld, freq[0]=%ld",lim,freq[0]);     
+     j = 0;
+     for (i=0;i<=lim;i++) 
+       { if (freq[i]>0) 
+            { tree[j].freq = freq[i];   
+               tree[j].symb = i;   
+              j++;
+              //fprintf(stderr,"\n freq[%d] = [%d], para s�mbolo %d, now j=%ld",i,freq[i],tree[j-1].symb,j);
+            }
+       }
+       
+       /*** BEG FARI... para solucionar caso en que s�lo hubiese un �nico s�mbolo a codificar ****/
+               if (onlyOneSymbol){     
+                       tree[j].freq = 0;                       
+                       tree[j++].symb = 1; //<--- !!!!!!!!! (incrementa j !!!)
+                       
+                       //H.total -=1 * 1;     //we will add 1*1 at the end of this function.
+
+               }
+                //fprintf(stderr,"\n");
+       
+       /*** END FARI... para solucionar caso en que s�lo hubiese un �nico s�mbolo a codificar ****/      
+       
+       H.lim = lim = j-1;              
+
+     
+       // now run Huffman algorithm
+     if (!sorted) sort (tree,0,lim);
+       
+     for (i=0;i<=lim;i++) {
+       //fprintf(stderr,"\n    XX freq[%d] = [%d], para symbolo %d",i,tree[i].freq,tree[i].symb);      
+       tree[i].h.prev = i+1;
+          tree[i].ch1 = tree[i].ch2 = -1;
+        }
+     tree[lim].h.prev = -1;
+    // last = next node to process, ptr = search point, fre = next free cell
+       // leaves are in 0..lim in decreasing freq order
+       // internal nodes are in lim+1.. 2*lim, created in incr. fre order
+     last=0; ptr = 0; fre = lim+1;
+     for (i=0;i<lim;i++)
+       { tree[fre].ch1 = last;
+         last = tree[last].h.prev;
+         tree[fre].ch2 = last;
+         tree[fre].freq = tree[tree[fre].ch1].freq+tree[tree[fre].ch2].freq;
+         while ((tree[ptr].h.prev != -1) &&
+                (tree[tree[ptr].h.prev].freq <= tree[fre].freq))
+             ptr = tree[ptr].h.prev;
+         tree[fre].h.prev = tree[ptr].h.prev;
+         tree[ptr].h.prev = fre;
+         last = tree[last].h.prev;
+         fre++;
+       }
+        // now assign depths recursively
+     setdepths (tree,2*lim,0);
+     H.s.spos = (uint *)malloc ((H.max+1)*sizeof(uint));
+     for (i=0;i<=H.max;i++) H.s.spos[i] = ~0;
+     H.num = (uint *)malloc ((lim+1)*sizeof(uint)); // max possible depth
+     d=0;
+     for (i=lim;i>=0;i--)
+       { H.s.spos[tree[i].symb] = i;
+         while (tree[i].h.depth > d) 
+             { H.num[d] = i+1; d++; }
+       }
+     H.num[d] = 0;
+     H.depth = d;
+     for (d=H.depth;d>0;d--) H.num[d] = H.num[d-1] - H.num[d];
+     H.num[0] = (lim == 0);
+     H.num = (uint *) realloc(H.num,(H.depth+1)*sizeof(uint));
+     //H.total = 0;
+     
+     
+     
+       
+     if (onlyOneSymbol){       H.total += freq[tree[0].symb] * tree[0].h.depth;}
+     else {
+            for (i=0;i<=lim;i++) {
+              H.total += freq[tree[i].symb] * tree[i].h.depth;
+              //fprintf(stderr,"\n         ****tota[%d] = %d x %d =%d",i,freq[tree[i].symb],tree[i].h.depth,H.total);
+           }
+       }
+     free (tree);
+
+     //fprintf(stderr,"\n **** CALL ENDS CREATE_HUFF WITH  lim = %ld, freq[0]=%ld, Huffsize=%ld",lim,freq[0],H.total); 
+
+     return H;
+   }
+
+int encodeHuff (THuff H, uint symb, uint *stream, uint ptr)
+
+   { uint pos;
+     uint code;
+     int d;
+     pos = H.s.spos[symb];
+     code = 0;
+     d = H.depth; 
+     while (pos >= H.num[d])
+       { code = (code + H.num[d]) >> 1;
+         pos -= H.num[d--];
+       }
+     code += pos;
+     if (d > W) { bitzero(stream,ptr,d-W); ptr += d-W; d = W; }
+     while (d--)
+        { if ((code >> d) & 1) bitset(stream,ptr);
+         else bitclean(stream,ptr);
+         ptr++;
+       }
+     return ptr;
+   } 
+
+void printCodeHuff (THuff H, uint symb)
+
+   { uint pos;
+     uint code;
+     int d;
+     pos = H.s.spos[symb];
+     code = 0;
+     d = H.depth;
+     
+    // fprintf(stderr,"\n H.depth= %ld and pos is %ld\n",H.depth,pos); 
+     while (pos >= H.num[d])
+       { code = (code + H.num[d]) >> 1;
+         pos -= H.num[d--];
+       }
+     code += pos;
+     if (d > W) {fprintf(stderr,"code larger than W"); d=W;}
+   
+        //show the code.
+     while (d--)
+        { if ((code >> d) & 1) 
+               fprintf(stderr,"1");
+             else fprintf(stderr,"0");
+               }
+   } 
+
+
+
+int decodeHuff (THuff *H, uint *symb, uint *stream, uint ptr)
+
+   { uint pos;
+     int d;
+     pos = 0;
+     d = 0;
+     while (pos < H->fst[d])
+        { pos = (pos << 1) | bitget(stream,ptr);
+          ptr++; d++;
+        }
+     *symb = H->s.symb[H->num[d]+pos-H->fst[d]];
+     return ptr;
+   }
+
+
+
+
+/*   { uint pos;  // This "improved" code is actually slower!
+     int d;
+     uint wrd,off;
+     stream += ptr/W;
+     off = ptr & (W-1);
+     wrd = *stream >> off;
+     pos = 0;
+     d = 0;
+     while (pos < H.fst[d])
+       { pos = (pos << 1) | (wrd & 1);
+        d++; wrd >>= 1; off++;
+        if (off == W) { wrd = *++stream; off = 0; }
+       }
+     *symb = H.s.symb[H.num[d]+pos-H.fst[d]];
+     return ptr+d;
+   } 
+*/
+void saveHuff (THuff H, FILE *f)
+
+   { uint *symb = (uint *)malloc((H.lim+1)*sizeof(uint));
+     int i;
+     for (i=0;i<=H.max;i++) 
+               if (H.s.spos[i] != ~0) symb[H.s.spos[i]] = i;
+     fwrite (&H.max,sizeof(uint),1,f); 
+     fwrite (&H.lim,sizeof(uint),1,f); 
+     fwrite (&H.depth,sizeof(uint),1,f); 
+     fwrite (symb,sizeof(uint),H.lim+1,f); 
+     fwrite (H.num,sizeof(uint),H.depth+1,f); 
+     free (symb);
+   }
+
+uint sizeHuff (THuff H)
+
+   { return (4 +(H.lim+1)+2*(H.depth+1))*sizeof(uint);
+   }
+
+uint sizeHuffDisk (THuff H)
+
+   { return ( sizeof(THuff) + ((H.lim+1)+(H.depth+1))*sizeof(uint) );
+   }
+
+void freeHuff (THuff H)
+
+  { free (H.s.spos);  free (H.num); free (H.fst);
+  }
+
+
+THuff loadHuff (FILE *f, int enc)   //enc (0/1)-> do you only  want to perform encoding ??
+
+   { THuff H;
+     uint *symb;
+     uint *num;
+     int i,d,dold,dact;
+     fread (&H.max,sizeof(uint),1,f); 
+     fread (&H.lim,sizeof(uint),1,f); 
+     fread (&H.depth,sizeof(uint),1,f); 
+     symb = (uint *) malloc ((H.lim+1)*sizeof(uint));
+     fread (symb,sizeof(uint),H.lim+1,f); 
+     if (enc) 
+          { H.s.spos = (uint *) malloc ((H.max+1)*sizeof(uint));
+            for (i=0;i<=H.max;i++) H.s.spos[i] = ~0;
+            for (i=0;i<=H.lim;i++) H.s.spos[symb[i]] = i;
+            free (symb);
+         }
+     else H.s.symb = symb;
+       
+     H.num = (uint *) malloc ((H.depth+1)*sizeof(uint));
+     fread (H.num,sizeof(uint),H.depth+1,f); 
+     if (!enc) 
+          { H.fst = (uint *) malloc ((H.depth+1)*sizeof(uint));
+            H.fst[H.depth] = 0; dold = 0;
+            for (d=H.depth-1;d>=0;d--)
+               { dact = H.num[d+1];
+                 H.fst[d] = (H.fst[d+1]+dact) >> 1;
+                 H.num[d+1] = dold;
+                 dold += dact;
+               }
+           H.num[0] = dold;
+         }
+     return H;
+   }     
+
+
+
+  
+
+/***************************************************************/     
+void prepareToDecode(THuff *H)   
+//***///**//  //by fari !!     
+
+   { uint *symb = (uint *) malloc((H->lim+1)*sizeof(uint));
+     uint *num;
+     int i,d,dold,dact;
+
+     for (i=0;i<=H->max;i++) 
+               if (H->s.spos[i] != ~0) 
+                       symb[H->s.spos[i]] = i;
+                       
+        for (i=0;i<=H->lim;i++) 
+               H->s.symb[i] = symb[i];
+       
+     //H.num = malloc ((H.depth+1)*sizeof(uint));
+     {
+               H->fst = (uint *)malloc ((H->depth+1)*sizeof(uint));
+               H->fst[H->depth] = 0; dold = 0;
+               for (d=H->depth-1;d>=0;d--)
+               {       dact = H->num[d+1];
+                       H->fst[d] = (H->fst[d+1]+dact) >> 1;
+                       H->num[d+1] = dold;
+                       dold += dact;
+               }
+           H->num[0] = dold;
+        } 
+     free (symb);       
+}    
+
+
+////////////     
+void saveHuffAfterDecode (THuff H, FILE *f)     
+   { 
+     fwrite (&H.max,sizeof(uint),1,f); 
+     fwrite (&H.lim,sizeof(uint),1,f); 
+     fwrite (&H.depth,sizeof(uint),1,f); 
+     fwrite (H.s.symb,sizeof(uint),H.lim+1,f); 
+     
+     fwrite (H.fst,sizeof(uint),H.depth+1,f); 
+     fwrite (H.num,sizeof(uint),H.depth+1,f); 
+   }
+
+
+THuff loadHuffAfterDecode (FILE *f, int enc)   //enc (0/1)-> do you only  want to perform encoding ??
+
+   { THuff H;
+//     int i,d,dold,dact;
+//
+//     fread (&H.max,sizeof(uint),1,f); 
+//     fread (&H.lim,sizeof(uint),1,f); 
+//     fread (&H.depth,sizeof(uint),1,f); 
+//
+//     H.s.symb = malloc ((H.lim+1)*sizeof(uint));
+//     fread (H.s.symb,sizeof(uint),H.lim+1,f); 
+// 
+//     H.fst = malloc ((H.depth+1)*sizeof(uint));
+//     fread (H.fst,sizeof(uint),H.depth+1,f);
+//
+//     H.num = malloc ((H.depth+1)*sizeof(uint));
+//     fread (H.num,sizeof(uint),H.depth+1,f); 
+//     
+     return H;
+   }   
+       
+
+
+void loadHuffAfterDecode2 (THuff *H, FILE *f, int enc)   //enc (0/1)-> do you only  want to perform encoding ??
+
+   { 
+     int i,d,dold,dact;
+
+     fread (&H->max,sizeof(uint),1,f); 
+     fread (&H->lim,sizeof(uint),1,f); 
+     fread (&H->depth,sizeof(uint),1,f); 
+
+     H->s.symb = (uint *) malloc ((H->lim+1)*sizeof(uint));
+     fread (H->s.symb,sizeof(uint),H->lim+1,f); 
+ 
+     H->fst = (uint *) malloc ((H->depth+1)*sizeof(uint));
+     fread (H->fst,sizeof(uint),H->depth+1,f);
+
+     H->num = (uint *) malloc ((H->depth+1)*sizeof(uint));
+     fread (H->num,sizeof(uint),H->depth+1,f); 
+   }   
+       
+     
+     
+
diff --git a/swcsa/utils/huff.h b/swcsa/utils/huff.h

new file mode 100755 (executable)

index 0000000..060570c
--- /dev/null
+++ b/swcsa/utils/huff.h
@@ -0,0 +1,80 @@
+
+// implements canonical Huffman 
+
+#ifndef HUFFINCLUDED
+#define HUFFINCLUDED
+
+#include "basics.h"
+#define SORTED 1
+#define UNSORTED 0
+
+typedef struct
+   { uint max,lim;   // maximum symbol (0..max), same excluding zero freqs
+     uint depth; // max symbol length
+     union
+       { uint *spos; // symbol positions after sorting by decr freq (enc)
+        uint *symb; // symbols sorted by freq (dec)
+       } s;
+     uint *num;  // first pos of each length (dec), number of each length (enc)
+     uint *fst;  // first code (numeric) of each length (dec)
+     uint total; // total length to achieve, in bits
+   } THuff;
+
+       // Creates Huffman encoder given symbols 0..lim with frequencies 
+       // freq[i], ready for compression
+       // sorted --> are the symbols already sorted ?
+
+THuff createHuff (uint *freq, uint lim, uint sorted);
+
+       // Encodes symb using H, over stream[ptr...lim] (ptr and lim are
+       // bit positions of stream). Returns the new ptr.
+       
+int encodeHuff (THuff H, uint symb, uint *stream, uint ptr);
+
+       // Decodes *symb using H, over stream[ptr...lim] (ptr and lim are
+       // bit positions of stream). Returns the new ptr.
+       
+int decodeHuff (THuff *H, uint *symb, uint *stream, uint ptr);
+
+       //Prepares a Huffman tree for decoding (changes in spos & symb)
+       
+void prepareToDecode(THuff *H);
+
+       // Writes H in file f
+       
+void saveHuff (THuff H, FILE *f);
+
+       // Size of H written on file
+       
+uint sizeHuffDisk (THuff H);
+
+       //Size of H in memory
+uint sizeHuff (THuff H);
+       
+       // Frees H
+       
+void freeHuff (THuff H);
+
+       // Loads H from file f, prepared for encoding or decoding depending
+       // on enc
+       
+THuff loadHuff (FILE *f, int enc);
+
+//Decodes a code starting in position ptr from stream. Returns the ranking in the
+//vector of symbols.
+
+#define decodeNormalHuffMacro(H, symbol, stream, ptr) \
+   { uint pos; \
+     int d; \
+     pos = 0; \
+     d = 0; \
+     while (pos < H->fst[d]) \
+        { pos = (pos << 1) | bitget(stream,ptr); \
+          ptr++; d++; \
+        } \
+    symbol = (H->s.symb[ H->num[d] + pos - H->fst[d] ]); \
+   }
+
+
+#endif
+
diff --git a/swcsa/utils/huffDec.c b/swcsa/utils/huffDec.c

new file mode 100755 (executable)

index 0000000..aa8e4cc
--- /dev/null
+++ b/swcsa/utils/huffDec.c
@@ -0,0 +1,180 @@
+
+// implements canonical Huffman 
+
+#include "huffDec.h"
+
+
+//THuff createHuff (uint *fst, uint *num, uint depth) {
+//     THuff H;
+//     H.fst = fst;
+//     H.num = num;
+//     H.depth = depth;
+//     return H;       
+//}
+
+
+void printCodeHuffDec (THuffDec H, uint symb)
+
+   { uint pos;
+     uint code;
+     int d;
+     //pos = H.s.spos[symb];
+     pos = symb;
+     code = 0;
+     d = H.depth;
+     
+    // fprintf(stderr,"\n H.depth= %ld and pos is %ld\n",H.depth,pos); 
+     while (pos >= H.num[d])
+       { code = (code + H.num[d]) >> 1;
+         pos -= H.num[d--];
+       }
+     code += pos;
+     if (d > W) {fprintf(stderr,"code larger than W"); d=W;}
+   
+        //show the code.
+     while (d--)
+        { if ((code >> d) & 1) 
+               fprintf(stderr,"1");
+             else fprintf(stderr,"0");
+               }
+   } 
+
+
+
+//Decodes a code starting in position ptr from stream. Returns the ranking in the
+//vector of symbols.
+int decodeHuffDec (THuffDec *H, uint *symb, uint *stream, uint ptr)
+   { uint pos;
+     int d;
+     pos = 0;
+     d = 0;
+     while (pos < H->fst[d])
+        { pos = (pos << 1) | bitget(stream,ptr);
+          ptr++; d++;
+        }
+     *symb = H->num[d]+pos-H->fst[d];
+     return ptr;
+   }
+
+
+
+// //Decodes a code starting in position ptr from stream. Returns the ranking in the
+// //vector of symbols.
+// int decodeHuffDecVariantWord (uint *fst , uint *symb, uint *stream, uint ptr, uint depth)
+//    { uint pos;
+//      int d;
+//      pos = 0;
+//      d = 0;
+//      while ((d< depth) && (pos < fst[d*2])  )
+//         { pos = (pos << 1) | bitget(stream,ptr);
+//           ptr++; d++;
+//         }
+//     if (depth==d) 
+//             *symb = pos;
+//     else 
+//             *symb = 
+//                     pos  - 
+//                     fst[d*2] + 
+//                     fst[d*2+1]; 
+//      //(*symb) = (depth==d) ? pos : fst[d*2+1] + pos - fst[d*2]; 
+// 
+//      return ptr;
+//    }
+
+
+
+// the bytes used by HuffDecman struct
+uint sizeHuffDec (THuffDec H)
+   { return (1+ 2*(H.depth+1))*sizeof(uint);
+   }
+
+
+void freeHuffDec (THuffDec H)
+
+  { free (H.fst); free (H.num);
+  }
+
+
+THuffDec loadHuffDecAfterDecode (FILE *f, int enc)   //enc (0/1)-> do you only  want to perform encoding ??
+
+   { THuffDec H;
+//     int i,d,dold,dact;
+//
+//     fread (&H.max,sizeof(uint),1,f); 
+//     fread (&H.lim,sizeof(uint),1,f); 
+//     fread (&H.depth,sizeof(uint),1,f); 
+//
+//     H.s.symb = malloc ((H.lim+1)*sizeof(uint));
+//     fread (H.s.symb,sizeof(uint),H.lim+1,f); 
+// 
+//     H.fst = malloc ((H.depth+1)*sizeof(uint));
+//     fread (H.fst,sizeof(uint),H.depth+1,f);
+//
+//     H.num = malloc ((H.depth+1)*sizeof(uint));
+//     fread (H.num,sizeof(uint),H.depth+1,f); 
+//     
+     return H;
+   }   
+       
+       
+////THuffDec loadHuffDec (FILE *f, int enc)
+////
+////   { THuffDec H;
+////     uint *symb;
+////     uint *num;
+////     int i,d,dold,dact;
+////     fread (&H.max,sizeof(uint),1,f); 
+////     fread (&H.lim,sizeof(uint),1,f); 
+////     fread (&H.depth,sizeof(uint),1,f); 
+////     symb = malloc ((H.lim+1)*sizeof(uint));
+////     fread (symb,sizeof(uint),H.lim+1,f); 
+////     if (enc) 
+////          { H.s.spos = malloc ((H.max+1)*sizeof(uint));
+////            for (i=0;i<=H.max;i++) H.s.spos[i] = ~0;
+////            for (i=0;i<=H.lim;i++) H.s.spos[symb[i]] = i;
+////            free (symb);
+////     }
+////     else H.s.symb = symb;
+////     H.num = malloc ((H.depth+1)*sizeof(uint));
+////     fread (H.num,sizeof(uint),H.depth+1,f); 
+////     if (!enc) 
+////          { H.fst = malloc ((H.depth+1)*sizeof(uint));
+////            H.fst[H.depth] = 0; dold = 0;
+////            for (d=H.depth-1;d>=0;d--)
+////           { dact = H.num[d+1];
+////             H.fst[d] = (H.fst[d+1]+dact) >> 1;
+////             H.num[d+1] = dold;
+////             dold += dact;
+////           }
+////       H.num[0] = dold;
+////     }
+////     return H;
+////   }       
+
+
+void loadHuffDecAfterDecode2 (THuffDec *H, FILE *f, int enc)   //enc (0/1)-> do you only  want to perform encoding ??
+
+   { 
+     int i,d,dold,dact;
+
+//     fread (&H->max,sizeof(uint),1,f); 
+//     fread (&H->lim,sizeof(uint),1,f); 
+     fread (&H->depth,sizeof(uint),1,f); 
+
+//     H->s.symb = malloc ((H->lim+1)*sizeof(uint));
+//     fread (H->s.symb,sizeof(uint),H->lim+1,f); 
+ 
+     H->fst = (uint *) malloc ((H->depth+1)*sizeof(uint));
+     fread (H->fst,sizeof(uint),H->depth+1,f);
+
+     H->num = (uint *) malloc ((H->depth+1)*sizeof(uint));
+     fread (H->num,sizeof(uint),H->depth+1,f); 
+   }   
+       
+     
+     
+     
+
+
+     
+
diff --git a/swcsa/utils/huffDec.h b/swcsa/utils/huffDec.h

new file mode 100755 (executable)

index 0000000..7cf1b40
--- /dev/null
+++ b/swcsa/utils/huffDec.h
@@ -0,0 +1,214 @@
+
+// implements canonical Huffman !! Just for decoding when symbols were sorted before creating huffman
+
+#ifndef HUFFDECINCLUDED
+#define HUFFDECINCLUDED
+
+#include "basics.h"
+#define SORTED 1
+#define UNSORTED 0
+
+typedef struct
+   { //uint lim;   
+     uint depth; // max symbol length
+     uint *num;  // first pos of each length (dec), number of each length (enc)
+     uint *fst;  // first code (numeric) of each length (dec)        
+   } THuffDec;
+
+
+//typedef struct
+//   { uint max,lim;   // maximum symbol (0..max), same excluding zero freqs
+//     uint depth; // max symbol length
+//     union
+//       { uint *spos; // symbol positions after sorting by decr freq (enc)
+//      uint *symb; // symbols sorted by freq (dec)
+//       } s;
+//     uint *num;  // first pos of each length (dec), number of each length (enc)
+//     uint *fst;  // first code (numeric) of each length (dec)
+//     uint total; // total length to achieve, in bits
+//   } THuff;
+
+
+       // Decodes *symb using H, over stream[ptr...lim] (ptr and lim are
+       // bit positions of stream). Returns the new ptr.
+int decodeHuffDec (THuffDec *H, uint *symb, uint *stream, uint ptr);
+
+       // Writes H in file f   
+void saveHuffDec (THuffDec H, FILE *f);
+
+       // Frees H      
+void freeHuffDec (THuffDec H);
+
+       // the number of bytes used by HuffDecman struct.
+uint sizeHuffDec (THuffDec H);
+
+       // Loads H from file f, prepared for encoding or decoding depending
+       // on enc
+       
+THuffDec loadHuffDec (FILE *f, int enc);
+
+
+//Decodes a code starting in position ptr from stream. Returns the ranking in the
+//vector of symbols.
+#define decodeHuffDecMacro(H, symb, stream, ptr) \
+   { uint pos; \
+     int d; \
+     pos = 0; \
+     d = 0; \
+     while (pos < H->fst[d]) \
+        { pos = (pos << 1) | bitget(stream,ptr); \
+          ptr++; d++; \
+        } \
+     fflush(stdout); \
+     symb = H->num[d]+pos-H->fst[d]; \
+   }
+
+#endif
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+//#define decodeHuffDecMacroVALIDWORDS decodeHuffDecMacro
+
+//Decodes a code starting in position ptr from stream. Returns the ranking in the
+//vector of symbols. 
+//Saves space, as the last level for num[] and fst[] are not needed.
+//             at expenses of an extra-IF (the last line).
+#define decodeHuffDecMacroVALIDWORDS(H, symb, stream, ptr, depth) \
+   { uint pos; \
+     int d; \
+     pos = 0; \
+     d = 0; \
+     while ((pos < H->fst[d])  && (d< depth)) \
+        { pos = (pos << 1) | bitget(stream,ptr); \
+          ptr++; d++; \
+        } \
+     fflush(stdout); \
+     symb = (depth==d) ? pos : H->num[d]+pos-H->fst[d]; \
+   }
+
+
+//Decodes a code starting in position ptr from stream. Returns the ranking in the
+//vector of symbols.
+#define decodeHuffDecMacroVariantWordxx(fst,num, symb, stream, ptr, depth) \
+   { uint pos; \
+     int d; \
+     pos = 0; \
+     d = 0; \
+     while ((pos < fst[d])  && (d< depth)) \
+        { pos = (pos << 1) | bitget(stream,ptr); \
+          ptr++; d++; \
+        } \
+     fflush(stdout); \
+     symb = (depth==d) ? pos : num[d] + pos - fst[d]; \
+   }
+
+
+
+//Decodes a code starting in position ptr from stream. Returns the ranking in the
+//vector of symbols.
+// #define decodeHuffDecMacroVariantWord(fst,symb, stream, ptr, depth) \
+//    { uint pos; \
+//      int d; \
+//      pos = 0; \
+//      d = 0; \
+//      while ((d< depth) && (pos < fst[d*2])  ) \
+//         { pos = (pos << 1) | bitget(stream,ptr); \
+//           ptr++; d++; \
+//         } \
+//      symb = (depth==d) ? pos : fst[d*2+1] + pos - fst[d*2]; \
+//    }
+
+
+/** Decodes a variant of a word from a stream of compressed bits, starting in the ptr-th bit
+       The starting bucket in fstnum is "offbucket" [fst|num|fst|num|fst|num]
+       */
+#define decodeHuffDecMacroVariantWordPos(fstnum, offbucket, symb, stream, ptr, depth) \
+   { uint pos; \
+     register uint d; \
+     pos = 0; \
+     d = 0; \
+     while ((d< depth) && (pos < fstnum[offbucket + d*2])  ) \
+        { pos = (pos << 1) | bitget(stream,ptr); \
+          ptr++; d++; \
+        } \
+     symb = (depth==d) ? pos : fstnum[offbucket+d*2+1] + pos - fstnum[offbucket+d*2]; \
+   }
+#define decodeHuffDecMacroVariantWordPos2(HV, symb, stream, ptr, idCanonical) \
+   { uint pos; \
+     register uint d; \
+     uint *offfstnum = HV.offsetNumAndFst; \
+     uint *fstnum = HV.zoneNumFst; \
+        register uint offbucket = offfstnum[idCanonical]; \
+        register uint depth = (offfstnum[idCanonical+1] - offbucket)/2; \
+     pos = 0; \
+     d = 0; \
+     while ((d< depth) && (pos < fstnum[offbucket + d*2])  ) \
+        { pos = (pos << 1) | bitget(stream,ptr); \
+          ptr++; d++; \
+        } \
+     symb = (depth==d) ? pos : fstnum[offbucket+d*2+1] + pos - fstnum[offbucket+d*2]; \
+   }
+   
+   
+     //fst[0] = zone[0  ];
+        //fst[1] = zone[1*2];
+        //fst[i] = zone[i*2];
+
+     //num[0] = zone[0  +1];
+        //num[1] = zone[1*2+1];
+        //num[i] = zone[i*2+1];
+
+
+
+//#define decodeHuffDecMacroVariantWordPos2bits(HV, symb, stream, ptr, idCanonical) \
+//   { uint pos; \
+//     register uint d; \
+//     uint sizeBuckbits= HV.sizeBuckbits; \
+//     uint dirbElemSize = HV.dirbElemSize; \
+//     uint sizeFstbits = HV.sizeFstbits; \
+//     uint sizeNumbits = HV.sizeNumbits; \
+//     uint *dirb = HV.Dirb; \
+//     uint *zonefstnum = HV.zoneMem; \
+//      register uint offbucket; \
+//      offbucket = bitread (dirb, idCanonical*dirbElemSize, dirbElemSize); \
+//      register uint depth; \
+//      depth = bitread (dirb, (idCanonical+1)*dirbElemSize, dirbElemSize); \
+//      depth = (depth - offbucket)/sizeBuckbits; \
+//     pos = 0; \
+//     d = 0; \
+//     register uint currfst; \
+//     currfst = bitread (zonefstnum, (offbucket + d*sizeBuckbits), sizeFstbits); \
+//     while ((pos < currfst)  ) \
+//        { pos = (pos << 1) | bitget(stream,ptr); \
+//          ptr++; d++; \
+//          if (d<depth) \
+//             currfst = bitread (zonefstnum, (offbucket + d*sizeBuckbits), sizeFstbits); \
+//          else break; \
+//        } \
+//    if (depth==d) \
+//             symb=pos; \
+//    else { \
+//     uint currNum; \
+//     currNum = bitread (zonefstnum, (offbucket + d*sizeBuckbits+ sizeFstbits), sizeNumbits); \
+//     symb = currNum + pos - currfst; \
+//    } \
+//  }
+
+
+
+// int decodeHuffDecVariantWord (uint *fst , uint *symb, uint *stream, uint ptr, uint depth);
+
+
diff --git a/swcsa/utils/kBitArray.c b/swcsa/utils/kBitArray.c

new file mode 100755 (executable)

index 0000000..23af265
--- /dev/null
+++ b/swcsa/utils/kBitArray.c
@@ -0,0 +1,59 @@
+ #include "kBitArray.h"\r
+ \r
+/*-----------------------------------------------------------------\r
+ Initilization of the kBitArray\r
+ ---------------------------------------------------------------- */ \r
+ t_kBitArray create_kBitArray (uint size, uint elemSize) {\r
+       uint bitsNeeded; \r
+       t_kBitArray      V;\r
+       \r
+       V = (t_kBitArray) malloc (sizeof(struct akbitArr));\r
+       bitsNeeded = size * elemSize;\r
+       V->totalInts = ((bitsNeeded+W-1)/W);\r
+       V->data = (uint *) malloc((V->totalInts) * sizeof(uint));\r
+       V->data[V->totalInts-1]=0000; /** avoids valgrind to blame*/\r
+       V->size = size;\r
+       V->elemSize= elemSize;\r
+       printf("\nKbitVector[0,%d), of elemSize = %u initialized\n",V->size,V->elemSize);\r
+       printf("\ntotalInts = %u, size = %u  elemSize = %u\n",V->totalInts,V->size,V->elemSize);\r
+       return V;\r
+}\r
+\r
+uint getKBitArray(t_kBitArray V, register uint i) {\r
+       register uint eSize = V->elemSize;\r
+       register uint answ;\r
+       register uint pos = i * eSize;\r
+       mybitread(answ, V->data, pos, eSize);\r
+       return (answ);\r
+       //return ( bitread(V->data,i * eSize,eSize)); \r
+}\r
+\r
+uint getKBitArraySinMacro(t_kBitArray V, register uint i) {\r
+       register uint eSize = V->elemSize;\r
+       return ( bitread(V->data,i * eSize,eSize)); \r
+}\r
+\r
+void setKBitArray(t_kBitArray V, uint i, uint value){\r
+       register uint eSize = V->elemSize;      \r
+       register uint pos = i * eSize;\r
+       mybitwrite(V->data, pos, eSize, value);\r
+}\r
+\r
+void setKBitArraySinMacro(t_kBitArray V, uint i, uint value){\r
+       register uint eSize = V->elemSize;\r
+       bitwrite(V->data,i *eSize, eSize, value);\r
+}\r
+\r
+/*-----------------------------------------------------------------\r
+ freeing resources\r
+ ---------------------------------------------------------------- */ \r
+void destroy_kBitArray (t_kBitArray kBitArray) {\r
+       uint total;\r
+       total = (kBitArray->totalInts) * sizeof(uint);\r
+       \r
+       free(kBitArray->data);\r
+       free(kBitArray);        \r
+       printf("\n[destroying a Kbit array of size table]...Freed %u bytes... RAM", total);             \r
+}\r
+\r
+\r
diff --git a/swcsa/utils/kBitArray.h b/swcsa/utils/kBitArray.h

new file mode 100755 (executable)

index 0000000..e929b11
--- /dev/null
+++ b/swcsa/utils/kBitArray.h
@@ -0,0 +1,23 @@
+#include "basics.h"\r
+ \r
+struct akbitArr {\r
+       uint *data;        /* uint * contains space for the array of kbit elements      */\r
+       uint size;     /* number of kbitElements    */\r
+       uint elemSize; /* number of bits of each element*/      \r
+       uint totalInts; /* number of ints used */\r
+};\r
+\r
+typedef struct akbitArr *t_kBitArray;\r
+\r
+       /*********/\r
+       t_kBitArray create_kBitArray (uint size, uint elemSize);\r
+       uint getKBitArray(t_kBitArray V,uint i);\r
+       void setKBitArray(t_kBitArray V, uint i, uint value);\r
+       void destroy_kBitArray (t_kBitArray kBitArray);\r
+\r
+\r
+       #define getKBitArrayMacro mybitread  //mybitread(answ, v, p, len)\r
+\r
+\r
+\r
+\r
diff --git a/swcsa/utils/parameters.c b/swcsa/utils/parameters.c

new file mode 100644 (file)

index 0000000..9d06475
--- /dev/null
+++ b/swcsa/utils/parameters.c
@@ -0,0 +1,47 @@
+
+#include "parameters.h"
+
+/***********************************************************************************/
+/*** FUNCTIONS USED FOR PARSING PARAMETERS FROM COMMAND LINE ***********************/
+/* Three function to variables to manage parameters */
+ bool is_delimeter(char *delimiters, char c) {
+   int i=0,len_delimiters=strlen(delimiters);
+   bool is=false;
+   for (i=0;i<len_delimiters;i++)
+     if (c == delimiters[i]) is=true;
+   return is;
+}
+
+
+ void parse_parameters(char *options, int *num_parameters, char ***parameters, char *delimiters) {
+  int i=0,j=0,temp=0,num=0, len_options=strlen(options);
+  char *options_temp;
+  while  (i<len_options) {
+    while ((i<len_options) && is_delimeter(delimiters,options[i])) i++;
+    temp=i;
+    while ((i<len_options) && !is_delimeter(delimiters,options[i])) i++;
+    if (i!=temp) num++;
+  }
+  (*parameters) = (char **) malloc(num*sizeof(char * ));
+  i=0;
+  while  (i<len_options) {
+    while ((i<len_options) && is_delimeter(delimiters,options[i])) i++;
+    temp=i;
+    while ((i<len_options) && !is_delimeter(delimiters,options[i])) i++;
+    if (i!=temp) {
+      (*parameters)[j]=(char *) malloc((i-temp+1)*sizeof(char));
+      options_temp = options+temp;
+      strncpy((*parameters)[j], options_temp, i-temp);
+      ((*parameters)[j])[i-temp] = '\0';
+      j++;
+    }
+  }
+  *num_parameters = num;
+}
+
+ void free_parameters(int num_parameters,char ***parameters) {
+  int i=0;
+  for (i=0; i<num_parameters;i++)
+    free((*parameters)[i]);
+  free((*parameters));
+}
diff --git a/swcsa/utils/parameters.h b/swcsa/utils/parameters.h

new file mode 100644 (file)

index 0000000..f13da9f
--- /dev/null
+++ b/swcsa/utils/parameters.h
@@ -0,0 +1,11 @@
+#include <string.h>
+#include <stdlib.h>
+/***********************************************************************************/
+/*** FUNCTIONS USED FOR PARSING PARAMETERS FROM COMMAND LINE ***********************/
+
+#ifndef PARAMETERS_INCLUDED
+#define PARAMETERS_INCLUDED
+ bool is_delimeter(char *delimiters, char c) ;
+ void parse_parameters(char *options, int *num_parameters, char ***parameters, char *delimiters);
+ void free_parameters(int num_parameters,char ***parameters);
+#endif
diff --git a/swcsa/utils/valstring.c b/swcsa/utils/valstring.c

new file mode 100755 (executable)

index 0000000..d27c747
--- /dev/null
+++ b/swcsa/utils/valstring.c
@@ -0,0 +1,130 @@
+#include "valstring.h"
+#include <stdio.h>
+
+unsigned char  _Valid[256];
+unsigned char  _Invalid[256];
+
+unsigned char _toLow[256];
+
+
+#ifndef ValidCh
+
+ #define ValidCh(ch)  (isalnum(ch)) /* Teste de validacao        */   
+ #define InvalidCh(ch) (!ValidCh(ch))
+
+#endif
+
+
+
+
+void StartValid() {
+
+  unsigned x;
+
+  for(x=0;x<128;x++) {
+    if(ValidCh(x)) {
+      _Valid[x]=1;
+      _Invalid[x]=0;
+    }
+    else {
+      _Valid[x]=0;
+      _Invalid[x]=1;
+    }
+  }
+  for(x=128;x<256;x++) {
+    _Valid[x]=0;
+    _Invalid[x]=1;
+  }
+  
+  _Valid[0]=_Invalid[0]=0;
+
+  // Caracteres especiales (acentuados, dieresis...)
+  // Caracter 'ñ'
+  _Valid[241]=1;
+  _Invalid[241]=0;
+  // Caracter 'Ñ'
+  _Valid[209]=1;
+  _Invalid[209]=0;
+  // Caracter 'á'
+  _Valid[225]=1;
+  _Invalid[225]=0;
+  // Caracter 'é'
+  _Valid[233]=1;
+  _Invalid[233]=0;
+  // Caracter 'í'
+  _Valid[237]=1;
+  _Invalid[237]=0;
+  // Caracter 'ó'
+  _Valid[243]=1;
+  _Invalid[243]=0;
+  // Caracter 'ú'
+  _Valid[250]=1;
+  _Invalid[250]=0;
+  // Caracter 'Á'
+  _Valid[193]=1;
+  _Invalid[193]=0;
+  // Caracter 'É'
+  _Valid[201]=1;
+  _Invalid[201]=0;
+  // Caracter 'Í'
+  _Valid[205]=1;
+  _Invalid[205]=0;
+  // Caracter 'Ó'
+  _Valid[211]=1;
+  _Invalid[211]=0;
+  // Caracter 'Ú'
+  _Valid[218]=1;
+  _Invalid[218]=0;
+  // Caracter 'ü'
+  _Valid[252]=1;
+  _Invalid[252]=0;
+  // Caracter 'Ü'
+  _Valid[220]=1;
+  _Invalid[220]=0;
+      
+}
+
+
+
+
+void StartToLow() {
+       int i;
+       unsigned char c;
+       for (i=0;i<256;i++) {
+               c=i;
+               if( (c >= 'A') && (c <= 'Z') ){
+                       _toLow[i]= c+ 'a'-'A';
+               }
+               else if (!_Valid[c]) {
+                       _toLow[i]=c;    
+               }               
+               else {
+                       switch(c) {                     
+                               case 192: case 193: case 194: case 195: case 196: case 197: case 224: case 225: 
+                               case 226: case 227: case 228: case 229: 
+                                       c = 97; break;
+                       
+                               case 201: case 202: case 203: case 232: case 233: case 234: case 235: 
+                                       c = 101; break;
+                               
+                               case 204: case 205: case 206: case 207: case 236: case 237: case 238: case 239: 
+                                       c = 105; break;
+                               
+                               case 210: case 211: case 212: case 213: case 214: case 242: case 243: case 244: 
+                                         case 245: case 246: 
+                                       c = 111; break;
+                               
+                               case 217: case 218: case 219: case 220: case 249: case 250: case 251: case 252: 
+                                       c = 117; break;
+                               
+                               case 209: c = 241; break;
+                               
+                               case 138: case 154: c = 115; break;
+                               
+                               case 159: case 221: case 253: case 255: 
+                                       c = 121; break;                         
+                       }
+                       _toLow[i]=c;
+               }
+       }       
+}
diff --git a/swcsa/utils/valstring.h b/swcsa/utils/valstring.h

new file mode 100755 (executable)

index 0000000..93a59a3
--- /dev/null
+++ b/swcsa/utils/valstring.h
@@ -0,0 +1,17 @@
+#include <ctype.h>
+
+#ifndef ValidCh
+
+ #define ValidCh(ch)  (isalnum(ch)) /* Teste de validacao      */   
+ #define InvalidCh(ch) (!ValidCh(ch))
+
+#endif
+
+
+extern unsigned char  _Valid[256];
+extern unsigned char  _Invalid[256];
+
+extern unsigned char  _toLow[256];
+
+void StartValid();
+void StartToLow();
author	nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
	Sat, 23 Oct 2010 12:32:46 +0000 (12:32 +0000)
committer	nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
	Sat, 23 Oct 2010 12:32:46 +0000 (12:32 +0000)
makefile		patch \| blob \| history
swcsa/Makefile	[new file with mode: 0644]	patch \| blob
swcsa/buildAll.c	[new file with mode: 0644]	patch \| blob
swcsa/buildFacade.c	[new file with mode: 0755]	patch \| blob
swcsa/buildFacade.h	[new file with mode: 0755]	patch \| blob
swcsa/buildIntIndex.c	[new file with mode: 0644]	patch \| blob
swcsa/buildParser.c	[new file with mode: 0644]	patch \| blob
swcsa/buildStats.c	[new file with mode: 0644]	patch \| blob
swcsa/intIndex/Makefile	[new file with mode: 0755]	patch \| blob
swcsa/intIndex/defValues.h	[new file with mode: 0644]	patch \| blob
swcsa/intIndex/icsa.c	[new file with mode: 0644]	patch \| blob
swcsa/intIndex/icsa.h	[new file with mode: 0644]	patch \| blob
swcsa/intIndex/interfaceIntIndex.h	[new file with mode: 0644]	patch \| blob
swcsa/intIndex/psiDeltaCode.c	[new file with mode: 0644]	patch \| blob
swcsa/intIndex/psiDeltaCode.h	[new file with mode: 0644]	patch \| blob
swcsa/intIndex/psiGonzalo.c	[new file with mode: 0644]	patch \| blob
swcsa/intIndex/psiGonzalo.h	[new file with mode: 0644]	patch \| blob
swcsa/intIndex/psiHuffmanRLE.c	[new file with mode: 0644]	patch \| blob
swcsa/intIndex/psiHuffmanRLE.h	[new file with mode: 0644]	patch \| blob
swcsa/interface.h	[new file with mode: 0755]	patch \| blob
swcsa/utils/MemoryManager.c	[new file with mode: 0755]	patch \| blob
swcsa/utils/MemoryManager.h	[new file with mode: 0755]	patch \| blob
swcsa/utils/basics.c	[new file with mode: 0755]	patch \| blob
swcsa/utils/basics.h	[new file with mode: 0755]	patch \| blob
swcsa/utils/bitmap.c	[new file with mode: 0755]	patch \| blob
swcsa/utils/bitmap.h	[new file with mode: 0755]	patch \| blob
swcsa/utils/defValues.h	[new file with mode: 0755]	patch \| blob
swcsa/utils/errors.c	[new file with mode: 0755]	patch \| blob
swcsa/utils/fileInfo.c	[new file with mode: 0755]	patch \| blob
swcsa/utils/fileInfo.h	[new file with mode: 0755]	patch \| blob
swcsa/utils/hash.c	[new file with mode: 0755]	patch \| blob
swcsa/utils/hash.h	[new file with mode: 0755]	patch \| blob
swcsa/utils/huff.c	[new file with mode: 0755]	patch \| blob
swcsa/utils/huff.h	[new file with mode: 0755]	patch \| blob
swcsa/utils/huffDec.c	[new file with mode: 0755]	patch \| blob
swcsa/utils/huffDec.h	[new file with mode: 0755]	patch \| blob
swcsa/utils/kBitArray.c	[new file with mode: 0755]	patch \| blob
swcsa/utils/kBitArray.h	[new file with mode: 0755]	patch \| blob
swcsa/utils/parameters.c	[new file with mode: 0644]	patch \| blob
swcsa/utils/parameters.h	[new file with mode: 0644]	patch \| blob
swcsa/utils/valstring.c	[new file with mode: 0755]	patch \| blob
swcsa/utils/valstring.h	[new file with mode: 0755]	patch \| blob