From: nvalimak Date: Sat, 23 Oct 2010 12:32:46 +0000 (+0000) Subject: Added simple WCSA X-Git-Url: http://git.nguyen.vg/gitweb/?p=SXSI%2FTextCollection.git;a=commitdiff_plain;h=102e33b134075765e6d4e0c38bc1307568ce5602 Added simple WCSA git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@919 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- diff --git a/makefile b/makefile index 046043b..f12c965 100644 --- a/makefile +++ b/makefile @@ -4,11 +4,12 @@ CPPFLAGS = -Wall -ansi -g -I$(LIBCDSPATH)/includes/ -O3 -DNDEBUG LIBCDSA = $(LIBCDSPATH)/lib/libcds.a LIBRLCSA = incbwt/rlcsa.a LIBLZTRIE = lzindex/lztrie.a +LIBSWCSA = swcsa/swcsa.a dcover_obs = dcover/difference_cover.o TextCollection_obs = TextCollection.o TextCollectionBuilder.o TCImplementation.o Tools.o BitRank.o \ - TextStorage.o ${LIBRLCSA} ${LIBCDSA} ${LIBLZTRIE} + TextStorage.o ${LIBRLCSA} ${LIBCDSA} ${LIBLZTRIE} ${LIBSWCSA} TCDebug_obs = bittree.o rbtree.o dynFMI.o all: testTextCollection @@ -28,11 +29,18 @@ incbwt/rlcsa.a: lzindex/lztrie.a: @make -C lzindex +swcsa/swcsa.a: + @make -C swcsa + clean: @make clean -C incbwt @make clean -C lzindex + @make clean -C swcsa rm -f core *.o *~ testTextCollection timeTextCollection dcover/*.o dcover/*~ +shallow_clean: + rm -f core *.o *~ testTextCollection timeTextCollection + depend: @make depend -C incbwt $(CC) -MM *.cpp > dependencies.mk diff --git a/swcsa/Makefile b/swcsa/Makefile new file mode 100644 index 0000000..1d731af --- /dev/null +++ b/swcsa/Makefile @@ -0,0 +1,75 @@ +SRCDIRUTILS = utils +SRCDIRCSA = intIndex +CC = g++ + +# If you have trouble with make, e.g.: +# /usr/bin/ld: skipping incompatible /usr/lib/gcc/x86_64-linux-gnu/4.4.3/libstdc++.a when searching for -lstdc++ +# ... +# try adding +# ln -s /usr/lib32/libstdc++.so.6.0.13 libstdc++.so +# +# The filename libstdc++.so.6.0.13 is probably different +# but any from /usr/lib32 is fine. +export CFLAGS = -O9 -m32 -L. -D_FORTIFY_SOURCE=0 +#export CFLAGS = -O0 -m32 -pg +#export CFLAGS = -g -m32 -O0 + +LIBINDEX = swcsa.a +LIBINTINDEX = icsa.a + +all: clean wcsa cleanO + +wcsa: intIndexPackage buildFacade.o parameters.o hash.o valstring.o MemoryManager.o basics.o \ + bitmap.o huffDec.o huff.o fileInfo.o + ar rc $(LIBINTINDEX) parameters.o buildFacade.o hash.o valstring.o MemoryManager.o basics.o \ + bitmap.o huffDec.o huff.o fileInfo.o + mv $(LIBINTINDEX) $(LIBINDEX) + +################# SELF INDEX ON INTEGERS ############################## +intIndexPackage: + cd $(SRCDIRCSA) && $(MAKE) -w + @echo "[copying the int-index lib into current dir]" + @cp $(SRCDIRCSA)/$(LIBINTINDEX) . + + +####################### UTILS MODULES ################################# + +parameters.o: + $(CC) $(CFLAGS) -c $(SRCDIRUTILS)/parameters.c + +fileInfo.o: + $(CC) $(CFLAGS) -c $(SRCDIRUTILS)/fileInfo.c + +hash.o: MemoryManager.o + $(CC) $(CFLAGS) -c $(SRCDIRUTILS)/hash.c + + +MemoryManager.o: + $(CC) $(CFLAGS) -c $(SRCDIRUTILS)/MemoryManager.c + +valstring.o: + $(CC) $(CFLAGS) -c $(SRCDIRUTILS)/valstring.c + +huff.o: + $(CC) $(CFLAGS) -c $(SRCDIRUTILS)/huff.c + +huffDec.o: + $(CC) $(CFLAGS) -c $(SRCDIRUTILS)/huffDec.c + + +basics.o: + $(CC) $(CFLAGS) -c $(SRCDIRUTILS)/basics.c + +bitmap.o: + $(CC) $(CFLAGS) -c $(SRCDIRUTILS)/bitmap.c + + +############################ CLEANING ################################# + +cleanO: + rm -f *.o + +clean: + cd $(SRCDIRCSA) && $(MAKE) clean -w + rm -rf *~ *% *.o core *.bak $(LIBINTINDEX) $(LIBINDEX) + diff --git a/swcsa/buildAll.c b/swcsa/buildAll.c new file mode 100644 index 0000000..b114d3a --- /dev/null +++ b/swcsa/buildAll.c @@ -0,0 +1,365 @@ +#include "buildFacade.h" + +/**------------------------------------------------------------------ + * MAIN PROGRAM. + *------------------------------------------------------------------ */ + + int main(int argc, char* argv[]) + { + + char *infile, *outbasename, *stopwordsfile; // Name of in/out files + byte *inputBuffer; + ulong finsize; + + int f_in; + void *Index; + + + printf("\n*Presentation level for CSA (simple WCSA)"); + printf("\n*CopyRight (c) 2007 [LBD & G.N.]\n\n"); + + // Reads input parameters from command line. + if(argc < 3) { + printf("Use: %s [build_options]\n", argv[0]); + exit(0); + } + + // Reads params (input file, output basename, and stopwords file) + infile = argv[1]; + outbasename = argv[2]; + stopwordsfile = argv[3]; + + finsize= fileSize(infile); + + if (! finsize) { + printf( "\nFILE EMPTY OR FILE NOT FOUND %s !!\nSkipping processement ...\n",infile); + exit(0); + } + + // Opening the input text file. + if( (f_in = open(infile, O_RDONLY)) < 0) { + printf("Cannot read file %s\n", infile); + exit(0); + } + inputBuffer = (byte *) malloc(finsize *sizeof(byte));// +1); + read (f_in,inputBuffer,finsize); + close (f_in); + + + { + //printf("\n parametros <<%s>>\n\n",stopwordsfile); + // build_WCSA (inputBuffer, finsize, stopwordsfile, NULL,outbasename); + build_index (inputBuffer, finsize, stopwordsfile, &Index); /** building the index */ + + + /** saving the index to disk*/ + + save_index (Index, outbasename); + fprintf(stderr,"Index saved !! "); + + /** tells the mem used by the index */ + ulong indexsize; + index_size(Index, &indexsize); + fprintf(stderr,"Index occupied %d bytes, 2 extra mallocs = %d",indexsize,2* sizeof(uint)); + + + /** recovering the source text from the index */ + { + double start, end; + start = getTime2(); + ulong size; + get_length(Index, &size); + + fprintf(stderr, "\nRecovering source file "); fflush(stderr); + char ext1[10]=".source"; + recoverSourceText1((twcsa*) Index, outbasename,ext1, size); + end = getTime2(); + fprintf(stderr, " time: %.3f secs\n", end-start ); + + start=end; + char ext2[10]=".source2"; + fprintf(stderr, "\nRecovering source file "); fflush(stderr); + recoverSourceText2((twcsa*) Index, outbasename,ext2,size); + end = getTime2(); + fprintf(stderr, " time: %.3f secs\n", end-start ); + //fprintf(stderr, "\nRecovering source file time: %.3f secs\n", end-start ); + } + + // DISPLAYING THE OCCURRENCES OF A TEXT PATTERN (word/phrase). + {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE]; + int error = 0; + ulong numocc,numc, length, i, *snippet_len, tot_numcharext = 0, numpatt; + uchar *pattern, *snippet_text; + + pattern = textPattern; + printf("\nSEARCH TEST for DISPLAY (pizzachili interface)\n"); + while(1) { + printf("Intro string: "); + fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin); + if (!strcmp((char*)textPattern,"\n") ) break; + textPattern[strlen((char*)textPattern)-1] = '\0'; + + length = strlen( (char*)textPattern); + numc=50; + +// error = display (Index, textPattern, length, numc, &numocc, +// &snippet_text, &snippet_len); + error = displayWords (Index, textPattern, length, numc, &numocc, + &snippet_text, &snippet_len,1); + + if (error){ fprintf(stderr, "%s\n", "Hubo un error durante display");exit(0);} + + fprintf(stderr,"\n acabou display");fflush(stderr); + {//show the results + ulong j, len = length + 2*numc; + char blank = '\0'; + fprintf(stderr,"\n length = %d",length); + fprintf(stderr,"\n pattern = %s",pattern);fflush(stderr); + fprintf(stderr,"\n numocc = %d",numocc);fflush(stderr); + fprintf(stderr,"\n snippet len = %d",len);fflush(stderr); + fprintf(stderr,"\n =========");fflush(stderr); + for (i = 0; i < numocc; i++){ + fprintf(stderr,"\n[%2d][len=%3d]<<",i+1,snippet_len[i]);fflush(stderr); + fwrite(snippet_text+len*i,sizeof(uchar),snippet_len[i],stderr);fflush(stderr); + fprintf(stderr,">>");fflush(stderr); + } + } + numpatt--; + + for(i=0; i0) free(occs); + + if (!strcmp((char*)textPattern,"\n") ) break; + } + } + + + + + /** freeing the index */ + free_index(Index); + + } +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +//{ +// bG = createBitmap (B,len); +// bE = createBitmapEdu (B,len); +// //fprintf(stderr,"\n RANK DE GONZALO DA %u, de EDU DA %u\n",rank(bG,33),rank1Edu(bE,33)); +// //fprintf(stderr,"\n SELECT1(2) DE GONZALO DA %u, de EDU DA %u\n",bselect(bG,4),bselect(bE,4)); +// +// showBitVector(bitvector,34); +//} + + + +/* + + //USING A HASH TABLE + // { + // char a[20]="beginnings";char b[20]="HOSTIAS"; + // char *w; + // int i; + // w=a; + // i = inHashTable(stopwordshash,w, strlen(w), &addrInTH ); + // if (!i) insertElement (stopwordshash, w, strlen(w), &addrInTH); + // else stopwordshash->hash[addrInTH].freq++; + // //fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH); + // //fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, stopwordshash->hash[addrInTH].word, stopwordshash->hash[addrInTH].freq, stopwordshash->hash[addrInTH].posInVoc); + // } + + + + /// ENCODING THE separators ... +{ + freeHuff(gapsHuffman); + uint i; + uint *bitvector; + uint bitvectorSize; + uint ptr; + bitmap bG,bE; + uint len; + len = 1000; //number of bits + bitvector = (uint *) malloc ((len/32 +1)* sizeof(uint)); + + byte texto[100] = "####@?*"; + uint freqs[256]; + + //fprintf(stderr,"\n este es el texto a codificar: %s",texto); + for (i=0;i<256;i++) freqs[i]=0; + for (i=0;isourceTextSize), sizeof(uint)); + write(file, &(wcsa->seSize), sizeof(uint)); + close(file); + } + + /** The Words in the vocabulary of words (sorted alphabetically)*/ + { strcpy(outfilename, basename); + strcat(outfilename, "."); + strcat(outfilename, VOCABULARY_WORDS_FILE_EXT); + unlink(outfilename); + if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) { + printf("Cannot open file %s\n", outfilename); + exit(0); + } + + uint n = wcsa->n; + uint elemSize = wcsa->wordsData.elemSize; + write(file, &n, sizeof(uint)); + write(file, &elemSize, sizeof(uint)); + write(file, &(wcsa->wordsData.wordsZoneMem.size), sizeof(uint)); + + //the number of canonical words + write(file, (char *)wcsa->wordsData.wordsZoneMem.zone, wcsa->wordsData.wordsZoneMem.size * sizeof(byte)); + write(file, (char *)wcsa->wordsData.words, ((((n+1)* (elemSize))+W-1) /W) * (sizeof(uint)) ); + + close(file); + } + + free(outfilename); + + if (wcsa->myicsa) { + /******** saves index on integers (bottom) ******/ + //Storing the CSA + //storeStructsCSA(wcsa->myicsa,basename); + saveIntIndex((void *) wcsa->myicsa, basename); + } + + if (wcsa->se) { + saveSEfile(basename,wcsa->se, wcsa->seSize+1); + //free(wcsa->se); + } + + return 0; +} + + + + /** Loads index from one or more file(s) named filename, possibly + adding the proper extensions. */ +int load_index(char *filename, void **index){ + twcsa *wcsa; + wcsa = loadWCSA (filename); + (*index) = (void *) wcsa; + return 0; +} + + /** Frees the memory occupied by index. */ +int free_index(void *index){ + twcsa *wcsa=(twcsa *) index; + ulong size; + index_size(index,&size); + printf("\n[destroying index] ...Freed %lu bytes... RAM", size); + + + //frees the array SE. + if (wcsa->se) + free (wcsa->se); + + //the iCSA. + if (wcsa->myicsa) { + //destroyStructsCSA(wcsa->myicsa); + int err = freeIntIndex((void *) wcsa->myicsa); + } + + //the words. + free (wcsa->wordsData.wordsZoneMem.zone); + free (wcsa->wordsData.words); /** huge!! */ + + //the pointer to wcsa. + free(wcsa); + return 0; +} + + /** Gives the memory occupied by index in bytes. */ +int index_size(void *index, ulong *size) { + ulong totaltmp; + twcsa *wcsa=(twcsa *)index; + uint n= wcsa->n; + *size=0; + *size += sizeof(twcsa); + + totaltmp=0; //words + totaltmp += ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)); //the pointers + totaltmp += wcsa->wordsData.wordsZoneMem.size * sizeof(byte); //the characters of the words. + *size += totaltmp; + + if (wcsa->myicsa) { + uint nbytes; + int err = sizeIntIndex((void *) wcsa->myicsa, &nbytes); + *size += nbytes; + //*size += CSA_size(wcsa->myicsa); + } + + return 0; +} + + +/** Querying the index =============================================================*/ + + /* Writes in numocc the number of occurrences of the substring + pattern[0..length-1] found in the text indexed by index. */ +int count (void *index, uchar *pattern, ulong length, ulong *numocc){ + uint integerPatterns[MAX_INTEGER_PATTERN_SIZE]; + uint integerPatternSize; + ulong l,r; + + twcsa *wcsa=(twcsa *) index; + parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize); + if (!integerPatternSize) {*numocc=0; return 0;} //not found + + //*numocc = countCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &l, &r); + int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, numocc, &l, &r); + return 0; +} + + /* Writes in numocc the number of occurrences of the substring + pattern[0..length-1] in the text indexed by index. It also allocates + occ (which must be freed by the caller) and writes the locations of + the numocc occurrences in occ, in arbitrary order. */ +int locate(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc){ + return 99; +} + + /* Gives the length of the text indexed */ +int get_length(void *index, ulong *length) { + twcsa *wcsa=(twcsa *) index; + *length = wcsa->sourceTextSize; + return 0; +} + + /** Obtains the length of the text indexed by index. */ + +int length (void *index, ulong *length) { + return (get_length(index,length)); +} + + +/** *********************************************************************************** + * Accessing the indexed text + * ***********************************************************************************/ + + + /** Allocates snippet (which must be freed by the caller) and writes + the substring text[from..to] into it. Returns in snippet_length the + length of the text snippet actually extracted (that could be less + than to-from+1 if to is larger than the text size). */ +int extract (void *index, ulong from, ulong to, uchar **snippet, ulong *snippet_length) { + twcsa *wcsa=(twcsa *) index; + return 99; +} + + /** Displays the text (snippet) surrounding any occurrence of the + substring pattern[0..length-1] within the text indexed by index. + The snippet must include numc characters before and after the + pattern occurrence, totalizing length+2*numc characters, or less if + the text boundaries are reached. Writes in numocc the number of + occurrences, and allocates the arrays snippet_text and + snippet_lengths (which must be freed by the caller). The first is a + character array of numocc*(length+2*numc) characters, with a new + snippet starting at every multiple of length+2*numc. The second + gives the real length of each of the numocc snippets. */ + +int display (void *index, uchar *pattern, ulong length, ulong numc, + ulong *numocc, uchar **snippet_text, ulong **snippet_lengths) { + return 99; +} + + + +/** *********************************************************************************** + * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord + * ***********************************************************************************/ + /* Writes in numocc the number of occurrences of the substring + pattern[0..length-1] in the text indexed by index. It also allocates + occ (which must be freed by the caller) and writes the locations of + the numocc occurrences in occ, in arbitrary order. These occurrences + refer to the offsets in TOH where the caller could start a display + operation. So locateWord implies synchronization using B. + Moreover, positions occ[numocc.. 2*numocc-1] is set with the rank in SE of the + words whose codes begin in TOH in the positions in occ[0... numocc-1] + ** Parameter kbefore sets locateWord not to obtain the offset in TOH of the + searched word, but the offset in TOH of k-before words before. + */ + +int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore){ + uint integerPatterns[MAX_INTEGER_PATTERN_SIZE]; + uint integerPatternSize; + ulong occurrences,l,r; + twcsa *wcsa=(twcsa *) index; + + parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize); + if (!integerPatternSize) {*numocc=0; return 0;} //not found + + ulong *seOffsets; + + //obtains the indexes in vector SE where the pattern appears. + //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences); + int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences); + + *numocc = occurrences; + + if (!occurrences) {(*occ)=NULL;return 0;} + + (*occ) = (ulong *)seOffsets; + return 0; +} + + + /** Displays the text (snippet) surrounding any occurrence of the + substring pattern[0..length-1] within the text indexed by index. + The snippet must include numc characters before and after the + pattern occurrence, totalizing length+2*numc characters, or less if + the text boundaries are reached. Writes in numocc the number of + occurrences, and allocates the arrays snippet_text and + snippet_lengths (which must be freed by the caller). The first is a + character array of numocc*(length+2*numc) characters, with a new + snippet starting at every multiple of length+2*numc. The second + gives the real length of each of the numocc snippets. */ + + int displayWords (void *index, uchar *pattern, ulong length, ulong numc, + ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore) { + + /** actually extracts upto length + 2*numc chars, starting extraction kbefore + * words before the occurrence **/ + + ulong *indexesInSE; + ulong occurrences; + uint bytesPerSnippet; + byte *text_aux; + twcsa *wcsa=(twcsa *) index; + + locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0); + (*numocc) = occurrences; + + if (!occurrences) { + *snippet_text =NULL; + *snippet_lengths =NULL; + return 0; + } + + bytesPerSnippet = length+2*numc; +// bytesPerSnippet = 2*numc; + *snippet_lengths = (ulong *) malloc((*numocc)*sizeof(ulong)); + if (!(*snippet_lengths)) return 1; + *snippet_text = (uchar *) malloc((*numocc)*(bytesPerSnippet)*sizeof(uchar) +1) ; //(the last "1" is for '\0'); + if (!(*snippet_text)) return 1; + + // fprintf(stderr,"\n occs found = %7d for pattern %s",*numocc, pattern); + // fflush(stderr); + + text_aux=*snippet_text; + { + uint i, j, tmplen; + uint ptr, maxptr; + byte *src, *dst; + uint snippetLen; + uint posSEValue,indexSE; + + for (i=0;i kbefore) ? indexSE-kbefore : 0; + + dst = text_aux; + while ((!endSnippet) && (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */ + + //posSEValue =displayCSA(wcsa->myicsa,indexSE); + int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue); + + {//obtains pointer to the ith word + uint offtmp; + uint ith = posSEValue -1; // !! + tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); + offtmp = bitread (wcsa->wordsData.words, ( ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); + tmplen -=offtmp; //the lenght of the ith word. + + src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp; + } + + if (_Valid[*src]) { + if (prevValid){ + *dst++ =' '; + snippetLen++; + if (snippetLen==bytesPerSnippet) break; //end of snippet (ends in BLANK_SPACE) + } + prevValid =1; //for the next iteration + } + else prevValid=0; + + indexSE++; + + /* at the end ?? */ + if ((tmplen+snippetLen)>=bytesPerSnippet) { + tmplen =(bytesPerSnippet - snippetLen); + endSnippet=1; //so while loop ends; + } + + for (j=0;j wordsbefore) ? indexSE-wordsbefore : 0; + + dst = text_aux; + z=0; + while ((zseSize) ){ /** extracting words (if not at the end) */ + + //posSEValue =displayCSA(wcsa->myicsa,indexSE); + int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue); + + {//obtains pointer to the ith word + uint offtmp; + uint ith = posSEValue -1; // !! + tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); + offtmp = bitread (wcsa->wordsData.words, ( ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); + tmplen -=offtmp; //the lenght of the ith word. + + src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp; + } + + if (_Valid[*src]) { + if (prevValid){ + *dst++ =' '; + snippetLen++; + if (snippetLen==maxsnippetLen) break; //end of snippet (ends in BLANK_SPACE) + } + prevValid =1; //for the next iteration + } + else prevValid=0; + + indexSE++; + + /* at the end ?? */ + if ((tmplen+snippetLen)>=maxsnippetLen) { + break; + } + + //fprintf(stderr,"\ntmplen = %d ",tmplen); fflush(stderr); + for (j=0;j wcsa->seSize) toWord = wcsa->seSize; + if (fromWord >= wcsa->seSize) fromWord = wcsa->seSize-1; + if (buffBytes < ( (toWord-fromWord)* avgWordLen)) buffBytes = ((toWord-fromWord)* avgWordLen); + + buff = (uchar *) malloc (buffBytes * sizeof(char)); + if (!buff) return 1; //out of memory. + dst = buff; + + register uint indexSE=fromWord; + uint posSEValue=0; + register uint ith; + + while ( (indexSE < toWord) ){ /** extracting words (if not at the end) */ + + int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue); + + {//obtains pointer to the ith word + uint offtmp; + ith= posSEValue -1; // !! + tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); + offtmp = bitread (wcsa->wordsData.words, (ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); + tmplen -=offtmp; //the lenght of the ith word. + src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp; + } + + if ( buffBytes < (leng + tmplen+1) ) { + buffBytes *=2; + buff = (uchar*) realloc(buff, buffBytes); + if (!buff) return 1; //out of memory. + dst = buff + leng; + } + + if (_Valid[*src]) { + if (prevValid){ + *dst++ =' '; + leng += 1; + } + prevValid =1; //for the next iteration + } + else prevValid=0; + + indexSE++; + + for (j=0;jword, (char *)a2->word); + } + +/** + * BUILDS THE WCSA INDEX + */ + +int build_WCSA (uchar *text, ulong length, char *build_options, void **index) { + + unsigned long zeroNode; //number of different canonical words. + + t_hash hash; // the hash table to store both variants and canonical words. + tposInHT *posInHT; // structure for canonicals and variants+huffmans + + uint sourceTextSize; + + uint seSize=0; //it's size == "numberOfValidWords". + uint *SE; //Integers vector. (represents the rank of the valid words in the source text). + + uint totallenWords=0; //The numberOfBytes that occupy canonical words (their ascii version) in memory + + + ulong bytesFile,bytesFileReal; + long sizeNValue; + + /* used during first pass */ + + ulong addrInTH; + + byte* inputBuffer = text; + bytesFileReal= bytesFile = length; + + sourceTextSize=length; + + /** Initializes WCSA structure*/ + twcsa *wcsa; + wcsa = (twcsa *) malloc (sizeof (twcsa) * 1); + zeroNode=0; + /** */ + + //Stimation (Using Heap's law) of the number of different "meaningful" words. + //sizeNValue=N_value; + if(bytesFile<5000000) bytesFile = 5000000; + sizeNValue = (unsigned long) floor(3.9* pow(bytesFile,0.60) ); + + + // Inicializes the arrays used to detect if a char is valid or not. + StartValid(); + // Inicializes the arrays used translated a char into lowercase. + StartToLow(); + + + // ********************************************************************************** + //STARTING THE FIRST PASS. + // ********************************************************************************** + printf("\nSTARTING THE FIRST PASS..."); + + posInHT = (tposInHT *) malloc(sizeof(tposInHT) * sizeNValue); + hash = initialize_hash (sizeNValue); //hash to cointain both the parsed words + + //----------------------------------------------------------------- + //1st pass (processing the file) + { + byte *pbeg,*pend,*wordstart,*aWord; + register ulong size; + register uint i; + + pbeg = inputBuffer; + pend = inputBuffer+bytesFileReal; + + while (pbeg = pend) {size++;} // a unique BLANK at the end of the file. + else { + if (_Valid [*pbeg] ) { + wordstart = pbeg; //So skipping 1 blank character + while ( (sizehash[addrInTH].word; + hash->hash[addrInTH].posInVoc = zeroNode; + zeroNode++; + totallenWords += size +1; // +1 due to the '\0' char... + //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size); + } + seSize ++; + }//while pbeghash[posInHT[i].slot].posInVoc = i; + } + } + + // INITIALIZING structures for the 2nd pass ...................................... + { + SE = (uint *) malloc ((seSize+1)*sizeof (uint)); + } + + + // ********************************************************************************** + // STARTING THE SECOND PASS. + // **********************************************************************************/ + + printf("\nSTARTING THE SECOND PASS... "); + //2nd pass (processing the file) + { + byte *pbeg,*pend,*wordstart,*aWord; + register ulong size; + register uint i; + register ulong countValidWords = 0; + + + pbeg = inputBuffer; + pend = inputBuffer+bytesFileReal; + + while (pbeg = pend) {size++;} // a unique BLANK at the end of the file. + else { + if (_Valid [*pbeg] ) { + wordstart = pbeg; //So skipping 1 blank character + while ( (sizehash[addrInTH].posInVoc+1; // !!!! + countValidWords++; + + }// while pbegn = zeroNode; + wcsa->sourceTextSize = sourceTextSize; + wcsa->seSize = seSize; + + // Creating the words of the vocabulary... + { + /** copying the words into WCSA. */ + uint *tmpOffsets = (uint *) malloc (sizeof(uint) * (zeroNode +1) ); //1 extra uint (to point to the virtual "zeroNode+1" ^th word. + uint tmpOffset =0; + + byte *zoneMem,*src; + uint i; + + //Moving data from posInHT to WCSA structure + //wcsa->wordsData = (twords *) malloc(sizeof(twords) * zeroNode); + wcsa->wordsData.wordsZoneMem.size = totallenWords - zeroNode; //without '\0' bytes (end-tag). + wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc ( wcsa->wordsData.wordsZoneMem.size * sizeof(byte) ); + zoneMem = wcsa->wordsData.wordsZoneMem.zone; + for(i = 0; i < zeroNode; i++) { + src = posInHT[i].word; //copying the canonical word + //wcsa->wordsData.words[i].word = zoneMem; //setting the pointer + tmpOffsets[i]=tmpOffset; //offset in zoneMem + while (*src) {*zoneMem++ = *src++; tmpOffset++;} //moving data until '\0' + //*zoneMem='\0'; zoneMem++; //copies also the '\0' + + } + tmpOffsets[zeroNode]=tmpOffset; //setting pointer to the "virtual" word {zeroNode+1}^{th} + + //kbit encoding of the offsets + uint elemSize = bits(tmpOffset); + wcsa->wordsData.elemSize = elemSize; + wcsa->wordsData.words = (uint *) malloc (((((zeroNode +1)*elemSize)+W-1) /W) * sizeof(uint)); //with 1 extra slot !. + wcsa->wordsData.words[((((zeroNode +1)*elemSize)+W-1) /W) -1 ] =0000; + // fprintf(stderr,"\n ElemSize = %d, maxOffset = %d",elemSize,tmpOffset); + + tmpOffset=0; + for (i=0; i<=zeroNode; i++) { //setting "zeroNode+1" offsets + bitwrite(wcsa->wordsData.words, tmpOffset, elemSize, tmpOffsets[i]); + tmpOffset+=elemSize; + } + + //////////// CHECKS IT WORKED. old !!!! + // { uint kk; + // tmpOffset=0; + // for (i=0; iwordsData.words, i* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); + // tmpOffset+=elemSize; + // if (kk != tmpOffsets[i]) {fprintf(stderr,"\n @@@@@@@@ DISTINTOS OFFSETS "); break;} + // else fprintf(stderr,"\n iguales, %d, %d :: <<%s>>len=%d",kk,i, posInHT[i].word, strlen((char*)posInHT[i].word)); + // } + // } + // + // { uint len1, len, tmplen, len2; + // uint i,p; + // byte *wcsaWord, *src; + // + // for (p=0;pwordsData.words, (wcsa->wordsData.elemSize * (p+1)), wcsa->wordsData.elemSize); + // tmplen = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p)) , wcsa->wordsData.elemSize); + // + // //fprintf(stderr,"\n :: off[%d]= %d - off [%d] = %d ==> %d",p+1,len,p,tmplen,len-tmplen); + // + // len2 =len-tmplen; + // wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen; + // } + // + // src = posInHT[p].word; + // len1 = strlen((char *)src); + // + // if (strcompL(src,wcsaWord,len1,len2) != 0) { + // fprintf(stderr,"\n %6d DISTINTOS !! ===len1 %d,len %d===== <<",p,len1,len2);printWord(src,len1); + // fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>"); + // exit(0); + // } + // else { + // fprintf(stderr,"\n %6d ======len1 %d,len2 %d===== <<",p,len1,len2);printWord(src,len1); + // fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>"); + // } + // } + // + // } + // + + /**-----------*/ + //frees memory from hash table and posInHT structures. + free(tmpOffsets); + destroy_hash(hash); + free(posInHT); + } + + /** ******* creates the self-index on ints (bottom layer) ==> see build_icsa *********/ +/** + #ifdef CSA_ON + { + uint total; + fprintf(stderr,"\n **** CREATING CSA from Edu's Code *****"); + ticsa *myicsa; + myicsa = createIntegerCSA(&SE,seSize+1,build_options); + wcsa->myicsa= myicsa; + total = CSA_size(myicsa); + + free(SE); //SE is no longer needed, (it is indexed by the iCSA) + printf("\n\t**** [iCSA built on %d words. Size = %ld bytes... RAM",seSize,total); + } + #endif +*/ + + //#ifndef CSA_ON + wcsa->se = SE; + wcsa->myicsa = NULL; + //#endif + + printf("\n\t ** Building done! **\n"); + printf("\n Process finished!\n"); + + *index = wcsa; + return 0; +} + + +int build_iCSA (char *build_options, void *index) +{ + twcsa *wcsa = (twcsa *) index; + /********* creates the self-index on ints (bottom layer) *********/ + //creating CSA from Edu's code... + { + uint total; + fprintf(stderr,"\n **** CREATING CSA-bottom-layer *****"); + void *bottomIntIndex; + int err = buildIntIndex(wcsa->se,wcsa->seSize+1, build_options,(void **)&bottomIntIndex); + wcsa->myicsa = bottomIntIndex; + + //total = CSA_size(wcsa->myicsa); + err = sizeIntIndex((void *) wcsa->myicsa, &total); + + printf("\n\t**** [iCSA built on %d words. Size = %u bytes... RAM",wcsa->seSize,total); + } + return 0; +} + +/** ******************************************************************** + * Loading from disk + **********************************************************************/ + +/**----------------------------------------------------------------- + * LoadWCSA. + * Loads all the data structures of WCSA (included the icsa) + ----------------------------------------------------------------- */ + +twcsa *loadWCSA(char *filename) { + twcsa *wcsa; + // Inicializes the arrays used to detect if a char is valid or not. + StartValid(); + // Inicializes the arrays used translated a char into lowercase. + StartToLow(); + + wcsa = (twcsa *) malloc (sizeof (twcsa) * 1); + wcsa->n=0; + + int err = loadIntIndex(filename, (void **)&wcsa->myicsa); + + loadStructs(wcsa,filename); + + return wcsa; +} + +/** ------------------------------------------------------------------ + * LoadStructs. + * Reads files and loads all the data needed for searcherFacade + ----------------------------------------------------------------- */ + void loadStructs(twcsa *wcsa, char *basename) { + uint i,j; + char *filename; + int file; + uint sizeFile; + char c; + uint n; + + filename = (char *)malloc(sizeof(char)*(strlen(basename)+10)); + fprintf(stderr,"Loading Index from file %s.*\n", basename); + + //** SOME CONSTANTS: sourceTextSize + { strcpy(filename, basename); + strcat(filename, "."); + strcat(filename, CONSTANTS_FILE_EXT); + + if( (file = open(filename, O_RDONLY)) < 0) { + printf("Cannot open file %s\n", filename); + exit(0); + } + + read(file, &(wcsa->sourceTextSize), sizeof(uint)); + read(file, &(wcsa->seSize), sizeof(uint)); + close(file); + } + + /** File with the words from the vocabulary (sorted alphabetically) */ + { byte *zoneMem; + + strcpy(filename, basename); + strcat(filename, "."); + strcat(filename, VOCABULARY_WORDS_FILE_EXT); + //sizeFile= fileSize(filename)-sizeof(uint); + + if( (file = open(filename, O_RDONLY)) < 0) { + printf("Cannot open file %s\n", filename); + exit(0); + } + + //the number of canonical words + read(file, &n, sizeof(uint)); + wcsa->n = n; + read(file, &(wcsa->wordsData.elemSize), (sizeof(uint))); + read(file, &(wcsa->wordsData.wordsZoneMem.size), (sizeof(uint))); + + //allocating the memory needed for all words and reading them //(ascii) << no \0 chars are needed>>. + wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc(wcsa->wordsData.wordsZoneMem.size * sizeof(byte)); + read(file, (wcsa->wordsData.wordsZoneMem.zone), wcsa->wordsData.wordsZoneMem.size * sizeof(byte) ); + + //reading the offsets of the words (kbitArray that points to offsets in zoneMem of words. + wcsa->wordsData.words = (uint *) malloc (((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * sizeof(uint)); + wcsa->wordsData.words[ ((((n+1)*(wcsa->wordsData.elemSize))+W-1) /W) -1 ] =0000; + read(file, (wcsa->wordsData.words), ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint))); + + + close(file); + } + wcsa->se= NULL; + free(filename); +} + + + + +/** **************************************************************** + * Querying the index WCSA + * ***************************************************************/ +/////////////////////////////////////////////////////////////////////////////////////// +// FUNCTIONS NEEDED FOR SEARCHING A PATTERN // +/////////////////////////////////////////////////////////////////////////////////////// + + + +/*------------------------------------------------------------------ + * Given a text pattern translates it into a list of integers (corresponding to the + * canonical words associated to the valid words in the text pattern) + ------------------------------------------------------------------*/ +void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *integerPattern, uint *sizeIntegers) { + + byte *pbeg,*pend,*wordstart,*aWord; + register unsigned long size; + uint index =0; + + pbeg = textPattern; + pend = pbeg + patLen; + + while (pbeg = pend) {size++;} // a unique BLANK at the end of the file. + else { + if (_Valid [*pbeg] ) { + wordstart = pbeg; //So skipping 1 blank character + while ( (sizen) - 1; + while(min < max) { + p = (min+max)/2; + + {//preparing for strcompL + len = bitread (wcsa->wordsData.words, (p+1)* wcsa->wordsData.elemSize , wcsa->wordsData.elemSize); + tmplen = bitread (wcsa->wordsData.words, (p )* wcsa->wordsData.elemSize , wcsa->wordsData.elemSize); + len -=tmplen; + wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen; + } + + //if(strncmp((char*)aWord, (char*)wcsa->wordsData[p].word,size) > 0) min = p+1; + if(strcompL(aWord, wcsaWord, size, len) > 0) min = p+1; + else max = p; + + + // { //SHOW PROGRESS + // fprintf(stderr,"\n Patron = <<%s>>, curposWord= %d ==><<",aWord,p); + // printWord(wcsaWord,len); fprintf(stderr,">> len =%d",len); + // } + + } + + {//preparing for strcompL + len = bitread (wcsa->wordsData.words, (min+1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); + tmplen = bitread (wcsa->wordsData.words, ( min )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); + len -=tmplen; + wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen; + } + + // if(!strncmp((char*)aWord, (char*)wcsa->wordsData[min].word, size)) { + if(!strcompL(aWord, wcsaWord, size, len)) { + integerPattern[index++] = min +1 ; //<-- + } + else {*sizeIntegers = 0; return;} // a valid word that does not appear in the source text. + + } + }// end while + *sizeIntegers = index; + + // //shows the parsed words: + // {uint i; + // printf("\n\n >>%s>> HA SIDO PARSEADO COMO:",textPattern); + // for (i=0; i>",wcsa->wordsData[integerPattern[i] -1].word); + // } + // + // } +} + + + + + + /** ------------------------------------------------------------------ + * Returns the number of occurrences of a given text pattern + *------------------------------------------------------------------ */ +int countTextOcurrences(twcsa *wcsa, byte *textPattern) { + ulong left, right; + uint integerPatterns[MAX_INTEGER_PATTERN_SIZE]; + uint integerPatternSize, min, max; + + uint lenpat = strlen((char*)textPattern); + parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize); + if (!integerPatternSize) return -1; + +// #ifdef DEBUG_ON +// uint i; +// printf("\n %d Integers to search for:",integerPatternSize ); +// for (i=0;iwordsData[integerPatterns[i]-1].word); +// } +// printf("\n"); +// #endif + + ulong numocc; + int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, &numocc, &left, &right); + return numocc; + +} + + + /** ------------------------------------------------------------------ + * locateTextOcurrences: + * Returns the offsets of the source text where a word/phrase appears + * Returns also the number of occurrences. + *------------------------------------------------------------------ */ +uint *locateTextOcurrences(twcsa *wcsa, byte *textPattern, int *numberOccurrences) { + uint integerPatterns[MAX_INTEGER_PATTERN_SIZE]; + uint integerPatternSize, min, max; + + uint lenpat = strlen((char*)textPattern); + parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize); + if (!integerPatternSize) {*numberOccurrences = -1; return NULL;} + +// #ifdef DEBUG_ON +// uint i; +// printf("\n %d Integers to search for:",integerPatternSize ); +// for (i=0;iwordsData[integerPatterns[i]-1].word); +// } +// printf("\n"); +// #endif + + ulong occurrences, left, right; + ulong *seOffsets; + ulong *sourceOffsets; + + //obtains the indexes in vector SE where the pattern appears. + //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences); + int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences); + + //sourceOffsets = (uint *) malloc (sizeof(uint)*occurrences); + + sourceOffsets=seOffsets; + //obtains the offsets in the source text of the pattern (sourceOffsets) + locateFacade(wcsa, (uint *)sourceOffsets, (uint *)seOffsets,occurrences); + + #ifdef DEBUG_ON + fprintf(stderr,"\n*** %s appears in the source text in positions:\n\t",textPattern); + for (i=0;i=0; + ------------------------------------------------------------------*/ + int displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length, byte *dstptr) { + return 99; //not implemented: function not available for this index +} + + + /**------------------------------------------------------------------ + * DISPLAYFacadeMalloc: + * Returns the subString from a starting offset to a final offset + * in the source text. It allocates Memory !! + * NOT CURRENTLY USED + ------------------------------------------------------------------*/ +byte *displayFacadeMalloc (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length) { + byte *dstptr=NULL; //not implemented: function not available + return dstptr; +} + + + /** ------------------------------------------------------------------ + * LOCATEALLandDISPLAY: + * Displays the text around an occurrence of the searched word in the source text. + * Assuming that $p$ is that position --> shows only chars in [p_radix-1,p_radix] + ------------------------------------------------------------------*/ +int locateAllAndDisplay (twcsa *wcsa, uint *sePositions, uint number, int radix) { +return 99; //not implemented: function not available for this index + +} + + + /** ------------------------------------------------------------------ + * recovers the source text by calling display(0,fileSize); + * ------------------------------------------------------------------ */ +void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) { + + int start;int end; + byte *cc; + char *filename = (char *) malloc (strlen( basename)+ strlen(ext)+1); + ulong length; + + strcpy( filename, basename); + strcat( filename, ext); + filename[strlen( basename)+ strlen(ext)]='\0'; + fprintf(stderr,"\n uncompressing text into file -->%s ",filename);fflush(stderr); + + FILE *salida; + unlink( filename); + salida = fopen( filename,"w"); + start=0; end = sourceTextSize-1; + + cc = (byte *) malloc (sourceTextSize* sizeof(uchar)); + + { + uint i, j;//, tmplen; + uint prevValid; + //uint ptr, maxptr; + byte *src, *dst; + uint leng =0; + uint tmplen =0; + + uint indexSE=0; + uint posSEValue=0; + + dst=cc; + while ( (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */ + + int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue); + + {//obtains pointer to the ith word + uint offtmp; + uint ith = posSEValue -1; // !! + tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); + offtmp = bitread (wcsa->wordsData.words, ( ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); + tmplen -=offtmp; //the lenght of the ith word. + src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp; + } + + if (_Valid[*src]) { + if (prevValid){ + *dst++ =' '; + leng +=1; + } + prevValid =1; //for the next iteration + } + else prevValid=0; + + indexSE++; + + for (j=0;j%s ",filename);fflush(stderr); + + FILE *salida; + unlink( filename); + salida = fopen( filename,"w"); + start=0; end = wcsa->seSize; + + error = extractWords((void *) wcsa, start, end, &cc, &length); + if (error) {fprintf(stderr,"\n error during recoverSourceText2"); exit(0);} + + fprintf(stderr,"\n sourceTextSize = %d, len = %ld",sourceTextSize,length); + fwrite(cc,sizeof(byte),length,salida); + fclose(salida); + + free(cc); + free(filename); +} + +/** ******************************************************************************* + * Showing some statistics and info of the index + * *******************************************************************************/ +void printInfoReduced(twcsa *wcsa) { + //not implemented: function not available +} + + /* Shows summary info of the index */ +int printInfo(void *index) { + uint n; + + twcsa *wcsa = (twcsa *) index; + + unsigned long indexSize; + uint intIndexSize, presentationSize; + int err; + + err = index_size(index, &indexSize); + if (err!=0) return err; + err = sizeIntIndex(wcsa->myicsa, &intIndexSize); + if (err!=0) return err; + + presentationSize = indexSize - intIndexSize; + + printf("\n ===================================================:"); + printf("\n Summary of Presentation layer:"); + printf("\n Number of valid words (SEsize) = %u",wcsa->seSize); + printf("\n Number of different words = %ld",wcsa->n); + printf("\n WCSA structure = %d bytes", sizeof(twcsa)); + + uint totalpointers = ((((wcsa->n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)); + uint totalasciizone = wcsa->wordsData.wordsZoneMem.size * sizeof(byte) ; + uint totalwords = totalasciizone + totalpointers; + + printf("\n Size Of words structure (%d bytes):",totalwords); + printf("\n [ pointers = %d bytes || AsciiZone = %d bytes", totalpointers, totalasciizone); + + printf("\n\n Total = ** %u bytes (in RAM) **",presentationSize); + //printf("\n\n @@ Summary of self-index on Integers:"); + err = printInfoIntIndex(wcsa->myicsa, " "); + if (err!=0) return err; + + printf("\n ===================================================:"); + printf("\n"); + return 0; + } + +/**------------------------------------------------------------------ + * structsSize. + * Counts the memory amount needed by the Facade (Presentation Layer). + * skipping the stop_words hash table + ----------------------------------------------------------------- */ +uint structsSizeMem(twcsa *wcsa) { + return 0; //not implemented: function not available for this index. +} + + +/** for debugging **/ +void printWord(uchar *str, uint len) { + uint i; + for (i=0;i \n", argv[0]); + exit(0); + } + + // Reads params (input file, output basename, and stopwords file) + infile = argv[1]; + outbasename = argv[2]; + stopwordsfile = argv[3]; + + finsize= fileSize(infile); + + if (! finsize) { + printf( "\nFILE EMPTY OR FILE NOT FOUND %s !!\nSkipping processement ...\n",infile); + exit(0); + } + + // Opening the input text file. + if( (f_in = open(infile, O_RDONLY)) < 0) { + printf("Cannot read file %s\n", infile); + exit(0); + } + inputBuffer = (byte *) malloc(finsize *sizeof(byte));// +1); + read (f_in,inputBuffer,finsize); + close (f_in); + + + { + //printf("\n parametros <<%s>>\n\n",stopwordsfile); + build_index (inputBuffer, finsize, stopwordsfile, &Index); /** building the index */ + +// /** recovering the source text from the index */ + { + double start, end; + start = getTime2(); + ulong size; + get_length(Index, &size); + char extension[10]= ".source"; + + //recoverSourceText1((twcsa*) Index, outbasename,extension, size); + strcat(extension,"2"); + recoverSourceText2((twcsa*) Index, outbasename,extension,size); + end = getTime2(); + fprintf(stderr, "\nRecovering source file time: %.3f secs\n", end-start ); + } +// + // DISPLAYING THE OCCURRENCES OF A TEXT PATTERN (word/phrase). + {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE]; + int error = 0; + ulong numocc,numc, length, i, *snippet_len, tot_numcharext = 0, numpatt; + uchar *pattern, *snippet_text; + + pattern = textPattern; + printf("\nSEARCH TEST for DISPLAY (pizzachili interface)\n"); + while(1) { + printf("Intro string: "); + fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin); + if (!strcmp((char*)textPattern,"\n") ) break; + textPattern[strlen((char*)textPattern)-1] = '\0'; + + length = strlen( (char*)textPattern); + numc=50; + +// error = display (Index, textPattern, length, numc, &numocc, +// &snippet_text, &snippet_len); + error = displayWords (Index, textPattern, length, numc, &numocc, + &snippet_text, &snippet_len,1); + + if (error){ fprintf(stderr, "%s\n", "Hubo un error durante display");exit(0);} + + fprintf(stderr,"\n acabou display");fflush(stderr); + {//show the results + ulong j, len = length + 2*numc; + char blank = '\0'; + fprintf(stderr,"\n length = %d",length); + fprintf(stderr,"\n pattern = %s",pattern);fflush(stderr); + fprintf(stderr,"\n numocc = %d",numocc);fflush(stderr); + fprintf(stderr,"\n snippet len = %d",len);fflush(stderr); + fprintf(stderr,"\n =========");fflush(stderr); + for (i = 0; i < numocc; i++){ + fprintf(stderr,"\n[%2d][len=%3d]<<",i+1,snippet_len[i]);fflush(stderr); + fwrite(snippet_text+len*i,sizeof(uchar),snippet_len[i],stderr);fflush(stderr); + fprintf(stderr,">>");fflush(stderr); + } + } + numpatt--; + + for(i=0; i +#include + + +#include "utils/valstring.h" +#include "utils/defValues.h" +#include "utils/MemoryManager.h" +#include "utils/fileInfo.h" + +#include "utils/hash.h" + +#include "utils/huff.h" +//#include "utils/errors.c" +#include "utils/parameters.h" + +//from SEARCHER FACADE +#include "utils/huffDec.h" +//#include "icsa/icsa.h" + +#include "intIndex/interfaceIntIndex.h" + +#ifndef uchar +#define uchar unsigned char +#endif +#ifndef uint +#define uint unsigned int +#endif +#ifndef ulong +#define ulong unsigned long +#endif + +#define STRLEN(str,len) \ +{len=0; \ + byte *ptr = str; \ + while(*ptr++) len++; \ +} + +#define ADDLEN(str,len) \ +{byte *ptr = str; \ + while(*ptr++) len++; \ +} + +/** Some data types used **ONLY**during construction process */ + +// Words, both the canonical words and their variants + typedef struct { + unsigned long slot; // the position in the hash table of the canonical word + byte *word; //makes alphanumerical sorting easier... + } tposInHT; + + + typedef struct SzoneMem { //a large block of memory to load a file into mem. + byte *zone; //block of mem. + uint size; //number of bytes + } tZoneMem; + + // words dataStructure. + typedef struct { + uint *words; + uint elemSize; //the size (in bits) of each pointer. + tZoneMem wordsZoneMem; // a block of memory where the canonical words are loaded (from file). + + } twords; + + +/** Some data types used during searches */ + + + + + /**the WCSA index structures... */ + typedef struct { + + /**valid words */ + twords wordsData; /* vocabulary (words) of the index */ + + ulong n; /* number of different words. */ + uint seSize; /* number of words in the source text */ + + uint sourceTextSize; /*the size of the source text in bytes*/ + + //ticsa *myicsa; //the WiCSA on SE words + void *myicsa; //the WiCSA on SE words + + //#ifndef CSA_ON + uint *se; + //#endif + + }twcsa; + + +/** ****************************************************************************** + * Interface (from pizza chili) for using the WCSA index +*********************************************************************************/ + +/* Error management */ + + /* Returns a string describing the error associated with error number + e. The string must not be freed, and it will be overwritten with + subsequent calls. */ + +char *error_index (int e); + +/* Building the index */ + + /* Creates index from text[0..length-1]. Note that the index is an + opaque data type. Any build option must be passed in string + build_options, whose syntax depends on the index. The index must + always work with some default parameters if build_options is NULL. + The returned index is ready to be queried. */ + +int build_index (uchar *text, ulong length, char *build_options, void **index); + + /* Saves index on disk by using single or multiple files, having + proper extensions. */ + +int save_index (void *index, char *filename); + + /* Loads index from one or more file(s) named filename, possibly + adding the proper extensions. */ + +int load_index (char *filename, void **index); + + /* Frees the memory occupied by index. */ + +int free_index (void *index); + + /* Gives the memory occupied by index in bytes. */ + +int index_size(void *index, ulong *size); + +/* Querying the index */ + + /* Writes in numocc the number of occurrences of the substring + pattern[0..length-1] found in the text indexed by index. */ + +int count (void *index, uchar *pattern, ulong length, ulong *numocc); + + /* Gives the length of the text indexed */ + +int get_length(void *index, ulong *length); + +/* Accessing the indexed text */ + + /* Writes in numocc the number of occurrences of the substring + pattern[0..length-1] in the text indexed by index. It also allocates + occ (which must be freed by the caller) and writes the locations of + the numocc occurrences in occ, in arbitrary order. */ + +int locate (void *index, uchar *pattern, ulong length, ulong **occ, + ulong *numocc); + + /* Allocates snippet (which must be freed by the caller) and writes + the substring text[from..to] into it. Returns in snippet_length the + length of the text snippet actually extracted (that could be less + than to-from+1 if to is larger than the text size). */ + +int extract (void *index, ulong from, ulong to, uchar **snippet, + ulong *snippet_length); + + /* Displays the text (snippet) surrounding any occurrence of the + substring pattern[0..length-1] within the text indexed by index. + The snippet must include numc characters before and after the + pattern occurrence, totalizing length+2*numc characters, or less if + the text boundaries are reached. Writes in numocc the number of + occurrences, and allocates the arrays snippet_text and + snippet_lengths (which must be freed by the caller). The first is a + character array of numocc*(length+2*numc) characters, with a new + snippet starting at every multiple of length+2*numc. The second + gives the real length of each of the numocc snippets. */ + +int display (void *index, uchar *pattern, ulong length, ulong numc, + ulong *numocc, uchar **snippet_text, ulong **snippet_lengths); + + /* Obtains the length of the text indexed by index. */ + +int length (void *index, ulong *length); + + /* Shows summary info of the index */ +int printInfo(void *index); + +/** *******************************************************************************************/ +/** Building part of the index ****************************************************************/ + +int build_WCSA (uchar *text, ulong length, char *build_options, void **index); +int build_iCSA (char *build_options, void *index); + + + +/** *******************************************************************************************/ +/** Search part of the index ******************************************************************/ +// Definitions of some PUBLIC function prototipes. + + //loading/freeing the data structures into memory. + + void loadStructs(twcsa *wcsa, char *basename); + twcsa *loadWCSA(char *filename); + + //returns the source text from given [offsetIni, offsetFin] offsets. + //byte *displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin); + byte *displayFacadeMalloc (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length); + int displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length, byte *dstptr); + + //locate all the ocurrences of a word/phrase + int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint number); + + //show text around the occurrences of a word. + int locateAllAndDisplay (twcsa *wcsa, uint *sePositions, uint number, int radix); + + //recovers the source text by calling display (either only once or "len" times) + void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize); + void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize); + + //***Searching for a TEXT pattern ... + + //extracts the ids of the valid words of a "plain text". + void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *integerPattern, uint *sizeIntegers) ; + + //counts the occurrences of a given text pattern. + int countTextOcurrences(twcsa *wcsa, byte *textPattern); + + //returns the offsets (to the source text) where of a given text pattern appears. + uint *locateTextOcurrences(twcsa *wcsa, byte *textPattern, int *numberOccurrences); + + //shows a snippet with the text around the ocurrences of a pattern. + int displayTextOcurrences(twcsa *wcsa, byte *textPattern, uint radixDisplay); + + +/** *********************************************************************************** + * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord + * ***********************************************************************************/ + /** Writes in numocc the number of occurrences of the substring + pattern[0..length-1] in the text indexed by index. It also allocates + occ (which must be freed by the caller) and writes the locations of + the numocc occurrences in occ, in arbitrary order. These occurrences + refer to the offsets in TOH where the caller could start a display + operation. So locateWord implies synchronization using B. + ** Parameter kbefore sets locateWord not to obtain the offset in TOH of the + searched word, but the offset in TOH of k-before words before. + */ + +int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore); + + /** Displays the text (snippet) surrounding any occurrence of the + substring pattern[0..length-1] within the text indexed by index. + The snippet must include numc characters before and after the + pattern occurrence, totalizing length+2*numc characters, or less if + the text boundaries are reached. Writes in numocc the number of + occurrences, and allocates the arrays snippet_text and + snippet_lengths (which must be freed by the caller). The first is a + character array of numocc*(length+2*numc) characters, with a new + snippet starting at every multiple of length+2*numc. The second + gives the real length of each of the numocc snippets. */ + + int displayWords (void *index, uchar *pattern, ulong length, ulong numc, + ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore); + + +/** simulates extration of text process, but do not actually returns anything at all + Extracts upto <=2K words from K=wordsbefore words before each occurrence of a pattern. + Less than 2K words can be extracted if more than numc characters have been already obtained. + Do nothing else... do not return the text */ + + int displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc); + + +/** Allocates text (which must be freed by the caller) and recovers the + the substring of text starting from the "fromword"-th word up to the + "toWord"-th words. Returns in text the text, and in "text_lenght" the + length of the text actually extracted. Text is allocated. + Actually extracts SE[fromWord .. toWord) ... not the last word. */ + +int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text, + ulong *text_length); + + + + + //recovers the source text by calling display (either only once or "len" times) + void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize); + void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize); + + +// Definitions of PRIVATE functions + + //Auxiliary functions + + uint structsSizeDisk(twcsa *wcsa); + uint structsSizeMem(twcsa *wcsa); + void printInfoReduced(twcsa *wcsa); + int saveSEfile (char *basename, uint *v, uint n); + double getTime2 (void); + + diff --git a/swcsa/buildIntIndex.c b/swcsa/buildIntIndex.c new file mode 100644 index 0000000..3622a4b --- /dev/null +++ b/swcsa/buildIntIndex.c @@ -0,0 +1,138 @@ +#include +#include +#include +#include "buildFacade.h" + +/* only for getTime() */ +#include +#include + +/* macro to detect and notify errors */ +#define IFERROR(error) {{if (error) { fprintf(stderr, "%s\n", error_index(error)); exit(1); }}} + +int loadSEfile (char *basename, uint **v, uint *n); +void print_usage(char *); +double getTime(void); + +int main(int argc, char *argv[]) { + + char *basenamefile; + char *params = NULL; + void *index; uint index_len; + int error, i; + double start, end; + uint *se; + uint seSize; + + if (argc < 2) print_usage(argv[0]); + if (argc > 2) { + int nchars, len; + nchars = argc-2; + for(i=1;i Output (pres_layer) %u bytes.", seSize*sizeof(uint), index_len); + fprintf(stdout,"\n\t ## Overall compression --> %.3f%% (%.3f bits per char).\n\n", + (100.0*index_len)/(seSize*sizeof(uint)), (index_len*8.0)/(seSize*sizeof(uint))); + + exit(0); +} + + +int loadSEfile (char *basename, uint **v, uint *n){ + char filename[255]; + int file; + sprintf(filename,"%s.%s",basename,SE_FILE_EXT); + + uint sizeFile = fileSize(filename); + + if( sizeFile <= 0) { + printf("Cannot read information from file %s\n", filename); return -1; + } + + if( (file = open(filename, O_RDONLY)) < 0) { + printf("Cannot open file %s\n", filename); + return -1; + } + + uint *se = (uint *) malloc (sizeFile); + uint seSize = sizeFile / sizeof(uint); + read(file, se, sizeFile); //the samples + close(file); + *v=se; + *n=seSize; + return 0; +} + + + +double +getTime (void) +{ + + double usertime, systime; + struct rusage usage; + + getrusage (RUSAGE_SELF, &usage); + + usertime = (double) usage.ru_utime.tv_sec + + (double) usage.ru_utime.tv_usec / 1000000.0; + + systime = (double) usage.ru_stime.tv_sec + + (double) usage.ru_stime.tv_usec / 1000000.0; + + return (usertime + systime); + +} + +void print_usage(char * progname) { + fprintf(stderr, "Usage: %s []\n", progname); + fprintf(stderr, "\nIt builds the index on Integer for the sequence in .se,\n"); + fprintf(stderr, "storing it in [.[*]]; Any additional \n"); + fprintf(stderr, "will be passed to the construction function.\n"); + fprintf(stderr, "At the end, the program sends to the standard error \n"); + fprintf(stderr, "performance measures on time to build it.\n\n"); + exit(1); +} diff --git a/swcsa/buildParser.c b/swcsa/buildParser.c new file mode 100644 index 0000000..b5ad256 --- /dev/null +++ b/swcsa/buildParser.c @@ -0,0 +1,138 @@ +#include +#include +#include +#include "buildFacade.h" + +/* only for getTime() */ +#include +#include + +/* macro to detect and notify errors */ +#define IFERROR(error) {{if (error) { fprintf(stderr, "%s\n", error_index(error)); exit(1); }}} + +int read_file(char *filename, uchar **textt, ulong *length); +void print_usage(char *); +double getTime(void); + +int main(int argc, char *argv[]) { + + char *infile, *outfile; + uchar *text; + char *params = NULL; + ulong text_len; + void *index; + int error, i; + double start, end; + + if (argc < 3) print_usage(argv[0]); + if (argc > 3) { + int nchars, len; + nchars = argc-3; + for(i=2;i Output (pres_layer) %lu bytes.", text_len, index_len); + fprintf(stdout,"\n\t ## Overall compression --> %.2f%% (%.2f bits per char).\n\n", + (100.0*index_len)/text_len, (index_len*8.0)/text_len); + + exit(0); +} + +/* + Opens and reads a text file +*/ +int read_file(char *filename, uchar **textt, ulong *length) { + + uchar *text; + unsigned long t; + FILE *infile; + + infile = fopen(filename, "rb"); // b is for binary: required by DOS + if(infile == NULL) return 1; + + /* store input file length */ + if(fseek(infile,0,SEEK_END) !=0 ) return 1; + *length = ftell(infile); + + /* alloc memory for text (the overshoot is for suffix sorting) */ + text = (uchar *) malloc((*length)*sizeof(*text)); + if(text == NULL) return 1; + + /* read text in one sweep */ + rewind(infile); + t = fread(text, sizeof(*text), (size_t) *length, infile); + if(t!=*length) return 1; + *textt = text; + fclose(infile); + + return 0; +} + +double +getTime (void) +{ + + double usertime, systime; + struct rusage usage; + + getrusage (RUSAGE_SELF, &usage); + + usertime = (double) usage.ru_utime.tv_sec + + (double) usage.ru_utime.tv_usec / 1000000.0; + + systime = (double) usage.ru_stime.tv_sec + + (double) usage.ru_stime.tv_usec / 1000000.0; + + return (usertime + systime); + +} + +void print_usage(char * progname) { + fprintf(stderr, "Usage: %s []\n", progname); + fprintf(stderr, "\nIt builds the index for the text in file ,\n"); + fprintf(stderr, "storing it in . Any additional \n"); + fprintf(stderr, "will be passed to the construction function.\n"); + fprintf(stderr, "At the end, the program sends to the standard error \n"); + fprintf(stderr, "performance measures on time to build the index.\n\n"); + exit(1); +} diff --git a/swcsa/buildStats.c b/swcsa/buildStats.c new file mode 100644 index 0000000..4339bbd --- /dev/null +++ b/swcsa/buildStats.c @@ -0,0 +1,106 @@ +/* + * Run Queries + */ + +#include +#include +#include +#include +#include "buildFacade.h" + +/* only for getTime() */ +#include +#include + +/* macro to detect and to notify errors */ +#define IFERROR(error) {{if (error) { fprintf(stderr, "%s\n", error_index(error)); exit(1); }}} + + +/* local headers */ + +double getTime (void); +void usage(char * progname); + +static void *Index; /* opauque data type */ +static ulong Index_size, Text_length; +static double Load_time; + + + +/* + * Temporary usage: run_queries [length] [V] + */ +int main (int argc, char *argv[]) +{ + int error = 0; + char *filename; + char querytype; + + if (argc != 2) { + usage(argv[0]); + exit (1); + } + + filename = argv[1]; + + printf("\n Stats of index: %s\n",argv[1]); + + Load_time = getTime (); + error = load_index (filename, &Index); + IFERROR (error); + Load_time = getTime () - Load_time; + fprintf (stderr, "\tLoad index time = %.2f secs\n", Load_time); + + error = index_size(Index, &Index_size); + IFERROR (error); + + error = get_length(Index, &Text_length); + IFERROR (error); + + + ulong text_len; + error = get_length(Index, &text_len); + IFERROR (error); + + error = printInfo(Index); + IFERROR(error); + + error = free_index(Index); + IFERROR(error); + + + + fprintf(stdout,"\t===============================================\n"); + fprintf(stdout,"\tInput: %lu bytes (text) --> Output %lu bytes (wcsa).\n", text_len, Index_size); + fprintf(stderr,"\tIndex size = %lu Kb\n", Index_size/1024); + fprintf(stdout,"\tOverall compression --> %.2f%% (%.2f bits per char).\n", + (100.0*Index_size)/text_len, (Index_size*8.0)/text_len); + fprintf(stdout,"\t===============================================\n"); + + + return 0; +} + +double +getTime (void) +{ + double usertime, systime; + struct rusage usage; + + getrusage (RUSAGE_SELF, &usage); + + usertime = (double) usage.ru_utime.tv_sec + + (double) usage.ru_utime.tv_usec / 1000000.0; + + systime = (double) usage.ru_stime.tv_sec + + (double) usage.ru_stime.tv_usec / 1000000.0; + + //return (usertime + systime); + return (usertime ); +} + + +void usage(char * progname) { + fprintf(stderr, "\nThe program loads and shows some stats on it\n"); + fprintf(stderr, "Usage: %s \n", progname); +} diff --git a/swcsa/intIndex/Makefile b/swcsa/intIndex/Makefile new file mode 100755 index 0000000..3fb9c0c --- /dev/null +++ b/swcsa/intIndex/Makefile @@ -0,0 +1,55 @@ +SRCDIR = . +SRCDIRCSA = . +SRCDIRUTILS = ../utils +CC = g++ + +ifndef CFLAGS ##possibly already defined by the "main Makefile". + ##CFLAGS = -O9 -m32 + CFLAGS = -O9 -m32 + ##CFLAGS = -g -m32 -O0 +endif + +LIBINTINDEX = icsa.a + +all: intIndex cleanO + + +intIndex: icsa.o parameters.o basics.o huff.o bitmap.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o + ar rc $(LIBINTINDEX) icsa.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o + #not including "parameters.o basics.o bitmap.o huff.o" as they are included by wcsa + #they are already included into the library by the presentation layer. + +icsa.o: parameters.o basics.o bitmap.o huff.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o + $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSA)/icsa.c + +psiHuffmanRLE.o: huff.o + $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSA)/psiHuffmanRLE.c + +psiDeltaCode.o: + $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSA)/psiDeltaCode.c + +psiGonzalo.o: huff.o + $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSA)/psiGonzalo.c + +parameters.o: + $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/parameters.c + + +huff.o: + $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/huff.c + +basics.o: + $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/basics.c + +bitmap.o: + $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/bitmap.c + + +cleanO: + rm -f *.o + +clean: + rm -rf *~ *% *.o core *.bak icsa.a + +tar: + tar czvf icsa.tar.gz Makefile diff --git a/swcsa/intIndex/defValues.h b/swcsa/intIndex/defValues.h new file mode 100644 index 0000000..e19f339 --- /dev/null +++ b/swcsa/intIndex/defValues.h @@ -0,0 +1,39 @@ +#ifndef DEFVALUESSAD +#define DEFVALUESSAD + +// CONFIGURACI�NS DO CSA DE SADAKANE PARA TEXTOS DE ENTEIROS + +// Parametros configurables +#define DEFAULT_A_SAMPLING_PERIOD 256 // Periodo de muestreo de A +#define DEFAULT_A_INV_SAMPLING_PERIOD 256 // Periodo de muestreo de inversa A +#define DEFAULT_PSI_SAMPLING_PERIOD 256 // Periodo de muestreo da funcion PSI + +/*********************************/ +//#define PSI_HUFFMANRLE // Uses Lbd's implementation of Psi. Improvement over Gonzalo's (in compression and speed) +#define DEFAULT_nsHUFF 16*1024 // huffman limit on Psi. it can be optimized for space + +#define PSI_DELTACODES // Uses Delta codes for increments of Psi. Faster but compresses less than the others +/*********************************/ + + +#define MAX_FILENAME_LENGTH 256 // Lonxitude maxima do nome do ficheiro + +// Extensions dos ficheiros creados +#define NUMBER_OF_ELEMENTS_FILE_EXT "CSA.noe" // Numero de elementos (tama�o de A, Psi e D) + +#define D_FILE_EXT "CSA.Dbv" // O vector D de Sadakane +#define D_RANK_DIRECTORY_FILE_EXT "CSA.Drd" // O directorio para Rank sobre D + +#define SAMPLES_A_FILE_EXT "CSA.sA" // Mostras do array de sufixos +#define BA_FILE_EXT "CSA.BAbv" // O vector D de Sadakane +#define BA_RANK_DIRECTORY_FILE_EXT "CSA.BArd" // O directorio para Rank sobre D + +#define SAMPLES_A_INV_FILE_EXT "CSA.sAI" // Mostras da inversa do array de sufixos +#define SAMPLING_PERIOD_A_FILE_EXT "CSA.sTA" // Periodo de muestreo do array de sufixos e da inversa + +#define PSI_COMPRESSED_FILE_EXT "CSA.psi" // COdigos delta de PSI + +#define DEFAULT_PSI_BINARY_SEARCH_FACTOR 2 // Periodo de muestreo na busca binaria +//#define BINARY_SEARCH_INTERVAL 128 // Periodo de muestreo na busca binaria + +#endif diff --git a/swcsa/intIndex/icsa.c b/swcsa/intIndex/icsa.c new file mode 100644 index 0000000..eec4e34 --- /dev/null +++ b/swcsa/intIndex/icsa.c @@ -0,0 +1,976 @@ +#include "icsa.h" + +// Global para que funcione a funcion de comparacion do quicksort +uint *intVector; + +// Para o quicksort +int suffixCmp(const void *arg1, const void *arg2) { + + register uint a,b; + a=*((uint *) arg1); + b=*((uint *) arg2); + + while(intVector[a] == intVector[b]) { a++; b++; } + return (intVector[a] - intVector[b]); + +} + + + +/* **NO REVISADO TAMAÑO FILE. +int buildIntIndexFromFile (char *filename, char *build_options,void **index) { + //char filename[255]; + int file; + struct stat f_stat; + //sprintf(filename, "%s.%s", basename,SE_FILE_EXT); + + if( (file = open(filename, O_RDONLY)) < 0) { + printf("Cannot open file %s\n", filename); + exit(0); + } + if( fstat(file, &f_stat) < 0) { + printf("Cannot read information from file %s\n", filename); exit(0); + } + uint sizeFile = (f_stat.st_size)/sizeof(uint); + + uint *se = (uint *) malloc (sizeFile); + uint seSize = sizeFile / sizeof(uint); + read(file, se, sizeFile); //the samples + close(file); + return (buildIntIndex(se,seSize,build_options,index)); +} +*/ + +//ticsa *createIntegerCSA(uint **aintVector, uint textSize, char *build_options) { +int buildIntIndex (uint *aintVector, uint n, char *build_options, void **index ){ + uint textSize=n; + intVector = aintVector; //global variable + ticsa *myicsa; + myicsa = (ticsa *) malloc (sizeof (ticsa)); + uint *Psi, *SAI, *C, vocSize; + register uint i, j; + uint nsHUFF; + + parametersCSA(myicsa, build_options); + + nsHUFF=myicsa->tempNSHUFF; + + // Almacenamos o valor dalguns parametros + myicsa->suffixArraySize = textSize; + myicsa->D = (uint*) malloc (sizeof(uint) * ((textSize+31)/32)); + + myicsa->samplesASize = (textSize + myicsa->T_A - 1) / myicsa->T_A + 1; + myicsa->samplesA = (uint *)malloc(sizeof(int) * myicsa->samplesASize); + myicsa->BA = (uint*) malloc (sizeof(uint) * ((textSize+31)/32)); + myicsa->samplesAInvSize = (textSize + myicsa->T_AInv - 1) / myicsa->T_AInv; + myicsa->samplesAInv = (uint *)malloc(sizeof(int) * myicsa->samplesAInvSize); + + // Reservamos espacio para os vectores + Psi = (uint *) malloc (sizeof(uint) * textSize); + + // CONSTRUIMOS A FUNCION C + vocSize = 0; + for(i=0;ivocSize) vocSize = intVector[i]; + C = (uint *) malloc(sizeof(uint) * (vocSize + 1)); // Para contar o 0 terminador + for(i=0;i0;i--) C[i] = C[i-1]; + C[0] = 0; + + // Construimos o array de sufixos (en Psi) - con quicksort + printf("\n\t *BUILDING THE SUFFIX ARRAY over %d integers... (with qsort)", textSize);fflush(stdout); + for(i=0; iBA[i] = 0; + for(i=0; iT_A) bitset(myicsa->BA, SAI[i]); + bitset(myicsa->BA, SAI[textSize-1]); // A ultima posicion sempre muestreada + //printf("TextSize = %d\n", textSize); + myicsa->bBA = createBitmap(myicsa->BA, textSize); + for(i=0,j=0; iBA, i)) myicsa->samplesA[j++] = Psi[i]; + + // ALMACENAMOS AS MOSTRAS DA INVERSA DO ARRAY DE SUFIXOS + for(i=0,j=0;iT_AInv) myicsa->samplesAInv[j++] = SAI[i]; + + // CONSTRUIMOS E COMPRIMIMOS PSI + printf("\n\t Creating compressed Psi..."); + for(i=0;ihcPsi = huffmanCompressPsi(Psi,textSize,myicsa->T_Psi,nsHUFF); + #endif + #ifdef PSI_GONZALO + myicsa->gcPsi = gonzaloCompressPsi(Psi,textSize,myicsa->T_Psi,nsHUFF); + #endif + #ifdef PSI_DELTACODES + myicsa->dcPsi = deltaCompressPsi(Psi,textSize,myicsa->T_Psi); + #endif + free(Psi); + + // Contruimos D + for(i=0;i<((textSize+31)/32);i++) myicsa->D[i] = 0; + for(i=0;i<=vocSize;i++) bitset(myicsa->D, C[i]); + myicsa->bD = createBitmap(myicsa->D,textSize); + free(C); + + // VARIABLE GLOBAL QUE ALMACENA O ESTADO DOS DISPLAYS (IMPORTANTE PARA DISPLAY SECUENCIAL) + // Almacena a última posición do array de sufixos que mostramos con display ou displayNext + // Se nos piden un displayNext, aplicamos PSI sobre esta posición e obtemos a seguinte, + // coa que podemos obter o símbolo pedido, e actualizamos displayState + myicsa->displayCSAState = 0; + myicsa->displayCSAPrevPosition = -2; //needed by DisplayCSA(position) + + aintVector = intVector; + // Liberamos o espacion non necesario + + *index = myicsa; + //return (myicsa); + return 0; +} + + +//Returns number of elements in the indexed sequence of integers +int sourceLenIntIndex(void *index, uint *numInts){ + ticsa *myicsa = (ticsa *) index; + *numInts= myicsa->suffixArraySize; + return 0; //no error; +} + +int saveIntIndex(void *index, char *pathname) { +//void storeStructsCSA(ticsa *myicsa, char *basename) { + + ticsa *myicsa = (ticsa *) index; + char *basename=pathname; + + char *filename; + int file; + + // Reservamos espacio para o nome do ficheiro + filename = (char *)malloc(sizeof(char)*MAX_FILENAME_LENGTH); + + // Ficheiro co n�mero de elementos indexados (enteiros do texto orixinal) + strcpy(filename, basename); + strcat(filename, "."); + strcat(filename, NUMBER_OF_ELEMENTS_FILE_EXT); + unlink(filename); + if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) { + printf("Cannot open file %s\n", filename); + exit(0); + } + write(file, &(myicsa->suffixArraySize), sizeof(int)); + close(file); + + strcpy(filename, basename); + strcat(filename, "."); + strcat(filename, PSI_COMPRESSED_FILE_EXT); + + #ifdef PSI_HUFFMANRLE + storeHuffmanCompressedPsi(&(myicsa->hcPsi), filename); + #endif + #ifdef PSI_GONZALO + storeGonzaloCompressedPsi(&(myicsa->gcPsi), filename); + #endif + #ifdef PSI_DELTACODES + storeDeltaCompressedPsi(&(myicsa->dcPsi), filename); + #endif + + // Ficheiro co vector de bits D + strcpy(filename, basename); + strcat(filename, "."); + strcat(filename, D_FILE_EXT); + unlink(filename); + if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) { + printf("Cannot open file %s\n", filename); + exit(0); + } + write(file, myicsa->D, sizeof(int)*((myicsa->suffixArraySize+31)/32)); + close(file); + + // Directorio de rank para D + // Almacenamos o n�mero de superbloques seguido dos superbloques + // E logo o n�mero de bloques seguido dos bloques + strcpy(filename, basename); + strcat(filename, "."); + strcat(filename, D_RANK_DIRECTORY_FILE_EXT); + saveBitmap(filename,myicsa->bD); + + // Ficheiro coas mostras de A + strcpy(filename, basename); + strcat(filename, "."); + strcat(filename, SAMPLES_A_FILE_EXT); + unlink(filename); + if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) { + printf("Cannot open file %s\n", filename); + exit(0); + } + write(file, myicsa->samplesA, sizeof(int) * (myicsa->samplesASize)); + close(file); + + // Ficheiro co vector BA (marca as posicions de A muestreadas) + strcpy(filename, basename); + strcat(filename, "."); + strcat(filename, BA_FILE_EXT); + unlink(filename); + if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) { + printf("Cannot open file %s\n", filename); + exit(0); + } + write(file, myicsa->BA, sizeof(int)*((myicsa->suffixArraySize+31)/32)); + close(file); + + // Directorio de rank para BA + strcpy(filename, basename); + strcat(filename, "."); + strcat(filename, BA_RANK_DIRECTORY_FILE_EXT); + saveBitmap(filename, myicsa->bBA); + + // Ficheiro coas mostras de A inversa + strcpy(filename, basename); + strcat(filename, "."); + strcat(filename, SAMPLES_A_INV_FILE_EXT); + unlink(filename); + if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) { + printf("Cannot open file %s\n", filename); + exit(0); + } + write(file, myicsa->samplesAInv, sizeof(int) * (myicsa->samplesAInvSize)); + close(file); + + // Ficheiro co periodo de muestreo de A e A inversa + strcpy(filename, basename); + strcat(filename, "."); + strcat(filename, SAMPLING_PERIOD_A_FILE_EXT); + unlink(filename); + if( (file = open(filename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) { + printf("Cannot open file %s\n", filename); + exit(0); + } + write(file, &(myicsa->T_A), sizeof(int)); + write(file, &(myicsa->T_AInv), sizeof(int)); + + write(file, &(myicsa->psiSearchFactorJump), sizeof(uint)); + + close(file); + free(filename); + return 0; //no error. +} + +//Returns the size (in bytes) of the index over the sequence of integers. +//uint CSA_size(ticsa *myicsa) { +int sizeIntIndex(void *index, uint *numBytes) { + ticsa *myicsa = (ticsa *) index; + uint size = 0; + size +=(sizeof (ticsa) * 1); + size += sizeof(uint)*((myicsa->suffixArraySize+31)/32) ; //D vector + size += myicsa->bD->mem_usage; + size += sizeof(uint) * myicsa->samplesASize ; // samples A + size += sizeof(uint) * myicsa->samplesAInvSize ; // samples A^{-1} + size += sizeof(uint)*((myicsa->suffixArraySize+31)/32) ; //BA vector + size += myicsa->bBA->mem_usage; + #ifdef PSI_HUFFMANRLE + size +=myicsa->hcPsi.totalMem; + #endif + #ifdef PSI_GONZALO + size +=myicsa->gcPsi.totalMem; + #endif + #ifdef PSI_DELTACODES + size +=myicsa->dcPsi.totalMem; + #endif + *numBytes = size; + return 0; //no error. +} + + +//ticsa *loadCSA(char *basename) { +int loadIntIndex(char *pathname, void **index){ + + char *basename=pathname; + char *filename; + int file; + uint length; + char c; + char *word; + struct stat f_stat; + uint suffixArraySize; + + ticsa *myicsa; + myicsa = (ticsa *) malloc (sizeof (ticsa) * 1); + + + // VARIABLE GLOBAL QUE ALMACENA O ESTADO DOS DISPLAYS (IMPORTANTE PARA DISPLAY SECUENCIAL) + // Almacena a �ltima posici�n do array de sufixos que mostramos con display ou displayNext + // Se nos piden un displayNext, aplicamos PSI sobre esta posici�n e obtemos a seguinte, + // coa que podemos obter o s�mbolo pedido, e actualizamos displayState + myicsa->displayCSAState = 0; + myicsa->displayCSAPrevPosition = -2; //needed by DisplayCSA(position) + + // Reservamos espacio para o nome do ficheiro + filename = (char *)malloc(sizeof(char)*MAX_FILENAME_LENGTH); + + // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA O NUMERO DE ELEMENTOS INDEXADOS + strcpy(filename, basename); + strcat(filename, "."); + strcat(filename, NUMBER_OF_ELEMENTS_FILE_EXT); + if( (file = open(filename, O_RDONLY)) < 0) { + printf("Cannot read file %s\n", filename);exit(0); + } + read(file, &suffixArraySize, sizeof(uint)); + myicsa->suffixArraySize = suffixArraySize; + printf("Number of indexed elements (suffix array size) = %d\n", suffixArraySize); + + // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA PSI COMPRIMIDA + strcpy(filename, basename); + strcat(filename, "."); + strcat(filename, PSI_COMPRESSED_FILE_EXT); + #ifdef PSI_HUFFMANRLE + myicsa->hcPsi = loadHuffmanCompressedPsi(filename); + #endif + #ifdef PSI_GONZALO + myicsa->gcPsi = loadGonzaloCompressedPsi(filename); + #endif + #ifdef PSI_DELTACODES + myicsa->dcPsi = loadDeltaCompressedPsi(filename); + #endif + + // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA D + strcpy(filename, basename); + strcat(filename, "."); + strcat(filename, D_FILE_EXT); + if( (file = open(filename, O_RDONLY)) < 0) { + printf("Cannot read file %s\n", filename); exit(0); + } + myicsa->D = (uint *) malloc (sizeof(uint)*((suffixArraySize+31)/32)); + read(file, myicsa->D, sizeof(uint)*((suffixArraySize+31)/32)); + printf("Bit vector D loaded (%d bits)\n", suffixArraySize); + + // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA O DIRECTORIO DE RANK1 PARA D + strcpy(filename, basename); + strcat(filename, "."); + strcat(filename, D_RANK_DIRECTORY_FILE_EXT); + myicsa->bD = loadBitmap(filename,myicsa->D,suffixArraySize); + { uint ns, nb; + ns = myicsa->bD->sSize; + nb = myicsa->bD->bSize; + myicsa->bD->data = myicsa->D; + printf("Rank1 Directory for D loaded (%d superblocks, %d blocks)\n", ns, nb); + } + + // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA SAMPLES A + strcpy(filename, basename); + strcat(filename, "."); + strcat(filename, SAMPLES_A_FILE_EXT); + if( (file = open(filename, O_RDONLY)) < 0) { + printf("Cannot read file %s\n", filename); exit(0); + } + if( fstat(file, &f_stat) < 0) { + printf("Cannot read information from file %s\n", filename); exit(0); + } + myicsa->samplesASize = (f_stat.st_size)/sizeof(uint); + myicsa->samplesA = (uint *)malloc(sizeof(uint) * myicsa->samplesASize); + read(file, myicsa->samplesA, sizeof(uint) * myicsa->samplesASize); + printf("Suffix array samples loaded (%d samples)\n", myicsa->samplesASize); + + // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA BA + strcpy(filename, basename); + strcat(filename, "."); + strcat(filename, BA_FILE_EXT); + if( (file = open(filename, O_RDONLY)) < 0) { + printf("Cannot read file %s\n", filename); exit(0); + } + myicsa->BA = (uint *) malloc (sizeof(uint)*((suffixArraySize+31)/32)); + read(file, myicsa->BA, sizeof(uint)*((suffixArraySize+31)/32)); + printf("Bit vector BA loaded (%d bits)\n", suffixArraySize); + + // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA O DIRECTORIO DE RANK1 PARA BA + strcpy(filename, basename); + strcat(filename, "."); + strcat(filename, BA_RANK_DIRECTORY_FILE_EXT); + myicsa->bBA = loadBitmap(filename,myicsa->BA,suffixArraySize); + { uint ns, nb; + ns = myicsa->bBA->sSize; + nb = myicsa->bBA->bSize; + myicsa->bBA->data = myicsa->BA; + printf("Rank1 Directory for BA loaded (%d superblocks, %d blocks)\n", ns, nb); + } + + // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA SAMPLES A INVERSA + strcpy(filename, basename); + strcat(filename, "."); + strcat(filename, SAMPLES_A_INV_FILE_EXT); + if( (file = open(filename, O_RDONLY)) < 0) { + printf("Cannot read file %s\n", filename); exit(0); + } + if( fstat(file, &f_stat) < 0) { + printf("Cannot read information from file %s\n", filename); exit(0); + } + myicsa->samplesAInvSize = (f_stat.st_size)/(sizeof(uint)); + myicsa->samplesAInv = (uint *)malloc(sizeof(uint) * myicsa->samplesAInvSize); + read(file, myicsa->samplesAInv, sizeof(uint) * myicsa->samplesAInvSize); + printf("Suffix array inverse samples loaded (%d samples)\n", myicsa->samplesAInvSize); + + // LEEMOS OS DATOS DO FICHEIRO QUE ALMACENA O PERIODO DE MUESTREO DO ARRAY DE SUFIXOS E DA INVERSA + strcpy(filename, basename); + strcat(filename, "."); + strcat(filename, SAMPLING_PERIOD_A_FILE_EXT); + if( (file = open(filename, O_RDONLY)) < 0) { + printf("Cannot read file %s\n", filename); exit(0); + } + read(file, &(myicsa->T_A), sizeof(uint)); + read(file, &(myicsa->T_AInv), sizeof(uint)); + printf("Sampling A Period T = %d, Sampling A inv Period TInv = %d\n", myicsa->T_A, myicsa->T_AInv); + + read(file, &(myicsa->psiSearchFactorJump), sizeof(uint)); + printf("Psi Bin Search Factor-Jump is = %d\n", myicsa->psiSearchFactorJump); + + close(file); + free(filename); + + //return myicsa; + *index = myicsa; + return 0; +} + + +//uint destroyStructsCSA(ticsa *myicsa) { +int freeIntIndex(void *index) { + ticsa *myicsa = (ticsa *) index; + // Liberamos o espacio reservado + + if (!myicsa) return 0; + + uint total=0, totaltmp=0; + + uint nbytes;sizeIntIndex(index, &nbytes); + + total +=(sizeof (ticsa) * 1);; + + #ifdef PSI_HUFFMANRLE + total+= totaltmp = myicsa->hcPsi.totalMem; + destroyHuffmanCompressedPsi(&(myicsa->hcPsi)); + #endif + #ifdef PSI_GONZALO + total+= totaltmp = myicsa->gcPsi.totalMem; + destroyGonzaloCompressedPsi(&(myicsa->gcPsi)); + #endif + #ifdef PSI_DELTACODES + total+= totaltmp = myicsa->dcPsi.totalMem; + destroyDeltaCodesCompressedPsi(&(myicsa->dcPsi)); + #endif + printf("\n\t[destroying SA: compressed PSI structure] ...Freed %u bytes... RAM",totaltmp); + + free(myicsa->D); total+= totaltmp = (sizeof(uint)*((myicsa->suffixArraySize+31)/32)); + printf("\n\t[destroying SA: D vector] ...Freed %u bytes... RAM",totaltmp); + free(myicsa->samplesA); total+= totaltmp = (sizeof(uint) * myicsa->samplesASize); + printf("\n\t[destroying Samples A: A ] ...Freed %u bytes... RAM",totaltmp); + free(myicsa->samplesAInv); total+= totaltmp = (sizeof(uint) * myicsa->samplesAInvSize); + printf("\n\t[destroying Samples AInv: A ] ...Freed %u bytes... RAM",totaltmp); + printf("\n\t[destroying rank bit D ] ...Freed %u bytes... RAM",myicsa->bD->mem_usage); + free(myicsa->BA); total+= totaltmp = (sizeof(uint)*((myicsa->suffixArraySize+31)/32)); + printf("\n\t[destroying SA: BA vector] ...Freed %u bytes... RAM",totaltmp); + + total += myicsa->bD->mem_usage; + destroyBitmap(myicsa->bD); + total += myicsa->bBA->mem_usage; + destroyBitmap(myicsa->bBA); + + printf("\n\t**** [the whole iCSA ocuppied ... %u bytes... RAM",total); + printf("\n\t**** iCSA size = %d bytes ",nbytes); + printf("\n"); + + free(myicsa); + + return 0; //no error. +} + + // Shows detailed summary info of the self-index (memory usage of each structure) +int printInfoIntIndex(void *index, const char tab[]) { + ticsa *myicsa = (ticsa *) index; + if (!myicsa) return 0; + + uint structure, totalpsi, totalD, totalBD, totalSA,totalSAinv, totalBA,totalBBA; + + structure=sizeof(ticsa); + + #ifdef PSI_HUFFMANRLE + totalpsi = myicsa->hcPsi.totalMem; + #endif + #ifdef PSI_GONZALO + totalpsi = myicsa->gcPsi.totalMem; + #endif + #ifdef PSI_DELTACODES + totalpsi = myicsa->dcPsi.totalMem; + #endif + + totalD = (sizeof(uint)*((myicsa->suffixArraySize+31)/32)); + totalBD = myicsa->bD->mem_usage; + totalSA = (sizeof(uint) * myicsa->samplesASize); + totalSAinv = (sizeof(uint) * myicsa->samplesAInvSize); + totalBA = (sizeof(uint)*((myicsa->suffixArraySize+31)/32)); + totalBBA = myicsa->bBA->mem_usage; + + uint nbytes; sizeIntIndex(index, &nbytes); //whole self-index + + printf("\n ===================================================:"); + printf("\n%sSummary Self-index on integers (icsa) layer:",tab); + printf("\n%s icsa structure = %d bytes",tab, structure); + printf("\n%s psi = %8d bytes",tab, totalpsi); + printf("\n%s D (bitmap) = %8d bytes",tab, totalD); + printf("\n%s rank for D = %8d bytes",tab, totalBD); + printf("\n%s SA(sampled) = %8d bytes",tab, totalSA); + printf("\n%s SAinv(samp) = %8d bytes",tab, totalSAinv); + printf("\n%s BA (bitmap) = %8d bytes",tab, totalBA); + printf("\n%s rank for BA = %8d bytes",tab, totalBBA); + printf("\n%sTotal = ** %9d bytes (in RAM) ** ",tab, nbytes); + printf("\n"); + + return 0; //no error. +} + + +// OPERACIONS DO CSA + +// BUSCA BINARIA SOBRE MOSTRAS + 2 BUSCAS EXPONENCIAIS + 2 BUSCAS BINARIAS +// 1º Busca binaria sobre o array de sufixos, elexindo como pivote un múltiplo de bin_search_psi_skip_interval (que orixinalmente foi pensado para igualar co valor de Psi). +// 2º Esta busca pode deterse por: +// a) O pivote repítese entre dúas iteracións -> As ocorrencias están entre o pivote e a seguinte mostra (pivote + bin_search_psi_skip_interval) -> facemos dúas buscas binarias +// b) O pivote é unha ocorrencia do patrón -> Faise unha busca exponencial sobre mostras hacia a esquerda e outra hacia a dereita, ata atopar a unha mostra á esquerda e outra +// á dereita do intervalo de ocorrencias. Entre cada unha destas e a inmediatamente anterior da busca exponencial, faise unha busca binaria para atopar os extremos do intervalo. + +int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right){ + //unsigned int countCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right) { + + uint patternSize = length; + ticsa *myicsa = (ticsa *) index; + + register unsigned long l, r, i; + register long comp, p, previousP; + //register unsigned int l, r, i; + //register int comp, p, previousP; + register uint bin_search_psi_skip_interval = myicsa->psiSearchFactorJump; + + //fprintf(stderr,"\n psiSearchFactor = %d",myicsa->psiSearchFactorJump); + + l = 0; + r = (myicsa->suffixArraySize+bin_search_psi_skip_interval-2)/bin_search_psi_skip_interval*bin_search_psi_skip_interval; + p = ((l+r)/2)/bin_search_psi_skip_interval * bin_search_psi_skip_interval; + previousP = 0; + + // BUSCA BINARIA SOBRE MOSTRAS + while( (p != previousP) && (comp = SadCSACompare(myicsa, pattern, patternSize, p)) ) { + if(comp > 0) l = p; + else r = p; + previousP = p; + p = ((l+r)/2)/bin_search_psi_skip_interval*bin_search_psi_skip_interval; + } + + if(p==previousP) { + + // BUSCA BINARIA ENTRE O PIVOTE E A SEGUINTE MOSTRA + l = previousP; + r = previousP+bin_search_psi_skip_interval; + if(r > myicsa->suffixArraySize) r = myicsa->suffixArraySize - 1; + while(l < r) { + p = (l+r)/2; + if(SadCSACompare(myicsa, pattern, patternSize, p) <= 0) r = p; + else l = p+1; + } + + if(SadCSACompare(myicsa, pattern, patternSize, r)) { + *left = l; + *right = r; + //return 0; + *numocc = 0; + return 0; //no error. + } + *left = r; + + l = previousP; + r = previousP+bin_search_psi_skip_interval; + if(r > myicsa->suffixArraySize) r = myicsa->suffixArraySize - 1; + while(l < r) { + p = (l+r+1)/2; + if(SadCSACompare(myicsa, pattern, patternSize, p) >= 0) l = p; + else r = p-1; + } + *right = l; + + } else { + + previousP = p; // En previousP poñemos o p atopado na busca sobre as mostras + + // BUSCA EXPONENCIAL HACIA ATRÁS + i = 1; + p -= bin_search_psi_skip_interval; + while(p>0 && !SadCSACompare(myicsa, pattern, patternSize, p)) { + i<<=1; + p = previousP-(i*bin_search_psi_skip_interval); + } + // Busca binaria entre as duas ultimas mostras da exponencial + if(previousP > i*bin_search_psi_skip_interval) l = previousP-(i*bin_search_psi_skip_interval); + else l=0; + i>>=1; + r = previousP-(i*bin_search_psi_skip_interval); + while(l < r) { + p = (l+r)/2; + if(SadCSACompare(myicsa, pattern, patternSize, p) <= 0) r = p; + else l = p+1; + } + *left = r; + + // BUSCA EXPONENCIAL HACIA ADIANTE + i = 1; + p = previousP+bin_search_psi_skip_interval; + while(psuffixArraySize && !SadCSACompare(myicsa, pattern, patternSize, p)) { + i<<=1; + p = previousP+(i*bin_search_psi_skip_interval); + } + // Busca binaria entre as duas ultimas mostras da exponencial + if(p < myicsa->suffixArraySize) r = previousP+(i*bin_search_psi_skip_interval); + else r = myicsa->suffixArraySize-1; + i>>=1; + l = previousP+(i*bin_search_psi_skip_interval); + while(l < r) { + p = (l+r+1)/2; + if(SadCSACompare(myicsa, pattern, patternSize, p) >= 0) l = p; + else r = p-1; + } + *right = l; + } + + //return *right-*left+1; + *numocc = (uint) *right-*left+1; + return 0; //no error +} + +// Version inicial de busca binaria +unsigned int countCSABin(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right) { + register ulong l, r, p; + + l = 0; + r = myicsa->suffixArraySize-1; + + while(l < r) { + p = (l+r)/2; + if(SadCSACompare(myicsa, pattern, patternSize, p) <= 0) r = p; + else l = p+1; + } + + // SE SON DISTINTOS O PATRON NON APARECE NO TEXTO E DEVOLVEMOS 0 + if(SadCSACompare(myicsa, pattern, patternSize, r)) { + *left = l; + *right = r; + return 0; + } + + // Almacenamos o limite esquerdo + *left = r; + + // SE SON IGUALES (O PATRON APARECE NO TEXTO), BUSCAMOS AGORA O LIMITE DEREITO, QUE ALMACENAREMOS EN right + // NOTA: INICIAMOS A BUSQUEDA A PARTIR DO ESQUERDO... + l = r; + r = myicsa->suffixArraySize-1; + + while(l < r) { + p = (l+r+1)/2; + if(SadCSACompare(myicsa, pattern, patternSize, p) >= 0) l = p; + else r = p-1; + } + + // Gardamos o limite dereito + *right = l; + + return (uint) *right-*left+1; +} + +int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong *numocc) { + //unsigned int *locateCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *occ) { + + ticsa *myicsa = (ticsa *) index; + uint patternSize = length; + ulong *positions; + ulong occurrences; + register ulong left, right; + + //occurrences = countCSA(myicsa, pattern, patternSize, &left, &right); + int err; + err = countIntIndex((void *) myicsa, pattern, patternSize, &occurrences, &left, &right); + *numocc = occurrences; + + if (occurrences) { + register ulong idx = 0; + positions = (ulong*) malloc(sizeof(ulong) * occurrences); + while(left<=right) positions[idx++] = A(myicsa,left++); + + *occ = positions; + return 0; + //return positions; + } + + (*occ)=NULL; + return 0; //no error, but no occurrences. + + //return NULL; +} + +// Devolve o enteiro do texto que ocupa a posicion dada, +// e fixa o estado para poder seguir obtendo os seguintes enteiros con displayNext(); + +int displayIntIndex(void *index, ulong position, uint *value){ + //uint displayCSA(ticsa *myicsa, uint position) { + ticsa *myicsa = (ticsa *) index; + if (position == (myicsa->displayCSAPrevPosition +1)) { + myicsa->displayCSAPrevPosition = position; + //return displayCSANext(myicsa); + *value = displayCSANext(myicsa); + } + else { + myicsa->displayCSAPrevPosition = position; + //return displayCSAFirst(myicsa, position); + *value = displayCSAFirst(myicsa, position); + } + return 0; //no error +} + + +/**********************************************************************/ + +// Devolve o enteiro do texto que ocupa a posicion dada, e fixa o estado +// para poder seguir obtendo os seguintes enteiros con displayNext(); +uint displayCSAFirst(ticsa *myicsa, uint position) { + + register uint positionAux, index; + register uint T_AInv = myicsa->T_AInv; + + positionAux = myicsa->samplesAInv[position / T_AInv]; + for(index = 0; index < position%T_AInv; index++) { + #ifdef PSI_HUFFMANRLE + positionAux=getHuffmanPsiValue(&(myicsa->hcPsi),positionAux); + #endif + #ifdef PSI_GONZALO + positionAux=getGonzaloPsiValue(&(myicsa->gcPsi),positionAux); + #endif + #ifdef PSI_DELTACODES + positionAux=getDeltaPsiValue(&(myicsa->dcPsi),positionAux); + #endif + } + + // Fijamos a variable global para a chamada a displayNext + myicsa->displayCSAState = positionAux; + + // return rank1(D, Dir, positionAux) - 1; + return rank(myicsa->bD, positionAux) - 1; +} + + +// Devolve o seguinte enteiro do texto (a partir do estado) +unsigned int displayCSANext(ticsa *myicsa) { + #ifdef PSI_HUFFMANRLE + myicsa->displayCSAState=getHuffmanPsiValue(&(myicsa->hcPsi),myicsa->displayCSAState); + #endif + #ifdef PSI_GONZALO + myicsa->displayCSAState = getGonzaloPsiValue(&(myicsa->gcPsi),myicsa->displayCSAState); + #endif + #ifdef PSI_DELTACODES + myicsa->displayCSAState = getDeltaPsiValue(&(myicsa->dcPsi),myicsa->displayCSAState); + #endif + return rank(myicsa->bD, myicsa->displayCSAState) - 1; +} + + +// Mostra as estructuras creadas +void showStructsCSA(ticsa *myicsa) { + + unsigned int index; + + // ESTRUCTURAS PARA CSA + printf("Basic CSA structures:\n\n"); + + // VALORES DA FUNCI�N PSI (decodificando) + printf("\tPSI: (Sampling period = %d)\n", myicsa->T_Psi); + for(index=0; index < myicsa->suffixArraySize; index++) + #ifdef PSI_HUFFMANRLE + printf("\tPsi[%d] = %d\n", index, getHuffmanPsiValue(&(myicsa->hcPsi),index)); + #endif + #ifdef PSI_GONZALO + printf("\tPsi[%d] = %d\n", index, getGonzaloPsiValue(&(myicsa->gcPsi),index)); + #endif + #ifdef PSI_DELTACODES + printf("\tPsi[%d] = %d\n", index, getDeltaPsiValue(&(myicsa->dcPsi),index)); + #endif + printf("\n"); + + // VECTOR D DE SADAKANE CO DIRECTORIO DE RANK ASOCIADO + printf("\tD = "); + showBitVector(myicsa->D, myicsa->suffixArraySize); + printf("\n\nSuperbloques de D:\n"); + { uint ns; + uint nb; + ns = myicsa->bD->sSize; + nb= myicsa->bD->bSize; + for(index=0; indexbD->sdata[index]); + } + printf("\nBloques de D:\n"); + + for(index=0; indexbD->bdata[index]); + } + printf("\n\n"); + } + // ESTRUCTURAS PARA ACCEDER O ARRAY DE SUFIXOS E A SUA INVERSA + printf("Suffix Array Sampling Structures: (Sampling period = %d)\n", myicsa->T_A); + printf("\tSuffix Array Samples:\n"); + for(index=0; index < myicsa->samplesASize; index++) + printf("\tSamplesA[%d] = %d\n", index, myicsa->samplesA[index]); + printf("\n"); + printf("\tInverse Suffix Array Samples:\n"); + for(index=0; index < myicsa->samplesASize; index++) + printf("\tSamplesAInv[%d] = %d\n", index, myicsa->samplesAInv[index]); + printf("\n"); + +} + + +// Comparacion de Sadakane entre un patron (pattern) y el sufijo en la posicion p del array de sufijos +// IMPORTANTE EVITAR ULTIMA CHAMADA A PSI +int SadCSACompare(ticsa *myicsa, uint *pattern, uint patternSize, uint p) { + + register unsigned int j, i, currentInteger, diff; + + i = p; + j = 0; + + while(1) { + currentInteger = rank(myicsa->bD, i) - 1; + diff = pattern[j++] - currentInteger; + if(diff) return diff; + if(j == patternSize) return 0; + else + #ifdef PSI_HUFFMANRLE + i=getHuffmanPsiValue(&(myicsa->hcPsi),i); + #endif + #ifdef PSI_GONZALO + i=getGonzaloPsiValue(&(myicsa->gcPsi),i); + #endif + #ifdef PSI_DELTACODES + i=getDeltaPsiValue(&(myicsa->dcPsi),i); + #endif + } + +} + + +// Acceso a array de sufixos A +inline uint A(ticsa *myicsa, uint position) { + + register uint timesPsi, sampleValue; + register uint T_A = myicsa->T_A; + + uint proba = position; + + timesPsi = 0; + while(!bitget(myicsa->BA, position)) { + + #ifdef PSI_HUFFMANRLE + position=getHuffmanPsiValue(&(myicsa->hcPsi),position); + #endif + #ifdef PSI_GONZALO + position=getGonzaloPsiValue(&(myicsa->gcPsi),position); + #endif + #ifdef PSI_DELTACODES + position=getDeltaPsiValue(&(myicsa->dcPsi),position); + #endif + timesPsi++; + + } + sampleValue = myicsa->samplesA[rank(myicsa->bBA, position)-1]; + + return sampleValue - timesPsi; + +} + + +// Acceso 'a inversa do array de sufixos +inline uint inverseA(ticsa *myicsa, uint offset) { + + register uint index, inverseValue; + register uint T_AInv = myicsa->T_AInv; + + inverseValue = myicsa->samplesAInv[offset/T_AInv]; + for(index=0; index<(offset%T_AInv); index++) + #ifdef PSI_HUFFMANRLE + inverseValue=getHuffmanPsiValue(&(myicsa->hcPsi),inverseValue); + #endif + #ifdef PSI_GONZALO + inverseValue = getGonzaloPsiValue(&(myicsa->gcPsi),inverseValue); + #endif + #ifdef PSI_DELTACODES + inverseValue = getDeltaPsiValue(&(myicsa->dcPsi),inverseValue); + #endif + return inverseValue; + +} + +// Initializes the parameters of the index. +uint parametersCSA(ticsa *myicsa, char *build_options){ + char delimiters[] = " =;"; + int j,num_parameters; + char ** parameters; + int ssA,ssAinv,ssPsi,nsHuff,psiSearchFactor; + + ssA = DEFAULT_A_SAMPLING_PERIOD; + ssAinv = DEFAULT_A_INV_SAMPLING_PERIOD; + ssPsi = DEFAULT_PSI_SAMPLING_PERIOD; + nsHuff = DEFAULT_nsHUFF; + psiSearchFactor = DEFAULT_PSI_BINARY_SEARCH_FACTOR; + + if (build_options != NULL) { + parse_parameters(build_options,&num_parameters, ¶meters, delimiters); + for (j=0; jT_A = ssA; + myicsa->T_AInv = ssAinv; + myicsa->T_Psi = ssPsi; + myicsa->tempNSHUFF = nsHuff; + myicsa->psiSearchFactorJump = psiSearchFactor * ssPsi; + + printf("\n\t parameters for iCSA: sampleA=%d, sampleAinv=%d, samplePsi=%d",ssA,ssAinv,ssPsi); + printf("\n\t : nsHuff=%d, psiSearchFactor = %d --> jump = %d", nsHuff,psiSearchFactor, myicsa->psiSearchFactorJump); +} diff --git a/swcsa/intIndex/icsa.h b/swcsa/intIndex/icsa.h new file mode 100644 index 0000000..102832c --- /dev/null +++ b/swcsa/intIndex/icsa.h @@ -0,0 +1 @@ +#include #include #include #include #include #include "defValues.h" #include "../utils/bitmap.h" #include "../utils/huff.h" #include "../utils/parameters.h" #ifdef PSI_HUFFMANRLE #include "psiHuffmanRLE.h" #endif #ifdef PSI_GONZALO #include "psiGonzalo.h" #endif #ifdef PSI_DELTACODES #include "psiDeltaCode.h" #endif typedef struct { uint suffixArraySize; uint T_Psi; uint *D; bitmap bD; uint T_A; uint T_AInv; uint *samplesA; uint samplesASize; uint *BA; bitmap bBA; uint *samplesAInv; uint samplesAInvSize; uint displayCSAState; int displayCSAPrevPosition; #ifdef PSI_HUFFMANRLE HuffmanCompressedPsi hcPsi; #endif #ifdef PSI_GONZALO GonzaloCompressedPsi gcPsi; #endif #ifdef PSI_DELTACODES DeltaCompressedPsi dcPsi; #endif //only needed during "parse_parameters". uint tempNSHUFF; uint psiSearchFactorJump; //factor of the T_Psi value. } ticsa; // FUNCTION PROTOTYPES: BUILDING THE INDEX //Creates the ICSA int buildIntIndex (uint *intVector, uint n, char *build_options, void **index ); //ticsa *createIntegerCSA (uint **aintVector, uint SAsize, char *build_options); //Returns number of elements in the indexed sequence of integers int sourceLenIntIndex(void *index, uint *numInts); //Save the index to disk int saveIntIndex(void *index, char *pathname); //void storeStructsCSA(ticsa *myicsa, char *basename); // Loads the index from disk. int loadIntIndex(char *pathname, void **index); //ticsa *loadCSA(char *basename); // Frees memory int freeIntIndex(void *index); //uint destroyStructsCSA(ticsa *myicsa); //Returns the size (in bytes) of the index over the sequence of integers. int sizeIntIndex(void *index, uint *numBytes); //uint CSA_size(ticsa *myicsa); // Shows detailed summary info of the self-index (memory usage of each structure) int printInfoIntIndex(void *index, const char tab[]); //Number of occurrences of the pattern, and the interval [left,right] in the suffix array. int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right); //uint countCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right); // Exponential search //uint countCSABin(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right); // Binary search // Returns an array with integers corresponding offsets to the occurrences of the pattern, // as well as the number of occurrences int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong *numocc); //uint *locateCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *occ); //Returns the value of the source (array of integers) at a given offset. // (that is, the element "position" from the original array of uints) int displayIntIndex(void *index, ulong position, uint *value); //uint displayCSA(ticsa *myicsa, uint position); /* Private function prototypes ********************************************/ uint parametersCSA(ticsa *myicsa, char *build_options); uint displayCSAFirst(ticsa *myicsa, uint position); uint displayCSANext(ticsa *myicsa); int SadCSACompare(ticsa *myicsa, uint *pattern, uint patternSize, uint p); uint A(ticsa *myicsa, uint position); uint inverseA(ticsa *myicsa, uint offset); void showStructsCSA(ticsa *myicsa); // For Debugging \ No newline at end of file diff --git a/swcsa/intIndex/interfaceIntIndex.h b/swcsa/intIndex/interfaceIntIndex.h new file mode 100644 index 0000000..5000f18 --- /dev/null +++ b/swcsa/intIndex/interfaceIntIndex.h @@ -0,0 +1,35 @@ + +// FUNCTION PROTOTYPES: SELF-INDEX ON INTEGERS. + +int buildIntIndex (uint *intVector, uint n, char *build_options, void **index ); + + //Saves the index to disk +int saveIntIndex(void *index, char *pathname); + + //Returns number of elements in the indexed sequence of integers +int sourceLenIntIndex(void *index, uint *numInts); + + //Loads the index from disk. +int loadIntIndex(char *pathname, void **index); + + //Frees the memory allocated to the int_index +int freeIntIndex(void *index); + + //Returns the size (in bytes) of the index over the sequence of integers. +int sizeIntIndex(void *index, uint *numBytes); + + // Shows detailed summary info of the self-index (memory usage of each structure) +int printInfoIntIndex(void *index, const char tab[]); + + //Number of occurrences of the pattern, and the interval [left,right] in the suffix array +int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right); + + //returns an array with integers corresponding offsets to the occurrences of the pattern, + //as well as the number of occurrences +int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong *numocc); + + //Returns the value of the source (array of integers) at a given offset. + // (that is, the element "position" from the original array of uints) +int displayIntIndex(void *index, ulong position, uint *value); + + diff --git a/swcsa/intIndex/psiDeltaCode.c b/swcsa/intIndex/psiDeltaCode.c new file mode 100644 index 0000000..04a4f91 --- /dev/null +++ b/swcsa/intIndex/psiDeltaCode.c @@ -0,0 +1,226 @@ +#include "psiDeltaCode.h" + +void destroyDeltaCodesCompressedPsi(DeltaCompressedPsi *compressedPsi) { + free(compressedPsi->deltaCodes); + free(compressedPsi->samples); + free(compressedPsi->pointers); +} + + +DeltaCompressedPsi deltaCompressPsi(unsigned int *Psi, unsigned int psiSize, unsigned int T) { + + DeltaCompressedPsi cPsi; + + int numberOfSamples; + register int diff, deltaCodesPos; + register unsigned int k, p, aux, diffpositive, code, index; + unsigned int samplesIndex, codeLenght, currentInput, wordsDeltaCodes, totalSize; + unsigned int *deltaCodes; + unsigned int *samples; + unsigned int *pointers; + + // Auxiliar para deltaCodes (estimamos como espacio maximo o do array de sufixos) + unsigned int *deltaCodesAux; + + // Calculamos o mellor valor para negativeGap <= 64 + unsigned int negativeGap; + register unsigned int maxNegativeBits = 0; + k = psiSize; + while(k) { + k >>= 1; + maxNegativeBits++; + } + if(maxNegativeBits<=26) negativeGap = 64; + else negativeGap = 1<<(32-maxNegativeBits); + + // Reservamos espacio para as estructuras + numberOfSamples = (psiSize + T - 1) / T; + samples = (unsigned int *)malloc(sizeof(int)*numberOfSamples); + pointers = (unsigned int *)malloc(sizeof(int)*numberOfSamples); + + deltaCodesAux = (unsigned int *)malloc(sizeof(int)*psiSize); + for(index=0; index0) diffpositive = (negativeGap*diff-1)/(negativeGap-1); + else diffpositive = -negativeGap*diff; + + k = 0; + aux = diffpositive; + while(aux) { + aux >>= 1; + k++; + } + aux = k; + p = 0; + while(aux) { + aux >>= 1; + p++; + } + + code = diffpositive & ((1<<(k-1))-1); + codeLenght = 2*p+k-2; + + // Primeiro metemos os p-1 0's iniciais + deltaCodesPos += p-1; + + // Agora metemos os p bits de k + if( ((deltaCodesPos%32) + p) > 32 ) { + deltaCodesAux[deltaCodesPos/32] |= (k>>((deltaCodesPos%32)+p-32)); + deltaCodesAux[deltaCodesPos/32+1] = (k<<(64-(deltaCodesPos%32)-p)); + } else { + deltaCodesAux[deltaCodesPos/32] |= (k<<(32-p-(deltaCodesPos%32))); + } + deltaCodesPos += p; + + // Por �ltimo metemos os k-1 bits de code (sen o 1 inicial) + if( ((deltaCodesPos%32) + (k-1)) > 32 ) { + deltaCodesAux[deltaCodesPos/32] |= (code>>((deltaCodesPos%32)+(k-1)-32)); + deltaCodesAux[deltaCodesPos/32+1] = (code<<(64-(deltaCodesPos%32)-(k-1))); + } else { + deltaCodesAux[deltaCodesPos/32] |= (code<<(32-(k-1)-(deltaCodesPos%32))); + } + deltaCodesPos += k-1; + + } else { + samples[samplesIndex] = Psi[index]; + pointers[samplesIndex++] = deltaCodesPos; + currentInput = Psi[index]; + } + + } + + // Ahora que xa sabemos o espacio necesario para os deltaCodes, reservamolo e liberamos a estructura auxiliar + wordsDeltaCodes = (deltaCodesPos+31)/32; + deltaCodes = (unsigned int *)malloc(sizeof(int)*wordsDeltaCodes); + for(index=0;indexsamples[position/cPsi->T]; + pointer = cPsi->pointers[position/cPsi->T]; + + // Calculamos o numero de codigos a decodificar a partir da mostra + toDecode = position % cPsi->T; + + while(toDecode--) { + + // Collemos o n�mero ceros iniciais + // Po�emos o inicio do c�digo nun enteiro (code) alineado a esquerda + // Non importa que non colla todo o c�digo, pero si temos asegurado que + // colle p e k (k<=32 (6bits), p<=5bits) + code = (cPsi->deltaCodes[pointer/32] << (pointer%32)) | + ((pointer%32 != 0) * (cPsi->deltaCodes[pointer/32+1] >> (32-(pointer%32)))); + + //Ahora contamos o n�mero de ceros (p) que hai nas posicions da esquerda de code + p = 1; + while(!(code & 0x80000000)) { + code <<= 1; + p++; + } + + // Ahora calculamos o numero de digitos da representacion binaria do codigo (k) + k = code >> (32-p); + + // Actualizamos o punteiro global sobre deltaCodes + pointer += 2*p-1; + + // Po�emos a representacion binaria do codigo nun enteiro (code) alineado a esquerda + code = (cPsi->deltaCodes[pointer/32] << (pointer%32)) | + ((pointer%32 != 0) * (cPsi->deltaCodes[pointer/32+1] >> (32-(pointer%32)))); + code = ((code >> 1) | 0x80000000) >> (32-k); + pointer += k-1; + + // Bixecci�n + if(code % cPsi->negativeGap) result += (code - (code/cPsi->negativeGap)); + else result -= code/cPsi->negativeGap; + + } + + return result; + +} + + +void storeDeltaCompressedPsi(DeltaCompressedPsi *compressedPsi, char *filename) { + + int file; + + if( (file = open(filename, O_WRONLY|O_CREAT, 0700)) < 0) { + printf("Cannot open file %s\n", filename); + exit(0); + } + write(file, &(compressedPsi->T), sizeof(int)); + write(file, &(compressedPsi->negativeGap), sizeof(int)); + write(file, &(compressedPsi->deltaCodesSize), sizeof(int)); + write(file, compressedPsi->deltaCodes, compressedPsi->deltaCodesSize*sizeof(int)); + write(file, &(compressedPsi->numberOfSamples), sizeof(int)); + write(file, compressedPsi->samples, compressedPsi->numberOfSamples*sizeof(int)); + write(file, compressedPsi->pointers, compressedPsi->numberOfSamples*sizeof(int)); + write(file, &(compressedPsi->totalMem), sizeof(int)); + + close(file); + +} + + +DeltaCompressedPsi loadDeltaCompressedPsi(char *filename) { + + DeltaCompressedPsi compressedPsi; + + int file; + + if( (file = open(filename, O_RDONLY)) < 0) { + printf("Cannot read file %s\n", filename); + exit(0); + } + read(file, &(compressedPsi.T), sizeof(int)); + read(file, &(compressedPsi.negativeGap), sizeof(int)); + read(file, &(compressedPsi.deltaCodesSize), sizeof(int)); + compressedPsi.deltaCodes = (unsigned int *)malloc(compressedPsi.deltaCodesSize*sizeof(int)); + read(file, compressedPsi.deltaCodes, compressedPsi.deltaCodesSize*sizeof(int)); + read(file, &(compressedPsi.numberOfSamples), sizeof(int)); + compressedPsi.samples = (unsigned int *)malloc(compressedPsi.numberOfSamples*sizeof(int)); + compressedPsi.pointers = (unsigned int *)malloc(compressedPsi.numberOfSamples*sizeof(int)); + read(file, compressedPsi.samples, compressedPsi.numberOfSamples*sizeof(int)); + read(file, compressedPsi.pointers, compressedPsi.numberOfSamples*sizeof(int)); + read(file, &(compressedPsi.totalMem), sizeof(int)); + + close(file); + + return compressedPsi; + +} diff --git a/swcsa/intIndex/psiDeltaCode.h b/swcsa/intIndex/psiDeltaCode.h new file mode 100644 index 0000000..1f955c8 --- /dev/null +++ b/swcsa/intIndex/psiDeltaCode.h @@ -0,0 +1,248 @@ +#include +#include +#include +#include +#include + + + + +typedef struct { + unsigned int T; + unsigned int negativeGap; + unsigned int deltaCodesSize; // En palabras + unsigned int *deltaCodes; + unsigned int numberOfSamples; + unsigned int *samples; + unsigned int *pointers; + unsigned int totalMem; // the size in bytes used; +} DeltaCompressedPsi; + + +// PROTOTIPOS DE FUNCI�NS +DeltaCompressedPsi deltaCompressPsi(unsigned int *Psi, unsigned int psiSize, unsigned int T); +int getDeltaPsiValue(DeltaCompressedPsi *cPsi, unsigned int position); +void storeDeltaCompressedPsi(DeltaCompressedPsi *compressedPsi, char *filename); +DeltaCompressedPsi loadDeltaCompressedPsi(char *filename); +void destroyDeltaCodesCompressedPsi(DeltaCompressedPsi *compressedPsi); + +// IMPLEMENTACI�N DAS FUNCI�NS + +//void destroyDeltaCodesCompressedPsi(DeltaCompressedPsi *compressedPsi) { +// free(compressedPsi->deltaCodes); +// free(compressedPsi->samples); +// free(compressedPsi->pointers); +//} +// +// +//DeltaCompressedPsi deltaCompressPsi(unsigned int *Psi, unsigned int psiSize, unsigned int T) { +// +// DeltaCompressedPsi cPsi; +// +// int numberOfSamples; +// register int diff, deltaCodesPos; +// register unsigned int k, p, aux, diffpositive, code, index; +// unsigned int samplesIndex, codeLenght, currentInput, wordsDeltaCodes, totalSize; +// unsigned int *deltaCodes; +// unsigned int *samples; +// unsigned int *pointers; +// +// // Auxiliar para deltaCodes (estimamos como espacio maximo o do array de sufixos) +// unsigned int *deltaCodesAux; +// +// // Calculamos o mellor valor para negativeGap <= 64 +// unsigned int negativeGap; +// register unsigned int maxNegativeBits = 0; +// k = psiSize; +// while(k) { +// k >>= 1; +// maxNegativeBits++; +// } +// if(maxNegativeBits<=26) negativeGap = 64; +// else negativeGap = 1<<(32-maxNegativeBits); +// +// // Reservamos espacio para as estructuras +// numberOfSamples = (psiSize + T - 1) / T; +// samples = (unsigned int *)malloc(sizeof(int)*numberOfSamples); +// pointers = (unsigned int *)malloc(sizeof(int)*numberOfSamples); +// +// deltaCodesAux = (unsigned int *)malloc(sizeof(int)*psiSize); +// for(index=0; index0) diffpositive = (negativeGap*diff-1)/(negativeGap-1); +// else diffpositive = -negativeGap*diff; +// +// k = 0; +// aux = diffpositive; +// while(aux) { +// aux >>= 1; +// k++; +// } +// aux = k; +// p = 0; +// while(aux) { +// aux >>= 1; +// p++; +// } +// +// code = diffpositive & ((1<<(k-1))-1); +// codeLenght = 2*p+k-2; +// +// // Primeiro metemos os p-1 0's iniciais +// deltaCodesPos += p-1; +// +// // Agora metemos os p bits de k +// if( ((deltaCodesPos%32) + p) > 32 ) { +// deltaCodesAux[deltaCodesPos/32] |= (k>>((deltaCodesPos%32)+p-32)); +// deltaCodesAux[deltaCodesPos/32+1] = (k<<(64-(deltaCodesPos%32)-p)); +// } else { +// deltaCodesAux[deltaCodesPos/32] |= (k<<(32-p-(deltaCodesPos%32))); +// } +// deltaCodesPos += p; +// +// // Por �ltimo metemos os k-1 bits de code (sen o 1 inicial) +// if( ((deltaCodesPos%32) + (k-1)) > 32 ) { +// deltaCodesAux[deltaCodesPos/32] |= (code>>((deltaCodesPos%32)+(k-1)-32)); +// deltaCodesAux[deltaCodesPos/32+1] = (code<<(64-(deltaCodesPos%32)-(k-1))); +// } else { +// deltaCodesAux[deltaCodesPos/32] |= (code<<(32-(k-1)-(deltaCodesPos%32))); +// } +// deltaCodesPos += k-1; +// +// } else { +// samples[samplesIndex] = Psi[index]; +// pointers[samplesIndex++] = deltaCodesPos; +// currentInput = Psi[index]; +// } +// +// } +// +// // Ahora que xa sabemos o espacio necesario para os deltaCodes, reservamolo e liberamos a estructura auxiliar +// wordsDeltaCodes = (deltaCodesPos+31)/32; +// deltaCodes = (unsigned int *)malloc(sizeof(int)*wordsDeltaCodes); +// for(index=0;indexsamples[position/cPsi->T]; +// pointer = cPsi->pointers[position/cPsi->T]; +// +// // Calculamos o numero de codigos a decodificar a partir da mostra +// toDecode = position % cPsi->T; +// +// while(toDecode--) { +// +// // Collemos o n�mero ceros iniciais +// // Po�emos o inicio do c�digo nun enteiro (code) alineado a esquerda +// // Non importa que non colla todo o c�digo, pero si temos asegurado que +// // colle p e k (k<=32 (6bits), p<=5bits) +// code = (cPsi->deltaCodes[pointer/32] << (pointer%32)) | +// ((pointer%32 != 0) * (cPsi->deltaCodes[pointer/32+1] >> (32-(pointer%32)))); +// +// //Ahora contamos o n�mero de ceros (p) que hai nas posicions da esquerda de code +// p = 1; +// while(!(code & 0x80000000)) { +// code <<= 1; +// p++; +// } +// +// // Ahora calculamos o numero de digitos da representacion binaria do codigo (k) +// k = code >> (32-p); +// +// // Actualizamos o punteiro global sobre deltaCodes +// pointer += 2*p-1; +// +// // Po�emos a representacion binaria do codigo nun enteiro (code) alineado a esquerda +// code = (cPsi->deltaCodes[pointer/32] << (pointer%32)) | +// ((pointer%32 != 0) * (cPsi->deltaCodes[pointer/32+1] >> (32-(pointer%32)))); +// code = ((code >> 1) | 0x80000000) >> (32-k); +// pointer += k-1; +// +// // Bixecci�n +// if(code % cPsi->negativeGap) result += (code - (code/cPsi->negativeGap)); +// else result -= code/cPsi->negativeGap; +// +// } +// +// return result; +// +//} +// +// +//void storeDeltaCompressedPsi(DeltaCompressedPsi *compressedPsi, char *filename) { +// +// int file; +// +// if( (file = open(filename, O_WRONLY|O_CREAT, 0700)) < 0) { +// printf("Cannot open file %s\n", filename); +// exit(0); +// } +// write(file, &(compressedPsi->T), sizeof(int)); +// write(file, &(compressedPsi->negativeGap), sizeof(int)); +// write(file, &(compressedPsi->deltaCodesSize), sizeof(int)); +// write(file, compressedPsi->deltaCodes, compressedPsi->deltaCodesSize*sizeof(int)); +// write(file, &(compressedPsi->numberOfSamples), sizeof(int)); +// write(file, compressedPsi->samples, compressedPsi->numberOfSamples*sizeof(int)); +// write(file, compressedPsi->pointers, compressedPsi->numberOfSamples*sizeof(int)); +// close(file); +// +//} +// +// +//DeltaCompressedPsi loadDeltaCompressedPsi(char *filename) { +// +// DeltaCompressedPsi compressedPsi; +// +// int file; +// +// if( (file = open(filename, O_RDONLY)) < 0) { +// printf("Cannot read file %s\n", filename); +// exit(0); +// } +// read(file, &(compressedPsi.T), sizeof(int)); +// read(file, &(compressedPsi.negativeGap), sizeof(int)); +// read(file, &(compressedPsi.deltaCodesSize), sizeof(int)); +// compressedPsi.deltaCodes = (unsigned int *)malloc(compressedPsi.deltaCodesSize*sizeof(int)); +// read(file, compressedPsi.deltaCodes, compressedPsi.deltaCodesSize*sizeof(int)); +// read(file, &(compressedPsi.numberOfSamples), sizeof(int)); +// compressedPsi.samples = (unsigned int *)malloc(compressedPsi.numberOfSamples*sizeof(int)); +// compressedPsi.pointers = (unsigned int *)malloc(compressedPsi.numberOfSamples*sizeof(int)); +// read(file, compressedPsi.samples, compressedPsi.numberOfSamples*sizeof(int)); +// read(file, compressedPsi.pointers, compressedPsi.numberOfSamples*sizeof(int)); +// close(file); +// +// return compressedPsi; +// +//} diff --git a/swcsa/intIndex/psiGonzalo.c b/swcsa/intIndex/psiGonzalo.c new file mode 100644 index 0000000..42449a4 --- /dev/null +++ b/swcsa/intIndex/psiGonzalo.c @@ -0,0 +1,296 @@ + +#include "psiGonzalo.h" + + +// IMPLEMENTACI�N DAS FUNCI�NS + +void destroyGonzaloCompressedPsi(GonzaloCompressedPsi *compressedPsi) { + + //free(compressedPsi->Hlen.s.spos); + //free(compressedPsi->Hacc.s.spos); + freeHuff(compressedPsi->Hlen); + freeHuff(compressedPsi->Hacc); + free(compressedPsi->cPsi); + free(compressedPsi->bposS); +} + + +GonzaloCompressedPsi gonzaloCompressPsi(uint *Psi, uint psiSize, uint T, uint HUFF) { + + GonzaloCompressedPsi compressedPsi; + + register uint i; + uint oi,j; + int ok,k; + register uint _cptr; + + uint *_cPsi; + uint *_bposS; + + uint links = psiSize; + uint samplen = T; + uint _bplen; + uint pslen; + uint totexc; + + uint *acc,*lacc; + THuff Hacc, Hlen; + + uint totalSize; + + // Construe os arboles de huffman, o dos valores directos + // e o das lonxitudes dos runs. Usa como vectores auxiliares de frecuencias + // a acc e lacc, que finalmente libera. + acc = (uint *)malloc (HUFF*sizeof(uint)); + lacc = (uint *)malloc ((samplen-1)*sizeof(uint)); + for (k=0;k= HUFF)) acc[0]++; + else acc[k]++; + } + ok = (i % samplen) ? k : 0; + k = Psi[i+1]-Psi[i]; + } + + if (ok == 1) { + acc[1]++; + lacc[i-oi-1]++; + } + + Hacc = createHuff (acc,HUFF-1, UNSORTED); + Hlen = createHuff (lacc,samplen-2, UNSORTED); + totexc = acc[0]; + pslen = bits(psiSize+1); + _bplen = bits(Hacc.total+Hlen.total+(1+links/samplen+totexc)*pslen); + _bposS = (uint *)malloc ((((1+links/samplen)*_bplen+W-1)/W)*sizeof(uint)); + _cPsi = (uint *)malloc (((Hacc.total+Hlen.total+(1+links/samplen+totexc)*pslen+W-1)/W)*sizeof(uint)); + + _cptr = 0; + ok = 0; + k = Psi[0]; + + for (i=0;i<=links;i++) { + + if ((k == 1) && (i % samplen)) { if (ok != 1) oi = i; } + else { + if (ok == 1) { + _cptr = encodeHuff (Hacc,1,_cPsi,_cptr); + _cptr = encodeHuff(Hlen,i-oi-1,_cPsi,_cptr); + } + if (i % samplen) { + if ((k > 1) && (k < HUFF)) _cptr = encodeHuff (Hacc,k,_cPsi,_cptr); + else { + _cptr = encodeHuff (Hacc,0,_cPsi,_cptr); + bitwrite (_cPsi,_cptr,pslen,Psi[i]); + _cptr += pslen; + } + } + else { + bitwrite (_bposS,(i/samplen)*_bplen,_bplen,_cptr); + bitwrite (_cPsi,_cptr,pslen,Psi[i]); + _cptr += pslen; + } + } + ok = (i % samplen) ? k : 0; + k = Psi[i+1]-Psi[i]; + } + + if (ok == 1) { + _cptr = encodeHuff (Hacc,1,_cPsi,_cptr); + _cptr = encodeHuff(Hlen,i-oi-1,_cPsi,_cptr); + } + + // Calculamos o espacio total + totalSize = (((1+links/samplen)*_bplen+W-1)/W)*sizeof(uint) + + ((Hacc.total+Hlen.total+(1+links/samplen+totexc)*pslen+W-1)/W)*sizeof(uint) + + 5*sizeof(int) + sizeHuff(Hacc) + sizeHuff(Hlen); + printf("\n\tCompressed Psi size = %d bytes\n", totalSize); + + // Necesario antes de decodificar + prepareToDecode(&Hacc); + prepareToDecode(&Hlen); + + // Asignamos os valores e devolvemos psi comprimido + compressedPsi.links = psiSize; + compressedPsi.totexc = totexc; + compressedPsi.cPsi = _cPsi; + compressedPsi.samplen = samplen; + compressedPsi.bposS = _bposS; + compressedPsi.bplen = _bplen; + compressedPsi.pslen = pslen; + compressedPsi.Hacc = Hacc; + compressedPsi.Hlen = Hlen; + compressedPsi.totalMem = totalSize; + + free(acc); + free(lacc); + + return compressedPsi; +} + + + +int getGonzaloPsiValue(GonzaloCompressedPsi *compressedPsi, unsigned int position) { + + uint *cPsi = compressedPsi->cPsi; + uint samplen = compressedPsi->samplen; + uint *bposS = compressedPsi->bposS; + uint bplen = compressedPsi->bplen; + uint pslen = compressedPsi->pslen; + THuff *Hacc = &compressedPsi->Hacc; + THuff *Hlen = &compressedPsi->Hlen; + + uint sampj,cptr,val,dval,rlen,head,hlen; + + sampj = (position/samplen)*samplen; + cptr = bitread(bposS,(sampj/samplen)*bplen,bplen); + head = cptr; + val = bitread(cPsi,head,pslen); + head += pslen; + + while (sampj < position) { + + head = decodeHuff(Hacc,&dval,cPsi,head); + + if (dval == 0) { + + val = bitread(cPsi,head,pslen); + head += pslen; + sampj++; + } + else + if (dval == 1) { + head = decodeHuff(Hlen,&rlen,cPsi,head); + rlen++; + if (sampj + rlen >= position) return val + (position-sampj); + val += rlen; + sampj += rlen; + } + else { + val += dval; + sampj++; + } + + } + + return val; + +} + + +void storeGonzaloCompressedPsi(GonzaloCompressedPsi *compressedPsi, char *filename) { + + int file; + THuff Hacc; + THuff Hlen; + + if( (file = open(filename, O_WRONLY|O_CREAT, 0700)) < 0) { + printf("Cannot open file %s\n", filename); + exit(0); + } + + // Copias locales dos arboles de HUFFMAN + Hacc = compressedPsi->Hacc; + Hlen = compressedPsi->Hlen; + + write(file, &(compressedPsi->links), sizeof(int)); + write(file, &(compressedPsi->totexc), sizeof(int)); + write(file, &(compressedPsi->samplen), sizeof(int)); + write(file, &(compressedPsi->bplen), sizeof(int)); + write(file, &(compressedPsi->pslen), sizeof(int)); + // Almacenar o arbol de huffman principal + write(file, &Hacc.max, sizeof(int)); + write(file, &Hacc.lim, sizeof(int)); + write(file, &Hacc.depth, sizeof(int)); +// write(file, Hacc.s.spos, (Hacc.lim+1)*sizeof(int)); + write(file, Hacc.s.symb, (Hacc.lim+1)*sizeof(int)); + write(file, Hacc.num, (Hacc.depth+1)*sizeof(int)); + write(file, Hacc.fst, (Hacc.depth+1)*sizeof(int)); + // Fin de almacenar o arbol de huffman principal + // Almacenar o arbol de huffman das lonxitudes dos runs + write(file, &Hlen.max, sizeof(int)); + write(file, &Hlen.lim, sizeof(int)); + write(file, &Hlen.depth, sizeof(int)); +// write(file, Hlen.s.spos, (Hlen.lim+1)*sizeof(int)); + write(file, Hlen.s.symb, (Hlen.lim+1)*sizeof(int)); + write(file, Hlen.num, (Hlen.depth+1)*sizeof(int)); + write(file, Hlen.fst, (Hlen.depth+1)*sizeof(int)); + // Fin de almacenar o arbol de huffman das lonxitudes dos runs + write(file, compressedPsi->bposS, ((compressedPsi->bplen*(1+compressedPsi->links/compressedPsi->samplen)+W-1)/W)*sizeof(uint)); + write(file, compressedPsi->cPsi, ((Hacc.total+Hlen.total+(1+compressedPsi->links/compressedPsi->samplen+compressedPsi->totexc)*compressedPsi->pslen+W-1)/W)*sizeof(int)); + + write(file, &(compressedPsi->totalMem), sizeof(int)); + + close(file); + +} + + +GonzaloCompressedPsi loadGonzaloCompressedPsi(char *filename) { + + GonzaloCompressedPsi compressedPsi; + + THuff Hacc; + THuff Hlen; + + int file; + + if( (file = open(filename, O_RDONLY)) < 0) { + printf("Cannot read file %s\n", filename); + exit(0); + } + read(file, &(compressedPsi.links), sizeof(int)); + read(file, &(compressedPsi.totexc), sizeof(int)); + read(file, &(compressedPsi.samplen), sizeof(int)); + read(file, &(compressedPsi.bplen), sizeof(int)); + read(file, &(compressedPsi.pslen), sizeof(int)); + // Cargamos o arbol de Huffman principal + read(file, &Hacc.max, sizeof(int)); + read(file, &Hacc.lim, sizeof(int)); + read(file, &Hacc.depth, sizeof(int)); + //Hacc.s.spos = (unsigned int *) malloc((Hacc.lim+1)*sizeof(int)); + Hacc.s.symb = (unsigned int *) malloc((Hacc.lim+1)*sizeof(int)); + Hacc.num = (unsigned int *) malloc((Hacc.depth+1)*sizeof(int)); + Hacc.fst = (unsigned int *) malloc((Hacc.depth+1)*sizeof(int)); + //read(file, Hacc.s.spos, (Hacc.lim+1)*sizeof(int)); + read(file, Hacc.s.symb, (Hacc.lim+1)*sizeof(int)); + read(file, Hacc.num, (Hacc.depth+1)*sizeof(int)); + read(file, Hacc.fst, (Hacc.depth+1)*sizeof(int)); + compressedPsi.Hacc = Hacc; + // Fin da carga do arbol de Huffman principal + // Cargamos o arbol de Huffman coas lonxitudes dos runs + read(file, &Hlen.max, sizeof(int)); + read(file, &Hlen.lim, sizeof(int)); + read(file, &Hlen.depth, sizeof(int)); + //Hlen.s.spos = (unsigned int *) malloc((Hlen.lim+1)*sizeof(int)); + Hlen.s.symb = (unsigned int *) malloc((Hlen.lim+1)*sizeof(int)); + Hlen.num = (unsigned int *) malloc((Hlen.depth+1)*sizeof(int)); + Hlen.fst = (unsigned int *) malloc((Hlen.depth+1)*sizeof(int)); + //read(file, Hlen.s.spos, (Hlen.lim+1)*sizeof(int)); + read(file, Hlen.s.symb, (Hlen.lim+1)*sizeof(int)); + read(file, Hlen.num, (Hlen.depth+1)*sizeof(int)); + read(file, Hlen.fst, (Hlen.depth+1)*sizeof(int)); + compressedPsi.Hlen = Hlen; + // Fin da carga do arbol de Huffman coas lonxitudes dos runs + compressedPsi.bposS = (uint *) malloc (((compressedPsi.bplen*(1+compressedPsi.links/compressedPsi.samplen)+W-1)/W)*sizeof(uint)); + read(file, compressedPsi.bposS, ((compressedPsi.bplen*(1+compressedPsi.links/compressedPsi.samplen)+W-1)/W)*sizeof(uint)); + compressedPsi.cPsi = (uint *) malloc (((Hacc.total+Hlen.total+(1+compressedPsi.links/compressedPsi.samplen+compressedPsi.totexc)*compressedPsi.pslen+W-1)/W)*sizeof(uint)); + read(file, compressedPsi.cPsi, ((Hacc.total+Hlen.total+(1+compressedPsi.links/compressedPsi.samplen+compressedPsi.totexc)*compressedPsi.pslen+W-1)/W)*sizeof(uint)); + + read(file, &(compressedPsi.totalMem), sizeof(int)); + close(file); + + return compressedPsi; + +} diff --git a/swcsa/intIndex/psiGonzalo.h b/swcsa/intIndex/psiGonzalo.h new file mode 100644 index 0000000..95693bd --- /dev/null +++ b/swcsa/intIndex/psiGonzalo.h @@ -0,0 +1,317 @@ +#include +#include +#include +#include "../utils/huff.h" + +// ESTRUCTURA QUE REPRESENTA A FUNCION PSI COMPRIMIDA +typedef struct { + uint links; + uint totexc; + uint samplen; + uint bplen; + uint pslen; + uint *cPsi; + uint *bposS; + THuff Hacc; + THuff Hlen; + unsigned int totalMem; // the size in bytes used; +} GonzaloCompressedPsi; + + +// PROTOTIPOS DE FUNCIONS +GonzaloCompressedPsi gonzaloCompressPsi(uint *Psi, uint psiSize, uint T, uint HUFF); +int getGonzaloPsiValue(GonzaloCompressedPsi *compressedPsi, unsigned int position); +void storeGonzaloCompressedPsi(GonzaloCompressedPsi *compressedPsi, char *filename); +GonzaloCompressedPsi loadGonzaloCompressedPsi(char *filename); +//frees the memory used. +void destroyGonzaloCompressedPsi(GonzaloCompressedPsi *compressedPsi); + + +//// IMPLEMENTACI�N DAS FUNCI�NS +// +//void destroyGonzaloCompressedPsi(GonzaloCompressedPsi *compressedPsi) { +// +// //free(compressedPsi->Hlen.s.spos); +// //free(compressedPsi->Hacc.s.spos); +// freeHuff(compressedPsi->Hlen); +// freeHuff(compressedPsi->Hacc); +// free(compressedPsi->cPsi); +// free(compressedPsi->bposS); +//} +// +// +//GonzaloCompressedPsi gonzaloCompressPsi(uint *Psi, uint psiSize, uint T, uint HUFF) { +// +// GonzaloCompressedPsi compressedPsi; +// +// register uint i; +// uint oi,j; +// int ok,k; +// register uint _cptr; +// +// uint *_cPsi; +// uint *_bposS; +// +// uint links = psiSize; +// uint samplen = T; +// uint _bplen; +// uint pslen; +// uint totexc; +// +// uint *acc,*lacc; +// THuff Hacc, Hlen; +// +// uint totalSize; +// +// // Construe os arboles de huffman, o dos valores directos +// // e o das lonxitudes dos runs. Usa como vectores auxiliares de frecuencias +// // a acc e lacc, que finalmente libera. +// acc = (uint *)malloc (HUFF*sizeof(uint)); +// lacc = (uint *)malloc ((samplen-1)*sizeof(uint)); +// for (k=0;k= HUFF)) acc[0]++; +// else acc[k]++; +// } +// ok = (i % samplen) ? k : 0; +// k = Psi[i+1]-Psi[i]; +// } +// +// if (ok == 1) { +// acc[1]++; +// lacc[i-oi-1]++; +// } +// +// Hacc = createHuff (acc,HUFF-1, UNSORTED); +// Hlen = createHuff (lacc,samplen-2, UNSORTED); +// totexc = acc[0]; +// pslen = bits(psiSize+1); +// _bplen = bits(Hacc.total+Hlen.total+(1+links/samplen+totexc)*pslen); +// _bposS = (uint *)malloc ((((1+links/samplen)*_bplen+W-1)/W)*sizeof(uint)); +// _cPsi = (uint *)malloc (((Hacc.total+Hlen.total+(1+links/samplen+totexc)*pslen+W-1)/W)*sizeof(uint)); +// +// _cptr = 0; +// ok = 0; +// k = Psi[0]; +// +// for (i=0;i<=links;i++) { +// +// if ((k == 1) && (i % samplen)) { if (ok != 1) oi = i; } +// else { +// if (ok == 1) { +// _cptr = encodeHuff (Hacc,1,_cPsi,_cptr); +// _cptr = encodeHuff(Hlen,i-oi-1,_cPsi,_cptr); +// } +// if (i % samplen) { +// if ((k > 1) && (k < HUFF)) _cptr = encodeHuff (Hacc,k,_cPsi,_cptr); +// else { +// _cptr = encodeHuff (Hacc,0,_cPsi,_cptr); +// bitwrite (_cPsi,_cptr,pslen,Psi[i]); +// _cptr += pslen; +// } +// } +// else { +// bitwrite (_bposS,(i/samplen)*_bplen,_bplen,_cptr); +// bitwrite (_cPsi,_cptr,pslen,Psi[i]); +// _cptr += pslen; +// } +// } +// ok = (i % samplen) ? k : 0; +// k = Psi[i+1]-Psi[i]; +// } +// +// if (ok == 1) { +// _cptr = encodeHuff (Hacc,1,_cPsi,_cptr); +// _cptr = encodeHuff(Hlen,i-oi-1,_cPsi,_cptr); +// } +// +// // Calculamos o espacio total +// totalSize = (((1+links/samplen)*_bplen+W-1)/W)*sizeof(uint) + +// ((Hacc.total+Hlen.total+(1+links/samplen+totexc)*pslen+W-1)/W)*sizeof(uint) + +// 5*sizeof(int) + sizeHuff(Hacc) + sizeHuff(Hlen); +// printf("Compressed Psi size = %d bytes\n", totalSize); +// +// // Necesario antes de decodificar +// prepareToDecode(&Hacc); +// prepareToDecode(&Hlen); +// +// // Asignamos os valores e devolvemos psi comprimido +// compressedPsi.links = psiSize; +// compressedPsi.totexc = totexc; +// compressedPsi.cPsi = _cPsi; +// compressedPsi.samplen = samplen; +// compressedPsi.bposS = _bposS; +// compressedPsi.bplen = _bplen; +// compressedPsi.pslen = pslen; +// compressedPsi.Hacc = Hacc; +// compressedPsi.Hlen = Hlen; +// +// free(acc); +// free(lacc); +// +// return compressedPsi; +//} +// +// +// +//int getGonzaloPsiValue(GonzaloCompressedPsi *compressedPsi, unsigned int position) { +// +// uint *cPsi = compressedPsi->cPsi; +// uint samplen = compressedPsi->samplen; +// uint *bposS = compressedPsi->bposS; +// uint bplen = compressedPsi->bplen; +// uint pslen = compressedPsi->pslen; +// THuff *Hacc = &compressedPsi->Hacc; +// THuff *Hlen = &compressedPsi->Hlen; +// +// uint sampj,cptr,val,dval,rlen,head,hlen; +// +// sampj = (position/samplen)*samplen; +// cptr = bitread(bposS,(sampj/samplen)*bplen,bplen); +// head = cptr; +// val = bitread(cPsi,head,pslen); +// head += pslen; +// +// while (sampj < position) { +// +// head = decodeHuff(Hacc,&dval,cPsi,head); +// +// if (dval == 0) { +// +// val = bitread(cPsi,head,pslen); +// head += pslen; +// sampj++; +// } +// else +// if (dval == 1) { +// head = decodeHuff(Hlen,&rlen,cPsi,head); +// rlen++; +// if (sampj + rlen >= position) return val + (position-sampj); +// val += rlen; +// sampj += rlen; +// } +// else { +// val += dval; +// sampj++; +// } +// +// } +// +// return val; +// +//} +// +// +//void storeGonzaloCompressedPsi(GonzaloCompressedPsi *compressedPsi, char *filename) { +// +// int file; +// THuff Hacc; +// THuff Hlen; +// +// if( (file = open(filename, O_WRONLY|O_CREAT, 0700)) < 0) { +// printf("Cannot open file %s\n", filename); +// exit(0); +// } +// +// // Copias locales dos arboles de HUFFMAN +// Hacc = compressedPsi->Hacc; +// Hlen = compressedPsi->Hlen; +// +// write(file, &(compressedPsi->links), sizeof(int)); +// write(file, &(compressedPsi->totexc), sizeof(int)); +// write(file, &(compressedPsi->samplen), sizeof(int)); +// write(file, &(compressedPsi->bplen), sizeof(int)); +// write(file, &(compressedPsi->pslen), sizeof(int)); +// // Almacenar o arbol de huffman principal +// write(file, &Hacc.max, sizeof(int)); +// write(file, &Hacc.lim, sizeof(int)); +// write(file, &Hacc.depth, sizeof(int)); +//// write(file, Hacc.s.spos, (Hacc.lim+1)*sizeof(int)); +// write(file, Hacc.s.symb, (Hacc.lim+1)*sizeof(int)); +// write(file, Hacc.num, (Hacc.depth+1)*sizeof(int)); +// write(file, Hacc.fst, (Hacc.depth+1)*sizeof(int)); +// // Fin de almacenar o arbol de huffman principal +// // Almacenar o arbol de huffman das lonxitudes dos runs +// write(file, &Hlen.max, sizeof(int)); +// write(file, &Hlen.lim, sizeof(int)); +// write(file, &Hlen.depth, sizeof(int)); +//// write(file, Hlen.s.spos, (Hlen.lim+1)*sizeof(int)); +// write(file, Hlen.s.symb, (Hlen.lim+1)*sizeof(int)); +// write(file, Hlen.num, (Hlen.depth+1)*sizeof(int)); +// write(file, Hlen.fst, (Hlen.depth+1)*sizeof(int)); +// // Fin de almacenar o arbol de huffman das lonxitudes dos runs +// write(file, compressedPsi->bposS, ((compressedPsi->bplen*(1+compressedPsi->links/compressedPsi->samplen)+W-1)/W)*sizeof(uint)); +// write(file, compressedPsi->cPsi, ((Hacc.total+Hlen.total+(1+compressedPsi->links/compressedPsi->samplen+compressedPsi->totexc)*compressedPsi->pslen+W-1)/W)*sizeof(int)); +// +// close(file); +// +//} +// +// +//GonzaloCompressedPsi loadGonzaloCompressedPsi(char *filename) { +// +// GonzaloCompressedPsi compressedPsi; +// +// THuff Hacc; +// THuff Hlen; +// +// int file; +// +// if( (file = open(filename, O_RDONLY)) < 0) { +// printf("Cannot read file %s\n", filename); +// exit(0); +// } +// read(file, &(compressedPsi.links), sizeof(int)); +// read(file, &(compressedPsi.totexc), sizeof(int)); +// read(file, &(compressedPsi.samplen), sizeof(int)); +// read(file, &(compressedPsi.bplen), sizeof(int)); +// read(file, &(compressedPsi.pslen), sizeof(int)); +// // Cargamos o arbol de Huffman principal +// read(file, &Hacc.max, sizeof(int)); +// read(file, &Hacc.lim, sizeof(int)); +// read(file, &Hacc.depth, sizeof(int)); +// //Hacc.s.spos = (unsigned int *) malloc((Hacc.lim+1)*sizeof(int)); +// Hacc.s.symb = (unsigned int *) malloc((Hacc.lim+1)*sizeof(int)); +// Hacc.num = (unsigned int *) malloc((Hacc.depth+1)*sizeof(int)); +// Hacc.fst = (unsigned int *) malloc((Hacc.depth+1)*sizeof(int)); +// //read(file, Hacc.s.spos, (Hacc.lim+1)*sizeof(int)); +// read(file, Hacc.s.symb, (Hacc.lim+1)*sizeof(int)); +// read(file, Hacc.num, (Hacc.depth+1)*sizeof(int)); +// read(file, Hacc.fst, (Hacc.depth+1)*sizeof(int)); +// compressedPsi.Hacc = Hacc; +// // Fin da carga do arbol de Huffman principal +// // Cargamos o arbol de Huffman coas lonxitudes dos runs +// read(file, &Hlen.max, sizeof(int)); +// read(file, &Hlen.lim, sizeof(int)); +// read(file, &Hlen.depth, sizeof(int)); +// //Hlen.s.spos = (unsigned int *) malloc((Hlen.lim+1)*sizeof(int)); +// Hlen.s.symb = (unsigned int *) malloc((Hlen.lim+1)*sizeof(int)); +// Hlen.num = (unsigned int *) malloc((Hlen.depth+1)*sizeof(int)); +// Hlen.fst = (unsigned int *) malloc((Hlen.depth+1)*sizeof(int)); +// //read(file, Hlen.s.spos, (Hlen.lim+1)*sizeof(int)); +// read(file, Hlen.s.symb, (Hlen.lim+1)*sizeof(int)); +// read(file, Hlen.num, (Hlen.depth+1)*sizeof(int)); +// read(file, Hlen.fst, (Hlen.depth+1)*sizeof(int)); +// compressedPsi.Hlen = Hlen; +// // Fin da carga do arbol de Huffman coas lonxitudes dos runs +// compressedPsi.bposS = (uint *) malloc (((compressedPsi.bplen*(1+compressedPsi.links/compressedPsi.samplen)+W-1)/W)*sizeof(uint)); +// read(file, compressedPsi.bposS, ((compressedPsi.bplen*(1+compressedPsi.links/compressedPsi.samplen)+W-1)/W)*sizeof(uint)); +// compressedPsi.cPsi = (uint *) malloc (((Hacc.total+Hlen.total+(1+compressedPsi.links/compressedPsi.samplen+compressedPsi.totexc)*compressedPsi.pslen+W-1)/W)*sizeof(uint)); +// read(file, compressedPsi.cPsi, ((Hacc.total+Hlen.total+(1+compressedPsi.links/compressedPsi.samplen+compressedPsi.totexc)*compressedPsi.pslen+W-1)/W)*sizeof(uint)); +// +// close(file); +// +// return compressedPsi; +// +//} diff --git a/swcsa/intIndex/psiHuffmanRLE.c b/swcsa/intIndex/psiHuffmanRLE.c new file mode 100644 index 0000000..5e36869 --- /dev/null +++ b/swcsa/intIndex/psiHuffmanRLE.c @@ -0,0 +1,335 @@ +#include "psiHuffmanRLE.h" + +// IMPLEMENTACION DAS FUNCIONS + +void destroyHuffmanCompressedPsi(HuffmanCompressedPsi *compressedPsi) { + freeHuff(compressedPsi->diffsHT); + free(compressedPsi->samples); + free(compressedPsi->samplePointers); + free (compressedPsi->stream); +} + +HuffmanCompressedPsi huffmanCompressPsi(unsigned int *Psi, unsigned int psiSize, unsigned int T, unsigned int nS) { + + HuffmanCompressedPsi cPsi; + + int absolute_value; + register unsigned int index, ptr, samplesPtr, samplePointersPtr; + unsigned int runLenght, binaryLenght; + + int *diffs; + unsigned int *huffmanDst; + + // Estructuras da funcion comprimida (para logo asignar) + // Tam�n se podian almacenar directamente + THuff diffsHT; + unsigned int numberOfSamples; + unsigned int *samples; + unsigned int sampleSize; + unsigned int *samplePointers; + unsigned int pointerSize; + unsigned int *stream; + unsigned int streamSize; + + // Variables que marcan os intervalos dentro do vector de frecuencias + unsigned int runLenghtStart = nS - 64 - T; // Inicio das Runs + unsigned int negStart = nS - 64; // Inicio dos Negativos + unsigned int bigStart = nS - 32; // Inicio dos Grandes (>runLenghtStart) + + // Para estadistica + unsigned int totalSize; + + // Reservamos espacio para a distribuci�n de valores de Psi + huffmanDst = (unsigned int *)malloc(sizeof(int)*nS); + for(index=0;index1 && diffs[index]= 128 + absolute_value = diffs[index]; + binaryLenght = bits(absolute_value); + huffmanDst[binaryLenght+bigStart-1]++; + } + } + + } else { // Rompemos o run porque atopamos unha mostra + if(runLenght) { + huffmanDst[runLenght+runLenghtStart]++; + runLenght = 0; + } + } + + } + + if(runLenght) huffmanDst[runLenght+runLenghtStart]++; + + // Creamos o arbol de Huffman + diffsHT = createHuff(huffmanDst,nS-1,UNSORTED); + + // Calculamos o espacio total ocupado pola secuencia Huffman + RLE + streamSize = diffsHT.total; + for(index=negStart;index1 && diffs[index]= 128 + absolute_value = diffs[index]; + binaryLenght = bits(absolute_value); + ptr = encodeHuff(diffsHT,binaryLenght+bigStart-1,stream,ptr); + bitwrite(stream,ptr,binaryLenght,absolute_value); + ptr += binaryLenght; + } + } + + } else { // Rompemos o run porque atopamos unha mostra + if(runLenght) { + ptr = encodeHuff(diffsHT,runLenght+runLenghtStart,stream,ptr); + runLenght = 0; + } + bitwrite(samples,samplesPtr,sampleSize,Psi[index]); + samplesPtr += sampleSize; + bitwrite(samplePointers,samplePointersPtr,pointerSize,ptr); + samplePointersPtr += pointerSize; + } + + } + + if(runLenght) { + ptr = encodeHuff(diffsHT,runLenght+runLenghtStart,stream,ptr); + } + + // Amosamos o espacio ocupado + totalSize = sizeof(HuffmanCompressedPsi) + + sizeof(int)*((numberOfSamples*sampleSize+31)/32) + + sizeof(int)*((numberOfSamples*pointerSize+31)/32) + + sizeof(int)*((streamSize+31)/32) + sizeHuff(diffsHT); + + printf("\n\t Compressed Psi size = %d bytes, with %d different symbols.", totalSize, nS); + + // Necesario antes de decodificar + prepareToDecode(&diffsHT); + + // Asignamos os valores a cPsi e devolvemolo + cPsi.T = T; + cPsi.diffsHT = diffsHT; + cPsi.nS = nS; + cPsi.numberOfSamples = numberOfSamples; + cPsi.samples = samples; + cPsi.sampleSize = sampleSize; + cPsi.samplePointers = samplePointers; + cPsi.pointerSize = pointerSize; + cPsi.stream = stream; + cPsi.streamSize = streamSize; + cPsi.totalMem = totalSize; + + //frees resources not needed in advance + free(diffs); + free(huffmanDst); + + //returns the data structure that holds the compressed psi. + return cPsi; +} + + +unsigned int getHuffmanPsiValue(HuffmanCompressedPsi *cPsi, unsigned int position) { + + register unsigned int index; + unsigned int sampleIndex, ptr, psiValue, huffmanCode, positionsSinceSample; + unsigned int absolute_value, binaryLenght, runLenght; + + unsigned int runLenghtStart = cPsi->nS - 64 - cPsi->T; + unsigned int negStart = cPsi->nS - 64; + unsigned int bigStart = cPsi->nS - 32; + + sampleIndex = position / cPsi->T; + psiValue = bitread(cPsi->samples,sampleIndex*cPsi->sampleSize,cPsi->sampleSize); + ptr = bitread(cPsi->samplePointers,sampleIndex*cPsi->pointerSize,cPsi->pointerSize); + + positionsSinceSample = position%cPsi->T; + + for(index=0;indexdiffsHT,&huffmanCode,cPsi->stream,ptr); + + if(huffmanCode < runLenghtStart) { // Incremento directo + psiValue += huffmanCode; + } + else + if(huffmanCode < negStart) { // Estamos nun run + runLenght = huffmanCode - runLenghtStart; + if(index+runLenght>=positionsSinceSample) + return psiValue+positionsSinceSample-index; + else { + psiValue += runLenght; + index += runLenght-1; + } + } + else + if(huffmanCode < bigStart) { // Negativo + binaryLenght = huffmanCode-negStart+1; + absolute_value = bitread(cPsi->stream,ptr,binaryLenght); + ptr += binaryLenght; + psiValue -= absolute_value; + } + else { // Grande + binaryLenght = huffmanCode-bigStart+1; + absolute_value = bitread(cPsi->stream,ptr,binaryLenght); + ptr += binaryLenght; + psiValue += absolute_value; + } + + } + + return psiValue; + +} + + +void storeHuffmanCompressedPsi(HuffmanCompressedPsi *compressedPsi, char *filename) { + + int file; + THuff H; + + if( (file = open(filename, O_WRONLY|O_CREAT, 0700)) < 0) { + printf("Cannot open file %s\n", filename); + exit(0); + } + write(file, &(compressedPsi->T), sizeof(int)); + // Almacenar o arbol de huffman + H = compressedPsi->diffsHT; + write(file, &H.max, sizeof(int)); + write(file, &H.lim, sizeof(int)); + write(file, &H.depth, sizeof(int)); +// write(file, H.s.spos, (H.lim+1)*sizeof(int)); + write(file, H.s.symb, (H.lim+1)*sizeof(int)); + write(file, H.num, (H.depth+1)*sizeof(int)); + write(file, H.fst, (H.depth+1)*sizeof(int)); + // Fin de almacenar o arbol de huffman + write(file, &(compressedPsi->nS), sizeof(int)); + write(file, &(compressedPsi->numberOfSamples), sizeof(int)); + write(file, &(compressedPsi->sampleSize), sizeof(int)); + write(file, compressedPsi->samples, ((compressedPsi->numberOfSamples*compressedPsi->sampleSize+31)/32)*sizeof(int)); + write(file, &(compressedPsi->pointerSize), sizeof(int)); + write(file, compressedPsi->samplePointers, ((compressedPsi->numberOfSamples*compressedPsi->pointerSize+31)/32)*sizeof(int)); + write(file, &(compressedPsi->streamSize), sizeof(int)); + write(file, compressedPsi->stream, ((compressedPsi->streamSize+31)/32)*sizeof(int)); + write(file, &(compressedPsi->totalMem), sizeof(int)); + + close(file); + +} + + +HuffmanCompressedPsi loadHuffmanCompressedPsi(char *filename) { + + HuffmanCompressedPsi compressedPsi; + + THuff H; + + int file; + + if( (file = open(filename, O_RDONLY)) < 0) { + printf("Cannot read file %s\n", filename); + exit(0); + } + read(file, &(compressedPsi.T), sizeof(int)); + // Cargamos o arbol de Huffman + read(file, &H.max, sizeof(int)); + read(file, &H.lim, sizeof(int)); + read(file, &H.depth, sizeof(int)); + //H.s.spos = (unsigned int *) malloc((H.lim+1)*sizeof(int)); + //H.s.spos =H.s.symb = (unsigned int *) malloc((H.lim+1)*sizeof(int)); + H.s.symb = (unsigned int *) malloc((H.lim+1)*sizeof(int)); + H.num = (unsigned int *) malloc((H.depth+1)*sizeof(int)); + H.fst = (unsigned int *) malloc((H.depth+1)*sizeof(int)); + + //read(file, H.s.spos, (H.lim+1)*sizeof(int)); + //fprintf(stderr," \n read %d spos bytes\n", (H.lim+1)*sizeof(int)); + read(file, H.s.symb, (H.lim+1)*sizeof(int)); + + read(file, H.num, (H.depth+1)*sizeof(int)); + read(file, H.fst, (H.depth+1)*sizeof(int)); + compressedPsi.diffsHT = H; + // Fin da carga do arbol de Huffman + read(file, &(compressedPsi.nS), sizeof(int)); + read(file, &(compressedPsi.numberOfSamples), sizeof(int)); + read(file, &(compressedPsi.sampleSize), sizeof(int)); + compressedPsi.samples = (unsigned int *)malloc(((compressedPsi.numberOfSamples*compressedPsi.sampleSize+31)/32)*sizeof(int)); + read(file, compressedPsi.samples, ((compressedPsi.numberOfSamples*compressedPsi.sampleSize+31)/32)*sizeof(int)); + read(file, &(compressedPsi.pointerSize), sizeof(int)); + compressedPsi.samplePointers = (unsigned int *)malloc(((compressedPsi.numberOfSamples*compressedPsi.pointerSize+31)/32)*sizeof(int)); + read(file, compressedPsi.samplePointers, ((compressedPsi.numberOfSamples*compressedPsi.pointerSize+31)/32)*sizeof(int)); + read(file, &(compressedPsi.streamSize), sizeof(int)); + compressedPsi.stream = (unsigned int *)malloc(((compressedPsi.streamSize+31)/32)*sizeof(int)); + read(file, compressedPsi.stream, ((compressedPsi.streamSize+31)/32)*sizeof(int)); + read(file, &(compressedPsi.totalMem), sizeof(int)); + + close(file); + + return compressedPsi; + +} diff --git a/swcsa/intIndex/psiHuffmanRLE.h b/swcsa/intIndex/psiHuffmanRLE.h new file mode 100644 index 0000000..8757c73 --- /dev/null +++ b/swcsa/intIndex/psiHuffmanRLE.h @@ -0,0 +1,389 @@ +#include +#include +#include +#include "../utils/huff.h" + +/* +Compresion de PSI utilizando codificación incremental e RLE para os runs. + +Utilizamos códigos Huffman, entre os que distinguimos 4 grupos. + +G1: Incrementos frecuentes: Entre 2 e Total - 32 negativos - 32 grandes - (máxima lonxitude dun Run == periodo de muestreo - 1)) +G2: Código que representa que hai un run e a súa lonxitude (periodo de muestreo códigos) +G3: Numeros negativos (32 caracteres de escape, que representan a lonxitude da representación binaria do valor absoluto do negativo) +G4: Numeros grandes, maiores que os de G1 (32 caracteres de escape representando novamente a lonxitude da representación binaria do seu valor) + +Os de G1 obtéñense directamente, tras decodificar Huffman. +Os de G2 transfórmanse nunha run da lonxitude obtida tras decodificar con Huffman. +Os de G3 van seguidos da representación binaria do valor absoluto do seu número. +Os de G4 van seguidos da súa representación binaria. +*/ + +// ESTRUCTURA DE PSI COMPRIMIDA +typedef struct { + unsigned int T; // Periodo de muestreo de PSI + THuff diffsHT; // Arbol de Huffman (codifica stream) + unsigned int nS; // Numero de simbolos para Huffman + unsigned int numberOfSamples; + unsigned int sampleSize; // Bits que ocupa cada mostra + unsigned int *samples; // Vector de mostras + unsigned int pointerSize; // Bits que ocupa cada punteiro + unsigned int *samplePointers; // Punteiros das mostras a stream + unsigned int streamSize; // Bits que ocupa stream + unsigned int *stream; // Secuencia Huffman + RLE + unsigned int totalMem; // the size in bytes used; +} HuffmanCompressedPsi; + + +// PROTOTIPOS DE FUNCIÓNS + +// Crea as estructuras de Psi comprimida: +// +// Psi: Funcion Psi original +// psiSize: Numero de elementos de Psi +// T: Periodo de muestreo en Psi +// nS: Numero de simbolos que se utilizaran no arbol de Huffman +// +// Devolve unha estructura CompressedPSI que representa a Psi comprimida +HuffmanCompressedPsi huffmanCompressPsi(unsigned int *Psi, unsigned int psiSize, unsigned int T, unsigned int nS); + +// Obtén un valor de Psi +// +// cPsi: A estructura que representa a Psi comprimida +// position: A posicion da que queremos obter o valor de Psi +unsigned int getHuffmanPsiValue(HuffmanCompressedPsi *cPsi, unsigned int position); + +void storeHuffmanCompressedPsi(HuffmanCompressedPsi *compressedPsi, char *filename); +HuffmanCompressedPsi loadHuffmanCompressedPsi(char *filename); + +//frees the memory used. +void destroyHuffmanCompressedPsi(HuffmanCompressedPsi *compressedPsi); + + +// +//// IMPLEMENTACIÓN DAS FUNCIÓNS +// +//void destroyHuffmanCompressedPsi(HuffmanCompressedPsi *compressedPsi) { +// freeHuff(compressedPsi->diffsHT); +// free(compressedPsi->samples); +// free(compressedPsi->samplePointers); +// free (compressedPsi->stream); +//} +// +//HuffmanCompressedPsi huffmanCompressPsi(unsigned int *Psi, unsigned int psiSize, unsigned int T, unsigned int nS) { +// +// HuffmanCompressedPsi cPsi; +// +// int absolute_value; +// register unsigned int index, ptr, samplesPtr, samplePointersPtr; +// unsigned int runLenght, binaryLenght; +// +// int *diffs; +// unsigned int *huffmanDst; +// +// // Estructuras da funcion comprimida (para logo asignar) +// // Tamén se podian almacenar directamente +// THuff diffsHT; +// unsigned int numberOfSamples; +// unsigned int *samples; +// unsigned int sampleSize; +// unsigned int *samplePointers; +// unsigned int pointerSize; +// unsigned int *stream; +// unsigned int streamSize; +// +// // Variables que marcan os intervalos dentro do vector de frecuencias +// unsigned int runLenghtStart = nS - 64 - T; // Inicio das Runs +// unsigned int negStart = nS - 64; // Inicio dos Negativos +// unsigned int bigStart = nS - 32; // Inicio dos Grandes (>runLenghtStart) +// +// // Para estadistica +// unsigned int totalSize; +// +// // Reservamos espacio para a distribución de valores de Psi +// huffmanDst = (unsigned int *)malloc(sizeof(int)*nS); +// for(index=0;index1 && diffs[index]= 128 +// absolute_value = diffs[index]; +// binaryLenght = bits(absolute_value); +// huffmanDst[binaryLenght+bigStart-1]++; +// } +// } +// +// } else { // Rompemos o run porque atopamos unha mostra +// if(runLenght) { +// huffmanDst[runLenght+runLenghtStart]++; +// runLenght = 0; +// } +// } +// +// } +// +// if(runLenght) huffmanDst[runLenght+runLenghtStart]++; +// +// // Creamos o arbol de Huffman +// diffsHT = createHuff(huffmanDst,nS-1,UNSORTED); +// +// // Calculamos o espacio total ocupado pola secuencia Huffman + RLE +// streamSize = diffsHT.total; +// for(index=negStart;index1 && diffs[index]= 128 +// absolute_value = diffs[index]; +// binaryLenght = bits(absolute_value); +// ptr = encodeHuff(diffsHT,binaryLenght+bigStart-1,stream,ptr); +// bitwrite(stream,ptr,binaryLenght,absolute_value); +// ptr += binaryLenght; +// } +// } +// +// } else { // Rompemos o run porque atopamos unha mostra +// if(runLenght) { +// ptr = encodeHuff(diffsHT,runLenght+runLenghtStart,stream,ptr); +// runLenght = 0; +// } +// bitwrite(samples,samplesPtr,sampleSize,Psi[index]); +// samplesPtr += sampleSize; +// bitwrite(samplePointers,samplePointersPtr,pointerSize,ptr); +// samplePointersPtr += pointerSize; +// } +// +// } +// +// if(runLenght) { +// ptr = encodeHuff(diffsHT,runLenght+runLenghtStart,stream,ptr); +// } +// +// // Amosamos o espacio ocupado +// totalSize = sizeof(int)*((numberOfSamples*sampleSize+31)/32) + +// sizeof(int)*((numberOfSamples*pointerSize+31)/32) + +// sizeof(int)*((streamSize+31)/32) + sizeHuff(diffsHT) + +// 6*sizeof(int); +// printf("Compressed Psi size = %d bytes\n", totalSize); +// +// // Necesario antes de decodificar +// prepareToDecode(&diffsHT); +// +// // Asignamos os valores a cPsi e devolvemolo +// cPsi.T = T; +// cPsi.diffsHT = diffsHT; +// cPsi.nS = nS; +// cPsi.numberOfSamples = numberOfSamples; +// cPsi.samples = samples; +// cPsi.sampleSize = sampleSize; +// cPsi.samplePointers = samplePointers; +// cPsi.pointerSize = pointerSize; +// cPsi.stream = stream; +// cPsi.streamSize = streamSize; +// +// //frees resources not needed in advance +// free(diffs); +// free(huffmanDst); +// +// //returns the data structure that holds the compressed psi. +// return cPsi; +//} +// +// +//unsigned int getHuffmanPsiValue(HuffmanCompressedPsi *cPsi, unsigned int position) { +// +// register unsigned int index; +// unsigned int sampleIndex, ptr, psiValue, huffmanCode, positionsSinceSample; +// unsigned int absolute_value, binaryLenght, runLenght; +// +// unsigned int runLenghtStart = cPsi->nS - 64 - cPsi->T; +// unsigned int negStart = cPsi->nS - 64; +// unsigned int bigStart = cPsi->nS - 32; +// +// sampleIndex = position / cPsi->T; +// psiValue = bitread(cPsi->samples,sampleIndex*cPsi->sampleSize,cPsi->sampleSize); +// ptr = bitread(cPsi->samplePointers,sampleIndex*cPsi->pointerSize,cPsi->pointerSize); +// +// positionsSinceSample = position%cPsi->T; +// +// for(index=0;indexdiffsHT,&huffmanCode,cPsi->stream,ptr); +// +// if(huffmanCode < runLenghtStart) { // Incremento directo +// psiValue += huffmanCode; +// } +// else +// if(huffmanCode < negStart) { // Estamos nun run +// runLenght = huffmanCode - runLenghtStart; +// if(index+runLenght>=positionsSinceSample) +// return psiValue+positionsSinceSample-index; +// else { +// psiValue += runLenght; +// index += runLenght-1; +// } +// } +// else +// if(huffmanCode < bigStart) { // Negativo +// binaryLenght = huffmanCode-negStart+1; +// absolute_value = bitread(cPsi->stream,ptr,binaryLenght); +// ptr += binaryLenght; +// psiValue -= absolute_value; +// } +// else { // Grande +// binaryLenght = huffmanCode-bigStart+1; +// absolute_value = bitread(cPsi->stream,ptr,binaryLenght); +// ptr += binaryLenght; +// psiValue += absolute_value; +// } +// +// } +// +// return psiValue; +// +//} +// +// +//void storeHuffmanCompressedPsi(HuffmanCompressedPsi *compressedPsi, char *filename) { +// +// int file; +// THuff H; +// +// if( (file = open(filename, O_WRONLY|O_CREAT, 0700)) < 0) { +// printf("Cannot open file %s\n", filename); +// exit(0); +// } +// write(file, &(compressedPsi->T), sizeof(int)); +// // Almacenar o arbol de huffman +// H = compressedPsi->diffsHT; +// write(file, &H.max, sizeof(int)); +// write(file, &H.lim, sizeof(int)); +// write(file, &H.depth, sizeof(int)); +//// write(file, H.s.spos, (H.lim+1)*sizeof(int)); +// write(file, H.s.symb, (H.lim+1)*sizeof(int)); +// write(file, H.num, (H.depth+1)*sizeof(int)); +// write(file, H.fst, (H.depth+1)*sizeof(int)); +// // Fin de almacenar o arbol de huffman +// write(file, &(compressedPsi->nS), sizeof(int)); +// write(file, &(compressedPsi->numberOfSamples), sizeof(int)); +// write(file, &(compressedPsi->sampleSize), sizeof(int)); +// write(file, compressedPsi->samples, ((compressedPsi->numberOfSamples*compressedPsi->sampleSize+31)/32)*sizeof(int)); +// write(file, &(compressedPsi->pointerSize), sizeof(int)); +// write(file, compressedPsi->samplePointers, ((compressedPsi->numberOfSamples*compressedPsi->pointerSize+31)/32)*sizeof(int)); +// write(file, &(compressedPsi->streamSize), sizeof(int)); +// write(file, compressedPsi->stream, ((compressedPsi->streamSize+31)/32)*sizeof(int)); +// +// close(file); +// +//} +// +// +//HuffmanCompressedPsi loadHuffmanCompressedPsi(char *filename) { +// +// HuffmanCompressedPsi compressedPsi; +// +// THuff H; +// +// int file; +// +// if( (file = open(filename, O_RDONLY)) < 0) { +// printf("Cannot read file %s\n", filename); +// exit(0); +// } +// read(file, &(compressedPsi.T), sizeof(int)); +// // Cargamos o arbol de Huffman +// read(file, &H.max, sizeof(int)); +// read(file, &H.lim, sizeof(int)); +// read(file, &H.depth, sizeof(int)); +// //H.s.spos = (unsigned int *) malloc((H.lim+1)*sizeof(int)); +// //H.s.spos =H.s.symb = (unsigned int *) malloc((H.lim+1)*sizeof(int)); +// H.s.symb = (unsigned int *) malloc((H.lim+1)*sizeof(int)); +// H.num = (unsigned int *) malloc((H.depth+1)*sizeof(int)); +// H.fst = (unsigned int *) malloc((H.depth+1)*sizeof(int)); +// +// //read(file, H.s.spos, (H.lim+1)*sizeof(int)); +// fprintf(stderr," \n read %d spos bytes\n", (H.lim+1)*sizeof(int)); +// read(file, H.s.symb, (H.lim+1)*sizeof(int)); +// +// read(file, H.num, (H.depth+1)*sizeof(int)); +// read(file, H.fst, (H.depth+1)*sizeof(int)); +// compressedPsi.diffsHT = H; +// // Fin da carga do arbol de Huffman +// read(file, &(compressedPsi.nS), sizeof(int)); +// read(file, &(compressedPsi.numberOfSamples), sizeof(int)); +// read(file, &(compressedPsi.sampleSize), sizeof(int)); +// compressedPsi.samples = (unsigned int *)malloc(((compressedPsi.numberOfSamples*compressedPsi.sampleSize+31)/32)*sizeof(int)); +// read(file, compressedPsi.samples, ((compressedPsi.numberOfSamples*compressedPsi.sampleSize+31)/32)*sizeof(int)); +// read(file, &(compressedPsi.pointerSize), sizeof(int)); +// compressedPsi.samplePointers = (unsigned int *)malloc(((compressedPsi.numberOfSamples*compressedPsi.pointerSize+31)/32)*sizeof(int)); +// read(file, compressedPsi.samplePointers, ((compressedPsi.numberOfSamples*compressedPsi.pointerSize+31)/32)*sizeof(int)); +// read(file, &(compressedPsi.streamSize), sizeof(int)); +// compressedPsi.stream = (unsigned int *)malloc(((compressedPsi.streamSize+31)/32)*sizeof(int)); +// read(file, compressedPsi.stream, ((compressedPsi.streamSize+31)/32)*sizeof(int)); +// +// close(file); +// +// return compressedPsi; +// +//} diff --git a/swcsa/interface.h b/swcsa/interface.h new file mode 100755 index 0000000..938bd43 --- /dev/null +++ b/swcsa/interface.h @@ -0,0 +1,96 @@ + +/* General interface for using the compressed index libraries */ + +#ifndef uchar +#define uchar unsigned char +#endif +#ifndef uint +#define uint unsigned int +#endif +#ifndef ulong +#define ulong unsigned long +#endif + +/* Error management */ + + /* Returns a string describing the error associated with error number + e. The string must not be freed, and it will be overwritten with + subsequent calls. */ + +char *error_index (int e); + +/* Building the index */ + + /* Creates index from text[0..length-1]. Note that the index is an + opaque data type. Any build option must be passed in string + build_options, whose syntax depends on the index. The index must + always work with some default parameters if build_options is NULL. + The returned index is ready to be queried. */ + +int build_index (uchar *text, ulong length, char *build_options, void **index); + + /* Saves index on disk by using single or multiple files, having + proper extensions. */ + +int save_index (void *index, char *filename); + + /* Loads index from one or more file(s) named filename, possibly + adding the proper extensions. */ + +int load_index (char *filename, void **index); + + /* Frees the memory occupied by index. */ + +int free_index (void *index); + + /* Gives the memory occupied by index in bytes. */ + +int index_size(void *index, ulong *size); + +/* Querying the index */ + + /* Writes in numocc the number of occurrences of the substring + pattern[0..length-1] found in the text indexed by index. */ + +//int count (void *index, uchar *pattern, ulong length, ulong *numocc); +// +// /* Writes in numocc the number of occurrences of the substring +// pattern[0..length-1] in the text indexed by index. It also allocates +// occ (which must be freed by the caller) and writes the locations of +// the numocc occurrences in occ, in arbitrary order. */ +// +//int locate (void *index, uchar *pattern, ulong length, ulong **occ, +// ulong *numocc); +// +// /* Gives the length of the text indexed */ +// +//int get_length(void *index, ulong *length); +// +///* Accessing the indexed text */ +// +// /* Allocates snippet (which must be freed by the caller) and writes +// the substring text[from..to] into it. Returns in snippet_length the +// length of the text snippet actually extracted (that could be less +// than to-from+1 if to is larger than the text size). */ +// +//int extract (void *index, ulong from, ulong to, uchar **snippet, +// ulong *snippet_length); +// +// /* Displays the text (snippet) surrounding any occurrence of the +// substring pattern[0..length-1] within the text indexed by index. +// The snippet must include numc characters before and after the +// pattern occurrence, totalizing length+2*numc characters, or less if +// the text boundaries are reached. Writes in numocc the number of +// occurrences, and allocates the arrays snippet_text and +// snippet_lengths (which must be freed by the caller). The first is a +// character array of numocc*(length+2*numc) characters, with a new +// snippet starting at every multiple of length+2*numc. The second +// gives the real length of each of the numocc snippets. */ +// +//int display (void *index, uchar *pattern, ulong length, ulong numc, +// ulong *numocc, uchar **snippet_text, ulong **snippet_lengths); + + /* Obtains the length of the text indexed by index. */ + +int length (void *index, ulong *length); + diff --git a/swcsa/utils/MemoryManager.c b/swcsa/utils/MemoryManager.c new file mode 100755 index 0000000..e50f5b7 --- /dev/null +++ b/swcsa/utils/MemoryManager.c @@ -0,0 +1,105 @@ + +/*----------------------------------------------------------------------- + File : MemoryManager.cpp + Function : Reserves large blocks of memory and gives pointers to small + portions of that block when requested. + This improves performance since a unique "LARGE ALLOCATION" + of memory is needed (a unique call to malloc). + It is also responsible of freeing memory. + Last change: 10/03/2004 + Purpose : Improve hash performance. + ------------------------------------------------------------------------*/ +#include "MemoryManager.h" + + +/*------------------------------------------------------------------ + Constructor method + ------------------------------------------------------------------ */ +MemoryManager createMemoryManager(void) { + MemoryManager mm; + mm = (MemoryManager) malloc (sizeof(struct sMem)); + mm->currentBlock=0; + createNewMemoryBlock(mm); + return mm; +} + +/*------------------------------------------------------------------ + Destructor method + ------------------------------------------------------------------ */ +void destroyMemoryManager (MemoryManager mm){ + register int i; + for (i=0; i<=mm->currentBlock;i++) free(mm->BLOCKS[i]); + printf("\n[destroying MemManager] ...Freed %u bytes... RAM", LARGE_BLOCK_SIZE* (mm->currentBlock+1)); + free(mm); + +} + +/*------------------------------------------------------------------ + createNewMemoryBlock method + Allocates a new memory block of size "LARGE_BLOCK_SIZE" and adds it to + vector BLOCKS + ------------------------------------------------------------------ */ + +void createNewMemoryBlock (MemoryManager mm) { + mm->BLOCKS[mm->currentBlock] = (byte *) malloc (LARGE_BLOCK_SIZE); + + if (mm->BLOCKS[mm->currentBlock] == NULL) { + fprintf(stderr, "\nERROR...\nUnable to allocate enough memory. Exitting...\n"); + exit(0); + } + + mm->remainderBytes = LARGE_BLOCK_SIZE; + mm->availableByte = mm->BLOCKS[mm->currentBlock]; //points to the begining of the block +} + + +/*------------------------------------------------------------------ + getBlock method + returns a pointer to a free block of memory of size "size" + ------------------------------------------------------------------ */ +void getMemoryBlock (MemoryManager mm, byte **dst, const unsigned int size) { + if (mm->remainderBytes < size) { + mm->currentBlock++; + createNewMemoryBlock(mm); + //fprintf(stderr,"\new memory block"); + } + + *dst = mm->availableByte; //points to a free size-block + mm->remainderBytes -= (size); + mm->availableByte += (size); +} + + +/*------------------------------------------------------------------ + main, to make unit proofs + ------------------------------------------------------------------ */ +/* +int main(int argc, char* argv[]) +{ byte *word, *word2; + unsigned int size; +int i; + MemoryManager memMgr; + memMgr=createMemoryManager(); + + size = 100; + getMemoryBlock(memMgr,&word,size); + + fprintf(stderr,"pasei getblock\n"); + strcpy((char *)(word), "01234567890123456789012345678901234567890123456789"); + word[50]='\0'; + fprintf(stderr,"pasei strcpy \n"); + fprintf(stderr,"\n%s",word); + getMemoryBlock(memMgr,&word2,size); + strcpy((char *)(word2), "soy la word 2"); + + for (i=0;i<100000;i++) { + size = 89; + getMemoryBlock(memMgr,&word,size); + } + + fprintf(stderr,"\n final %s",word2); + + destroyMemoryManager(memMgr); + +} */ + diff --git a/swcsa/utils/MemoryManager.h b/swcsa/utils/MemoryManager.h new file mode 100755 index 0000000..fb51e63 --- /dev/null +++ b/swcsa/utils/MemoryManager.h @@ -0,0 +1,47 @@ +/*----------------------------------------------------------------------- + File : MemoryManager.h + Function : Reserves large blocks of memory and gives pointers to small + portions of that block when requested. + This improves performance since a unique "LARGE ALLOCATION" + of memory is needed (a unique call to malloc). + It is also responsible of freeing memory. + Last change: 10/03/2004 + Purpose : Improve hash performance. + ------------------------------------------------------------------------*/ + +#ifndef MEMORYMANAGERINCLUDED +#define MEMORYMANAGERINCLUDED // only used for hashTable of stopwords + +#include +#include +#include +#include +#include + +#ifndef byte + #define byte unsigned char +#endif + +#define LARGE_BLOCK_SIZE 1024*256 // Size of the blocks of memory that will be allocated +#define MAX_BLOCKS 2048 // Maximum number of blocks of size LARGE_BLOCK_SIZE that + // can be allocated + + /* + * Definition of structure MemoryManager + */ + struct sMem { + byte *BLOCKS[MAX_BLOCKS]; //array of blocks of size LARGE_BLOCK_SIZE + unsigned int currentBlock; //currentBlock in the array of blocks + unsigned long remainderBytes; //number of bytes not yet assigned in BLOCKS[currentBlock] + byte *availableByte; //pointer to next byte not yet assigned + } ; + + typedef struct sMem *MemoryManager; + + + MemoryManager createMemoryManager(void); + void destroyMemoryManager (MemoryManager mm); + void getMemoryBlock (MemoryManager mm, byte **dst, const unsigned int size); + void createNewMemoryBlock (MemoryManager mm); + +#endif diff --git a/swcsa/utils/basics.c b/swcsa/utils/basics.c new file mode 100755 index 0000000..f213ed6 --- /dev/null +++ b/swcsa/utils/basics.c @@ -0,0 +1,107 @@ + +// Basics + +// #include "basics.h" included later to avoid macro recursion for malloc +#include +#include +#include + + // Memory management + + void *Malloc (int n) + + { void *p; + if (n == 0) return NULL; + p = (void*) malloc (n); + if (p == NULL) + { fprintf (stderr,"Could not allocate %i bytes\n",n); + exit(1); + } + return p; + } + + void Free (void *p) + + { if (p) free (p); + } + + void *Realloc (void *p, int n) + + { if (p == NULL) return Malloc (n); + if (n == 0) { Free(p); return NULL; } + p = (void*) realloc (p,n); + if (p == NULL) + { fprintf (stderr,"Could not allocate %i bytes\n",n); + exit(1); + } + return p; + } + +#include "basics.h" + + // bits needed to represent a number between 0 and n + +uint bits (uint n) + + { uint b = 0; + while (n) + { b++; n >>= 1; } + return b; + } + + // returns e[p..p+len-1], assuming len <= W + +uint bitread (uint *e, uint p, uint len) + + { uint answ; + e += p/W; p %= W; + answ = *e >> p; + if (len == W) + { if (p) answ |= (*(e+1)) << (W-p); + } + else { if (p+len > W) answ |= (*(e+1)) << (W-p); + answ &= (1<> (W-p)); + } + else { if (p+len <= W) + { *e = (*e & ~(((1<> (W-p)); + } + } + // writes e[p..p+len-1] = 0 + +void bitzero (register uint *e, register uint p, + register uint len) + + { e += p/W; p %= W; + if (p+len >= W) + { *e &= ~((1<= W) + { *e++ = 0; + len -= W; + } + if (len > 0) + *e &= ~(((1< +#include +#include +#include +#include +#include +#include +#include +#include + + // Memory management + +#define malloc(n) Malloc(n) +#define free(p) Free(p) +#define realloc(p,n) Realloc(p,n) + +void *Malloc (int n); +void Free (void *p); +void *Realloc (void *p, int n); + + // Data types + +#ifndef byte + #define byte unsigned char +#endif + +//typedef unsigned char byte; +// typedef unsigned int uint; + +//typedef int bool; +//#define true 1 +//#define false 0 + +#define max(x,y) ((x)>(y)?(x):(y)) +#define min(x,y) ((x)<(y)?(x):(y)) + + // Bitstream management + +//#define W (8*sizeof(uint)) +#define W (32) + + // bits needed to represent a number between 0 and n +uint bits (uint n); + // returns e[p..p+len-1], assuming len <= W +uint bitread (uint *e, uint p, uint len); + // writes e[p..p+len-1] = s, assuming len <= W +void bitwrite (uint *e, uint p, uint len, uint s); + // writes e[p..p+len-1] = 0, no assumption on len + + /**/ //FARI. WITH ASSUMPTION ON LEN, OR IT CRASHES + //NOt WORKING UPON THE LIMIT OF THE STARTING uint. +void bitzero (uint *e, uint p, uint len); + // reads bit p from e +#define bitget(e,p) (((e)[(p)/W] >> ((p)%W)) & 1) + // sets bit p in e +#define bitset(e,p) ((e)[(p)/W] |= (1<<((p)%W))) + // cleans bit p in e +#define bitclean(e,p) ((e)[(p)/W] &= ~(1<<((p)%W))) + + + +/* bitRead and bitWrite as MACROS */ + // returns e[p..p+len-1], assuming len <= W + //mybitread (uint returned value, uint *e, uint p, uint len) +#define mybitread(answ, v, p, len) \ + { uint *e ; \ + e=v;\ + e += p/W; p %= W; \ + answ = *e >> p; \ + if (len == W) \ + { if (p) answ |= (*(e+1)) << (W-p); \ + } \ + else { if (p+len > W) answ |= (*(e+1)) << (W-p); \ + answ &= (1<> (W-p)); \ + } \ + } \ + else { if (p+len <= W) \ + { *e = (*e & ~(((1<> (W-p)); \ + } \ + } \ + } + +#endif diff --git a/swcsa/utils/bitmap.c b/swcsa/utils/bitmap.c new file mode 100755 index 0000000..92ac4c0 --- /dev/null +++ b/swcsa/utils/bitmap.c @@ -0,0 +1,316 @@ + +// Implements operations over a bitmap + +#include "bitmap.h" + + + // In theory, we should have superblocks of size s=log^2 n divided into + // blocks of size b=(log n)/2. This takes + // O(n log n / log^2 n + n log log n / log n + log n sqrt n log log n) bits + // In practice, we can have any s and b, and the needed amount of bits is + // (n/s) log n + (n/b) log s + b 2^b log b bits + // Optimizing it turns out that s should be exactly s = b log n + // Optimizing b is more difficult but could be done numerically. + // However, the exponential table does no more than popcounting, so why not + // setting up a popcount algorithm tailored to the computer register size, + // defining that size as b, and proceeding. + +//unsigned char OnesInByte[] = +const unsigned char popc[] = //number of ones in one byte value [0..255]. +{ +0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, +1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, +1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, +2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, +1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, +2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, +2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, +3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8, +}; + +uint popcount (register uint x) + + { return popc[x&0xFF] + popc[(x>>8)&0xFF] + popc[(x>>16)&0xFF] + popc[x>>24]; + } + + +/******************************************************************/ +// FUNCIONS DE EDU ... +/******************************************************************/ +/* + Creates a bitmap and structures to rank and select +*/ + +//bitmap createBitmapEdu (uint *string, uint n){ return createBitmap(string,n);} + +bitmap createBitmap (uint *string, uint n){ + bitmap B; + + unsigned int nb; + unsigned int ns; + unsigned int countB, countS, blockIndex, superblockIndex; + register unsigned int block; + + B =(struct sbitmap *) malloc (sizeof(struct sbitmap)); + B->data = string; + B->n = n; + ns = (n/256)+1; + nb = (n/32)+1; + + B->bSize = nb; + B->sSize = ns; + B->bdata =(byte *)malloc(nb*sizeof(byte)); // Db = (unsigned char *)malloc(nb*sizeof(unsigned char)); + B->sdata = (uint*)malloc(ns*sizeof(uint)); // Ds = (unsigned int *)malloc(ns*sizeof(unsigned int)); + + B->mem_usage = (ns*sizeof(uint)) + (nb*sizeof(byte)) + (sizeof(struct sbitmap)); + /* Ahora construimos los bloques */ + blockIndex = 0; + superblockIndex = 0; + countB = 0; + countS = 0; + + while(blockIndex < nb) { + + if(!(blockIndex%8)) { + countS += countB; + B->sdata[superblockIndex++] = countS; + countB = 0; + } + + B->bdata[blockIndex] = countB; + block = string[blockIndex++]; + + countB += popcount(block); + } + + B->pop = countS+countB; + +// {int i; //fprintf(stderr,"\n"); +// for (i=0;isdata[i]); +// } +// //fprintf(stderr,"\n"); +// for (i=0;i<8;i++) {//fprintf(stderr,"%d ",B->bdata[i]); +// } +// } + return B; +} + + +/* + Number of 1s in range [0,posicion] +*/ +//uint rank1Edu(bitmap B, unsigned int position) { +//uint rank1Edu(bitmap B, unsigned int position) { return rank(B,position);} +uint rank(bitmap B, unsigned int position) { + register unsigned int block; + if (position > B->n) return B->pop; + //position -=1; + + block = B->data[position/32] << (31-position%32); + + return B->sdata[position/256] + B->bdata[position/32] + + popc[block & 0xff] + popc[(block>>8) & 0xff] + + popc[(block>>16) & 0xff] + popc[block>>24]; +} + + +/********************************************************************************************/ +/**************************************************************************************/ + +static uint binsearch (uint *data, uint size, uint val) + + { uint i,j,m; + i = 0; j = size; + while (i+1 < j) + { m = (i+j)/2; + if (data[m] >= val) j = m; + else i = m; + } + return i; + } + +uint bselect (bitmap B, uint j) + + { uint spos,bpos,pos,word,x; + byte *blk; + if (j > B->pop) return B->n; + spos = binsearch(B->sdata,(B->n+256-1)/256,j); + + //fprintf(stderr,"\n SPOS IS %d, and B->sdata[pos] = %d",spos,B->sdata[spos]); + j -= B->sdata[spos]; + pos = spos<<8; + blk = B->bdata + (pos>>5); + bpos = 0; + + //while ((bpos < (1<<3)-1) && (blk[bpos+1] < j)) bpos++; + while ( ((spos*8+bpos) < ((B->n-1)/W)) && (bpos < (1<<3)-1) && (blk[bpos+1] < j)) bpos++; + + + //fprintf(stderr,"\n BPOS = %d",bpos); + pos += bpos<<5; + word = B->data[pos>>5]; + j -= blk[bpos]; + //fprintf(stderr,"\n pos>>5 = %d ... pasou XXX con word = %d, and j= %d",pos>>5,word,j); + while (1) + { x = popc[word & ((1<<8)-1)]; + //fprintf(stderr,"\n word = %u popc vale %u",word & ((1<<8)-1),x); + if (j <= x) break; + j -= x; pos += 8; + word >>= 8; + + } + + while (j) { if (word & 1) j--; word >>= 1; pos++; } + + // fprintf(stderr,"\n\nBSELECT::: POSICIÓN FINAL = %u",pos-1); + return pos-1; + + } + + +// destroys the bitmap, freeing the original bitstream +void destroyBitmap (bitmap B) + + { //free (B->data); + free (B->bdata); + free (B->sdata); + free (B); + } + + +// Prints the bit vector +void showBitVector(uint * V, int vectorSize) { + uint bitIndex=0; + while(bitIndexsSize), sizeof(uint)); + write(file, b->sdata, sizeof(int) * (b->sSize)); + write(file, &(b->bSize), sizeof(uint)); + write(file, b->bdata, sizeof(byte) * (b->bSize)); + + write(file, &(b->pop), sizeof(uint)); + write(file, &(b->n), sizeof(uint)); + close(file); +} + +/* loads the Rank structures from disk, and sets Bitmap->data ptr to "string" +*/ +bitmap loadBitmap (char *filename, uint *string, uint n) { + bitmap B; + int file; + + if( (file = open(filename, O_RDONLY)) < 0) { + printf("Cannot read file %s\n", filename); + exit(0); + } + + B = (struct sbitmap *) malloc (sizeof(struct sbitmap)); + B->data = string; + + read(file, &(B->sSize), sizeof(uint)); + B->sdata = (uint *) malloc(sizeof(uint) * B->sSize); + read(file, B->sdata, sizeof(uint) * B->sSize); + + read(file, &(B->bSize), sizeof(uint)); + B->bdata = (byte *) malloc(sizeof(byte) * B->bSize); + read(file, B->bdata, sizeof(byte) * B->bSize); + + read(file, &(B->pop), sizeof(uint)); + read(file, &(B->n), sizeof(uint)); + close(file); + B->mem_usage = (sizeof(uint) * B->sSize) + (sizeof(byte) * B->bSize) + (sizeof(struct sbitmap)); + + if (n != B->n) {printf("\n LoadBitmap failed: %u distinto de %u",n,B->n); exit(0);} + return B; + +} + + +/********************************************************************************************/ +/********************************************************************************************/ + + + + + // creates a bitmap structure from a bitstring, which is shared + +bitmap createBitmapGONZA (uint *string, uint n) +//bitmap createBitmap (uint *string, uint n) + + { bitmap B; + uint i,j,pop,bpop,pos; + uint s,nb,ns,words; + B = (struct sbitmap *) malloc (sizeof(struct sbitmap)); + B->data = string; + + + B->n = n; words = (n+W-1)/W; + ns = (n+256-1)/256; nb = 256/W; // adjustments + + B->bSize = ns*nb; + B->bdata = (byte *) malloc (ns*nb*sizeof(byte)); + B->sSize = ns; + B->sdata = (uint *)malloc (ns*sizeof(int)); + + B->mem_usage = (ns*sizeof(int)) + (ns*nb*sizeof(byte)) + (sizeof(struct sbitmap)); +#ifdef INDEXREPORT + printf (" Bitmap over %i bits took %i bits\n", n,n+ns*nb*8+ns*32); +#endif + //fprintf (stderr," Bitmap over %i bits took %i bits\n", n,n+ns*nb*8+ns*32); + pop = 0; pos = 0; + for (i=0;isdata[i] = pop; + for (j=0;jbdata[pos++] = bpop; + bpop += popcount(*string++); + } + pop += bpop; + } + B->pop = pop; + + // //fprintf(stderr,"\n"); + // for (i=0;isdata[i]); + // } + // //fprintf(stderr,"\n"); + // for (i=0;ibdata[i]); + // } + + return B; + } + + // rank(i): how many 1's are there before position i, not included + +//uint rank (bitmap B, uint i) +uint rankGONZA (bitmap B, uint i) + + { + i++; + if (i > B->n) return B->pop; + return B->sdata[i>>8] + B->bdata[i>>5] + + popcount (B->data[i>>5] & ((1<<(i&0x1F))-1)); + } + + + + + + + + + + + + diff --git a/swcsa/utils/bitmap.h b/swcsa/utils/bitmap.h new file mode 100755 index 0000000..2c61ed0 --- /dev/null +++ b/swcsa/utils/bitmap.h @@ -0,0 +1,46 @@ + +// Implements operations over a bitmap + +#ifndef BITMAPINCLUDED +#define BITMAPINCLUDED + +#include "basics.h" + +typedef struct sbitmap + { uint *data; + uint n; // # of bits + uint pop; // # bits set + uint *sdata; // superblock counters + uint sSize; // size of sdata vector + byte *bdata; // block counters + uint bSize; // size of bdata vector + uint mem_usage; + } *bitmap; + + + // creates a bitmap structure from a bitstring, which gets owned +bitmap createBitmap (uint *string, uint n); + // rank(i): how many 1's are there before position i, not included +uint rank (bitmap B, uint i); + // select(i): position of i-th 1 +uint bselect (bitmap B, uint i); + // destroys the bitmap, freeing the original bitstream +void destroyBitmap (bitmap B); + // popcounts 1's in x +uint popcount (register uint x); + +void saveBitmap (char *filename, bitmap b); +bitmap loadBitmap (char *filename, uint *string, uint n); + + + +////EDU'S functions included here. +//bitmap createBitmapEdu (uint *string, uint n); +//uint popcountEdu (register uint x); //which is identical to popcount. +//uint rank1Edu(bitmap B, unsigned int position); +//unsigned int isActiveBit(uint *V, uint position); +void showBitVector(uint * V, int vectorSize); + +#endif + + diff --git a/swcsa/utils/defValues.h b/swcsa/utils/defValues.h new file mode 100755 index 0000000..a09122d --- /dev/null +++ b/swcsa/utils/defValues.h @@ -0,0 +1,42 @@ +#ifndef DEFVALUES_INCLUDED +#define DEFVALUES_INCLUDED + +//#define MAX_LEN_VOCABULARY 3000000 //number of different words --> (number of variants) +#define MAX_MEANINGFUL_WORDS 100000000 //number of words (non separators) +#define MAX_SIZE_OF_WORD 1000000 //255 //size of word +#define MAX_SIZE_OF_GAP 1000000 //1000 //size of separator + //#define MAX_STOPW0RDS_SIZE 255 //size of stopword +#define MAX_SIZE_OF_ANY 1000000 //1000 + + +#define DEFAULT_SUFFIX_ARRAY_SIZE 50000000 + +#define DEFAULT_SAMPLE_PERIOD_Z 32 +#define DEFAULT_SAMPLE_PERIOD_B 32 + + + + +//for queries +#define MAX_INTEGER_PATTERN_SIZE 20 +#define MAX_TEXT_PATTERN_SIZE 100 //maximum number of valid words in a searched pattern + +#ifndef DEBUG_ON + // #define DEBUG_ON +#endif + + + +#define byte unsigned char + + +#define CSA_ON //generates the CSA or only "presentation layer" +//#define WRITE_SE_FILE //outputs to a file the array of integers indexed. + +// Extensions of created files + +#define VOCABULARY_WORDS_FILE_EXT "words" +#define SE_FILE_EXT "se" +#define CONSTANTS_FILE_EXT "cte" + +#endif diff --git a/swcsa/utils/errors.c b/swcsa/utils/errors.c new file mode 100755 index 0000000..42886ff --- /dev/null +++ b/swcsa/utils/errors.c @@ -0,0 +1,48 @@ + +/*//////////////// +//Error handling// +////////////////*/ +/* +char *error_index(int e){ + switch(e) { + case 0: return "No error"; break; + case 1: return "Out of memory"; break; + case 2: return "The text must end with a \\0"; break; + case 5: return "You can't free the text if you don't copy it"; break; + case 20: return "Cannot create files"; break; + case 21: return "Error writing the index"; break; + case 22: return "Error writing the index"; break; + case 23: return "Cannot open index; break"; + case 24: return "Cannot open text; break"; + case 25: return "Error reading the index"; break; + case 26: return "Error reading the index"; break; + case 27: return "Error reading the text"; break; + case 28: return "Error reading the text"; break; + case 99: return "Not implemented"; break; + default: return "Unknown error"; + } +} +*/ + + +char *error_index(int e){ +static char err[100]; + switch(e) { + case 0: strcpy(err, "No error"); break; + case 1: strcpy(err, "Out of memory"); break; + case 2: strcpy(err, "The text must end with a \\0"); break; + case 5: strcpy(err, "You can't free the text if you don't copy it"); break; + case 20: strcpy(err, "Cannot create files"); break; + case 21: strcpy(err, "Error writing the index"); break; + case 22: strcpy(err, "Error writing the index"); break; + case 23: strcpy(err, "Cannot open index; break"); + case 24: strcpy(err, "Cannot open text; break"); + case 25: strcpy(err, "Error reading the index"); break; + case 26: strcpy(err, "Error reading the index"); break; + case 27: strcpy(err, "Error reading the text"); break; + case 28: strcpy(err, "Error reading the text"); break; + case 99: strcpy(err, "Not implemented"); break; + default: strcpy(err, "Unknown error"); + } + return err; +} diff --git a/swcsa/utils/fileInfo.c b/swcsa/utils/fileInfo.c new file mode 100755 index 0000000..3f9d413 --- /dev/null +++ b/swcsa/utils/fileInfo.c @@ -0,0 +1,44 @@ + +#include "fileInfo.h" + +unsigned long fileSize (char *filename){ + FILE *fpText; + unsigned long fsize; + fpText = fopen(filename,"rb"); + fsize=0; + if (fpText) { + fseek(fpText,0,2); + fsize= ftell(fpText); + fclose(fpText); + ////fprintf(stderr,"fileSize = %ld",fsize); + } + return fsize; +} + +/*copies from infile to outfile */ +void copyFile (char *infile, char *outfile){ + FILE *in, *out; + unsigned long fsize; + + if ( (in = fopen(infile,"rb")) <0) { + printf("Cannot open file %s\n", infile); exit(0); + } + + unlink(outfile); + if( (out = fopen(outfile, "w")) < 0) { + printf("Cannot open file %s\n", outfile); + exit(0); + } + + fsize=fileSize(infile); + if (fsize) { + char *buff = (char *) malloc(sizeof(char)*fsize); + if (fread(buff,sizeof(char),fsize,in)) { + fwrite(buff,sizeof(char),fsize,out); + } + free(buff); + } + fclose(in); + fclose(out); +} + diff --git a/swcsa/utils/fileInfo.h b/swcsa/utils/fileInfo.h new file mode 100755 index 0000000..603b4f2 --- /dev/null +++ b/swcsa/utils/fileInfo.h @@ -0,0 +1,19 @@ +#ifndef FILE_INFO_INCLUDED +#define FILE_INFO_INCLUDED + +#include +#include + #include + + + + /*------------------------------------------------------------------ + Obtains the size of a file. + ------------------------------------------------------------------ */ + unsigned long fileSize (char *filename); + + /* copies from infile to outfile */ + void copyFile (char *infile, char *outfile); + +#endif + diff --git a/swcsa/utils/hash.c b/swcsa/utils/hash.c new file mode 100755 index 0000000..8e77870 --- /dev/null +++ b/swcsa/utils/hash.c @@ -0,0 +1,394 @@ + +/* DYNAMIC END-TAGGED DENSE CODE. -- +A dynamic word-based byte oriented compressor for text files based on +dynamic End-Tagged Dense Code. + +Brisaboa, N. R., Faria, A., Navarro, G., Param, J. R. +Simple, Fast, and Efficient Natural Language Adaptive Compression. +11th International Symposium on String Processing and Information Retrieval (SPIRE'04) - LNCS 3246. A. Apostolico, M. Melucci (Ed.), pp. 230-241. +Padova (Italia), 2004. + +Copyright (C) 2005 Antonio Faria. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +Author's contact: Antonio Faria, Dept. of Computer Science, University of +A Corua. Campus de Elvia s/n. Spain fari@udc.es +*/ + +/*----------------------------------------------------------------------- + Hash: Definition of HashTable class (Linear Hash) + ------------------------------------------------------------------------*/ + +#include "hash.h" + +#include +#include +#include + + +/*----------------------------------------------------------------- + Initilization of data structures used by the hashTable + ---------------------------------------------------------------- */ +t_hash initialize_hash (unsigned long sizeVoc) { + t_hash h; + unsigned long i; + + h = (t_hash) malloc(sizeof(struct hashStr)); + h->SIZE_HASH = (unsigned long) (OCUP_HASH * sizeVoc); + h->SIZE_HASH = NearestPrime(h->SIZE_HASH); + h->hash = (t_hashNode *) malloc(h->SIZE_HASH * sizeof(t_hashNode)); + h->NumElem = 0; + + //creates the memory manager that is used to reserve small pieces of memory (for words) + h->_memMgr=createMemoryManager(); + + + for (i = 0; i < h->SIZE_HASH; i++) { + h->hash[i].word = NULL; + h->hash[i].len = 0; + h->hash[i].posInVoc = 0; + } + printf("\nHash Table initilized with: %lu elements\n",h->SIZE_HASH); + + return h; +} + + +/*------------------------------------------------------------------ + Find the nearest prime number over n. + ---------------------------------------------------------------- */ +unsigned long NearestPrime(unsigned long n) +{ + long position; /* the prime number being sought */ + long index; + + for (position = n; ; position++) + { + // checks if those values from 2 to $\sqrt{m}$ can be factors of $m$ */ + for (index = 2; index <= (long) sqrt((double) position) && position % index != 0; index++) ; + + if (position % index != 0) /* No factors in that range, therefore a prime number was found */ + { + break; + } + } + return position; +} + + + +/*----------------------------------------------------------------------- + Computes a hash function over a string "aWord", it returns the position + where "aWord" should go in the hash table if no collision ocurrs. + ---------------------------------------------------------------------*/ +unsigned long hashFunction (const unsigned char *aWord, unsigned int len, unsigned long sizeHash) +{ + char c; + register unsigned int h; + register unsigned long last; + last=((unsigned long) aWord) +len -1; + + + h = SEED; + //last=aWord+len; + + for( ; ((unsigned long) aWord <=last ) ; ) + //for(c=*aWord; (aWord++)> 2) ); + } + return((unsigned int)((h&0x7fffffff) % sizeHash)); +} + + +/*----------------------------------------------------------------------- + compares two strings + ---------------------------------------------------------------------*/ + +/* Author J. Zobel, April 2001. + http://www.seg.rmit.edu.au/code/zwh-ipl/ + Permission to use this code is freely granted, provided that this + statement is retained. */ + + + + int strcomp(const unsigned char *s1, const unsigned char *s2, register unsigned long ws1, unsigned long ws2) { + if (ws1 !=ws2) { + return -1; + } + + { register unsigned long iters; + iters=1; + while( iters iters) so s1 != s2 +// return( *s1-*s2); +// } +//} + + +//permits to compare 2 strings of len ws1 and ws2 that do not end in '\0' +int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) { + register ulong iters = 0; + + while( itersNumElem >= h->SIZE_HASH -1) //loses 1 slot, but ensures that "search function" does not enter an infinity loop + { + printf("\n\nHash table full!! Change size and recompile !\n"); + exit(1); + } + + getMemoryBlock(h->_memMgr,( byte **)&(h->hash[*addr].word),len+1); + //fprintf(stderr,"\n tras obter memoria"); + + strncpy ((char *) h->hash[*addr].word, (char *)aWord, len); + + h->hash[*addr].word[len]='\0'; + h->hash[*addr].len =len; + h->hash[*addr].posInVoc = h->NumElem; + h->NumElem++; + + //fprintf(stderr,"\n####inserted word [%s] ",h->hash[*addr].word); + + //return *addr; +} + +/*----------------------------------------------------------------------- + Search for a word in the hash table and returns its position in the + vocabulary. It returns the next "available" position in the vocabulary, + if the word is not in the hash table. That is: a "0-node" position. + It also returns -using attribute returnedAddr- the position where word + was found (or where it should go if it was inserted in next "insert call". + -----------------------------------------------------------------------*/ +unsigned long search (t_hash h, const unsigned char *aWord, register unsigned len, + unsigned long *returnedAddr){ + + register unsigned long addr, Saddr; + + //fprintf(stderr,"\n searching for [%s], [%d], sizehash= %ld",aWord,len,h->SIZE_HASH); + addr = hashFunction(aWord,len, h->SIZE_HASH); + Saddr = addr; + + t_hashNode *hash; + hash = h->hash; + + while((hash[addr].word != NULL)&&((strcomp(hash[addr].word, aWord, hash[addr].len, len)) != 0)) { + //fprintf(stderr,"\nComprueba [%s], [%d]",hash[addr].word,strcomp(hash[addr].word, aWord, len)); + addr = ((addr + JUMP) % h->SIZE_HASH); + } + + *returnedAddr = addr; + + if(hash[addr].word == NULL) { + return h->NumElem; //Word was not found + } + else { + return h->hash[addr].posInVoc; //Word was found in this position in the vocabulary + } +} + + + +/*----------------------------------------------------------------------- + Tells if a word appears or not in the hash table. + -----------------------------------------------------------------------*/ +unsigned long inHashTable (t_hash h, const unsigned char *aWord, register unsigned len, unsigned long *returnedAddr){ + + unsigned long searched; + unsigned long nothing; + searched = search(h,aWord,len,¬hing); + *returnedAddr=nothing; + return (searched < (h->NumElem) ); +} + +/*------------------------------------------------------------------ + Destructor method + ------------------------------------------------------------------ */ +void destroy_hash (t_hash hash){ + unsigned long mem=0; + mem += sizeof(struct hashStr) + hash->SIZE_HASH * sizeof(t_hashNode); + free(hash->hash); + destroyMemoryManager(hash->_memMgr); //frees words and variants +// free(hash->_memMgr); + free(hash); + printf("\n[destroying hash table]...Freed %ld bytes... RAM", mem); +} + + + + +/*------------------------------------------------------------------ + main, to make unit proofs + ------------------------------------------------------------------ */ +/* +int main(int argc, char* argv[]) +{ byte a[10]= "word1"; + byte b[10]= "word2"; + byte c[10]= "word3"; + byte d[10]= "word4"; + byte e[10]= "word5"; + byte f[10]= "word6"; + byte * w; + unsigned int size; + unsigned long i,addrInTH; + + t_hash hash; + + _memMgr=createMemoryManager(); + + hash = initialize_hash (2); + + w=a; + i = search (hash,w, strlen(w), &addrInTH ); + insertElement (hash, w, strlen(w), &addrInTH); + fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH); + fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); + + w=b; + i = search (hash,w, strlen(w), &addrInTH ); + insertElement (hash, w, strlen(w), &addrInTH); + fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH); + fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); + + w=c; + i = search (hash,w, strlen(w), &addrInTH ); + insertElement (hash, w, strlen(w), &addrInTH); + fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH); + fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); + + w=d; + i = search (hash,w, strlen(w), &addrInTH ); + insertElement (hash, w, strlen(w), &addrInTH); + fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH); + fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); + +// w=e; +// i = search (hash,w, strlen(w), &addrInTH ); +// insertElement (hash, w, strlen(w), &addrInTH); +// fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH); +// fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); + +// w=f; +// i = search (hash,w, strlen(w), &addrInTH ); +// insertElement (hash, w, strlen(w), &addrInTH); +// fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH); +// fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); + + fprintf(stderr,"\n in: %s hash ? = %d",a,inHashTable(hash,a,strlen(a)) ); + fprintf(stderr,"\n in: %s hash ? = %d",e, inHashTable(hash,e,strlen(e)) ); + fprintf(stderr,"\n in: %s hash ? = %d",b, inHashTable(hash,b,strlen(b)) ); + + destroy_hash(hash); + destroyMemoryManager(_memMgr); + printf("\n\n"); +} +*/ diff --git a/swcsa/utils/hash.h b/swcsa/utils/hash.h new file mode 100755 index 0000000..d691c7a --- /dev/null +++ b/swcsa/utils/hash.h @@ -0,0 +1,91 @@ + +/* DYNAMIC END-TAGGED DENSE CODE. -- +A dynamic word-based byte oriented compressor for text files based on +dynamic End-Tagged Dense Code. + +Brisaboa, N. R., Fariña, A., Navarro, G., Paramá, J. R. +Simple, Fast, and Efficient Natural Language Adaptive Compression. +11th International Symposium on String Processing and Information Retrieval (SPIRE'04) - LNCS 3246. A. Apostolico, M. Melucci (Ed.), pp. 230-241. +Padova (Italia), 2004. + +Copyright (C) 2005 Antonio Fariña. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +Author's contact: Antonio Fariña, Dept. of Computer Science, University of +A Coruña. Campus de Elviña s/n. Spain fari@udc.es +*/ + + +/*----------------------------------------------------------------------- + Hash: Definition of HashTable class (Linear Hash) + ------------------------------------------------------------------------*/ +#ifndef HASH_INCLUDED +#define HASH_INCLUDED + +#include +#include +#include +#include + +#include "MemoryManager.h" + +#define JUMP 101 //jump done when a collision appears +#define OCUP_HASH 1.5 //index of occupation of the hash table +#define SMALL_PRIME 1009 // a small prime number, used to compute a hash function +#define SEED 1159241 +/* Type definitions */ + +#define MIN(a,b) (a < b) ? a : b + +struct hashNode { + unsigned char *word; + unsigned long len; + unsigned long posInVoc; //positon of the canonical word in vector posInHT +}; +typedef struct hashNode t_hashNode; + +struct hashStr { + t_hashNode *hash; /* the slots in the hash table */ + unsigned long SIZE_HASH; /* # entries in the hash table */ + unsigned long NumElem; /* # elements already added to the hash table*/ + MemoryManager _memMgr; /* Holds dynamic memory allocation for words. */ +}; + +typedef struct hashStr *t_hash; + +// private: + + unsigned long NearestPrime (unsigned long n); + unsigned long hashFunction (const unsigned char *aWord, unsigned int len, unsigned long sizeHash); + +// public: + + t_hash initialize_hash (unsigned long sizeVoc); + + void insertElement (t_hash h, const unsigned char *aWord, register unsigned long len, + register unsigned long *addr); + unsigned long search (t_hash h, const unsigned char *aWord, register unsigned len, + unsigned long *returnedAddr); + unsigned long inHashTable (t_hash h, const unsigned char *aWord, register unsigned len, + unsigned long *returnedAddr); + void destroy_hash (t_hash hash); + + int strcomp(const unsigned char *s1, const unsigned char *s2, register unsigned long ws1, unsigned long ws2); + + int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2); + + +#endif diff --git a/swcsa/utils/huff.c b/swcsa/utils/huff.c new file mode 100755 index 0000000..b45a484 --- /dev/null +++ b/swcsa/utils/huff.c @@ -0,0 +1,396 @@ + +// implements canonical Huffman + +#include "huff.h" + +typedef struct + { uint freq; + uint symb; + union + { int prev; + int depth; + } h; + int ch1,ch2; + } Ttree; + +static void sort (Ttree *tree, int lo, int up) + + { int i, j; + Ttree temp; + while (up>lo) + { i = lo; + j = up; + temp = tree[lo]; + while (i temp.freq) j--; + tree[i] = tree[j]; + while (i0) + { tree[j].freq = freq[i]; + tree[j].symb = i; + j++; + //fprintf(stderr,"\n freq[%d] = [%d], para s�mbolo %d, now j=%ld",i,freq[i],tree[j-1].symb,j); + } + } + + /*** BEG FARI... para solucionar caso en que s�lo hubiese un �nico s�mbolo a codificar ****/ + if (onlyOneSymbol){ + tree[j].freq = 0; + tree[j++].symb = 1; //<--- !!!!!!!!! (incrementa j !!!) + + //H.total -=1 * 1; //we will add 1*1 at the end of this function. + + } + //fprintf(stderr,"\n"); + + /*** END FARI... para solucionar caso en que s�lo hubiese un �nico s�mbolo a codificar ****/ + + H.lim = lim = j-1; + + + // now run Huffman algorithm + if (!sorted) sort (tree,0,lim); + + for (i=0;i<=lim;i++) { + //fprintf(stderr,"\n XX freq[%d] = [%d], para symbolo %d",i,tree[i].freq,tree[i].symb); + tree[i].h.prev = i+1; + tree[i].ch1 = tree[i].ch2 = -1; + } + tree[lim].h.prev = -1; + // last = next node to process, ptr = search point, fre = next free cell + // leaves are in 0..lim in decreasing freq order + // internal nodes are in lim+1.. 2*lim, created in incr. fre order + last=0; ptr = 0; fre = lim+1; + for (i=0;i=0;i--) + { H.s.spos[tree[i].symb] = i; + while (tree[i].h.depth > d) + { H.num[d] = i+1; d++; } + } + H.num[d] = 0; + H.depth = d; + for (d=H.depth;d>0;d--) H.num[d] = H.num[d-1] - H.num[d]; + H.num[0] = (lim == 0); + H.num = (uint *) realloc(H.num,(H.depth+1)*sizeof(uint)); + //H.total = 0; + + + + + if (onlyOneSymbol){ H.total += freq[tree[0].symb] * tree[0].h.depth;} + else { + for (i=0;i<=lim;i++) { + H.total += freq[tree[i].symb] * tree[i].h.depth; + //fprintf(stderr,"\n ****tota[%d] = %d x %d =%d",i,freq[tree[i].symb],tree[i].h.depth,H.total); + } + } + free (tree); + + //fprintf(stderr,"\n **** CALL ENDS CREATE_HUFF WITH lim = %ld, freq[0]=%ld, Huffsize=%ld",lim,freq[0],H.total); + + return H; + } + +int encodeHuff (THuff H, uint symb, uint *stream, uint ptr) + + { uint pos; + uint code; + int d; + pos = H.s.spos[symb]; + code = 0; + d = H.depth; + while (pos >= H.num[d]) + { code = (code + H.num[d]) >> 1; + pos -= H.num[d--]; + } + code += pos; + if (d > W) { bitzero(stream,ptr,d-W); ptr += d-W; d = W; } + while (d--) + { if ((code >> d) & 1) bitset(stream,ptr); + else bitclean(stream,ptr); + ptr++; + } + return ptr; + } + +void printCodeHuff (THuff H, uint symb) + + { uint pos; + uint code; + int d; + pos = H.s.spos[symb]; + code = 0; + d = H.depth; + + // fprintf(stderr,"\n H.depth= %ld and pos is %ld\n",H.depth,pos); + while (pos >= H.num[d]) + { code = (code + H.num[d]) >> 1; + pos -= H.num[d--]; + } + code += pos; + if (d > W) {fprintf(stderr,"code larger than W"); d=W;} + + //show the code. + while (d--) + { if ((code >> d) & 1) + fprintf(stderr,"1"); + else fprintf(stderr,"0"); + } + } + + + +int decodeHuff (THuff *H, uint *symb, uint *stream, uint ptr) + + { uint pos; + int d; + pos = 0; + d = 0; + while (pos < H->fst[d]) + { pos = (pos << 1) | bitget(stream,ptr); + ptr++; d++; + } + *symb = H->s.symb[H->num[d]+pos-H->fst[d]]; + return ptr; + } + + + + +/* { uint pos; // This "improved" code is actually slower! + int d; + uint wrd,off; + stream += ptr/W; + off = ptr & (W-1); + wrd = *stream >> off; + pos = 0; + d = 0; + while (pos < H.fst[d]) + { pos = (pos << 1) | (wrd & 1); + d++; wrd >>= 1; off++; + if (off == W) { wrd = *++stream; off = 0; } + } + *symb = H.s.symb[H.num[d]+pos-H.fst[d]]; + return ptr+d; + } +*/ +void saveHuff (THuff H, FILE *f) + + { uint *symb = (uint *)malloc((H.lim+1)*sizeof(uint)); + int i; + for (i=0;i<=H.max;i++) + if (H.s.spos[i] != ~0) symb[H.s.spos[i]] = i; + fwrite (&H.max,sizeof(uint),1,f); + fwrite (&H.lim,sizeof(uint),1,f); + fwrite (&H.depth,sizeof(uint),1,f); + fwrite (symb,sizeof(uint),H.lim+1,f); + fwrite (H.num,sizeof(uint),H.depth+1,f); + free (symb); + } + +uint sizeHuff (THuff H) + + { return (4 +(H.lim+1)+2*(H.depth+1))*sizeof(uint); + } + +uint sizeHuffDisk (THuff H) + + { return ( sizeof(THuff) + ((H.lim+1)+(H.depth+1))*sizeof(uint) ); + } + +void freeHuff (THuff H) + + { free (H.s.spos); free (H.num); free (H.fst); + } + + +THuff loadHuff (FILE *f, int enc) //enc (0/1)-> do you only want to perform encoding ?? + + { THuff H; + uint *symb; + uint *num; + int i,d,dold,dact; + fread (&H.max,sizeof(uint),1,f); + fread (&H.lim,sizeof(uint),1,f); + fread (&H.depth,sizeof(uint),1,f); + symb = (uint *) malloc ((H.lim+1)*sizeof(uint)); + fread (symb,sizeof(uint),H.lim+1,f); + if (enc) + { H.s.spos = (uint *) malloc ((H.max+1)*sizeof(uint)); + for (i=0;i<=H.max;i++) H.s.spos[i] = ~0; + for (i=0;i<=H.lim;i++) H.s.spos[symb[i]] = i; + free (symb); + } + else H.s.symb = symb; + + H.num = (uint *) malloc ((H.depth+1)*sizeof(uint)); + fread (H.num,sizeof(uint),H.depth+1,f); + if (!enc) + { H.fst = (uint *) malloc ((H.depth+1)*sizeof(uint)); + H.fst[H.depth] = 0; dold = 0; + for (d=H.depth-1;d>=0;d--) + { dact = H.num[d+1]; + H.fst[d] = (H.fst[d+1]+dact) >> 1; + H.num[d+1] = dold; + dold += dact; + } + H.num[0] = dold; + } + return H; + } + + + + + +/***************************************************************/ +void prepareToDecode(THuff *H) +//***///**// //by fari !! + + { uint *symb = (uint *) malloc((H->lim+1)*sizeof(uint)); + uint *num; + int i,d,dold,dact; + + for (i=0;i<=H->max;i++) + if (H->s.spos[i] != ~0) + symb[H->s.spos[i]] = i; + + for (i=0;i<=H->lim;i++) + H->s.symb[i] = symb[i]; + + //H.num = malloc ((H.depth+1)*sizeof(uint)); + { + H->fst = (uint *)malloc ((H->depth+1)*sizeof(uint)); + H->fst[H->depth] = 0; dold = 0; + for (d=H->depth-1;d>=0;d--) + { dact = H->num[d+1]; + H->fst[d] = (H->fst[d+1]+dact) >> 1; + H->num[d+1] = dold; + dold += dact; + } + H->num[0] = dold; + } + free (symb); +} + + +//////////// +void saveHuffAfterDecode (THuff H, FILE *f) + { + fwrite (&H.max,sizeof(uint),1,f); + fwrite (&H.lim,sizeof(uint),1,f); + fwrite (&H.depth,sizeof(uint),1,f); + fwrite (H.s.symb,sizeof(uint),H.lim+1,f); + + fwrite (H.fst,sizeof(uint),H.depth+1,f); + fwrite (H.num,sizeof(uint),H.depth+1,f); + } + + +THuff loadHuffAfterDecode (FILE *f, int enc) //enc (0/1)-> do you only want to perform encoding ?? + + { THuff H; +// int i,d,dold,dact; +// +// fread (&H.max,sizeof(uint),1,f); +// fread (&H.lim,sizeof(uint),1,f); +// fread (&H.depth,sizeof(uint),1,f); +// +// H.s.symb = malloc ((H.lim+1)*sizeof(uint)); +// fread (H.s.symb,sizeof(uint),H.lim+1,f); +// +// H.fst = malloc ((H.depth+1)*sizeof(uint)); +// fread (H.fst,sizeof(uint),H.depth+1,f); +// +// H.num = malloc ((H.depth+1)*sizeof(uint)); +// fread (H.num,sizeof(uint),H.depth+1,f); +// + return H; + } + + + +void loadHuffAfterDecode2 (THuff *H, FILE *f, int enc) //enc (0/1)-> do you only want to perform encoding ?? + + { + int i,d,dold,dact; + + fread (&H->max,sizeof(uint),1,f); + fread (&H->lim,sizeof(uint),1,f); + fread (&H->depth,sizeof(uint),1,f); + + H->s.symb = (uint *) malloc ((H->lim+1)*sizeof(uint)); + fread (H->s.symb,sizeof(uint),H->lim+1,f); + + H->fst = (uint *) malloc ((H->depth+1)*sizeof(uint)); + fread (H->fst,sizeof(uint),H->depth+1,f); + + H->num = (uint *) malloc ((H->depth+1)*sizeof(uint)); + fread (H->num,sizeof(uint),H->depth+1,f); + } + + + + diff --git a/swcsa/utils/huff.h b/swcsa/utils/huff.h new file mode 100755 index 0000000..060570c --- /dev/null +++ b/swcsa/utils/huff.h @@ -0,0 +1,80 @@ + +// implements canonical Huffman + +#ifndef HUFFINCLUDED +#define HUFFINCLUDED + +#include "basics.h" +#define SORTED 1 +#define UNSORTED 0 + +typedef struct + { uint max,lim; // maximum symbol (0..max), same excluding zero freqs + uint depth; // max symbol length + union + { uint *spos; // symbol positions after sorting by decr freq (enc) + uint *symb; // symbols sorted by freq (dec) + } s; + uint *num; // first pos of each length (dec), number of each length (enc) + uint *fst; // first code (numeric) of each length (dec) + uint total; // total length to achieve, in bits + } THuff; + + // Creates Huffman encoder given symbols 0..lim with frequencies + // freq[i], ready for compression + // sorted --> are the symbols already sorted ? + +THuff createHuff (uint *freq, uint lim, uint sorted); + + // Encodes symb using H, over stream[ptr...lim] (ptr and lim are + // bit positions of stream). Returns the new ptr. + +int encodeHuff (THuff H, uint symb, uint *stream, uint ptr); + + // Decodes *symb using H, over stream[ptr...lim] (ptr and lim are + // bit positions of stream). Returns the new ptr. + +int decodeHuff (THuff *H, uint *symb, uint *stream, uint ptr); + + //Prepares a Huffman tree for decoding (changes in spos & symb) + +void prepareToDecode(THuff *H); + + // Writes H in file f + +void saveHuff (THuff H, FILE *f); + + // Size of H written on file + +uint sizeHuffDisk (THuff H); + + //Size of H in memory +uint sizeHuff (THuff H); + + // Frees H + +void freeHuff (THuff H); + + // Loads H from file f, prepared for encoding or decoding depending + // on enc + +THuff loadHuff (FILE *f, int enc); + +//Decodes a code starting in position ptr from stream. Returns the ranking in the +//vector of symbols. + +#define decodeNormalHuffMacro(H, symbol, stream, ptr) \ + { uint pos; \ + int d; \ + pos = 0; \ + d = 0; \ + while (pos < H->fst[d]) \ + { pos = (pos << 1) | bitget(stream,ptr); \ + ptr++; d++; \ + } \ + symbol = (H->s.symb[ H->num[d] + pos - H->fst[d] ]); \ + } + + +#endif + diff --git a/swcsa/utils/huffDec.c b/swcsa/utils/huffDec.c new file mode 100755 index 0000000..aa8e4cc --- /dev/null +++ b/swcsa/utils/huffDec.c @@ -0,0 +1,180 @@ + +// implements canonical Huffman + +#include "huffDec.h" + + +//THuff createHuff (uint *fst, uint *num, uint depth) { +// THuff H; +// H.fst = fst; +// H.num = num; +// H.depth = depth; +// return H; +//} + + +void printCodeHuffDec (THuffDec H, uint symb) + + { uint pos; + uint code; + int d; + //pos = H.s.spos[symb]; + pos = symb; + code = 0; + d = H.depth; + + // fprintf(stderr,"\n H.depth= %ld and pos is %ld\n",H.depth,pos); + while (pos >= H.num[d]) + { code = (code + H.num[d]) >> 1; + pos -= H.num[d--]; + } + code += pos; + if (d > W) {fprintf(stderr,"code larger than W"); d=W;} + + //show the code. + while (d--) + { if ((code >> d) & 1) + fprintf(stderr,"1"); + else fprintf(stderr,"0"); + } + } + + + +//Decodes a code starting in position ptr from stream. Returns the ranking in the +//vector of symbols. +int decodeHuffDec (THuffDec *H, uint *symb, uint *stream, uint ptr) + { uint pos; + int d; + pos = 0; + d = 0; + while (pos < H->fst[d]) + { pos = (pos << 1) | bitget(stream,ptr); + ptr++; d++; + } + *symb = H->num[d]+pos-H->fst[d]; + return ptr; + } + + + +// //Decodes a code starting in position ptr from stream. Returns the ranking in the +// //vector of symbols. +// int decodeHuffDecVariantWord (uint *fst , uint *symb, uint *stream, uint ptr, uint depth) +// { uint pos; +// int d; +// pos = 0; +// d = 0; +// while ((d< depth) && (pos < fst[d*2]) ) +// { pos = (pos << 1) | bitget(stream,ptr); +// ptr++; d++; +// } +// if (depth==d) +// *symb = pos; +// else +// *symb = +// pos - +// fst[d*2] + +// fst[d*2+1]; +// //(*symb) = (depth==d) ? pos : fst[d*2+1] + pos - fst[d*2]; +// +// return ptr; +// } + + + +// the bytes used by HuffDecman struct +uint sizeHuffDec (THuffDec H) + { return (1+ 2*(H.depth+1))*sizeof(uint); + } + + +void freeHuffDec (THuffDec H) + + { free (H.fst); free (H.num); + } + + +THuffDec loadHuffDecAfterDecode (FILE *f, int enc) //enc (0/1)-> do you only want to perform encoding ?? + + { THuffDec H; +// int i,d,dold,dact; +// +// fread (&H.max,sizeof(uint),1,f); +// fread (&H.lim,sizeof(uint),1,f); +// fread (&H.depth,sizeof(uint),1,f); +// +// H.s.symb = malloc ((H.lim+1)*sizeof(uint)); +// fread (H.s.symb,sizeof(uint),H.lim+1,f); +// +// H.fst = malloc ((H.depth+1)*sizeof(uint)); +// fread (H.fst,sizeof(uint),H.depth+1,f); +// +// H.num = malloc ((H.depth+1)*sizeof(uint)); +// fread (H.num,sizeof(uint),H.depth+1,f); +// + return H; + } + + +////THuffDec loadHuffDec (FILE *f, int enc) +//// +//// { THuffDec H; +//// uint *symb; +//// uint *num; +//// int i,d,dold,dact; +//// fread (&H.max,sizeof(uint),1,f); +//// fread (&H.lim,sizeof(uint),1,f); +//// fread (&H.depth,sizeof(uint),1,f); +//// symb = malloc ((H.lim+1)*sizeof(uint)); +//// fread (symb,sizeof(uint),H.lim+1,f); +//// if (enc) +//// { H.s.spos = malloc ((H.max+1)*sizeof(uint)); +//// for (i=0;i<=H.max;i++) H.s.spos[i] = ~0; +//// for (i=0;i<=H.lim;i++) H.s.spos[symb[i]] = i; +//// free (symb); +//// } +//// else H.s.symb = symb; +//// H.num = malloc ((H.depth+1)*sizeof(uint)); +//// fread (H.num,sizeof(uint),H.depth+1,f); +//// if (!enc) +//// { H.fst = malloc ((H.depth+1)*sizeof(uint)); +//// H.fst[H.depth] = 0; dold = 0; +//// for (d=H.depth-1;d>=0;d--) +//// { dact = H.num[d+1]; +//// H.fst[d] = (H.fst[d+1]+dact) >> 1; +//// H.num[d+1] = dold; +//// dold += dact; +//// } +//// H.num[0] = dold; +//// } +//// return H; +//// } + + +void loadHuffDecAfterDecode2 (THuffDec *H, FILE *f, int enc) //enc (0/1)-> do you only want to perform encoding ?? + + { + int i,d,dold,dact; + +// fread (&H->max,sizeof(uint),1,f); +// fread (&H->lim,sizeof(uint),1,f); + fread (&H->depth,sizeof(uint),1,f); + +// H->s.symb = malloc ((H->lim+1)*sizeof(uint)); +// fread (H->s.symb,sizeof(uint),H->lim+1,f); + + H->fst = (uint *) malloc ((H->depth+1)*sizeof(uint)); + fread (H->fst,sizeof(uint),H->depth+1,f); + + H->num = (uint *) malloc ((H->depth+1)*sizeof(uint)); + fread (H->num,sizeof(uint),H->depth+1,f); + } + + + + + + + + diff --git a/swcsa/utils/huffDec.h b/swcsa/utils/huffDec.h new file mode 100755 index 0000000..7cf1b40 --- /dev/null +++ b/swcsa/utils/huffDec.h @@ -0,0 +1,214 @@ + +// implements canonical Huffman !! Just for decoding when symbols were sorted before creating huffman + +#ifndef HUFFDECINCLUDED +#define HUFFDECINCLUDED + +#include "basics.h" +#define SORTED 1 +#define UNSORTED 0 + +typedef struct + { //uint lim; + uint depth; // max symbol length + uint *num; // first pos of each length (dec), number of each length (enc) + uint *fst; // first code (numeric) of each length (dec) + } THuffDec; + + +//typedef struct +// { uint max,lim; // maximum symbol (0..max), same excluding zero freqs +// uint depth; // max symbol length +// union +// { uint *spos; // symbol positions after sorting by decr freq (enc) +// uint *symb; // symbols sorted by freq (dec) +// } s; +// uint *num; // first pos of each length (dec), number of each length (enc) +// uint *fst; // first code (numeric) of each length (dec) +// uint total; // total length to achieve, in bits +// } THuff; + + + // Decodes *symb using H, over stream[ptr...lim] (ptr and lim are + // bit positions of stream). Returns the new ptr. +int decodeHuffDec (THuffDec *H, uint *symb, uint *stream, uint ptr); + + // Writes H in file f +void saveHuffDec (THuffDec H, FILE *f); + + // Frees H +void freeHuffDec (THuffDec H); + + // the number of bytes used by HuffDecman struct. +uint sizeHuffDec (THuffDec H); + + // Loads H from file f, prepared for encoding or decoding depending + // on enc + +THuffDec loadHuffDec (FILE *f, int enc); + + +//Decodes a code starting in position ptr from stream. Returns the ranking in the +//vector of symbols. +#define decodeHuffDecMacro(H, symb, stream, ptr) \ + { uint pos; \ + int d; \ + pos = 0; \ + d = 0; \ + while (pos < H->fst[d]) \ + { pos = (pos << 1) | bitget(stream,ptr); \ + ptr++; d++; \ + } \ + fflush(stdout); \ + symb = H->num[d]+pos-H->fst[d]; \ + } + +#endif + + + + + + + + + + + + + + + + +//#define decodeHuffDecMacroVALIDWORDS decodeHuffDecMacro + +//Decodes a code starting in position ptr from stream. Returns the ranking in the +//vector of symbols. +//Saves space, as the last level for num[] and fst[] are not needed. +// at expenses of an extra-IF (the last line). +#define decodeHuffDecMacroVALIDWORDS(H, symb, stream, ptr, depth) \ + { uint pos; \ + int d; \ + pos = 0; \ + d = 0; \ + while ((pos < H->fst[d]) && (d< depth)) \ + { pos = (pos << 1) | bitget(stream,ptr); \ + ptr++; d++; \ + } \ + fflush(stdout); \ + symb = (depth==d) ? pos : H->num[d]+pos-H->fst[d]; \ + } + + +//Decodes a code starting in position ptr from stream. Returns the ranking in the +//vector of symbols. +#define decodeHuffDecMacroVariantWordxx(fst,num, symb, stream, ptr, depth) \ + { uint pos; \ + int d; \ + pos = 0; \ + d = 0; \ + while ((pos < fst[d]) && (d< depth)) \ + { pos = (pos << 1) | bitget(stream,ptr); \ + ptr++; d++; \ + } \ + fflush(stdout); \ + symb = (depth==d) ? pos : num[d] + pos - fst[d]; \ + } + + + +//Decodes a code starting in position ptr from stream. Returns the ranking in the +//vector of symbols. +// #define decodeHuffDecMacroVariantWord(fst,symb, stream, ptr, depth) \ +// { uint pos; \ +// int d; \ +// pos = 0; \ +// d = 0; \ +// while ((d< depth) && (pos < fst[d*2]) ) \ +// { pos = (pos << 1) | bitget(stream,ptr); \ +// ptr++; d++; \ +// } \ +// symb = (depth==d) ? pos : fst[d*2+1] + pos - fst[d*2]; \ +// } + + +/** Decodes a variant of a word from a stream of compressed bits, starting in the ptr-th bit + The starting bucket in fstnum is "offbucket" [fst|num|fst|num|fst|num] + */ +#define decodeHuffDecMacroVariantWordPos(fstnum, offbucket, symb, stream, ptr, depth) \ + { uint pos; \ + register uint d; \ + pos = 0; \ + d = 0; \ + while ((d< depth) && (pos < fstnum[offbucket + d*2]) ) \ + { pos = (pos << 1) | bitget(stream,ptr); \ + ptr++; d++; \ + } \ + symb = (depth==d) ? pos : fstnum[offbucket+d*2+1] + pos - fstnum[offbucket+d*2]; \ + } +#define decodeHuffDecMacroVariantWordPos2(HV, symb, stream, ptr, idCanonical) \ + { uint pos; \ + register uint d; \ + uint *offfstnum = HV.offsetNumAndFst; \ + uint *fstnum = HV.zoneNumFst; \ + register uint offbucket = offfstnum[idCanonical]; \ + register uint depth = (offfstnum[idCanonical+1] - offbucket)/2; \ + pos = 0; \ + d = 0; \ + while ((d< depth) && (pos < fstnum[offbucket + d*2]) ) \ + { pos = (pos << 1) | bitget(stream,ptr); \ + ptr++; d++; \ + } \ + symb = (depth==d) ? pos : fstnum[offbucket+d*2+1] + pos - fstnum[offbucket+d*2]; \ + } + + + //fst[0] = zone[0 ]; + //fst[1] = zone[1*2]; + //fst[i] = zone[i*2]; + + //num[0] = zone[0 +1]; + //num[1] = zone[1*2+1]; + //num[i] = zone[i*2+1]; + + + +//#define decodeHuffDecMacroVariantWordPos2bits(HV, symb, stream, ptr, idCanonical) \ +// { uint pos; \ +// register uint d; \ +// uint sizeBuckbits= HV.sizeBuckbits; \ +// uint dirbElemSize = HV.dirbElemSize; \ +// uint sizeFstbits = HV.sizeFstbits; \ +// uint sizeNumbits = HV.sizeNumbits; \ +// uint *dirb = HV.Dirb; \ +// uint *zonefstnum = HV.zoneMem; \ +// register uint offbucket; \ +// offbucket = bitread (dirb, idCanonical*dirbElemSize, dirbElemSize); \ +// register uint depth; \ +// depth = bitread (dirb, (idCanonical+1)*dirbElemSize, dirbElemSize); \ +// depth = (depth - offbucket)/sizeBuckbits; \ +// pos = 0; \ +// d = 0; \ +// register uint currfst; \ +// currfst = bitread (zonefstnum, (offbucket + d*sizeBuckbits), sizeFstbits); \ +// while ((pos < currfst) ) \ +// { pos = (pos << 1) | bitget(stream,ptr); \ +// ptr++; d++; \ +// if (dtotalInts = ((bitsNeeded+W-1)/W); + V->data = (uint *) malloc((V->totalInts) * sizeof(uint)); + V->data[V->totalInts-1]=0000; /** avoids valgrind to blame*/ + V->size = size; + V->elemSize= elemSize; + printf("\nKbitVector[0,%d), of elemSize = %u initialized\n",V->size,V->elemSize); + printf("\ntotalInts = %u, size = %u elemSize = %u\n",V->totalInts,V->size,V->elemSize); + return V; +} + +uint getKBitArray(t_kBitArray V, register uint i) { + register uint eSize = V->elemSize; + register uint answ; + register uint pos = i * eSize; + mybitread(answ, V->data, pos, eSize); + return (answ); + //return ( bitread(V->data,i * eSize,eSize)); +} + +uint getKBitArraySinMacro(t_kBitArray V, register uint i) { + register uint eSize = V->elemSize; + return ( bitread(V->data,i * eSize,eSize)); +} + +void setKBitArray(t_kBitArray V, uint i, uint value){ + register uint eSize = V->elemSize; + register uint pos = i * eSize; + mybitwrite(V->data, pos, eSize, value); +} + +void setKBitArraySinMacro(t_kBitArray V, uint i, uint value){ + register uint eSize = V->elemSize; + bitwrite(V->data,i *eSize, eSize, value); +} + +/*----------------------------------------------------------------- + freeing resources + ---------------------------------------------------------------- */ +void destroy_kBitArray (t_kBitArray kBitArray) { + uint total; + total = (kBitArray->totalInts) * sizeof(uint); + + free(kBitArray->data); + free(kBitArray); + printf("\n[destroying a Kbit array of size table]...Freed %u bytes... RAM", total); +} + + diff --git a/swcsa/utils/kBitArray.h b/swcsa/utils/kBitArray.h new file mode 100755 index 0000000..e929b11 --- /dev/null +++ b/swcsa/utils/kBitArray.h @@ -0,0 +1,23 @@ +#include "basics.h" + +struct akbitArr { + uint *data; /* uint * contains space for the array of kbit elements */ + uint size; /* number of kbitElements */ + uint elemSize; /* number of bits of each element*/ + uint totalInts; /* number of ints used */ +}; + +typedef struct akbitArr *t_kBitArray; + + /*********/ + t_kBitArray create_kBitArray (uint size, uint elemSize); + uint getKBitArray(t_kBitArray V,uint i); + void setKBitArray(t_kBitArray V, uint i, uint value); + void destroy_kBitArray (t_kBitArray kBitArray); + + + #define getKBitArrayMacro mybitread //mybitread(answ, v, p, len) + + + + diff --git a/swcsa/utils/parameters.c b/swcsa/utils/parameters.c new file mode 100644 index 0000000..9d06475 --- /dev/null +++ b/swcsa/utils/parameters.c @@ -0,0 +1,47 @@ + +#include "parameters.h" + +/***********************************************************************************/ +/*** FUNCTIONS USED FOR PARSING PARAMETERS FROM COMMAND LINE ***********************/ +/* Three function to variables to manage parameters */ + bool is_delimeter(char *delimiters, char c) { + int i=0,len_delimiters=strlen(delimiters); + bool is=false; + for (i=0;i +#include +/***********************************************************************************/ +/*** FUNCTIONS USED FOR PARSING PARAMETERS FROM COMMAND LINE ***********************/ + +#ifndef PARAMETERS_INCLUDED +#define PARAMETERS_INCLUDED + bool is_delimeter(char *delimiters, char c) ; + void parse_parameters(char *options, int *num_parameters, char ***parameters, char *delimiters); + void free_parameters(int num_parameters,char ***parameters); +#endif diff --git a/swcsa/utils/valstring.c b/swcsa/utils/valstring.c new file mode 100755 index 0000000..d27c747 --- /dev/null +++ b/swcsa/utils/valstring.c @@ -0,0 +1,130 @@ +#include "valstring.h" +#include + +unsigned char _Valid[256]; +unsigned char _Invalid[256]; + +unsigned char _toLow[256]; + + +#ifndef ValidCh + + #define ValidCh(ch) (isalnum(ch)) /* Teste de validacao */ + #define InvalidCh(ch) (!ValidCh(ch)) + +#endif + + + + +void StartValid() { + + unsigned x; + + for(x=0;x<128;x++) { + if(ValidCh(x)) { + _Valid[x]=1; + _Invalid[x]=0; + } + else { + _Valid[x]=0; + _Invalid[x]=1; + } + } + for(x=128;x<256;x++) { + _Valid[x]=0; + _Invalid[x]=1; + } + + _Valid[0]=_Invalid[0]=0; + + // Caracteres especiales (acentuados, dieresis...) + // Caracter 'ñ' + _Valid[241]=1; + _Invalid[241]=0; + // Caracter 'Ñ' + _Valid[209]=1; + _Invalid[209]=0; + // Caracter 'á' + _Valid[225]=1; + _Invalid[225]=0; + // Caracter 'é' + _Valid[233]=1; + _Invalid[233]=0; + // Caracter 'í' + _Valid[237]=1; + _Invalid[237]=0; + // Caracter 'ó' + _Valid[243]=1; + _Invalid[243]=0; + // Caracter 'ú' + _Valid[250]=1; + _Invalid[250]=0; + // Caracter 'Á' + _Valid[193]=1; + _Invalid[193]=0; + // Caracter 'É' + _Valid[201]=1; + _Invalid[201]=0; + // Caracter 'Í' + _Valid[205]=1; + _Invalid[205]=0; + // Caracter 'Ó' + _Valid[211]=1; + _Invalid[211]=0; + // Caracter 'Ú' + _Valid[218]=1; + _Invalid[218]=0; + // Caracter 'ü' + _Valid[252]=1; + _Invalid[252]=0; + // Caracter 'Ü' + _Valid[220]=1; + _Invalid[220]=0; + +} + + + + +void StartToLow() { + int i; + unsigned char c; + for (i=0;i<256;i++) { + c=i; + if( (c >= 'A') && (c <= 'Z') ){ + _toLow[i]= c+ 'a'-'A'; + } + else if (!_Valid[c]) { + _toLow[i]=c; + } + else { + switch(c) { + case 192: case 193: case 194: case 195: case 196: case 197: case 224: case 225: + case 226: case 227: case 228: case 229: + c = 97; break; + + case 201: case 202: case 203: case 232: case 233: case 234: case 235: + c = 101; break; + + case 204: case 205: case 206: case 207: case 236: case 237: case 238: case 239: + c = 105; break; + + case 210: case 211: case 212: case 213: case 214: case 242: case 243: case 244: + case 245: case 246: + c = 111; break; + + case 217: case 218: case 219: case 220: case 249: case 250: case 251: case 252: + c = 117; break; + + case 209: c = 241; break; + + case 138: case 154: c = 115; break; + + case 159: case 221: case 253: case 255: + c = 121; break; + } + _toLow[i]=c; + } + } +} diff --git a/swcsa/utils/valstring.h b/swcsa/utils/valstring.h new file mode 100755 index 0000000..93a59a3 --- /dev/null +++ b/swcsa/utils/valstring.h @@ -0,0 +1,17 @@ +#include + +#ifndef ValidCh + + #define ValidCh(ch) (isalnum(ch)) /* Teste de validacao */ + #define InvalidCh(ch) (!ValidCh(ch)) + +#endif + + +extern unsigned char _Valid[256]; +extern unsigned char _Invalid[256]; + +extern unsigned char _toLow[256]; + +void StartValid(); +void StartToLow();