From: Kim Nguyễn Date: Wed, 29 Feb 2012 00:33:25 +0000 (+0100) Subject: Allow arbitrary SWCSA index size X-Git-Url: http://git.nguyen.vg/gitweb/?p=SXSI%2FTextCollection.git;a=commitdiff_plain;h=9abbea1aba3d81f1eccd84a92d1857e46a1b3ba2 Allow arbitrary SWCSA index size --- diff --git a/swcsa/buildFacade.c b/swcsa/buildFacade.c index b581059..4cd28fe 100755 --- a/swcsa/buildFacade.c +++ b/swcsa/buildFacade.c @@ -4,26 +4,26 @@ /** Building the index */ - /* Creates index from text[0..length-1]. Note that the index is an - opaque data type. Any build option must be passed in string - build_options, whose syntax depends on the index. The index must - always work with some default parameters if build_options is NULL. + /* Creates index from text[0..length-1]. Note that the index is an + opaque data type. Any build option must be passed in string + build_options, whose syntax depends on the index. The index must + always work with some default parameters if build_options is NULL. The returned index is ready to be queried. */ int build_index (uchar *text, ulong length, char *build_options, void **index) { int returnvalue; printf("\n parameters: \"%s\"\n",build_options); fflush(stderr); - + returnvalue = build_WCSA (text, length, build_options, index); - - if (!returnvalue) + + if (!returnvalue) returnvalue = build_iCSA (build_options,*index); - - return returnvalue; + + return returnvalue; } -/** Saves index on disk by using single or multiple files, having +/** Saves index on disk by using single or multiple files, having proper extensions. */ int save_index (void *index, char *filename) { @@ -35,7 +35,7 @@ int save_index (void *index, char *filename) { int file; char c; - printf("\n Saving structures to disk: %s.*",filename); + printf("\n Saving structures to disk: %s.*",filename); outfilename = (char *)malloc(sizeof(char)*(strlen(basename)+10)); /**File with some constants (bSize and tohSize); */ @@ -47,11 +47,11 @@ int save_index (void *index, char *filename) { if( (file = open(outfilename, O_WRONLY|O_CREAT,S_IRWXG | S_IRWXU)) < 0) { printf("Cannot open file %s\n", outfilename); exit(0); - } - write(file, &(wcsa->sourceTextSize), sizeof(uint)); - write(file, &(wcsa->seSize), sizeof(uint)); - close(file); - } + } + write(file, &(wcsa->sourceTextSize), sizeof(uint)); + write(file, &(wcsa->seSize), sizeof(uint)); + close(file); + } /** The Words in the vocabulary of words (sorted alphabetically)*/ { strcpy(outfilename, basename); @@ -68,26 +68,26 @@ int save_index (void *index, char *filename) { write(file, &n, sizeof(uint)); write(file, &elemSize, sizeof(uint)); write(file, &(wcsa->wordsData.wordsZoneMem.size), sizeof(uint)); - + //the number of canonical words write(file, (char *)wcsa->wordsData.wordsZoneMem.zone, wcsa->wordsData.wordsZoneMem.size * sizeof(byte)); write(file, (char *)wcsa->wordsData.words, ((((n+1)* (elemSize))+W-1) /W) * (sizeof(uint)) ); close(file); - } - + } + free(outfilename); if (wcsa->myicsa) { /******** saves index on integers (bottom) ******/ //Storing the CSA //storeStructsCSA(wcsa->myicsa,basename); - saveIntIndex((void *) wcsa->myicsa, basename); - } - + saveIntIndex((void *) wcsa->myicsa, basename); + } + if (wcsa->se) { saveSEfile(basename,wcsa->se, wcsa->seSize+1); - //free(wcsa->se); + //free(wcsa->se); } return 0; @@ -95,7 +95,7 @@ int save_index (void *index, char *filename) { - /** Loads index from one or more file(s) named filename, possibly + /** Loads index from one or more file(s) named filename, possibly adding the proper extensions. */ int load_index(char *filename, void **index){ twcsa *wcsa; @@ -109,9 +109,9 @@ int free_index(void *index){ twcsa *wcsa=(twcsa *) index; ulong size; index_size(index,&size); - printf("\n[destroying index] ...Freed %lu bytes... RAM", size); - - + printf("\n[destroying index] ...Freed %lu bytes... RAM", size); + + //frees the array SE. if (wcsa->se) free (wcsa->se); @@ -125,8 +125,8 @@ int free_index(void *index){ //the words. free (wcsa->wordsData.wordsZoneMem.zone); free (wcsa->wordsData.words); /** huge!! */ - - //the pointer to wcsa. + + //the pointer to wcsa. free(wcsa); return 0; } @@ -137,46 +137,46 @@ int index_size(void *index, ulong *size) { twcsa *wcsa=(twcsa *)index; uint n= wcsa->n; *size=0; - *size += sizeof(twcsa); + *size += sizeof(twcsa); totaltmp=0; //words totaltmp += ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)); //the pointers totaltmp += wcsa->wordsData.wordsZoneMem.size * sizeof(byte); //the characters of the words. *size += totaltmp; - + if (wcsa->myicsa) { uint nbytes; int err = sizeIntIndex((void *) wcsa->myicsa, &nbytes); *size += nbytes; - //*size += CSA_size(wcsa->myicsa); + //*size += CSA_size(wcsa->myicsa); } - return 0; + return 0; } /** Querying the index =============================================================*/ - - /* Writes in numocc the number of occurrences of the substring + + /* Writes in numocc the number of occurrences of the substring pattern[0..length-1] found in the text indexed by index. */ int count (void *index, uchar *pattern, ulong length, ulong *numocc){ uint integerPatterns[MAX_INTEGER_PATTERN_SIZE]; uint integerPatternSize; - ulong l,r; + ulong l,r; twcsa *wcsa=(twcsa *) index; - parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize); + parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize); if (!integerPatternSize) {*numocc=0; return 0;} //not found - + //*numocc = countCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &l, &r); int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, numocc, &l, &r); return 0; } - /* Writes in numocc the number of occurrences of the substring + /* Writes in numocc the number of occurrences of the substring pattern[0..length-1] in the text indexed by index. It also allocates - occ (which must be freed by the caller) and writes the locations of - the numocc occurrences in occ, in arbitrary order. */ + occ (which must be freed by the caller) and writes the locations of + the numocc occurrences in occ, in arbitrary order. */ int locate(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc){ return 99; } @@ -196,31 +196,31 @@ int length (void *index, ulong *length) { /** *********************************************************************************** - * Accessing the indexed text + * Accessing the indexed text * ***********************************************************************************/ - /** Allocates snippet (which must be freed by the caller) and writes - the substring text[from..to] into it. Returns in snippet_length the - length of the text snippet actually extracted (that could be less + /** Allocates snippet (which must be freed by the caller) and writes + the substring text[from..to] into it. Returns in snippet_length the + length of the text snippet actually extracted (that could be less than to-from+1 if to is larger than the text size). */ -int extract (void *index, ulong from, ulong to, uchar **snippet, ulong *snippet_length) { +int extract (void *index, ulong from, ulong to, uchar **snippet, ulong *snippet_length) { twcsa *wcsa=(twcsa *) index; - return 99; + return 99; } - /** Displays the text (snippet) surrounding any occurrence of the - substring pattern[0..length-1] within the text indexed by index. - The snippet must include numc characters before and after the - pattern occurrence, totalizing length+2*numc characters, or less if - the text boundaries are reached. Writes in numocc the number of - occurrences, and allocates the arrays snippet_text and - snippet_lengths (which must be freed by the caller). The first is a - character array of numocc*(length+2*numc) characters, with a new - snippet starting at every multiple of length+2*numc. The second + /** Displays the text (snippet) surrounding any occurrence of the + substring pattern[0..length-1] within the text indexed by index. + The snippet must include numc characters before and after the + pattern occurrence, totalizing length+2*numc characters, or less if + the text boundaries are reached. Writes in numocc the number of + occurrences, and allocates the arrays snippet_text and + snippet_lengths (which must be freed by the caller). The first is a + character array of numocc*(length+2*numc) characters, with a new + snippet starting at every multiple of length+2*numc. The second gives the real length of each of the numocc snippets. */ -int display (void *index, uchar *pattern, ulong length, ulong numc, +int display (void *index, uchar *pattern, ulong length, ulong numc, ulong *numocc, uchar **snippet_text, ulong **snippet_lengths) { return 99; } @@ -229,74 +229,74 @@ int display (void *index, uchar *pattern, ulong length, ulong numc, /** *********************************************************************************** * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord - * ***********************************************************************************/ - /* Writes in numocc the number of occurrences of the substring + * ***********************************************************************************/ + /* Writes in numocc the number of occurrences of the substring pattern[0..length-1] in the text indexed by index. It also allocates - occ (which must be freed by the caller) and writes the locations of + occ (which must be freed by the caller) and writes the locations of the numocc occurrences in occ, in arbitrary order. These occurrences refer to the offsets in TOH where the caller could start a display operation. So locateWord implies synchronization using B. Moreover, positions occ[numocc.. 2*numocc-1] is set with the rank in SE of the words whose codes begin in TOH in the positions in occ[0... numocc-1] ** Parameter kbefore sets locateWord not to obtain the offset in TOH of the - searched word, but the offset in TOH of k-before words before. - */ - + searched word, but the offset in TOH of k-before words before. + */ + int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore){ uint integerPatterns[MAX_INTEGER_PATTERN_SIZE]; uint integerPatternSize; - ulong occurrences,l,r; + ulong occurrences,l,r; twcsa *wcsa=(twcsa *) index; - - parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize); - if (!integerPatternSize) {*numocc=0; return 0;} //not found + + parseTextIntoIntegers(wcsa, pattern, length, integerPatterns, &integerPatternSize); + if (!integerPatternSize) {*numocc=0; return 0;} //not found ulong *seOffsets; - + //obtains the indexes in vector SE where the pattern appears. //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences); int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences); - + *numocc = occurrences; - + if (!occurrences) {(*occ)=NULL;return 0;} (*occ) = (ulong *)seOffsets; return 0; -} - - - /** Displays the text (snippet) surrounding any occurrence of the - substring pattern[0..length-1] within the text indexed by index. - The snippet must include numc characters before and after the - pattern occurrence, totalizing length+2*numc characters, or less if - the text boundaries are reached. Writes in numocc the number of - occurrences, and allocates the arrays snippet_text and - snippet_lengths (which must be freed by the caller). The first is a - character array of numocc*(length+2*numc) characters, with a new - snippet starting at every multiple of length+2*numc. The second +} + + + /** Displays the text (snippet) surrounding any occurrence of the + substring pattern[0..length-1] within the text indexed by index. + The snippet must include numc characters before and after the + pattern occurrence, totalizing length+2*numc characters, or less if + the text boundaries are reached. Writes in numocc the number of + occurrences, and allocates the arrays snippet_text and + snippet_lengths (which must be freed by the caller). The first is a + character array of numocc*(length+2*numc) characters, with a new + snippet starting at every multiple of length+2*numc. The second gives the real length of each of the numocc snippets. */ - int displayWords (void *index, uchar *pattern, ulong length, ulong numc, + int displayWords (void *index, uchar *pattern, ulong length, ulong numc, ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore) { - + /** actually extracts upto length + 2*numc chars, starting extraction kbefore - * words before the occurrence **/ + * words before the occurrence **/ ulong *indexesInSE; ulong occurrences; uint bytesPerSnippet; byte *text_aux; twcsa *wcsa=(twcsa *) index; - - locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0); - (*numocc) = occurrences; + + locateWord(index, pattern, length, (ulong **)&indexesInSE, &occurrences, 0); + (*numocc) = occurrences; if (!occurrences) { *snippet_text =NULL; *snippet_lengths =NULL; return 0; - } + } bytesPerSnippet = length+2*numc; // bytesPerSnippet = 2*numc; @@ -307,90 +307,90 @@ int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *nu // fprintf(stderr,"\n occs found = %7d for pattern %s",*numocc, pattern); // fflush(stderr); - - text_aux=*snippet_text; + + text_aux=*snippet_text; { uint i, j, tmplen; uint ptr, maxptr; byte *src, *dst; uint snippetLen; uint posSEValue,indexSE; - + for (i=0;i kbefore) ? indexSE-kbefore : 0; + indexSE = (indexSE > kbefore) ? indexSE-kbefore : 0; - dst = text_aux; + dst = text_aux; while ((!endSnippet) && (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */ - //posSEValue =displayCSA(wcsa->myicsa,indexSE); + //posSEValue =displayCSA(wcsa->myicsa,indexSE); int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue); - + {//obtains pointer to the ith word uint offtmp; - uint ith = posSEValue -1; // !! + uint ith = posSEValue -1; // !! tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); offtmp = bitread (wcsa->wordsData.words, ( ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); tmplen -=offtmp; //the lenght of the ith word. src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp; - } + } if (_Valid[*src]) { if (prevValid){ - *dst++ =' '; + *dst++ =' '; snippetLen++; if (snippetLen==bytesPerSnippet) break; //end of snippet (ends in BLANK_SPACE) } prevValid =1; //for the next iteration } - else prevValid=0; - + else prevValid=0; + indexSE++; - + /* at the end ?? */ if ((tmplen+snippetLen)>=bytesPerSnippet) { tmplen =(bytesPerSnippet - snippetLen); endSnippet=1; //so while loop ends; - } - + } + for (j=0;j wordsbefore) ? indexSE-wordsbefore : 0; + indexSE = (indexSE > wordsbefore) ? indexSE-wordsbefore : 0; - dst = text_aux; - z=0; + dst = text_aux; + z=0; while ((zseSize) ){ /** extracting words (if not at the end) */ - //posSEValue =displayCSA(wcsa->myicsa,indexSE); - int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue); - + //posSEValue =displayCSA(wcsa->myicsa,indexSE); + int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue); + {//obtains pointer to the ith word uint offtmp; - uint ith = posSEValue -1; // !! + uint ith = posSEValue -1; // !! tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); offtmp = bitread (wcsa->wordsData.words, ( ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); tmplen -=offtmp; //the lenght of the ith word. src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp; - } - + } + if (_Valid[*src]) { if (prevValid){ - *dst++ =' '; + *dst++ =' '; snippetLen++; if (snippetLen==maxsnippetLen) break; //end of snippet (ends in BLANK_SPACE) } prevValid =1; //for the next iteration } - else prevValid=0; - + else prevValid=0; + indexSE++; - + /* at the end ?? */ if ((tmplen+snippetLen)>=maxsnippetLen) { break; - } - - //fprintf(stderr,"\ntmplen = %d ",tmplen); fflush(stderr); + } + + //fprintf(stderr,"\ntmplen = %d ",tmplen); fflush(stderr); for (j=0;j wcsa->seSize) toWord = wcsa->seSize; if (fromWord >= wcsa->seSize) fromWord = wcsa->seSize-1; if (buffBytes < ( (toWord-fromWord)* avgWordLen)) buffBytes = ((toWord-fromWord)* avgWordLen); - + buff = (uchar *) malloc (buffBytes * sizeof(char)); if (!buff) return 1; //out of memory. dst = buff; - + register uint indexSE=fromWord; uint posSEValue=0; register uint ith; - + while ( (indexSE < toWord) ){ /** extracting words (if not at the end) */ - int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue); - + int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue); + {//obtains pointer to the ith word uint offtmp; - ith= posSEValue -1; // !! + ith= posSEValue -1; // !! tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); offtmp = bitread (wcsa->wordsData.words, (ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); tmplen -=offtmp; //the lenght of the ith word. src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp; - } + } if ( buffBytes < (leng + tmplen+1) ) { buffBytes *=2; @@ -523,15 +523,15 @@ int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text, if (_Valid[*src]) { if (prevValid){ - *dst++ =' '; + *dst++ =' '; leng += 1; } prevValid =1; //for the next iteration } - else prevValid=0; - + else prevValid=0; + indexSE++; - + for (j=0;jword, (char *)a2->word); + return strcmp((char*)a1->word, (char *)a2->word); } /** @@ -562,15 +562,15 @@ int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text, */ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) { - + unsigned long zeroNode; //number of different canonical words. - - t_hash hash; // the hash table to store both variants and canonical words. - tposInHT *posInHT; // structure for canonicals and variants+huffmans - uint sourceTextSize; - - uint seSize=0; //it's size == "numberOfValidWords". + t_hash hash; // the hash table to store both variants and canonical words. + tposInHT *posInHT; // structure for canonicals and variants+huffmans + + uint sourceTextSize; + + uint seSize=0; //it's size == "numberOfValidWords". uint *SE; //Integers vector. (represents the rank of the valid words in the source text). uint totallenWords=0; //The numberOfBytes that occupy canonical words (their ascii version) in memory @@ -578,16 +578,16 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) { ulong bytesFile,bytesFileReal; long sizeNValue; + unsigned long size_posInHT; + /* used during first pass */ - /* used during first pass */ - ulong addrInTH; - - byte* inputBuffer = text; + + byte* inputBuffer = text; bytesFileReal= bytesFile = length; sourceTextSize=length; - + /** Initializes WCSA structure*/ twcsa *wcsa; wcsa = (twcsa *) malloc (sizeof (twcsa) * 1); @@ -597,34 +597,35 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) { //Stimation (Using Heap's law) of the number of different "meaningful" words. //sizeNValue=N_value; if(bytesFile<5000000) bytesFile = 5000000; - sizeNValue = (unsigned long) floor(3.9* pow(bytesFile,0.60) ); - + + sizeNValue = (unsigned long) floor(3.9* pow(bytesFile,0.70) ); // Inicializes the arrays used to detect if a char is valid or not. StartValid(); // Inicializes the arrays used translated a char into lowercase. StartToLow(); - - + + // ********************************************************************************** - //STARTING THE FIRST PASS. + //STARTING THE FIRST PASS. // ********************************************************************************** printf("\nSTARTING THE FIRST PASS..."); - - posInHT = (tposInHT *) malloc(sizeof(tposInHT) * sizeNValue); + + posInHT = (tposInHT *) malloc(sizeof(tposInHT) * sizeNValue); + size_posInHT = sizeNValue; hash = initialize_hash (sizeNValue); //hash to cointain both the parsed words - - //----------------------------------------------------------------- + + //----------------------------------------------------------------- //1st pass (processing the file) - { + { byte *pbeg,*pend,*wordstart,*aWord; register ulong size; register uint i; - + pbeg = inputBuffer; pend = inputBuffer+bytesFileReal; - - while (pbeg = size_posInHT){ + size_posInHT *= 2; + posInHT = (tposInHT*) realloc(posInHT, size_posInHT * sizeof(tposInHT)); + } posInHT[zeroNode].slot=addrInTH; posInHT[zeroNode].word=hash->hash[addrInTH].word; hash->hash[addrInTH].posInVoc = zeroNode; zeroNode++; - totallenWords += size +1; // +1 due to the '\0' char... + totallenWords += size +1; // +1 due to the '\0' char... //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size); - } + } seSize ++; - + }//while pbeghash[posInHT[i].slot].posInVoc = i; + for (i=0;ihash[posInHT[i].slot].posInVoc = i; } - } - + } + // INITIALIZING structures for the 2nd pass ...................................... { SE = (uint *) malloc ((seSize+1)*sizeof (uint)); @@ -718,7 +723,7 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) { printf("\nSTARTING THE SECOND PASS... "); //2nd pass (processing the file) - { + { byte *pbeg,*pend,*wordstart,*aWord; register ulong size; register uint i; @@ -727,22 +732,22 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) { pbeg = inputBuffer; pend = inputBuffer+bytesFileReal; - - while (pbeg hash[addrInTH].posInVoc+1; // !!!! - countValidWords++; - + countValidWords++; + }// while pbegn = zeroNode; wcsa->sourceTextSize = sourceTextSize; wcsa->seSize = seSize; - + // Creating the words of the vocabulary... - { + { /** copying the words into WCSA. */ uint *tmpOffsets = (uint *) malloc (sizeof(uint) * (zeroNode +1) ); //1 extra uint (to point to the virtual "zeroNode+1" ^th word. uint tmpOffset =0; - + byte *zoneMem,*src; uint i; - + //Moving data from posInHT to WCSA structure - //wcsa->wordsData = (twords *) malloc(sizeof(twords) * zeroNode); + //wcsa->wordsData = (twords *) malloc(sizeof(twords) * zeroNode); wcsa->wordsData.wordsZoneMem.size = totallenWords - zeroNode; //without '\0' bytes (end-tag). - wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc ( wcsa->wordsData.wordsZoneMem.size * sizeof(byte) ); - zoneMem = wcsa->wordsData.wordsZoneMem.zone; + wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc ( wcsa->wordsData.wordsZoneMem.size * sizeof(byte) ); + zoneMem = wcsa->wordsData.wordsZoneMem.zone; for(i = 0; i < zeroNode; i++) { src = posInHT[i].word; //copying the canonical word //wcsa->wordsData.words[i].word = zoneMem; //setting the pointer tmpOffsets[i]=tmpOffset; //offset in zoneMem while (*src) {*zoneMem++ = *src++; tmpOffset++;} //moving data until '\0' //*zoneMem='\0'; zoneMem++; //copies also the '\0' - + } tmpOffsets[zeroNode]=tmpOffset; //setting pointer to the "virtual" word {zeroNode+1}^{th} - + //kbit encoding of the offsets uint elemSize = _bits(tmpOffset); wcsa->wordsData.elemSize = elemSize; @@ -832,15 +837,15 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) { tmpOffset=0; for (i=0; i<=zeroNode; i++) { //setting "zeroNode+1" offsets bitwrite(wcsa->wordsData.words, tmpOffset, elemSize, tmpOffsets[i]); - tmpOffset+=elemSize; + tmpOffset+=elemSize; } - + //////////// CHECKS IT WORKED. old !!!! // { uint kk; // tmpOffset=0; // for (i=0; iwordsData.words, i* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); - // tmpOffset+=elemSize; + // tmpOffset+=elemSize; // if (kk != tmpOffsets[i]) {fprintf(stderr,"\n @@@@@@@@ DISTINTOS OFFSETS "); break;} // else fprintf(stderr,"\n iguales, %d, %d :: <<%s>>len=%d",kk,i, posInHT[i].word, strlen((char*)posInHT[i].word)); // } @@ -849,42 +854,42 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) { // { uint len1, len, tmplen, len2; // uint i,p; // byte *wcsaWord, *src; - // + // // for (p=0;pwordsData.words, (wcsa->wordsData.elemSize * (p+1)), wcsa->wordsData.elemSize); // tmplen = bitread (wcsa->wordsData.words, (wcsa->wordsData.elemSize * (p)) , wcsa->wordsData.elemSize); - // + // // //fprintf(stderr,"\n :: off[%d]= %d - off [%d] = %d ==> %d",p+1,len,p,tmplen,len-tmplen); - // + // // len2 =len-tmplen; // wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen; - // } - // - // src = posInHT[p].word; + // } + // + // src = posInHT[p].word; // len1 = strlen((char *)src); - // + // // if (strcompL(src,wcsaWord,len1,len2) != 0) { - // fprintf(stderr,"\n %6d DISTINTOS !! ===len1 %d,len %d===== <<",p,len1,len2);printWord(src,len1); + // fprintf(stderr,"\n %6d DISTINTOS !! ===len1 %d,len %d===== <<",p,len1,len2);printWord(src,len1); // fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>"); // exit(0); // } // else { - // fprintf(stderr,"\n %6d ======len1 %d,len2 %d===== <<",p,len1,len2);printWord(src,len1); + // fprintf(stderr,"\n %6d ======len1 %d,len2 %d===== <<",p,len1,len2);printWord(src,len1); // fprintf(stderr,">> <<"); printWord(wcsaWord,len2); fprintf(stderr,">>"); - // } + // } // } - // + // // } - // - + // + /**-----------*/ //frees memory from hash table and posInHT structures. free(tmpOffsets); destroy_hash(hash); free(posInHT); } - + /** ******* creates the self-index on ints (bottom layer) ==> see build_icsa *********/ /** #ifdef CSA_ON @@ -895,10 +900,10 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) { myicsa = createIntegerCSA(&SE,seSize+1,build_options); wcsa->myicsa= myicsa; total = CSA_size(myicsa); - + free(SE); //SE is no longer needed, (it is indexed by the iCSA) printf("\n\t**** [iCSA built on %d words. Size = %ld bytes... RAM",seSize,total); - } + } #endif */ @@ -906,8 +911,8 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) { wcsa->se = SE; wcsa->myicsa = NULL; //#endif - - printf("\n\t ** Building done! **\n"); + + printf("\n\t ** Building done! **\n"); printf("\n Process finished!\n"); *index = wcsa; @@ -924,25 +929,25 @@ int build_iCSA (char *build_options, void *index) uint total; fprintf(stderr,"\n **** CREATING CSA-bottom-layer *****"); void *bottomIntIndex; - int err = buildIntIndex(wcsa->se,wcsa->seSize+1, build_options,(void **)&bottomIntIndex); + int err = buildIntIndex(wcsa->se,wcsa->seSize+1, build_options,(void **)&bottomIntIndex); wcsa->myicsa = bottomIntIndex; //total = CSA_size(wcsa->myicsa); err = sizeIntIndex((void *) wcsa->myicsa, &total); printf("\n\t**** [iCSA built on %d words. Size = %u bytes... RAM",wcsa->seSize,total); - } + } return 0; } /** ******************************************************************** * Loading from disk - **********************************************************************/ + **********************************************************************/ -/**----------------------------------------------------------------- - * LoadWCSA. - * Loads all the data structures of WCSA (included the icsa) - ----------------------------------------------------------------- */ +/**----------------------------------------------------------------- + * LoadWCSA. + * Loads all the data structures of WCSA (included the icsa) + ----------------------------------------------------------------- */ twcsa *loadWCSA(char *filename) { twcsa *wcsa; @@ -950,13 +955,13 @@ twcsa *loadWCSA(char *filename) { StartValid(); // Inicializes the arrays used translated a char into lowercase. StartToLow(); - + wcsa = (twcsa *) malloc (sizeof (twcsa) * 1); wcsa->n=0; int err = loadIntIndex(filename, (void **)&wcsa->myicsa); - loadStructs(wcsa,filename); + loadStructs(wcsa,filename); return wcsa; } @@ -964,7 +969,7 @@ twcsa *loadWCSA(char *filename) { /** ------------------------------------------------------------------ * LoadStructs. * Reads files and loads all the data needed for searcherFacade - ----------------------------------------------------------------- */ + ----------------------------------------------------------------- */ void loadStructs(twcsa *wcsa, char *basename) { uint i,j; char *filename; @@ -972,28 +977,28 @@ twcsa *loadWCSA(char *filename) { uint sizeFile; char c; uint n; - + filename = (char *)malloc(sizeof(char)*(strlen(basename)+10)); fprintf(stderr,"Loading Index from file %s.*\n", basename); - + //** SOME CONSTANTS: sourceTextSize { strcpy(filename, basename); strcat(filename, "."); strcat(filename, CONSTANTS_FILE_EXT); - + if( (file = open(filename, O_RDONLY)) < 0) { printf("Cannot open file %s\n", filename); exit(0); } - - read(file, &(wcsa->sourceTextSize), sizeof(uint)); - read(file, &(wcsa->seSize), sizeof(uint)); + + read(file, &(wcsa->sourceTextSize), sizeof(uint)); + read(file, &(wcsa->seSize), sizeof(uint)); close(file); - } - + } + /** File with the words from the vocabulary (sorted alphabetically) */ { byte *zoneMem; - + strcpy(filename, basename); strcat(filename, "."); strcat(filename, VOCABULARY_WORDS_FILE_EXT); @@ -1003,26 +1008,26 @@ twcsa *loadWCSA(char *filename) { printf("Cannot open file %s\n", filename); exit(0); } - + //the number of canonical words read(file, &n, sizeof(uint)); wcsa->n = n; read(file, &(wcsa->wordsData.elemSize), (sizeof(uint))); read(file, &(wcsa->wordsData.wordsZoneMem.size), (sizeof(uint))); - - //allocating the memory needed for all words and reading them //(ascii) << no \0 chars are needed>>. - wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc(wcsa->wordsData.wordsZoneMem.size * sizeof(byte)); + + //allocating the memory needed for all words and reading them //(ascii) << no \0 chars are needed>>. + wcsa->wordsData.wordsZoneMem.zone = (byte *) malloc(wcsa->wordsData.wordsZoneMem.size * sizeof(byte)); read(file, (wcsa->wordsData.wordsZoneMem.zone), wcsa->wordsData.wordsZoneMem.size * sizeof(byte) ); - + //reading the offsets of the words (kbitArray that points to offsets in zoneMem of words. - wcsa->wordsData.words = (uint *) malloc (((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * sizeof(uint)); + wcsa->wordsData.words = (uint *) malloc (((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * sizeof(uint)); wcsa->wordsData.words[ ((((n+1)*(wcsa->wordsData.elemSize))+W-1) /W) -1 ] =0000; read(file, (wcsa->wordsData.words), ((((n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint))); - + close(file); - } - wcsa->se= NULL; + } + wcsa->se= NULL; free(filename); } @@ -1040,26 +1045,26 @@ twcsa *loadWCSA(char *filename) { /*------------------------------------------------------------------ * Given a text pattern translates it into a list of integers (corresponding to the - * canonical words associated to the valid words in the text pattern) + * canonical words associated to the valid words in the text pattern) ------------------------------------------------------------------*/ void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *integerPattern, uint *sizeIntegers) { - + byte *pbeg,*pend,*wordstart,*aWord; register unsigned long size; - uint index =0; - - pbeg = textPattern; + uint index =0; + + pbeg = textPattern; pend = pbeg + patLen; - - while (pbeg n) - 1; while(min < max) { p = (min+max)/2; - + {//preparing for strcompL len = bitread (wcsa->wordsData.words, (p+1)* wcsa->wordsData.elemSize , wcsa->wordsData.elemSize); tmplen = bitread (wcsa->wordsData.words, (p )* wcsa->wordsData.elemSize , wcsa->wordsData.elemSize); len -=tmplen; wcsaWord= (byte *) wcsa->wordsData.wordsZoneMem.zone + tmplen; } - + //if(strncmp((char*)aWord, (char*)wcsa->wordsData[p].word,size) > 0) min = p+1; if(strcompL(aWord, wcsaWord, size, len) > 0) min = p+1; else max = p; @@ -1107,9 +1112,9 @@ void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *in // fprintf(stderr,"\n Patron = <<%s>>, curposWord= %d ==><<",aWord,p); // printWord(wcsaWord,len); fprintf(stderr,">> len =%d",len); // } - + } - + {//preparing for strcompL len = bitread (wcsa->wordsData.words, (min+1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); tmplen = bitread (wcsa->wordsData.words, ( min )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); @@ -1122,19 +1127,19 @@ void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *in integerPattern[index++] = min +1 ; //<-- } else {*sizeIntegers = 0; return;} // a valid word that does not appear in the source text. - - } + + } }// end while - *sizeIntegers = index; - + *sizeIntegers = index; + // //shows the parsed words: // {uint i; // printf("\n\n >>%s>> HA SIDO PARSEADO COMO:",textPattern); // for (i=0; i>",wcsa->wordsData[integerPattern[i] -1].word); // } - // - // } + // + // } } @@ -1148,66 +1153,66 @@ int countTextOcurrences(twcsa *wcsa, byte *textPattern) { ulong left, right; uint integerPatterns[MAX_INTEGER_PATTERN_SIZE]; uint integerPatternSize, min, max; - + uint lenpat = strlen((char*)textPattern); - parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize); + parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize); if (!integerPatternSize) return -1; - -// #ifdef DEBUG_ON + +// #ifdef DEBUG_ON // uint i; // printf("\n %d Integers to search for:",integerPatternSize ); // for (i=0;iwordsData[integerPatterns[i]-1].word); -// } +// printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word); +// } // printf("\n"); // #endif ulong numocc; int err = countIntIndex((void *) wcsa->myicsa, integerPatterns, integerPatternSize, &numocc, &left, &right); - return numocc; + return numocc; } /** ------------------------------------------------------------------ - * locateTextOcurrences: + * locateTextOcurrences: * Returns the offsets of the source text where a word/phrase appears * Returns also the number of occurrences. *------------------------------------------------------------------ */ uint *locateTextOcurrences(twcsa *wcsa, byte *textPattern, int *numberOccurrences) { uint integerPatterns[MAX_INTEGER_PATTERN_SIZE]; uint integerPatternSize, min, max; - + uint lenpat = strlen((char*)textPattern); - parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize); + parseTextIntoIntegers(wcsa, textPattern, lenpat, integerPatterns, &integerPatternSize); if (!integerPatternSize) {*numberOccurrences = -1; return NULL;} - -// #ifdef DEBUG_ON + +// #ifdef DEBUG_ON // uint i; // printf("\n %d Integers to search for:",integerPatternSize ); // for (i=0;iwordsData[integerPatterns[i]-1].word); -// } +// printf(" \n [%u] from word [%s]",integerPatterns[i], wcsa->wordsData[integerPatterns[i]-1].word); +// } // printf("\n"); // #endif - + ulong occurrences, left, right; ulong *seOffsets; ulong *sourceOffsets; - + //obtains the indexes in vector SE where the pattern appears. //seOffsets = locateCSA(wcsa->myicsa, integerPatterns, integerPatternSize, &occurrences); int err = locateIntIndex((void *)wcsa->myicsa, integerPatterns, integerPatternSize, &seOffsets, &occurrences); - + //sourceOffsets = (uint *) malloc (sizeof(uint)*occurrences); - + sourceOffsets=seOffsets; //obtains the offsets in the source text of the pattern (sourceOffsets) locateFacade(wcsa, (uint *)sourceOffsets, (uint *)seOffsets,occurrences); - #ifdef DEBUG_ON + #ifdef DEBUG_ON fprintf(stderr,"\n*** %s appears in the source text in positions:\n\t",textPattern); - for (i=0;i=0; ------------------------------------------------------------------*/ int displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length, byte *dstptr) { - return 99; //not implemented: function not available for this index + return 99; //not implemented: function not available for this index } @@ -1255,15 +1260,15 @@ int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint ------------------------------------------------------------------*/ byte *displayFacadeMalloc (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length) { byte *dstptr=NULL; //not implemented: function not available - return dstptr; + return dstptr; } - /** ------------------------------------------------------------------ + /** ------------------------------------------------------------------ * LOCATEALLandDISPLAY: * Displays the text around an occurrence of the searched word in the source text. * Assuming that $p$ is that position --> shows only chars in [p_radix-1,p_radix] - ------------------------------------------------------------------*/ + ------------------------------------------------------------------*/ int locateAllAndDisplay (twcsa *wcsa, uint *sePositions, uint number, int radix) { return 99; //not implemented: function not available for this index @@ -1273,13 +1278,13 @@ return 99; //not implemented: function not available for this index /** ------------------------------------------------------------------ * recovers the source text by calling display(0,fileSize); * ------------------------------------------------------------------ */ -void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) { +void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize) { int start;int end; byte *cc; char *filename = (char *) malloc (strlen( basename)+ strlen(ext)+1); ulong length; - + strcpy( filename, basename); strcat( filename, ext); filename[strlen( basename)+ strlen(ext)]='\0'; @@ -1289,45 +1294,45 @@ void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextS unlink( filename); salida = fopen( filename,"w"); start=0; end = sourceTextSize-1; - + cc = (byte *) malloc (sourceTextSize* sizeof(uchar)); { uint i, j;//, tmplen; - uint prevValid; + uint prevValid = 0; //uint ptr, maxptr; byte *src, *dst; uint leng =0; uint tmplen =0; - + uint indexSE=0; uint posSEValue=0; - + dst=cc; while ( (indexSE < wcsa->seSize) ){ /** extracting words (if not at the end) */ int err= displayIntIndex((void *)wcsa->myicsa,indexSE, &posSEValue); - + {//obtains pointer to the ith word uint offtmp; - uint ith = posSEValue -1; // !! + uint ith = posSEValue -1; // !! tmplen = bitread (wcsa->wordsData.words, (ith +1)* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); offtmp = bitread (wcsa->wordsData.words, ( ith )* wcsa->wordsData.elemSize, wcsa->wordsData.elemSize); tmplen -=offtmp; //the lenght of the ith word. src= (byte *) wcsa->wordsData.wordsZoneMem.zone + offtmp; - } + } if (_Valid[*src]) { if (prevValid){ - *dst++ =' '; + *dst++ =' '; leng +=1; } prevValid =1; //for the next iteration } - else prevValid=0; - + else prevValid=0; + indexSE++; - + for (j=0;jseSize; - + error = extractWords((void *) wcsa, start, end, &cc, &length); if (error) {fprintf(stderr,"\n error during recoverSourceText2"); exit(0);} @@ -1369,7 +1374,7 @@ void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextS fclose(salida); free(cc); - free(filename); + free(filename); } /** ******************************************************************************* @@ -1384,20 +1389,20 @@ int printInfo(void *index) { uint n; twcsa *wcsa = (twcsa *) index; - + unsigned long indexSize; uint intIndexSize, presentationSize; int err; - + err = index_size(index, &indexSize); if (err!=0) return err; - err = sizeIntIndex(wcsa->myicsa, &intIndexSize); + err = sizeIntIndex(wcsa->myicsa, &intIndexSize); if (err!=0) return err; - + presentationSize = indexSize - intIndexSize; - - printf("\n ===================================================:"); - printf("\n Summary of Presentation layer:"); + + printf("\n ===================================================:"); + printf("\n Summary of Presentation layer:"); printf("\n Number of valid words (SEsize) = %u",wcsa->seSize); printf("\n Number of different words = %ld",wcsa->n); printf("\n WCSA structure = %lu bytes", sizeof(twcsa)); @@ -1407,14 +1412,14 @@ int printInfo(void *index) { uint totalwords = totalasciizone + totalpointers; printf("\n Size Of words structure (%d bytes):",totalwords); - printf("\n [ pointers = %d bytes || AsciiZone = %d bytes", totalpointers, totalasciizone); - - printf("\n\n Total = ** %u bytes (in RAM) **",presentationSize); + printf("\n [ pointers = %d bytes || AsciiZone = %d bytes", totalpointers, totalasciizone); + + printf("\n\n Total = ** %u bytes (in RAM) **",presentationSize); //printf("\n\n @@ Summary of self-index on Integers:"); err = printInfoIntIndex(wcsa->myicsa, " "); if (err!=0) return err; - - printf("\n ===================================================:"); + + printf("\n ===================================================:"); printf("\n"); return 0; } @@ -1422,7 +1427,7 @@ int printInfo(void *index) { /**------------------------------------------------------------------ * structsSize. * Counts the memory amount needed by the Facade (Presentation Layer). - * skipping the stop_words hash table + * skipping the stop_words hash table ----------------------------------------------------------------- */ uint structsSizeMem(twcsa *wcsa) { return 0; //not implemented: function not available for this index. @@ -1448,7 +1453,7 @@ int saveSEfile (char *basename, uint *v, uint n) { exit(0); } - write(file, v, sizeof(uint) * n ); + write(file, v, sizeof(uint) * n ); close(file); } @@ -1471,56 +1476,56 @@ double getTime2 (void) -/**------------------------------------------------------------------ +/**------------------------------------------------------------------ * MAIN PROGRAM. *------------------------------------------------------------------ */ #ifdef FACADEWITHMAIN int main(int argc, char* argv[]) { - - - + + + char *infile, *outbasename, *stopwordsfile; // Name of in/out files byte *inputBuffer; ulong finsize; - + int f_in; void *Index; - + printf("\n*Word-based iCSA: A word-based CSA"); printf("\n*CopyRight (c) 2008 [LBD & G.N.]\n\n"); - + // Reads input parameters from command line. if(argc < 3) { printf("Use: %s \n", argv[0]); exit(0); } - + // Reads params (input file, output basename, and stopwords file) infile = argv[1]; outbasename = argv[2]; stopwordsfile = argv[3]; - + finsize= fileSize(infile); - + if (! finsize) { printf( "\nFILE EMPTY OR FILE NOT FOUND %s !!\nSkipping processement ...\n",infile); exit(0); - } - + } + // Opening the input text file. if( (f_in = open(infile, O_RDONLY)) < 0) { printf("Cannot read file %s\n", infile); exit(0); - } + } inputBuffer = (byte *) malloc(finsize *sizeof(byte));// +1); - read (f_in,inputBuffer,finsize); - close (f_in); - - + read (f_in,inputBuffer,finsize); + close (f_in); + + { - //printf("\n parametros <<%s>>\n\n",stopwordsfile); + //printf("\n parametros <<%s>>\n\n",stopwordsfile); build_index (inputBuffer, finsize, stopwordsfile, &Index); /** building the index */ // /** recovering the source text from the index */ @@ -1530,39 +1535,39 @@ double getTime2 (void) ulong size; get_length(Index, &size); char extension[10]= ".source"; - + //recoverSourceText1((twcsa*) Index, outbasename,extension, size); strcat(extension,"2"); recoverSourceText2((twcsa*) Index, outbasename,extension,size); - end = getTime2(); - fprintf(stderr, "\nRecovering source file time: %.3f secs\n", end-start ); + end = getTime2(); + fprintf(stderr, "\nRecovering source file time: %.3f secs\n", end-start ); } -// +// // DISPLAYING THE OCCURRENCES OF A TEXT PATTERN (word/phrase). {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE]; int error = 0; ulong numocc,numc, length, i, *snippet_len, tot_numcharext = 0, numpatt; uchar *pattern, *snippet_text; - + pattern = textPattern; printf("\nSEARCH TEST for DISPLAY (pizzachili interface)\n"); - while(1) { + while(1) { printf("Intro string: "); fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin); if (!strcmp((char*)textPattern,"\n") ) break; textPattern[strlen((char*)textPattern)-1] = '\0'; - + length = strlen( (char*)textPattern); numc=50; - -// error = display (Index, textPattern, length, numc, &numocc, + +// error = display (Index, textPattern, length, numc, &numocc, // &snippet_text, &snippet_len); - error = displayWords (Index, textPattern, length, numc, &numocc, + error = displayWords (Index, textPattern, length, numc, &numocc, &snippet_text, &snippet_len,1); - + if (error){ fprintf(stderr, "%s\n", "Hubo un error durante display");exit(0);} - - fprintf(stderr,"\n acabou display");fflush(stderr); + + fprintf(stderr,"\n acabou display");fflush(stderr); {//show the results ulong j, len = length + 2*numc; char blank = '\0'; @@ -1570,7 +1575,7 @@ double getTime2 (void) fprintf(stderr,"\n pattern = %s",pattern);fflush(stderr); fprintf(stderr,"\n numocc = %d",numocc);fflush(stderr); fprintf(stderr,"\n snippet len = %d",len);fflush(stderr); - fprintf(stderr,"\n =========");fflush(stderr); + fprintf(stderr,"\n =========");fflush(stderr); for (i = 0; i < numocc; i++){ fprintf(stderr,"\n[%2d][len=%3d]<<",i+1,snippet_len[i]);fflush(stderr); fwrite(snippet_text+len*i,sizeof(uchar),snippet_len[i],stderr);fflush(stderr); @@ -1578,21 +1583,21 @@ double getTime2 (void) } } numpatt--; - + for(i=0; i @@ -61,9 +61,8 @@ t_hash initialize_hash (unsigned long sizeVoc) { h->hash[i].len = 0; h->hash[i].posInVoc = 0; } - printf("\nHash Table initilized with: %lu elements\n",h->SIZE_HASH); - return h; + return h; } @@ -74,7 +73,6 @@ unsigned long NearestPrime(unsigned long n) { long position; /* the prime number being sought */ long index; - for (position = n; ; position++) { // checks if those values from 2 to $\sqrt{m}$ can be factors of $m$ */ @@ -109,12 +107,22 @@ unsigned long hashFunction (const unsigned char *aWord, unsigned int len, unsign //for(c=*aWord; (aWord++)> 2) ); + h ^= ( (h << 5) + c + (h >> 2) ); } + return((unsigned int)((h&0x7fffffff) % sizeHash)); } +unsigned long newhashFunction(const unsigned char *aWord, unsigned int len, unsigned long sizeHash) +{ + unsigned int h = 0; + unsigned int i = 0; + for(i = 0; i < len; i ++) + h = ((unsigned int) aWord[i]) + (h << 6) + (h << 16) - h; + return (h&0x7fffffff) % sizeHash; +} /*----------------------------------------------------------------------- compares two strings @@ -127,12 +135,12 @@ unsigned long hashFunction (const unsigned char *aWord, unsigned int len, unsign - int strcomp(const unsigned char *s1, const unsigned char *s2, register unsigned long ws1, unsigned long ws2) { + int strcomp(const unsigned char *s1, const unsigned char *s2, register unsigned long ws1, unsigned long ws2) { if (ws1 !=ws2) { return -1; } - - { register unsigned long iters; + + { register unsigned long iters; iters=1; while( iters iters) so s1 != s2 // return( *s1-*s2); -// } +// } //} //permits to compare 2 strings of len ws1 and ws2 that do not end in '\0' int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) { - register ulong iters = 0; + register ulong iters = 0; while( itershash[addr].word = aWord; + h->hash[addr].len = len; + h->hash[addr].posInVoc = pos; + h->NumElem++; +} + +void resize(t_hash h){ + unsigned long oldsize = h->SIZE_HASH; + unsigned long newsize = NearestPrime(h->SIZE_HASH * 2); + t_hashNode* oldh = h->hash; + unsigned long l = newsize * sizeof(t_hashNode); + t_hashNode* newh = (t_hashNode *) malloc(newsize * sizeof(t_hashNode)); + unsigned long i; + fprintf(stderr, "Resizing hashtable from %lu to %lu (%lu bytes)\n", oldsize, newsize, l); + h->hash = newh; + h->SIZE_HASH = newsize; + for (i = 0; i < h->SIZE_HASH; i++) { + h->hash[i].word = NULL; + h->hash[i].len = 0; + h->hash[i].posInVoc = 0; + } + h->NumElem = 0; + for(i = 0; i < oldsize; i++) + if (oldh[i].word != NULL){ + insertAux(h, oldh[i].word, oldh[i].len, oldh[i].posInVoc); + oldh[i].word = NULL; + oldh[i].len = 0; + oldh[i].posInVoc=0; + } + //free(oldh); + +} + + void insertElement (t_hash h, const unsigned char *aWord, register unsigned long len, register unsigned long *addr) { //fprintf(stderr,"\n Entra en la funcin [%s], [%ld]",aWord, len); - if(h->NumElem >= h->SIZE_HASH -1) //loses 1 slot, but ensures that "search function" does not enter an infinity loop + if(h->NumElem >= h->SIZE_HASH/2) { - printf("\n\nHash table full!! Change size and recompile !\n"); - exit(1); - } + + resize(h); - getMemoryBlock(h->_memMgr,( byte **)&(h->hash[*addr].word),len+1); - //fprintf(stderr,"\n tras obter memoria"); + } + //h->hash[*addr].word = (unsigned char*) malloc((len+1) * sizeof(unsigned char)); + getMemoryBlock(h->_memMgr,( byte **)&(h->hash[*addr].word),len+1); + //fprintf(stderr,"\n tras obter memoria"); strncpy ((char *) h->hash[*addr].word, (char *)aWord, len); @@ -258,7 +306,7 @@ void insertElement (t_hash h, const unsigned char *aWord, register unsigned long h->NumElem++; //fprintf(stderr,"\n####inserted word [%s] ",h->hash[*addr].word); - + //return *addr; } @@ -273,23 +321,24 @@ unsigned long search (t_hash h, const unsigned char *aWord, register unsigned le unsigned long *returnedAddr){ register unsigned long addr, Saddr; - + //fprintf(stderr,"\n searching for [%s], [%d], sizehash= %ld",aWord,len,h->SIZE_HASH); addr = hashFunction(aWord,len, h->SIZE_HASH); Saddr = addr; t_hashNode *hash; hash = h->hash; - - while((hash[addr].word != NULL)&&((strcomp(hash[addr].word, aWord, hash[addr].len, len)) != 0)) { + + while((hash[addr].word != NULL)&&((strcomp(hash[addr].word, aWord, hash[addr].len, len)) != 0)) { //fprintf(stderr,"\nComprueba [%s], [%d]",hash[addr].word,strcomp(hash[addr].word, aWord, len)); addr = ((addr + JUMP) % h->SIZE_HASH); } *returnedAddr = addr; - + if(hash[addr].word == NULL) { - return h->NumElem; //Word was not found + // fprintf(stderr, "word:'%.*s', hash %lu\n", len, aWord, Saddr); + return h->NumElem; //Word was not found } else { return h->hash[addr].posInVoc; //Word was found in this position in the vocabulary @@ -302,7 +351,7 @@ unsigned long search (t_hash h, const unsigned char *aWord, register unsigned le Tells if a word appears or not in the hash table. -----------------------------------------------------------------------*/ unsigned long inHashTable (t_hash h, const unsigned char *aWord, register unsigned len, unsigned long *returnedAddr){ - + unsigned long searched; unsigned long nothing; searched = search(h,aWord,len,¬hing); @@ -315,12 +364,15 @@ unsigned long inHashTable (t_hash h, const unsigned char *aWord, register unsign ------------------------------------------------------------------ */ void destroy_hash (t_hash hash){ unsigned long mem=0; - mem += sizeof(struct hashStr) + hash->SIZE_HASH * sizeof(t_hashNode); + unsigned long i = 0; + mem = sizeof(struct hashStr) + hash->SIZE_HASH * sizeof(t_hashNode); + printf("\n[destroying hash table]...Freed %ld bytes... RAM\n", mem); + fflush(stdout); + free(hash->hash); - destroyMemoryManager(hash->_memMgr); //frees words and variants -// free(hash->_memMgr); - free(hash); - printf("\n[destroying hash table]...Freed %ld bytes... RAM", mem); + destroyMemoryManager(hash->_memMgr); //frees words and variants +/// free(hash->_memMgr); + free(hash); } @@ -340,48 +392,48 @@ int main(int argc, char* argv[]) byte * w; unsigned int size; unsigned long i,addrInTH; - + t_hash hash; - + _memMgr=createMemoryManager(); hash = initialize_hash (2); - + w=a; i = search (hash,w, strlen(w), &addrInTH ); insertElement (hash, w, strlen(w), &addrInTH); fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH); - fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); + fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); w=b; i = search (hash,w, strlen(w), &addrInTH ); insertElement (hash, w, strlen(w), &addrInTH); fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH); - fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); + fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); w=c; i = search (hash,w, strlen(w), &addrInTH ); insertElement (hash, w, strlen(w), &addrInTH); fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH); - fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); + fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); w=d; i = search (hash,w, strlen(w), &addrInTH ); insertElement (hash, w, strlen(w), &addrInTH); fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH); - fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); + fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); // w=e; // i = search (hash,w, strlen(w), &addrInTH ); // insertElement (hash, w, strlen(w), &addrInTH); // fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH); -// fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); +// fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); // w=f; // i = search (hash,w, strlen(w), &addrInTH ); // insertElement (hash, w, strlen(w), &addrInTH); // fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH); -// fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); +// fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); fprintf(stderr,"\n in: %s hash ? = %d",a,inHashTable(hash,a,strlen(a)) ); fprintf(stderr,"\n in: %s hash ? = %d",e, inHashTable(hash,e,strlen(e)) );