X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=swcsa%2Futils%2Fhash.c;h=a8aa4f7f79f48ea813f4bf2389dc3d70519f05b6;hb=9abbea1aba3d81f1eccd84a92d1857e46a1b3ba2;hp=8e77870ba6534269a848883366ceff1a1b68be4e;hpb=f23d86be251c79353f66701f10d69020845050c8;p=SXSI%2FTextCollection.git diff --git a/swcsa/utils/hash.c b/swcsa/utils/hash.c index 8e77870..a8aa4f7 100755 --- a/swcsa/utils/hash.c +++ b/swcsa/utils/hash.c @@ -1,12 +1,12 @@ -/* DYNAMIC END-TAGGED DENSE CODE. -- -A dynamic word-based byte oriented compressor for text files based on +/* DYNAMIC END-TAGGED DENSE CODE. -- +A dynamic word-based byte oriented compressor for text files based on dynamic End-Tagged Dense Code. -Brisaboa, N. R., Faria, A., Navarro, G., Param, J. R. -Simple, Fast, and Efficient Natural Language Adaptive Compression. -11th International Symposium on String Processing and Information Retrieval (SPIRE'04) - LNCS 3246. A. Apostolico, M. Melucci (Ed.), pp. 230-241. -Padova (Italia), 2004. +Brisaboa, N. R., Faria, A., Navarro, G., Param, J. R. +Simple, Fast, and Efficient Natural Language Adaptive Compression. +11th International Symposium on String Processing and Information Retrieval (SPIRE'04) - LNCS 3246. A. Apostolico, M. Melucci (Ed.), pp. 230-241. +Padova (Italia), 2004. Copyright (C) 2005 Antonio Faria. @@ -31,7 +31,7 @@ A Corua. Campus de Elvia s/n. Spain fari@udc.es /*----------------------------------------------------------------------- Hash: Definition of HashTable class (Linear Hash) ------------------------------------------------------------------------*/ - + #include "hash.h" #include @@ -61,9 +61,8 @@ t_hash initialize_hash (unsigned long sizeVoc) { h->hash[i].len = 0; h->hash[i].posInVoc = 0; } - printf("\nHash Table initilized with: %lu elements\n",h->SIZE_HASH); - return h; + return h; } @@ -74,7 +73,6 @@ unsigned long NearestPrime(unsigned long n) { long position; /* the prime number being sought */ long index; - for (position = n; ; position++) { // checks if those values from 2 to $\sqrt{m}$ can be factors of $m$ */ @@ -109,12 +107,22 @@ unsigned long hashFunction (const unsigned char *aWord, unsigned int len, unsign //for(c=*aWord; (aWord++)> 2) ); + h ^= ( (h << 5) + c + (h >> 2) ); } + return((unsigned int)((h&0x7fffffff) % sizeHash)); } +unsigned long newhashFunction(const unsigned char *aWord, unsigned int len, unsigned long sizeHash) +{ + unsigned int h = 0; + unsigned int i = 0; + for(i = 0; i < len; i ++) + h = ((unsigned int) aWord[i]) + (h << 6) + (h << 16) - h; + return (h&0x7fffffff) % sizeHash; +} /*----------------------------------------------------------------------- compares two strings @@ -127,12 +135,12 @@ unsigned long hashFunction (const unsigned char *aWord, unsigned int len, unsign - int strcomp(const unsigned char *s1, const unsigned char *s2, register unsigned long ws1, unsigned long ws2) { + int strcomp(const unsigned char *s1, const unsigned char *s2, register unsigned long ws1, unsigned long ws2) { if (ws1 !=ws2) { return -1; } - - { register unsigned long iters; + + { register unsigned long iters; iters=1; while( iters iters) so s1 != s2 // return( *s1-*s2); -// } +// } //} //permits to compare 2 strings of len ws1 and ws2 that do not end in '\0' int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) { - register ulong iters = 0; + register ulong iters = 0; while( itershash[addr].word = aWord; + h->hash[addr].len = len; + h->hash[addr].posInVoc = pos; + h->NumElem++; +} + +void resize(t_hash h){ + unsigned long oldsize = h->SIZE_HASH; + unsigned long newsize = NearestPrime(h->SIZE_HASH * 2); + t_hashNode* oldh = h->hash; + unsigned long l = newsize * sizeof(t_hashNode); + t_hashNode* newh = (t_hashNode *) malloc(newsize * sizeof(t_hashNode)); + unsigned long i; + fprintf(stderr, "Resizing hashtable from %lu to %lu (%lu bytes)\n", oldsize, newsize, l); + h->hash = newh; + h->SIZE_HASH = newsize; + for (i = 0; i < h->SIZE_HASH; i++) { + h->hash[i].word = NULL; + h->hash[i].len = 0; + h->hash[i].posInVoc = 0; + } + h->NumElem = 0; + for(i = 0; i < oldsize; i++) + if (oldh[i].word != NULL){ + insertAux(h, oldh[i].word, oldh[i].len, oldh[i].posInVoc); + oldh[i].word = NULL; + oldh[i].len = 0; + oldh[i].posInVoc=0; + } + //free(oldh); + +} + + void insertElement (t_hash h, const unsigned char *aWord, register unsigned long len, register unsigned long *addr) { //fprintf(stderr,"\n Entra en la funcin [%s], [%ld]",aWord, len); - if(h->NumElem >= h->SIZE_HASH -1) //loses 1 slot, but ensures that "search function" does not enter an infinity loop + if(h->NumElem >= h->SIZE_HASH/2) { - printf("\n\nHash table full!! Change size and recompile !\n"); - exit(1); - } + + resize(h); - getMemoryBlock(h->_memMgr,( byte **)&(h->hash[*addr].word),len+1); - //fprintf(stderr,"\n tras obter memoria"); + } + //h->hash[*addr].word = (unsigned char*) malloc((len+1) * sizeof(unsigned char)); + getMemoryBlock(h->_memMgr,( byte **)&(h->hash[*addr].word),len+1); + //fprintf(stderr,"\n tras obter memoria"); strncpy ((char *) h->hash[*addr].word, (char *)aWord, len); @@ -258,7 +306,7 @@ void insertElement (t_hash h, const unsigned char *aWord, register unsigned long h->NumElem++; //fprintf(stderr,"\n####inserted word [%s] ",h->hash[*addr].word); - + //return *addr; } @@ -273,23 +321,24 @@ unsigned long search (t_hash h, const unsigned char *aWord, register unsigned le unsigned long *returnedAddr){ register unsigned long addr, Saddr; - + //fprintf(stderr,"\n searching for [%s], [%d], sizehash= %ld",aWord,len,h->SIZE_HASH); addr = hashFunction(aWord,len, h->SIZE_HASH); Saddr = addr; t_hashNode *hash; hash = h->hash; - - while((hash[addr].word != NULL)&&((strcomp(hash[addr].word, aWord, hash[addr].len, len)) != 0)) { + + while((hash[addr].word != NULL)&&((strcomp(hash[addr].word, aWord, hash[addr].len, len)) != 0)) { //fprintf(stderr,"\nComprueba [%s], [%d]",hash[addr].word,strcomp(hash[addr].word, aWord, len)); addr = ((addr + JUMP) % h->SIZE_HASH); } *returnedAddr = addr; - + if(hash[addr].word == NULL) { - return h->NumElem; //Word was not found + // fprintf(stderr, "word:'%.*s', hash %lu\n", len, aWord, Saddr); + return h->NumElem; //Word was not found } else { return h->hash[addr].posInVoc; //Word was found in this position in the vocabulary @@ -302,7 +351,7 @@ unsigned long search (t_hash h, const unsigned char *aWord, register unsigned le Tells if a word appears or not in the hash table. -----------------------------------------------------------------------*/ unsigned long inHashTable (t_hash h, const unsigned char *aWord, register unsigned len, unsigned long *returnedAddr){ - + unsigned long searched; unsigned long nothing; searched = search(h,aWord,len,¬hing); @@ -315,12 +364,15 @@ unsigned long inHashTable (t_hash h, const unsigned char *aWord, register unsign ------------------------------------------------------------------ */ void destroy_hash (t_hash hash){ unsigned long mem=0; - mem += sizeof(struct hashStr) + hash->SIZE_HASH * sizeof(t_hashNode); + unsigned long i = 0; + mem = sizeof(struct hashStr) + hash->SIZE_HASH * sizeof(t_hashNode); + printf("\n[destroying hash table]...Freed %ld bytes... RAM\n", mem); + fflush(stdout); + free(hash->hash); - destroyMemoryManager(hash->_memMgr); //frees words and variants -// free(hash->_memMgr); - free(hash); - printf("\n[destroying hash table]...Freed %ld bytes... RAM", mem); + destroyMemoryManager(hash->_memMgr); //frees words and variants +/// free(hash->_memMgr); + free(hash); } @@ -340,48 +392,48 @@ int main(int argc, char* argv[]) byte * w; unsigned int size; unsigned long i,addrInTH; - + t_hash hash; - + _memMgr=createMemoryManager(); hash = initialize_hash (2); - + w=a; i = search (hash,w, strlen(w), &addrInTH ); insertElement (hash, w, strlen(w), &addrInTH); fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH); - fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); + fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); w=b; i = search (hash,w, strlen(w), &addrInTH ); insertElement (hash, w, strlen(w), &addrInTH); fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH); - fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); + fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); w=c; i = search (hash,w, strlen(w), &addrInTH ); insertElement (hash, w, strlen(w), &addrInTH); fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH); - fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); + fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); w=d; i = search (hash,w, strlen(w), &addrInTH ); insertElement (hash, w, strlen(w), &addrInTH); fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH); - fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); + fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); // w=e; // i = search (hash,w, strlen(w), &addrInTH ); // insertElement (hash, w, strlen(w), &addrInTH); // fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH); -// fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); +// fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); // w=f; // i = search (hash,w, strlen(w), &addrInTH ); // insertElement (hash, w, strlen(w), &addrInTH); // fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH); -// fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); +// fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc); fprintf(stderr,"\n in: %s hash ? = %d",a,inHashTable(hash,a,strlen(a)) ); fprintf(stderr,"\n in: %s hash ? = %d",e, inHashTable(hash,e,strlen(e)) );