-/* DYNAMIC END-TAGGED DENSE CODE. --
-A dynamic word-based byte oriented compressor for text files based on
+/* DYNAMIC END-TAGGED DENSE CODE. --
+A dynamic word-based byte oriented compressor for text files based on
dynamic End-Tagged Dense Code.
-Brisaboa, N. R., Faria, A., Navarro, G., Param, J. R.
-Simple, Fast, and Efficient Natural Language Adaptive Compression.
-11th International Symposium on String Processing and Information Retrieval (SPIRE'04) - LNCS 3246. A. Apostolico, M. Melucci (Ed.), pp. 230-241.
-Padova (Italia), 2004.
+Brisaboa, N. R., Faria, A., Navarro, G., Param, J. R.
+Simple, Fast, and Efficient Natural Language Adaptive Compression.
+11th International Symposium on String Processing and Information Retrieval (SPIRE'04) - LNCS 3246. A. Apostolico, M. Melucci (Ed.), pp. 230-241.
+Padova (Italia), 2004.
Copyright (C) 2005 Antonio Faria.
/*-----------------------------------------------------------------------
Hash: Definition of HashTable class (Linear Hash)
------------------------------------------------------------------------*/
-
+
#include "hash.h"
#include <stdio.h>
unsigned long i;
h = (t_hash) malloc(sizeof(struct hashStr));
- h->SIZE_HASH = (unsigned long) (OCUP_HASH * sizeVoc);
- h->SIZE_HASH = NearestPrime(h->SIZE_HASH);
+ unsigned long m = 6 * sizeVoc;
+ h->SIZE_HASH = sizeVoc;
+ do {
+ h->SIZE_HASH = NearestPrime(h->SIZE_HASH);
+ } while (h->SIZE_HASH < m);
+
h->hash = (t_hashNode *) malloc(h->SIZE_HASH * sizeof(t_hashNode));
h->NumElem = 0;
h->hash[i].len = 0;
h->hash[i].posInVoc = 0;
}
- printf("\nHash Table initilized with: %lu elements\n",h->SIZE_HASH);
- return h;
+ return h;
}
{
long position; /* the prime number being sought */
long index;
-
for (position = n; ; position++)
{
// checks if those values from 2 to $\sqrt{m}$ can be factors of $m$ */
//for(c=*aWord; (aWord++)<last ; )
{
c=*(aWord++);
+
//c=*aWord;
- h ^= ( (h << 5) + c + (h >> 2) );
+ h ^= ( (h << 5) + c + (h >> 2) );
}
+
return((unsigned int)((h&0x7fffffff) % sizeHash));
}
+unsigned long newhashFunction(const unsigned char *aWord, unsigned int len, unsigned long sizeHash)
+{
+ unsigned int h = 0;
+ unsigned int i = 0;
+ for(i = 0; i < len; i ++)
+ h = ((unsigned int) aWord[i]) + (h << 6) + (h << 16) - h;
+ return (h&0x7fffffff) % sizeHash;
+}
/*-----------------------------------------------------------------------
compares two strings
- int strcomp(const unsigned char *s1, const unsigned char *s2, register unsigned long ws1, unsigned long ws2) {
+ int strcomp(const unsigned char *s1, const unsigned char *s2, register unsigned long ws1, unsigned long ws2) {
if (ws1 !=ws2) {
return -1;
}
-
- { register unsigned long iters;
+
+ { register unsigned long iters;
iters=1;
while( iters<ws1 && *s1 == *s2 )
{
}
inline int strcomp3(const unsigned char *s1, const unsigned char *s2, unsigned long ws1, unsigned long ws2) {
-
+
if (ws1 !=ws2) {
return -1;
}
-
+
{ register unsigned long iters;
register unsigned long end;
end = MIN(ws1,ws2);
//}
//int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) {
-// register ulong iters = 1;
+// register ulong iters = 1;
//
// while( iters<ws1 && *s1 == *s2 ){
// s1++;
// if (ws1 == iters) {
// if (ws2 == ws1)
// return 0; // s1 equals to s2
-// else
+// else
// return -1; // s1 < s2
// }
// else { //(ws1 > iters) so s1 != s2
// return( *s1-*s2);
-// }
+// }
//}
//permits to compare 2 strings of len ws1 and ws2 that do not end in '\0'
int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) {
- register ulong iters = 0;
+ register ulong iters = 0;
while( iters<ws1 && iters<ws2 && *s1 == *s2 ){
s1++;
if (ws1 == iters) {
if (ws2 == ws1)
return 0; // s1 equals to s2
- else
+ else
return -1; // w1 < w2 and *s1_i == *s2_i for i=0 to iters-1
}
- else
+ else
if (ws2 == iters) {
if (ws2 == ws1)
return 0; // s1 equals to s2
- else
+ else
return +1; // w2 < w1 and *ws1 is '\0'
}
else { //*s1 != *s2
return (*s1-*s2);
- }
-
+ }
+
}
-
+
/*-----------------------------------------------------------------------
Insert a new word in a position of the hashTable (position previously computed)
---------------------------------------------------------------------*/
+
+void insertAux(t_hash h, unsigned char *aWord, unsigned long len, unsigned long pos)
+{
+ unsigned long addr = 0;
+ unsigned long dummy = search(h, aWord, len, &addr);
+ h->hash[addr].word = aWord;
+ h->hash[addr].len = len;
+ h->hash[addr].posInVoc = pos;
+ h->NumElem++;
+}
+
+void resize(t_hash h){
+ unsigned long oldsize = h->SIZE_HASH;
+ unsigned long newsize = NearestPrime(h->SIZE_HASH * 2);
+ t_hashNode* oldh = h->hash;
+ unsigned long l = newsize * sizeof(t_hashNode);
+ t_hashNode* newh = (t_hashNode *) malloc(newsize * sizeof(t_hashNode));
+ unsigned long i;
+ fprintf(stderr, "Resizing hashtable from %lu to %lu (%lu bytes)\n", oldsize, newsize, l);
+ h->hash = newh;
+ h->SIZE_HASH = newsize;
+ for (i = 0; i < h->SIZE_HASH; i++) {
+ h->hash[i].word = NULL;
+ h->hash[i].len = 0;
+ h->hash[i].posInVoc = 0;
+ }
+ h->NumElem = 0;
+ for(i = 0; i < oldsize; i++)
+ if (oldh[i].word != NULL){
+ insertAux(h, oldh[i].word, oldh[i].len, oldh[i].posInVoc);
+ oldh[i].word = NULL;
+ oldh[i].len = 0;
+ oldh[i].posInVoc=0;
+ }
+ //free(oldh);
+
+}
+
+
void insertElement (t_hash h, const unsigned char *aWord, register unsigned long len, register unsigned long *addr) {
//fprintf(stderr,"\n Entra en la funcin [%s], [%ld]",aWord, len);
- if(h->NumElem >= h->SIZE_HASH -1) //loses 1 slot, but ensures that "search function" does not enter an infinity loop
+ if(h->NumElem >= h->SIZE_HASH/2)
{
- printf("\n\nHash table full!! Change size and recompile !\n");
- exit(1);
- }
+
+ resize(h);
- getMemoryBlock(h->_memMgr,( byte **)&(h->hash[*addr].word),len+1);
- //fprintf(stderr,"\n tras obter memoria");
+ }
+ //h->hash[*addr].word = (unsigned char*) malloc((len+1) * sizeof(unsigned char));
+ getMemoryBlock(h->_memMgr,( byte **)&(h->hash[*addr].word),len+1);
+ //fprintf(stderr,"\n tras obter memoria");
strncpy ((char *) h->hash[*addr].word, (char *)aWord, len);
h->NumElem++;
//fprintf(stderr,"\n####inserted word [%s] ",h->hash[*addr].word);
-
+
//return *addr;
}
unsigned long *returnedAddr){
register unsigned long addr, Saddr;
-
+
//fprintf(stderr,"\n searching for [%s], [%d], sizehash= %ld",aWord,len,h->SIZE_HASH);
addr = hashFunction(aWord,len, h->SIZE_HASH);
Saddr = addr;
t_hashNode *hash;
hash = h->hash;
-
- while((hash[addr].word != NULL)&&((strcomp(hash[addr].word, aWord, hash[addr].len, len)) != 0)) {
+
+ while((hash[addr].word != NULL)&&((strcomp(hash[addr].word, aWord, hash[addr].len, len)) != 0)) {
//fprintf(stderr,"\nComprueba [%s], [%d]",hash[addr].word,strcomp(hash[addr].word, aWord, len));
addr = ((addr + JUMP) % h->SIZE_HASH);
}
*returnedAddr = addr;
-
+
if(hash[addr].word == NULL) {
- return h->NumElem; //Word was not found
+ // fprintf(stderr, "word:'%.*s', hash %lu\n", len, aWord, Saddr);
+ return h->NumElem; //Word was not found
}
else {
return h->hash[addr].posInVoc; //Word was found in this position in the vocabulary
Tells if a word appears or not in the hash table.
-----------------------------------------------------------------------*/
unsigned long inHashTable (t_hash h, const unsigned char *aWord, register unsigned len, unsigned long *returnedAddr){
-
+
unsigned long searched;
unsigned long nothing;
searched = search(h,aWord,len,¬hing);
------------------------------------------------------------------ */
void destroy_hash (t_hash hash){
unsigned long mem=0;
- mem += sizeof(struct hashStr) + hash->SIZE_HASH * sizeof(t_hashNode);
+ unsigned long i = 0;
+ mem = sizeof(struct hashStr) + hash->SIZE_HASH * sizeof(t_hashNode);
+ printf("\n[destroying hash table]...Freed %ld bytes... RAM\n", mem);
+ fflush(stdout);
+
free(hash->hash);
- destroyMemoryManager(hash->_memMgr); //frees words and variants
-// free(hash->_memMgr);
- free(hash);
- printf("\n[destroying hash table]...Freed %ld bytes... RAM", mem);
+ destroyMemoryManager(hash->_memMgr); //frees words and variants
+/// free(hash->_memMgr);
+ free(hash);
}
byte * w;
unsigned int size;
unsigned long i,addrInTH;
-
+
t_hash hash;
-
+
_memMgr=createMemoryManager();
hash = initialize_hash (2);
-
+
w=a;
i = search (hash,w, strlen(w), &addrInTH );
insertElement (hash, w, strlen(w), &addrInTH);
fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
- fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
+ fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
w=b;
i = search (hash,w, strlen(w), &addrInTH );
insertElement (hash, w, strlen(w), &addrInTH);
fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
- fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
+ fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
w=c;
i = search (hash,w, strlen(w), &addrInTH );
insertElement (hash, w, strlen(w), &addrInTH);
fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
- fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
+ fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
w=d;
i = search (hash,w, strlen(w), &addrInTH );
insertElement (hash, w, strlen(w), &addrInTH);
fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
- fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
+ fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
// w=e;
// i = search (hash,w, strlen(w), &addrInTH );
// insertElement (hash, w, strlen(w), &addrInTH);
// fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
-// fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
+// fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
// w=f;
// i = search (hash,w, strlen(w), &addrInTH );
// insertElement (hash, w, strlen(w), &addrInTH);
// fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
-// fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
+// fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
fprintf(stderr,"\n in: %s hash ? = %d",a,inHashTable(hash,a,strlen(a)) );
fprintf(stderr,"\n in: %s hash ? = %d",e, inHashTable(hash,e,strlen(e)) );