2 /* DYNAMIC END-TAGGED DENSE CODE. --
3 A dynamic word-based byte oriented compressor for text files based on
4 dynamic End-Tagged Dense Code.
6 Brisaboa, N. R., Faria, A., Navarro, G., Param, J. R.
7 Simple, Fast, and Efficient Natural Language Adaptive Compression.
8 11th International Symposium on String Processing and Information Retrieval (SPIRE'04) - LNCS 3246. A. Apostolico, M. Melucci (Ed.), pp. 230-241.
11 Copyright (C) 2005 Antonio Faria.
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License
15 as published by the Free Software Foundation; either version 2
16 of the License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, write to the Free Software
25 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
27 Author's contact: Antonio Faria, Dept. of Computer Science, University of
28 A Corua. Campus de Elvia s/n. Spain fari@udc.es
31 /*-----------------------------------------------------------------------
32 Hash: Definition of HashTable class (Linear Hash)
33 ------------------------------------------------------------------------*/
42 /*-----------------------------------------------------------------
43 Initilization of data structures used by the hashTable
44 ---------------------------------------------------------------- */
45 t_hash initialize_hash (unsigned long sizeVoc) {
49 h = (t_hash) malloc(sizeof(struct hashStr));
50 unsigned long m = 6 * sizeVoc;
51 h->SIZE_HASH = sizeVoc;
53 h->SIZE_HASH = NearestPrime(h->SIZE_HASH);
54 } while (h->SIZE_HASH >= m);
56 h->hash = (t_hashNode *) malloc(h->SIZE_HASH * sizeof(t_hashNode));
59 //creates the memory manager that is used to reserve small pieces of memory (for words)
60 h->_memMgr=createMemoryManager();
63 for (i = 0; i < h->SIZE_HASH; i++) {
64 h->hash[i].word = NULL;
66 h->hash[i].posInVoc = 0;
73 /*------------------------------------------------------------------
74 Find the nearest prime number over n.
75 ---------------------------------------------------------------- */
76 unsigned long NearestPrime(unsigned long n)
78 long position; /* the prime number being sought */
80 for (position = n; ; position++)
82 // checks if those values from 2 to $\sqrt{m}$ can be factors of $m$ */
83 for (index = 2; index <= (long) sqrt((double) position) && position % index != 0; index++) ;
85 if (position % index != 0) /* No factors in that range, therefore a prime number was found */
95 /*-----------------------------------------------------------------------
96 Computes a hash function over a string "aWord", it returns the position
97 where "aWord" should go in the hash table if no collision ocurrs.
98 ---------------------------------------------------------------------*/
99 unsigned long hashFunction (const unsigned char *aWord, unsigned int len, unsigned long sizeHash)
102 register unsigned int h;
103 register unsigned long last;
104 last=((unsigned long) aWord) +len -1;
110 for( ; ((unsigned long) aWord <=last ) ; )
111 //for(c=*aWord; (aWord++)<last ; )
116 h ^= ( (h << 5) + c + (h >> 2) );
119 return((unsigned int)((h&0x7fffffff) % sizeHash));
122 unsigned long newhashFunction(const unsigned char *aWord, unsigned int len, unsigned long sizeHash)
126 for(i = 0; i < len; i ++)
127 h = ((unsigned int) aWord[i]) + (h << 6) + (h << 16) - h;
128 return (h&0x7fffffff) % sizeHash;
131 /*-----------------------------------------------------------------------
133 ---------------------------------------------------------------------*/
135 /* Author J. Zobel, April 2001.
136 http://www.seg.rmit.edu.au/code/zwh-ipl/
137 Permission to use this code is freely granted, provided that this
138 statement is retained. */
142 int strcomp(const unsigned char *s1, const unsigned char *s2, register unsigned long ws1, unsigned long ws2) {
147 { register unsigned long iters;
149 while( iters<ws1 && *s1 == *s2 )
155 //fprintf(stderr,"\nDevuelve [%d]",*s1-*s2);
160 inline int strcomp3(const unsigned char *s1, const unsigned char *s2, unsigned long ws1, unsigned long ws2) {
166 { register unsigned long iters;
167 register unsigned long end;
170 while( iters<end && *s1 == *s2 )
176 //fprintf(stderr,"\nDevuelve [%d]",*s1-*s2);
182 /* Author J. Zobel, April 2001.
183 http://www.seg.rmit.edu.au/code/zwh-ipl/
184 Permission to use this code is freely granted, provided that this
185 statement is retained. */
187 //int strcomp(const unsigned char *s1, const unsigned char *s2, unsigned long ws) {
188 // register unsigned long i;
190 // while( i < ws-1 && *s1 == *s2 )*/
196 // return( *s1-*s2 );
199 //int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) {
200 // register ulong iters = 1;
202 // while( iters<ws1 && *s1 == *s2 ){
207 // // so iters == ws1 OR *s1 != *s2
208 // if (ws1 == iters) {
210 // return 0; // s1 equals to s2
212 // return -1; // s1 < s2
214 // else { //(ws1 > iters) so s1 != s2
220 //permits to compare 2 strings of len ws1 and ws2 that do not end in '\0'
221 int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) {
222 register ulong iters = 0;
224 while( iters<ws1 && iters<ws2 && *s1 == *s2 ){
232 return 0; // s1 equals to s2
234 return -1; // w1 < w2 and *s1_i == *s2_i for i=0 to iters-1
239 return 0; // s1 equals to s2
241 return +1; // w2 < w1 and *ws1 is '\0'
250 /*-----------------------------------------------------------------------
251 Insert a new word in a position of the hashTable (position previously computed)
252 ---------------------------------------------------------------------*/
254 void insertAux(t_hash h, unsigned char *aWord, unsigned long len, unsigned long pos)
256 unsigned long addr = 0;
257 unsigned long dummy = search(h, aWord, len, &addr);
258 h->hash[addr].word = aWord;
259 h->hash[addr].len = len;
260 h->hash[addr].posInVoc = pos;
264 void resize(t_hash h){
265 unsigned long oldsize = h->SIZE_HASH;
266 unsigned long newsize = NearestPrime(h->SIZE_HASH * 2);
267 t_hashNode* oldh = h->hash;
268 unsigned long l = newsize * sizeof(t_hashNode);
269 t_hashNode* newh = (t_hashNode *) malloc(newsize * sizeof(t_hashNode));
271 fprintf(stderr, "Resizing hashtable from %lu to %lu (%lu bytes)\n", oldsize, newsize, l);
273 h->SIZE_HASH = newsize;
274 for (i = 0; i < h->SIZE_HASH; i++) {
275 h->hash[i].word = NULL;
277 h->hash[i].posInVoc = 0;
280 for(i = 0; i < oldsize; i++)
281 if (oldh[i].word != NULL){
282 insertAux(h, oldh[i].word, oldh[i].len, oldh[i].posInVoc);
292 void insertElement (t_hash h, const unsigned char *aWord, register unsigned long len, register unsigned long *addr) {
293 //fprintf(stderr,"\n Entra en la funcin [%s], [%ld]",aWord, len);
295 if(h->NumElem >= h->SIZE_HASH/2)
299 //h->hash[*addr].word = (unsigned char*) malloc((len+1) * sizeof(unsigned char));
300 getMemoryBlock(h->_memMgr,( byte **)&(h->hash[*addr].word),len+1);
301 //fprintf(stderr,"\n tras obter memoria");
303 strncpy ((char *) h->hash[*addr].word, (char *)aWord, len);
305 h->hash[*addr].word[len]='\0';
306 h->hash[*addr].len =len;
307 h->hash[*addr].posInVoc = h->NumElem;
310 //fprintf(stderr,"\n####inserted word [%s] ",h->hash[*addr].word);
315 /*-----------------------------------------------------------------------
316 Search for a word in the hash table and returns its position in the
317 vocabulary. It returns the next "available" position in the vocabulary,
318 if the word is not in the hash table. That is: a "0-node" position.
319 It also returns -using attribute returnedAddr- the position where word
320 was found (or where it should go if it was inserted in next "insert call".
321 -----------------------------------------------------------------------*/
322 unsigned long search (t_hash h, const unsigned char *aWord, register unsigned len,
323 unsigned long *returnedAddr){
325 register unsigned long addr, Saddr;
327 //fprintf(stderr,"\n searching for [%s], [%d], sizehash= %ld",aWord,len,h->SIZE_HASH);
328 addr = hashFunction(aWord,len, h->SIZE_HASH);
334 while((hash[addr].word != NULL)&&((strcomp(hash[addr].word, aWord, hash[addr].len, len)) != 0)) {
335 //fprintf(stderr,"\nComprueba [%s], [%d]",hash[addr].word,strcomp(hash[addr].word, aWord, len));
336 addr = ((addr + JUMP) % h->SIZE_HASH);
339 *returnedAddr = addr;
341 if(hash[addr].word == NULL) {
342 // fprintf(stderr, "word:'%.*s', hash %lu\n", len, aWord, Saddr);
343 return h->NumElem; //Word was not found
346 return h->hash[addr].posInVoc; //Word was found in this position in the vocabulary
352 /*-----------------------------------------------------------------------
353 Tells if a word appears or not in the hash table.
354 -----------------------------------------------------------------------*/
355 unsigned long inHashTable (t_hash h, const unsigned char *aWord, register unsigned len, unsigned long *returnedAddr){
357 unsigned long searched;
358 unsigned long nothing;
359 searched = search(h,aWord,len,¬hing);
360 *returnedAddr=nothing;
361 return (searched < (h->NumElem) );
364 /*------------------------------------------------------------------
366 ------------------------------------------------------------------ */
367 void destroy_hash (t_hash hash){
370 mem = sizeof(struct hashStr) + hash->SIZE_HASH * sizeof(t_hashNode);
371 printf("\n[destroying hash table]...Freed %ld bytes... RAM\n", mem);
375 destroyMemoryManager(hash->_memMgr); //frees words and variants
376 /// free(hash->_memMgr);
383 /*------------------------------------------------------------------
384 main, to make unit proofs
385 ------------------------------------------------------------------ */
387 int main(int argc, char* argv[])
388 { byte a[10]= "word1";
396 unsigned long i,addrInTH;
400 _memMgr=createMemoryManager();
402 hash = initialize_hash (2);
405 i = search (hash,w, strlen(w), &addrInTH );
406 insertElement (hash, w, strlen(w), &addrInTH);
407 fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
408 fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
411 i = search (hash,w, strlen(w), &addrInTH );
412 insertElement (hash, w, strlen(w), &addrInTH);
413 fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
414 fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
417 i = search (hash,w, strlen(w), &addrInTH );
418 insertElement (hash, w, strlen(w), &addrInTH);
419 fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
420 fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
423 i = search (hash,w, strlen(w), &addrInTH );
424 insertElement (hash, w, strlen(w), &addrInTH);
425 fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
426 fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
429 // i = search (hash,w, strlen(w), &addrInTH );
430 // insertElement (hash, w, strlen(w), &addrInTH);
431 // fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
432 // fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
435 // i = search (hash,w, strlen(w), &addrInTH );
436 // insertElement (hash, w, strlen(w), &addrInTH);
437 // fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
438 // fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
440 fprintf(stderr,"\n in: %s hash ? = %d",a,inHashTable(hash,a,strlen(a)) );
441 fprintf(stderr,"\n in: %s hash ? = %d",e, inHashTable(hash,e,strlen(e)) );
442 fprintf(stderr,"\n in: %s hash ? = %d",b, inHashTable(hash,b,strlen(b)) );
445 destroyMemoryManager(_memMgr);