2 /* DYNAMIC END-TAGGED DENSE CODE. --
3 A dynamic word-based byte oriented compressor for text files based on
4 dynamic End-Tagged Dense Code.
6 Brisaboa, N. R., Faria, A., Navarro, G., Param, J. R.
7 Simple, Fast, and Efficient Natural Language Adaptive Compression.
8 11th International Symposium on String Processing and Information Retrieval (SPIRE'04) - LNCS 3246. A. Apostolico, M. Melucci (Ed.), pp. 230-241.
11 Copyright (C) 2005 Antonio Faria.
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License
15 as published by the Free Software Foundation; either version 2
16 of the License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, write to the Free Software
25 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
27 Author's contact: Antonio Faria, Dept. of Computer Science, University of
28 A Corua. Campus de Elvia s/n. Spain fari@udc.es
31 /*-----------------------------------------------------------------------
32 Hash: Definition of HashTable class (Linear Hash)
33 ------------------------------------------------------------------------*/
42 /*-----------------------------------------------------------------
43 Initilization of data structures used by the hashTable
44 ---------------------------------------------------------------- */
45 t_hash initialize_hash (unsigned long sizeVoc) {
49 h = (t_hash) malloc(sizeof(struct hashStr));
50 unsigned long m = 6 * sizeVoc;
51 h->SIZE_HASH = sizeVoc;
53 h->SIZE_HASH = NearestPrime(h->SIZE_HASH);
54 } while (h->SIZE_HASH < m);
56 h->hash = (t_hashNode *) malloc(h->SIZE_HASH * sizeof(t_hashNode));
59 //creates the memory manager that is used to reserve small pieces of memory (for words)
60 h->_memMgr=createMemoryManager();
63 for (i = 0; i < h->SIZE_HASH; i++) {
64 h->hash[i].word = NULL;
66 h->hash[i].posInVoc = 0;
73 /*------------------------------------------------------------------
74 Find the nearest prime number over n.
75 ---------------------------------------------------------------- */
76 unsigned long NearestPrime(unsigned long n)
78 long position; /* the prime number being sought */
80 for (position = n; ; position++)
82 // checks if those values from 2 to $\sqrt{m}$ can be factors of $m$ */
83 for (index = 2; index <= (long) sqrt((double) position) && position % index != 0; index++) ;
85 if (position % index != 0) /* No factors in that range, therefore a prime number was found */
95 /*-----------------------------------------------------------------------
96 Computes a hash function over a string "aWord", it returns the position
97 where "aWord" should go in the hash table if no collision ocurrs.
98 ---------------------------------------------------------------------*/
99 unsigned long hashFunction (const unsigned char *aWord, unsigned int len, unsigned long sizeHash)
102 register unsigned int h;
103 register unsigned long last;
104 last=((unsigned long) aWord) +len -1;
110 for( ; ((unsigned long) aWord <=last ) ; )
111 //for(c=*aWord; (aWord++)<last ; )
116 h ^= ( (h << 5) + c + (h >> 2) );
119 return((unsigned int)((h&0x7fffffff) % sizeHash));
122 unsigned long newhashFunction(const unsigned char *aWord, unsigned int len, unsigned long sizeHash)
126 for(i = 0; i < len; i ++)
127 h = ((unsigned int) aWord[i]) + (h << 6) + (h << 16) - h;
128 return (h&0x7fffffff) % sizeHash;
131 /*-----------------------------------------------------------------------
133 ---------------------------------------------------------------------*/
135 /* Author J. Zobel, April 2001.
136 http://www.seg.rmit.edu.au/code/zwh-ipl/
137 Permission to use this code is freely granted, provided that this
138 statement is retained. */
142 int strcomp(const unsigned char *s1, const unsigned char *s2, register unsigned long ws1, unsigned long ws2) {
147 { register unsigned long iters;
149 while( iters<ws1 && *s1 == *s2 )
155 //fprintf(stderr,"\nDevuelve [%d]",*s1-*s2);
160 inline int strcomp3(const unsigned char *s1, const unsigned char *s2, unsigned long ws1, unsigned long ws2) {
166 { register unsigned long iters;
167 register unsigned long end;
170 while( iters<end && *s1 == *s2 )
176 //fprintf(stderr,"\nDevuelve [%d]",*s1-*s2);
182 /* Author J. Zobel, April 2001.
183 http://www.seg.rmit.edu.au/code/zwh-ipl/
184 Permission to use this code is freely granted, provided that this
185 statement is retained. */
187 //int strcomp(const unsigned char *s1, const unsigned char *s2, unsigned long ws) {
188 // register unsigned long i;
190 // while( i < ws-1 && *s1 == *s2 )*/
196 // return( *s1-*s2 );
199 //int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) {
200 // register ulong iters = 1;
202 // while( iters<ws1 && *s1 == *s2 ){
207 // // so iters == ws1 OR *s1 != *s2
208 // if (ws1 == iters) {
210 // return 0; // s1 equals to s2
212 // return -1; // s1 < s2
214 // else { //(ws1 > iters) so s1 != s2
220 //permits to compare 2 strings of len ws1 and ws2 that do not end in '\0'
221 int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) {
222 register ulong iters = 0;
224 while( iters<ws1 && iters<ws2 && *s1 == *s2 ){
232 return 0; // s1 equals to s2
234 return -1; // w1 < w2 and *s1_i == *s2_i for i=0 to iters-1
239 return 0; // s1 equals to s2
241 return +1; // w2 < w1 and *ws1 is '\0'
250 /*-----------------------------------------------------------------------
251 Insert a new word in a position of the hashTable (position previously computed)
252 ---------------------------------------------------------------------*/
254 void insertAux(t_hash h, unsigned char *aWord, unsigned long len, unsigned long pos)
256 unsigned long addr = 0;
257 unsigned long dummy = search(h, aWord, len, &addr);
258 h->hash[addr].word = aWord;
259 h->hash[addr].len = len;
260 h->hash[addr].posInVoc = pos;
264 void resize(t_hash h){
265 unsigned long oldsize = h->SIZE_HASH;
266 unsigned long newsize = NearestPrime(h->SIZE_HASH * 2);
267 t_hashNode* oldh = h->hash;
268 unsigned long l = newsize * sizeof(t_hashNode);
269 t_hashNode* newh = (t_hashNode *) malloc(newsize * sizeof(t_hashNode));
271 fprintf(stderr, "Resizing hashtable from %lu to %lu (%lu bytes)\n", oldsize, newsize, l);
273 h->SIZE_HASH = newsize;
274 for (i = 0; i < h->SIZE_HASH; i++) {
275 h->hash[i].word = NULL;
277 h->hash[i].posInVoc = 0;
280 for(i = 0; i < oldsize; i++)
281 if (oldh[i].word != NULL){
282 insertAux(h, oldh[i].word, oldh[i].len, oldh[i].posInVoc);
292 void insertElement (t_hash h, const unsigned char *aWord, register unsigned long len, register unsigned long *addr) {
293 //fprintf(stderr,"\n Entra en la funcin [%s], [%ld]",aWord, len);
295 if(h->NumElem >= h->SIZE_HASH/2)
301 //h->hash[*addr].word = (unsigned char*) malloc((len+1) * sizeof(unsigned char));
302 getMemoryBlock(h->_memMgr,( byte **)&(h->hash[*addr].word),len+1);
303 //fprintf(stderr,"\n tras obter memoria");
305 strncpy ((char *) h->hash[*addr].word, (char *)aWord, len);
307 h->hash[*addr].word[len]='\0';
308 h->hash[*addr].len =len;
309 h->hash[*addr].posInVoc = h->NumElem;
312 //fprintf(stderr,"\n####inserted word [%s] ",h->hash[*addr].word);
317 /*-----------------------------------------------------------------------
318 Search for a word in the hash table and returns its position in the
319 vocabulary. It returns the next "available" position in the vocabulary,
320 if the word is not in the hash table. That is: a "0-node" position.
321 It also returns -using attribute returnedAddr- the position where word
322 was found (or where it should go if it was inserted in next "insert call".
323 -----------------------------------------------------------------------*/
324 unsigned long search (t_hash h, const unsigned char *aWord, register unsigned len,
325 unsigned long *returnedAddr){
327 register unsigned long addr, Saddr;
329 //fprintf(stderr,"\n searching for [%s], [%d], sizehash= %ld",aWord,len,h->SIZE_HASH);
330 addr = hashFunction(aWord,len, h->SIZE_HASH);
336 while((hash[addr].word != NULL)&&((strcomp(hash[addr].word, aWord, hash[addr].len, len)) != 0)) {
337 //fprintf(stderr,"\nComprueba [%s], [%d]",hash[addr].word,strcomp(hash[addr].word, aWord, len));
338 addr = ((addr + JUMP) % h->SIZE_HASH);
341 *returnedAddr = addr;
343 if(hash[addr].word == NULL) {
344 // fprintf(stderr, "word:'%.*s', hash %lu\n", len, aWord, Saddr);
345 return h->NumElem; //Word was not found
348 return h->hash[addr].posInVoc; //Word was found in this position in the vocabulary
354 /*-----------------------------------------------------------------------
355 Tells if a word appears or not in the hash table.
356 -----------------------------------------------------------------------*/
357 unsigned long inHashTable (t_hash h, const unsigned char *aWord, register unsigned len, unsigned long *returnedAddr){
359 unsigned long searched;
360 unsigned long nothing;
361 searched = search(h,aWord,len,¬hing);
362 *returnedAddr=nothing;
363 return (searched < (h->NumElem) );
366 /*------------------------------------------------------------------
368 ------------------------------------------------------------------ */
369 void destroy_hash (t_hash hash){
372 mem = sizeof(struct hashStr) + hash->SIZE_HASH * sizeof(t_hashNode);
373 printf("\n[destroying hash table]...Freed %ld bytes... RAM\n", mem);
377 destroyMemoryManager(hash->_memMgr); //frees words and variants
378 /// free(hash->_memMgr);
385 /*------------------------------------------------------------------
386 main, to make unit proofs
387 ------------------------------------------------------------------ */
389 int main(int argc, char* argv[])
390 { byte a[10]= "word1";
398 unsigned long i,addrInTH;
402 _memMgr=createMemoryManager();
404 hash = initialize_hash (2);
407 i = search (hash,w, strlen(w), &addrInTH );
408 insertElement (hash, w, strlen(w), &addrInTH);
409 fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
410 fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
413 i = search (hash,w, strlen(w), &addrInTH );
414 insertElement (hash, w, strlen(w), &addrInTH);
415 fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
416 fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
419 i = search (hash,w, strlen(w), &addrInTH );
420 insertElement (hash, w, strlen(w), &addrInTH);
421 fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
422 fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
425 i = search (hash,w, strlen(w), &addrInTH );
426 insertElement (hash, w, strlen(w), &addrInTH);
427 fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
428 fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
431 // i = search (hash,w, strlen(w), &addrInTH );
432 // insertElement (hash, w, strlen(w), &addrInTH);
433 // fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
434 // fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
437 // i = search (hash,w, strlen(w), &addrInTH );
438 // insertElement (hash, w, strlen(w), &addrInTH);
439 // fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
440 // fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
442 fprintf(stderr,"\n in: %s hash ? = %d",a,inHashTable(hash,a,strlen(a)) );
443 fprintf(stderr,"\n in: %s hash ? = %d",e, inHashTable(hash,e,strlen(e)) );
444 fprintf(stderr,"\n in: %s hash ? = %d",b, inHashTable(hash,b,strlen(b)) );
447 destroyMemoryManager(_memMgr);