2 /* DYNAMIC END-TAGGED DENSE CODE. --
3 A dynamic word-based byte oriented compressor for text files based on
4 dynamic End-Tagged Dense Code.
6 Brisaboa, N. R., Faria, A., Navarro, G., Param, J. R.
7 Simple, Fast, and Efficient Natural Language Adaptive Compression.
8 11th International Symposium on String Processing and Information Retrieval (SPIRE'04) - LNCS 3246. A. Apostolico, M. Melucci (Ed.), pp. 230-241.
11 Copyright (C) 2005 Antonio Faria.
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License
15 as published by the Free Software Foundation; either version 2
16 of the License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, write to the Free Software
25 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
27 Author's contact: Antonio Faria, Dept. of Computer Science, University of
28 A Corua. Campus de Elvia s/n. Spain fari@udc.es
31 /*-----------------------------------------------------------------------
32 Hash: Definition of HashTable class (Linear Hash)
33 ------------------------------------------------------------------------*/
42 /*-----------------------------------------------------------------
43 Initilization of data structures used by the hashTable
44 ---------------------------------------------------------------- */
45 t_hash initialize_hash (unsigned long sizeVoc) {
49 h = (t_hash) malloc(sizeof(struct hashStr));
50 h->SIZE_HASH = (unsigned long) (OCUP_HASH * sizeVoc);
51 h->SIZE_HASH = NearestPrime(h->SIZE_HASH);
52 h->hash = (t_hashNode *) malloc(h->SIZE_HASH * sizeof(t_hashNode));
55 //creates the memory manager that is used to reserve small pieces of memory (for words)
56 h->_memMgr=createMemoryManager();
59 for (i = 0; i < h->SIZE_HASH; i++) {
60 h->hash[i].word = NULL;
62 h->hash[i].posInVoc = 0;
69 /*------------------------------------------------------------------
70 Find the nearest prime number over n.
71 ---------------------------------------------------------------- */
72 unsigned long NearestPrime(unsigned long n)
74 long position; /* the prime number being sought */
76 for (position = n; ; position++)
78 // checks if those values from 2 to $\sqrt{m}$ can be factors of $m$ */
79 for (index = 2; index <= (long) sqrt((double) position) && position % index != 0; index++) ;
81 if (position % index != 0) /* No factors in that range, therefore a prime number was found */
91 /*-----------------------------------------------------------------------
92 Computes a hash function over a string "aWord", it returns the position
93 where "aWord" should go in the hash table if no collision ocurrs.
94 ---------------------------------------------------------------------*/
95 unsigned long hashFunction (const unsigned char *aWord, unsigned int len, unsigned long sizeHash)
98 register unsigned int h;
99 register unsigned long last;
100 last=((unsigned long) aWord) +len -1;
106 for( ; ((unsigned long) aWord <=last ) ; )
107 //for(c=*aWord; (aWord++)<last ; )
112 h ^= ( (h << 5) + c + (h >> 2) );
115 return((unsigned int)((h&0x7fffffff) % sizeHash));
118 unsigned long newhashFunction(const unsigned char *aWord, unsigned int len, unsigned long sizeHash)
122 for(i = 0; i < len; i ++)
123 h = ((unsigned int) aWord[i]) + (h << 6) + (h << 16) - h;
124 return (h&0x7fffffff) % sizeHash;
127 /*-----------------------------------------------------------------------
129 ---------------------------------------------------------------------*/
131 /* Author J. Zobel, April 2001.
132 http://www.seg.rmit.edu.au/code/zwh-ipl/
133 Permission to use this code is freely granted, provided that this
134 statement is retained. */
138 int strcomp(const unsigned char *s1, const unsigned char *s2, register unsigned long ws1, unsigned long ws2) {
143 { register unsigned long iters;
145 while( iters<ws1 && *s1 == *s2 )
151 //fprintf(stderr,"\nDevuelve [%d]",*s1-*s2);
156 inline int strcomp3(const unsigned char *s1, const unsigned char *s2, unsigned long ws1, unsigned long ws2) {
162 { register unsigned long iters;
163 register unsigned long end;
166 while( iters<end && *s1 == *s2 )
172 //fprintf(stderr,"\nDevuelve [%d]",*s1-*s2);
178 /* Author J. Zobel, April 2001.
179 http://www.seg.rmit.edu.au/code/zwh-ipl/
180 Permission to use this code is freely granted, provided that this
181 statement is retained. */
183 //int strcomp(const unsigned char *s1, const unsigned char *s2, unsigned long ws) {
184 // register unsigned long i;
186 // while( i < ws-1 && *s1 == *s2 )*/
192 // return( *s1-*s2 );
195 //int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) {
196 // register ulong iters = 1;
198 // while( iters<ws1 && *s1 == *s2 ){
203 // // so iters == ws1 OR *s1 != *s2
204 // if (ws1 == iters) {
206 // return 0; // s1 equals to s2
208 // return -1; // s1 < s2
210 // else { //(ws1 > iters) so s1 != s2
216 //permits to compare 2 strings of len ws1 and ws2 that do not end in '\0'
217 int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) {
218 register ulong iters = 0;
220 while( iters<ws1 && iters<ws2 && *s1 == *s2 ){
228 return 0; // s1 equals to s2
230 return -1; // w1 < w2 and *s1_i == *s2_i for i=0 to iters-1
235 return 0; // s1 equals to s2
237 return +1; // w2 < w1 and *ws1 is '\0'
246 /*-----------------------------------------------------------------------
247 Insert a new word in a position of the hashTable (position previously computed)
248 ---------------------------------------------------------------------*/
250 void insertAux(t_hash h, unsigned char *aWord, unsigned long len, unsigned long pos)
252 unsigned long addr = 0;
253 unsigned long dummy = search(h, aWord, len, &addr);
254 h->hash[addr].word = aWord;
255 h->hash[addr].len = len;
256 h->hash[addr].posInVoc = pos;
260 void resize(t_hash h){
261 unsigned long oldsize = h->SIZE_HASH;
262 unsigned long newsize = NearestPrime(h->SIZE_HASH * 2);
263 t_hashNode* oldh = h->hash;
264 unsigned long l = newsize * sizeof(t_hashNode);
265 t_hashNode* newh = (t_hashNode *) malloc(newsize * sizeof(t_hashNode));
267 fprintf(stderr, "Resizing hashtable from %lu to %lu (%lu bytes)\n", oldsize, newsize, l);
269 h->SIZE_HASH = newsize;
270 for (i = 0; i < h->SIZE_HASH; i++) {
271 h->hash[i].word = NULL;
273 h->hash[i].posInVoc = 0;
276 for(i = 0; i < oldsize; i++)
277 if (oldh[i].word != NULL){
278 insertAux(h, oldh[i].word, oldh[i].len, oldh[i].posInVoc);
288 void insertElement (t_hash h, const unsigned char *aWord, register unsigned long len, register unsigned long *addr) {
289 //fprintf(stderr,"\n Entra en la funcin [%s], [%ld]",aWord, len);
291 if(h->NumElem >= h->SIZE_HASH/2)
297 //h->hash[*addr].word = (unsigned char*) malloc((len+1) * sizeof(unsigned char));
298 getMemoryBlock(h->_memMgr,( byte **)&(h->hash[*addr].word),len+1);
299 //fprintf(stderr,"\n tras obter memoria");
301 strncpy ((char *) h->hash[*addr].word, (char *)aWord, len);
303 h->hash[*addr].word[len]='\0';
304 h->hash[*addr].len =len;
305 h->hash[*addr].posInVoc = h->NumElem;
308 //fprintf(stderr,"\n####inserted word [%s] ",h->hash[*addr].word);
313 /*-----------------------------------------------------------------------
314 Search for a word in the hash table and returns its position in the
315 vocabulary. It returns the next "available" position in the vocabulary,
316 if the word is not in the hash table. That is: a "0-node" position.
317 It also returns -using attribute returnedAddr- the position where word
318 was found (or where it should go if it was inserted in next "insert call".
319 -----------------------------------------------------------------------*/
320 unsigned long search (t_hash h, const unsigned char *aWord, register unsigned len,
321 unsigned long *returnedAddr){
323 register unsigned long addr, Saddr;
325 //fprintf(stderr,"\n searching for [%s], [%d], sizehash= %ld",aWord,len,h->SIZE_HASH);
326 addr = hashFunction(aWord,len, h->SIZE_HASH);
332 while((hash[addr].word != NULL)&&((strcomp(hash[addr].word, aWord, hash[addr].len, len)) != 0)) {
333 //fprintf(stderr,"\nComprueba [%s], [%d]",hash[addr].word,strcomp(hash[addr].word, aWord, len));
334 addr = ((addr + JUMP) % h->SIZE_HASH);
337 *returnedAddr = addr;
339 if(hash[addr].word == NULL) {
340 // fprintf(stderr, "word:'%.*s', hash %lu\n", len, aWord, Saddr);
341 return h->NumElem; //Word was not found
344 return h->hash[addr].posInVoc; //Word was found in this position in the vocabulary
350 /*-----------------------------------------------------------------------
351 Tells if a word appears or not in the hash table.
352 -----------------------------------------------------------------------*/
353 unsigned long inHashTable (t_hash h, const unsigned char *aWord, register unsigned len, unsigned long *returnedAddr){
355 unsigned long searched;
356 unsigned long nothing;
357 searched = search(h,aWord,len,¬hing);
358 *returnedAddr=nothing;
359 return (searched < (h->NumElem) );
362 /*------------------------------------------------------------------
364 ------------------------------------------------------------------ */
365 void destroy_hash (t_hash hash){
368 mem = sizeof(struct hashStr) + hash->SIZE_HASH * sizeof(t_hashNode);
369 printf("\n[destroying hash table]...Freed %ld bytes... RAM\n", mem);
373 destroyMemoryManager(hash->_memMgr); //frees words and variants
374 /// free(hash->_memMgr);
381 /*------------------------------------------------------------------
382 main, to make unit proofs
383 ------------------------------------------------------------------ */
385 int main(int argc, char* argv[])
386 { byte a[10]= "word1";
394 unsigned long i,addrInTH;
398 _memMgr=createMemoryManager();
400 hash = initialize_hash (2);
403 i = search (hash,w, strlen(w), &addrInTH );
404 insertElement (hash, w, strlen(w), &addrInTH);
405 fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
406 fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
409 i = search (hash,w, strlen(w), &addrInTH );
410 insertElement (hash, w, strlen(w), &addrInTH);
411 fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
412 fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
415 i = search (hash,w, strlen(w), &addrInTH );
416 insertElement (hash, w, strlen(w), &addrInTH);
417 fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
418 fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
421 i = search (hash,w, strlen(w), &addrInTH );
422 insertElement (hash, w, strlen(w), &addrInTH);
423 fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
424 fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
427 // i = search (hash,w, strlen(w), &addrInTH );
428 // insertElement (hash, w, strlen(w), &addrInTH);
429 // fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
430 // fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
433 // i = search (hash,w, strlen(w), &addrInTH );
434 // insertElement (hash, w, strlen(w), &addrInTH);
435 // fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
436 // fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
438 fprintf(stderr,"\n in: %s hash ? = %d",a,inHashTable(hash,a,strlen(a)) );
439 fprintf(stderr,"\n in: %s hash ? = %d",e, inHashTable(hash,e,strlen(e)) );
440 fprintf(stderr,"\n in: %s hash ? = %d",b, inHashTable(hash,b,strlen(b)) );
443 destroyMemoryManager(_memMgr);