2 /* DYNAMIC END-TAGGED DENSE CODE. --
3 A dynamic word-based byte oriented compressor for text files based on
4 dynamic End-Tagged Dense Code.
6 Brisaboa, N. R., Faria, A., Navarro, G., Param, J. R.
7 Simple, Fast, and Efficient Natural Language Adaptive Compression.
8 11th International Symposium on String Processing and Information Retrieval (SPIRE'04) - LNCS 3246. A. Apostolico, M. Melucci (Ed.), pp. 230-241.
11 Copyright (C) 2005 Antonio Faria.
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License
15 as published by the Free Software Foundation; either version 2
16 of the License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, write to the Free Software
25 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
27 Author's contact: Antonio Faria, Dept. of Computer Science, University of
28 A Corua. Campus de Elvia s/n. Spain fari@udc.es
31 /*-----------------------------------------------------------------------
32 Hash: Definition of HashTable class (Linear Hash)
33 ------------------------------------------------------------------------*/
42 /*-----------------------------------------------------------------
43 Initilization of data structures used by the hashTable
44 ---------------------------------------------------------------- */
45 t_hash initialize_hash (unsigned long sizeVoc) {
49 h = (t_hash) malloc(sizeof(struct hashStr));
50 h->SIZE_HASH = (unsigned long) (OCUP_HASH * sizeVoc);
51 h->SIZE_HASH = NearestPrime(h->SIZE_HASH);
52 h->hash = (t_hashNode *) malloc(h->SIZE_HASH * sizeof(t_hashNode));
55 //creates the memory manager that is used to reserve small pieces of memory (for words)
56 h->_memMgr=createMemoryManager();
59 for (i = 0; i < h->SIZE_HASH; i++) {
60 h->hash[i].word = NULL;
62 h->hash[i].posInVoc = 0;
64 printf("\nHash Table initilized with: %lu elements\n",h->SIZE_HASH);
70 /*------------------------------------------------------------------
71 Find the nearest prime number over n.
72 ---------------------------------------------------------------- */
73 unsigned long NearestPrime(unsigned long n)
75 long position; /* the prime number being sought */
78 for (position = n; ; position++)
80 // checks if those values from 2 to $\sqrt{m}$ can be factors of $m$ */
81 for (index = 2; index <= (long) sqrt((double) position) && position % index != 0; index++) ;
83 if (position % index != 0) /* No factors in that range, therefore a prime number was found */
93 /*-----------------------------------------------------------------------
94 Computes a hash function over a string "aWord", it returns the position
95 where "aWord" should go in the hash table if no collision ocurrs.
96 ---------------------------------------------------------------------*/
97 unsigned long hashFunction (const unsigned char *aWord, unsigned int len, unsigned long sizeHash)
100 register unsigned int h;
101 register unsigned long last;
102 last=((unsigned long) aWord) +len -1;
108 for( ; ((unsigned long) aWord <=last ) ; )
109 //for(c=*aWord; (aWord++)<last ; )
113 h ^= ( (h << 5) + c + (h >> 2) );
115 return((unsigned int)((h&0x7fffffff) % sizeHash));
119 /*-----------------------------------------------------------------------
121 ---------------------------------------------------------------------*/
123 /* Author J. Zobel, April 2001.
124 http://www.seg.rmit.edu.au/code/zwh-ipl/
125 Permission to use this code is freely granted, provided that this
126 statement is retained. */
130 int strcomp(const unsigned char *s1, const unsigned char *s2, register unsigned long ws1, unsigned long ws2) {
135 { register unsigned long iters;
137 while( iters<ws1 && *s1 == *s2 )
143 //fprintf(stderr,"\nDevuelve [%d]",*s1-*s2);
148 inline int strcomp3(const unsigned char *s1, const unsigned char *s2, unsigned long ws1, unsigned long ws2) {
154 { register unsigned long iters;
155 register unsigned long end;
158 while( iters<end && *s1 == *s2 )
164 //fprintf(stderr,"\nDevuelve [%d]",*s1-*s2);
170 /* Author J. Zobel, April 2001.
171 http://www.seg.rmit.edu.au/code/zwh-ipl/
172 Permission to use this code is freely granted, provided that this
173 statement is retained. */
175 //int strcomp(const unsigned char *s1, const unsigned char *s2, unsigned long ws) {
176 // register unsigned long i;
178 // while( i < ws-1 && *s1 == *s2 )*/
184 // return( *s1-*s2 );
187 //int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) {
188 // register ulong iters = 1;
190 // while( iters<ws1 && *s1 == *s2 ){
195 // // so iters == ws1 OR *s1 != *s2
196 // if (ws1 == iters) {
198 // return 0; // s1 equals to s2
200 // return -1; // s1 < s2
202 // else { //(ws1 > iters) so s1 != s2
208 //permits to compare 2 strings of len ws1 and ws2 that do not end in '\0'
209 int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) {
210 register ulong iters = 0;
212 while( iters<ws1 && iters<ws2 && *s1 == *s2 ){
220 return 0; // s1 equals to s2
222 return -1; // w1 < w2 and *s1_i == *s2_i for i=0 to iters-1
227 return 0; // s1 equals to s2
229 return +1; // w2 < w1 and *ws1 is '\0'
238 /*-----------------------------------------------------------------------
239 Insert a new word in a position of the hashTable (position previously computed)
240 ---------------------------------------------------------------------*/
241 void insertElement (t_hash h, const unsigned char *aWord, register unsigned long len, register unsigned long *addr) {
242 //fprintf(stderr,"\n Entra en la funcin [%s], [%ld]",aWord, len);
244 if(h->NumElem >= h->SIZE_HASH -1) //loses 1 slot, but ensures that "search function" does not enter an infinity loop
246 printf("\n\nHash table full!! Change size and recompile !\n");
250 getMemoryBlock(h->_memMgr,( byte **)&(h->hash[*addr].word),len+1);
251 //fprintf(stderr,"\n tras obter memoria");
253 strncpy ((char *) h->hash[*addr].word, (char *)aWord, len);
255 h->hash[*addr].word[len]='\0';
256 h->hash[*addr].len =len;
257 h->hash[*addr].posInVoc = h->NumElem;
260 //fprintf(stderr,"\n####inserted word [%s] ",h->hash[*addr].word);
265 /*-----------------------------------------------------------------------
266 Search for a word in the hash table and returns its position in the
267 vocabulary. It returns the next "available" position in the vocabulary,
268 if the word is not in the hash table. That is: a "0-node" position.
269 It also returns -using attribute returnedAddr- the position where word
270 was found (or where it should go if it was inserted in next "insert call".
271 -----------------------------------------------------------------------*/
272 unsigned long search (t_hash h, const unsigned char *aWord, register unsigned len,
273 unsigned long *returnedAddr){
275 register unsigned long addr, Saddr;
277 //fprintf(stderr,"\n searching for [%s], [%d], sizehash= %ld",aWord,len,h->SIZE_HASH);
278 addr = hashFunction(aWord,len, h->SIZE_HASH);
284 while((hash[addr].word != NULL)&&((strcomp(hash[addr].word, aWord, hash[addr].len, len)) != 0)) {
285 //fprintf(stderr,"\nComprueba [%s], [%d]",hash[addr].word,strcomp(hash[addr].word, aWord, len));
286 addr = ((addr + JUMP) % h->SIZE_HASH);
289 *returnedAddr = addr;
291 if(hash[addr].word == NULL) {
292 return h->NumElem; //Word was not found
295 return h->hash[addr].posInVoc; //Word was found in this position in the vocabulary
301 /*-----------------------------------------------------------------------
302 Tells if a word appears or not in the hash table.
303 -----------------------------------------------------------------------*/
304 unsigned long inHashTable (t_hash h, const unsigned char *aWord, register unsigned len, unsigned long *returnedAddr){
306 unsigned long searched;
307 unsigned long nothing;
308 searched = search(h,aWord,len,¬hing);
309 *returnedAddr=nothing;
310 return (searched < (h->NumElem) );
313 /*------------------------------------------------------------------
315 ------------------------------------------------------------------ */
316 void destroy_hash (t_hash hash){
318 mem += sizeof(struct hashStr) + hash->SIZE_HASH * sizeof(t_hashNode);
320 destroyMemoryManager(hash->_memMgr); //frees words and variants
321 // free(hash->_memMgr);
323 printf("\n[destroying hash table]...Freed %ld bytes... RAM", mem);
329 /*------------------------------------------------------------------
330 main, to make unit proofs
331 ------------------------------------------------------------------ */
333 int main(int argc, char* argv[])
334 { byte a[10]= "word1";
342 unsigned long i,addrInTH;
346 _memMgr=createMemoryManager();
348 hash = initialize_hash (2);
351 i = search (hash,w, strlen(w), &addrInTH );
352 insertElement (hash, w, strlen(w), &addrInTH);
353 fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
354 fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
357 i = search (hash,w, strlen(w), &addrInTH );
358 insertElement (hash, w, strlen(w), &addrInTH);
359 fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
360 fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
363 i = search (hash,w, strlen(w), &addrInTH );
364 insertElement (hash, w, strlen(w), &addrInTH);
365 fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
366 fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
369 i = search (hash,w, strlen(w), &addrInTH );
370 insertElement (hash, w, strlen(w), &addrInTH);
371 fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
372 fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
375 // i = search (hash,w, strlen(w), &addrInTH );
376 // insertElement (hash, w, strlen(w), &addrInTH);
377 // fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
378 // fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
381 // i = search (hash,w, strlen(w), &addrInTH );
382 // insertElement (hash, w, strlen(w), &addrInTH);
383 // fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
384 // fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
386 fprintf(stderr,"\n in: %s hash ? = %d",a,inHashTable(hash,a,strlen(a)) );
387 fprintf(stderr,"\n in: %s hash ? = %d",e, inHashTable(hash,e,strlen(e)) );
388 fprintf(stderr,"\n in: %s hash ? = %d",b, inHashTable(hash,b,strlen(b)) );
391 destroyMemoryManager(_memMgr);