swcsa/utils/hash.c

   1
   2 /* DYNAMIC END-TAGGED DENSE CODE. --
   3 A dynamic word-based byte oriented compressor for text files based on
   4 dynamic End-Tagged Dense Code.
   5
   6 Brisaboa, N. R., Faria, A., Navarro, G., Param, J. R.
   7 Simple, Fast, and Efficient Natural Language Adaptive Compression.
   8 11th International Symposium on String Processing and Information Retrieval (SPIRE'04) - LNCS 3246. A. Apostolico, M. Melucci (Ed.), pp. 230-241.
   9 Padova (Italia), 2004.
  10
  11 Copyright (C) 2005 Antonio Faria.
  12
  13 This program is free software; you can redistribute it and/or
  14 modify it under the terms of the GNU General Public License
  15 as published by the Free Software Foundation; either version 2
  16 of the License, or (at your option) any later version.
  17
  18 This program is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with this program; if not, write to the Free Software
  25 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  26
  27 Author's contact: Antonio Faria, Dept. of Computer Science, University of
  28 A Corua. Campus de Elvia s/n. Spain  fari@udc.es
  29 */
  30
  31 /*-----------------------------------------------------------------------
  32  Hash: Definition of HashTable class (Linear Hash)
  33  ------------------------------------------------------------------------*/
  34
  35 #include "hash.h"
  36
  37 #include <stdio.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41
  42 /*-----------------------------------------------------------------
  43  Initilization of data structures used by the hashTable
  44  ---------------------------------------------------------------- */
  45 t_hash initialize_hash (unsigned long sizeVoc) {
  46         t_hash h;
  47         unsigned long i;
  48
  49         h = (t_hash) malloc(sizeof(struct hashStr));
  50         unsigned long m = 6 * sizeVoc;
  51         h->SIZE_HASH = sizeVoc;
  52         do {
  53           h->SIZE_HASH = NearestPrime(h->SIZE_HASH);
  54         } while (h->SIZE_HASH < m);
  55
  56         h->hash = (t_hashNode *) malloc(h->SIZE_HASH * sizeof(t_hashNode));
  57         h->NumElem = 0;
  58
  59         //creates the memory manager that is used to reserve small pieces of memory (for words)
  60         h->_memMgr=createMemoryManager();
  61
  62
  63         for (i = 0; i < h->SIZE_HASH; i++)    {
  64                 h->hash[i].word = NULL;
  65                 h->hash[i].len = 0;
  66                 h->hash[i].posInVoc = 0;
  67         }
  68
  69         return h;
  70 }
  71
  72
  73 /*------------------------------------------------------------------
  74  Find the nearest prime number over n.
  75  ---------------------------------------------------------------- */
  76 unsigned long NearestPrime(unsigned long n)
  77 {
  78     long position;  /* the prime number being sought */
  79     long index;
  80     for (position = n; ; position++)
  81     {
  82         // checks if those values from 2 to $\sqrt{m}$ can be factors of $m$ */
  83         for (index = 2; index <= (long) sqrt((double) position) && position % index != 0; index++) ;
  84
  85         if (position % index != 0)  /* No factors in that range, therefore a prime number was found */
  86         {
  87             break;
  88         }
  89     }
  90     return position;
  91 }
  92
  93
  94
  95 /*-----------------------------------------------------------------------
  96  Computes a hash function over a string "aWord", it returns the position
  97  where "aWord" should go in the hash table if no collision ocurrs.
  98  ---------------------------------------------------------------------*/
  99 unsigned long hashFunction (const unsigned char *aWord, unsigned int len, unsigned long sizeHash)
 100 {
 101     char        c;
 102     register unsigned int h;
 103     register unsigned long last;
 104     last=((unsigned long) aWord) +len -1;
 105
 106
 107     h = SEED;
 108     //last=aWord+len;
 109
 110     for( ; ((unsigned long) aWord <=last ) ; )
 111     //for(c=*aWord; (aWord++)<last ;  )
 112     {
 113         c=*(aWord++);
 114
 115                 //c=*aWord;
 116         h ^= ( (h << 5) + c + (h >> 2) );
 117     }
 118
 119     return((unsigned int)((h&0x7fffffff) % sizeHash));
 120 }
 121
 122 unsigned long newhashFunction(const unsigned char *aWord, unsigned int len, unsigned long sizeHash)
 123 {
 124   unsigned int h = 0;
 125   unsigned int i = 0;
 126   for(i = 0; i  < len; i ++)
 127     h = ((unsigned int) aWord[i]) + (h << 6) + (h << 16) - h;
 128   return (h&0x7fffffff) % sizeHash;
 129 }
 130
 131 /*-----------------------------------------------------------------------
 132   compares two strings
 133  ---------------------------------------------------------------------*/
 134
 135 /* Author J. Zobel, April 2001.
 136         http://www.seg.rmit.edu.au/code/zwh-ipl/
 137    Permission to use this code is freely granted, provided that this
 138    statement is retained. */
 139
 140
 141
 142  int strcomp(const unsigned char *s1, const unsigned char *s2, register unsigned long ws1, unsigned long ws2) {
 143          if (ws1 !=ws2) {
 144                         return -1;
 145          }
 146
 147          {  register unsigned long iters;
 148                  iters=1;
 149             while( iters<ws1 && *s1 == *s2 )
 150             {
 151                         s1++;
 152                         s2++;
 153                         iters++;
 154             }
 155             //fprintf(stderr,"\nDevuelve [%d]",*s1-*s2);
 156             return( *s1-*s2 );
 157          }
 158 }
 159
 160 inline int strcomp3(const unsigned char *s1, const unsigned char *s2, unsigned long ws1, unsigned long ws2) {
 161
 162          if (ws1 !=ws2) {
 163                         return -1;
 164          }
 165
 166          {  register unsigned long iters;
 167             register unsigned long end;
 168             end = MIN(ws1,ws2);
 169                  iters=1;
 170             while( iters<end && *s1 == *s2 )
 171             {
 172                         s1++;
 173                         s2++;
 174                         iters++;
 175             }
 176             //fprintf(stderr,"\nDevuelve [%d]",*s1-*s2);
 177             return( *s1-*s2 );
 178          }
 179 }
 180
 181
 182 /* Author J. Zobel, April 2001.
 183         http://www.seg.rmit.edu.au/code/zwh-ipl/
 184    Permission to use this code is freely granted, provided that this
 185    statement is retained. */
 186
 187 //int strcomp(const unsigned char *s1, const unsigned char *s2, unsigned long ws) {
 188 //    register unsigned long i;
 189 //    i=0;
 190 //    while( i < ws-1 && *s1 == *s2 )*/
 191 //    {
 192 //                      s1++;
 193 //                      s2++;
 194 //                      i++;
 195 //    }
 196 //    return( *s1-*s2 );
 197 //}
 198
 199 //int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) {
 200 //      register ulong iters = 1;
 201 //
 202 //      while( iters<ws1 && *s1 == *s2 ){
 203 //              s1++;
 204 //              s2++;
 205 //              iters++;
 206 //      }
 207 //        // so     iters == ws1   OR   *s1 != *s2
 208 //      if (ws1 == iters) {
 209 //                      if (ws2 == ws1)
 210 //                              return 0;  // s1 equals to s2
 211 //                      else
 212 //                              return -1;   // s1 < s2
 213 //      }
 214 //      else { //(ws1 > iters) so s1 != s2
 215 //              return( *s1-*s2);
 216 //      }
 217 //}
 218
 219
 220 //permits to compare 2 strings of len ws1 and ws2 that do not end in '\0'
 221 int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) {
 222         register ulong iters = 0;
 223
 224         while( iters<ws1 && iters<ws2 && *s1 == *s2 ){
 225                 s1++;
 226                 s2++;
 227                 iters++;
 228         }
 229
 230         if (ws1 == iters) {
 231                         if (ws2 == ws1)
 232                                 return 0;  // s1 equals to s2
 233                         else
 234                                 return -1;   // w1 < w2  and  *s1_i == *s2_i for i=0 to iters-1
 235         }
 236         else
 237         if (ws2 == iters) {
 238                         if (ws2 == ws1)
 239                                 return 0;  // s1 equals to s2
 240                         else
 241                                 return +1;   // w2 < w1  and  *ws1 is '\0'
 242         }
 243         else { //*s1 != *s2
 244                 return (*s1-*s2);
 245         }
 246
 247 }
 248
 249
 250 /*-----------------------------------------------------------------------
 251  Insert a new word in a position of the hashTable (position previously computed)
 252  ---------------------------------------------------------------------*/
 253
 254 void insertAux(t_hash h, unsigned char *aWord, unsigned long len, unsigned long pos)
 255 {
 256   unsigned long addr = 0;
 257   unsigned long dummy = search(h, aWord, len, &addr);
 258   h->hash[addr].word = aWord;
 259   h->hash[addr].len = len;
 260   h->hash[addr].posInVoc = pos;
 261   h->NumElem++;
 262 }
 263
 264 void resize(t_hash h){
 265   unsigned long oldsize = h->SIZE_HASH;
 266   unsigned long newsize = NearestPrime(h->SIZE_HASH * 2);
 267   t_hashNode* oldh = h->hash;
 268   unsigned long l = newsize * sizeof(t_hashNode);
 269   t_hashNode* newh = (t_hashNode *) malloc(newsize * sizeof(t_hashNode));
 270   unsigned long i;
 271   fprintf(stderr, "Resizing hashtable from %lu to %lu (%lu bytes)\n", oldsize, newsize, l);
 272   h->hash = newh;
 273   h->SIZE_HASH = newsize;
 274   for (i = 0; i < h->SIZE_HASH; i++)    {
 275     h->hash[i].word = NULL;
 276     h->hash[i].len = 0;
 277     h->hash[i].posInVoc = 0;
 278   }
 279   h->NumElem = 0;
 280   for(i = 0; i < oldsize; i++)
 281     if (oldh[i].word != NULL){
 282       insertAux(h, oldh[i].word, oldh[i].len, oldh[i].posInVoc);
 283       oldh[i].word = NULL;
 284       oldh[i].len = 0;
 285       oldh[i].posInVoc=0;
 286     }
 287   //free(oldh);
 288
 289 }
 290
 291
 292 void insertElement (t_hash h, const unsigned char *aWord, register unsigned long len, register unsigned long *addr) {
 293     //fprintf(stderr,"\n Entra en la funcin [%s], [%ld]",aWord, len);
 294
 295         if(h->NumElem >= h->SIZE_HASH/2)
 296         {
 297
 298          resize(h);
 299
 300         }
 301         //h->hash[*addr].word = (unsigned char*) malloc((len+1) * sizeof(unsigned char));
 302         getMemoryBlock(h->_memMgr,( byte **)&(h->hash[*addr].word),len+1);
 303         //fprintf(stderr,"\n tras obter memoria");
 304
 305         strncpy ((char *) h->hash[*addr].word, (char *)aWord, len);
 306
 307         h->hash[*addr].word[len]='\0';
 308         h->hash[*addr].len =len;
 309         h->hash[*addr].posInVoc = h->NumElem;
 310         h->NumElem++;
 311
 312         //fprintf(stderr,"\n####inserted word [%s] ",h->hash[*addr].word);
 313
 314         //return *addr;
 315 }
 316
 317 /*-----------------------------------------------------------------------
 318  Search for a word in the hash table and returns its position in the
 319  vocabulary. It returns the next "available" position in the vocabulary,
 320  if the word is not in the hash table. That is: a "0-node" position.
 321  It also returns -using attribute returnedAddr- the position where word
 322  was found (or where it should go if it was inserted in next "insert call".
 323  -----------------------------------------------------------------------*/
 324 unsigned long search (t_hash h, const unsigned char *aWord, register unsigned len,
 325                                                                  unsigned long *returnedAddr){
 326
 327         register unsigned long addr, Saddr;
 328
 329         //fprintf(stderr,"\n searching for  [%s], [%d], sizehash= %ld",aWord,len,h->SIZE_HASH);
 330         addr = hashFunction(aWord,len, h->SIZE_HASH);
 331         Saddr = addr;
 332
 333         t_hashNode *hash;
 334         hash = h->hash;
 335
 336         while((hash[addr].word  != NULL)&&((strcomp(hash[addr].word, aWord,  hash[addr].len, len)) != 0))  {
 337                 //fprintf(stderr,"\nComprueba [%s], [%d]",hash[addr].word,strcomp(hash[addr].word, aWord, len));
 338                 addr = ((addr + JUMP) % h->SIZE_HASH);
 339         }
 340
 341         *returnedAddr = addr;
 342
 343         if(hash[addr].word  == NULL) {
 344           //      fprintf(stderr, "word:'%.*s', hash %lu\n", len, aWord, Saddr);
 345           return h->NumElem;                            //Word was not found
 346         }
 347         else {
 348                 return h->hash[addr].posInVoc; //Word was found in this position in the vocabulary
 349         }
 350 }
 351
 352
 353
 354 /*-----------------------------------------------------------------------
 355   Tells if a word appears or not in the hash table.
 356  -----------------------------------------------------------------------*/
 357 unsigned long inHashTable (t_hash h, const unsigned char *aWord, register unsigned len, unsigned long *returnedAddr){
 358
 359         unsigned long searched;
 360         unsigned long nothing;
 361         searched = search(h,aWord,len,&nothing);
 362         *returnedAddr=nothing;
 363         return (searched < (h->NumElem) );
 364 }
 365
 366 /*------------------------------------------------------------------
 367  Destructor method
 368  ------------------------------------------------------------------ */
 369 void destroy_hash (t_hash hash){
 370         unsigned long mem=0;
 371         unsigned long i = 0;
 372         mem = sizeof(struct hashStr) + hash->SIZE_HASH * sizeof(t_hashNode);
 373         printf("\n[destroying hash table]...Freed %ld bytes... RAM\n", mem);
 374         fflush(stdout);
 375
 376         free(hash->hash);
 377         destroyMemoryManager(hash->_memMgr); //frees words and variants
 378 ///     free(hash->_memMgr);
 379         free(hash);
 380 }
 381
 382
 383
 384
 385 /*------------------------------------------------------------------
 386  main, to make unit proofs
 387  ------------------------------------------------------------------ */
 388 /*
 389 int main(int argc, char* argv[])
 390 {   byte a[10]= "word1";
 391         byte b[10]= "word2";
 392         byte c[10]= "word3";
 393         byte d[10]= "word4";
 394         byte e[10]= "word5";
 395         byte f[10]= "word6";
 396         byte * w;
 397         unsigned int size;
 398         unsigned long i,addrInTH;
 399
 400         t_hash hash;
 401
 402         _memMgr=createMemoryManager();
 403
 404         hash = initialize_hash (2);
 405
 406         w=a;
 407         i = search (hash,w, strlen(w), &addrInTH );
 408         insertElement (hash, w, strlen(w), &addrInTH);
 409         fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
 410         fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
 411
 412         w=b;
 413         i = search (hash,w, strlen(w), &addrInTH );
 414         insertElement (hash, w, strlen(w), &addrInTH);
 415         fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
 416         fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
 417
 418         w=c;
 419         i = search (hash,w, strlen(w), &addrInTH );
 420         insertElement (hash, w, strlen(w), &addrInTH);
 421         fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
 422         fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
 423
 424         w=d;
 425         i = search (hash,w, strlen(w), &addrInTH );
 426         insertElement (hash, w, strlen(w), &addrInTH);
 427         fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
 428         fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
 429
 430 //      w=e;
 431 //      i = search (hash,w, strlen(w), &addrInTH );
 432 //      insertElement (hash, w, strlen(w), &addrInTH);
 433 //      fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
 434 //      fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
 435
 436 //      w=f;
 437 //      i = search (hash,w, strlen(w), &addrInTH );
 438 //      insertElement (hash, w, strlen(w), &addrInTH);
 439 //      fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
 440 //      fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
 441
 442         fprintf(stderr,"\n in: %s  hash ? = %d",a,inHashTable(hash,a,strlen(a)) );
 443         fprintf(stderr,"\n in: %s  hash ? = %d",e, inHashTable(hash,e,strlen(e)) );
 444         fprintf(stderr,"\n in: %s  hash ? = %d",b, inHashTable(hash,b,strlen(b)) );
 445
 446         destroy_hash(hash);
 447         destroyMemoryManager(_memMgr);
 448         printf("\n\n");
 449 }
 450 */