swcsa/utils/hash.c

   1
   2 /* DYNAMIC END-TAGGED DENSE CODE. --
   3 A dynamic word-based byte oriented compressor for text files based on
   4 dynamic End-Tagged Dense Code.
   5
   6 Brisaboa, N. R., Faria, A., Navarro, G., Param, J. R.
   7 Simple, Fast, and Efficient Natural Language Adaptive Compression.
   8 11th International Symposium on String Processing and Information Retrieval (SPIRE'04) - LNCS 3246. A. Apostolico, M. Melucci (Ed.), pp. 230-241.
   9 Padova (Italia), 2004.
  10
  11 Copyright (C) 2005 Antonio Faria.
  12
  13 This program is free software; you can redistribute it and/or
  14 modify it under the terms of the GNU General Public License
  15 as published by the Free Software Foundation; either version 2
  16 of the License, or (at your option) any later version.
  17
  18 This program is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with this program; if not, write to the Free Software
  25 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  26
  27 Author's contact: Antonio Faria, Dept. of Computer Science, University of
  28 A Corua. Campus de Elvia s/n. Spain  fari@udc.es
  29 */
  30
  31 /*-----------------------------------------------------------------------
  32  Hash: Definition of HashTable class (Linear Hash)
  33  ------------------------------------------------------------------------*/
  34
  35 #include "hash.h"
  36
  37 #include <stdio.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41
  42 /*-----------------------------------------------------------------
  43  Initilization of data structures used by the hashTable
  44  ---------------------------------------------------------------- */
  45 t_hash initialize_hash (unsigned long sizeVoc) {
  46         t_hash h;
  47         unsigned long i;
  48
  49         h = (t_hash) malloc(sizeof(struct hashStr));
  50         h->SIZE_HASH = (unsigned long) (OCUP_HASH * sizeVoc);
  51         h->SIZE_HASH = NearestPrime(h->SIZE_HASH);
  52         h->hash = (t_hashNode *) malloc(h->SIZE_HASH * sizeof(t_hashNode));
  53         h->NumElem = 0;
  54
  55         //creates the memory manager that is used to reserve small pieces of memory (for words)
  56         h->_memMgr=createMemoryManager();
  57
  58
  59         for (i = 0; i < h->SIZE_HASH; i++)    {
  60                 h->hash[i].word = NULL;
  61                 h->hash[i].len = 0;
  62                 h->hash[i].posInVoc = 0;
  63         }
  64
  65         return h;
  66 }
  67
  68
  69 /*------------------------------------------------------------------
  70  Find the nearest prime number over n.
  71  ---------------------------------------------------------------- */
  72 unsigned long NearestPrime(unsigned long n)
  73 {
  74     long position;  /* the prime number being sought */
  75     long index;
  76     for (position = n; ; position++)
  77     {
  78         // checks if those values from 2 to $\sqrt{m}$ can be factors of $m$ */
  79         for (index = 2; index <= (long) sqrt((double) position) && position % index != 0; index++) ;
  80
  81         if (position % index != 0)  /* No factors in that range, therefore a prime number was found */
  82         {
  83             break;
  84         }
  85     }
  86     return position;
  87 }
  88
  89
  90
  91 /*-----------------------------------------------------------------------
  92  Computes a hash function over a string "aWord", it returns the position
  93  where "aWord" should go in the hash table if no collision ocurrs.
  94  ---------------------------------------------------------------------*/
  95 unsigned long hashFunction (const unsigned char *aWord, unsigned int len, unsigned long sizeHash)
  96 {
  97     char        c;
  98     register unsigned int h;
  99     register unsigned long last;
 100     last=((unsigned long) aWord) +len -1;
 101
 102
 103     h = SEED;
 104     //last=aWord+len;
 105
 106     for( ; ((unsigned long) aWord <=last ) ; )
 107     //for(c=*aWord; (aWord++)<last ;  )
 108     {
 109         c=*(aWord++);
 110
 111                 //c=*aWord;
 112         h ^= ( (h << 5) + c + (h >> 2) );
 113     }
 114
 115     return((unsigned int)((h&0x7fffffff) % sizeHash));
 116 }
 117
 118 unsigned long newhashFunction(const unsigned char *aWord, unsigned int len, unsigned long sizeHash)
 119 {
 120   unsigned int h = 0;
 121   unsigned int i = 0;
 122   for(i = 0; i  < len; i ++)
 123     h = ((unsigned int) aWord[i]) + (h << 6) + (h << 16) - h;
 124   return (h&0x7fffffff) % sizeHash;
 125 }
 126
 127 /*-----------------------------------------------------------------------
 128   compares two strings
 129  ---------------------------------------------------------------------*/
 130
 131 /* Author J. Zobel, April 2001.
 132         http://www.seg.rmit.edu.au/code/zwh-ipl/
 133    Permission to use this code is freely granted, provided that this
 134    statement is retained. */
 135
 136
 137
 138  int strcomp(const unsigned char *s1, const unsigned char *s2, register unsigned long ws1, unsigned long ws2) {
 139          if (ws1 !=ws2) {
 140                         return -1;
 141          }
 142
 143          {  register unsigned long iters;
 144                  iters=1;
 145             while( iters<ws1 && *s1 == *s2 )
 146             {
 147                         s1++;
 148                         s2++;
 149                         iters++;
 150             }
 151             //fprintf(stderr,"\nDevuelve [%d]",*s1-*s2);
 152             return( *s1-*s2 );
 153          }
 154 }
 155
 156 inline int strcomp3(const unsigned char *s1, const unsigned char *s2, unsigned long ws1, unsigned long ws2) {
 157
 158          if (ws1 !=ws2) {
 159                         return -1;
 160          }
 161
 162          {  register unsigned long iters;
 163             register unsigned long end;
 164             end = MIN(ws1,ws2);
 165                  iters=1;
 166             while( iters<end && *s1 == *s2 )
 167             {
 168                         s1++;
 169                         s2++;
 170                         iters++;
 171             }
 172             //fprintf(stderr,"\nDevuelve [%d]",*s1-*s2);
 173             return( *s1-*s2 );
 174          }
 175 }
 176
 177
 178 /* Author J. Zobel, April 2001.
 179         http://www.seg.rmit.edu.au/code/zwh-ipl/
 180    Permission to use this code is freely granted, provided that this
 181    statement is retained. */
 182
 183 //int strcomp(const unsigned char *s1, const unsigned char *s2, unsigned long ws) {
 184 //    register unsigned long i;
 185 //    i=0;
 186 //    while( i < ws-1 && *s1 == *s2 )*/
 187 //    {
 188 //                      s1++;
 189 //                      s2++;
 190 //                      i++;
 191 //    }
 192 //    return( *s1-*s2 );
 193 //}
 194
 195 //int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) {
 196 //      register ulong iters = 1;
 197 //
 198 //      while( iters<ws1 && *s1 == *s2 ){
 199 //              s1++;
 200 //              s2++;
 201 //              iters++;
 202 //      }
 203 //        // so     iters == ws1   OR   *s1 != *s2
 204 //      if (ws1 == iters) {
 205 //                      if (ws2 == ws1)
 206 //                              return 0;  // s1 equals to s2
 207 //                      else
 208 //                              return -1;   // s1 < s2
 209 //      }
 210 //      else { //(ws1 > iters) so s1 != s2
 211 //              return( *s1-*s2);
 212 //      }
 213 //}
 214
 215
 216 //permits to compare 2 strings of len ws1 and ws2 that do not end in '\0'
 217 int strcompL(const byte *s1, const byte *s2, register ulong ws1, register ulong ws2) {
 218         register ulong iters = 0;
 219
 220         while( iters<ws1 && iters<ws2 && *s1 == *s2 ){
 221                 s1++;
 222                 s2++;
 223                 iters++;
 224         }
 225
 226         if (ws1 == iters) {
 227                         if (ws2 == ws1)
 228                                 return 0;  // s1 equals to s2
 229                         else
 230                                 return -1;   // w1 < w2  and  *s1_i == *s2_i for i=0 to iters-1
 231         }
 232         else
 233         if (ws2 == iters) {
 234                         if (ws2 == ws1)
 235                                 return 0;  // s1 equals to s2
 236                         else
 237                                 return +1;   // w2 < w1  and  *ws1 is '\0'
 238         }
 239         else { //*s1 != *s2
 240                 return (*s1-*s2);
 241         }
 242
 243 }
 244
 245
 246 /*-----------------------------------------------------------------------
 247  Insert a new word in a position of the hashTable (position previously computed)
 248  ---------------------------------------------------------------------*/
 249
 250 void insertAux(t_hash h, unsigned char *aWord, unsigned long len, unsigned long pos)
 251 {
 252   unsigned long addr = 0;
 253   unsigned long dummy = search(h, aWord, len, &addr);
 254   h->hash[addr].word = aWord;
 255   h->hash[addr].len = len;
 256   h->hash[addr].posInVoc = pos;
 257   h->NumElem++;
 258 }
 259
 260 void resize(t_hash h){
 261   unsigned long oldsize = h->SIZE_HASH;
 262   unsigned long newsize = NearestPrime(h->SIZE_HASH * 2);
 263   t_hashNode* oldh = h->hash;
 264   unsigned long l = newsize * sizeof(t_hashNode);
 265   t_hashNode* newh = (t_hashNode *) malloc(newsize * sizeof(t_hashNode));
 266   unsigned long i;
 267   fprintf(stderr, "Resizing hashtable from %lu to %lu (%lu bytes)\n", oldsize, newsize, l);
 268   h->hash = newh;
 269   h->SIZE_HASH = newsize;
 270   for (i = 0; i < h->SIZE_HASH; i++)    {
 271     h->hash[i].word = NULL;
 272     h->hash[i].len = 0;
 273     h->hash[i].posInVoc = 0;
 274   }
 275   h->NumElem = 0;
 276   for(i = 0; i < oldsize; i++)
 277     if (oldh[i].word != NULL){
 278       insertAux(h, oldh[i].word, oldh[i].len, oldh[i].posInVoc);
 279       oldh[i].word = NULL;
 280       oldh[i].len = 0;
 281       oldh[i].posInVoc=0;
 282     }
 283   //free(oldh);
 284
 285 }
 286
 287
 288 void insertElement (t_hash h, const unsigned char *aWord, register unsigned long len, register unsigned long *addr) {
 289     //fprintf(stderr,"\n Entra en la funcin [%s], [%ld]",aWord, len);
 290
 291         if(h->NumElem >= h->SIZE_HASH/2)
 292         {
 293
 294          resize(h);
 295
 296         }
 297         //h->hash[*addr].word = (unsigned char*) malloc((len+1) * sizeof(unsigned char));
 298         getMemoryBlock(h->_memMgr,( byte **)&(h->hash[*addr].word),len+1);
 299         //fprintf(stderr,"\n tras obter memoria");
 300
 301         strncpy ((char *) h->hash[*addr].word, (char *)aWord, len);
 302
 303         h->hash[*addr].word[len]='\0';
 304         h->hash[*addr].len =len;
 305         h->hash[*addr].posInVoc = h->NumElem;
 306         h->NumElem++;
 307
 308         //fprintf(stderr,"\n####inserted word [%s] ",h->hash[*addr].word);
 309
 310         //return *addr;
 311 }
 312
 313 /*-----------------------------------------------------------------------
 314  Search for a word in the hash table and returns its position in the
 315  vocabulary. It returns the next "available" position in the vocabulary,
 316  if the word is not in the hash table. That is: a "0-node" position.
 317  It also returns -using attribute returnedAddr- the position where word
 318  was found (or where it should go if it was inserted in next "insert call".
 319  -----------------------------------------------------------------------*/
 320 unsigned long search (t_hash h, const unsigned char *aWord, register unsigned len,
 321                                                                  unsigned long *returnedAddr){
 322
 323         register unsigned long addr, Saddr;
 324
 325         //fprintf(stderr,"\n searching for  [%s], [%d], sizehash= %ld",aWord,len,h->SIZE_HASH);
 326         addr = hashFunction(aWord,len, h->SIZE_HASH);
 327         Saddr = addr;
 328
 329         t_hashNode *hash;
 330         hash = h->hash;
 331
 332         while((hash[addr].word  != NULL)&&((strcomp(hash[addr].word, aWord,  hash[addr].len, len)) != 0))  {
 333                 //fprintf(stderr,"\nComprueba [%s], [%d]",hash[addr].word,strcomp(hash[addr].word, aWord, len));
 334                 addr = ((addr + JUMP) % h->SIZE_HASH);
 335         }
 336
 337         *returnedAddr = addr;
 338
 339         if(hash[addr].word  == NULL) {
 340           //      fprintf(stderr, "word:'%.*s', hash %lu\n", len, aWord, Saddr);
 341           return h->NumElem;                            //Word was not found
 342         }
 343         else {
 344                 return h->hash[addr].posInVoc; //Word was found in this position in the vocabulary
 345         }
 346 }
 347
 348
 349
 350 /*-----------------------------------------------------------------------
 351   Tells if a word appears or not in the hash table.
 352  -----------------------------------------------------------------------*/
 353 unsigned long inHashTable (t_hash h, const unsigned char *aWord, register unsigned len, unsigned long *returnedAddr){
 354
 355         unsigned long searched;
 356         unsigned long nothing;
 357         searched = search(h,aWord,len,&nothing);
 358         *returnedAddr=nothing;
 359         return (searched < (h->NumElem) );
 360 }
 361
 362 /*------------------------------------------------------------------
 363  Destructor method
 364  ------------------------------------------------------------------ */
 365 void destroy_hash (t_hash hash){
 366         unsigned long mem=0;
 367         unsigned long i = 0;
 368         mem = sizeof(struct hashStr) + hash->SIZE_HASH * sizeof(t_hashNode);
 369         printf("\n[destroying hash table]...Freed %ld bytes... RAM\n", mem);
 370         fflush(stdout);
 371
 372         free(hash->hash);
 373         destroyMemoryManager(hash->_memMgr); //frees words and variants
 374 ///     free(hash->_memMgr);
 375         free(hash);
 376 }
 377
 378
 379
 380
 381 /*------------------------------------------------------------------
 382  main, to make unit proofs
 383  ------------------------------------------------------------------ */
 384 /*
 385 int main(int argc, char* argv[])
 386 {   byte a[10]= "word1";
 387         byte b[10]= "word2";
 388         byte c[10]= "word3";
 389         byte d[10]= "word4";
 390         byte e[10]= "word5";
 391         byte f[10]= "word6";
 392         byte * w;
 393         unsigned int size;
 394         unsigned long i,addrInTH;
 395
 396         t_hash hash;
 397
 398         _memMgr=createMemoryManager();
 399
 400         hash = initialize_hash (2);
 401
 402         w=a;
 403         i = search (hash,w, strlen(w), &addrInTH );
 404         insertElement (hash, w, strlen(w), &addrInTH);
 405         fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
 406         fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
 407
 408         w=b;
 409         i = search (hash,w, strlen(w), &addrInTH );
 410         insertElement (hash, w, strlen(w), &addrInTH);
 411         fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
 412         fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
 413
 414         w=c;
 415         i = search (hash,w, strlen(w), &addrInTH );
 416         insertElement (hash, w, strlen(w), &addrInTH);
 417         fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
 418         fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
 419
 420         w=d;
 421         i = search (hash,w, strlen(w), &addrInTH );
 422         insertElement (hash, w, strlen(w), &addrInTH);
 423         fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
 424         fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
 425
 426 //      w=e;
 427 //      i = search (hash,w, strlen(w), &addrInTH );
 428 //      insertElement (hash, w, strlen(w), &addrInTH);
 429 //      fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
 430 //      fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
 431
 432 //      w=f;
 433 //      i = search (hash,w, strlen(w), &addrInTH );
 434 //      insertElement (hash, w, strlen(w), &addrInTH);
 435 //      fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
 436 //      fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, hash->hash[addrInTH].word, hash->hash[addrInTH].freq, hash->hash[addrInTH].posInVoc);
 437
 438         fprintf(stderr,"\n in: %s  hash ? = %d",a,inHashTable(hash,a,strlen(a)) );
 439         fprintf(stderr,"\n in: %s  hash ? = %d",e, inHashTable(hash,e,strlen(e)) );
 440         fprintf(stderr,"\n in: %s  hash ? = %d",b, inHashTable(hash,b,strlen(b)) );
 441
 442         destroy_hash(hash);
 443         destroyMemoryManager(_memMgr);
 444         printf("\n\n");
 445 }
 446 */