From: nvalimak Date: Thu, 28 May 2009 10:30:59 +0000 (+0000) Subject: Implementation of LZ-index for extracting text X-Git-Url: http://git.nguyen.vg/gitweb/?a=commitdiff_plain;h=5d74bc076c9d15d607d183c16ff8468ea961a0f9;hp=5d47d1754b1df68fc1843989c3992df52025dfa4;p=SXSI%2FTextCollection.git Implementation of LZ-index for extracting text git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@413 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- diff --git a/lzindex/basics.c b/lzindex/basics.c new file mode 100644 index 0000000..4e67ff3 --- /dev/null +++ b/lzindex/basics.c @@ -0,0 +1,177 @@ + +// Basics + +// #include "basics.h" included later to avoid macro recursion for malloc +#include +#include +#include + +uint nbits_aux1, *e_aux1, p_aux1; +uint nbits_aux2, *e_aux2, p_aux2; +uint nbits_aux3, *e_aux3, p_aux3; + + // Memory management + +#ifdef MEMCTRL +int account = 1; +int currmem = 0; +int newmem; +int maxmem = 0; +#endif + +void *Malloc (int n) + + { void *p; + if (n == 0) return NULL; +#ifndef MEMCTRL + p = (void*) malloc (n); +#else + p = (void*) (malloc (n+sizeof(int))+sizeof(int)); +#endif + if (p == NULL) + { fprintf (stderr,"Could not allocate %i bytes\n",n); + exit(1); + } +#ifdef MEMCTRL + *(((int*)p)-1) = n; + if (account) + { newmem = currmem+n; + if (newmem > maxmem) maxmem = newmem; + if (currmem/1024 != newmem/1024) + printf ("Memory: %i Kb, maximum: %i Kb\n",newmem/1024,maxmem/1024); + currmem = newmem; + } +#endif + return p; + } + +void Free (void *p) + + { +#ifndef MEMCTRL + if (p) free (p); +#else + if (!p) return; + if (account) + { newmem = currmem - *(((int*)p)-1); + free ((void*)(((int)p)-sizeof(int))); + if (currmem/1024 != newmem/1024) + printf ("Memory: %i Kb, maximum: %i Kb\n",newmem/1024,maxmem/1024); + currmem = newmem; + } +#endif + } + +void *Realloc (void *p, int n) + + { if (p == NULL) return Malloc (n); + if (n == 0) { Free(p); return NULL; } +#ifndef MEMCTRL + p = (void*) realloc (p,n); +#else + if (account) + newmem = currmem - *(((int*)p)-1); + p = (void*) (realloc ((void*)(((int)p)-sizeof(int)),n+sizeof(int))+sizeof(int)); + *(((int*)p)-1) = n; +#endif + if (p == NULL) + { fprintf (stderr,"Could not allocate %i bytes\n",n); + exit(1); + } +#ifdef MEMCTRL + if (account) + { newmem = newmem + n; + if (newmem > maxmem) maxmem = newmem; + if (currmem/1024 != newmem/1024) + printf ("Memory: %i Kb, maximum: %i Kb\n",newmem/1024,maxmem/1024); + currmem = newmem; + } +#endif + return p; + } + +#include "basics.h" + + // bits needed to represent a number between 0 and n + +uint bits (uint n) + + { uint b = 0; + while (n) + { b++; n >>= 1; } + return b; + } + + +uint bitget_aux1() + { + register uint i=(p_aux1>>5), j=p_aux1&0x1F, answ; + if (j+nbits_aux1 <= W) + answ = (e_aux1[i] << (W-j-nbits_aux1)) >> (W-nbits_aux1); + else { + answ = e_aux1[i] >> j; + answ = answ | ((e_aux1[i+1] << (W-j-nbits_aux1)) >> (W-nbits_aux1)); + } + return answ; + } +/* +uint bitget_aux2() + { + register uint i=(p_aux2>>5), j=p_aux2&0x1F, answ; + if (j+nbits_aux2 <= W) + answ = (e_aux2[i] << (W-j-nbits_aux2)) >> (W-nbits_aux2); + else { + answ = e_aux2[i] >> j; + answ = answ | ((e_aux2[i+1] << (W-j-nbits_aux2)) >> (W-nbits_aux2)); + } + return answ; + } + +uint bitget_aux3() + { + register uint i=(p_aux3>>5), j=p_aux3&0x1F, answ; + if (j+nbits_aux3 <= W) + answ = (e_aux3[i] << (W-j-nbits_aux3)) >> (W-nbits_aux3); + else { + answ = e_aux3[i] >> j; + answ = answ | ((e_aux3[i+1] << (W-j-nbits_aux3)) >> (W-nbits_aux3)); + } + return answ; + }*/ + + + // returns e[p..p+len-1], assuming len <= W + +uint bitget (uint *e, uint p, uint len) + { + register uint i=(p>>5), j=p&0x1F, answ; + if (j+len <= W) + answ = (e[i] << (W-j-len)) >> (W-len); + else { + answ = e[i] >> j; + answ = answ | ((e[i+1] << (W-j-len)) >> (W-len)); + } + return answ; + } + + // writes e[p..p+len-1] = s, len <= W + +void bitput (register uint *e, register uint p, + register uint len, register uint s) + + { e += p >> bitsW; p &= (1<> (W-p)); + } + else { if (p+len <= W) + { *e = (*e & ~(((1<> (W-p)); + } + } diff --git a/lzindex/basics.h b/lzindex/basics.h new file mode 100644 index 0000000..d8e5d5d --- /dev/null +++ b/lzindex/basics.h @@ -0,0 +1,74 @@ + +// Basics + +#ifndef BASICSINCLUDED +#define BASICSINCLUDED + + + // Includes + +#include +#include +#include +#include +#include +#include +#include + + // Memory management + +#define malloc(n) Malloc(n) +#define free(p) Free(p) +#define realloc(p,n) Realloc(p,n) + +void *Malloc (int n); +void Free (void *p); +void *Realloc (void *p, int n); + + // Data types + +typedef unsigned char byte; +typedef uint trieNode; // a node of lztrie +#ifndef uint +typedef unsigned int uint; +#endif + +#ifndef __cplusplus +typedef int bool; +#endif +#define true 1 +#define false 0 + +#define max(x,y) ((x)>(y)?(x):(y)) +#define min(x,y) ((x)<(y)?(x):(y)) + + // Bitstream management + +#define W (8*sizeof(uint)) +#define bitsW 5 // OJO + + // bits needed to represent a number between 0 and n +uint bits (uint n); + // returns e[p..p+len-1], assuming len <= W +uint bitget (uint *e, uint p, uint len); + // writes e[p..p+len-1] = s, assuming len <= W +void bitput (uint *e, uint p, uint len, uint s); + // reads bit p from e +#define bitget1(e,p) ((e)[(p)/W] & (1<<((p)%W))) + // sets bit p in e +#define bitset(e,p) ((e)[(p)/W] |= (1<<((p)%W))) + // cleans bit p in e +#define bitclean(e,p) ((e)[(p)/W] &= ~(1<<((p)%W))) + +extern uint nbits_aux1, *e_aux1, p_aux1; + +uint bitget_aux1(); + +extern uint nbits_aux2, *e_aux2, p_aux2; + +uint bitget_aux2(); + +extern uint nbits_aux3, *e_aux3, p_aux3; + +uint bitget_aux3(); +#endif diff --git a/lzindex/bitmap.c b/lzindex/bitmap.c new file mode 100644 index 0000000..65664b9 --- /dev/null +++ b/lzindex/bitmap.c @@ -0,0 +1,241 @@ + + +// Implements operations over a bitmap + +#include "bitmap.h" + + // In theory, we should have superblocks of size s=log^2 n divided into + // blocks of size b=(log n)/2. This takes + // O(n log n / log^2 n + n log log n / log n + log n sqrt n log log n) bits + // In practice, we can have any s and b, and the needed amount of bits is + // (n/s) log n + (n/b) log s + b 2^b log b bits + // Optimizing it turns out that s should be exactly s = b log n + // Optimizing b is more difficult but could be done numerically. + // However, the exponential table does no more than popcounting, so why not + // setting up a popcount algorithm tailored to the computer register size, + // defining that size as b, and proceeding. + +const unsigned char popc[] = + { + 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, + 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, + 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, + 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, + 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, + 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, + 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, + 3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8, + }; + +uint popcount (register uint x) + { + return popc[x&0xFF] + popc[(x>>8)&0xFF] + popc[(x>>16)&0xFF] + popc[x>>24]; + } + +uint popcount8(register int x) + { + return popc[x & 0xff]; + } + + + // creates a bitmap structure from a bitstring, which is shared + +bitmap createBitmap (uint *string, uint n, bool isfullbmap) + { + bitmap B; + uint i,j,pop,bpop,pos; + uint nb,ns,words; + B = malloc (sizeof(struct sbitmap)); + B->data = string; + B->n = n; words = (n+W-1)/W; + ns = (n+256-1)/256; nb = 256/W; // adjustments + B->bdata = malloc (ns*nb*sizeof(byte)); + B->sdata = malloc (ns*sizeof(int)); +#ifdef INDEXREPORT + printf (" Bitmap over %i bits took %i bits\n", n,n+ns*nb*8+ns*32); +#endif + pop = 0; pos = 0; + for (i=0;isdata[i] = pop; + for (j=0;jbdata[pos++] = bpop; + bpop += popcount(*string++); + } + pop += bpop; + } + if (isfullbmap) {// creates the data structure to solve select_0() queries + B->bdata_0 = malloc(ns*nb*sizeof(byte)); + B->sdata_0 = malloc(ns*sizeof(int)); + string = B->data; + pop = 0; pos = 0; + for (i = 0; i < ns; i++) { + bpop = 0; + B->sdata_0[i] = pop; + for (j = 0; j < nb; j++) { + if (pos == words) break; + B->bdata_0[pos++] = bpop; + bpop += popcount(~(*string++)); + } + pop += bpop; + } + } + else { B->bdata_0 = NULL; B->sdata_0 = NULL;} + return B; + } + + // rank(i): how many 1's are there before position i, not included + +uint rank (bitmap B, uint i) + { + return B->sdata[i>>8] + B->bdata[i>>5] + + popcount (B->data[i>>5] & ((1<<(i&0x1F))-1)); + } + + + // select_1(x): returns the position i of the bitmap B such that + // the number of ones up to position i is x. + +uint select_1(bitmap B, uint x) + { + uint s = 256; + uint l = 0, n = B->n, r = n/s,left; + uint mid = (l+r)/2; + uint ones, j; + uint rankmid = B->sdata[mid]; + // first, binary search on the superblock array + while (l <= r) { + if (rankmid < x) + l = mid+1; + else + r = mid-1; + mid = (l+r)/2; + rankmid = B->sdata[mid]; + } + // sequential search using popcount over an int + left = mid*8; + x -= rankmid; + j = B->data[left]; + ones = popcount(j); + while (ones < x) { + x -= ones; + left++; + if (left > (n+W-1)/W) return n; + j = B->data[left]; + ones = popcount(j); + } + // sequential search using popcount over a char + left = left*32; + rankmid = popcount8(j); + if (rankmid < x) { + j = j>>8; + x -= rankmid; + left += 8; + rankmid = popcount8(j); + if (rankmid < x) { + j = j>>8; + x -= rankmid; + left += 8; + rankmid = popcount8(j); + if (rankmid < x) { + j = j>>8; + x -= rankmid; + left += 8; + } + } + } + // finally sequential search bit per bit + while (x > 0) { + if (j&1) x--; + j = j>>1; + left++; + } + return left-1; +} + + + // select_0(x): returns the position i of the bitmap B such that + // the number of zeros up to position i is x. + +uint select_0(bitmap B, uint x) + { + uint s = 256; + uint l = 0, n = B->n, r = n/s, left; + uint mid = (l+r)/2; + uint ones, j; + uint rankmid = B->sdata_0[mid]; + // first, binary search on the superblock array + while (l <= r) { + if (rankmid < x) + l = mid+1; + else + r = mid-1; + mid = (l+r)/2; + rankmid = B->sdata_0[mid]; + } + // sequential search using popcount over an int + left = mid*8; + x -= rankmid; + j = B->data[left]; + ones = popcount(~j); + while (ones < x) { + x -= ones; + left++; + if (left > (n+W-1)/W) return n; + j = B->data[left]; + ones = popcount(~j); + } + // sequential search using popcount over a char + left = left*32; + rankmid = popcount8(~j); + if (rankmid < x) { + j = j>>8; + x -= rankmid; + left += 8; + rankmid = popcount8(~j); + if (rankmid < x) { + j = j>>8; + x -= rankmid; + left += 8; + rankmid = popcount8(~j); + if (rankmid < x) { + j = j>>8; + x -= rankmid; + left += 8; + } + } + } + // finally sequential search bit per bit + while (x > 0) { + if ((j&1)==0) x--; + j = j>>1; + left++; + } + return left-1; + } + + + + + // destroys the bitmap, freeing the original bitstream + +void destroyBitmap (bitmap B) + + { free (B->data); + free (B->bdata); + free (B->sdata); + free (B); + } + + +uint sizeofBitmap(bitmap B) + { + return sizeof(struct sbitmap) + + ((B->n+W-1)/W)*sizeof(uint) + // data + ((B->n+256-1)/256)*sizeof(int) + // sdata + ((B->n+256-1)/256)*(256/W)*sizeof(byte); // bdata + //((B->sdata_0)?((B->n+256-1)/256)*sizeof(int):0) + + //((B->bdata_0)?((B->n+256-1)/256)*(256/W)*sizeof(byte):0); + } + diff --git a/lzindex/bitmap.h b/lzindex/bitmap.h new file mode 100644 index 0000000..f818160 --- /dev/null +++ b/lzindex/bitmap.h @@ -0,0 +1,38 @@ + + +// Implements operations over a bitmap + +#ifndef BITMAPINCLUDED +#define BITMAPINCLUDED + +#include "basics.h" + +typedef struct sbitmap + { + uint *data; + uint n; // # of bits + uint *sdata; // superblock counters + byte *bdata; // block counters + uint *sdata_0; // superblock counters for select_0() queries + byte *bdata_0; // block counters for select_0() queries + } *bitmap; + + + // creates a bitmap structure from a bitstring, which gets owned +bitmap createBitmap (uint *string, uint n, bool isfullbmap); + // rank(i): how many 1's are there before position i, not included +uint rank (bitmap B, uint i); + // select_1(x): returns the position i of the bitmap B such that + // the number of ones up to position i is x. +uint select_1(bitmap B, uint x); + // select_0(x): returns the position i of the bitmap B such that + // the number of zeros up to position i is x. +uint select_0(bitmap B, uint x); + // destroys the bitmap, freeing the original bitstream +void destroyBitmap (bitmap B); + // popcounts 1's in x +uint popcount (register uint x); + +uint sizeofBitmap(bitmap B); +#endif + diff --git a/lzindex/hash.c b/lzindex/hash.c new file mode 100644 index 0000000..272fbee --- /dev/null +++ b/lzindex/hash.c @@ -0,0 +1,72 @@ + +// Closed hash table + +#include "hash.h" + + // creates a table to store up to n values with guaranteed load factor. + // vbits = # of bits per entry, ENTRIES CANNOT HAVE ZERO VALUE + +hash createHash (uint n, uint vbits, float factor) + + { hash H = malloc (sizeof(struct shash)); + int i,N; + if (n == 0) return NULL; + N = n*factor; if (N <= n) N = n+1; + H->size = (1 << bits(N-1)) - 1; + H->bits = vbits; + H->table = malloc ((((H->size+1)*vbits+W-1)/W)*sizeof(uint)); +#ifdef INDEXREPORT + printf (" Also created hash table of %i bits\n", + (((H->size+1)*vbits+W-1)/W)*W); +#endif + for (i=(((H->size+1)*vbits+W-1)/W)-1;i>=0;i--) H->table[i] = 0; + return H; + } + + // frees the structure + +void destroyHash (hash H) + + { if (H == NULL) return; + free (H->table); + free (H); + } + + // inserts an entry, not prepared for overflow + +void insertHash (hash H, uint key, uint value) + + { uint pos = (key*PRIME1) & H->size; + if (bitget(H->table,pos*H->bits,H->bits) != 0) + { do pos = (pos + PRIME2) & H->size; + while (bitget(H->table,pos*H->bits,H->bits) != 0); + } + bitput(H->table,pos*H->bits,H->bits,value); + } + + // looks for a key, returns first value (zero => no values) + // writes in pos a handle to get next values + +uint searchHash (hash H, uint key, handle *h) + + { *h = (key*PRIME1) & H->size; + return bitget(H->table,*h*H->bits,H->bits); + } + + // gets following values using handle *pos, which is rewritten + // returns next value (zero => no more values) + +uint nextHash (hash H, handle *h) + + { *h = (*h +PRIME2) & H->size; + return bitget(H->table,*h*H->bits,H->bits); + } + + +uint sizeofHash(hash H) + { + if (H) return sizeof(struct shash) + + (((H->size+1)*H->bits+W-1)/W)*sizeof(uint); + else return 0; + } + diff --git a/lzindex/hash.h b/lzindex/hash.h new file mode 100644 index 0000000..46c30e0 --- /dev/null +++ b/lzindex/hash.h @@ -0,0 +1,41 @@ + +// Closed hash table that does not store the keys + +// needs the maximum number of elements stored to be declared at creation time +// cannot remove elements +// does not store the key, rather, it reports all the collisioning values +// right value under collisions +// CANNOT STORE ZERO in the value + +#ifndef HASHINCLUDED +#define HASHINCLUDED + +#include "basics.h" + +typedef struct shash + { uint size; // # of table entries + uint bits; // bits per entry + uint *table; // data + } *hash; + +typedef uint handle; + + // creates a table to store up to n values with guaranteed load factor. + // vbits = # of bits per entry, ENTRIES CANNOT HAVE VALUE ZERO +hash createHash (uint n, uint vbits, float factor); + // frees the structure +void destroyHash (hash H); + // inserts an entry +void insertHash (hash H, uint key, uint elem); + // looks for a key, returns first value (zero => no values) + // writes in pos a handle to get next values +uint searchHash (hash H, uint key, handle *h); + // gets following values using handle *h, which is rewritten + // returns next value (zero => no more values) +uint nextHash (hash H, handle *h); + + // two large primes found with etc/hash.c +#define PRIME1 ((uint)4294967279) +#define PRIME2 ((uint)4294967197) + +#endif diff --git a/lzindex/heap.c b/lzindex/heap.c new file mode 100644 index 0000000..affe6c7 --- /dev/null +++ b/lzindex/heap.c @@ -0,0 +1,105 @@ + + // Heap of fixed size objects, fast to alloc and especially to totally free + +#include "heap.h" + +#define BLK 2048 + +#ifdef MEMCTRL +extern int account,currmem,newmem,maxmem; +#endif + + // Creates a heap of elements of size siz + +heap createHeap (uint siz) + + { heap H; + H = malloc (sizeof(struct sheap)); + H->siz = siz; + H->cap = BLK/siz; if (H->cap == 0) H->cap = 1; + H->blocks = NULL; + H->free = NULL; + H->first = H->cap; +#ifdef MEMCTRL + H->totsize = 0; +#endif + return H; + } + + // Gets a new element from H + +void *mallocHeap (heap H) + + { void *elem; + void **b; +#ifdef MEMCTRL + H->totsize++; + newmem = currmem+H->siz; + if (newmem > maxmem) maxmem = newmem; + if (currmem/1024 != newmem/1024) + printf ("Memory: %i Kb, maximum: %i Kb\n",newmem/1024,maxmem/1024); + currmem = newmem; +#endif + if (H->free) + { elem = (void*)H->free; + H->free = *H->free; + return elem; + } + if (H->first != H->cap) + { elem = (void*)(((uint)(H->blocks+1)) + H->first * H->siz); + H->first++; + return elem; + } +#ifdef MEMCTRL + account = 0; +#endif + b = malloc (sizeof(void*) + H->cap * H->siz); +#ifdef MEMCTRL + account = 1; +#endif + b[0] = H->blocks; + H->blocks = b; + elem = (void*)(H->blocks+1); + H->first = 1; + return elem; + } + + // Frees ptr from heap H + +void freeHeap (heap H, void *ptr) + + { void **p; + if (!ptr) return; +#ifdef MEMCTRL + H->totsize--; + newmem = currmem-H->siz; + if (currmem/1024 != newmem/1024) + printf ("Memory: %i Kb, maximum: %i Kb\n",newmem/1024,maxmem/1024); + currmem = newmem; +#endif + p = (void**)ptr; + *p = H->free; + H->free = p; + } + + // Frees everything in heap H + +void destroyHeap (heap H) + + { void **b; +#ifdef MEMCTRL + account = 0; +#endif + while ((b = H->blocks) != NULL) + { H->blocks = b[0]; + free (b); + } +#ifdef MEMCTRL + account = 1; + newmem = currmem-H->totsize*H->siz; + if (currmem/1024 != newmem/1024) + printf ("Memory: %i Kb, maximum: %i Kb\n",newmem/1024,maxmem/1024); + currmem = newmem; +#endif + free (H); + } diff --git a/lzindex/heap.h b/lzindex/heap.h new file mode 100644 index 0000000..aaee593 --- /dev/null +++ b/lzindex/heap.h @@ -0,0 +1,27 @@ + + // Heap of fixed size objects, fast to alloc and especially to totally free + +#ifndef HEAPINCLUDED +#define HEAPINCLUDED + +#include "basics.h" + +typedef struct sheap + { uint siz; // element size + uint cap; // block capacity in # of elems + void **blocks; // list of blocks, next is block[0] + void **free; // list of free positions, each contains next + uint first; // first free position in current block + uint totsize; // total memory allocated here + } *heap; + + // Creates a heap of elements of size siz +heap createHeap (uint siz); + // Gets a new element from H +void *mallocHeap (heap H); + // Frees ptr from heap H +void freeHeap (heap H, void *ptr); + // Frees everything in heap H +void destroyHeap (heap H); + +#endif diff --git a/lzindex/lztrie.c b/lzindex/lztrie.c new file mode 100644 index 0000000..0a4c56b --- /dev/null +++ b/lzindex/lztrie.c @@ -0,0 +1,288 @@ + + +// Implements the LZtrie data structure + +#include "lztrie.h" + + // creates a lztrie structure from a parentheses bitstring, + // a letter array in preorder, and an id array in preorder, + // all of which except the latter become owned + // n is the total number of nodes (n letters/ids, 2n parentheses) + +lztrie createLZTrie(uint *string, byte *letters, uint *Node, uint n, uint text_length) + { + lztrie T; + int i; + T = malloc(sizeof(struct slztrie)); + T->data = string; + T->pdata = createParentheses(string,2*n,true); + T->n = n; + T->nbits = bits(n-1); + T->letters = letters; + T->Node = createNodemap(Node, n, n); + T->TPos = createPosition(T, text_length); + T->text_length = text_length; + // boost first access + T->boost = malloc(256*sizeof(trieNode)); + for (i=0;i<256;i++) T->boost[i] = NULLT; + i = 1; // shortcut for first child of root + while (i != 2*n-1) { // shortcut for its closing parenthesis + T->boost[T->letters[i-rank(T->pdata->bdata,i)]] = i; + // shortcut for leftrankLZTrie + i = findclose(T->pdata,i)+1; + } + return T; + } + +// builds an lztrie from a text. "s" is the text terminator. +lztrie buildLZTrie(byte *text, byte s, uint text_length) + { + trie T; + uint n; + uint *parent; + byte *letters; + lztrie LZT; + uint *Node; + // first creates a full trie T + T = createTrie(); + do { + text = insertTrie(T,text); + } + while (text[-1] != s); + // now compresses it + n = T->nid; + parent = malloc(((2*n+W-1)/W)*sizeof(uint)); + letters = malloc(n*sizeof(byte)); + // Aqui pedir espacio para Node, ver como lo voy a construir en vez de sacar solo los ids + Node = malloc(n*sizeof(uint)); + representTrie(T,parent,letters,Node); + + destroyTrie(T); + LZT = createLZTrie(parent,letters,Node,n,text_length); + return LZT; + } + + + + // frees LZTrie structure, including the owned data + +void destroyLZTrie(lztrie T) + { + destroyParentheses(T->pdata); + free(T->letters); + destroyNodemap(T->Node); + destroyPosition(T->TPos); + free(T->boost); + free(T); + } + + // stores lztrie T on file f + +void saveLZTrie (lztrie T, FILE *f) + { + if (fwrite (&T->n,sizeof(uint),1,f) != 1) { + fprintf (stderr,"Error: Cannot write LZTrie on file\n"); + exit(1); + } + if (fwrite (T->data,sizeof(uint),(2*T->n+W-1)/W,f) != (2*T->n+W-1)/W) { + fprintf (stderr,"Error: Cannot write LZTrie on file\n"); + exit(1); + } + if (fwrite (T->letters,sizeof(byte),T->n,f) != T->n) { + fprintf (stderr,"Error: Cannot write LZTrie on file\n"); + exit(1); + } + + saveNodemap(f, T->Node); + fwrite(&T->text_length, sizeof(uint), 1, f); + savePosition(f, T->TPos); + } + + // loads lztrie T from file f + +lztrie loadLZTrie (FILE *f) + { + lztrie T; + int i; + T = malloc (sizeof(struct slztrie)); + if (fread (&T->n,sizeof(uint),1,f) != 1) { + fprintf (stderr,"Error: Cannot read LZTrie from file\n"); + exit(1); + } + T->data = malloc (((2*T->n+W-1)/W)*sizeof(uint)); + if (fread (T->data,sizeof(uint),(2*T->n+W-1)/W,f) != (2*T->n+W-1)/W) { + fprintf (stderr,"Error: Cannot read LZTrie from file\n"); + exit(1); + } + T->pdata = createParentheses (T->data,2*T->n,true); + T->nbits = bits(T->n-1); + T->letters = malloc (T->n*sizeof(byte)); + if (fread (T->letters,sizeof(byte),T->n,f) != T->n) { + fprintf (stderr,"Error: Cannot read LZTrie from file\n"); + exit(1); + } + + T->Node = loadNodemap(f); + fread(&T->text_length, sizeof(uint), 1, f); + T->TPos = loadPosition(f, T->text_length); + + // boost first access + T->boost = malloc(256*sizeof(trieNode)); + for (i=0;i<256;i++) T->boost[i] = NULLT; + i = 1; // shortcut for first child of root + while (i != 2*T->n-1) { // shortcut for its closing parenthesis + T->boost[T->letters[i-rank(T->pdata->bdata,i)]] = i; + // shortcut for leftrankLZTrie + i = findclose(T->pdata,i)+1; + } + + return T; + } + + + // letter by which node i descends + +byte letterLZTrie (lztrie T, trieNode i) + { + return T->letters[i-rank(T->pdata->bdata,i)]; // shortcut leftrankLZTrie + } + + // go down by letter c, if possible + + +trieNode childLZTrie (lztrie T, trieNode i, byte c) + { + trieNode j; + byte tc; + j = i+1; + while (!bitget1(T->data,j)) { // there is a child here + tc = T->letters[j-rank(T->pdata->bdata,j)]; + // shortcut for leftrankLZTrie: is a child by letter c? + if (tc > c) break; + if (tc == c) return j; + j = findclose(T->pdata,j)+1; + } + return NULLT; // no child to go down by c + } + + // go up, if possible + +trieNode parentLZTrie (lztrie T, trieNode i) + { + if (i == ROOT) return NULLT; // parent of root + if (T->boost[T->letters[i-rank(T->pdata->bdata,i)]] == i) return ROOT; + return enclose (T->pdata,i); + } + + // subtree size + +uint subtreesizeLZTrie (lztrie T, trieNode i) + { + trieNode j; + j = findclose (T->pdata,i); + return (j-i+1)/2; + } + + // depth + +uint depthLZTrie (lztrie T, trieNode i) + { + return excess (T->pdata,i); + } + + // smallest rank in subtree + +uint leftrankLZTrie (lztrie T, trieNode i) + + { return i-rank(T->pdata->bdata,i); + } + + // largest rank in subtree + +uint rightrankLZTrie (lztrie T, trieNode i) + { + trieNode j; + j = findclose (T->pdata,i); + return j-rank(T->pdata->bdata,j)-1; + } + + // is node i ancestor of node j? + +bool ancestorLZTrie (lztrie T, trieNode i, trieNode j) + { + return (i <= j) && (j < findclose (T->pdata,i)); + } + + // next node from i, in preorder, adds/decrements depth accordingly + // assumes it *can* go on! + +trieNode nextLZTrie (lztrie T, trieNode i, uint *depth) + { + uint *data = T->data; + i++; + while (bitget1(data,i)) { i++; (*depth)--; } + (*depth)++; + return i; + } + + +// extracts text from position "from" up to position "to". +// Parameter "snippet_length" is the actual length of the snippet extracted. +// The extracted text is stored in array "snippet". +// +void extract(lztrie T, + ulong from, + ulong to, + byte **snippet, + ulong *snippet_length) + { + ulong snpp_pos, posaux, idaux; + uint idfrom,idto; + trieNode p; + byte *snppt; + nodemap Node = T->Node; + + if (to > (T->text_length-1)) to = T->text_length-1; + if (from > (T->text_length-1)) from = T->text_length-1; + + *snippet_length = to-from+1; + *snippet = malloc(*snippet_length); + + idfrom = getphrasePosition(T->TPos, from); + idto = getphrasePosition(T->TPos, to+1); + // *snippet_length = to-from+1; + + posaux = getPosition(T->TPos, idto+1)-1; + p = mapto(Node, idto); + + while (p&&(posaux > to)) { + p = parentLZTrie(T, p); + posaux--; + } + + snpp_pos = (*snippet_length)-1; + snppt = *snippet; + + for (idaux = idto; idaux != idfrom;) { + while (p) { + snppt[snpp_pos--] = letterLZTrie(T, p); + p = parentLZTrie(T, p); + } + p = mapto(Node, --idaux); + } + if (idfrom != idto) posaux = getPosition(T->TPos, idfrom+1)-(from!=0); + while (p && (posaux >= from)) { + snppt[snpp_pos--] = letterLZTrie(T, p); + p = parentLZTrie(T, p); + posaux--; + } + } + +uint sizeofLZTrie(lztrie T) + { + return sizeof(struct slztrie) + + sizeofParentheses(T->pdata) + + T->n*sizeof(byte) + + sizeofNodemap(T->Node) + + sizeofPosition(T->TPos); + } diff --git a/lzindex/lztrie.h b/lzindex/lztrie.h new file mode 100644 index 0000000..691a2aa --- /dev/null +++ b/lzindex/lztrie.h @@ -0,0 +1,81 @@ + + +// Implements the LZtrie data structure + +#ifndef LZTRIEINCLUDED +#define LZTRIEINCLUDED + +#include "basics.h" +#include "trie.h" +#include "parentheses.h" +#include "nodemap.h" +#include "position.h" + +//typedef uint trieNode; // a node of lztrie + +#define NULLT ((trieNode)(~0)) // a null node +#define ROOT ((trieNode)(0)) // the root node + +typedef struct slztrie + { + uint *data; // bitmap data + parentheses pdata; // parentheses structure + uint n; // # of parentheses + byte *letters; // letters of the trie + uint nbits; // log n + nodemap Node; // ids of the trie + position TPos; // text position data structure + uint text_length; // length of the original text + trieNode *boost; // direct pointers to first children + } *lztrie; + + + // creates a lztrie structure from a parentheses bitstring, + // a letter array in preorder, and an id array in preorder, + // all of which except the latter become owned + // n is the total number of nodes (n letters/ids, 2n parentheses) +lztrie createLZTrie (uint *string, byte *letters, uint *id, uint n, uint text_length); + // builds an lztrie from a text. "s" is the text terminator. +#ifdef __cplusplus +extern "C" { + lztrie buildLZTrie(byte *text, byte s, uint text_length); +} +#else + lztrie buildLZTrie(byte *text, byte s, uint text_length); +#endif + + // frees LZTrie structure, including the owned data +void destroyLZTrie (lztrie T); + // stores lztrie T on file f +void saveLZTrie (lztrie T, FILE *f); + // loads lztrie T from file f +lztrie loadLZTrie (FILE *f); + // letter by which node i descends +byte letterLZTrie (lztrie T, trieNode i); + // go down by letter c, if possible +trieNode childLZTrie (lztrie T, trieNode i, byte c); + // go up, if possible +trieNode parentLZTrie (lztrie T, trieNode i); + // subtree size +uint subtreesizeLZTrie (lztrie T, trieNode i); + // smallest rank in subtree +uint leftrankLZTrie (lztrie T, trieNode i); + // largest rank in subtree +uint rightrankLZTrie (lztrie T, trieNode i); + // is node i ancestor of node j? +bool ancestorLZTrie (lztrie T, trieNode i, trieNode j); + // next node from i, in preorder, adds/decrements depth accordingly + // assumes it *can* go on! +trieNode nextLZTrie (lztrie T, trieNode i, uint *depth); + // size in bytes of LZTrie T +uint sizeofLZTrie(lztrie T); + +#ifdef __cplusplus +extern "C" { +void extract(lztrie T, ulong from, ulong to, byte **snippet, ulong *snippet_length); +} +#else +void extract(lztrie T, ulong from, ulong to, byte **snippet, ulong *snippet_length); +#endif + +#endif diff --git a/lzindex/makefile b/lzindex/makefile new file mode 100644 index 0000000..c2d688b --- /dev/null +++ b/lzindex/makefile @@ -0,0 +1,66 @@ +#FLAGS = -g -lm +FLAGS = -O9 -fomit-frame-pointer -W -Wall -Winline -DDEBUG=0 -DNDEBUG=1 + +all: lztrie.a + +lztrie.a: lztrie.o nodemap.o position.o trie.o heap.o parentheses.o bitmap.o hash.o basics.o + ar rc lztrie.a lztrie.o nodemap.o position.o trie.o heap.o parentheses.o bitmap.o hash.o basics.o + +lztrie.o: lztrie.c + gcc $(FLAGS) -c lztrie.c +nodemap.o: nodemap.c + gcc $(FLAGS) -c nodemap.c +position.o: position.c + gcc $(FLAGS) -c position.c +trie.o: trie.c + gcc $(FLAGS) -c trie.c +heap.o: heap.c + gcc $(FLAGS) -c heap.c +parentheses.o: parentheses.c + gcc $(FLAGS) -c parentheses.c +bitmap.o: bitmap.c + gcc $(FLAGS) -c bitmap.c +hash.o: hash.c + gcc $(FLAGS) -c hash.c +basics.o: basics.c + gcc $(FLAGS) -c basics.c + +basics.h: makefile + touch basics.h +bitmap.h: basics.h + touch bitmap.h +hash.h: basics.h + touch hash.h +lztrie.h: basics.h parentheses.h + touch lztrie.h +position.h: lztrie.h nodemap.h + touch position.h +nodemap.h: basics.h lztrie.h + touch nodemap.h +parentheses.h: basics.h bitmap.h hash.h + touch parentheses.h +trie.h: basics.h heap.h + touch trie.h +heap.h: basics.h + touch heap.h +basics.c: basics.h + touch basics.c +hash.c: hash.h + touch hash.c +bitmap.c: bitmap.h + touch bitmap.c +lztrie.c: lztrie.h + touch lztrie.c +position.c: position.h + touch position.c +nodemap.c: nodemap.h + touch nodemap.c +parentheses.c: parentheses.h + touch parentheses.c +trie.c: trie.h + touch trie.c +heap.c: heap.h + touch heap.c + +clean: + rm -f *.a *.o diff --git a/lzindex/nodemap.c b/lzindex/nodemap.c new file mode 100644 index 0000000..b72d474 --- /dev/null +++ b/lzindex/nodemap.c @@ -0,0 +1,61 @@ + +// Implements the Map data structure, which maps block ids to lztrie nodes + +#include "nodemap.h" + + // creates a nodemap structure from a mapping array, not owning it + // n is number of blocks + // max is the number of trie nodes + +nodemap createNodemap(uint *map, uint n, uint max) + { + nodemap M; + uint i; + M = malloc(sizeof(struct snodemap)); + M->nbits = bits(2*max-1); + M->n = n; + M->map = malloc(((n*M->nbits+W-1)/W)*sizeof(uint)); + for (i=0;imap,i*M->nbits,M->nbits,map[i]); + return M; + } + + // frees revtrie structure, including the owned data + +void destroyNodemap(nodemap M) + { + free(M->map); + free(M); + } + + // mapping + +trieNode mapto(nodemap M, uint id) + { + return bitget(M->map,id*M->nbits,M->nbits); + } + +void saveNodemap(FILE *f, nodemap M) + { + fwrite(&M->nbits, sizeof(uint), 1, f); + fwrite(&M->n, sizeof(uint), 1, f); + fwrite(M->map, sizeof(uint), (M->n*M->nbits+W-1)/W, f); + } + +nodemap loadNodemap(FILE *f) + { + nodemap M; + + M = malloc(sizeof(struct snodemap)); + fread(&M->nbits, sizeof(uint), 1, f); + fread(&M->n, sizeof(uint), 1, f); + M->map = malloc(((M->n*M->nbits+W-1)/W)*sizeof(uint)); + fread(M->map, sizeof(uint), (M->n*M->nbits+W-1)/W, f); + return M; + } + +uint sizeofNodemap(nodemap M) + { + return sizeof(struct snodemap) + ((M->n*M->nbits+W-1)/W)*sizeof(uint); + } + diff --git a/lzindex/nodemap.h b/lzindex/nodemap.h new file mode 100644 index 0000000..89a3d83 --- /dev/null +++ b/lzindex/nodemap.h @@ -0,0 +1,35 @@ + + +// Implements the Node and RNode data structures, +// which map block ids to LZTrie nodes and +// block ids to RevTrie nodes, respectively + +#ifndef NODEMAPINCLUDED +#define NODEMAPINCLUDED + +#include "basics.h" +//#include "lztrie.h" + +typedef struct snodemap + { + uint nbits; // bits per cell + uint *map; // mapping + uint n; // number of elements in the mapping + } *nodemap; + + + // creates a nodemap structure from a mapping array, not owning it + // n is number of blocks + // max is the number of trie nodes +nodemap createNodemap(uint *map, uint n, uint max); + // frees revtrie structure, including the owned data +void destroyNodemap(nodemap M); + // mapping +trieNode mapto(nodemap M, uint id); + +void saveNodemap(FILE *f, nodemap M); + +nodemap loadNodemap(FILE *f); + +uint sizeofNodemap(nodemap M); +#endif diff --git a/lzindex/out-pily b/lzindex/out-pily new file mode 100644 index 0000000..fdd592e Binary files /dev/null and b/lzindex/out-pily differ diff --git a/lzindex/parentheses.c b/lzindex/parentheses.c new file mode 100644 index 0000000..15c8a92 --- /dev/null +++ b/lzindex/parentheses.c @@ -0,0 +1,294 @@ + + +// Implements operations over a sequence of balanced parentheses + +#include "parentheses.h" + + // I have decided not to implement Munro et al.'s scheme, as it is too + // complicated and the overhead is not so small in practice. I have opted + // for a simpler scheme. Each open (closing) parenthesis will be able to + // find its matching closing (open) parenthesis. If the distance is shorter + // than b, we will do it by hand, traversing the string. Otherwise, the + // answer will be stored in a hash table. In fact, only subtrees larger than + // s will have the full distance stored, while those between b and s will + // be in another table with just log s bits. The traversal by hand proceeds + // in fact by chunks of k bits, whose answers are precomputed. + // Space: there cannot be more than n/s subtrees larger than s, idem b. + // So we have (n/s)log n bits for far pointers and (n/b)log s for near + // pointers. The space for the table is 2^k*k*log b. The optimum s=b log n, + // in which case the space is n/b(1 + log b + log log n) + 2^k*k*log b. + // Time: the time is O(b/k), and we want to keep it O(log log n), so + // k = b/log log n. + // (previous arguments hold if there are no unary nodes, but we hope that + // there are not too many -- in revtrie we compress unary paths except when + // they have id) + // Settings: using b = log n, we have + // space = n log log n / log n + 2^(log n / log log n) log n + // time = log log n + // In practice: we use k = 8 (bytes table), b = W (so work = 4 or 8) + // and space ~= n/3 + 10 Kbytes (fixed table). + // Notice that we need a hash table that stores only the deltas and does not + // store the keys! (they would take log n instead of log s!). Collisions are + // resolved as follows: see all the deltas that could be and pick the smallest + // one whose excess is the same of the argument. To make this low we use a + // load factor of 2.0, so it is 2n/3 after all. + // We need the same for the reverses, for the forward is only for ('s and + // reverses for )'s, so the proportion stays the same. + // We also need the stream to be a bitmap to know how many open parentheses + // we have to the left. The operations are as follows: + // findclose: use the mechanism described above + // findparent: similar, in reverse, looking for the current excess - 1 + // this needs us to store the (near/far) parent of each node, which may + // cost more than the next sibling. + // excess: using the number of open parentheses + // enclose: almost findparent + + // creates a parentheses structure from a bitstring, which is shared + // n is the total number of parentheses, opening + closing + +static uint calcsizes (parentheses P, uint posparent, uint posopen, + uint *near, uint *far, uint *pnear, uint *pfar) + + { uint posclose,newpos; + if ((posopen == P->n) || bitget1(P->data,posopen)) + return posopen; // no more trees + newpos = posopen; + do { posclose = newpos+1; + newpos = calcsizes (P,posopen,posclose,near,far,pnear,pfar); + } + while (newpos != posclose); + if ((posclose < P->n) && (posclose-posopen > W)) // exists and not small + if (posclose-posopen < (1<sbits)) (*near)++; // near pointer + else (*far)++; + if ((posopen > 0) && (posopen-posparent > W)) // exists and not small + if (posopen-posparent < (1<sbits)) (*pnear)++; // near pointer + else (*pfar)++; + return posclose; + } + + +static uint filltables (parentheses P, uint posparent, uint posopen, bool bwd) + + { uint posclose,newpos; + if ((posopen == P->n) || bitget1(P->data,posopen)) + return posopen; // no more trees + newpos = posopen; + do { posclose = newpos+1; + newpos = filltables (P,posopen,posclose,bwd); + } + while (newpos != posclose); + if ((posclose < P->n) && (posclose-posopen > W)) // exists and not small + { if (posclose-posopen < (1<sbits)) // near pointers + insertHash (P->bftable,posopen,posclose-posopen); + else // far pointers + insertHash (P->sftable,posopen,posclose-posopen); + } + if (bwd && (posopen > 0) && (posopen-posparent > W)) //exists and not small + { if (posopen-posparent < (1<sbits)) // near pointer + insertHash (P->bbtable,posopen,posopen-posparent); + else // far pointers + insertHash (P->sbtable,posopen,posopen-posparent); + } + return posclose; + } + + + +static byte FwdPos[256][W/2]; +static byte BwdPos[256][W/2]; +static char Excess[256]; +static bool tablesComputed = false; + +static void fcompchar (byte x, byte *pos, char *excess) + + { int exc = 0; + uint i; + for (i=0;i>= 1; + } + *excess = exc; + } + +static void bcompchar (byte x, byte *pos) + + { int exc = 0; + uint i; + for (i=0;i 0) && !pos[exc-1]) pos[exc-1] = i+1; + } + else exc--; + x <<= 1; + } + } + +parentheses createParentheses (uint *string, uint n, bool bwd) + + { parentheses P; + uint i,s,nb,ns,nbits,near,far,pnear,pfar; + + P = malloc (sizeof(struct sparentheses)); + P->data = string; + P->n = n; + P->bdata = createBitmap (string,n,false); + nbits = bits(n-1); + s = nbits*W; + P->sbits = bits(s-1); + s = 1 << P->sbits; // to take the most advantage of what we can represent + ns = (n+s-1)/s; nb = (s+W-1)/W; // adjustments + near = far = pnear = pfar = 0; + calcsizes (P,~0,0,&near,&far,&pnear,&pfar); +#ifdef INDEXREPORT + printf (" Parentheses: total %i, near %i, far %i, pnear %i, pfar %i\n",n,near,far,pnear,pfar); +#endif + P->sftable = createHash(far,nbits,1.8); + P->bftable = createHash(near,P->sbits,1.8); + + P->sbtable = createHash (pfar,nbits,2.5); + P->bbtable = createHash (pnear,P->sbits,2.5); + filltables (P,~0,0,true); + if (!tablesComputed) + { tablesComputed = true; + for (i=0;i<256;i++) + { fcompchar (i,FwdPos[i],Excess+i); + bcompchar (i,BwdPos[i]); + } + } + return P; + } + + // frees parentheses structure, including the bitstream + +void destroyParentheses (parentheses P) + + { destroyBitmap(P->bdata); + destroyHash (P->sftable); + if (P->sbtable) destroyHash (P->sbtable); + destroyHash (P->bftable); + if (P->bbtable) destroyHash (P->bbtable); + free (P); + } + + // the position of the closing parenthesis corresponding to (opening) + // parenthesis at position i + +uint findclose (parentheses P, uint i) + + { uint bitW; + uint len,res,minres,exc; + byte W1; + handle h; + uint myexcess; + // first see if it is at small distance + len = W; if (i+len >= P->n) len = P->n-i-1; + bitW = bitget (P->data,i+1,len); + exc = 0; len = 0; + while (bitW && (exc < W/2)) + // either we shift it all or it only opens parentheses or too + // many open parentheses + { W1 = bitW & 255; + if (res = FwdPos[W1][exc]) return i+len+res; + bitW >>= 8; exc += Excess[W1]; + len += 8; + } + // ok, it's not a small distance, try with hashing btable + minres = 0; + myexcess = excess (P,i); + res = searchHash (P->bftable,i,&h); + while (res) + { if (!minres || (res < minres)) + if ((i+res+1 < P->n) && (excess(P,i+res+1) == myexcess)) + minres = res; + res = nextHash (P->bftable,&h); + } + if (minres) return i+minres; + // finally, it has to be a far pointer + res = searchHash (P->sftable,i,&h); + while (res) + { if (!minres || (res < minres)) + if ((i+res+1 < P->n) && (excess(P,i+res+1) == myexcess)) + minres = res; + res = nextHash (P->sftable,&h); + } + return i+minres; // there should be one if the sequence is balanced! + } + + // find enclosing parenthesis for an open parenthesis + // assumes that the parenthesis has an enclosing pair + +uint findparent (parentheses P, uint i) + + { uint bitW; + uint len,res,minres,exc; + byte W1; + handle h; + uint myexcess; + // first see if it is at small distance + len = W; if (i < len) len = i-1; + bitW = ~bitget (P->data,i-len,len) << (W-len); + exc = 0; len = 0; + while (bitW && (exc < W/2)) + // either we shift it all or it only closes parentheses or too + // many closed parentheses + { W1 = (bitW >> (W-8)); + if (res = BwdPos[W1][exc]) return i-len-res; + bitW <<= 8; exc += Excess[W1]; // note W1 is complemented! + len += 8; + } + // ok, it's not a small distance, try with hashing btable + minres = 0; + myexcess = excess (P,i) - 1; + res = searchHash (P->bbtable,i,&h); + while (res) + { if (!minres || (res < minres)) + if ((i-res >= 0) && (excess(P,i-res) == myexcess)) + minres = res; + res = nextHash (P->bbtable,&h); + } + if (minres) return i-minres; + // finally, it has to be a far pointer + res = searchHash (P->sbtable,i,&h); + while (res) + { if (!minres || (res < minres)) + if ((i-res >= 0) && (excess(P,i-res) == myexcess)) + minres = res; + res = nextHash (P->sbtable,&h); + } + return i-minres; // there should be one if the sequence is balanced! + } + + // # open - # close at position i, not included + +uint excess (parentheses P, uint i) + + { return i - 2*rank(P->bdata,i); + } + + // open position of closest parentheses pair that contains the pair + // that opens at i, ~0 if no parent + +uint enclose (parentheses P, uint i) + + { if (i == 0) return ~0; // no parent! + return findparent (P,i); + } + + + +uint sizeofParentheses(parentheses P) + { + return sizeof(struct sparentheses) + + sizeofBitmap(P->bdata) + + sizeofHash(P->sftable) + + sizeofHash(P->sbtable) + + sizeofHash(P->bftable) + + sizeofHash(P->bbtable); + } diff --git a/lzindex/parentheses.h b/lzindex/parentheses.h new file mode 100644 index 0000000..05b6aaa --- /dev/null +++ b/lzindex/parentheses.h @@ -0,0 +1,41 @@ + + +// Implements operations over a sequence of balanced parentheses + +#ifndef PARENTHESESINCLUDED +#define PARENTHESESINCLUDED + +#include "basics.h" +#include "bitmap.h" +#include "hash.h" + +typedef struct sparentheses + { uint *data; // string + bitmap bdata; // bitmap of string + uint n; // # of parentheses + uint sbits; // bits for near pointers + hash sftable; // table of far forward pointers + hash sbtable; // table of far backward pointers + hash bftable; // table of near forward pointers + hash bbtable; // table of near backward pointers + } *parentheses; + + // creates a parentheses structure from a bitstring, which gets owned + // n is the total number of parentheses, opening + closing + // bwd says if you will want to perform findopen and enclose +parentheses createParentheses (uint *string, uint n, bool bwd); + // frees parentheses structure, including the bitstream +void destroyParentheses (parentheses P); + // the position of the closing parenthesis corresponding to (opening) + // parenthesis at position i +uint findclose (parentheses P, uint i); + // respectively, for closing parenthesis i +uint findopen (parentheses P, uint i); + // # open - # close at position i, not included +uint excess (parentheses P, uint i); + // open position of closest parentheses pair that contains the pair + // that opens at i, ~0 if no parent +uint enclose (parentheses P, uint i); + +uint sizeofParentheses(parentheses P); +#endif diff --git a/lzindex/position.c b/lzindex/position.c new file mode 100644 index 0000000..984a8d4 --- /dev/null +++ b/lzindex/position.c @@ -0,0 +1,223 @@ + +// This code implements the data structure to get +// "real" text positions at search time. + +#include "position.h" +#include "lztrie.h" +#include + + +position createPosition(void *Taux, uint text_length) + { + position P; + lztrie T = (lztrie) Taux; + uint *Offset; + ulong n, M, superblock_size, current_superblock, + starting_pos, i, depth, pos; + trieNode node; + + P = malloc(sizeof(struct tpos)); + n = T->n; + P->SBlock_size = 32;//bits(n-1); // superblock size is log n + P->nbitsSB = bits(text_length-1); + P->nSuperBlock = (ulong)ceil((double)n/P->SBlock_size); // number of superblocks + P->SuperBlock = malloc(((P->nbitsSB*P->nSuperBlock+W-1)/W)*sizeof(uint)); + P->Tlength = text_length; + M = 0; // maximum superblock size (in number of characters) + + current_superblock = 0; + superblock_size = 0; + starting_pos = 0; + P->nOffset = n; + Offset = malloc(n*sizeof(uint)); // array for temporary use + + bitput(P->SuperBlock, 0, P->nbitsSB, 0); + + for (i = 0, pos = 0; i < n; i++) { + node = mapto(T->Node, i); + depth = depthLZTrie(T, node); + pos += depth; + + superblock_size++; + + if ((superblock_size > P->SBlock_size) || (i==0)) { + if (i) + bitput(P->SuperBlock,P->nbitsSB*(++current_superblock),P->nbitsSB,pos); + if (i!=0 && Offset[i-1]>M) M = Offset[i-1]; + superblock_size = 1; + starting_pos = pos; + } + + Offset[i] = pos-starting_pos; + } + + if (Offset[i-1]>M) M = Offset[i-1]; + + P->nbitsOffs = bits(M-1); + P->Offset = malloc(((P->nOffset*P->nbitsOffs+W-1)/W)*sizeof(uint)); + for (i = 0; i < n; i++) + bitput(P->Offset, i*P->nbitsOffs, P->nbitsOffs, Offset[i]); + + free(Offset); + + return P; + } + + +// given a phrase id, gets the corresponding text position + +ulong getPosition(position P, uint id) + { + if (id > P->nOffset) return P->Tlength; + else { + ulong posSB = (id-1)>>5;///P->SBlock_size;//(ulong)ceil((double)id/P->SBlock_size)-1; + return bitget(P->SuperBlock,posSB*P->nbitsSB,P->nbitsSB) + + bitget(P->Offset,(id-1)*P->nbitsOffs,P->nbitsOffs); + } + } + + +// given a text position, gets the identifier of the +// LZ78 phrase containing that position + +ulong getphrasePosition(position P, ulong text_pos) + { + ulong li, ls, nbits, elem, med, SB, temp, i; + + li = 0; + ls = P->nSuperBlock-1; + nbits = P->nbitsSB; + while ((ls-li+1) > 0) { + med = (li+ls)/2; + elem = bitget(P->SuperBlock,med*nbits,nbits); + if (elem == text_pos) break; + if (elem < text_pos) li = med+1; + else ls = med - 1; + } + if (elem > text_pos && med) med--; + SB = bitget(P->SuperBlock,med*nbits,nbits); + + temp = li; + li = med*P->SBlock_size; + + if (med+1 == P->nSuperBlock) + ls = P->nOffset-1; + else + ls = (med+1)*P->SBlock_size - 1; + + nbits = P->nbitsOffs; + + while ((ls-li+1) > 8) { + med = (li+ls)/2; + elem = bitget(P->Offset,med*nbits,nbits)+SB; + if (elem == text_pos) break; + if (elem < text_pos) li = med+1; + else ls = med-1; + } + + for (i = li; i <= ls; i++) + if ((elem=bitget(P->Offset,i*nbits,nbits) + SB) >= text_pos) break; + + if (elem > text_pos) i--; + + return i+(elem>text_pos); + } + + +ulong sizeofPosition(position P) + { + return sizeof(struct tpos) + + ((P->nbitsSB*P->nSuperBlock+W-1)/W)*sizeof(uint) + + ((P->nOffset*P->nbitsOffs+W-1)/W)*sizeof(uint); + } + +void destroyPosition(position P) + { + if (!P) return; + free(P->SuperBlock); + free(P->Offset); + } + + +position loadPosition(FILE *f, uint text_length) + { + position P; + uint aux; + + P = malloc (sizeof(struct tpos)); + + if (fread(&P->nSuperBlock,sizeof(uint),1,f) != 1) { + fprintf(stderr,"Error: Cannot read LZTrie from file\n"); + exit(1); + } + + P->nbitsSB = bits(text_length-1); + aux = (((unsigned long long) P->nbitsSB*P->nSuperBlock+W-1)/W); + P->SuperBlock = malloc(aux*sizeof(uint)); + + if (fread(P->SuperBlock,sizeof(uint),aux,f) != aux) { + fprintf(stderr,"Error: Cannot read LZTrie from file\n"); + exit(1); + } + + P->SBlock_size = 32; + + P->Tlength = text_length; + + if (fread(&P->nOffset,sizeof(uint),1,f) != 1) { + fprintf(stderr,"Error: Cannot read LZTrie from file\n"); + exit(1); + } + + if (fread(&P->nbitsOffs,sizeof(uint),1,f) != 1) { + fprintf(stderr,"Error: Cannot read LZTrie from file\n"); + exit(1); + } + + aux = (((unsigned long long)P->nOffset*P->nbitsOffs+W-1)/W); + P->Offset = malloc(aux*sizeof(uint)); + if (fread(P->Offset,sizeof(uint),aux,f) != aux) { + fprintf(stderr,"Error: Cannot read LZTrie from file\n"); + exit(1); + } + + return P; + } + + +void savePosition(FILE *f, position P) + { + uint aux; + + if (fwrite(&P->nSuperBlock,sizeof(uint),1,f) != 1) { + fprintf(stderr,"Error: Cannot write LZTrie on file\n"); + exit(1); + } + + aux = (((unsigned long long) P->nbitsSB*P->nSuperBlock+W-1)/W); + + if (fwrite(P->SuperBlock,sizeof(uint),aux,f) != aux) { + fprintf(stderr,"Error: Cannot write LZTrie on file\n"); + exit(1); + } + + if (fwrite(&P->nOffset,sizeof(uint),1,f) != 1) { + fprintf(stderr,"Error: Cannot write LZTrie on file\n"); + exit(1); + } + + if (fwrite(&P->nbitsOffs,sizeof(uint),1,f) != 1) { + fprintf(stderr,"Error: Cannot write LZTrie on file\n"); + exit(1); + } + + aux = (((unsigned long long)P->nOffset*P->nbitsOffs+W-1)/W); + + if (fwrite(P->Offset,sizeof(uint),aux,f) != aux) { + fprintf(stderr,"Error: Cannot write LZTrie on file\n"); + exit(1); + } + } + + + diff --git a/lzindex/position.h b/lzindex/position.h new file mode 100644 index 0000000..f192c3f --- /dev/null +++ b/lzindex/position.h @@ -0,0 +1,34 @@ + +#ifndef POSITIONINCLUDED +#define POSITIONINCLUDED + +//#include "lztrie.h" +#include "nodemap.h" + + +typedef struct tpos + { + uint *SuperBlock; // array with starting positions of superblocks + uint nSuperBlock; // number of superblocks + uint nbitsSB; // number of bits used for starting positions + uint SBlock_size; // superblock size + uint *Offset; // array of offsets of each phrase within each superblock + uint nOffset; // size of the Offset array (number of LZ78 phrases) + uint nbitsOffs; // number of bits used per offset + ulong Tlength; // text length + } *position; + + +position createPosition(void *T, uint text_length); + +ulong getPosition(position P, uint id); + +ulong sizeofPosition(position P); + +void destroyPosition(position P); + +void savePosition(FILE *f, position P); + +position loadPosition(FILE *f, uint text_length); + +#endif diff --git a/lzindex/test.c b/lzindex/test.c new file mode 100644 index 0000000..c0d2694 --- /dev/null +++ b/lzindex/test.c @@ -0,0 +1,24 @@ +#include "stdio.h" +#include "string.h" +#include "lztrie.h" + +int main() +{ + byte *text = (byte*)"ababacaaaaaaaaaaaaaaaaaaaffffffffffffffdddddddddssssssssssfffffffffffzzzbbbbbbbbbbbbbbbbbbbbbbbbbabababaccacacababababababababbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbaaaaaaaaaacccdsfffffffffffffffffffffffaslköfjaösldjföasjdgajsdföljasdkfljaölsdjkföskjfölakjdföklajdökfjlaösldjföaögoiubvoixcupvozucvpouizxcpovui,qwner.,qwenrqbwe,rbqwmerbmwerbm"; + ulong n = strlen((char *)text); + ulong i; + byte *newt = 0; + ulong l = 0; + + lztrie lz = buildLZTrie(text, 0, n); + + printf("extracting:\n"); + extract(lz, 0, n, &newt, &l); + for (i = 0; i < n; ++i) + if (newt[i] != text[i]) { + printf("texts differ at %lu\n", i); + exit(0); + } + + return 0; +} diff --git a/lzindex/trie.c b/lzindex/trie.c new file mode 100644 index 0000000..841801d --- /dev/null +++ b/lzindex/trie.c @@ -0,0 +1,160 @@ + + // LZ78 trie data structure + +#include "trie.h" + + // creates trie + +trie createTrie (void) + + { trie pTrie; + uint i; + pTrie = malloc (sizeof(struct strie)); + pTrie->nid = 0; + pTrie->trie.id = pTrie->nid++; + pTrie->trie.nchildren = 0; + pTrie->trie.children = NULL; + pTrie->heaps[0] = createHeap(sizeof(triebody)); + for (i=1;i<256;i++) + pTrie->heaps[i] = createHeap(i*sizeof(struct schild)); + return pTrie; + } + +uint sizeofTrie(triebody *t) + { + uint i, size = 0; + size = sizeof(uint)+sizeof(short)+sizeof(struct schild *); + for (i=0;inchildren;i++) + size += sizeof(byte)+sizeof(struct striebody *)+sizeofTrie(t->children[i].trie); + return size; + } + + // frees the trie + +void destroyTrie (trie pTrie) + + { uint i; + for (i=0;i<256;i++) destroyHeap (pTrie->heaps[i]); + free (pTrie); + } + + // inserts word[0...] into pTrie and returns new text ptr + // insertion proceeds until we get a new trie node + +byte *insertTrie (trie pTrie, byte *word) + + { triebody *t = &pTrie->trie; + triebody *nt; + struct schild *newc; + int i,j; + int m = 0; + // traverse pTrie with word[0...] + while (true) + { i = 0; + while (i < t->nchildren) + { if (t->children[i].car >= word[m]) break; + i++; + } + if ((i == t->nchildren) || (t->children[i].car > word[m])) + break; // not found, get out + t = t->children[i].trie; + m++; + } + // at this point we fell off the trie, which is guaranteed to occur + // since the text finishes with the unique character 0 + newc = mallocHeap(pTrie->heaps[t->nchildren+1]); + memcpy (newc,t->children,i*sizeof(struct schild)); + memcpy (newc+i+1,t->children+i,(t->nchildren-i)*sizeof(struct schild)); + freeHeap (pTrie->heaps[t->nchildren],t->children); + t->children = newc; + t->children[i].car = word[m]; + nt = mallocHeap (pTrie->heaps[0]); + t->children[i].trie = nt; + t->nchildren++; + // new node created + nt->id = pTrie->nid++; + nt->nchildren = 0; + nt->children = NULL; + // return rest of text + return word+m+1; + } + + // inserts word[0..len-1] into pTrie, with id = id + // assumes that no two equal strings are ever inserted + +void insertstringTrie (trie pTrie, byte *word, uint len, uint id) + + { triebody *t,*nt; + uint i,j,m; + struct schild *newc; + // traverse pTrie with word[0...] + t = &pTrie->trie; + m = 0; + while (m < len) + { i = 0; + while (i < t->nchildren) + { if (t->children[i].car >= word[m]) break; + i++; + } + if ((i == t->nchildren) || (t->children[i].car > word[m])) + break; // not found, get out + t = t->children[i].trie; + m++; + } + // if we fell off the trie, we create more (unary and empty) nodes + while (m < len) + { newc = mallocHeap(pTrie->heaps[t->nchildren+1]); + memcpy (newc,t->children,i*sizeof(struct schild)); + memcpy (newc+i+1,t->children+i,(t->nchildren-i)*sizeof(struct schild)); + freeHeap (pTrie->heaps[t->nchildren],t->children); + t->children = newc; + if ((t->id == ~0) && (t->nchildren == 1)) pTrie->nid++; //not mute now + t->children[i].car = word[m]; + nt = mallocHeap (pTrie->heaps[0]); + nt->id = ~0; // empty node, at least for now + nt->nchildren = 0; + nt->children = NULL; + t->children[i].trie = nt; + t->nchildren++; + t = nt; + m++; i = 0; + } + // new node created or existing node with id added + t->id = id; + if (t->nchildren <= 1) pTrie->nid++; //not mute now + } + + // represents pTrie with parentheses, letters and ids + + // also returns the leftmost id +static uint traverse(triebody *t, uint *parent, byte *letters, uint *ids, + uint *pi, uint *pli) + { + uint i,chid,oldpli,myid; + myid = t->id; + // open parenthesis + bitclean(parent,*pi); + ids[myid] = *pi; + (*pi)++; + oldpli = *pli; + // traverse children + for (i=0; inchildren; i++) { + (*pli)++; + letters[*pli] = t->children[i].car; + chid = traverse(t->children[i].trie,parent,letters,ids,pi,pli); + } + // close parenthesis + bitset(parent,*pi); + (*pi)++; + // return leftmost id + return myid; + } + +void representTrie (trie pTrie, uint *parent, byte *letters, uint *ids) + { + uint pi,pli; + letters[0] = 0; // dummy value + pi = 0; pli = 0; + traverse(&pTrie->trie,parent,letters,ids,&pi,&pli); + } + diff --git a/lzindex/trie.h b/lzindex/trie.h new file mode 100644 index 0000000..51670cf --- /dev/null +++ b/lzindex/trie.h @@ -0,0 +1,40 @@ + + // LZ78 trie data structure + +#ifndef TRIEINCLUDED +#define TRIEINCLUDED + +#include "basics.h" +#include "heap.h" + +typedef struct striebody + { uint id; // node id + short nchildren; + struct schild + { byte car; + struct striebody *trie; + } *children; + } triebody; + +typedef struct strie + { triebody trie; // trie + heap heaps[256]; // heaps + uint nid; // nr of nodes + } *trie; + + // creates trie +trie createTrie (void); + // inserts word[0...] into pTrie and returns new text ptr + // insertion proceeds until we get a new trie node +uint sizeofTrie(triebody *t); + +byte *insertTrie (trie pTrie, byte *word/*, ulong *pos*/); + // inserts word[0..len-1] into pTrie, with id = id + // assumes that no two equal strings are ever inserted +void insertstringTrie (trie pTrie, byte *word, uint len, uint id); + // frees the trie +void destroyTrie (trie pTrie); + // represents pTrie with parentheses, letters and ids +void representTrie (trie pTrie, uint *parent, byte *letters, uint *ids); + +#endif