--- /dev/null
+
+// Basics
+
+// #include "basics.h" included later to avoid macro recursion for malloc
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+uint nbits_aux1, *e_aux1, p_aux1;
+uint nbits_aux2, *e_aux2, p_aux2;
+uint nbits_aux3, *e_aux3, p_aux3;
+
+ // Memory management
+
+#ifdef MEMCTRL
+int account = 1;
+int currmem = 0;
+int newmem;
+int maxmem = 0;
+#endif
+
+void *Malloc (int n)
+
+ { void *p;
+ if (n == 0) return NULL;
+#ifndef MEMCTRL
+ p = (void*) malloc (n);
+#else
+ p = (void*) (malloc (n+sizeof(int))+sizeof(int));
+#endif
+ if (p == NULL)
+ { fprintf (stderr,"Could not allocate %i bytes\n",n);
+ exit(1);
+ }
+#ifdef MEMCTRL
+ *(((int*)p)-1) = n;
+ if (account)
+ { newmem = currmem+n;
+ if (newmem > maxmem) maxmem = newmem;
+ if (currmem/1024 != newmem/1024)
+ printf ("Memory: %i Kb, maximum: %i Kb\n",newmem/1024,maxmem/1024);
+ currmem = newmem;
+ }
+#endif
+ return p;
+ }
+
+void Free (void *p)
+
+ {
+#ifndef MEMCTRL
+ if (p) free (p);
+#else
+ if (!p) return;
+ if (account)
+ { newmem = currmem - *(((int*)p)-1);
+ free ((void*)(((int)p)-sizeof(int)));
+ if (currmem/1024 != newmem/1024)
+ printf ("Memory: %i Kb, maximum: %i Kb\n",newmem/1024,maxmem/1024);
+ currmem = newmem;
+ }
+#endif
+ }
+
+void *Realloc (void *p, int n)
+
+ { if (p == NULL) return Malloc (n);
+ if (n == 0) { Free(p); return NULL; }
+#ifndef MEMCTRL
+ p = (void*) realloc (p,n);
+#else
+ if (account)
+ newmem = currmem - *(((int*)p)-1);
+ p = (void*) (realloc ((void*)(((int)p)-sizeof(int)),n+sizeof(int))+sizeof(int));
+ *(((int*)p)-1) = n;
+#endif
+ if (p == NULL)
+ { fprintf (stderr,"Could not allocate %i bytes\n",n);
+ exit(1);
+ }
+#ifdef MEMCTRL
+ if (account)
+ { newmem = newmem + n;
+ if (newmem > maxmem) maxmem = newmem;
+ if (currmem/1024 != newmem/1024)
+ printf ("Memory: %i Kb, maximum: %i Kb\n",newmem/1024,maxmem/1024);
+ currmem = newmem;
+ }
+#endif
+ return p;
+ }
+
+#include "basics.h"
+
+ // bits needed to represent a number between 0 and n
+
+uint bits (uint n)
+
+ { uint b = 0;
+ while (n)
+ { b++; n >>= 1; }
+ return b;
+ }
+
+
+uint bitget_aux1()
+ {
+ register uint i=(p_aux1>>5), j=p_aux1&0x1F, answ;
+ if (j+nbits_aux1 <= W)
+ answ = (e_aux1[i] << (W-j-nbits_aux1)) >> (W-nbits_aux1);
+ else {
+ answ = e_aux1[i] >> j;
+ answ = answ | ((e_aux1[i+1] << (W-j-nbits_aux1)) >> (W-nbits_aux1));
+ }
+ return answ;
+ }
+/*
+uint bitget_aux2()
+ {
+ register uint i=(p_aux2>>5), j=p_aux2&0x1F, answ;
+ if (j+nbits_aux2 <= W)
+ answ = (e_aux2[i] << (W-j-nbits_aux2)) >> (W-nbits_aux2);
+ else {
+ answ = e_aux2[i] >> j;
+ answ = answ | ((e_aux2[i+1] << (W-j-nbits_aux2)) >> (W-nbits_aux2));
+ }
+ return answ;
+ }
+
+uint bitget_aux3()
+ {
+ register uint i=(p_aux3>>5), j=p_aux3&0x1F, answ;
+ if (j+nbits_aux3 <= W)
+ answ = (e_aux3[i] << (W-j-nbits_aux3)) >> (W-nbits_aux3);
+ else {
+ answ = e_aux3[i] >> j;
+ answ = answ | ((e_aux3[i+1] << (W-j-nbits_aux3)) >> (W-nbits_aux3));
+ }
+ return answ;
+ }*/
+
+
+ // returns e[p..p+len-1], assuming len <= W
+
+uint bitget (uint *e, uint p, uint len)
+ {
+ register uint i=(p>>5), j=p&0x1F, answ;
+ if (j+len <= W)
+ answ = (e[i] << (W-j-len)) >> (W-len);
+ else {
+ answ = e[i] >> j;
+ answ = answ | ((e[i+1] << (W-j-len)) >> (W-len));
+ }
+ return answ;
+ }
+
+ // writes e[p..p+len-1] = s, len <= W
+
+void bitput (register uint *e, register uint p,
+ register uint len, register uint s)
+
+ { e += p >> bitsW; p &= (1<<bitsW)-1;
+ if (len == W)
+ { *e |= (*e & ((1<<p)-1)) | (s << p);
+ if (!p) return;
+ e++;
+ *e = (*e & ~((1<<p)-1)) | (s >> (W-p));
+ }
+ else { if (p+len <= W)
+ { *e = (*e & ~(((1<<len)-1)<<p)) | (s << p);
+ return;
+ }
+ *e = (*e & ((1<<p)-1)) | (s << p);
+ e++; len -= W-p;
+ *e = (*e & ~((1<<len)-1)) | (s >> (W-p));
+ }
+ }
--- /dev/null
+
+// Basics
+
+#ifndef BASICSINCLUDED
+#define BASICSINCLUDED
+
+
+ // Includes
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/times.h>
+#include <unistd.h>
+
+ // Memory management
+
+#define malloc(n) Malloc(n)
+#define free(p) Free(p)
+#define realloc(p,n) Realloc(p,n)
+
+void *Malloc (int n);
+void Free (void *p);
+void *Realloc (void *p, int n);
+
+ // Data types
+
+typedef unsigned char byte;
+typedef uint trieNode; // a node of lztrie
+#ifndef uint
+typedef unsigned int uint;
+#endif
+
+#ifndef __cplusplus
+typedef int bool;
+#endif
+#define true 1
+#define false 0
+
+#define max(x,y) ((x)>(y)?(x):(y))
+#define min(x,y) ((x)<(y)?(x):(y))
+
+ // Bitstream management
+
+#define W (8*sizeof(uint))
+#define bitsW 5 // OJO
+
+ // bits needed to represent a number between 0 and n
+uint bits (uint n);
+ // returns e[p..p+len-1], assuming len <= W
+uint bitget (uint *e, uint p, uint len);
+ // writes e[p..p+len-1] = s, assuming len <= W
+void bitput (uint *e, uint p, uint len, uint s);
+ // reads bit p from e
+#define bitget1(e,p) ((e)[(p)/W] & (1<<((p)%W)))
+ // sets bit p in e
+#define bitset(e,p) ((e)[(p)/W] |= (1<<((p)%W)))
+ // cleans bit p in e
+#define bitclean(e,p) ((e)[(p)/W] &= ~(1<<((p)%W)))
+
+extern uint nbits_aux1, *e_aux1, p_aux1;
+
+uint bitget_aux1();
+
+extern uint nbits_aux2, *e_aux2, p_aux2;
+
+uint bitget_aux2();
+
+extern uint nbits_aux3, *e_aux3, p_aux3;
+
+uint bitget_aux3();
+#endif
--- /dev/null
+
+
+// Implements operations over a bitmap
+
+#include "bitmap.h"
+
+ // In theory, we should have superblocks of size s=log^2 n divided into
+ // blocks of size b=(log n)/2. This takes
+ // O(n log n / log^2 n + n log log n / log n + log n sqrt n log log n) bits
+ // In practice, we can have any s and b, and the needed amount of bits is
+ // (n/s) log n + (n/b) log s + b 2^b log b bits
+ // Optimizing it turns out that s should be exactly s = b log n
+ // Optimizing b is more difficult but could be done numerically.
+ // However, the exponential table does no more than popcounting, so why not
+ // setting up a popcount algorithm tailored to the computer register size,
+ // defining that size as b, and proceeding.
+
+const unsigned char popc[] =
+ {
+ 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
+ 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+ 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+ 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
+ 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+ 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
+ 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
+ 3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8,
+ };
+
+uint popcount (register uint x)
+ {
+ return popc[x&0xFF] + popc[(x>>8)&0xFF] + popc[(x>>16)&0xFF] + popc[x>>24];
+ }
+
+uint popcount8(register int x)
+ {
+ return popc[x & 0xff];
+ }
+
+
+ // creates a bitmap structure from a bitstring, which is shared
+
+bitmap createBitmap (uint *string, uint n, bool isfullbmap)
+ {
+ bitmap B;
+ uint i,j,pop,bpop,pos;
+ uint nb,ns,words;
+ B = malloc (sizeof(struct sbitmap));
+ B->data = string;
+ B->n = n; words = (n+W-1)/W;
+ ns = (n+256-1)/256; nb = 256/W; // adjustments
+ B->bdata = malloc (ns*nb*sizeof(byte));
+ B->sdata = malloc (ns*sizeof(int));
+#ifdef INDEXREPORT
+ printf (" Bitmap over %i bits took %i bits\n", n,n+ns*nb*8+ns*32);
+#endif
+ pop = 0; pos = 0;
+ for (i=0;i<ns;i++) {
+ bpop = 0;
+ B->sdata[i] = pop;
+ for (j=0;j<nb;j++) {
+ if (pos == words) break;
+ B->bdata[pos++] = bpop;
+ bpop += popcount(*string++);
+ }
+ pop += bpop;
+ }
+ if (isfullbmap) {// creates the data structure to solve select_0() queries
+ B->bdata_0 = malloc(ns*nb*sizeof(byte));
+ B->sdata_0 = malloc(ns*sizeof(int));
+ string = B->data;
+ pop = 0; pos = 0;
+ for (i = 0; i < ns; i++) {
+ bpop = 0;
+ B->sdata_0[i] = pop;
+ for (j = 0; j < nb; j++) {
+ if (pos == words) break;
+ B->bdata_0[pos++] = bpop;
+ bpop += popcount(~(*string++));
+ }
+ pop += bpop;
+ }
+ }
+ else { B->bdata_0 = NULL; B->sdata_0 = NULL;}
+ return B;
+ }
+
+ // rank(i): how many 1's are there before position i, not included
+
+uint rank (bitmap B, uint i)
+ {
+ return B->sdata[i>>8] + B->bdata[i>>5] +
+ popcount (B->data[i>>5] & ((1<<(i&0x1F))-1));
+ }
+
+
+ // select_1(x): returns the position i of the bitmap B such that
+ // the number of ones up to position i is x.
+
+uint select_1(bitmap B, uint x)
+ {
+ uint s = 256;
+ uint l = 0, n = B->n, r = n/s,left;
+ uint mid = (l+r)/2;
+ uint ones, j;
+ uint rankmid = B->sdata[mid];
+ // first, binary search on the superblock array
+ while (l <= r) {
+ if (rankmid < x)
+ l = mid+1;
+ else
+ r = mid-1;
+ mid = (l+r)/2;
+ rankmid = B->sdata[mid];
+ }
+ // sequential search using popcount over an int
+ left = mid*8;
+ x -= rankmid;
+ j = B->data[left];
+ ones = popcount(j);
+ while (ones < x) {
+ x -= ones;
+ left++;
+ if (left > (n+W-1)/W) return n;
+ j = B->data[left];
+ ones = popcount(j);
+ }
+ // sequential search using popcount over a char
+ left = left*32;
+ rankmid = popcount8(j);
+ if (rankmid < x) {
+ j = j>>8;
+ x -= rankmid;
+ left += 8;
+ rankmid = popcount8(j);
+ if (rankmid < x) {
+ j = j>>8;
+ x -= rankmid;
+ left += 8;
+ rankmid = popcount8(j);
+ if (rankmid < x) {
+ j = j>>8;
+ x -= rankmid;
+ left += 8;
+ }
+ }
+ }
+ // finally sequential search bit per bit
+ while (x > 0) {
+ if (j&1) x--;
+ j = j>>1;
+ left++;
+ }
+ return left-1;
+}
+
+
+ // select_0(x): returns the position i of the bitmap B such that
+ // the number of zeros up to position i is x.
+
+uint select_0(bitmap B, uint x)
+ {
+ uint s = 256;
+ uint l = 0, n = B->n, r = n/s, left;
+ uint mid = (l+r)/2;
+ uint ones, j;
+ uint rankmid = B->sdata_0[mid];
+ // first, binary search on the superblock array
+ while (l <= r) {
+ if (rankmid < x)
+ l = mid+1;
+ else
+ r = mid-1;
+ mid = (l+r)/2;
+ rankmid = B->sdata_0[mid];
+ }
+ // sequential search using popcount over an int
+ left = mid*8;
+ x -= rankmid;
+ j = B->data[left];
+ ones = popcount(~j);
+ while (ones < x) {
+ x -= ones;
+ left++;
+ if (left > (n+W-1)/W) return n;
+ j = B->data[left];
+ ones = popcount(~j);
+ }
+ // sequential search using popcount over a char
+ left = left*32;
+ rankmid = popcount8(~j);
+ if (rankmid < x) {
+ j = j>>8;
+ x -= rankmid;
+ left += 8;
+ rankmid = popcount8(~j);
+ if (rankmid < x) {
+ j = j>>8;
+ x -= rankmid;
+ left += 8;
+ rankmid = popcount8(~j);
+ if (rankmid < x) {
+ j = j>>8;
+ x -= rankmid;
+ left += 8;
+ }
+ }
+ }
+ // finally sequential search bit per bit
+ while (x > 0) {
+ if ((j&1)==0) x--;
+ j = j>>1;
+ left++;
+ }
+ return left-1;
+ }
+
+
+
+
+ // destroys the bitmap, freeing the original bitstream
+
+void destroyBitmap (bitmap B)
+
+ { free (B->data);
+ free (B->bdata);
+ free (B->sdata);
+ free (B);
+ }
+
+
+uint sizeofBitmap(bitmap B)
+ {
+ return sizeof(struct sbitmap) +
+ ((B->n+W-1)/W)*sizeof(uint) + // data
+ ((B->n+256-1)/256)*sizeof(int) + // sdata
+ ((B->n+256-1)/256)*(256/W)*sizeof(byte); // bdata
+ //((B->sdata_0)?((B->n+256-1)/256)*sizeof(int):0) +
+ //((B->bdata_0)?((B->n+256-1)/256)*(256/W)*sizeof(byte):0);
+ }
+
--- /dev/null
+
+
+// Implements operations over a bitmap
+
+#ifndef BITMAPINCLUDED
+#define BITMAPINCLUDED
+
+#include "basics.h"
+
+typedef struct sbitmap
+ {
+ uint *data;
+ uint n; // # of bits
+ uint *sdata; // superblock counters
+ byte *bdata; // block counters
+ uint *sdata_0; // superblock counters for select_0() queries
+ byte *bdata_0; // block counters for select_0() queries
+ } *bitmap;
+
+
+ // creates a bitmap structure from a bitstring, which gets owned
+bitmap createBitmap (uint *string, uint n, bool isfullbmap);
+ // rank(i): how many 1's are there before position i, not included
+uint rank (bitmap B, uint i);
+ // select_1(x): returns the position i of the bitmap B such that
+ // the number of ones up to position i is x.
+uint select_1(bitmap B, uint x);
+ // select_0(x): returns the position i of the bitmap B such that
+ // the number of zeros up to position i is x.
+uint select_0(bitmap B, uint x);
+ // destroys the bitmap, freeing the original bitstream
+void destroyBitmap (bitmap B);
+ // popcounts 1's in x
+uint popcount (register uint x);
+
+uint sizeofBitmap(bitmap B);
+#endif
+
--- /dev/null
+
+// Closed hash table
+
+#include "hash.h"
+
+ // creates a table to store up to n values with guaranteed load factor.
+ // vbits = # of bits per entry, ENTRIES CANNOT HAVE ZERO VALUE
+
+hash createHash (uint n, uint vbits, float factor)
+
+ { hash H = malloc (sizeof(struct shash));
+ int i,N;
+ if (n == 0) return NULL;
+ N = n*factor; if (N <= n) N = n+1;
+ H->size = (1 << bits(N-1)) - 1;
+ H->bits = vbits;
+ H->table = malloc ((((H->size+1)*vbits+W-1)/W)*sizeof(uint));
+#ifdef INDEXREPORT
+ printf (" Also created hash table of %i bits\n",
+ (((H->size+1)*vbits+W-1)/W)*W);
+#endif
+ for (i=(((H->size+1)*vbits+W-1)/W)-1;i>=0;i--) H->table[i] = 0;
+ return H;
+ }
+
+ // frees the structure
+
+void destroyHash (hash H)
+
+ { if (H == NULL) return;
+ free (H->table);
+ free (H);
+ }
+
+ // inserts an entry, not prepared for overflow
+
+void insertHash (hash H, uint key, uint value)
+
+ { uint pos = (key*PRIME1) & H->size;
+ if (bitget(H->table,pos*H->bits,H->bits) != 0)
+ { do pos = (pos + PRIME2) & H->size;
+ while (bitget(H->table,pos*H->bits,H->bits) != 0);
+ }
+ bitput(H->table,pos*H->bits,H->bits,value);
+ }
+
+ // looks for a key, returns first value (zero => no values)
+ // writes in pos a handle to get next values
+
+uint searchHash (hash H, uint key, handle *h)
+
+ { *h = (key*PRIME1) & H->size;
+ return bitget(H->table,*h*H->bits,H->bits);
+ }
+
+ // gets following values using handle *pos, which is rewritten
+ // returns next value (zero => no more values)
+
+uint nextHash (hash H, handle *h)
+
+ { *h = (*h +PRIME2) & H->size;
+ return bitget(H->table,*h*H->bits,H->bits);
+ }
+
+
+uint sizeofHash(hash H)
+ {
+ if (H) return sizeof(struct shash) +
+ (((H->size+1)*H->bits+W-1)/W)*sizeof(uint);
+ else return 0;
+ }
+
--- /dev/null
+
+// Closed hash table that does not store the keys
+
+// needs the maximum number of elements stored to be declared at creation time
+// cannot remove elements
+// does not store the key, rather, it reports all the collisioning values
+// right value under collisions
+// CANNOT STORE ZERO in the value
+
+#ifndef HASHINCLUDED
+#define HASHINCLUDED
+
+#include "basics.h"
+
+typedef struct shash
+ { uint size; // # of table entries
+ uint bits; // bits per entry
+ uint *table; // data
+ } *hash;
+
+typedef uint handle;
+
+ // creates a table to store up to n values with guaranteed load factor.
+ // vbits = # of bits per entry, ENTRIES CANNOT HAVE VALUE ZERO
+hash createHash (uint n, uint vbits, float factor);
+ // frees the structure
+void destroyHash (hash H);
+ // inserts an entry
+void insertHash (hash H, uint key, uint elem);
+ // looks for a key, returns first value (zero => no values)
+ // writes in pos a handle to get next values
+uint searchHash (hash H, uint key, handle *h);
+ // gets following values using handle *h, which is rewritten
+ // returns next value (zero => no more values)
+uint nextHash (hash H, handle *h);
+
+ // two large primes found with etc/hash.c
+#define PRIME1 ((uint)4294967279)
+#define PRIME2 ((uint)4294967197)
+
+#endif
--- /dev/null
+
+ // Heap of fixed size objects, fast to alloc and especially to totally free
+
+#include "heap.h"
+
+#define BLK 2048
+
+#ifdef MEMCTRL
+extern int account,currmem,newmem,maxmem;
+#endif
+
+ // Creates a heap of elements of size siz
+
+heap createHeap (uint siz)
+
+ { heap H;
+ H = malloc (sizeof(struct sheap));
+ H->siz = siz;
+ H->cap = BLK/siz; if (H->cap == 0) H->cap = 1;
+ H->blocks = NULL;
+ H->free = NULL;
+ H->first = H->cap;
+#ifdef MEMCTRL
+ H->totsize = 0;
+#endif
+ return H;
+ }
+
+ // Gets a new element from H
+
+void *mallocHeap (heap H)
+
+ { void *elem;
+ void **b;
+#ifdef MEMCTRL
+ H->totsize++;
+ newmem = currmem+H->siz;
+ if (newmem > maxmem) maxmem = newmem;
+ if (currmem/1024 != newmem/1024)
+ printf ("Memory: %i Kb, maximum: %i Kb\n",newmem/1024,maxmem/1024);
+ currmem = newmem;
+#endif
+ if (H->free)
+ { elem = (void*)H->free;
+ H->free = *H->free;
+ return elem;
+ }
+ if (H->first != H->cap)
+ { elem = (void*)(((uint)(H->blocks+1)) + H->first * H->siz);
+ H->first++;
+ return elem;
+ }
+#ifdef MEMCTRL
+ account = 0;
+#endif
+ b = malloc (sizeof(void*) + H->cap * H->siz);
+#ifdef MEMCTRL
+ account = 1;
+#endif
+ b[0] = H->blocks;
+ H->blocks = b;
+ elem = (void*)(H->blocks+1);
+ H->first = 1;
+ return elem;
+ }
+
+ // Frees ptr from heap H
+
+void freeHeap (heap H, void *ptr)
+
+ { void **p;
+ if (!ptr) return;
+#ifdef MEMCTRL
+ H->totsize--;
+ newmem = currmem-H->siz;
+ if (currmem/1024 != newmem/1024)
+ printf ("Memory: %i Kb, maximum: %i Kb\n",newmem/1024,maxmem/1024);
+ currmem = newmem;
+#endif
+ p = (void**)ptr;
+ *p = H->free;
+ H->free = p;
+ }
+
+ // Frees everything in heap H
+
+void destroyHeap (heap H)
+
+ { void **b;
+#ifdef MEMCTRL
+ account = 0;
+#endif
+ while ((b = H->blocks) != NULL)
+ { H->blocks = b[0];
+ free (b);
+ }
+#ifdef MEMCTRL
+ account = 1;
+ newmem = currmem-H->totsize*H->siz;
+ if (currmem/1024 != newmem/1024)
+ printf ("Memory: %i Kb, maximum: %i Kb\n",newmem/1024,maxmem/1024);
+ currmem = newmem;
+#endif
+ free (H);
+ }
--- /dev/null
+
+ // Heap of fixed size objects, fast to alloc and especially to totally free
+
+#ifndef HEAPINCLUDED
+#define HEAPINCLUDED
+
+#include "basics.h"
+
+typedef struct sheap
+ { uint siz; // element size
+ uint cap; // block capacity in # of elems
+ void **blocks; // list of blocks, next is block[0]
+ void **free; // list of free positions, each contains next
+ uint first; // first free position in current block
+ uint totsize; // total memory allocated here
+ } *heap;
+
+ // Creates a heap of elements of size siz
+heap createHeap (uint siz);
+ // Gets a new element from H
+void *mallocHeap (heap H);
+ // Frees ptr from heap H
+void freeHeap (heap H, void *ptr);
+ // Frees everything in heap H
+void destroyHeap (heap H);
+
+#endif
--- /dev/null
+
+
+// Implements the LZtrie data structure
+
+#include "lztrie.h"
+
+ // creates a lztrie structure from a parentheses bitstring,
+ // a letter array in preorder, and an id array in preorder,
+ // all of which except the latter become owned
+ // n is the total number of nodes (n letters/ids, 2n parentheses)
+
+lztrie createLZTrie(uint *string, byte *letters, uint *Node, uint n, uint text_length)
+ {
+ lztrie T;
+ int i;
+ T = malloc(sizeof(struct slztrie));
+ T->data = string;
+ T->pdata = createParentheses(string,2*n,true);
+ T->n = n;
+ T->nbits = bits(n-1);
+ T->letters = letters;
+ T->Node = createNodemap(Node, n, n);
+ T->TPos = createPosition(T, text_length);
+ T->text_length = text_length;
+ // boost first access
+ T->boost = malloc(256*sizeof(trieNode));
+ for (i=0;i<256;i++) T->boost[i] = NULLT;
+ i = 1; // shortcut for first child of root
+ while (i != 2*n-1) { // shortcut for its closing parenthesis
+ T->boost[T->letters[i-rank(T->pdata->bdata,i)]] = i;
+ // shortcut for leftrankLZTrie
+ i = findclose(T->pdata,i)+1;
+ }
+ return T;
+ }
+
+// builds an lztrie from a text. "s" is the text terminator.
+lztrie buildLZTrie(byte *text, byte s, uint text_length)
+ {
+ trie T;
+ uint n;
+ uint *parent;
+ byte *letters;
+ lztrie LZT;
+ uint *Node;
+ // first creates a full trie T
+ T = createTrie();
+ do {
+ text = insertTrie(T,text);
+ }
+ while (text[-1] != s);
+ // now compresses it
+ n = T->nid;
+ parent = malloc(((2*n+W-1)/W)*sizeof(uint));
+ letters = malloc(n*sizeof(byte));
+ // Aqui pedir espacio para Node, ver como lo voy a construir en vez de sacar solo los ids
+ Node = malloc(n*sizeof(uint));
+ representTrie(T,parent,letters,Node);
+
+ destroyTrie(T);
+ LZT = createLZTrie(parent,letters,Node,n,text_length);
+ return LZT;
+ }
+
+
+
+ // frees LZTrie structure, including the owned data
+
+void destroyLZTrie(lztrie T)
+ {
+ destroyParentheses(T->pdata);
+ free(T->letters);
+ destroyNodemap(T->Node);
+ destroyPosition(T->TPos);
+ free(T->boost);
+ free(T);
+ }
+
+ // stores lztrie T on file f
+
+void saveLZTrie (lztrie T, FILE *f)
+ {
+ if (fwrite (&T->n,sizeof(uint),1,f) != 1) {
+ fprintf (stderr,"Error: Cannot write LZTrie on file\n");
+ exit(1);
+ }
+ if (fwrite (T->data,sizeof(uint),(2*T->n+W-1)/W,f) != (2*T->n+W-1)/W) {
+ fprintf (stderr,"Error: Cannot write LZTrie on file\n");
+ exit(1);
+ }
+ if (fwrite (T->letters,sizeof(byte),T->n,f) != T->n) {
+ fprintf (stderr,"Error: Cannot write LZTrie on file\n");
+ exit(1);
+ }
+
+ saveNodemap(f, T->Node);
+ fwrite(&T->text_length, sizeof(uint), 1, f);
+ savePosition(f, T->TPos);
+ }
+
+ // loads lztrie T from file f
+
+lztrie loadLZTrie (FILE *f)
+ {
+ lztrie T;
+ int i;
+ T = malloc (sizeof(struct slztrie));
+ if (fread (&T->n,sizeof(uint),1,f) != 1) {
+ fprintf (stderr,"Error: Cannot read LZTrie from file\n");
+ exit(1);
+ }
+ T->data = malloc (((2*T->n+W-1)/W)*sizeof(uint));
+ if (fread (T->data,sizeof(uint),(2*T->n+W-1)/W,f) != (2*T->n+W-1)/W) {
+ fprintf (stderr,"Error: Cannot read LZTrie from file\n");
+ exit(1);
+ }
+ T->pdata = createParentheses (T->data,2*T->n,true);
+ T->nbits = bits(T->n-1);
+ T->letters = malloc (T->n*sizeof(byte));
+ if (fread (T->letters,sizeof(byte),T->n,f) != T->n) {
+ fprintf (stderr,"Error: Cannot read LZTrie from file\n");
+ exit(1);
+ }
+
+ T->Node = loadNodemap(f);
+ fread(&T->text_length, sizeof(uint), 1, f);
+ T->TPos = loadPosition(f, T->text_length);
+
+ // boost first access
+ T->boost = malloc(256*sizeof(trieNode));
+ for (i=0;i<256;i++) T->boost[i] = NULLT;
+ i = 1; // shortcut for first child of root
+ while (i != 2*T->n-1) { // shortcut for its closing parenthesis
+ T->boost[T->letters[i-rank(T->pdata->bdata,i)]] = i;
+ // shortcut for leftrankLZTrie
+ i = findclose(T->pdata,i)+1;
+ }
+
+ return T;
+ }
+
+
+ // letter by which node i descends
+
+byte letterLZTrie (lztrie T, trieNode i)
+ {
+ return T->letters[i-rank(T->pdata->bdata,i)]; // shortcut leftrankLZTrie
+ }
+
+ // go down by letter c, if possible
+
+
+trieNode childLZTrie (lztrie T, trieNode i, byte c)
+ {
+ trieNode j;
+ byte tc;
+ j = i+1;
+ while (!bitget1(T->data,j)) { // there is a child here
+ tc = T->letters[j-rank(T->pdata->bdata,j)];
+ // shortcut for leftrankLZTrie: is a child by letter c?
+ if (tc > c) break;
+ if (tc == c) return j;
+ j = findclose(T->pdata,j)+1;
+ }
+ return NULLT; // no child to go down by c
+ }
+
+ // go up, if possible
+
+trieNode parentLZTrie (lztrie T, trieNode i)
+ {
+ if (i == ROOT) return NULLT; // parent of root
+ if (T->boost[T->letters[i-rank(T->pdata->bdata,i)]] == i) return ROOT;
+ return enclose (T->pdata,i);
+ }
+
+ // subtree size
+
+uint subtreesizeLZTrie (lztrie T, trieNode i)
+ {
+ trieNode j;
+ j = findclose (T->pdata,i);
+ return (j-i+1)/2;
+ }
+
+ // depth
+
+uint depthLZTrie (lztrie T, trieNode i)
+ {
+ return excess (T->pdata,i);
+ }
+
+ // smallest rank in subtree
+
+uint leftrankLZTrie (lztrie T, trieNode i)
+
+ { return i-rank(T->pdata->bdata,i);
+ }
+
+ // largest rank in subtree
+
+uint rightrankLZTrie (lztrie T, trieNode i)
+ {
+ trieNode j;
+ j = findclose (T->pdata,i);
+ return j-rank(T->pdata->bdata,j)-1;
+ }
+
+ // is node i ancestor of node j?
+
+bool ancestorLZTrie (lztrie T, trieNode i, trieNode j)
+ {
+ return (i <= j) && (j < findclose (T->pdata,i));
+ }
+
+ // next node from i, in preorder, adds/decrements depth accordingly
+ // assumes it *can* go on!
+
+trieNode nextLZTrie (lztrie T, trieNode i, uint *depth)
+ {
+ uint *data = T->data;
+ i++;
+ while (bitget1(data,i)) { i++; (*depth)--; }
+ (*depth)++;
+ return i;
+ }
+
+
+// extracts text from position "from" up to position "to".
+// Parameter "snippet_length" is the actual length of the snippet extracted.
+// The extracted text is stored in array "snippet".
+//
+void extract(lztrie T,
+ ulong from,
+ ulong to,
+ byte **snippet,
+ ulong *snippet_length)
+ {
+ ulong snpp_pos, posaux, idaux;
+ uint idfrom,idto;
+ trieNode p;
+ byte *snppt;
+ nodemap Node = T->Node;
+
+ if (to > (T->text_length-1)) to = T->text_length-1;
+ if (from > (T->text_length-1)) from = T->text_length-1;
+
+ *snippet_length = to-from+1;
+ *snippet = malloc(*snippet_length);
+
+ idfrom = getphrasePosition(T->TPos, from);
+ idto = getphrasePosition(T->TPos, to+1);
+ // *snippet_length = to-from+1;
+
+ posaux = getPosition(T->TPos, idto+1)-1;
+ p = mapto(Node, idto);
+
+ while (p&&(posaux > to)) {
+ p = parentLZTrie(T, p);
+ posaux--;
+ }
+
+ snpp_pos = (*snippet_length)-1;
+ snppt = *snippet;
+
+ for (idaux = idto; idaux != idfrom;) {
+ while (p) {
+ snppt[snpp_pos--] = letterLZTrie(T, p);
+ p = parentLZTrie(T, p);
+ }
+ p = mapto(Node, --idaux);
+ }
+ if (idfrom != idto) posaux = getPosition(T->TPos, idfrom+1)-(from!=0);
+ while (p && (posaux >= from)) {
+ snppt[snpp_pos--] = letterLZTrie(T, p);
+ p = parentLZTrie(T, p);
+ posaux--;
+ }
+ }
+
+uint sizeofLZTrie(lztrie T)
+ {
+ return sizeof(struct slztrie) +
+ sizeofParentheses(T->pdata) +
+ T->n*sizeof(byte) +
+ sizeofNodemap(T->Node) +
+ sizeofPosition(T->TPos);
+ }
--- /dev/null
+
+
+// Implements the LZtrie data structure
+
+#ifndef LZTRIEINCLUDED
+#define LZTRIEINCLUDED
+
+#include "basics.h"
+#include "trie.h"
+#include "parentheses.h"
+#include "nodemap.h"
+#include "position.h"
+
+//typedef uint trieNode; // a node of lztrie
+
+#define NULLT ((trieNode)(~0)) // a null node
+#define ROOT ((trieNode)(0)) // the root node
+
+typedef struct slztrie
+ {
+ uint *data; // bitmap data
+ parentheses pdata; // parentheses structure
+ uint n; // # of parentheses
+ byte *letters; // letters of the trie
+ uint nbits; // log n
+ nodemap Node; // ids of the trie
+ position TPos; // text position data structure
+ uint text_length; // length of the original text
+ trieNode *boost; // direct pointers to first children
+ } *lztrie;
+
+
+ // creates a lztrie structure from a parentheses bitstring,
+ // a letter array in preorder, and an id array in preorder,
+ // all of which except the latter become owned
+ // n is the total number of nodes (n letters/ids, 2n parentheses)
+lztrie createLZTrie (uint *string, byte *letters, uint *id, uint n, uint text_length);
+ // builds an lztrie from a text. "s" is the text terminator.
+#ifdef __cplusplus
+extern "C" {
+ lztrie buildLZTrie(byte *text, byte s, uint text_length);
+}
+#else
+ lztrie buildLZTrie(byte *text, byte s, uint text_length);
+#endif
+
+ // frees LZTrie structure, including the owned data
+void destroyLZTrie (lztrie T);
+ // stores lztrie T on file f
+void saveLZTrie (lztrie T, FILE *f);
+ // loads lztrie T from file f
+lztrie loadLZTrie (FILE *f);
+ // letter by which node i descends
+byte letterLZTrie (lztrie T, trieNode i);
+ // go down by letter c, if possible
+trieNode childLZTrie (lztrie T, trieNode i, byte c);
+ // go up, if possible
+trieNode parentLZTrie (lztrie T, trieNode i);
+ // subtree size
+uint subtreesizeLZTrie (lztrie T, trieNode i);
+ // smallest rank in subtree
+uint leftrankLZTrie (lztrie T, trieNode i);
+ // largest rank in subtree
+uint rightrankLZTrie (lztrie T, trieNode i);
+ // is node i ancestor of node j?
+bool ancestorLZTrie (lztrie T, trieNode i, trieNode j);
+ // next node from i, in preorder, adds/decrements depth accordingly
+ // assumes it *can* go on!
+trieNode nextLZTrie (lztrie T, trieNode i, uint *depth);
+ // size in bytes of LZTrie T
+uint sizeofLZTrie(lztrie T);
+
+#ifdef __cplusplus
+extern "C" {
+void extract(lztrie T, ulong from, ulong to, byte **snippet, ulong *snippet_length);
+}
+#else
+void extract(lztrie T, ulong from, ulong to, byte **snippet, ulong *snippet_length);
+#endif
+
+#endif
--- /dev/null
+#FLAGS = -g -lm
+FLAGS = -O9 -fomit-frame-pointer -W -Wall -Winline -DDEBUG=0 -DNDEBUG=1
+
+all: lztrie.a
+
+lztrie.a: lztrie.o nodemap.o position.o trie.o heap.o parentheses.o bitmap.o hash.o basics.o
+ ar rc lztrie.a lztrie.o nodemap.o position.o trie.o heap.o parentheses.o bitmap.o hash.o basics.o
+
+lztrie.o: lztrie.c
+ gcc $(FLAGS) -c lztrie.c
+nodemap.o: nodemap.c
+ gcc $(FLAGS) -c nodemap.c
+position.o: position.c
+ gcc $(FLAGS) -c position.c
+trie.o: trie.c
+ gcc $(FLAGS) -c trie.c
+heap.o: heap.c
+ gcc $(FLAGS) -c heap.c
+parentheses.o: parentheses.c
+ gcc $(FLAGS) -c parentheses.c
+bitmap.o: bitmap.c
+ gcc $(FLAGS) -c bitmap.c
+hash.o: hash.c
+ gcc $(FLAGS) -c hash.c
+basics.o: basics.c
+ gcc $(FLAGS) -c basics.c
+
+basics.h: makefile
+ touch basics.h
+bitmap.h: basics.h
+ touch bitmap.h
+hash.h: basics.h
+ touch hash.h
+lztrie.h: basics.h parentheses.h
+ touch lztrie.h
+position.h: lztrie.h nodemap.h
+ touch position.h
+nodemap.h: basics.h lztrie.h
+ touch nodemap.h
+parentheses.h: basics.h bitmap.h hash.h
+ touch parentheses.h
+trie.h: basics.h heap.h
+ touch trie.h
+heap.h: basics.h
+ touch heap.h
+basics.c: basics.h
+ touch basics.c
+hash.c: hash.h
+ touch hash.c
+bitmap.c: bitmap.h
+ touch bitmap.c
+lztrie.c: lztrie.h
+ touch lztrie.c
+position.c: position.h
+ touch position.c
+nodemap.c: nodemap.h
+ touch nodemap.c
+parentheses.c: parentheses.h
+ touch parentheses.c
+trie.c: trie.h
+ touch trie.c
+heap.c: heap.h
+ touch heap.c
+
+clean:
+ rm -f *.a *.o
--- /dev/null
+
+// Implements the Map data structure, which maps block ids to lztrie nodes
+
+#include "nodemap.h"
+
+ // creates a nodemap structure from a mapping array, not owning it
+ // n is number of blocks
+ // max is the number of trie nodes
+
+nodemap createNodemap(uint *map, uint n, uint max)
+ {
+ nodemap M;
+ uint i;
+ M = malloc(sizeof(struct snodemap));
+ M->nbits = bits(2*max-1);
+ M->n = n;
+ M->map = malloc(((n*M->nbits+W-1)/W)*sizeof(uint));
+ for (i=0;i<n;i++)
+ bitput(M->map,i*M->nbits,M->nbits,map[i]);
+ return M;
+ }
+
+ // frees revtrie structure, including the owned data
+
+void destroyNodemap(nodemap M)
+ {
+ free(M->map);
+ free(M);
+ }
+
+ // mapping
+
+trieNode mapto(nodemap M, uint id)
+ {
+ return bitget(M->map,id*M->nbits,M->nbits);
+ }
+
+void saveNodemap(FILE *f, nodemap M)
+ {
+ fwrite(&M->nbits, sizeof(uint), 1, f);
+ fwrite(&M->n, sizeof(uint), 1, f);
+ fwrite(M->map, sizeof(uint), (M->n*M->nbits+W-1)/W, f);
+ }
+
+nodemap loadNodemap(FILE *f)
+ {
+ nodemap M;
+
+ M = malloc(sizeof(struct snodemap));
+ fread(&M->nbits, sizeof(uint), 1, f);
+ fread(&M->n, sizeof(uint), 1, f);
+ M->map = malloc(((M->n*M->nbits+W-1)/W)*sizeof(uint));
+ fread(M->map, sizeof(uint), (M->n*M->nbits+W-1)/W, f);
+ return M;
+ }
+
+uint sizeofNodemap(nodemap M)
+ {
+ return sizeof(struct snodemap) + ((M->n*M->nbits+W-1)/W)*sizeof(uint);
+ }
+
--- /dev/null
+
+
+// Implements the Node and RNode data structures,
+// which map block ids to LZTrie nodes and
+// block ids to RevTrie nodes, respectively
+
+#ifndef NODEMAPINCLUDED
+#define NODEMAPINCLUDED
+
+#include "basics.h"
+//#include "lztrie.h"
+
+typedef struct snodemap
+ {
+ uint nbits; // bits per cell
+ uint *map; // mapping
+ uint n; // number of elements in the mapping
+ } *nodemap;
+
+
+ // creates a nodemap structure from a mapping array, not owning it
+ // n is number of blocks
+ // max is the number of trie nodes
+nodemap createNodemap(uint *map, uint n, uint max);
+ // frees revtrie structure, including the owned data
+void destroyNodemap(nodemap M);
+ // mapping
+trieNode mapto(nodemap M, uint id);
+
+void saveNodemap(FILE *f, nodemap M);
+
+nodemap loadNodemap(FILE *f);
+
+uint sizeofNodemap(nodemap M);
+#endif
--- /dev/null
+
+
+// Implements operations over a sequence of balanced parentheses
+
+#include "parentheses.h"
+
+ // I have decided not to implement Munro et al.'s scheme, as it is too
+ // complicated and the overhead is not so small in practice. I have opted
+ // for a simpler scheme. Each open (closing) parenthesis will be able to
+ // find its matching closing (open) parenthesis. If the distance is shorter
+ // than b, we will do it by hand, traversing the string. Otherwise, the
+ // answer will be stored in a hash table. In fact, only subtrees larger than
+ // s will have the full distance stored, while those between b and s will
+ // be in another table with just log s bits. The traversal by hand proceeds
+ // in fact by chunks of k bits, whose answers are precomputed.
+ // Space: there cannot be more than n/s subtrees larger than s, idem b.
+ // So we have (n/s)log n bits for far pointers and (n/b)log s for near
+ // pointers. The space for the table is 2^k*k*log b. The optimum s=b log n,
+ // in which case the space is n/b(1 + log b + log log n) + 2^k*k*log b.
+ // Time: the time is O(b/k), and we want to keep it O(log log n), so
+ // k = b/log log n.
+ // (previous arguments hold if there are no unary nodes, but we hope that
+ // there are not too many -- in revtrie we compress unary paths except when
+ // they have id)
+ // Settings: using b = log n, we have
+ // space = n log log n / log n + 2^(log n / log log n) log n
+ // time = log log n
+ // In practice: we use k = 8 (bytes table), b = W (so work = 4 or 8)
+ // and space ~= n/3 + 10 Kbytes (fixed table).
+ // Notice that we need a hash table that stores only the deltas and does not
+ // store the keys! (they would take log n instead of log s!). Collisions are
+ // resolved as follows: see all the deltas that could be and pick the smallest
+ // one whose excess is the same of the argument. To make this low we use a
+ // load factor of 2.0, so it is 2n/3 after all.
+ // We need the same for the reverses, for the forward is only for ('s and
+ // reverses for )'s, so the proportion stays the same.
+ // We also need the stream to be a bitmap to know how many open parentheses
+ // we have to the left. The operations are as follows:
+ // findclose: use the mechanism described above
+ // findparent: similar, in reverse, looking for the current excess - 1
+ // this needs us to store the (near/far) parent of each node, which may
+ // cost more than the next sibling.
+ // excess: using the number of open parentheses
+ // enclose: almost findparent
+
+ // creates a parentheses structure from a bitstring, which is shared
+ // n is the total number of parentheses, opening + closing
+
+static uint calcsizes (parentheses P, uint posparent, uint posopen,
+ uint *near, uint *far, uint *pnear, uint *pfar)
+
+ { uint posclose,newpos;
+ if ((posopen == P->n) || bitget1(P->data,posopen))
+ return posopen; // no more trees
+ newpos = posopen;
+ do { posclose = newpos+1;
+ newpos = calcsizes (P,posopen,posclose,near,far,pnear,pfar);
+ }
+ while (newpos != posclose);
+ if ((posclose < P->n) && (posclose-posopen > W)) // exists and not small
+ if (posclose-posopen < (1<<P->sbits)) (*near)++; // near pointer
+ else (*far)++;
+ if ((posopen > 0) && (posopen-posparent > W)) // exists and not small
+ if (posopen-posparent < (1<<P->sbits)) (*pnear)++; // near pointer
+ else (*pfar)++;
+ return posclose;
+ }
+
+
+static uint filltables (parentheses P, uint posparent, uint posopen, bool bwd)
+
+ { uint posclose,newpos;
+ if ((posopen == P->n) || bitget1(P->data,posopen))
+ return posopen; // no more trees
+ newpos = posopen;
+ do { posclose = newpos+1;
+ newpos = filltables (P,posopen,posclose,bwd);
+ }
+ while (newpos != posclose);
+ if ((posclose < P->n) && (posclose-posopen > W)) // exists and not small
+ { if (posclose-posopen < (1<<P->sbits)) // near pointers
+ insertHash (P->bftable,posopen,posclose-posopen);
+ else // far pointers
+ insertHash (P->sftable,posopen,posclose-posopen);
+ }
+ if (bwd && (posopen > 0) && (posopen-posparent > W)) //exists and not small
+ { if (posopen-posparent < (1<<P->sbits)) // near pointer
+ insertHash (P->bbtable,posopen,posopen-posparent);
+ else // far pointers
+ insertHash (P->sbtable,posopen,posopen-posparent);
+ }
+ return posclose;
+ }
+
+
+
+static byte FwdPos[256][W/2];
+static byte BwdPos[256][W/2];
+static char Excess[256];
+static bool tablesComputed = false;
+
+static void fcompchar (byte x, byte *pos, char *excess)
+
+ { int exc = 0;
+ uint i;
+ for (i=0;i<W/2;i++) pos[i] = 0;
+ for (i=0;i<8;i++)
+ { if (x & 1) // closing
+ { exc--;
+ if ((exc < 0) && !pos[-exc-1]) pos[-exc-1] = i+1;
+ }
+ else exc++;
+ x >>= 1;
+ }
+ *excess = exc;
+ }
+
+static void bcompchar (byte x, byte *pos)
+
+ { int exc = 0;
+ uint i;
+ for (i=0;i<W/2;i++) pos[i] = 0;
+ for (i=0;i<8;i++)
+ { if (x & 128) // opening, will be used on complemented masks
+ { exc++;
+ if ((exc > 0) && !pos[exc-1]) pos[exc-1] = i+1;
+ }
+ else exc--;
+ x <<= 1;
+ }
+ }
+
+parentheses createParentheses (uint *string, uint n, bool bwd)
+
+ { parentheses P;
+ uint i,s,nb,ns,nbits,near,far,pnear,pfar;
+
+ P = malloc (sizeof(struct sparentheses));
+ P->data = string;
+ P->n = n;
+ P->bdata = createBitmap (string,n,false);
+ nbits = bits(n-1);
+ s = nbits*W;
+ P->sbits = bits(s-1);
+ s = 1 << P->sbits; // to take the most advantage of what we can represent
+ ns = (n+s-1)/s; nb = (s+W-1)/W; // adjustments
+ near = far = pnear = pfar = 0;
+ calcsizes (P,~0,0,&near,&far,&pnear,&pfar);
+#ifdef INDEXREPORT
+ printf (" Parentheses: total %i, near %i, far %i, pnear %i, pfar %i\n",n,near,far,pnear,pfar);
+#endif
+ P->sftable = createHash(far,nbits,1.8);
+ P->bftable = createHash(near,P->sbits,1.8);
+
+ P->sbtable = createHash (pfar,nbits,2.5);
+ P->bbtable = createHash (pnear,P->sbits,2.5);
+ filltables (P,~0,0,true);
+ if (!tablesComputed)
+ { tablesComputed = true;
+ for (i=0;i<256;i++)
+ { fcompchar (i,FwdPos[i],Excess+i);
+ bcompchar (i,BwdPos[i]);
+ }
+ }
+ return P;
+ }
+
+ // frees parentheses structure, including the bitstream
+
+void destroyParentheses (parentheses P)
+
+ { destroyBitmap(P->bdata);
+ destroyHash (P->sftable);
+ if (P->sbtable) destroyHash (P->sbtable);
+ destroyHash (P->bftable);
+ if (P->bbtable) destroyHash (P->bbtable);
+ free (P);
+ }
+
+ // the position of the closing parenthesis corresponding to (opening)
+ // parenthesis at position i
+
+uint findclose (parentheses P, uint i)
+
+ { uint bitW;
+ uint len,res,minres,exc;
+ byte W1;
+ handle h;
+ uint myexcess;
+ // first see if it is at small distance
+ len = W; if (i+len >= P->n) len = P->n-i-1;
+ bitW = bitget (P->data,i+1,len);
+ exc = 0; len = 0;
+ while (bitW && (exc < W/2))
+ // either we shift it all or it only opens parentheses or too
+ // many open parentheses
+ { W1 = bitW & 255;
+ if (res = FwdPos[W1][exc]) return i+len+res;
+ bitW >>= 8; exc += Excess[W1];
+ len += 8;
+ }
+ // ok, it's not a small distance, try with hashing btable
+ minres = 0;
+ myexcess = excess (P,i);
+ res = searchHash (P->bftable,i,&h);
+ while (res)
+ { if (!minres || (res < minres))
+ if ((i+res+1 < P->n) && (excess(P,i+res+1) == myexcess))
+ minres = res;
+ res = nextHash (P->bftable,&h);
+ }
+ if (minres) return i+minres;
+ // finally, it has to be a far pointer
+ res = searchHash (P->sftable,i,&h);
+ while (res)
+ { if (!minres || (res < minres))
+ if ((i+res+1 < P->n) && (excess(P,i+res+1) == myexcess))
+ minres = res;
+ res = nextHash (P->sftable,&h);
+ }
+ return i+minres; // there should be one if the sequence is balanced!
+ }
+
+ // find enclosing parenthesis for an open parenthesis
+ // assumes that the parenthesis has an enclosing pair
+
+uint findparent (parentheses P, uint i)
+
+ { uint bitW;
+ uint len,res,minres,exc;
+ byte W1;
+ handle h;
+ uint myexcess;
+ // first see if it is at small distance
+ len = W; if (i < len) len = i-1;
+ bitW = ~bitget (P->data,i-len,len) << (W-len);
+ exc = 0; len = 0;
+ while (bitW && (exc < W/2))
+ // either we shift it all or it only closes parentheses or too
+ // many closed parentheses
+ { W1 = (bitW >> (W-8));
+ if (res = BwdPos[W1][exc]) return i-len-res;
+ bitW <<= 8; exc += Excess[W1]; // note W1 is complemented!
+ len += 8;
+ }
+ // ok, it's not a small distance, try with hashing btable
+ minres = 0;
+ myexcess = excess (P,i) - 1;
+ res = searchHash (P->bbtable,i,&h);
+ while (res)
+ { if (!minres || (res < minres))
+ if ((i-res >= 0) && (excess(P,i-res) == myexcess))
+ minres = res;
+ res = nextHash (P->bbtable,&h);
+ }
+ if (minres) return i-minres;
+ // finally, it has to be a far pointer
+ res = searchHash (P->sbtable,i,&h);
+ while (res)
+ { if (!minres || (res < minres))
+ if ((i-res >= 0) && (excess(P,i-res) == myexcess))
+ minres = res;
+ res = nextHash (P->sbtable,&h);
+ }
+ return i-minres; // there should be one if the sequence is balanced!
+ }
+
+ // # open - # close at position i, not included
+
+uint excess (parentheses P, uint i)
+
+ { return i - 2*rank(P->bdata,i);
+ }
+
+ // open position of closest parentheses pair that contains the pair
+ // that opens at i, ~0 if no parent
+
+uint enclose (parentheses P, uint i)
+
+ { if (i == 0) return ~0; // no parent!
+ return findparent (P,i);
+ }
+
+
+
+uint sizeofParentheses(parentheses P)
+ {
+ return sizeof(struct sparentheses) +
+ sizeofBitmap(P->bdata) +
+ sizeofHash(P->sftable) +
+ sizeofHash(P->sbtable) +
+ sizeofHash(P->bftable) +
+ sizeofHash(P->bbtable);
+ }
--- /dev/null
+
+
+// Implements operations over a sequence of balanced parentheses
+
+#ifndef PARENTHESESINCLUDED
+#define PARENTHESESINCLUDED
+
+#include "basics.h"
+#include "bitmap.h"
+#include "hash.h"
+
+typedef struct sparentheses
+ { uint *data; // string
+ bitmap bdata; // bitmap of string
+ uint n; // # of parentheses
+ uint sbits; // bits for near pointers
+ hash sftable; // table of far forward pointers
+ hash sbtable; // table of far backward pointers
+ hash bftable; // table of near forward pointers
+ hash bbtable; // table of near backward pointers
+ } *parentheses;
+
+ // creates a parentheses structure from a bitstring, which gets owned
+ // n is the total number of parentheses, opening + closing
+ // bwd says if you will want to perform findopen and enclose
+parentheses createParentheses (uint *string, uint n, bool bwd);
+ // frees parentheses structure, including the bitstream
+void destroyParentheses (parentheses P);
+ // the position of the closing parenthesis corresponding to (opening)
+ // parenthesis at position i
+uint findclose (parentheses P, uint i);
+ // respectively, for closing parenthesis i
+uint findopen (parentheses P, uint i);
+ // # open - # close at position i, not included
+uint excess (parentheses P, uint i);
+ // open position of closest parentheses pair that contains the pair
+ // that opens at i, ~0 if no parent
+uint enclose (parentheses P, uint i);
+
+uint sizeofParentheses(parentheses P);
+#endif
--- /dev/null
+
+// This code implements the data structure to get
+// "real" text positions at search time.
+
+#include "position.h"
+#include "lztrie.h"
+#include <math.h>
+
+
+position createPosition(void *Taux, uint text_length)
+ {
+ position P;
+ lztrie T = (lztrie) Taux;
+ uint *Offset;
+ ulong n, M, superblock_size, current_superblock,
+ starting_pos, i, depth, pos;
+ trieNode node;
+
+ P = malloc(sizeof(struct tpos));
+ n = T->n;
+ P->SBlock_size = 32;//bits(n-1); // superblock size is log n
+ P->nbitsSB = bits(text_length-1);
+ P->nSuperBlock = (ulong)ceil((double)n/P->SBlock_size); // number of superblocks
+ P->SuperBlock = malloc(((P->nbitsSB*P->nSuperBlock+W-1)/W)*sizeof(uint));
+ P->Tlength = text_length;
+ M = 0; // maximum superblock size (in number of characters)
+
+ current_superblock = 0;
+ superblock_size = 0;
+ starting_pos = 0;
+ P->nOffset = n;
+ Offset = malloc(n*sizeof(uint)); // array for temporary use
+
+ bitput(P->SuperBlock, 0, P->nbitsSB, 0);
+
+ for (i = 0, pos = 0; i < n; i++) {
+ node = mapto(T->Node, i);
+ depth = depthLZTrie(T, node);
+ pos += depth;
+
+ superblock_size++;
+
+ if ((superblock_size > P->SBlock_size) || (i==0)) {
+ if (i)
+ bitput(P->SuperBlock,P->nbitsSB*(++current_superblock),P->nbitsSB,pos);
+ if (i!=0 && Offset[i-1]>M) M = Offset[i-1];
+ superblock_size = 1;
+ starting_pos = pos;
+ }
+
+ Offset[i] = pos-starting_pos;
+ }
+
+ if (Offset[i-1]>M) M = Offset[i-1];
+
+ P->nbitsOffs = bits(M-1);
+ P->Offset = malloc(((P->nOffset*P->nbitsOffs+W-1)/W)*sizeof(uint));
+ for (i = 0; i < n; i++)
+ bitput(P->Offset, i*P->nbitsOffs, P->nbitsOffs, Offset[i]);
+
+ free(Offset);
+
+ return P;
+ }
+
+
+// given a phrase id, gets the corresponding text position
+
+ulong getPosition(position P, uint id)
+ {
+ if (id > P->nOffset) return P->Tlength;
+ else {
+ ulong posSB = (id-1)>>5;///P->SBlock_size;//(ulong)ceil((double)id/P->SBlock_size)-1;
+ return bitget(P->SuperBlock,posSB*P->nbitsSB,P->nbitsSB)
+ + bitget(P->Offset,(id-1)*P->nbitsOffs,P->nbitsOffs);
+ }
+ }
+
+
+// given a text position, gets the identifier of the
+// LZ78 phrase containing that position
+
+ulong getphrasePosition(position P, ulong text_pos)
+ {
+ ulong li, ls, nbits, elem, med, SB, temp, i;
+
+ li = 0;
+ ls = P->nSuperBlock-1;
+ nbits = P->nbitsSB;
+ while ((ls-li+1) > 0) {
+ med = (li+ls)/2;
+ elem = bitget(P->SuperBlock,med*nbits,nbits);
+ if (elem == text_pos) break;
+ if (elem < text_pos) li = med+1;
+ else ls = med - 1;
+ }
+ if (elem > text_pos && med) med--;
+ SB = bitget(P->SuperBlock,med*nbits,nbits);
+
+ temp = li;
+ li = med*P->SBlock_size;
+
+ if (med+1 == P->nSuperBlock)
+ ls = P->nOffset-1;
+ else
+ ls = (med+1)*P->SBlock_size - 1;
+
+ nbits = P->nbitsOffs;
+
+ while ((ls-li+1) > 8) {
+ med = (li+ls)/2;
+ elem = bitget(P->Offset,med*nbits,nbits)+SB;
+ if (elem == text_pos) break;
+ if (elem < text_pos) li = med+1;
+ else ls = med-1;
+ }
+
+ for (i = li; i <= ls; i++)
+ if ((elem=bitget(P->Offset,i*nbits,nbits) + SB) >= text_pos) break;
+
+ if (elem > text_pos) i--;
+
+ return i+(elem>text_pos);
+ }
+
+
+ulong sizeofPosition(position P)
+ {
+ return sizeof(struct tpos)
+ + ((P->nbitsSB*P->nSuperBlock+W-1)/W)*sizeof(uint)
+ + ((P->nOffset*P->nbitsOffs+W-1)/W)*sizeof(uint);
+ }
+
+void destroyPosition(position P)
+ {
+ if (!P) return;
+ free(P->SuperBlock);
+ free(P->Offset);
+ }
+
+
+position loadPosition(FILE *f, uint text_length)
+ {
+ position P;
+ uint aux;
+
+ P = malloc (sizeof(struct tpos));
+
+ if (fread(&P->nSuperBlock,sizeof(uint),1,f) != 1) {
+ fprintf(stderr,"Error: Cannot read LZTrie from file\n");
+ exit(1);
+ }
+
+ P->nbitsSB = bits(text_length-1);
+ aux = (((unsigned long long) P->nbitsSB*P->nSuperBlock+W-1)/W);
+ P->SuperBlock = malloc(aux*sizeof(uint));
+
+ if (fread(P->SuperBlock,sizeof(uint),aux,f) != aux) {
+ fprintf(stderr,"Error: Cannot read LZTrie from file\n");
+ exit(1);
+ }
+
+ P->SBlock_size = 32;
+
+ P->Tlength = text_length;
+
+ if (fread(&P->nOffset,sizeof(uint),1,f) != 1) {
+ fprintf(stderr,"Error: Cannot read LZTrie from file\n");
+ exit(1);
+ }
+
+ if (fread(&P->nbitsOffs,sizeof(uint),1,f) != 1) {
+ fprintf(stderr,"Error: Cannot read LZTrie from file\n");
+ exit(1);
+ }
+
+ aux = (((unsigned long long)P->nOffset*P->nbitsOffs+W-1)/W);
+ P->Offset = malloc(aux*sizeof(uint));
+ if (fread(P->Offset,sizeof(uint),aux,f) != aux) {
+ fprintf(stderr,"Error: Cannot read LZTrie from file\n");
+ exit(1);
+ }
+
+ return P;
+ }
+
+
+void savePosition(FILE *f, position P)
+ {
+ uint aux;
+
+ if (fwrite(&P->nSuperBlock,sizeof(uint),1,f) != 1) {
+ fprintf(stderr,"Error: Cannot write LZTrie on file\n");
+ exit(1);
+ }
+
+ aux = (((unsigned long long) P->nbitsSB*P->nSuperBlock+W-1)/W);
+
+ if (fwrite(P->SuperBlock,sizeof(uint),aux,f) != aux) {
+ fprintf(stderr,"Error: Cannot write LZTrie on file\n");
+ exit(1);
+ }
+
+ if (fwrite(&P->nOffset,sizeof(uint),1,f) != 1) {
+ fprintf(stderr,"Error: Cannot write LZTrie on file\n");
+ exit(1);
+ }
+
+ if (fwrite(&P->nbitsOffs,sizeof(uint),1,f) != 1) {
+ fprintf(stderr,"Error: Cannot write LZTrie on file\n");
+ exit(1);
+ }
+
+ aux = (((unsigned long long)P->nOffset*P->nbitsOffs+W-1)/W);
+
+ if (fwrite(P->Offset,sizeof(uint),aux,f) != aux) {
+ fprintf(stderr,"Error: Cannot write LZTrie on file\n");
+ exit(1);
+ }
+ }
+
+
+
--- /dev/null
+
+#ifndef POSITIONINCLUDED
+#define POSITIONINCLUDED
+
+//#include "lztrie.h"
+#include "nodemap.h"
+
+
+typedef struct tpos
+ {
+ uint *SuperBlock; // array with starting positions of superblocks
+ uint nSuperBlock; // number of superblocks
+ uint nbitsSB; // number of bits used for starting positions
+ uint SBlock_size; // superblock size
+ uint *Offset; // array of offsets of each phrase within each superblock
+ uint nOffset; // size of the Offset array (number of LZ78 phrases)
+ uint nbitsOffs; // number of bits used per offset
+ ulong Tlength; // text length
+ } *position;
+
+
+position createPosition(void *T, uint text_length);
+
+ulong getPosition(position P, uint id);
+
+ulong sizeofPosition(position P);
+
+void destroyPosition(position P);
+
+void savePosition(FILE *f, position P);
+
+position loadPosition(FILE *f, uint text_length);
+
+#endif
--- /dev/null
+#include "stdio.h"
+#include "string.h"
+#include "lztrie.h"
+
+int main()
+{
+ byte *text = (byte*)"ababacaaaaaaaaaaaaaaaaaaaffffffffffffffdddddddddssssssssssfffffffffffzzzbbbbbbbbbbbbbbbbbbbbbbbbbabababaccacacababababababababbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbaaaaaaaaaacccdsfffffffffffffffffffffffaslköfjaösldjföasjdgajsdföljasdkfljaölsdjkföskjfölakjdföklajdökfjlaösldjföaögoiubvoixcupvozucvpouizxcpovui,qwner.,qwenrqbwe,rbqwmerbmwerbm";
+ ulong n = strlen((char *)text);
+ ulong i;
+ byte *newt = 0;
+ ulong l = 0;
+
+ lztrie lz = buildLZTrie(text, 0, n);
+
+ printf("extracting:\n");
+ extract(lz, 0, n, &newt, &l);
+ for (i = 0; i < n; ++i)
+ if (newt[i] != text[i]) {
+ printf("texts differ at %lu\n", i);
+ exit(0);
+ }
+
+ return 0;
+}
--- /dev/null
+
+ // LZ78 trie data structure
+
+#include "trie.h"
+
+ // creates trie
+
+trie createTrie (void)
+
+ { trie pTrie;
+ uint i;
+ pTrie = malloc (sizeof(struct strie));
+ pTrie->nid = 0;
+ pTrie->trie.id = pTrie->nid++;
+ pTrie->trie.nchildren = 0;
+ pTrie->trie.children = NULL;
+ pTrie->heaps[0] = createHeap(sizeof(triebody));
+ for (i=1;i<256;i++)
+ pTrie->heaps[i] = createHeap(i*sizeof(struct schild));
+ return pTrie;
+ }
+
+uint sizeofTrie(triebody *t)
+ {
+ uint i, size = 0;
+ size = sizeof(uint)+sizeof(short)+sizeof(struct schild *);
+ for (i=0;i<t->nchildren;i++)
+ size += sizeof(byte)+sizeof(struct striebody *)+sizeofTrie(t->children[i].trie);
+ return size;
+ }
+
+ // frees the trie
+
+void destroyTrie (trie pTrie)
+
+ { uint i;
+ for (i=0;i<256;i++) destroyHeap (pTrie->heaps[i]);
+ free (pTrie);
+ }
+
+ // inserts word[0...] into pTrie and returns new text ptr
+ // insertion proceeds until we get a new trie node
+
+byte *insertTrie (trie pTrie, byte *word)
+
+ { triebody *t = &pTrie->trie;
+ triebody *nt;
+ struct schild *newc;
+ int i,j;
+ int m = 0;
+ // traverse pTrie with word[0...]
+ while (true)
+ { i = 0;
+ while (i < t->nchildren)
+ { if (t->children[i].car >= word[m]) break;
+ i++;
+ }
+ if ((i == t->nchildren) || (t->children[i].car > word[m]))
+ break; // not found, get out
+ t = t->children[i].trie;
+ m++;
+ }
+ // at this point we fell off the trie, which is guaranteed to occur
+ // since the text finishes with the unique character 0
+ newc = mallocHeap(pTrie->heaps[t->nchildren+1]);
+ memcpy (newc,t->children,i*sizeof(struct schild));
+ memcpy (newc+i+1,t->children+i,(t->nchildren-i)*sizeof(struct schild));
+ freeHeap (pTrie->heaps[t->nchildren],t->children);
+ t->children = newc;
+ t->children[i].car = word[m];
+ nt = mallocHeap (pTrie->heaps[0]);
+ t->children[i].trie = nt;
+ t->nchildren++;
+ // new node created
+ nt->id = pTrie->nid++;
+ nt->nchildren = 0;
+ nt->children = NULL;
+ // return rest of text
+ return word+m+1;
+ }
+
+ // inserts word[0..len-1] into pTrie, with id = id
+ // assumes that no two equal strings are ever inserted
+
+void insertstringTrie (trie pTrie, byte *word, uint len, uint id)
+
+ { triebody *t,*nt;
+ uint i,j,m;
+ struct schild *newc;
+ // traverse pTrie with word[0...]
+ t = &pTrie->trie;
+ m = 0;
+ while (m < len)
+ { i = 0;
+ while (i < t->nchildren)
+ { if (t->children[i].car >= word[m]) break;
+ i++;
+ }
+ if ((i == t->nchildren) || (t->children[i].car > word[m]))
+ break; // not found, get out
+ t = t->children[i].trie;
+ m++;
+ }
+ // if we fell off the trie, we create more (unary and empty) nodes
+ while (m < len)
+ { newc = mallocHeap(pTrie->heaps[t->nchildren+1]);
+ memcpy (newc,t->children,i*sizeof(struct schild));
+ memcpy (newc+i+1,t->children+i,(t->nchildren-i)*sizeof(struct schild));
+ freeHeap (pTrie->heaps[t->nchildren],t->children);
+ t->children = newc;
+ if ((t->id == ~0) && (t->nchildren == 1)) pTrie->nid++; //not mute now
+ t->children[i].car = word[m];
+ nt = mallocHeap (pTrie->heaps[0]);
+ nt->id = ~0; // empty node, at least for now
+ nt->nchildren = 0;
+ nt->children = NULL;
+ t->children[i].trie = nt;
+ t->nchildren++;
+ t = nt;
+ m++; i = 0;
+ }
+ // new node created or existing node with id added
+ t->id = id;
+ if (t->nchildren <= 1) pTrie->nid++; //not mute now
+ }
+
+ // represents pTrie with parentheses, letters and ids
+
+ // also returns the leftmost id
+static uint traverse(triebody *t, uint *parent, byte *letters, uint *ids,
+ uint *pi, uint *pli)
+ {
+ uint i,chid,oldpli,myid;
+ myid = t->id;
+ // open parenthesis
+ bitclean(parent,*pi);
+ ids[myid] = *pi;
+ (*pi)++;
+ oldpli = *pli;
+ // traverse children
+ for (i=0; i<t->nchildren; i++) {
+ (*pli)++;
+ letters[*pli] = t->children[i].car;
+ chid = traverse(t->children[i].trie,parent,letters,ids,pi,pli);
+ }
+ // close parenthesis
+ bitset(parent,*pi);
+ (*pi)++;
+ // return leftmost id
+ return myid;
+ }
+
+void representTrie (trie pTrie, uint *parent, byte *letters, uint *ids)
+ {
+ uint pi,pli;
+ letters[0] = 0; // dummy value
+ pi = 0; pli = 0;
+ traverse(&pTrie->trie,parent,letters,ids,&pi,&pli);
+ }
+
--- /dev/null
+
+ // LZ78 trie data structure
+
+#ifndef TRIEINCLUDED
+#define TRIEINCLUDED
+
+#include "basics.h"
+#include "heap.h"
+
+typedef struct striebody
+ { uint id; // node id
+ short nchildren;
+ struct schild
+ { byte car;
+ struct striebody *trie;
+ } *children;
+ } triebody;
+
+typedef struct strie
+ { triebody trie; // trie
+ heap heaps[256]; // heaps
+ uint nid; // nr of nodes
+ } *trie;
+
+ // creates trie
+trie createTrie (void);
+ // inserts word[0...] into pTrie and returns new text ptr
+ // insertion proceeds until we get a new trie node
+uint sizeofTrie(triebody *t);
+
+byte *insertTrie (trie pTrie, byte *word/*, ulong *pos*/);
+ // inserts word[0..len-1] into pTrie, with id = id
+ // assumes that no two equal strings are ever inserted
+void insertstringTrie (trie pTrie, byte *word, uint len, uint id);
+ // frees the trie
+void destroyTrie (trie pTrie);
+ // represents pTrie with parentheses, letters and ids
+void representTrie (trie pTrie, uint *parent, byte *letters, uint *ids);
+
+#endif