Update swcs with larson-sadakane suffix array construction method.

author Kim Nguyễn <kn@lri.fr>

Thu, 1 Mar 2012 13:18:13 +0000 (14:18 +0100)

committer Kim Nguyễn <kn@lri.fr>

Thu, 1 Mar 2012 13:18:13 +0000 (14:18 +0100)
author Kim Nguyễn <kn@lri.fr>
Thu, 1 Mar 2012 13:18:13 +0000 (14:18 +0100)
committer Kim Nguyễn <kn@lri.fr>
Thu, 1 Mar 2012 13:18:13 +0000 (14:18 +0100)
diff --git a/Makefile b/Makefile

index bb8d60d..156142d 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
  HIDE=@
  CC = g++
  LIBCDSPATH = ../libcds
-CPPFLAGS = -Wall -ansi -g -I$(LIBCDSPATH)/includes/ -O3 -DNDEBUG -Wno-deprecated-declarations
+CPPFLAGS = -Wall -ansi -g -I$(LIBCDSPATH)/includes/ -O3 -DNDEBUG -Wno-deprecated-declarations -flto
  #CPPFLAGS = -Wall -ansi -g -I$(LIBCDSPATH)/includes/
  LIBCDSA = $(LIBCDSPATH)/lib/libcds.a
  LIBRLCSA = incbwt/rlcsa.a
diff --git a/swcsa/intIndex/Makefile b/swcsa/intIndex/Makefile

index 8f02954..0585c6e 100755 (executable)
--- a/swcsa/intIndex/Makefile
+++ b/swcsa/intIndex/Makefile
@@ -1,12 +1,13 @@
  SRCDIR = .
  SRCDIRCSA = .
+SRCDIRCSAUTILS = ../intIndexUtils
  SRCDIRUTILS = ../utils
  CC          = g++
  
  ifndef CFLAGS  ##possibly already defined by the "main Makefile".
         ##CFLAGS = -O9 -m32
-       CFLAGS      = -O9 -m32
-       ##CFLAGS      = -g -m32 -O0
+       ##CFLAGS      = -O9 -m32 
+       CFLAGS      = -g -m32 -O0
  endif
  
  LIBINTINDEX    = icsa.a
@@ -15,41 +16,41 @@ all: intIndex cleanO
  
  
  intIndex: icsa.o parameters.o basics.o huff.o bitmap.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o
-       ar rc $(LIBINTINDEX) icsa.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o
+       ar rc $(LIBINTINDEX) icsa.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o  
                         #not including "parameters.o basics.o bitmap.o huff.o" as they are included by wcsa
                         #they are already included into the library by the presentation layer.
  
-icsa.o: parameters.o basics.o bitmap.o huff.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o
-        $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSA)/icsa.c
+icsa.o: parameters.o basics.o bitmap.o huff.o psiHuffmanRLE.o psiDeltaCode.o psiGonzalo.o 
+        $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSA)/icsa.c 
  
  psiHuffmanRLE.o: huff.o
-       $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSA)/psiHuffmanRLE.c
+       $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSAUTILS)/psiHuffmanRLE.c 
  
  psiDeltaCode.o:
-       $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSA)/psiDeltaCode.c
+       $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSAUTILS)/psiDeltaCode.c 
  
  psiGonzalo.o: huff.o
-       $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSA)/psiGonzalo.c
+       $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRCSAUTILS)/psiGonzalo.c 
  
-parameters.o:
-       $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/parameters.c
-
-
-huff.o:
+parameters.o: 
+       $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/parameters.c 
+       
+       
+huff.o: 
         $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/huff.c
  
-basics.o:
+basics.o: 
         $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/basics.c
  
-bitmap.o:
+bitmap.o: 
         $(CC) $(CFLAGS) -c $(SRCDIR)/$(SRCDIRUTILS)/bitmap.c
  
  
-cleanO:
+cleanO: 
         rm -f *.o
-
+       
  clean:
-       rm -rf *~ *% *.o core *.bak icsa.a
+       rm -rf *~ *% *.o core *.bak icsa.a 
  
  tar:
         tar czvf icsa.tar.gz  Makefile
diff --git a/swcsa/intIndex/icsa.c b/swcsa/intIndex/icsa.c

old mode 100644 (file)

new mode 100755 (executable)

index eec4e34..c126f9a
--- a/swcsa/intIndex/icsa.c
+++ b/swcsa/intIndex/icsa.c
@@ -1,50 +1,338 @@
+/* icsa.c
+ * Copyright (C) 2011, Antonio Fariña and Eduardo Rodriguez, all rights reserved.
+ *
+ * icsa.c: Implementation of the interface "../intIndex/interfaceIntIndex.h"
+ *   that permits to represent a sequence of uint32 integers with an iCSA: 
+ *   An integer-oriented Compressed Suffix Array.
+ *   Such representation will be handled as a "ticsa" data structure, that
+ *   the WCSA self-index will use (as an opaque type) to 
+ *   create/save/load/search/recover/getSize of the representation of the 
+ *   original sequence.
+ *   Suffix sorting is done via Larson/Sadakane suffix sort algorithm 
+ *   from the char-based csa.
+ * 
+ *   Ref: Larsson, N. J. and Sadakane, K. 2007. Faster suffix sorting. Theoretical 
+ *        Computer Science 387, 3, 258–272.
+ *    
+ *    
+ * See more details in:
+ * Antonio Fariña, Nieves Brisaboa, Gonzalo Navarro, Francisco Claude, Ángeles Places, 
+ * and Eduardo Rodríguez. Word-based Self-Indexes for Natural Language Text. ACM 
+ * Transactions on Information Systems (TOIS), 2012. 
+ * http://vios.dc.fi.udc.es/indexing/wsi/publications.html
+ * http://www.dcc.uchile.cl/~gnavarro/ps/tois11.pdf       
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include <limits.h>
  #include "icsa.h"
  
-// Global para que funcione a funcion de comparacion do quicksort
-uint *intVector;
-
-// Para o quicksort
-int suffixCmp(const void *arg1, const void *arg2) {
-
-       register uint a,b;
-       a=*((uint *) arg1);
-       b=*((uint *) arg2);
+#define KEY(p)          (V[*(p)+(h)])
+#define SWAP(p, q)      (tmp=*(p), *(p)=*(q), *(q)=tmp)
+#define MED3(a, b, c)   (KEY(a)<KEY(b) ?                        \
+        (KEY(b)<KEY(c) ? (b) : KEY(a)<KEY(c) ? (c) : (a))       \
+        : (KEY(b)>KEY(c) ? (b) : KEY(a)>KEY(c) ? (c) : (a)))
+       
+static int *I,                  /* group array, ultimately suffix array.*/
+   *V,                          /* inverse array, ultimately inverse of I.*/
+   r,                           /* number of symbols aggregated by transform.*/
+   h;                           /* length of already-sorted prefixes.*/
+       
+/* Subroutine for select_sort_split and sort_split. Sets group numbers for a
+   group whose lowest position in I is pl and highest position is pm.*/
+
+static void update_group(int *pl, int *pm)
+{
+   int g;
+
+   g=pm-I;                      /* group number.*/
+   V[*pl]=g;                    /* update group number of first position.*/
+   if (pl==pm)
+      *pl=-1;                   /* one element, sorted group.*/
+   else
+      do                        /* more than one element, unsorted group.*/
+         V[*++pl]=g;            /* update group numbers.*/
+      while (pl<pm);
+}
  
-       while(intVector[a] == intVector[b]) { a++; b++; }
-       return (intVector[a] - intVector[b]);
+/* Quadratic sorting method to use for small subarrays. To be able to update
+   group numbers consistently, a variant of selection sorting is used.*/
+
+static void select_sort_split(int *p, int n) {
+   int *pa, *pb, *pi, *pn;
+   int f, v, tmp;
+
+   pa=p;                        /* pa is start of group being picked out.*/
+   pn=p+n-1;                    /* pn is last position of subarray.*/
+   while (pa<pn) {
+      for (pi=pb=pa+1, f=KEY(pa); pi<=pn; ++pi)
+         if ((v=KEY(pi))<f) {
+            f=v;                /* f is smallest key found.*/
+            SWAP(pi, pa);       /* place smallest element at beginning.*/
+            pb=pa+1;            /* pb is position for elements equal to f.*/
+         } else if (v==f) {     /* if equal to smallest key.*/
+            SWAP(pi, pb);       /* place next to other smallest elements.*/
+            ++pb;
+         }
+      update_group(pa, pb-1);   /* update group values for new group.*/
+      pa=pb;                    /* continue sorting rest of the subarray.*/
+   }
+   if (pa==pn) {                /* check if last part is single element.*/
+      V[*pa]=pa-I;
+      *pa=-1;                   /* sorted group.*/
+   }
+}
  
+/* Subroutine for sort_split, algorithm by Bentley & McIlroy.*/
+
+static int choose_pivot(int *p, int n) {
+   int *pl, *pm, *pn;
+   int s;
+   
+   pm=p+(n>>1);                 /* small arrays, middle element.*/
+   if (n>7) {
+      pl=p;
+      pn=p+n-1;
+      if (n>40) {               /* big arrays, pseudomedian of 9.*/
+         s=n>>3;
+         pl=MED3(pl, pl+s, pl+s+s);
+         pm=MED3(pm-s, pm, pm+s);
+         pn=MED3(pn-s-s, pn-s, pn);
+      }
+      pm=MED3(pl, pm, pn);      /* midsize arrays, median of 3.*/
+   }
+   return KEY(pm);
  }
  
+/* Sorting routine called for each unsorted group. Sorts the array of integers
+   (suffix numbers) of length n starting at p. The algorithm is a ternary-split
+   quicksort taken from Bentley & McIlroy, "Engineering a Sort Function",
+   Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This
+   function is based on Program 7.*/
+
+static void sort_split(int *p, int n)
+{
+   int *pa, *pb, *pc, *pd, *pl, *pm, *pn;
+   int f, v, s, t, tmp;
+
+   if (n<7) {                   /* multi-selection sort smallest arrays.*/
+      select_sort_split(p, n);
+      return;
+   }
+
+   v=choose_pivot(p, n);
+   pa=pb=p;
+   pc=pd=p+n-1;
+   while (1) {                  /* split-end partition.*/
+      while (pb<=pc && (f=KEY(pb))<=v) {
+         if (f==v) {
+            SWAP(pa, pb);
+            ++pa;
+         }
+         ++pb;
+      }
+      while (pc>=pb && (f=KEY(pc))>=v) {
+         if (f==v) {
+            SWAP(pc, pd);
+            --pd;
+         }
+         --pc;
+      }
+      if (pb>pc)
+         break;
+      SWAP(pb, pc);
+      ++pb;
+      --pc;
+   }
+   pn=p+n;
+   if ((s=pa-p)>(t=pb-pa))
+      s=t;
+   for (pl=p, pm=pb-s; s; --s, ++pl, ++pm)
+      SWAP(pl, pm);
+   if ((s=pd-pc)>(t=pn-pd-1))
+      s=t;
+   for (pl=pb, pm=pn-s; s; --s, ++pl, ++pm)
+      SWAP(pl, pm);
+
+   s=pb-pa;
+   t=pd-pc;
+   if (s>0)
+      sort_split(p, s);
+   update_group(p+s, p+n-t-1);
+   if (t>0)
+      sort_split(p+n-t, t);
+}
  
+/* Bucketsort for first iteration.
+
+   Input: x[0...n-1] holds integers in the range 1...k-1, all of which appear
+   at least once. x[n] is 0. (This is the corresponding output of transform.) k
+   must be at most n+1. p is array of size n+1 whose contents are disregarded.
+
+   Output: x is V and p is I after the initial sorting stage of the refined
+   suffix sorting algorithm.*/
+      
+static void bucketsort(int *x, int *p, int n, int k)
+{
+   int *pi, i, c, d, g;
+
+   for (pi=p; pi<p+k; ++pi)
+      *pi=-1;                   /* mark linked lists empty.*/
+   for (i=0; i<=n; ++i) {
+      x[i]=p[c=x[i]];           /* insert in linked list.*/
+      p[c]=i;
+   }
+   for (pi=p+k-1, i=n; pi>=p; --pi) {
+      d=x[c=*pi];               /* c is position, d is next in list.*/
+      x[c]=g=i;                 /* last position equals group number.*/
+      if (d>=0) {               /* if more than one element in group.*/
+         p[i--]=c;              /* p is permutation for the sorted x.*/
+         do {
+            d=x[c=d];           /* next in linked list.*/
+            x[c]=g;             /* group number in x.*/
+            p[i--]=c;           /* permutation in p.*/
+         } while (d>=0);
+      } else
+         p[i--]=-1;             /* one element, sorted group.*/
+   }
+}
  
-/* **NO REVISADO TAMAÑO FILE.
-int buildIntIndexFromFile (char *filename, char *build_options,void **index) {
-       //char filename[255];
-       int file;
-       struct stat f_stat;     
-       //sprintf(filename, "%s.%s", basename,SE_FILE_EXT);
-       
-       if( (file = open(filename, O_RDONLY)) < 0) {
-               printf("Cannot open file %s\n", filename);
-               exit(0);
-       }
-       if( fstat(file, &f_stat) < 0) {
-               printf("Cannot read information from file %s\n", filename);     exit(0);
-       }       
-       uint sizeFile = (f_stat.st_size)/sizeof(uint);
+/* Transforms the alphabet of x by attempting to aggregate several symbols into
+   one, while preserving the suffix order of x. The alphabet may also be
+   compacted, so that x on output comprises all integers of the new alphabet
+   with no skipped numbers.
+
+   Input: x is an array of size n+1 whose first n elements are positive
+   integers in the range l...k-1. p is array of size n+1, used for temporary
+   storage. q controls aggregation and compaction by defining the maximum value
+   for any symbol during transformation: q must be at least k-l; if q<=n,
+   compaction is guaranteed; if k-l>n, compaction is never done; if q is
+   INT_MAX, the maximum number of symbols are aggregated into one.
+   
+   Output: Returns an integer j in the range 1...q representing the size of the
+   new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is
+   set to the number of old symbols grouped into one. Only x[n] is 0.*/
+
+static int transform(int *x, int *p, int n, int k, int l, int q)
+{
+   int b, c, d, e, i, j, m, s;
+   int *pi, *pj;
+   
+   for (s=0, i=k-l; i; i>>=1)
+      ++s;                      /* s is number of bits in old symbol.*/
+   e=INT_MAX>>s;                /* e is for overflow checking.*/
+   for (b=d=r=0; r<n && d<=e && (c=d<<s|(k-l))<=q; ++r) {
+      b=b<<s|(x[r]-l+1);        /* b is start of x in chunk alphabet.*/
+      d=c;                      /* d is max symbol in chunk alphabet.*/
+   }
+   m=(1<<(r-1)*s)-1;            /* m masks off top old symbol from chunk.*/
+   x[n]=l-1;                    /* emulate zero terminator.*/
+   if (d<=n) {                  /* if bucketing possible, compact alphabet.*/
+      for (pi=p; pi<=p+d; ++pi)
+         *pi=0;                 /* zero transformation table.*/
+      for (pi=x+r, c=b; pi<=x+n; ++pi) {
+         p[c]=1;                /* mark used chunk symbol.*/
+         c=(c&m)<<s|(*pi-l+1);  /* shift in next old symbol in chunk.*/
+      }
+      for (i=1; i<r; ++i) {     /* handle last r-1 positions.*/
+         p[c]=1;                /* mark used chunk symbol.*/
+         c=(c&m)<<s;            /* shift in next old symbol in chunk.*/
+      }
+      for (pi=p, j=1; pi<=p+d; ++pi)
+         if (*pi)
+            *pi=j++;            /* j is new alphabet size.*/
+      for (pi=x, pj=x+r, c=b; pj<=x+n; ++pi, ++pj) {
+         *pi=p[c];              /* transform to new alphabet.*/
+         c=(c&m)<<s|(*pj-l+1);  /* shift in next old symbol in chunk.*/
+      }
+      while (pi<x+n) {          /* handle last r-1 positions.*/
+         *pi++=p[c];            /* transform to new alphabet.*/
+         c=(c&m)<<s;            /* shift right-end zero in chunk.*/
+      }
+   } else {                     /* bucketing not possible, don't compact.*/
+      for (pi=x, pj=x+r, c=b; pj<=x+n; ++pi, ++pj) {
+         *pi=c;                 /* transform to new alphabet.*/
+         c=(c&m)<<s|(*pj-l+1);  /* shift in next old symbol in chunk.*/
+      }
+      while (pi<x+n) {          /* handle last r-1 positions.*/
+         *pi++=c;               /* transform to new alphabet.*/
+         c=(c&m)<<s;            /* shift right-end zero in chunk.*/
+      }
+      j=d+1;                    /* new alphabet size.*/
+   }
+   x[n]=0;                      /* end-of-string symbol is zero.*/
+   return j;                    /* return new alphabet size.*/
+}
  
-       uint *se = (uint *) malloc (sizeFile);
-       uint seSize = sizeFile / sizeof(uint);
-       read(file, se,  sizeFile);              //the samples
-       close(file);    
-       return (buildIntIndex(se,seSize,build_options,index));
+/* Makes suffix array p of x. x becomes inverse of p. p and x are both of size
+   n+1. Contents of x[0...n-1] are integers in the range l...k-1. Original
+   contents of x[n] is disregarded, the n-th symbol being regarded as
+   end-of-string smaller than all other symbols.*/
+
+void suffixsort(int *x, int *p, int n, int k, int l)
+{
+   int *pi, *pk;
+   int i, j, s, sl;
+
+   V=x;                         /* set global values.*/
+   I=p;
+   
+   if (n>=k-l) {                /* if bucketing possible,*/
+      j=transform(V, I, n, k, l, n);
+      bucketsort(V, I, n, j);   /* bucketsort on first r positions.*/
+   } else {
+      transform(V, I, n, k, l, INT_MAX);
+      for (i=0; i<=n; ++i)
+         I[i]=i;                /* initialize I with suffix numbers.*/
+      h=0;
+      sort_split(I, n+1);       /* quicksort on first r positions.*/
+   }
+   h=r;                         /* number of symbols aggregated by transform.*/
+   
+   while (*I>=-n) {
+      pi=I;                     /* pi is first position of group.*/
+      sl=0;                     /* sl is negated length of sorted groups.*/
+      do {
+         if ((s=*pi)<0) {
+            pi-=s;              /* skip over sorted group.*/
+            sl+=s;              /* add negated length to sl.*/
+         } else {
+            if (sl) {
+               *(pi+sl)=sl;     /* combine sorted groups before pi.*/
+               sl=0;
+            }
+            pk=I+V[s]+1;        /* pk-1 is last position of unsorted group.*/
+            sort_split(pi, pk-pi);
+            pi=pk;              /* next group.*/
+         }
+      } while (pi<=I+n);
+      if (sl)                   /* if the array ends with a sorted group.*/
+         *(pi+sl)=sl;           /* combine sorted groups at end of I.*/
+      h=2*h;                    /* double sorted-depth.*/
+   }
+
+   for (i=0; i<=n; ++i)         /* reconstruct suffix array from inverse.*/
+      I[V[i]]=i;
  }
-*/
  
-//ticsa *createIntegerCSA(uint **aintVector, uint textSize, char *build_options) {
-int buildIntIndex (uint *aintVector, uint n, char *build_options, void **index ){
-       uint textSize=n;        
-       intVector = aintVector;  //global variable
+
+//ticsa *createIntegerCSA(uint **intVector, uint textSize, char *build_options) {
+int buildIntIndex (uint *intVector, uint n, char *build_options, void **index ){
+       uint textSize=n;
         ticsa *myicsa;
         myicsa = (ticsa *) malloc (sizeof (ticsa));
         uint *Psi, *SAI, *C, vocSize;
@@ -56,18 +344,18 @@ int buildIntIndex (uint *aintVector, uint n, char *build_options, void **index )
         nsHUFF=myicsa->tempNSHUFF;
         
         // Almacenamos o valor dalguns parametros
+       
         myicsa->suffixArraySize = textSize;
         myicsa->D = (uint*) malloc (sizeof(uint) * ((textSize+31)/32)); 
         
-       myicsa->samplesASize = (textSize + myicsa->T_A - 1) / myicsa->T_A + 1;
-       myicsa->samplesA = (uint *)malloc(sizeof(int) * myicsa->samplesASize);
+       myicsa->samplesASize = (textSize + myicsa->T_A - 1) / myicsa->T_A + 1; //--> última pos siempre sampleada!
+//     myicsa->samplesASize = (textSize + myicsa->T_A - 1) / myicsa->T_A ;//+ 1;
+       myicsa->samplesA = (uint *)malloc(sizeof(uint) * myicsa->samplesASize);
+       myicsa->samplesA[myicsa->samplesASize-1] = 0000;
         myicsa->BA = (uint*) malloc (sizeof(uint) * ((textSize+31)/32));
         myicsa->samplesAInvSize = (textSize + myicsa->T_AInv - 1) / myicsa->T_AInv;
         myicsa->samplesAInv = (uint *)malloc(sizeof(int) * myicsa->samplesAInvSize);
  
-       // Reservamos espacio para os vectores
-       Psi = (uint *) malloc (sizeof(uint) * textSize);
-
         // CONSTRUIMOS A FUNCION C
         vocSize = 0;
         for(i=0;i<textSize;i++) if(intVector[i]>vocSize) vocSize = intVector[i];
@@ -81,19 +369,35 @@ int buildIntIndex (uint *aintVector, uint n, char *build_options, void **index )
         for(i=vocSize;i>0;i--) C[i] = C[i-1];
         C[0] = 0;
  
-       // Construimos o array de sufixos (en Psi) - con quicksort
-       printf("\n\t *BUILDING THE SUFFIX ARRAY over %d integers... (with qsort)", textSize);fflush(stdout);
-       for(i=0; i<textSize; i++) Psi[i]=i;
-       
-       qsort(Psi, textSize, sizeof(uint), suffixCmp);
-       
-       
+       // Reservamos espacio para os array de sufijos
+       Psi = (uint *) malloc (sizeof(uint) * (textSize+1));
+       printf("\n\t *BUILDING THE SUFFIX ARRAY over %d integers... (with larsson-sadakane)", textSize);fflush(stdout);
+       
+/* Makes suffix array p of x. x becomes inverse of p. p and x are both of size
+   n+1. Contents of x[0...n-1] are integers in the range l...k-1. Original
+   contents of x[n] is disregarded, the n-th symbol being regarded as
+   end-of-string smaller than all other symbols.*/
+   //void suffixsort(int *x, int *p, int n, int k, int l)
+       
+       //suffixsort((int *)intVector, (int *)Psi, n, vocSize+1, 1);
+       suffixsort((int *)intVector, (int *)Psi, n-1, vocSize+1, 1);
+       //*Psi = *Psi++;        // Con esto temos o array de sufixos desde a primeira posición (sen o terminador)
         printf("\n\t ...... ended.");
  
+
+
         // CONSTRUIMOS A INVERSA DO ARRAY DE SUFIXOS
+       //SAI = (uint *) malloc (sizeof(uint) * (textSize + 1));        // +1 para repetir na ultima posición. Evitamos un if
+       //for(i=0;i<textSize;i++) SAI[Psi[i]] = i;
+       //SAI[textSize] = SAI[0];
+       
+       //SAI = intVector;
+       
         SAI = (uint *) malloc (sizeof(uint) * (textSize + 1));  // +1 para repetir na ultima posición. Evitamos un if
-       for(i=0;i<textSize;i++) SAI[Psi[i]] = i;
-       SAI[textSize] = SAI[0];
+       for(i=0;i<textSize;i++) SAI[i] = intVector[i];
+       SAI[textSize] = SAI[0]; 
+
+       //SAI[textSize] = SAI[0];
  
         // ALMACENAMOS AS MOSTRAS DO ARRAY DE SUFIXOS
         for(i=0;i<((textSize+31)/32);i++) myicsa->BA[i] = 0;
@@ -109,7 +413,14 @@ int buildIntIndex (uint *aintVector, uint n, char *build_options, void **index )
         // CONSTRUIMOS E COMPRIMIMOS PSI
         printf("\n\t Creating compressed Psi...");
         for(i=0;i<textSize;i++) Psi[i] = SAI[Psi[i]+1];
+       
+       //FILE *ff = fopen("psi.log","w");
+       //for (i=0;i<textSize;i++) fprintf(ff,"%d::%u",i,Psi[i]);
+       //fclose(ff);
+       
         free(SAI);
+       
+       
         #ifdef PSI_HUFFMANRLE   
         myicsa->hcPsi = huffmanCompressPsi(Psi,textSize,myicsa->T_Psi,nsHUFF);
         #endif                          
@@ -119,13 +430,14 @@ int buildIntIndex (uint *aintVector, uint n, char *build_options, void **index )
         #ifdef PSI_DELTACODES
         myicsa->dcPsi = deltaCompressPsi(Psi,textSize,myicsa->T_Psi);           
         #endif
+       
         free(Psi);      
-       
+               
         // Contruimos D
         for(i=0;i<((textSize+31)/32);i++) myicsa->D[i] = 0;     
         for(i=0;i<=vocSize;i++) bitset(myicsa->D, C[i]);
         myicsa->bD = createBitmap(myicsa->D,textSize);
-       free(C);
+       free(C);        
  
         // VARIABLE GLOBAL QUE ALMACENA O ESTADO DOS DISPLAYS (IMPORTANTE PARA DISPLAY SECUENCIAL)
         // Almacena a última posición do array de sufixos que mostramos con display ou displayNext
@@ -134,8 +446,8 @@ int buildIntIndex (uint *aintVector, uint n, char *build_options, void **index )
         myicsa->displayCSAState = 0;
         myicsa->displayCSAPrevPosition = -2;  //needed by DisplayCSA(position)
         
-       aintVector = intVector;
-       // Liberamos o espacion non necesario
+       //aintVector = intVector;
+       // Liberamos o espacio non necesario
  
         *index = myicsa;
         //return (myicsa);
@@ -452,7 +764,7 @@ int freeIntIndex(void *index) {
                 
         if (!myicsa) return 0;
         
-       uint total=0, totaltmp=0;
+       size_t total=0, totaltmp=0;
         
         uint nbytes;sizeIntIndex(index, &nbytes);
                 
@@ -470,25 +782,25 @@ int freeIntIndex(void *index) {
                 total+= totaltmp = myicsa->dcPsi.totalMem;
                 destroyDeltaCodesCompressedPsi(&(myicsa->dcPsi));       
         #endif
-       printf("\n\t[destroying  SA: compressed PSI structure] ...Freed %u bytes... RAM",totaltmp);
+       printf("\n\t[destroying  SA: compressed PSI structure] ...Freed %zu bytes... RAM", totaltmp);
         
         free(myicsa->D);                        total+= totaltmp =  (sizeof(uint)*((myicsa->suffixArraySize+31)/32)); 
-                                                       printf("\n\t[destroying  SA: D vector] ...Freed %u bytes... RAM",totaltmp);
+                                                       printf("\n\t[destroying  SA: D vector] ...Freed %zu bytes... RAM",totaltmp);
         free(myicsa->samplesA);         total+= totaltmp = (sizeof(uint) * myicsa->samplesASize); 
-                                                       printf("\n\t[destroying  Samples A: A   ] ...Freed %u bytes... RAM",totaltmp);
+                                                       printf("\n\t[destroying  Samples A: A   ] ...Freed %zu bytes... RAM",totaltmp);
         free(myicsa->samplesAInv);      total+= totaltmp =  (sizeof(uint) * myicsa->samplesAInvSize); 
-                                                       printf("\n\t[destroying  Samples AInv: A   ] ...Freed %u bytes... RAM",totaltmp);                                                       
-                                               printf("\n\t[destroying  rank bit D   ] ...Freed %u bytes... RAM",myicsa->bD->mem_usage);
+                                                       printf("\n\t[destroying  Samples AInv: A   ] ...Freed %zubytes... RAM",totaltmp);                                                       
+                                               printf("\n\t[destroying  rank bit D   ] ...Freed %zu bytes... RAM", (size_t)myicsa->bD->mem_usage);
         free(myicsa->BA);                       total+= totaltmp =  (sizeof(uint)*((myicsa->suffixArraySize+31)/32)); 
-                                                       printf("\n\t[destroying  SA: BA vector] ...Freed %u bytes... RAM",totaltmp);
+                                                       printf("\n\t[destroying  SA: BA vector] ...Freed %zu bytes... RAM",totaltmp);
                                                                 
                                                                 total += myicsa->bD->mem_usage;
         destroyBitmap(myicsa->bD);                      
                                                                 total += myicsa->bBA->mem_usage;
         destroyBitmap(myicsa->bBA);
                                                                 
-       printf("\n\t**** [the whole iCSA ocuppied ... %u bytes... RAM",total);
-       printf("\n\t**** iCSA size = %d bytes ",nbytes);
+       printf("\n\t**** [the whole iCSA ocuppied ... %zu bytes... RAM",total);
+       printf("\n\t**** iCSA size = %zu bytes ", (size_t) nbytes);
         printf("\n");
         
         free(myicsa);
diff --git a/swcsa/intIndex/icsa.h b/swcsa/intIndex/icsa.h

old mode 100644 (file)

new mode 100755 (executable)

index 523859d..6d4dc3d
--- a/swcsa/intIndex/icsa.h
+++ b/swcsa/intIndex/icsa.h
@@ -1 +1 @@
-//#include <stdio.h>\r#include <fcntl.h>\r#include <sys/stat.h>\r#include <time.h>\r#include <sys/time.h>\r\r#include "defValues.h"\r#include "../utils/bitmap.h"\r#include "../utils/huff.h"\r#include "../utils/parameters.h"\r\r#ifdef PSI_HUFFMANRLE\r #include "psiHuffmanRLE.h"\r#endif\r\r#ifdef PSI_GONZALO\r  #include "psiGonzalo.h"\r#endif\r\r#ifdef PSI_DELTACODES\r  #include "psiDeltaCode.h"\r#endif\r\r\rtypedef struct {\r    uint suffixArraySize;\r  uint T_Psi;\r    uint *D;\r       bitmap bD;\r     uint T_A;\r      uint T_AInv;\r   uint *samplesA;\r        uint samplesASize;\r     uint *BA;\r      bitmap bBA;     \r       uint *samplesAInv;\r     uint samplesAInvSize;\r  uint displayCSAState;\r  int displayCSAPrevPosition;\r    #ifdef PSI_HUFFMANRLE   \r       HuffmanCompressedPsi hcPsi;\r    #endif  \r       #ifdef PSI_GONZALO\r     GonzaloCompressedPsi gcPsi;\r    #endif\r #ifdef PSI_DELTACODES\r  DeltaCompressedPsi dcPsi;\r      #endif\r \r       //only needed during "parse_parameters".\r       uint tempNSHUFF;\r       uint psiSearchFactorJump;  //factor of the T_Psi value.\r} ticsa;        \r\r      \r// FUNCTION PROTOTYPES: BUILDING THE INDEX\r\r//Creates the ICSA \r\r      int buildIntIndex (uint *intVector, uint n, char *build_options, void **index ); //ticsa *createIntegerCSA (uint **aintVector, uint SAsize, char *build_options);\r\r//Returns number of elements in the indexed sequence of integers\r    int sourceLenIntIndex(void *index, uint *numInts);\r\r//Save the index to disk\r   int saveIntIndex(void *index, char *pathname); //void storeStructsCSA(ticsa *myicsa, char *basename);\r\r// Loads the index from disk.\r   int loadIntIndex(char *pathname, void **index);  //ticsa *loadCSA(char *basename);\r\r//  Frees memory    \r       int freeIntIndex(void *index); //uint destroyStructsCSA(ticsa *myicsa);\r\r//Returns the size (in bytes) of the index over the sequence of integers.\r     int sizeIntIndex(void *index, uint *numBytes); //uint CSA_size(ticsa *myicsa);  \r\r      // Shows detailed summary info of the self-index (memory usage of each structure)\rint printInfoIntIndex(void *index, const char tab[]);\r\r//Number of occurrences of the pattern, and the interval [left,right] in the suffix array.\r    int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right);\r                  //uint countCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right);               // Exponential search\r                  //uint countCSABin(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right);    // Binary search\r\r// Returns an array with integers corresponding offsets to the occurrences of the pattern, \r// as well as the number of occurrences\r  int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong *numocc);\r               //uint *locateCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *occ);\r\r//Returns the value of the source (array of integers) at a given offset.\r// (that is, the element "position" from the original array of uints)\r  int displayIntIndex(void *index, ulong position, uint *value);\r         //uint displayCSA(ticsa *myicsa, uint position);        \r\r\r/* Private function prototypes ********************************************/\ruint parametersCSA(ticsa *myicsa, char *build_options);\r\ruint displayCSAFirst(ticsa *myicsa, uint position);\ruint displayCSANext(ticsa *myicsa);\rint  SadCSACompare(ticsa *myicsa, uint *pattern, uint patternSize, uint p);\ruint A(ticsa *myicsa, uint position);\ruint inverseA(ticsa *myicsa, uint offset);\r\rvoid showStructsCSA(ticsa *myicsa);              // For Debugging\r\r
-\ No newline at end of file
+/* icsa.h\r * Copyright (C) 2011, Antonio Fariña and Eduardo Rodriguez, all rights reserved.\r *\r * icsa.h: Definition of the functions of the interface "../intIndex/interfaceIntIndex.h"\r *   that permits to represent a sequence of uint32 integers with an iCSA: \r *   An integer-oriented Compressed Suffix Array.\r *   Such representation will be handled as a "ticsa" data structure, that\r *   the WCSA self-index will use (as an opaque type) to \r *   create/save/load/search/recover/getSize of the representation of the \r *   original sequence.\r *   Suffix sorting is done via Larson/Sadakane suffix sort algorithm \r *   from the char-based csa.\r * \r *   Ref: Larsson, N. J. and Sadakane, K. 2007. Faster suffix sorting. Theoretical \r *        Computer Science 387, 3, 258–272.\r *    \r * See more details in:\r * Antonio Fariña, Nieves Brisaboa, Gonzalo Navarro, Francisco Claude, Ángeles Places, \r * and Eduardo Rodríguez. Word-based Self-Indexes for Natural Language Text. ACM \r * Transactions on Information Systems (TOIS), 2012. \r * http://vios.dc.fi.udc.es/indexing/wsi/publications.html\r * http://www.dcc.uchile.cl/~gnavarro/ps/tois11.pdf       \r * \r * This library is free software; you can redistribute it and/or\r * modify it under the terms of the GNU Lesser General Public\r * License as published by the Free Software Foundation; either\r * version 2.1 of the License, or (at your option) any later version.\r *\r * This library is distributed in the hope that it will be useful,\r * but WITHOUT ANY WARRANTY; without even the implied warranty of\r * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\r * Lesser General Public License for more details.\r *\r * You should have received a copy of the GNU Lesser General Public\r * License along with this library; if not, write to the Free Software\r * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA\r *\r */\r\r#include <stdio.h>\r\r#include <fcntl.h>\r#include <sys/stat.h>\r#include <time.h>\r#include <sys/time.h>\r\r#include "../intIndexUtils/defValues.h"\r\r#include "../utils/bitmap.h"\r#include "../utils/huff.h"\r#include "../utils/parameters.h"\r\r#ifdef PSI_HUFFMANRLE\r        #include "../intIndexUtils/psiHuffmanRLE.h"\r#endif\r\r#ifdef PSI_GONZALO\r #include "../intIndexUtils/psiGonzalo.h"\r#endif\r\r#ifdef PSI_DELTACODES\r #include "../intIndexUtils/psiDeltaCode.h"\r#endif\r\r\rtypedef struct {\r   uint suffixArraySize;\r  uint T_Psi;\r    uint *D;\r       bitmap bD;\r     uint T_A;\r      uint T_AInv;\r   uint *samplesA;\r        uint samplesASize;\r     uint *BA;\r      bitmap bBA;     \r       uint *samplesAInv;\r     uint samplesAInvSize;\r  uint displayCSAState;\r  long displayCSAPrevPosition;\r   #ifdef PSI_HUFFMANRLE   \r       HuffmanCompressedPsi hcPsi;\r    #endif  \r       #ifdef PSI_GONZALO\r     GonzaloCompressedPsi gcPsi;\r    #endif\r #ifdef PSI_DELTACODES\r  DeltaCompressedPsi dcPsi;\r      #endif\r \r       //only needed during "parse_parameters".\r       uint tempNSHUFF;\r       uint psiSearchFactorJump;  //factor of the T_Psi value.\r} ticsa;        \r\r      \r// FUNCTION PROTOTYPES: BUILDING THE INDEX\r\r//Creates the ICSA \r\r      int buildIntIndex (uint *intVector, uint n, char *build_options, void **index ); //ticsa *createIntegerCSA (uint **aintVector, uint SAsize, char *build_options);\r\r//Returns number of elements in the indexed sequence of integers\r    int sourceLenIntIndex(void *index, uint *numInts);\r\r//Save the index to disk\r   int saveIntIndex(void *index, char *pathname); //void storeStructsCSA(ticsa *myicsa, char *basename);\r\r// Loads the index from disk.\r   int loadIntIndex(char *pathname, void **index);  //ticsa *loadCSA(char *basename);\r\r//  Frees memory    \r       int freeIntIndex(void *index); //uint destroyStructsCSA(ticsa *myicsa);\r\r//Returns the size (in bytes) of the index over the sequence of integers.\r     int sizeIntIndex(void *index, uint *numBytes); //uint CSA_size(ticsa *myicsa);  \r\r      // Shows detailed summary info of the self-index (memory usage of each structure)\rint printInfoIntIndex(void *index, const char tab[]);\r\r//Number of occurrences of the pattern, and the interval [left,right] in the suffix array.\r    int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right);\r                  //uint countCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right);               // Exponential search\r                  //uint countCSABin(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right);    // Binary search\r\r// Returns an array with integers corresponding offsets to the occurrences of the pattern, \r// as well as the number of occurrences\r  int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong *numocc);\r               //uint *locateCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *occ);\r\r//Returns the value of the source (array of integers) at a given offset.\r// (that is, the element "position" from the original array of uints)\r  int displayIntIndex(void *index, ulong position, uint *value);\r         //uint displayCSA(ticsa *myicsa, uint position);        \r\r\r/* Private function prototypes ********************************************/\ruint parametersCSA(ticsa *myicsa, char *build_options);\r\ruint displayCSAFirst(ticsa *myicsa, uint position);\ruint displayCSANext(ticsa *myicsa);\rint  SadCSACompare(ticsa *myicsa, uint *pattern, uint patternSize, uint p);\ruint A(ticsa *myicsa, uint position);\ruint inverseA(ticsa *myicsa, uint offset);\r\rvoid showStructsCSA(ticsa *myicsa);              // For Debugging\r\r
+\ No newline at end of file
diff --git a/swcsa/intIndexUtils b/swcsa/intIndexUtils

new file mode 120000 (symlink)

index 0000000..245a84f
--- /dev/null
+++ b/swcsa/intIndexUtils
@@ -0,0 +1 @@
+intIndex/
+\ No newline at end of file
author	Kim Nguyễn <kn@lri.fr>
	Thu, 1 Mar 2012 13:18:13 +0000 (14:18 +0100)
committer	Kim Nguyễn <kn@lri.fr>
	Thu, 1 Mar 2012 13:18:13 +0000 (14:18 +0100)
Makefile		patch \| blob \| history
swcsa/intIndex/Makefile		patch \| blob \| history
swcsa/intIndex/icsa.c	[changed mode: 0644->0755]	patch \| blob \| history
swcsa/intIndex/icsa.h	[changed mode: 0644->0755]	patch \| blob \| history
swcsa/intIndexUtils	[new symlink]	patch \| blob