swcsa/buildFacade.h

   1 /* only for getTime() */
   2
   3 #include <sys/time.h>
   4 #include <sys/resource.h>
   5 #include "utils/basics.h"
   6
   7 #include "utils/valstring.h"
   8 #include "utils/defValues.h"
   9 #include "utils/MemoryManager.h"
  10 #include "utils/fileInfo.h"
  11
  12 #include "utils/hash.h"
  13
  14 #include "utils/huff.h"
  15 //#include "utils/errors.c"
  16 #include "utils/parameters.h"
  17
  18 //from SEARCHER FACADE
  19 #include "utils/huffDec.h"
  20 //#include "icsa/icsa.h"
  21
  22 #include "intIndex/interfaceIntIndex.h"
  23
  /* Short aliases for the unsigned integer types used throughout this header.
     Each alias is a macro, and it is only defined when no macro with the same
     name has been defined before this point. */
  #if !defined(uchar)
  #  define uchar unsigned char
  #endif

  #if !defined(uint)
  #  define uint  unsigned int
  #endif

  #if !defined(ulong)
  #  define ulong unsigned long
  #endif
  33
  /* STRLEN(str, len): sets `len` to the number of bytes that precede the
     first zero byte of `str`.

     str - expression usable as a pointer to zero-terminated bytes; it is
           evaluated exactly once, after `len` has been reset to 0.
     len - modifiable lvalue receiving the count. It is evaluated once per
           counted byte (plus once for the reset), so it must not have side
           effects.

     `len` is parenthesized so that lvalues such as `*out` are incremented
     themselves (unparenthesized, `*out++` would advance the pointer). The
     internal cursor has a deliberately unusual name so that a caller variable
     called `ptr` can be passed as `str` without being shadowed.

     The expansion is a brace-enclosed block, not do { } while (0), so call
     sites written without a trailing `;` keep compiling. Because of that, do
     not use it as the unbraced body of an `if` that is followed by `else`. */
  #define STRLEN(str,len) \
  { (len) = 0; \
    const byte *strlenCursor_ = (str); \
    while (*strlenCursor_++) (len)++; \
  }
  39
  /* ADDLEN(str, len): adds to `len` the number of bytes that precede the
     first zero byte of `str`. Unlike STRLEN, `len` is NOT reset first.

     str - expression usable as a pointer to zero-terminated bytes; it is
           evaluated exactly once.
     len - modifiable lvalue that accumulates the count. It is evaluated once
           per counted byte, so it must not have side effects.

     `len` is parenthesized so that lvalues such as `*out` are incremented
     themselves (unparenthesized, `*out++` would advance the pointer). The
     internal cursor has a deliberately unusual name so that a caller variable
     called `ptr` can be passed as `str` without being shadowed.

     The expansion is a brace-enclosed block, not do { } while (0), so call
     sites written without a trailing `;` keep compiling. Because of that, do
     not use it as the unbraced body of an `if` that is followed by `else`. */
  #define ADDLEN(str,len) \
  { const byte *addlenCursor_ = (str); \
    while (*addlenCursor_++) (len)++; \
  }
  44
  45 /** Some data types used **ONLY**during construction process */
  46
  47 // Words, both the canonical words and their variants
  48         typedef struct {
  49                 unsigned long slot;   // the position in the hash table of the canonical word
  50                 byte *word; //makes alphanumerical sorting easier...
  51         } tposInHT;
  52
  53
  54         typedef struct SzoneMem {        //a large block of memory to load a file into mem.
  55                 byte *zone;  //block of mem.
  56                 uint size;  //number of bytes
  57         } tZoneMem;
  58
  59         //  words dataStructure.
  60         typedef struct {
  61                 uint *words;
  62                 uint elemSize;  //the size (in bits) of each pointer.
  63                 tZoneMem wordsZoneMem; // a block of memory where the canonical words are loaded (from file).
  64
  65         } twords;
  66
  67
  68 /** Some data types used during searches */
  69
  70
  71
  72
  73         /**the WCSA index structures... */
  74         typedef struct {
  75
  76                 /**valid words */
  77                 twords wordsData;               /* vocabulary (words) of the index */
  78
  79                 ulong n;                                /* number of different words. */
  80                 uint seSize;                    /* number of words in the source text */
  81
  82                 uint sourceTextSize;    /*the size of the source text in bytes*/
  83
  84                 //ticsa *myicsa; //the WiCSA on SE words
  85                 void *myicsa; //the WiCSA on SE words
  86
  87                 //#ifndef CSA_ON
  88                 uint *se;
  89                 //#endif
  90
  91         }twcsa;
  92
  93
  94 /** ******************************************************************************
  95     * Interface (from pizza chili) for using the WCSA index
  96 *********************************************************************************/
  97
  /* Error management */

  /**
   * Returns a string describing the error associated with error number e.
   * The returned string must not be freed by the caller, and it is
   * overwritten by subsequent calls.
   */
  char *error_index(int e);
 105
 106 /* Building the index */
 107
 108         /* Creates index from text[0..length-1]. Note that the index is an
 109           opaque data type. Any build option must be passed in string
 110           build_options, whose syntax depends on the index. The index must
 111           always work with some default parameters if build_options is NULL.
 112           The returned index is ready to be queried. */
 113
 114 int build_index (uchar *text, ulong length, char *build_options, void **index);
 115
 116         /*  Saves index on disk by using single or multiple files, having
 117           proper extensions. */
 118
 119 int save_index (void *index, char *filename);
 120
 121         /*  Loads index from one or more file(s) named filename, possibly
 122           adding the proper extensions. */
 123
 124 int load_index (char *filename, void **index);
 125
 126         /* Frees the memory occupied by index. */
 127
 128 int free_index (void *index);
 129
 130         /* Gives the memory occupied by index in bytes. */
 131
 132 int index_size(void *index, ulong *size);
 133
 134 /* Querying the index */
 135
 136         /* Writes in numocc the number of occurrences of the substring
 137           pattern[0..length-1] found in the text indexed by index. */
 138
 139 int count (void *index, uchar *pattern, ulong length, ulong *numocc);
 140
 141         /* Gives the length of the text indexed */
 142
 143 int get_length(void *index, ulong *length);
 144
 145 /* Accessing the indexed text  */
 146
 147         /* Writes in numocc the number of occurrences of the substring
 148           pattern[0..length-1] in the text indexed by index. It also allocates
 149           occ (which must be freed by the caller) and writes the locations of
 150           the numocc occurrences in occ, in arbitrary order.  */
 151
 152 int locate (void *index, uchar *pattern, ulong length, ulong **occ,
 153         ulong *numocc);
 154
 155         /*  Allocates snippet (which must be freed by the caller) and writes
 156           the substring text[from..to] into it. Returns in snippet_length the
 157           length of the text snippet actually extracted (that could be less
 158           than to-from+1 if to is larger than the text size). */
 159
 160 int extract (void *index, ulong from, ulong to, uchar **snippet,
 161         ulong *snippet_length);
 162
 163         /* Displays the text (snippet) surrounding any occurrence of the
 164           substring pattern[0..length-1] within the text indexed by index.
 165           The snippet must include numc characters before and after the
 166           pattern occurrence, totalizing length+2*numc characters, or less if
 167           the text boundaries are reached. Writes in numocc the number of
 168           occurrences, and allocates the arrays snippet_text and
 169           snippet_lengths (which must be freed by the caller). The first is a
 170           character array of numocc*(length+2*numc) characters, with a new
 171           snippet starting at every multiple of length+2*numc. The second
 172           gives the real length of each of the numocc snippets. */
 173
 174 int display (void *index, uchar *pattern, ulong length, ulong numc,
 175         ulong *numocc, uchar **snippet_text, ulong **snippet_lengths);
 176
 177         /*  Obtains the length of the text indexed by index. */
 178
 179 int length (void *index, ulong *length);
 180
 181                 /* Shows summary info of the index */
 182 int printInfo(void *index);
 183
 184 /** *******************************************************************************************/
 185 /** Building part of the index ****************************************************************/
 186
 187 int build_WCSA (uchar *text, ulong length, char *build_options, void **index);
 188 int build_iCSA (char  *build_options, void *index);
 189
 190
 191
 192 /** *******************************************************************************************/
 193 /** Search part of the index ******************************************************************/
 194 // Definitions of some PUBLIC function prototipes.
 195
 196                 //loading/freeing the data structures into memory.
 197
 198     void loadStructs(twcsa *wcsa, char *basename);
 199         twcsa *loadWCSA(char *filename);
 200
 201                 //returns the source text from given [offsetIni, offsetFin] offsets.
 202         //byte *displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin);
 203         byte *displayFacadeMalloc (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length);
 204         int displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length, byte *dstptr);
 205
 206                 //locate all the ocurrences of a word/phrase
 207         int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint number);
 208
 209                 //show text around the occurrences of a word.
 210         int locateAllAndDisplay (twcsa *wcsa, uint *sePositions, uint number, int radix);
 211
 212                 //recovers the source text by calling display (either only once or "len" times)
 213         void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
 214         void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
 215
 216         //***Searching for a TEXT pattern ...
 217
 218                 //extracts the ids of the valid words of a "plain text".
 219         void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *integerPattern, uint *sizeIntegers) ;
 220
 221                 //counts the occurrences of a given text pattern.
 222         int countTextOcurrences(twcsa *wcsa, byte *textPattern);
 223
 224                 //returns the offsets (to the source text) where of a given text pattern appears.
 225         uint *locateTextOcurrences(twcsa *wcsa, byte *textPattern, int *numberOccurrences);
 226
 227                 //shows a snippet with the text around the ocurrences of a pattern.
 228         int displayTextOcurrences(twcsa *wcsa, byte *textPattern, uint radixDisplay);
 229
 230
 231 /** ***********************************************************************************
 232   * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord
 233   * ***********************************************************************************/
 234         /** Writes in numocc the number of occurrences of the substring
 235           pattern[0..length-1] in the text indexed by index. It also allocates
 236           occ (which must be freed by the caller) and writes the locations of
 237           the numocc occurrences in occ, in arbitrary order. These occurrences
 238           refer to the offsets in TOH where the caller could start a display
 239           operation. So locateWord implies synchronization using B.
 240           ** Parameter kbefore sets locateWord not to obtain the offset in TOH of the
 241              searched word, but the offset in TOH of k-before words before.
 242         */
 243
 244 int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore);
 245
 246   /** Displays the text (snippet) surrounding any occurrence of the
 247     substring pattern[0..length-1] within the text indexed by index.
 248     The snippet must include numc characters before and after the
 249     pattern occurrence, totalizing length+2*numc characters, or less if
 250     the text boundaries are reached. Writes in numocc the number of
 251     occurrences, and allocates the arrays snippet_text and
 252     snippet_lengths (which must be freed by the caller). The first is a
 253     character array of numocc*(length+2*numc) characters, with a new
 254     snippet starting at every multiple of length+2*numc. The second
 255     gives the real length of each of the numocc snippets. */
 256
 257  int displayWords (void *index, uchar *pattern, ulong length, ulong numc,
 258          ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore);
 259
 260
 261 /** simulates extration of text process, but do not actually returns anything at all
 262    Extracts upto <=2K words from K=wordsbefore words before each occurrence of a pattern.
 263    Less than 2K words can be extracted if more than numc characters have been already obtained.
 264    Do nothing else... do not return the text */
 265
 266         int  displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc);
 267
 268
 269 /**  Allocates text (which must be freed by the caller) and recovers the
 270   the substring of text starting from the "fromword"-th word up to the
 271   "toWord"-th words. Returns in text the text, and in "text_lenght" the
 272   length of the text  actually extracted. Text is allocated.
 273   Actually extracts SE[fromWord .. toWord) ... not the last word.    */
 274
 275 int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text,
 276         ulong *text_length);
 277
 278
 279
 280
 /* recoverSourceText1() and recoverSourceText2() (recover the source text by
    calling display, either only once or "len" times) are already declared
    earlier in this header, together with the other display/locate
    prototypes, so they are not declared a second time here. */
 284
 285
 286 // Definitions of PRIVATE functions
 287
 288         //Auxiliary functions
 289
 290         uint structsSizeDisk(twcsa *wcsa);
 291         uint structsSizeMem(twcsa *wcsa);
 292         void printInfoReduced(twcsa *wcsa);
 293         int saveSEfile (char *basename, uint *v, uint n);
 294         double getTime2 (void);
 295
 296