1 /* only for getTime() */
4 #include <sys/resource.h>
5 #include "utils/basics.h"
7 #include "utils/valstring.h"
8 #include "utils/defValues.h"
9 #include "utils/MemoryManager.h"
10 #include "utils/fileInfo.h"
12 #include "utils/hash.h"
14 #include "utils/huff.h"
15 //#include "utils/errors.c"
16 #include "utils/parameters.h"
18 //from SEARCHER FACADE
19 #include "utils/huffDec.h"
20 //#include "icsa/icsa.h"
22 #include "intIndex/interfaceIntIndex.h"
25 #define uchar unsigned char
28 #define uint unsigned int
31 #define ulong unsigned long
34 #define STRLEN(str,len) \
37 while(*ptr++) len++; \
40 #define ADDLEN(str,len) \
42 while(*ptr++) len++; \
45 /** Some data types used **ONLY**during construction process */
47 // Words, both the canonical words and their variants
49 unsigned long slot; // the position in the hash table of the canonical word
50 byte *word; //makes alphanumerical sorting easier...
54 typedef struct SzoneMem { //a large block of memory to load a file into mem.
55 byte *zone; //block of mem.
56 uint size; //number of bytes
59 // words dataStructure.
62 uint elemSize; //the size (in bits) of each pointer.
63 tZoneMem wordsZoneMem; // a block of memory where the canonical words are loaded (from file).
68 /** Some data types used during searches */
73 /**the WCSA index structures... */
77 twords wordsData; /* vocabulary (words) of the index */
79 ulong n; /* number of different words. */
80 uint seSize; /* number of words in the source text */
82 uint sourceTextSize; /*the size of the source text in bytes*/
84 //ticsa *myicsa; //the WiCSA on SE words
85 void *myicsa; //the WiCSA on SE words
94 /** ******************************************************************************
95 * Interface (from pizza chili) for using the WCSA index
96 *********************************************************************************/
98 /* Error management */
100 /* Returns a string describing the error associated with error number
101 e. The string must not be freed, and it will be overwritten with
104 char *error_index (int e);
106 /* Building the index */
108 /* Creates index from text[0..length-1]. Note that the index is an
109 opaque data type. Any build option must be passed in string
110 build_options, whose syntax depends on the index. The index must
111 always work with some default parameters if build_options is NULL.
112 The returned index is ready to be queried. */
114 int build_index (uchar *text, ulong length, char *build_options, void **index);
116 /* Saves index on disk by using single or multiple files, having
117 proper extensions. */
119 int save_index (void *index, char *filename);
121 /* Loads index from one or more file(s) named filename, possibly
122 adding the proper extensions. */
124 int load_index (char *filename, void **index);
126 /* Frees the memory occupied by index. */
128 int free_index (void *index);
130 /* Gives the memory occupied by index in bytes. */
132 int index_size(void *index, ulong *size);
134 /* Querying the index */
136 /* Writes in numocc the number of occurrences of the substring
137 pattern[0..length-1] found in the text indexed by index. */
139 int count (void *index, uchar *pattern, ulong length, ulong *numocc);
141 /* Gives the length of the text indexed */
143 int get_length(void *index, ulong *length);
145 /* Accessing the indexed text */
147 /* Writes in numocc the number of occurrences of the substring
148 pattern[0..length-1] in the text indexed by index. It also allocates
149 occ (which must be freed by the caller) and writes the locations of
150 the numocc occurrences in occ, in arbitrary order. */
152 int locate (void *index, uchar *pattern, ulong length, ulong **occ,
155 /* Allocates snippet (which must be freed by the caller) and writes
156 the substring text[from..to] into it. Returns in snippet_length the
157 length of the text snippet actually extracted (that could be less
158 than to-from+1 if to is larger than the text size). */
160 int extract (void *index, ulong from, ulong to, uchar **snippet,
161 ulong *snippet_length);
163 /* Displays the text (snippet) surrounding any occurrence of the
164 substring pattern[0..length-1] within the text indexed by index.
165 The snippet must include numc characters before and after the
166 pattern occurrence, totaling length+2*numc characters, or less if
167 the text boundaries are reached. Writes in numocc the number of
168 occurrences, and allocates the arrays snippet_text and
169 snippet_lengths (which must be freed by the caller). The first is a
170 character array of numocc*(length+2*numc) characters, with a new
171 snippet starting at every multiple of length+2*numc. The second
172 gives the real length of each of the numocc snippets. */
174 int display (void *index, uchar *pattern, ulong length, ulong numc,
175 ulong *numocc, uchar **snippet_text, ulong **snippet_lengths);
177 /* Obtains the length of the text indexed by index. */
/* NOTE(review): same parameter list and documented purpose as get_length()
   declared earlier in this header; presumably one of the two is a
   compatibility alias of the other -- confirm against the definitions. */
179 int length (void *index, ulong *length);
181 /* Shows summary info of the index */
182 int printInfo(void *index);
184 /** *******************************************************************************************/
185 /** Building part of the index ****************************************************************/
/* Builds the WCSA index; takes the same parameter list as build_index().
   NOTE(review): presumably the worker that build_index() delegates to and
   *index receives the newly built index -- confirm against the definition. */
187 int build_WCSA (uchar *text, ulong length, char *build_options, void **index);
/* NOTE(review): presumably builds the iCSA integer index kept in the WCSA
   (see the myicsa field), configured through the build_options string --
   confirm against the definition. */
188 int build_iCSA (char *build_options, void *index);
192 /** *******************************************************************************************/
193 /** Search part of the index ******************************************************************/
194 // Definitions of some PUBLIC function prototypes.
196 //Loading/freeing the data structures into memory.
/* loadStructs takes a twcsa pointer plus the base name of the index files;
   loadWCSA takes only a file name and returns a twcsa pointer.
   NOTE(review): presumably loadWCSA allocates the twcsa and fills it through
   loadStructs, and the result is released with free_index() -- confirm. */
198 void loadStructs(twcsa *wcsa, char *basename);
199 twcsa *loadWCSA(char *filename);
201 //Returns the source text located between the offsets [offsetIni, offsetFin].
202 //byte *displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin);
/* displayFacadeMalloc returns the text as a byte pointer, whereas displayFacade
   takes a destination pointer (dstptr) and returns an int. Both take a
   `length` out-parameter.
   NOTE(review): presumably *length receives the number of bytes produced, the
   pointer returned by the Malloc variant must be freed by the caller, and
   dstptr must be large enough for the requested range -- confirm against the
   definitions. */
203 byte *displayFacadeMalloc (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length);
204 int displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length, byte *dstptr);
206 //Locates all the occurrences of a word/phrase.
/* NOTE(review): presumably sePositions holds `number` positions in the word
   sequence and sourceTextPositions receives the matching offsets in the
   source text (caller-provided array of at least `number` entries) --
   confirm against the definition. */
207 int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint number);
209 //Shows the text around the occurrences of a word.
/* NOTE(review): `radix` is presumably the amount of context shown around each
   of the `number` occurrences; its unit (characters or words) is not visible
   here -- confirm. */
210 int locateAllAndDisplay (twcsa *wcsa, uint *sePositions, uint number, int radix);
212 //Recovers the source text by calling display (either only once or "len" times).
/* NOTE(review): presumably variant 1 issues a single display call and variant 2
   one call per position, writing the result to a file named from basename and
   ext -- confirm. The same two prototypes are declared a second time further
   down in this header. */
213 void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
214 void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
216 //***Searching for a TEXT pattern ...
218 //Extracts the ids of the valid words of a "plain text".
/* textPattern/patLen describe the input text; integerPattern and sizeIntegers
   are output parameters.
   NOTE(review): presumably integerPattern is a caller-provided array that
   receives the word ids and *sizeIntegers receives how many were written --
   confirm the required capacity against the definition. */
219 void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *integerPattern, uint *sizeIntegers) ;
221 //Counts the occurrences of a given text pattern.
/* NOTE(review): textPattern comes without an explicit length, so it is
   presumably a NUL-terminated string; the int result is presumably the
   number of occurrences -- confirm. */
222 int countTextOcurrences(twcsa *wcsa, byte *textPattern);
224 //Returns the offsets (into the source text) where a given text pattern appears.
/* NOTE(review): presumably *numberOccurrences receives the number of offsets
   in the returned array and the array must be freed by the caller;
   textPattern is presumably NUL-terminated (no length is passed) -- confirm. */
225 uint *locateTextOcurrences(twcsa *wcsa, byte *textPattern, int *numberOccurrences);
227 //Shows a snippet with the text around the occurrences of a pattern.
/* NOTE(review): radixDisplay is presumably the amount of context shown on each
   side of an occurrence; textPattern is presumably NUL-terminated (no length
   is passed) -- confirm. */
228 int displayTextOcurrences(twcsa *wcsa, byte *textPattern, uint radixDisplay);
231 /** ***********************************************************************************
232 * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord
233 * ***********************************************************************************/
234 /** Writes in numocc the number of occurrences of the substring
235 pattern[0..length-1] in the text indexed by index. It also allocates
236 occ (which must be freed by the caller) and writes the locations of
237 the numocc occurrences in occ, in arbitrary order. These occurrences
238 refer to the offsets in TOH where the caller could start a display
239 operation. So locateWord implies synchronization using B.
240 ** Parameter kbefore sets locateWord not to obtain the offset in TOH of the
241 searched word, but the offset in TOH of k-before words before.
244 int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore);
246 /** Displays the text (snippet) surrounding any occurrence of the
247 substring pattern[0..length-1] within the text indexed by index.
248 The snippet must include numc characters before and after the
249 pattern occurrence, totaling length+2*numc characters, or less if
250 the text boundaries are reached. Writes in numocc the number of
251 occurrences, and allocates the arrays snippet_text and
252 snippet_lengths (which must be freed by the caller). The first is a
253 character array of numocc*(length+2*numc) characters, with a new
254 snippet starting at every multiple of length+2*numc. The second
255 gives the real length of each of the numocc snippets. */
/* NOTE(review): the kbefore parameter is not covered by the description above;
   presumably it has the same meaning as the kbefore parameter of locateWord
   (start k words before the matched word) -- confirm. */
257 int displayWords (void *index, uchar *pattern, ulong length, ulong numc,
258 ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore);
261 /** Simulates the text extraction process, but does not actually return anything at all.
262 Extracts up to 2K words, starting K=wordsbefore words before each occurrence of a pattern.
263 Fewer than 2K words can be extracted if more than numc (presumably the maxnumc parameter) characters have already been obtained.
264 Does nothing else: the extracted text is not returned. */
266 int displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc);
269 /** Allocates text (which must be freed by the caller) and recovers the
270 substring of text starting from the "fromWord"-th word up to the
271 "toWord"-th word. Returns in text the text, and in "text_lenght" the
272 length of the text actually extracted. Text is allocated.
273 Actually extracts SE[fromWord .. toWord) ... not the last word. */
275 int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text,
281 //Recovers the source text by calling display (either only once or "len" times).
/* NOTE(review): these two prototypes repeat, with identical signatures, the
   declarations that appear earlier in this header among the public search
   functions. The repetition is legal C but redundant; one copy could be
   dropped. */
282 void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
283 void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
286 // Declarations of PRIVATE functions
288 //Auxiliary functions
/* NOTE(review): judging by the names, these report the size of the index
   structures on disk and in memory respectively. Both return uint, so where
   uint is 32 bits a size of 4 GiB or more cannot be represented -- confirm
   units and limits against the definitions. */
290 uint structsSizeDisk(twcsa *wcsa);
291 uint structsSizeMem(twcsa *wcsa);
/* NOTE(review): presumably a shorter variant of printInfo() -- confirm. */
292 void printInfoReduced(twcsa *wcsa);
/* NOTE(review): presumably writes the n integers of v to a file whose name is
   derived from basename; the meaning of the int result is not visible here --
   confirm. */
293 int saveSEfile (char *basename, uint *v, uint n);
/* NOTE(review): presumably returns a time stamp in seconds as a double; the
   top of this file includes <sys/resource.h> "only for getTime()" -- confirm
   which clock is measured. */
294 double getTime2 (void);