1 /* only for getTime() */
3 #include <sys/resource.h>
6 #include "utils/valstring.h"
7 #include "utils/defValues.h"
8 #include "utils/MemoryManager.h"
9 #include "utils/fileInfo.h"
11 #include "utils/hash.h"
13 #include "utils/huff.h"
14 //#include "utils/errors.c"
15 #include "utils/parameters.h"
17 //from SEARCHER FACADE
18 #include "utils/huffDec.h"
19 //#include "icsa/icsa.h"
21 #include "intIndex/interfaceIntIndex.h"
/* Shorthand names for the unsigned integer types used by the declarations
   in this file. These are preprocessor macros, not typedefs, so the
   identifiers uchar/uint/ulong are replaced textually wherever they appear
   after this point. */
24 #define uchar unsigned char
27 #define uint unsigned int
30 #define ulong unsigned long
33 #define STRLEN(str,len) \
36 while(*ptr++) len++; \
39 #define ADDLEN(str,len) \
41 while(*ptr++) len++; \
44 /** Some data types used **ONLY** during the construction process */
46 // Words, both the canonical words and their variants
48 unsigned long slot; // the position in the hash table of the canonical word
49 byte *word; //makes alphanumerical sorting easier...
53 typedef struct SzoneMem { //a large block of memory to load a file into mem.
54 byte *zone; //block of mem.
55 uint size; //number of bytes
58 // words dataStructure.
61 uint elemSize; //the size (in bits) of each pointer.
62 tZoneMem wordsZoneMem; // a block of memory where the canonical words are loaded (from file).
67 /** Some data types used during searches */
72 /**the WCSA index structures... */
76 twords wordsData; /* vocabulary (words) of the index */
78 ulong n; /* number of different words. */
79 uint seSize; /* number of words in the source text */
81 uint sourceTextSize; /*the size of the source text in bytes*/
83 //ticsa *myicsa; //the WiCSA on SE words
84 void *myicsa; //the WiCSA on SE words
93 /** ******************************************************************************
94 * Interface (from pizza chili) for using the WCSA index
95 *********************************************************************************/
97 /* Error management */
99 /* Returns a string describing the error associated with error number
100 e. The string must not be freed, and it will be overwritten with subsequent calls. */
103 char *error_index (int e);
105 /* Building the index */
107 /* Creates index from text[0..length-1]. Note that the index is an
108 opaque data type. Any build option must be passed in string
109 build_options, whose syntax depends on the index. The index must
110 always work with some default parameters if build_options is NULL.
111 The returned index is ready to be queried. */
113 int build_index (uchar *text, ulong length, char *build_options, void **index);
115 /* Saves index on disk by using single or multiple files, having
116 proper extensions. */
118 int save_index (void *index, char *filename);
120 /* Loads index from one or more file(s) named filename, possibly
121 adding the proper extensions. */
123 int load_index (char *filename, void **index);
125 /* Frees the memory occupied by index. */
127 int free_index (void *index);
129 /* Gives the memory occupied by index in bytes. */
131 int index_size(void *index, ulong *size);
133 /* Querying the index */
135 /* Writes in numocc the number of occurrences of the substring
136 pattern[0..length-1] found in the text indexed by index. */
138 int count (void *index, uchar *pattern, ulong length, ulong *numocc);
140 /* Gives the length of the text indexed */
142 int get_length(void *index, ulong *length);
144 /* Accessing the indexed text */
146 /* Writes in numocc the number of occurrences of the substring
147 pattern[0..length-1] in the text indexed by index. It also allocates
148 occ (which must be freed by the caller) and writes the locations of
149 the numocc occurrences in occ, in arbitrary order. */
151 int locate (void *index, uchar *pattern, ulong length, ulong **occ,
154 /* Allocates snippet (which must be freed by the caller) and writes
155 the substring text[from..to] into it. Returns in snippet_length the
156 length of the text snippet actually extracted (that could be less
157 than to-from+1 if to is larger than the text size). */
159 int extract (void *index, ulong from, ulong to, uchar **snippet,
160 ulong *snippet_length);
162 /* Displays the text (snippet) surrounding any occurrence of the
163 substring pattern[0..length-1] within the text indexed by index.
164 The snippet must include numc characters before and after the
165 pattern occurrence, totalizing length+2*numc characters, or less if
166 the text boundaries are reached. Writes in numocc the number of
167 occurrences, and allocates the arrays snippet_text and
168 snippet_lengths (which must be freed by the caller). The first is a
169 character array of numocc*(length+2*numc) characters, with a new
170 snippet starting at every multiple of length+2*numc. The second
171 gives the real length of each of the numocc snippets. */
173 int display (void *index, uchar *pattern, ulong length, ulong numc,
174 ulong *numocc, uchar **snippet_text, ulong **snippet_lengths);
176 /* Obtains the length of the text indexed by index. */
178 int length (void *index, ulong *length);
180 /* Shows summary info of the index */
181 int printInfo(void *index);
183 /** *******************************************************************************************/
184 /** Building part of the index ****************************************************************/
/* build_WCSA takes the same argument list as build_index (text, length,
   build_options, index) and returns an int.
   NOTE(review): presumably this is the routine that builds the word-based
   index into *index, and the int is an error code -- confirm against the
   implementation. */
186 int build_WCSA (uchar *text, ulong length, char *build_options, void **index);
/* build_iCSA receives the build options and the index pointer itself
   (void *, not void **), and returns an int.
   NOTE(review): presumably it builds the integer-sequence CSA inside an
   index that already exists -- confirm against the implementation. */
187 int build_iCSA (char *build_options, void *index);
191 /** *******************************************************************************************/
192 /** Search part of the index ******************************************************************/
193 // Declarations of some PUBLIC function prototypes.
195 //loading/freeing the data structures into memory.
197 void loadStructs(twcsa *wcsa, char *basename);
198 twcsa *loadWCSA(char *filename);
200 //returns the source text from given [offsetIni, offsetFin] offsets.
201 //byte *displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin);
// Both variants take the [offsetIni, offsetFin] offsets and an output
// parameter "length"; they differ in how the text is handed back:
//  - displayFacadeMalloc returns a byte pointer.
//  - displayFacade takes an extra destination pointer (dstptr) and returns an int.
// NOTE(review): presumably the Malloc variant allocates the returned buffer
// (caller frees) while displayFacade writes into caller-provided memory at
// dstptr -- confirm against the implementation.
202 byte *displayFacadeMalloc (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length);
203 int displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length, byte *dstptr);
205 //locate all the occurrences of a word/phrase
206 int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint number);
208 //show text around the occurrences of a word.
209 int locateAllAndDisplay (twcsa *wcsa, uint *sePositions, uint number, int radix);
211 //recovers the source text by calling display (either only once or "len" times)
212 void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
213 void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
215 //***Searching for a TEXT pattern ...
217 //extracts the ids of the valid words of a "plain text".
218 void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *integerPattern, uint *sizeIntegers) ;
220 //counts the occurrences of a given text pattern.
221 int countTextOcurrences(twcsa *wcsa, byte *textPattern);
223 //returns the offsets (to the source text) where a given text pattern appears.
224 uint *locateTextOcurrences(twcsa *wcsa, byte *textPattern, int *numberOccurrences);
226 //shows a snippet with the text around the occurrences of a pattern.
227 int displayTextOcurrences(twcsa *wcsa, byte *textPattern, uint radixDisplay);
230 /** ***********************************************************************************
231 * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord
232 * ***********************************************************************************/
233 /** Writes in numocc the number of occurrences of the substring
234 pattern[0..length-1] in the text indexed by index. It also allocates
235 occ (which must be freed by the caller) and writes the locations of
236 the numocc occurrences in occ, in arbitrary order. These occurrences
237 refer to the offsets in TOH where the caller could start a display
238 operation. So locateWord implies synchronization using B.
239 ** Parameter kbefore makes locateWord return not the offset in TOH of the
240 searched word, but the offset in TOH of the word kbefore words before it. */
243 int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore);
245 /** Displays the text (snippet) surrounding any occurrence of the
246 substring pattern[0..length-1] within the text indexed by index.
247 The snippet must include numc characters before and after the
248 pattern occurrence, totalizing length+2*numc characters, or less if
249 the text boundaries are reached. Writes in numocc the number of
250 occurrences, and allocates the arrays snippet_text and
251 snippet_lengths (which must be freed by the caller). The first is a
252 character array of numocc*(length+2*numc) characters, with a new
253 snippet starting at every multiple of length+2*numc. The second
254 gives the real length of each of the numocc snippets. */
256 int displayWords (void *index, uchar *pattern, ulong length, ulong numc,
257 ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore);
260 /** Simulates the text-extraction process, but does not actually return anything at all.
261 Extracts up to 2K words, starting K=wordsbefore words before each occurrence of a pattern.
262 Fewer than 2K words can be extracted if more than numc characters have already been obtained.
263 Does nothing else: the text is not returned. */
265 int displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc);
268 /** Allocates text (which must be freed by the caller) and recovers
269 the substring of text starting from the "fromword"-th word up to the
270 "toWord"-th word. Returns in text the text, and in "text_lenght" the
271 length of the text actually extracted. Text is allocated.
272 Actually extracts SE[fromWord .. toWord) ... not the last word. */
274 int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text,
280 //recovers the source text by calling display (either only once or "len" times)
//NOTE(review): these two prototypes repeat, token for token, the declarations of
//recoverSourceText1/recoverSourceText2 that already appear earlier in this file
//(in the public search section). Repeating a compatible prototype is legal C,
//but the second copy is redundant and has to be kept in sync by hand.
281 void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
282 void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
285 // Definitions of PRIVATE functions
287 //Auxiliary functions
// NOTE(review): judging by the names only, these two report the size of the
// index structures on disk and in memory respectively; the unit of the
// returned uint is not visible here -- confirm against the implementation.
289 uint structsSizeDisk(twcsa *wcsa);
290 uint structsSizeMem(twcsa *wcsa);
// NOTE(review): presumably prints a shorter summary than printInfo -- confirm.
291 void printInfoReduced(twcsa *wcsa);
// Takes a base file name, an array v of uint and an element count n; returns an int.
// NOTE(review): presumably writes the n integers of v to a file derived from
// basename and returns a status code -- confirm.
292 int saveSEfile (char *basename, uint *v, uint n);
// Takes no arguments and returns a double.
// NOTE(review): presumably a timing helper (the <sys/resource.h> include is
// marked "only for getTime()"); the unit of the result is not visible here.
293 double getTime2 (void);