/* General interface for using the compressed index libraries */

/* Shorthand names for the unsigned integer types used throughout this
   interface. These are preprocessor macros (plain textual
   substitution), not typedefs. */
#define uchar unsigned char
#define uint unsigned int
#define ulong unsigned long
/* Error management */

/* Returns a string describing the error associated with error number
   e. The string must not be freed by the caller, and it will be
   overwritten by subsequent calls. */
char *error_index (int e);
22 /* Building the index */
24 /* Creates index from text[0..length-1]. Note that the index is an
25 opaque data type. Any build option must be passed in string
26 build_options, whose syntax depends on the index. The index must
27 always work with some default parameters if build_options is NULL.
28 The returned index is ready to be queried. */
30 int build_index (uchar *text, ulong length, char *build_options, void **index);
/* Saves index on disk by using single or multiple files, having
   proper extensions.
   NOTE(review): the int result is presumably an error number accepted
   by error_index -- confirm against the implementation. */
int save_index (void *index, char *filename);
/* Loads index from one or more file(s) named filename, possibly
   adding the proper extensions. The loaded index is handed back
   through the index parameter.
   NOTE(review): the int result is presumably an error number accepted
   by error_index -- confirm against the implementation. */
int load_index (char *filename, void **index);
/* Frees the memory occupied by index.
   NOTE(review): the int result is presumably an error number accepted
   by error_index -- confirm against the implementation. */
int free_index (void *index);
46 /* Gives the memory occupied by index in bytes. */
48 int index_size(void *index, ulong *size);
50 /* Querying the index */
52 /* Writes in numocc the number of occurrences of the substring
53 pattern[0..length-1] found in the text indexed by index. */
55 int count (void *index, uchar *pattern, ulong length, ulong *numocc);
57 /* Writes in numocc the number of occurrences of the substring
58 pattern[0..length-1] in the text indexed by index. It also allocates
59 occ (which must be freed by the caller) and writes the locations of
60 the numocc occurrences in occ, in arbitrary order. */
62 int locate (void *index, uchar *pattern, ulong length, ulong **occ,
65 /* Gives the length of the text indexed */
67 int get_length(void *index, ulong *length);
69 /* Accessing the indexed text */
71 /* Allocates snippet (which must be freed by the caller) and writes
72 the substring text[from..to] into it. Returns in snippet_length the
73 length of the text snippet actually extracted (that could be less
74 than to-from+1 if to is larger than the text size). */
76 int extract (void *index, ulong from, ulong to, uchar **snippet,
77 ulong *snippet_length);
79 /* Displays the text (snippet) surrounding any occurrence of the
80 substring pattern[0..length-1] within the text indexed by index.
81 The snippet must include numc characters before and after the
82 pattern occurrence, totalizing length+2*numc characters, or less if
83 the text boundaries are reached. Writes in numocc the number of
84 occurrences, and allocates the arrays snippet_text and
85 snippet_lengths (which must be freed by the caller). The first is a
86 character array of numocc*(length+2*numc) characters, with a new
87 snippet starting at every multiple of length+2*numc. The second
88 gives the real length of each of the numocc snippets. */
90 int display (void *index, uchar *pattern, ulong length, ulong numc,
91 ulong *numocc, uchar **snippet_text, ulong **snippet_lengths);
93 /* Obtains the length of the text indexed by index. */
95 int length (void *index, ulong *length);
/* Shows summary info of the index.
   NOTE(review): where the summary is written (e.g. stdout) and the
   meaning of the int result are not stated in this header -- confirm
   against the implementation. */
int printInfo(void *index);
103 /** ***********************************************************************************
104 * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord
105 * ***********************************************************************************/
106 /** Writes in numocc the number of occurrences of the substring
107 pattern[0..length-1] in the text indexed by index. It also allocates
108 occ (which must be freed by the caller) and writes the locations of
109 the numocc occurrences in occ, in arbitrary order. These occurrences
110 refer to the offsets in TOH where the caller could start a display
111 operation. So locateWord implies synchronization using B.
112 ** Parameter kbefore sets locateWord not to obtain the offset in TOH of the
113 searched word, but the offset in TOH of k-before words before.
116 int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore);
118 /** Displays the text (snippet) surrounding any occurrence of the
119 substring pattern[0..length-1] within the text indexed by index.
120 The snippet must include numc characters before and after the
121 pattern occurrence, totalizing length+2*numc characters, or less if
122 the text boundaries are reached. Writes in numocc the number of
123 occurrences, and allocates the arrays snippet_text and
124 snippet_lengths (which must be freed by the caller). The first is a
125 character array of numocc*(length+2*numc) characters, with a new
126 snippet starting at every multiple of length+2*numc. The second
127 gives the real length of each of the numocc snippets. */
129 int displayWords (void *index, uchar *pattern, ulong length, ulong numc,
130 ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore);
133 /** simulates extration of text process, but do not actually returns anything at all
134 Extracts upto <=2K words from K=wordsbefore words before each occurrence of a pattern.
135 Less than 2K words can be extracted if more than numc characters have been already obtained.
136 Do nothing else... do not return the text */
138 int displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc);
/** Allocates text (which must be freed by the caller) and recovers the
    substring of the indexed text starting at the "fromWord"-th word and
    ending before the "toWord"-th word. Returns the extracted text in
    text, and in "text_lenght" the length of the text actually
    extracted.
    Actually extracts SE[fromWord .. toWord), i.e. the last word is not
    included. */
148 int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text,