X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=swcsa%2FbuildFacade.h;fp=swcsa%2FbuildFacade.h;h=05bd7082d4738968220d5ce3c80e8b4a811926f3;hb=102e33b134075765e6d4e0c38bc1307568ce5602;hp=0000000000000000000000000000000000000000;hpb=ed61d2042a7ad7dd83bae32d7c31e69504dafa80;p=SXSI%2FTextCollection.git diff --git a/swcsa/buildFacade.h b/swcsa/buildFacade.h new file mode 100755 index 0000000..05bd708 --- /dev/null +++ b/swcsa/buildFacade.h @@ -0,0 +1,295 @@ +/* only for getTime() */ +#include +#include + + +#include "utils/valstring.h" +#include "utils/defValues.h" +#include "utils/MemoryManager.h" +#include "utils/fileInfo.h" + +#include "utils/hash.h" + +#include "utils/huff.h" +//#include "utils/errors.c" +#include "utils/parameters.h" + +//from SEARCHER FACADE +#include "utils/huffDec.h" +//#include "icsa/icsa.h" + +#include "intIndex/interfaceIntIndex.h" + +#ifndef uchar +#define uchar unsigned char +#endif +#ifndef uint +#define uint unsigned int +#endif +#ifndef ulong +#define ulong unsigned long +#endif + +#define STRLEN(str,len) \ +{len=0; \ + byte *ptr = str; \ + while(*ptr++) len++; \ +} + +#define ADDLEN(str,len) \ +{byte *ptr = str; \ + while(*ptr++) len++; \ +} + +/** Some data types used **ONLY**during construction process */ + +// Words, both the canonical words and their variants + typedef struct { + unsigned long slot; // the position in the hash table of the canonical word + byte *word; //makes alphanumerical sorting easier... + } tposInHT; + + + typedef struct SzoneMem { //a large block of memory to load a file into mem. + byte *zone; //block of mem. + uint size; //number of bytes + } tZoneMem; + + // words dataStructure. + typedef struct { + uint *words; + uint elemSize; //the size (in bits) of each pointer. + tZoneMem wordsZoneMem; // a block of memory where the canonical words are loaded (from file). + + } twords; + + +/** Some data types used during searches */ + + + + + /**the WCSA index structures... */ + typedef struct { + + /**valid words */ + twords wordsData; /* vocabulary (words) of the index */ + + ulong n; /* number of different words. */ + uint seSize; /* number of words in the source text */ + + uint sourceTextSize; /*the size of the source text in bytes*/ + + //ticsa *myicsa; //the WiCSA on SE words + void *myicsa; //the WiCSA on SE words + + //#ifndef CSA_ON + uint *se; + //#endif + + }twcsa; + + +/** ****************************************************************************** + * Interface (from pizza chili) for using the WCSA index +*********************************************************************************/ + +/* Error management */ + + /* Returns a string describing the error associated with error number + e. The string must not be freed, and it will be overwritten with + subsequent calls. */ + +char *error_index (int e); + +/* Building the index */ + + /* Creates index from text[0..length-1]. Note that the index is an + opaque data type. Any build option must be passed in string + build_options, whose syntax depends on the index. The index must + always work with some default parameters if build_options is NULL. + The returned index is ready to be queried. */ + +int build_index (uchar *text, ulong length, char *build_options, void **index); + + /* Saves index on disk by using single or multiple files, having + proper extensions. */ + +int save_index (void *index, char *filename); + + /* Loads index from one or more file(s) named filename, possibly + adding the proper extensions. */ + +int load_index (char *filename, void **index); + + /* Frees the memory occupied by index. */ + +int free_index (void *index); + + /* Gives the memory occupied by index in bytes. */ + +int index_size(void *index, ulong *size); + +/* Querying the index */ + + /* Writes in numocc the number of occurrences of the substring + pattern[0..length-1] found in the text indexed by index. */ + +int count (void *index, uchar *pattern, ulong length, ulong *numocc); + + /* Gives the length of the text indexed */ + +int get_length(void *index, ulong *length); + +/* Accessing the indexed text */ + + /* Writes in numocc the number of occurrences of the substring + pattern[0..length-1] in the text indexed by index. It also allocates + occ (which must be freed by the caller) and writes the locations of + the numocc occurrences in occ, in arbitrary order. */ + +int locate (void *index, uchar *pattern, ulong length, ulong **occ, + ulong *numocc); + + /* Allocates snippet (which must be freed by the caller) and writes + the substring text[from..to] into it. Returns in snippet_length the + length of the text snippet actually extracted (that could be less + than to-from+1 if to is larger than the text size). */ + +int extract (void *index, ulong from, ulong to, uchar **snippet, + ulong *snippet_length); + + /* Displays the text (snippet) surrounding any occurrence of the + substring pattern[0..length-1] within the text indexed by index. + The snippet must include numc characters before and after the + pattern occurrence, totalizing length+2*numc characters, or less if + the text boundaries are reached. Writes in numocc the number of + occurrences, and allocates the arrays snippet_text and + snippet_lengths (which must be freed by the caller). The first is a + character array of numocc*(length+2*numc) characters, with a new + snippet starting at every multiple of length+2*numc. The second + gives the real length of each of the numocc snippets. */ + +int display (void *index, uchar *pattern, ulong length, ulong numc, + ulong *numocc, uchar **snippet_text, ulong **snippet_lengths); + + /* Obtains the length of the text indexed by index. */ + +int length (void *index, ulong *length); + + /* Shows summary info of the index */ +int printInfo(void *index); + +/** *******************************************************************************************/ +/** Building part of the index ****************************************************************/ + +int build_WCSA (uchar *text, ulong length, char *build_options, void **index); +int build_iCSA (char *build_options, void *index); + + + +/** *******************************************************************************************/ +/** Search part of the index ******************************************************************/ +// Definitions of some PUBLIC function prototipes. + + //loading/freeing the data structures into memory. + + void loadStructs(twcsa *wcsa, char *basename); + twcsa *loadWCSA(char *filename); + + //returns the source text from given [offsetIni, offsetFin] offsets. + //byte *displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin); + byte *displayFacadeMalloc (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length); + int displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length, byte *dstptr); + + //locate all the ocurrences of a word/phrase + int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint number); + + //show text around the occurrences of a word. + int locateAllAndDisplay (twcsa *wcsa, uint *sePositions, uint number, int radix); + + //recovers the source text by calling display (either only once or "len" times) + void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize); + void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize); + + //***Searching for a TEXT pattern ... + + //extracts the ids of the valid words of a "plain text". + void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *integerPattern, uint *sizeIntegers) ; + + //counts the occurrences of a given text pattern. + int countTextOcurrences(twcsa *wcsa, byte *textPattern); + + //returns the offsets (to the source text) where of a given text pattern appears. + uint *locateTextOcurrences(twcsa *wcsa, byte *textPattern, int *numberOccurrences); + + //shows a snippet with the text around the ocurrences of a pattern. + int displayTextOcurrences(twcsa *wcsa, byte *textPattern, uint radixDisplay); + + +/** *********************************************************************************** + * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord + * ***********************************************************************************/ + /** Writes in numocc the number of occurrences of the substring + pattern[0..length-1] in the text indexed by index. It also allocates + occ (which must be freed by the caller) and writes the locations of + the numocc occurrences in occ, in arbitrary order. These occurrences + refer to the offsets in TOH where the caller could start a display + operation. So locateWord implies synchronization using B. + ** Parameter kbefore sets locateWord not to obtain the offset in TOH of the + searched word, but the offset in TOH of k-before words before. + */ + +int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore); + + /** Displays the text (snippet) surrounding any occurrence of the + substring pattern[0..length-1] within the text indexed by index. + The snippet must include numc characters before and after the + pattern occurrence, totalizing length+2*numc characters, or less if + the text boundaries are reached. Writes in numocc the number of + occurrences, and allocates the arrays snippet_text and + snippet_lengths (which must be freed by the caller). The first is a + character array of numocc*(length+2*numc) characters, with a new + snippet starting at every multiple of length+2*numc. The second + gives the real length of each of the numocc snippets. */ + + int displayWords (void *index, uchar *pattern, ulong length, ulong numc, + ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore); + + +/** simulates extration of text process, but do not actually returns anything at all + Extracts upto <=2K words from K=wordsbefore words before each occurrence of a pattern. + Less than 2K words can be extracted if more than numc characters have been already obtained. + Do nothing else... do not return the text */ + + int displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc); + + +/** Allocates text (which must be freed by the caller) and recovers the + the substring of text starting from the "fromword"-th word up to the + "toWord"-th words. Returns in text the text, and in "text_lenght" the + length of the text actually extracted. Text is allocated. + Actually extracts SE[fromWord .. toWord) ... not the last word. */ + +int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text, + ulong *text_length); + + + + + //recovers the source text by calling display (either only once or "len" times) + void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize); + void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize); + + +// Definitions of PRIVATE functions + + //Auxiliary functions + + uint structsSizeDisk(twcsa *wcsa); + uint structsSizeMem(twcsa *wcsa); + void printInfoReduced(twcsa *wcsa); + int saveSEfile (char *basename, uint *v, uint n); + double getTime2 (void); + +