Added simple WCSA

[SXSI/TextCollection.git] / swcsa / buildFacade.h
diff --git a/swcsa/buildFacade.h b/swcsa/buildFacade.h

new file mode 100755 (executable)

index 0000000..05bd708
--- /dev/null
+++ b/swcsa/buildFacade.h
@@ -0,0 +1,295 @@
+/* only for getTime() */
+#include <sys/time.h>
+#include <sys/resource.h>
+
+
+#include "utils/valstring.h"
+#include "utils/defValues.h"
+#include "utils/MemoryManager.h"
+#include "utils/fileInfo.h"
+
+#include "utils/hash.h"
+
+#include "utils/huff.h"
+//#include "utils/errors.c"
+#include "utils/parameters.h"
+
+//from SEARCHER FACADE 
+#include "utils/huffDec.h"
+//#include "icsa/icsa.h"
+
+#include "intIndex/interfaceIntIndex.h"
+
+#ifndef uchar
+#define uchar unsigned char
+#endif
+#ifndef uint
+#define uint  unsigned int
+#endif
+#ifndef ulong
+#define ulong unsigned long
+#endif
+
+#define STRLEN(str,len) \
+{len=0; \
+ byte *ptr = str; \
+ while(*ptr++) len++; \
+}
+
+#define ADDLEN(str,len) \
+{byte *ptr = str; \
+       while(*ptr++) len++; \
+}
+
+/** Some data types used **ONLY**during construction process */
+       
+// Words, both the canonical words and their variants
+       typedef struct {
+               unsigned long slot;   // the position in the hash table of the canonical word
+               byte *word; //makes alphanumerical sorting easier...
+       } tposInHT;
+       
+       
+       typedef struct SzoneMem {        //a large block of memory to load a file into mem.
+               byte *zone;  //block of mem.
+               uint size;  //number of bytes            
+       } tZoneMem;
+
+       //  words dataStructure.        
+       typedef struct {
+               uint *words;
+               uint elemSize;  //the size (in bits) of each pointer.
+               tZoneMem wordsZoneMem; // a block of memory where the canonical words are loaded (from file).
+
+       } twords;
+
+
+/** Some data types used during searches */
+
+       
+
+
+       /**the WCSA index structures... */
+       typedef struct {
+                       
+               /**valid words */
+               twords wordsData;               /* vocabulary (words) of the index */
+
+               ulong n;                                /* number of different words. */                
+               uint seSize;                    /* number of words in the source text */ 
+
+               uint sourceTextSize;    /*the size of the source text in bytes*/
+
+               //ticsa *myicsa; //the WiCSA on SE words
+               void *myicsa; //the WiCSA on SE words
+       
+               //#ifndef CSA_ON
+               uint *se;
+               //#endif
+
+       }twcsa;
+
+
+/** ******************************************************************************
+    * Interface (from pizza chili) for using the WCSA index
+*********************************************************************************/
+
+/* Error management */
+
+        /* Returns a string describing the error associated with error number
+          e. The string must not be freed, and it will be overwritten with
+          subsequent calls. */
+
+char *error_index (int e);
+
+/* Building the index */
+
+        /* Creates index from text[0..length-1]. Note that the index is an 
+          opaque data type. Any build option must be passed in string 
+          build_options, whose syntax depends on the index. The index must 
+          always work with some default parameters if build_options is NULL. 
+          The returned index is ready to be queried. */
+
+int build_index (uchar *text, ulong length, char *build_options, void **index);
+
+        /*  Saves index on disk by using single or multiple files, having 
+          proper extensions. */
+
+int save_index (void *index, char *filename);
+
+        /*  Loads index from one or more file(s) named filename, possibly 
+          adding the proper extensions. */
+
+int load_index (char *filename, void **index);
+
+        /* Frees the memory occupied by index. */
+
+int free_index (void *index);
+
+        /* Gives the memory occupied by index in bytes. */
+
+int index_size(void *index, ulong *size);
+
+/* Querying the index */
+
+        /* Writes in numocc the number of occurrences of the substring 
+          pattern[0..length-1] found in the text indexed by index. */
+
+int count (void *index, uchar *pattern, ulong length, ulong *numocc);
+
+        /* Gives the length of the text indexed */
+
+int get_length(void *index, ulong *length);
+
+/* Accessing the indexed text  */
+
+        /* Writes in numocc the number of occurrences of the substring 
+          pattern[0..length-1] in the text indexed by index. It also allocates
+          occ (which must be freed by the caller) and writes the locations of 
+          the numocc occurrences in occ, in arbitrary order.  */
+
+int locate (void *index, uchar *pattern, ulong length, ulong **occ, 
+        ulong *numocc);
+
+        /*  Allocates snippet (which must be freed by the caller) and writes 
+          the substring text[from..to] into it. Returns in snippet_length the 
+          length of the text snippet actually extracted (that could be less 
+          than to-from+1 if to is larger than the text size). */
+
+int extract (void *index, ulong from, ulong to, uchar **snippet, 
+        ulong *snippet_length);
+
+        /* Displays the text (snippet) surrounding any occurrence of the 
+          substring pattern[0..length-1] within the text indexed by index. 
+          The snippet must include numc characters before and after the 
+          pattern occurrence, totalizing length+2*numc characters, or less if 
+          the text boundaries are reached. Writes in numocc the number of 
+          occurrences, and allocates the arrays snippet_text and 
+          snippet_lengths (which must be freed by the caller). The first is a 
+          character array of numocc*(length+2*numc) characters, with a new 
+          snippet starting at every multiple of length+2*numc. The second 
+          gives the real length of each of the numocc snippets. */
+
+int display (void *index, uchar *pattern, ulong length, ulong numc, 
+        ulong *numocc, uchar **snippet_text, ulong **snippet_lengths);
+
+        /*  Obtains the length of the text indexed by index. */
+
+int length (void *index, ulong *length);
+
+               /* Shows summary info of the index */
+int printInfo(void *index);
+
+/** *******************************************************************************************/
+/** Building part of the index ****************************************************************/
+
+int build_WCSA (uchar *text, ulong length, char *build_options, void **index);
+int build_iCSA (char  *build_options, void *index); 
+
+
+
+/** *******************************************************************************************/
+/** Search part of the index ******************************************************************/
+// Definitions of some PUBLIC function prototipes.
+
+               //loading/freeing the data structures into memory.
+       
+    void loadStructs(twcsa *wcsa, char *basename);     
+       twcsa *loadWCSA(char *filename);        
+       
+               //returns the source text from given [offsetIni, offsetFin] offsets.
+       //byte *displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin);
+       byte *displayFacadeMalloc (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length);  
+       int displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length, byte *dstptr);
+
+               //locate all the ocurrences of a word/phrase 
+       int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint number);
+       
+               //show text around the occurrences of a word. 
+       int locateAllAndDisplay (twcsa *wcsa, uint *sePositions, uint number, int radix);
+
+               //recovers the source text by calling display (either only once or "len" times)
+       void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
+       void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
+
+       //***Searching for a TEXT pattern ...
+       
+               //extracts the ids of the valid words of a "plain text".
+       void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *integerPattern, uint *sizeIntegers) ;
+               
+               //counts the occurrences of a given text pattern.
+       int countTextOcurrences(twcsa *wcsa, byte *textPattern);
+       
+               //returns the offsets (to the source text) where of a given text pattern appears.       
+       uint *locateTextOcurrences(twcsa *wcsa, byte *textPattern, int *numberOccurrences);
+       
+               //shows a snippet with the text around the ocurrences of a pattern.     
+       int displayTextOcurrences(twcsa *wcsa, byte *textPattern, uint radixDisplay);
+
+
+/** ***********************************************************************************
+  * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord
+  * ***********************************************************************************/  
+       /** Writes in numocc the number of occurrences of the substring 
+         pattern[0..length-1] in the text indexed by index. It also allocates
+         occ (which must be freed by the caller) and writes the locations of 
+         the numocc occurrences in occ, in arbitrary order. These occurrences
+         refer to the offsets in TOH where the caller could start a display
+         operation. So locateWord implies synchronization using B.
+         ** Parameter kbefore sets locateWord not to obtain the offset in TOH of the
+            searched word, but the offset in TOH of k-before words before.       
+       */        
+         
+int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore);
+
+  /** Displays the text (snippet) surrounding any occurrence of the 
+    substring pattern[0..length-1] within the text indexed by index. 
+    The snippet must include numc characters before and after the 
+    pattern occurrence, totalizing length+2*numc characters, or less if 
+    the text boundaries are reached. Writes in numocc the number of 
+    occurrences, and allocates the arrays snippet_text and 
+    snippet_lengths (which must be freed by the caller). The first is a 
+    character array of numocc*(length+2*numc) characters, with a new 
+    snippet starting at every multiple of length+2*numc. The second 
+    gives the real length of each of the numocc snippets. */
+
+ int displayWords (void *index, uchar *pattern, ulong length, ulong numc, 
+         ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore);
+
+
+/** simulates extration of text process, but do not actually returns anything at all 
+   Extracts upto <=2K words from K=wordsbefore words before each occurrence of a pattern.
+   Less than 2K words can be extracted if more than numc characters have been already obtained.
+   Do nothing else... do not return the text */
+   
+       int  displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc);
+
+
+/**  Allocates text (which must be freed by the caller) and recovers the
+  the substring of text starting from the "fromword"-th word up to the
+  "toWord"-th words. Returns in text the text, and in "text_lenght" the 
+  length of the text  actually extracted. Text is allocated. 
+  Actually extracts SE[fromWord .. toWord) ... not the last word.    */
+
+int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text, 
+        ulong *text_length);
+
+
+
+
+               //recovers the source text by calling display (either only once or "len" times)
+       void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
+       void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
+
+
+// Definitions of PRIVATE functions
+
+       //Auxiliary functions
+
+       uint structsSizeDisk(twcsa *wcsa);
+       uint structsSizeMem(twcsa *wcsa);
+       void printInfoReduced(twcsa *wcsa); 
+       int saveSEfile (char *basename, uint *v, uint n);
+       double getTime2 (void); 
+
+