X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=swcsa%2FbuildFacade.h;fp=swcsa%2FbuildFacade.h;h=05bd7082d4738968220d5ce3c80e8b4a811926f3;hb=102e33b134075765e6d4e0c38bc1307568ce5602;hp=0000000000000000000000000000000000000000;hpb=ed61d2042a7ad7dd83bae32d7c31e69504dafa80;p=SXSI%2FTextCollection.git

diff --git a/swcsa/buildFacade.h b/swcsa/buildFacade.h
new file mode 100755
index 0000000..05bd708
--- /dev/null
+++ b/swcsa/buildFacade.h
@@ -0,0 +1,295 @@
+/* only for getTime() */
+#include <sys/time.h>
+#include <sys/resource.h>
+
+
+#include "utils/valstring.h"
+#include "utils/defValues.h"
+#include "utils/MemoryManager.h"
+#include "utils/fileInfo.h"
+
+#include "utils/hash.h"
+
+#include "utils/huff.h"
+//#include "utils/errors.c"
+#include "utils/parameters.h"
+
+//from SEARCHER FACADE 
+#include "utils/huffDec.h"
+//#include "icsa/icsa.h"
+
+#include "intIndex/interfaceIntIndex.h"
+
+#ifndef uchar
+#define uchar unsigned char
+#endif
+#ifndef uint
+#define uint  unsigned int
+#endif
+#ifndef ulong
+#define ulong unsigned long
+#endif
+
+#define STRLEN(str,len) /* Sets len to the count of bytes preceding the first 0 byte of str. Expands to a { } block (not an expression); len is evaluated repeatedly. */ \
+{(len)=0; /* parenthesized so that an argument such as *n updates the value instead of being re-parsed as *n++ */ \
+ byte *strlen_ptr_ = (str); /* private cursor name: cannot capture a caller expression that mentions "ptr" */ \
+ while(*strlen_ptr_++) (len)++; \
+}
+
+#define ADDLEN(str,len) /* Adds to len the count of bytes preceding the first 0 byte of str (len is NOT reset first). Expands to a { } block; len is evaluated repeatedly. */ \
+{byte *addlen_ptr_ = (str); /* private cursor name: cannot capture a caller expression that mentions "ptr" */ \
+	while(*addlen_ptr_++) (len)++; /* (len) parenthesized so that an argument such as *n updates the value, not the pointer */ \
+}
+
+/** Some data types used **ONLY** during the construction process */
+	
+// Words, both the canonical words and their variants
+	typedef struct { // one word entry: a hash-table slot together with the word it belongs to
+		unsigned long slot;   // the position in the hash table of the canonical word
+		byte *word; // the word itself; kept beside the slot because it makes alphanumerical sorting easier
+	} tposInHT;
+	
+	
+	typedef struct SzoneMem {        // a large block of memory into which a file is loaded
+		byte *zone;  // the block of memory
+		uint size;  // size of the block, in bytes
+	} tZoneMem;
+
+	//  words dataStructure.	
+	typedef struct { // the vocabulary: the loaded words plus the pointers that address them
+		uint *words; // NOTE(review): presumably the pointers (elemSize bits each) into wordsZoneMem -- confirm
+		uint elemSize;  // the size (in bits) of each pointer
+		tZoneMem wordsZoneMem; // a block of memory where the canonical words are loaded (from file)
+
+	} twords;
+
+
+/** Some data types used during searches */
+
+	
+
+
+	/**the WCSA index structures... */
+	typedef struct { // the in-memory WCSA index
+			
+		/** valid words */
+		twords wordsData;		/* vocabulary (words) of the index */
+
+		ulong n;  				/* number of different words */		
+		uint seSize;  			/* number of words in the source text */ 
+
+		uint sourceTextSize; 	/* size of the source text, in bytes */
+
+		//ticsa *myicsa; //the WiCSA on SE words
+		void *myicsa; // the WiCSA on SE words, held as an untyped pointer (typed ticsa* in the disabled line above)
+	
+		//#ifndef CSA_ON
+		uint *se; // NOTE(review): presumably the SE integer sequence (seSize entries) that myicsa indexes -- confirm
+		//#endif
+
+	}twcsa;
+
+
+/** ******************************************************************************
+    * Interface (from pizza chili) for using the WCSA index
+*********************************************************************************/
+
+/* Error management */
+
+        /* Returns a string describing the error associated with error number
+          e. The string must not be freed by the caller, and it is overwritten
+          by subsequent calls. */
+
+char *error_index (int e);
+
+/* Building the index. NOTE(review): the int results below are presumably error numbers usable with error_index() -- confirm */
+
+        /* Creates index from text[0..length-1]. Note that the index is an
+          opaque data type. Any build option must be passed in the string
+          build_options, whose syntax depends on the index. The index must
+          always work with some default parameters if build_options is NULL.
+          The index returned through *index is ready to be queried. */
+
+int build_index (uchar *text, ulong length, char *build_options, void **index);
+
+        /* Saves index on disk, using a single file or multiple files that
+          carry the proper extensions. */
+
+int save_index (void *index, char *filename);
+
+        /* Loads index from one or more file(s) named filename, possibly
+          adding the proper extensions. */
+
+int load_index (char *filename, void **index);
+
+        /* Frees the memory occupied by index. */
+
+int free_index (void *index);
+
+        /* Gives, through size, the memory occupied by index in bytes. */
+
+int index_size(void *index, ulong *size);
+
+/* Querying the index */
+
+        /* Writes in numocc the number of occurrences of the substring
+          pattern[0..length-1] found in the text indexed by index. */
+
+int count (void *index, uchar *pattern, ulong length, ulong *numocc);
+
+        /* Gives, through length, the length of the indexed text. */
+
+int get_length(void *index, ulong *length);
+
+/* Accessing the indexed text */
+
+        /* Writes in numocc the number of occurrences of the substring
+          pattern[0..length-1] in the text indexed by index. It also allocates
+          occ (which must be freed by the caller) and writes the locations of
+          the numocc occurrences in occ, in arbitrary order. */
+
+int locate (void *index, uchar *pattern, ulong length, ulong **occ, 
+        ulong *numocc);
+
+        /* Allocates snippet (which must be freed by the caller) and writes
+          the substring text[from..to] into it. Returns in snippet_length the
+          length of the text snippet actually extracted (which can be less
+          than to-from+1 if to is larger than the text size). */
+
+int extract (void *index, ulong from, ulong to, uchar **snippet, 
+        ulong *snippet_length);
+
+        /* Displays the text (snippet) surrounding any occurrence of the
+          substring pattern[0..length-1] within the text indexed by index.
+          The snippet must include numc characters before and after the
+          pattern occurrence, totaling length+2*numc characters, or fewer if
+          the text boundaries are reached. Writes in numocc the number of
+          occurrences, and allocates the arrays snippet_text and
+          snippet_lengths (which must be freed by the caller). The first is a
+          character array of numocc*(length+2*numc) characters, with a new
+          snippet starting at every multiple of length+2*numc. The second
+          gives the real length of each of the numocc snippets. */
+
+int display (void *index, uchar *pattern, ulong length, ulong numc, 
+        ulong *numocc, uchar **snippet_text, ulong **snippet_lengths);
+
+        /* Obtains the length of the text indexed by index. NOTE(review): documented the same way as get_length() -- confirm whether both are needed. */
+
+int length (void *index, ulong *length);
+
+		/* Shows summary info of the index */
+int printInfo(void *index);
+
+/** *******************************************************************************************/
+/** Building part of the index ****************************************************************/
+
+int build_WCSA (uchar *text, ulong length, char *build_options, void **index); /* same parameter list as build_index(); NOTE(review): presumably the WCSA construction step behind it -- confirm */
+int build_iCSA (char  *build_options, void *index); /* NOTE(review): presumably builds the iCSA (twcsa.myicsa) of an already created index -- confirm */
+
+
+
+/** *******************************************************************************************/
+/** Search part of the index ******************************************************************/
+// Declarations of some PUBLIC function prototypes.
+
+		//loading/freeing the data structures into memory.
+	
+    void loadStructs(twcsa *wcsa, char *basename);	
+	twcsa *loadWCSA(char *filename);	
+	
+		//return the source text between the offsets [offsetIni, offsetFin]; one variant returns the buffer, the other takes dstptr (presumably a caller-supplied destination -- confirm).
+	//byte *displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin);
+ 	byte *displayFacadeMalloc (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length);	 
+	int displayFacade (twcsa *wcsa, uint offsetIni, uint offsetFin, ulong *length, byte *dstptr);
+
+		//locates all the occurrences of a word/phrase
+	int locateFacade (twcsa *wcsa, uint *sourceTextPositions,uint *sePositions, uint number);
+	
+		//shows the text around the occurrences of a word.
+	int locateAllAndDisplay (twcsa *wcsa, uint *sePositions, uint number, int radix);
+
+		//recover the source text by calling display (either only once or "len" times)
+	void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
+	void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
+
+	//***Searching for a TEXT pattern ...
+	
+		//extracts the ids of the valid words of a "plain text".
+	void parseTextIntoIntegers(twcsa *wcsa, byte *textPattern, uint patLen, uint *integerPattern, uint *sizeIntegers) ;
+		
+ 		//counts the occurrences of a given text pattern.
+	int countTextOcurrences(twcsa *wcsa, byte *textPattern);
+	
+ 		//returns the offsets (into the source text) where a given text pattern appears.
+	uint *locateTextOcurrences(twcsa *wcsa, byte *textPattern, int *numberOccurrences);
+	
+ 		//shows a snippet with the text around the occurrences of a pattern.
+	int displayTextOcurrences(twcsa *wcsa, byte *textPattern, uint radixDisplay);
+
+
+/** ***********************************************************************************
+  * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord
+  * ***********************************************************************************/  
+	/** Writes in numocc the number of occurrences of the substring
+	  pattern[0..length-1] in the text indexed by index. It also allocates
+	  occ (which must be freed by the caller) and writes the locations of
+	  the numocc occurrences in occ, in arbitrary order. These occurrences
+	  refer to the offsets in TOH where the caller could start a display
+	  operation. So locateWord implies synchronization using B.
+	  ** Parameter kbefore makes locateWord return not the offset in TOH of the
+	     searched word, but the offset in TOH of the word kbefore words before it.
+	*/	  
+	  
+int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore);
+
+  /** Displays the text (snippet) surrounding any occurrence of the
+    substring pattern[0..length-1] within the text indexed by index.
+    The snippet must include numc characters before and after the
+    pattern occurrence, totaling length+2*numc characters, or fewer if
+    the text boundaries are reached. Writes in numocc the number of
+    occurrences, and allocates the arrays snippet_text and
+    snippet_lengths (which must be freed by the caller). The first is a
+    character array of numocc*(length+2*numc) characters, with a new
+    snippet starting at every multiple of length+2*numc. The second
+    gives the real length of each of the numocc snippets. */
+
+ int displayWords (void *index, uchar *pattern, ulong length, ulong numc, 
+         ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore);
+
+
+/** Simulates the text extraction process, but does not actually return anything at all.
+   Extracts up to 2K words, starting K=wordsbefore words before each occurrence of a pattern.
+   Fewer than 2K words can be extracted if more than numc characters have already been obtained.
+   Does nothing else... does not return the text. */
+   
+	int  displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc);
+
+
+/**  Allocates text (which must be freed by the caller) and recovers
+  the substring of the text starting at the "fromWord"-th word up to the
+  "toWord"-th word. Returns in text the text, and in "text_length" the
+  length of the text actually extracted. Text is allocated.
+  Actually extracts SE[fromWord .. toWord) ... not the last word.    */
+
+int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text, 
+        ulong *text_length);
+
+
+
+
+		//recover the source text by calling display (either only once or "len" times); these two lines repeat the identical declarations made earlier in this header.
+	void recoverSourceText1(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
+	void recoverSourceText2(twcsa *wcsa, char *basename, char *ext, uint sourceTextSize);
+
+
+// Declarations of PRIVATE functions (NOTE(review): declared here without static, so they are visible to every includer -- confirm this is intended)
+
+	//Auxiliary functions
+
+	uint structsSizeDisk(twcsa *wcsa); // NOTE(review): presumably the size of the index structures on disk -- confirm units
+	uint structsSizeMem(twcsa *wcsa); // NOTE(review): presumably the size of the index structures in memory -- confirm units
+	void printInfoReduced(twcsa *wcsa); 
+	int saveSEfile (char *basename, uint *v, uint n); // NOTE(review): presumably writes the n integers of v to a file derived from basename -- confirm
+	double getTime2 (void); 
+
+