From: nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
Date: Wed, 27 Oct 2010 12:59:37 +0000 (+0000)
Subject: Updated construction of SWCSA
X-Git-Url: http://git.nguyen.vg/gitweb/?p=SXSI%2FTextCollection.git;a=commitdiff_plain;h=443151511a86083b21c1c06eb610f86b3aed35be

Updated construction of SWCSA

git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@920 3cdefd35-fc62-479d-8e8d-bae585ffb9ca
---

diff --git a/swcsa/Makefile b/swcsa/Makefile
index 1d731af..5719be6 100644
--- a/swcsa/Makefile
+++ b/swcsa/Makefile
@@ -2,7 +2,11 @@ SRCDIRUTILS = utils
 SRCDIRCSA   = intIndex
 CC          = g++
 
-# If you have trouble with make, e.g.:
+export CFLAGS  = -O9 -D_FORTIFY_SOURCE=0
+#export CFLAGS  = -O9 -m32 -L. -g -D_FORTIFY_SOURCE=0
+
+# Original settings:
+# If you have trouble with -m32, e.g.:
 #    /usr/bin/ld: skipping incompatible /usr/lib/gcc/x86_64-linux-gnu/4.4.3/libstdc++.a when searching for -lstdc++
 #    ...
 # try adding
@@ -10,7 +14,7 @@ CC          = g++
 #
 # The filename libstdc++.so.6.0.13 is probably different
 # but any from /usr/lib32 is fine.
-export CFLAGS  = -O9 -m32 -L. -D_FORTIFY_SOURCE=0
+#export CFLAGS  = -O9 -m32 -L. -D_FORTIFY_SOURCE=0
 #export CFLAGS      = -O0 -m32 -pg
 #export CFLAGS      = -g -m32 -O0
 
diff --git a/swcsa/buildFacade.c b/swcsa/buildFacade.c
index 768e943..3647e99 100755
--- a/swcsa/buildFacade.c
+++ b/swcsa/buildFacade.c
@@ -482,7 +482,7 @@ int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text,
     uint avgWordLen =7;
 
 	uint i, j;//, tmplen;
-	uint prevValid;
+	uint prevValid = 0;
 	byte *src, *dst, *buff;
 	uint tmplen =0;
 
@@ -617,62 +617,79 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
 	//-----------------------------------------------------------------	
 	//1st pass (processing the file)
 	{ 
-		byte *pbeg,*pend,*wordstart,*aWord;
-		register ulong size;
-		register uint i;
-
-		pbeg = inputBuffer;
-		pend = inputBuffer+bytesFileReal;
-				
-		while (pbeg <pend) {  
-			
-			//parsing either a word or separator.			
-			size=0;
-			wordstart = pbeg;
-			if (_Valid[*pbeg]) {   //alphanumerical data
-				while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
-		    }		    
-			else {
-				if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word					
-					while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
-				}
-				else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
-					pbeg++;
-					if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
-					else {
-						if (_Valid [*pbeg] ) {
-							wordstart = pbeg;   //So skipping 1 blank character
-							while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
-						}
-						else {   // a "separator word" ...
-							size++; //the prev BLANK...
-							while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
-						}//else {  // a "separator word"
-					}//else ... not a unique BLANK AT THE END.
-				}//else ... starting by a BLANK... 
-			}
-
-			//The parsed word/separator is  is "wordstart", and its length is "size"...
-			aWord=wordstart;				
-
-			//Processement done for each word word			
-			i = inHashTable(hash,aWord, size, &addrInTH );				
-			if (!i){
-				insertElement (hash,aWord, size, &addrInTH);
-				posInHT[zeroNode].slot=addrInTH;
-				posInHT[zeroNode].word=hash->hash[addrInTH].word;
-				hash->hash[addrInTH].posInVoc = zeroNode;
-				zeroNode++;
-				totallenWords += size +1;			// +1 due to the '\0' char...		
-				//fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size);				
-			}							
-			seSize ++;
-		}//while pbeg<pend
-
-		fprintf(stderr,"\n1st pass ends: TOTAL distinct words: %lu totalWords = %u",zeroNode,seSize);
-
-	}//1st pass ends
-
+            byte *pbeg,*pend,*wordstart,*aWord;
+            register ulong size;
+            register uint i;
+            
+            pbeg = inputBuffer;
+            pend = inputBuffer+bytesFileReal;
+            
+            while (pbeg <pend) 
+            {
+                if (*pbeg == 0)
+                {
+                    fprintf(stderr, "buildFacade.c: assert failed, *pbeg == 0\n");
+                    exit(1);
+                }
+
+                //parsing either a word or separator.			
+                size=0;
+                wordstart = pbeg;
+                if (_Valid[*pbeg]) {   //alphanumerical data
+                    while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+                }		    
+                else
+                {
+                    if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word					
+                        while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
+                    }
+                    else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
+                        pbeg++;
+                        if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
+                        else {
+                            if (_Valid [*pbeg] ) {
+                                wordstart = pbeg;   //So skipping 1 blank character
+                                while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+                            }
+                            else {   // a "separator word" ...
+                                size++; //the prev BLANK...
+                                while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
+                            }//else {  // a "separator word"
+                        }//else ... not a unique BLANK AT THE END.
+                    }//else ... starting by a BLANK... 
+                }
+                
+                if (pbeg < pend && *pbeg == 0) 
+                    pbeg ++;    // Skip the 0-bytes
+
+                if (size == 0)
+                {
+                    fprintf(stderr, "buildFacade.c: assert failed, size == 0\n");
+                    exit(1);
+                }
+
+                //The parsed word/separator is  is "wordstart", and its length is "size"...
+                aWord=wordstart;				
+                
+                //Processement done for each word word			
+                i = inHashTable(hash,aWord, size, &addrInTH );				
+                if (!i){
+                    insertElement (hash,aWord, size, &addrInTH);
+                    posInHT[zeroNode].slot=addrInTH;
+                    posInHT[zeroNode].word=hash->hash[addrInTH].word;
+                    hash->hash[addrInTH].posInVoc = zeroNode;
+                    zeroNode++;
+                    totallenWords += size +1;			// +1 due to the '\0' char...		
+                   //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size);
+                }							
+                seSize ++;
+                
+            }//while pbeg<pend
+            
+            fprintf(stderr,"\n1st pass ends: TOTAL distinct words: %lu totalWords = %u",zeroNode,seSize);
+            
+        }//1st pass ends
+        
 
 	// **********************************************************************************
 	// END OF 1ST PASS
@@ -702,56 +719,70 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
 	printf("\nSTARTING THE SECOND PASS... ");
 	//2nd pass (processing the file)
 	{ 
-		byte *pbeg,*pend,*wordstart,*aWord;
-		register ulong size;
-		register uint i;
-		register ulong countValidWords = 0;
+            byte *pbeg,*pend,*wordstart,*aWord;
+            register ulong size;
+            register uint i;
+            register ulong countValidWords = 0;
 
 
-		pbeg = inputBuffer;
-		pend = inputBuffer+bytesFileReal;
+            pbeg = inputBuffer;
+            pend = inputBuffer+bytesFileReal;
 				
-		while (pbeg <pend) {  
+            while (pbeg <pend) {  
+                if (*pbeg == 0)
+                {
+                    fprintf(stderr, "buildFacade.c 2nd pass: assert failed, *pbeg == 0\n");
+                    exit(1);
+                }
 			
-			//parsing either a word or separator.			
-			size=0;
-			wordstart = pbeg;
-			if (_Valid[*pbeg]) {   //alphanumerical data
-				while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
-		  }		    
-			else {
-				if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word					
-					while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
-				}
-				else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
-					pbeg++;
-					if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
-					else {
-						if (_Valid [*pbeg] ) {
-							wordstart = pbeg;   //So skipping 1 blank character
-							while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
-						}
-						else {   // a "separator word" ...
-							size++; //the prev BLANK...
-							while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] )) {size++;pbeg++;}
-						}//else {  // a "separator word"
-					}//else ... not a unique BLANK AT THE END.
-				}//else ... starting by a BLANK... 
-			}
-
-			//The parsed word/separator is  is "wordstart", and its length is "size"...
-			aWord=wordstart;					
-
-			//Processement done for each word word			
-			i = inHashTable(hash,aWord, size, &addrInTH );				
-
-			SE[countValidWords]=hash->hash[addrInTH].posInVoc+1;  // !!!!
-			countValidWords++;		
-
-		}// while pbeg<pend
+                //parsing either a word or separator.			
+                size=0;
+                wordstart = pbeg;
+                if (_Valid[*pbeg]) {   //alphanumerical data
+                    while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+                }		    
+                else {
+                    if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word					
+                        while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
+                    }
+                    else {  //a  SPACE comes, so we have to test if next character is alphanumerical or not
+                        pbeg++;
+                        if (pbeg >= pend) {size++;}  // a unique BLANK at the end of the file.
+                        else {
+                            if (_Valid [*pbeg] ) {
+                                wordstart = pbeg;   //So skipping 1 blank character
+                                while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+                            }
+                            else {   // a "separator word" ...
+                                size++; //the prev BLANK...
+                                while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) &&  (!_Valid[*pbeg]  && *pbeg != 0)) {size++;pbeg++;}
+                            }//else {  // a "separator word"
+                        }//else ... not a unique BLANK AT THE END.
+                    }//else ... starting by a BLANK... 
+                }
+                    
+                if (pbeg < pend && *pbeg == 0) 
+                    pbeg ++;    // Skip the 0-bytes
+
+                if (size == 0)
+                {
+                    fprintf(stderr, "buildFacade.c 2nd pass: assert failed, size == 0\n");
+                    exit(1);
+                }
+
+                //The parsed word/separator is  is "wordstart", and its length is "size"...
+                aWord=wordstart;					
+                    
+                //Processement done for each word word			
+                i = inHashTable(hash,aWord, size, &addrInTH );				
+                    
+                SE[countValidWords]=hash->hash[addrInTH].posInVoc+1;  // !!!!
+                countValidWords++;		
+                    
+            }// while pbeg<pend
 		
-		SE[countValidWords] = 0;
-		fprintf(stderr,"\n2nd pass ends: TOTAL distinct words: %lu totalWords = %lu",zeroNode,countValidWords);
+            SE[countValidWords] = 0;
+            fprintf(stderr,"\n2nd pass ends: TOTAL distinct words: %lu totalWords = %lu",zeroNode,countValidWords);
 			
 	}//2nd pass ends
 	
@@ -760,7 +791,7 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) {
 	// **********************************************************************************
 	
 	//freeing the source text (it is no longer needed).
-	free(inputBuffer); //the text	
+	delete [] inputBuffer; //the text	
 
 	/** Now Setting the data of the index **/
 	wcsa->n = zeroNode;
@@ -1369,7 +1400,7 @@ int printInfo(void *index) {
 		printf("\n Summary of Presentation layer:");		
 		printf("\n   Number of valid words (SEsize) = %u",wcsa->seSize);
 		printf("\n   Number of different words = %ld",wcsa->n);
-		printf("\n   WCSA structure = %d bytes", sizeof(twcsa));
+		printf("\n   WCSA structure = %lu bytes", sizeof(twcsa));
 
 		uint totalpointers = ((((wcsa->n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));
 		uint totalasciizone = wcsa->wordsData.wordsZoneMem.size * sizeof(byte) ;
diff --git a/swcsa/interface.h b/swcsa/interface.h
index 938bd43..94766b7 100755
--- a/swcsa/interface.h
+++ b/swcsa/interface.h
@@ -52,45 +52,98 @@ int index_size(void *index, ulong *size);
         /* Writes in numocc the number of occurrences of the substring 
           pattern[0..length-1] found in the text indexed by index. */
 
-//int count (void *index, uchar *pattern, ulong length, ulong *numocc);
-//
-//        /* Writes in numocc the number of occurrences of the substring 
-//          pattern[0..length-1] in the text indexed by index. It also allocates
-//          occ (which must be freed by the caller) and writes the locations of 
-//          the numocc occurrences in occ, in arbitrary order.  */
-//
-//int locate (void *index, uchar *pattern, ulong length, ulong **occ, 
-//        ulong *numocc);
-//
-//        /* Gives the length of the text indexed */
-//
-//int get_length(void *index, ulong *length);
-//
-///* Accessing the indexed text  */
-//
-//        /*  Allocates snippet (which must be freed by the caller) and writes 
-//          the substring text[from..to] into it. Returns in snippet_length the 
-//          length of the text snippet actually extracted (that could be less 
-//          than to-from+1 if to is larger than the text size). */
-//
-//int extract (void *index, ulong from, ulong to, uchar **snippet, 
-//        ulong *snippet_length);
-//
-//        /* Displays the text (snippet) surrounding any occurrence of the 
-//          substring pattern[0..length-1] within the text indexed by index. 
-//          The snippet must include numc characters before and after the 
-//          pattern occurrence, totalizing length+2*numc characters, or less if 
-//          the text boundaries are reached. Writes in numocc the number of 
-//          occurrences, and allocates the arrays snippet_text and 
-//          snippet_lengths (which must be freed by the caller). The first is a 
-//          character array of numocc*(length+2*numc) characters, with a new 
-//          snippet starting at every multiple of length+2*numc. The second 
-//          gives the real length of each of the numocc snippets. */
-//
-//int display (void *index, uchar *pattern, ulong length, ulong numc, 
-//        ulong *numocc, uchar **snippet_text, ulong **snippet_lengths);
+int count (void *index, uchar *pattern, ulong length, ulong *numocc);
+
+        /* Writes in numocc the number of occurrences of the substring 
+          pattern[0..length-1] in the text indexed by index. It also allocates
+          occ (which must be freed by the caller) and writes the locations of 
+          the numocc occurrences in occ, in arbitrary order.  */
+
+int locate (void *index, uchar *pattern, ulong length, ulong **occ, 
+        ulong *numocc);
+
+        /* Gives the length of the text indexed */
+
+int get_length(void *index, ulong *length);
+
+/* Accessing the indexed text  */
+
+        /*  Allocates snippet (which must be freed by the caller) and writes 
+          the substring text[from..to] into it. Returns in snippet_length the 
+          length of the text snippet actually extracted (that could be less 
+          than to-from+1 if to is larger than the text size). */
+
+int extract (void *index, ulong from, ulong to, uchar **snippet, 
+        ulong *snippet_length);
+
+        /* Displays the text (snippet) surrounding any occurrence of the 
+          substring pattern[0..length-1] within the text indexed by index. 
+          The snippet must include numc characters before and after the 
+          pattern occurrence, totalizing length+2*numc characters, or less if 
+          the text boundaries are reached. Writes in numocc the number of 
+          occurrences, and allocates the arrays snippet_text and 
+          snippet_lengths (which must be freed by the caller). The first is a 
+          character array of numocc*(length+2*numc) characters, with a new 
+          snippet starting at every multiple of length+2*numc. The second 
+          gives the real length of each of the numocc snippets. */
+
+int display (void *index, uchar *pattern, ulong length, ulong numc, 
+        ulong *numocc, uchar **snippet_text, ulong **snippet_lengths);
 
         /*  Obtains the length of the text indexed by index. */
 
 int length (void *index, ulong *length);
 
+		/* Shows summary info of the index */
+		
+int printInfo(void *index);
+
+
+
+/** ***********************************************************************************
+  * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord
+  * ***********************************************************************************/  
+	/** Writes in numocc the number of occurrences of the substring 
+	  pattern[0..length-1] in the text indexed by index. It also allocates
+	  occ (which must be freed by the caller) and writes the locations of 
+	  the numocc occurrences in occ, in arbitrary order. These occurrences
+	  refer to the offsets in TOH where the caller could start a display
+	  operation. So locateWord implies synchronization using B.
+	  ** Parameter kbefore sets locateWord not to obtain the offset in TOH of the
+	     searched word, but the offset in TOH of k-before words before.	  
+	*/	  
+	  
+int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore);
+
+  /** Displays the text (snippet) surrounding any occurrence of the 
+    substring pattern[0..length-1] within the text indexed by index. 
+    The snippet must include numc characters before and after the 
+    pattern occurrence, totalizing length+2*numc characters, or less if 
+    the text boundaries are reached. Writes in numocc the number of 
+    occurrences, and allocates the arrays snippet_text and 
+    snippet_lengths (which must be freed by the caller). The first is a 
+    character array of numocc*(length+2*numc) characters, with a new 
+    snippet starting at every multiple of length+2*numc. The second 
+    gives the real length of each of the numocc snippets. */
+
+ int displayWords (void *index, uchar *pattern, ulong length, ulong numc, 
+         ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore);
+
+
+/** simulates extration of text process, but do not actually returns anything at all 
+   Extracts upto <=2K words from K=wordsbefore words before each occurrence of a pattern.
+   Less than 2K words can be extracted if more than numc characters have been already obtained.
+   Do nothing else... do not return the text */
+
+int  displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc);
+
+
+
+/**  Allocates text (which must be freed by the caller) and recovers the
+  the substring of text starting from the "fromword"-th word up to the
+  "toWord"-th words. Returns in text the text, and in "text_lenght" the 
+  length of the text  actually extracted. Text is allocated. 
+  Actually extracts SE[fromWord .. toWord) ... not the last word.    */
+
+int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text, 
+       ulong *text_length);
diff --git a/swcsa/utils/basics.c b/swcsa/utils/basics.c
index f213ed6..398e975 100755
--- a/swcsa/utils/basics.c
+++ b/swcsa/utils/basics.c
@@ -89,7 +89,7 @@ void bitwrite (register uint *e, register uint p,
    }
   	// writes e[p..p+len-1] = 0
 
-void bitzero (register uint *e, register uint p, 
+void bitzero2 (register uint *e, register uint p, 
 	       register uint len)
 
    { e += p/W; p %= W;
diff --git a/swcsa/utils/basics.h b/swcsa/utils/basics.h
index da91efb..6e4b39b 100755
--- a/swcsa/utils/basics.h
+++ b/swcsa/utils/basics.h
@@ -58,7 +58,7 @@ void bitwrite (uint *e, uint p, uint len, uint s);
         
     /**/ //FARI. WITH ASSUMPTION ON LEN, OR IT CRASHES 
          //NOt WORKING UPON THE LIMIT OF THE STARTING uint.
-void bitzero (uint *e, uint p, uint len);
+void bitzero2 (uint *e, uint p, uint len);
 	// reads bit p from e
 #define bitget(e,p) (((e)[(p)/W] >> ((p)%W)) & 1)
 	// sets bit p in e
diff --git a/swcsa/utils/huff.c b/swcsa/utils/huff.c
index b45a484..9d95da9 100755
--- a/swcsa/utils/huff.c
+++ b/swcsa/utils/huff.c
@@ -166,7 +166,7 @@ int encodeHuff (THuff H, uint symb, uint *stream, uint ptr)
          pos -= H.num[d--];
        }
      code += pos;
-     if (d > W) { bitzero(stream,ptr,d-W); ptr += d-W; d = W; }
+     if (d > W) { bitzero2(stream,ptr,d-W); ptr += d-W; d = W; }
      while (d--)
         { if ((code >> d) & 1) bitset(stream,ptr);
 	  else bitclean(stream,ptr);