From: nvalimak Date: Wed, 27 Oct 2010 12:59:37 +0000 (+0000) Subject: Updated construction of SWCSA X-Git-Url: http://git.nguyen.vg/gitweb/?p=SXSI%2FTextCollection.git;a=commitdiff_plain;h=443151511a86083b21c1c06eb610f86b3aed35be Updated construction of SWCSA git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@920 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- diff --git a/swcsa/Makefile b/swcsa/Makefile index 1d731af..5719be6 100644 --- a/swcsa/Makefile +++ b/swcsa/Makefile @@ -2,7 +2,11 @@ SRCDIRUTILS = utils SRCDIRCSA = intIndex CC = g++ -# If you have trouble with make, e.g.: +export CFLAGS = -O9 -D_FORTIFY_SOURCE=0 +#export CFLAGS = -O9 -m32 -L. -g -D_FORTIFY_SOURCE=0 + +# Original settings: +# If you have trouble with -m32, e.g.: # /usr/bin/ld: skipping incompatible /usr/lib/gcc/x86_64-linux-gnu/4.4.3/libstdc++.a when searching for -lstdc++ # ... # try adding @@ -10,7 +14,7 @@ CC = g++ # # The filename libstdc++.so.6.0.13 is probably different # but any from /usr/lib32 is fine. -export CFLAGS = -O9 -m32 -L. -D_FORTIFY_SOURCE=0 +#export CFLAGS = -O9 -m32 -L. -D_FORTIFY_SOURCE=0 #export CFLAGS = -O0 -m32 -pg #export CFLAGS = -g -m32 -O0 diff --git a/swcsa/buildFacade.c b/swcsa/buildFacade.c index 768e943..3647e99 100755 --- a/swcsa/buildFacade.c +++ b/swcsa/buildFacade.c @@ -482,7 +482,7 @@ int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text, uint avgWordLen =7; uint i, j;//, tmplen; - uint prevValid; + uint prevValid = 0; byte *src, *dst, *buff; uint tmplen =0; @@ -617,62 +617,79 @@ int build_WCSA (uchar *text, ulong length, char *build_options, void **index) { //----------------------------------------------------------------- //1st pass (processing the file) { - byte *pbeg,*pend,*wordstart,*aWord; - register ulong size; - register uint i; - - pbeg = inputBuffer; - pend = inputBuffer+bytesFileReal; - - while (pbeg = pend) {size++;} // a unique BLANK at the end of the file. - else { - if (_Valid [*pbeg] ) { - wordstart = pbeg; //So skipping 1 blank character - while ( (sizehash[addrInTH].word; - hash->hash[addrInTH].posInVoc = zeroNode; - zeroNode++; - totallenWords += size +1; // +1 due to the '\0' char... - //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size); - } - seSize ++; - }//while pbeg= pend) {size++;} // a unique BLANK at the end of the file. + else { + if (_Valid [*pbeg] ) { + wordstart = pbeg; //So skipping 1 blank character + while ( (sizehash[addrInTH].word; + hash->hash[addrInTH].posInVoc = zeroNode; + zeroNode++; + totallenWords += size +1; // +1 due to the '\0' char... + //fprintf(stderr,"\n Adding word: <<%s>> (size=%d) to the hashTable",hash->hash[addrInTH].word,size); + } + seSize ++; + + }//while pbeg= pend) {size++;} // a unique BLANK at the end of the file. - else { - if (_Valid [*pbeg] ) { - wordstart = pbeg; //So skipping 1 blank character - while ( (sizehash[addrInTH].posInVoc+1; // !!!! - countValidWords++; - - }// while pbeg= pend) {size++;} // a unique BLANK at the end of the file. + else { + if (_Valid [*pbeg] ) { + wordstart = pbeg; //So skipping 1 blank character + while ( (sizehash[addrInTH].posInVoc+1; // !!!! + countValidWords++; + + }// while pbegn = zeroNode; @@ -1369,7 +1400,7 @@ int printInfo(void *index) { printf("\n Summary of Presentation layer:"); printf("\n Number of valid words (SEsize) = %u",wcsa->seSize); printf("\n Number of different words = %ld",wcsa->n); - printf("\n WCSA structure = %d bytes", sizeof(twcsa)); + printf("\n WCSA structure = %lu bytes", sizeof(twcsa)); uint totalpointers = ((((wcsa->n+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint)); uint totalasciizone = wcsa->wordsData.wordsZoneMem.size * sizeof(byte) ; diff --git a/swcsa/interface.h b/swcsa/interface.h index 938bd43..94766b7 100755 --- a/swcsa/interface.h +++ b/swcsa/interface.h @@ -52,45 +52,98 @@ int index_size(void *index, ulong *size); /* Writes in numocc the number of occurrences of the substring pattern[0..length-1] found in the text indexed by index. */ -//int count (void *index, uchar *pattern, ulong length, ulong *numocc); -// -// /* Writes in numocc the number of occurrences of the substring -// pattern[0..length-1] in the text indexed by index. It also allocates -// occ (which must be freed by the caller) and writes the locations of -// the numocc occurrences in occ, in arbitrary order. */ -// -//int locate (void *index, uchar *pattern, ulong length, ulong **occ, -// ulong *numocc); -// -// /* Gives the length of the text indexed */ -// -//int get_length(void *index, ulong *length); -// -///* Accessing the indexed text */ -// -// /* Allocates snippet (which must be freed by the caller) and writes -// the substring text[from..to] into it. Returns in snippet_length the -// length of the text snippet actually extracted (that could be less -// than to-from+1 if to is larger than the text size). */ -// -//int extract (void *index, ulong from, ulong to, uchar **snippet, -// ulong *snippet_length); -// -// /* Displays the text (snippet) surrounding any occurrence of the -// substring pattern[0..length-1] within the text indexed by index. -// The snippet must include numc characters before and after the -// pattern occurrence, totalizing length+2*numc characters, or less if -// the text boundaries are reached. Writes in numocc the number of -// occurrences, and allocates the arrays snippet_text and -// snippet_lengths (which must be freed by the caller). The first is a -// character array of numocc*(length+2*numc) characters, with a new -// snippet starting at every multiple of length+2*numc. The second -// gives the real length of each of the numocc snippets. */ -// -//int display (void *index, uchar *pattern, ulong length, ulong numc, -// ulong *numocc, uchar **snippet_text, ulong **snippet_lengths); +int count (void *index, uchar *pattern, ulong length, ulong *numocc); + + /* Writes in numocc the number of occurrences of the substring + pattern[0..length-1] in the text indexed by index. It also allocates + occ (which must be freed by the caller) and writes the locations of + the numocc occurrences in occ, in arbitrary order. */ + +int locate (void *index, uchar *pattern, ulong length, ulong **occ, + ulong *numocc); + + /* Gives the length of the text indexed */ + +int get_length(void *index, ulong *length); + +/* Accessing the indexed text */ + + /* Allocates snippet (which must be freed by the caller) and writes + the substring text[from..to] into it. Returns in snippet_length the + length of the text snippet actually extracted (that could be less + than to-from+1 if to is larger than the text size). */ + +int extract (void *index, ulong from, ulong to, uchar **snippet, + ulong *snippet_length); + + /* Displays the text (snippet) surrounding any occurrence of the + substring pattern[0..length-1] within the text indexed by index. + The snippet must include numc characters before and after the + pattern occurrence, totalizing length+2*numc characters, or less if + the text boundaries are reached. Writes in numocc the number of + occurrences, and allocates the arrays snippet_text and + snippet_lengths (which must be freed by the caller). The first is a + character array of numocc*(length+2*numc) characters, with a new + snippet starting at every multiple of length+2*numc. The second + gives the real length of each of the numocc snippets. */ + +int display (void *index, uchar *pattern, ulong length, ulong numc, + ulong *numocc, uchar **snippet_text, ulong **snippet_lengths); /* Obtains the length of the text indexed by index. */ int length (void *index, ulong *length); + /* Shows summary info of the index */ + +int printInfo(void *index); + + + +/** *********************************************************************************** + * WORD-ORIENTED QUERY FUNCTIONS: LocateWord and DisplayWord + * ***********************************************************************************/ + /** Writes in numocc the number of occurrences of the substring + pattern[0..length-1] in the text indexed by index. It also allocates + occ (which must be freed by the caller) and writes the locations of + the numocc occurrences in occ, in arbitrary order. These occurrences + refer to the offsets in TOH where the caller could start a display + operation. So locateWord implies synchronization using B. + ** Parameter kbefore sets locateWord not to obtain the offset in TOH of the + searched word, but the offset in TOH of k-before words before. + */ + +int locateWord(void *index, uchar *pattern, ulong length, ulong **occ, ulong *numocc, uint kbefore); + + /** Displays the text (snippet) surrounding any occurrence of the + substring pattern[0..length-1] within the text indexed by index. + The snippet must include numc characters before and after the + pattern occurrence, totalizing length+2*numc characters, or less if + the text boundaries are reached. Writes in numocc the number of + occurrences, and allocates the arrays snippet_text and + snippet_lengths (which must be freed by the caller). The first is a + character array of numocc*(length+2*numc) characters, with a new + snippet starting at every multiple of length+2*numc. The second + gives the real length of each of the numocc snippets. */ + + int displayWords (void *index, uchar *pattern, ulong length, ulong numc, + ulong *numocc, uchar **snippet_text, ulong **snippet_lengths, uint kbefore); + + +/** simulates extration of text process, but do not actually returns anything at all + Extracts upto <=2K words from K=wordsbefore words before each occurrence of a pattern. + Less than 2K words can be extracted if more than numc characters have been already obtained. + Do nothing else... do not return the text */ + +int displayTextOcurrencesNoShow(void *index, uchar *pattern, ulong length, uint wordsbefore, uint maxnumc); + + + +/** Allocates text (which must be freed by the caller) and recovers the + the substring of text starting from the "fromword"-th word up to the + "toWord"-th words. Returns in text the text, and in "text_lenght" the + length of the text actually extracted. Text is allocated. + Actually extracts SE[fromWord .. toWord) ... not the last word. */ + +int extractWords (void *index, ulong fromWord, ulong toWord, uchar **text, + ulong *text_length); diff --git a/swcsa/utils/basics.c b/swcsa/utils/basics.c index f213ed6..398e975 100755 --- a/swcsa/utils/basics.c +++ b/swcsa/utils/basics.c @@ -89,7 +89,7 @@ void bitwrite (register uint *e, register uint p, } // writes e[p..p+len-1] = 0 -void bitzero (register uint *e, register uint p, +void bitzero2 (register uint *e, register uint p, register uint len) { e += p/W; p %= W; diff --git a/swcsa/utils/basics.h b/swcsa/utils/basics.h index da91efb..6e4b39b 100755 --- a/swcsa/utils/basics.h +++ b/swcsa/utils/basics.h @@ -58,7 +58,7 @@ void bitwrite (uint *e, uint p, uint len, uint s); /**/ //FARI. WITH ASSUMPTION ON LEN, OR IT CRASHES //NOt WORKING UPON THE LIMIT OF THE STARTING uint. -void bitzero (uint *e, uint p, uint len); +void bitzero2 (uint *e, uint p, uint len); // reads bit p from e #define bitget(e,p) (((e)[(p)/W] >> ((p)%W)) & 1) // sets bit p in e diff --git a/swcsa/utils/huff.c b/swcsa/utils/huff.c index b45a484..9d95da9 100755 --- a/swcsa/utils/huff.c +++ b/swcsa/utils/huff.c @@ -166,7 +166,7 @@ int encodeHuff (THuff H, uint symb, uint *stream, uint ptr) pos -= H.num[d--]; } code += pos; - if (d > W) { bitzero(stream,ptr,d-W); ptr += d-W; d = W; } + if (d > W) { bitzero2(stream,ptr,d-W); ptr += d-W; d = W; } while (d--) { if ((code >> d) & 1) bitset(stream,ptr); else bitclean(stream,ptr);