X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=swcsa%2FbuildAll.c;fp=swcsa%2FbuildAll.c;h=b114d3a22a1f43a6818a02377cbad0d26c9d0a00;hb=102e33b134075765e6d4e0c38bc1307568ce5602;hp=0000000000000000000000000000000000000000;hpb=ed61d2042a7ad7dd83bae32d7c31e69504dafa80;p=SXSI%2FTextCollection.git diff --git a/swcsa/buildAll.c b/swcsa/buildAll.c new file mode 100644 index 0000000..b114d3a --- /dev/null +++ b/swcsa/buildAll.c @@ -0,0 +1,365 @@ +#include "buildFacade.h" + +/**------------------------------------------------------------------ + * MAIN PROGRAM. + *------------------------------------------------------------------ */ + + int main(int argc, char* argv[]) + { + + char *infile, *outbasename, *stopwordsfile; // Name of in/out files + byte *inputBuffer; + ulong finsize; + + int f_in; + void *Index; + + + printf("\n*Presentation level for CSA (simple WCSA)"); + printf("\n*CopyRight (c) 2007 [LBD & G.N.]\n\n"); + + // Reads input parameters from command line. + if(argc < 3) { + printf("Use: %s [build_options]\n", argv[0]); + exit(0); + } + + // Reads params (input file, output basename, and stopwords file) + infile = argv[1]; + outbasename = argv[2]; + stopwordsfile = argv[3]; + + finsize= fileSize(infile); + + if (! finsize) { + printf( "\nFILE EMPTY OR FILE NOT FOUND %s !!\nSkipping processement ...\n",infile); + exit(0); + } + + // Opening the input text file. + if( (f_in = open(infile, O_RDONLY)) < 0) { + printf("Cannot read file %s\n", infile); + exit(0); + } + inputBuffer = (byte *) malloc(finsize *sizeof(byte));// +1); + read (f_in,inputBuffer,finsize); + close (f_in); + + + { + //printf("\n parametros <<%s>>\n\n",stopwordsfile); + // build_WCSA (inputBuffer, finsize, stopwordsfile, NULL,outbasename); + build_index (inputBuffer, finsize, stopwordsfile, &Index); /** building the index */ + + + /** saving the index to disk*/ + + save_index (Index, outbasename); + fprintf(stderr,"Index saved !! "); + + /** tells the mem used by the index */ + ulong indexsize; + index_size(Index, &indexsize); + fprintf(stderr,"Index occupied %d bytes, 2 extra mallocs = %d",indexsize,2* sizeof(uint)); + + + /** recovering the source text from the index */ + { + double start, end; + start = getTime2(); + ulong size; + get_length(Index, &size); + + fprintf(stderr, "\nRecovering source file "); fflush(stderr); + char ext1[10]=".source"; + recoverSourceText1((twcsa*) Index, outbasename,ext1, size); + end = getTime2(); + fprintf(stderr, " time: %.3f secs\n", end-start ); + + start=end; + char ext2[10]=".source2"; + fprintf(stderr, "\nRecovering source file "); fflush(stderr); + recoverSourceText2((twcsa*) Index, outbasename,ext2,size); + end = getTime2(); + fprintf(stderr, " time: %.3f secs\n", end-start ); + //fprintf(stderr, "\nRecovering source file time: %.3f secs\n", end-start ); + } + + // DISPLAYING THE OCCURRENCES OF A TEXT PATTERN (word/phrase). + {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE]; + int error = 0; + ulong numocc,numc, length, i, *snippet_len, tot_numcharext = 0, numpatt; + uchar *pattern, *snippet_text; + + pattern = textPattern; + printf("\nSEARCH TEST for DISPLAY (pizzachili interface)\n"); + while(1) { + printf("Intro string: "); + fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin); + if (!strcmp((char*)textPattern,"\n") ) break; + textPattern[strlen((char*)textPattern)-1] = '\0'; + + length = strlen( (char*)textPattern); + numc=50; + +// error = display (Index, textPattern, length, numc, &numocc, +// &snippet_text, &snippet_len); + error = displayWords (Index, textPattern, length, numc, &numocc, + &snippet_text, &snippet_len,1); + + if (error){ fprintf(stderr, "%s\n", "Hubo un error durante display");exit(0);} + + fprintf(stderr,"\n acabou display");fflush(stderr); + {//show the results + ulong j, len = length + 2*numc; + char blank = '\0'; + fprintf(stderr,"\n length = %d",length); + fprintf(stderr,"\n pattern = %s",pattern);fflush(stderr); + fprintf(stderr,"\n numocc = %d",numocc);fflush(stderr); + fprintf(stderr,"\n snippet len = %d",len);fflush(stderr); + fprintf(stderr,"\n =========");fflush(stderr); + for (i = 0; i < numocc; i++){ + fprintf(stderr,"\n[%2d][len=%3d]<<",i+1,snippet_len[i]);fflush(stderr); + fwrite(snippet_text+len*i,sizeof(uchar),snippet_len[i],stderr);fflush(stderr); + fprintf(stderr,">>");fflush(stderr); + } + } + numpatt--; + + for(i=0; i0) free(occs); + + if (!strcmp((char*)textPattern,"\n") ) break; + } + } + + + + + /** freeing the index */ + free_index(Index); + + } +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +//{ +// bG = createBitmap (B,len); +// bE = createBitmapEdu (B,len); +// //fprintf(stderr,"\n RANK DE GONZALO DA %u, de EDU DA %u\n",rank(bG,33),rank1Edu(bE,33)); +// //fprintf(stderr,"\n SELECT1(2) DE GONZALO DA %u, de EDU DA %u\n",bselect(bG,4),bselect(bE,4)); +// +// showBitVector(bitvector,34); +//} + + + +/* + + //USING A HASH TABLE + // { + // char a[20]="beginnings";char b[20]="HOSTIAS"; + // char *w; + // int i; + // w=a; + // i = inHashTable(stopwordshash,w, strlen(w), &addrInTH ); + // if (!i) insertElement (stopwordshash, w, strlen(w), &addrInTH); + // else stopwordshash->hash[addrInTH].freq++; + // //fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH); + // //fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, stopwordshash->hash[addrInTH].word, stopwordshash->hash[addrInTH].freq, stopwordshash->hash[addrInTH].posInVoc); + // } + + + + /// ENCODING THE separators ... +{ + freeHuff(gapsHuffman); + uint i; + uint *bitvector; + uint bitvectorSize; + uint ptr; + bitmap bG,bE; + uint len; + len = 1000; //number of bits + bitvector = (uint *) malloc ((len/32 +1)* sizeof(uint)); + + byte texto[100] = "####@?*"; + uint freqs[256]; + + //fprintf(stderr,"\n este es el texto a codificar: %s",texto); + for (i=0;i<256;i++) freqs[i]=0; + for (i=0;i