Debug swcsa
[SXSI/TextCollection.git] / swcsa / buildAll.c
1 #include "buildFacade.h"
2
3 /**------------------------------------------------------------------ 
4   *  MAIN PROGRAM.
5   *------------------------------------------------------------------ */
6
7         int main(int argc, char* argv[])
8         {
9                                 
10                 char *infile, *outbasename, *stopwordsfile;     // Name of in/out files
11                 byte *inputBuffer;
12                 ulong finsize;
13         
14                 int f_in;
15                 void *Index;
16
17                 
18                 printf("\n*Presentation level for CSA (simple WCSA)");
19                 printf("\n*CopyRight (c) 2007 [LBD & G.N.]\n\n");
20         
21                 // Reads input parameters from command line.
22                 if(argc < 3) {
23                         printf("Use: %s <in file> <out basename> [build_options]\n", argv[0]);
24                         exit(0);
25                 }
26         
27                 // Reads params (input file, output basename, and stopwords file)
28                 infile = argv[1];
29                 outbasename = argv[2];
30                 stopwordsfile = argv[3];
31                 
32                 finsize= fileSize(infile);
33                 
34                 if (! finsize) {
35                         printf( "\nFILE EMPTY OR FILE NOT FOUND %s !!\nSkipping processement ...\n",infile);
36                         exit(0);
37                 }       
38         
39                 // Opening the input text file.
40                 if( (f_in = open(infile, O_RDONLY)) < 0) {
41                         printf("Cannot read file %s\n", infile);
42                         exit(0);
43                 }       
44                 inputBuffer = (byte *) malloc(finsize *sizeof(byte));// +1);
45                 read (f_in,inputBuffer,finsize);        
46                 close (f_in);   
47                 
48                 
49         {
50                 //printf("\n parametros <<%s>>\n\n",stopwordsfile);     
51                 //      build_WCSA (inputBuffer, finsize, stopwordsfile, NULL,outbasename);                                                             
52                 build_index (inputBuffer, finsize, stopwordsfile, &Index);  /** building the index */
53
54
55                 /** saving the index to disk*/
56                 
57                 save_index (Index, outbasename);                
58                 fprintf(stderr,"Index saved !! ");
59
60                 /** tells the mem used by the index */
61                 ulong indexsize;                
62                 index_size(Index, &indexsize);
63                 fprintf(stderr,"Index occupied %d bytes, 2 extra mallocs = %d",indexsize,2* sizeof(uint));
64
65
66                 /** recovering the source text from the index */
67                         {
68                                 double start, end;
69                                 start = getTime2();
70                                 ulong size;
71                                 get_length(Index, &size);
72
73                                 fprintf(stderr, "\nRecovering source file ");   fflush(stderr);
74                                 char ext1[10]=".source";
75                                 recoverSourceText1((twcsa*) Index, outbasename,ext1, size);
76                                 end = getTime2();       
77                                 fprintf(stderr, " time: %.3f secs\n", end-start );      
78
79                                 start=end;
80                                 char ext2[10]=".source2";
81                                 fprintf(stderr, "\nRecovering source file ");   fflush(stderr);
82                                 recoverSourceText2((twcsa*) Index, outbasename,ext2,size);
83                                 end = getTime2();       
84                                 fprintf(stderr, " time: %.3f secs\n", end-start );      
85                                 //fprintf(stderr, "\nRecovering source file time: %.3f secs\n", end-start );    
86                         }
87                 
88                 // DISPLAYING THE OCCURRENCES OF A TEXT PATTERN (word/phrase).
89                         {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
90                          int error = 0;
91                         ulong numocc,numc, length, i, *snippet_len, tot_numcharext = 0, numpatt;
92                         uchar *pattern, *snippet_text;
93                                  
94                                  pattern = textPattern;
95                          printf("\nSEARCH TEST for DISPLAY (pizzachili interface)\n");
96                                 while(1) {      
97                                         printf("Intro string: ");
98                                         fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
99                                         if (!strcmp((char*)textPattern,"\n") ) break;
100                                          textPattern[strlen((char*)textPattern)-1] = '\0';
101                 
102                                         length = strlen( (char*)textPattern);
103                                         numc=50;
104          
105 //                                      error = display (Index, textPattern, length, numc, &numocc, 
106 //                                                               &snippet_text, &snippet_len);
107                                         error = displayWords (Index, textPattern, length, numc, &numocc, 
108                                                                  &snippet_text, &snippet_len,1);
109                                         
110                                         if (error){ fprintf(stderr, "%s\n", "Hubo un error durante display");exit(0);}
111                 
112                                                 fprintf(stderr,"\n acabou display");fflush(stderr);                     
113                                         {//show the results
114                                                 ulong j, len = length + 2*numc;
115                                             char blank = '\0';
116                                                 fprintf(stderr,"\n length = %d",length);
117                                                 fprintf(stderr,"\n pattern = %s",pattern);fflush(stderr);
118                                                 fprintf(stderr,"\n numocc = %d",numocc);fflush(stderr);
119                                                 fprintf(stderr,"\n snippet len = %d",len);fflush(stderr);
120                                                 fprintf(stderr,"\n =========");fflush(stderr);          
121                                                 for (i = 0; i < numocc; i++){
122                                                         fprintf(stderr,"\n[%2d][len=%3d]<<",i+1,snippet_len[i]);fflush(stderr);
123                                                         fwrite(snippet_text+len*i,sizeof(uchar),snippet_len[i],stderr);fflush(stderr);
124                                                         fprintf(stderr,">>");fflush(stderr);
125                                                 }
126                                         }
127                                         numpatt--;
128                                         
129                                         for(i=0; i<numocc; i++) {
130                                                 tot_numcharext += snippet_len[i];
131                                         }
132                                                 
133                                         if (numocc) {
134                                                 free (snippet_len);
135                                                 free (snippet_text);
136                                         }
137                                         
138                                         printf("Ocurrences = %d\n", numocc);
139                                         if (!strcmp((char*)textPattern,"\n") ) break;
140                                 }
141                         }
142         
143
144
145                         // SEARCHING FOR A TEXT PATTERN (word/phrase).
146                         {unsigned char textPattern[MAX_TEXT_PATTERN_SIZE];
147                          int occ;
148                          int len;
149                          uint *occs;
150                          int i;
151                          printf("\nSEARCH TEST for LOCATE\n");
152                                 while(1) {      
153                                         printf("Intro string: ");
154                                         fgets((char*)textPattern, MAX_TEXT_PATTERN_SIZE, stdin);
155                                         len = strlen((char*)textPattern);
156                                         if (!strcmp((char*)textPattern,"\n") ) break;
157                                         textPattern[len-1] = '\0';
158                                         len --;
159                                         
160                                         //occs = locateTextOcurrences(wcsa,textPattern,&occ);
161                                         // locate(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ);
162                                           locateWord(Index, textPattern, len, (ulong **)&occs, (ulong *)&occ, 0);
163                                         
164                                         printf("\n*** %s occurs %d times: In the source text in positions:\n\t",textPattern,occ);
165                 /*                      for (i=0;i<occ;i++) 
166                                                 printf("[%u]",occs[i]);
167                                         fflush(stderr);  */
168                                         if (occ >0) free(occs);         
169                                         
170                                         if (!strcmp((char*)textPattern,"\n") ) break;
171                                 }
172                         }       
173         
174
175
176
177                 /** freeing the index */                                        
178                 free_index(Index);                              
179                          
180         }
181 }
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211 //{     
212 //      bG = createBitmap (B,len);
213 //      bE = createBitmapEdu (B,len);
214 //      //fprintf(stderr,"\n RANK DE GONZALO DA %u, de EDU DA %u\n",rank(bG,33),rank1Edu(bE,33));
215 //      //fprintf(stderr,"\n SELECT1(2) DE GONZALO DA %u, de EDU DA %u\n",bselect(bG,4),bselect(bE,4));
216 //      
217 //      showBitVector(bitvector,34);    
218 //}
219
220
221
222 /*      
223
224         //USING A HASH TABLE
225         //      {
226         //      char a[20]="beginnings";char b[20]="HOSTIAS";
227         //      char *w;        
228         //      int i;
229         //      w=a;
230         //      i = inHashTable(stopwordshash,w, strlen(w), &addrInTH );
231         //      if (!i) insertElement (stopwordshash, w, strlen(w), &addrInTH);
232         //      else stopwordshash->hash[addrInTH].freq++;
233         //      //fprintf(stderr,"\n i = %ld, addrInTh = %ld ",i,addrInTH);
234         //      //fprintf(stderr,"\n word in hash[%ld]= %s , freq = %ld, posinvoc =%ld",addrInTH, stopwordshash->hash[addrInTH].word, stopwordshash->hash[addrInTH].freq, stopwordshash->hash[addrInTH].posInVoc);      
235         //  }
236
237
238
239         /// ENCODING THE separators  ...
240 {       
241         freeHuff(gapsHuffman);
242         uint i;
243         uint *bitvector;
244         uint bitvectorSize;
245         uint ptr;
246         bitmap bG,bE;
247         uint len;
248         len = 1000; //number of bits
249         bitvector = (uint *) malloc ((len/32 +1)* sizeof(uint));
250         
251         byte texto[100] = "####@?*";
252         uint freqs[256];
253         
254         //fprintf(stderr,"\n este es el texto a codificar: %s",texto);
255         for (i=0;i<256;i++) freqs[i]=0;
256         for (i=0;i<strlen(texto);i++) freqs[texto[i]]++;
257         gapsHuffman = createHuff (freqs,255,UNSORTED);
258                 
259         ptr=0;
260         for (i=0;i<strlen(texto);i++) {
261                 //fprintf(stderr,"\n ENCODING seprators !!\n");
262                 //fprintf(stderr,"%d. \n",ptr=encodeHuff(gapsHuffman, texto[i],bitvector,ptr) );                
263         }       
264
265         prepareToDecode(&(gapsHuffman));
266         bitvectorSize = ptr;
267         showBitVector(bitvector,bitvectorSize); 
268         uint pos;
269         //fprintf(stderr,"\n DECODING !!\n");
270         ptr=0;
271         while (ptr < bitvectorSize) {
272                 ptr=decodeHuff (gapsHuffman, &pos, bitvector, ptr);
273                 //fprintf(stderr,"\n DECODING pos is %ld!!\n",pos);
274                 //fprintf(stderr,"%c. \n",pos);
275         }
276         exit(0);        
277 }                                                                                                                       
278         
279 /// ENCODING THE CANONICAL WORDS ...
280 {       uint i;
281         uint *bitvector;
282         uint bitvectorSize;
283         uint ptr;
284         bitmap bG,bE;
285         uint len;
286         len = 1000; //number of bits
287         bitvector = (uint *) malloc ((len/32 +1)* sizeof(uint));
288         
289         
290         ptr=0;
291         //fprintf(stderr,"\n ENCODING VARIANTS !!\n");
292         //fprintf(stderr,"%d. \n",ptr=encodeHuff(posInHT[0].huffman, 0,bitvector,ptr) );
293         //fprintf(stderr,"%d. \n",ptr=encodeHuff(posInHT[0].huffman, 1,bitvector,ptr) );
294         //fprintf(stderr,"%d. \n",ptr=encodeHuff(posInHT[0].huffman, 2,bitvector,ptr) );
295         //fprintf(stderr,"%d. \n",ptr=encodeHuff(posInHT[0].huffman, 3,bitvector,ptr) );
296         //fprintf(stderr,"%d. \n",ptr=encodeHuff(posInHT[0].huffman, 4,bitvector,ptr) );
297         //fprintf(stderr,"%d. \n",ptr=encodeHuff(posInHT[0].huffman, 4,bitvector,ptr) );
298                         
299         //      FILE *f;
300         //      f = fopen("huff","w");
301         //      saveHuff(posInHT[0].huffman,f);
302         //      fclose(f);
303         //      
304         //      f = fopen("huff","r");
305         //      posInHT[0].huffman = loadHuff (f,0);
306         //      fclose(f);
307         //      
308         
309         prepareToDecode(&(posInHT[0].huffman));
310
311         FILE *f;
312         f = fopen("huff","w");
313         saveHuffAfterDecode(posInHT[0].huffman,f);
314         fclose(f);
315         
316         f = fopen("huff","r");
317         //posInHT[0].huffman = loadHuffAfterDecode(f,0);
318         loadHuffAfterDecode2 (&(posInHT[0].huffman),f,0);
319         fclose(f);
320         
321
322
323         bitvectorSize = ptr;
324         showBitVector(bitvector,bitvectorSize); 
325         uint pos;
326         //fprintf(stderr,"\n DECODING !!\n");
327         ptr=0;
328         while (ptr < bitvectorSize) {
329                 ptr=decodeHuff (posInHT[0].huffman, &pos, bitvector, ptr);
330                 //fprintf(stderr,"\n DECODING pos is %ld!!\n",pos);
331                 //fprintf(stderr,"%s. \n",posInHT[0].variants[pos]);
332         }
333         exit(0);
334 }       
335         
336         
337                         
338 {       uint i;
339         uint *bitvector;
340         bitmap bG,bE;
341         uint len;
342         len = 101; //number of bits
343         bitvector = (uint *) malloc ((len/32 +1)* sizeof(uint));
344         //bitvector[0]=0;
345         //bitvector[1]=0;
346         bitzero (bitvector,0,101-1);
347         for (i=0; i<len;i++) setBit (bitvector,len,i,0);
348          
349
350
351         bitset(bitvector,1); 
352         bitset(bitvector,10);
353         bitset(bitvector,12);
354         //activateBit(bitvector,1); 
355         //activateBit(bitvector,10); 
356         
357         bG = createBitmap (bitvector,len);
358         bE = createBitmapEdu (bitvector,len);
359         //fprintf(stderr,"\n RANK DE GONZALO DA %u, de EDU DA %u\n",rank(bG,33),rank1Edu(bE,33));
360         //fprintf(stderr,"\n SELECT1(2) DE GONZALO DA %u, de EDU DA %u\n",bselect(bG,4),bselect(bE,4));
361         
362         showBitVector(bitvector,34);
363 }       
364 */              
365