7 #ifdef MULTITHREAD_SUPPORT
11 #include "rlcsa_builder.h"
12 #include "misc/utils.h"
// Forward declaration of the per-file indexing phase. main() calls it with the
// list of input file names, the thread count and the parameter set.
18 double indexParts(std::vector<std::string>& filename, usint threads, Parameters& parameters);
// Upper bound for the thread count taken from the command line (see the clamp in main()).
20 const int MAX_THREADS = 64;
// Entry point of the parallel RLCSA builder.
//
// Command line: parallel_build [-n] listname output [threads]
//   listname - file whose rows are read into the list of input files
//   output   - base name used for the final index and its parameters file
//   threads  - optional thread count, clamped to [1, MAX_THREADS]
//
// Visible flow: read the file list, load the parameters, run indexParts() over
// the inputs, then insert every input into an RLCSABuilder and write out the
// resulting index together with the parameters.
//
// NOTE(review): this listing omits some lines (braces, a few conditions, the
// declaration of `threads`, return statements). In particular, the check that
// makes "-n" skip the merge phase is not visible here - confirm against the
// full file.
24 main(int argc, char** argv)
26 std::cout << "Parallel RLCSA builder" << std::endl;
29 std::cout << "Usage: parallel_build [-n] listname output [threads]" << std::endl;
30 std::cout << " -n do not merge the indexes" << std::endl;
// Indices into argv of the positional arguments; all three shift by one when
// the first argument is "-n".
34 int list_parameter = 1, output_parameter = 2, threads_parameter = 3;
36 if(std::string("-n").compare(argv[1]) == 0)
38 list_parameter++; output_parameter++; threads_parameter++;
40 std::cout << "Option '-n' specified. Partial indexes will not be merged." << std::endl;
// The list of input files is opened in binary mode.
43 std::ifstream filelist(argv[list_parameter], std::ios_base::binary);
46 std::cerr << "Error opening file list!" << std::endl;
// readRows() fills `files` from the list; the meaning of the third argument is
// not visible in this file.
49 std::vector<std::string> files;
50 readRows(filelist, files, true);
52 std::cout << "Input files: " << files.size() << std::endl;
54 std::string base_name = argv[output_parameter];
55 std::cout << "Output: " << base_name << std::endl;
// Optional thread count: values below 1 (including the 0 that atoi() yields for
// non-numeric text) become 1, values above MAX_THREADS become MAX_THREADS.
58 if(argc > threads_parameter)
60 threads = std::min(MAX_THREADS, std::max(atoi(argv[threads_parameter]), 1));
62 std::cout << "Threads: " << threads << std::endl;
63 std::cout << std::endl;
// Parameters live in <base_name> + PARAMETERS_EXTENSION. Four settings are
// registered with set() before the file is read.
// NOTE(review): presumably set() installs defaults that read() may override -
// confirm against the Parameters class.
65 std::string parameters_name = base_name + PARAMETERS_EXTENSION;
66 Parameters parameters;
67 parameters.set(RLCSA_BLOCK_SIZE);
68 parameters.set(SAMPLE_RATE);
69 parameters.set(SUPPORT_LOCATE);
70 parameters.set(SUPPORT_DISPLAY);
71 parameters.read(parameters_name);
// The timer started here is stopped after the index has been written, so the
// reported total covers both phases.
74 double start = readTimer();
75 double megabytes = indexParts(files, threads, parameters);
// Builder for the merged index; the meaning of the third argument (0) is not
// visible in this file.
77 RLCSABuilder builder(parameters.get(RLCSA_BLOCK_SIZE), parameters.get(SAMPLE_RATE), 0, threads);
80 std::cout << "Phase 2: Merging the indexes" << std::endl;
// Every name from the file list is handed to the builder in list order.
81 for(std::vector<std::string>::iterator iter = files.begin(); iter != files.end(); iter++)
83 std::cout << "Increment: " << *iter << std::endl;
84 builder.insertFromFile(*iter);
86 std::cout << std::endl;
// Only a non-null index that reports isOk() is sized, written under base_name,
// and accompanied by the parameters file.
88 RLCSA* index = builder.getRLCSA();
89 if(index != 0 && index->isOk())
92 index->reportSize(true);
93 index->writeTo(base_name);
94 parameters.write(parameters_name);
// Overall throughput, using the megabyte count returned by indexParts().
99 double stop = readTimer();
100 std::cout << megabytes << " megabytes indexed in " << (stop - start) << " seconds (" << (megabytes / (stop - start)) << " MB/s)." << std::endl;
// Timing breakdown as reported by the builder.
103 std::cout << "Search time: " << builder.getSearchTime() << " seconds" << std::endl;
104 std::cout << "Sort time: " << builder.getSortTime() << " seconds" << std::endl;
105 std::cout << "Merge time: " << builder.getMergeTime() << " seconds" << std::endl;
107 std::cout << std::endl;
// Phase 1: builds one RLCSA per input file and writes it, plus a copy of the
// parameters, using the input file's own name as the base name.
//
//   filenames  - input files; each one is read completely into memory
//   threads    - thread count passed to OpenMP when MULTITHREAD_SUPPORT is defined
//   parameters - source of the block size and sample rate; also written next
//                to each partial index
//
// Declared to return double. The return statement is not visible in this
// listing; presumably it returns the megabyte total computed at the end -
// confirm. main() uses the result as the number of megabytes indexed.
//
// NOTE(review): this listing omits some lines (braces, conditions, several
// declarations and everything guarded by most of the MULTITHREAD_SUPPORT
// blocks). Comments below describe only what is visible.
114 indexParts(std::vector<std::string>& filenames, usint threads, Parameters& parameters)
116 double start = readTimer();
117 std::cout << "Phase 1: Building indexes for input files" << std::endl;
118 usint block_size = parameters.get(RLCSA_BLOCK_SIZE);
119 usint sample_rate = parameters.get(SAMPLE_RATE);
// Converted to megabytes at the end. The statements that add to it are not
// visible here.
// NOTE(review): if it is updated inside the parallel loop, confirm the update
// is synchronised.
120 usint total_size = 0;
122 std::ifstream* input_file;
124 std::string parameters_name;
// With OpenMP enabled, the per-file scratch variables are listed as private, so
// every thread works on its own copies.
129 #ifdef MULTITHREAD_SUPPORT
130 omp_set_num_threads(threads);
131 #pragma omp parallel private(input_file, size, parameters_name, index, data)
// schedule(dynamic, 1): loop iterations (one input file each) are handed to
// threads one at a time.
133 #pragma omp for schedule(dynamic, 1)
// The file count is cast to sint to match the loop index, whose declaration is
// not visible here.
135 for(i = 0; i < (sint)(filenames.size()); i++)
137 #ifdef MULTITHREAD_SUPPORT
142 std::cout << "Input: " << filenames[i] << std::endl;
// NOTE(review): the stream is heap-allocated; the matching delete is not
// visible in this listing - confirm it exists on every path.
143 input_file = new std::ifstream(filenames[i].c_str(), std::ios_base::binary);
146 std::cerr << "Error opening input file " << filenames[i] << "!" << std::endl;
// The whole file is read into a freshly allocated buffer of `size` bytes.
150 size = fileSize(*input_file);
151 data = new uchar[size];
152 input_file->read((char*)data, size);
155 #ifdef MULTITHREAD_SUPPORT
// The index is built directly from the buffer; the meaning of the two trailing
// boolean arguments is not visible in this file.
// NOTE(review): ownership of `data` and `index` after this point is not
// visible - confirm who frees them.
161 index = new RLCSA(data, size, block_size, sample_rate, true, true);
// Written only when the index reports isOk(); the input file name serves as
// the base name of the partial index.
162 if(index != 0 && index->isOk()) { index->writeTo(filenames[i]); }
165 #ifdef MULTITHREAD_SUPPORT
// The parameters are written as <input file name> + PARAMETERS_EXTENSION.
170 parameters_name = filenames[i] + PARAMETERS_EXTENSION;
171 parameters.write(parameters_name);
172 #ifdef MULTITHREAD_SUPPORT
178 #ifdef MULTITHREAD_SUPPORT
// Warning for an empty input file; the condition that triggers it is not
// visible here.
182 std::cerr << "Warning: Empty input file " << filenames[i] << "!" << std::endl;
183 #ifdef MULTITHREAD_SUPPORT
188 #ifdef MULTITHREAD_SUPPORT
// Phase summary: total size in megabytes (total_size / MEGABYTE) and throughput.
192 double stop = readTimer();
193 double megabytes = total_size / (double)MEGABYTE;
194 std::cout << "Indexed " << megabytes << " megabytes in " << (stop - start) << " seconds (" << (megabytes / (stop - start)) << " MB/s)." << std::endl;
195 std::cout << std::endl;