X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=incbwt%2Fbuild_rlcsa.cpp;fp=incbwt%2Fbuild_rlcsa.cpp;h=073dcd5e0ad850f0dd8aca7601feef880c78b259;hb=13e254b7c0ee22dffbc7c3125cee0408f9b375da;hp=0000000000000000000000000000000000000000;hpb=e4b6bdc7cc2a1372e4d4dae50acac55cebcc7e9b;p=SXSI%2FTextCollection.git diff --git a/incbwt/build_rlcsa.cpp b/incbwt/build_rlcsa.cpp new file mode 100644 index 0000000..073dcd5 --- /dev/null +++ b/incbwt/build_rlcsa.cpp @@ -0,0 +1,105 @@ +#include +#include +#include +#include + +#include "rlcsa_builder.h" +#include "misc/utils.h" + + +using namespace CSA; + + + +int +lineByLineRLCSA(std::string base_name, usint block_size, usint sample_rate, usint buffer_size) +{ + Parameters parameters; + parameters.set(RLCSA_BLOCK_SIZE.first, block_size); + parameters.set(SAMPLE_RATE.first, sample_rate); + + if(sample_rate > 0) + { + parameters.set(SUPPORT_LOCATE.first, 1); + parameters.set(SUPPORT_DISPLAY.first, 1); + } + else + { + parameters.set(SUPPORT_LOCATE.first, 0); + parameters.set(SUPPORT_DISPLAY.first, 0); + } + + std::string parameters_name = base_name + PARAMETERS_EXTENSION; + parameters.print(); + parameters.write(parameters_name); + + std::cout << "Input: " << base_name << std::endl; + std::ifstream input_file(base_name.c_str(), std::ios_base::binary); + if(!input_file) + { + std::cerr << "Error opening input file!" << std::endl; + return 2; + } + std::cout << "Buffer size: " << buffer_size << " MB" << std::endl; + std::cout << std::endl; + + double start = readTimer(); + RLCSABuilder builder(parameters.get(RLCSA_BLOCK_SIZE), parameters.get(SAMPLE_RATE), buffer_size * MEGABYTE); + + usint lines = 0, total = 0; + while(input_file) + { + char buffer[16384]; // FIXME What if lines are longer? Probably fails. + input_file.getline(buffer, 16384); + usint chars = input_file.gcount(); + lines++; total += chars; + if(chars >= 16383) { std::cout << lines << ": " << chars << " chars read!" << std::endl; } + if(chars > 1) { builder.insertSequence(buffer, chars - 1, false); } + } + + RLCSA* rlcsa = 0; + if(builder.isOk()) + { + rlcsa = builder.getRLCSA(); + rlcsa->writeTo(base_name); + } + else + { + std::cerr << "Error: RLCSA construction failed!" << std::endl; + return 3; + } + + double time = readTimer() - start; + double build_time = builder.getBuildTime(); + double search_time = builder.getSearchTime(); + double sort_time = builder.getSortTime(); + double merge_time = builder.getMergeTime(); + + double megabytes = rlcsa->getSize() / (double)MEGABYTE; + usint sequences = rlcsa->getNumberOfSequences(); + std::cout << sequences << " sequences" << std::endl; + std::cout << megabytes << " megabytes in " << time << " seconds (" << (megabytes / time) << " MB/s)" << std::endl; + std::cout << "(build " << build_time << " s, search " << search_time << "s, sort " << sort_time << " s, merge " << merge_time << " s)" << std::endl; + std::cout << std::endl; + + delete rlcsa; + return 0; +} + + +int +main(int argc, char** argv) +{ + std::cout << "Line-by-line RLCSA builder" << std::endl; + if(argc < 3) + { + std::cout << "Usage: build_rlcsa base_name buffer_size [block_size [sample_rate]]" << std::endl; + return 1; + } + std::cout << std::endl; + + int name_arg = 1, buffer_arg = 2, block_arg = 3, sample_arg = 4; + usint block_size = (argc > block_arg ? atoi(argv[block_arg]) : RLCSA_BLOCK_SIZE.second); + usint sample_rate = (argc > sample_arg ? atoi(argv[sample_arg]) : SAMPLE_RATE.second); + return lineByLineRLCSA(argv[name_arg], block_size, sample_rate, atoi(argv[buffer_arg])); +}