X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=incbwt%2Fread_bwt.cpp;fp=incbwt%2Fread_bwt.cpp;h=6b7b65e62f0672cb9ac33c62618fe99385d1be0e;hb=13e254b7c0ee22dffbc7c3125cee0408f9b375da;hp=0000000000000000000000000000000000000000;hpb=e4b6bdc7cc2a1372e4d4dae50acac55cebcc7e9b;p=SXSI%2FTextCollection.git diff --git a/incbwt/read_bwt.cpp b/incbwt/read_bwt.cpp new file mode 100644 index 0000000..6b7b65e --- /dev/null +++ b/incbwt/read_bwt.cpp @@ -0,0 +1,103 @@ +#include +#include +#include + +#include "rlcsa.h" + + +using namespace CSA; + + +/* + This program writes run-length encoded PLCP of the collection into a file. +*/ + + +usint +countRuns(usint prev, uchar* buffer, usint length) +{ + usint runs = 0; + + for(usint i = 0; i < length; i++) + { + if(buffer[i] != prev) + { + prev = buffer[i]; + if(buffer[i] != 0) { runs++; } + } + } + + return runs; +} + + +int +main(int argc, char** argv) +{ + std::cout << "RLCSA to BWT converter" << std::endl; + if(argc < 2) + { + std::cout << "Usage: read_bwt base_name [buffer_size]" << std::endl; + return 1; + } + + std::string base_name = argv[1]; + std::string bwt_name = base_name + ".bwt"; + std::cout << "BWT: " << bwt_name << std::endl; + std::ofstream bwt_file(bwt_name.c_str(), std::ios_base::binary); + if(!bwt_file) + { + std::cerr << "Error creating BWT file!" << std::endl; + return 2; + } + std::cout << std::endl; + + RLCSA rlcsa(base_name); + clock_t start = clock(); + usint buffer_size = 0; + if(argc > 2) { buffer_size = atoi(argv[2]); } + usint n = rlcsa.getSize() + rlcsa.getNumberOfSequences(); + + usint runs = 0, prev = CHARS; + if(buffer_size > 0) + { + for(usint i = 0; i < n; i += buffer_size) + { + pair_type range(i, std::min(i + buffer_size - 1, n - 1)); + uchar* bwt = rlcsa.readBWT(range); + if(bwt != 0) + { + runs += countRuns(prev, bwt, length(range)); + prev = bwt[length(range) - 1]; + bwt_file.write((char*)bwt, length(range)); + delete[] bwt; + } + } + } + else + { + uchar* bwt = rlcsa.readBWT(); + if(bwt != 0) + { + runs = countRuns(prev, bwt, n); + bwt_file.write((char*)bwt, n); + delete[] bwt; + } + } + + clock_t stop = clock(); + double time = ((stop - start) / (double)CLOCKS_PER_SEC); + double megabytes = n / (double)MEGABYTE; + std::cout << megabytes << " megabytes in " << time << " seconds (" << (megabytes / time) << " MB/s)" << std::endl; + std::cout << std::endl; + + // Testing direct reporting of the number of runs. + // This is as expensive as reading the BWT. + std::cout << "Number of runs: " << runs << std::endl; + runs = rlcsa.countRuns(); + std::cout << "Number of runs (direct count): " << runs << std::endl; + std::cout << std::endl; + + bwt_file.close(); + return 0; +}