Added RLCSA index option
[SXSI/TextCollection.git] / incbwt / utils / split_wikipedia.py
1 #! /usr/bin/env python
2 # -*- coding: iso-8859-15 -*-
3
4 import getopt, sys
5
6
def _to_bytes(text):
    """Return *text* as a byte string (already the case for Python 2 ``str``)."""
    if isinstance(text, bytes):
        return text
    # Python 3 decodes argv with the filesystem encoding; reverse that so the
    # tag compares against the raw bytes read from the input.
    return text.encode(sys.getfilesystemencoding(), "surrogateescape")


def _open_part(index):
    """Open part file number *index* for binary writing and announce it."""
    handle = open("part." + str(index), "wb")
    print("Writing part %s ..." % handle.name)
    return handle


def main():
    """Split tagged input into NUL-terminated sequences stored in part files.

    Arguments are taken from ``sys.argv``:

    1. input file name, or ``-`` to read standard input;
    2. part size in mebibytes (an integer);
    3. tag name; sequences are the lines between a line containing
       ``<tag>`` and the next line containing ``</tag>``.

    Output goes to ``part.1``, ``part.2``, ... relative to the current
    directory.  The lines that contain the tags themselves are not copied;
    each line holding the end tag is replaced by a single NUL byte.  A new
    part file is started only at the beginning of a sequence, once the
    current part has reached the requested size, so a sequence is never
    split across two parts.

    Returns None.  With fewer than three arguments a usage line is written
    to standard error and nothing else is done.

    Raises:
        ValueError: if the part size argument is not an integer.
        IOError/OSError: if the input or a part file cannot be opened.
    """
    if len(sys.argv) < 4:
        sys.stderr.write(
            "usage: %s <input file | -> <part size in MB> <tag>\n"
            % sys.argv[0])
        return

    part_size = 1048576 * int(sys.argv[2])
    start_tag = _to_bytes("<" + sys.argv[3] + ">")
    end_tag = _to_bytes("</" + sys.argv[3] + ">")
    print("Part size: %d" % part_size)
    print("")

    # Read bytes, not text: the output is written in binary mode, and the
    # data must pass through unchanged whatever its encoding.
    if sys.argv[1] == "-":
        infile = getattr(sys.stdin, "buffer", sys.stdin)
    else:
        infile = open(sys.argv[1], "rb")

    sequences = 0
    in_sequence = False
    current_file = 1
    try:
        output = _open_part(current_file)
        try:
            for line in infile:
                if in_sequence:
                    if end_tag in line:
                        # The end-tag line becomes the sequence terminator.
                        output.write(b"\0")
                        in_sequence = False
                    else:
                        output.write(line)
                elif start_tag in line:
                    # Roll over to a new part only between sequences.
                    if output.tell() >= part_size:
                        output.close()
                        current_file += 1
                        output = _open_part(current_file)
                    in_sequence = True
                    sequences += 1
        finally:
            output.close()
    finally:
        infile.close()

    print("")
    print("Sequences:  %d" % sequences)
51
# Run the splitter only when this file is executed as a script, so that
# importing the module does not start reading sys.argv or writing files.
if __name__ == "__main__":
    main()