Huge refactoring to remove Diego's C/C++ chimera code.
[SXSI/XMLTree.git] / xml-tree-builder.cpp
diff --git a/xml-tree-builder.cpp b/xml-tree-builder.cpp
new file mode 100644 (file)
index 0000000..0b7effc
--- /dev/null
@@ -0,0 +1,140 @@
+#include "xml-tree-builder.hpp"
+#include <stdexcept>
+#include <utility>
+
+using namespace SXSI;
+
+/**
+ * Creates an idle builder: no document is open and no buffer is allocated.
+ * The buffers are created later by open_document().
+ */
+xml_tree_builder::xml_tree_builder()
+{
+  // No buffer exists until a document is opened.
+  par = nullptr;
+  tags = nullptr;
+  tag_ids = nullptr;
+  tc_builder = nullptr;
+  text_positions = nullptr;
+
+  // State flags.
+  opened = false;
+  disable_text_index = false;
+}
+
+/**
+ * Releases the buffers of a document that was opened but never closed.
+ * When no document is open there is nothing to release.
+ */
+xml_tree_builder::~xml_tree_builder()
+{
+  if (!opened) return;
+  reset();
+}
+
+/**
+ * Frees the buffers owned by the builder and clears the matching pointers.
+ *
+ * The text collection builder and the text position bit vector are only
+ * released when the text index is enabled, since open_document() only
+ * allocates them in that case.
+ *
+ * Every pointer is set to null right after being deleted so that a later
+ * call to reset() (for instance from the destructor, if re-opening a
+ * document fails half-way) deletes a null pointer instead of freeing the
+ * same object twice. The 'opened' flag is left untouched.
+ */
+void xml_tree_builder::reset()
+{
+  delete par;
+  par = nullptr;
+
+  delete tags;
+  tags = nullptr;
+
+  delete tag_ids;
+  tag_ids = nullptr;
+
+  if (!disable_text_index) {
+    delete tc_builder;
+    tc_builder = nullptr;
+
+    delete text_positions;
+    text_positions = nullptr;
+  }
+}
+
+
+/**
+ * Looks up the identifier of a tag name, registering the name if needed.
+ *
+ * @param tag  tag name to look up.
+ * @param id   identifier to assign if the name is not registered yet; it
+ *             must be equal to the next free identifier (current_tag).
+ *             Ignored when the name is already known.
+ * @return the identifier associated with the tag name.
+ * @throws std::runtime_error if the name is new and id is not the next
+ *         free identifier.
+ */
+int32_t xml_tree_builder::register_tag(std::string tag, int32_t id)
+{
+  auto entry = tag_ids->find(tag);
+
+  // Already registered: hand back the stored identifier.
+  if (entry != tag_ids->end())
+    return entry->second;
+
+  // A new name may only take the next free identifier.
+  if (id != current_tag)
+    throw std::runtime_error("xml-tree-builder: inconsistant tag id");
+
+  tag_ids->insert(std::make_pair(tag, id));
+  ++current_tag;
+  return id;
+}
+
+/**
+ * Looks up the identifier of a tag name, assigning it the next free
+ * identifier if the name is not registered yet.
+ *
+ * @param tag  tag name to look up.
+ * @return the identifier associated with the tag name.
+ */
+int32_t xml_tree_builder::register_tag(std::string tag)
+{
+  const int32_t next_free_id = current_tag;
+  return register_tag(tag, next_free_id);
+}
+
+void
+xml_tree_builder::open_document(bool disable_text_index,
+                                unsigned int sample_rate,
+                                TextCollectionBuilder::index_type_t idx_type)
+{
+  if (opened) reset();
+  opened = true;
+  par = new bit_vector();
+  tags = new std::vector<int32_t>();
+  current_tag = 0;
+  tag_ids = new std::unordered_map<std::string, int32_t>();
+
+  register_tag(xml_tree::DOCUMENT_OPEN_TAG, xml_tree::DOCUMENT_OPEN_TAG_ID);
+  register_tag(xml_tree::DOCUMENT_OPEN_TAG, xml_tree::DOCUMENT_OPEN_TAG_ID);
+
+  register_tag(xml_tree::ATTRIBUTE_OPEN_TAG, xml_tree::ATTRIBUTE_OPEN_TAG_ID);
+  register_tag(xml_tree::ATTRIBUTE_OPEN_TAG, xml_tree::ATTRIBUTE_OPEN_TAG_ID);
+
+  register_tag(xml_tree::PCDATA_OPEN_TAG, xml_tree::PCDATA_OPEN_TAG_ID);
+  register_tag(xml_tree::PCDATA_OPEN_TAG, xml_tree::PCDATA_OPEN_TAG_ID);
+
+  register_tag(xml_tree::ATTRIBUTE_DATA_OPEN_TAG,
+               xml_tree::ATTRIBUTE_DATA_OPEN_TAG_ID);
+
+  register_tag(xml_tree::ATTRIBUTE_DATA_OPEN_TAG,
+               xml_tree::ATTRIBUTE_DATA_OPEN_TAG_ID);
+
+
+  this->disable_text_index = disable_text_index;
+  if (!disable_text_index){
+    tc_builder = TextCollectionBuilder::create(sample_rate, idx_type);
+    text_positions = new bit_vector();
+    text_index_type = idx_type;
+  };
+}
+
+/**
+ * Records an opening tag event.
+ *
+ * The tag name is registered if needed, its identifier is appended to the
+ * tag sequence and an opening parenthesis (true) is appended to the
+ * parenthesis bit vector. When the text index is enabled, an unset bit is
+ * appended to the text position bit vector.
+ *
+ * @param tag  name of the opened tag.
+ */
+void xml_tree_builder::open_tag(std::string tag)
+{
+  const int32_t tag_id = register_tag(tag);
+
+  tags->push_back(tag_id);
+  par->push_back(true);
+
+  if (disable_text_index) return;
+  text_positions->push_back(false);
+}
+
+/**
+ * Records a closing tag event.
+ *
+ * The generic closing tag identifier is appended to the tag sequence and
+ * a closing parenthesis (false) is appended to the parenthesis bit vector.
+ * When the text index is enabled, an unset bit is appended to the text
+ * position bit vector. The tag name itself is not used.
+ */
+void xml_tree_builder::close_tag(std::string)
+{
+  // Keep a local copy of the constant rather than passing it directly;
+  // presumably this avoids binding push_back's reference parameter to the
+  // class constant itself -- TODO confirm against xml_tree's declaration.
+  xml_tree::tag_t closing_id = xml_tree::CLOSE_TAG_ID;
+
+  tags->push_back(closing_id);
+  par->push_back(false);
+
+  if (disable_text_index) return;
+  text_positions->push_back(false);
+}
+
+/**
+ * Records a text event attached to the most recent tag event.
+ *
+ * Does nothing when the text index is disabled. Otherwise the text is
+ * added to the text collection builder and the last bit of the text
+ * position bit vector is set.
+ *
+ * @param s  the text; an empty string is stored as the single byte 0x01.
+ * @throws std::runtime_error if no tag event has been recorded yet, since
+ *         there is then no position to attach the text to.
+ */
+void xml_tree_builder::text(std::string s)
+{
+  if (disable_text_index) return;
+
+  // Without a previous open_tag()/close_tag() the bit vector is empty and
+  // "size() - 1" would not be a valid position.
+  if (text_positions->size() == 0)
+    throw std::runtime_error("xml_tree_builder: text outside of any tag");
+
+  // Empty texts are replaced by a one-byte placeholder before insertion.
+  if (s.empty()) s = "\001";
+
+  tc_builder->InsertText(reinterpret_cast<const unsigned char *>(s.c_str()));
+  text_positions->set(text_positions->size() - 1, true);
+}
+
+xml_tree *xml_tree_builder::close_document()
+{
+  if (opened) {
+    opened = false;
+    auto tags_ = tags;
+    auto tag_ids_ = tag_ids;
+    auto par_ = par;
+    auto tc_builder_ = tc_builder;
+    auto text_positions_ = text_positions;
+    tc_builder = 0;
+    text_positions = 0;
+    tags = 0;
+    tag_ids = 0;
+    par = 0;
+    return new xml_tree(tags_, tag_ids_, par_,
+                        disable_text_index,
+                        tc_builder_,
+                        text_index_type,
+                        text_positions_);
+  };
+
+  throw std::runtime_error("xml_tree_builder: inconsistent parser state");
+}
+