Fix printing

[SXSI/XMLTree.git] / XMLTree.h
diff --git a/XMLTree.h b/XMLTree.h

index ee44d32..c2a0210 100644 (file)
--- a/XMLTree.h
+++ b/XMLTree.h
@@ -1,4 +1,3 @@
-\r
  /******************************************************************************\r
   *   Copyright (C) 2008 by Diego Arroyuelo                                    *\r
   *   Interface for the in-memory XQuery/XPath engine                          *\r
@@ -21,11 +20,20 @@
  \r
  #ifndef XMLTREE_H_\r
  #define XMLTREE_H_\r
+extern "C" {\r
+#define CAML_NAME_SPACE\r
+#include <caml/mlvalues.h>\r
+#include <caml/custom.h>\r
+#define XMLTREE(x) ((XMLTree *)(* (XMLTree**) Data_custom_val(x)))\r
+  //#define XMLTREE(x) ((XMLTree*) (x))\r
+}\r
  #include <unordered_set>\r
  #include <unordered_map>\r
+#include <sstream>\r
  #include "TextCollection/TextCollectionBuilder.h"\r
-#include <stdio.h>\r
-#include <stdlib.h>\r
+\r
+#include <cstdio>\r
+#include <cstdlib>\r
  #include <cstring>\r
  \r
  \r
@@ -34,13 +42,14 @@
  #undef Wminusone\r
  \r
  #include "bp.h"\r
-\r
+#include <libcds/includes/basics.h>\r
  #include <static_bitsequence.h>\r
  #include <alphabet_mapper.h>\r
  #include <static_sequence.h>\r
  using SXSI::TextCollection;\r
  using SXSI::TextCollectionBuilder;\r
-using SXSI::TextStorage;\r
+\r
+\r
  \r
  // this constant is used to efficiently compute the child operation in the tree\r
  #define OPTD 10\r
@@ -78,13 +87,15 @@ typedef struct {
  #define PCDATA_TAG_ID 2\r
  #define ATTRIBUTE_DATA_OPEN_TAG "<@$>"\r
  #define ATTRIBUTE_DATA_TAG_ID 3\r
+#define CLOSING_TAG   "</>"\r
+#define CLOSING_TAG_ID 4\r
  #define DOCUMENT_CLOSE_TAG "/"\r
  #define ATTRIBUTE_CLOSE_TAG "/<@>"\r
  #define PCDATA_CLOSE_TAG "/<$>"\r
  #define ATTRIBUTE_DATA_CLOSE_TAG "/<@$>"\r
  \r
  \r
-\r
+typedef std::unordered_set<int> TagIdSet;\r
  typedef std::unordered_map<string,int> TagIdMap;\r
  typedef TagIdMap::const_iterator TagIdMapIT;\r
  \r
@@ -92,6 +103,14 @@ typedef TagIdMap::const_iterator TagIdMapIT;
      (v)->push_back(t); } while (false)\r
  \r
  \r
+// makes the enclosing function return NULLT when the test x is true\r
+#define NULLT_IF(x)  do { if (x) return NULLT; } while (0)\r
+\r
+\r
+\r
+\r
+\r
+\r
  class XMLTreeBuilder;\r
  \r
  class XMLTree {\r
@@ -121,7 +140,28 @@ class XMLTree {
     // Allows to disable the TextCollection for benchmarkin purposes\r
     bool disable_tc;\r
     \r
+   FILE* stream;\r
+   int   stream_fd; \r
+   string buffer;\r
+   void myfputs(const char* s, FILE * fp){\r
+     buffer.append(s);\r
+     if (buffer.size() >= 1000000){\r
+       fputs(buffer.c_str(),fp);\r
+       buffer.clear();\r
+     };\r
  \r
+   }\r
+   void myfputc(const char c, FILE*fp){\r
+     buffer.append(1,c);\r
+     if (buffer.size() >= 1000000){\r
+       fputs(buffer.c_str(),fp);\r
+       buffer.clear();\r
+     };\r
+   }\r
+   void mybufferflush(FILE* fp){\r
+     fputs(buffer.c_str(), fp);\r
+     buffer.clear();\r
+   }\r
     /** Data structure constructors */\r
     XMLTree(){;};\r
  \r
@@ -134,36 +174,48 @@ public:
     ~XMLTree();\r
     \r
     /** root(): returns the tree root. */\r
-   treeNode Root();\r
-   \r
+   treeNode Root() { return 0; }  /* always returns node 0 */\r
+\r
+   /** Size(): returns tags_len/2. NOTE(review): presumably the number of nodes, i.e. half the number of parentheses -- confirm. */\r
+   unsigned int Size(){\r
+     return tags_len/2;\r
+   }\r
+\r
     /** SubtreeSize(x): the number of nodes (and attributes) in the subtree of \r
      * node x. */\r
     int SubtreeSize(treeNode x);\r
-   \r
+  \r
     /** SubtreeTags(x,tag): the number of occurrences of tag within the subtree \r
      * of node x. */\r
     int SubtreeTags(treeNode x, TagType tag);\r
     \r
+   /** SubtreeElements(x): the number of element nodes in the subtree of x.\r
+    */\r
+   int SubtreeElements(treeNode x);\r
+\r
     /** IsLeaf(x): returns whether node x is leaf or not. In the succinct \r
      * representation this is just a bit inspection. */\r
+\r
     bool IsLeaf(treeNode x);\r
-    \r
+\r
     /** IsAncestor(x,y): returns whether node x is ancestor of node y. */\r
+\r
     bool IsAncestor(treeNode x, treeNode y);\r
    \r
     /** IsChild(x,y): returns whether node x is parent of node y. */\r
     bool IsChild(treeNode x, treeNode y);\r
  \r
     /** IsFirstChild(x): returns whether node x is the first child of its parent. */\r
+   /* OCAML */\r
     bool IsFirstChild(treeNode x);\r
-\r
+     \r
     /** NumChildren(x): number of children of node x. Constant time with the \r
      * data structure of Sadakane. */\r
     int NumChildren(treeNode x);\r
-   \r
+\r
     /** ChildNumber(x): returns i if node x is the i-th children of its \r
      * parent. */\r
-   inline int ChildNumber(treeNode x);\r
+   int ChildNumber(treeNode x);\r
  \r
     /** Depth(x): depth of node x, a simple binary rank on the parentheses \r
      * sequence. */\r
@@ -176,35 +228,51 @@ public:
     /** Postorder(x): returns the postorder number of node x, just regarding \r
      * tree nodes (and not texts). */\r
     int Postorder(treeNode x);\r
-   \r
+      \r
     /** Tag(x): returns the tag identifier of node x. */\r
-   TagType Tag(treeNode x);\r
-   \r
+   TagType Tag(treeNode x) {\r
+     const int idx = (int) x;  /* position of x in the tag array */\r
+     if (tags_blen != 8)\r
+       return (TagType) get_field(tags_fix,tags_blen, idx);  /* fields of tags_blen bits */\r
+     return (TagType) (((uchar*)tags_fix)[idx]);  /* 8-bit fields: plain byte read */\r
+   }\r
+\r
     /** DocIds(x): returns the range (i.e., a pair of integers) of document \r
      * identifiers that descend from node x. */\r
     range DocIds(treeNode x);\r
-   \r
+\r
     /** Parent(x): returns the parent node of node x. */\r
     treeNode Parent(treeNode x);\r
+   /* Assumes x is neither 0 nor -1 */\r
     \r
     /** Child(x,i): returns the i-th child of node x, assuming it exists. */   \r
     treeNode Child(treeNode x, int i);\r
-   \r
-   /** FirstChild(x): returns the first child of node x, assuming it exists. \r
-    * Very fast in BP. */\r
+\r
+   /** FirstChild(x): returns the first child of node x, or NULLT if the node is a leaf\r
+    */\r
     treeNode FirstChild(treeNode x);\r
-   treeNode FirstElement(treeNode x);\r
  \r
+   /** FirstElement(x): returns the first non text, non attribute child of node x, or NULLT\r
+    *    if none.\r
+    */\r
+   treeNode FirstElement(treeNode x);\r
+   value CamlFirstElement(value x);\r
     /** LastChild(x): returns the last child of node x.  */\r
     treeNode LastChild(treeNode x);\r
     \r
-   /** NextSibling(x): returns the next sibling of node x, assuming it \r
+   /** NextSibling(x): returns the next sibling of node x, or NULLT if none \r
      * exists. */\r
+\r
     treeNode NextSibling(treeNode x);\r
+\r
+   /** NextElement(x): returns the first non text, non attribute sibling of node x, or NULLT\r
+    *    if none.\r
+    */\r
     treeNode NextElement(treeNode x);\r
-   \r
+   value CamlNextElement(value x);\r
     /** PrevSibling(x): returns the previous sibling of node x, assuming it \r
      * exists. */\r
+\r
     treeNode PrevSibling(treeNode x);\r
     \r
     /** TaggedChild(x,tag): returns the first child of node x tagged tag, or \r
@@ -213,38 +281,38 @@ public:
      * among the children of node x until finding the desired child. */\r
     treeNode TaggedChild(treeNode x, TagType tag);\r
     \r
-   treeNode SelectChild(treeNode x, std::unordered_set<int> * tags);\r
+   treeNode SelectChild(treeNode x, TagIdSet * tags);\r
  \r
-   /** TaggedFollSibling(x,tag): returns the first sibling of node x tagged tag, or \r
+   /** TaggedFollowingSibling(x,tag): returns the first sibling of node x tagged tag, or \r
      *  NULLT if there is none. */\r
-   treeNode TaggedFollSibling(treeNode x, TagType tag);\r
+   treeNode TaggedFollowingSibling(treeNode x, TagType tag);\r
     \r
-   treeNode SelectFollSibling(treeNode x, std::unordered_set<int> * tags);\r
+   treeNode SelectFollowingSibling(treeNode x, TagIdSet * tags);\r
  \r
     /** TaggedDesc(x,tag): returns the first node tagged tag with larger \r
      * preorder than x and within the subtree of x. Returns NULT if there \r
      * is none. */\r
-   treeNode TaggedDesc(treeNode x, TagType tag);\r
-\r
-   treeNode SelectDesc(treeNode x, std::unordered_set<int> * tags);\r
+   treeNode TaggedDescendant(treeNode x, TagType tag);\r
  \r
+   treeNode SelectDescendant(treeNode x, TagIdSet * tags);\r
  \r
     /** TaggedPrec(x,tag): returns the first node tagged tag with smaller \r
      * preorder than x and not an ancestor of x. Returns NULLT if there \r
      * is none. */\r
-   treeNode TaggedPrec(treeNode x, TagType tag);\r
+   treeNode TaggedPreceding(treeNode x, TagType tag);\r
    \r
     /** TaggedFoll(x,tag): returns the first node tagged tag with larger \r
      * preorder than x and not in the subtree of x. Returns NULLT if there \r
      * is none. */\r
-   treeNode TaggedFoll(treeNode x, TagType tag);\r
+   treeNode TaggedFollowing(treeNode x, TagType tag);\r
  \r
-   treeNode TaggedFollBelow(treeNode x, TagType tag,treeNode root);     \r
-   \r
-   treeNode SelectFollBelow(treeNode x, std::unordered_set<int> * tags, treeNode root);\r
+   treeNode TaggedFollowingBelow(treeNode x, TagType tag,treeNode ancestor);     \r
  \r
-   /** TaggedFollowingSibling(x,tag) */\r
-   treeNode TaggedFollowingSibling(treeNode x, TagType tag);\r
+   treeNode SelectFollowingBelow(treeNode x, TagIdSet * tags, treeNode ancestor);\r
+\r
+   treeNode TaggedFollowingBefore(treeNode x, TagType tag,treeNode closing);\r
+\r
+   treeNode SelectFollowingBefore(treeNode x, TagIdSet * tags, treeNode closing);\r
  \r
     /** TaggedAncestor(x, tag): returns the closest ancestor of x tagged \r
       * tag. Return NULLT is there is none. */\r
@@ -261,7 +329,8 @@ public:
     /** MyText(x): returns the document identifier of the text below node x, or \r
      * NULLT if x is not a leaf node. */\r
     DocID MyText(treeNode x);\r
-   \r
+   DocID MyTextUnsafe(treeNode x);\r
+\r
     /** TextXMLId(d): returns the preorder of document with identifier d in the \r
      * tree consisting of all tree nodes and all text nodes. */\r
     int TextXMLId(DocID d);\r
@@ -311,7 +380,7 @@ public:
     }\r
  \r
     /** Equal(s): search for texts equal to string s. */\r
-   TextCollection::document_result Equal(uchar const *s) {\r
+   TextCollection::document_result Equals(uchar const *s) {\r
        return Text->Equal(s);\r
     }\r
  \r
@@ -325,20 +394,6 @@ public:
     TextCollection::document_result LessThan(uchar const *s) {\r
        return Text->LessThan(s);\r
     }\r
-\r
-   /** KMismatches (s): returns document identifiers for the texts that\r
-    * contain occurrence of string s with at most K mismatches. */\r
-   TextCollection::document_result KMismatches(uchar const *s, unsigned K) {\r
-       return Text->KMismatches(s, K);\r
-   }\r
-\r
-   /** KErrors (s): returns document identifiers for the texts that\r
-    * contain occurrence of string s with at most K errors. \r
-    * The accepted "errors" are insertions, deletions and mutations of chars.\r
-    */\r
-   TextCollection::document_result KErrors(uchar const *s, unsigned K) {\r
-       return Text->KErrors(s, K);\r
-   }\r
     \r
     /** IsPrefix(x): returns true if there is a text prefixed by string s. */\r
     bool IsPrefix(uchar const *s) {\r
@@ -399,34 +454,19 @@ public:
     }\r
     \r
     /** GetText(d): returns the text corresponding to document with\r
-    * id d. \r
-    *\r
-    * Implementation of GetText() may or may NOT \r
-    * require you to free() the pointer that is returned.\r
-    * Call DeleteText() for each pointer returned by GetText()\r
-    * to avoid possible memory leaks.\r
-    */\r
-   uchar* GetText(DocID d) const {\r
-     return Text->GetText(d);\r
-   }\r
-\r
-   /**\r
-    * Free the pointer returned by GetText().\r
-    */\r
-   void DeleteText(uchar *text) const {\r
-       Text->DeleteText(text);\r
+    * id d. */\r
+   uchar* GetText(DocID d) {\r
+       /* A text whose first byte is 1 is returned as "" (a string literal: callers must not free it); any other text is returned unchanged. */\r
+       uchar * const text = Text->GetText(d);\r
+       return (text[0] != 1) ? text : (uchar*)"";\r
     }\r
  \r
     /** GetText(i, j): returns the texts corresponding to documents with\r
-    * ids i, i+1, ..., j. Texts are separated by '\0' character.  \r
-    *\r
-    * Call DeleteText() for each pointer returned by GetText()\r
-    * to avoid possible memory leaks.\r
-    */\r
-   uchar* GetText(DocID i, DocID j) const {\r
-     return Text->GetText(i, j);\r
-   }\r
-\r
+    * ids i, i+1, ..., j. Texts are separated by '\0' character.  */\r
+   //   uchar* GetText(DocID i, DocID j) {\r
+   //  uchar * s = Text->GetText(i, j);\r
+   // return (s[0] == 1 ? (uchar*)"" : s);\r
+   //}\r
  \r
     TextCollection *getTextCollection() {\r
        return Text;\r
@@ -437,11 +477,29 @@ public:
        \r
     /** Load: loads XML tree data structure from file. sample_rate_text \r
      * indicates the sample rate for the text search data structure. */\r
-   static XMLTree *Load(int fd);   \r
+   static XMLTree *Load(int fd,bool load_tc, int sample_factor);   \r
  \r
     void insertTag(TagType tag, uint position);\r
     \r
     void print_stats();\r
+\r
+   \r
+   /** Parenthesis functions */\r
+   treeNode Closing(treeNode x);\r
+\r
+   bool IsOpen(treeNode x);\r
+\r
+\r
+   /** Print procedure */\r
+   void Print(int fd,treeNode x, bool no_text);\r
+   void Print(int fd,treeNode x) { Print(fd,x,false); }  /* two-argument overload: forwards with no_text = false */\r
+\r
  };\r
+\r
+extern "C" value caml_cpp_fast_first_element(value xmltree, value node);\r
+extern "C" value caml_cpp_fast_next_element(value xmltree, value node);\r
+\r
+\r
+\r
  #endif\r
  \r