\r
#ifndef XMLTREE_H_\r
#define XMLTREE_H_\r
+#include <unordered_set>\r
+#include <unordered_map>\r
#include "TextCollection/TextCollectionBuilder.h"\r
#include <stdio.h>\r
#include <stdlib.h>\r
#undef Wminusone\r
\r
#include "bp.h"\r
-//#include "basics.h"\r
+\r
#include <static_bitsequence.h>\r
#include <alphabet_mapper.h>\r
#include <static_sequence.h>\r
int max;\r
} range;\r
\r
-typedef struct nd {\r
- uint position;\r
- struct nd *next;\r
-} ListNode;\r
+// Encoding of the XML document:\r
+// The following tags and IDs are fixed; "" is the tag of the root.\r
+// A TextNode is represented by a leaf <<$>></<$>>. The DocId in the TextCollection\r
+// of that leaf is kept in a bit sequence.\r
+// A TextNode below an attribute is likewise represented by a leaf <<@$>></<@$>>.\r
+// For an element <e a1="v1" a2="v2" ... an="vn" > ...</e> the representation is:\r
+// <e><<@>> <<@>a1> <<@$>>DocID(v1)</<@$>></<@>a1> ... </<@>> .... </e>\r
+// Hence the attributes (if any) are always below the first child of their element,\r
+// as the children of a fake node <@>.\r
\r
-typedef struct {\r
- ListNode *first;\r
- ListNode *last;\r
-} TagArrayEntry;\r
+\r
+#define DOCUMENT_OPEN_TAG ""\r
+#define DOCUMENT_TAG_ID 0\r
+#define ATTRIBUTE_OPEN_TAG "<@>"\r
+#define ATTRIBUTE_TAG_ID 1\r
+#define PCDATA_OPEN_TAG "<$>"\r
+#define PCDATA_TAG_ID 2\r
+#define ATTRIBUTE_DATA_OPEN_TAG "<@$>"\r
+#define ATTRIBUTE_DATA_TAG_ID 3\r
+#define DOCUMENT_CLOSE_TAG "/"\r
+#define ATTRIBUTE_CLOSE_TAG "/<@>"\r
+#define PCDATA_CLOSE_TAG "/<$>"\r
+#define ATTRIBUTE_DATA_CLOSE_TAG "/<@$>"\r
+\r
+\r
+\r
+typedef std::unordered_map<string,int> TagIdMap;\r
+typedef TagIdMap::const_iterator TagIdMapIT;\r
+\r
+#define REGISTER_TAG(v,h,t) do { (h)->insert(std::make_pair((t),(v)->size()));\\r
+ (v)->push_back(t); } while (false)\r
+\r
+\r
+class XMLTreeBuilder;\r
\r
class XMLTree {\r
+\r
+ // Only the builder can access the constructor\r
+ friend class XMLTreeBuilder;\r
+\r
+ private:\r
/** Balanced parentheses representation of the tree */\r
bp *Par;\r
\r
/** Mapping from tag identifier to tag name */ \r
- unsigned char **TagName;\r
- uint ntagnames;\r
+ vector<string> *TagName;\r
+ TagIdMap * tIdMap;\r
\r
- /** boolean flag indicating whether we are indexing empty texts or not */\r
- bool indexing_empty_texts; \r
- \r
/** Bit vector indicating with a 1 the positions of the non-empty texts. */\r
static_bitsequence *EBVector; \r
\r
TextCollection *Text;\r
\r
/** The texts in the XML document (cached for faster display) */\r
- vector<string> CachedText;\r
- \r
- TagArrayEntry *TagArray;\r
+ vector<string> *CachedText;\r
\r
// Allows disabling the TextCollection for benchmarking purposes\r
bool disable_tc;\r
\r
-public:\r
+\r
/** Data structure constructors */\r
- XMLTree() {;};\r
+ XMLTree(){;};\r
\r
- XMLTree(pb *par, uint npar, unsigned char **TN, uint ntagnames, uint *empty_texts_bmp, TagType *tags,\r
- TextCollection *TC, vector<string> CT, bool indexing_empty_t, bool dis_tc);\r
- \r
+ // Non-const pointer arguments (empty_texts_bmp, tags) are freed by this constructor.\r
+ XMLTree( pb * const par, uint npar, vector<string> * const TN, TagIdMap * const tim, uint *empty_texts_bmp, TagType *tags,\r
+ TextCollection * const TC, vector<string> * const CT, bool dis_tc);\r
+\r
+public: \r
/** Data structure destructor */\r
~XMLTree();\r
\r
/** FirstChild(x): returns the first child of node x, assuming it exists. \r
* Very fast in BP. */\r
treeNode FirstChild(treeNode x);\r
- \r
+ treeNode FirstElement(treeNode x);\r
+\r
/** LastChild(x): returns the last child of node x. */\r
treeNode LastChild(treeNode x);\r
\r
/** NextSibling(x): returns the next sibling of node x, assuming it \r
* exists. */\r
treeNode NextSibling(treeNode x);\r
+ treeNode NextElement(treeNode x);\r
\r
/** PrevSibling(x): returns the previous sibling of node x, assuming it \r
* exists. */\r
* among the children of node x until finding the desired child. */\r
treeNode TaggedChild(treeNode x, TagType tag);\r
\r
- treeNode SelectChild(treeNode x, TagType *tags, int ntags);\r
+ treeNode SelectChild(treeNode x, std::unordered_set<int> * tags);\r
\r
- /** TaggedSibling(x,tag): returns the first sibling of node x tagged tag, or \r
+ /** TaggedFollSibling(x,tag): returns the first sibling of node x tagged tag, or \r
* NULLT if there is none. */\r
- treeNode TaggedSibling(treeNode x, TagType tag);\r
+ treeNode TaggedFollSibling(treeNode x, TagType tag);\r
\r
- treeNode SelectSibling(treeNode x, TagType *tags, int ntags);\r
+ treeNode SelectFollSibling(treeNode x, std::unordered_set<int> * tags);\r
\r
/** TaggedDesc(x,tag): returns the first node tagged tag with larger \r
* preorder than x and within the subtree of x. Returns NULLT if there \r
* is none. */\r
treeNode TaggedDesc(treeNode x, TagType tag);\r
\r
- treeNode SelectDesc(treeNode x, TagType *tags, int ntags);\r
+ treeNode SelectDesc(treeNode x, std::unordered_set<int> * tags);\r
\r
- treeNode TaggedBelow(treeNode x, TagType *childtags, unsigned int ctlen,\r
- TagType *desctags, unsigned int dtlen);\r
- \r
- treeNode TaggedNext(treeNode x, TagType *childtags, unsigned int ctlen,\r
- TagType *folltags, unsigned int flen,treeNode root);\r
-\r
- treeNode TaggedDescOnly(treeNode x, TagType *desctags, unsigned int dtlen);\r
- \r
- treeNode TaggedDescOrFollOnly(treeNode x, TagType *folltags, unsigned int flen,\r
- treeNode root);\r
-\r
- treeNode TaggedFollOnly(treeNode x, TagType *folltags, unsigned int flen,\r
- treeNode root);\r
- \r
\r
/** TaggedPrec(x,tag): returns the first node tagged tag with smaller \r
* preorder than x and not an ancestor of x. Returns NULLT if there \r
\r
treeNode TaggedFollBelow(treeNode x, TagType tag,treeNode root); \r
\r
- treeNode SelectFollBelow(treeNode x, TagType *tags, int ntags, treeNode ctx);\r
+ treeNode SelectFollBelow(treeNode x, std::unordered_set<int> * tags, treeNode root);\r
\r
/** TaggedFollowingSibling(x,tag) */\r
treeNode TaggedFollowingSibling(treeNode x, TagType tag);\r
/** GetText(d): returns the text corresponding to document with\r
* id d. */\r
uchar* GetText(DocID d) {\r
- return Text->GetText(d);\r
+ uchar * s = Text->GetText(d);\r
+ return (s[0] == 1 ? (uchar*)"" : s);\r
}\r
\r
uchar* GetCachedText(DocID d) {\r
- uchar * str = (uchar*) calloc(sizeof(char),(CachedText.at(d).size() + 1));\r
- strcpy((char*) str,(const char*) CachedText.at(d).c_str());\r
+ uchar * str = (uchar*) calloc(sizeof(char),(CachedText->at(d).size() + 1));\r
+ strcpy((char*) str,(const char*) CachedText->at(d).c_str());\r
return (uchar*) (str);\r
}\r
\r
}\r
\r
/** Save: saves XML tree data structure to file. */\r
- void Save(unsigned char *filename);\r
+ void Save(int fd);\r
\r
/** Load: loads XML tree data structure from file. sample_rate_text \r
* indicates the sample rate for the text search data structure. */\r
- static XMLTree *Load(unsigned char *filename, int sample_rate_text); \r
+ static XMLTree *Load(int fd); \r
\r
void insertTag(TagType tag, uint position);\r
\r