1 /******************************************************************************
2 * Copyright (C) 2008 by Diego Arroyuelo *
3 * Interface for the in-memory XQuery/XPath engine *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
19 ******************************************************************************/
25 #include <unordered_set>
26 #include <unordered_map>
28 #include "TextCollection/TextCollectionBuilder.h"
35 #include <libcds/includes/basics.h>
36 #include <static_bitsequence.h>
37 #include <alphabet_mapper.h>
38 #include <static_sequence.h>
39 using SXSI::TextCollection;
40 using SXSI::TextCollectionBuilder;
43 // this constant is used to efficiently compute the child operation in the tree
48 #define PERM_SAMPLE 10
60 // Encoding of the XML Document :
61 // The following TAGs and IDs are fixed, "" is the tag of the root.
62 // a TextNode is represented by a leaf <<$>></<$>> The DocId in the TextCollection
63 // of that leaf is kept in a bit sequence.
64 // a TextNode below an attribute is likewise represented by a leaf <<@$>><</@$>>
65 // An element <e a1="v1" a2="v2" ... an="vn" > ...</e> the representation is:
66 // <e><<@>> <<@>a1> <<$@>>DocID(v1)</<$@>></<@>a1> ... </<@>> .... </e>
67 // Hence the attributes (if any) are always below the first child of their element,
68 // as the children of a fake node <@>.
71 #define DOCUMENT_OPEN_TAG ""
72 #define DOCUMENT_TAG_ID 0
73 #define ATTRIBUTE_OPEN_TAG "<@>"
74 #define ATTRIBUTE_TAG_ID 1
75 #define PCDATA_OPEN_TAG "<$>"
76 #define PCDATA_TAG_ID 2
77 #define ATTRIBUTE_DATA_OPEN_TAG "<@$>"
78 #define ATTRIBUTE_DATA_TAG_ID 3
79 #define CLOSING_TAG "</>"
80 #define CLOSING_TAG_ID 4
81 #define DOCUMENT_CLOSE_TAG "/"
82 #define ATTRIBUTE_CLOSE_TAG "/<@>"
83 #define PCDATA_CLOSE_TAG "/<$>"
84 #define ATTRIBUTE_DATA_CLOSE_TAG "/<@$>"
87 typedef std::unordered_set<int> TagIdSet;
88 typedef std::unordered_map<std::string,int> TagIdMap;
89 typedef TagIdMap::const_iterator TagIdMapIT;
91 #define REGISTER_TAG(v,h,t) do { (h)->insert(std::make_pair((t),(v)->size()));\
92 (v)->push_back(t); } while (false)
94 // returns NULLT if the test is true
95 #define NULLT_IF(x) do { if (x) return NULLT; } while (0)
97 // Direct calls to sarray library
99 static inline int fast_find_close(bp *b,int s)
101 return fwd_excess(b,s,-1);
104 static inline int fast_inspect(bp* Par,treeNode i)
109 return (Par->B[j] >> (D-1-l)) & 1;
112 static bool fast_isleaf(bp* Par,treeNode x){
113 return (fast_inspect(Par, x+1) == CP);
116 static treeNode fast_first_child(bp *Par, treeNode x)
119 return (fast_inspect(Par,x) == OP) ? x : NULLT;
122 inline static treeNode fast_next_sibling(bp* Par,treeNode x)
124 treeNode y = fast_find_close(Par,x)+1;
125 return (fast_inspect(Par, y) == OP) ? y : NULLT;
128 inline static bool fast_is_ancestor(bp * Par,treeNode x,treeNode y){
129 return (x <= y) && ((x==0) || (y <= fast_find_close(Par,x)));
132 // tag position -> tree node
133 static treeNode tagpos2node(int t)
137 // tree node -> tag position
138 static int node2tagpos(treeNode x)
144 class XMLTreeBuilder;
148 // Only the builder can access the constructor
149 friend class XMLTreeBuilder;
152 /** Balanced parentheses representation of the tree */
155 /** Mapping from tag identifer to tag name */
156 std::vector<std::string> *TagName;
159 /** Bit vector indicating with a 1 the positions of the non-empty texts. */
160 static_bitsequence *EBVector;
162 /** Tag sequence represented with a data structure for rank and select */
163 static_sequence *Tags;
165 uint tags_blen, tags_len;
167 /** The texts in the XML document */
168 TextCollection *Text;
170 // Allows to disable the TextCollection for benchmarkin purposes
175 std::string * buffer;
178 size_t size = buffer->size();
179 size_t written = write(fd, buffer->data(), size);
181 throw "Cannot flush buffer";
185 void _dputs(const char* s, int fd){
187 if (buffer->size() >= 131072) _flush(fd);
191 void _dputc(const char c, int fd){
193 if (buffer->size() >= 131072) _flush(fd);
196 size_t _dprintf(const char* s, int fd){
197 if (s == NULL) return 0;
202 buffer->append("&");
205 buffer->append("'");
208 buffer->append(""");
211 buffer->append("<");
214 buffer->append(">");
219 buffer->append(1, s[i]);
222 if (buffer->size() >= 131072) _flush(fd);
227 void PrintNode(treeNode n, int fd);
228 /** Data structure constructors */
229 XMLTree(){ buffer = 0;};
231 // non const pointer are freed by this method.
232 XMLTree( pb * const par, uint npar, std::vector<std::string> * const TN, TagIdMap * const tim, uint *empty_texts_bmp, TagType *tags,
233 TextCollection * const TC, bool dis_tc);
236 /** Data structure destructor */
239 /** root(): returns the tree root. */
240 treeNode Root() { return 0; }
242 /** Size() : Number of parenthesis */
248 /** NumTags() : Number of distinct tags */
249 unsigned int NumTags() {
250 return TagName->size();
253 int TagsBinaryLength(){ return tags_blen; };
254 unsigned int TagStructLength(){ return uint_len(tags_blen,tags_len); };
255 unsigned int * TagStruct() { return tags_fix; };
258 /** SubtreeSize(x): the number of nodes (and attributes) in the subtree of
260 int SubtreeSize(treeNode x) { return subtree_size(Par, x); }
262 /** SubtreeTags(x,tag): the number of occurrences of tag within the subtree
264 int SubtreeTags(treeNode x, TagType tag){
265 //int s = x + 2*subtree_size(Par, x) - 1;
266 treeNode y = fast_find_close(Par, x);
270 for(int i = x; i < y; i++)
271 count += (Tag(i) == tag);
275 return (Tags->rank(tag, y) - Tags->rank(tag, x));
278 /** SubtreeElements(x) of element nodes in the subtree of x
280 int SubtreeElements(treeNode x);
282 /** IsLeaf(x): returns whether node x is leaf or not. In the succinct
283 * representation this is just a bit inspection. */
285 bool IsLeaf(treeNode x);
287 /** IsAncestor(x,y): returns whether node x is ancestor of node y. */
289 bool IsAncestor(treeNode x, treeNode y);
291 /** IsChild(x,y): returns whether node x is parent of node y. */
292 bool IsChild(treeNode x, treeNode y);
294 /** IsFirstChild(x): returns whether node x is the first child of its parent. */
296 bool IsFirstChild(treeNode x) {
297 return ((x != NULLT)&&(x==Root() || prev_sibling(Par,x) == (treeNode)-1));
300 /** NumChildren(x): number of children of node x. Constant time with the
301 * data structure of Sadakane. */
302 int NumChildren(treeNode x);
304 /** ChildNumber(x): returns i if node x is the i-th children of its
306 int ChildNumber(treeNode x);
308 /** Depth(x): depth of node x, a simple binary rank on the parentheses
310 int Depth(treeNode x);
312 /** Preorder(x): returns the preorder number of node x, just regarding tree
313 * nodes (and not texts). */
314 int Preorder(treeNode x);
316 /** Postorder(x): returns the postorder number of node x, just regarding
317 * tree nodes (and not texts). */
318 int Postorder(treeNode x);
321 /** DocIds(x): returns the range (i.e., a pair of integers) of document
322 * identifiers that descend from node x. */
323 range DocIds(treeNode x);
325 /** Parent(x): returns the parent node of node x. */
326 treeNode Parent(treeNode x) {
330 return parent(Par, x);
332 /* Assumes x is neither 0 nor -1 */
334 /** Child(x,i): returns the i-th child of node x, assuming it exists. */
335 treeNode Child(treeNode x, int i);
339 /** LastChild(x): returns the last child of node x. */
340 treeNode LastChild(treeNode x);
344 /** PrevSibling(x): returns the previous sibling of node x, assuming it
347 treeNode PrevSibling(treeNode x);
349 /** TaggedChild(x,tag): returns the first child of node x tagged tag, or
350 * NULLT if there is none. Because of the balanced-parentheses representation
351 * of the tree, this operation is not supported efficiently, just iterating
352 * among the children of node x until finding the desired child. */
355 treeNode SelectChild(treeNode x, TagIdSet * tags);
357 /** TaggedFollowingSibling(x,tag): returns the first sibling of node x tagged tag, or
358 * NULLT if there is none. */
360 treeNode SelectFollowingSibling(treeNode x, TagIdSet * tags);
365 treeNode SelectDescendant(treeNode x, TagIdSet * tags) {
366 NULLT_IF (x == NULLT);
368 if (fast_inspect(Par, fc) == CP) return NULLT;
369 treeNode min = NULLT;
372 TagIdSet::const_iterator tagit;
373 for (tagit = tags->begin(); tagit != tags->end(); ++tagit) {
374 aux = TaggedDescendant(x, (TagType) *tagit);
375 if (((unsigned int) aux) < ((unsigned int) min)) min = aux;
380 /** TaggedPrec(x,tag): returns the first node tagged tag with smaller
381 * preorder than x and not an ancestor of x. Returns NULLT if there
383 treeNode TaggedPreceding(treeNode x, TagType tag);
385 /** TaggedFoll(x,tag): returns the first node tagged tag with larger
386 * preorder than x and not in the subtree of x. Returns NULLT if there
388 treeNode TaggedFollowing(treeNode x, TagType tag);
392 treeNode SelectFollowingBelow(treeNode x, TagIdSet * tags, treeNode ancestor);
394 // treeNode TaggedFollowingBefore(treeNode x, TagType tag,treeNode closing);
396 treeNode SelectFollowingBefore(treeNode x, TagIdSet * tags, treeNode ancestor_closing)
401 treeNode close = fast_find_close(Par,x);
404 treeNode min = NULLT;
408 TagIdSet::const_iterator tagit;
409 for (tagit = tags->begin(); tagit != tags->end(); ++tagit) {
411 aux = tagpos2node(Tags->select_next(*tagit, close));
413 if (((unsigned int) aux) < ((unsigned int) min)) min = aux;
417 return (min < ancestor_closing) ? min : NULLT;
421 /** TaggedAncestor(x, tag): returns the closest ancestor of x tagged
422 * tag. Return NULLT is there is none. */
423 treeNode TaggedAncestor(treeNode x, TagType tag);
425 /** PrevText(x): returns the document identifier of the text to the left of
426 * node x, or NULLT if x is the root node. */
427 DocID PrevText(treeNode x);
429 /** NextText(x): returns the document identifier of the text to the right of
430 * node x, or NULLT if x is the root node. */
431 DocID NextText(treeNode x);
433 /** MyText(x): returns the document identifier of the text below node x, or
434 * NULLT if x is not a leaf node. */
435 DocID MyText(treeNode x);
436 DocID MyTextUnsafe(treeNode x);
438 /** TextXMLId(d): returns the preorder of document with identifier d in the
439 * tree consisting of all tree nodes and all text nodes. */
440 int TextXMLId(DocID d);
442 /** NodeXMLId(x): returns the preorder of node x in the tree consisting of
443 * all tree nodes and all text nodes. */
444 int NodeXMLId(treeNode x);
446 /** ParentNode(d): returns the parent node of document identifier d. */
447 treeNode ParentNode(DocID d);
449 treeNode PrevNode(DocID d);
451 /** GetTagId(tagname): returns the tag identifier corresponding to a given
452 * tag name. Returns NULLT in case that the tag name does not exists. */
453 TagType GetTagId(unsigned char *tagname);
455 /** GetTagName(tagid): returns the tag name of a given tag identifier.
456 * Returns NULL in case that the tag identifier is not valid.*/
457 unsigned char *GetTagName(TagType tagid);
459 /** GetTagName(tagid): returns the tag name of a given tag identifier.
460 * The result is just a reference and should not be freed by the caller.
462 const unsigned char *GetTagNameByRef(TagType tagid);
464 /** RegisterTag adds a new tag to the tag collection this is needed
465 * if the query contains a tag which is not in the document, we need
466 * to give this new tag a fresh id and store it somewhere. A logical
468 * We might want to use a hashtable instead of an array though.
470 TagType RegisterTag(unsigned char *tagname);
472 bool EmptyText(DocID i) {
473 return Text->EmptyText(i);
476 /** Prefix(s): search for texts prefixed by string s. */
477 TextCollection::document_result Prefix(uchar const *s) {
478 return Text->Prefix(s);
481 /** Suffix(s): search for texts having string s as a suffix. */
482 TextCollection::document_result Suffix(uchar const *s) {
483 return Text->Suffix(s);
486 /** Equal(s): search for texts equal to string s. */
487 TextCollection::document_result Equals(uchar const *s) {
488 return Text->Equal(s);
491 /** Contains(s): search for texts containing string s. */
492 TextCollection::document_result Contains(uchar const *s) {
493 return Text->Contains(s);
496 /** LessThan(s): returns document identifiers for the texts that
497 * are lexicographically smaller than string s. */
498 TextCollection::document_result LessThan(uchar const *s) {
499 return Text->LessThan(s);
502 /** IsPrefix(x): returns true if there is a text prefixed by string s. */
503 bool IsPrefix(uchar const *s) {
504 return Text->IsPrefix(s);
507 /** IsSuffix(s): returns true if there is a text having string s as a
509 bool IsSuffix(uchar const *s) {
510 return Text->IsSuffix(s);
513 /** IsEqual(s): returns true if there is a text that equals given
515 bool IsEqual(uchar const *s) {
516 return Text->IsEqual(s);
519 /** IsContains(s): returns true if there is a text containing string s. */
520 bool IsContains(uchar const *s) {
521 return Text->IsContains(s);
524 /** IsLessThan(s): returns true if there is at least a text that is
525 * lexicographically smaller than string s. */
526 bool IsLessThan(uchar const *s) {
527 return Text->IsLessThan(s);
530 /** Count(s): Global counting */
531 unsigned Count(uchar const *s) {
532 return Text->Count(s);
535 /** CountPrefix(s): counting version of Prefix(s). */
536 unsigned CountPrefix(uchar const *s) {
537 return Text->CountPrefix(s);
540 /** CountSuffix(s): counting version of Suffix(s). */
541 unsigned CountSuffix(uchar const *s) {
542 return Text->CountSuffix(s);
545 /** CountEqual(s): counting version of Equal(s). */
546 unsigned CountEqual(uchar const *s) {
547 return Text->CountEqual(s);
550 /** CountContains(s): counting version of Contains(s). */
551 unsigned CountContains(uchar const *s) {
552 return Text->CountContains(s);
555 /** CountLessThan(s): counting version of LessThan(s). */
556 unsigned CountLessThan(uchar const *s) {
557 return Text->CountLessThan(s);
560 /** GetText(d): returns the text corresponding to document with
562 uchar* GetText(DocID d) {
564 uchar * s = Text->GetText(d);
565 return (s[0] == 1 ? (s+1) : s);
568 /** GetText(i, j): returns the texts corresponding to documents with
569 * ids i, i+1, ..., j. Texts are separated by '\0' character. */
570 // uchar* GetText(DocID i, DocID j) {
571 // uchar * s = Text->GetText(i, j);
572 // return (s[0] == 1 ? (uchar*)"" : s);
575 TextCollection *getTextCollection() {
579 /** Save: saves XML tree data structure to file. */
580 void Save(int fd, char *filename);
582 /** Load: loads XML tree data structure from file. sample_rate_text
583 * indicates the sample rate for the text search data structure. */
584 static XMLTree *Load(int fd, char *filename, bool load_tc, int sample_factor);
586 void insertTag(TagType tag, uint position);
591 /** Parenthesis functions */
592 treeNode Closing(treeNode x);
594 bool IsOpen(treeNode x);
597 /** Print procedure */
598 void Print(int fd,treeNode x, bool no_text);
599 void Print(int fd,treeNode x) { Print(fd,x,false); }
601 // The following are inlined here for speed
602 /** Tag(x): returns the tag identifier of node x. */
604 inline TagType Tag(treeNode x) const throw () {
606 return (TagType) (((uchar*)tags_fix)[(int) x]);
608 return get_field(tags_fix, tags_blen, x);
611 size_t idxlen = x * tags_blen;
612 size_t j = idxlen % W;
613 size_t i = idxlen / W;
614 size_t offset = W - tags_blen;
615 size_t offset2 = offset - j;
616 size_t w = tags_fix[i];
617 return (offset2 >= 0)
618 ? ( w << offset2 ) >> offset
619 : ( w >> j) | (tags_fix[i+1] << (W+offset2)) >> offset;
624 /** FirstChild(x): returns the first child of node x, or NULLT if the node is a leaf
626 treeNode FirstChild(treeNode x) {
628 return fast_first_child(Par, x);
632 /** FirstElement(x): returns the first non text, non attribute child of node x, or NULLT
635 treeNode FirstElement(treeNode node){
637 NULLT_IF(node == NULLT);
638 treeNode x = fast_first_child(Par, node);
639 NULLT_IF(x == NULLT);
642 case ATTRIBUTE_TAG_ID:
643 x = fast_next_sibling(Par,x);
644 if (x == NULLT || Tag(x) != PCDATA_TAG_ID) return x;
648 return (fast_inspect(Par,x)==OP)? x : NULLT;
656 /** NextSibling(x): returns the next sibling of node x, or NULLT if none
659 treeNode NextSibling(treeNode x) {
661 return fast_next_sibling(Par, x);
664 /** NextElement(x): returns the first non text, non attribute sibling of node x, or NULLT
667 treeNode NextElement(treeNode x)
670 x = fast_next_sibling(Par, x);
671 NULLT_IF(x == NULLT);
672 if (Tag(x) == PCDATA_TAG_ID){
674 return (fast_inspect(Par, y) == OP) ? y : NULLT;
678 /** TaggedDesc(x,tag): returns the first node tagged tag with larger
679 * preorder than x and within the subtree of x. Returns NULT if there
681 inline treeNode TaggedDescendant(treeNode x, TagType tag)
684 int s = (int) Tags->select_next(tag,node2tagpos(x));
687 treeNode y = tagpos2node(s); // transforms the tag position into a node position
689 return (fast_is_ancestor(Par,x,y) ? y : NULLT);
692 inline treeNode TaggedFollowingBelow(treeNode x, TagType tag, treeNode ancestor)
694 treeNode close = fast_find_close(Par, x);
695 treeNode s = tagpos2node(Tags->select_next(tag, close));
697 if (ancestor == Root() || s == NULLT || s < fast_find_close(Par,ancestor)) return s;
701 inline treeNode TaggedFollowingBefore(treeNode x, TagType tag, treeNode ancestor_closing)
703 treeNode close = fast_find_close(Par, x);
704 treeNode s = tagpos2node(Tags->select_next(tag, close));
706 if (ancestor_closing == Root() || s == NULLT || s < ancestor_closing) return s;
710 // TaggedSibling(x,tag): returns the first sibling of node x tagged tag, or NULLT if there is none.
711 treeNode TaggedFollowingSibling(treeNode x, TagType tag)
714 treeNode sibling = x;
716 while ((sibling = fast_next_sibling(Par, sibling)) != NULLT) {
718 if (ctag == tag) return sibling;
723 treeNode TaggedChild(treeNode x, TagType tag)
726 NULLT_IF(x==NULLT || fast_isleaf(Par,x));
728 child = fast_first_child(Par, x);
730 if (Tag(child) == tag)
733 return TaggedFollowingSibling(child, tag);