1 /******************************************************************************
2 * Copyright (C) 2008 by Diego Arroyuelo *
3 * Interface for the in-memory XQuery/XPath engine *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
19 ******************************************************************************/
25 #include <unordered_set>
26 #include <unordered_map>
28 #include <TextCollection/TextCollectionBuilder.h>
35 #include <libcds/includes/basics.h>
36 #include <libcds/includes/static_bitsequence.h>
37 #include <libcds/includes/alphabet_mapper.h>
38 #include <libcds/includes/static_sequence.h>
40 using SXSI::TextCollection;
41 using SXSI::TextCollectionBuilder;
44 // this constant is used to efficiently compute the child operation in the tree
49 #define PERM_SAMPLE 10
61 // Encoding of the XML Document :
62 // The following TAGs and IDs are fixed, "" is the tag of the root.
63 // a TextNode is represented by a leaf <<$>></<$>> The DocId in the TextCollection
64 // of that leaf is kept in a bit sequence.
65 // a TextNode below an attribute is likewise represented by a leaf <<@$>><</@$>>
66 // An element <e a1="v1" a2="v2" ... an="vn" > ...</e> the representation is:
67 // <e><<@>> <<@>a1> <<$@>>DocID(v1)</<$@>></<@>a1> ... </<@>> .... </e>
68 // Hence the attributes (if any) are always below the first child of their element,
69 // as the children of a fake node <@>.
72 #define DOCUMENT_OPEN_TAG ""
73 #define DOCUMENT_TAG_ID 0
74 #define ATTRIBUTE_OPEN_TAG "<@>"
75 #define ATTRIBUTE_TAG_ID 1
76 #define PCDATA_OPEN_TAG "<$>"
77 #define PCDATA_TAG_ID 2
78 #define ATTRIBUTE_DATA_OPEN_TAG "<@$>"
79 #define ATTRIBUTE_DATA_TAG_ID 3
80 #define CLOSING_TAG "</>"
81 #define CLOSING_TAG_ID 4
82 #define DOCUMENT_CLOSE_TAG "/"
83 #define ATTRIBUTE_CLOSE_TAG "/<@>"
84 #define PCDATA_CLOSE_TAG "/<$>"
85 #define ATTRIBUTE_DATA_CLOSE_TAG "/<@$>"
88 typedef std::unordered_set<int> TagIdSet;
89 typedef std::unordered_map<std::string,int> TagIdMap;
90 typedef TagIdMap::const_iterator TagIdMapIT;
92 #define REGISTER_TAG(v,h,t) do { (h)->insert(std::make_pair((t),(v)->size()));\
93 (v)->push_back(t); } while (false)
95 // returns NULLT if the test is true
96 #define NULLT_IF(x) do { if (x) return NULLT; } while (0)
97 #define IS_NIL(x) ((x) < 0)
99 // Direct calls to sarray library
101 #define BUFFER_ALLOC (8192 * 2)
102 #define BUFFER_SIZE (BUFFER_ALLOC / 2)
104 // tag position -> tree node
105 static treeNode tagpos2node(int t)
109 // tree node -> tag position
110 static int node2tagpos(treeNode x)
116 class XMLTreeBuilder;
120 // Only the builder can access the constructor
121 friend class XMLTreeBuilder;
124 /** Balanced parentheses representation of the tree */
127 /** Mapping from tag identifer to tag name */
128 std::vector<std::string> *TagName;
131 /** Bit vector indicating with a 1 the positions of the non-empty texts. */
132 static_bitsequence *EBVector;
134 /** Tag sequence represented with a data structure for rank and select */
135 static_sequence *Tags;
137 uint tags_blen, tags_len;
139 /** The texts in the XML document */
140 TextCollection *Text;
142 // Allows to disable the TextCollection for benchmarkin purposes
144 SXSI::TextCollectionBuilder::index_type_t text_index_type;
147 std::vector<std::string *> *print_stack;
150 void _real_flush(int fd, size_t size) {
151 if (size == 0) return;
154 written = write(fd, buffer->data(), size);
155 if ((written < 0) && (errno == EAGAIN || errno == EINTR))
164 size_t size = buffer->size();
165 if (size < BUFFER_SIZE) return;
166 _real_flush(fd, size);
169 void _dput_str(std::string s, int fd){
174 void _dputs(const char* s, int fd){
179 void _dputc(const char c, int fd){
180 buffer->push_back(c);
183 size_t _dprintf(const char* s, int fd){
184 if (s == NULL) return 0;
187 for (i = 0; (c = s[i]); i++) {
190 _dputs(""", fd);
196 _dputs("'", fd);
212 void PrintNode(treeNode n, int fd);
213 /** Data structure constructors */
214 XMLTree(){ buffer = 0;};
216 // non const pointer are freed by this method.
217 XMLTree( pb * const par,
219 std::vector<std::string> * const TN,
220 TagIdMap * const tim, uint *empty_texts_bmp,
222 TextCollection * const TC, bool dis_tc,
223 TextCollectionBuilder::index_type_t _index_type );
226 /** Data structure destructor */
229 /** root(): returns the tree root. */
230 treeNode Root() { return 0; }
232 /** Size() : Number of parenthesis */
238 /** NumTags() : Number of distinct tags */
239 unsigned int NumTags() {
240 return TagName->size();
243 int TagsBinaryLength(){ return tags_blen; };
244 unsigned int TagStructLength(){ return uint_len(tags_blen,tags_len); };
245 unsigned int * TagStruct() { return tags_fix; };
248 /** SubtreeSize(x): the number of nodes (and attributes) in the subtree of
250 int SubtreeSize(treeNode x) { return bp_subtree_size(Par, x); }
252 /** SubtreeTags(x,tag): the number of occurrences of tag within the subtree
254 int SubtreeTags(treeNode x, TagType tag){
255 //int s = x + 2*subtree_size(Par, x) - 1;
256 treeNode y = bp_find_close(Par, x);
260 for(int i = x; i < y; i++)
261 count += (Tag(i) == tag);
265 return (Tags->rank(tag, y) - Tags->rank(tag, x));
268 /** SubtreeElements(x) of element nodes in the subtree of x
270 int SubtreeElements(treeNode x);
272 /** IsLeaf(x): returns whether node x is leaf or not. In the succinct
273 * representation this is just a bit inspection. */
275 bool IsLeaf(treeNode x);
277 /** IsAncestor(x,y): returns whether node x is ancestor of node y. */
279 bool IsAncestor(treeNode x, treeNode y);
282 /** IsRigthDescendant returns true if y is a descendant of x and y is
283 not a descendant of the first child of x */
284 bool IsRightDescendant(treeNode x, treeNode y) {
285 if (x <= Root()) return false;
286 treeNode z = bp_parent_close(Par, x);
287 treeNode c = bp_find_close(Par, x);
288 return (y > c && y < z );
292 /** IsChild(x,y): returns whether node x is parent of node y. */
293 bool IsChild(treeNode x, treeNode y);
295 /** IsFirstChild(x): returns whether node x is the first child of its parent. */
297 bool IsFirstChild(treeNode x) {
298 return ((x != NULLT)&&(x==Root() || bp_prev_sibling(Par,x) == (treeNode)-1));
301 /** NumChildren(x): number of children of node x. Constant time with the
302 * data structure of Sadakane. */
303 int NumChildren(treeNode x);
305 /** ChildNumber(x): returns i if node x is the i-th children of its
307 int ChildNumber(treeNode x);
309 /** Depth(x): depth of node x, a simple binary rank on the parentheses
311 int Depth(treeNode x);
313 /** Preorder(x): returns the preorder number of node x, just regarding tree
314 * nodes (and not texts). */
315 int Preorder(treeNode x);
317 /** Postorder(x): returns the postorder number of node x, just regarding
318 * tree nodes (and not texts). */
319 int Postorder(treeNode x);
322 /** DocIds(x): returns the range (i.e., a pair of integers) of document
323 * identifiers that descend from node x. */
324 range DocIds(treeNode x);
326 /** Parent(x): returns the parent node of node x. */
327 treeNode Parent(treeNode x) {
328 return (x == Root()) ? NULLT : bp_parent(Par, x);
331 treeNode BinaryParent(treeNode x){
335 treeNode prev = x - 1;
336 return (bp_inspect(Par, prev) == OP) ? prev : bp_find_open(Par, prev);
340 /* Assumes x is neither 0 nor -1 */
342 /** Child(x,i): returns the i-th child of node x, assuming it exists. */
343 treeNode Child(treeNode x, int i);
347 /** LastChild(x): returns the last child of node x. */
348 treeNode LastChild(treeNode x) {
349 NULLT_IF(x == NULLT || bp_isleaf(Par,x));
350 return bp_find_open(Par, bp_find_close(Par, x)-1);
353 /** PrevSibling(x): returns the previous sibling of node x, assuming it
356 treeNode PrevSibling(treeNode x)
359 return bp_prev_sibling(Par, x);
363 /** TaggedChild(x,tag): returns the first child of node x tagged tag, or
364 * NULLT if there is none. Because of the balanced-parentheses representation
365 * of the tree, this operation is not supported efficiently, just iterating
366 * among the children of node x until finding the desired child. */
369 treeNode SelectChild(treeNode x, TagIdSet * tags);
371 /** TaggedFollowingSibling(x,tag): returns the first sibling of node x tagged tag, or
372 * NULLT if there is none. */
374 treeNode SelectFollowingSibling(treeNode x, TagIdSet * tags);
379 treeNode SelectDescendant(treeNode x, TagIdSet * tags) {
380 NULLT_IF (x == NULLT);
382 if (bp_inspect(Par, fc) == CP) return NULLT;
383 treeNode min = NULLT;
386 TagIdSet::const_iterator tagit;
387 for (tagit = tags->begin(); tagit != tags->end(); ++tagit) {
388 aux = TaggedDescendant(x, (TagType) *tagit);
389 if (((unsigned int) aux) < ((unsigned int) min)) min = aux;
394 /** TaggedPrec(x,tag): returns the first node tagged tag with smaller
395 * preorder than x and not an ancestor of x. Returns NULLT if there
397 treeNode TaggedPreceding(treeNode x, TagType tag);
399 /** TaggedFoll(x,tag): returns the first node tagged tag with larger
400 * preorder than x and not in the subtree of x. Returns NULLT if there
402 treeNode TaggedFollowing(treeNode x, TagType tag);
406 treeNode SelectFollowingBelow(treeNode x, TagIdSet * tags, treeNode ancestor);
408 // treeNode TaggedFollowingBefore(treeNode x, TagType tag,treeNode closing);
410 treeNode SelectFollowingBefore(treeNode x, TagIdSet * tags, treeNode ancestor_closing)
415 treeNode close = bp_find_close(Par,x);
418 treeNode min = NULLT;
422 TagIdSet::const_iterator tagit;
423 for (tagit = tags->begin(); tagit != tags->end(); ++tagit) {
425 aux = tagpos2node(Tags->select_next(*tagit, close));
427 if (((unsigned int) aux) < ((unsigned int) min)) min = aux;
431 return (min < ancestor_closing) ? min : NULLT;
435 /** TaggedAncestor(x, tag): returns the closest ancestor of x tagged
436 * tag. Return NULLT is there is none. */
437 treeNode TaggedAncestor(treeNode x, TagType tag);
439 /** PrevText(x): returns the document identifier of the text to the left of
440 * node x, or NULLT if x is the root node. */
441 DocID PrevText(treeNode x);
443 /** NextText(x): returns the document identifier of the text to the right of
444 * node x, or NULLT if x is the root node. */
445 DocID NextText(treeNode x);
447 /** MyText(x): returns the document identifier of the text below node x, or
448 * NULLT if x is not a leaf node. */
449 DocID MyText(treeNode x);
450 DocID MyTextUnsafe(treeNode x);
452 /** TextXMLId(d): returns the preorder of document with identifier d in the
453 * tree consisting of all tree nodes and all text nodes. */
454 int TextXMLId(DocID d);
456 /** NodeXMLId(x): returns the preorder of node x in the tree consisting of
457 * all tree nodes and all text nodes. */
458 int NodeXMLId(treeNode x);
460 /** ParentNode(d): returns the parent node of document identifier d. */
461 treeNode ParentNode(DocID d);
463 treeNode PrevNode(DocID d);
465 /** GetTagId(tagname): returns the tag identifier corresponding to a given
466 * tag name. Returns NULLT in case that the tag name does not exists. */
467 TagType GetTagId(unsigned char *tagname);
469 /** GetTagName(tagid): returns the tag name of a given tag identifier.
470 * Returns NULL in case that the tag identifier is not valid.*/
471 unsigned char *GetTagName(TagType tagid);
473 /** GetTagName(tagid): returns the tag name of a given tag identifier.
474 * The result is just a reference and should not be freed by the caller.
476 const unsigned char *GetTagNameByRef(TagType tagid);
478 /** RegisterTag adds a new tag to the tag collection this is needed
479 * if the query contains a tag which is not in the document, we need
480 * to give this new tag a fresh id and store it somewhere. A logical
482 * We might want to use a hashtable instead of an array though.
484 TagType RegisterTag(unsigned char *tagname);
486 bool EmptyText(DocID i) {
487 return Text->EmptyText(i);
490 /** Prefix(s): search for texts prefixed by string s. */
491 TextCollection::document_result Prefix(uchar const *s) {
492 return Text->Prefix(s);
495 /** Suffix(s): search for texts having string s as a suffix. */
496 TextCollection::document_result Suffix(uchar const *s) {
497 return Text->Suffix(s);
500 /** Equal(s): search for texts equal to string s. */
501 TextCollection::document_result Equals(uchar const *s) {
502 return Text->Equal(s);
505 /** Contains(s): search for texts containing string s. */
506 TextCollection::document_result Contains(uchar const *s) {
507 return Text->Contains(s);
510 /** LessThan(s): returns document identifiers for the texts that
511 * are lexicographically smaller than string s. */
512 TextCollection::document_result LessThan(uchar const *s) {
513 return Text->LessThan(s);
516 /** IsPrefix(x): returns true if there is a text prefixed by string s. */
517 bool IsPrefix(uchar const *s) {
518 return Text->IsPrefix(s);
521 /** IsSuffix(s): returns true if there is a text having string s as a
523 bool IsSuffix(uchar const *s) {
524 return Text->IsSuffix(s);
527 /** IsEqual(s): returns true if there is a text that equals given
529 bool IsEqual(uchar const *s) {
530 return Text->IsEqual(s);
533 /** IsContains(s): returns true if there is a text containing string s. */
534 bool IsContains(uchar const *s) {
535 return Text->IsContains(s);
538 /** IsLessThan(s): returns true if there is at least a text that is
539 * lexicographically smaller than string s. */
540 bool IsLessThan(uchar const *s) {
541 return Text->IsLessThan(s);
544 /** Count(s): Global counting */
545 unsigned Count(uchar const *s) {
546 return Text->Count(s);
549 /** CountPrefix(s): counting version of Prefix(s). */
550 unsigned CountPrefix(uchar const *s) {
551 return Text->CountPrefix(s);
554 /** CountSuffix(s): counting version of Suffix(s). */
555 unsigned CountSuffix(uchar const *s) {
556 return Text->CountSuffix(s);
559 /** CountEqual(s): counting version of Equal(s). */
560 unsigned CountEqual(uchar const *s) {
561 return Text->CountEqual(s);
564 /** CountContains(s): counting version of Contains(s). */
565 unsigned CountContains(uchar const *s) {
566 return Text->CountContains(s);
569 /** CountLessThan(s): counting version of LessThan(s). */
570 unsigned CountLessThan(uchar const *s) {
571 return Text->CountLessThan(s);
574 /** GetText(d): returns the text corresponding to document with
576 uchar* GetText(DocID d) {
578 uchar * s = Text->GetText(d);
579 return (s[0] == 1 ? (s+1) : s);
582 /** GetText(i, j): returns the texts corresponding to documents with
583 * ids i, i+1, ..., j. Texts are separated by '\0' character. */
584 // uchar* GetText(DocID i, DocID j) {
585 // uchar * s = Text->GetText(i, j);
586 // return (s[0] == 1 ? (uchar*)"" : s);
589 TextCollection *getTextCollection() {
593 /** Save: saves XML tree data structure to file. */
594 void Save(int fd, char* name );
596 /** Load: loads XML tree data structure from file. sample_rate_text
597 * indicates the sample rate for the text search data structure. */
598 static XMLTree *Load(int fd, bool load_tc, int sample_factor, char * name);
600 void insertTag(TagType tag, uint position);
605 /** Parenthesis functions */
606 treeNode Closing(treeNode x);
608 bool IsOpen(treeNode x);
611 /** Print procedure */
612 void Print(int fd,treeNode x, bool no_text);
613 void Print(int fd,treeNode x) { Print(fd,x,false); }
614 void Flush(int fd){ if (buffer) _real_flush(fd, buffer->size()); }
616 // The following are inlined here for speed
617 /** Tag(x): returns the tag identifier of node x. */
619 inline TagType Tag(treeNode x) const throw () {
621 return (TagType) (((uchar*)tags_fix)[(int) x]);
623 return get_field(tags_fix, tags_blen, x);
626 size_t idxlen = x * tags_blen;
627 size_t j = idxlen % W;
628 size_t i = idxlen / W;
629 size_t offset = W - tags_blen;
630 size_t offset2 = offset - j;
631 size_t w = tags_fix[i];
632 return (offset2 >= 0)
633 ? ( w << offset2 ) >> offset
634 : ( w >> j) | (tags_fix[i+1] << (W+offset2)) >> offset;
639 /** FirstChild(x): returns the first child of node x, or NULLT if the node is a leaf
641 treeNode FirstChild(treeNode x) {
643 return bp_first_child(Par, x);
647 /** FirstElement(x): returns the first non text, non attribute child of node x, or NULLT
650 treeNode FirstElement(treeNode node){
652 NULLT_IF(node == NULLT);
653 treeNode x = bp_first_child(Par, node);
654 NULLT_IF(x == NULLT);
657 case ATTRIBUTE_TAG_ID:
658 x = bp_next_sibling(Par,x);
659 if (x == NULLT || Tag(x) != PCDATA_TAG_ID) return x;
663 return (bp_inspect(Par,x)==OP)? x : NULLT;
671 /** NextSibling(x): returns the next sibling of node x, or NULLT if none
674 treeNode NextSibling(treeNode x) {
676 return bp_next_sibling(Par, x);
679 /** NextElement(x): returns the first non text, non attribute sibling of node x, or NULLT
682 treeNode NextElement(treeNode x)
685 x = bp_next_sibling(Par, x);
686 NULLT_IF(x == NULLT);
687 if (Tag(x) == PCDATA_TAG_ID){
689 return (bp_inspect(Par, y) == OP) ? y : NULLT;
693 /** TaggedDesc(x,tag): returns the first node tagged tag with larger
694 * preorder than x and within the subtree of x. Returns NULT if there
696 inline treeNode TaggedNext(treeNode x, TagType tag)
698 return tagpos2node(Tags->select_next(tag,node2tagpos(x)));
700 inline treeNode TaggedDescendant(treeNode x, TagType tag)
703 int s = (int) Tags->select_next(tag,node2tagpos(x));
706 treeNode y = tagpos2node(s); // transforms the tag position into a node position
708 return (bp_is_ancestor(Par,x,y) ? y : NULLT);
711 inline treeNode TaggedFollowingBelow(treeNode x, TagType tag, treeNode ancestor)
713 treeNode close = bp_find_close(Par, x);
714 treeNode s = tagpos2node(Tags->select_next(tag, close));
716 if (ancestor == Root() || s == NULLT || s < bp_find_close(Par,ancestor)) return s;
720 inline treeNode TaggedFollowingBefore(treeNode x, TagType tag, treeNode ancestor_closing)
722 treeNode close = bp_find_close(Par, x);
723 treeNode s = tagpos2node(Tags->select_next(tag, close));
725 if (ancestor_closing == Root() || s == NULLT || s < ancestor_closing) return s;
729 inline treeNode NextNodeBefore(treeNode x, treeNode ancestor_closing)
731 treeNode close = bp_find_close(Par, x);
732 int rank = bp_rank_open(Par, close);
733 treeNode y = bp_select_open(Par, rank+1);
734 return (y < ancestor_closing) ? y : NULLT;
737 // TaggedSibling(x,tag): returns the first sibling of node x tagged tag, or NULLT if there is none.
738 treeNode TaggedFollowingSibling(treeNode x, TagType tag)
741 treeNode sibling = x;
743 while ((sibling = bp_next_sibling(Par, sibling)) != NULLT) {
745 if (ctag == tag) return sibling;
750 treeNode TaggedChild(treeNode x, TagType tag)
753 NULLT_IF(x==NULLT || bp_isleaf(Par,x));
755 child = bp_first_child(Par, x);
757 if (Tag(child) == tag)
760 return TaggedFollowingSibling(child, tag);