1 /******************************************************************************
\r
2 * Copyright (C) 2008 by Diego Arroyuelo *
\r
3 * Interface for the in-memory XQuery/XPath engine *
\r
5 * This program is free software; you can redistribute it and/or modify *
\r
6 * it under the terms of the GNU Lesser General Public License as published *
\r
7 * by the Free Software Foundation; either version 2 of the License, or *
\r
8 * (at your option) any later version. *
\r
10 * This program is distributed in the hope that it will be useful, *
\r
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
\r
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
\r
13 * GNU Lesser General Public License for more details. *
\r
15 * You should have received a copy of the GNU Lesser General Public License *
\r
16 * along with this program; if not, write to the *
\r
17 * Free Software Foundation, Inc., *
\r
18 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
\r
19 ******************************************************************************/
\r
25 #include <unordered_set>
\r
26 #include <unordered_map>
\r
28 #include "TextCollection/TextCollectionBuilder.h"
\r
35 #include <libcds/includes/basics.h>
\r
36 #include <static_bitsequence.h>
\r
37 #include <alphabet_mapper.h>
\r
38 #include <static_sequence.h>
\r
39 using SXSI::TextCollection;
\r
40 using SXSI::TextCollectionBuilder;
\r
43 // this constant is used to efficiently compute the child operation in the tree
\r
48 #define PERM_SAMPLE 10
\r
51 typedef int treeNode;
\r
52 typedef int TagType;
\r
60 // Encoding of the XML Document :
\r
61 // The following TAGs and IDs are fixed, "" is the tag of the root.
\r
62 // a TextNode is represented by a leaf <<$>></<$>> The DocId in the TextCollection
\r
63 // of that leaf is kept in a bit sequence.
\r
64 // a TextNode below an attribute is likewise represented by a leaf <<@$>><</@$>>
\r
65 // An element <e a1="v1" a2="v2" ... an="vn" > ...</e> the representation is:
\r
66 // <e><<@>> <<@>a1> <<$@>>DocID(v1)</<$@>></<@>a1> ... </<@>> .... </e>
\r
67 // Hence the attributes (if any) are always below the first child of their element,
\r
68 // as the children of a fake node <@>.
\r
71 #define DOCUMENT_OPEN_TAG ""
\r
72 #define DOCUMENT_TAG_ID 0
\r
73 #define ATTRIBUTE_OPEN_TAG "<@>"
\r
74 #define ATTRIBUTE_TAG_ID 1
\r
75 #define PCDATA_OPEN_TAG "<$>"
\r
76 #define PCDATA_TAG_ID 2
\r
77 #define ATTRIBUTE_DATA_OPEN_TAG "<@$>"
\r
78 #define ATTRIBUTE_DATA_TAG_ID 3
\r
79 #define CLOSING_TAG "</>"
\r
80 #define CLOSING_TAG_ID 4
\r
81 #define DOCUMENT_CLOSE_TAG "/"
\r
82 #define ATTRIBUTE_CLOSE_TAG "/<@>"
\r
83 #define PCDATA_CLOSE_TAG "/<$>"
\r
84 #define ATTRIBUTE_DATA_CLOSE_TAG "/<@$>"
\r
87 typedef std::unordered_set<int> TagIdSet;
\r
88 typedef std::unordered_map<std::string,int> TagIdMap;
\r
89 typedef TagIdMap::const_iterator TagIdMapIT;
\r
91 #define REGISTER_TAG(v,h,t) do { (h)->insert(std::make_pair((t),(v)->size()));\
\r
92 (v)->push_back(t); } while (false)
\r
94 // returns NULLT if the test is true
\r
95 #define NULLT_IF(x) do { if (x) return NULLT; } while (0)
\r
97 // Direct calls to sarray library
\r
99 static inline int fast_find_close(bp *b,int s)
\r
101 return fwd_excess(b,s,-1);
\r
104 static inline int fast_inspect(bp* Par,treeNode i)
\r
109 return (Par->B[j] >> (D-1-l)) & 1;
\r
112 static treeNode fast_first_child(bp *Par, treeNode x)
\r
115 return (fast_inspect(Par,x) == OP) ? x : NULLT;
\r
118 inline static treeNode fast_next_sibling(bp* Par,treeNode x)
\r
120 treeNode y = fast_find_close(Par,x)+1;
\r
121 return (fast_inspect(Par, y) == OP) ? y : NULLT;
\r
124 inline static bool fast_is_ancestor(bp * Par,treeNode x,treeNode y){
\r
125 return (x <= y) && ((x==0) || (y <= fast_find_close(Par,x)));
\r
128 // tag position -> tree node
\r
129 static treeNode tagpos2node(int t)
\r
131 return (treeNode) t;
\r
133 // tree node -> tag position
\r
134 static int node2tagpos(treeNode x)
\r
140 class XMLTreeBuilder;
\r
144 // Only the builder can access the constructor
\r
145 friend class XMLTreeBuilder;
\r
148 /** Balanced parentheses representation of the tree */
\r
151 /** Mapping from tag identifer to tag name */
\r
152 std::vector<std::string> *TagName;
\r
155 /** Bit vector indicating with a 1 the positions of the non-empty texts. */
\r
156 static_bitsequence *EBVector;
\r
158 /** Tag sequence represented with a data structure for rank and select */
\r
159 static_sequence *Tags;
\r
161 uint tags_blen, tags_len;
\r
163 /** The texts in the XML document */
\r
164 TextCollection *Text;
\r
166 // Allows to disable the TextCollection for benchmarkin purposes
\r
171 std::string * buffer;
\r
172 void myfputs(const char* s, FILE * fp){
\r
174 if (buffer->size() >= 100000){
\r
175 fputs(buffer->c_str(),fp);
\r
180 void myfputc(const char c, FILE*fp){
\r
181 buffer->append(1,c);
\r
182 if (buffer->size() >= 100000){
\r
183 fputs(buffer->c_str(),fp);
\r
187 void mybufferflush(FILE* fp){
\r
188 fputs(buffer->c_str(), fp);
\r
192 size_t myfprintf(const char* s, FILE * fp){
\r
195 size_t i = buffer->size();
\r
197 size_t j = buffer->size();
\r
198 if (buffer->size() >= 100000){
\r
199 fputs(buffer->c_str(),fp);
\r
205 void PrintNode(treeNode n, int fd);
\r
206 /** Data structure constructors */
\r
207 XMLTree(){ buffer = 0;};
\r
209 // non const pointer are freed by this method.
\r
210 XMLTree( pb * const par, uint npar, std::vector<std::string> * const TN, TagIdMap * const tim, uint *empty_texts_bmp, TagType *tags,
\r
211 TextCollection * const TC, bool dis_tc);
\r
214 /** Data structure destructor */
\r
217 /** root(): returns the tree root. */
\r
218 treeNode Root() { return 0; }
\r
220 /** Size() : Number of parenthesis */
\r
221 unsigned int Size(){
\r
226 /** NumTags() : Number of distinct tags */
\r
227 unsigned int NumTags() {
\r
228 return TagName->size();
\r
231 int TagsBinaryLength(){ return tags_blen; };
\r
232 unsigned int TagStructLength(){ return uint_len(tags_blen,tags_len); };
\r
233 unsigned int * TagStruct() { return tags_fix; };
\r
236 /** SubtreeSize(x): the number of nodes (and attributes) in the subtree of
\r
238 int SubtreeSize(treeNode x);
\r
240 /** SubtreeTags(x,tag): the number of occurrences of tag within the subtree
\r
242 int SubtreeTags(treeNode x, TagType tag);
\r
244 /** SubtreeElements(x) of element nodes in the subtree of x
\r
246 int SubtreeElements(treeNode x);
\r
248 /** IsLeaf(x): returns whether node x is leaf or not. In the succinct
\r
249 * representation this is just a bit inspection. */
\r
251 bool IsLeaf(treeNode x);
\r
253 /** IsAncestor(x,y): returns whether node x is ancestor of node y. */
\r
255 bool IsAncestor(treeNode x, treeNode y);
\r
257 /** IsChild(x,y): returns whether node x is parent of node y. */
\r
258 bool IsChild(treeNode x, treeNode y);
\r
260 /** IsFirstChild(x): returns whether node x is the first child of its parent. */
\r
262 bool IsFirstChild(treeNode x) {
\r
263 return ((x != NULLT)&&(x==Root() || prev_sibling(Par,x) == (treeNode)-1));
\r
266 /** NumChildren(x): number of children of node x. Constant time with the
\r
267 * data structure of Sadakane. */
\r
268 int NumChildren(treeNode x);
\r
270 /** ChildNumber(x): returns i if node x is the i-th children of its
\r
272 int ChildNumber(treeNode x);
\r
274 /** Depth(x): depth of node x, a simple binary rank on the parentheses
\r
276 int Depth(treeNode x);
\r
278 /** Preorder(x): returns the preorder number of node x, just regarding tree
\r
279 * nodes (and not texts). */
\r
280 int Preorder(treeNode x);
\r
282 /** Postorder(x): returns the postorder number of node x, just regarding
\r
283 * tree nodes (and not texts). */
\r
284 int Postorder(treeNode x);
\r
287 /** DocIds(x): returns the range (i.e., a pair of integers) of document
\r
288 * identifiers that descend from node x. */
\r
289 range DocIds(treeNode x);
\r
291 /** Parent(x): returns the parent node of node x. */
\r
292 treeNode Parent(treeNode x) {
\r
296 return parent(Par, x);
\r
298 /* Assumes x is neither 0 nor -1 */
\r
300 /** Child(x,i): returns the i-th child of node x, assuming it exists. */
\r
301 treeNode Child(treeNode x, int i);
\r
305 /** LastChild(x): returns the last child of node x. */
\r
306 treeNode LastChild(treeNode x);
\r
310 /** PrevSibling(x): returns the previous sibling of node x, assuming it
\r
313 treeNode PrevSibling(treeNode x);
\r
315 /** TaggedChild(x,tag): returns the first child of node x tagged tag, or
\r
316 * NULLT if there is none. Because of the balanced-parentheses representation
\r
317 * of the tree, this operation is not supported efficiently, just iterating
\r
318 * among the children of node x until finding the desired child. */
\r
319 treeNode TaggedChild(treeNode x, TagType tag);
\r
321 treeNode SelectChild(treeNode x, TagIdSet * tags);
\r
323 /** TaggedFollowingSibling(x,tag): returns the first sibling of node x tagged tag, or
\r
324 * NULLT if there is none. */
\r
325 treeNode TaggedFollowingSibling(treeNode x, TagType tag);
\r
327 treeNode SelectFollowingSibling(treeNode x, TagIdSet * tags);
\r
332 treeNode SelectDescendant(treeNode x, TagIdSet * tags);
\r
334 /** TaggedPrec(x,tag): returns the first node tagged tag with smaller
\r
335 * preorder than x and not an ancestor of x. Returns NULLT if there
\r
337 treeNode TaggedPreceding(treeNode x, TagType tag);
\r
339 /** TaggedFoll(x,tag): returns the first node tagged tag with larger
\r
340 * preorder than x and not in the subtree of x. Returns NULLT if there
\r
342 treeNode TaggedFollowing(treeNode x, TagType tag);
\r
346 treeNode SelectFollowingBelow(treeNode x, TagIdSet * tags, treeNode ancestor);
\r
348 // treeNode TaggedFollowingBefore(treeNode x, TagType tag,treeNode closing);
\r
350 treeNode SelectFollowingBefore(treeNode x, TagIdSet * tags, treeNode closing);
\r
352 /** TaggedAncestor(x, tag): returns the closest ancestor of x tagged
\r
353 * tag. Return NULLT is there is none. */
\r
354 treeNode TaggedAncestor(treeNode x, TagType tag);
\r
356 /** PrevText(x): returns the document identifier of the text to the left of
\r
357 * node x, or NULLT if x is the root node. */
\r
358 DocID PrevText(treeNode x);
\r
360 /** NextText(x): returns the document identifier of the text to the right of
\r
361 * node x, or NULLT if x is the root node. */
\r
362 DocID NextText(treeNode x);
\r
364 /** MyText(x): returns the document identifier of the text below node x, or
\r
365 * NULLT if x is not a leaf node. */
\r
366 DocID MyText(treeNode x);
\r
367 DocID MyTextUnsafe(treeNode x);
\r
369 /** TextXMLId(d): returns the preorder of document with identifier d in the
\r
370 * tree consisting of all tree nodes and all text nodes. */
\r
371 int TextXMLId(DocID d);
\r
373 /** NodeXMLId(x): returns the preorder of node x in the tree consisting of
\r
374 * all tree nodes and all text nodes. */
\r
375 int NodeXMLId(treeNode x);
\r
377 /** ParentNode(d): returns the parent node of document identifier d. */
\r
378 treeNode ParentNode(DocID d);
\r
380 treeNode PrevNode(DocID d);
\r
382 /** GetTagId(tagname): returns the tag identifier corresponding to a given
\r
383 * tag name. Returns NULLT in case that the tag name does not exists. */
\r
384 TagType GetTagId(unsigned char *tagname);
\r
386 /** GetTagName(tagid): returns the tag name of a given tag identifier.
\r
387 * Returns NULL in case that the tag identifier is not valid.*/
\r
388 unsigned char *GetTagName(TagType tagid);
\r
390 /** GetTagName(tagid): returns the tag name of a given tag identifier.
\r
391 * The result is just a reference and should not be freed by the caller.
\r
393 const unsigned char *GetTagNameByRef(TagType tagid);
\r
395 /** RegisterTag adds a new tag to the tag collection this is needed
\r
396 * if the query contains a tag which is not in the document, we need
\r
397 * to give this new tag a fresh id and store it somewhere. A logical
\r
399 * We might want to use a hashtable instead of an array though.
\r
401 TagType RegisterTag(unsigned char *tagname);
\r
403 bool EmptyText(DocID i) {
\r
404 return Text->EmptyText(i);
\r
407 /** Prefix(s): search for texts prefixed by string s. */
\r
408 TextCollection::document_result Prefix(uchar const *s) {
\r
409 return Text->Prefix(s);
\r
412 /** Suffix(s): search for texts having string s as a suffix. */
\r
413 TextCollection::document_result Suffix(uchar const *s) {
\r
414 return Text->Suffix(s);
\r
417 /** Equal(s): search for texts equal to string s. */
\r
418 TextCollection::document_result Equals(uchar const *s) {
\r
419 return Text->Equal(s);
\r
422 /** Contains(s): search for texts containing string s. */
\r
423 TextCollection::document_result Contains(uchar const *s) {
\r
424 return Text->Contains(s);
\r
427 /** LessThan(s): returns document identifiers for the texts that
\r
428 * are lexicographically smaller than string s. */
\r
429 TextCollection::document_result LessThan(uchar const *s) {
\r
430 return Text->LessThan(s);
\r
433 /** IsPrefix(x): returns true if there is a text prefixed by string s. */
\r
434 bool IsPrefix(uchar const *s) {
\r
435 return Text->IsPrefix(s);
\r
438 /** IsSuffix(s): returns true if there is a text having string s as a
\r
440 bool IsSuffix(uchar const *s) {
\r
441 return Text->IsSuffix(s);
\r
444 /** IsEqual(s): returns true if there is a text that equals given
\r
446 bool IsEqual(uchar const *s) {
\r
447 return Text->IsEqual(s);
\r
450 /** IsContains(s): returns true if there is a text containing string s. */
\r
451 bool IsContains(uchar const *s) {
\r
452 return Text->IsContains(s);
\r
455 /** IsLessThan(s): returns true if there is at least a text that is
\r
456 * lexicographically smaller than string s. */
\r
457 bool IsLessThan(uchar const *s) {
\r
458 return Text->IsLessThan(s);
\r
461 /** Count(s): Global counting */
\r
462 unsigned Count(uchar const *s) {
\r
463 return Text->Count(s);
\r
466 /** CountPrefix(s): counting version of Prefix(s). */
\r
467 unsigned CountPrefix(uchar const *s) {
\r
468 return Text->CountPrefix(s);
\r
471 /** CountSuffix(s): counting version of Suffix(s). */
\r
472 unsigned CountSuffix(uchar const *s) {
\r
473 return Text->CountSuffix(s);
\r
476 /** CountEqual(s): counting version of Equal(s). */
\r
477 unsigned CountEqual(uchar const *s) {
\r
478 return Text->CountEqual(s);
\r
481 /** CountContains(s): counting version of Contains(s). */
\r
482 unsigned CountContains(uchar const *s) {
\r
483 return Text->CountContains(s);
\r
486 /** CountLessThan(s): counting version of LessThan(s). */
\r
487 unsigned CountLessThan(uchar const *s) {
\r
488 return Text->CountLessThan(s);
\r
491 /** GetText(d): returns the text corresponding to document with
\r
493 uchar* GetText(DocID d) {
\r
495 uchar * s = Text->GetText(d);
\r
496 return (s[0] == 1 ? (s+1) : s);
\r
499 /** GetText(i, j): returns the texts corresponding to documents with
\r
500 * ids i, i+1, ..., j. Texts are separated by '\0' character. */
\r
501 // uchar* GetText(DocID i, DocID j) {
\r
502 // uchar * s = Text->GetText(i, j);
\r
503 // return (s[0] == 1 ? (uchar*)"" : s);
\r
506 TextCollection *getTextCollection() {
\r
510 /** Save: saves XML tree data structure to file. */
\r
511 void Save(int fd, char *filename);
\r
513 /** Load: loads XML tree data structure from file. sample_rate_text
\r
514 * indicates the sample rate for the text search data structure. */
\r
515 static XMLTree *Load(int fd, char *filename, bool load_tc, int sample_factor);
\r
517 void insertTag(TagType tag, uint position);
\r
519 void print_stats();
\r
522 /** Parenthesis functions */
\r
523 treeNode Closing(treeNode x);
\r
525 bool IsOpen(treeNode x);
\r
528 /** Print procedure */
\r
529 void Print(int fd,treeNode x, bool no_text);
\r
530 void Print(int fd,treeNode x) { Print(fd,x,false); }
\r
532 // The following are inlined here for speed
\r
533 /** Tag(x): returns the tag identifier of node x. */
\r
535 inline TagType Tag(treeNode x) const throw () {
\r
536 if (tags_blen == 8)
\r
537 return (TagType) (((uchar*)tags_fix)[(int) x]);
\r
539 return get_field(tags_fix, tags_blen, x);
\r
542 size_t idxlen = x * tags_blen;
\r
543 size_t j = idxlen % W;
\r
544 size_t i = idxlen / W;
\r
545 size_t offset = W - tags_blen;
\r
546 size_t offset2 = offset - j;
\r
547 size_t w = tags_fix[i];
\r
548 return (offset2 >= 0)
\r
549 ? ( w << offset2 ) >> offset
\r
550 : ( w >> j) | (tags_fix[i+1] << (W+offset2)) >> offset;
\r
555 /** FirstChild(x): returns the first child of node x, or NULLT if the node is a leaf
\r
557 treeNode FirstChild(treeNode x) {
\r
558 NULLT_IF(x==NULLT);
\r
559 return fast_first_child(Par, x);
\r
563 /** FirstElement(x): returns the first non text, non attribute child of node x, or NULLT
\r
566 treeNode FirstElement(treeNode x){
\r
568 NULLT_IF(x==NULLT);
\r
569 x = fast_first_child(Par, x);
\r
570 NULLT_IF(x == NULLT);
\r
573 case PCDATA_TAG_ID:
\r
575 return (fast_inspect(Par,x)==OP)? x : NULLT;
\r
577 case ATTRIBUTE_TAG_ID:
\r
578 x = fast_next_sibling(Par,x);
\r
579 if (x != NULLT && Tag(x) == PCDATA_TAG_ID){
\r
581 return (fast_inspect(Par,x)==OP)? x : NULLT;
\r
590 /** NextSibling(x): returns the next sibling of node x, or NULLT if none
\r
593 treeNode NextSibling(treeNode x) {
\r
595 return fast_next_sibling(Par, x);
\r
598 /** NextElement(x): returns the first non text, non attribute sibling of node x, or NULLT
\r
601 treeNode NextElement(treeNode x)
\r
604 x = fast_next_sibling(Par, x);
\r
605 NULLT_IF(x == NULLT);
\r
606 if (Tag(x) == PCDATA_TAG_ID){
\r
608 return (fast_inspect(Par,x)==OP)? x : NULLT;
\r
612 /** TaggedDesc(x,tag): returns the first node tagged tag with larger
\r
613 * preorder than x and within the subtree of x. Returns NULT if there
\r
615 inline treeNode TaggedDescendant(treeNode x, TagType tag)
\r
618 int s = (int) Tags->select_next(tag,node2tagpos(x));
\r
619 NULLT_IF (s == -1);
\r
621 treeNode y = tagpos2node(s); // transforms the tag position into a node position
\r
623 return (fast_is_ancestor(Par,x,y) ? y : NULLT);
\r
626 inline treeNode TaggedFollowingBelow(treeNode x, TagType tag, treeNode ancestor)
\r
628 treeNode close = fast_find_close(Par, x);
\r
629 treeNode s = tagpos2node(Tags->select_next(tag, close));
\r
631 if (ancestor == Root() || s == NULLT || s < fast_find_close(Par,ancestor)) return s;
\r
635 inline treeNode TaggedFollowingBefore(treeNode x, TagType tag, treeNode ancestor_closing)
\r
637 treeNode close = fast_find_close(Par, x);
\r
638 treeNode s = tagpos2node(Tags->select_next(tag, close));
\r
640 if (ancestor_closing == Root() || s == NULLT || s < ancestor_closing) return s;
\r