X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;ds=sidebyside;f=XMLTree.h;h=90f6b15796a22d199077936542387e854a942b9c;hb=6597c2306cd1720c87cfebf5f855d0fedff3c585;hp=00ad143f5e8e6075300cbb96ffebeb7436d1a8c5;hpb=b2df171c52f1e6d35a8b131299e4a7f494520333;p=SXSI%2FXMLTree.git diff --git a/XMLTree.h b/XMLTree.h index 00ad143..90f6b15 100644 --- a/XMLTree.h +++ b/XMLTree.h @@ -20,23 +20,13 @@ #ifndef XMLTREE_H_ #define XMLTREE_H_ -extern "C" { -#define CAML_NAME_SPACE -#include -#include -#define XMLTREE(x) ((XMLTree *)(* (XMLTree**) Data_custom_val(x))) - //#define XMLTREE(x) ((XMLTree*) (x)) -} + + #include #include #include #include "TextCollection/TextCollectionBuilder.h" -#include -#include -#include - - #undef W #undef WW #undef Wminusone @@ -50,7 +40,6 @@ using SXSI::TextCollection; using SXSI::TextCollectionBuilder; - // this constant is used to efficiently compute the child operation in the tree #define OPTD 10 @@ -96,19 +85,56 @@ typedef struct { typedef std::unordered_set TagIdSet; -typedef std::unordered_map TagIdMap; +typedef std::unordered_map TagIdMap; typedef TagIdMap::const_iterator TagIdMapIT; #define REGISTER_TAG(v,h,t) do { (h)->insert(std::make_pair((t),(v)->size()));\ (v)->push_back(t); } while (false) - // returns NULLT if the test is true #define NULLT_IF(x) do { if (x) return NULLT; } while (0) +// Direct calls to sarray library +static inline int fast_find_close(bp *b,int s) +{ + return fwd_excess(b,s,-1); +} + +static inline int fast_inspect(bp* Par,treeNode i) +{ + int j,l; + j = i >> logD; + l = i & (D-1); + return (Par->B[j] >> (D-1-l)) & 1; +} + +static treeNode fast_first_child(bp *Par, treeNode x) +{ + x = x+1; + return (fast_inspect(Par,x) == OP) ? x : NULLT; +} + +inline static treeNode fast_next_sibling(bp* Par,treeNode x) +{ + treeNode y = fast_find_close(Par,x)+1; + return (fast_inspect(Par, y) == OP) ? y : NULLT; +} +inline static bool fast_is_ancestor(bp * Par,treeNode x,treeNode y){ + return (x <= y) && ((x==0) || (y <= fast_find_close(Par,x))); +} +// tag position -> tree node +static treeNode tagpos2node(int t) + { + return (treeNode) t; + } +// tree node -> tag position +static int node2tagpos(treeNode x) +{ + return (int)x; +} class XMLTreeBuilder; @@ -123,7 +149,7 @@ class XMLTree { bp *Par; /** Mapping from tag identifer to tag name */ - vector *TagName; + std::vector *TagName; TagIdMap * tIdMap; /** Bit vector indicating with a 1 the positions of the non-empty texts. */ @@ -141,13 +167,47 @@ class XMLTree { bool disable_tc; FILE* stream; - int stream_fd; + int stream_fd; + std::string * buffer; + void myfputs(const char* s, FILE * fp){ + buffer->append(s); + if (buffer->size() >= 100000){ + fputs(buffer->c_str(),fp); + buffer->clear(); + }; + + } + void myfputc(const char c, FILE*fp){ + buffer->append(1,c); + if (buffer->size() >= 100000){ + fputs(buffer->c_str(),fp); + buffer->clear(); + }; + } + void mybufferflush(FILE* fp){ + fputs(buffer->c_str(), fp); + buffer->clear(); + } + + size_t myfprintf(const char* s, FILE * fp){ + if (s == NULL) + return 0; + size_t i = buffer->size(); + buffer->append(s); + size_t j = buffer->size(); + if (buffer->size() >= 100000){ + fputs(buffer->c_str(),fp); + buffer->clear(); + }; + return (j-i); + } + void PrintNode(treeNode n, int fd); /** Data structure constructors */ - XMLTree(){;}; + XMLTree(){ buffer = 0;}; // non const pointer are freed by this method. - XMLTree( pb * const par, uint npar, vector * const TN, TagIdMap * const tim, uint *empty_texts_bmp, TagType *tags, + XMLTree( pb * const par, uint npar, std::vector * const TN, TagIdMap * const tim, uint *empty_texts_bmp, TagType *tags, TextCollection * const TC, bool dis_tc); public: @@ -162,6 +222,17 @@ public: return tags_len/2; } + + /** NumTags() : Number of distinct tags */ + unsigned int NumTags() { + return TagName->size(); + } + + int TagsBinaryLength(){ return tags_blen; }; + unsigned int TagStructLength(){ return uint_len(tags_blen,tags_len); }; + unsigned int * TagStruct() { return tags_fix; }; + + /** SubtreeSize(x): the number of nodes (and attributes) in the subtree of * node x. */ int SubtreeSize(treeNode x); @@ -188,7 +259,9 @@ public: /** IsFirstChild(x): returns whether node x is the first child of its parent. */ /* OCAML */ - bool IsFirstChild(treeNode x); + bool IsFirstChild(treeNode x) { + return ((x != NULLT)&&(x==Root() || prev_sibling(Par,x) == (treeNode)-1)); + }; /** NumChildren(x): number of children of node x. Constant time with the * data structure of Sadakane. */ @@ -210,47 +283,30 @@ public: * tree nodes (and not texts). */ int Postorder(treeNode x); - /** Tag(x): returns the tag identifier of node x. */ - TagType Tag(treeNode x) { - if (tags_blen == 8) - return (TagType) (((uchar*)tags_fix)[(int) x]); - else - return (TagType) get_field(tags_fix,tags_blen, (int) x); - } /** DocIds(x): returns the range (i.e., a pair of integers) of document * identifiers that descend from node x. */ range DocIds(treeNode x); /** Parent(x): returns the parent node of node x. */ - treeNode Parent(treeNode x); + treeNode Parent(treeNode x) { + if (x == Root()) + return NULLT; + else + return parent(Par, x); + }; /* Assumes x is neither 0 nor -1 */ /** Child(x,i): returns the i-th child of node x, assuming it exists. */ treeNode Child(treeNode x, int i); - /** FirstChild(x): returns the first child of node x, or NULLT if the node is a leaf - */ - treeNode FirstChild(treeNode x); - /** FirstElement(x): returns the first non text, non attribute child of node x, or NULLT - * if none. - */ - treeNode FirstElement(treeNode x); - value CamlFirstElement(value x); + /** LastChild(x): returns the last child of node x. */ treeNode LastChild(treeNode x); - /** NextSibling(x): returns the next sibling of node x, or NULLT if none - * exists. */ - treeNode NextSibling(treeNode x); - /** NextElement(x): returns the first non text, non attribute sibling of node x, or NULLT - * if none. - */ - treeNode NextElement(treeNode x); - value CamlNextElement(value x); /** PrevSibling(x): returns the previous sibling of node x, assuming it * exists. */ @@ -270,12 +326,10 @@ public: treeNode SelectFollowingSibling(treeNode x, TagIdSet * tags); - /** TaggedDesc(x,tag): returns the first node tagged tag with larger - * preorder than x and within the subtree of x. Returns NULT if there - * is none. */ - treeNode TaggedDescendant(treeNode x, TagType tag); - treeNode SelectDescendant(treeNode x, TagIdSet * tags); + + + treeNode SelectDescendant(treeNode x, TagIdSet * tags); /** TaggedPrec(x,tag): returns the first node tagged tag with smaller * preorder than x and not an ancestor of x. Returns NULLT if there @@ -287,11 +341,11 @@ public: * is none. */ treeNode TaggedFollowing(treeNode x, TagType tag); - treeNode TaggedFollowingBelow(treeNode x, TagType tag,treeNode ancestor); + treeNode SelectFollowingBelow(treeNode x, TagIdSet * tags, treeNode ancestor); - treeNode TaggedFollowingBefore(treeNode x, TagType tag,treeNode closing); + // treeNode TaggedFollowingBefore(treeNode x, TagType tag,treeNode closing); treeNode SelectFollowingBefore(treeNode x, TagIdSet * tags, treeNode closing); @@ -439,7 +493,7 @@ public: uchar* GetText(DocID d) { uchar * s = Text->GetText(d); - return (s[0] == 1 ? (uchar*)"" : s); + return (s[0] == 1 ? (s+1) : s); } /** GetText(i, j): returns the texts corresponding to documents with @@ -454,11 +508,11 @@ public: } /** Save: saves XML tree data structure to file. */ - void Save(int fd); + void Save(int fd, char *filename); /** Load: loads XML tree data structure from file. sample_rate_text * indicates the sample rate for the text search data structure. */ - static XMLTree *Load(int fd,bool load_tc, int sample_factor); + static XMLTree *Load(int fd, char *filename, bool load_tc, int sample_factor); void insertTag(TagType tag, uint position); @@ -475,10 +529,120 @@ public: void Print(int fd,treeNode x, bool no_text); void Print(int fd,treeNode x) { Print(fd,x,false); } + // The following are inlined here for speed + /** Tag(x): returns the tag identifier of node x. */ + + inline TagType Tag(treeNode x) const throw () { + if (tags_blen == 8) + return (TagType) (((uchar*)tags_fix)[(int) x]); + else + return get_field(tags_fix, tags_blen, x); + /* + { + size_t idxlen = x * tags_blen; + size_t j = idxlen % W; + size_t i = idxlen / W; + size_t offset = W - tags_blen; + size_t offset2 = offset - j; + size_t w = tags_fix[i]; + return (offset2 >= 0) + ? ( w << offset2 ) >> offset + : ( w >> j) | (tags_fix[i+1] << (W+offset2)) >> offset; + }; */ + + } + + /** FirstChild(x): returns the first child of node x, or NULLT if the node is a leaf + */ + treeNode FirstChild(treeNode x) { + NULLT_IF(x==NULLT); + return fast_first_child(Par, x); + }; + + + /** FirstElement(x): returns the first non text, non attribute child of node x, or NULLT + * if none. + */ + treeNode FirstElement(treeNode x){ + { + NULLT_IF(x==NULLT); + x = fast_first_child(Par, x); + NULLT_IF(x == NULLT); + switch (Tag(x)){ + + case PCDATA_TAG_ID: + x = x+2; + return (fast_inspect(Par,x)==OP)? x : NULLT; + + case ATTRIBUTE_TAG_ID: + x = fast_next_sibling(Par,x); + if (x != NULLT && Tag(x) == PCDATA_TAG_ID){ + x = x+2; + return (fast_inspect(Par,x)==OP)? x : NULLT; + } + else return x; + default: + return x; + } + } + }; + + /** NextSibling(x): returns the next sibling of node x, or NULLT if none + * exists. */ + + treeNode NextSibling(treeNode x) { + NULLT_IF (x <= 0); + return fast_next_sibling(Par, x); + }; + + /** NextElement(x): returns the first non text, non attribute sibling of node x, or NULLT + * if none. + */ + treeNode NextElement(treeNode x) + { + NULLT_IF(x <= 0); + x = fast_next_sibling(Par, x); + NULLT_IF(x == NULLT); + if (Tag(x) == PCDATA_TAG_ID){ + x = x+2; + return (fast_inspect(Par,x)==OP)? x : NULLT; + } + else return x; + }; + /** TaggedDesc(x,tag): returns the first node tagged tag with larger + * preorder than x and within the subtree of x. Returns NULT if there + * is none. */ + inline treeNode TaggedDescendant(treeNode x, TagType tag) + { + + int s = (int) Tags->select_next(tag,node2tagpos(x)); + NULLT_IF (s == -1); + + treeNode y = tagpos2node(s); // transforms the tag position into a node position + + return (fast_is_ancestor(Par,x,y) ? y : NULLT); + }; + + inline treeNode TaggedFollowingBelow(treeNode x, TagType tag, treeNode ancestor) + { + treeNode close = fast_find_close(Par, x); + treeNode s = tagpos2node(Tags->select_next(tag, close)); + + if (ancestor == Root() || s == NULLT || s < fast_find_close(Par,ancestor)) return s; + else return NULLT; + }; + + inline treeNode TaggedFollowingBefore(treeNode x, TagType tag, treeNode ancestor_closing) + { + treeNode close = fast_find_close(Par, x); + treeNode s = tagpos2node(Tags->select_next(tag, close)); + + if (ancestor_closing == Root() || s == NULLT || s < ancestor_closing) return s; + else return NULLT; + }; + }; -extern "C" value caml_cpp_fast_first_element(value xmltree, value node); -extern "C" value caml_cpp_fast_next_element(value xmltree, value node);