#include <unordered_set>
#include <unordered_map>
#include <sstream>
-#include "TextCollection/TextCollectionBuilder.h"
+#include <TextCollection/TextCollectionBuilder.h>
#undef W
#undef WW
#undef Wminusone
-#include "bp.h"
+#include <bp/bp.h>
#include <libcds/includes/basics.h>
-#include <static_bitsequence.h>
-#include <alphabet_mapper.h>
-#include <static_sequence.h>
+#include <libcds/includes/static_bitsequence.h>
+#include <libcds/includes/alphabet_mapper.h>
+#include <libcds/includes/static_sequence.h>
+
using SXSI::TextCollection;
using SXSI::TextCollectionBuilder;
typedef int treeNode;
-typedef int TagType;
-typedef int DocID;
+typedef int TagType;
+typedef int DocID;
typedef struct {
int min;
#define BUFFER_ALLOC (8192 * 2)
#define BUFFER_SIZE (BUFFER_ALLOC / 2)
-static inline int fast_find_close(bp *b,int s)
-{
- return fwd_excess(b,s,-1);
-}
-
-static inline int fast_inspect(bp* Par,treeNode i)
-{
- int j,l;
- j = i >> logD;
- l = i & (D-1);
- return (Par->B[j] >> (D-1-l)) & 1;
-}
-
-static bool fast_isleaf(bp* Par,treeNode x){
- return (fast_inspect(Par, x+1) == CP);
-}
-
-static treeNode fast_first_child(bp *Par, treeNode x)
-{
- x = x+1;
- return (fast_inspect(Par,x) == OP) ? x : NULLT;
-}
-
-inline static treeNode fast_next_sibling(bp* Par,treeNode x)
-{
- treeNode y = fast_find_close(Par,x)+1;
- return (fast_inspect(Par, y) == OP) ? y : NULLT;
-}
-
-inline static bool fast_is_ancestor(bp * Par,treeNode x,treeNode y){
- return (x <= y) && ((x==0) || (y <= fast_find_close(Par,x)));
-}
// tag position -> tree node
-static treeNode tagpos2node(int t)
+static treeNode tagpos2node(int t)
{
return (treeNode) t;
}
// tree node -> tag position
-static int node2tagpos(treeNode x)
+static int node2tagpos(treeNode x)
{
return (int)x;
}
private:
/** Balanced parentheses representation of the tree */
bp *Par;
-
- /** Mapping from tag identifer to tag name */
+
+ /** Mapping from tag identifer to tag name */
std::vector<std::string> *TagName;
TagIdMap * tIdMap;
-
+
/** Bit vector indicating with a 1 the positions of the non-empty texts. */
- static_bitsequence *EBVector;
+ static_bitsequence *EBVector;
/** Tag sequence represented with a data structure for rank and select */
static_sequence *Tags;
// Allows to disable the TextCollection for benchmarkin purposes
bool disable_tc;
SXSI::TextCollectionBuilder::index_type_t text_index_type;
-
+
std::string *buffer;
std::vector<std::string *> *print_stack;
+
+ void _real_flush(int fd, size_t size) {
+ if (size == 0) return;
+ size_t written;
+ while (1) {
+ written = write(fd, buffer->data(), size);
+ if ((written < 0) && (errno == EAGAIN || errno == EINTR))
+ continue;
+ break;
+ };
+ buffer->clear();
+
+ }
+
void _flush(int fd){
size_t size = buffer->size();
if (size < BUFFER_SIZE) return;
- size_t written;
- while (1) {
- written = write(fd, buffer->data(), size);
- if ((written < 0) && (errno == EAGAIN || errno == EINTR))
- continue;
- break;
- };
- buffer->clear();
+ _real_flush(fd, size);
}
void _dput_str(std::string s, int fd){
void _dputs(const char* s, int fd){
buffer->append(s);
- _flush(fd);
+ _flush(fd);
}
void _dputc(const char c, int fd){
TextCollection * const TC, bool dis_tc,
TextCollectionBuilder::index_type_t _index_type );
-public:
+public:
/** Data structure destructor */
~XMLTree();
-
+
/** root(): returns the tree root. */
treeNode Root() { return 0; }
unsigned int * TagStruct() { return tags_fix; };
- /** SubtreeSize(x): the number of nodes (and attributes) in the subtree of
+ /** SubtreeSize(x): the number of nodes (and attributes) in the subtree of
* node x. */
- int SubtreeSize(treeNode x) { return subtree_size(Par, x); }
-
- /** SubtreeTags(x,tag): the number of occurrences of tag within the subtree
+ int SubtreeSize(treeNode x) { return bp_subtree_size(Par, x); }
+
+ /** SubtreeTags(x,tag): the number of occurrences of tag within the subtree
* of node x. */
int SubtreeTags(treeNode x, TagType tag){
//int s = x + 2*subtree_size(Par, x) - 1;
- treeNode y = fast_find_close(Par, x);
-
+ treeNode y = bp_find_close(Par, x);
+
if (y - x < 10) {
int count = 0;
for(int i = x; i < y; i++)
else
return (Tags->rank(tag, y) - Tags->rank(tag, x));
};
-
+
/** SubtreeElements(x) of element nodes in the subtree of x
*/
int SubtreeElements(treeNode x);
- /** IsLeaf(x): returns whether node x is leaf or not. In the succinct
+ /** IsLeaf(x): returns whether node x is leaf or not. In the succinct
* representation this is just a bit inspection. */
bool IsLeaf(treeNode x);
/** IsAncestor(x,y): returns whether node x is ancestor of node y. */
bool IsAncestor(treeNode x, treeNode y);
-
+
+
+ /** IsRigthDescendant returns true if y is a descendant of x and y is
+ not a descendant of the first child of x */
+ bool IsRightDescendant(treeNode x, treeNode y) {
+ if (x <= Root()) return false;
+ treeNode z = bp_parent_close(Par, x);
+ treeNode c = bp_find_close(Par, x);
+ return (y > c && y < z );
+ }
+
+
/** IsChild(x,y): returns whether node x is parent of node y. */
bool IsChild(treeNode x, treeNode y);
/** IsFirstChild(x): returns whether node x is the first child of its parent. */
/* OCAML */
- bool IsFirstChild(treeNode x) {
- return ((x != NULLT)&&(x==Root() || prev_sibling(Par,x) == (treeNode)-1));
+ bool IsFirstChild(treeNode x) {
+ return ((x != NULLT)&&(x==Root() || bp_prev_sibling(Par,x) == (treeNode)-1));
};
-
- /** NumChildren(x): number of children of node x. Constant time with the
+
+ /** NumChildren(x): number of children of node x. Constant time with the
* data structure of Sadakane. */
int NumChildren(treeNode x);
- /** ChildNumber(x): returns i if node x is the i-th children of its
+ /** ChildNumber(x): returns i if node x is the i-th children of its
* parent. */
int ChildNumber(treeNode x);
- /** Depth(x): depth of node x, a simple binary rank on the parentheses
+ /** Depth(x): depth of node x, a simple binary rank on the parentheses
* sequence. */
int Depth(treeNode x);
-
- /** Preorder(x): returns the preorder number of node x, just regarding tree
- * nodes (and not texts). */
+
+ /** Preorder(x): returns the preorder number of node x, just regarding tree
+ * nodes (and not texts). */
int Preorder(treeNode x);
-
- /** Postorder(x): returns the postorder number of node x, just regarding
+
+ /** Postorder(x): returns the postorder number of node x, just regarding
* tree nodes (and not texts). */
int Postorder(treeNode x);
-
- /** DocIds(x): returns the range (i.e., a pair of integers) of document
+
+ /** DocIds(x): returns the range (i.e., a pair of integers) of document
* identifiers that descend from node x. */
range DocIds(treeNode x);
/** Parent(x): returns the parent node of node x. */
- treeNode Parent(treeNode x) {
- if (x == Root())
- return NULLT;
- else
- return parent(Par, x);
+ treeNode Parent(treeNode x) {
+ return (x == Root()) ? NULLT : bp_parent(Par, x);
};
treeNode BinaryParent(treeNode x){
- if (x <= Root())
- return NULLT;
- else {
- treeNode prev = x - 1;
- return (fast_inspect(Par, prev) == OP) ? prev : find_open(Par, prev);
- };
+ if (x <= Root())
+ return NULLT;
+ else {
+ treeNode prev = x - 1;
+ return (bp_inspect(Par, prev) == OP) ? prev : bp_find_open(Par, prev);
+ };
};
/* Assumes x is neither 0 nor -1 */
-
- /** Child(x,i): returns the i-th child of node x, assuming it exists. */
+
+ /** Child(x,i): returns the i-th child of node x, assuming it exists. */
treeNode Child(treeNode x, int i);
/** LastChild(x): returns the last child of node x. */
treeNode LastChild(treeNode x) {
- NULLT_IF(x == NULLT || fast_isleaf(Par,x));
- return find_open(Par, fast_find_close(Par, x)-1);
+ NULLT_IF(x == NULLT || bp_isleaf(Par,x));
+ return bp_find_open(Par, bp_find_close(Par, x)-1);
}
-
- /** PrevSibling(x): returns the previous sibling of node x, assuming it
+
+ /** PrevSibling(x): returns the previous sibling of node x, assuming it
* exists. */
treeNode PrevSibling(treeNode x)
{
NULLT_IF(x==NULLT);
- return prev_sibling(Par, x);
+ return bp_prev_sibling(Par, x);
}
-
- /** TaggedChild(x,tag): returns the first child of node x tagged tag, or
- * NULLT if there is none. Because of the balanced-parentheses representation
- * of the tree, this operation is not supported efficiently, just iterating
+
+ /** TaggedChild(x,tag): returns the first child of node x tagged tag, or
+ * NULLT if there is none. Because of the balanced-parentheses representation
+ * of the tree, this operation is not supported efficiently, just iterating
* among the children of node x until finding the desired child. */
-
+
treeNode SelectChild(treeNode x, TagIdSet * tags);
- /** TaggedFollowingSibling(x,tag): returns the first sibling of node x tagged tag, or
+ /** TaggedFollowingSibling(x,tag): returns the first sibling of node x tagged tag, or
* NULLT if there is none. */
-
+
treeNode SelectFollowingSibling(treeNode x, TagIdSet * tags);
treeNode SelectDescendant(treeNode x, TagIdSet * tags) {
NULLT_IF (x == NULLT);
treeNode fc = x+1;
- if (fast_inspect(Par, fc) == CP) return NULLT;
+ if (bp_inspect(Par, fc) == CP) return NULLT;
treeNode min = NULLT;
treeNode aux;
TagIdSet::const_iterator tagit;
for (tagit = tags->begin(); tagit != tags->end(); ++tagit) {
- aux = TaggedDescendant(x, (TagType) *tagit);
+ aux = TaggedDescendant(x, (TagType) *tagit);
if (((unsigned int) aux) < ((unsigned int) min)) min = aux;
};
return min;
}
- /** TaggedPrec(x,tag): returns the first node tagged tag with smaller
- * preorder than x and not an ancestor of x. Returns NULLT if there
+ /** TaggedPrec(x,tag): returns the first node tagged tag with smaller
+ * preorder than x and not an ancestor of x. Returns NULLT if there
* is none. */
treeNode TaggedPreceding(treeNode x, TagType tag);
-
- /** TaggedFoll(x,tag): returns the first node tagged tag with larger
- * preorder than x and not in the subtree of x. Returns NULLT if there
+
+ /** TaggedFoll(x,tag): returns the first node tagged tag with larger
+ * preorder than x and not in the subtree of x. Returns NULLT if there
* is none. */
treeNode TaggedFollowing(treeNode x, TagType tag);
NULLT_IF(x <= 0);
- treeNode close = fast_find_close(Par,x);
+ treeNode close = bp_find_close(Par,x);
+
-
treeNode min = NULLT;
treeNode aux;
-
+
TagIdSet::const_iterator tagit;
for (tagit = tags->begin(); tagit != tags->end(); ++tagit) {
- aux = tagpos2node(Tags->select_next(*tagit, close));
+ aux = tagpos2node(Tags->select_next(*tagit, close));
if (((unsigned int) aux) < ((unsigned int) min)) min = aux;
};
-
+
return (min < ancestor_closing) ? min : NULLT;
-
+
}
- /** TaggedAncestor(x, tag): returns the closest ancestor of x tagged
+ /** TaggedAncestor(x, tag): returns the closest ancestor of x tagged
* tag. Return NULLT is there is none. */
treeNode TaggedAncestor(treeNode x, TagType tag);
-
- /** PrevText(x): returns the document identifier of the text to the left of
+
+ /** PrevText(x): returns the document identifier of the text to the left of
* node x, or NULLT if x is the root node. */
DocID PrevText(treeNode x);
-
- /** NextText(x): returns the document identifier of the text to the right of
+
+ /** NextText(x): returns the document identifier of the text to the right of
* node x, or NULLT if x is the root node. */
DocID NextText(treeNode x);
-
- /** MyText(x): returns the document identifier of the text below node x, or
+
+ /** MyText(x): returns the document identifier of the text below node x, or
* NULLT if x is not a leaf node. */
DocID MyText(treeNode x);
DocID MyTextUnsafe(treeNode x);
- /** TextXMLId(d): returns the preorder of document with identifier d in the
+ /** TextXMLId(d): returns the preorder of document with identifier d in the
* tree consisting of all tree nodes and all text nodes. */
int TextXMLId(DocID d);
-
- /** NodeXMLId(x): returns the preorder of node x in the tree consisting of
+
+ /** NodeXMLId(x): returns the preorder of node x in the tree consisting of
* all tree nodes and all text nodes. */
int NodeXMLId(treeNode x);
-
+
/** ParentNode(d): returns the parent node of document identifier d. */
treeNode ParentNode(DocID d);
-
+
treeNode PrevNode(DocID d);
- /** GetTagId(tagname): returns the tag identifier corresponding to a given
+ /** GetTagId(tagname): returns the tag identifier corresponding to a given
* tag name. Returns NULLT in case that the tag name does not exists. */
TagType GetTagId(unsigned char *tagname);
- /** GetTagName(tagid): returns the tag name of a given tag identifier.
+ /** GetTagName(tagid): returns the tag name of a given tag identifier.
* Returns NULL in case that the tag identifier is not valid.*/
unsigned char *GetTagName(TagType tagid);
- /** GetTagName(tagid): returns the tag name of a given tag identifier.
+ /** GetTagName(tagid): returns the tag name of a given tag identifier.
* The result is just a reference and should not be freed by the caller.
*/
const unsigned char *GetTagNameByRef(TagType tagid);
TextCollection::document_result LessThan(uchar const *s) {
return Text->LessThan(s);
}
-
+
/** IsPrefix(x): returns true if there is a text prefixed by string s. */
bool IsPrefix(uchar const *s) {
return Text->IsPrefix(s);
- }
-
- /** IsSuffix(s): returns true if there is a text having string s as a
+ }
+
+ /** IsSuffix(s): returns true if there is a text having string s as a
* suffix.*/
bool IsSuffix(uchar const *s) {
return Text->IsSuffix(s);
}
-
- /** IsEqual(s): returns true if there is a text that equals given
+
+ /** IsEqual(s): returns true if there is a text that equals given
* string s. */
bool IsEqual(uchar const *s) {
return Text->IsEqual(s);
}
-
+
/** IsContains(s): returns true if there is a text containing string s. */
bool IsContains(uchar const *s) {
return Text->IsContains(s);
}
-
- /** IsLessThan(s): returns true if there is at least a text that is
+
+ /** IsLessThan(s): returns true if there is at least a text that is
* lexicographically smaller than string s. */
bool IsLessThan(uchar const *s) {
return Text->IsLessThan(s);
}
-
+
/** Count(s): Global counting */
unsigned Count(uchar const *s) {
return Text->Count(s);
unsigned CountPrefix(uchar const *s) {
return Text->CountPrefix(s);
}
-
+
/** CountSuffix(s): counting version of Suffix(s). */
unsigned CountSuffix(uchar const *s) {
return Text->CountSuffix(s);
}
-
+
/** CountEqual(s): counting version of Equal(s). */
unsigned CountEqual(uchar const *s) {
return Text->CountEqual(s);
}
-
+
/** CountContains(s): counting version of Contains(s). */
unsigned CountContains(uchar const *s) {
return Text->CountContains(s);
}
-
+
/** CountLessThan(s): counting version of LessThan(s). */
unsigned CountLessThan(uchar const *s) {
return Text->CountLessThan(s);
}
-
+
/** GetText(d): returns the text corresponding to document with
* id d. */
uchar* GetText(DocID d) {
-
+
uchar * s = Text->GetText(d);
return (s[0] == 1 ? (s+1) : s);
}
TextCollection *getTextCollection() {
return Text;
}
-
+
/** Save: saves XML tree data structure to file. */
- void Save(int fd );
-
- /** Load: loads XML tree data structure from file. sample_rate_text
+ void Save(int fd, char* name );
+
+ /** Load: loads XML tree data structure from file. sample_rate_text
* indicates the sample rate for the text search data structure. */
- static XMLTree *Load(int fd, bool load_tc, int sample_factor);
+ static XMLTree *Load(int fd, bool load_tc, int sample_factor, char * name);
void insertTag(TagType tag, uint position);
-
+
void print_stats();
-
+
/** Parenthesis functions */
treeNode Closing(treeNode x);
/** Print procedure */
void Print(int fd,treeNode x, bool no_text);
void Print(int fd,treeNode x) { Print(fd,x,false); }
- void Flush(int fd){ _flush(fd); }
+ void Flush(int fd){ if (buffer) _real_flush(fd, buffer->size()); }
// The following are inlined here for speed
/** Tag(x): returns the tag identifier of node x. */
inline TagType Tag(treeNode x) const throw () {
if (tags_blen == 8)
return (TagType) (((uchar*)tags_fix)[(int) x]);
- else
+ else
return get_field(tags_fix, tags_blen, x);
/*
- {
+ {
size_t idxlen = x * tags_blen;
size_t j = idxlen % W;
- size_t i = idxlen / W;
+ size_t i = idxlen / W;
size_t offset = W - tags_blen;
size_t offset2 = offset - j;
size_t w = tags_fix[i];
*/
treeNode FirstChild(treeNode x) {
NULLT_IF(x==NULLT);
- return fast_first_child(Par, x);
+ return bp_first_child(Par, x);
};
treeNode FirstElement(treeNode node){
{
NULLT_IF(node == NULLT);
- treeNode x = fast_first_child(Par, node);
+ treeNode x = bp_first_child(Par, node);
NULLT_IF(x == NULLT);
switch (Tag(x)){
- case ATTRIBUTE_TAG_ID:
- x = fast_next_sibling(Par,x);
+ case ATTRIBUTE_TAG_ID:
+ x = bp_next_sibling(Par,x);
if (x == NULLT || Tag(x) != PCDATA_TAG_ID) return x;
-
+
case PCDATA_TAG_ID:
x = x+2;
- return (fast_inspect(Par,x)==OP)? x : NULLT;
-
+ return (bp_inspect(Par,x)==OP)? x : NULLT;
+
default:
return x;
}
}
};
- /** NextSibling(x): returns the next sibling of node x, or NULLT if none
+ /** NextSibling(x): returns the next sibling of node x, or NULLT if none
* exists. */
-
+
treeNode NextSibling(treeNode x) {
NULLT_IF (x <= 0);
- return fast_next_sibling(Par, x);
+ return bp_next_sibling(Par, x);
};
-
+
/** NextElement(x): returns the first non text, non attribute sibling of node x, or NULLT
* if none.
*/
treeNode NextElement(treeNode x)
{
NULLT_IF(x <= 0);
- x = fast_next_sibling(Par, x);
- NULLT_IF(x == NULLT);
+ x = bp_next_sibling(Par, x);
+ NULLT_IF(x == NULLT);
if (Tag(x) == PCDATA_TAG_ID){
int y = x+2;
- return (fast_inspect(Par, y) == OP) ? y : NULLT;
+ return (bp_inspect(Par, y) == OP) ? y : NULLT;
}
- else return x;
+ else return x;
};
- /** TaggedDesc(x,tag): returns the first node tagged tag with larger
- * preorder than x and within the subtree of x. Returns NULT if there
+ /** TaggedDesc(x,tag): returns the first node tagged tag with larger
+ * preorder than x and within the subtree of x. Returns NULT if there
* is none. */
+ inline treeNode TaggedNext(treeNode x, TagType tag)
+ {
+ return tagpos2node(Tags->select_next(tag,node2tagpos(x)));
+ }
inline treeNode TaggedDescendant(treeNode x, TagType tag)
{
-
+
int s = (int) Tags->select_next(tag,node2tagpos(x));
NULLT_IF (s == -1);
-
+
treeNode y = tagpos2node(s); // transforms the tag position into a node position
-
- return (fast_is_ancestor(Par,x,y) ? y : NULLT);
+
+ return (bp_is_ancestor(Par,x,y) ? y : NULLT);
};
-
+
inline treeNode TaggedFollowingBelow(treeNode x, TagType tag, treeNode ancestor)
{
- treeNode close = fast_find_close(Par, x);
+ treeNode close = bp_find_close(Par, x);
treeNode s = tagpos2node(Tags->select_next(tag, close));
-
- if (ancestor == Root() || s == NULLT || s < fast_find_close(Par,ancestor)) return s;
+
+ if (ancestor == Root() || s == NULLT || s < bp_find_close(Par,ancestor)) return s;
else return NULLT;
};
inline treeNode TaggedFollowingBefore(treeNode x, TagType tag, treeNode ancestor_closing)
{
- treeNode close = fast_find_close(Par, x);
+ treeNode close = bp_find_close(Par, x);
treeNode s = tagpos2node(Tags->select_next(tag, close));
-
+
if (ancestor_closing == Root() || s == NULLT || s < ancestor_closing) return s;
else return NULLT;
};
-
+
+ inline treeNode NextNodeBefore(treeNode x, treeNode ancestor_closing)
+ {
+ treeNode close = bp_find_close(Par, x);
+ int rank = bp_rank_open(Par, close);
+ treeNode y = bp_select_open(Par, rank+1);
+ return (y < ancestor_closing) ? y : NULLT;
+ };
+
// TaggedSibling(x,tag): returns the first sibling of node x tagged tag, or NULLT if there is none.
treeNode TaggedFollowingSibling(treeNode x, TagType tag)
{
NULLT_IF(x==NULLT);
treeNode sibling = x;
TagType ctag;
- while ((sibling = fast_next_sibling(Par, sibling)) != NULLT) {
+ while ((sibling = bp_next_sibling(Par, sibling)) != NULLT) {
ctag = Tag(sibling);
- if (ctag == tag) return sibling;
+ if (ctag == tag) return sibling;
}
- return NULLT;
+ return NULLT;
};
-treeNode TaggedChild(treeNode x, TagType tag)
+treeNode TaggedChild(treeNode x, TagType tag)
{
-
- NULLT_IF(x==NULLT || fast_isleaf(Par,x));
- treeNode child;
- child = fast_first_child(Par, x);
-
+
+ NULLT_IF(x==NULLT || bp_isleaf(Par,x));
+ treeNode child;
+ child = bp_first_child(Par, x);
+
if (Tag(child) == tag)
return child;
else