#include "basics.h"\r
-#include <cstring>\r
-#include <sstream>\r
-#include <stack>\r
#include "XMLTree.h"\r
-#include <sys/time.h>\r
-#include <time.h>\r
-#include <sys/types.h>\r
-#include <sys/stat.h> \r
-#include <unistd.h>\r
-\r
-static double tLoading = 0;\r
-\r
-static unsigned int cLoading = 0;\r
-static struct timeval tmpv1;\r
-static struct timeval tmpv2;\r
-static string mem1;\r
-static string mem2;\r
-\r
-void read_procmem(string& memstr) {\r
- std::string buf;\r
- pid_t pid = getpid();\r
- std::stringstream path;\r
- path << "/proc/" << pid << "/status";\r
- std::ifstream infile;\r
- infile.open (path.str().c_str(), std::ifstream::in);\r
- while (infile.good()){\r
- getline(infile,buf);\r
- if (infile.eof()) {\r
- memstr = "Could not read memory";\r
- return;\r
- };\r
- int idx = buf.find("VmRSS");\r
- if (idx == 0){\r
- memstr = buf;\r
- return;\r
- };\r
- };\r
- memstr = "Could not read memory";\r
- return;\r
-\r
-}\r
-\r
-#define STARTTIMER() do { \\r
- gettimeofday(&tmpv1,NULL); \\r
- read_procmem(mem1); \\r
- } while (0) \\r
-\r
-#define STOPTIMER(x) do { \\r
- read_procmem(mem2); \\r
- gettimeofday(&tmpv2,NULL); \\r
- (t##x) = ((tmpv2.tv_sec - tmpv1.tv_sec) * 1000000.0 + \\r
- (tmpv2.tv_usec - tmpv1.tv_usec))/1000.0; \\r
- (c##x)= (c##x)+1; \\r
- } while (0)\r
-\r
-#define PRINTTIME(s,x) do { \\r
- std::cerr << (s) << " : " << (t##x) << "ms" << std::endl; \\r
- std::cerr << "Mem use before: " << mem1 << std::endl; \\r
- std::cerr << "Mem use after: " << mem2 << std::endl; \\r
- std::cerr.flush(); \\r
- } while (0) \\r
-\r
+#include "timings.h"\r
+#include <errno.h>\r
+using std::cout;\r
+using std::endl;\r
+using std::min;\r
+using std::string;\r
\r
// functions to convert tag positions to the corresponding tree node and viceversa. \r
// These are implemented in order to be able to change the tree and Tags representations, \r
// Current implementation corresponds to balanced-parentheses representation for\r
// the tree, and storing 2 tags per tree node (opening and closing tags).\r
\r
-// tag position -> tree node\r
-inline treeNode tagpos2node(int t) \r
- {\r
- return (treeNode) t;\r
- }\r
-\r
-// tree node -> tag position\r
-inline int node2tagpos(treeNode x) \r
-{\r
- return (int)x;\r
-}\r
-\r
-int fast_find_close(bp *b,int s)\r
-{\r
- return fwd_excess(b,s,-1);\r
-}\r
\r
-inline int fast_inspect(bp* Par,treeNode i)\r
-{\r
- int j,l;\r
- j = i >> logD;\r
- l = i & (D-1);\r
- return (Par->B[j] >> (D-1-l)) & 1;\r
+static int bits8 (int t ) {\r
+ int r = bits(t);\r
+ if (r <= 8)\r
+ return 8;\r
+ else if (r <= 16)\r
+ return 16;\r
+ else \r
+ return r;\r
}\r
\r
-inline treeNode fast_first_child(bp *Par, treeNode x)\r
-{\r
- x = x+1;\r
- return (fast_inspect(Par,x) == CP) ? NULLT : x;\r
-}\r
\r
-inline treeNode fast_next_sibling(bp* Par,treeNode x)\r
-{\r
- x = fast_find_close(Par,x)+1;\r
- return (fast_inspect(Par,x) == CP) ? NULLT : x;\r
-}\r
\r
\r
-inline treeNode fast_sibling(bp* Par,treeNode x,TagType tag){\r
+static treeNode fast_sibling(bp* Par,treeNode x,TagType tag){\r
\r
if (tag == PCDATA_TAG_ID){\r
x = x+2;\r
\r
}\r
\r
-inline bool fast_isleaf(bp* Par,treeNode x){\r
+static bool fast_isleaf(bp* Par,treeNode x){\r
return (fast_inspect(Par,x+1) == CP ? true : false);\r
}\r
\r
-inline uint fast_get_field(uint* A,int len, int idx)\r
+\r
+inline uint get_field_no_power(uint *A, uint len, uint index) {\r
+ \r
+ register uint i=index*len/W, j=index*len-W*i;\r
+ return (j+len <= W) ? (A[i] << (W-j-len)) >> (W-len) : (A[i] >> j) | (A[i+1] << (WW-j-len)) >> (W-len);\r
+\r
+}\r
+\r
+static uint fast_get_field(uint* A,int len, int idx)\r
{\r
- switch(len){\r
+ uint f1, f2;\r
+ switch (len) {\r
case 8:\r
- return (uint) (((uchar*)A)[idx]);\r
- // Other 8-alligned values need to take care of the endianess of the arch.\r
- default: \r
- return get_field (A,len,idx);\r
+ return (uint) (((uchar*)A)[idx]);\r
+ case 16:\r
+ f2 = ((unsigned short*)A)[idx];\r
+ f1 = ((unsigned short*)A)[idx+1];\r
+ return (f1 << 16) + f2;\r
+ default:\r
+ return get_field_no_power (A,len,idx);\r
};\r
\r
}\r
\r
-inline bool fast_is_ancestor(bp * Par,treeNode x,treeNode y){\r
\r
- if (x > y) \r
- return false;\r
- else\r
- return (y <= fast_find_close(Par,x));\r
-}\r
\r
\r
XMLTree::XMLTree( pb * const par, uint npar, vector<string> * const TN, TagIdMap * const tim, \r
uint *empty_texts_bmp, TagType *tags,\r
TextCollection * const TC, bool dis_tc)\r
{\r
+ buffer = 0;\r
// creates the data structure for the tree topology\r
Par = (bp *)umalloc(sizeof(bp));\r
- bp_construct(Par, npar, (pb*) par, OPT_DEGREE|0); \r
+ STARTTIMER();\r
+ bp_construct(Par, npar, (pb*) par, OPT_DEGREE|0);\r
+ STOPTIMER(Building);\r
+ PRINTTIME("Building parenthesis struct", Building);\r
+ STARTTIMER();\r
+\r
\r
// creates structure for tags\r
\r
alphabet_mapper *am = new alphabet_mapper_none();\r
Tags = new static_sequence_bs((uint*)tags,npar,am,bmb);\r
\r
- cout << "Tags test: " << Tags->test((uint*)tags,npar) << endl;\r
+ //cout << "Tags test: " << Tags->test((uint*)tags,npar) << endl;\r
\r
//Ensures that for small tag numbers, we are on an 8bit boundary.\r
//Makes tag access way faster with negligeable waste of space.\r
- tags_blen = max(bits(max_tag),8);\r
+ tags_blen = bits8(max_tag);\r
std::cerr << "Tags blen is " << tags_blen << "\n";\r
tags_len = (uint)npar;\r
tags_fix = new uint[uint_len(tags_blen,tags_len)];\r
for(uint i=0;i<(uint)npar;i++)\r
set_field(tags_fix,tags_blen,i,tags[i]);\r
-\r
delete bmb; \r
free(tags);\r
tags = NULL;\r
+\r
+ STOPTIMER(Building);\r
+ PRINTTIME("Building Tag Structure", Building);\r
\r
Text = (TextCollection*) TC;\r
\r
disable_tc = dis_tc;\r
stream = NULL;\r
stream_fd = 0;\r
+ std::cerr << "Number of distinct tags " << TagName->size() << "\n";\r
+ //std::cerr.flush();\r
}\r
\r
\r
}\r
\r
// Save: saves XML tree data structure to file. \r
-void XMLTree::Save(int fd) \r
+void XMLTree::Save(int fd, char *filename) \r
{\r
FILE *fp;\r
char filenameaux[1024];\r
\r
// stores the texts \r
if (!disable_tc) {\r
- Text->Save(fp);\r
+ Text->Save(fp, filename);\r
};\r
-\r
-\r
}\r
\r
\r
// Load: loads XML tree data structure from file. Returns\r
// a pointer to the loaded data structure\r
-XMLTree *XMLTree::Load(int fd, bool load_tc,int sample_factor) \r
+XMLTree *XMLTree::Load(int fd, char *filename, bool load_tc,int sample_factor) \r
{\r
+\r
FILE *fp;\r
char buffer[1024];\r
XMLTree *XML_Tree;\r
int i;\r
\r
-\r
+ buffer[1023] = '\0';\r
\r
fp = fdopen(fd, "r");\r
\r
PRINTTIME("Loading parenthesis struct", Loading);\r
STARTTIMER();\r
\r
- XML_Tree->TagName = new vector<string>();\r
- XML_Tree->tIdMap = new std::unordered_map<string,int>();\r
- \r
- string s;\r
+ XML_Tree->TagName = new std::vector<std::string>();\r
+ XML_Tree->tIdMap = new std::unordered_map<std::string,int>();\r
+ std::string s;\r
int ntags;\r
\r
// Load the tag names\r
ufread(&ntags, sizeof(int), 1, fp);\r
\r
for (i=0; i<ntags;i++) {\r
- char * r = fgets(buffer,1023,fp);\r
- if (r==NULL)\r
+ if (fgets(buffer,1022,fp) != buffer)\r
throw "Cannot read tag list";\r
- s = (const char*) buffer;\r
+ s = buffer;\r
// remove the trailing \n\r
s.erase(s.size()-1); \r
- XML_Tree->TagName->push_back(s);\r
+ XML_Tree->TagName->push_back(s); \r
XML_Tree->tIdMap->insert(std::make_pair(s,i));\r
\r
};\r
/// End ugly tests\r
\r
STOPTIMER(Loading);\r
+ std::cerr << (uint_len(XML_Tree->tags_blen,XML_Tree->tags_len)*sizeof(uint))/(1024*1024) << " MB for tag sequence" << std::endl;\r
PRINTTIME("Loading tag struct", Loading);\r
STARTTIMER();\r
\r
// Not used \r
// loads the texts\r
if (!XML_Tree->disable_tc){\r
- XML_Tree->Text = TextCollection::Load(fp,sample_factor);\r
+ XML_Tree->Text = TextCollection::Load(fp, filename, TextCollection::index_mode_default, sample_factor);\r
}\r
else XML_Tree->Text = NULL;\r
STOPTIMER(Loading);\r
}\r
\r
\r
-// root(): returns the tree root.\r
-inline treeNode XMLTree::Root() \r
- {\r
- return 0; //root_node(Par);\r
- }\r
\r
// SubtreeSize(x): the number of nodes (and attributes) in the subtree of node x.\r
int XMLTree::SubtreeSize(treeNode x) \r
\r
int s = x + 2*subtree_size(Par, x) - 1;\r
\r
- return Tags->rank(tag, s) - Tags->rank(tag, node2tagpos(x)-1);\r
+ return (Tags->rank(tag, s) - Tags->rank(tag, node2tagpos(x)-1))+1;\r
}\r
int XMLTree::SubtreeElements(treeNode x) \r
{\r
}\r
\r
// IsFirstChild(x): returns whether node x is the first child of its parent.\r
-bool XMLTree::IsFirstChild(treeNode x)\r
+/*bool XMLTree::IsFirstChild(treeNode x)\r
{\r
return ((x != NULLT)&&(x==Root() || prev_sibling(Par,x) == (treeNode)-1));\r
}\r
-\r
+*/\r
\r
// NumChildren(x): number of children of node x. Constant time with the data structure\r
// of Sadakane.\r
}\r
\r
// Parent(x): returns the parent node of node x.\r
-\r
+/*\r
treeNode XMLTree::Parent(treeNode x) \r
{\r
if (x == Root())\r
return NULLT;\r
else\r
return parent(Par, x);;\r
- }\r
+ }*/\r
\r
// Child(x,i): returns the i-th child of node x, assuming it exists.\r
treeNode XMLTree::Child(treeNode x, int i) \r
}\r
\r
// FirstChild(x): returns the first child of node x, assuming it exists. Very fast in BP.\r
-\r
+/*\r
treeNode XMLTree::FirstChild(treeNode x) \r
{\r
NULLT_IF(x==NULLT);\r
return fast_first_child(Par, x);\r
}\r
-\r
+*/\r
+/*\r
treeNode XMLTree::FirstElement(treeNode x) \r
{\r
NULLT_IF(x==NULLT);\r
x = fast_first_child(Par, x);\r
NULLT_IF(x == NULLT);\r
- TagType tag = Tag(x);\r
- if (tag == PCDATA_TAG_ID){\r
+ switch (Tag(x)){\r
+ \r
+ case PCDATA_TAG_ID:\r
x = x+2;\r
return (fast_inspect(Par,x)==OP)? x : NULLT;\r
- } \r
- else if (tag == ATTRIBUTE_TAG_ID){\r
+ \r
+ case ATTRIBUTE_TAG_ID: \r
x = fast_next_sibling(Par,x);\r
if (x != NULLT && Tag(x) == PCDATA_TAG_ID){\r
x = x+2;\r
return (fast_inspect(Par,x)==OP)? x : NULLT;\r
} \r
- else return x; \r
- }else return x;\r
+ else return x; \r
+ default:\r
+ return x;\r
+ }\r
}\r
-\r
+*//*\r
treeNode XMLTree::NextElement(treeNode x) \r
{\r
NULLT_IF(x==NULLT);\r
return (fast_inspect(Par,x)==OP)? x : NULLT;\r
}\r
else return x; \r
-}\r
+ }*/\r
\r
// LastChild(x): returns the last child of node x.\r
treeNode XMLTree::LastChild(treeNode x)\r
{\r
- NULLT_IF(NULLT || x == Root() || fast_isleaf(Par,x));\r
+ NULLT_IF(x == NULLT || fast_isleaf(Par,x));\r
return find_open(Par, fast_find_close(Par, x)-1);\r
}\r
\r
// NextSibling(x): returns the next sibling of node x, assuming it exists.\r
-treeNode XMLTree::NextSibling(treeNode x) \r
+/*treeNode XMLTree::NextSibling(treeNode x) \r
{\r
NULLT_IF(x==NULLT || x == Root() );\r
x = fast_find_close(Par,x)+1;\r
return (fast_inspect(Par,x) == CP ? NULLT : x);\r
}\r
-\r
+*/\r
\r
// PrevSibling(x): returns the previous sibling of node x, assuming it exists.\r
treeNode XMLTree::PrevSibling(treeNode x) \r
{\r
- NULLT_IF(x==NULLT || x == Root());\r
+ NULLT_IF(x==NULLT);\r
return prev_sibling(Par, x);\r
}\r
\r
\r
// TaggedDescendant(x,tag): returns the first node tagged tag with larger preorder than x and within\r
// the subtree of x. Returns NULLT if there is none.\r
+/*\r
treeNode XMLTree::TaggedDescendant(treeNode x, TagType tag) \r
{\r
- NULLT_IF(x==NULLT || fast_isleaf(Par,x));\r
+ //NULLT_IF(x==NULLT || fast_isleaf(Par,x));\r
\r
int s = (int) Tags->select_next(tag,node2tagpos(x));\r
NULLT_IF (s == -1);\r
\r
return (fast_is_ancestor(Par,x,y) ? y : NULLT);\r
}\r
-\r
+*/\r
\r
treeNode XMLTree::SelectDescendant(treeNode x, TagIdSet *tags)\r
{\r
\r
// TaggedFollBelow(x,tag,root): returns the first node tagged tag with larger preorder than x \r
// and not in the subtree of x. Returns NULLT if there is none.\r
+/*\r
treeNode XMLTree::TaggedFollowingBelow(treeNode x, TagType tag, treeNode ancestor)\r
{\r
- NULLT_IF (x == NULLT || x == Root() || x == ancestor); \r
- treeNode s = tagpos2node(Tags->select_next(tag, fast_find_close(Par, x)));\r
- \r
- if (ancestor == Root()) return s;\r
- NULLT_IF (s == NULLT || s >= fast_find_close(Par, ancestor));\r
+ // NULLT_IF (x == NULLT || x == Root() || x == ancestor); \r
+\r
+ //Special optimisation, test for the following sibling first\r
+ treeNode close = fast_find_close(Par, x);\r
+ treeNode s = tagpos2node(Tags->select_next(tag, close));\r
\r
- return s;\r
+ if (ancestor == Root() || s==NULLT || s < fast_find_close(Par,ancestor)) return s;\r
+ else return NULLT;\r
} \r
+*/\r
\r
treeNode XMLTree::TaggedFollowingBefore(treeNode x, TagType tag, treeNode closing)\r
{\r
{\r
\r
NULLT_IF(x==NULLT || x==Root());\r
+\r
+ treeNode close = fast_find_close(Par,x);\r
+ treeNode ns = close+1;\r
+ if ( (fast_inspect(Par,ns) == OP) && (tags->find(Tag(ns)) != tags->end()))\r
+ return ns;\r
+\r
int i;\r
treeNode min = NULLT;\r
- treeNode ns = fast_next_sibling(Par, x);\r
- treeNode close = ns - 1;\r
treeNode aux;\r
+ \r
+\r
TagIdSet::const_iterator tagit;\r
for (tagit = tags->begin(); tagit != tags->end(); tagit++) {\r
\r
\r
// The next sibling of x is guaranteed to be below ctx\r
// and is the node with lowest preorder which is after ctx.\r
- // if we find it, we return early;\r
- \r
- if (aux == ns ) return ns;\r
+ // if we find it, we return early; \r
if (aux == NULLT) continue;\r
if ((min == NULLT) || (aux < min)) min = aux;\r
};\r
\r
//WARNING this uses directly the underlying implementation for plain text\r
\r
-void XMLTree::Print(int fd,treeNode x){\r
+\r
+void XMLTree::Print(int fd,treeNode x, bool no_text){\r
\r
int newfd = dup(fd);\r
stream = fdopen(newfd,"wa");\r
- /* if (stream_fd != fd){\r
- if (stream != NULL)\r
- fclose(stream);\r
- int newfd = dup(fd);\r
- stream = fdopen(newfd,"wa");\r
- stream_fd = fd;\r
- };\r
- */\r
+ if (stream == 0){\r
+ perror(NULL);\r
+ return;\r
+ };\r
+\r
+ if (buffer == 0)\r
+ buffer = new string();\r
\r
FILE* fp = stream;\r
treeNode fin = fast_find_close(Par,x);\r
uchar * tagstr;\r
range r = DocIds(x);\r
treeNode first_idx;\r
- treeNode first_text = (tag == PCDATA_TAG_ID ? x : TaggedDescendant(x,PCDATA_TAG_ID));\r
- treeNode first_att = NULLT;//TaggedDesc(x,ATTRIBUTE_DATA_TAG_ID);\r
+ treeNode first_text = (tag == PCDATA_TAG_ID ? x : ParentNode(r.min-1));\r
+ treeNode first_att = NULLT;\r
\r
if (first_att == NULLT)\r
first_idx = first_text;\r
uchar * current_text=NULL;\r
if (first_idx != NULLT)\r
current_text = GetText(MyText(first_idx));\r
- int read = 0;\r
-\r
- std::stack<uchar*> st;\r
+ size_t read = 0;\r
+ std::vector<uchar*> st;\r
while (n <= fin){\r
if (fast_inspect(Par,n)){\r
- if (tag == PCDATA_TAG_ID) { \r
- // fputs((const char*) (GetText(MyTextUnsafe(n))),fp);\r
- \r
- read = fprintf(fp,"%s",(const char*) current_text);\r
- current_text += (read + 1);\r
-\r
+ if (tag == PCDATA_TAG_ID ) { \r
+\r
+ if (no_text)\r
+ myfputs("<$/>",fp);\r
+ else{\r
+ read = myfprintf((const char*) current_text, fp);\r
+ current_text += (read + 1);\r
+ };\r
n+=2; // skip closing $\r
tag = Tag(n);\r
+ \r
}\r
else {\r
-\r
- fputc('<',fp);\r
+ myfputc('<',fp);\r
tagstr = (uchar*) GetTagNameByRef(tag);\r
- fputs((const char*) tagstr ,fp);\r
+ myfputs((const char*) tagstr ,fp);\r
n++;\r
if (fast_inspect(Par,n)) {\r
- st.push(tagstr);\r
+ st.push_back(tagstr);\r
tag = Tag(n);\r
if (tag == ATTRIBUTE_TAG_ID){\r
n++;\r
+ if (no_text) myfputs("><@@>",fp);\r
while (fast_inspect(Par,n)){\r
- fputc(' ',fp);\r
- fputs((const char*) &(GetTagNameByRef(Tag(n))[3]),fp);\r
- fputs("=\"",fp);\r
- n++;\r
- read = fprintf(fp,"%s",(const char*) current_text);\r
- current_text += (read + 1);\r
- //fputs((const char*) GetText(MyTextUnsafe(n)),fp);\r
- fputc('"',fp);\r
- n+=3; //close @$ @@ \r
+ if (no_text) {\r
+ myfputc('<',fp);\r
+ myfputs((const char*) &(GetTagNameByRef(Tag(n))[3]),fp);\r
+ myfputc('>',fp);\r
+ myfputs("<$@/></",fp);\r
+ myfputs((const char*) &(GetTagNameByRef(Tag(n))[3]),fp);\r
+ myfputc('>',fp);\r
+ n+= 4;\r
+ }\r
+ else {\r
+ myfputc(' ',fp);\r
+ myfputs((const char*) &(GetTagNameByRef(Tag(n))[3]),fp);\r
+ n++;\r
+ myfputs("=\"",fp);\r
+ read = myfprintf((const char*) current_text,fp);\r
+ current_text += (read + 1);\r
+ myfputc('"',fp);\r
+ n+=3;\r
+ }\r
};\r
- fputc('>',fp);\r
+ if (no_text) \r
+ myfputs("</@@>",fp);\r
+ else myfputc('>',fp);\r
n++;\r
tag=Tag(n);\r
}\r
else {\r
- fputc('>',fp);\r
+ myfputc('>',fp);\r
};\r
}\r
else {// <foo /> tag\r
- fputs("/>",fp);\r
+ myfputs("/>",fp);\r
n++;\r
tag=Tag(n); \r
}; \r
}\r
else\r
do {\r
- fputs("</",fp);\r
- fputs((const char*)st.top(),fp);\r
- fputc('>', fp);\r
- st.pop();\r
+ myfputs("</",fp);\r
+ myfputs((const char*)st.back(),fp);\r
+ myfputc('>', fp);\r
+ st.pop_back();\r
n++;\r
}while (!fast_inspect(Par,n) && !st.empty());\r
tag=Tag(n);\r
};\r
- fputc('\n',fp);\r
- fflush(fp);\r
+ myfputc('\n',fp);\r
+ mybufferflush(fp);\r
+ //fflush(fp);\r
fclose(fp);\r
}\r