-\r
-#include "XMLTreeBuilder.h"\r
#include "basics.h"\r
+#include "XMLTreeBuilder.h"\r
+\r
+\r
+XMLTreeBuilder::~XMLTreeBuilder(){\r
+ \r
+}\r
\r
// OpenDocument(empty_texts): it starts the construction of the data structure for\r
// the XML document. Parameter empty_texts indicates whether we index empty texts\r
// in document or not. Returns a non-zero value upon success, NULLT in case of error.\r
int XMLTreeBuilder::OpenDocument(bool empty_texts, int sample_rate_text, bool dtc)\r
{\r
- found_attributes = false;\r
npar = 0;\r
parArraySize = 1;\r
- ntagnames = 4; \r
disable_tc = dtc;\r
\r
- indexing_empty_texts = empty_texts;\r
- \r
+ \r
par_aux = (pb *)umalloc(sizeof(pb)*parArraySize);\r
\r
tags_aux = (TagType *) umalloc(sizeof(TagType));\r
\r
- TagName = (unsigned char **) umalloc(4*sizeof(unsigned char*));\r
- TagName[0] = (unsigned char *) umalloc(4*sizeof(unsigned char));\r
- strcpy((char *) TagName[0], "<@>");\r
- TagName[1] = (unsigned char *) umalloc(4*sizeof(unsigned char));\r
- strcpy((char *) TagName[1], "<$>");\r
- TagName[2] = (unsigned char *) umalloc(5*sizeof(unsigned char));\r
- strcpy((char *) TagName[2], "/<@>");\r
- TagName[3] = (unsigned char *) umalloc(5*sizeof(unsigned char));\r
- strcpy((char *) TagName[3], "/<$>");\r
-\r
- if (!indexing_empty_texts) \r
- empty_texts_aux = (unsigned int *)umalloc(sizeof(unsigned int));\r
- \r
+ TagName = new vector<string>();\r
+ tIdMap = new std::unordered_map<string,int>();\r
+\r
+ REGISTER_TAG(TagName,tIdMap,DOCUMENT_OPEN_TAG);\r
+ REGISTER_TAG(TagName,tIdMap,ATTRIBUTE_OPEN_TAG);\r
+ REGISTER_TAG(TagName,tIdMap,PCDATA_OPEN_TAG);\r
+ REGISTER_TAG(TagName,tIdMap,ATTRIBUTE_DATA_OPEN_TAG);\r
+ REGISTER_TAG(TagName,tIdMap,DOCUMENT_CLOSE_TAG);\r
+ REGISTER_TAG(TagName,tIdMap,ATTRIBUTE_CLOSE_TAG);\r
+ REGISTER_TAG(TagName,tIdMap,PCDATA_CLOSE_TAG);\r
+ REGISTER_TAG(TagName,tIdMap,ATTRIBUTE_DATA_CLOSE_TAG);\r
+\r
+\r
+ CachedText = new vector<string>;\r
if (disable_tc)\r
TextBuilder = 0;\r
else \r
- TextBuilder = new TextCollectionBuilder((unsigned)sample_rate_text);\r
+ TextBuilder = new TextCollectionBuilder((unsigned)sample_rate_text);\r
Text = 0;\r
- \r
+ empty_texts_aux = (unsigned int *)ucalloc(sizeof(unsigned int),1);\r
+ eta_size = sizeof(unsigned int);\r
return 1; // indicates success in the initialization of the data structure\r
}\r
\r
// the data structure can be queried.\r
XMLTree *XMLTreeBuilder::CloseDocument()\r
{ \r
- // closing parenthesis for the tree root\r
- par_aux = (pb *)urealloc(par_aux, sizeof(pb)*(1+npar/(8*sizeof(pb))));\r
- setbit(par_aux, npar, CP);\r
- npar++;\r
+ //closing parenthesis for the tree root\r
+ //par_aux = (pb *)urealloc(par_aux, sizeof(pb)*(1+npar/(8*sizeof(pb))));\r
+ //setbit(par_aux, npar, CP);\r
+ //npar++;\r
\r
// makes the text collection static\r
if (!disable_tc) {\r
delete TextBuilder;\r
TextBuilder = 0;\r
}\r
-\r
- XMLTree *T = new XMLTree(par_aux, npar, TagName, ntagnames, empty_texts_aux, tags_aux, \r
- Text, CachedText, indexing_empty_texts, disable_tc);\r
+ \r
+ XMLTree *T = new XMLTree(par_aux,\r
+ npar, \r
+ TagName,\r
+ tIdMap,\r
+ empty_texts_aux, // freed by the constructor\r
+ tags_aux, //freed by the constructor\r
+ Text,\r
+ CachedText,\r
+ disable_tc);\r
return T; \r
}\r
\r
// NewOpenTag(tagname): indicates the event of finding a new opening tag in the document.\r
// Tag name is given. Returns a non-zero value upon success, and returns NULLT\r
// in case of failing when trying to insert the new tag.\r
-int XMLTreeBuilder::NewOpenTag(unsigned char *tagname)\r
+int XMLTreeBuilder::NewOpenTag(string tagname)\r
{\r
int i;\r
\r
}\r
\r
setbit(par_aux,npar,OP); // marks a new opening parenthesis\r
+ \r
+ TagIdMapIT tag_id = tIdMap->find(tagname);\r
+ \r
+ if (tag_id == tIdMap->end()){\r
+ REGISTER_TAG(TagName,tIdMap,tagname);\r
+ i = TagName->size() - 1;\r
+ }\r
+ else\r
+ i = tag_id->second;\r
\r
- // transforms the tagname into a tag identifier. If the tag is new, we insert\r
- // it in the table.\r
- for (i=0; i<ntagnames; i++)\r
- if (strcmp((const char *)tagname,(const char *)TagName[i])==0) break;\r
- \r
-\r
- // NewOpenTag("<@>") was called\r
- if (i==0) \r
- found_attributes=true;\r
-\r
- if (i==ntagnames) { // the tag is a new one, then we insert it\r
- TagName = (unsigned char **)urealloc(TagName, sizeof(char *)*(ntagnames+1));\r
- \r
- if (!TagName) {\r
- fprintf(stderr, "Error: not enough memory\n");\r
- return NULLT;\r
- }\r
- \r
- ntagnames++;\r
- TagName[i] = (unsigned char *)umalloc(sizeof(unsigned char)*(strlen((const char *)tagname)+1));\r
- strcpy((char *)TagName[i], (const char *)tagname);\r
- } \r
+ if (tagname.compare(PCDATA_OPEN_TAG) == 0 ||\r
+ tagname.compare(ATTRIBUTE_DATA_OPEN_TAG) == 0){\r
+ };\r
+ \r
tags_aux = (TagType *) urealloc(tags_aux, sizeof(TagType)*(npar + 1));\r
-\r
+ \r
tags_aux[npar] = i; // inserts the new tag id within the preorder sequence of tags\r
\r
npar++;\r
// NewClosingTag(tagname): indicates the event of finding a new closing tag in the document.\r
// Tag name is given. Returns a non-zero value upon success, and returns NULLT\r
// in case of failing when trying to insert the new tag.\r
-int XMLTreeBuilder::NewClosingTag(unsigned char *tagname)\r
+int XMLTreeBuilder::NewClosingTag(string tagname)\r
{\r
int i;\r
\r
}\r
\r
setbit(par_aux,npar,CP); // marks a new closing parenthesis\r
+ \r
+ tagname.insert(0,"/");\r
\r
- // transforms the tagname into a tag identifier. If the tag is new, we insert\r
- // it in the table.\r
- for (i=0; i<ntagnames; i++)\r
- if ((strcmp((const char *)tagname,(const char *)(TagName[i]+1))==0) && (TagName[i][0]=='/')) break;\r
- \r
- if (i==ntagnames) { // the tag is a new one, then we insert it\r
- TagName = (unsigned char **)urealloc(TagName, sizeof(char *)*(ntagnames+1));\r
- \r
- ntagnames++;\r
- TagName[i] = (unsigned char *)umalloc(sizeof(char)*(strlen((const char *)tagname)+2));\r
- TagName[i][0] = '/';\r
- strcpy((char *)&(TagName[i][1]), (const char *)tagname);\r
- } \r
+ TagIdMapIT tag_id = tIdMap->find(tagname); \r
+\r
+ if (tag_id == tIdMap->end()){\r
+ REGISTER_TAG(TagName,tIdMap,tagname);\r
+ i = TagName->size() - 1;\r
+ }\r
+ else\r
+ i = tag_id->second;\r
\r
tags_aux = (TagType *)urealloc(tags_aux, sizeof(TagType)*(npar + 1));\r
\r
// NewText(s): indicates the event of finding a new (non-empty) text s in the document.\r
// The new text is inserted within the text collection. Returns a non-zero value upon\r
// success, NULLT in case of error.\r
-int XMLTreeBuilder::NewText(unsigned char *s)\r
+int XMLTreeBuilder::NewText(string text)\r
{\r
- if (disable_tc) {\r
- XMLTreeBuilder::NewEmptyText();\r
- return 1;\r
- }\r
-\r
- if (!indexing_empty_texts) {\r
- empty_texts_aux = (unsigned int *)urealloc(empty_texts_aux, sizeof(pb)*(1+(npar-1)/(8*sizeof(pb))));\r
- bitset(empty_texts_aux, npar-1); // marks the non-empty text with a 1 in the bit vector\r
- }\r
- \r
- TextBuilder->InsertText(s);\r
- string cpps = (char*) s;\r
- CachedText.push_back(cpps); \r
- \r
- return 1; // success\r
+ if (!disable_tc) {\r
+ if (text.empty())\r
+ TextBuilder->InsertText((uchar *)"\001");\r
+ else\r
+ TextBuilder->InsertText((uchar *) text.c_str());\r
+ };\r
+\r
+ CachedText->push_back(text); \r
+ int n_eta_size = sizeof(uint)*(1+(npar-1)/(8*sizeof(uint)));\r
+ // See basics.h: urecalloc resizes the buffer and sets the newly added area to 0.\r
+ \r
+ empty_texts_aux = (uint *)urecalloc(empty_texts_aux,eta_size,n_eta_size);\r
+ eta_size = n_eta_size;\r
+ bitset(empty_texts_aux, npar-1); // marks this text (empty or not) with a 1 in the bit vector\r
+\r
+ return 1; // success\r
}\r
\r
-// NewEmptyText(): indicates the event of finding a new empty text in the document.\r
-// In case of indexing empty and non-empty texts, we insert the empty texts into the\r
-// text collection. In case of indexing only non-empty texts, it just indicates an\r
-// empty text in the bit vector of empty texts. Returns a non-zero value upon\r
-// success, NULLT in case of error.\r
-int XMLTreeBuilder::NewEmptyText() \r
- {\r
- unsigned char c = 0;\r
-\r
- if (!indexing_empty_texts) {\r
- empty_texts_aux = (unsigned int *)urealloc(empty_texts_aux, sizeof(pb)*(1+(npar-1)/(8*sizeof(pb))));\r
- \r
- bitclean(empty_texts_aux, npar-1); // marks the empty text with a 0 in the bit vector\r
- }\r
- else TextBuilder->InsertText(&c); // we insert the empty text just in case we index all the texts\r
- \r
- return 1; // success \r
- }\r
\r