2 #include "XMLTreeBuilder.h"
\r
6 XMLTreeBuilder::~XMLTreeBuilder(){
\r
9 //delete other stuff.
\r
13 // OpenDocument(empty_texts): it starts the construction of the data structure for
\r
14 // the XML document. Parameter empty_texts indicates whether we index empty texts
\r
15 // in the document or not. Returns a non-zero value upon success, NULLT in case of error.
\r
16 int XMLTreeBuilder::OpenDocument(bool empty_texts,
\r
17 int sample_rate_text,
\r
19 TextCollectionBuilder::index_type_t index_type)
\r
24 text_index_type = index_type;
\r
27 par_aux = (pb *)umalloc(sizeof(pb)*parArraySize);
\r
29 tags_aux = (TagType *) umalloc(sizeof(TagType));
\r
31 TagName = new vector<string>();
\r
32 tIdMap = new std::unordered_map<string,int>();
\r
34 REGISTER_TAG(TagName,tIdMap,DOCUMENT_OPEN_TAG);
\r
35 REGISTER_TAG(TagName,tIdMap,ATTRIBUTE_OPEN_TAG);
\r
36 REGISTER_TAG(TagName,tIdMap,PCDATA_OPEN_TAG);
\r
37 REGISTER_TAG(TagName,tIdMap,ATTRIBUTE_DATA_OPEN_TAG);
\r
38 REGISTER_TAG(TagName,tIdMap,CLOSING_TAG);
\r
39 REGISTER_TAG(TagName,tIdMap,DOCUMENT_CLOSE_TAG);
\r
40 REGISTER_TAG(TagName,tIdMap,ATTRIBUTE_CLOSE_TAG);
\r
41 REGISTER_TAG(TagName,tIdMap,PCDATA_CLOSE_TAG);
\r
42 REGISTER_TAG(TagName,tIdMap,ATTRIBUTE_DATA_CLOSE_TAG);
\r
48 TextBuilder = TextCollectionBuilder::create((unsigned)sample_rate_text, index_type);
\r
51 empty_texts_aux = (unsigned int *)ucalloc(sizeof(unsigned int),1);
\r
52 eta_size = sizeof(unsigned int);
\r
53 return 1; // indicates success in the initialization of the data structure
\r
56 // CloseDocument(): it finishes the construction of the data structure for the XML
\r
57 // document. Tree and tags are represented in the final form, dynamic data
\r
58 // structures are made static, and the flag "finished" is set to true. After that,
\r
59 // the data structure can be queried.
\r
60 XMLTree *XMLTreeBuilder::CloseDocument()
\r
62 //closing parenthesis for the tree root
\r
63 //par_aux = (pb *)urealloc(par_aux, sizeof(pb)*(1+npar/(8*sizeof(pb))));
\r
64 //setbit(par_aux, npar, CP);
\r
67 // makes the text collection static
\r
69 PRINTTIME("Parsing XML Document", Parsing);
\r
71 XMLTree *T = new XMLTree(par_aux,
\r
75 empty_texts_aux, // freed by the constructor
\r
76 tags_aux, // freed by the constructor
\r
77 TextBuilder, // freed by the constructor
\r
81 empty_texts_aux = 0;
\r
86 // NewOpenTag(tagname): indicates the event of finding a new opening tag in the document.
\r
87 // Tag name is given. Returns a non-zero value upon success, and returns NULLT
\r
88 // if inserting the new tag fails.
\r
89 int XMLTreeBuilder::NewOpenTag(string tagname)
\r
92 // inserts a new opening parenthesis in the bit sequence
\r
93 if (sizeof(pb)*8*parArraySize == npar) { // no space left for the new parenthesis
\r
95 // If the array is already 1 GB or larger, be gentler when resizing:
\r
96 if (sizeof(pb)*parArraySize >= 1024*1024*1024)
\r
97 parArraySize += (128*1024*1024);
\r
100 par_aux = (pb *) urealloc(par_aux, sizeof(pb)*parArraySize);
\r
103 bp_setbit(par_aux,npar,OP); // marks a new opening parenthesis
\r
105 TagIdMapIT tag_id = tIdMap->find(tagname);
\r
107 if (tag_id == tIdMap->end()){
\r
108 REGISTER_TAG(TagName,tIdMap,tagname);
\r
109 i = TagName->size() - 1;
\r
112 i = tag_id->second;
\r
114 if (tagname.compare(PCDATA_OPEN_TAG) == 0 ||
\r
115 tagname.compare(ATTRIBUTE_DATA_OPEN_TAG) == 0){
\r
119 tags_aux = (TagType *) urealloc(tags_aux, sizeof(TagType)*(npar + 1));
\r
121 tags_aux[npar] = i; // inserts the new tag id within the preorder sequence of tags
\r
125 return 1; // success
\r
129 // NewClosingTag(tagname): indicates the event of finding a new closing tag in the document.
\r
130 // Tag name is given. Returns a non-zero value upon success, and returns NULLT
\r
131 // if inserting the new tag fails.
\r
132 int XMLTreeBuilder::NewClosingTag(string tagname)
\r
135 // inserts a new closing parenthesis in the bit sequence
\r
136 if (sizeof(pb)*8*parArraySize == npar) { // no space left for the new parenthesis
\r
137 // If the array is already 1 GB or larger, be gentler when resizing:
\r
138 if (sizeof(pb)*parArraySize >= 1024*1024*1024)
\r
139 parArraySize += (128*1024*1024);
\r
142 par_aux = (pb *)urealloc(par_aux, sizeof(pb)*parArraySize);
\r
145 bp_setbit(par_aux,npar,CP); // marks a new closing parenthesis
\r
147 //tagname.insert(0,"/");
\r
149 //TagIdMapIT tag_id = tIdMap->find(tagname);
\r
151 // if (tag_id == tIdMap->end()){
\r
152 // REGISTER_TAG(TagName,tIdMap,tagname);
\r
153 // i = TagName->size() - 1;
\r
156 // i = tag_id->second;
\r
158 tags_aux = (TagType *)urealloc(tags_aux, sizeof(TagType)*(npar + 1));
\r
160 tags_aux[npar] = CLOSING_TAG_ID; // inserts the new tag id within the preorder sequence of tags
\r
164 return 1; // success
\r
168 // NewText(s): indicates the event of finding a new (non-empty) text s in the document.
\r
169 // The new text is inserted within the text collection. Returns a non-zero value upon
\r
170 // success, NULLT in case of error.
\r
171 int XMLTreeBuilder::NewText(string text)
\r
175 TextBuilder->InsertText((uchar *)"\001");
\r
177 TextBuilder->InsertText((uchar *) text.c_str());
\r
180 int n_eta_size = sizeof(uint)*(1+(npar-1)/(8*sizeof(uint)));
\r
181 // See basics.h: urecalloc resizes the buffer and sets the newly added area to 0.
\r
183 empty_texts_aux = (uint *)urecalloc(empty_texts_aux,eta_size,n_eta_size);
\r
184 eta_size = n_eta_size;
\r
185 bitset(empty_texts_aux, npar-1); // marks the non-empty text with a 1 in the bit vector
\r
187 return 1; // success
\r