From: kim Date: Mon, 24 Nov 2008 23:31:26 +0000 (+0000) Subject: Initial import of XMLTree X-Git-Url: http://git.nguyen.vg/gitweb/?a=commitdiff_plain;ds=inline;h=0bf9688e2615a9fc07860c5762240e4ce26ee5d3;p=SXSI%2FXMLTree.git Initial import of XMLTree git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/XMLTree@5 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- 0bf9688e2615a9fc07860c5762240e4ce26ee5d3 diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..a3b0e61 --- /dev/null +++ b/Makefile @@ -0,0 +1,28 @@ +FLAGS = -O9 -I./libcds/includes/ + +OBJECTS=libcds/lib/libcds.a + +all: text_collection XMLTree + +XMLTree: XMLTree.o bp.o darray.o bpcore.o libcds/lib/libcds.a + cp libcds/lib/libcds.a XMLTree.a + ar vrcs XMLTree.a XMLTree.o bp.o darray.o bpcore.o + + +XMLTree.o: bp.c bp.h bpcore.c XMLTree.cpp XMLTree.h + g++ $(FLAGS) -c XMLTree.cpp + +bp.o: bp.c bpcore.c darray.c bp.h darray.h + g++ $(FLAGS) -c bp.c + +darray.o: darray.c darray.h bpcore.c + g++ $(FLAGS) -c darray.c + +bpcore.o: bpcore.c + g++ $(FLAGS) -c bpcore.c + +text_collection: + make -C TextCollection/ + +clean: + rm -f *.o \ No newline at end of file diff --git a/XMLTree.cpp b/XMLTree.cpp new file mode 100644 index 0000000..a3fefea --- /dev/null +++ b/XMLTree.cpp @@ -0,0 +1,700 @@ + +#include "XMLTree.h" +#include +// functions to convert tag positions to the corresponding tree node and viceversa. +// These are implemented in order to be able to change the tree and Tags representations, +// without affecting the code so much. +// Current implementation corresponds to balanced-parentheses representation for +// the tree, and storing 2 tags per tree node (opening and closing tags). + +// tag position -> tree node +inline treeNode tagpos2node(int t) { + return (treeNode)t; +} + +// tree node -> tag position +inline int node2tagpos(treeNode x) { + return (int)x; +} + +// Save: saves XML tree data structure to file. +void XMLTree::Save(unsigned char *filename) + { + + FILE *fp; + char filenameaux[1024]; + int i; + + sprintf(filenameaux, "%s.srx", filename); + fp = fopen(filenameaux, "w"); + if (fp == NULL) { + printf("Error: cannot create file %s to store the tree structure of XML collection\n", filenameaux); + exit(1); + } + + // first stores the tree topology + saveTree(Par, fp); + + // stores the table with tag names + fwrite(&ntagnames, sizeof(int), 1, fp); + for (i=0; isave(fp); + + // stores the tags + Tags->save(fp); + + // stores the texts + //Text->Save(fp); + + fclose(fp); + + } + + +// Load: loads XML tree data structure from file. Returns +// a pointer to the loaded data structure +XMLTree *XMLTree::Load(unsigned char *filename, int sample_rate_text) + { + + FILE *fp; + char filenameaux[1024]; + XMLTree *XML_Tree; + int i; + + // first load the tree topology + sprintf(filenameaux, "%s.srx", filename); + fp = fopen(filenameaux, "r"); + if (fp == NULL) { + printf("Error: cannot open file %s to load the tree structure of XML collection\n", filenameaux); + exit(1); + } + + XML_Tree = new XMLTree(); + + XML_Tree->Par = (bp *)malloc(sizeof(bp)); + + loadTree(XML_Tree->Par, fp); + + // stores the table with tag names + fread(&XML_Tree->ntagnames, sizeof(int), 1, fp); + + XML_Tree->TagName = (unsigned char **)malloc(XML_Tree->ntagnames*sizeof(unsigned char *)); + + for (i=0; intagnames;i++) { + int k = feof(fp); + fscanf(fp, "%s\n",filenameaux); + XML_Tree->TagName[i] = (unsigned char *)malloc(sizeof(unsigned char)*(strlen((const char *)filenameaux)+1)); + strcpy((char *)XML_Tree->TagName[i], (const char *)filenameaux); + } + + // loads the flags + fread(&(XML_Tree->indexing_empty_texts), sizeof(bool), 1, fp); + fread(&(XML_Tree->initialized), sizeof(bool), 1, fp); + fread(&(XML_Tree->finished), sizeof(bool), 1, fp); + + if (!(XML_Tree->indexing_empty_texts)) XML_Tree->EBVector = static_bitsequence_rrr02::load(fp); + + // loads the tags + XML_Tree->Tags = static_sequence_wvtree::load(fp); + + // loads the texts + //XML_Tree->Text->Load(fp,sample_rate_text); + + fclose(fp); + + return XML_Tree; + } + + +// ~XMLTree: frees memory of XML tree. +XMLTree::~XMLTree() + { + int i; + + destroyTree(Par); + free(Par); // frees the memory of struct Par + + for (i=0; i~static_bitsequence_rrr02(); + delete EBVector; + EBVector = NULL; + } + + Tags->~static_sequence_wvtree(); + delete Tags; + Tags = NULL; + + //Text->~TextCollection(); + // delete Text; + // Text = NULL; + + initialized = false; + finished = false; + } + +// root(): returns the tree root. +treeNode XMLTree::Root() + { + return root_node(Par); + } + +// SubtreeSize(x): the number of nodes (and attributes) in the subtree of node x. +int XMLTree::SubtreeSize(treeNode x) + { + return subtree_size(Par, x); + } + +// SubtreeTags(x,tag): the number of occurrences of tag within the subtree of node x. +int XMLTree::SubtreeTags(treeNode x, TagType tag) + { + int s = x + 2*subtree_size(Par, x) - 1; + + return Tags->rank(tag, s) - Tags->rank(tag, node2tagpos(x)-1); + } + +// IsLeaf(x): returns whether node x is leaf or not. In the succinct representation +// this is just a bit inspection. +bool XMLTree::IsLeaf(treeNode x) + { + return isleaf(Par, x); + } + +// IsAncestor(x,y): returns whether node x is ancestor of node y. +bool XMLTree::IsAncestor(treeNode x, treeNode y) + { + return is_ancestor(Par, x, y); + } + +// IsChild(x,y): returns whether node x is parent of node y. +bool XMLTree::IsChild(treeNode x, treeNode y) + { + if (!is_ancestor(Par, x, y)) return false; + return depth(Par, x) == (depth(Par, y) + 1); + } + +// NumChildren(x): number of children of node x. Constant time with the data structure +// of Sadakane. +int XMLTree::NumChildren(treeNode x) + { + return degree(Par, x); + } + +// ChildNumber(x): returns i if node x is the i-th children of its parent. +int XMLTree::ChildNumber(treeNode x) + { + return child_rank(Par, x); + } + +// Depth(x): depth of node x, a simple binary rank on the parentheses sequence. +int XMLTree::Depth(treeNode x) + { + return depth(Par, x); + } + +// Preorder(x): returns the preorder number of node x, just counting the tree +// nodes (i.e., tags, it disregards the texts in the tree). +int XMLTree::Preorder(treeNode x) + { + return preorder_rank(Par, x); + } + +// Postorder(x): returns the postorder number of node x, just counting the tree +// nodes (i.e., tags, it disregards the texts in the tree). +int XMLTree::Postorder(treeNode x) + { + return postorder_rank(Par, x); + } + +// Tag(x): returns the tag identifier of node x. +TagType XMLTree::Tag(treeNode x) + { + return Tags->access(node2tagpos(x)); + } + +// DocIds(x): returns the range of text identifiers that descend from node x. +// returns {NULLT, NULLT} when there are no texts descending from x. +range XMLTree::DocIds(treeNode x) + { + range r; + if (indexing_empty_texts) { // faster, no rank needed + r.min = x; + r.max = x+2*subtree_size(Par, x)-2; + } + else { // we are not indexing empty texts, we need rank + int min = EBVector->rank1(x-1); + int max = EBVector->rank1(x+2*subtree_size(Par, x)-2); + if (min==max) { // range is empty, no texts within the subtree of x + r.min = NULLT; + r.max = NULLT; + } + else { // the range is non-empty, there are texts within the subtree of x + r.min = min+1; + r.max = max; + } + } + return r; + } + +// Parent(x): returns the parent node of node x. +treeNode XMLTree::Parent(treeNode x) + { + return parent(Par, x); + } + +// Child(x,i): returns the i-th child of node x, assuming it exists. +treeNode XMLTree::Child(treeNode x, int i) + { + if (i <= OPTD) return naive_child(Par, x, i); + else return child(Par, x, i); + } + +// FirstChild(x): returns the first child of node x, assuming it exists. Very fast in BP. +treeNode XMLTree::FirstChild(treeNode x) + { + return first_child(Par, x); + } + +// NextSibling(x): returns the next sibling of node x, assuming it exists. +treeNode XMLTree::NextSibling(treeNode x) + { + return next_sibling(Par, x); + } + +// PrevSibling(x): returns the previous sibling of node x, assuming it exists. +treeNode XMLTree::PrevSibling(treeNode x) + { + return prev_sibling(Par, x); + } + +// TaggedChild(x,i,tag): returns the i-th child of node x tagged tag, or NULLT if there is none. +// Because of the balanced-parentheses representation of the tree, this operation is not supported +// efficiently, just iterating among the children of node x until finding the desired child. +treeNode XMLTree::TaggedChild(treeNode x, int i, TagType tag) + { + treeNode child; + + child = first_child(Par, x); // starts at first child of node x + if (child==(treeNode)-1) return NULLT; // node x is a leaf, there is no such child + while (child!=(treeNode)-1) { + if (Tags->access(node2tagpos(child)) == tag) { // current child is labeled with tag of interest + i--; + if (i==0) return child; // we have seen i children of x tagged tag, this is the one we are looking for + } + child = next_sibling(Par, x); // OK, let's try with the next child + } + return NULLT; // no such child was found + } + +// TaggedDesc(x,tag): returns the first node tagged tag with larger preorder than x and within +// the subtree of x. Returns NULLT if there is none. +treeNode XMLTree::TaggedDesc(treeNode x, TagType tag) + { + int r, s; + treeNode y; + r = (int) Tags->rank(tag, node2tagpos(x)); + s = (int) Tags->select(tag, r+1); + if (s == -1) return NULLT; // there is no such node + y = tagpos2node(s); // transforms the tag position into a node position + if (!is_ancestor(Par, x, y)) return NULLT; // the next node tagged tag (in preorder) is not within the subtree of x. + else return y; + } + +// TaggedPrec(x,tag): returns the first node tagged tag with smaller preorder than x and not an +// ancestor of x. Returns NULLT if there is none. +treeNode XMLTree::TaggedPrec(treeNode x, TagType tag) + { + int r, s; + treeNode node_s, root; + r = (int)Tags->rank(tag, node2tagpos(x)-1); + if (r==0) return NULLT; // there is no such node. + s = (int)Tags->select(tag, r); + root = root_node(Par); + node_s = tagpos2node(s); + while (is_ancestor(Par, node_s, x) && (node_s!=root)) { // the one that we found is an ancestor of x + r--; + if (r==0) return NULLT; // there is no such node + s = (int)Tags->select(tag, r); // we should use select_prev instead when provided + node_s = tagpos2node(s); + } + return NULLT; // there is no such node + } + +// TaggedFoll(x,tag): returns the first node tagged tag with larger preorder than x and not in +// the subtree of x. Returns NULLT if there is none. +treeNode XMLTree::TaggedFoll(treeNode x, TagType tag) + { + int r, s; + r = (int) Tags->rank(tag, node2tagpos(next_sibling(Par, x))-1); + s = (int) Tags->select(tag, r+1); // select returns -1 in case that there is no r+1-th tag. + if (s==-1) return NULLT; + else return tagpos2node(s); + } + +// PrevText(x): returns the document identifier of the text to the left +// of node x, or NULLT if x is the root node or the text is empty. +// Assumes Doc ids start from 0. +DocID XMLTree::PrevText(treeNode x) + { + if (x == Root()) return NULLT; + if (indexing_empty_texts) // faster, no rank needed + return (DocID)x-1; + else { // we are not indexing empty texts, rank is needed + if (EBVector->access(x-1) == 0) + return (DocID)NULLT; // there is no text to the left of node (text is empty) + else + return (DocID)EBVector->rank1(x-1)-1; //-1 because document ids start from 0 + } + } + +// NextText(x): returns the document identifier of the text to the right +// of node x, or NULLT if x is the root node. Assumes Doc ids start from 0. +DocID XMLTree::NextText(treeNode x) + { + if (x == Root()) return NULLT; + if (indexing_empty_texts) // faster, no rank needed + return (DocID)x+2*subtree_size(Par, x)-1; + else { // we are not indexing empty texts, rank is needed + int p = x+2*subtree_size(Par, x)-1; + if (EBVector->access(p) == 0) // there is no text to the right of node + return (DocID)NULLT; + else + return (DocID)EBVector->rank1(p)-1; //-1 because document ids start from 0 + } + } + +// MyText(x): returns the document identifier of the text below node x, +// or NULLT if x is not a leaf node or the text is empty. Assumes Doc +// ids start from 0. +DocID XMLTree::MyText(treeNode x) + { + if (!IsLeaf(x)) return NULLT; + if (indexing_empty_texts) // faster, no rank needed + return (DocID)x; + else { // we are not indexing empty texts, rank is needed + if (EBVector->access(x) == 0) // there is no text below node x + return (DocID)NULLT; + else + return (DocID)EBVector->rank1(x)-1; //-1 because document ids start from 0 + } + } + +// TextXMLId(d): returns the preorder of document with identifier d in the tree consisting of +// all tree nodes and all text nodes. Assumes that the tree root has preorder 1. +int XMLTree::TextXMLId(DocID d) + { + if (indexing_empty_texts) + return d + rank_open(Par, d)+1; // +1 because root has preorder 1 + else { // slower, needs rank and select + int s = EBVector->select1(d+1); + return rank_open(Par, s) + d + 1; // +1 because root has preorder 1 + } + } + +// NodeXMLId(x): returns the preorder of node x in the tree consisting +// of all tree nodes and all text nodes. Assumes that the tree root has +// preorder 0; +int XMLTree::NodeXMLId(treeNode x) + { + if (indexing_empty_texts) + return x - 1 + rank_open(Par, x); + else { + if (x == Root()) return 1; // root node has preorder 1 + else + return rank_open(Par, x) + EBVector->rank1(x-1); + } + } + +// ParentNode(d): returns the parent node of document identifier d. +treeNode XMLTree::ParentNode(DocID d) + { + int s; + if (indexing_empty_texts) s = d; + else s = EBVector->select1(d); + + if (inspect(Par,s) == CP) // is a closing parenthesis + return parent(Par, find_open(Par, s)); + else // is an opening parenthesis + return (treeNode)s; + + } + + +// OpenDocument(empty_texts): it starts the construction of the data structure for +// the XML document. Parameter empty_texts indicates whether we index empty texts +// in document or not. Returns a non-zero value upon success, NULLT in case of error. +int XMLTree::OpenDocument(bool empty_texts, int sample_rate_text) + { + initialized = true; + finished = false; + npar = 0; + ntagnames = 0; + + indexing_empty_texts = empty_texts; + + par_aux = (pb *)malloc(sizeof(pb)); + if (!par_aux) { + fprintf(stderr, "Error: not enough memory\n"); + return NULLT; + } + setbit(par_aux,npar,OP); // marks a new opening parenthesis for the tree root + npar++; + + tags_aux = (TagType *)malloc(sizeof(TagType)); + if (!tags_aux) { + fprintf(stderr, "Error: not enough memory\n"); + return NULLT; + } + + if (!indexing_empty_texts) { + empty_texts_aux = (unsigned int *)malloc(sizeof(unsigned int)); + if (!empty_texts_aux) { + fprintf(stderr, "Error: not enough memory\n"); + return NULLT; + } + } + + //Text = TextCollection::InitTextCollection(sample_rate_text); + + return 1; // indicates success in the initialization of the data structure + } + +// CloseDocument(): it finishes the construction of the data structure for the XML +// document. Tree and tags are represented in the final form, dynamic data +// structures are made static, and the flag "finished" is set to true. After that, +// the data structure can be queried. +int XMLTree::CloseDocument() + { + if (!initialized) { // data structure has not been initialized properly + fprintf(stderr, "Error: data structure has not been initialized properly (by calling method OpenDocument)\n"); + return NULLT; + } + + // closing parenthesis for the tree root + par_aux = (pb *)realloc(par_aux, sizeof(pb)*(1+npar/(8*sizeof(pb)))); + if (!par_aux) { + fprintf(stderr, "Error: not enough memory\n"); + return NULLT; + } + setbit(par_aux,npar,CP); + npar++; + + // creates the data structure for the tree topology + Par = (bp *)malloc(sizeof(bp)); + bp_construct(Par, npar, par_aux, OPT_DEGREE|0); + // creates structure for tags + alphabet_mapper * am = new alphabet_mapper_none(); + static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(32); + wt_coder * wtc = new wt_coder_huff((uint *)tags_aux,npar-1,am); + Tags = new static_sequence_wvtree((uint *) tags_aux, (uint) npar-1, wtc, bmb, am); + + // makes the text collection static + //Text->MakeStatic(); + + // creates the data structure marking the non-empty texts (just in the case it is necessary) + if (!indexing_empty_texts) + EBVector = new static_bitsequence_rrr02((uint *)empty_texts_aux,(ulong)npar,(uint)32); + + finished = true; + + return 1; // indicates success in the inicialization + } + + +// NewOpenTag(tagname): indicates the event of finding a new opening tag in the document. +// Tag name is given. Returns a non-zero value upon success, and returns NULLT +// in case of failing when trying to insert the new tag. +int XMLTree::NewOpenTag(unsigned char *tagname) + { + int i; + + if (!initialized) { // data structure has not been initialized properly + fprintf(stderr, "Error: you cannot insert a new opening tag without first calling method OpenDocument first\n"); + return NULLT; + } + + // inserts a new opening parentheses in the bit sequence + par_aux = (pb *)realloc(par_aux, sizeof(pb)*(1+npar/(8*sizeof(pb)))); + if (!par_aux) { + fprintf(stderr, "Error: not enough memory\n"); + return NULLT; + } + setbit(par_aux,npar,OP); // marks a new opening parenthesis + + // transforms the tagname into a tag identifier. If the tag is new, we insert + // it in the table. + for (i=0; iInsertText(s); + + return 1; // success + } + +// NewEmptyText(): indicates the event of finding a new empty text in the document. +// In case of indexing empty and non-empty texts, we insert the empty texts into the +// text collection. In case of indexing only non-empty texts, it just indicates an +// empty text in the bit vector of empty texts. Returns a non-zero value upon +// success, NULLT in case of error. +int XMLTree::NewEmptyText() + { + unsigned char c = 0; + if (!initialized) { // data structure has not been initialized properly + fprintf(stderr, "Error: you cannot insert a new empty text without first calling method OpenDocument first\n"); + return NULLT; + } + + if (!indexing_empty_texts) { + empty_texts_aux = (unsigned int *)realloc(empty_texts_aux, sizeof(pb)*(1+(npar-1)/(8*sizeof(pb)))); + if (!empty_texts_aux) { + fprintf(stderr, "Error: not enough memory\n"); + return NULLT; + } + + bitclean(empty_texts_aux, npar-1); // marks the empty text with a 0 in the bit vector + } + // else Text->InsertText(&c); // we insert the empty text just in case we index all the texts + + return 1; // success + } + + +// GetTagId: returns the tag identifier corresponding to a given tag name. +// Returns NULLT in case that the tag name does not exists. +TagType XMLTree::GetTagId(unsigned char *tagname) + { + int i; + // this should be changed for more efficient processing + for (i=0; i= ntagnames) return NULL; // invalid tag identifier + s = (unsigned char *)malloc((strlen((const char *)TagName[tagid])+1)*sizeof(unsigned char)); + strcpy((char *)s, (const char *)TagName[tagid]); + return s; + } + diff --git a/XMLTree.h b/XMLTree.h new file mode 100644 index 0000000..8fc6596 --- /dev/null +++ b/XMLTree.h @@ -0,0 +1,260 @@ + +/****************************************************************************** + * Copyright (C) 2008 by Diego Arroyuelo * + * Interface for the in-memory XQuery/XPath engine * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU Lesser General Public License as published * + * by the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Lesser General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +#include +#include +#include "bp.h" +#include "libcds/includes/static_bitsequence.h" +#include "libcds/includes/alphabet_mapper.h" +#include "libcds/includes/static_sequence.h" + +//#include "TextCollection/TextCollection.h" +//using SXSI::TextCollection; + +// this constant is used to efficiently compute the child operation in the tree +#define OPTD 10 + +#define NULLT -1 + + // sets bit p in e +#define bitset(e,p) ((e)[(p)/W] |= (1<<((p)%W))) + // cleans bit p in e +#define bitclean(e,p) ((e)[(p)/W] &= ~(1<<((p)%W))) + + +typedef int treeNode; +typedef int TagType; +typedef int DocID; + +typedef struct { + int min; + int max; +} range; + + +class XMLTree { + /** Balanced parentheses representation of the tree */ + bp *Par; + + /** Mapping from tag identifer to tag name */ + unsigned char **TagName; + + /** boolean flag indicating whether we are indexing empty texts or not */ + bool indexing_empty_texts; + + /** Bit vector indicating with a 1 the positions of the non-empty texts. */ + static_bitsequence_rrr02 *EBVector; + + /** Tag sequence represented with a data structure for rank and select */ + static_sequence_wvtree *Tags; + + /** The texts in the XML document */ + //TextCollection *Text; + + /** Flag indicating whether the whole data structure has been constructed or + * not. If the value is true, you cannot add more texts, nodes, etc. */ + bool finished; + + /** Flag indicating whether the construction of the data structure has been + * initialized or not (by calling method OpenDocument()). If this is true, + * you cannot insert new tags or texts. */ + bool initialized; + + /* the following components are used for construction purposes */ + pb *par_aux; + TagType *tags_aux; + int npar; + int ntagnames; + unsigned int *empty_texts_aux; + +public: + + /** Data structure constructor */ + XMLTree() {finished = false; initialized = false;}; + + /** Data structure destructor */ + ~XMLTree(); + + /** root(): returns the tree root. */ + treeNode Root(); + + /** SubtreeSize(x): the number of nodes (and attributes) in the subtree of + * node x. */ + int SubtreeSize(treeNode x); + + /** SubtreeTags(x,tag): the number of occurrences of tag within the subtree + * of node x. */ + int SubtreeTags(treeNode x, TagType tag); + + /** IsLeaf(x): returns whether node x is leaf or not. In the succinct + * representation this is just a bit inspection. */ + bool IsLeaf(treeNode x); + + /** IsAncestor(x,y): returns whether node x is ancestor of node y. */ + bool IsAncestor(treeNode x, treeNode y); + + /** IsChild(x,y): returns whether node x is parent of node y. */ + bool IsChild(treeNode x, treeNode y); + + /** NumChildren(x): number of children of node x. Constant time with the + * data structure of Sadakane. */ + int NumChildren(treeNode x); + + /** ChildNumber(x): returns i if node x is the i-th children of its + * parent. */ + inline int ChildNumber(treeNode x); + + /** Depth(x): depth of node x, a simple binary rank on the parentheses + * sequence. */ + int Depth(treeNode x); + + /** Preorder(x): returns the preorder number of node x, just regarding tree + * nodes (and not texts). */ + int Preorder(treeNode x); + + /** Postorder(x): returns the postorder number of node x, just regarding + * tree nodes (and not texts). */ + int Postorder(treeNode x); + + /** Tag(x): returns the tag identifier of node x. */ + TagType Tag(treeNode x); + + /** DocIds(x): returns the range (i.e., a pair of integers) of document + * identifiers that descend from node x. */ + range DocIds(treeNode x); + + /** Parent(x): returns the parent node of node x. */ + treeNode Parent(treeNode x); + + /** Child(x,i): returns the i-th child of node x, assuming it exists. */ + treeNode Child(treeNode x, int i); + + /** FirstChild(x): returns the first child of node x, assuming it exists. + * Very fast in BP. */ + treeNode FirstChild(treeNode x); + + /** NextSibling(x): returns the next sibling of node x, assuming it + * exists. */ + treeNode NextSibling(treeNode x); + + /** PrevSibling(x): returns the previous sibling of node x, assuming it + * exists. */ + treeNode PrevSibling(treeNode x); + + /** TaggedChild(x,i,tag): returns the i-th child of node x tagged tag, or + * NULLT if there is none. Because of the balanced-parentheses representation + * of the tree, this operation is not supported efficiently, just iterating + * among the children of node x until finding the desired child. */ + treeNode TaggedChild(treeNode x, int i, TagType tag); + + /** TaggedDesc(x,tag): returns the first node tagged tag with larger + * preorder than x and within the subtree of x. Returns NULT if there + * is none. */ + treeNode TaggedDesc(treeNode x, TagType tag); + + /** TaggedPrec(x,tag): returns the first node tagged tag with smaller + * preorder than x and not an ancestor of x. Returns NULLT if there + * is none. */ + treeNode TaggedPrec(treeNode x, TagType tag); + + /** TaggedFoll(x,tag): returns the first node tagged tag with larger + * preorder than x and not in the subtree of x. Returns NULLT if there + * is none. */ + treeNode TaggedFoll(treeNode x, TagType tag); + + /** PrevText(x): returns the document identifier of the text to the left of + * node x, or NULLT if x is the root node. */ + DocID PrevText(treeNode x); + + /** NextText(x): returns the document identifier of the text to the right of + * node x, or NULLT if x is the root node. */ + DocID NextText(treeNode x); + + /** MyText(x): returns the document identifier of the text below node x, or + * NULLT if x is not a leaf node. */ + DocID MyText(treeNode x); + + /** TextXMLId(d): returns the preorder of document with identifier d in the + * tree consisting of all tree nodes and all text nodes. */ + int TextXMLId(DocID d); + + /** NodeXMLId(x): returns the preorder of node x in the tree consisting of + * all tree nodes and all text nodes. */ + int NodeXMLId(treeNode x); + + /** ParentNode(d): returns the parent node of document identifier d. */ + treeNode ParentNode(DocID d); + + /** OpenDocument(empty_texts,sample_rate_text): initilizes the construction + * of the data structure for the XML document. Parameter empty_texts + * indicates whether we index empty texts in document or not. Parameter + * sample_rate_text indicates the sampling rate for the text searching data + * structures (small values get faster searching but a bigger space + * requirement). Returns a non-zero value upon success, NULLT in case of + * error. */ + int OpenDocument(bool empty_texts, int sample_rate_text); + + /** CloseDocument(): finishes the construction of the data structure for + * the XML document. Tree and tags are represented in the final form, + * dynamic data structures are made static, and the flag "finished" is set + * to true. After that, the data structure can be queried. */ + int CloseDocument(); + + /** NewOpenTag(tagname): indicates the event of finding a new opening tag + * in the document. Tag name is given. Returns a non-zero value upon + * success, and returns NULLT in case of error. */ + int NewOpenTag(unsigned char *tagname); + + /** NewClosingTag(tagname): indicates the event of finding a new closing tag + * in the document. Tag name is given. Returns a non-zero value upon + * success, and returns NULLT in case of error. */ + int NewClosingTag(unsigned char *tagname); + + /** NewText(s): indicates the event of finding a new (non-empty) text s in + * the document. The new text is inserted within the text collection. + * Returns a non-zero value upon success, NULLT in case of error. */ + int NewText(unsigned char *s); + + /** NewEmptyText(): indicates the event of finding a new empty text in the + * document. In case of indexing empty and non-empty texts, we insert the + * empty texts into the text collection. In case of indexing only non-empty + * texts, it just indicates an empty text in the bit vector of empty texts. + * Returns a non-zero value upon success, NULLT in case of error. */ + int NewEmptyText(); + + /** GetTagId(tagname): returns the tag identifier corresponding to a given + * tag name. Returns NULLT in case that the tag name does not exists. */ + TagType GetTagId(unsigned char *tagname); + + /** GetTagName(tagid): returns the tag name of a given tag identifier. + * Returns NULL in case that the tag identifier is not valid.*/ + unsigned char *GetTagName(TagType tagid); + + /** saveXMLTree: saves XML tree data structure to file. Every component of + * the collection is stored in different files (same name, different file + * extensions). */ + void Save(unsigned char *filename); + + /** load: loads XML tree data structure from file. */ + static XMLTree *Load(unsigned char *filename, int sample_rate_text); +}; + + diff --git a/bp.c b/bp.c new file mode 100644 index 0000000..9058ec8 --- /dev/null +++ b/bp.c @@ -0,0 +1,1142 @@ +#include "bp.h" + +//#define CHECK +#define RANDOM + +int msize=0; +#define mymalloc(p,n,f) {p =(__typeof__(p)) malloc((n)*sizeof(*p)); msize += (f)*(n)*sizeof(*p); /* if (f) printf("malloc %d bytes at line %d total %d\n",(n)*sizeof(*p),__LINE__,msize); */ if ((p)==NULL) {printf("not enough memory (%d bytes) in line %d\n",msize,__LINE__); exit(1);};} + +int postorder_select_bsearch(bp *b,int s); + +int naive_depth(bp *b, int s) +{ + int i,d; + if (s < 0) return 0; + d = 0; + for (i=0; i<=s; i++) { + if (getbit(b->B,i)==OP) { + d++; + } else { + d--; + } + } + return d; +} + +void printbp(bp *b, int s, int t) +{ + int i,c,d; + d = 0; + for (i=s; i<=t; i++) { + if (getbit(b->B,i)==OP) { + c = '('; + d++; + } else { + c = ')'; + d--; + } + putchar(c); + } +} + +int *matchtbl,*parenttbl; +void make_naivetbl(pb *B,int n) +{ + int i,v; + int *stack,s; + + mymalloc(matchtbl,n,0); + mymalloc(parenttbl,n,0); + mymalloc(stack,n,0); + + for (i=0; i 0) { + parenttbl[i] = stack[v-1]; + stack[v] = i; + } + } else { + if (v > 0) { + matchtbl[stack[v]] = i; // close + matchtbl[i] = stack[v]; // open + } + v--; + } + } + + free(stack); +} + +int popCount[1<=0; i--) { + if (getbit(buf,i)==OP) { + r--; + } else { + r++; + } + if (bwdtbl[((r+ETW)< M) { + M = r; + //maxtbl_li[x] = i; maxtbl_lv[x] = r; + minmaxtbl_i[OPT_MAX | OPT_LEFT][x] = i; + minmaxtbl_v[OPT_MAX | OPT_LEFT][x] = r; + } + } else { + r--; + if (r == m) { + deg++; + childtbl[((deg-1)<= M) { + M = r; + //maxtbl_ri[x] = i; maxtbl_rv[x] = r; + minmaxtbl_i[OPT_MAX | OPT_RIGHT][x] = i; + minmaxtbl_v[OPT_MAX | OPT_RIGHT][x] = r; + } + } else { + r--; + if (r <= m) { + m = r; + //mintbl_ri[x] = i; mintbl_rv[x] = r; + minmaxtbl_i[OPT_MIN | OPT_RIGHT][x] = i; + minmaxtbl_v[OPT_MIN | OPT_RIGHT][x] = r; + } + } + } + + for (i = 0; i < ETW; i++) { + for (j = -ETW; j <= ETW; j++) { + childtbl2[j+ETW][i][x] = -1; + } + } + + for (j=-ETW; j<=ETW; j++) { + int ith; + ith = 0; + r = 0; + for (i = 0; i < ETW; i++) { + if (getbit(buf,i)==OP) { + r++; + } else { + r--; + if (r < j) break; + if (r == j) { + ith++; + childtbl2[j+ETW][ith-1][x] = i; + } + } + } + } + } + +} + + +int bp_construct(bp *b,int n, pb *B, int opt) +{ + int i,j,d; + int m,M,ds; + int ns,nm; + byte *sm, *sM; + byte *sd; + int *mm, *mM; + int *md; + int m_ofs; + int r; // # of minimum values + +#if 0 + if (SB % D != 0) { + printf("warning: SB=%d should be a multiple of D=%d\n",SB,D); + // not necessarily? + } + if (SB % RRR != 0) { + printf("warning: SB=%d should be a multiple of RRR=%d\n",SB,RRR); + } +#endif + + b->B = B; + b->n = n; + b->opt = opt; + b->idx_size = 0; + b->sm = NULL; + b->sM = NULL; + b->sd = NULL; + b->mm = NULL; + b->mM = NULL; + b->md = NULL; + b->da_leaf = NULL; + b->da_inorder = NULL; + b->da_postorder = NULL; + b->da_dfuds_leaf = NULL; + mymalloc(b->da,1,0); + darray_construct(b->da,n,B, opt & OPT_FAST_PREORDER_SELECT); + b->idx_size += b->da->idx_size; + printf("preorder rank/select table: %d bytes (%1.2f bpc)\n",b->da->idx_size,(double)b->da->idx_size*8/n); + + make_matchtbl(); + + ns = (n+SB-1)/SB; + mymalloc(sm, ns, 0); b->idx_size += ns * sizeof(*sm); + mymalloc(sM, ns, 0); b->idx_size += ns * sizeof(*sM); + b->sm = sm; + b->sM = sM; + if (opt & OPT_DEGREE) { + mymalloc(sd, ns, 0); b->idx_size += ns * sizeof(*sd); + b->sd = sd; + printf("SB degree table: %d bytes (%1.2f bpc)\n",ns * sizeof(*sd), (double)ns * sizeof(*sd) * 8/n); + } + printf("SB table: %d bytes (%1.2f bpc)\n",ns * sizeof(*sm) * 2, (double)ns * sizeof(*sm)*2 * 8/n); + + for (i=0; i M) M = d; + } + if (i % SB == SB-1 || i==n-1) { + ds = depth(b,(i/SB)*SB-1); + if (m - ds + SB < 0 || m - ds + SB > 255) { + printf("error m=%d ds=%d\n",m,ds); + } + if (M - ds + 1 < 0 || M - ds + 1 > 255) { + printf("error M=%d ds=%d\n",M,ds); + } + sm[i/SB] = m - ds + SB; + sM[i/SB] = M - ds + 1; + if (opt & OPT_DEGREE) sd[i/SB] = r; + } + } + +#if 0 + printf("sd: "); + for (i=0;im_ofs = m_ofs; + + mymalloc(mm, nm + m_ofs, 0); b->idx_size += (nm+m_ofs) * sizeof(*mm); + mymalloc(mM, nm + m_ofs, 0); b->idx_size += (nm+m_ofs) * sizeof(*mM); + b->mm = mm; + b->mM = mM; + if (opt & OPT_DEGREE) { + mymalloc(md, nm + m_ofs, 0); b->idx_size += (nm+m_ofs) * sizeof(*md); + b->md = md; + printf("MB degree table: %d bytes (%1.2f bpc)\n",(nm+m_ofs) * sizeof(*md), (double)(nm+m_ofs) * sizeof(*md) * 8/n); + } + printf("MB table: %d bytes (%1.2f bpc)\n",(nm+m_ofs) * sizeof(*mm) * 2, (double)(nm+m_ofs) * sizeof(*mm)*2 * 8/n); + + for (i=0; i M) M = d; + } + if (i % MB == MB-1 || i==n-1) { + mm[m_ofs+ i/MB] = m; + mM[m_ofs+ i/MB] = M; + if (opt & OPT_DEGREE) md[m_ofs+ i/MB] = r; + } + } + + for (j=m_ofs-1; j > 0; j--) { + m = 0; + if (j*2 < nm + m_ofs) m = mm[j*2]; + if (j*2+1 < nm + m_ofs) m = min(m,mm[j*2+1]); + M = 0; + if (j*2 < nm + m_ofs) M = mM[j*2]; + if (j*2+1 < nm + m_ofs) M = max(M,mM[j*2+1]); + mm[j] = m; mM[j] = M; + if (opt & OPT_DEGREE) { + d = 0; + if (j*2 < nm + m_ofs) d = md[j*2]; + if (j*2+1 < nm + m_ofs) { + if (mm[j*2] == mm[j*2+1]) d += md[j*2+1]; + if (mm[j*2] > mm[j*2+1]) d = md[j*2+1]; + } + md[j] = d; + } + } + mm[0] = -1; + mM[0] = mM[1]; + if (opt & OPT_DEGREE) { + md[0] = -1; + } + + +#if 0 + printf("md: "); + for (i=0;ida_leaf,1,0); + darray_pat_construct(b->da_leaf, n, B, 2, 0x2, opt & OPT_FAST_LEAF_SELECT); + printf("leaf rank/select table: %d bytes (%1.2f bpc)\n",b->da_leaf->idx_size,(double)b->da_leaf->idx_size*8/n); + b->idx_size += b->da_leaf->idx_size; + } else { + b->da_leaf = NULL; + } + + if (opt & OPT_INORDER) { + mymalloc(b->da_inorder,1,0); + darray_pat_construct(b->da_inorder, n, B, 2, 0x1, opt & OPT_FAST_INORDER_SELECT); + printf("inorder rank/select table: %d bytes (%1.2f bpc)\n",b->da_inorder->idx_size,(double)b->da_inorder->idx_size*8/n); + b->idx_size += b->da_inorder->idx_size; + } else { + b->da_inorder = NULL; + } + + if (opt & OPT_FAST_POSTORDER_SELECT) { + mymalloc(b->da_postorder,1,0); + darray_pat_construct(b->da_postorder, n, B, 1, 0x0, (opt & OPT_FAST_POSTORDER_SELECT) | OPT_NO_RANK); + printf("postorder rank/select table: %d bytes (%1.2f bpc)\n",b->da_postorder->idx_size,(double)b->da_postorder->idx_size*8/n); + b->idx_size += b->da_postorder->idx_size; + } else { + b->da_postorder = NULL; + } + + if (opt & OPT_DFUDS_LEAF) { + mymalloc(b->da_dfuds_leaf,1,0); + darray_pat_construct(b->da_dfuds_leaf, n, B, 2, 0x0, opt & OPT_FAST_DFUDS_LEAF_SELECT); + printf("dfuds leaf rank/select table: %d bytes (%1.2f bpc)\n",b->da_dfuds_leaf->idx_size,(double)b->da_dfuds_leaf->idx_size*8/n); + b->idx_size += b->da_dfuds_leaf->idx_size; + } else { + b->da_dfuds_leaf = NULL; + } + + return 0; +} + +// destroyTree: frees the memory of tree. +void destroyTree(bp *b) { + if (!b) return; // nothing to free + + destroyDarray(b->da); // destroys da data structure + if (b->da) free(b->da); + + if (b->sm) free(b->sm); + if (b->sM) free(b->sM); + if (b->sd) free(b->sd); + if (b->mm) free(b->mm); + if (b->mM) free(b->mM); + if (b->md) free(b->md); + + destroyDarray(b->da_leaf); + if (b->da_leaf) free(b->da_leaf); + + destroyDarray(b->da_inorder); + if (b->da_inorder) free(b->da_inorder); + + destroyDarray(b->da_postorder); + if (b->da_postorder) free(b->da_postorder); + + destroyDarray(b->da_dfuds_leaf); + if (b->da_dfuds_leaf) free(b->da_dfuds_leaf); +} + + +// saveTree: saves parentheses data structure to file +// By Diego Arroyuelo +void saveTree(bp *b, FILE *fp) { + + if (fwrite(&(b->n), sizeof(int), 1, fp) != 1) { + printf("Error: cannot save number of parentheses to file\n"); + exit(1); + } + + if (fwrite(b->B, sizeof(pb), (b->n+D-1)/D, fp) != ((b->n+D-1)/D)) { + printf("Error: cannot save parentheses sequence to file\n"); + exit(1); + } + + if (fwrite(&(b->opt), sizeof(int), 1, fp) != 1) { + printf("Error: cannot save opt in parentheses to file\n"); + exit(1); + } +} + +// loadTree: load parentheses data structure from file +// By Diego Arroyuelo +void loadTree(bp *b, FILE *fp) { + + pb *B; + int n, opt; + + if (fread(&n, sizeof(int), 1, fp) != 1) { + printf("Error: cannot read number of parentheses from file\n"); + exit(1); + } + + mymalloc(B,(n+D-1)/D,0); + + if (fread(B, sizeof(pb), (n+D-1)/D, fp) != ((n+D-1)/D)) { + printf("Error: cannot read parentheses sequence from file\n"); + exit(1); + } + + if (fread(&opt, sizeof(int), 1, fp) != 1) { + printf("Error: cannot read opt in parentheses from file\n"); + exit(1); + } + + bp_construct(b, n, B, opt); + +} + + + +int naive_fwd_excess(bp *b,int s, int rel) +{ + int i,v,n; + pb *B; + n = b->n; B = b->B; + v = 0; + for (i=s+1; iB; + v = 0; + for (i=s; i>=0; i--) { + if (getbit(B,i)==OP) { + v--; + } else { + v++; + } + if (v == rel) return i-1; + } + return -2; +} + +int naive_search_SB_l(bp *b, int i, int rel) +{ + int il,v; + + il = (i / SB) * SB; + for (; i>=il; i--) { + if (getbit(b->B,i)==OP) { + rel++; + } else { + rel--; + } + if (rel == 0) return i-1; + } + if (i < 0) return -2; + return -3; +} + +int naive_rmq(bp *b, int s, int t,int opt) +{ + int d,i,dm,im; + + if (opt & OPT_RIGHT) { + d = dm = depth(b,t); im = t; + i = t-1; + while (i >= s) { + if (getbit(b->B,i+1)==CP) { + d++; + if (opt & OPT_MAX) { + if (d > dm) { + dm = d; im = i; + } + } + } else { + d--; + if (!(opt & OPT_MAX)) { + if (d < dm) { + dm = d; im = i; + } + } + } + i--; + } + } else { + d = dm = depth(b,s); im = s; + i = s+1; + while (i <= t) { + if (getbit(b->B,i)==OP) { + d++; + if (opt & OPT_MAX) { + if (d > dm) { + dm = d; im = i; + } + } + } else { + d--; + if (!(opt & OPT_MAX)) { + if (d < dm) { + dm = d; im = i; + } + } + } + i++; + } + } + return im; +} + +int root_node(bp *b) +{ + return 0; +} + + +int rank_open(bp *b, int s) +{ + return darray_rank(b->da,s); +} + +int rank_close(bp *b, int s) +{ + return s+1 - darray_rank(b->da,s); +} + +int select_open(bp *b, int s) +{ + if (b->opt & OPT_FAST_PREORDER_SELECT) { + return darray_select(b->da,s,1); + } else { + return darray_select_bsearch(b->da,s,getpat_preorder); + } +} + +int select_close(bp *b, int s) +{ + if (b->opt & OPT_FAST_POSTORDER_SELECT) { + return darray_pat_select(b->da_postorder,s,getpat_postorder); + } else { + return postorder_select_bsearch(b,s); + } +} + +/////////////////////////////////////////// +// find_close(bp *b,int s) +// returns the matching close parenthesis of s +/////////////////////////////////////////// +int find_close(bp *b,int s) +{ + return fwd_excess(b,s,-1); +} + +/////////////////////////////////////////// +// find_open(bp *b,int s) +// returns the matching open parenthesis of s +/////////////////////////////////////////// +int find_open(bp *b,int s) +{ + int r; + r = bwd_excess(b,s,0); + if (r >= -1) return r+1; + return -1; +} + +/////////////////////////////////////////// +// parent(bp *b,int s) +// returns the parent of s +// -1 if s is the root +/////////////////////////////////////////// +int parent(bp *b,int s) +{ + int r; + r = bwd_excess(b,s,-2); + if (r >= -1) return r+1; + return -1; +} + +int enclose(bp *b,int s) +{ + return parent(b,s); +} + +/////////////////////////////////////////// +// level_ancestor(bp *b,int s,int d) +// returns the ancestor of s with relative depth d (d < 0) +// -1 if no such node +/////////////////////////////////////////// +int level_ancestor(bp *b,int s,int d) +{ + int r; + r = bwd_excess(b,s,d-1); + if (r >= -1) return r+1; + return -1; +} + +/////////////////////////////////////////// +// lca(bp *b, int s, int t) +// returns the lowest common ancestor of s and t +/////////////////////////////////////////// +int lca(bp *b, int s, int t) +{ + return parent(b,rmq(b,s,t,0)+1); +} + + +/////////////////////////////////////////// +// preorder_rank(bp *b,int s) +// returns the preorder (>= 1) of node s (s >= 0) +/////////////////////////////////////////// +int preorder_rank(bp *b,int s) +{ + return darray_rank(b->da,s); +} + +/////////////////////////////////////////// +// preorder_select(bp *b,int s) +// returns the node with preorder s (s >= 1) +// -1 if no such node +/////////////////////////////////////////// +int preorder_select(bp *b,int s) +{ + // no error handling + if (b->opt & OPT_FAST_PREORDER_SELECT) { + return darray_select(b->da,s,1); + } else { + return darray_select_bsearch(b->da,s,getpat_preorder); + } +} + +/////////////////////////////////////////// +// postorder_rank(bp *b,int s) +// returns the postorder (>= 1) of node s (s >= 0) +// -1 if s-th bit is not OP +/////////////////////////////////////////// +int postorder_rank(bp *b,int s) +{ + int t; + if (inspect(b,s) == CP) return -1; + t = find_close(b,s); + // return t+1 - darray_rank(b->da,t); + return rank_close(b,t); +} + +int postorder_select_bsearch(bp *b,int s) +{ + int l,r,m; + + if (s == 0) return -1; + + if (s > b->da->n - b->da->m) { + return -1; + } + l = 0; r = b->da->n - 1; + + while (l < r) { + m = (l+r)/2; + //printf("m=%d rank=%d s=%d\n",m,m+1 - darray_rank(b->da,m),s); + if (m+1 - darray_rank(b->da,m) >= s) { + r = m; + } else { + l = m+1; + } + } + return l; +} + +/////////////////////////////////////////// +// postorder_select(bp *b,int s) +// returns the position of CP of the node with postorder s (>= 1) +/////////////////////////////////////////// +int postorder_select(bp *b,int s) +{ +#if 0 + if (b->opt & OPT_FAST_POSTORDER_SELECT) { + return darray_pat_select(b->da_postorder,s,getpat_postorder); + } else { + return postorder_select_bsearch(b->da,s); + } +#else + return select_close(b,s); +#endif +} + +/////////////////////////////////////////// +// leaf_rank(bp *b,int s) +// returns the number of leaves to the left of s +/////////////////////////////////////////// +int leaf_rank(bp *b,int s) +{ + if ((b->opt & OPT_LEAF) == 0) { + printf("leaf_rank: error!!! not supported\n"); + return -1; + } + if (s >= b->n-1) { + s = b->n-2; + } + return darray_pat_rank(b->da_leaf,s,getpat_leaf); +} + +/////////////////////////////////////////// +// leaf_select(bp *b,int s) +// returns the position of s-th leaf +/////////////////////////////////////////// +int leaf_select(bp *b,int s) +{ + if ((b->opt & OPT_LEAF) == 0) { + printf("leaf_select: error!!! not supported\n"); + return -1; + } + if (s > b->da_leaf->m) return -1; + if (b->opt & OPT_FAST_LEAF_SELECT) { + return darray_pat_select(b->da_leaf,s,getpat_leaf); + } else { + return darray_select_bsearch(b->da_leaf,s,getpat_leaf); + } +} + + +/////////////////////////////////////////// +// inorder_rank(bp *b,int s) +// returns the number of ")(" (s >= 0) +/////////////////////////////////////////// +int inorder_rank(bp *b,int s) +{ + if ((b->opt & OPT_INORDER) == 0) { + printf("inorder_rank: error!!! not supported\n"); + return -1; + } + if (s >= b->n-1) { + s = b->n-2; + } + return darray_pat_rank(b->da_inorder,s,getpat_inorder); +} + +/////////////////////////////////////////// +// inorder_select(bp *b,int s) +// returns the s-th position of ")(" (s >= 1) +/////////////////////////////////////////// +int inorder_select(bp *b,int s) +{ + if ((b->opt & OPT_INORDER) == 0) { + printf("inorder_select: error!!! not supported\n"); + return -1; + } + if (b->opt & OPT_FAST_INORDER_SELECT) { + return darray_pat_select(b->da_inorder,s,getpat_inorder); + } else { + return darray_select_bsearch(b->da_inorder,s,getpat_inorder); + } +} + +/////////////////////////////////////////// +// leftmost_leaf(bp *b, int s) +/////////////////////////////////////////// +int leftmost_leaf(bp *b, int s) +{ + if ((b->opt & OPT_LEAF) == 0) { + printf("leftmost_leaf: error!!! not supported\n"); + return -1; + } + return leaf_select(b,leaf_rank(b,s)+1); +} + +/////////////////////////////////////////// +// rightmost_leaf(bp *b, int s) +/////////////////////////////////////////// +int rightmost_leaf(bp *b, int s) +{ + int t; + if ((b->opt & OPT_LEAF) == 0) { + printf("leftmost_leaf: error!!! not supported\n"); + return -1; + } + t = find_close(b,s); + return leaf_select(b,leaf_rank(b,t)); +} + + + +/////////////////////////////////////////// +// inspect(bp *b, int s) +// returns OP (==1) or CP (==0) at s-th bit (0 <= s < n) +/////////////////////////////////////////// +int inspect(bp *b, int s) +{ + if (s < 0 || s >= b->n) { + printf("inspect: error s=%d is out of [%d,%d]\n",s,0,b->n-1); + } + return getbit(b->B,s); +} + +int isleaf(bp *b, int s) +{ + if (inspect(b,s) != OP) { + printf("isleaf: error!!! B[%d] = OP\n",s); + } + if (inspect(b,s+1) == CP) return 1; + else return 0; +} + + +/////////////////////////////////////////// +// subtree_size(bp *b, int s) +// returns the number of nodes in the subtree of s +/////////////////////////////////////////// +int subtree_size(bp *b, int s) +{ + return (find_close(b,s) - s + 1) / 2; +} + +/////////////////////////////////////////// +// first_child(bp *b, int s) +// returns the first child +// -1 if s is a leaf +/////////////////////////////////////////// +int first_child(bp *b, int s) +{ + if (inspect(b,s+1) == CP) return -1; + return s+1; +} + +/////////////////////////////////////////// +// next_sibling(bp *b,int s) +// returns the next sibling of parent(s) +// -1 if s is the last child +////////////////////////////////////////// +int next_sibling(bp *b, int s) +{ + int t; + t = find_close(b,s)+1; + if (t >= b->n) { + printf("next_sibling: error s=%d t=%d\n",s,t); + } + if (inspect(b,t) == CP) return -1; + return t; +} + +/////////////////////////////////////////// +// prev_sibling(bp *b,int s) +// returns the previous sibling of parent(s) +// -1 if s is the first child +////////////////////////////////////////// +int prev_sibling(bp *b, int s) +{ + int t; + if (s < 0) { + printf("prev_sibling: error s=%d\n",s); + } + if (s == 0) return -1; + if (inspect(b,s-1) == OP) return -1; + t = find_open(b,s-1); + return t; +} + +/////////////////////////////////////////// +// deepest_node(bp *b,int s) +// returns the first node with the largest depth in the subtree of s +/////////////////////////////////////////// +int deepest_node(bp *b,int s) +{ + int t,m; + t = find_close(b,s); + m = rmq(b,s,t, OPT_MAX); + return m; +} + +/////////////////////////////////////////// +// subtree_height(bp *b,int s) +// returns the height of the subtree of s +// 0 if s is a leaf +/////////////////////////////////////////// +int subtree_height(bp *b,int s) +{ + int t; + t = deepest_node(b,s); + return depth(b,t) - depth(b,s); +} + +int naive_degree(bp *b, int s) +{ + int t,d; + d = 0; + t = first_child(b,s); + while (t >= 0) { + d++; + t = next_sibling(b,t); + } + return d; +} + +/////////////////////////////////////////// +// degree(bp *b, int s) +// returns the number of children of s +// 0 if s is a leaf +/////////////////////////////////////////// +int degree(bp *b, int s) +{ + if (b->opt & OPT_DEGREE) { + return fast_degree(b,s,b->n,0); + } else { + return naive_degree(b,s); + } +} + +int naive_child(bp *b, int s, int d) +{ + int t,i; + t = first_child(b,s); + for (i = 1; i < d; i++) { + if (t == -1) break; + t = next_sibling(b,t); + } + return t; +} + +/////////////////////////////////////////// +// child(bp *b, int s, int d) +// returns the d-th child of s (1 <= d <= degree(s)) +// -1 if no such node +/////////////////////////////////////////// +int child(bp *b, int s, int d) +{ + int r; + if (b->opt & OPT_DEGREE) { + //return find_open(b,fast_degree(b,s,b->n,d)); + if (d==1) return first_child(b,s); + r = fast_degree(b,s,b->n,d-1)+1; + if (inspect(b,r) == CP) return -1; + return r; + } else { + return naive_child(b,s,d); + } + +} + + +int naive_child_rank(bp *b, int t) +{ + int v,d; + d = 0; + while (t != -1) { + d++; + t = prev_sibling(b,t); + } + return d; +} + +/////////////////////////////////////////// +// child_rank(bp *b, int t) +// returns d if t is the d-th child of the parent of t (d >= 1) +// 1 if t is the root +/////////////////////////////////////////// +int child_rank(bp *b, int t) +{ + int r; + if (t == root_node(b)) return 1; + if (b->opt & OPT_DEGREE) { + r = parent(b,t); + return fast_degree(b,r,t,0)+1; + } else { + return naive_child_rank(b,t); + } +} + + + +/////////////////////////////////////////// +// is_ancestor(bp *b, int s, int t) +// returns 1 if s is an ancestor of t +// 0 otherwise +/////////////////////////////////////////// +int is_ancestor(bp *b, int s, int t) +{ + int v; + v = find_close(b,s); + if (s <= t && t <= v) return 1; + return 0; +} + +/////////////////////////////////////////// +// distance(bp *b, int s, int t) +// returns the length of the shortest path from s to t in the tree +/////////////////////////////////////////// +int distance(bp *b, int s, int t) +{ + int v,d; + v = lca(b,s,t); + d = depth(b,v); + return (depth(b,s) - d) + (depth(b,t) - d); +} + +/////////////////////////////////////////// +// level_next(bp *b, int d) +/////////////////////////////////////////// +int level_next(bp *b,int s) +{ + int t; + t = fwd_excess(b,s,0); + return t; +} + +/////////////////////////////////////////// +// level_prev(bp *b, int d) +/////////////////////////////////////////// +int level_prev(bp *b,int s) +{ + int t; + t = bwd_excess(b,s,0); + return t; +} + +/////////////////////////////////////////// +// level_leftmost(bp *b, int d) +/////////////////////////////////////////// +int level_leftmost(bp *b, int d) +{ + int t; + if (d < 1) return -1; + if (d == 1) return 0; + t = fwd_excess(b,0,d); + return t; +} + +/////////////////////////////////////////// +// level_rigthmost(bp *b, int d) +/////////////////////////////////////////// +int level_rigthmost(bp *b, int d) +{ + int t; + if (d < 1) return -1; + if (d == 1) return 0; + t = bwd_excess(b,0,d-1); + return find_open(b,t); +} + +/////////////////////////////////////////// +// leaf_size(bp *b, int s) +/////////////////////////////////////////// +int leaf_size(bp *b, int s) +{ + int t; + if ((b->opt & OPT_LEAF) == 0) { + printf("leaf_size: error!!! not supported\n"); + return -1; + } + t = find_close(b,s); + return leaf_rank(b,t) - leaf_rank(b,s); +} diff --git a/bp.h b/bp.h new file mode 100644 index 0000000..30c841e --- /dev/null +++ b/bp.h @@ -0,0 +1,158 @@ +#include +#include +#include "darray.h" + +#define OP 1 +#define CP 0 + +#define OPT_MIN 0 +#define OPT_MAX 1 +#define OPT_LEFT 0 +#define OPT_RIGHT 2 + +#define OPT_LEAF (1<<0) +#define OPT_INORDER (1<<1) +#define OPT_DEGREE (1<<2) +#define OPT_FAST_PREORDER_SELECT (1<<3) +#define OPT_FAST_LEAF_SELECT (1<<4) +#define OPT_FAST_INORDER_SELECT (1<<5) +#define OPT_FAST_POSTORDER_SELECT (1<<6) +#define OPT_DFUDS_LEAF (1<<7) +#define OPT_FAST_DFUDS_LEAF_SELECT (1<<8) + +//#define logSB 9 +#define logSB 7 +//#define logSB 2 +#define SB (1<(y)?(x):(y)) +#endif + + +typedef struct { + int n; + pb *B; + darray *da; + byte *sm, *sM; + byte *sd; + int *mm, *mM; + int *md; + int m_ofs; + + darray *da_leaf; + darray *da_inorder; + darray *da_postorder; + darray *da_dfuds_leaf; + int idx_size; + int opt; +} bp; + +int bp_construct(bp *b,int n, pb *B, int opt); +void printbp(bp *b, int s, int t); + + +int rank_open(bp *b, int s); +int rank_close(bp *b, int s); +int select_open(bp *b, int s); +int select_close(bp *b, int s); + + +int root_node(bp *b); +int find_close(bp *b,int s); +int find_open(bp *b,int s); +int enclose(bp *b,int s); +int parent(bp *b,int s); +int level_ancestor(bp *b,int s,int d); +int depth(bp *b, int s); +int preorder_rank(bp *b,int s); +int postorder_rank(bp *b,int s); +int inspect(bp *b, int s); +int isleaf(bp *b, int s); +int rmq(bp *b, int s, int t, int opt); +int subtree_size(bp *b, int s); +int first_child(bp *b, int s); +int next_sibling(bp *b, int s); +int prev_sibling(bp *b, int s); +int deepest_node(bp *b,int s); +int subtree_height(bp *b,int s); +int is_ancestor(bp *b, int s, int t); +int distance(bp *b, int s, int t); +int level_lefthmost(bp *b, int d); +int level_rigthmost(bp *b, int d); +int degree(bp *b,int s); + +// not efficient +int naive_degree(bp *b, int s); +int naive_child(bp *b, int s, int d); +int naive_child_rank(bp *b, int t); +int naive_rmq(bp *b, int s, int t,int opt); +int postorder_select(bp *b,int s); + +// using preorder select index +int preorder_select(bp *b,int s); + +// using leaf index +int leaf_rank(bp *b,int s); +int leaf_size(bp *b, int s); +int leftmost_leaf(bp *b, int s); +int rightmost_leaf(bp *b, int s); + +// using leaf select index +int leaf_select(bp *b,int s); + +// using inorder index +int inorder_rank(bp *b,int s); + +// using inorder select index +int inorder_select(bp *b,int s); + +// using degree index +int fast_degree(bp *b,int s, int t, int ith); +int child_rank(bp *b, int t); +int child(bp *b, int s, int d); + + +// new functions for persistence purposes, added by Diego Arroyuelo +void saveTree(bp *b, FILE *fp); +void loadTree(bp *b, FILE *fp); +void destroyTree(bp *b); + +int blog(int x); +pb getpat_preorder(pb *b); +pb getpat_leaf(pb *b); +pb getpat_inorder(pb *b); +pb getpat_postorder(pb *b); + + +int fwd_excess(bp *b,int s, int rel); +int bwd_excess(bp *b,int s, int rel); + +extern int *matchtbl,*parenttbl; +void make_naivetbl(pb *B,int n); + +extern int popCount[1< +#include +#include "bp.h" + +#define NOTFOUND -2 +#define CONTINUE -3 +#define END -4 +#define FOUND -5 + +#define SBid(i) ((i)>>logSB) +#define SBfirst(i) ((i) & (~(SB-1))) +#define SBlast(i) ((i) | (SB-1)) + +#define MBid(i) ((i)>>logMB) +#define MBfirst(i) ((i) & (~(MB-1))) +#define MBlast(i) ((i) | (MB-1)) + + +int blog(int x) +{ +int l; + l = 0; + while (x>0) { + x>>=1; + l++; + } + return l; +} + + +pb getpat_preorder(pb *b) +{ + return *b; +} + +pb getpat_postorder(pb *b) +{ + return ~(*b); +} + +pb getpat_leaf(pb *b) +{ + pb w1,w2,w; + w1 = b[0]; + w2 = (w1 << 1) + (b[1] >> (D-1)); + w = w1 & (~w2); + return w; +} + +pb getpat_inorder(pb *b) +{ + pb w1,w2,w; + w1 = b[0]; + w2 = (w1 << 1) + (b[1] >> (D-1)); + w = (~w1) & w2; + return w; +} + +pb getpat_dfuds_leaf(pb *b) +{ + pb w1,w2,w; + w1 = b[0]; + w2 = (w1 << 1) + (b[1] >> (D-1)); + w = (~w1) & (~w2); + return w; +} + + + +/////////////////////////////////////////// +// depth(bp *b, int s) +// returns the depth of s +// The root node has depth 1 +/////////////////////////////////////////// +int depth(bp *b, int s) +{ + int d; + if (s < 0) return 0; + d = 2 * darray_rank(b->da,s) - (s+1); +#if 0 + if (d != naive_depth(b,s)) { + d = naive_depth(b,s); + darray_rank(b->da,s); + } + //printf("depth(%d)=%d\n",s,d); +#endif + return d; +} + +int fast_depth(bp *b, int s) +{ + int d; + darray *da; + int r,j; + + s++; + if ((s & (RRR-1)) != 0) { + //printf("fast_depth:warning s=%d\n",s); + return depth(b,s); + } + da = b->da; + //d = 2 * (da->rl[s>>logR] + da->rm[s>>logRR] + da->rs[s>>logRRR]) - s; + + r = da->rl[s>>logR] + da->rm[s>>logRR]; + j = (s>>logRRR) & (RR/RRR-1); + while (j > 0) { + r += da->rs[((s>>logRR)<<(logRR-logRRR))+j-1]; + j--; + } + d = 2 * r - s; + + return d; +} + +int search_SB_r(bp *b, int i, int rel) +{ + int j,r,n,il; + pb *p,x,w; + + n = b->n; + il = min((SBid(i) + 1) << logSB,n); + p = &b->B[i>>logD]; + while (i0) { + w = (x >> (D-ETW)) & ((1<= -ETW && rel <= ETW) { + r = fwdtbl[((rel+ETW)<= n) return NOTFOUND; + return i+r; + } + } + r = min(j,ETW); + rel -= 2*popCount[w]-r; + x <<= r; + i += r; + j -= r; + } + } + return CONTINUE; +} + +int search_MB_r(bp *b, int i, int td) +{ + int il,d; + int m,M,n; + pb *B; + + B = b->B; + n = b->n; + + il = min((MBid(i) + 1) << logMB,n); + for ( ; i < il; i+=SB) { +#if (SB % RRR != 0) + d = depth(b,i-1); +#else + d = fast_depth(b,i-1); +#endif + m = d + b->sm[SBid(i)] - SB; + M = d + b->sM[SBid(i)] - 1; + if (m <= td && td <= M) { + return search_SB_r(b,i,td-d); + } + } + if (i >= n) return NOTFOUND; + return CONTINUE; +} + +/////////////////////////////////////////// +// fwd_excess(bp *b,int s, int rel) +// find the leftmost value depth(s)+rel to the right of s (exclusive) +/////////////////////////////////////////// +int fwd_excess(bp *b,int s, int rel) +{ + int i,n; + int d,td; + int m,M; + int m_ofs; + pb *B; + n = b->n; B = b->B; + + i = s+1; +#if 1 + d = search_SB_r(b,i,rel); + if (d >= NOTFOUND) return d; + + i = min((SBid(i) + 1) << logSB,n); + td = depth(b,s) + rel; + d = search_MB_r(b,i,td); + if (d >= NOTFOUND) return d; +#else + if (i != SBfirst(i)) { + d = search_SB_r(b,i,rel); + if (d >= NOTFOUND) return d; + } + + td = depth(b,s) + rel; + + i = SBid(i+SB-1) << logSB; + + if (i != MBfirst(i)) { + d = search_MB_r(b,i,td); + if (d >= NOTFOUND) return d; + } +#endif + + m_ofs = b->m_ofs; + i = MBid(s) + m_ofs; + while (i > 0) { + if ((i&1) == 0) { + i++; + m = b->mm[i]; + M = b->mM[i]; + if (m <= td && td <= M) break; + } + i >>= 1; + } + if (i == 0) return NOTFOUND; + + while (i < m_ofs) { + i <<= 1; + m = b->mm[i]; + M = b->mM[i]; + if (!(m <= td && td <= M)) i++; + } + i -= m_ofs; + i <<= logMB; + + d = search_MB_r(b,i,td); + if (d >= NOTFOUND) return d; + + // unexpected (bug) + printf("fwd_excess: ???\n"); + return -99; + +} + + +int degree_SB_slow(bp *b, int i, int t, int rel, int *ans, int ith) +{ + int j,r,n,il; + pb *p,x,w,w2; + int d, deg, v; + + n = t; + il = min((SBid(i) + 1) << logSB,n); + d = deg = 0; + + while (i < il) { + if (getbit(b->B,i)==OP) { + d++; + } else { + d--; + } + if (d < rel) { // reached the end + if (ith > 0) { + return NOTFOUND; + } else { + *ans = deg; + return END; + } + } + if (d == rel) { // found the same depth + deg++; + if (deg == ith) { + *ans = i; + return FOUND; + } + } + i++; + } + *ans = deg; + return CONTINUE; +} + +int degree_SB(bp *b, int i, int t, int rel, int *ans, int ith) +{ + int j,r,n,il; + pb *p,x,w,w2; + int d, deg, v; + int degtmp,itmp; + int ith2,d2; + + n = t; + il = min((SBid(i) + 1) << logSB,n); + d = deg = 0; + + p = &b->B[i>>logD]; + while (i < il) { + x = *p++; + j = i & (D-1); + x <<= j; + j = min(D-j,il-i); + while (j>0) { + w = (x >> (D-ETW)) & ((1< 0) { +#if 0 + for (r = 0; r < ETW; r++) { + if (w & 0x80) { + d++; + } else { + d--; + if (d < rel) break; + if (d == rel) { + ith--; + if (ith == 0) { + *ans = i + r; + return FOUND; + } + } + } + w <<= 1; + } + return NOTFOUND; +#else + r = childtbl2[rel-d+ETW][ith-1][w]; + if (r >= 0) { + *ans = i + r; + return FOUND; + } + return NOTFOUND; +#endif + } + r = ETW-1-minmaxtbl_i[0][w | w2]; + w2 = (1< 0) { + if (ith <= r) { + *ans = i + childtbl[((ith-1)<B; + n = t; + + il = min((MBid(i) + 1) << logMB,n); + deg = 0; + for ( ; i+SB-1 < il; i+=SB) { +#if (SB % RRR != 0) + d = depth(b,i-1); +#else + d = fast_depth(b,i-1); +#endif + m = d + b->sm[SBid(i)] - SB; + if (m < td) { + r = degree_SB(b,i,n,td-d,°tmp,ith); + if (ith > 0) { + if (r == NOTFOUND) return NOTFOUND; + *ans = degtmp; + return FOUND; + } else { + *ans = deg + degtmp; + return END; + } + } + if (m == td) { + if (ith > 0) { + if (ith <= b->sd[SBid(i)]) break; + ith -= b->sd[SBid(i)]; + } + deg += b->sd[SBid(i)]; + } + } + if (i < il) { +#if (SB % RRR != 0) + d = depth(b,i-1); +#else + d = fast_depth(b,i-1); +#endif + r = degree_SB(b,i,n,td-d,°tmp,ith); + if (ith > 0) { + if (r == NOTFOUND) return NOTFOUND; + if (r == FOUND) { + *ans = degtmp; + return FOUND; + } + } else { + deg += degtmp; + } + } + *ans = deg; + if (i >= n) return END; + return CONTINUE; +} + +static int partition_range(int s,int t) +{ + int i,j,h; + + printf("partition [%d,%d] => ",s,t); + h = 1; + while (s <= t) { + if (s & h) { + if (s+h-1 <= t) { + printf("[%d,%d] ",s,s+h-1); + s += h; + } + } else { + if (s+h > t) break; + } + h <<= 1; + } + while (h > 0) { + if (s+h-1 <= t) { + printf("[%d,%d] ",s,s+h-1); + s += h; + } + h >>= 1; + } + printf("\n"); +} + + + + +/////////////////////////////////////////// +// fast_degree(bp *b,int s, int t, int ith) +// returns the number of children of s, to the left of t +// returns the position of (ith)-th child of s if (ith > 0) +/////////////////////////////////////////// +int fast_degree(bp *b,int s, int t, int ith) +{ + int i,j,n; + int d,td; + int m,M; + int m_ofs; + pb *B; + int deg,degtmp; + int sm,tm,ss,h; + + n = t; + B = b->B; + + deg = 0; + + i = s+1; + if (i != SBfirst(i)) { + d = degree_SB(b,i,n,0,°tmp,ith); + if (ith > 0) { + if (d == NOTFOUND) return -1; + if (d == FOUND) return degtmp; + ith -= degtmp; + } + if (d == END) return degtmp; + deg += degtmp; + } + + td = depth(b,s); + + i = SBid(i+SB-1) << logSB; + + if (i != MBfirst(i)) { + d = degree_MB(b,i,n,td,°tmp,ith); + if (ith > 0) { + if (d == NOTFOUND) return -1; + if (d == FOUND) return degtmp; + ith -= degtmp; + deg += degtmp; + } else { + deg += degtmp; + if (d == END) { + return deg; + } + } + } + +#if 0 + // sequential search + + sm = MBid(i+MB-1); + tm = MBid((n-1)+1)-1; // the rightmost MB fully contained in [0,n-1] + + m_ofs = b->m_ofs; + sm += m_ofs; tm += m_ofs; + for (i=sm; i<=tm; i++) { + if (b->mm[i] < td) { + break; + } + if (b->mm[i] == td) { + if (ith > 0) { + if (ith <= b->md[i]) break; + ith -= b->md[i]; + } + deg += b->md[i]; + } + } + ss = i - m_ofs; +#else + sm = MBid(i+MB-1); + tm = MBid((n-1)+1)-1; // the rightmost MB fully contained in [0,n-1] + + m_ofs = b->m_ofs; + sm += m_ofs; tm += m_ofs; + ss = sm; + + //partition_range(sm,tm); + + //printf("partition [%d,%d] => ",sm,tm); + h = 1; + while (sm <= tm) { + if (sm & h) { + if (sm+h-1 <= tm) { + //printf("[%d,%d] ",sm,sm+h-1); + j = sm / h; + if (b->mm[j] < td) { + h >>= 1; + break; + } + if (b->mm[j] == td) { + if (ith > 0) { + if (ith <= b->md[j]) { + h >>= 1; + break; + } + ith -= b->md[j]; + } + deg += b->md[j]; + } + sm += h; + } + } else { + if (sm+h > tm) break; + } + h <<= 1; + } + while (h > 0) { + if (sm+h-1 <= tm) { + //printf("[%d,%d] ",sm,sm+h-1); + j = sm / h; + if (ith > 0) { + if (b->mm[j] >= td) { + if (b->mm[j] == td) { + if (ith > b->md[j]) { + ith -= b->md[j]; + sm += h; + } else { + deg += b->md[j]; + } + } else { + sm += h; + } + } + } else { + if (b->mm[j] >= td) { + if (b->mm[j] == td) { + deg += b->md[j]; + } + sm += h; + } + } + } + h >>= 1; + } + //printf("\n"); + ss = sm; + + ss -= m_ofs; + +#endif + + ss <<= logMB; + + d = degree_MB(b,ss,n,td,°tmp,ith); + if (ith > 0) { + if (d == NOTFOUND) return -1; + if (d == FOUND) return degtmp; + } + deg += degtmp; + if (d == END) return deg; + return deg; + + // unexpected (bug) + printf("degree: ???\n"); + return -99; + +} + +int search_SB_l(bp *b, int i, int rel) +{ + int j,r,il; + pb *p,x,w; + + il = SBfirst(i); + + p = &b->B[i>>logD]; + while (i>=il) { + x = *p--; + j = (i & (D-1))+1; + x >>= D-j; + while (j>0) { + w = x & ((1<= -ETW && rel <= ETW) { + r = bwdtbl[((rel+ETW)<>= r; + i -= r; + j -= r; + } + + } + if (i < 0) return NOTFOUND; + return CONTINUE; +} + +int search_MB_l(bp *b, int i, int td) +{ + int il,d; + int m,M; + pb *B; + +#if 0 + if (i % SB != SB-1) { + printf("search_MB_l:error!!! i=%d SB=%d\n",i,i%SB); + } +#endif + B = b->B; + + il = MBfirst(i); + for ( ; i >= il; i-=SB) { +#if (SB % RRR != 0) + d = depth(b,i-SB); +#else + d = fast_depth(b,i-SB); +#endif + m = d + b->sm[SBid(i)] - SB; + M = d + b->sM[SBid(i)] - 1; + if (m <= td && td <= M) { +#if (SB % RRR != 0) + d = depth(b,i); +#else + d = fast_depth(b,i); +#endif + if (d == td) return i; + return search_SB_l(b,i,td-d); + } + } + return CONTINUE; +} + +/////////////////////////////////////////// +// bwd_excess(bp *b,int s, int rel) +// find the rightmost value depth(s)+rel to the left of s (exclusive) +/////////////////////////////////////////// +int bwd_excess(bp *b,int s, int rel) +{ + int i,n; + int d,td; + int m,M; + int m_ofs; + pb *B; + n = b->n; B = b->B; + + i = s; + d = search_SB_l(b,i,rel); + if (d >= NOTFOUND) return d; + + i = SBfirst(i) -1; + + td = depth(b,s) + rel; + + d = search_MB_l(b,i,td); + if (d >= NOTFOUND) return d; + + m_ofs = b->m_ofs; + i = (s>>logMB) + m_ofs; + while (i > 0) { + if ((i&1) == 1) { + i--; + m = b->mm[i]; + M = b->mM[i]; + if (m <= td && td <= M) break; + } + i >>= 1; + } + if (i == 0) { + if (td == 0) return -1; + else return NOTFOUND; + } + + while (i < m_ofs) { + i = i*2 + 1; + m = b->mm[i]; + M = b->mM[i]; + if (!(m <= td && td <= M)) i--; + } + i -= m_ofs; + i = ((i+1)<= NOTFOUND) return d; + + // unexpected (bug) + printf("bwd_excess: ???\n"); + return -99; + +} + +int rmq_SB(bp *b, int s, int t, int opt, int *dm) +{ + int i,d; + int is,ds; + pb *p,x,w,w2; + int j,v,r; + int lr; + int op; + + lr = 0; if (opt & OPT_RIGHT) lr = 1; + op = opt & (OPT_RIGHT | OPT_MAX); + + is = s; ds = d = 0; + i = s+1; + +#if SB >= ETW + p = &b->B[i>>logD]; + while (i <= t) { + x = *p++; + j = i & (D-1); + x <<= j; + j = min(D-j,t-i+1); + while (j>0) { + w = (x >> (D-ETW)) & ((1< ds) { + ds = d + v; + is = i + minmaxtbl_i[op][w & (~w2)]; + } + } else { + v = minmaxtbl_v[op][w | w2]; + if (d + v < ds +lr) { + ds = d + v; + is = i + minmaxtbl_i[op][w | w2]; + } + } + + r = min(j,ETW); + d += 2*popCount[w]-r; + x <<= r; + i += r; + j -= r; + } + } +#else + while (i <= t) { + if (getbit(b->B,i)==OP) { + d++; + if (op & OPT_MAX) { + if (d + lr > ds) { + ds = d; is = i; + } + } + } else { + d--; + if (!(op & OPT_MAX)) { + if (d < ds + lr) { + ds = d; is = i; + } + } + } + i++; + } +#endif + *dm = ds; + return is; +} + +int rmq_MB(bp *b, int s, int t, int opt, int *dm) +{ + int i,d,m; + int mi,md; + int lr; + + lr = 0; if (opt & OPT_RIGHT) lr = 1; + + md = *dm; mi = -1; + for (i = s; i <= t; i++) { +#if (SB % RRR != 0) + d = depth(b,(i<sM[i] - 1; + if (m + lr > md) { + md = m; mi = i; + } + } else { + m = d + b->sm[i] - SB; + if (m < md + lr) { + md = m; mi = i; + } + } + } + *dm = md; + return mi; +} + + + + +/////////////////////////////////////////// +// rmq(bp *b, int s, int t, int opt) +// returns the index of leftmost/rightmost minimum/maximum value +// in the range [s,t] (inclusive) +// returns the leftmost minimum if opt == 0 +// the rightmost one if (opt & OPT_RIGHT) +// the maximum if (opt & OPT_MAX) +/////////////////////////////////////////// +int rmq(bp *b, int s, int t, int opt) +{ + int ss, ts; // SB index of s and t + int sm, tm; // MB index of s and t + int ds; // current min value + int is; // current min index + int ys; // type of current min index + // 0: is is the index of min + // 1: is is the SB index + // 2: is is the MB index + int m_ofs; + int i,j,il,d,n; + int dm; + int lr; + + lr = 0; if (opt & OPT_RIGHT) lr = 1; + + n = b->n; + if (s < 0 || t >= n || s > t) { + printf("rmq: error s=%d t=%d n=%d\n",s,t,n); + return -1; + } + if (s == t) return s; + + + //////////////////////////////////////////////////////////// + // search the SB of s + //////////////////////////////////////////////////////////// + + il = min(SBlast(s),t); + is = rmq_SB(b,s,il,opt,&dm); + if (il == t) { // scan reached the end of the range + return is; + } + ds = depth(b,s) + dm; ys = 0; + + //////////////////////////////////////////////////////////// + // search the MB of s + //////////////////////////////////////////////////////////// + + ss = SBid(s) + 1; + il = min(SBid(MBlast(s)),SBid(t)-1); + dm = ds; + j = rmq_MB(b,ss,il,opt,&dm); + //if (dm < ds + lr) { + if (j >= 0) { + ds = dm; is = j; ys = 1; + } + + //////////////////////////////////////////////////////////// + // search the tree + //////////////////////////////////////////////////////////// + + sm = MBid(s) + 1; + tm = MBid(t) - 1; + +#if 0 + // sequential search + m_ofs = b->m_ofs; + sm += m_ofs; tm += m_ofs; + for (i=sm; i<=tm; i++) { + if (opt & OPT_MAX) { + if (b->mM[i] + lr > ds) { + ds = b->mM[i]; is = i - m_ofs; ys = 2; + } + } else { + if (b->mm[i] < ds + lr) { + ds = b->mm[i]; is = i - m_ofs; ys = 2; + } + } + } + +#else + if (sm <= tm) { + int h; + h = blog(sm ^ tm); + + m_ofs = b->m_ofs; + sm += m_ofs; tm += m_ofs; + + if (opt & OPT_MAX) { + if (b->mM[sm] + lr > ds) { + ds = b->mM[sm]; is = sm; ys = 2; + } + for (i=0; i<=h-2; i++) { + j = sm>>i; + if ((j&1) == 0) { + if (b->mM[j+1] + lr > ds) { + ds = b->mM[j+1]; is = j+1; ys = 2; + } + } + } + for (i=h-2; i>=0; i--) { + j = tm>>i; + if ((j&1)==1) { + if (b->mM[j-1] + lr > ds) { + ds = b->mM[j-1]; is = j-1; ys = 2; + } + } + } + if (b->mM[tm] + lr > ds) { + ds = b->mM[tm]; is = tm; ys = 2; + } + if (ys == 2) { + while (is < m_ofs) { + is <<= 1; + if (b->mM[is+1] + lr > b->mM[is]) is++; + } + is -= m_ofs; + } + } else { // MIN + if (b->mm[sm] < ds + lr) { + ds = b->mm[sm]; is = sm; ys = 2; + } + for (i=0; i<=h-2; i++) { + j = sm>>i; + if ((j&1) == 0) { + if (b->mm[j+1] < ds + lr) { + ds = b->mm[j+1]; is = j+1; ys = 2; + } + } + } + for (i=h-2; i>=0; i--) { + j = tm>>i; + if ((j&1)==1) { + if (b->mm[j-1] < ds + lr) { + ds = b->mm[j-1]; is = j-1; ys = 2; + } + } + } + if (b->mm[tm] < ds + lr) { + ds = b->mm[tm]; is = tm; ys = 2; + } + if (ys == 2) { + while (is < m_ofs) { + is <<= 1; + if (b->mm[is+1] < b->mm[is] + lr) is++; + } + is -= m_ofs; + } + } + } + +#endif + + //////////////////////////////////////////////////////////// + // search the MB of t + //////////////////////////////////////////////////////////// + + + ts = max(SBid(MBfirst(t)),SBid(s)+1); + il = SBid(SBfirst(t)-1); + dm = ds; + j = rmq_MB(b,ts,il,opt,&dm); + //if (dm < ds + lr) { + if (j >= 0) { + ds = dm; is = j; ys = 1; + } + + //////////////////////////////////////////////////////////// + // search the SB of t + //////////////////////////////////////////////////////////// + + i = SBfirst(t); + j = rmq_SB(b,i,t,opt,&dm); + d = depth(b,i) + dm; + if (opt & OPT_MAX) { + if (d + lr > ds) { + ds = d; is = j; ys = 0; + } + } else { + if (d < ds + lr) { + ds = d; is = j; ys = 0; + } + } + + //////////////////////////////////////////////////////////// + // search the rest + //////////////////////////////////////////////////////////// + + if (ys == 2) { + ss = SBid(is << logMB); + il = SBid(MBlast(is << logMB)); + if (opt & OPT_MAX) { + dm = -n-1; + } else { + dm = n+1; + } + j = rmq_MB(b,ss,il,opt,&dm); + ds = dm; is = j; ys = 1; + } + + if (ys == 1) { + ss = is << logSB; + il = SBlast(is << logSB); + is = rmq_SB(b,ss,il,opt,&dm); + //ds = depth(b,ss) + dm; ys = 0; + } + + return is; +} + diff --git a/darray.c b/darray.c new file mode 100644 index 0000000..7d302c4 --- /dev/null +++ b/darray.c @@ -0,0 +1,701 @@ +#include +#include +#include "darray.h" + +//typedef unsigned char byte; +//typedef unsigned short word; +//typedef unsigned int dword; + +//typedef dword pb; +#define logD 5 + +#define PBS (sizeof(pb)*8) +#define D (1<>(d-j-1))&1); + } + return x; +} + +int getbit(pb *B, int i) +{ + int j,l; + + //j = i / D; + //l = i % D; + j = i >> logD; + l = i & (D-1); + return (B[j] >> (D-1-l)) & 1; +} + +dword getbits(pb *B, int i, int d) +{ + dword j,x; + + x = 0; + for (j=0; j>(k-1-j))); + } + //printf("getpattern(%d) = %d\n",i,x); + return x; +} + + +static const unsigned int popCount[] = { +0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4, +1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, +1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, +2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, +1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, +2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, +2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, +3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, +1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, +2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, +2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, +3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, +2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, +3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, +3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, +4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8 +}; + +static int selecttbl[8*256]; + +unsigned int popcount(pb x) +{ + pb r; +#if 0 + r = x; + r = r - ((r>>1) & 0x77777777) - ((r>>2) & 0x33333333) - ((r>>3) & 0x11111111); + r = ((r + (r>>4)) & 0x0f0f0f0f) % 0xff; +#elif 1 + r = x; + r = ((r & 0xaaaaaaaa)>>1) + (r & 0x55555555); + r = ((r & 0xcccccccc)>>2) + (r & 0x33333333); + //r = ((r & 0xf0f0f0f0)>>4) + (r & 0x0f0f0f0f); + r = ((r>>4) + r) & 0x0f0f0f0f; + //r = ((r & 0xff00ff00)>>8) + (r & 0x00ff00ff); + r = (r>>8) + r; + //r = ((r & 0xffff0000)>>16) + (r & 0x0000ffff); + r = ((r>>16) + r) & 63; +#else + r = popCount[x & 0xff]; + x >>= 8; + r += popCount[x & 0xff]; + x >>= 8; + r += popCount[x & 0xff]; + x >>= 8; + r += popCount[x & 0xff]; +#endif + return r; +} + +unsigned int popcount8(pb x) +{ + dword r; +#if 1 + r = x; + r = ((r & 0xaa)>>1) + (r & 0x55); + r = ((r & 0xcc)>>2) + (r & 0x33); + r = ((r>>4) + r) & 0x0f; +#else + r = popCount[x & 0xff]; +#endif + return r; +} + +void make_selecttbl(void) +{ + int i,x,r; + pb buf[1]; + + for (x = 0; x < 256; x++) { + setbits(buf,0,8,x); + for (r=0; r<8; r++) selecttbl[(r<<8)+x] = -1; + r = 0; + for (i=0; i<8; i++) { + if (getbit(buf,i)) { + selecttbl[(r<<8)+x] = i; + r++; + } + } + } +} + + +int darray_construct(darray *da, int n, pb *buf, int opt) +{ + int i,j,k,m; + int nl; + int p,pp; + int il,is,ml,ms; + int r,m2; + dword *s; + int p1,p2,p3,p4,s1,s2,s3,s4; + + da->idx_size = 0; + + make_selecttbl(); + + + if (L/LLL == 0) { + printf("ERROR: L=%d LLL=%d\n",L,LLL); + exit(1); + } + + m = 0; + for (i=0; in = n; + da->m = m; + //printf("n=%d m=%d\n",n,m); + + da->buf = buf; + + if (opt & (~OPT_NO_RANK)) { // construct select table +#if 0 + mymalloc(s,m,0); + m = 0; + for (i=0; ilp,nl+1,1); da->idx_size += (nl+1)*sizeof(*da->lp); + mymalloc(da->p,nl+1,1); da->idx_size += (nl+1)*sizeof(*da->p); +#if 0 + printf("lp table: %d bytes (%1.2f bpc)\n",(nl+1)*sizeof(*da->lp), (double)(nl+1)*sizeof(*da->lp) * 8/n); + printf("p table: %d bytes (%1.2f bpc)\n",(nl+1)*sizeof(*da->p), (double)(nl+1)*sizeof(*da->p) * 8/n); +#endif + + for (r = 0; r < 2; r++) { + s1 = s2 = s3 = s4 = 0; + p1 = p2 = p3 = p4 = -1; + + ml = ms = 0; + for (il = 0; il < nl; il++) { + //pp = s[il*L]; + while (s1 <= il*L) { + if (getbit(buf,p1+1)) s1++; + p1++; + } + pp = p1; + da->lp[il] = pp; + i = min((il+1)*L-1,m-1); + //p = s[i]; + while (s2 <= i) { + if (getbit(buf,p2+1)) s2++; + p2++; + } + p = p2; + //printf("%d ",p-pp); + if (p - pp >= LL) { + if (r == 1) { + for (is = 0; is < L; is++) { + if (il*L+is >= m) break; + //da->sl[ml*L+is] = s[il*L+is]; + while (s3 <= il*L+is) { + if (getbit(buf,p3+1)) s3++; + p3++; + } + da->sl[ml*L+is] = p3; + } + } + da->p[il] = -(ml+1); + ml++; + } else { + if (r == 1) { + for (is = 0; is < L/LLL; is++) { + if (il*L+is*LLL >= m) break; + while (s4 <= il*L+is*LLL) { + if (getbit(buf,p4+1)) s4++; + p4++; + } + //da->ss[ms*(L/LLL)+is] = s[il*L+is*LLL] - pp; + da->ss[ms*(L/LLL)+is] = p4 - pp; + } + } + da->p[il] = ms; + ms++; + } + } + if (r == 0) { + mymalloc(da->sl,ml*L+1,1); da->idx_size += (ml*L+1)*sizeof(*da->sl); + mymalloc(da->ss,ms*(L/LLL)+1,1); da->idx_size += (ms*(L/LLL)+1)*sizeof(*da->ss); +#if 0 + printf("sl table: %d bytes (%1.2f bpc)\n",(ml*L+1)*sizeof(*da->sl), (double)(ml*L+1)*sizeof(*da->sl) * 8/n); + printf("ss table: %d bytes (%1.2f bpc)\n",(ms*(L/LLL)+1)*sizeof(*da->ss), (double)(ms*(L/LLL)+1)*sizeof(*da->ss) * 8/n); +#endif + } + } + //free(s); + } else { // no select table + da->lp = NULL; + da->p = da->sl = NULL; + da->ss = NULL; + } + + // construct rank table + + if ((opt & OPT_NO_RANK) == 0) { + mymalloc(da->rl,n/R1+2,1); da->idx_size += (n/R1+2)*sizeof(*da->rl); + mymalloc(da->rm,n/RR+2,1); da->idx_size += (n/RR+2)*sizeof(*da->rm); + mymalloc(da->rs,n/RRR+2,1); da->idx_size += (n/RRR+2)*sizeof(*da->rs); +#if 0 + printf("rl table: %d bytes (%1.2f bpc)\n",(n/R1+2)*sizeof(*da->rl), (double)(n/R1+2)*sizeof(*da->rl) * 8/n); + printf("rm table: %d bytes (%1.2f bpc)\n",(n/RR+2)*sizeof(*da->rm), (double)(n/RR+2)*sizeof(*da->rm) * 8/n); + printf("rs table: %d bytes (%1.2f bpc)\n",(n/RRR+2)*sizeof(*da->rs), (double)(n/RRR+2)*sizeof(*da->rs) * 8/n); +#endif + r = 0; + for (k=0; k<=n; k+=R1) { + da->rl[k/R1] = r; + m2 = 0; + for (i=0; irm[(k+i)/RR] = m2; + m = 0; + for (j=0; jrs[(k+i+j)/RRR] = m; + m2 += m; + m = 0; + } + } + if (m != 0) { + printf("???\n"); + } + //m2 += m; + } + r += m2; + } + } + + return 0; +} + +// destroyDarray: frees the memory of darray +// Added by Diego Arroyuelo +void destroyDarray(darray *da) { + if (!da) return; // nothing to free + + if (da->buf) free(da->buf); + if (da->lp) free(da->lp); + if (da->sl) free(da->sl); + if (da->ss) free(da->ss); + if (da->p) free(da->p); + if (da->rl) free(da->rl); + if (da->rm) free(da->rm); + if (da->rs) free(da->rs); +} + +int darray_rank0(darray *da, int i) +{ + int r,j; + pb *p; + +#if (RRR == D*2) + r = da->rl[i>>logR] + da->rm[i>>logRR] + da->rs[i>>logRRR]; + p = da->buf + ((i>>logRRR)<<(logRRR-logD)); + j = i & (RRR-1); + if (j < D) r += popcount(*p >> (D-1-j)); + else r += popcount(*p) + popcount(p[1] >> (D-1-(j-D))); +#else + + j = i & (RRR-1); + if (j < RRR/2) { + r = da->rl[i>>logR] + da->rm[i>>logRR] + da->rs[i>>logRRR]; + p = da->buf + ((i>>logRRR)<<(logRRR-logD)); + while (j >= D) { + r += popcount(*p++); + j -= D; + } + r += popcount(*p >> (D-1-j)); + } else { + j = RRR-1 - (i & (RRR-1)); + i += j+1; + r = da->rl[i>>logR] + da->rm[i>>logRR] + da->rs[i>>logRRR]; + p = da->buf + ((i>>logRRR)<<(logRRR-logD)); + while (j >= D) { + r -= popcount(*--p); + j -= D; + } + if (j > 0) r -= popcount(*--p << (D-j)); + } + +#endif + + return r; +} + +int darray_rank(darray *da, int i) +{ + int r,j; + pb *p; + + r = da->rl[i>>logR] + da->rm[i>>logRR]; + j = (i>>logRRR) & (RR/RRR-1); + while (j > 0) { + r += da->rs[((i>>logRR)<<(logRR-logRRR))+j-1]; + j--; + } + + p = da->buf + ((i>>logRRR)<<(logRRR-logD)); + j = i & (RRR-1); + while (j >= D) { + r += popcount(*p++); + j -= D; + } + r += popcount(*p >> (D-1-j)); + + return r; +} + +int darray_select_bsearch(darray *da, int i, pb (*getpat)(pb *)) +{ + int j; + int l,r,m,n; + int s; + int t,x,rr; + pb *p,w; + + // for debug + //s = darray_select(da,i,1); + // + //printf("select(%d)=%d\n",i,s); + + + + if (i == 0) return -1; + + if (i > da->m) { + return -1; + } + + i--; + + + + n = da->m; + + t = i; + + l = 0; r = (n-1)>>logR; + // find the smallest index x s.t. rl[x] >= t + while (l < r) { + m = (l+r)/2; + //printf("m=%d rl[m+1]=%d t=%d\n",m,da->rl[m+1],t); + if (da->rl[m+1] > t) { // m+1 is out of range + r = m; // new r = m >= l + } else { + l = m+1; // new l = m+1 <= r + } + } + x = l; + t -= da->rl[x]; + + x <<= logR; + + l = x >> logRR; r = (min(x+R1-1,n))>>logRR; + while (l < r) { + m = (l+r)/2; + //printf("m=%d rm[m+1]=%d t=%d\n",m,da->rm[m+1],t); + if (da->rm[m+1] > t) { // m+1 is out of range + r = m; + } else { + l = m+1; // new l = m+1 <= r + } + } + x = l; + t -= da->rm[x]; + + x <<= logRR; + +#if 0 + l = x >> logRRR; r = (min(x+RR-1,n))>>logRRR; + while (l < r) { + m = (l+r)/2; + //printf("m=%d rs[m+1]=%d t=%d\n",m,da->rs[m+1],t); + if (da->rs[m+1] > t) { // m+1 is out of range + r = m; + } else { + l = m+1; // new l = m+1 <= r + } + } + x = l; + t -= da->rs[x]; +#else + l = x >> logRRR; + while (t > da->rs[l]) { + t -= da->rs[l]; + l++; + } + x = l; +#endif + + x <<= logRRR; + + p = &da->buf[x >> logD]; + while (1) { + m = popcount(getpat(p)); + if (m > t) break; + t -= m; + x += D; + p++; + } + + w = getpat(p); + while (1) { + rr = popCount[w >> (D-8)]; + if (rr > t) break; + t -= rr; + x += 8; + w <<= 8; + } + x += selecttbl[((t-0)<<8)+(w>>(D-8))]; + +#if 0 + if (x != s) { + printf("error x=%d s=%d\n",x,s); + } +#endif + return x; +} + +int darray_pat_rank(darray *da, int i, pb (*getpat)(pb *)) +{ + int r,j; + pb *p; + + r = da->rl[i>>logR] + da->rm[i>>logRR]; + j = (i>>logRRR) & (RR/RRR-1); + while (j > 0) { + r += da->rs[((i>>logRR)<<(logRR-logRRR))+j-1]; + j--; + } + + p = da->buf + ((i>>logRRR)<<(logRRR-logD)); + j = i & (RRR-1); + while (j >= D) { + r += popcount(getpat(p)); + p++; + j -= D; + } + r += popcount(getpat(p) >> (D-1-j)); + + return r; +} + + +int darray_select(darray *da, int i,int f) +{ + int p,r; + int il; + int rr; + pb x; + pb *q; + + if (i == 0) return -1; + + if (i > da->m) { + return -1; + //printf("ERROR: m=%d i=%d\n",da->m,i); + //exit(1); + } + + i--; + + il = da->p[i>>logL]; + if (il < 0) { + il = -il-1; + p = da->sl[(il<lp[i>>logL]; + p += da->ss[(il<<(logL-logLLL))+(i & (L-1))/LLL]; + r = i - (i & (LLL-1)); + + q = &(da->buf[p>>logD]); + + if (f == 1) { + rr = p & (D-1); + r -= popcount(*q >> (D-1-rr)); + p = p - rr; + + while (1) { + rr = popcount(*q); + if (r + rr >= i) break; + r += rr; + p += D; + q++; + } + + x = *q; + while (1) { + //rr = popcount(x >> (D-8)); + rr = popCount[x >> (D-8)]; + //rr = popcount8(x >> (D-8)); + if (r + rr >= i) break; + r += rr; + p += 8; + x <<= 8; + } + p += selecttbl[((i-r-1)<<8)+(x>>(D-8))]; + } else { + rr = p & (D-1); + r -= popcount((~(*q)) >> (D-1-rr)); + p = p - rr; + + while (1) { + rr = popcount(~(*q)); + if (r + rr >= i) break; + r += rr; + p += D; + q++; + } + + x = ~(*q); + + while (1) { + //rr = popcount(x >> (D-8)); + rr = popCount[x >> (D-8)]; + //rr = popcount8(x >> (D-8)); + if (r + rr >= i) break; + r += rr; + p += 8; + x <<= 8; + } + p += selecttbl[((i-r-1)<<8)+(x>>(D-8))]; + } + } + return p; +} + +int darray_pat_select(darray *da, int i, pb (*getpat)(pb *)) +{ + int p,r; + int il; + int rr; + pb x; + pb *q; + + if (i == 0) return -1; + + if (i > da->m) { + return -1; + //printf("ERROR: m=%d i=%d\n",da->m,i); + //exit(1); + } + + i--; + + il = da->p[i>>logL]; + if (il < 0) { + il = -il-1; + p = da->sl[(il<lp[i>>logL]; + p += da->ss[(il<<(logL-logLLL))+(i & (L-1))/LLL]; + r = i - (i & (LLL-1)); + + q = &(da->buf[p>>logD]); + + rr = p & (D-1); + r -= popcount(getpat(q) >> (D-1-rr)); + p = p - rr; + + while (1) { + rr = popcount(getpat(q)); + if (r + rr >= i) break; + r += rr; + p += D; + q++; + } + + x = getpat(q); + while (1) { + //rr = popcount(x >> (D-8)); + rr = popCount[x >> (D-8)]; + //rr = popcount8(x >> (D-8)); + if (r + rr >= i) break; + r += rr; + p += 8; + x <<= 8; + } + p += selecttbl[((i-r-1)<<8)+(x>>(D-8))]; + } + return p; +} + +int darray_pat_construct(darray *da, int n, pb *buf, int k, pb pat, int opt) +{ + int i; + pb *b; + mymalloc(b,(n+D-1)/D,0); + + for (i=0; ibuf = buf; + + free(b); + + return 0; +} diff --git a/darray.h b/darray.h new file mode 100644 index 0000000..0f79fae --- /dev/null +++ b/darray.h @@ -0,0 +1,53 @@ +typedef unsigned char byte; +typedef unsigned short word; +typedef unsigned int dword; + +#define OPT_NO_RANK (1<<30) + + +typedef dword pb; + +#define logD 5 +#define D (1< +using namespace std; +#include +#include + + +/** mask for obtaining the first 5 bits */ +#define mask31 0x0000001F + +/** max function */ +#define max(x,y) ((x)>(y)?(x):(y)) +/** min function */ +#define min(x,y) ((x)<(y)?(x):(y)) + + +/** number of bits in a uint */ +#define W 32 + +/** W-1 */ +#define Wminusone 31 + +/** 2W*/ +#define WW 64 + +/** number of bits per uchar */ +#define bitsM 8 + +/** number of bytes per uint */ +#define BW 4 + +/** uchar = unsigned char */ +#ifndef uchar +#define uchar unsigned char +#endif + +/** ushort = unsigned short */ +#ifndef ushort +#define ushort unsigned short +#endif + +/** ulong = unsigned long */ +#ifndef ulong +#define ulong unsigned long +#endif + +/** uint = unsigned int */ +#ifndef uint +#define uint unsigned int +#endif + +/** number of different uchar values 0..255 */ +#define size_uchar 256 + +/** popcount array for uchars */ +const unsigned char __popcount_tab[] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, +}; + +/** select array for uchars */ +const unsigned char select_tab[] = { + 0, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, + 6, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, + 7, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, + 6, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, + 8, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, + 6, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, + 7, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, + 6, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, 5, 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, +}; + +/** prev array for uchars */ +const unsigned char prev_tab[] = { + 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, +}; + + + +/** bits needed to represent a number between 0 and n */ +inline uint bits(uint n){ + uint b = 0; + while (n) { b++; n >>= 1; } + return b; +} + +/** reads bit p from e */ +#define bitget(e,p) ((((e)[(p)/W] >> ((p)%W))) & 1) + +/** sets bit p in e */ +#define bitset(e,p) ((e)[(p)/W] |= (1<<((p)%W))) + +/** cleans bit p in e */ +#define bitclean(e,p) ((e)[(p)/W] &= ~(1<<((p)%W))) + +/** uints required to represent e integers of n bits each */ +#define uint_len(e,n) (((e)*(n))/W+(((e)*(n))%W > 0)) + +/** Retrieve a given index from array A where every value uses len bits + * @param A Array + * @param len Length in bits of each field + * @param index Position to be retrieved + */ +inline uint get_field(uint *A, uint len, uint index) { + if(len==0) return 0; + register uint i=index*len/W, j=index*len-W*i, result; + if (j+len <= W) + result = (A[i] << (W-j-len)) >> (W-len); + else { + result = A[i] >> j; + result = result | (A[i+1] << (WW-j-len)) >> (W-len); + } + return result; +} + +/** Store a given value in index into array A where every value uses len bits + * @param A Array + * @param len Length in bits of each field + * @param index Position to store in + * @param x Value to be stored + */ +inline void set_field(uint *A, uint len, uint index, uint x) { + if(len==0) return; + uint i=index*len/W, j=index*len-i*W; + uint mask = ((j+len) < W ? ~0u << (j+len) : 0) + | ((W-j) < W ? ~0u >> (W-j) : 0); + A[i] = (A[i] & mask) | x << j; + if (j+len>W) { + mask = ((~0u) << (len+j-W)); + A[i+1] = (A[i+1] & mask)| x >> (W-j); + } +} + +/** Retrieve a given bitsequence from array A + * @param A Array + * @param ini Starting position + * @param fin Retrieve until end-1 + */ +inline uint get_var_field(uint *A, uint ini, uint fin) { + if(ini==fin+1) return 0; + uint i=ini/W, j=ini-W*i, result; + uint len = (fin-ini+1); + if (j+len <= W) + result = (A[i] << (W-j-len)) >> (W-len); + else { + result = A[i] >> j; + result = result | (A[i+1] << (WW-j-len)) >> (W-len); + } + return result; +} + +/** Stores a given bitsequence into array A + * @param A Array + * @param ini Starting position + * @param fin Store until end-1 + * @param x Value to be stored + */ +inline void set_var_field(uint *A, uint ini, uint fin, uint x) { + if(ini==fin+1) return; + uint i=ini/W, j=ini-i*W; + uint len = (fin-ini+1); + uint mask = ((j+len) < W ? ~0u << (j+len) : 0) + | ((W-j) < W ? ~0u >> (W-j) : 0); + A[i] = (A[i] & mask) | x << j; + if (j+len>W) { + mask = ((~0u) << (len+j-W)); + A[i+1] = (A[i+1] & mask)| x >> (W-j); + } +} + +/** Retrieve a given index from array A where every value uses 4 bits + * @param A Array + * @param index Position to be retrieved + */ +inline uint get_field4(uint *A, uint index) { + unsigned i=index/8, j=(index&0x7)<<2; + return (A[i] << (28-j)) >> (28); +} + +/** Counts the number of 1s in x */ +inline uint popcount(int x){ + return __popcount_tab[(x >> 0) & 0xff] + __popcount_tab[(x >> 8) & 0xff] + + __popcount_tab[(x >> 16) & 0xff] + __popcount_tab[(x >> 24) & 0xff]; +} + +/** Counts the number of 1s in the first 16 bits of x */ +inline uint popcount16(int x){ + return __popcount_tab[x & 0xff] + __popcount_tab[(x >> 8) & 0xff]; +} + +/** Counts the number of 1s in the first 8 bits of x */ +inline uint popcount8(int x){ + return __popcount_tab[x & 0xff]; +} + +#endif /* _BASICS_H */ + diff --git a/libcds/src/coders/huff.cpp b/libcds/src/coders/huff.cpp new file mode 100644 index 0000000..deb0e70 --- /dev/null +++ b/libcds/src/coders/huff.cpp @@ -0,0 +1,239 @@ + +// implements canonical Huffman + +#include + +typedef struct + { uint freq; + uint symb; + union + { int prev; + uint depth; + } h; + int ch1,ch2; + } Ttree; + +static void sort (Ttree *tree, int lo, int up) + + { uint i, j; + Ttree temp; + while (up>lo) + { i = lo; + j = up; + temp = tree[lo]; + while (i temp.freq) j--; + tree[i] = tree[j]; + while (i0) + { tree[j].freq = freq[i]; + tree[j].symb = i; + j++; + } + } + H.lim = lim = j-1; + // now run Huffman algorithm + sort (tree,0,lim); + for (i=0;i<=(int)lim;i++) + { tree[i].h.prev = i+1; + tree[i].ch1 = tree[i].ch2 = -1; + } + tree[lim].h.prev = -1; + // last = next node to process, ptr = search point, fre = next free cell + // leaves are in 0..lim in decreasing freq order + // internal nodes are in lim+1.. 2*lim, created in incr. fre order + last=0; ptr = 0; fre = lim+1; + for (i=0;i<(int)lim;i++) + { tree[fre].ch1 = last; + last = tree[last].h.prev; + tree[fre].ch2 = last; + tree[fre].freq = tree[tree[fre].ch1].freq+tree[tree[fre].ch2].freq; + while ((tree[ptr].h.prev != -1) && + (tree[tree[ptr].h.prev].freq <= tree[fre].freq)) + ptr = tree[ptr].h.prev; + tree[fre].h.prev = tree[ptr].h.prev; + tree[ptr].h.prev = fre; + last = tree[last].h.prev; + fre++; + } + // now assign depths recursively + setdepths (tree,2*lim,0); + H.s.spos = (uint*)malloc ((H.max+1)*sizeof(uint)); + for (i=0;i<=(int)H.max;i++) H.s.spos[i] = ~0; + H.num = (uint*)malloc ((lim+1)*sizeof(uint)); // max possible depth + d=0; + for (i=lim;i>=0;i--) + { H.s.spos[tree[i].symb] = i; + while ((int)tree[i].h.depth > d) + { H.num[d] = i+1; d++; } + } + H.num[d] = 0; + H.depth = d; + for (d=H.depth;d>0;d--) H.num[d] = H.num[d-1] - H.num[d]; + H.num[0] = (lim == 0); + H.num = (uint*)realloc(H.num,(H.depth+1)*sizeof(uint)); + H.total = 0; + for (i=0;i<=(int)lim;i++) + H.total += freq[tree[i].symb] * tree[i].h.depth; + free (tree); + return H; + } + +void bitzero (register uint *e, register uint p, + register uint len) + + { e += p/W; p %= W; + if (p+len >= W) + { *e &= ~((1<= W) + { *e++ = 0; + len -= W; + } + if (len > 0) + *e &= ~(((1<= H.num[d]) + { code = (code + H.num[d]) >> 1; + pos -= H.num[d--]; + } + code += pos; + if (d > W) { bitzero(stream,ptr,d-W); ptr += d-W; d = W; } + while (d--) + { if ((code >> d) & 1) bitset(stream,ptr); + else bitclean(stream,ptr); + ptr++; + } + return ptr; + } + +ulong decodeHuff (THuff H, uint *symb, uint *stream, ulong ptr) + + { uint pos; + uint d; + pos = 0; + d = 0; + while (pos < H.fst[d]) + { pos = (pos << 1) | bitget(stream,ptr); + ptr++; d++; + } + *symb = H.s.symb[H.num[d]+pos-H.fst[d]]; + return ptr; + } + +/* { uint pos; // This "improved" code is actually slower! + int d; + uint wrd,off; + stream += ptr/W; + off = ptr & (W-1); + wrd = *stream >> off; + pos = 0; + d = 0; + while (pos < H.fst[d]) + { pos = (pos << 1) | (wrd & 1); + d++; wrd >>= 1; off++; + if (off == W) { wrd = *++stream; off = 0; } + } + *symb = H.s.symb[H.num[d]+pos-H.fst[d]]; + return ptr+d; + } +*/ +void saveHuff (THuff H, FILE *f) + + { uint *symb = (uint*)malloc((H.lim+1)*sizeof(uint)); + uint i; + for (i=0;i<=H.max;i++) + if (H.s.spos[i] != (uint)~0) symb[H.s.spos[i]] = i; + uint l=fwrite (&H.max,sizeof(uint),1,f); + l += fwrite (&H.lim,sizeof(uint),1,f); + l += fwrite (&H.depth,sizeof(uint),1,f); + l += fwrite (symb,sizeof(uint),H.lim+1,f); + l += fwrite (H.num,sizeof(uint),H.depth+1,f); + free (symb); + } + +uint sizeHuff (THuff H) + + { return (4+(H.lim+1)+(H.depth+1))*sizeof(uint); + } + +void freeHuff (THuff H) + + { free (H.s.spos); free (H.num); + } + +THuff loadHuff (FILE *f, int enc) + + { THuff H; + uint *symb; + //uint *num; + uint i,d,dold,dact; + uint l = fread (&H.max,sizeof(uint),1,f); + l += fread (&H.lim,sizeof(uint),1,f); + l += fread (&H.depth,sizeof(uint),1,f); + symb = (uint*)malloc ((H.lim+1)*sizeof(uint)); + l += fread (symb,sizeof(uint),H.lim+1,f); + if (enc) + { H.s.spos = (uint*)malloc ((H.max+1)*sizeof(uint)); + for (i=0;i<=H.max;i++) H.s.spos[i] = (uint)~0; + for (i=0;i<=H.lim;i++) H.s.spos[symb[i]] = i; + free (symb); + } + else H.s.symb = symb; + H.num = (uint*)malloc ((H.depth+1)*sizeof(uint)); + l += fread (H.num,sizeof(uint),H.depth+1,f); + if (!enc) + { H.fst = (uint*)malloc ((H.depth+1)*sizeof(uint)); + H.fst[H.depth] = 0; dold = 0; + for (d=H.depth-1;d>=0;d--) + { dact = H.num[d+1]; + H.fst[d] = (H.fst[d+1]+dact) >> 1; + H.num[d+1] = dold; + dold += dact; + } + H.num[0] = dold; + } + return H; + } + + diff --git a/libcds/src/coders/huff.h b/libcds/src/coders/huff.h new file mode 100644 index 0000000..2dc96e2 --- /dev/null +++ b/libcds/src/coders/huff.h @@ -0,0 +1,53 @@ + +// implements canonical Huffman + +#ifndef HUFFINCLUDED +#define HUFFINCLUDED + +#include + +typedef struct + { uint max,lim; // maximum symbol (0..max), same excluding zero freqs + uint depth; // max symbol length + union + { uint *spos; // symbol positions after sorting by decr freq (enc) + uint *symb; // symbols sorted by freq (dec) + } s; + uint *num; // first pos of each length (dec), number of each length (enc) + uint *fst; // first code (numeric) of each length (dec) + ulong total; // total length to achieve, in bits + } THuff; + + // Creates Huffman encoder given symbols 0..lim with frequencies + // freq[i], ready for compression + +THuff createHuff (uint *freq, uint lim); + + // Encodes symb using H, over stream[ptr...lim] (ptr and lim are + // bit positions of stream). Returns the new ptr. + +ulong encodeHuff (THuff H, uint symb, uint *stream, ulong ptr); + + // Decodes *symb using H, over stream[ptr...lim] (ptr and lim are + // bit positions of stream). Returns the new ptr. + +ulong decodeHuff (THuff H, uint *symb, uint *stream, ulong ptr); + + // Writes H in file f + +void saveHuff (THuff H, FILE *f); + + // Size of H written on file + +uint sizeHuff (THuff H); + + // Frees H + +void freeHuff (THuff H); + + // Loads H from file f, prepared for encoding or decoding depending + // on enc + +THuff loadHuff (FILE *f, int enc); + +#endif diff --git a/libcds/src/coders/huffman_codes.cpp b/libcds/src/coders/huffman_codes.cpp new file mode 100644 index 0000000..cb64fd1 --- /dev/null +++ b/libcds/src/coders/huffman_codes.cpp @@ -0,0 +1,51 @@ + +#include + +huffman_codes::huffman_codes(uint * symb, uint n) { + uint max_v = 0; + for(uint i=0;ihuff_table = loadHuff(fp,1); + return ret; +} + + diff --git a/libcds/src/coders/huffman_codes.h b/libcds/src/coders/huffman_codes.h new file mode 100644 index 0000000..d421899 --- /dev/null +++ b/libcds/src/coders/huffman_codes.h @@ -0,0 +1,26 @@ + +#ifndef HUFFMAN_CODES_H +#define HUFFMAN_CODES_H + +#include +#include + +class huffman_codes { + + public: + huffman_codes(uint * seq, uint n); + ~huffman_codes(); + + ulong encode(uint symb, uint * stream, ulong pos); + ulong decode(uint * symb, uint * stream, ulong pos); + uint max_length(); + uint size(); + uint save(FILE *fp); + static huffman_codes * load(FILE *fp); + + protected: + huffman_codes(); + THuff huff_table; +}; + +#endif diff --git a/libcds/src/static_bitsequence/static_bitsequence.cpp b/libcds/src/static_bitsequence/static_bitsequence.cpp new file mode 100644 index 0000000..65de9c4 --- /dev/null +++ b/libcds/src/static_bitsequence/static_bitsequence.cpp @@ -0,0 +1,104 @@ +/* static_bitsequence.cpp + * Copyright (C) 2008, Francisco Claude, all rights reserved. + * + * static_bitsequence definition + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "static_bitsequence.h" + +uint static_bitsequence::rank0(uint i) { + return i+1-rank1(i); +} + +uint static_bitsequence::rank1(uint i) { + if(i>=len) return ones; + if(ones==0) return -1; + uint ini = 1; + uint fin = ones; + while(inii) return ini-1; + return ini; +} + +uint static_bitsequence::select0(uint i) { + if(i>len-ones) return len; + if(i==0) return -1; + uint ini = 0; + uint fin = len-1; + while(iniones) return len; + if(i==0) return 0; + uint ini = 0; + uint fin = len-1; + while(ini0; +} + +uint static_bitsequence::length() { + return len; +} + +uint static_bitsequence::count_one() { + return ones; +} + +uint static_bitsequence::count_zero() { + return len-ones; +} + +static_bitsequence * static_bitsequence::load(FILE * fp) { + uint r; + if(fread(&r,sizeof(uint),1,fp)!=1) return NULL; + fseek(fp,-1*sizeof(uint),SEEK_CUR); + switch(r) { + case RRR02_HDR: return static_bitsequence_rrr02::load(fp); + case BRW32_HDR: return static_bitsequence_brw32::load(fp); + } + return NULL; +} + diff --git a/libcds/src/static_bitsequence/static_bitsequence.h b/libcds/src/static_bitsequence/static_bitsequence.h new file mode 100644 index 0000000..4a3f915 --- /dev/null +++ b/libcds/src/static_bitsequence/static_bitsequence.h @@ -0,0 +1,93 @@ +/* static_bitsequence.h + * Copyright (C) 2008, Francisco Claude, all rights reserved. + * + * static_bitsequence definition + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef _STATIC_BITSEQUENCE_H +#define _STATIC_BITSEQUENCE_H + +#define RRR02_HDR 2 +#define BRW32_HDR 3 + +#include +#include + + +using namespace std; + +/** Base class for static bitsequences, contains many abstract functions, so this can't + * be instantiated. It includes base implementations for rank0, select0 and select1 based + * on rank0. + * + * @author Francisco Claude + */ +class static_bitsequence { + +public: + virtual ~static_bitsequence() {}; + + /** Returns the number of zeros until position i */ + virtual uint rank0(uint i); + + /** Returns the position of the i-th zero + * @return (uint)-1 if i=0, len if i>num_zeros or the position */ + virtual uint select0(uint i); + + /** Returns the number of ones until position i */ + virtual uint rank1(uint i); + + /** Returns the position of the i-th one + * @return (uint)-1 if i=0, len if i>num_ones or the position */ + virtual uint select1(uint i); + + /** Returns the i-th bit */ + virtual bool access(uint i); + + /** Returns the length in bits of the bitmap */ + virtual uint length(); + + /** Returns how many ones are in the bitstring */ + virtual uint count_one(); + + /** Returns how many zeros are in the bitstring */ + virtual uint count_zero(); + + /** Returns the size of the structure in bytes */ + virtual uint size()=0; + + /** Stores the bitmap given a file pointer, return 0 in case of success */ + virtual int save(FILE * fp)=0; + + /** Reads a bitmap determining the type */ + static static_bitsequence * load(FILE * fp); + +protected: + /** Length of the bitstring */ + uint len; + /** Number of ones in the bitstring */ + uint ones; + +}; + +#include "static_bitsequence_rrr02.h" +#include "static_bitsequence_naive.h" +#include "static_bitsequence_brw32.h" + +#endif /* _STATIC_BITSEQUENCE_H */ + diff --git a/libcds/src/static_bitsequence/static_bitsequence_brw32.cpp b/libcds/src/static_bitsequence/static_bitsequence_brw32.cpp new file mode 100644 index 0000000..9bc1313 --- /dev/null +++ b/libcds/src/static_bitsequence/static_bitsequence_brw32.cpp @@ -0,0 +1,295 @@ +/* static_bitsequence_brw32.cpp + Copyright (C) 2005, Rodrigo Gonzalez, all rights reserved. + + New RANK, SELECT, SELECT-NEXT and SPARSE RANK implementations. + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*/ + +#include "static_bitsequence_brw32.h" +#include +#include +// #include + + +///////////// +//Rank(B,i)// +///////////// +//_factor = 0 => s=W*lgn +//_factor = P => s=W*P +//Is interesting to notice +//factor=2 => overhead 50% +//factor=3 => overhead 33% +//factor=4 => overhead 25% +//factor=20=> overhead 5% + +static_bitsequence_brw32::static_bitsequence_brw32(){ + data=NULL; + this->owner = true; + this->n=0; + this->factor=0; +} + +static_bitsequence_brw32::static_bitsequence_brw32( uint *bitarray, uint _n, uint _factor){ + /*cout << "*****" << endl; + cout << bitarray << endl; + cout << _n << endl; + cout << _factor << endl; */ + if(_factor==0) exit(-1); + data=new uint[_n/W+1]; + for(uint i=0;iowner = true; + this->n=_n; + uint lgn=bits(n-1); + this->factor=_factor; + if (_factor==0) this->factor=lgn; + else this->factor=_factor; + b=32; + s=b*this->factor; + integers = n/W+1; + BuildRank(); + this->len = n; + this->ones = rank1(n-1); +} + +static_bitsequence_brw32::~static_bitsequence_brw32() { + delete [] Rs; + if (owner) delete [] data; +} + +//Metodo que realiza la busqueda d +void static_bitsequence_brw32::BuildRank(){ + uint num_sblock = n/s; + Rs = new uint[num_sblock+5];// +1 pues sumo la pos cero + for(uint i=0;in,sizeof(uint),1,f) != 1) return NULL; + ret->b=32; // b is a word + if (fread (&ret->factor,sizeof(uint),1,f) != 1) return NULL; + ret->s=ret->b*ret->factor; + uint aux=(ret->n+1)%W; + if (aux != 0) + ret->integers = (ret->n+1)/W+1; + else + ret->integers = (ret->n+1)/W; + ret->data = new uint[ret->n/W+1]; + if (!ret->data) return NULL; + if (fread (ret->data,sizeof(uint),ret->n/W+1,f) != ret->n/W+1) return NULL; + ret->owner = true; + ret->Rs= new uint[ret->n/ret->s+1]; + if (!ret->Rs) return NULL; + if (fread (ret->Rs,sizeof(uint),ret->n/ret->s+1,f) != ret->n/ret->s+1) return NULL; + ret->len = ret->n; + ret->ones = ret->rank1(ret->n-1); + return ret; +} + +uint static_bitsequence_brw32::SpaceRequirementInBits() { + return (owner?n:0)+(n/s)*sizeof(uint)*8 +sizeof(static_bitsequence_brw32)*8; +} + +uint static_bitsequence_brw32::size() { + return SpaceRequirementInBits()/8; +} + +uint static_bitsequence_brw32::SpaceRequirement() { + return (owner?n:0)/8+(n/s)*sizeof(uint)+sizeof(static_bitsequence_brw32); +} + +uint static_bitsequence_brw32::prev2(uint start) { + // returns the position of the previous 1 bit before and including start. + // tuned to 32 bit machine + + uint i = start >> 5; + int offset = (start % W); + uint answer = start; + uint val = data[i] << (Wminusone-offset); + + if (!val) { val = data[--i]; answer -= 1+offset; } + + while (!val) { val = data[--i]; answer -= W; } + + if (!(val & 0xFFFF0000)) { val <<= 16; answer -= 16; } + if (!(val & 0xFF000000)) { val <<= 8; answer -= 8; } + + while (!(val & 0x80000000)) { val <<= 1; answer--; } + return answer; +} + +uint static_bitsequence_brw32::prev(uint start) { + // returns the position of the previous 1 bit before and including start. + // tuned to 32 bit machine + + uint i = start >> 5; + int offset = (start % W); + uint aux2 = data[i] & (-1u >> (31-offset)); + + if (aux2 > 0) { + if ((aux2&0xFF000000) > 0) return i*W+23+prev_tab[(aux2>>24)&0xFF]; + else if ((aux2&0xFF0000) > 0) return i*W+15+prev_tab[(aux2>>16)&0xFF]; + else if ((aux2&0xFF00) > 0) return i*W+7+prev_tab[(aux2>>8)&0xFF]; + else return i*W+prev_tab[aux2&0xFF]-1; + } + for (uint k=i-1;;k--) { + aux2=data[k]; + if (aux2 > 0) { + if ((aux2&0xFF000000) > 0) return k*W+23+prev_tab[(aux2>>24)&0xFF]; + else if ((aux2&0xFF0000) > 0) return k*W+15+prev_tab[(aux2>>16)&0xFF]; + else if ((aux2&0xFF00) > 0) return k*W+7+prev_tab[(aux2>>8)&0xFF]; + else return k*W+prev_tab[aux2&0xFF]-1; + } + } + return 0; +} + +uint static_bitsequence_brw32::next(uint k) { + uint count = k; + uint des,aux2; + des=count%W; + aux2= data[count/W] >> des; + if (aux2 > 0) { + if ((aux2&0xff) > 0) return count+select_tab[aux2&0xff]-1; + else if ((aux2&0xff00) > 0) return count+8+select_tab[(aux2>>8)&0xff]-1; + else if ((aux2&0xff0000) > 0) return count+16+select_tab[(aux2>>16)&0xff]-1; + else {return count+24+select_tab[(aux2>>24)&0xff]-1;} + } + + for (uint i=count/W+1;i 0) { + if ((aux2&0xff) > 0) return i*W+select_tab[aux2&0xff]-1; + else if ((aux2&0xff00) > 0) return i*W+8+select_tab[(aux2>>8)&0xff]-1; + else if ((aux2&0xff0000) > 0) return i*W+16+select_tab[(aux2>>16)&0xff]-1; + else {return i*W+24+select_tab[(aux2>>24)&0xff]-1;} + } + } + return n; +} + +uint static_bitsequence_brw32::select1(uint x) { + // returns i such that x=rank(i) && rank(i-1) integers) return n; + j = data[left]; + ones = popcount(j); + } + //sequential search using popcount over a char + left=left*b; + rankmid = popcount8(j); + if (rankmid < x) { + j=j>>8; + x-=rankmid; + left+=8; + rankmid = popcount8(j); + if (rankmid < x) { + j=j>>8; + x-=rankmid; + left+=8; + rankmid = popcount8(j); + if (rankmid < x) { + j=j>>8; + x-=rankmid; + left+=8; + } + } + } + + // then sequential search bit a bit + while (x>0) { + if (j&1) x--; + j=j>>1; + left++; + } + return left-1; +} diff --git a/libcds/src/static_bitsequence/static_bitsequence_brw32.h b/libcds/src/static_bitsequence/static_bitsequence_brw32.h new file mode 100644 index 0000000..50ea7b4 --- /dev/null +++ b/libcds/src/static_bitsequence/static_bitsequence_brw32.h @@ -0,0 +1,78 @@ +/* static_bitsequence_brw32.h + Copyright (C) 2005, Rodrigo Gonzalez, all rights reserved. + + New RANK, SELECT, SELECT-NEXT and SPARSE RANK implementations. + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*/ + +#ifndef _STATIC_BITSEQUENCE_BRW32_H +#define _STATIC_BITSEQUENCE_BRW32_H + +#include +#include +///////////// +//Rank(B,i)// +///////////// +//_factor = 0 => s=W*lgn +//_factor = P => s=W*P +//Is interesting to notice +//factor=2 => overhead 50% +//factor=3 => overhead 33% +//factor=4 => overhead 25% +//factor=20=> overhead 5% + +/** Implementation of Rodrigo Gonzalez et al. practical rank/select solution [1]. + * The interface was adapted. + * + * [1] Rodrigo Gonzalez, Szymon Grabowski, Veli Makinen, and Gonzalo Navarro. + * Practical Implementation of Rank and Select Queries. WEA05. + * + * @author Rodrigo Gonzalez + */ +class static_bitsequence_brw32 : public static_bitsequence { +private: + uint *data; + bool owner; + uint n,integers; + uint factor,b,s; + uint *Rs; //superblock array + + uint BuildRankSub(uint ini,uint fin); //uso interno para contruir el indice rank + void BuildRank(); //crea indice para rank + static_bitsequence_brw32(); + +public: + static_bitsequence_brw32(uint *bitarray, uint n, uint factor); + ~static_bitsequence_brw32(); //destructor + virtual bool access(uint i); + virtual uint rank1(uint i); //Nivel 1 bin, nivel 2 sec-pop y nivel 3 sec-bit + + uint prev(uint start); // gives the largest index i<=start such that IsBitSet(i)=true + uint prev2(uint start); // gives the largest index i<=start such that IsBitSet(i)=true + uint next(uint start); // gives the smallest index i>=start such that IsBitSet(i)=true + virtual uint select1(uint x); // gives the position of the x:th 1. + uint SpaceRequirementInBits(); + uint SpaceRequirement(); + virtual uint size(); + + /*load-save functions*/ + virtual int save(FILE *f); + static static_bitsequence_brw32 * load(FILE * fp); +}; + +#endif + diff --git a/libcds/src/static_bitsequence/static_bitsequence_builder.h b/libcds/src/static_bitsequence/static_bitsequence_builder.h new file mode 100644 index 0000000..fb471bb --- /dev/null +++ b/libcds/src/static_bitsequence/static_bitsequence_builder.h @@ -0,0 +1,34 @@ +/* static_bitsequence_builder.h + * Copyright (C) 2008, Francisco Claude, all rights reserved. + * + * static_bitsequence_builder definition + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef _STATIC_BITSEQUENCE_BUILDER_H +#define _STATIC_BITSEQUENCE_BUILDER_H + +class static_bitsequence_builder { + public: + virtual ~static_bitsequence_builder() {} + virtual static_bitsequence * build(uint * bitsequence, uint len)=0; +}; + +#include +#include + +#endif /* _STATIC_BITSEQUENCE_BUILDER_H */ diff --git a/libcds/src/static_bitsequence/static_bitsequence_builder_brw32.cpp b/libcds/src/static_bitsequence/static_bitsequence_builder_brw32.cpp new file mode 100644 index 0000000..f982cf3 --- /dev/null +++ b/libcds/src/static_bitsequence/static_bitsequence_builder_brw32.cpp @@ -0,0 +1,10 @@ + +#include + +static_bitsequence_builder_brw32::static_bitsequence_builder_brw32(uint sampling) { + this->sample_rate=sampling; +} + +static_bitsequence * static_bitsequence_builder_brw32::build(uint * bitsequence, uint len) { + return new static_bitsequence_brw32(bitsequence,len,this->sample_rate); +} diff --git a/libcds/src/static_bitsequence/static_bitsequence_builder_brw32.h b/libcds/src/static_bitsequence/static_bitsequence_builder_brw32.h new file mode 100644 index 0000000..a5116e3 --- /dev/null +++ b/libcds/src/static_bitsequence/static_bitsequence_builder_brw32.h @@ -0,0 +1,39 @@ +/* static_bitsequence_builder_brw32.h + * Copyright (C) 2008, Francisco Claude, all rights reserved. + * + * static_bitsequence_builder definition + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef _STATIC_BITSEQUENCE_BUILDER_BRW32_H +#define _STATIC_BITSEQUENCE_BUILDER_BRW32_H + +#include +#include +#include + +class static_bitsequence_builder_brw32 : public static_bitsequence_builder { + public: + static_bitsequence_builder_brw32(uint sampling); + virtual ~static_bitsequence_builder_brw32() {} + virtual static_bitsequence * build(uint * bitsequence, uint len); + + protected: + uint sample_rate; +}; + +#endif /* _STATIC_BITSEQUENCE_BUILDER_BRW32_H */ diff --git a/libcds/src/static_bitsequence/static_bitsequence_builder_rrr02.cpp b/libcds/src/static_bitsequence/static_bitsequence_builder_rrr02.cpp new file mode 100644 index 0000000..32e8ad6 --- /dev/null +++ b/libcds/src/static_bitsequence/static_bitsequence_builder_rrr02.cpp @@ -0,0 +1,10 @@ + +#include + +static_bitsequence_builder_rrr02::static_bitsequence_builder_rrr02(uint sampling) { + sample_rate=sampling; +} + +static_bitsequence * static_bitsequence_builder_rrr02::build(uint * bitsequence, uint len) { + return new static_bitsequence_rrr02(bitsequence,len,sample_rate); +} diff --git a/libcds/src/static_bitsequence/static_bitsequence_builder_rrr02.h b/libcds/src/static_bitsequence/static_bitsequence_builder_rrr02.h new file mode 100644 index 0000000..e858221 --- /dev/null +++ b/libcds/src/static_bitsequence/static_bitsequence_builder_rrr02.h @@ -0,0 +1,39 @@ +/* static_bitsequence_builder_rrr02.h + * Copyright (C) 2008, Francisco Claude, all rights reserved. + * + * static_bitsequence_builder definition + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef _STATIC_BITSEQUENCE_BUILDER_RRR02_H +#define _STATIC_BITSEQUENCE_BUILDER_RRR02_H + +#include +#include +#include + +class static_bitsequence_builder_rrr02 : public static_bitsequence_builder { + public: + static_bitsequence_builder_rrr02(uint sampling); + virtual ~static_bitsequence_builder_rrr02() {} + virtual static_bitsequence * build(uint * bitsequence, uint len); + + protected: + uint sample_rate; +}; + +#endif /* _STATIC_BITSEQUENCE_BUILDER_RRR02_H */ diff --git a/libcds/src/static_bitsequence/static_bitsequence_naive.cpp b/libcds/src/static_bitsequence/static_bitsequence_naive.cpp new file mode 100644 index 0000000..ba3c150 --- /dev/null +++ b/libcds/src/static_bitsequence/static_bitsequence_naive.cpp @@ -0,0 +1,71 @@ +/* static_bitsequence_naive.cpp + * Copyright (C) 2008, Francisco Claude, all rights reserved. + * + * Naive Bitsequence - don't use, only for testing + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "static_bitsequence_naive.h" + +static_bitsequence_naive::static_bitsequence_naive(uint * bitseq, uint len) { + this->len = len; + this->bitseq = new uint[len/W+(len%W>0)]; + for(uint i=0;i0);i++) + this->bitseq[i] = bitseq[i]; + uint ret = 0; + for(uint k=0;kones = ret; +} + +static_bitsequence_naive::~static_bitsequence_naive() { + delete [] bitseq; +} + +uint static_bitsequence_naive::rank1(uint i) { + if(i>=len) return ones; + uint ret = 0; + for(uint k=0;k<=i;k++) + if(bitget(bitseq,k)) + ret++; + return ret; +} + +uint static_bitsequence_naive::select1(uint i) { + if(i==0) return (uint)-1; + if(i>ones) return len; + uint cnt = 0; + for(uint k=0;k + +/** Class used for testing, should not be used with long bitmaps + * @author Francisco Claude + */ +class static_bitsequence_naive: public static_bitsequence { +public: + /** Builds a naive bitsequence, receives the bitmap and the length + * in bits + */ + static_bitsequence_naive(uint * bitseq, uint len); + + virtual ~static_bitsequence_naive(); + + /** Returns the number of ones until position i */ + virtual uint rank1(uint i); + + /** Returns the position of the i-th one + * @return (uint)-1 if i=0, len if i>num_ones or the position */ + virtual uint select1(uint i); + + /** Returns the i-th bit */ + virtual bool access(uint i); + + /** Returns the size of the structure in bytes */ + virtual uint size(); + + /** - Not implemented - */ + virtual int save(FILE * fp); + + +protected: + uint * bitseq; +}; + +#endif /* _STATIC_BITSEQUENCE_NAIVE_H */ + diff --git a/libcds/src/static_bitsequence/static_bitsequence_rrr02.cpp b/libcds/src/static_bitsequence/static_bitsequence_rrr02.cpp new file mode 100644 index 0000000..404d905 --- /dev/null +++ b/libcds/src/static_bitsequence/static_bitsequence_rrr02.cpp @@ -0,0 +1,349 @@ +/* static_bitsequence_rrr02.cpp + * Copyright (C) 2008, Francisco Claude, all rights reserved. + * + * static_bitsequence_rrr02 definition + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "static_bitsequence_rrr02.h" + +table_offset * static_bitsequence_rrr02::E = NULL; + +static_bitsequence_rrr02::static_bitsequence_rrr02() { + ones=0; + len=0; + if(E==NULL) E = new table_offset(BLOCK_SIZE); + E->use(); + C = NULL; + O = NULL; + C_sampling = NULL; + O_pos = NULL; + sample_rate = DEFAULT_SAMPLING; + C_len = O_len = C_sampling_len = O_pos_len = 0; + O_bits_len = C_sampling_field_bits = O_pos_field_bits = 0; +} + +static_bitsequence_rrr02::static_bitsequence_rrr02(uint * bitseq, uint len, uint sample_rate) { + ones = 0; + this->len = len; + if(E==NULL) E = new table_offset(BLOCK_SIZE); + E->use(); + // Table C + C_len = len/BLOCK_SIZE + (len%BLOCK_SIZE!=0); + C_field_bits = bits(BLOCK_SIZE); + C = new uint[uint_len(C_len,C_field_bits)]; + for(uint i=0;iget_log2binomial(BLOCK_SIZE,value); + } + // Table O + O_len = uint_len(1,O_bits_len); + O = new uint[O_len]; + for(uint i=0;iget_log2binomial(BLOCK_SIZE,popcount(value))-1,E->compute_offset((ushort)value)); + O_pos += E->get_log2binomial(BLOCK_SIZE,popcount(value)); + } + C_sampling = NULL; + this->O_pos = NULL; + + create_sampling(sample_rate); +} + +void static_bitsequence_rrr02::create_sampling(uint sample_rate) { + this->sample_rate = sample_rate; +/* for(uint i=0;iget_log2binomial(BLOCK_SIZE,get_field(C,C_field_bits,i)); + }*/ + // Sampling for C + C_sampling_len = C_len/sample_rate+2; + C_sampling_field_bits = bits(ones); + if(C_sampling!=NULL) delete [] C_sampling; + C_sampling = new uint[max((uint)1,uint_len(C_sampling_len,C_sampling_field_bits))]; + for(uint i=0;iget_log2binomial(BLOCK_SIZE,get_field(C,C_field_bits,i)); + } +} + +bool static_bitsequence_rrr02::access(uint i) { + uint nearest_sampled_value = i/BLOCK_SIZE/sample_rate; + uint pos_O = get_field(O_pos,O_pos_field_bits,nearest_sampled_value); + uint pos = i/BLOCK_SIZE; + assert(pos<=C_len); + for(uint k=nearest_sampled_value*sample_rate;kget_log2binomial(BLOCK_SIZE,aux); + } + uint c = get_field(C,C_field_bits,pos); + return ((1<<(i%BLOCK_SIZE))&E->short_bitmap(c,get_var_field(O,pos_O,pos_O+E->get_log2binomial(BLOCK_SIZE,c)-1)))!=0; +} + +uint static_bitsequence_rrr02::rank0(uint i) { + if(i+1==0) return 0; + return 1+i-rank1(i); +} + +uint static_bitsequence_rrr02::rank1(uint i) { + if(i+1==0) return 0; + uint nearest_sampled_value = i/BLOCK_SIZE/sample_rate; + uint sum = get_field(C_sampling,C_sampling_field_bits,nearest_sampled_value); + uint pos_O = get_field(O_pos,O_pos_field_bits,nearest_sampled_value); + uint pos = i/BLOCK_SIZE; + uint k=nearest_sampled_value*sample_rate; + if(k%2==1 && kget_log2binomial(BLOCK_SIZE,aux); + k++; + } + uchar * a = (uchar *)C; + uint mask = 0x0F; + a += k/2; + while(k<(uint)max(0,(int)pos-1)) { + assert(((*a)&mask)==get_field(C,C_field_bits,k)); + assert((*a)/16==get_field(C,C_field_bits,k+1)); + sum += ((*a)&mask)+(*a)/16; + pos_O += E->get_log2binomial(BLOCK_SIZE,((*a)&mask))+E->get_log2binomial(BLOCK_SIZE,((*a)/16)); + a++; + k+=2; + } + if(kget_log2binomial(BLOCK_SIZE,aux); + k++; + } + uint c = get_field(C,C_field_bits,pos); + sum += popcount(((2<<(i%BLOCK_SIZE))-1) & E->short_bitmap(c,get_var_field(O,pos_O,pos_O+E->get_log2binomial(BLOCK_SIZE,c)-1))); + return sum; +} + +uint static_bitsequence_rrr02::select0(uint i) { + if(i==0) return -1; + if(i>len-ones) return len; + // Search over partial sums + uint start=0; + uint end=C_sampling_len-1; + uint med, acc=0, pos; + while(start=i) break; + pos_O += E->get_log2binomial(BLOCK_SIZE,s); + acc += BLOCK_SIZE-s; + } + pos = (pos)*BLOCK_SIZE; + // Search inside the block + + while(accget_log2binomial(BLOCK_SIZE,s); + uint block = E->short_bitmap(s,get_var_field(O,pos_O,new_posO-1)); + pos_O = new_posO; + new_posO = 0; + while(accones) return len; + // Search over partial sums + uint start=0; + uint end=C_sampling_len-1; + uint med, acc=0, pos; + while(start=i) break; + pos_O += E->get_log2binomial(BLOCK_SIZE,s); + acc += s; + } + pos = (pos)*BLOCK_SIZE; + //cout << "pos=" << pos << endl; + // Search inside the block + while(accget_log2binomial(BLOCK_SIZE,s); + uint block = E->short_bitmap(s,get_var_field(O,pos_O,new_posO-1)); + pos_O = new_posO; + new_posO = 0; + while(accsize() << endl;*/ + uint sum = sizeof(static_bitsequence_rrr02); + sum += uint_len(C_len,C_field_bits)*sizeof(uint); + sum += O_len*sizeof(uint); + sum += uint_len(C_sampling_len,C_sampling_field_bits)*sizeof(uint); + sum += uint_len(O_pos_len,O_pos_field_bits)*sizeof(uint); + //sum += E->size(); + return sum; +} + +static_bitsequence_rrr02::~static_bitsequence_rrr02() { + if(C!=NULL) delete [] C; + if(O!=NULL) delete [] O; + if(C_sampling!=NULL) delete [] C_sampling; + if(O_pos!=NULL) delete [] O_pos; + E = E->unuse(); +} + +int static_bitsequence_rrr02::save(FILE * fp) { + uint wr = RRR02_HDR; + wr = fwrite(&wr,sizeof(uint),1,fp); + wr += fwrite(&len,sizeof(uint),1,fp); + wr += fwrite(&ones,sizeof(uint),1,fp); + wr += fwrite(&C_len,sizeof(uint),1,fp); + wr += fwrite(&C_field_bits,sizeof(uint),1,fp); + wr += fwrite(&O_len,sizeof(uint),1,fp); + wr += fwrite(&O_bits_len,sizeof(uint),1,fp); + wr += fwrite(&sample_rate,sizeof(uint),1,fp); + if(wr!=8) return -1; + wr = fwrite(C,sizeof(uint),uint_len(C_len,C_field_bits),fp); + if(wr!=uint_len(C_len,C_field_bits)) return -1; + wr = fwrite(O,sizeof(uint),O_len,fp); + if(wr!=O_len) return -1; + return 0; +} + +static_bitsequence_rrr02 * static_bitsequence_rrr02::load(FILE * fp) { + static_bitsequence_rrr02 * ret = new static_bitsequence_rrr02(); + uint rd = 0, type; + rd += fread(&type,sizeof(uint),1,fp); + rd += fread(&ret->len,sizeof(uint),1,fp); + rd += fread(&ret->ones,sizeof(uint),1,fp); + rd += fread(&ret->C_len,sizeof(uint),1,fp); + rd += fread(&ret->C_field_bits,sizeof(uint),1,fp); + rd += fread(&ret->O_len,sizeof(uint),1,fp); + rd += fread(&ret->O_bits_len,sizeof(uint),1,fp); + rd += fread(&ret->sample_rate,sizeof(uint),1,fp); + if(rd!=8 || type!=RRR02_HDR) { + delete ret; + return NULL; + } + ret->C = new uint[uint_len(ret->C_len,ret->C_field_bits)]; + rd = fread(ret->C,sizeof(uint),uint_len(ret->C_len,ret->C_field_bits),fp); + if(rd!=uint_len(ret->C_len,ret->C_field_bits)) { + ret->C=NULL; + delete ret; + return NULL; + } + ret->O = new uint[ret->O_len]; + rd = fread(ret->O,sizeof(uint),ret->O_len,fp); + if(rd!=ret->O_len) { + ret->O=NULL; + delete ret; + return NULL; + } + ret->create_sampling(ret->sample_rate); + return ret; +} + diff --git a/libcds/src/static_bitsequence/static_bitsequence_rrr02.h b/libcds/src/static_bitsequence/static_bitsequence_rrr02.h new file mode 100644 index 0000000..1d343f5 --- /dev/null +++ b/libcds/src/static_bitsequence/static_bitsequence_rrr02.h @@ -0,0 +1,110 @@ +/* static_bitsequence_rrr02.h + * Copyright (C) 2008, Francisco Claude, all rights reserved. + * + * RRR02 Bitsequence - + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + + +#ifndef _STATIC_BITSEQUENCE_RRR02_H +#define _STATIC_BITSEQUENCE_RRR02_H + +#define BLOCK_SIZE 15 +#define DEFAULT_SAMPLING 32 + +#include +#include +#include +#include + +using namespace std; + +/** Implementation of Raman, Raman and Rao's [1] proposal for rank/select capable + * data structures, it achieves space nH_0, O(sample_rate) time for rank and O(log len) + * for select. The practial implementation is based on [2] + * + * [1] R. Raman, V. Raman and S. Rao. Succinct indexable dictionaries with applications + * to encoding $k$-ary trees and multisets. SODA02. + * [2] F. Claude and G. Navarro. Practical Rank/Select over Arbitrary Sequences. SPIRE08. + * + * @author Francisco Claude + */ +class static_bitsequence_rrr02: public static_bitsequence { +public: + static_bitsequence_rrr02(uint * bitseq, uint len, uint sample_rate=DEFAULT_SAMPLING); + virtual ~static_bitsequence_rrr02(); + + /** Returns the number of zeros until position i */ + virtual uint rank0(uint i); + + /** Returns the number of ones until position i */ + virtual uint rank1(uint i); + + /** Returns the position of the i-th zero + * @return (uint)-1 if i=0, len if i>num_zeros or the position */ + virtual uint select0(uint i); + + /** Returns the position of the i-th one + * @return (uint)-1 if i=0, len if i>num_ones or the position */ + virtual uint select1(uint i); + + /** Returns the i-th bit */ + virtual bool access(uint i); + + /** Returns the size of the structure in bytes */ + virtual uint size(); + + /** Stores the bitmap given a file pointer, return 0 in case of success */ + virtual int save(FILE * fp); + + /** Reads the bitmap from a file pointer, returns NULL in case of error */ + static static_bitsequence_rrr02 * load(FILE * fp); + + /** Creates a new sampling for the queries */ + void create_sampling(uint sampling_rate); + + /** Frees the space required by the table E, which is static and global + * to all instances. + */ + static void delete_E() { + delete E; + } + + +protected: + static_bitsequence_rrr02(); + /** Classes and offsets */ + uint *C, *O; + /** Length of C and O (in uints) */ + uint C_len, O_len; + /** Bits required per field for C and in total for O */ + uint C_field_bits, O_bits_len; + /** C and O samplings */ + uint *C_sampling, *O_pos; + /** Length of the samplings */ + uint C_sampling_len,O_pos_len; + /** Lenght in bits per field */ + uint C_sampling_field_bits,O_pos_field_bits; + /** Sample rate */ + uint sample_rate; + + static table_offset * E; +}; + +#endif /* _STATIC_BITSEQUENCE_RRR02_H */ + + diff --git a/libcds/src/static_bitsequence/table_offset.cpp b/libcds/src/static_bitsequence/table_offset.cpp new file mode 100644 index 0000000..434009a --- /dev/null +++ b/libcds/src/static_bitsequence/table_offset.cpp @@ -0,0 +1,125 @@ +/* table_offset.cpp + * Copyright (C) 2008, Francisco Claude, all rights reserved. + * + * Table for offsets. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + + +#include "table_offset.h" + + +// Interface for old implementation +void genera(ushort * bch, uint u, ushort * F, uint lF); +uint generaClase(ushort * bch, uint u, uint clase, uint puestos, uint pos_ini, uint generado); +uint offset_func(uint u, uint busca); +uint offsetRecursivo(uint u, uint busca, uint clase, uint puestos, uint pos_ini, uint generado); +uint __indiceFunc; +uint __indAcumulado; +ushort * __Lis; +// End interface old implementation + + +table_offset::table_offset(uint u) { + this->u = u; + users_count = 0; + short_bitmaps = new ushort[((1< +#include + +using namespace std; + +/** Universal table required for static_bitsequence_rrr02, Raman, Raman and Rao's [1] + * proposal for rank/select capable data structures, it achieves space nH_0, + * O(sample_rate) time for rank and O(log len) for select. The practial implementation + * is based on [2] + * + * [1] R. Raman, V. Raman and S. Rao. Succinct indexable dictionaries with applications + * to encoding $k$-ary trees and multisets. SODA02. + * [2] F. Claude and G. Navarro. Practical Rank/Select over Arbitrary Sequences. SPIRE08. + * + * @author Francisco Claude + */ +class table_offset { + +public: + table_offset(uint u); + ~table_offset(); + + /** Increments the counter of users for the table */ + inline void use() { + users_count++; + } + + /** Tells the object that the user is not going to need the table anymore. */ + inline table_offset * unuse() { + users_count--; + if(!users_count) { + delete this; + return NULL; + } + return this; + } + + /** Computes binomial(n,k) for n,k<=u */ + inline uint get_binomial(uint n, uint k) { + return binomial[n][k]; + } + + /** Computes ceil(log2(binomial(n,k))) for n,k<=u */ + inline ushort get_log2binomial(uint n, uint k) { + return log2binomial[n][k]; + } + + /** Returns the bitmap represented by the given class and inclass offsets */ + inline ushort short_bitmap(uint class_offset, uint inclass_offset) { + if(class_offset==0) return 0; + if(class_offset==u) return (ushort)(((uint)1< + +static_sequence::static_sequence() {} +static_sequence::~static_sequence() {} +uint static_sequence::length() { return len; } + +static_sequence * static_sequence::load(FILE * fp) { + uint rd; + if(fread(&rd,sizeof(uint),1,fp)!=1) return NULL; + fseek(fp,-sizeof(uint),SEEK_CUR); + switch(rd) { + case WVTREE_HDR: return static_sequence_wvtree::load(fp); + } + return NULL; +} diff --git a/libcds/src/static_sequence/static_sequence.h b/libcds/src/static_sequence/static_sequence.h new file mode 100644 index 0000000..0baf721 --- /dev/null +++ b/libcds/src/static_sequence/static_sequence.h @@ -0,0 +1,77 @@ +/* static_sequence.h + * Copyright (C) 2008, Francisco Claude, all rights reserved. + * + * static_sequence definition + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef _STATIC_SEQUENCE_H +#define _STATIC_SEQUENCE_H + + +#include +#include + +#define WVTREE_HDR 2 + +using namespace std; + +/** Base class for static sequences, contains many abstract functions, so this can't + * be instantiated. + * + * @author Francisco Claude + */ +class static_sequence { + +public: + static_sequence(); + virtual ~static_sequence(); + + /** Returns the number of occurrences of c until position i */ + virtual uint rank(uint c, uint i)=0; + + /** Returns the position of the i-th c + * @return (uint)-1 if i=0, len if i exceeds the number of cs */ + virtual uint select(uint c, uint i)=0; + + /** Returns the i-th element */ + virtual uint access(uint i)=0; + + /** Returns the length of the sequence */ + virtual uint length(); + + /** Returns how many cs are in the sequence */ + virtual uint count(uint c)=0; + + /** Returns the size of the structure in bytes */ + virtual uint size()=0; + + /** Stores the bitmap given a file pointer, return 0 in case of success */ + virtual uint save(FILE * fp)=0; + + /** Reads a bitmap determining the type */ + static static_sequence * load(FILE * fp); + +protected: + /** Length of the bitstring */ + uint len; + +}; + +#include + +#endif /* _STATIC_SEQUENCE_H */ diff --git a/libcds/src/static_sequence/static_sequence_wvtree.cpp b/libcds/src/static_sequence/static_sequence_wvtree.cpp new file mode 100644 index 0000000..efdd8f7 --- /dev/null +++ b/libcds/src/static_sequence/static_sequence_wvtree.cpp @@ -0,0 +1,71 @@ + +#include + +static_sequence_wvtree::static_sequence_wvtree(uint * symbols, uint n, wt_coder * c, static_bitsequence_builder * bmb, alphabet_mapper * am) { + for(uint i=0;imap(symbols[i]); + this->am = am; + this->c=c; + cout << "Building..."; cout.flush(); + root = new wt_node_internal(symbols, n, 0, c, bmb); + cout << "done" << endl; cout.flush(); + for(uint i=0;iunmap(symbols[i]); +} + +static_sequence_wvtree::static_sequence_wvtree() {} + + +static_sequence_wvtree::~static_sequence_wvtree() { + delete root; + delete am; + delete c; +} + +uint static_sequence_wvtree::rank(uint symbol, uint pos) { + return root->rank(am->map(symbol), pos, 0, c); +} + +uint static_sequence_wvtree::count(uint s) { + return root->rank(am->map(s), len-1, 0, c); +} + +uint static_sequence_wvtree::select(uint symbol, uint pos) { + return root->select(am->map(symbol), pos, 0, c)-1; +} + +uint static_sequence_wvtree::access(uint pos) { + return am->unmap(root->access(pos)); +} + +uint static_sequence_wvtree::size() { + /*cout << "WT: " << root->size() << endl; + cout << "Coder: " << c->size() << endl; + cout << "AM: " << am->size() << endl;*/ + return sizeof(static_sequence_wvtree)+sizeof(uint)+root->size()+am->size()+c->size(); +} + +uint static_sequence_wvtree::save(FILE * fp) { + uint wr = WVTREE_HDR; + wr = fwrite(&wr,sizeof(uint),1,fp); + if(wr!=1) return 1; + wr = fwrite(&n,sizeof(uint),1,fp); + if(wr!=1) return 1; + if(c->save(fp)) return 1; + if(am->save(fp)) return 1; + if(root->save(fp)) return 1; + return 0; +} + +static_sequence_wvtree * static_sequence_wvtree::load(FILE *fp) { + uint rd; + if(fread(&rd,sizeof(uint),1,fp)!=1) return NULL; + if(rd!=WVTREE_HDR) return NULL; + static_sequence_wvtree * ret = new static_sequence_wvtree(); + if(fread(&ret->n,sizeof(uint),1,fp)!=1) return NULL; + ret->c = wt_coder::load(fp); + ret->am = alphabet_mapper::load(fp); + ret->root = wt_node::load(fp); + return ret; +} + diff --git a/libcds/src/static_sequence/static_sequence_wvtree.h b/libcds/src/static_sequence/static_sequence_wvtree.h new file mode 100644 index 0000000..b93caf2 --- /dev/null +++ b/libcds/src/static_sequence/static_sequence_wvtree.h @@ -0,0 +1,60 @@ + +#ifndef STATIC_SEQUENCE_WVTREE_H +#define STATIC_SEQUENCE_WVTREE_H +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +class static_sequence_wvtree : public static_sequence { + public: + + /** Builds a Wavelet Tree for the string + * pointed by symbols assuming its length + * equals n and the test flag allows to set + * if the structure must be tested for + * correctness after being created (this is very expensive). */ + static_sequence_wvtree(uint * symbols, uint n, wt_coder * coder, static_bitsequence_builder * bmb, alphabet_mapper * am); + + virtual ~static_sequence_wvtree(); + + virtual uint rank(uint symbol, uint pos); + + virtual uint select(uint symbol, uint i); + + virtual uint access(uint pos); + + virtual uint count(uint s); + + virtual uint size(); + + virtual uint save(FILE * fp); + static static_sequence_wvtree * load(FILE *fp); + protected: + + static_sequence_wvtree(); + + wt_node * root; + wt_coder * c; + alphabet_mapper * am; + //bitmap_builder * bmb; + + /** Length of the string. */ + uint n; + + /** Height of the Wavelet Tree. */ + uint max_v; + + /** Flag for testing for correcteness. */ + bool test; + + +}; +#endif /* _STATIC_SEQUENCE_WVTREE_H */ diff --git a/libcds/src/static_sequence/wt_coder.cpp b/libcds/src/static_sequence/wt_coder.cpp new file mode 100644 index 0000000..1a8cd99 --- /dev/null +++ b/libcds/src/static_sequence/wt_coder.cpp @@ -0,0 +1,12 @@ + +#include + +wt_coder * wt_coder::load(FILE *fp) { + uint rd; + if(fread(&rd,sizeof(uint),1,fp)!=1) return NULL; + fseek(fp,-sizeof(uint),SEEK_CUR); + switch(rd) { + case WT_CODER_HUFF_HDR: return wt_coder_huff::load(fp); + } + return NULL; +} diff --git a/libcds/src/static_sequence/wt_coder.h b/libcds/src/static_sequence/wt_coder.h new file mode 100644 index 0000000..4b7d4da --- /dev/null +++ b/libcds/src/static_sequence/wt_coder.h @@ -0,0 +1,26 @@ + +#ifndef wt_coder_h +#define wt_coder_h + +#include +#include + +using namespace std; + +#define WT_CODER_HUFF_HDR 2 + +class wt_coder { + public: + virtual ~wt_coder() {}; + virtual bool is_set(uint symbol, uint l)=0; + virtual bool done(uint symbol, uint l)=0; + virtual uint size()=0; + virtual uint save(FILE *fp)=0; + static wt_coder * load(FILE *fp); +}; + +#include +#include + +#endif + diff --git a/libcds/src/static_sequence/wt_coder_binary.cpp b/libcds/src/static_sequence/wt_coder_binary.cpp new file mode 100644 index 0000000..11ba4f5 --- /dev/null +++ b/libcds/src/static_sequence/wt_coder_binary.cpp @@ -0,0 +1,26 @@ + +#include + +wt_coder_binary::wt_coder_binary(uint * seq, uint n, alphabet_mapper * am) { + uint max_v = 0; + for(uint i=0;imap(seq[i]),max_v); + h=bits(max_v); +} + +wt_coder_binary::~wt_coder_binary() {} + +bool wt_coder_binary::is_set(uint symbol, uint l) { + if((1<<(h-l-1))&symbol) return true; + return false; +} + +bool wt_coder_binary::done(uint symbol, uint l) { + if(l==h-1) return true; + return false; +} + +uint wt_coder_binary::size() { + return sizeof(wt_coder_binary); +} + diff --git a/libcds/src/static_sequence/wt_coder_binary.h b/libcds/src/static_sequence/wt_coder_binary.h new file mode 100644 index 0000000..e784281 --- /dev/null +++ b/libcds/src/static_sequence/wt_coder_binary.h @@ -0,0 +1,22 @@ + +#ifndef wt_coder_binary_h +#define wt_coder_binary_h + +#include +#include +#include + +class wt_coder_binary: public wt_coder { + public: + wt_coder_binary(uint * seq, uint n, alphabet_mapper * am); + virtual ~wt_coder_binary(); + virtual bool is_set(uint symbol, uint l); + virtual bool done(uint symbol, uint l); + virtual uint size(); + + protected: + uint h; +}; + +#endif + diff --git a/libcds/src/static_sequence/wt_coder_huff.cpp b/libcds/src/static_sequence/wt_coder_huff.cpp new file mode 100644 index 0000000..f5fafef --- /dev/null +++ b/libcds/src/static_sequence/wt_coder_huff.cpp @@ -0,0 +1,59 @@ + +#include + +wt_coder_huff::wt_coder_huff(uint * symbs, uint n, alphabet_mapper * am) { + for(uint i=0;imap(symbs[i]); + hc = new huffman_codes(symbs, n); + buffer = new uint[hc->max_length()/W+1]; + s_len = 0; last_symbol = (uint)-1; + for(uint i=0;iunmap(symbs[i]); +} + +wt_coder_huff::wt_coder_huff() {} + +wt_coder_huff::~wt_coder_huff() { + delete hc; + delete [] buffer; +} + +bool wt_coder_huff::is_set(uint symbol, uint l) { + if(symbol!=last_symbol) { + s_len = (uint)hc->encode(symbol, buffer, (ulong)0); + last_symbol = symbol; + } + return bitget(buffer,l); +} + +bool wt_coder_huff::done(uint symbol, uint l) { + if(symbol!=last_symbol) { + s_len = (uint)hc->encode(symbol, buffer, (ulong)0); + last_symbol = symbol; + } + return l==s_len; +} + +uint wt_coder_huff::size() { + return 2*sizeof(uint)+sizeof(wt_coder_huff)+hc->size()+(hc->max_length()/W+1)*sizeof(uint); +} + +uint wt_coder_huff::save(FILE * fp) { + uint wr = WT_CODER_HUFF_HDR; + wr = fwrite(&wr,sizeof(uint),1,fp); + if(wr!=1) return 1; + if(hc->save(fp)) return 1; + //if(am->save(fp)) return 1; + return 0; +} + +wt_coder_huff * wt_coder_huff::load(FILE *fp) { + uint rd; + if(fread(&rd,sizeof(uint),1,fp)!=1) return NULL; + if(rd!=WT_CODER_HUFF_HDR) return NULL; + wt_coder_huff * ret = new wt_coder_huff(); + ret->hc = huffman_codes::load(fp); + ret->buffer = new uint[ret->hc->max_length()/W+1]; + ret->s_len = 0; ret->last_symbol = (uint)-1; + return ret; +} diff --git a/libcds/src/static_sequence/wt_coder_huff.h b/libcds/src/static_sequence/wt_coder_huff.h new file mode 100644 index 0000000..cb05056 --- /dev/null +++ b/libcds/src/static_sequence/wt_coder_huff.h @@ -0,0 +1,29 @@ + +#ifndef wt_coder_huff_h +#define wt_coder_huff_h + +#include +#include +#include +#include + +class wt_coder_huff: public wt_coder { + public: + wt_coder_huff(uint *symbs, uint n, alphabet_mapper * am); + virtual ~wt_coder_huff(); + virtual bool is_set(uint symbol, uint l); + virtual bool done(uint symbol, uint l); + virtual uint size(); + virtual uint save(FILE *fp); + static wt_coder_huff * load(FILE *fp); + uint * get_buffer(uint symbol, uint *n); + + protected: + wt_coder_huff(); + huffman_codes * hc; + uint * buffer; + uint last_symbol, s_len; +}; + +#endif + diff --git a/libcds/src/static_sequence/wt_node.cpp b/libcds/src/static_sequence/wt_node.cpp new file mode 100644 index 0000000..8240a2a --- /dev/null +++ b/libcds/src/static_sequence/wt_node.cpp @@ -0,0 +1,34 @@ +/* wt_node.h + * Copyright (C) 2008, Francisco Claude, all rights reserved. + * + * wt_node + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include + +wt_node * wt_node::load(FILE *fp) { + uint rd; + if(fread(&rd,sizeof(uint),1,fp)!=1) return NULL; + if(rd==WT_NODE_NULL_HDR) return NULL; + fseek(fp,-sizeof(uint),SEEK_CUR); + switch(rd) { + case WT_NODE_INTERNAL_HDR: return wt_node_internal::load(fp); + case WT_NODE_LEAF_HDR: return wt_node_leaf::load(fp); + } + return NULL; +} diff --git a/libcds/src/static_sequence/wt_node.h b/libcds/src/static_sequence/wt_node.h new file mode 100644 index 0000000..14ea05c --- /dev/null +++ b/libcds/src/static_sequence/wt_node.h @@ -0,0 +1,48 @@ +/* wt_node.h + * Copyright (C) 2008, Francisco Claude, all rights reserved. + * + * wt_node + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef wt_node_h +#define wt_node_h + +#include +#include + +#define WT_NODE_NULL_HDR 0 +#define WT_NODE_INTERNAL_HDR 2 +#define WT_NODE_LEAF_HDR 3 + + +class wt_node { + public: + virtual ~wt_node() {} + virtual uint rank(uint symbol, uint pos, uint l, wt_coder * c)=0; + virtual uint select(uint symbol, uint pos, uint l, wt_coder * c)=0; + virtual uint access(uint pos)=0; + virtual uint size()=0; + virtual uint save(FILE *fp)=0; + static wt_node * load(FILE *fp); +}; + +#include +#include + +#endif + diff --git a/libcds/src/static_sequence/wt_node_internal.cpp b/libcds/src/static_sequence/wt_node_internal.cpp new file mode 100644 index 0000000..e741b6f --- /dev/null +++ b/libcds/src/static_sequence/wt_node_internal.cpp @@ -0,0 +1,142 @@ + +#include + + +wt_node_internal::wt_node_internal(uint * symbols, uint n, uint l, wt_coder * c, static_bitsequence_builder * bmb) { + uint * ibitmap = new uint[n/W+1]; + for(uint i=0;iis_set(symbols[i],l)) + bitset(ibitmap,i); + bitmap = bmb->build(ibitmap, n); + delete [] ibitmap; + uint count_right = bitmap->rank1(n-1); + uint count_left = n-count_right+1; + uint * left = new uint[count_left+1]; + uint * right = new uint[count_right+1]; + count_right = count_left = 0; + bool match_left = true, match_right = true; + for(uint i=0;iaccess(i)) { + right[count_right++]=symbols[i]; + if(count_right>1) + if(right[count_right-1]!=right[count_right-2]) + match_right = false; + } + else { + left[count_left++]=symbols[i]; + if(count_left>1) + if(left[count_left-1]!=left[count_left-2]) + match_left = false; + } + } + if(count_left>0) { + if(match_left) + left_child = new wt_node_leaf(left[0], count_left); + else + left_child = new wt_node_internal(left, count_left, l+1, c, bmb); + } else { + left_child = NULL; + } + if(count_right>0) { + if(match_right) + right_child = new wt_node_leaf(right[0], count_right); + else + right_child = new wt_node_internal(right, count_right, l+1, c, bmb); + } else { + right_child = NULL; + } + delete [] left; + delete [] right; +} + +wt_node_internal::wt_node_internal() { } + +wt_node_internal::~wt_node_internal() { + delete bitmap; + if(right_child!=NULL) delete right_child; + if(left_child!=NULL) delete left_child; +} + +uint wt_node_internal::rank(uint symbol, uint pos, uint l, wt_coder * c) { + bool is_set = c->is_set(symbol,l); + if(!is_set) { + if(left_child==NULL) return 0; + return left_child->rank(symbol, bitmap->rank0(pos)-1,l+1,c); + } + else { + if(right_child==NULL) return 0; + return right_child->rank(symbol, bitmap->rank1(pos)-1,l+1,c); + } +} + +uint wt_node_internal::select(uint symbol, uint pos, uint l, wt_coder * c) { + bool is_set = c->is_set(symbol, l); + if(!is_set) { + if(left_child==NULL) + return (uint)(-1); + uint new_pos = left_child->select(symbol, pos, l+1,c); + if(new_pos+1==0) return (uint)(-1); + return bitmap->select0(new_pos)+1; + } else { + if(right_child==NULL) + return (uint)(-1); + uint new_pos = right_child->select(symbol, pos, l+1,c); + if(new_pos+1==0) return (uint)(-1); + return bitmap->select1(new_pos)+1; + } +} + +uint wt_node_internal::access(uint pos) { + bool is_set = bitmap->access(pos); + if(!is_set) { + assert(left_child!=NULL); + return left_child->access(bitmap->rank0(pos)-1); + } else { + assert(right_child!=NULL); + return right_child->access(bitmap->rank1(pos)-1); + } +} + +uint wt_node_internal::size() { + uint s = bitmap->size()+sizeof(wt_node_internal); + if(left_child!=NULL) + s += left_child->size(); + if(right_child!=NULL) + s += right_child->size(); + return s; +} + +uint wt_node_internal::save(FILE *fp) { + uint wr = WT_NODE_INTERNAL_HDR; + wr = fwrite(&wr,sizeof(uint),1,fp); + if(wr!=1) return 1; + if(bitmap->save(fp)) return 1; + if(left_child!=NULL) { + if(left_child->save(fp)) return 1; + } else { + wr = WT_NODE_NULL_HDR; + wr = fwrite(&wr,sizeof(uint),1,fp); + if(wr!=1) return 1; + } + if(right_child!=NULL) { + if(right_child->save(fp)) return 1; + } else { + wr = WT_NODE_NULL_HDR; + wr = fwrite(&wr,sizeof(uint),1,fp); + if(wr!=1) return 1; + } + return 0; +} + +wt_node_internal * wt_node_internal::load(FILE *fp) { + uint rd; + if(fread(&rd,sizeof(uint),1,fp)!=1) return NULL; + if(rd!=WT_NODE_INTERNAL_HDR) return NULL; + wt_node_internal * ret = new wt_node_internal(); + ret->bitmap = static_bitsequence::load(fp); + ret->left_child = wt_node::load(fp); + ret->right_child = wt_node::load(fp); + return ret; +} diff --git a/libcds/src/static_sequence/wt_node_internal.h b/libcds/src/static_sequence/wt_node_internal.h new file mode 100644 index 0000000..df77c18 --- /dev/null +++ b/libcds/src/static_sequence/wt_node_internal.h @@ -0,0 +1,33 @@ + +#ifndef wt_node_internal_h +#define wt_node_internal_h + +#include +#include +#include +#include +#include +#include +#include + +class wt_node_internal: public wt_node { + public: + wt_node_internal(uint * seq, uint n, uint l, wt_coder * c, static_bitsequence_builder * bmb); + virtual ~wt_node_internal(); + virtual uint rank(uint symbol, uint pos, uint level, wt_coder * c); + virtual uint select(uint symbol, uint pos, uint level, wt_coder * c); + virtual uint access(uint pos); + virtual uint size(); + virtual uint save(FILE *fp); + static wt_node_internal * load(FILE *fp); + + + protected: + wt_node_internal(); + wt_node *left_child, *right_child; + static_bitsequence * bitmap; + //uint length; +}; + +#endif + diff --git a/libcds/src/static_sequence/wt_node_leaf.cpp b/libcds/src/static_sequence/wt_node_leaf.cpp new file mode 100644 index 0000000..6f6c887 --- /dev/null +++ b/libcds/src/static_sequence/wt_node_leaf.cpp @@ -0,0 +1,52 @@ + +#include + +wt_node_leaf::wt_node_leaf(uint symbol, uint count) { + this->symbol = symbol; + this->count = count; +} + +wt_node_leaf::wt_node_leaf() {} + +wt_node_leaf::~wt_node_leaf() {} + +uint wt_node_leaf::rank(uint symbol, uint pos, uint l, wt_coder * c) { + assert(symbol==this->symbol); + pos++; + assert(pos<=count); + return pos; +} + +uint wt_node_leaf::select(uint symbol, uint pos, uint l, wt_coder * c) { + assert(symbol==this->symbol); + assert(pos<=count && pos>0); + return pos; +} + +uint wt_node_leaf::access(uint pos) { + assert(poscount),sizeof(uint),1,fp); + rd += fread(&(ret->symbol),sizeof(uint),1,fp); + if(rd!=2) { delete ret; return NULL; } + return ret; +} diff --git a/libcds/src/static_sequence/wt_node_leaf.h b/libcds/src/static_sequence/wt_node_leaf.h new file mode 100644 index 0000000..a64c2a2 --- /dev/null +++ b/libcds/src/static_sequence/wt_node_leaf.h @@ -0,0 +1,28 @@ + +#ifndef wt_node_leaf_h +#define wt_node_leaf_h + +#include +#include +#include +#include + +class wt_node_leaf: public wt_node { + public: + wt_node_leaf(uint symbol, uint count); + virtual ~wt_node_leaf(); + virtual uint rank(uint symbol, uint pos, uint l, wt_coder * c); + virtual uint select(uint symbol, uint pos, uint l, wt_coder * c); + virtual uint access(uint pos); + virtual uint size(); + virtual uint save(FILE *fp); + static wt_node_leaf * load(FILE *fp); + + protected: + wt_node_leaf(); + uint symbol; + uint count; +}; + +#endif + diff --git a/libcds/src/utils/alphabet_mapper.cpp b/libcds/src/utils/alphabet_mapper.cpp new file mode 100644 index 0000000..a1b8cd5 --- /dev/null +++ b/libcds/src/utils/alphabet_mapper.cpp @@ -0,0 +1,32 @@ +/* alphabet_mapper.cpp + * Copyright (C) 2008, Francisco Claude, all rights reserved. + * + * static_bitsequence definition + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include + +alphabet_mapper * alphabet_mapper::load(FILE *fp) { + uint rd; + if(fread(&rd,sizeof(uint),1,fp)!=1) return NULL; + fseek(fp,-1*sizeof(uint),SEEK_CUR); + switch(rd) { + case ALPHABET_MAPPER_NONE_HDR: return alphabet_mapper_none::load(fp); + } + return NULL; +} diff --git a/libcds/src/utils/alphabet_mapper.h b/libcds/src/utils/alphabet_mapper.h new file mode 100644 index 0000000..125e0c6 --- /dev/null +++ b/libcds/src/utils/alphabet_mapper.h @@ -0,0 +1,48 @@ +/* alphabet_mapper.h + * Copyright (C) 2008, Francisco Claude, all rights reserved. + * + * static_bitsequence definition + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef _ALPHABET_MAPPER_H +#define _ALPHABET_MAPPER_H + +#include +#include + +#define ALPHABET_MAPPER_NONE_HDR 2 + +using namespace std; + +/** Base class for alphabet mappers + * + * @author Francisco Claude + */ +class alphabet_mapper { + public: + virtual ~alphabet_mapper() {} + virtual uint map(uint s)=0; + virtual uint unmap(uint s)=0; + virtual uint size()=0; + virtual uint save(FILE *fp)=0; + static alphabet_mapper * load(FILE * fp); +}; + +#include + +#endif /* _ALPHABET_MAPPER_H */ diff --git a/libcds/src/utils/alphabet_mapper_none.cpp b/libcds/src/utils/alphabet_mapper_none.cpp new file mode 100644 index 0000000..9bb4bbe --- /dev/null +++ b/libcds/src/utils/alphabet_mapper_none.cpp @@ -0,0 +1,25 @@ + +#include + +alphabet_mapper_none::alphabet_mapper_none() { } + +uint alphabet_mapper_none::map(uint s) {return s;} + +uint alphabet_mapper_none::unmap(uint s) {return s;} + +uint alphabet_mapper_none::size() { return sizeof(alphabet_mapper_none); } + +uint alphabet_mapper_none::save(FILE *fp) { + uint wr = ALPHABET_MAPPER_NONE_HDR; + wr = fwrite(&wr,sizeof(uint),1,fp); + if(wr!=1) return 1; + return 0; +} + +alphabet_mapper_none * alphabet_mapper_none::load(FILE * fp) { + uint rd; + if(fread(&rd,sizeof(uint),1,fp)!=1) return NULL; + if(rd!=ALPHABET_MAPPER_NONE_HDR) return NULL; + return new alphabet_mapper_none(); +} + diff --git a/libcds/src/utils/alphabet_mapper_none.h b/libcds/src/utils/alphabet_mapper_none.h new file mode 100644 index 0000000..8609914 --- /dev/null +++ b/libcds/src/utils/alphabet_mapper_none.h @@ -0,0 +1,46 @@ +/* alphabet_mapper_none.h + * Copyright (C) 2008, Francisco Claude, all rights reserved. + * + * static_bitsequence definition + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef _ALPHABET_MAPPER_NONE_H +#define _ALPHABET_MAPPER_NONE_H + +#include +#include +#include + +using namespace std; + +/** Mapper that doesn't change the value (identity) + * + * @author Francisco Claude + */ +class alphabet_mapper_none : public alphabet_mapper { + public: + alphabet_mapper_none(); + virtual ~alphabet_mapper_none() {} + virtual uint map(uint s); + virtual uint unmap(uint s); + virtual uint size(); + virtual uint save(FILE *fp); + static alphabet_mapper_none * load(FILE *fp); +}; + +#endif /* _ALPHABET_MAPPER_NONE_H */ diff --git a/libcds/tests/Makefile b/libcds/tests/Makefile new file mode 100644 index 0000000..e1dfb7f --- /dev/null +++ b/libcds/tests/Makefile @@ -0,0 +1,34 @@ +CPP=g++ + +#CPPFLAGS=-g3 -Wall -I../includes/ +CPPFLAGS=-O9 -Wall -DNDEBUG -I../includes/ + +OBJECTS=test_naive.o test_rrr02.o test_brw32.o make_bitmap.o test_wvtree01.o test_wvtree02.o +BIN=test_naive test_rrr02 test_brw32 make_bitmap test_wvtree01 test_wvtree02 +LIB=../lib/libcds.a + +%.o: %.cpp + $(CPP) $(CPPFLAGS) -c $< -o $@ + +all: $(OBJECTS) $(BIN) + +test_naive: + $(CPP) $(CPPFLAGS) -o test_naive test_naive.o $(LIB) + +test_rrr02: + $(CPP) $(CPPFLAGS) -o test_rrr02 test_rrr02.o $(LIB) + +test_brw32: + $(CPP) $(CPPFLAGS) -o test_brw32 test_brw32.o $(LIB) + +make_bitmap: + $(CPP) $(CPPFLAGS) -o make_bitmap make_bitmap.o $(LIB) + +test_wvtree01: + $(CPP) $(CPPFLAGS) -o test_wvtree01 test_wvtree01.o $(LIB) + +test_wvtree02: + $(CPP) $(CPPFLAGS) -o test_wvtree02 test_wvtree02.o $(LIB) + +clean: + rm -f $(OBJECTS) $(BIN) diff --git a/libcds/tests/make_bitmap.cpp b/libcds/tests/make_bitmap.cpp new file mode 100644 index 0000000..0da2d93 --- /dev/null +++ b/libcds/tests/make_bitmap.cpp @@ -0,0 +1,37 @@ + +#include +#include +#include +#include +#include + +using namespace std; + +int main(int argc, char ** argv) { + if(argc!=4) { + cout << "usage: " << argv[0] << " " << endl; + return 0; + } + char * fname = argv[1]; + uint len = atoi(argv[2]); + uint ones = atoi(argv[3]); + uint * bm = new uint[uint_len(len,1)]; + for(uint i=0;i +#include +#include +#include + +using namespace std; + +int main(int argc, char ** argv) { + if(argc!=3) { + cout << "usage: " << argv[0] << " " << endl; + return 0; + } + FILE * fp = fopen(argv[1],"r"); + if(fp==NULL) { + cout << "Error opening " << argv[1] << endl; + return -1; + } + uint *bitseq, len; + uint l = fread(&len, sizeof(uint), 1, fp); + bitseq = new uint[uint_len(len,1)]; + l += fread(bitseq, sizeof(uint), uint_len(len,1), fp); + fclose(fp); + + uint sample_rate; + stringstream ss(argv[2]); + ss >> sample_rate; + + static_bitsequence * bs = new static_bitsequence_brw32(bitseq,len,sample_rate); + + char * fname = new char[string(argv[1]).length()+10]; + sprintf(fname,"%s.rrr",argv[1]); + + fp = fopen(fname,"w"); + cout << "Save: " << bs->save(fp) << endl; + fclose(fp); + delete bs; + + fp = fopen(fname,"r"); + bs = static_bitsequence::load(fp); + cout << bs << endl; + fclose(fp); + delete [] fname; + + cout << "Bitmap length: " << len << " =? " << bs->length() << endl; + cout << "Ones: " << bs->count_one() << endl; + cout << "Bitmap size: " << bs->size() << endl; + /*for(uint i=0;i<64;i++) { + if(i%15==0) cout << " "; + cout << (bs->access(i)?"1":"0"); + } + cout << endl;*/ + uint ones = 0; + for(uint i=0;iaccess(i) != (bitget(bitseq,i)!=0)) { + cout << "Access error for position " << i << endl; + cout << " got: " << bs->access(i) << " expected: " << (bitget(bitseq,i)!=0) << endl; + } + if(bs->rank1(i) != ones) { + cout << "Rank1 error for position " << i << endl; + cout << " got: " << bs->rank1(i) << " expected: " << ones << endl; + } + if(bitget(bitseq,i) && bs->select1(ones) != i) { + cout << "Select1 error for position " << i << " ones:" << ones << endl; + cout << " got: " << bs->select1(ones) << " expected: " << i << endl; + } + if(bs->rank0(i) != i+1-ones) { + cout << "Rank0 error for position " << i << endl; + cout << " got: " << bs->rank0(i) << " expected: " << ones << endl; + } + if(!bitget(bitseq,i) && bs->select0(i+1-ones) != i) { + cout << "Select0 error for position " << i << endl; + cout << " got: " << bs->select0(i+1-ones) << " expected: " << i << endl; + } + } + delete bs; + delete [] bitseq; + cout << "Test completed." << endl; + return 0; +} + diff --git a/libcds/tests/test_naive.cpp b/libcds/tests/test_naive.cpp new file mode 100644 index 0000000..3ce65ff --- /dev/null +++ b/libcds/tests/test_naive.cpp @@ -0,0 +1,49 @@ + +#include +#include +#include + +using namespace std; + +int main(int argc, char ** argv) { + if(argc!=2) { + cout << "usage: " << argv[0] << " " << endl; + return 0; + } + FILE * fp = fopen(argv[1],"r"); + if(fp==NULL) { + cout << "Error opening " << argv[1] << endl; + return -1; + } + uint *bitseq, len; + uint l = fread(&len, sizeof(uint), 1, fp); + bitseq = new uint[uint_len(len,1)]; + l += fread(bitseq, sizeof(uint), uint_len(len,1), fp); + static_bitsequence * bs = new static_bitsequence_naive(bitseq,len); + cout << "Bitmap length: " << len << " =? " << bs->length() << endl; + uint ones = 0; + for(uint i=0;irank1(i) != ones) { + cout << "Rank1 error for position " << i << endl; + cout << " got: " << bs->rank1(i) << " expected: " << ones << endl; + } + if(bitget(bitseq,i) && bs->select1(ones) != i) { + cout << "Select1 error for position " << i << endl; + cout << " got: " << bs->select1(ones) << " expected: " << i << endl; + } + if(bs->rank0(i) != i+1-ones) { + cout << "Rank0 error for position " << i << endl; + cout << " got: " << bs->rank0(i) << " expected: " << ones << endl; + } + if(!bitget(bitseq,i) && bs->select0(i+1-ones) != i) { + cout << "Select0 error for position " << i << endl; + cout << " got: " << bs->select0(ones) << " expected: " << i << endl; + } + } + delete bs; + fclose(fp); + cout << "Test completed." << endl; + return 0; +} + diff --git a/libcds/tests/test_rrr02.cpp b/libcds/tests/test_rrr02.cpp new file mode 100644 index 0000000..5c460f9 --- /dev/null +++ b/libcds/tests/test_rrr02.cpp @@ -0,0 +1,84 @@ + +#include +#include +#include +#include + +using namespace std; + +int main(int argc, char ** argv) { + if(argc!=3) { + cout << "usage: " << argv[0] << " " << endl; + return 0; + } + FILE * fp = fopen(argv[1],"r"); + if(fp==NULL) { + cout << "Error opening " << argv[1] << endl; + return -1; + } + uint *bitseq, len; + uint l=fread(&len, sizeof(uint), 1, fp); + bitseq = new uint[uint_len(len,1)]; + l+=fread(bitseq, sizeof(uint), uint_len(len,1), fp); + fclose(fp); + + static_bitsequence * bs = new static_bitsequence_rrr02(bitseq,len); + + char * fname = new char[string(argv[1]).length()+10]; + sprintf(fname,"%s.rrr",argv[1]); + + fp = fopen(fname,"w"); + cout << "Save: " << bs->save(fp) << endl; + fclose(fp); + delete bs; + + fp = fopen(fname,"r"); + bs = static_bitsequence::load(fp); + uint sample_rate; + stringstream ss(argv[2]); + ss >> sample_rate; + ((static_bitsequence_rrr02*)bs)->create_sampling(sample_rate); + fclose(fp); + delete [] fname; + + cout << "Bitmap length: " << len << " =? " << bs->length() << endl; + cout << "Ones: " << bs->count_one() << endl; + cout << "Bitmap size: " << bs->size() << endl; + /*for(uint i=0;i<64;i++) { + if(i%15==0) cout << " "; + cout << (bs->access(i)?"1":"0"); + } + cout << endl;*/ + uint ones = 0; + for(uint i=0;iaccess(i) != (bitget(bitseq,i)!=0)) { + cout << "Access error for position " << i << endl; + cout << " got: " << bs->access(i) << " expected: " << (bitget(bitseq,i)!=0) << endl; + } + if(bs->rank1(i) != ones) { + cout << "Rank1 error for position " << i << endl; + cout << " got: " << bs->rank1(i) << " expected: " << ones << endl; + } + if(bitget(bitseq,i) && bs->select1(ones) != i) { + cout << "Select1 error for position " << i << " ones:" << ones << endl; + cout << " got: " << bs->select1(ones) << " expected: " << i << endl; + } + if(bs->rank0(i) != i+1-ones) { + cout << "Rank0 error for position " << i << endl; + cout << " got: " << bs->rank0(i) << " expected: " << ones << endl; + } + if(!bitget(bitseq,i) && bs->select0(i+1-ones) != i) { + cout << "Select0 error for position " << i << endl; + cout << " got: " << bs->select0(i+1-ones) << " expected: " << i << endl; + } + } + delete bs; + //static_bitsequence_rrr02::delete_E(); + delete [] bitseq; + cout << "Test completed." << endl; + return 0; +} + diff --git a/libcds/tests/test_wvtree01.cpp b/libcds/tests/test_wvtree01.cpp new file mode 100644 index 0000000..48f6951 --- /dev/null +++ b/libcds/tests/test_wvtree01.cpp @@ -0,0 +1,84 @@ + +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +int main(int argc, char ** argv) { + if(argc!=3) { + cout << "usage: " << argv[0] << " " << endl; + return 0; + } + struct stat text_info; + if(stat(argv[1],&text_info)<0) { + cout << "could not stat: " << argv[1] << endl; + return -1; + } + + stringstream ss; + ss << argv[2]; + uint samp; + ss >> samp; + + uint n= (uint)text_info.st_size/4; + uint * text = new uint[n+1]; + FILE * fp = fopen(argv[1],"r"); + if(fp==NULL) { + cout << "could not open " << argv[1] << endl; + return -1; + } + + cout << "File: " << argv[1] << endl; + cout << "Length: " << n << endl; + + uint max_symbol = 0; + for(uint i=0;isave(fp); + fclose(fp); + delete wt; + fp = fopen(fname,"r"); + wt = static_sequence::load(fp); + fclose(fp); + delete [] fname; + + + cout << "WT Size: " << wt->size() << endl; + cout << "ft = " << 1.*wt->size()/(bits(max_symbol-1)*n/8) << endl; + + delete [] text; + delete wt; + +} diff --git a/libcds/tests/test_wvtree02.cpp b/libcds/tests/test_wvtree02.cpp new file mode 100644 index 0000000..8a76c72 --- /dev/null +++ b/libcds/tests/test_wvtree02.cpp @@ -0,0 +1,95 @@ + +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +int main(int argc, char ** argv) { + if(argc!=3) { + cout << "usage: " << argv[0] << " " << endl; + return 0; + } + struct stat text_info; + if(stat(argv[1],&text_info)<0) { + cout << "could not stat: " << argv[1] << endl; + return -1; + } + + uint samp; + { + stringstream ss; + ss << string(argv[2]); + + ss >> samp; + } + + uint n= (uint)text_info.st_size; + uint * text = new uint[n+1]; + FILE * fp = fopen(argv[1],"r"); + if(fp==NULL) { + cout << "could not open " << argv[1] << endl; + return -1; + } + + cout << "File: " << argv[1] << endl; + cout << "Length: " << n << endl; + + uint max_symbol = 0; + for(uint i=0;isave(fp) << endl; + fclose(fp); + delete wt; + fp = fopen(fname,"r"); + wt = static_sequence::load(fp); + fclose(fp); + delete [] fname; + + ((static_sequence_wvtree*)wt)->test_structure(text,n); + + cout << "WT Size: " << wt->size() << endl; + cout << "ft = " << 1.*wt->size()/n << endl; + + fname = new char[10+string(argv[1]).length()]; + sprintf(fname,"%s.wt",argv[1]); + fp = fopen(fname,"w"); + cout << "save: " << wt->save(fp) << endl; + fclose(fp); + delete [] fname; + + delete [] text; + delete wt; + +} diff --git a/testXML.srx b/testXML.srx new file mode 100644 index 0000000..fe7f118 Binary files /dev/null and b/testXML.srx differ diff --git a/test_XML.cpp b/test_XML.cpp new file mode 100644 index 0000000..20265cf --- /dev/null +++ b/test_XML.cpp @@ -0,0 +1,86 @@ + + +#include "XMLTree.h" + + +void traverseXML(XMLTree *X, treeNode x) + { + int j; + int d = X->NumChildren(x); + treeNode y; + + y = X->FirstChild(x); + for (j = 0; j < d; j++) { + traverseXML(X, y); + y = X->NextSibling(y); + } + } + + +int main() + { + XMLTree *X, *Y; + int rr, n, i; + unsigned char openTag[]="A", closeTag[]="/A", filename[]="testXML"; + treeNode x; + + n = 4999999; + + X = new XMLTree(); + + + + X->OpenDocument(false, 32); + + rr = 0; + for (i = 1; i < n; i++) { + //if (rand() % 100 < r) setbit(B,i,1); + if (rr > (n-i)) { + printf("???? rr=%d n=%d i=%d\n",rr,n,i); + //exit(1); + } else if (rr >= (n-i)) { + X->NewClosingTag(closeTag); + if ((1+(int) (2.0*rand()/(RAND_MAX+1.0))) == 1) + X->NewEmptyText(); + else X->NewText(NULL); // just a test, NULL string + rr--; + } + else { + if (rand() % (rr+1) < (rr+1)/2 || (rr<=1)) { + X->NewOpenTag(openTag); + if ((1+(int) (2.0*rand()/(RAND_MAX+1.0))) == 1) + X->NewEmptyText(); + else X->NewText(NULL); + rr++; + } + else { + X->NewClosingTag(closeTag); + if ((1+(int) (2.0*rand()/(RAND_MAX+1.0))) <= 1) + X->NewEmptyText(); + else X->NewText(NULL); + rr--; + } + } + } + + X->CloseDocument(); + + for (i=0; i <= (n+1)/2; i++) + X->TextXMLId(i); + + x = X->Root(); + + traverseXML(X, x); + + X->Save(filename); + + delete X; + + X = XMLTree::Load(filename, 32); + + x = X->Root(); + + traverseXML(X, x); + + } +