3 // Implements operations over a sequence of balanced parentheses
5 #include "parentheses.h"
7 // I have decided not to implement Munro et al.'s scheme, as it is too
8 // complicated and the overhead is not so small in practice. I have opted
9 // for a simpler scheme. Each open (closing) parenthesis will be able to
10 // find its matching closing (open) parenthesis. If the distance is shorter
11 // than b, we will do it by hand, traversing the string. Otherwise, the
12 // answer will be stored in a hash table. In fact, only subtrees larger than
13 // s will have the full distance stored, while those between b and s will
14 // be in another table with just log s bits. The traversal by hand proceeds
15 // in fact by chunks of k bits, whose answers are precomputed.
16 // Space: there cannot be more than n/s subtrees larger than s, idem b.
17 // So we have (n/s)log n bits for far pointers and (n/b)log s for near
18 // pointers. The space for the table is 2^k*k*log b. The optimum s=b log n,
19 // in which case the space is n/b(1 + log b + log log n) + 2^k*k*log b.
20 // Time: the time is O(b/k), and we want to keep it O(log log n), so
22 // (previous arguments hold if there are no unary nodes, but we hope that
23 // there are not too many -- in revtrie we compress unary paths except when
25 // Settings: using b = log n, we have
26 // space = n log log n / log n + 2^(log n / log log n) log n
28 // In practice: we use k = 8 (bytes table), b = W (so work = 4 or 8)
29 // and space ~= n/3 + 10 Kbytes (fixed table).
30 // Notice that we need a hash table that stores only the deltas and does not
31 // store the keys! (they would take log n instead of log s!). Collisions are
32 // resolved as follows: see all the deltas that could be and pick the smallest
33 // one whose excess is the same of the argument. To make this low we use a
34 // load factor of 2.0, so it is 2n/3 after all.
35 // We need the same for the reverses, for the forward is only for ('s and
36 // reverses for )'s, so the proportion stays the same.
37 // We also need the stream to be a bitmap to know how many open parentheses
38 // we have to the left. The operations are as follows:
39 // findclose: use the mechanism described above
40 // findparent: similar, in reverse, looking for the current excess - 1
41 // this needs us to store the (near/far) parent of each node, which may
42 // cost more than the next sibling.
43 // excess: using the number of open parentheses
44 // enclose: almost findparent
46 // creates a parentheses structure from a bitstring, which is shared
47 // n is the total number of parentheses, opening + closing
49 static uint calcsizes (parentheses P, uint posparent, uint posopen,
50 uint *near, uint *far, uint *pnear, uint *pfar)
52 { uint posclose,newpos;
53 if ((posopen == P->n) || bitget1(P->data,posopen))
54 return posopen; // no more trees
56 do { posclose = newpos+1;
57 newpos = calcsizes (P,posopen,posclose,near,far,pnear,pfar);
59 while (newpos != posclose);
60 if ((posclose < P->n) && (posclose-posopen > W)) // exists and not small
61 if (posclose-posopen < (1<<P->sbits)) (*near)++; // near pointer
63 if ((posopen > 0) && (posopen-posparent > W)) // exists and not small
64 if (posopen-posparent < (1<<P->sbits)) (*pnear)++; // near pointer
70 static uint filltables (parentheses P, uint posparent, uint posopen, bool bwd)
72 { uint posclose,newpos;
73 if ((posopen == P->n) || bitget1(P->data,posopen))
74 return posopen; // no more trees
76 do { posclose = newpos+1;
77 newpos = filltables (P,posopen,posclose,bwd);
79 while (newpos != posclose);
80 if ((posclose < P->n) && (posclose-posopen > W)) // exists and not small
81 { if (posclose-posopen < (1<<P->sbits)) // near pointers
82 insertHash (P->bftable,posopen,posclose-posopen);
84 insertHash (P->sftable,posopen,posclose-posopen);
86 if (bwd && (posopen > 0) && (posopen-posparent > W)) //exists and not small
87 { if (posopen-posparent < (1<<P->sbits)) // near pointer
88 insertHash (P->bbtable,posopen,posopen-posparent);
90 insertHash (P->sbtable,posopen,posopen-posparent);
97 static byte FwdPos[256][W/2];
98 static byte BwdPos[256][W/2];
99 static char Excess[256];
100 static bool tablesComputed = false;
102 static void fcompchar (byte x, byte *pos, char *excess)
106 for (i=0;i<W/2;i++) pos[i] = 0;
108 { if (x & 1) // closing
110 if ((exc < 0) && !pos[-exc-1]) pos[-exc-1] = i+1;
118 static void bcompchar (byte x, byte *pos)
122 for (i=0;i<W/2;i++) pos[i] = 0;
124 { if (x & 128) // opening, will be used on complemented masks
126 if ((exc > 0) && !pos[exc-1]) pos[exc-1] = i+1;
133 parentheses createParentheses (uint *string, uint n, bool bwd)
136 uint i,s,nb,ns,nbits,near,far,pnear,pfar;
138 P = malloc (sizeof(struct sparentheses));
141 P->bdata = createBitmap (string,n,false);
144 P->sbits = bits(s-1);
145 s = 1 << P->sbits; // to take the most advantage of what we can represent
146 ns = (n+s-1)/s; nb = (s+W-1)/W; // adjustments
147 near = far = pnear = pfar = 0;
148 calcsizes (P,~0,0,&near,&far,&pnear,&pfar);
150 printf (" Parentheses: total %i, near %i, far %i, pnear %i, pfar %i\n",n,near,far,pnear,pfar);
152 P->sftable = createHash(far,nbits,1.8);
153 P->bftable = createHash(near,P->sbits,1.8);
155 P->sbtable = createHash (pfar,nbits,2.5);
156 P->bbtable = createHash (pnear,P->sbits,2.5);
157 filltables (P,~0,0,true);
159 { tablesComputed = true;
161 { fcompchar (i,FwdPos[i],Excess+i);
162 bcompchar (i,BwdPos[i]);
168 // frees parentheses structure, including the bitstream
170 void destroyParentheses (parentheses P)
172 { destroyBitmap(P->bdata);
173 destroyHash (P->sftable);
174 if (P->sbtable) destroyHash (P->sbtable);
175 destroyHash (P->bftable);
176 if (P->bbtable) destroyHash (P->bbtable);
180 // the position of the closing parenthesis corresponding to (opening)
181 // parenthesis at position i
183 uint findclose (parentheses P, uint i)
186 uint len,res,minres,exc;
190 // first see if it is at small distance
191 len = W; if (i+len >= P->n) len = P->n-i-1;
192 bitW = bitget (P->data,i+1,len);
194 while (bitW && (exc < W/2))
195 // either we shift it all or it only opens parentheses or too
196 // many open parentheses
198 if (res = FwdPos[W1][exc]) return i+len+res;
199 bitW >>= 8; exc += Excess[W1];
202 // ok, it's not a small distance, try with hashing btable
204 myexcess = excess (P,i);
205 res = searchHash (P->bftable,i,&h);
207 { if (!minres || (res < minres))
208 if ((i+res+1 < P->n) && (excess(P,i+res+1) == myexcess))
210 res = nextHash (P->bftable,&h);
212 if (minres) return i+minres;
213 // finally, it has to be a far pointer
214 res = searchHash (P->sftable,i,&h);
216 { if (!minres || (res < minres))
217 if ((i+res+1 < P->n) && (excess(P,i+res+1) == myexcess))
219 res = nextHash (P->sftable,&h);
221 return i+minres; // there should be one if the sequence is balanced!
224 // find enclosing parenthesis for an open parenthesis
225 // assumes that the parenthesis has an enclosing pair
227 uint findparent (parentheses P, uint i)
230 uint len,res,minres,exc;
234 // first see if it is at small distance
235 len = W; if (i < len) len = i-1;
236 bitW = ~bitget (P->data,i-len,len) << (W-len);
238 while (bitW && (exc < W/2))
239 // either we shift it all or it only closes parentheses or too
240 // many closed parentheses
241 { W1 = (bitW >> (W-8));
242 if (res = BwdPos[W1][exc]) return i-len-res;
243 bitW <<= 8; exc += Excess[W1]; // note W1 is complemented!
246 // ok, it's not a small distance, try with hashing btable
248 myexcess = excess (P,i) - 1;
249 res = searchHash (P->bbtable,i,&h);
251 { if (!minres || (res < minres))
252 if ((i-res >= 0) && (excess(P,i-res) == myexcess))
254 res = nextHash (P->bbtable,&h);
256 if (minres) return i-minres;
257 // finally, it has to be a far pointer
258 res = searchHash (P->sbtable,i,&h);
260 { if (!minres || (res < minres))
261 if ((i-res >= 0) && (excess(P,i-res) == myexcess))
263 res = nextHash (P->sbtable,&h);
265 return i-minres; // there should be one if the sequence is balanced!
268 // # open - # close at position i, not included
270 uint excess (parentheses P, uint i)
272 { return i - 2*rank(P->bdata,i);
275 // open position of closest parentheses pair that contains the pair
276 // that opens at i, ~0 if no parent
278 uint enclose (parentheses P, uint i)
280 { if (i == 0) return ~0; // no parent!
281 return findparent (P,i);
286 uint sizeofParentheses(parentheses P)
288 return sizeof(struct sparentheses) +
289 sizeofBitmap(P->bdata) +
290 sizeofHash(P->sftable) +
291 sizeofHash(P->sbtable) +
292 sizeofHash(P->bftable) +
293 sizeofHash(P->bbtable);