- /*i = 0;
- for (map<ulong, pair<DocId, ulong> >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it, ++i)
- {
- int docc = (*endmarkerDocId)[i];
- ulong poss = (*endmarkerPos)[i];
- printf("endm[%u] = %lu (text pos: %lu) (recorded: %d, %lu)\n", (it->second).first, it->first, (it->second).second, docc, poss);
- }*/
-/*
- for (i = 0; i < numberOfTexts; ++ i)
+ sampled = new BSGAP(sampledpositions,n,true);
+ sampleLength = sampled->rank(n-1);
+ assert(sampleCount == sampleLength);
+
+ // Suffixes store an offset from the text start position
+ suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
+ suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
+
+ for(ulong i=0; i<sampleLength; i++) {
+ assert((*positions)[i] < n);
+ ulong j = sampled->rank((*positions)[i]);
+ if (j==0) j=sampleLength;
+ TextPosition textPos = (*tmpSuffix)[i];
+ (*suffixDocId)[j-1] = DocIdAtTextPos(textStartPos, textPos);
+
+ assert((unsigned)DocIdAtTextPos(textStartPos, textPos) < numberOfTexts);
+ assert((*suffixDocId)[j-1] < numberOfTexts);
+ // calculate offset from text start:
+ (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]];
+ }
+ // FIXME Temp, remove
+ delete tmpSuffix;
+ delete positions;
+// delete textLength;
+ delete textStartPos;
+}
+
+
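+/**
+ * Finds the document identifier for the given text position.
+ *
+ * NOTE(review): this comment used to say the document's starting text position is stored into the second parameter, but 'i' is passed by value in the signature below -- confirm and update.
+ * The lookup is a binary search over the text starting positions.
+ */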
+TextCollection::DocId CSA::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
+{
+ assert(i < n);
+
+ DocId a = 0;
+ DocId b = numberOfTexts - 1;
+ while (a < b)