1 /******************************************************************************
2 * Copyright (C) 2009 by Niko Valimaki <nvalimak@cs.helsinki.fi> *
3 * Text collection interface for an in-memory XQuery/XPath engine *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
19 ******************************************************************************/
21 #ifndef _SXSI_TextCollectionBuilder_h_
22 #define _SXSI_TextCollectionBuilder_h_
24 #include "TextCollection.h"
25 #include "TextStorage.h"
// Default sample rate for suffix array samples.
// Used as the default value of the `samplerate` argument of
// TextCollectionBuilder::create().
#define TEXTCOLLECTION_DEFAULT_SAMPLERATE 32

// Default input length, used to calculate the buffer size.
// Used as the default value of the `estimatedInputLength` argument of
// TextCollectionBuilder::create(). Evaluates to 150 * 2^20
// (NOTE(review): presumably a byte count -- confirm against the implementation).
#define TEXTCOLLECTION_DEFAULT_INPUT_LENGTH (150 * 1024 * 1024)
38 * Builder for an instance of the TextCollection class.
40 class TextCollectionBuilder
43 // Index type defaults to FM-index.
44 // SWCSA can be used for natural language inputs.
45 // NB: Current SWCSA uses a lot of memory during construction!
46 enum index_type_t { index_type_default, index_type_swcsa, index_type_rlcsa };
48 static TextCollectionBuilder* create(unsigned samplerate = TEXTCOLLECTION_DEFAULT_SAMPLERATE,
49 index_type_t type = index_type_default,
50 ulong estimatedInputLength = TEXTCOLLECTION_DEFAULT_INPUT_LENGTH);
53 virtual ~TextCollectionBuilder() { };
58 * Must be a zero-terminated string from alphabet [1,255].
59 * Can not be called after makeStatic().
60 * The i'th text insertion gets an identifier value i-1.
61 * In other words, document identifiers start from 0.
63 * Second parameter tells if the text will be added to the
64 * index also. If false, text is added only to the TextCollection
65 * and can not be searched for.
67 virtual void InsertText(uchar const *, bool index = true) = 0;
71 * Convert to a static collection.
72 * New texts can not be inserted after this operation.
74 * TextStorage type defaults to TYPE_PLAIN_TEXT, another
75 * possible type is TYPE_LZ_INDEX.
77 virtual TextCollection * InitTextCollection(char type = TextStorage::TYPE_PLAIN_TEXT) = 0;
80 // Protected constructor; use the static method TextCollectionBuilder::create()
81 TextCollectionBuilder() { };
84 // No copy constructor or assignment
85 TextCollectionBuilder(TextCollectionBuilder const&);
86 TextCollectionBuilder& operator = (TextCollectionBuilder const&);