X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=SWCSABuilder.h;fp=SWCSABuilder.h;h=e60bf027b968b791c58b6ccfe629c4410c7c63b8;hb=89dc22aee980ba16f757cd9a7f77478c2da50051;hp=0000000000000000000000000000000000000000;hpb=443151511a86083b21c1c06eb610f86b3aed35be;p=SXSI%2FTextCollection.git diff --git a/SWCSABuilder.h b/SWCSABuilder.h new file mode 100644 index 0000000..e60bf02 --- /dev/null +++ b/SWCSABuilder.h @@ -0,0 +1,118 @@ +/****************************************************************************** + * Copyright (C) 2009 by Niko Valimaki * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU Lesser General Public License as published * + * by the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Lesser General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +#ifndef _SXSI_SWCSABuilder_h_ +#define _SXSI_SWCSABuilder_h_ + +#include "TextCollectionBuilder.h" +#include "TextStorage.h" +#include "Tools.h" // Defines ulong and uchar. +#include "SWCSAWrapper.h" + +#include +#include // Defines std::pair. +#include // Defines std::strlen, added by Kim + +namespace SXSI +{ + /** + * Build an instance of the TextCollection class. + */ + class SWCSABuilder : public TextCollectionBuilder + { + public: + SWCSABuilder(unsigned sampler) + : text(""), samplerate(sampler), numberOfTexts(0) + { /* NOP */ } + + virtual ~SWCSABuilder() + { /* NOP */ } + + /** + * Insert text + * + * Must be a zero-terminated string from alphabet [1,255]. + * Can not be called after makeStatic(). + * The i'th text insertion gets an identifier value i-1. + * In other words, document identifiers start from 0. + * + * All texts must be inserted into the index! + * The default (FMIndex) text collection supports non-indexed texts. + */ + virtual void InsertText(uchar const *t, bool index = true) + { + if (strlen((char const *) t) == 0) + { + std::cerr << "SWCSABuilder::InsertText(): Can not index empty texts!" << std::endl; + std::exit(1); + } + assert(index); + if (!index) + { + std::cerr << "SWCSABuilder::InsertText(): The implementation of SWCSA does not support non-indexed texts" + << std::endl << "Use the default (FMIndex) text collection instead." << std::endl; + std::exit(1); + } + text.append((char const *) t, strlen((char const *) t) + 1); // +1 for 0-byte. + ++ numberOfTexts; + } + + /** + * Make static + * + * Convert to a static collection. + * New texts can not be inserted after this operation. + * + * + */ + virtual TextCollection * InitTextCollection(char type = TextStorage::TYPE_PLAIN_TEXT) + { + assert(type == TextStorage::TYPE_PLAIN_TEXT); + if (type != TextStorage::TYPE_PLAIN_TEXT) + { + std::cerr << "SWCSABuilder::InitTextCollection(): The implementation of SWCSA supports only TextStorage::TYPE_PLAIN_TEXT" + << std::endl << "Use the default (FMIndex) text collection instead." << std::endl; + std::exit(1); + } + + ulong n = text.size(); + uchar *t = new uchar[n]; // FIXME uses temporarily too much space + ulong l = text.copy((char *)t, n); + if (l != n) + { + std::cerr << "SWCSABuilder::InitTextCollection(): copy failed!" << std::endl; + std::exit(1); + } + text.clear(); + return new SWCSAWrapper(t, n, samplerate, numberOfTexts); // This will delete [] t. + } + + + private: + SWCSABuilder(); + std::string text; + unsigned samplerate; + unsigned numberOfTexts; + + // No copy constructor or assignment + SWCSABuilder(SWCSABuilder const&); + SWCSABuilder& operator = (SWCSABuilder const&); + }; +} +#endif