1 /* icsa.h
\r * Copyright (C) 2011, Antonio Fariña and Eduardo Rodriguez, all rights reserved.
\r *
\r * icsa.h: Definition of the functions of the interface "../intIndex/interfaceIntIndex.h"
\r * that permits to represent a sequence of uint32 integers with an iCSA:
\r * An integer-oriented Compressed Suffix Array.
\r * Such representation will be handled as a "ticsa" data structure, that
\r * the WCSA self-index will use (as an opaque type) to
\r * create/save/load/search/recover/getSize of the representation of the
\r * original sequence.
\r * Suffix sorting is done via Larson/Sadakane suffix sort algorithm
\r * from the char-based csa.
\r *
\r * Ref: Larsson, N. J. and Sadakane, K. 2007. Faster suffix sorting. Theoretical
\r * Computer Science 387, 3, 258–272.
\r *
\r * See more details in:
\r * Antonio Fariña, Nieves Brisaboa, Gonzalo Navarro, Francisco Claude, Ángeles Places,
\r * and Eduardo Rodríguez. Word-based Self-Indexes for Natural Language Text. ACM
\r * Transactions on Information Systems (TOIS), 2012.
\r * http://vios.dc.fi.udc.es/indexing/wsi/publications.html
\r * http://www.dcc.uchile.cl/~gnavarro/ps/tois11.pdf
\r *
\r * This library is free software; you can redistribute it and/or
\r * modify it under the terms of the GNU Lesser General Public
\r * License as published by the Free Software Foundation; either
\r * version 2.1 of the License, or (at your option) any later version.
\r *
\r * This library is distributed in the hope that it will be useful,
\r * but WITHOUT ANY WARRANTY; without even the implied warranty of
\r * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
\r * Lesser General Public License for more details.
\r *
\r * You should have received a copy of the GNU Lesser General Public
\r * License along with this library; if not, write to the Free Software
\r * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
\r *
\r */
\r\r#include <stdio.h>
\r\r#include <fcntl.h>
\r#include <sys/stat.h>
\r#include <time.h>
\r#include <sys/time.h>
\r\r#include "../intIndexUtils/defValues.h"
\r\r#include "../utils/bitmap.h"
\r#include "../utils/huff.h"
\r#include "../utils/parameters.h"
\r\r#ifdef PSI_HUFFMANRLE
\r #include "../intIndexUtils/psiHuffmanRLE.h"
\r#endif
\r\r#ifdef PSI_GONZALO
\r #include "../intIndexUtils/psiGonzalo.h"
\r#endif
\r\r#ifdef PSI_DELTACODES
\r #include "../intIndexUtils/psiDeltaCode.h"
\r#endif
\r\r\rtypedef struct {
\r uint suffixArraySize;
\r uint T_Psi;
\r uint *D;
\r bitmap bD;
\r uint T_A;
\r uint T_AInv;
\r uint *samplesA;
\r uint samplesASize;
\r uint *BA;
\r bitmap bBA;
\r uint *samplesAInv;
\r uint samplesAInvSize;
\r uint displayCSAState;
\r long displayCSAPrevPosition;
\r #ifdef PSI_HUFFMANRLE
\r HuffmanCompressedPsi hcPsi;
\r #endif
\r #ifdef PSI_GONZALO
\r GonzaloCompressedPsi gcPsi;
\r #endif
\r #ifdef PSI_DELTACODES
\r DeltaCompressedPsi dcPsi;
\r #endif
\r \r //only needed during "parse_parameters".
\r uint tempNSHUFF;
\r uint psiSearchFactorJump; //factor of the T_Psi value.
\r} ticsa;
\r\r \r// FUNCTION PROTOTYPES: BUILDING THE INDEX
\r\r//Creates the ICSA
\r\r int buildIntIndex (uint *intVector, uint n, char *build_options, void **index ); //ticsa *createIntegerCSA (uint **aintVector, uint SAsize, char *build_options);
\r\r//Returns number of elements in the indexed sequence of integers
\r int sourceLenIntIndex(void *index, uint *numInts);
\r\r//Save the index to disk
\r int saveIntIndex(void *index, char *pathname); //void storeStructsCSA(ticsa *myicsa, char *basename);
\r\r// Loads the index from disk.
\r int loadIntIndex(char *pathname, void **index); //ticsa *loadCSA(char *basename);
\r\r// Frees memory
\r int freeIntIndex(void *index); //uint destroyStructsCSA(ticsa *myicsa);
\r\r//Returns the size (in bytes) of the index over the sequence of integers.
\r int sizeIntIndex(void *index, uint *numBytes); //uint CSA_size(ticsa *myicsa);
\r\r // Shows detailed summary info of the self-index (memory usage of each structure)
\rint printInfoIntIndex(void *index, const char tab[]);
\r\r//Number of occurrences of the pattern, and the interval [left,right] in the suffix array.
\r int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right);
\r //uint countCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right); // Exponential search
\r //uint countCSABin(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right); // Binary search
\r\r// Returns an array with integers corresponding offsets to the occurrences of the pattern,
\r// as well as the number of occurrences
\r int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong *numocc);
\r //uint *locateCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *occ);
\r\r//Returns the value of the source (array of integers) at a given offset.
\r// (that is, the element "position" from the original array of uints)
\r int displayIntIndex(void *index, ulong position, uint *value);
\r //uint displayCSA(ticsa *myicsa, uint position);
\r\r\r/* Private function prototypes ********************************************/
\ruint parametersCSA(ticsa *myicsa, char *build_options);
\r\ruint displayCSAFirst(ticsa *myicsa, uint position);
\ruint displayCSANext(ticsa *myicsa);
\rint SadCSACompare(ticsa *myicsa, uint *pattern, uint patternSize, uint p);
\ruint A(ticsa *myicsa, uint position);
\ruint inverseA(ticsa *myicsa, uint offset);
\r\rvoid showStructsCSA(ticsa *myicsa); // For Debugging
\r\r