X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=swcsa%2FintIndex%2Ficsa.h;h=6114a35acd7f7e78b01efcdd43c5bd2a93688f26;hb=898f6e5c6b7223f4753b7ccb7939809ee5f53aae;hp=6d4dc3da3a63900e69b87efd60c19a0695b32cba;hpb=6ba83dea468496a31eb57bdbac9b257a5a191a26;p=SXSI%2FTextCollection.git diff --git a/swcsa/intIndex/icsa.h b/swcsa/intIndex/icsa.h index 6d4dc3d..6114a35 100755 --- a/swcsa/intIndex/icsa.h +++ b/swcsa/intIndex/icsa.h @@ -1 +1 @@ -/* icsa.h * Copyright (C) 2011, Antonio Fariña and Eduardo Rodriguez, all rights reserved. * * icsa.h: Definition of the functions of the interface "../intIndex/interfaceIntIndex.h" * that permits to represent a sequence of uint32 integers with an iCSA: * An integer-oriented Compressed Suffix Array. * Such representation will be handled as a "ticsa" data structure, that * the WCSA self-index will use (as an opaque type) to * create/save/load/search/recover/getSize of the representation of the * original sequence. * Suffix sorting is done via Larson/Sadakane suffix sort algorithm * from the char-based csa. * * Ref: Larsson, N. J. and Sadakane, K. 2007. Faster suffix sorting. Theoretical * Computer Science 387, 3, 258–272. * * See more details in: * Antonio Fariña, Nieves Brisaboa, Gonzalo Navarro, Francisco Claude, Ángeles Places, * and Eduardo Rodríguez. Word-based Self-Indexes for Natural Language Text. ACM * Transactions on Information Systems (TOIS), 2012. * http://vios.dc.fi.udc.es/indexing/wsi/publications.html * http://www.dcc.uchile.cl/~gnavarro/ps/tois11.pdf * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include #include #include #include #include "../intIndexUtils/defValues.h" #include "../utils/bitmap.h" #include "../utils/huff.h" #include "../utils/parameters.h" #ifdef PSI_HUFFMANRLE #include "../intIndexUtils/psiHuffmanRLE.h" #endif #ifdef PSI_GONZALO #include "../intIndexUtils/psiGonzalo.h" #endif #ifdef PSI_DELTACODES #include "../intIndexUtils/psiDeltaCode.h" #endif typedef struct { uint suffixArraySize; uint T_Psi; uint *D; bitmap bD; uint T_A; uint T_AInv; uint *samplesA; uint samplesASize; uint *BA; bitmap bBA; uint *samplesAInv; uint samplesAInvSize; uint displayCSAState; long displayCSAPrevPosition; #ifdef PSI_HUFFMANRLE HuffmanCompressedPsi hcPsi; #endif #ifdef PSI_GONZALO GonzaloCompressedPsi gcPsi; #endif #ifdef PSI_DELTACODES DeltaCompressedPsi dcPsi; #endif //only needed during "parse_parameters". uint tempNSHUFF; uint psiSearchFactorJump; //factor of the T_Psi value. } ticsa; // FUNCTION PROTOTYPES: BUILDING THE INDEX //Creates the ICSA int buildIntIndex (uint *intVector, uint n, char *build_options, void **index ); //ticsa *createIntegerCSA (uint **aintVector, uint SAsize, char *build_options); //Returns number of elements in the indexed sequence of integers int sourceLenIntIndex(void *index, uint *numInts); //Save the index to disk int saveIntIndex(void *index, char *pathname); //void storeStructsCSA(ticsa *myicsa, char *basename); // Loads the index from disk. int loadIntIndex(char *pathname, void **index); //ticsa *loadCSA(char *basename); // Frees memory int freeIntIndex(void *index); //uint destroyStructsCSA(ticsa *myicsa); //Returns the size (in bytes) of the index over the sequence of integers. int sizeIntIndex(void *index, uint *numBytes); //uint CSA_size(ticsa *myicsa); // Shows detailed summary info of the self-index (memory usage of each structure) int printInfoIntIndex(void *index, const char tab[]); //Number of occurrences of the pattern, and the interval [left,right] in the suffix array. int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right); //uint countCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right); // Exponential search //uint countCSABin(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right); // Binary search // Returns an array with integers corresponding offsets to the occurrences of the pattern, // as well as the number of occurrences int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong *numocc); //uint *locateCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *occ); //Returns the value of the source (array of integers) at a given offset. // (that is, the element "position" from the original array of uints) int displayIntIndex(void *index, ulong position, uint *value); //uint displayCSA(ticsa *myicsa, uint position); /* Private function prototypes ********************************************/ uint parametersCSA(ticsa *myicsa, char *build_options); uint displayCSAFirst(ticsa *myicsa, uint position); uint displayCSANext(ticsa *myicsa); int SadCSACompare(ticsa *myicsa, uint *pattern, uint patternSize, uint p); uint A(ticsa *myicsa, uint position); uint inverseA(ticsa *myicsa, uint offset); void showStructsCSA(ticsa *myicsa); // For Debugging \ No newline at end of file +/* icsa.h * Copyright (C) 2011, Antonio Fariña and Eduardo Rodriguez, all rights reserved. * * icsa.h: Definition of the functions of the interface "../intIndex/interfaceIntIndex.h" * that permits to represent a sequence of uint32 integers with an iCSA: * An integer-oriented Compressed Suffix Array. * Such representation will be handled as a "ticsa" data structure, that * the WCSA self-index will use (as an opaque type) to * create/save/load/search/recover/getSize of the representation of the * original sequence. * Suffix sorting is done via Larson/Sadakane suffix sort algorithm * from the char-based csa. * * Ref: Larsson, N. J. and Sadakane, K. 2007. Faster suffix sorting. Theoretical * Computer Science 387, 3, 258–272. * * See more details in: * Antonio Fariña, Nieves Brisaboa, Gonzalo Navarro, Francisco Claude, Ángeles Places, * and Eduardo Rodríguez. Word-based Self-Indexes for Natural Language Text. ACM * Transactions on Information Systems (TOIS), 2012. * http://vios.dc.fi.udc.es/indexing/wsi/publications.html * http://www.dcc.uchile.cl/~gnavarro/ps/tois11.pdf * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include #include #include #include #include "../intIndex/defValues.h" #include "../utils/bitmap.h" #include "../utils/huff.h" #include "../utils/parameters.h" #include "../utils/basics.h" #ifdef PSI_HUFFMANRLE #include "../intIndex/psiHuffmanRLE.h" #endif #ifdef PSI_GONZALO #include "../intIndex/psiGonzalo.h" #endif #ifdef PSI_DELTACODES #include "../intIndex/psiDeltaCode.h" #endif typedef struct { uint suffixArraySize; uint T_Psi; uint *D; bitmap bD; uint T_A; uint T_AInv; uint *samplesA; uint samplesASize; uint *BA; bitmap bBA; uint *samplesAInv; uint samplesAInvSize; uint displayCSAState; long displayCSAPrevPosition; #ifdef PSI_HUFFMANRLE HuffmanCompressedPsi hcPsi; #endif #ifdef PSI_GONZALO GonzaloCompressedPsi gcPsi; #endif #ifdef PSI_DELTACODES DeltaCompressedPsi dcPsi; #endif //only needed during "parse_parameters". uint tempNSHUFF; uint psiSearchFactorJump; //factor of the T_Psi value. } ticsa; // FUNCTION PROTOTYPES: BUILDING THE INDEX //Creates the ICSA int buildIntIndex (uint *intVector, uint n, char *build_options, void **index ); //ticsa *createIntegerCSA (uint **aintVector, uint SAsize, char *build_options); //Returns number of elements in the indexed sequence of integers int sourceLenIntIndex(void *index, uint *numInts); //Save the index to disk int saveIntIndex(void *index, char *pathname); //void storeStructsCSA(ticsa *myicsa, char *basename); // Loads the index from disk. int loadIntIndex(char *pathname, void **index); //ticsa *loadCSA(char *basename); // Frees memory int freeIntIndex(void *index); //uint destroyStructsCSA(ticsa *myicsa); //Returns the size (in bytes) of the index over the sequence of integers. int sizeIntIndex(void *index, uint *numBytes); //uint CSA_size(ticsa *myicsa); // Shows detailed summary info of the self-index (memory usage of each structure) int printInfoIntIndex(void *index, const char tab[]); //Number of occurrences of the pattern, and the interval [left,right] in the suffix array. int countIntIndex(void *index, uint *pattern, uint length, ulong *numocc, ulong *left, ulong *right); //uint countCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right); // Exponential search //uint countCSABin(ticsa *myicsa, uint *pattern, uint patternSize, uint *left, uint *right); // Binary search // Returns an array with integers corresponding offsets to the occurrences of the pattern, // as well as the number of occurrences int locateIntIndex(void *index, uint *pattern, uint length, ulong **occ, ulong *numocc); //uint *locateCSA(ticsa *myicsa, uint *pattern, uint patternSize, uint *occ); //Returns the value of the source (array of integers) at a given offset. // (that is, the element "position" from the original array of uints) int displayIntIndex(void *index, ulong position, uint *value); //uint displayCSA(ticsa *myicsa, uint position); /* Private function prototypes ********************************************/ uint parametersCSA(ticsa *myicsa, char *build_options); uint displayCSAFirst(ticsa *myicsa, uint position); uint displayCSANext(ticsa *myicsa); int SadCSACompare(ticsa *myicsa, uint *pattern, uint patternSize, uint p); uint A(ticsa *myicsa, uint position); uint inverseA(ticsa *myicsa, uint offset); void showStructsCSA(ticsa *myicsa); // For Debugging \ No newline at end of file