Lucene++ - a full-featured C++ search engine
API Documentation


StandardTokenizer.h
Go to the documentation of this file.
1 // Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3 // Distributable under the terms of either the Apache License (Version 2.0)
4 // or the GNU Lesser General Public License.
6 
7 #ifndef STANDARDTOKENIZER_H
8 #define STANDARDTOKENIZER_H
9 
10 #include "Tokenizer.h"
11 
12 namespace Lucene {
13 
34 class LPPAPI StandardTokenizer : public Tokenizer {
35 public:
38  StandardTokenizer(LuceneVersion::Version matchVersion, const ReaderPtr& input);
39 
41  StandardTokenizer(LuceneVersion::Version matchVersion, const AttributeSourcePtr& source, const ReaderPtr& input);
42 
44  StandardTokenizer(LuceneVersion::Version matchVersion, const AttributeFactoryPtr& factory, const ReaderPtr& input);
45 
46  virtual ~StandardTokenizer();
47 
49 
50 protected:
53 
55  int32_t maxTokenLength;
56 
57  // this tokenizer generates four attributes: term, offset, positionIncrement and type
62 
63 public:
64  static const int32_t ALPHANUM;
65  static const int32_t APOSTROPHE;
66  static const int32_t ACRONYM;
67  static const int32_t COMPANY;
68  static const int32_t EMAIL;
69  static const int32_t HOST;
70  static const int32_t NUM;
71  static const int32_t CJ;
72 
74  static const int32_t ACRONYM_DEP;
75 
77  static const Collection<String> TOKEN_TYPES();
78 
79 protected:
80  void init(const ReaderPtr& input, LuceneVersion::Version matchVersion);
81 
82 public:
84  void setMaxTokenLength(int32_t length);
85 
87  int32_t getMaxTokenLength();
88 
90  virtual bool incrementToken();
91 
92  virtual void end();
93 
94  virtual void reset(const ReaderPtr& input);
95 
98  bool isReplaceInvalidAcronym();
99 
102  void setReplaceInvalidAcronym(bool replaceInvalidAcronym);
103 };
104 
105 }
106 
107 #endif
boost::shared_ptr< Reader > ReaderPtr
Definition: LuceneTypes.h:547
boost::shared_ptr< PositionIncrementAttribute > PositionIncrementAttributePtr
Definition: LuceneTypes.h:45
static const int32_t ALPHANUM
Definition: StandardTokenizer.h:64
OffsetAttributePtr offsetAtt
Definition: StandardTokenizer.h:59
boost::shared_ptr< AttributeFactory > AttributeFactoryPtr
Definition: LuceneTypes.h:519
boost::shared_ptr< StandardTokenizerImpl > StandardTokenizerImplPtr
Definition: LuceneTypes.h:53
boost::shared_ptr< AttributeSource > AttributeSourcePtr
Definition: LuceneTypes.h:520
boost::shared_ptr< TermAttribute > TermAttributePtr
Definition: LuceneTypes.h:58
static const int32_t ACRONYM_DEP
Definition: StandardTokenizer.h:74
bool replaceInvalidAcronym
Definition: StandardTokenizer.h:54
static const int32_t ACRONYM
Definition: StandardTokenizer.h:66
int32_t maxTokenLength
Definition: StandardTokenizer.h:55
PositionIncrementAttributePtr posIncrAtt
Definition: StandardTokenizer.h:60
static const int32_t HOST
Definition: StandardTokenizer.h:69
static const int32_t COMPANY
Definition: StandardTokenizer.h:67
TermAttributePtr termAtt
Definition: StandardTokenizer.h:58
static const int32_t CJ
Definition: StandardTokenizer.h:71
#define LUCENE_CLASS(Name)
Definition: LuceneObject.h:24
Definition: AbstractAllTermDocs.h:12
A grammar-based tokenizer.
Definition: StandardTokenizer.h:34
boost::shared_ptr< OffsetAttribute > OffsetAttributePtr
Definition: LuceneTypes.h:40
boost::shared_ptr< TypeAttribute > TypeAttributePtr
Definition: LuceneTypes.h:64
static const int32_t EMAIL
Definition: StandardTokenizer.h:68
A Tokenizer is a TokenStream whose input is a Reader.
Definition: Tokenizer.h:20
Version
Definition: Constants.h:40
static const int32_t NUM
Definition: StandardTokenizer.h:70
TypeAttributePtr typeAtt
Definition: StandardTokenizer.h:61
static const int32_t APOSTROPHE
Definition: StandardTokenizer.h:65

clucene.sourceforge.net