SHOGUN  5.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules
HashedDocConverter.h
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evangelos Anagnostopoulos
8  * Copyright (C) 2013 Evangelos Anagnostopoulos
9  */
10 
11 #ifndef _HASHEDDOCCONVERTER__H__
12 #define _HASHEDDOCCONVERTER__H__
13 
14 #include <shogun/lib/config.h>
15 
18 #include <shogun/lib/Tokenizer.h>
20 
// NOTE(review): this is a rendered source listing. The leading number on each
// line is the listing's own line counter, and gaps in that counter mark lines
// (doc comments and, apparently, some declarations) that the listing omits.
21 namespace shogun
22 {
// Forward declarations of types that appear in the signatures below.
23 class CFeatures;
24 class CTokenizer;
25 class CConverter;
26 template<class T> class CSparseFeatures;
27 
// NOTE(review): no class head is visible here; the body opens directly with
// '{'. The constructor/destructor names show it is the body of
// CHashedDocConverter. Presumably it derives from CConverter - confirm against
// the real header.
40 {
41 public:
44 
// Constructs a converter from a hash bit count. Defaults: normalize = false,
// n_grams = 1, skips = 0. No tokenizer is passed to this overload.
// presumably hash_bits fixes the size of the hashed target space - TODO confirm
53  CHashedDocConverter(int32_t hash_bits, bool normalize = false, int32_t n_grams = 1, int32_t skips = 0);
54 
// Same as the overload above, but with a caller-supplied tokenizer `tzer`.
// NOTE(review): ownership / reference counting of `tzer` is not visible here.
63  CHashedDocConverter(CTokenizer* tzer, int32_t hash_bits, bool normalize = false, int32_t n_grams = 1,
64  int32_t skips = 0);
65 
// Virtual destructor.
67  virtual ~CHashedDocConverter();
68 
// Takes a feature object and returns a feature object.
// presumably `features` must hold string documents and the result is a
// sparse hashed representation - verify against the implementation
74  virtual CFeatures* apply(CFeatures* features);
75 
// NOTE(review): the listing jumps from line 82 to 101 with no code shown; a
// declaration may have been dropped here.
82 
// Static helper: reads from `hashes` using the offset `hashes_start` and the
// count `len`, and takes `ngram_hashes` by non-const reference (presumably the
// output buffer). `num_bits`, `ngrams` and `tokens_to_skip` are passed in
// explicitly because a static function cannot read the instance members of
// the same names.
// presumably returns the number of entries written to ngram_hashes - TODO confirm
101  static index_t generate_ngram_hashes(SGVector<uint32_t>& hashes, index_t hashes_start, index_t len,
102  SGVector<index_t>& ngram_hashes, int32_t num_bits, int32_t ngrams, int32_t tokens_to_skip);
103 
// Returns a C string; presumably the class name - confirm.
105  virtual const char* get_name() const;
106 
// Setter taking the normalization flag.
111  void set_normalization(bool normalize);
112 
// Setter taking k and n, in that order.
// presumably k is the number of tokens to skip and n the n-gram size; note
// that this order is the reverse of the constructors' (n_grams, skips)
120  void set_k_skip_n_grams(int32_t k, int32_t n);
121 protected:
122 
// Takes the same kinds of arguments as the tokenizer constructor.
// presumably the shared initialisation routine called by both constructors,
// with `d` corresponding to hash_bits - TODO confirm
124  void init(CTokenizer* tzer, int32_t d, bool normalize, int32_t n_grams, int32_t skips);
125 
// Takes a dynamic array of hashed indices by non-const reference and returns
// an int32_t count.
// NOTE(review): whether the array is reordered or otherwise modified is not
// visible from the declaration.
132  int32_t count_distinct_indices(CDynamicArray<uint32_t>& hashed_indices);
133 
// NOTE(review): listing lines 134-140 are not shown; a declaration may be
// missing here.
141 
// Second `protected:` label; access is already protected at this point, so it
// only separates the data members from the member functions above.
142 protected:
143 
// Number of bits; same name as the num_bits argument of generate_ngram_hashes.
145  int32_t num_bits;
146 
// NOTE(review): listing lines 149 and 152 are empty; member declarations may
// have been dropped from the listing.
149 
152 
// N-gram setting; same name as the ngrams argument of generate_ngram_hashes.
154  int32_t ngrams;
155 
// Skip setting; same name as the tokens_to_skip argument of generate_ngram_hashes.
157  int32_t tokens_to_skip;
158 };
159 }
160 
161 #endif
Class Converter, used to convert data.
Definition: Converter.h:26
void init(CTokenizer *tzer, int32_t d, bool normalize, int32_t n_grams, int32_t skips)
int32_t index_t
Definition: common.h:62
static index_t generate_ngram_hashes(SGVector< uint32_t > &hashes, index_t hashes_start, index_t len, SGVector< index_t > &ngram_hashes, int32_t num_bits, int32_t ngrams, int32_t tokens_to_skip)
int32_t count_distinct_indices(CDynamicArray< uint32_t > &hashed_indices)
Template class SparseFeatures implements sparse matrices.
Template Dynamic array class that creates an array that can be used like a list or an array...
Definition: DynArray.h:22
virtual const char * get_name() const
This class can be used to convert a document collection contained in a CStringFeatures object w...
SGSparseVector< float64_t > create_hashed_representation(CDynamicArray< uint32_t > &hashed_indices)
The class CTokenizer acts as a base class in order to implement tokenizers. Sub-classes must implemen...
Definition: Tokenizer.h:29
All classes and functions are contained in the shogun namespace.
Definition: class_list.h:18
The class Features is the base class of all feature objects.
Definition: Features.h:68
void set_k_skip_n_grams(int32_t k, int32_t n)
virtual CFeatures * apply(CFeatures *features)
void set_normalization(bool normalize)

SHOGUN Machine Learning Toolbox - Documentation