SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
StreamingHashedDocDotFeatures.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evangelos Anagnostopoulos
8  * Copyright (C) 2013 Evangelos Anagnostopoulos
9  */
10 
14 
15 using namespace shogun;
16 
18  bool is_labelled, int32_t size, CTokenizer* tzer, int32_t bits)
20 {
21  init(file, is_labelled, size, tzer, bits, true, 1, 0);
22 }
23 
25 {
26  init(NULL, false, 0, NULL, 0, false, 1, 0);
27 }
28 
30  CStringFeatures<char>* dot_features, CTokenizer* tzer, int32_t bits, float64_t* lab)
32 {
34  new CStreamingFileFromStringFeatures<char>(dot_features, lab);
35  bool is_labelled = (lab != NULL);
36  int32_t size=1024;
37 
38  init(file, is_labelled, size, tzer, bits, true, 1, 0);
39 
40  parser.set_free_vectors_on_destruct(false);
41  seekable= true;
42 }
/*
 * Initialises the object's state from the given arguments.
 *
 * file        - streaming file to read from; may be NULL, in which case no
 *               parser is initialised and working_file is set to NULL
 * is_labelled - stored in has_labels and forwarded to parser.init()
 * size        - forwarded to parser.init()
 * tzer        - tokenizer; may be NULL, in which case no converter is created
 * bits        - stored in num_bits and forwarded to the converter
 * normalize, n_grams, skips - forwarded unchanged to CHashedDocConverter
 *
 * NOTE(review): this text is a numbered listing extract. Listing lines 50, 65
 * and 72 are absent from it, so statements may be missing at the marked
 * positions below - confirm against the original source file.
 */
43 void CStreamingHashedDocDotFeatures::init(CStreamingFile* file, bool is_labelled,
44  int32_t size, CTokenizer* tzer, int32_t bits, bool normalize, int32_t n_grams, int32_t skips)
45 {
46  num_bits = bits;
47  tokenizer = tzer;
 // A converter is only built when a tokenizer was supplied.
48  if (tokenizer)
49  {
 // NOTE(review): listing line 50 is missing from this extract - TODO confirm.
51  converter = new CHashedDocConverter(tzer, bits, normalize, n_grams, skips);
52  }
53  else
54  converter=NULL;
55 
 // Register the members with the SG_ADD parameter framework.
56  SG_ADD(&num_bits, "num_bits", "Number of bits for hash", MS_NOT_AVAILABLE);
57  SG_ADD((CSGObject** ) &tokenizer, "tokenizer", "The tokenizer used on the documents",
 // Listing line 58 was dropped by the extraction, leaving the call above
 // unterminated; restored to match the sibling SG_ADD calls.
58  MS_NOT_AVAILABLE);
59  SG_ADD((CSGObject** ) &converter, "converter", "Converter", MS_NOT_AVAILABLE);
60 
61  has_labels = is_labelled;
 // The parser is only initialised when a file was supplied.
62  if (file)
63  {
64  working_file = file;
 // NOTE(review): listing line 65 is missing from this extract - TODO confirm.
66  parser.init(file, is_labelled, size);
67  seekable = false;
68  }
69  else
70  working_file = NULL;
71 
 // NOTE(review): listing line 72 is missing from this extract - TODO confirm.
73  parser.set_free_vector_after_release(false);
74 }
75 
77 {
78  if (parser.is_running())
79  parser.end_parser();
83 }
84 
86 {
87  ASSERT(df)
88  ASSERT(df->get_name() == get_name())
89 
91  float32_t result = current_vector.sparse_dot(cdf->current_vector);
92  return result;
93 }
94 
96 {
97  ASSERT(vec2_len == CMath::pow(2, num_bits))
98 
99  float32_t result = 0;
100  for (index_t i=0; i<current_vector.num_feat_entries; i++)
101  {
102  result += vec2[current_vector.features[i].feat_index] *
104  }
105  return result;
106 }
107 
109  int32_t vec2_len, bool abs_val)
110 {
111  float32_t value = abs_val ? CMath::abs(alpha) : alpha;
112 
113  for (index_t i=0; i<current_vector.num_feat_entries; i++)
115 }
116 
118 {
119  return CMath::pow(2, num_bits);
120 }
121 
123 {
124  return "StreamingHashedDocDotFeatures";
125 }
126 
128 {
129  return new CStreamingHashedDocDotFeatures(*this);
130 }
131 
133 {
134  return F_UINT;
135 }
136 
138 {
139  return C_STREAMING_SPARSE;
140 }
141 
143 {
144  if (!parser.is_running())
145  parser.start_parser();
146 }
147 
149 {
150  parser.end_parser();
151 }
152 
154 {
155  SGVector<char> tmp;
156  if (parser.get_next_example(tmp.vector,
157  tmp.vlen, current_label))
158  {
159  ASSERT(tmp.vector)
160  ASSERT(tmp.vlen > 0)
162  return true;
163  }
164  return false;
165 }
166 
168 {
169  parser.finalize_example();
170 }
171 
173 {
174  return (int32_t) CMath::pow(2, num_bits);
175 }
176 
178 {
179  return current_label;
180 }
181 
183 {
184  return 1;
185 }
186 
188 {
189  parser.set_read_vector(&CStreamingFile::get_string);
190 }
191 
193 {
194  parser.set_read_vector_and_label(&CStreamingFile::get_string_and_label);
195 }
196 
198 {
199  return current_vector;
200 }
201 
203 {
204  converter->set_normalization(normalize);
205 }
206 
208 {
210 }
virtual const char * get_name() const =0
This class implements streaming features for a document collection. Like in the standard Bag-of-Words...
virtual void get_string(bool *&vector, int32_t &len)
T sparse_dot(const SGSparseVector< T > &v)
int32_t index_t
Definition: common.h:62
bool has_labels
Whether examples are labelled or not.
#define SG_REF(x)
Definition: SGObject.h:54
CStreamingFile * working_file
The StreamingFile object to read from.
EFeatureClass
shogun feature class
Definition: FeatureTypes.h:38
A Streaming File access class.
Definition: StreamingFile.h:34
virtual void get_string_and_label(bool *&vector, int32_t &len, float64_t &label)
virtual void add_to_dense_vec(float32_t alpha, float32_t *vec2, int32_t vec2_len, bool abs_val=false)
index_t vlen
Definition: SGVector.h:492
This class can be used to convert a document collection contained in a CStringFeatures object w...
#define ASSERT(x)
Definition: SGIO.h:201
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:115
virtual float32_t dense_dot(const float32_t *vec2, int32_t vec2_len)
The class CTokenizer acts as a base class in order to implement tokenizers. Sub-classes must implemen...
Definition: Tokenizer.h:29
double float64_t
Definition: common.h:50
Class CStreamingFileFromStringFeatures is derived from CStreamingFile and provides an input source fo...
Streaming features that support dot products among other operations.
SGSparseVectorEntry< T > * features
float float32_t
Definition: common.h:49
EFeatureType
shogun feature type
Definition: FeatureTypes.h:19
#define SG_UNREF(x)
Definition: SGObject.h:55
All classes and functions are contained in the shogun namespace.
Definition: class_list.h:18
bool seekable
Whether the stream is seekable.
The class Features is the base class of all feature objects.
Definition: Features.h:68
void set_k_skip_n_grams(int32_t k, int32_t n)
#define SG_ADD(...)
Definition: SGObject.h:84
virtual CFeatures * apply(CFeatures *features)
void set_normalization(bool normalize)
static int32_t pow(bool x, int32_t n)
Definition: Math.h:535
static T abs(T a)
Definition: Math.h:179
virtual float32_t dot(CStreamingDotFeatures *df)

SHOGUN Machine Learning Toolbox - Documentation