SHOGUN  6.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules
StreamingSparseFeatures.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2011 Shashwat Lal Das
8  * Modifications (W) 2013 Thoralf Klein
9  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
10  */
11 
14 
15 namespace shogun
16 {
17 
18 template <class T>
20 {
22  init();
23 }
24 
25 template <class T>
27  bool is_labelled,
28  int32_t size)
30 {
32  init(file, is_labelled, size);
33 }
34 
35 template <class T>
37 {
38  if (parser.is_running())
39  parser.end_parser();
40 }
41 
42 template <class T>
44 {
45  ASSERT(index>=0 && index<current_num_features)
46  return current_sgvector.get_feature(index);
47 }
48 
49 template <class T>
51 {
53 }
54 
55 template <class T>
57 {
58  int32_t n=current_num_features;
59  ASSERT(n<=num)
60  current_num_features=num;
61  return n;
62 }
63 
64 template <class T>
66 {
67  T result=0;
68 
69  //result remains zero when one of the vectors is non existent
70  if (avec && bvec)
71  {
72  SGSparseVector<T> asv(avec, alen, false);
73  SGSparseVector<T> bsv(bvec, blen, false);
74 
75  result=alpha*SGSparseVector<T>::sparse_dot(asv, bsv);
76  }
77 
78  return result;
79 }
80 
81 template <class T>
82 T CStreamingSparseFeatures<T>::dense_dot(T alpha, T* vec, int32_t dim, T b)
83 {
84  ASSERT(vec)
85  ASSERT(dim>=current_num_features)
86 
87  return current_sgvector.dense_dot(alpha, vec, dim, b);
88 }
89 
90 template <class T>
92 {
93  ASSERT(vec2)
94 
95  int32_t current_length = current_sgvector.num_feat_entries;
96  SGSparseVectorEntry<T>* current_vector = current_sgvector.features;
97 
98  float64_t result=0;
99  if (current_vector)
100  {
101  for (int32_t i=0; i<current_length; i++) {
102  if (current_vector[i].feat_index < vec2_len) {
103  result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
104  }
105  }
106  }
107 
108  return result;
109 }
110 
111 template <class T>
113 {
114  ASSERT(vec2)
115 
116  int32_t current_length = current_sgvector.num_feat_entries;
117  SGSparseVectorEntry<T>* current_vector = current_sgvector.features;
118 
119  float32_t result=0;
120  if (current_vector)
121  {
122  for (int32_t i=0; i<current_length; i++) {
123  if (current_vector[i].feat_index < vec2_len) {
124  result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
125  }
126  }
127  }
128 
129  return result;
130 }
131 
132 template <class T>
133 void CStreamingSparseFeatures<T>::add_to_dense_vec(float64_t alpha, float64_t* vec2, int32_t vec2_len, bool abs_val)
134 {
135  ASSERT(vec2)
136  if (vec2_len < current_num_features)
137  {
138  SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
139  vec2_len, current_num_features);
140  }
141 
142  SGSparseVectorEntry<T>* sv=current_sgvector.features;
143  int32_t num_feat=current_sgvector.num_feat_entries;
144 
145  if (sv)
146  {
147  if (abs_val)
148  {
149  for (int32_t i=0; i<num_feat; i++)
150  vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
151  }
152  else
153  {
154  for (int32_t i=0; i<num_feat; i++)
155  vec2[sv[i].feat_index]+= alpha*sv[i].entry;
156  }
157  }
158 }
159 
160 template <class T>
161 void CStreamingSparseFeatures<T>::add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val)
162 {
163  ASSERT(vec2)
164  if (vec2_len < current_num_features)
165  {
166  SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
167  vec2_len, current_num_features);
168  }
169 
170  SGSparseVectorEntry<T>* sv=current_sgvector.features;
171  int32_t num_feat=current_sgvector.num_feat_entries;
172 
173  if (sv)
174  {
175  if (abs_val)
176  {
177  for (int32_t i=0; i<num_feat; i++)
178  vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
179  }
180  else
181  {
182  for (int32_t i=0; i<num_feat; i++)
183  vec2[sv[i].feat_index]+= alpha*sv[i].entry;
184  }
185  }
186 }
187 
188 template <class T>
190 {
191  return current_sgvector.num_feat_entries;
192 }
193 
194 template <class T>
196 {
197  int32_t current_length = current_sgvector.num_feat_entries;
198  SGSparseVectorEntry<T>* current_vector = current_sgvector.features;
199 
200  ASSERT(current_vector)
201 
202  float32_t sq=0;
203 
204  for (int32_t i=0; i<current_length; i++)
205  sq += current_vector[i].entry * current_vector[i].entry;
206 
207  return sq;
208 }
209 
210 template <class T>
212 {
213  SGSparseVectorEntry<T>* old_ptr = current_sgvector.features;
214 
215  // passing true (stable pointer) to disallow reallocation
216  // and guarantee stable get_vector().features pointer
217  get_vector().sort_features(true);
218 
219  ASSERT(old_ptr == current_sgvector.features);
220 }
221 
222 template <class T>
224 {
225  if (current_sgvector.features)
226  return 1;
227  return 0;
228 }
229 
231 {
232  parser.set_read_vector(&CStreamingFile::get_sparse_vector);
233 }
234 
236 {
237  parser.set_read_vector_and_label
239 }
240 
// Expands to the explicit specialization of get_feature_type() for the
// element type `element_type`; the specialization returns `feature_type_id`.
#define GET_FEATURE_TYPE(feature_type_id, element_type) \
template<> EFeatureType CStreamingSparseFeatures<element_type>::get_feature_type() const \
{ \
	return feature_type_id; \
}
246 
249 GET_FEATURE_TYPE(F_BYTE, uint8_t)
250 GET_FEATURE_TYPE(F_BYTE, int8_t)
251 GET_FEATURE_TYPE(F_SHORT, int16_t)
252 GET_FEATURE_TYPE(F_WORD, uint16_t)
253 GET_FEATURE_TYPE(F_INT, int32_t)
254 GET_FEATURE_TYPE(F_UINT, uint32_t)
255 GET_FEATURE_TYPE(F_LONG, int64_t)
256 GET_FEATURE_TYPE(F_ULONG, uint64_t)
260 #undef GET_FEATURE_TYPE
261 
262 
263 template <class T>
264 void CStreamingSparseFeatures<T>::init()
265 {
266  working_file=NULL;
267  current_vec_index=0;
268  current_num_features=-1;
269 
270  set_generic<T>();
271 }
272 
273 template <class T>
274 void CStreamingSparseFeatures<T>::init(CStreamingFile* file,
275  bool is_labelled,
276  int32_t size)
277 {
278  init();
279  has_labels = is_labelled;
280  working_file = file;
281  SG_REF(working_file);
282  parser.init(file, is_labelled, size);
283  parser.set_free_vector_after_release(false);
284 }
285 
286 template <class T>
288 {
289  if (!parser.is_running())
290  parser.start_parser();
291 }
292 
293 template <class T>
295 {
296  parser.end_parser();
297 }
298 
299 template <class T>
301 {
302  int32_t current_length = 0;
303  SGSparseVectorEntry<T>* current_vector = NULL;
304 
305  bool ret_value;
306  ret_value = (bool) parser.get_next_example(current_vector,
307  current_length,
308  current_label);
309 
310  if (!ret_value)
311  return false;
312 
313  // ref_count disabled, because parser still owns the memory
314  current_sgvector = SGSparseVector<T>(current_vector, current_length, false);
315 
316  // Update number of features based on highest index
317  int32_t current_dimension = get_vector().get_num_dimensions();
318  current_num_features = CMath::max(current_num_features, current_dimension);
319 
320  current_vec_index++;
321  return true;
322 }
323 
324 template <class T>
326 {
327  return current_sgvector;
328 }
329 
330 template <class T>
332 {
333  ASSERT(has_labels)
334 
335  return current_label;
336 }
337 
338 template <class T>
340 {
341  parser.finalize_example();
342 }
343 
344 template <class T>
346 {
347  return current_num_features;
348 }
349 
350 template <class T>
352 {
354  return -1;
355 }
356 
357 template <class T>
359 {
360  return current_num_features;
361 }
362 
363 template <class T>
365 {
366  return current_sgvector.num_feat_entries;
367 }
368 
369 template <class T>
371 {
372  return C_STREAMING_SPARSE;
373 }
374 
375 template class CStreamingSparseFeatures<bool>;
376 template class CStreamingSparseFeatures<char>;
377 template class CStreamingSparseFeatures<int8_t>;
378 template class CStreamingSparseFeatures<uint8_t>;
379 template class CStreamingSparseFeatures<int16_t>;
381 template class CStreamingSparseFeatures<int32_t>;
383 template class CStreamingSparseFeatures<int64_t>;
388 }
T sparse_dot(const SGSparseVector< T > &v)
#define SG_ERROR(...)
Definition: SGIO.h:128
#define SG_NOTIMPLEMENTED
Definition: SGIO.h:138
virtual void add_to_dense_vec(float64_t alpha, float64_t *vec2, int32_t vec2_len, bool abs_val=false)
#define SG_REF(x)
Definition: SGObject.h:52
static T sparse_dot(T alpha, SGSparseVectorEntry< T > *avec, int32_t alen, SGSparseVectorEntry< T > *bvec, int32_t blen)
EFeatureClass
shogun feature class
Definition: FeatureTypes.h:38
A Streaming File access class.
Definition: StreamingFile.h:34
virtual float32_t dot(CStreamingDotFeatures *df)
virtual int32_t get_dim_feature_space() const
#define ASSERT(x)
Definition: SGIO.h:200
double float64_t
Definition: common.h:60
long double floatmax_t
Definition: common.h:61
virtual void get_sparse_vector_and_label(SGSparseVectorEntry< bool > *&vector, int32_t &len, float64_t &label)
static T max(T a, T b)
Definition: Math.h:164
Streaming features that support dot products among other operations.
float float32_t
Definition: common.h:59
virtual EFeatureClass get_feature_class() const
All classes and functions are contained in the shogun namespace
Definition: class_list.h:18
virtual void get_sparse_vector(SGSparseVectorEntry< bool > *&vector, int32_t &len)
template class SGSparseVectorEntry
Definition: File.h:23
template class SGSparseVector The assumption is that the stored SGSparseVectorEntry* vector is orde...
#define GET_FEATURE_TYPE(f_type, sg_type)
T dense_dot(T alpha, T *vec, int32_t dim, T b)
This class implements streaming features with sparse feature vectors. The vector is represented as an...
static T abs(T a)
Definition: Math.h:175

SHOGUN Machine Learning Toolbox - Documentation