SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StreamingSparseFeatures.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2011 Shashwat Lal Das
8  * Modifications (W) 2013 Thoralf Klein
9  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
10  */
11 
14 
15 namespace shogun
16 {
17 
// Body that delegates all member setup to the no-argument init().
// NOTE(review): listing lines 19 and 21 (the declarator and one body line) are
// absent from this extract; presumably this is the default constructor - confirm
// against the class header.
18 template <class T>
20 {
22  init();
23 }
24 
// Body that forwards file, is_labelled and size unchanged to
// init(file, is_labelled, size).
// NOTE(review): the first declarator line (which declares `file`) and listing
// lines 29/31 are absent from this extract; presumably this is the constructor
// taking a streaming file - confirm against the class header.
25 template <class T>
27  bool is_labelled,
28  int32_t size)
30 {
32  init(file, is_labelled, size);
33 }
34 
// Stops the parser, but only if it reports that it is still running.
// NOTE(review): the declarator line is absent from this extract; presumably the
// destructor - confirm.
35 template <class T>
37 {
38  if (parser.is_running())
39  parser.end_parser();
40 }
41 
// Returns current_sgvector.get_feature(index) for the current example.
// The index is range-checked with ASSERT against [0, current_num_features).
// NOTE(review): the declarator line is absent from this extract.
42 template <class T>
44 {
45  ASSERT(index>=0 && index<current_num_features)
46  return current_sgvector.get_feature(index);
47 }
48 
// NOTE(review): both the declarator (listing line 50) and the only body line
// (listing line 52) are absent from this extract, so the body appears empty
// here; consult the real file before relying on this definition.
49 template <class T>
51 {
53 }
54 
// Overwrites current_num_features with num and returns the value it had before.
// ASSERT(n<=num) rejects any attempt to shrink the feature count.
// NOTE(review): the declarator line is absent from this extract.
55 template <class T>
57 {
58  int32_t n=current_num_features;
59  ASSERT(n<=num)
60  current_num_features=num;
61  return n;
62 }
63 
// Computes alpha * SGSparseVector<T>::sparse_dot(a, b) for two raw entry arrays
// (avec with alen entries, bvec with blen entries).
// Returns 0 when either array pointer is null.
// NOTE(review): the declarator line is absent from this extract.
64 template <class T>
66 {
67  T result=0;
68 
69  // result stays zero when either input vector is missing
70  if (avec && bvec)
71  {
// Wrap the raw arrays; the trailing `false` is the same flag used in
// get_next_example(), where it is described as disabling ref counting.
72  SGSparseVector<T> asv(avec, alen, false);
73  SGSparseVector<T> bsv(bvec, blen, false);
74 
75  result=alpha*SGSparseVector<T>::sparse_dot(asv, bsv);
76  }
77 
78  return result;
79 }
80 
81 template <class T>
82 T CStreamingSparseFeatures<T>::dense_dot(T alpha, T* vec, int32_t dim, T b)
83 {
84  ASSERT(vec)
85  ASSERT(dim>=current_num_features)
86 
87  return current_sgvector.dense_dot(alpha, vec, dim, b);
88 }
89 
// Dot product of the current sparse example with the dense vector vec2,
// accumulated and returned as float64_t.
// Entries whose feat_index is not below vec2_len are skipped, so vec2 may be
// shorter than the example's dimensionality. Returns 0 when no entry array is
// currently set. vec2 itself must be non-null (ASSERT).
// NOTE(review): the declarator line is absent from this extract.
90 template <class T>
92 {
93  ASSERT(vec2)
94 
95  int32_t current_length = current_sgvector.num_feat_entries;
96  SGSparseVectorEntry<T>* current_vector = current_sgvector.features;
97 
98  float64_t result=0;
99  if (current_vector)
100  {
101  for (int32_t i=0; i<current_length; i++) {
102  if (current_vector[i].feat_index < vec2_len) {
103  result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
104  }
105  }
106  }
107 
108  return result;
109 }
110 
// Dot product of the current sparse example with the dense vector vec2,
// accumulated and returned as float32_t (single-precision counterpart of the
// float64_t variant defined just before it).
// Entries whose feat_index is not below vec2_len are skipped. Returns 0 when no
// entry array is currently set. vec2 itself must be non-null (ASSERT).
// NOTE(review): the declarator line is absent from this extract.
111 template <class T>
113 {
114  ASSERT(vec2)
115 
116  int32_t current_length = current_sgvector.num_feat_entries;
117  SGSparseVectorEntry<T>* current_vector = current_sgvector.features;
118 
119  float32_t result=0;
120  if (current_vector)
121  {
122  for (int32_t i=0; i<current_length; i++) {
123  if (current_vector[i].feat_index < vec2_len) {
124  result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
125  }
126  }
127  }
128 
129  return result;
130 }
131 
132 template <class T>
133 void CStreamingSparseFeatures<T>::add_to_dense_vec(float64_t alpha, float64_t* vec2, int32_t vec2_len, bool abs_val)
134 {
135  ASSERT(vec2)
136  if (vec2_len < current_num_features)
137  {
138  SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
139  vec2_len, current_num_features);
140  }
141 
142  SGSparseVectorEntry<T>* sv=current_sgvector.features;
143  int32_t num_feat=current_sgvector.num_feat_entries;
144 
145  if (sv)
146  {
147  if (abs_val)
148  {
149  for (int32_t i=0; i<num_feat; i++)
150  vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
151  }
152  else
153  {
154  for (int32_t i=0; i<num_feat; i++)
155  vec2[sv[i].feat_index]+= alpha*sv[i].entry;
156  }
157  }
158 }
159 
160 template <class T>
161 void CStreamingSparseFeatures<T>::add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val)
162 {
163  ASSERT(vec2)
164  if (vec2_len < current_num_features)
165  {
166  SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
167  vec2_len, current_num_features);
168  }
169 
170  SGSparseVectorEntry<T>* sv=current_sgvector.features;
171  int32_t num_feat=current_sgvector.num_feat_entries;
172 
173  if (sv)
174  {
175  if (abs_val)
176  {
177  for (int32_t i=0; i<num_feat; i++)
178  vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
179  }
180  else
181  {
182  for (int32_t i=0; i<num_feat; i++)
183  vec2[sv[i].feat_index]+= alpha*sv[i].entry;
184  }
185  }
186 }
187 
// Returns the number of stored entries (num_feat_entries) of the current
// sparse example.
// NOTE(review): the declarator line is absent from this extract.
188 template <class T>
190 {
191  return current_sgvector.num_feat_entries;
192 }
193 
// Returns the sum of squared entry values of the current sparse example.
// The sum is accumulated in a float32_t. A current entry array is required
// (ASSERT on the features pointer).
// NOTE(review): the declarator line is absent from this extract.
194 template <class T>
196 {
197  int32_t current_length = current_sgvector.num_feat_entries;
198  SGSparseVectorEntry<T>* current_vector = current_sgvector.features;
199 
200  ASSERT(current_vector)
201 
202  float32_t sq=0;
203 
204  for (int32_t i=0; i<current_length; i++)
205  sq += current_vector[i].entry * current_vector[i].entry;
206 
207  return sq;
208 }
209 
// Sorts the entries of the current sparse example via
// get_vector().sort_features(true) and then asserts that the entry array
// pointer is the same as before the call.
// NOTE(review): the declarator line is absent from this extract.
210 template <class T>
212 {
213  SGSparseVectorEntry<T>* old_ptr = current_sgvector.features;
214 
215  // the argument is true: intended to disallow reallocation
216  // and guarantee stable get_vector().features pointer
// NOTE(review): the earlier wording of this comment said "setting false",
// which contradicts the call below; confirm the flag's meaning in SGSparseVector.
217  get_vector().sort_features(true);
218 
219  ASSERT(old_ptr == current_sgvector.features);
220 }
221 
// Returns a newly allocated CStreamingSparseFeatures<T> copy-constructed from
// *this.
// NOTE(review): the declarator line is absent from this extract.
222 template <class T>
224 {
225  return new CStreamingSparseFeatures<T>(*this);
226 }
227 
// Returns 1 when the current sparse example has a non-null entry array,
// otherwise 0.
// NOTE(review): the declarator line is absent from this extract.
228 template <class T>
230 {
231  if (current_sgvector.features)
232  return 1;
233  return 0;
234 }
235 
// Registers CStreamingFile::get_sparse_vector as the parser's read-vector
// function.
// NOTE(review): listing lines 235-236 (template header and declarator) are
// absent from this extract.
237 {
238  parser.set_read_vector(&CStreamingFile::get_sparse_vector);
239 }
240 
// Calls parser.set_read_vector_and_label.
// NOTE(review): listing lines 240-241 (template header and declarator) and 244
// (the call's argument list) are absent from this extract, so the statement
// below is truncated; consult the real file for the function being registered.
242 {
243  parser.set_read_vector_and_label
245 }
246 
// GET_FEATURE_TYPE(f_type, sg_type) expands to the explicit specialization of
// CStreamingSparseFeatures<sg_type>::get_feature_type() const, which returns
// f_type. The macro is undefined again once all specializations are emitted.
247 #define GET_FEATURE_TYPE(f_type, sg_type) \
248 template<> EFeatureType CStreamingSparseFeatures<sg_type>::get_feature_type() const \
249 { \
250  return f_type; \
251 }
252 
// One specialization per element type. Both uint8_t and int8_t report F_BYTE.
// NOTE(review): listing lines 253-254 and 263-265 are absent from this extract,
// so further specializations may exist in the real file.
255 GET_FEATURE_TYPE(F_BYTE, uint8_t)
256 GET_FEATURE_TYPE(F_BYTE, int8_t)
257 GET_FEATURE_TYPE(F_SHORT, int16_t)
258 GET_FEATURE_TYPE(F_WORD, uint16_t)
259 GET_FEATURE_TYPE(F_INT, int32_t)
260 GET_FEATURE_TYPE(F_UINT, uint32_t)
261 GET_FEATURE_TYPE(F_LONG, int64_t)
262 GET_FEATURE_TYPE(F_ULONG, uint64_t)
266 #undef GET_FEATURE_TYPE
267 
268 
269 template <class T>
270 void CStreamingSparseFeatures<T>::init()
271 {
272  working_file=NULL;
273  current_vec_index=0;
274  current_num_features=-1;
275 
276  set_generic<T>();
277 }
278 
279 template <class T>
280 void CStreamingSparseFeatures<T>::init(CStreamingFile* file,
281  bool is_labelled,
282  int32_t size)
283 {
284  init();
285  has_labels = is_labelled;
286  working_file = file;
287  SG_REF(working_file);
288  parser.init(file, is_labelled, size);
289  parser.set_free_vector_after_release(false);
290 }
291 
// Starts the parser unless it already reports that it is running, so repeated
// calls do not start it twice.
// NOTE(review): the declarator line is absent from this extract.
292 template <class T>
294 {
295  if (!parser.is_running())
296  parser.start_parser();
297 }
298 
// Ends the parser unconditionally (no is_running() check here).
// NOTE(review): the declarator line is absent from this extract.
299 template <class T>
301 {
302  parser.end_parser();
303 }
304 
// Fetches the next example from the parser and makes it the current one.
// On success it wraps the parser-provided entry array in current_sgvector,
// raises current_num_features to at least the new vector's
// get_num_dimensions(), increments current_vec_index and returns true.
// Returns false - before touching current_sgvector, current_num_features or
// current_vec_index - when parser.get_next_example() reports failure.
// NOTE(review): the declarator line is absent from this extract.
305 template <class T>
307 {
308  int32_t current_length = 0;
309  SGSparseVectorEntry<T>* current_vector = NULL;
310 
311  bool ret_value;
312  ret_value = (bool) parser.get_next_example(current_vector,
313  current_length,
314  current_label);
315 
316  if (!ret_value)
317  return false;
318 
319  // ref_count disabled, because parser still owns the memory
320  current_sgvector = SGSparseVector<T>(current_vector, current_length, false);
321 
322  // Update number of features based on highest index
// The max() keeps the count from ever decreasing across examples.
323  int32_t current_dimension = get_vector().get_num_dimensions();
324  current_num_features = CMath::max(current_num_features, current_dimension);
325 
326  current_vec_index++;
327  return true;
328 }
329 
// Returns current_sgvector, the sparse vector of the current example.
// NOTE(review): the declarator line is absent from this extract, so whether the
// result is returned by value or by reference cannot be told from here.
330 template <class T>
332 {
333  return current_sgvector;
334 }
335 
// Returns current_label. Requires has_labels to be set (ASSERT).
// NOTE(review): the declarator line is absent from this extract.
336 template <class T>
338 {
339  ASSERT(has_labels)
340 
341  return current_label;
342 }
343 
// Tells the parser that the current example is finished with, by calling
// parser.finalize_example().
// NOTE(review): the declarator line is absent from this extract.
344 template <class T>
346 {
347  parser.finalize_example();
348 }
349 
// Returns current_num_features.
// NOTE(review): the declarator line is absent from this extract; a later
// definition in this file returns the same member.
350 template <class T>
352 {
353  return current_num_features;
354 }
355 
// Always returns -1 as shown here.
// NOTE(review): the declarator (listing line 357) and one body line (listing
// line 359) are absent from this extract; the missing statement may change what
// callers observe - consult the real file.
356 template <class T>
358 {
360  return -1;
361 }
362 
// Returns current_num_features.
// NOTE(review): the declarator line is absent from this extract.
363 template <class T>
365 {
366  return current_num_features;
367 }
368 
// Returns the number of stored entries (num_feat_entries) of the current
// sparse example.
// NOTE(review): the declarator line is absent from this extract.
369 template <class T>
371 {
372  return current_sgvector.num_feat_entries;
373 }
374 
// Returns the constant C_STREAMING_SPARSE.
// NOTE(review): the declarator line is absent from this extract.
375 template <class T>
377 {
378  return C_STREAMING_SPARSE;
379 }
380 
// Explicit instantiations of CStreamingSparseFeatures for the element types
// listed below.
// NOTE(review): listing lines 386, 388 and 390-393 are absent from this
// extract, so further instantiations may exist in the real file.
381 template class CStreamingSparseFeatures<bool>;
382 template class CStreamingSparseFeatures<char>;
383 template class CStreamingSparseFeatures<int8_t>;
384 template class CStreamingSparseFeatures<uint8_t>;
385 template class CStreamingSparseFeatures<int16_t>;
387 template class CStreamingSparseFeatures<int32_t>;
389 template class CStreamingSparseFeatures<int64_t>;
394 }

SHOGUN Machine Learning Toolbox - Documentation