SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
SparseFeatures.cpp
Go to the documentation of this file.
1 #include <shogun/lib/common.h>
2 #include <shogun/lib/memory.h>
6 #include <shogun/io/SGIO.h>
7 
8 #include <string.h>
9 #include <stdlib.h>
10 
11 namespace shogun
12 {
13 
14 template <class ST> class CSparsePreprocessor;
15 
/** Construct empty sparse features with a feature cache of the given size.
 * @param size cache size in bytes, forwarded to CDotFeatures */
template<class ST> CSparseFeatures<ST>::CSparseFeatures(int32_t size)
: CDotFeatures(size), feature_cache(NULL)
{
	init(); // register parameters for serialization
}
21 
23 : CDotFeatures(0), feature_cache(NULL)
24 {
25  init();
26 
28 }
29 
31 : CDotFeatures(0), feature_cache(NULL)
32 {
33  init();
34 
36 }
37 
/** Copy constructor: shares the sparse matrix and the feature cache pointer
 * of the original. */
template<class ST> CSparseFeatures<ST>::CSparseFeatures(const CSparseFeatures & orig)
: CDotFeatures(orig), sparse_feature_matrix(orig.sparse_feature_matrix),
	feature_cache(orig.feature_cache)
{
	init();
	// NOTE(review): upstream lines 44-45 are missing from this extract
	// (possibly an SG_REF on the shared cache) — confirm against the repository.
}
/** Construct by loading sparse features from a file.
 * @param loader file to read the sparse matrix from */
template<class ST> CSparseFeatures<ST>::CSparseFeatures(CFile* loader)
: CDotFeatures(), feature_cache(NULL)
{
	init();

	load(loader); // asserts loader != NULL internally
}
54 
// NOTE(review): destructor signature (upstream line 55,
// CSparseFeatures<ST>::~CSparseFeatures()) missing from this extract.
{
	SG_UNREF(feature_cache); // release the (possibly shared) cache
}
59 
/** @return a deep copy of this object via the copy constructor */
template<class ST> CFeatures* CSparseFeatures<ST>::duplicate() const
{
	return new CSparseFeatures<ST>(*this);
}
64 
65 template<class ST> ST CSparseFeatures<ST>::get_feature(int32_t num, int32_t index)
66 {
67  REQUIRE(index>=0 && index<get_num_features(),
68  "get_feature(num=%d,index=%d): index exceeds [0;%d]\n",
69  num, index, get_num_features()-1);
70 
71  SGSparseVector<ST> sv=get_sparse_feature_vector(num);
72  ST ret = sv.get_feature(index);
73 
74  free_sparse_feature_vector(num);
75  return ret;
76 }
77 
// NOTE(review): signature line (upstream line 78, presumably
// SGVector<ST> CSparseFeatures<ST>::get_full_feature_vector(int32_t num))
// missing from this extract — confirm against the repository.
{
	SGSparseVector<ST> sv=get_sparse_feature_vector(num);
	// densify to full length; unstored entries become 0
	SGVector<ST> dense = sv.get_dense(get_num_features());
	free_sparse_feature_vector(num);
	return dense;
}
85 
86 template<class ST> int32_t CSparseFeatures<ST>::get_nnz_features_for_vector(int32_t num)
87 {
88  SGSparseVector<ST> sv = get_sparse_feature_vector(num);
89  int32_t len=sv.num_feat_entries;
90  free_sparse_feature_vector(num);
91  return len;
92 }
93 
// NOTE(review): signature line (upstream line 94, presumably
// SGSparseVector<ST> CSparseFeatures<ST>::get_sparse_feature_vector(int32_t num))
// missing from this extract — confirm against the repository.
{
	REQUIRE(num>=0 && num<get_num_vectors(),
		"get_sparse_feature_vector(num=%d): num exceeds [0;%d]\n",
		num, get_num_vectors()-1);
	// map the (possibly subsetted) index to the underlying storage index
	index_t real_num=m_subset_stack->subset_idx_conversion(num);

	if (sparse_feature_matrix.sparse_matrix)
	{
		// in-memory matrix available: return the stored vector directly
		return sparse_feature_matrix[real_num];
	}
	else
	{
		// on-the-fly computation, optionally backed by the feature cache
		SGSparseVector<ST> result;
		if (feature_cache)
		{
			// NOTE(review): the cache is addressed with 'num' rather than
			// 'real_num' — verify this is intended when a subset is active.
			result.features=feature_cache->lock_entry(num);

			if (result.features)
				return result; // cache hit
			else
			{
				// cache miss: reserve a slot to fill below
				result.features=feature_cache->set_entry(num);
			}
		}

		//if (!result.features)
		// result.do_free=true;

		result.features=compute_sparse_feature_vector(num,
			result.num_feat_entries, result.features);


		if (get_num_preprocessors())
		{
			int32_t tmp_len=result.num_feat_entries;
			SGSparseVectorEntry<ST>* tmp_feat_before=result.features;
			// NOTE(review): tmp_feat_after is never assigned because the
			// apply call below is commented out, so the copy-back branch
			// is currently dead code — confirm whether this is intentional.
			SGSparseVectorEntry<ST>* tmp_feat_after = NULL;

			for (int32_t i=0; i<get_num_preprocessors(); i++)
			{
				//tmp_feat_after=((CSparsePreprocessor<ST>*) get_preproc(i))->apply_to_feature_vector(tmp_feat_before, tmp_len);

				if (i!=0) // delete feature vector, except for the the first one, i.e., feat
					SG_FREE(tmp_feat_before);
				tmp_feat_before=tmp_feat_after;
			}

			if (tmp_feat_after)
			{
				memcpy(result.features, tmp_feat_after,
					sizeof(SGSparseVectorEntry<ST>)*tmp_len);

				SG_FREE(tmp_feat_after);
				result.num_feat_entries=tmp_len;
			}
			SG_DEBUG("len: %d len2: %d\n", result.num_feat_entries, get_num_features())
		}
		return result ;
	}
}
155 
156 template<class ST> ST CSparseFeatures<ST>::dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b)
157 {
158  SGSparseVector<ST> sv=get_sparse_feature_vector(num);
159  ST result = sv.dense_dot(alpha,vec,dim,b);
160  free_sparse_feature_vector(num);
161  return result;
162 }
163 
164 template<class ST> void CSparseFeatures<ST>::add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val)
165 {
166  REQUIRE(vec, "add_to_dense_vec(num=%d,dim=%d): vec must not be NULL\n",
167  num, dim);
168  REQUIRE(dim>=get_num_features(),
169  "add_to_dense_vec(num=%d,dim=%d): dim should contain number of features %d\n",
170  num, dim, get_num_features());
171 
172  SGSparseVector<ST> sv=get_sparse_feature_vector(num);
173 
174  if (sv.features)
175  {
176  if (abs_val)
177  {
178  for (int32_t i=0; i<sv.num_feat_entries; i++)
179  {
180  vec[sv.features[i].feat_index]+=alpha
181  *CMath::abs(sv.features[i].entry);
182  }
183  }
184  else
185  {
186  for (int32_t i=0; i<sv.num_feat_entries; i++)
187  {
188  vec[sv.features[i].feat_index]+=alpha
189  *sv.features[i].entry;
190  }
191  }
192  }
193 
194  free_sparse_feature_vector(num);
195 }
196 
// NOTE(review): upstream line 198 (the specialization's name/first parameter,
// presumably void CSparseFeatures<complex128_t>::add_to_dense_vec(float64_t alpha,)
// and line 201 (likely SG_NOTIMPLEMENTED) are missing from this extract.
template<>
	int32_t num, float64_t* vec, int32_t dim, bool abs_val)
{
}
203 
204 template<class ST> void CSparseFeatures<ST>::free_sparse_feature_vector(int32_t num)
205 {
206  if (feature_cache)
207  feature_cache->unlock_entry(m_subset_stack->subset_idx_conversion(num));
208 
209  //vec.free_vector();
210 }
211 
// NOTE(review): signature line (upstream line 212, presumably
// SGSparseMatrix<ST> CSparseFeatures<ST>::get_sparse_feature_matrix())
// missing from this extract.
{
	if (m_subset_stack->has_subsets())
		SG_ERROR("Not allowed with subset\n");

	return sparse_feature_matrix;
}
219 
// NOTE(review): signature line (upstream line 220, presumably
// CSparseFeatures<ST>* CSparseFeatures<ST>::get_transposed())
// missing from this extract.
{
	if (m_subset_stack->has_subsets())
		SG_ERROR("Not allowed with subset\n");

	// caller owns the newly allocated transposed feature object
	return new CSparseFeatures<ST>(sparse_feature_matrix.get_transposed());
}
227 
// NOTE(review): signature line (upstream line 228, presumably
// void CSparseFeatures<ST>::set_sparse_feature_matrix(SGSparseMatrix<ST> sm))
// missing from this extract.
{
	if (m_subset_stack->has_subsets())
		SG_ERROR("Not allowed with subset\n");

	sparse_feature_matrix=sm;

	// TODO: check should be implemented in sparse matrix class
	// sanity check: no vector may reference a dimension beyond num_features
	for (int32_t j=0; j<get_num_vectors(); j++) {
		SGSparseVector<ST> sv=get_sparse_feature_vector(j);
		REQUIRE(get_num_features() >= sv.get_num_dimensions(),
			"sparse_matrix[%d] check failed (matrix features %d >= vector dimension %d)\n",
			j, get_num_features(), sv.get_num_dimensions());
	}
}
243 
// NOTE(review): signature line (upstream line 244, presumably
// SGMatrix<ST> CSparseFeatures<ST>::get_full_feature_matrix())
// missing from this extract.
{
	// dense result: one column per (subsetted) vector, zero-filled
	SGMatrix<ST> full(get_num_features(), get_num_vectors());
	full.zero();

	SG_INFO("converting sparse features to full feature matrix of %d x %d"
		" entries\n", sparse_feature_matrix.num_vectors, get_num_features())

	for (int32_t v=0; v<full.num_cols; v++)
	{
		int32_t idx=m_subset_stack->subset_idx_conversion(v);
		SGSparseVector<ST> current=sparse_feature_matrix[idx];

		for (int32_t f=0; f<current.num_feat_entries; f++)
		{
			// column-major offset of (feat_index, v); int64_t to avoid overflow
			int64_t offs=(v*get_num_features())
				+current.features[f].feat_index;

			full.matrix[offs]=current.features[f].entry;
		}
	}

	return full;
}
268 
// NOTE(review): signature line (upstream line 269, presumably
// void CSparseFeatures<ST>::free_features()) missing from this extract.
{
	free_sparse_feature_matrix();
	SG_UNREF(feature_cache); // drop the cache as well
}

// NOTE(review): signature line (upstream line 275, presumably
// void CSparseFeatures<ST>::free_sparse_feature_matrix()) missing here.
{
	// replace by an empty matrix; SGSparseMatrix handles its own refcounting
	sparse_feature_matrix=SGSparseMatrix<ST>();
}
279 
// NOTE(review): signature line (upstream line 280, presumably
// void CSparseFeatures<ST>::set_full_feature_matrix(SGMatrix<ST> full))
// missing from this extract.
{
	remove_all_subsets();
	free_sparse_feature_matrix();
	sparse_feature_matrix.from_dense(full); // sparsify the dense input
}
286 
287 template<class ST> bool CSparseFeatures<ST>::apply_preprocessor(bool force_preprocessing)
288 {
289  SG_INFO("force: %d\n", force_preprocessing)
290 
291  if (sparse_feature_matrix.sparse_matrix && get_num_preprocessors())
292  {
293  for (int32_t i=0; i<get_num_preprocessors(); i++)
294  {
295  if (!is_preprocessed(i) || force_preprocessing)
296  {
297  set_preprocessed(i);
298  CSparsePreprocessor<ST>* p = (CSparsePreprocessor<ST>*) get_preprocessor(i);
299  SG_INFO("preprocessing using preproc %s\n", p->get_name())
300 
301  if (p->apply_to_sparse_feature_matrix(this) == NULL)
302  {
303  SG_UNREF(p);
304  return false;
305  }
306 
307  SG_UNREF(p);
308  }
309  }
310  return true;
311  }
312  else
313  {
314  SG_WARNING("no sparse feature matrix available or features already preprocessed - skipping.\n")
315  return false;
316  }
317 }
318 
// NOTE(review): upstream lines 319 (signature) and 321 (the declaration of
// 'fm', presumably SGMatrix<ST> fm=...get_feature_matrix()) are missing from
// this extract — confirm against the repository.
{
	ASSERT(fm.matrix && fm.num_cols>0 && fm.num_rows>0)
	set_full_feature_matrix(fm);
}

// NOTE(review): upstream lines 326 (signature) and 328 (body) are missing —
// this brace pair cannot be identified from the extract alone.
{
}
330 
331 template<class ST> int32_t CSparseFeatures<ST>::get_num_vectors() const
332 {
333  return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : sparse_feature_matrix.num_vectors;
334 }
335 
/** @return dimensionality of the feature space (subsets do not affect it) */
template<class ST> int32_t CSparseFeatures<ST>::get_num_features() const
{
	return sparse_feature_matrix.num_features;
}
340 
341 template<class ST> int32_t CSparseFeatures<ST>::set_num_features(int32_t num)
342 {
343  int32_t n=get_num_features();
344  ASSERT(n<=num)
345  sparse_feature_matrix.num_features=num;
346  return sparse_feature_matrix.num_features;
347 }
348 
// NOTE(review): signature line (upstream line 349, presumably
// EFeatureClass CSparseFeatures<ST>::get_feature_class() const) missing.
{
	return C_SPARSE;
}
353 
354 template<class ST> void CSparseFeatures<ST>::free_feature_vector(int32_t num)
355 {
356  if (feature_cache)
357  feature_cache->unlock_entry(m_subset_stack->subset_idx_conversion(num));
358 
359  //vec.free_vector();
360 }
361 
// NOTE(review): signature line (upstream line 362, presumably
// int64_t CSparseFeatures<ST>::get_num_nonzero_entries()) missing.
{
	// sum the entry counts over all (subsetted) vectors; int64_t since the
	// total can exceed int32_t range
	int64_t num=0;
	index_t num_vec=get_num_vectors();
	for (int32_t i=0; i<num_vec; i++)
		num+=sparse_feature_matrix[m_subset_stack->subset_idx_conversion(i)].num_feat_entries;

	return num;
}
371 
// NOTE(review): signature line (upstream line 372, presumably
// float64_t* CSparseFeatures<ST>::compute_squared(float64_t* sq)) missing.
{
	ASSERT(sq)

	// sq[i] = squared L2 norm of feature vector i
	index_t num_vec=get_num_vectors();
	for (int32_t i=0; i<num_vec; i++)
	{
		sq[i]=0;
		SGSparseVector<ST> vec=get_sparse_feature_vector(i);

		for (int32_t j=0; j<vec.num_feat_entries; j++)
			sq[i]+=vec.features[j].entry*vec.features[j].entry;

		free_feature_vector(i);
	}

	return sq;
}

// NOTE(review): upstream lines 391 (signature of a specialization) and 393
// (its body) are missing from this extract; only the return survives.
{
	return sq;
}
396 
// NOTE(review): upstream line 397 (the function name line, presumably
// template<class ST> float64_t CSparseFeatures<ST>::compute_squared_norm(,)
// and lines 405-406 (the fetch of avec/bvec via get_sparse_feature_vector)
// are missing from this extract — confirm against the repository.
	CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a,
	CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b)
{
	int32_t i,j;
	ASSERT(lhs)
	ASSERT(rhs)

	ASSERT(avec.features)
	ASSERT(bvec.features)

	// ||a-b||^2 = ||a||^2 + ||b||^2 - 2 a.b, using precomputed squared norms
	float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];

	// merge-style sparse dot product: iterate over the shorter vector and
	// advance a cursor through the (index-sorted) longer one
	if (avec.num_feat_entries<=bvec.num_feat_entries)
	{
		j=0;
		for (i=0; i<avec.num_feat_entries; i++)
		{
			int32_t a_feat_idx=avec.features[i].feat_index;

			while ((j<bvec.num_feat_entries)
				&&(bvec.features[j].feat_index<a_feat_idx))
				j++;

			if ((j<bvec.num_feat_entries)
				&&(bvec.features[j].feat_index==a_feat_idx))
			{
				result-=2*(avec.features[i].entry*bvec.features[j].entry);
				j++;
			}
		}
	}
	else
	{
		j=0;
		for (i=0; i<bvec.num_feat_entries; i++)
		{
			int32_t b_feat_idx=bvec.features[i].feat_index;

			while ((j<avec.num_feat_entries)
				&&(avec.features[j].feat_index<b_feat_idx))
				j++;

			if ((j<avec.num_feat_entries)
				&&(avec.features[j].feat_index==b_feat_idx))
			{
				result-=2*(bvec.features[i].entry*avec.features[j].entry);
				j++;
			}
		}
	}

	((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(idx_a);
	((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(idx_b);

	// guard against tiny negative values from floating-point cancellation
	return CMath::abs(result);
}
456 
/** @return dimensionality of the dot-product feature space */
template<class ST> int32_t CSparseFeatures<ST>::get_dim_feature_space() const
{
	return get_num_features();
}
461 
/** Sparse dot product between vector vec_idx1 of this object and vector
 * vec_idx2 of another CSparseFeatures object of matching type/class. */
template<class ST> float64_t CSparseFeatures<ST>::dot(int32_t vec_idx1,
	CDotFeatures* df, int32_t vec_idx2)
{
	ASSERT(df)
	ASSERT(df->get_feature_type() == get_feature_type())
	ASSERT(df->get_feature_class() == get_feature_class())
	// NOTE(review): upstream line 468, declaring 'sf' (presumably
	// CSparseFeatures<ST>* sf=(CSparseFeatures<ST>*) df;), is missing here.

	SGSparseVector<ST> avec=get_sparse_feature_vector(vec_idx1);
	SGSparseVector<ST> bvec=sf->get_sparse_feature_vector(vec_idx2);

	float64_t result = SGSparseVector<ST>::sparse_dot(avec, bvec);
	free_sparse_feature_vector(vec_idx1);
	sf->free_sparse_feature_vector(vec_idx2);

	return result;
}
479 
/** complex128_t specialization: dot products are not supported.
 * NOTE(review): upstream line 483 (presumably SG_NOTIMPLEMENTED) is missing. */
template<> float64_t CSparseFeatures<complex128_t>::dot(int32_t vec_idx1,
	CDotFeatures* df, int32_t vec_idx2)
{
	return 0.0;
}
486 
487 template<class ST> float64_t CSparseFeatures<ST>::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
488 {
489  REQUIRE(vec2, "dense_dot(vec_idx1=%d,vec2_len=%d): vec2 must not be NULL\n",
490  vec_idx1, vec2_len);
491  REQUIRE(vec2_len>=get_num_features(),
492  "dense_dot(vec_idx1=%d,vec2_len=%d): vec2_len should contain number of features %d %d\n",
493  vec_idx1, vec2_len, get_num_features());
494 
495  float64_t result=0;
496  SGSparseVector<ST> sv=get_sparse_feature_vector(vec_idx1);
497 
498  if (sv.features)
499  {
500  REQUIRE(get_num_features() >= sv.get_num_dimensions(),
501  "sparse_matrix[%d] check failed (matrix features %d >= vector dimension %d)\n",
502  vec_idx1, get_num_features(), sv.get_num_dimensions());
503 
504  REQUIRE(vec2_len >= sv.get_num_dimensions(),
505  "sparse_matrix[%d] check failed (dense vector dimension %d >= vector dimension %d)\n",
506  vec_idx1, vec2_len, sv.get_num_dimensions());
507 
508  for (int32_t i=0; i<sv.num_feat_entries; i++)
509  result+=vec2[sv.features[i].feat_index]*sv.features[i].entry;
510  }
511 
512  free_sparse_feature_vector(vec_idx1);
513 
514  return result;
515 }
516 
// NOTE(review): upstream line 517 (the specialization's signature line,
// presumably template<> float64_t CSparseFeatures<complex128_t>::dense_dot(int32_t vec_idx1,)
// and line 520 (likely SG_NOTIMPLEMENTED) are missing from this extract.
	const float64_t* vec2, int32_t vec2_len)
{
	return 0.0;
}
523 
524 template<class ST> void* CSparseFeatures<ST>::get_feature_iterator(int32_t vector_index)
525 {
526  if (vector_index>=get_num_vectors())
527  {
528  SG_ERROR("Index out of bounds (number of vectors %d, you "
529  "requested %d)\n", get_num_vectors(), vector_index);
530  }
531 
532  if (!sparse_feature_matrix.sparse_matrix)
533  SG_ERROR("Requires a in-memory feature matrix\n")
534 
535  sparse_feature_iterator* it=new sparse_feature_iterator();
536  it->sv=get_sparse_feature_vector(vector_index);
537  it->index=0;
538  it->vector_index=vector_index;
539 
540  return it;
541 }
542 
543 template<class ST> bool CSparseFeatures<ST>::get_next_feature(int32_t& index, float64_t& value, void* iterator)
544 {
545  sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
546  if (!it || it->index>=it->sv.num_feat_entries)
547  return false;
548 
549  int32_t i=it->index++;
550 
551  index=it->sv.features[i].feat_index;
552  value=(float64_t) it->sv.features[i].entry;
553 
554  return true;
555 }
556 
/** complex128_t specialization: iteration is not supported.
 * NOTE(review): upstream line 560 (presumably SG_NOTIMPLEMENTED) is missing. */
template<> bool CSparseFeatures<complex128_t>::get_next_feature(int32_t& index,
	float64_t& value, void* iterator)
{
	return false;
}
563 
564 template<class ST> void CSparseFeatures<ST>::free_feature_iterator(void* iterator)
565 {
566  if (!iterator)
567  return;
568 
569  delete ((sparse_feature_iterator*) iterator);
570 }
571 
// NOTE(review): signature line (upstream line 572, presumably
// CFeatures* CSparseFeatures<ST>::copy_subset(SGVector<index_t> indices))
// missing from this extract.
{
	SGSparseMatrix<ST> matrix_copy=SGSparseMatrix<ST>(get_dim_feature_space(),
		indices.vlen);

	for (index_t i=0; i<indices.vlen; ++i)
	{
		/* index to copy */
		index_t index=indices.vector[i];
		index_t real_index=m_subset_stack->subset_idx_conversion(index);

		/* copy sparse vector */
		SGSparseVector<ST> current=get_sparse_feature_vector(real_index);
		matrix_copy.sparse_matrix[i]=current;

		// NOTE(review): the vector was fetched with real_index but is freed
		// with index — verify this asymmetry is intended.
		free_sparse_feature_vector(index);
	}

	// caller owns the returned features object
	CFeatures* result=new CSparseFeatures<ST>(matrix_copy);
	return result;
}
593 
// NOTE(review): upstream line 594 (the function name line, presumably
// template<class ST> SGSparseVectorEntry<ST>* CSparseFeatures<ST>::compute_sparse_feature_vector(int32_t num,)
// and line 597 (likely SG_NOTIMPLEMENTED) are missing from this extract.
// Base implementation: no on-the-fly computation; returns an empty vector.
	int32_t& len, SGSparseVectorEntry<ST>* target)
{

	len=0;
	return NULL;
}
602 
/** Sort the entries of every sparse vector by feature index (required by
 * the merge-based dot products). Delegates to SGSparseMatrix. */
template<class ST> void CSparseFeatures<ST>::sort_features()
{
	sparse_feature_matrix.sort_features();
}
607 
/** Register the generic type and the serializable parameters
 * (sparse matrix storage and feature count). Called from every ctor. */
template<class ST> void CSparseFeatures<ST>::init()
{
	set_generic<ST>();

	m_parameters->add_vector(&sparse_feature_matrix.sparse_matrix, &sparse_feature_matrix.num_vectors,
		"sparse_feature_matrix",
		"Array of sparse vectors.");
	m_parameters->add(&sparse_feature_matrix.num_features, "sparse_feature_matrix.num_features",
		"Total number of features.");
}
618 
// Per-type get_feature_type() specializations, generated via macro.
// NOTE(review): this extract is missing the instantiations from upstream
// lines 624-625 (likely bool/char) and 634-637 (likely the floating-point
// and complex types) — confirm against the repository.
#define GET_FEATURE_TYPE(sg_type, f_type) \
template<> EFeatureType CSparseFeatures<sg_type>::get_feature_type() const \
{ \
	return f_type; \
}
GET_FEATURE_TYPE(uint8_t, F_BYTE)
GET_FEATURE_TYPE(int8_t, F_BYTE)
GET_FEATURE_TYPE(int16_t, F_SHORT)
GET_FEATURE_TYPE(uint16_t, F_WORD)
GET_FEATURE_TYPE(int32_t, F_INT)
GET_FEATURE_TYPE(uint32_t, F_UINT)
GET_FEATURE_TYPE(int64_t, F_LONG)
GET_FEATURE_TYPE(uint64_t, F_ULONG)
#undef GET_FEATURE_TYPE
639 
/** Load the sparse matrix from a file, discarding any subsets and the
 * previous matrix first.
 * @param loader source file, must not be NULL */
template<class ST> void CSparseFeatures<ST>::load(CFile* loader)
{
	remove_all_subsets();
	ASSERT(loader)
	free_sparse_feature_matrix();
	sparse_feature_matrix.load(loader);
}
647 
// NOTE(review): signature line (upstream line 648, presumably
// SGVector<float64_t> CSparseFeatures<ST>::load_with_labels(CLibSVMFile* loader))
// missing from this extract.
{
	remove_all_subsets();
	ASSERT(loader)
	free_sparse_feature_matrix();
	// returns the label vector read alongside the features
	return sparse_feature_matrix.load_with_labels(loader);
}
655 
/** Write the sparse matrix to a file. Not allowed while a subset is active.
 * @param writer target file, must not be NULL */
template<class ST> void CSparseFeatures<ST>::save(CFile* writer)
{
	if (m_subset_stack->has_subsets())
		SG_ERROR("Not allowed with subset\n");
	ASSERT(writer)
	sparse_feature_matrix.save(writer);
}
663 
// NOTE(review): signature line (upstream line 664, presumably
// void CSparseFeatures<ST>::save_with_labels(CLibSVMFile* writer, SGVector<float64_t> labels))
// missing from this extract.
{
	if (m_subset_stack->has_subsets())
		SG_ERROR("Not allowed with subset\n");
	ASSERT(writer)
	sparse_feature_matrix.save_with_labels(writer, labels);
}
671 
672 template class CSparseFeatures<bool>;
673 template class CSparseFeatures<char>;
674 template class CSparseFeatures<int8_t>;
675 template class CSparseFeatures<uint8_t>;
676 template class CSparseFeatures<int16_t>;
677 template class CSparseFeatures<uint16_t>;
678 template class CSparseFeatures<int32_t>;
679 template class CSparseFeatures<uint32_t>;
680 template class CSparseFeatures<int64_t>;
681 template class CSparseFeatures<uint64_t>;
682 template class CSparseFeatures<float32_t>;
683 template class CSparseFeatures<float64_t>;
684 template class CSparseFeatures<floatmax_t>;
685 template class CSparseFeatures<complex128_t>;
686 }

SHOGUN Machine Learning Toolbox - Documentation