HashedWDFeaturesTransposed.cpp
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2010 Soeren Sonnenburg
 * Copyright (C) 2010 Berlin Institute of Technology
 */

#include <shogun/features/HashedWDFeaturesTransposed.h>
#include <shogun/io/SGIO.h>
#include <shogun/lib/Signal.h>
#include <shogun/base/Parallel.h>

#ifdef HAVE_PTHREAD
#include <pthread.h>
#endif

using namespace shogun;

#ifndef DOXYGEN_SHOULD_SKIP_THIS
struct HASHEDWD_THREAD_PARAM
{
    CHashedWDFeaturesTransposed* hf;
    int32_t* sub_index;
    float64_t* output;
    int32_t start;
    int32_t stop;
    float64_t* alphas;
    float64_t* vec;
    float64_t bias;
    bool progress;
    uint32_t* index;
};
#endif // DOXYGEN_SHOULD_SKIP_THIS

37 
39  :CDotFeatures()
40 {
42  "CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed()",
43  "\n");
44 
45  strings = NULL;
46  transposed_strings = NULL;
47 
48  degree = 0;
49  start_degree = 0;
50  from_degree = 0;
51  string_length = 0;
52  num_strings = 0;
53  alphabet_size = 0;
54  w_dim = 0;
55  partial_w_dim = 0;
56  wd_weights = NULL;
57  mask = 0;
58  m_hash_bits = 0;
59 
60  normalization_const = 0.0;
61 }
62 
CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(CStringFeatures<uint8_t>* str,
    int32_t start_order, int32_t order, int32_t from_order,
    int32_t hash_bits) : CDotFeatures()
{
    ASSERT(start_order>=0)
    ASSERT(start_order<order)
    ASSERT(order<=from_order)
    ASSERT(hash_bits>0)
    ASSERT(str)
    ASSERT(str->have_same_length())
    SG_REF(str);

    strings=str;
    int32_t transposed_num_feat=0;
    int32_t transposed_num_vec=0;
    transposed_strings=str->get_transposed(transposed_num_feat, transposed_num_vec);

    string_length=str->get_max_vector_length();
    num_strings=str->get_num_vectors();
    ASSERT(transposed_num_feat==num_strings)
    ASSERT(transposed_num_vec==string_length)

    CAlphabet* alpha=str->get_alphabet();
    alphabet_size=alpha->get_num_symbols();
    SG_UNREF(alpha);

    degree=order;
    start_degree=start_order;
    if (start_degree!=0)
        SG_NOTIMPLEMENTED
    from_degree=from_order;
    m_hash_bits=hash_bits;
    set_wd_weights();
    set_normalization_const();
}

CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(const CHashedWDFeaturesTransposed& orig)
    : CDotFeatures(orig), strings(orig.strings), transposed_strings(orig.transposed_strings),
    degree(orig.degree), start_degree(orig.start_degree),
    from_degree(orig.from_degree), m_hash_bits(orig.m_hash_bits),
    normalization_const(orig.normalization_const)
{
    SG_REF(strings);
    string_length=strings->get_max_vector_length();
    num_strings=strings->get_num_vectors();
    CAlphabet* alpha=strings->get_alphabet();
    alphabet_size=alpha->get_num_symbols();
    SG_UNREF(alpha);

    set_wd_weights();
}

CHashedWDFeaturesTransposed::~CHashedWDFeaturesTransposed()
{
    for (int32_t i=0; i<string_length; i++)
        SG_FREE(transposed_strings[i].string);
    SG_FREE(transposed_strings);

    SG_UNREF(strings);
    SG_FREE(wd_weights);
}

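/* dot() computes the weighted degree (WD) kernel between two strings
 * explicitly: every k-mer (k<degree, k>=start_degree) matching between vec1
 * and vec2 at the same position contributes wd_weights[k]^2, and the result
 * is scaled by 1/normalization_const^2. No hashing is involved here. */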
float64_t CHashedWDFeaturesTransposed::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
{
    ASSERT(df)
    ASSERT(df->get_feature_type() == get_feature_type())
    ASSERT(df->get_feature_class() == get_feature_class())
    CHashedWDFeaturesTransposed* wdf = (CHashedWDFeaturesTransposed*) df;

    int32_t len1, len2;
    bool free_vec1, free_vec2;

    uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
    uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2);

    ASSERT(len1==len2)

    float64_t sum=0.0;

    for (int32_t i=0; i<len1; i++)
    {
        for (int32_t j=0; (i+j<len1) && (j<degree); j++)
        {
            if (vec1[i+j]!=vec2[i+j])
                break;
            if (j>=start_degree)
                sum += wd_weights[j]*wd_weights[j];
        }
    }
    strings->free_feature_vector(vec1, vec_idx1, free_vec1);
    wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2);
    return sum/CMath::sq(normalization_const);
}

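/* dense_dot() computes w'x for a single string: for each position i and each
 * order k, the (k+1)-mer starting at i is hashed incrementally with
 * MurmurHash3 and mapped via 'mask' into one of 2^m_hash_bits buckets; each
 * (position, order) pair owns its own block of partial_w_dim entries of vec2,
 * selected through the running offset 'o'. */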
float64_t CHashedWDFeaturesTransposed::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
{
    if (vec2_len != w_dim)
        SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim)

    float64_t sum=0;
    int32_t len;
    bool free_vec1;
    uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
    uint32_t* val=SG_MALLOC(uint32_t, len);

    uint32_t offs=0;

    SGVector<uint32_t>::fill_vector(val, len, 0xDEADBEAF);

    for (int32_t i=0; i < len; i++)
    {
        uint32_t o=offs;
        uint32_t carry = 0;
        uint32_t chunk = 0;

        for (int32_t k=0; k<degree && i+k<len; k++)
        {
            const float64_t wd = wd_weights[k];
            chunk++;
            CHash::IncrementalMurmurHash3(&(val[i]), &carry, &(vec[i+k]), 1);
            uint32_t h =
                CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
#ifdef DEBUG_HASHEDWD
            SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k, offs, o, h)
#endif
            sum+=vec2[o+(h & mask)]*wd;
            o+=partial_w_dim;
        }
        val[i] = CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
        offs+=partial_w_dim*degree;
    }
    SG_FREE(val);
    strings->free_feature_vector(vec, vec_idx1, free_vec1);

    return sum/normalization_const;
}

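/* dense_dot_range() computes output[j]=alphas[j]*(w'x_j)/normalization_const+b
 * for all vectors x_j with j in [start, stop); when the Parallel instance is
 * configured with more than one thread the range is split across pthreads,
 * and the actual work is done in dense_dot_range_helper(). */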
void CHashedWDFeaturesTransposed::dense_dot_range(float64_t* output, int32_t start, int32_t stop, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
{
    ASSERT(output)
    // write access is internally between output[start..stop] so the following
    // line is necessary to write to output[0...(stop-start-1)]
    output-=start;
    ASSERT(start>=0)
    ASSERT(start<stop)
    ASSERT(stop<=get_num_vectors())
    uint32_t* index=SG_MALLOC(uint32_t, stop);

    int32_t num_vectors=stop-start;
    ASSERT(num_vectors>0)

    // TODO: port to use OpenMP backend instead of pthread
#ifdef HAVE_PTHREAD
    int32_t num_threads=parallel->get_num_threads();
#else
    int32_t num_threads=1;
#endif
    ASSERT(num_threads>0)

    CSignal::clear_cancel();

    if (dim != w_dim)
        SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim)

    if (num_threads < 2)
    {
        HASHEDWD_THREAD_PARAM params;
        params.hf=this;
        params.sub_index=NULL;
        params.output=output;
        params.start=start;
        params.stop=stop;
        params.alphas=alphas;
        params.vec=vec;
        params.bias=b;
        params.progress=false; //true;
        params.index=index;
        dense_dot_range_helper((void*) &params);
    }
#ifdef HAVE_PTHREAD
    else
    {
        pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
        HASHEDWD_THREAD_PARAM* params = SG_MALLOC(HASHEDWD_THREAD_PARAM, num_threads);
        int32_t step= num_vectors/num_threads;

        int32_t t;

        for (t=0; t<num_threads-1; t++)
        {
            params[t].hf = this;
            params[t].sub_index=NULL;
            params[t].output = output;
            params[t].start = start+t*step;
            params[t].stop = start+(t+1)*step;
            params[t].alphas=alphas;
            params[t].vec=vec;
            params[t].bias=b;
            params[t].progress = false;
            params[t].index=index;
            pthread_create(&threads[t], NULL,
                    CHashedWDFeaturesTransposed::dense_dot_range_helper, (void*) &params[t]);
        }

        params[t].hf = this;
        params[t].sub_index=NULL;
        params[t].output = output;
        params[t].start = start+t*step;
        params[t].stop = stop;
        params[t].alphas=alphas;
        params[t].vec=vec;
        params[t].bias=b;
        params[t].progress = false; //true;
        params[t].index=index;
        dense_dot_range_helper((void*) &params[t]);

        for (t=0; t<num_threads-1; t++)
            pthread_join(threads[t], NULL);

        SG_FREE(params);
        SG_FREE(threads);
    }
#endif
    SG_FREE(index);

#ifndef WIN32
    if (CSignal::cancel_computations())
        SG_INFO("prematurely stopped. \n")
#endif
}

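/* dense_dot_range_subset() is the same computation as dense_dot_range(), but
 * over the vectors listed in sub_index[0..num-1] instead of a contiguous
 * index range. */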
void CHashedWDFeaturesTransposed::dense_dot_range_subset(int32_t* sub_index, int32_t num, float64_t* output, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
{
    ASSERT(sub_index)
    ASSERT(output)

    uint32_t* index=SG_MALLOC(uint32_t, num);

    // TODO: port to use OpenMP backend instead of pthread
#ifdef HAVE_PTHREAD
    int32_t num_threads=parallel->get_num_threads();
#else
    int32_t num_threads=1;
#endif
    ASSERT(num_threads>0)

    CSignal::clear_cancel();

    if (dim != w_dim)
        SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim)

    if (num_threads < 2)
    {
        HASHEDWD_THREAD_PARAM params;
        params.hf=this;
        params.sub_index=sub_index;
        params.output=output;
        params.start=0;
        params.stop=num;
        params.alphas=alphas;
        params.vec=vec;
        params.bias=b;
        params.progress=false; //true;
        params.index=index;
        dense_dot_range_helper((void*) &params);
    }
#ifdef HAVE_PTHREAD
    else
    {
        pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
        HASHEDWD_THREAD_PARAM* params = SG_MALLOC(HASHEDWD_THREAD_PARAM, num_threads);
        int32_t step= num/num_threads;

        int32_t t;

        for (t=0; t<num_threads-1; t++)
        {
            params[t].hf = this;
            params[t].sub_index=sub_index;
            params[t].output = output;
            params[t].start = t*step;
            params[t].stop = (t+1)*step;
            params[t].alphas=alphas;
            params[t].vec=vec;
            params[t].bias=b;
            params[t].progress = false;
            params[t].index=index;
            pthread_create(&threads[t], NULL,
                    CHashedWDFeaturesTransposed::dense_dot_range_helper, (void*) &params[t]);
        }

        params[t].hf = this;
        params[t].sub_index=sub_index;
        params[t].output = output;
        params[t].start = t*step;
        params[t].stop = num;
        params[t].alphas=alphas;
        params[t].vec=vec;
        params[t].bias=b;
        params[t].progress = false; //true;
        params[t].index=index;
        dense_dot_range_helper((void*) &params[t]);

        for (t=0; t<num_threads-1; t++)
            pthread_join(threads[t], NULL);

        SG_FREE(params);
        SG_FREE(threads);
    }
#endif
    // freed unconditionally so the single-threaded path does not leak it
    SG_FREE(index);

#ifndef WIN32
    if (CSignal::cancel_computations())
        SG_INFO("prematurely stopped. \n")
#endif
}

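/* dense_dot_range_helper() is where the transposed string layout pays off:
 * the outer loops run over positions i and orders k, while the innermost loop
 * streams over the vectors j, so transposed_strings[i+k].string is read
 * sequentially and one running hash per vector is kept in index[]. */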
void* CHashedWDFeaturesTransposed::dense_dot_range_helper(void* p)
{
    HASHEDWD_THREAD_PARAM* par=(HASHEDWD_THREAD_PARAM*) p;
    CHashedWDFeaturesTransposed* hf=par->hf;
    int32_t* sub_index=par->sub_index;
    float64_t* output=par->output;
    int32_t start=par->start;
    int32_t stop=par->stop;
    float64_t* alphas=par->alphas;
    float64_t* vec=par->vec;
    float64_t bias=par->bias;
    bool progress=par->progress;
    uint32_t* index=par->index;
    int32_t string_length=hf->string_length;
    int32_t degree=hf->degree;
    float64_t* wd_weights=hf->wd_weights;
    SGString<uint8_t>* transposed_strings=hf->transposed_strings;
    uint32_t mask=hf->mask;
    int32_t partial_w_dim=hf->partial_w_dim;
    float64_t normalization_const=hf->normalization_const;

    if (sub_index)
    {
        for (int32_t j=start; j<stop; j++)
            output[j]=0.0;

        uint32_t offs=0;
        for (int32_t i=0; i<string_length; i++)
        {
            uint32_t o=offs;
            for (int32_t k=0; k<degree && i+k<string_length; k++)
            {
                const float64_t wd = wd_weights[k];
                uint8_t* dim=transposed_strings[i+k].string;
                uint32_t carry = 0;
                uint32_t chunk = 0;
                for (int32_t j=start; j<stop; j++)
                {
                    uint8_t bval=dim[sub_index[j]];
                    if (k==0)
                        index[j] = 0xDEADBEAF;

                    CHash::IncrementalMurmurHash3(&index[j], &carry, &bval, 1);

                    chunk++;
                    uint32_t h =
                        CHash::FinalizeIncrementalMurmurHash3(
                                index[j], carry, chunk);

                    output[j]+=vec[o + (h & mask)]*wd;
                    index[j] = h;
                }

                index[stop-1] =
                    CHash::FinalizeIncrementalMurmurHash3(
                            index[stop-1], carry, chunk);

                o+=partial_w_dim;
            }
            offs+=partial_w_dim*degree;

            if (progress)
                hf->io->progress(i, 0, string_length, i);
        }

        for (int32_t j=start; j<stop; j++)
        {
            if (alphas)
                output[j]=output[j]*alphas[sub_index[j]]/normalization_const+bias;
            else
                output[j]=output[j]/normalization_const+bias;
        }
    }
    else
    {
        SGVector<float64_t>::fill_vector(&output[start], stop-start, 0.0);

        uint32_t offs=0;
        for (int32_t i=0; i<string_length; i++)
        {
            uint32_t o=offs;
            for (int32_t k=0; k<degree && i+k<string_length; k++)
            {
                const float64_t wd = wd_weights[k];
                uint8_t* dim=transposed_strings[i+k].string;
                uint32_t carry = 0;
                uint32_t chunk = 0;

                for (int32_t j=start; j<stop; j++)
                {
                    uint8_t bval=dim[j]; // no sub_index in this branch
                    if (k==0)
                        index[j] = 0xDEADBEAF;

                    CHash::IncrementalMurmurHash3(&index[j], &carry, &bval, 1);

                    chunk++;
                    uint32_t h =
                        CHash::FinalizeIncrementalMurmurHash3(
                                index[j], carry, chunk);

                    index[j] = h;
                    output[j]+=vec[o + (h & mask)]*wd;
                }

                index[stop-1] = CHash::FinalizeIncrementalMurmurHash3(
                        index[stop-1], carry, chunk);

                o+=partial_w_dim;
            }
            offs+=partial_w_dim*degree;

            if (progress)
                hf->io->progress(i, 0, string_length, i);
        }

        for (int32_t j=start; j<stop; j++)
        {
            if (alphas)
                output[j]=output[j]*alphas[j]/normalization_const+bias;
            else
                output[j]=output[j]/normalization_const+bias;
        }
    }

    return NULL;
}

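/* add_to_dense_vec() is the update dual to dense_dot(): it adds the hashed WD
 * feature vector of string vec_idx1, scaled by alpha/normalization_const (or
 * its absolute value if abs_val is set), onto the dense vector vec2. */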
void CHashedWDFeaturesTransposed::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
{
    if (vec2_len != w_dim)
        SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim)

    int32_t len;
    bool free_vec1;
    uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
    uint32_t* val=SG_MALLOC(uint32_t, len);

    uint32_t offs=0;
    float64_t factor=alpha/normalization_const;
    if (abs_val)
        factor=CMath::abs(factor);

    SGVector<uint32_t>::fill_vector(val, len, 0xDEADBEAF);

    for (int32_t i=0; i<len; i++)
    {
        uint32_t o=offs;
        uint32_t carry = 0;
        uint32_t chunk = 0;

        for (int32_t k=0; k<degree && i+k<len; k++)
        {
            float64_t wd = wd_weights[k]*factor;
            chunk++;
            CHash::IncrementalMurmurHash3(&(val[i]), &carry, &(vec[i+k]), 1);
            uint32_t h =
                CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
#ifdef DEBUG_HASHEDWD
            SG_PRINT("offs=%d o=%d h=%d \n", offs, o, h)
            SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d\n", vec[i], k, offs, o)
#endif
            vec2[o+(h & mask)]+=wd;
            val[i] = h;
            o+=partial_w_dim;
        }

        val[i] = CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
        offs+=partial_w_dim*degree;
    }

    SG_FREE(val);
    strings->free_feature_vector(vec, vec_idx1, free_vec1);
}

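/* set_wd_weights() precomputes the per-order WD kernel weights
 * wd_weights[i]=sqrt(2*(from_degree-i)/(from_degree*(from_degree+1))) and the
 * hashed dimensions: each (position, order) pair owns
 * partial_w_dim=2^m_hash_bits buckets, giving
 * w_dim=partial_w_dim*string_length*(degree-start_degree) in total. */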
void CHashedWDFeaturesTransposed::set_wd_weights()
{
    ASSERT(degree>0)

    mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1;
    partial_w_dim=1<<m_hash_bits;
    w_dim=partial_w_dim*string_length*(degree-start_degree);

    wd_weights=SG_MALLOC(float64_t, degree);

    for (int32_t i=0; i<degree; i++)
        wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));

    SG_DEBUG("created HashedWDFeaturesTransposed with d=%d (%d), alphabetsize=%d, "
            "dim=%d partial_dim=%d num=%d, len=%d\n",
            degree, from_degree, alphabet_size,
            w_dim, partial_w_dim, num_strings, string_length);
}

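/* set_normalization_const(): for n=0 the constant defaults to the norm of a
 * string matching itself perfectly,
 * sqrt(sum_i (string_length-i)*wd_weights[i]^2); otherwise the supplied value
 * n is used directly. */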
void CHashedWDFeaturesTransposed::set_normalization_const(float64_t n)
{
    if (n==0)
    {
        normalization_const=0.0;
        for (int32_t i=0; i<degree; i++)
            normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i];

        normalization_const=CMath::sqrt(normalization_const);
    }
    else
        normalization_const=n;

    SG_DEBUG("normalization_const:%f\n", normalization_const)
}

CFeatures* CHashedWDFeaturesTransposed::duplicate() const
{
    return new CHashedWDFeaturesTransposed(*this);
}

void* CHashedWDFeaturesTransposed::get_feature_iterator(int32_t vector_index)
{
    SG_NOTIMPLEMENTED
    return NULL;
}

bool CHashedWDFeaturesTransposed::get_next_feature(int32_t& index, float64_t& value, void* iterator)
{
    SG_NOTIMPLEMENTED
    return false;
}

void CHashedWDFeaturesTransposed::free_feature_iterator(void* iterator)
{
    SG_NOTIMPLEMENTED
}
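
Example (not part of the file above): a minimal sketch of how this class might be constructed and queried. It assumes an existing CStringFeatures<uint8_t> object holding at least two equal-length strings; the helper name hashed_wd_example and the parameter values (orders up to 3 of an order-8 WD kernel, 12 hash bits) are illustrative only.

#include <shogun/features/HashedWDFeaturesTransposed.h>
#include <shogun/features/StringFeatures.h>

using namespace shogun;

void hashed_wd_example(CStringFeatures<uint8_t>* str)
{
    // hash WD features of orders 1..3 (of an order-8 WD kernel) into
    // 2^12 buckets per (position, order) pair
    CHashedWDFeaturesTransposed* hf=new CHashedWDFeaturesTransposed(
            str, 0 /*start_order*/, 3 /*order*/, 8 /*from_order*/,
            12 /*hash_bits*/);
    SG_REF(hf);

    // explicit WD kernel value between vectors 0 and 1
    float64_t k01=hf->dot(0, hf, 1);
    SG_SPRINT("k(0,1)=%f\n", k01);

    SG_UNREF(hf);
}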