SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HashedDocDotFeatures.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evangelos Anagnostopoulos
8  * Copyright (C) 2013 Evangelos Anagnostopoulos
9  */
10 
13 #include <shogun/lib/Hash.h>
15 
16 namespace shogun
17 {
19  CTokenizer* tzer, bool normalize, int32_t n_grams, int32_t skips, int32_t size) : CDotFeatures(size)
20 {
21  if (n_grams < 1)
22  n_grams = 1;
23 
24  if ( (n_grams==1 && skips!=0) || (skips<0))
25  skips = 0;
26 
27  init(hash_bits, docs, tzer, normalize, n_grams, skips);
28 }
29 
31 : CDotFeatures(orig)
32 {
33  init(orig.num_bits, orig.doc_collection, orig.tokenizer, orig.should_normalize,
34  orig.ngrams, orig.tokens_to_skip);
35 }
36 
// NOTE(review): this fragment lost its signature (doxygen line 37) and its
// body line (39) in the scrape — from its position between the copy
// constructor and init() it is presumably another constructor overload, but
// that cannot be confirmed here. Recover the missing lines from the upstream
// shogun 3.2.1 sources before editing this definition.
38 {
40 }
41 
42 void CHashedDocDotFeatures::init(int32_t hash_bits, CStringFeatures<char>* docs,
43  CTokenizer* tzer, bool normalize, int32_t n_grams, int32_t skips)
44 {
45  num_bits = hash_bits;
46  ngrams = n_grams;
47  tokens_to_skip = skips;
48  doc_collection = docs;
49  tokenizer = tzer;
50  should_normalize = normalize;
51 
52  if (!tokenizer)
53  {
55  ((CDelimiterTokenizer* )tokenizer)->init_for_whitespace();
56  }
57 
58  SG_ADD(&num_bits, "num_bits", "Number of bits of hash", MS_NOT_AVAILABLE);
59  SG_ADD(&ngrams, "ngrams", "Number of tokens to combine for quadratic feature support",
61  SG_ADD(&tokens_to_skip, "tokens_to_skip", "Number of tokens to skip when combining features",
63  SG_ADD((CSGObject**) &doc_collection, "doc_collection", "Document collection",
65  SG_ADD((CSGObject**) &tokenizer, "tokenizer", "Document tokenizer",
67  SG_ADD(&should_normalize, "should_normalize", "Normalize or not the dot products",
69 
72 }
73 
// NOTE(review): signature (doxygen line 74) and both body lines (76-77) are
// missing from this scrape. Given its position directly after init(), this is
// presumably the destructor releasing the two SG_REF'd members
// (doc_collection, tokenizer) — confirm against the upstream shogun 3.2.1
// sources before editing.
75 {
78 }
79 
81 {
82  return CMath::pow(2, num_bits);
83 }
84 
85 float64_t CHashedDocDotFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
86 {
87  ASSERT(df)
88  ASSERT(df->get_name() == get_name())
89 
91 
93  SGVector<char> sv2 = hddf->doc_collection->get_feature_vector(vec_idx2);
94 
97  SGSparseVector<float64_t> cv1 = converter->apply(sv1);
98  SGSparseVector<float64_t> cv2 = converter->apply(sv2);
100 
101  doc_collection->free_feature_vector(sv1, vec_idx1);
102  hddf->doc_collection->free_feature_vector(sv2, vec_idx2);
103  SG_UNREF(converter);
104 
105  return result;
106 }
107 
// NOTE(review): the signature line (doxygen 108) is missing from this scrape.
// The body is a thin forwarding wrapper delegating to the raw-pointer
// dense_dot() below — presumably the SGVector<float64_t> overload
// (dense_dot_sgvec in upstream shogun); confirm the exact name and parameter
// list against the upstream sources before editing.
109 {
110  return dense_dot(vec_idx1, vec2.vector, vec2.vlen);
111 }
112 
113 float64_t CHashedDocDotFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
114 {
115  ASSERT(vec2_len == CMath::pow(2,num_bits))
116 
118 
122  index_t hashes_start = 0;
123  index_t hashes_end = 0;
124  int32_t len = hashes.vlen - 1;
125 
128  SGVector<index_t> hashed_indices((ngrams-1)*(tokens_to_skip+1) + 1);
129 
130  float64_t result = 0;
131  CTokenizer* local_tzer = tokenizer->get_copy();
132 
134  const int32_t seed = 0xdeadbeaf;
135  local_tzer->set_text(sv);
136  index_t start = 0;
137  while (hashes_end<ngrams-1+tokens_to_skip && local_tzer->has_next())
138  {
139  index_t end = local_tzer->next_token_idx(start);
140  uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
141  hashes[hashes_end++] = token_hash;
142  }
143 
145  while (local_tzer->has_next())
146  {
147  index_t end = local_tzer->next_token_idx(start);
148  uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
149  hashes[hashes_end] = token_hash;
150 
151  CHashedDocConverter::generate_ngram_hashes(hashes, hashes_start, len, hashed_indices,
153 
154  for (index_t i=0; i<hashed_indices.vlen; i++)
155  result += vec2[hashed_indices[i]];
156 
157  hashes_start++;
158  hashes_end++;
159  if (hashes_end==hashes.vlen)
160  hashes_end = 0;
161  if (hashes_start==hashes.vlen)
162  hashes_start = 0;
163  }
164 
165  if (ngrams>1)
166  {
167  while (hashes_start!=hashes_end)
168  {
169  len--;
170  index_t max_idx = CHashedDocConverter::generate_ngram_hashes(hashes, hashes_start,
171  len, hashed_indices, num_bits, ngrams, tokens_to_skip);
172 
173  for (index_t i=0; i<max_idx; i++)
174  result += vec2[hashed_indices[i]];
175 
176  hashes_start++;
177  if (hashes_start==hashes.vlen)
178  hashes_start = 0;
179  }
180  }
181  doc_collection->free_feature_vector(sv, vec_idx1);
182  SG_UNREF(local_tzer);
183  return should_normalize ? result / CMath::sqrt((float64_t) sv.size()) : result;
184 }
185 
187  float64_t* vec2, int32_t vec2_len, bool abs_val)
188 {
189  ASSERT(vec2_len == CMath::pow(2,num_bits))
190 
191  if (abs_val)
192  alpha = CMath::abs(alpha);
193 
195  const float64_t value = should_normalize ? alpha / CMath::sqrt((float64_t) sv.size()) : alpha;
196 
200  index_t hashes_start = 0;
201  index_t hashes_end = 0;
202  index_t len = hashes.vlen - 1;
203 
206  SGVector<index_t> hashed_indices((ngrams-1)*(tokens_to_skip+1) + 1);
207 
208  CTokenizer* local_tzer = tokenizer->get_copy();
209 
211  const int32_t seed = 0xdeadbeaf;
212  local_tzer->set_text(sv);
213  index_t start = 0;
214  while (hashes_end<ngrams-1+tokens_to_skip && local_tzer->has_next())
215  {
216  index_t end = local_tzer->next_token_idx(start);
217  uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
218  hashes[hashes_end++] = token_hash;
219  }
220 
221  while (local_tzer->has_next())
222  {
223  index_t end = local_tzer->next_token_idx(start);
224  uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
225  hashes[hashes_end] = token_hash;
226 
227  CHashedDocConverter::generate_ngram_hashes(hashes, hashes_start, len, hashed_indices,
229 
230  for (index_t i=0; i<hashed_indices.vlen; i++)
231  vec2[hashed_indices[i]] += value;
232 
233  hashes_start++;
234  hashes_end++;
235  if (hashes_end==hashes.vlen)
236  hashes_end = 0;
237  if (hashes_start==hashes.vlen)
238  hashes_start = 0;
239  }
240 
241  if (ngrams>1)
242  {
243  while (hashes_start!=hashes_end)
244  {
245  len--;
247  hashes_start, len, hashed_indices, num_bits, ngrams, tokens_to_skip);
248 
249  for (index_t i=0; i<max_idx; i++)
250  vec2[hashed_indices[i]] += value;
251 
252  hashes_start++;
253  if (hashes_start==hashes.vlen)
254  hashes_start = 0;
255  }
256  }
257 
258  doc_collection->free_feature_vector(sv, vec_idx1);
259  SG_UNREF(local_tzer);
260 }
261 
263  int32_t length, int32_t num_bits, uint32_t seed)
264 {
265  int32_t hash = CHash::MurmurHash3((uint8_t* ) token, length, seed);
266  return hash & ((1 << num_bits) - 1);
267 }
268 
// NOTE(review): the signature line (doxygen 269) and body line 271 are missing
// from this scrape. This is the document-collection setter; line 271 was
// presumably the SG_UNREF/SG_REF bookkeeping for swapping the old collection —
// confirm against the upstream shogun 3.2.1 sources before editing, as the
// visible assignment alone would leak the previous reference.
270 {
272  doc_collection = docs;
273 }
274 
276 {
278  int32_t num_nnz_features = sv.size();
280  return num_nnz_features;
281 }
282 
284 {
286  return NULL;
287 }
288 
289 bool CHashedDocDotFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
290 {
292  return false;
293 }
294 
// NOTE(review): signature (doxygen line 295) and body line 297 are missing
// from this scrape. From its position after get_next_feature() this is
// presumably free_feature_iterator() with an SG_NOTIMPLEMENTED body — confirm
// against the upstream shogun 3.2.1 sources before editing.
296 {
298 }
299 
301 {
302  return "HashedDocDotFeatures";
303 }
304 
306 {
307  return new CHashedDocDotFeatures(*this);
308 }
309 
311 {
312  return F_UINT;
313 }
314 
316 {
317  return C_SPARSE;
318 }
319 
// NOTE(review): signature (doxygen line 320) and body line 322 are missing
// from this scrape. This trailing definition is presumably get_num_vectors()
// forwarding to doc_collection — confirm against the upstream shogun 3.2.1
// sources before editing.
321 {
323 }
324 }

SHOGUN Machine Learning Toolbox - Documentation