SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
SpectrumMismatchRBFKernel.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2009 Soeren Sonnenburg
8  * Written (W) 1999-2008 Gunnar Raetsch
9  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
12 #include <vector>
13 
14 #include <shogun/lib/common.h>
15 #include <shogun/io/SGIO.h>
16 #include <shogun/lib/Signal.h>
17 #include <shogun/lib/Trie.h>
18 #include <shogun/base/Parallel.h>
19 
23 
24 #include <vector>
25 #include <string>
26 
27 #include <assert.h>
28 
29 #ifndef WIN32
30 #include <pthread.h>
31 #endif
32 
33 using namespace shogun;
34 
36  CStringKernel<char>(0)
37 {
38  init();
40 }
41 
43  float64_t* AA_matrix_, int32_t nr, int32_t nc, int32_t degree_,
44  int32_t max_mismatch_, float64_t width_) :
45  CStringKernel<char>(size), alphabet(NULL), degree(degree_), max_mismatch(
46  max_mismatch_), width(width_)
47 {
48  init();
49  target_letter_0=-1;
50  set_AA_matrix(AA_matrix_, nr, nc);
52 }
53 
55  CStringFeatures<char>* r, int32_t size, float64_t* AA_matrix_,
56  int32_t nr, int32_t nc, int32_t degree_, int32_t max_mismatch_,
57  float64_t width_) :
58  CStringKernel<char>(size), alphabet(NULL), degree(degree_), max_mismatch(
59  max_mismatch_), width(width_)
60 {
61  target_letter_0=-1;
62 
63  set_AA_matrix(AA_matrix_, nr, nc);
64  init(l, r);
66 }
67 
69 {
70  cleanup();
72 }
73 
74 bool CSpectrumMismatchRBFKernel::init(CFeatures* l, CFeatures* r)
75 {
76  int32_t lhs_changed=(lhs!=l);
77  int32_t rhs_changed=(rhs!=r);
78 
80 
81  SG_DEBUG("lhs_changed: %i\n", lhs_changed)
82  SG_DEBUG("rhs_changed: %i\n", rhs_changed)
83 
86 
88  alphabet=sf_l->get_alphabet();
89  CAlphabet* ralphabet=sf_r->get_alphabet();
90 
91  if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
92  properties&=((uint64_t)(-1))^(KP_LINADD|KP_BATCHEVALUATION);
93 
94  ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet())
95  SG_UNREF(ralphabet);
96 
97  compute_all();
98 
99  return init_normalizer();
100 }
101 
103 {
104 
106  alphabet=NULL;
107 
109 }
110 
112  const char* joint_seq, unsigned int index)
113 {
114  float64_t diff=0.0;
115 
116  for (unsigned int i=0; i<path.size(); i++)
117  {
118  if (path[i]!=joint_seq[index+i])
119  {
120  diff+=AA_matrix.matrix[(path[i]-1)*128+path[i]-1];
121  diff-=2*AA_matrix.matrix[(path[i]-1)*128+joint_seq[index+i]-1];
122  diff+=AA_matrix.matrix[(joint_seq[index+i]-1)*128+joint_seq[index+i]
123  -1];
124  }
125  }
126 
127  return exp(-diff/width);
128 }
129 
131  std::vector<struct joint_list_struct> &joint_list, std::string path,
132  unsigned int d)
133 {
134  const char* AA="ACDEFGHIKLMNPQRSTVWY";
135  const unsigned int num_AA=strlen(AA);
136 
137  assert(path.size()==d);
138 
139  for (unsigned int i=0; i<num_AA; i++)
140  {
141  std::vector<struct joint_list_struct> joint_list_;
142 
143  if (d==0)
144  SG_PRINT("i=%i: ", i);
145  if (d==0&&target_letter_0!=-1&&(int)i!=target_letter_0)
146  continue;
147 
148  if (d==1)
149  {
150  SG_PRINT("*");
151  }
152  if (d==2)
153  {
154  SG_PRINT("+");
155  }
156 
157  for (unsigned int j=0; j<joint_list.size(); j++)
158  {
159  if (joint_seq[joint_list[j].index+d]!=AA[i])
160  {
161  if (joint_list[j].mismatch+1<=(unsigned int)max_mismatch)
162  {
163  struct joint_list_struct list_item;
164  list_item=joint_list[j];
165  list_item.mismatch=joint_list[j].mismatch+1;
166  joint_list_.push_back(list_item);
167  }
168  }
169  else
170  joint_list_.push_back(joint_list[j]);
171  }
172 
173  if (joint_list_.size()>0)
174  {
175  std::string path_=path+AA[i];
176 
177  if (d+1<(unsigned int)degree)
178  {
179  compute_helper_all(joint_seq, joint_list_, path_, d+1);
180  }
181  else
182  {
185  feats.set_const(0);
186 
187  for (unsigned int j=0; j<joint_list_.size(); j++)
188  {
189  if (width==0.0)
190  {
191  feats[joint_list_[j].ex_index]++;
192  //if (joint_mismatch_[j]==0)
193  // feats[joint_ex_index_[j]]+=3 ;
194  }
195  else
196  {
197  if (joint_list_[j].mismatch!=0)
198  feats[joint_list_[j].ex_index]+=AA_helper(path_,
199  joint_seq, joint_list_[j].index);
200  else
201  feats[joint_list_[j].ex_index]++;
202  }
203  }
204 
205  std::vector<int> idx;
206  for (int r=0; r<feats.get_array_size(); r++)
207  if (feats[r]!=0.0)
208  idx.push_back(r);
209 
210  for (unsigned int r=0; r<idx.size(); r++)
211  for (unsigned int s=r; s<idx.size(); s++)
212  if (s==r)
214  feats[idx[r]]*feats[idx[s]]
215  +kernel_matrix->get_element(idx[r],
216  idx[s]), idx[r], idx[s]);
217  else
218  {
220  feats[idx[r]]*feats[idx[s]]
221  +kernel_matrix->get_element(idx[r],
222  idx[s]), idx[r], idx[s]);
224  feats[idx[r]]*feats[idx[s]]
225  +kernel_matrix->get_element(idx[s],
226  idx[r]), idx[s], idx[r]);
227  }
228  }
229  }
230  if (d==0)
231  SG_PRINT("\n");
232  }
233 }
234 
236 {
237  std::string joint_seq;
238  std::vector<struct joint_list_struct> joint_list;
239 
240  assert(lhs->get_num_vectors()==rhs->get_num_vectors());
243  for (int i=0; i<lhs->get_num_vectors(); i++)
244  for (int j=0; j<lhs->get_num_vectors(); j++)
245  kernel_matrix->set_element(0, i, j);
246 
247  for (int i=0; i<lhs->get_num_vectors(); i++)
248  {
249  int32_t alen;
250  bool free_avec;
251  char* avec=((CStringFeatures<char>*)lhs)->get_feature_vector(i, alen,
252  free_avec);
253 
254  for (int apos=0; apos+degree-1<alen; apos++)
255  {
256  struct joint_list_struct list_item;
257  list_item.ex_index=i;
258  list_item.index=apos+joint_seq.size();
259  list_item.mismatch=0;
260 
261  joint_list.push_back(list_item);
262  }
263  joint_seq+=std::string(avec, alen);
264 
265  ((CStringFeatures<char>*)lhs)->free_feature_vector(avec, i, free_avec);
266  }
267 
268  compute_helper_all(joint_seq.c_str(), joint_list, "", 0);
269 }
270 
271 float64_t CSpectrumMismatchRBFKernel::compute(int32_t idx_a, int32_t idx_b)
272 {
273  return kernel_matrix->element(idx_a, idx_b);
274 }
275 
277  int32_t nr, int32_t nc)
278 {
279  if (AA_matrix_)
280  {
281  if (nr!=128 || nc!=128)
282  SG_ERROR("AA_matrix should be of shape 128x128\n")
283 
285  SG_DEBUG("Setting AA_matrix\n")
286  memcpy(AA_matrix.matrix, AA_matrix_, 128*128*sizeof(float64_t));
287  return true;
288  }
289 
290  return false;
291 }
292 
294 {
296 
297  if (lhs!=NULL&&rhs!=NULL)
298  return init(lhs, rhs);
299  else
300  return true;
301 }
302 
304 {
305  SG_ADD(&degree, "degree", "degree of the kernel", MS_AVAILABLE);
306  SG_ADD(&AA_matrix, "AA_matrix", "128*128 scalar product matrix",
308  SG_ADD(&width, "width", "width of Gaussian", MS_AVAILABLE);
309  SG_ADD(&target_letter_0, "target_letter_0", "target letter 0",
311  SG_ADD(&initialized, "initialized", "the mark of initialization status",
313  SG_ADD((CSGObject** )&kernel_matrix, "kernel_matrix",
314  "the kernel matrix with its length "
315  "defined by the number of vectors of the string features",
317 }
318 
320 {
321  SG_ADD((CSGObject** )&alphabet, "alphabet", "the alphabet used by kernel",
323 }
324 
325 void CSpectrumMismatchRBFKernel::init()
326 {
327  alphabet=NULL;
328  degree=0;
329  max_mismatch=0;
330  width=0.0;
332  initialized=false;
333  target_letter_0=0;
334 }
335 

SHOGUN Machine Learning Toolbox - Documentation