SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
DynProg.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 2 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2007 Soeren Sonnenburg
8  * Written (W) 1999-2007 Gunnar Raetsch
9  * Written (W) 2008-2009 Jonas Behr
10  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
11  */
12 
15 #include <shogun/io/SGIO.h>
16 #include <shogun/lib/config.h>
19 #include <shogun/structure/Plif.h>
21 
22 #include <stdlib.h>
23 #include <stdio.h>
24 #include <time.h>
25 #include <ctype.h>
26 
27 using namespace shogun;
28 
29 //#define USE_TMP_ARRAYCLASS
30 //#define DYNPROG_DEBUG
31 
32 int32_t CDynProg::word_degree_default[4]={3,4,5,6} ;
33 int32_t CDynProg::cum_num_words_default[5]={0,64,320,1344,5440} ;
34 int32_t CDynProg::frame_plifs[3]={4,5,6};
35 int32_t CDynProg::num_words_default[4]= {64,256,1024,4096} ;
36 int32_t CDynProg::mod_words_default[32] = {1,1,1,1,1,1,1,1,
37  1,1,1,1,1,1,1,1,
38  0,0,0,0,0,0,0,0,
39  0,0,0,0,0,0,0,0} ;
40 bool CDynProg::sign_words_default[16] = {true,true,true,true,true,true,true,true,
41  false,false,false,false,false,false,false,false} ; // whether to use counts or signum of counts
42 int32_t CDynProg::string_words_default[16] = {0,0,0,0,0,0,0,0,
43  1,1,1,1,1,1,1,1} ; // which string should be used
44 
45 CDynProg::CDynProg(int32_t num_svms /*= 8 */)
46 : CSGObject(), m_transition_matrix_a_id(1,1), m_transition_matrix_a(1,1),
47  m_transition_matrix_a_deriv(1,1), m_initial_state_distribution_p(1),
48  m_initial_state_distribution_p_deriv(1), m_end_state_distribution_q(1),
49  m_end_state_distribution_q_deriv(1),
50 
51  // multi svm
52  m_num_degrees(4),
53  m_num_svms(num_svms),
54  m_word_degree(word_degree_default, m_num_degrees, true, true),
55  m_cum_num_words(cum_num_words_default, m_num_degrees+1, true, true),
56  m_cum_num_words_array(m_cum_num_words.get_array()),
57  m_num_words(num_words_default, m_num_degrees, true, true),
58  m_num_words_array(m_num_words.get_array()),
59  m_mod_words(mod_words_default, m_num_svms, 2, true, true),
60  m_mod_words_array(m_mod_words.get_array()),
61  m_sign_words(sign_words_default, m_num_svms, true, true),
62  m_sign_words_array(m_sign_words.get_array()),
63  m_string_words(string_words_default, m_num_svms, true, true),
64  m_string_words_array(m_string_words.get_array()),
65  //m_svm_pos_start(m_num_degrees),
66  m_num_unique_words(m_num_degrees),
67  m_svm_arrays_clean(true),
68 
69  m_max_a_id(0), m_observation_matrix(1,1,1),
70  m_pos(1),
71  m_seq_len(0),
72  m_orf_info(1,2),
73  m_plif_list(1),
74  m_genestr(1), m_wordstr(NULL), m_dict_weights(1,1), m_segment_loss(1,1,2),
75  m_segment_ids(1),
76  m_segment_mask(1),
77  m_my_state_seq(1),
78  m_my_pos_seq(1),
79  m_my_scores(1),
80  m_my_losses(1),
81  m_scores(1),
82  m_states(1,1),
83  m_positions(1,1),
84 
85  m_seq_sparse1(NULL),
86  m_seq_sparse2(NULL),
87  m_plif_matrices(NULL),
88 
89  m_genestr_stop(1),
90  m_intron_list(NULL),
91  m_num_intron_plifs(0),
92  m_lin_feat(1,1), //by Jonas
93  m_raw_intensities(NULL),
94  m_probe_pos(NULL),
95  m_num_probes_cum(NULL),
96  m_num_lin_feat_plifs_cum(NULL),
97  m_num_raw_data(0),
98 
99  m_long_transitions(true),
100  m_long_transition_threshold(1000)
101 {
102  trans_list_forward = NULL ;
103  trans_list_forward_cnt = NULL ;
104  trans_list_forward_val = NULL ;
105  trans_list_forward_id = NULL ;
106  trans_list_len = 0 ;
107 
108  mem_initialized = true ;
109 
110  m_N=1;
111 
112  m_raw_intensities = NULL;
113  m_probe_pos = NULL;
114  m_num_probes_cum = SG_MALLOC(int32_t, 100);
115  m_num_probes_cum[0] = 0;
116  //m_use_tiling=false;
117  m_num_lin_feat_plifs_cum = SG_MALLOC(int32_t, 100);
119  m_num_raw_data = 0;
120 #ifdef ARRAY_STATISTICS
121  m_word_degree.set_array_name("word_degree");
122 #endif
123 
124  m_transition_matrix_a_id.set_array_name("transition_matrix_a_id");
125  m_transition_matrix_a.set_array_name("transition_matrix_a");
126  m_transition_matrix_a_deriv.set_array_name("transition_matrix_a_deriv");
127  m_mod_words.set_array_name("mod_words");
128  m_orf_info.set_array_name("orf_info");
129  m_segment_sum_weights.set_array_name("segment_sum_weights");
130  m_dict_weights.set_array_name("dict_weights");
131  m_states.set_array_name("states");
132  m_positions.set_array_name("positions");
133  m_lin_feat.set_array_name("lin_feat");
134 
135 
136  m_observation_matrix.set_array_name("m_observation_matrix");
137  m_segment_loss.set_array_name("m_segment_loss");
139 }
140 
142 {
143  if (trans_list_forward_cnt)
144  SG_FREE(trans_list_forward_cnt);
145  if (trans_list_forward)
146  {
147  for (int32_t i=0; i<trans_list_len; i++)
148  {
149  if (trans_list_forward[i])
150  SG_FREE(trans_list_forward[i]);
151  }
152  SG_FREE(trans_list_forward);
153  }
154  if (trans_list_forward_val)
155  {
156  for (int32_t i=0; i<trans_list_len; i++)
157  {
158  if (trans_list_forward_val[i])
159  SG_FREE(trans_list_forward_val[i]);
160  }
161  SG_FREE(trans_list_forward_val);
162  }
163  if (trans_list_forward_id)
164  {
165  for (int32_t i=0; i<trans_list_len; i++)
166  {
167  if (trans_list_forward_id[i])
168  SG_FREE(trans_list_forward_id[i]);
169  }
170  SG_FREE(trans_list_forward_id);
171  }
172  if (m_raw_intensities)
173  SG_FREE(m_raw_intensities);
174  if (m_probe_pos)
175  SG_FREE(m_probe_pos);
176  if (m_num_probes_cum)
177  SG_FREE(m_num_probes_cum);
179  SG_FREE(m_num_lin_feat_plifs_cum);
180 
181  delete m_intron_list;
182 
187 }
188 
191 {
192  return m_num_svms;
193 }
194 
196 {
197  int32_t length=m_genestr.get_dim1();
198 
199  m_genestr_stop.resize_array(length) ;
201  m_genestr_stop.set_array_name("genestr_stop") ;
202  {
203  for (int32_t i=0; i<length-2; i++)
204  if ((m_genestr[i]=='t' || m_genestr[i]=='T') &&
205  (((m_genestr[i+1]=='a' || m_genestr[i+1]=='A') &&
206  (m_genestr[i+2]=='a' || m_genestr[i+2]=='g' || m_genestr[i+2]=='A' || m_genestr[i+2]=='G')) ||
207  ((m_genestr[i+1]=='g'||m_genestr[i+1]=='G') && (m_genestr[i+2]=='a' || m_genestr[i+2]=='A') )))
208  {
209  m_genestr_stop.element(i)=true ;
210  }
211  else
212  m_genestr_stop.element(i)=false ;
213  m_genestr_stop.element(length-2)=false ;
214  m_genestr_stop.element(length-1)=false ;
215  }
216 }
217 
218 void CDynProg::set_num_states(int32_t p_N)
219 {
220  m_N=p_N ;
221 
229 
231 }
232 
234 {
235  return m_N;
236 }
237 
239  int32_t* probe_pos, float64_t* intensities, const int32_t num_probes)
240 {
241  m_num_raw_data++;
243 
244  int32_t* tmp_probe_pos = SG_MALLOC(int32_t, m_num_probes_cum[m_num_raw_data]);
245  float64_t* tmp_raw_intensities = SG_MALLOC(float64_t, m_num_probes_cum[m_num_raw_data]);
246 
247 
248  if (m_num_raw_data==1){
249  memcpy(tmp_probe_pos, probe_pos, num_probes*sizeof(int32_t));
250  memcpy(tmp_raw_intensities, intensities, num_probes*sizeof(float64_t));
251  //SG_PRINT("raw_intens:%f \n",*tmp_raw_intensities+2)
252  }else{
253  memcpy(tmp_probe_pos, m_probe_pos, m_num_probes_cum[m_num_raw_data-1]*sizeof(int32_t));
254  memcpy(tmp_raw_intensities, m_raw_intensities, m_num_probes_cum[m_num_raw_data-1]*sizeof(float64_t));
255  memcpy(tmp_probe_pos+m_num_probes_cum[m_num_raw_data-1], probe_pos, num_probes*sizeof(int32_t));
256  memcpy(tmp_raw_intensities+m_num_probes_cum[m_num_raw_data-1], intensities, num_probes*sizeof(float64_t));
257  }
258  SG_FREE(m_probe_pos);
259  SG_FREE(m_raw_intensities);
260  m_probe_pos = tmp_probe_pos; //SG_MALLOC(int32_t, num_probes);
261  m_raw_intensities = tmp_raw_intensities;//SG_MALLOC(float64_t, num_probes);
262 
263  //memcpy(m_probe_pos, probe_pos, num_probes*sizeof(int32_t));
264  //memcpy(m_raw_intensities, intensities, num_probes*sizeof(float64_t));
265 
266 }
267 
268 void CDynProg::init_content_svm_value_array(const int32_t p_num_svms)
269 {
270  m_lin_feat.resize_array(p_num_svms, m_seq_len);
271 
272  // initialize array
273  for (int s=0; s<p_num_svms; s++)
274  for (int p=0; p<m_seq_len; p++)
275  m_lin_feat.set_element(0.0, s, p) ;
276 }
277 
278 void CDynProg::resize_lin_feat(const int32_t num_new_feat)
279 {
280  int32_t dim1, dim2;
281  m_lin_feat.get_array_size(dim1, dim2);
283  ASSERT(dim2==m_seq_len) // == number of candidate positions
284 
285 
286 
287  float64_t* arr = m_lin_feat.get_array();
288  float64_t* tmp = SG_MALLOC(float64_t, (dim1+num_new_feat)*dim2);
289  memset(tmp, 0, (dim1+num_new_feat)*dim2*sizeof(float64_t)) ;
290  for(int32_t j=0;j<m_seq_len;j++)
291  for(int32_t k=0;k<m_num_lin_feat_plifs_cum[m_num_raw_data-1];k++)
292  tmp[j*(dim1+num_new_feat)+k] = arr[j*dim1+k];
293 
294  m_lin_feat.set_array(tmp, dim1+num_new_feat,dim2, true, true);// copy array and free it later
295  SG_FREE(tmp);
296 
297  /*for(int32_t j=0;j<5;j++)
298  {
299  for(int32_t k=0;k<m_num_lin_feat_plifs_cum[m_num_raw_data];k++)
300  {
301  SG_PRINT("(%i,%i)%f ",k,j,m_lin_feat.get_element(k,j))
302  }
303  SG_PRINT("\n")
304  }
305  m_lin_feat.get_array_size(dim1,dim2);
306  SG_PRINT("resize_lin_feat: dim1:%i, dim2:%i\n",dim1,dim2)*/
307 
308  //SG_PRINT("resize_lin_feat: done\n")
309 }
310 
312  CPlif** PEN, const int32_t* tiling_plif_ids, const int32_t num_tiling_plifs)
313 {
315  float64_t* tiling_plif = SG_MALLOC(float64_t, num_tiling_plifs);
318  svm_value[i]=0.0;
319  int32_t* tiling_rows = SG_MALLOC(int32_t, num_tiling_plifs);
320  for (int32_t i=0; i<num_tiling_plifs; i++)
321  {
322  tiling_plif[i]=0.0;
323  CPlif * plif = PEN[tiling_plif_ids[i]];
324  tiling_rows[i] = plif->get_use_svm();
325 
326  ASSERT(tiling_rows[i]-1==m_num_lin_feat_plifs_cum[m_num_raw_data-1]+i)
327  }
328  resize_lin_feat(num_tiling_plifs);
329 
330 
331  int32_t* p_tiling_pos = &m_probe_pos[m_num_probes_cum[m_num_raw_data-1]];
332  float64_t* p_tiling_data = &m_raw_intensities[m_num_probes_cum[m_num_raw_data-1]];
333  int32_t num=m_num_probes_cum[m_num_raw_data-1];
334 
335  for (int32_t pos_idx=0;pos_idx<m_seq_len;pos_idx++)
336  {
337  while (num<m_num_probes_cum[m_num_raw_data]&&*p_tiling_pos<m_pos[pos_idx])
338  {
339  for (int32_t i=0; i<num_tiling_plifs; i++)
340  {
341  svm_value[m_num_lin_feat_plifs_cum[m_num_raw_data-1]+i]=*p_tiling_data;
342  CPlif * plif = PEN[tiling_plif_ids[i]];
343  ASSERT(m_num_lin_feat_plifs_cum[m_num_raw_data-1]+i==plif->get_use_svm()-1)
344  plif->set_do_calc(true);
345  tiling_plif[i]+=plif->lookup_penalty(0,svm_value);
346  plif->set_do_calc(false);
347  }
348  p_tiling_data++;
349  p_tiling_pos++;
350  num++;
351  }
352  for (int32_t i=0; i<num_tiling_plifs; i++)
353  m_lin_feat.set_element(tiling_plif[i],tiling_rows[i]-1,pos_idx);
354  }
355  SG_FREE(svm_value);
356  SG_FREE(tiling_plif);
357  SG_FREE(tiling_rows);
358 }
359 
361 {
362  SG_FREE(m_wordstr);
363  m_wordstr=SG_MALLOC(uint16_t**, 5440);
364  int32_t k=0;
365  int32_t genestr_len=m_genestr.get_dim1();
366 
367  m_wordstr[k]=SG_MALLOC(uint16_t*, m_num_degrees);
368  for (int32_t j=0; j<m_num_degrees; j++)
369  {
370  m_wordstr[k][j]=NULL ;
371  {
372  m_wordstr[k][j]=SG_MALLOC(uint16_t, genestr_len);
373  for (int32_t i=0; i<genestr_len; i++)
374  switch (m_genestr[i])
375  {
376  case 'A':
377  case 'a': m_wordstr[k][j][i]=0 ; break ;
378  case 'C':
379  case 'c': m_wordstr[k][j][i]=1 ; break ;
380  case 'G':
381  case 'g': m_wordstr[k][j][i]=2 ; break ;
382  case 'T':
383  case 't': m_wordstr[k][j][i]=3 ; break ;
384  default: ASSERT(0)
385  }
387  }
388  }
389 }
390 
392 {
393  for (int32_t s=0; s<m_num_svms; s++)
394  m_lin_feat.set_element(0.0, s, 0);
395 
396  for (int32_t p=0 ; p<m_seq_len-1 ; p++)
397  {
398  int32_t from_pos = m_pos[p];
399  int32_t to_pos = m_pos[p+1];
400  float64_t* my_svm_values_unnormalized = SG_MALLOC(float64_t, m_num_svms);
401  //SG_PRINT("%i(%i->%i) ",p,from_pos, to_pos)
402 
403  ASSERT(from_pos<=m_genestr.get_dim1())
404  ASSERT(to_pos<=m_genestr.get_dim1())
405 
406  for (int32_t s=0; s<m_num_svms; s++)
407  my_svm_values_unnormalized[s]=0.0;//precomputed_svm_values.element(s,p);
408 
409  for (int32_t i=from_pos; i<to_pos; i++)
410  {
411  for (int32_t j=0; j<m_num_degrees; j++)
412  {
413  uint16_t word = m_wordstr[0][j][i] ;
414  for (int32_t s=0; s<m_num_svms; s++)
415  {
416  // check if this k-mer should be considered for this SVM
417  if (m_mod_words.get_element(s,0)==3 && i%3!=m_mod_words.get_element(s,1))
418  continue;
419  my_svm_values_unnormalized[s] += m_dict_weights[(word+m_cum_num_words_array[j])+s*m_cum_num_words_array[m_num_degrees]] ;
420  }
421  }
422  }
423  for (int32_t s=0; s<m_num_svms; s++)
424  {
425  float64_t prev = m_lin_feat.get_element(s, p);
426  //SG_PRINT("elem (%i, %i, %f)\n", s, p, prev)
427  if (prev<-1e20 || prev>1e20)
428  {
429  SG_ERROR("initialization missing (%i, %i, %f)\n", s, p, prev)
430  prev=0 ;
431  }
432  m_lin_feat.set_element(prev + my_svm_values_unnormalized[s], s, p+1);
433  }
434  SG_FREE(my_svm_values_unnormalized);
435  }
436  //for (int32_t j=0; j<m_num_degrees; j++)
437  // SG_FREE(m_wordstr[0][j]);
438  //SG_FREE(m_wordstr[0]);
439 }
440 
442 {
443  if (!(p.vlen==m_N))
444  SG_ERROR("length of start prob vector p (%i) is not equal to the number of states (%i), N: %i\n",p.vlen, m_N)
445 
447 }
448 
450 {
451  if (!(q.vlen==m_N))
452  SG_ERROR("length of end prob vector q (%i) is not equal to the number of states (%i), N: %i\n",q.vlen, m_N)
454 }
455 
457 {
458  ASSERT(a.num_cols==m_N)
459  ASSERT(a.num_rows==m_N)
460  m_transition_matrix_a.set_array(a.matrix, m_N, m_N, true, true);
462 }
463 
465 {
466  ASSERT(a.num_cols==m_N)
467  ASSERT(a.num_rows==m_N)
469  m_max_a_id = 0;
470  for (int32_t i=0; i<m_N; i++)
471  {
472  for (int32_t j=0; j<m_N; j++)
474  }
475 }
476 
478 {
479  int32_t num_trans=a_trans.num_rows;
480  int32_t num_cols=a_trans.num_cols;
481 
482  //CMath::display_matrix(a_trans.matrix,num_trans, num_cols,"a_trans");
483 
484  if (!((num_cols==3) || (num_cols==4)))
485  SG_ERROR("!((num_cols==3) || (num_cols==4)), num_cols: %i\n",num_cols)
486 
487  SG_FREE(trans_list_forward);
488  SG_FREE(trans_list_forward_cnt);
489  SG_FREE(trans_list_forward_val);
490  SG_FREE(trans_list_forward_id);
491 
492  trans_list_forward = NULL ;
493  trans_list_forward_cnt = NULL ;
494  trans_list_forward_val = NULL ;
495  trans_list_len = 0 ;
496 
499 
500  mem_initialized = true ;
501 
502  trans_list_forward_cnt=NULL ;
503  trans_list_len = m_N ;
504  trans_list_forward = SG_MALLOC(T_STATES*, m_N);
505  trans_list_forward_cnt = SG_MALLOC(T_STATES, m_N);
506  trans_list_forward_val = SG_MALLOC(float64_t*, m_N);
507  trans_list_forward_id = SG_MALLOC(int32_t*, m_N);
508 
509  int32_t start_idx=0;
510  for (int32_t j=0; j<m_N; j++)
511  {
512  int32_t old_start_idx=start_idx;
513 
514  while (start_idx<num_trans && a_trans.matrix[start_idx+num_trans]==j)
515  {
516  start_idx++;
517 
518  if (start_idx>1 && start_idx<num_trans)
519  ASSERT(a_trans.matrix[start_idx+num_trans-1] <= a_trans.matrix[start_idx+num_trans])
520  }
521 
522  if (start_idx>1 && start_idx<num_trans)
523  ASSERT(a_trans.matrix[start_idx+num_trans-1] <= a_trans.matrix[start_idx+num_trans])
524 
525  int32_t len=start_idx-old_start_idx;
526  ASSERT(len>=0)
527 
528  trans_list_forward_cnt[j] = 0 ;
529 
530  if (len>0)
531  {
532  trans_list_forward[j] = SG_MALLOC(T_STATES, len);
533  trans_list_forward_val[j] = SG_MALLOC(float64_t, len);
534  trans_list_forward_id[j] = SG_MALLOC(int32_t, len);
535  }
536  else
537  {
538  trans_list_forward[j] = NULL;
539  trans_list_forward_val[j] = NULL;
540  trans_list_forward_id[j] = NULL;
541  }
542  }
543 
544  for (int32_t i=0; i<num_trans; i++)
545  {
546  int32_t from_state = (int32_t)a_trans.matrix[i] ;
547  int32_t to_state = (int32_t)a_trans.matrix[i+num_trans] ;
548  float64_t val = a_trans.matrix[i+num_trans*2] ;
549  int32_t id = 0 ;
550  if (num_cols==4)
551  id = (int32_t)a_trans.matrix[i+num_trans*3] ;
552  //SG_DEBUG("id=%i\n", id)
553 
554  ASSERT(to_state>=0 && to_state<m_N)
555  ASSERT(from_state>=0 && from_state<m_N)
556 
557  trans_list_forward[to_state][trans_list_forward_cnt[to_state]]=from_state ;
558  trans_list_forward_val[to_state][trans_list_forward_cnt[to_state]]=val ;
559  trans_list_forward_id[to_state][trans_list_forward_cnt[to_state]]=id ;
560  trans_list_forward_cnt[to_state]++ ;
561  m_transition_matrix_a.element(from_state, to_state) = val ;
562  m_transition_matrix_a_id.element(from_state, to_state) = id ;
563  //SG_PRINT("from_state:%i to_state:%i trans_matrix_a_id:%i \n",from_state, to_state,m_transition_matrix_a_id.element(from_state, to_state))
564  } ;
565 
566  m_max_a_id = 0 ;
567  for (int32_t i=0; i<m_N; i++)
568  for (int32_t j=0; j<m_N; j++)
569  {
570  //if (m_transition_matrix_a_id.element(i,j))
571  //SG_DEBUG("(%i,%i)=%i\n", i,j, m_transition_matrix_a_id.element(i,j))
573  }
574  //SG_DEBUG("m_max_a_id=%i\n", m_max_a_id)
575 }
576 
577 
579 {
580  //for (int32_t i=0; i<mod_words_input.num_cols; i++)
581  //{
582  // for (int32_t j=0; j<mod_words_input.num_rows; j++)
583  // SG_PRINT("%i ",mod_words_input[i*mod_words_input.num_rows+j])
584  // SG_PRINT("\n")
585  //}
586  m_svm_arrays_clean=false ;
587 
588  ASSERT(m_num_svms==mod_words_input.num_rows)
589  ASSERT(mod_words_input.num_cols==2)
590 
591  m_mod_words.set_array(mod_words_input.matrix, mod_words_input.num_rows, 2, true, true) ;
593 
594  /*SG_DEBUG("m_mod_words=[")
595  for (int32_t i=0; i<mod_words_input.num_rows; i++)
596  SG_DEBUG("%i, ", p_mod_words_array[i])
597  SG_DEBUG("]\n") */
598 }
599 
601 {
602  //SG_DEBUG("wd_dim1=%d, m_cum_num_words=%d, m_num_words=%d, m_svm_pos_start=%d, num_uniq_w=%d, mod_words_dims=(%d,%d), sign_w=%d,string_w=%d\n m_num_degrees=%d, m_num_svms=%d, m_num_strings=%d", m_word_degree.get_dim1(), m_cum_num_words.get_dim1(), m_num_words.get_dim1(), m_svm_pos_start.get_dim1(), m_num_unique_words.get_dim1(), m_mod_words.get_dim1(), m_mod_words.get_dim2(), m_sign_words.get_dim1(), m_string_words.get_dim1(), m_num_degrees, m_num_svms, m_num_strings)
606  //(word_used.get_dim1()==m_num_degrees) &&
607  //(word_used.get_dim2()==m_num_words[m_num_degrees-1]) &&
608  //(word_used.get_dim3()==m_num_strings) &&
609  // (svm_values_unnormalized.get_dim1()==m_num_degrees) &&
610  // (svm_values_unnormalized.get_dim2()==m_num_svms) &&
611  //(m_svm_pos_start.get_dim1()==m_num_degrees) &&
614  (m_mod_words.get_dim2()==2) &&
617  {
618  m_svm_arrays_clean=true ;
619  return true ;
620  }
621  else
622  {
625  (m_mod_words.get_dim2()==2) &&
628  SG_PRINT("OK\n")
629  else
630  SG_PRINT("not OK\n")
631 
633  SG_WARNING("SVM array: word_degree.get_dim1()!=m_num_degrees")
635  SG_WARNING("SVM array: m_cum_num_words.get_dim1()!=m_num_degrees+1")
637  SG_WARNING("SVM array: m_num_words.get_dim1()==m_num_degrees")
638  //if (!(m_svm_pos_start.get_dim1()==m_num_degrees))
639  // SG_WARNING("SVM array: m_svm_pos_start.get_dim1()!=m_num_degrees")
641  SG_WARNING("SVM array: m_num_unique_words.get_dim1()!=m_num_degrees")
642  if (!(m_mod_words.get_dim1()==m_num_svms))
643  SG_WARNING("SVM array: m_mod_words.get_dim1()!=num_svms")
644  if (!(m_mod_words.get_dim2()==2))
645  SG_WARNING("SVM array: m_mod_words.get_dim2()!=2")
646  if (!(m_sign_words.get_dim1()==m_num_svms))
647  SG_WARNING("SVM array: m_sign_words.get_dim1()!=num_svms")
649  SG_WARNING("SVM array: m_string_words.get_dim1()!=num_svms")
650 
651  m_svm_arrays_clean=false ;
652  return false ;
653  }
654 }
655 
657 {
658  if (seq.num_dims!=3)
659  SG_ERROR("Expected 3-dimensional Matrix\n")
660 
661  int32_t N=seq.dims[0];
662  int32_t cand_pos=seq.dims[1];
663  int32_t max_num_features=seq.dims[2];
664 
665  if (!m_svm_arrays_clean)
666  {
667  SG_ERROR("SVM arrays not clean")
668  return ;
669  } ;
670 
671  ASSERT(N==m_N)
672  ASSERT(cand_pos==m_seq_len)
675 
676  m_observation_matrix.set_array(seq.array, N, m_seq_len, max_num_features, true, true) ;
677 }
679 {
680  return m_seq_len;
681 }
682 
684 {
685  ASSERT(seg_path.num_rows==2)
686  ASSERT(seg_path.num_cols==m_seq_len)
687 
688  if (seg_path.matrix!=NULL)
689  {
690  int32_t *segment_ids = SG_MALLOC(int32_t, m_seq_len);
691  float64_t *segment_mask = SG_MALLOC(float64_t, m_seq_len);
692  for (int32_t i=0; i<m_seq_len; i++)
693  {
694  segment_ids[i] = (int32_t)seg_path.matrix[2*i] ;
695  segment_mask[i] = seg_path.matrix[2*i+1] ;
696  }
697  best_path_set_segment_ids_mask(segment_ids, segment_mask, m_seq_len) ;
698  SG_FREE(segment_ids);
699  SG_FREE(segment_mask);
700  }
701  else
702  {
703  int32_t *izeros = SG_MALLOC(int32_t, m_seq_len);
704  float64_t *dzeros = SG_MALLOC(float64_t, m_seq_len);
705  for (int32_t i=0; i<m_seq_len; i++)
706  {
707  izeros[i]=0 ;
708  dzeros[i]=0.0 ;
709  }
710  best_path_set_segment_ids_mask(izeros, dzeros, m_seq_len) ;
711  SG_FREE(izeros);
712  SG_FREE(dzeros);
713  }
714 }
715 
717 {
718  m_pos.set_array(pos.vector, pos.vlen, true, true) ;
719  m_seq_len = pos.vlen;
720 }
721 
723 {
724  if (orf_info.num_cols!=2)
725  SG_ERROR("orf_info size incorrect %i!=2\n", orf_info.num_cols)
726 
727  m_orf_info.set_array(orf_info.matrix, orf_info.num_rows, orf_info.num_cols, true, true) ;
728  m_orf_info.set_array_name("orf_info") ;
729 }
730 
732 {
733  if ((!seq_sparse1 && seq_sparse2) || (seq_sparse1 && !seq_sparse2))
734  SG_ERROR("Sparse features must either both be NULL or both NON-NULL\n")
735 
738 
739  m_seq_sparse1=seq_sparse1;
740  m_seq_sparse2=seq_sparse2;
743 }
744 
746 {
748 
749  m_plif_matrices=pm;
750 
752 }
753 
755 {
756  ASSERT(genestr.vector)
757  ASSERT(genestr.vlen>0)
758 
759  m_genestr.set_array(genestr.vector, genestr.vlen, true, true) ;
760 }
761 
762 void CDynProg::set_my_state_seq(int32_t* my_state_seq)
763 {
764  ASSERT(my_state_seq && m_seq_len>0)
766  for (int32_t i=0; i<m_seq_len; i++)
767  m_my_state_seq[i]=my_state_seq[i];
768 }
769 
770 void CDynProg::set_my_pos_seq(int32_t* my_pos_seq)
771 {
772  ASSERT(my_pos_seq && m_seq_len>0)
774  for (int32_t i=0; i<m_seq_len; i++)
775  m_my_pos_seq[i]=my_pos_seq[i];
776 }
777 
779 {
780  if (m_num_svms!=dictionary_weights.num_cols)
781  {
782  SG_ERROR("m_dict_weights array does not match num_svms=%i!=%i\n",
783  m_num_svms, dictionary_weights.num_cols) ;
784  }
785 
786  m_dict_weights.set_array(dictionary_weights.matrix, dictionary_weights.num_rows, m_num_svms, true, true) ;
787 
788  // initialize, so it does not bother when not used
795 }
796 
798 {
799  int32_t m=segment_loss.num_rows;
800  int32_t n=segment_loss.num_cols;
801  // here we need two matrices. Store it in one: 2N x N
802  if (2*m!=n)
803  SG_ERROR("segment_loss should be 2 x quadratic matrix: %i!=%i\n", 2*m, n)
804 
805  if (m!=m_max_a_id+1)
806  SG_ERROR("segment_loss size should match m_max_a_id: %i!=%i\n", m, m_max_a_id+1)
807 
808  m_segment_loss.set_array(segment_loss.matrix, m, n/2, 2, true, true) ;
809  /*for (int32_t i=0; i<n; i++)
810  for (int32_t j=0; j<n; j++)
811  SG_DEBUG("loss(%i,%i)=%f\n", i,j, m_segment_loss.element(0,i,j)) */
812 }
813 
815  int32_t* segment_ids, float64_t* segment_mask, int32_t m)
816 {
817 
819  SG_ERROR("size of segment_ids or segment_mask (%i) does not match the size of the feature matrix (%i)", m, m_observation_matrix.get_dim2())
820  int32_t max_id = 0;
821  for (int32_t i=1;i<m;i++)
822  max_id = CMath::max(max_id,segment_ids[i]);
823  //SG_PRINT("max_id: %i, m:%i\n",max_id, m)
824  m_segment_ids.set_array(segment_ids, m, true, true) ;
825  m_segment_ids.set_array_name("m_segment_ids");
826  m_segment_mask.set_array(segment_mask, m, true, true) ;
827  m_segment_mask.set_array_name("m_segment_mask");
828 
832 }
833 
835 {
837  memcpy(scores.vector,m_scores.get_array(), sizeof(float64_t)*(m_scores.get_dim1()));
838 
839  return scores;
840 }
841 
843 {
845 
846  int32_t sz = sizeof(int32_t)*( m_states.get_dim1() * m_states.get_dim2() );
847  memcpy(states.matrix ,m_states.get_array(),sz);
848 
849  return states;
850 }
851 
853 {
855 
856  int32_t sz = sizeof(int32_t)*(m_positions.get_dim1()*m_positions.get_dim2());
857  memcpy(positions.matrix, m_positions.get_array(),sz);
858 
859  return positions;
860 }
861 
862 void CDynProg::get_path_scores(float64_t** scores, int32_t* seq_len)
863 {
864  ASSERT(scores && seq_len)
865 
866  *seq_len=m_my_scores.get_dim1();
867 
868  int32_t sz = sizeof(float64_t)*(*seq_len);
869 
870  *scores = SG_MALLOC(float64_t, *seq_len);
871  ASSERT(*scores)
872 
873  memcpy(*scores,m_my_scores.get_array(),sz);
874 }
875 
876 void CDynProg::get_path_losses(float64_t** losses, int32_t* seq_len)
877 {
878  ASSERT(losses && seq_len)
879 
880  *seq_len=m_my_losses.get_dim1();
881 
882  int32_t sz = sizeof(float64_t)*(*seq_len);
883 
884  *losses = SG_MALLOC(float64_t, *seq_len);
885  ASSERT(*losses)
886 
887  memcpy(*losses,m_my_losses.get_array(),sz);
888 }
889 
891 
893  int32_t orf_from, int32_t orf_to, int32_t start, int32_t &last_pos,
894  int32_t to)
895 {
896 #ifdef DYNPROG_TIMING_DETAIL
897  MyTime.start() ;
898 #endif
899 
900  if (start<0)
901  start=0 ;
902  if (to<0)
903  to=0 ;
904 
905  int32_t orf_target = orf_to-orf_from ;
906  if (orf_target<0) orf_target+=3 ;
907 
908  int32_t pos ;
909  if (last_pos==to)
910  pos = to-orf_to-3 ;
911  else
912  pos=last_pos ;
913 
914  if (pos<0)
915  {
916 #ifdef DYNPROG_TIMING_DETAIL
917  MyTime.stop() ;
918  orf_time += MyTime.time_diff_sec() ;
919 #endif
920  return true ;
921  }
922 
923  for (; pos>=start; pos-=3)
924  if (m_genestr_stop[pos])
925  {
926 #ifdef DYNPROG_TIMING_DETAIL
927  MyTime.stop() ;
928  orf_time += MyTime.time_diff_sec() ;
929 #endif
930  return false ;
931  }
932 
933 
934  last_pos = CMath::min(pos+3,to-orf_to-3) ;
935 
936 #ifdef DYNPROG_TIMING_DETAIL
937  MyTime.stop() ;
938  orf_time += MyTime.time_diff_sec() ;
939 #endif
940  return true ;
941 }
942 
943 void CDynProg::compute_nbest_paths(int32_t max_num_signals, bool use_orf,
944  int16_t nbest, bool with_loss, bool with_multiple_sequences)
945  {
946 
947  //FIXME we need checks here if all the fields are of right size
948  //SG_PRINT("m_seq_len: %i\n", m_seq_len)
949  //SG_PRINT("m_pos[0]: %i\n", m_pos[0])
950  //SG_PRINT("\n")
951 
952  //FIXME these variables can go away when compute_nbest_paths uses them
953  //instead of the local pointers below
954  const float64_t* seq_array = m_observation_matrix.get_array();
955  m_scores.resize_array(nbest) ;
958 
959  for (int32_t i=0; i<nbest; i++)
960  {
961  m_scores[i]=-1;
962  for (int32_t j=0; j<m_observation_matrix.get_dim2(); j++)
963  {
964  m_states.element(i,j)=-1;
965  m_positions.element(i,j)=-1;
966  }
967  }
968  float64_t* prob_nbest=m_scores.get_array();
969  int32_t* my_state_seq=m_states.get_array();
970  int32_t* my_pos_seq=m_positions.get_array();
971 
972  CPlifBase** Plif_matrix=m_plif_matrices->get_plif_matrix();
973  CPlifBase** Plif_state_signals=m_plif_matrices->get_state_signals();
974  //END FIXME
975 
976 
977 #ifdef DYNPROG_TIMING
978  segment_init_time = 0.0 ;
979  segment_pos_time = 0.0 ;
980  segment_extend_time = 0.0 ;
981  segment_clean_time = 0.0 ;
982  orf_time = 0.0 ;
983  svm_init_time = 0.0 ;
984  svm_pos_time = 0.0 ;
985  svm_clean_time = 0.0 ;
986  inner_loop_time = 0.0 ;
987  content_svm_values_time = 0.0 ;
988  content_plifs_time = 0.0 ;
989  inner_loop_max_time = 0.0 ;
990  long_transition_time = 0.0 ;
991 
992  MyTime2.start() ;
993 #endif
994 
995  if (!m_svm_arrays_clean)
996  {
997  SG_ERROR("SVM arrays not clean")
998  return ;
999  }
1000 
1001 #ifdef DYNPROG_DEBUG
1002  m_transition_matrix_a.set_array_name("transition_matrix");
1007  //SG_PRINT("use_orf = %i\n", use_orf)
1008 #endif
1009 
1010  int32_t max_look_back = 1000 ;
1011  bool use_svm = false ;
1012 
1013  SG_DEBUG("m_N:%i, m_seq_len:%i, max_num_signals:%i\n",m_N, m_seq_len, max_num_signals)
1014 
1015  //for (int32_t i=0;i<m_N*m_seq_len*max_num_signals;i++)
1016  // SG_PRINT("(%i)%0.2f ",i,seq_array[i])
1017 
1018  CDynamicObjectArray PEN((CSGObject**) Plif_matrix, m_N, m_N, false, false) ; // 2d, CPlifBase*
1019  PEN.set_array_name("PEN");
1020 
1021  CDynamicObjectArray PEN_state_signals((CSGObject**) Plif_state_signals, m_N, max_num_signals, false, false) ; // 2d, CPlifBase*
1022  PEN_state_signals.set_array_name("state_signals");
1023 
1024  CDynamicArray<float64_t> seq(m_N, m_seq_len) ; // 2d
1025  seq.set_array_name("seq") ;
1026  seq.set_const(0) ;
1027 
1028 #ifdef DYNPROG_DEBUG
1029  SG_PRINT("m_num_raw_data: %i\n",m_num_raw_data)
1030  SG_PRINT("m_num_intron_plifs: %i\n", m_num_intron_plifs)
1031  SG_PRINT("m_num_svms: %i\n", m_num_svms)
1032  SG_PRINT("m_num_lin_feat_plifs_cum: ")
1033  for (int i=0; i<=m_num_raw_data; i++)
1035  SG_PRINT("\n")
1036 #endif
1037 
1039  { // initialize svm_svalue
1041  svm_value[s]=0 ;
1042  }
1043 
1044  { // convert seq_input to seq
1045  // this is independent of the svm values
1046 
1047  CDynamicArray<float64_t> *seq_input=NULL ; // 3d
1048  if (seq_array!=NULL)
1049  {
1050  //SG_PRINT("using dense seq_array\n")
1051 
1052  seq_input=new CDynamicArray<float64_t>(seq_array, m_N, m_seq_len, max_num_signals) ;
1053  seq_input->set_array_name("seq_input") ;
1054  //seq_input.display_array() ;
1055 
1056  ASSERT(m_seq_sparse1==NULL)
1057  ASSERT(m_seq_sparse2==NULL)
1058  } else
1059  {
1060  SG_PRINT("using sparse seq_array\n")
1061 
1062  ASSERT(m_seq_sparse1!=NULL)
1063  ASSERT(m_seq_sparse2!=NULL)
1064  ASSERT(max_num_signals==2)
1065  }
1066 
1067  for (int32_t i=0; i<m_N; i++)
1068  for (int32_t j=0; j<m_seq_len; j++)
1069  seq.element(i,j) = 0 ;
1070 
1071  for (int32_t i=0; i<m_N; i++)
1072  for (int32_t j=0; j<m_seq_len; j++)
1073  for (int32_t k=0; k<max_num_signals; k++)
1074  {
1075  if ((PEN_state_signals.element(i,k)==NULL) && (k==0))
1076  {
1077  // no plif
1078  if (seq_input!=NULL)
1079  seq.element(i,j) = seq_input->element(i,j,k) ;
1080  else
1081  {
1082  if (k==0)
1083  seq.element(i,j) = m_seq_sparse1->get_feature(i,j) ;
1084  if (k==1)
1085  seq.element(i,j) = m_seq_sparse2->get_feature(i,j) ;
1086  }
1087  break ;
1088  }
1089  if (PEN_state_signals.element(i,k)!=NULL)
1090  {
1091  if (seq_input!=NULL)
1092  {
1093  // just one plif
1094  if (CMath::is_finite(seq_input->element(i,j,k)))
1095  seq.element(i,j) += ((CPlifBase*) PEN_state_signals.element(i,k))->lookup_penalty(seq_input->element(i,j,k), svm_value) ;
1096  else
1097  // keep infinity values
1098  seq.element(i,j) = seq_input->element(i, j, k) ;
1099  }
1100  else
1101  {
1102  if (k==0)
1103  {
1104  // just one plif
1106  seq.element(i,j) += ((CPlifBase*) PEN_state_signals.element(i,k))->lookup_penalty(m_seq_sparse1->get_feature(i,j), svm_value) ;
1107  else
1108  // keep infinity values
1109  seq.element(i,j) = m_seq_sparse1->get_feature(i, j) ;
1110  }
1111  if (k==1)
1112  {
1113  // just one plif
1115  seq.element(i,j) += ((CPlifBase*) PEN_state_signals.element(i,k))->lookup_penalty(m_seq_sparse2->get_feature(i,j), svm_value) ;
1116  else
1117  // keep infinity values
1118  seq.element(i,j) = m_seq_sparse2->get_feature(i, j) ;
1119  }
1120  }
1121  }
1122  else
1123  break ;
1124  }
1125  delete seq_input;
1126  SG_FREE(svm_value);
1127  }
1128 
1129  // allow longer transitions than look_back
1130  bool long_transitions = m_long_transitions ;
1131  CDynamicArray<int32_t> long_transition_content_start_position(m_N,m_N) ; // 2d
1132  long_transition_content_start_position.set_array_name("long_transition_content_start_position");
1133 #ifdef DYNPROG_DEBUG
1134  CDynamicArray<int32_t> long_transition_content_end_position(m_N,m_N) ; // 2d
1135  long_transition_content_end_position.set_array_name("long_transition_content_end_position");
1136 #endif
1137  CDynamicArray<int32_t> long_transition_content_start(m_N,m_N) ; // 2d
1138  long_transition_content_start.set_array_name("long_transition_content_start");
1139 
1140  CDynamicArray<float64_t> long_transition_content_scores(m_N,m_N) ; // 2d
1141  long_transition_content_scores.set_array_name("long_transition_content_scores");
1142 #ifdef DYNPROG_DEBUG
1143 
1144  CDynamicArray<float64_t> long_transition_content_scores_pen(m_N,m_N) ; // 2d
1145  long_transition_content_scores_pen.set_array_name("long_transition_content_scores_pen");
1146 
1147  CDynamicArray<float64_t> long_transition_content_scores_prev(m_N,m_N) ; // 2d
1148  long_transition_content_scores_prev.set_array_name("long_transition_content_scores_prev");
1149 
1150  CDynamicArray<float64_t> long_transition_content_scores_elem(m_N,m_N) ; // 2d
1151  long_transition_content_scores_elem.set_array_name("long_transition_content_scores_elem");
1152 #endif
1153  CDynamicArray<float64_t> long_transition_content_scores_loss(m_N,m_N) ; // 2d
1154  long_transition_content_scores_loss.set_array_name("long_transition_content_scores_loss");
1155 
1156  if (nbest!=1)
1157  {
1158  SG_ERROR("Long transitions are not supported for nbest!=1")
1159  long_transitions = false ;
1160  }
1161  long_transition_content_scores.set_const(-CMath::INFTY);
1162 #ifdef DYNPROG_DEBUG
1163  long_transition_content_scores_pen.set_const(0) ;
1164  long_transition_content_scores_elem.set_const(0) ;
1165  long_transition_content_scores_prev.set_const(0) ;
1166 #endif
1167  if (with_loss)
1168  long_transition_content_scores_loss.set_const(0) ;
1169  long_transition_content_start.set_const(0) ;
1170  long_transition_content_start_position.set_const(0) ;
1171 #ifdef DYNPROG_DEBUG
1172  long_transition_content_end_position.set_const(0) ;
1173 #endif
1174 
1176  { // initialize svm_svalue
1178  svm_value[s]=0 ;
1179  }
1180 
1181  CDynamicArray<int32_t> look_back(m_N,m_N) ; // 2d
1182  look_back.set_array_name("look_back");
1183  //CDynamicArray<int32_t> look_back_orig(m_N,m_N) ;
1184  //look_back.set_array_name("look_back_orig");
1185 
1186 
1187  { // determine maximal length of look-back
1188  for (int32_t i=0; i<m_N; i++)
1189  for (int32_t j=0; j<m_N; j++)
1190  {
1191  look_back.set_element(INT_MAX, i, j) ;
1192  //look_back_orig.set_element(INT_MAX, i, j) ;
1193  }
1194 
1195  for (int32_t j=0; j<m_N; j++)
1196  {
1197  // only consider transitions that are actually allowed
1198  const T_STATES num_elem = trans_list_forward_cnt[j] ;
1199  const T_STATES *elem_list = trans_list_forward[j] ;
1200 
1201  for (int32_t i=0; i<num_elem; i++)
1202  {
1203  T_STATES ii = elem_list[i] ;
1204 
1205  CPlifBase *penij=(CPlifBase*) PEN.element(j, ii) ;
1206  if (penij==NULL)
1207  {
1208  if (long_transitions)
1209  {
1210  look_back.set_element(m_long_transition_threshold, j, ii) ;
1211  //look_back_orig.set_element(m_long_transition_max, j, ii) ;
1212  }
1213  continue ;
1214  }
1215 
1216  /* if the transition is an ORF or we do computation with loss, we have to disable long transitions */
1217  if ((m_orf_info.element(ii,0)!=-1) || (m_orf_info.element(j,1)!=-1) || (!long_transitions))
1218  {
1219  look_back.set_element(CMath::ceil(penij->get_max_value()), j, ii) ;
1220  //look_back_orig.set_element(CMath::ceil(penij->get_max_value()), j, ii) ;
1221  if (CMath::ceil(penij->get_max_value()) > max_look_back)
1222  {
1223  SG_DEBUG("%d %d -> value: %f\n", ii,j,penij->get_max_value())
1224  max_look_back = (int32_t) (CMath::ceil(penij->get_max_value()));
1225  }
1226  }
1227  else
1228  {
1229  look_back.set_element(CMath::min( (int32_t)CMath::ceil(penij->get_max_value()), m_long_transition_threshold ), j, ii) ;
1230  //look_back_orig.set_element( (int32_t)CMath::ceil(penij->get_max_value()), j, ii) ;
1231  }
1232 
1233  if (penij->uses_svm_values())
1234  use_svm=true ;
1235  }
1236  }
1237  /* make sure max_look_back is at least as long as a long transition */
1238  if (long_transitions)
1239  max_look_back = CMath::max(m_long_transition_threshold, max_look_back) ;
1240 
1241  /* make sure max_look_back is not longer than the whole string */
1242  max_look_back = CMath::min(m_genestr.get_dim1(), max_look_back) ;
1243 
1244  int32_t num_long_transitions = 0 ;
1245  for (int32_t i=0; i<m_N; i++)
1246  for (int32_t j=0; j<m_N; j++)
1247  {
1248  if (look_back.get_element(i,j)==m_long_transition_threshold)
1249  num_long_transitions++ ;
1250  if (look_back.get_element(i,j)==INT_MAX)
1251  {
1252  if (long_transitions)
1253  {
1254  look_back.set_element(m_long_transition_threshold, i, j) ;
1255  //look_back_orig.set_element(m_long_transition_max, i, j) ;
1256  }
1257  else
1258  {
1259  look_back.set_element(max_look_back, i, j) ;
1260  //look_back_orig.set_element(m_long_transition_max, i, j) ;
1261  }
1262  }
1263  }
1264  SG_DEBUG("Using %i long transitions\n", num_long_transitions)
1265  }
1266  //SG_PRINT("max_look_back: %i \n", max_look_back)
1267 
1268  //SG_PRINT("use_svm=%i, genestr_len: \n", use_svm, m_genestr.get_dim1())
1269  SG_DEBUG("use_svm=%i\n", use_svm)
1270 
1271  SG_DEBUG("maxlook: %d m_N: %d nbest: %d \n", max_look_back, m_N, nbest)
1272  const int32_t look_back_buflen = (max_look_back*m_N+1)*nbest ;
1273  SG_DEBUG("look_back_buflen=%i\n", look_back_buflen)
1274  /*const float64_t mem_use = (float64_t)(m_seq_len*m_N*nbest*(sizeof(T_STATES)+sizeof(int16_t)+sizeof(int32_t))+
1275  look_back_buflen*(2*sizeof(float64_t)+sizeof(int32_t))+
1276  m_seq_len*(sizeof(T_STATES)+sizeof(int32_t))+
1277  m_genestr.get_dim1()*sizeof(bool))/(1024*1024);*/
1278 
1279  //bool is_big = (mem_use>200) || (m_seq_len>5000) ;
1280 
1281  /*if (is_big)
1282  {
1283  SG_DEBUG("calling compute_nbest_paths: m_seq_len=%i, m_N=%i, lookback=%i nbest=%i\n",
1284  m_seq_len, m_N, max_look_back, nbest) ;
1285  SG_DEBUG("allocating %1.2fMB of memory\n",
1286  mem_use) ;
1287  }*/
1288  ASSERT(nbest<32000)
1289 
1290  CDynamicArray<float64_t> delta(m_seq_len, m_N, nbest) ; // 3d
1291  delta.set_array_name("delta");
1292  float64_t* delta_array = delta.get_array() ;
1293  //delta.set_const(0) ;
1294 
1295  CDynamicArray<T_STATES> psi(m_seq_len, m_N, nbest) ; // 3d
1296  psi.set_array_name("psi");
1297  //psi.set_const(0) ;
1298 
1299  CDynamicArray<int16_t> ktable(m_seq_len, m_N, nbest) ; // 3d
1300  ktable.set_array_name("ktable");
1301  //ktable.set_const(0) ;
1302 
1303  CDynamicArray<int32_t> ptable(m_seq_len, m_N, nbest) ; // 3d
1304  ptable.set_array_name("ptable");
1305  //ptable.set_const(0) ;
1306 
1307  CDynamicArray<float64_t> delta_end(nbest) ;
1308  delta_end.set_array_name("delta_end");
1309  //delta_end.set_const(0) ;
1310 
1311  CDynamicArray<T_STATES> path_ends(nbest) ;
1312  path_ends.set_array_name("path_ends");
1313  //path_ends.set_const(0) ;
1314 
1315  CDynamicArray<int16_t> ktable_end(nbest) ;
1316  ktable_end.set_array_name("ktable_end");
1317  //ktable_end.set_const(0) ;
1318 
1319  float64_t * fixedtempvv=SG_MALLOC(float64_t, look_back_buflen);
1320  memset(fixedtempvv, 0, look_back_buflen*sizeof(float64_t)) ;
1321  int32_t * fixedtempii=SG_MALLOC(int32_t, look_back_buflen);
1322  memset(fixedtempii, 0, look_back_buflen*sizeof(int32_t)) ;
1323 
1324  CDynamicArray<float64_t> oldtempvv(look_back_buflen) ;
1325  oldtempvv.set_array_name("oldtempvv");
1326 
1327  CDynamicArray<float64_t> oldtempvv2(look_back_buflen) ;
1328  oldtempvv2.set_array_name("oldtempvv2");
1329  //oldtempvv.set_const(0) ;
1330  //oldtempvv.display_size() ;
1331 
1332  CDynamicArray<int32_t> oldtempii(look_back_buflen) ;
1333  oldtempii.set_array_name("oldtempii");
1334 
1335  CDynamicArray<int32_t> oldtempii2(look_back_buflen) ;
1336  oldtempii2.set_array_name("oldtempii2");
1337  //oldtempii.set_const(0) ;
1338 
1339  CDynamicArray<T_STATES> state_seq(m_seq_len) ;
1340  state_seq.set_array_name("state_seq");
1341  //state_seq.set_const(0) ;
1342 
1344  pos_seq.set_array_name("pos_seq");
1345  //pos_seq.set_const(0) ;
1346 
1347 
1348  m_dict_weights.set_array_name("dict_weights") ;
1349  m_word_degree.set_array_name("word_degree") ;
1350  m_cum_num_words.set_array_name("cum_num_words") ;
1351  m_num_words.set_array_name("num_words") ;
1352  //word_used.set_array_name("word_used") ;
1353  //svm_values_unnormalized.set_array_name("svm_values_unnormalized") ;
1354  //m_svm_pos_start.set_array_name("svm_pos_start") ;
1355  m_num_unique_words.set_array_name("num_unique_words") ;
1356 
1357  PEN.set_array_name("PEN") ;
1358  seq.set_array_name("seq") ;
1359 
1360  delta.set_array_name("delta") ;
1361  psi.set_array_name("psi") ;
1362  ktable.set_array_name("ktable") ;
1363  ptable.set_array_name("ptable") ;
1364  delta_end.set_array_name("delta_end") ;
1365  path_ends.set_array_name("path_ends") ;
1366  ktable_end.set_array_name("ktable_end") ;
1367 
1368 #ifdef USE_TMP_ARRAYCLASS
1369  fixedtempvv.set_array_name("fixedtempvv") ;
1370  fixedtempii.set_array_name("fixedtempvv") ;
1371 #endif
1372 
1373  oldtempvv.set_array_name("oldtempvv") ;
1374  oldtempvv2.set_array_name("oldtempvv2") ;
1375  oldtempii.set_array_name("oldtempii") ;
1376  oldtempii2.set_array_name("oldtempii2") ;
1377 
1378 
1380 
1381 #ifdef DYNPROG_DEBUG
1382  state_seq.display_size() ;
1383  pos_seq.display_size() ;
1384 
1389  //word_used.display_size() ;
1390  //svm_values_unnormalized.display_size() ;
1391  //m_svm_pos_start.display_array() ;
1393 
1394  PEN.display_size() ;
1395  PEN_state_signals.display_size() ;
1396  seq.display_size() ;
1398 
1399  //m_genestr_stop.display_size() ;
1400  delta.display_size() ;
1401  psi.display_size() ;
1402  ktable.display_size() ;
1403  ptable.display_size() ;
1404  delta_end.display_size() ;
1405  path_ends.display_size() ;
1406  ktable_end.display_size() ;
1407 
1408 #ifdef USE_TMP_ARRAYCLASS
1409  fixedtempvv.display_size() ;
1410  fixedtempii.display_size() ;
1411 #endif
1412 
1413  //oldtempvv.display_size() ;
1414  //oldtempii.display_size() ;
1415 
1416  state_seq.display_size() ;
1417  pos_seq.display_size() ;
1418 
1419  //seq.set_const(0) ;
1420 
1421 #endif //DYNPROG_DEBUG
1422 
1424 
1425 
1426 
1427  {
1428  for (int32_t s=0; s<m_num_svms; s++)
1430  }
1431 
1432 
1433  //CDynamicArray<int32_t*> trans_matrix_svms(m_N,m_N); // 2d
1434  //CDynamicArray<int32_t> trans_matrix_num_svms(m_N,m_N); // 2d
1435 
1436  { // initialization
1437 
1438  for (T_STATES i=0; i<m_N; i++)
1439  {
1440  //delta.element(0, i, 0) = get_p(i) + seq.element(i,0) ; // get_p defined in HMM.h to be equiv to initial_state_distribution
1441  delta.element(delta_array, 0, i, 0, m_seq_len, m_N) = get_p(i) + seq.element(i,0) ; // get_p defined in HMM.h to be equiv to initial_state_distribution
1442  psi.element(0,i,0) = 0 ;
1443  if (nbest>1)
1444  ktable.element(0,i,0) = 0 ;
1445  ptable.element(0,i,0) = 0 ;
1446  for (int16_t k=1; k<nbest; k++)
1447  {
1448  int32_t dim1, dim2, dim3 ;
1449  delta.get_array_size(dim1, dim2, dim3) ;
1450  //SG_DEBUG("i=%i, k=%i -- %i, %i, %i\n", i, k, dim1, dim2, dim3)
1451  //delta.element(0, i, k) = -CMath::INFTY ;
1452  delta.element(delta_array, 0, i, k, m_seq_len, m_N) = -CMath::INFTY ;
1453  psi.element(0,i,0) = 0 ; // <--- what's this for?
1454  if (nbest>1)
1455  ktable.element(0,i,k) = 0 ;
1456  ptable.element(0,i,k) = 0 ;
1457  }
1458  /*
1459  for (T_STATES j=0; j<m_N; j++)
1460  {
1461  CPlifBase * penalty = PEN.element(i,j) ;
1462  int32_t num_current_svms=0;
1463  int32_t svm_ids[] = {-8, -7, -6, -5, -4, -3, -2, -1};
1464  if (penalty)
1465  {
1466  SG_PRINT("trans %i -> %i \n",i,j)
1467  penalty->get_used_svms(&num_current_svms, svm_ids);
1468  trans_matrix_svms.set_element(svm_ids,i,j);
1469  for (int32_t l=0;l<num_current_svms;l++)
1470  SG_PRINT("svm_ids[%i]: %i \n",l,svm_ids[l])
1471  trans_matrix_num_svms.set_element(num_current_svms,i,j);
1472  }
1473  }
1474  */
1475 
1476  }
1477  }
1478 
1479  SG_DEBUG("START_RECURSION \n\n")
1480 
1481  // recursion
1482  for (int32_t t=1; t<m_seq_len; t++)
1483  {
1484  //if (is_big && t%(1+(m_seq_len/1000))==1)
1485  // SG_PROGRESS(t, 0, m_seq_len)
1486  //SG_PRINT("%i\n", t)
1487 
1488  for (T_STATES j=0; j<m_N; j++)
1489  {
1490  if (seq.element(j,t)<=-1e20)
1491  { // if we cannot observe the symbol here, then we can omit the rest
1492  for (int16_t k=0; k<nbest; k++)
1493  {
1494  delta.element(delta_array, t, j, k, m_seq_len, m_N) = seq.element(j,t) ;
1495  psi.element(t,j,k) = 0 ;
1496  if (nbest>1)
1497  ktable.element(t,j,k) = 0 ;
1498  ptable.element(t,j,k) = 0 ;
1499  }
1500  }
1501  else
1502  {
1503  const T_STATES num_elem = trans_list_forward_cnt[j] ;
1504  const T_STATES *elem_list = trans_list_forward[j] ;
1505  const float64_t *elem_val = trans_list_forward_val[j] ;
1506  const int32_t *elem_id = trans_list_forward_id[j] ;
1507 
1508  int32_t fixed_list_len = 0 ;
1509  float64_t fixedtempvv_ = CMath::INFTY ;
1510  int32_t fixedtempii_ = 0 ;
1511  bool fixedtemplong = false ;
1512 
1513  for (int32_t i=0; i<num_elem; i++)
1514  {
1515  T_STATES ii = elem_list[i] ;
1516 
1517  const CPlifBase* penalty = (CPlifBase*) PEN.element(j,ii) ;
1518 
1519  /*int32_t look_back = max_look_back ;
1520  if (0)
1521  { // find lookback length
1522  CPlifBase *pen = (CPlifBase*) penalty ;
1523  if (pen!=NULL)
1524  look_back=(int32_t) (CMath::ceil(pen->get_max_value()));
1525  if (look_back>=1e6)
1526  SG_PRINT("%i,%i -> %d from %ld\n", j, ii, look_back, (long)pen)
1527  ASSERT(look_back<1e6)
1528  } */
1529 
1530  int32_t look_back_ = look_back.element(j, ii) ;
1531 
1532  int32_t orf_from = m_orf_info.element(ii,0) ;
1533  int32_t orf_to = m_orf_info.element(j,1) ;
1534  if((orf_from!=-1)!=(orf_to!=-1))
1535  SG_DEBUG("j=%i ii=%i orf_from=%i orf_to=%i p=%1.2f\n", j, ii, orf_from, orf_to, elem_val[i])
1536  ASSERT((orf_from!=-1)==(orf_to!=-1))
1537 
1538  int32_t orf_target = -1 ;
1539  if (orf_from!=-1)
1540  {
1541  orf_target=orf_to-orf_from ;
1542  if (orf_target<0)
1543  orf_target+=3 ;
1544  ASSERT(orf_target>=0 && orf_target<3)
1545  }
1546 
1547  int32_t orf_last_pos = m_pos[t] ;
1548 #ifdef DYNPROG_TIMING
1549  MyTime3.start() ;
1550 #endif
1551  int32_t num_ok_pos = 0 ;
1552 
1553  for (int32_t ts=t-1; ts>=0 && m_pos[t]-m_pos[ts]<=look_back_; ts--)
1554  {
1555  bool ok ;
1556  //int32_t plen=t-ts;
1557 
1558  /*for (int32_t s=0; s<m_num_svms; s++)
1559  if ((fabs(svs.svm_values[s*svs.seqlen+plen]-svs2.svm_values[s*svs.seqlen+plen])>1e-6) ||
1560  (fabs(svs.svm_values[s*svs.seqlen+plen]-svs3.svm_values[s*svs.seqlen+plen])>1e-6))
1561  {
1562  SG_DEBUG("s=%i, t=%i, ts=%i, %1.5e, %1.5e, %1.5e\n", s, t, ts, svs.svm_values[s*svs.seqlen+plen], svs2.svm_values[s*svs.seqlen+plen], svs3.svm_values[s*svs.seqlen+plen])
1563  }*/
1564 
1565  if (orf_target==-1)
1566  ok=true ;
1567  else if (m_pos[ts]!=-1 && (m_pos[t]-m_pos[ts])%3==orf_target)
1568  ok=(!use_orf) || extend_orf(orf_from, orf_to, m_pos[ts], orf_last_pos, m_pos[t]) ;
1569  else
1570  ok=false ;
1571 
1572  if (ok)
1573  {
1574 
1575  float64_t segment_loss = 0.0 ;
1576  if (with_loss)
1577  {
1578  segment_loss = m_seg_loss_obj->get_segment_loss(ts, t, elem_id[i]);
1579  //if (segment_loss!=segment_loss2)
1580  //SG_PRINT("segment_loss:%f segment_loss2:%f\n", segment_loss, segment_loss2)
1581  }
1583  // BEST_PATH_TRANS
1585 
1586  int32_t frame = orf_from;//m_orf_info.element(ii,0);
1587  lookup_content_svm_values(ts, t, m_pos[ts], m_pos[t], svm_value, frame);
1588 
1589  float64_t pen_val = 0.0 ;
1590  if (penalty)
1591  {
1592 #ifdef DYNPROG_TIMING_DETAIL
1593  MyTime.start() ;
1594 #endif
1595  pen_val = penalty->lookup_penalty(m_pos[t]-m_pos[ts], svm_value) ;
1596 
1597 #ifdef DYNPROG_TIMING_DETAIL
1598  MyTime.stop() ;
1599  content_plifs_time += MyTime.time_diff_sec() ;
1600 #endif
1601  }
1602 
1603 #ifdef DYNPROG_TIMING_DETAIL
1604  MyTime.start() ;
1605 #endif
1606  num_ok_pos++ ;
1607 
1608  if (nbest==1)
1609  {
1610  float64_t val = elem_val[i] + pen_val ;
1611  if (with_loss)
1612  val += segment_loss ;
1613 
1614  float64_t mval = -(val + delta.element(delta_array, ts, ii, 0, m_seq_len, m_N)) ;
1615 
1616  if (mval<fixedtempvv_)
1617  {
1618  fixedtempvv_ = mval ;
1619  fixedtempii_ = ii + ts*m_N;
1620  fixed_list_len = 1 ;
1621  fixedtemplong = false ;
1622  }
1623  }
1624  else
1625  {
1626  for (int16_t diff=0; diff<nbest; diff++)
1627  {
1628  float64_t val = elem_val[i] ;
1629  val += pen_val ;
1630  if (with_loss)
1631  val += segment_loss ;
1632 
1633  float64_t mval = -(val + delta.element(delta_array, ts, ii, diff, m_seq_len, m_N)) ;
1634 
1635  /* only place -val in fixedtempvv if it is one of the nbest lowest values in there */
1636  /* fixedtempvv[i], i=0:nbest-1, is sorted so that fixedtempvv[0] <= fixedtempvv[1] <= ...*/
1637  /* fixed_list_len has the number of elements in fixedtempvv */
1638 
1639  if ((fixed_list_len < nbest) || ((0==fixed_list_len) || (mval < fixedtempvv[fixed_list_len-1])))
1640  {
1641  if ( (fixed_list_len<nbest) && ((0==fixed_list_len) || (mval>fixedtempvv[fixed_list_len-1])) )
1642  {
1643  fixedtempvv[fixed_list_len] = mval ;
1644  fixedtempii[fixed_list_len] = ii + diff*m_N + ts*m_N*nbest;
1645  fixed_list_len++ ;
1646  }
1647  else // must have mval < fixedtempvv[fixed_list_len-1]
1648  {
1649  int32_t addhere = fixed_list_len;
1650  while ((addhere > 0) && (mval < fixedtempvv[addhere-1]))
1651  addhere--;
1652 
1653  // move everything from addhere+1 one forward
1654  for (int32_t jj=fixed_list_len-1; jj>addhere; jj--)
1655  {
1656  fixedtempvv[jj] = fixedtempvv[jj-1];
1657  fixedtempii[jj] = fixedtempii[jj-1];
1658  }
1659 
1660  fixedtempvv[addhere] = mval;
1661  fixedtempii[addhere] = ii + diff*m_N + ts*m_N*nbest;
1662 
1663  if (fixed_list_len < nbest)
1664  fixed_list_len++;
1665  }
1666  }
1667  }
1668  }
1669 #ifdef DYNPROG_TIMING_DETAIL
1670  MyTime.stop() ;
1671  inner_loop_max_time += MyTime.time_diff_sec() ;
1672 #endif
1673  }
1674  }
1675 #ifdef DYNPROG_TIMING
1676  MyTime3.stop() ;
1677  inner_loop_time += MyTime3.time_diff_sec() ;
1678 #endif
1679  }
1680  for (int32_t i=0; i<num_elem; i++)
1681  {
1682  T_STATES ii = elem_list[i] ;
1683 
1684  const CPlifBase* penalty = (CPlifBase*) PEN.element(j,ii) ;
1685 
1686  /*int32_t look_back = max_look_back ;
1687  if (0)
1688  { // find lookback length
1689  CPlifBase *pen = (CPlifBase*) penalty ;
1690  if (pen!=NULL)
1691  look_back=(int32_t) (CMath::ceil(pen->get_max_value()));
1692  if (look_back>=1e6)
1693  SG_PRINT("%i,%i -> %d from %ld\n", j, ii, look_back, (long)pen)
1694  ASSERT(look_back<1e6)
1695  } */
1696 
1697  int32_t look_back_ = look_back.element(j, ii) ;
1698  //int32_t look_back_orig_ = look_back_orig.element(j, ii) ;
1699 
1700  int32_t orf_from = m_orf_info.element(ii,0) ;
1701  int32_t orf_to = m_orf_info.element(j,1) ;
1702  if((orf_from!=-1)!=(orf_to!=-1))
1703  SG_DEBUG("j=%i ii=%i orf_from=%i orf_to=%i p=%1.2f\n", j, ii, orf_from, orf_to, elem_val[i])
1704  ASSERT((orf_from!=-1)==(orf_to!=-1))
1705 
1706  int32_t orf_target = -1 ;
1707  if (orf_from!=-1)
1708  {
1709  orf_target=orf_to-orf_from ;
1710  if (orf_target<0)
1711  orf_target+=3 ;
1712  ASSERT(orf_target>=0 && orf_target<3)
1713  }
1714 
1715  //int32_t loss_last_pos = t ;
1716  //float64_t last_loss = 0.0 ;
1717 
1718 #ifdef DYNPROG_TIMING
1719  MyTime3.start() ;
1720 #endif
1721 
1722  /* long transition stuff */
1723  /* only do this, if
1724  * this feature is enabled
1725  * this is not a transition with ORF restrictions
1726  * the loss is switched off
1727  * nbest=1
1728  */
1729 #ifdef DYNPROG_TIMING
1730  MyTime3.start() ;
1731 #endif
1732  // long transitions, only when not considering ORFs
1733  if ( long_transitions && orf_target==-1 && look_back_ == m_long_transition_threshold )
1734  {
1735 
1736  // update table for 5' part of the long segment
1737 
1738  int32_t start = long_transition_content_start.get_element(ii, j) ;
1739  int32_t end_5p_part = start ;
1740  for (int32_t start_5p_part=start; m_pos[t]-m_pos[start_5p_part] > m_long_transition_threshold ; start_5p_part++)
1741  {
1742  // find end_5p_part, which is greater than start_5p_part and at least m_long_transition_threshold away
1743  while (end_5p_part<=t && m_pos[end_5p_part+1]-m_pos[start_5p_part]<=m_long_transition_threshold)
1744  end_5p_part++ ;
1745 
1746  ASSERT(m_pos[end_5p_part+1]-m_pos[start_5p_part] > m_long_transition_threshold || end_5p_part==t)
1747  ASSERT(m_pos[end_5p_part]-m_pos[start_5p_part] <= m_long_transition_threshold)
1748 
1749  float64_t pen_val = 0.0;
1750  /* recompute penalty, if necessary */
1751  if (penalty)
1752  {
1753  int32_t frame = m_orf_info.element(ii,0);
1754  lookup_content_svm_values(start_5p_part, end_5p_part, m_pos[start_5p_part], m_pos[end_5p_part], svm_value, frame); // * t -> end_5p_part
1755  pen_val = penalty->lookup_penalty(m_pos[end_5p_part]-m_pos[start_5p_part], svm_value) ;
1756  }
1757 
1758  /*if (m_pos[start_5p_part]==1003)
1759  {
1760  SG_PRINT("Part1: %i - %i vs %i - %i\n", m_pos[t], m_pos[ts], m_pos[end_5p_part], m_pos[start_5p_part])
1761  SG_PRINT("Part1: ts=%i t=%i start_5p_part=%i m_seq_len=%i\n", m_pos[ts], m_pos[t], m_pos[start_5p_part], m_seq_len)
1762  }*/
1763 
1764  float64_t mval_trans = -( elem_val[i] + pen_val*0.5 + delta.element(delta_array, start_5p_part, ii, 0, m_seq_len, m_N) ) ;
1765  //float64_t mval_trans = -( elem_val[i] + delta.element(delta_array, ts, ii, 0, m_seq_len, m_N) ) ; // enable this for the incomplete extra check
1766 
1767  float64_t segment_loss_part1=0.0 ;
1768  if (with_loss)
1769  { // this is the loss from the start of the long segment (5' part + middle section)
1770 
1771  segment_loss_part1 = m_seg_loss_obj->get_segment_loss(start_5p_part /*long_transition_content_start_position.get_element(ii,j)*/, end_5p_part, elem_id[i]); // * unsure
1772 
1773  mval_trans -= segment_loss_part1 ;
1774  }
1775 
1776 
1777  if (0)//m_pos[end_5p_part] - m_pos[long_transition_content_start_position.get_element(ii, j)] > look_back_orig_/*m_long_transition_max*/)
1778  {
1779  // this restricts the maximal length of segments,
1780  // but the current implementation is not valid since the
1781  // long transition is discarded without loocking if there
1782  // is a second best long transition in between
1783  long_transition_content_scores.set_element(-CMath::INFTY, ii, j) ;
1784  long_transition_content_start_position.set_element(0, ii, j) ;
1785  if (with_loss)
1786  long_transition_content_scores_loss.set_element(0.0, ii, j) ;
1787 #ifdef DYNPROG_DEBUG
1788  long_transition_content_scores_pen.set_element(0.0, ii, j) ;
1789  long_transition_content_scores_elem.set_element(0.0, ii, j) ;
1790  long_transition_content_scores_prev.set_element(0.0, ii, j) ;
1791  long_transition_content_end_position.set_element(0, ii, j) ;
1792 #endif
1793  }
1794  if (with_loss)
1795  {
1796  float64_t old_loss = long_transition_content_scores_loss.get_element(ii, j) ;
1797  float64_t new_loss = m_seg_loss_obj->get_segment_loss(long_transition_content_start_position.get_element(ii,j), end_5p_part, elem_id[i]);
1798  float64_t score = long_transition_content_scores.get_element(ii, j) - old_loss + new_loss ;
1799  long_transition_content_scores.set_element(score, ii, j) ;
1800  long_transition_content_scores_loss.set_element(new_loss, ii, j) ;
1801 #ifdef DYNPROG_DEBUG
1802  long_transition_content_end_position.set_element(end_5p_part, ii, j) ;
1803 #endif
1804 
1805  }
1806  if (-long_transition_content_scores.get_element(ii, j) > mval_trans )
1807  {
1808  /* then the old long transition is either too far away or worse than the current one */
1809  long_transition_content_scores.set_element(-mval_trans, ii, j) ;
1810  long_transition_content_start_position.set_element(start_5p_part, ii, j) ;
1811  if (with_loss)
1812  long_transition_content_scores_loss.set_element(segment_loss_part1, ii, j) ;
1813 #ifdef DYNPROG_DEBUG
1814  long_transition_content_scores_pen.set_element(pen_val*0.5, ii, j) ;
1815  long_transition_content_scores_elem.set_element(elem_val[i], ii, j) ;
1816  long_transition_content_scores_prev.set_element(delta.element(delta_array, start_5p_part, ii, 0, m_seq_len, m_N), ii, j) ;
1817  /*ASSERT(fabs(long_transition_content_scores.get_element(ii, j)-(long_transition_content_scores_pen.get_element(ii, j) +
1818  long_transition_content_scores_elem.get_element(ii, j) +
1819  long_transition_content_scores_prev.get_element(ii, j)))<1e-6) ;*/
1820  long_transition_content_end_position.set_element(end_5p_part, ii, j) ;
1821 #endif
1822  }
1823  //
1824  // this sets the position where the search for better 5'parts is started the next time
1825  // whithout this the prediction takes ages
1826  //
1827  long_transition_content_start.set_element(start_5p_part, ii, j) ;
1828  }
1829 
1830  // consider the 3' part at the end of the long segment:
1831  // * with length = m_long_transition_threshold
1832  // * content prediction and loss only for this part
1833 
1834  // find ts > 0 with distance from m_pos[t] greater m_long_transition_threshold
1835  // precompute: only depends on t
1836  int ts = t;
1837  while (ts>0 && m_pos[t]-m_pos[ts-1] <= m_long_transition_threshold)
1838  ts-- ;
1839 
1840  if (ts>0)
1841  {
1842  ASSERT((m_pos[t]-m_pos[ts-1] > m_long_transition_threshold) && (m_pos[t]-m_pos[ts] <= m_long_transition_threshold))
1843 
1844 
1845  /* only consider this transition, if the right position was found */
1846  float pen_val_3p = 0.0 ;
1847  if (penalty)
1848  {
1849  int32_t frame = orf_from ; //m_orf_info.element(ii, 0);
1850  lookup_content_svm_values(ts, t, m_pos[ts], m_pos[t], svm_value, frame);
1851  pen_val_3p = penalty->lookup_penalty(m_pos[t]-m_pos[ts], svm_value) ;
1852  }
1853 
1854  float64_t mval = -(long_transition_content_scores.get_element(ii, j) + pen_val_3p*0.5) ;
1855 
1856  {
1857 #ifdef DYNPROG_DEBUG
1858  float64_t segment_loss_part2=0.0 ;
1859  float64_t segment_loss_part1=0.0 ;
1860 #endif
1861  float64_t segment_loss_total=0.0 ;
1862 
1863  if (with_loss)
1864  { // this is the loss for the 3' end fragment of the segment
1865  // (the 5' end and the middle section loss is already contained in mval)
1866 
1867 #ifdef DYNPROG_DEBUG
1868  // this is an alternative, which should be identical, if the loss is additive
1869  segment_loss_part2 = m_seg_loss_obj->get_segment_loss_extend(long_transition_content_end_position.get_element(ii,j), t, elem_id[i]);
1870  //mval -= segment_loss_part2 ;
1871  segment_loss_part1 = m_seg_loss_obj->get_segment_loss(long_transition_content_start_position.get_element(ii,j), long_transition_content_end_position.get_element(ii,j), elem_id[i]);
1872 #endif
1873  segment_loss_total = m_seg_loss_obj->get_segment_loss(long_transition_content_start_position.get_element(ii,j), t, elem_id[i]);
1874  mval -= (segment_loss_total-long_transition_content_scores_loss.get_element(ii, j)) ;
1875  }
1876 
1877 #ifdef DYNPROG_DEBUG
1878  if (m_pos[t]==10108 ||m_pos[t]==12802 ||m_pos[t]== 12561)
1879  {
1880  SG_PRINT("Part2: %i,%i,%i: val=%1.6f pen_val_3p*0.5=%1.6f (t=%i, ts=%i, ts-1=%i, ts+1=%i) scores=%1.6f (pen=%1.6f,prev=%1.6f,elem=%1.6f,loss=%1.1f), positions=%i,%i,%i, loss=%1.1f/%1.1f (%i,%i)\n",
1881  m_pos[t], j, ii, -mval, 0.5*pen_val_3p, m_pos[t], m_pos[ts], m_pos[ts-1], m_pos[ts+1],
1882  long_transition_content_scores.get_element(ii, j),
1883  long_transition_content_scores_pen.get_element(ii, j),
1884  long_transition_content_scores_prev.get_element(ii, j),
1885  long_transition_content_scores_elem.get_element(ii, j),
1886  long_transition_content_scores_loss.get_element(ii, j),
1887  m_pos[long_transition_content_start_position.get_element(ii,j)],
1888  m_pos[long_transition_content_end_position.get_element(ii,j)],
1889  m_pos[long_transition_content_start.get_element(ii,j)], segment_loss_part2, segment_loss_total, long_transition_content_start_position.get_element(ii,j), t) ;
1890  SG_PRINT("fixedtempvv_: %1.6f, from_state:%i from_pos:%i\n ",-fixedtempvv_, (fixedtempii_%m_N), m_pos[(fixedtempii_-(fixedtempii_%(m_N*nbest)))/(m_N*nbest)] )
1891  }
1892 
1893  if (fabs(segment_loss_part2+long_transition_content_scores_loss.get_element(ii, j) - segment_loss_total)>1e-3)
1894  {
1895  SG_ERROR("LOSS: total=%1.1f (%i-%i) part1=%1.1f/%1.1f (%i-%i) part2=%1.1f (%i-%i) sum=%1.1f diff=%1.1f\n",
1896  segment_loss_total, m_pos[long_transition_content_start_position.get_element(ii,j)], m_pos[t],
1897  long_transition_content_scores_loss.get_element(ii, j), segment_loss_part1, m_pos[long_transition_content_start_position.get_element(ii,j)], m_pos[long_transition_content_end_position.get_element(ii,j)],
1898  segment_loss_part2, m_pos[long_transition_content_end_position.get_element(ii,j)], m_pos[t],
1899  segment_loss_part2+long_transition_content_scores_loss.get_element(ii, j),
1900  segment_loss_part2+long_transition_content_scores_loss.get_element(ii, j) - segment_loss_total) ;
1901  }
1902 #endif
1903  }
1904 
1905  // prefer simpler version to guarantee optimality
1906  //
1907  // original:
1908  /* if ((mval < fixedtempvv_) &&
1909  (m_pos[t] - m_pos[long_transition_content_start_position.get_element(ii, j)])<=look_back_orig_) */
1910  if (mval < fixedtempvv_)
1911  {
1912  /* then the long transition is better than the short one => replace it */
1913  int32_t fromtjk = fixedtempii_ ;
1914  /*SG_PRINT("%i,%i: Long transition (%1.5f=-(%1.5f+%1.5f+%1.5f+%1.5f), %i) to m_pos %i better than short transition (%1.5f,%i) to m_pos %i \n",
1915  m_pos[t], j,
1916  mval, pen_val_3p*0.5, long_transition_content_scores_pen.get_element(ii, j), long_transition_content_scores_elem.get_element(ii, j), long_transition_content_scores_prev.get_element(ii, j), ii,
1917  m_pos[long_transition_content_position.get_element(ii, j)],
1918  fixedtempvv_, (fromtjk%m_N), m_pos[(fromtjk-(fromtjk%(m_N*nbest)))/(m_N*nbest)]) ;*/
1919  ASSERT((fromtjk-(fromtjk%(m_N*nbest)))/(m_N*nbest)==0 || m_pos[(fromtjk-(fromtjk%(m_N*nbest)))/(m_N*nbest)]>=m_pos[long_transition_content_start_position.get_element(ii, j)] || fixedtemplong)
1920 
1921  fixedtempvv_ = mval ;
1922  fixedtempii_ = ii + m_N*long_transition_content_start_position.get_element(ii, j) ;
1923  fixed_list_len = 1 ;
1924  fixedtemplong = true ;
1925  }
1926  }
1927  }
1928  }
1929 #ifdef DYNPROG_TIMING
1930  MyTime3.stop() ;
1931  long_transition_time += MyTime3.time_diff_sec() ;
1932 #endif
1933 
1934 
1935  int32_t numEnt = fixed_list_len;
1936 
1937  float64_t minusscore;
1938  int64_t fromtjk;
1939 
1940  for (int16_t k=0; k<nbest; k++)
1941  {
1942  if (k<numEnt)
1943  {
1944  if (nbest==1)
1945  {
1946  minusscore = fixedtempvv_ ;
1947  fromtjk = fixedtempii_ ;
1948  }
1949  else
1950  {
1951  minusscore = fixedtempvv[k];
1952  fromtjk = fixedtempii[k];
1953  }
1954 
1955  delta.element(delta_array, t, j, k, m_seq_len, m_N) = -minusscore + seq.element(j,t);
1956  psi.element(t,j,k) = (fromtjk%m_N) ;
1957  if (nbest>1)
1958  ktable.element(t,j,k) = (fromtjk%(m_N*nbest)-psi.element(t,j,k))/m_N ;
1959  ptable.element(t,j,k) = (fromtjk-(fromtjk%(m_N*nbest)))/(m_N*nbest) ;
1960  }
1961  else
1962  {
1963  delta.element(delta_array, t, j, k, m_seq_len, m_N) = -CMath::INFTY ;
1964  psi.element(t,j,k) = 0 ;
1965  if (nbest>1)
1966  ktable.element(t,j,k) = 0 ;
1967  ptable.element(t,j,k) = 0 ;
1968  }
1969  }
1970  }
1971  }
1972  }
1973  { //termination
1974  int32_t list_len = 0 ;
1975  for (int16_t diff=0; diff<nbest; diff++)
1976  {
1977  for (T_STATES i=0; i<m_N; i++)
1978  {
1979  oldtempvv[list_len] = -(delta.element(delta_array, (m_seq_len-1), i, diff, m_seq_len, m_N)+get_q(i)) ;
1980  oldtempii[list_len] = i + diff*m_N ;
1981  list_len++ ;
1982  }
1983  }
1984 
1985  CMath::nmin(oldtempvv.get_array(), oldtempii.get_array(), list_len, nbest) ;
1986 
1987  for (int16_t k=0; k<nbest; k++)
1988  {
1989  delta_end.element(k) = -oldtempvv[k] ;
1990  path_ends.element(k) = (oldtempii[k]%m_N) ;
1991  if (nbest>1)
1992  ktable_end.element(k) = (oldtempii[k]-path_ends.element(k))/m_N ;
1993  }
1994 
1995 
1996  }
1997 
1998  { //state sequence backtracking
1999  for (int16_t k=0; k<nbest; k++)
2000  {
2001  prob_nbest[k]= delta_end.element(k) ;
2002 
2003  int32_t i = 0 ;
2004  state_seq[i] = path_ends.element(k) ;
2005  int16_t q = 0 ;
2006  if (nbest>1)
2007  q=ktable_end.element(k) ;
2008  pos_seq[i] = m_seq_len-1 ;
2009 
2010  while (pos_seq[i]>0)
2011  {
2012  ASSERT(i+1<m_seq_len)
2013  //SG_DEBUG("s=%i p=%i q=%i\n", state_seq[i], pos_seq[i], q)
2014  state_seq[i+1] = psi.element(pos_seq[i], state_seq[i], q);
2015  pos_seq[i+1] = ptable.element(pos_seq[i], state_seq[i], q) ;
2016  if (nbest>1)
2017  q = ktable.element(pos_seq[i], state_seq[i], q) ;
2018  i++ ;
2019  }
2020  //SG_DEBUG("s=%i p=%i q=%i\n", state_seq[i], pos_seq[i], q)
2021  int32_t num_states = i+1 ;
2022  for (i=0; i<num_states;i++)
2023  {
2024  my_state_seq[i+k*m_seq_len] = state_seq[num_states-i-1] ;
2025  my_pos_seq[i+k*m_seq_len] = pos_seq[num_states-i-1] ;
2026  }
2027  if (num_states<m_seq_len)
2028  {
2029  my_state_seq[num_states+k*m_seq_len]=-1 ;
2030  my_pos_seq[num_states+k*m_seq_len]=-1 ;
2031  }
2032  }
2033  }
2034 
2035  //if (is_big)
2036  // SG_PRINT("DONE. \n")
2037 
2038 
2039 #ifdef DYNPROG_TIMING
2040  MyTime2.stop() ;
2041 
2042  //if (is_big)
2043  SG_PRINT("Timing: orf=%1.2f s \n Segment_init=%1.2f s Segment_pos=%1.2f s Segment_extend=%1.2f s Segment_clean=%1.2f s\nsvm_init=%1.2f s svm_pos=%1.2f svm_clean=%1.2f\n content_svm_values_time=%1.2f content_plifs_time=%1.2f\ninner_loop_max_time=%1.2f inner_loop=%1.2f long_transition_time=%1.2f\n total=%1.2f\n", orf_time, segment_init_time, segment_pos_time, segment_extend_time, segment_clean_time, svm_init_time, svm_pos_time, svm_clean_time, content_svm_values_time, content_plifs_time, inner_loop_max_time, inner_loop_time, long_transition_time, MyTime2.time_diff_sec())
2044 #endif
2045 
2046  SG_FREE(fixedtempvv);
2047  SG_FREE(fixedtempii);
2048  }
2049 
2050 
2052  int32_t *my_state_seq, int32_t *my_pos_seq,
2053  int32_t my_seq_len, const float64_t *seq_array, int32_t max_num_signals)
2054 {
2058  //m_my_scores.resize_array(m_my_state_seq.get_array_size()) ;
2059  //m_my_losses.resize_array(m_my_state_seq.get_array_size()) ;
2060  m_my_scores.resize_array(my_seq_len);
2061  m_my_losses.resize_array(my_seq_len);
2062  float64_t* my_scores=m_my_scores.get_array();
2063  float64_t* my_losses=m_my_losses.get_array();
2064  CPlifBase** Plif_matrix=m_plif_matrices->get_plif_matrix();
2065  CPlifBase** Plif_state_signals=m_plif_matrices->get_state_signals();
2066 
2067  if (!m_svm_arrays_clean)
2068  {
2069  SG_ERROR("SVM arrays not clean")
2070  return ;
2071  } ;
2072  //SG_PRINT("genestr_len=%i, genestr_num=%i\n", genestr_len, genestr_num)
2073  //m_mod_words.display() ;
2074  //m_sign_words.display() ;
2075  //m_string_words.display() ;
2076 
2077  bool use_svm = false ;
2078 
2079  CDynamicObjectArray PEN((CSGObject**) Plif_matrix, m_N, m_N, false, false) ; // 2d, CPlifBase*
2080  PEN.set_array_name("PEN");
2081 
2082  CDynamicObjectArray PEN_state_signals((CSGObject**) Plif_state_signals, m_N, max_num_signals, false, false) ; // 2d, CPlifBase*
2083  PEN_state_signals.set_array_name("PEN_state_signals");
2084 
2085  CDynamicArray<float64_t> seq_input(seq_array, m_N, m_seq_len, max_num_signals) ;
2086  seq_input.set_array_name("seq_input");
2087 
2088  { // determine whether to use svm outputs and clear derivatives
2089  for (int32_t i=0; i<m_N; i++)
2090  for (int32_t j=0; j<m_N; j++)
2091  {
2092  CPlifBase* penij=(CPlifBase*) PEN.element(i,j) ;
2093  if (penij==NULL)
2094  continue ;
2095 
2096  if (penij->uses_svm_values())
2097  use_svm=true ;
2098  penij->penalty_clear_derivative() ;
2099  }
2100  for (int32_t i=0; i<m_N; i++)
2101  for (int32_t j=0; j<max_num_signals; j++)
2102  {
2103  CPlifBase* penij=(CPlifBase*) PEN_state_signals.element(i,j) ;
2104  if (penij==NULL)
2105  continue ;
2106  if (penij->uses_svm_values())
2107  use_svm=true ;
2108  penij->penalty_clear_derivative() ;
2109  }
2110  }
2111 
2112  { // set derivatives of p, q and a to zero
2113 
2114  for (int32_t i=0; i<m_N; i++)
2115  {
2118  for (int32_t j=0; j<m_N; j++)
2120  }
2121  }
2122 
2123  { // clear score vector
2124  for (int32_t i=0; i<my_seq_len; i++)
2125  {
2126  my_scores[i]=0.0 ;
2127  my_losses[i]=0.0 ;
2128  }
2129  }
2130 
2131  //int32_t total_len = 0 ;
2132 
2133  //m_transition_matrix_a.display_array() ;
2134  //m_transition_matrix_a_id.display_array() ;
2135 
2136  // compute derivatives for given path
2141  {
2142  svm_value[s]=0 ;
2143  svm_value_part1[s]=0 ;
2144  svm_value_part2[s]=0 ;
2145  }
2146 
2147  //#ifdef DYNPROG_DEBUG
2148  float64_t total_score = 0.0 ;
2149  float64_t total_loss = 0.0 ;
2150  //#endif
2151 
2152  ASSERT(my_state_seq[0]>=0)
2153  m_initial_state_distribution_p_deriv.element(my_state_seq[0])++ ;
2154  my_scores[0] += m_initial_state_distribution_p.element(my_state_seq[0]) ;
2155 
2156  ASSERT(my_state_seq[my_seq_len-1]>=0)
2157  m_end_state_distribution_q_deriv.element(my_state_seq[my_seq_len-1])++ ;
2158  my_scores[my_seq_len-1] += m_end_state_distribution_q.element(my_state_seq[my_seq_len-1]);
2159 
2160  //#ifdef DYNPROG_DEBUG
2161  total_score += my_scores[0] + my_scores[my_seq_len-1] ;
2162  //#endif
2163 
2164  SG_DEBUG("m_seq_len=%i\n", my_seq_len)
2165  for (int32_t i=0; i<my_seq_len-1; i++)
2166  {
2167  if (my_state_seq[i+1]==-1)
2168  break ;
2169  int32_t from_state = my_state_seq[i] ;
2170  int32_t to_state = my_state_seq[i+1] ;
2171  int32_t from_pos = my_pos_seq[i] ;
2172  int32_t to_pos = my_pos_seq[i+1] ;
2173 
2174  int32_t elem_id = m_transition_matrix_a_id.element(from_state, to_state) ;
2175  my_losses[i] = m_seg_loss_obj->get_segment_loss(from_pos, to_pos, elem_id);
2176 
2177 #ifdef DYNPROG_DEBUG
2178 
2179 
2180  if (i>0)// test if segment loss is additive
2181  {
2182  float32_t loss1 = m_seg_loss_obj->get_segment_loss(my_pos_seq[i-1], my_pos_seq[i], elem_id);
2183  float32_t loss2 = m_seg_loss_obj->get_segment_loss(my_pos_seq[i], my_pos_seq[i+1], elem_id);
2184  float32_t loss3 = m_seg_loss_obj->get_segment_loss(my_pos_seq[i-1], my_pos_seq[i+1], elem_id);
2185  SG_PRINT("loss1:%f loss2:%f loss3:%f, diff:%f\n", loss1, loss2, loss3, loss1+loss2-loss3)
2186  if (CMath::abs(loss1+loss2-loss3)>0)
2187  {
2188  SG_PRINT("%i. segment loss %f (id=%i): from=%i(%i), to=%i(%i)\n", i, my_losses[i], elem_id, from_pos, from_state, to_pos, to_state)
2189  }
2190  }
2191  io->set_loglevel(M_DEBUG) ;
2192  SG_DEBUG("%i. segment loss %f (id=%i): from=%i(%i), to=%i(%i)\n", i, my_losses[i], elem_id, from_pos, from_state, to_pos, to_state)
2193 #endif
2194  // increase usage of this transition
2195  m_transition_matrix_a_deriv.element(from_state, to_state)++ ;
2196  my_scores[i] += m_transition_matrix_a.element(from_state, to_state) ;
2197  //SG_PRINT("m_transition_matrix_a.element(%i, %i),%f \n",from_state, to_state, m_transition_matrix_a.element(from_state, to_state))
2198 #ifdef DYNPROG_DEBUG
2199  SG_DEBUG("%i. scores[i]=%f\n", i, my_scores[i])
2200 #endif
2201 
2202  /*int32_t last_svm_pos[m_num_degrees] ;
2203  for (int32_t qq=0; qq<m_num_degrees; qq++)
2204  last_svm_pos[qq]=-1 ;*/
2205 
2206  bool is_long_transition = false ;
2207  if (m_long_transitions)
2208  {
2209  if (m_pos[to_pos]-m_pos[from_pos]>m_long_transition_threshold)
2210  is_long_transition = true ;
2211  if (m_orf_info.element(from_state,0)!=-1)
2212  is_long_transition = false ;
2213  }
2214 
2215  int32_t from_pos_thresh = from_pos ;
2216  int32_t to_pos_thresh = to_pos ;
2217 
2218  if (use_svm)
2219  {
2220  if (is_long_transition)
2221  {
2222 
2223  while (from_pos_thresh<to_pos && m_pos[from_pos_thresh+1] - m_pos[from_pos] <= m_long_transition_threshold) // *
2224  from_pos_thresh++ ;
2225  ASSERT(from_pos_thresh<to_pos)
2226  ASSERT(m_pos[from_pos_thresh] - m_pos[from_pos] <= m_long_transition_threshold) // *
2227  ASSERT(m_pos[from_pos_thresh+1] - m_pos[from_pos] > m_long_transition_threshold)// *
2228 
2229  int32_t frame = m_orf_info.element(from_state,0);
2230  lookup_content_svm_values(from_pos, from_pos_thresh, m_pos[from_pos], m_pos[from_pos_thresh], svm_value_part1, frame);
2231 
2232 #ifdef DYNPROG_DEBUG
2233  SG_PRINT("part1: pos1: %i pos2: %i pos3: %i \nsvm_value_part1: ", m_pos[from_pos], m_pos[from_pos_thresh], m_pos[from_pos_thresh+1])
2235  SG_PRINT("%1.4f ", svm_value_part1[s])
2236  SG_PRINT("\n")
2237 #endif
2238 
2239  while (to_pos_thresh>0 && m_pos[to_pos] - m_pos[to_pos_thresh-1] <= m_long_transition_threshold) // *
2240  to_pos_thresh-- ;
2241  ASSERT(to_pos_thresh>0)
2242  ASSERT(m_pos[to_pos] - m_pos[to_pos_thresh] <= m_long_transition_threshold) // *
2243  ASSERT(m_pos[to_pos] - m_pos[to_pos_thresh-1] > m_long_transition_threshold) // *
2244 
2245  lookup_content_svm_values(to_pos_thresh, to_pos, m_pos[to_pos_thresh], m_pos[to_pos], svm_value_part2, frame);
2246 
2247 #ifdef DYNPROG_DEBUG
2248  SG_PRINT("part2: pos1: %i pos2: %i pos3: %i \nsvm_value_part2: ", m_pos[to_pos], m_pos[to_pos_thresh], m_pos[to_pos_thresh+1])
2250  SG_PRINT("%1.4f ", svm_value_part2[s])
2251  SG_PRINT("\n")
2252 #endif
2253  }
2254  else
2255  {
2256  /* normal case */
2257 
2258  //SG_PRINT("from_pos: %i; to_pos: %i; m_pos[to_pos]-m_pos[from_pos]: %i \n",from_pos, to_pos, m_pos[to_pos]-m_pos[from_pos])
2259  int32_t frame = m_orf_info.element(from_state,0);
2260  if (false)//(frame>=0)
2261  {
2262  int32_t num_current_svms=0;
2263  int32_t svm_ids[] = {-8, -7, -6, -5, -4, -3, -2, -1};
2264  SG_PRINT("penalties(%i, %i), frame:%i ", from_state, to_state, frame)
2265  ((CPlifBase*) PEN.element(to_state, from_state))->get_used_svms(&num_current_svms, svm_ids);
2266  SG_PRINT("\n")
2267  }
2268 
2269  lookup_content_svm_values(from_pos, to_pos, m_pos[from_pos],m_pos[to_pos], svm_value, frame);
2270 #ifdef DYNPROG_DEBUG
2271  SG_PRINT("part2: pos1: %i pos2: %i \nsvm_values: ", m_pos[from_pos], m_pos[to_pos])
2273  SG_PRINT("%1.4f ", svm_value[s])
2274  SG_PRINT("\n")
2275 #endif
2276  }
2277  }
2278 
2279  if (PEN.element(to_state, from_state)!=NULL)
2280  {
2281  float64_t nscore = 0 ;
2282  if (is_long_transition)
2283  {
2284  float64_t pen_value_part1 = ((CPlifBase*) PEN.element(to_state, from_state))->lookup_penalty(m_pos[from_pos_thresh]-m_pos[from_pos], svm_value_part1) ;
2285  float64_t pen_value_part2 = ((CPlifBase*) PEN.element(to_state, from_state))->lookup_penalty(m_pos[to_pos]-m_pos[to_pos_thresh], svm_value_part2) ;
2286  nscore= 0.5*pen_value_part1 + 0.5*pen_value_part2 ;
2287  }
2288  else
2289  nscore = ((CPlifBase*) PEN.element(to_state, from_state))->lookup_penalty(m_pos[to_pos]-m_pos[from_pos], svm_value) ;
2290 
2291  if (false)//(nscore<-1e9)
2292  SG_PRINT("is_long_transition=%i (from_pos=%i (%i), to_pos=%i (%i)=> %1.5f\n",
2293  is_long_transition, m_pos[from_pos], from_state, m_pos[to_pos], to_state, nscore) ;
2294 
2295  my_scores[i] += nscore ;
2296 
2297  for (int32_t s=m_num_svms;s<m_num_lin_feat_plifs_cum[m_num_raw_data]; s++)/*set tiling plif values to neutral values (that do not influence derivative calculation)*/
2298  {
2299  svm_value[s]=-CMath::INFTY;
2300  svm_value_part1[s]=-CMath::INFTY;
2301  svm_value_part2[s]=-CMath::INFTY;
2302  }
2303 
2304 #ifdef DYNPROG_DEBUG
2305  //SG_DEBUG("%i. transition penalty: from_state=%i to_state=%i from_pos=%i [%i] to_pos=%i [%i] value=%i\n", i, from_state, to_state, from_pos, m_pos[from_pos], to_pos, m_pos[to_pos], m_pos[to_pos]-m_pos[from_pos])
2306 #endif
2307  if (is_long_transition)
2308  {
2309 #ifdef DYNPROG_DEBUG
2310  float64_t sum_score = 0.0 ;
2311 
2312  for (int kk=0; kk<i; kk++)
2313  sum_score += my_scores[i] ;
2314 
2315  SG_PRINT("is_long_transition=%i (from_pos=%i (%i), to_pos=%i (%i)=> %1.5f, %1.5f --- 1: %1.6f (%i-%i) 2: %1.6f (%i-%i) \n",
2316  is_long_transition, m_pos[from_pos], from_state, m_pos[to_pos], to_state,
2317  nscore, sum_score,
2318  PEN.element(to_state, from_state)->lookup_penalty(m_pos[from_pos_thresh]-m_pos[from_pos], svm_value_part1)*0.5, m_pos[from_pos], m_pos[from_pos_thresh],
2319  PEN.element(to_state, from_state)->lookup_penalty(m_pos[to_pos]-m_pos[to_pos_thresh], svm_value_part2)*0.5, m_pos[to_pos_thresh], m_pos[to_pos]) ;
2320 #endif
2321  }
2322 
2323  if (is_long_transition)
2324  {
2325  ((CPlifBase*) PEN.element(to_state, from_state))->penalty_add_derivative(m_pos[from_pos_thresh]-m_pos[from_pos], svm_value_part1, 0.5) ;
2326  ((CPlifBase*) PEN.element(to_state, from_state))->penalty_add_derivative(m_pos[to_pos]-m_pos[to_pos_thresh], svm_value_part2, 0.5) ;
2327  }
2328  else
2329  ((CPlifBase*) PEN.element(to_state, from_state))->penalty_add_derivative(m_pos[to_pos]-m_pos[from_pos], svm_value, 1) ;
2330 
2331  //SG_PRINT("m_num_raw_data = %i \n", m_num_raw_data)
2332 
2333  // for tiling array and rna-seq data every single measurement must be added to the derivative
2334  // in contrast to the content svm predictions where we have a single value per transition;
2335  // content svm predictions have already been added to the derivative, thus we start with d=1
2336  // instead of d=0
2337  if (is_long_transition)
2338  {
2339  for (int32_t d=1; d<=m_num_raw_data; d++)
2340  {
2341  for (int32_t s=0;s<m_num_lin_feat_plifs_cum[m_num_raw_data]+m_num_intron_plifs;s++)
2342  svm_value[s]=-CMath::INFTY;
2343  float64_t* intensities = SG_MALLOC(float64_t, m_num_probes_cum[d]);
2344  int32_t num_intensities = raw_intensities_interval_query(m_pos[from_pos], m_pos[from_pos_thresh],intensities, d);
2345  for (int32_t k=0;k<num_intensities;k++)
2346  {
2347  for (int32_t j=m_num_lin_feat_plifs_cum[d-1];j<m_num_lin_feat_plifs_cum[d];j++)
2348  svm_value[j]=intensities[k];
2349 
2350  ((CPlifBase*) PEN.element(to_state, from_state))->penalty_add_derivative(-CMath::INFTY, svm_value, 0.5) ;
2351 
2352  }
2353  num_intensities = raw_intensities_interval_query(m_pos[to_pos_thresh], m_pos[to_pos],intensities, d);
2354  for (int32_t k=0;k<num_intensities;k++)
2355  {
2356  for (int32_t j=m_num_lin_feat_plifs_cum[d-1];j<m_num_lin_feat_plifs_cum[d];j++)
2357  svm_value[j]=intensities[k];
2358 
2359  ((CPlifBase*) PEN.element(to_state, from_state))->penalty_add_derivative(-CMath::INFTY, svm_value, 0.5) ;
2360 
2361  }
2362  SG_FREE(intensities);
2363 
2364  }
2365  }
2366  else
2367  {
2368  for (int32_t d=1; d<=m_num_raw_data; d++)
2369  {
2370  for (int32_t s=0;s<m_num_lin_feat_plifs_cum[m_num_raw_data]+m_num_intron_plifs;s++)
2371  svm_value[s]=-CMath::INFTY;
2372  float64_t* intensities = SG_MALLOC(float64_t, m_num_probes_cum[d]);
2373  int32_t num_intensities = raw_intensities_interval_query(m_pos[from_pos], m_pos[to_pos],intensities, d);
2374  //SG_PRINT("m_pos[from_pos]:%i, m_pos[to_pos]:%i, num_intensities:%i\n",m_pos[from_pos],m_pos[to_pos], num_intensities)
2375  for (int32_t k=0;k<num_intensities;k++)
2376  {
2377  for (int32_t j=m_num_lin_feat_plifs_cum[d-1];j<m_num_lin_feat_plifs_cum[d];j++)
2378  svm_value[j]=intensities[k];
2379 
2380  ((CPlifBase*) PEN.element(to_state, from_state))->penalty_add_derivative(-CMath::INFTY, svm_value, 1) ;
2381 
2382  }
2383  SG_FREE(intensities);
2384  }
2385  }
2386 
2387  }
2388 #ifdef DYNPROG_DEBUG
2389  SG_DEBUG("%i. scores[i]=%f\n", i, my_scores[i])
2390 #endif
2391 
2392  //SG_DEBUG("emmission penalty skipped: to_state=%i to_pos=%i value=%1.2f score=%1.2f\n", to_state, to_pos, seq_input.element(to_state, to_pos), 0.0)
2393  for (int32_t k=0; k<max_num_signals; k++)
2394  {
2395  if ((PEN_state_signals.element(to_state,k)==NULL)&&(k==0))
2396  {
2397 #ifdef DYNPROG_DEBUG
2398  SG_DEBUG("%i. emmission penalty: to_state=%i to_pos=%i score=%1.2f (no signal plif)\n", i, to_state, to_pos, seq_input.element(to_state, to_pos, k))
2399 #endif
2400  my_scores[i] += seq_input.element(to_state, to_pos, k) ;
2401  //if (seq_input.element(to_state, to_pos, k) !=0)
2402  // SG_PRINT("features(%i,%i): %f\n",to_state,to_pos,seq_input.element(to_state, to_pos, k))
2403  break ;
2404  }
2405  if (PEN_state_signals.element(to_state, k)!=NULL)
2406  {
2407  float64_t nscore = ((CPlifBase*) PEN_state_signals.element(to_state,k))->lookup_penalty(seq_input.element(to_state, to_pos, k), svm_value) ; // this should be ok for long_transitions (svm_value does not matter)
2408  my_scores[i] += nscore ;
2409 #ifdef DYNPROG_DEBUG
2410  if (false)//(nscore<-1e9)
2411  {
2412  SG_PRINT("is_long_transition=%i (from_pos=%i (%i), from_state=%i, to_pos=%i (%i) to_state=%i=> %1.5f, dim3:%i, seq_input.element(to_state, to_pos, k): %1.4f\n",
2413  is_long_transition, m_pos[from_pos], from_pos, from_state, m_pos[to_pos], to_pos, to_state, nscore, k, seq_input.element(to_state, to_pos, k)) ;
2414  for (int x=0; x<23; x++)
2415  {
2416  for (int i=-10; i<10; i++)
2417  SG_PRINT("%1.4f\t", seq_input.element(x, to_pos+i, k))
2418  SG_PRINT("\n")
2419  }
2420 
2421  }
2422 #endif
2423  //break ;
2424  //int32_t num_current_svms=0;
2425  //int32_t svm_ids[] = {-8, -7, -6, -5, -4, -3, -2, -1};
2426  //SG_PRINT("PEN_state_signals->id: ")
2427  //PEN_state_signals.element(to_state, k)->get_used_svms(&num_current_svms, svm_ids);
2428  //SG_PRINT("\n")
2429  //if (nscore != 0)
2430  //SG_PRINT("%i. emmission penalty: to_state=%i to_pos=%i value=%1.2f score=%1.2f k=%i\n", i, to_state, to_pos, seq_input.element(to_state, to_pos, k), nscore, k)
2431 #ifdef DYNPROG_DEBUG
2432  SG_DEBUG("%i. emmission penalty: to_state=%i to_pos=%i value=%1.2f score=%1.2f k=%i\n", i, to_state, to_pos, seq_input.element(to_state, to_pos, k), nscore, k)
2433 #endif
2434  ((CPlifBase*) PEN_state_signals.element(to_state,k))->penalty_add_derivative(seq_input.element(to_state, to_pos, k), svm_value, 1) ; // this should be ok for long_transitions (svm_value does not matter)
2435  } else
2436  break ;
2437  }
2438 
2439  //#ifdef DYNPROG_DEBUG
2440  //SG_PRINT("scores[%i]=%f (final) \n", i, my_scores[i])
2441  //SG_PRINT("losses[%i]=%f (final) , total_loss: %f \n", i, my_losses[i], total_loss)
2442  total_score += my_scores[i] ;
2443  total_loss += my_losses[i] ;
2444  //#endif
2445  }
2446  //#ifdef DYNPROG_DEBUG
2447  //SG_PRINT("total score = %f \n", total_score)
2448  //SG_PRINT("total loss = %f \n", total_loss)
2449  //#endif
2450  SG_FREE(svm_value);
2451  SG_FREE(svm_value_part1);
2452  SG_FREE(svm_value_part2);
2453 }
2454 
2455 int32_t CDynProg::raw_intensities_interval_query(const int32_t from_pos, const int32_t to_pos, float64_t* intensities, int32_t type)
2456 {
2457  ASSERT(from_pos<to_pos)
2458  int32_t num_intensities = 0;
2459  int32_t* p_tiling_pos = &m_probe_pos[m_num_probes_cum[type-1]];
2460  float64_t* p_tiling_data = &m_raw_intensities[m_num_probes_cum[type-1]];
2461  int32_t last_pos;
2462  int32_t num = m_num_probes_cum[type-1];
2463  while (*p_tiling_pos<to_pos)
2464  {
2465  if (*p_tiling_pos>=from_pos)
2466  {
2467  intensities[num_intensities] = *p_tiling_data;
2468  num_intensities++;
2469  }
2470  num++;
2471  if (num>=m_num_probes_cum[type])
2472  break;
2473  last_pos = *p_tiling_pos;
2474  p_tiling_pos++;
2475  p_tiling_data++;
2476  ASSERT(last_pos<*p_tiling_pos)
2477  }
2478  return num_intensities;
2479 }
2480 
2481 void CDynProg::lookup_content_svm_values(const int32_t from_state, const int32_t to_state, const int32_t from_pos, const int32_t to_pos, float64_t* svm_values, int32_t frame)
2482 {
2483 #ifdef DYNPROG_TIMING_DETAIL
2484  MyTime.start() ;
2485 #endif
2486 // ASSERT(from_state<to_state)
2487 // if (!(from_pos<to_pos))
2488 // SG_ERROR("from_pos!<to_pos, from_pos: %i to_pos: %i \n",from_pos,to_pos)
2489  for (int32_t i=0;i<m_num_svms;i++)
2490  {
2491  float64_t to_val = m_lin_feat.get_element(i, to_state);
2492  float64_t from_val = m_lin_feat.get_element(i, from_state);
2493  svm_values[i] = (to_val-from_val)/(to_pos-from_pos);
2494  }
2495  for (int32_t i=m_num_svms;i<m_num_lin_feat_plifs_cum[m_num_raw_data];i++)
2496  {
2497  float64_t to_val = m_lin_feat.get_element(i, to_state);
2498  float64_t from_val = m_lin_feat.get_element(i, from_state);
2499  svm_values[i] = to_val-from_val ;
2500  }
2501  if (m_intron_list)
2502  {
2503  int32_t* support = SG_MALLOC(int32_t, m_num_intron_plifs);
2504  m_intron_list->get_intron_support(support, from_state, to_state);
2505  int32_t intron_list_start = m_num_lin_feat_plifs_cum[m_num_raw_data];
2506  int32_t intron_list_end = m_num_lin_feat_plifs_cum[m_num_raw_data]+m_num_intron_plifs;
2507  int32_t cnt = 0;
2508  for (int32_t i=intron_list_start; i<intron_list_end;i++)
2509  {
2510  svm_values[i] = (float64_t) (support[cnt]);
2511  cnt++;
2512  }
2513  //if (to_pos>3990 && to_pos<4010)
2514  // SG_PRINT("from_state:%i to_state:%i support[0]:%i support[1]:%i\n",from_state, to_state, support[0], support[1])
2515  SG_FREE(support);
2516  }
2517  // find the correct row with precomputed frame predictions
2518  if (frame!=-1)
2519  {
2520  svm_values[frame_plifs[0]] = 1e10;
2521  svm_values[frame_plifs[1]] = 1e10;
2522  svm_values[frame_plifs[2]] = 1e10;
2523  int32_t global_frame = from_pos%3;
2524  int32_t row = ((global_frame+frame)%3)+4;
2525  float64_t to_val = m_lin_feat.get_element(row, to_state);
2526  float64_t from_val = m_lin_feat.get_element(row, from_state);
2527  svm_values[frame+frame_plifs[0]] = (to_val-from_val)/(to_pos-from_pos);
2528  }
2529 #ifdef DYNPROG_TIMING_DETAIL
2530  MyTime.stop() ;
2531  content_svm_values_time += MyTime.time_diff_sec() ;
2532 #endif
2533 }
2534 void CDynProg::set_intron_list(CIntronList* intron_list, int32_t num_plifs)
2535 {
2536  m_intron_list = intron_list;
2537  m_num_intron_plifs = num_plifs;
2538 }
2539 

SHOGUN Machine Learning Toolbox - Documentation