SHOGUN  4.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
DynProg.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 2 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2007 Soeren Sonnenburg
8  * Written (W) 1999-2007 Gunnar Raetsch
9  * Written (W) 2008-2009 Jonas Behr
10  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
11  */
12 
15 #include <shogun/io/SGIO.h>
16 #include <shogun/lib/config.h>
19 #include <shogun/structure/Plif.h>
21 
22 #include <stdlib.h>
23 #include <stdio.h>
24 #include <time.h>
25 #include <ctype.h>
26 #include <limits.h>
27 
28 using namespace shogun;
29 
30 //#define USE_TMP_ARRAYCLASS
31 //#define DYNPROG_DEBUG
32 
33 int32_t CDynProg::word_degree_default[4]={3,4,5,6} ;
34 int32_t CDynProg::cum_num_words_default[5]={0,64,320,1344,5440} ;
35 int32_t CDynProg::frame_plifs[3]={4,5,6};
36 int32_t CDynProg::num_words_default[4]= {64,256,1024,4096} ;
37 int32_t CDynProg::mod_words_default[32] = {1,1,1,1,1,1,1,1,
38  1,1,1,1,1,1,1,1,
39  0,0,0,0,0,0,0,0,
40  0,0,0,0,0,0,0,0} ;
41 bool CDynProg::sign_words_default[16] = {true,true,true,true,true,true,true,true,
42  false,false,false,false,false,false,false,false} ; // whether to use counts or signum of counts
43 int32_t CDynProg::string_words_default[16] = {0,0,0,0,0,0,0,0,
44  1,1,1,1,1,1,1,1} ; // which string should be used
45 
46 CDynProg::CDynProg(int32_t num_svms /*= 8 */)
47 : CSGObject(), m_transition_matrix_a_id(1,1), m_transition_matrix_a(1,1),
48  m_transition_matrix_a_deriv(1,1), m_initial_state_distribution_p(1),
49  m_initial_state_distribution_p_deriv(1), m_end_state_distribution_q(1),
50  m_end_state_distribution_q_deriv(1),
51 
52  // multi svm
53  m_num_degrees(4),
54  m_num_svms(num_svms),
55  m_word_degree(word_degree_default, m_num_degrees, true, true),
56  m_cum_num_words(cum_num_words_default, m_num_degrees+1, true, true),
57  m_cum_num_words_array(m_cum_num_words.get_array()),
58  m_num_words(num_words_default, m_num_degrees, true, true),
59  m_num_words_array(m_num_words.get_array()),
60  m_mod_words(mod_words_default, m_num_svms, 2, true, true),
61  m_mod_words_array(m_mod_words.get_array()),
62  m_sign_words(sign_words_default, m_num_svms, true, true),
63  m_sign_words_array(m_sign_words.get_array()),
64  m_string_words(string_words_default, m_num_svms, true, true),
65  m_string_words_array(m_string_words.get_array()),
66  //m_svm_pos_start(m_num_degrees),
67  m_num_unique_words(m_num_degrees),
68  m_svm_arrays_clean(true),
69 
70  m_max_a_id(0), m_observation_matrix(1,1,1),
71  m_pos(1),
72  m_seq_len(0),
73  m_orf_info(1,2),
74  m_plif_list(1),
75  m_genestr(1), m_wordstr(NULL), m_dict_weights(1,1), m_segment_loss(1,1,2),
76  m_segment_ids(1),
77  m_segment_mask(1),
78  m_my_state_seq(1),
79  m_my_pos_seq(1),
80  m_my_scores(1),
81  m_my_losses(1),
82  m_scores(1),
83  m_states(1,1),
84  m_positions(1,1),
85 
86  m_seq_sparse1(NULL),
87  m_seq_sparse2(NULL),
88  m_plif_matrices(NULL),
89 
90  m_genestr_stop(1),
91  m_intron_list(NULL),
92  m_num_intron_plifs(0),
93  m_lin_feat(1,1), //by Jonas
94  m_raw_intensities(NULL),
95  m_probe_pos(NULL),
96  m_num_probes_cum(NULL),
97  m_num_lin_feat_plifs_cum(NULL),
98  m_num_raw_data(0),
99 
100  m_long_transitions(true),
101  m_long_transition_threshold(1000)
102 {
103  trans_list_forward = NULL ;
104  trans_list_forward_cnt = NULL ;
105  trans_list_forward_val = NULL ;
106  trans_list_forward_id = NULL ;
107  trans_list_len = 0 ;
108 
109  mem_initialized = true ;
110 
111  m_N=1;
112 
113  m_raw_intensities = NULL;
114  m_probe_pos = NULL;
115  m_num_probes_cum = SG_MALLOC(int32_t, 100);
116  m_num_probes_cum[0] = 0;
117  //m_use_tiling=false;
118  m_num_lin_feat_plifs_cum = SG_MALLOC(int32_t, 100);
120  m_num_raw_data = 0;
121 #ifdef ARRAY_STATISTICS
122  m_word_degree.set_array_name("word_degree");
123 #endif
124 
125  m_transition_matrix_a_id.set_array_name("transition_matrix_a_id");
126  m_transition_matrix_a.set_array_name("transition_matrix_a");
127  m_transition_matrix_a_deriv.set_array_name("transition_matrix_a_deriv");
128  m_mod_words.set_array_name("mod_words");
129  m_orf_info.set_array_name("orf_info");
130  m_segment_sum_weights.set_array_name("segment_sum_weights");
131  m_dict_weights.set_array_name("dict_weights");
132  m_states.set_array_name("states");
133  m_positions.set_array_name("positions");
134  m_lin_feat.set_array_name("lin_feat");
135 
136 
137  m_observation_matrix.set_array_name("m_observation_matrix");
138  m_segment_loss.set_array_name("m_segment_loss");
140 }
141 
143 {
144  if (trans_list_forward_cnt)
145  SG_FREE(trans_list_forward_cnt);
146  if (trans_list_forward)
147  {
148  for (int32_t i=0; i<trans_list_len; i++)
149  {
150  if (trans_list_forward[i])
151  SG_FREE(trans_list_forward[i]);
152  }
153  SG_FREE(trans_list_forward);
154  }
155  if (trans_list_forward_val)
156  {
157  for (int32_t i=0; i<trans_list_len; i++)
158  {
159  if (trans_list_forward_val[i])
160  SG_FREE(trans_list_forward_val[i]);
161  }
162  SG_FREE(trans_list_forward_val);
163  }
164  if (trans_list_forward_id)
165  {
166  for (int32_t i=0; i<trans_list_len; i++)
167  {
168  if (trans_list_forward_id[i])
169  SG_FREE(trans_list_forward_id[i]);
170  }
171  SG_FREE(trans_list_forward_id);
172  }
173  if (m_raw_intensities)
174  SG_FREE(m_raw_intensities);
175  if (m_probe_pos)
176  SG_FREE(m_probe_pos);
177  if (m_num_probes_cum)
178  SG_FREE(m_num_probes_cum);
180  SG_FREE(m_num_lin_feat_plifs_cum);
181 
182  delete m_intron_list;
183 
188 }
189 
192 {
193  return m_num_svms;
194 }
195 
197 {
198  int32_t length=m_genestr.get_dim1();
199 
200  m_genestr_stop.resize_array(length) ;
202  m_genestr_stop.set_array_name("genestr_stop") ;
203  {
204  for (int32_t i=0; i<length-2; i++)
205  if ((m_genestr[i]=='t' || m_genestr[i]=='T') &&
206  (((m_genestr[i+1]=='a' || m_genestr[i+1]=='A') &&
207  (m_genestr[i+2]=='a' || m_genestr[i+2]=='g' || m_genestr[i+2]=='A' || m_genestr[i+2]=='G')) ||
208  ((m_genestr[i+1]=='g'||m_genestr[i+1]=='G') && (m_genestr[i+2]=='a' || m_genestr[i+2]=='A') )))
209  {
210  m_genestr_stop.element(i)=true ;
211  }
212  else
213  m_genestr_stop.element(i)=false ;
214  m_genestr_stop.element(length-2)=false ;
215  m_genestr_stop.element(length-1)=false ;
216  }
217 }
218 
219 void CDynProg::set_num_states(int32_t p_N)
220 {
221  m_N=p_N ;
222 
230 
232 }
233 
235 {
236  return m_N;
237 }
238 
240  int32_t* probe_pos, float64_t* intensities, const int32_t num_probes)
241 {
242  m_num_raw_data++;
244 
245  int32_t* tmp_probe_pos = SG_MALLOC(int32_t, m_num_probes_cum[m_num_raw_data]);
246  float64_t* tmp_raw_intensities = SG_MALLOC(float64_t, m_num_probes_cum[m_num_raw_data]);
247 
248 
249  if (m_num_raw_data==1){
250  memcpy(tmp_probe_pos, probe_pos, num_probes*sizeof(int32_t));
251  memcpy(tmp_raw_intensities, intensities, num_probes*sizeof(float64_t));
252  //SG_PRINT("raw_intens:%f \n",*tmp_raw_intensities+2)
253  }else{
254  memcpy(tmp_probe_pos, m_probe_pos, m_num_probes_cum[m_num_raw_data-1]*sizeof(int32_t));
255  memcpy(tmp_raw_intensities, m_raw_intensities, m_num_probes_cum[m_num_raw_data-1]*sizeof(float64_t));
256  memcpy(tmp_probe_pos+m_num_probes_cum[m_num_raw_data-1], probe_pos, num_probes*sizeof(int32_t));
257  memcpy(tmp_raw_intensities+m_num_probes_cum[m_num_raw_data-1], intensities, num_probes*sizeof(float64_t));
258  }
259  SG_FREE(m_probe_pos);
260  SG_FREE(m_raw_intensities);
261  m_probe_pos = tmp_probe_pos; //SG_MALLOC(int32_t, num_probes);
262  m_raw_intensities = tmp_raw_intensities;//SG_MALLOC(float64_t, num_probes);
263 
264  //memcpy(m_probe_pos, probe_pos, num_probes*sizeof(int32_t));
265  //memcpy(m_raw_intensities, intensities, num_probes*sizeof(float64_t));
266 
267 }
268 
269 void CDynProg::init_content_svm_value_array(const int32_t p_num_svms)
270 {
271  m_lin_feat.resize_array(p_num_svms, m_seq_len);
272 
273  // initialize array
274  for (int s=0; s<p_num_svms; s++)
275  for (int p=0; p<m_seq_len; p++)
276  m_lin_feat.set_element(0.0, s, p) ;
277 }
278 
279 void CDynProg::resize_lin_feat(const int32_t num_new_feat)
280 {
281  int32_t dim1, dim2;
282  m_lin_feat.get_array_size(dim1, dim2);
284  ASSERT(dim2==m_seq_len) // == number of candidate positions
285 
286 
287 
288  float64_t* arr = m_lin_feat.get_array();
289  float64_t* tmp = SG_MALLOC(float64_t, (dim1+num_new_feat)*dim2);
290  memset(tmp, 0, (dim1+num_new_feat)*dim2*sizeof(float64_t)) ;
291  for(int32_t j=0;j<m_seq_len;j++)
292  for(int32_t k=0;k<m_num_lin_feat_plifs_cum[m_num_raw_data-1];k++)
293  tmp[j*(dim1+num_new_feat)+k] = arr[j*dim1+k];
294 
295  m_lin_feat.set_array(tmp, dim1+num_new_feat,dim2, true, true);// copy array and free it later
296  SG_FREE(tmp);
297 
298  /*for(int32_t j=0;j<5;j++)
299  {
300  for(int32_t k=0;k<m_num_lin_feat_plifs_cum[m_num_raw_data];k++)
301  {
302  SG_PRINT("(%i,%i)%f ",k,j,m_lin_feat.get_element(k,j))
303  }
304  SG_PRINT("\n")
305  }
306  m_lin_feat.get_array_size(dim1,dim2);
307  SG_PRINT("resize_lin_feat: dim1:%i, dim2:%i\n",dim1,dim2)*/
308 
309  //SG_PRINT("resize_lin_feat: done\n")
310 }
311 
313  CPlif** PEN, const int32_t* tiling_plif_ids, const int32_t num_tiling_plifs)
314 {
316  float64_t* tiling_plif = SG_MALLOC(float64_t, num_tiling_plifs);
319  svm_value[i]=0.0;
320  int32_t* tiling_rows = SG_MALLOC(int32_t, num_tiling_plifs);
321  for (int32_t i=0; i<num_tiling_plifs; i++)
322  {
323  tiling_plif[i]=0.0;
324  CPlif * plif = PEN[tiling_plif_ids[i]];
325  tiling_rows[i] = plif->get_use_svm();
326 
327  ASSERT(tiling_rows[i]-1==m_num_lin_feat_plifs_cum[m_num_raw_data-1]+i)
328  }
329  resize_lin_feat(num_tiling_plifs);
330 
331 
332  int32_t* p_tiling_pos = &m_probe_pos[m_num_probes_cum[m_num_raw_data-1]];
333  float64_t* p_tiling_data = &m_raw_intensities[m_num_probes_cum[m_num_raw_data-1]];
334  int32_t num=m_num_probes_cum[m_num_raw_data-1];
335 
336  for (int32_t pos_idx=0;pos_idx<m_seq_len;pos_idx++)
337  {
338  while (num<m_num_probes_cum[m_num_raw_data]&&*p_tiling_pos<m_pos[pos_idx])
339  {
340  for (int32_t i=0; i<num_tiling_plifs; i++)
341  {
342  svm_value[m_num_lin_feat_plifs_cum[m_num_raw_data-1]+i]=*p_tiling_data;
343  CPlif * plif = PEN[tiling_plif_ids[i]];
344  ASSERT(m_num_lin_feat_plifs_cum[m_num_raw_data-1]+i==plif->get_use_svm()-1)
345  plif->set_do_calc(true);
346  tiling_plif[i]+=plif->lookup_penalty(0,svm_value);
347  plif->set_do_calc(false);
348  }
349  p_tiling_data++;
350  p_tiling_pos++;
351  num++;
352  }
353  for (int32_t i=0; i<num_tiling_plifs; i++)
354  m_lin_feat.set_element(tiling_plif[i],tiling_rows[i]-1,pos_idx);
355  }
356  SG_FREE(svm_value);
357  SG_FREE(tiling_plif);
358  SG_FREE(tiling_rows);
359 }
360 
362 {
363  SG_FREE(m_wordstr);
364  m_wordstr=SG_MALLOC(uint16_t**, 5440);
365  int32_t k=0;
366  int32_t genestr_len=m_genestr.get_dim1();
367 
368  m_wordstr[k]=SG_MALLOC(uint16_t*, m_num_degrees);
369  for (int32_t j=0; j<m_num_degrees; j++)
370  {
371  m_wordstr[k][j]=NULL ;
372  {
373  m_wordstr[k][j]=SG_MALLOC(uint16_t, genestr_len);
374  for (int32_t i=0; i<genestr_len; i++)
375  switch (m_genestr[i])
376  {
377  case 'A':
378  case 'a': m_wordstr[k][j][i]=0 ; break ;
379  case 'C':
380  case 'c': m_wordstr[k][j][i]=1 ; break ;
381  case 'G':
382  case 'g': m_wordstr[k][j][i]=2 ; break ;
383  case 'T':
384  case 't': m_wordstr[k][j][i]=3 ; break ;
385  default: ASSERT(0)
386  }
388  }
389  }
390 }
391 
393 {
394  for (int32_t s=0; s<m_num_svms; s++)
395  m_lin_feat.set_element(0.0, s, 0);
396 
397  for (int32_t p=0 ; p<m_seq_len-1 ; p++)
398  {
399  int32_t from_pos = m_pos[p];
400  int32_t to_pos = m_pos[p+1];
401  float64_t* my_svm_values_unnormalized = SG_MALLOC(float64_t, m_num_svms);
402  //SG_PRINT("%i(%i->%i) ",p,from_pos, to_pos)
403 
404  ASSERT(from_pos<=m_genestr.get_dim1())
405  ASSERT(to_pos<=m_genestr.get_dim1())
406 
407  for (int32_t s=0; s<m_num_svms; s++)
408  my_svm_values_unnormalized[s]=0.0;//precomputed_svm_values.element(s,p);
409 
410  for (int32_t i=from_pos; i<to_pos; i++)
411  {
412  for (int32_t j=0; j<m_num_degrees; j++)
413  {
414  uint16_t word = m_wordstr[0][j][i] ;
415  for (int32_t s=0; s<m_num_svms; s++)
416  {
417  // check if this k-mer should be considered for this SVM
418  if (m_mod_words.get_element(s,0)==3 && i%3!=m_mod_words.get_element(s,1))
419  continue;
420  my_svm_values_unnormalized[s] += m_dict_weights[(word+m_cum_num_words_array[j])+s*m_cum_num_words_array[m_num_degrees]] ;
421  }
422  }
423  }
424  for (int32_t s=0; s<m_num_svms; s++)
425  {
426  float64_t prev = m_lin_feat.get_element(s, p);
427  //SG_PRINT("elem (%i, %i, %f)\n", s, p, prev)
428  if (prev<-1e20 || prev>1e20)
429  {
430  SG_ERROR("initialization missing (%i, %i, %f)\n", s, p, prev)
431  prev=0 ;
432  }
433  m_lin_feat.set_element(prev + my_svm_values_unnormalized[s], s, p+1);
434  }
435  SG_FREE(my_svm_values_unnormalized);
436  }
437  //for (int32_t j=0; j<m_num_degrees; j++)
438  // SG_FREE(m_wordstr[0][j]);
439  //SG_FREE(m_wordstr[0]);
440 }
441 
443 {
444  if (!(p.vlen==m_N))
445  SG_ERROR("length of start prob vector p (%i) is not equal to the number of states (%i), N: %i\n",p.vlen, m_N)
446 
448 }
449 
451 {
452  if (!(q.vlen==m_N))
453  SG_ERROR("length of end prob vector q (%i) is not equal to the number of states (%i), N: %i\n",q.vlen, m_N)
455 }
456 
458 {
459  ASSERT(a.num_cols==m_N)
460  ASSERT(a.num_rows==m_N)
461  m_transition_matrix_a.set_array(a.matrix, m_N, m_N, true, true);
463 }
464 
466 {
467  ASSERT(a.num_cols==m_N)
468  ASSERT(a.num_rows==m_N)
470  m_max_a_id = 0;
471  for (int32_t i=0; i<m_N; i++)
472  {
473  for (int32_t j=0; j<m_N; j++)
475  }
476 }
477 
479 {
480  int32_t num_trans=a_trans.num_rows;
481  int32_t num_cols=a_trans.num_cols;
482 
483  //CMath::display_matrix(a_trans.matrix,num_trans, num_cols,"a_trans");
484 
485  if (!((num_cols==3) || (num_cols==4)))
486  SG_ERROR("!((num_cols==3) || (num_cols==4)), num_cols: %i\n",num_cols)
487 
488  SG_FREE(trans_list_forward);
489  SG_FREE(trans_list_forward_cnt);
490  SG_FREE(trans_list_forward_val);
491  SG_FREE(trans_list_forward_id);
492 
493  trans_list_forward = NULL ;
494  trans_list_forward_cnt = NULL ;
495  trans_list_forward_val = NULL ;
496  trans_list_len = 0 ;
497 
500 
501  mem_initialized = true ;
502 
503  trans_list_forward_cnt=NULL ;
504  trans_list_len = m_N ;
505  trans_list_forward = SG_MALLOC(T_STATES*, m_N);
506  trans_list_forward_cnt = SG_MALLOC(T_STATES, m_N);
507  trans_list_forward_val = SG_MALLOC(float64_t*, m_N);
508  trans_list_forward_id = SG_MALLOC(int32_t*, m_N);
509 
510  int32_t start_idx=0;
511  for (int32_t j=0; j<m_N; j++)
512  {
513  int32_t old_start_idx=start_idx;
514 
515  while (start_idx<num_trans && a_trans.matrix[start_idx+num_trans]==j)
516  {
517  start_idx++;
518 
519  if (start_idx>1 && start_idx<num_trans)
520  ASSERT(a_trans.matrix[start_idx+num_trans-1] <= a_trans.matrix[start_idx+num_trans])
521  }
522 
523  if (start_idx>1 && start_idx<num_trans)
524  ASSERT(a_trans.matrix[start_idx+num_trans-1] <= a_trans.matrix[start_idx+num_trans])
525 
526  int32_t len=start_idx-old_start_idx;
527  ASSERT(len>=0)
528 
529  trans_list_forward_cnt[j] = 0 ;
530 
531  if (len>0)
532  {
533  trans_list_forward[j] = SG_MALLOC(T_STATES, len);
534  trans_list_forward_val[j] = SG_MALLOC(float64_t, len);
535  trans_list_forward_id[j] = SG_MALLOC(int32_t, len);
536  }
537  else
538  {
539  trans_list_forward[j] = NULL;
540  trans_list_forward_val[j] = NULL;
541  trans_list_forward_id[j] = NULL;
542  }
543  }
544 
545  for (int32_t i=0; i<num_trans; i++)
546  {
547  int32_t from_state = (int32_t)a_trans.matrix[i] ;
548  int32_t to_state = (int32_t)a_trans.matrix[i+num_trans] ;
549  float64_t val = a_trans.matrix[i+num_trans*2] ;
550  int32_t id = 0 ;
551  if (num_cols==4)
552  id = (int32_t)a_trans.matrix[i+num_trans*3] ;
553  //SG_DEBUG("id=%i\n", id)
554 
555  ASSERT(to_state>=0 && to_state<m_N)
556  ASSERT(from_state>=0 && from_state<m_N)
557 
558  trans_list_forward[to_state][trans_list_forward_cnt[to_state]]=from_state ;
559  trans_list_forward_val[to_state][trans_list_forward_cnt[to_state]]=val ;
560  trans_list_forward_id[to_state][trans_list_forward_cnt[to_state]]=id ;
561  trans_list_forward_cnt[to_state]++ ;
562  m_transition_matrix_a.element(from_state, to_state) = val ;
563  m_transition_matrix_a_id.element(from_state, to_state) = id ;
564  //SG_PRINT("from_state:%i to_state:%i trans_matrix_a_id:%i \n",from_state, to_state,m_transition_matrix_a_id.element(from_state, to_state))
565  } ;
566 
567  m_max_a_id = 0 ;
568  for (int32_t i=0; i<m_N; i++)
569  for (int32_t j=0; j<m_N; j++)
570  {
571  //if (m_transition_matrix_a_id.element(i,j))
572  //SG_DEBUG("(%i,%i)=%i\n", i,j, m_transition_matrix_a_id.element(i,j))
574  }
575  //SG_DEBUG("m_max_a_id=%i\n", m_max_a_id)
576 }
577 
578 
580 {
581  //for (int32_t i=0; i<mod_words_input.num_cols; i++)
582  //{
583  // for (int32_t j=0; j<mod_words_input.num_rows; j++)
584  // SG_PRINT("%i ",mod_words_input[i*mod_words_input.num_rows+j])
585  // SG_PRINT("\n")
586  //}
587  m_svm_arrays_clean=false ;
588 
589  ASSERT(m_num_svms==mod_words_input.num_rows)
590  ASSERT(mod_words_input.num_cols==2)
591 
592  m_mod_words.set_array(mod_words_input.matrix, mod_words_input.num_rows, 2, true, true) ;
594 
595  /*SG_DEBUG("m_mod_words=[")
596  for (int32_t i=0; i<mod_words_input.num_rows; i++)
597  SG_DEBUG("%i, ", p_mod_words_array[i])
598  SG_DEBUG("]\n") */
599 }
600 
602 {
603  //SG_DEBUG("wd_dim1=%d, m_cum_num_words=%d, m_num_words=%d, m_svm_pos_start=%d, num_uniq_w=%d, mod_words_dims=(%d,%d), sign_w=%d,string_w=%d\n m_num_degrees=%d, m_num_svms=%d, m_num_strings=%d", m_word_degree.get_dim1(), m_cum_num_words.get_dim1(), m_num_words.get_dim1(), m_svm_pos_start.get_dim1(), m_num_unique_words.get_dim1(), m_mod_words.get_dim1(), m_mod_words.get_dim2(), m_sign_words.get_dim1(), m_string_words.get_dim1(), m_num_degrees, m_num_svms, m_num_strings)
607  //(word_used.get_dim1()==m_num_degrees) &&
608  //(word_used.get_dim2()==m_num_words[m_num_degrees-1]) &&
609  //(word_used.get_dim3()==m_num_strings) &&
610  // (svm_values_unnormalized.get_dim1()==m_num_degrees) &&
611  // (svm_values_unnormalized.get_dim2()==m_num_svms) &&
612  //(m_svm_pos_start.get_dim1()==m_num_degrees) &&
615  (m_mod_words.get_dim2()==2) &&
618  {
619  m_svm_arrays_clean=true ;
620  return true ;
621  }
622  else
623  {
626  (m_mod_words.get_dim2()==2) &&
629  SG_PRINT("OK\n")
630  else
631  SG_PRINT("not OK\n")
632 
634  SG_WARNING("SVM array: word_degree.get_dim1()!=m_num_degrees")
636  SG_WARNING("SVM array: m_cum_num_words.get_dim1()!=m_num_degrees+1")
638  SG_WARNING("SVM array: m_num_words.get_dim1()==m_num_degrees")
639  //if (!(m_svm_pos_start.get_dim1()==m_num_degrees))
640  // SG_WARNING("SVM array: m_svm_pos_start.get_dim1()!=m_num_degrees")
642  SG_WARNING("SVM array: m_num_unique_words.get_dim1()!=m_num_degrees")
643  if (!(m_mod_words.get_dim1()==m_num_svms))
644  SG_WARNING("SVM array: m_mod_words.get_dim1()!=num_svms")
645  if (!(m_mod_words.get_dim2()==2))
646  SG_WARNING("SVM array: m_mod_words.get_dim2()!=2")
647  if (!(m_sign_words.get_dim1()==m_num_svms))
648  SG_WARNING("SVM array: m_sign_words.get_dim1()!=num_svms")
650  SG_WARNING("SVM array: m_string_words.get_dim1()!=num_svms")
651 
652  m_svm_arrays_clean=false ;
653  return false ;
654  }
655 }
656 
658 {
659  if (seq.num_dims!=3)
660  SG_ERROR("Expected 3-dimensional Matrix\n")
661 
662  int32_t N=seq.dims[0];
663  int32_t cand_pos=seq.dims[1];
664  int32_t max_num_features=seq.dims[2];
665 
666  if (!m_svm_arrays_clean)
667  {
668  SG_ERROR("SVM arrays not clean")
669  return ;
670  } ;
671 
672  ASSERT(N==m_N)
673  ASSERT(cand_pos==m_seq_len)
676 
677  m_observation_matrix.set_array(seq.array, N, m_seq_len, max_num_features, true, true) ;
678 }
680 {
681  return m_seq_len;
682 }
683 
685 {
686  ASSERT(seg_path.num_rows==2)
687  ASSERT(seg_path.num_cols==m_seq_len)
688 
689  if (seg_path.matrix!=NULL)
690  {
691  int32_t *segment_ids = SG_MALLOC(int32_t, m_seq_len);
692  float64_t *segment_mask = SG_MALLOC(float64_t, m_seq_len);
693  for (int32_t i=0; i<m_seq_len; i++)
694  {
695  segment_ids[i] = (int32_t)seg_path.matrix[2*i] ;
696  segment_mask[i] = seg_path.matrix[2*i+1] ;
697  }
698  best_path_set_segment_ids_mask(segment_ids, segment_mask, m_seq_len) ;
699  SG_FREE(segment_ids);
700  SG_FREE(segment_mask);
701  }
702  else
703  {
704  int32_t *izeros = SG_MALLOC(int32_t, m_seq_len);
705  float64_t *dzeros = SG_MALLOC(float64_t, m_seq_len);
706  for (int32_t i=0; i<m_seq_len; i++)
707  {
708  izeros[i]=0 ;
709  dzeros[i]=0.0 ;
710  }
711  best_path_set_segment_ids_mask(izeros, dzeros, m_seq_len) ;
712  SG_FREE(izeros);
713  SG_FREE(dzeros);
714  }
715 }
716 
718 {
719  m_pos.set_array(pos.vector, pos.vlen, true, true) ;
720  m_seq_len = pos.vlen;
721 }
722 
724 {
725  if (orf_info.num_cols!=2)
726  SG_ERROR("orf_info size incorrect %i!=2\n", orf_info.num_cols)
727 
728  m_orf_info.set_array(orf_info.matrix, orf_info.num_rows, orf_info.num_cols, true, true) ;
729  m_orf_info.set_array_name("orf_info") ;
730 }
731 
733 {
734  if ((!seq_sparse1 && seq_sparse2) || (seq_sparse1 && !seq_sparse2))
735  SG_ERROR("Sparse features must either both be NULL or both NON-NULL\n")
736 
739 
740  m_seq_sparse1=seq_sparse1;
741  m_seq_sparse2=seq_sparse2;
744 }
745 
747 {
749 
750  m_plif_matrices=pm;
751 
753 }
754 
756 {
757  ASSERT(genestr.vector)
758  ASSERT(genestr.vlen>0)
759 
760  m_genestr.set_array(genestr.vector, genestr.vlen, true, true) ;
761 }
762 
763 void CDynProg::set_my_state_seq(int32_t* my_state_seq)
764 {
765  ASSERT(my_state_seq && m_seq_len>0)
767  for (int32_t i=0; i<m_seq_len; i++)
768  m_my_state_seq[i]=my_state_seq[i];
769 }
770 
771 void CDynProg::set_my_pos_seq(int32_t* my_pos_seq)
772 {
773  ASSERT(my_pos_seq && m_seq_len>0)
775  for (int32_t i=0; i<m_seq_len; i++)
776  m_my_pos_seq[i]=my_pos_seq[i];
777 }
778 
780 {
781  if (m_num_svms!=dictionary_weights.num_cols)
782  {
783  SG_ERROR("m_dict_weights array does not match num_svms=%i!=%i\n",
784  m_num_svms, dictionary_weights.num_cols) ;
785  }
786 
787  m_dict_weights.set_array(dictionary_weights.matrix, dictionary_weights.num_rows, m_num_svms, true, true) ;
788 
789  // initialize, so it does not bother when not used
796 }
797 
799 {
800  int32_t m=segment_loss.num_rows;
801  int32_t n=segment_loss.num_cols;
802  // here we need two matrices. Store it in one: 2N x N
803  if (2*m!=n)
804  SG_ERROR("segment_loss should be 2 x quadratic matrix: %i!=%i\n", 2*m, n)
805 
806  if (m!=m_max_a_id+1)
807  SG_ERROR("segment_loss size should match m_max_a_id: %i!=%i\n", m, m_max_a_id+1)
808 
809  m_segment_loss.set_array(segment_loss.matrix, m, n/2, 2, true, true) ;
810  /*for (int32_t i=0; i<n; i++)
811  for (int32_t j=0; j<n; j++)
812  SG_DEBUG("loss(%i,%i)=%f\n", i,j, m_segment_loss.element(0,i,j)) */
813 }
814 
816  int32_t* segment_ids, float64_t* segment_mask, int32_t m)
817 {
818 
820  SG_ERROR("size of segment_ids or segment_mask (%i) does not match the size of the feature matrix (%i)", m, m_observation_matrix.get_dim2())
821  int32_t max_id = 0;
822  for (int32_t i=1;i<m;i++)
823  max_id = CMath::max(max_id,segment_ids[i]);
824  //SG_PRINT("max_id: %i, m:%i\n",max_id, m)
825  m_segment_ids.set_array(segment_ids, m, true, true) ;
826  m_segment_ids.set_array_name("m_segment_ids");
827  m_segment_mask.set_array(segment_mask, m, true, true) ;
828  m_segment_mask.set_array_name("m_segment_mask");
829 
833 }
834 
836 {
838  memcpy(scores.vector,m_scores.get_array(), sizeof(float64_t)*(m_scores.get_dim1()));
839 
840  return scores;
841 }
842 
844 {
846 
847  int32_t sz = sizeof(int32_t)*( m_states.get_dim1() * m_states.get_dim2() );
848  memcpy(states.matrix ,m_states.get_array(),sz);
849 
850  return states;
851 }
852 
854 {
856 
857  int32_t sz = sizeof(int32_t)*(m_positions.get_dim1()*m_positions.get_dim2());
858  memcpy(positions.matrix, m_positions.get_array(),sz);
859 
860  return positions;
861 }
862 
863 void CDynProg::get_path_scores(float64_t** scores, int32_t* seq_len)
864 {
865  ASSERT(scores && seq_len)
866 
867  *seq_len=m_my_scores.get_dim1();
868 
869  int32_t sz = sizeof(float64_t)*(*seq_len);
870 
871  *scores = SG_MALLOC(float64_t, *seq_len);
872  ASSERT(*scores)
873 
874  memcpy(*scores,m_my_scores.get_array(),sz);
875 }
876 
877 void CDynProg::get_path_losses(float64_t** losses, int32_t* seq_len)
878 {
879  ASSERT(losses && seq_len)
880 
881  *seq_len=m_my_losses.get_dim1();
882 
883  int32_t sz = sizeof(float64_t)*(*seq_len);
884 
885  *losses = SG_MALLOC(float64_t, *seq_len);
886  ASSERT(*losses)
887 
888  memcpy(*losses,m_my_losses.get_array(),sz);
889 }
890 
892 
894  int32_t orf_from, int32_t orf_to, int32_t start, int32_t &last_pos,
895  int32_t to)
896 {
897 #ifdef DYNPROG_TIMING_DETAIL
898  MyTime.start() ;
899 #endif
900 
901  if (start<0)
902  start=0 ;
903  if (to<0)
904  to=0 ;
905 
906  int32_t orf_target = orf_to-orf_from ;
907  if (orf_target<0) orf_target+=3 ;
908 
909  int32_t pos ;
910  if (last_pos==to)
911  pos = to-orf_to-3 ;
912  else
913  pos=last_pos ;
914 
915  if (pos<0)
916  {
917 #ifdef DYNPROG_TIMING_DETAIL
918  MyTime.stop() ;
919  orf_time += MyTime.time_diff_sec() ;
920 #endif
921  return true ;
922  }
923 
924  for (; pos>=start; pos-=3)
925  if (m_genestr_stop[pos])
926  {
927 #ifdef DYNPROG_TIMING_DETAIL
928  MyTime.stop() ;
929  orf_time += MyTime.time_diff_sec() ;
930 #endif
931  return false ;
932  }
933 
934 
935  last_pos = CMath::min(pos+3,to-orf_to-3) ;
936 
937 #ifdef DYNPROG_TIMING_DETAIL
938  MyTime.stop() ;
939  orf_time += MyTime.time_diff_sec() ;
940 #endif
941  return true ;
942 }
943 
944 void CDynProg::compute_nbest_paths(int32_t max_num_signals, bool use_orf,
945  int16_t nbest, bool with_loss, bool with_multiple_sequences)
946  {
947 
948  //FIXME we need checks here if all the fields are of right size
949  //SG_PRINT("m_seq_len: %i\n", m_seq_len)
950  //SG_PRINT("m_pos[0]: %i\n", m_pos[0])
951  //SG_PRINT("\n")
952 
953  //FIXME these variables can go away when compute_nbest_paths uses them
954  //instead of the local pointers below
955  const float64_t* seq_array = m_observation_matrix.get_array();
956  m_scores.resize_array(nbest) ;
959 
960  for (int32_t i=0; i<nbest; i++)
961  {
962  m_scores[i]=-1;
963  for (int32_t j=0; j<m_observation_matrix.get_dim2(); j++)
964  {
965  m_states.element(i,j)=-1;
966  m_positions.element(i,j)=-1;
967  }
968  }
969  float64_t* prob_nbest=m_scores.get_array();
970  int32_t* my_state_seq=m_states.get_array();
971  int32_t* my_pos_seq=m_positions.get_array();
972 
973  CPlifBase** Plif_matrix=m_plif_matrices->get_plif_matrix();
974  CPlifBase** Plif_state_signals=m_plif_matrices->get_state_signals();
975  //END FIXME
976 
977 
978 #ifdef DYNPROG_TIMING
979  segment_init_time = 0.0 ;
980  segment_pos_time = 0.0 ;
981  segment_extend_time = 0.0 ;
982  segment_clean_time = 0.0 ;
983  orf_time = 0.0 ;
984  svm_init_time = 0.0 ;
985  svm_pos_time = 0.0 ;
986  svm_clean_time = 0.0 ;
987  inner_loop_time = 0.0 ;
988  content_svm_values_time = 0.0 ;
989  content_plifs_time = 0.0 ;
990  inner_loop_max_time = 0.0 ;
991  long_transition_time = 0.0 ;
992 
993  MyTime2.start() ;
994 #endif
995 
996  if (!m_svm_arrays_clean)
997  {
998  SG_ERROR("SVM arrays not clean")
999  return ;
1000  }
1001 
1002 #ifdef DYNPROG_DEBUG
1003  m_transition_matrix_a.set_array_name("transition_matrix");
1008  //SG_PRINT("use_orf = %i\n", use_orf)
1009 #endif
1010 
1011  int32_t max_look_back = 1000 ;
1012  bool use_svm = false ;
1013 
1014  SG_DEBUG("m_N:%i, m_seq_len:%i, max_num_signals:%i\n",m_N, m_seq_len, max_num_signals)
1015 
1016  //for (int32_t i=0;i<m_N*m_seq_len*max_num_signals;i++)
1017  // SG_PRINT("(%i)%0.2f ",i,seq_array[i])
1018 
1019  CDynamicObjectArray PEN((CSGObject**) Plif_matrix, m_N, m_N, false, false) ; // 2d, CPlifBase*
1020  PEN.set_array_name("PEN");
1021 
1022  CDynamicObjectArray PEN_state_signals((CSGObject**) Plif_state_signals, m_N, max_num_signals, false, false) ; // 2d, CPlifBase*
1023  PEN_state_signals.set_array_name("state_signals");
1024 
1025  CDynamicArray<float64_t> seq(m_N, m_seq_len) ; // 2d
1026  seq.set_array_name("seq") ;
1027  seq.set_const(0) ;
1028 
1029 #ifdef DYNPROG_DEBUG
1030  SG_PRINT("m_num_raw_data: %i\n",m_num_raw_data)
1031  SG_PRINT("m_num_intron_plifs: %i\n", m_num_intron_plifs)
1032  SG_PRINT("m_num_svms: %i\n", m_num_svms)
1033  SG_PRINT("m_num_lin_feat_plifs_cum: ")
1034  for (int i=0; i<=m_num_raw_data; i++)
1036  SG_PRINT("\n")
1037 #endif
1038 
1040  { // initialize svm_svalue
1042  svm_value[s]=0 ;
1043  }
1044 
1045  { // convert seq_input to seq
1046  // this is independent of the svm values
1047 
1048  CDynamicArray<float64_t> *seq_input=NULL ; // 3d
1049  if (seq_array!=NULL)
1050  {
1051  //SG_PRINT("using dense seq_array\n")
1052 
1053  seq_input=new CDynamicArray<float64_t>(seq_array, m_N, m_seq_len, max_num_signals) ;
1054  seq_input->set_array_name("seq_input") ;
1055  //seq_input.display_array() ;
1056 
1057  ASSERT(m_seq_sparse1==NULL)
1058  ASSERT(m_seq_sparse2==NULL)
1059  } else
1060  {
1061  SG_PRINT("using sparse seq_array\n")
1062 
1063  ASSERT(m_seq_sparse1!=NULL)
1064  ASSERT(m_seq_sparse2!=NULL)
1065  ASSERT(max_num_signals==2)
1066  }
1067 
1068  for (int32_t i=0; i<m_N; i++)
1069  for (int32_t j=0; j<m_seq_len; j++)
1070  seq.element(i,j) = 0 ;
1071 
1072  for (int32_t i=0; i<m_N; i++)
1073  for (int32_t j=0; j<m_seq_len; j++)
1074  for (int32_t k=0; k<max_num_signals; k++)
1075  {
1076  if ((PEN_state_signals.element(i,k)==NULL) && (k==0))
1077  {
1078  // no plif
1079  if (seq_input!=NULL)
1080  seq.element(i,j) = seq_input->element(i,j,k) ;
1081  else
1082  {
1083  if (k==0)
1084  seq.element(i,j) = m_seq_sparse1->get_feature(i,j) ;
1085  if (k==1)
1086  seq.element(i,j) = m_seq_sparse2->get_feature(i,j) ;
1087  }
1088  break ;
1089  }
1090  if (PEN_state_signals.element(i,k)!=NULL)
1091  {
1092  if (seq_input!=NULL)
1093  {
1094  // just one plif
1095  if (CMath::is_finite(seq_input->element(i,j,k)))
1096  seq.element(i,j) += ((CPlifBase*) PEN_state_signals.element(i,k))->lookup_penalty(seq_input->element(i,j,k), svm_value) ;
1097  else
1098  // keep infinity values
1099  seq.element(i,j) = seq_input->element(i, j, k) ;
1100  }
1101  else
1102  {
1103  if (k==0)
1104  {
1105  // just one plif
1107  seq.element(i,j) += ((CPlifBase*) PEN_state_signals.element(i,k))->lookup_penalty(m_seq_sparse1->get_feature(i,j), svm_value) ;
1108  else
1109  // keep infinity values
1110  seq.element(i,j) = m_seq_sparse1->get_feature(i, j) ;
1111  }
1112  if (k==1)
1113  {
1114  // just one plif
1116  seq.element(i,j) += ((CPlifBase*) PEN_state_signals.element(i,k))->lookup_penalty(m_seq_sparse2->get_feature(i,j), svm_value) ;
1117  else
1118  // keep infinity values
1119  seq.element(i,j) = m_seq_sparse2->get_feature(i, j) ;
1120  }
1121  }
1122  }
1123  else
1124  break ;
1125  }
1126  delete seq_input;
1127  SG_FREE(svm_value);
1128  }
1129 
1130  // allow longer transitions than look_back
1131  bool long_transitions = m_long_transitions ;
1132  CDynamicArray<int32_t> long_transition_content_start_position(m_N,m_N) ; // 2d
1133  long_transition_content_start_position.set_array_name("long_transition_content_start_position");
1134 #ifdef DYNPROG_DEBUG
1135  CDynamicArray<int32_t> long_transition_content_end_position(m_N,m_N) ; // 2d
1136  long_transition_content_end_position.set_array_name("long_transition_content_end_position");
1137 #endif
1138  CDynamicArray<int32_t> long_transition_content_start(m_N,m_N) ; // 2d
1139  long_transition_content_start.set_array_name("long_transition_content_start");
1140 
1141  CDynamicArray<float64_t> long_transition_content_scores(m_N,m_N) ; // 2d
1142  long_transition_content_scores.set_array_name("long_transition_content_scores");
1143 #ifdef DYNPROG_DEBUG
1144 
1145  CDynamicArray<float64_t> long_transition_content_scores_pen(m_N,m_N) ; // 2d
1146  long_transition_content_scores_pen.set_array_name("long_transition_content_scores_pen");
1147 
1148  CDynamicArray<float64_t> long_transition_content_scores_prev(m_N,m_N) ; // 2d
1149  long_transition_content_scores_prev.set_array_name("long_transition_content_scores_prev");
1150 
1151  CDynamicArray<float64_t> long_transition_content_scores_elem(m_N,m_N) ; // 2d
1152  long_transition_content_scores_elem.set_array_name("long_transition_content_scores_elem");
1153 #endif
1154  CDynamicArray<float64_t> long_transition_content_scores_loss(m_N,m_N) ; // 2d
1155  long_transition_content_scores_loss.set_array_name("long_transition_content_scores_loss");
1156 
1157  if (nbest!=1)
1158  {
1159  SG_ERROR("Long transitions are not supported for nbest!=1")
1160  long_transitions = false ;
1161  }
1162  long_transition_content_scores.set_const(-CMath::INFTY);
1163 #ifdef DYNPROG_DEBUG
1164  long_transition_content_scores_pen.set_const(0) ;
1165  long_transition_content_scores_elem.set_const(0) ;
1166  long_transition_content_scores_prev.set_const(0) ;
1167 #endif
1168  if (with_loss)
1169  long_transition_content_scores_loss.set_const(0) ;
1170  long_transition_content_start.set_const(0) ;
1171  long_transition_content_start_position.set_const(0) ;
1172 #ifdef DYNPROG_DEBUG
1173  long_transition_content_end_position.set_const(0) ;
1174 #endif
1175 
1177  { // initialize svm_svalue
1179  svm_value[s]=0 ;
1180  }
1181 
1182  CDynamicArray<int32_t> look_back(m_N,m_N) ; // 2d
1183  look_back.set_array_name("look_back");
1184  //CDynamicArray<int32_t> look_back_orig(m_N,m_N) ;
1185  //look_back.set_array_name("look_back_orig");
1186 
1187 
1188  { // determine maximal length of look-back
1189  for (int32_t i=0; i<m_N; i++)
1190  for (int32_t j=0; j<m_N; j++)
1191  {
1192  look_back.set_element(INT_MAX, i, j) ;
1193  //look_back_orig.set_element(INT_MAX, i, j) ;
1194  }
1195 
1196  for (int32_t j=0; j<m_N; j++)
1197  {
1198  // only consider transitions that are actually allowed
1199  const T_STATES num_elem = trans_list_forward_cnt[j] ;
1200  const T_STATES *elem_list = trans_list_forward[j] ;
1201 
1202  for (int32_t i=0; i<num_elem; i++)
1203  {
1204  T_STATES ii = elem_list[i] ;
1205 
1206  CPlifBase *penij=(CPlifBase*) PEN.element(j, ii) ;
1207  if (penij==NULL)
1208  {
1209  if (long_transitions)
1210  {
1211  look_back.set_element(m_long_transition_threshold, j, ii) ;
1212  //look_back_orig.set_element(m_long_transition_max, j, ii) ;
1213  }
1214  continue ;
1215  }
1216 
1217  /* if the transition is an ORF or we do computation with loss, we have to disable long transitions */
1218  if ((m_orf_info.element(ii,0)!=-1) || (m_orf_info.element(j,1)!=-1) || (!long_transitions))
1219  {
1220  look_back.set_element(CMath::ceil(penij->get_max_value()), j, ii) ;
1221  //look_back_orig.set_element(CMath::ceil(penij->get_max_value()), j, ii) ;
1222  if (CMath::ceil(penij->get_max_value()) > max_look_back)
1223  {
1224  SG_DEBUG("%d %d -> value: %f\n", ii,j,penij->get_max_value())
1225  max_look_back = (int32_t) (CMath::ceil(penij->get_max_value()));
1226  }
1227  }
1228  else
1229  {
1230  look_back.set_element(CMath::min( (int32_t)CMath::ceil(penij->get_max_value()), m_long_transition_threshold ), j, ii) ;
1231  //look_back_orig.set_element( (int32_t)CMath::ceil(penij->get_max_value()), j, ii) ;
1232  }
1233 
1234  if (penij->uses_svm_values())
1235  use_svm=true ;
1236  }
1237  }
1238  /* make sure max_look_back is at least as long as a long transition */
1239  if (long_transitions)
1240  max_look_back = CMath::max(m_long_transition_threshold, max_look_back) ;
1241 
1242  /* make sure max_look_back is not longer than the whole string */
1243  max_look_back = CMath::min(m_genestr.get_dim1(), max_look_back) ;
1244 
1245  int32_t num_long_transitions = 0 ;
1246  for (int32_t i=0; i<m_N; i++)
1247  for (int32_t j=0; j<m_N; j++)
1248  {
1249  if (look_back.get_element(i,j)==m_long_transition_threshold)
1250  num_long_transitions++ ;
1251  if (look_back.get_element(i,j)==INT_MAX)
1252  {
1253  if (long_transitions)
1254  {
1255  look_back.set_element(m_long_transition_threshold, i, j) ;
1256  //look_back_orig.set_element(m_long_transition_max, i, j) ;
1257  }
1258  else
1259  {
1260  look_back.set_element(max_look_back, i, j) ;
1261  //look_back_orig.set_element(m_long_transition_max, i, j) ;
1262  }
1263  }
1264  }
1265  SG_DEBUG("Using %i long transitions\n", num_long_transitions)
1266  }
1267  //SG_PRINT("max_look_back: %i \n", max_look_back)
1268 
1269  //SG_PRINT("use_svm=%i, genestr_len: \n", use_svm, m_genestr.get_dim1())
1270  SG_DEBUG("use_svm=%i\n", use_svm)
1271 
1272  SG_DEBUG("maxlook: %d m_N: %d nbest: %d \n", max_look_back, m_N, nbest)
1273  const int32_t look_back_buflen = (max_look_back*m_N+1)*nbest ;
1274  SG_DEBUG("look_back_buflen=%i\n", look_back_buflen)
1275  /*const float64_t mem_use = (float64_t)(m_seq_len*m_N*nbest*(sizeof(T_STATES)+sizeof(int16_t)+sizeof(int32_t))+
1276  look_back_buflen*(2*sizeof(float64_t)+sizeof(int32_t))+
1277  m_seq_len*(sizeof(T_STATES)+sizeof(int32_t))+
1278  m_genestr.get_dim1()*sizeof(bool))/(1024*1024);*/
1279 
1280  //bool is_big = (mem_use>200) || (m_seq_len>5000) ;
1281 
1282  /*if (is_big)
1283  {
1284  SG_DEBUG("calling compute_nbest_paths: m_seq_len=%i, m_N=%i, lookback=%i nbest=%i\n",
1285  m_seq_len, m_N, max_look_back, nbest) ;
1286  SG_DEBUG("allocating %1.2fMB of memory\n",
1287  mem_use) ;
1288  }*/
1289  ASSERT(nbest<32000)
1290 
1291  CDynamicArray<float64_t> delta(m_seq_len, m_N, nbest) ; // 3d
1292  delta.set_array_name("delta");
1293  float64_t* delta_array = delta.get_array() ;
1294  //delta.set_const(0) ;
1295 
1296  CDynamicArray<T_STATES> psi(m_seq_len, m_N, nbest) ; // 3d
1297  psi.set_array_name("psi");
1298  //psi.set_const(0) ;
1299 
1300  CDynamicArray<int16_t> ktable(m_seq_len, m_N, nbest) ; // 3d
1301  ktable.set_array_name("ktable");
1302  //ktable.set_const(0) ;
1303 
1304  CDynamicArray<int32_t> ptable(m_seq_len, m_N, nbest) ; // 3d
1305  ptable.set_array_name("ptable");
1306  //ptable.set_const(0) ;
1307 
1308  CDynamicArray<float64_t> delta_end(nbest) ;
1309  delta_end.set_array_name("delta_end");
1310  //delta_end.set_const(0) ;
1311 
1312  CDynamicArray<T_STATES> path_ends(nbest) ;
1313  path_ends.set_array_name("path_ends");
1314  //path_ends.set_const(0) ;
1315 
1316  CDynamicArray<int16_t> ktable_end(nbest) ;
1317  ktable_end.set_array_name("ktable_end");
1318  //ktable_end.set_const(0) ;
1319 
1320  float64_t * fixedtempvv=SG_MALLOC(float64_t, look_back_buflen);
1321  memset(fixedtempvv, 0, look_back_buflen*sizeof(float64_t)) ;
1322  int32_t * fixedtempii=SG_MALLOC(int32_t, look_back_buflen);
1323  memset(fixedtempii, 0, look_back_buflen*sizeof(int32_t)) ;
1324 
1325  CDynamicArray<float64_t> oldtempvv(look_back_buflen) ;
1326  oldtempvv.set_array_name("oldtempvv");
1327 
1328  CDynamicArray<float64_t> oldtempvv2(look_back_buflen) ;
1329  oldtempvv2.set_array_name("oldtempvv2");
1330  //oldtempvv.set_const(0) ;
1331  //oldtempvv.display_size() ;
1332 
1333  CDynamicArray<int32_t> oldtempii(look_back_buflen) ;
1334  oldtempii.set_array_name("oldtempii");
1335 
1336  CDynamicArray<int32_t> oldtempii2(look_back_buflen) ;
1337  oldtempii2.set_array_name("oldtempii2");
1338  //oldtempii.set_const(0) ;
1339 
1340  CDynamicArray<T_STATES> state_seq(m_seq_len) ;
1341  state_seq.set_array_name("state_seq");
1342  //state_seq.set_const(0) ;
1343 
1345  pos_seq.set_array_name("pos_seq");
1346  //pos_seq.set_const(0) ;
1347 
1348 
1349  m_dict_weights.set_array_name("dict_weights") ;
1350  m_word_degree.set_array_name("word_degree") ;
1351  m_cum_num_words.set_array_name("cum_num_words") ;
1352  m_num_words.set_array_name("num_words") ;
1353  //word_used.set_array_name("word_used") ;
1354  //svm_values_unnormalized.set_array_name("svm_values_unnormalized") ;
1355  //m_svm_pos_start.set_array_name("svm_pos_start") ;
1356  m_num_unique_words.set_array_name("num_unique_words") ;
1357 
1358  PEN.set_array_name("PEN") ;
1359  seq.set_array_name("seq") ;
1360 
1361  delta.set_array_name("delta") ;
1362  psi.set_array_name("psi") ;
1363  ktable.set_array_name("ktable") ;
1364  ptable.set_array_name("ptable") ;
1365  delta_end.set_array_name("delta_end") ;
1366  path_ends.set_array_name("path_ends") ;
1367  ktable_end.set_array_name("ktable_end") ;
1368 
1369 #ifdef USE_TMP_ARRAYCLASS
1370  fixedtempvv.set_array_name("fixedtempvv") ;
1371  fixedtempii.set_array_name("fixedtempvv") ;
1372 #endif
1373 
1374  oldtempvv.set_array_name("oldtempvv") ;
1375  oldtempvv2.set_array_name("oldtempvv2") ;
1376  oldtempii.set_array_name("oldtempii") ;
1377  oldtempii2.set_array_name("oldtempii2") ;
1378 
1379 
1381 
1382 #ifdef DYNPROG_DEBUG
1383  state_seq.display_size() ;
1384  pos_seq.display_size() ;
1385 
1390  //word_used.display_size() ;
1391  //svm_values_unnormalized.display_size() ;
1392  //m_svm_pos_start.display_array() ;
1394 
1395  PEN.display_size() ;
1396  PEN_state_signals.display_size() ;
1397  seq.display_size() ;
1399 
1400  //m_genestr_stop.display_size() ;
1401  delta.display_size() ;
1402  psi.display_size() ;
1403  ktable.display_size() ;
1404  ptable.display_size() ;
1405  delta_end.display_size() ;
1406  path_ends.display_size() ;
1407  ktable_end.display_size() ;
1408 
1409 #ifdef USE_TMP_ARRAYCLASS
1410  fixedtempvv.display_size() ;
1411  fixedtempii.display_size() ;
1412 #endif
1413 
1414  //oldtempvv.display_size() ;
1415  //oldtempii.display_size() ;
1416 
1417  state_seq.display_size() ;
1418  pos_seq.display_size() ;
1419 
1420  //seq.set_const(0) ;
1421 
1422 #endif //DYNPROG_DEBUG
1423 
1425 
1426 
1427 
1428  {
1429  for (int32_t s=0; s<m_num_svms; s++)
1431  }
1432 
1433 
1434  //CDynamicArray<int32_t*> trans_matrix_svms(m_N,m_N); // 2d
1435  //CDynamicArray<int32_t> trans_matrix_num_svms(m_N,m_N); // 2d
1436 
1437  { // initialization
1438 
1439  for (T_STATES i=0; i<m_N; i++)
1440  {
1441  //delta.element(0, i, 0) = get_p(i) + seq.element(i,0) ; // get_p defined in HMM.h to be equiv to initial_state_distribution
1442  delta.element(delta_array, 0, i, 0, m_seq_len, m_N) = get_p(i) + seq.element(i,0) ; // get_p defined in HMM.h to be equiv to initial_state_distribution
1443  psi.element(0,i,0) = 0 ;
1444  if (nbest>1)
1445  ktable.element(0,i,0) = 0 ;
1446  ptable.element(0,i,0) = 0 ;
1447  for (int16_t k=1; k<nbest; k++)
1448  {
1449  int32_t dim1, dim2, dim3 ;
1450  delta.get_array_size(dim1, dim2, dim3) ;
1451  //SG_DEBUG("i=%i, k=%i -- %i, %i, %i\n", i, k, dim1, dim2, dim3)
1452  //delta.element(0, i, k) = -CMath::INFTY ;
1453  delta.element(delta_array, 0, i, k, m_seq_len, m_N) = -CMath::INFTY ;
1454  psi.element(0,i,0) = 0 ; // <--- what's this for?
1455  if (nbest>1)
1456  ktable.element(0,i,k) = 0 ;
1457  ptable.element(0,i,k) = 0 ;
1458  }
1459  /*
1460  for (T_STATES j=0; j<m_N; j++)
1461  {
1462  CPlifBase * penalty = PEN.element(i,j) ;
1463  int32_t num_current_svms=0;
1464  int32_t svm_ids[] = {-8, -7, -6, -5, -4, -3, -2, -1};
1465  if (penalty)
1466  {
1467  SG_PRINT("trans %i -> %i \n",i,j)
1468  penalty->get_used_svms(&num_current_svms, svm_ids);
1469  trans_matrix_svms.set_element(svm_ids,i,j);
1470  for (int32_t l=0;l<num_current_svms;l++)
1471  SG_PRINT("svm_ids[%i]: %i \n",l,svm_ids[l])
1472  trans_matrix_num_svms.set_element(num_current_svms,i,j);
1473  }
1474  }
1475  */
1476 
1477  }
1478  }
1479 
1480  SG_DEBUG("START_RECURSION \n\n")
1481 
1482  // recursion
1483  for (int32_t t=1; t<m_seq_len; t++)
1484  {
1485  //if (is_big && t%(1+(m_seq_len/1000))==1)
1486  // SG_PROGRESS(t, 0, m_seq_len)
1487  //SG_PRINT("%i\n", t)
1488 
1489  for (T_STATES j=0; j<m_N; j++)
1490  {
1491  if (seq.element(j,t)<=-1e20)
1492  { // if we cannot observe the symbol here, then we can omit the rest
1493  for (int16_t k=0; k<nbest; k++)
1494  {
1495  delta.element(delta_array, t, j, k, m_seq_len, m_N) = seq.element(j,t) ;
1496  psi.element(t,j,k) = 0 ;
1497  if (nbest>1)
1498  ktable.element(t,j,k) = 0 ;
1499  ptable.element(t,j,k) = 0 ;
1500  }
1501  }
1502  else
1503  {
1504  const T_STATES num_elem = trans_list_forward_cnt[j] ;
1505  const T_STATES *elem_list = trans_list_forward[j] ;
1506  const float64_t *elem_val = trans_list_forward_val[j] ;
1507  const int32_t *elem_id = trans_list_forward_id[j] ;
1508 
1509  int32_t fixed_list_len = 0 ;
1510  float64_t fixedtempvv_ = CMath::INFTY ;
1511  int32_t fixedtempii_ = 0 ;
1512  bool fixedtemplong = false ;
1513 
1514  for (int32_t i=0; i<num_elem; i++)
1515  {
1516  T_STATES ii = elem_list[i] ;
1517 
1518  const CPlifBase* penalty = (CPlifBase*) PEN.element(j,ii) ;
1519 
1520  /*int32_t look_back = max_look_back ;
1521  if (0)
1522  { // find lookback length
1523  CPlifBase *pen = (CPlifBase*) penalty ;
1524  if (pen!=NULL)
1525  look_back=(int32_t) (CMath::ceil(pen->get_max_value()));
1526  if (look_back>=1e6)
1527  SG_PRINT("%i,%i -> %d from %ld\n", j, ii, look_back, (long)pen)
1528  ASSERT(look_back<1e6)
1529  } */
1530 
1531  int32_t look_back_ = look_back.element(j, ii) ;
1532 
1533  int32_t orf_from = m_orf_info.element(ii,0) ;
1534  int32_t orf_to = m_orf_info.element(j,1) ;
1535  if((orf_from!=-1)!=(orf_to!=-1))
1536  SG_DEBUG("j=%i ii=%i orf_from=%i orf_to=%i p=%1.2f\n", j, ii, orf_from, orf_to, elem_val[i])
1537  ASSERT((orf_from!=-1)==(orf_to!=-1))
1538 
1539  int32_t orf_target = -1 ;
1540  if (orf_from!=-1)
1541  {
1542  orf_target=orf_to-orf_from ;
1543  if (orf_target<0)
1544  orf_target+=3 ;
1545  ASSERT(orf_target>=0 && orf_target<3)
1546  }
1547 
1548  int32_t orf_last_pos = m_pos[t] ;
1549 #ifdef DYNPROG_TIMING
1550  MyTime3.start() ;
1551 #endif
1552  int32_t num_ok_pos = 0 ;
1553 
1554  for (int32_t ts=t-1; ts>=0 && m_pos[t]-m_pos[ts]<=look_back_; ts--)
1555  {
1556  bool ok ;
1557  //int32_t plen=t-ts;
1558 
1559  /*for (int32_t s=0; s<m_num_svms; s++)
1560  if ((fabs(svs.svm_values[s*svs.seqlen+plen]-svs2.svm_values[s*svs.seqlen+plen])>1e-6) ||
1561  (fabs(svs.svm_values[s*svs.seqlen+plen]-svs3.svm_values[s*svs.seqlen+plen])>1e-6))
1562  {
1563  SG_DEBUG("s=%i, t=%i, ts=%i, %1.5e, %1.5e, %1.5e\n", s, t, ts, svs.svm_values[s*svs.seqlen+plen], svs2.svm_values[s*svs.seqlen+plen], svs3.svm_values[s*svs.seqlen+plen])
1564  }*/
1565 
1566  if (orf_target==-1)
1567  ok=true ;
1568  else if (m_pos[ts]!=-1 && (m_pos[t]-m_pos[ts])%3==orf_target)
1569  ok=(!use_orf) || extend_orf(orf_from, orf_to, m_pos[ts], orf_last_pos, m_pos[t]) ;
1570  else
1571  ok=false ;
1572 
1573  if (ok)
1574  {
1575 
1576  float64_t segment_loss = 0.0 ;
1577  if (with_loss)
1578  {
1579  segment_loss = m_seg_loss_obj->get_segment_loss(ts, t, elem_id[i]);
1580  //if (segment_loss!=segment_loss2)
1581  //SG_PRINT("segment_loss:%f segment_loss2:%f\n", segment_loss, segment_loss2)
1582  }
1584  // BEST_PATH_TRANS
1586 
1587  int32_t frame = orf_from;//m_orf_info.element(ii,0);
1588  lookup_content_svm_values(ts, t, m_pos[ts], m_pos[t], svm_value, frame);
1589 
1590  float64_t pen_val = 0.0 ;
1591  if (penalty)
1592  {
1593 #ifdef DYNPROG_TIMING_DETAIL
1594  MyTime.start() ;
1595 #endif
1596  pen_val = penalty->lookup_penalty(m_pos[t]-m_pos[ts], svm_value) ;
1597 
1598 #ifdef DYNPROG_TIMING_DETAIL
1599  MyTime.stop() ;
1600  content_plifs_time += MyTime.time_diff_sec() ;
1601 #endif
1602  }
1603 
1604 #ifdef DYNPROG_TIMING_DETAIL
1605  MyTime.start() ;
1606 #endif
1607  num_ok_pos++ ;
1608 
1609  if (nbest==1)
1610  {
1611  float64_t val = elem_val[i] + pen_val ;
1612  if (with_loss)
1613  val += segment_loss ;
1614 
1615  float64_t mval = -(val + delta.element(delta_array, ts, ii, 0, m_seq_len, m_N)) ;
1616 
1617  if (mval<fixedtempvv_)
1618  {
1619  fixedtempvv_ = mval ;
1620  fixedtempii_ = ii + ts*m_N;
1621  fixed_list_len = 1 ;
1622  fixedtemplong = false ;
1623  }
1624  }
1625  else
1626  {
1627  for (int16_t diff=0; diff<nbest; diff++)
1628  {
1629  float64_t val = elem_val[i] ;
1630  val += pen_val ;
1631  if (with_loss)
1632  val += segment_loss ;
1633 
1634  float64_t mval = -(val + delta.element(delta_array, ts, ii, diff, m_seq_len, m_N)) ;
1635 
1636  /* only place -val in fixedtempvv if it is one of the nbest lowest values in there */
1637  /* fixedtempvv[i], i=0:nbest-1, is sorted so that fixedtempvv[0] <= fixedtempvv[1] <= ...*/
1638  /* fixed_list_len has the number of elements in fixedtempvv */
1639 
1640  if ((fixed_list_len < nbest) || ((0==fixed_list_len) || (mval < fixedtempvv[fixed_list_len-1])))
1641  {
1642  if ( (fixed_list_len<nbest) && ((0==fixed_list_len) || (mval>fixedtempvv[fixed_list_len-1])) )
1643  {
1644  fixedtempvv[fixed_list_len] = mval ;
1645  fixedtempii[fixed_list_len] = ii + diff*m_N + ts*m_N*nbest;
1646  fixed_list_len++ ;
1647  }
1648  else // must have mval < fixedtempvv[fixed_list_len-1]
1649  {
1650  int32_t addhere = fixed_list_len;
1651  while ((addhere > 0) && (mval < fixedtempvv[addhere-1]))
1652  addhere--;
1653 
1654  // move everything from addhere+1 one forward
1655  for (int32_t jj=fixed_list_len-1; jj>addhere; jj--)
1656  {
1657  fixedtempvv[jj] = fixedtempvv[jj-1];
1658  fixedtempii[jj] = fixedtempii[jj-1];
1659  }
1660 
1661  fixedtempvv[addhere] = mval;
1662  fixedtempii[addhere] = ii + diff*m_N + ts*m_N*nbest;
1663 
1664  if (fixed_list_len < nbest)
1665  fixed_list_len++;
1666  }
1667  }
1668  }
1669  }
1670 #ifdef DYNPROG_TIMING_DETAIL
1671  MyTime.stop() ;
1672  inner_loop_max_time += MyTime.time_diff_sec() ;
1673 #endif
1674  }
1675  }
1676 #ifdef DYNPROG_TIMING
1677  MyTime3.stop() ;
1678  inner_loop_time += MyTime3.time_diff_sec() ;
1679 #endif
1680  }
1681  for (int32_t i=0; i<num_elem; i++)
1682  {
1683  T_STATES ii = elem_list[i] ;
1684 
1685  const CPlifBase* penalty = (CPlifBase*) PEN.element(j,ii) ;
1686 
1687  /*int32_t look_back = max_look_back ;
1688  if (0)
1689  { // find lookback length
1690  CPlifBase *pen = (CPlifBase*) penalty ;
1691  if (pen!=NULL)
1692  look_back=(int32_t) (CMath::ceil(pen->get_max_value()));
1693  if (look_back>=1e6)
1694  SG_PRINT("%i,%i -> %d from %ld\n", j, ii, look_back, (long)pen)
1695  ASSERT(look_back<1e6)
1696  } */
1697 
1698  int32_t look_back_ = look_back.element(j, ii) ;
1699  //int32_t look_back_orig_ = look_back_orig.element(j, ii) ;
1700 
1701  int32_t orf_from = m_orf_info.element(ii,0) ;
1702  int32_t orf_to = m_orf_info.element(j,1) ;
1703  if((orf_from!=-1)!=(orf_to!=-1))
1704  SG_DEBUG("j=%i ii=%i orf_from=%i orf_to=%i p=%1.2f\n", j, ii, orf_from, orf_to, elem_val[i])
1705  ASSERT((orf_from!=-1)==(orf_to!=-1))
1706 
1707  int32_t orf_target = -1 ;
1708  if (orf_from!=-1)
1709  {
1710  orf_target=orf_to-orf_from ;
1711  if (orf_target<0)
1712  orf_target+=3 ;
1713  ASSERT(orf_target>=0 && orf_target<3)
1714  }
1715 
1716  //int32_t loss_last_pos = t ;
1717  //float64_t last_loss = 0.0 ;
1718 
1719 #ifdef DYNPROG_TIMING
1720  MyTime3.start() ;
1721 #endif
1722 
1723  /* long transition stuff */
1724  /* only do this, if
1725  * this feature is enabled
1726  * this is not a transition with ORF restrictions
1727  * the loss is switched off
1728  * nbest=1
1729  */
1730 #ifdef DYNPROG_TIMING
1731  MyTime3.start() ;
1732 #endif
1733  // long transitions, only when not considering ORFs
1734  if ( long_transitions && orf_target==-1 && look_back_ == m_long_transition_threshold )
1735  {
1736 
1737  // update table for 5' part of the long segment
1738 
1739  int32_t start = long_transition_content_start.get_element(ii, j) ;
1740  int32_t end_5p_part = start ;
1741  for (int32_t start_5p_part=start; m_pos[t]-m_pos[start_5p_part] > m_long_transition_threshold ; start_5p_part++)
1742  {
1743  // find end_5p_part, which is greater than start_5p_part and at least m_long_transition_threshold away
1744  while (end_5p_part<=t && m_pos[end_5p_part+1]-m_pos[start_5p_part]<=m_long_transition_threshold)
1745  end_5p_part++ ;
1746 
1747  ASSERT(m_pos[end_5p_part+1]-m_pos[start_5p_part] > m_long_transition_threshold || end_5p_part==t)
1748  ASSERT(m_pos[end_5p_part]-m_pos[start_5p_part] <= m_long_transition_threshold)
1749 
1750  float64_t pen_val = 0.0;
1751  /* recompute penalty, if necessary */
1752  if (penalty)
1753  {
1754  int32_t frame = m_orf_info.element(ii,0);
1755  lookup_content_svm_values(start_5p_part, end_5p_part, m_pos[start_5p_part], m_pos[end_5p_part], svm_value, frame); // * t -> end_5p_part
1756  pen_val = penalty->lookup_penalty(m_pos[end_5p_part]-m_pos[start_5p_part], svm_value) ;
1757  }
1758 
1759  /*if (m_pos[start_5p_part]==1003)
1760  {
1761  SG_PRINT("Part1: %i - %i vs %i - %i\n", m_pos[t], m_pos[ts], m_pos[end_5p_part], m_pos[start_5p_part])
1762  SG_PRINT("Part1: ts=%i t=%i start_5p_part=%i m_seq_len=%i\n", m_pos[ts], m_pos[t], m_pos[start_5p_part], m_seq_len)
1763  }*/
1764 
1765  float64_t mval_trans = -( elem_val[i] + pen_val*0.5 + delta.element(delta_array, start_5p_part, ii, 0, m_seq_len, m_N) ) ;
1766  //float64_t mval_trans = -( elem_val[i] + delta.element(delta_array, ts, ii, 0, m_seq_len, m_N) ) ; // enable this for the incomplete extra check
1767 
1768  float64_t segment_loss_part1=0.0 ;
1769  if (with_loss)
1770  { // this is the loss from the start of the long segment (5' part + middle section)
1771 
1772  segment_loss_part1 = m_seg_loss_obj->get_segment_loss(start_5p_part /*long_transition_content_start_position.get_element(ii,j)*/, end_5p_part, elem_id[i]); // * unsure
1773 
1774  mval_trans -= segment_loss_part1 ;
1775  }
1776 
1777 
1778  if (0)//m_pos[end_5p_part] - m_pos[long_transition_content_start_position.get_element(ii, j)] > look_back_orig_/*m_long_transition_max*/)
1779  {
1780  // this restricts the maximal length of segments,
1781  // but the current implementation is not valid since the
1782  // long transition is discarded without loocking if there
1783  // is a second best long transition in between
1784  long_transition_content_scores.set_element(-CMath::INFTY, ii, j) ;
1785  long_transition_content_start_position.set_element(0, ii, j) ;
1786  if (with_loss)
1787  long_transition_content_scores_loss.set_element(0.0, ii, j) ;
1788 #ifdef DYNPROG_DEBUG
1789  long_transition_content_scores_pen.set_element(0.0, ii, j) ;
1790  long_transition_content_scores_elem.set_element(0.0, ii, j) ;
1791  long_transition_content_scores_prev.set_element(0.0, ii, j) ;
1792  long_transition_content_end_position.set_element(0, ii, j) ;
1793 #endif
1794  }
1795  if (with_loss)
1796  {
1797  float64_t old_loss = long_transition_content_scores_loss.get_element(ii, j) ;
1798  float64_t new_loss = m_seg_loss_obj->get_segment_loss(long_transition_content_start_position.get_element(ii,j), end_5p_part, elem_id[i]);
1799  float64_t score = long_transition_content_scores.get_element(ii, j) - old_loss + new_loss ;
1800  long_transition_content_scores.set_element(score, ii, j) ;
1801  long_transition_content_scores_loss.set_element(new_loss, ii, j) ;
1802 #ifdef DYNPROG_DEBUG
1803  long_transition_content_end_position.set_element(end_5p_part, ii, j) ;
1804 #endif
1805 
1806  }
1807  if (-long_transition_content_scores.get_element(ii, j) > mval_trans )
1808  {
1809  /* then the old long transition is either too far away or worse than the current one */
1810  long_transition_content_scores.set_element(-mval_trans, ii, j) ;
1811  long_transition_content_start_position.set_element(start_5p_part, ii, j) ;
1812  if (with_loss)
1813  long_transition_content_scores_loss.set_element(segment_loss_part1, ii, j) ;
1814 #ifdef DYNPROG_DEBUG
1815  long_transition_content_scores_pen.set_element(pen_val*0.5, ii, j) ;
1816  long_transition_content_scores_elem.set_element(elem_val[i], ii, j) ;
1817  long_transition_content_scores_prev.set_element(delta.element(delta_array, start_5p_part, ii, 0, m_seq_len, m_N), ii, j) ;
1818  /*ASSERT(fabs(long_transition_content_scores.get_element(ii, j)-(long_transition_content_scores_pen.get_element(ii, j) +
1819  long_transition_content_scores_elem.get_element(ii, j) +
1820  long_transition_content_scores_prev.get_element(ii, j)))<1e-6) ;*/
1821  long_transition_content_end_position.set_element(end_5p_part, ii, j) ;
1822 #endif
1823  }
1824  //
1825  // this sets the position where the search for better 5'parts is started the next time
1826  // whithout this the prediction takes ages
1827  //
1828  long_transition_content_start.set_element(start_5p_part, ii, j) ;
1829  }
1830 
1831  // consider the 3' part at the end of the long segment:
1832  // * with length = m_long_transition_threshold
1833  // * content prediction and loss only for this part
1834 
1835  // find ts > 0 with distance from m_pos[t] greater m_long_transition_threshold
1836  // precompute: only depends on t
1837  int ts = t;
1838  while (ts>0 && m_pos[t]-m_pos[ts-1] <= m_long_transition_threshold)
1839  ts-- ;
1840 
1841  if (ts>0)
1842  {
1843  ASSERT((m_pos[t]-m_pos[ts-1] > m_long_transition_threshold) && (m_pos[t]-m_pos[ts] <= m_long_transition_threshold))
1844 
1845 
1846  /* only consider this transition, if the right position was found */
1847  float pen_val_3p = 0.0 ;
1848  if (penalty)
1849  {
1850  int32_t frame = orf_from ; //m_orf_info.element(ii, 0);
1851  lookup_content_svm_values(ts, t, m_pos[ts], m_pos[t], svm_value, frame);
1852  pen_val_3p = penalty->lookup_penalty(m_pos[t]-m_pos[ts], svm_value) ;
1853  }
1854 
1855  float64_t mval = -(long_transition_content_scores.get_element(ii, j) + pen_val_3p*0.5) ;
1856 
1857  {
1858 #ifdef DYNPROG_DEBUG
1859  float64_t segment_loss_part2=0.0 ;
1860  float64_t segment_loss_part1=0.0 ;
1861 #endif
1862  float64_t segment_loss_total=0.0 ;
1863 
1864  if (with_loss)
1865  { // this is the loss for the 3' end fragment of the segment
1866  // (the 5' end and the middle section loss is already contained in mval)
1867 
1868 #ifdef DYNPROG_DEBUG
1869  // this is an alternative, which should be identical, if the loss is additive
1870  segment_loss_part2 = m_seg_loss_obj->get_segment_loss_extend(long_transition_content_end_position.get_element(ii,j), t, elem_id[i]);
1871  //mval -= segment_loss_part2 ;
1872  segment_loss_part1 = m_seg_loss_obj->get_segment_loss(long_transition_content_start_position.get_element(ii,j), long_transition_content_end_position.get_element(ii,j), elem_id[i]);
1873 #endif
1874  segment_loss_total = m_seg_loss_obj->get_segment_loss(long_transition_content_start_position.get_element(ii,j), t, elem_id[i]);
1875  mval -= (segment_loss_total-long_transition_content_scores_loss.get_element(ii, j)) ;
1876  }
1877 
1878 #ifdef DYNPROG_DEBUG
1879  if (m_pos[t]==10108 ||m_pos[t]==12802 ||m_pos[t]== 12561)
1880  {
1881  SG_PRINT("Part2: %i,%i,%i: val=%1.6f pen_val_3p*0.5=%1.6f (t=%i, ts=%i, ts-1=%i, ts+1=%i) scores=%1.6f (pen=%1.6f,prev=%1.6f,elem=%1.6f,loss=%1.1f), positions=%i,%i,%i, loss=%1.1f/%1.1f (%i,%i)\n",
1882  m_pos[t], j, ii, -mval, 0.5*pen_val_3p, m_pos[t], m_pos[ts], m_pos[ts-1], m_pos[ts+1],
1883  long_transition_content_scores.get_element(ii, j),
1884  long_transition_content_scores_pen.get_element(ii, j),
1885  long_transition_content_scores_prev.get_element(ii, j),
1886  long_transition_content_scores_elem.get_element(ii, j),
1887  long_transition_content_scores_loss.get_element(ii, j),
1888  m_pos[long_transition_content_start_position.get_element(ii,j)],
1889  m_pos[long_transition_content_end_position.get_element(ii,j)],
1890  m_pos[long_transition_content_start.get_element(ii,j)], segment_loss_part2, segment_loss_total, long_transition_content_start_position.get_element(ii,j), t) ;
1891  SG_PRINT("fixedtempvv_: %1.6f, from_state:%i from_pos:%i\n ",-fixedtempvv_, (fixedtempii_%m_N), m_pos[(fixedtempii_-(fixedtempii_%(m_N*nbest)))/(m_N*nbest)] )
1892  }
1893 
1894  if (fabs(segment_loss_part2+long_transition_content_scores_loss.get_element(ii, j) - segment_loss_total)>1e-3)
1895  {
1896  SG_ERROR("LOSS: total=%1.1f (%i-%i) part1=%1.1f/%1.1f (%i-%i) part2=%1.1f (%i-%i) sum=%1.1f diff=%1.1f\n",
1897  segment_loss_total, m_pos[long_transition_content_start_position.get_element(ii,j)], m_pos[t],
1898  long_transition_content_scores_loss.get_element(ii, j), segment_loss_part1, m_pos[long_transition_content_start_position.get_element(ii,j)], m_pos[long_transition_content_end_position.get_element(ii,j)],
1899  segment_loss_part2, m_pos[long_transition_content_end_position.get_element(ii,j)], m_pos[t],
1900  segment_loss_part2+long_transition_content_scores_loss.get_element(ii, j),
1901  segment_loss_part2+long_transition_content_scores_loss.get_element(ii, j) - segment_loss_total) ;
1902  }
1903 #endif
1904  }
1905 
1906  // prefer simpler version to guarantee optimality
1907  //
1908  // original:
1909  /* if ((mval < fixedtempvv_) &&
1910  (m_pos[t] - m_pos[long_transition_content_start_position.get_element(ii, j)])<=look_back_orig_) */
1911  if (mval < fixedtempvv_)
1912  {
1913  /* then the long transition is better than the short one => replace it */
1914  int32_t fromtjk = fixedtempii_ ;
1915  /*SG_PRINT("%i,%i: Long transition (%1.5f=-(%1.5f+%1.5f+%1.5f+%1.5f), %i) to m_pos %i better than short transition (%1.5f,%i) to m_pos %i \n",
1916  m_pos[t], j,
1917  mval, pen_val_3p*0.5, long_transition_content_scores_pen.get_element(ii, j), long_transition_content_scores_elem.get_element(ii, j), long_transition_content_scores_prev.get_element(ii, j), ii,
1918  m_pos[long_transition_content_position.get_element(ii, j)],
1919  fixedtempvv_, (fromtjk%m_N), m_pos[(fromtjk-(fromtjk%(m_N*nbest)))/(m_N*nbest)]) ;*/
1920  ASSERT((fromtjk-(fromtjk%(m_N*nbest)))/(m_N*nbest)==0 || m_pos[(fromtjk-(fromtjk%(m_N*nbest)))/(m_N*nbest)]>=m_pos[long_transition_content_start_position.get_element(ii, j)] || fixedtemplong)
1921 
1922  fixedtempvv_ = mval ;
1923  fixedtempii_ = ii + m_N*long_transition_content_start_position.get_element(ii, j) ;
1924  fixed_list_len = 1 ;
1925  fixedtemplong = true ;
1926  }
1927  }
1928  }
1929  }
1930 #ifdef DYNPROG_TIMING
1931  MyTime3.stop() ;
1932  long_transition_time += MyTime3.time_diff_sec() ;
1933 #endif
1934 
1935 
1936  int32_t numEnt = fixed_list_len;
1937 
1938  float64_t minusscore;
1939  int64_t fromtjk;
1940 
1941  for (int16_t k=0; k<nbest; k++)
1942  {
1943  if (k<numEnt)
1944  {
1945  if (nbest==1)
1946  {
1947  minusscore = fixedtempvv_ ;
1948  fromtjk = fixedtempii_ ;
1949  }
1950  else
1951  {
1952  minusscore = fixedtempvv[k];
1953  fromtjk = fixedtempii[k];
1954  }
1955 
1956  delta.element(delta_array, t, j, k, m_seq_len, m_N) = -minusscore + seq.element(j,t);
1957  psi.element(t,j,k) = (fromtjk%m_N) ;
1958  if (nbest>1)
1959  ktable.element(t,j,k) = (fromtjk%(m_N*nbest)-psi.element(t,j,k))/m_N ;
1960  ptable.element(t,j,k) = (fromtjk-(fromtjk%(m_N*nbest)))/(m_N*nbest) ;
1961  }
1962  else
1963  {
1964  delta.element(delta_array, t, j, k, m_seq_len, m_N) = -CMath::INFTY ;
1965  psi.element(t,j,k) = 0 ;
1966  if (nbest>1)
1967  ktable.element(t,j,k) = 0 ;
1968  ptable.element(t,j,k) = 0 ;
1969  }
1970  }
1971  }
1972  }
1973  }
1974  { //termination
1975  int32_t list_len = 0 ;
1976  for (int16_t diff=0; diff<nbest; diff++)
1977  {
1978  for (T_STATES i=0; i<m_N; i++)
1979  {
1980  oldtempvv[list_len] = -(delta.element(delta_array, (m_seq_len-1), i, diff, m_seq_len, m_N)+get_q(i)) ;
1981  oldtempii[list_len] = i + diff*m_N ;
1982  list_len++ ;
1983  }
1984  }
1985 
1986  CMath::nmin(oldtempvv.get_array(), oldtempii.get_array(), list_len, nbest) ;
1987 
1988  for (int16_t k=0; k<nbest; k++)
1989  {
1990  delta_end.element(k) = -oldtempvv[k] ;
1991  path_ends.element(k) = (oldtempii[k]%m_N) ;
1992  if (nbest>1)
1993  ktable_end.element(k) = (oldtempii[k]-path_ends.element(k))/m_N ;
1994  }
1995 
1996 
1997  }
1998 
1999  { //state sequence backtracking
2000  for (int16_t k=0; k<nbest; k++)
2001  {
2002  prob_nbest[k]= delta_end.element(k) ;
2003 
2004  int32_t i = 0 ;
2005  state_seq[i] = path_ends.element(k) ;
2006  int16_t q = 0 ;
2007  if (nbest>1)
2008  q=ktable_end.element(k) ;
2009  pos_seq[i] = m_seq_len-1 ;
2010 
2011  while (pos_seq[i]>0)
2012  {
2013  ASSERT(i+1<m_seq_len)
2014  //SG_DEBUG("s=%i p=%i q=%i\n", state_seq[i], pos_seq[i], q)
2015  state_seq[i+1] = psi.element(pos_seq[i], state_seq[i], q);
2016  pos_seq[i+1] = ptable.element(pos_seq[i], state_seq[i], q) ;
2017  if (nbest>1)
2018  q = ktable.element(pos_seq[i], state_seq[i], q) ;
2019  i++ ;
2020  }
2021  //SG_DEBUG("s=%i p=%i q=%i\n", state_seq[i], pos_seq[i], q)
2022  int32_t num_states = i+1 ;
2023  for (i=0; i<num_states;i++)
2024  {
2025  my_state_seq[i+k*m_seq_len] = state_seq[num_states-i-1] ;
2026  my_pos_seq[i+k*m_seq_len] = pos_seq[num_states-i-1] ;
2027  }
2028  if (num_states<m_seq_len)
2029  {
2030  my_state_seq[num_states+k*m_seq_len]=-1 ;
2031  my_pos_seq[num_states+k*m_seq_len]=-1 ;
2032  }
2033  }
2034  }
2035 
2036  //if (is_big)
2037  // SG_PRINT("DONE. \n")
2038 
2039 
2040 #ifdef DYNPROG_TIMING
2041  MyTime2.stop() ;
2042 
2043  //if (is_big)
2044  SG_PRINT("Timing: orf=%1.2f s \n Segment_init=%1.2f s Segment_pos=%1.2f s Segment_extend=%1.2f s Segment_clean=%1.2f s\nsvm_init=%1.2f s svm_pos=%1.2f svm_clean=%1.2f\n content_svm_values_time=%1.2f content_plifs_time=%1.2f\ninner_loop_max_time=%1.2f inner_loop=%1.2f long_transition_time=%1.2f\n total=%1.2f\n", orf_time, segment_init_time, segment_pos_time, segment_extend_time, segment_clean_time, svm_init_time, svm_pos_time, svm_clean_time, content_svm_values_time, content_plifs_time, inner_loop_max_time, inner_loop_time, long_transition_time, MyTime2.time_diff_sec())
2045 #endif
2046 
2047  SG_FREE(fixedtempvv);
2048  SG_FREE(fixedtempii);
2049  }
2050 
2051 
2053  int32_t *my_state_seq, int32_t *my_pos_seq,
2054  int32_t my_seq_len, const float64_t *seq_array, int32_t max_num_signals)
2055 {
2059  //m_my_scores.resize_array(m_my_state_seq.get_array_size()) ;
2060  //m_my_losses.resize_array(m_my_state_seq.get_array_size()) ;
2061  m_my_scores.resize_array(my_seq_len);
2062  m_my_losses.resize_array(my_seq_len);
2063  float64_t* my_scores=m_my_scores.get_array();
2064  float64_t* my_losses=m_my_losses.get_array();
2065  CPlifBase** Plif_matrix=m_plif_matrices->get_plif_matrix();
2066  CPlifBase** Plif_state_signals=m_plif_matrices->get_state_signals();
2067 
2068  if (!m_svm_arrays_clean)
2069  {
2070  SG_ERROR("SVM arrays not clean")
2071  return ;
2072  } ;
2073  //SG_PRINT("genestr_len=%i, genestr_num=%i\n", genestr_len, genestr_num)
2074  //m_mod_words.display() ;
2075  //m_sign_words.display() ;
2076  //m_string_words.display() ;
2077 
2078  bool use_svm = false ;
2079 
2080  CDynamicObjectArray PEN((CSGObject**) Plif_matrix, m_N, m_N, false, false) ; // 2d, CPlifBase*
2081  PEN.set_array_name("PEN");
2082 
2083  CDynamicObjectArray PEN_state_signals((CSGObject**) Plif_state_signals, m_N, max_num_signals, false, false) ; // 2d, CPlifBase*
2084  PEN_state_signals.set_array_name("PEN_state_signals");
2085 
2086  CDynamicArray<float64_t> seq_input(seq_array, m_N, m_seq_len, max_num_signals) ;
2087  seq_input.set_array_name("seq_input");
2088 
2089  { // determine whether to use svm outputs and clear derivatives
2090  for (int32_t i=0; i<m_N; i++)
2091  for (int32_t j=0; j<m_N; j++)
2092  {
2093  CPlifBase* penij=(CPlifBase*) PEN.element(i,j) ;
2094  if (penij==NULL)
2095  continue ;
2096 
2097  if (penij->uses_svm_values())
2098  use_svm=true ;
2099  penij->penalty_clear_derivative() ;
2100  }
2101  for (int32_t i=0; i<m_N; i++)
2102  for (int32_t j=0; j<max_num_signals; j++)
2103  {
2104  CPlifBase* penij=(CPlifBase*) PEN_state_signals.element(i,j) ;
2105  if (penij==NULL)
2106  continue ;
2107  if (penij->uses_svm_values())
2108  use_svm=true ;
2109  penij->penalty_clear_derivative() ;
2110  }
2111  }
2112 
2113  { // set derivatives of p, q and a to zero
2114 
2115  for (int32_t i=0; i<m_N; i++)
2116  {
2119  for (int32_t j=0; j<m_N; j++)
2121  }
2122  }
2123 
2124  { // clear score vector
2125  for (int32_t i=0; i<my_seq_len; i++)
2126  {
2127  my_scores[i]=0.0 ;
2128  my_losses[i]=0.0 ;
2129  }
2130  }
2131 
2132  //int32_t total_len = 0 ;
2133 
2134  //m_transition_matrix_a.display_array() ;
2135  //m_transition_matrix_a_id.display_array() ;
2136 
2137  // compute derivatives for given path
2142  {
2143  svm_value[s]=0 ;
2144  svm_value_part1[s]=0 ;
2145  svm_value_part2[s]=0 ;
2146  }
2147 
2148  //#ifdef DYNPROG_DEBUG
2149  float64_t total_score = 0.0 ;
2150  float64_t total_loss = 0.0 ;
2151  //#endif
2152 
2153  ASSERT(my_state_seq[0]>=0)
2154  m_initial_state_distribution_p_deriv.element(my_state_seq[0])++ ;
2155  my_scores[0] += m_initial_state_distribution_p.element(my_state_seq[0]) ;
2156 
2157  ASSERT(my_state_seq[my_seq_len-1]>=0)
2158  m_end_state_distribution_q_deriv.element(my_state_seq[my_seq_len-1])++ ;
2159  my_scores[my_seq_len-1] += m_end_state_distribution_q.element(my_state_seq[my_seq_len-1]);
2160 
2161  //#ifdef DYNPROG_DEBUG
2162  total_score += my_scores[0] + my_scores[my_seq_len-1] ;
2163  //#endif
2164 
2165  SG_DEBUG("m_seq_len=%i\n", my_seq_len)
2166  for (int32_t i=0; i<my_seq_len-1; i++)
2167  {
2168  if (my_state_seq[i+1]==-1)
2169  break ;
2170  int32_t from_state = my_state_seq[i] ;
2171  int32_t to_state = my_state_seq[i+1] ;
2172  int32_t from_pos = my_pos_seq[i] ;
2173  int32_t to_pos = my_pos_seq[i+1] ;
2174 
2175  int32_t elem_id = m_transition_matrix_a_id.element(from_state, to_state) ;
2176  my_losses[i] = m_seg_loss_obj->get_segment_loss(from_pos, to_pos, elem_id);
2177 
2178 #ifdef DYNPROG_DEBUG
2179 
2180 
2181  if (i>0)// test if segment loss is additive
2182  {
2183  float32_t loss1 = m_seg_loss_obj->get_segment_loss(my_pos_seq[i-1], my_pos_seq[i], elem_id);
2184  float32_t loss2 = m_seg_loss_obj->get_segment_loss(my_pos_seq[i], my_pos_seq[i+1], elem_id);
2185  float32_t loss3 = m_seg_loss_obj->get_segment_loss(my_pos_seq[i-1], my_pos_seq[i+1], elem_id);
2186  SG_PRINT("loss1:%f loss2:%f loss3:%f, diff:%f\n", loss1, loss2, loss3, loss1+loss2-loss3)
2187  if (CMath::abs(loss1+loss2-loss3)>0)
2188  {
2189  SG_PRINT("%i. segment loss %f (id=%i): from=%i(%i), to=%i(%i)\n", i, my_losses[i], elem_id, from_pos, from_state, to_pos, to_state)
2190  }
2191  }
2192  io->set_loglevel(M_DEBUG) ;
2193  SG_DEBUG("%i. segment loss %f (id=%i): from=%i(%i), to=%i(%i)\n", i, my_losses[i], elem_id, from_pos, from_state, to_pos, to_state)
2194 #endif
2195  // increase usage of this transition
2196  m_transition_matrix_a_deriv.element(from_state, to_state)++ ;
2197  my_scores[i] += m_transition_matrix_a.element(from_state, to_state) ;
2198  //SG_PRINT("m_transition_matrix_a.element(%i, %i),%f \n",from_state, to_state, m_transition_matrix_a.element(from_state, to_state))
2199 #ifdef DYNPROG_DEBUG
2200  SG_DEBUG("%i. scores[i]=%f\n", i, my_scores[i])
2201 #endif
2202 
2203  /*int32_t last_svm_pos[m_num_degrees] ;
2204  for (int32_t qq=0; qq<m_num_degrees; qq++)
2205  last_svm_pos[qq]=-1 ;*/
2206 
2207  bool is_long_transition = false ;
2208  if (m_long_transitions)
2209  {
2210  if (m_pos[to_pos]-m_pos[from_pos]>m_long_transition_threshold)
2211  is_long_transition = true ;
2212  if (m_orf_info.element(from_state,0)!=-1)
2213  is_long_transition = false ;
2214  }
2215 
2216  int32_t from_pos_thresh = from_pos ;
2217  int32_t to_pos_thresh = to_pos ;
2218 
2219  if (use_svm)
2220  {
2221  if (is_long_transition)
2222  {
2223 
2224  while (from_pos_thresh<to_pos && m_pos[from_pos_thresh+1] - m_pos[from_pos] <= m_long_transition_threshold) // *
2225  from_pos_thresh++ ;
2226  ASSERT(from_pos_thresh<to_pos)
2227  ASSERT(m_pos[from_pos_thresh] - m_pos[from_pos] <= m_long_transition_threshold) // *
2228  ASSERT(m_pos[from_pos_thresh+1] - m_pos[from_pos] > m_long_transition_threshold)// *
2229 
2230  int32_t frame = m_orf_info.element(from_state,0);
2231  lookup_content_svm_values(from_pos, from_pos_thresh, m_pos[from_pos], m_pos[from_pos_thresh], svm_value_part1, frame);
2232 
2233 #ifdef DYNPROG_DEBUG
2234  SG_PRINT("part1: pos1: %i pos2: %i pos3: %i \nsvm_value_part1: ", m_pos[from_pos], m_pos[from_pos_thresh], m_pos[from_pos_thresh+1])
2236  SG_PRINT("%1.4f ", svm_value_part1[s])
2237  SG_PRINT("\n")
2238 #endif
2239 
2240  while (to_pos_thresh>0 && m_pos[to_pos] - m_pos[to_pos_thresh-1] <= m_long_transition_threshold) // *
2241  to_pos_thresh-- ;
2242  ASSERT(to_pos_thresh>0)
2243  ASSERT(m_pos[to_pos] - m_pos[to_pos_thresh] <= m_long_transition_threshold) // *
2244  ASSERT(m_pos[to_pos] - m_pos[to_pos_thresh-1] > m_long_transition_threshold) // *
2245 
2246  lookup_content_svm_values(to_pos_thresh, to_pos, m_pos[to_pos_thresh], m_pos[to_pos], svm_value_part2, frame);
2247 
2248 #ifdef DYNPROG_DEBUG
2249  SG_PRINT("part2: pos1: %i pos2: %i pos3: %i \nsvm_value_part2: ", m_pos[to_pos], m_pos[to_pos_thresh], m_pos[to_pos_thresh+1])
2251  SG_PRINT("%1.4f ", svm_value_part2[s])
2252  SG_PRINT("\n")
2253 #endif
2254  }
2255  else
2256  {
2257  /* normal case */
2258 
2259  //SG_PRINT("from_pos: %i; to_pos: %i; m_pos[to_pos]-m_pos[from_pos]: %i \n",from_pos, to_pos, m_pos[to_pos]-m_pos[from_pos])
2260  int32_t frame = m_orf_info.element(from_state,0);
2261  if (false)//(frame>=0)
2262  {
2263  int32_t num_current_svms=0;
2264  int32_t svm_ids[] = {-8, -7, -6, -5, -4, -3, -2, -1};
2265  SG_PRINT("penalties(%i, %i), frame:%i ", from_state, to_state, frame)
2266  ((CPlifBase*) PEN.element(to_state, from_state))->get_used_svms(&num_current_svms, svm_ids);
2267  SG_PRINT("\n")
2268  }
2269 
2270  lookup_content_svm_values(from_pos, to_pos, m_pos[from_pos],m_pos[to_pos], svm_value, frame);
2271 #ifdef DYNPROG_DEBUG
2272  SG_PRINT("part2: pos1: %i pos2: %i \nsvm_values: ", m_pos[from_pos], m_pos[to_pos])
2274  SG_PRINT("%1.4f ", svm_value[s])
2275  SG_PRINT("\n")
2276 #endif
2277  }
2278  }
2279 
2280  if (PEN.element(to_state, from_state)!=NULL)
2281  {
2282  float64_t nscore = 0 ;
2283  if (is_long_transition)
2284  {
2285  float64_t pen_value_part1 = ((CPlifBase*) PEN.element(to_state, from_state))->lookup_penalty(m_pos[from_pos_thresh]-m_pos[from_pos], svm_value_part1) ;
2286  float64_t pen_value_part2 = ((CPlifBase*) PEN.element(to_state, from_state))->lookup_penalty(m_pos[to_pos]-m_pos[to_pos_thresh], svm_value_part2) ;
2287  nscore= 0.5*pen_value_part1 + 0.5*pen_value_part2 ;
2288  }
2289  else
2290  nscore = ((CPlifBase*) PEN.element(to_state, from_state))->lookup_penalty(m_pos[to_pos]-m_pos[from_pos], svm_value) ;
2291 
2292  if (false)//(nscore<-1e9)
2293  SG_PRINT("is_long_transition=%i (from_pos=%i (%i), to_pos=%i (%i)=> %1.5f\n",
2294  is_long_transition, m_pos[from_pos], from_state, m_pos[to_pos], to_state, nscore) ;
2295 
2296  my_scores[i] += nscore ;
2297 
2298  for (int32_t s=m_num_svms;s<m_num_lin_feat_plifs_cum[m_num_raw_data]; s++)/*set tiling plif values to neutral values (that do not influence derivative calculation)*/
2299  {
2300  svm_value[s]=-CMath::INFTY;
2301  svm_value_part1[s]=-CMath::INFTY;
2302  svm_value_part2[s]=-CMath::INFTY;
2303  }
2304 
2305 #ifdef DYNPROG_DEBUG
2306  //SG_DEBUG("%i. transition penalty: from_state=%i to_state=%i from_pos=%i [%i] to_pos=%i [%i] value=%i\n", i, from_state, to_state, from_pos, m_pos[from_pos], to_pos, m_pos[to_pos], m_pos[to_pos]-m_pos[from_pos])
2307 #endif
2308  if (is_long_transition)
2309  {
2310 #ifdef DYNPROG_DEBUG
2311  float64_t sum_score = 0.0 ;
2312 
2313  for (int kk=0; kk<i; kk++)
2314  sum_score += my_scores[i] ;
2315 
2316  SG_PRINT("is_long_transition=%i (from_pos=%i (%i), to_pos=%i (%i)=> %1.5f, %1.5f --- 1: %1.6f (%i-%i) 2: %1.6f (%i-%i) \n",
2317  is_long_transition, m_pos[from_pos], from_state, m_pos[to_pos], to_state,
2318  nscore, sum_score,
2319  PEN.element(to_state, from_state)->lookup_penalty(m_pos[from_pos_thresh]-m_pos[from_pos], svm_value_part1)*0.5, m_pos[from_pos], m_pos[from_pos_thresh],
2320  PEN.element(to_state, from_state)->lookup_penalty(m_pos[to_pos]-m_pos[to_pos_thresh], svm_value_part2)*0.5, m_pos[to_pos_thresh], m_pos[to_pos]) ;
2321 #endif
2322  }
2323 
2324  if (is_long_transition)
2325  {
2326  ((CPlifBase*) PEN.element(to_state, from_state))->penalty_add_derivative(m_pos[from_pos_thresh]-m_pos[from_pos], svm_value_part1, 0.5) ;
2327  ((CPlifBase*) PEN.element(to_state, from_state))->penalty_add_derivative(m_pos[to_pos]-m_pos[to_pos_thresh], svm_value_part2, 0.5) ;
2328  }
2329  else
2330  ((CPlifBase*) PEN.element(to_state, from_state))->penalty_add_derivative(m_pos[to_pos]-m_pos[from_pos], svm_value, 1) ;
2331 
2332  //SG_PRINT("m_num_raw_data = %i \n", m_num_raw_data)
2333 
2334  // for tiling array and rna-seq data every single measurement must be added to the derivative
2335  // in contrast to the content svm predictions where we have a single value per transition;
2336  // content svm predictions have already been added to the derivative, thus we start with d=1
2337  // instead of d=0
2338  if (is_long_transition)
2339  {
2340  for (int32_t d=1; d<=m_num_raw_data; d++)
2341  {
2342  for (int32_t s=0;s<m_num_lin_feat_plifs_cum[m_num_raw_data]+m_num_intron_plifs;s++)
2343  svm_value[s]=-CMath::INFTY;
2344  float64_t* intensities = SG_MALLOC(float64_t, m_num_probes_cum[d]);
2345  int32_t num_intensities = raw_intensities_interval_query(m_pos[from_pos], m_pos[from_pos_thresh],intensities, d);
2346  for (int32_t k=0;k<num_intensities;k++)
2347  {
2348  for (int32_t j=m_num_lin_feat_plifs_cum[d-1];j<m_num_lin_feat_plifs_cum[d];j++)
2349  svm_value[j]=intensities[k];
2350 
2351  ((CPlifBase*) PEN.element(to_state, from_state))->penalty_add_derivative(-CMath::INFTY, svm_value, 0.5) ;
2352 
2353  }
2354  num_intensities = raw_intensities_interval_query(m_pos[to_pos_thresh], m_pos[to_pos],intensities, d);
2355  for (int32_t k=0;k<num_intensities;k++)
2356  {
2357  for (int32_t j=m_num_lin_feat_plifs_cum[d-1];j<m_num_lin_feat_plifs_cum[d];j++)
2358  svm_value[j]=intensities[k];
2359 
2360  ((CPlifBase*) PEN.element(to_state, from_state))->penalty_add_derivative(-CMath::INFTY, svm_value, 0.5) ;
2361 
2362  }
2363  SG_FREE(intensities);
2364 
2365  }
2366  }
2367  else
2368  {
2369  for (int32_t d=1; d<=m_num_raw_data; d++)
2370  {
2371  for (int32_t s=0;s<m_num_lin_feat_plifs_cum[m_num_raw_data]+m_num_intron_plifs;s++)
2372  svm_value[s]=-CMath::INFTY;
2373  float64_t* intensities = SG_MALLOC(float64_t, m_num_probes_cum[d]);
2374  int32_t num_intensities = raw_intensities_interval_query(m_pos[from_pos], m_pos[to_pos],intensities, d);
2375  //SG_PRINT("m_pos[from_pos]:%i, m_pos[to_pos]:%i, num_intensities:%i\n",m_pos[from_pos],m_pos[to_pos], num_intensities)
2376  for (int32_t k=0;k<num_intensities;k++)
2377  {
2378  for (int32_t j=m_num_lin_feat_plifs_cum[d-1];j<m_num_lin_feat_plifs_cum[d];j++)
2379  svm_value[j]=intensities[k];
2380 
2381  ((CPlifBase*) PEN.element(to_state, from_state))->penalty_add_derivative(-CMath::INFTY, svm_value, 1) ;
2382 
2383  }
2384  SG_FREE(intensities);
2385  }
2386  }
2387 
2388  }
2389 #ifdef DYNPROG_DEBUG
2390  SG_DEBUG("%i. scores[i]=%f\n", i, my_scores[i])
2391 #endif
2392 
2393  //SG_DEBUG("emmission penalty skipped: to_state=%i to_pos=%i value=%1.2f score=%1.2f\n", to_state, to_pos, seq_input.element(to_state, to_pos), 0.0)
2394  for (int32_t k=0; k<max_num_signals; k++)
2395  {
2396  if ((PEN_state_signals.element(to_state,k)==NULL)&&(k==0))
2397  {
2398 #ifdef DYNPROG_DEBUG
2399  SG_DEBUG("%i. emmission penalty: to_state=%i to_pos=%i score=%1.2f (no signal plif)\n", i, to_state, to_pos, seq_input.element(to_state, to_pos, k))
2400 #endif
2401  my_scores[i] += seq_input.element(to_state, to_pos, k) ;
2402  //if (seq_input.element(to_state, to_pos, k) !=0)
2403  // SG_PRINT("features(%i,%i): %f\n",to_state,to_pos,seq_input.element(to_state, to_pos, k))
2404  break ;
2405  }
2406  if (PEN_state_signals.element(to_state, k)!=NULL)
2407  {
2408  float64_t nscore = ((CPlifBase*) PEN_state_signals.element(to_state,k))->lookup_penalty(seq_input.element(to_state, to_pos, k), svm_value) ; // this should be ok for long_transitions (svm_value does not matter)
2409  my_scores[i] += nscore ;
2410 #ifdef DYNPROG_DEBUG
2411  if (false)//(nscore<-1e9)
2412  {
2413  SG_PRINT("is_long_transition=%i (from_pos=%i (%i), from_state=%i, to_pos=%i (%i) to_state=%i=> %1.5f, dim3:%i, seq_input.element(to_state, to_pos, k): %1.4f\n",
2414  is_long_transition, m_pos[from_pos], from_pos, from_state, m_pos[to_pos], to_pos, to_state, nscore, k, seq_input.element(to_state, to_pos, k)) ;
2415  for (int x=0; x<23; x++)
2416  {
2417  for (int i=-10; i<10; i++)
2418  SG_PRINT("%1.4f\t", seq_input.element(x, to_pos+i, k))
2419  SG_PRINT("\n")
2420  }
2421 
2422  }
2423 #endif
2424  //break ;
2425  //int32_t num_current_svms=0;
2426  //int32_t svm_ids[] = {-8, -7, -6, -5, -4, -3, -2, -1};
2427  //SG_PRINT("PEN_state_signals->id: ")
2428  //PEN_state_signals.element(to_state, k)->get_used_svms(&num_current_svms, svm_ids);
2429  //SG_PRINT("\n")
2430  //if (nscore != 0)
2431  //SG_PRINT("%i. emmission penalty: to_state=%i to_pos=%i value=%1.2f score=%1.2f k=%i\n", i, to_state, to_pos, seq_input.element(to_state, to_pos, k), nscore, k)
2432 #ifdef DYNPROG_DEBUG
2433  SG_DEBUG("%i. emmission penalty: to_state=%i to_pos=%i value=%1.2f score=%1.2f k=%i\n", i, to_state, to_pos, seq_input.element(to_state, to_pos, k), nscore, k)
2434 #endif
2435  ((CPlifBase*) PEN_state_signals.element(to_state,k))->penalty_add_derivative(seq_input.element(to_state, to_pos, k), svm_value, 1) ; // this should be ok for long_transitions (svm_value does not matter)
2436  } else
2437  break ;
2438  }
2439 
2440  //#ifdef DYNPROG_DEBUG
2441  //SG_PRINT("scores[%i]=%f (final) \n", i, my_scores[i])
2442  //SG_PRINT("losses[%i]=%f (final) , total_loss: %f \n", i, my_losses[i], total_loss)
2443  total_score += my_scores[i] ;
2444  total_loss += my_losses[i] ;
2445  //#endif
2446  }
2447  //#ifdef DYNPROG_DEBUG
2448  //SG_PRINT("total score = %f \n", total_score)
2449  //SG_PRINT("total loss = %f \n", total_loss)
2450  //#endif
2451  SG_FREE(svm_value);
2452  SG_FREE(svm_value_part1);
2453  SG_FREE(svm_value_part2);
2454 }
2455 
2456 int32_t CDynProg::raw_intensities_interval_query(const int32_t from_pos, const int32_t to_pos, float64_t* intensities, int32_t type)
2457 {
2458  ASSERT(from_pos<to_pos)
2459  int32_t num_intensities = 0;
2460  int32_t* p_tiling_pos = &m_probe_pos[m_num_probes_cum[type-1]];
2461  float64_t* p_tiling_data = &m_raw_intensities[m_num_probes_cum[type-1]];
2462  int32_t last_pos;
2463  int32_t num = m_num_probes_cum[type-1];
2464  while (*p_tiling_pos<to_pos)
2465  {
2466  if (*p_tiling_pos>=from_pos)
2467  {
2468  intensities[num_intensities] = *p_tiling_data;
2469  num_intensities++;
2470  }
2471  num++;
2472  if (num>=m_num_probes_cum[type])
2473  break;
2474  last_pos = *p_tiling_pos;
2475  p_tiling_pos++;
2476  p_tiling_data++;
2477  ASSERT(last_pos<*p_tiling_pos)
2478  }
2479  return num_intensities;
2480 }
2481 
2482 void CDynProg::lookup_content_svm_values(const int32_t from_state, const int32_t to_state, const int32_t from_pos, const int32_t to_pos, float64_t* svm_values, int32_t frame)
2483 {
2484 #ifdef DYNPROG_TIMING_DETAIL
2485  MyTime.start() ;
2486 #endif
2487 // ASSERT(from_state<to_state)
2488 // if (!(from_pos<to_pos))
2489 // SG_ERROR("from_pos!<to_pos, from_pos: %i to_pos: %i \n",from_pos,to_pos)
2490  for (int32_t i=0;i<m_num_svms;i++)
2491  {
2492  float64_t to_val = m_lin_feat.get_element(i, to_state);
2493  float64_t from_val = m_lin_feat.get_element(i, from_state);
2494  svm_values[i] = (to_val-from_val)/(to_pos-from_pos);
2495  }
2496  for (int32_t i=m_num_svms;i<m_num_lin_feat_plifs_cum[m_num_raw_data];i++)
2497  {
2498  float64_t to_val = m_lin_feat.get_element(i, to_state);
2499  float64_t from_val = m_lin_feat.get_element(i, from_state);
2500  svm_values[i] = to_val-from_val ;
2501  }
2502  if (m_intron_list)
2503  {
2504  int32_t* support = SG_MALLOC(int32_t, m_num_intron_plifs);
2505  m_intron_list->get_intron_support(support, from_state, to_state);
2506  int32_t intron_list_start = m_num_lin_feat_plifs_cum[m_num_raw_data];
2507  int32_t intron_list_end = m_num_lin_feat_plifs_cum[m_num_raw_data]+m_num_intron_plifs;
2508  int32_t cnt = 0;
2509  for (int32_t i=intron_list_start; i<intron_list_end;i++)
2510  {
2511  svm_values[i] = (float64_t) (support[cnt]);
2512  cnt++;
2513  }
2514  //if (to_pos>3990 && to_pos<4010)
2515  // SG_PRINT("from_state:%i to_state:%i support[0]:%i support[1]:%i\n",from_state, to_state, support[0], support[1])
2516  SG_FREE(support);
2517  }
2518  // find the correct row with precomputed frame predictions
2519  if (frame!=-1)
2520  {
2521  svm_values[frame_plifs[0]] = 1e10;
2522  svm_values[frame_plifs[1]] = 1e10;
2523  svm_values[frame_plifs[2]] = 1e10;
2524  int32_t global_frame = from_pos%3;
2525  int32_t row = ((global_frame+frame)%3)+4;
2526  float64_t to_val = m_lin_feat.get_element(row, to_state);
2527  float64_t from_val = m_lin_feat.get_element(row, from_state);
2528  svm_values[frame+frame_plifs[0]] = (to_val-from_val)/(to_pos-from_pos);
2529  }
2530 #ifdef DYNPROG_TIMING_DETAIL
2531  MyTime.stop() ;
2532  content_svm_values_time += MyTime.time_diff_sec() ;
2533 #endif
2534 }
2535 void CDynProg::set_intron_list(CIntronList* intron_list, int32_t num_plifs)
2536 {
2537  m_intron_list = intron_list;
2538  m_num_intron_plifs = num_plifs;
2539 }
2540 

SHOGUN Machine Learning Toolbox - Documentation