SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringFeatures.cpp
Go to the documentation of this file.
5 #include <shogun/io/SGIO.h>
9 
10 #include <sys/types.h>
11 #include <sys/stat.h>
12 #include <dirent.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <unistd.h>
16 
17 
18 namespace shogun
19 {
20 
22 {
23  init();
24  alphabet=new CAlphabet();
25 }
26 
28 {
29  init();
30 
31  alphabet=new CAlphabet(alpha);
35 }
36 
37 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha)
38 : CFeatures(0)
39 {
40  init();
41 
42  alphabet=new CAlphabet(alpha);
46  set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
47 }
48 
49 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha)
50 : CFeatures(0)
51 {
52  init();
53 
54  alphabet=new CAlphabet(alpha);
58  set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
59 }
60 
62 : CFeatures(0)
63 {
64  init();
65 
66  ASSERT(alpha)
67  SG_REF(alpha);
68  alphabet=alpha;
71 }
72 
73 template<class ST> CStringFeatures<ST>::CStringFeatures(const CStringFeatures & orig)
74 : CFeatures(orig), num_vectors(orig.num_vectors),
75  single_string(orig.single_string),
76  length_of_single_string(orig.length_of_single_string),
77  max_string_length(orig.max_string_length),
78  num_symbols(orig.num_symbols),
79  original_num_symbols(orig.original_num_symbols),
80  order(orig.order), preprocess_on_get(false),
81  feature_cache(NULL)
82 {
83  init();
84 
85  ASSERT(orig.single_string == NULL) //not implemented
86 
87  alphabet=orig.alphabet;
89 
90  if (orig.features)
91  {
92  features=SG_MALLOC(SGString<ST>, orig.num_vectors);
93 
94  for (int32_t i=0; i<num_vectors; i++)
95  {
96  features[i].string=SG_MALLOC(ST, orig.features[i].slen);
97  features[i].slen=orig.features[i].slen;
98  memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].slen);
99  }
100  }
101 
102  if (orig.symbol_mask_table)
103  {
104  symbol_mask_table=SG_MALLOC(ST, 256);
106 
107  for (int32_t i=0; i<256; i++)
109  }
110 
113 }
114 
115 template<class ST> CStringFeatures<ST>::CStringFeatures(CFile* loader, EAlphabet alpha)
116 : CFeatures(), num_vectors(0),
117  features(NULL), single_string(NULL), length_of_single_string(0),
118  max_string_length(0), order(0),
119  preprocess_on_get(false), feature_cache(NULL)
120 {
121  init();
122 
123  alphabet=new CAlphabet(alpha);
124  SG_REF(alphabet);
127  load(loader);
128 }
129 
131 {
132  cleanup();
133 
134  SG_UNREF(alphabet);
135 }
136 
137 template<class ST> void CStringFeatures<ST>::cleanup()
138 {
139  remove_all_subsets();
140 
141  if (single_string)
142  {
143  SG_FREE(single_string);
144  single_string=NULL;
145  }
146  else
147  cleanup_feature_vectors(0, num_vectors-1);
148 
149  /*
150  if (single_string)
151  {
152  SG_FREE(single_string);
153  single_string=NULL;
154  }
155  else
156  cleanup_feature_vectors(0, num_vectors-1);
157  */
158 
159  num_vectors=0;
160  SG_FREE(features);
161  SG_FREE(symbol_mask_table);
162  features=NULL;
163  symbol_mask_table=NULL;
164 
165  /* start with a fresh alphabet, but instead of emptying the histogram
166  * create a new object (to leave the alphabet object alone if it is used
167  * by others)
168  */
169  CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
170  SG_UNREF(alphabet);
171  alphabet=alpha;
172  SG_REF(alphabet);
173 }
174 
175 template<class ST> void CStringFeatures<ST>::cleanup_feature_vector(int32_t num)
176 {
177  ASSERT(num<get_num_vectors())
178 
179  if (features)
180  {
181  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
182  SG_FREE(features[real_num].string);
183  features[real_num].string=NULL;
184  features[real_num].slen=0;
185 
186  determine_maximum_string_length();
187  }
188 }
189 
190 template<class ST> void CStringFeatures<ST>::cleanup_feature_vectors(int32_t start, int32_t stop)
191 {
192  if (features && get_num_vectors())
193  {
194  ASSERT(start<get_num_vectors())
195  ASSERT(stop<get_num_vectors())
196 
197  for (int32_t i=start; i<=stop; i++)
198  {
199  int32_t real_num=m_subset_stack->subset_idx_conversion(i);
200  SG_FREE(features[real_num].string);
201  features[real_num].string=NULL;
202  features[real_num].slen=0;
203  }
204  determine_maximum_string_length();
205  }
206 }
207 
208 template<class ST> EFeatureClass CStringFeatures<ST>::get_feature_class() const { return C_STRING; }
209 
210 template<class ST> EFeatureType CStringFeatures<ST>::get_feature_type() const { return F_UNKNOWN; }
211 
213 {
214  SG_REF(alphabet);
215  return alphabet;
216 }
217 
218 template<class ST> CFeatures* CStringFeatures<ST>::duplicate() const
219 {
220  return new CStringFeatures<ST>(*this);
221 }
222 
224 {
225  ASSERT(features)
226  if (num>=get_num_vectors())
227  {
228  SG_ERROR("Index out of bounds (number of strings %d, you "
229  "requested %d)\n", get_num_vectors(), num);
230  }
231 
232  int32_t l;
233  bool free_vec;
234  ST* vec=get_feature_vector(num, l, free_vec);
235  ST* dst=SG_MALLOC(ST, l);
236  memcpy(dst, vec, l*sizeof(ST));
237  free_feature_vector(vec, num, free_vec);
238  return SGVector<ST>(dst, l, true);
239 }
240 
241 template<class ST> void CStringFeatures<ST>::set_feature_vector(SGVector<ST> vector, int32_t num)
242 {
243  ASSERT(features)
244 
245  if (m_subset_stack->has_subsets())
246  SG_ERROR("A subset is set, cannot set feature vector\n")
247 
248  if (num>=num_vectors)
249  {
250  SG_ERROR("Index out of bounds (number of strings %d, you "
251  "requested %d)\n", num_vectors, num);
252  }
253 
254  if (vector.vlen<=0)
255  SG_ERROR("String has zero or negative length\n")
256 
257  cleanup_feature_vector(num);
258  features[num].slen=vector.vlen;
259  features[num].string=SG_MALLOC(ST, vector.vlen);
260  memcpy(features[num].string, vector.vector, vector.vlen*sizeof(ST));
261 
262  determine_maximum_string_length();
263 }
264 
266 {
267  preprocess_on_get=true;
268 }
269 
271 {
272  preprocess_on_get=false;
273 }
274 
275 template<class ST> ST* CStringFeatures<ST>::get_feature_vector(int32_t num, int32_t& len, bool& dofree)
276 {
277  ASSERT(features)
278  if (num>=get_num_vectors())
279  SG_ERROR("Requested feature vector with index %d while total num is", num, get_num_vectors())
280 
281  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
282 
283  if (!preprocess_on_get)
284  {
285  dofree=false;
286  len=features[real_num].slen;
287  return features[real_num].string;
288  }
289  else
290  {
291  SG_DEBUG("computing feature vector!\n")
292  ST* feat=compute_feature_vector(num, len);
293  dofree=true;
294 
295  if (get_num_preprocessors())
296  {
297  ST* tmp_feat_before=feat;
298 
299  for (int32_t i=0; i<get_num_preprocessors(); i++)
300  {
301  CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
302  feat=p->apply_to_string(tmp_feat_before, len);
303  SG_UNREF(p);
304  SG_FREE(tmp_feat_before);
305  tmp_feat_before=feat;
306  }
307  }
308  // TODO: implement caching
309  return feat;
310  }
311 }
312 
314 {
315  int32_t num_feat;
316  int32_t num_vec;
317  SGString<ST>* s=get_transposed(num_feat, num_vec);
318  SGStringList<ST> string_list;
319  string_list.strings = s;
320  string_list.num_strings = num_vec;
321  string_list.max_string_length = num_feat;
322 
323  return new CStringFeatures<ST>(string_list, alphabet);
324 }
325 
326 template<class ST> SGString<ST>* CStringFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec)
327 {
328  num_feat=get_num_vectors();
329  num_vec=get_max_vector_length();
330  ASSERT(have_same_length())
331 
332  SG_DEBUG("Allocating memory for transposed string features of size %ld\n",
333  int64_t(num_feat)*num_vec);
334 
335  SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
336 
337  for (int32_t i=0; i<num_vec; i++)
338  {
339  sf[i].string=SG_MALLOC(ST, num_feat);
340  sf[i].slen=num_feat;
341  }
342 
343  for (int32_t i=0; i<num_feat; i++)
344  {
345  int32_t len=0;
346  bool free_vec=false;
347  ST* vec=get_feature_vector(i, len, free_vec);
348 
349  for (int32_t j=0; j<num_vec; j++)
350  sf[j].string[i]=vec[j];
351 
352  free_feature_vector(vec, i, free_vec);
353  }
354  return sf;
355 }
356 
357 template<class ST> void CStringFeatures<ST>::free_feature_vector(ST* feat_vec, int32_t num, bool dofree)
358 {
359  if (num>=get_num_vectors())
360  {
361  SG_ERROR(
362  "Trying to access string[%d] but num_str=%d\n", num,
363  get_num_vectors());
364  }
365 
366  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
367 
368  if (feature_cache)
369  feature_cache->unlock_entry(real_num);
370 
371  if (dofree)
372  SG_FREE(feat_vec);
373 }
374 
375 template<class ST> void CStringFeatures<ST>::free_feature_vector(SGVector<ST> feat_vec, int32_t num)
376 {
377  if (num>=get_num_vectors())
378  {
379  SG_ERROR(
380  "Trying to access string[%d] but num_str=%d\n", num,
381  get_num_vectors());
382  }
383 
384  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
385 
386  if (feature_cache)
387  feature_cache->unlock_entry(real_num);
388 }
389 
390 template<class ST> ST CStringFeatures<ST>::get_feature(int32_t vec_num, int32_t feat_num)
391 {
392  ASSERT(vec_num<get_num_vectors())
393 
394  int32_t len;
395  bool free_vec;
396  ST* vec=get_feature_vector(vec_num, len, free_vec);
397  ASSERT(feat_num<len)
398  ST result=vec[feat_num];
399  free_feature_vector(vec, vec_num, free_vec);
400 
401  return result;
402 }
403 
404 template<class ST> int32_t CStringFeatures<ST>::get_vector_length(int32_t vec_num)
405 {
406  ASSERT(vec_num<get_num_vectors())
407 
408  int32_t len;
409  bool free_vec;
410  ST* vec=get_feature_vector(vec_num, len, free_vec);
411  free_feature_vector(vec, vec_num, free_vec);
412  return len;
413 }
414 
415 template<class ST> int32_t CStringFeatures<ST>::get_max_vector_length()
416 {
417  return max_string_length;
418 }
419 
420 template<class ST> int32_t CStringFeatures<ST>::get_num_vectors() const
421 {
422  return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : num_vectors;
423 }
424 
425 template<class ST> floatmax_t CStringFeatures<ST>::get_num_symbols() { return num_symbols; }
426 
427 template<class ST> floatmax_t CStringFeatures<ST>::get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
428 
429 template<class ST> floatmax_t CStringFeatures<ST>::get_original_num_symbols() { return original_num_symbols; }
430 
431 template<class ST> int32_t CStringFeatures<ST>::get_order() { return order; }
432 
433 template<class ST> ST CStringFeatures<ST>::get_masked_symbols(ST symbol, uint8_t mask)
434 {
435  ASSERT(symbol_mask_table)
436  return symbol_mask_table[mask] & symbol;
437 }
438 
439 template<class ST> ST CStringFeatures<ST>::shift_offset(ST offset, int32_t amount)
440 {
441  ASSERT(alphabet)
442  return (offset << (amount*alphabet->get_num_bits()));
443 }
444 
445 template<class ST> ST CStringFeatures<ST>::shift_symbol(ST symbol, int32_t amount)
446 {
447  ASSERT(alphabet)
448  return (symbol >> (amount*alphabet->get_num_bits()));
449 }
450 
451 template<class ST> void CStringFeatures<ST>::load_ascii_file(char* fname, bool remap_to_bin,
452  EAlphabet ascii_alphabet, EAlphabet binary_alphabet)
453 {
454  remove_all_subsets();
455 
456  size_t blocksize=1024*1024;
457  size_t required_blocksize=0;
458  uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
459  uint8_t* overflow=NULL;
460  int32_t overflow_len=0;
461 
462  cleanup();
463 
464  CAlphabet* alpha=new CAlphabet(ascii_alphabet);
465  CAlphabet* alpha_bin=new CAlphabet(binary_alphabet);
466 
467  FILE* f=fopen(fname, "ro");
468 
469  if (f)
470  {
471  num_vectors=0;
472  max_string_length=0;
473 
474  SG_INFO("counting line numbers in file %s\n", fname)
475  size_t block_offs=0;
476  size_t old_block_offs=0;
477  fseek(f, 0, SEEK_END);
478  size_t fsize=ftell(f);
479  rewind(f);
480 
481  if (blocksize>fsize)
482  blocksize=fsize;
483 
484  SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize)
485 
486  size_t sz=blocksize;
487  while (sz == blocksize)
488  {
489  sz=fread(dummy, sizeof(uint8_t), blocksize, f);
490  for (size_t i=0; i<sz; i++)
491  {
492  block_offs++;
493  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
494  {
495  num_vectors++;
496  required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
497  old_block_offs=block_offs;
498  }
499  }
500  SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t")
501  }
502 
503  SG_INFO("found %d strings\n", num_vectors)
504  SG_FREE(dummy);
505  blocksize=required_blocksize;
506  dummy=SG_MALLOC(uint8_t, blocksize);
507  overflow=SG_MALLOC(uint8_t, blocksize);
508  features=SG_MALLOC(SGString<ST>, num_vectors);
509 
510  rewind(f);
511  sz=blocksize;
512  int32_t lines=0;
513  while (sz == blocksize)
514  {
515  sz=fread(dummy, sizeof(uint8_t), blocksize, f);
516 
517  size_t old_sz=0;
518  for (size_t i=0; i<sz; i++)
519  {
520  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
521  {
522  int32_t len=i-old_sz;
523  //SG_PRINT("i:%d len:%d old_sz:%d\n", i, len, old_sz)
524  max_string_length=CMath::max(max_string_length, len+overflow_len);
525 
526  features[lines].slen=len;
527  features[lines].string=SG_MALLOC(ST, len);
528 
529  if (remap_to_bin)
530  {
531  for (int32_t j=0; j<overflow_len; j++)
532  features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
533  for (int32_t j=0; j<len; j++)
534  features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
535  alpha->add_string_to_histogram(&dummy[old_sz], len);
536  alpha_bin->add_string_to_histogram(features[lines].string, features[lines].slen);
537  }
538  else
539  {
540  for (int32_t j=0; j<overflow_len; j++)
541  features[lines].string[j]=overflow[j];
542  for (int32_t j=0; j<len; j++)
543  features[lines].string[j+overflow_len]=dummy[old_sz+j];
544  alpha->add_string_to_histogram(&dummy[old_sz], len);
545  alpha->add_string_to_histogram(features[lines].string, features[lines].slen);
546  }
547 
548  // clear overflow
549  overflow_len=0;
550 
551  //CMath::display_vector(features[lines].string, len);
552  old_sz=i+1;
553  lines++;
554  SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t")
555  }
556  }
557  for (size_t i=old_sz; i<sz; i++)
558  overflow[i-old_sz]=dummy[i];
559 
560  overflow_len=sz-old_sz;
561  }
562 
563  if (alpha->check_alphabet_size() && alpha->check_alphabet())
564  {
565  SG_INFO("file successfully read\n")
566  SG_INFO("max_string_length=%d\n", max_string_length)
567  SG_INFO("num_strings=%d\n", num_vectors)
568  }
569  fclose(f);
570  }
571 
572  SG_FREE(dummy);
573  SG_FREE(overflow);
574 
575  SG_UNREF(alphabet);
576 
577  if (remap_to_bin)
578  {
579  alphabet=alpha_bin;
580  SG_UNREF(alpha);
581  }
582  else
583  {
584  alphabet=alpha;
585  SG_UNREF(alpha_bin);
586  }
587  SG_REF(alphabet);
588  num_symbols=alphabet->get_num_symbols();
589 }
590 
591 template<class ST> bool CStringFeatures<ST>::load_fasta_file(const char* fname, bool ignore_invalid)
592 {
593  remove_all_subsets();
594 
595  int32_t i=0;
596  uint64_t len=0;
597  uint64_t offs=0;
598  int32_t num=0;
599  int32_t max_len=0;
600 
601  CMemoryMappedFile<char> f(fname);
602 
603  while (true)
604  {
605  char* s=f.get_line(len, offs);
606  if (!s)
607  break;
608 
609  if (len>0 && s[0]=='>')
610  num++;
611  }
612 
613  if (num==0)
614  SG_ERROR("No fasta hunks (lines starting with '>') found\n")
615 
616  cleanup();
617  SG_UNREF(alphabet);
618  alphabet=new CAlphabet(DNA);
619  num_symbols=alphabet->get_num_symbols();
620 
621  SGString<ST>* strings=SG_MALLOC(SGString<ST>, num);
622  offs=0;
623 
624  for (i=0;i<num; i++)
625  {
626  uint64_t id_len=0;
627  char* id=f.get_line(id_len, offs);
628 
629  char* fasta=f.get_line(len, offs);
630  char* s=fasta;
631  int32_t fasta_len=0;
632  int32_t spanned_lines=0;
633 
634  while (true)
635  {
636  if (!s || len==0)
637  SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len)
638 
639  if (s[0]=='>' || offs==f.get_size())
640  {
641  offs-=len+1; // seek to beginning
642  if (offs==f.get_size())
643  {
644  SG_DEBUG("at EOF\n")
645  fasta_len+=len;
646  }
647 
648  len=fasta_len-spanned_lines;
649  strings[i].string=SG_MALLOC(ST, len);
650  strings[i].slen=len;
651 
652  ST* str=strings[i].string;
653  int32_t idx=0;
654  SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines)
655 
656  for (int32_t j=0; j<fasta_len; j++)
657  {
658  if (fasta[j]=='\n')
659  continue;
660 
661  ST c=(ST) fasta[j];
662 
663  if (ignore_invalid && !alphabet->is_valid((uint8_t) fasta[j]))
664  c=(ST) 'A';
665 
666  if (uint64_t(idx)>=len)
667  SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str)
668  str[idx++]=c;
669  }
670  max_len=CMath::max(max_len, strings[i].slen);
671 
672 
673  break;
674  }
675 
676  spanned_lines++;
677  fasta_len+=len+1; // including '\n'
678  s=f.get_line(len, offs);
679  }
680  }
681  return set_features(strings, num, max_len);
682 }
683 
684 template<class ST> bool CStringFeatures<ST>::load_fastq_file(const char* fname,
685  bool ignore_invalid, bool bitremap_in_single_string)
686 {
687  remove_all_subsets();
688 
689  CMemoryMappedFile<char> f(fname);
690 
691  int32_t i=0;
692  uint64_t len=0;
693  uint64_t offs=0;
694 
695  int32_t num=f.get_num_lines();
696  int32_t max_len=0;
697 
698  if (num%4)
699  SG_ERROR("Number of lines must be divisible by 4 in fastq files\n")
700  num/=4;
701 
702  cleanup();
703  SG_UNREF(alphabet);
704  alphabet=new CAlphabet(DNA);
705 
706  SGString<ST>* strings;
707 
708  ST* str=NULL;
709  if (bitremap_in_single_string)
710  {
711  strings=SG_MALLOC(SGString<ST>, 1);
712  strings[0].string=SG_MALLOC(ST, num);
713  strings[0].slen=num;
714  f.get_line(len, offs);
715  f.get_line(len, offs);
716  order=len;
717  max_len=num;
718  offs=0;
719  original_num_symbols=alphabet->get_num_symbols();
720  str=SG_MALLOC(ST, len);
721  }
722  else
723  strings=SG_MALLOC(SGString<ST>, num);
724 
725  for (i=0;i<num; i++)
726  {
727  if (!f.get_line(len, offs))
728  SG_ERROR("Error reading 'read' identifier in line %d", 4*i)
729 
730  char* s=f.get_line(len, offs);
731  if (!s || len==0)
732  SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len)
733 
734  if (bitremap_in_single_string)
735  {
736  if (len!=(uint64_t) order)
737  SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len)
738  for (int32_t j=0; j<order; j++)
739  str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
740 
741  strings[0].string[i]=embed_word(str, order);
742  }
743  else
744  {
745  strings[i].string=SG_MALLOC(ST, len);
746  strings[i].slen=len;
747  str=strings[i].string;
748 
749  if (ignore_invalid)
750  {
751  for (uint64_t j=0; j<len; j++)
752  {
753  if (alphabet->is_valid((uint8_t) s[j]))
754  str[j]= (ST) s[j];
755  else
756  str[j]= (ST) 'A';
757  }
758  }
759  else
760  {
761  for (uint64_t j=0; j<len; j++)
762  str[j]= (ST) s[j];
763  }
764  max_len=CMath::max(max_len, (int32_t) len);
765  }
766 
767 
768  if (!f.get_line(len, offs))
769  SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2)
770 
771  if (!f.get_line(len, offs))
772  SG_ERROR("Error reading 'read' quality in line %d", 4*i+3)
773  }
774 
775  if (bitremap_in_single_string)
776  num=1;
777 
778  num_vectors=num;
779  max_string_length=max_len;
780  features=strings;
781 
782  return true;
783 }
784 
785 template<class ST> bool CStringFeatures<ST>::load_from_directory(char* dirname)
786 {
787  remove_all_subsets();
788 
789  struct dirent **namelist;
790  int32_t n;
791 
792  SGIO::set_dirname(dirname);
793 
794  SG_DEBUG("dirname '%s'\n", dirname)
795 
796  n=scandir(dirname, &namelist, &SGIO::filter, alphasort);
797  if (n <= 0)
798  {
799  SG_ERROR("error calling scandir - no files found\n")
800  return false;
801  }
802  else
803  {
804  SGString<ST>* strings=NULL;
805 
806  int32_t num=0;
807  int32_t max_len=-1;
808 
809  //usually n==num_vec, but it might not in race conditions
810  //(file perms modified, file erased)
811  strings=SG_MALLOC(SGString<ST>, n);
812 
813  for (int32_t i=0; i<n; i++)
814  {
815  char* fname=SGIO::concat_filename(namelist[i]->d_name);
816 
817  struct stat s;
818  off_t filesize=0;
819 
820  if (!stat(fname, &s) && s.st_size>0)
821  {
822  filesize=s.st_size/sizeof(ST);
823 
824  FILE* f=fopen(fname, "ro");
825  if (f)
826  {
827  ST* str=SG_MALLOC(ST, filesize);
828  SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize)
829  if (fread(str, sizeof(ST), filesize, f)!=(size_t) filesize)
830  SG_ERROR("failed to read file\n")
831  strings[num].string=str;
832  strings[num].slen=filesize;
833  max_len=CMath::max(max_len, strings[num].slen);
834 
835  num++;
836  fclose(f);
837  }
838  }
839  else
840  SG_ERROR("empty or non readable file \'%s\'\n", fname)
841 
842  SG_FREE(namelist[i]);
843  }
844  SG_FREE(namelist);
845 
846  if (num>0 && strings)
847  {
848  set_features(strings, num, max_len);
849  return true;
850  }
851  }
852  return false;
853 }
854 
856 {
857  set_features(feats.strings, feats.num_strings, feats.max_string_length);
858 }
859 
860 template<class ST> bool CStringFeatures<ST>::set_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
861 {
862  if (m_subset_stack->has_subsets())
863  SG_ERROR("Cannot call set_features() with subset.\n")
864 
865  if (p_features)
866  {
867  CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
868 
869  //compute histogram for char/byte
870  for (int32_t i=0; i<p_num_vectors; i++)
871  alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
872 
873  SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram())
874  SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram())
875 
876  if (alpha->check_alphabet_size() && alpha->check_alphabet())
877  {
878  cleanup();
879  SG_UNREF(alphabet);
880 
881  alphabet=alpha;
882  SG_REF(alphabet);
883 
884  // TODO remove copying
885  features = SG_MALLOC(SGString<ST>,p_num_vectors);
886  memcpy(features,p_features,sizeof(SGString<ST>)*p_num_vectors);
887  num_vectors = p_num_vectors;
888  max_string_length = p_max_string_length;
889 
890  return true;
891  }
892  else
893  SG_UNREF(alpha);
894  }
895 
896  return false;
897 }
898 
900 {
901  ASSERT(sf)
902 
903  if (m_subset_stack->has_subsets())
904  SG_ERROR("Cannot call set_features() with subset.\n")
905 
906  SGString<ST>* new_features=SG_MALLOC(SGString<ST>, sf->get_num_vectors());
907 
908  index_t sf_num_str=sf->get_num_vectors();
909  for (int32_t i=0; i<sf_num_str; i++)
910  {
911  int32_t real_i = sf->m_subset_stack->subset_idx_conversion(i);
912  int32_t length=sf->features[real_i].slen;
913  new_features[i].string=SG_MALLOC(ST, length);
914  memcpy(new_features[i].string, sf->features[real_i].string, length);
915  new_features[i].slen=length;
916  }
917  return append_features(new_features, sf_num_str,
918  sf->max_string_length);
919 }
920 
921 template<class ST> bool CStringFeatures<ST>::append_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
922 {
923  if (m_subset_stack->has_subsets())
924  SG_ERROR("Cannot call set_features() with subset.\n")
925 
926  if (!features)
927  return set_features(p_features, p_num_vectors, p_max_string_length);
928 
929  CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
930 
931  //compute histogram for char/byte
932  for (int32_t i=0; i<p_num_vectors; i++)
933  alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
934 
935  SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram())
936  SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram())
937 
938  if (alpha->check_alphabet_size() && alpha->check_alphabet())
939  {
940  SG_UNREF(alpha);
941  for (int32_t i=0; i<p_num_vectors; i++)
942  alphabet->add_string_to_histogram( p_features[i].string, p_features[i].slen);
943 
944  int32_t old_num_vectors=num_vectors;
945  num_vectors=old_num_vectors+p_num_vectors;
946  SGString<ST>* new_features=SG_MALLOC(SGString<ST>, num_vectors);
947 
948  for (int32_t i=0; i<num_vectors; i++)
949  {
950  if (i<old_num_vectors)
951  {
952  new_features[i].string=features[i].string;
953  new_features[i].slen=features[i].slen;
954  }
955  else
956  {
957  new_features[i].string=p_features[i-old_num_vectors].string;
958  new_features[i].slen=p_features[i-old_num_vectors].slen;
959  }
960  }
961  SG_FREE(features);
962  SG_FREE(p_features); // free now obsolete features
963 
964  this->features=new_features;
965  max_string_length=CMath::max(max_string_length, p_max_string_length);
966 
967  return true;
968  }
969  SG_UNREF(alpha);
970 
971  return false;
972 }
973 
975 {
976  SGStringList<ST> sl(NULL,0,0,false);
977 
978  sl.strings=get_features(sl.num_strings, sl.max_string_length);
979  return sl;
980 }
981 
982 template<class ST> SGString<ST>* CStringFeatures<ST>::get_features(int32_t& num_str, int32_t& max_str_len)
983 {
984  if (m_subset_stack->has_subsets())
985  SG_ERROR("get features() is not possible on subset")
986 
987  num_str=num_vectors;
988  max_str_len=max_string_length;
989  return features;
990 }
991 
992 template<class ST> SGString<ST>* CStringFeatures<ST>::copy_features(int32_t& num_str, int32_t& max_str_len)
993 {
994  ASSERT(num_vectors>0)
995 
996  num_str=get_num_vectors();
997  max_str_len=max_string_length;
998  SGString<ST>* new_feat=SG_MALLOC(SGString<ST>, num_str);
999 
1000  for (int32_t i=0; i<num_str; i++)
1001  {
1002  int32_t len;
1003  bool free_vec;
1004  ST* vec=get_feature_vector(i, len, free_vec);
1005  new_feat[i].string=SG_MALLOC(ST, len);
1006  new_feat[i].slen=len;
1007  memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST));
1008  free_feature_vector(vec, i, free_vec);
1009  }
1010 
1011  return new_feat;
1012 }
1013 
1014 template<class ST> void CStringFeatures<ST>::get_features(SGString<ST>** dst, int32_t* num_str)
1015 {
1016  int32_t num_vec;
1017  int32_t max_str_len;
1018  *dst=copy_features(num_vec, max_str_len);
1019  *num_str=num_vec;
1020 }
1021 
1022 template<class ST> bool CStringFeatures<ST>::load_compressed(char* src, bool decompress)
1023 {
1024  remove_all_subsets();
1025 
1026  FILE* file=NULL;
1027 
1028  if (!(file=fopen(src, "r")))
1029  return false;
1030  cleanup();
1031 
1032  // header shogun v0
1033  char id[4];
1034  if (fread(&id[0], sizeof(char), 1, file)!=1)
1035  SG_ERROR("failed to read header")
1036  ASSERT(id[0]=='S')
1037  if (fread(&id[1], sizeof(char), 1, file)!=1)
1038  SG_ERROR("failed to read header")
1039  ASSERT(id[1]=='G')
1040  if (fread(&id[2], sizeof(char), 1, file)!=1)
1041  SG_ERROR("failed to read header")
1042  ASSERT(id[2]=='V')
1043  if (fread(&id[3], sizeof(char), 1, file)!=1)
1044  SG_ERROR("failed to read header")
1045  ASSERT(id[3]=='0')
1046 
1047  //compression type
1048  uint8_t c;
1049  if (fread(&c, sizeof(uint8_t), 1, file)!=1)
1050  SG_ERROR("failed to read compression type")
1051  CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c);
1052  //alphabet
1053  uint8_t a;
1054  delete alphabet;
1055  if (fread(&a, sizeof(uint8_t), 1, file)!=1)
1056  SG_ERROR("failed to read compression alphabet")
1057  alphabet=new CAlphabet((EAlphabet) a);
1058  // number of vectors
1059  if (fread(&num_vectors, sizeof(int32_t), 1, file)!=1)
1060  SG_ERROR("failed to read compression number of vectors")
1061  ASSERT(num_vectors>0)
1062  // maximum string length
1063  if (fread(&max_string_length, sizeof(int32_t), 1, file)!=1)
1064  SG_ERROR("failed to read maximum string length")
1065  ASSERT(max_string_length>0)
1066 
1067  features=SG_MALLOC(SGString<ST>, num_vectors);
1068 
1069  // vectors
1070  for (int32_t i=0; i<num_vectors; i++)
1071  {
1072  // vector len compressed
1073  int32_t len_compressed;
1074  if (fread(&len_compressed, sizeof(int32_t), 1, file)!=1)
1075  SG_ERROR("failed to read vector length compressed")
1076  // vector len uncompressed
1077  int32_t len_uncompressed;
1078  if (fread(&len_uncompressed, sizeof(int32_t), 1, file)!=1)
1079  SG_ERROR("failed to read vector length uncompressed")
1080 
1081  // vector raw data
1082  if (decompress)
1083  {
1084  features[i].string=SG_MALLOC(ST, len_uncompressed);
1085  features[i].slen=len_uncompressed;
1086  uint8_t* compressed=SG_MALLOC(uint8_t, len_compressed);
1087  if (fread(compressed, sizeof(uint8_t), len_compressed, file)!=(size_t) len_compressed)
1088  SG_ERROR("failed to read compressed data (expected %d bytes)", len_compressed)
1089  uint64_t uncompressed_size=len_uncompressed;
1090  uncompressed_size*=sizeof(ST);
1091  compressor->decompress(compressed, len_compressed,
1092  (uint8_t*) features[i].string, uncompressed_size);
1093  SG_FREE(compressed);
1094  ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST))
1095  }
1096  else
1097  {
1098  int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST));
1099  features[i].string=SG_MALLOC(ST, len_compressed+offs);
1100  features[i].slen=len_compressed+offs;
1101  int32_t* feat32ptr=((int32_t*) (features[i].string));
1102  memset(features[i].string, 0, offs*sizeof(ST));
1103  feat32ptr[0]=(int32_t) len_compressed;
1104  feat32ptr[1]=(int32_t) len_uncompressed;
1105  uint8_t* compressed=(uint8_t*) (&features[i].string[offs]);
1106  if (fread(compressed, 1, len_compressed, file)!=(size_t) len_compressed)
1107  SG_ERROR("failed to read uncompressed data")
1108  }
1109  }
1110 
1111  delete compressor;
1112  fclose(file);
1113 
1114  return false;
1115 }
1116 
1117 template<class ST> bool CStringFeatures<ST>::save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level)
1118 {
1119  if (m_subset_stack->has_subsets())
1120  SG_ERROR("save_compressed() is not possible on subset")
1121 
1122  FILE* file=NULL;
1123 
1124  if (!(file=fopen(dest, "wb")))
1125  return false;
1126 
1127  CCompressor* compressor= new CCompressor(compression);
1128 
1129  // header shogun v0
1130  const char* id="SGV0";
1131  fwrite(&id[0], sizeof(char), 1, file);
1132  fwrite(&id[1], sizeof(char), 1, file);
1133  fwrite(&id[2], sizeof(char), 1, file);
1134  fwrite(&id[3], sizeof(char), 1, file);
1135 
1136  //compression type
1137  uint8_t c=(uint8_t) compression;
1138  fwrite(&c, sizeof(uint8_t), 1, file);
1139  //alphabet
1140  uint8_t a=(uint8_t) alphabet->get_alphabet();
1141  fwrite(&a, sizeof(uint8_t), 1, file);
1142  // number of vectors
1143  fwrite(&num_vectors, sizeof(int32_t), 1, file);
1144  // maximum string length
1145  fwrite(&max_string_length, sizeof(int32_t), 1, file);
1146 
1147  // vectors
1148  for (int32_t i=0; i<num_vectors; i++)
1149  {
1150  int32_t len=-1;
1151  bool vfree;
1152  ST* vec=get_feature_vector(i, len, vfree);
1153 
1154  uint8_t* compressed=NULL;
1155  uint64_t compressed_size=0;
1156 
1157  compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST),
1158  compressed, compressed_size, level);
1159 
1160  int32_t len_compressed=(int32_t) compressed_size;
1161  // vector len compressed in bytes
1162  fwrite(&len_compressed, sizeof(int32_t), 1, file);
1163  // vector len uncompressed in number of elements of type ST
1164  fwrite(&len, sizeof(int32_t), 1, file);
1165  // vector raw data
1166  fwrite(compressed, compressed_size, 1, file);
1167  SG_FREE(compressed);
1168 
1169  free_feature_vector(vec, i, vfree);
1170  }
1171 
1172  delete compressor;
1173  fclose(file);
1174  return true;
1175 }
1176 
1177 template<class ST> bool CStringFeatures<ST>::apply_preprocessor(bool force_preprocessing)
1178 {
1179  SG_DEBUG("force: %d\n", force_preprocessing)
1180 
1181  for (int32_t i=0; i<get_num_preprocessors(); i++)
1182  {
1183  if ( (!is_preprocessed(i) || force_preprocessing) )
1184  {
1185  set_preprocessed(i);
1186  CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
1187  SG_INFO("preprocessing using preproc %s\n", p->get_name())
1188 
1189  if (!p->apply_to_string_features(this))
1190  {
1191  SG_UNREF(p);
1192  return false;
1193  }
1194  else
1195  SG_UNREF(p);
1196  }
1197  }
1198  return true;
1199 }
1200 
1201 template<class ST> int32_t CStringFeatures<ST>::obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip)
1202 {
1203  if (m_subset_stack->has_subsets())
1205 
1206  ASSERT(step_size>0)
1207  ASSERT(window_size>0)
1208  ASSERT(num_vectors==1 || single_string)
1209  ASSERT(max_string_length>=window_size ||
1210  (single_string && length_of_single_string>=window_size));
1211 
1212  //in case we are dealing with a single remapped string
1213  //allow remapping
1214  if (single_string)
1215  num_vectors= (length_of_single_string-window_size)/step_size + 1;
1216  else if (num_vectors==1)
1217  {
1218  num_vectors= (max_string_length-window_size)/step_size + 1;
1219  length_of_single_string=max_string_length;
1220  }
1221 
1222  SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
1223  int32_t offs=0;
1224  for (int32_t i=0; i<num_vectors; i++)
1225  {
1226  f[i].string=&features[0].string[offs+skip];
1227  f[i].slen=window_size-skip;
1228  offs+=step_size;
1229  }
1230  single_string=features[0].string;
1231  SG_FREE(features);
1232  features=f;
1233  max_string_length=window_size-skip;
1234 
1235  return num_vectors;
1236 }
1237 
1238 template<class ST> int32_t CStringFeatures<ST>::obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
1239  int32_t skip)
1240 {
1241  if (m_subset_stack->has_subsets())
1243 
1244  ASSERT(positions)
1245  ASSERT(window_size>0)
1246  ASSERT(num_vectors==1 || single_string)
1247  ASSERT(max_string_length>=window_size ||
1248  (single_string && length_of_single_string>=window_size));
1249 
1250  num_vectors= positions->get_num_elements();
1251  ASSERT(num_vectors>0)
1252 
1253  int32_t len;
1254 
1255  //in case we are dealing with a single remapped string
1256  //allow remapping
1257  if (single_string)
1258  len=length_of_single_string;
1259  else
1260  {
1261  single_string=features[0].string;
1262  len=max_string_length;
1263  length_of_single_string=max_string_length;
1264  }
1265 
1266  SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
1267  for (int32_t i=0; i<num_vectors; i++)
1268  {
1269  int32_t p=positions->get_element(i);
1270 
1271  if (p>=0 && p<=len-window_size)
1272  {
1273  f[i].string=&features[0].string[p+skip];
1274  f[i].slen=window_size-skip;
1275  }
1276  else
1277  {
1278  num_vectors=1;
1279  max_string_length=len;
1280  features[0].slen=len;
1281  single_string=NULL;
1282  SG_FREE(f);
1283  SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
1284  window_size, i, p, len);
1285  return -1;
1286  }
1287  }
1288 
1289  SG_FREE(features);
1290  features=f;
1291  max_string_length=window_size-skip;
1292 
1293  return num_vectors;
1294 }
1295 
1296 template<class ST> bool CStringFeatures<ST>::obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1297 {
1298  return obtain_from_char_features(sf, start, p_order, gap, rev);
1299 }
1300 
1301 template<class ST> bool CStringFeatures<ST>::have_same_length(int32_t len)
1302 {
1303  if (len!=-1)
1304  {
1305  if (len!=max_string_length)
1306  return false;
1307  }
1308  len=max_string_length;
1309 
1310  index_t num_str=get_num_vectors();
1311  for (int32_t i=0; i<num_str; i++)
1312  {
1313  if (get_vector_length(i)!=len)
1314  return false;
1315  }
1316 
1317  return true;
1318 }
1319 
1320 template<class ST> void CStringFeatures<ST>::embed_features(int32_t p_order)
1321 {
1322  if (m_subset_stack->has_subsets())
1324 
1325  ASSERT(alphabet->get_num_symbols_in_histogram() > 0)
1326 
1327  order=p_order;
1328  original_num_symbols=alphabet->get_num_symbols();
1329  int32_t max_val=alphabet->get_num_bits();
1330 
1331  if (p_order>1)
1332  num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
1333  else
1334  num_symbols=original_num_symbols;
1335 
1336  SG_INFO("max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols)
1337 
1338  if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
1339  SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val)
1340 
1341  ST mask=0;
1342  for (int32_t i=0; i<p_order*max_val; i++)
1343  mask= (mask<<1) | ((ST) 1);
1344 
1345  for (int32_t i=0; i<num_vectors; i++)
1346  {
1347  int32_t len=features[i].slen;
1348 
1349  if (len < p_order)
1350  SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order)
1351 
1352  ST* str=features[i].string;
1353 
1354  // convert first word
1355  for (int32_t j=0; j<p_order; j++)
1356  str[j]=(ST) alphabet->remap_to_bin(str[j]);
1357  str[0]=embed_word(&str[0], p_order);
1358 
1359  // convert the rest
1360  int32_t idx=0;
1361  for (int32_t j=p_order; j<len; j++)
1362  {
1363  str[j]=(ST) alphabet->remap_to_bin(str[j]);
1364  str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
1365  idx++;
1366  }
1367 
1368  features[i].slen=len-p_order+1;
1369  }
1370 
1371  compute_symbol_mask_table(max_val);
1372 }
1373 
1374 template<class ST> void CStringFeatures<ST>::compute_symbol_mask_table(int64_t max_val)
1375 {
1376  if (m_subset_stack->has_subsets())
1378 
1379  SG_FREE(symbol_mask_table);
1380  symbol_mask_table=SG_MALLOC(ST, 256);
1381  symbol_mask_table_len=256;
1382 
1383  uint64_t mask=0;
1384  for (int32_t i=0; i< (int64_t) max_val; i++)
1385  mask=(mask<<1) | 1;
1386 
1387  for (int32_t i=0; i<256; i++)
1388  {
1389  uint8_t bits=(uint8_t) i;
1390  symbol_mask_table[i]=0;
1391 
1392  for (int32_t j=0; j<8; j++)
1393  {
1394  if (bits & 1)
1395  symbol_mask_table[i]|=mask<<(max_val*j);
1396 
1397  bits>>=1;
1398  }
1399  }
1400 }
1401 
1402 template<class ST> void CStringFeatures<ST>::unembed_word(ST word, uint8_t* seq, int32_t len)
1403 {
1404  uint32_t nbits= (uint32_t) alphabet->get_num_bits();
1405 
1406  ST mask=0;
1407  for (uint32_t i=0; i<nbits; i++)
1408  mask=(mask<<1) | (ST) 1;
1409 
1410  for (int32_t i=0; i<len; i++)
1411  {
1412  ST w=(word & mask);
1413  seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
1414  word>>=nbits;
1415  }
1416 }
1417 
1418 template<class ST> ST CStringFeatures<ST>::embed_word(ST* seq, int32_t len)
1419 {
1420  ST value=(ST) 0;
1421  uint32_t nbits= (uint32_t) alphabet->get_num_bits();
1422  for (int32_t i=0; i<len; i++)
1423  {
1424  value<<=nbits;
1425  value|=seq[i];
1426  }
1427 
1428  return value;
1429 }
1430 
// NOTE(review): the signature line of this definition is absent from this
// listing; presumably this is determine_maximum_string_length() - confirm
// against the original file.
// Resets max_string_length to 0 and raises it to the largest slen among the
// get_num_vectors() vectors, each looked up through the subset index
// conversion.
1432 {
1433  max_string_length=0;
1434  index_t num_str=get_num_vectors();
1435 
1436  for (int32_t i=0; i<num_str; i++)
1437  {
1438  max_string_length=CMath::max(max_string_length,
1439  features[m_subset_stack->subset_idx_conversion(i)].slen);
1440  }
1441 }
1442 
// NOTE(review): the signature line of this definition is absent from this
// listing; the body reads a string object named str (fields slen, string).
// Returns a freshly SG_MALLOC'ed buffer of slen+1 elements holding a copy of
// the string followed by a '\0' element. Presumably the caller must release
// it - confirm.
1444 {
1445  int32_t l=str.slen;
1446  ST* s=SG_MALLOC(ST, l+1);
1447  memcpy(s, str.string, sizeof(ST)*l);
1448  s[l]='\0';
1449  return s;
1450 }
1451 
1452 template<class ST> void CStringFeatures<ST>::set_feature_vector(int32_t num, ST* string, int32_t len)
1453 {
1454  ASSERT(features)
1455  ASSERT(num<get_num_vectors())
1456 
1457  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
1458 
1459 
1460  features[real_num].slen=len ;
1461  features[real_num].string=string ;
1462 
1463  max_string_length=CMath::max(len, max_string_length);
1464 }
1465 
1466 template<class ST> void CStringFeatures<ST>::get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize)
1467 {
1468  int32_t nsym=get_num_symbols();
1469  int32_t slen=get_max_vector_length();
1470  int64_t sz=int64_t(nsym)*slen*sizeof(float64_t);
1471  float64_t* h= SG_MALLOC(float64_t, sz);
1472  memset(h, 0, sz);
1473 
1474  float64_t* h_normalizer=SG_MALLOC(float64_t, slen);
1475  memset(h_normalizer, 0, slen*sizeof(float64_t));
1476  int32_t num_str=get_num_vectors();
1477  for (int32_t i=0; i<num_str; i++)
1478  {
1479  int32_t len;
1480  bool free_vec;
1481  ST* vec=get_feature_vector(i, len, free_vec);
1482  for (int32_t j=0; j<len; j++)
1483  {
1484  h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++;
1485  h_normalizer[j]++;
1486  }
1487  free_feature_vector(vec, i, free_vec);
1488  }
1489 
1490  if (normalize)
1491  {
1492  for (int32_t i=0; i<slen; i++)
1493  {
1494  for (int32_t j=0; j<nsym; j++)
1495  {
1496  if (h_normalizer && h_normalizer[i])
1497  h[int64_t(i)*nsym+j]/=h_normalizer[i];
1498  }
1499  }
1500  }
1501  SG_FREE(h_normalizer);
1502 
1503  *hist=h;
1504  *rows=nsym;
1505  *cols=slen;
1506 }
1507 
1508 template<class ST> void CStringFeatures<ST>::create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec)
1509 {
1510  ASSERT(rows == get_num_symbols())
1511  cleanup();
1512  float64_t* randoms=SG_MALLOC(float64_t, cols);
1513  SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
1514 
1515  for (int32_t i=0; i<num_vec; i++)
1516  {
1517  sf[i].string=SG_MALLOC(ST, cols);
1518  sf[i].slen=cols;
1519 
1520  SGVector<float64_t>::random_vector(randoms, cols, 0.0, 1.0);
1521 
1522  for (int32_t j=0; j<cols; j++)
1523  {
1524  float64_t lik=hist[int64_t(j)*rows+0];
1525 
1526  int32_t c;
1527  for (c=0; c<rows-1; c++)
1528  {
1529  if (randoms[j]<=lik)
1530  break;
1531  lik+=hist[int64_t(j)*rows+c+1];
1532  }
1533  sf[i].string[j]=alphabet->remap_to_char(c);
1534  }
1535  }
1536  SG_FREE(randoms);
1537  set_features(sf, num_vec, cols);
1538 }
1539 
1540 /*
1541 CStringFeatures<SSKTripleFeature>* obtain_sssk_triple_from_cha(int d1, int d2)
1542 {
1543  int *s;
1544  int32_t nStr=get_num_vectors();
1545 
1546  int32_t nfeat=0;
1547  for (int32_t i=0; i < nStr; ++i)
1548  nfeat += get_vector_length[i] - d1 -d2;
1549  SGString<SSKFeature>* F= SG_MALLOC(SGString<SSKFeature>, nfeat);
1550  int32_t c=0;
1551  for (int32_t i=0; i < nStr; ++i)
1552  {
1553  int32_t len;
1554  bool free_vec;
1555  ST* S=get_feature_vector(vec_num, len, free_vec);
1556  free_feature_vector(vec, vec_num, free_vec);
1557  int32_t n=len - d1 - d2;
1558  s=S[i];
1559  for (int32_t j=0; j < n; ++j)
1560  {
1561  F[c].feature1=s[j];
1562  F[c].feature2=s[j+d1];
1563  F[c].feature3=s[j+d1+d2];
1564  F[c].group=i;
1565  c++;
1566  }
1567  }
1568  ASSERT(nfeat==c)
1569  return F;
1570 }
1571 
1572 CStringFeatures<SSKFeature>* obtain_sssk_double_from_char(int **S, int *len, int nStr, int d1)
1573 {
1574  int i, j;
1575  int n, nfeat;
1576  int *group;
1577  int *features;
1578  int *s;
1579  int c;
1580  SSKFeatures *F;
1581 
1582  nfeat=0;
1583  for (i=0; i < nStr; ++i)
1584  nfeat += len[i] - d1;
1585  group=(int *)SG_MALLOC(nfeat*sizeof(int));
1586  features=(int *)SG_MALLOC(nfeat*2*sizeof(int *));
1587  c=0;
1588  for (i=0; i < nStr; ++i)
1589  {
1590  n=len[i] - d1;
1591  s=S[i];
1592  for (j=0; j < n; ++j)
1593  {
1594  features[c]=s[j];
1595  features[c+nfeat]=s[j+d1];
1596  group[c]=i;
1597  c++;
1598  }
1599  }
1600  if (nfeat!=c)
1601  printf("Something is wrong...\n");
1602  F=(SSKFeatures *)SG_MALLOC(sizeof(SSKFeatures));
1603  (*F).features=features;
1604  (*F).group=group;
1605  (*F).n=nfeat;
1606  return F;
1607 }
1608 */
1609 
1611  SGVector<index_t> indices)
1612 {
// NOTE(review): the first line of this signature (return type and function
// name) is absent from this listing - confirm against the original file.
// Builds a new CStringFeatures holding deep copies of the strings selected by
// indices, resolved through the subset index conversion, and returns it after
// SG_REF.
1613  /* string list to create new CStringFeatures from */
1614  SGStringList<ST> list_copy(indices.vlen, max_string_length);
1615 
1616  /* copy all features */
1617  for (index_t i=0; i<indices.vlen; ++i)
1618  {
1619  /* index with respect to possible subset */
1620  index_t real_idx=m_subset_stack->subset_idx_conversion(indices.vector[i]);
1621 
1622  /* copy string */
1623  SGString<ST> current_string=features[real_idx];
1624  SGString<ST> string_copy(current_string.slen);
1625  memcpy(string_copy.string, current_string.string,
1626  current_string.slen*sizeof(ST));
1627  list_copy.strings[i]=string_copy;
1628  }
1629 
1630  /* create copy instance */
1631  CStringFeatures* result=new CStringFeatures(list_copy, alphabet);
1632 
1633  /* max string length may have changed */
// NOTE(review): the statement belonging to the comment above is absent from
// this listing; presumably it recomputes the maximum length on result.
1635 
1636  /* keep things from original features (otherwise assertions in x-val) */
1637  result->order=order;
// NOTE(review): one more statement is absent from this listing here.
1639 
// the returned object carries one reference for the caller
1640  SG_REF(result);
1641 
1642  return result;
1643 }
1644 
// NOTE(review): the signature line of this definition is absent from this
// listing; judging by the body this looks like a hook run after the active
// subset changed - confirm.
1646 {
1647  /* max string length has to be updated */
1648  determine_maximum_string_length();
1649 }
1650 
1651 template<class ST> ST* CStringFeatures<ST>::compute_feature_vector(int32_t num, int32_t& len)
1652 {
1653  ASSERT(features && num<get_num_vectors())
1654 
1655  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
1656 
1657  len=features[real_num].slen;
1658  if (len<=0)
1659  return NULL;
1660 
1661  ST* target=SG_MALLOC(ST, len);
1662  memcpy(target, features[real_num].string, len*sizeof(ST));
1663  return target;
1664 }
1665 
// Shared constructor helper: puts every member into its empty default state
// and registers the members with m_parameters.
1666 template<class ST> void CStringFeatures<ST>::init()
1667 {
1668  set_generic<ST>();
1669 
// defaults: no alphabet, no data, no cache, no mask table
1670  alphabet=NULL;
1671  num_vectors=0;
1672  features=NULL;
1673  single_string=NULL;
1674  length_of_single_string=0;
1675  max_string_length=0;
1676  order=0;
1677  preprocess_on_get=false;
1678  feature_cache=NULL;
1679  symbol_mask_table=NULL;
1680  symbol_mask_table_len=0;
1681  num_symbols=0.0;
1682  original_num_symbols=0;
1683 
// parameter registration; NOTE(review): the order of these calls may matter
// to whatever consumes m_parameters, so it is left untouched - confirm.
1684  m_parameters->add((CSGObject**) &alphabet, "alphabet");
1685  m_parameters->add_vector(&features, &num_vectors, "features",
1686  "This contains the array of features.");
1687  m_parameters->add_vector(&single_string,
1688  &length_of_single_string,
1689  "single_string",
1690  "Created by sliding window.");
1691  m_parameters->add(&max_string_length, "max_string_length",
1692  "Length of longest string.");
1693  m_parameters->add(&num_symbols, "num_symbols",
1694  "Number of used symbols.");
1695  m_parameters->add(&original_num_symbols, "original_num_symbols",
1696  "Original number of used symbols.");
1697  m_parameters->add(&order, "order",
1698  "Order used in higher order mapping.");
1699  m_parameters->add(&preprocess_on_get, "preprocess_on_get",
1700  "Preprocess on-the-fly?");
1701 
1702  m_parameters->add_vector(&symbol_mask_table, &symbol_mask_table_len, "mask_table", "Symbol mask table - using in higher order mapping");
1703 }
1704 
// NOTE(review): the signature lines of the twelve definitions below are
// absent from this listing. Each body returns one feature type constant
// (F_BOOL ... F_LONGREAL); presumably these are the per-element-type
// specializations of a feature type getter - confirm against the original.
1710 {
1711  return F_BOOL;
1712 }
1713 
1719 {
1720  return F_CHAR;
1721 }
1722 
1728 {
1729  return F_BYTE;
1730 }
1731 
1737 {
1738  return F_SHORT;
1739 }
1740 
1746 {
1747  return F_WORD;
1748 }
1749 
1755 {
1756  return F_INT;
1757 }
1758 
1764 {
1765  return F_UINT;
1766 }
1767 
1773 {
1774  return F_LONG;
1775 }
1776 
1782 {
1783  return F_ULONG;
1784 }
1785 
1791 {
1792  return F_SHORTREAL;
1793 }
1794 
1800 {
1801  return F_DREAL;
1802 }
1803 
1809 {
1810  return F_LONGREAL;
1811 }
1812 
// get_masked_symbols() for bool: the mask is ignored and the symbol is
// returned unchanged.
1813 template<> bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
1814 {
1815  return symbol;
1816 }
// NOTE(review): the signature lines of the next three definitions are absent
// from this listing; each returns its symbol unchanged. Presumably these are
// the floating point specializations - confirm.
1818 {
1819  return symbol;
1820 }
1822 {
1823  return symbol;
1824 }
1826 {
1827  return symbol;
1828 }
1829 
// shift_offset() for bool: both arguments are ignored, the result is always
// false.
1830 template<> bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
1831 {
1832  return false;
1833 }
// NOTE(review): the signature lines of the next three definitions are absent
// from this listing; each returns 0. Presumably these are the floating point
// specializations - confirm.
1835 {
1836  return 0;
1837 }
1839 {
1840  return 0;
1841 }
1843 {
1844  return 0;
1845 }
1846 
// shift_symbol() for bool: the amount is ignored and the symbol is returned
// unchanged.
1847 template<> bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
1848 {
1849  return symbol;
1850 }
// NOTE(review): the signature lines of the next three definitions are absent
// from this listing; each returns its symbol unchanged. Presumably these are
// the floating point specializations - confirm.
1852 {
1853  return symbol;
1854 }
1856 {
1857  return symbol;
1858 }
1860 {
1861  return symbol;
1862 }
1863 
// obtain_from_char_features() for floating point element types: nothing is
// converted and false is returned. Compiled only when SUNOS is not defined.
1864 #ifndef SUNOS
1865 template<> template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1866 {
1867  return false;
1868 }
1869 template<> template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1870 {
1871  return false;
1872 }
1873 template<> template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1874 {
1875  return false;
1876 }
1877 #endif
1878 
// embed_features() for floating point element types: intentionally empty,
// so the generic bit-packing implementation is not used for these types.
1879 template<> void CStringFeatures<float32_t>::embed_features(int32_t p_order)
1880 {
1881 }
1882 template<> void CStringFeatures<float64_t>::embed_features(int32_t p_order)
1883 {
1884 }
1885 template<> void CStringFeatures<floatmax_t>::embed_features(int32_t p_order)
1886 {
1887 }
1888 
// NOTE(review): the signature lines of these three empty definitions are
// absent from this listing; presumably they are floating point
// specializations of one of the bit-manipulating members (possibly
// compute_symbol_mask_table) - confirm against the original file.
1890 {
1891 }
1893 {
1894 }
1896 {
1897 }
1898 
// NOTE(review): the signature lines of these three definitions are absent
// from this listing; each returns 0. Presumably they are floating point
// specializations of a value-returning member (possibly embed_word) -
// confirm against the original file.
1900 {
1901  return 0;
1902 }
1904 {
1905  return 0;
1906 }
1908 {
1909  return 0;
1910 }
1911 
// unembed_word() for floating point element types: intentionally empty;
// seq is left untouched.
1912 template<> void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len)
1913 {
1914 }
1915 template<> void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len)
1916 {
1917 }
1918 template<> void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len)
1919 {
1920 }
// LOAD(f_load, sg_type) defines CStringFeatures<sg_type>::load(CFile*):
// it calls loader->f_load() to obtain a string list, its size and its
// maximum length, and passes all three to set_features(). The call is
// bracketed by SG_SET_LOCALE_C / SG_RESET_LOCALE. The macro is instantiated
// for every supported element type and undefined again afterwards.
1921 #define LOAD(f_load, sg_type) \
1922 template<> void CStringFeatures<sg_type>::load(CFile* loader) \
1923 { \
1924  SG_INFO("loading...\n") \
1925  \
1926  SG_SET_LOCALE_C; \
1927  SGString<sg_type>* strs; \
1928  int32_t num_str; \
1929  int32_t max_len; \
1930  loader->f_load(strs, num_str, max_len); \
1931  set_features(strs, num_str, max_len); \
1932  SG_RESET_LOCALE; \
1933 }
1934 
1935 LOAD(get_string_list, bool)
1936 LOAD(get_string_list, char)
1937 LOAD(get_string_list, int8_t)
1938 LOAD(get_string_list, uint8_t)
1939 LOAD(get_string_list, int16_t)
1940 LOAD(get_string_list, uint16_t)
1941 LOAD(get_string_list, int32_t)
1942 LOAD(get_string_list, uint32_t)
1943 LOAD(get_string_list, int64_t)
1944 LOAD(get_string_list, uint64_t)
1945 LOAD(get_string_list, float32_t)
1946 LOAD(get_string_list, float64_t)
1947 LOAD(get_string_list, floatmax_t)
1948 #undef LOAD
1949 
// SAVE(f_write, sg_type) defines CStringFeatures<sg_type>::save(CFile*):
// it reports an error when a subset is active, otherwise hands the raw
// features array and num_vectors to writer->f_write(). Note that the writer
// is asserted only after the locale has been switched. The macro is
// instantiated for every supported element type and undefined again
// afterwards.
1950 #define SAVE(f_write, sg_type) \
1951 template<> void CStringFeatures<sg_type>::save(CFile* writer) \
1952 { \
1953  if (m_subset_stack->has_subsets()) \
1954  SG_ERROR("save() is not possible on subset") \
1955  SG_SET_LOCALE_C; \
1956  ASSERT(writer) \
1957  writer->f_write(features, num_vectors); \
1958  SG_RESET_LOCALE; \
1959 }
1960 
1961 SAVE(set_string_list, bool)
1962 SAVE(set_string_list, char)
1963 SAVE(set_string_list, int8_t)
1964 SAVE(set_string_list, uint8_t)
1965 SAVE(set_string_list, int16_t)
1966 SAVE(set_string_list, uint16_t)
1967 SAVE(set_string_list, int32_t)
1968 SAVE(set_string_list, uint32_t)
1969 SAVE(set_string_list, int64_t)
1970 SAVE(set_string_list, uint64_t)
1971 SAVE(set_string_list, float32_t)
1972 SAVE(set_string_list, float64_t)
1973 SAVE(set_string_list, floatmax_t)
1974 #undef SAVE
1975 
1976 template <class ST> template <class CT>
1978  int32_t p_order, int32_t gap, bool rev)
1979 {
1980  remove_all_subsets();
1981  ASSERT(sf)
1982 
1983  CAlphabet* alpha=sf->get_alphabet();
1984  ASSERT(alpha->get_num_symbols_in_histogram() > 0)
1985 
1986  this->order=p_order;
1987  cleanup();
1988 
1989  num_vectors=sf->get_num_vectors();
1990  ASSERT(num_vectors>0)
1991  max_string_length=sf->get_max_vector_length()-start;
1992  features=SG_MALLOC(SGString<ST>, num_vectors);
1993 
1994  SG_DEBUG("%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(),
1995  alpha->get_num_symbols_in_histogram());
1996 
1997  for (int32_t i=0; i<num_vectors; i++)
1998  {
1999  int32_t len=-1;
2000  bool vfree;
2001  CT* c=sf->get_feature_vector(i, len, vfree);
2002  ASSERT(!vfree) // won't work when preprocessors are attached
2003 
2004  features[i].string=SG_MALLOC(ST, len);
2005  features[i].slen=len;
2006 
2007  ST* str=features[i].string;
2008  for (int32_t j=0; j<len; j++)
2009  str[j]=(ST) alpha->remap_to_bin(c[j]);
2010  }
2011 
2012  original_num_symbols=alpha->get_num_symbols();
2013  int32_t max_val=alpha->get_num_bits();
2014 
2015  SG_UNREF(alpha);
2016 
2017  if (p_order>1)
2018  num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
2019  else
2020  num_symbols=original_num_symbols;
2021  SG_INFO("max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols)
2022 
2023  if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
2024  {
2025  SG_ERROR("symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val)
2026  return false;
2027  }
2028 
2029  SG_DEBUG("translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST))
2030  for (int32_t line=0; line<num_vectors; line++)
2031  {
2032  int32_t len=0;
2033  bool vfree;
2034  ST* fv=get_feature_vector(line, len, vfree);
2035  ASSERT(!vfree) // won't work when preprocessors are attached
2036 
2037  if (rev)
2038  CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
2039  else
2040  CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
2041 
2042  /* fix the length of the string -- hacky */
2043  features[line].slen-=start+gap ;
2044  if (features[line].slen<0)
2045  features[line].slen=0 ;
2046  }
2047 
2048  compute_symbol_mask_table(max_val);
2049 
2050  return true;
2051 }
2052 
// Explicit instantiations of the class template for every supported element
// type.
2053 template class CStringFeatures<bool>;
2054 template class CStringFeatures<char>;
2055 template class CStringFeatures<int8_t>;
2056 template class CStringFeatures<uint8_t>;
2057 template class CStringFeatures<int16_t>;
2058 template class CStringFeatures<uint16_t>;
2059 template class CStringFeatures<int32_t>;
2060 template class CStringFeatures<uint32_t>;
2061 template class CStringFeatures<int64_t>;
2062 template class CStringFeatures<uint64_t>;
2063 template class CStringFeatures<float32_t>;
2064 template class CStringFeatures<float64_t>;
2065 template class CStringFeatures<floatmax_t>;
2066 
// Explicit instantiations of the member template obtain_from_char_features():
// uint8_t sources into 16/32/64 bit unsigned targets.
2067 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2068 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2069 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2070 
// ... and uint16_t sources into the same three targets.
2071 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2072 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2073 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2074 }

SHOGUN Machine Learning Toolbox - Documentation