SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringFeatures.cpp
Go to the documentation of this file.
5 #include <shogun/io/SGIO.h>
9 
10 #include <sys/types.h>
11 #include <sys/stat.h>
12 #include <dirent.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <unistd.h>
16 
17 
18 namespace shogun
19 {
20 
// Constructor body: calls init() and installs a default-constructed CAlphabet.
// NOTE(review): the signature line belonging to this body is not present in
// this listing; presumably this is the default constructor - confirm against
// the header.
22 {
23  init();
24  alphabet=new CAlphabet();
25 }
26 
// Constructor body: calls init() and creates a new CAlphabet from `alpha`.
// NOTE(review): the signature line and listing lines 32-34 are missing from
// this view, so any statements that follow the alphabet creation are not
// visible here.
28 {
29  init();
30 
31  alphabet=new CAlphabet(alpha);
35 }
36 
// Construct string features from a string list and an alphabet type.
// A new CAlphabet is created from `alpha`, then the strings of `string_list`
// are handed to set_features().
// NOTE(review): listing lines 43-45 are missing from this view.
37 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha)
38 : CFeatures(0)
39 {
40  init();
41 
42  alphabet=new CAlphabet(alpha);
46  set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
47 }
48 
// Construct string features from a string list and an existing alphabet.
// A new CAlphabet object is constructed from `alpha` (the passed object itself
// is not stored), then the strings of `string_list` are handed to
// set_features().
// NOTE(review): listing lines 55-57 are missing from this view.
49 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha)
50 : CFeatures(0)
51 {
52  init();
53 
54  alphabet=new CAlphabet(alpha);
58  set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
59 }
60 
// Constructor taking an existing alphabet object `alpha`, which must be
// non-NULL. The object takes a reference on `alpha` (SG_REF) and stores the
// pointer itself rather than a copy.
// NOTE(review): the signature line and listing lines 69-70 are missing from
// this view.
62 : CFeatures(0)
63 {
64  init();
65 
66  ASSERT(alpha)
67  SG_REF(alpha);
68  alphabet=alpha;
71 }
72 
// Copy constructor.
// Scalar members are copied through the initializer list; on-the-fly
// preprocessing is switched off and no feature cache is attached to the copy.
// Copying an object that uses a single backing string is not implemented and
// trips the ASSERT below.
// NOTE(review): init() runs after the initializer list; its effect on the
// members initialised above is not visible in this listing. Listing lines 88,
// 105, 108 and 111-112 are also missing, including the body of the 256-entry
// loop near the end.
73 template<class ST> CStringFeatures<ST>::CStringFeatures(const CStringFeatures & orig)
74 : CFeatures(orig), num_vectors(orig.num_vectors),
75  single_string(orig.single_string),
76  length_of_single_string(orig.length_of_single_string),
77  max_string_length(orig.max_string_length),
78  num_symbols(orig.num_symbols),
79  original_num_symbols(orig.original_num_symbols),
80  order(orig.order), preprocess_on_get(false),
81  feature_cache(NULL)
82 {
83  init();
84 
85  ASSERT(orig.single_string == NULL) //not implemented
86 
// the alphabet pointer is shared with the original, not duplicated
87  alphabet=orig.alphabet;
89 
90  if (orig.features)
91  {
// deep copy: every string gets its own freshly allocated buffer
92  features=SG_MALLOC(SGString<ST>, orig.num_vectors);
93 
94  for (int32_t i=0; i<num_vectors; i++)
95  {
96  features[i].string=SG_MALLOC(ST, orig.features[i].slen);
97  features[i].slen=orig.features[i].slen;
98  memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].slen);
99  }
100  }
101 
102  if (orig.symbol_mask_table)
103  {
104  symbol_mask_table=SG_MALLOC(ST, 256);
106 
107  for (int32_t i=0; i<256; i++)
109  }
110 
113 }
114 
// Construct string features by loading them from `loader`.
// All members start out empty, a new alphabet of type `alpha` is created and
// referenced, and the data is then read via load(loader).
// NOTE(review): listing lines 125-126 are missing from this view.
115 template<class ST> CStringFeatures<ST>::CStringFeatures(CFile* loader, EAlphabet alpha)
116 : CFeatures(), num_vectors(0),
117  features(NULL), single_string(NULL), length_of_single_string(0),
118  max_string_length(0), order(0),
119  preprocess_on_get(false), feature_cache(NULL)
120 {
121  init();
122 
123  alphabet=new CAlphabet(alpha);
124  SG_REF(alphabet);
127  load(loader);
128 }
129 
// Body that releases all string data via cleanup() and then drops this
// object's reference on the alphabet.
// NOTE(review): the signature line is missing from this listing; presumably
// this is the destructor - confirm.
131 {
132  cleanup();
133 
134  SG_UNREF(alphabet);
135 }
136 
137 template<class ST> void CStringFeatures<ST>::cleanup()
138 {
139  remove_all_subsets();
140 
141  if (single_string)
142  {
143  SG_FREE(single_string);
144  single_string=NULL;
145  }
146  else
147  cleanup_feature_vectors(0, num_vectors-1);
148 
149  /*
150  if (single_string)
151  {
152  SG_FREE(single_string);
153  single_string=NULL;
154  }
155  else
156  cleanup_feature_vectors(0, num_vectors-1);
157  */
158 
159  num_vectors=0;
160  SG_FREE(features);
161  SG_FREE(symbol_mask_table);
162  features=NULL;
163  symbol_mask_table=NULL;
164 
165  /* start with a fresh alphabet, but instead of emptying the histogram
166  * create a new object (to leave the alphabet object alone if it is used
167  * by others)
168  */
169  CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
170  SG_UNREF(alphabet);
171  alphabet=alpha;
172  SG_REF(alphabet);
173 }
174 
175 template<class ST> void CStringFeatures<ST>::cleanup_feature_vector(int32_t num)
176 {
177  ASSERT(num<get_num_vectors())
178 
179  if (features)
180  {
181  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
182  SG_FREE(features[real_num].string);
183  features[real_num].string=NULL;
184  features[real_num].slen=0;
185 
186  determine_maximum_string_length();
187  }
188 }
189 
190 template<class ST> void CStringFeatures<ST>::cleanup_feature_vectors(int32_t start, int32_t stop)
191 {
192  if (features && get_num_vectors())
193  {
194  ASSERT(start<get_num_vectors())
195  ASSERT(stop<get_num_vectors())
196 
197  for (int32_t i=start; i<=stop; i++)
198  {
199  int32_t real_num=m_subset_stack->subset_idx_conversion(i);
200  SG_FREE(features[real_num].string);
201  features[real_num].string=NULL;
202  features[real_num].slen=0;
203  }
204  determine_maximum_string_length();
205  }
206 }
207 
208 template<class ST> EFeatureClass CStringFeatures<ST>::get_feature_class() const { return C_STRING; }
209 
210 template<class ST> EFeatureType CStringFeatures<ST>::get_feature_type() const { return F_UNKNOWN; }
211 
// Body that takes an additional reference on the alphabet (SG_REF) and
// returns the pointer.
// NOTE(review): the signature line is missing from this listing; presumably
// this is get_alphabet() - confirm.
213 {
214  SG_REF(alphabet);
215  return alphabet;
216 }
217 
218 template<class ST> CFeatures* CStringFeatures<ST>::duplicate() const
219 {
220  return new CStringFeatures<ST>(*this);
221 }
222 
// Body that returns vector `num` as an SGVector holding a freshly allocated
// copy of the string data. An out-of-range index is reported via SG_ERROR.
// The source vector is obtained with get_feature_vector(num, l, free_vec) and
// handed back with free_feature_vector() once it has been copied.
// NOTE(review): the signature line is missing from this listing.
224 {
225  ASSERT(features)
226  if (num>=get_num_vectors())
227  {
228  SG_ERROR("Index out of bounds (number of strings %d, you "
229  "requested %d)\n", get_num_vectors(), num);
230  }
231 
232  int32_t l;
233  bool free_vec;
234  ST* vec=get_feature_vector(num, l, free_vec);
235  ST* dst=SG_MALLOC(ST, l);
236  memcpy(dst, vec, l*sizeof(ST));
237  free_feature_vector(vec, num, free_vec);
238  return SGVector<ST>(dst, l, true);
239 }
240 
241 template<class ST> void CStringFeatures<ST>::set_feature_vector(SGVector<ST> vector, int32_t num)
242 {
243  ASSERT(features)
244 
245  if (m_subset_stack->has_subsets())
246  SG_ERROR("A subset is set, cannot set feature vector\n")
247 
248  if (num>=num_vectors)
249  {
250  SG_ERROR("Index out of bounds (number of strings %d, you "
251  "requested %d)\n", num_vectors, num);
252  }
253 
254  if (vector.vlen<=0)
255  SG_ERROR("String has zero or negative length\n")
256 
257  cleanup_feature_vector(num);
258  features[num].slen=vector.vlen;
259  features[num].string=SG_MALLOC(ST, vector.vlen);
260  memcpy(features[num].string, vector.vector, vector.vlen*sizeof(ST));
261 
262  determine_maximum_string_length();
263 }
264 
// Body that switches the preprocess_on_get flag on.
// NOTE(review): the signature line is missing from this listing.
266 {
267  preprocess_on_get=true;
268 }
269 
// Body that switches the preprocess_on_get flag off.
// NOTE(review): the signature line is missing from this listing.
271 {
272  preprocess_on_get=false;
273 }
274 
275 template<class ST> ST* CStringFeatures<ST>::get_feature_vector(int32_t num, int32_t& len, bool& dofree)
276 {
277  ASSERT(features)
278  if (num>=get_num_vectors())
279  SG_ERROR("Requested feature vector with index %d while total num is", num, get_num_vectors())
280 
281  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
282 
283  if (!preprocess_on_get)
284  {
285  dofree=false;
286  len=features[real_num].slen;
287  return features[real_num].string;
288  }
289  else
290  {
291  SG_DEBUG("computing feature vector!\n")
292  ST* feat=compute_feature_vector(num, len);
293  dofree=true;
294 
295  if (get_num_preprocessors())
296  {
297  ST* tmp_feat_before=feat;
298 
299  for (int32_t i=0; i<get_num_preprocessors(); i++)
300  {
301  CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
302  feat=p->apply_to_string(tmp_feat_before, len);
303  SG_UNREF(p);
304  SG_FREE(tmp_feat_before);
305  tmp_feat_before=feat;
306  }
307  }
308  // TODO: implement caching
309  return feat;
310  }
311 }
312 
// Body that builds a new CStringFeatures object from the transposed strings
// returned by get_transposed(num_feat, num_vec), reusing this object's
// alphabet.
// NOTE(review): the signature line is missing from this listing.
314 {
315  int32_t num_feat;
316  int32_t num_vec;
317  SGString<ST>* s=get_transposed(num_feat, num_vec);
318  SGStringList<ST> string_list;
319  string_list.strings = s;
320  string_list.num_strings = num_vec;
321  string_list.max_string_length = num_feat;
322 
323  return new CStringFeatures<ST>(string_list, alphabet);
324 }
325 
326 template<class ST> SGString<ST>* CStringFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec)
327 {
328  num_feat=get_num_vectors();
329  num_vec=get_max_vector_length();
330  ASSERT(have_same_length())
331 
332  SG_DEBUG("Allocating memory for transposed string features of size %ld\n",
333  int64_t(num_feat)*num_vec);
334 
335  SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
336 
337  for (int32_t i=0; i<num_vec; i++)
338  {
339  sf[i].string=SG_MALLOC(ST, num_feat);
340  sf[i].slen=num_feat;
341  }
342 
343  for (int32_t i=0; i<num_feat; i++)
344  {
345  int32_t len=0;
346  bool free_vec=false;
347  ST* vec=get_feature_vector(i, len, free_vec);
348 
349  for (int32_t j=0; j<num_vec; j++)
350  sf[j].string[i]=vec[j];
351 
352  free_feature_vector(vec, i, free_vec);
353  }
354  return sf;
355 }
356 
357 template<class ST> void CStringFeatures<ST>::free_feature_vector(ST* feat_vec, int32_t num, bool dofree)
358 {
359  if (num>=get_num_vectors())
360  {
361  SG_ERROR(
362  "Trying to access string[%d] but num_str=%d\n", num,
363  get_num_vectors());
364  }
365 
366  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
367 
368  if (feature_cache)
369  feature_cache->unlock_entry(real_num);
370 
371  if (dofree)
372  SG_FREE(feat_vec);
373 }
374 
375 template<class ST> void CStringFeatures<ST>::free_feature_vector(SGVector<ST> feat_vec, int32_t num)
376 {
377  if (num>=get_num_vectors())
378  {
379  SG_ERROR(
380  "Trying to access string[%d] but num_str=%d\n", num,
381  get_num_vectors());
382  }
383 
384  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
385 
386  if (feature_cache)
387  feature_cache->unlock_entry(real_num);
388 }
389 
390 template<class ST> ST CStringFeatures<ST>::get_feature(int32_t vec_num, int32_t feat_num)
391 {
392  ASSERT(vec_num<get_num_vectors())
393 
394  int32_t len;
395  bool free_vec;
396  ST* vec=get_feature_vector(vec_num, len, free_vec);
397  ASSERT(feat_num<len)
398  ST result=vec[feat_num];
399  free_feature_vector(vec, vec_num, free_vec);
400 
401  return result;
402 }
403 
404 template<class ST> int32_t CStringFeatures<ST>::get_vector_length(int32_t vec_num)
405 {
406  ASSERT(vec_num<get_num_vectors())
407 
408  int32_t len;
409  bool free_vec;
410  ST* vec=get_feature_vector(vec_num, len, free_vec);
411  free_feature_vector(vec, vec_num, free_vec);
412  return len;
413 }
414 
415 template<class ST> int32_t CStringFeatures<ST>::get_max_vector_length()
416 {
417  return max_string_length;
418 }
419 
420 template<class ST> int32_t CStringFeatures<ST>::get_num_vectors() const
421 {
422  return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : num_vectors;
423 }
424 
425 template<class ST> floatmax_t CStringFeatures<ST>::get_num_symbols() { return num_symbols; }
426 
427 template<class ST> floatmax_t CStringFeatures<ST>::get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
428 
429 template<class ST> floatmax_t CStringFeatures<ST>::get_original_num_symbols() { return original_num_symbols; }
430 
431 template<class ST> int32_t CStringFeatures<ST>::get_order() { return order; }
432 
433 template<class ST> ST CStringFeatures<ST>::get_masked_symbols(ST symbol, uint8_t mask)
434 {
435  ASSERT(symbol_mask_table)
436  return symbol_mask_table[mask] & symbol;
437 }
438 
439 template<class ST> ST CStringFeatures<ST>::shift_offset(ST offset, int32_t amount)
440 {
441  ASSERT(alphabet)
442  return (offset << (amount*alphabet->get_num_bits()));
443 }
444 
445 template<class ST> ST CStringFeatures<ST>::shift_symbol(ST symbol, int32_t amount)
446 {
447  ASSERT(alphabet)
448  return (symbol >> (amount*alphabet->get_num_bits()));
449 }
450 
451 template<class ST> void CStringFeatures<ST>::load_ascii_file(char* fname, bool remap_to_bin,
452  EAlphabet ascii_alphabet, EAlphabet binary_alphabet)
453 {
454  remove_all_subsets();
455 
456  size_t blocksize=1024*1024;
457  size_t required_blocksize=0;
458  uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
459  uint8_t* overflow=NULL;
460  int32_t overflow_len=0;
461 
462  cleanup();
463 
464  CAlphabet* alpha=new CAlphabet(ascii_alphabet);
465  CAlphabet* alpha_bin=new CAlphabet(binary_alphabet);
466 
467  FILE* f=fopen(fname, "ro");
468 
469  if (f)
470  {
471  num_vectors=0;
472  max_string_length=0;
473 
474  SG_INFO("counting line numbers in file %s\n", fname)
475  size_t block_offs=0;
476  size_t old_block_offs=0;
477  fseek(f, 0, SEEK_END);
478  size_t fsize=ftell(f);
479  rewind(f);
480 
481  if (blocksize>fsize)
482  blocksize=fsize;
483 
484  SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize)
485 
486  size_t sz=blocksize;
487  while (sz == blocksize)
488  {
489  sz=fread(dummy, sizeof(uint8_t), blocksize, f);
490  for (size_t i=0; i<sz; i++)
491  {
492  block_offs++;
493  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
494  {
495  num_vectors++;
496  required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
497  old_block_offs=block_offs;
498  }
499  }
500  SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t")
501  }
502 
503  SG_INFO("found %d strings\n", num_vectors)
504  SG_FREE(dummy);
505  blocksize=required_blocksize;
506  dummy=SG_MALLOC(uint8_t, blocksize);
507  overflow=SG_MALLOC(uint8_t, blocksize);
508  features=SG_MALLOC(SGString<ST>, num_vectors);
509 
510  rewind(f);
511  sz=blocksize;
512  int32_t lines=0;
513  while (sz == blocksize)
514  {
515  sz=fread(dummy, sizeof(uint8_t), blocksize, f);
516 
517  size_t old_sz=0;
518  for (size_t i=0; i<sz; i++)
519  {
520  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
521  {
522  int32_t len=i-old_sz;
523  //SG_PRINT("i:%d len:%d old_sz:%d\n", i, len, old_sz)
524  max_string_length=CMath::max(max_string_length, len+overflow_len);
525 
526  features[lines].slen=len;
527  features[lines].string=SG_MALLOC(ST, len);
528 
529  if (remap_to_bin)
530  {
531  for (int32_t j=0; j<overflow_len; j++)
532  features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
533  for (int32_t j=0; j<len; j++)
534  features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
535  alpha->add_string_to_histogram(&dummy[old_sz], len);
536  alpha_bin->add_string_to_histogram(features[lines].string, features[lines].slen);
537  }
538  else
539  {
540  for (int32_t j=0; j<overflow_len; j++)
541  features[lines].string[j]=overflow[j];
542  for (int32_t j=0; j<len; j++)
543  features[lines].string[j+overflow_len]=dummy[old_sz+j];
544  alpha->add_string_to_histogram(&dummy[old_sz], len);
545  alpha->add_string_to_histogram(features[lines].string, features[lines].slen);
546  }
547 
548  // clear overflow
549  overflow_len=0;
550 
551  //CMath::display_vector(features[lines].string, len);
552  old_sz=i+1;
553  lines++;
554  SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t")
555  }
556  }
557  for (size_t i=old_sz; i<sz; i++)
558  overflow[i-old_sz]=dummy[i];
559 
560  overflow_len=sz-old_sz;
561  }
562 
563  if (alpha->check_alphabet_size() && alpha->check_alphabet())
564  {
565  SG_INFO("file successfully read\n")
566  SG_INFO("max_string_length=%d\n", max_string_length)
567  SG_INFO("num_strings=%d\n", num_vectors)
568  }
569  fclose(f);
570  }
571 
572  SG_FREE(dummy);
573 
574  SG_UNREF(alphabet);
575 
576  if (remap_to_bin)
577  alphabet=alpha_bin;
578  else
579  alphabet=alpha;
580  SG_REF(alphabet);
581  num_symbols=alphabet->get_num_symbols();
582 }
583 
// Load strings from a FASTA file.
// A first scan counts the lines that start with '>'; each of them introduces
// one entry. The alphabet is replaced by a DNA alphabet. For every entry the
// identifier line is read, then the following lines are consumed until the
// next '>' line or the end of the file; their characters, newlines excluded,
// form the string of that entry.
// With ignore_invalid set, characters the alphabet rejects are stored as 'A'.
// The collected strings are finally passed to set_features(), whose result is
// returned.
584 template<class ST> bool CStringFeatures<ST>::load_fasta_file(const char* fname, bool ignore_invalid)
585 {
586  remove_all_subsets();
587 
588  int32_t i=0;
589  uint64_t len=0;
590  uint64_t offs=0;
591  int32_t num=0;
592  int32_t max_len=0;
593 
594  CMemoryMappedFile<char> f(fname);
595 
// first scan: count the entries
596  while (true)
597  {
598  char* s=f.get_line(len, offs);
599  if (!s)
600  break;
601 
602  if (len>0 && s[0]=='>')
603  num++;
604  }
605 
606  if (num==0)
607  SG_ERROR("No fasta hunks (lines starting with '>') found\n")
608 
609  cleanup();
610  SG_UNREF(alphabet);
611  alphabet=new CAlphabet(DNA);
612  num_symbols=alphabet->get_num_symbols();
613 
614  SGString<ST>* strings=SG_MALLOC(SGString<ST>, num);
615  offs=0;
616 
// second scan: read the entries one by one
617  for (i=0;i<num; i++)
618  {
619  uint64_t id_len=0;
620  char* id=f.get_line(id_len, offs);
621 
622  char* fasta=f.get_line(len, offs);
623  char* s=fasta;
624  int32_t fasta_len=0;
625  int32_t spanned_lines=0;
626 
627  while (true)
628  {
629  if (!s || len==0)
630  SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len)
631 
// the entry ends at the next identifier line or at the end of the file
632  if (s[0]=='>' || offs==f.get_size())
633  {
634  offs-=len+1; // seek to beginning
635  if (offs==f.get_size())
636  {
637  SG_DEBUG("at EOF\n")
638  fasta_len+=len;
639  }
640 
// fasta_len includes one newline per spanned line; these are not stored
641  len=fasta_len-spanned_lines;
642  strings[i].string=SG_MALLOC(ST, len);
643  strings[i].slen=len;
644 
645  ST* str=strings[i].string;
646  int32_t idx=0;
647  SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines)
648 
649  for (int32_t j=0; j<fasta_len; j++)
650  {
651  if (fasta[j]=='\n')
652  continue;
653 
654  ST c=(ST) fasta[j];
655 
656  if (ignore_invalid && !alphabet->is_valid((uint8_t) fasta[j]))
657  c=(ST) 'A';
658 
659  if (uint64_t(idx)>=len)
660  SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str)
661  str[idx++]=c;
662  }
663  max_len=CMath::max(max_len, strings[i].slen);
664 
665 
666  break;
667  }
668 
669  spanned_lines++;
670  fasta_len+=len+1; // including '\n'
671  s=f.get_line(len, offs);
672  }
673  }
674  return set_features(strings, num, max_len);
675 }
676 
677 template<class ST> bool CStringFeatures<ST>::load_fastq_file(const char* fname,
678  bool ignore_invalid, bool bitremap_in_single_string)
679 {
680  remove_all_subsets();
681 
682  CMemoryMappedFile<char> f(fname);
683 
684  int32_t i=0;
685  uint64_t len=0;
686  uint64_t offs=0;
687 
688  int32_t num=f.get_num_lines();
689  int32_t max_len=0;
690 
691  if (num%4)
692  SG_ERROR("Number of lines must be divisible by 4 in fastq files\n")
693  num/=4;
694 
695  cleanup();
696  SG_UNREF(alphabet);
697  alphabet=new CAlphabet(DNA);
698 
699  SGString<ST>* strings;
700 
701  ST* str=NULL;
702  if (bitremap_in_single_string)
703  {
704  strings=SG_MALLOC(SGString<ST>, 1);
705  strings[0].string=SG_MALLOC(ST, num);
706  strings[0].slen=num;
707  f.get_line(len, offs);
708  f.get_line(len, offs);
709  order=len;
710  max_len=num;
711  offs=0;
712  original_num_symbols=alphabet->get_num_symbols();
713  str=SG_MALLOC(ST, len);
714  }
715  else
716  strings=SG_MALLOC(SGString<ST>, num);
717 
718  for (i=0;i<num; i++)
719  {
720  if (!f.get_line(len, offs))
721  SG_ERROR("Error reading 'read' identifier in line %d", 4*i)
722 
723  char* s=f.get_line(len, offs);
724  if (!s || len==0)
725  SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len)
726 
727  if (bitremap_in_single_string)
728  {
729  if (len!=(uint64_t) order)
730  SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len)
731  for (int32_t j=0; j<order; j++)
732  str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
733 
734  strings[0].string[i]=embed_word(str, order);
735  }
736  else
737  {
738  strings[i].string=SG_MALLOC(ST, len);
739  strings[i].slen=len;
740  str=strings[i].string;
741 
742  if (ignore_invalid)
743  {
744  for (uint64_t j=0; j<len; j++)
745  {
746  if (alphabet->is_valid((uint8_t) s[j]))
747  str[j]= (ST) s[j];
748  else
749  str[j]= (ST) 'A';
750  }
751  }
752  else
753  {
754  for (uint64_t j=0; j<len; j++)
755  str[j]= (ST) s[j];
756  }
757  max_len=CMath::max(max_len, (int32_t) len);
758  }
759 
760 
761  if (!f.get_line(len, offs))
762  SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2)
763 
764  if (!f.get_line(len, offs))
765  SG_ERROR("Error reading 'read' quality in line %d", 4*i+3)
766  }
767 
768  if (bitremap_in_single_string)
769  num=1;
770 
771  num_vectors=num;
772  max_string_length=max_len;
773  features=strings;
774 
775  return true;
776 }
777 
778 template<class ST> bool CStringFeatures<ST>::load_from_directory(char* dirname)
779 {
780  remove_all_subsets();
781 
782  struct dirent **namelist;
783  int32_t n;
784 
785  SGIO::set_dirname(dirname);
786 
787  SG_DEBUG("dirname '%s'\n", dirname)
788 
789  n=scandir(dirname, &namelist, &SGIO::filter, alphasort);
790  if (n <= 0)
791  {
792  SG_ERROR("error calling scandir - no files found\n")
793  return false;
794  }
795  else
796  {
797  SGString<ST>* strings=NULL;
798 
799  int32_t num=0;
800  int32_t max_len=-1;
801 
802  //usually n==num_vec, but it might not in race conditions
803  //(file perms modified, file erased)
804  strings=SG_MALLOC(SGString<ST>, n);
805 
806  for (int32_t i=0; i<n; i++)
807  {
808  char* fname=SGIO::concat_filename(namelist[i]->d_name);
809 
810  struct stat s;
811  off_t filesize=0;
812 
813  if (!stat(fname, &s) && s.st_size>0)
814  {
815  filesize=s.st_size/sizeof(ST);
816 
817  FILE* f=fopen(fname, "ro");
818  if (f)
819  {
820  ST* str=SG_MALLOC(ST, filesize);
821  SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize)
822  if (fread(str, sizeof(ST), filesize, f)!=(size_t) filesize)
823  SG_ERROR("failed to read file\n")
824  strings[num].string=str;
825  strings[num].slen=filesize;
826  max_len=CMath::max(max_len, strings[num].slen);
827 
828  num++;
829  fclose(f);
830  }
831  }
832  else
833  SG_ERROR("empty or non readable file \'%s\'\n", fname)
834 
835  SG_FREE(namelist[i]);
836  }
837  SG_FREE(namelist);
838 
839  if (num>0 && strings)
840  {
841  set_features(strings, num, max_len);
842  return true;
843  }
844  }
845  return false;
846 }
847 
// Body that forwards the members of the string list `feats` to
// set_features(strings, num_strings, max_string_length); the result of that
// call is not used here.
// NOTE(review): the signature line is missing from this listing.
849 {
850  set_features(feats.strings, feats.num_strings, feats.max_string_length);
851 }
852 
853 template<class ST> bool CStringFeatures<ST>::set_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
854 {
855  if (m_subset_stack->has_subsets())
856  SG_ERROR("Cannot call set_features() with subset.\n")
857 
858  if (p_features)
859  {
860  CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
861 
862  //compute histogram for char/byte
863  for (int32_t i=0; i<p_num_vectors; i++)
864  alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
865 
866  SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram())
867  SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram())
868 
869  if (alpha->check_alphabet_size() && alpha->check_alphabet())
870  {
871  cleanup();
872  SG_UNREF(alphabet);
873 
874  alphabet=alpha;
875  SG_REF(alphabet);
876 
877  // TODO remove copying
878  features = SG_MALLOC(SGString<ST>,p_num_vectors);
879  memcpy(features,p_features,sizeof(SGString<ST>)*p_num_vectors);
880  num_vectors = p_num_vectors;
881  max_string_length = p_max_string_length;
882 
883  return true;
884  }
885  else
886  SG_UNREF(alpha);
887  }
888 
889  return false;
890 }
891 
// Body that appends the strings of another string feature object `sf`.
// Every string of `sf` (honouring its subset index conversion) is copied into
// a newly allocated array, which is then passed on to
// append_features(new_features, sf_num_str, sf->max_string_length).
// NOTE(review): the signature line is missing from this listing.
893 {
894  ASSERT(sf)
895 
896  if (m_subset_stack->has_subsets())
897  SG_ERROR("Cannot call set_features() with subset.\n")
898 
899  SGString<ST>* new_features=SG_MALLOC(SGString<ST>, sf->get_num_vectors());
900 
901  index_t sf_num_str=sf->get_num_vectors();
902  for (int32_t i=0; i<sf_num_str; i++)
903  {
904  int32_t real_i = sf->m_subset_stack->subset_idx_conversion(i);
905  int32_t length=sf->features[real_i].slen;
906  new_features[i].string=SG_MALLOC(ST, length);
// NOTE(review): memcpy copies `length` bytes although the buffer holds
// `length` elements of type ST; for sizeof(ST)>1 this looks like it copies
// only part of the string - confirm whether length*sizeof(ST) was intended.
907  memcpy(new_features[i].string, sf->features[real_i].string, length);
908  new_features[i].slen=length;
909  }
910  return append_features(new_features, sf_num_str,
911  sf->max_string_length);
912 }
913 
914 template<class ST> bool CStringFeatures<ST>::append_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
915 {
916  if (m_subset_stack->has_subsets())
917  SG_ERROR("Cannot call set_features() with subset.\n")
918 
919  if (!features)
920  return set_features(p_features, p_num_vectors, p_max_string_length);
921 
922  CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
923 
924  //compute histogram for char/byte
925  for (int32_t i=0; i<p_num_vectors; i++)
926  alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
927 
928  SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram())
929  SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram())
930 
931  if (alpha->check_alphabet_size() && alpha->check_alphabet())
932  {
933  SG_UNREF(alpha);
934  for (int32_t i=0; i<p_num_vectors; i++)
935  alphabet->add_string_to_histogram( p_features[i].string, p_features[i].slen);
936 
937  int32_t old_num_vectors=num_vectors;
938  num_vectors=old_num_vectors+p_num_vectors;
939  SGString<ST>* new_features=SG_MALLOC(SGString<ST>, num_vectors);
940 
941  for (int32_t i=0; i<num_vectors; i++)
942  {
943  if (i<old_num_vectors)
944  {
945  new_features[i].string=features[i].string;
946  new_features[i].slen=features[i].slen;
947  }
948  else
949  {
950  new_features[i].string=p_features[i-old_num_vectors].string;
951  new_features[i].slen=p_features[i-old_num_vectors].slen;
952  }
953  }
954  SG_FREE(features);
955  SG_FREE(p_features); // free now obsolete features
956 
957  this->features=new_features;
958  max_string_length=CMath::max(max_string_length, p_max_string_length);
959 
960  return true;
961  }
962  SG_UNREF(alpha);
963 
964  return false;
965 }
966 
// Body that wraps the internal string array in an SGStringList and returns
// it. The list is filled through get_features(num_strings,
// max_string_length), which returns the internal array itself.
// NOTE(review): the signature line is missing from this listing; the meaning
// of the fourth constructor argument (false) is not visible here.
968 {
969  SGStringList<ST> sl(NULL,0,0,false);
970 
971  sl.strings=get_features(sl.num_strings, sl.max_string_length);
972  return sl;
973 }
974 
975 template<class ST> SGString<ST>* CStringFeatures<ST>::get_features(int32_t& num_str, int32_t& max_str_len)
976 {
977  if (m_subset_stack->has_subsets())
978  SG_ERROR("get features() is not possible on subset")
979 
980  num_str=num_vectors;
981  max_str_len=max_string_length;
982  return features;
983 }
984 
985 template<class ST> SGString<ST>* CStringFeatures<ST>::copy_features(int32_t& num_str, int32_t& max_str_len)
986 {
987  ASSERT(num_vectors>0)
988 
989  num_str=get_num_vectors();
990  max_str_len=max_string_length;
991  SGString<ST>* new_feat=SG_MALLOC(SGString<ST>, num_str);
992 
993  for (int32_t i=0; i<num_str; i++)
994  {
995  int32_t len;
996  bool free_vec;
997  ST* vec=get_feature_vector(i, len, free_vec);
998  new_feat[i].string=SG_MALLOC(ST, len);
999  new_feat[i].slen=len;
1000  memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST));
1001  free_feature_vector(vec, i, free_vec);
1002  }
1003 
1004  return new_feat;
1005 }
1006 
1007 template<class ST> void CStringFeatures<ST>::get_features(SGString<ST>** dst, int32_t* num_str)
1008 {
1009  int32_t num_vec;
1010  int32_t max_str_len;
1011  *dst=copy_features(num_vec, max_str_len);
1012  *num_str=num_vec;
1013 }
1014 
1015 template<class ST> bool CStringFeatures<ST>::load_compressed(char* src, bool decompress)
1016 {
1017  remove_all_subsets();
1018 
1019  FILE* file=NULL;
1020 
1021  if (!(file=fopen(src, "r")))
1022  return false;
1023  cleanup();
1024 
1025  // header shogun v0
1026  char id[4];
1027  if (fread(&id[0], sizeof(char), 1, file)!=1)
1028  SG_ERROR("failed to read header")
1029  ASSERT(id[0]=='S')
1030  if (fread(&id[1], sizeof(char), 1, file)!=1)
1031  SG_ERROR("failed to read header")
1032  ASSERT(id[1]=='G')
1033  if (fread(&id[2], sizeof(char), 1, file)!=1)
1034  SG_ERROR("failed to read header")
1035  ASSERT(id[2]=='V')
1036  if (fread(&id[3], sizeof(char), 1, file)!=1)
1037  SG_ERROR("failed to read header")
1038  ASSERT(id[3]=='0')
1039 
1040  //compression type
1041  uint8_t c;
1042  if (fread(&c, sizeof(uint8_t), 1, file)!=1)
1043  SG_ERROR("failed to read compression type")
1044  CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c);
1045  //alphabet
1046  uint8_t a;
1047  delete alphabet;
1048  if (fread(&a, sizeof(uint8_t), 1, file)!=1)
1049  SG_ERROR("failed to read compression alphabet")
1050  alphabet=new CAlphabet((EAlphabet) a);
1051  // number of vectors
1052  if (fread(&num_vectors, sizeof(int32_t), 1, file)!=1)
1053  SG_ERROR("failed to read compression number of vectors")
1054  ASSERT(num_vectors>0)
1055  // maximum string length
1056  if (fread(&max_string_length, sizeof(int32_t), 1, file)!=1)
1057  SG_ERROR("failed to read maximum string length")
1058  ASSERT(max_string_length>0)
1059 
1060  features=SG_MALLOC(SGString<ST>, num_vectors);
1061 
1062  // vectors
1063  for (int32_t i=0; i<num_vectors; i++)
1064  {
1065  // vector len compressed
1066  int32_t len_compressed;
1067  if (fread(&len_compressed, sizeof(int32_t), 1, file)!=1)
1068  SG_ERROR("failed to read vector length compressed")
1069  // vector len uncompressed
1070  int32_t len_uncompressed;
1071  if (fread(&len_uncompressed, sizeof(int32_t), 1, file)!=1)
1072  SG_ERROR("failed to read vector length uncompressed")
1073 
1074  // vector raw data
1075  if (decompress)
1076  {
1077  features[i].string=SG_MALLOC(ST, len_uncompressed);
1078  features[i].slen=len_uncompressed;
1079  uint8_t* compressed=SG_MALLOC(uint8_t, len_compressed);
1080  if (fread(compressed, sizeof(uint8_t), len_compressed, file)!=(size_t) len_compressed)
1081  SG_ERROR("failed to read compressed data (expected %d bytes)", len_compressed)
1082  uint64_t uncompressed_size=len_uncompressed;
1083  uncompressed_size*=sizeof(ST);
1084  compressor->decompress(compressed, len_compressed,
1085  (uint8_t*) features[i].string, uncompressed_size);
1086  SG_FREE(compressed);
1087  ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST))
1088  }
1089  else
1090  {
1091  int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST));
1092  features[i].string=SG_MALLOC(ST, len_compressed+offs);
1093  features[i].slen=len_compressed+offs;
1094  int32_t* feat32ptr=((int32_t*) (features[i].string));
1095  memset(features[i].string, 0, offs*sizeof(ST));
1096  feat32ptr[0]=(int32_t) len_compressed;
1097  feat32ptr[1]=(int32_t) len_uncompressed;
1098  uint8_t* compressed=(uint8_t*) (&features[i].string[offs]);
1099  if (fread(compressed, 1, len_compressed, file)!=(size_t) len_compressed)
1100  SG_ERROR("failed to read uncompressed data")
1101  }
1102  }
1103 
1104  delete compressor;
1105  fclose(file);
1106 
1107  return false;
1108 }
1109 
1110 template<class ST> bool CStringFeatures<ST>::save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level)
1111 {
1112  if (m_subset_stack->has_subsets())
1113  SG_ERROR("save_compressed() is not possible on subset")
1114 
1115  FILE* file=NULL;
1116 
1117  if (!(file=fopen(dest, "wb")))
1118  return false;
1119 
1120  CCompressor* compressor= new CCompressor(compression);
1121 
1122  // header shogun v0
1123  const char* id="SGV0";
1124  fwrite(&id[0], sizeof(char), 1, file);
1125  fwrite(&id[1], sizeof(char), 1, file);
1126  fwrite(&id[2], sizeof(char), 1, file);
1127  fwrite(&id[3], sizeof(char), 1, file);
1128 
1129  //compression type
1130  uint8_t c=(uint8_t) compression;
1131  fwrite(&c, sizeof(uint8_t), 1, file);
1132  //alphabet
1133  uint8_t a=(uint8_t) alphabet->get_alphabet();
1134  fwrite(&a, sizeof(uint8_t), 1, file);
1135  // number of vectors
1136  fwrite(&num_vectors, sizeof(int32_t), 1, file);
1137  // maximum string length
1138  fwrite(&max_string_length, sizeof(int32_t), 1, file);
1139 
1140  // vectors
1141  for (int32_t i=0; i<num_vectors; i++)
1142  {
1143  int32_t len=-1;
1144  bool vfree;
1145  ST* vec=get_feature_vector(i, len, vfree);
1146 
1147  uint8_t* compressed=NULL;
1148  uint64_t compressed_size=0;
1149 
1150  compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST),
1151  compressed, compressed_size, level);
1152 
1153  int32_t len_compressed=(int32_t) compressed_size;
1154  // vector len compressed in bytes
1155  fwrite(&len_compressed, sizeof(int32_t), 1, file);
1156  // vector len uncompressed in number of elements of type ST
1157  fwrite(&len, sizeof(int32_t), 1, file);
1158  // vector raw data
1159  fwrite(compressed, compressed_size, 1, file);
1160  SG_FREE(compressed);
1161 
1162  free_feature_vector(vec, i, vfree);
1163  }
1164 
1165  delete compressor;
1166  fclose(file);
1167  return true;
1168 }
1169 
1170 template<class ST> bool CStringFeatures<ST>::apply_preprocessor(bool force_preprocessing)
1171 {
1172  SG_DEBUG("force: %d\n", force_preprocessing)
1173 
1174  for (int32_t i=0; i<get_num_preprocessors(); i++)
1175  {
1176  if ( (!is_preprocessed(i) || force_preprocessing) )
1177  {
1178  set_preprocessed(i);
1179  CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
1180  SG_INFO("preprocessing using preproc %s\n", p->get_name())
1181 
1182  if (!p->apply_to_string_features(this))
1183  {
1184  SG_UNREF(p);
1185  return false;
1186  }
1187  else
1188  SG_UNREF(p);
1189  }
1190  }
1191  return true;
1192 }
1193 
1194 template<class ST> int32_t CStringFeatures<ST>::obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip)
1195 {
1196  if (m_subset_stack->has_subsets())
1198 
1199  ASSERT(step_size>0)
1200  ASSERT(window_size>0)
1201  ASSERT(num_vectors==1 || single_string)
1202  ASSERT(max_string_length>=window_size ||
1203  (single_string && length_of_single_string>=window_size));
1204 
1205  //in case we are dealing with a single remapped string
1206  //allow remapping
1207  if (single_string)
1208  num_vectors= (length_of_single_string-window_size)/step_size + 1;
1209  else if (num_vectors==1)
1210  {
1211  num_vectors= (max_string_length-window_size)/step_size + 1;
1212  length_of_single_string=max_string_length;
1213  }
1214 
1215  SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
1216  int32_t offs=0;
1217  for (int32_t i=0; i<num_vectors; i++)
1218  {
1219  f[i].string=&features[0].string[offs+skip];
1220  f[i].slen=window_size-skip;
1221  offs+=step_size;
1222  }
1223  single_string=features[0].string;
1224  SG_FREE(features);
1225  features=f;
1226  max_string_length=window_size-skip;
1227 
1228  return num_vectors;
1229 }
1230 
1231 template<class ST> int32_t CStringFeatures<ST>::obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
1232  int32_t skip)
1233 {
1234  if (m_subset_stack->has_subsets())
1236 
1237  ASSERT(positions)
1238  ASSERT(window_size>0)
1239  ASSERT(num_vectors==1 || single_string)
1240  ASSERT(max_string_length>=window_size ||
1241  (single_string && length_of_single_string>=window_size));
1242 
1243  num_vectors= positions->get_num_elements();
1244  ASSERT(num_vectors>0)
1245 
1246  int32_t len;
1247 
1248  //in case we are dealing with a single remapped string
1249  //allow remapping
1250  if (single_string)
1251  len=length_of_single_string;
1252  else
1253  {
1254  single_string=features[0].string;
1255  len=max_string_length;
1256  length_of_single_string=max_string_length;
1257  }
1258 
1259  SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
1260  for (int32_t i=0; i<num_vectors; i++)
1261  {
1262  int32_t p=positions->get_element(i);
1263 
1264  if (p>=0 && p<=len-window_size)
1265  {
1266  f[i].string=&features[0].string[p+skip];
1267  f[i].slen=window_size-skip;
1268  }
1269  else
1270  {
1271  num_vectors=1;
1272  max_string_length=len;
1273  features[0].slen=len;
1274  single_string=NULL;
1275  SG_FREE(f);
1276  SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
1277  window_size, i, p, len);
1278  return -1;
1279  }
1280  }
1281 
1282  SG_FREE(features);
1283  features=f;
1284  max_string_length=window_size-skip;
1285 
1286  return num_vectors;
1287 }
1288 
1289 template<class ST> bool CStringFeatures<ST>::obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1290 {
1291  return obtain_from_char_features(sf, start, p_order, gap, rev);
1292 }
1293 
1294 template<class ST> bool CStringFeatures<ST>::have_same_length(int32_t len)
1295 {
1296  if (len!=-1)
1297  {
1298  if (len!=max_string_length)
1299  return false;
1300  }
1301  len=max_string_length;
1302 
1303  index_t num_str=get_num_vectors();
1304  for (int32_t i=0; i<num_str; i++)
1305  {
1306  if (get_vector_length(i)!=len)
1307  return false;
1308  }
1309 
1310  return true;
1311 }
1312 
1313 template<class ST> void CStringFeatures<ST>::embed_features(int32_t p_order)
1314 {
1315  if (m_subset_stack->has_subsets())
1317 
1318  ASSERT(alphabet->get_num_symbols_in_histogram() > 0)
1319 
1320  order=p_order;
1321  original_num_symbols=alphabet->get_num_symbols();
1322  int32_t max_val=alphabet->get_num_bits();
1323 
1324  if (p_order>1)
1325  num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
1326  else
1327  num_symbols=original_num_symbols;
1328 
1329  SG_INFO("max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols)
1330 
1331  if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
1332  SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val)
1333 
1334  ST mask=0;
1335  for (int32_t i=0; i<p_order*max_val; i++)
1336  mask= (mask<<1) | ((ST) 1);
1337 
1338  for (int32_t i=0; i<num_vectors; i++)
1339  {
1340  int32_t len=features[i].slen;
1341 
1342  if (len < p_order)
1343  SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order)
1344 
1345  ST* str=features[i].string;
1346 
1347  // convert first word
1348  for (int32_t j=0; j<p_order; j++)
1349  str[j]=(ST) alphabet->remap_to_bin(str[j]);
1350  str[0]=embed_word(&str[0], p_order);
1351 
1352  // convert the rest
1353  int32_t idx=0;
1354  for (int32_t j=p_order; j<len; j++)
1355  {
1356  str[j]=(ST) alphabet->remap_to_bin(str[j]);
1357  str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
1358  idx++;
1359  }
1360 
1361  features[i].slen=len-p_order+1;
1362  }
1363 
1364  compute_symbol_mask_table(max_val);
1365 }
1366 
1367 template<class ST> void CStringFeatures<ST>::compute_symbol_mask_table(int64_t max_val)
1368 {
1369  if (m_subset_stack->has_subsets())
1371 
1372  SG_FREE(symbol_mask_table);
1373  symbol_mask_table=SG_MALLOC(ST, 256);
1374  symbol_mask_table_len=256;
1375 
1376  uint64_t mask=0;
1377  for (int32_t i=0; i< (int64_t) max_val; i++)
1378  mask=(mask<<1) | 1;
1379 
1380  for (int32_t i=0; i<256; i++)
1381  {
1382  uint8_t bits=(uint8_t) i;
1383  symbol_mask_table[i]=0;
1384 
1385  for (int32_t j=0; j<8; j++)
1386  {
1387  if (bits & 1)
1388  symbol_mask_table[i]|=mask<<(max_val*j);
1389 
1390  bits>>=1;
1391  }
1392  }
1393 }
1394 
1395 template<class ST> void CStringFeatures<ST>::unembed_word(ST word, uint8_t* seq, int32_t len)
1396 {
1397  uint32_t nbits= (uint32_t) alphabet->get_num_bits();
1398 
1399  ST mask=0;
1400  for (uint32_t i=0; i<nbits; i++)
1401  mask=(mask<<1) | (ST) 1;
1402 
1403  for (int32_t i=0; i<len; i++)
1404  {
1405  ST w=(word & mask);
1406  seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
1407  word>>=nbits;
1408  }
1409 }
1410 
1411 template<class ST> ST CStringFeatures<ST>::embed_word(ST* seq, int32_t len)
1412 {
1413  ST value=(ST) 0;
1414  uint32_t nbits= (uint32_t) alphabet->get_num_bits();
1415  for (int32_t i=0; i<len; i++)
1416  {
1417  value<<=nbits;
1418  value|=seq[i];
1419  }
1420 
1421  return value;
1422 }
1423 
// NOTE(review): the signature line of this definition is not part of this
// listing; presumably this is determine_maximum_string_length(), which is
// called elsewhere in this file - confirm.
// Recomputes max_string_length as the largest slen over all currently
// visible vectors (indices are translated through the subset stack).
1425 {
1426  max_string_length=0;
1427  index_t num_str=get_num_vectors();
1428 
1429  for (int32_t i=0; i<num_str; i++)
1430  {
1431  max_string_length=CMath::max(max_string_length,
1432  features[m_subset_stack->subset_idx_conversion(i)].slen);
1433  }
1434 }
1435 
// NOTE(review): the signature line of this definition is not part of this
// listing; the body reads a parameter named `str` with slen/string members.
// Returns a newly SG_MALLOC'ed copy of str's elements with one extra
// trailing element set to '\0'. The buffer is not freed here.
1437 {
1438  int32_t l=str.slen;
1439  ST* s=SG_MALLOC(ST, l+1);
1440  memcpy(s, str.string, sizeof(ST)*l);
1441  s[l]='\0';
1442  return s;
1443 }
1444 
1445 template<class ST> void CStringFeatures<ST>::set_feature_vector(int32_t num, ST* string, int32_t len)
1446 {
1447  ASSERT(features)
1448  ASSERT(num<get_num_vectors())
1449 
1450  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
1451 
1452 
1453  features[real_num].slen=len ;
1454  features[real_num].string=string ;
1455 
1456  max_string_length=CMath::max(len, max_string_length);
1457 }
1458 
1459 template<class ST> void CStringFeatures<ST>::get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize)
1460 {
1461  int32_t nsym=get_num_symbols();
1462  int32_t slen=get_max_vector_length();
1463  int64_t sz=int64_t(nsym)*slen*sizeof(float64_t);
1464  float64_t* h= SG_MALLOC(float64_t, sz);
1465  memset(h, 0, sz);
1466 
1467  float64_t* h_normalizer=SG_MALLOC(float64_t, slen);
1468  memset(h_normalizer, 0, slen*sizeof(float64_t));
1469  int32_t num_str=get_num_vectors();
1470  for (int32_t i=0; i<num_str; i++)
1471  {
1472  int32_t len;
1473  bool free_vec;
1474  ST* vec=get_feature_vector(i, len, free_vec);
1475  for (int32_t j=0; j<len; j++)
1476  {
1477  h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++;
1478  h_normalizer[j]++;
1479  }
1480  free_feature_vector(vec, i, free_vec);
1481  }
1482 
1483  if (normalize)
1484  {
1485  for (int32_t i=0; i<slen; i++)
1486  {
1487  for (int32_t j=0; j<nsym; j++)
1488  {
1489  if (h_normalizer && h_normalizer[i])
1490  h[int64_t(i)*nsym+j]/=h_normalizer[i];
1491  }
1492  }
1493  }
1494  SG_FREE(h_normalizer);
1495 
1496  *hist=h;
1497  *rows=nsym;
1498  *cols=slen;
1499 }
1500 
1501 template<class ST> void CStringFeatures<ST>::create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec)
1502 {
1503  ASSERT(rows == get_num_symbols())
1504  cleanup();
1505  float64_t* randoms=SG_MALLOC(float64_t, cols);
1506  SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
1507 
1508  for (int32_t i=0; i<num_vec; i++)
1509  {
1510  sf[i].string=SG_MALLOC(ST, cols);
1511  sf[i].slen=cols;
1512 
1513  SGVector<float64_t>::random_vector(randoms, cols, 0.0, 1.0);
1514 
1515  for (int32_t j=0; j<cols; j++)
1516  {
1517  float64_t lik=hist[int64_t(j)*rows+0];
1518 
1519  int32_t c;
1520  for (c=0; c<rows-1; c++)
1521  {
1522  if (randoms[j]<=lik)
1523  break;
1524  lik+=hist[int64_t(j)*rows+c+1];
1525  }
1526  sf[i].string[j]=alphabet->remap_to_char(c);
1527  }
1528  }
1529  SG_FREE(randoms);
1530  set_features(sf, num_vec, cols);
1531 }
1532 
1533 /*
1534 CStringFeatures<SSKTripleFeature>* obtain_sssk_triple_from_cha(int d1, int d2)
1535 {
1536  int *s;
1537  int32_t nStr=get_num_vectors();
1538 
1539  int32_t nfeat=0;
1540  for (int32_t i=0; i < nStr; ++i)
1541  nfeat += get_vector_length[i] - d1 -d2;
1542  SGString<SSKFeature>* F= SG_MALLOC(SGString<SSKFeature>, nfeat);
1543  int32_t c=0;
1544  for (int32_t i=0; i < nStr; ++i)
1545  {
1546  int32_t len;
1547  bool free_vec;
1548  ST* S=get_feature_vector(vec_num, len, free_vec);
1549  free_feature_vector(vec, vec_num, free_vec);
1550  int32_t n=len - d1 - d2;
1551  s=S[i];
1552  for (int32_t j=0; j < n; ++j)
1553  {
1554  F[c].feature1=s[j];
1555  F[c].feature2=s[j+d1];
1556  F[c].feature3=s[j+d1+d2];
1557  F[c].group=i;
1558  c++;
1559  }
1560  }
1561  ASSERT(nfeat==c)
1562  return F;
1563 }
1564 
1565 CStringFeatures<SSKFeature>* obtain_sssk_double_from_char(int **S, int *len, int nStr, int d1)
1566 {
1567  int i, j;
1568  int n, nfeat;
1569  int *group;
1570  int *features;
1571  int *s;
1572  int c;
1573  SSKFeatures *F;
1574 
1575  nfeat=0;
1576  for (i=0; i < nStr; ++i)
1577  nfeat += len[i] - d1;
1578  group=(int *)SG_MALLOC(nfeat*sizeof(int));
1579  features=(int *)SG_MALLOC(nfeat*2*sizeof(int *));
1580  c=0;
1581  for (i=0; i < nStr; ++i)
1582  {
1583  n=len[i] - d1;
1584  s=S[i];
1585  for (j=0; j < n; ++j)
1586  {
1587  features[c]=s[j];
1588  features[c+nfeat]=s[j+d1];
1589  group[c]=i;
1590  c++;
1591  }
1592  }
1593  if (nfeat!=c)
1594  printf("Something is wrong...\n");
1595  F=(SSKFeatures *)SG_MALLOC(sizeof(SSKFeatures));
1596  (*F).features=features;
1597  (*F).group=group;
1598  (*F).n=nfeat;
1599  return F;
1600 }
1601 */
1602 
// NOTE(review): the first line of this signature and two statements (listing
// lines 1627 and 1631) are not part of this listing - verify against the
// real source before editing.
// Builds a new CStringFeatures from deep copies of the strings selected by
// `indices` (translated through the subset stack), sharing this object's
// alphabet and order. The result is SG_REF'ed before it is returned.
1604  SGVector<index_t> indices)
1605 {
1606  /* string list to create new CStringFeatures from */
1607  SGStringList<ST> list_copy(indices.vlen, max_string_length);
1608 
1609  /* copy all features */
1610  for (index_t i=0; i<indices.vlen; ++i)
1611  {
1612  /* index with respect to possible subset */
1613  index_t real_idx=m_subset_stack->subset_idx_conversion(indices.vector[i]);
1614 
1615  /* copy string */
1616  SGString<ST> current_string=features[real_idx];
1617  SGString<ST> string_copy(current_string.slen);
1618  memcpy(string_copy.string, current_string.string,
1619  current_string.slen*sizeof(ST));
1620  list_copy.strings[i]=string_copy;
1621  }
1622 
1623  /* create copy instance */
1624  CStringFeatures* result=new CStringFeatures(list_copy, alphabet);
1625 
1626  /* max string length may have changed */
1628 
1629  /* keep things from original features (otherwise assertions in x-val) */
1630  result->order=order;
1632 
1633  SG_REF(result);
1634 
1635  return result;
1636 }
1637 
// NOTE(review): the signature line of this definition is not part of this
// listing; presumably a hook invoked after the subset stack changed - confirm.
// Its only action is to recompute max_string_length.
1639 {
1640  /* max string length has to be updated */
1641  determine_maximum_string_length();
1642 }
1643 
1644 template<class ST> ST* CStringFeatures<ST>::compute_feature_vector(int32_t num, int32_t& len)
1645 {
1646  ASSERT(features && num<get_num_vectors())
1647 
1648  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
1649 
1650  len=features[real_num].slen;
1651  if (len<=0)
1652  return NULL;
1653 
1654  ST* target=SG_MALLOC(ST, len);
1655  memcpy(target, features[real_num].string, len*sizeof(ST));
1656  return target;
1657 }
1658 
// Common constructor helper: calls set_generic<ST>(), resets every member to
// an empty/zero default and registers the members with m_parameters under
// the names given below.
1659 template<class ST> void CStringFeatures<ST>::init()
1660 {
1661  set_generic<ST>();
1662 
// defaults: no alphabet, no strings, no embedding
1663  alphabet=NULL;
1664  num_vectors=0;
1665  features=NULL;
1666  single_string=NULL;
1667  length_of_single_string=0;
1668  max_string_length=0;
1669  order=0;
1670  preprocess_on_get=false;
1671  feature_cache=NULL;
1672  symbol_mask_table=NULL;
1673  symbol_mask_table_len=0;
1674  num_symbols=0.0;
1675  original_num_symbols=0;
1676 
// parameter registration; feature_cache is not registered here
1677  m_parameters->add((CSGObject**) &alphabet, "alphabet");
1678  m_parameters->add_vector(&features, &num_vectors, "features",
1679  "This contains the array of features.");
1680  m_parameters->add_vector(&single_string,
1681  &length_of_single_string,
1682  "single_string",
1683  "Created by sliding window.");
1684  m_parameters->add(&max_string_length, "max_string_length",
1685  "Length of longest string.");
1686  m_parameters->add(&num_symbols, "num_symbols",
1687  "Number of used symbols.");
1688  m_parameters->add(&original_num_symbols, "original_num_symbols",
1689  "Original number of used symbols.");
1690  m_parameters->add(&order, "order",
1691  "Order used in higher order mapping.");
1692  m_parameters->add(&preprocess_on_get, "preprocess_on_get",
1693  "Preprocess on-the-fly?");
1694 
1695  m_parameters->add_vector(&symbol_mask_table, &symbol_mask_table_len, "mask_table", "Symbol mask table - using in higher order mapping");
1696 }
1697 
// NOTE(review): the signature lines of the following twelve definitions are
// not part of this listing; presumably they are the per-element-type
// specialisations of get_feature_type() - confirm against the real source.
// Each body returns one fixed F_* constant.
1703 {
1704  return F_BOOL;
1705 }
1706 
1712 {
1713  return F_CHAR;
1714 }
1715 
1721 {
1722  return F_BYTE;
1723 }
1724 
1730 {
1731  return F_SHORT;
1732 }
1733 
1739 {
1740  return F_WORD;
1741 }
1742 
1748 {
1749  return F_INT;
1750 }
1751 
1757 {
1758  return F_UINT;
1759 }
1760 
1766 {
1767  return F_LONG;
1768 }
1769 
1775 {
1776  return F_ULONG;
1777 }
1778 
1784 {
1785  return F_SHORTREAL;
1786 }
1787 
1793 {
1794  return F_DREAL;
1795 }
1796 
1802 {
1803  return F_LONGREAL;
1804 }
1805 
// get_masked_symbols() specialisations that ignore the mask and return the
// symbol unchanged.
// NOTE(review): only the bool signature is visible in this listing; the
// three following bodies presumably belong to further specialisations of the
// same method (their signature lines are missing) - confirm.
1806 template<> bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
1807 {
1808  return symbol;
1809 }
1811 {
1812  return symbol;
1813 }
1815 {
1816  return symbol;
1817 }
1819 {
1820  return symbol;
1821 }
1822 
// shift_offset() specialisations that ignore both arguments and return a
// constant (false / 0).
// NOTE(review): only the bool signature is visible in this listing; the
// three following bodies presumably belong to further specialisations of the
// same method (their signature lines are missing) - confirm.
1823 template<> bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
1824 {
1825  return false;
1826 }
1828 {
1829  return 0;
1830 }
1832 {
1833  return 0;
1834 }
1836 {
1837  return 0;
1838 }
1839 
// shift_symbol() specialisations that ignore the shift amount and return the
// symbol unchanged.
// NOTE(review): only the bool signature is visible in this listing; the
// three following bodies presumably belong to further specialisations of the
// same method (their signature lines are missing) - confirm.
1840 template<> bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
1841 {
1842  return symbol;
1843 }
1845 {
1846  return symbol;
1847 }
1849 {
1850  return symbol;
1851 }
1853 {
1854  return symbol;
1855 }
1856 
// Floating-point element types: obtain_from_char_features() is stubbed out
// and always reports failure. The stubs are compiled only when SUNOS is not
// defined.
1857 #ifndef SUNOS
1858 template<> template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1859 {
1860  return false;
1861 }
1862 template<> template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1863 {
1864  return false;
1865 }
1866 template<> template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1867 {
1868  return false;
1869 }
1870 #endif
1871 
// Floating-point element types: embed_features() is a deliberate no-op
// (the generic version relies on integer shifts and masks).
1872 template<> void CStringFeatures<float32_t>::embed_features(int32_t p_order)
1873 {
1874 }
1875 template<> void CStringFeatures<float64_t>::embed_features(int32_t p_order)
1876 {
1877 }
1878 template<> void CStringFeatures<floatmax_t>::embed_features(int32_t p_order)
1879 {
1880 }
1881 
// NOTE(review): the signature lines of the six bodies below are not part of
// this listing. Judging by the neighbouring floating-point stubs they are
// presumably the float32_t/float64_t/floatmax_t specialisations of
// compute_symbol_mask_table() (empty bodies) and embed_word() (returning 0)
// - confirm against the real source.
1883 {
1884 }
1886 {
1887 }
1889 {
1890 }
1891 
1893 {
1894  return 0;
1895 }
1897 {
1898  return 0;
1899 }
1901 {
1902  return 0;
1903 }
1904 
// Floating-point element types: unembed_word() is a deliberate no-op and
// leaves seq untouched.
1905 template<> void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len)
1906 {
1907 }
1908 template<> void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len)
1909 {
1910 }
1911 template<> void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len)
1912 {
1913 }
// LOAD(f_load, sg_type) defines CStringFeatures<sg_type>::load(CFile*):
// between SG_SET_LOCALE_C and SG_RESET_LOCALE it calls loader->f_load() to
// obtain the string array, count and maximum length, and hands them to
// set_features(). The macro is expanded once per supported element type and
// undefined afterwards.
1914 #define LOAD(f_load, sg_type) \
1915 template<> void CStringFeatures<sg_type>::load(CFile* loader) \
1916 { \
1917  SG_INFO("loading...\n") \
1918  \
1919  SG_SET_LOCALE_C; \
1920  SGString<sg_type>* strs; \
1921  int32_t num_str; \
1922  int32_t max_len; \
1923  loader->f_load(strs, num_str, max_len); \
1924  set_features(strs, num_str, max_len); \
1925  SG_RESET_LOCALE; \
1926 }
1927 
1928 LOAD(get_string_list, bool)
1929 LOAD(get_string_list, char)
1930 LOAD(get_string_list, int8_t)
1931 LOAD(get_string_list, uint8_t)
1932 LOAD(get_string_list, int16_t)
1933 LOAD(get_string_list, uint16_t)
1934 LOAD(get_string_list, int32_t)
1935 LOAD(get_string_list, uint32_t)
1936 LOAD(get_string_list, int64_t)
1937 LOAD(get_string_list, uint64_t)
1938 LOAD(get_string_list, float32_t)
1939 LOAD(get_string_list, float64_t)
1940 LOAD(get_string_list, floatmax_t)
1941 #undef LOAD
1942 
// SAVE(f_write, sg_type) defines CStringFeatures<sg_type>::save(CFile*):
// it raises an error when a subset is active, asserts that a writer was
// passed and then calls writer->f_write(features, num_vectors) between
// SG_SET_LOCALE_C and SG_RESET_LOCALE. The macro is expanded once per
// supported element type and undefined afterwards.
1943 #define SAVE(f_write, sg_type) \
1944 template<> void CStringFeatures<sg_type>::save(CFile* writer) \
1945 { \
1946  if (m_subset_stack->has_subsets()) \
1947  SG_ERROR("save() is not possible on subset") \
1948  SG_SET_LOCALE_C; \
1949  ASSERT(writer) \
1950  writer->f_write(features, num_vectors); \
1951  SG_RESET_LOCALE; \
1952 }
1953 
1954 SAVE(set_string_list, bool)
1955 SAVE(set_string_list, char)
1956 SAVE(set_string_list, int8_t)
1957 SAVE(set_string_list, uint8_t)
1958 SAVE(set_string_list, int16_t)
1959 SAVE(set_string_list, uint16_t)
1960 SAVE(set_string_list, int32_t)
1961 SAVE(set_string_list, uint32_t)
1962 SAVE(set_string_list, int64_t)
1963 SAVE(set_string_list, uint64_t)
1964 SAVE(set_string_list, float32_t)
1965 SAVE(set_string_list, float64_t)
1966 SAVE(set_string_list, floatmax_t)
1967 #undef SAVE
1968 
// NOTE(review): the line carrying the function name and its first parameters
// (listing line 1970) is not part of this listing; the body uses `sf` and
// `start` in addition to the parameters visible below.
// Rebuilds this object's strings from the source features `sf`: all subsets
// are removed, every source symbol is remapped with the source alphabet's
// remap_to_bin(), and the strings are then translated in place to order
// p_order via CAlphabet::translate_from_single_order[_reversed]().
// Returns false (after SG_ERROR) when the resulting number of symbols does
// not fit into ST, true otherwise.
1969 template <class ST> template <class CT>
1971  int32_t p_order, int32_t gap, bool rev)
1972 {
1973  remove_all_subsets();
1974  ASSERT(sf)
1975 
// alpha is released again with SG_UNREF further down
1976  CAlphabet* alpha=sf->get_alphabet();
1977  ASSERT(alpha->get_num_symbols_in_histogram() > 0)
1978 
1979  this->order=p_order;
1980  cleanup();
1981 
1982  num_vectors=sf->get_num_vectors();
1983  ASSERT(num_vectors>0)
1984  max_string_length=sf->get_max_vector_length()-start;
1985  features=SG_MALLOC(SGString<ST>, num_vectors);
1986 
1987  SG_DEBUG("%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(),
1988  alpha->get_num_symbols_in_histogram());
1989 
// first pass: copy every source vector, remapping symbols to binary codes
1990  for (int32_t i=0; i<num_vectors; i++)
1991  {
1992  int32_t len=-1;
1993  bool vfree;
1994  CT* c=sf->get_feature_vector(i, len, vfree);
1995  ASSERT(!vfree) // won't work when preprocessors are attached
1996 
1997  features[i].string=SG_MALLOC(ST, len);
1998  features[i].slen=len;
1999 
2000  ST* str=features[i].string;
2001  for (int32_t j=0; j<len; j++)
2002  str[j]=(ST) alpha->remap_to_bin(c[j]);
2003  }
2004 
2005  original_num_symbols=alpha->get_num_symbols();
2006  int32_t max_val=alpha->get_num_bits();
2007 
2008  SG_UNREF(alpha);
2009 
// 2^(bits per symbol * order) distinct values for order > 1
2010  if (p_order>1)
2011  num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
2012  else
2013  num_symbols=original_num_symbols;
2014  SG_INFO("max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols)
2015 
2016  if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
2017  {
2018  SG_ERROR("symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val)
2019  return false;
2020  }
2021 
2022  SG_DEBUG("translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST))
// second pass: translate each copied vector in place to the higher order
2023  for (int32_t line=0; line<num_vectors; line++)
2024  {
2025  int32_t len=0;
2026  bool vfree;
2027  ST* fv=get_feature_vector(line, len, vfree);
2028  ASSERT(!vfree) // won't work when preprocessors are attached
2029 
2030  if (rev)
2031  CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
2032  else
2033  CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
2034 
2035  /* fix the length of the string -- hacky */
// the stored length shrinks by start+gap and is clamped at zero
2036  features[line].slen-=start+gap ;
2037  if (features[line].slen<0)
2038  features[line].slen=0 ;
2039  }
2040 
2041  compute_symbol_mask_table(max_val);
2042 
2043  return true;
2044 }
2045 
// Explicit instantiations of the class template for every supported element
// type.
2046 template class CStringFeatures<bool>;
2047 template class CStringFeatures<char>;
2048 template class CStringFeatures<int8_t>;
2049 template class CStringFeatures<uint8_t>;
2050 template class CStringFeatures<int16_t>;
2051 template class CStringFeatures<uint16_t>;
2052 template class CStringFeatures<int32_t>;
2053 template class CStringFeatures<uint32_t>;
2054 template class CStringFeatures<int64_t>;
2055 template class CStringFeatures<uint64_t>;
2056 template class CStringFeatures<float32_t>;
2057 template class CStringFeatures<float64_t>;
2058 template class CStringFeatures<floatmax_t>;
2059 
// Explicit instantiations of the member template obtain_from_char_features:
// targets uint16_t/uint32_t/uint64_t, first from uint8_t sources ...
2060 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2061 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2062 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2063 
// ... then from uint16_t sources.
2064 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2065 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2066 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2067 }

SHOGUN Machine Learning Toolbox - Documentation