SHOGUN  6.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules
StringFeatures.cpp
Go to the documentation of this file.
5 #include <shogun/io/SGIO.h>
9 
10 #include <sys/types.h>
11 #include <sys/stat.h>
12 #include <dirent.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #ifdef _WIN32
16 #include <tchar.h>
17 #include <strsafe.h>
18 #include <vector>
19 #else
20 #include <unistd.h>
21 #endif
22 
23 namespace shogun
24 {
25 
27 {
28  init();
29  alphabet=new CAlphabet();
30 }
31 
33 {
34  init();
35 
36  alphabet=new CAlphabet(alpha);
40 }
41 
42 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha)
43 : CFeatures(0)
44 {
45  init();
46 
47  alphabet=new CAlphabet(alpha);
51  set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
52 }
53 
54 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha)
55 : CFeatures(0)
56 {
57  init();
58 
59  alphabet=new CAlphabet(alpha);
63  set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
64 }
65 
67 : CFeatures(0)
68 {
69  init();
70 
71  ASSERT(alpha)
72  SG_REF(alpha);
73  alphabet=alpha;
76 }
77 
78 template<class ST> CStringFeatures<ST>::CStringFeatures(const CStringFeatures & orig)
79 : CFeatures(orig), num_vectors(orig.num_vectors),
80  single_string(orig.single_string),
81  length_of_single_string(orig.length_of_single_string),
82  max_string_length(orig.max_string_length),
83  num_symbols(orig.num_symbols),
84  original_num_symbols(orig.original_num_symbols),
85  order(orig.order), preprocess_on_get(false),
86  feature_cache(NULL)
87 {
88  init();
89 
90  ASSERT(orig.single_string == NULL) //not implemented
91 
92  alphabet=orig.alphabet;
94 
95  if (orig.features)
96  {
97  features=SG_MALLOC(SGString<ST>, orig.num_vectors);
98 
99  for (int32_t i=0; i<num_vectors; i++)
100  {
101  features[i].string=SG_MALLOC(ST, orig.features[i].slen);
102  features[i].slen=orig.features[i].slen;
103  sg_memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].slen);
104  }
105  }
106 
107  if (orig.symbol_mask_table)
108  {
109  symbol_mask_table=SG_MALLOC(ST, 256);
111 
112  for (int32_t i=0; i<256; i++)
114  }
115 
118 }
119 
120 template<class ST> CStringFeatures<ST>::CStringFeatures(CFile* loader, EAlphabet alpha)
121 : CFeatures(), num_vectors(0),
122  features(NULL), single_string(NULL), length_of_single_string(0),
123  max_string_length(0), order(0),
124  preprocess_on_get(false), feature_cache(NULL)
125 {
126  init();
127 
128  alphabet=new CAlphabet(alpha);
129  SG_REF(alphabet);
132  load(loader);
133 }
134 
136 {
137  cleanup();
138 
139  SG_UNREF(alphabet);
140 }
141 
142 template<class ST> void CStringFeatures<ST>::cleanup()
143 {
144  remove_all_subsets();
145 
146  if (single_string)
147  {
148  SG_FREE(single_string);
149  single_string=NULL;
150  }
151  else
152  cleanup_feature_vectors(0, num_vectors-1);
153 
154  /*
155  if (single_string)
156  {
157  SG_FREE(single_string);
158  single_string=NULL;
159  }
160  else
161  cleanup_feature_vectors(0, num_vectors-1);
162  */
163 
164  num_vectors=0;
165  SG_FREE(features);
166  SG_FREE(symbol_mask_table);
167  features=NULL;
168  symbol_mask_table=NULL;
169 
170  /* start with a fresh alphabet, but instead of emptying the histogram
171  * create a new object (to leave the alphabet object alone if it is used
172  * by others)
173  */
174  CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
175  SG_UNREF(alphabet);
176  alphabet=alpha;
177  SG_REF(alphabet);
178 }
179 
180 template<class ST> void CStringFeatures<ST>::cleanup_feature_vector(int32_t num)
181 {
182  ASSERT(num<get_num_vectors())
183 
184  if (features)
185  {
186  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
187  SG_FREE(features[real_num].string);
188  features[real_num].string=NULL;
189  features[real_num].slen=0;
190 
191  determine_maximum_string_length();
192  }
193 }
194 
195 template<class ST> void CStringFeatures<ST>::cleanup_feature_vectors(int32_t start, int32_t stop)
196 {
197  if (features && get_num_vectors())
198  {
199  ASSERT(start<get_num_vectors())
200  ASSERT(stop<get_num_vectors())
201 
202  for (int32_t i=start; i<=stop; i++)
203  {
204  int32_t real_num=m_subset_stack->subset_idx_conversion(i);
205  SG_FREE(features[real_num].string);
206  features[real_num].string=NULL;
207  features[real_num].slen=0;
208  }
209  determine_maximum_string_length();
210  }
211 }
212 
213 template<class ST> EFeatureClass CStringFeatures<ST>::get_feature_class() const { return C_STRING; }
214 
215 template<class ST> EFeatureType CStringFeatures<ST>::get_feature_type() const { return F_UNKNOWN; }
216 
218 {
219  SG_REF(alphabet);
220  return alphabet;
221 }
222 
223 template<class ST> CFeatures* CStringFeatures<ST>::duplicate() const
224 {
225  return new CStringFeatures<ST>(*this);
226 }
227 
229 {
230  ASSERT(features)
231  if (num>=get_num_vectors())
232  {
233  SG_ERROR("Index out of bounds (number of strings %d, you "
234  "requested %d)\n", get_num_vectors(), num);
235  }
236 
237  int32_t l;
238  bool free_vec;
239  ST* vec=get_feature_vector(num, l, free_vec);
240  ST* dst=SG_MALLOC(ST, l);
241  sg_memcpy(dst, vec, l*sizeof(ST));
242  free_feature_vector(vec, num, free_vec);
243  return SGVector<ST>(dst, l, true);
244 }
245 
246 template<class ST> void CStringFeatures<ST>::set_feature_vector(SGVector<ST> vector, int32_t num)
247 {
248  ASSERT(features)
249 
250  if (m_subset_stack->has_subsets())
251  SG_ERROR("A subset is set, cannot set feature vector\n")
252 
253  if (num>=num_vectors)
254  {
255  SG_ERROR("Index out of bounds (number of strings %d, you "
256  "requested %d)\n", num_vectors, num);
257  }
258 
259  if (vector.vlen<=0)
260  SG_ERROR("String has zero or negative length\n")
261 
262  cleanup_feature_vector(num);
263  features[num].slen=vector.vlen;
264  features[num].string=SG_MALLOC(ST, vector.vlen);
265  sg_memcpy(features[num].string, vector.vector, vector.vlen*sizeof(ST));
266 
267  determine_maximum_string_length();
268 }
269 
271 {
272  preprocess_on_get=true;
273 }
274 
276 {
277  preprocess_on_get=false;
278 }
279 
280 template<class ST> ST* CStringFeatures<ST>::get_feature_vector(int32_t num, int32_t& len, bool& dofree)
281 {
282  ASSERT(features)
283  if (num>=get_num_vectors())
284  SG_ERROR("Requested feature vector with index %d while total num is", num, get_num_vectors())
285 
286  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
287 
288  if (!preprocess_on_get)
289  {
290  dofree=false;
291  len=features[real_num].slen;
292  return features[real_num].string;
293  }
294  else
295  {
296  SG_DEBUG("computing feature vector!\n")
297  ST* feat=compute_feature_vector(num, len);
298  dofree=true;
299 
300  if (get_num_preprocessors())
301  {
302  ST* tmp_feat_before=feat;
303 
304  for (int32_t i=0; i<get_num_preprocessors(); i++)
305  {
306  CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
307  feat=p->apply_to_string(tmp_feat_before, len);
308  SG_UNREF(p);
309  SG_FREE(tmp_feat_before);
310  tmp_feat_before=feat;
311  }
312  }
313  // TODO: implement caching
314  return feat;
315  }
316 }
317 
319 {
320  int32_t num_feat;
321  int32_t num_vec;
322  SGString<ST>* s=get_transposed(num_feat, num_vec);
323  SGStringList<ST> string_list;
324  string_list.strings = s;
325  string_list.num_strings = num_vec;
326  string_list.max_string_length = num_feat;
327 
328  return new CStringFeatures<ST>(string_list, alphabet);
329 }
330 
331 template<class ST> SGString<ST>* CStringFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec)
332 {
333  num_feat=get_num_vectors();
334  num_vec=get_max_vector_length();
335  ASSERT(have_same_length())
336 
337  SG_DEBUG("Allocating memory for transposed string features of size %ld\n",
338  int64_t(num_feat)*num_vec);
339 
340  SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
341 
342  for (int32_t i=0; i<num_vec; i++)
343  {
344  sf[i].string=SG_MALLOC(ST, num_feat);
345  sf[i].slen=num_feat;
346  }
347 
348  for (int32_t i=0; i<num_feat; i++)
349  {
350  int32_t len=0;
351  bool free_vec=false;
352  ST* vec=get_feature_vector(i, len, free_vec);
353 
354  for (int32_t j=0; j<num_vec; j++)
355  sf[j].string[i]=vec[j];
356 
357  free_feature_vector(vec, i, free_vec);
358  }
359  return sf;
360 }
361 
362 template<class ST> void CStringFeatures<ST>::free_feature_vector(ST* feat_vec, int32_t num, bool dofree)
363 {
364  if (num>=get_num_vectors())
365  {
366  SG_ERROR(
367  "Trying to access string[%d] but num_str=%d\n", num,
368  get_num_vectors());
369  }
370 
371  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
372 
373  if (feature_cache)
374  feature_cache->unlock_entry(real_num);
375 
376  if (dofree)
377  SG_FREE(feat_vec);
378 }
379 
380 template<class ST> void CStringFeatures<ST>::free_feature_vector(SGVector<ST> feat_vec, int32_t num)
381 {
382  if (num>=get_num_vectors())
383  {
384  SG_ERROR(
385  "Trying to access string[%d] but num_str=%d\n", num,
386  get_num_vectors());
387  }
388 
389  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
390 
391  if (feature_cache)
392  feature_cache->unlock_entry(real_num);
393 }
394 
395 template<class ST> ST CStringFeatures<ST>::get_feature(int32_t vec_num, int32_t feat_num)
396 {
397  ASSERT(vec_num<get_num_vectors())
398 
399  int32_t len;
400  bool free_vec;
401  ST* vec=get_feature_vector(vec_num, len, free_vec);
402  ASSERT(feat_num<len)
403  ST result=vec[feat_num];
404  free_feature_vector(vec, vec_num, free_vec);
405 
406  return result;
407 }
408 
409 template<class ST> int32_t CStringFeatures<ST>::get_vector_length(int32_t vec_num)
410 {
411  ASSERT(vec_num<get_num_vectors())
412 
413  int32_t len;
414  bool free_vec;
415  ST* vec=get_feature_vector(vec_num, len, free_vec);
416  free_feature_vector(vec, vec_num, free_vec);
417  return len;
418 }
419 
420 template<class ST> int32_t CStringFeatures<ST>::get_max_vector_length()
421 {
422  return max_string_length;
423 }
424 
425 template<class ST> int32_t CStringFeatures<ST>::get_num_vectors() const
426 {
427  return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : num_vectors;
428 }
429 
430 template<class ST> floatmax_t CStringFeatures<ST>::get_num_symbols() { return num_symbols; }
431 
432 template<class ST> floatmax_t CStringFeatures<ST>::get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
433 
434 template<class ST> floatmax_t CStringFeatures<ST>::get_original_num_symbols() { return original_num_symbols; }
435 
436 template<class ST> int32_t CStringFeatures<ST>::get_order() { return order; }
437 
438 template<class ST> ST CStringFeatures<ST>::get_masked_symbols(ST symbol, uint8_t mask)
439 {
440  ASSERT(symbol_mask_table)
441  return symbol_mask_table[mask] & symbol;
442 }
443 
444 template<class ST> ST CStringFeatures<ST>::shift_offset(ST offset, int32_t amount)
445 {
446  ASSERT(alphabet)
447  return (offset << (amount*alphabet->get_num_bits()));
448 }
449 
450 template<class ST> ST CStringFeatures<ST>::shift_symbol(ST symbol, int32_t amount)
451 {
452  ASSERT(alphabet)
453  return (symbol >> (amount*alphabet->get_num_bits()));
454 }
455 
456 template<class ST> void CStringFeatures<ST>::load_ascii_file(char* fname, bool remap_to_bin,
457  EAlphabet ascii_alphabet, EAlphabet binary_alphabet)
458 {
459  remove_all_subsets();
460 
461  size_t blocksize=1024*1024;
462  size_t required_blocksize=0;
463  uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
464  uint8_t* overflow=NULL;
465  int32_t overflow_len=0;
466 
467  cleanup();
468 
469  CAlphabet* alpha=new CAlphabet(ascii_alphabet);
470  CAlphabet* alpha_bin=new CAlphabet(binary_alphabet);
471 
472  FILE* f=fopen(fname, "ro");
473 
474  if (f)
475  {
476  num_vectors=0;
477  max_string_length=0;
478 
479  SG_INFO("counting line numbers in file %s\n", fname)
480  size_t block_offs=0;
481  size_t old_block_offs=0;
482  fseek(f, 0, SEEK_END);
483  size_t fsize=ftell(f);
484  rewind(f);
485 
486  if (blocksize>fsize)
487  blocksize=fsize;
488 
489  SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize)
490 
491  size_t sz=blocksize;
492  while (sz == blocksize)
493  {
494  sz=fread(dummy, sizeof(uint8_t), blocksize, f);
495  for (size_t i=0; i<sz; i++)
496  {
497  block_offs++;
498  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
499  {
500  num_vectors++;
501  required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
502  old_block_offs=block_offs;
503  }
504  }
505  SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t")
506  }
507 
508  SG_INFO("found %d strings\n", num_vectors)
509  SG_FREE(dummy);
510  blocksize=required_blocksize;
511  dummy=SG_MALLOC(uint8_t, blocksize);
512  overflow=SG_MALLOC(uint8_t, blocksize);
513  features=SG_MALLOC(SGString<ST>, num_vectors);
514 
515  rewind(f);
516  sz=blocksize;
517  int32_t lines=0;
518  while (sz == blocksize)
519  {
520  sz=fread(dummy, sizeof(uint8_t), blocksize, f);
521 
522  size_t old_sz=0;
523  for (size_t i=0; i<sz; i++)
524  {
525  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
526  {
527  int32_t len=i-old_sz;
528  //SG_PRINT("i:%d len:%d old_sz:%d\n", i, len, old_sz)
529  max_string_length=CMath::max(max_string_length, len+overflow_len);
530 
531  features[lines].slen=len;
532  features[lines].string=SG_MALLOC(ST, len);
533 
534  if (remap_to_bin)
535  {
536  for (int32_t j=0; j<overflow_len; j++)
537  features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
538  for (int32_t j=0; j<len; j++)
539  features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
540  alpha->add_string_to_histogram(&dummy[old_sz], len);
541  alpha_bin->add_string_to_histogram(features[lines].string, features[lines].slen);
542  }
543  else
544  {
545  for (int32_t j=0; j<overflow_len; j++)
546  features[lines].string[j]=overflow[j];
547  for (int32_t j=0; j<len; j++)
548  features[lines].string[j+overflow_len]=dummy[old_sz+j];
549  alpha->add_string_to_histogram(&dummy[old_sz], len);
550  alpha->add_string_to_histogram(features[lines].string, features[lines].slen);
551  }
552 
553  // clear overflow
554  overflow_len=0;
555 
556  //CMath::display_vector(features[lines].string, len);
557  old_sz=i+1;
558  lines++;
559  SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t")
560  }
561  }
562  for (size_t i=old_sz; i<sz; i++)
563  overflow[i-old_sz]=dummy[i];
564 
565  overflow_len=sz-old_sz;
566  }
567 
568  if (alpha->check_alphabet_size() && alpha->check_alphabet())
569  {
570  SG_INFO("file successfully read\n")
571  SG_INFO("max_string_length=%d\n", max_string_length)
572  SG_INFO("num_strings=%d\n", num_vectors)
573  }
574  fclose(f);
575  }
576 
577  SG_FREE(dummy);
578  SG_FREE(overflow);
579 
580  SG_UNREF(alphabet);
581 
582  if (remap_to_bin)
583  {
584  alphabet=alpha_bin;
585  SG_UNREF(alpha);
586  }
587  else
588  {
589  alphabet=alpha;
590  SG_UNREF(alpha_bin);
591  }
592  SG_REF(alphabet);
593  num_symbols=alphabet->get_num_symbols();
594 }
595 
596 template<class ST> bool CStringFeatures<ST>::load_fasta_file(const char* fname, bool ignore_invalid)
597 {
598  remove_all_subsets();
599 
600  int32_t i=0;
601  uint64_t len=0;
602  uint64_t offs=0;
603  int32_t num=0;
604  int32_t max_len=0;
605 
606  CMemoryMappedFile<char> f(fname);
607 
608  while (true)
609  {
610  char* s=f.get_line(len, offs);
611  if (!s)
612  break;
613 
614  if (len>0 && s[0]=='>')
615  num++;
616  }
617 
618  if (num==0)
619  SG_ERROR("No fasta hunks (lines starting with '>') found\n")
620 
621  cleanup();
622  SG_UNREF(alphabet);
623  alphabet=new CAlphabet(DNA);
624  num_symbols=alphabet->get_num_symbols();
625 
626  SGString<ST>* strings=SG_MALLOC(SGString<ST>, num);
627  offs=0;
628 
629  for (i=0;i<num; i++)
630  {
631  uint64_t id_len=0;
632  char* id=f.get_line(id_len, offs);
633 
634  char* fasta=f.get_line(len, offs);
635  char* s=fasta;
636  int32_t fasta_len=0;
637  int32_t spanned_lines=0;
638 
639  while (true)
640  {
641  if (!s || len==0)
642  SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len)
643 
644  if (s[0]=='>' || offs==f.get_size())
645  {
646  offs-=len+1; // seek to beginning
647  if (offs==f.get_size())
648  {
649  SG_DEBUG("at EOF\n")
650  fasta_len+=len;
651  }
652 
653  len=fasta_len-spanned_lines;
654  strings[i].string=SG_MALLOC(ST, len);
655  strings[i].slen=len;
656 
657  ST* str=strings[i].string;
658  int32_t idx=0;
659  SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines)
660 
661  for (int32_t j=0; j<fasta_len; j++)
662  {
663  if (fasta[j]=='\n')
664  continue;
665 
666  ST c=(ST) fasta[j];
667 
668  if (ignore_invalid && !alphabet->is_valid((uint8_t) fasta[j]))
669  c=(ST) 'A';
670 
671  if (uint64_t(idx)>=len)
672  SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str)
673  str[idx++]=c;
674  }
675  max_len=CMath::max(max_len, strings[i].slen);
676 
677 
678  break;
679  }
680 
681  spanned_lines++;
682  fasta_len+=len+1; // including '\n'
683  s=f.get_line(len, offs);
684  }
685  }
686  return set_features(strings, num, max_len);
687 }
688 
689 template<class ST> bool CStringFeatures<ST>::load_fastq_file(const char* fname,
690  bool ignore_invalid, bool bitremap_in_single_string)
691 {
692  remove_all_subsets();
693 
694  CMemoryMappedFile<char> f(fname);
695 
696  int32_t i=0;
697  uint64_t len=0;
698  uint64_t offs=0;
699 
700  int32_t num=f.get_num_lines();
701  int32_t max_len=0;
702 
703  if (num%4)
704  SG_ERROR("Number of lines must be divisible by 4 in fastq files\n")
705  num/=4;
706 
707  cleanup();
708  SG_UNREF(alphabet);
709  alphabet=new CAlphabet(DNA);
710 
711  SGString<ST>* strings;
712 
713  ST* str=NULL;
714  if (bitremap_in_single_string)
715  {
716  strings=SG_MALLOC(SGString<ST>, 1);
717  strings[0].string=SG_MALLOC(ST, num);
718  strings[0].slen=num;
719  f.get_line(len, offs);
720  f.get_line(len, offs);
721  order=len;
722  max_len=num;
723  offs=0;
724  original_num_symbols=alphabet->get_num_symbols();
725  str=SG_MALLOC(ST, len);
726  }
727  else
728  strings=SG_MALLOC(SGString<ST>, num);
729 
730  for (i=0;i<num; i++)
731  {
732  if (!f.get_line(len, offs))
733  SG_ERROR("Error reading 'read' identifier in line %d", 4*i)
734 
735  char* s=f.get_line(len, offs);
736  if (!s || len==0)
737  SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len)
738 
739  if (bitremap_in_single_string)
740  {
741  if (len!=(uint64_t) order)
742  SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len)
743  for (int32_t j=0; j<order; j++)
744  str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
745 
746  strings[0].string[i]=embed_word(str, order);
747  }
748  else
749  {
750  strings[i].string=SG_MALLOC(ST, len);
751  strings[i].slen=len;
752  str=strings[i].string;
753 
754  if (ignore_invalid)
755  {
756  for (uint64_t j=0; j<len; j++)
757  {
758  if (alphabet->is_valid((uint8_t) s[j]))
759  str[j]= (ST) s[j];
760  else
761  str[j]= (ST) 'A';
762  }
763  }
764  else
765  {
766  for (uint64_t j=0; j<len; j++)
767  str[j]= (ST) s[j];
768  }
769  max_len=CMath::max(max_len, (int32_t) len);
770  }
771 
772 
773  if (!f.get_line(len, offs))
774  SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2)
775 
776  if (!f.get_line(len, offs))
777  SG_ERROR("Error reading 'read' quality in line %d", 4*i+3)
778  }
779 
780  if (bitremap_in_single_string)
781  num=1;
782 
783  num_vectors=num;
784  max_string_length=max_len;
785  features=strings;
786 
787  return true;
788 }
789 
790 template<class ST> bool CStringFeatures<ST>::load_from_directory(char* dirname)
791 {
792  remove_all_subsets();
793 
794  struct dirent **namelist;
795  int32_t n;
796 
797  SGIO::set_dirname(dirname);
798 
799  SG_DEBUG("dirname '%s'\n", dirname)
800 
801 #ifdef _WIN32
802  TCHAR search_dir[MAX_PATH];
803  WIN32_FIND_DATA ffd;
804  LARGE_INTEGER filesize;
805  HANDLE h_find = INVALID_HANDLE_VALUE;
806 
807  StringCchCopy(search_dir, MAX_PATH, dirname);
808  StringCchCat(search_dir, MAX_PATH, TEXT("\\*"));
809 
810  h_find = FindFirstFile(search_dir, &ffd);
811  if (INVALID_HANDLE_VALUE == h_find)
812  {
813  SG_ERROR("Error finding finds in %s\n", dirname)
814  return false;
815  }
816 
817  std::vector<struct dirent*> files;
818  do
819  {
820  if (ffd.dwFileAttributes & FILE_ATTRIBUTE_NORMAL)
821  {
822  struct dirent* d = SG_MALLOC(struct dirent, 1);
823  StringCchCopy(d->d_name, MAX_PATH, ffd.cFileName);
824  files.push_back(d);
825  n++;
826  }
827  }
828  while (FindNextFile(h_find, &ffd) != 0);
829  namelist = &files[0];
830  FindClose(h_find);
831 #else
832  n=scandir(dirname, &namelist, &SGIO::filter, alphasort);
833 #endif
834  if (n <= 0)
835  {
836  SG_ERROR("error calling scandir - no files found\n")
837  return false;
838  }
839  else
840  {
841  SGString<ST>* strings=NULL;
842 
843  int32_t num=0;
844  int32_t max_len=-1;
845 
846  //usually n==num_vec, but it might not in race conditions
847  //(file perms modified, file erased)
848  strings=SG_MALLOC(SGString<ST>, n);
849 
850  for (int32_t i=0; i<n; i++)
851  {
852  char* fname=SGIO::concat_filename(namelist[i]->d_name);
853 
854  struct stat s;
855  off_t filesize=0;
856 
857  if (!stat(fname, &s) && s.st_size>0)
858  {
859  filesize=s.st_size/sizeof(ST);
860 
861  FILE* f=fopen(fname, "ro");
862  if (f)
863  {
864  ST* str=SG_MALLOC(ST, filesize);
865  SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize)
866  if (fread(str, sizeof(ST), filesize, f)!=(size_t) filesize)
867  SG_ERROR("failed to read file\n")
868  strings[num].string=str;
869  strings[num].slen=filesize;
870  max_len=CMath::max(max_len, strings[num].slen);
871 
872  num++;
873  fclose(f);
874  }
875  }
876  else
877  SG_ERROR("empty or non readable file \'%s\'\n", fname)
878 
879  SG_FREE(namelist[i]);
880  }
881  SG_FREE(namelist);
882 
883  if (num>0 && strings)
884  {
885  set_features(strings, num, max_len);
886  return true;
887  }
888  }
889 
890  return false;
891 }
892 
894 {
895  set_features(feats.strings, feats.num_strings, feats.max_string_length);
896 }
897 
898 template<class ST> bool CStringFeatures<ST>::set_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
899 {
900  if (m_subset_stack->has_subsets())
901  SG_ERROR("Cannot call set_features() with subset.\n")
902 
903  if (p_features)
904  {
905  CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
906 
907  //compute histogram for char/byte
908  for (int32_t i=0; i<p_num_vectors; i++)
909  alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
910 
911  SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram())
912  SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram())
913 
914  if (alpha->check_alphabet_size() && alpha->check_alphabet())
915  {
916  cleanup();
917  SG_UNREF(alphabet);
918 
919  alphabet=alpha;
920  SG_REF(alphabet);
921 
922  // TODO remove copying
923  features = SG_MALLOC(SGString<ST>,p_num_vectors);
924  sg_memcpy(features,p_features,sizeof(SGString<ST>)*p_num_vectors);
925  num_vectors = p_num_vectors;
926  max_string_length = p_max_string_length;
927 
928  return true;
929  }
930  else
931  SG_UNREF(alpha);
932  }
933 
934  return false;
935 }
936 
938 {
939  ASSERT(sf)
940 
941  if (m_subset_stack->has_subsets())
942  SG_ERROR("Cannot call set_features() with subset.\n")
943 
944  SGString<ST>* new_features=SG_MALLOC(SGString<ST>, sf->get_num_vectors());
945 
946  index_t sf_num_str=sf->get_num_vectors();
947  for (int32_t i=0; i<sf_num_str; i++)
948  {
949  int32_t real_i = sf->m_subset_stack->subset_idx_conversion(i);
950  int32_t length=sf->features[real_i].slen;
951  new_features[i].string=SG_MALLOC(ST, length);
952  sg_memcpy(new_features[i].string, sf->features[real_i].string, length);
953  new_features[i].slen=length;
954  }
955  return append_features(new_features, sf_num_str,
956  sf->max_string_length);
957 }
958 
959 template<class ST> bool CStringFeatures<ST>::append_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
960 {
961  if (m_subset_stack->has_subsets())
962  SG_ERROR("Cannot call set_features() with subset.\n")
963 
964  if (!features)
965  return set_features(p_features, p_num_vectors, p_max_string_length);
966 
967  CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
968 
969  //compute histogram for char/byte
970  for (int32_t i=0; i<p_num_vectors; i++)
971  alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
972 
973  SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram())
974  SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram())
975 
976  if (alpha->check_alphabet_size() && alpha->check_alphabet())
977  {
978  SG_UNREF(alpha);
979  for (int32_t i=0; i<p_num_vectors; i++)
980  alphabet->add_string_to_histogram( p_features[i].string, p_features[i].slen);
981 
982  int32_t old_num_vectors=num_vectors;
983  num_vectors=old_num_vectors+p_num_vectors;
984  SGString<ST>* new_features=SG_MALLOC(SGString<ST>, num_vectors);
985 
986  for (int32_t i=0; i<num_vectors; i++)
987  {
988  if (i<old_num_vectors)
989  {
990  new_features[i].string=features[i].string;
991  new_features[i].slen=features[i].slen;
992  }
993  else
994  {
995  new_features[i].string=p_features[i-old_num_vectors].string;
996  new_features[i].slen=p_features[i-old_num_vectors].slen;
997  }
998  }
999  SG_FREE(features);
1000  SG_FREE(p_features); // free now obsolete features
1001 
1002  this->features=new_features;
1003  max_string_length=CMath::max(max_string_length, p_max_string_length);
1004 
1005  return true;
1006  }
1007  SG_UNREF(alpha);
1008 
1009  return false;
1010 }
1011 
1013 {
1014  SGStringList<ST> sl(NULL,0,0,false);
1015 
1016  sl.strings=get_features(sl.num_strings, sl.max_string_length);
1017  return sl;
1018 }
1019 
1020 template<class ST> SGString<ST>* CStringFeatures<ST>::get_features(int32_t& num_str, int32_t& max_str_len)
1021 {
1022  if (m_subset_stack->has_subsets())
1023  SG_ERROR("get features() is not possible on subset")
1024 
1025  num_str=num_vectors;
1026  max_str_len=max_string_length;
1027  return features;
1028 }
1029 
1030 template<class ST> SGString<ST>* CStringFeatures<ST>::copy_features(int32_t& num_str, int32_t& max_str_len)
1031 {
1032  ASSERT(num_vectors>0)
1033 
1034  num_str=get_num_vectors();
1035  max_str_len=max_string_length;
1036  SGString<ST>* new_feat=SG_MALLOC(SGString<ST>, num_str);
1037 
1038  for (int32_t i=0; i<num_str; i++)
1039  {
1040  int32_t len;
1041  bool free_vec;
1042  ST* vec=get_feature_vector(i, len, free_vec);
1043  new_feat[i].string=SG_MALLOC(ST, len);
1044  new_feat[i].slen=len;
1045  sg_memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST));
1046  free_feature_vector(vec, i, free_vec);
1047  }
1048 
1049  return new_feat;
1050 }
1051 
1052 template<class ST> void CStringFeatures<ST>::get_features(SGString<ST>** dst, int32_t* num_str)
1053 {
1054  int32_t num_vec;
1055  int32_t max_str_len;
1056  *dst=copy_features(num_vec, max_str_len);
1057  *num_str=num_vec;
1058 }
1059 
1060 template<class ST> bool CStringFeatures<ST>::load_compressed(char* src, bool decompress)
1061 {
1062  remove_all_subsets();
1063 
1064  FILE* file=NULL;
1065 
1066  if (!(file=fopen(src, "r")))
1067  return false;
1068  cleanup();
1069 
1070  // header shogun v0
1071  char id[4];
1072  if (fread(&id[0], sizeof(char), 1, file)!=1)
1073  SG_ERROR("failed to read header")
1074  ASSERT(id[0]=='S')
1075  if (fread(&id[1], sizeof(char), 1, file)!=1)
1076  SG_ERROR("failed to read header")
1077  ASSERT(id[1]=='G')
1078  if (fread(&id[2], sizeof(char), 1, file)!=1)
1079  SG_ERROR("failed to read header")
1080  ASSERT(id[2]=='V')
1081  if (fread(&id[3], sizeof(char), 1, file)!=1)
1082  SG_ERROR("failed to read header")
1083  ASSERT(id[3]=='0')
1084 
1085  //compression type
1086  uint8_t c;
1087  if (fread(&c, sizeof(uint8_t), 1, file)!=1)
1088  SG_ERROR("failed to read compression type")
1089  CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c);
1090  //alphabet
1091  uint8_t a;
1092  delete alphabet;
1093  if (fread(&a, sizeof(uint8_t), 1, file)!=1)
1094  SG_ERROR("failed to read compression alphabet")
1095  alphabet=new CAlphabet((EAlphabet) a);
1096  // number of vectors
1097  if (fread(&num_vectors, sizeof(int32_t), 1, file)!=1)
1098  SG_ERROR("failed to read compression number of vectors")
1099  ASSERT(num_vectors>0)
1100  // maximum string length
1101  if (fread(&max_string_length, sizeof(int32_t), 1, file)!=1)
1102  SG_ERROR("failed to read maximum string length")
1103  ASSERT(max_string_length>0)
1104 
1105  features=SG_MALLOC(SGString<ST>, num_vectors);
1106 
1107  // vectors
1108  for (int32_t i=0; i<num_vectors; i++)
1109  {
1110  // vector len compressed
1111  int32_t len_compressed;
1112  if (fread(&len_compressed, sizeof(int32_t), 1, file)!=1)
1113  SG_ERROR("failed to read vector length compressed")
1114  // vector len uncompressed
1115  int32_t len_uncompressed;
1116  if (fread(&len_uncompressed, sizeof(int32_t), 1, file)!=1)
1117  SG_ERROR("failed to read vector length uncompressed")
1118 
1119  // vector raw data
1120  if (decompress)
1121  {
1122  features[i].string=SG_MALLOC(ST, len_uncompressed);
1123  features[i].slen=len_uncompressed;
1124  uint8_t* compressed=SG_MALLOC(uint8_t, len_compressed);
1125  if (fread(compressed, sizeof(uint8_t), len_compressed, file)!=(size_t) len_compressed)
1126  SG_ERROR("failed to read compressed data (expected %d bytes)", len_compressed)
1127  uint64_t uncompressed_size=len_uncompressed;
1128  uncompressed_size*=sizeof(ST);
1129  compressor->decompress(compressed, len_compressed,
1130  (uint8_t*) features[i].string, uncompressed_size);
1131  SG_FREE(compressed);
1132  ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST))
1133  }
1134  else
1135  {
1136  int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST));
1137  features[i].string=SG_MALLOC(ST, len_compressed+offs);
1138  features[i].slen=len_compressed+offs;
1139  int32_t* feat32ptr=((int32_t*) (features[i].string));
1140  memset(features[i].string, 0, offs*sizeof(ST));
1141  feat32ptr[0]=(int32_t) len_compressed;
1142  feat32ptr[1]=(int32_t) len_uncompressed;
1143  uint8_t* compressed=(uint8_t*) (&features[i].string[offs]);
1144  if (fread(compressed, 1, len_compressed, file)!=(size_t) len_compressed)
1145  SG_ERROR("failed to read uncompressed data")
1146  }
1147  }
1148 
1149  delete compressor;
1150  fclose(file);
1151 
1152  return false;
1153 }
1154 
1155 template<class ST> bool CStringFeatures<ST>::save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level)
1156 {
1157  if (m_subset_stack->has_subsets())
1158  SG_ERROR("save_compressed() is not possible on subset")
1159 
1160  FILE* file=NULL;
1161 
1162  if (!(file=fopen(dest, "wb")))
1163  return false;
1164 
1165  CCompressor* compressor= new CCompressor(compression);
1166 
1167  // header shogun v0
1168  const char* id="SGV0";
1169  fwrite(&id[0], sizeof(char), 1, file);
1170  fwrite(&id[1], sizeof(char), 1, file);
1171  fwrite(&id[2], sizeof(char), 1, file);
1172  fwrite(&id[3], sizeof(char), 1, file);
1173 
1174  //compression type
1175  uint8_t c=(uint8_t) compression;
1176  fwrite(&c, sizeof(uint8_t), 1, file);
1177  //alphabet
1178  uint8_t a=(uint8_t) alphabet->get_alphabet();
1179  fwrite(&a, sizeof(uint8_t), 1, file);
1180  // number of vectors
1181  fwrite(&num_vectors, sizeof(int32_t), 1, file);
1182  // maximum string length
1183  fwrite(&max_string_length, sizeof(int32_t), 1, file);
1184 
1185  // vectors
1186  for (int32_t i=0; i<num_vectors; i++)
1187  {
1188  int32_t len=-1;
1189  bool vfree;
1190  ST* vec=get_feature_vector(i, len, vfree);
1191 
1192  uint8_t* compressed=NULL;
1193  uint64_t compressed_size=0;
1194 
1195  compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST),
1196  compressed, compressed_size, level);
1197 
1198  int32_t len_compressed=(int32_t) compressed_size;
1199  // vector len compressed in bytes
1200  fwrite(&len_compressed, sizeof(int32_t), 1, file);
1201  // vector len uncompressed in number of elements of type ST
1202  fwrite(&len, sizeof(int32_t), 1, file);
1203  // vector raw data
1204  fwrite(compressed, compressed_size, 1, file);
1205  SG_FREE(compressed);
1206 
1207  free_feature_vector(vec, i, vfree);
1208  }
1209 
1210  delete compressor;
1211  fclose(file);
1212  return true;
1213 }
1214 
1215 template<class ST> bool CStringFeatures<ST>::apply_preprocessor(bool force_preprocessing)
1216 {
1217  SG_DEBUG("force: %d\n", force_preprocessing)
1218 
1219  for (int32_t i=0; i<get_num_preprocessors(); i++)
1220  {
1221  if ( (!is_preprocessed(i) || force_preprocessing) )
1222  {
1223  set_preprocessed(i);
1224  CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
1225  SG_INFO("preprocessing using preproc %s\n", p->get_name())
1226 
1227  if (!p->apply_to_string_features(this))
1228  {
1229  SG_UNREF(p);
1230  return false;
1231  }
1232  else
1233  SG_UNREF(p);
1234  }
1235  }
1236  return true;
1237 }
1238 
1239 template<class ST> int32_t CStringFeatures<ST>::obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip)
1240 {
1241  if (m_subset_stack->has_subsets())
1243 
1244  ASSERT(step_size>0)
1245  ASSERT(window_size>0)
1246  ASSERT(num_vectors==1 || single_string)
1247  ASSERT(max_string_length>=window_size ||
1248  (single_string && length_of_single_string>=window_size));
1249 
1250  //in case we are dealing with a single remapped string
1251  //allow remapping
1252  if (single_string)
1253  num_vectors= (length_of_single_string-window_size)/step_size + 1;
1254  else if (num_vectors==1)
1255  {
1256  num_vectors= (max_string_length-window_size)/step_size + 1;
1257  length_of_single_string=max_string_length;
1258  }
1259 
1260  SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
1261  int32_t offs=0;
1262  for (int32_t i=0; i<num_vectors; i++)
1263  {
1264  f[i].string=&features[0].string[offs+skip];
1265  f[i].slen=window_size-skip;
1266  offs+=step_size;
1267  }
1268  single_string=features[0].string;
1269  SG_FREE(features);
1270  features=f;
1271  max_string_length=window_size-skip;
1272 
1273  return num_vectors;
1274 }
1275 
1276 template<class ST> int32_t CStringFeatures<ST>::obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
1277  int32_t skip)
1278 {
1279  if (m_subset_stack->has_subsets())
1281 
1282  ASSERT(positions)
1283  ASSERT(window_size>0)
1284  ASSERT(num_vectors==1 || single_string)
1285  ASSERT(max_string_length>=window_size ||
1286  (single_string && length_of_single_string>=window_size));
1287 
1288  num_vectors= positions->get_num_elements();
1289  ASSERT(num_vectors>0)
1290 
1291  int32_t len;
1292 
1293  //in case we are dealing with a single remapped string
1294  //allow remapping
1295  if (single_string)
1296  len=length_of_single_string;
1297  else
1298  {
1299  single_string=features[0].string;
1300  len=max_string_length;
1301  length_of_single_string=max_string_length;
1302  }
1303 
1304  SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
1305  for (int32_t i=0; i<num_vectors; i++)
1306  {
1307  int32_t p=positions->get_element(i);
1308 
1309  if (p>=0 && p<=len-window_size)
1310  {
1311  f[i].string=&features[0].string[p+skip];
1312  f[i].slen=window_size-skip;
1313  }
1314  else
1315  {
1316  num_vectors=1;
1317  max_string_length=len;
1318  features[0].slen=len;
1319  single_string=NULL;
1320  SG_FREE(f);
1321  SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
1322  window_size, i, p, len);
1323  return -1;
1324  }
1325  }
1326 
1327  SG_FREE(features);
1328  features=f;
1329  max_string_length=window_size-skip;
1330 
1331  return num_vectors;
1332 }
1333 
1334 template<class ST> bool CStringFeatures<ST>::obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1335 {
1336  return obtain_from_char_features(sf, start, p_order, gap, rev);
1337 }
1338 
1339 template<class ST> bool CStringFeatures<ST>::have_same_length(int32_t len)
1340 {
1341  if (len!=-1)
1342  {
1343  if (len!=max_string_length)
1344  return false;
1345  }
1346  len=max_string_length;
1347 
1348  index_t num_str=get_num_vectors();
1349  for (int32_t i=0; i<num_str; i++)
1350  {
1351  if (get_vector_length(i)!=len)
1352  return false;
1353  }
1354 
1355  return true;
1356 }
1357 
1358 template<class ST> void CStringFeatures<ST>::embed_features(int32_t p_order)
1359 {
1360  if (m_subset_stack->has_subsets())
1362 
1363  ASSERT(alphabet->get_num_symbols_in_histogram() > 0)
1364 
1365  order=p_order;
1366  original_num_symbols=alphabet->get_num_symbols();
1367  int32_t max_val=alphabet->get_num_bits();
1368 
1369  if (p_order>1)
1370  num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
1371  else
1372  num_symbols=original_num_symbols;
1373 
1374  SG_INFO("max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols)
1375 
1376  if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
1377  SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val)
1378 
1379  ST mask=0;
1380  for (int32_t i=0; i<p_order*max_val; i++)
1381  mask= (mask<<1) | ((ST) 1);
1382 
1383  for (int32_t i=0; i<num_vectors; i++)
1384  {
1385  int32_t len=features[i].slen;
1386 
1387  if (len < p_order)
1388  SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order)
1389 
1390  ST* str=features[i].string;
1391 
1392  // convert first word
1393  for (int32_t j=0; j<p_order; j++)
1394  str[j]=(ST) alphabet->remap_to_bin(str[j]);
1395  str[0]=embed_word(&str[0], p_order);
1396 
1397  // convert the rest
1398  int32_t idx=0;
1399  for (int32_t j=p_order; j<len; j++)
1400  {
1401  str[j]=(ST) alphabet->remap_to_bin(str[j]);
1402  str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
1403  idx++;
1404  }
1405 
1406  features[i].slen=len-p_order+1;
1407  }
1408 
1409  compute_symbol_mask_table(max_val);
1410 }
1411 
1412 template<class ST> void CStringFeatures<ST>::compute_symbol_mask_table(int64_t max_val)
1413 {
1414  if (m_subset_stack->has_subsets())
1416 
1417  SG_FREE(symbol_mask_table);
1418  symbol_mask_table=SG_MALLOC(ST, 256);
1419  symbol_mask_table_len=256;
1420 
1421  uint64_t mask=0;
1422  for (int32_t i=0; i< (int64_t) max_val; i++)
1423  mask=(mask<<1) | 1;
1424 
1425  for (int32_t i=0; i<256; i++)
1426  {
1427  uint8_t bits=(uint8_t) i;
1428  symbol_mask_table[i]=0;
1429 
1430  for (int32_t j=0; j<8; j++)
1431  {
1432  if (bits & 1)
1433  symbol_mask_table[i]|=mask<<(max_val*j);
1434 
1435  bits>>=1;
1436  }
1437  }
1438 }
1439 
1440 template<class ST> void CStringFeatures<ST>::unembed_word(ST word, uint8_t* seq, int32_t len)
1441 {
1442  uint32_t nbits= (uint32_t) alphabet->get_num_bits();
1443 
1444  ST mask=0;
1445  for (uint32_t i=0; i<nbits; i++)
1446  mask=(mask<<1) | (ST) 1;
1447 
1448  for (int32_t i=0; i<len; i++)
1449  {
1450  ST w=(word & mask);
1451  seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
1452  word>>=nbits;
1453  }
1454 }
1455 
1456 template<class ST> ST CStringFeatures<ST>::embed_word(ST* seq, int32_t len)
1457 {
1458  ST value=(ST) 0;
1459  uint32_t nbits= (uint32_t) alphabet->get_num_bits();
1460  for (int32_t i=0; i<len; i++)
1461  {
1462  value<<=nbits;
1463  value|=seq[i];
1464  }
1465 
1466  return value;
1467 }
1468 
1470 {
1471  max_string_length=0;
1472  index_t num_str=get_num_vectors();
1473 
1474  for (int32_t i=0; i<num_str; i++)
1475  {
1476  max_string_length=CMath::max(max_string_length,
1477  features[m_subset_stack->subset_idx_conversion(i)].slen);
1478  }
1479 }
1480 
1482 {
1483  int32_t l=str.slen;
1484  ST* s=SG_MALLOC(ST, l+1);
1485  sg_memcpy(s, str.string, sizeof(ST)*l);
1486  s[l]='\0';
1487  return s;
1488 }
1489 
1490 template<class ST> void CStringFeatures<ST>::set_feature_vector(int32_t num, ST* string, int32_t len)
1491 {
1492  ASSERT(features)
1493  ASSERT(num<get_num_vectors())
1494 
1495  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
1496 
1497 
1498  features[real_num].slen=len ;
1499  features[real_num].string=string ;
1500 
1501  max_string_length=CMath::max(len, max_string_length);
1502 }
1503 
1504 template<class ST> void CStringFeatures<ST>::get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize)
1505 {
1506  int32_t nsym=get_num_symbols();
1507  int32_t slen=get_max_vector_length();
1508  int64_t sz=int64_t(nsym)*slen*sizeof(float64_t);
1509  float64_t* h= SG_MALLOC(float64_t, sz);
1510  memset(h, 0, sz);
1511 
1512  float64_t* h_normalizer=SG_MALLOC(float64_t, slen);
1513  memset(h_normalizer, 0, slen*sizeof(float64_t));
1514  int32_t num_str=get_num_vectors();
1515  for (int32_t i=0; i<num_str; i++)
1516  {
1517  int32_t len;
1518  bool free_vec;
1519  ST* vec=get_feature_vector(i, len, free_vec);
1520  for (int32_t j=0; j<len; j++)
1521  {
1522  h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++;
1523  h_normalizer[j]++;
1524  }
1525  free_feature_vector(vec, i, free_vec);
1526  }
1527 
1528  if (normalize)
1529  {
1530  for (int32_t i=0; i<slen; i++)
1531  {
1532  for (int32_t j=0; j<nsym; j++)
1533  {
1534  if (h_normalizer && h_normalizer[i])
1535  h[int64_t(i)*nsym+j]/=h_normalizer[i];
1536  }
1537  }
1538  }
1539  SG_FREE(h_normalizer);
1540 
1541  *hist=h;
1542  *rows=nsym;
1543  *cols=slen;
1544 }
1545 
1546 template<class ST> void CStringFeatures<ST>::create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec)
1547 {
1548  ASSERT(rows == get_num_symbols())
1549  cleanup();
1550  float64_t* randoms=SG_MALLOC(float64_t, cols);
1551  SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
1552 
1553  for (int32_t i=0; i<num_vec; i++)
1554  {
1555  sf[i].string=SG_MALLOC(ST, cols);
1556  sf[i].slen=cols;
1557 
1558  SGVector<float64_t>::random_vector(randoms, cols, 0.0, 1.0);
1559 
1560  for (int32_t j=0; j<cols; j++)
1561  {
1562  float64_t lik=hist[int64_t(j)*rows+0];
1563 
1564  int32_t c;
1565  for (c=0; c<rows-1; c++)
1566  {
1567  if (randoms[j]<=lik)
1568  break;
1569  lik+=hist[int64_t(j)*rows+c+1];
1570  }
1571  sf[i].string[j]=alphabet->remap_to_char(c);
1572  }
1573  }
1574  SG_FREE(randoms);
1575  set_features(sf, num_vec, cols);
1576 }
1577 
1578 /*
1579 CStringFeatures<SSKTripleFeature>* obtain_sssk_triple_from_cha(int d1, int d2)
1580 {
1581  int *s;
1582  int32_t nStr=get_num_vectors();
1583 
1584  int32_t nfeat=0;
1585  for (int32_t i=0; i < nStr; ++i)
1586  nfeat += get_vector_length[i] - d1 -d2;
1587  SGString<SSKFeature>* F= SG_MALLOC(SGString<SSKFeature>, nfeat);
1588  int32_t c=0;
1589  for (int32_t i=0; i < nStr; ++i)
1590  {
1591  int32_t len;
1592  bool free_vec;
1593  ST* S=get_feature_vector(vec_num, len, free_vec);
1594  free_feature_vector(vec, vec_num, free_vec);
1595  int32_t n=len - d1 - d2;
1596  s=S[i];
1597  for (int32_t j=0; j < n; ++j)
1598  {
1599  F[c].feature1=s[j];
1600  F[c].feature2=s[j+d1];
1601  F[c].feature3=s[j+d1+d2];
1602  F[c].group=i;
1603  c++;
1604  }
1605  }
1606  ASSERT(nfeat==c)
1607  return F;
1608 }
1609 
1610 CStringFeatures<SSKFeature>* obtain_sssk_double_from_char(int **S, int *len, int nStr, int d1)
1611 {
1612  int i, j;
1613  int n, nfeat;
1614  int *group;
1615  int *features;
1616  int *s;
1617  int c;
1618  SSKFeatures *F;
1619 
1620  nfeat=0;
1621  for (i=0; i < nStr; ++i)
1622  nfeat += len[i] - d1;
1623  group=(int *)SG_MALLOC(nfeat*sizeof(int));
1624  features=(int *)SG_MALLOC(nfeat*2*sizeof(int *));
1625  c=0;
1626  for (i=0; i < nStr; ++i)
1627  {
1628  n=len[i] - d1;
1629  s=S[i];
1630  for (j=0; j < n; ++j)
1631  {
1632  features[c]=s[j];
1633  features[c+nfeat]=s[j+d1];
1634  group[c]=i;
1635  c++;
1636  }
1637  }
1638  if (nfeat!=c)
1639  printf("Something is wrong...\n");
1640  F=(SSKFeatures *)SG_MALLOC(sizeof(SSKFeatures));
1641  (*F).features=features;
1642  (*F).group=group;
1643  (*F).n=nfeat;
1644  return F;
1645 }
1646 */
1647 
1649  SGVector<index_t> indices)
1650 {
1651  /* string list to create new CStringFeatures from */
1652  SGStringList<ST> list_copy(indices.vlen, max_string_length);
1653 
1654  /* copy all features */
1655  for (index_t i=0; i<indices.vlen; ++i)
1656  {
1657  /* index with respect to possible subset */
1658  index_t real_idx=m_subset_stack->subset_idx_conversion(indices.vector[i]);
1659 
1660  /* copy string */
1661  SGString<ST> current_string=features[real_idx];
1662  SGString<ST> string_copy(current_string.slen);
1663  sg_memcpy(string_copy.string, current_string.string,
1664  current_string.slen*sizeof(ST));
1665  list_copy.strings[i]=string_copy;
1666  }
1667 
1668  /* create copy instance */
1669  CStringFeatures* result=new CStringFeatures(list_copy, alphabet);
1670 
1671  /* max string length may have changed */
1673 
1674  /* keep things from original features (otherwise assertions in x-val) */
1675  result->order=order;
1677 
1678  SG_REF(result);
1679 
1680  return result;
1681 }
1682 
1684 {
1685  /* max string length has to be updated */
1686  determine_maximum_string_length();
1687 }
1688 
1689 template<class ST> ST* CStringFeatures<ST>::compute_feature_vector(int32_t num, int32_t& len)
1690 {
1691  ASSERT(features && num<get_num_vectors())
1692 
1693  int32_t real_num=m_subset_stack->subset_idx_conversion(num);
1694 
1695  len=features[real_num].slen;
1696  if (len<=0)
1697  return NULL;
1698 
1699  ST* target=SG_MALLOC(ST, len);
1700  sg_memcpy(target, features[real_num].string, len*sizeof(ST));
1701  return target;
1702 }
1703 
1704 template<class ST> void CStringFeatures<ST>::init()
1705 {
1706  set_generic<ST>();
1707 
1708  alphabet=NULL;
1709  num_vectors=0;
1710  features=NULL;
1711  single_string=NULL;
1712  length_of_single_string=0;
1713  max_string_length=0;
1714  order=0;
1715  preprocess_on_get=false;
1716  feature_cache=NULL;
1717  symbol_mask_table=NULL;
1718  symbol_mask_table_len=0;
1719  num_symbols=0.0;
1720  original_num_symbols=0;
1721 
1722  m_parameters->add((CSGObject**) &alphabet, "alphabet");
1723  m_parameters->add_vector(&features, &num_vectors, "features",
1724  "This contains the array of features.");
1725  m_parameters->add_vector(&single_string,
1726  &length_of_single_string,
1727  "single_string",
1728  "Created by sliding window.");
1729  m_parameters->add(&max_string_length, "max_string_length",
1730  "Length of longest string.");
1731  m_parameters->add(&num_symbols, "num_symbols",
1732  "Number of used symbols.");
1733  m_parameters->add(&original_num_symbols, "original_num_symbols",
1734  "Original number of used symbols.");
1735  m_parameters->add(&order, "order",
1736  "Order used in higher order mapping.");
1737  m_parameters->add(&preprocess_on_get, "preprocess_on_get",
1738  "Preprocess on-the-fly?");
1739 
1740  m_parameters->add_vector(&symbol_mask_table, &symbol_mask_table_len, "mask_table", "Symbol mask table - using in higher order mapping");
1741 }
1742 
1748 {
1749  return F_BOOL;
1750 }
1751 
1757 {
1758  return F_CHAR;
1759 }
1760 
1766 {
1767  return F_BYTE;
1768 }
1769 
1775 {
1776  return F_SHORT;
1777 }
1778 
1784 {
1785  return F_WORD;
1786 }
1787 
1793 {
1794  return F_INT;
1795 }
1796 
1802 {
1803  return F_UINT;
1804 }
1805 
1811 {
1812  return F_LONG;
1813 }
1814 
1820 {
1821  return F_ULONG;
1822 }
1823 
1829 {
1830  return F_SHORTREAL;
1831 }
1832 
1838 {
1839  return F_DREAL;
1840 }
1841 
1847 {
1848  return F_LONGREAL;
1849 }
1850 
1851 template<> bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
1852 {
1853  return symbol;
1854 }
1856 {
1857  return symbol;
1858 }
1860 {
1861  return symbol;
1862 }
1864 {
1865  return symbol;
1866 }
1867 
1868 template<> bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
1869 {
1870  return false;
1871 }
1873 {
1874  return 0;
1875 }
1877 {
1878  return 0;
1879 }
1881 {
1882  return 0;
1883 }
1884 
1885 template<> bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
1886 {
1887  return symbol;
1888 }
1890 {
1891  return symbol;
1892 }
1894 {
1895  return symbol;
1896 }
1898 {
1899  return symbol;
1900 }
1901 
1902 #ifndef SUNOS
1903 template<> template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1904 {
1905  return false;
1906 }
1907 template<> template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1908 {
1909  return false;
1910 }
1911 template<> template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1912 {
1913  return false;
1914 }
1915 #endif
1916 
1917 template<> void CStringFeatures<float32_t>::embed_features(int32_t p_order)
1918 {
1919 }
1920 template<> void CStringFeatures<float64_t>::embed_features(int32_t p_order)
1921 {
1922 }
1923 template<> void CStringFeatures<floatmax_t>::embed_features(int32_t p_order)
1924 {
1925 }
1926 
1928 {
1929 }
1931 {
1932 }
1934 {
1935 }
1936 
1938 {
1939  return 0;
1940 }
1942 {
1943  return 0;
1944 }
1946 {
1947  return 0;
1948 }
1949 
1950 template<> void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len)
1951 {
1952 }
1953 template<> void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len)
1954 {
1955 }
1956 template<> void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len)
1957 {
1958 }
1959 #define LOAD(f_load, sg_type) \
1960 template<> void CStringFeatures<sg_type>::load(CFile* loader) \
1961 { \
1962  SG_INFO("loading...\n") \
1963  \
1964  SG_SET_LOCALE_C; \
1965  SGString<sg_type>* strs; \
1966  int32_t num_str; \
1967  int32_t max_len; \
1968  loader->f_load(strs, num_str, max_len); \
1969  set_features(strs, num_str, max_len); \
1970  SG_RESET_LOCALE; \
1971 }
1972 
1973 LOAD(get_string_list, bool)
1974 LOAD(get_string_list, char)
1975 LOAD(get_string_list, int8_t)
1976 LOAD(get_string_list, uint8_t)
1977 LOAD(get_string_list, int16_t)
1978 LOAD(get_string_list, uint16_t)
1979 LOAD(get_string_list, int32_t)
1980 LOAD(get_string_list, uint32_t)
1981 LOAD(get_string_list, int64_t)
1982 LOAD(get_string_list, uint64_t)
1983 LOAD(get_string_list, float32_t)
1984 LOAD(get_string_list, float64_t)
1985 LOAD(get_string_list, floatmax_t)
1986 #undef LOAD
1987 
1988 #define SAVE(f_write, sg_type) \
1989 template<> void CStringFeatures<sg_type>::save(CFile* writer) \
1990 { \
1991  if (m_subset_stack->has_subsets()) \
1992  SG_ERROR("save() is not possible on subset") \
1993  SG_SET_LOCALE_C; \
1994  ASSERT(writer) \
1995  writer->f_write(features, num_vectors); \
1996  SG_RESET_LOCALE; \
1997 }
1998 
1999 SAVE(set_string_list, bool)
2000 SAVE(set_string_list, char)
2001 SAVE(set_string_list, int8_t)
2002 SAVE(set_string_list, uint8_t)
2003 SAVE(set_string_list, int16_t)
2004 SAVE(set_string_list, uint16_t)
2005 SAVE(set_string_list, int32_t)
2006 SAVE(set_string_list, uint32_t)
2007 SAVE(set_string_list, int64_t)
2008 SAVE(set_string_list, uint64_t)
2009 SAVE(set_string_list, float32_t)
2010 SAVE(set_string_list, float64_t)
2011 SAVE(set_string_list, floatmax_t)
2012 #undef SAVE
2013 
2014 template <class ST> template <class CT>
2016  int32_t p_order, int32_t gap, bool rev)
2017 {
2018  remove_all_subsets();
2019  ASSERT(sf)
2020 
2021  CAlphabet* alpha=sf->get_alphabet();
2022  ASSERT(alpha->get_num_symbols_in_histogram() > 0)
2023 
2024  this->order=p_order;
2025  cleanup();
2026 
2027  num_vectors=sf->get_num_vectors();
2028  ASSERT(num_vectors>0)
2029  max_string_length=sf->get_max_vector_length()-start;
2030  features=SG_MALLOC(SGString<ST>, num_vectors);
2031 
2032  SG_DEBUG("%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(),
2033  alpha->get_num_symbols_in_histogram());
2034 
2035  for (int32_t i=0; i<num_vectors; i++)
2036  {
2037  int32_t len=-1;
2038  bool vfree;
2039  CT* c=sf->get_feature_vector(i, len, vfree);
2040  ASSERT(!vfree) // won't work when preprocessors are attached
2041 
2042  features[i].string=SG_MALLOC(ST, len);
2043  features[i].slen=len;
2044 
2045  ST* str=features[i].string;
2046  for (int32_t j=0; j<len; j++)
2047  str[j]=(ST) alpha->remap_to_bin(c[j]);
2048  }
2049 
2050  original_num_symbols=alpha->get_num_symbols();
2051  int32_t max_val=alpha->get_num_bits();
2052 
2053  SG_UNREF(alpha);
2054 
2055  if (p_order>1)
2056  num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
2057  else
2058  num_symbols=original_num_symbols;
2059  SG_INFO("max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols)
2060 
2061  if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
2062  {
2063  SG_ERROR("symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val)
2064  return false;
2065  }
2066 
2067  SG_DEBUG("translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST))
2068  for (int32_t line=0; line<num_vectors; line++)
2069  {
2070  int32_t len=0;
2071  bool vfree;
2072  ST* fv=get_feature_vector(line, len, vfree);
2073  ASSERT(!vfree) // won't work when preprocessors are attached
2074 
2075  if (rev)
2076  CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
2077  else
2078  CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
2079 
2080  /* fix the length of the string -- hacky */
2081  features[line].slen-=start+gap ;
2082  if (features[line].slen<0)
2083  features[line].slen=0 ;
2084  }
2085 
2086  compute_symbol_mask_table(max_val);
2087 
2088  return true;
2089 }
2090 
2091 template class CStringFeatures<bool>;
2092 template class CStringFeatures<char>;
2093 template class CStringFeatures<int8_t>;
2094 template class CStringFeatures<uint8_t>;
2095 template class CStringFeatures<int16_t>;
2096 template class CStringFeatures<uint16_t>;
2097 template class CStringFeatures<int32_t>;
2098 template class CStringFeatures<uint32_t>;
2099 template class CStringFeatures<int64_t>;
2100 template class CStringFeatures<uint64_t>;
2101 template class CStringFeatures<float32_t>;
2102 template class CStringFeatures<float64_t>;
2103 template class CStringFeatures<floatmax_t>;
2104 
2105 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2106 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2107 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2108 
2109 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2110 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2111 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2112 }
void set_feature_vector(SGVector< ST > vector, int32_t num)
int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0)
virtual int32_t get_max_vector_length()
SGVector< ST > get_feature_vector(int32_t num)
CSubsetStack * m_subset_stack
Definition: Features.h:361
int32_t get_num_symbols_in_histogram()
Definition: Alphabet.cpp:565
#define SG_INFO(...)
Definition: SGIO.h:117
virtual void load(CFile *loader)
virtual CFeatures * duplicate() const
template class SGStringList
Definition: SGObject.h:41
DNA - letters A,C,G,T.
Definition: Alphabet.h:26
floatmax_t num_symbols
number of used symbols
bool load_from_directory(char *dirname)
bool obtain_from_char(CStringFeatures< char > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
virtual CFeatures * copy_subset(SGVector< index_t > indices)
ST shift_offset(ST offset, int32_t amount)
Template class StringFeatures implements a list of strings.
virtual void subset_changed_post()
int32_t index_t
Definition: common.h:72
#define SG_PROGRESS(...)
Definition: SGIO.h:141
static float64_t ceil(float64_t d)
Definition: Math.h:411
void unembed_word(ST word, uint8_t *seq, int32_t len)
EAlphabet
Alphabet of charfeatures/observations.
Definition: Alphabet.h:23
int32_t get_max_value_in_histogram()
Definition: Alphabet.cpp:550
virtual SGString< ST > * copy_features(int32_t &num_str, int32_t &max_str_len)
bool check_alphabet_size(bool print_error=true)
Definition: Alphabet.cpp:639
floatmax_t get_max_num_symbols()
bool load_fasta_file(const char *fname, bool ignore_invalid=false)
SGString< ST > * features
#define SAVE(f_write, sg_type)
virtual const char * get_name() const
return the name of the preprocessor
virtual int32_t get_num_vectors() const
bool append_features(CStringFeatures< ST > *sf)
virtual void create_random(float64_t *hist, int32_t rows, int32_t cols, int32_t num_vec)
#define SG_ERROR(...)
Definition: SGIO.h:128
#define SG_NOTIMPLEMENTED
Definition: SGIO.h:138
The class Alphabet implements an alphabet and alphabet utility functions.
Definition: Alphabet.h:91
Compression library for compressing and decompressing buffers using one of the standard compression a...
Definition: Compressor.h:46
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
floatmax_t get_original_num_symbols()
static int filter(CONST_DIRENT_T *d)
Definition: SGIO.cpp:418
uint8_t remap_to_bin(uint8_t c)
Definition: Alphabet.h:159
#define SG_REF(x)
Definition: SGObject.h:52
virtual bool load_compressed(char *src, bool decompress)
EFeatureClass
shogun feature class
Definition: FeatureTypes.h:38
void load_ascii_file(char *fname, bool remap_to_bin=true, EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA)
void add_string_to_histogram(T *p, int64_t len)
Definition: Alphabet.h:183
virtual bool apply_preprocessor(bool force_preprocessing=false)
index_t vlen
Definition: SGVector.h:545
char * get_line(uint64_t &len, uint64_t &offs)
ST get_masked_symbols(ST symbol, uint8_t mask)
#define ASSERT(x)
Definition: SGIO.h:200
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:125
int32_t symbol_mask_table_len
order used in higher order mapping
shogun vector
#define LOAD(f_load, sg_type)
CStringFeatures< ST > * get_transposed()
int32_t order
order used in higher order mapping
ST embed_word(ST *seq, int32_t len)
int32_t obtain_by_position_list(int32_t window_size, CDynamicArray< int32_t > *positions, int32_t skip=0)
bool load_fastq_file(const char *fname, bool ignore_invalid=false, bool bitremap_in_single_string=false)
double float64_t
Definition: common.h:60
virtual ST * apply_to_string(ST *f, int32_t &len)=0
apply preproc on single feature vector
floatmax_t original_num_symbols
original number of used symbols (before higher order mapping)
long double floatmax_t
Definition: common.h:61
int32_t get_num_symbols() const
Definition: Alphabet.h:139
A File access base class.
Definition: File.h:34
virtual EFeatureClass get_feature_class() const
virtual bool save_compressed(char *dest, E_COMPRESSION_TYPE compression, int level)
virtual ST * compute_feature_vector(int32_t num, int32_t &len)
virtual ST get_feature(int32_t vec_num, int32_t feat_num)
index_t subset_idx_conversion(index_t idx) const
Definition: SubsetStack.h:105
static T max(T a, T b)
Definition: Math.h:164
Template class StringPreprocessor, base class for preprocessors (cf. CPreprocessor) that apply to CSt...
void compute_symbol_mask_table(int64_t max_val)
virtual EFeatureType get_feature_type() const
bool check_alphabet(bool print_error=true)
Definition: Alphabet.cpp:617
int32_t get_num_bits() const
Definition: Alphabet.h:149
float float32_t
Definition: common.h:59
EFeatureType
shogun feature type
Definition: FeatureTypes.h:19
bool have_same_length(int32_t len=-1)
virtual void cleanup_feature_vector(int32_t num)
E_COMPRESSION_TYPE
Definition: Compressor.h:21
void compress(uint8_t *uncompressed, uint64_t uncompressed_size, uint8_t *&compressed, uint64_t &compressed_size, int32_t level=1)
Definition: Compressor.cpp:44
#define SG_UNREF(x)
Definition: SGObject.h:53
static void set_dirname(const char *dirname)
Definition: SGIO.h:463
#define SG_DEBUG(...)
Definition: SGIO.h:106
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
SGStringList< ST > get_features()
The class Features is the base class of all feature objects.
Definition: Features.h:68
SGString< T > * strings
Definition: SGStringList.h:88
void decompress(uint8_t *compressed, uint64_t compressed_size, uint8_t *uncompressed, uint64_t &uncompressed_size)
Definition: Compressor.cpp:209
index_t slen
Definition: SGString.h:79
ST shift_symbol(ST symbol, int32_t amount)
void embed_features(int32_t p_order)
int32_t get_num_elements() const
Definition: DynamicArray.h:200
virtual void get_histogram(float64_t **hist, int32_t *rows, int32_t *cols, bool normalize=true)
virtual void cleanup_feature_vectors(int32_t start, int32_t stop)
static char * concat_filename(const char *filename)
Definition: SGIO.cpp:405
#define SG_WARNING(...)
Definition: SGIO.h:127
static floatmax_t powl(floatmax_t x, floatmax_t n)
Definition: Math.h:519
virtual bool apply_to_string_features(CFeatures *f)=0
const T & get_element(int32_t idx1, int32_t idx2=0, int32_t idx3=0) const
Definition: DynamicArray.h:212
ST * symbol_mask_table
order used in higher order mapping
void set_features(SGStringList< ST > feats)
virtual int32_t get_vector_length(int32_t vec_num)
static void random_vector(T *vec, int32_t len, T min_value, T max_value)
Definition: SGVector.cpp:605
static ST * get_zero_terminated_string_copy(SGString< ST > str)

SHOGUN Machine Learning Toolbox - Documentation