SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringFeatures.h
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2009 Soeren Sonnenburg
8  * Written (W) 1999-2008 Gunnar Raetsch
9  * Written (W) 2011-2012 Heiko Strathmann
10  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
11  */
12 
13 #ifndef _CSTRINGFEATURES__H__
14 #define _CSTRINGFEATURES__H__
15 
16 #include <shogun/lib/config.h>
17 
18 #include <shogun/lib/common.h>
19 #include <shogun/lib/Cache.h>
21 #include <shogun/lib/Compressor.h>
22 #include <shogun/io/File.h>
23 
26 #include <shogun/lib/SGString.h>
27 
28 namespace shogun
29 {
30 class CAlphabet;
31 template <class T> class CDynamicArray;
32 class CFile;
33 template <class T> class SGString;
34 template <class T> class SGStringList;
35 
36 #ifndef DOXYGEN_SHOULD_SKIP_THIS
/**
 * Plain record holding two integer feature fields and one integer group
 * field. It sits inside the DOXYGEN_SHOULD_SKIP_THIS guard, so it is left
 * out of the generated API documentation.
 *
 * NOTE(review): "SSK" is not expanded anywhere in this listing, and no
 * code that reads these fields is visible here; confirm what the values
 * index into before relying on them.
 */
37 struct SSKDoubleFeature
38 {
39  int feature1; // first feature value of the pair
40  int feature2; // second feature value of the pair
41  int group; // group value stored with the pair (meaning not shown here)
42 };
43 
/**
 * Plain record holding three integer feature fields and one integer group
 * field; the three-field counterpart of SSKDoubleFeature. It sits inside
 * the DOXYGEN_SHOULD_SKIP_THIS guard, so it is left out of the generated
 * API documentation.
 *
 * NOTE(review): no code that reads these fields is visible in this
 * listing; confirm their meaning against the users of this struct.
 */
44 struct SSKTripleFeature
45 {
46  int feature1; // first feature value of the triple
47  int feature2; // second feature value of the triple
48  int feature3; // third feature value of the triple
49  int group; // group value stored with the triple (meaning not shown here)
50 };
51 #endif
52 
/**
 * Features class, derived from CFeatures, whose data is exchanged as
 * SGString<ST> / SGStringList<ST> values; ST is the symbol (character)
 * type of the strings. get_name() reports the class as "StringFeatures".
 *
 * NOTE(review): the embedded listing numbers skip in many places (for
 * example 78 -> 81 -> 87 -> 93 and 672 -> 675 -> 677), so the original
 * doc comments and some declarations are not visible in this listing.
 * The notes below state only what the visible signatures show; anything
 * inferred from a name is marked as an assumption to confirm.
 */
76 template <class ST> class CStringFeatures : public CFeatures
77 {
78  public:
81 
87 
    /** Construct from a string list, with the alphabet given as an EAlphabet value. */
93  CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha);
94 
    /**
     * Construct from a string list, with the alphabet given as a CAlphabet object.
     * NOTE(review): ownership of alpha is not visible here; confirm.
     */
100  CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha);
101 
    /**
     * Construct from an alphabet object only; no strings are passed in.
     * Not marked explicit, so a CAlphabet* converts implicitly to this type.
     */
106  CStringFeatures(CAlphabet* alpha);
107 
    /** Copy constructor. */
112  CStringFeatures(const CStringFeatures& orig);
113 
    /** Construct from a CFile; the alphabet defaults to DNA when omitted. */
119  CStringFeatures(CFile* loader, EAlphabet alpha=DNA);
120 
    /** Virtual destructor. */
122  virtual ~CStringFeatures();
123 
    /** Overridable cleanup hook; presumably releases the held feature data (confirm). */
129  virtual void cleanup();
130 
    /** Overridable cleanup for the single vector at index num. */
137  virtual void cleanup_feature_vector(int32_t num);
138 
    /**
     * Overridable cleanup for the vectors between start and stop.
     * NOTE(review): whether stop is inclusive is not visible here; confirm.
     */
146  virtual void cleanup_feature_vectors(int32_t start, int32_t stop);
147 
    /** @return the EFeatureClass of this object; does not modify it. */
152  virtual EFeatureClass get_feature_class() const;
153 
    /** @return the EFeatureType of this object; does not modify it. */
158  virtual EFeatureType get_feature_type() const;
159 
165 
    /**
     * @return a CFeatures pointer; presumably a copy of this object
     * (TODO confirm who owns the result).
     */
170  virtual CFeatures* duplicate() const;
171 
    /** @return the feature vector with index num as an SGVector<ST>. */
179  SGVector<ST> get_feature_vector(int32_t num);
180 
    /** Set the feature vector with index num from an SGVector<ST>. */
188  void set_feature_vector(SGVector<ST> vector, int32_t num);
189 
192 
197 
    /**
     * Raw-pointer accessor for the vector with index num.
     * len and dofree are non-const references, presumably filled in by the
     * call (length, and whether the caller must release the result); the
     * matching free_feature_vector(ST*, int32_t, bool) overload takes the
     * same dofree flag.
     */
208  ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree);
209 
217 
    /**
     * @return an array of SGString<ST>; presumably the transposed strings.
     * num_feat and num_vec are non-const references, presumably outputs
     * (TODO confirm who owns the returned array).
     */
231  SGString<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec);
232 
    /** Counterpart of the raw-pointer get_feature_vector(); takes its dofree flag. */
241  void free_feature_vector(ST* feat_vec, int32_t num, bool dofree);
242 
    /** Counterpart of the SGVector-returning get_feature_vector(). */
250  void free_feature_vector(SGVector<ST> feat_vec, int32_t num);
251 
    /** @return a single ST addressed by vector index and position within the vector. */
260  virtual ST get_feature(int32_t vec_num, int32_t feat_num);
261 
    /** @return an int32_t length for the vector with index vec_num. */
269  virtual int32_t get_vector_length(int32_t vec_num);
270 
    /** @return an int32_t; by name, the largest vector length. */
277  virtual int32_t get_max_vector_length();
278 
    /** @return an int32_t count of vectors; does not modify the object. */
280  virtual int32_t get_num_vectors() const;
281 
289 
298 
299  // these functions are necessary to find out about a former conversion process
300 
306 
    /** @return an int32_t order; presumably the value of the order member (confirm). */
311  int32_t get_order();
312 
    /** @return an ST derived from symbol and an 8-bit mask (masking rule not shown here). */
320  ST get_masked_symbols(ST symbol, uint8_t mask);
321 
    /** @return an ST derived from offset and amount; presumably a shift (confirm direction and units). */
328  ST shift_offset(ST offset, int32_t amount);
329 
    /** @return an ST derived from symbol and amount; presumably a shift (confirm direction and units). */
336  ST shift_symbol(ST symbol, int32_t amount);
337 
    /** Overridable load from a CFile. */
342  virtual void load(CFile* loader);
343 
    /**
     * Load from the file named fname.
     * Defaults: remap_to_bin=true, ascii_alphabet=DNA, binary_alphabet=RAWDNA.
     * Returns nothing, so failure reporting is not visible in this signature.
     */
354  void load_ascii_file(char* fname, bool remap_to_bin=true,
355  EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA);
356 
    /** Load from the FASTA file named fname; ignore_invalid defaults to false. @return bool, presumably success. */
365  bool load_fasta_file(const char* fname, bool ignore_invalid=false);
366 
    /**
     * Load from the FASTQ file named fname; both flags default to false.
     * @return bool, presumably success.
     */
376  bool load_fastq_file(const char* fname,
377  bool ignore_invalid=false, bool bitremap_in_single_string=false);
378 
    /** Load from the directory named dirname. @return bool, presumably success. */
386  bool load_from_directory(char* dirname);
387 
    /** Set the features from an SGStringList<ST>. */
393  void set_features(SGStringList<ST> feats);
394 
    /**
     * Set the features from a raw SGString<ST> array, given with its vector
     * count and maximum string length. @return bool, presumably success.
     * NOTE(review): whether the array is copied or adopted is not visible here.
     */
404  bool set_features(SGString<ST>* p_features, int32_t p_num_vectors,
405  int32_t p_max_string_length);
406 
416 
    /**
     * Append variant with the same parameters as the raw-array set_features().
     * @return bool, presumably success.
     */
429  bool append_features(SGString<ST>* p_features, int32_t p_num_vectors,
430  int32_t p_max_string_length);
431 
436 
    /**
     * @return an SGString<ST> array; num_str and max_str_len are non-const
     * references, presumably outputs. By name this does not copy, unlike
     * copy_features() (TODO confirm).
     */
445  virtual SGString<ST>* get_features(int32_t& num_str, int32_t& max_str_len);
446 
    /** Same signature as get_features(int32_t&, int32_t&); by name, returns a copy (confirm). */
455  virtual SGString<ST>* copy_features(int32_t& num_str, int32_t& max_str_len);
456 
    /** Pointer-output variant: results are delivered through dst and num_str. */
464  virtual void get_features(SGString<ST>** dst, int32_t* num_str);
465 
    /** Overridable save to a CFile. */
472  virtual void save(CFile* writer);
473 
    /** Load from src; the decompress flag is passed by the caller. @return bool, presumably success. */
482  virtual bool load_compressed(char* src, bool decompress);
483 
    /** Save to dest using the given E_COMPRESSION_TYPE and level. @return bool, presumably success. */
493  virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level);
494 
    /** force_preprocessing defaults to false. @return bool, presumably success. */
500  virtual bool apply_preprocessor(bool force_preprocessing=false);
501 
    /**
     * skip defaults to 0. @return an int32_t (meaning of the count is not
     * shown here). By name, derives vectors via a sliding window.
     */
514  int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0);
515 
    /**
     * Like obtain_by_sliding_window() but takes a CDynamicArray<int32_t>
     * of positions instead of a step size; skip defaults to 0.
     */
526  int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
527  int32_t skip=0);
528 
    /** Takes a CStringFeatures<char> source plus start, p_order, gap and rev. @return bool, presumably success. */
542  bool obtain_from_char(CStringFeatures<char>* sf, int32_t start,
543  int32_t p_order, int32_t gap, bool rev);
544 
    /** Member template: same parameters as obtain_from_char(), with the source symbol type CT generic. */
556  template <class CT>
557  bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start,
558  int32_t p_order, int32_t gap, bool rev);
559 
    /**
     * len defaults to -1. @return bool.
     * NOTE(review): presumably -1 means "no specific length required"; confirm.
     */
569  bool have_same_length(int32_t len=-1);
570 
    /** Takes an order p_order; the embedding itself is not visible here. */
576  void embed_features(int32_t p_order);
577 
    /** Takes a 64-bit max_val; the table it fills is not visible in this listing. */
584  void compute_symbol_mask_table(int64_t max_val);
585 
    /** Takes an ST word and a byte buffer seq of length len; presumably writes into seq (confirm). */
592  void unembed_word(ST word, uint8_t* seq, int32_t len);
593 
    /** @return an ST built from the len symbols at seq; by name, the inverse of unembed_word(). */
599  ST embed_word(ST* seq, int32_t len);
600 
606 
615 
    /**
     * Overridable raw-pointer setter for the vector with index num.
     * NOTE(review): whether string is copied or adopted is not visible here.
     */
624  virtual void set_feature_vector(int32_t num, ST* string, int32_t len);
625 
    /**
     * Delivers a float64_t array and its dimensions through the pointer
     * parameters; normalize defaults to true.
     * NOTE(review): ownership of *hist is not visible here; confirm.
     */
630  virtual void get_histogram(float64_t** hist, int32_t* rows, int32_t* cols,
631  bool normalize=true);
632 
    /** Takes a rows x cols float64_t array and a vector count num_vec; presumably generates random strings from it (confirm). */
637  virtual void create_random(float64_t* hist, int32_t rows, int32_t cols,
638  int32_t num_vec);
639 
    /** @return a CFeatures pointer built from the given index vector (TODO confirm who owns the result). */
648  virtual CFeatures* copy_subset(SGVector<index_t> indices);
649 
    /** @return the fixed string "StringFeatures". */
651  virtual const char* get_name() const { return "StringFeatures"; }
652 
    /** Overridable hook; by name, invoked after the active subset changes (confirm in CFeatures). */
654  virtual void subset_changed_post();
655 
656  protected:
    /** Overridable hook returning a raw vector for index num; len is a non-const reference, presumably an output. */
667  virtual ST* compute_feature_vector(int32_t num, int32_t& len);
668 
669  private:
    /** Private initialisation helper; presumably shared by the constructors (confirm). */
670  void init();
671 
672  protected:
675 
    /** Number of vectors. */
677  int32_t num_vectors;
678 
681 
684 
687 
690 
693 
696 
    /** Order value; presumably what get_order() returns (confirm). */
698  int32_t order;
699 
702 
705 
708 
711 };
712 }
713 #endif // _CSTRINGFEATURES__H__

SHOGUN Machine Learning Toolbox - Documentation