SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
LibSVMFile.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2014 Jiaolong Xu
8  * Written (W) 2013 Evgeniy Andreev (gsomix)
9  */
10 
11 #include <shogun/io/LibSVMFile.h>
12 
13 #include <shogun/lib/SGVector.h>
15 #include <shogun/base/DynArray.h>
16 #include <shogun/io/LineReader.h>
17 #include <shogun/io/Parser.h>
19 
20 using namespace shogun;
21 
23 {
24  init();
25 }
26 
27 CLibSVMFile::CLibSVMFile(FILE* f, const char* name) :
28  CFile(f, name)
29 {
30  init();
31  init_with_defaults();
32 }
33 
34 CLibSVMFile::CLibSVMFile(const char* fname, char rw, const char* name) :
35  CFile(fname, rw, name)
36 {
37  init();
38  init_with_defaults();
39 }
40 
42 {
43  SG_UNREF(m_whitespace_tokenizer);
44  SG_UNREF(m_delimiter_feat_tokenizer);
45  SG_UNREF(m_delimiter_label_tokenizer);
46  SG_UNREF(m_line_tokenizer);
47  SG_UNREF(m_parser);
48  SG_UNREF(m_line_reader);
49 }
50 
51 void CLibSVMFile::init()
52 {
53  m_delimiter_feat=0;
54 
55  m_whitespace_tokenizer=NULL;
56  m_delimiter_feat_tokenizer=NULL;
57  m_delimiter_label_tokenizer=NULL;
58  m_line_tokenizer=NULL;
59  m_parser=NULL;
60  m_line_reader=NULL;
61 }
62 
63 void CLibSVMFile::init_with_defaults()
64 {
65  m_delimiter_feat=':';
66  m_delimiter_label=',';
67 
68  m_whitespace_tokenizer=new CDelimiterTokenizer(true);
69  m_whitespace_tokenizer->delimiters[' ']=1;
70  SG_REF(m_whitespace_tokenizer);
71 
72  m_delimiter_feat_tokenizer=new CDelimiterTokenizer(true);
73  m_delimiter_feat_tokenizer->delimiters[m_delimiter_feat]=1;
74  SG_REF(m_delimiter_feat_tokenizer);
75 
76  m_delimiter_label_tokenizer=new CDelimiterTokenizer(true);
77  m_delimiter_label_tokenizer->delimiters[m_delimiter_label]=1;
78  SG_REF(m_delimiter_label_tokenizer);
79 
80  m_line_tokenizer=new CDelimiterTokenizer(true);
81  m_line_tokenizer->delimiters['\n']=1;
82  SG_REF(m_line_tokenizer);
83 
84  m_parser=new CParser();
85  m_line_reader=new CLineReader(file, m_line_tokenizer);
86 }
87 
88 #define GET_SPARSE_MATRIX(read_func, sg_type) \
89 void CLibSVMFile::get_sparse_matrix(SGSparseVector<sg_type>*& mat_feat, int32_t& num_feat, int32_t& num_vec) \
90 { \
91  SGVector<float64_t>* multilabel; \
92  int32_t num_classes; \
93  get_sparse_matrix(mat_feat, num_feat, num_vec, multilabel, num_classes, false); \
94 }
95 
96 GET_SPARSE_MATRIX(read_bool, bool)
97 GET_SPARSE_MATRIX(read_char, int8_t)
98 GET_SPARSE_MATRIX(read_byte, uint8_t)
99 GET_SPARSE_MATRIX(read_char, char)
100 GET_SPARSE_MATRIX(read_int, int32_t)
101 GET_SPARSE_MATRIX(read_uint, uint32_t)
102 GET_SPARSE_MATRIX(read_short_real, float32_t)
103 GET_SPARSE_MATRIX(read_real, float64_t)
104 GET_SPARSE_MATRIX(read_long_real, floatmax_t)
105 GET_SPARSE_MATRIX(read_short, int16_t)
106 GET_SPARSE_MATRIX(read_word, uint16_t)
107 GET_SPARSE_MATRIX(read_long, int64_t)
108 GET_SPARSE_MATRIX(read_ulong, uint64_t)
109 #undef GET_SPARSE_MATRIX
110 
111 #define GET_LABELED_SPARSE_MATRIX(read_func, sg_type) \
112 void CLibSVMFile::get_sparse_matrix(SGSparseVector<sg_type>*& mat_feat, int32_t& num_feat, int32_t& num_vec, \
113  float64_t*& labels, bool load_labels) \
114 { \
115  SGVector<float64_t>* multilabel; \
116  int32_t num_classes; \
117  get_sparse_matrix(mat_feat, num_feat, num_vec, multilabel, num_classes, load_labels); \
118  \
119  for (int32_t i=0; i<num_vec; i++) \
120  { \
121  REQUIRE(multilabel[i].size()==1, \
122  "%s a multilabel file. You are trying to read it with a single-label reader.", filename); \
123  } \
124  labels=SG_MALLOC(float64_t, num_vec); \
125  \
126  for (int32_t i=0; i<num_vec; i++) \
127  labels[i]=multilabel[i][0]; \
128 } \
129 
130 GET_LABELED_SPARSE_MATRIX(read_bool, bool)
131 GET_LABELED_SPARSE_MATRIX(read_char, int8_t)
132 GET_LABELED_SPARSE_MATRIX(read_byte, uint8_t)
133 GET_LABELED_SPARSE_MATRIX(read_char, char)
134 GET_LABELED_SPARSE_MATRIX(read_int, int32_t)
135 GET_LABELED_SPARSE_MATRIX(read_uint, uint32_t)
136 GET_LABELED_SPARSE_MATRIX(read_short_real, float32_t)
138 GET_LABELED_SPARSE_MATRIX(read_long_real, floatmax_t)
139 GET_LABELED_SPARSE_MATRIX(read_short, int16_t)
140 GET_LABELED_SPARSE_MATRIX(read_word, uint16_t)
141 GET_LABELED_SPARSE_MATRIX(read_long, int64_t)
142 GET_LABELED_SPARSE_MATRIX(read_ulong, uint64_t)
143 #undef GET_LABELED_SPARSE_MATRIX
144 
145 #define GET_MULTI_LABELED_SPARSE_MATRIX(read_func, sg_type) \
146 void CLibSVMFile::get_sparse_matrix(SGSparseVector<sg_type>*& mat_feat, int32_t& num_feat, int32_t& num_vec, \
147  SGVector<float64_t>*& multilabel, int32_t& num_classes, bool load_labels) \
148 { \
149  num_feat=0; \
150  \
151  SG_INFO("counting line numbers in file %s\n", filename) \
152  num_vec=get_num_lines(); \
153  \
154  int32_t current_line_ind=0; \
155  SGVector<char> line; \
156  \
157  int32_t num_feat_entries=0; \
158  DynArray<SGVector<char> > entries_feat; \
159  DynArray<float64_t > entries_label; \
160  DynArray<float64_t> classes; \
161  \
162  mat_feat=SG_MALLOC(SGSparseVector<sg_type>, num_vec); \
163  multilabel=SG_MALLOC(SGVector<float64_t>, num_vec); \
164  \
165  num_classes=0; \
166  SG_SET_LOCALE_C; \
167  \
168  while (m_line_reader->has_next()) \
169  { \
170  num_feat_entries=0; \
171  entries_feat.reset(SGVector<char>(false)); \
172  line=m_line_reader->read_line(); \
173  \
174  m_parser->set_tokenizer(m_whitespace_tokenizer); \
175  m_parser->set_text(line); \
176  \
177  SGVector<char> entry_label; \
178  if (load_labels && m_parser->has_next()) \
179  { \
180  entry_label=m_parser->read_string(); \
181  if (is_feat_entry(entry_label)) \
182  { \
183  entries_feat.push_back(entry_label); \
184  num_feat_entries++; \
185  entry_label=SGVector<char>(0); \
186  } \
187  } \
188  \
189  while (m_parser->has_next()) \
190  { \
191  entries_feat.push_back(m_parser->read_string()); \
192  num_feat_entries++; \
193  } \
194  \
195  mat_feat[current_line_ind]=SGSparseVector<sg_type>(num_feat_entries); \
196  for (int32_t i=0; i<num_feat_entries; i++) \
197  { \
198  m_parser->set_tokenizer(m_delimiter_feat_tokenizer); \
199  m_parser->set_text(entries_feat[i]); \
200  \
201  int32_t feat_index=0; \
202  \
203  if (m_parser->has_next()) \
204  feat_index=m_parser->read_int(); \
205  \
206  sg_type entry=0; \
207  \
208  if (m_parser->has_next()) \
209  entry=m_parser->read_func(); \
210  \
211  if (feat_index>num_feat) \
212  num_feat=feat_index; \
213  \
214  mat_feat[current_line_ind].features[i].feat_index=feat_index-1; \
215  mat_feat[current_line_ind].features[i].entry=entry; \
216  } \
217  \
218  if (load_labels) \
219  { \
220  m_parser->set_tokenizer(m_delimiter_label_tokenizer); \
221  m_parser->set_text(entry_label); \
222  \
223  int32_t num_label_entries=0; \
224  entries_label.reset(0); \
225  \
226  while (m_parser->has_next()) \
227  { \
228  num_label_entries++; \
229  float64_t label_val=m_parser->read_real(); \
230  \
231  if (classes.find_element(label_val)==-1) \
232  classes.push_back(label_val); \
233  \
234  entries_label.push_back(label_val); \
235  } \
236  multilabel[current_line_ind]=SGVector<float64_t>(num_label_entries); \
237  \
238  for (int32_t j=0; j < num_label_entries; j++) \
239  multilabel[current_line_ind][j]=entries_label[j]; \
240  \
241  } \
242  \
243  current_line_ind++; \
244  SG_PROGRESS(current_line_ind, 0, num_vec, 1, "LOADING:\t") \
245  } \
246  num_classes=classes.get_num_elements(); \
247  \
248  SG_RESET_LOCALE; \
249  \
250  SG_INFO("file successfully read\n") \
251 }
252 
253 GET_MULTI_LABELED_SPARSE_MATRIX(read_bool, bool)
254 GET_MULTI_LABELED_SPARSE_MATRIX(read_char, int8_t)
255 GET_MULTI_LABELED_SPARSE_MATRIX(read_byte, uint8_t)
256 GET_MULTI_LABELED_SPARSE_MATRIX(read_char, char)
257 GET_MULTI_LABELED_SPARSE_MATRIX(read_int, int32_t)
258 GET_MULTI_LABELED_SPARSE_MATRIX(read_uint, uint32_t)
262 GET_MULTI_LABELED_SPARSE_MATRIX(read_short, int16_t)
263 GET_MULTI_LABELED_SPARSE_MATRIX(read_word, uint16_t)
264 GET_MULTI_LABELED_SPARSE_MATRIX(read_long, int64_t)
265 GET_MULTI_LABELED_SPARSE_MATRIX(read_ulong, uint64_t)
266 #undef GET_MULTI_LABELED_SPARSE_MATRIX
267 
268 #define SET_SPARSE_MATRIX(format, sg_type) \
269 void CLibSVMFile::set_sparse_matrix( \
270  const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec) \
271 { \
272  SGVector <float64_t>* labels = NULL; \
273  set_sparse_matrix(matrix, num_feat, num_vec, labels); \
274 }
275 
276 SET_SPARSE_MATRIX(SCNi32, bool)
277 SET_SPARSE_MATRIX(SCNi8, int8_t)
278 SET_SPARSE_MATRIX(SCNu8, uint8_t)
279 SET_SPARSE_MATRIX(SCNu8, char)
280 SET_SPARSE_MATRIX(SCNi32, int32_t)
281 SET_SPARSE_MATRIX(SCNu32, uint32_t)
282 SET_SPARSE_MATRIX(SCNi64, int64_t)
283 SET_SPARSE_MATRIX(SCNu64, uint64_t)
287 SET_SPARSE_MATRIX(SCNi16, int16_t)
288 SET_SPARSE_MATRIX(SCNu16, uint16_t)
289 #undef SET_SPARSE_MATRIX
290 
291 #define SET_LABELED_SPARSE_MATRIX(format, sg_type) \
292 void CLibSVMFile::set_sparse_matrix( \
293  const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec, \
294  const float64_t* labels) \
295 { \
296  SGVector<float64_t>* multilabel=SG_MALLOC(SGVector<float64_t>, num_vec); \
297  \
298  for (int32_t i=0; i<num_vec; i++) \
299  { \
300  multilabel[i]=SGVector<float64_t>(1); \
301  multilabel[i][0]=labels[i]; \
302  } \
303  \
304  set_sparse_matrix(matrix, num_feat, num_vec, multilabel); \
305  SG_FREE(multilabel); \
306 }
307 
308 SET_LABELED_SPARSE_MATRIX(SCNi32, bool)
309 SET_LABELED_SPARSE_MATRIX(SCNi8, int8_t)
310 SET_LABELED_SPARSE_MATRIX(SCNu8, uint8_t)
311 SET_LABELED_SPARSE_MATRIX(SCNu8, char)
312 SET_LABELED_SPARSE_MATRIX(SCNi32, int32_t)
313 SET_LABELED_SPARSE_MATRIX(SCNu32, uint32_t)
314 SET_LABELED_SPARSE_MATRIX(SCNi64, int64_t)
315 SET_LABELED_SPARSE_MATRIX(SCNu64, uint64_t)
319 SET_LABELED_SPARSE_MATRIX(SCNi16, int16_t)
320 SET_LABELED_SPARSE_MATRIX(SCNu16, uint16_t)
321 #undef SET_LABELED_SPARSE_MATRIX
322 
323 #define SET_MULTI_LABELED_SPARSE_MATRIX(format, sg_type) \
324 void CLibSVMFile::set_sparse_matrix( \
325  const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec, \
326  const SGVector<float64_t>* multilabel) \
327 { \
328  SG_SET_LOCALE_C; \
329  \
330  for (int32_t i=0; i<num_vec; i++) \
331  { \
332  if (multilabel!=NULL) \
333  { \
334  if (multilabel[i].size()==0) \
335  fprintf(file, " "); \
336  \
337  for (int32_t j=0; j <multilabel[i].size(); j++) \
338  { \
339  fprintf(file, "%lg", multilabel[i][j]); \
340  \
341  if (j==multilabel[i].size()-1) \
342  fprintf(file, " "); \
343  else \
344  fprintf(file, ","); \
345  } \
346  } \
347  \
348  for (int32_t j=0; j<matrix[i].num_feat_entries; j++) \
349  { \
350  fprintf(file, "%d%c%" format " ", \
351  matrix[i].features[j].feat_index+1, \
352  m_delimiter_feat, \
353  matrix[i].features[j].entry); \
354  } \
355  fprintf(file, "\n"); \
356  } \
357  \
358  SG_RESET_LOCALE; \
359 }
360 
362 SET_MULTI_LABELED_SPARSE_MATRIX(SCNi8, int8_t)
363 SET_MULTI_LABELED_SPARSE_MATRIX(SCNu8, uint8_t)
365 SET_MULTI_LABELED_SPARSE_MATRIX(SCNi32, int32_t)
366 SET_MULTI_LABELED_SPARSE_MATRIX(SCNu32, uint32_t)
367 SET_MULTI_LABELED_SPARSE_MATRIX(SCNi64, int64_t)
368 SET_MULTI_LABELED_SPARSE_MATRIX(SCNu64, uint64_t)
372 SET_MULTI_LABELED_SPARSE_MATRIX(SCNi16, int16_t)
373 SET_MULTI_LABELED_SPARSE_MATRIX(SCNu16, uint16_t)
374 #undef SET_MULTI_LABELED_SPARSE_MATRIX
375 
376 int32_t CLibSVMFile::get_num_lines()
377 {
378  int32_t num_lines=0;
379  while (m_line_reader->has_next())
380  {
381  m_line_reader->skip_line();
382  num_lines++;
383  }
384  m_line_reader->reset();
385 
386  return num_lines;
387 }
388 
389 bool CLibSVMFile::is_feat_entry(const SGVector<char> entry)
390 {
391  CParser* parser = new CParser();
392  parser->set_tokenizer(m_delimiter_feat_tokenizer);
393  parser->set_text(entry);
394  bool isfeat = false;
395 
396  if (parser->has_next())
397  {
398  parser->read_real();
399 
400  if (parser->has_next())
401  isfeat = true;
402 
403  }
404 
405  SG_UNREF(parser);
406 
407  return isfeat;
408 }

SHOGUN Machine Learning Toolbox - Documentation