SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StreamingAsciiFile.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2011 Shashwat Lal Das
8  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
9  */
10 
12 #include <shogun/io/CSVFile.h>
13 #include <shogun/io/SGIO.h>
15 #include <shogun/base/DynArray.h>
16 
17 #include <ctype.h>
18 
19 using namespace shogun;
20 
22  : CStreamingFile()
23 {
24  SG_UNSTABLE("CStreamingAsciiFile::CStreamingAsciiFile()", "\n")
25  m_delimiter = ' ';
26 }
27 
28 CStreamingAsciiFile::CStreamingAsciiFile(const char* fname, char rw)
29  : CStreamingFile(fname, rw)
30 {
31  m_delimiter = ' ';
32 }
33 
35 {
36 }
37 
38 /* Methods for reading dense vectors from an ascii file */
39 
40 #define GET_VECTOR(fname, conv, sg_type) \
41 void CStreamingAsciiFile::get_vector(sg_type*& vector, int32_t& num_feat) \
42 { \
43  char* buffer = NULL; \
44  ssize_t bytes_read; \
45  int32_t old_len = num_feat; \
46  \
47  SG_SET_LOCALE_C; \
48  bytes_read = buf->read_line(buffer); \
49  \
50  if (bytes_read<=0) \
51  { \
52  vector=NULL; \
53  num_feat=-1; \
54  SG_RESET_LOCALE; \
55  return; \
56  } \
57  \
58  /* determine num_feat, populate dynamic array */ \
59  int32_t nf=0; \
60  num_feat=0; \
61  \
62  char* ptr_item=NULL; \
63  char* ptr_data=buffer; \
64  DynArray<char*>* items=new DynArray<char*>(); \
65  \
66  while (*ptr_data) \
67  { \
68  if ((*ptr_data=='\n') || \
69  (ptr_data - buffer >= bytes_read)) \
70  { \
71  if (ptr_item) \
72  nf++; \
73  \
74  append_item(items, ptr_data, ptr_item); \
75  num_feat=nf; \
76  \
77  nf=0; \
78  ptr_item=NULL; \
79  break; \
80  } \
81  else if (!isblank(*ptr_data) && !ptr_item) \
82  { \
83  ptr_item=ptr_data; \
84  } \
85  else if (isblank(*ptr_data) && ptr_item) \
86  { \
87  append_item(items, ptr_data, ptr_item); \
88  ptr_item=NULL; \
89  nf++; \
90  } \
91  \
92  ptr_data++; \
93  } \
94  \
95  SG_DEBUG("num_feat %d\n", num_feat) \
96  \
97  /* now copy data into vector */ \
98  if (old_len < num_feat) \
99  vector=SG_REALLOC(sg_type, vector, old_len, num_feat); \
100  \
101  for (int32_t i=0; i<num_feat; i++) \
102  { \
103  char* item=items->get_element(i); \
104  vector[i]=conv(item); \
105  SG_FREE(item); \
106  } \
107  delete items; \
108  SG_RESET_LOCALE; \
109 }
110 
111 GET_VECTOR(get_bool_vector, str_to_bool, bool)
112 GET_VECTOR(get_byte_vector, atoi, uint8_t)
113 GET_VECTOR(get_char_vector, atoi, char)
114 GET_VECTOR(get_int_vector, atoi, int32_t)
115 GET_VECTOR(get_short_vector, atoi, int16_t)
116 GET_VECTOR(get_word_vector, atoi, uint16_t)
117 GET_VECTOR(get_int8_vector, atoi, int8_t)
118 GET_VECTOR(get_uint_vector, atoi, uint32_t)
119 GET_VECTOR(get_long_vector, atoi, int64_t)
120 GET_VECTOR(get_ulong_vector, atoi, uint64_t)
121 GET_VECTOR(get_longreal_vector, atoi, floatmax_t)
122 #undef GET_VECTOR
123 
124 #define GET_FLOAT_VECTOR(sg_type) \
125  void CStreamingAsciiFile::get_vector(sg_type*& vector, int32_t& len)\
126  { \
127  char *line=NULL; \
128  SG_SET_LOCALE_C; \
129  int32_t num_chars = buf->read_line(line); \
130  int32_t old_len = len; \
131  \
132  if (num_chars == 0) \
133  { \
134  len = -1; \
135  SG_RESET_LOCALE; \
136  return; \
137  } \
138  \
139  substring example_string = {line, line + num_chars}; \
140  \
141  CCSVFile::tokenize(m_delimiter, example_string, words); \
142  \
143  len = words.index(); \
144  substring* feature_start = &words[0]; \
145  \
146  if (len > old_len) \
147  vector = SG_REALLOC(sg_type, vector, old_len, len); \
148  \
149  int32_t j=0; \
150  for (substring* i = feature_start; i != words.end; i++) \
151  { \
152  vector[j++] = SGIO::float_of_substring(*i); \
153  } \
154  SG_RESET_LOCALE; \
155  }
156 
159 #undef GET_FLOAT_VECTOR
160 
161 /* Methods for reading a dense vector and a label from an ascii file */
162 
163 #define GET_VECTOR_AND_LABEL(fname, conv, sg_type) \
164  void CStreamingAsciiFile::get_vector_and_label(sg_type*& vector, int32_t& num_feat, float64_t& label) \
165  { \
166  char* buffer = NULL; \
167  ssize_t bytes_read; \
168  int32_t old_len = num_feat; \
169  SG_SET_LOCALE_C; \
170  \
171  bytes_read = buf->read_line(buffer); \
172  \
173  if (bytes_read<=0) \
174  { \
175  vector=NULL; \
176  num_feat=-1; \
177  SG_RESET_LOCALE; \
178  return; \
179  } \
180  \
181  /* determine num_feat, populate dynamic array */ \
182  int32_t nf=0; \
183  num_feat=0; \
184  \
185  char* ptr_item=NULL; \
186  char* ptr_data=buffer; \
187  DynArray<char*>* items=new DynArray<char*>(); \
188  \
189  while (*ptr_data) \
190  { \
191  if ((*ptr_data=='\n') || \
192  (ptr_data - buffer >= bytes_read)) \
193  { \
194  if (ptr_item) \
195  nf++; \
196  \
197  append_item(items, ptr_data, ptr_item); \
198  num_feat=nf; \
199  \
200  nf=0; \
201  ptr_item=NULL; \
202  break; \
203  } \
204  else if (!isblank(*ptr_data) && !ptr_item) \
205  { \
206  ptr_item=ptr_data; \
207  } \
208  else if (isblank(*ptr_data) && ptr_item) \
209  { \
210  append_item(items, ptr_data, ptr_item); \
211  ptr_item=NULL; \
212  nf++; \
213  } \
214  \
215  ptr_data++; \
216  } \
217  \
218  SG_DEBUG("num_feat %d\n", num_feat) \
219  /* The first element is the label */ \
220  label=atof(items->get_element(0)); \
221  /* now copy rest of the data into vector */ \
222  if (old_len < num_feat - 1) \
223  vector=SG_REALLOC(sg_type, vector, old_len, num_feat-1); \
224  \
225  for (int32_t i=1; i<num_feat; i++) \
226  { \
227  char* item=items->get_element(i); \
228  vector[i-1]=conv(item); \
229  SG_FREE(item); \
230  } \
231  delete items; \
232  num_feat--; \
233  SG_RESET_LOCALE; \
234  }
235 
236 GET_VECTOR_AND_LABEL(get_bool_vector_and_label, str_to_bool, bool)
237 GET_VECTOR_AND_LABEL(get_byte_vector_and_label, atoi, uint8_t)
238 GET_VECTOR_AND_LABEL(get_char_vector_and_label, atoi, char)
239 GET_VECTOR_AND_LABEL(get_int_vector_and_label, atoi, int32_t)
240 GET_VECTOR_AND_LABEL(get_short_vector_and_label, atoi, int16_t)
241 GET_VECTOR_AND_LABEL(get_word_vector_and_label, atoi, uint16_t)
242 GET_VECTOR_AND_LABEL(get_int8_vector_and_label, atoi, int8_t)
243 GET_VECTOR_AND_LABEL(get_uint_vector_and_label, atoi, uint32_t)
244 GET_VECTOR_AND_LABEL(get_long_vector_and_label, atoi, int64_t)
245 GET_VECTOR_AND_LABEL(get_ulong_vector_and_label, atoi, uint64_t)
246 GET_VECTOR_AND_LABEL(get_longreal_vector_and_label, atoi, floatmax_t)
247 #undef GET_VECTOR_AND_LABEL
248 
249 #define GET_FLOAT_VECTOR_AND_LABEL(sg_type) \
250  void CStreamingAsciiFile::get_vector_and_label(sg_type*& vector, int32_t& len, float64_t& label) \
251  { \
252  char *line=NULL; \
253  SG_SET_LOCALE_C; \
254  int32_t num_chars = buf->read_line(line); \
255  int32_t old_len = len; \
256  \
257  if (num_chars == 0) \
258  { \
259  len = -1; \
260  SG_RESET_LOCALE; \
261  return; \
262  } \
263  \
264  substring example_string = {line, line + num_chars}; \
265  \
266  CCSVFile::tokenize(m_delimiter, example_string, words); \
267  \
268  label = SGIO::float_of_substring(words[0]); \
269  \
270  len = words.index() - 1; \
271  substring* feature_start = &words[1]; \
272  \
273  if (len > old_len) \
274  vector = SG_REALLOC(sg_type, vector, old_len, len); \
275  \
276  int32_t j=0; \
277  for (substring* i = feature_start; i != words.end; i++) \
278  { \
279  vector[j++] = SGIO::float_of_substring(*i); \
280  } \
281  SG_RESET_LOCALE; \
282  }
283 
286 #undef GET_FLOAT_VECTOR_AND_LABEL
287 
288 /* Methods for reading a string vector from an ascii file (see StringFeatures) */
289 
290 #define GET_STRING(fname, conv, sg_type) \
291 void CStreamingAsciiFile::get_string(sg_type*& vector, int32_t& len) \
292 { \
293  char* buffer = NULL; \
294  ssize_t bytes_read; \
295  \
296  SG_SET_LOCALE_C; \
297  bytes_read = buf->read_line(buffer); \
298  \
299  if (bytes_read<=1) \
300  { \
301  vector=NULL; \
302  len=-1; \
303  SG_RESET_LOCALE; \
304  return; \
305  } \
306  \
307  SG_DEBUG("Line read from the file:\n%s\n", buffer) \
308  /* Remove the terminating \n */ \
309  if (buffer[bytes_read-1]=='\n') \
310  { \
311  len=bytes_read-1; \
312  buffer[bytes_read-1]='\0'; \
313  } \
314  else \
315  len=bytes_read; \
316  vector=(sg_type *) buffer; \
317  SG_RESET_LOCALE; \
318 }
319 
320 GET_STRING(get_bool_string, str_to_bool, bool)
321 GET_STRING(get_byte_string, atoi, uint8_t)
322 GET_STRING(get_char_string, atoi, char)
323 GET_STRING(get_int_string, atoi, int32_t)
324 GET_STRING(get_shortreal_string, atof, float32_t)
325 GET_STRING(get_real_string, atof, float64_t)
326 GET_STRING(get_short_string, atoi, int16_t)
327 GET_STRING(get_word_string, atoi, uint16_t)
328 GET_STRING(get_int8_string, atoi, int8_t)
329 GET_STRING(get_uint_string, atoi, uint32_t)
330 GET_STRING(get_long_string, atoi, int64_t)
331 GET_STRING(get_ulong_string, atoi, uint64_t)
332 GET_STRING(get_longreal_string, atoi, floatmax_t)
333 #undef GET_STRING
334 
335 /* Methods for reading a string vector and a label from an ascii file */
336 
337 #define GET_STRING_AND_LABEL(fname, conv, sg_type) \
338 void CStreamingAsciiFile::get_string_and_label(sg_type*& vector, int32_t& len, float64_t& label) \
339 { \
340  char* buffer = NULL; \
341  ssize_t bytes_read; \
342  \
343  SG_SET_LOCALE_C; \
344  bytes_read = buf->read_line(buffer); \
345  \
346  if (bytes_read<=1) \
347  { \
348  vector=NULL; \
349  len=-1; \
350  SG_RESET_LOCALE; \
351  return; \
352  } \
353  \
354  int32_t str_start_pos=-1; \
355  \
356  for (int32_t i=0; i<bytes_read; i++) \
357  { \
358  if (buffer[i] == ' ') \
359  { \
360  buffer[i]='\0'; \
361  label=atoi(buffer); \
362  buffer[i]=' '; \
363  str_start_pos=i+1; \
364  break; \
365  } \
366  } \
367  /* If no label found, set vector=NULL and length=-1 */ \
368  if (str_start_pos == -1) \
369  { \
370  vector=NULL; \
371  len=-1; \
372  return; \
373  } \
374  /* Remove terminating \n */ \
375  if (buffer[bytes_read-1]=='\n') \
376  { \
377  buffer[bytes_read-1]='\0'; \
378  len=bytes_read-str_start_pos-1; \
379  } \
380  else \
381  len=bytes_read-str_start_pos; \
382  \
383  vector=(sg_type*) &buffer[str_start_pos]; \
384  SG_RESET_LOCALE; \
385 }
386 
387 GET_STRING_AND_LABEL(get_bool_string_and_label, str_to_bool, bool)
388 GET_STRING_AND_LABEL(get_byte_string_and_label, atoi, uint8_t)
389 GET_STRING_AND_LABEL(get_char_string_and_label, atoi, char)
390 GET_STRING_AND_LABEL(get_int_string_and_label, atoi, int32_t)
391 GET_STRING_AND_LABEL(get_shortreal_string_and_label, atof, float32_t)
392 GET_STRING_AND_LABEL(get_real_string_and_label, atof, float64_t)
393 GET_STRING_AND_LABEL(get_short_string_and_label, atoi, int16_t)
394 GET_STRING_AND_LABEL(get_word_string_and_label, atoi, uint16_t)
395 GET_STRING_AND_LABEL(get_int8_string_and_label, atoi, int8_t)
396 GET_STRING_AND_LABEL(get_uint_string_and_label, atoi, uint32_t)
397 GET_STRING_AND_LABEL(get_long_string_and_label, atoi, int64_t)
398 GET_STRING_AND_LABEL(get_ulong_string_and_label, atoi, uint64_t)
399 GET_STRING_AND_LABEL(get_longreal_string_and_label, atoi, floatmax_t)
400 #undef GET_STRING_AND_LABEL
401 
402 /* Methods for reading a sparse vector from an ascii file */
403 
404 #define GET_SPARSE_VECTOR(fname, conv, sg_type) \
405 void CStreamingAsciiFile::get_sparse_vector(SGSparseVectorEntry<sg_type>*& vector, int32_t& len) \
406 { \
407  char* buffer = NULL; \
408  ssize_t bytes_read; \
409  SG_SET_LOCALE_C; \
410  \
411  bytes_read = buf->read_line(buffer); \
412  \
413  if (bytes_read<=1) \
414  { \
415  vector=NULL; \
416  len=-1; \
417  SG_RESET_LOCALE; \
418  return; \
419  } \
420  \
421  /* Remove terminating \n */ \
422  int32_t num_chars; \
423  if (buffer[bytes_read-1]=='\n') \
424  { \
425  num_chars=bytes_read-1; \
426  buffer[num_chars]='\0'; \
427  } \
428  else \
429  num_chars=bytes_read; \
430  \
431  int32_t num_dims=0; \
432  for (int32_t i=0; i<num_chars; i++) \
433  { \
434  if (buffer[i]==':') \
435  { \
436  num_dims++; \
437  } \
438  } \
439  \
440  int32_t index_start_pos=-1; \
441  int32_t feature_start_pos; \
442  int32_t current_feat=0; \
443  if (len < num_dims) \
444  vector=SG_REALLOC(SGSparseVectorEntry<sg_type>, vector, len, num_dims); \
445  for (int32_t i=0; i<num_chars; i++) \
446  { \
447  if (buffer[i]==':') \
448  { \
449  buffer[i]='\0'; \
450  vector[current_feat].feat_index=(int32_t) atoi(buffer+index_start_pos)-1; \
451  /* Unset index_start_pos */ \
452  index_start_pos=-1; \
453  \
454  feature_start_pos=i+1; \
455  while ((buffer[i]!=' ') && (i<num_chars)) \
456  { \
457  i++; \
458  } \
459  \
460  buffer[i]='\0'; \
461  vector[current_feat].entry=(sg_type) conv(buffer+feature_start_pos); \
462  \
463  current_feat++; \
464  } \
465  else if (buffer[i]==' ') \
466  i++; \
467  else \
468  { \
469  /* Set index_start_pos if not set already */ \
470  /* if already set, it means the index is */ \
471  /* more than one digit long. */ \
472  if (index_start_pos == -1) \
473  index_start_pos=i; \
474  } \
475  } \
476  \
477  len=current_feat; \
478  SG_RESET_LOCALE; \
479 }
480 
481 GET_SPARSE_VECTOR(get_bool_sparse_vector, str_to_bool, bool)
482 GET_SPARSE_VECTOR(get_byte_sparse_vector, atoi, uint8_t)
483 GET_SPARSE_VECTOR(get_char_sparse_vector, atoi, char)
484 GET_SPARSE_VECTOR(get_int_sparse_vector, atoi, int32_t)
485 GET_SPARSE_VECTOR(get_shortreal_sparse_vector, atof, float32_t)
486 GET_SPARSE_VECTOR(get_real_sparse_vector, atof, float64_t)
487 GET_SPARSE_VECTOR(get_short_sparse_vector, atoi, int16_t)
488 GET_SPARSE_VECTOR(get_word_sparse_vector, atoi, uint16_t)
489 GET_SPARSE_VECTOR(get_int8_sparse_vector, atoi, int8_t)
490 GET_SPARSE_VECTOR(get_uint_sparse_vector, atoi, uint32_t)
491 GET_SPARSE_VECTOR(get_long_sparse_vector, atoi, int64_t)
492 GET_SPARSE_VECTOR(get_ulong_sparse_vector, atoi, uint64_t)
493 GET_SPARSE_VECTOR(get_longreal_sparse_vector, atoi, floatmax_t)
494 #undef GET_SPARSE_VECTOR
495 
496 /* Methods for reading a sparse vector and a label from an ascii file */
497 
498 #define GET_SPARSE_VECTOR_AND_LABEL(fname, conv, sg_type) \
499 void CStreamingAsciiFile::get_sparse_vector_and_label(SGSparseVectorEntry<sg_type>*& vector, int32_t& len, float64_t& label) \
500 { \
501  char* buffer = NULL; \
502  ssize_t bytes_read; \
503  SG_SET_LOCALE_C; \
504  \
505  bytes_read = buf->read_line(buffer); \
506  \
507  if (bytes_read<=1) \
508  { \
509  vector=NULL; \
510  len=-1; \
511  SG_RESET_LOCALE; \
512  return; \
513  } \
514  \
515  /* Remove terminating \n */ \
516  int32_t num_chars; \
517  if (buffer[bytes_read-1]=='\n') \
518  { \
519  num_chars=bytes_read-1; \
520  buffer[num_chars]='\0'; \
521  } \
522  else \
523  num_chars=bytes_read; \
524  \
525  int32_t num_dims=0; \
526  for (int32_t i=0; i<num_chars; i++) \
527  { \
528  if (buffer[i]==':') \
529  { \
530  num_dims++; \
531  } \
532  } \
533  \
534  int32_t index_start_pos=-1; \
535  int32_t feature_start_pos; \
536  int32_t current_feat=0; \
537  int32_t label_pos=-1; \
538  if (len < num_dims) \
539  vector=SG_REALLOC(SGSparseVectorEntry<sg_type>, vector, len, num_dims); \
540  \
541  for (int32_t i=1; i<num_chars; i++) \
542  { \
543  if (buffer[i]==':') \
544  { \
545  break; \
546  } \
547  if ( (buffer[i]==' ') && (buffer[i-1]!=' ') ) \
548  { \
549  buffer[i]='\0'; \
550  label_pos=i; \
551  label=atof(buffer); \
552  break; \
553  } \
554  } \
555  \
556  if (label_pos==-1) \
557  SG_ERROR("No label found!\n") \
558  \
559  buffer+=label_pos+1; \
560  num_chars-=label_pos+1; \
561  for (int32_t i=0; i<num_chars; i++) \
562  { \
563  if (buffer[i]==':') \
564  { \
565  buffer[i]='\0'; \
566  vector[current_feat].feat_index=(int32_t) atoi(buffer+index_start_pos)-1; \
567  /* Unset index_start_pos */ \
568  index_start_pos=-1; \
569  \
570  feature_start_pos=i+1; \
571  while ((buffer[i]!=' ') && (i<num_chars)) \
572  { \
573  i++; \
574  } \
575  \
576  buffer[i]='\0'; \
577  vector[current_feat].entry=(sg_type) conv(buffer+feature_start_pos); \
578  \
579  current_feat++; \
580  } \
581  else if (buffer[i]==' ') \
582  i++; \
583  else \
584  { \
585  /* Set index_start_pos if not set already */ \
586  /* if already set, it means the index is */ \
587  /* more than one digit long. */ \
588  if (index_start_pos == -1) \
589  index_start_pos=i; \
590  } \
591  } \
592  \
593  len=current_feat; \
594  SG_RESET_LOCALE; \
595 }
596 
597 GET_SPARSE_VECTOR_AND_LABEL(get_bool_sparse_vector_and_label, str_to_bool, bool)
598 GET_SPARSE_VECTOR_AND_LABEL(get_byte_sparse_vector_and_label, atoi, uint8_t)
599 GET_SPARSE_VECTOR_AND_LABEL(get_char_sparse_vector_and_label, atoi, char)
600 GET_SPARSE_VECTOR_AND_LABEL(get_int_sparse_vector_and_label, atoi, int32_t)
601 GET_SPARSE_VECTOR_AND_LABEL(get_shortreal_sparse_vector_and_label, atof, float32_t)
602 GET_SPARSE_VECTOR_AND_LABEL(get_real_sparse_vector_and_label, atof, float64_t)
603 GET_SPARSE_VECTOR_AND_LABEL(get_short_sparse_vector_and_label, atoi, int16_t)
604 GET_SPARSE_VECTOR_AND_LABEL(get_word_sparse_vector_and_label, atoi, uint16_t)
605 GET_SPARSE_VECTOR_AND_LABEL(get_int8_sparse_vector_and_label, atoi, int8_t)
606 GET_SPARSE_VECTOR_AND_LABEL(get_uint_sparse_vector_and_label, atoi, uint32_t)
607 GET_SPARSE_VECTOR_AND_LABEL(get_long_sparse_vector_and_label, atoi, int64_t)
608 GET_SPARSE_VECTOR_AND_LABEL(get_ulong_sparse_vector_and_label, atoi, uint64_t)
609 GET_SPARSE_VECTOR_AND_LABEL(get_longreal_sparse_vector_and_label, atoi, floatmax_t)
610 #undef GET_SPARSE_VECTOR_AND_LABEL
611 
612 template <class T>
613 void CStreamingAsciiFile::append_item(
614  DynArray<T>* items, char* ptr_data, char* ptr_item)
615 {
616  REQUIRE(ptr_data && ptr_item, "Data and Item to append should not be NULL\n");
617 
618  size_t len=(ptr_data-ptr_item)/sizeof(char);
619  char* item=SG_MALLOC(char, len+1);
620  memset(item, 0, sizeof(char)*(len+1));
621  item=strncpy(item, ptr_item, len);
622 
623  SG_DEBUG("current %c, len %d, item %s\n", *ptr_data, len, item)
624  items->append_element(item);
625 }
626 
628 {
629  m_delimiter = delimiter;
630 }

SHOGUN Machine Learning Toolbox - Documentation