SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StreamingAsciiFile.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2011 Shashwat Lal Das
8  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
9  */
10 
12 #include <shogun/io/SGIO.h>
14 #include <shogun/base/DynArray.h>
15 
16 #include <ctype.h>
17 
18 using namespace shogun;
19 
21  : CStreamingFile()
22 {
23  SG_UNSTABLE("CStreamingAsciiFile::CStreamingAsciiFile()", "\n")
24  m_delimiter = ' ';
25 }
26 
27 CStreamingAsciiFile::CStreamingAsciiFile(const char* fname, char rw)
28  : CStreamingFile(fname, rw)
29 {
30  m_delimiter = ' ';
31 }
32 
34 {
35 }
36 
37 /* Methods for reading dense vectors from an ascii file */
38 
39 #define GET_VECTOR(fname, conv, sg_type) \
40 void CStreamingAsciiFile::get_vector(sg_type*& vector, int32_t& num_feat) \
41 { \
42  char* buffer = NULL; \
43  ssize_t bytes_read; \
44  int32_t old_len = num_feat; \
45  \
46  SG_SET_LOCALE_C; \
47  bytes_read = buf->read_line(buffer); \
48  \
49  if (bytes_read<=0) \
50  { \
51  vector=NULL; \
52  num_feat=-1; \
53  SG_RESET_LOCALE; \
54  return; \
55  } \
56  \
57  /* determine num_feat, populate dynamic array */ \
58  int32_t nf=0; \
59  num_feat=0; \
60  \
61  char* ptr_item=NULL; \
62  char* ptr_data=buffer; \
63  DynArray<char*>* items=new DynArray<char*>(); \
64  \
65  while (*ptr_data) \
66  { \
67  if ((*ptr_data=='\n') || \
68  (ptr_data - buffer >= bytes_read)) \
69  { \
70  if (ptr_item) \
71  nf++; \
72  \
73  append_item(items, ptr_data, ptr_item); \
74  num_feat=nf; \
75  \
76  nf=0; \
77  ptr_item=NULL; \
78  break; \
79  } \
80  else if (!isblank(*ptr_data) && !ptr_item) \
81  { \
82  ptr_item=ptr_data; \
83  } \
84  else if (isblank(*ptr_data) && ptr_item) \
85  { \
86  append_item(items, ptr_data, ptr_item); \
87  ptr_item=NULL; \
88  nf++; \
89  } \
90  \
91  ptr_data++; \
92  } \
93  \
94  SG_DEBUG("num_feat %d\n", num_feat) \
95  \
96  /* now copy data into vector */ \
97  if (old_len < num_feat) \
98  vector=SG_REALLOC(sg_type, vector, old_len, num_feat); \
99  \
100  for (int32_t i=0; i<num_feat; i++) \
101  { \
102  char* item=items->get_element(i); \
103  vector[i]=conv(item); \
104  SG_FREE(item); \
105  } \
106  delete items; \
107  SG_RESET_LOCALE; \
108 }
109 
110 GET_VECTOR(get_bool_vector, str_to_bool, bool)
111 GET_VECTOR(get_byte_vector, atoi, uint8_t)
112 GET_VECTOR(get_char_vector, atoi, char)
113 GET_VECTOR(get_int_vector, atoi, int32_t)
114 GET_VECTOR(get_short_vector, atoi, int16_t)
115 GET_VECTOR(get_word_vector, atoi, uint16_t)
116 GET_VECTOR(get_int8_vector, atoi, int8_t)
117 GET_VECTOR(get_uint_vector, atoi, uint32_t)
118 GET_VECTOR(get_long_vector, atoi, int64_t)
119 GET_VECTOR(get_ulong_vector, atoi, uint64_t)
120 GET_VECTOR(get_longreal_vector, atoi, floatmax_t)
121 #undef GET_VECTOR
122 
/* Defines CStreamingAsciiFile::get_vector() for one floating point type.
 *
 * The generated method reads one line from buf, splits it at m_delimiter
 * with tokenize() into the member array `words`, and converts each token
 * with SGIO::float_of_substring.
 *
 *   vector - output array; enlarged with SG_REALLOC when the incoming len
 *            is smaller than the number of tokens
 *   len    - in:  length passed to SG_REALLOC as the old size of vector
 *            out: number of tokens, or -1 when read_line() returned a
 *                 non-positive count (vector is left untouched then)
 */
#define GET_FLOAT_VECTOR(sg_type) \
void CStreamingAsciiFile::get_vector(sg_type*& vector, int32_t& len) \
{ \
    char* line=NULL; \
    SG_SET_LOCALE_C; \
    int32_t num_chars = buf->read_line(line); \
    int32_t old_len = len; \
    \
    /* a non-positive count means there is no line to parse; a negative */ \
    /* one must not be used to build the substring below                */ \
    if (num_chars <= 0) \
    { \
        len = -1; \
        SG_RESET_LOCALE; \
        return; \
    } \
    \
    substring example_string = {line, line + num_chars}; \
    tokenize(m_delimiter, example_string, words); \
    \
    len = words.index(); \
    if (len > old_len) \
        vector = SG_REALLOC(sg_type, vector, old_len, len); \
    \
    int32_t j=0; \
    for (substring* i = &words[0]; i != words.end; i++) \
        vector[j++] = SGIO::float_of_substring(*i); \
    \
    SG_RESET_LOCALE; \
}
155 
158 #undef GET_FLOAT_VECTOR
159 
160 /* Methods for reading a dense vector and a label from an ascii file */
161 
162 #define GET_VECTOR_AND_LABEL(fname, conv, sg_type) \
163  void CStreamingAsciiFile::get_vector_and_label(sg_type*& vector, int32_t& num_feat, float64_t& label) \
164  { \
165  char* buffer = NULL; \
166  ssize_t bytes_read; \
167  int32_t old_len = num_feat; \
168  SG_SET_LOCALE_C; \
169  \
170  bytes_read = buf->read_line(buffer); \
171  \
172  if (bytes_read<=0) \
173  { \
174  vector=NULL; \
175  num_feat=-1; \
176  SG_RESET_LOCALE; \
177  return; \
178  } \
179  \
180  /* determine num_feat, populate dynamic array */ \
181  int32_t nf=0; \
182  num_feat=0; \
183  \
184  char* ptr_item=NULL; \
185  char* ptr_data=buffer; \
186  DynArray<char*>* items=new DynArray<char*>(); \
187  \
188  while (*ptr_data) \
189  { \
190  if ((*ptr_data=='\n') || \
191  (ptr_data - buffer >= bytes_read)) \
192  { \
193  if (ptr_item) \
194  nf++; \
195  \
196  append_item(items, ptr_data, ptr_item); \
197  num_feat=nf; \
198  \
199  nf=0; \
200  ptr_item=NULL; \
201  break; \
202  } \
203  else if (!isblank(*ptr_data) && !ptr_item) \
204  { \
205  ptr_item=ptr_data; \
206  } \
207  else if (isblank(*ptr_data) && ptr_item) \
208  { \
209  append_item(items, ptr_data, ptr_item); \
210  ptr_item=NULL; \
211  nf++; \
212  } \
213  \
214  ptr_data++; \
215  } \
216  \
217  SG_DEBUG("num_feat %d\n", num_feat) \
218  /* The first element is the label */ \
219  label=atof(items->get_element(0)); \
220  /* now copy rest of the data into vector */ \
221  if (old_len < num_feat - 1) \
222  vector=SG_REALLOC(sg_type, vector, old_len, num_feat-1); \
223  \
224  for (int32_t i=1; i<num_feat; i++) \
225  { \
226  char* item=items->get_element(i); \
227  vector[i-1]=conv(item); \
228  SG_FREE(item); \
229  } \
230  delete items; \
231  num_feat--; \
232  SG_RESET_LOCALE; \
233  }
234 
235 GET_VECTOR_AND_LABEL(get_bool_vector_and_label, str_to_bool, bool)
236 GET_VECTOR_AND_LABEL(get_byte_vector_and_label, atoi, uint8_t)
237 GET_VECTOR_AND_LABEL(get_char_vector_and_label, atoi, char)
238 GET_VECTOR_AND_LABEL(get_int_vector_and_label, atoi, int32_t)
239 GET_VECTOR_AND_LABEL(get_short_vector_and_label, atoi, int16_t)
240 GET_VECTOR_AND_LABEL(get_word_vector_and_label, atoi, uint16_t)
241 GET_VECTOR_AND_LABEL(get_int8_vector_and_label, atoi, int8_t)
242 GET_VECTOR_AND_LABEL(get_uint_vector_and_label, atoi, uint32_t)
243 GET_VECTOR_AND_LABEL(get_long_vector_and_label, atoi, int64_t)
244 GET_VECTOR_AND_LABEL(get_ulong_vector_and_label, atoi, uint64_t)
245 GET_VECTOR_AND_LABEL(get_longreal_vector_and_label, atoi, floatmax_t)
246 #undef GET_VECTOR_AND_LABEL
247 
/* Defines CStreamingAsciiFile::get_vector_and_label() for one floating
 * point type.
 *
 * The generated method reads one line from buf and splits it at
 * m_delimiter with tokenize() into the member array `words`. The first
 * token becomes `label`, the remaining tokens are converted with
 * SGIO::float_of_substring and stored in `vector`.
 *
 *   vector - output array; enlarged with SG_REALLOC when the incoming len
 *            is smaller than the number of feature tokens
 *   len    - in:  length passed to SG_REALLOC as the old size of vector
 *            out: number of feature tokens (label excluded), or -1 when
 *                 read_line() returned a non-positive count
 *   label  - output, value of the first token
 *
 * A line that yields no token at all is reported through SG_ERROR.
 */
#define GET_FLOAT_VECTOR_AND_LABEL(sg_type) \
void CStreamingAsciiFile::get_vector_and_label(sg_type*& vector, int32_t& len, float64_t& label) \
{ \
    char* line=NULL; \
    SG_SET_LOCALE_C; \
    int32_t num_chars = buf->read_line(line); \
    int32_t old_len = len; \
    \
    /* a non-positive count means there is no line to parse; a negative */ \
    /* one must not be used to build the substring below                */ \
    if (num_chars <= 0) \
    { \
        len = -1; \
        SG_RESET_LOCALE; \
        return; \
    } \
    \
    substring example_string = {line, line + num_chars}; \
    tokenize(m_delimiter, example_string, words); \
    \
    if (words.index() == 0) \
    { \
        /* without a token there is no label, and iterating from */ \
        /* &words[1] would start beyond words.end                */ \
        len = -1; \
        SG_RESET_LOCALE; \
        SG_ERROR("No label found!\n") \
        /* presumably SG_ERROR does not return; bail out if it does */ \
        return; \
    } \
    \
    label = SGIO::float_of_substring(words[0]); \
    \
    /* everything after the label is a feature */ \
    len = words.index() - 1; \
    if (len > old_len) \
        vector = SG_REALLOC(sg_type, vector, old_len, len); \
    \
    int32_t j=0; \
    for (substring* i = &words[1]; i != words.end; i++) \
        vector[j++] = SGIO::float_of_substring(*i); \
    \
    SG_RESET_LOCALE; \
}
282 
285 #undef GET_FLOAT_VECTOR_AND_LABEL
286 
287 /* Methods for reading a string vector from an ascii file (see StringFeatures) */
288 
289 #define GET_STRING(fname, conv, sg_type) \
290 void CStreamingAsciiFile::get_string(sg_type*& vector, int32_t& len) \
291 { \
292  char* buffer = NULL; \
293  ssize_t bytes_read; \
294  \
295  SG_SET_LOCALE_C; \
296  bytes_read = buf->read_line(buffer); \
297  \
298  if (bytes_read<=1) \
299  { \
300  vector=NULL; \
301  len=-1; \
302  SG_RESET_LOCALE; \
303  return; \
304  } \
305  \
306  SG_DEBUG("Line read from the file:\n%s\n", buffer) \
307  /* Remove the terminating \n */ \
308  if (buffer[bytes_read-1]=='\n') \
309  { \
310  len=bytes_read-1; \
311  buffer[bytes_read-1]='\0'; \
312  } \
313  else \
314  len=bytes_read; \
315  vector=(sg_type *) buffer; \
316  SG_RESET_LOCALE; \
317 }
318 
319 GET_STRING(get_bool_string, str_to_bool, bool)
320 GET_STRING(get_byte_string, atoi, uint8_t)
321 GET_STRING(get_char_string, atoi, char)
322 GET_STRING(get_int_string, atoi, int32_t)
323 GET_STRING(get_shortreal_string, atof, float32_t)
324 GET_STRING(get_real_string, atof, float64_t)
325 GET_STRING(get_short_string, atoi, int16_t)
326 GET_STRING(get_word_string, atoi, uint16_t)
327 GET_STRING(get_int8_string, atoi, int8_t)
328 GET_STRING(get_uint_string, atoi, uint32_t)
329 GET_STRING(get_long_string, atoi, int64_t)
330 GET_STRING(get_ulong_string, atoi, uint64_t)
331 GET_STRING(get_longreal_string, atoi, floatmax_t)
332 #undef GET_STRING
333 
334 /* Methods for reading a string vector and a label from an ascii file */
335 
336 #define GET_STRING_AND_LABEL(fname, conv, sg_type) \
337 void CStreamingAsciiFile::get_string_and_label(sg_type*& vector, int32_t& len, float64_t& label) \
338 { \
339  char* buffer = NULL; \
340  ssize_t bytes_read; \
341  \
342  SG_SET_LOCALE_C; \
343  bytes_read = buf->read_line(buffer); \
344  \
345  if (bytes_read<=1) \
346  { \
347  vector=NULL; \
348  len=-1; \
349  SG_RESET_LOCALE; \
350  return; \
351  } \
352  \
353  int32_t str_start_pos=-1; \
354  \
355  for (int32_t i=0; i<bytes_read; i++) \
356  { \
357  if (buffer[i] == ' ') \
358  { \
359  buffer[i]='\0'; \
360  label=atoi(buffer); \
361  buffer[i]=' '; \
362  str_start_pos=i+1; \
363  break; \
364  } \
365  } \
366  /* If no label found, set vector=NULL and length=-1 */ \
367  if (str_start_pos == -1) \
368  { \
369  vector=NULL; \
370  len=-1; \
371  return; \
372  } \
373  /* Remove terminating \n */ \
374  if (buffer[bytes_read-1]=='\n') \
375  { \
376  buffer[bytes_read-1]='\0'; \
377  len=bytes_read-str_start_pos-1; \
378  } \
379  else \
380  len=bytes_read-str_start_pos; \
381  \
382  vector=(sg_type*) &buffer[str_start_pos]; \
383  SG_RESET_LOCALE; \
384 }
385 
386 GET_STRING_AND_LABEL(get_bool_string_and_label, str_to_bool, bool)
387 GET_STRING_AND_LABEL(get_byte_string_and_label, atoi, uint8_t)
388 GET_STRING_AND_LABEL(get_char_string_and_label, atoi, char)
389 GET_STRING_AND_LABEL(get_int_string_and_label, atoi, int32_t)
390 GET_STRING_AND_LABEL(get_shortreal_string_and_label, atof, float32_t)
391 GET_STRING_AND_LABEL(get_real_string_and_label, atof, float64_t)
392 GET_STRING_AND_LABEL(get_short_string_and_label, atoi, int16_t)
393 GET_STRING_AND_LABEL(get_word_string_and_label, atoi, uint16_t)
394 GET_STRING_AND_LABEL(get_int8_string_and_label, atoi, int8_t)
395 GET_STRING_AND_LABEL(get_uint_string_and_label, atoi, uint32_t)
396 GET_STRING_AND_LABEL(get_long_string_and_label, atoi, int64_t)
397 GET_STRING_AND_LABEL(get_ulong_string_and_label, atoi, uint64_t)
398 GET_STRING_AND_LABEL(get_longreal_string_and_label, atoi, floatmax_t)
399 #undef GET_STRING_AND_LABEL
400 
401 /* Methods for reading a sparse vector from an ascii file */
402 
403 #define GET_SPARSE_VECTOR(fname, conv, sg_type) \
404 void CStreamingAsciiFile::get_sparse_vector(SGSparseVectorEntry<sg_type>*& vector, int32_t& len) \
405 { \
406  char* buffer = NULL; \
407  ssize_t bytes_read; \
408  SG_SET_LOCALE_C; \
409  \
410  bytes_read = buf->read_line(buffer); \
411  \
412  if (bytes_read<=1) \
413  { \
414  vector=NULL; \
415  len=-1; \
416  SG_RESET_LOCALE; \
417  return; \
418  } \
419  \
420  /* Remove terminating \n */ \
421  int32_t num_chars; \
422  if (buffer[bytes_read-1]=='\n') \
423  { \
424  num_chars=bytes_read-1; \
425  buffer[num_chars]='\0'; \
426  } \
427  else \
428  num_chars=bytes_read; \
429  \
430  int32_t num_dims=0; \
431  for (int32_t i=0; i<num_chars; i++) \
432  { \
433  if (buffer[i]==':') \
434  { \
435  num_dims++; \
436  } \
437  } \
438  \
439  int32_t index_start_pos=-1; \
440  int32_t feature_start_pos; \
441  int32_t current_feat=0; \
442  if (len < num_dims) \
443  vector=SG_REALLOC(SGSparseVectorEntry<sg_type>, vector, len, num_dims); \
444  for (int32_t i=0; i<num_chars; i++) \
445  { \
446  if (buffer[i]==':') \
447  { \
448  buffer[i]='\0'; \
449  vector[current_feat].feat_index=(int32_t) atoi(buffer+index_start_pos)-1; \
450  /* Unset index_start_pos */ \
451  index_start_pos=-1; \
452  \
453  feature_start_pos=i+1; \
454  while ((buffer[i]!=' ') && (i<num_chars)) \
455  { \
456  i++; \
457  } \
458  \
459  buffer[i]='\0'; \
460  vector[current_feat].entry=(sg_type) conv(buffer+feature_start_pos); \
461  \
462  current_feat++; \
463  } \
464  else if (buffer[i]==' ') \
465  i++; \
466  else \
467  { \
468  /* Set index_start_pos if not set already */ \
469  /* if already set, it means the index is */ \
470  /* more than one digit long. */ \
471  if (index_start_pos == -1) \
472  index_start_pos=i; \
473  } \
474  } \
475  \
476  len=current_feat; \
477  SG_RESET_LOCALE; \
478 }
479 
480 GET_SPARSE_VECTOR(get_bool_sparse_vector, str_to_bool, bool)
481 GET_SPARSE_VECTOR(get_byte_sparse_vector, atoi, uint8_t)
482 GET_SPARSE_VECTOR(get_char_sparse_vector, atoi, char)
483 GET_SPARSE_VECTOR(get_int_sparse_vector, atoi, int32_t)
484 GET_SPARSE_VECTOR(get_shortreal_sparse_vector, atof, float32_t)
485 GET_SPARSE_VECTOR(get_real_sparse_vector, atof, float64_t)
486 GET_SPARSE_VECTOR(get_short_sparse_vector, atoi, int16_t)
487 GET_SPARSE_VECTOR(get_word_sparse_vector, atoi, uint16_t)
488 GET_SPARSE_VECTOR(get_int8_sparse_vector, atoi, int8_t)
489 GET_SPARSE_VECTOR(get_uint_sparse_vector, atoi, uint32_t)
490 GET_SPARSE_VECTOR(get_long_sparse_vector, atoi, int64_t)
491 GET_SPARSE_VECTOR(get_ulong_sparse_vector, atoi, uint64_t)
492 GET_SPARSE_VECTOR(get_longreal_sparse_vector, atoi, floatmax_t)
493 #undef GET_SPARSE_VECTOR
494 
495 /* Methods for reading a sparse vector and a label from an ascii file */
496 
497 #define GET_SPARSE_VECTOR_AND_LABEL(fname, conv, sg_type) \
498 void CStreamingAsciiFile::get_sparse_vector_and_label(SGSparseVectorEntry<sg_type>*& vector, int32_t& len, float64_t& label) \
499 { \
500  char* buffer = NULL; \
501  ssize_t bytes_read; \
502  SG_SET_LOCALE_C; \
503  \
504  bytes_read = buf->read_line(buffer); \
505  \
506  if (bytes_read<=1) \
507  { \
508  vector=NULL; \
509  len=-1; \
510  SG_RESET_LOCALE; \
511  return; \
512  } \
513  \
514  /* Remove terminating \n */ \
515  int32_t num_chars; \
516  if (buffer[bytes_read-1]=='\n') \
517  { \
518  num_chars=bytes_read-1; \
519  buffer[num_chars]='\0'; \
520  } \
521  else \
522  num_chars=bytes_read; \
523  \
524  int32_t num_dims=0; \
525  for (int32_t i=0; i<num_chars; i++) \
526  { \
527  if (buffer[i]==':') \
528  { \
529  num_dims++; \
530  } \
531  } \
532  \
533  int32_t index_start_pos=-1; \
534  int32_t feature_start_pos; \
535  int32_t current_feat=0; \
536  int32_t label_pos=-1; \
537  if (len < num_dims) \
538  vector=SG_REALLOC(SGSparseVectorEntry<sg_type>, vector, len, num_dims); \
539  \
540  for (int32_t i=1; i<num_chars; i++) \
541  { \
542  if (buffer[i]==':') \
543  { \
544  break; \
545  } \
546  if ( (buffer[i]==' ') && (buffer[i-1]!=' ') ) \
547  { \
548  buffer[i]='\0'; \
549  label_pos=i; \
550  label=atof(buffer); \
551  break; \
552  } \
553  } \
554  \
555  if (label_pos==-1) \
556  SG_ERROR("No label found!\n") \
557  \
558  buffer+=label_pos+1; \
559  num_chars-=label_pos+1; \
560  for (int32_t i=0; i<num_chars; i++) \
561  { \
562  if (buffer[i]==':') \
563  { \
564  buffer[i]='\0'; \
565  vector[current_feat].feat_index=(int32_t) atoi(buffer+index_start_pos)-1; \
566  /* Unset index_start_pos */ \
567  index_start_pos=-1; \
568  \
569  feature_start_pos=i+1; \
570  while ((buffer[i]!=' ') && (i<num_chars)) \
571  { \
572  i++; \
573  } \
574  \
575  buffer[i]='\0'; \
576  vector[current_feat].entry=(sg_type) conv(buffer+feature_start_pos); \
577  \
578  current_feat++; \
579  } \
580  else if (buffer[i]==' ') \
581  i++; \
582  else \
583  { \
584  /* Set index_start_pos if not set already */ \
585  /* if already set, it means the index is */ \
586  /* more than one digit long. */ \
587  if (index_start_pos == -1) \
588  index_start_pos=i; \
589  } \
590  } \
591  \
592  len=current_feat; \
593  SG_RESET_LOCALE; \
594 }
595 
596 GET_SPARSE_VECTOR_AND_LABEL(get_bool_sparse_vector_and_label, str_to_bool, bool)
597 GET_SPARSE_VECTOR_AND_LABEL(get_byte_sparse_vector_and_label, atoi, uint8_t)
598 GET_SPARSE_VECTOR_AND_LABEL(get_char_sparse_vector_and_label, atoi, char)
599 GET_SPARSE_VECTOR_AND_LABEL(get_int_sparse_vector_and_label, atoi, int32_t)
600 GET_SPARSE_VECTOR_AND_LABEL(get_shortreal_sparse_vector_and_label, atof, float32_t)
601 GET_SPARSE_VECTOR_AND_LABEL(get_real_sparse_vector_and_label, atof, float64_t)
602 GET_SPARSE_VECTOR_AND_LABEL(get_short_sparse_vector_and_label, atoi, int16_t)
603 GET_SPARSE_VECTOR_AND_LABEL(get_word_sparse_vector_and_label, atoi, uint16_t)
604 GET_SPARSE_VECTOR_AND_LABEL(get_int8_sparse_vector_and_label, atoi, int8_t)
605 GET_SPARSE_VECTOR_AND_LABEL(get_uint_sparse_vector_and_label, atoi, uint32_t)
606 GET_SPARSE_VECTOR_AND_LABEL(get_long_sparse_vector_and_label, atoi, int64_t)
607 GET_SPARSE_VECTOR_AND_LABEL(get_ulong_sparse_vector_and_label, atoi, uint64_t)
608 GET_SPARSE_VECTOR_AND_LABEL(get_longreal_sparse_vector_and_label, atoi, floatmax_t)
609 #undef GET_SPARSE_VECTOR_AND_LABEL
610 
611 template <class T>
612 void CStreamingAsciiFile::append_item(
613  DynArray<T>* items, char* ptr_data, char* ptr_item)
614 {
615  REQUIRE(ptr_data && ptr_item, "Data and Item to append should not be NULL\n");
616 
617  size_t len=(ptr_data-ptr_item)/sizeof(char);
618  char* item=SG_MALLOC(char, len+1);
619  memset(item, 0, sizeof(char)*(len+1));
620  item=strncpy(item, ptr_item, len);
621 
622  SG_DEBUG("current %c, len %d, item %s\n", *ptr_data, len, item)
623  items->append_element(item);
624 }
625 
627 {
628  m_delimiter = delimiter;
629 }
630 void CStreamingAsciiFile::tokenize(char delim, substring s, v_array<substring>& ret)
631 {
632  ret.erase();
633  char *last = s.start;
634  for (; s.start != s.end; s.start++)
635  {
636  if (*s.start == delim)
637  {
638  if (s.start != last)
639  {
640  substring temp = {last,s.start};
641  ret.push(temp);
642  }
643  last = s.start+1;
644  }
645  }
646  if (s.start != last)
647  {
648  substring final = {last, s.start};
649  ret.push(final);
650  }
651 }

SHOGUN Machine Learning Toolbox - Documentation