SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MLDataHDF5File.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Copyright (C) 2013 Zhengyang Liu (zhengyangl)
8  */
9 
10 #include <shogun/lib/config.h>
11 
12 #if defined(HAVE_HDF5) && defined( HAVE_CURL)
13 
14 #include <stdio.h>
15 #include <string.h>
16 #include <hdf5.h>
17 #include <curl/curl.h>
18 #include <shogun/lib/memory.h>
20 
21 using namespace shogun;
22 
23 CMLDataHDF5File::CMLDataHDF5File()
24 {
25  SG_UNSTABLE("CMLDataHDF5File::CMLDataHDF5File()", "\n")
26 
27  get_boolean_type();
28  h5file = -1;
29 }
30 
/** Write callback handed to libcurl via CURLOPT_WRITEFUNCTION.
 *
 * Appends the received chunk to a stdio stream.
 *
 * @param ptr start of the received chunk
 * @param size size of one element in bytes
 * @param nmemb number of elements in the chunk
 * @param stream destination stream (installed with CURLOPT_WRITEDATA)
 * @return number of bytes written to stream
 */
size_t write_data(void *ptr, size_t size, size_t nmemb, FILE *stream) {
	const size_t items_written = fwrite(ptr, size, nmemb, stream);

	/* fwrite() reports elements, the caller expects a byte count */
	return items_written*size;
}
35 
36 CMLDataHDF5File::CMLDataHDF5File(char* data_name,
37  const char* name,
38  const char* url_prefix) : CFile()
39 {
40  get_boolean_type();
41  H5Eset_auto2(H5E_DEFAULT, NULL, NULL);
42 
43  if (name)
44  set_variable_name(name);
45 
46  CURL *curl;
47  FILE *fp=NULL;
48 
49  mldata_url = SG_CALLOC(char, strlen(url_prefix)+strlen(data_name)+1);
50  strcat(mldata_url, url_prefix);
51  strcat(mldata_url, data_name);
52 
53  fname = SG_CALLOC(char, strlen((char*)"/tmp/")+strlen(data_name)+strlen((char*)".h5")+1);
54  strcat(fname, (char*) "/tmp/");
55  strcat(fname, data_name);
56  strcat(fname, (char*) ".h5");
57 
58  curl = curl_easy_init();
59  fp = fopen(fname,"wb");
60 
61  if (!fp)
62  {
63  SG_ERROR("Could not open file '%s'\n", fname)
64  return;
65  }
66 
67  if (curl) {
68  curl_easy_setopt(curl, CURLOPT_URL, mldata_url);
69  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &write_data);
70  curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
71  curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0L);
72  curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);
73  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
74  curl_easy_perform(curl);
75  curl_easy_cleanup(curl);
76  }
77 
78  if(fp)
79  fclose(fp);
80 
81  h5file = H5Fopen(fname, H5F_ACC_RDONLY, H5P_DEFAULT);
82 
83  if (h5file<0)
84  SG_ERROR("Could not open data repository '%s'\n", data_name)
85 }
86 
87 CMLDataHDF5File::~CMLDataHDF5File()
88 {
89  H5Fclose(h5file);
90  remove(fname);
91  SG_FREE(fname);
92  SG_FREE(mldata_url);
93 }
94 
/** Generates CMLDataHDF5File::<func_name>(sg_type*& vec, int32_t& len).
 *
 * The generated method opens the dataset called variable_name in h5file,
 * checks that it has the shape n, n x 1 or 1 x n, allocates vec with
 * SG_MALLOC and reads all elements into it; len receives the element count.
 *
 * Every error path releases the HDF5 handles and the dimension array before
 * calling SG_ERROR. The explicit return statements keep the method from
 * continuing with released handles should SG_ERROR return to the caller.
 *
 * @param func_name name of the generated method
 * @param sg_type element type of the vector
 * @param datatype parenthesised constructor arguments of the TSGDataType
 *        describing sg_type
 */
#define GET_VECTOR(func_name, sg_type, datatype) \
void CMLDataHDF5File::func_name(sg_type*& vec, int32_t& len) \
{ \
	/* the constructors use -1 for "no file", which !h5file never catches */ \
	if (h5file<0) \
	{ \
		SG_ERROR("File invalid.\n") \
		return; \
	} \
	\
	int32_t* dims=NULL; \
	int32_t ndims=0; \
	int64_t nelements=0; \
	hid_t dataset=H5Dopen2(h5file, variable_name, H5P_DEFAULT); \
	if (dataset<0) \
	{ \
		SG_ERROR("Error opening data set\n") \
		return; \
	} \
	hid_t dtype=H5Dget_type(dataset); \
	H5T_class_t t_class=H5Tget_class(dtype); \
	TSGDataType t datatype; hid_t h5_type=get_compatible_type(t_class, &t); \
	if (h5_type==-1) \
	{ \
		H5Tclose(dtype); \
		H5Dclose(dataset); \
		SG_ERROR("No compatible datatype found\n") \
		return; \
	} \
	get_dims(dataset, dims, ndims, nelements); \
	const bool is_vector= \
		(ndims==2 && dims[0]==nelements && dims[1]==1) || \
		(ndims==2 && dims[0]==1 && dims[1]==nelements) || \
		(ndims==1 && dims[0]==nelements); \
	if (!is_vector) \
	{ \
		/* dims holds no element when the dataset has rank 0 */ \
		const int32_t first_dim=(ndims>0) ? dims[0] : 0; \
		SG_FREE(dims); \
		H5Tclose(dtype); \
		H5Dclose(dataset); \
		SG_ERROR("Error not a 1-dimensional vector (ndims=%d, dims[0]=%d)\n", ndims, first_dim) \
		return; \
	} \
	vec=SG_MALLOC(sg_type, nelements); \
	len=nelements; \
	herr_t status = H5Dread(dataset, h5_type, H5S_ALL, \
			H5S_ALL, H5P_DEFAULT, vec); \
	H5Dclose(dataset); \
	H5Tclose(dtype); \
	SG_FREE(dims); \
	if (status<0) \
	{ \
		SG_FREE(vec); \
		/* do not hand a freed buffer back to the caller */ \
		vec=NULL; \
		len=0; \
		SG_ERROR("Error reading dataset\n") \
	} \
}
133 
134 GET_VECTOR(get_vector, bool, (CT_VECTOR, ST_NONE, PT_BOOL))
135 GET_VECTOR(get_vector, int8_t, (CT_VECTOR, ST_NONE, PT_INT8))
136 GET_VECTOR(get_vector, uint8_t, (CT_VECTOR, ST_NONE, PT_UINT8))
137 GET_VECTOR(get_vector, char, (CT_VECTOR, ST_NONE, PT_CHAR))
138 GET_VECTOR(get_vector, int32_t, (CT_VECTOR, ST_NONE, PT_INT32))
139 GET_VECTOR(get_vector, uint32_t, (CT_VECTOR, ST_NONE, PT_UINT32))
140 GET_VECTOR(get_vector, float32_t, (CT_VECTOR, ST_NONE, PT_FLOAT32))
141 GET_VECTOR(get_vector, float64_t, (CT_VECTOR, ST_NONE, PT_FLOAT64))
142 GET_VECTOR(get_vector, floatmax_t, (CT_VECTOR, ST_NONE, PT_FLOATMAX))
143 GET_VECTOR(get_vector, int16_t, (CT_VECTOR, ST_NONE, PT_INT16))
144 GET_VECTOR(get_vector, uint16_t, (CT_VECTOR, ST_NONE, PT_INT16))
145 GET_VECTOR(get_vector, int64_t, (CT_VECTOR, ST_NONE, PT_INT64))
146 GET_VECTOR(get_vector, uint64_t, (CT_VECTOR, ST_NONE, PT_UINT64))
147 #undef GET_VECTOR
148 
/** Generates CMLDataHDF5File::<func_name>(sg_type*& matrix,
 * int32_t& num_feat, int32_t& num_vec).
 *
 * The generated method opens the dataset called variable_name in h5file,
 * requires it to have exactly two dimensions, allocates matrix with
 * SG_MALLOC and reads all elements into it. num_feat receives the first
 * and num_vec the second dimension.
 *
 * Every error path releases the HDF5 handles and the dimension array before
 * calling SG_ERROR. The explicit return statements keep the method from
 * continuing with released handles should SG_ERROR return to the caller.
 *
 * @param func_name name of the generated method
 * @param sg_type element type of the matrix
 * @param datatype parenthesised constructor arguments of the TSGDataType
 *        describing sg_type
 */
#define GET_MATRIX(func_name, sg_type, datatype) \
void CMLDataHDF5File::func_name(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec) \
{ \
	/* the constructors use -1 for "no file", which !h5file never catches */ \
	if (h5file<0) \
	{ \
		SG_ERROR("File invalid.\n") \
		return; \
	} \
	\
	int32_t* dims=NULL; \
	int32_t ndims=0; \
	int64_t nelements=0; \
	hid_t dataset = H5Dopen2(h5file, variable_name, H5P_DEFAULT); \
	if (dataset<0) \
	{ \
		SG_ERROR("Error opening data set\n") \
		return; \
	} \
	hid_t dtype = H5Dget_type(dataset); \
	H5T_class_t t_class=H5Tget_class(dtype); \
	TSGDataType t datatype; hid_t h5_type=get_compatible_type(t_class, &t); \
	if (h5_type==-1) \
	{ \
		H5Tclose(dtype); \
		H5Dclose(dataset); \
		SG_ERROR("No compatible datatype found\n") \
		return; \
	} \
	get_dims(dataset, dims, ndims, nelements); \
	if (ndims!=2) \
	{ \
		SG_FREE(dims); \
		H5Tclose(dtype); \
		H5Dclose(dataset); \
		SG_ERROR("Error not a 2-dimensional matrix\n") \
		return; \
	} \
	matrix=SG_MALLOC(sg_type, nelements); \
	num_feat=dims[0]; \
	num_vec=dims[1]; \
	herr_t status = H5Dread(dataset, h5_type, H5S_ALL, \
			H5S_ALL, H5P_DEFAULT, matrix); \
	H5Dclose(dataset); \
	H5Tclose(dtype); \
	SG_FREE(dims); \
	if (status<0) \
	{ \
		SG_FREE(matrix); \
		/* do not hand a freed buffer back to the caller */ \
		matrix=NULL; \
		num_feat=0; \
		num_vec=0; \
		SG_ERROR("Error reading dataset\n") \
	} \
}
186 
187 GET_MATRIX(get_matrix, bool, (CT_MATRIX, ST_NONE, PT_BOOL))
188 GET_MATRIX(get_matrix, char, (CT_MATRIX, ST_NONE, PT_CHAR))
189 GET_MATRIX(get_matrix, uint8_t, (CT_MATRIX, ST_NONE, PT_UINT8))
190 GET_MATRIX(get_matrix, int32_t, (CT_MATRIX, ST_NONE, PT_INT32))
191 GET_MATRIX(get_matrix, uint32_t, (CT_MATRIX, ST_NONE, PT_INT32))
192 GET_MATRIX(get_matrix, int64_t, (CT_MATRIX, ST_NONE, PT_INT64))
193 GET_MATRIX(get_matrix, uint64_t, (CT_MATRIX, ST_NONE, PT_INT64))
194 GET_MATRIX(get_matrix, int16_t, (CT_MATRIX, ST_NONE, PT_INT16))
195 GET_MATRIX(get_matrix, uint16_t, (CT_MATRIX, ST_NONE, PT_INT16))
196 GET_MATRIX(get_matrix, float32_t, (CT_MATRIX, ST_NONE, PT_FLOAT32))
197 GET_MATRIX(get_matrix, float64_t, (CT_MATRIX, ST_NONE, PT_FLOAT64))
198 GET_MATRIX(get_matrix, floatmax_t, (CT_MATRIX, ST_NONE, PT_FLOATMAX))
199 #undef GET_MATRIX
200 
201 #define GET_SPARSEMATRIX(fname, sg_type, datatype) \
202 void CMLDataHDF5File::fname(SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \
203 { \
204  if (!(file)) \
205  SG_ERROR("File invalid.\n") \
206 }
207 GET_SPARSEMATRIX(get_sparse_matrix, bool, DT_SPARSE_BOOL)
208 GET_SPARSEMATRIX(get_sparse_matrix, char, DT_SPARSE_CHAR)
209 GET_SPARSEMATRIX(get_sparse_matrix, int8_t, DT_SPARSE_INT8)
210 GET_SPARSEMATRIX(get_sparse_matrix, uint8_t, DT_SPARSE_BYTE)
211 GET_SPARSEMATRIX(get_sparse_matrix, int32_t, DT_SPARSE_INT)
212 GET_SPARSEMATRIX(get_sparse_matrix, uint32_t, DT_SPARSE_UINT)
213 GET_SPARSEMATRIX(get_sparse_matrix, int64_t, DT_SPARSE_LONG)
214 GET_SPARSEMATRIX(get_sparse_matrix, uint64_t, DT_SPARSE_ULONG)
215 GET_SPARSEMATRIX(get_sparse_matrix, int16_t, DT_SPARSE_SHORT)
216 GET_SPARSEMATRIX(get_sparse_matrix, uint16_t, DT_SPARSE_WORD)
217 GET_SPARSEMATRIX(get_sparse_matrix, float32_t, DT_SPARSE_SHORTREAL)
218 GET_SPARSEMATRIX(get_sparse_matrix, float64_t, DT_SPARSE_REAL)
219 GET_SPARSEMATRIX(get_sparse_matrix, floatmax_t, DT_SPARSE_LONGREAL)
220 #undef GET_SPARSEMATRIX
221 
222 
223 #define GET_STRING_LIST(fname, sg_type, datatype) \
224 void CMLDataHDF5File::fname(SGString<sg_type>*& strings, int32_t& num_str, int32_t& max_string_len) \
225 { \
226 }
227 
228 GET_STRING_LIST(get_string_list, bool, DT_STRING_BOOL)
229 GET_STRING_LIST(get_string_list, char, DT_STRING_CHAR)
230 GET_STRING_LIST(get_string_list, int8_t, DT_STRING_INT8)
231 GET_STRING_LIST(get_string_list, uint8_t, DT_STRING_BYTE)
232 GET_STRING_LIST(get_string_list, int32_t, DT_STRING_INT)
233 GET_STRING_LIST(get_string_list, uint32_t, DT_STRING_UINT)
234 GET_STRING_LIST(get_string_list, int64_t, DT_STRING_LONG)
235 GET_STRING_LIST(get_string_list, uint64_t, DT_STRING_ULONG)
236 GET_STRING_LIST(get_string_list, int16_t, DT_STRING_SHORT)
237 GET_STRING_LIST(get_string_list, uint16_t, DT_STRING_WORD)
238 GET_STRING_LIST(get_string_list, float32_t, DT_STRING_SHORTREAL)
239 GET_STRING_LIST(get_string_list, float64_t, DT_STRING_REAL)
240 GET_STRING_LIST(get_string_list, floatmax_t, DT_STRING_LONGREAL)
241 #undef GET_STRING_LIST
242 
243 void CMLDataHDF5File::get_boolean_type()
244 {
245  boolean_type=H5T_NATIVE_UCHAR;
246  switch (sizeof(bool))
247  {
248  case 1:
249  boolean_type = H5T_NATIVE_UCHAR;
250  break;
251  case 2:
252  boolean_type = H5T_NATIVE_UINT16;
253  break;
254  case 4:
255  boolean_type = H5T_NATIVE_UINT32;
256  break;
257  case 8:
258  boolean_type = H5T_NATIVE_UINT64;
259  break;
260  default:
261  SG_ERROR("Boolean type not supported on this platform\n")
262  }
263 }
264 
265 hid_t CMLDataHDF5File::get_compatible_type(H5T_class_t t_class,
266  const TSGDataType* datatype)
267 {
268  switch (t_class)
269  {
270  case H5T_FLOAT:
271  case H5T_INTEGER:
272  switch (datatype->m_ptype)
273  {
274  case PT_BOOL: return boolean_type;
275  case PT_CHAR: return H5T_NATIVE_CHAR;
276  case PT_INT8: return H5T_NATIVE_INT8;
277  case PT_UINT8: return H5T_NATIVE_UINT8;
278  case PT_INT16: return H5T_NATIVE_INT16;
279  case PT_UINT16: return H5T_NATIVE_UINT16;
280  case PT_INT32: return H5T_NATIVE_INT32;
281  case PT_UINT32: return H5T_NATIVE_UINT32;
282  case PT_INT64: return H5T_NATIVE_INT64;
283  case PT_UINT64: return H5T_NATIVE_UINT64;
284  case PT_FLOAT32: return H5T_NATIVE_FLOAT;
285  case PT_FLOAT64: return H5T_NATIVE_DOUBLE;
286  case PT_FLOATMAX: return H5T_NATIVE_LDOUBLE;
287  case PT_COMPLEX128:
288  SG_ERROR("complex128_t not compatible with HDF5File!");
289  return -1;
290  case PT_UNDEFINED:
291  case PT_SGOBJECT:
292  SG_ERROR("Implementation error during writing "
293  "HDF5File!");
294  return -1;
295  }
296  case H5T_STRING:
297  SG_ERROR("Strings not supported")
298  return -1;
299  case H5T_VLEN:
300  SG_ERROR("Variable length containers currently not supported")
301  return -1;
302  case H5T_ARRAY:
303  SG_ERROR("Array containers currently not supported")
304  return -1;
305  default:
306  SG_ERROR("Datatype mismatchn")
307  return -1;
308  }
309 }
310 
311 void CMLDataHDF5File::get_dims(hid_t dataset, int32_t*& dims, int32_t& ndims, int64_t& total_elements)
312 {
313  hid_t dataspace = H5Dget_space(dataset);
314  if (dataspace<0)
315  SG_ERROR("Error obtaining hdf5 dataspace\n")
316 
317  ndims = H5Sget_simple_extent_ndims(dataspace);
318  total_elements=H5Sget_simple_extent_npoints(dataspace);
319  hsize_t* dims_out=SG_MALLOC(hsize_t, ndims);
320  dims=SG_MALLOC(int32_t, ndims);
321  H5Sget_simple_extent_dims(dataspace, dims_out, NULL);
322  for (int32_t i=0; i<ndims; i++)
323  dims[i]=dims_out[i];
324  SG_FREE(dims_out);
325  H5Sclose(dataspace);
326 }
327 
328 void CMLDataHDF5File::create_group_hierarchy()
329 {
330  char* vname=get_strdup(variable_name);
331  int32_t vlen=strlen(vname);
332  for (int32_t i=0; i<vlen; i++)
333  {
334  if (i!=0 && vname[i]=='/')
335  {
336  vname[i]='\0';
337  hid_t g = H5Gopen2(h5file, vname, H5P_DEFAULT);
338  if (g<0)
339  {
340  g=H5Gcreate2(h5file, vname, H5P_DEFAULT, H5P_DEFAULT,
341  H5P_DEFAULT);
342  if (g<0)
343  SG_ERROR("Error creating group '%s'\n", vname)
344  vname[i]='/';
345  }
346  H5Gclose(g);
347  }
348  }
349  SG_FREE(vname);
350 }
351 #endif // HAVE_CURL && HAVE_HDF5

SHOGUN Machine Learning Toolbox - Documentation