SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StreamingMMD.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) The Shogun Machine Learning Toolbox
3  * Written (w) 2012-2013 Heiko Strathmann
4  * Written (w) 2014 Soumyajit De
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice, this
11  * list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright notice,
13  * this list of conditions and the following disclaimer in the documentation
14  * and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  * The views and conclusions contained in the software and documentation are those
28  * of the authors and should not be interpreted as representing official policies,
29  * either expressed or implied, of the Shogun Development Team.
30  */
31 
36 #include <shogun/lib/List.h>
37 
38 using namespace shogun;
39 
41 {
    /* All member setup is delegated to init().
     * NOTE(review): the signature line that precedes this brace is missing
     * from this listing; the body looks like a default constructor, but
     * confirm against the original file. */
42  init();
43 }
44 
46  CStreamingFeatures* q, index_t m, index_t blocksize) :
47  CKernelTwoSampleTest(kernel, NULL, m)
48 {
    /* Constructor tail: forwards kernel and m to the CKernelTwoSampleTest
     * base (with NULL as the second base argument), then stores the two
     * streaming feature objects and the block size.
     * NOTE(review): the first line of the signature (which declares kernel
     * and p) is missing from this listing. */

    /* init() registers the parameters and assigns defaults; the assignments
     * below then overwrite those defaults with the caller's values */
49  init();
50 
51  m_streaming_p=p;
    /* NOTE(review): listing line 52 is missing here - check the original for
     * a statement that follows the assignment of p */
53 
54  m_streaming_q=q;
    /* NOTE(review): listing line 55 is missing here - check the original for
     * a statement that follows the assignment of q */
56 
57  m_blocksize=blocksize;
58 }
59 
61 {
    /* NOTE(review): the signature line and listing lines 62-63 are missing
     * here; judging by the comment below this is the destructor, and the
     * missing lines presumably release the streaming features - confirm
     * against the original file. */
64 
65  /* m_kernel is SG_UNREFed in the base class destructor */
66 }
67 
/* Registers the four members m_streaming_p, m_streaming_q, m_blocksize and
 * m_simulate_h0 via SG_ADD under the names "streaming_p", "streaming_q",
 * "blocksize" and "simulate_h0", then assigns their defaults
 * (NULL, NULL, 10000 and false).
 *
 * NOTE(review): every SG_ADD call continues on a line that is missing from
 * this listing (lines 71, 73, 75, 77), so the trailing macro arguments are
 * not visible here.
 * NOTE(review): the description registered for "streaming_q" reads
 * "Streaming features p" - this looks like a copy-paste slip for
 * "Streaming features q"; confirm and correct in the original file. */
68 void CStreamingMMD::init()
69 {
70  SG_ADD((CSGObject**)&m_streaming_p, "streaming_p", "Streaming features p",
72  SG_ADD((CSGObject**)&m_streaming_q, "streaming_q", "Streaming features p",
74  SG_ADD(&m_blocksize, "blocksize", "Number of elements processed at once",
76  SG_ADD(&m_simulate_h0, "simulate_h0", "Whether p and q are mixed",
78 
    /* defaults: no streaming features attached, 10000 elements per block,
     * no simulation of the null hypothesis */
79  m_streaming_p=NULL;
80  m_streaming_q=NULL;
81  m_blocksize=10000;
82  m_simulate_h0=false;
83 }
84 
86 {
    /* Returns the first entry of the statistic vector filled in by
     * compute_statistic_and_variance(), called with its third argument set
     * to false. The variance vector is filled by the same call but is not
     * used here.
     * NOTE(review): the signature line is missing from this listing. */
87  /* use wrapper method and compute for single kernel */
88  SGVector<float64_t> statistic;
89  SGVector<float64_t> variance;
90  compute_statistic_and_variance(statistic, variance, false);
91 
92  return statistic[0];
93 }
94 
96 {
    /* Vector-valued variant: returns the whole statistic vector filled in by
     * compute_statistic_and_variance(), passing multiple_kernels through.
     * The variance vector filled by the same call is discarded.
     * NOTE(review): the signature line is missing from this listing;
     * multiple_kernels is presumably its bool parameter. */
97  /* make sure multiple_kernels flag is used only with a combined kernel */
98  REQUIRE(!multiple_kernels || m_kernel->get_kernel_type()==K_COMBINED,
99  "multiple kernels specified, but underlying kernel is not of type "
100  "K_COMBINED\n");
101 
102  SGVector<float64_t> statistic;
103  SGVector<float64_t> variance;
104  compute_statistic_and_variance(statistic, variance, multiple_kernels);
105 
106  return statistic;
107 }
108 
110 {
     /* Returns the first entry of the variance vector filled in by
      * compute_statistic_and_variance(), called with its third argument set
      * to false. The statistic vector is filled by the same call but is not
      * used here.
      * NOTE(review): the signature line is missing from this listing. */
111  /* use wrapper method and compute for single kernel */
112  SGVector<float64_t> statistic;
113  SGVector<float64_t> variance;
114  compute_statistic_and_variance(statistic, variance, false);
115 
116  return variance[0];
117 }
118 
120 {
     /* Computes a p-value for the given statistic. In the MMD1_GAUSSIAN case
      * the result is 1.0 minus CStatistics::normal_cdf(statistic, std_dev);
      * every other case is forwarded to
      * CKernelTwoSampleTest::compute_p_value().
      * NOTE(review): the signature line, the switch header (listing line
      * 123) and the definition of std_dev (listing line 128) are missing
      * from this listing. */
121  float64_t result=0;
122 
124  {
125  case MMD1_GAUSSIAN:
126  {
127  /* compute variance and use to estimate Gaussian distribution */
129  result=1.0-CStatistics::normal_cdf(statistic, std_dev);
130  }
131  break;
132 
133  default:
134  /* sampling null is handled here */
135  result=CKernelTwoSampleTest::compute_p_value(statistic);
136  break;
137  }
138 
139  return result;
140 }
141 
143 {
     /* Computes a test threshold from alpha. In the MMD1_GAUSSIAN case the
      * result is 1.0 minus CStatistics::inverse_normal_cdf(1-alpha, 0,
      * std_dev).
      * NOTE(review): the signature line, the switch header (listing line
      * 146), the definition of std_dev (listing line 151) and the statement
      * of the default branch (listing line 158) are missing from this
      * listing.
      * NOTE(review): inverse_normal_cdf(1-alpha, 0, std_dev) is presumably
      * already the (1-alpha) quantile of a zero-mean Gaussian; subtracting
      * it from 1.0 looks carried over from the p-value code - confirm that
      * this is the intended threshold. */
144  float64_t result=0;
145 
147  {
148  case MMD1_GAUSSIAN:
149  {
150  /* compute variance and use to estimate Gaussian distribution */
152  result=1.0-CStatistics::inverse_normal_cdf(1-alpha, 0, std_dev);
153  }
154  break;
155 
156  default:
157  /* sampling null is handled here */
159  break;
160  }
161 
162  return result;
163 }
164 
166 {
     /* Computes statistic and p-value in one go. In the MMD1_GAUSSIAN case
      * statistic and variance come from a single call to
      * compute_statistic_and_variance(), and the result is 1.0 minus
      * CStatistics::normal_cdf(statistic[0], sqrt(variance[0])).
      * NOTE(review): the signature line, the switch header (listing line
      * 169) and the statement of the default branch (listing line 187) are
      * missing from this listing. */
167  float64_t result=0;
168 
170  {
171  case MMD1_GAUSSIAN:
172  {
173  /* compute variance and use to estimate Gaussian distribution, use
174  * wrapper method and compute for single kernel */
175  SGVector<float64_t> statistic;
176  SGVector<float64_t> variance;
177  compute_statistic_and_variance(statistic, variance, false);
178 
179  /* estimate Gaussian distribution */
180  result=1.0-CStatistics::normal_cdf(statistic[0],
181  CMath::sqrt(variance[0]));
182  }
183  break;
184 
185  default:
186  /* sampling null can be done separately in superclass */
188  break;
189  }
190 
191  return result;
192 }
193 
195 {
     /* Fills samples with m_num_null_samples values of compute_statistic(),
      * each computed while m_simulate_h0 is forced to true, and returns it.
      * The previous value of m_simulate_h0 is restored afterwards.
      * NOTE(review): the signature line and the declarations of samples, p
      * and q (listing lines 196, 199, 200) are missing from this listing;
      * p and q presumably hold the current m_streaming_p / m_streaming_q. */
197 
198  /* instead of permuting samples, just sample new data every time. */
     /* hold a reference on p and q while the statistics are computed */
201  SG_REF(p);
202  SG_REF(q);
203 
     /* remember the caller's setting so it can be restored after the loop */
204  bool old=m_simulate_h0;
205  set_simulate_h0(true);
206  for (index_t i=0; i<m_num_null_samples; ++i)
207  {
208  /* compute the statistic on freshly streamed, mixed samples */
209  samples[i]=compute_statistic();
210  }
211  set_simulate_h0(old);
     /* put the saved feature pointers back and drop the references taken
      * above */
212  m_streaming_p=p;
213  m_streaming_q=q;
214  SG_UNREF(p);
215  SG_UNREF(q);
216 
217  return samples;
218 }
219 
221  index_t num_this_run)
222 {
     /* Builds and returns a CList holding 2*num_blocks feature blocks: first
      * num_blocks blocks from p, then num_blocks blocks from q. If
      * m_simulate_h0 is set, all blocks are merged, randomly permuted and
      * split again into 2*num_blocks blocks of num_this_run vectors each.
      * NOTE(review): the first line of the signature (which declares
      * num_blocks) is missing from this listing. */
223  SG_DEBUG("entering!\n");
224 
225  /* the list of blocks of data to be returned, turning delete_data flag
226  * on which SG_REFs the elements when appended or returned. */
227  CList* data=new CList(true);
228 
229  SG_DEBUG("streaming %d blocks from p of blocksize %d!\n", num_blocks,
230  num_this_run);
231 
232  /* stream num_blocks blocks of data from p */
233  for (index_t i=0; i<num_blocks; ++i)
234  {
     /* NOTE(review): listing line 235, which defines block, is missing
      * here */
236  data->append_element(block);
237  }
238 
239  SG_DEBUG("streaming %d blocks from q of blocksize %d!\n", num_blocks,
240  num_this_run);
241 
242  /* stream num_blocks blocks of data from q */
243  for (index_t i=0; i<num_blocks; ++i)
244  {
     /* NOTE(review): listing line 245, which defines block, is missing
      * here */
246  data->append_element(block);
247  }
248 
249  /* check whether h0 should be simulated and permute if so */
250  if (m_simulate_h0)
251  {
252  /* create merged copy of all feature instances to permute */
253  SG_DEBUG("merging and premuting features!\n");
254 
255  /* use the first element to merge rest of the data into */
256  CFeatures* first=(CFeatures*)data->get_first_element();
257 
258  /* this delete_element() call doesn't deallocate the first element; it
259  * only removes it from the list and does a SG_UNREF. It is not deleted
260  * because get_first_element() does a SG_REF before returning, so we have
261  * to take care of its destruction manually via SG_UNREF further below */
262  data->delete_element();
263 
264  CFeatures* merged=first->create_merged_copy(data);
265 
266  /* now we can get rid of unnecessary feature objects */
267  SG_UNREF(first);
268  data->delete_all_elements();
269 
270  /* permute */
271  SGVector<index_t> inds(merged->get_num_vectors());
272  inds.range_fill();
273  inds.permute();
274  merged->add_subset(inds);
275 
276  /* copy back */
     /* copy holds the indices of the block to extract next; it starts as a
      * range of num_this_run indices */
277  SGVector<index_t> copy(num_this_run);
278  copy.range_fill();
279  for (index_t i=0; i<2*num_blocks; ++i)
280  {
281  CFeatures* current=merged->copy_subset(copy);
282  data->append_element(current);
283  /* SG_UNREF'ing since copy_subset does a SG_REF, this is
284  * safe since the object is already SG_REF'ed inside the list */
285  SG_UNREF(current);
286 
     /* move the index window on to the next block (presumably add() shifts
      * every index by num_this_run); skipped after the last block */
287  if (i<2*num_blocks-1)
288  copy.add(num_this_run);
289  }
290 
291  /* clean up */
292  SG_UNREF(merged);
293  }
294 
     /* SG_REF the list itself before it is returned to the caller */
295  SG_REF(data);
296 
297  SG_DEBUG("leaving!\n");
298  return data;
299 }
300 
302 {
     /* Deliberately unsupported: always raises SG_ERROR with the message
      * below.
      * NOTE(review): the signature line is missing from this listing. */
303  SG_ERROR("Method not implemented since linear time mmd is based on "
304  "streaming features\n");
305 }
306 
308 {
     /* Deliberately unsupported: always raises SG_ERROR with the message
      * below; the NULL return follows the error call.
      * NOTE(review): the signature line is missing from this listing. */
309  SG_ERROR("Method not implemented since linear time mmd is based on "
310  "streaming features\n");
311  return NULL;
312 }
313 
315 {
     /* Returns the stored m_streaming_p pointer.
      * NOTE(review): the signature line and listing line 316 are missing
      * from this listing; check the original for a statement that precedes
      * the return (e.g. reference counting). */
317  return m_streaming_p;
318 }
319 
321 {
     /* Returns the stored m_streaming_q pointer.
      * NOTE(review): the signature line and listing line 322 are missing
      * from this listing; check the original for a statement that precedes
      * the return (e.g. reference counting). */
323  return m_streaming_q;
324 }
325 

SHOGUN Machine Learning Toolbox - Documentation