SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
MMDKernelSelectionMedian.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Heiko Strathmann
8  */
9 
19 
20 
21 using namespace shogun;
22 
25 {
26  init();
27 }
28 
30  CKernelTwoSampleTest* mmd, index_t num_data_distance) :
32 {
33  /* assert that a combined kernel is used */
34  CKernel* kernel=mmd->get_kernel();
35  CFeatures* lhs=kernel->get_lhs();
36  CFeatures* rhs=kernel->get_rhs();
37  REQUIRE(kernel, "%s::%s(): No kernel set!\n", get_name(), get_name());
38  REQUIRE(kernel->get_kernel_type()==K_COMBINED, "%s::%s(): Requires "
39  "CombinedKernel as kernel. Yours is %s", get_name(), get_name(),
40  kernel->get_name());
41 
42  /* assert that all subkernels are Gaussian kernels */
43  CCombinedKernel* combined=(CCombinedKernel*)kernel;
44 
45  for (index_t k_idx=0; k_idx<combined->get_num_kernels(); k_idx++)
46  {
47  CKernel* subkernel=combined->get_kernel(k_idx);
48  REQUIRE(kernel, "%s::%s(): Subkernel (%d) of current kernel is not"
49  " of type GaussianKernel\n", get_name(), get_name(), k_idx);
50  SG_UNREF(subkernel);
51  }
52 
53  /* assert 64 bit dense features since EuclideanDistance can only handle
54  * those */
56  {
57  CFeatures* features=((CQuadraticTimeMMD*)m_estimator)->get_p_and_q();
58  REQUIRE(features->get_feature_class()==C_DENSE &&
59  features->get_feature_type()==F_DREAL, "%s::select_kernel(): "
60  "Only 64 bit float dense features allowed, these are \"%s\""
61  " and of type %d\n",
62  get_name(), features->get_name(), features->get_feature_type());
63  SG_UNREF(features);
64  }
66  {
67  CStreamingFeatures* p=((CLinearTimeMMD*)m_estimator)->get_streaming_p();
68  CStreamingFeatures* q=((CLinearTimeMMD*)m_estimator)->get_streaming_q();
70  p->get_feature_type()==F_DREAL, "%s::select_kernel(): "
71  "Only 64 bit float streaming dense features allowed, these (p) "
72  "are \"%s\" and of type %d\n",
73  get_name(), p->get_name(), p->get_feature_type());
74 
76  p->get_feature_type()==F_DREAL, "%s::select_kernel(): "
77  "Only 64 bit float streaming dense features allowed, these (q) "
78  "are \"%s\" and of type %d\n",
79  get_name(), q->get_name(), q->get_feature_type());
80  SG_UNREF(p);
81  SG_UNREF(q);
82  }
83 
84  SG_UNREF(kernel);
85  SG_UNREF(lhs);
86  SG_UNREF(rhs);
87 
88  init();
89 
90  m_num_data_distance=num_data_distance;
91 }
92 
94 {
95 }
96 
97 void CMMDKernelSelectionMedian::init()
98 {
99  SG_ADD(&m_num_data_distance, "m_num_data_distance", "Number of elements to "
100  "to compute median distance on", MS_NOT_AVAILABLE);
101 
102  /* this is a sensible value */
103  m_num_data_distance=1000;
104 }
105 
107 {
108  SG_ERROR("%s::compute_measures(): Not implemented. Use select_kernel() "
109  "method!\n", get_name());
110  return SGVector<float64_t>();
111 }
112 
114 {
115  /* number of data for distace */
117 
118  SGMatrix<float64_t> dists;
119 
120  /* compute all pairwise distances, depends which mmd statistic is used */
122  {
123  /* fixed data, create merged copy of a random subset */
124 
125  /* create vector with that correspond to the num_data first points of
126  * each distribution, remember data is stored jointly */
127  SGVector<index_t> subset(num_data*2);
129  for (index_t i=0; i<num_data; ++i)
130  {
131  /* num_data samples from each half of joint sample */
132  subset[i]=i;
133  subset[i+num_data]=i+m;
134  }
135 
136  /* add subset and compute pairwise distances */
138  CFeatures* features=quad_mmd->get_p_and_q();
139  features->add_subset(subset);
140 
141  /* cast is safe, see constructor */
142  CDenseFeatures<float64_t>* dense_features=
143  (CDenseFeatures<float64_t>*) features;
144 
145  CEuclideanDistance* distance=new CEuclideanDistance(dense_features,
146  dense_features);
147  dists=distance->get_distance_matrix();
148  features->remove_subset();
149  SG_UNREF(distance);
150  SG_UNREF(features);
151  }
153  {
154  /* just stream the desired number of points */
156 
157  CStreamingFeatures* p=linear_mmd->get_streaming_p();
158  CStreamingFeatures* q=linear_mmd->get_streaming_q();
159 
160  /* cast is safe, see constructor */
162  p->get_streamed_features(num_data);
164  q->get_streamed_features(num_data);
165 
166  /* for safety */
167  SG_REF(p_streamed);
168  SG_REF(q_streamed);
169 
170  /* create merged feature object */
172  p_streamed->create_merged_copy(q_streamed);
173 
174  /* compute pairwise distances */
175  CEuclideanDistance* distance=new CEuclideanDistance(merged, merged);
176  dists=distance->get_distance_matrix();
177 
178  /* clean up */
179  SG_UNREF(distance);
180  SG_UNREF(p_streamed);
181  SG_UNREF(q_streamed);
182  SG_UNREF(p);
183  SG_UNREF(q);
184  }
185 
186  /* create a vector where the zeros have been removed, use upper triangle
187  * only since distances are symmetric */
188  SGVector<float64_t> dist_vec(dists.num_rows*(dists.num_rows-1)/2);
189  index_t write_idx=0;
190  for (index_t i=0; i<dists.num_rows; ++i)
191  {
192  for (index_t j=i+1; j<dists.num_rows; ++j)
193  dist_vec[write_idx++]=dists(i,j);
194  }
195 
196  /* now we have distance matrix, compute median, allow to modify matrix */
197  CMath::qsort<float64_t>(dist_vec.vector, dist_vec.vlen);
198  float64_t median_distance=dist_vec[dist_vec.vlen/2];
199  SG_DEBUG("median_distance: %f\n", median_distance);
200 
201  /* shogun has no square and factor two in its kernel width, MATLAB does
202  * median_width = sqrt(0.5*median_distance), we do this */
203  float64_t shogun_sigma=median_distance;
204  SG_DEBUG("kernel width (shogun): %f\n", shogun_sigma);
205 
206  /* now of all kernels, find the one which has its width closest
207  * Cast is safe due to constructor of MMDKernelSelection class */
209  float64_t min_distance=CMath::MAX_REAL_NUMBER;
210  CKernel* min_kernel=NULL;
212  for (index_t i=0; i<combined->get_num_subkernels(); ++i)
213  {
214  CKernel* current=combined->get_kernel(i);
215  REQUIRE(current->get_kernel_type()==K_GAUSSIAN, "%s::select_kernel(): "
216  "%d-th kernel is not a Gaussian but \"%s\"!\n", get_name(), i,
217  current->get_name());
218 
219  /* check if width is closer to median width */
220  distance=CMath::abs(((CGaussianKernel*)current)->get_width()-
221  shogun_sigma);
222 
223  if (distance<min_distance)
224  {
225  min_distance=distance;
226  min_kernel=current;
227  }
228 
229  /* next kernel */
230  SG_UNREF(current);
231  }
232  SG_UNREF(combined);
233 
234  /* returned referenced kernel */
235  SG_REF(min_kernel);
236  return min_kernel;
237 }
virtual const char * get_name() const =0
float distance(CJLCoverTreePoint p1, CJLCoverTreePoint p2, float64_t upper_bound)
virtual CStreamingFeatures * get_streaming_q()
virtual CStreamingFeatures * get_streaming_p()
int32_t index_t
Definition: common.h:62
virtual CFeatures * get_streamed_features(index_t num_elements)
CFeatures * get_rhs()
Definition: Kernel.h:511
#define SG_ERROR(...)
Definition: SGIO.h:129
#define REQUIRE(x,...)
Definition: SGIO.h:206
Base class for kernel selection for MMD-based two-sample test statistic implementations. Provides abstract methods for selecting kernels and computing criteria or kernel weights for the implemented method. In order to implement new methods for kernel selection, simply write a new implementation of this class.
Kernel two sample test base class. Provides an interface for performing a two-sample test using a ker...
#define SG_REF(x)
Definition: SGObject.h:51
virtual SGVector< float64_t > compute_measures()
This class implements the quadratic time Maximum Mean Statistic as described in [1]. The MMD is the distance of two probability distributions and in a RKHS which we denote by .
index_t vlen
Definition: SGVector.h:492
virtual const char * get_name() const
CKernel * get_kernel(int32_t idx)
double float64_t
Definition: common.h:50
virtual EFeatureClass get_feature_class() const =0
The Combined kernel is used to combine a number of kernels into a single CombinedKernel object by lin...
CFeatures * create_merged_copy(CList *other)
The well known Gaussian kernel (swiss army knife for SVMs) computed on CDotFeatures.
CKernelTwoSampleTest * m_estimator
#define SG_UNREF(x)
Definition: SGObject.h:52
#define SG_DEBUG(...)
Definition: SGIO.h:107
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
virtual void remove_subset()
Definition: Features.cpp:322
virtual EKernelType get_kernel_type()=0
The class Features is the base class of all feature objects.
Definition: Features.h:68
static T min(T a, T b)
Definition: Math.h:157
Streaming features are features which are used for online algorithms.
This class implements the linear time Maximum Mean Statistic as described in [1] for streaming data (...
Definition: LinearTimeMMD.h:66
virtual EStatisticType get_statistic_type() const =0
The Kernel base class.
Definition: Kernel.h:159
SGMatrix< float64_t > get_distance_matrix()
Definition: Distance.h:156
#define SG_ADD(...)
Definition: SGObject.h:81
virtual CFeatures * get_p_and_q()
class EuclideanDistance
static const float64_t MAX_REAL_NUMBER
Definition: Math.h:2061
virtual void add_subset(SGVector< index_t > subset)
Definition: Features.cpp:310
virtual EFeatureType get_feature_type() const =0
static T abs(T a)
Definition: Math.h:179
CFeatures * get_lhs()
Definition: Kernel.h:505

SHOGUN Machine Learning Toolbox - Documentation