@@ -36,8 +36,6 @@ namespace {
36
36
// Persistence tags used when saving and restoring state in this file. Each tag
// is brace-initialised from two string literals (presumably a short and a long
// name; confirm against core::TPersistenceTag). The lines marked '-' show that
// this change removes the tag for the min-max log-likelihood threshold, which
// leaves the "d" slot unused.
const core::TPersistenceTag PRIOR_TAG{" a" , " prior" };
37
37
const core::TPersistenceTag CLASS_LABEL_TAG{" b" , " class_label" };
38
38
const core::TPersistenceTag CLASS_MODEL_TAG{" c" , " class_model" };
39
- const core::TPersistenceTag MIN_MAX_LOG_LIKELIHOOD_TO_USE_FEATURE_TAG{
40
- " d" , " min_max_likelihood_to_use_feature" };
41
39
// Written by CClass::acceptPersistInserter for the class count.
const core::TPersistenceTag COUNT_TAG{" e" , " count" };
42
40
// Names the sub-level written for each CNaiveBayesFeatureDensityFromPrior density.
const core::TPersistenceTag CONDITIONAL_DENSITY_FROM_PRIOR_TAG{" f" , " conditional_density_from_prior" };
43
41
}
@@ -135,24 +133,26 @@ std::string CNaiveBayesFeatureDensityFromPrior::print() const {
135
133
return result;
136
134
}
137
135
138
// Construct a classifier with no training data.
//
// Stores decayRate, keeps its own clone of exemplar and constructs
// m_ClassConditionalDensities with the argument 2 (presumably an initial
// bucket/capacity hint; confirm against the member's container type).
//
// Lines marked '-' are the previous signature, which also took and stored an
// optional minMaxLogLikelihoodToUseFeature; lines marked '+' drop that
// parameter and member.
- CNaiveBayes::CNaiveBayes (const CNaiveBayesFeatureDensity& exemplar,
139
- double decayRate,
140
- TOptionalDouble minMaxLogLikelihoodToUseFeature)
141
- : m_MinMaxLogLikelihoodToUseFeature{minMaxLogLikelihoodToUseFeature},
142
- m_DecayRate{decayRate}, m_Exemplar{exemplar.clone ()}, m_ClassConditionalDensities{2 } {
136
+ CNaiveBayes::CNaiveBayes (const CNaiveBayesFeatureDensity& exemplar, double decayRate)
137
+ : m_DecayRate{decayRate}, m_Exemplar{exemplar.clone ()}, m_ClassConditionalDensities{2 } {
143
138
}
144
139
145
140
// Construct a classifier by restoring persisted state.
//
// The decay rate is taken from params.s_DecayRate and exemplar is cloned, as
// in the training constructor. The '+' lines then restore via
// acceptRestoreTraverser only when the traverser actually has a sub-level,
// and mark the traverser bad if that restore returns false. The '-' lines are
// the previous unconditional traverseSubLevel call whose result was ignored.
CNaiveBayes::CNaiveBayes (const CNaiveBayesFeatureDensity& exemplar,
146
141
const SDistributionRestoreParams& params,
147
142
core::CStateRestoreTraverser& traverser)
148
143
: m_DecayRate{params.s_DecayRate }, m_Exemplar{exemplar.clone ()}, m_ClassConditionalDensities{2 } {
149
- traverser.traverseSubLevel (std::bind (&CNaiveBayes::acceptRestoreTraverser, this ,
150
- std::cref (params), std::placeholders::_1));
144
+ // If we persist before we create class conditional distributions we will
145
+ // not have anything to restore and hasSubLevel will be false. Trying to
146
+ // restore sets the traverser state to bad so we need to handle explicitly.
147
+ if (traverser.hasSubLevel () && traverser.traverseSubLevel ([&](auto & traverser_) {
148
+ return this ->acceptRestoreTraverser (params, traverser_);
149
+ }) == false ) {
150
+ traverser.setBadState ();
151
+ }
151
152
}
152
153
153
154
CNaiveBayes::CNaiveBayes (const CNaiveBayes& other)
154
- : m_MinMaxLogLikelihoodToUseFeature{other.m_MinMaxLogLikelihoodToUseFeature },
155
- m_DecayRate{other.m_DecayRate }, m_Exemplar{other.m_Exemplar ->clone ()} {
155
+ : m_DecayRate{other.m_DecayRate }, m_Exemplar{other.m_Exemplar ->clone ()} {
156
156
for (const auto & class_ : other.m_ClassConditionalDensities ) {
157
157
m_ClassConditionalDensities.emplace (class_.first , class_.second );
158
158
}
@@ -170,9 +170,6 @@ bool CNaiveBayes::acceptRestoreTraverser(const SDistributionRestoreParams& param
170
170
std::ref (class_), std::cref (params),
171
171
std::placeholders::_1)),
172
172
m_ClassConditionalDensities.emplace (label, std::move (class_)))
173
- RESTORE_SETUP_TEARDOWN (MIN_MAX_LOG_LIKELIHOOD_TO_USE_FEATURE_TAG, double value,
174
- core::CStringUtils::stringToType (traverser.value (), value),
175
- m_MinMaxLogLikelihoodToUseFeature.reset (value))
176
173
} while (traverser.next ());
177
174
return true ;
178
175
}
@@ -195,12 +192,6 @@ void CNaiveBayes::acceptPersistInserter(core::CStatePersistInserter& inserter) c
195
192
std::ref (class_->second ),
196
193
std::placeholders::_1));
197
194
}
198
-
199
- if (m_MinMaxLogLikelihoodToUseFeature) {
200
- inserter.insertValue (MIN_MAX_LOG_LIKELIHOOD_TO_USE_FEATURE_TAG,
201
- *m_MinMaxLogLikelihoodToUseFeature,
202
- core::CIEEE754::E_SinglePrecision);
203
- }
204
195
}
205
196
206
197
CNaiveBayes& CNaiveBayes::operator =(const CNaiveBayes& other) {
@@ -215,26 +206,29 @@ void CNaiveBayes::swap(CNaiveBayes& other) {
215
206
std::swap (m_DecayRate, other.m_DecayRate );
216
207
m_Exemplar.swap (other.m_Exemplar );
217
208
m_ClassConditionalDensities.swap (other.m_ClassConditionalDensities );
218
- std::swap (m_MinMaxLogLikelihoodToUseFeature, other.m_MinMaxLogLikelihoodToUseFeature );
219
209
}
220
210
221
211
// Report whether the classifier is ready for use: true only if at least one
// class exists and every class reports initialized().
//
// The '-'/'+' pair swaps size() > 0 for empty() == false; the condition is the
// same.
//
// NOTE(review): the lambda takes const std::pair<std::size_t, CClass>&. If the
// container's value_type is std::pair<const std::size_t, CClass>, binding to
// this parameter would create a temporary copy of each entry; const auto&
// would avoid that. Confirm against the member's declared type.
bool CNaiveBayes::initialized () const {
222
- return m_ClassConditionalDensities.size () > 0 &&
212
+ return m_ClassConditionalDensities.empty () == false &&
223
213
std::all_of (m_ClassConditionalDensities.begin (),
224
214
m_ClassConditionalDensities.end (),
225
215
[](const std::pair<std::size_t , CClass>& class_) {
226
216
return class_.second .initialized ();
227
217
});
228
218
}
229
219
220
// Accessor added by this change: returns the number of entries currently held
// in m_ClassConditionalDensities, i.e. the number of classes.
+ std::size_t CNaiveBayes::numberClasses () const {
221
+ return m_ClassConditionalDensities.size ();
222
+ }
223
+
230
224
// Create one class per entry of counts.
//
// For each pair, second is used as the class key and first is passed to the
// CClass constructor (presumably the initial class count; confirm against
// CClass). NOTE(review): if the container has unique keys, emplace leaves an
// existing class with the same key unchanged rather than overwriting it.
void CNaiveBayes::initialClassCounts (const TDoubleSizePrVec& counts) {
231
225
for (const auto & count : counts) {
232
226
m_ClassConditionalDensities.emplace (count.second , CClass{count.first });
233
227
}
234
228
}
235
229
236
230
void CNaiveBayes::addTrainingDataPoint (std::size_t label, const TDouble1VecVec& x) {
237
- if (! this ->validate (x)) {
231
+ if (this ->validate (x) == false ) {
238
232
return ;
239
233
}
240
234
@@ -249,7 +243,7 @@ void CNaiveBayes::addTrainingDataPoint(std::size_t label, const TDouble1VecVec&
249
243
250
244
bool updateCount{false };
251
245
for (std::size_t i = 0 ; i < x.size (); ++i) {
252
- if (x[i].size () > 0 ) {
246
+ if (x[i].empty () == false ) {
253
247
class_.conditionalDensities ()[i]->add (x[i]);
254
248
updateCount = true ;
255
249
}
@@ -280,62 +274,74 @@ void CNaiveBayes::propagateForwardsByTime(double time) {
280
274
}
281
275
}
282
276
283
// Return the n most probable classes for the feature vector x, in descending
// order of probability, paired with the minimum feature weight reported by
// classProbabilities. n is clamped to the number of classes available, so the
// result may hold fewer than n entries (and is empty if classProbabilities
// returns no classes).
//
// Lines marked '-' are the pre-change implementation kept by this diff (it
// returned only the probability vector); lines marked '+' are the current one.
- CNaiveBayes::TDoubleSizePrVec
284
- CNaiveBayes::highestClassProbabilities (std::size_t n, const TDouble1VecVec& x) const {
285
- TDoubleSizePrVec p (this ->classProbabilities (x));
277
+ CNaiveBayes::TDoubleSizePrVecDoublePr
278
+ CNaiveBayes::highestClassProbabilities (std::size_t n,
279
+ const TDouble1VecVec& x,
280
+ const TFeatureWeightProvider& weightProvider) const {
281
+ auto [p, minFeatureWeight] = this ->classProbabilities (x, weightProvider);
286
282
n = std::min (n, p.size ());
287
- std::sort (p.begin (), p.begin () + n, std::greater<TDoubleSizePr >());
288
- return TDoubleSizePrVec{p.begin (), p.begin () + n};
// classProbabilities yields one entry per class in container order, so the
// top n must be selected from the whole range: sorting only the first n
// entries would rank an arbitrary prefix rather than the n largest.
283
+ std::partial_sort (p.begin (), p.begin () + n, p.end (), std::greater<>());
284
+ return { TDoubleSizePrVec{p.begin (), p.begin () + n}, minFeatureWeight };
289
285
}
290
286
291
// Return the probability of the class identified by label for the feature
// vector x, paired with the minimum feature weight reported by
// classProbabilities.
//
// The class is found by a linear search over the (probability, label) pairs;
// if no entry carries label the probability is 0.0.
//
// Lines marked '-' are the pre-change implementation, which returned only the
// probability; lines marked '+' are the current one.
- double CNaiveBayes::classProbability (std::size_t label, const TDouble1VecVec& x) const {
292
- TDoubleSizePrVec p (this ->classProbabilities (x));
287
+ CNaiveBayes::TDoubleDoublePr
288
+ CNaiveBayes::classProbability (std::size_t label,
289
+ const TDouble1VecVec& x,
290
+ const TFeatureWeightProvider& weightProvider) const {
291
+ auto [p, minFeatureWeight] = this ->classProbabilities (x, weightProvider);
293
292
auto i = std::find_if (p.begin (), p.end (), [label](const TDoubleSizePr& p_) {
294
293
return p_.second == label;
295
294
});
296
- return i == p.end () ? 0.0 : i->first ;
295
+ return { i == p.end () ? 0.0 : i->first , minFeatureWeight} ;
297
296
}
298
297
299
- CNaiveBayes::TDoubleSizePrVec CNaiveBayes::classProbabilities (const TDouble1VecVec& x) const {
300
- if (!this ->validate (x)) {
301
- return {};
298
+ CNaiveBayes::TDoubleSizePrVecDoublePr
299
+ CNaiveBayes::classProbabilities (const TDouble1VecVec& x,
300
+ const TFeatureWeightProvider& weightProvider) const {
301
+ if (this ->validate (x) == false ) {
302
+ return {{}, 0.0 };
302
303
}
303
304
if (m_ClassConditionalDensities.empty ()) {
304
305
LOG_ERROR (<< " Trying to compute class probabilities without supplying training data" );
305
- return {};
306
+ return {{}, 0.0 };
306
307
}
307
308
308
309
using TDoubleVec = std::vector<double >;
309
- using TMaxAccumulator = CBasicStatistics::SMax<double >::TAccumulator;
310
310
311
311
TDoubleSizePrVec p;
312
312
p.reserve (m_ClassConditionalDensities.size ());
313
313
for (const auto & class_ : m_ClassConditionalDensities) {
314
314
p.emplace_back (CTools::fastLog (class_.second .count ()), class_.first );
315
315
}
316
+ double minFeatureWeight{1.0 };
316
317
317
318
TDoubleVec logLikelihoods;
318
319
for (std::size_t i = 0 ; i < x.size (); ++i) {
319
- if (x[i].size () > 0 ) {
320
- TMaxAccumulator maxLogLikelihood ;
320
+ if (x[i].empty () == false ) {
321
+ auto & featureWeight = weightProvider () ;
321
322
logLikelihoods.clear ();
322
323
for (const auto & class_ : m_ClassConditionalDensities) {
323
324
const auto & density = class_.second .conditionalDensities ()[i];
324
325
double logLikelihood{density->logValue (x[i])};
325
326
double logMaximumLikelihood{density->logMaximumValue ()};
326
- maxLogLikelihood.add (logLikelihood - logMaximumLikelihood);
327
327
logLikelihoods.push_back (logLikelihood);
328
+ featureWeight.add (class_.first , logLikelihood - logMaximumLikelihood);
328
329
}
329
- double weight{1.0 };
330
- if (m_MinMaxLogLikelihoodToUseFeature) {
331
- weight = CTools::logisticFunction (
332
- (maxLogLikelihood[0 ] - *m_MinMaxLogLikelihoodToUseFeature) /
333
- std::fabs (*m_MinMaxLogLikelihoodToUseFeature),
334
- 0.1 );
335
- }
330
+
331
+ // We compute the class c_i probability using
332
+ //
333
+ // p(c_i | x) = exp(sum_j{w_j * log(L(x_j | c_i))}) / Z * p(c_i).
334
+ //
335
+ // Any feature whose weight w_j < 1 has its significance dropped in class
336
+ // selection: effectively we use the likelihood raised to the power w_j,
337
+ // which tends to 1 for all values if w_j is small enough. This can be
338
+ // used to ignore features for which x is in the extreme tails of the
339
+ // class conditional distribution.
340
+ double featureWeight_{featureWeight.calculate ()};
336
341
for (std::size_t j = 0 ; j < logLikelihoods.size (); ++j) {
337
- p[j].first += weight * logLikelihoods[j];
342
+ p[j].first += featureWeight_ * logLikelihoods[j];
338
343
}
344
+ minFeatureWeight = std::min (minFeatureWeight, featureWeight_);
339
345
}
340
346
}
341
347
@@ -349,7 +355,7 @@ CNaiveBayes::TDoubleSizePrVec CNaiveBayes::classProbabilities(const TDouble1VecV
349
355
pc.first /= Z;
350
356
}
351
357
352
- return p ;
358
+ return { std::move (p), minFeatureWeight} ;
353
359
}
354
360
355
361
void CNaiveBayes::debugMemoryUsage (const core::CMemoryUsage::TMemoryUsagePtr& mem) const {
@@ -363,8 +369,7 @@ std::size_t CNaiveBayes::memoryUsage() const {
363
369
core::CMemory::dynamicSize (m_ClassConditionalDensities);
364
370
}
365
371
366
- uint64_t CNaiveBayes::checksum (uint64_t seed) const {
367
- CChecksum::calculate (seed, m_MinMaxLogLikelihoodToUseFeature);
372
+ std::uint64_t CNaiveBayes::checksum (std::uint64_t seed) const {
368
373
CChecksum::calculate (seed, m_DecayRate);
369
374
CChecksum::calculate (seed, m_Exemplar);
370
375
return CChecksum::calculate (seed, m_ClassConditionalDensities);
@@ -386,7 +391,7 @@ std::string CNaiveBayes::print() const {
386
391
bool CNaiveBayes::validate (const TDouble1VecVec& x) const {
387
392
auto class_ = m_ClassConditionalDensities.begin ();
388
393
if (class_ != m_ClassConditionalDensities.end () &&
389
- class_->second .conditionalDensities ().size () > 0 &&
394
+ class_->second .conditionalDensities ().empty () == false &&
390
395
class_->second .conditionalDensities ().size () != x.size ()) {
391
396
LOG_ERROR (<< " Unexpected feature vector: " << core::CContainerPrinter::print (x));
392
397
return false ;
@@ -423,7 +428,7 @@ bool CNaiveBayes::CClass::acceptRestoreTraverser(const SDistributionRestoreParam
423
428
void CNaiveBayes::CClass::acceptPersistInserter (core::CStatePersistInserter& inserter) const {
424
429
inserter.insertValue (COUNT_TAG, m_Count, core::CIEEE754::E_SinglePrecision);
425
430
for (const auto & density : m_ConditionalDensities) {
426
- if (dynamic_cast <const CNaiveBayesFeatureDensityFromPrior*>(density.get ())) {
431
+ if (dynamic_cast <const CNaiveBayesFeatureDensityFromPrior*>(density.get ()) != nullptr ) {
427
432
inserter.insertLevel (CONDITIONAL_DENSITY_FROM_PRIOR_TAG,
428
433
std::bind (&CNaiveBayesFeatureDensity::acceptPersistInserter,
429
434
density.get (), std::placeholders::_1));
0 commit comments