@@ -40,8 +40,6 @@ namespace {
40
40
const core::TPersistenceTag PRIOR_TAG{" a" , " prior" };
41
41
const core::TPersistenceTag CLASS_LABEL_TAG{" b" , " class_label" };
42
42
const core::TPersistenceTag CLASS_MODEL_TAG{" c" , " class_model" };
43
- const core::TPersistenceTag MIN_MAX_LOG_LIKELIHOOD_TO_USE_FEATURE_TAG{
44
- " d" , " min_max_likelihood_to_use_feature" };
45
43
const core::TPersistenceTag COUNT_TAG{" e" , " count" };
46
44
const core::TPersistenceTag CONDITIONAL_DENSITY_FROM_PRIOR_TAG{" f" , " conditional_density_from_prior" };
47
45
}
@@ -141,27 +139,26 @@ std::string CNaiveBayesFeatureDensityFromPrior::print() const {
141
139
return result;
142
140
}
143
141
144
- CNaiveBayes::CNaiveBayes (const CNaiveBayesFeatureDensity& exemplar,
145
- double decayRate,
146
- TOptionalDouble minMaxLogLikelihoodToUseFeature)
147
- : m_MinMaxLogLikelihoodToUseFeature{minMaxLogLikelihoodToUseFeature},
148
- m_DecayRate{decayRate}, m_Exemplar{exemplar.clone ()}, m_ClassConditionalDensities{2 } {
142
+ CNaiveBayes::CNaiveBayes (const CNaiveBayesFeatureDensity& exemplar, double decayRate)
143
+ : m_DecayRate{decayRate}, m_Exemplar{exemplar.clone ()}, m_ClassConditionalDensities{2 } {
149
144
}
150
145
151
146
//! Construct by restoring persisted state.
//!
//! \param[in] exemplar The feature density which is cloned and kept as the
//! exemplar for this classifier.
//! \param[in] params Supplies the decay rate and is forwarded to
//! acceptRestoreTraverser.
//! \param[in,out] traverser The state to restore from. It is set to the bad
//! state if restoring the sub-level fails.
CNaiveBayes::CNaiveBayes(const CNaiveBayesFeatureDensity& exemplar,
                         const SDistributionRestoreParams& params,
                         core::CStateRestoreTraverser& traverser)
    : m_DecayRate{params.s_DecayRate},
      m_Exemplar{exemplar.clone()},
      m_ClassConditionalDensities{2} {
    // A classifier which was persisted before any class conditional
    // distributions were created has no sub-level to read. Attempting to
    // traverse it would mark the traverser as bad, so this case is treated
    // as "nothing to restore" rather than as an error.
    if (traverser.hasSubLevel() == false) {
        return;
    }

    if (traverser.traverseSubLevel([&](auto& traverser_) {
            return this->acceptRestoreTraverser(params, traverser_);
        }) == false) {
        traverser.setBadState();
    }
}
161
159
162
160
CNaiveBayes::CNaiveBayes (const CNaiveBayes& other)
163
- : m_MinMaxLogLikelihoodToUseFeature{other.m_MinMaxLogLikelihoodToUseFeature },
164
- m_DecayRate{other.m_DecayRate }, m_Exemplar{other.m_Exemplar ->clone ()} {
161
+ : m_DecayRate{other.m_DecayRate }, m_Exemplar{other.m_Exemplar ->clone ()} {
165
162
for (const auto & class_ : other.m_ClassConditionalDensities ) {
166
163
m_ClassConditionalDensities.emplace (class_.first , class_.second );
167
164
}
@@ -178,9 +175,6 @@ bool CNaiveBayes::acceptRestoreTraverser(const SDistributionRestoreParams& param
178
175
return class_.acceptRestoreTraverser (params, traverser_);
179
176
}),
180
177
m_ClassConditionalDensities.emplace (label, std::move (class_)))
181
- RESTORE_SETUP_TEARDOWN (MIN_MAX_LOG_LIKELIHOOD_TO_USE_FEATURE_TAG, double value,
182
- core::CStringUtils::stringToType (traverser.value (), value),
183
- m_MinMaxLogLikelihoodToUseFeature.emplace (value))
184
178
} while (traverser.next ());
185
179
return true ;
186
180
}
@@ -203,12 +197,6 @@ void CNaiveBayes::acceptPersistInserter(core::CStatePersistInserter& inserter) c
203
197
class_->second .acceptPersistInserter (inserter_);
204
198
});
205
199
}
206
-
207
- if (m_MinMaxLogLikelihoodToUseFeature) {
208
- inserter.insertValue (MIN_MAX_LOG_LIKELIHOOD_TO_USE_FEATURE_TAG,
209
- *m_MinMaxLogLikelihoodToUseFeature,
210
- core::CIEEE754::E_SinglePrecision);
211
- }
212
200
}
213
201
214
202
CNaiveBayes& CNaiveBayes::operator =(const CNaiveBayes& other) {
@@ -223,26 +211,29 @@ void CNaiveBayes::swap(CNaiveBayes& other) {
223
211
std::swap (m_DecayRate, other.m_DecayRate );
224
212
m_Exemplar.swap (other.m_Exemplar );
225
213
m_ClassConditionalDensities.swap (other.m_ClassConditionalDensities );
226
- std::swap (m_MinMaxLogLikelihoodToUseFeature, other.m_MinMaxLogLikelihoodToUseFeature );
227
214
}
228
215
229
216
bool CNaiveBayes::initialized () const {
230
- return m_ClassConditionalDensities.size () > 0 &&
217
+ return m_ClassConditionalDensities.empty () == false &&
231
218
std::all_of (m_ClassConditionalDensities.begin (),
232
219
m_ClassConditionalDensities.end (),
233
220
[](const std::pair<std::size_t , CClass>& class_) {
234
221
return class_.second .initialized ();
235
222
});
236
223
}
237
224
225
+ std::size_t CNaiveBayes::numberClasses () const {
226
+ return m_ClassConditionalDensities.size ();
227
+ }
228
+
238
229
void CNaiveBayes::initialClassCounts (const TDoubleSizePrVec& counts) {
239
230
for (const auto & count : counts) {
240
231
m_ClassConditionalDensities.emplace (count.second , CClass{count.first });
241
232
}
242
233
}
243
234
244
235
void CNaiveBayes::addTrainingDataPoint (std::size_t label, const TDouble1VecVec& x) {
245
- if (! this ->validate (x)) {
236
+ if (this ->validate (x) == false ) {
246
237
return ;
247
238
}
248
239
@@ -257,7 +248,7 @@ void CNaiveBayes::addTrainingDataPoint(std::size_t label, const TDouble1VecVec&
257
248
258
249
bool updateCount{false };
259
250
for (std::size_t i = 0 ; i < x.size (); ++i) {
260
- if (x[i].size () > 0 ) {
251
+ if (x[i].empty () == false ) {
261
252
class_.conditionalDensities ()[i]->add (x[i]);
262
253
updateCount = true ;
263
254
}
@@ -288,62 +279,74 @@ void CNaiveBayes::propagateForwardsByTime(double time) {
288
279
}
289
280
}
290
281
291
- CNaiveBayes::TDoubleSizePrVec
292
- CNaiveBayes::highestClassProbabilities (std::size_t n, const TDouble1VecVec& x) const {
293
- TDoubleSizePrVec p (this ->classProbabilities (x));
282
+ CNaiveBayes::TDoubleSizePrVecDoublePr
283
+ CNaiveBayes::highestClassProbabilities (std::size_t n,
284
+ const TDouble1VecVec& x,
285
+ const TFeatureWeightProvider& weightProvider) const {
286
+ auto [p, minFeatureWeight] = this ->classProbabilities (x, weightProvider);
294
287
n = std::min (n, p.size ());
295
288
std::sort (p.begin (), p.begin () + n, std::greater<>());
296
- return TDoubleSizePrVec{p.begin (), p.begin () + n};
289
+ return { TDoubleSizePrVec{p.begin (), p.begin () + n}, minFeatureWeight };
297
290
}
298
291
299
- double CNaiveBayes::classProbability (std::size_t label, const TDouble1VecVec& x) const {
300
- TDoubleSizePrVec p (this ->classProbabilities (x));
292
+ CNaiveBayes::TDoubleDoublePr
293
+ CNaiveBayes::classProbability (std::size_t label,
294
+ const TDouble1VecVec& x,
295
+ const TFeatureWeightProvider& weightProvider) const {
296
+ auto [p, minFeatureWeight] = this ->classProbabilities (x, weightProvider);
301
297
auto i = std::find_if (p.begin (), p.end (), [label](const TDoubleSizePr& p_) {
302
298
return p_.second == label;
303
299
});
304
- return i == p.end () ? 0.0 : i->first ;
300
+ return { i == p.end () ? 0.0 : i->first , minFeatureWeight} ;
305
301
}
306
302
307
- CNaiveBayes::TDoubleSizePrVec CNaiveBayes::classProbabilities (const TDouble1VecVec& x) const {
308
- if (!this ->validate (x)) {
309
- return {};
303
+ CNaiveBayes::TDoubleSizePrVecDoublePr
304
+ CNaiveBayes::classProbabilities (const TDouble1VecVec& x,
305
+ const TFeatureWeightProvider& weightProvider) const {
306
+ if (this ->validate (x) == false ) {
307
+ return {{}, 0.0 };
310
308
}
311
309
if (m_ClassConditionalDensities.empty ()) {
312
310
LOG_ERROR (<< " Trying to compute class probabilities without supplying training data" );
313
- return {};
311
+ return {{}, 0.0 };
314
312
}
315
313
316
314
using TDoubleVec = std::vector<double >;
317
- using TMaxAccumulator = CBasicStatistics::SMax<double >::TAccumulator;
318
315
319
316
TDoubleSizePrVec p;
320
317
p.reserve (m_ClassConditionalDensities.size ());
321
318
for (const auto & class_ : m_ClassConditionalDensities) {
322
319
p.emplace_back (CTools::fastLog (class_.second .count ()), class_.first );
323
320
}
321
+ double minFeatureWeight{1.0 };
324
322
325
323
TDoubleVec logLikelihoods;
326
324
for (std::size_t i = 0 ; i < x.size (); ++i) {
327
- if (x[i].size () > 0 ) {
328
- TMaxAccumulator maxLogLikelihood ;
325
+ if (x[i].empty () == false ) {
326
+ auto & featureWeight = weightProvider () ;
329
327
logLikelihoods.clear ();
330
328
for (const auto & class_ : m_ClassConditionalDensities) {
331
329
const auto & density = class_.second .conditionalDensities ()[i];
332
330
double logLikelihood{density->logValue (x[i])};
333
331
double logMaximumLikelihood{density->logMaximumValue ()};
334
- maxLogLikelihood.add (logLikelihood - logMaximumLikelihood);
335
332
logLikelihoods.push_back (logLikelihood);
333
+ featureWeight.add (class_.first , logLikelihood - logMaximumLikelihood);
336
334
}
337
- double weight{1.0 };
338
- if (m_MinMaxLogLikelihoodToUseFeature) {
339
- weight = CTools::logisticFunction (
340
- (maxLogLikelihood[0 ] - *m_MinMaxLogLikelihoodToUseFeature) /
341
- std::fabs (*m_MinMaxLogLikelihoodToUseFeature),
342
- 0.1 );
343
- }
335
+
336
+ // We compute the class c_i probability using
337
+ //
338
+ // p(c_i | x) = exp(sum_j{w_j * log(L(x_j | c_i))}) / Z * p(c_i).
339
+ //
340
+ // Any feature whose weight w_j < 1 has its significance dropped in class
341
+ // selection: effectively we use the w_j'th power of the likelihood,
342
+ // which tends to 1 for all values if w_j is small enough. This can be
343
+ // used to ignore features for which x is in the extreme tails of the
344
+ // class conditional distribution.
345
+ double featureWeight_{featureWeight.calculate ()};
344
346
for (std::size_t j = 0 ; j < logLikelihoods.size (); ++j) {
345
- p[j].first += weight * logLikelihoods[j];
347
+ p[j].first += featureWeight_ * logLikelihoods[j];
346
348
}
349
+ minFeatureWeight = std::min (minFeatureWeight, featureWeight_);
347
350
}
348
351
}
349
352
@@ -357,7 +360,7 @@ CNaiveBayes::TDoubleSizePrVec CNaiveBayes::classProbabilities(const TDouble1VecV
357
360
pc.first /= Z;
358
361
}
359
362
360
- return p ;
363
+ return { std::move (p), minFeatureWeight} ;
361
364
}
362
365
363
366
void CNaiveBayes::debugMemoryUsage (const core::CMemoryUsage::TMemoryUsagePtr& mem) const {
@@ -372,7 +375,6 @@ std::size_t CNaiveBayes::memoryUsage() const {
372
375
}
373
376
374
377
std::uint64_t CNaiveBayes::checksum (std::uint64_t seed) const {
375
- CChecksum::calculate (seed, m_MinMaxLogLikelihoodToUseFeature);
376
378
CChecksum::calculate (seed, m_DecayRate);
377
379
CChecksum::calculate (seed, m_Exemplar);
378
380
return CChecksum::calculate (seed, m_ClassConditionalDensities);
@@ -394,7 +396,7 @@ std::string CNaiveBayes::print() const {
394
396
bool CNaiveBayes::validate (const TDouble1VecVec& x) const {
395
397
auto class_ = m_ClassConditionalDensities.begin ();
396
398
if (class_ != m_ClassConditionalDensities.end () &&
397
- class_->second .conditionalDensities ().size () > 0 &&
399
+ class_->second .conditionalDensities ().empty () == false &&
398
400
class_->second .conditionalDensities ().size () != x.size ()) {
399
401
LOG_ERROR (<< " Unexpected feature vector: " << x);
400
402
return false ;
@@ -431,7 +433,7 @@ bool CNaiveBayes::CClass::acceptRestoreTraverser(const SDistributionRestoreParam
431
433
void CNaiveBayes::CClass::acceptPersistInserter (core::CStatePersistInserter& inserter) const {
432
434
inserter.insertValue (COUNT_TAG, m_Count, core::CIEEE754::E_SinglePrecision);
433
435
for (const auto & density : m_ConditionalDensities) {
434
- if (dynamic_cast <const CNaiveBayesFeatureDensityFromPrior*>(density.get ())) {
436
+ if (dynamic_cast <const CNaiveBayesFeatureDensityFromPrior*>(density.get ()) != nullptr ) {
435
437
inserter.insertLevel (CONDITIONAL_DENSITY_FROM_PRIOR_TAG,
436
438
[&density](auto & inserter_) {
437
439
density->acceptPersistInserter (inserter_);
0 commit comments