Skip to content

Commit ca215a9

Browse files
committed
[ML] Improve forecasting for time series with step changes (#2591)
We model the level of a time series in which we've observed step discontinuities via a Markov process for forecasting. Specifically, we estimate the historical step size distribution and the distribution of the steps in time and as a function of the time series value. For this second part we use an online naive Bayes model to estimate the probability that, at any given point in a roll out for forecasting, we will get a step. This approach generally works well unless, when we roll out, we're in the tails of the distribution of values we've observed for the time series historically. In this case, our predicted probabilities are very sensitive to the tail behaviour of the distributions we fit to the time series values where we saw a step, and sometimes we predict far too many steps as a result. We can detect this case, i.e. when we're in the tails of the time series value distribution. This change does so and stops predicting changes in such cases, which avoids pathologies. This fixes #2466.
2 parents e1d8f49 + 7c29c8a commit ca215a9

File tree

14 files changed

+336
-102
lines changed

14 files changed

+336
-102
lines changed

.buildkite/ml_pipeline/config.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,15 @@ class Config:
1515
build_windows: bool = False
1616
build_macos: bool = False
1717
build_linux: bool = False
18+
run_qa_tests: bool = False
1819
action: str = "build"
1920

2021
def parse_comment(self):
2122
if "GITHUB_PR_COMMENT_VAR_ACTION" in os.environ:
2223
self.action = os.environ["GITHUB_PR_COMMENT_VAR_ACTION"]
24+
self.run_qa_tests = self.action == "run_qa_tests"
25+
if self.run_qa_tests:
26+
self.action = "build"
2327

2428
if "GITHUB_PR_COMMENT_VAR_PLATFORM" in os.environ:
2529
csv_platform = os.environ["GITHUB_PR_COMMENT_VAR_PLATFORM"]
@@ -36,13 +40,14 @@ def parse_comment(self):
3640
self.build_linux = True
3741

3842
def parse_label(self):
39-
build_labels = ['ci:build-linux','ci:build-macos','ci:build-windows']
43+
build_labels = ['ci:build-linux','ci:build-macos','ci:build-windows','ci:run-qa-tests']
4044
all_labels = [x.strip().lower() for x in os.environ["GITHUB_PR_LABELS"].split(",")]
4145
ci_labels = [label for label in all_labels if re.search("|".join(build_labels), label)]
4246
if not ci_labels:
4347
self.build_windows = True
4448
self.build_macos = True
4549
self.build_linux = True
50+
self.run_qa_tests = False
4651
else:
4752
for label in ci_labels:
4853
if "ci:build-windows" == label:
@@ -51,6 +56,11 @@ def parse_label(self):
5156
self.build_macos = True
5257
elif "ci:build-linux" == label:
5358
self.build_linux = True
59+
elif "ci:run-qa-tests" == label:
60+
self.build_windows = True
61+
self.build_macos = True
62+
self.build_linux = True
63+
self.run_qa_tests = True
5464

5565
def parse(self):
5666
"""Parse Github label or Github comment passed through buildkite-pr-bot."""
@@ -63,4 +73,5 @@ def parse(self):
6373
self.build_windows = True
6474
self.build_macos = True
6575
self.build_linux = True
76+
self.run_qa_tests = False
6677

.buildkite/pipeline.json.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,9 @@ def main():
5050
pipeline_steps.append(build_linux)
5151
pipeline_steps.append(pipeline_steps.generate_step("Upload ES tests runner pipeline",
5252
".buildkite/pipelines/run_es_tests.yml.sh"))
53+
if config.run_qa_tests:
54+
pipeline_steps.append(pipeline_steps.generate_step("Upload QA tests runner pipeline",
55+
".buildkite/pipelines/run_qa_tests.yml.sh"))
5356
pipeline["env"] = env
5457
pipeline["steps"] = pipeline_steps
5558
print(json.dumps(pipeline, indent=2))
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/bin/bash
2+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
# or more contributor license agreements. Licensed under the Elastic License
4+
# 2.0 and the following additional limitation. Functionality enabled by the
5+
# files subject to the Elastic License 2.0 may only be used in production when
6+
# invoked by an Elasticsearch process with a license key installed that permits
7+
# use of machine learning features. You may not use this file except in
8+
# compliance with the Elastic License 2.0 and the foregoing additional
9+
# limitation.
10+
11+
cat <<EOL
12+
steps:
13+
- label: "Trigger Appex QA Tests :test_tube:"
14+
command:
15+
- echo 'Trigger QA Tests'
16+
- 'buildkite-agent artifact download "build/*" . --step build_test_linux-x86_64-RelWithDebInfo'
17+
depends_on: "build_test_linux-x86_64-RelWithDebInfo"
18+
- wait
19+
- trigger: appex-qa-stateful-custom-ml-c-plus-plus-build-testing
20+
async: false
21+
build:
22+
message: "${BUILDKITE_MESSAGE}"
23+
env:
24+
QAF_TESTS_TO_RUN: "ml_cpp_pr"
25+
EOL

.buildkite/pull-requests.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
"commit_status_context": "ml-cpp-ci",
1010
"build_on_commit": true,
1111
"build_on_comment": true,
12-
"trigger_comment_regex": "^(?:(?:buildkite +)(?<action>build|debug) +(?: *on *(?<platform>(?:[ ,]*(?:windows|linux|mac(os)?))+))?)$",
12+
"trigger_comment_regex": "^(?:(?:buildkite +)(?<action>build|debug|run_qa_tests) +(?: *on *(?<platform>(?:[ ,]*(?:windows|linux|mac(os)?))+))?)$",
1313
"always_trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))",
1414
"skip_ci_labels": ["skip-ci", "jenkins-ci", ">test-mute", ">docs"],
1515
"skip_target_branches": ["6.8", "7.11", "7.12"],

docs/CHANGELOG.asciidoc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@
3333
=== Enhancements
3434

3535
* Upgrade Boost libraries to version 1.83. (See {ml-pull}2560[#2560].)
36+
* Improve forecasting for time series with step changes. (See {ml-pull}2591[#2591],
37+
issue: {ml-issue}2466[#2466].)
3638

3739
=== Bug Fixes
3840

include/maths/common/CNaiveBayes.h

Lines changed: 60 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -154,19 +154,43 @@ class MATHS_COMMON_EXPORT CNaiveBayesFeatureDensityFromPrior final
154154
TPriorPtr m_Prior;
155155
};
156156

157+
//! \brief Enables using custom feature weights in class prediction.
158+
class CNaiveBayesFeatureWeight {
159+
public:
160+
virtual ~CNaiveBayesFeatureWeight() = default;
161+
virtual void add(std::size_t class_, double logLikelihood) = 0;
162+
virtual double calculate() const = 0;
163+
};
164+
157165
//! \brief Implements a Naive Bayes classifier.
158166
class MATHS_COMMON_EXPORT CNaiveBayes {
159167
public:
168+
using TDoubleDoublePr = std::pair<double, double>;
160169
using TDoubleSizePr = std::pair<double, std::size_t>;
161170
using TDoubleSizePrVec = std::vector<TDoubleSizePr>;
171+
using TDoubleSizePrVecDoublePr = std::pair<TDoubleSizePrVec, double>;
162172
using TDouble1Vec = core::CSmallVector<double, 1>;
163173
using TDouble1VecVec = std::vector<TDouble1Vec>;
164-
using TOptionalDouble = std::optional<double>;
174+
using TFeatureWeightProvider = std::function<CNaiveBayesFeatureWeight&()>;
175+
176+
private:
177+
//! \brief All features have unit weight in class prediction.
178+
class CUnitFeatureWeight : public CNaiveBayesFeatureWeight {
179+
public:
180+
void add(std::size_t, double) override {}
181+
double calculate() const override { return 1.0; }
182+
};
183+
184+
class CUnitFeatureWeightProvider {
185+
public:
186+
CUnitFeatureWeight& operator()() const { return m_UnitWeight; }
187+
188+
private:
189+
mutable CUnitFeatureWeight m_UnitWeight;
190+
};
165191

166192
public:
167-
explicit CNaiveBayes(const CNaiveBayesFeatureDensity& exemplar,
168-
double decayRate = 0.0,
169-
TOptionalDouble minMaxLogLikelihoodToUseFeature = TOptionalDouble());
193+
explicit CNaiveBayes(const CNaiveBayesFeatureDensity& exemplar, double decayRate = 0.0);
170194
CNaiveBayes(const CNaiveBayesFeatureDensity& exemplar,
171195
const SDistributionRestoreParams& params,
172196
core::CStateRestoreTraverser& traverser);
@@ -184,6 +208,9 @@ class MATHS_COMMON_EXPORT CNaiveBayes {
184208
//! Check if any training data has been added.
185209
bool initialized() const;
186210

211+
//! Get the number of classes.
212+
std::size_t numberClasses() const;
213+
187214
//! This can be used to optionally seed the class counts
188215
//! with \p counts. These are added on to data class counts
189216
//! to compute the class posterior probabilities.
@@ -210,27 +237,53 @@ class MATHS_COMMON_EXPORT CNaiveBayes {
210237
//!
211238
//! \param[in] n The number of class probabilities to estimate.
212239
//! \param[in] x The feature values.
240+
//! \param[in] weightProvider Computes a feature weight from the class
241+
//! conditional log-likelihood of the feature value. It should be in
242+
//! the range [0,1]. The smaller the value the less impact the feature
243+
//! has on class selection.
244+
//! \return The class probabilities and the minimum feature weight.
213245
//! \note \p x size should be equal to the number of features.
214246
//! A missing feature is indicated by passing an empty vector
215247
//! for that feature.
216-
TDoubleSizePrVec highestClassProbabilities(std::size_t n, const TDouble1VecVec& x) const;
248+
TDoubleSizePrVecDoublePr highestClassProbabilities(
249+
std::size_t n,
250+
const TDouble1VecVec& x,
251+
const TFeatureWeightProvider& weightProvider = CUnitFeatureWeightProvider{}) const;
217252

218253
//! Get the probability of the class labeled \p label for \p x.
219254
//!
220255
//! \param[in] label The label of the class of interest.
221256
//! \param[in] x The feature values.
257+
//! \param[in] weightProvider Computes a feature weight from the class
258+
//! conditional log-likelihood of the feature value. It should be in
259+
//! the range [0,1]. The smaller the value the less impact the feature
260+
//! has on class selection.
261+
//! \return The probability of the class labeled \p label and the
262+
//! minimum feature weight.
222263
//! \note \p x size should be equal to the number of features.
223264
//! A missing feature is indicated by passing an empty vector
224265
//! for that feature.
225-
double classProbability(std::size_t label, const TDouble1VecVec& x) const;
266+
TDoubleDoublePr classProbability(std::size_t label,
267+
const TDouble1VecVec& x,
268+
const TFeatureWeightProvider& weightProvider =
269+
CUnitFeatureWeightProvider{}) const;
226270

227271
//! Get the probabilities of all the classes for \p x.
228272
//!
229273
//! \param[in] x The feature values.
274+
//! \param[in] weightProvider Computes a feature weight from the class
275+
//! conditional log-likelihood of the feature value. It should be in
276+
//! the range [0,1]. The smaller the value the less impact the feature
277+
//! has on class selection.
278+
//! \return The class probabilities and the minimum feature weight.
279+
//! A missing feature is indicated by passing an empty vector
280+
//! for that feature.
230281
//! \note \p x size should be equal to the number of features.
231282
//! A missing feature is indicated by passing an empty vector
232283
//! for that feature.
233-
TDoubleSizePrVec classProbabilities(const TDouble1VecVec& x) const;
284+
TDoubleSizePrVecDoublePr
285+
classProbabilities(const TDouble1VecVec& x,
286+
const TFeatureWeightProvider& weightProvider = CUnitFeatureWeightProvider{}) const;
234287

235288
//! Debug the memory used by this object.
236289
void debugMemoryUsage(const core::CMemoryUsage::TMemoryUsagePtr& mem) const;
@@ -298,13 +351,6 @@ class MATHS_COMMON_EXPORT CNaiveBayes {
298351
bool validate(const TDouble1VecVec& x) const;
299352

300353
private:
301-
//! It is not always appropriate to use features with very low
302-
//! probability in all classes to discriminate: the class choice
303-
//! will be very sensitive to the underlying conditional density
304-
//! model. This is a cutoff (for the minimum maximum class log
305-
//! likelihood) in order to use a feature.
306-
TOptionalDouble m_MinMaxLogLikelihoodToUseFeature;
307-
308354
//! Controls the rate at which data are aged out.
309355
double m_DecayRate;
310356

jupyter/requirements.txt

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,14 @@ jupyter ==1.0.0
1111
libtmux ==0.10.1
1212
matplotlib ==3.3.4
1313
nbsmoke ==0.5.0
14-
numpy >=1.21.0
14+
numpy >=1.22.2
1515
pandas >=1.3
1616
pathlib2 ==2.3.5
1717
plotly ==5.3.1
1818
sacred ==0.8.2
19-
scikit-learn ==0.24.1
19+
scikit-learn ==1.3
2020
scipy >=1.5.4
2121
seaborn ==0.11.1
22-
tornado>=6.3.3 # not directly required, pinned by Snyk to avoid a vulnerability
23-
jupyter-server>=2.7.2 # not directly required, pinned by Snyk to avoid a vulnerability
22+
tornado >=6.3.3 # not directly required, pinned by Snyk to avoid a vulnerability
23+
jupyter-server >=2.7.2 # not directly required, pinned by Snyk to avoid a vulnerability
24+
pillow >=10.0.1 # not directly required, pinned by Snyk to avoid a vulnerability

lib/core/CStateRestoreTraverser.cc

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,7 @@ namespace core {
1818
CStateRestoreTraverser::CStateRestoreTraverser() : m_BadState(false) {
1919
}
2020

21-
CStateRestoreTraverser::~CStateRestoreTraverser() {
22-
}
21+
CStateRestoreTraverser::~CStateRestoreTraverser() = default;
2322

2423
bool CStateRestoreTraverser::haveBadState() const {
2524
return m_BadState;

0 commit comments

Comments
 (0)