Skip to content

Commit 6daf0b8

Browse files
Mahadik, Mukul Chandrakant
authored and committed
Using train / test data split + Added value-check tests + Reduced instance variables
1. Split up mock trips data into train / test data. - Saw that this was being done in one of the tests in TestForestModelLoadandSave.py itself as well as in TestGreedySimilarityBinning.py - Hence added it to all forest model tests for uniformity. 2. Reduced the number of instance variables since they were used inside setUp() only. This addresses the review comment mentioned originally for TestForestModelIntegration e-mission#938 (comment) 3. Cleaned up TestForestModelIntegration.py - Added equality tests that check for prediction values generated in the pipeline. Addresses review comment: e-mission#938 (comment) - Added train / test data split. - Removed check for empty data in setUp(). Addresses review comment: e-mission#938 (comment)
1 parent e7f5d21 commit 6daf0b8

File tree

3 files changed

+89
-83
lines changed

3 files changed

+89
-83
lines changed

emission/tests/modellingTests/TestForestModelIntegration.py

Lines changed: 60 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
# This tests the label inference pipeline. It uses real data and placeholder inference algorithms
21
import unittest
32
import numpy as np
43
import time
4+
import logging
5+
import bson.objectid as boi
56
import emission.analysis.classification.inference.labels.pipeline as eacilp
67
import emission.analysis.classification.inference.labels.inferrers as eacili
78
import emission.core.wrapper.labelprediction as ecwl
@@ -11,30 +12,29 @@
1112
import emission.core.get_database as edb
1213
import emission.tests.common as etc
1314
import emission.pipeline.intake_stage as epi
14-
import logging
15-
import bson.objectid as boi
16-
1715
import emission.analysis.modelling.trip_model.config as eamtc
18-
1916
import emission.analysis.modelling.trip_model.run_model as eamur
2017
import emission.analysis.modelling.trip_model.model_type as eamumt
2118
import emission.analysis.modelling.trip_model.model_storage as eamums
2219
import emission.tests.modellingTests.modellingTestAssets as etmm
2320
import emission.storage.timeseries.abstract_timeseries as esta
2421

25-
2622
class TestForestModelIntegration(unittest.TestCase):
27-
# Test if the forest model for label prediction is smoothly integrated with the inference pipeline.
28-
# In the initial setup, build a dummy forest model. Then run the pipeline on real example data.
29-
# Finally in the test, assert the type of label predictions expected.
30-
23+
"""
24+
This tests the label inference pipeline. It uses real data and placeholder inference algorithms.
25+
Test if the forest model for label prediction is smoothly integrated with the inference pipeline.
26+
In the initial setup, build a dummy forest model. Then run the pipeline on real example data.
27+
Finally in the test, assert the type of label predictions expected.
28+
The label_data dict and mock_trip_data are copied over from TestRunGreedyModel.py
29+
"""
3130
def setUp(self):
3231
np.random.seed(91)
3332
self.test_algorithms = eacilp.primary_algorithms
3433
forest_model_config = eamtc.get_config_value_or_raise('model_parameters.forest')
35-
3634
etc.setupRealExample(self, "emission/tests/data/real_examples/shankari_2015-07-22") ##maybe use a different file
3735
ts = esta.TimeSeries.get_time_series(self.testUUID)
36+
37+
# Generate labels with a known sample weight that we can rely on in the test
3838
label_data = {
3939
"mode_confirm": ['ebike', 'bike'],
4040
"purpose_confirm": ['happy-hour', 'dog-park'],
@@ -43,11 +43,10 @@ def setUp(self):
4343
"purpose_weights": [0.1, 0.9]
4444
}
4545

46-
self.total_trips=100
47-
## generate mock trips
48-
train = etmm.generate_mock_trips(
46+
# Configuration values for randomly-generated test data copied over from TestRunGreedyModel.py
47+
mock_trip_data = etmm.generate_mock_trips(
4948
user_id=self.testUUID,
50-
trips=self.total_trips,
49+
trips=100,
5150
origin=(-105.1705977, 39.7402654),
5251
destination=(-105.1755606, 39.7673075),
5352
trip_part='od',
@@ -56,61 +55,80 @@ def setUp(self):
5655
threshold=0.004, # ~400m
5756
has_label_p=0.9
5857
)
59-
## Required for Forest model inference
60-
for result_entry in train:
58+
59+
# Required for Forest model inference
60+
for result_entry in mock_trip_data:
6161
result_entry['data']['start_local_dt']=result_entry['metadata']['write_local_dt']
6262
result_entry['data']['end_local_dt']=result_entry['metadata']['write_local_dt']
6363
result_entry['data']['start_place']=boi.ObjectId()
6464
result_entry['data']['end_place']=boi.ObjectId()
65-
ts.bulk_insert(train)
66-
# confirm data write did not fail
67-
check_data = esda.get_entries(key="analysis/confirmed_trip", user_id=self.testUUID, time_query=None)
68-
if len(check_data) != self.total_trips:
69-
logging.debug(f'test invariant failed after generating test data')
70-
self.fail()
71-
else:
72-
logging.debug(f'found {self.total_trips} trips in database')
73-
## Build an already existing model or a new model
65+
66+
split = int(len(mock_trip_data)*0.7)
67+
mock_train_data = mock_trip_data[:split]
68+
self.mock_test_data = mock_trip_data[split:]
69+
70+
ts.bulk_insert(mock_train_data)
71+
72+
# Build and train model
73+
logging.debug(f'(TRAIN) creating a model based on trips in database')
7474
eamur.update_trip_model(
7575
user_id=self.testUUID,
7676
model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER,
7777
model_storage=eamums.ModelStorage.DOCUMENT_DATABASE,
78-
min_trips=4,
78+
min_trips=14,
7979
model_config=forest_model_config
8080
)
81-
## run inference pipeline
81+
82+
# Run inference pipeline
8283
self.run_pipeline(self.test_algorithms)
8384
time_range = estt.TimeQuery("metadata.write_ts", None, time.time())
8485
self.inferred_trips = esda.get_entries(esda.INFERRED_TRIP_KEY, self.testUUID, time_query=time_range)
8586

8687
def tearDown(self):
87-
self.reset_all()
88+
etc.dropAllCollections(edb._get_current_db())
8889

8990
def run_pipeline(self, algorithms):
9091
default_primary_algorithms = eacilp.primary_algorithms
9192
eacilp.primary_algorithms = algorithms
9293
epi.run_intake_pipeline_for_user(self.testUUID,skip_if_no_new_data = False)
9394
eacilp.primary_algorithms = default_primary_algorithms
9495

95-
def reset_all(self):
96-
edb.get_analysis_timeseries_db().delete_many({'user_id': self.testUUID})
97-
edb.get_model_db().delete_many({'user_id': self.testUUID})
98-
edb.get_pipeline_state_db().delete_many({'user_id': self.testUUID})
99-
100-
101-
# Tests that forest algorithm being tested runs successfully
10296
def testForestAlgorithm(self):
97+
'''
98+
Tests that forest algorithm runs successfully when called from the analysis pipeline
99+
The tests are based on the existing tests in TestLabelInferencePipeline.py
100+
'''
101+
valid_modes = ['ebike', 'bike']
102+
valid_purposes = ['happy-hour', 'dog-park']
103+
103104
for trip in self.inferred_trips:
104105
entries = esdt.get_sections_for_trip("inference/labels", self.testUUID, trip.get_id())
105106
self.assertEqual(len(entries), len(self.test_algorithms))
106107
for entry in entries:
107-
self.assertGreater(len(entry["data"]["prediction"]), 0)
108+
# Test 1: Check that non-empty prediction list is generated
109+
self.assertGreater(len(entry["data"]["prediction"]), 0, "Prediction list should not be empty - model failed to generate any predictions")
110+
111+
# Test 2: Check for equality of trip inferred labels and prediction value in entry
112+
self.assertEqual(trip["data"]["inferred_labels"], entry["data"]["prediction"])
113+
114+
# Test 3: Check that prediction value in entry is equal to the prediction generated by the algorithm
115+
this_algorithm = ecwl.AlgorithmTypes(entry["data"]["algorithm_id"])
116+
self.assertIn(this_algorithm, self.test_algorithms)
117+
self.assertEqual(entry["data"]["prediction"], self.test_algorithms[this_algorithm]([trip])[0])
118+
108119
for singleprediction in entry["data"]["prediction"]:
109-
self.assertIsInstance(singleprediction, dict, " should be an instance of the dictionary class")
110-
self.assertIsInstance(singleprediction['labels'], dict, " should be an instance of the dictionary class")
111-
self.assertIn('mode_confirm',singleprediction['labels'].keys())
112-
self.assertIn('replaced_mode',singleprediction['labels'].keys())
113-
self.assertIn('purpose_confirm',singleprediction['labels'].keys())
120+
# Test 4: Check that the prediction is a dictionary
121+
self.assertIsInstance(singleprediction, dict, "should be an instance of the dictionary class")
122+
self.assertIsInstance(singleprediction['labels'], dict, "should be an instance of the dictionary class")
123+
124+
# Test 5: Check that the prediction dictionary contains the required keys
125+
self.assertIn('mode_confirm', singleprediction['labels'].keys())
126+
self.assertIn('replaced_mode', singleprediction['labels'].keys())
127+
self.assertIn('purpose_confirm', singleprediction['labels'].keys())
128+
129+
# Test 6: Check that the prediction dictionary contains the correct values
130+
self.assertIn(singleprediction['labels']['mode_confirm'], valid_modes)
131+
self.assertIn(singleprediction['labels']['purpose_confirm'], valid_purposes)
114132

115133
def main():
116134
etc.configLogging()

emission/tests/modellingTests/TestForestModelLoadandSave.py

Lines changed: 25 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -17,27 +17,14 @@
1717
class TestForestModelLoadandSave(unittest.TestCase):
1818
"""
1919
Tests to make sure the model load and save properly
20+
The label_data dict and mock_trip_data are copied over from TestRunGreedyModel.py
2021
"""
21-
def setUp(self):
22-
"""
23-
sets up the end-to-end run model test with Confirmedtrip data
24-
"""
25-
# configuration for randomly-generated test data
26-
self.user_id = user_id = 'TestForestModelLoadAndSave-TestData'
27-
self.origin = (-105.1705977, 39.7402654,)
28-
self.destination = (-105.1755606, 39.7673075)
29-
self.min_trips = 14
30-
self.total_trips = 100
31-
self.clustered_trips = 33 # must have at least self.min_trips similar trips by default
32-
self.has_label_percent = 0.9 # let's make a few that don't have a label, but invariant
33-
# $clustered_trips * $has_label_percent > self.min_trips
34-
# must be correct or else this test could fail under some random test cases.
35-
22+
def setUp(self):
23+
self.user_id = 'TestForestModelLoadAndSave-TestData'
3624
self.unused_user_id = 'asdjfkl;asdfjkl;asd08234ur13fi4jhf2103mkl'
25+
ts = esta.TimeSeries.get_time_series(self.user_id)
3726

38-
ts = esta.TimeSeries.get_time_series(user_id)
39-
40-
# generate labels with a known sample weight that we can rely on in the test
27+
# Generate labels with a known sample weight that we can rely on in the test
4128
label_data = {
4229
"mode_confirm": ['ebike', 'bike'],
4330
"purpose_confirm": ['happy-hour', 'dog-park'],
@@ -46,24 +33,29 @@ def setUp(self):
4633
"purpose_weights": [0.1, 0.9]
4734
}
4835

49-
# generate test data for the database
50-
test_data = etmm.generate_mock_trips(
51-
user_id=user_id,
52-
trips=self.total_trips,
53-
origin=self.origin,
54-
destination=self.destination,
36+
# Configuration values for randomly-generated test data copied over from TestRunGreedyModel.py
37+
mock_trip_data = etmm.generate_mock_trips(
38+
user_id=self.user_id,
39+
trips=100,
40+
origin=(-105.1705977, 39.7402654,),
41+
destination=(-105.1755606, 39.7673075),
5542
trip_part='od',
5643
label_data=label_data,
57-
within_threshold=self.clustered_trips,
44+
within_threshold=33,
5845
threshold=0.004, # ~400m
59-
has_label_p=self.has_label_percent
46+
has_label_p=0.9
6047
)
6148

62-
for result_entry in test_data:
49+
# Required for Forest model inference
50+
for result_entry in mock_trip_data:
6351
result_entry['data']['start_local_dt']=result_entry['metadata']['write_local_dt']
6452
result_entry['data']['end_local_dt']=result_entry['metadata']['write_local_dt']
6553

66-
ts.bulk_insert(test_data)
54+
split = int(len(mock_trip_data)*0.7)
55+
mock_train_data = mock_trip_data[:split]
56+
self.mock_test_data = mock_trip_data[split:]
57+
58+
ts.bulk_insert(mock_train_data)
6759

6860
self.forest_model_config= eamtc.get_config_value_or_raise('model_parameters.forest')
6961

@@ -73,7 +65,7 @@ def setUp(self):
7365
user_id=self.user_id,
7466
model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER,
7567
model_storage=eamums.ModelStorage.DOCUMENT_DATABASE,
76-
min_trips=self.min_trips,
68+
min_trips=14,
7769
model_config=self.forest_model_config
7870
)
7971

@@ -98,10 +90,8 @@ def testForestModelPredictionsEquality(self):
9890
The type of deserialized model attributes and the predictions of this must match
9991
those of initial model.
10092
"""
101-
test_trip_data = esda.get_entries(key=esda.CONFIRMED_TRIP_KEY, user_id=self.user_id, time_query=None)
102-
10393
predictions_list = eamur.predict_labels_with_n(
104-
trip_list = test_trip_data,
94+
trip_list = self.mock_test_data,
10595
model=self.model
10696
)
10797

@@ -111,7 +101,7 @@ def testForestModelPredictionsEquality(self):
111101
deserialized_model.from_dict(model_data)
112102

113103
predictions_deserialized_model_list = eamur.predict_labels_with_n(
114-
trip_list = test_trip_data,
104+
trip_list = self.mock_test_data,
115105
model=deserialized_model
116106
)
117107

@@ -130,10 +120,8 @@ def testForestModelConsistency(self):
130120
ConsistencyTest : To Verify that the serialization and deserialization process
131121
is consistent across multiple executions
132122
"""
133-
test_trip_data = esda.get_entries(key=esda.CONFIRMED_TRIP_KEY, user_id=self.user_id, time_query=None)
134-
135123
predictions_list_model1 = eamur.predict_labels_with_n(
136-
trip_list = test_trip_data,
124+
trip_list = self.mock_test_data,
137125
model=self.model
138126
)
139127

@@ -145,7 +133,7 @@ def testForestModelConsistency(self):
145133
)
146134

147135
predictions_list_model2 = eamur.predict_labels_with_n(
148-
trip_list = test_trip_data,
136+
trip_list = self.mock_test_data,
149137
model=model_iter2
150138
)
151139

emission/tests/modellingTests/TestRunForestModel.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ def testTrainForestModelWithZeroTrips(self):
155155
"pipeline should not have a current timestamp for the test user")
156156

157157

158-
def test1RoundPredictForestModel(self):
158+
def testRoundPredictForestModel(self):
159159
"""
160160
forest model takes config arguments via the constructor for testing
161161
purposes but will load from a file in /conf/analysis/ which is tested here
@@ -204,11 +204,11 @@ def test1RoundPredictForestModel(self):
204204
)
205205
for prediction, n in predictions_list:
206206
[logging.debug(p) for p in sorted(prediction, key=lambda r: r['p'], reverse=True)]
207-
self.assertNotEqual(len(prediction), 0, "should have a prediction")
207+
self.assertNotEqual(len(prediction), 0, "Prediction list should not be empty - model failed to generate any predictions")
208208
self.assertIn('labels',prediction[0].keys())
209209
self.assertIn('p',prediction[0].keys())
210-
self.assertIsInstance(prediction[0], dict, " should be an instance of the dictionary class")
211-
self.assertIsInstance(prediction[0]['labels'], dict, " should be an instance of the dictionary class")
210+
self.assertIsInstance(prediction[0], dict, "should be an instance of the dictionary class")
211+
self.assertIsInstance(prediction[0]['labels'], dict, "should be an instance of the dictionary class")
212212
self.assertIn('mode_confirm',prediction[0]['labels'].keys())
213213
self.assertIn('replaced_mode',prediction[0]['labels'].keys())
214214
self.assertIn('purpose_confirm',prediction[0]['labels'].keys())

0 commit comments

Comments
 (0)