
Create a new branch for the GIS based mode detection #712

Merged
Changes from all commits (53 commits)
8d8a7fb
Change the expansion paramter to support arguments with spaces
shankari Mar 19, 2018
67c1464
Fix transition based trip end when switching platforms
shankari Mar 27, 2018
f622a7d
Real fix for transition based trip end when switching platforms
shankari Mar 28, 2018
ef8e025
Fix broken trips due to spurious points generated by iOS
shankari Mar 28, 2018
5690175
Move the resampling code out into its own module so that it can be re…
shankari Mar 29, 2018
4a0a415
Determine segmentation when large chunks of location points are missing
shankari Mar 29, 2018
c79722a
Mark location entries as invalid instead of deleting them
shankari Mar 29, 2018
d0293f4
Mark the newly inserted, inferred points as "inserted"
shankari Mar 29, 2018
a371e62
Fix slosh by starting the section at the beginning of the transition
shankari Mar 30, 2018
7678818
Handle most normal flip-flopping
shankari Mar 31, 2018
c557fd7
Remove the fake walk transition on the trip back
shankari Mar 31, 2018
abc21b0
Move the location reconstruction to a new file
shankari Mar 31, 2018
1aa77c4
Almost working
shankari Apr 1, 2018
ce4df07
Create the segments based on the transition
shankari Apr 1, 2018
defc6fd
Squish long "stops" by extending to the section that has good data
shankari Apr 2, 2018
c8edf28
Fix some more flip flopping by adding more sanity checks
shankari Apr 2, 2018
bd5e97d
Android fixes
shankari Apr 2, 2018
8466cf4
Add checks for fast biking and short drives
shankari Apr 3, 2018
337dfcb
Move out the reset code to a common module
shankari Apr 5, 2018
15feaa6
Change the assumptions on domain to take the mode type
shankari Apr 5, 2018
733fc9a
Integration point to read transit stop information from OSM
shankari Apr 5, 2018
424d862
Add a mode inference algorithm based on GIS
shankari Apr 5, 2018
2d52104
Switch the pipeline to use the new GIS based mode inference
shankari Apr 5, 2018
a4d5382
Revert change to reconstruct locations
shankari Apr 6, 2018
dd2fb11
Fix ZeroDivisionError found in automated testing
shankari Apr 6, 2018
e61ae24
Fix the merging for mixed walk/bike trips
shankari Apr 7, 2018
044eafb
Minor fix for zero division in one more place
shankari Apr 7, 2018
0da819a
Fix handling of small iOS walking trips
shankari Apr 7, 2018
f176109
Change the threshold from 80% to 90%
shankari Apr 7, 2018
114ea2b
Ignore RUNNING during segmentation to avoid short segments
shankari Apr 7, 2018
2471fec
Couple of minor fixes to the handling of small iOS walking trips
shankari Apr 8, 2018
f332e96
Support more types of rail routes for mapping into trains
shankari Apr 8, 2018
f6fb7df
Unify the invalid modes on android and iOS
shankari Apr 8, 2018
c227c24
Handle idx = 2 for bike trips
shankari Apr 8, 2018
0942774
Use sensed modes instead of speeds to decide which way to squish
shankari Apr 8, 2018
4c3604e
Relax other constraints to all support idx_diff = 2
shankari Apr 8, 2018
4e91287
Squish stops at the section segmentation level
shankari Apr 9, 2018
76e3529
Handle legacy bus stops
shankari Nov 1, 2018
1c359b7
Correctly check for UNKNOWN and AIR_OR_HSR
shankari Nov 2, 2018
ce2b757
Use the UNKNOWN section speeds if they exist
shankari Nov 2, 2018
1e9d46e
Fix the `-e` option for reset_pipeline
shankari Nov 2, 2018
ae781db
Fix code to remove inferred modes and reset the pipeline
shankari Nov 2, 2018
40e075e
Create "new" sections instead of merging if there is a long flip-flop…
shankari Nov 5, 2018
f1475af
Merge branch 'master' of https://github.com/e-mission/e-mission-serve…
shankari Nov 20, 2018
be19654
Merge branch 'master' of https://github.com/e-mission/e-mission-serve…
shankari Feb 27, 2019
c755280
Make ONE_FIFTY_KMPH actually represent 150 kmph
shankari Mar 1, 2019
784f197
Tighten the definition of simple bus stops to avoid false positives
shankari Mar 1, 2019
952a873
Merge pull request #1 from e-mission/master
shankari Mar 6, 2019
a374683
Finish pending merge so that we can fix `delete_user`
shankari Mar 24, 2019
db1978c
Merge branch 'ground_truth_matching' of https://github.com/shankari/e…
shankari Mar 24, 2019
dc60ccd
Merge some fixes from the tripware project
shankari May 13, 2019
351da86
Merge branch 'master' of https://github.com/e-mission/e-mission-serve…
shankari Jul 4, 2019
7afe46c
Merge pull request #578 from shankari/ground_truth_matching
shankari Jul 4, 2019
38 changes: 34 additions & 4 deletions bin/analysis/remove_inferred_modes.py
@@ -17,9 +17,39 @@
import uuid
import arrow

import emission.analysis.classification.inference.mode.pipeline as eacimp
import emission.analysis.classification.inference.mode.reset as eacimr
import emission.core.get_database as edb
import emission.storage.decorations.user_queries as esdu
import emission.core.wrapper.user as ecwu

def _get_user_list(args):
if args.all:
return _find_all_users()
elif args.platform:
return _find_platform_users(args.platform)
elif args.email_list:
return _email_2_user_list(args.email_list)
else:
assert args.user_list is not None
return [uuid.UUID(u) for u in args.user_list]

def _find_platform_users(platform):
# Since all new clients register a profile with the server, we don't have
# to run a 'distinct' query over the entire contents of the timeseries.
# Instead, we can simply query from the profile users, which is
# significantly faster
# Use the commented out line instead for better performance.
# Soon, we can move to the more performant option, because there will be
# no users that don't have a profile
# return edb.get_timeseries_db().find({'metadata.platform': platform}).distinct(
# 'user_id')
return edb.get_profile_db().find({"curr_platform": platform}).distinct("user_id")

def _find_all_users():
return esdu.get_all_uuids()

def _email_2_user_list(email_list):
return [ecwu.User.fromEmail(e).uuid for e in email_list]

if __name__ == '__main__':
logging.basicConfig(level=logging.DEBUG)
@@ -47,14 +77,14 @@
# Handle the first row in the table
if args.date is None:
if args.all:
eacimp.del_all_objects(args.dry_run)
eacimr.del_all_objects(args.dry_run)
else:
user_list = _get_user_list(args)
logging.info("received list with %s users" % user_list)
logging.info("first few entries are %s" % user_list[0:5])
for user_id in user_list:
logging.info("resetting user %s to start" % user_id)
eacimp.del_objects_after(user_id, 0, args.dry_run)
eacimr.del_objects_after(user_id, 0, args.dry_run)
else:
# Handle the second row in the table
day_dt = arrow.get(args.date, "YYYY-MM-DD")
@@ -66,5 +96,5 @@
logging.info("first few entries are %s" % user_list[0:5])
for user_id in user_list:
logging.info("resetting user %s to ts %s" % (user_id, day_ts))
eacimp.del_objects_after(user_id, day_ts, args.dry_run)
eacimr.del_objects_after(user_id, day_ts, args.dry_run)
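
The hunk above adds only the user-selection helpers; the argparse setup that produces `args.all`, `args.platform`, `args.email_list`, `args.user_list`, `args.date`, and `args.dry_run` sits outside the diff. A minimal sketch of a parser that would satisfy `_get_user_list` (the flag names below are assumptions, not necessarily the script's real options):

import argparse

# Hypothetical parser exposing the attributes _get_user_list expects.
parser = argparse.ArgumentParser(prog="remove_inferred_modes.py")
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("-a", "--all", action="store_true",
                   help="reset the mode inference stage for all users")
group.add_argument("-p", "--platform", choices=["android", "ios"],
                   help="reset for every user on one platform")
group.add_argument("-u", "--user_list", nargs="+",
                   help="reset for a list of user UUIDs")
group.add_argument("-e", "--email_list", nargs="+",
                   help="reset for a list of user emails")
parser.add_argument("-d", "--date",
                    help="reset to this date (YYYY-MM-DD); omit to reset to the start")
parser.add_argument("-n", "--dry_run", action="store_true",
                    help="log what would be deleted without modifying anything")
args = parser.parse_args()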

3 changes: 2 additions & 1 deletion bin/reset_pipeline.py
@@ -21,6 +21,7 @@
import emission.pipeline.reset as epr
import emission.core.get_database as edb
import emission.storage.decorations.user_queries as esdu
import emission.core.wrapper.user as ecwu

def _get_user_list(args):
if args.all:
@@ -49,7 +50,7 @@ def _find_all_users():
return esdu.get_all_uuids()

def _email_2_user_list(email_list):
return [ecwu.User.fromEmail(e) for e in email_list]
return [ecwu.User.fromEmail(e).uuid for e in email_list]

if __name__ == '__main__':
logging.basicConfig(level=logging.DEBUG)
2 changes: 2 additions & 0 deletions conf/analysis/debug.conf.json.sample
@@ -3,5 +3,7 @@
"intake.cleaning.filter_accuracy.enable": false,
"classification.inference.mode.useAdvancedFeatureIndices": true,
"classification.inference.mode.useBusTrainFeatureIndices": true,
"section.startStopRadius": 150,
"section.endStopRadius": 150,
"analysis.result.section.key": "analysis/inferred_section"
}
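
The two new radius keys are presumably the search radii, in meters, used when matching a section's start and end points against nearby transit stops, and `analysis.result.section.key` points downstream consumers at the new inferred sections. A minimal sketch of reading them (the module that actually consumes this config is not part of this diff):

import json

# Sketch: load the sample analysis config and pull out the new keys.
with open("conf/analysis/debug.conf.json.sample") as cf:
    conf = json.load(cf)

start_radius = conf["section.startStopRadius"]    # 150, assumed to be meters
end_radius = conf["section.endStopRadius"]        # 150
result_key = conf["analysis.result.section.key"]  # "analysis/inferred_section"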
3 changes: 3 additions & 0 deletions conf/net/ext_service/overpass_server.json.sample
@@ -0,0 +1,3 @@
{
"url": "server running the overpass API to query OSM (e.g. https://wiki.openstreetmap.org/wiki/Overpass_API)"
}
10 changes: 10 additions & 0 deletions conf/net/ext_service/overpass_transit_stops_query_template.sample
@@ -0,0 +1,10 @@
[out:json][timeout:25];
(
node["highway"="bus_stop"]({bbox});
node["railway"="station"]({bbox});
node["public_transport"]({bbox});
way["railway"="station"]({bbox});
relation["route"]({bbox});
);
out body;
>;
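
Taken together, the two new samples suggest the integration flow: read the Overpass endpoint from overpass_server.json, substitute a bounding box into the query template, and POST the query. The sketch below is not the PR's actual integration code; the non-`.sample` filenames and the use of `requests` are assumptions, and the configured URL is assumed to accept Overpass QL directly via POST. Overpass bounding boxes are given as south,west,north,east in decimal degrees.

import json
import requests

with open("conf/net/ext_service/overpass_server.json") as sf:
    overpass_url = json.load(sf)["url"]

with open("conf/net/ext_service/overpass_transit_stops_query_template") as qf:
    query_template = qf.read()

# Example bounding box (south, west, north, east) around downtown Berkeley.
bbox = "37.865,-122.275,37.875,-122.260"
query = query_template.format(bbox=bbox)

# Assumes the configured url points at the Overpass interpreter endpoint.
response = requests.post(overpass_url, data=query)
transit_stops = response.json()["elements"]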
2 changes: 1 addition & 1 deletion e-mission-py.bash
@@ -5,4 +5,4 @@

# Make sure that the python here is the anaconda python if that is not the one in the path

PYTHONPATH=. python $*
PYTHONPATH=. python "$@"
58 changes: 3 additions & 55 deletions emission/analysis/classification/inference/mode/pipeline.py
@@ -49,58 +49,6 @@ def predict_mode(user_id):
logging.exception("Error while inferring modes, timestamp is unchanged")
epq.mark_mode_inference_failed(user_id)

# Delete the objects created by this pipeline step (across users)
def del_all_objects(is_dry_run):
del_query = {}
del_query.update({"metadata.key": {"$in": ["inference/prediction", "analysis/inferred_section"]}})
logging.info("About to delete %d entries"
% edb.get_analysis_timeseries_db().find(del_query).count())
logging.info("About to delete entries with keys %s"
% edb.get_analysis_timeseries_db().find(del_query).distinct("metadata.key"))

del_pipeline_query = {"pipeline_stage": ecwp.PipelineStages.MODE_INFERENCE.value}
logging.info("About to delete pipeline entries for stage %s" %
ecwp.PipelineStages.MODE_INFERENCE)

if is_dry_run:
logging.info("this is a dry-run, returning from del_objects_after without modifying anything")
else:
result = edb.get_analysis_timeseries_db().delete_many(del_query)
logging.info("this is not a dry-run, result of deleting analysis entries is %s" % result.raw_result)
result = edb.get_pipeline_state_db().delete_many(del_pipeline_query)
logging.info("this is not a dry-run, result of deleting pipeline state is %s" % result.raw_result)

# Delete the objects created by this pipeline step (for a particular user)
def del_objects_after(user_id, reset_ts, is_dry_run):
del_query = {}
# handle the user
del_query.update({"user_id": user_id})

del_query.update({"metadata.key": {"$in": ["inference/prediction", "analysis/inferred_section"]}})
# all objects inserted here have start_ts and end_ts and are trip-like
del_query.update({"data.start_ts": {"$gt": reset_ts}})
logging.debug("After all updates, del_query = %s" % del_query)

reset_pipeline_query = {"pipeline_stage": ecwp.PipelineStages.MODE_INFERENCE.value}
# Fuzz the TRIP_SEGMENTATION stage 5 mins because of
# https://github.com/e-mission/e-mission-server/issues/333#issuecomment-312730217
FUZZ_FACTOR = 5 * 60
reset_pipeline_update = {'$set': {'last_processed_ts': reset_ts + FUZZ_FACTOR}}
logging.info("About to reset stage %s to %s"
% (ecwp.PipelineStages.MODE_INFERENCE, reset_ts))


logging.info("About to delete %d entries"
% edb.get_analysis_timeseries_db().find(del_query).count())
logging.info("About to delete entries with keys %s"
% edb.get_analysis_timeseries_db().find(del_query).distinct("metadata.key"))

if is_dry_run:
logging.info("this is a dry-run, returning from del_objects_after without modifying anything")
else:
result = edb.get_analysis_timeseries_db().remove(del_query)
logging.info("this is not a dry-run, result of deleting analysis entries is %s" % result)

class ModeInferencePipeline:
def __init__(self):
self.featureLabels = ["distance", "duration", "first filter mode", "sectionId", "avg speed",
@@ -226,9 +174,6 @@ def updateFeatureMatrixRowWithSection(self, featureMatrix, i, section_entry):
if (hasattr(self, "air_cluster")):
featureMatrix[i, 21] = easf.mode_start_end_coverage(section, self.air_cluster,600)

if self.last_section_done is None or self.last_section_done.data.end_ts < section_entry.data.end_ts:
self.last_section_done = section_entry

# Replace NaN and inf by zeros so that it doesn't crash later
featureMatrix[i] = np.nan_to_num(featureMatrix[i])

@@ -344,6 +289,9 @@ def savePredictionsStep(self):
logging.debug("Updating sensed mode for section = %s to %s" %
(currSectionEntry.get_id(), ise.data.sensed_mode))
self.ts.insert(ise)
# Set last_section_done after saving because otherwise if there is an error
# during inference, we will not save results and never re-run
self.last_section_done = self.toPredictSections[-1]

if __name__ == "__main__":
import json
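
The net effect of the two pipeline.py hunks above is that `last_section_done` now advances only after every prediction has been inserted, instead of being bumped while the feature matrix is still being built. Sketched with hypothetical names, the checkpoint-after-save pattern looks like this:

# Sketch of the checkpoint-after-save pattern (names are illustrative only).
def run_inference_stage(sections, predict, save, advance_checkpoint):
    predictions = [predict(s) for s in sections]  # may raise; nothing saved yet
    for p in predictions:
        save(p)                                   # may also raise partway through
    # Advance the checkpoint only once everything is persisted, so any failure
    # above leaves the stage marked as unfinished and it will be re-run.
    advance_checkpoint(sections[-1])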
61 changes: 61 additions & 0 deletions emission/analysis/classification/inference/mode/reset.py
@@ -0,0 +1,61 @@
import logging
import emission.core.get_database as edb
import emission.core.wrapper.pipelinestate as ecwp

import emission.core.get_database as edb
import emission.core.wrapper.pipelinestate as ecwp

# Delete the objects created by this pipeline step (across users)
def del_all_objects(is_dry_run):
del_query = {}
del_query.update({"metadata.key": {"$in": ["inference/prediction", "analysis/inferred_section"]}})
logging.info("About to delete %d entries"
% edb.get_analysis_timeseries_db().find(del_query).count())
logging.info("About to delete entries with keys %s"
% edb.get_analysis_timeseries_db().find(del_query).distinct("metadata.key"))

del_pipeline_query = {"pipeline_stage": ecwp.PipelineStages.MODE_INFERENCE.value}
logging.info("About to delete pipeline entries for stage %s" %
ecwp.PipelineStages.MODE_INFERENCE)

if is_dry_run:
logging.info("this is a dry-run, returning from del_objects_after without modifying anything")
else:
result = edb.get_analysis_timeseries_db().delete_many(del_query)
logging.info("this is not a dry-run, result of deleting analysis entries is %s" % result.raw_result)
result = edb.get_pipeline_state_db().delete_many(del_pipeline_query)
logging.info("this is not a dry-run, result of deleting pipeline state is %s" % result.raw_result)

# Delete the objects created by this pipeline step (for a particular user)
def del_objects_after(user_id, reset_ts, is_dry_run):
del_query = {}
# handle the user
del_query.update({"user_id": user_id})

del_query.update({"metadata.key": {"$in": ["inference/prediction", "analysis/inferred_section"]}})
# all objects inserted here have start_ts and end_ts and are trip-like
del_query.update({"data.start_ts": {"$gt": reset_ts}})
logging.debug("After all updates, del_query = %s" % del_query)

reset_pipeline_query = {"user_id": user_id, "pipeline_stage": ecwp.PipelineStages.MODE_INFERENCE.value}
# Fuzz the TRIP_SEGMENTATION stage 5 mins because of
# https://github.com/e-mission/e-mission-server/issues/333#issuecomment-312730217
FUZZ_FACTOR = 5 * 60
reset_pipeline_update = {'$set': {'last_processed_ts': reset_ts + FUZZ_FACTOR}}
logging.info("About to reset stage %s to %s"
% (ecwp.PipelineStages.MODE_INFERENCE, reset_ts))


logging.info("About to delete %d entries"
% edb.get_analysis_timeseries_db().find(del_query).count())
logging.info("About to delete entries with keys %s"
% edb.get_analysis_timeseries_db().find(del_query).distinct("metadata.key"))

if is_dry_run:
logging.info("this is a dry-run, returning from del_objects_after without modifying anything")
else:
result = edb.get_analysis_timeseries_db().remove(del_query)
logging.info("this is not a dry-run, result of deleting analysis entries is %s" % result)
result = edb.get_pipeline_state_db().update_one(reset_pipeline_query, reset_pipeline_update)
logging.info("this is not a dry-run, result of updating pipeline state is %s" % result.raw_result)
