
Commit b38366b
Store pipeline state + switch to argparse
The raw data and the analysis results do not constitute the entire state of a pipeline. In particular, if we store only the raw + analysis results and then run the pipeline again, we will end up with two copies of the analysis results. Instead, when we transfer data, it should include the raw data, the pipeline state, and the analysis results. Change this code to store the pipeline state as well. And since I am in there changing things anyway, switch to argparse to handle the arguments too.

```
$ ./e-mission-py.bash bin/debug/extract_timeline_for_day_range_and_user.py -e test_output_gen_curr_ts -- 2010-01-01 2020-01-01 /tmp/test_dump
storage not configured, falling back to sample, default configuration
Connecting to database URL localhost
INFO:root:==================================================
INFO:root:Extracting timeline for user d4dfcc42-b6fc-4b6b-a246-d1abec1d039f day 2010-01-01 -> 2020-01-01 and saving to file /tmp/test_dump
DEBUG:root:start_day_ts = 1262304000 (2010-01-01T00:00:00+00:00), end_day_ts = 1577836800 (2020-01-01T00:00:00+00:00)
DEBUG:root:curr_query = {'user_id': UUID('d4dfcc42-b6fc-4b6b-a246-d1abec1d039f'), 'data.ts': {'$lte': 1577836800, '$gte': 1262304000}}, sort_key = data.ts
DEBUG:root:orig_ts_db_keys = None, analysis_ts_db_keys = None
DEBUG:root:finished querying values for None
DEBUG:root:finished querying values for None
DEBUG:root:curr_query = {'user_id': UUID('d4dfcc42-b6fc-4b6b-a246-d1abec1d039f'), 'data.start_ts': {'$lte': 1577836800, '$gte': 1262304000}}, sort_key = data.start_ts
DEBUG:root:orig_ts_db_keys = None, analysis_ts_db_keys = None
DEBUG:root:finished querying values for None
DEBUG:root:finished querying values for None
DEBUG:root:curr_query = {'user_id': UUID('d4dfcc42-b6fc-4b6b-a246-d1abec1d039f'), 'data.enter_ts': {'$lte': 1577836800, '$gte': 1262304000}}, sort_key = data.enter_ts
DEBUG:root:orig_ts_db_keys = None, analysis_ts_db_keys = None
DEBUG:root:finished querying values for None
DEBUG:root:finished querying values for None
INFO:root:Found 1449 loc entries, 27 trip-like entries, 19 place-like entries = 1495 total entries
INFO:root:timeline has unique keys = {'stats/server_api_error', 'statemachine/transition', 'analysis/cleaned_stop', 'background/filtered_location', 'segmentation/raw_trip', 'background/location', 'segmentation/raw_stop', 'segmentation/raw_section', 'stats/client_time', 'background/motion_activity', 'analysis/recreated_location', 'segmentation/raw_place', 'analysis/cleaned_trip', 'background/battery', 'analysis/cleaned_section', 'stats/server_api_time', 'analysis/cleaned_place', 'stats/pipeline_time', 'stats/client_nav_event'}
INFO:root:Found 6 pipeline states [6, 1, 2, 3, 11, 9]
$ ls -1 /tmp/test_dump_*
/tmp/test_dump_d4dfcc42-b6fc-4b6b-a246-d1abec1d039f.gz
/tmp/test_dump_pipelinestate_d4dfcc42-b6fc-4b6b-a246-d1abec1d039f.gz
```
1 parent 869a43d commit b38366b
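
As the `ls` output above shows, the script now writes two gzipped JSON files per user: `<file_prefix>_<uuid>.gz` with the timeline entries (raw data plus analysis results) and `<file_prefix>_pipelinestate_<uuid>.gz` with the pipeline state. A minimal sketch of reading them back on the receiving side is below; `load_dump` is a hypothetical helper, not part of this commit, and only the file-naming scheme is taken from the diff. Since the dumps are serialized with `bson.json_util.default`, a faithful restore would likely also pass `bson.json_util.object_hook` to `json.load` to recover BSON types such as UUIDs; the sketch stays stdlib-only and leaves the extended-JSON dicts as-is.

```python
# Hypothetical loader sketch -- not part of the commit. Only the file
# naming (<prefix>_<uuid>.gz, <prefix>_pipelinestate_<uuid>.gz) is taken
# from the diff below.
import gzip
import json

def load_dump(file_prefix, user_id_str):
    # Timeline entries: raw data + analysis results for one user
    with gzip.open("%s_%s.gz" % (file_prefix, user_id_str), "rt") as tfd:
        entries = json.load(tfd)
    # Pipeline state: one record per stage, so a re-run on the target
    # system resumes where the source left off instead of re-generating
    # (and thereby duplicating) the analysis results
    with gzip.open("%s_pipelinestate_%s.gz" % (file_prefix, user_id_str), "rt") as pfd:
        pipeline_states = json.load(pfd)
    return entries, pipeline_states

entries, states = load_dump("/tmp/test_dump",
                            "d4dfcc42-b6fc-4b6b-a246-d1abec1d039f")
print("%d entries, %d pipeline states" % (len(entries), len(states)))
```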

1 file changed: 47 additions, 17 deletions

bin/debug/extract_timeline_for_day_range_and_user.py

Lines changed: 47 additions & 17 deletions
```diff
@@ -19,6 +19,7 @@
 import arrow
 import argparse
 
+import emission.core.wrapper.user as ecwu
 import emission.storage.timeseries.abstract_timeseries as esta
 import emission.storage.timeseries.timequery as estt
 import emission.storage.decorations.user_queries as esdu
```
```diff
@@ -53,9 +54,25 @@ def export_timeline(user_id, start_day_str, end_day_str, file_name):
     if len(combined_list) == 0 or unique_key_list == set(['stats/pipeline_time']):
         logging.info("No entries found in range for user %s, skipping save" % user_id)
     else:
+        # Also dump the pipeline state, since that's where we have analysis results up to
+        # This allows us to copy data to a different *live system*, not just
+        # duplicate for analysis
         combined_filename = "%s_%s.gz" % (file_name, user_id)
-        json.dump(combined_list,
-            gzip.open(combined_filename, "wb"), default=bju.default, allow_nan=False, indent=4)
+        with gzip.open(combined_filename, "wt") as gcfd:
+            json.dump(combined_list,
+                gcfd, default=bju.default, allow_nan=False, indent=4)
+
+        import emission.core.get_database as edb
+
+        pipeline_state_list = list(edb.get_pipeline_state_db().find({"user_id": user_id}))
+        logging.info("Found %d pipeline states %s" %
+            (len(pipeline_state_list),
+             list([ps["pipeline_stage"] for ps in pipeline_state_list])))
+
+        pipeline_filename = "%s_pipelinestate_%s.gz" % (file_name, user_id)
+        with gzip.open(pipeline_filename, "wt") as gpfd:
+            json.dump(pipeline_state_list,
+                gpfd, default=bju.default, allow_nan=False, indent=4)
 
 def validate_truncation(loc_entry_list, trip_entry_list, place_entry_list):
     MAX_LIMIT = 25 * 10000
```
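
Two things changed in the dump itself besides adding the pipeline state. First, `gzip.open(..., "wb")` became `gzip.open(..., "wt")`: under Python 3, `json.dump` emits `str`, so writing through a binary-mode handle raises a `TypeError`. Second, the file object is now managed by a `with` block, so the stream is closed, and the gzip trailer written, deterministically rather than whenever the handle happens to be garbage-collected. A self-contained illustration, with a made-up path and sample data:

```python
import gzip
import json

data = [{"pipeline_stage": 6}, {"pipeline_stage": 1}]

# Binary mode fails on Python 3, because json.dump() produces str:
#   json.dump(data, gzip.open("/tmp/demo.gz", "wb"))   # TypeError
# Text mode works, and the with-block guarantees the stream is flushed
# and the gzip trailer written when the block exits:
with gzip.open("/tmp/demo.gz", "wt") as fd:
    json.dump(data, fd, indent=4)

with gzip.open("/tmp/demo.gz", "rt") as fd:
    print(json.load(fd))   # [{'pipeline_stage': 6}, {'pipeline_stage': 1}]
```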
```diff
@@ -70,20 +87,33 @@ def export_timeline_for_users(user_id_list, args):
     for curr_uuid in user_id_list:
         if curr_uuid != '':
             logging.info("=" * 50)
-            export_timeline(user_id=curr_uuid, start_day_str=sys.argv[2], end_day_str=sys.argv[3], file_name=sys.argv[4])
-
+            export_timeline(user_id=curr_uuid, start_day_str=args.start_day,
+                            end_day_str=args.end_day, file_name=args.file_prefix)
 
 if __name__ == '__main__':
-    if len(sys.argv) != 5:
-        print("Usage: %s [<user>|'all'|'file_XXX'] <start_day> <end_day> <file_prefix>" % (sys.argv[0]))
-    else:
-        user_id_str = sys.argv[1]
-        if user_id_str == "all":
-            all_uuids = esdu.get_all_uuids()
-            export_timeline_for_users(all_uuids, sys.argv)
-        elif user_id_str.startswith("file_"):
-            uuid_strs = json.load(open(user_id_str))
-            uuids = [uuid.UUID(ustr) for ustr in uuid_strs]
-            export_timeline_for_users(uuids, sys.argv)
-        else:
-            export_timeline(user_id=uuid.UUID(sys.argv[1]), start_day_str=sys.argv[2], end_day_str=sys.argv[3], file_name=sys.argv[4])
+    logging.basicConfig(level=logging.DEBUG)
+    parser = argparse.ArgumentParser(prog="extract_timeline_for_day_range_and_user")
+
+    group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument("-e", "--user_email", nargs="+")
+    group.add_argument("-u", "--user_uuid", nargs="+")
+    group.add_argument("-a", "--all", action="store_true")
+    group.add_argument("-f", "--file")
+
+    parser.add_argument("start_day", help="start day in utc - e.g. 'YYYY-MM-DD'")
+    parser.add_argument("end_day", help="end day in utc - e.g. 'YYYY-MM-DD'")
+    parser.add_argument("file_prefix", help="prefix for the filenames generated - e.g /tmp/dump_ will generate files /tmp/dump_<uuid1>.gz, /tmp/dump_<uuid2>.gz...")
+
+    args = parser.parse_args()
+
+    if args.user_uuid:
+        uuid_list = [uuid.UUID(uuid_str) for uuid_str in args.user_uuid]
+    elif args.user_email:
+        uuid_list = [ecwu.User.fromEmail(uuid_str).uuid for uuid_str in args.user_email]
+    elif args.all:
+        uuid_list = esdu.get_all_uuids()
+    elif args.file:
+        with open(args.file) as fd:
+            uuid_strs = json.load(fd)
+            uuid_list = [uuid.UUID(ustr) for ustr in uuid_strs]
+    export_timeline_for_users(uuid_list, args)
```
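
One argparse wrinkle is visible in the test invocation in the commit message: because `-e`/`--user_email` (and `-u`/`--user_uuid`) are declared with `nargs="+"`, they would otherwise greedily swallow the positional `start_day`, `end_day`, and `file_prefix` arguments, so the command line uses `--` to terminate the option list. A trimmed-down reproduction of that behavior (only two of the four selector flags kept, for brevity):

```python
import argparse

parser = argparse.ArgumentParser(prog="demo")
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("-e", "--user_email", nargs="+")
group.add_argument("-a", "--all", action="store_true")
parser.add_argument("start_day")
parser.add_argument("end_day")
parser.add_argument("file_prefix")

# Without "--", the nargs="+" option consumes all four trailing tokens and
# argparse then reports the positional arguments as missing. With "--",
# option parsing stops and the last three tokens bind to the positionals.
args = parser.parse_args(["-e", "test_output_gen_curr_ts", "--",
                          "2010-01-01", "2020-01-01", "/tmp/test_dump"])
print(args.user_email)   # ['test_output_gen_curr_ts']
print(args.start_day, args.end_day, args.file_prefix)
```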
