Skip to content

Commit 0a760a1

Browse files
authored
Merge pull request #1977 from tilezen/travisg/20211006-update-wof-grab
Update the way we grab WOF assets
2 parents 8c8cdae + 56c2cf0 commit 0a760a1

File tree

2 files changed

+69
-40
lines changed

2 files changed

+69
-40
lines changed

data/assets.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
bucket: nextzen-tile-assets
2-
datestamp: 20210825
2+
datestamp: 20211006
33

44
shapefiles:
55

data/wof_snapshot.py

Lines changed: 68 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
11
from os.path import basename
22
from os.path import splitext
33
from os.path import join as path_join
4+
45
from tilequeue.wof import Neighbourhood
56
from tilequeue.wof import NeighbourhoodFailure
67
from tilequeue.wof import NeighbourhoodMeta
78
from tilequeue.wof import create_neighbourhood_from_json
89
from tilequeue.wof import write_neighbourhood_data_to_file
910
import json
10-
import tarfile
1111
import requests
1212
from tqdm import tqdm
1313

14+
1415
"""
1516
expects input to look like "123456.geojson"
1617
"""
@@ -27,6 +28,14 @@ def _parse_neighbourhood(file_name, data, placetype, file_hash):
2728
n = create_neighbourhood_from_json(json_data, meta)
2829
return n
2930

31+
def _parse_neighbourhood_from_json(json_str):
    """
    Builds a neighbourhood from the text of one WOF GeoJSON document.

    The WOF id is taken from the document's top-level 'id' and the placetype
    from its 'wof:placetype' property. Returns whatever
    create_neighbourhood_from_json produces for the parsed document.
    """
    doc = json.loads(json_str)
    meta = NeighbourhoodMeta(
        doc['id'],
        doc['properties']['wof:placetype'],
        None,
        # NOTE(review): fixed placeholder value; presumably this slot is the
        # file hash -- confirm against NeighbourhoodMeta's definition.
        "123",
        None,
    )
    return create_neighbourhood_from_json(doc, meta)
38+
3039

3140
class WOFArchiveReader(object):
3241
"""
@@ -37,24 +46,7 @@ class WOFArchiveReader(object):
3746
def __init__(self):
3847
self.wof_items = []
3948

40-
def add_archive(self, archive, file_hash, count):
41-
"""
42-
Adds the GeoJSON files in the tar.gz archive to the list of wof_items.
43-
44-
Displays a progress bar, with count being the expected number of items
45-
in the tar.gz.
46-
"""
47-
48-
with tqdm(total=count) as pbar:
49-
with tarfile.open(archive) as tar:
50-
for info in tar:
51-
if info.isfile() and info.name.endswith('.geojson') and "-" not in basename(info.name):
52-
self._parse_file(
53-
info.name, tar.extractfile(info).read(), file_hash)
54-
pbar.update(1)
55-
56-
def _parse_file(self, file_name, data, file_hash):
57-
n_or_fail = _parse_neighbourhood(file_name, data, placetype, file_hash)
49+
def handle_neighborhood_or_fail(self, n_or_fail):
5850
if isinstance(n_or_fail, Neighbourhood):
5951
self.wof_items.append(n_or_fail)
6052
elif isinstance(n_or_fail, NeighbourhoodFailure):
@@ -67,14 +59,44 @@ def _parse_file(self, file_name, data, file_hash):
6759
else:
6860
raise ValueError("Unexpected %r" % (n_or_fail,))
6961

62+
def add_sqlite_file(self, sqlite_filename, file_hash):
    """
    Adds the neighbourhood-like records found in a WOF SQLite database to
    the list of wof_items.

    Selects the GeoJSON body of every non-alt geojson row whose id belongs
    to an spr row with placetype neighbourhood, borough, macrohood or
    microhood that is current, not deprecated, not superseded and whose id
    is not 1. Each body is parsed and handed to
    handle_neighborhood_or_fail.

    Displays a progress bar that counts the rows processed.

    file_hash is accepted for interface compatibility but is not used here.

    Raises sqlite3.Error if the database cannot be opened or queried; the
    error is no longer printed and swallowed, because nothing useful can be
    done without a connection.
    """
    import sqlite3
    from contextlib import closing

    query = """
        select geojson.body from geojson where geojson.id in (
            select spr.id from spr
            where spr.placetype IN ('neighbourhood', 'borough','macrohood', 'microhood')
            AND spr.id != 1
            AND spr.is_deprecated = 0
            AND spr.is_superseded = 0
            AND spr.is_current != 0
        ) AND geojson.is_alt = 0
    """

    with tqdm(desc="Grabbing rows from sqlite file %s" % sqlite_filename, unit="Rows", unit_scale=True) as pbar:
        # closing() guarantees the cursor and the connection are released
        # even when parsing a row raises.
        with closing(sqlite3.connect(sqlite_filename)) as conn:
            with closing(conn.cursor()) as cursor:
                cursor.execute(query)

                for row in cursor:
                    n_or_fail = _parse_neighbourhood_from_json(row[0])
                    self.handle_neighborhood_or_fail(n_or_fail)
                    # Only real rows advance the bar, so the final count
                    # matches the number of records read.
                    pbar.update(1)
91+
7092

7193
class tmpdownload(object):
7294
"""
7395
Downloads a file to a temporary location and yields its absolute path. Once
7496
the scope exits, deletes the temporary file.
7597
"""
7698

77-
def __init__(self, url, expected_size):
99+
def __init__(self, url):
78100
import tempfile
79101
self.tempdir = tempfile.mkdtemp()
80102

@@ -85,7 +107,7 @@ def __init__(self, url, expected_size):
85107
with requests.get(url, stream=True) as response:
86108
response.raise_for_status()
87109

88-
with tqdm(total=expected_size) as pbar:
110+
with tqdm(desc="Downloading %s" % url, unit="bytes", unit_scale=True) as pbar:
89111
with open(abs_fname, 'wb') as fh:
90112
for chunk in response.iter_content(chunk_size=16384):
91113
if chunk:
@@ -102,30 +124,37 @@ def __exit__(self, type, value, traceback):
102124
shutil.rmtree(self.tempdir)
103125

104126

105-
WOF_INVENTORY = 'https://data.geocode.earth/wof/dist/legacy/inventory.json'
106-
WOF_BUNDLE_PREFIX = 'https://data.geocode.earth/wof/dist/legacy/'
127+
class TmpBz2Decompress(object):
    """
    Decompresses a bz2 file and yields the path of the decompressed copy.
    Once the scope exits, deletes the decompressed file.

    The decompressed file is written next to the input: the ".bz2" suffix is
    stripped from the input name, or ".decompressed" is appended when the
    input does not carry that suffix. Decompression happens in the
    constructor and shows a progress bar counting bytes written.
    """

    # Size of each read from the compressed stream. The payload is binary,
    # so fixed-size reads are used rather than line iteration.
    CHUNK_SIZE = 1024 * 1024

    def __init__(self, filename):
        import bz2
        import os

        suffix = ".bz2"
        if filename.endswith(suffix):
            output_filename = filename[:-len(suffix)]
        else:
            # Never chop arbitrary characters off a name that lacks the
            # expected suffix.
            output_filename = filename + ".decompressed"

        finished = False
        try:
            # Label the bar with the file name, not its directory.
            with tqdm(desc="Decompressing %s" % os.path.basename(filename), unit="bytes", unit_scale=True) as pbar:
                # 'wb': the decompressed data is binary and must be written
                # untranslated.
                with open(output_filename, 'wb') as outfile:
                    with bz2.BZ2File(filename, 'r') as bzfile:
                        for chunk in iter(lambda: bzfile.read(self.CHUNK_SIZE), b''):
                            outfile.write(chunk)
                            pbar.update(len(chunk))
            finished = True
        finally:
            # __exit__ never runs when the constructor fails, so remove a
            # partially written output here.
            if not finished and os.path.exists(output_filename):
                os.remove(output_filename)

        self.abs_fname = output_filename

    def __enter__(self):
        return self.abs_fname

    def __exit__(self, type, value, traceback):
        import os
        os.remove(self.abs_fname)
115148

116-
matching = [item for item in inventory
117-
if item['name_compressed'] == fname]
118-
assert len(matching) == 1
119-
item = matching[0]
120149

121-
version = item['last_updated']
122-
download_size = item['size_compressed']
150+
# URL of the bz2-compressed WOF admin SQLite database that the script entry
# point downloads, decompresses and reads.
WOF_SQLITE = "https://data.geocode.earth/wof/dist/sqlite/whosonfirst-data-admin-latest.db.bz2"
151+
152+
if __name__ == '__main__':
153+
reader = WOFArchiveReader()
123154

124-
print "Downloading %r" % (placetype)
125-
with tmpdownload(WOF_BUNDLE_PREFIX + fname, download_size) as fname:
126-
print "Parsing WOF data"
127-
# 20210820: geocode.earth inventory files don't offer count, so count hardcoded to 1
128-
reader.add_archive(fname, version, 1)
155+
with tmpdownload(WOF_SQLITE) as fname:
156+
with TmpBz2Decompress(fname) as decompressed:
157+
reader.add_sqlite_file(decompressed, "latest")
129158

130159
print "Writing output SQL"
131160
with open('wof_snapshot.sql', 'w') as fh:

0 commit comments

Comments
 (0)