1
1
from os .path import basename
2
2
from os .path import splitext
3
3
from os .path import join as path_join
4
+
4
5
from tilequeue .wof import Neighbourhood
5
6
from tilequeue .wof import NeighbourhoodFailure
6
7
from tilequeue .wof import NeighbourhoodMeta
7
8
from tilequeue .wof import create_neighbourhood_from_json
8
9
from tilequeue .wof import write_neighbourhood_data_to_file
9
10
import json
10
- import tarfile
11
11
import requests
12
12
from tqdm import tqdm
13
13
14
+
14
15
"""
15
16
expects input to look like "123456.geojson"
16
17
"""
@@ -27,6 +28,14 @@ def _parse_neighbourhood(file_name, data, placetype, file_hash):
27
28
n = create_neighbourhood_from_json (json_data , meta )
28
29
return n
29
30
31
def _parse_neighbourhood_from_json(json_str):
    """
    Builds a neighbourhood from one GeoJSON document supplied as a string.

    The document's top-level 'id' and its 'wof:placetype' property are used
    as the first two NeighbourhoodMeta fields; the remaining fields are
    None, "123" and None.

    Returns whatever create_neighbourhood_from_json returns for the parsed
    document and that meta.
    """
    feature = json.loads(json_str)
    # NOTE(review): "123" looks like a placeholder for the file hash --
    # confirm against NeighbourhoodMeta's field order.
    meta = NeighbourhoodMeta(
        feature['id'],
        feature['properties']['wof:placetype'],
        None,
        "123",
        None,
    )
    return create_neighbourhood_from_json(feature, meta)
38
+
30
39
31
40
class WOFArchiveReader (object ):
32
41
"""
@@ -37,24 +46,7 @@ class WOFArchiveReader(object):
37
46
def __init__ (self ):
38
47
self .wof_items = []
39
48
40
- def add_archive (self , archive , file_hash , count ):
41
- """
42
- Adds the GeoJSON files in the tar.gz archive to the list of wof_items.
43
-
44
- Displays a progress bar, with count being the expected number of items
45
- in the tar.gz.
46
- """
47
-
48
- with tqdm (total = count ) as pbar :
49
- with tarfile .open (archive ) as tar :
50
- for info in tar :
51
- if info .isfile () and info .name .endswith ('.geojson' ) and "-" not in basename (info .name ):
52
- self ._parse_file (
53
- info .name , tar .extractfile (info ).read (), file_hash )
54
- pbar .update (1 )
55
-
56
- def _parse_file (self , file_name , data , file_hash ):
57
- n_or_fail = _parse_neighbourhood (file_name , data , placetype , file_hash )
49
+ def handle_neighborhood_or_fail (self , n_or_fail ):
58
50
if isinstance (n_or_fail , Neighbourhood ):
59
51
self .wof_items .append (n_or_fail )
60
52
elif isinstance (n_or_fail , NeighbourhoodFailure ):
@@ -67,14 +59,44 @@ def _parse_file(self, file_name, data, file_hash):
67
59
else :
68
60
raise ValueError ("Unexpected %r" % (n_or_fail ,))
69
61
62
def add_sqlite_file(self, sqlite_filename, file_hash):
    """
    Adds the places stored in a SQLite database to the list of wof_items.

    Selects the GeoJSON body of every non-alt row whose spr entry is a
    neighbourhood, borough, macrohood or microhood that is current, not
    deprecated and not superseded, parses each body and passes the result
    to handle_neighborhood_or_fail.

    Displays a progress bar that advances once per row processed.

    file_hash is accepted for interface compatibility but is not used
    here.

    Raises sqlite3.Error if the database cannot be opened or queried.
    """
    import sqlite3
    from contextlib import closing

    query = """
        select geojson.body from geojson where geojson.id in (
            select spr.id from spr
            where spr.placetype IN ('neighbourhood', 'borough','macrohood', 'microhood')
            AND spr.id != 1
            AND spr.is_deprecated = 0
            AND spr.is_superseded = 0
            AND spr.is_current != 0
        ) AND geojson.is_alt = 0
    """

    with tqdm(desc="Grabbing rows from sqlite file %s" % sqlite_filename, unit="Rows", unit_scale=True) as pbar:
        # Connection errors are allowed to propagate: there is nothing
        # useful to do without a connection, and swallowing them only
        # produced an unrelated unbound-variable error a line later.
        # closing() guarantees the connection is released even if a row
        # fails to parse (a connection's own context manager only
        # commits/rolls back, it does not close).
        with closing(sqlite3.connect(sqlite_filename)) as conn:
            cursor = conn.cursor()
            cursor.execute(query)

            for row in cursor:
                n_or_fail = _parse_neighbourhood_from_json(row[0])
                self.handle_neighborhood_or_fail(n_or_fail)
                # Only rows are counted, so the "Rows" total is accurate.
                pbar.update(1)
91
+
70
92
71
93
class tmpdownload (object ):
72
94
"""
73
95
Downloads a file to a temporary location and yields its absolute path. Once
74
96
the scope exits, deletes the temporary file.
75
97
"""
76
98
77
- def __init__ (self , url , expected_size ):
99
+ def __init__ (self , url ):
78
100
import tempfile
79
101
self .tempdir = tempfile .mkdtemp ()
80
102
@@ -85,7 +107,7 @@ def __init__(self, url, expected_size):
85
107
with requests .get (url , stream = True ) as response :
86
108
response .raise_for_status ()
87
109
88
- with tqdm (total = expected_size ) as pbar :
110
+ with tqdm (desc = "Downloading %s" % url , unit = "bytes" , unit_scale = True ) as pbar :
89
111
with open (abs_fname , 'wb' ) as fh :
90
112
for chunk in response .iter_content (chunk_size = 16384 ):
91
113
if chunk :
@@ -102,30 +124,37 @@ def __exit__(self, type, value, traceback):
102
124
shutil .rmtree (self .tempdir )
103
125
104
126
105
- WOF_INVENTORY = 'https://data.geocode.earth/wof/dist/legacy/inventory.json'
106
- WOF_BUNDLE_PREFIX = 'https://data.geocode.earth/wof/dist/legacy/'
127
+ class TmpBz2Decompress (object ):
128
def __init__(self, filename):
    """
    Decompresses the bz2 file at filename into a new file next to it and
    records the new file's path in self.abs_fname.

    The output path is filename without its ".bz2" suffix. If filename
    does not end in ".bz2", ".decompressed" is appended instead so that
    the name is never truncated arbitrarily and the input is never
    overwritten.

    Displays a progress bar counting the decompressed bytes written.
    """
    import bz2
    import os

    suffix = ".bz2"
    if filename.endswith(suffix):
        output_filename = filename[:-len(suffix)]
    else:
        output_filename = filename + ".decompressed"

    # Label the progress bar with the file's name, not its directory.
    input_filename = os.path.basename(filename)

    # Same chunk size the download step in this file uses.
    chunk_size = 16384

    with tqdm(desc="Decompressing %s" % input_filename, unit="bytes", unit_scale=True) as pbar:
        # The payload is binary, so the output must be opened in binary
        # mode; text mode corrupts it on some platforms and rejects
        # bytes outright on Python 3.
        with open(output_filename, 'wb') as outfile:
            with bz2.BZ2File(filename, 'r') as bzfile:
                # Read fixed-size chunks rather than iterating "lines",
                # which are meaningless (and unbounded) in binary data.
                for chunk in iter(lambda: bzfile.read(chunk_size), b''):
                    outfile.write(chunk)
                    pbar.update(len(chunk))

    self.abs_fname = output_filename
108
141
109
- if __name__ == '__main__' :
110
- inventory = requests .get (WOF_INVENTORY ).json ()
111
- reader = WOFArchiveReader ()
142
def __enter__(self):
    """
    Returns the path of the decompressed file, as stored in
    self.abs_fname, for use inside the ``with`` block.
    """
    return self.abs_fname
112
144
113
- for placetype in ('neighbourhood' , 'macrohood' , 'microhood' , 'borough' ):
114
- fname = 'whosonfirst-data-%s-latest.tar.bz2' % (placetype ,)
145
def __exit__(self, type, value, traceback):
    """
    Deletes the decompressed file when the ``with`` block ends.

    The exception arguments are ignored and nothing is returned, so an
    exception raised inside the block is not suppressed.
    """
    import os
    os.remove(self.abs_fname)
115
148
116
- matching = [item for item in inventory
117
- if item ['name_compressed' ] == fname ]
118
- assert len (matching ) == 1
119
- item = matching [0 ]
120
149
121
- version = item ['last_updated' ]
122
- download_size = item ['size_compressed' ]
150
+ WOF_SQLITE = "https://data.geocode.earth/wof/dist/sqlite/whosonfirst-data-admin-latest.db.bz2"
151
+
152
+ if __name__ == '__main__' :
153
+ reader = WOFArchiveReader ()
123
154
124
- print "Downloading %r" % (placetype )
125
- with tmpdownload (WOF_BUNDLE_PREFIX + fname , download_size ) as fname :
126
- print "Parsing WOF data"
127
- # 20210820: geocode.earth inventory files don't offer count, so count hardcoded to 1
128
- reader .add_archive (fname , version , 1 )
155
+ with tmpdownload (WOF_SQLITE ) as fname :
156
+ with TmpBz2Decompress (fname ) as decompressed :
157
+ reader .add_sqlite_file (decompressed , "latest" )
129
158
130
159
print "Writing output SQL"
131
160
with open ('wof_snapshot.sql' , 'w' ) as fh :
0 commit comments