Skip to content

Commit bbcb86d

Browse files
author
Hamish Downer
committed
Highlight anyone with the same address as someone selected
in this round, so that it is reasonably obvious who might be a problem for manual selection in a later round.
1 parent 45f395a commit bbcb86d

File tree

2 files changed

+77
-19
lines changed

2 files changed

+77
-19
lines changed

src/sortition_algorithms/adapters.py

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,12 @@ def _stringify_records(
3939
return new_records
4040

4141

42-
def generate_dupes(people_remaining_rows: list[list[str]], settings: Settings) -> list[int]:
42+
def generate_dupes(
43+
people_remaining_rows: list[list[str]],
44+
people_selected_rows: list[list[str]],
45+
settings: Settings,
46+
already_selected: People | None = None,
47+
) -> list[int]:
4348
"""
4449
Generate a list of indexes of people who share an address with someone else in this set of rows.
4550
@@ -65,21 +70,37 @@ def generate_dupes(people_remaining_rows: list[list[str]], settings: Settings) -
6570
address_col_indexes: list[int] = [
6671
index for index, col in enumerate(table_col_names) if col in settings.check_same_address_columns
6772
]
68-
address_remaining_index: dict[tuple[str, ...], list[int]] = defaultdict(list)
6973

7074
# first, we assemble a dict with the key being the address, the value being the list of
7175
# indexes of people at that address
72-
for person_index, person in enumerate(people_remaining_rows):
73-
if person_index == 0:
74-
continue # skip the header row
76+
address_remaining_index: dict[tuple[str, ...], list[int]] = defaultdict(list)
77+
for person_index, person in enumerate(people_remaining_rows[1:], start=1): # skip the header row
7578
address_tuple = tuple(col for col_index, col in enumerate(person) if col_index in address_col_indexes)
7679
address_remaining_index[address_tuple].append(person_index)
7780

7881
# now extract all those people where the number of people at their address is more than one
79-
dupes: list[int] = []
82+
dupes: set[int] = set()
8083
for persons_at_address in address_remaining_index.values():
8184
if len(persons_at_address) > 1:
82-
dupes += persons_at_address
85+
dupes.update(persons_at_address)
86+
87+
# now we assemble the list of all selected addresses
88+
already_selected_addresses: set[tuple[str, ...]] = set()
89+
for person in people_selected_rows[1:]: # skip the header row
90+
already_selected_addresses.add(
91+
tuple(col for col_index, col in enumerate(person) if col_index in address_col_indexes)
92+
)
93+
"""
94+
if already_selected:
95+
for selected in already_selected:
96+
pass
97+
# already_selected_addresses.add(x)
98+
"""
99+
# and check if anyone is already present
100+
for person_index, person in enumerate(people_remaining_rows[1:], start=1): # skip the header row
101+
person_address = tuple(col for col_index, col in enumerate(person) if col_index in address_col_indexes)
102+
if person_address in already_selected_addresses:
103+
dupes.add(person_index)
83104

84105
return sorted(dupes)
85106

@@ -204,7 +225,7 @@ def output_selected_remaining(
204225
return [], report
205226
self.data_source.write_remaining(people_remaining_rows, report)
206227
# TODO: also highlight dupes of address in selected tab/set
207-
dupes = generate_dupes(people_remaining_rows, settings)
228+
dupes = generate_dupes(people_remaining_rows, people_selected_rows, settings)
208229
self.data_source.highlight_dupes(dupes)
209230
report.add_line_and_log("Finished writing both selected and remaining", logging.INFO)
210231
return dupes, report

tests/test_adapters.py

Lines changed: 48 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -165,15 +165,20 @@ def test_generate_dupes_with_duplicates():
165165
["2", "Bob", "31 Acacia Avenue", "W1A 1AA"],
166166
["3", "Charlotte", "33 Acacia Avenue", "W1A 1AA"],
167167
["4", "David", "33 Acacia Avenue", "W1B 1BB"],
168+
["5", "Edward", "33 Zoological Street", "Z1Z 1ZZ"],
169+
]
170+
people_selected_rows = [
171+
["id", "name", "address_line_1", "postcode"],
172+
["11", "Zoe", "33 Zoological Street", "Z1Z 1ZZ"],
168173
]
169174
settings = Settings(
170175
id_column="id",
171176
columns_to_keep=["name"],
172177
check_same_address=True,
173178
check_same_address_columns=["address_line_1", "postcode"],
174179
)
175-
dupes = generate_dupes(people_remaining_rows, settings)
176-
assert dupes == [1, 3]
180+
dupes = generate_dupes(people_remaining_rows, people_selected_rows, settings)
181+
assert dupes == [1, 3, 5]
177182

178183

179184
def test_generate_dupes_no_duplicates():
@@ -186,13 +191,17 @@ def test_generate_dupes_no_duplicates():
186191
["2", "Bob", "31 Acacia Avenue", "W1A 1BB"],
187192
["3", "Charlotte", "35 Acacia Avenue", "W1A 1CC"],
188193
]
194+
people_selected_rows = [
195+
["id", "name", "address_line_1", "postcode"],
196+
["11", "Zoe", "33 Zoological Street", "Z1Z 1ZZ"],
197+
]
189198
settings = Settings(
190199
id_column="id",
191200
columns_to_keep=["name"],
192201
check_same_address=True,
193202
check_same_address_columns=["address_line_1", "postcode"],
194203
)
195-
dupes = generate_dupes(people_remaining_rows, settings)
204+
dupes = generate_dupes(people_remaining_rows, people_selected_rows, settings)
196205
assert dupes == []
197206

198207

@@ -204,14 +213,19 @@ def test_generate_dupes_check_disabled():
204213
["id", "name", "address_line_1", "postcode"],
205214
["1", "Alice", "33 Acacia Avenue", "W1A 1AA"],
206215
["2", "Bob", "33 Acacia Avenue", "W1A 1AA"],
216+
["5", "Edward", "33 Zoological Street", "Z1Z 1ZZ"],
217+
]
218+
people_selected_rows = [
219+
["id", "name", "address_line_1", "postcode"],
220+
["11", "Zoe", "33 Zoological Street", "Z1Z 1ZZ"],
207221
]
208222
settings = Settings(
209223
id_column="id",
210224
columns_to_keep=["name"],
211225
check_same_address=False,
212226
check_same_address_columns=["address_line_1", "postcode"],
213227
)
214-
dupes = generate_dupes(people_remaining_rows, settings)
228+
dupes = generate_dupes(people_remaining_rows, people_selected_rows, settings)
215229
assert dupes == []
216230

217231

@@ -227,13 +241,17 @@ def test_generate_dupes_multiple_groups():
227241
["4", "David", "15 Oak Street", "W2B 2BB"],
228242
["5", "Eve", "99 Pine Road", "W3C 3CC"],
229243
]
244+
people_selected_rows = [
245+
["id", "name", "address_line_1", "postcode"],
246+
["11", "Zoe", "33 Zoological Street", "Z1Z 1ZZ"],
247+
]
230248
settings = Settings(
231249
id_column="id",
232250
columns_to_keep=["name"],
233251
check_same_address=True,
234252
check_same_address_columns=["address_line_1", "postcode"],
235253
)
236-
dupes = generate_dupes(people_remaining_rows, settings)
254+
dupes = generate_dupes(people_remaining_rows, people_selected_rows, settings)
237255
assert dupes == [1, 2, 3, 4]
238256

239257

@@ -247,13 +265,17 @@ def test_generate_dupes_three_at_same_address():
247265
["2", "Bob", "33 Acacia Avenue", "W1A 1AA"],
248266
["3", "Charlotte", "33 Acacia Avenue", "W1A 1AA"],
249267
]
268+
people_selected_rows = [
269+
["id", "name", "address_line_1", "postcode"],
270+
["11", "Zoe", "33 Acacia Avenue", "W1A 1AA"],
271+
]
250272
settings = Settings(
251273
id_column="id",
252274
columns_to_keep=["name"],
253275
check_same_address=True,
254276
check_same_address_columns=["address_line_1", "postcode"],
255277
)
256-
dupes = generate_dupes(people_remaining_rows, settings)
278+
dupes = generate_dupes(people_remaining_rows, people_selected_rows, settings)
257279
assert dupes == [1, 2, 3]
258280

259281

@@ -266,15 +288,21 @@ def test_generate_dupes_single_address_column():
266288
["1", "Alice", "W1A 1AA"],
267289
["2", "Bob", "W1A 1AA"],
268290
["3", "Charlotte", "W1B 1BB"],
291+
["4", "David", "Z1Z 1ZZ"],
292+
["5", "Edwina", "W1C 1CC"],
293+
]
294+
people_selected_rows = [
295+
["id", "name", "postcode"],
296+
["11", "Zoe", "Z1Z 1ZZ"],
269297
]
270298
settings = Settings(
271299
id_column="id",
272300
columns_to_keep=["name"],
273301
check_same_address=True,
274302
check_same_address_columns=["postcode"],
275303
)
276-
dupes = generate_dupes(people_remaining_rows, settings)
277-
assert dupes == [1, 2]
304+
dupes = generate_dupes(people_remaining_rows, people_selected_rows, settings)
305+
assert dupes == [1, 2, 4]
278306

279307

280308
def test_generate_dupes_ignores_non_address_columns():
@@ -287,13 +315,17 @@ def test_generate_dupes_ignores_non_address_columns():
287315
["2", "Bob", "33 Acacia Avenue", "W1A 1AA", "456"],
288316
["3", "Charlotte", "35 Acacia Avenue", "W1B 1BB", "789"],
289317
]
318+
people_selected_rows = [
319+
["id", "name", "address_line_1", "postcode", "phone"],
320+
["11", "Zoe", "33 Zoological Street", "Z1Z 1ZZ", "789"],
321+
]
290322
settings = Settings(
291323
id_column="id",
292324
columns_to_keep=["name"],
293325
check_same_address=True,
294326
check_same_address_columns=["address_line_1", "postcode"],
295327
)
296-
dupes = generate_dupes(people_remaining_rows, settings)
328+
dupes = generate_dupes(people_remaining_rows, people_selected_rows, settings)
297329
# Alice and Bob have same address_line_1 and postcode, despite different phone
298330
assert dupes == [1, 2]
299331

@@ -303,13 +335,14 @@ def test_generate_dupes_only_header_row():
303335
Test that generate_dupes returns empty list when only header row is present.
304336
"""
305337
people_remaining_rows = [["id", "name", "address_line_1", "postcode"]]
338+
people_selected_rows = [["id", "name", "address_line_1", "postcode"]]
306339
settings = Settings(
307340
id_column="id",
308341
columns_to_keep=["name"],
309342
check_same_address=True,
310343
check_same_address_columns=["address_line_1", "postcode"],
311344
)
312-
dupes = generate_dupes(people_remaining_rows, settings)
345+
dupes = generate_dupes(people_remaining_rows, people_selected_rows, settings)
313346
assert dupes == []
314347

315348

@@ -323,13 +356,17 @@ def test_generate_dupes_partial_address_match():
323356
["2", "Bob", "33 Acacia Avenue", "W1B 1BB"],
324357
["3", "Charlotte", "31 Acacia Avenue", "W1A 1AA"],
325358
]
359+
people_selected_rows = [
360+
["id", "name", "address_line_1", "postcode"],
361+
["11", "Zoe", "33 Zoological Street", "Z1Z 1ZZ"],
362+
]
326363
settings = Settings(
327364
id_column="id",
328365
columns_to_keep=["name"],
329366
check_same_address=True,
330367
check_same_address_columns=["address_line_1", "postcode"],
331368
)
332-
dupes = generate_dupes(people_remaining_rows, settings)
369+
dupes = generate_dupes(people_remaining_rows, people_selected_rows, settings)
333370
assert dupes == []
334371

335372

0 commit comments

Comments
 (0)