When loading features, check against number to select

Hamish Downer · Hamish Downer · commit f1555baa96c9 · 2025-11-24T17:56:16.000Z
Raise an error if the combined maximum for any feature is less than the
number to select, or the combined minimum is more than the number to select.

This should make it clearer which maximum or minimum conflicts with the
number you are trying to select.
diff --git a/docs/adapters.md b/docs/adapters.md
@@ -71,15 +71,16 @@ def csv_selection_workflow():
     )
     select_data = SelectionData(data_source)
     settings = Settings()
+    number_wanted=100
 
     # Load data
-    features, report = select_data.load_features()
+    features, report = select_data.load_features(number_wanted)
     print(report.as_text())
     people, report = select_data.load_people(Settings(), features)
     print(report.as_text())
 
     # Run selection
-    success, panels, msgs = run_stratification(features, people, 100, settings)
+    success, panels, msgs = run_stratification(features, people, number_wanted, settings)
 
     if success:
         # Format results
@@ -129,6 +130,8 @@ print(report.as_text())
 people, report = select_data.load_people(settings, features)
 print(report.as_text())
 
+# Here, do selection
+
 # Configure output tabs
 data_source.selected_tab_name_stub = "Selected Panel"
 data_source.remaining_tab_name_stub = "Reserve Pool"
@@ -140,31 +143,36 @@ select_data.output_selected_remaining(selected_rows, remaining_rows, settings)
 #### Full Google Sheets Workflow
 
 ```python
-from sortition_algorithms import GSheetAdapter, run_stratification, selected_remaining_tables, Settings
+from sortition_algorithms import GSheetDataSource, SelectionData, run_stratification, selected_remaining_tables, Settings
 from pathlib import Path
 
 def gsheet_selection_workflow():
     # Initialize
-    adapter = GSheetAdapter(
-        auth_json_path=Path("credentials.json"),
-        gen_rem_tab=True,
+    data_source = GSheetDataSource(
+        feature_tab_name="Demographics",
+        people_tab_name="Candidates",
+        auth_json_path=Path("/secure/path/credentials.json"),
+        gen_rem_tab=True,  # Generate remaining tab
     )
+    data_source.set_g_sheet_name("My Spreadsheet")
+    select_data = SelectionData(data_source)
     settings = Settings()
+    number_wanted = 120
 
     # Load data
     adapter.set_g_sheet_name("Citizen Panel 2024")
-    features, report = adapter.load_features("Demographics")
+    features, report = adapter.load_features(number_wanted)
     if features is None:
         print("Failed to load features:", "\n".join(msgs))
         return
 
-    people, report = adapter.load_people("Candidates", settings, features)
+    people, report = adapter.load_people(settings, features)
     if people is None:
         print("Failed to load people:", "\n".join(msgs))
         return
 
     # Run selection
-    success, panels, report = run_stratification(features, people, 120, settings)
+    success, panels, report = run_stratification(features, people, number_wanted, settings)
 
     if success:
         # Format results
diff --git a/docs/api-reference.md b/docs/api-reference.md
@@ -431,7 +431,7 @@ class SelectionData:
 **Methods:**
 
 ```python
-def load_features(self) -> tuple[FeatureCollection, RunReport]:
+def load_features(self, number_to_select: int = 0) -> tuple[FeatureCollection, RunReport]:
     # Load feature definitions from data source
 
 def load_people(
@@ -469,14 +469,15 @@ data_source = CSVFileDataSource(
 
 # Wrap in SelectionData
 selection_data = SelectionData(data_source)
+number_to_select = 100
 
 # Load data
-features, report = selection_data.load_features()
+features, report = selection_data.load_features(number_to_select)
 people, report = selection_data.load_people(settings, features)
 
 # Run stratification (using core.py functions)
 from sortition_algorithms.core import run_stratification, selected_remaining_tables
-success, panels, report = run_stratification(features, people, 100, settings)
+success, panels, report = run_stratification(features, people, number_to_select, settings)
 
 # Format and output results
 selected_rows, remaining_rows, _ = selected_remaining_tables(
diff --git a/src/sortition_algorithms/__main__.py b/src/sortition_algorithms/__main__.py
@@ -91,7 +91,7 @@ def csv(
     select_data = adapters.SelectionData(data_source)
     settings_obj, report = Settings.load_from_file(Path(settings))
     echo_report(report)
-    features, report = select_data.load_features()
+    features, report = select_data.load_features(number_wanted)
     echo_report(report)
     if features is None:
         raise click.ClickException("Could not load features, exiting.")
@@ -183,7 +183,7 @@ def gsheet(
     echo_report(report)
 
     data_source.set_g_sheet_name(gsheet_name)
-    features, report = select_data.load_features()
+    features, report = select_data.load_features(number_wanted)
     echo_report(report)
     if features is None:
         raise click.ClickException("Could not load features, exiting.")
diff --git a/src/sortition_algorithms/adapters.py b/src/sortition_algorithms/adapters.py
@@ -125,13 +125,15 @@ def __init__(self, data_source: AbstractDataSource, gen_rem_tab: bool = True) ->
         self.feature_column_name = "feature"
         self.feature_value_column_name = "value"
 
-    def load_features(self) -> tuple[FeatureCollection, RunReport]:
+    def load_features(self, number_to_select: int = 0) -> tuple[FeatureCollection, RunReport]:
         report = RunReport()
         with self.data_source.read_feature_data(report) as headers_body:
             headers_iter, body = headers_body
             headers = list(headers_iter)
             try:
-                features, self.feature_column_name, self.feature_value_column_name = read_in_features(headers, body)
+                features, self.feature_column_name, self.feature_value_column_name = read_in_features(
+                    headers, body, number_to_select=number_to_select
+                )
             except ParseTableMultiError as error:
                 new_error = self.data_source.customise_features_parse_error(error, headers)
                 raise new_error from error
diff --git a/src/sortition_algorithms/features.py b/src/sortition_algorithms/features.py
@@ -122,12 +122,43 @@ def report_min_max_error_details(fc: FeatureCollection, feature_column_name: str
     ]
 
 
-def check_min_max(fc: FeatureCollection, feature_column_name: str = "feature") -> None:
+def report_min_max_against_number_to_select(
+    fc: FeatureCollection, number_to_select: int, feature_column_name: str
+) -> list[str]:
+    """
+    If any combined minimum is > number_to_select we have a problem.
+    If any combined maximum is < number_to_select we have a problem.
+    """
+    if not fc:
+        return []
+    errors: list[str] = []
+    for key, fv in fc.items():
+        feature_minimum = _fv_minimum_selection(fv)
+        feature_maximum = _fv_maximum_selection(fv)
+        if feature_minimum > number_to_select:
+            errors.append(
+                f"Minimum for {feature_column_name} {key} ({feature_minimum}) "
+                f"is more than number to select ({number_to_select})"
+            )
+        if feature_maximum < number_to_select:
+            errors.append(
+                f"Maximum for {feature_column_name} {key} ({feature_maximum}) "
+                f"is less than number to select ({number_to_select})"
+            )
+    return errors
+
+
+def check_min_max(fc: FeatureCollection, number_to_select: int = 0, feature_column_name: str = "feature") -> None:
     """
     If the min is bigger than the max we're in trouble i.e. there's an input error
     """
+    errors: list[str] = []
     if minimum_selection(fc) > maximum_selection(fc):
-        raise SelectionMultilineError(report_min_max_error_details(fc, feature_column_name))
+        errors += report_min_max_error_details(fc, feature_column_name)
+    if number_to_select:
+        errors += report_min_max_against_number_to_select(fc, number_to_select, feature_column_name)
+    if errors:
+        raise SelectionMultilineError(errors)
 
 
 def check_desired(fc: FeatureCollection, desired_number: int) -> None:
@@ -307,7 +338,7 @@ def _clean_row(row: utils.StrippedDict, feature_flex: bool, row_number: int) ->
 
 
 def read_in_features(
-    features_head: Iterable[str], features_body: Iterable[dict[str, str]]
+    features_head: Iterable[str], features_body: Iterable[dict[str, str]], number_to_select: int = 0
 ) -> tuple[FeatureCollection, str, str]:
     """
     Read in stratified selection features and values
@@ -344,7 +375,7 @@ def read_in_features(
     if combined_error:
         raise combined_error
 
-    check_min_max(features, feature_column_name)
+    check_min_max(features, number_to_select=number_to_select, feature_column_name=feature_column_name)
     # check feature_flex to see if we need to set the max here
     # this only changes the max_flex value if these (optional) flex values are NOT set already
     set_default_max_flex(features)
diff --git a/tests/test_features.py b/tests/test_features.py
@@ -434,6 +434,66 @@ def test_inconsistent_min_max_across_features(self):
         assert "smallest maximum is 4 for feature 'age'" in context.exconly()
         assert "largest minimum is 10 for feature 'gender'" in context.exconly()
 
+    def test_min_larger_than_number_to_select(self):
+        head = FEATURE_FILE_FIELD_NAMES
+        body = [
+            {
+                "feature": "gender",
+                "value": "male",
+                "min": "5",
+                "max": "6",
+            },  # min total: 10
+            {
+                "feature": "gender",
+                "value": "female",
+                "min": "5",
+                "max": "6",
+            },  # max total: 12
+        ]
+        with pytest.raises(SelectionMultilineError) as context:
+            read_in_features(head, body, number_to_select=8)
+        assert "Minimum for feature gender (10) is more than number to select (8)" in context.exconly()
+
+    def test_max_smaller_than_number_to_select(self):
+        head = FEATURE_FILE_FIELD_NAMES
+        body = [
+            {
+                "feature": "gender",
+                "value": "male",
+                "min": "5",
+                "max": "6",
+            },  # min total: 10
+            {
+                "feature": "gender",
+                "value": "female",
+                "min": "5",
+                "max": "6",
+            },  # max total: 12
+        ]
+        with pytest.raises(SelectionMultilineError) as context:
+            read_in_features(head, body, number_to_select=15)
+        assert "Maximum for feature gender (12) is less than number to select (15)" in context.exconly()
+
+    def test_no_error_when_number_to_select_is_zero(self):
+        head = FEATURE_FILE_FIELD_NAMES
+        body = [
+            {
+                "feature": "gender",
+                "value": "male",
+                "min": "5",
+                "max": "6",
+            },  # min total: 10
+            {
+                "feature": "gender",
+                "value": "female",
+                "min": "5",
+                "max": "6",
+            },  # max total: 12
+        ]
+        _, feature_column_name, _ = read_in_features(head, body, number_to_select=0)
+        # really just check no error raised
+        assert feature_column_name == "feature"
+
 
 class TestReadInFeaturesDataTypes:
     """Test handling of different data types in input."""