Hochfrequenz · hf-kklein · Dec 19, 2022 · Dec 18, 2022 · Dec 18, 2022 · Dec 18, 2022
diff --git a/src/ebddocx2table/__init__.py b/src/ebddocx2table/__init__.py
@@ -4,7 +4,7 @@
 import re
 from io import BytesIO
 from pathlib import Path
-from typing import Dict, Generator, Union
+from typing import Dict, Generator, List, Union
 
 from docx import Document  # type:ignore[import]
 from docx.oxml import CT_P, CT_Tbl  # type:ignore[import]
@@ -47,16 +47,21 @@ def _get_tables_and_paragaphs(document: Document) -> Generator[Union[Table, Para
 _ebd_key_with_heading_pattern = re.compile(r"^(?P<key>E_\d{4})_(?P<title>.*)\s*$")
 
 
-def get_ebd_docx_table(docx_file_path: Path, ebd_key: str) -> Table:
+def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> List[Table]:
     """
-    Opens the file specified in docx_file_path and returns the table that relates to the given ebd_key.
+    Opens the file specified in docx_file_path and returns the tables that relate to the given ebd_key.
+    There might be more than 1 docx table for 1 EBD table.
+    This is because of inconsistencies and manual editing during creation of the documents by EDI@Energy.
     Raises an ValueError if the table was not found.
     """
     if _ebd_key_pattern.match(ebd_key) is None:
         raise ValueError(f"The ebd_key '{ebd_key}' does not match {_ebd_key_pattern.pattern}")
     document = get_document(docx_file_path)
+
     next_table_is_requested_table: bool = False
-    for table_or_paragraph in _get_tables_and_paragaphs(document):
+    tables: List[Table] = []
+    tables_and_paragraphs = _get_tables_and_paragaphs(document)
+    for table_or_paragraph in tables_and_paragraphs:
         if isinstance(table_or_paragraph, Paragraph):
             paragraph: Paragraph = table_or_paragraph
             # Assumptions:
@@ -65,8 +70,28 @@ def get_ebd_docx_table(docx_file_path: Path, ebd_key: str) -> Table:
             next_table_is_requested_table = paragraph.text.startswith(ebd_key)
         if isinstance(table_or_paragraph, Table) and next_table_is_requested_table:
             table: Table = table_or_paragraph
-            return table
-    raise ValueError(f"EBD Table '{ebd_key}' was not found.")
+            tables.append(table)
+            # Now we have to check if the EBD table spans multiple pages and _maybe_ we have to collect more tables.
+            # The funny thing is: Sometimes the authors create multiple tables split over multiple pages which belong
+            # together, sometimes they create 1 proper table that spans multiple pages.
+            # The latter case (1 docx table spanning >1 pages) is transparent to the extraction logic; i.e. python-docx
+            # treats a single table that spans multiple pages just the same as a table on only 1 page.
+            for next_item in tables_and_paragraphs:  # start iterating from where the outer loop paused
+                if isinstance(next_item, Table):
+                    # this is the case that the authors created multiple single tables on single adjacent pages
+                    tables.append(next_item)
+                elif isinstance(next_item, Paragraph) and not next_item.text.strip():
+                    # sometimes the authors add blank lines before they continue with the next table
+                    continue
+                else:
+                    break  # leave the inner loop: if no further table follows,
+                    # we're done collecting the tables for this EBD key
+        if next_table_is_requested_table and len(tables) > 0:  # this means: we found the table
+            # break the outer loop, too; no need to iterate any further
+            break
+    if len(tables) == 0:
+        raise ValueError(f"EBD Table '{ebd_key}' was not found.")
+    return tables
 
 
 def get_all_ebd_keys(docx_file_path: Path) -> Dict[str, str]:

diff --git a/src/ebddocx2table/docxtableconverter.py b/src/ebddocx2table/docxtableconverter.py
@@ -9,6 +9,7 @@
 from docx.table import Table, _Cell, _Row  # type:ignore[import]
 from ebdtable2graph import EbdTable, EbdTableRow, EbdTableSubRow
 from ebdtable2graph.models.ebd_table import EbdCheckResult, EbdTableMetaData
+from more_itertools import first
 
 
 def _is_pruefende_rolle_cell(cell: _Cell) -> bool:
@@ -71,11 +72,11 @@ class DocxTableConverter:
     converts docx tables to EbdTables
     """
 
-    def __init__(self, docx_table: Table, ebd_key: str, chapter: str, sub_chapter: str):
+    def __init__(self, docx_tables: List[Table], ebd_key: str, chapter: str, sub_chapter: str):
         """
-        the constructor initializes the instance and reads some metadata from the table header
+        the constructor initializes the instance and reads some metadata from the (first) table header
         """
-        self._docx_table = docx_table
+        self._docx_tables = docx_tables
         self._column_index_step_number: int
         self._column_index_description: int
         self._column_index_check_result: int
@@ -85,7 +86,7 @@ def __init__(self, docx_table: Table, ebd_key: str, chapter: str, sub_chapter: s
         for row_index in range(0, 2):  # the first two lines/rows are the header of the table.
             # In the constructor we just want to read the metadata from the table.
             # For this purpose the first two lines are enough.
-            for column_index, table_cell in enumerate(docx_table.row_cells(row_index)):
+            for column_index, table_cell in enumerate(first(docx_tables).row_cells(row_index)):
                 if row_index == 0 and _is_pruefende_rolle_cell(table_cell):
                     role = table_cell.text.split(":")[1].strip()
                     break  # because the prüfende rolle is always a full row with identical column cells
@@ -104,15 +105,15 @@ def __init__(self, docx_table: Table, ebd_key: str, chapter: str, sub_chapter: s
                     self._column_index_note = column_index
         self._metadata = EbdTableMetaData(ebd_code=ebd_key, sub_chapter=sub_chapter, chapter=chapter, role=role)
 
-    def convert_docx_table_to_ebd_table(self) -> EbdTable:
+    def _handle_single_table(
+        self, table: Table, row_offset: int, rows: List[EbdTableRow], sub_rows: List[EbdTableSubRow]
+    ) -> None:
         """
-        Converts the raw docx table of an EBD to an EbdTable.
-        The latter contains the same data but in an easily accessible format that can be used to e.g. plot real graphs.
+        Handles a single table (out of possibly multiple tables for 1 EBD).
+        The results are appended to the given rows and sub_rows lists; both are modified in place.
         """
-        rows: List[EbdTableRow] = []
-        sub_rows: List[EbdTableSubRow] = []
         for table_row, sub_row_position in zip(
-            self._docx_table.rows[self._row_index_last_header + 1 :],
+            table.rows[row_offset:],
             cycle([_EbdSubRowPosition.UPPER, _EbdSubRowPosition.LOWER]),
         ):
             row_cells = list(_sort_columns_in_row(table_row))
@@ -124,12 +125,10 @@ def convert_docx_table_to_ebd_table(self) -> EbdTable:
             boolean_outcome, subsequent_step_number = _read_subsequent_step_cell(
                 row_cells[self._column_index_check_result]
             )
-            result_code = row_cells[self._column_index_result_code].text.strip()
-            note = row_cells[self._column_index_note].text.strip()
             sub_row = EbdTableSubRow(
                 check_result=EbdCheckResult(subsequent_step_number=subsequent_step_number, result=boolean_outcome),
-                result_code=result_code or None,
-                note=note or None,
+                result_code=row_cells[self._column_index_result_code].text.strip() or None,
+                note=row_cells[self._column_index_note].text.strip() or None,
             )
             sub_rows.append(sub_row)
             if sub_row_position == _EbdSubRowPosition.LOWER:
@@ -139,6 +138,19 @@ def convert_docx_table_to_ebd_table(self) -> EbdTable:
                     sub_rows=sub_rows,
                 )
                 rows.append(row)
+
+    def convert_docx_tables_to_ebd_table(self) -> EbdTable:
+        """
+        Converts the raw docx tables of an EBD to an EbdTable.
+        The latter contains the same data but in an easily accessible format that can be used to e.g. plot real graphs.
+        """
+        rows: List[EbdTableRow] = []
+        sub_rows: List[EbdTableSubRow] = []
+        for table_index, table in enumerate(self._docx_tables):
+            offset: int = 0
+            if table_index == 0:
+                offset = self._row_index_last_header + 1
+            self._handle_single_table(table, offset, rows, sub_rows)
         result = EbdTable(
             rows=rows,
             metadata=self._metadata,

diff --git a/unittests/__init__.py b/unittests/__init__.py
@@ -3,7 +3,7 @@
 Further reading: https://docs.pytest.org/en/6.2.x/goodpractices.html#tests-outside-application-code
 """
 from pathlib import Path
-from typing import Dict
+from typing import Dict, List
 
 from docx import Document  # type:ignore[import]
 from docx.table import Table  # type:ignore[import]
@@ -19,12 +19,12 @@ def get_document(datafiles, filename: str) -> Document:
     return ebddocx2table.get_document(path)
 
 
-def get_ebd_docx_table(datafiles, filename: str, ebd_key: str) -> Table:
+def get_ebd_docx_tables(datafiles, filename: str, ebd_key: str) -> List[Table]:
     """
-    a datafiles compatible wrapper around ebddocx2table.get_ebd_docx_table
+    a datafiles compatible wrapper around ebddocx2table.get_ebd_docx_tables
     """
     path = datafiles / Path(filename)
-    return ebddocx2table.get_ebd_docx_table(path, ebd_key=ebd_key)
+    return ebddocx2table.get_ebd_docx_tables(path, ebd_key=ebd_key)
 
 
 def get_all_ebd_keys(datafiles, filename: str) -> Dict[str, str]:

diff --git a/unittests/examples.py b/unittests/examples.py
@@ -49,3 +49,79 @@
         ),
     ],
 )
+
+# E_0901 spans multiple pages, let the fun begin
+table_e0901 = EbdTable(
+    metadata=EbdTableMetaData(
+        ebd_code="E_0901",
+        chapter="16.1 AD: Ermittlung und Abstimmung der abrechnungsrelevanten Ausfallarbeit – Prognosemodell",
+        sub_chapter="16.1.2 E_0901_Gegenvorschlag prüfen",
+        role="NB",
+    ),
+    rows=[
+        EbdTableRow(
+            step_number="1",
+            description="Liegt für die Ausfallarbeitszeitreihe bereits eine Zustimmung vor?",
+            sub_rows=[
+                EbdTableSubRow(
+                    check_result=EbdCheckResult(result=True, subsequent_step_number=None),
+                    result_code="A01",
+                    note="Cluster: Ablehnung\nAusfallarbeitszeitreihe wurde bereits bestätigt.",
+                ),
+                EbdTableSubRow(
+                    check_result=EbdCheckResult(result=False, subsequent_step_number="2"),
+                    result_code=None,
+                    note=None,
+                ),
+            ],
+        ),
+        EbdTableRow(
+            step_number="2",
+            description="Ist der Gegenvorschlag zur Ausfallarbeitszeitreihe innerhalb der vorgegebenen Frist eingegangen?",
+            sub_rows=[
+                EbdTableSubRow(
+                    check_result=EbdCheckResult(result=False, subsequent_step_number=None),
+                    result_code="A02",
+                    note="Cluster: Ablehnung\nFristüberschreitung",
+                ),
+                EbdTableSubRow(
+                    check_result=EbdCheckResult(result=True, subsequent_step_number="3"),
+                    result_code=None,
+                    note=None,
+                ),
+            ],
+        ),
+        EbdTableRow(
+            step_number="3",
+            description="Liegt bereits ein Gegenvorschlag zur Ausfallarbeitszeitreihe vor?",
+            sub_rows=[
+                EbdTableSubRow(
+                    check_result=EbdCheckResult(result=True, subsequent_step_number=None),
+                    result_code="A03",
+                    note="Cluster: Ablehnung\nGegenvorschlag liegt bereits vor\nHinweis: Ein weiterer Gegenvorschlag kann nicht eingereicht werden.",
+                ),
+                EbdTableSubRow(
+                    check_result=EbdCheckResult(result=False, subsequent_step_number="4"),
+                    result_code=None,
+                    note=None,
+                ),
+            ],
+        ),
+        EbdTableRow(
+            step_number="4",
+            description="Können die Energiemengen des Gegenvorschlages zur Ausfallarbeitszeitreihe akzeptiert werden?",
+            sub_rows=[
+                EbdTableSubRow(
+                    check_result=EbdCheckResult(result=False, subsequent_step_number=None),
+                    result_code="A04",
+                    note="Cluster: Ablehnung\nEnergiemengen falsch / nicht plausibel",
+                ),
+                EbdTableSubRow(
+                    check_result=EbdCheckResult(result=True, subsequent_step_number=None),
+                    result_code="A05",
+                    note="Cluster: Zustimmung\nZustimmung",
+                ),
+            ],
+        ),
+    ],
+)
diff --git a/unittests/test_highlevel.py b/unittests/test_highlevel.py
@@ -1,10 +1,11 @@
 import pytest  # type:ignore[import]
+from docx.table import Table  # type:ignore[import]
 from ebdtable2graph import EbdTable
 
 from ebddocx2table.docxtableconverter import DocxTableConverter
 
-from . import get_all_ebd_keys, get_document, get_ebd_docx_table
-from .examples import table_e0003
+from . import get_all_ebd_keys, get_document, get_ebd_docx_tables
+from .examples import table_e0003, table_e0901
 
 
 class TestEbdDocx2Table:
@@ -29,12 +30,20 @@ def test_get_ebd_keys(self, datafiles, filename: str, expected_length: int):
 
     @pytest.mark.datafiles("unittests/test_data/ebd20221128.docx")
     @pytest.mark.parametrize(
-        "filename, ebd_key",
-        [pytest.param("ebd20221128.docx", "E_0003")],  # 7.39.1 E_0003_Bestellung der Aggregationsebene RZ prüfen	342
+        "filename, ebd_key,expected_number_of_tables",
+        [
+            pytest.param("ebd20221128.docx", "E_0003", 1, id="E_0003: One table on only one page"),
+            pytest.param("ebd20221128.docx", "E_0015", 1, id="E_0015: one table spanning multiple pages"),
+            pytest.param("ebd20221128.docx", "E_0901", 2, id="E_0901: multiple tables on multiple pages")
+            # pytest.param("ebd20221128.docx", "E_0461"), # https://github.com/Hochfrequenz/ebd_docx_to_table/issues/6
+        ],
     )
-    def test_get_ebd_docx_table(self, datafiles, filename: str, ebd_key: str):
-        actual = get_ebd_docx_table(datafiles, filename, ebd_key=ebd_key)
+    def test_get_ebd_docx_table(self, datafiles, filename: str, ebd_key: str, expected_number_of_tables: int):
+        actual = get_ebd_docx_tables(datafiles, filename, ebd_key=ebd_key)
         assert actual is not None
+        assert len(actual) == expected_number_of_tables
+        for table in actual:
+            assert isinstance(table, Table)
 
     @pytest.mark.datafiles("unittests/test_data/ebd20221128.docx")
     @pytest.mark.parametrize(
@@ -46,13 +55,22 @@ def test_get_ebd_docx_table(self, datafiles, filename: str, ebd_key: str):
                 "7.39 AD: Bestellung der Aggregationsebene der Bilanzkreissummenzeitreihe auf Ebene der Regelzone",
                 "7.39.1 E_0003_Bestellung der Aggregationsebene RZ prüfen",
                 table_e0003,
-            )
+                id="E_0003: Simple single page table",
+            ),
+            pytest.param(
+                "ebd20221128.docx",
+                "E_0901",
+                "16.1 AD: Ermittlung und Abstimmung der abrechnungsrelevanten Ausfallarbeit – Prognosemodell",
+                "16.1.2 E_0901_Gegenvorschlag prüfen",
+                table_e0901,
+                id="E_0901: table that span over two pages",
+            ),
         ],
     )
     def test_convert_docx_table_to_ebd_table(
         self, datafiles, filename: str, ebd_key: str, chapter: str, sub_chapter: str, expected: EbdTable
     ):
-        docx_table = get_ebd_docx_table(datafiles, filename, ebd_key=ebd_key)
+        docx_table = get_ebd_docx_tables(datafiles, filename, ebd_key=ebd_key)
         converter = DocxTableConverter(docx_table, ebd_key=ebd_key, chapter=chapter, sub_chapter=sub_chapter)
-        actual = converter.convert_docx_table_to_ebd_table()
+        actual = converter.convert_docx_tables_to_ebd_table()
         assert actual == expected