Hochfrequenz · hf-kklein · Jun 30, 2023 · Jun 30, 2023 · Jun 30, 2023 · Jun 30, 2023
diff --git a/main.py b/main.py
@@ -18,6 +18,7 @@
 from ebdtable2graph import convert_graph_to_plantuml, convert_table_to_graph
 from ebdtable2graph.graphviz import convert_dot_to_svg_kroki, convert_graph_to_dot
 from ebdtable2graph.models import EbdGraph, EbdTable
+from ebdtable2graph.plantuml import GraphToComplexForPlantumlError
 
 # pylint:disable=import-error
 from ebddocx2table import TableNotFoundError, get_all_ebd_keys, get_ebd_docx_tables  # type:ignore[import]
@@ -71,6 +72,7 @@ def _dump_json(json_path: Path, ebd_table: EbdTable) -> None:
     multiple=True,
     help="Choose which file you'd like to create",
 )
+# pylint:disable=too-many-locals, too-many-branches
 def main(input_path: Path, output_path: Path, export_types: list[Literal["puml", "dot", "json", "svg"]]):
     """
     A program to get a machine-readable version of the AHBs docx files published by edi@energy.
@@ -107,7 +109,7 @@ def main(input_path: Path, output_path: Path, export_types: list[Literal["puml",
         try:
             ebd_graph = convert_table_to_graph(ebd_table)
         except Exception as graphing_error:  # pylint:disable=broad-except
-            click.secho(f"Error while graphing {ebd_key}: {str(graphing_error)}; Skip!", fg="red")
+            click.secho(f"Error while graphing {ebd_key}: {str(graphing_error)}; Skip!", fg="yellow")
             continue
         if "puml" in export_types:
             try:
@@ -116,12 +118,19 @@ def main(input_path: Path, output_path: Path, export_types: list[Literal["puml",
             except AssertionError as assertion_error:
                 # https://github.com/Hochfrequenz/ebdtable2graph/issues/35
                 click.secho(str(assertion_error), fg="red")
-        if "dot" in export_types:
-            _dump_dot(output_path / Path(f"{ebd_key}.dot"), ebd_graph)
-            click.secho(f"💾 Successfully exported '{ebd_key}.dot'")
-        if "svg" in export_types:
-            _dump_svg(output_path / Path(f"{ebd_key}.svg"), ebd_graph)
-            click.secho(f"💾 Successfully exported '{ebd_key}.svg'")
+            except GraphToComplexForPlantumlError as too_complex_error:
+                click.secho(str(too_complex_error), fg="red")
+        try:
+            if "dot" in export_types:
+                _dump_dot(output_path / Path(f"{ebd_key}.dot"), ebd_graph)
+                click.secho(f"💾 Successfully exported '{ebd_key}.dot'")
+            if "svg" in export_types:
+                _dump_svg(output_path / Path(f"{ebd_key}.svg"), ebd_graph)
+                click.secho(f"💾 Successfully exported '{ebd_key}.svg'")
+        except AssertionError as assertion_error:
+            # e.g. AssertionError: If indegree > 1, the number of paths should always be greater than 1 too.
+            click.secho(str(assertion_error), fg="red")
+            # both the SVG and dot path require graphviz to work, hence the common error handling block
 
     click.secho("🏁Finished")
 

diff --git a/src/ebddocx2table/__init__.py b/src/ebddocx2table/__init__.py
@@ -11,7 +11,7 @@
 import attrs
 from docx import Document  # type:ignore[import]
 from docx.oxml import CT_P, CT_Tbl  # type:ignore[import]
-from docx.table import Table  # type:ignore[import]
+from docx.table import Table, _Cell  # type:ignore[import]
 from docx.text.paragraph import Paragraph  # type:ignore[import]
 
 _logger = logging.getLogger(__name__)
@@ -62,6 +62,44 @@ def __init__(self, ebd_key: str):
         self.ebd_key = ebd_key
 
 
+# Accepts a cell text that consists of "ja" or "nein", optional whitespace and then either "Ende" or a number
+# (e.g. "ja 2" or "neinEnde"). The pattern is case-sensitive and anchored with ^ and $.
+_ebd_cell_pattern = re.compile(r"^(?:ja|nein)\s*(?:Ende|\d+)$")
+"""
+any EBD table shall contain at least one cell that matches this pattern
+"""
+
+
+def _cell_is_probably_from_an_ebd_cell(cell: _Cell) -> bool:
+    """
+    Returns true iff the text of the given cell looks like it belongs to an EBD table.
+    A cell qualifies if its text
+    - contains a rightwards arrow (either the proper glyph or its wrongly encoded form "à"),
+    - is exactly "ja" or "nein",
+    - matches _ebd_cell_pattern or
+    - starts with "Cluster:" or "Hinweis:" (leading whitespace is ignored).
+    """
+    cell_text = cell.text
+    # NOTE(review): this check used to read `"" in cell.text`, which is true for _every_ string and hence made this
+    # function return True for any cell. The arrow literal has apparently been lost; U+1F86A is assumed here.
+    # Please confirm against the glyph that is actually used in the docx files.
+    if "\U0001f86a" in cell_text:
+        return True
+    if cell_text in {"ja", "nein"}:
+        return True
+    if "à" in cell_text:
+        # the rightarrow in wrong encoding
+        return True
+    if _ebd_cell_pattern.match(cell_text):
+        return True
+    # both prefixes are checked on the stripped text (previously only "Cluster:" tolerated leading whitespace)
+    return cell_text.strip().startswith(("Cluster:", "Hinweis:"))
+
+
+def _table_is_an_ebd_table(table: Table) -> bool:
+    """
+    Returns true iff at least one cell of the table "looks like" a cell of an EBD table.
+    This distinguishes the tables that form the decision tree from other tables that live in the same subsection
+    but are unrelated to it (e.g. in E_0406 the tables about Artikel-IDs).
+    """
+
+    def _row_contains_ebd_cell(row) -> bool:
+        # accessing/iterating row.cells may raise an IndexError inside python-docx; such a row counts as "no match"
+        try:
+            return any(_cell_is_probably_from_an_ebd_cell(cell) for cell in row.cells)
+        except IndexError:
+            return False
+
+    return any(_row_contains_ebd_cell(row) for row in table.rows)
+
+
+# pylint:disable=too-many-branches
 def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> List[Table]:
     """
     Opens the file specified in docx_file_path and returns the tables that relate to the given ebd_key.
@@ -73,7 +111,7 @@ def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> List[Table]:
         raise ValueError(f"The ebd_key '{ebd_key}' does not match {_ebd_key_pattern.pattern}")
     document = get_document(docx_file_path)
 
-    next_table_is_requested_table: bool = False
+    is_inside_subsection_of_requested_table: bool = False
     tables: List[Table] = []
     tables_and_paragraphs = _get_tables_and_paragaphs(document)
     for table_or_paragraph in tables_and_paragraphs:
@@ -82,8 +120,21 @@ def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> List[Table]:
             # Assumptions:
             # 1. before each EbdTable there is a paragraph whose text starts with the respective EBD key
             # 2. there are no duplicates
-            next_table_is_requested_table = paragraph.text.startswith(ebd_key)
-        if isinstance(table_or_paragraph, Table) and next_table_is_requested_table:
+            is_inside_subsection_of_requested_table = (
+                paragraph.text.startswith(ebd_key) or is_inside_subsection_of_requested_table
+            )
+            if (
+                is_inside_subsection_of_requested_table
+                and paragraph.text.strip().startswith("Es ist das EBD")
+                and paragraph.text.strip().endswith("zu nutzen.")
+            ):
+                # that's kind of a dirty hack. But it works.
+                break
+        if (
+            isinstance(table_or_paragraph, Table)
+            and is_inside_subsection_of_requested_table
+            and _table_is_an_ebd_table(table_or_paragraph)
+        ):
             table: Table = table_or_paragraph
             tables.append(table)
             # Now we have to check if the EBD table spans multiple pages and _maybe_ we have to collect more tables.
@@ -94,17 +145,22 @@ def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> List[Table]:
             for next_item in tables_and_paragraphs:  # start iterating from where the outer loop paused
                 if isinstance(next_item, Table):
                     # this is the case that the authors created multiple single tables on single adjacent pages
-                    tables.append(next_item)
-                elif isinstance(next_item, Paragraph) and not next_item.text.strip():
-                    # sometimes the authors add blank lines before they continue with the next table
+                    # if table_is_an_ebd_table(table):
+                    if _table_is_an_ebd_table(next_item):
+                        tables.append(next_item)
+                elif isinstance(next_item, Paragraph):
+                    if next_item.text.startswith("S_") or next_item.text.startswith("E_"):
+                        # this is the case that the authors created 1 table that spans multiple pages
+                        # and we're done collecting tables for this EBD key
+                        break
                     continue
                 else:
                     break  # inner loop because if no other table will follow
                     # we're done collecting the tables for this EBD key
-        if next_table_is_requested_table and len(tables) > 0:  # this means: we found the table
+        if is_inside_subsection_of_requested_table and len(tables) > 0:  # this means: we found the table
             # break the outer loop, too; no need to iterate any further
             break
-    if len(tables) == 0:
+    if not any(tables):
         raise TableNotFoundError(ebd_key=ebd_key)
     return tables
 

diff --git a/src/ebddocx2table/docxtableconverter.py b/src/ebddocx2table/docxtableconverter.py
@@ -33,7 +33,7 @@ def _sort_columns_in_row(docx_table_row: _Row) -> Generator[_Cell, None, None]:
         yield _Cell(table_column, docx_table_row.table)
 
 
-_subsequent_step_pattern = re.compile(r"^(?P<bool>(?:ja)|(?:nein))\s*(?P<subsequent_step_number>(?:\d+\*?)|ende)?")
+_subsequent_step_pattern = re.compile(r"^(?P<bool>(?:ja)|(?:nein))[\sà]*(?P<subsequent_step_number>(?:\d+\*?)|ende)?")
 
 _step_number_pattern = re.compile(_STEP_NUMBER_REGEX)
 
@@ -42,7 +42,9 @@ def _get_index_of_first_column_with_step_number(cells: List[_Cell]) -> int:
     """
     returns the index of the first cell in cells, that contains a step number
     """
-    first_step_number_cell = first_true(cells, pred=lambda cell: _step_number_pattern.match(cell.text) is not None)
+    first_step_number_cell = first_true(
+        cells, pred=lambda cell: _step_number_pattern.match(cell.text.strip()) is not None
+    )
     step_number_column_index = cells.index(first_step_number_cell)
     _logger.debug("The step number is in column %i", step_number_column_index)
     return step_number_column_index
@@ -175,6 +177,7 @@ def __init__(self, docx_tables: List[Table], ebd_key: str, chapter: str, sub_cha
                     self._column_index_result_code = column_index
                 elif table_cell_text == "Hinweis":
                     self._column_index_note = column_index
+
         self._metadata = EbdTableMetaData(ebd_code=ebd_key, sub_chapter=sub_chapter, chapter=chapter, role=role)
 
     @staticmethod

diff --git a/unittests/test_highlevel.py b/unittests/test_highlevel.py
@@ -107,6 +107,57 @@ def test_get_ebd_docx_table(self, datafiles, filename: str, ebd_key: str, expect
         for table in actual:
             assert isinstance(table, Table)
 
+    @pytest.mark.datafiles("unittests/test_data/ebd20230629_v34.docx")
+    @pytest.mark.parametrize(
+        "filename, ebd_key",
+        [
+            pytest.param("ebd20230629_v34.docx", "E_0406", id="E_0406: EB-Table starts after pages of text"),
+        ],
+    )
+    def test_finding_tables_positive(self, datafiles, filename: str, ebd_key: str):
+        """
+        The tables for the given EBD key are found and can be converted to an EbdTable.
+        Neither of the two steps may raise a TableNotFoundError.
+        """
+        found_tables = get_ebd_docx_tables(datafiles, filename, ebd_key=ebd_key)
+        ebd_table = DocxTableConverter(
+            found_tables, ebd_key=ebd_key, chapter="Dummy Chapter", sub_chapter="Dummy Subchapter"
+        ).convert_docx_tables_to_ebd_table()
+        assert isinstance(ebd_table, EbdTable)
+
+    @pytest.mark.datafiles("unittests/test_data/ebd20230629_v34.docx")
+    @pytest.mark.parametrize(
+        "filename, ebd_key",
+        [
+            pytest.param("ebd20230629_v34.docx", "E_0561", id="Es ist das EBD E_0556 zu nutzen."),
+        ],
+    )
+    def test_finding_tables_negative(self, datafiles, filename: str, ebd_key: str):
+        """
+        For an EBD key whose section has no table of its own, the table lookup raises a TableNotFoundError.
+        """
+        # Only the lookup is wrapped: this pins down _which_ call has to raise. Statements after the raising call
+        # inside a pytest.raises block are never executed anyway.
+        with pytest.raises(TableNotFoundError):
+            get_ebd_docx_tables(datafiles, filename, ebd_key=ebd_key)
+
+    @pytest.mark.datafiles("unittests/test_data/ebd20230619_v34.docx")
+    @pytest.mark.parametrize(
+        "filename, ebd_key, expected_subsequent",
+        [
+            pytest.param("ebd20230619_v34.docx", "E_0012", "2"),
+            pytest.param("ebd20230619_v34.docx", "E_0021", "Ende"),
+        ],
+    )
+    def test_wrong_encoding_of_rightarrow(self, datafiles, filename: str, ebd_key: str, expected_subsequent: str):
+        """
+        At least one sub row of the converted table has the expected subsequent step number.
+        """
+        docx_tables = get_ebd_docx_tables(datafiles, filename, ebd_key=ebd_key)
+        converter = DocxTableConverter(
+            docx_tables, ebd_key=ebd_key, chapter="Dummy Chapter", sub_chapter="Dummy Subchapter"
+        )
+        actual = converter.convert_docx_tables_to_ebd_table()
+        # any() evaluates the comparison itself, so the result does not depend on the truthiness of the sub row objects
+        assert any(
+            subrow.check_result.subsequent_step_number == expected_subsequent
+            for row in actual.rows
+            for subrow in row.sub_rows
+        )
+
     @pytest.mark.datafiles("unittests/test_data/ebd20221128.docx")
     @pytest.mark.parametrize(
         "filename, ebd_key, chapter, sub_chapter, expected",