Hochfrequenz · hf-kklein · Dec 13, 2022 · Dec 12, 2022 · Dec 13, 2022 · Dec 13, 2022
diff --git a/requirements.in b/requirements.in
@@ -1 +1,3 @@
 ebdtable2graph
+python-docx
+more_itertools
diff --git a/requirements.txt b/requirements.txt
@@ -2,15 +2,16 @@
 # This file is autogenerated by pip-compile with Python 3.11
 # by the following command:
 #
-#    pip-compile requirements.txt
+#    pip-compile requirements.in
 #
 attrs==22.1.0
-    # via
-    #   -r requirements.txt
-    #   ebdtable2graph
+    # via ebdtable2graph
 ebdtable2graph==0.0.2
-    # via -r requirements.txt
+    # via -r requirements.in
+lxml==4.9.1 # switch to version 4.9.0 (for windows + Python 3.11)
+    # via python-docx
 networkx==2.8.8
-    # via
-    #   -r requirements.txt
-    #   ebdtable2graph
+    # via ebdtable2graph
+python-docx==0.8.11
+    # via -r requirements.in
+more_itertools==9.0.0
diff --git a/src/ebddocx2table/__init__.py b/src/ebddocx2table/__init__.py
@@ -1,3 +1,68 @@
 """
-src contains all your business logic
+Contains high level functions to process .docx files
 """
+import re
+from io import BytesIO
+from pathlib import Path
+from typing import Generator, Union
+
+from docx import Document  # type:ignore[import]
+from docx.oxml import CT_P, CT_Tbl  # type:ignore[import]
+from docx.table import Table  # type:ignore[import]
+from docx.text.paragraph import Paragraph  # type:ignore[import]
+
+
+def get_document(docx_file_path: Path) -> Document:
+    """
+    Reads the file at docx_file_path completely into memory and returns the python-docx document parsed from it.
+    """
+    with open(docx_file_path, "rb") as docx_file:
+        source_stream = BytesIO(docx_file.read())
+        # Originally I tried the recipe from
+        # https://python-docx.readthedocs.io/en/latest/user/documents.html#opening-a-file-like-document
+        # but then switched from StringIO to BytesIO (without explicit 'utf-8') because of:
+        # UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 605: character maps to <undefined>
+    try:
+        document = Document(source_stream)
+        return document
+    finally:
+        source_stream.close()  # the in-memory buffer is closed regardless of whether parsing succeeded
+
+
+def _get_tables_and_paragaphs(document: Document) -> Generator[Union[Table, Paragraph], None, None]:
+    """
+    Yields tables and paragraphs from the given document in the order in which they occur in the document.
+    This is helpful because document.tables and document.paragraphs are de-coupled and give you no information which
+    paragraph follows which table. Children of the body that are neither paragraph nor table are skipped.
+    """
+    parent_elements = document.element.body
+    for item in parent_elements.iterchildren():
+        if isinstance(item, CT_P):
+            yield Paragraph(item, document)
+        elif isinstance(item, CT_Tbl):
+            yield Table(item, document)
+
+
+_ebd_key_pattern = re.compile(r"^[SE]_\d{4}$")  # "S_" or "E_" followed by four digits, e.g. "E_0003"
+
+
+def get_ebd_docx_table(docx_file_path: Path, ebd_key: str) -> Table:
+    """
+    Opens the file specified in docx_file_path and returns the table that relates to the given ebd_key.
+    Raises a ValueError if the ebd_key does not match the expected pattern or if the table was not found.
+    """
+    if _ebd_key_pattern.match(ebd_key) is None:
+        raise ValueError(f"The ebd_key '{ebd_key}' does not match {_ebd_key_pattern.pattern}")
+    document = get_document(docx_file_path)
+    next_table_is_requested_table: bool = False
+    for table_or_paragraph in _get_tables_and_paragaphs(document):
+        if isinstance(table_or_paragraph, Paragraph):
+            paragraph: Paragraph = table_or_paragraph
+            # Assumptions:
+            # 1. the last paragraph before each EbdTable starts with its EBD key (each paragraph overwrites the flag)
+            # 2. there are no duplicates (the first table that follows a matching paragraph is returned)
+            next_table_is_requested_table = paragraph.text.startswith(ebd_key)
+        if isinstance(table_or_paragraph, Table) and next_table_is_requested_table:
+            table: Table = table_or_paragraph
+            return table
+    raise ValueError(f"EBD Table '{ebd_key}' was not found.")
diff --git a/src/ebddocx2table/docxtableconverter.py b/src/ebddocx2table/docxtableconverter.py
@@ -0,0 +1,138 @@
+"""
+This module converts tables read from the docx file into a format that is easily accessible (but still a table).
+"""
+import re
+from enum import Enum
+from itertools import cycle
+from typing import Generator, List, Literal, Optional, Tuple
+
+from docx.table import Table, _Cell, _Row  # type:ignore[import]
+from ebdtable2graph import EbdTable, EbdTableRow, EbdTableSubRow
+from ebdtable2graph.models.ebd_table import EbdCheckResult, EbdTableMetaData
+
+
+def _is_pruefende_rolle_cell(cell: _Cell) -> bool:
+    """
+    Returns true iff the cell text starts with "Prüfende Rolle: " (the market role that applies this Entscheidungsbaum)
+    """
+    return cell.text.startswith("Prüfende Rolle: ")
+
+
+def _sort_columns_in_row(docx_table_row: _Row) -> Generator[_Cell, None, None]:
+    """
+    The internal structure of the table rows is not as you'd expect it to be as soon as there are merged columns.
+    This problem is described in https://github.com/python-openxml/python-docx/issues/970#issuecomment-877386927 .
+    We apply the workaround described in the GitHub issue: yield one _Cell per entry of the row's internal tc_lst.
+    """
+    for table_column in docx_table_row._tr.tc_lst:  # pylint:disable=protected-access
+        yield _Cell(table_column, docx_table_row.table)
+
+
+_subsequent_step_pattern = re.compile(r"^(?P<bool>(?:ja)|(?:nein))\s*(?P<subsequent_step_number>(?:\d+\*?)|ende)?")
+
+
+def _read_subsequent_step_cell(cell: _Cell) -> Tuple[bool, Optional[str]]:
+    """
+    Parses the cell that contains the outcome and the subsequent step (e.g. "ja➡5" where "5" is the subsequent step
+    number).
+    """
+    cell_text = cell.text.lower().strip()
+    # we first match against the lower case cell text; then we convert the "ende" to upper case again in the end.
+    # this is to avoid confusion with "ja" vs. "Ja"
+    match = _subsequent_step_pattern.match(cell_text)
+    if not match:
+        raise ValueError(f"The cell content '{cell_text}' does not belong to a ja/nein cell")
+    group_dict = match.groupdict()
+    result_is_ja = group_dict["bool"] == "ja"
+    subsequent_step_number = group_dict["subsequent_step_number"]
+    if subsequent_step_number == "ende":
+        subsequent_step_number = "Ende"
-    if subsequent_step_number == "ende":
-        subsequent_step_number = "Ende"
-    if subsequent_step_number == "ende":
-        subsequent_step_number = "Ende"
+    return result_is_ja, subsequent_step_number
+
+
+class _EbdSubRowPosition(Enum):
+    """
+    Describes whether a row of the docx table is the upper or the lower sub row
+    """
+
+    UPPER = 1  #: the upper sub row
+    LOWER = 2  #: the lower sub row
+
+
+# pylint: disable=too-few-public-methods, too-many-instance-attributes
+class DocxTableConverter:
+    """
+    converts docx tables to EbdTables
+    """
+
+    def __init__(self, docx_table: Table, ebd_key: str, chapter: str, sub_chapter: str):
+        """
+        the constructor initializes the instance and reads some metadata from the table header
+        """
+        self._docx_table = docx_table
+        self._column_index_step_number: int
+        self._column_index_description: int
+        self._column_index_check_result: int
+        self._column_index_result_code: int
+        self._column_index_note: int
+        self._row_index_last_header: Literal[0, 1]  # either 0  or 1
+        for row_index in range(0, 2):  # the first two lines/rows are the header of the table.
+            # In the constructor we just want to read the metadata from the table.
+            # For this purpose the first two lines are enough.
+            for column_index, table_cell in enumerate(docx_table.row_cells(row_index)):
+                if row_index == 0 and _is_pruefende_rolle_cell(table_cell):
+                    role = table_cell.text.split(":")[1].strip()
+                    break  # because the prüfende rolle is always a full row with identical column cells
+                if table_cell.text == "Nr.":
+                    self._column_index_step_number = column_index
+                    # In most of the cases this will be 1,
+                    # but it can be 0 if the first row does _not_ contain the "Prüfende Rolle".
+                    self._row_index_last_header = row_index  # type:ignore[assignment]
+                elif table_cell.text == "Prüfschritt":
+                    self._column_index_description = column_index
+                elif table_cell.text == "Prüfergebnis":
+                    self._column_index_check_result = column_index
+                elif table_cell.text == "Code":
+                    self._column_index_result_code = column_index
+                elif table_cell.text == "Hinweis":
+                    self._column_index_note = column_index
+        self._metadata = EbdTableMetaData(ebd_code=ebd_key, sub_chapter=sub_chapter, chapter=chapter, role=role)
+
+    def convert_docx_table_to_ebd_table(self) -> EbdTable:
+        """
+        Converts the raw docx table of an EBD to an EbdTable.
+        The latter contains the same data but in an easily accessible format that can be used to e.g. plot real graphs.
+        """
+        rows: List[EbdTableRow] = []
+        sub_rows: List[EbdTableSubRow] = []
+        for table_row, sub_row_position in zip(
+            self._docx_table.rows[self._row_index_last_header + 1 :],
+            cycle([_EbdSubRowPosition.UPPER, _EbdSubRowPosition.LOWER]),
+        ):
+            row_cells = list(_sort_columns_in_row(table_row))
+            if sub_row_position == _EbdSubRowPosition.UPPER:
+                # clear list every second entry
+                sub_rows = []
+                step_number = row_cells[self._column_index_step_number].text.strip()
+                description = row_cells[self._column_index_description].text.strip()
+            boolean_outcome, subsequent_step_number = _read_subsequent_step_cell(row_cells[self._column_index_check_result])
+            result_code = row_cells[self._column_index_result_code].text.strip()
+            note = row_cells[self._column_index_note].text.strip()
+            sub_row = EbdTableSubRow(
+                check_result=EbdCheckResult(subsequent_step_number=subsequent_step_number, result=boolean_outcome),
+                result_code=result_code or None,
+                note=note or None,
+            )
+            sub_rows.append(sub_row)
+            if sub_row_position == _EbdSubRowPosition.LOWER:
+                row = EbdTableRow(
+                    description=description,
+                    step_number=step_number,
+                    sub_rows=sub_rows,
+                )
+                rows.append(row)
+        result = EbdTable(
+            rows=rows,
+            metadata=self._metadata,
+        )
+        return result
diff --git a/src/ebddocx2table/mymodule.py b/src/ebddocx2table/mymodule.py
diff --git a/tox.ini b/tox.ini
@@ -15,6 +15,7 @@ commands = python -m pip install --upgrade pip
 deps =
     -rrequirements.txt
     pytest
+    pytest-datafiles
 setenv = PYTHONPATH = {toxinidir}/src
 commands = python -m pytest --basetemp={envtmpdir} {posargs}
 

diff --git a/unittests/__init__.py b/unittests/__init__.py
@@ -2,3 +2,25 @@
 This file is here, because this allows for best de-coupling of tests and application/library logic.
 Further reading: https://docs.pytest.org/en/6.2.x/goodpractices.html#tests-outside-application-code
 """
+from pathlib import Path
+
+from docx import Document  # type:ignore[import]
+from docx.table import Table  # type:ignore[import]
+
+import ebddocx2table
+
+
+def get_document(datafiles, filename: str) -> Document:
+    """
+    a datafiles compatible wrapper around ebddocx2table.get_document; filename is resolved relative to datafiles
+    """
+    path = datafiles / Path(filename)
+    return ebddocx2table.get_document(path)
+
+
+def get_ebd_docx_table(datafiles, filename: str, ebd_key: str) -> Table:
+    """
+    a datafiles compatible wrapper around ebddocx2table.get_ebd_docx_table; filename is resolved relative to datafiles
+    """
+    path = datafiles / Path(filename)
+    return ebddocx2table.get_ebd_docx_table(path, ebd_key=ebd_key)
diff --git a/unittests/examples.py b/unittests/examples.py
@@ -0,0 +1,51 @@
+"""
+This module contains the data as we expect them to be scraped from the docx file
+"""
+
+from ebdtable2graph import EbdTable
+from ebdtable2graph.models.ebd_table import EbdCheckResult, EbdTableMetaData, EbdTableRow, EbdTableSubRow
+
+# E_0003 is pretty short: it consists of only two steps with two sub rows each
+# https://www.entscheidungsbaumdiagramm.de/diagram?ebdKey=E_0003&formatVersion=FV2204
+table_e0003 = EbdTable(
+    metadata=EbdTableMetaData(
+        ebd_code="E_0003",
+        chapter="7.39 AD: Bestellung der Aggregationsebene der Bilanzkreissummenzeitreihe auf Ebene der Regelzone",
+        sub_chapter="7.39.1 E_0003_Bestellung der Aggregationsebene RZ prüfen",
+        role="ÜNB",
+    ),
+    rows=[
+        EbdTableRow(
+            step_number="1",
+            description="Erfolgt der Eingang der Bestellung fristgerecht?",
+            sub_rows=[
+                EbdTableSubRow(
+                    check_result=EbdCheckResult(result=False, subsequent_step_number=None),
+                    result_code="A01",
+                    note="Fristüberschreitung",
+                ),
+                EbdTableSubRow(
+                    check_result=EbdCheckResult(result=True, subsequent_step_number="2"),  # continue with step 2
+                    result_code=None,
+                    note=None,
+                ),
+            ],
+        ),
+        EbdTableRow(
+            step_number="2",
+            description="Erfolgt die Bestellung zum Monatsersten 00:00 Uhr?",
+            sub_rows=[
+                EbdTableSubRow(
+                    check_result=EbdCheckResult(result=False, subsequent_step_number=None),
+                    result_code="A02",
+                    note="Gewählter Zeitpunkt nicht zulässig",
+                ),
+                EbdTableSubRow(
+                    check_result=EbdCheckResult(result=True, subsequent_step_number="Ende"),  # no further step
+                    result_code=None,
+                    note=None,
+                ),
+            ],
+        ),
+    ],
+)
diff --git a/unittests/test_data/README.md b/unittests/test_data/README.md
@@ -0,0 +1,10 @@
+# Test Data (.docx)
+
+The `.docx` files in this directory are copied from edi-energy.de.
+The files are used to automatically test the scraping logic.
+
+The file [ebd20221128.docx](ebd20221128.docx) can be found on edi-energy.de under the following title (link as of [2022-12-12](https://www.edi-energy.de/index.php?id=38&tx_bdew_bdew%5Buid%5D=1758&tx_bdew_bdew%5Baction%5D=download&tx_bdew_bdew%5Bcontroller%5D=Dokument&cHash=d148663456f1d71dc0c3f666849efa7a)):
+
+> Entscheidungsbaum-Diagramme und Codelisten - informatorische Lesefassung 3.2 Konsolidierte Lesefassung mit Fehlerkorrekturen Stand: 28.11.2022
+
+The copyright for these files remains solely with EDI@Energy and the authors of the docx files.
diff --git a/unittests/test_data/ebd20221128.docx b/unittests/test_data/ebd20221128.docx