Skip to content
37 changes: 31 additions & 6 deletions src/ebddocx2table/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import re
from io import BytesIO
from pathlib import Path
from typing import Dict, Generator, Union
from typing import Dict, Generator, List, Union

from docx import Document # type:ignore[import]
from docx.oxml import CT_P, CT_Tbl # type:ignore[import]
Expand Down Expand Up @@ -47,16 +47,21 @@ def _get_tables_and_paragaphs(document: Document) -> Generator[Union[Table, Para
_ebd_key_with_heading_pattern = re.compile(r"^(?P<key>E_\d{4})_(?P<title>.*)\s*$")


def get_ebd_docx_table(docx_file_path: Path, ebd_key: str) -> Table:
def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> List[Table]:
"""
Opens the file specified in docx_file_path and returns the table that relates to the given ebd_key.
Opens the file specified in docx_file_path and returns the tables that relate to the given ebd_key.
There might be more than 1 docx table for 1 EBD table.
This is because of inconsistencies and manual editing during creation of the documents by EDI@Energy.
Raises a ValueError if no table was found for the given ebd_key.
"""
if _ebd_key_pattern.match(ebd_key) is None:
raise ValueError(f"The ebd_key '{ebd_key}' does not match {_ebd_key_pattern.pattern}")
document = get_document(docx_file_path)

next_table_is_requested_table: bool = False
for table_or_paragraph in _get_tables_and_paragaphs(document):
tables: List[Table] = []
tables_and_paragraphs = _get_tables_and_paragaphs(document)
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Habe den Generator aus der Loop rausgezogen, damit wir weiter unten weiter darüber loopen können (ab der Stelle, an der wir die erste passende Tabelle gefunden haben).

for table_or_paragraph in tables_and_paragraphs:
if isinstance(table_or_paragraph, Paragraph):
paragraph: Paragraph = table_or_paragraph
# Assumptions:
Expand All @@ -65,8 +70,28 @@ def get_ebd_docx_table(docx_file_path: Path, ebd_key: str) -> Table:
next_table_is_requested_table = paragraph.text.startswith(ebd_key)
if isinstance(table_or_paragraph, Table) and next_table_is_requested_table:
table: Table = table_or_paragraph
return table
raise ValueError(f"EBD Table '{ebd_key}' was not found.")
tables.append(table)
# Now we have to check if the EBD table spans multiple pages and _maybe_ we have to collect more tables.
# The funny thing is: Sometimes the authors create multiple tables split over multiple lines which belong
# together, sometimes they create 1 proper table that spans multiple pages.
# The latter case (1 docx table spanning >1 pages) is transparent to the extraction logic; i.e. python-docx
# treats a single table that spans multiple pages just the same as a table on only 1 page.
for next_item in tables_and_paragraphs: # start iterating from where the outer loop paused
if isinstance(next_item, Table):
# this is the case that the authors created multiple single tables on single adjacent pages
tables.append(next_item)
elif isinstance(next_item, Paragraph) and not next_item.text.strip():
# sometimes the authors add blank lines before they continue with the next table
continue
else:
break # inner loop because if no other table will follow
# we're done collecting the tables for this EBD key
if next_table_is_requested_table and len(tables) > 0: # this means: we found the table
# break the outer loop, too; no need to iterate any further
break
if len(tables) == 0:
raise ValueError(f"EBD Table '{ebd_key}' was not found.")
return tables


def get_all_ebd_keys(docx_file_path: Path) -> Dict[str, str]:
Expand Down
40 changes: 26 additions & 14 deletions src/ebddocx2table/docxtableconverter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from docx.table import Table, _Cell, _Row # type:ignore[import]
from ebdtable2graph import EbdTable, EbdTableRow, EbdTableSubRow
from ebdtable2graph.models.ebd_table import EbdCheckResult, EbdTableMetaData
from more_itertools import first


def _is_pruefende_rolle_cell(cell: _Cell) -> bool:
Expand Down Expand Up @@ -71,11 +72,11 @@ class DocxTableConverter:
converts docx tables to EbdTables
"""

def __init__(self, docx_table: Table, ebd_key: str, chapter: str, sub_chapter: str):
def __init__(self, docx_tables: List[Table], ebd_key: str, chapter: str, sub_chapter: str):
"""
the constructor initializes the instance and reads some metadata from the table header
the constructor initializes the instance and reads some metadata from the (first) table header
"""
self._docx_table = docx_table
self._docx_tables = docx_tables
self._column_index_step_number: int
self._column_index_description: int
self._column_index_check_result: int
Expand All @@ -85,7 +86,7 @@ def __init__(self, docx_table: Table, ebd_key: str, chapter: str, sub_chapter: s
for row_index in range(0, 2): # the first two lines/rows are the header of the table.
# In the constructor we just want to read the metadata from the table.
# For this purpose the first two lines are enough.
for column_index, table_cell in enumerate(docx_table.row_cells(row_index)):
for column_index, table_cell in enumerate(first(docx_tables).row_cells(row_index)):
if row_index == 0 and _is_pruefende_rolle_cell(table_cell):
role = table_cell.text.split(":")[1].strip()
break # because the prüfende rolle is always a full row with identical column cells
Expand All @@ -104,15 +105,15 @@ def __init__(self, docx_table: Table, ebd_key: str, chapter: str, sub_chapter: s
self._column_index_note = column_index
self._metadata = EbdTableMetaData(ebd_code=ebd_key, sub_chapter=sub_chapter, chapter=chapter, role=role)

def convert_docx_table_to_ebd_table(self) -> EbdTable:
def _handle_single_table(
self, table: Table, row_offset: int, rows: List[EbdTableRow], sub_rows: List[EbdTableSubRow]
) -> None:
"""
Converts the raw docx table of an EBD to an EbdTable.
The latter contains the same data but in an easily accessible format that can be used to e.g. plot real graphs.
Handles a single table (out of possible multiple tables for 1 EBD).
The results are written into rows and sub_rows. Those will be modified.
"""
rows: List[EbdTableRow] = []
sub_rows: List[EbdTableSubRow] = []
for table_row, sub_row_position in zip(
self._docx_table.rows[self._row_index_last_header + 1 :],
table.rows[row_offset:],
cycle([_EbdSubRowPosition.UPPER, _EbdSubRowPosition.LOWER]),
):
Comment on lines 107 to 118
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Die Funktion, die vorher die eine einzige Tabelle gehandlet hat, ist jetzt eine private Methode, die ihre Ergebnisse an zwei übergebene Listen (rows und sub_rows) anhängt.

row_cells = list(_sort_columns_in_row(table_row))
Expand All @@ -124,12 +125,10 @@ def convert_docx_table_to_ebd_table(self) -> EbdTable:
boolean_outcome, subsequent_step_number = _read_subsequent_step_cell(
row_cells[self._column_index_check_result]
)
result_code = row_cells[self._column_index_result_code].text.strip()
note = row_cells[self._column_index_note].text.strip()
sub_row = EbdTableSubRow(
check_result=EbdCheckResult(subsequent_step_number=subsequent_step_number, result=boolean_outcome),
result_code=result_code or None,
note=note or None,
result_code=row_cells[self._column_index_result_code].text.strip() or None,
note=row_cells[self._column_index_note].text.strip() or None,
Comment on lines -127 to +131
Copy link
Copy Markdown
Contributor Author

@hf-kklein hf-kklein Dec 18, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pylint hat gemeckert: too-many-locals (also zu viele lokale Variablen). Das hier ist so ein Klassiker: Es gefällt zwar dem Linter, aber die Lesbarkeit leidet vielleicht ein bisschen.

)
sub_rows.append(sub_row)
if sub_row_position == _EbdSubRowPosition.LOWER:
Expand All @@ -139,6 +138,19 @@ def convert_docx_table_to_ebd_table(self) -> EbdTable:
sub_rows=sub_rows,
)
rows.append(row)

def convert_docx_tables_to_ebd_table(self) -> EbdTable:
"""
Converts the raw docx tables of an EBD to an EbdTable.
The latter contains the same data but in an easily accessible format that can be used to e.g. plot real graphs.
"""
rows: List[EbdTableRow] = []
sub_rows: List[EbdTableSubRow] = []
for table_index, table in enumerate(self._docx_tables):
offset: int = 0
if table_index == 0:
offset = self._row_index_last_header + 1
self._handle_single_table(table, offset, rows, sub_rows)
Copy link
Copy Markdown
Contributor Author

@hf-kklein hf-kklein Dec 18, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Die vormals einzige Konvertierungsfunktion wird jetzt hier aufgerufen.

result = EbdTable(
rows=rows,
metadata=self._metadata,
Expand Down
8 changes: 4 additions & 4 deletions unittests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Further reading: https://docs.pytest.org/en/6.2.x/goodpractices.html#tests-outside-application-code
"""
from pathlib import Path
from typing import Dict
from typing import Dict, List

from docx import Document # type:ignore[import]
from docx.table import Table # type:ignore[import]
Expand All @@ -19,12 +19,12 @@ def get_document(datafiles, filename: str) -> Document:
return ebddocx2table.get_document(path)


def get_ebd_docx_table(datafiles, filename: str, ebd_key: str) -> Table:
def get_ebd_docx_tables(datafiles, filename: str, ebd_key: str) -> List[Table]:
"""
a datafiles compatible wrapper around ebddocx2table.get_ebd_docx_table
a datafiles compatible wrapper around ebddocx2table.get_ebd_docx_tables
"""
path = datafiles / Path(filename)
return ebddocx2table.get_ebd_docx_table(path, ebd_key=ebd_key)
return ebddocx2table.get_ebd_docx_tables(path, ebd_key=ebd_key)


def get_all_ebd_keys(datafiles, filename: str) -> Dict[str, str]:
Expand Down
76 changes: 76 additions & 0 deletions unittests/examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,79 @@
),
],
)

# Expected EbdTable for E_0901 ("Gegenvorschlag prüfen").
# In the test document this EBD is not contained in one single docx table: the high-level test
# expects 2 docx tables for this key. It therefore serves as the reference result for merging
# several docx tables into one EbdTable.
table_e0901 = EbdTable(
metadata=EbdTableMetaData(
ebd_code="E_0901",
chapter="16.1 AD: Ermittlung und Abstimmung der abrechnungsrelevanten Ausfallarbeit – Prognosemodell",
sub_chapter="16.1.2 E_0901_Gegenvorschlag prüfen",
role="NB",
),
rows=[
EbdTableRow(
step_number="1",
description="Liegt für die Ausfallarbeitszeitreihe bereits eine Zustimmung vor?",
sub_rows=[
EbdTableSubRow(
check_result=EbdCheckResult(result=True, subsequent_step_number=None),
result_code="A01",
note="Cluster: Ablehnung\nAusfallarbeitszeitreihe wurde bereits bestätigt.",
),
EbdTableSubRow(
check_result=EbdCheckResult(result=False, subsequent_step_number="2"),
result_code=None,
note=None,
),
],
),
EbdTableRow(
step_number="2",
description="Ist der Gegenvorschlag zur Ausfallarbeitszeitreihe innerhalb der vorgegebenen Frist eingegangen?",
sub_rows=[
EbdTableSubRow(
check_result=EbdCheckResult(result=False, subsequent_step_number=None),
result_code="A02",
note="Cluster: Ablehnung\nFristüberschreitung",
),
EbdTableSubRow(
check_result=EbdCheckResult(result=True, subsequent_step_number="3"),
result_code=None,
note=None,
),
],
),
EbdTableRow(
step_number="3",
description="Liegt bereits ein Gegenvorschlag zur Ausfallarbeitszeitreihe vor?",
sub_rows=[
EbdTableSubRow(
check_result=EbdCheckResult(result=True, subsequent_step_number=None),
result_code="A03",
note="Cluster: Ablehnung\nGegenvorschlag liegt bereits vor\nHinweis: Ein weiterer Gegenvorschlag kann nicht eingereicht werden.",
),
EbdTableSubRow(
check_result=EbdCheckResult(result=False, subsequent_step_number="4"),
result_code=None,
note=None,
),
],
),
# Last step: both outcomes carry a result code and neither has a subsequent step.
EbdTableRow(
step_number="4",
description="Können die Energiemengen des Gegenvorschlages zur Ausfallarbeitszeitreihe akzeptiert werden?",
sub_rows=[
EbdTableSubRow(
check_result=EbdCheckResult(result=False, subsequent_step_number=None),
result_code="A04",
note="Cluster: Ablehnung\nEnergiemengen falsch / nicht plausibel",
),
EbdTableSubRow(
check_result=EbdCheckResult(result=True, subsequent_step_number=None),
result_code="A05",
note="Cluster: Zustimmung\nZustimmung",
),
],
),
],
)
36 changes: 27 additions & 9 deletions unittests/test_highlevel.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import pytest # type:ignore[import]
from docx.table import Table # type:ignore[import]
from ebdtable2graph import EbdTable

from ebddocx2table.docxtableconverter import DocxTableConverter

from . import get_all_ebd_keys, get_document, get_ebd_docx_table
from .examples import table_e0003
from . import get_all_ebd_keys, get_document, get_ebd_docx_tables
from .examples import table_e0003, table_e0901


class TestEbdDocx2Table:
Expand All @@ -29,12 +30,20 @@ def test_get_ebd_keys(self, datafiles, filename: str, expected_length: int):

@pytest.mark.datafiles("unittests/test_data/ebd20221128.docx")
@pytest.mark.parametrize(
"filename, ebd_key",
[pytest.param("ebd20221128.docx", "E_0003")], # 7.39.1 E_0003_Bestellung der Aggregationsebene RZ prüfen 342
"filename, ebd_key,expected_number_of_tables",
[
pytest.param("ebd20221128.docx", "E_0003", 1, id="E_0003: One table on only one page"),
pytest.param("ebd20221128.docx", "E_0015", 1, id="E_0015: one table spanning multiple pages"),
pytest.param("ebd20221128.docx", "E_0901", 2, id="E_0901: multiple tables on multiple pages")
# pytest.param("ebd20221128.docx", "E_0461"), # https://github.com/Hochfrequenz/ebd_docx_to_table/issues/6
],
)
def test_get_ebd_docx_table(self, datafiles, filename: str, ebd_key: str):
actual = get_ebd_docx_table(datafiles, filename, ebd_key=ebd_key)
def test_get_ebd_docx_table(self, datafiles, filename: str, ebd_key: str, expected_number_of_tables: int):
actual = get_ebd_docx_tables(datafiles, filename, ebd_key=ebd_key)
assert actual is not None
assert len(actual) == expected_number_of_tables
for table in actual:
assert isinstance(table, Table)

@pytest.mark.datafiles("unittests/test_data/ebd20221128.docx")
@pytest.mark.parametrize(
Expand All @@ -46,13 +55,22 @@ def test_get_ebd_docx_table(self, datafiles, filename: str, ebd_key: str):
"7.39 AD: Bestellung der Aggregationsebene der Bilanzkreissummenzeitreihe auf Ebene der Regelzone",
"7.39.1 E_0003_Bestellung der Aggregationsebene RZ prüfen",
table_e0003,
)
id="E_0003: Simple single page table",
),
pytest.param(
"ebd20221128.docx",
"E_0901",
"16.1 AD: Ermittlung und Abstimmung der abrechnungsrelevanten Ausfallarbeit – Prognosemodell",
"16.1.2 E_0901_Gegenvorschlag prüfen",
table_e0901,
id="E_0901: table that span over two pages",
),
],
)
def test_convert_docx_table_to_ebd_table(
self, datafiles, filename: str, ebd_key: str, chapter: str, sub_chapter: str, expected: EbdTable
):
docx_table = get_ebd_docx_table(datafiles, filename, ebd_key=ebd_key)
docx_table = get_ebd_docx_tables(datafiles, filename, ebd_key=ebd_key)
converter = DocxTableConverter(docx_table, ebd_key=ebd_key, chapter=chapter, sub_chapter=sub_chapter)
actual = converter.convert_docx_table_to_ebd_table()
actual = converter.convert_docx_tables_to_ebd_table()
assert actual == expected