Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions requirements.in
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
ebdtable2graph
python-docx
more_itertools
17 changes: 9 additions & 8 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,16 @@
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements.txt
# pip-compile requirements.in
#
attrs==22.1.0
# via
# -r requirements.txt
# ebdtable2graph
# via ebdtable2graph
ebdtable2graph==0.0.2
# via -r requirements.txt
# via -r requirements.in
lxml==4.9.1 # switch to version 4.9.0 (for windows + Python 3.11)
# via python-docx
networkx==2.8.8
# via
# -r requirements.txt
# ebdtable2graph
# via ebdtable2graph
python-docx==0.8.11
# via -r requirements.in
more_itertools==9.0.0
67 changes: 66 additions & 1 deletion src/ebddocx2table/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,68 @@
"""
src contains all your business logic
Contains high level functions to process .docx files
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sind die Funktionen hier in der __init__.py sinnvoll hinterlegt?
Bin mir selbst nie so ganz sicher, was man am besten in eine init Datei schreibt.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mir auch nicht :D

"""
import re
from io import BytesIO
from pathlib import Path
from typing import Generator, Union

from docx import Document # type:ignore[import]
from docx.oxml import CT_P, CT_Tbl # type:ignore[import]
from docx.table import Table # type:ignore[import]
from docx.text.paragraph import Paragraph # type:ignore[import]


def get_document(docx_file_path: Path) -> Document:
"""
opens and returns the document specified in the docx_file_path using python-docx
"""
with open(docx_file_path, "rb") as docx_file:
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

müsste man noch den Fall abfangen, dass die Datei nicht da ist?

from pathlib import Path

p = Path.home()
print(p)
print(p.exists())

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ich muss den fall nicht abfangen. Es ist ok, wenn er mit einem FileNotFoundError stirbt. Den Fehler zu kaschieren bringt ja auch nichts

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

joa, man könnte ihn mit einer schöneren Fehlermeldung aussteigen lassen. Aber ja, passt für mich

source_stream = BytesIO(docx_file.read())
# Originally I tried the recipe from
# https://python-docx.readthedocs.io/en/latest/user/documents.html#opening-a-file-like-document
# but then switched from StringIO to BytesIO (without explicit 'utf-8') because of:
# UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 605: character maps to <undefined>
Comment on lines +20 to +24
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ich gehe davon aus, dass du das encoding bei open angegeben hast oder?

open('readme.txt', encoding="utf-8")

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

try:
document = Document(source_stream)
return document
finally:
source_stream.close()


def _get_tables_and_paragaphs(document: Document) -> Generator[Union[Table, Paragraph], None, None]:
    """
    Walk over the direct children of the document body and yield every table and paragraph in document order.

    python-docx exposes ``document.tables`` and ``document.paragraphs`` only as two separate collections, so they
    do not tell which paragraph precedes which table. Iterating over the body element keeps that order intact.
    Children that are neither a paragraph nor a table are skipped.
    """
    for body_child in document.element.body.iterchildren():
        if isinstance(body_child, CT_Tbl):
            yield Table(body_child, document)
            continue
        if isinstance(body_child, CT_P):
            yield Paragraph(body_child, document)


# An EBD key consists of the letter "S" or "E", an underscore and exactly four digits, e.g. "E_0003".
_ebd_key_pattern = re.compile(r"^[SE]_\d{4}$")


def get_ebd_docx_table(docx_file_path: Path, ebd_key: str) -> Table:
"""
Opens the file specified in docx_file_path and returns the table that relates to the given ebd_key.
Raises a ValueError if the table was not found.
"""
if _ebd_key_pattern.match(ebd_key) is None:
raise ValueError(f"The ebd_key '{ebd_key}' does not match {_ebd_key_pattern.pattern}")
document = get_document(docx_file_path)
next_table_is_requested_table: bool = False
for table_or_paragraph in _get_tables_and_paragaphs(document):
if isinstance(table_or_paragraph, Paragraph):
paragraph: Paragraph = table_or_paragraph
# Assumptions:
# 1. before each EbdTable there is a paragraph whose text starts with the respective EBD key
# 2. there are no duplicates
next_table_is_requested_table = paragraph.text.startswith(ebd_key)
if isinstance(table_or_paragraph, Table) and next_table_is_requested_table:
Comment on lines +64 to +65
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

smart move

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ich denke das übernehme ich für den kohlrahbi

table: Table = table_or_paragraph
return table
raise ValueError(f"EBD Table '{ebd_key}' was not found.")
138 changes: 138 additions & 0 deletions src/ebddocx2table/docxtableconverter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
"""
This module converts tables read from the docx file into a format that is easily accessible (but still a table).
"""
import re
from enum import Enum
from itertools import cycle
from typing import Generator, List, Literal, Optional, Tuple

from docx.table import Table, _Cell, _Row # type:ignore[import]
from ebdtable2graph import EbdTable, EbdTableRow, EbdTableSubRow
from ebdtable2graph.models.ebd_table import EbdCheckResult, EbdTableMetaData


def _is_pruefende_rolle_cell(cell: _Cell) -> bool:
""" "
Returns true iff the cell mentions the market role that is responsible for applying this entscheidungsbaum
"""
return cell.text.startswith("Prüfende Rolle: ")


def _sort_columns_in_row(docx_table_row: _Row) -> Generator[_Cell, None, None]:
    """
    Yield one cell per ``tc`` element of the given row, in the order in which they are stored in the row element.

    As soon as a table contains merged columns, the cells python-docx returns for a row are not structured the way
    you would expect. The problem and the workaround applied here are described in the GitHub issue
    https://github.com/python-openxml/python-docx/issues/970#issuecomment-877386927 .
    """
    yield from (
        _Cell(tc_element, docx_table_row.table)
        for tc_element in docx_table_row._tr.tc_lst  # pylint:disable=protected-access
    )
Comment on lines +21 to +28
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can feel the pain :D



# Matches the text of a check result cell from its start: a leading "ja" or "nein" (group "bool"), optional
# whitespace and then, optionally, the subsequent step (group "subsequent_step_number"), which is either a number
# with an optional trailing "*" or the word "ende". The pattern only contains lower case literals.
_subsequent_step_pattern = re.compile(r"^(?P<bool>(?:ja)|(?:nein))\s*(?P<subsequent_step_number>(?:\d+\*?)|ende)?")


def _read_subsequent_step_cell(cell: _Cell) -> Tuple[bool, Optional[str]]:
"""
Parses the cell that contains the outcome and the subsequent step (e.g. "ja➡5" where "5" is the subsequent step
number).
"""
cell_text = cell.text.lower().strip()
Comment thread
hf-kklein marked this conversation as resolved.
# we first match against the lower case cell text; then we convert the "ende" to upper case again in the end.
# this is to avoid confusion with "ja" vs. "Ja"
match = _subsequent_step_pattern.match(cell_text)
if not match:
raise ValueError(f"The cell content '{cell_text}' does not belong to a ja/nein cell")
group_dict = match.groupdict()
result_is_ja = group_dict["bool"] == "ja"
subsequent_step_number = group_dict["subsequent_step_number"]
Comment thread
hf-kklein marked this conversation as resolved.
if subsequent_step_number == "ende":
subsequent_step_number = "Ende"
Comment on lines +48 to +49
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wenn ich das richtig sehe, ist Ende in den Tabellen immer groß geschrieben. Hab den regex oben angepasst, die Zeilen hier müssten also obsolet sein, oder?

Suggested change
if subsequent_step_number == "ende":
subsequent_step_number = "Ende"

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

der regex verwendet lowercase weil ich nicht ausschließen will, dass die leute manchmal "Ja" und manchmal "ja" schreiben. aber ich sehe, dass ein erklärender kommentar gut wäre.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

erkläreneder kommentar: 74f1988

return result_is_ja, subsequent_step_number


class _EbdSubRowPosition(Enum):
"""
describes the position of a subrow in the Docx Table
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

kannst du hier ein Beispiel anfügen?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"""

UPPER = 1 #: the upper sub row
LOWER = 2 #: the lower sub row


# pylint: disable=too-few-public-methods, too-many-instance-attributes
class DocxTableConverter:
"""
converts docx tables to EbdTables
"""

def __init__(self, docx_table: Table, ebd_key: str, chapter: str, sub_chapter: str):
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vielleicht wäre es hier sinnvoll auch eine classmethod zu verwenden, um eine Instanz von DocxTableConverter zu erstellen, die etwas unabhängiger ist von dem Datenmodell.
Vergleiche https://github.com/Hochfrequenz/kohlrahbi/blob/53cf14c10cf1b965281dd705ce52126d9a3f3f50/src/kohlrahbi/helper/elixir.py#L48-L52

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hf-krechan kannst du dafür einen eigenen PR aufmachen? Ich sehe gerade den Nutzen noch nicht.

"""
the constructor initializes the instance and reads some metadata from the table header
"""
self._docx_table = docx_table
self._column_index_step_number: int
self._column_index_description: int
self._column_index_check_result: int
self._column_index_result_code: int
self._column_index_note: int
self._row_index_last_header: Literal[0, 1] # either 0 or 1
for row_index in range(0, 2): # the first two lines/rows are the header of the table.
# In the constructor we just want to read the metadata from the table.
# For this purpose the first two lines are enough.
for column_index, table_cell in enumerate(docx_table.row_cells(row_index)):
if row_index == 0 and _is_pruefende_rolle_cell(table_cell):
role = table_cell.text.split(":")[1].strip()
break # because the prüfende rolle is always a full row with identical column cells
if table_cell.text == "Nr.":
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

willst du hier extra so streng sein oder wäre ein startswith auch in Ordnung?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ich würde solange streng sein, bis es failed.

self._column_index_step_number = column_index
# In most of the cases this will be 1,
# but it can be 0 if the first row does _not_ contain the "Prüfende Rolle".
self._row_index_last_header = row_index # type:ignore[assignment]
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In welchem Fall kann _row_index_last_header denn 0 sein?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wenn die erste row der tabelle nicht die Prüfende Rolle enthält: c606c61

elif table_cell.text == "Prüfschritt":
self._column_index_description = column_index
elif table_cell.text == "Prüfergebnis":
self._column_index_check_result = column_index
elif table_cell.text == "Code":
self._column_index_result_code = column_index
elif table_cell.text == "Hinweis":
self._column_index_note = column_index
self._metadata = EbdTableMetaData(ebd_code=ebd_key, sub_chapter=sub_chapter, chapter=chapter, role=role)
Copy link
Copy Markdown
Contributor

@lord-haffi lord-haffi Dec 13, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wenn die erste row der tabelle nicht die Prüfende Rolle enthält: c606c61

In dem Fall ist die Variable role aber dann nicht gesetzt, oder? Der müsste hier ja dann eigentlich crashen, testest du den Fall ab?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ja das crasht dann. Fände ich aber ok, dann muss man sehen was der Grund ist.

Ich verstehe deinen Punkt: der Code erweckt den Eindruck er sei ganz flexibel und auf alle eventualitäten vorbereitet aber tatsächlich kann er nur den einen aktuell abgetesteten Fall handlen.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In dem Fall ist die Variable role aber dann nicht gesetzt, oder?

#9 da isser, der fall :)


def convert_docx_table_to_ebd_table(self) -> EbdTable:
    """
    Build an EbdTable from the raw docx table that was handed to the constructor.

    The EbdTable carries the same data as the docx table, but in an easily accessible format that can be used to
    e.g. plot real graphs. All rows below the last header row are processed alternately as the upper and the lower
    sub row of one EbdTableRow: the upper one provides the step number and the description, each of the two
    contributes one EbdTableSubRow, and the lower one completes the EbdTableRow.
    Empty result codes and notes are stored as None.
    """
    ebd_rows: List[EbdTableRow] = []
    collected_sub_rows: List[EbdTableSubRow] = []
    for docx_row, position in zip(
        self._docx_table.rows[self._row_index_last_header + 1 :],
        cycle((_EbdSubRowPosition.UPPER, _EbdSubRowPosition.LOWER)),
    ):
        cells = list(_sort_columns_in_row(docx_row))
        is_upper_sub_row = position == _EbdSubRowPosition.UPPER
        if is_upper_sub_row:
            # a new pair of sub rows starts here: begin with a fresh list and remember the data of the step
            collected_sub_rows = []
            step_number = cells[self._column_index_step_number].text.strip()
            description = cells[self._column_index_description].text.strip()
        outcome, next_step = _read_subsequent_step_cell(cells[self._column_index_check_result])
        code_text = cells[self._column_index_result_code].text.strip()
        note_text = cells[self._column_index_note].text.strip()
        collected_sub_rows.append(
            EbdTableSubRow(
                check_result=EbdCheckResult(subsequent_step_number=next_step, result=outcome),
                result_code=code_text or None,
                note=note_text or None,
            )
        )
        if not is_upper_sub_row:
            # the lower sub row is the second and last one of the step, so the row is complete now
            ebd_rows.append(
                EbdTableRow(
                    description=description,
                    step_number=step_number,
                    sub_rows=collected_sub_rows,
                )
            )
    return EbdTable(
        rows=ebd_rows,
        metadata=self._metadata,
    )
23 changes: 0 additions & 23 deletions src/ebddocx2table/mymodule.py

This file was deleted.

1 change: 1 addition & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ commands = python -m pip install --upgrade pip
deps =
-rrequirements.txt
pytest
pytest-datafiles
setenv = PYTHONPATH = {toxinidir}/src
commands = python -m pytest --basetemp={envtmpdir} {posargs}

Expand Down
22 changes: 22 additions & 0 deletions unittests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,25 @@
This file is here, because this allows for best de-coupling of tests and application/library logic.
Further reading: https://docs.pytest.org/en/6.2.x/goodpractices.html#tests-outside-application-code
"""
from pathlib import Path

from docx import Document # type:ignore[import]
from docx.table import Table # type:ignore[import]

import ebddocx2table


def get_document(datafiles, filename: str) -> Document:
    """
    Open the file called ``filename`` below ``datafiles`` via ebddocx2table.get_document and return the document.

    ``datafiles`` has to support the ``/`` operator with a Path on its right-hand side
    (presumably it is the pytest-datafiles fixture; verify against the calling tests).
    """
    return ebddocx2table.get_document(datafiles / Path(filename))


def get_ebd_docx_table(datafiles, filename: str, ebd_key: str) -> Table:
    """
    Look up the table for ``ebd_key`` in the file called ``filename`` below ``datafiles``.

    The work is delegated to ebddocx2table.get_ebd_docx_table. ``datafiles`` has to support the ``/`` operator
    with a Path on its right-hand side (presumably it is the pytest-datafiles fixture; verify against the
    calling tests).
    """
    docx_path = datafiles / Path(filename)
    return ebddocx2table.get_ebd_docx_table(docx_path, ebd_key=ebd_key)
51 changes: 51 additions & 0 deletions unittests/examples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""
This module contains the data as we expect it to be scraped from the docx file.
"""

from ebdtable2graph import EbdTable
from ebdtable2graph.models.ebd_table import EbdCheckResult, EbdTableMetaData, EbdTableRow, EbdTableSubRow

# E_0003 is pretty short
# https://www.entscheidungsbaumdiagramm.de/diagram?ebdKey=E_0003&formatVersion=FV2204
# The table consists of two steps; each step has one sub row per outcome (result=False first, then result=True).
table_e0003 = EbdTable(
    metadata=EbdTableMetaData(
        ebd_code="E_0003",
        chapter="7.39 AD: Bestellung der Aggregationsebene der Bilanzkreissummenzeitreihe auf Ebene der Regelzone",
        sub_chapter="7.39.1 E_0003_Bestellung der Aggregationsebene RZ prüfen",
        role="ÜNB",
    ),
    rows=[
        EbdTableRow(
            step_number="1",
            description="Erfolgt der Eingang der Bestellung fristgerecht?",
            sub_rows=[
                # negative outcome: no subsequent step but a result code and a note
                EbdTableSubRow(
                    check_result=EbdCheckResult(result=False, subsequent_step_number=None),
                    result_code="A01",
                    note="Fristüberschreitung",
                ),
                # positive outcome: continue with step 2
                EbdTableSubRow(
                    check_result=EbdCheckResult(result=True, subsequent_step_number="2"),
                    result_code=None,
                    note=None,
                ),
            ],
        ),
        EbdTableRow(
            step_number="2",
            description="Erfolgt die Bestellung zum Monatsersten 00:00 Uhr?",
            sub_rows=[
                # negative outcome: no subsequent step but a result code and a note
                EbdTableSubRow(
                    check_result=EbdCheckResult(result=False, subsequent_step_number=None),
                    result_code="A02",
                    note="Gewählter Zeitpunkt nicht zulässig",
                ),
                # positive outcome: the subsequent step is "Ende"
                EbdTableSubRow(
                    check_result=EbdCheckResult(result=True, subsequent_step_number="Ende"),
                    result_code=None,
                    note=None,
                ),
            ],
        ),
    ],
)
10 changes: 10 additions & 0 deletions unittests/test_data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Test Data (.docx)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

das ist natürlich nun sehr praktisch, dass du nur spezifische EBDs aus den docx ziehst.
mmmh ich überlege mal ob mir das auch gelingt im Kohlrahbi 👍


The `.docx` files in this directory are copied from edi-energy.de.
The files are used to automatically test the scraping logic.

The file [ebd20221128.docx](ebd20221128.docx) can be found on edi-energy.de under the title (link as of [2022-12-12](https://www.edi-energy.de/index.php?id=38&tx_bdew_bdew%5Buid%5D=1758&tx_bdew_bdew%5Baction%5D=download&tx_bdew_bdew%5Bcontroller%5D=Dokument&cHash=d148663456f1d71dc0c3f666849efa7a))

> Entscheidungsbaum-Diagramme und Codelisten - informatorische Lesefassung 3.2 Konsolidierte Lesefassung mit Fehlerkorrekturen Stand: 28.11.2022

The copyright for these files remains solely with EDI@Energy and the authors of the docx files.
Binary file added unittests/test_data/ebd20221128.docx
Binary file not shown.
Loading