Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 16 additions & 7 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from ebdtable2graph import convert_graph_to_plantuml, convert_table_to_graph
from ebdtable2graph.graphviz import convert_dot_to_svg_kroki, convert_graph_to_dot
from ebdtable2graph.models import EbdGraph, EbdTable
from ebdtable2graph.plantuml import GraphToComplexForPlantumlError

# pylint:disable=import-error
from ebddocx2table import TableNotFoundError, get_all_ebd_keys, get_ebd_docx_tables # type:ignore[import]
Expand Down Expand Up @@ -71,6 +72,7 @@ def _dump_json(json_path: Path, ebd_table: EbdTable) -> None:
multiple=True,
help="Choose which file you'd like to create",
)
# pylint:disable=too-many-locals, too-many-branches
def main(input_path: Path, output_path: Path, export_types: list[Literal["puml", "dot", "json", "svg"]]):
"""
A program to get a machine-readable version of the AHBs docx files published by edi@energy.
Expand Down Expand Up @@ -107,7 +109,7 @@ def main(input_path: Path, output_path: Path, export_types: list[Literal["puml",
try:
ebd_graph = convert_table_to_graph(ebd_table)
except Exception as graphing_error: # pylint:disable=broad-except
click.secho(f"Error while graphing {ebd_key}: {str(graphing_error)}; Skip!", fg="red")
click.secho(f"Error while graphing {ebd_key}: {str(graphing_error)}; Skip!", fg="yellow")
continue
if "puml" in export_types:
try:
Expand All @@ -116,12 +118,19 @@ def main(input_path: Path, output_path: Path, export_types: list[Literal["puml",
except AssertionError as assertion_error:
# https://github.com/Hochfrequenz/ebdtable2graph/issues/35
click.secho(str(assertion_error), fg="red")
if "dot" in export_types:
_dump_dot(output_path / Path(f"{ebd_key}.dot"), ebd_graph)
click.secho(f"💾 Successfully exported '{ebd_key}.dot'")
if "svg" in export_types:
_dump_svg(output_path / Path(f"{ebd_key}.svg"), ebd_graph)
click.secho(f"💾 Successfully exported '{ebd_key}.svg'")
except GraphToComplexForPlantumlError as too_complex_error:
click.secho(str(too_complex_error), fg="red")
try:
if "dot" in export_types:
_dump_dot(output_path / Path(f"{ebd_key}.dot"), ebd_graph)
click.secho(f"💾 Successfully exported '{ebd_key}.dot'")
if "svg" in export_types:
_dump_svg(output_path / Path(f"{ebd_key}.svg"), ebd_graph)
click.secho(f"💾 Successfully exported '{ebd_key}.svg'")
except AssertionError as assertion_error:
# e.g. AssertionError: If indegree > 1, the number of paths should always be greater than 1 too.
click.secho(str(assertion_error), fg="red")
# both the SVG and dot path require graphviz to work, hence the common error handling block

click.secho("🏁Finished")

Expand Down
74 changes: 65 additions & 9 deletions src/ebddocx2table/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import attrs
from docx import Document # type:ignore[import]
from docx.oxml import CT_P, CT_Tbl # type:ignore[import]
from docx.table import Table # type:ignore[import]
from docx.table import Table, _Cell # type:ignore[import]
from docx.text.paragraph import Paragraph # type:ignore[import]

_logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -62,6 +62,44 @@ def __init__(self, ebd_key: str):
self.ebd_key = ebd_key


_ebd_cell_pattern = re.compile(r"^(?:ja|nein)\s*(?:Ende|\d+)$")
"""
Any EBD table shall contain at least one cell that matches this pattern.
"""


def _cell_is_probably_from_an_ebd_cell(cell: "_Cell") -> bool:
    """
    Returns true iff the text of the given cell "looks like" it belongs to an EBD table.

    A cell qualifies if its text
    * contains the right arrow (either as private-use symbol font character or mis-encoded as "à"),
    * is exactly "ja" or "nein",
    * matches ``_ebd_cell_pattern`` (e.g. "ja 2" or "nein Ende") or
    * starts with "Cluster:" or "Hinweis:" (ignoring surrounding whitespace).
    """
    text = cell.text
    # NOTE(review): the arrow literal was an empty string here, which made this check match _every_ cell
    # (`"" in text` is always True). U+F0E0 is presumably the intended character: it is the private-use
    # code point of the symbol font right arrow whose mis-encoded form is the "à" (U+00E0) checked below.
    # Please confirm against the docx sources.
    if "\uf0e0" in text:
        return True
    if text in {"ja", "nein"}:
        return True
    if "à" in text:
        # the rightarrow in wrong encoding
        return True
    if _ebd_cell_pattern.match(text):
        return True
    # both prefixes are checked on the stripped text; previously only "Cluster:" tolerated leading whitespace
    return text.strip().startswith(("Cluster:", "Hinweis:"))


def _table_is_an_ebd_table(table: Table) -> bool:
    """
    Decides whether the given docx table "looks like" an EBD table.

    The subsection that describes an EBD may also contain tables which are not part of the decision tree
    (e.g. the tables about Artikel-IDs in E_0406). A table counts as EBD table as soon as one of its cells
    is recognized by ``_cell_is_probably_from_an_ebd_cell``.
    """
    for table_row in table.rows:
        try:
            if any(_cell_is_probably_from_an_ebd_cell(table_cell) for table_cell in table_row.cells):
                return True
        except IndexError:
            # raised somewhere inside python-docx while accessing the cells of some rows; skip such rows
            continue
    return False


# pylint:disable=too-many-branches
def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> List[Table]:
"""
Opens the file specified in docx_file_path and returns the tables that relate to the given ebd_key.
Expand All @@ -73,7 +111,7 @@ def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> List[Table]:
raise ValueError(f"The ebd_key '{ebd_key}' does not match {_ebd_key_pattern.pattern}")
document = get_document(docx_file_path)

next_table_is_requested_table: bool = False
is_inside_subsection_of_requested_table: bool = False
tables: List[Table] = []
tables_and_paragraphs = _get_tables_and_paragaphs(document)
for table_or_paragraph in tables_and_paragraphs:
Expand All @@ -82,8 +120,21 @@ def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> List[Table]:
# Assumptions:
# 1. before each EbdTable there is a paragraph whose text starts with the respective EBD key
# 2. there are no duplicates
next_table_is_requested_table = paragraph.text.startswith(ebd_key)
if isinstance(table_or_paragraph, Table) and next_table_is_requested_table:
is_inside_subsection_of_requested_table = (
paragraph.text.startswith(ebd_key) or is_inside_subsection_of_requested_table
)
if (
is_inside_subsection_of_requested_table
and paragraph.text.strip().startswith("Es ist das EBD")
and paragraph.text.strip().endswith("zu nutzen.")
):
# that's kind of a dirty hack. But it works.
break
if (
isinstance(table_or_paragraph, Table)
and is_inside_subsection_of_requested_table
and _table_is_an_ebd_table(table_or_paragraph)
):
table: Table = table_or_paragraph
tables.append(table)
# Now we have to check if the EBD table spans multiple pages and _maybe_ we have to collect more tables.
Expand All @@ -94,17 +145,22 @@ def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> List[Table]:
for next_item in tables_and_paragraphs: # start iterating from where the outer loop paused
if isinstance(next_item, Table):
# this is the case that the authors created multiple single tables on single adjacent pages
tables.append(next_item)
elif isinstance(next_item, Paragraph) and not next_item.text.strip():
# sometimes the authors add blank lines before they continue with the next table
# if table_is_an_ebd_table(table):
if _table_is_an_ebd_table(next_item):
tables.append(next_item)
elif isinstance(next_item, Paragraph):
if next_item.text.startswith("S_") or next_item.text.startswith("E_"):
# this is the case that the authors created 1 table that spans multiple pages
# and we're done collecting tables for this EBD key
break
continue
else:
break # inner loop because if no other table will follow
# we're done collecting the tables for this EBD key
if next_table_is_requested_table and len(tables) > 0: # this means: we found the table
if is_inside_subsection_of_requested_table and len(tables) > 0: # this means: we found the table
# break the outer loop, too; no need to iterate any further
break
if len(tables) == 0:
if not any(tables):
raise TableNotFoundError(ebd_key=ebd_key)
return tables

Expand Down
7 changes: 5 additions & 2 deletions src/ebddocx2table/docxtableconverter.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def _sort_columns_in_row(docx_table_row: _Row) -> Generator[_Cell, None, None]:
yield _Cell(table_column, docx_table_row.table)


_subsequent_step_pattern = re.compile(r"^(?P<bool>(?:ja)|(?:nein))\s*(?P<subsequent_step_number>(?:\d+\*?)|ende)?")
_subsequent_step_pattern = re.compile(r"^(?P<bool>(?:ja)|(?:nein))[\sà]*(?P<subsequent_step_number>(?:\d+\*?)|ende)?")

_step_number_pattern = re.compile(_STEP_NUMBER_REGEX)

Expand All @@ -42,7 +42,9 @@ def _get_index_of_first_column_with_step_number(cells: List[_Cell]) -> int:
"""
returns the index of the first cell in cells, that contains a step number
"""
first_step_number_cell = first_true(cells, pred=lambda cell: _step_number_pattern.match(cell.text) is not None)
first_step_number_cell = first_true(
cells, pred=lambda cell: _step_number_pattern.match(cell.text.strip()) is not None
)
step_number_column_index = cells.index(first_step_number_cell)
_logger.debug("The step number is in column %i", step_number_column_index)
return step_number_column_index
Expand Down Expand Up @@ -175,6 +177,7 @@ def __init__(self, docx_tables: List[Table], ebd_key: str, chapter: str, sub_cha
self._column_index_result_code = column_index
elif table_cell_text == "Hinweis":
self._column_index_note = column_index

self._metadata = EbdTableMetaData(ebd_code=ebd_key, sub_chapter=sub_chapter, chapter=chapter, role=role)

@staticmethod
Expand Down
51 changes: 51 additions & 0 deletions unittests/test_highlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,57 @@ def test_get_ebd_docx_table(self, datafiles, filename: str, ebd_key: str, expect
for table in actual:
assert isinstance(table, Table)

@pytest.mark.datafiles("unittests/test_data/ebd20230629_v34.docx")
@pytest.mark.parametrize(
    "filename, ebd_key",
    [
        pytest.param("ebd20230629_v34.docx", "E_0406", id="E_0406: EB-Table starts after pages of text"),
    ],
)
def test_finding_tables_positive(self, datafiles, filename: str, ebd_key: str):
    """
    The docx tables of the given EBD key must be found and be convertible to an EbdTable.
    """
    tables_of_ebd = get_ebd_docx_tables(datafiles, filename, ebd_key=ebd_key)
    # neither the lookup nor the conversion must throw a TableNotFoundError
    ebd_table = DocxTableConverter(
        tables_of_ebd, ebd_key=ebd_key, chapter="Dummy Chapter", sub_chapter="Dummy Subchapter"
    ).convert_docx_tables_to_ebd_table()
    assert isinstance(ebd_table, EbdTable)

@pytest.mark.datafiles("unittests/test_data/ebd20230629_v34.docx")
@pytest.mark.parametrize(
    "filename, ebd_key",
    [
        pytest.param("ebd20230629_v34.docx", "E_0561", id="Es ist das EBD E_0556 zu nutzen."),
    ],
)
def test_finding_tables_negative(self, datafiles, filename: str, ebd_key: str):
    """
    Looking up the tables of an EBD key without an own EBD table must raise a TableNotFoundError.

    Judging by the test id, the section of the parametrized key only refers to another EBD.
    """
    # Only the lookup is wrapped: it is the call that is expected to raise. Anything placed after it inside
    # the context manager would never be executed and would only blur which call raised the error.
    with pytest.raises(TableNotFoundError):
        get_ebd_docx_tables(datafiles, filename, ebd_key=ebd_key)

@pytest.mark.datafiles("unittests/test_data/ebd20230619_v34.docx")
@pytest.mark.parametrize(
    "filename, ebd_key, excepted_subsequent",
    [
        pytest.param("ebd20230619_v34.docx", "E_0012", "2"),
        pytest.param("ebd20230619_v34.docx", "E_0021", "Ende"),
    ],
)
def test_wrong_encoding_of_rightarrow(self, datafiles, filename: str, ebd_key: str, excepted_subsequent: str):
    """
    At least one sub row of the converted table must carry the expected subsequent step number.
    """
    # NOTE(review): "excepted_subsequent" is presumably a typo for "expected_subsequent"; the name is kept
    # because it has to stay in sync with the argnames of the parametrize decorator.
    docx_tables = get_ebd_docx_tables(datafiles, filename, ebd_key=ebd_key)
    converter = DocxTableConverter(
        docx_tables, ebd_key=ebd_key, chapter="Dummy Chapter", sub_chapter="Dummy Subchapter"
    )
    actual = converter.convert_docx_tables_to_ebd_table()
    # evaluate the comparison itself instead of relying on the truthiness of the matching sub row objects
    assert any(
        subrow.check_result.subsequent_step_number == excepted_subsequent
        for row in actual.rows
        for subrow in row.sub_rows
    )

@pytest.mark.datafiles("unittests/test_data/ebd20221128.docx")
@pytest.mark.parametrize(
"filename, ebd_key, chapter, sub_chapter, expected",
Expand Down