Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions src/ebddocx2table/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import re
from io import BytesIO
from pathlib import Path
from typing import Generator, Union
from typing import Dict, Generator, Union

from docx import Document # type:ignore[import]
from docx.oxml import CT_P, CT_Tbl # type:ignore[import]
Expand Down Expand Up @@ -43,7 +43,8 @@ def _get_tables_and_paragaphs(document: Document) -> Generator[Union[Table, Para
yield Table(item, document)


_ebd_key_pattern = re.compile(r"^[SE]_\d{4}$")
# Matches a bare EBD key: "E_" followed by exactly four digits, e.g. "E_0003".
# NOTE(review): per the PR discussion, keys with other prefixes (e.g. "S_") belong to code lists
# that live in the same document and are not EBDs, hence only "E" is accepted here.
_ebd_key_pattern = re.compile(r"^E_\d{4}$")
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gibt es einen Grund, warum vorher auch S möglich war und jetzt nicht mehr? ^^

Copy link
Copy Markdown
Contributor Author

@hf-kklein hf-kklein Dec 13, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Weil ich beim ersten Mal fälschlicherweise auch die Codelisten gematcht habe, die mit S(trom) oder G(as) beginnen und im gleichen Dokument rumschwirren.
Nur E ist EBD.

# Matches paragraphs of the form "<EBD key>_<title>", e.g. "E_0003_Bestellung der Aggregationsebene RZ prüfen".
# The title group is lazy on purpose: a greedy ".*" would swallow trailing whitespace,
# leaving nothing for the following "\s*" and returning titles with trailing blanks.
_ebd_key_with_heading_pattern = re.compile(r"^(?P<key>E_\d{4})_(?P<title>.*?)\s*$")


def get_ebd_docx_table(docx_file_path: Path, ebd_key: str) -> Table:
Expand All @@ -66,3 +67,21 @@ def get_ebd_docx_table(docx_file_path: Path, ebd_key: str) -> Table:
table: Table = table_or_paragraph
return table
raise ValueError(f"EBD Table '{ebd_key}' was not found.")


def get_all_ebd_keys(docx_file_path: Path) -> Dict[str, str]:
    """
    Collect all EBD keys and their titles from the paragraphs of the given docx file.
    The result maps each EBD key to the title found in the same paragraph,
    e.g. "E_0003" is mapped to "Bestellung der Aggregationsebene RZ prüfen".
    If the same key is found in more than one paragraph, the title of the last occurrence is kept.
    """
    heading_matches = (
        _ebd_key_with_heading_pattern.match(paragraph.text) for paragraph in get_document(docx_file_path).paragraphs
    )
    # paragraphs that do not have the shape "<key>_<title>" produce no match and are left out
    return {
        heading_match.group("key"): heading_match.group("title")
        for heading_match in heading_matches
        if heading_match is not None
    }
9 changes: 9 additions & 0 deletions unittests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
Further reading: https://docs.pytest.org/en/6.2.x/goodpractices.html#tests-outside-application-code
"""
from pathlib import Path
from typing import Dict

from docx import Document # type:ignore[import]
from docx.table import Table # type:ignore[import]
Expand All @@ -24,3 +25,11 @@ def get_ebd_docx_table(datafiles, filename: str, ebd_key: str) -> Table:
"""
path = datafiles / Path(filename)
return ebddocx2table.get_ebd_docx_table(path, ebd_key=ebd_key)


def get_all_ebd_keys(datafiles, filename: str) -> Dict[str, str]:
    """
    Resolve the file name relative to the datafiles directory and
    forward the resulting path to ebddocx2table.get_all_ebd_keys.
    """
    return ebddocx2table.get_all_ebd_keys(datafiles / Path(filename))
11 changes: 10 additions & 1 deletion unittests/test_highlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from ebddocx2table.docxtableconverter import DocxTableConverter

from . import get_document, get_ebd_docx_table
from . import get_all_ebd_keys, get_document, get_ebd_docx_table
from .examples import table_e0003


Expand All @@ -18,6 +18,15 @@ def test_can_read_document(self, datafiles, filename: str):
actual = get_document(datafiles, filename)
assert actual is not None

@pytest.mark.datafiles("unittests/test_data/ebd20221128.docx")
@pytest.mark.parametrize(
    "filename,expected_length",
    [pytest.param("ebd20221128.docx", 241)],
)
def test_get_ebd_keys(self, datafiles, filename: str, expected_length: int):
    """
    Check that the number of EBD keys extracted from the test document equals the expected count.
    """
    ebd_keys = get_all_ebd_keys(datafiles, filename)
    number_of_keys = len(ebd_keys)
    # The expected count is arbitrary: nobody verified that these are really _all_ the keys.
    assert number_of_keys == expected_length
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hab das mal mit dem Inhaltsverzeichnis grob überschlagen und komme auf ~260 E's. Kommt also ungefähr hin.


@pytest.mark.datafiles("unittests/test_data/ebd20221128.docx")
@pytest.mark.parametrize(
"filename, ebd_key",
Expand Down