Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ def main(input_path: Path, output_path: Path, export_types: list[Literal["puml",
output_path.mkdir(parents=True)
click.secho(f"Created a new directory at {output_path}", fg="yellow")
all_ebd_keys = get_all_ebd_keys(input_path)
for ebd_key in all_ebd_keys:
click.secho(f"Processing EBD '{ebd_key}'")
for ebd_key, (ebd_title, ebd_kapitel) in all_ebd_keys.items():
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Braucht es die Klammern um `ebd_title, ebd_kapitel`?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sonst hätte PyCharm oder black sie entfernt. Ich denke, es liegt daran, dass ein Item aus dem Dictionary ein Tuple[Key, Value] ist und Value seinerseits ein Tuple[titel, kapitel].

click.secho(f"Processing EBD {ebd_kapitel} '{ebd_key}' ({ebd_title})")
try:
docx_tables = get_ebd_docx_tables(docx_file_path=input_path, ebd_key=ebd_key)
except TableNotFoundError as table_not_found_error:
Expand Down
65 changes: 59 additions & 6 deletions src/ebddocx2table/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
"""
Contains high level functions to process .docx files
"""
import itertools
import logging
import re
from io import BytesIO
from pathlib import Path
from typing import Dict, Generator, List, Union
from typing import Dict, Generator, Iterable, List, Tuple, Union

import attrs
from docx import Document # type:ignore[import]
from docx.oxml import CT_P, CT_Tbl # type:ignore[import]
from docx.table import Table # type:ignore[import]
Expand Down Expand Up @@ -107,21 +109,72 @@ def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> List[Table]:
return tables


def get_all_ebd_keys(docx_file_path: Path) -> Dict[str, str]:
# Shared by all three numbering fields: the value must be an int and at least 1.
_POSITIVE_INT = attrs.validators.and_(attrs.validators.instance_of(int), attrs.validators.ge(1))


# pylint:disable=too-few-public-methods
@attrs.define(kw_only=True, frozen=True)
class EbdChapterInformation:
    """
    Describes the position of an EBD inside the document by means of its heading number.
    A heading numbered "5.2.1" is represented as chapter=5, section=2, subsection=1.
    Rendering an instance with str() gives the dotted form again, e.g. "5.2.1".
    """

    # first number of the heading
    chapter: int = attrs.field(validator=_POSITIVE_INT)
    # second number of the heading
    section: int = attrs.field(validator=_POSITIVE_INT)
    # third number of the heading
    subsection: int = attrs.field(validator=_POSITIVE_INT)

    def __str__(self):
        return f"{self.chapter}.{self.section}.{self.subsection}"


def _enrich_paragraphs_with_sections(
    paragraphs: Iterable[Paragraph],
) -> Generator[Tuple[Paragraph, EbdChapterInformation], None, None]:
    """
    Pair every paragraph with the chapter/section/subsection numbering that is current when it is reached.

    The numbering is derived only from the heading style ids seen so far while iterating ``paragraphs``;
    every paragraph (headings included) is yielded exactly once, in the original order.
    """
    # Values reported for the paragraphs; all start at 1, so paragraphs in front of the first heading get 1.1.1.
    current_chapter = current_section = current_subsection = 1
    # Number that the next heading of the respective level is going to receive.
    upcoming_chapter = upcoming_section = upcoming_subsection = 1
    for paragraph in paragraphs:
        style_id = paragraph.style.style_id
        # Check the tests: the German "ü" is actually discarded from the style ids ("berschrift1", not "Ü...").
        if style_id == "berschrift1":
            current_chapter = upcoming_chapter
            upcoming_chapter += 1
            # A new chapter restarts the numbering of the levels below it.
            # NOTE(review): only the numbering restarts here; current_section/current_subsection keep their
            # previous values until the next heading of that level is seen - confirm this is intended.
            upcoming_section = 1
            upcoming_subsection = 1
        elif style_id == "berschrift2":
            current_section = upcoming_section
            upcoming_section += 1
            # A new section restarts the subsection numbering.
            upcoming_subsection = 1
        elif style_id == "berschrift3":
            current_subsection = upcoming_subsection
            upcoming_subsection += 1
        yield paragraph, EbdChapterInformation(
            chapter=current_chapter, section=current_section, subsection=current_subsection
        )


def get_all_ebd_keys(docx_file_path: Path) -> Dict[str, Tuple[str, EbdChapterInformation]]:
    """
    Extract all EBD keys from the given file.

    Returns a dictionary that maps each EBD key to a tuple of
    (EBD title, chapter information of the paragraph in which the key was found).
    E.g. key: "E_0003", value: ("Bestellung der Aggregationsebene RZ prüfen", <EbdChapterInformation>)

    If the same EBD key matches in more than one paragraph, the last occurrence wins.
    """
    document = get_document(docx_file_path)
    result: Dict[str, Tuple[str, EbdChapterInformation]] = {}
    for paragraph, ebd_kapitel in _enrich_paragraphs_with_sections(document.paragraphs):
        heading_match = _ebd_key_with_heading_pattern.match(paragraph.text)
        if heading_match is None:
            # this paragraph is not an EBD heading
            continue
        named_groups = heading_match.groupdict()
        ebd_key = named_groups["key"]
        title = named_groups["title"]
        result[ebd_key] = (title, ebd_kapitel)
        _logger.debug("Found EBD %s: '%s' (%s)", ebd_key, title, ebd_kapitel)
    _logger.info("%i EBD keys have been found", len(result))
    return result
4 changes: 2 additions & 2 deletions unittests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Further reading: https://docs.pytest.org/en/6.2.x/goodpractices.html#tests-outside-application-code
"""
from pathlib import Path
from typing import Dict, List
from typing import Dict, List, Tuple

from docx import Document # type:ignore[import]
from docx.table import Table # type:ignore[import]
Expand All @@ -27,7 +27,7 @@ def get_ebd_docx_tables(datafiles, filename: str, ebd_key: str) -> List[Table]:
return ebddocx2table.get_ebd_docx_tables(path, ebd_key=ebd_key)


def get_all_ebd_keys(datafiles, filename: str) -> Dict[str, str]:
def get_all_ebd_keys(datafiles, filename: str) -> Dict[str, Tuple[str, ebddocx2table.EbdChapterInformation]]:
"""
a datafiles compatible wrapper around ebddocx2table.get_all_ebd_keys
"""
Expand Down
26 changes: 22 additions & 4 deletions unittests/test_highlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from docx.table import Table # type:ignore[import]
from ebdtable2graph.models import EbdTable

from ebddocx2table import TableNotFoundError
from ebddocx2table import EbdChapterInformation, TableNotFoundError
from ebddocx2table.docxtableconverter import DocxTableConverter

from . import get_all_ebd_keys, get_document, get_ebd_docx_tables
Expand All @@ -31,12 +31,30 @@ def test_can_read_document(self, datafiles, filename: str):

@pytest.mark.datafiles("unittests/test_data/ebd20221128.docx")
@pytest.mark.parametrize(
    "filename,expected_length,expected_entries",
    [
        pytest.param(
            "ebd20221128.docx",
            241,
            [
                # Spot checks only, not an exhaustive list.
                # These three entries can be verified by comparing them with the .docx from the unit test data.
                ("Kündigung Stromliefervertrag prüfen", EbdChapterInformation(chapter=6, section=1, subsection=1)),
                ("MaBiS-ZP Aktivierung prüfen", EbdChapterInformation(chapter=7, section=2, subsection=1)),
                (
                    "Datenstatus nach Eingang einer AAÜZ vergeben",
                    EbdChapterInformation(chapter=7, section=61, subsection=2),
                ),
            ],
        )
    ],
)
def test_get_ebd_keys(
    self, datafiles, filename: str, expected_length: int, expected_entries: List[Tuple[str, EbdChapterInformation]]
):
    """
    The number of EBD keys found in the file matches the expectation and
    every expected (title, chapter information) pair occurs among the found values.
    """
    found_keys = get_all_ebd_keys(datafiles, filename)
    # The expected length is arbitrary: nobody verified that these are really _all_ the keys.
    assert len(found_keys) == expected_length
    found_values = found_keys.values()
    for title_and_chapter in expected_entries:
        assert title_and_chapter in found_values

@pytest.mark.datafiles("unittests/test_data/ebd20221128.docx")
@pytest.mark.parametrize(
Expand Down