Hochfrequenz · hf-kklein · Mar 16, 2023 · Mar 16, 2023 · Mar 16, 2023 · Mar 16, 2023
diff --git a/main.py b/main.py
@@ -77,8 +77,8 @@ def main(input_path: Path, output_path: Path, export_types: list[Literal["puml",
         output_path.mkdir(parents=True)
         click.secho(f"Created a new directory at {output_path}", fg="yellow")
     all_ebd_keys = get_all_ebd_keys(input_path)
-    for ebd_key in all_ebd_keys:
-        click.secho(f"Processing EBD '{ebd_key}'")
+    for ebd_key, (ebd_title, ebd_kapitel) in all_ebd_keys.items():
+        click.secho(f"Processing EBD {ebd_kapitel} '{ebd_key}' ({ebd_title})")
         try:
             docx_tables = get_ebd_docx_tables(docx_file_path=input_path, ebd_key=ebd_key)
         except TableNotFoundError as table_not_found_error:

diff --git a/src/ebddocx2table/__init__.py b/src/ebddocx2table/__init__.py
@@ -1,12 +1,14 @@
 """
 Contains high level functions to process .docx files
 """
+import itertools
 import logging
 import re
 from io import BytesIO
 from pathlib import Path
-from typing import Dict, Generator, List, Union
+from typing import Dict, Generator, Iterable, List, Tuple, Union
 
+import attrs
 from docx import Document  # type:ignore[import]
 from docx.oxml import CT_P, CT_Tbl  # type:ignore[import]
 from docx.table import Table  # type:ignore[import]
@@ -107,21 +109,72 @@ def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> List[Table]:
     return tables
 
 
-def get_all_ebd_keys(docx_file_path: Path) -> Dict[str, str]:
+# pylint:disable=too-few-public-methods
+@attrs.define(kw_only=True, frozen=True)
+class EbdChapterInformation:
+    """
+    Describes the position of an EBD inside the document by the numbers of its heading.
+    A heading "5.2.1" is represented as:
+    * chapter 5
+    * section 2
+    * subsection 1
+    All three numbers have to be integers >= 1; instances are frozen and created with keyword arguments only.
+    """
+
+    # a list of validators is combined by attrs such that all of them have to pass
+    chapter: int = attrs.field(validator=[attrs.validators.instance_of(int), attrs.validators.ge(1)])
+    section: int = attrs.field(validator=[attrs.validators.instance_of(int), attrs.validators.ge(1)])
+    subsection: int = attrs.field(validator=[attrs.validators.instance_of(int), attrs.validators.ge(1)])
+
+    def __str__(self):
+        # dotted notation, e.g. "5.2.1"
+        return ".".join(f"{number}" for number in (self.chapter, self.section, self.subsection))
+
+
+def _enrich_paragraphs_with_sections(
+    paragraphs: Iterable[Paragraph],
+) -> Generator[Tuple[Paragraph, EbdChapterInformation], None, None]:
+    """
+    Yield each paragraph together with the "Kapitel" (chapter/section/subsection) in which it is found.
+
+    The numbering is derived from the paragraph style ids "berschrift1", "berschrift2" and "berschrift3"
+    (presumably the ids of the "Überschrift 1-3" heading styles of the document - TODO confirm):
+    each heading of a level increments the number of that level and restarts the numbering of all lower levels.
+    Levels for which no heading has been seen yet (since the last restart) are reported as 1.
+    The heading paragraphs themselves are yielded with the number they introduce.
+    """
+    chapter_counter = itertools.count(start=1)
+    chapter = 1
+    section_counter = itertools.count(start=1)
+    section = 1
+    subsection_counter = itertools.count(start=1)
+    subsection = 1
+    for paragraph in paragraphs:
+        match paragraph.style.style_id:
+            case "berschrift1":
+                chapter = next(chapter_counter)
+                # A new chapter restarts the numbering below it. Resetting only the counters is not enough:
+                # the current values have to be reset as well, otherwise paragraphs that follow directly after
+                # the chapter heading would still carry the section/subsection of the previous chapter.
+                section_counter = itertools.count(start=1)
+                section = 1
+                subsection_counter = itertools.count(start=1)
+                subsection = 1
+            case "berschrift2":
+                section = next(section_counter)
+                # same as above: do not leak the subsection number of the previous section
+                subsection_counter = itertools.count(start=1)
+                subsection = 1
+            case "berschrift3":
+                subsection = next(subsection_counter)
+        yield paragraph, EbdChapterInformation(chapter=chapter, section=section, subsection=subsection)
+
+
+def get_all_ebd_keys(docx_file_path: Path) -> Dict[str, Tuple[str, EbdChapterInformation]]:
     """
     Extract all EBD keys from the given file.
-    Returns a dictionary with all EBD keys as keys and the respective EBD titles as values.
-    E.g. key: "E_0003", value: "Bestellung der Aggregationsebene RZ prüfen"
+    Returns a dictionary with all EBD keys as keys and tuples of the respective EBD title and the
+    chapter information of the paragraph in which the key has been found as values.
+    E.g. key: "E_0003", value: ("Bestellung der Aggregationsebene RZ prüfen", EbdChapterInformation(...))
+    If the same EBD key is found more than once, the last occurrence wins.
     """
     document = get_document(docx_file_path)
-    result: Dict[str, str] = {}
-    for paragraph in document.paragraphs:
+    result: Dict[str, Tuple[str, EbdChapterInformation]] = {}
+    for paragraph, ebd_kapitel in _enrich_paragraphs_with_sections(document.paragraphs):
         match = _ebd_key_with_heading_pattern.match(paragraph.text)
+        # paragraphs whose text does not match the EBD heading pattern are skipped
         if match is None:
             continue
         ebd_key = match.groupdict()["key"]
         title = match.groupdict()["title"]
-        result[ebd_key] = title
-        _logger.debug("Found EBD %s: '%s'", ebd_key, title)
+        result[ebd_key] = (title, ebd_kapitel)
+        _logger.debug("Found EBD %s: '%s' (%s)", ebd_key, title, ebd_kapitel)
     _logger.info("%i EBD keys have been found", len(result))
     return result
diff --git a/unittests/__init__.py b/unittests/__init__.py
@@ -3,7 +3,7 @@
 Further reading: https://docs.pytest.org/en/6.2.x/goodpractices.html#tests-outside-application-code
 """
 from pathlib import Path
-from typing import Dict, List
+from typing import Dict, List, Tuple
 
 from docx import Document  # type:ignore[import]
 from docx.table import Table  # type:ignore[import]
@@ -27,7 +27,7 @@ def get_ebd_docx_tables(datafiles, filename: str, ebd_key: str) -> List[Table]:
     return ebddocx2table.get_ebd_docx_tables(path, ebd_key=ebd_key)
 
 
-def get_all_ebd_keys(datafiles, filename: str) -> Dict[str, str]:
+def get_all_ebd_keys(datafiles, filename: str) -> Dict[str, Tuple[str, ebddocx2table.EbdChapterInformation]]:
     """
     a datafiles compatible wrapper around ebddocx2table.get_all_ebd_keys
     """

diff --git a/unittests/test_highlevel.py b/unittests/test_highlevel.py
@@ -4,7 +4,7 @@
 from docx.table import Table  # type:ignore[import]
 from ebdtable2graph.models import EbdTable
 
-from ebddocx2table import TableNotFoundError
+from ebddocx2table import EbdChapterInformation, TableNotFoundError
 from ebddocx2table.docxtableconverter import DocxTableConverter
 
 from . import get_all_ebd_keys, get_document, get_ebd_docx_tables
@@ -31,12 +31,30 @@ def test_can_read_document(self, datafiles, filename: str):
 
     @pytest.mark.datafiles("unittests/test_data/ebd20221128.docx")
     @pytest.mark.parametrize(
-        "filename,expected_length",
-        [pytest.param("ebd20221128.docx", 241)],
+        "filename,expected_length,expected_entries",
+        [
+            pytest.param(
+                "ebd20221128.docx",
+                241,
+                [
+                    # spot checks ("Stichproben") only, this is not an exhaustive list of all entries
+                    ("Kündigung Stromliefervertrag prüfen", EbdChapterInformation(chapter=6, section=1, subsection=1)),
+                    ("MaBiS-ZP Aktivierung prüfen", EbdChapterInformation(chapter=7, section=2, subsection=1)),
+                    (
+                        "Datenstatus nach Eingang einer AAÜZ vergeben",
+                        EbdChapterInformation(chapter=7, section=61, subsection=2),
+                    ),
+                ],
+            )
+        ],
     )
-    def test_get_ebd_keys(self, datafiles, filename: str, expected_length: int):
+    def test_get_ebd_keys(
+        self, datafiles, filename: str, expected_length: int, expected_entries: List[Tuple[str, EbdChapterInformation]]
+    ):
+        """
+        Checks that the expected number of EBD keys is found in the file and that each of the expected
+        (title, chapter information) entries is among the values of the result.
+        """
         actual = get_all_ebd_keys(datafiles, filename)
         assert len(actual) == expected_length  # arbitrary, didn't check if these are really _all_ the keys
+        for expected_entry in expected_entries:
+            assert expected_entry in actual.values()
 
     @pytest.mark.datafiles("unittests/test_data/ebd20221128.docx")
     @pytest.mark.parametrize(