Hochfrequenz · hf-kklein · Dec 13, 2022 · Dec 13, 2022 · Dec 13, 2022 · Dec 13, 2022
diff --git a/src/ebddocx2table/__init__.py b/src/ebddocx2table/__init__.py
@@ -4,7 +4,7 @@
 import re
 from io import BytesIO
 from pathlib import Path
-from typing import Generator, Union
+from typing import Dict, Generator, Union
 
 from docx import Document  # type:ignore[import]
 from docx.oxml import CT_P, CT_Tbl  # type:ignore[import]
@@ -43,7 +43,8 @@ def _get_tables_and_paragaphs(document: Document) -> Generator[Union[Table, Para
             yield Table(item, document)
 
 
-_ebd_key_pattern = re.compile(r"^[SE]_\d{4}$")
+_ebd_key_pattern = re.compile(r"^E_\d{4}$")  # a bare EBD key: "E_" followed by exactly four digits
+_ebd_key_with_heading_pattern = re.compile(r"^(?P<key>E_\d{4})_(?P<title>.*)\s*$")  # "<key>_<title>" text
 
 
 def get_ebd_docx_table(docx_file_path: Path, ebd_key: str) -> Table:
@@ -66,3 +67,21 @@ def get_ebd_docx_table(docx_file_path: Path, ebd_key: str) -> Table:
             table: Table = table_or_paragraph
             return table
     raise ValueError(f"EBD Table '{ebd_key}' was not found.")
+
+
+def get_all_ebd_keys(docx_file_path: Path) -> Dict[str, str]:
+    """
+    Extract all EBD keys from the given file.
+    Returns a dictionary with all EBD keys as keys and the respective EBD titles as values.
+    E.g. key: "E_0003", value: "Bestellung der Aggregationsebene RZ prüfen"
+    Titles are stripped of surrounding whitespace; for a repeated key the last paragraph wins.
+    """
+    document = get_document(docx_file_path)
+    result: Dict[str, str] = {}
+    for paragraph in document.paragraphs:
+        match = _ebd_key_with_heading_pattern.match(paragraph.text)
+        if match is None:
+            continue
+        # the greedy `.*` of the title group swallows trailing blanks, so strip them here
+        result[match.group("key")] = match.group("title").strip()
+    return result
diff --git a/unittests/__init__.py b/unittests/__init__.py
@@ -3,6 +3,7 @@
 Further reading: https://docs.pytest.org/en/6.2.x/goodpractices.html#tests-outside-application-code
 """
 from pathlib import Path
+from typing import Dict
 
 from docx import Document  # type:ignore[import]
 from docx.table import Table  # type:ignore[import]
@@ -24,3 +25,11 @@ def get_ebd_docx_table(datafiles, filename: str, ebd_key: str) -> Table:
     """
     path = datafiles / Path(filename)
     return ebddocx2table.get_ebd_docx_table(path, ebd_key=ebd_key)
+
+
+def get_all_ebd_keys(datafiles, filename: str) -> Dict[str, str]:
+    """
+    joins filename onto datafiles and forwards the resulting path to ebddocx2table.get_all_ebd_keys
+    """
+    docx_file_path = datafiles / Path(filename)
+    return ebddocx2table.get_all_ebd_keys(docx_file_path)
diff --git a/unittests/test_highlevel.py b/unittests/test_highlevel.py
@@ -3,7 +3,7 @@
 
 from ebddocx2table.docxtableconverter import DocxTableConverter
 
-from . import get_document, get_ebd_docx_table
+from . import get_all_ebd_keys, get_document, get_ebd_docx_table
 from .examples import table_e0003
 
 
@@ -18,6 +18,15 @@ def test_can_read_document(self, datafiles, filename: str):
         actual = get_document(datafiles, filename)
         assert actual is not None
 
+    @pytest.mark.datafiles("unittests/test_data/ebd20221128.docx")
+    @pytest.mark.parametrize(
+        "filename,expected_length",
+        [pytest.param("ebd20221128.docx", 241)],
+    )
+    def test_get_ebd_keys(self, datafiles, filename: str, expected_length: int):
+        # the expected length is arbitrary: nobody checked that these are really _all_ the keys
+        assert len(get_all_ebd_keys(datafiles, filename)) == expected_length
+
     @pytest.mark.datafiles("unittests/test_data/ebd20221128.docx")
     @pytest.mark.parametrize(
         "filename, ebd_key",