diff --git a/src/ebddocx2table/__init__.py b/src/ebddocx2table/__init__.py index 8f54c74..07dbc30 100644 --- a/src/ebddocx2table/__init__.py +++ b/src/ebddocx2table/__init__.py @@ -4,7 +4,7 @@ import re from io import BytesIO from pathlib import Path -from typing import Generator, Union +from typing import Dict, Generator, Union from docx import Document # type:ignore[import] from docx.oxml import CT_P, CT_Tbl # type:ignore[import] @@ -43,7 +43,8 @@ def _get_tables_and_paragaphs(document: Document) -> Generator[Union[Table, Para yield Table(item, document) -_ebd_key_pattern = re.compile(r"^[SE]_\d{4}$") +_ebd_key_pattern = re.compile(r"^E_\d{4}$") +_ebd_key_with_heading_pattern = re.compile(r"^(?P<key>E_\d{4})_(?P<title>.*)\s*$") def get_ebd_docx_table(docx_file_path: Path, ebd_key: str) -> Table: @@ -66,3 +67,21 @@ def get_ebd_docx_table(docx_file_path: Path, ebd_key: str) -> Table: table: Table = table_or_paragraph return table raise ValueError(f"EBD Table '{ebd_key}' was not found.") + + +def get_all_ebd_keys(docx_file_path: Path) -> Dict[str, str]: + """ + Extract all EBD keys from the given file. + Returns a dictionary with all EBD keys as keys and the respective EBD titles as values. + E.g. 
key: "E_0003", value: "Bestellung der Aggregationsebene RZ prüfen" + """ + document = get_document(docx_file_path) + result: Dict[str, str] = {} + for paragraph in document.paragraphs: + match = _ebd_key_with_heading_pattern.match(paragraph.text) + if match is None: + continue + ebd_key = match.groupdict()["key"] + title = match.groupdict()["title"] + result[ebd_key] = title + return result diff --git a/unittests/__init__.py b/unittests/__init__.py index 15c6e07..21be04e 100644 --- a/unittests/__init__.py +++ b/unittests/__init__.py @@ -3,6 +3,7 @@ Further reading: https://docs.pytest.org/en/6.2.x/goodpractices.html#tests-outside-application-code """ from pathlib import Path +from typing import Dict from docx import Document # type:ignore[import] from docx.table import Table # type:ignore[import] @@ -24,3 +25,11 @@ def get_ebd_docx_table(datafiles, filename: str, ebd_key: str) -> Table: """ path = datafiles / Path(filename) return ebddocx2table.get_ebd_docx_table(path, ebd_key=ebd_key) + + +def get_all_ebd_keys(datafiles, filename: str) -> Dict[str, str]: + """ + a datafiles compatible wrapper around ebddocx2table.get_all_ebd_keys + """ + path = datafiles / Path(filename) + return ebddocx2table.get_all_ebd_keys(path) diff --git a/unittests/test_highlevel.py b/unittests/test_highlevel.py index d336dc2..8bfd688 100644 --- a/unittests/test_highlevel.py +++ b/unittests/test_highlevel.py @@ -3,7 +3,7 @@ from ebddocx2table.docxtableconverter import DocxTableConverter -from . import get_document, get_ebd_docx_table +from . 
import get_all_ebd_keys, get_document, get_ebd_docx_table from .examples import table_e0003 @@ -18,6 +18,15 @@ def test_can_read_document(self, datafiles, filename: str): actual = get_document(datafiles, filename) assert actual is not None + @pytest.mark.datafiles("unittests/test_data/ebd20221128.docx") + @pytest.mark.parametrize( + "filename,expected_length", + [pytest.param("ebd20221128.docx", 241)], + ) + def test_get_ebd_keys(self, datafiles, filename: str, expected_length: int): + actual = get_all_ebd_keys(datafiles, filename) + assert len(actual) == expected_length # arbitrary, didn't check if these are really _all_ the keys + @pytest.mark.datafiles("unittests/test_data/ebd20221128.docx") @pytest.mark.parametrize( "filename, ebd_key",