Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 70 additions & 0 deletions docs/plans/2025-12-14-extract-docx-utils-design.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Extract Private Helpers to `_docx_utils.py`

## Goal

Reduce `__init__.py` bloat by moving private helper functions and constants to a dedicated internal module.

## New Module: `_docx_utils.py`

### Functions (no underscore prefix - module is already private)

| From `__init__.py` | To `_docx_utils.py` |
|--------------------|---------------------|
| `_get_tables_and_paragraphs()` | `get_tables_and_paragraphs()` |
| `_cell_is_probably_from_an_ebd_cell()` | `cell_is_probably_from_an_ebd_cell()` |
| `_table_is_an_ebd_table()` | `table_is_an_ebd_table()` |
| `_table_is_first_ebd_table()` | `table_is_first_ebd_table()` |
| `_enrich_paragraphs_with_sections()` | `enrich_paragraphs_with_sections()` |
| `is_heading()` | `is_heading()` |

### Constants (UPPER_CASE per PEP8)

| From `__init__.py` | To `_docx_utils.py` |
|--------------------|---------------------|
| `_ebd_key_pattern` | `EBD_KEY_PATTERN` |
| `_ebd_key_with_heading_pattern` | `EBD_KEY_WITH_HEADING_PATTERN` |
| `_ebd_cell_pattern` | `EBD_CELL_PATTERN` |
| `_DOCX_ARROW_CHAR` | `DOCX_ARROW_CHAR` |

## Updated `__init__.py`

### Stays in `__init__.py`

- `get_document()` - public API
- `get_ebd_docx_tables()` - public API
- `get_all_ebd_keys()` - public API
- `_logger` - module-specific logger
- `_is_python_version_314` / `_is_manually_triggered_garbage_collection_required` - Python 3.14 GC workaround

### Updated `__all__`

```python
__all__ = [
"EbdTableNotConvertibleError",
"StepNumberNotFoundError",
"TableNotFoundError",
"EbdChapterInformation",
"EbdNoTableSection",
"get_all_ebd_keys",
"get_document",
"get_ebd_docx_tables",
]
```

Note: `is_heading` removed from public API (now internal).

## File Structure After

```
src/ebdamame/
├── __init__.py # ~120 lines (was ~330)
├── _docx_utils.py # ~150 lines (NEW)
Copy link

Copilot AI Dec 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The file size stated here is inaccurate. The actual size of _docx_utils.py is 152 lines (not ~120 lines). Consider updating to match the actual file size.

Suggested change
├── _docx_utils.py # ~120 lines (NEW)
├── _docx_utils.py # ~152 lines (NEW)

Copilot uses AI. Check for mistakes.
├── exceptions.py
├── models.py
└── docxtableconverter.py
```

## Breaking Changes

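- `is_heading()` is no longer listed in `__all__` and therefore no longer part of the public API. It is still imported into `__init__.py` from `_docx_utils`, so `ebdamame.is_heading` keeps resolving, but `from ebdamame import *` no longer provides it.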
- No other breaking changes - all other public API unchanged
164 changes: 18 additions & 146 deletions src/ebdamame/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,25 @@
"""

import gc
import itertools
import logging
import re
import sys
from io import BytesIO
from pathlib import Path
from typing import Generator, Iterable, Optional, Union

import docx
from docx.document import Document as DocumentType
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import Table, _Cell
from docx.table import Table
from docx.text.paragraph import Paragraph

from ._docx_utils import (
EBD_KEY_PATTERN,
EBD_KEY_WITH_HEADING_PATTERN,
enrich_paragraphs_with_sections,
get_tables_and_paragraphs,
is_heading,
table_is_an_ebd_table,
table_is_first_ebd_table,
)
from .exceptions import EbdTableNotConvertibleError, StepNumberNotFoundError, TableNotFoundError
from .models import EbdChapterInformation, EbdNoTableSection

Expand All @@ -33,7 +37,6 @@
"get_all_ebd_keys",
"get_document",
"get_ebd_docx_tables",
"is_heading",
]

_logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -72,93 +75,7 @@ def get_document(docx_file_path: Path) -> DocumentType:
source_stream.close()


def _get_tables_and_paragraphs(document: DocumentType) -> Generator[Union[Table, Paragraph], None, None]:
    """
    Walk the children of the document body and yield each table and paragraph in document order.

    ``document.tables`` and ``document.paragraphs`` are two independent collections, so on their own
    they cannot tell which paragraph follows which table. Iterating the body element keeps the
    relative order of both kinds of items.
    """
    for body_child in document.element.body.iterchildren():
        if isinstance(body_child, CT_Tbl):
            yield Table(body_child, document)
            continue
        if isinstance(body_child, CT_P):
            yield Paragraph(body_child, document)
            continue
        # every other kind of child element is only logged and otherwise skipped
        _logger.debug("Item %s is neither Paragraph nor Table", str(body_child))


# Matches a bare EBD key: "E_" followed by exactly four digits and nothing else.
_ebd_key_pattern = re.compile(r"^E_\d{4}$")
# Matches text that starts with an EBD key, optionally followed by an underscore and a title.
# The named groups "key" and "title" expose the two parts of the match.
_ebd_key_with_heading_pattern = re.compile(r"^(?P<key>E_\d{4})_?(?P<title>.*)\s*$")


# Matches "ja" or "nein", optional whitespace, and then either "Ende" or a number.
_ebd_cell_pattern = re.compile(r"^(?:ja|nein)\s*(?:Ende|\d+)$")
"""
any EBD table shall contain at least one cell that matches this pattern
"""

# Written as an escape sequence because the character itself is from the Private Use Area.
_DOCX_ARROW_CHAR = "\uf0e0"
"""
U+F0E0: Private Use Area character representing a right arrow in DOCX documents.
This character is used by MS Word to render arrows (e.g., "ja → 5") in EBD tables.
It appears in cells like "ja 5" to indicate the subsequent step number.
"""


def _cell_is_probably_from_an_ebd_cell(cell: _Cell) -> bool:
    """
    Heuristically decide whether a table cell belongs to an EBD decision table.

    The cell qualifies if its text

    * contains the DOCX arrow character (``_DOCX_ARROW_CHAR``) or "à",
    * is exactly "ja" or "nein",
    * matches ``_ebd_cell_pattern``, or
    * starts with "Cluster:" or "Hinweis:" once surrounding whitespace is stripped.

    Args:
        cell: the table cell to inspect; only its ``text`` is read.

    Returns:
        True if at least one of the criteria above applies, False otherwise.
    """
    # ``text`` is a property of the cell; read it once instead of once per check
    text = cell.text
    if _DOCX_ARROW_CHAR in text:
        return True
    if text in {"ja", "nein"}:
        return True
    if "à" in text:
        # the rightarrow in wrong encoding
        return True
    if _ebd_cell_pattern.match(text):
        return True
    # Both prefixes are checked against the stripped text. Previously only "Cluster:" tolerated
    # leading whitespace while "Hinweis:" did not, which was inconsistent.
    return text.strip().startswith(("Cluster:", "Hinweis:"))


def _table_is_an_ebd_table(table: Table) -> bool:
    """
    Tell whether the table "looks like" an EBD table.

    A subsection that describes an EBD may also contain tables that are not part of the decision tree
    at all (e.g. in E_0406 the tables about Artikel-IDs). A table is accepted if it is the first EBD
    table or if at least one of its cells looks like a cell of an EBD table.
    """
    if _table_is_first_ebd_table(table):
        return True
    for table_row in table.rows:
        try:
            if any(_cell_is_probably_from_an_ebd_cell(row_cell) for row_cell in table_row.cells):
                return True
        except IndexError:
            # raised from within python-docx for some rows; such a row is skipped
            continue
    return False


def _table_is_first_ebd_table(table: Table) -> bool:
    """
    Returns true if the first cell of the first row of a table contains "Prüfende Rolle".

    We assume that each EBD table has a header row with "Prüfende Rolle" in the first column.
    The comparison is case-insensitive.

    Args:
        table: the table to inspect.

    Returns:
        True if the first cell of the first row contains "Prüfende Rolle"; False otherwise,
        including the case that the table has no first row or no first cell.
    """
    try:
        first_cell_text = table.rows[0].cells[0].text
    except IndexError:
        # The table has no rows or no cells, or the cells of the first row cannot be resolved.
        # Such a table cannot be the first EBD table; answer False instead of crashing the caller.
        return False
    return "prüfende rolle" in first_cell_text.lower()


# pylint:disable=too-many-branches
def is_heading(paragraph: Paragraph) -> bool:
    """
    Tell whether the paragraph is a heading.

    A paragraph counts as a heading if it has a style and the id of that style is one of
    "berschrift1", "berschrift2" or "berschrift3". A paragraph without a style is never a heading.
    """
    heading_style_ids = {"berschrift1", "berschrift2", "berschrift3"}
    if paragraph.style is None:
        return False
    return paragraph.style.style_id in heading_style_ids


def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> list[Table] | EbdNoTableSection:
"""
Opens the file specified in `docx_file_path` and returns the tables that relate to the given `ebd_key`.
Expand All @@ -181,15 +98,15 @@ def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> list[Table] | Ebd
Raises:
TableNotFoundError: If no tables related to the given `ebd_key` are found in the document.
"""
if _ebd_key_pattern.match(ebd_key) is None:
raise ValueError(f"The ebd_key '{ebd_key}' does not match {_ebd_key_pattern.pattern}")
if EBD_KEY_PATTERN.match(ebd_key) is None:
raise ValueError(f"The ebd_key '{ebd_key}' does not match {EBD_KEY_PATTERN.pattern}")
document = get_document(docx_file_path)

empty_ebd_text: str | None = None # paragraph text if there is no ebd table
found_table_in_subsection: bool = False
is_inside_subsection_of_requested_table: bool = False
tables: list[Table] = []
tables_and_paragraphs = _get_tables_and_paragraphs(document)
tables_and_paragraphs = get_tables_and_paragraphs(document)
for table_or_paragraph in tables_and_paragraphs:
if isinstance(table_or_paragraph, Paragraph):
paragraph: Paragraph = table_or_paragraph
Expand All @@ -214,8 +131,8 @@ def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> list[Table] | Ebd
if (
isinstance(table_or_paragraph, Table)
and is_inside_subsection_of_requested_table
and _table_is_an_ebd_table(table_or_paragraph)
and _table_is_first_ebd_table(table_or_paragraph)
and table_is_an_ebd_table(table_or_paragraph)
and table_is_first_ebd_table(table_or_paragraph)
):
table: Table = table_or_paragraph
tables.append(table)
Expand All @@ -228,7 +145,7 @@ def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> list[Table] | Ebd
if isinstance(next_item, Table):
# this is the case that the authors created multiple single tables on single adjacent pages
# if table_is_an_ebd_table(table):
if _table_is_an_ebd_table(next_item):
if table_is_an_ebd_table(next_item):
tables.append(next_item)
elif isinstance(next_item, Paragraph):
if next_item.text.startswith("S_") or next_item.text.startswith("E_"):
Expand Down Expand Up @@ -259,51 +176,6 @@ def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> list[Table] | Ebd
gc.collect()


def _enrich_paragraphs_with_sections(
paragraphs: Iterable[Paragraph],
) -> Generator[tuple[Paragraph, EbdChapterInformation], None, None]:
"""
Yield each paragraph + the "Kapitel" in which it is found.
"""
chapter_counter = itertools.count(start=1)
chapter = 1
chapter_title: Optional[str] = None
section_counter = itertools.count(start=1)
section = 1
section_title: Optional[str] = None
subsection_counter = itertools.count(start=1)
subsection = 1
subsection_title: Optional[str] = None
for paragraph in paragraphs:
# since pyton-docx 1.1.2 there are type hints; seems like the style is not guaranteed to be not None
match paragraph.style.style_id: # type:ignore[union-attr]
case "berschrift1":
chapter = next(chapter_counter)
chapter_title = paragraph.text.strip()
section_counter = itertools.count(start=1)
section_title = None
subsection_counter = itertools.count(start=1)
subsection_title = None
case "berschrift2":
section = next(section_counter)
section_title = paragraph.text.strip()
subsection_counter = itertools.count(start=1)
subsection_title = None
case "berschrift3":
subsection = next(subsection_counter)
subsection_title = paragraph.text.strip()
location = EbdChapterInformation(
chapter=chapter,
section=section,
subsection=subsection,
chapter_title=chapter_title,
section_title=section_title,
subsection_title=subsection_title,
)
_logger.debug("Handling Paragraph %i.%i.%i", chapter, section, subsection)
yield paragraph, location


def get_all_ebd_keys(docx_file_path: Path) -> dict[str, tuple[str, EbdChapterInformation]]:
"""
Extract all EBD keys from the given file.
Expand All @@ -312,8 +184,8 @@ def get_all_ebd_keys(docx_file_path: Path) -> dict[str, tuple[str, EbdChapterInf
"""
document = get_document(docx_file_path)
result: dict[str, tuple[str, EbdChapterInformation]] = {}
for paragraph, ebd_kapitel in _enrich_paragraphs_with_sections(document.paragraphs):
match = _ebd_key_with_heading_pattern.match(paragraph.text)
for paragraph, ebd_kapitel in enrich_paragraphs_with_sections(document.paragraphs):
match = EBD_KEY_WITH_HEADING_PATTERN.match(paragraph.text)
if match is None:
contains_ebd_number = paragraph.text.lstrip().startswith("E_")
if contains_ebd_number:
Expand Down
Loading