diff --git a/docs/plans/2025-12-14-extract-docx-utils-design.md b/docs/plans/2025-12-14-extract-docx-utils-design.md new file mode 100644 index 0000000..1895312 --- /dev/null +++ b/docs/plans/2025-12-14-extract-docx-utils-design.md @@ -0,0 +1,70 @@ +# Extract Private Helpers to `_docx_utils.py` + +## Goal + +Reduce `__init__.py` bloat by moving private helper functions and constants to a dedicated internal module. + +## New Module: `_docx_utils.py` + +### Functions (no underscore prefix - module is already private) + +| From `__init__.py` | To `_docx_utils.py` | +|--------------------|---------------------| +| `_get_tables_and_paragraphs()` | `get_tables_and_paragraphs()` | +| `_cell_is_probably_from_an_ebd_cell()` | `cell_is_probably_from_an_ebd_cell()` | +| `_table_is_an_ebd_table()` | `table_is_an_ebd_table()` | +| `_table_is_first_ebd_table()` | `table_is_first_ebd_table()` | +| `_enrich_paragraphs_with_sections()` | `enrich_paragraphs_with_sections()` | +| `is_heading()` | `is_heading()` | + +### Constants (UPPER_CASE per PEP8) + +| From `__init__.py` | To `_docx_utils.py` | +|--------------------|---------------------| +| `_ebd_key_pattern` | `EBD_KEY_PATTERN` | +| `_ebd_key_with_heading_pattern` | `EBD_KEY_WITH_HEADING_PATTERN` | +| `_ebd_cell_pattern` | `EBD_CELL_PATTERN` | +| `_DOCX_ARROW_CHAR` | `DOCX_ARROW_CHAR` | + +## Updated `__init__.py` + +### Stays in `__init__.py` + +- `get_document()` - public API +- `get_ebd_docx_tables()` - public API +- `get_all_ebd_keys()` - public API +- `_logger` - module-specific logger +- `_is_python_version_314` / `_is_manually_triggered_garbage_collection_required` - Python 3.14 GC workaround + +### Updated `__all__` + +```python +__all__ = [ + "EbdTableNotConvertibleError", + "StepNumberNotFoundError", + "TableNotFoundError", + "EbdChapterInformation", + "EbdNoTableSection", + "get_all_ebd_keys", + "get_document", + "get_ebd_docx_tables", +] +``` + +Note: `is_heading` removed from public API (now internal). 
+ +## File Structure After + +``` +src/ebdamame/ +├── __init__.py # ~200 lines (was ~330) +├── _docx_utils.py # ~150 lines (NEW) +├── exceptions.py +├── models.py +└── docxtableconverter.py +``` + +## Breaking Changes + +- `is_heading()` is no longer listed in `__all__` and is now treated as internal (the name remains importable from `ebdamame` because `__init__.py` still imports it from `_docx_utils`, but it is no longer part of the supported public API) +- No other breaking changes - all other public API unchanged diff --git a/src/ebdamame/__init__.py b/src/ebdamame/__init__.py index 8abe113..9b37a92 100644 --- a/src/ebdamame/__init__.py +++ b/src/ebdamame/__init__.py @@ -3,21 +3,25 @@ """ import gc -import itertools import logging -import re import sys from io import BytesIO from pathlib import Path -from typing import Generator, Iterable, Optional, Union import docx from docx.document import Document as DocumentType -from docx.oxml.table import CT_Tbl -from docx.oxml.text.paragraph import CT_P -from docx.table import Table, _Cell +from docx.table import Table from docx.text.paragraph import Paragraph +from ._docx_utils import ( + EBD_KEY_PATTERN, + EBD_KEY_WITH_HEADING_PATTERN, + enrich_paragraphs_with_sections, + get_tables_and_paragraphs, + is_heading, + table_is_an_ebd_table, + table_is_first_ebd_table, +) from .exceptions import EbdTableNotConvertibleError, StepNumberNotFoundError, TableNotFoundError from .models import EbdChapterInformation, EbdNoTableSection @@ -33,7 +37,6 @@ "get_all_ebd_keys", "get_document", "get_ebd_docx_tables", - "is_heading", ] _logger = logging.getLogger(__name__) @@ -72,93 +75,7 @@ def get_document(docx_file_path: Path) -> DocumentType: source_stream.close() -def _get_tables_and_paragraphs(document: DocumentType) -> Generator[Union[Table, Paragraph], None, None]: - """ - Yields tables and paragraphs from the given document in the order in which they occur in the document. - This is helpful because document.tables and document.paragraphs are de-coupled and give you no information which - paragraph follows which table. 
- """ - parent_elements = document.element.body - for item in parent_elements.iterchildren(): - if isinstance(item, CT_P): - yield Paragraph(item, document) - elif isinstance(item, CT_Tbl): - yield Table(item, document) - else: - _logger.debug("Item %s is neither Paragraph nor Table", str(item)) - - -_ebd_key_pattern = re.compile(r"^E_\d{4}$") -_ebd_key_with_heading_pattern = re.compile(r"^(?P<key>E_\d{4})_?(?P<title>.*)\s*$") - - -_ebd_cell_pattern = re.compile(r"^(?:ja|nein)\s*(?:Ende|\d+)$") -""" -any EBD table shall contain at least one cell that matches this pattern -""" - -_DOCX_ARROW_CHAR = "\uf0e0" -""" -U+F0E0: Private Use Area character representing a right arrow in DOCX documents. -This character is used by MS Word to render arrows (e.g., "ja → 5") in EBD tables. -It appears in cells like "ja 5" to indicate the subsequent step number. -""" - - -def _cell_is_probably_from_an_ebd_cell(cell: _Cell) -> bool: - if _DOCX_ARROW_CHAR in cell.text: - return True - if cell.text in {"ja", "nein"}: - return True - if "à" in cell.text: - # the rightarrow in wrong encoding - return True - if _ebd_cell_pattern.match(cell.text): - return True - if cell.text.strip().startswith("Cluster:") or cell.text.startswith("Hinweis:"): - return True - return False - - -def _table_is_an_ebd_table(table: Table) -> bool: - """ - Returns true iff the table "looks like" an EB-Table. - This is to distinguish between tables that are inside the same subsection that describes an EBD but are not part - of the decision tree at all (e.g. in E_0406 the tables about Artikel-IDs). - """ - if _table_is_first_ebd_table(table): - return True - for row in table.rows: - try: - for cell in row.cells: - if _cell_is_probably_from_an_ebd_cell(cell): - return True - except IndexError: # don't ask me why this happens; It's the internals of python-docx - continue - return False - - -def _table_is_first_ebd_table(table: Table) -> bool: - """ - Returns true if the first row of a table contains "Prüfende Rolle". 
- We assume that each EBD table has a header row with - "Prüfende Rolle" in the first column. - """ - return "prüfende rolle" in table.rows[0].cells[0].text.lower() - - # pylint:disable=too-many-branches -def is_heading(paragraph: Paragraph) -> bool: - """ - Returns True if the paragraph is a heading. - """ - return paragraph.style is not None and paragraph.style.style_id in { - "berschrift1", - "berschrift2", - "berschrift3", - } - - def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> list[Table] | EbdNoTableSection: """ Opens the file specified in `docx_file_path` and returns the tables that relate to the given `ebd_key`. @@ -181,15 +98,15 @@ def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> list[Table] | Ebd Raises: TableNotFoundError: If no tables related to the given `ebd_key` are found in the document. """ - if _ebd_key_pattern.match(ebd_key) is None: - raise ValueError(f"The ebd_key '{ebd_key}' does not match {_ebd_key_pattern.pattern}") + if EBD_KEY_PATTERN.match(ebd_key) is None: + raise ValueError(f"The ebd_key '{ebd_key}' does not match {EBD_KEY_PATTERN.pattern}") document = get_document(docx_file_path) empty_ebd_text: str | None = None # paragraph text if there is no ebd table found_table_in_subsection: bool = False is_inside_subsection_of_requested_table: bool = False tables: list[Table] = [] - tables_and_paragraphs = _get_tables_and_paragraphs(document) + tables_and_paragraphs = get_tables_and_paragraphs(document) for table_or_paragraph in tables_and_paragraphs: if isinstance(table_or_paragraph, Paragraph): paragraph: Paragraph = table_or_paragraph @@ -214,8 +131,8 @@ def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> list[Table] | Ebd if ( isinstance(table_or_paragraph, Table) and is_inside_subsection_of_requested_table - and _table_is_an_ebd_table(table_or_paragraph) - and _table_is_first_ebd_table(table_or_paragraph) + and table_is_an_ebd_table(table_or_paragraph) + and 
table_is_first_ebd_table(table_or_paragraph) ): table: Table = table_or_paragraph tables.append(table) @@ -228,7 +145,7 @@ def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> list[Table] | Ebd if isinstance(next_item, Table): # this is the case that the authors created multiple single tables on single adjacent pages # if table_is_an_ebd_table(table): - if _table_is_an_ebd_table(next_item): + if table_is_an_ebd_table(next_item): tables.append(next_item) elif isinstance(next_item, Paragraph): if next_item.text.startswith("S_") or next_item.text.startswith("E_"): @@ -259,51 +176,6 @@ def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> list[Table] | Ebd gc.collect() -def _enrich_paragraphs_with_sections( - paragraphs: Iterable[Paragraph], -) -> Generator[tuple[Paragraph, EbdChapterInformation], None, None]: - """ - Yield each paragraph + the "Kapitel" in which it is found. - """ - chapter_counter = itertools.count(start=1) - chapter = 1 - chapter_title: Optional[str] = None - section_counter = itertools.count(start=1) - section = 1 - section_title: Optional[str] = None - subsection_counter = itertools.count(start=1) - subsection = 1 - subsection_title: Optional[str] = None - for paragraph in paragraphs: - # since pyton-docx 1.1.2 there are type hints; seems like the style is not guaranteed to be not None - match paragraph.style.style_id: # type:ignore[union-attr] - case "berschrift1": - chapter = next(chapter_counter) - chapter_title = paragraph.text.strip() - section_counter = itertools.count(start=1) - section_title = None - subsection_counter = itertools.count(start=1) - subsection_title = None - case "berschrift2": - section = next(section_counter) - section_title = paragraph.text.strip() - subsection_counter = itertools.count(start=1) - subsection_title = None - case "berschrift3": - subsection = next(subsection_counter) - subsection_title = paragraph.text.strip() - location = EbdChapterInformation( - chapter=chapter, - section=section, - 
subsection=subsection, - chapter_title=chapter_title, - section_title=section_title, - subsection_title=subsection_title, - ) - _logger.debug("Handling Paragraph %i.%i.%i", chapter, section, subsection) - yield paragraph, location - - def get_all_ebd_keys(docx_file_path: Path) -> dict[str, tuple[str, EbdChapterInformation]]: """ Extract all EBD keys from the given file. @@ -312,8 +184,8 @@ def get_all_ebd_keys(docx_file_path: Path) -> dict[str, tuple[str, EbdChapterInf """ document = get_document(docx_file_path) result: dict[str, tuple[str, EbdChapterInformation]] = {} - for paragraph, ebd_kapitel in _enrich_paragraphs_with_sections(document.paragraphs): - match = _ebd_key_with_heading_pattern.match(paragraph.text) + for paragraph, ebd_kapitel in enrich_paragraphs_with_sections(document.paragraphs): + match = EBD_KEY_WITH_HEADING_PATTERN.match(paragraph.text) if match is None: contains_ebd_number = paragraph.text.lstrip().startswith("E_") if contains_ebd_number: diff --git a/src/ebdamame/_docx_utils.py b/src/ebdamame/_docx_utils.py new file mode 100644 index 0000000..a33899a --- /dev/null +++ b/src/ebdamame/_docx_utils.py @@ -0,0 +1,151 @@ +""" +Private helper functions and constants for docx processing. + +This module is internal - do not import directly from external code. 
+""" + +import itertools +import logging +import re +from typing import Generator, Iterable, Optional, Union + +from docx.document import Document as DocumentType +from docx.oxml.table import CT_Tbl +from docx.oxml.text.paragraph import CT_P +from docx.table import Table, _Cell +from docx.text.paragraph import Paragraph + +from .models import EbdChapterInformation + +_logger = logging.getLogger(__name__) + +# Regex patterns for EBD key detection +EBD_KEY_PATTERN = re.compile(r"^E_\d{4}$") +EBD_KEY_WITH_HEADING_PATTERN = re.compile(r"^(?P<key>E_\d{4})_?(?P<title>.*)\s*$") + +EBD_CELL_PATTERN = re.compile(r"^(?:ja|nein)\s*(?:Ende|\d+)$") +""" +Any EBD table shall contain at least one cell that matches this pattern. +""" + +DOCX_ARROW_CHAR = "\uf0e0" +""" +U+F0E0: Private Use Area character representing a right arrow in DOCX documents. +This character is used by MS Word to render arrows (e.g., "ja → 5") in EBD tables. +It appears in cells like "ja 5" to indicate the subsequent step number. +""" + + +def get_tables_and_paragraphs(document: DocumentType) -> Generator[Union[Table, Paragraph], None, None]: + """ + Yields tables and paragraphs from the given document in the order in which they occur in the document. + This is helpful because document.tables and document.paragraphs are de-coupled and give you no information which + paragraph follows which table. 
+ """ + parent_elements = document.element.body + for item in parent_elements.iterchildren(): + if isinstance(item, CT_P): + yield Paragraph(item, document) + elif isinstance(item, CT_Tbl): + yield Table(item, document) + else: + _logger.debug("Item %s is neither Paragraph nor Table", str(item)) + + +def cell_is_probably_from_an_ebd_cell(cell: _Cell) -> bool: + """Check if a cell likely belongs to an EBD table based on its content.""" + if DOCX_ARROW_CHAR in cell.text: + return True + if cell.text in {"ja", "nein"}: + return True + if "à" in cell.text: + # the rightarrow in wrong encoding + return True + if EBD_CELL_PATTERN.match(cell.text): + return True + if cell.text.strip().startswith("Cluster:") or cell.text.startswith("Hinweis:"): + return True + return False + + +def table_is_an_ebd_table(table: Table) -> bool: + """ + Returns true iff the table "looks like" an EB-Table. + This is to distinguish between tables that are inside the same subsection that describes an EBD but are not part + of the decision tree at all (e.g. in E_0406 the tables about Artikel-IDs). + """ + if table_is_first_ebd_table(table): + return True + for row in table.rows: + try: + for cell in row.cells: + if cell_is_probably_from_an_ebd_cell(cell): + return True + except IndexError: # don't ask me why this happens; It's the internals of python-docx + continue + return False + + +def table_is_first_ebd_table(table: Table) -> bool: + """ + Returns true if the first row of a table contains "Prüfende Rolle". + We assume that each EBD table has a header row with + "Prüfende Rolle" in the first column. + """ + return "prüfende rolle" in table.rows[0].cells[0].text.lower() + + +def is_heading(paragraph: Paragraph) -> bool: + """ + Returns True if the paragraph is a heading. 
+ """ + return paragraph.style is not None and paragraph.style.style_id in { + "berschrift1", + "berschrift2", + "berschrift3", + } + + +def enrich_paragraphs_with_sections( + paragraphs: Iterable[Paragraph], +) -> Generator[tuple[Paragraph, EbdChapterInformation], None, None]: + """ + Yield each paragraph + the "Kapitel" in which it is found. + """ + chapter_counter = itertools.count(start=1) + chapter = 1 + chapter_title: Optional[str] = None + section_counter = itertools.count(start=1) + section = 1 + section_title: Optional[str] = None + subsection_counter = itertools.count(start=1) + subsection = 1 + subsection_title: Optional[str] = None + for paragraph in paragraphs: + # since python-docx 1.1.2 there are type hints; seems like the style is not guaranteed to be not None + match paragraph.style.style_id: # type:ignore[union-attr] + case "berschrift1": + chapter = next(chapter_counter) + chapter_title = paragraph.text.strip() + section_counter = itertools.count(start=1) + section_title = None + subsection_counter = itertools.count(start=1) + subsection_title = None + case "berschrift2": + section = next(section_counter) + section_title = paragraph.text.strip() + subsection_counter = itertools.count(start=1) + subsection_title = None + case "berschrift3": + subsection = next(subsection_counter) + subsection_title = paragraph.text.strip() + location = EbdChapterInformation( + chapter=chapter, + section=section, + subsection=subsection, + chapter_title=chapter_title, + section_title=section_title, + subsection_title=subsection_title, + ) + _logger.debug("Handling Paragraph %i.%i.%i", chapter, section, subsection) + yield paragraph, location