Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion requirements.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
ebdtable2graph
ebdtable2graph>=0.1.5
python-docx
more_itertools
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ certifi==2022.12.7
# via requests
charset-normalizer==2.1.1
# via requests
ebdtable2graph==0.1.3
ebdtable2graph==0.1.5
# via -r requirements.in
idna==3.4
# via requests
Expand Down
119 changes: 87 additions & 32 deletions src/ebddocx2table/docxtableconverter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,20 @@
"""
import re
from enum import Enum
from itertools import cycle
from itertools import cycle, groupby
from typing import Generator, List, Literal, Optional, Tuple

from docx.table import Table, _Cell, _Row # type:ignore[import]
from ebdtable2graph.models import EbdTable, EbdTableRow, EbdTableSubRow
from ebdtable2graph.models.ebd_table import EbdCheckResult, EbdTableMetaData
from more_itertools import first
from ebdtable2graph.models.ebd_table import _STEP_NUMBER_REGEX, EbdCheckResult, EbdTableMetaData, MultiStepInstruction
from more_itertools import first, first_true


def _is_pruefende_rolle_cell(cell: _Cell) -> bool:
def _is_pruefende_rolle_cell_text(text: str) -> bool:
""" "
Returns true iff the cell mentions the market role that is responsible for applying this entscheidungsbaum
Returns true iff the given text mentions the market role that is responsible for applying this entscheidungsbaum
"""
return cell.text.startswith("Prüfende Rolle: ")
return text.startswith("Prüfende Rolle: ")


def _sort_columns_in_row(docx_table_row: _Row) -> Generator[_Cell, None, None]:
Expand All @@ -31,6 +31,29 @@ def _sort_columns_in_row(docx_table_row: _Row) -> Generator[_Cell, None, None]:

# Matches a text that starts with "ja" or "nein" (named group "bool"), followed by optional whitespace and an
# optional named group "subsequent_step_number", which is either a sequence of digits with an optional trailing "*"
# or the literal "ende". No flags are set, so the match is case-sensitive.
_subsequent_step_pattern = re.compile(r"^(?P<bool>(?:ja)|(?:nein))\s*(?P<subsequent_step_number>(?:\d+\*?)|ende)?")

# Compiled once at import time from the step number regex that is imported from ebdtable2graph.models.ebd_table.
_step_number_pattern = re.compile(_STEP_NUMBER_REGEX)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ich hab letztens mal aus gegebenem Anlass ein bisschen das re-Package durchforstet. Tatsächlich ist es ziemlich überflüssig, selbst die compile-Funktion aufzurufen. Denn das Package ruft stets (d.h. z.B. bei .match(...)) eine interne compile-Funktion auf, die zusätzlich eine Cache-Funktion bietet. D.h., dass jedes Regex-Pattern immer nur einmal kompiliert und anschließend im internen Cache gespeichert wird.
Dennoch ist das für mich fein so.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wieder was gelernt ;)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wobei man dazu sagen muss, dass die Cache-Größe auf 512 beschränkt ist. Man scheint das aber theoretisch auch vergrößern zu können, wenn ich das richtig sehe.



def _get_index_of_first_column_with_step_number(cells: List[_Cell]) -> int:
    """
    Returns the index of the first cell in cells whose text is matched by ``_step_number_pattern``.

    :param cells: the cells of a single table row, in column order
    :return: the zero-based position of the first matching cell
    :raises ValueError: if none of the cells contains a step number
    """
    # One pass with enumerate yields the index directly; there is no need to find the cell first and then search
    # the list a second time for its position.
    for column_index, cell in enumerate(cells):
        if _step_number_pattern.match(cell.text) is not None:
            return column_index
    # Fail with a message that names the actual problem instead of an obscure "None is not in list".
    raise ValueError("None of the given cells contains a step number")


def _get_use_cases(cells: List[_Cell]) -> List[str]:
    """
    Collects the texts of all cells that are located in front of the first step number cell.
    The result is an empty list (and never None) if the step number is found in the very first cell.
    """
    number_of_leading_cells = _get_index_of_first_column_with_step_number(cells)
    if number_of_leading_cells == 0:
        # The calling code needs something that has a length, hence an empty list instead of None.
        return []
    # Cells in front of the step number are the "use cases": the step must only be applied for certain scenarios.
    return [leading_cell.text for leading_cell in cells[:number_of_leading_cells]]


def _read_subsequent_step_cell(cell: _Cell) -> Tuple[bool, Optional[str]]:
"""
Expand Down Expand Up @@ -82,61 +105,95 @@ def __init__(self, docx_tables: List[Table], ebd_key: str, chapter: str, sub_cha
self._column_index_check_result: int
self._column_index_result_code: int
self._column_index_note: int
self._row_index_last_header: Literal[0, 1] # either 0 or 1
self._row_index_last_header: Literal[0, 1] = 1 #: the index of the last table header row
# the index of the last header row _could_ be dynamically calculated but so far it has always been 1.
for row_index in range(0, 2): # the first two lines/rows are the header of the table.
# In the constructor we just want to read the metadata from the table.
# For this purpose the first two lines are enough.
for column_index, table_cell in enumerate(first(docx_tables).row_cells(row_index)):
if row_index == 0 and _is_pruefende_rolle_cell(table_cell):
role = table_cell.text.split(":")[1].strip()
# Now it feels natural, to loop over the cells/columns of the first row, but before we do so, we have to
# remove duplicates. Although there are usually only 5 columns visible, technically there might be even 8.
# In these cases (e.g. for E_0453) columns like 'Prüfergebnis' simply occur twice in the docx table header.
distinct_cell_texts: List[str] = [
x[0] for x in groupby(first(docx_tables).row_cells(row_index), lambda cell: cell.text)
]
for column_index, table_cell_text in enumerate(distinct_cell_texts):
if row_index == 0 and _is_pruefende_rolle_cell_text(table_cell_text):
role = table_cell_text.split(":")[1].strip()
break # because the prüfende rolle is always a full row with identical column cells
if table_cell.text == "Nr.":
if table_cell_text == "Nr.":
self._column_index_step_number = column_index
# In most of the cases this will be 1,
# but it can be 0 if the first row does _not_ contain the "Prüfende Rolle".
self._row_index_last_header = row_index # type:ignore[assignment]
elif table_cell.text == "Prüfschritt":
# self._row_index_last_header = row_index # type:ignore[assignment]
elif table_cell_text == "Prüfschritt":
self._column_index_description = column_index
elif table_cell.text == "Prüfergebnis":
elif table_cell_text == "Prüfergebnis":
self._column_index_check_result = column_index
elif table_cell.text == "Code":
elif table_cell_text == "Code":
self._column_index_result_code = column_index
elif table_cell.text == "Hinweis":
elif table_cell_text == "Hinweis":
self._column_index_note = column_index
self._metadata = EbdTableMetaData(ebd_code=ebd_key, sub_chapter=sub_chapter, chapter=chapter, role=role)

# I see that there are quite a few local variables, but honestly see no reason to break it down any further.
# pylint:disable=too-many-locals, too-many-arguments
def _handle_single_table(
self, table: Table, row_offset: int, rows: List[EbdTableRow], sub_rows: List[EbdTableSubRow]
self,
table: Table,
multi_step_instructions: List[MultiStepInstruction],
row_offset: int,
rows: List[EbdTableRow],
sub_rows: List[EbdTableSubRow],
) -> None:
"""
Handles a single table (out of possible multiple tables for 1 EBD).
The results are written into rows and sub_rows. Those will be modified.
The results are written into rows, sub_rows and multi_step_instructions. Those will be modified.
"""
upper_lower_iterator = cycle([_EbdSubRowPosition.UPPER, _EbdSubRowPosition.LOWER])
use_cases: List[str] = []
for table_row, sub_row_position in zip(
table.rows[row_offset:],
cycle([_EbdSubRowPosition.UPPER, _EbdSubRowPosition.LOWER]),
upper_lower_iterator,
):
row_cells = list(_sort_columns_in_row(table_row))
if len(row_cells) <= 2:
# These are the rows that span multiple columns and contain stuff like
# "Alle festgestellten Antworten sind anzugeben, soweit im Format möglich (maximal 8 Antwortcodes)*."
_ = next(upper_lower_iterator) # reset the iterator
multi_step_instruction_text = row_cells[0].text
# we store the text in the local variable for now because we don't yet know the next step number
continue
if sub_row_position == _EbdSubRowPosition.UPPER:
use_cases = _get_use_cases(row_cells)
# clear list every second entry
sub_rows = []
step_number = row_cells[self._column_index_step_number].text.strip()
description = row_cells[self._column_index_description].text.strip()
step_number = row_cells[len(use_cases) + self._column_index_step_number].text.strip()
description = row_cells[len(use_cases) + self._column_index_description].text.strip()
boolean_outcome, subsequent_step_number = _read_subsequent_step_cell(
row_cells[self._column_index_check_result]
row_cells[len(use_cases) + self._column_index_check_result]
)
sub_row = EbdTableSubRow(
check_result=EbdCheckResult(subsequent_step_number=subsequent_step_number, result=boolean_outcome),
result_code=row_cells[self._column_index_result_code].text.strip() or None,
note=row_cells[self._column_index_note].text.strip() or None,
result_code=row_cells[len(use_cases) + self._column_index_result_code].text.strip() or None,
note=row_cells[len(use_cases) + self._column_index_note].text.strip() or None,
)
sub_rows.append(sub_row)
if sub_row_position == _EbdSubRowPosition.LOWER:
row = EbdTableRow(
description=description,
step_number=step_number,
sub_rows=sub_rows,
description=description, step_number=step_number, sub_rows=sub_rows, use_cases=use_cases or None
)
if "multi_step_instruction_text" in locals():
# if the variable with the given name is defined, then we append a multi_step_instruction, once.
multi_step_instructions.append(
MultiStepInstruction(
instruction_text=multi_step_instruction_text,
# in contrast to the row in which we found the bare multi_step_instruction_text
# we know the step_number here. This is why the detection of the instruction and the append
# are not in the same place.
first_step_number_affected=step_number,
)
)
del multi_step_instruction_text # prevent adding the same instructions for all following steps
rows.append(row)

def convert_docx_tables_to_ebd_table(self) -> EbdTable:
Expand All @@ -146,13 +203,11 @@ def convert_docx_tables_to_ebd_table(self) -> EbdTable:
"""
rows: List[EbdTableRow] = []
sub_rows: List[EbdTableSubRow] = []
multi_step_instructions: List[MultiStepInstruction] = []
for table_index, table in enumerate(self._docx_tables):
offset: int = 0
if table_index == 0:
offset = self._row_index_last_header + 1
self._handle_single_table(table, offset, rows, sub_rows)
result = EbdTable(
rows=rows,
metadata=self._metadata,
)
self._handle_single_table(table, multi_step_instructions, offset, rows, sub_rows)
result = EbdTable(rows=rows, metadata=self._metadata, multi_step_instructions=multi_step_instructions or None)
return result
Loading