-
Notifications
You must be signed in to change notification settings - Fork 1
✨ Implement Scraping of use_cases (the gray outer left cells)
#18
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
e27045d
4891156
ed00b2c
199e335
4ad8614
8281058
e26ab27
4c38dc6
ddfc975
48583e3
df083ca
d853c0e
a1de662
800353b
8d3a682
5bc25aa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,3 @@ | ||
| ebdtable2graph | ||
| ebdtable2graph>=0.1.4a | ||
| python-docx | ||
| more_itertools | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,20 +3,20 @@ | |
| """ | ||
| import re | ||
| from enum import Enum | ||
| from itertools import cycle | ||
| from itertools import cycle, groupby | ||
| from typing import Generator, List, Literal, Optional, Tuple | ||
|
|
||
| from docx.table import Table, _Cell, _Row # type:ignore[import] | ||
| from ebdtable2graph.models import EbdTable, EbdTableRow, EbdTableSubRow | ||
| from ebdtable2graph.models.ebd_table import EbdCheckResult, EbdTableMetaData | ||
| from more_itertools import first | ||
| from ebdtable2graph.models.ebd_table import _STEP_NUMBER_REGEX, EbdCheckResult, EbdTableMetaData | ||
| from more_itertools import first, first_true | ||
|
|
||
|
|
||
| def _is_pruefende_rolle_cell(cell: _Cell) -> bool: | ||
| def _is_pruefende_rolle_cell_text(text: str) -> bool: | ||
| """ " | ||
| Returns true iff the cell mentions the market role that is responsible for applying this entscheidungsbaum | ||
| Returns true iff the given text mentions the market role that is responsible for applying this entscheidungsbaum | ||
| """ | ||
| return cell.text.startswith("Prüfende Rolle: ") | ||
| return text.startswith("Prüfende Rolle: ") | ||
|
|
||
|
|
||
| def _sort_columns_in_row(docx_table_row: _Row) -> Generator[_Cell, None, None]: | ||
|
|
@@ -31,6 +31,29 @@ def _sort_columns_in_row(docx_table_row: _Row) -> Generator[_Cell, None, None]: | |
|
|
||
| _subsequent_step_pattern = re.compile(r"^(?P<bool>(?:ja)|(?:nein))\s*(?P<subsequent_step_number>(?:\d+\*?)|ende)?") | ||
|
|
||
| _step_number_pattern = re.compile(_STEP_NUMBER_REGEX) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ich hab letztens mal aus gegebenem Anlass ein bisschen das
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. wieder was gelernt ;)
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Wobei man dazu sagen muss, dass die cache-Größe auf 512 beschränkt ist. Man scheint das aber theoretisch auch vergrößern zu können, wenn ich das richtig sehe. |
||
|
|
||
|
|
||
def _get_index_of_first_column_with_step_number(cells: List[_Cell]) -> int:
    """
    Returns the index of the first cell in cells that contains a step number.

    :param cells: the cells to search
    :return: zero-based index of the first cell whose text matches the step number pattern
    :raises ValueError: if none of the cells contains a step number
    """
    # A single pass yields the index directly; there is no need to look up the matching cell a second time.
    for index, cell in enumerate(cells):
        if _step_number_pattern.match(cell.text) is not None:
            return index
    # Fail with an explicit message instead of the opaque "None is not in list" of a list.index(None) lookup.
    raise ValueError("None of the given cells contains a step number")
|
|
||
|
|
||
def _get_use_cases(cells: List[_Cell]) -> List[str]:
    """
    Collects the texts of all cells located left of the first step number cell; these are the use cases.
    The result is always a list (possibly empty) and never None, because the calling code needs its length.
    """
    use_case_column_count = _get_index_of_first_column_with_step_number(cells)
    if use_case_column_count == 0:
        # The row starts with the step number, so there are no use case cells in front of it.
        return []
    # Leading cells in front of the step number mean that this step must only be applied for certain scenarios.
    return [use_case_cell.text for use_case_cell in cells[:use_case_column_count]]
|
|
||
|
|
||
| def _read_subsequent_step_cell(cell: _Cell) -> Tuple[bool, Optional[str]]: | ||
| """ | ||
|
|
@@ -79,63 +102,77 @@ def __init__(self, docx_tables: List[Table], ebd_key: str, chapter: str, sub_cha | |
| self._docx_tables = docx_tables | ||
| self._column_index_step_number: int | ||
| self._column_index_description: int | ||
| self._column_index_check_result: int | ||
| self._column_index_result_code: int | ||
| self._column_index_check_result: int = 0 | ||
| self._column_index_result_code: int = 0 | ||
| self._column_index_note: int | ||
| self._row_index_last_header: Literal[0, 1] # either 0 or 1 | ||
| for row_index in range(0, 2): # the first two lines/rows are the header of the table. | ||
| # In the constructor we just want to read the metadata from the table. | ||
| # For this purpose the first two lines are enough. | ||
| for column_index, table_cell in enumerate(first(docx_tables).row_cells(row_index)): | ||
| if row_index == 0 and _is_pruefende_rolle_cell(table_cell): | ||
| role = table_cell.text.split(":")[1].strip() | ||
| # Now it feels natural, to loop over the cells/columns of the first row, but before we do so, we have to | ||
| # remove duplicates. Although there are usually only 5 columns visible, technically there might be even 8. | ||
| # In these cases (e.g. for E_0453) columns like 'Prüfergebnis' simply occur twice in the docx table header. | ||
| distinct_cell_texts: List[str] = [ | ||
| x[0] for x in groupby(first(docx_tables).row_cells(row_index), lambda cell: cell.text) | ||
| ] | ||
| for column_index, table_cell_text in enumerate(distinct_cell_texts): | ||
| if row_index == 0 and _is_pruefende_rolle_cell_text(table_cell_text): | ||
| role = table_cell_text.split(":")[1].strip() | ||
| break # because the prüfende rolle is always a full row with identical column cells | ||
| if table_cell.text == "Nr.": | ||
| if table_cell_text == "Nr.": | ||
| self._column_index_step_number = column_index | ||
| # In most of the cases this will be 1, | ||
| # but it can be 0 if the first row does _not_ contain the "Prüfende Rolle". | ||
| self._row_index_last_header = row_index # type:ignore[assignment] | ||
| elif table_cell.text == "Prüfschritt": | ||
| elif table_cell_text == "Prüfschritt": | ||
| self._column_index_description = column_index | ||
| elif table_cell.text == "Prüfergebnis": | ||
| elif table_cell_text == "Prüfergebnis": | ||
| self._column_index_check_result = column_index | ||
| elif table_cell.text == "Code": | ||
| elif table_cell_text == "Code": | ||
| self._column_index_result_code = column_index | ||
| elif table_cell.text == "Hinweis": | ||
| elif table_cell_text == "Hinweis": | ||
| self._column_index_note = column_index | ||
| self._metadata = EbdTableMetaData(ebd_code=ebd_key, sub_chapter=sub_chapter, chapter=chapter, role=role) | ||
|
|
||
| # I see that there are quite a few local variables, but honestly see no reason to break it down any further | ||
| # pylint:disable=too-many-locals | ||
| def _handle_single_table( | ||
| self, table: Table, row_offset: int, rows: List[EbdTableRow], sub_rows: List[EbdTableSubRow] | ||
| ) -> None: | ||
| """ | ||
| Handles a single table (out of possible multiple tables for 1 EBD). | ||
| The results are written into rows and sub_rows. Those will be modified. | ||
| """ | ||
| upper_lower_iterator = cycle([_EbdSubRowPosition.UPPER, _EbdSubRowPosition.LOWER]) | ||
| use_cases: List[str] = [] | ||
| for table_row, sub_row_position in zip( | ||
| table.rows[row_offset:], | ||
| cycle([_EbdSubRowPosition.UPPER, _EbdSubRowPosition.LOWER]), | ||
| upper_lower_iterator, | ||
| ): | ||
| row_cells = list(_sort_columns_in_row(table_row)) | ||
| if len(row_cells) <= self._column_index_description: | ||
| # These are the multi-column rows that span that contain stuff like | ||
| # "Alle festgestellten Antworten sind anzugeben, soweit im Format möglich (maximal 8 Antwortcodes)*." | ||
| _ = next(upper_lower_iterator) # reset the iterator | ||
| continue | ||
| if sub_row_position == _EbdSubRowPosition.UPPER: | ||
| use_cases = _get_use_cases(row_cells) | ||
| # clear list every second entry | ||
| sub_rows = [] | ||
| step_number = row_cells[self._column_index_step_number].text.strip() | ||
| description = row_cells[self._column_index_description].text.strip() | ||
| step_number = row_cells[len(use_cases) + self._column_index_step_number].text.strip() | ||
| description = row_cells[len(use_cases) + self._column_index_description].text.strip() | ||
| boolean_outcome, subsequent_step_number = _read_subsequent_step_cell( | ||
| row_cells[self._column_index_check_result] | ||
| row_cells[len(use_cases) + self._column_index_check_result] | ||
| ) | ||
| sub_row = EbdTableSubRow( | ||
| check_result=EbdCheckResult(subsequent_step_number=subsequent_step_number, result=boolean_outcome), | ||
| result_code=row_cells[self._column_index_result_code].text.strip() or None, | ||
| note=row_cells[self._column_index_note].text.strip() or None, | ||
| result_code=row_cells[len(use_cases) + self._column_index_result_code].text.strip() or None, | ||
| note=row_cells[len(use_cases) + self._column_index_note].text.strip() or None, | ||
| ) | ||
| sub_rows.append(sub_row) | ||
| if sub_row_position == _EbdSubRowPosition.LOWER: | ||
| row = EbdTableRow( | ||
| description=description, | ||
| step_number=step_number, | ||
| sub_rows=sub_rows, | ||
| description=description, step_number=step_number, sub_rows=sub_rows, use_cases=use_cases or None | ||
| ) | ||
| rows.append(row) | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.