-
Notifications
You must be signed in to change notification settings - Fork 1
✨Support tables spanning over multiple pages #7
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
cbfb77a
c9a5625
33645af
4059267
a8a4012
6e4b749
c2d49d5
86b2488
8257a7b
2f484ba
f8d9306
91297fe
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,7 +4,7 @@ | |
| import re | ||
| from io import BytesIO | ||
| from pathlib import Path | ||
| from typing import Dict, Generator, Union | ||
| from typing import Dict, Generator, List, Union | ||
|
|
||
| from docx import Document # type:ignore[import] | ||
| from docx.oxml import CT_P, CT_Tbl # type:ignore[import] | ||
|
|
@@ -47,16 +47,19 @@ def _get_tables_and_paragaphs(document: Document) -> Generator[Union[Table, Para | |
| _ebd_key_with_heading_pattern = re.compile(r"^(?P<key>E_\d{4})_(?P<title>.*)\s*$") | ||
|
|
||
|
|
||
| def get_ebd_docx_table(docx_file_path: Path, ebd_key: str) -> Table: | ||
| def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> List[Table]: | ||
| """ | ||
| Opens the file specified in docx_file_path and returns the table that relates to the given ebd_key. | ||
| Raises an ValueError if the table was not found. | ||
| """ | ||
| if _ebd_key_pattern.match(ebd_key) is None: | ||
| raise ValueError(f"The ebd_key '{ebd_key}' does not match {_ebd_key_pattern.pattern}") | ||
| document = get_document(docx_file_path) | ||
|
|
||
| next_table_is_requested_table: bool = False | ||
| for table_or_paragraph in _get_tables_and_paragaphs(document): | ||
| tables: List[Table] = [] | ||
| tables_and_paragraphs = _get_tables_and_paragaphs(document) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Habe den aus der Loop rausgezogen, damit wir weiter unten weiter drüber loopen können (an der Stelle, wo wir die erste treffende Tabelle gefunden haben). |
||
| for table_or_paragraph in tables_and_paragraphs: | ||
| if isinstance(table_or_paragraph, Paragraph): | ||
| paragraph: Paragraph = table_or_paragraph | ||
| # Assumptions: | ||
|
|
@@ -65,8 +68,22 @@ def get_ebd_docx_table(docx_file_path: Path, ebd_key: str) -> Table: | |
| next_table_is_requested_table = paragraph.text.startswith(ebd_key) | ||
| if isinstance(table_or_paragraph, Table) and next_table_is_requested_table: | ||
| table: Table = table_or_paragraph | ||
| return table | ||
| raise ValueError(f"EBD Table '{ebd_key}' was not found.") | ||
| tables.append(table) | ||
| # Now we have to check if the EBD table spans multiple pages and _maybe_ we have to collect more tables. | ||
| # The funny thing is: Sometimes the authors create multiple tables split over multiple lines which belong | ||
| # together, sometimes they create 1 proper table that spans multiple pages. This we won't notice here. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Was wird hier nicht "genoticed"? Der Unterschied zwischen den beiden Fällen, die du präsentiert hast? Oder wird einer der Fälle nicht abgefangen? Edit: In Anbetracht des Codes hierunter nehme ich an, du meinst, dass zwar beide Fälle berücksichtigt werden, aber der Unterschied nicht im Ergebnis mit gespeichert wird?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. f8d9306 <-- so hoffentlich klarer |
||
| for next_item in tables_and_paragraphs: # start iterating from where the outer loop paused | ||
| if isinstance(next_item, Table): | ||
| # this is the case that the authors created multiple single tables on single adjacent pages | ||
| tables.append(next_item) | ||
| elif isinstance(next_item, Paragraph) and not next_item.text.strip(): | ||
| # sometimes the authors add blank lines before they continue with the next table | ||
| continue | ||
| else: | ||
| break # because if no other table follows, we're done collecting the tables for this EBD key | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Die outer loop ist uns egal, sobald wir einmal in der inner loop waren. |
||
| if len(tables) == 0: | ||
| raise ValueError(f"EBD Table '{ebd_key}' was not found.") | ||
| return tables | ||
|
|
||
|
|
||
| def get_all_ebd_keys(docx_file_path: Path) -> Dict[str, str]: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,6 +9,7 @@ | |
| from docx.table import Table, _Cell, _Row # type:ignore[import] | ||
| from ebdtable2graph import EbdTable, EbdTableRow, EbdTableSubRow | ||
| from ebdtable2graph.models.ebd_table import EbdCheckResult, EbdTableMetaData | ||
| from more_itertools import first | ||
|
|
||
|
|
||
| def _is_pruefende_rolle_cell(cell: _Cell) -> bool: | ||
|
|
@@ -71,11 +72,11 @@ class DocxTableConverter: | |
| converts docx tables to EbdTables | ||
| """ | ||
|
|
||
| def __init__(self, docx_table: Table, ebd_key: str, chapter: str, sub_chapter: str): | ||
| def __init__(self, docx_tables: List[Table], ebd_key: str, chapter: str, sub_chapter: str): | ||
| """ | ||
| the constructor initializes the instance and reads some metadata from the table header | ||
| the constructor initializes the instance and reads some metadata from the (first) table header | ||
| """ | ||
| self._docx_table = docx_table | ||
| self._docx_tables = docx_tables | ||
| self._column_index_step_number: int | ||
| self._column_index_description: int | ||
| self._column_index_check_result: int | ||
|
|
@@ -85,7 +86,7 @@ def __init__(self, docx_table: Table, ebd_key: str, chapter: str, sub_chapter: s | |
| for row_index in range(0, 2): # the first two lines/rows are the header of the table. | ||
| # In the constructor we just want to read the metadata from the table. | ||
| # For this purpose the first two lines are enough. | ||
| for column_index, table_cell in enumerate(docx_table.row_cells(row_index)): | ||
| for column_index, table_cell in enumerate(first(docx_tables).row_cells(row_index)): | ||
| if row_index == 0 and _is_pruefende_rolle_cell(table_cell): | ||
| role = table_cell.text.split(":")[1].strip() | ||
| break # because the prüfende rolle is always a full row with identical column cells | ||
|
|
@@ -104,15 +105,15 @@ def __init__(self, docx_table: Table, ebd_key: str, chapter: str, sub_chapter: s | |
| self._column_index_note = column_index | ||
| self._metadata = EbdTableMetaData(ebd_code=ebd_key, sub_chapter=sub_chapter, chapter=chapter, role=role) | ||
|
|
||
| def convert_docx_table_to_ebd_table(self) -> EbdTable: | ||
| def _handle_single_table( | ||
| self, table: Table, row_offset: int, rows: List[EbdTableRow], sub_rows: List[EbdTableSubRow] | ||
| ) -> None: | ||
| """ | ||
| Converts the raw docx table of an EBD to an EbdTable. | ||
| The latter contains the same data but in an easily accessible format that can be used to e.g. plot real graphs. | ||
| Handles a single table (out of possible multiple tables for 1 EBD). | ||
| The results are written into rows and sub_rows. Those will be modified. | ||
| """ | ||
| rows: List[EbdTableRow] = [] | ||
| sub_rows: List[EbdTableSubRow] = [] | ||
| for table_row, sub_row_position in zip( | ||
| self._docx_table.rows[self._row_index_last_header + 1 :], | ||
| table.rows[row_offset:], | ||
| cycle([_EbdSubRowPosition.UPPER, _EbdSubRowPosition.LOWER]), | ||
| ): | ||
|
Comment on lines
107
to
118
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Die Funktion, die vorher die eine einzige Tabelle gehandlet hat, ist jetzt eine private Methode, die ihre Ergebnisse an zwei übergebene Listen für rows und sub_rows anhängt. |
||
| row_cells = list(_sort_columns_in_row(table_row)) | ||
|
|
@@ -124,12 +125,10 @@ def convert_docx_table_to_ebd_table(self) -> EbdTable: | |
| boolean_outcome, subsequent_step_number = _read_subsequent_step_cell( | ||
| row_cells[self._column_index_check_result] | ||
| ) | ||
| result_code = row_cells[self._column_index_result_code].text.strip() | ||
| note = row_cells[self._column_index_note].text.strip() | ||
| sub_row = EbdTableSubRow( | ||
| check_result=EbdCheckResult(subsequent_step_number=subsequent_step_number, result=boolean_outcome), | ||
| result_code=result_code or None, | ||
| note=note or None, | ||
| result_code=row_cells[self._column_index_result_code].text.strip() or None, | ||
| note=row_cells[self._column_index_note].text.strip() or None, | ||
|
Comment on lines
-127
to
+131
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Pylint hat gemeckert: too-many-locals (also zu viele lokale Variablen). Hier ist so ein Klassiker für: Das gefällt zwar dem Linter, aber die Lesbarkeit leidet vielleicht ein bisschen. |
||
| ) | ||
| sub_rows.append(sub_row) | ||
| if sub_row_position == _EbdSubRowPosition.LOWER: | ||
|
|
@@ -139,6 +138,19 @@ def convert_docx_table_to_ebd_table(self) -> EbdTable: | |
| sub_rows=sub_rows, | ||
| ) | ||
| rows.append(row) | ||
|
|
||
| def convert_docx_tables_to_ebd_table(self) -> EbdTable: | ||
| """ | ||
| Converts the raw docx tables of an EBD to an EbdTable. | ||
| The latter contains the same data but in an easily accessible format that can be used to e.g. plot real graphs. | ||
| """ | ||
| rows: List[EbdTableRow] = [] | ||
| sub_rows: List[EbdTableSubRow] = [] | ||
| for table_index, table in enumerate(self._docx_tables): | ||
| offset: int = 0 | ||
| if table_index == 0: | ||
| offset = self._row_index_last_header + 1 | ||
| self._handle_single_table(table, offset, rows, sub_rows) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Die vormals einzige Konvertierungsfunktion wird jetzt hier aufgerufen. |
||
| result = EbdTable( | ||
| rows=rows, | ||
| metadata=self._metadata, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Den Docstring musst du noch anpassen.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
2f484ba