-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy path__init__.py
More file actions
112 lines (100 loc) · 5.28 KB
/
__init__.py
File metadata and controls
112 lines (100 loc) · 5.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""
Contains high-level functions to process .docx files
"""
import re
from io import BytesIO
from pathlib import Path
from typing import Dict, Generator, List, Union
from docx import Document # type:ignore[import]
from docx.oxml import CT_P, CT_Tbl # type:ignore[import]
from docx.table import Table # type:ignore[import]
from docx.text.paragraph import Paragraph # type:ignore[import]
def get_document(docx_file_path: Path) -> Document:
    """
    Load the .docx file found at docx_file_path and return it as a python-docx document.

    The file content is read completely into memory first; python-docx then parses the in-memory copy.
    """
    with open(docx_file_path, "rb") as file_handle:
        raw_bytes = file_handle.read()
    # The first attempt followed the recipe from
    # https://python-docx.readthedocs.io/en/latest/user/documents.html#opening-a-file-like-document
    # but StringIO was replaced with BytesIO (without an explicit 'utf-8') because of:
    # UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 605: character maps to <undefined>
    with BytesIO(raw_bytes) as in_memory_docx:
        # the buffer is closed on the way out, no matter whether parsing succeeds or raises
        return Document(in_memory_docx)
def _get_tables_and_paragaphs(document: Document) -> Generator[Union[Table, Paragraph], None, None]:
    """
    Walk over the direct children of the document body and yield every paragraph and table in document order.

    document.paragraphs and document.tables are two independent collections, so they cannot tell you which
    paragraph precedes or follows which table; this generator keeps that ordering intact.
    Body children that are neither a paragraph nor a table are skipped.
    """
    for body_child in document.element.body.iterchildren():
        if isinstance(body_child, CT_P):
            yield Paragraph(body_child, document)
            continue
        if isinstance(body_child, CT_Tbl):
            yield Table(body_child, document)
# Matches a bare EBD key: "E_" followed by exactly four digits, e.g. "E_0003".
_ebd_key_pattern = re.compile(r"^E_\d{4}$")
# Matches a heading made of an EBD key, an underscore and a title, e.g. "E_0003_Bestellung der Aggregationsebene".
# The title group is lazy on purpose: a greedy ".*" would swallow trailing spaces/tabs before the "\s*" gets a
# chance to consume them, so the captured title would keep its trailing whitespace.
_ebd_key_with_heading_pattern = re.compile(r"^(?P<key>E_\d{4})_(?P<title>.*?)\s*$")
def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> List[Table]:
    """
    Open the file at docx_file_path and return the docx tables that belong to the given ebd_key.

    A single EBD table may be spread over more than one docx table, because the documents are edited manually
    and not always consistently by EDI@Energy; all adjacent tables are therefore collected.
    Raises a ValueError if the ebd_key is malformed or if no table was found for it.
    """
    if not _ebd_key_pattern.match(ebd_key):
        raise ValueError(f"The ebd_key '{ebd_key}' does not match {_ebd_key_pattern.pattern}")
    # One shared generator: the inner loop below continues exactly where the outer loop stopped.
    body_items = _get_tables_and_paragaphs(get_document(docx_file_path))
    heading_seen: bool = False
    collected: List[Table] = []
    for body_item in body_items:
        if isinstance(body_item, Paragraph):
            # Assumptions:
            # 1. each EBD table is directly preceded by a paragraph whose text starts with its EBD key
            # 2. there are no duplicates
            # Any other paragraph resets the flag again.
            heading_seen = body_item.text.startswith(ebd_key)
            continue
        if not (isinstance(body_item, Table) and heading_seen):
            continue
        collected.append(body_item)
        # The EBD table may span several pages. Sometimes the authors use one proper docx table for that (which
        # python-docx treats just like a single-page table, nothing to do here), sometimes they create several
        # separate docx tables that belong together. The latter have to be gathered here.
        for follower in body_items:
            if isinstance(follower, Table):
                # several single tables on adjacent pages
                collected.append(follower)
                continue
            if isinstance(follower, Paragraph) and not follower.text.strip():
                # blank lines between two parts of the same EBD table are tolerated
                continue
            # anything else ends the run of tables for this EBD key
            break
        # the requested table(s) have been found; no need to look at the rest of the document
        break
    if not collected:
        raise ValueError(f"EBD Table '{ebd_key}' was not found.")
    return collected
def get_all_ebd_keys(docx_file_path: Path) -> Dict[str, str]:
    """
    Collect every EBD key that appears as a heading paragraph in the given file.

    Returns a dictionary that maps each EBD key to its title,
    e.g. key: "E_0003", value: "Bestellung der Aggregationsebene RZ prüfen".
    If the same key occurs in more than one paragraph, the title of the last occurrence is kept.
    """
    heading_matches = (
        _ebd_key_with_heading_pattern.match(paragraph.text) for paragraph in get_document(docx_file_path).paragraphs
    )
    # paragraphs that are not EBD headings yield None and are dropped
    return {found.group("key"): found.group("title") for found in heading_matches if found is not None}