-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy path__init__.py
More file actions
87 lines (76 loc) · 3.58 KB
/
__init__.py
File metadata and controls
87 lines (76 loc) · 3.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
"""
Contains high level functions to process .docx files
"""
import re
from io import BytesIO
from pathlib import Path
from typing import Dict, Generator, Union
from docx import Document # type:ignore[import]
from docx.oxml import CT_P, CT_Tbl # type:ignore[import]
from docx.table import Table # type:ignore[import]
from docx.text.paragraph import Paragraph # type:ignore[import]
def get_document(docx_file_path: Path) -> Document:
    """
    reads the file at docx_file_path completely into memory and returns it as a python-docx Document
    """
    with open(docx_file_path, "rb") as docx_file:
        raw_content = docx_file.read()
    # The recipe from
    # https://python-docx.readthedocs.io/en/latest/user/documents.html#opening-a-file-like-document
    # was tried first, but StringIO was replaced with BytesIO (without explicit 'utf-8') because of:
    # UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 605: character maps to <undefined>
    with BytesIO(raw_content) as source_stream:
        # leaving the with-block closes the in-memory stream, regardless of whether Document() raised
        return Document(source_stream)
def _get_tables_and_paragaphs(document: Document) -> Generator[Union[Table, Paragraph], None, None]:
    """
    Walks over the direct children of the document body and yields each paragraph and each table in document order.
    document.tables and document.paragraphs are separate collections, so they cannot tell you which paragraph
    precedes which table; iterating the body children keeps that ordering.
    Children that are neither a paragraph nor a table element are skipped.
    """
    body = document.element.body
    for child in body.iterchildren():
        if isinstance(child, CT_Tbl):
            yield Table(child, document)
            continue
        if isinstance(child, CT_P):
            yield Paragraph(child, document)
# Matches a bare EBD key: the literal prefix "E_" followed by exactly four digits, e.g. "E_0003".
_ebd_key_pattern = re.compile(r"^E_\d{4}$")
# Matches a heading of the form "<key>_<title>", e.g. "E_0003_Bestellung der Aggregationsebene RZ prüfen".
# The title group is lazy (".*?") on purpose: with a greedy ".*" the title swallowed any trailing spaces/tabs
# itself and the following "\s*" never got a chance to strip them.
_ebd_key_with_heading_pattern = re.compile(r"^(?P<key>E_\d{4})_(?P<title>.*?)\s*$")
def get_ebd_docx_table(docx_file_path: Path, ebd_key: str) -> Table:
    """
    Loads the document at docx_file_path and returns the table that belongs to the given ebd_key.
    Raises a ValueError if the ebd_key does not match the expected key pattern or if no matching table was found.
    """
    if not _ebd_key_pattern.match(ebd_key):
        raise ValueError(f"The ebd_key '{ebd_key}' does not match {_ebd_key_pattern.pattern}")
    # Assumptions:
    # 1. each EBD table is preceded by a paragraph whose text starts with the respective EBD key
    # 2. there are no duplicates
    # The flag is re-evaluated on every paragraph, so only the most recent paragraph before a table counts.
    latest_paragraph_starts_with_key = False
    for element in _get_tables_and_paragaphs(get_document(docx_file_path)):
        if isinstance(element, Paragraph):
            latest_paragraph_starts_with_key = element.text.startswith(ebd_key)
        elif latest_paragraph_starts_with_key and isinstance(element, Table):
            return element
    raise ValueError(f"EBD Table '{ebd_key}' was not found.")
def get_all_ebd_keys(docx_file_path: Path) -> Dict[str, str]:
    """
    Collects all EBD keys found in the paragraphs of the given file.
    Returns a dictionary that maps each EBD key to its EBD title.
    E.g. key: "E_0003", value: "Bestellung der Aggregationsebene RZ prüfen"
    If the same key occurs in more than one paragraph, the title of the last occurrence is kept.
    """
    heading_matches = (
        _ebd_key_with_heading_pattern.match(paragraph.text) for paragraph in get_document(docx_file_path).paragraphs
    )
    # paragraphs that do not look like "<key>_<title>" produce no match and are left out
    return {
        heading_match.group("key"): heading_match.group("title")
        for heading_match in heading_matches
        if heading_match is not None
    }