Skip to content

Commit 8e3915e

Browse files
OLILHRhf-kklein
and authored
Replace docx2table and table2graph by ebdamame and rebdhuhn respectively; Delete main.py (#195)
* Replace docx2table and table2graph by ebdamame and rebdhuhn respectively * Ensure style attribute of a paragraph is not None * Directly access submodule by importing from docx.document * Raise ValueError in case no cell with a valid step number is found * Revert commit 70b3d42 * ⬆ Upgrade requirements.txt * ⬆ pip-compile-multi -d dev_requirements * Raise ValueError in case no cell with a valid step number is found * fix typing confusion * foo --------- Co-authored-by: konstantin <konstantin.klein@hochfrequenz.de>
1 parent bb4d714 commit 8e3915e

11 files changed

Lines changed: 47 additions & 207 deletions

.gitignore

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,5 +132,3 @@ dmypy.json
132132

133133
# vscode settings
134134
.vscode/
135-
136-
src/_ebdamame_version.py

main.py

Lines changed: 4 additions & 164 deletions
Original file line numberDiff line numberDiff line change
@@ -1,164 +1,4 @@
1-
"""
2-
a small click based script to extract all EBDs from a given file.
3-
"""
4-
5-
# invoke like this:
6-
# main.py -i unittests/test_data/ebd20230619_v33.docx
7-
# -o ../machine-readable_entscheidungsbaumdiagramme/FV2304
8-
# -t json -t dot -t svg -t puml
9-
# or
10-
# main.py -i unittests/test_data/ebd20230629_v34.docx
11-
# -o ../machine-readable_entscheidungsbaumdiagramme/FV2310
12-
# -t json -t dot -t svg -t puml
13-
import json
14-
from pathlib import Path
15-
from typing import Literal
16-
17-
import cattrs
18-
import click
19-
from ebdtable2graph import convert_graph_to_plantuml, convert_table_to_graph
20-
from ebdtable2graph.graphviz import convert_dot_to_svg_kroki, convert_graph_to_dot
21-
from ebdtable2graph.models import EbdGraph, EbdTable
22-
from ebdtable2graph.models.errors import (
23-
EbdCrossReferenceNotSupportedError,
24-
EndeInWrongColumnError,
25-
NotExactlyTwoOutgoingEdgesError,
26-
OutcomeCodeAmbiguousError,
27-
PathsNotGreaterThanOneError,
28-
)
29-
from ebdtable2graph.plantuml import GraphTooComplexForPlantumlError
30-
31-
# pylint:disable=import-error
32-
from ebdamame import TableNotFoundError, get_all_ebd_keys, get_ebd_docx_tables # type:ignore[import]
33-
from ebdamame.docxtableconverter import DocxTableConverter # type:ignore[import]
34-
35-
36-
def _dump_puml(puml_path: Path, ebd_graph: EbdGraph) -> None:
37-
plantuml_code = convert_graph_to_plantuml(ebd_graph)
38-
with open(puml_path, "w+", encoding="utf-8") as uml_file:
39-
uml_file.write(plantuml_code)
40-
41-
42-
def _dump_dot(dot_path: Path, ebd_graph: EbdGraph) -> None:
43-
dot_code = convert_graph_to_dot(ebd_graph)
44-
with open(dot_path, "w+", encoding="utf-8") as uml_file:
45-
uml_file.write(dot_code)
46-
47-
48-
def _dump_svg(svg_path: Path, ebd_graph: EbdGraph) -> None:
49-
dot_code = convert_graph_to_dot(ebd_graph)
50-
svg_code = convert_dot_to_svg_kroki(dot_code)
51-
with open(svg_path, "w+", encoding="utf-8") as svg_file:
52-
svg_file.write(svg_code)
53-
54-
55-
def _dump_json(json_path: Path, ebd_table: EbdTable) -> None:
56-
with open(json_path, "w+", encoding="utf-8") as json_file:
57-
json.dump(cattrs.unstructure(ebd_table), json_file, ensure_ascii=False, indent=2, sort_keys=True)
58-
59-
60-
@click.command()
61-
@click.option(
62-
"-i",
63-
"--input_path",
64-
type=click.Path(exists=True, dir_okay=False, file_okay=True, path_type=Path),
65-
prompt="Input DOCX File",
66-
help="Path of a .docx file from which the EBDs shall be extracted",
67-
)
68-
@click.option(
69-
"-o",
70-
"--output_path",
71-
type=click.Path(exists=False, dir_okay=True, file_okay=False, path_type=Path),
72-
default="output",
73-
prompt="Output directory",
74-
help="Define the path where you want to save the generated files",
75-
)
76-
@click.option(
77-
"-t",
78-
"--export_types",
79-
type=click.Choice(["puml", "dot", "json", "svg"], case_sensitive=False),
80-
multiple=True,
81-
help="Choose which file you'd like to create",
82-
)
83-
# pylint:disable=too-many-locals, too-many-branches, too-many-statements,
84-
def main(input_path: Path, output_path: Path, export_types: list[Literal["puml", "dot", "json", "svg"]]):
85-
"""
86-
A program to get a machine-readable version of the AHBs docx files published by edi@energy.
87-
"""
88-
if output_path.exists():
89-
click.secho(f"The output directory '{output_path}' exists already.", fg="yellow")
90-
else:
91-
output_path.mkdir(parents=True)
92-
click.secho(f"Created a new directory at {output_path}", fg="green")
93-
all_ebd_keys = get_all_ebd_keys(input_path)
94-
error_sources: dict[type, list[str]] = {}
95-
96-
def handle_known_error(error: Exception, ebd_key: str) -> None:
97-
click.secho(f"Error while processing EBD {ebd_key}: {error}", fg="yellow")
98-
if type(error) not in error_sources:
99-
error_sources[type(error)] = []
100-
error_sources[type(error)].append(ebd_key)
101-
102-
for ebd_key, (ebd_title, ebd_kapitel) in all_ebd_keys.items():
103-
click.secho(f"Processing EBD {ebd_kapitel} '{ebd_key}' ({ebd_title})")
104-
try:
105-
docx_tables = get_ebd_docx_tables(docx_file_path=input_path, ebd_key=ebd_key)
106-
except TableNotFoundError as table_not_found_error:
107-
click.secho(f"Table not found: {ebd_key}: {str(table_not_found_error)}; Skip!", fg="yellow")
108-
continue
109-
assert ebd_kapitel is not None
110-
try:
111-
converter = DocxTableConverter(
112-
docx_tables,
113-
ebd_key=ebd_key,
114-
chapter=ebd_kapitel.chapter_title, # type:ignore[arg-type]
115-
# pylint:disable=line-too-long
116-
sub_chapter=f"{ebd_kapitel.chapter}.{ebd_kapitel.section}.{ebd_kapitel.subsection}: {ebd_kapitel.section_title}",
117-
)
118-
ebd_table = converter.convert_docx_tables_to_ebd_table()
119-
except Exception as scraping_error: # pylint:disable=broad-except
120-
click.secho(f"Error while scraping {ebd_key}: {str(scraping_error)}; Skip!", fg="red")
121-
continue
122-
if "json" in export_types:
123-
_dump_json(output_path / Path(f"{ebd_key}.json"), ebd_table)
124-
click.secho(f"💾 Successfully exported '{ebd_key}.json'")
125-
try:
126-
ebd_graph = convert_table_to_graph(ebd_table)
127-
except (EbdCrossReferenceNotSupportedError, EndeInWrongColumnError, OutcomeCodeAmbiguousError) as known_issue:
128-
handle_known_error(known_issue, ebd_key)
129-
continue
130-
except Exception as unknown_error: # pylint:disable=broad-except
131-
click.secho(f"Error while graphing {ebd_key}: {str(unknown_error)}; Skip!", fg="red")
132-
continue
133-
if "puml" in export_types:
134-
try:
135-
_dump_puml(output_path / Path(f"{ebd_key}.puml"), ebd_graph)
136-
click.secho(f"💾 Successfully exported '{ebd_key}.puml'")
137-
except AssertionError as assertion_error:
138-
# https://github.com/Hochfrequenz/ebdtable2graph/issues/35
139-
click.secho(str(assertion_error), fg="red")
140-
except (NotExactlyTwoOutgoingEdgesError, GraphTooComplexForPlantumlError) as known_issue:
141-
handle_known_error(known_issue, ebd_key)
142-
except Exception as general_error: # pylint:disable=broad-exception-caught
143-
click.secho(f"Error while exporting {ebd_key} as UML: {str(general_error)}; Skip!", fg="yellow")
144-
145-
try:
146-
if "dot" in export_types:
147-
_dump_dot(output_path / Path(f"{ebd_key}.dot"), ebd_graph)
148-
click.secho(f"💾 Successfully exported '{ebd_key}.dot'")
149-
if "svg" in export_types:
150-
_dump_svg(output_path / Path(f"{ebd_key}.svg"), ebd_graph)
151-
click.secho(f"💾 Successfully exported '{ebd_key}.svg'")
152-
except PathsNotGreaterThanOneError as known_issue:
153-
handle_known_error(known_issue, ebd_key)
154-
except AssertionError as assertion_error:
155-
# e.g. AssertionError: If indegree > 1, the number of paths should always be greater than 1 too.
156-
click.secho(str(assertion_error), fg="red")
157-
# both the SVG and dot path require graphviz to work, hence the common error handling block
158-
click.secho(json.dumps({str(k): v for k, v in error_sources.items()}, indent=4))
159-
click.secho("🏁Finished")
160-
161-
162-
if __name__ == "__main__":
163-
# the parameter arguments gets provided over the CLI
164-
main() # pylint:disable=no-value-for-parameter
1+
"""
2+
the script to convert EBDs from .docx to SVGs visualizations using both the ebdamame and rebdhuhn libraries
3+
was relocated to https://github.com/Hochfrequenz/ebd_toolchain
4+
"""

pyproject.toml

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ classifiers = [
1717
"Programming Language :: Python :: 3.12",
1818
]
1919
dependencies = [
20-
"ebdtable2graph>=0.1.19",
20+
"rebdhuhn>=0.2.3",
2121
"python-docx",
2222
"more_itertools",
2323
"attrs",
@@ -52,10 +52,7 @@ fragments = [{ path = "README.md" }]
5252
source = "vcs"
5353

5454
[tool.hatch.build.hooks.vcs]
55-
version-file = "src/_ebddocx2table_version.py"
56-
template = '''
57-
version = "{version}"
58-
'''
55+
version-file = "src/ebdamame/version.py"
5956

6057
[tool.hatch.build.targets.sdist]
6158
exclude = ["/unittests"]

requirements.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
ebdtable2graph>=0.1.20
1+
rebdhuhn>=0.2.3
22
python-docx
33
more_itertools
44
attrs

requirements.txt

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,43 @@
11
#
2-
# This file is autogenerated by pip-compile with Python 3.11
2+
# This file is autogenerated by pip-compile with Python 3.12
33
# by the following command:
44
#
5-
# pip-compile pyproject.toml
5+
# pip-compile requirements.in
66
#
77
attrs==23.2.0
88
# via
99
# cattrs
10-
# ebdamame (pyproject.toml)
11-
# ebdtable2graph
12-
cattrs==22.2.0
13-
# via ebdtable2graph
14-
certifi==2023.7.22
10+
# rebdhuhn
11+
cattrs==23.2.3
12+
# via rebdhuhn
13+
certifi==2024.2.2
1514
# via requests
16-
charset-normalizer==2.1.1
15+
charset-normalizer==3.3.2
1716
# via requests
1817
click==8.1.7
19-
# via ebdamame (pyproject.toml)
18+
# via -r requirements.in
2019
colorama==0.4.6
2120
# via click
22-
ebdtable2graph==0.1.20
23-
# via ebdamame (pyproject.toml)
2421
idna==3.7
2522
# via requests
26-
lxml==4.9.3
23+
lxml==5.2.1
2724
# via
28-
# ebdtable2graph
2925
# python-docx
26+
# rebdhuhn
3027
# svgutils
3128
more-itertools==10.2.0
32-
# via ebdamame (pyproject.toml)
33-
networkx==2.8.8
34-
# via ebdtable2graph
29+
# via -r requirements.in
30+
networkx==3.3
31+
# via rebdhuhn
3532
python-docx==1.1.2
36-
# via ebdamame (pyproject.toml)
33+
# via -r requirements.in
34+
rebdhuhn==0.2.3
35+
# via -r requirements.in
3736
requests==2.31.0
38-
# via ebdtable2graph
37+
# via rebdhuhn
3938
svgutils==0.3.4
40-
# via ebdtable2graph
39+
# via rebdhuhn
4140
typing-extensions==4.11.0
4241
# via python-docx
43-
urllib3==1.26.18
42+
urllib3==2.2.1
4443
# via requests

setup.cfg

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
# The content of this file is only necessary for python packages
22
[metadata]
3-
name = ebddocx2table
3+
name = ebdamame
44
author = Hochfrequenz Unternehmensberatung GmbH
55
author_email = info@hochfrequenz.de
66
description = library to scrape .docx files with Entscheidungsbaumdiagramm tables into a truely machine readable structure
77
long_description = file: README.md
88
long_description_content_type = text/markdown; charset=UTF-8
9-
url = https://github.com/Hochfrequenz/ebddocx2table
9+
url = https://github.com/Hochfrequenz/ebdamame
1010
project_urls =
11-
Documentation = https://github.com/Hochfrequenz/ebddocx2table
12-
Code = https://github.com/Hochfrequenz/ebddocx2table
13-
Bug tracker = https://github.com/Hochfrequenz/ebddocx2table/issues
11+
Documentation = https://github.com/Hochfrequenz/ebdamame
12+
Code = https://github.com/Hochfrequenz/ebdamame
13+
Bug tracker = https://github.com/Hochfrequenz/ebdamame/issues
1414
classifiers =
1515
Development Status :: 4 - Beta
1616
Intended Audience :: Developers
@@ -31,7 +31,7 @@ zip_safe = False
3131
include_package_data = True
3232
python_requires = >=3.11
3333
install_requires =
34-
ebdtable2graph>=0.1.20
34+
rebdhuhn>=0.2.3
3535
python-docx
3636
more_itertools
3737
attrs

src/_ebddocx2table_version.py

Lines changed: 0 additions & 1 deletion
This file was deleted.

src/ebdamame/docxtableconverter.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,16 @@
1010

1111
import attrs
1212
from docx.table import Table, _Cell, _Row
13-
from ebdtable2graph.models import EbdTable, EbdTableRow, EbdTableSubRow
14-
from ebdtable2graph.models.ebd_table import _STEP_NUMBER_REGEX, EbdCheckResult, EbdTableMetaData, MultiStepInstruction
1513
from more_itertools import first, first_true, last
14+
from rebdhuhn.models.ebd_table import (
15+
_STEP_NUMBER_REGEX,
16+
EbdCheckResult,
17+
EbdTable,
18+
EbdTableMetaData,
19+
EbdTableRow,
20+
EbdTableSubRow,
21+
MultiStepInstruction,
22+
)
1623

1724
_logger = logging.getLogger(__name__)
1825

unittests/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,23 +14,23 @@
1414

1515
def get_document(datafiles, filename: str) -> DocumentType:
1616
"""
17-
a datafiles compatible wrapper around ebddocx2table.get_document
17+
a datafiles compatible wrapper around ebdamame.get_document
1818
"""
1919
path = datafiles / Path(filename)
2020
return ebdamame.get_document(path)
2121

2222

2323
def get_ebd_docx_tables(datafiles, filename: str, ebd_key: str) -> List[Table]:
2424
"""
25-
a datafiles compatible wrapper around ebddocx2table.get_ebd_docx_tables
25+
a datafiles compatible wrapper around ebdamame.get_ebd_docx_tables
2626
"""
2727
path = datafiles / Path(filename)
2828
return ebdamame.get_ebd_docx_tables(path, ebd_key=ebd_key)
2929

3030

3131
def get_all_ebd_keys(datafiles, filename: str) -> Dict[str, Tuple[str, ebdamame.EbdChapterInformation]]:
3232
"""
33-
a datafiles compatible wrapper around ebddocx2table.get_all_ebd_keys
33+
a datafiles compatible wrapper around ebdamame.get_all_ebd_keys
3434
"""
3535
path = datafiles / Path(filename)
3636
return ebdamame.get_all_ebd_keys(path)

unittests/examples.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
module contains data as we expected them to be scraped from the docx file
33
"""
44

5-
from ebdtable2graph.models import EbdTable
6-
from ebdtable2graph.models.ebd_table import (
5+
from rebdhuhn.models.ebd_table import (
76
EbdCheckResult,
7+
EbdTable,
88
EbdTableMetaData,
99
EbdTableRow,
1010
EbdTableSubRow,

0 commit comments

Comments (0)