Skip to content

Commit 8e3915e

Browse files
OLILHRhf-kklein
and authored
Replace docx2table and table2graph by ebdamame and rebdhuhn respectively; Delete main.py (#195)
* Replace docx2table and table2graph by ebdamame and rebdhuhn respectively * Ensure style attribute of a paragraph is not None * Directly access submodule by importing from docx.document * Raise ValueError in case no cell with a valid step number is found * Revert commit 70b3d42 * ⬆ Upgrade requirements.txt * ⬆ pip-compile-multi -d dev_requirements * Raise ValueError in case no cell with a valid step number is found * fix typing confusion * foo --------- Co-authored-by: konstantin <konstantin.klein@hochfrequenz.de>
1 parent bb4d714 commit 8e3915e

11 files changed

Lines changed: 47 additions & 207 deletions

.gitignore

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,5 +132,3 @@ dmypy.json
132132

133133
# vscode settings
134134
.vscode/
135-
136-
src/_ebdamame_version.py

main.py

Lines changed: 4 additions & 164 deletions
Original file line numberDiff line numberDiff line change
@@ -1,164 +1,4 @@
1-
"""
2-
a small click based script to extract all EBDs from a given file.
3-
"""
4-
5-
# invoke like this:
6-
# main.py -i unittests/test_data/ebd20230619_v33.docx
7-
# -o ../machine-readable_entscheidungsbaumdiagramme/FV2304
8-
# -t json -t dot -t svg -t puml
9-
# or
10-
# main.py -i unittests/test_data/ebd20230629_v34.docx
11-
# -o ../machine-readable_entscheidungsbaumdiagramme/FV2310
12-
# -t json -t dot -t svg -t puml
13-
import json
14-
from pathlib import Path
15-
from typing import Literal
16-
17-
import cattrs
18-
import click
19-
from ebdtable2graph import convert_graph_to_plantuml, convert_table_to_graph
20-
from ebdtable2graph.graphviz import convert_dot_to_svg_kroki, convert_graph_to_dot
21-
from ebdtable2graph.models import EbdGraph, EbdTable
22-
from ebdtable2graph.models.errors import (
23-
EbdCrossReferenceNotSupportedError,
24-
EndeInWrongColumnError,
25-
NotExactlyTwoOutgoingEdgesError,
26-
OutcomeCodeAmbiguousError,
27-
PathsNotGreaterThanOneError,
28-
)
29-
from ebdtable2graph.plantuml import GraphTooComplexForPlantumlError
30-
31-
# pylint:disable=import-error
32-
from ebdamame import TableNotFoundError, get_all_ebd_keys, get_ebd_docx_tables # type:ignore[import]
33-
from ebdamame.docxtableconverter import DocxTableConverter # type:ignore[import]
34-
35-
36-
def _dump_puml(puml_path: Path, ebd_graph: EbdGraph) -> None:
37-
plantuml_code = convert_graph_to_plantuml(ebd_graph)
38-
with open(puml_path, "w+", encoding="utf-8") as uml_file:
39-
uml_file.write(plantuml_code)
40-
41-
42-
def _dump_dot(dot_path: Path, ebd_graph: EbdGraph) -> None:
43-
dot_code = convert_graph_to_dot(ebd_graph)
44-
with open(dot_path, "w+", encoding="utf-8") as uml_file:
45-
uml_file.write(dot_code)
46-
47-
48-
def _dump_svg(svg_path: Path, ebd_graph: EbdGraph) -> None:
49-
dot_code = convert_graph_to_dot(ebd_graph)
50-
svg_code = convert_dot_to_svg_kroki(dot_code)
51-
with open(svg_path, "w+", encoding="utf-8") as svg_file:
52-
svg_file.write(svg_code)
53-
54-
55-
def _dump_json(json_path: Path, ebd_table: EbdTable) -> None:
56-
with open(json_path, "w+", encoding="utf-8") as json_file:
57-
json.dump(cattrs.unstructure(ebd_table), json_file, ensure_ascii=False, indent=2, sort_keys=True)
58-
59-
60-
@click.command()
61-
@click.option(
62-
"-i",
63-
"--input_path",
64-
type=click.Path(exists=True, dir_okay=False, file_okay=True, path_type=Path),
65-
prompt="Input DOCX File",
66-
help="Path of a .docx file from which the EBDs shall be extracted",
67-
)
68-
@click.option(
69-
"-o",
70-
"--output_path",
71-
type=click.Path(exists=False, dir_okay=True, file_okay=False, path_type=Path),
72-
default="output",
73-
prompt="Output directory",
74-
help="Define the path where you want to save the generated files",
75-
)
76-
@click.option(
77-
"-t",
78-
"--export_types",
79-
type=click.Choice(["puml", "dot", "json", "svg"], case_sensitive=False),
80-
multiple=True,
81-
help="Choose which file you'd like to create",
82-
)
83-
# pylint:disable=too-many-locals, too-many-branches, too-many-statements,
84-
def main(input_path: Path, output_path: Path, export_types: list[Literal["puml", "dot", "json", "svg"]]):
85-
"""
86-
A program to get a machine-readable version of the AHBs docx files published by edi@energy.
87-
"""
88-
if output_path.exists():
89-
click.secho(f"The output directory '{output_path}' exists already.", fg="yellow")
90-
else:
91-
output_path.mkdir(parents=True)
92-
click.secho(f"Created a new directory at {output_path}", fg="green")
93-
all_ebd_keys = get_all_ebd_keys(input_path)
94-
error_sources: dict[type, list[str]] = {}
95-
96-
def handle_known_error(error: Exception, ebd_key: str) -> None:
97-
click.secho(f"Error while processing EBD {ebd_key}: {error}", fg="yellow")
98-
if type(error) not in error_sources:
99-
error_sources[type(error)] = []
100-
error_sources[type(error)].append(ebd_key)
101-
102-
for ebd_key, (ebd_title, ebd_kapitel) in all_ebd_keys.items():
103-
click.secho(f"Processing EBD {ebd_kapitel} '{ebd_key}' ({ebd_title})")
104-
try:
105-
docx_tables = get_ebd_docx_tables(docx_file_path=input_path, ebd_key=ebd_key)
106-
except TableNotFoundError as table_not_found_error:
107-
click.secho(f"Table not found: {ebd_key}: {str(table_not_found_error)}; Skip!", fg="yellow")
108-
continue
109-
assert ebd_kapitel is not None
110-
try:
111-
converter = DocxTableConverter(
112-
docx_tables,
113-
ebd_key=ebd_key,
114-
chapter=ebd_kapitel.chapter_title, # type:ignore[arg-type]
115-
# pylint:disable=line-too-long
116-
sub_chapter=f"{ebd_kapitel.chapter}.{ebd_kapitel.section}.{ebd_kapitel.subsection}: {ebd_kapitel.section_title}",
117-
)
118-
ebd_table = converter.convert_docx_tables_to_ebd_table()
119-
except Exception as scraping_error: # pylint:disable=broad-except
120-
click.secho(f"Error while scraping {ebd_key}: {str(scraping_error)}; Skip!", fg="red")
121-
continue
122-
if "json" in export_types:
123-
_dump_json(output_path / Path(f"{ebd_key}.json"), ebd_table)
124-
click.secho(f"💾 Successfully exported '{ebd_key}.json'")
125-
try:
126-
ebd_graph = convert_table_to_graph(ebd_table)
127-
except (EbdCrossReferenceNotSupportedError, EndeInWrongColumnError, OutcomeCodeAmbiguousError) as known_issue:
128-
handle_known_error(known_issue, ebd_key)
129-
continue
130-
except Exception as unknown_error: # pylint:disable=broad-except
131-
click.secho(f"Error while graphing {ebd_key}: {str(unknown_error)}; Skip!", fg="red")
132-
continue
133-
if "puml" in export_types:
134-
try:
135-
_dump_puml(output_path / Path(f"{ebd_key}.puml"), ebd_graph)
136-
click.secho(f"💾 Successfully exported '{ebd_key}.puml'")
137-
except AssertionError as assertion_error:
138-
# https://github.com/Hochfrequenz/ebdtable2graph/issues/35
139-
click.secho(str(assertion_error), fg="red")
140-
except (NotExactlyTwoOutgoingEdgesError, GraphTooComplexForPlantumlError) as known_issue:
141-
handle_known_error(known_issue, ebd_key)
142-
except Exception as general_error: # pylint:disable=broad-exception-caught
143-
click.secho(f"Error while exporting {ebd_key} as UML: {str(general_error)}; Skip!", fg="yellow")
144-
145-
try:
146-
if "dot" in export_types:
147-
_dump_dot(output_path / Path(f"{ebd_key}.dot"), ebd_graph)
148-
click.secho(f"💾 Successfully exported '{ebd_key}.dot'")
149-
if "svg" in export_types:
150-
_dump_svg(output_path / Path(f"{ebd_key}.svg"), ebd_graph)
151-
click.secho(f"💾 Successfully exported '{ebd_key}.svg'")
152-
except PathsNotGreaterThanOneError as known_issue:
153-
handle_known_error(known_issue, ebd_key)
154-
except AssertionError as assertion_error:
155-
# e.g. AssertionError: If indegree > 1, the number of paths should always be greater than 1 too.
156-
click.secho(str(assertion_error), fg="red")
157-
# both the SVG and dot path require graphviz to work, hence the common error handling block
158-
click.secho(json.dumps({str(k): v for k, v in error_sources.items()}, indent=4))
159-
click.secho("🏁Finished")
160-
161-
162-
if __name__ == "__main__":
163-
# the parameter arguments gets provided over the CLI
164-
main() # pylint:disable=no-value-for-parameter
1+
"""
2+
the script to convert EBDs from .docx to SVGs visualizations using both the ebdamame and rebdhuhn libraries
3+
was relocated to https://github.com/Hochfrequenz/ebd_toolchain
4+
"""

pyproject.toml

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ classifiers = [
1717
"Programming Language :: Python :: 3.12",
1818
]
1919
dependencies = [
20-
"ebdtable2graph>=0.1.19",
20+
"rebdhuhn>=0.2.3",
2121
"python-docx",
2222
"more_itertools",
2323
"attrs",
@@ -52,10 +52,7 @@ fragments = [{ path = "README.md" }]
5252
source = "vcs"
5353

5454
[tool.hatch.build.hooks.vcs]
55-
version-file = "src/_ebddocx2table_version.py"
56-
template = '''
57-
version = "{version}"
58-
'''
55+
version-file = "src/ebdamame/version.py"
5956

6057
[tool.hatch.build.targets.sdist]
6158
exclude = ["/unittests"]

requirements.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
ebdtable2graph>=0.1.20
1+
rebdhuhn>=0.2.3
22
python-docx
33
more_itertools
44
attrs

requirements.txt

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,43 @@
11
#
2-
# This file is autogenerated by pip-compile with Python 3.11
2+
# This file is autogenerated by pip-compile with Python 3.12
33
# by the following command:
44
#
5-
# pip-compile pyproject.toml
5+
# pip-compile requirements.in
66
#
77
attrs==23.2.0
88
# via
99
# cattrs
10-
# ebdamame (pyproject.toml)
11-
# ebdtable2graph
12-
cattrs==22.2.0
13-
# via ebdtable2graph
14-
certifi==2023.7.22
10+
# rebdhuhn
11+
cattrs==23.2.3
12+
# via rebdhuhn
13+
certifi==2024.2.2
1514
# via requests
16-
charset-normalizer==2.1.1
15+
charset-normalizer==3.3.2
1716
# via requests
1817
click==8.1.7
19-
# via ebdamame (pyproject.toml)
18+
# via -r requirements.in
2019
colorama==0.4.6
2120
# via click
22-
ebdtable2graph==0.1.20
23-
# via ebdamame (pyproject.toml)
2421
idna==3.7
2522
# via requests
26-
lxml==4.9.3
23+
lxml==5.2.1
2724
# via
28-
# ebdtable2graph
2925
# python-docx
26+
# rebdhuhn
3027
# svgutils
3128
more-itertools==10.2.0
32-
# via ebdamame (pyproject.toml)
33-
networkx==2.8.8
34-
# via ebdtable2graph
29+
# via -r requirements.in
30+
networkx==3.3
31+
# via rebdhuhn
3532
python-docx==1.1.2
36-
# via ebdamame (pyproject.toml)
33+
# via -r requirements.in
34+
rebdhuhn==0.2.3
35+
# via -r requirements.in
3736
requests==2.31.0
38-
# via ebdtable2graph
37+
# via rebdhuhn
3938
svgutils==0.3.4
40-
# via ebdtable2graph
39+
# via rebdhuhn
4140
typing-extensions==4.11.0
4241
# via python-docx
43-
urllib3==1.26.18
42+
urllib3==2.2.1
4443
# via requests

setup.cfg

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
# The content of this file is only necessary for python packages
22
[metadata]
3-
name = ebddocx2table
3+
name = ebdamame
44
author = Hochfrequenz Unternehmensberatung GmbH
55
author_email = info@hochfrequenz.de
66
description = library to scrape .docx files with Entscheidungsbaumdiagramm tables into a truely machine readable structure
77
long_description = file: README.md
88
long_description_content_type = text/markdown; charset=UTF-8
9-
url = https://github.com/Hochfrequenz/ebddocx2table
9+
url = https://github.com/Hochfrequenz/ebdamame
1010
project_urls =
11-
Documentation = https://github.com/Hochfrequenz/ebddocx2table
12-
Code = https://github.com/Hochfrequenz/ebddocx2table
13-
Bug tracker = https://github.com/Hochfrequenz/ebddocx2table/issues
11+
Documentation = https://github.com/Hochfrequenz/ebdamame
12+
Code = https://github.com/Hochfrequenz/ebdamame
13+
Bug tracker = https://github.com/Hochfrequenz/ebdamame/issues
1414
classifiers =
1515
Development Status :: 4 - Beta
1616
Intended Audience :: Developers
@@ -31,7 +31,7 @@ zip_safe = False
3131
include_package_data = True
3232
python_requires = >=3.11
3333
install_requires =
34-
ebdtable2graph>=0.1.20
34+
rebdhuhn>=0.2.3
3535
python-docx
3636
more_itertools
3737
attrs

src/_ebddocx2table_version.py

Lines changed: 0 additions & 1 deletion
This file was deleted.

src/ebdamame/docxtableconverter.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,16 @@
1010

1111
import attrs
1212
from docx.table import Table, _Cell, _Row
13-
from ebdtable2graph.models import EbdTable, EbdTableRow, EbdTableSubRow
14-
from ebdtable2graph.models.ebd_table import _STEP_NUMBER_REGEX, EbdCheckResult, EbdTableMetaData, MultiStepInstruction
1513
from more_itertools import first, first_true, last
14+
from rebdhuhn.models.ebd_table import (
15+
_STEP_NUMBER_REGEX,
16+
EbdCheckResult,
17+
EbdTable,
18+
EbdTableMetaData,
19+
EbdTableRow,
20+
EbdTableSubRow,
21+
MultiStepInstruction,
22+
)
1623

1724
_logger = logging.getLogger(__name__)
1825

unittests/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,23 +14,23 @@
1414

1515
def get_document(datafiles, filename: str) -> DocumentType:
1616
"""
17-
a datafiles compatible wrapper around ebddocx2table.get_document
17+
a datafiles compatible wrapper around ebdamame.get_document
1818
"""
1919
path = datafiles / Path(filename)
2020
return ebdamame.get_document(path)
2121

2222

2323
def get_ebd_docx_tables(datafiles, filename: str, ebd_key: str) -> List[Table]:
2424
"""
25-
a datafiles compatible wrapper around ebddocx2table.get_ebd_docx_tables
25+
a datafiles compatible wrapper around ebdamame.get_ebd_docx_tables
2626
"""
2727
path = datafiles / Path(filename)
2828
return ebdamame.get_ebd_docx_tables(path, ebd_key=ebd_key)
2929

3030

3131
def get_all_ebd_keys(datafiles, filename: str) -> Dict[str, Tuple[str, ebdamame.EbdChapterInformation]]:
3232
"""
33-
a datafiles compatible wrapper around ebddocx2table.get_all_ebd_keys
33+
a datafiles compatible wrapper around ebdamame.get_all_ebd_keys
3434
"""
3535
path = datafiles / Path(filename)
3636
return ebdamame.get_all_ebd_keys(path)

unittests/examples.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
module contains data as we expected them to be scraped from the docx file
33
"""
44

5-
from ebdtable2graph.models import EbdTable
6-
from ebdtable2graph.models.ebd_table import (
5+
from rebdhuhn.models.ebd_table import (
76
EbdCheckResult,
7+
EbdTable,
88
EbdTableMetaData,
99
EbdTableRow,
1010
EbdTableSubRow,

0 commit comments

Comments (0)