Self-Test: Add html diffing (elastic#635)

nik9000 · web-flow · commit 6af3ed8ef5d1 · 2019-02-22T14:56:35.000-05:00
Compare the html built by AsciiDoc and Asciidoctor as part of the
self-test. Like all good tests, it doesn't pass at first. I've
temporarily ignored all of the failures and have added them to the
main asciidoctor issue or filed them as their own issue.
diff --git a/Dockerfile b/Dockerfile
@@ -14,6 +14,7 @@ LABEL MAINTAINERS="Nik Everett <nik@elastic.co>"
 #   * openssh-client (used by git)
 #   * openssh-server (used to forward ssh auth for git when running with --all on macOS)
 #   * perl-base
+#   * python (is python2)
 #   * xsltproc
 # * To install rubygems for asciidoctor
 #   * build-essential
@@ -23,7 +24,8 @@ LABEL MAINTAINERS="Nik Everett <nik@elastic.co>"
 #   * ruby
 #   * ruby-dev
 # * Used to check the docs build in CI
-#   * pycodestyle
+#   * python3
+#   * python3-pip
 RUN install_packages \
   bash \
   build-essential \
@@ -38,8 +40,9 @@ RUN install_packages \
   openssh-client \
   openssh-server \
   perl-base \
-  pycodestyle \
   python \
+  python3 \
+  python3-pip \
   ruby \
   ruby-dev \
   unzip \
@@ -66,3 +69,13 @@ RUN gem install --no-document \
   rubocop:0.64.0 \
   rspec:3.8.0 \
   thread_safe:0.3.6
+
+# Python packages installed with pip3:
+# * Used to test the docs build
+#   * beautifulsoup4
+#   * lxml
+#   * pycodestyle
+RUN pip3 install \
+  beautifulsoup4==4.7.1 \
+  lxml==4.3.1 \
+  pycodestyle==2.5.0
diff --git a/Makefile b/Makefile
@@ -8,10 +8,10 @@ MAKEFLAGS += --silent
 check: unit_test integration_test
 
 .PHONY: unit_test
-unit_test: build_docs_check asciidoctor_check
+unit_test: style asciidoctor_check
 
-.PHONY: build_docs_check
-build_docs_check:
+.PHONY: style
+style: build_docs
 	pycodestyle build_docs
 
 .PHONY: asciidoctor_check
diff --git a/integtest/Makefile b/integtest/Makefile
@@ -3,9 +3,14 @@ MAKEFLAGS += --silent
 
 .PHONY: check
 check: \
+	style \
 	readme_expected_files readme_same_files \
 	includes_expected_files includes_same_files
 
+.PHONY: style
+style: html_diff
+	pycodestyle html_diff
+
 define STANDARD_EXPECTED_FILES=
 	[ -s $^/index.html ]
 	[ -s $^/docs.js ]
@@ -39,6 +44,11 @@ readme_expected_files: /tmp/readme_asciidoc
 			| grep -v snippets/blocks \
 		) \
 		<(cd /tmp/$*_asciidoctor && find * -type f | sort)
+	# The grep -v below skips files with known asciidoctor issues
+	for file in $$(cd /tmp/$*_asciidoc && find * -type f -name '*.html' \
+			| grep -v 'blocks\|changes\|experimental\|multi-part'); do \
+		./html_diff /tmp/$*_asciidoc/$$file /tmp/$*_asciidoctor/$$file; \
+	done
 
 define BD=
 /docs_build/build_docs.pl --in_standard_docker --out $@
diff --git a/integtest/html_diff b/integtest/html_diff
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+
+# Script to compare two html files, ignoring differences that we consider
+# to be unimportant. The output is a unified diff of formatted html meant
+# to be readable and precise at identifying differences.
+#
+# This script is designed to be run in the container managed by the
+# Dockerfile at the root of this repository.
+
+
+from bs4 import BeautifulSoup, NavigableString
+import difflib
+import re
+
+
+def normalize_html(html):
+    """Return html re-serialized with the differences we expect between
+    AsciiDoc's and Asciidoctor's output removed so the rest can be diffed.
+    """
+    # Collapse runs of whitespace in the direct text of every element except
+    # script, pre, code, and style, kind of like a browser does.
+    soup = BeautifulSoup(html, 'lxml')
+    for e in soup.select(':not(script,pre,code,style)'):
+        for part in e:
+            if isinstance(part, NavigableString):
+                crunched = NavigableString(re.sub(r'\s+', ' ', part))
+                if crunched != part:
+                    part.replace_with(crunched)
+    # Format the html with indentation so we can *see* things
+    html = soup.prettify()
+    # Remove the zero width space that asciidoctor adds after each horizontal
+    # ellipsis. They don't hurt anything but asciidoc doesn't make them.
+    html = html.replace('\u2026\u200b', '\u2026')
+    # Temporary workaround for known issues
+    html = html.replace('class="edit_me" href="/./', 'class="edit_me" href="')
+    html = re.sub(
+        r'(?m)^\s+<div class="console_widget" data-snippet="[^"]+">'
+        r'\s+</div>\n', '', html)
+    html = html.replace('\\&lt;1&gt;', '&lt;1&gt;')
+    return html
+
+
+def html_diff(lhs_name, lhs, rhs_name, rhs):
+    """Compare two html blobs, ignoring expected differences between AsciiDoc
+    and Asciidoctor. The result is a generator for lines in the diff report.
+    If it is entirely empty then there is no diff.
+    """
+    lhs_lines = normalize_html(lhs).splitlines()
+    rhs_lines = normalize_html(rhs).splitlines()
+    return difflib.unified_diff(
+            lhs_lines,
+            rhs_lines,
+            fromfile=lhs_name,
+            tofile=rhs_name,
+            lineterm='')
+
+
+def html_file_diff(lhs, rhs):
+    """Compare two html files, ignoring expected differences between AsciiDoc
+    and Asciidoctor. The result is a generator for lines in the diff report.
+    If it is entirely empty then there is no diff.
+    """
+    with open(lhs, encoding='utf-8') as lhs_file:
+        lhs_text = lhs_file.read()
+    with open(rhs, encoding='utf-8') as rhs_file:
+        rhs_text = rhs_file.read()
+    return html_diff(lhs, lhs_text, rhs, rhs_text)
+
+
+if __name__ == '__main__':
+    import sys
+    if len(sys.argv) != 3:
+        print("Expected exactly 2 arguments but got %s" % sys.argv[1:])
+        exit(1)
+    had_diff = False
+    for line in html_file_diff(sys.argv[1], sys.argv[2]):
+        had_diff = True
+        # print doesn't like to print utf-8 in all cases but buffer.write is ok
+        sys.stderr.buffer.write(line.encode('utf-8'))
+        sys.stderr.buffer.write("\n".encode('utf-8'))
+    exit(1 if had_diff else 0)