|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +# Script to compare two html files, ignoring differences that we consider |
| 4 | +# to be unimportant. The output is a unified diff of formatted html meant |
| 5 | +# to be readable and precise at identifying differences. |
| 6 | +# |
| 7 | +# This script is designed to be run in the container managed by the |
| 8 | +# Dockerfile at the root of this repository. |
| 9 | + |
| 10 | + |
| 11 | +from bs4 import BeautifulSoup, NavigableString |
| 12 | +import difflib |
| 13 | +import re |
| 14 | + |
| 15 | + |
def normalize_html(html):
    """Normalize html to remove expected differences between AsciiDoc's
    output and Asciidoctor's output.

    The markup is parsed with BeautifulSoup's lxml parser, runs of
    whitespace inside text nodes are collapsed to a single space, the tree
    is pretty-printed and finally a few textual workarounds are applied.

    html -- the markup to normalize.

    Returns the normalized, pretty-printed markup as a string.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Replace many whitespace characters with a single space in some elements
    # kind of like a browser does.
    # NOTE(review): the selector only skips strings that are *direct*
    # children of script/pre/code/style; text in tags nested inside them
    # (e.g. a span inside a pre) still looks like it is collapsed - confirm
    # that this is intended.
    for element in soup.select(':not(script,pre,code,style)'):
        # Walk a snapshot of the children: replace_with edits the element's
        # child list and we don't want to iterate a list while it changes.
        for part in list(element.contents):
            if not isinstance(part, NavigableString):
                continue
            crunched = re.sub(r'\s+', ' ', part)
            if crunched != part:
                # Comments, CDATA sections and friends are NavigableString
                # subclasses. Rebuild the node with its own class so a
                # comment stays a comment instead of becoming visible text.
                part.replace_with(type(part)(crunched))
    # Format the html with indentation so we can *see* things
    html = soup.prettify()
    # Remove the zero width space that asciidoctor adds after each horizontal
    # ellipsis. They don't hurt anything but asciidoc doesn't make them
    html = html.replace('\u2026\u200b', '\u2026')
    # Temporary workaround for known issues
    html = html.replace('class="edit_me" href="/./', 'class="edit_me" href="')
    # Drop console_widget divs whose body is only whitespace
    html = re.sub(
        r'(?m)^\s+<div class="console_widget" data-snippet="[^"]+">'
        r'\s+</div>\n', '', html)
    # Turn an escaped callout marker back into a plain one
    html = html.replace('\\<1>', '<1>')
    return html
| 41 | + |
| 42 | + |
def html_diff(lhs_name, lhs, rhs_name, rhs):
    """Diff two html blobs after normalizing away the differences that are
    expected between AsciiDoc and Asciidoctor.

    lhs_name, rhs_name -- labels used for the "from" and "to" headers of
                          the unified diff.
    lhs, rhs           -- the html text of each side.

    Returns a generator of unified diff lines without line terminators. A
    generator that yields nothing means the two sides are equivalent.
    """
    # Both sides go through the same normalization and are split into lines,
    # left side first.
    before, after = (
        normalize_html(blob).splitlines() for blob in (lhs, rhs))
    labels = {'fromfile': lhs_name, 'tofile': rhs_name}
    # lineterm='' because splitlines already stripped the newlines
    return difflib.unified_diff(before, after, lineterm='', **labels)
| 56 | + |
| 57 | + |
def html_file_diff(lhs, rhs):
    """Diff two html files after normalizing away the differences that are
    expected between AsciiDoc and Asciidoctor.

    lhs, rhs -- paths of the files to compare. Both are read as utf-8 and
                the paths double as the labels in the diff headers.

    Returns a generator of unified diff lines. A generator that yields
    nothing means the two files are equivalent.
    """
    def slurp(path):
        # Read the whole file as utf-8 text, closing it straight away
        with open(path, encoding='utf-8') as handle:
            return handle.read()

    # Read the left file completely before touching the right one
    lhs_text = slurp(lhs)
    rhs_text = slurp(rhs)
    return html_diff(lhs, lhs_text, rhs, rhs_text)
| 68 | + |
| 69 | + |
# Command line entry point: takes the paths of the two html files to compare,
# writes the diff to stderr and exits with 1 if there was a diff (or the
# arguments were wrong) and 0 if the files are equivalent.
if __name__ == '__main__':
    import sys
    if len(sys.argv) != 3:
        print("Expected exactly 2 arguments but got %s" % sys.argv[1:])
        # sys.exit rather than the bare exit() helper: the latter is
        # installed by the site module for interactive use and isn't
        # guaranteed to exist (e.g. under python -S).
        sys.exit(1)
    had_diff = False
    for line in html_file_diff(sys.argv[1], sys.argv[2]):
        had_diff = True
        # print doesn't like to print utf-8 in all cases but buffer.write is ok
        sys.stderr.buffer.write((line + "\n").encode('utf-8'))
    sys.exit(1 if had_diff else 0)
0 commit comments