Skip to content

Commit e5a3d3f

Browse files
committed
Backport pull request #2225
wikipedia: refine no-consume logic for section hatnotes
1 parent f2c6dfe commit e5a3d3f

File tree

1 file changed

+21
-6
lines changed

1 file changed

+21
-6
lines changed

sopel/modules/wikipedia.py

Lines changed: 21 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -28,9 +28,17 @@
2828

2929

3030
class WikiParser(HTMLParser):
31+
NO_CONSUME_TAGS = ('sup', 'style')
32+
"""Tags whose contents should always be ignored.
33+
34+
These are used in things like inline citations or section "hatnotes", none
35+
of which are useful output for IRC.
36+
"""
37+
3138
def __init__(self, section_name):
3239
HTMLParser.__init__(self)
3340
self.consume = True
41+
self.no_consume_depth = 0
3442
self.is_header = False
3543
self.section_name = section_name
3644

@@ -42,8 +50,9 @@ def __init__(self, section_name):
4250
self.result = ''
4351

4452
def handle_starttag(self, tag, attrs):
45-
if tag == 'sup': # don't consume anything in superscript (citation-related tags)
53+
if tag in self.NO_CONSUME_TAGS:
4654
self.consume = False
55+
self.no_consume_depth += 1
4756

4857
elif re.match(r'^h\d$', tag):
4958
self.is_header = True
@@ -57,14 +66,17 @@ def handle_starttag(self, tag, attrs):
5766
self.span_depth += 1
5867

5968
elif tag == 'div':
60-
# We want to skip thumbnail text and the inexplicable table of contents,
61-
# and as such also need to track div depth
69+
# We want to skip thumbnail text, the table of contents, and section "hatnotes".
70+
# This also requires tracking div nesting level.
6271
if self.div_depth:
6372
self.div_depth += 1
6473
else:
6574
for attr in attrs:
66-
if attr[0] == 'class' and ('thumb' in attr[1] or attr[1] == 'toc'):
75+
if attr[0] == 'class' and (
76+
'thumb' in attr[1] or 'hatnote' in attr[1] or attr[1] == 'toc'
77+
):
6778
self.div_depth += 1
79+
break
6880

6981
elif tag == 'table':
7082
# Message box templates are what we want to ignore here
@@ -91,8 +103,11 @@ def handle_starttag(self, tag, attrs):
91103
self.citations = True # once we hit citations, we can stop
92104

93105
def handle_endtag(self, tag):
94-
if not self.consume and tag == 'sup':
95-
self.consume = True
106+
if not self.consume and tag in self.NO_CONSUME_TAGS:
107+
if self.no_consume_depth:
108+
self.no_consume_depth -= 1
109+
if not self.no_consume_depth:
110+
self.consume = True
96111
if self.is_header and re.match(r'^h\d$', tag):
97112
self.is_header = False
98113
if self.span_depth and tag == 'span':

0 commit comments

Comments
 (0)