2828
2929
3030class WikiParser (HTMLParser ):
31+ NO_CONSUME_TAGS = ('sup' , 'style' )
32+ """Tags whose contents should always be ignored.
33+
34+ These are used in things like inline citations or section "hatnotes", none
35+ of which are useful output for IRC.
36+ """
37+
3138 def __init__ (self , section_name ):
3239 HTMLParser .__init__ (self )
3340 self .consume = True
41+ self .no_consume_depth = 0
3442 self .is_header = False
3543 self .section_name = section_name
3644
@@ -42,8 +50,9 @@ def __init__(self, section_name):
4250 self .result = ''
4351
4452 def handle_starttag (self , tag , attrs ):
45- if tag == 'sup' : # don't consume anything in superscript (citation-related tags)
53+ if tag in self . NO_CONSUME_TAGS :
4654 self .consume = False
55+ self .no_consume_depth += 1
4756
4857 elif re .match (r'^h\d$' , tag ):
4958 self .is_header = True
@@ -57,14 +66,17 @@ def handle_starttag(self, tag, attrs):
5766 self .span_depth += 1
5867
5968 elif tag == 'div' :
60- # We want to skip thumbnail text and the inexplicable table of contents,
61- # and as such also need to track div depth
69+ # We want to skip thumbnail text, the table of contents, and section "hatnotes".
70+ # This also requires tracking div nesting level.
6271 if self .div_depth :
6372 self .div_depth += 1
6473 else :
6574 for attr in attrs :
66- if attr [0 ] == 'class' and ('thumb' in attr [1 ] or attr [1 ] == 'toc' ):
75+ if attr [0 ] == 'class' and (
76+ 'thumb' in attr [1 ] or 'hatnote' in attr [1 ] or attr [1 ] == 'toc'
77+ ):
6778 self .div_depth += 1
79+ break
6880
6981 elif tag == 'table' :
7082 # Message box templates are what we want to ignore here
@@ -91,8 +103,11 @@ def handle_starttag(self, tag, attrs):
91103 self .citations = True # once we hit citations, we can stop
92104
93105 def handle_endtag (self , tag ):
94- if not self .consume and tag == 'sup' :
95- self .consume = True
106+ if not self .consume and tag in self .NO_CONSUME_TAGS :
107+ if self .no_consume_depth :
108+ self .no_consume_depth -= 1
109+ if not self .no_consume_depth :
110+ self .consume = True
96111 if self .is_header and re .match (r'^h\d$' , tag ):
97112 self .is_header = False
98113 if self .span_depth and tag == 'span' :
0 commit comments