Skip to content

Commit 0728d35

Browse files
SnoopJdgw
andcommitted
wikipedia: retrieve descriptions for media fragments
Co-authored-by: dgw <[email protected]>
1 parent 0b5fb8a commit 0728d35

File tree

1 file changed

+62
-5
lines changed

1 file changed

+62
-5
lines changed

sopel/modules/wikipedia.py

Lines changed: 62 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from __future__ import annotations
99

1010
from html.parser import HTMLParser
11+
import logging
1112
import re
1213

1314
from requests import get
@@ -17,6 +18,8 @@
1718
from sopel.tools.web import quote, unquote
1819

1920

21+
LOGGER = logging.getLogger(__name__)
22+
2023
REDIRECT = re.compile(r'^REDIRECT (.*)')
2124
PLUGIN_OUTPUT_PREFIX = '[wikipedia] '
2225

@@ -272,18 +275,72 @@ def mw_section(server, query, section):
272275
return text
273276

274277

278+
def say_image_description(bot, trigger, server, image):
    """Look up the description of ``image`` on ``server`` and say it.

    Silently does nothing when no description could be retrieved.
    """
    description = mw_image_description(server, image)
    if not description:
        return
    bot.say(description, truncation=" […]")
284+
285+
def mw_image_description(server, image):
    """Retrieve the description for the given image.

    :param server: wiki hostname, e.g. ``en.wikipedia.org``
    :param image: image page title, e.g. ``File:Example.jpg``
    :return: the plain-text image description, or ``None`` if the API
             response did not contain one
    """
    params = "&".join([
        "action=query",
        "prop=imageinfo",
        "format=json",
        "indexpageids=1",
        "iiprop=extmetadata",
        "iiextmetadatafilter=ImageDescription",
        "iilimit=1",
        "titles={image}".format(image=image),
    ])
    url = "https://{server}/w/api.php?{params}".format(server=server, params=params)

    # Use a timeout so an unresponsive wiki can't hang the bot forever.
    response = get(url, timeout=(10.0, 4.0))
    # Renamed from `json` to avoid shadowing the stdlib module name.
    data = response.json()

    try:
        query_data = data["query"]
        pageids = query_data["pageids"]
        pages = query_data["pages"]

        # `indexpageids=1` guarantees the "pageids" list; its first entry
        # keys into the "pages" mapping.
        page = pages[pageids[0]]

        raw_desc = page["imageinfo"][0]["extmetadata"]["ImageDescription"]["value"]
    except LookupError:
        # Missing keys/indices anywhere in the expected structure mean the
        # wiki had no usable description; log the whole payload for debugging.
        LOGGER.exception("Error getting image description for %r, response was: %r", image, data)
        return None

    # Some descriptions contain markup, use WikiParser to discard that
    parser = WikiParser(image)
    parser.feed(raw_desc)
    desc = parser.get_result()
    desc = ' '.join(desc.split())  # collapse multiple whitespace chars

    return desc
322+
323+
275324
# Matches a wikipedia page (excluding spaces and #, but not /File: links), with a separate optional field for the section
@plugin.url(r'https?:\/\/([a-z]+(?:\.m)?\.wikipedia\.org)\/wiki\/((?!File\:)[^ #]+)#?([^ ]*)')
@plugin.output_prefix(PLUGIN_OUTPUT_PREFIX)
def mw_info(bot, trigger, match=None):
    """Retrieves and outputs a snippet of the linked page."""
    wiki_server = match.group(1)
    page_title = unquote(match.group(2))
    fragment = unquote(match.group(3))

    if not fragment:
        # Plain page link: show the page snippet without repeating the URL.
        say_snippet(bot, trigger, wiki_server, page_title, show_url=False)
    elif fragment.startswith('cite_note-'):
        # Don't bother trying to retrieve a snippet when cite-note is linked
        say_snippet(bot, trigger, wiki_server, page_title, show_url=False)
    elif fragment.startswith('/media'):
        # gh2316: media fragments are usually images; try to get an image description
        # Drop the '/media/' prefix by slicing (str.removeprefix needs 3.9+).
        say_image_description(bot, trigger, wiki_server, fragment[7:])
    else:
        say_section(bot, trigger, wiki_server, page_title, fragment)
287344

288345

289346
@plugin.command('wikipedia', 'wp')

0 commit comments

Comments
 (0)