Skip to content

Commit e629693

Browse files
committed
url: fix .title vs url callback plugins
Also a bunch of misc cleaning
1 parent cea42e1 commit e629693

File tree

1 file changed

+89
-92
lines changed

1 file changed

+89
-92
lines changed

sopel/modules/url.py

Lines changed: 89 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -10,18 +10,21 @@
1010
"""
1111
from __future__ import annotations
1212

13-
import ipaddress
13+
from ipaddress import ip_address
1414
import logging
1515
import re
16+
from typing import Generator, List, Optional, Tuple
1617
from urllib.parse import urlparse
1718

1819
import dns.resolver
1920
import requests
2021
from urllib3.exceptions import LocationValueError # type: ignore[import]
2122

2223
from sopel import plugin, tools
23-
from sopel.config import types
24+
from sopel.bot import Sopel
25+
from sopel.config import Config, types
2426
from sopel.tools import web
27+
from sopel.trigger import Trigger
2528

2629

2730
LOGGER = logging.getLogger(__name__)
@@ -62,22 +65,18 @@ class UrlSection(types.StaticSection):
6265
"""If greater than 0, the title fetcher will include a TinyURL version of links longer than this many characters."""
6366
enable_private_resolution = types.BooleanAttribute(
6467
'enable_private_resolution', default=False)
65-
"""Enable URL lookups for RFC1918 addresses"""
66-
enable_dns_resolution = types.BooleanAttribute(
67-
'enable_dns_resolution', default=False)
68-
"""Enable DNS resolution for all domains to validate if there are RFC1918 resolutions"""
68+
"""Enable requests to private and local network IP addresses"""
6969

7070

71-
def configure(config):
71+
def configure(config: Config):
7272
"""
7373
| name | example | purpose |
7474
| ---- | ------- | ------- |
7575
| enable_auto_title | yes | Enable auto-title. |
7676
| exclude | https?://git\\\\.io/.* | A list of regular expressions for URLs for which the title should not be shown. |
7777
| exclusion\\_char | ! | A character (or string) which, when immediately preceding a URL, will stop the URL's title from being shown. |
7878
| shorten\\_url\\_length | 72 | If greater than 0, the title fetcher will include a TinyURL version of links longer than this many characters. |
79-
| enable\\_private\\_resolution | False | Enable URL lookups for RFC1918 addresses. |
80-
| enable\\_dns\\_resolution | False | Enable DNS resolution for all domains to validate if there are RFC1918 resolutions. |
79+
| enable\\_private\\_resolution | False | Enable requests to private and local network IP addresses. |
8180
"""
8281
config.define_section('url', UrlSection)
8382
config.url.configure_setting(
@@ -100,15 +99,11 @@ def configure(config):
10099
)
101100
config.url.configure_setting(
102101
'enable_private_resolution',
103-
'Enable URL lookups for RFC1918 addresses?'
104-
)
105-
config.url.configure_setting(
106-
'enable_dns_resolution',
107-
'Enable DNS resolution for all domains to validate if there are RFC1918 resolutions?'
102+
'Enable requests to private and local network IP addresses?'
108103
)
109104

110105

111-
def setup(bot):
106+
def setup(bot: Sopel):
112107
bot.config.define_section('url', UrlSection)
113108

114109
if bot.config.url.exclude:
@@ -139,7 +134,7 @@ def setup(bot):
139134
bot.memory['shortened_urls'] = tools.SopelMemory()
140135

141136

142-
def shutdown(bot):
137+
def shutdown(bot: Sopel):
143138
# Unset `url_exclude` and `last_seen_url`, but not `shortened_urls`;
144139
# clearing `shortened_urls` will increase API calls. Leaving it in memory
145140
# should not lead to unexpected behavior.
@@ -154,7 +149,7 @@ def shutdown(bot):
154149
@plugin.example('.urlpexclude example\\.com/\\w+', user_help=True)
155150
@plugin.example('.urlexclude example.com/path', user_help=True)
156151
@plugin.output_prefix('[url] ')
157-
def url_ban(bot, trigger):
152+
def url_ban(bot: Sopel, trigger: Trigger):
158153
"""Exclude a URL from auto title.
159154
160155
Use ``urlpexclude`` to exclude a pattern instead of a URL.
@@ -199,7 +194,7 @@ def url_ban(bot, trigger):
199194
@plugin.example('.urlpallow example\\.com/\\w+', user_help=True)
200195
@plugin.example('.urlallow example.com/path', user_help=True)
201196
@plugin.output_prefix('[url] ')
202-
def url_unban(bot, trigger):
197+
def url_unban(bot: Sopel, trigger: Trigger):
203198
"""Allow a URL for auto title.
204199
205200
Use ``urlpallow`` to allow a pattern instead of a URL.
@@ -246,30 +241,27 @@ def url_unban(bot, trigger):
246241
'Google | www.google.com',
247242
online=True, vcr=True)
248243
@plugin.output_prefix('[url] ')
249-
def title_command(bot, trigger):
244+
def title_command(bot: Sopel, trigger: Trigger):
250245
"""
251246
Show the title or URL information for the given URL, or the last URL seen
252247
in this channel.
253248
"""
249+
result_count = 0
250+
254251
if not trigger.group(2):
255252
if trigger.sender not in bot.memory['last_seen_url']:
256253
return
257-
matched = check_callbacks(
258-
bot, bot.memory['last_seen_url'][trigger.sender])
259-
if matched:
260-
return
261-
else:
262-
urls = [bot.memory['last_seen_url'][trigger.sender]]
254+
urls = [bot.memory["last_seen_url"][trigger.sender]]
263255
else:
264-
urls = list( # needs to be a list so len() can be checked later
265-
web.search_urls(
266-
trigger,
267-
exclusion_char=bot.config.url.exclusion_char
268-
)
269-
)
256+
# needs to be a list so len() can be checked later
257+
urls = list(web.search_urls(trigger))
270258

271-
result_count = 0
272-
for url, title, domain, tinyurl in process_urls(bot, trigger, urls):
259+
for url, title, domain, tinyurl, dispatched in process_urls(
260+
bot, trigger, urls, requested=True
261+
):
262+
if dispatched:
263+
result_count += 1
264+
continue
273265
message = '%s | %s' % (title, domain)
274266
if tinyurl:
275267
message += ' ( %s )' % tinyurl
@@ -289,7 +281,7 @@ def title_command(bot, trigger):
289281

290282
@plugin.rule(r'(?u).*(https?://\S+).*')
291283
@plugin.output_prefix('[url] ')
292-
def title_auto(bot, trigger):
284+
def title_auto(bot: Sopel, trigger: Trigger):
293285
"""
294286
Automatically show titles for URLs. For shortened URLs/redirects, find
295287
where the URL redirects to and show the title for that (or call a function
@@ -311,55 +303,68 @@ def title_auto(bot, trigger):
311303
urls = web.search_urls(
312304
trigger, exclusion_char=bot.config.url.exclusion_char, clean=True)
313305

314-
for url, title, domain, tinyurl in process_urls(bot, trigger, urls):
315-
message = '%s | %s' % (title, domain)
316-
if tinyurl:
317-
message += ' ( %s )' % tinyurl
318-
# Guard against responding to other instances of this bot.
319-
if message != trigger:
320-
bot.say(message)
321-
bot.memory['last_seen_url'][trigger.sender] = url
306+
for url, title, domain, tinyurl, dispatched in process_urls(bot, trigger, urls):
307+
if not dispatched:
308+
message = '%s | %s' % (title, domain)
309+
if tinyurl:
310+
message += ' ( %s )' % tinyurl
311+
# Guard against responding to other instances of this bot.
312+
if message != trigger:
313+
bot.say(message)
314+
bot.memory["last_seen_url"][trigger.sender] = url
322315

323316

324-
def process_urls(bot, trigger, urls):
317+
def process_urls(
318+
bot: Sopel, trigger: Trigger, urls: List[str], requested: bool = False
319+
) -> Generator[Tuple[str, str, Optional[str], Optional[str], bool], None, None]:
325320
"""
326-
For each URL in the list, ensure that it isn't handled by another plugin.
327-
If not, find where it redirects to, if anywhere. If that redirected URL
328-
should be handled by another plugin, dispatch the callback for it.
329-
Return a list of (title, hostname) tuples for each URL which is not handled
330-
by another plugin.
321+
For each URL in the list, ensure it should be titled, and do so.
322+
323+
See if it's handled by another plugin. If not, find where it redirects to,
324+
if anywhere. If that redirected URL should be handled by another plugin,
325+
dispatch the callback for it. Yields
326+
(url, title, hostname, tinyurl, dispatched) tuples for each URL.
327+
328+
If a callback was dispatched, only the url and dispatched=True will be set.
329+
330+
For titles explicitly requested by the user, exclusion_char and excludes
331+
are skipped.
332+
333+
:param bot: Sopel instance
334+
:param trigger: The trigger object for this event
335+
:param urls: The URLs detected in the triggering message
336+
:param requested: Whether the title was explicitly requested (vs automatic)
331337
"""
332338
shorten_url_length = bot.config.url.shorten_url_length
333339
for url in urls:
334340
# Exclude URLs that start with the exclusion char
335-
if url.startswith(bot.config.url.exclusion_char):
341+
if not requested and url.startswith(bot.config.url.exclusion_char):
336342
continue
337343

344+
parsed_url = urlparse(url)
345+
338346
# Check the URL does not match an existing URL callback
339-
if check_callbacks(bot, url):
340-
continue
347+
if check_callbacks(bot, url, use_excludes=not requested):
348+
yield (url, None, None, None, True)
349+
return
341350

342351
# Prevent private addresses from being queried if enable_private_resolution is False
352+
# FIXME: This check is bypassed when an attacker hosts a public URL that answers with a 302 redirect to a private address
353+
# FIXME: This whole concept has a TOCTOU issue: the hostname is resolved here for the check, then resolved again when the request is made
343354
if not bot.config.url.enable_private_resolution:
344-
parsed = urlparse(url)
345-
# Check if it's an address like http://192.168.1.1
346355
try:
347-
if ipaddress.ip_address(parsed.hostname).is_private or ipaddress.ip_address(parsed.hostname).is_loopback:
348-
LOGGER.debug('Ignoring private URL: %s', url)
349-
continue
356+
ips = [ip_address(parsed_url.hostname)]
350357
except ValueError:
351-
pass
352-
353-
# Check if domains are RFC1918 addresses if enable_dns_resolutions is set
354-
if bot.config.url.enable_dns_resolution:
355-
private = False
356-
for result in dns.resolver.query(parsed.hostname):
357-
if ipaddress.ip_address(result).is_private or ipaddress.ip_address(parsed.hostname).is_loopback:
358-
private = True
359-
break
360-
if private:
361-
LOGGER.debug('Ignoring private URL: %s', url)
362-
continue
358+
ips = [ip_address(ip) for ip in dns.resolver.query(parsed_url.hostname)]
359+
360+
private = False
361+
for ip in ips:
362+
if ip.is_private or ip.is_loopback:
363+
private = True
364+
break
365+
if private:
366+
LOGGER.debug('Ignoring private URL: %s', url)
367+
continue
363368

364369
# Call the URL to get a title, if possible
365370
title = find_title(url)
@@ -373,14 +378,15 @@ def process_urls(bot, trigger, urls):
373378
if (shorten_url_length > 0) and (len(url) > shorten_url_length):
374379
tinyurl = get_or_create_shorturl(bot, url)
375380

376-
yield (url, title, get_hostname(url), tinyurl)
381+
yield (url, title, parsed_url.hostname, tinyurl, False)
377382

378383

379-
def check_callbacks(bot, url):
384+
def check_callbacks(bot: Sopel, url: str, use_excludes: bool = True) -> bool:
380385
"""Check if ``url`` is excluded or matches any URL callback patterns.
381386
382387
:param bot: Sopel instance
383-
:param str url: URL to check
388+
:param url: URL to check
389+
:param use_excludes: Use or ignore the configured exclusion lists
384390
:return: True if ``url`` is excluded or matches any URL callback pattern
385391
386392
This function looks at the ``bot.memory`` for ``url_exclude`` patterns and
@@ -400,16 +406,21 @@ def check_callbacks(bot, url):
400406
401407
"""
402408
# Check if it matches the exclusion list first
403-
matched = any(regex.search(url) for regex in bot.memory['url_exclude'])
409+
excluded = False
410+
if use_excludes:
411+
excluded = any(regex.search(url) for regex in bot.memory["url_exclude"])
404412
return (
405-
matched or
413+
excluded or
406414
any(bot.search_url_callbacks(url)) or
407415
bot.rules.check_url_callback(bot, url)
408416
)
409417

410418

411-
def find_title(url, verify=True):
412-
"""Return the title for the given URL."""
419+
def find_title(url: str, verify: bool = True) -> Optional[str]:
420+
"""Return the title for the given URL.
421+
422+
:param verify: Whether to require a valid certificate when using https
423+
"""
413424
try:
414425
response = requests.get(url, stream=True, verify=verify,
415426
headers=DEFAULT_HEADERS)
@@ -453,26 +464,12 @@ def find_title(url, verify=True):
453464
return title or None
454465

455466

456-
def get_hostname(url):
457-
idx = 7
458-
if url.startswith('https://'):
459-
idx = 8
460-
elif url.startswith('ftp://'):
461-
idx = 6
462-
hostname = url[idx:]
463-
slash = hostname.find('/')
464-
if slash != -1:
465-
hostname = hostname[:slash]
466-
return hostname
467-
468-
469-
def get_or_create_shorturl(bot, url):
467+
def get_or_create_shorturl(bot: Sopel, url: str) -> str:
470468
"""Get or create a short URL for ``url``
471469
472470
:param bot: Sopel instance
473-
:param str url: URL to get or create a short URL for
471+
:param url: URL to get or create a short URL for
474472
:return: A short URL
475-
:rtype: str
476473
477474
It gets the short URL for ``url`` from the bot's memory if it exists.
478475
Otherwise, it creates a short URL (see :func:`get_tinyurl`), stores it
@@ -488,7 +485,7 @@ def get_or_create_shorturl(bot, url):
488485
return tinyurl
489486

490487

491-
def get_tinyurl(url):
488+
def get_tinyurl(url: str) -> Optional[str]:
492489
"""Returns a shortened tinyURL link of the URL"""
493490
base_url = "https://tinyurl.com/api-create.php"
494491
tinyurl = "%s?%s" % (base_url, web.urlencode({'url': url}))

0 commit comments

Comments
 (0)