Skip to content

Commit 7fc8f76

Browse files
committed
url: fix .title vs url callback plugins
Also a bunch of misc cleaning
1 parent 984e1b9 commit 7fc8f76

File tree

1 file changed

+95
-99
lines changed

1 file changed

+95
-99
lines changed

sopel/modules/url.py

Lines changed: 95 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,10 @@
1010
"""
1111
from __future__ import annotations
1212

13-
import ipaddress
13+
from ipaddress import ip_address
1414
import logging
1515
import re
16+
from typing import TYPE_CHECKING
1617
from urllib.parse import urlparse
1718

1819
import dns.resolver
@@ -23,6 +24,12 @@
2324
from sopel.config import types
2425
from sopel.tools import web
2526

27+
if TYPE_CHECKING:
28+
from typing import Generator, List, Optional, Tuple
29+
30+
from sopel.bot import Sopel, SopelWrapper
31+
from sopel.config import Config
32+
from sopel.trigger import Trigger
2633

2734
LOGGER = logging.getLogger(__name__)
2835
USER_AGENT = (
@@ -39,8 +46,6 @@
3946
# world's best way to do this, but it'll do for now.
4047
TITLE_TAG_DATA = re.compile('<(/?)title( [^>]+)?>', re.IGNORECASE)
4148
QUOTED_TITLE = re.compile('[\'"]<title>[\'"]', re.IGNORECASE)
42-
# This is another regex that presumably does something important.
43-
RE_DCC = re.compile(r'(?i)dcc\ssend')
4449
# This sets the maximum number of bytes that should be read in order to find
4550
# the title. We don't want it too high, or a link to a big file/stream will
4651
# just keep downloading until there's no more memory. 640k ought to be enough
@@ -62,22 +67,18 @@ class UrlSection(types.StaticSection):
6267
"""If greater than 0, the title fetcher will include a TinyURL version of links longer than this many characters."""
6368
enable_private_resolution = types.BooleanAttribute(
6469
'enable_private_resolution', default=False)
65-
"""Enable URL lookups for RFC1918 addresses"""
66-
enable_dns_resolution = types.BooleanAttribute(
67-
'enable_dns_resolution', default=False)
68-
"""Enable DNS resolution for all domains to validate if there are RFC1918 resolutions"""
70+
"""Enable requests to private and local network IP addresses"""
6971

7072

71-
def configure(config):
73+
def configure(config: Config):
7274
"""
7375
| name | example | purpose |
7476
| ---- | ------- | ------- |
7577
| enable_auto_title | yes | Enable auto-title. |
7678
| exclude | https?://git\\\\.io/.* | A list of regular expressions for URLs for which the title should not be shown. |
7779
| exclusion\\_char | ! | A character (or string) which, when immediately preceding a URL, will stop the URL's title from being shown. |
7880
| shorten\\_url\\_length | 72 | If greater than 0, the title fetcher will include a TinyURL version of links longer than this many characters. |
79-
| enable\\_private\\_resolution | False | Enable URL lookups for RFC1918 addresses. |
80-
| enable\\_dns\\_resolution | False | Enable DNS resolution for all domains to validate if there are RFC1918 resolutions. |
81+
| enable\\_private\\_resolution | False | Enable requests to private and local network IP addresses. |
8182
"""
8283
config.define_section('url', UrlSection)
8384
config.url.configure_setting(
@@ -100,15 +101,11 @@ def configure(config):
100101
)
101102
config.url.configure_setting(
102103
'enable_private_resolution',
103-
'Enable URL lookups for RFC1918 addresses?'
104-
)
105-
config.url.configure_setting(
106-
'enable_dns_resolution',
107-
'Enable DNS resolution for all domains to validate if there are RFC1918 resolutions?'
104+
'Enable requests to private and local network IP addresses?'
108105
)
109106

110107

111-
def setup(bot):
108+
def setup(bot: Sopel):
112109
bot.config.define_section('url', UrlSection)
113110

114111
if bot.config.url.exclude:
@@ -139,7 +136,7 @@ def setup(bot):
139136
bot.memory['shortened_urls'] = tools.SopelMemory()
140137

141138

142-
def shutdown(bot):
139+
def shutdown(bot: Sopel):
143140
# Unset `url_exclude` and `last_seen_url`, but not `shortened_urls`;
144141
# clearing `shortened_urls` will increase API calls. Leaving it in memory
145142
# should not lead to unexpected behavior.
@@ -154,7 +151,7 @@ def shutdown(bot):
154151
@plugin.example('.urlpexclude example\\.com/\\w+', user_help=True)
155152
@plugin.example('.urlexclude example.com/path', user_help=True)
156153
@plugin.output_prefix('[url] ')
157-
def url_ban(bot, trigger):
154+
def url_ban(bot: SopelWrapper, trigger: Trigger):
158155
"""Exclude a URL from auto title.
159156
160157
Use ``urlpexclude`` to exclude a pattern instead of a URL.
@@ -199,7 +196,7 @@ def url_ban(bot, trigger):
199196
@plugin.example('.urlpallow example\\.com/\\w+', user_help=True)
200197
@plugin.example('.urlallow example.com/path', user_help=True)
201198
@plugin.output_prefix('[url] ')
202-
def url_unban(bot, trigger):
199+
def url_unban(bot: SopelWrapper, trigger: Trigger):
203200
"""Allow a URL for auto title.
204201
205202
Use ``urlpallow`` to allow a pattern instead of a URL.
@@ -246,35 +243,32 @@ def url_unban(bot, trigger):
246243
'Google | www.google.com',
247244
online=True, vcr=True)
248245
@plugin.output_prefix('[url] ')
249-
def title_command(bot, trigger):
246+
def title_command(bot: SopelWrapper, trigger: Trigger):
250247
"""
251248
Show the title or URL information for the given URL, or the last URL seen
252249
in this channel.
253250
"""
251+
result_count = 0
252+
254253
if not trigger.group(2):
255254
if trigger.sender not in bot.memory['last_seen_url']:
256255
return
257-
matched = check_callbacks(
258-
bot, bot.memory['last_seen_url'][trigger.sender])
259-
if matched:
260-
return
261-
else:
262-
urls = [bot.memory['last_seen_url'][trigger.sender]]
256+
urls = [bot.memory["last_seen_url"][trigger.sender]]
263257
else:
264-
urls = list( # needs to be a list so len() can be checked later
265-
web.search_urls(
266-
trigger,
267-
exclusion_char=bot.config.url.exclusion_char
268-
)
269-
)
258+
# needs to be a list so len() can be checked later
259+
urls = list(web.search_urls(trigger))
270260

271-
result_count = 0
272-
for url, title, domain, tinyurl in process_urls(bot, trigger, urls):
273-
message = '%s | %s' % (title, domain)
261+
for url, title, domain, tinyurl, dispatched in process_urls(
262+
bot, trigger, urls, requested=True
263+
):
264+
if dispatched:
265+
result_count += 1
266+
continue
267+
message = "%s | %s" % (title, domain)
274268
if tinyurl:
275-
message += ' ( %s )' % tinyurl
269+
message += " ( %s )" % tinyurl
276270
bot.reply(message)
277-
bot.memory['last_seen_url'][trigger.sender] = url
271+
bot.memory["last_seen_url"][trigger.sender] = url
278272
result_count += 1
279273

280274
expected_count = len(urls)
@@ -289,7 +283,7 @@ def title_command(bot, trigger):
289283

290284
@plugin.rule(r'(?u).*(https?://\S+).*')
291285
@plugin.output_prefix('[url] ')
292-
def title_auto(bot, trigger):
286+
def title_auto(bot: SopelWrapper, trigger: Trigger):
293287
"""
294288
Automatically show titles for URLs. For shortened URLs/redirects, find
295289
where the URL redirects to and show the title for that (or call a function
@@ -311,55 +305,68 @@ def title_auto(bot, trigger):
311305
urls = web.search_urls(
312306
trigger, exclusion_char=bot.config.url.exclusion_char, clean=True)
313307

314-
for url, title, domain, tinyurl in process_urls(bot, trigger, urls):
315-
message = '%s | %s' % (title, domain)
316-
if tinyurl:
317-
message += ' ( %s )' % tinyurl
318-
# Guard against responding to other instances of this bot.
319-
if message != trigger:
320-
bot.say(message)
321-
bot.memory['last_seen_url'][trigger.sender] = url
308+
for url, title, domain, tinyurl, dispatched in process_urls(bot, trigger, urls):
309+
if not dispatched:
310+
message = '%s | %s' % (title, domain)
311+
if tinyurl:
312+
message += ' ( %s )' % tinyurl
313+
# Guard against responding to other instances of this bot.
314+
if message != trigger:
315+
bot.say(message)
316+
bot.memory["last_seen_url"][trigger.sender] = url
322317

323318

324-
def process_urls(bot, trigger, urls):
319+
def process_urls(
320+
bot: SopelWrapper, trigger: Trigger, urls: List[str], requested: bool = False
321+
) -> Generator[Tuple[str, str, Optional[str], Optional[str], bool], None, None]:
325322
"""
326-
For each URL in the list, ensure that it isn't handled by another plugin.
327-
If not, find where it redirects to, if anywhere. If that redirected URL
328-
should be handled by another plugin, dispatch the callback for it.
329-
Return a list of (title, hostname) tuples for each URL which is not handled
330-
by another plugin.
323+
For each URL in the list, ensure it should be titled, and do so.
324+
325+
:param bot: Sopel instance
326+
:param trigger: The trigger object for this event
327+
:param urls: The URLs detected in the triggering message
328+
:param requested: Whether the title was explicitly requested (vs automatic)
329+
330+
See if it's handled by another plugin. If not, find where it redirects to,
331+
if anywhere. If that redirected URL should be handled by another plugin,
332+
dispatch the callback for it. Yields
333+
(url, title, hostname, tinyurl, dispatched) tuples for each URL.
334+
335+
If a callback was dispatched, only the url and dispatched=True will be set.
336+
337+
For titles explicitly requested by the user, exclusion_char and excludes
338+
are skipped.
331339
"""
332340
shorten_url_length = bot.config.url.shorten_url_length
333341
for url in urls:
334342
# Exclude URLs that start with the exclusion char
335-
if url.startswith(bot.config.url.exclusion_char):
343+
if not requested and url.startswith(bot.config.url.exclusion_char):
336344
continue
337345

346+
parsed_url = urlparse(url)
347+
338348
# Check the URL does not match an existing URL callback
339-
if check_callbacks(bot, url):
340-
continue
349+
if check_callbacks(bot, url, use_excludes=not requested):
350+
yield (url, None, None, None, True)
351+
return
341352

342353
# Prevent private addresses from being queried if enable_private_resolution is False
354+
# FIXME: This check is bypassed when an attacker hosts a redirect (e.g. HTTP 302) from a public URL to a private address
355+
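# FIXME: This whole concept has a TOCTOU (time-of-check to time-of-use) issue: the addresses resolved here may differ from those used by the actual request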
343356
if not bot.config.url.enable_private_resolution:
344-
parsed = urlparse(url)
345-
# Check if it's an address like http://192.168.1.1
346357
try:
347-
if ipaddress.ip_address(parsed.hostname).is_private or ipaddress.ip_address(parsed.hostname).is_loopback:
348-
LOGGER.debug('Ignoring private URL: %s', url)
349-
continue
358+
ips = [ip_address(parsed_url.hostname)]
350359
except ValueError:
351-
pass
352-
353-
# Check if domains are RFC1918 addresses if enable_dns_resolutions is set
354-
if bot.config.url.enable_dns_resolution:
355-
private = False
356-
for result in dns.resolver.query(parsed.hostname):
357-
if ipaddress.ip_address(result).is_private or ipaddress.ip_address(parsed.hostname).is_loopback:
358-
private = True
359-
break
360-
if private:
361-
LOGGER.debug('Ignoring private URL: %s', url)
362-
continue
360+
ips = [ip_address(ip) for ip in dns.resolver.query(parsed_url.hostname)]
361+
362+
private = False
363+
for ip in ips:
364+
if ip.is_private or ip.is_loopback:
365+
private = True
366+
break
367+
if private:
368+
LOGGER.debug("Ignoring private URL: %s", url)
369+
continue
363370

364371
# Call the URL to get a title, if possible
365372
title = find_title(url)
@@ -373,14 +380,15 @@ def process_urls(bot, trigger, urls):
373380
if (shorten_url_length > 0) and (len(url) > shorten_url_length):
374381
tinyurl = get_or_create_shorturl(bot, url)
375382

376-
yield (url, title, get_hostname(url), tinyurl)
383+
yield (url, title, parsed_url.hostname, tinyurl, False)
377384

378385

379-
def check_callbacks(bot, url):
386+
def check_callbacks(bot: SopelWrapper, url: str, use_excludes: bool = True) -> bool:
380387
"""Check if ``url`` is excluded or matches any URL callback patterns.
381388
382389
:param bot: Sopel instance
383-
:param str url: URL to check
390+
:param url: URL to check
391+
:param use_excludes: Use or ignore the configured exclusion lists
384392
:return: True if ``url`` is excluded or matches any URL callback pattern
385393
386394
This function looks at the ``bot.memory`` for ``url_exclude`` patterns and
@@ -400,16 +408,21 @@ def check_callbacks(bot, url):
400408
401409
"""
402410
# Check if it matches the exclusion list first
403-
matched = any(regex.search(url) for regex in bot.memory['url_exclude'])
411+
excluded = use_excludes and any(
412+
regex.search(url) for regex in bot.memory["url_exclude"]
413+
)
404414
return (
405-
matched or
415+
excluded or
406416
any(bot.search_url_callbacks(url)) or
407417
bot.rules.check_url_callback(bot, url)
408418
)
409419

410420

411-
def find_title(url, verify=True):
412-
"""Return the title for the given URL."""
421+
def find_title(url: str, verify: bool = True) -> Optional[str]:
422+
"""Return the title for the given URL.
423+
424+
:param verify: Whether to require a valid certificate when using https
425+
"""
413426
try:
414427
response = requests.get(url, stream=True, verify=verify,
415428
headers=DEFAULT_HEADERS)
@@ -447,32 +460,15 @@ def find_title(url, verify=True):
447460

448461
title = ' '.join(title.split()) # cleanly remove multiple spaces
449462

450-
# More cryptic regex substitutions. This one looks to be myano's invention.
451-
title = RE_DCC.sub('', title)
452-
453463
return title or None
454464

455465

456-
def get_hostname(url):
457-
idx = 7
458-
if url.startswith('https://'):
459-
idx = 8
460-
elif url.startswith('ftp://'):
461-
idx = 6
462-
hostname = url[idx:]
463-
slash = hostname.find('/')
464-
if slash != -1:
465-
hostname = hostname[:slash]
466-
return hostname
467-
468-
469-
def get_or_create_shorturl(bot, url):
466+
def get_or_create_shorturl(bot: SopelWrapper, url: str) -> str:
470467
"""Get or create a short URL for ``url``
471468
472469
:param bot: Sopel instance
473-
:param str url: URL to get or create a short URL for
470+
:param url: URL to get or create a short URL for
474471
:return: A short URL
475-
:rtype: str
476472
477473
It gets the short URL for ``url`` from the bot's memory if it exists.
478474
Otherwise, it creates a short URL (see :func:`get_tinyurl`), stores it
@@ -488,7 +484,7 @@ def get_or_create_shorturl(bot, url):
488484
return tinyurl
489485

490486

491-
def get_tinyurl(url):
487+
def get_tinyurl(url: str) -> Optional[str]:
492488
"""Returns a shortened tinyURL link of the URL"""
493489
base_url = "https://tinyurl.com/api-create.php"
494490
tinyurl = "%s?%s" % (base_url, web.urlencode({'url': url}))

0 commit comments

Comments
 (0)