1010"""
1111from __future__ import annotations
1212
13- import ipaddress
13+ from ipaddress import ip_address
1414import logging
1515import re
16+ from typing import TYPE_CHECKING
1617from urllib .parse import urlparse
1718
1819import dns .resolver
2324from sopel .config import types
2425from sopel .tools import web
2526
27+ if TYPE_CHECKING :
28+ from typing import Generator , List , Optional , Tuple
29+
30+ from sopel .bot import Sopel , SopelWrapper
31+ from sopel .config import Config
32+ from sopel .trigger import Trigger
2633
2734LOGGER = logging .getLogger (__name__ )
2835USER_AGENT = (
3946# world's best way to do this, but it'll do for now.
4047TITLE_TAG_DATA = re .compile ('<(/?)title( [^>]+)?>' , re .IGNORECASE )
4148QUOTED_TITLE = re .compile ('[\' "]<title>[\' "]' , re .IGNORECASE )
42- # This is another regex that presumably does something important.
43- RE_DCC = re .compile (r'(?i)dcc\ssend' )
4449# This sets the maximum number of bytes that should be read in order to find
4550# the title. We don't want it too high, or a link to a big file/stream will
4651# just keep downloading until there's no more memory. 640k ought to be enough
@@ -62,22 +67,18 @@ class UrlSection(types.StaticSection):
6267 """If greater than 0, the title fetcher will include a TinyURL version of links longer than this many characters."""
6368 enable_private_resolution = types .BooleanAttribute (
6469 'enable_private_resolution' , default = False )
65- """Enable URL lookups for RFC1918 addresses"""
66- enable_dns_resolution = types .BooleanAttribute (
67- 'enable_dns_resolution' , default = False )
68- """Enable DNS resolution for all domains to validate if there are RFC1918 resolutions"""
70+ """Enable requests to private and local network IP addresses"""
6971
7072
71- def configure (config ):
73+ def configure (config : Config ):
7274 """
7375 | name | example | purpose |
7476 | ---- | ------- | ------- |
7577 | enable_auto_title | yes | Enable auto-title. |
7678 | exclude | https?://git\\ \\ .io/.* | A list of regular expressions for URLs for which the title should not be shown. |
7779 | exclusion\\ _char | ! | A character (or string) which, when immediately preceding a URL, will stop the URL's title from being shown. |
7880 | shorten\\ _url\\ _length | 72 | If greater than 0, the title fetcher will include a TinyURL version of links longer than this many characters. |
79- | enable\\ _private\\ _resolution | False | Enable URL lookups for RFC1918 addresses. |
80- | enable\\ _dns\\ _resolution | False | Enable DNS resolution for all domains to validate if there are RFC1918 resolutions. |
81+ | enable\\ _private\\ _resolution | False | Enable requests to private and local network IP addresses. |
8182 """
8283 config .define_section ('url' , UrlSection )
8384 config .url .configure_setting (
@@ -100,15 +101,11 @@ def configure(config):
100101 )
101102 config .url .configure_setting (
102103 'enable_private_resolution' ,
103- 'Enable URL lookups for RFC1918 addresses?'
104- )
105- config .url .configure_setting (
106- 'enable_dns_resolution' ,
107- 'Enable DNS resolution for all domains to validate if there are RFC1918 resolutions?'
104+ 'Enable requests to private and local network IP addresses?'
108105 )
109106
110107
111- def setup (bot ):
108+ def setup (bot : Sopel ):
112109 bot .config .define_section ('url' , UrlSection )
113110
114111 if bot .config .url .exclude :
@@ -139,7 +136,7 @@ def setup(bot):
139136 bot .memory ['shortened_urls' ] = tools .SopelMemory ()
140137
141138
142- def shutdown (bot ):
139+ def shutdown (bot : Sopel ):
143140 # Unset `url_exclude` and `last_seen_url`, but not `shortened_urls`;
144141 # clearing `shortened_urls` will increase API calls. Leaving it in memory
145142 # should not lead to unexpected behavior.
@@ -154,7 +151,7 @@ def shutdown(bot):
154151@plugin .example ('.urlpexclude example\\ .com/\\ w+' , user_help = True )
155152@plugin .example ('.urlexclude example.com/path' , user_help = True )
156153@plugin .output_prefix ('[url] ' )
157- def url_ban (bot , trigger ):
154+ def url_ban (bot : SopelWrapper , trigger : Trigger ):
158155 """Exclude a URL from auto title.
159156
160157 Use ``urlpexclude`` to exclude a pattern instead of a URL.
@@ -199,7 +196,7 @@ def url_ban(bot, trigger):
199196@plugin .example ('.urlpallow example\\ .com/\\ w+' , user_help = True )
200197@plugin .example ('.urlallow example.com/path' , user_help = True )
201198@plugin .output_prefix ('[url] ' )
202- def url_unban (bot , trigger ):
199+ def url_unban (bot : SopelWrapper , trigger : Trigger ):
203200 """Allow a URL for auto title.
204201
205202 Use ``urlpallow`` to allow a pattern instead of a URL.
@@ -246,35 +243,32 @@ def url_unban(bot, trigger):
246243 'Google | www.google.com' ,
247244 online = True , vcr = True )
248245@plugin .output_prefix ('[url] ' )
249- def title_command (bot , trigger ):
246+ def title_command (bot : SopelWrapper , trigger : Trigger ):
250247 """
251248 Show the title or URL information for the given URL, or the last URL seen
252249 in this channel.
253250 """
251+ result_count = 0
252+
254253 if not trigger .group (2 ):
255254 if trigger .sender not in bot .memory ['last_seen_url' ]:
256255 return
257- matched = check_callbacks (
258- bot , bot .memory ['last_seen_url' ][trigger .sender ])
259- if matched :
260- return
261- else :
262- urls = [bot .memory ['last_seen_url' ][trigger .sender ]]
256+ urls = [bot .memory ["last_seen_url" ][trigger .sender ]]
263257 else :
264- urls = list ( # needs to be a list so len() can be checked later
265- web .search_urls (
266- trigger ,
267- exclusion_char = bot .config .url .exclusion_char
268- )
269- )
258+ # needs to be a list so len() can be checked later
259+ urls = list (web .search_urls (trigger ))
270260
271- result_count = 0
272- for url , title , domain , tinyurl in process_urls (bot , trigger , urls ):
273- message = '%s | %s' % (title , domain )
261+ for url , title , domain , tinyurl , dispatched in process_urls (
262+ bot , trigger , urls , requested = True
263+ ):
264+ if dispatched :
265+ result_count += 1
266+ continue
267+ message = "%s | %s" % (title , domain )
274268 if tinyurl :
275- message += ' ( %s )' % tinyurl
269+ message += " ( %s )" % tinyurl
276270 bot .reply (message )
277- bot .memory [' last_seen_url' ][trigger .sender ] = url
271+ bot .memory [" last_seen_url" ][trigger .sender ] = url
278272 result_count += 1
279273
280274 expected_count = len (urls )
@@ -289,7 +283,7 @@ def title_command(bot, trigger):
289283
290284@plugin .rule (r'(?u).*(https?://\S+).*' )
291285@plugin .output_prefix ('[url] ' )
292- def title_auto (bot , trigger ):
286+ def title_auto (bot : SopelWrapper , trigger : Trigger ):
293287 """
294288 Automatically show titles for URLs. For shortened URLs/redirects, find
295289 where the URL redirects to and show the title for that (or call a function
@@ -311,55 +305,68 @@ def title_auto(bot, trigger):
311305 urls = web .search_urls (
312306 trigger , exclusion_char = bot .config .url .exclusion_char , clean = True )
313307
314- for url , title , domain , tinyurl in process_urls (bot , trigger , urls ):
315- message = '%s | %s' % (title , domain )
316- if tinyurl :
317- message += ' ( %s )' % tinyurl
318- # Guard against responding to other instances of this bot.
319- if message != trigger :
320- bot .say (message )
321- bot .memory ['last_seen_url' ][trigger .sender ] = url
308+ for url , title , domain , tinyurl , dispatched in process_urls (bot , trigger , urls ):
309+ if not dispatched :
310+ message = '%s | %s' % (title , domain )
311+ if tinyurl :
312+ message += ' ( %s )' % tinyurl
313+ # Guard against responding to other instances of this bot.
314+ if message != trigger :
315+ bot .say (message )
316+ bot .memory ["last_seen_url" ][trigger .sender ] = url
322317
323318
324- def process_urls (bot , trigger , urls ):
319+ def process_urls (
320+ bot : SopelWrapper , trigger : Trigger , urls : List [str ], requested : bool = False
321+ ) -> Generator [Tuple [str , str , Optional [str ], Optional [str ], bool ], None , None ]:
325322 """
326- For each URL in the list, ensure that it isn't handled by another plugin.
327- If not, find where it redirects to, if anywhere. If that redirected URL
328- should be handled by another plugin, dispatch the callback for it.
329- Return a list of (title, hostname) tuples for each URL which is not handled
330- by another plugin.
323+ For each URL in the list, ensure it should be titled, and do so.
324+
325+ :param bot: Sopel instance
326+ :param trigger: The trigger object for this event
327+ :param urls: The URLs detected in the triggering message
328+ :param requested: Whether the title was explicitly requested (vs automatic)
329+
330+ See if it's handled by another plugin. If not, find where it redirects to,
331+ if anywhere. If that redirected URL should be handled by another plugin,
332+ dispatch the callback for it. This is a generator: it yields one
333+ (url, title, hostname, tinyurl, dispatched) tuple for each URL.
334+
335+ If a callback was dispatched, only the url and dispatched=True will be set.
336+
337+ For titles explicitly requested by the user, exclusion_char and excludes
338+ are skipped.
331339 """
332340 shorten_url_length = bot .config .url .shorten_url_length
333341 for url in urls :
334342 # Exclude URLs that start with the exclusion char
335- if url .startswith (bot .config .url .exclusion_char ):
343+ if not requested and url .startswith (bot .config .url .exclusion_char ):
336344 continue
337345
346+ parsed_url = urlparse (url )
347+
338348 # Check the URL does not match an existing URL callback
339- if check_callbacks (bot , url ):
340- continue
349+ if check_callbacks (bot , url , use_excludes = not requested ):
350+ yield (url , None , None , None , True )
351+ return
341352
342353 # Prevent private addresses from being queried if enable_private_resolution is False
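354+ # FIXME: Only the initial hostname is checked, so an attacker can bypass this by serving an HTTP redirect (e.g. a 302) that points to a private address
355+ # FIXME: This check also has a TOCTOU issue: the hostname is resolved here and resolved again when the request is made, and the two lookups may return different addresses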
343356 if not bot .config .url .enable_private_resolution :
344- parsed = urlparse (url )
345- # Check if it's an address like http://192.168.1.1
346357 try :
347- if ipaddress .ip_address (parsed .hostname ).is_private or ipaddress .ip_address (parsed .hostname ).is_loopback :
348- LOGGER .debug ('Ignoring private URL: %s' , url )
349- continue
358+ ips = [ip_address (parsed_url .hostname )]
350359 except ValueError :
351- pass
352-
353- # Check if domains are RFC1918 addresses if enable_dns_resolutions is set
354- if bot .config .url .enable_dns_resolution :
355- private = False
356- for result in dns .resolver .query (parsed .hostname ):
357- if ipaddress .ip_address (result ).is_private or ipaddress .ip_address (parsed .hostname ).is_loopback :
358- private = True
359- break
360- if private :
361- LOGGER .debug ('Ignoring private URL: %s' , url )
362- continue
360+ ips = [ip_address (ip ) for ip in dns .resolver .query (parsed_url .hostname )]
361+
362+ private = False
363+ for ip in ips :
364+ if ip .is_private or ip .is_loopback :
365+ private = True
366+ break
367+ if private :
368+ LOGGER .debug ("Ignoring private URL: %s" , url )
369+ continue
363370
364371 # Call the URL to get a title, if possible
365372 title = find_title (url )
@@ -373,14 +380,15 @@ def process_urls(bot, trigger, urls):
373380 if (shorten_url_length > 0 ) and (len (url ) > shorten_url_length ):
374381 tinyurl = get_or_create_shorturl (bot , url )
375382
376- yield (url , title , get_hostname ( url ) , tinyurl )
383+ yield (url , title , parsed_url . hostname , tinyurl , False )
377384
378385
379- def check_callbacks (bot , url ) :
386+ def check_callbacks (bot : SopelWrapper , url : str , use_excludes : bool = True ) -> bool :
380387 """Check if ``url`` is excluded or matches any URL callback patterns.
381388
382389 :param bot: Sopel instance
383- :param str url: URL to check
390+ :param url: URL to check
391+ :param use_excludes: Use or ignore the configured exclusion lists
384392 :return: True if ``url`` is excluded (only checked when ``use_excludes`` is true) or matches any URL callback pattern
385393
386394 This function looks at the ``bot.memory`` for ``url_exclude`` patterns and
@@ -400,16 +408,21 @@ def check_callbacks(bot, url):
400408
401409 """
402410 # Check if it matches the exclusion list first
403- matched = any (regex .search (url ) for regex in bot .memory ['url_exclude' ])
411+ excluded = use_excludes and any (
412+ regex .search (url ) for regex in bot .memory ["url_exclude" ]
413+ )
404414 return (
405- matched or
415+ excluded or
406416 any (bot .search_url_callbacks (url )) or
407417 bot .rules .check_url_callback (bot , url )
408418 )
409419
410420
411- def find_title (url , verify = True ):
412- """Return the title for the given URL."""
421+ def find_title (url : str , verify : bool = True ) -> Optional [str ]:
422+ """Return the title for the given URL.
423+
424+ :param verify: Whether to require a valid certificate when using https
425+ """
413426 try :
414427 response = requests .get (url , stream = True , verify = verify ,
415428 headers = DEFAULT_HEADERS )
@@ -447,32 +460,15 @@ def find_title(url, verify=True):
447460
448461 title = ' ' .join (title .split ()) # cleanly remove multiple spaces
449462
450- # More cryptic regex substitutions. This one looks to be myano's invention.
451- title = RE_DCC .sub ('' , title )
452-
453463 return title or None
454464
455465
def get_hostname(url):
    """Return the host portion (``host[:port]``) of ``url``.

    :param url: the URL to extract the host from
    :return: everything between the scheme separator (``://``) and the
             first ``/``, ``?`` or ``#``; userinfo and port, if present,
             are kept as-is

    The scheme is located by its ``://`` separator rather than by a fixed
    character offset, so it is recognized regardless of its case or length,
    and a URL without a scheme is used unchanged instead of losing its
    leading characters.
    """
    head, sep, tail = url.partition('://')
    # Only treat ``head`` as a scheme when ``://`` comes before any path,
    # query or fragment delimiter; otherwise the separator belongs to a
    # URL embedded later in the string (e.g. inside a query parameter).
    if sep and head and not any(char in head for char in '/?#'):
        hostname = tail
    else:
        hostname = url
    # The authority component ends at the first of these delimiters.
    for delimiter in '/?#':
        hostname = hostname.partition(delimiter)[0]
    return hostname
468-
469- def get_or_create_shorturl (bot , url ):
466+ def get_or_create_shorturl (bot : SopelWrapper , url : str ) -> str :
470467 """Get or create a short URL for ``url``
471468
472469 :param bot: Sopel instance
473- :param str url: URL to get or create a short URL for
470+ :param url: URL to get or create a short URL for
474471 :return: A short URL
475- :rtype: str
476472
477473 It gets the short URL for ``url`` from the bot's memory if it exists.
478474 Otherwise, it creates a short URL (see :func:`get_tinyurl`), stores it
@@ -488,7 +484,7 @@ def get_or_create_shorturl(bot, url):
488484 return tinyurl
489485
490486
491- def get_tinyurl (url ) :
487+ def get_tinyurl (url : str ) -> Optional [ str ] :
492488 """Returns a shortened tinyURL link of the URL"""
493489 base_url = "https://tinyurl.com/api-create.php"
494490 tinyurl = "%s?%s" % (base_url , web .urlencode ({'url' : url }))
0 commit comments