[networking] Remove dot segments during URL normalization (#7662)

This implements RFC3986 5.2.4 remove_dot_segments during the URL normalization process.

Closes #3355, #6526

Authored by: coletdjnz
This commit is contained in:
coletdjnz
2023-07-29 10:40:20 +12:00
committed by GitHub
parent a15fcd299e
commit 4bf912282a
8 changed files with 104 additions and 36 deletions

View File

@@ -41,7 +41,8 @@ from .exceptions import (
from ..dependencies import brotli
from ..socks import ProxyError as SocksProxyError
from ..socks import sockssocket
from ..utils import escape_url, update_url_query
from ..utils import update_url_query
from ..utils.networking import normalize_url
SUPPORTED_ENCODINGS = ['gzip', 'deflate']
CONTENT_DECODE_ERRORS = [zlib.error, OSError]
@@ -179,7 +180,7 @@ class HTTPHandler(urllib.request.AbstractHTTPHandler):
# Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
# the code of this workaround has been moved here from YoutubeDL.urlopen()
url = req.get_full_url()
url_escaped = escape_url(url)
url_escaped = normalize_url(url)
# Substitute URL if any change after escaping
if url != url_escaped:
@@ -212,7 +213,7 @@ class HTTPHandler(urllib.request.AbstractHTTPHandler):
if location:
# As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
location = location.encode('iso-8859-1').decode()
location_escaped = escape_url(location)
location_escaped = normalize_url(location)
if location != location_escaped:
del resp.headers['Location']
resp.headers['Location'] = location_escaped

View File

@@ -27,10 +27,9 @@ from ..utils import (
classproperty,
deprecation_warning,
error_to_str,
escape_url,
update_url_query,
)
from ..utils.networking import HTTPHeaderDict
from ..utils.networking import HTTPHeaderDict, normalize_url
if typing.TYPE_CHECKING:
RequestData = bytes | Iterable[bytes] | typing.IO | None
@@ -372,7 +371,7 @@ class Request:
raise TypeError('url must be a string')
elif url.startswith('//'):
url = 'http:' + url
self._url = escape_url(url)
self._url = normalize_url(url)
@property
def method(self):