from __future__ import annotations import re import urllib.parse from collections.abc import Sequence from typing import AnyStr from typing import overload from mitmproxy.net import check from mitmproxy.net.check import is_valid_host from mitmproxy.net.check import is_valid_port from mitmproxy.utils.strutils import always_str # This regex extracts & splits the host header into host and port. # Handles the edge case of IPv6 addresses containing colons. # https://bugzilla.mozilla.org/show_bug.cgi?id=45891 _authority_re = re.compile(r"^(?P[^:]+|\[.+\])(?::(?P\d+))?$") def parse(url: str | bytes) -> tuple[bytes, bytes, int, bytes]: """ URL-parsing function that checks that - port is an integer 0-65535 - host is a valid IDNA-encoded hostname with no null-bytes - path is valid ASCII Args: A URL (as bytes or as unicode) Returns: A (scheme, host, port, path) tuple Raises: ValueError, if the URL is not properly formatted. """ # FIXME: We shouldn't rely on urllib here. # Size of Ascii character after encoding is 1 byte which is same as its size # But non-Ascii character's size after encoding will be more than its size def ascii_check(x): if len(x) == len(str(x).encode()): return True return False if isinstance(url, bytes): url = url.decode() if not ascii_check(url): url = urllib.parse.urlsplit(url) # type: ignore url = list(url) # type: ignore url[3] = urllib.parse.quote(url[3]) # type: ignore url = urllib.parse.urlunsplit(url) # type: ignore parsed: urllib.parse.ParseResult = urllib.parse.urlparse(url) if not parsed.hostname: raise ValueError("No hostname given") else: host = parsed.hostname.encode("idna") parsed_b: urllib.parse.ParseResultBytes = parsed.encode("ascii") # type: ignore port = parsed_b.port if not port: port = 443 if parsed_b.scheme == b"https" else 80 full_path: bytes = urllib.parse.urlunparse( (b"", b"", parsed_b.path, parsed_b.params, parsed_b.query, parsed_b.fragment) # type: ignore ) if not full_path.startswith(b"/"): full_path = b"/" + full_path # type: ignore if not check.is_valid_host(host): raise ValueError("Invalid Host") return parsed_b.scheme, host, port, full_path @overload def unparse(scheme: str, host: str, port: int, path) -> str: ... @overload def unparse(scheme: bytes, host: bytes, port: int, path) -> bytes: ... def unparse(scheme, host, port, path): """ Returns a URL string, constructed from the specified components. """ authority = hostport(scheme, host, port) if isinstance(scheme, str): return f"{scheme}://{authority}{path}" else: return b"%s://%s%s" % (scheme, authority, path) def encode(s: Sequence[tuple[str, str]], similar_to: str | None = None) -> str: """ Takes a list of (key, value) tuples and returns a urlencoded string. If similar_to is passed, the output is formatted similar to the provided urlencoded string. """ remove_trailing_equal = False if similar_to: remove_trailing_equal = any("=" not in param for param in similar_to.split("&")) encoded = urllib.parse.urlencode(s, False, errors="surrogateescape") if encoded and remove_trailing_equal: encoded = encoded.replace("=&", "&") if encoded[-1] == "=": encoded = encoded[:-1] return encoded def decode(s): """ Takes a urlencoded string and returns a list of surrogate-escaped (key, value) tuples. """ return urllib.parse.parse_qsl(s, keep_blank_values=True, errors="surrogateescape") def quote(b: str, safe: str = "/") -> str: """ Returns: An ascii-encodable str. """ return urllib.parse.quote(b, safe=safe, errors="surrogateescape") def unquote(s: str) -> str: """ Args: s: A surrogate-escaped str Returns: A surrogate-escaped str """ return urllib.parse.unquote(s, errors="surrogateescape") def hostport(scheme: AnyStr, host: AnyStr, port: int) -> AnyStr: """ Returns the host component, with a port specification if needed. """ if default_port(scheme) == port: return host else: if isinstance(host, bytes): return b"%s:%d" % (host, port) else: return "%s:%d" % (host, port) def default_port(scheme: AnyStr) -> int | None: return { "http": 80, b"http": 80, "https": 443, b"https": 443, }.get(scheme, None) def parse_authority(authority: AnyStr, check: bool) -> tuple[str, int | None]: """Extract the host and port from host header/authority information Raises: ValueError, if check is True and the authority information is malformed. """ try: if isinstance(authority, bytes): m = _authority_re.match(authority.decode("utf-8")) if not m: raise ValueError host = m["host"].encode("utf-8").decode("idna") else: m = _authority_re.match(authority) if not m: raise ValueError host = m.group("host") if host.startswith("[") and host.endswith("]"): host = host[1:-1] if not is_valid_host(host): raise ValueError if m.group("port"): port = int(m.group("port")) if not is_valid_port(port): raise ValueError return host, port else: return host, None except ValueError: if check: raise else: return always_str(authority, "utf-8", "surrogateescape"), None