baijiahao_data_crawl/venv/Lib/site-packages/mitmproxy/net/http/validate.py

import logging
import re
import typing

from mitmproxy.http import Message
from mitmproxy.http import Request
from mitmproxy.http import Response

# Module-level logger named after this module; nothing in the code visible here logs through it.
logger = logging.getLogger(__name__)

# https://datatracker.ietf.org/doc/html/rfc7230#section-3.2: Header fields are tokens.
# "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." /  "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA
_valid_header_name = re.compile(rb"^[!#$%&'*+\-.^_`|~0-9a-zA-Z]+$")

_valid_content_length = re.compile(rb"^(?:0|[1-9][0-9]*)$")
_valid_content_length_str = re.compile(r"^(?:0|[1-9][0-9]*)$")

# https://datatracker.ietf.org/doc/html/rfc9112#section-6.1:
# > A sender MUST NOT apply the chunked transfer coding more than once to a message body (i.e., chunking an already
# > chunked message is not allowed). If any transfer coding other than chunked is applied to a request's content, the
# > sender MUST apply chunked as the final transfer coding to ensure that the message is properly framed. If any
# > transfer coding other than chunked is applied to a response's content, the sender MUST either apply chunked as the
# > final transfer coding or terminate the message by closing the connection.
#
# The RFC technically still allows for fun encodings, we are a bit stricter and only accept a known subset by default.
TransferEncoding = typing.Literal[
    "chunked",
    "compress,chunked",
    "deflate,chunked",
    "gzip,chunked",
    "compress",
    "deflate",
    "gzip",
    "identity",
]
_HTTP_1_1_TRANSFER_ENCODINGS = frozenset(typing.get_args(TransferEncoding))


def parse_content_length(value: str | bytes) -> int:
    """Parse a content-length header value, or raise a ValueError if it is invalid."""
    if isinstance(value, str):
        valid = bool(_valid_content_length_str.match(value))
    else:
        valid = bool(_valid_content_length.match(value))
    if not valid:
        raise ValueError(f"invalid content-length header: {value!r}")
    return int(value)


def parse_transfer_encoding(value: str | bytes) -> TransferEncoding:
    """
    Normalize a transfer-encoding header value and return it as one of the known codings.

    The value is lowercased and optional whitespace (spaces/tabs) around commas is removed
    before it is looked up in the set of accepted transfer encodings.

    Raises a ValueError if the value contains non-ASCII characters or is not an accepted coding.
    """
    # Refuse non-ASCII input before lowercasing, so that .lower() cannot turn a non-ASCII character into an ASCII one.
    if not value.isascii():
        raise ValueError(f"invalid transfer-encoding header: {value!r}")

    text = value if isinstance(value, str) else value.decode()
    normalized = re.sub(r"[\t ]*,[\t ]*", ",", text.lower())

    if normalized in _HTTP_1_1_TRANSFER_ENCODINGS:
        return typing.cast(TransferEncoding, normalized)
    raise ValueError(f"unknown transfer-encoding header: {value!r}")


def validate_headers(message: Message) -> None:
    """
    Validate HTTP message headers to avoid request smuggling attacks.

    Raises a ValueError if they are malformed.
    """

    te = []
    cl = []

    for name, value in message.headers.fields:
        if not _valid_header_name.match(name):
            raise ValueError(f"invalid header name: {name!r}")
        match name.lower():
            case b"transfer-encoding":
                te.append(value)
            case b"content-length":
                cl.append(value)

    if te and cl:
        # > A server MAY reject a request that contains both Content-Length and Transfer-Encoding or process such a
        # > request in accordance with the Transfer-Encoding alone.

        # > A sender MUST NOT send a Content-Length header field in any message that contains a Transfer-Encoding header
        # > field.
        raise ValueError(
            "message with both transfer-encoding and content-length headers"
        )
    elif te:
        if len(te) > 1:
            raise ValueError(f"multiple transfer-encoding headers: {te!r}")
        # > Transfer-Encoding was added in HTTP/1.1. It is generally assumed that implementations advertising only
        # > HTTP/1.0 support will not understand how to process transfer-encoded content, and that an HTTP/1.0 message
        # > received with a Transfer-Encoding is likely to have been forwarded without proper handling of the chunked
        # > transfer coding in transit.
        #
        # > A client MUST NOT send a request containing Transfer-Encoding unless it knows the server will handle
        # > HTTP/1.1 requests (or later minor revisions); such knowledge might be in the form of specific user
        # > configuration or by remembering the version of a prior received response. A server MUST NOT send a response
        # > containing Transfer-Encoding unless the corresponding request indicates HTTP/1.1 (or later minor revisions).
        if not message.is_http11:
            raise ValueError(
                f"unexpected HTTP transfer-encoding {te[0]!r} for {message.http_version}"
            )
        # > A server MUST NOT send a Transfer-Encoding header field in any response with a status code of 1xx
        # > (Informational) or 204 (No Content).
        if isinstance(message, Response) and (
            100 <= message.status_code <= 199 or message.status_code == 204
        ):
            raise ValueError(
                f"unexpected HTTP transfer-encoding {te[0]!r} for response with status code {message.status_code}"
            )
        # > If a Transfer-Encoding header field is present in a request and the chunked transfer coding is not the final
        # > encoding, the message body length cannot be determined reliably; the server MUST respond with the 400 (Bad
        # > Request) status code and then close the connection.
        te_parsed = parse_transfer_encoding(te[0])
        match te_parsed:
            case "chunked" | "compress,chunked" | "deflate,chunked" | "gzip,chunked":
                pass
            case "compress" | "deflate" | "gzip" | "identity":
                if isinstance(message, Request):
                    raise ValueError(
                        f"unexpected HTTP transfer-encoding {te_parsed!r} for request"
                    )
            case other:  # pragma: no cover
                typing.assert_never(other)
    elif cl:
        # > If a message is received without Transfer-Encoding and with an invalid Content-Length header field, then the
        # > message framing is invalid and the recipient MUST treat it as an unrecoverable error, unless the field value
        # > can be successfully parsed as a comma-separated list (Section 5.6.1 of [HTTP]), all values in the list are
        # > valid, and all values in the list are the same (in which case, the message is processed with that single
        # > value used as the Content-Length field value).
        # We are stricter here and reject comma-separated lists.
        if len(cl) > 1:
            raise ValueError(f"multiple content-length headers: {cl!r}")
        parse_content_length(cl[0])