Files
baijiahao_data_crawl/venv/Lib/site-packages/mitmproxy/net/http/headers.py
“shengyudong” 322ac74336 2025-12-25 upload
2025-12-25 11:16:59 +08:00

114 lines
3.9 KiB
Python

import collections
import re
def parse_content_type(c: str) -> tuple[str, str, dict[str, str]] | None:
"""
A simple parser for content-type values. Returns a (type, subtype,
parameters) tuple, where type and subtype are strings, and parameters
is a dict. If the string could not be parsed, return None.
E.g. the following string:
text/html; charset=UTF-8
Returns:
("text", "html", {"charset": "UTF-8"})
"""
parts = c.split(";", 1)
ts = parts[0].split("/", 1)
if len(ts) != 2:
return None
d = collections.OrderedDict()
if len(parts) == 2:
for i in parts[1].split(";"):
clause = i.split("=", 1)
if len(clause) == 2:
d[clause[0].strip()] = clause[1].strip()
return ts[0].lower(), ts[1].lower(), d
def assemble_content_type(type: str, subtype: str, parameters: dict[str, str]) -> str:
    """
    Assemble a content-type header value from its components — the
    inverse of `parse_content_type`.

    E.g. ("text", "html", {"charset": "UTF-8"}) becomes:

        text/html; charset=UTF-8

    Note: parameter values are emitted verbatim, without quoting.
    """
    if not parameters:
        return f"{type}/{subtype}"
    params = "; ".join(f"{k}={v}" for k, v in parameters.items())
    return f"{type}/{subtype}; {params}"
def infer_content_encoding(content_type: str, content: bytes = b"") -> str:
    """
    Infer the encoding of content from the content-type header.
    """
    encoding = None
    # BOM has the highest priority. Order matters: the utf-32 marks must be
    # tested before the utf-16 ones, because b"\xff\xfe" is a prefix of the
    # utf-32le BOM.
    bom_table = (
        (b"\x00\x00\xfe\xff", "utf-32be"),
        (b"\xff\xfe\x00\x00", "utf-32le"),
        (b"\xfe\xff", "utf-16be"),
        (b"\xff\xfe", "utf-16le"),
        # 'utf-8-sig' will strip the BOM on decode
        (b"\xef\xbb\xbf", "utf-8-sig"),
    )
    for bom, bom_encoding in bom_table:
        if content.startswith(bom):
            encoding = bom_encoding
            break
    else:
        parsed_content_type = parse_content_type(content_type)
        if parsed_content_type:
            # Use the charset from the header if possible
            encoding = parsed_content_type[2].get("charset")
    # Otherwise, infer the encoding
    if not encoding and "json" in content_type:
        encoding = "utf8"
    if not encoding and "html" in content_type:
        if meta_charset := re.search(
            rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE
        ):
            encoding = meta_charset.group(1).decode("ascii", "ignore")
        else:
            # Fallback to utf8 for html
            # Ref: https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
            # > 9. [snip] the comprehensive UTF-8 encoding is suggested.
            encoding = "utf8"
    if not encoding and "xml" in content_type:
        xml_encoding = re.search(
            rb"""<\?xml[^\?>]+encoding=['"]([^'"\?>]+)""", content, re.IGNORECASE
        )
        if xml_encoding:
            encoding = xml_encoding.group(1).decode("ascii", "ignore")
        else:
            # Fallback to utf8 for xml
            # Ref: https://datatracker.ietf.org/doc/html/rfc7303#section-8.5
            # > the XML processor [snip] to determine an encoding of UTF-8.
            encoding = "utf8"
    if not encoding and ("javascript" in content_type or "ecmascript" in content_type):
        # Fallback to utf8 for javascript
        # Ref: https://datatracker.ietf.org/doc/html/rfc9239#section-4.2
        # > 3. Else, the character encoding scheme is assumed to be UTF-8
        encoding = "utf8"
    if not encoding and "text/css" in content_type:
        # @charset rule must be the very first thing.
        if css_charset := re.match(
            rb"""@charset "([^"]+)";""", content, re.IGNORECASE
        ):
            encoding = css_charset.group(1).decode("ascii", "ignore")
        else:
            # Fallback to utf8 for css
            # Ref: https://drafts.csswg.org/css-syntax/#determine-the-fallback-encoding
            # > 4. Otherwise, return utf-8
            encoding = "utf8"
    if not encoding:
        # Fallback to latin-1
        encoding = "latin-1"
    # Use GB 18030 as the superset of GB2312 and GBK to fix common encoding problems on Chinese websites.
    if encoding.lower() in ("gb2312", "gbk"):
        encoding = "gb18030"
    return encoding