Files
baijiahao_data_crawl/venv/Lib/site-packages/mitmproxy/addons/savehar.py

313 lines
11 KiB
Python
Raw Normal View History

2025-12-25 11:16:59 +08:00
"""Write flow objects to a HAR file"""
import base64
import json
import logging
import zlib
from collections.abc import Sequence
from datetime import datetime
from datetime import timezone
from typing import Any
from mitmproxy import command
from mitmproxy import ctx
from mitmproxy import exceptions
from mitmproxy import flow
from mitmproxy import flowfilter
from mitmproxy import http
from mitmproxy import types
from mitmproxy import version
from mitmproxy.addonmanager import Loader
from mitmproxy.connection import Server
from mitmproxy.coretypes.multidict import _MultiDict
from mitmproxy.log import ALERT
from mitmproxy.utils import human
from mitmproxy.utils import strutils
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class SaveHar:
    """Addon that serializes flows into a HAR 1.2 (HTTP Archive) document.

    Flows can be exported on demand through the ``save.har`` command, or
    collected while the ``hardump`` option is set and written out by
    :meth:`done`.
    """

    def __init__(self) -> None:
        # Flows retained for the dump performed in done(); only appended to
        # while the `hardump` option is set (see _save_flow).
        self.flows: list[flow.Flow] = []
        # Parsed `save_stream_filter`; None means "keep every flow".
        self.filt: flowfilter.TFilter | None = None

    # NOTE(review): the docstring below is kept verbatim because it is
    # presumably surfaced as the command's help text -- confirm before editing.
    @command.command("save.har")
    def export_har(self, flows: Sequence[flow.Flow], path: types.Path) -> None:
        """Export flows to an HAR (HTTP Archive) file."""
        har = json.dumps(self.make_har(flows), indent=4).encode()

        # A ".zhar" suffix selects zlib-compressed output (maximum level).
        if path.endswith(".zhar"):
            har = zlib.compress(har, 9)

        with open(path, "wb") as f:
            f.write(har)

        # Use the module logger, consistent with make_har(), instead of the
        # root logger. The reported size is the size actually written.
        logger.log(ALERT, f"HAR file saved ({human.pretty_size(len(har))} bytes).")

    def make_har(self, flows: Sequence[flow.Flow]) -> dict:
        """Build the top-level HAR dictionary for *flows*.

        Only ``http.HTTPFlow`` instances become entries; everything else is
        counted and reported once via an info log message.

        Returns:
            A dict with a single ``"log"`` key holding version, creator,
            an empty ``pages`` list and the generated ``entries``.
        """
        entries = []
        skipped = 0
        # A list of server seen till now is maintained so we can avoid
        # using 'connect' time for entries that use an existing connection.
        servers_seen: set[Server] = set()

        for f in flows:
            if isinstance(f, http.HTTPFlow):
                entries.append(self.flow_entry(f, servers_seen))
            else:
                skipped += 1

        if skipped > 0:
            logger.info(f"Skipped {skipped} flows that weren't HTTP flows.")

        return {
            "log": {
                "version": "1.2",
                "creator": {
                    "name": "mitmproxy",
                    "version": version.VERSION,
                    "comment": "",
                },
                "pages": [],
                "entries": entries,
            }
        }

    def load(self, loader: Loader):
        """Register the ``hardump`` string option (default: empty string)."""
        # The help text is deliberately left flush-left so the string literal
        # stays byte-identical to the original.
        loader.add_option(
            "hardump",
            str,
            "",
            """
Save a HAR file with all flows on exit.
You may select particular flows by setting save_stream_filter.
For mitmdump, enabling this option will mean that flows are kept in memory.
""",
        )

    def configure(self, updated):
        """React to option changes.

        Re-parses ``save_stream_filter`` when it changed and drops collected
        flows when ``hardump`` was changed to a falsy value.

        Raises:
            exceptions.OptionsError: if the filter expression fails to parse.
        """
        if "save_stream_filter" in updated:
            if ctx.options.save_stream_filter:
                try:
                    self.filt = flowfilter.parse(ctx.options.save_stream_filter)
                except ValueError as e:
                    raise exceptions.OptionsError(str(e)) from e
            else:
                self.filt = None

        if "hardump" in updated:
            if not ctx.options.hardump:
                self.flows = []

    def response(self, flow: http.HTTPFlow) -> None:
        """Record a flow once its response is available."""
        # websocket flows will receive a websocket_end,
        # we don't want to persist them here already
        if flow.websocket is None:
            self._save_flow(flow)

    def error(self, flow: http.HTTPFlow) -> None:
        """Record a failed flow; handled exactly like :meth:`response`."""
        self.response(flow)

    def websocket_end(self, flow: http.HTTPFlow) -> None:
        """Record a WebSocket flow once the connection has ended."""
        self._save_flow(flow)

    def _save_flow(self, flow: http.HTTPFlow) -> None:
        """Keep *flow* for the final dump if `hardump` is set and the filter matches."""
        if ctx.options.hardump:
            flow_matches = self.filt is None or self.filt(flow)
            if flow_matches:
                self.flows.append(flow)

    def done(self):
        """Emit the collected flows when `hardump` is set.

        A value of ``"-"`` prints the HAR JSON to stdout; any other value is
        passed to :meth:`export_har` as the output path.
        """
        if ctx.options.hardump:
            if ctx.options.hardump == "-":
                har = self.make_har(self.flows)
                print(json.dumps(har, indent=4))
            else:
                self.export_har(self.flows, ctx.options.hardump)

    def flow_entry(self, flow: http.HTTPFlow, servers_seen: set[Server]) -> dict:
        """Create the HAR entry for a single HTTP flow.

        Args:
            flow: The flow to serialize.
            servers_seen: Server connections whose setup time has already been
                attributed to an earlier entry. Mutated: the flow's server
                connection is added when its setup time is reported here.

        Returns:
            A dict with ``startedDateTime``, ``time``, ``request``,
            ``response``, ``cache`` and ``timings``; plus ``serverIPAddress``
            when a peer address is known and the ``_resourceType`` /
            ``_webSocketMessages`` extension fields for WebSocket flows.
        """
        connect_time, ssl_time = self._connection_timings(flow, servers_seen)

        # All durations are timestamp differences multiplied by 1000.
        if flow.request.timestamp_end:
            send = 1000 * (flow.request.timestamp_end - flow.request.timestamp_start)
        else:
            send = 0

        if flow.response and flow.request.timestamp_end:
            wait = 1000 * (flow.response.timestamp_start - flow.request.timestamp_end)
        else:
            wait = 0

        if flow.response and flow.response.timestamp_end:
            receive = 1000 * (
                flow.response.timestamp_end - flow.response.timestamp_start
            )
        else:
            receive = 0

        timings: dict[str, float | None] = {
            "connect": connect_time,
            "ssl": ssl_time,
            "send": send,
            "receive": receive,
            "wait": wait,
        }

        response = self._response_entry(flow)

        if flow.request.method == "CONNECT":
            url = f"https://{flow.request.pretty_url}/"
        else:
            url = flow.request.pretty_url

        entry: dict[str, Any] = {
            "startedDateTime": datetime.fromtimestamp(
                flow.request.timestamp_start, timezone.utc
            ).isoformat(),
            # Negative values are "not applicable" markers and are left out
            # of the total.
            "time": sum(v for v in timings.values() if v is not None and v >= 0),
            "request": {
                "method": flow.request.method,
                "url": url,
                "httpVersion": flow.request.http_version,
                "cookies": self.format_multidict(flow.request.cookies),
                "headers": self.format_multidict(flow.request.headers),
                "queryString": self.format_multidict(flow.request.query),
                "headersSize": len(str(flow.request.headers)),
                "bodySize": len(flow.request.raw_content)
                if flow.request.raw_content
                else 0,
            },
            "response": response,
            "cache": {},
            "timings": timings,
        }

        if flow.request.method in ["POST", "PUT", "PATCH"]:
            params = self.format_multidict(flow.request.urlencoded_form)
            entry["request"]["postData"] = {
                "mimeType": flow.request.headers.get("Content-Type", ""),
                "text": flow.request.get_text(strict=False),
                "params": params,
            }

        if flow.server_conn.peername:
            entry["serverIPAddress"] = str(flow.server_conn.peername[0])

        if flow.websocket:
            websocket_messages = self._websocket_messages(flow)
            entry["_resourceType"] = "websocket"
            entry["_webSocketMessages"] = websocket_messages

        return entry

    def _connection_timings(
        self, flow: http.HTTPFlow, servers_seen: set[Server]
    ) -> tuple[float, float]:
        """Return ``(connect, ssl)`` for *flow*, each -1.0 when not applicable.

        Setup time is reported only for the first entry that uses a given
        server connection; that connection is then added to *servers_seen*.
        """
        server_conn = flow.server_conn
        if server_conn in servers_seen or not server_conn.timestamp_tcp_setup:
            return -1.0, -1.0

        # Invariant check: a connection with a TCP setup timestamp is expected
        # to have a start timestamp as well.
        assert server_conn.timestamp_start
        connect_time = 1000 * (
            server_conn.timestamp_tcp_setup - server_conn.timestamp_start
        )

        if server_conn.timestamp_tls_setup:
            ssl_time = 1000 * (
                server_conn.timestamp_tls_setup - server_conn.timestamp_tcp_setup
            )
        else:
            ssl_time = -1.0

        servers_seen.add(server_conn)
        return connect_time, ssl_time

    def _response_entry(self, flow: http.HTTPFlow) -> dict[str, Any]:
        """Build the HAR ``response`` object for *flow*.

        Flows without a response get a placeholder object whose ``_error``
        field carries the flow's error message, if any.
        """
        if not flow.response:
            response: dict[str, Any] = {
                "status": 0,
                "statusText": "",
                "httpVersion": "",
                "headers": [],
                "cookies": [],
                "content": {},
                "redirectURL": "",
                "headersSize": -1,
                "bodySize": -1,
                "_transferSize": 0,
                "_error": None,
            }
            # NOTE(review): nesting reconstructed from a whitespace-flattened
            # source; placed next to the "_error" default it overrides --
            # confirm against the upstream file.
            if flow.error:
                response["_error"] = flow.error.msg
            return response

        # Fall back to the raw body when `.content` raises ValueError.
        try:
            content = flow.response.content
        except ValueError:
            content = flow.response.raw_content

        response_body_size = (
            len(flow.response.raw_content) if flow.response.raw_content else 0
        )
        response_body_decoded_size = len(content) if content else 0
        response_body_compression = response_body_decoded_size - response_body_size

        response = {
            "status": flow.response.status_code,
            "statusText": flow.response.reason,
            "httpVersion": flow.response.http_version,
            "cookies": self.format_response_cookies(flow.response),
            "headers": self.format_multidict(flow.response.headers),
            "content": {
                "size": response_body_size,
                "compression": response_body_compression,
                "mimeType": flow.response.headers.get("Content-Type", ""),
            },
            "redirectURL": flow.response.headers.get("Location", ""),
            "headersSize": len(str(flow.response.headers)),
            "bodySize": response_body_size,
        }

        if content and strutils.is_mostly_bin(content):
            # Mostly-binary bodies are embedded base64-encoded.
            response["content"]["text"] = base64.b64encode(content).decode()
            response["content"]["encoding"] = "base64"
        else:
            text_content = flow.response.get_text(strict=False)
            response["content"]["text"] = (
                "" if text_content is None else text_content
            )

        return response

    def _websocket_messages(self, flow: http.HTTPFlow) -> list[dict]:
        """Serialize the messages of a WebSocket flow.

        The caller must ensure ``flow.websocket`` is set. Text messages are
        stored as-is; other messages are base64-encoded.
        """
        websocket_messages = []
        for message in flow.websocket.messages:
            if message.is_text:
                data = message.text
            else:
                data = base64.b64encode(message.content).decode()
            websocket_messages.append(
                {
                    "type": "send" if message.from_client else "receive",
                    "time": message.timestamp,
                    "opcode": message.type.value,
                    "data": data,
                }
            )
        return websocket_messages

    def format_response_cookies(self, response: http.Response) -> list[dict]:
        """Format the response's cookies as a list of HAR cookie objects."""
        cookie_list = response.cookies.items(multi=True)
        rv = []
        for name, (value, attrs) in cookie_list:
            cookie = {
                "name": name,
                "value": value,
                "path": attrs.get("path", "/"),
                "domain": attrs.get("domain", ""),
                "httpOnly": "httpOnly" in attrs,
                "secure": "secure" in attrs,
            }
            # TODO: handle expires attribute here.
            # This is not quite trivial because we need to parse random date formats.
            # For now, we just ignore the attribute.

            if "sameSite" in attrs:
                cookie["sameSite"] = attrs["sameSite"]

            rv.append(cookie)
        return rv

    def format_multidict(self, obj: _MultiDict[str, str]) -> list[dict]:
        """Convert a multidict into a list of ``{"name", "value"}`` dicts.

        Iterates ``obj.items(multi=True)``, producing one dict per item.
        """
        return [{"name": k, "value": v} for k, v in obj.items(multi=True)]