1375 lines
48 KiB
Python
1375 lines
48 KiB
Python
#!/usr/bin/env python3
|
||
"""Visible Douyin crawler probe.
|
||
|
||
This is a standalone diagnostic copy of the douyin_agent collection flow. It
|
||
does not import or modify app/agents/douyin_agent.py. The script opens a headed
|
||
Chrome/Chromium window, lets the browser produce signed Douyin requests, records
|
||
the relevant API traffic, parses videos/comments, and saves project-compatible
|
||
CSV artifacts for offline inspection.
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import csv
|
||
import json
|
||
import os
|
||
import random
|
||
import sys
|
||
import time
|
||
from datetime import datetime
|
||
from pathlib import Path
|
||
from urllib.parse import parse_qs, quote, urlparse
|
||
|
||
|
||
# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Browser profile directory under the user's home; the name suggests it is the
# default for the profile-dir option (NOTE(review): confirm against the CLI parser).
DEFAULT_PROFILE_DIR = Path(os.path.expanduser("~/.zn-kg/douyin-profile"))
# Base folder for probe artifacts, located under ROOT/data.
DEFAULT_OUT_DIR = ROOT / "data" / "douyin_probe"

# Search page URL template; ``{kw}`` is the keyword placeholder
# (presumably URL-quoted by the caller -- TODO confirm).
SEARCH_URL = "https://www.douyin.com/search/{kw}?type=general"

# User-Agent string handed to the persistent browser context at launch.
UA = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/124.0.0.0 Safari/537.36"
)

# Extra Chromium command-line switches handed to the browser at launch.
CHROME_ARGS = [
    "--disable-blink-features=AutomationControlled",
    "--no-first-run",
    "--no-default-browser-check",
    "--disable-sync",
    "--disable-default-apps",
    "--no-sandbox",
    "--disable-dev-shm-usage",
]
|
||
|
||
# Init script injected into every page of the browser context. It overrides a
# handful of navigator / window.chrome / WebGL properties that automated
# browsers commonly expose, and wraps ``navigator.permissions.query`` so that
# "notifications" queries answer from ``Notification.permission``.
# The saved native ``query`` is invoked with ``.call(navigator.permissions, p)``:
# calling a detached native method without its receiver fails with an
# illegal-invocation TypeError for every non-notification query.
STEALTH_JS = r"""
(() => {
  const d=(o,k,g)=>{try{Object.defineProperty(o,k,{get:g,configurable:true})}catch(e){}};
  d(Navigator.prototype,'webdriver',()=>undefined);
  d(Navigator.prototype,'languages',()=>['zh-CN','zh','en']);
  d(Navigator.prototype,'language',()=>'zh-CN');
  d(Navigator.prototype,'vendor',()=>'Google Inc.');
  if(!window.chrome){try{Object.defineProperty(window,'chrome',{value:{runtime:{}},configurable:true})}catch(e){}}
  try{window.chrome.app={isInstalled:false};window.chrome.csi=()=>({});window.chrome.loadTimes=()=>({});}catch(e){}
  const mk=(a)=>{a.item=(i)=>a[i]||null;a.namedItem=(n)=>a.find(x=>x.name===n)||null;return a;};
  const plugins=mk([{name:'Chrome PDF Plugin'},{name:'Chrome PDF Viewer'},{name:'Native Client'}]);
  d(Navigator.prototype,'plugins',()=>plugins);
  const q=navigator.permissions&&navigator.permissions.query;
  if(q){navigator.permissions.query=(p)=>p&&p.name==='notifications'
    ?Promise.resolve({state:Notification.permission,onchange:null}):q.call(navigator.permissions,p);}
  const pw=(proto)=>{if(!proto||!proto.getParameter)return;const o=proto.getParameter;
    proto.getParameter=function(p){if(p===37445)return 'Intel Inc.';
    if(p===37446)return 'Intel Iris OpenGL Engine';return o.apply(this,arguments);};};
  pw(window.WebGLRenderingContext&&WebGLRenderingContext.prototype);
  pw(window.WebGL2RenderingContext&&WebGL2RenderingContext.prototype);
})();
"""
|
||
|
||
# URL substrings of the Douyin web API endpoints whose JSON bodies are kept in
# the raw capture. The response listener matches them with a plain ``in`` test,
# so the shorter entries already cover the longer ones that share their prefix.
LISTEN_PATTERNS = [
    "/aweme/v1/web/aweme/detail",
    "/aweme/v1/web/comment/list",
    "/aweme/v1/web/comment/list/reply",
    "/aweme/v1/web/general/search/stream",
    "/aweme/v1/web/general/search/single",
    "/aweme/v1/web/search/item",
    "/aweme/v1/web/general/search",
]

# URL substrings that make a response worth recording as a diagnostic API
# event; responses whose URL contains none of these are ignored by the listener.
DIAG_PATTERNS = [
    "https://www.douyin.com/aweme/v1/web/",
    "https://www.douyin.com/search/",
]

# CSS selectors tried when looking for the comment panel / its scroll container.
# NOTE(review): ".ESlRWJ2j" looks like a build-generated class name and may
# stop matching after a site redeploy -- confirm.
COMMENT_SCROLLERS = (
    "[data-e2e='comment-list']",
    "[data-e2e='detail-comment']",
    ".comment-list",
    ".comment-mainContent",
    ".ESlRWJ2j",
)

# CSS selectors whose inner text is harvested as DOM comment snippets.
DOM_COMMENT_SELECTORS = (
    "[data-e2e='comment-item']",
    "[data-e2e='comment-text']",
    ".comment-mainContent",
    ".comment-item",
    ".comment-text",
)
|
||
|
||
# Column order of the evidence CSVs (social_evidence.csv, videos.csv and
# comments.csv are all written with this header).
EVIDENCE_COLUMNS = [
    "platform",
    "kind",
    "source_id",
    "url",
    "entity_name",
    "place_natural_key",
    "keyword",
    "title",
    "content",
    "author",
    "author_id",
    "author_avatar",
    "likes",
    "comments",
    "collects",
    "shares",
    "publish_time",
    "location",
    "tags",
    "image_urls",
    "raw_json",
]

# Column order of video_metrics.csv (one row per parsed video record).
VIDEO_METRIC_COLUMNS = [
    "platform",
    "source_id",
    "url",
    "entity_name",
    "place_natural_key",
    "keyword",
    "video_title",
    "video_author",
    "video_author_id",
    "video_publish_time",
    "video_publish_timestamp",
    "video_like_count",
    "video_comment_count",
    "video_collect_count",
    "video_share_count",
]
|
||
|
||
|
||
def log(message: str) -> None:
    """Print *message* to stdout prefixed with the local time as ``[HH:MM:SS]``.

    The line is flushed immediately so progress shows up in real time.
    """
    stamp = f"{datetime.now():%H:%M:%S}"
    print(f"[{stamp}] {message}", flush=True)
|
||
|
||
|
||
def unlock_profile(profile_dir: Path) -> None:
    """Delete the Chromium ``Singleton*`` lock files found in *profile_dir*.

    Best effort: a file that is missing or cannot be removed is skipped
    silently (any ``OSError`` is ignored).
    """
    lock_names = ("SingletonLock", "SingletonCookie", "SingletonSocket")
    for lock_name in lock_names:
        try:
            profile_dir.joinpath(lock_name).unlink()
        except OSError:
            continue
|
||
|
||
|
||
def scalar(obj, *keys, default=""):
    """Return the first non-empty value stored in *obj* under any of *keys*.

    Keys are tried in the given order. ``None``, ``""``, ``[]`` and ``{}``
    count as empty; other falsy values such as ``0`` are returned as-is.
    *default* is returned when *obj* is not a dict or no key has a value.
    """
    if isinstance(obj, dict):
        blanks = (None, "", [], {})
        for name in keys:
            candidate = obj.get(name)
            if candidate in blanks:
                continue
            return candidate
    return default
|
||
|
||
|
||
def first_avatar(user: dict) -> str:
    """Return the first usable avatar URL of a user dict, or ``""``.

    ``avatar_thumb`` is preferred over ``avatar_medium``. Each is expected to
    be a dict carrying a ``url_list``; malformed entries are skipped. The
    lookup falls through to the next URL / next avatar field when the
    preferred one is present but empty, instead of giving up.
    """
    if not isinstance(user, dict):
        return ""
    for field in ("avatar_thumb", "avatar_medium"):
        avatar = user.get(field)
        urls = avatar.get("url_list") if isinstance(avatar, dict) else None
        if not isinstance(urls, list):
            continue
        for url in urls:
            if url:
                return str(url)
    return ""
|
||
|
||
|
||
def as_int(value) -> int:
    """Coerce *value* to ``int``, returning ``0`` when that is not possible.

    ``None`` and ``""`` map to 0, as do values ``int()`` rejects. Infinite
    floats raise ``OverflowError`` in ``int()``; that is treated as
    "not convertible" too.
    """
    try:
        if value in (None, ""):
            return 0
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return 0
|
||
|
||
|
||
def format_publish_time(value) -> str:
    """Render an epoch timestamp as local ``YYYY-MM-DD HH:MM:SS``.

    *value* may be an int or numeric string, in seconds or milliseconds
    (anything above 10_000_000_000 is treated as milliseconds). Values that
    cannot be converted are returned as ``str(value or "")``; zero or negative
    timestamps yield ``""``.
    """
    try:
        ts = int(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers infinite floats, which int() refuses.
        return str(value or "")
    if ts <= 0:
        return ""
    if ts > 10_000_000_000:
        ts = ts // 1000
    try:
        return datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S")
    except (OSError, OverflowError, ValueError):
        return str(value or "")
|
||
|
||
|
||
def body_of(entry: dict):
    """Return the response body carried by a captured API *entry*.

    Wrapped entries store the body under ``"__body"``; a dict without that key
    is treated as the body itself. Non-dict input yields ``{}``.
    """
    if not isinstance(entry, dict):
        return {}
    try:
        return entry["__body"]
    except KeyError:
        return entry
|
||
|
||
|
||
def source_url_of(entry: dict) -> str:
    """Return the request URL recorded on a captured *entry*, or ``""``.

    ``"__url"`` is preferred; ``"__probe_url"`` is the fallback.
    """
    if isinstance(entry, dict):
        for field in ("__url", "__probe_url"):
            recorded = entry.get(field)
            if recorded:
                return str(recorded)
    return ""
|
||
|
||
|
||
def aweme_id_from_url(url: str) -> str:
    """Extract a video id from the query string of *url*.

    The parameters ``aweme_id``, ``item_id`` and ``group_id`` are checked in
    that order; the first one present wins. Returns ``""`` when the URL cannot
    be parsed or none of them is present.
    """
    try:
        params = parse_qs(urlparse(url).query)
    except Exception:
        return ""
    present = (
        params[name]
        for name in ("aweme_id", "item_id", "group_id")
        if params.get(name)
    )
    values = next(present, None)
    if values is None:
        return ""
    return str(values[0] or "")
|
||
|
||
|
||
def iter_lists_by_key(obj, keys: set[str]):
    """Yield every list stored under one of *keys* anywhere inside *obj*.

    *obj* is walked depth-first through nested dicts and lists. A matching
    list is yielded and then searched as well, so lists nested inside its
    items (for example reply lists attached to individual comments) are
    found too. Non-container input yields nothing.
    """
    if isinstance(obj, dict):
        for key, value in obj.items():
            if key in keys and isinstance(value, list):
                yield value
            # Descend even into a list that was just yielded: its items may
            # carry further matching lists.
            if isinstance(value, (dict, list)):
                yield from iter_lists_by_key(value, keys)
    elif isinstance(obj, list):
        for item in obj:
            yield from iter_lists_by_key(item, keys)
|
||
|
||
|
||
def parse_douyin_notes(raw_api: list[dict], name: str, keyword: str) -> list[dict]:
    """Convert captured API entries into de-duplicated video ("note") records.

    For each entry body, candidate items are gathered from the known
    locations (``data`` when it is a list, ``aweme_detail`` / ``aweme_list``
    at top level and under ``data``, ``data.data``) plus every ``aweme_list``
    found by ``iter_lists_by_key``. An item may wrap the video under
    ``aweme_info`` or ``aweme``. Videos without an id, or with an id already
    seen, are skipped.

    Args:
        raw_api: Captured entries (wrapped with ``__body`` or bare bodies).
        name: Stored on each record as ``entity_name``.
        keyword: Stored on each record as ``keyword``.

    Returns:
        One record dict per unique video; the source item is kept under ``raw``.
    """
    notes: list[dict] = []
    known_ids: set[str] = set()
    for entry in raw_api or []:
        body = body_of(entry)
        if not isinstance(body, dict):
            continue
        data = body.get("data")
        nested_data = data if isinstance(data, dict) else {}
        sources = [
            data if isinstance(data, list) else None,
            body.get("aweme_detail"),
            body.get("aweme_list"),
            nested_data.get("aweme_detail"),
            nested_data.get("aweme_list"),
            nested_data.get("data"),
        ]
        pool: list = []
        for source in sources:
            if isinstance(source, dict):
                pool.append(source)
            elif isinstance(source, list):
                pool += source
        for chunk in iter_lists_by_key(body, {"aweme_list"}):
            pool += chunk

        for item in pool:
            if not isinstance(item, dict):
                continue
            aweme = item.get("aweme_info") or item.get("aweme") or item
            if not isinstance(aweme, dict):
                continue
            raw_id = scalar(aweme, "aweme_id", "group_id", "awemeId")
            if not raw_id:
                continue
            note_id = str(raw_id)
            if note_id in known_ids:
                continue
            known_ids.add(note_id)

            author = aweme.get("author") or {}
            stat = aweme.get("statistics") or {}
            likes = scalar(stat, "digg_count", "admire_count", default=0)
            collects = scalar(
                stat, "collect_count", "favorite_count", "collects_count", default=0
            )
            shares = scalar(stat, "share_count", "share_count_reflow", default=0)
            record = {
                "platform": "douyin",
                "kind": "note",
                "source_id": note_id,
                "url": f"https://www.douyin.com/video/{note_id}",
                "entity_name": name,
                "keyword": keyword,
                "title": scalar(aweme, "desc", "title", "caption"),
                "content": scalar(aweme, "desc", "content"),
                "author": scalar(author, "nickname", "name"),
                "author_id": str(scalar(author, "uid", "sec_uid", "unique_id")),
                "author_avatar": first_avatar(author),
                "likes": as_int(likes),
                "comments": as_int(scalar(stat, "comment_count", default=0)),
                "collects": as_int(collects),
                "shares": as_int(shares),
                "publish_time": str(scalar(aweme, "create_time", default="")),
                "location": "",
                "tags": [],
                "image_urls": [],
                "raw": item,
            }
            notes.append(record)
    return notes
|
||
|
||
|
||
def parse_douyin_comments(raw_api: list[dict], name: str, keyword: str) -> list[dict]:
    """Convert captured API entries into de-duplicated comment records.

    Every list stored under ``comments``, ``comment_list`` or
    ``reply_comments`` anywhere in an entry body is scanned. A comment needs
    both an id and a text, and each id is emitted once. The owning video id
    comes from the comment itself, falling back to the query string of the
    captured request URL.

    Args:
        raw_api: Captured entries (wrapped with ``__body`` or bare bodies).
        name: Stored on each record as ``entity_name``.
        keyword: Stored on each record as ``keyword``.

    Returns:
        One record dict per unique comment; the source comment is kept under
        ``raw``. ``url`` is empty when no video id could be resolved.
    """
    out: list[dict] = []
    seen: set[str] = set()
    list_keys = {"comments", "comment_list", "reply_comments"}
    for entry in raw_api or []:
        body = body_of(entry)
        if not isinstance(body, dict):
            continue
        fallback_aweme_id = aweme_id_from_url(source_url_of(entry))
        for comments in iter_lists_by_key(body, list_keys):
            for comment in comments:
                if not isinstance(comment, dict):
                    continue
                cid = scalar(comment, "cid", "comment_id", "id")
                text = scalar(comment, "text", "content")
                if not cid or not text or str(cid) in seen:
                    continue
                seen.add(str(cid))
                user = comment.get("user") or comment.get("user_info") or {}
                aweme_id = scalar(comment, "aweme_id", default=fallback_aweme_id)
                replies = scalar(
                    comment, "reply_comment_total", "reply_total", default=0
                )
                # Without a video id the link would be the dangling
                # ".../video/"; emit an empty URL instead.
                url = f"https://www.douyin.com/video/{aweme_id}" if aweme_id else ""
                out.append(
                    {
                        "platform": "douyin",
                        "kind": "comment",
                        "source_id": str(cid),
                        "url": url,
                        "entity_name": name,
                        "keyword": keyword,
                        "title": "",
                        "content": text,
                        "author": scalar(user, "nickname", "name"),
                        "author_id": str(scalar(user, "uid", "sec_uid")),
                        "author_avatar": first_avatar(user),
                        "likes": as_int(
                            scalar(comment, "digg_count", "like_count", default=0)
                        ),
                        "comments": as_int(replies),
                        "collects": 0,
                        "shares": 0,
                        "publish_time": str(
                            scalar(comment, "create_time", default="")
                        ),
                        "location": scalar(comment, "ip_label", default=""),
                        "tags": [],
                        "image_urls": [],
                        "raw": comment,
                    }
                )
    return out
|
||
|
||
|
||
def response_shape(payload) -> dict:
    """Summarise the structure of an API response for the diagnostics log.

    *payload* may be a bare body or an entry wrapping it under ``"__body"``.
    For dict bodies the summary carries the type of ``data``, the first 16
    top-level keys, item counts for comment and aweme lists, and the
    ``status_code`` / ``status_msg`` fields. Non-dict bodies only report
    their type name.
    """
    wrapped = isinstance(payload, dict) and "__body" in payload
    body = payload["__body"] if wrapped else payload
    if not isinstance(body, dict):
        return {"body_type": type(body).__name__, "keys": ""}

    data = body.get("data")
    comment_total = sum(
        len(chunk)
        for chunk in iter_lists_by_key(
            body, {"comments", "comment_list", "reply_comments"}
        )
    )
    aweme_total = sum(len(chunk) for chunk in iter_lists_by_key(body, {"aweme_list"}))
    if isinstance(data, list):
        # A list-valued "data" is counted as aweme items as well.
        aweme_total += len(data)

    shape = {
        "body_type": "dict",
        "data_type": type(data).__name__,
        "keys": ",".join(list(body)[:16]),
        "comment_items": comment_total,
        "aweme_items": aweme_total,
    }
    shape["status_code"] = body.get("status_code", "")
    shape["status_msg"] = body.get("status_msg", "")
    return shape
|
||
|
||
|
||
def write_csv(path: Path, rows: list[dict], columns: list[str] | None = None) -> None:
    """Write *rows* to *path* as a UTF-8 (with BOM) CSV file.

    Parent directories are created as needed. When *columns* is omitted the
    header is the union of all row keys in first-seen order. Row keys that are
    not in the header are ignored.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    if columns is None:
        first_seen: dict = {}
        for row in rows:
            for key in row:
                first_seen.setdefault(key, None)
        columns = list(first_seen)
    with path.open("w", newline="", encoding="utf-8-sig") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(rows)
|
||
|
||
|
||
def evidence_csv_rows(records: list[dict], place_key: str) -> list[dict]:
    """Flatten parsed records into rows matching ``EVIDENCE_COLUMNS``.

    Missing fields become ``""``. ``place_natural_key`` is set to *place_key*,
    and ``tags``, ``image_urls`` and the record's ``raw`` payload (written to
    ``raw_json``) are serialised as JSON text without ASCII escaping.
    """

    def _as_json(value, fallback) -> str:
        return json.dumps(value or fallback, ensure_ascii=False)

    rows = []
    for record in records:
        row = {column: record.get(column, "") for column in EVIDENCE_COLUMNS}
        row.update(
            place_natural_key=place_key,
            tags=_as_json(record.get("tags"), []),
            image_urls=_as_json(record.get("image_urls"), []),
            raw_json=_as_json(record.get("raw"), {}),
        )
        rows.append(row)
    return rows
|
||
|
||
|
||
def video_metric_rows(records: list[dict], place_key: str) -> list[dict]:
    """Build one metrics row per video record (``kind == "note"``).

    The raw publish timestamp is kept as ``video_publish_timestamp`` alongside
    its formatted form. The title falls back to the record content when empty.
    """

    def _metric_row(record: dict) -> dict:
        published = record.get("publish_time", "")
        return {
            "platform": record.get("platform", "douyin"),
            "source_id": record.get("source_id", ""),
            "url": record.get("url", ""),
            "entity_name": record.get("entity_name", ""),
            "place_natural_key": place_key,
            "keyword": record.get("keyword", ""),
            "video_title": record.get("title", "") or record.get("content", ""),
            "video_author": record.get("author", ""),
            "video_author_id": record.get("author_id", ""),
            "video_publish_time": format_publish_time(published),
            "video_publish_timestamp": published,
            "video_like_count": record.get("likes", 0),
            "video_comment_count": record.get("comments", 0),
            "video_collect_count": record.get("collects", 0),
            "video_share_count": record.get("shares", 0),
        }

    return [_metric_row(record) for record in records if record.get("kind") == "note"]
|
||
|
||
|
||
def save_artifacts(
    out_dir: Path,
    records: list[dict],
    api_events: list[dict],
    raw_api: list[dict],
    dom_comments: list[dict],
    summary: dict,
    place_key: str,
) -> dict:
    """Write all probe artifacts into *out_dir* and return their paths.

    Produces the evidence CSVs (all records, videos only, comments only), the
    video metrics CSV, the API event CSV, the DOM comment CSV, a JSONL dump of
    the raw API entries and ``run_summary.json``.

    Side effect: *summary* is mutated -- a ``"files"`` mapping of label to
    path string is added before the summary is written.

    Returns:
        The same ``label -> path string`` mapping stored in ``summary["files"]``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    evidence_rows = evidence_csv_rows(records, place_key)
    note_rows: list[dict] = []
    comment_rows: list[dict] = []
    for row in evidence_rows:
        # Anything that is not a comment goes to the videos file.
        bucket = comment_rows if row.get("kind") == "comment" else note_rows
        bucket.append(row)

    file_names = (
        ("social_evidence_csv", "social_evidence.csv"),
        ("videos_csv", "videos.csv"),
        ("video_metrics_csv", "video_metrics.csv"),
        ("comments_csv", "comments.csv"),
        ("api_urls_csv", "api_urls.csv"),
        ("dom_comments_csv", "dom_comments.csv"),
        ("raw_api_jsonl", "raw_api.jsonl"),
        ("summary_json", "run_summary.json"),
    )
    files = {label: out_dir / file_name for label, file_name in file_names}

    api_event_columns = [
        "seq",
        "captured_at",
        "status",
        "kind",
        "body_type",
        "data_type",
        "comment_items",
        "aweme_items",
        "status_code",
        "status_msg",
        "keys",
        "url",
    ]
    csv_jobs = (
        ("social_evidence_csv", evidence_rows, EVIDENCE_COLUMNS),
        ("videos_csv", note_rows, EVIDENCE_COLUMNS),
        ("video_metrics_csv", video_metric_rows(records, place_key), VIDEO_METRIC_COLUMNS),
        ("comments_csv", comment_rows, EVIDENCE_COLUMNS),
        ("api_urls_csv", api_events, api_event_columns),
        # DOM snippets have no fixed schema; the header is derived from the rows.
        ("dom_comments_csv", dom_comments, None),
    )
    for label, rows, columns in csv_jobs:
        write_csv(files[label], rows, columns)

    with files["raw_api_jsonl"].open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(entry, ensure_ascii=False) + "\n" for entry in raw_api
        )

    summary["files"] = {label: str(location) for label, location in files.items()}
    with files["summary_json"].open("w", encoding="utf-8") as sink:
        json.dump(summary, sink, ensure_ascii=False, indent=2)
    return summary["files"]
|
||
|
||
|
||
def collect_dom_comments(page, video_url: str) -> list[dict]:
    """Harvest visible comment text snippets straight from the page DOM.

    Every selector in ``DOM_COMMENT_SELECTORS`` is queried; element text is
    whitespace-normalised, de-duplicated across selectors, and stored
    (truncated to 1000 characters) with the selector that produced it.
    Selector or element failures are skipped.
    """
    by_text: dict[str, dict] = {}
    for selector in DOM_COMMENT_SELECTORS:
        try:
            nodes = page.query_selector_all(selector)
        except Exception:
            continue
        for node in nodes:
            try:
                normalized = " ".join((node.inner_text() or "").split())
            except Exception:
                continue
            too_short = len(normalized) < 2
            if too_short or normalized in by_text:
                continue
            by_text[normalized] = {
                "source": "dom",
                "selector": selector,
                "video_url": video_url,
                "content": normalized[:1000],
            }
    return list(by_text.values())
|
||
|
||
|
||
def comment_count_from_raw(raw_api: list[dict]) -> int:
    """Count the comment items present in the captured API entries.

    Sums the lengths of every ``comments`` / ``comment_list`` /
    ``reply_comments`` list found in each dict body; no de-duplication.
    """
    list_keys = {"comments", "comment_list", "reply_comments"}
    total = 0
    for entry in raw_api:
        body = body_of(entry)
        if isinstance(body, dict):
            total += sum(len(chunk) for chunk in iter_lists_by_key(body, list_keys))
    return total
|
||
|
||
|
||
def click_or_goto_detail(page, link, href: str) -> None:
    """Open a video detail view, preferring a click over direct navigation.

    Three attempts, in order: click *link* while waiting (up to 15 s) for a
    navigation; click again and accept it if the page URL then contains
    ``/video/``; finally navigate straight to *href* when one is given.
    Errors in the first two attempts are swallowed.
    """

    def _click_with_navigation() -> bool:
        try:
            with page.expect_navigation(timeout=15000):
                link.click()
        except Exception:
            return False
        return True

    def _click_and_check_url() -> bool:
        try:
            link.click()
            page.wait_for_timeout(1800)
            return "/video/" in page.url
        except Exception:
            return False

    if _click_with_navigation() or _click_and_check_url():
        return
    if href:
        goto_detail_url(page, href)
|
||
|
||
|
||
def goto_detail_url(page, href: str) -> None:
    """Navigate *page* to *href*, waiting only for the navigation to commit.

    A navigation error is tolerated when the page nevertheless ended up on a
    ``/video/`` URL; otherwise it is re-raised.
    """
    try:
        page.goto(href, wait_until="commit", timeout=30000)
    except Exception:
        landed_on_video = "/video/" in page.url
        if not landed_on_video:
            raise
|
||
|
||
|
||
def comment_panel_present(page) -> bool:
    """Report whether a comment panel is currently visible in the viewport.

    The check runs in the page: an element matching any ``COMMENT_SCROLLERS``
    selector must be larger than 120x120 px, overlap the viewport by more than
    80 px in each direction, and not be hidden via display, visibility or
    near-zero opacity. Any evaluation error counts as "not present".
    """
    probe_js = """
            (selectors) => {
              const visibleInViewport = (el) => {
                if (!el) return false;
                const r = el.getBoundingClientRect();
                const s = getComputedStyle(el);
                const width = Math.min(r.right, innerWidth) - Math.max(r.left, 0);
                const height = Math.min(r.bottom, innerHeight) - Math.max(r.top, 0);
                return r.width > 120 && r.height > 120 && width > 80 && height > 80 &&
                  s.display !== 'none' && s.visibility !== 'hidden' &&
                  Number(s.opacity || '1') > 0.05;
              };
              for (const selector of selectors) {
                for (const el of document.querySelectorAll(selector)) {
                  if (visibleInViewport(el)) return true;
                }
              }
              return false;
            }
            """
    try:
        found = page.evaluate(probe_js, list(COMMENT_SCROLLERS))
    except Exception:
        return False
    return bool(found)
|
||
|
||
|
||
def wait_for_comment_panel(page, timeout_ms: int = 3000) -> bool:
    """Poll until the comment panel is visible or *timeout_ms* elapses.

    The panel is checked roughly every 180 ms, with one final check after the
    deadline. The deadline uses the monotonic clock so that system clock
    adjustments cannot stretch or cut short the wait.

    Returns:
        True as soon as the panel is detected, otherwise the result of the
        final check.
    """
    deadline = time.monotonic() + timeout_ms / 1000
    while time.monotonic() < deadline:
        if comment_panel_present(page):
            return True
        page.wait_for_timeout(180)
    return comment_panel_present(page)
|
||
|
||
|
||
def detail_candidates(page, raw_api: list[dict], name: str, keyword: str) -> list[dict]:
    """List the video detail pages that can be visited, DOM links first.

    Anchors whose href contains ``/video/`` come first (protocol-relative and
    site-relative hrefs are made absolute) and keep their element handle under
    ``link``. Videos parsed from the captured API entries follow with
    ``link=None``. Each URL appears once.

    Returns:
        Dicts with ``href``, ``link`` and ``source`` (``"dom"`` or ``"api"``).
    """
    found: list[dict] = []
    known: set[str] = set()

    def _remember(url: str, handle, origin: str) -> None:
        if not url or url in known:
            return
        known.add(url)
        found.append({"href": url, "link": handle, "source": origin})

    try:
        anchors = page.query_selector_all("a[href*='/video/']")
    except Exception:
        anchors = []
    for anchor in anchors:
        try:
            target = anchor.get_attribute("href") or ""
        except Exception:
            continue
        if target.startswith("//"):
            target = "https:" + target
        elif target.startswith("/"):
            target = "https://www.douyin.com" + target
        _remember(target, anchor, "dom")

    for note in parse_douyin_notes(raw_api, name, keyword):
        _remember(note.get("url") or "", None, "api")
    return found
|
||
|
||
|
||
def reveal_player_controls(page) -> bool:
    """Hover the mouse over the video player and pause briefly.

    The first player-like element with a bounding box is hovered at 72% of its
    width and 82% of its height. If none works, a point relative to the
    viewport is used instead (1440x900 assumed when the viewport size is
    unknown). Each hover is followed by a 700 ms wait.

    Returns:
        True once the mouse was moved, False if even the fallback failed.
    """

    def _hover(x, y) -> bool:
        page.mouse.move(x, y)
        page.wait_for_timeout(700)
        return True

    for selector in (".xgplayer", "[class*='xgplayer']", "video"):
        try:
            player = page.locator(selector).first
            if player.count() <= 0:
                continue
            rect = player.bounding_box(timeout=1200)
            if rect:
                return _hover(
                    rect["x"] + rect["width"] * 0.72,
                    rect["y"] + rect["height"] * 0.82,
                )
        except Exception:
            continue

    try:
        size = page.viewport_size or {"width": 1440, "height": 900}
        return _hover(size["width"] * 0.55, size["height"] * 0.78)
    except Exception:
        return False
|
||
|
||
|
||
def player_fullscreen_active(page) -> bool:
    """Report whether the page or the player is currently in fullscreen.

    True when the document has a (webkit) fullscreen element or an element
    carrying one of the xgplayer fullscreen state classes exists. Evaluation
    errors count as "not fullscreen".
    """
    check_js = """
            () => Boolean(
              document.fullscreenElement ||
              document.webkitFullscreenElement ||
              document.querySelector(
                '.xgplayer-is-fullscreen,.xgplayer-is-cssfullscreen,.xgplayer-fullscreen-active'
              )
            )
            """
    try:
        state = page.evaluate(check_js)
    except Exception:
        return False
    return bool(state)
|
||
|
||
|
||
def maybe_click_player_fullscreen(page) -> bool:
    """Try to switch the video player to fullscreen.

    Strategies, in order:
      1. Click the first element matching one of the known fullscreen-button
         selectors (after hovering the player so its controls show).
      2. Ask the page for the centre points of visible fullscreen controls and
         click the first one with the mouse.
      3. Press the ``f`` key and check whether fullscreen became active.

    Returns:
        True after the first click that did not raise (strategies 1 and 2 do
        not verify that fullscreen actually engaged), or when the keyboard
        shortcut is confirmed by ``player_fullscreen_active``; False otherwise.
    """
    reveal_player_controls(page)
    candidates = [
        ".xgplayer-icon:has(.xg-get-fullscreen)",
        ".xgplayer-icon .xg-get-fullscreen",
        ".xg-get-fullscreen",
        ".xgplayer-fullscreen",
        "[aria-label*='全屏']",
        "[title*='全屏']",
        "button:has-text('全屏')",
    ]
    # Strategy 1: locator-based click on a known fullscreen button.
    for selector in candidates:
        try:
            # Hover again before each attempt so the control bar is showing.
            reveal_player_controls(page)
            loc = page.locator(selector).first
            if loc.count() <= 0:
                continue
            # Scrolling and hovering are best effort; only the click decides.
            try:
                loc.scroll_into_view_if_needed(timeout=1800)
            except Exception:
                pass
            try:
                loc.hover(timeout=1200)
            except Exception:
                pass
            loc.click(timeout=1800)
            page.wait_for_timeout(1600)
            return True
        except Exception:
            continue
    # Strategy 2: compute click coordinates in the page and click with the mouse.
    try:
        points = page.evaluate(
            """
            () => {
              const visible = (el) => {
                const r = el.getBoundingClientRect();
                const s = getComputedStyle(el);
                return r.width > 4 && r.height > 4 && s.visibility !== 'hidden' &&
                  s.display !== 'none' && r.bottom > 0 && r.right > 0;
              };
              const nodes = [
                ...document.querySelectorAll(
                  '.xg-get-fullscreen,.xgplayer-fullscreen,[aria-label*="全屏"],[title*="全屏"]'
                )
              ];
              const points = [];
              for (const node of nodes) {
                const target = node.closest('button,[role="button"],[tabindex],.xgplayer-icon') || node;
                if (!visible(target)) continue;
                const r = target.getBoundingClientRect();
                points.push({
                  x: Math.round(r.left + r.width / 2),
                  y: Math.round(r.top + r.height / 2),
                });
              }
              return points;
            }
            """
        )
        for point in points or []:
            x = point.get("x")
            y = point.get("y")
            # Skip malformed points; only integer coordinates are clicked.
            if not isinstance(x, int) or not isinstance(y, int):
                continue
            reveal_player_controls(page)
            page.mouse.move(x, y)
            page.mouse.click(x, y)
            page.wait_for_timeout(1600)
            return True
    except Exception:
        pass
    # Strategy 3: keyboard shortcut, the only path that verifies the result.
    try:
        reveal_player_controls(page)
        page.keyboard.press("f")
        page.wait_for_timeout(1200)
        if player_fullscreen_active(page):
            return True
    except Exception:
        pass
    return False
|
||
|
||
|
||
def maybe_click_comments(page) -> bool:
    """Make sure the comment panel is open, clicking a comment control if needed.

    Steps:
      1. Return immediately if the panel becomes visible within 800 ms.
      2. Click the first element matching each known comment-button selector
         in turn, waiting up to 2.2 s for the panel after each click.
      3. Let the page collect candidate click points (elements whose text,
         aria-label or title mentions "评论", plus SVG icons with a specific
         viewBox and path fragments), then click them one by one with the
         mouse, waiting up to 2.4 s for the panel after each click.

    Returns:
        True once the panel is detected, False when every attempt failed.
    """
    if wait_for_comment_panel(page, 800):
        return True
    candidates = [
        "[data-e2e='feed-comment-icon']",
        "[data-e2e='feed-comment']",
        "[data-e2e='comment-icon']",
        "[data-e2e='video-comment']",
        "[aria-label*='评论']",
        "[title*='评论']",
        "button:has-text('评论')",
        "[role='button']:has-text('评论')",
        "text=评论",
    ]
    # Step 2: selector-based clicks; a failing selector just moves on to the next.
    for selector in candidates:
        try:
            loc = page.locator(selector).first
            if loc.count() > 0:
                # Scrolling into view is best effort.
                try:
                    loc.scroll_into_view_if_needed(timeout=1800)
                except Exception:
                    pass
                loc.click(timeout=1200)
                if wait_for_comment_panel(page, 2200):
                    return True
        except Exception:
            continue
    # Step 3: heuristic click points computed inside the page. The script
    # returns [{alreadyOpen: true}] when it sees an open panel itself; otherwise
    # points are sorted right-half-of-viewport first, then smaller area, then
    # higher on the page.
    try:
        points = page.evaluate(
            """
            () => {
              const hasPanel = () => Boolean(
                [...document.querySelectorAll("[data-e2e='comment-list'],.comment-mainContent")]
                  .some((el) => {
                    const r = el.getBoundingClientRect();
                    const s = getComputedStyle(el);
                    const width = Math.min(r.right, innerWidth) - Math.max(r.left, 0);
                    const height = Math.min(r.bottom, innerHeight) - Math.max(r.top, 0);
                    return r.width > 120 && r.height > 120 && width > 80 && height > 80 &&
                      s.display !== 'none' && s.visibility !== 'hidden';
                  })
              );
              if (hasPanel()) return [{ alreadyOpen: true }];
              const visible = (el) => {
                const r = el.getBoundingClientRect();
                const s = getComputedStyle(el);
                return r.width > 6 && r.height > 6 && s.visibility !== 'hidden' &&
                  s.display !== 'none' && r.bottom > 0 && r.right > 0 &&
                  r.top < innerHeight && r.left < innerWidth;
              };
              const points = [];
              const addPoint = (el, reason) => {
                let target = el.closest('button,[role="button"],[tabindex],a') || el;
                for (let i = 0; i < 5 && target.parentElement; i += 1) {
                  const current = target.getBoundingClientRect();
                  const parent = target.parentElement.getBoundingClientRect();
                  const clickableParent = target.parentElement.matches(
                    'button,[role="button"],[tabindex],a'
                  );
                  if (clickableParent || (parent.width <= 140 && parent.height <= 140 &&
                      parent.width >= current.width && parent.height >= current.height)) {
                    target = target.parentElement;
                  }
                }
                if (!visible(target)) target = el;
                if (!visible(target)) return;
                const r = target.getBoundingClientRect();
                points.push({
                  x: Math.round(r.left + r.width / 2),
                  y: Math.round(r.top + r.height / 2),
                  reason,
                  rightBias: r.left > innerWidth * 0.45 ? 1 : 0,
                  size: Math.round(r.width * r.height),
                });
              };
              const textNodes = [
                ...document.querySelectorAll('button,[role="button"],[tabindex],[aria-label],[title],span')
              ];
              for (const el of textNodes) {
                const text = [
                  el.innerText || '',
                  el.getAttribute('aria-label') || '',
                  el.getAttribute('title') || ''
                ].join(' ').trim();
                if (!/评论/.test(text) || /评论区|评论列表|暂无评论/.test(text)) continue;
                addPoint(el, 'text');
              }
              for (const svg of document.querySelectorAll('svg[viewBox="0 0 99 99"]')) {
                const d = [...svg.querySelectorAll('path')]
                  .map((p) => p.getAttribute('d') || '')
                  .join(' ');
                if (!d.includes('M-5.79,5.98') && !d.includes('C-3.56,3.75')) {
                  continue;
                }
                addPoint(svg, 'comment-svg-99');
              }
              return points.sort((a, b) =>
                (b.rightBias - a.rightBias) || (a.size - b.size) || (a.y - b.y)
              );
            }
            """
        )
        for point in points or []:
            if point.get("alreadyOpen"):
                return True
            x = point.get("x")
            y = point.get("y")
            # Skip malformed points; only integer coordinates are clicked.
            if not isinstance(x, int) or not isinstance(y, int):
                continue
            page.mouse.move(x, y)
            page.mouse.click(x, y)
            if wait_for_comment_panel(page, 2400):
                return True
    except Exception:
        pass
    return False
|
||
|
||
|
||
def move_mouse_into_box(page, box: dict) -> None:
    """Move the mouse to a point inside *box*, clamped to the viewport.

    The target is the horizontal centre of the box and its vertical centre,
    but at most 260 px below the top edge. Both coordinates are kept at least
    8 px inside the viewport (1440x900 assumed when the size is unknown).
    """
    size = page.viewport_size or {"width": 1440, "height": 900}
    margin = 8
    center_x = box["x"] + box["width"] / 2
    target_y = box["y"] + min(box["height"] / 2, 260)
    clamped_x = min(max(center_x, margin), size["width"] - margin)
    clamped_y = min(max(target_y, margin), size["height"] - margin)
    page.mouse.move(clamped_x, clamped_y)
|
||
|
||
|
||
def bring_comments_into_view(page) -> bool:
    """Scroll the comment area into view and park the mouse over it.

    Up to four rounds: each round tries every ``COMMENT_SCROLLERS`` selector;
    if none could be scrolled into view, the window is scrolled down by 68% of
    its height before the next round.

    Returns:
        True once a matching element was scrolled into view, False after all
        rounds failed.
    """
    for _attempt in range(4):
        for selector in COMMENT_SCROLLERS:
            try:
                locator = page.locator(selector).first
                if locator.count() <= 0:
                    continue
                locator.scroll_into_view_if_needed(timeout=5000)
                page.wait_for_timeout(1800)
                # Re-query as an element handle to read its bounding box.
                scroller = page.query_selector(selector)
                if scroller:
                    box = scroller.bounding_box()
                    if box:
                        move_mouse_into_box(page, box)
                return True
            except Exception:
                continue
        # Nothing matched this round: scroll the page itself and retry.
        try:
            page.evaluate(
                "window.scrollBy(0, Math.round(window.innerHeight * 0.68))"
            )
            page.wait_for_timeout(1700)
        except Exception:
            pass
    return False
|
||
|
||
|
||
def dom_comment_item_count(page) -> int:
    """Return how many ``[data-e2e='comment-item']`` elements are on the page.

    A query failure is reported as 0.
    """
    try:
        items = page.query_selector_all("[data-e2e='comment-item']")
        return len(items)
    except Exception:
        return 0
|
||
|
||
|
||
def comments_end_reached(page) -> bool:
    """Report whether the page shows the "no more comments" marker text.

    The check looks for the literal marker in ``document.body.innerText``.
    Evaluation errors count as "not reached".
    """
    marker_js = "() => document.body && document.body.innerText.includes('暂时没有更多评论')"
    try:
        reached = page.evaluate(marker_js)
    except Exception:
        return False
    return bool(reached)
|
||
|
||
|
||
def scroll_comment_panel(page) -> bool:
    """Scroll the comment panel down by roughly one and a third screens.

    The page is asked for the nearest scrollable ancestor (taller than 120 px)
    of a known comment container, falling back to the document's scrolling
    element. That element is then scrolled by mouse wheel and by script. If
    that fails, a plain wheel scroll plus the ``End`` key is used.

    Returns:
        True when one of the scroll paths ran without raising, else False.
    """
    try:
        handle = page.evaluate_handle(
            """
            () => {
              const selectors = [
                "[data-e2e='comment-list']",
                ".comment-mainContent",
                "[data-e2e='detail-comment']",
                ".comment-list",
                ".ESlRWJ2j"
              ];
              const scrollable = (el) => {
                if (!el) return false;
                const s = getComputedStyle(el);
                return el.scrollHeight > el.clientHeight + 20 ||
                  /(auto|scroll)/.test(s.overflowY || '');
              };
              for (const selector of selectors) {
                let el = document.querySelector(selector);
                while (el && el !== document.body) {
                  if (scrollable(el) && el.clientHeight > 120) return el;
                  el = el.parentElement;
                }
              }
              return document.scrollingElement || document.documentElement;
            }
            """
        )
        element = handle.as_element()
    except Exception:
        element = None
    try:
        if element:
            box = element.bounding_box()
            if box:
                # Wheel events go to whatever is under the pointer, so park the
                # mouse over the panel first.
                move_mouse_into_box(page, box)
                page.mouse.wheel(0, 2800)
            # Scripted scroll on the element itself: at least 900 px, or 1.35x
            # the element's client height.
            page.evaluate(
                """
                (e) => {
                  const step = Math.max(900, Math.floor((e.clientHeight || innerHeight) * 1.35));
                  if (typeof e.scrollBy === 'function') {
                    e.scrollBy(0, step);
                  } else {
                    e.scrollTop += step;
                  }
                }
                """,
                element,
            )
            return True
    except Exception:
        pass
    # Last resort: wheel at the current pointer position and jump to the end.
    try:
        page.mouse.wheel(0, 2800)
        page.keyboard.press("End")
        return True
    except Exception:
        return False
|
||
|
||
|
||
def launch_context(playwright, args):
    """Launch a persistent Chromium context on the configured profile.

    The profile directory is created if missing and stale singleton lock files
    are removed first. When ``args.browser_channel`` is set and launching with
    that channel fails, the failure is logged and the launch is retried
    without a channel. Without a channel, launch errors propagate.

    Args:
        playwright: The Playwright instance providing ``chromium``.
        args: Options object; reads ``profile_dir``, ``headless``, ``width``,
            ``height``, ``slow_ms`` and ``browser_channel``.

    Returns:
        The browser context returned by ``launch_persistent_context``.
    """
    profile_dir = Path(args.profile_dir).expanduser()
    profile_dir.mkdir(parents=True, exist_ok=True)
    unlock_profile(profile_dir)

    options = dict(
        user_data_dir=str(profile_dir),
        headless=args.headless,
        args=CHROME_ARGS,
        ignore_default_args=["--enable-automation"],
        user_agent=UA,
        locale="zh-CN",
        viewport={"width": args.width, "height": args.height},
        slow_mo=args.slow_ms,
    )
    launch = playwright.chromium.launch_persistent_context
    if not args.browser_channel:
        return launch(**options)
    try:
        return launch(**options, channel=args.browser_channel)
    except Exception as exc:
        log(f"指定 channel={args.browser_channel!r} 启动失败,回退到 Playwright Chromium: {exc}")
        return launch(**options)
|
||
|
||
|
||
def finalize_run(
    out_dir: Path,
    raw_api: list[dict],
    api_events: list[dict],
    dom_comments: list[dict],
    args,
    keyword: str,
    logged_in: bool,
) -> dict:
    """Parse the captured traffic, write all artifacts and log a summary.

    Videos and comments are parsed from *raw_api*, combined with run metadata
    into a summary dict, and saved via ``save_artifacts`` (which adds the
    ``files`` mapping to the summary). A diagnostic hint is logged when no
    comment API response was captured, or when responses were captured but no
    comment could be parsed.

    Returns:
        The summary dict that was written to ``run_summary.json``.
    """
    notes = parse_douyin_notes(raw_api, args.name, keyword)
    comments = parse_douyin_comments(raw_api, args.name, keyword)

    def _events_of(kind: str) -> int:
        return sum(1 for row in api_events if row.get("kind") == kind)

    distinct_kinds = {row.get("kind", "") for row in api_events}
    summary = {
        "ok": True,
        "logged_in": logged_in,
        "name": args.name,
        "place_natural_key": args.place_key,
        "keyword": keyword,
        "note_count": len(notes),
        "comment_count": len(comments),
        "dom_comment_snippet_count": len(dom_comments),
        "api_event_count": len(api_events),
        "raw_api_count": len(raw_api),
        "api_kinds": sorted(distinct_kinds),
        "comment_api_event_count": _events_of("comment"),
        "search_api_event_count": _events_of("search"),
    }
    files = save_artifacts(
        out_dir,
        notes + comments,
        api_events,
        raw_api,
        dom_comments,
        summary,
        args.place_key,
    )
    log(
        "完成: "
        f"视频 {len(notes)} 条, API评论 {len(comments)} 条, "
        f"DOM评论片段 {len(dom_comments)} 条, API事件 {len(api_events)} 条"
    )
    log(f"CSV: {files['social_evidence_csv']}")

    no_comment_api = summary["comment_api_event_count"] == 0
    if no_comment_api:
        log("诊断提示: 没有捕到 comment API,优先看页面是否进入详情和评论区是否展开。")
    elif not comments:
        log("诊断提示: 捕到 comment API 但解析为 0,优先看 raw_api.jsonl 的响应结构。")
    return summary
|
||
|
||
|
||
def _probe_api_kind(url: str) -> str:
    """Classify a captured API URL as ``comment``, ``search`` or ``aweme``."""
    # "/comment/" already matches "/comment/list", so one substring test suffices.
    if "/comment/" in url:
        return "comment"
    if "/search" in url:
        return "search"
    return "aweme"


def _scroll_comment_rounds(page, raw_api: list[dict], args) -> None:
    """Scroll the comment panel until it ends, stalls, or the round budget is spent."""
    stall = 0
    previous = comment_count_from_raw(raw_api)
    previous_dom = dom_comment_item_count(page)
    for round_idx in range(args.comment_scrolls):
        scroll_comment_panel(page)
        page.wait_for_timeout(int(random.uniform(1500, 2400)))
        current = comment_count_from_raw(raw_api)
        current_dom = dom_comment_item_count(page)
        log(
            f" 评论滚动 {round_idx + 1}/{args.comment_scrolls}: "
            f"API评论累计 {current}, DOM评论项 {current_dom}"
        )
        if comments_end_reached(page):
            log(" 页面提示暂时没有更多评论,停止滚动。")
            break
        if current > previous or current_dom > previous_dom:
            # Progress on either counter resets the stall budget.
            previous = current
            previous_dom = current_dom
            stall = 0
        else:
            stall += 1
            if stall >= args.stall_rounds:
                break


def _collect_video_detail(
    page,
    candidate: dict,
    raw_api: list[dict],
    dom_comments: list[dict],
    args,
) -> None:
    """Open one candidate video, expose its comments, scroll them and harvest DOM snippets."""
    href = candidate["href"]
    link = candidate.get("link")
    if link:
        click_or_goto_detail(page, link, href)
    else:
        goto_detail_url(page, href)
    try:
        page.wait_for_load_state("domcontentloaded", timeout=30000)
    except Exception:
        # A load-state failure is tolerated as long as we did land on a video URL.
        if "/video/" not in page.url:
            raise
    page.wait_for_timeout(args.detail_wait_ms)

    if not args.skip_fullscreen:
        if maybe_click_player_fullscreen(page):
            log(" 已尝试点击播放器全屏按钮")
            page.wait_for_timeout(args.fullscreen_wait_ms)
        else:
            log(" 未找到播放器全屏按钮,继续尝试普通详情页评论")

    if maybe_click_comments(page):
        log(" 评论面板已确认打开")
    else:
        log(" 未确认评论面板打开,继续尝试滚入评论区")
    if bring_comments_into_view(page):
        log(" 评论区已滚入视口,等待首屏评论 API")
    page.wait_for_timeout(args.comment_wait_ms)

    _scroll_comment_rounds(page, raw_api, args)
    dom_comments.extend(collect_dom_comments(page, href or page.url))


def run_probe(args) -> dict:
    """Drive one browser probe session and return the summary from ``finalize_run``.

    The flow is: launch a persistent browser context, record matching API
    responses through a ``response`` listener, optionally run the search page
    (skipped when ``args.video_url`` is given), deep-collect up to
    ``args.max_notes`` videos, then hand everything to ``finalize_run``.

    Args:
        args: Parsed CLI namespace as produced by ``build_parser``.

    Returns:
        The summary dict produced by ``finalize_run``.

    Raises:
        KeyboardInterrupt: Propagated immediately when the user interrupts the
            collection phase (it is only absorbed inside the ``--leave-open``
            wait loop).
        Exception: Anything raised outside the per-video ``try`` (launch,
            search navigation, artifact saving) propagates to the caller.
    """
    from playwright.sync_api import sync_playwright

    keyword = args.keyword or f"贵阳 {args.name}".strip()
    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_dir = Path(args.out_dir).expanduser() / run_id

    raw_api: list[dict] = []
    api_events: list[dict] = []
    dom_comments: list[dict] = []
    captured_urls: set[str] = set()
    event_seq = 0

    log(f"关键词: {keyword}")
    log(f"输出目录: {out_dir}")
    log("即将打开可视化 Chrome 窗口,后续搜索/点视频/滚评论都会在窗口里可见。")
    forced_candidates = []
    if args.video_url:
        forced_candidates = [
            {"href": args.video_url, "link": None, "source": "arg"}
        ]

    with sync_playwright() as p:
        ctx = launch_context(p, args)
        try:
            ctx.add_init_script(STEALTH_JS)

            def on_response(resp):
                """Record one API response as an event and, when matched, as raw payload."""
                nonlocal event_seq
                url = resp.url
                if not any(pattern in url for pattern in DIAG_PATTERNS):
                    return
                body = None
                matched = any(pattern in url for pattern in LISTEN_PATTERNS)
                try:
                    body = resp.json()
                    shape = response_shape(body)
                except Exception as exc:
                    # Non-JSON bodies are still recorded as events for diagnosis.
                    shape = {"body_type": "non_json", "keys": str(exc)[:80]}
                kind = _probe_api_kind(url)
                event_seq += 1
                event = {
                    "seq": event_seq,
                    "captured_at": datetime.now().isoformat(timespec="seconds"),
                    "status": resp.status,
                    "kind": kind,
                    "url": url,
                    **shape,
                }
                api_events.append(event)
                if matched and isinstance(body, dict):
                    raw_api.append(
                        {
                            "__url": url,
                            "__status": resp.status,
                            "__captured_at": event["captured_at"],
                            "__body": body,
                        }
                    )
                # Log each distinct URL only once to keep the console readable.
                if url not in captured_urls:
                    captured_urls.add(url)
                    cc = event.get("comment_items") or 0
                    ac = event.get("aweme_items") or 0
                    log(f"API {kind} status={resp.status} comments={cc} aweme={ac} {url[:120]}")

            ctx.on("response", on_response)
            page = ctx.pages[0] if ctx.pages else ctx.new_page()
            page.set_default_timeout(15000)
            logged_in = True

            if forced_candidates:
                search_url = args.video_url
                log(f"跳过搜索,直接诊断视频: {args.video_url}")
            else:
                search_url = SEARCH_URL.format(kw=quote(keyword))
                page.goto(search_url, wait_until="domcontentloaded", timeout=60000)
                page.wait_for_timeout(args.search_wait_ms)

                html = page.content()
                cards = page.query_selector_all("a[href*='/video/']")
                # Treated as a login/verification wall only when no video link
                # rendered AND a login/verify marker is present.
                logged_in = not (
                    (not cards)
                    and ("扫码登录" in html or "手机号登录" in html
                         or "验证" in html or "/passport" in page.url)
                )
                if not logged_in:
                    log("看起来遇到登录墙/验证页;脚本仍会保存诊断文件。")

                for i in range(args.search_scrolls):
                    page.mouse.wheel(0, 2600)
                    page.wait_for_timeout(int(random.uniform(1000, 1800)))
                    cards = page.query_selector_all("a[href*='/video/']")
                    log(
                        f"搜索页滚动 {i + 1}/{args.search_scrolls}: "
                        f"当前 video 链接 {len(cards)}"
                    )
                    if len(cards) >= args.max_search_links:
                        break

                # Monotonic clock: the deadline must not move if the wall clock is adjusted.
                deadline = time.monotonic() + args.search_api_wait_ms / 1000
                while time.monotonic() < deadline:
                    candidates = detail_candidates(page, raw_api, args.name, keyword)
                    if candidates:
                        log(
                            f"搜索候选已就绪: {len(candidates)} "
                            f"(DOM/API 混合,等待后进入深采)"
                        )
                        break
                    page.wait_for_timeout(1000)

                # Remember the final search URL so each video can return to it.
                search_url = page.url

            for idx in range(args.max_notes):
                candidates = forced_candidates or detail_candidates(
                    page, raw_api, args.name, keyword
                )
                if idx >= len(candidates):
                    log(
                        f"视频候选不足,停止深采: idx={idx}, "
                        f"candidates={len(candidates)}"
                    )
                    break
                candidate = candidates[idx]
                href = candidate["href"]
                before_comments = comment_count_from_raw(raw_api)
                log(
                    f"打开第 {idx + 1}/{args.max_notes} 个视频 "
                    f"({candidate['source']}): {href}"
                )
                try:
                    _collect_video_detail(page, candidate, raw_api, dom_comments, args)
                    after_comments = comment_count_from_raw(raw_api)
                    log(
                        f"第 {idx + 1} 个视频结束: 新增 API 评论 "
                        f"{after_comments - before_comments}"
                    )
                except Exception as exc:
                    log(f"第 {idx + 1} 个视频深采异常: {exc}")

                # Return to the search page outside any ``finally`` so that a
                # KeyboardInterrupt/SystemExit raised above is never discarded
                # by a ``continue`` and is not delayed by an extra navigation.
                if not forced_candidates:
                    try:
                        page.goto(search_url, wait_until="domcontentloaded", timeout=45000)
                        page.wait_for_timeout(int(random.uniform(1500, 2300)))
                    except Exception as exc:
                        # Best effort: the next iteration re-evaluates candidates anyway.
                        log(f"返回搜索页失败: {exc}")

            summary = finalize_run(
                out_dir, raw_api, api_events, dom_comments, args, keyword, logged_in
            )
            if args.keep_open_seconds > 0 and not args.headless and not args.leave_open:
                log(f"保留浏览器 {args.keep_open_seconds}s,方便最后查看页面状态。")
                page.wait_for_timeout(args.keep_open_seconds * 1000)
            if args.leave_open and not args.headless:
                log("已保存 CSV/JSONL;Chrome 将保持打开。终端按 Ctrl+C 才会关闭浏览器。")
                try:
                    while True:
                        page.wait_for_timeout(60000)
                except KeyboardInterrupt:
                    log("收到 Ctrl+C,准备关闭浏览器。")
        finally:
            # Always close the persistent context, including on error paths.
            # A failing close is logged rather than raised so it cannot mask
            # the exception that is already propagating.
            try:
                ctx.close()
            except Exception as exc:
                log(f"关闭浏览器失败: {exc}")

    return summary
|
||
|
||
|
||
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the probe script.

    Returns:
        An ``argparse.ArgumentParser`` with every probe option registered in
        the order it appears in ``--help``.
    """
    cli = argparse.ArgumentParser(
        description="Run a visible Douyin collection probe and save CSV diagnostics."
    )
    add = cli.add_argument

    # Target entity, search input and output locations.
    add("--name", default="老凯里酸汤鱼", help="实体名称")
    add("--keyword", default="", help="搜索关键词,默认“贵阳 {name}”")
    add("--video-url", default="", help="跳过搜索,直接诊断单个视频 URL")
    add(
        "--place-key",
        default="place-lao-kaili-sourfish",
        help="写入 CSV 的 place_natural_key/eid",
    )
    add("--out-dir", default=str(DEFAULT_OUT_DIR), help="输出根目录")
    add(
        "--profile-dir",
        default=str(DEFAULT_PROFILE_DIR),
        help="抖音持久化登录 profile 目录",
    )

    # Browser launch options.
    add(
        "--browser-channel",
        default="chrome",
        help="Playwright channel,默认 chrome;失败会回退 chromium",
    )
    add("--headless", action="store_true", help="改为无头运行")
    for flag, default in (("--width", 1440), ("--height", 900)):
        add(flag, type=int, default=default)
    add("--slow-ms", type=int, default=220, help="可视化慢动作毫秒")

    # Integer waits and limits without help text, kept in their original order.
    plain_int_options = (
        ("--search-wait-ms", 4500),
        ("--search-api-wait-ms", 12000),
        ("--detail-wait-ms", 3800),
        ("--fullscreen-wait-ms", 1800),
        ("--comment-wait-ms", 3500),
        ("--search-scrolls", 6),
        ("--max-search-links", 40),
        ("--max-notes", 3),
        ("--comment-scrolls", 30),
        ("--stall-rounds", 4),
        ("--keep-open-seconds", 12),
    )
    for flag, default in plain_int_options:
        add(flag, type=int, default=default)

    # Behaviour switches.
    add(
        "--skip-fullscreen",
        action="store_true",
        help="不点击播放器全屏按钮,直接在普通详情页尝试评论区",
    )
    add(
        "--leave-open",
        action="store_true",
        help="采集结束后不关闭 Chrome,保持 Python 进程等待 Ctrl+C",
    )
    return cli
|
||
|
||
|
||
def main() -> int:
    """Parse CLI arguments, run the probe and print its summary as JSON.

    Returns:
        ``0`` on success, ``130`` when interrupted with Ctrl+C, ``1`` on any
        other exception.
    """
    cli_args = build_parser().parse_args()
    try:
        result = run_probe(cli_args)
    except KeyboardInterrupt:
        log("用户中断。")
        return 130
    except Exception as exc:
        # Top-level boundary: report the failure and signal it via exit code.
        log(f"运行失败: {exc}")
        return 1
    else:
        print(json.dumps(result, ensure_ascii=False, indent=2))
        return 0
|
||
|
||
|
||
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())
|