# bxh/scripts/baike_fullpage_markdown.py

"""Capture Baidu Baike rendered pages as auditable full-page Markdown.

This stricter crawler is for schema-building evidence. It does not accept a
short text fallback as success: the saved Markdown always includes a completeness
report comparing the page catalog with extracted body headings.
"""
from __future__ import annotations

import argparse
import json
import random
import re
import sys
import time
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import quote, urljoin

from bs4 import BeautifulSoup


# Directory one level above the directory that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Default output directory for single-page captures (the --out-dir default).
DEFAULT_OUT_DIR = ROOT / "schema搭建" / "baidu_fullpage_md"
# Default persistent browser profile directory (the --profile-dir default).
DEFAULT_PROFILE_DIR = ROOT / "data" / "browser_profiles" / "baike_fullpage"
# Output directory used by batch mode when --out-dir is left at its default.
DEFAULT_GUIZHOU_BATCH_DIR = ROOT / "schema搭建" / "city_poi_schema_v0_1" / "baidu_fullpage_guizhou_scenic"

# Substrings that disqualify a piece of text: is_bad_line() rejects any text
# that contains one of these tokens anywhere.
BAD_LINE_TOKENS = [
    "百度首页",
    "登录",
    "注册",
    "打开APP",
    "百度百科合作平台",
    "使用百度前必读",
    "百科协议",
    "隐私政策",
    "京ICP",
    "营业执照",
    "投诉建议",
    "词条统计",
    "分享你的世界",
    "查看更多",
    "上传视频",
    "免责声明",
]

# Link texts that are skipped (exact match) when extract_catalog() collects
# catalog entries; also the base set for BASIC_INFO_NOISE_KEYS.
CATALOG_NOISE = {
    "首页",
    "帮助",
    "秒懂百科",
    "特色百科",
    "知识专题",
    "加入百科",
    "百科团队",
    "权威合作",
    "播报",
    "编辑",
    "讨论",
    "收藏",
    "赞",
}

# Keys that is_basic_info_pair() refuses as basic-info field names:
# everything in CATALOG_NOISE plus the extra labels listed here.
BASIC_INFO_NOISE_KEYS = CATALOG_NOISE | {
    "网页",
    "新闻",
    "贴吧",
    "知道",
    "网盘",
    "图片",
    "视频",
    "地图",
    "文库",
    "资讯",
    "采购",
    "国际版",
}

# Substrings that anti_crawl_detected() searches for (case-sensitively) in the
# browser title plus the first 3000 characters of the page text.
ANTI_CRAWL_TOKENS = [
    "百度安全验证",
    "验证码",
    "系统检测到异常",
    "captcha",
    "anticrawl",
]


@dataclass
class PageQuality:
    """Completeness summary for one captured page, as filled in by ``build_markdown``."""

    # "OK" when no problem was recorded, otherwise "INCOMPLETE".
    status: str
    # "ok", or the recorded problems joined with "; ".
    reason: str
    # Title taken from the page's <h1>, else the browser title, else the source name.
    page_title: str
    # URL of the page after navigation finished.
    final_url: str
    # Character count of the final Markdown document.
    markdown_chars: int
    # Number of catalog entries found on the page.
    catalog_count: int
    # Number of headings extracted from the page body.
    body_heading_count: int
    # Number of catalog entries that matched a body heading.
    matched_catalog_count: int
    # matched / catalog, rounded to 4 decimals; 0.0 when the page has no catalog.
    catalog_coverage: float
    # Catalog entries with no matching body heading.
    missing_catalog_headings: list[str]
    # True when the page looks like a verification / anti-crawl page.
    anti_crawl: bool
    # Number of media items kept after noise filtering.
    media_count: int


@dataclass
class MediaItem:
    """One image or video reference collected by ``extract_media_items``."""

    # "video" for <video> tags, "image" otherwise.
    kind: str
    # Text of the most recent heading seen before the media tag ("" if none).
    section: str
    # Caption text found near the tag; falls back to ``alt`` when none is found.
    caption: str
    # The tag's alt attribute, or its title attribute when alt is missing.
    alt: str
    # Media URL resolved against the page's final URL ("" if none was found).
    src: str
    # href of the enclosing <a>, resolved against the final URL ("" if none).
    href: str
    # id attribute of the enclosing <a> or of the tag itself ("" if neither).
    asset_id: str
    # Raw width / height attribute values as strings ("" when absent).
    width: str
    height: str


def compact(text: str) -> str:
    """Collapse each whitespace run into a single space and trim both ends.

    Falsy input (``None`` or ``""``) yields an empty string.
    """
    source = text if text else ""
    collapsed = re.sub(r"\s+", " ", source)
    return collapsed.strip()


def now_iso() -> str:
    """Return the current local time as an ISO-8601 string with a UTC offset.

    The value is truncated to whole seconds.
    """
    utc_now = datetime.now(timezone.utc)
    local_now = utc_now.astimezone()
    return local_now.isoformat(timespec="seconds")


def baike_url(name: str) -> str:
    """Build the Baidu Baike item URL for *name*, percent-encoding the name."""
    encoded_name = quote(name)
    return "https://baike.baidu.com/item/" + encoded_name


# Built-in sample list crawled by --batch-guizhou, in output order.
# Every entry needs "name"; an optional "url" overrides the URL that would
# otherwise be derived from the name via baike_url().
GUIZHOU_SCENIC_SPOTS = [
    {"name": "黄果树瀑布"},
    {"name": "小七孔风景区"},
    {"name": "梵净山"},
    {"name": "西江千户苗寨"},
    {"name": "青岩古镇"},
    {"name": "镇远古城"},
    {"name": "肇兴侗寨"},
    {"name": "万峰林"},
    {"name": "马岭河峡谷"},
    {"name": "织金洞"},
    {"name": "百里杜鹃风景名胜区"},
    {"name": "赤水丹霞"},
    {"name": "龙宫风景区"},
    {"name": "遵义会议会址"},
    {"name": "甲秀楼"},
    {"name": "黔灵山公园"},
    {
        "name": "花溪公园",
        "url": "https://baike.baidu.com/item/%E8%8A%B1%E6%BA%AA%E5%85%AC%E5%9B%AD/112398?fromModule=lemma_search-box",
    },
    {"name": "天河潭"},
    {"name": "南江大峡谷"},
    {"name": "乌蒙大草原"},
]


def slugify(text: str) -> str:
    """Turn *text* into a filename-safe slug.

    Each run of whitespace or of the characters ``\\ / : * ? " < > |`` is
    replaced by one underscore, and leading/trailing underscores are removed.
    Returns ``"baike_page"`` when nothing is left.
    """
    # The previous pattern wrote ``\\s`` inside a raw-string character class,
    # which means "a backslash or the letter s": it replaced every "s" and
    # never matched whitespace. A single ``\s`` is the whitespace class.
    return re.sub(r'[\\/:*?"<>|\s]+', "_", text).strip("_") or "baike_page"


def clean_inline(text: str) -> str:
    """Normalize a text fragment for inline use.

    Whitespace is compacted, bracketed markers such as ``[12]`` or ``[编辑]``
    are removed, and a trailing "播报 编辑" control label is stripped.
    """
    cleaned = compact(text)
    removals = (
        r"\[(?:\d+|编辑)\]",
        r"\s*播报\s*编辑\s*$",
    )
    for pattern in removals:
        cleaned = re.sub(pattern, "", cleaned)
    return compact(cleaned.replace("\xa0", " "))


def clean_heading(text: str) -> str:
    """Clean a heading: inline cleanup, then trim ``#`` and spaces from both ends."""
    inline = clean_inline(text)
    return inline.strip(" #")


def clean_catalog_heading(text: str) -> str:
    """Clean a catalog entry and remove its leading one- or two-digit ordinal.

    The ordinal is removed only when a CJK character or ASCII letter follows
    it, and never when "级" follows: "1历史沿革" loses its "1" while a real
    name such as "68级跌水瀑布" is kept intact.
    """
    heading = clean_heading(text)
    return re.sub(r"^\d{1,2}\s*(?!级)(?=[\u4e00-\u9fa5A-Za-z])", "", heading)


def norm_heading(text: str) -> str:
    """Reduce a heading to a comparison key.

    The heading is cleaned and lower-cased, a leading Chinese-numeral ordinal
    and a leading one- or two-digit ordinal are removed, and finally all
    whitespace and common punctuation are deleted.
    """
    key = clean_heading(text).lower()
    leading_ordinals = (
        r"^[一二三四五六七八九十百千万]+[、.．\s]*",
        r"^\d{1,2}\s*(?!级)(?=[\u4e00-\u9fa5A-Za-z])",
    )
    for pattern in leading_ordinals:
        key = re.sub(pattern, "", key)
    return re.sub(r"[\s#：:、，,。.;；（）()\[\]【】·\-_/]+", "", key)


def is_bad_line(text: str) -> bool:
    """Tell whether *text* should be discarded.

    A line is bad when, after inline cleanup, it has at most one character,
    contains any ``BAD_LINE_TOKENS`` entry, or is longer than 700 characters
    and starts with ``{`` or ``[``.
    """
    line = clean_inline(text)
    if len(line) <= 1:
        return True
    for token in BAD_LINE_TOKENS:
        if token in line:
            return True
    return len(line) > 700 and line.startswith(("{", "["))


def dedupe_keep_order(items: list[str]) -> list[str]:
    """Drop duplicates from *items*, keeping the first occurrence of each.

    Two items are duplicates when their normalized heading keys are equal;
    an item whose key is empty is compared by its raw value instead. Items
    that are themselves empty are dropped.
    """
    kept: list[str] = []
    known_keys = set()
    for entry in items:
        marker = norm_heading(entry) or entry
        if not marker or marker in known_keys:
            continue
        known_keys.add(marker)
        kept.append(entry)
    return kept


def anti_crawl_detected(html: str, final_url: str, title: str) -> bool:
    """Tell whether the fetched page looks like a verification page.

    True when the final URL contains "anticrawl" or "captcha", or when the
    title plus the first 3000 characters of page text contain any entry of
    ``ANTI_CRAWL_TOKENS``.
    """
    if any(marker in final_url for marker in ("anticrawl", "captcha")):
        return True
    page_text = BeautifulSoup(html, "html.parser").get_text(" ", strip=True)
    haystack = compact(" ".join([title, page_text[:3000]]))
    for token in ANTI_CRAWL_TOKENS:
        if token in haystack:
            return True
    return False


def chrome_runtime_options() -> tuple[str, list[str], str]:
    """Return ``(user_agent, chrome_args, stealth_js)`` for the browser launch.

    The values come from ``app.agents.web_agent`` when it can be imported
    with ``ROOT`` on ``sys.path``; built-in fallbacks are used otherwise.
    The argument list is returned as a fresh copy.
    """
    sys.path.insert(0, str(ROOT))
    try:
        from app.agents.web_agent import _CHROME_ARGS as chrome_args
        from app.agents.web_agent import _STEALTH_JS as stealth_js
        from app.agents.web_agent import _UA as user_agent
    except Exception:
        # Any import problem falls back to the defaults below.
        user_agent = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        )
        chrome_args = [
            "--disable-blink-features=AutomationControlled",
            "--no-first-run",
            "--no-default-browser-check",
            "--disable-sync",
            "--disable-default-apps",
            "--no-sandbox",
            "--disable-dev-shm-usage",
        ]
        stealth_js = "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
    return user_agent, list(chrome_args), stealth_js


def scroll_to_render(page, max_rounds: int = 16) -> None:
    """Scroll down the page in randomized steps, then jump back to the top.

    Each round reads the document scroll height; the loop ends after
    *max_rounds* rounds or once the height has been unchanged for three
    consecutive reads. Every non-final round wheels down by a random distance
    and pauses for a random interval.
    """
    previous_height = 0
    unchanged_streak = 0
    for _round in range(max_rounds):
        current_height = page.evaluate("() => document.documentElement.scrollHeight")
        unchanged_streak = unchanged_streak + 1 if current_height == previous_height else 0
        if unchanged_streak >= 3:
            break
        previous_height = current_height
        wheel_delta = random.randint(1100, 1900)
        page.mouse.wheel(0, wheel_delta)
        pause_ms = random.randint(220, 520)
        page.wait_for_timeout(pause_ms)
    # Return to the top and give the page a moment to settle.
    page.evaluate("() => window.scrollTo(0, 0)")
    page.wait_for_timeout(300)


def fetch_rendered_page(
    url: str,
    *,
    profile_dir: Path,
    headful: bool,
    manual_seconds: int,
    timeout_ms: int,
) -> tuple[str, str, str]:
    """Load *url* in a persistent Chromium context and return the rendered page.

    Args:
        url: Page to open.
        profile_dir: Persistent browser profile directory; created if missing.
        headful: Launch a visible browser instead of a headless one.
        manual_seconds: When positive, poll for up to this many seconds while
            the page still looks like a verification page, so it can be
            solved by hand.
        timeout_ms: Navigation timeout passed to ``page.goto``.

    Returns:
        ``(html, final_url, title)`` read after scrolling the page.
    """
    from playwright.sync_api import sync_playwright

    ua, chrome_args, stealth_js = chrome_runtime_options()
    profile_dir.mkdir(parents=True, exist_ok=True)
    with sync_playwright() as p:
        ctx = p.chromium.launch_persistent_context(
            str(profile_dir),
            headless=not headful,
            args=chrome_args,
            ignore_default_args=["--enable-automation"],
            user_agent=ua,
            locale="zh-CN",
            viewport=random.choice(
                [
                    {"width": 1440, "height": 1100},
                    {"width": 1366, "height": 900},
                    {"width": 1600, "height": 1100},
                ]
            ),
            extra_http_headers={
                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
                "Referer": "https://www.baidu.com/",
            },
        )
        # Close the context even when navigation or extraction raises (for
        # example on a goto timeout), so it is always shut down explicitly.
        try:
            ctx.add_init_script(stealth_js)
            page = ctx.pages[0] if ctx.pages else ctx.new_page()
            page.goto(url, timeout=timeout_ms, wait_until="domcontentloaded")
            page.wait_for_timeout(random.randint(1000, 1800))

            if manual_seconds > 0:
                # Poll until the verification page is gone or the deadline passes.
                deadline = time.time() + manual_seconds
                while time.time() < deadline:
                    html = page.content()
                    if not anti_crawl_detected(html, page.url, page.title() or ""):
                        break
                    remaining = int(deadline - time.time())
                    print(f"[manual] Baidu verification page detected, waiting {remaining}s...", flush=True)
                    page.wait_for_timeout(2500)

            scroll_to_render(page)
            final_url = page.url
            title = page.title() or ""
            html = page.content()
        finally:
            ctx.close()
    return html, final_url, title


def table_to_markdown(table) -> list[str]:
    """Render an HTML table element as Markdown table lines.

    Cell text is cleaned and ``|`` is replaced by ``/``. Empty cells and
    rows with no non-empty cell are dropped, each row keeps at most its
    first eight cells, and shorter rows are padded to the widest row. The
    first row becomes the header. Returns ``[]`` when no row survives.
    """
    grid: list[list[str]] = []
    for tr in table.find_all("tr"):
        texts = (
            clean_inline(cell.get_text(" ", strip=True)).replace("|", "/")
            for cell in tr.find_all(["th", "td"])
        )
        filled = [value for value in texts if value]
        if filled:
            grid.append(filled[:8])
    if not grid:
        return []

    width = max(map(len, grid))

    def render(cells: list[str]) -> str:
        padded = cells + [""] * (width - len(cells))
        return "| " + " | ".join(padded) + " |"

    header, *body = grid
    return [render(header), render(["---"] * width), *map(render, body)]


def find_content_root(soup: BeautifulSoup):
    """Pick the element that most likely holds the article content.

    Candidate selectors are tried in order; the first match whose cleaned
    text is longer than 600 characters wins. Falls back to ``<body>``, or to
    the soup itself when there is no body.
    """
    selectors = (
        "div[class*='mainContent']",
        "div[id='J-lemma-main-wrapper']",
        "div[class*='lemmaWrapper']",
        "div.J-lemma-content",
        "div[class*='lemma-content']",
        "main",
        "article",
    )
    for css in selectors:
        node = soup.select_one(css)
        if not node:
            continue
        text_length = len(clean_inline(node.get_text(" ", strip=True)))
        if text_length > 600:
            return node
    return soup.body or soup


def extract_catalog(soup: BeautifulSoup) -> list[str]:
    """Collect the page's catalog (table-of-contents) entries.

    Link texts inside catalog-like containers are used first. When those
    yield nothing, in-page anchor links (``href`` starting with ``#``) under
    the content root are used instead. Entries are cleaned, filtered for
    noise, limited to 40 characters and de-duplicated in order.
    """

    def collect(links) -> list[str]:
        labels: list[str] = []
        for link in links:
            label = clean_catalog_heading(link.get_text(" ", strip=True))
            if not label or label in CATALOG_NOISE:
                continue
            if len(label) > 40 or is_bad_line(label):
                continue
            labels.append(label)
        return labels

    containers = soup.select(
        "[class*='catalogWrapper'], [class*='CatalogWrapper'], "
        "[class*='catalog_'], [class*='Catalog_'], [class*='catalogList'], [class*='CatalogList']"
    )
    candidates: list[str] = []
    for container in containers:
        candidates.extend(collect(container.find_all("a")))

    if not candidates:
        candidates = collect(find_content_root(soup).select("a[href^='#']"))

    return dedupe_keep_order(candidates)


def is_basic_info_pair(key: str, val: str) -> bool:
    """Tell whether ``(key, val)`` is a plausible basic-info field.

    Both parts must be non-empty after cleanup, the key must not be a known
    noise key (with or without its internal whitespace), the combined text
    must not be a bad line, and the key and value may be at most 24 and 300
    characters long.
    """
    label = clean_inline(key).rstrip(":：")
    value = clean_inline(val)
    if not label or not value:
        return False
    squashed = re.sub(r"\s+", "", label)
    if squashed in BASIC_INFO_NOISE_KEYS or label in BASIC_INFO_NOISE_KEYS:
        return False
    if is_bad_line(label + value):
        return False
    return len(label) <= 24 and len(value) <= 300


def extract_basic_info(soup: BeautifulSoup) -> list[tuple[str, str]]:
    """Extract the basic-info key/value pairs of the page.

    Three sources are consulted: paired ``.basicInfo-item`` name/value
    elements, ``<dt>``/``<dd>`` pairs located inside a basic-info container,
    and - only when both produced nothing - adjacent text lines of the
    basic-info containers. Keys are unique ignoring whitespace; when a key
    repeats, the longer value is kept in the original position.
    """
    collected: list[tuple[str, str]] = []

    def squash(label: str) -> str:
        return re.sub(r"\s+", "", label)

    def add_pair(key: str, val: str) -> None:
        key = clean_inline(key).rstrip(":：")
        val = clean_inline(val)
        if not is_basic_info_pair(key, val):
            return
        wanted = squash(key)
        for pos, (known_key, known_val) in enumerate(collected):
            if squash(known_key) != wanted:
                continue
            # Same key seen before: keep whichever value is longer.
            if len(val) > len(known_val):
                collected[pos] = (key, val)
            return
        collected.append((key, val))

    name_tags = soup.select(".basicInfo-item.name")
    value_tags = soup.select(".basicInfo-item.value")
    for name_tag, value_tag in zip(name_tags, value_tags, strict=False):
        add_pair(
            clean_inline(name_tag.get_text(" ", strip=True)).rstrip(":："),
            clean_inline(value_tag.get_text(" ", strip=True)),
        )

    basic_info_class = re.compile(r"basicInfo|BasicInfo|J-basic-info")
    for dt in soup.find_all("dt"):
        if not dt.find_parent(class_=basic_info_class):
            continue
        dd = dt.find_next_sibling("dd")
        add_pair(
            clean_inline(dt.get_text(" ", strip=True)).rstrip(":："),
            clean_inline(dd.get_text(" ", strip=True)) if dd else "",
        )

    if collected:
        return collected

    # Fallback: read the container text line by line and try neighbouring
    # lines as key/value; advance by two after a newly added pair, else by one.
    for container in soup.select("[class*='basicInfo'], [class*='BasicInfo'], .J-basic-info"):
        tokens: list[str] = []
        for raw in container.get_text("\n", strip=True).splitlines():
            token = clean_inline(raw)
            if token:
                tokens.append(token)
        cursor = 0
        while cursor < len(tokens) - 1:
            size_before = len(collected)
            add_pair(tokens[cursor].rstrip(":："), tokens[cursor + 1])
            cursor += 2 if len(collected) > size_before else 1

    return collected


def extract_summary(soup: BeautifulSoup) -> list[str]:
    """Collect the summary paragraphs of the page.

    Every ``<p>``/``<div>`` under a summary container contributes its cleaned
    text when it has at least 20 characters and is not a bad line. Texts
    with the same normalized key are kept only once.
    """
    summary: list[str] = []
    known = set()
    containers = soup.select(".J-summary, [class*='lemmaSummary'], [class*='summary_']")
    for container in containers:
        for node in container.find_all(["p", "div"], recursive=True):
            sentence = clean_inline(node.get_text(" ", strip=True))
            if len(sentence) < 20 or is_bad_line(sentence):
                continue
            fingerprint = norm_heading(sentence)
            if fingerprint in known:
                continue
            known.add(fingerprint)
            summary.append(sentence)
    return summary


def tag_classes(tag) -> str:
    """Return the tag's ``class`` values joined by spaces ("" when it has none)."""
    names = tag.get("class")
    if not names:
        return ""
    return " ".join(names)


def block_kind(tag) -> str:
    """Classify a tag for body extraction.

    Returns "stop" for a reference section, "heading", "table", "list",
    "paragraph" or "caption" for recognized blocks, and "" for anything
    that should be ignored.
    """
    classes = tag_classes(tag)
    lowered = classes.lower()
    element_id = tag.get("id") or ""
    if "lemmaReference" in classes or "reference" in lowered or element_id == "J-lemma-reference":
        return "stop"

    kind_by_name = {
        "h1": "heading",
        "h2": "heading",
        "h3": "heading",
        "h4": "heading",
        "table": "table",
        "li": "list",
        "p": "paragraph",
    }
    if tag.name in kind_by_name:
        return kind_by_name[tag.name]
    if tag.name != "div":
        return ""

    # From here on only <div> elements are classified, by their class names.
    if "paraTitle" in classes or re.search(r"\blevel-\d", classes):
        return "heading"
    if "caption" in lowered:
        return "caption"
    if "para" in classes and any(marker in classes for marker in ("content", "summary", "MARK_MODULE")):
        return "paragraph"
    return ""


def heading_level(tag) -> str:
    """Return the Markdown heading prefix to use for a heading tag.

    ``h1`` maps to ``##``; ``h2``/``level-1`` to ``###``; ``h3``/``level-2``
    to ``####``; ``h4``/``level-3`` to ``#####``. Anything else gets ``###``.
    """
    classes = tag_classes(tag)
    rules = (
        ("h1", None, "##"),
        ("h2", "level-1", "###"),
        ("h3", "level-2", "####"),
        ("h4", "level-3", "#####"),
    )
    for tag_name, class_marker, prefix in rules:
        if tag.name == tag_name:
            return prefix
        if class_marker is not None and class_marker in classes:
            return prefix
    return "###"


def media_src(tag, final_url: str) -> str:
    """Resolve the media URL of an image/video tag against *final_url*.

    The first non-empty, non-``data:`` value among the known source
    attributes is used; failing that, the ``src`` of a nested ``<source>``
    element. Returns "" when nothing usable is found.
    """

    def usable(raw) -> str:
        value = clean_inline(raw or "")
        if value and not value.startswith("data:"):
            return value
        return ""

    for attr in ("src", "data-src", "data-original", "data-lazy-src", "data-url", "poster"):
        candidate = usable(tag.get(attr))
        if candidate:
            return urljoin(final_url, candidate)

    nested_source = tag.find("source") if hasattr(tag, "find") else None
    if nested_source:
        candidate = usable(nested_source.get("src"))
        if candidate:
            return urljoin(final_url, candidate)
    return ""


def media_caption(tag) -> str:
    """Find a caption for a media tag.

    The tag and up to five of its nearest ancestors are searched, in that
    order, for caption-like descendants; the first text of 2-120 characters
    that is not a bad line is returned. Otherwise the tag's ``alt``,
    ``title`` and ``aria-label`` attributes are tried. Returns "" if none fit.
    """

    def acceptable(text: str) -> bool:
        return 1 < len(text) <= 120 and not is_bad_line(text)

    selectors = (
        "[class*='swiperDesc']",
        "[class*='imageCaption']",
        "[class*='picDesc']",
        "[class*='caption']",
        "[class*='desc']",
        "figcaption",
    )
    scopes = [tag, *list(tag.parents)[:5]]
    for scope in scopes:
        if not scope or not hasattr(scope, "select_one"):
            continue
        for css in selectors:
            node = scope.select_one(css)
            if not node:
                continue
            text = clean_inline(node.get_text(" ", strip=True))
            if acceptable(text):
                return text

    for attr in ("alt", "title", "aria-label"):
        text = clean_inline(tag.get(attr) or "")
        if acceptable(text):
            return text
    return ""


def text_without_media(tag) -> str:
    """Return the cleaned text of *tag* with media and caption-like nodes removed.

    The tag is re-parsed from its string form, so all removals happen on a
    copy and the tree that *tag* belongs to is left untouched.
    """
    clone = BeautifulSoup(str(tag), "html.parser")
    # First element of the re-parsed copy, i.e. the copy of *tag* itself.
    root = clone.find()
    media_containers = []
    for media in clone.find_all(["img", "video", "source"]):
        # Walk up from the media element, stopping at the copy's root, and
        # remember the outermost ancestor that contains an <img>/<video>.
        # NOTE(review): an ancestor of an <img>/<video> always contains it, so
        # this presumably selects the whole top-level child of the root that
        # holds the media, including any text inside it - confirm intent.
        container = media
        for parent in media.parents:
            if parent is clone or parent == root:
                break
            if parent.find(["img", "video"]):
                container = parent
        if container not in media_containers:
            media_containers.append(container)
    for container in media_containers:
        container.decompose()
    # Second pass: drop any remaining media tags plus carousel, picture and
    # caption/description elements matched by class-name substring.
    for selector in (
        "img",
        "video",
        "source",
        "[class*='swiperUl']",
        "[class*='swiperLi']",
        "[class*='swiperDesc']",
        "[class*='SwiperDesc']",
        "[class*='lemmaPicture']",
        "[class*='LemmaPicture']",
        "[class*='imageCaption']",
        "[class*='ImageCaption']",
        "[class*='picDesc']",
        "[class*='PicDesc']",
        "[class*='caption']",
        "[class*='Caption']",
        "[class*='desc']",
        "[class*='Desc']",
    ):
        for found in clone.select(selector):
            found.decompose()
    return clean_inline(clone.get_text(" ", strip=True))


def is_media_noise(item: MediaItem) -> bool:
    """Tell whether a media item should be left out of the evidence table.

    An item is noise when it has no source URL, has neither caption nor alt
    text, mentions a UI action word in its caption/alt, points at a
    subscribe or app front-end asset, or sits in an excluded section.
    """
    if not item.src:
        return True
    if not (item.caption or item.alt):
        return True
    label_text = f"{item.caption} {item.alt}"
    ui_tokens = ("订阅", "收藏", "赞", "播放", "编辑")
    if any(token in label_text for token in ui_tokens):
        return True
    if any(marker in item.src for marker in ("subscribe", "front-end/swanapp-baike")):
        return True
    return item.section in {"相关星图", "词条统计"}


def extract_media_items(soup: BeautifulSoup, final_url: str) -> list[MediaItem]:
    """Collect the images and videos of the article body, tagged with their section.

    The content root is walked in document order. Headings update the
    current section, and traversal ends at the first block classified as
    "stop". Each ``<img>``/``<video>`` becomes a ``MediaItem`` with URLs
    resolved against *final_url*; duplicates and noise items are skipped.

    Returns:
        The kept media items in document order.
    """
    root = soup.select_one("div.J-lemma-content") or soup.select_one("div[class*='lemma-content']") or find_content_root(soup)
    items: list[MediaItem] = []
    seen = set()
    current_section = ""

    for tag in root.find_all(["h1", "h2", "h3", "h4", "div", "img", "video"], recursive=True):
        kind = block_kind(tag)
        if kind == "stop":
            break
        if kind == "heading":
            heading = clean_heading(tag.get_text(" ", strip=True))
            if heading and not is_bad_line(heading):
                current_section = heading
            continue
        if tag.name not in {"img", "video"}:
            continue

        src = media_src(tag, final_url)
        caption = media_caption(tag)
        alt = clean_inline(tag.get("alt") or tag.get("title") or "")
        parent_link = tag.find_parent("a")
        href = urljoin(final_url, parent_link.get("href")) if parent_link and parent_link.get("href") else ""
        # Prefer the enclosing link's id, then the tag's own id. The link may
        # be absent, so only read its id when it exists; the old expression
        # dereferenced a missing link whenever the tag itself carried an id.
        link_id = parent_link.get("id") if parent_link else None
        asset_id = clean_inline(link_id or tag.get("id") or "")
        item = MediaItem(
            kind="video" if tag.name == "video" else "image",
            section=current_section,
            caption=caption or alt,
            alt=alt,
            src=src,
            href=href,
            asset_id=asset_id,
            width=clean_inline(tag.get("width") or ""),
            height=clean_inline(tag.get("height") or ""),
        )
        # The same media under the same section and caption is listed once.
        key = (item.kind, item.section, item.caption, item.src)
        if key in seen or is_media_noise(item):
            continue
        seen.add(key)
        items.append(item)

    return items


def extract_body_blocks(soup: BeautifulSoup) -> tuple[list[str], list[str], int, int]:
    """Convert the article body into Markdown lines.

    Returns:
        ``(lines, headings, paragraph_count, table_count)``: the Markdown
        lines (each block followed by an empty line), the heading texts in
        order, the number of emitted paragraphs and list items, and the
        number of emitted tables.
    """
    root = soup.select_one("div.J-lemma-content") or soup.select_one("div[class*='lemma-content']") or find_content_root(soup)
    lines: list[str] = []
    headings: list[str] = []
    # Holds table bodies and normalized text keys of everything emitted so far;
    # the same content reached through nested tags is emitted only once.
    seen_blocks = set()
    paragraph_count = 0
    table_count = 0

    for tag in root.find_all(["h1", "h2", "h3", "h4", "p", "li", "table", "div"], recursive=True):
        kind = block_kind(tag)
        # A reference section ends the body.
        if kind == "stop":
            break
        if not kind:
            continue

        if kind == "table":
            table_lines = table_to_markdown(tag)
            if table_lines:
                key = "\n".join(table_lines)
                if key not in seen_blocks:
                    seen_blocks.add(key)
                    lines.extend(table_lines)
                    lines.append("")
                    table_count += 1
            continue

        # Captions are not emitted as body text.
        if kind == "caption":
            continue

        # Paragraphs and list items are read without media/caption nodes.
        text = text_without_media(tag) if kind in {"paragraph", "list"} else clean_inline(tag.get_text(" ", strip=True))
        if is_bad_line(text):
            continue
        # Trailing page sections also end the body.
        if any(text.startswith(prefix) for prefix in ("参考资料", "词条标签", "开放分类")):
            break

        if kind == "heading":
            text = clean_heading(text)
            key = norm_heading(text)
            if not key or key in seen_blocks:
                continue
            seen_blocks.add(key)
            headings.append(text)
            lines.extend([f"{heading_level(tag)} {text}", ""])
            continue

        if kind == "list":
            # List items longer than 240 characters are skipped.
            if len(text) > 240:
                continue
            rendered = f"- {text}"
        else:
            rendered = text

        key = norm_heading(rendered)
        if not key or key in seen_blocks:
            continue
        seen_blocks.add(key)
        lines.extend([rendered, ""])
        paragraph_count += 1

    return lines, headings, paragraph_count, table_count


def match_headings(catalog: list[str], body_headings: list[str]) -> tuple[list[str], list[str]]:
    """Split catalog entries into those found in the body and those missing.

    An entry matches when its normalized key contains, or is contained in,
    the normalized key of some body heading. Entries whose key is empty are
    ignored entirely.

    Returns:
        ``(matched, missing)``, both in catalog order.
    """
    body_keys = [key for key in map(norm_heading, body_headings) if key]
    matched: list[str] = []
    missing: list[str] = []
    for entry in catalog:
        entry_key = norm_heading(entry)
        if not entry_key:
            continue
        found = any(entry_key in body_key or body_key in entry_key for body_key in body_keys)
        (matched if found else missing).append(entry)
    return matched, missing


def build_markdown(
    *,
    html: str,
    source_name: str,
    query_name: str,
    requested_url: str,
    final_url: str,
    browser_title: str,
    min_catalog_coverage: float,
    min_chars: int,
) -> tuple[str, PageQuality, list[MediaItem]]:
    """Build the Markdown document and quality report for one rendered page.

    Args:
        html: Rendered page HTML.
        source_name: Fallback title when the page offers none.
        query_name: Requested entry name (or URL), recorded in the crawl info.
        requested_url: URL that was requested.
        final_url: URL after navigation.
        browser_title: Browser window title of the page.
        min_catalog_coverage: Minimum matched/catalog ratio; only checked
            when the page has a catalog.
        min_chars: Minimum length of the generated Markdown.

    Returns:
        ``(markdown, quality, media_items)``.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Remove non-content elements before any extraction.
    for tag in soup(["script", "style", "noscript", "iframe", "canvas", "svg", "button", "input", "form"]):
        tag.decompose()

    # Title preference: page <h1>, then browser title (site suffix cut off),
    # then the supplied source name.
    page_title = source_name
    h1 = soup.select_one("h1")
    if h1:
        page_title = clean_heading(h1.get_text(" ", strip=True)) or page_title
    elif browser_title:
        page_title = clean_heading(browser_title.split("_百度百科", 1)[0]) or page_title

    anti_crawl = anti_crawl_detected(html, final_url, browser_title)
    basic_info = extract_basic_info(soup)
    catalog = extract_catalog(soup)
    summary = extract_summary(soup)
    media_items = extract_media_items(soup, final_url)
    body_lines, body_headings, paragraph_count, table_count = extract_body_blocks(soup)
    matched_catalog, missing_catalog = match_headings(catalog, body_headings)

    # Coverage is 0.0 for a page without a catalog; that alone is not a failure.
    catalog_coverage = (len(matched_catalog) / len(catalog)) if catalog else 0.0
    reasons: list[str] = []
    if anti_crawl:
        reasons.append("hit Baidu anti-crawl/verification page")
    if len(body_lines) == 0:
        reasons.append("no body blocks extracted")
    if catalog and catalog_coverage < min_catalog_coverage:
        reasons.append(f"catalog coverage {catalog_coverage:.0%} < {min_catalog_coverage:.0%}")

    crawl_time = now_iso()
    # Completeness report. The status line is a placeholder that is replaced
    # once the final length check has run.
    lines = [
        f"# {page_title or source_name}",
        "",
        "## 完整度检查",
        "",
        "- 状态：PENDING",
        f"- 页面目录项：{len(catalog)}",
        f"- 正文标题数：{len(body_headings)}",
        f"- 目录覆盖率：{catalog_coverage:.0%} ({len(matched_catalog)}/{len(catalog) if catalog else 0})",
        f"- 段落/列表：{paragraph_count}",
        f"- 表格：{table_count}",
        f"- 反爬/验证页：{'是' if anti_crawl else '否'}",
    ]
    if missing_catalog:
        # At most 30 missing entries are listed in the report.
        lines.append("- 缺失目录项：" + "、".join(missing_catalog[:30]))
    lines.append("")

    # Crawl metadata section.
    lines.extend(
        [
            "## 页面正文 Markdown",
            "",
            "### 抓取信息（非原页面正文）",
            "",
            "- 数据源：百度百科",
            f"- 请求词条：{query_name}",
            f"- 页面标题：{page_title}",
            f"- 请求 URL：{requested_url}",
            f"- 最终 URL：{final_url}",
            f"- 抓取时间：{crawl_time}",
            "- 转换方式：rendered DOM fullpage + catalog validation",
            "",
        ]
    )

    if basic_info:
        lines.extend(["### 基本信息", "", "| 字段 | 值 |", "| --- | --- |"])
        for key, val in basic_info:
            # "|" would break the table row, so it is replaced by "/".
            lines.append(f"| {key.replace('|', '/')} | {val.replace('|', '/')} |")
        lines.append("")

    if summary:
        lines.extend(["### 摘要", ""])
        for item in summary:
            lines.extend([item, ""])

    if catalog:
        lines.extend(["### 页面目录", ""])
        for item in catalog:
            lines.append(f"- {item}")
        lines.append("")

    if media_items:
        lines.extend(
            [
                "### 媒体证据（图片/视频）",
                "",
                "| 所属章节 | 类型 | 说明 | URL |",
                "| --- | --- | --- | --- |",
            ]
        )
        for item in media_items:
            label = item.caption or item.alt
            # The enclosing link is preferred over the raw media source.
            url = item.href or item.src
            lines.append(
                f"| {item.section.replace('|', '/')} | {item.kind} | "
                f"{label.replace('|', '/')} | {url.replace('|', '/')} |"
            )
        lines.append("")

    lines.extend(body_lines)

    # The length check uses the document with the placeholder status still in it.
    markdown_without_status = "\n".join(lines).strip() + "\n"
    markdown_chars = len(markdown_without_status)
    if markdown_chars < min_chars:
        reasons.append(f"markdown chars {markdown_chars} < {min_chars}")
    status = "OK" if not reasons else "INCOMPLETE"
    reason = "ok" if not reasons else "; ".join(reasons)
    # Only the first occurrence (the report's status line) is replaced.
    markdown = markdown_without_status.replace("- 状态：PENDING", f"- 状态：{status}", 1)

    quality = PageQuality(
        status=status,
        reason=reason,
        page_title=page_title,
        final_url=final_url,
        markdown_chars=len(markdown),
        catalog_count=len(catalog),
        body_heading_count=len(body_headings),
        matched_catalog_count=len(matched_catalog),
        catalog_coverage=round(catalog_coverage, 4),
        missing_catalog_headings=missing_catalog,
        anti_crawl=anti_crawl,
        media_count=len(media_items),
    )
    return markdown, quality, media_items


def write_outputs(
    markdown: str,
    quality: PageQuality,
    media_items: list[MediaItem],
    out_dir: Path,
    filename: str,
    force: bool,
    write_json: bool,
) -> Path:
    """Write the Markdown file and, optionally, its JSON sidecar files.

    The output directory is created if needed. With *write_json* the
    ``.quality.json`` and ``.media.json`` sidecars are written next to the
    Markdown file; without it, existing sidecars of that name are deleted.

    Returns:
        Path of the written Markdown file.

    Raises:
        FileExistsError: The Markdown file exists and *force* is false.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    md_path = out_dir / filename
    if not force and md_path.exists():
        raise FileExistsError(f"Output exists, use --force: {md_path}")
    md_path.write_text(markdown, encoding="utf-8")

    quality_path = md_path.with_suffix(".quality.json")
    media_path = md_path.with_suffix(".media.json")

    if not write_json:
        # Remove sidecars left over from an earlier run.
        for stale in (quality_path, media_path):
            if stale.exists():
                stale.unlink()
        return md_path

    def dump(target: Path, payload) -> None:
        target.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")

    dump(quality_path, asdict(quality))
    dump(media_path, [asdict(item) for item in media_items])
    return md_path


def crawl_fullpage(args: argparse.Namespace) -> tuple[Path, PageQuality]:
    """Fetch one page, convert it to Markdown and write the result to disk.

    The URL is ``args.url`` when given, otherwise it is derived from
    ``args.name``. The output filename is ``args.output_name`` when given,
    otherwise a slug of the source name.

    Returns:
        ``(markdown_path, quality)``.

    Raises:
        SystemExit: Neither ``args.name`` nor ``args.url`` is set.
    """
    name, url = args.name, args.url
    if not (name or url):
        raise SystemExit("use --name or --url")

    requested_url = url or baike_url(name)
    query_name = name or url
    source_name = name or clean_heading(Path(requested_url).name)

    html, final_url, browser_title = fetch_rendered_page(
        requested_url,
        profile_dir=Path(args.profile_dir),
        headful=args.headful,
        manual_seconds=max(0, args.manual_seconds),
        timeout_ms=args.timeout_ms,
    )
    markdown, quality, media_items = build_markdown(
        html=html,
        source_name=source_name,
        query_name=query_name,
        requested_url=requested_url,
        final_url=final_url,
        browser_title=browser_title,
        min_catalog_coverage=args.min_catalog_coverage,
        min_chars=args.min_chars,
    )

    filename = args.output_name or f"{slugify(source_name)}.md"
    written = write_outputs(
        markdown,
        quality,
        media_items,
        Path(args.out_dir),
        filename,
        args.force,
        args.write_json,
    )
    return written, quality


def write_batch_index(rows: list[dict], out_dir: Path) -> None:
    """Write ``index.md`` in *out_dir*, summarizing the batch results so far.

    The file has a header with totals followed by one table row per entry,
    linking to the Markdown file when a path was recorded.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    ok_total = sum(row.get("status") == "OK" for row in rows)
    review_total = len(rows) - ok_total

    lines = [
        "# 贵州景点百度百科 Markdown 抓取清单",
        "",
        f"- 生成时间：{now_iso()}",
        f"- 文件数：{len(rows)}",
        f"- 成功：{ok_total}",
        f"- 需复核：{review_total}",
        "- 说明：完整度只用页面自身目录与正文标题校验，不预设业务目录。",
        "",
        "| # | 名称 | 页面标题 | Markdown | 目录覆盖 | 字符 | 媒体 | 状态 |",
        "| ---: | --- | --- | --- | ---: | ---: | ---: | --- |",
    ]

    for number, row in enumerate(rows, 1):
        md_name = Path(row["path"]).name if row.get("path") else ""
        md_link = f"[{md_name}](./{md_name})" if md_name else "-"
        cells = [
            f"{number}",
            f"{row.get('name', '')}",
            f"{row.get('page_title', '')}",
            md_link,
            f"{row.get('catalog_matched', 0)}/{row.get('catalog_count', 0)}",
            f"{row.get('chars', 0)}",
            f"{row.get('media_count', 0)}",
            f"{row.get('status', '')}",
        ]
        lines.append("| " + " | ".join(cells) + " |")

    index_path = out_dir / "index.md"
    index_path.write_text("\n".join(lines) + "\n", encoding="utf-8")


def crawl_guizhou_batch(args: argparse.Namespace) -> list[dict]:
    """Crawl the built-in Guizhou scenic-spot list and keep an index up to date.

    Each entry is crawled with a copy of *args* whose name, URL, output
    directory and output filename are set for that entry. A failing entry is
    recorded as an ``ERROR`` row instead of aborting the batch, and
    ``index.md`` is rewritten after every entry.

    Returns:
        One result row (dict) per processed entry, in crawl order.
    """
    # An explicit --out-dir wins; the default one is swapped for the batch directory.
    out_dir = Path(args.out_dir) if args.out_dir != str(DEFAULT_OUT_DIR) else DEFAULT_GUIZHOU_BATCH_DIR
    out_dir.mkdir(parents=True, exist_ok=True)
    rows: list[dict] = []
    total = len(GUIZHOU_SCENIC_SPOTS)
    # Clamp the limit to the list size. Otherwise an oversized --limit showed
    # a wrong progress total and triggered a pointless sleep after the last item.
    limit = min(args.limit, total) if args.limit and args.limit > 0 else total
    for idx, entry in enumerate(GUIZHOU_SCENIC_SPOTS[:limit], 1):
        name = entry["name"]
        item_args = argparse.Namespace(**vars(args))
        item_args.name = name
        item_args.url = entry.get("url")
        item_args.out_dir = str(out_dir)
        item_args.output_name = f"{idx:02d}_{slugify(name)}.md"
        print(f"[batch] {idx:02d}/{limit} {name}", flush=True)
        try:
            path, quality = crawl_fullpage(item_args)
            rows.append(
                {
                    "name": name,
                    "path": str(path),
                    "status": quality.status,
                    "page_title": quality.page_title,
                    "catalog_matched": quality.matched_catalog_count,
                    "catalog_count": quality.catalog_count,
                    "chars": quality.markdown_chars,
                    "media_count": quality.media_count,
                    "reason": quality.reason,
                }
            )
            print(
                f"  [{quality.status}] title={quality.page_title} "
                f"catalog={quality.matched_catalog_count}/{quality.catalog_count} "
                f"chars={quality.markdown_chars} media={quality.media_count}",
                flush=True,
            )
        except Exception as exc:  # noqa: BLE001
            # Best effort: one failing page must not stop the rest of the batch.
            rows.append(
                {
                    "name": name,
                    "path": "",
                    "status": "ERROR",
                    "page_title": "",
                    "catalog_matched": 0,
                    "catalog_count": 0,
                    "chars": 0,
                    "media_count": 0,
                    "reason": str(exc)[:200],
                }
            )
            print(f"  [ERROR] {str(exc)[:200]}", flush=True)
        # Rewrite the index after every entry so partial runs stay inspectable.
        write_batch_index(rows, out_dir)
        if idx < limit and args.sleep > 0:
            time.sleep(args.sleep + random.random() * 0.7)
    return rows


def main() -> int:
    """Parse the command line and run a single-page or batch crawl.

    Returns:
        Process exit status: 2 when ``--strict`` is set and a result is not
        ``OK``, otherwise 0.
    """
    cli = argparse.ArgumentParser()
    option_specs = [
        ("--name", {"help": "Baidu Baike entry name"}),
        ("--url", {"help": "Exact Baidu Baike URL; prefer item URLs with numeric lemma ID"}),
        ("--batch-guizhou", {"action": "store_true", "help": "Crawl built-in Guizhou scenic-spot sample list"}),
        ("--out-dir", {"default": str(DEFAULT_OUT_DIR)}),
        ("--output-name", {"help": "Markdown filename, for example 02_小七孔风景区_6899702.md"}),
        ("--profile-dir", {"default": str(DEFAULT_PROFILE_DIR)}),
        ("--headful", {"action": "store_true", "help": "Open a visible browser; useful when Baidu asks for verification"}),
        ("--manual-seconds", {"type": int, "default": 0, "help": "Wait this many seconds for manual verification if needed"}),
        ("--timeout-ms", {"type": int, "default": 60000}),
        ("--min-catalog-coverage", {"type": float, "default": 0.9}),
        ("--min-chars", {"type": int, "default": 2000}),
        ("--limit", {"type": int, "help": "Batch mode: crawl only first N items"}),
        ("--sleep", {"type": float, "default": 1.0, "help": "Batch mode: delay between pages"}),
        ("--write-json", {"action": "store_true", "help": "Also write .quality.json and .media.json sidecar files"}),
        ("--force", {"action": "store_true"}),
        ("--strict", {"action": "store_true", "help": "Exit 2 when quality status is not OK"}),
    ]
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    args = cli.parse_args()

    if not args.batch_guizhou:
        path, quality = crawl_fullpage(args)
        print(
            f"[{quality.status}] file={path} chars={quality.markdown_chars} "
            f"catalog={quality.matched_catalog_count}/{quality.catalog_count} "
            f"headings={quality.body_heading_count} reason={quality.reason}",
            flush=True,
        )
        if args.strict and quality.status != "OK":
            return 2
        return 0

    rows = crawl_guizhou_batch(args)
    has_bad_rows = any(row.get("status") != "OK" for row in rows)
    print(f"[done] {DEFAULT_GUIZHOU_BATCH_DIR if args.out_dir == str(DEFAULT_OUT_DIR) else args.out_dir}", flush=True)
    if args.strict and has_bad_rows:
        return 2
    return 0


# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())