bxh/app/agents/xhs_agent.py

"""小红书子 Agent（P3 v1）—— 舆情/体验数据采集 → ExperienceTag。

参考用户 test.py 的 DrissionPage 监听方案，改造为自治版：
  • 关键词由 Super Agent 按 KG 需求决定（不再 input()）；
  • 搜索后 **AI 读动态 tab 栏（综合/必吃榜/本地推荐/特色小吃…）按业务需求选 tab**
    （默认综合），不再人工选；
  • 登录用 **持久化 Chrome Profile**：一次人工登录→cookie 持久→之后无头复用；
    未登录/失效 **不阻塞**，升级开工单+通知管理员重登（与现有升级机制一致）；
  • 采集到的帖子/评论文本 → opus 提炼成 **体验标签 ExperienceTag** 挂到 Place。

责任边界：仅做合规、低频、不破解登录的采集；登录/验证码走人工一次性手动接管
（持久化复用，非凭证窃取）。数据先跑通，形态后续细化。
"""
from __future__ import annotations

import asyncio
import hashlib
import json
import os
import random
import time
from urllib.parse import quote

from app.config import settings
from app.db import get_agent_settings, sa_save_evidence
from app.llm_client import LlmClient


def _parse_api_notes(raw_api: list, name: str, district: str,
                     keyword: str) -> list[dict]:
    """解析已捕获的小红书搜索 API JSON → 全结构化证据记录（含 PII）。

    字段容错（小红书结构常变，沿用 test.py 的多路径兜底）。
    """
    out: list[dict] = []
    seen: set = set()
    for j in raw_api or []:
        if not isinstance(j, dict):
            continue
        items = (j.get("data") or {}).get("items") or []
        for it in items:
            nc = it.get("note_card") or it.get("noteCard") or {}
            if not nc:
                continue
            sid = (it.get("id") or nc.get("note_id")
                   or nc.get("id") or "")
            if not sid or sid in seen:
                continue
            seen.add(sid)
            u = nc.get("user") or nc.get("User") or {}
            ii = nc.get("interact_info") or nc.get("interactInfo") or {}
            cover = nc.get("cover") or {}
            imgs = []
            for im in (nc.get("image_list") or nc.get("images") or []):
                if isinstance(im, dict):
                    iu = (im.get("url_default") or im.get("url")
                          or im.get("url_pre") or "")
                    if iu:
                        imgs.append(iu)
                elif isinstance(im, str):
                    imgs.append(im)
            cu = (cover.get("url_default") or cover.get("url") or "") \
                if isinstance(cover, dict) else ""
            if cu and cu not in imgs:
                imgs.insert(0, cu)
            out.append({
                "platform": "xhs", "kind": "note", "source_id": str(sid),
                "url": f"https://www.xiaohongshu.com/explore/{sid}",
                "entity_name": name, "keyword": keyword,
                "title": nc.get("display_title") or nc.get("title") or "",
                "content": nc.get("desc") or nc.get("content") or "",
                "author": u.get("nick_name") or u.get("nickname")
                or u.get("name") or "",
                "author_id": u.get("user_id") or u.get("userId")
                or u.get("id") or "",
                "author_avatar": u.get("avatar") or u.get("image") or "",
                "likes": ii.get("liked_count") or ii.get("likes") or 0,
                "comments": ii.get("comment_count") or 0,
                "collects": ii.get("collected_count") or 0,
                "shares": ii.get("shared_count") or ii.get("share_count") or 0,
                "publish_time": nc.get("time") or nc.get("create_time") or "",
                "location": (nc.get("location") or {}).get("name", "")
                if isinstance(nc.get("location"), dict) else "",
                "tags": [t.get("name", "") for t in (nc.get("tag_list") or [])
                         if isinstance(t, dict)],
                "image_urls": imgs,
                "raw": it,
            })
    return out


def _parse_api_comments(raw_api: list, name: str,
                        keyword: str) -> list[dict]:
    """解析已捕获的小红书评论 API JSON → 评论证据(含时间/PII)。

    评论的"时间"是二期事件抽取的关键锚点。字段多路径容错。
    """
    out: list[dict] = []
    seen: set = set()
    for j in raw_api or []:
        if not isinstance(j, dict):
            continue
        data = j.get("data") or {}
        comments = data.get("comments")
        if not comments:
            continue
        note_id = data.get("note_id") or ""
        for cm in comments:
            cid = cm.get("comment_id") or cm.get("id") or ""
            if not cid or cid in seen:
                continue
            seen.add(cid)
            u = (cm.get("user_info") or cm.get("user")
                 or cm.get("User") or cm.get("author") or {})
            imgs = []
            for im in (cm.get("pictures") or cm.get("images")
                       or cm.get("image_list") or []):
                if isinstance(im, dict):
                    iu = im.get("url") or im.get("url_default") or ""
                    if iu:
                        imgs.append(iu)
                elif isinstance(im, str):
                    imgs.append(im)
            reps = (cm.get("sub_comments") or cm.get("replies")
                    or cm.get("reply_list") or [])
            out.append({
                "platform": "xhs", "kind": "comment", "source_id": str(cid),
                "url": f"https://www.xiaohongshu.com/explore/{note_id}"
                       if note_id else "",
                "entity_name": name, "keyword": keyword,
                "title": "", "content": cm.get("content")
                or cm.get("text") or "",
                "author": u.get("nickname") or u.get("nick_name")
                or u.get("name") or "",
                "author_id": u.get("user_id") or u.get("userId")
                or u.get("id") or "",
                "author_avatar": u.get("image") or u.get("avatar") or "",
                "likes": cm.get("like_count") or cm.get("likes") or 0,
                "comments": len(reps) if isinstance(reps, list) else 0,
                "collects": 0, "shares": 0,
                "publish_time": str(cm.get("create_time")
                                    or cm.get("time") or ""),
                "location": cm.get("ip_location") or "",
                "tags": [], "image_urls": imgs,
                "raw": {"note_id": note_id, **cm},
            })
    return out


# 专用持久化用户目录（与你日常 Chrome 隔离；一次人工登录后 cookie 常驻）
XHS_PROFILE_DIR = os.path.expanduser("~/.zn-kg/xhs-profile")
_SEARCH = "https://www.xiaohongshu.com/search_result?keyword={kw}"
_LISTEN = ["/api/sns/web/v1/feed", "/api/sns/web/v2/feed",
           "/api/sns/web/v1/search/notes", "/api/sns/web/v1/comment/list",
           "/api/sns/web/v2/comment/page", "/api/sns/web/v1/comment/sub/page"]
_DEEP_NOTES = 3              # 评论深采:每次点开前 N 个帖子滚动捞评论
_DEEP_SCROLL = 30            # 单帖最多滚动多少次(以评论接口不再增长提前停)
_DEEP_STALL = 4              # 连续 N 次滚动无新评论包 → 判该帖评论已捞尽

_TAB_SYS = """你在小红书搜索结果页选筛选 tab。给你：业务需求 + 当前页可选 tab 列表。
按业务需求选**最相关的一个** tab 名；拿不准就选「综合」。
只输出 JSON：{"tab":"要点的tab名","reason":"一句话"}"""

_TAG_SYS = """你从小红书 UGC（帖子标题/正文/评论）提炼某地点的"体验标签"。
只保留多条内容相互印证、对该地点有信息量的短标签（如：酸汤鱼必点、排队久、
人均50、网红打卡、本地人推荐、环境一般）。不编造，噪声/广告/无关一律丢弃。
只输出 JSON：{"tags":["标签1","标签2"],"sentiment":"正面|中性|负面|混合",
"summary":"一句话口碑概述"}"""


def _unlock_profile() -> None:
    """清理被 kill 的 Chromium 残留的 Singleton 死锁。

    本机单用户串行使用该持久化目录（编排器 xhs 步串行 + 登录脚本手动），
    Singleton* 几乎总是上次进程被 kill 的残留；清掉可避免
    "profile already in use" 误报。若真有活进程占用，新进程仍会自旋安全退出。
    """
    for n in ("SingletonLock", "SingletonCookie", "SingletonSocket"):
        try:
            os.unlink(os.path.join(XHS_PROFILE_DIR, n))
        except OSError:
            pass


async def _build_llm() -> LlmClient | None:
    try:
        cfg = await get_agent_settings()
    except Exception:
        cfg = {}
    g = cfg.get("global", {}) if cfg else {}
    a = (cfg.get("agents", {}) or {}).get("xhs_agent", {}) if cfg else {}
    if a and a.get("enabled") is False:
        return None
    key = a.get("api_key") or g.get("api_key") or settings.llm_api_key
    base = a.get("base_url") or g.get("base_url") or settings.llm_api_base
    if not key or not base:
        return None
    model = a.get("model") or g.get("model") or settings.llm_model or "deepseek-chat"
    return LlmClient(api_base=base, api_key=key, model=model,
                     timeout=int(g.get("timeout") or 120))


def _collect(keyword: str, pick_tab_cb, deep: bool = False) -> dict:
    """Playwright 持久化上下文：复用我们已验证的隐身栈 + cookie 持久化。

    返回 {logged_in, notes:[{title,author,likes}], raw_api:[...], tabs:[...]}。
    pick_tab_cb(tab_labels:list[str]) -> str|None  由上层注入(含 AI 决策)。
    持久化目录里一次人工登录(scripts/xhs_login.py)后，此处无头复用 cookie。
    """
    from app.agents.web_agent import _STEALTH_JS, _UA, _CHROME_ARGS
    os.makedirs(XHS_PROFILE_DIR, exist_ok=True)
    _unlock_profile()
    captured: list = []
    notes: list = []
    seen: set = set()
    labels: list = []
    try:
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            ctx = p.chromium.launch_persistent_context(
                user_data_dir=XHS_PROFILE_DIR, headless=True,
                args=_CHROME_ARGS, ignore_default_args=["--enable-automation"],
                user_agent=_UA, locale="zh-CN",
                viewport={"width": 1440, "height": 900})
            ctx.add_init_script(_STEALTH_JS)

            def _on_resp(resp):
                u = resp.url
                if any(k in u for k in _LISTEN):
                    try:
                        captured.append(resp.json())
                    except Exception:
                        pass
            ctx.on("response", _on_resp)
            pg = ctx.pages[0] if ctx.pages else ctx.new_page()
            pg.goto(_SEARCH.format(kw=quote(keyword)),
                    wait_until="domcontentloaded", timeout=45000)
            pg.wait_for_timeout(4000)

            note_eles = pg.query_selector_all("section.note-item")
            html = pg.content()
            if not note_eles and ("/login" in pg.url or "扫码登录" in html
                                  or "手机号登录" in html):
                ctx.close()
                return {"logged_in": False, "notes": [], "raw_api": []}

            # AI 选 tab
            try:
                tabs = pg.query_selector_all(
                    "div.content-container button.tab")
                labels = [(t.get_attribute("aria-details")
                           or t.inner_text() or "").strip() for t in tabs]
                labels = [x for x in labels if x]
                chosen = pick_tab_cb(labels) if labels else None
                if chosen and chosen != "综合":
                    for t in tabs:
                        lab = (t.get_attribute("aria-details")
                               or t.inner_text() or "").strip()
                        if lab == chosen:
                            t.click()
                            pg.wait_for_timeout(3000)
                            break
            except Exception:
                pass

            for _ in range(6):
                for it in pg.query_selector_all("section.note-item"):
                    try:
                        te = (it.query_selector(".title")
                              or it.query_selector("a.title"))
                        title = (te.inner_text().strip() if te else "")
                        if not title or title in seen:
                            continue
                        seen.add(title)
                        au = (it.query_selector(".author .name")
                              or it.query_selector(".name"))
                        lk = (it.query_selector(".like-wrapper .count")
                              or it.query_selector(".count"))
                        notes.append({
                            "title": title,
                            "author": au.inner_text().strip() if au else "",
                            "likes": lk.inner_text().strip() if lk else ""})
                    except Exception:
                        continue
                try:
                    pg.mouse.wheel(0, 2400)
                except Exception:
                    pass
                pg.wait_for_timeout(int(random.uniform(1200, 2000)))
                if len(notes) >= 40:
                    break

            # ── 评论深采：现版小红书点 cover = 整页跳转(非弹层)，
            #   故每帖在**独立新标签页**打开 note 详情，搜索页 pg 不受影响；
            #   ctx 级监听照样捕获 /comment 接口；以"评论接口数据增长"驱动
            #   滚动(真滚轮+容器scrollTo+End)，连续无新增或 THE END 即停。
            _SCROLLERS = (".note-scroller", ".comment-container",
                          ".interaction-container", ".comments-container")

            def _ccount() -> int:
                t = 0
                for j in captured:
                    if isinstance(j, dict):
                        cs = (j.get("data") or {}).get("comments")
                        if cs:
                            t += len(cs)
                return t

            if deep:
                search_url = pg.url
                for idx in range(_DEEP_NOTES):
                    try:
                        covers = pg.query_selector_all(
                            "section.note-item a.cover")
                        if idx >= len(covers):
                            break
                        # SPA 内点击进帖(浏览器自带签名发评论请求)，等导航完成
                        try:
                            with pg.expect_navigation(timeout=15000):
                                covers[idx].click()
                        except Exception:
                            covers[idx].click()
                        pg.wait_for_load_state("domcontentloaded",
                                               timeout=20000)
                        pg.wait_for_timeout(3200)        # 详情+首屏评论
                        if not pg.query_selector(".no-comments"):
                            stall = 0
                            prev = _ccount()
                            for _ in range(_DEEP_SCROLL):
                                scs = [x for x in
                                       (pg.query_selector(s)
                                        for s in _SCROLLERS) if x]
                                try:
                                    for sc in (scs or [None]):
                                        if sc:
                                            box = sc.bounding_box()
                                            if box:
                                                pg.mouse.move(
                                                    box["x"] + box["width"]/2,
                                                    box["y"] + box["height"]/2)
                                            pg.mouse.wheel(0, 4000)
                                            pg.evaluate(
                                                "(e)=>e&&e.scrollTo("
                                                "0,e.scrollHeight)", sc)
                                        else:
                                            pg.mouse.wheel(0, 4000)
                                    pg.keyboard.press("End")
                                except Exception:
                                    pass
                                pg.wait_for_timeout(
                                    int(random.uniform(1800, 2600)))
                                if (pg.query_selector(".end-container")
                                        or pg.query_selector(
                                            ".comment-end-container")):
                                    break                   # THE END
                                cur = _ccount()
                                if cur > prev:
                                    prev, stall = cur, 0
                                else:
                                    stall += 1
                                    if stall >= _DEEP_STALL:
                                        break               # 无新增→捞尽
                            pg.wait_for_timeout(3000)   # 等末尾评论包
                        # 返回搜索结果 SPA，准备下一帖
                        try:
                            with pg.expect_navigation(timeout=15000):
                                pg.go_back()
                        except Exception:
                            pg.goto(search_url,
                                    wait_until="domcontentloaded",
                                    timeout=30000)
                        pg.wait_for_load_state("domcontentloaded",
                                               timeout=20000)
                        pg.wait_for_timeout(
                            int(random.uniform(1500, 2400)))
                    except Exception:
                        try:                    # 卡住就回搜索页，保证后续不挂
                            pg.goto(search_url,
                                    wait_until="domcontentloaded",
                                    timeout=30000)
                            pg.wait_for_timeout(1500)
                        except Exception:
                            pass
                        continue
            ctx.close()
        # 已登录的小红书搜索页必有 tab 栏/搜索API；三者皆空 ⇒ 实为登录墙/拦截
        if not notes and not labels and not captured:
            return {"logged_in": False, "notes": [], "raw_api": []}
        return {"logged_in": True, "notes": notes[:40],
                "raw_api": captured, "tabs": labels}
    except Exception as e:  # noqa: BLE001
        return {"error": str(e)[:140]}


async def xhs_enrich(entity: dict) -> dict:
    """对某 Place 联小红书采 UGC → 体验标签。

    返回 {ok, found, need_login, tags, sentiment, summary}。
    ok=False=未配置/停用；need_login=True=要人工一次性登录(升级)。
    """
    llm = await _build_llm()
    if llm is None:
        return {"ok": False, "summary": "xhs_agent 未配置或停用"}

    name = entity.get("name", "")
    district = entity.get("district", "")
    biz = entity.get("biz_need") or f"补全「{name}」的真实口碑/体验/网红热度"
    keyword = f"贵阳 {name}".strip()

    def _pick(labels: list[str]) -> str | None:
        try:
            r = llm.chat_json(_TAB_SYS, json.dumps(
                {"业务需求": biz, "可选tab": labels}, ensure_ascii=False))
            return (r or {}).get("tab")
        except Exception:
            return None

    res = await asyncio.to_thread(_collect, keyword, _pick, True)
    if res.get("error"):
        return {"ok": True, "found": False, "summary": f"采集异常:{res['error']}"}
    if res.get("logged_in") is False:
        return {"ok": True, "found": False, "need_login": True,
                "summary": "小红书未登录，需一次性人工登录(已升级)"}

    dom_notes = res.get("notes") or []
    pnk = entity.get("eid") or entity.get("natural_key")

    # 1) 优先用已捕获的官方 API → 全结构化证据(含 PII，按你的决策全留)
    records = _parse_api_notes(res.get("raw_api") or [], name,
                               district, keyword)
    # 2) API 没捕到则用 DOM 卡片作轻证据(标题哈希作稳定 source_id)
    if not records and dom_notes:
        for n in dom_notes:
            t = n.get("title") or ""
            if not t:
                continue
            sid = "domhash:" + hashlib.md5(
                (name + "|" + t).encode()).hexdigest()[:16]
            records.append({
                "platform": "xhs", "kind": "note_lite", "source_id": sid,
                "url": "", "entity_name": name, "keyword": keyword,
                "title": t, "content": "", "author": n.get("author", ""),
                "author_id": "", "author_avatar": "",
                "likes": 0, "comments": 0, "collects": 0, "shares": 0,
                "publish_time": "", "location": "",
                "tags": [], "image_urls": [], "raw": n})

    # 3) 评论深采(滚动加载所得)→ 评论证据(含时间，事件抽取锚点)
    comment_records = _parse_api_comments(res.get("raw_api") or [],
                                          name, keyword)
    records += comment_records

    if not records:
        return {"ok": True, "found": False,
                "summary": f"「{name}」小红书无相关 UGC，跳过"}

    for rec in records:
        rec["place_natural_key"] = pnk
    try:
        saved = await sa_save_evidence(records)            # 原始证据入库
    except Exception:
        saved = 0
    n_note = sum(1 for x in records if x["kind"] != "comment")
    n_cmt = len(comment_records)

    # 4) 体验标签仅从帖子语料派生(评论留给事件抽取)，可溯源
    corpus = json.dumps(
        {"地点": name, "区县": district,
         "帖子": [{"标题": x["title"], "赞": x.get("likes", 0)}
                  for x in records if x["kind"] != "comment"][:40]},
        ensure_ascii=False)
    try:
        r = await asyncio.to_thread(llm.chat_json, _TAG_SYS, corpus)
    except Exception as e:  # noqa: BLE001
        return {"ok": True, "found": True, "evidence_saved": saved,
                "tags": [], "note_count": len(records),
                "summary": f"证据入库{saved}条，标签提炼失败:{str(e)[:50]}"}
    tags = [str(t).strip()[:24] for t in (r.get("tags") or []) if str(t).strip()]
    return {"ok": True, "found": True,
            "tags": tags[:12], "sentiment": r.get("sentiment", ""),
            "evidence_saved": saved, "note_count": n_note,
            "comment_count": n_cmt,
            "summary": f"小红书 帖{n_note}+评论{n_cmt} 入证据层(存{saved}) → "
                       f"{len(tags)} 体验标签·口碑{r.get('sentiment','?')}"}