Initial travel knowledge graph release

2026-06-09 09:56:26 +08:00
commit 5f061295d8
402 changed files with 103877 additions and 0 deletions
--- a/app/agents/web_agent.py
+++ b/app/agents/web_agent.py
@@ -0,0 +1,403 @@
+"""web_agent —— 通用联网采集（P2，多站点 + 反爬加固）。
+
+来源策略（稳 → 富）：
+  1) 维基百科官方 API（为程序访问设计、合法、无反爬，带 UA 即可）—— 主源；
+  2) 隐身真浏览器 `fetch_page(url)` —— 通用兜底，可抓任意站点（百度百科、
+     官方文旅站等，未来多站点任务复用），带业界标准反检测层。
+
+反检测层为本项目自写实现（webdriver/chrome/plugins/languages/permissions/
+WebGL 伪装 + 启动参数 + 拟人化），思路与开源 playwright-stealth 一致；
+研究了 agent-browser-runtime 的做法，但未拷贝其受限源码。
+
+隐私红线沿用：只用 名称/地址/区县/类别 锚定（**不发坐标、不发电话**）；
+公开网页/百科数据非高德机密。Playwright 用同步 API 跑在 to_thread。
+"""
+from __future__ import annotations
+
+import asyncio
+import hashlib
+import json
+import logging
+import random
+import re
+from urllib.parse import quote
+
+import httpx
+
+from app.config import settings  # noqa: F401  (some helpers may use)
+from app.db import get_agent_settings, sa_save_evidence
+from app.llm_client import LlmClient  # noqa: F401  (fetch helpers may use)
+from app.agents.distill_gate import ATTR_FIELDS
+from app.agents.multi_extract import build_extract_pool, fan_out, decide
+
+# Desktop Chrome-on-macOS User-Agent string given to the Playwright browser
+# contexts created in this module.
+_UA = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+       "AppleWebKit/537.36 (KHTML, like Gecko) "
+       "Chrome/124.0.0.0 Safari/537.36")
+# Self-identifying bot User-Agent sent with the Wikipedia API request.
+_WIKI_UA = "ZNKG-KnowledgeGraphBot/1.0 (admin tourism KG; contact ops)"
+# Business rule: page text is never truncated (the whole text is fed to the
+# model to protect extraction quality -- the original note names "opus"; the
+# original page is additionally archived in the evidence layer).
+
+# Industry-standard anti-detection init script (written for this project);
+# registered on the browser context with add_init_script before visiting
+# real sites. What the script below patches:
+#   - navigator.webdriver -> undefined; languages / language / vendor -> fixed
+#   - window.chrome is created if absent, plus app / csi / loadTimes stubs
+#   - navigator.plugins -> three fake entries exposing item() / namedItem()
+#   - permissions.query({name: 'notifications'}) -> mirrors
+#     Notification.permission; other queries go to the original function
+#   - WebGL / WebGL2 getParameter(37445 | 37446) -> Intel vendor / renderer
+#     strings; every other parameter is delegated to the original
+_STEALTH_JS = r"""
+(() => {
+  const d=(o,k,g)=>{try{Object.defineProperty(o,k,{get:g,configurable:true})}catch(e){}};
+  d(Navigator.prototype,'webdriver',()=>undefined);
+  d(Navigator.prototype,'languages',()=>['zh-CN','zh','en']);
+  d(Navigator.prototype,'language',()=>'zh-CN');
+  d(Navigator.prototype,'vendor',()=>'Google Inc.');
+  if(!window.chrome){try{Object.defineProperty(window,'chrome',{value:{runtime:{}},configurable:true})}catch(e){}}
+  try{window.chrome.app={isInstalled:false};window.chrome.csi=()=>({});window.chrome.loadTimes=()=>({});}catch(e){}
+  const mk=(a)=>{a.item=(i)=>a[i]||null;a.namedItem=(n)=>a.find(x=>x.name===n)||null;return a;};
+  const plugins=mk([{name:'Chrome PDF Plugin'},{name:'Chrome PDF Viewer'},{name:'Native Client'}]);
+  d(Navigator.prototype,'plugins',()=>plugins);
+  const q=navigator.permissions&&navigator.permissions.query;
+  if(q){navigator.permissions.query=(p)=>p&&p.name==='notifications'
+    ?Promise.resolve({state:Notification.permission,onchange:null}):q(p);}
+  const pw=(proto)=>{if(!proto||!proto.getParameter)return;const o=proto.getParameter;
+    proto.getParameter=function(p){if(p===37445)return 'Intel Inc.';
+    if(p===37446)return 'Intel Iris OpenGL Engine';return o.apply(this,arguments);};};
+  pw(window.WebGLRenderingContext&&WebGLRenderingContext.prototype);
+  pw(window.WebGL2RenderingContext&&WebGL2RenderingContext.prototype);
+})();
+"""
+
+# Extra Chromium command-line switches passed to every browser launch in this
+# module (automation-flag suppression, first-run/sync/default-app noise off).
+# NOTE(review): "--no-sandbox" turns off Chromium's process sandbox; it is
+# presumably here for containerised/root execution -- confirm it is required.
+_CHROME_ARGS = [
+    "--disable-blink-features=AutomationControlled",
+    "--no-first-run", "--no-default-browser-check",
+    "--disable-sync", "--disable-default-apps",
+    "--no-sandbox", "--disable-dev-shm-usage",
+]
+
+# System prompt for each extractor model (stage 1 of web_enrich). It asks the
+# model to (1) decide whether the page is about the anchored place, (2) align
+# six soft fields against the graph's existing values into adopt / keep /
+# conflict, and (3) list structured facts that do not fit those fields as
+# schema_gaps. The model must answer with a single JSON object.
+# The prompt text is runtime data: edit with care, wording affects output.
+_EXTRACT_SYS = """你是知识图谱数据抽取与对齐器。下面给你：
+①【实体锚点】名称/高德地址/区县/类别（用于确认网页是不是这个地点）；
+②【网页正文】从公开权威页面抓取的真实文本；
+③【图谱现有软字段】该地点图谱里已存在的值（可能来自上一次富集）。
+任务：
+1) 先判断网页是否就是这个锚点地点：不是/无法确认 → entity_match=false；
+2) 仅依据网页正文（不得编造），抽取并与图谱对齐这些软字段：
+   summary/history/features/suitable_for/best_season/ticket_hint
+   - 图谱该字段为空 → 放 adopt（给出依据网页的准确简洁值）
+   - 图谱已有且与网页一致 → 放 keep
+   - 图谱已有且与网页实质矛盾 → 放 conflict（existing/web/理由，绝不擅改）
+3) 网页中明显有、但上面6个软字段装不下的"结构化知识点"
+   （如 占地面积、海拔、文保级别、4A/5A、著名人物、重大历史事件、所属景区等）
+   → 放 schema_gaps：[{"attr":"中文属性名","field":"英文蛇形","value":"取值","why":"为何值得入模型"}]
+只输出 JSON：
+{"entity_match":true|false,
+ "adopt":{"字段":"值"},"keep":["字段"],
+ "conflict":[{"field":"","existing":"","web":"","note":""}],
+ "schema_gaps":[{"attr":"","field":"","value":"","why":""}],
+ "confidence":0~1}"""
+
+
+# System prompt for the aggregator/decider model (stage 2 of web_enrich). It
+# receives the JSON results of several extractor models for the same page and
+# anchor, and is asked to merge them by voting rules A-F below, emitting one
+# JSON object that additionally carries "uncertain" and, per schema gap,
+# "consensus" / "voted_by".
+# The prompt text is runtime data: edit with care, wording affects output.
+_AGG_SYS = """你是网页抽取裁决器。下面是多个模型对**同一份网页正文 + 同一个锚点**
+各自给出的抽取结果(JSON)。请你裁决合并:
+
+A. entity_match: 若 ≥半数模型说 false → 最终 false; 否则 true.
+B. adopt 软字段(summary/history/features/suitable_for/best_season/ticket_hint):
+   - ≥2 模型对同一字段给出兼容/一致内容 → 采纳, 综合最完整版作为最终值
+   - 只 1 模型给的或互相矛盾 → 不进 adopt, 进 uncertain
+C. keep: ≥半数模型说一致 → 进 keep
+D. conflict: 任一模型标出与图谱矛盾 → 进 conflict, 多模型矛盾合并描述
+E. schema_gaps: 同一 attr 在 ≥2 模型出现 → 高信(consensus=true); 单模型独发 → 低信(consensus=false)
+   合并时:
+   - 列表型(著名人物/景点/事件): 取所有模型并集去重
+   - 数值型(面积/海拔/绿地率): 多模型一致 → 高信, 不一致 → 注明分歧
+   - 描述型: 取最完整一份
+F. 最终 confidence: 看共识程度, ≥3/4 模型一致 → 0.9+, 多分歧 → ≤0.6
+
+只输出 JSON:
+{"entity_match":true|false,
+ "adopt":{"字段":"值"},"keep":["字段"],
+ "conflict":[{"field":"","existing":"","web":"","note":""}],
+ "uncertain":["字段"],
+ "schema_gaps":[{"attr":"","field":"","value":"","why":"",
+                 "consensus":true|false,"voted_by":["model_key1","..."]}],
+ "confidence":0~1}"""
+
+
+def _wiki_text(name: str) -> tuple[str | None, str | None]:
+    """维基百科官方 API 取纯文本摘录（主源：合法、稳定、无反爬）。"""
+    try:
+        r = httpx.get(
+            "https://zh.wikipedia.org/w/api.php",
+            params={"format": "json", "action": "query", "prop": "extracts",
+                    "explaintext": 1, "redirects": 1, "titles": name},
+            headers={"User-Agent": _WIKI_UA, "Accept": "application/json"},
+            timeout=20, verify=False, follow_redirects=True)
+        pages = r.json().get("query", {}).get("pages", {})
+        page = next(iter(pages.values()), {})
+        ex = (page.get("extract") or "").strip()
+        if len(ex) >= 120:
+            title = page.get("title", name)
+            return (re.sub(r"\s+", " ", ex),
+                    f"https://zh.wikipedia.org/wiki/{quote(title)}")
+    except Exception:
+        pass
+    return None, None
+
+
+def fetch_page(url: str, wait_selector: str | None = None) -> tuple[str | None, str]:
+    """通用隐身浏览器抓任意页面渲染正文。返回 (text, final_url)。
+
+    供未来多站点采集复用（百科/官方文旅站/名录…）。
+    """
+    try:
+        from playwright.sync_api import sync_playwright
+        with sync_playwright() as p:
+            b = p.chromium.launch(
+                headless=True, args=_CHROME_ARGS,
+                ignore_default_args=["--enable-automation"])
+            ctx = b.new_context(
+                user_agent=_UA, locale="zh-CN",
+                viewport={"width": 1440, "height": 900},
+                extra_http_headers={"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"})
+            ctx.add_init_script(_STEALTH_JS)
+            pg = ctx.new_page()
+            pg.goto(url, timeout=45000, wait_until="domcontentloaded")
+            if wait_selector:
+                try:
+                    pg.wait_for_selector(wait_selector, timeout=8000)
+                except Exception:
+                    pass
+            pg.wait_for_timeout(random.randint(900, 1800))   # 拟人化停顿
+            try:
+                pg.mouse.wheel(0, random.randint(600, 1600))
+                pg.wait_for_timeout(random.randint(400, 900))
+            except Exception:
+                pass
+            final = pg.url
+            body = ""
+            for sel in ("div.J-lemma-content", "main", "article", "body"):
+                try:
+                    body = pg.inner_text(sel)
+                    if body and len(body) > 120:
+                        break
+                except Exception:
+                    continue
+            b.close()
+    except Exception as e:  # noqa: BLE001
+        return None, f"ERR:{str(e)[:120]}"
+    if not body or len(body) < 120:
+        return None, final
+    return re.sub(r"\s+", " ", body).strip(), final
+
+
+def fetch_baidu_baike_text(url: str) -> tuple[str | None, str]:
+    """抓百度百科词条的摘要、基本信息表和正文。
+
+    百度百科新版页面的“基本信息”不在 div.J-lemma-content 内；如果只抓正文，
+    会漏掉地理位置、开放时间、景点级别、门票、面积等高价值结构化字段。
+    """
+    try:
+        from playwright.sync_api import sync_playwright
+        with sync_playwright() as p:
+            b = p.chromium.launch(
+                headless=True, args=_CHROME_ARGS,
+                ignore_default_args=["--enable-automation"])
+            ctx = b.new_context(
+                user_agent=_UA, locale="zh-CN",
+                viewport={"width": 1440, "height": 900},
+                extra_http_headers={"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"})
+            ctx.add_init_script(_STEALTH_JS)
+            pg = ctx.new_page()
+            pg.goto(url, timeout=45000, wait_until="domcontentloaded")
+            pg.wait_for_timeout(random.randint(900, 1800))
+            final = pg.url
+            parts = pg.evaluate(
+                r"""() => {
+                  const clean = (s) => (s || '').replace(/\s+/g, ' ').trim();
+                  const out = [];
+                  const seen = new Set();
+                  const add = (label, text) => {
+                    text = clean(text);
+                    if (!text || text.length < 2 || seen.has(label + text)) return;
+                    seen.add(label + text);
+                    out.push(label ? `${label}：${text}` : text);
+                  };
+
+                  const title = clean(document.querySelector('h1')?.innerText);
+                  if (title) add('词条名', title);
+
+                  const wanted = new Set([
+                    '中文名', '地理位置', '气候条件', '开放时间', '景点级别',
+                    '门票价格', '占地面积', '著名景点', '邻近景点', '美誉', '美 誉',
+                    '所属国家', '所属城市', '建议游玩时长', '适宜游玩季节'
+                  ]);
+
+                  // 新版百度百科基本信息通常是 dt/dd 对，class 名带 hash。
+                  const dts = Array.from(document.querySelectorAll('dt'));
+                  for (const dt of dts) {
+                    const key = clean(dt.innerText);
+                    if (!wanted.has(key)) continue;
+                    const dd = dt.nextElementSibling;
+                    const val = clean(dd && dd.tagName === 'DD' ? dd.innerText : '');
+                    if (key && val) add(key, val);
+                  }
+
+                  // 兼容少数新版卡片：label/value 不一定使用 dt/dd。
+                  const bodyText = clean(document.body?.innerText || '');
+                  for (const key of wanted) {
+                    if (out.some(x => x.startsWith(key + '：'))) continue;
+                    const re = new RegExp(`${key}\\s+([^\\n]{1,80})`);
+                    const m = bodyText.match(re);
+                    if (m) add(key, m[1]);
+                  }
+
+                  const summary = clean(document.querySelector('.lemmaSummary, .lemmaWgt-lemmaSummary')?.innerText);
+                  if (summary) add('摘要', summary);
+
+                  const content = clean(document.querySelector('div.J-lemma-content')?.innerText);
+                  if (content) add('正文', content);
+                  return out.join('\n');
+                }"""
+            )
+            b.close()
+    except Exception as e:  # noqa: BLE001
+        return None, f"ERR:{str(e)[:120]}"
+    if not parts or len(parts) < 120:
+        return None, final
+    return re.sub(r"\s+", " ", parts).strip(), final
+
+
+def _baike_text(name: str) -> tuple[str | None, str | None]:
+    url = f"https://baike.baidu.com/item/{quote(name)}"
+    text, final = fetch_baidu_baike_text(url)
+    if not text:
+        return None, url
+    return text, final
+
+
+async def fetch_entity_text(name: str) -> tuple[str | None, str | None, str]:
+    """稳→富：维基 API 主源；不足则隐身浏览器抓百度百科兜底。"""
+    text, url = await asyncio.to_thread(_wiki_text, name)
+    if text:
+        return text, url, "wikipedia"
+    text, url = await asyncio.to_thread(_baike_text, name)
+    if text:
+        return text, url, "baike"
+    return None, url, "none"
+
+
+async def web_enrich(entity: dict) -> dict:
+    """联网抓权威网页 → 多模型抽取(3 抽 + 1 决策) → 对齐 + schema 缺口。
+
+    主 agent (opus/global) 不参与抽取, 只调度。抽取走 extract 池(deepseek/doubao/qwen 等),
+    共享 distill.models 的 API 配置, 避免主 agent 一处欠费全瘫。
+
+    返回 {ok, found, entity_match, adopt, keep, uncertain, conflict, schema_gaps,
+          confidence, url, source, summary}。
+    schema_gaps 每条额外带 consensus/voted_by (谁支持).
+    """
+    cfg = await get_agent_settings()
+    extractors, agg, status_msg = build_extract_pool(cfg)
+    if len(extractors) < 1 or agg is None:
+        return {"ok": False,
+                "summary": f"知识抽取未配置：{status_msg}"
+                           f"（在系统设置 → 知识抽取 卡里启用）"}
+
+    name = entity.get("name", "")
+    pnk = entity.get("eid") or entity.get("natural_key") or ""
+    text, url, source = await fetch_entity_text(name)
+    if not text:
+        return {"ok": True, "found": False, "url": url, "source": source,
+                "adopt": {}, "keep": [], "conflict": [], "schema_gaps": [],
+                "summary": "公开权威源无该词条/抓取失败，跳过(已标记)"}
+
+    # 原页留底证据层(抓一次挖多次,后续业务需求免重抓)
+    try:
+        sid = "webpage:" + hashlib.md5(
+            (url or name).encode("utf-8")).hexdigest()[:16]
+        await sa_save_evidence([{
+            "platform": source, "kind": "web_page", "source_id": sid,
+            "url": url or "", "entity_name": name,
+            "place_natural_key": pnk, "keyword": name,
+            "title": name, "content": text,
+            "author": "", "author_id": "", "author_avatar": "",
+            "likes": 0, "comments": 0, "collects": 0, "shares": 0,
+            "publish_time": "", "location": "",
+            "tags": [], "image_urls": [],
+            "raw": {"url": url, "source": source,
+                    "char_count": len(text)},
+        }])
+    except Exception:
+        pass
+
+    body = json.dumps({
+        "实体锚点": {"名称": entity.get("name"),
+                     "高德地址": entity.get("address") or "",
+                     "区县": entity.get("district") or "",
+                     "类别": entity.get("place_type") or ""},
+        "网页正文": text,
+        "图谱现有软字段": {k: v for k, v in
+                           (entity.get("existing") or {}).items() if v},
+    }, ensure_ascii=False)
+
+    # === 阶段 1: 多模型扇出抽取 ===
+    responses = await fan_out(_EXTRACT_SYS, body, extractors)
+    valid = [r for r in responses
+             if r.get("data") and isinstance(r.get("data"), dict)]
+    if not valid:
+        errs = "; ".join(f"{r['model']}:{r.get('error','无返回')}"
+                        for r in responses if r.get("error"))[:120]
+        return {"ok": False, "url": url, "source": source,
+                "summary": f"多模型抽取全部失败({status_msg}); {errs}"}
+
+    ok_models = [r["model"] for r in valid]
+
+    # === 阶段 2: 决策器合并 ===
+    agg_input = json.dumps({
+        "锚点": {"名称": entity.get("name"),
+                 "地址": entity.get("address") or "",
+                 "区县": entity.get("district") or "",
+                 "类别": entity.get("place_type") or ""},
+        "图谱现有软字段": {k: v for k, v
+                           in (entity.get("existing") or {}).items() if v},
+        "多模型抽取": [{"model": r["model"], "data": r["data"]}
+                       for r in valid],
+    }, ensure_ascii=False)
+    decided, err = await decide(_AGG_SYS, agg_input, agg)
+    if not decided:
+        return {"ok": False, "url": url, "source": source,
+                "summary": f"决策器({agg[0]})失败({err}); "
+                           f"抽取器={ok_models}"}
+
+    if decided.get("entity_match") is False:
+        return {"ok": True, "found": True, "entity_match": False,
+                "adopt": {}, "keep": [], "conflict": [],
+                "uncertain": [], "schema_gaps": [],
+                "url": url, "source": source,
+                "summary": f"网页({source})与锚点不符(同名异地)，跳过"}
+
+    adopt = {k: str(v).strip()
+             for k, v in (decided.get("adopt") or {}).items()
+             if k in ATTR_FIELDS and str(v).strip()}
+    keep = [k for k in (decided.get("keep") or []) if k in ATTR_FIELDS]
+    conflict = [c for c in (decided.get("conflict") or [])
+                if isinstance(c, dict) and c.get("field") in ATTR_FIELDS]
+    uncertain = [u for u in (decided.get("uncertain") or [])
+                 if u in ATTR_FIELDS]
+    gaps = []
+    for s in (decided.get("schema_gaps") or []):
+        if not isinstance(s, dict) or not s.get("attr") or not s.get("value"):
+            continue
+        gaps.append({
+            "attr": str(s.get("attr"))[:40],
+            "field": str(s.get("field") or "")[:60],
+            "value": s.get("value"),
+            "why": str(s.get("why") or "")[:160],
+            "consensus": bool(s.get("consensus")),
+            "voted_by": [str(m)[:16] for m
+                         in (s.get("voted_by") or []) if m][:8],
+        })
+
+    return {"ok": True, "found": True, "entity_match": True,
+            "adopt": adopt, "keep": keep, "conflict": conflict,
+            "uncertain": uncertain,
+            "schema_gaps": gaps[:30],
+            "confidence": decided.get("confidence"),
+            "url": url, "source": source,
+            "summary": f"{source} · {len(ok_models)}/{len(extractors)}模型抽取"
+                       f"({','.join(ok_models)}) · {agg[0]}决策 → "
+                       f"采纳{len(adopt)}·一致{len(keep)}·矛盾{len(conflict)}"
+                       f"·存疑{len(uncertain)}·schema缺口{len(gaps)}"
+                       f"·正文{len(text)}字"}