Initial travel knowledge graph release
This commit is contained in:
403
app/agents/web_agent.py
Normal file
403
app/agents/web_agent.py
Normal file
@@ -0,0 +1,403 @@
|
||||
"""web_agent —— 通用联网采集(P2,多站点 + 反爬加固)。
|
||||
|
||||
来源策略(稳 → 富):
|
||||
1) 维基百科官方 API(为程序访问设计、合法、无反爬,带 UA 即可)—— 主源;
|
||||
2) 隐身真浏览器 `fetch_page(url)` —— 通用兜底,可抓任意站点(百度百科、
|
||||
官方文旅站等,未来多站点任务复用),带业界标准反检测层。
|
||||
|
||||
反检测层为本项目自写实现(webdriver/chrome/plugins/languages/permissions/
|
||||
WebGL 伪装 + 启动参数 + 拟人化),思路与开源 playwright-stealth 一致;
|
||||
研究了 agent-browser-runtime 的做法,但未拷贝其受限源码。
|
||||
|
||||
隐私红线沿用:只用 名称/地址/区县/类别 锚定(**不发坐标、不发电话**);
|
||||
公开网页/百科数据非高德机密。Playwright 用同步 API 跑在 to_thread。
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
from urllib.parse import quote
|
||||
|
||||
import httpx
|
||||
|
||||
import hashlib
|
||||
|
||||
from app.config import settings # noqa: F401 (some helpers may use)
|
||||
from app.db import get_agent_settings, sa_save_evidence
|
||||
from app.llm_client import LlmClient # noqa: F401 (fetch helpers may use)
|
||||
from app.agents.distill_gate import ATTR_FIELDS
|
||||
from app.agents.multi_extract import build_extract_pool, fan_out, decide
|
||||
|
||||
# Desktop Chrome User-Agent string presented by the stealth browser context
# (passed as `user_agent=` when the Playwright context is created).
_UA = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
       "AppleWebKit/537.36 (KHTML, like Gecko) "
       "Chrome/124.0.0.0 Safari/537.36")
# Self-identifying bot User-Agent sent with Wikipedia API requests.
_WIKI_UA = "ZNKG-KnowledgeGraphBot/1.0 (admin tourism KG; contact ops)"
# Business policy: the page text is NOT truncated (all of it is fed to opus to
# preserve extraction quality; the original page is also kept in the evidence layer).
|
||||
|
||||
# Industry-standard anti-detection init script (written in-house for this
# project; injected into the browser context before a real site is visited).
# What the script patches, in order:
#   - navigator.webdriver            -> undefined
#   - navigator.languages / language -> zh-CN first
#   - navigator.vendor               -> 'Google Inc.'
#   - window.chrome                  -> created with an empty `runtime` when
#                                       absent, then given app / csi / loadTimes stubs
#   - navigator.plugins              -> three fake Chrome plugins exposing
#                                       item() / namedItem()
#   - navigator.permissions.query    -> answers 'notifications' from
#                                       Notification.permission, delegates the rest
#   - WebGL / WebGL2 getParameter    -> fixed Intel strings for parameters
#                                       37445 / 37446 (unmasked vendor / renderer)
# NOTE(review): the saved `q` is later called as a bare function (`q(p)`),
# i.e. without `navigator.permissions` as receiver — this may throw
# "Illegal invocation" for non-notification queries; confirm in a real browser.
_STEALTH_JS = r"""
(() => {
const d=(o,k,g)=>{try{Object.defineProperty(o,k,{get:g,configurable:true})}catch(e){}};
d(Navigator.prototype,'webdriver',()=>undefined);
d(Navigator.prototype,'languages',()=>['zh-CN','zh','en']);
d(Navigator.prototype,'language',()=>'zh-CN');
d(Navigator.prototype,'vendor',()=>'Google Inc.');
if(!window.chrome){try{Object.defineProperty(window,'chrome',{value:{runtime:{}},configurable:true})}catch(e){}}
try{window.chrome.app={isInstalled:false};window.chrome.csi=()=>({});window.chrome.loadTimes=()=>({});}catch(e){}
const mk=(a)=>{a.item=(i)=>a[i]||null;a.namedItem=(n)=>a.find(x=>x.name===n)||null;return a;};
const plugins=mk([{name:'Chrome PDF Plugin'},{name:'Chrome PDF Viewer'},{name:'Native Client'}]);
d(Navigator.prototype,'plugins',()=>plugins);
const q=navigator.permissions&&navigator.permissions.query;
if(q){navigator.permissions.query=(p)=>p&&p.name==='notifications'
?Promise.resolve({state:Notification.permission,onchange:null}):q(p);}
const pw=(proto)=>{if(!proto||!proto.getParameter)return;const o=proto.getParameter;
proto.getParameter=function(p){if(p===37445)return 'Intel Inc.';
if(p===37446)return 'Intel Iris OpenGL Engine';return o.apply(this,arguments);};};
pw(window.WebGLRenderingContext&&WebGLRenderingContext.prototype);
pw(window.WebGL2RenderingContext&&WebGL2RenderingContext.prototype);
})();
"""

# Chromium command-line flags handed to `chromium.launch(args=...)` by the
# browser-based fetchers in this module.
_CHROME_ARGS = [
    "--disable-blink-features=AutomationControlled",
    "--no-first-run", "--no-default-browser-check",
    "--disable-sync", "--disable-default-apps",
    "--no-sandbox", "--disable-dev-shm-usage",
]
|
||||
|
||||
# System prompt sent to every extractor model in the fan-out stage of
# `web_enrich`. It asks the model to (1) confirm the page really describes the
# anchored place, (2) align the six soft fields with the graph's existing
# values (adopt / keep / conflict), and (3) report structured facts that do
# not fit those fields as `schema_gaps`. The reply must be a single JSON object.
_EXTRACT_SYS = """你是知识图谱数据抽取与对齐器。下面给你:
①【实体锚点】名称/高德地址/区县/类别(用于确认网页是不是这个地点);
②【网页正文】从公开权威页面抓取的真实文本;
③【图谱现有软字段】该地点图谱里已存在的值(可能来自上一次富集)。
任务:
1) 先判断网页是否就是这个锚点地点:不是/无法确认 → entity_match=false;
2) 仅依据网页正文(不得编造),抽取并与图谱对齐这些软字段:
summary/history/features/suitable_for/best_season/ticket_hint
- 图谱该字段为空 → 放 adopt(给出依据网页的准确简洁值)
- 图谱已有且与网页一致 → 放 keep
- 图谱已有且与网页实质矛盾 → 放 conflict(existing/web/理由,绝不擅改)
3) 网页中明显有、但上面6个软字段装不下的"结构化知识点"
(如 占地面积、海拔、文保级别、4A/5A、著名人物、重大历史事件、所属景区等)
→ 放 schema_gaps:[{"attr":"中文属性名","field":"英文蛇形","value":"取值","why":"为何值得入模型"}]
只输出 JSON:
{"entity_match":true|false,
"adopt":{"字段":"值"},"keep":["字段"],
"conflict":[{"field":"","existing":"","web":"","note":""}],
"schema_gaps":[{"attr":"","field":"","value":"","why":""}],
"confidence":0~1}"""


# System prompt for the aggregator ("decision") model in `web_enrich`. It
# receives the JSON results of all extractor models for the same page and
# anchor, and merges them by the voting rules A–F spelled out in the prompt.
# Compared with the extractor schema, the output adds an `uncertain` list and
# per-gap `consensus` / `voted_by` fields.
_AGG_SYS = """你是网页抽取裁决器。下面是多个模型对**同一份网页正文 + 同一个锚点**
各自给出的抽取结果(JSON)。请你裁决合并:

A. entity_match: 若 ≥半数模型说 false → 最终 false; 否则 true.
B. adopt 软字段(summary/history/features/suitable_for/best_season/ticket_hint):
- ≥2 模型对同一字段给出兼容/一致内容 → 采纳, 综合最完整版作为最终值
- 只 1 模型给的或互相矛盾 → 不进 adopt, 进 uncertain
C. keep: ≥半数模型说一致 → 进 keep
D. conflict: 任一模型标出与图谱矛盾 → 进 conflict, 多模型矛盾合并描述
E. schema_gaps: 同一 attr 在 ≥2 模型出现 → 高信(consensus=true); 单模型独发 → 低信(consensus=false)
合并时:
- 列表型(著名人物/景点/事件): 取所有模型并集去重
- 数值型(面积/海拔/绿地率): 多模型一致 → 高信, 不一致 → 注明分歧
- 描述型: 取最完整一份
F. 最终 confidence: 看共识程度, ≥3/4 模型一致 → 0.9+, 多分歧 → ≤0.6

只输出 JSON:
{"entity_match":true|false,
"adopt":{"字段":"值"},"keep":["字段"],
"conflict":[{"field":"","existing":"","web":"","note":""}],
"uncertain":["字段"],
"schema_gaps":[{"attr":"","field":"","value":"","why":"",
"consensus":true|false,"voted_by":["model_key1","..."]}],
"confidence":0~1}"""
|
||||
|
||||
|
||||
def _wiki_text(name: str) -> tuple[str | None, str | None]:
    """Fetch a plain-text extract for *name* from the Chinese Wikipedia API.

    This is the primary source: an official API meant for programmatic
    access, so a descriptive User-Agent is all it needs.

    Args:
        name: Article title to look up (redirects are followed by the API).

    Returns:
        ``(text, url)`` where *text* is the whitespace-collapsed extract and
        *url* the article URL, or ``(None, None)`` when the name is blank, the
        request fails, or the extract is shorter than 120 characters.
    """
    import logging

    # A blank title can never resolve to an article; skip the round trip.
    if not name or not name.strip():
        return None, None

    try:
        # TLS certificate verification is left at the httpx default (enabled):
        # the extract feeds the knowledge graph, so it must not be tamperable
        # in transit.
        resp = httpx.get(
            "https://zh.wikipedia.org/w/api.php",
            params={"format": "json", "action": "query", "prop": "extracts",
                    "explaintext": 1, "redirects": 1, "titles": name},
            headers={"User-Agent": _WIKI_UA, "Accept": "application/json"},
            timeout=20, follow_redirects=True)
        # Surface 4xx/5xx explicitly instead of trying to parse an error body.
        resp.raise_for_status()
        pages = resp.json().get("query", {}).get("pages", {})
        # The query asks for a single title, so only the first page matters.
        page = next(iter(pages.values()), {})
        extract = (page.get("extract") or "").strip()
        # Very short extracts (stubs, missing pages) are treated as "not found".
        if len(extract) >= 120:
            title = page.get("title", name)
            return (re.sub(r"\s+", " ", extract),
                    f"https://zh.wikipedia.org/wiki/{quote(title)}")
    except Exception:  # noqa: BLE001 - best-effort source; caller falls back
        logging.getLogger(__name__).warning(
            "wikipedia lookup failed for %r", name, exc_info=True)
    return None, None
|
||||
|
||||
|
||||
def fetch_page(url: str, wait_selector: str | None = None) -> tuple[str | None, str]:
    """Render *url* in a stealth headless Chromium and return its main text.

    Generic fallback fetcher intended for reuse across sites (encyclopedias,
    official tourism sites, directories, ...).

    Args:
        url: Page to open.
        wait_selector: Optional CSS selector to wait for (up to 8 s) after
            navigation; a timeout here is ignored.

    Returns:
        ``(text, final_url)``. *text* is the whitespace-collapsed inner text
        of the first of ``div.J-lemma-content`` / ``main`` / ``article`` /
        ``body`` that yields more than 120 characters, or ``None`` when
        nothing long enough was found. On any browser error the function does
        not raise; it returns ``(None, "ERR:<message up to 120 chars>")``.
    """
    # Upper bound (ms) for each candidate-selector lookup. Without it,
    # inner_text() waits the default 30 s for every selector that is absent.
    selector_timeout_ms = 3000
    body = ""
    final = url
    try:
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            b = p.chromium.launch(
                headless=True, args=_CHROME_ARGS,
                ignore_default_args=["--enable-automation"])
            try:
                ctx = b.new_context(
                    user_agent=_UA, locale="zh-CN",
                    viewport={"width": 1440, "height": 900},
                    extra_http_headers={"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"})
                ctx.add_init_script(_STEALTH_JS)
                pg = ctx.new_page()
                pg.goto(url, timeout=45000, wait_until="domcontentloaded")
                if wait_selector:
                    try:
                        pg.wait_for_selector(wait_selector, timeout=8000)
                    except Exception:  # noqa: BLE001 - selector is only a hint
                        pass
                pg.wait_for_timeout(random.randint(900, 1800))  # human-like pause
                try:
                    # Human-like scroll; purely cosmetic, so failures are ignored.
                    pg.mouse.wheel(0, random.randint(600, 1600))
                    pg.wait_for_timeout(random.randint(400, 900))
                except Exception:  # noqa: BLE001
                    pass
                final = pg.url
                # Most specific container first, whole <body> as last resort.
                for sel in ("div.J-lemma-content", "main", "article", "body"):
                    try:
                        body = pg.inner_text(sel, timeout=selector_timeout_ms)
                    except Exception:  # noqa: BLE001 - selector absent; try next
                        continue
                    if body and len(body) > 120:
                        break
            finally:
                # Close the browser on the error path as well.
                b.close()
    except Exception as e:  # noqa: BLE001
        return None, f"ERR:{str(e)[:120]}"
    if not body or len(body) < 120:
        return None, final
    return re.sub(r"\s+", " ", body).strip(), final
|
||||
|
||||
|
||||
def fetch_baidu_baike_text(url: str) -> tuple[str | None, str]:
    """Scrape summary, basic-info table and body text of a Baidu Baike entry.

    On the new Baike layout the "basic info" card lives outside
    ``div.J-lemma-content``; grabbing only the body would miss high-value
    structured fields such as location, opening hours, scenic level, ticket
    price and area. The in-page script therefore collects, in order: the
    entry title, whitelisted ``dt``/``dd`` pairs, a regex fallback over the
    page text for whitelisted labels still missing, the summary, and finally
    the body.

    Args:
        url: Baidu Baike entry URL.

    Returns:
        ``(text, final_url)``; *text* is ``None`` when fewer than 120
        characters were collected. On any browser error the function does not
        raise; it returns ``(None, "ERR:<message up to 120 chars>")``.
    """
    parts = ""
    final = url
    try:
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            b = p.chromium.launch(
                headless=True, args=_CHROME_ARGS,
                ignore_default_args=["--enable-automation"])
            try:
                ctx = b.new_context(
                    user_agent=_UA, locale="zh-CN",
                    viewport={"width": 1440, "height": 900},
                    extra_http_headers={"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"})
                ctx.add_init_script(_STEALTH_JS)
                pg = ctx.new_page()
                pg.goto(url, timeout=45000, wait_until="domcontentloaded")
                pg.wait_for_timeout(random.randint(900, 1800))  # human-like pause
                final = pg.url
                # The extraction runs inside the page; it returns one
                # newline-joined string of "label:value" parts.
                parts = pg.evaluate(
                    r"""() => {
                      const clean = (s) => (s || '').replace(/\s+/g, ' ').trim();
                      const out = [];
                      const seen = new Set();
                      const add = (label, text) => {
                        text = clean(text);
                        if (!text || text.length < 2 || seen.has(label + text)) return;
                        seen.add(label + text);
                        out.push(label ? `${label}:${text}` : text);
                      };

                      const title = clean(document.querySelector('h1')?.innerText);
                      if (title) add('词条名', title);

                      const wanted = new Set([
                        '中文名', '地理位置', '气候条件', '开放时间', '景点级别',
                        '门票价格', '占地面积', '著名景点', '邻近景点', '美誉', '美 誉',
                        '所属国家', '所属城市', '建议游玩时长', '适宜游玩季节'
                      ]);

                      // 新版百度百科基本信息通常是 dt/dd 对,class 名带 hash。
                      const dts = Array.from(document.querySelectorAll('dt'));
                      for (const dt of dts) {
                        const key = clean(dt.innerText);
                        if (!wanted.has(key)) continue;
                        const dd = dt.nextElementSibling;
                        const val = clean(dd && dd.tagName === 'DD' ? dd.innerText : '');
                        if (key && val) add(key, val);
                      }

                      // 兼容少数新版卡片:label/value 不一定使用 dt/dd。
                      const bodyText = clean(document.body?.innerText || '');
                      for (const key of wanted) {
                        if (out.some(x => x.startsWith(key + ':'))) continue;
                        const re = new RegExp(`${key}\\s+([^\\n]{1,80})`);
                        const m = bodyText.match(re);
                        if (m) add(key, m[1]);
                      }

                      const summary = clean(document.querySelector('.lemmaSummary, .lemmaWgt-lemmaSummary')?.innerText);
                      if (summary) add('摘要', summary);

                      const content = clean(document.querySelector('div.J-lemma-content')?.innerText);
                      if (content) add('正文', content);
                      return out.join('\n');
                    }"""
                )
            finally:
                # Close the browser on the error path as well.
                b.close()
    except Exception as e:  # noqa: BLE001
        return None, f"ERR:{str(e)[:120]}"
    if not parts or len(parts) < 120:
        return None, final
    return re.sub(r"\s+", " ", parts).strip(), final
|
||||
|
||||
|
||||
def _baike_text(name: str) -> tuple[str | None, str | None]:
    """Look up *name* on Baidu Baike via the stealth browser.

    Returns ``(text, final_url)`` on success. When no usable text came back,
    returns ``(None, entry_url)`` with the URL that was requested, so the
    caller can still record where it looked.
    """
    entry_url = f"https://baike.baidu.com/item/{quote(name)}"
    content, landed_url = fetch_baidu_baike_text(entry_url)
    return (content, landed_url) if content else (None, entry_url)
|
||||
|
||||
|
||||
async def fetch_entity_text(name: str) -> tuple[str | None, str | None, str]:
    """Fetch descriptive text for *name*: stable source first, rich source second.

    Tries the Wikipedia API, then falls back to Baidu Baike through the
    stealth browser. Both fetchers are blocking, so each runs in a worker
    thread.

    Returns ``(text, url, source)`` with *source* being ``"wikipedia"`` or
    ``"baike"``; when neither yields text, ``(None, url, "none")`` where *url*
    is whatever the last fetcher reported.
    """
    last_url = None
    for fetcher, label in ((_wiki_text, "wikipedia"), (_baike_text, "baike")):
        content, last_url = await asyncio.to_thread(fetcher, name)
        if content:
            return content, last_url, label
    return None, last_url, "none"
|
||||
|
||||
|
||||
def _normalize_decision(decided: dict) -> tuple[dict, list, list, list, list]:
    """Whitelist and sanitise the aggregator's JSON; returns (adopt, keep, conflict, uncertain, gaps)."""

    def _is_attr(key) -> bool:
        # Model output is untrusted: a non-string key could be unhashable.
        return isinstance(key, str) and key in ATTR_FIELDS

    def _as_list(value) -> list:
        # Anything that is not a JSON array is treated as "nothing reported".
        return value if isinstance(value, list) else []

    adopt: dict = {}
    raw_adopt = decided.get("adopt")
    if isinstance(raw_adopt, dict):
        for key, value in raw_adopt.items():
            # A JSON null must not be adopted as the literal string "None".
            if value is None or not _is_attr(key):
                continue
            cleaned = str(value).strip()
            if cleaned:
                adopt[key] = cleaned

    keep = [k for k in _as_list(decided.get("keep")) if _is_attr(k)]
    conflict = [c for c in _as_list(decided.get("conflict"))
                if isinstance(c, dict) and _is_attr(c.get("field"))]
    uncertain = [u for u in _as_list(decided.get("uncertain")) if _is_attr(u)]

    gaps = []
    for s in _as_list(decided.get("schema_gaps")):
        if not isinstance(s, dict) or not s.get("attr") or not s.get("value"):
            continue
        gaps.append({
            "attr": str(s.get("attr"))[:40],
            "field": str(s.get("field") or "")[:60],
            "value": s.get("value"),
            "why": str(s.get("why") or "")[:160],
            "consensus": bool(s.get("consensus")),
            "voted_by": [str(m)[:16] for m
                         in _as_list(s.get("voted_by")) if m][:8],
        })
    return adopt, keep, conflict, uncertain, gaps


async def web_enrich(entity: dict) -> dict:
    """Fetch an authoritative page, extract with several models, align with the graph.

    Pipeline: fetch text -> store the raw page as evidence (best effort) ->
    fan the extraction prompt out to the extractor pool -> let the aggregator
    model merge the answers -> whitelist the merged result.

    Args:
        entity: Dict read via ``.get``; keys used are ``name``, ``eid`` /
            ``natural_key``, ``address``, ``district``, ``place_type`` and
            ``existing`` (current soft-field values of the graph).

    Returns:
        A dict that always has ``ok`` and ``summary``. Depending on the
        outcome it also carries ``found``, ``entity_match``, ``adopt``,
        ``keep``, ``uncertain``, ``conflict``, ``schema_gaps`` (each with
        ``consensus`` / ``voted_by``), ``confidence``, ``url`` and ``source``.
        ``ok`` is ``False`` when extraction is not configured, every
        extractor failed, or the aggregator returned nothing usable.
    """
    import logging

    cfg = await get_agent_settings()
    extractors, agg, status_msg = build_extract_pool(cfg)
    if len(extractors) < 1 or agg is None:
        return {"ok": False,
                "summary": f"知识抽取未配置:{status_msg}"
                           f"(在系统设置 → 知识抽取 卡里启用)"}

    name = entity.get("name", "")
    pnk = entity.get("eid") or entity.get("natural_key") or ""
    text, url, source = await fetch_entity_text(name)
    if not text:
        return {"ok": True, "found": False, "url": url, "source": source,
                "adopt": {}, "keep": [], "conflict": [], "schema_gaps": [],
                "summary": "公开权威源无该词条/抓取失败,跳过(已标记)"}

    # Keep the raw page in the evidence layer (fetch once, mine many times).
    # Best effort: a storage failure must not abort the enrichment, but it is
    # logged rather than silently dropped.
    try:
        sid = "webpage:" + hashlib.md5(
            (url or name).encode("utf-8")).hexdigest()[:16]
        await sa_save_evidence([{
            "platform": source, "kind": "web_page", "source_id": sid,
            "url": url or "", "entity_name": name,
            "place_natural_key": pnk, "keyword": name,
            "title": name, "content": text,
            "author": "", "author_id": "", "author_avatar": "",
            "likes": 0, "comments": 0, "collects": 0, "shares": 0,
            "publish_time": "", "location": "",
            "tags": [], "image_urls": [],
            "raw": {"url": url, "source": source,
                    "char_count": len(text)},
        }])
    except Exception:  # noqa: BLE001
        logging.getLogger(__name__).warning(
            "web_enrich: failed to store page evidence for %r", name,
            exc_info=True)

    # Non-empty soft fields already in the graph; shared by both stages.
    existing = {k: v for k, v in (entity.get("existing") or {}).items() if v}

    body = json.dumps({
        "实体锚点": {"名称": entity.get("name"),
                 "高德地址": entity.get("address") or "",
                 "区县": entity.get("district") or "",
                 "类别": entity.get("place_type") or ""},
        "网页正文": text,
        "图谱现有软字段": existing,
    }, ensure_ascii=False)

    # === Stage 1: multi-model fan-out extraction ===
    responses = await fan_out(_EXTRACT_SYS, body, extractors)
    valid = [r for r in responses
             if r.get("data") and isinstance(r.get("data"), dict)]
    if not valid:
        errs = "; ".join(f"{r['model']}:{r.get('error','无返回')}"
                         for r in responses if r.get("error"))[:120]
        return {"ok": False, "url": url, "source": source,
                "summary": f"多模型抽取全部失败({status_msg}); {errs}"}

    ok_models = [r["model"] for r in valid]

    # === Stage 2: aggregator merges the extractor outputs ===
    agg_input = json.dumps({
        "锚点": {"名称": entity.get("name"),
               "地址": entity.get("address") or "",
               "区县": entity.get("district") or "",
               "类别": entity.get("place_type") or ""},
        "图谱现有软字段": existing,
        "多模型抽取": [{"model": r["model"], "data": r["data"]}
                  for r in valid],
    }, ensure_ascii=False)
    decided, err = await decide(_AGG_SYS, agg_input, agg)
    # A non-dict answer (e.g. a JSON array) is as unusable as no answer.
    if not decided or not isinstance(decided, dict):
        return {"ok": False, "url": url, "source": source,
                "summary": f"决策器({agg[0]})失败({err}); "
                           f"抽取器={ok_models}"}

    if decided.get("entity_match") is False:
        return {"ok": True, "found": True, "entity_match": False,
                "adopt": {}, "keep": [], "conflict": [],
                "uncertain": [], "schema_gaps": [],
                "url": url, "source": source,
                "summary": f"网页({source})与锚点不符(同名异地),跳过"}

    adopt, keep, conflict, uncertain, gaps = _normalize_decision(decided)

    return {"ok": True, "found": True, "entity_match": True,
            "adopt": adopt, "keep": keep, "conflict": conflict,
            "uncertain": uncertain,
            "schema_gaps": gaps[:30],
            "confidence": decided.get("confidence"),
            "url": url, "source": source,
            "summary": f"{source} · {len(ok_models)}/{len(extractors)}模型抽取"
                       f"({','.join(ok_models)}) · {agg[0]}决策 → "
                       f"采纳{len(adopt)}·一致{len(keep)}·矛盾{len(conflict)}"
                       f"·存疑{len(uncertain)}·schema缺口{len(gaps)}"
                       f"·正文{len(text)}字"}
|
||||
Reference in New Issue
Block a user