857 lines
39 KiB
Python
857 lines
39 KiB
Python
"""Super Agent — 知识图谱馆长(全城网格自治采集,常驻不停)。
|
||
|
||
科学合理 & 省额度的核心:**地理网格扫描**,不再用"热度关键词"。
|
||
• 把贵阳整个城市切成网格,逐格用高德"多边形(矩形)搜索 + 官方类型编码"
|
||
系统性扫,每格每页都是不同地理切片 → 几乎不重复 → 不浪费 API 额度。
|
||
• 每格扫到第几页**持久化**(gaode_grid_cells.next_page),跨步/跨轮接着扫,
|
||
扫尽的格标记 exhausted **永不重复请求**(省额度的关键)。
|
||
• 稠密区一格出不完 → **自适应四叉细分**,保证不漏。
|
||
• 全城扫完不"结束",转**驻守巡检**(不再消耗额度,仅响应停止/再播种)。
|
||
• 进度=网格地理覆盖率,真实可反映,非拍脑袋数字。
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import re
|
||
|
||
from app.api.graph import _get_graph
|
||
from app.config import settings
|
||
from app.agents.gaode_connector import AMAP_TYPECODES, search_polygon
|
||
from app.agents.super_ingest import ingest_rows
|
||
from app.agents.distill_gate import distill_entity, ATTR_FIELDS
|
||
from app.agents.web_agent import web_enrich
|
||
from app.agents.xhs_agent import xhs_enrich
|
||
from app.agents.douyin_agent import douyin_enrich
|
||
from app.agents.event_miner import mine_events
|
||
from app.db import (
|
||
sa_append_step, sa_finish, sa_stop_requested, sa_add_task, sa_set_status,
|
||
sa_has_open_escalation, create_acquisition_task, create_notification,
|
||
get_admin_user_id, grid_seed, grid_counts, grid_pending_cats,
|
||
grid_take_next, grid_update, grid_subdivide,
|
||
sa_merge_candidate_payload, sa_record_conflict, sa_record_schema_proposal,
|
||
)
|
||
|
||
_BG: set = set()
|
||
|
||
# 贵阳整市外接框 (min_lng, min_lat, max_lng, max_lat):含主城+周边区县
|
||
_GY_BBOX = (106.20, 26.10, 107.30, 27.05)
|
||
_CELL = 0.08 # 初始网格边长(°),约 8km
|
||
_OFFSET = 25 # 高德多边形单页上限
|
||
_MAX_PAGE_SPLIT = 8 # 单格翻到第N页仍满 → 太密 → 四叉细分
|
||
_MAX_DEPTH = 3 # 最深细分层级(0.08→约1km格)
|
||
_API_PACING = 0.35 # 每次高德调用间隔(秒),护 QPS
|
||
_HARD_MAX_STEPS = 100000 # 仅极端兜底(持久化,跨轮续扫,不暴露用户)
|
||
_MAX_API_PER_RUN = 6000 # 单进程跑安全上限;命中转驻守,下轮自动续
|
||
_STEWARD_INTERVAL = 1800
|
||
_STEWARD_TICK = 10
|
||
_ENRICH_BATCH = 5 # 每步最多蒸馏富集多少个实体
|
||
_ENRICH_EVERY = 4 # 网格忙时每 N 步插一次知识富集
|
||
_WEB_BATCH = 2 # 每步最多联网富集多少个实体(真浏览器较慢)
|
||
_XHS_BATCH = 1 # 小红书浏览器很慢/脆,每步只采 1 个
|
||
|
||
|
||
def _place_counts() -> dict:
|
||
g = _get_graph()
|
||
out: dict = {}
|
||
try:
|
||
for row in g.query(
|
||
"MATCH (p:Place) RETURN coalesce(p.place_type,'?'), count(*)"
|
||
).result_set:
|
||
out[row[0]] = row[1]
|
||
except Exception:
|
||
pass
|
||
return out
|
||
|
||
|
||
def _coverage() -> dict:
|
||
from app.agents.super_ingest import _PT
|
||
counts = _place_counts()
|
||
cat2pt = {cat: _PT.get(cat, "poi") for cat in AMAP_TYPECODES}
|
||
items = [{"cat": cat, "place_type": cat2pt[cat],
|
||
"current": int(counts.get(cat2pt[cat], 0))}
|
||
for cat in AMAP_TYPECODES]
|
||
return {"items": items, "total": sum(i["current"] for i in items)}
|
||
|
||
|
||
def _grid_for(bbox: tuple, step: float) -> list[tuple]:
|
||
mnlng, mnlat, mxlng, mxlat = bbox
|
||
cells, lng = [], mnlng
|
||
while lng < mxlng:
|
||
lat = mnlat
|
||
nlng = round(min(lng + step, mxlng), 6)
|
||
while lat < mxlat:
|
||
nlat = round(min(lat + step, mxlat), 6)
|
||
cells.append((round(lng, 6), round(lat, 6), nlng, nlat))
|
||
lat = nlat
|
||
lng = nlng
|
||
return cells
|
||
|
||
|
||
def _quads(c: dict) -> list[tuple]:
|
||
"""父格四等分为 4 个子格(矩形对半切)。"""
|
||
mnlng, mnlat = c["min_lng"], c["min_lat"]
|
||
mxlng, mxlat = c["max_lng"], c["max_lat"]
|
||
midlng = round((mnlng + mxlng) / 2, 6)
|
||
midlat = round((mnlat + mxlat) / 2, 6)
|
||
return [
|
||
(mnlng, mnlat, midlng, midlat), (midlng, mnlat, mxlng, midlat),
|
||
(mnlng, midlat, midlng, mxlat), (midlng, midlat, mxlng, mxlat),
|
||
]
|
||
|
||
|
||
async def _seed_if_needed() -> None:
|
||
have = await grid_counts()
|
||
for cat, tc in AMAP_TYPECODES.items():
|
||
if cat not in have:
|
||
await grid_seed(cat, tc, _grid_for(_GY_BBOX, _CELL))
|
||
|
||
|
||
async def _escalate(run_id: int, step: int, cat: str, cur: int) -> None:
|
||
if await sa_has_open_escalation(cat):
|
||
await sa_add_task(run_id, step, cat, "escalate",
|
||
"全城网格已扫尽,该类仍偏少", "skip_dup",
|
||
status="escalated", note="已有未结工单,不重复打扰")
|
||
return
|
||
task_id = None
|
||
try:
|
||
task = await create_acquisition_task({
|
||
"tenant_id": settings.default_tenant,
|
||
"project_id": settings.default_project,
|
||
"created_by": "super_agent",
|
||
"title": f"【Super Agent 求助】「{cat}」全城网格已扫尽仍偏少",
|
||
"description": (
|
||
f"馆长已对贵阳全城做网格化高德采集,「{cat}」当前仅 {cur} 条,"
|
||
f"高德官方源对该类覆盖有限。建议人工核查类型编码或改用"
|
||
f"小红书/大众点评/官方名录等渠道补全。馆长继续驻守,"
|
||
f"工单结清/数据增长后自动恢复。"),
|
||
"scenario_tags": ["super_agent", "escalation", cat],
|
||
"target_entity_types": ["Place"],
|
||
"target_fields": [],
|
||
"suggested_collection_method": "manual_or_alt_source",
|
||
"priority": 1,
|
||
})
|
||
task_id = task["id"]
|
||
except Exception:
|
||
task_id = None
|
||
try:
|
||
uid = await get_admin_user_id()
|
||
if uid:
|
||
await create_notification(
|
||
uid, title=f"Super Agent 求助:「{cat}」全城网格已扫尽",
|
||
body=(f"「{cat}」仅 {cur} 条,高德源覆盖有限。已开工单"
|
||
+ (f" #{task_id}" if task_id else "")
|
||
+ ",请人工/改渠道;馆长继续驻守。"),
|
||
ntype="task", related_task_id=task_id)
|
||
except Exception:
|
||
pass
|
||
await sa_add_task(run_id, step, cat, "escalate",
|
||
"全城网格已扫尽,该类仍偏少", "notify_admin",
|
||
status="escalated", related_task_id=task_id)
|
||
await sa_append_step(run_id, {
|
||
"step": step, "action": "escalate", "cat": cat,
|
||
"reason": f"「{cat}」全城网格扫尽仅{cur}条,已开工单并通知管理员,继续驻守"})
|
||
|
||
|
||
def _enrich_targets(limit: int) -> list[dict]:
|
||
"""未富集过的 Place + 锚点(名/址/区/类) + 现有软字段。
|
||
|
||
取数显式排除 经纬度/电话(隐私红线,绝不外发给蒸馏模型)。
|
||
"""
|
||
g = _get_graph()
|
||
try:
|
||
rs = g.query(
|
||
"MATCH (p:Place) WHERE p.enrich_done IS NULL "
|
||
"RETURN p.element_id, p.name, coalesce(p.place_type,''), "
|
||
"coalesce(p.district,''), coalesce(p.address,''), "
|
||
"coalesce(p.summary,''), coalesce(p.history,''), "
|
||
"coalesce(p.features,''), coalesce(p.suitable_for,''), "
|
||
"coalesce(p.best_season,''), coalesce(p.ticket_hint,'') "
|
||
"LIMIT $n", {"n": limit}).result_set
|
||
except Exception:
|
||
return []
|
||
out = []
|
||
for r in rs:
|
||
if not (r and r[1]):
|
||
continue
|
||
existing = {k: v for k, v in zip(
|
||
ATTR_FIELDS, [r[5], r[6], r[7], r[8], r[9], r[10]]) if v}
|
||
out.append({"eid": r[0], "name": r[1], "place_type": r[2],
|
||
"district": r[3], "address": r[4],
|
||
"existing": existing})
|
||
return out
|
||
|
||
|
||
def _apply_enrich(eid: str, fields: dict) -> None:
|
||
"""共识字段写回 FalkorDB 节点;无论是否有字段都打 enrich_done 防重复空跑。"""
|
||
g = _get_graph()
|
||
sets = ["p.enrich_done=1"]
|
||
params = {"eid": eid}
|
||
for k in ATTR_FIELDS:
|
||
if fields.get(k):
|
||
sets.append(f"p.{k}=${k}")
|
||
params[k] = fields[k]
|
||
try:
|
||
g.query(f"MATCH (p:Place {{element_id:$eid}}) SET {','.join(sets)}",
|
||
params)
|
||
except Exception:
|
||
pass
|
||
|
||
|
||
async def _distill_enrich(run_id: int, step: int,
|
||
targets: list[dict]) -> bool:
|
||
"""工具:多模型知识蒸馏,给已有实体补"知识层"属性。
|
||
|
||
独立数据来源(不是高德质检闸门):问 N 个模型脑内知识 → 全局模型聚合
|
||
跨模型共识 → 写回 FalkorDB 节点 + 候选 payload(审计可溯)。
|
||
"""
|
||
enriched = adopt_fields = keep_total = conflict_total = uncertain_total = 0
|
||
conflict_names: list[str] = []
|
||
last = ""
|
||
for t in targets:
|
||
res = await distill_entity(t)
|
||
last = res.get("summary", "")
|
||
if not res.get("ok"):
|
||
# 配置/连通问题:整批中止——不打标记(待配置后重试)、不空转
|
||
await sa_add_task(
|
||
run_id, step, "蒸馏", "distill",
|
||
f"知识蒸馏未就绪:{last}", "distill_enrich",
|
||
result={"fetched": 0, "approved": 0, "pending": 0,
|
||
"skipped": 0}, status="skipped", note=last)
|
||
await sa_append_step(run_id, {
|
||
"step": step, "action": "distill",
|
||
"reason": f"知识蒸馏未配置/不可用({last}),本步跳过,"
|
||
f"配好≥2蒸馏模型+全局聚合后自动恢复"})
|
||
return False
|
||
|
||
adopt = res.get("adopt") or {}
|
||
_apply_enrich(t["eid"], adopt) # 只写 adopt;无则仅打标记防空转
|
||
if adopt:
|
||
await sa_merge_candidate_payload(t["eid"], adopt)
|
||
enriched += 1
|
||
adopt_fields += len(adopt)
|
||
keep_total += len(res.get("keep") or [])
|
||
uncertain_total += len(res.get("uncertain") or [])
|
||
|
||
# 与图谱既有值矛盾 → 不覆盖,落 validation_issues 转人工
|
||
for c in (res.get("conflict") or []):
|
||
await sa_record_conflict(
|
||
t["eid"], c.get("field", ""), c.get("existing", ""),
|
||
c.get("distilled", ""), c.get("note", ""))
|
||
conflict_total += 1
|
||
if t.get("name"):
|
||
conflict_names.append(f"{t['name']}·{c.get('field','')}")
|
||
|
||
res_obj = {"fetched": len(targets), "approved": adopt_fields,
|
||
"pending": conflict_total, "skipped": uncertain_total}
|
||
await sa_add_task(
|
||
run_id, step, "蒸馏", "distill",
|
||
f"多模型知识蒸馏富集 {len(targets)} 个实体({last})",
|
||
"distill_enrich", result=res_obj, status="done",
|
||
note=f"富集{enriched}实体/{adopt_fields}字段·一致{keep_total}"
|
||
f"·矛盾{conflict_total}·存疑{uncertain_total}")
|
||
await sa_append_step(run_id, {
|
||
"step": step, "action": "distill",
|
||
"reason": f"知识蒸馏富集:{enriched}/{len(targets)} 实体补 {adopt_fields} 字段,"
|
||
f"矛盾 {conflict_total} 转人工({last})"})
|
||
|
||
if conflict_total:
|
||
try:
|
||
uid = await get_admin_user_id()
|
||
if uid:
|
||
await create_notification(
|
||
uid,
|
||
title=f"蒸馏发现 {conflict_total} 处与图谱既有数据矛盾",
|
||
body=("需人工裁决(蒸馏未覆盖图谱):"
|
||
+ "、".join(conflict_names[:8])
|
||
+ ("…" if len(conflict_names) > 8 else "")
|
||
+ "。详见 数据质量 / 校验问题(distill_conflict)。"),
|
||
ntype="task")
|
||
except Exception:
|
||
pass
|
||
return True
|
||
|
||
|
||
def _web_targets(limit: int) -> list[dict]:
|
||
"""未联网富集过的 Place(无 web_done)+ 锚点 + 现有软字段。
|
||
取数显式排除 经纬度/电话(隐私红线)。"""
|
||
g = _get_graph()
|
||
try:
|
||
# 景点最可能有公开百科页 → 优先联网富集,命中率最高
|
||
rs = g.query(
|
||
"MATCH (p:Place) WHERE p.web_done IS NULL "
|
||
"RETURN p.element_id, p.name, coalesce(p.place_type,''), "
|
||
"coalesce(p.district,''), coalesce(p.address,''), "
|
||
"coalesce(p.summary,''), coalesce(p.history,''), "
|
||
"coalesce(p.features,''), coalesce(p.suitable_for,''), "
|
||
"coalesce(p.best_season,''), coalesce(p.ticket_hint,'') "
|
||
"ORDER BY CASE WHEN p.place_type='sight' THEN 0 ELSE 1 END, "
|
||
"p.element_id LIMIT $n", {"n": limit}).result_set
|
||
except Exception:
|
||
return []
|
||
out = []
|
||
for r in rs:
|
||
if not (r and r[1]):
|
||
continue
|
||
existing = {k: v for k, v in zip(
|
||
ATTR_FIELDS, [r[5], r[6], r[7], r[8], r[9], r[10]]) if v}
|
||
out.append({"eid": r[0], "name": r[1], "place_type": r[2],
|
||
"district": r[3], "address": r[4], "existing": existing})
|
||
return out
|
||
|
||
|
||
def _apply_web(eid: str, adopt: dict) -> None:
|
||
"""网页采纳字段写回;打 web_done(必);有 adopt 则连 enrich_done 一并打
|
||
(网页权威,省一次记忆蒸馏)。"""
|
||
g = _get_graph()
|
||
sets = ["p.web_done=1"]
|
||
params = {"eid": eid}
|
||
if adopt:
|
||
sets.append("p.enrich_done=1")
|
||
for k in ATTR_FIELDS:
|
||
if adopt.get(k):
|
||
sets.append(f"p.{k}=${k}")
|
||
params[k] = adopt[k]
|
||
try:
|
||
g.query(f"MATCH (p:Place {{element_id:$eid}}) SET {','.join(sets)}",
|
||
params)
|
||
except Exception:
|
||
pass
|
||
|
||
|
||
async def _web_enrich(run_id: int, step: int, targets: list[dict]) -> bool:
|
||
"""工具:browser-use 式联网采集(真浏览器抓权威页 → opus 抽取对齐)。"""
|
||
enriched = adopt_fields = found = conflict_total = gap_total = 0
|
||
last = ""
|
||
for t in targets:
|
||
r = await web_enrich(t)
|
||
last = r.get("summary", "")
|
||
if not r.get("ok"):
|
||
await sa_add_task(run_id, step, "联网", "web",
|
||
f"web_agent 未就绪:{last}", "web_agent",
|
||
result={"fetched": 0, "approved": 0,
|
||
"pending": 0, "skipped": 0},
|
||
status="skipped", note=last)
|
||
await sa_append_step(run_id, {
|
||
"step": step, "action": "web",
|
||
"reason": f"web_agent 未配置/不可用({last}),跳过待恢复"})
|
||
return False
|
||
if not r.get("found") or r.get("entity_match") is False:
|
||
_apply_web(t["eid"], {}) # 标记 web_done,不再重复抓
|
||
continue
|
||
found += 1
|
||
adopt = r.get("adopt") or {}
|
||
_apply_web(t["eid"], adopt)
|
||
if adopt:
|
||
await sa_merge_candidate_payload(t["eid"], adopt)
|
||
enriched += 1
|
||
adopt_fields += len(adopt)
|
||
for c in (r.get("conflict") or []):
|
||
await sa_record_conflict(t["eid"], c.get("field", ""),
|
||
c.get("existing", ""),
|
||
c.get("web", ""), c.get("note", ""))
|
||
conflict_total += 1
|
||
for sgap in (r.get("schema_gaps") or []):
|
||
iid = await sa_record_schema_proposal(
|
||
sgap.get("attr", ""), sgap.get("field", "")
|
||
or re.sub(r"\W+", "_", sgap.get("attr", "")).strip("_"),
|
||
str(sgap.get("value", ""))[:200],
|
||
f"web_agent 在「{t['name']}」网页发现:{sgap.get('why','')}",
|
||
float(r.get("confidence") or 0.7))
|
||
if iid:
|
||
gap_total += 1
|
||
|
||
res_obj = {"fetched": len(targets), "approved": adopt_fields,
|
||
"pending": conflict_total, "skipped": gap_total}
|
||
await sa_add_task(
|
||
run_id, step, "联网", "web",
|
||
f"browser-use 联网采集 {len(targets)} 个实体({last})",
|
||
"web_agent", result=res_obj, status="done",
|
||
note=f"命中{found}·补{adopt_fields}字段·矛盾{conflict_total}"
|
||
f"·schema提案{gap_total}")
|
||
await sa_append_step(run_id, {
|
||
"step": step, "action": "web",
|
||
"reason": f"联网采集:命中 {found}/{len(targets)},补 {adopt_fields} 字段,"
|
||
f"矛盾 {conflict_total} 转人工,schema 提案 {gap_total}({last})"})
|
||
if conflict_total or gap_total:
|
||
try:
|
||
uid = await get_admin_user_id()
|
||
if uid:
|
||
await create_notification(
|
||
uid,
|
||
title=f"web_agent:{conflict_total} 矛盾 / {gap_total} schema 提案待裁决",
|
||
body="联网采集与图谱矛盾或发现新属性,详见 数据质量/校验问题 "
|
||
"与 本体建模/字段提案。",
|
||
ntype="task")
|
||
except Exception:
|
||
pass
|
||
return True
|
||
|
||
|
||
def _xhs_targets(limit: int) -> list[dict]:
|
||
"""未采过小红书(无 xhs_done)的 Place;美食最优先(小红书食/玩为主)。"""
|
||
g = _get_graph()
|
||
try:
|
||
rs = g.query(
|
||
"MATCH (p:Place) WHERE p.xhs_done IS NULL "
|
||
"RETURN p.element_id, p.name, coalesce(p.place_type,''), "
|
||
"coalesce(p.district,'') "
|
||
"ORDER BY CASE WHEN p.place_type='eat' THEN 0 "
|
||
"WHEN p.place_type='sight' THEN 1 ELSE 2 END, p.element_id "
|
||
"LIMIT $n", {"n": limit}).result_set
|
||
except Exception:
|
||
return []
|
||
return [{"eid": r[0], "name": r[1], "place_type": r[2], "district": r[3]}
|
||
for r in rs if r and r[1]]
|
||
|
||
|
||
def _apply_xhs(eid: str, tags: list[str]) -> None:
|
||
"""体验标签写回:MERGE ExperienceTag + (Place)-[:HAS_TAG]->(tag);
|
||
无论有无标签都打 xhs_done 防重复。"""
|
||
g = _get_graph()
|
||
try:
|
||
g.query("MATCH (p:Place {element_id:$eid}) SET p.xhs_done=1",
|
||
{"eid": eid})
|
||
for t in tags:
|
||
g.query(
|
||
"MATCH (p:Place {element_id:$eid}) "
|
||
"MERGE (e:ExperienceTag {name:$t}) "
|
||
"MERGE (p)-[:HAS_TAG]->(e) SET e.source='xiaohongshu'",
|
||
{"eid": eid, "t": t})
|
||
except Exception:
|
||
pass
|
||
|
||
|
||
async def _xhs_enrich(run_id: int, step: int, targets: list[dict]) -> bool:
|
||
"""工具:小红书 UGC → 体验标签。未登录→升级人工(一次性登录),本轮关闭。"""
|
||
done = tags_total = found = ev_total = 0
|
||
last = ""
|
||
for t in targets:
|
||
r = await xhs_enrich(t)
|
||
last = r.get("summary", "")
|
||
if not r.get("ok"):
|
||
await sa_add_task(run_id, step, "小红书", "xhs",
|
||
f"xhs_agent 未就绪:{last}", "xhs_agent",
|
||
result={"fetched": 0, "approved": 0,
|
||
"pending": 0, "skipped": 0},
|
||
status="skipped", note=last)
|
||
await sa_append_step(run_id, {"step": step, "action": "xhs",
|
||
"reason": f"小红书未配置/停用({last})"})
|
||
return False
|
||
if r.get("need_login"):
|
||
if not await sa_has_open_escalation("小红书登录"):
|
||
try:
|
||
task = await create_acquisition_task({
|
||
"tenant_id": settings.default_tenant,
|
||
"project_id": settings.default_project,
|
||
"created_by": "super_agent",
|
||
"title": "【Super Agent 求助】小红书需一次性人工登录",
|
||
"description": "后台 xhs_agent 检测到未登录。请在项目根目录运行 "
|
||
"`python3 scripts/xhs_login.py`,弹出浏览器里登录"
|
||
"小红书一次;cookie 持久化后馆长自动恢复采集。",
|
||
"scenario_tags": ["super_agent", "escalation", "小红书登录"],
|
||
"target_entity_types": ["Place"], "target_fields": [],
|
||
"suggested_collection_method": "manual_login",
|
||
"priority": 1})
|
||
uid = await get_admin_user_id()
|
||
if uid:
|
||
await create_notification(
|
||
uid, title="小红书需一次性登录",
|
||
body="运行 scripts/xhs_login.py 登录一次即可,馆长自动恢复。",
|
||
ntype="task", related_task_id=task["id"])
|
||
except Exception:
|
||
pass
|
||
await sa_add_task(run_id, step, "小红书", "xhs",
|
||
"小红书未登录,已升级人工一次性登录", "notify_admin",
|
||
result={"fetched": 0, "approved": 0,
|
||
"pending": 0, "skipped": 0},
|
||
status="escalated", note="待 scripts/xhs_login.py")
|
||
await sa_append_step(run_id, {
|
||
"step": step, "action": "xhs",
|
||
"reason": "小红书未登录 → 已开工单+通知(运行 xhs_login.py),本轮暂停小红书"})
|
||
return False
|
||
ev_total += int(r.get("evidence_saved") or 0)
|
||
if r.get("found"):
|
||
_apply_xhs(t["eid"], r.get("tags") or [])
|
||
found += 1
|
||
if r.get("tags"):
|
||
done += 1
|
||
tags_total += len(r["tags"])
|
||
else:
|
||
_apply_xhs(t["eid"], []) # 标记,避免重复
|
||
res_obj = {"fetched": len(targets), "approved": tags_total,
|
||
"pending": ev_total, "skipped": len(targets) - found}
|
||
await sa_add_task(run_id, step, "小红书", "xhs",
|
||
f"小红书 UGC 采集 {len(targets)} 个实体({last})",
|
||
"xhs_agent", result=res_obj, status="done",
|
||
note=f"命中{found}·证据{ev_total}条·体验标签{tags_total}")
|
||
await sa_append_step(run_id, {
|
||
"step": step, "action": "xhs",
|
||
"reason": f"小红书采集:命中 {found}/{len(targets)},证据入库 {ev_total} 条,"
|
||
f"产出 {tags_total} 个体验标签({last})"})
|
||
return True
|
||
|
||
|
||
def _dy_targets(limit: int) -> list[dict]:
|
||
"""未采过抖音(无 dy_done)的 Place;美食/景点优先(抖音食玩为主)。"""
|
||
g = _get_graph()
|
||
try:
|
||
rs = g.query(
|
||
"MATCH (p:Place) WHERE p.dy_done IS NULL "
|
||
"RETURN p.element_id, p.name, coalesce(p.place_type,''), "
|
||
"coalesce(p.district,'') "
|
||
"ORDER BY CASE WHEN p.place_type='eat' THEN 0 "
|
||
"WHEN p.place_type='sight' THEN 1 ELSE 2 END, p.element_id "
|
||
"LIMIT $n", {"n": limit}).result_set
|
||
except Exception:
|
||
return []
|
||
return [{"eid": r[0], "name": r[1], "place_type": r[2], "district": r[3]}
|
||
for r in rs if r and r[1]]
|
||
|
||
|
||
def _apply_dy(eid: str, tags: list[str]) -> None:
|
||
g = _get_graph()
|
||
try:
|
||
g.query("MATCH (p:Place {element_id:$eid}) SET p.dy_done=1",
|
||
{"eid": eid})
|
||
for t in tags:
|
||
g.query(
|
||
"MATCH (p:Place {element_id:$eid}) "
|
||
"MERGE (e:ExperienceTag {name:$t}) "
|
||
"MERGE (p)-[:HAS_TAG]->(e) SET e.source='douyin'",
|
||
{"eid": eid, "t": t})
|
||
except Exception:
|
||
pass
|
||
|
||
|
||
async def _douyin_enrich(run_id: int, step: int,
|
||
targets: list[dict]) -> bool:
|
||
"""工具:抖音 UGC → 证据层+体验标签。未登录→升级一次性登录,本轮关闭。"""
|
||
done = tags_total = found = ev_total = 0
|
||
last = ""
|
||
for t in targets:
|
||
r = await douyin_enrich(t)
|
||
last = r.get("summary", "")
|
||
if not r.get("ok"):
|
||
await sa_add_task(run_id, step, "抖音", "douyin",
|
||
f"douyin_agent 未就绪:{last}", "douyin_agent",
|
||
result={"fetched": 0, "approved": 0,
|
||
"pending": 0, "skipped": 0},
|
||
status="skipped", note=last)
|
||
await sa_append_step(run_id, {"step": step, "action": "douyin",
|
||
"reason": f"抖音未配置/停用({last})"})
|
||
return False
|
||
if r.get("need_login"):
|
||
if not await sa_has_open_escalation("抖音登录"):
|
||
try:
|
||
task = await create_acquisition_task({
|
||
"tenant_id": settings.default_tenant,
|
||
"project_id": settings.default_project,
|
||
"created_by": "super_agent",
|
||
"title": "【Super Agent 求助】抖音需一次性人工登录",
|
||
"description": "后台 douyin_agent 检测到未登录。请在项目根目录"
|
||
"运行 `python3 scripts/douyin_login.py`,弹出"
|
||
"浏览器里登录抖音一次;cookie 持久化后自动恢复。",
|
||
"scenario_tags": ["super_agent", "escalation",
|
||
"抖音登录"],
|
||
"target_entity_types": ["Place"], "target_fields": [],
|
||
"suggested_collection_method": "manual_login",
|
||
"priority": 1})
|
||
uid = await get_admin_user_id()
|
||
if uid:
|
||
await create_notification(
|
||
uid, title="抖音需一次性登录",
|
||
body="运行 scripts/douyin_login.py 登录一次即可,"
|
||
"馆长自动恢复。",
|
||
ntype="task", related_task_id=task["id"])
|
||
except Exception:
|
||
pass
|
||
await sa_add_task(run_id, step, "抖音", "douyin",
|
||
"抖音未登录,已升级人工一次性登录", "notify_admin",
|
||
result={"fetched": 0, "approved": 0,
|
||
"pending": 0, "skipped": 0},
|
||
status="escalated",
|
||
note="待 scripts/douyin_login.py")
|
||
await sa_append_step(run_id, {
|
||
"step": step, "action": "douyin",
|
||
"reason": "抖音未登录 → 已开工单+通知(运行 douyin_login.py),"
|
||
"本轮暂停抖音"})
|
||
return False
|
||
ev_total += int(r.get("evidence_saved") or 0)
|
||
if r.get("found"):
|
||
_apply_dy(t["eid"], r.get("tags") or [])
|
||
found += 1
|
||
if r.get("tags"):
|
||
done += 1
|
||
tags_total += len(r["tags"])
|
||
else:
|
||
_apply_dy(t["eid"], [])
|
||
res_obj = {"fetched": len(targets), "approved": tags_total,
|
||
"pending": ev_total, "skipped": len(targets) - found}
|
||
await sa_add_task(run_id, step, "抖音", "douyin",
|
||
f"抖音 UGC 采集 {len(targets)} 个实体({last})",
|
||
"douyin_agent", result=res_obj, status="done",
|
||
note=f"命中{found}·证据{ev_total}条·体验标签{tags_total}")
|
||
await sa_append_step(run_id, {
|
||
"step": step, "action": "douyin",
|
||
"reason": f"抖音采集:命中 {found}/{len(targets)},证据入库 {ev_total} 条,"
|
||
f"产出 {tags_total} 个体验标签({last})"})
|
||
return True
|
||
|
||
|
||
def _events_targets(limit: int) -> list[dict]:
|
||
"""已采过任何源(web/xhs/dy) 但还没抽过事件(无 events_done)的 Place。
|
||
|
||
平台无关:只要 social_evidence 里有该 pnk 的证据(任意 platform),
|
||
都可以挖事件,不再硬要求 xhs_done。
|
||
"""
|
||
g = _get_graph()
|
||
try:
|
||
rs = g.query(
|
||
"MATCH (p:Place) "
|
||
"WHERE (p.web_done=1 OR p.xhs_done=1 OR p.dy_done=1) "
|
||
" AND p.events_done IS NULL "
|
||
"RETURN p.element_id, p.name LIMIT $n", {"n": limit}).result_set
|
||
except Exception:
|
||
return []
|
||
return [{"eid": r[0], "name": r[1]} for r in rs if r and r[1]]
|
||
|
||
|
||
def _apply_events(eid: str, events: list[dict]) -> None:
|
||
"""事件写回:MERGE Event + (Place)-[:HAS_EVENT{time,type}]->(Event)。
|
||
|
||
- source 不再硬编 'xiaohongshu',按事件实际来源(baike/wiki/xhs/douyin)写
|
||
- 加 time_norm(排序用) / participants(涉及人物) / confidence(置信度)
|
||
- MERGE 用 (place,title) 幂等,同名事件不会重复
|
||
- 无论有无都打 events_done 防重复整批
|
||
"""
|
||
g = _get_graph()
|
||
try:
|
||
g.query("MATCH (p:Place {element_id:$eid}) SET p.events_done=1",
|
||
{"eid": eid})
|
||
for e in events:
|
||
g.query(
|
||
"MATCH (p:Place {element_id:$eid}) "
|
||
"MERGE (ev:Event {place:$eid, title:$t}) "
|
||
"SET ev.time=$tm, ev.time_norm=$tn, ev.type=$ty, "
|
||
" ev.desc=$d, ev.source=$src, ev.participants=$ppl, "
|
||
" ev.confidence=$conf "
|
||
"MERGE (p)-[:HAS_EVENT {time:$tn, type:$ty}]->(ev)",
|
||
{"eid": eid, "t": e.get("title", ""),
|
||
"tm": e.get("time", ""),
|
||
"tn": e.get("time_norm", "") or e.get("time", ""),
|
||
"ty": e.get("type", ""), "d": e.get("desc", ""),
|
||
"src": e.get("source_platform", "") or "mixed",
|
||
"ppl": ",".join(e.get("participants") or []),
|
||
"conf": float(e.get("confidence") or 0)})
|
||
except Exception:
|
||
pass
|
||
|
||
|
||
async def _event_mine(run_id: int, step: int, targets: list[dict]) -> bool:
|
||
"""工具:从证据层评论/帖子抽时间锚定事件(纯 LLM,不爬网)。"""
|
||
done = ev_total = 0
|
||
last = ""
|
||
for t in targets:
|
||
r = await mine_events(t)
|
||
last = r.get("summary", "")
|
||
if not r.get("ok"):
|
||
await sa_add_task(run_id, step, "事件", "event",
|
||
f"event_miner 未就绪:{last}", "event_miner",
|
||
result={"fetched": 0, "approved": 0,
|
||
"pending": 0, "skipped": 0},
|
||
status="skipped", note=last)
|
||
await sa_append_step(run_id, {"step": step, "action": "event",
|
||
"reason": f"事件抽取未配置({last})"})
|
||
return False
|
||
evs = r.get("events") or []
|
||
_apply_events(t["eid"], evs)
|
||
if evs:
|
||
done += 1
|
||
ev_total += len(evs)
|
||
res_obj = {"fetched": len(targets), "approved": ev_total,
|
||
"pending": 0, "skipped": len(targets) - done}
|
||
await sa_add_task(run_id, step, "事件", "event",
|
||
f"评论时间→事件抽取 {len(targets)} 个实体({last})",
|
||
"event_miner", result=res_obj, status="done",
|
||
note=f"命中{done}·事件{ev_total}")
|
||
await sa_append_step(run_id, {
|
||
"step": step, "action": "event",
|
||
"reason": f"事件抽取:{done}/{len(targets)} 实体,"
|
||
f"产出 {ev_total} 个时间锚定事件({last})"})
|
||
return True
|
||
|
||
|
||
async def run_super_agent(run_id: int) -> None:
|
||
await _seed_if_needed()
|
||
step = 0
|
||
api_calls = 0
|
||
enrich_off = False # 记忆蒸馏未配置/不可用时本轮关闭
|
||
web_off = False # web_agent 未就绪时本轮关闭,避免空转
|
||
xhs_off = False # 小红书未配置/未登录时本轮关闭
|
||
dy_off = False # 抖音未配置/未登录时本轮关闭
|
||
event_off = False # 事件抽取未配置时本轮关闭
|
||
enrich_turn = 0 # 富集源轮转(web/xhs/douyin/event/distill 公平)
|
||
escalated: set[str] = set()
|
||
|
||
async def _wait_steward() -> bool:
|
||
waited = 0
|
||
while waited < _STEWARD_INTERVAL:
|
||
if await sa_stop_requested(run_id):
|
||
return True
|
||
await asyncio.sleep(_STEWARD_TICK)
|
||
waited += _STEWARD_TICK
|
||
return False
|
||
|
||
try:
|
||
while True:
|
||
if await sa_stop_requested(run_id):
|
||
await sa_finish(run_id, "stopped")
|
||
return
|
||
|
||
# ── 指挥大脑选工具 ───────────────────────────────────────
|
||
# 高德网格法 = 主力快采(最快、结构化、带坐标);
|
||
# 多模型蒸馏 = 知识富集(补高德给不了的知识层,独立来源)。
|
||
# 网格忙时每 _ENRICH_EVERY 步插一次富集;网格扫完后全力富集。
|
||
pend = await grid_pending_cats()
|
||
grid_busy = (bool(pend) and api_calls < _MAX_API_PER_RUN
|
||
and step < _HARD_MAX_STEPS)
|
||
|
||
# 富集插槽:web/小红书/抖音/事件/记忆蒸馏 **轮转调度**
|
||
# (避免某源目标海量把其它源饿死,多源公平持续推进)
|
||
if ((not web_off or not xhs_off or not dy_off or not event_off
|
||
or not enrich_off)
|
||
and ((not grid_busy)
|
||
or (step > 0 and step % _ENRICH_EVERY == 0))):
|
||
pool = ([("web", _web_targets, _WEB_BATCH, _web_enrich)]
|
||
if not web_off else [])
|
||
pool += ([("xhs", _xhs_targets, _XHS_BATCH, _xhs_enrich)]
|
||
if not xhs_off else [])
|
||
pool += ([("douyin", _dy_targets, 1, _douyin_enrich)]
|
||
if not dy_off else [])
|
||
pool += ([("event", _events_targets, 3, _event_mine)]
|
||
if not event_off else [])
|
||
pool += ([("distill", _enrich_targets, _ENRICH_BATCH,
|
||
_distill_enrich)] if not enrich_off else [])
|
||
hit = False
|
||
for i in range(len(pool)):
|
||
name, tfn, batch, efn = pool[(enrich_turn + i) % len(pool)]
|
||
tg = await asyncio.to_thread(tfn, batch)
|
||
if not tg:
|
||
continue
|
||
enrich_turn += 1
|
||
step += 1
|
||
ok = await efn(run_id, step, tg)
|
||
if not ok: # 未就绪 → 本轮关该源
|
||
if name == "web":
|
||
web_off = True
|
||
elif name == "xhs":
|
||
xhs_off = True
|
||
elif name == "douyin":
|
||
dy_off = True
|
||
elif name == "event":
|
||
event_off = True
|
||
else:
|
||
enrich_off = True
|
||
break
|
||
hit = True
|
||
break
|
||
if hit:
|
||
await asyncio.sleep(_API_PACING)
|
||
continue
|
||
|
||
if not grid_busy:
|
||
# 网格扫完且无可富集 → 驻守(不烧额度,下轮自动续,可一键停止)
|
||
reason = ("全城网格已扫完、知识富集也已补全,进入驻守巡检"
|
||
"(不再消耗额度,新数据/工单结清后自动恢复,可停止)"
|
||
if not pend else
|
||
"达单进程安全上限,转驻守;下次启动自动从断点续扫")
|
||
await sa_set_status(run_id, "stewarding")
|
||
await sa_append_step(run_id, {"step": step,
|
||
"action": "steward",
|
||
"reason": reason})
|
||
if await _wait_steward():
|
||
await sa_finish(run_id, "stopped")
|
||
return
|
||
api_calls = 0 # 新驻守周期重置进程内计数
|
||
continue
|
||
|
||
# 聚焦:仍有待扫格的类里,挑当前藏品最少的(均衡推进)
|
||
cov = {i["cat"]: i["current"] for i in _coverage()["items"]}
|
||
focus = min(pend, key=lambda c: cov.get(c, 0))
|
||
cell = await grid_take_next(focus)
|
||
if not cell:
|
||
continue
|
||
|
||
tc = cell["typecode"]
|
||
page = cell["next_page"]
|
||
bbox = (cell["min_lng"], cell["min_lat"],
|
||
cell["max_lng"], cell["max_lat"])
|
||
step += 1
|
||
try:
|
||
rows, raw = await asyncio.to_thread(
|
||
search_polygon, tc, bbox, page, _OFFSET)
|
||
except Exception as e:
|
||
rows, raw = [], -1
|
||
err = str(e)[:140]
|
||
else:
|
||
err = None
|
||
api_calls += 1
|
||
|
||
res = await ingest_rows(rows, focus, focus) if rows else \
|
||
{"fetched": 0, "approved": 0, "pending": 0, "skipped": 0}
|
||
if err:
|
||
res["error"] = err
|
||
|
||
area = (f"经[{bbox[0]:.3f},{bbox[2]:.3f}] "
|
||
f"纬[{bbox[1]:.3f},{bbox[3]:.3f}]")
|
||
# 状态机:扫尽 / 翻页 / 自适应细分(密集区 8 页仍满→四叉细分,防 POI 丢)
|
||
if err:
|
||
# 单格请求出错:不前进、不标尽,留待后续重试(不浪费已得)
|
||
nstatus, npage, note = "pending", page, f"请求失败重试 {err}"
|
||
elif raw <= 0 or raw < _OFFSET:
|
||
nstatus, npage, note = "exhausted", page, f"网格扫尽(本页{raw}条)"
|
||
elif page >= _MAX_PAGE_SPLIT and cell["depth"] < _MAX_DEPTH:
|
||
await grid_subdivide(cell["id"], focus, tc,
|
||
_quads(cell), cell["depth"] + 1)
|
||
nstatus, npage, note = None, page, "稠密→四叉细分深扫"
|
||
elif page >= _MAX_PAGE_SPLIT:
|
||
nstatus, npage, note = "exhausted", page, "达最深层,止于该格"
|
||
else:
|
||
nstatus, npage, note = "pending", page + 1, "本页满,续下一页"
|
||
|
||
if nstatus is not None:
|
||
await grid_update(cell["id"], npage,
|
||
res.get("approved", 0) + res.get("pending", 0),
|
||
nstatus)
|
||
|
||
reason = f"全城网格扫描 {area} 第{page}页({focus}/{tc})"
|
||
await sa_add_task(run_id, step, focus, "ingest", reason,
|
||
"gaode_grid", result=res, status="done",
|
||
note=f"格#{cell['id']} d{cell['depth']} p{page}·{note}")
|
||
await sa_append_step(run_id, {
|
||
"step": step,
|
||
"plan": {"poi_type": focus, "keyword": f"网格 {area} p{page}",
|
||
"reason": reason},
|
||
"result": res,
|
||
}, ingested_delta=res.get("approved", 0))
|
||
|
||
# 该类全城扫尽却仍偏少 → 求助管理员(去重,馆长不停)
|
||
if focus not in escalated:
|
||
gc = await grid_counts()
|
||
g = gc.get(focus, {})
|
||
if g.get("total") and g["done"] >= g["total"]:
|
||
maxc = max(cov.values()) if cov else 0
|
||
if cov.get(focus, 0) < max(0.25 * maxc, 30):
|
||
await _escalate(run_id, step, focus, cov.get(focus, 0))
|
||
escalated.add(focus)
|
||
|
||
await asyncio.sleep(_API_PACING)
|
||
except Exception as e: # noqa: BLE001
|
||
await sa_finish(run_id, "error", str(e)[:240])
|
||
|
||
|
||
def schedule_super_agent(run_id: int) -> None:
|
||
t = asyncio.create_task(run_super_agent(run_id))
|
||
_BG.add(t)
|
||
t.add_done_callback(_BG.discard)
|