Files
bxh/app/agents/aligner.py

169 lines
5.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""STEP 05 — Entity Alignment Agent.
Given new candidate entities, determines whether they should be:
- merged into an existing entity (duplicate detection)
- created as new
- rejected outright
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Literal
from app.config import settings
from app.db import list_vocabulary_terms, get_conn
from app.llm_client import LlmClient
# System prompt (Chinese) sent with every alignment request. It casts the model
# as a knowledge-graph entity-normalization expert that decides whether a new
# candidate is an alias/duplicate of an existing entity or a brand-new one,
# based on four criteria: name similarity (including aliases), type
# consistency, geographic overlap, and overlap of core fields. It defines the
# three actions (merge / create / reject) and demands JSON-only output.
# NOTE(review): criterion 4 has a closing ")" with no visible opening
# parenthesis before `opening_hours` — looks like a typo; confirm before fixing
# because this text is sent to the model verbatim.
SYSTEM_PROMPT = """你是一个知识图谱实体归一专家。给你一个新候选实体和一批已存在的相似实体,
判断新实体是已有实体的别名/重复,还是全新实体。输出 JSON。
判断依据:
1. 名称相似度(包含别名匹配)
2. 类型一致性(同类型更可能重复)
3. 地理位置重叠(地址/坐标接近)
4. 核心字段重叠opening_hours / contact 等)
动作:
- merge: 重复,合并到已有实体
- create: 全新实体,正常创建
- reject: 无效数据,拒绝
只输出 JSON。"""
# User-message template (Chinese), rendered with ``str.format``. The three
# placeholders receive JSON text: ``{new_candidate}`` (the candidate under
# review), ``{existing}`` (similar entities already stored) and
# ``{vocabulary}`` (canonical names plus aliases). The doubled braces
# ``{{`` / ``}}`` are format-escapes that render as the literal braces of the
# expected output schema.
USER_TEMPLATE = """新候选实体:
{new_candidate}
已存在相似实体:
{existing}
词汇表(标准名+别名):
{vocabulary}
输出 schema
{{
"candidate_id": int,
"action": "merge|create|reject",
"target_entity_id": "string|null",
"confidence": 0.0~1.0,
"reasons": ["理由1", "理由2"]
}}"""
@dataclass(frozen=True)
class AlignmentDecision:
    """Immutable verdict produced when aligning one candidate entity."""

    # Value of the candidate's "id" field this decision refers to.
    candidate_id: int
    # What to do with the candidate: merge it into an existing entity,
    # create it as a new entity, or reject it.
    action: Literal["merge", "create", "reject"]
    # Id of the existing entity to merge into; None when nothing is targeted
    # (e.g. the default "create" decision).
    target_entity_id: str | None
    # Confidence score for the decision; the prompt schema asks for 0.0~1.0.
    confidence: float
    # Human-readable justifications for the decision.
    reasons: list[str]
async def _find_similar_entities(candidate: dict) -> list[dict]:
    """Find existing entities whose name or payload resembles the candidate.

    The first five characters of the candidate's stripped ``natural_key`` are
    searched case-insensitively (``ILIKE``) in both ``natural_key`` and the
    text form of ``payload``. Only rows of the same ``entity_type`` with status
    ``published`` or ``approved`` in the default tenant/project are considered,
    and at most 10 rows are returned.

    Args:
        candidate: Candidate record; reads ``natural_key`` and ``entity_type``.

    Returns:
        The rows returned by the cursor (``id``, ``natural_key``,
        ``entity_type``, ``payload``), or an empty list when the candidate has
        no usable name. Rows are presumably dict-like — this depends on the
        connection's row factory; verify against ``get_conn``.
    """
    name = (candidate.get("natural_key") or "").strip()
    if not name:
        return []

    schema = settings.db_schema
    # Escape LIKE metacharacters so a name such as "100%_x" is matched
    # literally instead of acting as wildcards and over-matching.
    pattern = f"%{_escape_like(name[:5])}%"

    async with get_conn() as conn:
        async with conn.cursor() as cur:
            # The schema name comes from application settings (not from the
            # candidate); every candidate-derived value is bound as a parameter.
            await cur.execute(
                f"""SELECT id, natural_key, entity_type, payload
                FROM {schema}.candidate_entities
                WHERE tenant_id=%s AND project_id=%s
                AND entity_type=%s
                AND status IN ('published', 'approved')
                AND (natural_key ILIKE %s OR payload::text ILIKE %s)
                LIMIT 10""",
                (
                    settings.default_tenant,
                    settings.default_project,
                    candidate.get("entity_type", ""),
                    pattern,
                    pattern,
                ),
            )
            return await cur.fetchall()


def _escape_like(text: str) -> str:
    """Backslash-escape ``\\``, ``%`` and ``_`` so *text* matches literally in LIKE/ILIKE."""
    # Backslash is PostgreSQL's default LIKE escape character; it must be
    # escaped first so the escapes added for % and _ are not doubled.
    return text.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
async def align_single_candidate(
    candidate: dict,
    llm: LlmClient | None = None,
) -> AlignmentDecision:
    """Align a single candidate against existing entities and vocabulary.

    Strategy:
      1. Look up similar existing entities and the vocabulary terms for the
         candidate's entity type.
      2. If an LLM client is supplied, available, and there is at least one
         similar entity, ask the LLM for a decision. The reply is validated;
         an unusable reply or any failure is logged and the function falls
         through to step 3.
      3. Rule-based fallback: an existing entity whose ``natural_key`` equals
         the candidate's yields a ``merge`` (confidence 0.95); otherwise the
         result is ``create`` (confidence 0.6).

    Args:
        candidate: Candidate record; must contain ``id`` and may contain
            ``natural_key`` and ``entity_type``.
        llm: Optional LLM client. When ``None`` or unavailable only the
            rule-based path is used.

    Returns:
        The alignment decision for this candidate.

    Raises:
        KeyError: If ``candidate`` has no ``id`` key.
    """
    import json
    import logging

    log = logging.getLogger(__name__)
    cid = candidate["id"]

    similar = await _find_similar_entities(candidate)
    vocab_terms = await list_vocabulary_terms(
        settings.default_tenant,
        settings.default_project,
        candidate.get("entity_type"),
        limit=50,
    )

    if llm and llm.available() and similar:
        try:
            # Ids are stringified so non-JSON-native id types (e.g. UUID) do
            # not make json.dumps raise, and so they match the "string|null"
            # target_entity_id the prompt schema asks the model to return.
            prompt = USER_TEMPLATE.format(
                new_candidate=json.dumps(
                    {
                        "id": cid,
                        "name": candidate.get("natural_key"),
                        "type": candidate.get("entity_type"),
                    },
                    ensure_ascii=False,
                ),
                existing=json.dumps(
                    [
                        {"id": str(e["id"]), "name": e["natural_key"], "type": e["entity_type"]}
                        for e in similar
                    ],
                    ensure_ascii=False,
                ),
                vocabulary=json.dumps(
                    [
                        {"name": v["canonical_name"], "aliases": v.get("aliases", [])}
                        for v in vocab_terms
                    ],
                    ensure_ascii=False,
                ),
            )
            # NOTE(review): chat_json is called synchronously; if it performs
            # blocking network I/O it will stall the event loop — confirm.
            result = llm.chat_json(system=SYSTEM_PROMPT, user=prompt)
            decision = _decision_from_llm(cid, result, similar)
        except Exception:
            # Best-effort: the LLM path must never break alignment, but the
            # failure is recorded instead of being silently discarded.
            log.exception(
                "LLM alignment failed for candidate %s; using rule-based fallback", cid
            )
        else:
            if decision is not None:
                return decision
            log.warning(
                "Unusable LLM alignment reply for candidate %s: %r; using rule-based fallback",
                cid,
                result,
            )

    # Rule-based fallback: exact name match → merge
    for existing in similar:
        if existing["natural_key"] == candidate.get("natural_key"):
            return AlignmentDecision(
                candidate_id=cid,
                action="merge",
                target_entity_id=str(existing["id"]),
                confidence=0.95,
                reasons=["规则匹配: 名称完全一致"],
            )
    return AlignmentDecision(
        candidate_id=cid,
        action="create",
        target_entity_id=None,
        confidence=0.6,
        reasons=["无相似实体或LLM不可用"],
    )


def _decision_from_llm(cid, result, similar) -> AlignmentDecision | None:
    """Validate a raw LLM reply and build a decision; return None if it is unusable."""
    if not isinstance(result, dict):
        return None

    action = result.get("action", "create")
    if action not in ("merge", "create", "reject"):
        return None

    target = result.get("target_entity_id")
    if target is not None:
        target = str(target)
    if action == "merge":
        # A merge must point at one of the entities that were actually shown
        # to the model; anything else is a missing or invented id.
        known_ids = {str(e["id"]) for e in similar}
        if target not in known_ids:
            return None

    # float() may raise on junk input; the caller treats that as an LLM failure.
    confidence = min(1.0, max(0.0, float(result.get("confidence", 0.5))))

    raw_reasons = result.get("reasons") or []
    if not isinstance(raw_reasons, (list, tuple)):
        raw_reasons = [raw_reasons]

    return AlignmentDecision(
        candidate_id=cid,
        action=action,
        target_entity_id=target,
        confidence=confidence,
        reasons=[str(reason) for reason in raw_reasons],
    )
async def align_candidates(candidates: list[dict]) -> list[dict]:
    """Align every candidate in order and return the decisions as plain dicts.

    An LLM client is built from settings only when ``settings.llm_api_key`` is
    set; otherwise each candidate is aligned without one. Candidates are
    processed sequentially, one ``await`` at a time.

    Args:
        candidates: Candidate records to align.

    Returns:
        One dict per candidate, in input order, with the keys
        ``candidate_id``, ``action``, ``target_entity_id``, ``confidence``
        and ``reasons``.
    """
    client = None
    if settings.llm_api_key:
        client = LlmClient.from_settings()

    exported_fields = (
        "candidate_id",
        "action",
        "target_entity_id",
        "confidence",
        "reasons",
    )
    decisions = [await align_single_candidate(item, client) for item in candidates]
    return [
        {field: getattr(verdict, field) for field in exported_fields}
        for verdict in decisions
    ]