# bxh/app/agents/aligner.py

"""STEP 05 — Entity Alignment Agent.

Given new candidate entities, determines whether they should be:
- merged into an existing entity (duplicate detection)
- created as new
- rejected outright
"""
from __future__ import annotations

from dataclasses import dataclass
from typing import Literal

from app.config import settings
from app.db import list_vocabulary_terms, get_conn
from app.llm_client import LlmClient

# System prompt (written in Chinese) for the alignment LLM call. It frames the
# model as a knowledge-graph entity-normalization expert that must decide
# whether a new candidate is an alias/duplicate of an existing entity or a
# brand-new one. It lists four judgment criteria (name similarity including
# aliases, type consistency, geographic overlap, core-field overlap such as
# opening_hours / contact), defines the three actions (merge / create /
# reject), and instructs the model to output JSON only.
SYSTEM_PROMPT = """你是一个知识图谱实体归一专家。给你一个新候选实体和一批已存在的相似实体，
判断新实体是已有实体的别名/重复，还是全新实体。输出 JSON。

判断依据：
1. 名称相似度（包含别名匹配）
2. 类型一致性（同类型更可能重复）
3. 地理位置重叠（地址/坐标接近）
4. 核心字段重叠（opening_hours / contact 等）

动作：
- merge: 重复，合并到已有实体
- create: 全新实体，正常创建
- reject: 无效数据，拒绝

只输出 JSON。"""

# User-message template, filled with str.format(). Placeholders:
#   {new_candidate} - JSON text describing the new candidate entity
#   {existing}      - JSON text listing the similar existing entities
#   {vocabulary}    - JSON text listing vocabulary terms (canonical name + aliases)
# The doubled braces ({{ and }}) are str.format escapes that render as the
# literal braces of the output-schema example shown to the model.
USER_TEMPLATE = """新候选实体：
{new_candidate}

已存在相似实体：
{existing}

词汇表（标准名+别名）：
{vocabulary}

输出 schema：
{{
  "candidate_id": int,
  "action": "merge|create|reject",
  "target_entity_id": "string|null",
  "confidence": 0.0~1.0,
  "reasons": ["理由1", "理由2"]
}}"""


@dataclass(frozen=True)
class AlignmentDecision:
    """Outcome of aligning one candidate entity against existing entities.

    Instances are frozen, so fields cannot be reassigned after construction
    (the ``reasons`` list object itself is still a regular mutable list).
    """

    # Identifier of the candidate this decision is about (the candidate's "id").
    candidate_id: int
    # What to do with the candidate: merge into an existing entity, create it
    # as a new entity, or reject it.
    action: Literal["merge", "create", "reject"]
    # Id of the existing entity to merge into; None when there is no target.
    target_entity_id: str | None
    # Confidence score for the decision (the LLM prompt asks for 0.0~1.0).
    confidence: float
    # Human-readable justifications for the decision.
    reasons: list[str]


async def _find_similar_entities(candidate: dict) -> list[dict]:
    """Find existing entities whose name or payload resembles the candidate.

    The first five characters of the candidate's trimmed ``natural_key`` are
    used as a case-insensitive substring probe against ``natural_key`` and the
    textual form of ``payload``, restricted to the default tenant/project, the
    candidate's ``entity_type`` and rows with status ``published`` or
    ``approved``.

    Args:
        candidate: Candidate mapping; reads ``natural_key`` and ``entity_type``.

    Returns:
        Up to 10 rows as returned by the cursor's ``fetchall()`` (callers index
        them by column name, so a dict-style row factory is presumably
        configured - confirm against ``get_conn``). Rows whose ``natural_key``
        equals the trimmed candidate name are returned first. An empty list is
        returned when the candidate has no usable name.
    """
    name = (candidate.get("natural_key") or "").strip()
    if not name:
        # Nothing to search for; skip the database round-trip entirely.
        return []

    schema = settings.db_schema

    # Escape LIKE metacharacters so that '%', '_' or '\' appearing in an entity
    # name are matched literally instead of acting as wildcards/escapes.
    # Backslash is PostgreSQL's default LIKE escape character and must be
    # escaped first so the escapes added below are not doubled.
    probe = (
        name[:5]
        .replace("\\", "\\\\")
        .replace("%", "\\%")
        .replace("_", "\\_")
    )
    pattern = f"%{probe}%"

    async with get_conn() as conn, conn.cursor() as cur:
        await cur.execute(
            # ORDER BY puts exact-name rows first so the LIMIT can never
            # drop the row an exact-match merge rule depends on.
            f"""SELECT id, natural_key, entity_type, payload
            FROM {schema}.candidate_entities
            WHERE tenant_id=%s AND project_id=%s
            AND entity_type=%s
            AND status IN ('published', 'approved')
            AND (natural_key ILIKE %s OR payload::text ILIKE %s)
            ORDER BY (natural_key = %s) DESC
            LIMIT 10""",
            (
                settings.default_tenant,
                settings.default_project,
                candidate.get("entity_type", ""),
                pattern,
                pattern,
                name,
            ),
        )
        return await cur.fetchall()


def _parse_llm_decision(
    result: object,
    valid_target_ids: set[str],
) -> tuple[str, str | None, float, list[str]] | None:
    """Validate raw LLM output; return (action, target, confidence, reasons) or None if unusable."""
    if not isinstance(result, dict):
        return None

    action = result.get("action", "create")
    if action not in ("merge", "create", "reject"):
        return None

    target: str | None = None
    if action == "merge":
        # A merge is only meaningful against one of the entities the model was
        # actually shown; a missing or invented id makes the decision unusable.
        raw_target = result.get("target_entity_id")
        if raw_target is None or str(raw_target) not in valid_target_ids:
            return None
        target = str(raw_target)

    try:
        confidence = float(result.get("confidence", 0.5))
    except (TypeError, ValueError):
        confidence = 0.5
    confidence = min(max(confidence, 0.0), 1.0)

    raw_reasons = result.get("reasons") or []
    if isinstance(raw_reasons, str):
        reasons = [raw_reasons]
    elif isinstance(raw_reasons, (list, tuple)):
        reasons = [str(r) for r in raw_reasons]
    else:
        reasons = []

    return action, target, confidence, reasons


async def align_single_candidate(
    candidate: dict,
    llm: LlmClient | None = None,
) -> AlignmentDecision:
    """Align a single candidate against existing entities and vocabulary.

    Strategy:
      1. Look up similar existing entities.
      2. If an LLM client is supplied, reports itself available, and similar
         entities exist, ask the LLM for a decision and validate its answer.
      3. Otherwise, or when the LLM call fails or returns an unusable answer,
         fall back to rules: an exact ``natural_key`` match yields a merge,
         anything else yields a create.

    Args:
        candidate: Candidate mapping; must contain ``id`` and may contain
            ``natural_key`` and ``entity_type``.
        llm: Optional LLM client; when None the rule-based path is used.

    Returns:
        The alignment decision for this candidate.

    Raises:
        KeyError: If ``candidate`` has no ``id`` key.
    """
    import json
    import logging

    cid = candidate["id"]
    similar = await _find_similar_entities(candidate)

    if llm and llm.available() and similar:
        # Vocabulary is only needed to build the LLM prompt, so fetch it here
        # rather than paying for the query on the rule-based path as well.
        vocab_terms = await list_vocabulary_terms(
            settings.default_tenant,
            settings.default_project,
            candidate.get("entity_type"),
            limit=50,
        )
        log = logging.getLogger(__name__)
        try:
            # NOTE(review): chat_json is called synchronously; if it performs
            # blocking network I/O it will stall the event loop - confirm and
            # consider offloading it (e.g. asyncio.to_thread).
            result = llm.chat_json(
                system=SYSTEM_PROMPT,
                user=USER_TEMPLATE.format(
                    new_candidate=json.dumps(
                        {
                            "id": cid,
                            "name": candidate.get("natural_key"),
                            "type": candidate.get("entity_type"),
                        },
                        ensure_ascii=False,
                    ),
                    existing=json.dumps(
                        [
                            {"id": e["id"], "name": e["natural_key"], "type": e["entity_type"]}
                            for e in similar
                        ],
                        ensure_ascii=False,
                    ),
                    vocabulary=json.dumps(
                        [
                            {"name": v["canonical_name"], "aliases": v.get("aliases", [])}
                            for v in vocab_terms
                        ],
                        ensure_ascii=False,
                    ),
                ),
            )
        except Exception:
            # Best-effort: any LLM/prompt failure degrades to the rule-based
            # path below, but is logged instead of being silently discarded.
            log.warning(
                "LLM alignment failed for candidate %s; using rule-based fallback",
                cid,
                exc_info=True,
            )
        else:
            parsed = _parse_llm_decision(result, {str(e["id"]) for e in similar})
            if parsed is not None:
                action, target_id, confidence, reasons = parsed
                return AlignmentDecision(
                    candidate_id=cid,
                    action=action,
                    target_entity_id=target_id,
                    confidence=confidence,
                    reasons=reasons,
                )
            log.warning(
                "Unusable LLM alignment result for candidate %s: %r; "
                "using rule-based fallback",
                cid,
                result,
            )

    # Rule-based fallback: exact name match -> merge.
    candidate_name = candidate.get("natural_key")
    for existing in similar:
        if existing["natural_key"] == candidate_name:
            return AlignmentDecision(
                candidate_id=cid,
                action="merge",
                target_entity_id=str(existing["id"]),
                confidence=0.95,
                reasons=["规则匹配: 名称完全一致"],
            )

    return AlignmentDecision(
        candidate_id=cid,
        action="create",
        target_entity_id=None,
        confidence=0.6,
        reasons=["无相似实体或LLM不可用"],
    )


async def align_candidates(candidates: list[dict]) -> list[dict]:
    """Align every candidate in order and return the decisions as plain dicts.

    An LLM client is built once (only when an API key is configured) and
    shared across all candidates; candidates are processed sequentially.

    Args:
        candidates: Candidate mappings to align.

    Returns:
        One dict per candidate, in input order, with the keys
        ``candidate_id``, ``action``, ``target_entity_id``, ``confidence``
        and ``reasons``.
    """
    llm = None
    if settings.llm_api_key:
        llm = LlmClient.from_settings()

    # Awaiting inside the comprehension keeps the work strictly sequential.
    decisions = [await align_single_candidate(item, llm) for item in candidates]

    return [
        {
            "candidate_id": d.candidate_id,
            "action": d.action,
            "target_entity_id": d.target_entity_id,
            "confidence": d.confidence,
            "reasons": d.reasons,
        }
        for d in decisions
    ]