# bxh/app/agents/schema_lint.py

"""Phase 1 — Schema Lint Agent.

Validates an ontology schema for completeness, consistency, and best practices.
"""
from __future__ import annotations

import inspect
import json
import logging
from dataclasses import dataclass, field

from app.config import settings
from app.llm_client import LlmClient

# System prompt (written in Chinese) passed as the `system` message by
# `lint_schema`. It casts the model as a knowledge-graph schema auditor and
# asks for JSON-only output. The prompt lists:
#   * five check dimensions: entity-type coverage (POI, Restaurant, Hotel,
#     Attraction), field completeness, relation connectivity, field-naming
#     consistency, and whether `required` flags are justified;
#   * three severity levels: error (graph unusable), warning (fix
#     recommended), info (optional improvement).
# The text is sent to the model at runtime, so it is part of the behaviour.
SYSTEM_PROMPT = """你是一个知识图谱 Schema 审计员。给一个 ontology schema，检查它的问题。输出 JSON。

检查维度：
1. 类型覆盖率 — 常用实体类型（POI, Restaurant, Hotel, Attraction）是否都有
2. 字段完整性 — 每个实体类型的核心字段是否齐备
3. 关系连通性 — 实体类型之间是否有合理的关系类型
4. 命名字段一致性 — 同类字段在不同实体类型下命名是否一致
5. 必填字段合理性 — 标记为 required 的字段是否确实必要

问题严重度：
- error: 会导致图谱不可用
- warning: 建议修复
- info: 可选优化

只输出 JSON。"""

# User-message template rendered by `lint_schema` with `str.format`.
# `{schema_json}` is the only placeholder; every other brace is doubled
# (`{{` / `}}`) so that it survives formatting as a literal JSON brace.
# The second half shows the model the expected response shape: an `issues`
# list (severity, code, target, message, suggestion), a `score`, and a
# one-sentence `summary`.
USER_TEMPLATE = """Ontology Schema：
{schema_json}

输出 schema：
{{
  "issues": [
    {{
      "severity": "error|warning|info",
      "code": "missing_entity_type|missing_field|orphan_type|naming_inconsistency",
      "target": "entity_type.field_name",
      "message": "中文描述",
      "suggestion": "修复建议"
    }}
  ],
  "score": 0~100,
  "summary": "一句话总结"
}}"""


@dataclass
class LintIssue:
    """One finding reported while linting an ontology schema."""

    # Severity level: "error", "warning" or "info".
    severity: str
    # Machine-readable issue category, e.g. "missing_field".
    code: str
    # What the finding points at, e.g. an entity type or "Type.field".
    target: str
    # Human-readable description of the problem.
    message: str
    # Optional remediation hint; empty string when none is given.
    suggestion: str = field(default="")


@dataclass
class SchemaLintResult:
    """Aggregate outcome of a schema lint run."""

    # Findings collected during the run; each instance gets its own list.
    issues: list[LintIssue] = field(default_factory=list)
    # Overall quality score; defaults to a perfect 100.
    score: int = field(default=100)
    # Short human-readable summary of the run.
    summary: str = field(default="")


def _rule_based_lint(schema_dict: dict) -> SchemaLintResult:
    """Fallback rule-based lint when LLM is unavailable.

    Runs two checks on ``schema_dict["entity_types"]``:

    1. Each of the common entity types (POI, Restaurant, Hotel, Attraction)
       is present; every absent one yields a ``warning``.
    2. Every entity type declares a ``name`` field; each one that does not
       yields an ``error``.

    Args:
        schema_dict: The ontology schema. ``entity_types`` and each entity
            type's ``fields`` may be missing or ``None``; both are then
            treated as empty.

    Returns:
        A ``SchemaLintResult`` whose score starts at 100 and drops by 10 per
        issue, floored at 0. Issues are emitted in a deterministic order:
        missing types alphabetically, then field errors in schema order.
    """
    issues: list[LintIssue] = []
    # `or []` treats an explicit null the same as a missing key.
    entity_types = schema_dict.get("entity_types") or []

    # Check required entity types
    type_names = {et.get("entity_type", "") for et in entity_types}
    required_types = {"POI", "Restaurant", "Hotel", "Attraction"}
    # Sort the difference: string-set iteration order can vary between
    # interpreter runs, which would make the issue list order unstable.
    for mt in sorted(required_types - type_names):
        issues.append(LintIssue(
            severity="warning",
            code="missing_entity_type",
            target=mt,
            message=f"缺少常用实体类型: {mt}",
            suggestion=f"建议添加 {mt} 实体类型定义",
        ))

    # Check fields for each entity type
    for et in entity_types:
        # Same null-tolerance as for `entity_types` above.
        fields = et.get("fields") or []
        field_names = {f.get("field_name", "") for f in fields}
        if "name" not in field_names:
            issues.append(LintIssue(
                severity="error",
                code="missing_field",
                target=f"{et.get('entity_type')}.name",
                message=f"实体类型 {et.get('entity_type')} 缺少必填字段: name",
                suggestion="所有实体类型必须包含 name 字段",
            ))

    score = max(0, 100 - len(issues) * 10)
    return SchemaLintResult(
        issues=issues,
        score=score,
        summary=f"发现 {len(issues)} 个问题" if issues else "Schema 检查通过",
    )


async def lint_schema(
    schema_dict: dict,
    llm: LlmClient | None = None,
) -> SchemaLintResult:
    """Lint an ontology schema for issues.

    The LLM-backed audit is tried first. If no usable client exists, or the
    LLM path raises for any reason, the failure is logged and the result of
    ``_rule_based_lint`` is returned instead.

    Args:
        schema_dict: The ontology schema as a dict.
        llm: Optional LLM client. When omitted, one is created with
            ``LlmClient.from_settings()`` if ``settings.llm_api_key`` is set.

    Returns:
        A ``SchemaLintResult``. On the LLM path the score is coerced to an
        int and clamped to 0..100 (80 when missing or not numeric), and any
        severity outside error/warning/info is normalised to ``"warning"``.
    """
    if llm is None:
        llm = LlmClient.from_settings() if settings.llm_api_key else None

    if llm and llm.available():
        try:
            schema_json = json.dumps(schema_dict, ensure_ascii=False, indent=2)
            result = llm.chat_json(
                system=SYSTEM_PROMPT,
                user=USER_TEMPLATE.format(schema_json=schema_json),
            )
            # NOTE(review): LlmClient is not visible from this module. If
            # chat_json is a coroutine function its result must be awaited;
            # a plain (synchronous) return value passes through unchanged.
            if inspect.isawaitable(result):
                result = await result

            allowed_severities = ("error", "warning", "info")
            issues = []
            # `or []` accepts a null/absent issue list as "no issues".
            for item in result.get("issues") or []:
                severity = str(item.get("severity", "warning")).strip().lower()
                if severity not in allowed_severities:
                    severity = "warning"
                issues.append(LintIssue(
                    severity=severity,
                    code=item.get("code", ""),
                    target=item.get("target", ""),
                    message=item.get("message", ""),
                    suggestion=item.get("suggestion", ""),
                ))

            # Model output is untrusted: the score may arrive as a string,
            # a float, null, or outside the 0..100 range.
            try:
                score = int(result.get("score", 80))
            except (TypeError, ValueError, OverflowError):
                score = 80

            return SchemaLintResult(
                issues=issues,
                score=max(0, min(100, score)),
                summary=result.get("summary", ""),
            )
        except Exception:
            # Deliberate best-effort boundary: any LLM or parsing failure
            # falls through to the rule-based lint, but is logged rather
            # than silently discarded.
            logging.getLogger(__name__).warning(
                "LLM schema lint failed; falling back to rule-based lint",
                exc_info=True,
            )

    return _rule_based_lint(schema_dict)