Files
bxh/app/agents/schema_lint.py

142 lines
4.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Phase 1 — Schema Lint Agent.
Validates an ontology schema for completeness, consistency, and best practices.
"""
from __future__ import annotations

import inspect
import json
import logging
from dataclasses import dataclass, field

from app.config import settings
from app.llm_client import LlmClient
# System prompt (in Chinese) sent as the `system` message by lint_schema.
# It casts the model as a knowledge-graph schema auditor, lists five review
# dimensions (type coverage, field completeness, relation connectivity,
# field-naming consistency, sanity of required fields), defines the three
# severities error / warning / info, and demands JSON-only output.
SYSTEM_PROMPT = """你是一个知识图谱 Schema 审计员。给一个 ontology schema检查它的问题。输出 JSON。
检查维度:
1. 类型覆盖率 — 常用实体类型POI, Restaurant, Hotel, Attraction是否都有
2. 字段完整性 — 每个实体类型的核心字段是否齐备
3. 关系连通性 — 实体类型之间是否有合理的关系类型
4. 命名字段一致性 — 同类字段在不同实体类型下命名是否一致
5. 必填字段合理性 — 标记为 required 的字段是否确实必要
问题严重度:
- error: 会导致图谱不可用
- warning: 建议修复
- info: 可选优化
只输出 JSON。"""
# User-message template, filled with str.format() in lint_schema.
# `{schema_json}` is the only placeholder; every doubled brace (`{{` / `}}`)
# is an escaped literal brace belonging to the JSON example, which shows the
# model the expected reply shape: an `issues` list, a `score`, and a `summary`.
USER_TEMPLATE = """Ontology Schema
{schema_json}
输出 schema
{{
"issues": [
{{
"severity": "error|warning|info",
"code": "missing_entity_type|missing_field|orphan_type|naming_inconsistency",
"target": "entity_type.field_name",
"message": "中文描述",
"suggestion": "修复建议"
}}
],
"score": 0~100,
"summary": "一句话总结"
}}"""
@dataclass
class LintIssue:
    """One finding produced by the schema linter."""

    # How serious the finding is.
    severity: str  # error | warning | info
    # Machine-readable category, e.g. "missing_entity_type" or "missing_field".
    code: str
    # What the finding refers to: an entity type name, or "<entity_type>.<field>".
    target: str
    # Human-readable description of the problem.
    message: str
    # Optional remediation hint; empty when none is given.
    suggestion: str = ""
@dataclass
class SchemaLintResult:
    """Aggregate outcome of linting one ontology schema."""

    # Every finding reported for the schema; empty when nothing was flagged.
    issues: list[LintIssue] = field(default_factory=list)
    # Overall quality score; defaults to 100 (the prompt asks for 0~100).
    score: int = 100
    # Short one-line summary of the result.
    summary: str = ""
def _rule_based_lint(schema_dict: dict) -> SchemaLintResult:
    """Lint a schema with fixed rules; the fallback when the LLM is unavailable.

    Two checks are applied:

    * each of ``POI``, ``Restaurant``, ``Hotel``, ``Attraction`` that is not
      declared under ``schema_dict["entity_types"]`` yields a ``warning``;
    * each declared entity type without a ``name`` field yields an ``error``.

    Args:
        schema_dict: Ontology schema. Only ``entity_types`` is read; each
            entry is accessed as a dict with ``entity_type`` and ``fields``
            keys, and each field as a dict with a ``field_name`` key.
            Missing or ``None`` ``entity_types`` / ``fields`` are treated as
            empty.

    Returns:
        A ``SchemaLintResult`` whose score is 100 minus 10 per issue,
        floored at 0.
    """
    issues: list[LintIssue] = []
    entity_types = schema_dict.get("entity_types") or []

    # Check required entity types.
    declared = {et.get("entity_type", "") for et in entity_types}
    required_types = {"POI", "Restaurant", "Hotel", "Attraction"}
    # Sorted so the issue order is reproducible: iteration order of a set of
    # strings can differ from one interpreter run to the next.
    for mt in sorted(required_types - declared):
        issues.append(LintIssue(
            severity="warning",
            code="missing_entity_type",
            target=mt,
            message=f"缺少常用实体类型: {mt}",
            suggestion=f"建议添加 {mt} 实体类型定义",
        ))

    # Check fields for each entity type.
    for et in entity_types:
        # `or []` also covers an explicit `"fields": None`, mirroring how
        # `entity_types` itself is read above.
        fields = et.get("fields") or []
        field_names = {f.get("field_name", "") for f in fields}
        if "name" not in field_names:
            type_name = et.get("entity_type")
            issues.append(LintIssue(
                severity="error",
                code="missing_field",
                target=f"{type_name}.name",
                message=f"实体类型 {type_name} 缺少必填字段: name",
                suggestion="所有实体类型必须包含 name 字段",
            ))

    score = max(0, 100 - len(issues) * 10)
    return SchemaLintResult(
        issues=issues,
        score=score,
        summary=f"发现 {len(issues)} 个问题" if issues else "Schema 检查通过",
    )
async def lint_schema(
    schema_dict: dict,
    llm: LlmClient | None = None,
) -> SchemaLintResult:
    """Lint an ontology schema, preferring the LLM and falling back to rules.

    When an LLM client is usable, the schema is serialized to JSON and sent
    with ``SYSTEM_PROMPT`` / ``USER_TEMPLATE``; the JSON reply is converted
    into a ``SchemaLintResult``. Any failure on that path is logged and the
    rule-based linter is used instead, so this function does not raise
    because of LLM problems.

    Args:
        schema_dict: The ontology schema as a dict.
        llm: Optional LLM client. When ``None``, one is created with
            ``LlmClient.from_settings()`` if ``settings.llm_api_key`` is set.

    Returns:
        The lint result from the LLM, or from ``_rule_based_lint`` when no
        client is available or the LLM path fails.
    """
    if llm is None:
        llm = LlmClient.from_settings() if settings.llm_api_key else None

    if llm and llm.available():
        try:
            schema_json = json.dumps(schema_dict, ensure_ascii=False, indent=2)
            result = llm.chat_json(
                system=SYSTEM_PROMPT,
                user=USER_TEMPLATE.format(schema_json=schema_json),
            )
            # NOTE(review): LlmClient is defined elsewhere, so it is not
            # visible here whether chat_json is a plain method or a coroutine.
            # Awaiting only when the result is awaitable supports both.
            if inspect.isawaitable(result):
                result = await result
            if not isinstance(result, dict):
                raise TypeError(
                    f"expected a JSON object from the LLM, got {type(result).__name__}"
                )

            # Model output is untrusted: tolerate `"issues": null` and skip
            # malformed entries instead of discarding the whole reply.
            issues = [
                LintIssue(
                    severity=i.get("severity", "warning"),
                    code=i.get("code", ""),
                    target=i.get("target", ""),
                    message=i.get("message", ""),
                    suggestion=i.get("suggestion", ""),
                )
                for i in (result.get("issues") or [])
                if isinstance(i, dict)
            ]

            # The score may arrive as a string or float, or out of range;
            # coerce to an int within 0..100, defaulting to 80 as before.
            try:
                score = max(0, min(100, int(result.get("score", 80))))
            except (TypeError, ValueError, OverflowError):
                score = 80

            return SchemaLintResult(
                issues=issues,
                score=score,
                summary=result.get("summary") or "",
            )
        except Exception:
            # Best-effort by design: keep the fallback, but record why the
            # LLM path failed instead of swallowing the error silently.
            logging.getLogger(__name__).warning(
                "LLM schema lint failed; falling back to rule-based lint",
                exc_info=True,
            )

    return _rule_based_lint(schema_dict)