"""Phase 1 — Schema Lint Agent.

Validates an ontology schema for completeness, consistency, and best practices.
"""
|
||
from __future__ import annotations

import json
import logging
from dataclasses import dataclass, field

from app.config import settings
from app.llm_client import LlmClient
|
||
|
||
# System prompt for the LLM-backed lint. It tells the model to act as a
# knowledge-graph schema auditor, lists the five dimensions to check, defines
# the three severity levels, and demands JSON-only output.
SYSTEM_PROMPT = """你是一个知识图谱 Schema 审计员。给一个 ontology schema,检查它的问题。输出 JSON。

检查维度:
1. 类型覆盖率 — 常用实体类型(POI, Restaurant, Hotel, Attraction)是否都有
2. 字段完整性 — 每个实体类型的核心字段是否齐备
3. 关系连通性 — 实体类型之间是否有合理的关系类型
4. 命名字段一致性 — 同类字段在不同实体类型下命名是否一致
5. 必填字段合理性 — 标记为 required 的字段是否确实必要

问题严重度:
- error: 会导致图谱不可用
- warning: 建议修复
- info: 可选优化

只输出 JSON。"""
|
||
|
||
# User-message template for the LLM-backed lint. ``{schema_json}`` is the only
# substitution field; the doubled braces are ``str.format`` escapes that render
# as the literal braces of the JSON skeleton the model is asked to return.
USER_TEMPLATE = """Ontology Schema:
{schema_json}

输出 schema:
{{
  "issues": [
    {{
      "severity": "error|warning|info",
      "code": "missing_entity_type|missing_field|orphan_type|naming_inconsistency",
      "target": "entity_type.field_name",
      "message": "中文描述",
      "suggestion": "修复建议"
    }}
  ],
  "score": 0~100,
  "summary": "一句话总结"
}}"""
|
||
|
||
|
||
@dataclass
class LintIssue:
    """A single finding reported by the schema lint."""

    severity: str  # one of "error", "warning", "info"
    code: str  # machine-readable issue code, e.g. "missing_field"
    target: str  # what the issue refers to, e.g. "entity_type.field_name"
    message: str  # human-readable description of the problem
    suggestion: str = ""  # optional advice on how to fix it
|
||
|
||
|
||
@dataclass
class SchemaLintResult:
    """Outcome of linting one ontology schema."""

    # Findings, in the order they were produced; each instance gets its own list.
    issues: list[LintIssue] = field(default_factory=list)
    score: int = 100  # overall quality score; defaults to 100
    summary: str = ""  # one-line overview of the result
|
||
|
||
|
||
def _rule_based_lint(schema_dict: dict) -> SchemaLintResult:
    """Lint a schema with fixed local rules; the fallback when the LLM is unavailable.

    Two rules are applied:

    * every one of POI, Restaurant, Hotel and Attraction that is not declared
      produces a ``missing_entity_type`` warning;
    * every declared entity type without a ``name`` field produces a
      ``missing_field`` error.

    Args:
        schema_dict: Ontology schema. Only ``entity_types`` is read: a list of
            dicts, each with an ``entity_type`` name and a ``fields`` list whose
            items carry a ``field_name``. A missing or null ``entity_types`` /
            ``fields`` value is treated as empty, as is a ``None`` schema.

    Returns:
        The issues found, a score of 100 minus 10 per issue (never below 0),
        and a one-line summary.
    """
    issues: list[LintIssue] = []
    entity_types = (schema_dict or {}).get("entity_types") or []

    # Rule 1: the commonly used entity types should all be declared.
    declared_types = {et.get("entity_type", "") for et in entity_types}
    required_types = {"POI", "Restaurant", "Hotel", "Attraction"}
    # Sorted so the issue order is reproducible; iterating the raw set
    # difference varies between runs under string hash randomization.
    for missing_type in sorted(required_types - declared_types):
        issues.append(LintIssue(
            severity="warning",
            code="missing_entity_type",
            target=missing_type,
            message=f"缺少常用实体类型: {missing_type}",
            suggestion=f"建议添加 {missing_type} 实体类型定义",
        ))

    # Rule 2: every entity type must carry a ``name`` field.
    for et in entity_types:
        # ``fields`` may be absent or explicitly null; both mean "no fields".
        declared_fields = et.get("fields") or []
        field_names = {f.get("field_name", "") for f in declared_fields}
        if "name" in field_names:
            continue
        type_name = et.get("entity_type")
        issues.append(LintIssue(
            severity="error",
            code="missing_field",
            target=f"{type_name}.name",
            message=f"实体类型 {type_name} 缺少必填字段: name",
            suggestion="所有实体类型必须包含 name 字段",
        ))

    return SchemaLintResult(
        issues=issues,
        score=max(0, 100 - len(issues) * 10),
        summary=f"发现 {len(issues)} 个问题" if issues else "Schema 检查通过",
    )
|
||
|
||
|
||
# Severities a LintIssue may carry; anything else from the LLM is normalized.
_VALID_SEVERITIES = ("error", "warning", "info")


def _result_from_llm(payload: object) -> SchemaLintResult:
    """Convert the LLM's decoded JSON reply into a ``SchemaLintResult``.

    Unknown severities become ``"warning"``, null or non-string text fields
    become strings, and the score is coerced to an int clamped to 0..100
    (80 when missing or not numeric). Issue entries that are not JSON objects
    are skipped.

    Raises:
        ValueError: If the reply is not a JSON object or ``issues`` is not a list.
    """
    if not isinstance(payload, dict):
        raise ValueError(f"expected a JSON object, got {type(payload).__name__}")
    raw_issues = payload.get("issues") or []
    if not isinstance(raw_issues, list):
        raise ValueError("'issues' must be a list")

    issues = []
    for raw in raw_issues:
        if not isinstance(raw, dict):
            # Drop one malformed entry instead of discarding the whole reply.
            continue
        severity = raw.get("severity")
        issues.append(LintIssue(
            severity=severity if severity in _VALID_SEVERITIES else "warning",
            code=str(raw.get("code") or ""),
            target=str(raw.get("target") or ""),
            message=str(raw.get("message") or ""),
            suggestion=str(raw.get("suggestion") or ""),
        ))

    try:
        score = int(payload.get("score", 80))
    except (TypeError, ValueError, OverflowError):
        score = 80

    return SchemaLintResult(
        issues=issues,
        score=max(0, min(100, score)),
        summary=str(payload.get("summary") or ""),
    )


async def lint_schema(
    schema_dict: dict,
    llm: LlmClient | None = None,
) -> SchemaLintResult:
    """Lint an ontology schema for issues.

    The schema is sent to the LLM when a client is usable; otherwise, or when
    the LLM call or the parsing of its reply fails, the rule-based lint is used.

    Args:
        schema_dict: The ontology schema as a dict.
        llm: Optional LLM client. When omitted, one is created from settings
            if ``settings.llm_api_key`` is set.

    Returns:
        The lint result from the LLM, or from the rule-based fallback.
    """
    if llm is None and settings.llm_api_key:
        llm = LlmClient.from_settings()

    if not llm or not llm.available():
        return _rule_based_lint(schema_dict)

    try:
        schema_json = json.dumps(schema_dict, ensure_ascii=False, indent=2)
        # NOTE(review): chat_json is called without ``await``. If
        # LlmClient.chat_json is a coroutine function this needs ``await``; if
        # it blocks on network I/O it stalls the event loop — confirm against
        # LlmClient.
        reply = llm.chat_json(
            system=SYSTEM_PROMPT,
            user=USER_TEMPLATE.format(schema_json=schema_json),
        )
        return _result_from_llm(reply)
    except Exception:
        # Deliberate best-effort boundary: any LLM or parsing failure degrades
        # to the rule-based lint, but is logged so it is not invisible.
        logging.getLogger(__name__).warning(
            "LLM schema lint failed; falling back to rule-based lint",
            exc_info=True,
        )
        return _rule_based_lint(schema_dict)
|