"""Phase 1 — Schema Lint Agent. Validates an ontology schema for completeness, consistency, and best practices. """ from __future__ import annotations import json from dataclasses import dataclass, field from app.config import settings from app.llm_client import LlmClient SYSTEM_PROMPT = """你是一个知识图谱 Schema 审计员。给一个 ontology schema,检查它的问题。输出 JSON。 检查维度: 1. 类型覆盖率 — 常用实体类型(POI, Restaurant, Hotel, Attraction)是否都有 2. 字段完整性 — 每个实体类型的核心字段是否齐备 3. 关系连通性 — 实体类型之间是否有合理的关系类型 4. 命名字段一致性 — 同类字段在不同实体类型下命名是否一致 5. 必填字段合理性 — 标记为 required 的字段是否确实必要 问题严重度: - error: 会导致图谱不可用 - warning: 建议修复 - info: 可选优化 只输出 JSON。""" USER_TEMPLATE = """Ontology Schema: {schema_json} 输出 schema: {{ "issues": [ {{ "severity": "error|warning|info", "code": "missing_entity_type|missing_field|orphan_type|naming_inconsistency", "target": "entity_type.field_name", "message": "中文描述", "suggestion": "修复建议" }} ], "score": 0~100, "summary": "一句话总结" }}""" @dataclass class LintIssue: severity: str # error | warning | info code: str target: str message: str suggestion: str = "" @dataclass class SchemaLintResult: issues: list[LintIssue] = field(default_factory=list) score: int = 100 summary: str = "" def _rule_based_lint(schema_dict: dict) -> SchemaLintResult: """Fallback rule-based lint when LLM is unavailable.""" issues = [] entity_types = schema_dict.get("entity_types", []) or [] # Check required entity types type_names = {et.get("entity_type", "") for et in entity_types} required_types = {"POI", "Restaurant", "Hotel", "Attraction"} missing = required_types - type_names for mt in missing: issues.append(LintIssue( severity="warning", code="missing_entity_type", target=mt, message=f"缺少常用实体类型: {mt}", suggestion=f"建议添加 {mt} 实体类型定义", )) # Check fields for each entity type for et in entity_types: fields = et.get("fields", []) field_names = {f.get("field_name", "") for f in fields} if "name" not in field_names: issues.append(LintIssue( severity="error", code="missing_field", target=f"{et.get('entity_type')}.name", message=f"实体类型 {et.get('entity_type')} 缺少必填字段: name", suggestion="所有实体类型必须包含 name 字段", )) score = max(0, 100 - len(issues) * 10) return SchemaLintResult( issues=issues, score=score, summary=f"发现 {len(issues)} 个问题" if issues else "Schema 检查通过", ) async def lint_schema( schema_dict: dict, llm: LlmClient | None = None, ) -> SchemaLintResult: """Lint an ontology schema for issues. Args: schema_dict: The ontology schema as a dict llm: Optional LLM client """ if llm is None: llm = LlmClient.from_settings() if settings.llm_api_key else None if llm and llm.available(): try: schema_json = json.dumps(schema_dict, ensure_ascii=False, indent=2) result = llm.chat_json( system=SYSTEM_PROMPT, user=USER_TEMPLATE.format(schema_json=schema_json), ) issues = [ LintIssue( severity=i.get("severity", "warning"), code=i.get("code", ""), target=i.get("target", ""), message=i.get("message", ""), suggestion=i.get("suggestion", ""), ) for i in result.get("issues", []) ] return SchemaLintResult( issues=issues, score=result.get("score", 80), summary=result.get("summary", ""), ) except Exception: pass return _rule_based_lint(schema_dict)