Initial travel knowledge graph release
This commit is contained in:
238
scripts/kg_schema_v1_preview_from_report.py
Normal file
238
scripts/kg_schema_v1_preview_from_report.py
Normal file
@@ -0,0 +1,238 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Build a kg_schema_v1 preview from the Huaxi extraction comparison report.
|
||||
|
||||
The comparison report intentionally keeps review_status beside each candidate
|
||||
for human reading. The production schema is stricter, so this script removes
|
||||
review-only fields, derives evidence_links, validates the shape, and writes a
|
||||
reviewable summary for discussion before anything is published to the graph.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from copy import deepcopy
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Absolute root of the local project checkout; every path below hangs off it.
ROOT = Path("/Users/xuexue/new2")
_REPORTS_DIR = ROOT / "docs" / "reports"

# Input: the extraction comparison report that main() reads.
IN_JSON = _REPORTS_DIR / "huaxi_kg_extraction_comparison.json"
# JSON Schema file that validate_payload() loads when jsonschema is importable.
SCHEMA_JSON = ROOT / "app" / "schemas" / "kg_extraction_v1.schema.json"
# Outputs: the stripped payload and the Markdown review plan.
OUT_JSON = _REPORTS_DIR / "huaxi_kg_schema_v1_ready.json"
OUT_REVIEW = _REPORTS_DIR / "huaxi_kg_schema_v1_review_plan.md"
|
||||
|
||||
|
||||
def strip_review_fields(row: dict[str, Any]) -> dict[str, Any]:
    """Return a deep copy of *row* without its ``review_status`` key.

    The input mapping is never modified, and a row that has no
    ``review_status`` is simply copied as-is.
    """
    cleaned = deepcopy(row)
    if "review_status" in cleaned:
        del cleaned["review_status"]
    return cleaned
|
||||
|
||||
|
||||
def evidence_links_for(target_ref: str, spans: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Turn source spans into ``supports`` evidence links for *target_ref*.

    A span yields a link only when it has a truthy ``evidence_id`` and a
    quote that is non-empty after stripping whitespace. ``None`` or an empty
    *spans* produces an empty list.
    """
    collected: list[dict[str, Any]] = []
    for span in spans or []:
        span_id = span.get("evidence_id")
        text = str(span.get("quote") or "").strip()
        # Skip spans that cannot be traced back to a concrete quote.
        if not span_id or not text:
            continue
        collected.append(
            dict(
                target_ref=target_ref,
                evidence_id=span_id,
                support_type="supports",
                quote=text,
            )
        )
    return collected
|
||||
|
||||
|
||||
def statement_ref(row: dict[str, Any], index: int) -> str:
    """Build the reference string used as ``target_ref`` for a statement.

    The result is ``stmt_<index>:<subject_ref>:<predicate>:<object_ref>``
    with the index zero-padded to at least three digits and the object part
    cut to its first 48 characters. Missing or falsy fields become empty
    strings.
    """
    subject, predicate = (
        str(row.get(field) or "") for field in ("subject_ref", "predicate")
    )
    target = str(row.get("object_ref") or "")[:48]
    return ":".join([f"stmt_{index:03d}", subject, predicate, target])
|
||||
|
||||
|
||||
def build_payload(source: dict[str, Any]) -> dict[str, Any]:
    """Convert the comparison report into the stricter payload shape.

    Every row under ``source["final"]`` is copied without its review-only
    field, and ``evidence_links`` is derived from the ``source_spans`` of
    entities, events, concepts and statements. Relations and schema
    proposals are copied but contribute no evidence links.

    Args:
        source: Parsed comparison report; must contain a ``"final"`` mapping.

    Returns:
        A dict with the list fields ``entities``, ``events``, ``concepts``,
        ``relations``, ``statements``, ``schema_proposals`` and
        ``evidence_links`` (in that key order).

    Raises:
        KeyError: If ``"final"`` is missing, or an entity/event/concept row
            has no ``temp_id``.
    """
    final = source["final"]
    copied_sections = (
        "entities", "events", "concepts",
        "relations", "statements", "schema_proposals",
    )
    # ``or []`` rather than a .get() default: a section stored as JSON null
    # comes back as None, which must be treated like an absent section.
    payload: dict[str, Any] = {
        section: [strip_review_fields(row) for row in final.get(section) or []]
        for section in copied_sections
    }
    payload["evidence_links"] = []

    # Pair every evidence-bearing row with the reference its links point at.
    targets: list[tuple[str, dict[str, Any]]] = []
    for section in ("entities", "events", "concepts"):
        for row in payload[section]:
            targets.append((row["temp_id"], row))
    # Statements carry no temp_id, so a positional reference is synthesised.
    for position, row in enumerate(payload["statements"], start=1):
        targets.append((statement_ref(row, position), row))

    # Keep the first occurrence of each (target, evidence, quote) triple.
    seen: set[tuple[str, str, str]] = set()
    for target_ref, row in targets:
        for link in evidence_links_for(target_ref, row.get("source_spans") or []):
            signature = (link["target_ref"], str(link["evidence_id"]), link["quote"])
            if signature in seen:
                continue
            seen.add(signature)
            payload["evidence_links"].append(link)

    return payload
|
||||
|
||||
|
||||
def validate_payload(payload: dict[str, Any]) -> str:
    """Validate *payload* and return a short status string.

    When ``jsonschema`` can be imported, the payload is validated against the
    schema file at ``SCHEMA_JSON`` and ``"valid"`` is returned; validation
    errors from jsonschema propagate to the caller. When the import fails for
    any reason, a minimal structural check runs instead and
    ``"structural check passed"`` is returned.

    Raises:
        ValueError: From the fallback check, when a list field or a required
            row key is missing.
    """
    try:
        import jsonschema  # type: ignore
    except Exception:
        # Any import failure falls back to the dependency-free check.
        return _structural_check(payload)

    schema = json.loads(SCHEMA_JSON.read_text(encoding="utf-8"))
    jsonschema.Draft202012Validator(schema).validate(payload)
    return "valid"


def _structural_check(payload: dict[str, Any]) -> str:
    """Check list fields and required row keys without jsonschema."""
    list_fields = [
        "entities", "events", "concepts", "relations", "statements",
        "schema_proposals", "evidence_links",
    ]
    missing = [name for name in list_fields if not isinstance(payload.get(name), list)]
    if missing:
        raise ValueError(f"missing list fields: {missing}")

    # (payload field, label used in the error message, keys every row needs)
    required_by_kind = (
        ("entities", "entity", ("temp_id", "name", "entity_type", "confidence")),
        ("events", "event", ("temp_id", "title", "event_type", "confidence")),
        ("concepts", "concept", ("temp_id", "name", "concept_type", "confidence")),
        ("relations", "relation", ("relation_type", "source_type", "target_type", "confidence")),
        ("statements", "statement", ("subject_ref", "predicate", "object_ref", "object_kind", "confidence")),
    )
    for field, label, keys in required_by_kind:
        for row in payload[field]:
            for key in keys:
                if key not in row:
                    raise ValueError(f"{label} missing {key}: {row}")
    return "structural check passed"
|
||||
|
||||
|
||||
# Minimum confidence a row needs to skip manual review.
AUTO_PUBLISH_THRESHOLD = 0.8


def review_bucket(row: dict[str, Any], kind: str) -> str:
    """Return the review recommendation text for a candidate row.

    A row passes when its ``confidence`` (missing or falsy counts as 0) is at
    least ``AUTO_PUBLISH_THRESHOLD`` and neither ``model_disagreement`` nor
    ``conflict`` is truthy. Rows of kind ``"schema_proposal"`` get their own
    pair of messages; every other kind shares the generic pair.
    """
    score = float(row.get("confidence") or 0)
    disputed = bool(row.get("model_disagreement") or row.get("conflict"))
    passes = score >= AUTO_PUBLISH_THRESHOLD and not disputed

    if kind == "schema_proposal":
        accepted = "进入 Schema Proposal 队列:不阻塞数据入图"
        deferred = "人工审核:schema 建议分数低或模型不一致"
    else:
        accepted = "自动入候选发布:多模型一致且分数 >= 0.8"
        deferred = "人工审核:模型不一致或最终分数 < 0.8"
    return accepted if passes else deferred
|
||||
|
||||
|
||||
def write_review_plan(raw: dict[str, Any], payload: dict[str, Any], validation: str) -> None:
    """Render the Markdown review plan and write it to ``OUT_REVIEW``.

    The document holds the per-section counts, a fixed comparison with the
    older extraction mode, the manual-review rules, a suggested graph shape,
    and one decision row per entity, event, concept and schema proposal.

    Args:
        raw: The parsed comparison report. Accepted for the caller's
            convenience; this function does not read it.
        payload: The payload produced by ``build_payload``.
        validation: Status string reported on the "JSON Schema" line.
    """
    counted = (
        "entities", "events", "concepts", "relations",
        "statements", "schema_proposals", "evidence_links",
    )
    counts = {name: len(payload[name]) for name in counted}

    def decision_row(kind: str, ref: Any, label: Any, row: dict[str, Any]) -> str:
        # One Markdown table row: type | reference | label | confidence | advice.
        score = float(row.get("confidence") or 0)
        return f"| {kind} | {ref} | {label} | {score:.2f} | {review_bucket(row, kind)} |"

    overview = [
        "# 花溪公园 kg_schema_v1 效果与人工审核方案",
        "",
        "## 这次新模式会得到什么",
        "",
        f"- Entity:{counts['entities']} 个",
        f"- Event:{counts['events']} 个",
        f"- Concept:{counts['concepts']} 个",
        f"- Relation Schema:{counts['relations']} 个",
        f"- Statement:{counts['statements']} 条",
        f"- Schema Proposal:{counts['schema_proposals']} 条",
        f"- Evidence Link:{counts['evidence_links']} 条",
        f"- JSON Schema 校验:{validation}",
        "",
    ]

    comparison = [
        "## 和旧 web_agent 软字段模式的差别",
        "",
        "| 维度 | 旧模式 | kg_schema_v1 新模式 |",
        "|---|---|---|",
        "| 抽取目标 | summary/history/features 等软字段 + schema_gaps | Entity/Event/Concept/Relation/Statement 全量候选 |",
        "| 基本信息 | 容易被放进 schema_gaps,不能稳定入图 | 地址、开放时间、门票、面积、等级等变成可入图 Statement |",
        "| 历史信息 | 部分进入 history 文本 | 变成 Event 节点,并通过 HAS_EVENT 连到地点 |",
        "| 推荐语义 | 缺少概念层 | 历史文化、生态公园、亲水游憩等变成 Concept |",
        "| 人工审核 | 主要审候选实体字段 | 审 Entity/Event/Concept/Statement/SchemaProposal,能细粒度放行 |",
        "",
    ]

    rules = [
        "## 人工介入规则",
        "",
        "1. 多模型结果一致、证据 quote 可定位、final_score/confidence >= 0.8:自动进入发布候选,可直接写入正式图谱。",
        "2. 多模型结果不一致、存在冲突字段、或 final_score/confidence < 0.8:转人工审核。",
        "3. 新关系、新事件类型、新字段进入 Schema Proposal 队列;它不阻塞事实数据入图,但 schema 是否升级单独治理。",
        "4. 人工只处理低分/冲突/拿不准的数据,不再默认审所有候选。",
        "5. 发布时每条节点和关系保留 evidence_id、quote、confidence,后续可追溯和回滚。",
        "",
    ]

    graph_shape = [
        "## 建议入图谱结构",
        "",
        "```cypher",
        "(:Place {id:'ent_huaxi_park', name:'花溪公园'})",
        "(:Event {id:'evt_1937_park_build', event_id:'evt_1937_park_build', title:'花溪正式辟建为公园', event_date:'1937年', event_date_norm:'1937', event_type:'ConstructionEvent', evidence_id, evidence_quote})",
        "(:Concept {id:'cpt_historical_culture', name:'历史文化景区'})",
        "(:Place)-[:HAS_EVENT {event_date, event_type, evidence_id, confidence}]->(:Event)",
        "(:Place)-[:HAS_CONCEPT {evidence_id, confidence}]->(:Concept)",
        "(:Place)-[:HAS_PART {evidence_id, confidence}]->(:ScenicSpot)",
        "(:Place)-[:NEARBY_ATTRACTION {distance_text, evidence_id}]->(:Place)",
        "```",
        "",
    ]

    decisions = [
        "## 自动发布 / 人工审核判定",
        "",
        "| 类型 | ID/关系 | 名称/内容 | 置信度 | 建议 |",
        "|---|---|---|---:|---|",
    ]
    # Node-like candidates first, in the same order the payload lists them.
    node_kinds = (
        ("entity", "entities", "name"),
        ("event", "events", "title"),
        ("concept", "concepts", "name"),
    )
    for kind, section, label_key in node_kinds:
        decisions.extend(
            decision_row(kind, row.get("temp_id"), row.get(label_key), row)
            for row in payload[section]
        )
    # Schema proposals have no temp_id; their proposal_type fills that column.
    decisions.extend(
        decision_row("schema_proposal", row.get("proposal_type"), row.get("name"), row)
        for row in payload["schema_proposals"]
    )

    conclusion = [
        "",
        "## 当前结论",
        "",
        "花溪公园这份百度百科证据使用 kg_schema_v1 后,已经足够形成一批完整候选知识。",
        "但为了保证城市知识图谱可长期复用,正确做法不是所有条目都人工审,而是:",
        "",
        "```text",
        "Evidence -> kg_schema_v1 多模型抽取 -> 决策合并/评分",
        "-> final_score >= 0.8 且无冲突:自动发布到 FalkorDB",
        "-> final_score < 0.8 或模型冲突:进入人工审核",
        "```",
        "",
        "对应严格 JSON 输出:`/Users/xuexue/new2/docs/reports/huaxi_kg_schema_v1_ready.json`",
    ]

    document = overview + comparison + rules + graph_shape + decisions + conclusion
    OUT_REVIEW.write_text("\n".join(document), encoding="utf-8")
|
||||
|
||||
|
||||
def main() -> None:
    """Run the pipeline: read the report, build, validate, write, and print.

    Reads ``IN_JSON``, writes the payload to ``OUT_JSON`` and the review plan
    to ``OUT_REVIEW``, then prints both output paths, the size of every list
    field in the payload, and the validation status.
    """
    report = json.loads(IN_JSON.read_text(encoding="utf-8"))
    payload = build_payload(report)
    # Validation runs before anything is written, so a bad payload aborts
    # the run without leaving output files behind.
    validation = validate_payload(payload)

    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    OUT_JSON.write_text(serialized, encoding="utf-8")
    write_review_plan(report, payload, validation)

    list_sizes = {
        name: len(value) for name, value in payload.items() if isinstance(value, list)
    }
    for item in (OUT_JSON, OUT_REVIEW, list_sizes, validation):
        print(item)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user