# Source: bxh/scripts/fix_huaxi_event_temporal_fields.py (118 lines, 3.9 KiB, Python)

#!/usr/bin/env python3
"""Backfill Huaxi Event temporal/type/evidence fields into FalkorDB.
The extraction JSON already contains event_type and time_text/time_norm.
This script makes those fields explicit on Event nodes and HAS_EVENT edges
so graph browsing, timelines, and future event retrieval do not have to infer
time from descriptions.
"""
from __future__ import annotations
import json
import sys
from pathlib import Path
from typing import Any
# ROOT is the directory one level above this script's own folder. It is put
# at the front of sys.path so the ``app`` package imported below resolves when
# this file is run directly as a script.
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
from falkordb import FalkorDB # noqa: E402
from app.config import settings # noqa: E402
# Extraction output read by main(); its "events" list drives the backfill.
IN_JSON = ROOT / "docs/reports/huaxi_kg_schema_v1_ready.json"
# element_id of the Place node whose outgoing HAS_EVENT edges are updated.
# NOTE(review): the "amap:" prefix presumably denotes an AMap POI id - confirm.
HUAXI_ELEMENT_ID = "amap:B035300A51"
# Fallback evidence_id used when an event's first source span carries none.
BAIDU_BAIKE_SOURCE_ID = "baidu_baike_huaxi_park"
# Source name and URL written onto every Event node and HAS_EVENT edge that
# main() touches (the URL is also stored as evidence_url).
BAIDU_BAIKE_SOURCE_NAME = "百度百科"
BAIDU_BAIKE_SOURCE_URL = "https://baike.baidu.com/item/%E8%8A%B1%E6%BA%AA%E5%85%AC%E5%9B%AD"
def first_span(row: dict[str, Any]) -> dict[str, Any]:
    """Return the first entry of ``row["source_spans"]`` if it is a dict.

    Args:
        row: One extracted record; may or may not carry ``source_spans``.

    Returns:
        The first span when ``source_spans`` is a non-empty list/tuple whose
        first element is a dict; otherwise an empty dict.
    """
    spans = row.get("source_spans")
    # Only index into real sequences: a dict-valued ``source_spans`` would
    # raise KeyError on ``spans[0]`` and a string would yield a character.
    if not isinstance(spans, (list, tuple)) or not spans:
        return {}
    head = spans[0]
    return head if isinstance(head, dict) else {}
def main() -> None:
    """Copy event type, time and evidence fields from the JSON onto the graph.

    Reads ``IN_JSON`` and, for every entry under ``"events"`` that has a
    ``temp_id``, writes its title, type, time, description, evidence and
    confidence values onto the ``Event`` node with that id, then onto the
    ``HAS_EVENT`` edge leading from the Huaxi ``Place`` anchor to that node.
    A summary dict is printed at the end.

    An event counts as updated only when its ``Event`` node was actually
    matched; ids with no matching node are reported under
    ``"missing_events"`` instead of being silently counted.

    Raises:
        OSError: If ``IN_JSON`` cannot be read.
        json.JSONDecodeError: If ``IN_JSON`` is not valid JSON.
        ValueError: If an event's ``confidence`` is not numeric.
    """
    payload = json.loads(IN_JSON.read_text(encoding="utf-8"))
    graph = FalkorDB(
        host=settings.falkordb_host,
        port=settings.falkordb_port,
    ).select_graph(settings.falkordb_graph)

    # The query texts are loop-invariant, so they are built once up front.
    # The node query returns how many Event nodes matched so the caller can
    # tell a real update from a no-op.
    node_query = """
        MATCH (e:Event {id:$id})
        SET e.event_id=$id,
            e.title=$title,
            e.name=$title,
            e.event_type=$event_type,
            e.event_date=$event_date,
            e.event_time=$event_date,
            e.event_date_norm=$event_date_norm,
            e.time_text=$event_date,
            e.time_norm=$event_date_norm,
            e.description=$description,
            e.evidence_id=$evidence_id,
            e.evidence_quote=$evidence_quote,
            e.evidence_url=$source_url,
            e.source='baidu_baike',
            e.source_name=$source_name,
            e.source_url=$source_url,
            e.extraction_schema='kg_schema_v1',
            e.review_status='auto_published',
            e.confidence=$confidence
        RETURN count(e) AS matched
    """
    edge_query = """
        MATCH (p:Place {element_id:$huaxi})-[r:HAS_EVENT]->(e:Event {id:$id})
        SET r.event_type=$event_type,
            r.event_date=$event_date,
            r.event_time=$event_date,
            r.event_date_norm=$event_date_norm,
            r.evidence_id=$evidence_id,
            r.evidence_quote=$evidence_quote,
            r.evidence_url=$source_url,
            r.source='baidu_baike',
            r.source_name=$source_name,
            r.source_url=$source_url,
            r.extraction_schema='kg_schema_v1',
            r.confidence=$confidence
    """

    updated = 0
    missing: list[str] = []
    # ``or []`` also covers an explicit ``"events": null`` in the JSON.
    for row in payload.get("events") or []:
        event_id = row.get("temp_id") or ""
        if not event_id:
            # Without an id there is no node to match against.
            continue
        span = first_span(row)
        params = {
            "huaxi": HUAXI_ELEMENT_ID,
            "id": event_id,
            "title": row.get("title") or "",
            "event_type": row.get("event_type") or "",
            "event_date": row.get("time_text") or "",
            "event_date_norm": row.get("time_norm") or "",
            "description": row.get("description") or "",
            "evidence_id": span.get("evidence_id") or BAIDU_BAIKE_SOURCE_ID,
            "evidence_quote": span.get("quote") or "",
            "source_name": BAIDU_BAIKE_SOURCE_NAME,
            "source_url": BAIDU_BAIKE_SOURCE_URL,
            "confidence": float(row.get("confidence") or 0),
        }

        matched_rows = graph.query(node_query, params).result_set
        if not (matched_rows and matched_rows[0][0]):
            # No Event node carries this id, so the edge pattern cannot
            # match either; record the id and skip the second round trip.
            missing.append(event_id)
            continue

        graph.query(edge_query, params)
        updated += 1

    print({
        "graph": settings.falkordb_graph,
        "updated_events": updated,
        "missing_events": missing,
        "huaxi_anchor": HUAXI_ELEMENT_ID,
    })
# Run the backfill only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()