118 lines
3.9 KiB
Python
118 lines
3.9 KiB
Python
#!/usr/bin/env python3
|
|
"""Backfill Huaxi Event temporal/type/evidence fields into FalkorDB.
|
|
|
|
The extraction JSON already contains event_type and time_text/time_norm.
|
|
This script makes those fields explicit on Event nodes and HAS_EVENT edges
|
|
so graph browsing, timelines, and future event retrieval do not have to infer
|
|
time from descriptions.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
# Directory one level above this script's own directory.  It is put at the
# front of sys.path (once) so the project-local imports further down resolve
# when the file is executed directly as a script.
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
|
|
|
|
from falkordb import FalkorDB # noqa: E402
|
|
|
|
from app.config import settings # noqa: E402
|
|
|
|
# Extraction output read by main(); its "events" entries drive the backfill.
IN_JSON = ROOT / "docs/reports/huaxi_kg_schema_v1_ready.json"

# element_id of the Place node that anchors the HAS_EVENT edges updated below.
HUAXI_ELEMENT_ID = "amap:B035300A51"

# Provenance stamped on every updated node and edge.  The source id doubles as
# the fallback evidence id when a row's first span does not provide one.
BAIDU_BAIKE_SOURCE_ID = "baidu_baike_huaxi_park"
BAIDU_BAIKE_SOURCE_NAME = "百度百科"
# Baidu Baike article URL; the percent-encoded path segment is the UTF-8
# encoding of the article title.
BAIDU_BAIKE_SOURCE_URL = "https://baike.baidu.com/item/%E8%8A%B1%E6%BA%AA%E5%85%AC%E5%9B%AD"
|
|
|
|
|
|
def first_span(row: dict[str, Any]) -> dict[str, Any]:
    """Return the first source span of an extracted row, or ``{}`` if unusable.

    Args:
        row: One extracted record. Its ``"source_spans"`` value, when
            present, is expected to be a sequence of span dicts.

    Returns:
        The first element of ``row["source_spans"]`` when that value is a
        non-empty list/tuple whose first element is a dict; otherwise an
        empty dict, so callers can chain ``.get(...)`` without checks.
    """
    spans = row.get("source_spans")
    # Guard the container type as well as its content: indexing a JSON object
    # (dict) with [0] would raise KeyError instead of falling back to {}.
    if not isinstance(spans, (list, tuple)) or not spans:
        return {}
    head = spans[0]
    return head if isinstance(head, dict) else {}
|
|
|
|
|
|
def main() -> None:
    """Backfill event fields from the extraction JSON onto the graph.

    Reads ``IN_JSON`` and, for every entry under ``"events"`` that carries a
    ``temp_id``:

    1. sets the type/time/evidence/source properties on the ``Event`` node
       whose ``id`` equals that ``temp_id``;
    2. mirrors the temporal and evidence properties onto the ``HAS_EVENT``
       edge from the ``Place`` node identified by ``HUAXI_ELEMENT_ID`` to
       that event.

    Entries without a ``temp_id`` are skipped, as are ids for which no
    ``Event`` node exists. A summary dict (graph name, number of events
    actually matched and updated, anchor id) is printed at the end.
    """
    payload = json.loads(IN_JSON.read_text(encoding="utf-8"))
    graph = FalkorDB(
        host=settings.falkordb_host,
        port=settings.falkordb_port,
    ).select_graph(settings.falkordb_graph)

    # The query text is loop-invariant, so it is built once.  The event query
    # returns how many nodes it matched so the summary can report real
    # updates rather than the number of JSON rows seen.
    event_query = """
        MATCH (e:Event {id:$id})
        SET e.event_id=$id,
            e.title=$title,
            e.name=$title,
            e.event_type=$event_type,
            e.event_date=$event_date,
            e.event_time=$event_date,
            e.event_date_norm=$event_date_norm,
            e.time_text=$event_date,
            e.time_norm=$event_date_norm,
            e.description=$description,
            e.evidence_id=$evidence_id,
            e.evidence_quote=$evidence_quote,
            e.evidence_url=$source_url,
            e.source='baidu_baike',
            e.source_name=$source_name,
            e.source_url=$source_url,
            e.extraction_schema='kg_schema_v1',
            e.review_status='auto_published',
            e.confidence=$confidence
        RETURN count(e) AS matched
        """
    edge_query = """
        MATCH (p:Place {element_id:$huaxi})-[r:HAS_EVENT]->(e:Event {id:$id})
        SET r.event_type=$event_type,
            r.event_date=$event_date,
            r.event_time=$event_date,
            r.event_date_norm=$event_date_norm,
            r.evidence_id=$evidence_id,
            r.evidence_quote=$evidence_quote,
            r.evidence_url=$source_url,
            r.source='baidu_baike',
            r.source_name=$source_name,
            r.source_url=$source_url,
            r.extraction_schema='kg_schema_v1',
            r.confidence=$confidence
        """

    updated = 0
    # `or []` also covers an explicit `"events": null` in the JSON.
    for row in payload.get("events") or []:
        event_id = row.get("temp_id") or ""
        if not event_id:
            # Nothing to match on; skip before any conversion that could
            # raise on a row that would be ignored anyway.
            continue

        span = first_span(row)
        params = {
            "huaxi": HUAXI_ELEMENT_ID,
            "id": event_id,
            "title": row.get("title") or "",
            "event_type": row.get("event_type") or "",
            # The raw time text feeds event_date/event_time/time_text; the
            # normalized form feeds event_date_norm/time_norm.
            "event_date": row.get("time_text") or "",
            "event_date_norm": row.get("time_norm") or "",
            "description": row.get("description") or "",
            # Fall back to the source id when the span has no evidence id.
            "evidence_id": span.get("evidence_id") or BAIDU_BAIKE_SOURCE_ID,
            "evidence_quote": span.get("quote") or "",
            "source_name": BAIDU_BAIKE_SOURCE_NAME,
            "source_url": BAIDU_BAIKE_SOURCE_URL,
            "confidence": float(row.get("confidence") or 0),
        }

        matched_rows = graph.query(event_query, params).result_set
        if not matched_rows or not matched_rows[0][0]:
            # No Event node carries this id, so the HAS_EVENT edge pattern
            # (which requires the same node) cannot match either.
            continue

        graph.query(edge_query, params)
        updated += 1

    print({
        "graph": settings.falkordb_graph,
        "updated_events": updated,
        "huaxi_anchor": HUAXI_ELEMENT_ID,
    })
|
|
|
|
|
|
# Run the backfill only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|