#!/usr/bin/env python3 """Publish the Huaxi kg_schema_v1 preview into FalkorDB for graph-browser QA.""" from __future__ import annotations import hashlib import json import re import sys from pathlib import Path from typing import Any ROOT = Path(__file__).resolve().parents[1] if str(ROOT) not in sys.path: sys.path.insert(0, str(ROOT)) from falkordb import FalkorDB # noqa: E402 from app.config import settings # noqa: E402 IN_JSON = ROOT / "docs/reports/huaxi_kg_schema_v1_ready.json" ROOT_PLACE_ID = "ent_huaxi_park" BAIDU_BAIKE_SOURCE_NAME = "百度百科" BAIDU_BAIKE_SOURCE_URL = "https://baike.baidu.com/item/%E8%8A%B1%E6%BA%AA%E5%85%AC%E5%9B%AD" def safe_token(value: str, fallback: str) -> str: token = re.sub(r"[^A-Za-z0-9_]", "", value or "") if not token: return fallback if token[0].isdigit(): token = f"{fallback}_{token}" return token def rel_type(value: str) -> str: token = safe_token(value.upper(), "RELATED_TO") return token if re.match(r"^[A-Z_][A-Z0-9_]*$", token) else "RELATED_TO" def node_label(value: str, fallback: str = "Entity") -> str: return safe_token(value, fallback) def literal_id(statement: dict[str, Any]) -> str: raw = f"{statement.get('subject_ref')}|{statement.get('predicate')}|{statement.get('object_ref')}" digest = hashlib.md5(raw.encode("utf-8")).hexdigest()[:16] return f"lit_{digest}" def first_quote(row: dict[str, Any]) -> str: spans = row.get("source_spans") or [] if spans and isinstance(spans[0], dict): return str(spans[0].get("quote") or "") return "" def first_evidence_id(row: dict[str, Any]) -> str: spans = row.get("source_spans") or [] if spans and isinstance(spans[0], dict): return str(spans[0].get("evidence_id") or "") return "" def main() -> None: payload = json.loads(IN_JSON.read_text(encoding="utf-8")) graph = FalkorDB(host=settings.falkordb_host, port=settings.falkordb_port).select_graph( settings.falkordb_graph ) entity_ids = {row["temp_id"] for row in payload["entities"]} event_ids = {row["temp_id"] for row in payload["events"]} concept_ids = {row["temp_id"] for row in payload["concepts"]} for row in payload["entities"]: label = node_label(row.get("entity_type") or "Entity") graph.query( f""" MERGE (n:{label} {{id:$id}}) SET n.name=$name, n.entity_type=$entity_type, n.description=$description, n.source='baidu_baike', n.source_name=$source_name, n.source_url=$source_url, n.extraction_schema='kg_schema_v1', n.review_status='auto_published', n.confidence=$confidence, n.evidence_quote=$evidence_quote """, { "id": row["temp_id"], "name": row.get("name") or "", "entity_type": row.get("entity_type") or "", "description": row.get("description") or "", "confidence": float(row.get("confidence") or 0), "evidence_quote": first_quote(row), "source_name": BAIDU_BAIKE_SOURCE_NAME, "source_url": BAIDU_BAIKE_SOURCE_URL, }, ) attrs = row.get("attributes") or {} if attrs: sets = [] params = {"id": row["temp_id"]} for i, (key, value) in enumerate(attrs.items()): prop = safe_token(str(key), f"attr_{i}") pkey = f"v{i}" sets.append(f"n.{prop}=${pkey}") params[pkey] = str(value) graph.query(f"MATCH (n:{label} {{id:$id}}) SET {', '.join(sets)}", params) for row in payload["events"]: graph.query( """ MERGE (n:Event {id:$id}) SET n.title=$title, n.name=$title, n.event_id=$id, n.event_type=$event_type, n.event_date=$time_text, n.event_time=$time_text, n.event_date_norm=$time_norm, n.time_text=$time_text, n.time_norm=$time_norm, n.description=$description, n.source='baidu_baike', n.source_name=$source_name, n.source_url=$source_url, n.evidence_url=$source_url, n.extraction_schema='kg_schema_v1', n.review_status='auto_published', n.confidence=$confidence, n.evidence_id=$evidence_id, n.evidence_quote=$evidence_quote """, { "id": row["temp_id"], "title": row.get("title") or "", "event_type": row.get("event_type") or "", "time_text": row.get("time_text") or "", "time_norm": row.get("time_norm") or "", "description": row.get("description") or "", "confidence": float(row.get("confidence") or 0), "evidence_id": first_evidence_id(row), "evidence_quote": first_quote(row), "source_name": BAIDU_BAIKE_SOURCE_NAME, "source_url": BAIDU_BAIKE_SOURCE_URL, }, ) for row in payload["concepts"]: graph.query( """ MERGE (n:Concept {id:$id}) SET n.name=$name, n.concept_type=$concept_type, n.description=$description, n.source='baidu_baike', n.source_name=$source_name, n.source_url=$source_url, n.extraction_schema='kg_schema_v1', n.review_status='auto_published', n.confidence=$confidence, n.evidence_quote=$evidence_quote """, { "id": row["temp_id"], "name": row.get("name") or "", "concept_type": row.get("concept_type") or "", "description": row.get("description") or "", "confidence": float(row.get("confidence") or 0), "evidence_quote": first_quote(row), "source_name": BAIDU_BAIKE_SOURCE_NAME, "source_url": BAIDU_BAIKE_SOURCE_URL, }, ) node_ids = entity_ids | event_ids | concept_ids edge_count = 0 literal_count = 0 explicit_edges: set[tuple[str, str, str]] = set() for row in payload["statements"]: subj = row.get("subject_ref") pred = rel_type(str(row.get("predicate") or "RELATED_TO")) obj = str(row.get("object_ref") or "") kind = row.get("object_kind") if not subj: continue params = { "sid": subj, "confidence": float(row.get("confidence") or 0), "evidence_quote": first_quote(row), "source": "baidu_baike", "source_name": BAIDU_BAIKE_SOURCE_NAME, "source_url": BAIDU_BAIKE_SOURCE_URL, } if kind == "literal" or obj not in node_ids: lid = literal_id(row) params.update({"oid": lid, "value": obj, "predicate": pred}) graph.query( """ MERGE (o:Literal {id:$oid}) SET o.name=$value, o.value=$value, o.predicate=$predicate, o.source='baidu_baike', o.source_name=$source_name, o.source_url=$source_url, o.extraction_schema='kg_schema_v1' """, params, ) literal_count += 1 else: params["oid"] = obj explicit_edges.add((str(subj), pred, str(params["oid"]))) graph.query( f""" MATCH (s {{id:$sid}}) MATCH (o {{id:$oid}}) MERGE (s)-[r:{pred}]->(o) SET r.confidence=$confidence, r.evidence_quote=$evidence_quote, r.source=$source, r.source_name=$source_name, r.source_url=$source_url, r.extraction_schema='kg_schema_v1' """, params, ) edge_count += 1 for row in payload["events"]: location_ref = row.get("location_ref") or ROOT_PLACE_ID event_id = row.get("temp_id") if not event_id or (location_ref, "HAS_EVENT", event_id) in explicit_edges: continue graph.query( """ MATCH (p {id:$pid}) MATCH (e:Event {id:$eid}) MERGE (p)-[r:HAS_EVENT]->(e) SET r.confidence=$confidence, r.event_type=$event_type, r.event_date=$time_text, r.event_time=$time_text, r.evidence_id=$evidence_id, r.evidence_quote=$evidence_quote, r.evidence_url=$source_url, r.source='baidu_baike', r.source_name=$source_name, r.source_url=$source_url, r.extraction_schema='kg_schema_v1', r.inferred_from='event.location_ref' """, { "pid": location_ref, "eid": event_id, "confidence": float(row.get("confidence") or 0), "event_type": row.get("event_type") or "", "time_text": row.get("time_text") or "", "evidence_id": first_evidence_id(row), "evidence_quote": first_quote(row), "source_name": BAIDU_BAIKE_SOURCE_NAME, "source_url": BAIDU_BAIKE_SOURCE_URL, }, ) edge_count += 1 for row in payload["concepts"]: concept_id = row.get("temp_id") if not concept_id or (ROOT_PLACE_ID, "HAS_CONCEPT", concept_id) in explicit_edges: continue graph.query( """ MATCH (p {id:$pid}) MATCH (c:Concept {id:$cid}) MERGE (p)-[r:HAS_CONCEPT]->(c) SET r.confidence=$confidence, r.evidence_quote=$evidence_quote, r.source='baidu_baike', r.source_name=$source_name, r.source_url=$source_url, r.extraction_schema='kg_schema_v1', r.inferred_from='document_anchor' """, { "pid": ROOT_PLACE_ID, "cid": concept_id, "confidence": float(row.get("confidence") or 0), "evidence_quote": first_quote(row), "source_name": BAIDU_BAIKE_SOURCE_NAME, "source_url": BAIDU_BAIKE_SOURCE_URL, }, ) edge_count += 1 print({ "graph": settings.falkordb_graph, "entities": len(payload["entities"]), "events": len(payload["events"]), "concepts": len(payload["concepts"]), "statements": len(payload["statements"]), "literal_nodes": literal_count, "edges": edge_count, }) if __name__ == "__main__": main()