Initial travel knowledge graph release
This commit is contained in:
308
scripts/publish_huaxi_kg_schema_v1_to_falkor.py
Normal file
308
scripts/publish_huaxi_kg_schema_v1_to_falkor.py
Normal file
@@ -0,0 +1,308 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Publish the Huaxi kg_schema_v1 preview into FalkorDB for graph-browser QA."""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Repository root, derived from this script's own location
# (<root>/scripts/<this file>) instead of a machine-specific absolute path,
# so the script runs from any checkout.
ROOT = Path(__file__).resolve().parent.parent
# Put the root on sys.path so project-local packages can be imported when this
# file is executed directly as a script.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
|
||||
|
||||
from falkordb import FalkorDB # noqa: E402
|
||||
|
||||
from app.config import settings # noqa: E402
|
||||
|
||||
# Input document: the kg_schema_v1 export that main() loads and publishes.
IN_JSON = ROOT / "docs/reports/huaxi_kg_schema_v1_ready.json"
# Id of the anchor node: main() uses it as the default location for events
# without a location_ref and as the source of inferred HAS_CONCEPT edges.
ROOT_PLACE_ID = "ent_huaxi_park"
# Provenance stamped on every node and edge ("百度百科" is Baidu Baike).
BAIDU_BAIKE_SOURCE_NAME = "百度百科"
# URL-encoded Baidu Baike article for 花溪公园 (Huaxi Park).
BAIDU_BAIKE_SOURCE_URL = "https://baike.baidu.com/item/%E8%8A%B1%E6%BA%AA%E5%85%AC%E5%9B%AD"
|
||||
|
||||
|
||||
def safe_token(value: str, fallback: str) -> str:
    """Reduce *value* to a token made only of ASCII letters, digits and ``_``.

    All other characters are dropped. If nothing remains, *fallback* is
    returned unchanged; if the remainder starts with a digit it is prefixed
    with ``"<fallback>_"``.
    """
    cleaned = "".join(re.findall(r"[A-Za-z0-9_]", value or ""))
    if cleaned == "":
        return fallback
    return f"{fallback}_{cleaned}" if cleaned[0].isdigit() else cleaned
|
||||
|
||||
|
||||
def rel_type(value: str) -> str:
    """Turn a predicate string into an upper-case relationship type.

    The upper-cased value is sanitized with ``safe_token`` (fallback
    ``"RELATED_TO"``); anything that still is not of the form
    ``[A-Z_][A-Z0-9_]*`` is replaced by ``"RELATED_TO"``.
    """
    candidate = safe_token(value.upper(), "RELATED_TO")
    if re.match(r"^[A-Z_][A-Z0-9_]*$", candidate):
        return candidate
    return "RELATED_TO"
|
||||
|
||||
|
||||
def node_label(value: str, fallback: str = "Entity") -> str:
    """Return *value* sanitized by ``safe_token``, or *fallback* if nothing usable remains."""
    label = safe_token(value, fallback)
    return label
|
||||
|
||||
|
||||
def literal_id(statement: dict[str, Any]) -> str:
    """Build a deterministic ``lit_``-prefixed id for a statement.

    The id is ``"lit_"`` plus the first 16 hex characters of the MD5 digest of
    ``subject_ref|predicate|object_ref`` (UTF-8 encoded). Missing fields are
    rendered as ``None``.
    """
    fields = ("subject_ref", "predicate", "object_ref")
    key = "|".join(str(statement.get(field)) for field in fields)
    return "lit_" + hashlib.md5(key.encode("utf-8")).hexdigest()[:16]
|
||||
|
||||
|
||||
def first_quote(row: dict[str, Any]) -> str:
    """Return the ``quote`` of the first entry in ``row["source_spans"]``.

    Yields ``""`` when there are no spans, when the first span is not a dict,
    or when its quote is missing or falsy.
    """
    spans = row.get("source_spans")
    if not spans:
        return ""
    head = spans[0]
    if not isinstance(head, dict):
        return ""
    return str(head.get("quote") or "")
|
||||
|
||||
|
||||
def first_evidence_id(row: dict[str, Any]) -> str:
    """Return the ``evidence_id`` of the first entry in ``row["source_spans"]``.

    Yields ``""`` when there are no spans, when the first span is not a dict,
    or when its evidence id is missing or falsy.
    """
    spans = row.get("source_spans") or []
    head = spans[0] if spans else None
    return str(head.get("evidence_id") or "") if isinstance(head, dict) else ""
|
||||
|
||||
|
||||
# Properties written by the base entity upsert. Free-form attributes must not
# overwrite them: an attribute whose sanitized name is "id" would otherwise
# replace the MERGE key and break re-runs and edge matching.
_RESERVED_ENTITY_PROPS = frozenset(
    {
        "id",
        "name",
        "entity_type",
        "description",
        "source",
        "source_name",
        "source_url",
        "extraction_schema",
        "review_status",
        "confidence",
        "evidence_quote",
    }
)


def _source_params() -> dict[str, str]:
    """Return the provenance parameters shared by every query."""
    return {
        "source_name": BAIDU_BAIKE_SOURCE_NAME,
        "source_url": BAIDU_BAIKE_SOURCE_URL,
    }


def _publish_entities(graph: Any, rows: list[dict[str, Any]]) -> None:
    """Upsert one labelled node per entity row, then copy its attributes onto it."""
    for row in rows:
        label = node_label(row.get("entity_type") or "Entity")
        graph.query(
            f"""
            MERGE (n:{label} {{id:$id}})
            SET n.name=$name,
                n.entity_type=$entity_type,
                n.description=$description,
                n.source='baidu_baike',
                n.source_name=$source_name,
                n.source_url=$source_url,
                n.extraction_schema='kg_schema_v1',
                n.review_status='auto_published',
                n.confidence=$confidence,
                n.evidence_quote=$evidence_quote
            """,
            {
                "id": row["temp_id"],
                "name": row.get("name") or "",
                "entity_type": row.get("entity_type") or "",
                "description": row.get("description") or "",
                "confidence": float(row.get("confidence") or 0),
                "evidence_quote": first_quote(row),
                **_source_params(),
            },
        )
        attrs = row.get("attributes") or {}
        if not attrs:
            continue
        assignments = []
        params = {"id": row["temp_id"]}
        for i, (key, value) in enumerate(attrs.items()):
            # Property names are interpolated into the query text, so they are
            # sanitized; values always travel as parameters.
            prop = safe_token(str(key), f"attr_{i}")
            if prop in _RESERVED_ENTITY_PROPS:
                # Keep the attribute, but out of the way of core properties.
                prop = f"attr_{prop}"
            pkey = f"v{i}"
            assignments.append(f"n.{prop}=${pkey}")
            params[pkey] = str(value)
        graph.query(f"MATCH (n:{label} {{id:$id}}) SET {', '.join(assignments)}", params)


def _publish_events(graph: Any, rows: list[dict[str, Any]]) -> None:
    """Upsert one ``Event`` node per event row."""
    for row in rows:
        graph.query(
            """
            MERGE (n:Event {id:$id})
            SET n.title=$title,
                n.name=$title,
                n.event_id=$id,
                n.event_type=$event_type,
                n.event_date=$time_text,
                n.event_time=$time_text,
                n.event_date_norm=$time_norm,
                n.time_text=$time_text,
                n.time_norm=$time_norm,
                n.description=$description,
                n.source='baidu_baike',
                n.source_name=$source_name,
                n.source_url=$source_url,
                n.evidence_url=$source_url,
                n.extraction_schema='kg_schema_v1',
                n.review_status='auto_published',
                n.confidence=$confidence,
                n.evidence_id=$evidence_id,
                n.evidence_quote=$evidence_quote
            """,
            {
                "id": row["temp_id"],
                "title": row.get("title") or "",
                "event_type": row.get("event_type") or "",
                "time_text": row.get("time_text") or "",
                "time_norm": row.get("time_norm") or "",
                "description": row.get("description") or "",
                "confidence": float(row.get("confidence") or 0),
                "evidence_id": first_evidence_id(row),
                "evidence_quote": first_quote(row),
                **_source_params(),
            },
        )


def _publish_concepts(graph: Any, rows: list[dict[str, Any]]) -> None:
    """Upsert one ``Concept`` node per concept row."""
    for row in rows:
        graph.query(
            """
            MERGE (n:Concept {id:$id})
            SET n.name=$name,
                n.concept_type=$concept_type,
                n.description=$description,
                n.source='baidu_baike',
                n.source_name=$source_name,
                n.source_url=$source_url,
                n.extraction_schema='kg_schema_v1',
                n.review_status='auto_published',
                n.confidence=$confidence,
                n.evidence_quote=$evidence_quote
            """,
            {
                "id": row["temp_id"],
                "name": row.get("name") or "",
                "concept_type": row.get("concept_type") or "",
                "description": row.get("description") or "",
                "confidence": float(row.get("confidence") or 0),
                "evidence_quote": first_quote(row),
                **_source_params(),
            },
        )


def _publish_statements(
    graph: Any,
    rows: list[dict[str, Any]],
    node_ids: set[str],
) -> tuple[set[tuple[str, str, str]], int, int]:
    """Publish statements as edges, materializing literal objects as nodes.

    Returns ``(explicit_edges, edge_count, literal_count)`` where
    ``explicit_edges`` holds the ``(subject, predicate, object)`` triples that
    were sent, so later inference passes can avoid duplicating them.
    """
    explicit_edges: set[tuple[str, str, str]] = set()
    edge_count = 0
    literal_count = 0
    for row in rows:
        subj = row.get("subject_ref")
        pred = rel_type(str(row.get("predicate") or "RELATED_TO"))
        obj = str(row.get("object_ref") or "")
        kind = row.get("object_kind")
        if not subj:
            continue
        params = {
            "sid": subj,
            "confidence": float(row.get("confidence") or 0),
            "evidence_quote": first_quote(row),
            "source": "baidu_baike",
            **_source_params(),
        }
        if kind == "literal" or obj not in node_ids:
            # The object is a plain value (or an unknown reference): give it
            # its own Literal node so the edge below has a target.
            lid = literal_id(row)
            params.update({"oid": lid, "value": obj, "predicate": pred})
            graph.query(
                """
                MERGE (o:Literal {id:$oid})
                SET o.name=$value, o.value=$value, o.predicate=$predicate,
                    o.source='baidu_baike',
                    o.source_name=$source_name,
                    o.source_url=$source_url,
                    o.extraction_schema='kg_schema_v1'
                """,
                params,
            )
            literal_count += 1
        else:
            params["oid"] = obj
        explicit_edges.add((str(subj), pred, str(params["oid"])))
        # The relationship type cannot be parameterized, hence the f-string;
        # ``pred`` has already been sanitized by rel_type().
        graph.query(
            f"""
            MATCH (s {{id:$sid}})
            MATCH (o {{id:$oid}})
            MERGE (s)-[r:{pred}]->(o)
            SET r.confidence=$confidence,
                r.evidence_quote=$evidence_quote,
                r.source=$source,
                r.source_name=$source_name,
                r.source_url=$source_url,
                r.extraction_schema='kg_schema_v1'
            """,
            params,
        )
        # NOTE: counts edge queries issued; if either MATCH finds no node the
        # query writes nothing but is still counted here.
        edge_count += 1
    return explicit_edges, edge_count, literal_count


def _link_events(
    graph: Any,
    rows: list[dict[str, Any]],
    explicit_edges: set[tuple[str, str, str]],
) -> int:
    """Add inferred ``HAS_EVENT`` edges from each event's location to the event.

    Events without ``location_ref`` hang off ``ROOT_PLACE_ID``. Edges already
    sent as explicit statements are skipped. Returns the number of queries issued.
    """
    issued = 0
    for row in rows:
        location_ref = row.get("location_ref") or ROOT_PLACE_ID
        event_id = row.get("temp_id")
        if not event_id or (location_ref, "HAS_EVENT", event_id) in explicit_edges:
            continue
        graph.query(
            """
            MATCH (p {id:$pid})
            MATCH (e:Event {id:$eid})
            MERGE (p)-[r:HAS_EVENT]->(e)
            SET r.confidence=$confidence,
                r.event_type=$event_type,
                r.event_date=$time_text,
                r.event_time=$time_text,
                r.evidence_id=$evidence_id,
                r.evidence_quote=$evidence_quote,
                r.evidence_url=$source_url,
                r.source='baidu_baike',
                r.source_name=$source_name,
                r.source_url=$source_url,
                r.extraction_schema='kg_schema_v1',
                r.inferred_from='event.location_ref'
            """,
            {
                "pid": location_ref,
                "eid": event_id,
                "confidence": float(row.get("confidence") or 0),
                "event_type": row.get("event_type") or "",
                "time_text": row.get("time_text") or "",
                "evidence_id": first_evidence_id(row),
                "evidence_quote": first_quote(row),
                **_source_params(),
            },
        )
        issued += 1
    return issued


def _link_concepts(
    graph: Any,
    rows: list[dict[str, Any]],
    explicit_edges: set[tuple[str, str, str]],
) -> int:
    """Add inferred ``HAS_CONCEPT`` edges from ``ROOT_PLACE_ID`` to each concept.

    Edges already sent as explicit statements are skipped. Returns the number
    of queries issued.
    """
    issued = 0
    for row in rows:
        concept_id = row.get("temp_id")
        if not concept_id or (ROOT_PLACE_ID, "HAS_CONCEPT", concept_id) in explicit_edges:
            continue
        graph.query(
            """
            MATCH (p {id:$pid})
            MATCH (c:Concept {id:$cid})
            MERGE (p)-[r:HAS_CONCEPT]->(c)
            SET r.confidence=$confidence,
                r.evidence_quote=$evidence_quote,
                r.source='baidu_baike',
                r.source_name=$source_name,
                r.source_url=$source_url,
                r.extraction_schema='kg_schema_v1',
                r.inferred_from='document_anchor'
            """,
            {
                "pid": ROOT_PLACE_ID,
                "cid": concept_id,
                "confidence": float(row.get("confidence") or 0),
                "evidence_quote": first_quote(row),
                **_source_params(),
            },
        )
        issued += 1
    return issued


def main() -> None:
    """Load the kg_schema_v1 JSON and upsert its nodes and edges into FalkorDB.

    Order: entity, event and concept nodes; statement edges (with Literal
    nodes for literal objects); then inferred HAS_EVENT / HAS_CONCEPT edges.
    A summary dict of counts is printed at the end.

    Raises:
        KeyError: if the payload lacks one of the ``entities``, ``events``,
            ``concepts`` or ``statements`` sections, or a node row has no
            ``temp_id``.
    """
    payload = json.loads(IN_JSON.read_text(encoding="utf-8"))
    graph = FalkorDB(host=settings.falkordb_host, port=settings.falkordb_port).select_graph(
        settings.falkordb_graph
    )

    # Collect ids up front: a node row without temp_id fails here, before
    # anything has been written to the graph.
    node_ids = {
        row["temp_id"]
        for section in ("entities", "events", "concepts")
        for row in payload[section]
    }

    _publish_entities(graph, payload["entities"])
    _publish_events(graph, payload["events"])
    _publish_concepts(graph, payload["concepts"])

    explicit_edges, edge_count, literal_count = _publish_statements(
        graph, payload["statements"], node_ids
    )
    edge_count += _link_events(graph, payload["events"], explicit_edges)
    edge_count += _link_concepts(graph, payload["concepts"], explicit_edges)

    print({
        "graph": settings.falkordb_graph,
        "entities": len(payload["entities"]),
        "events": len(payload["events"]),
        "concepts": len(payload["concepts"]),
        "statements": len(payload["statements"]),
        "literal_nodes": literal_count,
        "edges": edge_count,
    })
|
||||
|
||||
|
||||
# Publish only when executed as a script; importing the module has no such effect.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user