Files
bxh/scripts/enrich_travel_graph_amap_pois.py

309 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Fetch focused AMap POI fields for the travel_graph location resources.
This does not add arbitrary online hotels/restaurants into the sellable resource
library. It only enriches resources that already exist in the business files with
AMap id/address/region/coordinate/photo fields when a high-confidence match is
available.
"""
from __future__ import annotations
import csv
import importlib.util
import json
import os
import re
import time
from datetime import datetime
from pathlib import Path
from typing import Any
import requests
import urllib3
# query_pois() issues its HTTPS requests with verify=False; silence the
# InsecureRequestWarning urllib3 would otherwise emit for them.
# NOTE(review): this hides the warning process-wide, not just for this script's calls.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Build script that load_build_module() imports by file path.
# NOTE(review): absolute, machine-specific paths — the script only runs as-is on this workstation.
BUILD_SCRIPT = Path("/Users/xuexue/new2/scripts/build_travel_graph_existing_product_project.py")
# Directory holding the graph node file (input) and the enrichment outputs.
OUT_DIR = Path("/Users/xuexue/Downloads/图谱数据/travel_graph_旅行社线路制定")
# Input: JSON node list read by main().
NODES_PATH = OUT_DIR / "抽取结果_nodes.json"
# Output: JSON file with per-node enrichment results, written by main().
CACHE_PATH = OUT_DIR / "amap_poi_enrichment_cache.json"
# Output: CSV summary of every lookup, written by main().
REPORT_CSV = OUT_DIR / "amap_poi_enrichment_report.csv"
# AMap keyword-search endpoint queried by query_pois().
AMAP_TEXT_URL = "https://restapi.amap.com/v3/place/text"
def load_build_module():
    """Import the travel-graph build script at ``BUILD_SCRIPT`` as a module.

    The script is loaded by file path under the module name ``travel_build``
    and executed, so any top-level code it contains runs as a side effect.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec or loader can be created for
            ``BUILD_SCRIPT``.
    """
    spec = importlib.util.spec_from_file_location("travel_build", BUILD_SCRIPT)
    # Explicit check instead of ``assert``: assertions are stripped under
    # ``python -O`` and ``spec`` itself can be None, not just its loader.
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot create an import spec for {BUILD_SCRIPT}")
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod
# Build module, loaded once at import time. The helpers below reach into it as
# b.compact, b.norm, b.region_hint_for_text and b.ATTRACTION_ADMIN_HINTS.
b = load_build_module()
def load_key() -> str:
    """Return the AMap Web Service key used for POI queries.

    Lookup order:
      1. the ``AMAP_WEB_KEY`` environment variable,
      2. the ``AMAP_KEY`` environment variable,
      3. the ``key`` entry of ``CONFIG`` in the local ``crawl_guiyan.py``
         script, if that file exists.

    Empty values are skipped at every step.

    Raises:
        RuntimeError: If none of the sources yields a non-empty key.
    """
    for env_name in ("AMAP_WEB_KEY", "AMAP_KEY"):
        key = os.environ.get(env_name)
        if key:
            return key

    crawl_path = Path("/Users/xuexue/PycharmProjects/PythonProject/xuexue-CityGraph/crawl_guiyan.py")
    if crawl_path.exists():
        spec = importlib.util.spec_from_file_location("crawl_guiyan", crawl_path)
        # Explicit check rather than ``assert`` (stripped under ``python -O``);
        # an unloadable script simply falls through to the error below.
        if spec is not None and spec.loader is not None:
            mod = importlib.util.module_from_spec(spec)
            # NOTE(review): this executes the whole crawl script just to read
            # one config value; any top-level side effects in it will run.
            spec.loader.exec_module(mod)
            config = getattr(mod, "CONFIG", None)
            key = config.get("key") if isinstance(config, dict) else None
            if key:
                return key

    raise RuntimeError("未找到可用的高德 Web 服务 Key。请配置 AMAP_WEB_KEY 或保留 crawl_guiyan.py 中的 CONFIG['key']。")
def clean(value: Any) -> str:
    """Pass ``value`` through the build module's ``compact`` helper."""
    compacted = b.compact(value)
    return compacted
def norm(value: Any) -> str:
    """Pass ``value`` through the build module's ``norm`` helper."""
    normalized = b.norm(value)
    return normalized
def _clean_nonempty(values: Any) -> list[str]:
    """Run every item through ``clean`` and keep only the non-empty results."""
    cleaned = (clean(item) for item in values)
    return [text for text in cleaned if text]


def as_list(value: Any) -> list[str]:
    """Coerce a list-like field into a list of cleaned, non-empty strings.

    Accepted inputs:
      * a ``list`` - each element is cleaned;
      * a ``str`` holding a JSON array - decoded, then each element is cleaned;
      * any other ``str`` - split on ``、`` ``,`` ``;`` ``/`` or newlines,
        then each fragment is cleaned.

    Anything else yields an empty list.
    """
    if isinstance(value, list):
        return _clean_nonempty(value)
    if not isinstance(value, str):
        return []
    try:
        decoded = json.loads(value)
    except (ValueError, RecursionError):
        # Not JSON (JSONDecodeError is a ValueError): treat as delimited text.
        decoded = None
    if isinstance(decoded, list):
        return _clean_nonempty(decoded)
    # The fragments are cleaned here too, so all three branches return values
    # in the same form (previously this branch returned the raw fragments).
    return _clean_nonempty(re.split(r"[、,;/\n]+", value))
def expected_type_prefix(label: str) -> str:
    """Return the AMap typecode prefix expected for a node label.

    Labels without a configured prefix map to the empty string.
    """
    prefix_by_label = {
        "ScenicAttraction": "110",
        "HotelResource": "100",
        "RestaurantResource": "050",
    }
    try:
        return prefix_by_label[label]
    except KeyError:
        return ""
def query_pois(
    key: str,
    keyword: str,
    *,
    city: str = "贵州",
    page_size: int = 12,
) -> list[dict[str, Any]]:
    """Run one AMap keyword search and return the first page of POIs.

    Args:
        key: AMap Web Service key.
        keyword: Search text sent as ``keywords``.
        city: Value for the ``city`` request parameter. ``citylimit`` is sent
            as ``"false"``, as before.
        page_size: Number of results requested (the ``offset`` parameter).

    Returns:
        The ``pois`` list from the response, or an empty list when the
        request succeeded but returned no usable list.

    Raises:
        requests.RequestException: On transport errors or a non-2xx response.
        RuntimeError: When the API answers with a status other than ``"1"``
            (a request-level failure). Raising here, rather than returning
            ``[]``, lets the caller tell an API failure apart from a search
            with no hits.
    """
    params = {
        "key": key,
        "keywords": keyword,
        "city": city,
        "citylimit": "false",
        "output": "json",
        "extensions": "all",
        "offset": page_size,
        "page": 1,
    }
    # NOTE(review): verify=False disables TLS certificate checking while the
    # API key travels in the query string. Kept to preserve current behavior;
    # confirm whether it is still needed in this environment.
    response = requests.get(AMAP_TEXT_URL, params=params, timeout=(3, 8), verify=False)
    response.raise_for_status()
    payload = response.json()
    status = payload.get("status")
    if status != "1":
        # The key is deliberately left out of the message.
        raise RuntimeError(
            f"amap status={status} infocode={payload.get('infocode')} info={payload.get('info')}"
        )
    pois = payload.get("pois")
    return pois if isinstance(pois, list) else []
def region_hint_for_node(node: dict[str, Any]) -> tuple[str, str, str] | None:
    """Look up the administrative-region hint for a node.

    Scenic attractions are first looked up by cleaned name in the build
    module's ``ATTRACTION_ADMIN_HINTS``. If that gives nothing (or the node
    has another label), the node's name, city_or_area, address and city are
    cleaned, joined with spaces and handed to ``b.region_hint_for_text``.

    Returns:
        Whatever the build module provides; callers in this file unpack it as
        a ``(province, city, county)`` triple, or treat a falsy value as
        "no hint".
    """
    if node.get("label") == "ScenicAttraction":
        curated = b.ATTRACTION_ADMIN_HINTS.get(clean(node.get("name")))
        if curated:
            return curated
    text_fields = ("name", "city_or_area", "address", "city")
    haystack = " ".join(clean(node.get(field)) for field in text_fields)
    return b.region_hint_for_text(haystack)
def name_score(node: dict[str, Any], poi: dict[str, Any]) -> int:
    """Score how well a POI's name matches a node's name or any alias.

    Each candidate name is normalised and compared with the normalised POI
    name; the best result over all candidates is returned:

      * 82 - identical,
      * 64 - one contains the other,
      * otherwise 10 per shared run of two-or-more CJK characters, capped
        at 45.

    Returns 0 when nothing comparable is available.
    """
    # NOTE(review): the pattern matches maximal runs of CJK characters, so two
    # names only "share" a run when an entire run is identical in both.
    cjk_run = re.compile(r"[\u4e00-\u9fff]{2,}")
    target = norm(poi.get("name"))
    candidates = [clean(node.get("name")), *as_list(node.get("aliases"))]
    top = 0
    for raw in candidates:
        candidate = norm(raw)
        if not (candidate and target):
            continue
        if candidate == target:
            points = 82
        elif candidate in target or target in candidate:
            points = 64
        else:
            shared = set(cjk_run.findall(candidate)) & set(cjk_run.findall(target))
            points = min(45, len(shared) * 10)
        if points > top:
            top = points
    return top
def score_poi(node: dict[str, Any], poi: dict[str, Any]) -> tuple[int, str]:
    """Compute the overall match score of a POI for a node.

    Starts from ``name_score`` and then adjusts:

      * typecode: +18 if it starts with the prefix expected for the node's
        label, -18 if a prefix is expected and it does not;
      * region (only when a hint exists): +12 county match, else +8 city
        match, else -12 when both counties are known and differ;
      * +4 when the POI has a location, +2 when it has photos.

    Returns:
        ``(score, reason)`` where ``reason`` is the ``;``-joined list of the
        adjustments that were applied.
    """
    base = name_score(node, poi)
    total = base
    notes: list[str] = [f"name={base}"]

    wanted_prefix = expected_type_prefix(node.get("label", ""))
    typecode = clean(poi.get("typecode"))
    if wanted_prefix:
        if typecode.startswith(wanted_prefix):
            total += 18
            notes.append("type=match")
        else:
            total -= 18
            notes.append(f"type=mismatch:{typecode}")

    hint = region_hint_for_node(node)
    if hint:
        _, city, county = hint
        poi_city = clean(poi.get("cityname"))
        poi_county = clean(poi.get("adname"))
        if county and county == poi_county:
            total += 12
            notes.append("county=match")
        elif city and city == poi_city:
            total += 8
            notes.append("city=match")
        elif county and poi_county and county != poi_county:
            total -= 12
            notes.append(f"county=conflict:{poi_county}")

    # Small bonuses for POIs that carry usable payload.
    for field, bonus, note in (("location", 4, "geo=yes"), ("photos", 2, "photo=yes")):
        if poi.get(field):
            total += bonus
            notes.append(note)

    return total, ";".join(notes)
def has_hard_region_conflict(node: dict[str, Any], poi: dict[str, Any]) -> bool:
    """Tell whether a POI lies in a different city than the node's region hint.

    Only the city level is checked, and only when both the hinted city and
    the POI's city name are non-empty. Without a hint there is no conflict.
    """
    hint = region_hint_for_node(node)
    if not hint:
        return False
    _province, expected_city, _county = hint
    actual_city = clean(poi.get("cityname"))
    # A city-level mismatch is a hard reject. County names may legitimately
    # differ for scenic management zones (e.g. 百里杜鹃管理区 vs 黔西市), so
    # county differences are left to the soft scoring.
    if not expected_city or not actual_city:
        return False
    return expected_city != actual_city
def best_match(node: dict[str, Any], pois: list[dict[str, Any]]) -> tuple[dict[str, Any] | None, int, str]:
    """Pick the highest-scoring POI for a node, if it is confident enough.

    POIs with a hard region conflict are discarded first. The best remaining
    one (the earliest on ties) must reach the acceptance threshold: 54 for
    scenic attractions, 70 for every other label.

    Returns:
        ``(poi, score, reason)``. ``poi`` is ``None`` when nothing survived
        filtering (score 0) or when the best score is below the threshold
        (its score and reason are still reported).
    """
    candidates = []
    for poi in pois:
        if has_hard_region_conflict(node, poi):
            continue
        candidates.append((score_poi(node, poi), poi))
    if not candidates:
        return None, 0, "no_result_or_region_conflict"

    # max() returns the first maximal item, the same one a stable descending
    # sort would place first.
    (score, reason), winner = max(candidates, key=lambda entry: entry[0][0])
    minimum = 54 if node.get("label") == "ScenicAttraction" else 70
    if score >= minimum:
        return winner, score, reason
    return None, score, reason
def _parse_lng_lat(location: Any) -> tuple[Any, Any]:
    """Split a ``"lng,lat"`` string into two floats.

    Returns ``("", "")`` when the value is not a string, has no comma, or
    either half is not a valid number, so a half-parsed coordinate is never
    written.
    """
    if not isinstance(location, str) or "," not in location:
        return "", ""
    lng_text, lat_text = location.split(",", 1)
    try:
        return float(lng_text), float(lat_text)
    except ValueError:
        return "", ""


def fields_from_poi(poi: dict[str, Any], score: int, reason: str) -> dict[str, Any]:
    """Build the ``amap_*`` enrichment fields for a matched POI.

    Args:
        poi: POI dict as returned by the AMap search.
        score: Match score; stored as ``score / 100`` rounded to 2 decimals.
        reason: Scoring explanation, stored verbatim.

    Returns:
        A dict with match metadata, the POI's id/name/type/address/region
        fields, parsed coordinates (``""`` when unavailable), the cleaned
        tel / rating / cost / open_time values and the list of photo URLs.

    Malformed optional parts (an unparsable location, a non-dict ``biz_ext``,
    non-dict photo entries) are skipped instead of raising, so one odd POI
    cannot abort the whole run.
    """
    lng, lat = _parse_lng_lat(poi.get("location"))

    biz_ext = poi.get("biz_ext")
    if not isinstance(biz_ext, dict):
        biz_ext = {}

    photos = [
        photo["url"]
        for photo in (poi.get("photos") or [])
        if isinstance(photo, dict) and photo.get("url")
    ]

    return {
        "amap_match_status": "matched",
        "amap_match_confidence": round(score / 100, 2),
        "amap_match_reason": reason,
        "amap_poi_id": poi.get("id"),
        "amap_name": poi.get("name"),
        "amap_type": poi.get("type"),
        "amap_typecode": poi.get("typecode"),
        "amap_address": poi.get("address"),
        "amap_location": poi.get("location"),
        "amap_lng": lng,
        "amap_lat": lat,
        "amap_pname": poi.get("pname"),
        "amap_cityname": poi.get("cityname"),
        "amap_adname": poi.get("adname"),
        "amap_adcode": poi.get("adcode"),
        "amap_tel": clean(poi.get("tel")),
        "amap_rating": clean(biz_ext.get("rating")),
        "amap_cost": clean(biz_ext.get("cost")),
        "amap_open_time": clean(biz_ext.get("open_time")),
        "amap_photo_urls": photos,
        "data_completeness_note": "高德POI补全行政区、地址、坐标、电话/评分/人均/营业时间按接口可得字段写入。",
    }
def keyword_for_node(node: dict[str, Any]) -> str:
    """Build the search keyword sent to AMap for a node.

    * Scenic attractions: ``"<city> <county> <name>"`` from the region hint
      (parts that clean to empty are dropped), or just the name when there
      is no hint.
    * Everything else: ``"<city_or_area> <name>"`` with a trailing ``区域``
      removed from the area, stripped of surrounding whitespace.
    """
    name = clean(node.get("name"))
    if node.get("label") != "ScenicAttraction":
        area = re.sub(r"区域$", "", clean(node.get("city_or_area")))
        return f"{area} {name}".strip()

    hint = region_hint_for_node(node)
    if not hint:
        return name
    _province, city, county = hint
    pieces = [piece for piece in (city, county, name) if clean(piece)]
    return " ".join(pieces)
def main() -> None:
    """Enrich every scenic/hotel/restaurant node with AMap POI data.

    Reads the node list from ``NODES_PATH``, queries AMap once per target
    node, and writes two files into ``OUT_DIR``:

      * ``CACHE_PATH`` - JSON with one entry per node ``natural_key`` holding
        the query, match status and the ``amap_*`` fields;
      * ``REPORT_CSV`` - one summary row per node.

    A failure while querying or scoring a single node is recorded as
    ``api_error:...`` for that node and does not stop the run.

    Raises:
        RuntimeError: If the node file is missing or no AMap key is found.
    """
    if not NODES_PATH.exists():
        raise RuntimeError(f"缺少节点文件:{NODES_PATH},请先构建一次 travel_graph。")
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    key = load_key()
    nodes = json.loads(NODES_PATH.read_text(encoding="utf-8"))

    target_labels = {"ScenicAttraction", "HotelResource", "RestaurantResource"}
    targets = [n for n in nodes if n.get("label") in target_labels]
    total = len(targets)
    print(f"targets {total}", flush=True)

    # Fixed header: deriving it from the first row crashed with IndexError
    # when there were no target nodes.
    report_fields = [
        "label", "name", "query", "status", "score",
        "amap_name", "city", "district", "reason",
    ]
    items: dict[str, dict[str, Any]] = {}
    report_rows: list[dict[str, Any]] = []

    for idx, node in enumerate(targets, start=1):
        keyword = keyword_for_node(node)
        print(f"[{idx}/{total}] {node.get('label')} {node.get('name')} -> {keyword}", flush=True)
        try:
            pois = query_pois(key, keyword)
            poi, score, reason = best_match(node, pois)
        except Exception as exc:
            # Per-node boundary: record the failure and keep going.
            poi, score, reason = None, 0, f"api_error:{str(exc)[:120]}"

        if poi:
            fields = fields_from_poi(poi, score, reason)
            status = "matched"
        else:
            fields = {
                "amap_match_status": "unmatched",
                "amap_match_confidence": round(score / 100, 2) if score else 0,
                "amap_match_reason": reason,
                "data_completeness_note": "高德POI未高置信匹配暂不写入坐标避免误把同名/异地资源挂错。",
            }
            status = "unmatched"

        items[node["natural_key"]] = {
            "label": node.get("label"),
            "name": node.get("name"),
            "query": keyword,
            "status": status,
            "fields": fields,
        }
        report_rows.append({
            "label": node.get("label"),
            "name": node.get("name"),
            "query": keyword,
            "status": status,
            "score": fields.get("amap_match_confidence"),
            "amap_name": fields.get("amap_name", ""),
            "city": fields.get("amap_cityname", ""),
            "district": fields.get("amap_adname", ""),
            "reason": fields.get("amap_match_reason", ""),
        })

        if idx % 25 == 0:
            print(f"processed {idx}/{total}", flush=True)
        if idx < total:
            # Throttle between requests; no need to wait after the last one.
            time.sleep(0.12)

    CACHE_PATH.write_text(json.dumps({
        "generated_at": datetime.now().isoformat(timespec="seconds"),
        "source": "amap_place_text",
        "note": "只补全业务文件已有资源;不新增线上随机酒店/餐厅为可售资源。",
        "items": items,
    }, ensure_ascii=False, indent=2), encoding="utf-8")

    # utf-8-sig writes a BOM at the start of the CSV file.
    with REPORT_CSV.open("w", newline="", encoding="utf-8-sig") as fh:
        writer = csv.DictWriter(fh, fieldnames=report_fields)
        writer.writeheader()
        writer.writerows(report_rows)

    matched = sum(1 for row in report_rows if row["status"] == "matched")
    print(json.dumps(
        {"targets": total, "matched": matched, "unmatched": total - matched, "cache": str(CACHE_PATH)},
        ensure_ascii=False,
        indent=2,
    ))


if __name__ == "__main__":
    main()