#!/usr/bin/env python3
"""Fetch focused AMap POI fields for the travel_graph location resources.

This does not add arbitrary online hotels/restaurants into the sellable resource
library. It only enriches resources that already exist in the business files with
AMap id/address/region/coordinate/photo fields when a high-confidence match is
available.
"""
|
||
from __future__ import annotations
|
||
|
||
import csv
|
||
import importlib.util
|
||
import json
|
||
import os
|
||
import re
|
||
import time
|
||
from datetime import datetime
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
import requests
|
||
import urllib3
|
||
|
||
# query_pois() sends its requests with verify=False, so silence the
# InsecureRequestWarning that urllib3 would otherwise emit for them.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||
|
||
# Build script that is loaded as a module so its helpers and lookup tables can
# be reused by this enrichment step.
BUILD_SCRIPT = Path("/Users/xuexue/new2/scripts/build_travel_graph_existing_product_project.py")
# Directory that holds the extracted graph data; all paths below live in it.
OUT_DIR = Path("/Users/xuexue/Downloads/图谱数据/travel_graph_旅行社线路制定")
# Input: JSON list of extracted graph nodes.
NODES_PATH = OUT_DIR / "抽取结果_nodes.json"
# Output: JSON cache with one enrichment entry per processed node.
CACHE_PATH = OUT_DIR / "amap_poi_enrichment_cache.json"
# Output: CSV report with one row per processed node.
REPORT_CSV = OUT_DIR / "amap_poi_enrichment_report.csv"

# AMap Web Service endpoint queried by query_pois().
AMAP_TEXT_URL = "https://restapi.amap.com/v3/place/text"
|
||
|
||
|
||
def load_build_module():
    """Load the travel_graph build script at BUILD_SCRIPT as a module.

    The script is executed under the module name ``travel_build`` so that its
    helpers and lookup tables can be reused here.

    Returns:
        The executed module object.

    Raises:
        ImportError: if no import spec or loader can be created for
            BUILD_SCRIPT.
    """
    spec = importlib.util.spec_from_file_location("travel_build", BUILD_SCRIPT)
    # Check explicitly instead of using ``assert``: asserts are stripped under
    # ``python -O`` and a missing spec would otherwise fail later with an
    # unrelated AttributeError.
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot load build script: {BUILD_SCRIPT}")
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod
|
||
|
||
|
||
# Handle to the build script module; this file uses its compact() and norm()
# helpers, ATTRACTION_ADMIN_HINTS table and region_hint_for_text() function.
b = load_build_module()
|
||
|
||
|
||
def load_key() -> str:
    """Return the AMap Web Service key used for the POI queries.

    Lookup order:
      1. the ``AMAP_WEB_KEY`` environment variable,
      2. the ``AMAP_KEY`` environment variable,
      3. ``CONFIG["key"]`` of the local ``crawl_guiyan.py`` script, if that
         file exists and can be loaded.

    Raises:
        RuntimeError: if none of the sources yields a non-empty key.
    """
    for env_name in ("AMAP_WEB_KEY", "AMAP_KEY"):
        key = os.environ.get(env_name)
        if key:
            return key

    crawl_path = Path("/Users/xuexue/PycharmProjects/PythonProject/xuexue-CityGraph/crawl_guiyan.py")
    if crawl_path.exists():
        spec = importlib.util.spec_from_file_location("crawl_guiyan", crawl_path)
        # Explicit check instead of ``assert`` (stripped under ``python -O``);
        # an unloadable script simply falls through to the error below.
        if spec is not None and spec.loader is not None:
            mod = importlib.util.module_from_spec(spec)
            # NOTE: this runs the crawl script's top-level code just to read
            # its CONFIG mapping.
            spec.loader.exec_module(mod)
            key = (getattr(mod, "CONFIG", {}) or {}).get("key")
            if key:
                return key
    raise RuntimeError("未找到可用的高德 Web 服务 Key。请配置 AMAP_WEB_KEY 或保留 crawl_guiyan.py 中的 CONFIG['key']。")
|
||
|
||
|
||
def clean(value: Any) -> str:
    """Return ``value`` passed through the build script's ``compact`` helper.

    NOTE(review): ``compact`` is defined in the build script, not here; it is
    presumably a whitespace-compacting string normaliser - confirm there.
    """
    return b.compact(value)
|
||
|
||
|
||
def norm(value: Any) -> str:
    """Return ``value`` passed through the build script's ``norm`` helper.

    The result is what name_score() compares when matching names.
    NOTE(review): the exact normalisation rules live in the build script.
    """
    return b.norm(value)
|
||
|
||
|
||
def as_list(value: Any) -> list[str]:
    """Coerce a list-like field into a list of cleaned, non-empty strings.

    Accepted inputs:
      * a ``list`` - every element is cleaned, empty results are dropped;
      * a ``str`` holding a JSON array - decoded, then treated like a list;
      * any other ``str`` - split on list separators (ideographic comma,
        ASCII and full-width comma/semicolon, slash, newline).

    Anything else (``None``, numbers, dicts, ...) yields ``[]``.
    """
    if isinstance(value, list):
        items = value
    elif isinstance(value, str):
        try:
            decoded = json.loads(value)
        except (ValueError, RecursionError):
            # Not JSON (json.JSONDecodeError is a ValueError): fall back to
            # separator splitting below.
            decoded = None
        if isinstance(decoded, list):
            items = decoded
        else:
            # \uff0c / \uff1b are the full-width comma and semicolon used in
            # Chinese text.
            items = re.split(r"[、,\uff0c;\uff1b/\n]+", value)
    else:
        return []
    # Clean once per item and return the cleaned text in every branch.
    cleaned = (clean(item) for item in items)
    return [text for text in cleaned if text]
|
||
|
||
|
||
def expected_type_prefix(label: str) -> str:
    """Return the AMap typecode prefix expected for a node label.

    Returns an empty string for labels without an expected prefix, which
    callers treat as "no type check".
    """
    prefixes = {
        "ScenicAttraction": "110",
        "HotelResource": "100",
        "RestaurantResource": "050",
    }
    return prefixes.get(label, "")
|
||
|
||
|
||
def query_pois(key: str, keyword: str) -> list[dict[str, Any]]:
    """Run one AMap keyword search and return the POIs of the first page.

    Args:
        key: AMap Web Service key.
        keyword: search text sent as the ``keywords`` parameter.

    Returns:
        The ``pois`` list of the response (up to 12 entries, with extended
        fields), or ``[]`` when the search succeeded without results.

    Raises:
        requests.RequestException: on connection errors, timeouts or a
            non-2xx HTTP status.
        RuntimeError: when AMap answers with a status other than ``"1"``.
            Such answers used to be reported as "no results", which made them
            indistinguishable from a genuinely empty search.
    """
    params = {
        "key": key,
        "keywords": keyword,
        "city": "贵州",
        "citylimit": "false",
        "output": "json",
        "extensions": "all",
        "offset": 12,
        "page": 1,
    }
    # NOTE(security): verify=False disables TLS certificate validation while
    # the API key is sent in the query string. Kept as-is because the reason
    # for disabling it is not visible here - review before reusing elsewhere.
    response = requests.get(AMAP_TEXT_URL, params=params, timeout=(3, 8), verify=False)
    response.raise_for_status()
    payload = response.json()
    if payload.get("status") != "1":
        raise RuntimeError(
            f"amap status={payload.get('status')} "
            f"infocode={payload.get('infocode')} info={payload.get('info')}"
        )
    return payload.get("pois") or []
|
||
|
||
|
||
def region_hint_for_node(node: dict[str, Any]) -> tuple[str, str, str] | None:
    """Return the administrative-region hint for a node, if one is known.

    Scenic attractions are first looked up by cleaned name in the build
    script's ``ATTRACTION_ADMIN_HINTS`` table. For every other node, or when
    that lookup finds nothing, the hint is derived by
    ``region_hint_for_text`` from the node's name, city_or_area, address and
    city fields joined with spaces.

    Returns:
        A 3-tuple that callers unpack as ``(province, city, county)``, or
        ``None`` when no hint is available.
    """
    if node.get("label") == "ScenicAttraction":
        hint = b.ATTRACTION_ADMIN_HINTS.get(clean(node.get("name")))
        if hint:
            return hint
    text = " ".join(
        clean(node.get(field))
        for field in ("name", "city_or_area", "address", "city")
    )
    return b.region_hint_for_text(text)
|
||
|
||
|
||
def name_score(node: dict[str, Any], poi: dict[str, Any]) -> int:
    """Score how well the POI name matches the node's name or any alias.

    Each candidate name (node name plus aliases) is normalised and compared
    with the normalised POI name:
      * exact match            -> 82
      * one contains the other -> 64
      * otherwise              -> 10 per shared run of two or more CJK
                                  characters, capped at 45

    Returns:
        The best score over all candidate names; 0 when nothing is comparable.
    """
    poi_name = norm(poi.get("name"))
    if not poi_name:
        return 0

    # Loop-invariant work: compile the pattern and extract the POI runs once.
    # NOTE(review): findall yields maximal runs, so two names share a run only
    # when a whole run of consecutive CJK characters is identical.
    cjk_run = re.compile(r"[\u4e00-\u9fff]{2,}")
    poi_runs = set(cjk_run.findall(poi_name))

    best = 0
    for name in [clean(node.get("name")), *as_list(node.get("aliases"))]:
        candidate = norm(name)
        if not candidate:
            continue
        if candidate == poi_name:
            # 82 is the highest score any candidate can reach.
            return 82
        if candidate in poi_name or poi_name in candidate:
            best = max(best, 64)
        else:
            shared = len(set(cjk_run.findall(candidate)) & poi_runs)
            best = max(best, min(45, shared * 10))
    return best
|
||
|
||
|
||
def score_poi(node: dict[str, Any], poi: dict[str, Any]) -> tuple[int, str]:
    """Compute the match score of one POI for a node.

    The score starts from name_score() and is adjusted by:
      * type:   +18 when a POI typecode starts with the prefix expected for
                the node label, -18 when it does not (skipped for labels
                without an expected prefix);
      * region: +12 county match, else +8 city match, else -12 when both
                counties are known and differ (only when a hint exists);
      * +4 when the POI has a location, +2 when it has photos.

    Returns:
        ``(score, reasons)`` where ``reasons`` is a ``;``-joined trace of the
        adjustments that were applied.
    """
    score = name_score(node, poi)
    reasons: list[str] = [f"name={score}"]

    prefix = expected_type_prefix(node.get("label", ""))
    typecode = clean(poi.get("typecode"))
    if prefix:
        # A POI may carry several typecodes separated by "|"; accept a match
        # on any of them instead of only on the first one.
        if any(code.startswith(prefix) for code in typecode.split("|")):
            score += 18
            reasons.append("type=match")
        else:
            score -= 18
            reasons.append(f"type=mismatch:{typecode}")

    hint = region_hint_for_node(node)
    if hint:
        _, city, county = hint
        cityname = clean(poi.get("cityname"))
        adname = clean(poi.get("adname"))
        if county and county == adname:
            score += 12
            reasons.append("county=match")
        elif city and city == cityname:
            score += 8
            reasons.append("city=match")
        elif county and adname and county != adname:
            score -= 12
            reasons.append(f"county=conflict:{adname}")

    if poi.get("location"):
        score += 4
        reasons.append("geo=yes")
    if poi.get("photos"):
        score += 2
        reasons.append("photo=yes")
    return score, ";".join(reasons)
|
||
|
||
|
||
def has_hard_region_conflict(node: dict[str, Any], poi: dict[str, Any]) -> bool:
    """Return True when the POI lies in a different city than the node's hint.

    Without a region hint, or when either city name is empty, there is no
    conflict.
    """
    hint = region_hint_for_node(node)
    if not hint:
        return False
    _, city, _county = hint
    cityname = clean(poi.get("cityname"))
    # A city/province level mismatch is a hard reject. County names may differ
    # for scenic management zones (e.g. 百里杜鹃管理区 vs 黔西市), so county
    # differences are only penalised softly in score_poi().
    return bool(city and cityname and city != cityname)
|
||
|
||
|
||
def best_match(node: dict[str, Any], pois: list[dict[str, Any]]) -> tuple[dict[str, Any] | None, int, str]:
    """Pick the highest-scoring POI for a node, if it is confident enough.

    POIs with a hard region conflict are discarded first. The best remaining
    POI is accepted when its score reaches the threshold: 54 for scenic
    attractions, 70 for every other label.

    Returns:
        ``(poi, score, reason)``. ``poi`` is ``None`` when nothing survives
        the region filter (score 0, reason ``no_result_or_region_conflict``)
        or when the best score is below the threshold (the score and reason of
        that best candidate are still returned).
    """
    candidates = [poi for poi in pois if not has_hard_region_conflict(node, poi)]
    if not candidates:
        return None, 0, "no_result_or_region_conflict"

    scored = [(score_poi(node, poi), poi) for poi in candidates]
    # max() keeps the first of several equally scored POIs, the same winner a
    # stable descending sort would put first.
    (score, reason), poi = max(scored, key=lambda item: item[0][0])

    threshold = 54 if node.get("label") == "ScenicAttraction" else 70
    if score < threshold:
        return None, score, reason
    return poi, score, reason
|
||
|
||
|
||
def fields_from_poi(poi: dict[str, Any], score: int, reason: str) -> dict[str, Any]:
    """Build the ``amap_*`` enrichment fields for a matched POI.

    Args:
        poi: the accepted AMap POI record.
        score: match score; stored as ``score / 100`` rounded to 2 decimals.
        reason: scoring trace stored alongside the confidence.

    Returns:
        A dict with match status/confidence/reason, the POI identity, address,
        region and contact fields, photo URLs and a completeness note.
        ``amap_lng`` / ``amap_lat`` are floats, or ``""`` when the POI has no
        parsable ``"lng,lat"`` location.
    """
    # This runs outside the try block in main(), so a malformed record must
    # not raise here: that would abort the whole run before the cache is
    # written.
    lng: Any = ""
    lat: Any = ""
    location = poi.get("location")
    if isinstance(location, str) and "," in location:
        lng_text, lat_text = location.split(",", 1)
        try:
            lng, lat = float(lng_text), float(lat_text)
        except ValueError:
            lng = lat = ""

    biz_ext = poi.get("biz_ext") or {}
    if not isinstance(biz_ext, dict):
        biz_ext = {}

    photos = [
        photo.get("url")
        for photo in (poi.get("photos") or [])
        if isinstance(photo, dict) and photo.get("url")
    ]

    return {
        "amap_match_status": "matched",
        "amap_match_confidence": round(score / 100, 2),
        "amap_match_reason": reason,
        "amap_poi_id": poi.get("id"),
        "amap_name": poi.get("name"),
        "amap_type": poi.get("type"),
        "amap_typecode": poi.get("typecode"),
        "amap_address": poi.get("address"),
        "amap_location": poi.get("location"),
        "amap_lng": lng,
        "amap_lat": lat,
        "amap_pname": poi.get("pname"),
        "amap_cityname": poi.get("cityname"),
        "amap_adname": poi.get("adname"),
        "amap_adcode": poi.get("adcode"),
        "amap_tel": clean(poi.get("tel")),
        "amap_rating": clean(biz_ext.get("rating")),
        "amap_cost": clean(biz_ext.get("cost")),
        "amap_open_time": clean(biz_ext.get("open_time")),
        "amap_photo_urls": photos,
        "data_completeness_note": "高德POI补全:行政区、地址、坐标、电话/评分/人均/营业时间按接口可得字段写入。",
    }
|
||
|
||
|
||
def keyword_for_node(node: dict[str, Any]) -> str:
    """Build the AMap search keyword for a node.

    Scenic attractions use ``"<city> <county> <name>"`` from the region hint
    (empty parts skipped), or just the name when no hint exists. Other nodes
    use ``"<city_or_area> <name>"`` with a trailing ``区域`` removed from the
    area.
    """
    name = clean(node.get("name"))
    if node.get("label") == "ScenicAttraction":
        hint = region_hint_for_node(node)
        if not hint:
            return name
        _province, city, county = hint
        return " ".join(part for part in (city, county, name) if clean(part))

    region = re.sub(r"区域$", "", clean(node.get("city_or_area")))
    return f"{region} {name}".strip()
|
||
|
||
|
||
def main() -> None:
    """Enrich the location nodes with AMap POI data and write cache + report.

    Reads the node list from NODES_PATH, queries AMap once per scenic
    attraction / hotel / restaurant node, then writes:
      * CACHE_PATH - JSON with one entry per node, keyed by ``natural_key``;
      * REPORT_CSV - one row per node for manual review.
    A JSON summary with the matched/unmatched counts is printed at the end.

    Raises:
        RuntimeError: if NODES_PATH does not exist or no AMap key is found.
    """
    if not NODES_PATH.exists():
        raise RuntimeError(f"缺少节点文件:{NODES_PATH},请先构建一次 travel_graph。")
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    key = load_key()
    nodes = json.loads(NODES_PATH.read_text(encoding="utf-8"))
    target_labels = {"ScenicAttraction", "HotelResource", "RestaurantResource"}
    targets = [n for n in nodes if n.get("label") in target_labels]
    print(f"targets {len(targets)}", flush=True)

    # Fixed header so the report can be written even when there are no target
    # nodes (taking the keys from report_rows[0] raised IndexError then).
    report_fields = [
        "label",
        "name",
        "query",
        "status",
        "score",
        "amap_name",
        "city",
        "district",
        "reason",
    ]
    items: dict[str, dict[str, Any]] = {}
    report_rows: list[dict[str, Any]] = []

    for idx, node in enumerate(targets, start=1):
        keyword = keyword_for_node(node)
        print(f"[{idx}/{len(targets)}] {node.get('label')} {node.get('name')} -> {keyword}", flush=True)
        try:
            pois = query_pois(key, keyword)
            poi, score, reason = best_match(node, pois)
        except Exception as exc:
            # Best effort: one failing node must not abort the whole run; the
            # failure is recorded in the node's match reason instead.
            poi, score, reason = None, 0, f"api_error:{str(exc)[:120]}"

        if poi:
            fields = fields_from_poi(poi, score, reason)
            status = "matched"
        else:
            fields = {
                "amap_match_status": "unmatched",
                "amap_match_confidence": round(score / 100, 2) if score else 0,
                "amap_match_reason": reason,
                "data_completeness_note": "高德POI未高置信匹配,暂不写入坐标,避免误把同名/异地资源挂错。",
            }
            status = "unmatched"

        items[node["natural_key"]] = {
            "label": node.get("label"),
            "name": node.get("name"),
            "query": keyword,
            "status": status,
            "fields": fields,
        }
        report_rows.append({
            "label": node.get("label"),
            "name": node.get("name"),
            "query": keyword,
            "status": status,
            "score": fields.get("amap_match_confidence"),
            "amap_name": fields.get("amap_name", ""),
            "city": fields.get("amap_cityname", ""),
            "district": fields.get("amap_adname", ""),
            "reason": fields.get("amap_match_reason", ""),
        })
        if idx % 25 == 0:
            print(f"processed {idx}/{len(targets)}", flush=True)
        # Short pause between requests to throttle the query rate.
        time.sleep(0.12)

    cache_payload = {
        "generated_at": datetime.now().isoformat(timespec="seconds"),
        "source": "amap_place_text",
        "note": "只补全业务文件已有资源;不新增线上随机酒店/餐厅为可售资源。",
        "items": items,
    }
    CACHE_PATH.write_text(json.dumps(cache_payload, ensure_ascii=False, indent=2), encoding="utf-8")

    # utf-8-sig writes a BOM at the start of the file.
    with REPORT_CSV.open("w", newline="", encoding="utf-8-sig") as fh:
        writer = csv.DictWriter(fh, fieldnames=report_fields)
        writer.writeheader()
        writer.writerows(report_rows)

    matched = sum(1 for row in report_rows if row["status"] == "matched")
    print(json.dumps({"targets": len(targets), "matched": matched, "unmatched": len(targets) - matched, "cache": str(CACHE_PATH)}, ensure_ascii=False, indent=2))
|
||
|
||
|
||
# Run the enrichment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|