#!/usr/bin/env python3
"""Enrich travel POI CSVs with AMap Web API data.

Outputs are written next to the delivery CSVs and source files are not changed.
"""

from __future__ import annotations

import csv
import hashlib
import json
import math
import ssl
import sys
import time
import urllib.parse
import urllib.request
from pathlib import Path
from typing import Any

# NOTE(review): the space in "travel- graph" looks like a line-wrap/paste
# artifact -- confirm the real directory name before running.
BASE_DIR = Path("/Users/xuexue/Documents/trae_projects/travel- graph/delivery_20260602")
OUT_DIR = BASE_DIR / "amap_enriched"
ENV_PATH = Path("/Users/xuexue/Desktop/zn-kg/.env")
CACHE_PATH = OUT_DIR / "_amap_cache.json"

# Category codes passed as the ``types`` filter of the AMap place searches
# issued elsewhere in this file.
SCENIC_TYPES = "110000"
HOTEL_TYPES = "100000"
RESTAURANT_TYPES = "050000"


def read_env_key(path: Path, key: str) -> str:
    """Return the value of *key* from a dotenv-style file.

    Blank lines, ``#`` comments and lines without ``=`` are skipped.  The
    value is stripped of surrounding whitespace and of surrounding double,
    then single, quotes.  Returns ``""`` when the file does not exist or the
    key is not present.
    """
    if not path.exists():
        return ""
    # utf-8-sig drops a leading BOM, which would otherwise be glued to the
    # first key and make it unmatchable; undecodable bytes are still ignored.
    content = path.read_text(encoding="utf-8-sig", errors="ignore")
    for raw_line in content.splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith("#") or "=" not in entry:
            continue
        name, _, value = entry.partition("=")
        if name.strip() == key:
            return value.strip().strip('"').strip("'")
    return ""


def read_csv(path: Path) -> tuple[list[str], list[dict[str, str]]]:
    """Read a UTF-8 (optionally BOM-prefixed) CSV file.

    Returns ``(header_names, rows)`` where each row is a dict keyed by header.
    """
    with path.open("r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        return list(reader.fieldnames or []), list(reader)


def write_csv(path: Path, rows: list[dict[str, Any]], preferred: list[str] | None = None) -> None:
    """Write *rows* to *path* as a BOM-prefixed UTF-8 CSV.

    Columns are the *preferred* names first (duplicates dropped), followed by
    every other key in first-seen order across *rows*.  Keys missing from a
    row are written as empty cells.  Parent directories are created on demand.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # A dict keeps insertion order and gives O(1) "already seen" checks.
    columns: dict[str, None] = dict.fromkeys(preferred or [])
    for row in rows:
        for column in row:
            columns.setdefault(column, None)
    keys = list(columns)
    with path.open("w", encoding="utf-8-sig", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=keys, extrasaction="ignore")
        writer.writeheader()
        for row in rows:
            writer.writerow({k: row.get(k, "") for k in keys})


def load_cache(path: Path | None = None) -> dict[str, Any]:
    """Load the JSON response cache from *path* (default: ``CACHE_PATH``).

    Returns an empty dict when the file is missing, cannot be read or parsed,
    or does not contain a JSON object.  A damaged cache only costs repeated
    API calls, so it is reported on stderr instead of aborting the run.
    """
    target = CACHE_PATH if path is None else path
    if not target.exists():
        return {}
    try:
        data = json.loads(target.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:  # JSON/Unicode decode errors are ValueErrors
        print(f"ignoring unreadable cache {target}: {exc}", file=sys.stderr)
        return {}
    return data if isinstance(data, dict) else {}


def save_cache(cache: dict[str, Any], path: Path | None = None) -> None:
    """Persist *cache* as pretty-printed UTF-8 JSON to *path* (default: ``CACHE_PATH``).

    The payload is written to a sibling ``.tmp`` file and then moved over the
    target, so an interrupted run does not leave a truncated cache behind.
    """
    target = CACHE_PATH if path is None else path
    target.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(cache, ensure_ascii=False, indent=2)
    scratch = target.with_name(target.name + ".tmp")
    scratch.write_text(payload, encoding="utf-8")
    scratch.replace(target)
class AMapClient:
    """Small AMap Web API client with a response cache and retry handling.

    Attributes:
        key: API key appended to every request.
        cache: Mapping of request digest -> successful JSON response; shared
            with the caller, which is responsible for the final save.
        ctx: SSL context used for every request.
        calls: Number of live HTTP responses received (cache hits excluded).
        errors: One human-readable entry per request that ultimately failed.
    """

    def __init__(self, key: str, cache: dict[str, Any], *, verify_tls: bool = False) -> None:
        """Create a client.

        Args:
            key: AMap API key.
            cache: Dict used as the response cache (mutated in place).
            verify_tls: When true, verify the server certificate.  Defaults to
                false to keep the historical behaviour of this script.
        """
        self.key = key
        self.cache = cache
        # NOTE(review): the default context performs NO certificate
        # verification, so the API key travels over an unauthenticated TLS
        # channel.  Pass verify_tls=True once the local CA store is usable.
        self.ctx = ssl.create_default_context() if verify_tls else ssl._create_unverified_context()
        self.calls = 0
        self.errors: list[str] = []

    def get(self, endpoint: str, params: dict[str, Any], retries: int = 3) -> dict[str, Any]:
        """GET *endpoint* with *params* and return the decoded JSON body.

        Successful responses (``status == "1"``) are cached under the SHA-1 of
        the endpoint plus the sorted parameters; the API key is left out of
        the cache key.  The cache is written to disk every 20 live calls.

        QPS-limit replies and transport/decoding exceptions are retried up to
        *retries* times with a growing sleep; any other API error stops
        immediately.  On failure the problem is appended to ``self.errors``
        and a stub ``{"status": "0", "info": ..., "pois": []}`` is returned.
        """
        full_params = {**params, "key": self.key}
        cache_params = {k: v for k, v in full_params.items() if k != "key"}
        cache_key = endpoint + "?" + urllib.parse.urlencode(sorted(cache_params.items()), doseq=True)
        digest = hashlib.sha1(cache_key.encode()).hexdigest()
        if digest in self.cache:
            return self.cache[digest]

        url = "https://restapi.amap.com" + endpoint + "?" + urllib.parse.urlencode(full_params)
        last_error = ""
        for attempt in range(retries):
            try:
                with urllib.request.urlopen(url, timeout=20, context=self.ctx) as resp:
                    data = json.loads(resp.read().decode("utf-8"))
                self.calls += 1
                if data.get("status") == "1":
                    self.cache[digest] = data
                    if self.calls % 20 == 0:
                        save_cache(self.cache)
                    # Short pause between successful calls to stay under the rate limit.
                    time.sleep(0.06)
                    return data
                last_error = f"{data.get('infocode')} {data.get('info')}"
                # "QPS" also matches the "CUQPS" variants of the rate-limit message.
                if "QPS" in last_error:
                    time.sleep(1.5 + attempt)
                    continue
                break
            except Exception as exc:  # noqa: BLE001
                # Deliberate catch-all: any transport or decoding failure is retried.
                last_error = str(exc)
                time.sleep(0.8 + attempt)
        self.errors.append(f"{endpoint} {cache_params} -> {last_error}")
        return {"status": "0", "info": last_error, "pois": []}

    def place_text(self, keywords: str, city: str = "", types: str = "", offset: int = 20) -> list[dict[str, Any]]:
        """Run a keyword search (``/v3/place/text``) and return its ``pois`` list.

        *city* and *types* are only sent when non-empty.  Returns ``[]`` when
        the response carries no POIs.
        """
        params: dict[str, Any] = {
            "keywords": keywords,
            "offset": offset,
            "page": 1,
            "extensions": "all",
            "children": 1,
        }
        if city:
            params["city"] = city
        if types:
            params["types"] = types
        data = self.get("/v3/place/text", params)
        return data.get("pois") or []

    def place_around(self, location: str, types: str, radius: int, offset: int = 25) -> list[dict[str, Any]]:
        """Run an around search (``/v3/place/around``) sorted by distance.

        *location* is a ``"lng,lat"`` string.  Returns the ``pois`` list, or
        ``[]`` when the response carries none.
        """
        params = {
            "location": location,
            "types": types,
            "radius": radius,
            "offset": offset,
            "page": 1,
            "extensions": "all",
            "sortrule": "distance",
        }
        data = self.get("/v3/place/around", params)
        return data.get("pois") or []

    def driving(self, origin: str, destination: str) -> dict[str, Any]:
        """Return driving metrics between two ``"lng,lat"`` points.

        Uses the first path of ``/v3/direction/driving``.  When no path is
        returned the result only contains ``drive_status`` (the API ``info``
        text, or ``"NO_ROUTE"``).  Otherwise ``drive_status`` is ``"OK"`` and
        distance/duration are given in m/km and s/min, with ``""`` for values
        that could not be parsed.
        """
        data = self.get(
            "/v3/direction/driving",
            {"origin": origin, "destination": destination, "extensions": "base", "strategy": 0},
        )
        paths = (data.get("route") or {}).get("paths") or []
        if not paths:
            return {"drive_status": data.get("info") or "NO_ROUTE"}
        first = paths[0]
        distance_m = to_float(first.get("distance"))
        duration_s = to_float(first.get("duration"))
        return {
            "drive_status": "OK",
            "drive_distance_m": int(distance_m) if distance_m is not None else "",
            "drive_distance_km": round(distance_m / 1000, 2) if distance_m is not None else "",
            "drive_duration_s": int(duration_s) if duration_s is not None else "",
            "drive_duration_min": round(duration_s / 60, 1) if duration_s is not None else "",
            "drive_tolls": _amap_value(first.get("tolls", "")),
            "drive_traffic_lights": _amap_value(first.get("traffic_lights", "")),
        }


def _amap_value(value: Any) -> Any:
    """Map the "no value" placeholders (``None`` / ``[]``) to ``""``.

    ``to_float`` already treats an empty list as a missing value; without the
    same normalisation here such fields would land in the CSVs as ``[]``.
    """
    if value is None or value == []:
        return ""
    return value


def to_float(value: Any) -> float | None:
    """Convert *value* to ``float``; return ``None`` for empty or unparsable input."""
    try:
        if value is None or value == "" or value == []:
            return None
        return float(value)
    except (TypeError, ValueError, OverflowError):
        return None


def norm_name(value: str) -> str:
    """Normalise a place name for fuzzy comparison.

    Removes generic scenic-area wording and the province prefix, drops every
    character that is neither alphanumeric nor in U+4E00..U+9FFF, and
    lower-cases the rest.
    """
    text = str(value or "")
    for token in ["风景名胜区", "风景区", "旅游景区", "景区", "旅游区", "景点", "国家级", "贵州省"]:
        text = text.replace(token, "")
    return "".join(ch for ch in text if ch.isalnum() or "\u4e00" <= ch <= "\u9fff").lower()


def poi_score(poi: dict[str, Any], target_name: str, city: str = "", district: str = "") -> int:
    """Score how well *poi* matches the wanted place (higher is better).

    Points: 120 for an equal normalised name or 80 when one contains the
    other; 25 for a scenic-looking type; 12 when the POI city starts with the
    first two characters of *city*; 18 when the district (first ``/`` segment,
    without the 县/区 suffix characters) occurs in the POI district; 5 when
    the POI has photos.
    """
    score = 0
    wanted = norm_name(target_name)
    found = norm_name(str(poi.get("name") or ""))
    if wanted and found:
        if wanted == found:
            score += 120
        elif wanted in found or found in wanted:
            score += 80
    poi_type = str(poi.get("type") or "")
    if "风景" in poi_type or "景点" in poi_type:
        score += 25
    if city and str(poi.get("cityname") or "").startswith(city[:2]):
        score += 12
    d_clean = district.split("/")[0].replace("县", "").replace("区", "")
    if d_clean and d_clean in str(poi.get("adname") or ""):
        score += 18
    if poi.get("photos"):
        score += 5
    return score


def select_best_poi(pois: list[dict[str, Any]], name: str, city: str = "", district: str = "") -> dict[str, Any] | None:
    """Return the highest-scoring POI, or ``None`` for an empty list.

    Ties keep the earliest candidate, i.e. the order returned by the API.
    """
    if not pois:
        return None
    return max(pois, key=lambda p: poi_score(p, name, city, district))


def parse_location(location: str) -> tuple[str, str]:
    """Split a ``"lng,lat"`` string into ``(lng, lat)``; ``("", "")`` if malformed."""
    if not location or "," not in location:
        return "", ""
    lng, lat = location.split(",", 1)
    return lng.strip(), lat.strip()


def photo_urls(poi: dict[str, Any]) -> list[str]:
    """Return the distinct, non-empty photo URLs of *poi* in original order."""
    urls: list[str] = []
    for item in poi.get("photos") or []:
        url = str(item.get("url") or "").strip()
        if url and url not in urls:
            urls.append(url)
    return urls


def amap_marker_url(lng: str, lat: str, name: str) -> str:
    """Build an ``uri.amap.com`` marker link, or ``""`` when a coordinate is missing."""
    if not lng or not lat:
        return ""
    query = urllib.parse.urlencode(
        {
            "position": f"{lng},{lat}",
            "name": name,
            "src": "znkg",
            "coordinate": "gaode",
            "callnative": "0",
        }
    )
    return "https://uri.amap.com/marker?" + query


def poi_common_fields(poi: dict[str, Any]) -> dict[str, Any]:
    """Flatten an AMap POI into the shared output columns.

    Missing values (absent keys, ``None`` or empty-list placeholders) are
    emitted as ``""`` so they do not show up as ``[]`` in the CSV output.
    """
    lng, lat = parse_location(str(poi.get("location") or ""))
    photos = photo_urls(poi)
    name = str(poi.get("name") or "")

    def field(source_key: str) -> Any:
        return _amap_value(poi.get(source_key, ""))

    return {
        "amap_name": name,
        "amap_poi_id": field("id"),
        "amap_type": field("type"),
        "amap_typecode": field("typecode"),
        "province": field("pname"),
        "city": field("cityname"),
        "district": field("adname"),
        "adcode": field("adcode"),
        "town": field("townname"),
        "business_area": field("business_area"),
        "formatted_address": field("address"),
        "geo_lng": lng,
        "geo_lat": lat,
        "tel": field("tel"),
        "first_image_url": photos[0] if photos else "",
        "all_image_urls": "|".join(photos),
        "image_count": len(photos),
        "amap_url": amap_marker_url(lng, lat, name),
    }


def scenic_enrich(client: AMapClient, rows: list[dict[str, str]]) -> list[dict[str, Any]]:
    """Match every scenic row to an AMap POI and merge the POI fields in.

    Up to three keyword searches are tried per row (name within the scenic
    category, the row's ``amap_search_keyword`` within the scenic category,
    then the name without a category filter); the first search that returns
    any POI wins.  Image columns from the source row are kept when the
    matched POI has no photo.  Progress is printed per row.
    """
    out: list[dict[str, Any]] = []
    for idx, row in enumerate(rows, 1):
        name = row.get("name", "")
        city = row.get("city", "")
        district = row.get("district", "")
        queries = [
            (name, SCENIC_TYPES),
            (row.get("amap_search_keyword", ""), SCENIC_TYPES),
            (name, ""),
        ]
        best = None
        for keyword, types in queries:
            if not keyword:
                continue
            pois = client.place_text(keyword, city=city, types=types, offset=20)
            best = select_best_poi(pois, name, city, district)
            if best:
                break

        enriched = dict(row)
        if best:
            common = poi_common_fields(best)
            enriched.update(common)
            enriched["has_geo"] = bool(common.get("geo_lng") and common.get("geo_lat"))
            enriched["amap_match_status"] = "matched"
            enriched["amap_match_score"] = poi_score(best, name, city, district)
            # Prefer the source row's images when AMap has none for this POI.
            if row.get("first_image_url") and not common.get("first_image_url"):
                enriched["first_image_url"] = row.get("first_image_url")
                enriched["all_image_urls"] = row.get("all_image_urls", "")
                enriched["image_count"] = row.get("image_count", "")
        else:
            enriched["amap_match_status"] = "not_found"
        out.append(enriched)
        print(f"[scenic] {idx}/{len(rows)} {name} -> {enriched.get('amap_name','')}", flush=True)
    return out


def enrich_existing_pois(
    client: AMapClient,
    rows: list[dict[str, str]],
    *,
    name_field: str,
    id_prefix: str,
    types: str,
) -> list[dict[str, Any]]:
    """Match existing hotel/restaurant rows to AMap POIs.

    Args:
        client: API client used for the keyword search.
        rows: Source rows.
        name_field: Column holding the place name.
        id_prefix: Prefix for the generated ``<prefix>_id`` column value.
        types: AMap category filter.

    Returns:
        One enriched copy per input row.  Matched rows receive the common POI
        fields and an ID built from the AMap POI ID; unmatched rows get a
        positional ``<PREFIX>_UNMATCHED_<nnnn>`` ID.
    """
    id_column = f"{id_prefix.lower()}_id"
    out: list[dict[str, Any]] = []
    for idx, row in enumerate(rows, 1):
        name = row.get(name_field, "")
        city = row.get("city") or row.get("expected_city") or ""
        district = row.get("district", "")
        keyword = row.get("amap_search_keyword") or f"{city} {name}"
        pois = client.place_text(keyword, city=city, types=types, offset=20)
        best = select_best_poi(pois, name, city, district)

        enriched = dict(row)
        if best:
            enriched.update(poi_common_fields(best))
            enriched["source"] = (row.get("source") or "source_csv") + "+amap_text"
            enriched["amap_match_status"] = "matched"
            enriched["amap_match_score"] = poi_score(best, name, city, district)
            enriched[id_column] = f"{id_prefix}_{best.get('id')}"
        else:
            enriched["amap_match_status"] = "not_found"
            enriched[id_column] = f"{id_prefix}_UNMATCHED_{idx:04d}"
        out.append(enriched)
        if idx % 10 == 0 or idx == len(rows):
            print(f"[{id_prefix.lower()}] {idx}/{len(rows)}", flush=True)
    return out


def dedupe_by_poi_id(rows: list[dict[str, Any]], name_key: str) -> list[dict[str, Any]]:
    """Drop later duplicates, keeping the first row per identity.

    Identity is the AMap POI ID, else the value under *name_key*, else the
    whole row serialised as JSON.
    """
    seen: set[str] = set()
    out: list[dict[str, Any]] = []
    for row in rows:
        key = str(row.get("amap_poi_id") or row.get(name_key) or "")
        if not key:
            key = json.dumps(row, ensure_ascii=False, sort_keys=True)
        if key in seen:
            continue
        seen.add(key)
        out.append(row)
    return out


# Search radii in metres, tried in order until enough candidates are found.
_NEARBY_RADII_M = (5000, 10000, 20000, 50000)


def _collect_around(client: AMapClient, loc: str, types: str, target_count: int) -> list[dict[str, Any]]:
    """Collect distinct POIs around *loc*, widening the radius until *target_count* is reached."""
    selected: list[dict[str, Any]] = []
    seen_ids: set[str] = set()
    for radius in _NEARBY_RADII_M:
        for poi in client.place_around(loc, types=types, radius=radius, offset=25):
            pid = str(poi.get("id") or poi.get("name") or "")
            if not pid or pid in seen_ids:
                continue
            seen_ids.add(pid)
            selected.append(poi)
            if len(selected) >= target_count:
                break
        if len(selected) >= target_count:
            break
    return selected


def nearby_candidates(
    client: AMapClient,
    scenic_rows: list[dict[str, Any]],
    *,
    types: str,
    kind: str,
    target_count: int = 10,
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Find up to *target_count* nearby POIs of one kind for every scenic spot.

    Args:
        client: API client used for around-search and driving metrics.
        scenic_rows: Enriched scenic rows; rows without ``geo_lng``/``geo_lat``
            are skipped.
        types: AMap category filter.
        kind: ``"hotel"`` or anything else (treated as restaurant); also used
            for column names and ID prefixes.
        target_count: Maximum number of candidates per scenic spot.

    Returns:
        ``(master_rows, relation_rows)``: the de-duplicated POI master list
        and one relation row per (scenic, POI) pair including driving metrics.
        A POI without coordinates gets ``drive_status == "NO_GEO"`` and no
        driving request is made for it.
    """
    master: list[dict[str, Any]] = []
    relation_rows: list[dict[str, Any]] = []
    resource_type = "Hotel" if kind == "hotel" else "Restaurant"
    for sidx, scenic in enumerate(scenic_rows, 1):
        scenic_id = scenic.get("id", "")
        scenic_name = scenic.get("name", "")
        lng = scenic.get("geo_lng", "")
        lat = scenic.get("geo_lat", "")
        if not lng or not lat:
            print(f"[nearby:{kind}] skip no geo {scenic_name}", flush=True)
            continue
        loc = f"{lng},{lat}"
        chosen = _collect_around(client, loc, types, target_count)[:target_count]

        for rank, poi in enumerate(chosen, 1):
            common = poi_common_fields(poi)
            poi_name = common.get("amap_name", "")
            row_id = f"{kind.upper()}_{common.get('amap_poi_id')}"
            master.append(
                {
                    f"{kind}_id": row_id,
                    f"{kind}_name": poi_name,
                    "source": "amap_around",
                    **common,
                }
            )
            dest_lng = common.get("geo_lng")
            dest_lat = common.get("geo_lat")
            if dest_lng and dest_lat:
                metric = client.driving(loc, f"{dest_lng},{dest_lat}")
            else:
                # No coordinates: a request with destination "," can only fail.
                metric = {"drive_status": "NO_GEO"}
            relation_rows.append(
                {
                    "scenic_id": scenic_id,
                    "scenic_name": scenic_name,
                    "scenic_lng": lng,
                    "scenic_lat": lat,
                    "resource_type": resource_type,
                    "resource_id": row_id,
                    "resource_name": poi_name,
                    "amap_poi_id": common.get("amap_poi_id", ""),
                    "resource_lng": common.get("geo_lng", ""),
                    "resource_lat": common.get("geo_lat", ""),
                    "rank_for_scenic": rank,
                    "amap_around_distance_m": _amap_value(poi.get("distance", "")),
                    **metric,
                    "province": common.get("province", ""),
                    "city": common.get("city", ""),
                    "district": common.get("district", ""),
                    "formatted_address": common.get("formatted_address", ""),
                    "amap_type": common.get("amap_type", ""),
                    "tel": common.get("tel", ""),
                    "first_image_url": common.get("first_image_url", ""),
                    "all_image_urls": common.get("all_image_urls", ""),
                    "amap_url": common.get("amap_url", ""),
                }
            )
        print(f"[nearby:{kind}] {sidx}/{len(scenic_rows)} {scenic_name} -> {len(chosen)}", flush=True)
    return dedupe_by_poi_id(master, f"{kind}_name"), relation_rows


def write_dictionary() -> None:
    """Write the Markdown field dictionary for the output CSVs into ``OUT_DIR``."""
    text = """# 高德 POI 补全字段字典

## 通用 POI 字段

| 字段 | 说明 |
|---|---|
| amap_poi_id | 高德 POI ID,可作为外部数据来源唯一标识 |
| amap_name | 高德返回名称 |
| amap_type / amap_typecode | 高德行业分类与分类编码 |
| province / city / district / town / adcode | 高德行政区划字段 |
| formatted_address | 高德地址 |
| geo_lng / geo_lat | 高德 GCJ-02 坐标,经度/纬度 |
| tel | 高德电话字段,多个号码按高德原格式保留 |
| first_image_url | 高德照片第一张 URL |
| all_image_urls | 高德照片 URL 列表,使用 `|` 分隔 |
| image_count | 图片数量 |
| amap_url | 高德 URI marker 链接,可用于前端跳转地图 |
| amap_match_status | matched / not_found,表示源表记录是否匹配到高德 POI |
| amap_match_score | 本地匹配打分,仅用于质检 |

## 景区附近资源关系字段

| 字段 | 说明 |
|---|---|
| scenic_id / scenic_name | 景区 ID 与名称 |
| resource_type | Hotel 或 Restaurant |
| resource_id / resource_name | 候选资源 ID 与名称 |
| rank_for_scenic | 该景区附近资源排序,从 1 开始 |
| amap_around_distance_m | 高德周边搜索返回距离,仅作参考 |
| drive_distance_m / drive_distance_km | 高德驾车路线距离,推荐排序优先使用 |
| drive_duration_s / drive_duration_min | 高德驾车时间,推荐排序优先使用 |
| drive_status | OK 表示驾车路线成功;其他值表示高德未返回可用路线 |

## 使用建议

- 图谱实体:景区使用 `ScenicAttraction`,酒店/餐饮后续可独立为 `Hotel` / `Restaurant` POI。
- 图谱关系:`ScenicAttraction -[:NEARBY]-> Hotel/Restaurant`,关系属性放 `drive_distance_km`、`drive_duration_min`、`rank_for_scenic`。
- 费用/门票/小交通仍放 `TravelItem`,不要和 Hotel/Restaurant POI 混在同一张表里。
"""
    (OUT_DIR / "字段字典.md").write_text(text, encoding="utf-8")


def write_report(
    scenic_rows: list[dict[str, Any]],
    hotel_rows: list[dict[str, Any]],
    restaurant_rows: list[dict[str, Any]],
    scenic_hotels: list[dict[str, Any]],
    scenic_restaurants: list[dict[str, Any]],
    client: AMapClient,
) -> None:
    """Write the Markdown run report (match counts, nearby coverage, API errors).

    At most the first 80 entries of ``client.errors`` are listed.
    """

    def matched(rows: list[dict[str, Any]]) -> int:
        return sum(1 for r in rows if r.get("amap_match_status") == "matched" or r.get("amap_poi_id"))

    # Per-scenic counters of nearby hotel and restaurant relations.
    by_scenic: dict[str, dict[str, int]] = {}
    for bucket, relations in (("hotel", scenic_hotels), ("restaurant", scenic_restaurants)):
        for r in relations:
            by_scenic.setdefault(r["scenic_name"], {"hotel": 0, "restaurant": 0})[bucket] += 1

    lines = [
        "# 高德 POI 补全报告",
        "",
        f"- 景区补全:{matched(scenic_rows)}/{len(scenic_rows)}",
        f"- 原酒店表 POI 匹配:{matched(hotel_rows)}/{len(hotel_rows)}",
        f"- 原餐饮表 POI 匹配:{matched(restaurant_rows)}/{len(restaurant_rows)}",
        f"- 景区附近酒店关系:{len(scenic_hotels)} 条",
        f"- 景区附近餐饮关系:{len(scenic_restaurants)} 条",
        f"- 本次高德实际请求数:{client.calls}",
        "",
        "## 每个景区 nearby 覆盖",
        "",
        "| 景区 | 酒店候选 | 餐饮候选 |",
        "|---|---:|---:|",
    ]
    for name in sorted(by_scenic):
        v = by_scenic[name]
        lines.append(f"| {name} | {v.get('hotel', 0)} | {v.get('restaurant', 0)} |")
    if client.errors:
        lines.extend(["", "## API 异常/未返回", ""])
        lines.extend(f"- {err}" for err in client.errors[:80])
    (OUT_DIR / "高德补全报告.md").write_text("\n".join(lines) + "\n", encoding="utf-8")


def main() -> int:
    """Run the full enrichment pipeline.

    Returns 2 when the API key is missing, 0 on success.  The response cache
    is saved after every stage and once more on the way out, even when a
    stage raises, so already-fetched responses are not lost.
    """
    key = read_env_key(ENV_PATH, "AMAP_API_KEY")
    if not key:
        print(f"缺少 AMAP_API_KEY: {ENV_PATH}", file=sys.stderr)
        return 2

    OUT_DIR.mkdir(parents=True, exist_ok=True)
    cache = load_cache()
    client = AMapClient(key, cache)

    try:
        scenic_headers, scenic_rows_raw = read_csv(BASE_DIR / "scenic_for_amap.csv")
        hotel_headers, hotel_rows_raw = read_csv(BASE_DIR / "hotel_poi.csv")
        restaurant_headers, restaurant_rows_raw = read_csv(BASE_DIR / "restaurant_poi.csv")

        scenic_rows = scenic_enrich(client, scenic_rows_raw)
        write_csv(OUT_DIR / "scenic_for_amap_enriched.csv", scenic_rows, scenic_headers)
        save_cache(cache)

        hotel_rows = enrich_existing_pois(
            client, hotel_rows_raw, name_field="hotel_name", id_prefix="HOTEL", types=HOTEL_TYPES
        )
        write_csv(OUT_DIR / "hotel_poi_enriched.csv", hotel_rows, hotel_headers)
        save_cache(cache)

        restaurant_rows = enrich_existing_pois(
            client,
            restaurant_rows_raw,
            name_field="restaurant_name",
            id_prefix="RESTAURANT",
            types=RESTAURANT_TYPES,
        )
        write_csv(OUT_DIR / "restaurant_poi_enriched.csv", restaurant_rows, restaurant_headers)
        save_cache(cache)

        hotel_master, scenic_hotels = nearby_candidates(client, scenic_rows, types=HOTEL_TYPES, kind="hotel")
        restaurant_master, scenic_restaurants = nearby_candidates(
            client, scenic_rows, types=RESTAURANT_TYPES, kind="restaurant"
        )

        # Include existing matched POIs in master output, then de-duplicate by AMap POI ID.
        hotel_master = dedupe_by_poi_id(hotel_rows + hotel_master, "hotel_name")
        restaurant_master = dedupe_by_poi_id(restaurant_rows + restaurant_master, "restaurant_name")

        write_csv(OUT_DIR / "hotel_poi_amap_master.csv", hotel_master)
        write_csv(OUT_DIR / "restaurant_poi_amap_master.csv", restaurant_master)
        write_csv(OUT_DIR / "scenic_hotel_nearby_10.csv", scenic_hotels)
        write_csv(OUT_DIR / "scenic_restaurant_nearby_10.csv", scenic_restaurants)
        write_csv(OUT_DIR / "scenic_resource_drive_metrics.csv", scenic_hotels + scenic_restaurants)
        write_dictionary()
        write_report(scenic_rows, hotel_rows, restaurant_rows, scenic_hotels, scenic_restaurants, client)
    finally:
        # Persist whatever was fetched, including after a crash or Ctrl-C.
        save_cache(cache)
    print(f"done: {OUT_DIR}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())