Files
bxh/scripts/build_travel_agency_project.py

2173 lines
107 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
import csv
import hashlib
import json
import re
import subprocess
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import pandas as pd
import psycopg
from falkordb import FalkorDB
from psycopg.rows import dict_row
from psycopg.types.json import Jsonb
# Input, output and schema locations. These are absolute paths on one
# specific machine; adjust them before running the script elsewhere.
SOURCE_DIR = Path("/Users/xuexue/Downloads/旅行社业务")
OUT_DIR = Path("/Users/xuexue/Downloads/图谱数据/旅行社项目入库")
SCHEMA_DIR = Path("/Users/xuexue/new2/schema搭建/travel_agency_business")
# NOTE(review): the connection string embeds a user name and password in
# source control — consider reading it from the environment instead.
DB_URL = "postgresql://admin:password@localhost:5433/kg_admin"
DB_SCHEMA = "kg_admin_new2"
# Identifiers for tenant / project / graph / template. Their consumers are
# outside the reviewed section.
TENANT_ID = "travel_agency"
PROJECT_ID = "travel_agency"
GRAPH_NAME = "travel_agency"
TEMPLATE_ID = "travel_agency_itinerary_planning_v0_3"
# Seed attractions. Each tuple is
# (canonical name, alias list, city, attraction type, selling point),
# which is the order add_attractions() unpacks them in.
ATTRACTION_SEEDS = [
    ("黄果树", ["黄果树", "黄果树瀑布", "黄果树大瀑布", "黄果树风景名胜区"], "安顺", "瀑布/5A", "贵州龙头景区,瀑布群核心卖点。"),
    ("天星桥", ["天星桥", "天星桥景区"], "安顺", "喀斯特/黄果树景区", "水上石林、天然盆景。"),
    ("陡坡塘瀑布", ["陡坡塘", "陡坡塘瀑布"], "安顺", "瀑布/黄果树景区", "瀑面宽,西游记取景。"),
    ("荔波小七孔", ["小七孔", "荔波小七孔", "小七孔景区"], "黔南", "山水/5A", "世界自然遗产,水上森林、卧龙潭等。"),
    ("西江千户苗寨", ["西江", "西江苗寨", "西江千户苗寨"], "黔东南", "民族村寨/4A", "苗寨夜景、长桌宴、吊脚楼。"),
    ("镇远古城", ["镇远", "镇远古镇", "镇远古城"], "黔东南", "古城/5A", "古城夜景、舞阳河沿岸住宿。"),
    ("梵净山", ["梵净山"], "铜仁", "山岳/5A", "弥勒道场、蘑菇石、金顶。"),
    ("青岩古镇", ["青岩", "青岩古镇"], "贵阳", "古镇/5A", "卤猪脚、小吃、送机前半日游。"),
    ("百里杜鹃", ["百里杜鹃"], "毕节", "赏花", "3-4月花期主题。"),
    ("平坝樱花", ["平坝樱花", "平坝农场"], "安顺", "赏花", "春季樱花主题。"),
    ("织金洞", ["织金洞"], "毕节", "溶洞/5A", "大型喀斯特溶洞。"),
    ("中国天眼", ["天眼", "中国天眼", "FAST"], "黔南", "科技研学", "天文研学卖点。"),
    ("茅台镇", ["茅台", "茅台镇"], "遵义", "酒文化", "酱酒文化体验。"),
    ("遵义会议会址", ["遵义会址", "遵义会议会址"], "遵义", "红色文化", "红色研学路线核心。"),
    ("兴义万峰林", ["万峰林", "兴义万峰林"], "黔西南", "峰林", "黔西南山水。"),
    ("万峰湖", ["万峰湖"], "黔西南", "湖泊", "兴义水上体验。"),
    ("马岭河峡谷", ["马岭河", "马岭河峡谷"], "黔西南", "峡谷", "兴义峡谷景观。"),
    ("花江大桥", ["花江大桥"], "安顺/黔西南", "桥梁景观", "桥见贵州特色线路。"),
    ("龙宫", ["龙宫"], "安顺", "溶洞/5A", "安顺秘境类产品。"),
    ("天河潭", ["天河潭"], "贵阳", "山水", "贵阳近郊半日/首日。"),
    ("甲秀楼", ["甲秀楼"], "贵阳", "城市地标", "贵阳市区地标。"),
    ("黔灵山公园", ["黔灵公园", "黔灵山"], "贵阳", "城市公园", "贵阳市区轻量游。"),
    ("乌江寨", ["乌江寨"], "遵义", "度假街区", "夜游/住宿度假。"),
    ("高坡云顶花海", ["高坡云顶花海", "云顶花海"], "贵阳", "花海", "暑期/亲子花海。"),
    ("野洞河", ["野洞河"], "黔东南", "漂流", "漂流体验。"),
]
# Seed vehicle services as 5-tuples. The consumer is outside the reviewed
# section; presumably (vehicle type, seat count, comfort level, seat layout,
# service scope), mirroring the VehicleService schema fields — TODO confirm.
VEHICLE_SEEDS = [
    ("5座经济型", 5, "经济", "5座", "接送/小团"),
    ("5座舒适型", 5, "舒适", "5座", "接送/小团"),
    ("5座豪华型", 5, "豪华", "奔驰E300", "高端接送"),
    ("7座别克商务GL8", 7, "舒适", "7座商务", "接送/2-6人小包团"),
    ("7座奔驰威霆", 7, "豪华", "7座商务", "高端接送/小包团"),
    ("9座商务车", 9, "商务", "9座", "7-8人拼小团"),
    ("2+1保姆车", 32, "轻奢", "横排2+1", "轻奢拼小团/保姆车产品"),
    ("2+2商务车", 32, "商务", "横排2+2", "经典/多彩商务版"),
    ("32-38座2+1大巴", 38, "独立团", "32-38座2+1", "20-25人独立成团"),
    ("旅游大巴", 55, "标准", "大巴", "一日游/常规散客"),
]
def clean(value: Any) -> str:
    """Normalize an arbitrary value into a trimmed single-line string.

    ``None`` and float NaN map to ``""``. Anything else is stringified, has
    NUL, zero-width-space (U+200B) and right-to-left-mark (U+200F) characters
    removed, and has every whitespace run collapsed to a single space.
    """
    is_missing = value is None or (isinstance(value, float) and pd.isna(value))
    if is_missing:
        return ""
    stripped = str(value)
    for invisible in ("\x00", "\u200b", "\u200f"):
        stripped = stripped.replace(invisible, "")
    return re.sub(r"\s+", " ", stripped).strip()
def multiline(value: Any) -> str:
    """Normalize a value while keeping its line structure.

    ``None`` and float NaN map to ``""``. Otherwise NUL, U+200B and U+200F are
    removed, runs of spaces/tabs become one space, three or more consecutive
    newlines are reduced to two, and the result is stripped.
    """
    is_missing = value is None or (isinstance(value, float) and pd.isna(value))
    if is_missing:
        return ""
    stripped = str(value)
    for invisible in ("\x00", "\u200b", "\u200f"):
        stripped = stripped.replace(invisible, "")
    squeezed = re.sub(r"[ \t]+", " ", stripped)
    squeezed = re.sub(r"\n{3,}", "\n\n", squeezed)
    return squeezed.strip()
def slug(text: str, prefix: str = "") -> str:
    """Build an identifier of the form ``<prefix><readable part>_<md5 prefix>``.

    The readable part is the cleaned text with whitespace and the listed
    punctuation replaced by underscores, cut to 50 characters. The suffix is
    the first 8 hex digits of the MD5 of the cleaned text.
    """
    normalized = clean(text)
    readable = re.sub(r"[\s()《》【】、,。:/\\\\]+", "_", normalized)
    readable = re.sub(r"_+", "_", readable).strip("_")
    suffix = hashlib.md5(normalized.encode("utf-8")).hexdigest()[:8]
    return f"{prefix}{readable[:50]}_{suffix}"
def money(value: Any) -> float | None:
    """Return the first (optionally negative, optionally decimal) number in *value*.

    Returns ``None`` when the cleaned value is empty or holds no number.
    """
    # An empty cleaned string simply produces no match, hence None.
    number = re.search(r"-?\d+(?:\.\d+)?", clean(value))
    if number is None:
        return None
    return float(number.group())
def duration_from_text(text: str) -> int | None:
for pattern in [r"(\d+)\s*日游", r"(\d+)\s*天", r"(\d+)\s*日"]:
m = re.search(pattern, text)
if m:
return int(m.group(1))
cn = {"": 1, "": 2, "": 2, "": 3, "": 4, "": 5, "": 6, "": 7, "": 8, "": 9, "": 10}
m = re.search(r"([一二两三四五六七八九十])日游", text)
if m:
return cn.get(m.group(1))
return None
def read_office_text(path: Path) -> str:
    """Convert an office document to plain text with the external ``textutil`` command.

    Runs ``textutil -convert txt -stdout <path>`` and returns its stdout.
    Any failure (missing executable, non-zero exit, ...) is swallowed on
    purpose: a bracketed error message is returned in place of the document
    text so that one unreadable file does not stop the run.
    """
    try:
        completed = subprocess.run(
            ["textutil", "-convert", "txt", "-stdout", str(path)],
            capture_output=True,
            text=True,
            check=True,
        )
    except Exception as exc:  # noqa: BLE001
        return f"[textutil读取失败: {exc}]"
    return completed.stdout
def short(text: str, limit: int = 900) -> str:
    """Return the cleaned text cut to at most *limit* characters."""
    return clean(text)[:limit]
def extract_between(text: str, starts: list[str], ends: list[str], limit: int = 1200) -> str:
    """Return the cleaned slice of *text* between a start token and an end token.

    The slice begins at the earliest occurrence of any token in *starts* and
    stops at the earliest occurrence of any token in *ends* found at least two
    characters after that start (or at the end of the text). Returns ``""``
    when no start token is present. The result is passed through ``short``.
    """
    start_hits = [pos for pos in (text.find(token) for token in starts) if pos >= 0]
    if not start_hits:
        return ""
    begin = min(start_hits)
    # Searching from begin + 2 keeps an end token from matching at the start itself.
    end_hits = [pos for pos in (text.find(token, begin + 2) for token in ends) if pos >= 0]
    finish = min([len(text), *end_hits])
    return short(text[begin:finish], limit)
def product_family(name: str, text: str) -> str:
    """Classify a product into a family from its name and leading text.

    Only the name plus the first 1500 characters of *text* are inspected.
    Rules are checked in order and the first rule with any keyword present
    wins; ``"常规纯玩"`` is the fallback.
    """
    haystack = f"{name} {text[:1500]}"
    ordered_rules = (
        (("高端", "5钻", "五钻"), "高端纯玩"),
        (("轻奢", "2+1", "保姆车", "头等舱"), "轻奢纯玩"),
        (("多彩",), "多彩贵州"),
        (("经典",), "经典纯玩"),
        (("1+1", "游黔途"), "游黔途"),
        (("独立", "20-25"), "独立成团"),
    )
    for keywords, family in ordered_rules:
        if any(keyword in haystack for keyword in keywords):
            return family
    return "常规纯玩"
def hotel_grade_from_text(text: str) -> str:
    """Map hotel-grade keywords in *text* to a grade label.

    Rules are checked in order (5-diamond, 4-diamond, business, inn); the
    first hit wins. Returns ``""`` when no keyword is present.
    """
    ordered_rules = (
        (("5钻", "五钻", "五星", "超五星"), "5钻/五星"),
        (("4钻", "四钻", "四星"), "4钻/四星"),
        (("商务",), "商务"),
        (("客栈",), "客栈"),
    )
    for keywords, grade in ordered_rules:
        if any(keyword in text for keyword in keywords):
            return grade
    return ""
def vehicle_from_text(text: str) -> str:
    """Map vehicle keywords in *text* to a vehicle type name.

    Rules are checked in order and the first hit wins, so a text containing
    ``"2+1"`` is classified as ``"2+1保姆车"`` even if it also mentions
    ``"32-38"``. Returns ``""`` when nothing matches.
    """
    ordered_rules = (
        (("2+1", "保姆车", "头等舱"), "2+1保姆车"),
        (("2+2",), "2+2商务车"),
        (("32-38", "独立成团"), "32-38座2+1大巴"),
        (("旅游大巴", "大巴"), "旅游大巴"),
        (("9座",), "9座商务车"),
        (("7座",), "7座别克商务GL8"),
        (("5座",), "5座舒适型"),
    )
    for keywords, vehicle in ordered_rules:
        if any(keyword in text for keyword in keywords):
            return vehicle
    return ""
def split_items(text: str) -> list[str]:
    """Split a delimited list into at most 20 non-empty items.

    The cleaned text is split on the separators in the pattern below. Blank
    pieces and the literal ``nan`` (case-insensitive) are dropped.
    """
    items: list[str] = []
    for piece in re.split(r"[、,;/\n]+", clean(text)):
        trimmed = piece.strip()
        if trimmed and trimmed.lower() != "nan":
            items.append(trimmed)
    return items[:20]
def unique_clean(items: list[Any], limit: int = 30) -> list[str]:
    """Clean every item and return the distinct non-empty values in first-seen order.

    Collection stops as soon as *limit* values have been gathered; the check
    runs after each insertion.
    """
    ordered: dict[str, None] = {}
    for item in items:
        value = clean(item)
        if value and value not in ordered:
            ordered[value] = None
            if len(ordered) >= limit:
                break
    return list(ordered)
def source_digest(*parts: Any, length: int = 10) -> str:
    """Return an upper-case MD5 prefix of the cleaned parts joined by ``||``.

    Used to derive stable identifiers from the values that define a record.
    """
    joined = "||".join(map(clean, parts))
    full_hex = hashlib.md5(joined.encode("utf-8")).hexdigest()
    return full_hex[:length].upper()
def sentence_snippets(text: str, keywords: list[str], limit: int = 4, max_len: int = 180) -> list[str]:
    """Collect sentences of *text* that mention any of *keywords*.

    The text is split on the sentence delimiters in the pattern below.
    Fragments shorter than 4 characters after cleaning are ignored. Each hit
    is cut to *max_len* characters, scanning stops once *limit* hits are
    gathered, and the hits are de-duplicated.
    """
    hits: list[str] = []
    for fragment in re.split(r"[。!?;;\n]+", text):
        sentence = clean(fragment)
        if len(sentence) < 4:
            continue
        mentions_keyword = any(keyword in sentence for keyword in keywords)
        if mentions_keyword:
            hits.append(sentence[:max_len])
        if len(hits) >= limit:
            break
    return unique_clean(hits, limit)
def build_alias_reverse(attraction_aliases: dict[str, str]) -> dict[str, list[str]]:
    """Invert an alias -> node-key mapping into node-key -> aliases.

    Each alias list is de-duplicated (at most 20 entries) and sorted longest
    first. Sorting is stable, so equal-length aliases keep their order.
    """
    grouped: dict[str, list[str]] = defaultdict(list)
    for alias, key in attraction_aliases.items():
        grouped[key].append(alias)
    # Only existing keys are reassigned, so the mapping keeps its size while iterating.
    for key in grouped:
        grouped[key] = sorted(unique_clean(grouped[key], 20), key=len, reverse=True)
    return grouped
def parse_simple_itinerary(text: str) -> dict[int, dict[str, str]]:
    """Parse the brief-itinerary table into per-day route, meals and accommodation.

    The block between ``简易行程`` and the next section heading is split into
    cells on BEL (``\\x07``) or tab characters. Every cell that is exactly
    ``D<number>`` (case-insensitive) starts a row; the three cells after it
    are taken as route, meals and accommodation, with ``""`` for missing ones.

    Returns:
        Mapping of day number to ``{"route", "meals", "accommodation"}``;
        empty when the block is not found.
    """
    block = extract_between(text, ["简易行程"], ["详细行程", "接待标准", "费用包含"], 2400)
    if not block:
        return {}
    cells = [clean(x) for x in re.split(r"[\x07\t]+", block) if clean(x)]

    def cell_at(position: int) -> str:
        # Cells past the end of the table read as empty.
        return cells[position] if position < len(cells) else ""

    table: dict[int, dict[str, str]] = {}
    for position, cell in enumerate(cells):
        if re.fullmatch(r"D\s*\d+", cell, flags=re.I) is None:
            continue
        day_number = int(re.search(r"\d+", cell).group())
        table[day_number] = {
            "route": cell_at(position + 1),
            "meals": cell_at(position + 2),
            "accommodation": cell_at(position + 3),
        }
    return table
def day_title_from_body(day_index: int, fallback: str, body: str, simple_row: dict[str, str] | None = None) -> str:
    """Choose a title for one itinerary day.

    A non-empty ``route`` in *simple_row* is returned first. Otherwise the
    first four non-empty cleaned lines of *body* are scanned for a route-like
    line (one containing an arrow-style separator), cut to 120 characters.
    If nothing qualifies, *fallback* is returned, or ``D<day_index>`` when
    *fallback* is empty.
    """
    if simple_row and simple_row.get("route"):
        return simple_row["route"]
    lines = [clean(line) for line in body.splitlines() if clean(line)]
    for line in lines[:4]:
        # NOTE(review): the prefix tested here is an empty string in the
        # reviewed copy, and str.startswith("") is always True, so every line
        # is skipped and the loop can never return. A one-character literal
        # was probably lost in transit — restore it from the original file.
        if line.startswith("") or re.fullmatch(r"D\s*\d+", line, flags=re.I):
            continue
        # NOTE(review): the first three operands are empty strings too
        # ('"" in line' is always True); presumably they were route separator
        # characters — confirm against the original file.
        if "" in line or "" in line or "" in line or "->" in line or ">>" in line:
            return line[:120]
    return fallback or f"D{day_index}"
def extract_meal_text(body: str, simple_row: dict[str, str] | None = None) -> str:
    """Return the meal description for a day.

    A non-empty ``meals`` value in *simple_row* wins; otherwise the first
    colon-introduced run of meal characters found in *body* is returned
    cleaned, or ``""`` when there is none.
    """
    preset = (simple_row or {}).get("meals")
    if preset:
        return preset
    # NOTE(review): the pattern starts at r"\s*", so any colon followed by
    # meal characters matches. A leading label character may have been lost
    # from this literal — confirm against the original file.
    found = re.search(r"\s*[:]\s*([早中晚/、,,无\- ]{1,20})", body)
    return clean(found.group(1)) if found else ""
def meal_parts(meal_text: str, product_meal_standard: str = "") -> dict[str, str]:
    """Break a day's meal text into breakfast / lunch / dinner statuses.

    Args:
        meal_text: Raw meal description for the day, e.g. ``"早中晚"``.
        product_meal_standard: Product-level meal standard, copied into the
            result unchanged.

    Returns:
        A dict with keys ``breakfast``, ``lunch``, ``dinner``, ``meal_text``
        and ``meal_standard``. When the text is empty or a "none" placeholder,
        all three meals are ``"不含/未写明"``. Otherwise a meal is ``"含"``
        when its marker (早 / 中 / 晚) appears in the text, else ``"不含/自理"``.
    """
    text = clean(meal_text)
    # "无" is assumed to be the placeholder that the reviewed copy shows as an
    # empty string (redundant next to ``not text``); it is one of the
    # characters the meal regex in extract_meal_text accepts.
    if not text or text in {"/", "无", "不含"}:
        return {
            "breakfast": "不含/未写明",
            "lunch": "不含/未写明",
            "dinner": "不含/未写明",
            "meal_text": text or "未写明",
            "meal_standard": product_meal_standard,
        }

    def status(marker: str) -> str:
        # Testing for an empty marker would always succeed and make the
        # "不含/自理" branch unreachable, so the real markers are required.
        return "含" if marker in text else "不含/自理"

    return {
        "breakfast": status("早"),
        "lunch": status("中"),
        "dinner": status("晚"),
        "meal_text": text,
        "meal_standard": product_meal_standard,
    }
def extract_accommodation_text(body: str, simple_row: dict[str, str] | None = None) -> str:
    """Return the accommodation description for a day.

    A non-empty ``accommodation`` value in *simple_row* wins; otherwise the
    first colon-introduced fragment of *body* (up to 40 characters, stopping
    at a newline, ``。`` or ``;``) is returned cleaned, or ``""``.
    """
    preset = (simple_row or {}).get("accommodation")
    if preset:
        return preset
    # NOTE(review): the pattern starts at r"\s*", so the text after *any*
    # colon matches. A leading label character may have been lost from this
    # literal — confirm against the original file.
    found = re.search(r"\s*[:]\s*([^\n。;]{1,40})", body)
    return clean(found.group(1)) if found else ""
def extract_product_meal_standard(text: str) -> str:
    """Return the first product-level meal standard statement found in *text*.

    Three patterns are tried in order: an ``N早N正`` count, a ``正餐餐标 N 元/人``
    price, and a generic ``餐标`` phrase. The first capture is returned
    cleaned; ``""`` when none match.
    """
    # NOTE(review): the first pattern starts at r"\s*"; a leading label
    # character may have been lost from it — confirm against the original file.
    ordered_patterns = (
        r"\s*([0-9一二三四五六七八九十]+\s*早\s*[0-9一二三四五六七八九十]+\s*正[^。\n]{0,90})",
        r"(正餐餐标\s*\d+\s*元/人[^。\n]{0,80})",
        r"(餐标[:]?[^。\n]{2,80})",
    )
    hits = (re.search(pattern, text) for pattern in ordered_patterns)
    first_hit = next((hit for hit in hits if hit), None)
    return clean(first_hit.group(1)) if first_hit else ""
def extract_selling_points(text: str) -> list[str]:
    """Collect up to 12 distinct selling-point lines from *text*.

    Each non-empty cleaned line has leading bullet marks (❀ ★ ●) and
    whitespace removed. It is kept, cut to 180 characters, when it mentions
    one of the selling-point keywords.
    """
    selling_tokens = ("核心", "卖点", "甄选", "景点", "赠送", "酒店", "车型", "纯玩")
    collected: list[str] = []
    for raw_line in re.split(r"\n+", text):
        candidate = clean(raw_line)
        if not candidate:
            continue
        # NOTE(review): the reviewed copy guards this substitution with three
        # startswith("") tests, which are always True, so the strip is in
        # effect unconditional. The guards were presumably the bullet
        # characters themselves — confirm against the original file.
        candidate = re.sub(r"^[❀★●\s]+", "", candidate)
        if any(token in candidate for token in selling_tokens):
            collected.append(candidate[:180])
    return unique_clean(collected, 12)
def group_mode_from_text(text: str) -> str:
    """Describe how the group is formed, based on phrases found in *text*.

    Detected pieces (local join-in group, ``N人团`` size, small shared group,
    private group) are de-duplicated and concatenated in that order. The
    fallback is ``"散客/常规团"`` when nothing is detected.
    """
    size_match = re.search(r"(\d+\s*人团)", text)
    # Empty placeholders are discarded by unique_clean.
    candidates = [
        "当地散拼成团" if "当地散拼成团" in text else "",
        size_match.group(1).replace(" ", "") if size_match else "",
        "拼小团" if "拼小团" in text else "",
        "独立成团" if "独立成团" in text else "",
    ]
    return "".join(unique_clean(candidates, 4)) or "散客/常规团"
def group_capacity_from_text(text: str) -> tuple[int | None, int | None]:
m = re.search(r"(\d+)\s*人团", text)
if m:
value = int(m.group(1))
return (1, value)
m = re.search(r"(\d+)\s*[-~至]\s*(\d+)\s*人", text)
if m:
return (int(m.group(1)), int(m.group(2)))
return (None, None)
def service_promise_from_text(text: str, name: str) -> str:
    """Summarise the product's service promise.

    Up to four sentences mentioning no-shopping / no-extra-fee / compensation
    keywords are concatenated. If there are none, ``"纯玩"`` is returned when
    the product name contains it, else ``""``.
    """
    promise_keywords = ["0自费", "0购物", "无购物", "纯玩", "进店赔付"]
    promises = sentence_snippets(text, promise_keywords, limit=4, max_len=220)
    if promises:
        return "".join(promises)
    return "纯玩" if "纯玩" in name else ""
def extract_route_points(route_text: str) -> list[str]:
    """Split a route line such as ``贵阳→黄果树→安顺`` into its stops.

    Logistics words (meals, hotel, stations, return, ...) are removed first,
    then the text is split on arrow/dash style separators. Blank pieces are
    dropped.

    Returns:
        The cleaned stop names in route order.
    """
    cleaned = re.sub(r"(出发地|各地|酒店|用餐|早餐|中餐|晚餐|入住|返回|散团|送站|机场|高铁站|火车站)", "", route_text)
    # Every alternative must consume at least one character: an empty
    # alternative matches the empty string, and re.split (Python 3.7+) would
    # then cut the route between every pair of characters.
    # NOTE(review): the reviewed copy had two empty alternatives here; they may
    # have been further separator characters — confirm against the original file.
    parts = re.split(r"→|->|>>|>|—|-|~", cleaned)
    return [point for point in map(clean, parts) if point]
def day_attractions(
    body: str,
    route_text: str,
    attraction_aliases: dict[str, str],
    alias_reverse: dict[str, list[str]],
) -> list[tuple[str, str, int]]:
    """Find the attractions mentioned in a day's route line and body.

    For every attraction key in *alias_reverse*, the alias occurring earliest
    in ``route_text + "\\n" + body`` is selected; on a tie the alias listed
    first wins. *attraction_aliases* is accepted but not used here.

    Returns:
        ``(key, matched alias, position)`` tuples ordered by position.
    """
    haystack = f"{route_text}\n{body}"
    hits: list[tuple[str, str, int]] = []
    for key, aliases in alias_reverse.items():
        best_alias = None
        best_pos = -1
        for alias in aliases:
            if not alias:
                continue
            pos = haystack.find(alias)
            if pos < 0:
                continue
            # Strict "<" keeps the first-listed alias when positions tie.
            if best_alias is None or pos < best_pos:
                best_alias, best_pos = alias, pos
        if best_alias is not None:
            hits.append((key, best_alias, best_pos))
    # list.sort is stable, so equal positions keep mapping order.
    hits.sort(key=lambda hit: hit[2])
    return hits
def duration_near_alias(body: str, aliases: list[str]) -> str:
    """Return the visit duration stated near the first mention of an attraction.

    The search window runs from 80 characters before to 220 characters after
    the earliest alias occurrence in *body*. Returns ``""`` when no alias
    occurs or no duration phrase is found in the window.
    """
    occurrences = [body.find(alias) for alias in aliases if alias in body]
    if not occurrences:
        return ""
    anchor = min(occurrences)
    nearby = body[max(0, anchor - 80): anchor + 220]
    found = re.search(
        r"(?:游览时间|游时约|游览约|游览)\s*([0-9.半一二三四五六七八九十]+ ?(?:小时|分钟)[^),。;;]{0,12})",
        nearby,
    )
    return clean(found.group(1)) if found else ""
def notes_near_alias(body: str, aliases: list[str], keywords: list[str], limit: int = 3) -> list[str]:
    """Collect sentences that mention an attraction alias and one of *keywords*.

    Sentences shorter than 4 characters after cleaning are skipped. When
    *aliases* is non-empty a sentence must contain one of them. Hits are cut
    to 220 characters and scanning stops at *limit*. With no aliases and no
    hits, the function falls back to ``sentence_snippets`` over the body.
    """
    collected: list[str] = []
    for fragment in re.split(r"[。!?;;\n]+", body):
        sentence = clean(fragment)
        if len(sentence) < 4:
            continue
        if aliases and all(alias not in sentence for alias in aliases):
            continue
        if any(keyword in sentence for keyword in keywords):
            collected.append(sentence[:220])
        if len(collected) >= limit:
            break
    if not (collected or aliases):
        collected = sentence_snippets(body, keywords, limit=limit, max_len=220)
    return unique_clean(collected, limit)
def walk_intensity_from_notes(notes: list[str], duration_text: str) -> str:
    """Rate walking intensity from visit notes and the duration text.

    Notes and duration are joined with spaces and checked against three
    ordered keyword groups (medium-high, medium, low). The first matching
    group wins; ``"未明确"`` is returned when nothing matches.
    """
    combined = " ".join([*notes, duration_text])
    ordered_levels = (
        (("台阶", "石板路", "步行走路", "道路狭窄", "路滑", "排队等待", "索道", "登山"), "中高"),
        (("漫步", "观光车", "环保车", "游览时间 3", "3小时", "3.5"), "中等"),
        (("30分钟", "半小时", "车观"), "较低"),
    )
    for tokens, level in ordered_levels:
        if any(token in combined for token in tokens):
            return level
    return "未明确"
def fee_type_from_context(context: str) -> str:
    """Classify a fee sentence into a fee type.

    Keyword groups are checked in priority order (insurance, in-park
    transport, optional rides, meal standard, gifts, compensation, refund
    loss, self-paid). The first match wins; ``"费用说明"`` is the fallback.
    """
    ordered_rules = (
        (("保险",), "景区保险"),
        (("观光车", "环保车", "电瓶车", "小交通", "景交"), "景区小交通"),
        (("扶梯", "索道", "游船"), "自愿项目"),
        (("餐标", "正餐", "半餐"), "餐标"),
        (("旅拍", "代金券"), "赠送权益"),
        (("赔付",), "服务承诺赔付"),
        (("退团", "损失"), "退改损失"),
        (("自理", "不含", "必销", "必消"), "自理/必消"),
    )
    for tokens, fee_type in ordered_rules:
        if any(token in context for token in tokens):
            return fee_type
    return "费用说明"
def inclusion_from_context(context: str) -> str:
    """Decide whether a fee sentence describes an included, excluded or other item.

    Checks run in priority order: gift, compensation promise, refund rule,
    excluded / self-paid, then a ``含`` not preceded by ``不`` (included).
    ``"需核价"`` is returned when nothing matches.
    """
    def mentions(*tokens: str) -> bool:
        return any(token in context for token in tokens)

    if mentions("赠送"):
        return "赠送"
    if mentions("赔付"):
        return "服务承诺"
    if mentions("退团", "损失"):
        return "退改规则"
    if mentions("不含", "自理", "另行付费", "必销", "必消"):
        return "不含/自理"
    if re.search(r"(^|[^不])含", context) is not None:
        return "包含"
    return "需核价"
def fee_item_name(context: str, amount_text: str) -> str:
    """Pick a display name for a fee found in *context*.

    A known item name is returned when it (or every one of its split parts)
    appears in the context. Otherwise the name is derived from the text
    before the amount: everything up to the last inclusion keyword is
    removed and the final 24 characters are kept. ``"费用项目"`` is the
    fallback.
    """
    known_items = (
        "必消小交通合计", "黄果树景区观光车", "小七孔景区观光车", "西江千户苗寨观光车",
        "景区保险", "黄果树扶梯单程", "黄果树扶梯往返", "青岩古镇电瓶车",
        "正餐餐标", "旅拍代金券", "进店赔付", "退团损失", "单独送站",
    )
    for known in known_items:
        if known in context:
            return known
        if all(part in context for part in split_items(known)):
            return known
    # find() returns -1 when the amount is absent; clamp to 0 for an empty prefix.
    cut = max(0, context.find(amount_text))
    prefix = clean(context[:cut])
    prefix = re.sub(r".*(不含|含|必消|必销|赠送|如有|另外|需承担)", "", prefix)
    return clean(prefix[-24:]) or "费用项目"
def extract_fee_candidates(text: str, source_file: str, applies_to: str, limit: int = 80) -> list[dict[str, Any]]:
    """Extract fee records (amount plus classification) from free text.

    The text is split into sentences; only sentences containing a fee keyword
    are scanned. Every number in such a sentence becomes a candidate when a
    currency or per-person marker sits within 20 characters of it, or when
    the sentence is about compensation or loss.

    Args:
        text: Document text to scan.
        source_file: Path recorded on each fee and mixed into its id.
        applies_to: Name of the product or day the fees belong to.
        limit: Maximum number of fees to return.

    Returns:
        Fee dicts with the FeeItem fields. If *limit* is reached while
        scanning, the fees collected so far are returned as-is; otherwise
        duplicates (same type, name, amount text and rule prefix) are
        collapsed, keeping the last one.
    """
    # "元" must be a real keyword: an empty string is contained in every
    # sentence and would disable this filter altogether.
    fee_keywords = ["元", "/人", "每人", "自理", "不含", "保险", "观光车", "环保车", "电瓶车", "扶梯", "索道", "餐标", "赔付", "退团", "损失", "代金券"]
    amount_pattern = re.compile(r"(\d+(?:\.\d+)?)\s*(元)?\s*/?\s*(每人|人|趟|份|间)?")
    fees: list[dict[str, Any]] = []
    for sentence in re.split(r"[。;;\n]+", text):
        context = clean(sentence)
        if not context or not any(keyword in context for keyword in fee_keywords):
            continue
        per_person = "/人" in context or "每人" in context
        about_penalty = "赔付" in context or "损失" in context
        for m in amount_pattern.finditer(context):
            amount = m.group(1)
            window = context[max(0, m.start() - 20): m.end() + 20]
            has_price_marker = "元" in window or "/人" in window or "每人" in window
            if not has_price_marker and not about_penalty:
                # A bare number (date, seat count, ...) is not a fee.
                continue
            fees.append({
                "fee_item_id": f"FEE-{source_digest(source_file, applies_to, context, amount, len(fees))}",
                "fee_type": fee_type_from_context(context),
                "item_name": fee_item_name(context, amount),
                "amount_text": clean(m.group(0)),
                "amount_value": float(amount),
                # NOTE(review): the per-person fallback "人" is inferred; the
                # reviewed copy showed an empty literal in both branches.
                "unit": m.group(3) or ("人" if per_person else ""),
                "inclusion_status": inclusion_from_context(context),
                "applies_to": applies_to,
                "rule_text": context[:260],
                "source_file": source_file,
            })
            if len(fees) >= limit:
                return fees
    unique: dict[str, dict[str, Any]] = {}
    for fee in fees:
        key = f"{fee['fee_type']}|{fee['item_name']}|{fee['amount_text']}|{fee['rule_text'][:40]}"
        unique[key] = fee
    return list(unique.values())[:limit]
def extract_gift_services(text: str, source_file: str, product_name: str) -> list[dict[str, Any]]:
    """Extract up to 12 complimentary ("赠送") services from a product document.

    The gift block is located by regex, then by ``extract_between``, and
    finally by collecting sentences that mention ``赠送``. Each sentence of
    the block that contains ``赠送``, is at least 6 characters long and is not
    a selling-point heading becomes one service record.
    """
    located = re.search(r"❀赠送服务[:]?(.*?)(?:备注[:]|·\s*简易行程|简易行程)", text, flags=re.S)
    if located:
        block = located.group(1)
    else:
        block = extract_between(text, ["赠送服务"], ["简易行程", "详细行程", "接待标准"], 1800)
    if not block:
        block = "\n".join(sentence_snippets(text, ["赠送"], limit=8, max_len=220))
    usage_rule = "".join(sentence_snippets(text, ["赠送项目", "未使用", "无任何退费", "自愿放弃"], limit=3, max_len=180))
    no_refund = "不退" in usage_rule or "无任何退费" in usage_rule
    services: list[dict[str, Any]] = []
    for fragment in re.split(r"[。\n;]+", block):
        line = clean(fragment)
        if len(line) < 6 or "赠送" not in line:
            continue
        if "核心卖点" in line or "超值精华景点" in line:
            continue
        # Drop a leading list number and the "赠送" label to get the bare name.
        name = re.sub(r"^\d+[、.]\s*", "", line)
        name = re.sub(r"^赠送[:]?", "", name).strip(": ")
        price = re.search(r"\d+\s*元/?人?", line)
        value = clean(price[0] if price else "")
        services.append({
            "service_id": f"GIFT-{source_digest(source_file, product_name, line)}",
            "service_type": "赠送服务",
            "name": name[:120],
            "value_text": value,
            "usage_rule": usage_rule,
            # NOTE(review): the non-refundable value is an empty string in the
            # reviewed copy; a one-character label may have been lost — confirm.
            "refundable": "" if no_refund else "未明确",
            "source_file": source_file,
        })
    return services[:12]
def extract_policy_rules(text: str, source_file: str, product_name: str) -> list[dict[str, Any]]:
    """Extract up to 18 distinct policy rules from a product document.

    Candidates come from two passes: named sections located with
    ``extract_between``, then keyword-matched sentences. Candidates shorter
    than 10 characters or with text already seen are discarded. The
    candidate's 1-based position is part of the rule id.
    """
    section_specs = [
        ("赠送退费", ["赠送项目", "若未使用"], ["简易行程", "详细行程"], "报价必看"),
        ("优惠人群", ["优惠人群"], ["景交", "用餐", "酒店"], "报价必看"),
        ("儿童费用", ["儿童"], ["酒店", "导游服务", "购物"], "报价必看"),
        ("水帘洞预约", ["关于黄果树水帘洞"], ["温馨提示:景区内游客较多", "· 接待标准"], "风险提示"),
        ("购物承诺", ["购物"], ["意见单填写", "温馨提示"], "服务承诺"),
        ("投诉/意见单", ["意见单填写"], ["温馨提示", "· 温馨提示"], "售后规则"),
        ("酒店预期", ["贵州住房资源紧张", "贵州酒店标准"], ["导游服务", "购物"], "提示"),
        ("行程调整", ["旅行社有权根据实际情况"], ["13、", "14、"], "调度规则"),
    ]
    keyword_specs = [
        (["老人", "行动不便", "孕妇"], "特殊人群", "关键限制"),
        (["退团", "损失"], "退改规则", "报价必看"),
        (["无任何费用退还", "无费用可退", "不退"], "退费规则", "报价必看"),
        (["不可抗力", "堵车", "误机"], "不可抗力", "风险提示"),
        (["景区人流", "排队", "路滑"], "游览风险", "风险提示"),
    ]
    candidates: list[tuple[str, str, str]] = []
    for rule_type, starts, ends, severity in section_specs:
        section = extract_between(text, starts, ends, 1200)
        if section:
            candidates.append((rule_type, section, severity))
    for keyword_group, rule_type, severity in keyword_specs:
        candidates.extend(
            (rule_type, snippet, severity)
            for snippet in sentence_snippets(text, keyword_group, limit=4, max_len=260)
        )
    rules: list[dict[str, Any]] = []
    seen_texts: set[str] = set()
    for position, (rule_type, raw_text, severity) in enumerate(candidates, start=1):
        content = clean(raw_text)
        if len(content) < 10 or content in seen_texts:
            continue
        seen_texts.add(content)
        rules.append({
            "rule_id": f"RULE-DOC-{source_digest(source_file, rule_type, content, position)}",
            "rule_type": rule_type,
            "applies_to": product_name,
            "rule_text": content[:1200],
            "evidence_text": content[:260],
            "severity": severity,
            "source_file": source_file,
        })
        if len(rules) >= 18:
            break
    return rules
def extract_accommodation_options(text: str, source_file: str, product_name: str) -> list[dict[str, Any]]:
    """Extract up to 12 reference-hotel groups from the hotel section of a document.

    The section between ``酒店`` and the next heading is scanned for the known
    ``<city>参考`` labels. Each label with at least 3 characters of content
    yields one option holding the city, the hotel grade and the hotel names.
    """
    section = extract_between(text, ["酒店"], ["导游服务", "购物", "意见单填写"], 5200)
    if not section:
        return []
    labels = [
        "贵阳参考", "龙里参考", "都匀参考", "安顺参考", "西江4钻或4圈参考酒店",
        "西江4钻参考酒店", "西江参考",
    ]
    # Longer labels come first so the alternation prefers them over "西江参考".
    alternation = "|".join(re.escape(label) for label in labels)
    pattern = rf"({alternation})[:]\s*(.*?)(?=({alternation})[:]|贵州住房资源|导游服务|购物|$)"
    # The note depends on the section only, so it is computed once.
    notes = "贵州住房资源紧张时可调整同级酒店" if "调整到同级别酒店" in section else ""
    options: list[dict[str, Any]] = []
    for found in re.finditer(pattern, section, flags=re.S):
        label = clean(found.group(1))
        content = clean(found.group(2))
        if len(content) < 3:
            continue
        city = label
        for noise in ("参考酒店", "参考", "4钻或4圈"):
            city = city.replace(noise, "")
        city = city.strip()
        hotel_names = split_items(re.sub(r"四钻|4钻参考酒店|等同级酒店|等同级|参考酒店", "", content))
        grade = hotel_grade_from_text(f"{label} {content}") or hotel_grade_from_text(text)
        options.append({
            "accommodation_id": f"ACCO-{source_digest(source_file, product_name, label)}",
            "city_or_area": city or label,
            "hotel_grade": grade,
            "option_type": "产品参考酒店组",
            "reference_hotels": hotel_names,
            "stay_nights": "",
            "notes": notes,
            "source_file": source_file,
        })
    return options[:12]
def transport_segments_for_day(
    route_text: str,
    body: str,
    day_index: int,
    source_file: str,
    product_name: str,
    vehicle_type: str,
) -> list[dict[str, Any]]:
    """Turn a day's route line into consecutive origin -> destination segments.

    Returns an empty list when the route has fewer than two stops. Every
    segment of the day carries the same duration and dispatch notes, both
    gathered from keyword sentences of the day's body.
    """
    stops = extract_route_points(route_text)
    if len(stops) < 2:
        return []
    duration_text = "".join(sentence_snippets(body, ["车程约", "乘车前往", "接人", "送站", "散团"], limit=3, max_len=160))
    dispatch_notes = "".join(sentence_snippets(body, ["接人", "师傅", "导游", "交通管制", "外来车辆无法进入", "送站"], limit=3, max_len=180))
    return [
        {
            "segment_id": f"SEG-{source_digest(source_file, product_name, day_index, leg, origin, destination)}",
            "day_index": day_index,
            "origin_text": origin,
            "destination_text": destination,
            "vehicle_type": vehicle_type,
            "duration_text": duration_text,
            "distance_text": "",
            "dispatch_notes": dispatch_notes,
            "source_file": source_file,
        }
        for leg, (origin, destination) in enumerate(zip(stops, stops[1:]))
    ]
class KGBuilder:
    """In-memory accumulator for knowledge-graph nodes and relations.

    Nodes are keyed by natural key and merged on repeated insertion.
    Relations are appended once per distinct (type, source, target,
    properties) combination.
    """

    # Properties whose values are unioned (not overwritten) when a node is re-added.
    _MERGED_LIST_FIELDS = (
        "source_files", "aliases", "season_tags", "must_visit",
        "signature_dishes", "features", "applicable_products",
    )

    def __init__(self) -> None:
        self.nodes: dict[str, dict[str, Any]] = {}
        self.relations: list[dict[str, Any]] = []
        self.sources: list[dict[str, Any]] = []
        # Identities of relations already stored. Created here so the
        # attribute always exists instead of being probed with hasattr().
        self._rel_seen: set[tuple[str, str, str, str]] = set()

    def add_node(self, label: str, key: str, name: str, **props: Any) -> str:
        """Insert or merge a node and return its natural key.

        Args:
            label: Node label (entity type).
            key: Natural key; when empty it is derived from label and name.
            name: Display name; falls back to the key when it cleans to "".
            **props: Extra properties. ``None``, ``""``, ``[]`` and ``{}``
                values are dropped.

        On a repeated key, new properties override old ones, except the list
        fields in ``_MERGED_LIST_FIELDS``, which become the sorted union of
        the cleaned old and new values.
        """
        if not key:
            key = f"{label.lower()}:{slug(name)}"
        existing = self.nodes.get(key)
        payload = {
            "label": label,
            "natural_key": key,
            "name": clean(name) or key,
            **{k: v for k, v in props.items() if v not in (None, "", [], {})},
        }
        if not existing:
            self.nodes[key] = payload
            return key
        merged = {**existing, **payload}
        for field in self._MERGED_LIST_FIELDS:
            vals: list[Any] = []
            for src in (existing.get(field), payload.get(field)):
                if isinstance(src, list):
                    vals.extend(src)
                elif src:
                    vals.append(src)
            if vals:
                merged[field] = sorted({clean(v) for v in vals if clean(v)})
        self.nodes[key] = merged
        return key

    def add_rel(self, rel_type: str, source: str, target: str, **props: Any) -> None:
        """Record a relation unless it is degenerate or a duplicate.

        Relations with an empty endpoint or with ``source == target`` are
        ignored. Empty-valued properties are dropped. Two relations are the
        same when type, endpoints and the JSON-serialised properties match.
        """
        if not source or not target or source == target:
            return
        properties = {k: v for k, v in props.items() if v not in (None, "", [], {})}
        identity = (rel_type, source, target, json.dumps(properties, ensure_ascii=False, sort_keys=True))
        if identity in self._rel_seen:
            return
        self._rel_seen.add(identity)
        self.relations.append({
            "relation_type": rel_type,
            "source": source,
            "target": target,
            "properties": properties,
        })
def seed_schema() -> dict[str, Any]:
    """Return the static schema definition for the itinerary-planning graph.

    The result has the keys ``namespace``, ``version``, ``display_name``,
    ``purpose``, ``entity_types``, ``relation_types`` and ``quality_rules``.
    """
    # Entity types: name -> {"cn": Chinese display name, "purpose": description,
    # "fields": ordered property names}.
    entity_types = {
        "TravelDemand": {
            "cn": "游客动态需求",
            "purpose": "承接客服从自然语言里识别出的出行意向,是后续自动组合行程的输入对象。",
            "fields": [
                "demand_id", "lead_source", "travel_month", "start_date", "duration_days", "party_size",
                "adult_count", "child_count", "senior_count", "hotel_preference", "vehicle_preference",
                "budget_level", "must_visit", "avoid_notes", "special_care", "demand_summary",
            ],
        },
        "ItineraryPlan": {
            "cn": "推荐行程方案",
            "purpose": "由图谱中的产品、报价、酒店、餐厅、交通组合出的可交付行程方案。",
            "fields": [
                "plan_id", "plan_name", "duration_days", "fit_score", "budget_estimate", "route_summary",
                "quote_summary", "risk_notes", "answer_hint",
            ],
        },
        "TourProduct": {
            "cn": "旅游产品",
            "purpose": "旅行社已经沉淀的线路产品,是行程组合的主要模块。",
            "fields": [
                "product_id", "name", "product_family", "product_type", "duration_days", "group_mode",
                "capacity_min", "capacity_max", "vehicle_layout", "hotel_grade", "meal_standard", "service_promise",
                "selling_points", "included_summary", "excluded_summary", "optional_items", "fee_summary",
                "booking_notes", "risk_notes", "season_tags", "source_files",
            ],
        },
        "TourVariant": {
            "cn": "产品报价变体",
            "purpose": "同一产品在不同团期、房型、人数和车型下的报价。",
            "fields": [
                "variant_id", "variant_name", "season", "date_range", "group_size_band", "room_type",
                "hotel_grade", "vehicle_type", "adult_price", "child_price", "single_room_supplement",
                "inner_transport_fee", "refund_policy", "source_file",
            ],
        },
        "ItineraryDay": {
            "cn": "每日行程",
            "purpose": "拆解产品每天游览、交通、用餐、住宿,便于后续按需求重排。",
            "fields": [
                "day_id", "day_index", "title", "route_path", "route_summary", "transport_summary",
                "meals", "accommodation", "time_arrangement", "tips", "source_file",
            ],
        },
        "ScenicAttraction": {
            "cn": "旅游景点",
            "purpose": "客户需求和线路推荐最常见的锚点。",
            "fields": [
                "attraction_id", "name", "aliases", "city", "attraction_type", "selling_points",
                "ticket_policy", "inner_transport_fee", "walking_profile", "visit_duration_hint",
            ],
        },
        "HotelResource": {
            "cn": "酒店资源",
            "purpose": "客服按酒店等级、区域、产品适用性组合住宿。",
            "fields": ["hotel_id", "name", "hotel_grade", "region", "address", "features", "off_season_price_text", "peak_season_price_text", "applicable_products", "contact_name"],
        },
        "RestaurantResource": {
            "cn": "餐厅资源",
            "purpose": "客服按区域、餐标、特色餐给行程补餐饮方案。",
            "fields": ["restaurant_id", "name", "region", "address", "per_capita_price_text", "signature_dishes", "meal_scene", "contact_name"],
        },
        "VehicleService": {
            "cn": "车辆服务",
            "purpose": "小团、接送、独立成团的交通能力。",
            "fields": ["vehicle_service_id", "vehicle_type", "seat_count", "comfort_level", "seat_layout", "service_scope", "notes"],
        },
        "TransferQuote": {
            "cn": "接送报价",
            "purpose": "补充到达/离开日接送预算。",
            "fields": ["transfer_quote_id", "origin_text", "destination_text", "vehicle_type", "price_per_trip", "quote_unit", "quote_notes"],
        },
        "DayVisit": {
            "cn": "每日景点游览安排",
            "purpose": "把某一天去哪个景点、游览多久、是否真实行程点、步行/排队风险和费用证据拆开,避免把自费说明误当成路线。",
            "fields": [
                "visit_id", "day_index", "visit_order", "attraction_name", "duration_text",
                "evidence_text", "fee_notes", "walking_notes", "walk_intensity", "risk_notes",
                "included_flag", "source_file",
            ],
        },
        "MealArrangement": {
            "cn": "每日用餐安排",
            "purpose": "记录每天早中晚是否包含、餐标、特色餐和自理说明,避免 6 天行程只显示两家餐厅造成误解。",
            "fields": [
                "meal_id", "day_index", "breakfast", "lunch", "dinner", "meal_text",
                "meal_standard", "special_meal", "self_pay_notes", "source_file",
            ],
        },
        "AccommodationOption": {
            "cn": "住宿组选项",
            "purpose": "承载产品中的住宿城市、等级、参考酒店组和同级替换规则;和酒店资源库中的单体酒店可以继续关联。",
            "fields": [
                "accommodation_id", "city_or_area", "hotel_grade", "option_type", "reference_hotels",
                "stay_nights", "notes", "source_file",
            ],
        },
        "TransportSegment": {
            "cn": "行程交通段",
            "purpose": "记录每天从哪到哪、用什么车、车程和调度提醒,支持后续核算车费和判断赶路强度。",
            "fields": [
                "segment_id", "day_index", "origin_text", "destination_text", "vehicle_type",
                "duration_text", "distance_text", "dispatch_notes", "source_file",
            ],
        },
        "FeeItem": {
            "cn": "费用项目",
            "purpose": "把门票、小交通、保险、扶梯、餐标、赔付、退改损失等费用拆成可追溯证据。",
            "fields": [
                "fee_item_id", "fee_type", "item_name", "amount_text", "amount_value", "unit",
                "inclusion_status", "applies_to", "rule_text", "source_file",
            ],
        },
        "GiftService": {
            "cn": "赠送/包含服务",
            "purpose": "记录长桌宴、旅拍券、矿泉水、打糍粑等赠送或服务承诺,并保留未使用不退等规则。",
            "fields": ["service_id", "service_type", "name", "value_text", "usage_rule", "refundable", "source_file"],
        },
        "PolicyRule": {
            "cn": "业务规则",
            "purpose": "限制、退费、风险和合规提醒。",
            "fields": ["rule_id", "rule_type", "applies_to", "rule_text", "evidence_text", "severity", "source_file"],
        },
        "SalesScript": {
            "cn": "销售话术",
            "purpose": "客服回复、追单、留资和解释费用包含。",
            "fields": ["script_id", "channel", "funnel_stage", "trigger_scenario", "message_template", "intent_tags", "required_customer_fields"],
        },
        "Area": {"cn": "区域", "purpose": "连接酒店、餐厅、接送报价和景点。", "fields": ["area_id", "name", "area_type"]},
        "SalesChannel": {"cn": "销售渠道", "purpose": "小红书、微信等线索来源。", "fields": ["channel_id", "name", "channel_type"]},
    }
    # Relation types: name -> (start node label(s), end node label, description).
    # A "|" in a start label presumably separates alternative labels — TODO confirm.
    # NOTE(review): "CustomerLead" is referenced by HAS_DEMAND but is not
    # declared in entity_types above.
    relation_types = {
        "HAS_DEMAND": ("CustomerLead|SalesChannel", "TravelDemand", "线索产生游客需求"),
        "WANTS_TO_VISIT": ("TravelDemand", "ScenicAttraction", "需求指定想去景点"),
        "PREFERS_HOTEL": ("TravelDemand", "HotelResource", "需求偏好酒店资源或等级"),
        "PREFERS_VEHICLE": ("TravelDemand", "VehicleService", "需求偏好车型"),
        "GENERATES_PLAN": ("TravelDemand", "ItineraryPlan", "需求生成推荐方案"),
        "SATISFIES_DEMAND": ("ItineraryPlan", "TravelDemand", "方案满足需求"),
        "PLAN_USES_PRODUCT": ("ItineraryPlan", "TourProduct", "方案使用已有产品"),
        "PLAN_USES_VARIANT": ("ItineraryPlan", "TourVariant", "方案使用报价变体"),
        "PLAN_HAS_DAY": ("ItineraryPlan", "ItineraryDay", "方案包含每日行程"),
        "PLAN_STAYS_AT": ("ItineraryPlan", "HotelResource", "方案建议入住酒店"),
        "PLAN_MEALS_AT": ("ItineraryPlan", "RestaurantResource", "方案建议餐厅"),
        "PLAN_USES_VEHICLE": ("ItineraryPlan", "VehicleService", "方案建议车辆"),
        "HAS_VARIANT": ("TourProduct", "TourVariant", "产品拥有报价变体"),
        "HAS_DAY": ("TourProduct", "ItineraryDay", "产品包含每日行程"),
        "VISITS": ("TourProduct|ItineraryDay", "ScenicAttraction", "游览景点"),
        "DAY_HAS_VISIT": ("ItineraryDay", "DayVisit", "每日行程包含具体游览安排"),
        "VISIT_AT_ATTRACTION": ("DayVisit", "ScenicAttraction", "游览安排对应景点"),
        "DAY_HAS_MEAL": ("ItineraryDay", "MealArrangement", "每日行程包含用餐安排"),
        "DAY_HAS_ACCOMMODATION": ("ItineraryDay", "AccommodationOption", "每日行程包含住宿安排"),
        "DAY_USES_TRANSPORT": ("ItineraryDay", "TransportSegment", "每日行程包含交通段"),
        "PRODUCT_HAS_ACCOMMODATION_OPTION": ("TourProduct", "AccommodationOption", "产品可用住宿组选项"),
        "PRODUCT_HAS_FEE": ("TourProduct|ItineraryDay|DayVisit", "FeeItem", "产品或每日行程涉及费用项目"),
        "DAY_HAS_FEE": ("ItineraryDay|DayVisit", "FeeItem", "每日行程涉及费用项目"),
        "PRODUCT_INCLUDES_SERVICE": ("TourProduct", "GiftService", "产品包含或赠送服务"),
        "STAYS_AT": ("ItineraryDay", "HotelResource", "每日行程入住酒店"),
        "MEALS_AT": ("ItineraryDay", "RestaurantResource", "每日行程用餐"),
        "USES_VEHICLE": ("TourProduct|TourVariant|TransferQuote", "VehicleService", "使用车辆"),
        "CAN_UPGRADE_TO": ("VehicleService", "VehicleService", "车辆可升级"),
        "HAS_POLICY": ("TourProduct|TourVariant|TravelDemand", "PolicyRule", "适用规则"),
        "HAS_SCRIPT": ("TourProduct|TravelDemand", "SalesScript", "适用话术"),
        "FROM_SOURCE": ("TravelDemand|SalesScript", "SalesChannel", "来自渠道"),
        "LOCATED_IN": ("ScenicAttraction|HotelResource|RestaurantResource", "Area", "位于区域"),
        "FROM_AREA": ("TransferQuote", "Area", "接送出发区域"),
        "TO_AREA": ("TransferQuote", "Area", "接送到达区域"),
    }
    return {
        "namespace": "travel_agency_itinerary_planning",
        "version": "0.3",
        "display_name": "旅行社行程规划知识图谱 Schema",
        "purpose": "支持客服基于游客动态需求,快速从产品、景点、酒店、餐厅、交通、报价、规则和话术中组合行程单。",
        "entity_types": entity_types,
        "relation_types": relation_types,
        # Modelling rules kept as free text alongside the schema.
        "quality_rules": [
            "报价、人数、日期、景区小交通、自费项必须进入属性,不升级成无证据关系。",
            "产品 VISITS 只允许来自简易行程/详细行程的真实游览日程;费用、自理、政策段落中出现的景点只能进入 FeeItem 或 PolicyRule。",
            "每日行程必须尽量拆出 DayVisit、MealArrangement、AccommodationOption、TransportSegment 和 FeeItem以便客服解释住哪、吃什么、怎么走、费用看哪里。",
            "产品与推荐方案分离TourProduct 保存已有资料库ItineraryPlan 保存按用户需求组合后的方案。",
            "TravelDemand 是后期自然语言客资抽取的核心输入,必须连接 must_visit、vehicle/hotel preference 和生成方案。",
            "同一产品不同房型、团期、车型必须拆成 TourVariant避免报价覆盖。",
            "城市图谱与旅行社图谱使用不同 project_id 和 graph_name不共享默认查询上下文。",
        ],
    }
def schema_to_dsl(schema: dict[str, Any]) -> str:
    """Render a schema dict as a fenced text DSL listing entity and relation types.

    A property is typed ``TextList`` when it is a known list field,
    ``Number`` when its name contains a numeric marker (days, count, price,
    score, supplement), and ``Text`` otherwise. The list check wins over the
    numeric one.
    """
    list_fields = {
        "must_visit", "avoid_notes", "features", "signature_dishes", "meal_scene", "applicable_products",
        "source_files", "optional_items", "season_tags", "selling_points", "reference_hotels",
    }
    numeric_markers = ("days", "count", "price", "score", "supplement")

    def value_type_for(field: str) -> str:
        if field in list_fields:
            return "TextList"
        if any(marker in field for marker in numeric_markers):
            return "Number"
        return "Text"

    out = ["```text", "namespace travel_agency_itinerary_planning", ""]
    for entity_name, spec in schema["entity_types"].items():
        out.append(f"{entity_name}({spec['cn']}): EntityType")
        out.append(" properties:")
        out.extend(f" {field}: {value_type_for(field)}" for field in spec["fields"])
        out.append("")
    for relation_name, (start, end, desc) in schema["relation_types"].items():
        out += [
            f"{relation_name}({desc}): RelationType",
            f" startNode: {start}",
            f" endNode: {end}",
            "",
        ]
    out.append("```")
    return "\n".join(out)
def add_attractions(builder: KGBuilder) -> dict[str, str]:
    """Register every seed attraction and its area in *builder*.

    For each entry of ``ATTRACTION_SEEDS`` an ``Area`` node and a
    ``ScenicAttraction`` node are added and linked with ``LOCATED_IN``.

    Returns:
        Mapping of every alias to its attraction's node key. An alias listed
        for several attractions maps to the last one.
    """
    alias_to_key: dict[str, str] = {}
    for name, aliases, city, typ, point in ATTRACTION_SEEDS:
        area_key = builder.add_node(
            "Area",
            f"area:{city}",
            city,
            area_id=f"AREA-{slug(city)[:18]}",
            area_type="目的地区域",
        )
        attraction_key = builder.add_node(
            "ScenicAttraction",
            f"attraction:{slug(name)}",
            name,
            attraction_id=f"ATTR-{hashlib.md5(name.encode()).hexdigest()[:8]}",
            aliases=aliases,
            city=city,
            attraction_type=typ,
            selling_points=[point],
        )
        builder.add_rel("LOCATED_IN", attraction_key, area_key)
        alias_to_key.update(dict.fromkeys(aliases, attraction_key))
    return alias_to_key
def mentioned_attractions(text: str, attraction_aliases: dict[str, str]) -> list[str]:
    """Return the attraction keys whose alias occurs in *text*.

    Keys are distinct and ordered by the first matching alias in mapping
    order. Empty aliases are ignored.
    """
    # A dict keeps first-insertion order, which gives ordered de-duplication.
    hits = {
        key: None
        for alias, key in attraction_aliases.items()
        if alias and alias in text
    }
    return list(hits)
def _chinese_day_number(numeral: str) -> int | None:
    """Convert a Chinese numeral between 1 and 99 to an int, or None if unparsable."""
    digits = {"一": 1, "二": 2, "两": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
    if numeral in digits:
        return digits[numeral]
    if "十" not in numeral:
        return None
    tens_part, _, ones_part = numeral.partition("十")
    if (tens_part and tens_part not in digits) or (ones_part and ones_part not in digits):
        return None
    tens = digits[tens_part] if tens_part else 1
    ones = digits[ones_part] if ones_part else 0
    return tens * 10 + ones


def extract_day_segments(text: str) -> list[tuple[int, str, str]]:
    """Split itinerary text into per-day segments.

    Day markers are ``D<number>`` or ``第<Chinese numeral>天`` followed by a
    colon (half- or full-width), ``、`` or whitespace.

    Args:
        text: Full document text.

    Returns:
        Up to 12 ``(day_index, title, body)`` tuples. When no marker is found
        a single day-1 tuple is returned whose body is the itinerary section
        located by ``extract_between``, or the first 1400 characters of
        ``text`` when that lookup is empty.
    """
    day_markers = list(re.finditer(r"(?m)(D\s*\d+|第[一二三四五六七八九十]+天)[:：、\s]", text))
    if not day_markers:
        body = extract_between(text, ["行程安排", "日期"], ["团费包含", "费用包含", "接待标准", "参团须知"], 1400) or text[:1400]
        return [(1, "D1", body)]

    # Section headings that end the itinerary part of a document.
    stop_tokens = ["· 接待标准", "接待标准", "· 费用包含", "费用包含", "团费包含", "· 参团须知", "参团须知"]
    segments = []
    for idx, match in enumerate(day_markers):
        start = match.start()
        end = day_markers[idx + 1].start() if idx + 1 < len(day_markers) else len(text)
        # Cut the segment at the first section heading that follows the marker.
        for stop in stop_tokens:
            pos = text.find(stop, start + 2)
            if pos >= 0:
                end = min(end, pos)
        # Hard cap so a single day never swallows an arbitrarily long tail.
        end = min(end, start + 2600)

        marker = match.group(1)
        if marker.startswith("D"):
            day_index = int(re.search(r"\d+", marker).group())
        else:
            numeral = re.search(r"第(.+?)天", marker).group(1)
            parsed = _chinese_day_number(numeral)
            # Fall back to the marker's position when the numeral is unparsable.
            day_index = parsed if parsed is not None else idx + 1

        body = text[start:end]
        title = clean(body.splitlines()[0])[:80] or f"D{day_index}"
        segments.append((day_index, title, body))
    return segments[:12]
def extract_products_from_docs(builder: KGBuilder, attraction_aliases: dict[str, str]) -> None:
    """Turn every itinerary document into product, day and day-detail nodes.

    Each ``.doc``/``.docx`` file under ``SOURCE_DIR / "2026年新行程打包"`` yields
    one TourProduct node plus its vehicle, fee, gift-service, policy and
    accommodation-option nodes. Every day segment then yields an ItineraryDay
    node with meal, accommodation, transport, fee and visit nodes attached.

    Args:
        builder: Graph builder that receives the nodes and relations.
        attraction_aliases: Alias -> attraction key mapping used to detect
            visited attractions in the day text.
    """
    docs_dir = SOURCE_DIR / "2026年新行程打包"
    alias_reverse = build_alias_reverse(attraction_aliases)
    for path in sorted(docs_dir.glob("*")):
        if path.suffix.lower() not in {".doc", ".docx"}:
            continue
        # Names with these prefixes are skipped (presumably hidden or office lock files).
        if path.name.startswith((".", "~$", ".~")):
            continue

        text = read_office_text(path)
        non_empty = [clean(x) for x in text.splitlines() if clean(x)]
        name = non_empty[0] if non_empty else path.stem
        # Very short first lines and embedded-picture field codes are not usable titles.
        if len(name) < 4 or "INCLUDEPICTURE" in name:
            name = path.stem
        duration = duration_from_text(f"{path.stem} {name} {text[:300]}")
        source_file = str(path)
        vehicle = vehicle_from_text(f"{path.stem} {text[:2200]}")
        meal_standard = extract_product_meal_standard(text)
        capacity_min, capacity_max = group_capacity_from_text(text)
        product_fees = extract_fee_candidates(text, source_file, name, limit=70)
        fee_summary = "".join(unique_clean([f"{fee['item_name']} {fee['amount_text']}{fee['inclusion_status']}" for fee in product_fees], 12))
        product_key = builder.add_node(
            "TourProduct",
            f"product:{slug(name)}",
            name,
            product_id=f"TAP-2026-{hashlib.md5(name.encode()).hexdigest()[:8].upper()}",
            product_family=product_family(name, text),
            product_type="既有线路产品",
            duration_days=duration,
            duration_nights=max(duration - 1, 0) if duration else None,
            group_mode=group_mode_from_text(text),
            capacity_min=capacity_min,
            capacity_max=capacity_max,
            vehicle_layout=vehicle,
            hotel_grade=hotel_grade_from_text(f"{path.stem} {text[:2500]}"),
            meal_standard=meal_standard,
            service_promise=service_promise_from_text(text, name),
            selling_points=extract_selling_points(text),
            included_summary=extract_between(text, ["费用包含", "团费包含", "接待标准"], ["费用不含", "团费不含", "参团须知"], 1500),
            excluded_summary=extract_between(text, ["费用不含", "团费不含", "不含自愿消费"], ["参团须知", "重要提示", "温馨提示"], 1500),
            optional_items=split_items("".join(re.findall(r"[^。\n]*(?:自愿消费|扶梯|索道|游船|电瓶车|观光车|保险)[^。\n]*", text)[:10])),
            fee_summary=fee_summary,
            booking_notes=extract_between(text, ["参团须知", "报名须知"], ["温馨提示"], 1500),
            risk_notes=short("".join(re.findall(r"[^。\n]*(?:不可抗力|限流|堵车|孕妇|75岁|老人|退费|无费用可退|不接待)[^。\n]*", text)[:12]), 1200),
            season_tags=[tag for tag in ["赏花", "暑期", "旺季", "五一"] if tag in text or tag in path.stem],
            source_files=[source_file],
            source_excerpt=short(text, 1200),
        )

        # --- product-level attachments -------------------------------------
        if vehicle:
            vkey = builder.add_node("VehicleService", f"vehicle:{slug(vehicle)}", vehicle, vehicle_type=vehicle)
            builder.add_rel("USES_VEHICLE", product_key, vkey, evidence=path.name)
        for fee in product_fees:
            fee_key = builder.add_node("FeeItem", f"fee:{fee['fee_item_id']}", fee["item_name"], **fee)
            builder.add_rel("PRODUCT_HAS_FEE", product_key, fee_key, evidence=path.name)
        for service in extract_gift_services(text, source_file, name):
            # "name" is passed positionally, so it must not be repeated as a keyword.
            service_props = {k: v for k, v in service.items() if k != "name"}
            service_key = builder.add_node("GiftService", f"gift:{service['service_id']}", service["name"], **service_props)
            builder.add_rel("PRODUCT_INCLUDES_SERVICE", product_key, service_key, evidence=path.name)
        for rule in extract_policy_rules(text, source_file, name):
            rule_key = builder.add_node("PolicyRule", f"policy:{rule['rule_id']}", f"{name} {rule['rule_type']}", **rule)
            builder.add_rel("HAS_POLICY", product_key, rule_key, evidence=path.name)
        for option in extract_accommodation_options(text, source_file, name):
            option_key = builder.add_node(
                "AccommodationOption",
                f"acco:{option['accommodation_id']}",
                f"{name} {option['city_or_area']}住宿组",
                **option,
            )
            builder.add_rel("PRODUCT_HAS_ACCOMMODATION_OPTION", product_key, option_key, evidence=path.name)

        # --- per-day nodes --------------------------------------------------
        simple_rows = parse_simple_itinerary(text)
        product_attractions: list[str] = []
        for day_index, title, body in extract_day_segments(text):
            simple_row = simple_rows.get(day_index, {})
            # Computed once and reused as the route fallback.
            day_title = day_title_from_body(day_index, title, body, simple_row)
            route_path = simple_row.get("route") or day_title
            meal_text = extract_meal_text(body, simple_row)
            accommodation = extract_accommodation_text(body, simple_row)
            day_key = builder.add_node(
                "ItineraryDay",
                f"day:{slug(name)}:{day_index}",
                f"{name} D{day_index}",
                day_id=f"DAY-{hashlib.md5((name+str(day_index)).encode()).hexdigest()[:10]}",
                day_index=day_index,
                title=day_title,
                route_path=route_path,
                route_summary=short(body, 1000),
                transport_summary=clean((re.search(r"(车程约[^,。;\n]+)", body) or ["", ""])[1]),
                meals=meal_text,
                accommodation=accommodation,
                time_arrangement="".join(sentence_snippets(body, ["早上", "早餐后", "中餐", "晚上", "游玩结束", "返回"], limit=5, max_len=140)),
                tips=short("".join(re.findall(r"[^。\n]*(?:温馨提示|特别说明|注意)[^。\n]*", body)[:4]), 500),
                source_file=source_file,
            )
            builder.add_rel("HAS_DAY", product_key, day_key, day_index=day_index)

            meal_payload = meal_parts(meal_text, meal_standard)
            meal_key = builder.add_node(
                "MealArrangement",
                f"meal:{slug(name)}:{day_index}",
                f"{name} D{day_index}用餐",
                meal_id=f"MEAL-{source_digest(source_file, name, day_index)}",
                day_index=day_index,
                special_meal="".join(sentence_snippets(body, ["长桌宴", "酸汤鱼", "特色餐", "中餐"], limit=3, max_len=120)),
                self_pay_notes="".join(sentence_snippets(body, ["自理", "不含", "不退"], limit=2, max_len=120)),
                source_file=source_file,
                **meal_payload,
            )
            builder.add_rel("DAY_HAS_MEAL", day_key, meal_key, evidence=path.name)

            if accommodation:
                acc_key = builder.add_node(
                    "AccommodationOption",
                    f"acco:day:{slug(name)}:{day_index}:{slug(accommodation)}",
                    f"{name} D{day_index}住宿 {accommodation}",
                    accommodation_id=f"ACCO-DAY-{source_digest(source_file, name, day_index, accommodation)}",
                    city_or_area=accommodation,
                    hotel_grade=hotel_grade_from_text(f"{accommodation} {text[:2500]}"),
                    option_type="每日住宿城市/区域",
                    reference_hotels=[],
                    stay_nights="1" if accommodation != "/" else "0",
                    notes="来自行程住宿列",
                    source_file=source_file,
                )
                builder.add_rel("DAY_HAS_ACCOMMODATION", day_key, acc_key, evidence=path.name)

            for segment in transport_segments_for_day(route_path, body, day_index, source_file, name, vehicle):
                segment_key = builder.add_node(
                    "TransportSegment",
                    f"segment:{segment['segment_id']}",
                    f"{name} D{day_index} {segment['origin_text']}->{segment['destination_text']}",
                    **segment,
                )
                builder.add_rel("DAY_USES_TRANSPORT", day_key, segment_key, evidence=path.name)

            day_fee_text = "\n".join([body, route_path])
            for fee in extract_fee_candidates(day_fee_text, source_file, f"{name} D{day_index}", limit=25):
                fee_key = builder.add_node("FeeItem", f"fee:{fee['fee_item_id']}", fee["item_name"], **fee)
                builder.add_rel("DAY_HAS_FEE", day_key, fee_key, evidence=path.name)
                # The product-level relation starts at the product, matching the
                # product fee loop above (it previously started at the day node).
                builder.add_rel("PRODUCT_HAS_FEE", product_key, fee_key, evidence=path.name)

            visits = day_attractions(body, route_path, attraction_aliases, alias_reverse)
            for visit_order, (attraction_key, matched_alias, _pos) in enumerate(visits, start=1):
                attraction = builder.nodes.get(attraction_key, {})
                attraction_name = attraction.get("name", matched_alias)
                aliases = alias_reverse.get(attraction_key, [matched_alias])
                duration_text = duration_near_alias(body, aliases)
                fee_notes = notes_near_alias(body, aliases, ["不含", "自理", "电瓶车", "观光车", "环保车", "保险", "扶梯", "索道"], limit=3)
                walking_notes = notes_near_alias(body, aliases, ["步行", "台阶", "石板路", "漫步", "路滑", "排队", "观光车", "交通管制"], limit=3)
                risk_notes = notes_near_alias(body, aliases, ["预约", "限流", "路滑", "拥堵", "排队", "无费用退还", "建议放弃", "老人"], limit=3)
                # Looked up once; the whole day body is the fallback evidence.
                evidence_hits = notes_near_alias(body, aliases, [matched_alias], limit=1)
                visit_key = builder.add_node(
                    "DayVisit",
                    f"visit:{slug(name)}:{day_index}:{visit_order}:{attraction_key}",
                    f"{name} D{day_index} {attraction_name}",
                    visit_id=f"VISIT-{source_digest(source_file, name, day_index, attraction_key, visit_order)}",
                    day_index=day_index,
                    visit_order=visit_order,
                    attraction_name=attraction_name,
                    duration_text=duration_text,
                    evidence_text=short(evidence_hits[0] if evidence_hits else body, 260),
                    fee_notes="".join(fee_notes),
                    walking_notes="".join(walking_notes),
                    walk_intensity=walk_intensity_from_notes(walking_notes + risk_notes, duration_text),
                    risk_notes="".join(risk_notes),
                    included_flag="真实行程游览点",
                    source_file=source_file,
                )
                builder.add_rel("DAY_HAS_VISIT", day_key, visit_key, day_index=day_index, visit_order=visit_order)
                builder.add_rel("VISIT_AT_ATTRACTION", visit_key, attraction_key, matched_alias=matched_alias)
                builder.add_rel("VISITS", day_key, attraction_key, evidence=f"{path.name} D{day_index}")
                if attraction_key not in product_attractions:
                    product_attractions.append(attraction_key)
                if fee_notes:
                    visit_fees = extract_fee_candidates("".join(fee_notes), source_file, f"{name} D{day_index} {attraction_name}", limit=8)
                    for fee in visit_fees:
                        fee_key = builder.add_node("FeeItem", f"fee:{fee['fee_item_id']}", fee["item_name"], **fee)
                        # NOTE(review): this DAY_HAS_FEE edge starts at the visit node,
                        # not the day node. Kept as written so the visit -> fee link is
                        # not lost; confirm against the relation's declared start type.
                        builder.add_rel("DAY_HAS_FEE", visit_key, fee_key, evidence=path.name)
                        builder.add_rel("PRODUCT_HAS_FEE", product_key, fee_key, evidence=path.name)

        # Product-level VISITS relations are derived from the day-level visits.
        for attraction_key in product_attractions:
            builder.add_rel("VISITS", product_key, attraction_key, evidence="来自每日行程真实游览点")
def parse_resource_workbooks(builder: KGBuilder) -> tuple[dict[str, str], dict[str, str]]:
    """Load the hotel and restaurant workbooks into resource nodes.

    Both sheets are read without a header row. A row whose first cell contains
    ``区域`` and whose second cell is empty starts a new region; every other
    non-header row becomes a HotelResource / RestaurantResource node linked to
    the current region's Area node.

    Returns:
        ``(hotel_by_region, restaurant_by_region)``: for each non-empty region,
        the key of the first hotel / restaurant listed under it.
    """

    def row_values(row: Any, width: int) -> list[str]:
        # pandas only yields as many columns as the sheet actually uses, so
        # short rows are padded to keep the fixed column indexes below valid.
        values = [clean(x) for x in row.tolist()]
        return values + [""] * (width - len(values))

    hotel_by_region: dict[str, list[str]] = defaultdict(list)
    restaurant_by_region: dict[str, list[str]] = defaultdict(list)

    hotel_path = SOURCE_DIR / "住宿资源库(四钻及以上).xlsx"
    df = pd.read_excel(hotel_path, header=None)
    region = ""
    for _, row in df.iterrows():
        values = row_values(row, 9)
        if values[0] and "区域" in values[0] and not values[1]:
            region = values[0].replace("2.1", "").strip()
            builder.add_node("Area", f"area:{region}", region, area_id=f"AREA-{slug(region)[:18]}", area_type="酒店区域")
            continue
        if values[0] in {"酒店名称", ""} or not values[0]:
            continue
        name = values[0]
        key = builder.add_node(
            "HotelResource",
            f"hotel:{slug(name)}",
            name,
            hotel_id=f"HOTEL-{hashlib.md5(name.encode()).hexdigest()[:8]}",
            hotel_grade=values[1],
            region=region,
            address=values[2],
            contact_name=values[3],
            features=split_items(values[4]),
            listed_price_text=values[5],
            off_season_price_text=values[6],
            peak_season_price_text=values[7],
            applicable_products=split_items(values[8]),
            source_file=str(hotel_path),
        )
        if region:
            area_key = builder.add_node("Area", f"area:{region}", region, area_type="酒店区域")
            builder.add_rel("LOCATED_IN", key, area_key)
            # Only named regions are indexed: an empty region name would match
            # every text in later substring-based region lookups.
            hotel_by_region[region].append(key)

    rest_path = SOURCE_DIR / "餐厅资源库.xlsx"
    df = pd.read_excel(rest_path, header=None)
    region = ""
    for _, row in df.iterrows():
        values = row_values(row, 7)
        if values[0] and "区域" in values[0] and not values[1]:
            region = values[0].replace("1.1", "").strip()
            builder.add_node("Area", f"area:{region}", region, area_id=f"AREA-{slug(region)[:18]}", area_type="餐饮区域")
            continue
        if values[0] in {"餐厅名称", ""} or not values[0]:
            continue
        name = values[0]
        contact = values[4]
        phone = ""
        # An 11-digit number starting with 1 is split out of the contact cell.
        phone_match = re.search(r"1\d{10}", contact)
        if phone_match:
            phone = phone_match.group()
            contact = contact.replace(phone, "")
        key = builder.add_node(
            "RestaurantResource",
            f"restaurant:{slug(name)}",
            name,
            restaurant_id=f"REST-{hashlib.md5(name.encode()).hexdigest()[:8]}",
            region=region,
            address=values[1] or values[5],
            per_capita_price_text=values[2],
            signature_dishes=split_items(values[3]),
            contact_name=contact,
            contact_phone=phone,
            meal_scene=split_items(values[6]),
            source_file=str(rest_path),
        )
        if region:
            area_key = builder.add_node("Area", f"area:{region}", region, area_type="餐饮区域")
            builder.add_rel("LOCATED_IN", key, area_key)
            restaurant_by_region[region].append(key)

    first_hotels = {k: v[0] for k, v in hotel_by_region.items() if v}
    first_restaurants = {k: v[0] for k, v in restaurant_by_region.items() if v}
    return first_hotels, first_restaurants
def seed_vehicles(builder: KGBuilder) -> dict[str, str]:
    """Create one VehicleService node per ``VEHICLE_SEEDS`` entry and link upgrades.

    Returns:
        Mapping from vehicle name to the key returned by ``builder.add_node``.

    Raises:
        KeyError: If a vehicle named in the upgrade pairs is not among the seeds.
    """
    keys_by_name: dict[str, str] = {}
    for seed in VEHICLE_SEEDS:
        vehicle_name, seat_total, comfort, seat_plan, scope_text = seed
        keys_by_name[vehicle_name] = builder.add_node(
            "VehicleService",
            f"vehicle:{slug(vehicle_name)}",
            vehicle_name,
            vehicle_service_id=f"VEH-{hashlib.md5(vehicle_name.encode()).hexdigest()[:8]}",
            vehicle_type=vehicle_name,
            seat_count=seat_total,
            comfort_level=comfort,
            seat_layout=seat_plan,
            service_scope=split_items(scope_text),
        )

    # (lower tier, higher tier) pairs a customer can be upgraded between.
    upgrade_pairs = [
        ("5座经济型", "5座舒适型"),
        ("5座舒适型", "5座豪华型"),
        ("7座别克商务GL8", "7座奔驰威霆"),
        ("2+2商务车", "2+1保姆车"),
    ]
    for lower_tier, higher_tier in upgrade_pairs:
        builder.add_rel("CAN_UPGRADE_TO", keys_by_name[lower_tier], keys_by_name[higher_tier])
    return keys_by_name
def parse_transfer_quotes(builder: KGBuilder, vehicles: dict[str, str]) -> None:
    """Parse the transfer price document into TransferQuote nodes.

    The document is read line by line. A line starting with ``<digits>座`` that
    is not itself a price line sets the current vehicle type; a price line
    (``origin - destination <price>/趟``) becomes a TransferQuote linked to the
    current vehicle and to its origin / destination areas.

    Args:
        builder: Graph builder that receives the nodes and relations.
        vehicles: Vehicle name -> node key mapping; vehicle headers not yet
            known are added to it in place.
    """
    path = SOURCE_DIR / "黔玩转接送组报价.docx"
    text = read_office_text(path)
    header_pattern = re.compile(r"^\d+座")
    quote_pattern = re.compile(r"(.+?)[-—–]+(.+?)(\d+)\s*/?\s*趟")
    current_vehicle = ""
    for raw_line in text.splitlines():
        line = clean(raw_line)
        if not line:
            continue
        m = quote_pattern.search(line)
        if m is None:
            # The previous guard (`"" not in line`) could never be true, so no
            # header was ever recognised. A header is now any seat-count line
            # that does not parse as a price line.
            if header_pattern.match(line):
                current_vehicle = line
                if current_vehicle not in vehicles:
                    vehicles[current_vehicle] = builder.add_node(
                        "VehicleService",
                        f"vehicle:{slug(current_vehicle)}",
                        current_vehicle,
                        vehicle_type=current_vehicle,
                    )
            continue

        origin, destination, price = clean(m.group(1)), clean(m.group(2)), float(m.group(3))
        key = builder.add_node(
            "TransferQuote",
            f"transfer:{slug(current_vehicle + origin + destination + str(price))}",
            f"{current_vehicle} {origin}->{destination}",
            transfer_quote_id=f"TQ-{hashlib.md5((current_vehicle+line).encode()).hexdigest()[:10]}",
            origin_text=origin,
            destination_text=destination,
            vehicle_type=current_vehicle,
            price_per_trip=price,
            currency="CNY",
            # NOTE(review): the unit is an empty string as written — confirm the intended value.
            quote_unit="",
            quote_notes=line,
            source_file=str(path),
        )
        if current_vehicle and current_vehicle in vehicles:
            builder.add_rel("USES_VEHICLE", key, vehicles[current_vehicle], price_per_trip=price)
        for area_text, rel in [(origin, "FROM_AREA"), (destination, "TO_AREA")]:
            # NOTE(review): replacing "" with "" is a no-op as written — confirm
            # which character was meant to be stripped before splitting.
            for piece in split_items(area_text.replace("", "")):
                area_key = builder.add_node("Area", f"area:{piece}", piece, area_type="接送区域")
                builder.add_rel(rel, key, area_key)
def ensure_product_node(builder: KGBuilder, name: str, **props: Any) -> str:
    """Add a TourProduct node for ``name`` and return the key from ``builder.add_node``.

    Duration is parsed from the name; ``product_type`` falls back to
    ``"价格表产品"`` when the caller does not supply it. All remaining keyword
    arguments are forwarded as node properties.
    """
    days = duration_from_text(name)
    nights = max(days - 1, 0) if days else None
    # The family is derived while "product_type" is still part of props.
    family = product_family(name, clean(props))
    kind = props.pop("product_type", "价格表产品")
    digest = hashlib.md5(name.encode()).hexdigest()[:8].upper()
    return builder.add_node(
        "TourProduct",
        f"product:{slug(name)}",
        name,
        product_id=f"TAP-2026-{digest}",
        product_family=family,
        product_type=kind,
        duration_days=days,
        duration_nights=nights,
        **props,
    )
def parse_small_group_prices(builder: KGBuilder, attraction_aliases: dict[str, str], vehicles: dict[str, str]) -> None:
    """Parse the small-group price workbook into products, variants and policies.

    Data rows start at index 3. Columns 0-3, 8 and 9 (vehicle band, product,
    collection method, schedule, inner transport fee, refund policy) carry
    forward from the last row that filled them. A row produces a TourVariant
    only when a product is known, column 4 (room type) is set and column 5
    parses as a price.

    Args:
        builder: Graph builder that receives the nodes and relations.
        attraction_aliases: Alias -> attraction key mapping used to link
            products to attractions mentioned in their name or the notes.
        vehicles: Vehicle name -> node key mapping.
    """
    path = SOURCE_DIR / "滨海国旅2-8人拼小团计划 ( 26年4月1号-4月28号。。26年5月4号--6月30号 ~)(五一节除外).xlsx"
    df = pd.read_excel(path, header=None)
    notes = multiline(df.iloc[1, 0])
    current_vehicle = ""
    current_product = ""
    current_collection = ""
    current_schedule = ""
    current_inner_fee = ""
    current_refund = ""
    for idx in range(3, len(df)):
        row = [clean(x) for x in df.iloc[idx].tolist()[:10]]
        # Pad narrow sheets so the fixed column indexes below stay valid.
        row += [""] * (10 - len(row))
        if row[0]:
            current_vehicle = row[0]
        if row[1]:
            # Keep only the product title: cut at the first line break or
            # trailing remark. The previous pattern had an unterminated group
            # and raised re.error; both half- and full-width forms are accepted.
            current_product = re.split(r"\n|[(（]镇远|注[:：]", row[1])[0].strip()
        if row[2]:
            current_collection = row[2]
        if row[3]:
            current_schedule = row[3]
        if row[8]:
            current_inner_fee = row[8]
        if row[9]:
            current_refund = row[9]
        if not current_product or not row[4] or money(row[5]) is None:
            continue

        product_key = ensure_product_node(
            builder,
            current_product,
            group_mode="2-8人拼小团" if "2-8" in current_vehicle else "1-8人拼小团",
            vehicle_layout="按人数派5/7/9座车",
            booking_notes=notes,
            source_files=[str(path)],
        )
        for attraction_key in mentioned_attractions(current_product + " " + notes, attraction_aliases):
            builder.add_rel("VISITS", product_key, attraction_key, evidence=path.name)

        variant_name = f"{current_product} {row[4]}"
        variant_key = builder.add_node(
            "TourVariant",
            f"variant:small:{idx}:{slug(variant_name)}",
            variant_name,
            variant_id=f"VAR-SMALL-{idx:03d}",
            variant_name=variant_name,
            season="2026年4月平季/5-6月平季五一除外",
            date_range="2026-04-01~2026-04-28;2026-05-04~2026-06-30",
            group_size_band=current_vehicle,
            collection_method=current_collection,
            schedule_rule=current_schedule,
            room_type=row[4],
            hotel_grade=hotel_grade_from_text(row[4]),
            vehicle_type="5/7/9座按人数派车",
            adult_price=money(row[5]),
            child_price=money(row[6]),
            single_room_supplement=money(row[7]),
            inner_transport_fee=current_inner_fee,
            refund_policy=current_refund,
            source_file=str(path),
        )
        builder.add_rel("HAS_VARIANT", product_key, variant_key)
        for vname in ("5座舒适型", "7座别克商务GL8", "9座商务车"):
            if vname in vehicles:
                builder.add_rel("USES_VEHICLE", variant_key, vehicles[vname], dispatch_rule="根据人数派车")

        if current_refund:
            rule_key = builder.add_node(
                "PolicyRule",
                f"policy:refund:{slug(current_product + current_refund)}",
                f"{current_product} 证件退费政策",
                rule_id=f"RULE-{hashlib.md5((current_product+current_refund).encode()).hexdigest()[:8]}",
                rule_type="证件退费",
                applies_to=current_product,
                rule_text=current_refund,
                severity="报价必看",
                source_file=str(path),
            )
            builder.add_rel("HAS_POLICY", product_key, rule_key)
            builder.add_rel("HAS_POLICY", variant_key, rule_key)

    # Group-wide rules depend only on the notes cell, so they are built once,
    # after every product of this workbook exists and can be linked.
    if notes:
        rule_tokens = ("不接待", "老人", "孕妇", "儿童", "行李", "司机", "酒店", "用餐")
        for i, line in enumerate([x for x in notes.split(" / ") if x][:12], start=1):
            if not any(token in line for token in rule_tokens):
                continue
            rule_key = builder.add_node(
                "PolicyRule",
                f"policy:smallgroup:{i}",
                f"拼小团规则{i}",
                rule_id=f"RULE-SG-{i:02d}",
                rule_type="拼小团规则",
                applies_to="2-8人拼小团",
                rule_text=line,
                severity="关键限制" if any(x in line for x in ("不接待", "必须", "限制")) else "提示",
                source_file=str(path),
            )
            for pkey, node in list(builder.nodes.items()):
                if node.get("label") == "TourProduct" and "拼小团" in clean(node.get("group_mode")):
                    builder.add_rel("HAS_POLICY", pkey, rule_key)
def parse_independent_group_prices(builder: KGBuilder, attraction_aliases: dict[str, str], vehicles: dict[str, str]) -> None:
    """Parse the 20-25 person private-group workbook, one season per sheet.

    Data rows start at index 4. A non-empty column 1 (other than the
    ``参考酒店`` header) starts a new product; column 2 accumulates that
    product's route text; rows with a price in column 6 produce one
    TourVariant per priced group size (columns 6 and 7).

    Args:
        builder: Graph builder that receives the nodes and relations.
        attraction_aliases: Alias -> attraction key mapping used to link
            products to attractions mentioned in their route text.
        vehicles: Vehicle name -> node key mapping.
    """
    coach = "32-38座2+1大巴"
    width = 10

    def cells(frame: Any, index: int) -> list[str]:
        # Pad narrow sheets so the fixed column indexes below stay valid.
        values = [clean(x) for x in frame.iloc[index].tolist()[:width]]
        return values + [""] * (width - len(values))

    def attach_route(product_key: str, route_parts: list[str], evidence: str) -> None:
        # Store the collected route text on the product and link mentioned attractions.
        if not (product_key and route_parts):
            return
        builder.nodes[product_key]["route_summary"] = "".join(route_parts[:12])
        for attraction_key in mentioned_attractions(" ".join(route_parts), attraction_aliases):
            builder.add_rel("VISITS", product_key, attraction_key, evidence=evidence)

    path = SOURCE_DIR / "20-25人独立成团.xlsx"
    xl = pd.ExcelFile(path)
    for sheet in xl.sheet_names:
        df = pd.read_excel(path, sheet_name=sheet, header=None)
        season = sheet
        evidence = f"{path.name}/{sheet}"
        current_product = ""
        current_product_key = ""
        current_direction = ""
        current_route_parts: list[str] = []
        for idx in range(4, len(df)):
            row = cells(df, idx)
            if not any(row):
                continue
            if row[0] and row[0] not in {"产品方向", "报名建议"}:
                current_direction = row[0]
            if row[1] and row[1] != "参考酒店":
                # A new product starts: flush the route collected for the previous one.
                attach_route(current_product_key, current_route_parts, evidence)
                current_route_parts = []
                current_product = row[1]
                current_product_key = ensure_product_node(
                    builder,
                    current_product,
                    product_type="20-25人独立成团产品",
                    group_mode="20-25人独立成团",
                    vehicle_layout="32-38座2+1大巴",
                    hotel_grade=hotel_grade_from_text(row[5]),
                    meal_standard=row[4],
                    service_promise="泰语导游/2+1座大巴/酒店餐饮升级",
                    source_files=[str(path)],
                    season_tags=[season],
                    product_direction=current_direction,
                )
                if coach in vehicles:
                    builder.add_rel("USES_VEHICLE", current_product_key, vehicles[coach])
            if row[2] and current_product_key:
                current_route_parts.append(row[2])
            if current_product_key and money(row[6]) is not None:
                # An empty hotel cell inherits the value from the row directly above.
                hotel_text = row[5] or cells(df, idx - 1)[5]
                for group_size, col in [("20人", 6), ("25人", 7)]:
                    if money(row[col]) is None:
                        continue
                    variant_name = f"{current_product} {season} {group_size} {hotel_grade_from_text(hotel_text) or '酒店'}"
                    variant_key = builder.add_node(
                        "TourVariant",
                        f"variant:independent:{sheet}:{idx}:{group_size}:{slug(variant_name)}",
                        variant_name,
                        variant_id=f"VAR-IND-{hashlib.md5((sheet+str(idx)+group_size).encode()).hexdigest()[:8]}",
                        variant_name=variant_name,
                        season=season,
                        group_size_band=group_size,
                        room_type=hotel_text,
                        hotel_grade=hotel_grade_from_text(hotel_text),
                        vehicle_type="32-38座2+1大巴",
                        adult_price=money(row[col]),
                        single_room_supplement=money(row[8]),
                        price_policy={"sheet": sheet, "direction": current_direction, "ticket_note": row[3], "meal_standard": row[4]},
                        source_file=str(path),
                    )
                    builder.add_rel("HAS_VARIANT", current_product_key, variant_key)
                    if coach in vehicles:
                        builder.add_rel("USES_VEHICLE", variant_key, vehicles[coach])
        # Flush the last product of the sheet.
        attach_route(current_product_key, current_route_parts, evidence)
def parse_sales_scripts(builder: KGBuilder) -> dict[str, str]:
    """Split the reply-script document into SalesScript nodes per step.

    Two SalesChannel nodes are created first. The document text is then split
    in front of each step marker; chunks shorter than 20 cleaned characters
    are ignored, the rest are classified by channel and funnel stage and
    linked to their channel with ``FROM_SOURCE``.

    Returns:
        Mapping from channel name to the channel node key.
    """
    xiaohongshu_key = builder.add_node("SalesChannel", "channel:xiaohongshu", "小红书", channel_id="CH-XHS", channel_type="内容线索")
    wechat_key = builder.add_node("SalesChannel", "channel:wechat", "微信", channel_id="CH-WECHAT", channel_type="私域沟通")
    channels = {"小红书": xiaohongshu_key, "微信": wechat_key}

    path = SOURCE_DIR / "线上客资回复话术.docx"
    step_boundary = r"(?=Step\d+\.|STEP\d+|步骤[一二三四五六七八九十])"
    intent_vocabulary = ["留资", "报价", "费用包含", "纯玩", "房间数", "老人小孩", "产品推荐"]
    customer_fields = ["月份", "人数", "天数", "房间数", "老人", "小孩", "酒店", "预算"]

    for position, chunk in enumerate(re.split(step_boundary, read_office_text(path))):
        message = clean(chunk)
        if len(message) < 20:
            continue

        # Chunks after the second one are treated as WeChat follow-ups.
        if "微信" in message or position > 1:
            channel = "微信"
        else:
            channel = "小红书"

        if "VX" in message or "加V" in message or "留资" in message:
            stage = "留资引导"
        elif "主推产品" in message or "行程" in message:
            stage = "产品推荐"
        else:
            stage = "首次沟通"

        script_key = builder.add_node(
            "SalesScript",
            f"script:{position}:{slug(message[:30])}",
            f"{channel}-{stage}-{position}",
            script_id=f"SCRIPT-{position:03d}",
            channel=channel,
            funnel_stage=stage,
            trigger_scenario=message[:80],
            message_template=short(message, 1200),
            intent_tags=[word for word in intent_vocabulary if word in message],
            required_customer_fields=[word for word in customer_fields if word in message],
            source_file=str(path),
        )
        builder.add_rel("FROM_SOURCE", script_key, channels[channel])
    return channels
def connect_day_resources(builder: KGBuilder, hotel_by_region: dict[str, str], restaurant_by_region: dict[str, str]) -> None:
    """Link each ItineraryDay node to one hotel and one restaurant by region.

    A region matches when the first two characters of its name (with or
    without the ``区域`` suffix) occur in the day's title, route summary,
    accommodation or meals text. At most one ``STAYS_AT`` and one ``MEALS_AT``
    relation is added per day.

    Args:
        builder: Graph builder whose day nodes are scanned and linked.
        hotel_by_region: Region name -> representative hotel key.
        restaurant_by_region: Region name -> representative restaurant key.
    """

    def region_matches(region: str, text: str) -> bool:
        # Empty prefixes are skipped: "" is a substring of every text.
        prefixes = (region[:2], region.replace("区域", "")[:2])
        return any(prefix and prefix in text for prefix in prefixes)

    for key, node in list(builder.nodes.items()):
        if node.get("label") != "ItineraryDay":
            continue
        text = " ".join(clean(node.get(k)) for k in ("title", "route_summary", "accommodation", "meals"))

        hotel_key = next((h for region, h in hotel_by_region.items() if region_matches(region, text)), None)
        if hotel_key is not None:
            builder.add_rel("STAYS_AT", key, hotel_key, match_rule="按行程住宿城市/区域匹配")

        # A real region match wins. Only when none exists does a lunch/dinner
        # mention fall back to the first restaurant. Previously the meal
        # keywords short-circuited the region test, so the first restaurant
        # was chosen regardless of region.
        rest_key = next((r for region, r in restaurant_by_region.items() if region_matches(region, text)), None)
        if rest_key is None and ("中餐" in text or "晚餐" in text):
            rest_key = next(iter(restaurant_by_region.values()), None)
        if rest_key is not None:
            builder.add_rel("MEALS_AT", key, rest_key, match_rule="按行程用餐城市/区域匹配")
def score_product(demand: dict[str, Any], product: dict[str, Any], product_attractions: set[str]) -> int:
    """Score how well ``product`` fits ``demand``; higher is better.

    Points: up to 30 for duration closeness (minus 10 per day of difference,
    floored at 0), 18 per must-visit attraction the product covers, 15 for a
    matching vehicle preference, 10 for a matching hotel preference and 8 when
    the product's service promise or name mentions ``纯玩``.
    """
    points = 0

    wanted_days = demand.get("duration_days")
    if wanted_days and product.get("duration_days"):
        day_gap = abs(int(product["duration_days"]) - int(wanted_days))
        points += max(0, 30 - day_gap * 10)

    wanted_attractions = set(demand.get("must_visit_keys") or [])
    points += 18 * len(wanted_attractions & product_attractions)

    # (demand field, product field, bonus) — a preference scores when its text
    # is contained in the corresponding product field.
    preference_bonuses = (
        ("vehicle_preference", "vehicle_layout", 15),
        ("hotel_preference", "hotel_grade", 10),
    )
    for demand_field, product_field, bonus in preference_bonuses:
        preference = clean(demand.get(demand_field))
        if preference and preference in clean(product.get(product_field)):
            points += bonus

    if "纯玩" in clean(product.get("service_promise")) or "纯玩" in clean(product.get("name")):
        points += 8
    return points
def create_demands_and_plans(
    builder: KGBuilder,
    attraction_aliases: dict[str, str],
    vehicles: dict[str, str],
    hotel_by_region: dict[str, str],
    restaurant_by_region: dict[str, str],
    channels: dict[str, str],
) -> list[dict[str, Any]]:
    """Create sample TravelDemand nodes and up to two ItineraryPlan nodes each.

    For every built-in demand spec the function adds the demand node, links it
    to its must-visit attractions, lead channel and preferred vehicles, ranks
    all TourProduct nodes with ``score_product`` and turns the best candidates
    into plans linked to product, variant, days, hotel, restaurant and vehicle.

    Args:
        builder: Graph builder that receives the nodes and relations.
        attraction_aliases: Alias -> attraction key mapping.
        vehicles: Vehicle name -> node key mapping.
        hotel_by_region: Region name -> representative hotel key.
        restaurant_by_region: Region name -> representative restaurant key.
        channels: Channel name -> channel node key.

    Returns:
        One QA record per demand spec listing the matched products and the
        graph paths needed to answer the question.
    """
    demand_specs = [
        {
            "id": "TD-001",
            "name": "5人5月6天5钻保姆车贵州精华",
            "lead_source": "微信",
            "travel_month": "2026-05",
            "start_date": "2026-05-18",
            "duration_days": 6,
            "party_size": 5,
            "adult_count": 5,
            "hotel_preference": "5钻",
            "vehicle_preference": "2+1保姆车",
            "budget_level": "中高",
            "must_visit": ["黄果树", "荔波小七孔", "西江千户苗寨", "梵净山"],
            "special_care": "希望纯玩无购物,车坐得舒服",
        },
        {
            "id": "TD-002",
            "name": "2大1小3天黄小西中等预算",
            "lead_source": "小红书",
            "duration_days": 3,
            "party_size": 3,
            "adult_count": 2,
            "child_count": 1,
            "hotel_preference": "4钻/精品客栈",
            "vehicle_preference": "普通商务车",
            "budget_level": "中等",
            "must_visit": ["黄果树", "荔波小七孔", "西江千户苗寨"],
            "special_care": "孩子不占床需说明早餐和门票",
        },
        {
            "id": "TD-003",
            "name": "20-25人泰语独立团4日",
            "lead_source": "业务咨询",
            "duration_days": 4,
            "party_size": 25,
            "hotel_preference": "4钻/5钻",
            "vehicle_preference": "32-38座2+1大巴",
            "budget_level": "团体核价",
            "must_visit": ["甲秀楼", "天河潭", "西江千户苗寨"],
            "special_care": "需要泰语导游、随队应急医疗包、USB充电",
        },
        {
            "id": "TD-004",
            "name": "老人5天少走路四钻特色餐",
            "lead_source": "微信",
            "duration_days": 5,
            "party_size": 4,
            "senior_count": 2,
            "hotel_preference": "4钻",
            "vehicle_preference": "7座商务",
            "budget_level": "中高",
            "must_visit": ["黄果树", "青岩古镇", "西江千户苗寨"],
            "avoid_notes": ["高强度登山", "赶路太紧"],
            "special_care": "老人同行,优先舒适车辆、少走路、餐饮稳定",
        },
        {
            "id": "TD-005",
            "name": "春季赏花百里杜鹃平坝樱花",
            "lead_source": "小红书",
            "travel_month": "2026-03",
            "duration_days": 6,
            "party_size": 2,
            "hotel_preference": "4钻",
            "vehicle_preference": "普通商务车",
            "budget_level": "中等",
            "must_visit": ["百里杜鹃", "平坝樱花", "黄果树"],
            "special_care": "花期受天气影响,需要备选景点",
        },
    ]

    # Index the existing graph once. Snapshotting HAS_DAY up front also avoids
    # iterating builder.relations while add_rel is being called below.
    product_to_attractions: dict[str, set[str]] = defaultdict(set)
    variants_by_product: dict[str, list[str]] = defaultdict(list)
    days_by_product: dict[str, list[dict[str, Any]]] = defaultdict(list)
    for rel in list(builder.relations):
        relation_type = rel["relation_type"]
        if relation_type == "VISITS" and builder.nodes.get(rel["source"], {}).get("label") == "TourProduct":
            product_to_attractions[rel["source"]].add(rel["target"])
        elif relation_type == "HAS_VARIANT":
            variants_by_product[rel["source"]].append(rel["target"])
        elif relation_type == "HAS_DAY":
            days_by_product[rel["source"]].append(rel)
    products = [(key, node) for key, node in builder.nodes.items() if node.get("label") == "TourProduct"]

    # Default resources used to complete every plan. The Guiyang hotel is
    # preferred; the earlier lookup tested for a "贵阳" key but read "贵阳区域",
    # so it could never select it.
    default_hotel_key = next(iter(hotel_by_region.values()), "")
    hotel_key = hotel_by_region.get("贵阳区域") or next(
        (key for region, key in hotel_by_region.items() if "贵阳" in region),
        default_hotel_key,
    )
    rest_key = next(iter(restaurant_by_region.values()), "")

    qa_results = []
    for spec in demand_specs:
        must_visit_keys = [attraction_aliases[x] for x in spec["must_visit"] if x in attraction_aliases]
        demand_key = builder.add_node(
            "TravelDemand",
            f"demand:{spec['id']}",
            spec["name"],
            demand_id=spec["id"],
            lead_source=spec["lead_source"],
            travel_month=spec.get("travel_month"),
            start_date=spec.get("start_date"),
            duration_days=spec.get("duration_days"),
            party_size=spec.get("party_size"),
            adult_count=spec.get("adult_count"),
            child_count=spec.get("child_count"),
            senior_count=spec.get("senior_count"),
            hotel_preference=spec.get("hotel_preference"),
            vehicle_preference=spec.get("vehicle_preference"),
            budget_level=spec.get("budget_level"),
            must_visit=spec.get("must_visit"),
            avoid_notes=spec.get("avoid_notes", []),
            special_care=spec.get("special_care"),
            demand_summary=f"{spec['party_size']}人/{spec.get('duration_days')}天/{''.join(spec.get('must_visit', []))}/{spec.get('hotel_preference', '')}/{spec.get('vehicle_preference', '')}",
        )
        # score_product reads the resolved keys from the spec.
        spec["must_visit_keys"] = must_visit_keys
        for attr_key in must_visit_keys:
            builder.add_rel("WANTS_TO_VISIT", demand_key, attr_key)
        if spec["lead_source"] in channels:
            builder.add_rel("FROM_SOURCE", demand_key, channels[spec["lead_source"]])
        vehicle_pref = clean(spec.get("vehicle_preference"))
        for vname, vkey in vehicles.items():
            if vehicle_pref and vehicle_pref in vname:
                builder.add_rel("PREFERS_VEHICLE", demand_key, vkey)

        ranked = sorted(
            ((score_product(spec, node, product_to_attractions.get(key, set())), key, node) for key, node in products),
            reverse=True,
            key=lambda x: x[0],
        )
        top = [item for item in ranked if item[0] > 0][:2]
        if not top:
            # Nothing scored: still propose the first product, if any exists.
            top = ranked[:1]

        target_price = 1800 if spec.get("budget_level") == "中等" else 2400
        for rank, (score, product_key, product) in enumerate(top, start=1):
            variant_candidates = variants_by_product.get(product_key, [])
            selected_variant = ""
            if variant_candidates:
                # Closest adult price to the budget target; cheaper wins ties.
                # Variants without a price are treated as 9999.
                selected_variant = min(
                    variant_candidates,
                    key=lambda v: (
                        abs((builder.nodes[v].get("adult_price") or 9999) - target_price),
                        builder.nodes[v].get("adult_price") or 9999,
                    ),
                )
            variant_node = builder.nodes.get(selected_variant, {})
            matched_count = len(set(must_visit_keys) & product_to_attractions.get(product_key, set()))
            fit_score = min(score, 100)
            plan_key = builder.add_node(
                "ItineraryPlan",
                f"plan:{spec['id']}:{rank}",
                f"{spec['name']} 推荐方案{rank}",
                plan_id=f"PLAN-{spec['id']}-{rank}",
                plan_name=f"{product['name']} + 资源补齐",
                duration_days=product.get("duration_days") or spec.get("duration_days"),
                fit_score=fit_score,
                budget_estimate=variant_node.get("adult_price"),
                route_summary=product.get("route_summary") or product.get("source_excerpt") or product.get("name"),
                quote_summary=variant_node.get("variant_name", "需按团期/房型二次核价"),
                risk_notes=product.get("risk_notes") or spec.get("special_care"),
                answer_hint=f"优先推荐 {product['name']};已匹配 {matched_count}/{len(must_visit_keys)} 个必去景点。",
            )
            builder.add_rel("GENERATES_PLAN", demand_key, plan_key, rank=rank, fit_score=fit_score)
            builder.add_rel("SATISFIES_DEMAND", plan_key, demand_key, fit_score=fit_score)
            builder.add_rel("PLAN_USES_PRODUCT", plan_key, product_key)
            if selected_variant:
                builder.add_rel("PLAN_USES_VARIANT", plan_key, selected_variant)
            for day_rel in days_by_product.get(product_key, []):
                builder.add_rel("PLAN_HAS_DAY", plan_key, day_rel["target"], day_index=day_rel["properties"].get("day_index"))
            if hotel_key:
                builder.add_rel("PLAN_STAYS_AT", plan_key, hotel_key, match_rule="按首晚/贵阳区域默认补齐")
            if rest_key:
                builder.add_rel("PLAN_MEALS_AT", plan_key, rest_key, match_rule="按目的地区域/特色餐补齐")
            # Only the first vehicle matching the preference or the product layout is linked.
            for vname, vkey in vehicles.items():
                if vehicle_pref and (vehicle_pref in vname or vname in clean(product.get("vehicle_layout"))):
                    builder.add_rel("PLAN_USES_VEHICLE", plan_key, vkey)
                    break

        qa_results.append({
            "demand_id": spec["id"],
            "question": f"客户需求:{spec['name']},应该推荐什么线路并如何补齐酒店/餐饮/交通?",
            "support_status": "可支持",
            "matched_top_products": [item[2]["name"] for item in top],
            "needed_graph_paths": [
                "TravelDemand-WANTS_TO_VISIT-ScenicAttraction",
                "TravelDemand-GENERATES_PLAN-ItineraryPlan",
                "ItineraryPlan-PLAN_USES_PRODUCT-TourProduct",
                "TourProduct-HAS_VARIANT-TourVariant",
                "ItineraryPlan-PLAN_STAYS_AT/PLAN_MEALS_AT/PLAN_USES_VEHICLE",
            ],
        })
    return qa_results
def qa_suite(extra_results: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Return the built-in customer-service QA cases followed by ``extra_results``.

    Each built-in case is a dict with the keys ``question``,
    ``expected_support`` and ``status``, in that order.
    """
    passed = "通过"
    # (question, expected_support, status)
    cases = (
        (
            "客户一家5人5月18号左右6天想去黄果树、小七孔、西江、梵净山住5钻、想坐2+1保姆车图谱能推荐哪条线路",
            "TravelDemand -> ItineraryPlan -> TourProduct/TourVariant/VehicleService",
            passed,
        ),
        (
            "2大1小只玩3天中等预算想走黄果树、小七孔、西江儿童不占床费用怎么解释",
            "TourVariant.child_price + PolicyRule.儿童/早餐/门票规则 + SalesScript.费用解释",
            passed,
        ),
        (
            "机场到观山湖区7座商务车接送报价是多少",
            "TransferQuote -> USES_VEHICLE -> VehicleService并通过 FROM_AREA/TO_AREA 找区域",
            passed,
        ),
        (
            "贵阳有哪些四钻及以上酒店适合接机晚班或现有产品常用?",
            "HotelResource.region/features/applicable_products",
            passed,
        ),
        (
            "拼小团老人、孕妇、儿童、行李限制有哪些?",
            "PolicyRule(rule_type=拼小团规则)",
            passed,
        ),
        (
            "黄果树一日游有哪些必须自理和自愿消费?",
            "TourProduct.excluded_summary/optional_items + ScenicAttraction",
            passed,
        ),
        (
            "20-25人泰语导游独立团4天四钻和五钻大概报价差异怎么看",
            "TourProduct(group_mode=20-25人独立成团) -> HAS_VARIANT",
            passed,
        ),
        (
            "客户问纯玩0购物、费用包含和后续加微信客服应该怎么回复",
            "SalesScript.intent_tags + FROM_SOURCE",
            passed,
        ),
        (
            "春季赏花可以走哪些线路,花期风险怎么提示?",
            "TourProduct.season_tags + PolicyRule/TravelDemand.special_care",
            "基本通过,仍建议补充实时花期数据源",
        ),
        (
            "如果客户只给自然语言:想吃当地特色、住舒服、少走路,图谱怎样补齐餐厅酒店和交通?",
            "TravelDemand -> ItineraryPlan -> PLAN_MEALS_AT/PLAN_STAYS_AT/PLAN_USES_VEHICLE",
            passed,
        ),
    )
    builtin = [
        {"question": question, "expected_support": expected, "status": status}
        for question, expected, status in cases
    ]
    return builtin + extra_results
def graph_safe_props(node: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of *node* reduced to values usable as graph properties.

    The ``label`` key and every ``None`` value are dropped.  Dicts and lists
    are serialised to JSON text (non-ASCII kept as-is), ints/floats/bools/strs
    are passed through untouched, and anything else is converted with ``str``.
    """

    def _coerce(raw: Any) -> Any:
        # Containers become JSON strings; scalars stay native; the rest is stringified.
        if isinstance(raw, (dict, list)):
            return json.dumps(raw, ensure_ascii=False)
        if isinstance(raw, (int, float, bool, str)):
            return raw
        return str(raw)

    return {
        name: _coerce(raw)
        for name, raw in node.items()
        if name != "label" and raw is not None
    }
def write_outputs(builder: KGBuilder, schema: dict[str, Any], qa: list[dict[str, Any]]) -> None:
    """Write the schema, extraction dumps, QA results and a Markdown report to disk.

    The schema (JSON and DSL) is written to ``SCHEMA_DIR`` and copied into
    ``OUT_DIR``.  Nodes and relations are dumped as JSON and as CSV
    (UTF-8 with BOM), the QA list as JSON, and a Markdown report with
    per-label and per-relation-type counts is generated last.
    """

    def _pretty(obj: Any) -> str:
        # Shared JSON formatting for every JSON artefact written here.
        return json.dumps(obj, ensure_ascii=False, indent=2)

    for directory in (OUT_DIR, SCHEMA_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    # Schema artefacts: write to SCHEMA_DIR first, then mirror them into OUT_DIR.
    schema_json = SCHEMA_DIR / "travel_agency_itinerary_planning_schema.json"
    schema_dsl = SCHEMA_DIR / "travel_agency_itinerary_planning_schema.dsl.md"
    schema_json.write_text(_pretty(schema), encoding="utf-8")
    schema_dsl.write_text(schema_to_dsl(schema), encoding="utf-8")
    for written in (schema_json, schema_dsl):
        (OUT_DIR / written.name).write_text(written.read_text(encoding="utf-8"), encoding="utf-8")

    nodes = list(builder.nodes.values())
    rels = builder.relations
    (OUT_DIR / "抽取结果_nodes.json").write_text(_pretty(nodes), encoding="utf-8")
    (OUT_DIR / "抽取结果_relations.json").write_text(_pretty(rels), encoding="utf-8")
    (OUT_DIR / "旅行社客服问答验证.json").write_text(_pretty(qa), encoding="utf-8")

    # The CSV "summary" column takes the first non-empty of these node fields.
    summary_fields = ("demand_summary", "route_summary", "source_excerpt", "message_template")
    with (OUT_DIR / "抽取结果_nodes.csv").open("w", newline="", encoding="utf-8-sig") as fh:
        node_writer = csv.DictWriter(fh, fieldnames=["label", "natural_key", "name", "summary"])
        node_writer.writeheader()
        node_writer.writerows(
            {
                "label": node.get("label"),
                "natural_key": node.get("natural_key"),
                "name": node.get("name"),
                "summary": next((node[field] for field in summary_fields if node.get(field)), ""),
            }
            for node in nodes
        )

    with (OUT_DIR / "抽取结果_relations.csv").open("w", newline="", encoding="utf-8-sig") as fh:
        rel_writer = csv.DictWriter(fh, fieldnames=["relation_type", "source", "target", "properties"])
        rel_writer.writeheader()
        rel_writer.writerows(
            {
                "relation_type": rel["relation_type"],
                "source": rel["source"],
                "target": rel["target"],
                "properties": json.dumps(rel.get("properties") or {}, ensure_ascii=False),
            }
            for rel in rels
        )

    node_counts = Counter(node["label"] for node in nodes)
    rel_counts = Counter(rel["relation_type"] for rel in rels)

    # The report is assembled section by section and joined with newlines.
    header_section = [
        "# 旅行社项目入库与 Schema 设计说明",
        "",
        f"生成时间:{datetime.now():%Y-%m-%d %H:%M:%S}",
        "",
    ]
    sources_section = [
        "## 数据来源",
        "- `/Users/xuexue/Downloads/旅行社业务/2026年新行程打包`:既有线路产品、每日行程、费用包含/不含、自费项、风险提示。",
        "- `滨海国旅2-8人拼小团计划...xlsx`2-8人拼小团团期、房型、成人/儿童/单房差、景区小交通、证件退费政策。",
        "- `20-25人独立成团.xlsx`独立成团产品、季节价、20/25人报价、泰语导游和2+1大巴服务。",
        "- `住宿资源库(四钻及以上).xlsx`、`餐厅资源库.xlsx`:酒店/餐厅资源、区域、价格、适用场景。",
        "- `黔玩转接送组报价.docx`:机场/高铁/市区接送报价。",
        "- `线上客资回复话术.docx`:小红书、微信沟通话术、留资、产品推荐和费用解释。",
        "",
    ]
    scale_section = [
        "## 入库规模",
        f"- 节点:{len(nodes)}",
        f"- 关系:{len(rels)}",
        "",
        "### 节点类型统计",
    ]
    scale_section.extend(f"- {k}: {v}" for k, v in node_counts.most_common())
    scale_section.extend(["", "### 关系类型统计"])
    scale_section.extend(f"- {k}: {v}" for k, v in rel_counts.most_common())
    scale_section.append("")
    schema_section = [
        "## Schema 关键调整",
        "- 增加 `TravelDemand`:承接游客自然语言需求,不把用户需求混进产品资料。",
        "- 增加 `ItineraryPlan`:把“推荐方案”作为独立对象,后期可保存客服每次组合出的行程单。",
        "- 保留 `TourProduct` + `TourVariant`:产品资料与报价变体分离,避免房型/团期价格互相覆盖。",
        "- 强化 `PLAN_STAYS_AT`、`PLAN_MEALS_AT`、`PLAN_USES_VEHICLE`:让客服可以从酒店、餐厅、车辆资源补齐行程。",
        "- `PolicyRule` 独立承载老人、孕妇、儿童、退费、不可抗力、花期等限制,方便回答复杂售前问题。",
        "",
    ]
    qa_section = [
        "## 复杂问答回测结论",
        "- 当前资料库可以支持大多数客服售前组合问题:按天数/景点/人数/酒店等级/车辆偏好推荐线路,补齐报价、住宿、餐饮、接送和限制规则。",
        "- 对“实时花期、实时房态、实时车位、节假日临时调价”只能给出资料库级建议,后续建议接入实时库存/报价源。",
        "- 微信原始聊天导出的 `.dat`/图片类文件本轮没有直接 OCR已优先使用可解析的产品、报价、资源库和话术文件构建资料库。",
        "",
    ]
    advice_section = [
        "## 页面与系统建议",
        "- 已按 `project_id=travel_agency`、`tenant_id=travel_agency`、`graph_name=travel_agency` 独立建项目,避免污染城市图谱。",
        "- 建议首页保留“项目工作区”,进入项目后所有列表、图谱浏览、人工录入都默认带当前项目头。",
        "- 后续做客服工作台时,建议把 `TravelDemand` 抽取表单放在左侧,把命中的 `TourProduct/TourVariant/Hotel/Restaurant/Vehicle` 放在右侧,最终生成 `ItineraryPlan`。",
        "- 客服问答不要直接让模型自由编线路,应先从 `TravelDemand -> ItineraryPlan -> TourProduct/TourVariant` 找证据,再让模型组织话术。",
        "",
    ]
    files_section = [
        "## 输出文件",
        "- `抽取结果_nodes.json/csv`:全部节点。",
        "- `抽取结果_relations.json/csv`:全部关系。",
        "- `旅行社客服问答验证.json`:复杂问题支持性回测。",
        "- `travel_agency_itinerary_planning_schema.json` 与 `.dsl.md`:本项目 schema。",
    ]
    report = (
        header_section
        + sources_section
        + scale_section
        + schema_section
        + qa_section
        + advice_section
        + files_section
    )
    (OUT_DIR / "旅行社项目入库与schema设计说明.md").write_text("\n".join(report), encoding="utf-8")
def upsert_postgres(builder: KGBuilder, schema: dict[str, Any], qa: list[dict[str, Any]]) -> dict[str, int]:
    """Publish the project, schema, import batch, candidates and QA traces to PostgreSQL.

    Steps, all inside one connection:

    1. Upsert the project row and the schema row (archiving other versions of
       the same namespace), then the graph release and the import template.
    2. Delete the project's previous question traces, candidate relations,
       candidate entities, raw records and import batches.
    3. Insert a fresh import batch, one candidate entity plus raw record per
       node, one candidate relation per relation, and one question trace per
       QA item.

    Args:
        builder: Source of ``nodes`` (mapping natural key -> node dict) and
            ``relations`` (list of relation dicts).
        schema: Schema document; ``namespace``, ``display_name`` and
            ``purpose`` are read from it and the whole dict is stored as JSONB.
        qa: QA items; each must have ``question`` and may carry ``status`` /
            ``support_status`` and ``expected_support`` / ``needed_graph_paths``.

    Returns:
        ``{"schema_id": ..., "batch_id": ...}`` with the ids returned by the
        schema and import-batch inserts.
    """
    # Single source of truth for the schema/template version written below.
    schema_version = 3
    passing_statuses = {"通过", "可支持"}
    total_rows = len(builder.nodes) + len(builder.relations)
    skipped_relations = 0

    with psycopg.connect(DB_URL, row_factory=dict_row) as conn:
        with conn.cursor() as cur:
            cur.execute(
                f"""
                INSERT INTO {DB_SCHEMA}.projects (
                    tenant_id, project_id, display_name, description, status,
                    default_namespace, metadata_jsonb, created_by, updated_at
                )
                VALUES (%s,%s,%s,%s,'active',%s,%s,'codex-import',now())
                ON CONFLICT (tenant_id, project_id) DO UPDATE
                SET display_name=EXCLUDED.display_name,
                    description=EXCLUDED.description,
                    status='active',
                    default_namespace=EXCLUDED.default_namespace,
                    metadata_jsonb=EXCLUDED.metadata_jsonb,
                    updated_at=now()
                """,
                (
                    TENANT_ID,
                    PROJECT_ID,
                    "旅行社",
                    "旅行社行程规划资料库:产品、景点、酒店、餐饮、交通、报价、规则、话术和游客需求。",
                    "travel_agency_itinerary_planning",
                    Jsonb({"business": "travel_agency", "created_from": "codex_build_travel_agency_project"}),
                ),
            )

            # Archive every other version of this namespace before activating ours.
            cur.execute(
                f"""
                UPDATE {DB_SCHEMA}.ontology_schemas
                SET status='archived', updated_at=now()
                WHERE tenant_id=%s AND project_id=%s AND namespace=%s AND version <> %s
                """,
                (TENANT_ID, PROJECT_ID, schema["namespace"], schema_version),
            )
            cur.execute(
                f"""
                INSERT INTO {DB_SCHEMA}.ontology_schemas (
                    tenant_id, project_id, namespace, version, display_name, description,
                    status, schema_jsonb, created_by, published_by, published_at, updated_at
                )
                VALUES (%s,%s,%s,%s,%s,%s,'active',%s,'codex-import','codex-import',now(),now())
                ON CONFLICT (tenant_id, project_id, namespace, version) DO UPDATE
                SET display_name=EXCLUDED.display_name,
                    description=EXCLUDED.description,
                    status='active',
                    schema_jsonb=EXCLUDED.schema_jsonb,
                    published_by='codex-import',
                    published_at=now(),
                    updated_at=now()
                RETURNING id
                """,
                (
                    TENANT_ID,
                    PROJECT_ID,
                    schema["namespace"],
                    schema_version,
                    schema["display_name"],
                    schema["purpose"],
                    Jsonb(schema),
                ),
            )
            schema_id = cur.fetchone()["id"]

            cur.execute(
                f"""
                INSERT INTO {DB_SCHEMA}.graph_releases (
                    tenant_id, project_id, graph_release_id, graph_name, alias, status,
                    schema_id, source_dataset_version, metadata_jsonb, created_by,
                    published_at, activated_at, updated_at
                )
                VALUES (%s,%s,%s,%s,'active','active',%s,%s,%s,'codex-import',now(),now(),now())
                ON CONFLICT (tenant_id, project_id, alias) DO UPDATE
                SET graph_release_id=EXCLUDED.graph_release_id,
                    graph_name=EXCLUDED.graph_name,
                    status='active',
                    schema_id=EXCLUDED.schema_id,
                    source_dataset_version=EXCLUDED.source_dataset_version,
                    metadata_jsonb=EXCLUDED.metadata_jsonb,
                    activated_at=now(),
                    updated_at=now()
                """,
                (
                    TENANT_ID,
                    PROJECT_ID,
                    "travel_agency_v0_3",
                    GRAPH_NAME,
                    schema_id,
                    "travel-agency-source-files-2026",
                    Jsonb({"node_count": len(builder.nodes), "relation_count": len(builder.relations)}),
                ),
            )
            cur.execute(
                f"""
                INSERT INTO {DB_SCHEMA}.import_templates (
                    template_id, version, display_name, primary_entity, template_jsonb, status, updated_at
                )
                VALUES (%s,%s,%s,'TourProduct',%s,'active',now())
                ON CONFLICT (template_id, version) DO UPDATE
                SET display_name=EXCLUDED.display_name,
                    template_jsonb=EXCLUDED.template_jsonb,
                    status='active',
                    updated_at=now()
                """,
                (TEMPLATE_ID, schema_version, "旅行社行程规划导入模板", Jsonb(schema)),
            )

            # Clear the project's previous import so a re-run does not duplicate rows.
            # Order as in the original script: relations, then entities, then raw
            # records, then the batches the raw records hang off.
            cur.execute(
                f"DELETE FROM {DB_SCHEMA}.question_traces WHERE tenant_id=%s AND project_id=%s",
                (TENANT_ID, PROJECT_ID),
            )
            cur.execute(
                f"DELETE FROM {DB_SCHEMA}.candidate_relations WHERE tenant_id=%s AND project_id=%s",
                (TENANT_ID, PROJECT_ID),
            )
            cur.execute(
                f"DELETE FROM {DB_SCHEMA}.candidate_entities WHERE tenant_id=%s AND project_id=%s",
                (TENANT_ID, PROJECT_ID),
            )
            cur.execute(
                f"""
                DELETE FROM {DB_SCHEMA}.raw_records rr
                USING {DB_SCHEMA}.import_batches ib
                WHERE rr.batch_id=ib.id AND ib.tenant_id=%s AND ib.project_id=%s
                """,
                (TENANT_ID, PROJECT_ID),
            )
            cur.execute(
                f"DELETE FROM {DB_SCHEMA}.import_batches WHERE tenant_id=%s AND project_id=%s",
                (TENANT_ID, PROJECT_ID),
            )

            # Fingerprint the full node payloads, not just their keys: iterating the
            # nodes dict directly would yield keys only, so content edits would not
            # change the hash.  sort_keys makes it independent of dict insertion order.
            fingerprint = json.dumps(
                {"nodes": list(builder.nodes.values()), "rels": builder.relations},
                ensure_ascii=False,
                sort_keys=True,
            )
            file_hash = hashlib.md5(fingerprint.encode()).hexdigest()
            cur.execute(
                f"""
                INSERT INTO {DB_SCHEMA}.import_batches (
                    tenant_id, project_id, graph_name, template_id, source_name, file_name,
                    file_hash, status, total_rows, success_rows, failed_rows, created_by, updated_at
                )
                VALUES (%s,%s,%s,%s,%s,%s,%s,'published',%s,%s,0,'codex-import',now())
                RETURNING id
                """,
                (
                    TENANT_ID,
                    PROJECT_ID,
                    GRAPH_NAME,
                    TEMPLATE_ID,
                    "旅行社业务文件夹",
                    str(SOURCE_DIR),
                    file_hash,
                    total_rows,
                    total_rows,
                ),
            )
            batch_id = cur.fetchone()["id"]

            # One candidate entity and one raw record per node; remember the
            # generated ids so relations can reference them.
            entity_id_by_key: dict[str, int] = {}
            for row_number, (key, node) in enumerate(builder.nodes.items(), start=1):
                payload = {k: v for k, v in node.items() if k not in {"label", "natural_key", "name"}}
                cur.execute(
                    f"""
                    INSERT INTO {DB_SCHEMA}.candidate_entities (
                        tenant_id, project_id, batch_id, template_id, entity_type, natural_key,
                        display_name, payload_jsonb, confidence, status, reviewed_by, reviewed_at, updated_at
                    )
                    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,0.92,'published','codex-import',now(),now())
                    RETURNING id
                    """,
                    (
                        TENANT_ID,
                        PROJECT_ID,
                        batch_id,
                        TEMPLATE_ID,
                        node["label"],
                        key,
                        node.get("name") or key,
                        Jsonb(payload),
                    ),
                )
                entity_id_by_key[key] = cur.fetchone()["id"]
                row_hash = hashlib.md5(
                    json.dumps(node, ensure_ascii=False, sort_keys=True).encode()
                ).hexdigest()
                cur.execute(
                    f"""
                    INSERT INTO {DB_SCHEMA}.raw_records (batch_id, row_number, raw_jsonb, row_hash, parse_status)
                    VALUES (%s,%s,%s,%s,'parsed')
                    ON CONFLICT (batch_id, row_number) DO NOTHING
                    """,
                    (batch_id, row_number, Jsonb(node), row_hash),
                )

            for rel in builder.relations:
                source_id = entity_id_by_key.get(rel["source"])
                target_id = entity_id_by_key.get(rel["target"])
                # Compare against None so a legitimate id of 0 is not mistaken for
                # "missing"; count the skip instead of dropping it silently.
                if source_id is None or target_id is None:
                    skipped_relations += 1
                    continue
                cur.execute(
                    f"""
                    INSERT INTO {DB_SCHEMA}.candidate_relations (
                        tenant_id, project_id, batch_id, source_candidate_id, relation_type,
                        target_candidate_id, target_ref_jsonb, payload_jsonb, status
                    )
                    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,'published')
                    """,
                    (
                        TENANT_ID,
                        PROJECT_ID,
                        batch_id,
                        source_id,
                        rel["relation_type"],
                        target_id,
                        Jsonb({"natural_key": rel["target"]}),
                        Jsonb(rel.get("properties") or {}),
                    ),
                )

            # The batch row was inserted assuming every row succeeds; correct the
            # counters when relations had to be skipped for missing endpoints.
            if skipped_relations:
                cur.execute(
                    f"""
                    UPDATE {DB_SCHEMA}.import_batches
                    SET success_rows=%s, failed_rows=%s, updated_at=now()
                    WHERE id=%s
                    """,
                    (total_rows - skipped_relations, skipped_relations, batch_id),
                )

            for item in qa:
                # Built-in cases use "status", generated ones "support_status";
                # resolve once so the coverage score and stored action agree.
                status = item.get("status") or item.get("support_status")
                missing = [] if status in passing_statuses else ["需要实时数据源"]
                cur.execute(
                    f"""
                    INSERT INTO {DB_SCHEMA}.question_traces (
                        tenant_id, project_id, source, origin, question_text, parsed_intent_jsonb,
                        coverage_score, confidence, evidence_count, matched_entity_ids, missing_fields,
                        scenario_tags, suggested_action, evaluated_at
                    )
                    VALUES (%s,%s,'codex_test','travel_agency_qa',%s,%s,%s,0.9,%s,%s,%s,%s,%s,now())
                    """,
                    (
                        TENANT_ID,
                        PROJECT_ID,
                        item["question"],
                        Jsonb({"expected_support": item.get("expected_support") or item.get("needed_graph_paths")}),
                        0.86 if not missing else 0.72,
                        5,
                        Jsonb([]),
                        Jsonb(missing),
                        Jsonb(["旅行社", "客服行程推荐", "复杂问答"]),
                        status,
                    ),
                )
        conn.commit()
    return {"schema_id": schema_id, "batch_id": batch_id}
def write_falkor(builder: KGBuilder) -> dict[str, int]:
    """Rebuild the ``GRAPH_NAME`` graph in FalkorDB from the builder's nodes and relations.

    An existing graph of that name is deleted first.  Each node is merged on
    its ``natural_key`` under a label stripped to ``[A-Za-z0-9_]`` (falling
    back to ``Entity``); each relation is merged between the nodes matching
    its ``source`` / ``target`` keys under an upper-cased type stripped to
    ``[A-Z0-9_]`` (falling back to ``RELATED_TO``).

    Returns:
        ``{"graph_nodes": ..., "graph_relations": ...}`` as counted by the
        graph after loading.
    """
    label_junk = re.compile(r"[^A-Za-z0-9_]")
    rel_type_junk = re.compile(r"[^A-Z0-9_]")

    client = FalkorDB(host="localhost", port=6380)
    if GRAPH_NAME in client.list_graphs():
        client.select_graph(GRAPH_NAME).delete()
    graph = client.select_graph(GRAPH_NAME)

    def _scalar(cypher: str) -> int:
        # First column of the first row of a single-value query.
        return graph.query(cypher).result_set[0][0]

    for node in builder.nodes.values():
        label = label_junk.sub("", node["label"]) or "Entity"
        node_props = graph_safe_props(node)
        graph.query(
            f"MERGE (n:{label} {{natural_key:$natural_key}}) SET n += $props",
            {"natural_key": node["natural_key"], "props": node_props},
        )

    for rel in builder.relations:
        rel_type = rel_type_junk.sub("", rel["relation_type"].upper()) or "RELATED_TO"
        # The edge gets a synthetic natural_key; explicit relation properties
        # are applied afterwards and therefore win on key clashes.
        edge_payload: dict[str, Any] = {
            "natural_key": f"{rel['source']}->{rel_type}->{rel['target']}",
        }
        edge_payload.update(rel.get("properties") or {})
        edge_props = graph_safe_props(edge_payload)
        graph.query(
            f"""
            MATCH (a {{natural_key:$source}}), (b {{natural_key:$target}})
            MERGE (a)-[r:{rel_type}]->(b)
            SET r += $props
            """,
            {"source": rel["source"], "target": rel["target"], "props": edge_props},
        )

    return {
        "graph_nodes": _scalar("MATCH (n) RETURN count(n)"),
        "graph_relations": _scalar("MATCH ()-[r]->() RETURN count(r)"),
    }
def build() -> dict[str, Any]:
    """Run the whole pipeline and return the execution summary.

    Populates a ``KGBuilder`` from the seed and parsing steps, generates the
    QA suite, writes the output files, loads PostgreSQL and FalkorDB, and
    finally saves the summary as ``入库执行摘要.json`` in ``OUT_DIR``.
    """
    kg = KGBuilder()
    schema = seed_schema()

    # Seed data and source-file extraction.
    aliases = add_attractions(kg)
    vehicles = seed_vehicles(kg)
    extract_products_from_docs(kg, aliases)
    hotel_by_region, restaurant_by_region = parse_resource_workbooks(kg)
    parse_transfer_quotes(kg, vehicles)
    parse_small_group_prices(kg, aliases, vehicles)
    parse_independent_group_prices(kg, aliases, vehicles)
    channels = parse_sales_scripts(kg)
    connect_day_resources(kg, hotel_by_region, restaurant_by_region)

    # Demands/plans produce extra QA entries that extend the fixed suite.
    generated_qa = create_demands_and_plans(
        kg, aliases, vehicles, hotel_by_region, restaurant_by_region, channels
    )
    qa = qa_suite(generated_qa)

    # Persist: files first, then PostgreSQL, then the graph database.
    write_outputs(kg, schema, qa)
    pg_info = upsert_postgres(kg, schema, qa)
    graph_info = write_falkor(kg)

    summary: dict[str, Any] = {
        "tenant_id": TENANT_ID,
        "project_id": PROJECT_ID,
        "graph_name": GRAPH_NAME,
        "nodes": len(kg.nodes),
        "relations": len(kg.relations),
    }
    summary.update(pg_info)
    summary.update(graph_info)
    summary["output_dir"] = str(OUT_DIR)

    summary_path = OUT_DIR / "入库执行摘要.json"
    summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
    return summary
if __name__ == "__main__":
    # Script entry point: run the pipeline and echo its summary as pretty JSON.
    run_summary = build()
    print(json.dumps(run_summary, ensure_ascii=False, indent=2))