2173 lines
107 KiB
Python
2173 lines
107 KiB
Python
from __future__ import annotations
|
||
|
||
import csv
|
||
import hashlib
|
||
import json
|
||
import re
|
||
import subprocess
|
||
from collections import Counter, defaultdict
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
import pandas as pd
|
||
import psycopg
|
||
from falkordb import FalkorDB
|
||
from psycopg.rows import dict_row
|
||
from psycopg.types.json import Jsonb
|
||
|
||
|
||
# Filesystem locations used by this import script.
# NOTE(review): these are absolute, machine-specific paths — presumably they
# must be adjusted per environment; confirm before running elsewhere.
SOURCE_DIR = Path("/Users/xuexue/Downloads/旅行社业务")
OUT_DIR = Path("/Users/xuexue/Downloads/图谱数据/旅行社项目入库")
SCHEMA_DIR = Path("/Users/xuexue/new2/schema搭建/travel_agency_business")
# PostgreSQL connection URL and schema name.
# NOTE(review): the URL embeds a user name and password directly in source.
DB_URL = "postgresql://admin:password@localhost:5433/kg_admin"
DB_SCHEMA = "kg_admin_new2"
# Tenant / project / graph / template identifiers.
TENANT_ID = "travel_agency"
PROJECT_ID = "travel_agency"
GRAPH_NAME = "travel_agency"
TEMPLATE_ID = "travel_agency_itinerary_planning_v0_3"
|
||
|
||
|
||
# Seed list of scenic attractions. Each tuple is unpacked by add_attractions()
# as (name, aliases, city, attraction type, selling point); every alias is
# mapped to the attraction's node key.
ATTRACTION_SEEDS = [
    ("黄果树", ["黄果树", "黄果树瀑布", "黄果树大瀑布", "黄果树风景名胜区"], "安顺", "瀑布/5A", "贵州龙头景区,瀑布群核心卖点。"),
    ("天星桥", ["天星桥", "天星桥景区"], "安顺", "喀斯特/黄果树景区", "水上石林、天然盆景。"),
    ("陡坡塘瀑布", ["陡坡塘", "陡坡塘瀑布"], "安顺", "瀑布/黄果树景区", "瀑面宽,西游记取景。"),
    ("荔波小七孔", ["小七孔", "荔波小七孔", "小七孔景区"], "黔南", "山水/5A", "世界自然遗产,水上森林、卧龙潭等。"),
    ("西江千户苗寨", ["西江", "西江苗寨", "西江千户苗寨"], "黔东南", "民族村寨/4A", "苗寨夜景、长桌宴、吊脚楼。"),
    ("镇远古城", ["镇远", "镇远古镇", "镇远古城"], "黔东南", "古城/5A", "古城夜景、舞阳河沿岸住宿。"),
    ("梵净山", ["梵净山"], "铜仁", "山岳/5A", "弥勒道场、蘑菇石、金顶。"),
    ("青岩古镇", ["青岩", "青岩古镇"], "贵阳", "古镇/5A", "卤猪脚、小吃、送机前半日游。"),
    ("百里杜鹃", ["百里杜鹃"], "毕节", "赏花", "3-4月花期主题。"),
    ("平坝樱花", ["平坝樱花", "平坝农场"], "安顺", "赏花", "春季樱花主题。"),
    ("织金洞", ["织金洞"], "毕节", "溶洞/5A", "大型喀斯特溶洞。"),
    ("中国天眼", ["天眼", "中国天眼", "FAST"], "黔南", "科技研学", "天文研学卖点。"),
    ("茅台镇", ["茅台", "茅台镇"], "遵义", "酒文化", "酱酒文化体验。"),
    ("遵义会议会址", ["遵义会址", "遵义会议会址"], "遵义", "红色文化", "红色研学路线核心。"),
    ("兴义万峰林", ["万峰林", "兴义万峰林"], "黔西南", "峰林", "黔西南山水。"),
    ("万峰湖", ["万峰湖"], "黔西南", "湖泊", "兴义水上体验。"),
    ("马岭河峡谷", ["马岭河", "马岭河峡谷"], "黔西南", "峡谷", "兴义峡谷景观。"),
    ("花江大桥", ["花江大桥"], "安顺/黔西南", "桥梁景观", "桥见贵州特色线路。"),
    ("龙宫", ["龙宫"], "安顺", "溶洞/5A", "安顺秘境类产品。"),
    ("天河潭", ["天河潭"], "贵阳", "山水", "贵阳近郊半日/首日。"),
    ("甲秀楼", ["甲秀楼"], "贵阳", "城市地标", "贵阳市区地标。"),
    ("黔灵山公园", ["黔灵公园", "黔灵山"], "贵阳", "城市公园", "贵阳市区轻量游。"),
    ("乌江寨", ["乌江寨"], "遵义", "度假街区", "夜游/住宿度假。"),
    ("高坡云顶花海", ["高坡云顶花海", "云顶花海"], "贵阳", "花海", "暑期/亲子花海。"),
    ("野洞河", ["野洞河"], "黔东南", "漂流", "漂流体验。"),
]
|
||
|
||
|
||
# Seed list of vehicle offerings. The names in the first column are the values
# returned by vehicle_from_text().
# NOTE(review): the remaining columns presumably map to seat_count,
# comfort_level, seat_layout and service_scope of VehicleService — the code
# that consumes this list is not visible here; confirm against it.
VEHICLE_SEEDS = [
    ("5座经济型", 5, "经济", "5座", "接送/小团"),
    ("5座舒适型", 5, "舒适", "5座", "接送/小团"),
    ("5座豪华型", 5, "豪华", "奔驰E300", "高端接送"),
    ("7座别克商务GL8", 7, "舒适", "7座商务", "接送/2-6人小包团"),
    ("7座奔驰威霆", 7, "豪华", "7座商务", "高端接送/小包团"),
    ("9座商务车", 9, "商务", "9座", "7-8人拼小团"),
    ("2+1保姆车", 32, "轻奢", "横排2+1", "轻奢拼小团/保姆车产品"),
    ("2+2商务车", 32, "商务", "横排2+2", "经典/多彩商务版"),
    ("32-38座2+1大巴", 38, "独立团", "32-38座2+1", "20-25人独立成团"),
    ("旅游大巴", 55, "标准", "大巴", "一日游/常规散客"),
]
|
||
|
||
|
||
def clean(value: Any) -> str:
    """Normalize any value into a trimmed, single-line string.

    ``None`` and float NaN yield ``""``. NUL, zero-width space (U+200B) and
    right-to-left mark (U+200F) characters are dropped, and every run of
    whitespace (including newlines) is collapsed to a single space.
    """
    is_missing = value is None or (isinstance(value, float) and pd.isna(value))
    if is_missing:
        return ""
    stripped = str(value)
    for invisible in ("\x00", "\u200b", "\u200f"):
        stripped = stripped.replace(invisible, "")
    return re.sub(r"\s+", " ", stripped).strip()
|
||
|
||
|
||
def multiline(value: Any) -> str:
    """Normalize a value into text while keeping its line structure.

    ``None`` and float NaN yield ``""``. NUL, U+200B and U+200F characters are
    removed, runs of spaces/tabs become one space, and three or more
    consecutive newlines are reduced to a single blank line.
    """
    is_missing = value is None or (isinstance(value, float) and pd.isna(value))
    if is_missing:
        return ""
    body = str(value)
    for invisible in ("\x00", "\u200b", "\u200f"):
        body = body.replace(invisible, "")
    body = re.sub(r"[ \t]+", " ", body)
    body = re.sub(r"\n{3,}", "\n\n", body)
    return body.strip()
|
||
|
||
|
||
def slug(text: str, prefix: str = "") -> str:
    """Build an identifier fragment ``<prefix><stem>_<md5[:8]>`` from ``text``.

    The stem is the cleaned text with whitespace and listed punctuation turned
    into single underscores, trimmed of edge underscores and cut to 50
    characters. The 8-hex-digit suffix is the MD5 of the cleaned text.

    NOTE(review): the punctuation class lists ``(``, ``)`` and ``:`` twice;
    fullwidth variants were presumably intended. Left as is because this
    value feeds node natural keys — confirm before changing.
    """
    normalized = clean(text)
    collapsed = re.sub(r"[\s()()《》【】、,。::/\\\\]+", "_", normalized)
    stem = re.sub(r"_+", "_", collapsed).strip("_")[:50]
    fingerprint = hashlib.md5(normalized.encode("utf-8")).hexdigest()[:8]
    return f"{prefix}{stem}_{fingerprint}"
|
||
|
||
|
||
def money(value: Any) -> float | None:
    """Return the first (optionally negative, optionally decimal) number in ``value``.

    The value is cleaned first; ``None`` is returned when the cleaned text is
    empty or contains no digits.
    """
    text = clean(value)
    number = re.search(r"-?\d+(?:\.\d+)?", text) if text else None
    if number is None:
        return None
    return float(number.group())
|
||
|
||
|
||
def duration_from_text(text: str) -> int | None:
    """Extract a trip length in days from free text.

    Digit forms are tried first, in priority order: ``N日游``, ``N天``, then a
    bare ``N日``. The bare form is skipped when the number belongs to a
    calendar date (``5月20日``). If no digit form matches, Chinese numerals up
    to 99 are accepted before ``日游`` / ``天游`` or in the ``X天Y晚`` form.

    Returns:
        The number of days, or ``None`` when nothing matches.
    """
    digit_patterns = (
        r"(\d+)\s*日游",
        r"(\d+)\s*天",
        # Guard against dates: the digits must not follow 月 or another digit.
        r"(?<![月\d])(\d+)\s*日",
    )
    for pattern in digit_patterns:
        m = re.search(pattern, text)
        if m:
            return int(m.group(1))

    units = {"一": 1, "二": 2, "两": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
    # The lookbehind rejects ordinals (第一天) and avoids matching only the tail
    # of a longer numeral; "天" alone is accepted only in the "X天Y晚" form to
    # keep ordinary prose such as "一天" from being read as a duration.
    m = re.search(
        r"(?<![第一二两三四五六七八九十])"
        r"([二三四五六七八九]?十[一二三四五六七八九]?|[一二两三四五六七八九])"
        r"\s*(?:日游|天游|天(?=\s*[一二两三四五六七八九十\d]+\s*[晚夜]))",
        text,
    )
    if not m:
        return None
    numeral = m.group(1)
    if "十" not in numeral:
        return units[numeral]
    tens, _, ones = numeral.partition("十")
    return (units[tens] if tens else 1) * 10 + (units[ones] if ones else 0)
|
||
|
||
|
||
def read_office_text(path: Path) -> str:
    """Convert an office document to plain text with the ``textutil`` command.

    Runs ``textutil -convert txt -stdout <path>`` and returns its stdout. Any
    failure (missing command, non-zero exit, decoding error, ...) is reported
    in-band as a ``[textutil读取失败: ...]`` string instead of raising.
    """
    command = ["textutil", "-convert", "txt", "-stdout", str(path)]
    try:
        completed = subprocess.run(command, check=True, capture_output=True, text=True)
    except Exception as exc:  # noqa: BLE001
        return f"[textutil读取失败: {exc}]"
    return completed.stdout
|
||
|
||
|
||
def short(text: str, limit: int = 900) -> str:
    """Return the cleaned (single-line) text cut to at most ``limit`` characters."""
    return clean(text)[:limit]
|
||
|
||
|
||
def extract_between(text: str, starts: list[str], ends: list[str], limit: int = 1200) -> str:
    """Return the cleaned text from the earliest start token up to the nearest end token.

    Args:
        text: Text to search.
        starts: Candidate start tokens; the one found earliest wins (on a tie
            the longer token is used). The returned block includes the token.
        ends: Candidate end tokens; the block stops before the nearest one
            found after the start token. Without any, it runs to end of text.
        limit: Maximum length of the returned (cleaned) block.

    Returns:
        The cleaned, truncated block, or ``""`` when no start token is found.
    """
    begin = -1
    begin_len = 0
    for token in starts:
        pos = text.find(token)
        if pos < 0:
            continue
        if begin < 0 or pos < begin or (pos == begin and len(token) > begin_len):
            begin, begin_len = pos, len(token)
    if begin < 0:
        return ""

    # Search for end tokens only after the matched start token, so an end token
    # contained in a longer start token cannot cut the block short. The offset
    # never drops below the previous fixed value of 2, so short start tokens
    # behave as before.
    search_from = begin + max(begin_len, 2)
    finish = len(text)
    for token in ends:
        pos = text.find(token, search_from)
        if pos >= 0:
            finish = min(finish, pos)
    return short(text[begin:finish], limit)
|
||
|
||
|
||
def product_family(name: str, text: str) -> str:
    """Classify a product into a family from its name and the first 1500 chars of text.

    Rules are evaluated in order and the first family with any trigger token
    present wins; ``"常规纯玩"`` is the fallback.
    """
    haystack = f"{name} {text[:1500]}"
    ordered_rules = (
        ("高端纯玩", ("高端", "5钻", "五钻")),
        ("轻奢纯玩", ("轻奢", "2+1", "保姆车", "头等舱")),
        ("多彩贵州", ("多彩",)),
        ("经典纯玩", ("经典",)),
        ("游黔途", ("1+1", "游黔途")),
        ("独立成团", ("独立", "20-25")),
    )
    for family, triggers in ordered_rules:
        if any(trigger in haystack for trigger in triggers):
            return family
    return "常规纯玩"
|
||
|
||
|
||
def hotel_grade_from_text(text: str) -> str:
    """Map free text to a hotel grade label, or ``""`` when nothing matches.

    Grades are checked from highest to lowest; the first one with a matching
    token is returned.
    """
    grade_rules = (
        ("5钻/五星", ("5钻", "五钻", "五星", "超五星")),
        ("4钻/四星", ("4钻", "四钻", "四星")),
        ("商务", ("商务",)),
        ("客栈", ("客栈",)),
    )
    for grade, tokens in grade_rules:
        if any(token in text for token in tokens):
            return grade
    return ""
|
||
|
||
|
||
def vehicle_from_text(text: str) -> str:
    """Map free text to one of the vehicle type names, or ``""`` when unknown.

    Rules are evaluated in order and the first match wins. The explicit seat
    range ``32-38`` is checked before the generic ``2+1`` rule, because the
    coach name "32-38座2+1大巴" itself contains "2+1" and would otherwise always
    be classified as the 2+1 nanny van.
    """
    ordered_rules = (
        ("32-38座2+1大巴", ("32-38",)),
        ("2+1保姆车", ("2+1", "保姆车", "头等舱")),
        ("2+2商务车", ("2+2",)),
        ("32-38座2+1大巴", ("独立成团",)),
        # "大巴" also covers "旅游大巴", so one token is enough.
        ("旅游大巴", ("大巴",)),
        ("9座商务车", ("9座",)),
        ("7座别克商务GL8", ("7座",)),
        ("5座舒适型", ("5座",)),
    )
    for vehicle, tokens in ordered_rules:
        if any(token in text for token in tokens):
            return vehicle
    return ""
|
||
|
||
|
||
def split_items(text: str) -> list[str]:
    """Split a list-like string into at most 20 cleaned, non-empty items.

    Items are separated by ``、``, ASCII or fullwidth commas/semicolons, ``/``
    or newlines. Empty pieces and the literal ``nan`` (any case) are dropped.
    """
    if not clean(text):
        return []
    # Split the raw string and clean each piece afterwards: cleaning first
    # collapses newlines into spaces, which made the "\n" delimiter unreachable.
    pieces = re.split(r"[、,\uff0c;\uff1b/\n]+", str(text))
    items = [clean(piece) for piece in pieces]
    return [item for item in items if item and item.lower() != "nan"][:20]
|
||
|
||
|
||
def unique_clean(items: list[Any], limit: int = 30) -> list[str]:
    """Clean every item and return the distinct non-empty values in first-seen order.

    Collection stops as soon as ``limit`` values have been kept (the check
    happens after each addition).
    """
    ordered: dict[str, None] = {}
    for raw in items:
        candidate = clean(raw)
        if candidate and candidate not in ordered:
            ordered[candidate] = None
            if len(ordered) >= limit:
                break
    return list(ordered)
|
||
|
||
|
||
def source_digest(*parts: Any, length: int = 10) -> str:
    """Return an upper-case MD5 prefix identifying the given parts.

    The parts are cleaned, joined with ``"||"``, UTF-8 encoded and hashed; the
    first ``length`` hex digits are returned in upper case.
    """
    joined = "||".join(map(clean, parts))
    hex_digest = hashlib.md5(joined.encode("utf-8")).hexdigest()
    return hex_digest[:length].upper()
|
||
|
||
|
||
def sentence_snippets(text: str, keywords: list[str], limit: int = 4, max_len: int = 180) -> list[str]:
    """Return up to ``limit`` distinct sentences of ``text`` containing any keyword.

    Sentences are split on ``。``, ASCII and fullwidth ``!?;`` and newlines.
    Each is cleaned, ignored when shorter than 4 characters, and truncated to
    ``max_len``. Duplicates are skipped while collecting so they do not use
    up the ``limit`` budget.
    """
    snippets: list[str] = []
    seen: set[str] = set()
    for raw in re.split(r"[。!?;\uff01\uff1f\uff1b\n]+", text):
        item = clean(raw)
        if len(item) < 4 or not any(keyword in item for keyword in keywords):
            continue
        snippet = clean(item[:max_len])
        if not snippet or snippet in seen:
            continue
        seen.add(snippet)
        snippets.append(snippet)
        if len(snippets) >= limit:
            break
    return snippets
|
||
|
||
|
||
def build_alias_reverse(attraction_aliases: dict[str, str]) -> dict[str, list[str]]:
    """Invert an alias -> key mapping into key -> aliases.

    Each alias list is de-duplicated (at most 20 entries) and ordered longest
    first. The result is a ``defaultdict(list)``.
    """
    grouped: dict[str, list[str]] = defaultdict(list)
    for alias, key in attraction_aliases.items():
        grouped[key].append(alias)
    for key in list(grouped):
        distinct = unique_clean(grouped[key], 20)
        grouped[key] = sorted(distinct, key=len, reverse=True)
    return grouped
|
||
|
||
|
||
def parse_simple_itinerary(text: str) -> dict[int, dict[str, str]]:
    """Parse the "简易行程" table into ``{day_index: {route, meals, accommodation}}``.

    The block between "简易行程" and the next of "详细行程" / "接待标准" /
    "费用包含" is split into cells on ``\\x07`` or tab. A cell of the form
    ``D<n>`` starts a row, and the next three non-empty cells are taken as
    route, meals and accommodation. Because empty cells are dropped, a row
    with a blank column shifts the following values.

    Returns:
        A mapping keyed by day number; empty when the block is not found.
    """
    # extract_between() cleans the block, which collapses tabs into plain
    # spaces and merges tab-separated cells. Convert tabs to the \x07 separator
    # first (same length, so token positions are unchanged).
    block = extract_between(
        text.replace("\t", "\x07"),
        ["简易行程"],
        ["详细行程", "接待标准", "费用包含"],
        2400,
    )
    if not block:
        return {}
    cells = [cell for cell in (clean(x) for x in re.split(r"[\x07\t]+", block)) if cell]

    def cell_at(position: int) -> str:
        """Return the cell at ``position`` or "" when past the end."""
        return cells[position] if position < len(cells) else ""

    rows: dict[int, dict[str, str]] = {}
    for idx, cell in enumerate(cells):
        if not re.fullmatch(r"D\s*\d+", cell, flags=re.I):
            continue
        day_index = int(re.search(r"\d+", cell).group())
        rows[day_index] = {
            "route": cell_at(idx + 1),
            "meals": cell_at(idx + 2),
            "accommodation": cell_at(idx + 3),
        }
    return rows
|
||
|
||
|
||
def day_title_from_body(day_index: int, fallback: str, body: str, simple_row: dict[str, str] | None = None) -> str:
    """Choose a title for one itinerary day.

    Preference order: the ``route`` of ``simple_row``; otherwise the first of
    the first four non-empty body lines that is not a day heading (starts
    with "第" or is ``D<n>``) and mentions meals, lodging or a route arrow
    (cut to 120 characters); otherwise ``fallback``; otherwise ``D<day_index>``.
    """
    route = simple_row.get("route") if simple_row else None
    if route:
        return route

    route_markers = ("餐", "住", "→", "->", ">>")
    leading_lines = [line for line in map(clean, body.splitlines()) if line][:4]
    for line in leading_lines:
        is_heading = line.startswith("第") or re.fullmatch(r"D\s*\d+", line, flags=re.I)
        if is_heading:
            continue
        if any(marker in line for marker in route_markers):
            return line[:120]
    return fallback or f"D{day_index}"
|
||
|
||
|
||
def extract_meal_text(body: str, simple_row: dict[str, str] | None = None) -> str:
    """Return the meal description for a day.

    Uses ``simple_row["meals"]`` when present; otherwise looks in ``body`` for
    ``餐:`` (ASCII or fullwidth colon) followed by up to 20 characters drawn
    from 早/中/晚/无 and separators. Returns ``""`` when nothing is found.
    """
    if simple_row and simple_row.get("meals"):
        return simple_row["meals"]
    # The fullwidth colon and comma are listed explicitly: itinerary text
    # commonly uses them rather than the ASCII forms.
    m = re.search(r"餐\s*[:\uff1a]\s*([早中晚/、,\uff0c无\- ]{1,20})", body)
    return clean(m.group(1)) if m else ""
|
||
|
||
|
||
def meal_parts(meal_text: str, product_meal_standard: str = "") -> dict[str, str]:
    """Break a meal description into breakfast/lunch/dinner inclusion flags.

    When the cleaned text is empty or one of "/", "无", "不含", every meal is
    "不含/未写明". Otherwise a meal is "含" when its marker character (早/中/晚)
    occurs in the text and "不含/自理" when it does not. ``meal_text`` and
    ``meal_standard`` are always included in the result.
    """
    text = clean(meal_text)
    slots = (("breakfast", "早"), ("lunch", "中"), ("dinner", "晚"))
    if not text or text in {"/", "无", "不含"}:
        parts = {slot: "不含/未写明" for slot, _ in slots}
        parts["meal_text"] = text or "未写明"
    else:
        parts = {slot: ("含" if marker in text else "不含/自理") for slot, marker in slots}
        parts["meal_text"] = text
    parts["meal_standard"] = product_meal_standard
    return parts
|
||
|
||
|
||
def extract_accommodation_text(body: str, simple_row: dict[str, str] | None = None) -> str:
    """Return the lodging description for a day.

    Uses ``simple_row["accommodation"]`` when present; otherwise captures up
    to 40 characters after ``住:`` (ASCII or fullwidth colon) in ``body``,
    stopping at a newline, ``。`` or an ASCII/fullwidth semicolon. Returns
    ``""`` when nothing is found.
    """
    if simple_row and simple_row.get("accommodation"):
        return simple_row["accommodation"]
    m = re.search(r"住\s*[:\uff1a]\s*([^\n。;\uff1b]{1,40})", body)
    return clean(m.group(1)) if m else ""
|
||
|
||
|
||
def extract_product_meal_standard(text: str) -> str:
    """Return the product-level meal standard sentence, or ``""`` if none is found.

    Three patterns are tried in order: "含 N早N正 ...", "正餐餐标 N元/人 ..." and
    a generic "餐标 ...". The first match is cleaned and returned.
    """
    patterns = (
        r"含\s*([0-9一二三四五六七八九十]+\s*早\s*[0-9一二三四五六七八九十]+\s*正[^。\n]{0,90})",
        r"(正餐餐标\s*\d+\s*元/人[^。\n]{0,80})",
        r"(餐标[::]?[^。\n]{2,80})",
    )
    attempts = (re.search(pattern, text) for pattern in patterns)
    first_hit = next((hit for hit in attempts if hit), None)
    if first_hit is None:
        return ""
    return clean(first_hit.group(1))
|
||
|
||
|
||
def extract_selling_points(text: str) -> list[str]:
    """Collect up to 12 distinct selling-point lines from ``text``.

    Only lines starting with a ❀, ★ or ● bullet are considered. The bullet is
    stripped, and the line is kept (cut to 180 characters) when it mentions
    one of the selling-point keywords.
    """
    bullets = ("❀", "★", "●")
    topic_tokens = ("核心", "卖点", "甄选", "景点", "赠送", "酒店", "车型", "纯玩")
    collected: list[str] = []
    for raw in re.split(r"\n+", text):
        line = clean(raw)
        if not line or not line.startswith(bullets):
            continue
        line = re.sub(r"^[❀★●\s]+", "", line)
        if any(token in line for token in topic_tokens):
            collected.append(line[:180])
    return unique_clean(collected, 12)
|
||
|
||
|
||
def group_mode_from_text(text: str) -> str:
    """Describe how the group is formed, as a ``;``-joined list of labels.

    Labels are emitted in a fixed order: "当地散拼成团", the first "<N>人团"
    found, "拼小团", "独立成团". When none applies the result is "散客/常规团".
    """
    team = re.search(r"(\d+\s*人团)", text)
    ordered_labels = [
        "当地散拼成团" if "当地散拼成团" in text else "",
        team.group(1).replace(" ", "") if team else "",
        "拼小团" if "拼小团" in text else "",
        "独立成团" if "独立成团" in text else "",
    ]
    # unique_clean drops the empty placeholders and any repeated label.
    return ";".join(unique_clean(ordered_labels, 4)) or "散客/常规团"
|
||
|
||
|
||
def group_capacity_from_text(text: str) -> tuple[int | None, int | None]:
    """Extract ``(min, max)`` group size from free text.

    Resolution order:
      1. ``<a>-<b>人团`` gives ``(a, b)``; a plain ``<n>人团`` gives ``(1, n)``.
      2. A general ``<a>-<b>人`` range gives ``(a, b)``.
      3. Otherwise ``(None, None)``.

    Range separators accepted: ``-``, ``~``, their fullwidth forms, and ``至``.
    """
    separator = r"[-~\uff0d\uff5e至]"
    # The optional range prefix keeps "2-6人团" from being read as "6人团".
    m = re.search(rf"(?:(\d+)\s*{separator}\s*)?(\d+)\s*人团", text)
    if m:
        lower = int(m.group(1)) if m.group(1) else 1
        return (lower, int(m.group(2)))
    m = re.search(rf"(\d+)\s*{separator}\s*(\d+)\s*人", text)
    if m:
        return (int(m.group(1)), int(m.group(2)))
    return (None, None)
|
||
|
||
|
||
def service_promise_from_text(text: str, name: str) -> str:
    """Summarize the no-shopping / no-extra-fee promise of a product.

    Returns up to four matching sentences from ``text`` joined with ``;``. If
    none match, returns "纯玩" when the product name contains it, else ``""``.
    """
    promise_keywords = ["0自费", "0购物", "无购物", "纯玩", "进店赔付"]
    matched = sentence_snippets(text, promise_keywords, limit=4, max_len=220)
    if not matched:
        return "纯玩" if "纯玩" in name else ""
    return ";".join(matched)
|
||
|
||
|
||
def extract_route_points(route_text: str) -> list[str]:
    """Split a route line into its ordered, cleaned stop names.

    Logistics words (departure place, hotel, meals, stations, ...) are removed
    first. The remainder is split on route separators: ``→``, ``->``, ``>>``,
    ``>``, ``—``, and ASCII or fullwidth hyphen and tilde. Empty pieces are
    dropped.
    """
    cleaned = re.sub(r"(出发地|各地|酒店|用餐|早餐|中餐|晚餐|入住|返回|散团|送站|机场|高铁站|火车站)", "", route_text)
    # "->" and ">>" are listed before ">" and "-" so they split as one unit.
    parts = re.split(r"→|->|>>|>|—|-|\uff0d|~|\uff5e", cleaned)
    stops = (clean(part) for part in parts)
    return [stop for stop in stops if stop]
|
||
|
||
|
||
def day_attractions(
    body: str,
    route_text: str,
    attraction_aliases: dict[str, str],
    alias_reverse: dict[str, list[str]],
) -> list[tuple[str, str, int]]:
    """Find attractions mentioned in a day's route and body text.

    The route text and body are joined with a newline and searched for every
    alias in ``alias_reverse``. For each attraction, the alias occurring
    earliest is kept (the first listed alias wins a tie).
    ``attraction_aliases`` is accepted but not used.

    Returns:
        ``(attraction_key, matched_alias, position)`` tuples ordered by position.
    """
    search_text = f"{route_text}\n{body}"
    hits: list[tuple[str, str, int]] = []
    for key, aliases in alias_reverse.items():
        best_alias = ""
        best_pos = -1
        for alias in aliases:
            if not alias:
                continue
            pos = search_text.find(alias)
            if pos < 0:
                continue
            if best_pos < 0 or pos < best_pos:
                best_alias, best_pos = alias, pos
        if best_pos >= 0:
            hits.append((key, best_alias, best_pos))
    # list.sort is stable, so attractions at the same position keep mapping order.
    hits.sort(key=lambda hit: hit[2])
    return hits
|
||
|
||
|
||
def duration_near_alias(body: str, aliases: list[str]) -> str:
    """Return the visit-duration phrase found near the first alias mention.

    A window from 80 characters before to 220 characters after the earliest
    alias occurrence is searched for "游览时间/游时约/游览约/游览" followed by a
    number and "小时" or "分钟". Up to 12 trailing characters are included,
    stopping at a closing parenthesis, comma, ``。`` or semicolon (ASCII or
    fullwidth).

    Returns:
        The cleaned phrase, or ``""`` when no alias or no duration is found.
    """
    positions = [body.find(alias) for alias in aliases if alias in body]
    if not positions:
        return ""
    best_pos = min(positions)
    window = body[max(0, best_pos - 80): best_pos + 220]
    m = re.search(
        r"(?:游览时间|游时约|游览约|游览)\s*"
        r"([0-9.半一二三四五六七八九十]+ ?(?:小时|分钟)[^)\uff09,\uff0c。;\uff1b]{0,12})",
        window,
    )
    if not m:
        return ""
    # Same normalization as the file-wide clean() helper, inlined so the block
    # has no other dependency: drop NUL / zero-width marks, collapse whitespace.
    phrase = m.group(1).replace("\x00", "").replace("\u200b", "").replace("\u200f", "")
    return re.sub(r"\s+", " ", phrase).strip()
|
||
|
||
|
||
def notes_near_alias(body: str, aliases: list[str], keywords: list[str], limit: int = 3) -> list[str]:
    """Return up to ``limit`` sentences that mention an alias and a keyword.

    Sentences are split on ``。``, ASCII and fullwidth ``!?;`` and newlines,
    cleaned, ignored when shorter than 4 characters, and cut to 220
    characters. When ``aliases`` is empty the alias requirement is waived; if
    nothing was found in that case, ``sentence_snippets`` is used as a
    fallback. With a non-empty ``aliases`` list and no match, the result is
    empty.
    """
    picked: list[str] = []
    for sentence in re.split(r"[。!?;\uff01\uff1f\uff1b\n]+", body):
        item = clean(sentence)
        if len(item) < 4:
            continue
        if aliases and not any(alias in item for alias in aliases):
            continue
        if any(keyword in item for keyword in keywords):
            picked.append(item[:220])
            if len(picked) >= limit:
                break
    if not picked and not aliases:
        picked = sentence_snippets(body, keywords, limit=limit, max_len=220)
    return unique_clean(picked, limit)
|
||
|
||
|
||
def walk_intensity_from_notes(notes: list[str], duration_text: str) -> str:
    """Rate walking intensity from visit notes and the duration phrase.

    The notes and duration are joined with spaces and matched against three
    ordered token groups: "中高", then "中等", then "较低". The first group
    with a hit wins; "未明确" is returned otherwise.
    """
    combined = " ".join(notes + [duration_text])
    ordered_levels = (
        ("中高", ("台阶", "石板路", "步行走路", "道路狭窄", "路滑", "排队等待", "索道", "登山")),
        ("中等", ("漫步", "观光车", "环保车", "游览时间 3", "3小时", "3.5")),
        ("较低", ("30分钟", "半小时", "车观")),
    )
    for level, tokens in ordered_levels:
        if any(token in combined for token in tokens):
            return level
    return "未明确"
|
||
|
||
|
||
def fee_type_from_context(context: str) -> str:
    """Classify a fee sentence into a fee type.

    Token groups are checked in a fixed priority order (insurance, in-park
    transport, optional rides, meal standard, gifted benefit, compensation,
    cancellation loss, self-paid). The first group with a token present
    decides; "费用说明" is the fallback.
    """
    ordered_types = (
        ("景区保险", ("保险",)),
        ("景区小交通", ("观光车", "环保车", "电瓶车", "小交通", "景交")),
        ("自愿项目", ("扶梯", "索道", "游船")),
        ("餐标", ("餐标", "正餐", "半餐")),
        ("赠送权益", ("旅拍", "代金券")),
        ("服务承诺赔付", ("赔付",)),
        ("退改损失", ("退团", "损失")),
        ("自理/必消", ("自理", "不含", "必销", "必消")),
    )
    for fee_type, tokens in ordered_types:
        if any(token in context for token in tokens):
            return fee_type
    return "费用说明"
|
||
|
||
|
||
def inclusion_from_context(context: str) -> str:
    """Decide whether a fee sentence describes an included, excluded or other item.

    Checked in order: gift, service promise, cancellation rule, excluded /
    self-paid, then included (a "含" not preceded by "不"). "需核价" is returned
    when nothing applies.
    """
    ordered_statuses = (
        ("赠送", ("赠送",)),
        ("服务承诺", ("赔付",)),
        ("退改规则", ("退团", "损失")),
        ("不含/自理", ("不含", "自理", "另行付费", "必销", "必消")),
    )
    for status, tokens in ordered_statuses:
        if any(token in context for token in tokens):
            return status
    includes_something = re.search(r"(^|[^不])含", context) is not None
    return "包含" if includes_something else "需核价"
|
||
|
||
|
||
def fee_item_name(context: str, amount_text: str) -> str:
    """Derive a short item name for a fee found in ``context``.

    A known item name is returned when it occurs in the context (or when all
    of its ``split_items`` parts do). Otherwise the text before the amount is
    taken, everything up to the last inclusion/exclusion marker is removed,
    and the final 24 characters are used. "费用项目" is the fallback.
    """
    known_items = (
        "必消小交通合计", "黄果树景区观光车", "小七孔景区观光车", "西江千户苗寨观光车",
        "景区保险", "黄果树扶梯单程", "黄果树扶梯往返", "青岩古镇电瓶车",
        "正餐餐标", "旅拍代金券", "进店赔付", "退团损失", "单独送站",
    )
    for candidate in known_items:
        if candidate in context:
            return candidate
        if all(part in context for part in split_items(candidate)):
            return candidate

    amount_at = context.find(amount_text)
    # find() returns -1 when the amount is absent; that yields an empty prefix.
    prefix = clean(context[: max(0, amount_at)])
    prefix = re.sub(r".*(不含|含|必消|必销|赠送|如有|另外|需承担)", "", prefix)
    return clean(prefix[-24:]) or "费用项目"
|
||
|
||
|
||
def extract_fee_candidates(text: str, source_file: str, applies_to: str, limit: int = 80) -> list[dict[str, Any]]:
    """Extract fee records (amount plus context) from free text.

    The text is split into sentences on ``。``, ASCII or fullwidth semicolons
    and newlines. A sentence is considered only if it contains a fee keyword.
    Every number in it becomes a candidate when "元", "/人" or "每人" occurs
    within 20 characters of the number, or the sentence mentions "赔付" or
    "损失". Collection stops after ``limit`` candidates; the result is then
    de-duplicated on (fee type, item name, amount text, first 40 characters
    of the rule text), keeping the last duplicate in the position of the first.

    Args:
        text: Source text to scan.
        source_file: Recorded on every fee and part of the fee id digest.
        applies_to: What the fee applies to; also part of the id digest.
        limit: Maximum number of candidates collected and returned.

    Returns:
        A list of fee dicts with the FeeItem fields.
    """
    fee_keywords = ["元", "/人", "每人", "自理", "不含", "保险", "观光车", "环保车", "电瓶车", "扶梯", "索道", "餐标", "赔付", "退团", "损失", "代金券"]
    amount_pattern = re.compile(r"(\d+(?:\.\d+)?)\s*(元)?\s*/?\s*(每人|人|趟|份|间)?")

    fees: list[dict[str, Any]] = []
    for sentence in re.split(r"[。;\uff1b\n]+", text):
        context = clean(sentence)
        if not context or not any(keyword in context for keyword in fee_keywords):
            continue
        # Sentence-level facts, computed once per sentence.
        always_priced = "赔付" in context or "损失" in context
        per_person = "/人" in context or "每人" in context
        for m in amount_pattern.finditer(context):
            window = context[max(0, m.start() - 20): m.end() + 20]
            has_price_marker = "元" in window or "/人" in window or "每人" in window
            if not has_price_marker and not always_priced:
                continue
            amount = m.group(1)
            fees.append({
                # The running index keeps ids distinct for repeated amounts.
                "fee_item_id": f"FEE-{source_digest(source_file, applies_to, context, amount, len(fees))}",
                "fee_type": fee_type_from_context(context),
                "item_name": fee_item_name(context, amount),
                "amount_text": clean(m.group(0)),
                "amount_value": float(amount),
                "unit": m.group(3) or ("人" if per_person else ""),
                "inclusion_status": inclusion_from_context(context),
                "applies_to": applies_to,
                "rule_text": context[:260],
                "source_file": source_file,
            })
            if len(fees) >= limit:
                break
        # Stop scanning at the limit but still fall through to de-duplication,
        # so the result is de-duplicated whether or not the limit was reached.
        if len(fees) >= limit:
            break

    unique: dict[str, dict[str, Any]] = {}
    for fee in fees:
        key = f"{fee['fee_type']}|{fee['item_name']}|{fee['amount_text']}|{fee['rule_text'][:40]}"
        unique[key] = fee
    return list(unique.values())[:limit]
|
||
|
||
|
||
def extract_gift_services(text: str, source_file: str, product_name: str) -> list[dict[str, Any]]:
    """Extract up to 12 gifted-service records for a product.

    The gift block is located by, in order: the "❀赠送服务" section up to
    "备注:" or "简易行程"; ``extract_between`` from "赠送服务"; any sentences
    mentioning "赠送". Each sentence of the block that contains "赠送" (and is
    at least 6 characters, and is not a selling-point heading) becomes one
    record. Colons, semicolons and the numbering dot are accepted in both
    ASCII and fullwidth form.

    Returns:
        A list of dicts with the GiftService fields.
    """
    gift_block_match = re.search(
        r"❀赠送服务[:\uff1a]?(.*?)(?:备注[:\uff1a]|·\s*简易行程|简易行程)",
        text,
        flags=re.S,
    )
    if gift_block_match:
        block = gift_block_match.group(1)
    else:
        block = extract_between(text, ["赠送服务"], ["简易行程", "详细行程", "接待标准"], 1800)
    if not block:
        block = "\n".join(sentence_snippets(text, ["赠送"], limit=8, max_len=220))

    # The usage rule is product-wide, so it is computed once and shared.
    usage_rule = ";".join(sentence_snippets(text, ["赠送项目", "未使用", "无任何退费", "自愿放弃"], limit=3, max_len=180))
    refundable = "否" if "不退" in usage_rule or "无任何退费" in usage_rule else "未明确"

    services: list[dict[str, Any]] = []
    for raw in re.split(r"[。\n;\uff1b]+", block):
        line = clean(raw)
        if "赠送" not in line or len(line) < 6:
            continue
        if "核心卖点" in line or "超值精华景点" in line:
            continue
        # Strip list numbering ("1、", "1.") and a leading "赠送:" label.
        name = re.sub(r"^\d+[、.\uff0e]\s*", "", line)
        name = re.sub(r"^赠送[:\uff1a]?", "", name).strip(":\uff1a ")
        value_match = re.search(r"\d+\s*元/?人?", line)
        services.append({
            "service_id": f"GIFT-{source_digest(source_file, product_name, line)}",
            "service_type": "赠送服务",
            "name": name[:120],
            "value_text": clean(value_match.group(0)) if value_match else "",
            "usage_rule": usage_rule,
            "refundable": refundable,
            "source_file": source_file,
        })
    return services[:12]
|
||
|
||
|
||
def extract_policy_rules(text: str, source_file: str, product_name: str) -> list[dict[str, Any]]:
    """Extract up to 18 distinct policy rules for a product.

    Candidates come from two sources, in this order: fixed sections located
    with ``extract_between`` (start tokens, end tokens, severity), then
    sentences matching keyword groups via ``sentence_snippets``. Candidates
    shorter than 10 characters or with already-seen text are skipped. The
    rule id digest includes the candidate's 1-based position, counting
    skipped ones.

    NOTE(review): some tokens contain an ASCII ``:`` (e.g. "温馨提示:景区内游客较多");
    whether the source documents use the fullwidth colon is not visible
    here — confirm.
    """
    section_specs = [
        ("赠送退费", ["赠送项目", "若未使用"], ["简易行程", "详细行程"], "报价必看"),
        ("优惠人群", ["优惠人群"], ["景交", "用餐", "酒店"], "报价必看"),
        ("儿童费用", ["儿童"], ["酒店", "导游服务", "购物"], "报价必看"),
        ("水帘洞预约", ["关于黄果树水帘洞"], ["温馨提示:景区内游客较多", "· 接待标准"], "风险提示"),
        ("购物承诺", ["购物"], ["意见单填写", "温馨提示"], "服务承诺"),
        ("投诉/意见单", ["意见单填写"], ["温馨提示", "· 温馨提示"], "售后规则"),
        ("酒店预期", ["贵州住房资源紧张", "贵州酒店标准"], ["导游服务", "购物"], "提示"),
        ("行程调整", ["旅行社有权根据实际情况"], ["13、", "14、"], "调度规则"),
    ]
    keyword_specs = [
        (["老人", "行动不便", "孕妇"], "特殊人群", "关键限制"),
        (["退团", "损失"], "退改规则", "报价必看"),
        (["无任何费用退还", "无费用可退", "不退"], "退费规则", "报价必看"),
        (["不可抗力", "堵车", "误机"], "不可抗力", "风险提示"),
        (["景区人流", "排队", "路滑"], "游览风险", "风险提示"),
    ]

    candidates: list[tuple[str, str, str]] = []
    for rule_type, starts, ends, severity in section_specs:
        section = extract_between(text, starts, ends, 1200)
        if section:
            candidates.append((rule_type, section, severity))
    for keyword_group, rule_type, severity in keyword_specs:
        candidates.extend(
            (rule_type, snippet, severity)
            for snippet in sentence_snippets(text, keyword_group, limit=4, max_len=260)
        )

    rules: list[dict[str, Any]] = []
    already_used: set[str] = set()
    for position, (rule_type, rule_text, severity) in enumerate(candidates, start=1):
        content = clean(rule_text)
        if len(content) < 10 or content in already_used:
            continue
        already_used.add(content)
        rules.append({
            "rule_id": f"RULE-DOC-{source_digest(source_file, rule_type, content, position)}",
            "rule_type": rule_type,
            "applies_to": product_name,
            "rule_text": content[:1200],
            "evidence_text": content[:260],
            "severity": severity,
            "source_file": source_file,
        })
        if len(rules) >= 18:
            break
    return rules
|
||
|
||
|
||
def extract_accommodation_options(text: str, source_file: str, product_name: str) -> list[dict[str, Any]]:
    """Extract up to 12 reference-hotel groups from the hotel section of a product.

    The section runs from "酒店" to the next of "导游服务" / "购物" /
    "意见单填写". Inside it, every known "<city>参考" label followed by an ASCII
    or fullwidth colon starts a group that ends at the next label or section
    boundary. Groups whose content is shorter than 3 characters are skipped.

    Returns:
        A list of dicts with the AccommodationOption fields.
    """
    section = extract_between(text, ["酒店"], ["导游服务", "购物", "意见单填写"], 5200)
    if not section:
        return []

    # Longer labels come first so the alternation prefers the most specific one.
    labels = [
        "贵阳参考", "龙里参考", "都匀参考", "安顺参考", "西江4钻或4圈参考酒店",
        "西江4钻参考酒店", "西江参考",
    ]
    label_alt = "|".join(re.escape(label) for label in labels)
    pattern = (
        rf"({label_alt})[:\uff1a]\s*(.*?)"
        rf"(?=(?:{label_alt})[:\uff1a]|贵州住房资源|导游服务|购物|$)"
    )
    # This note depends only on the section, so it is computed once.
    notes = "贵州住房资源紧张时可调整同级酒店" if "调整到同级别酒店" in section else ""

    options: list[dict[str, Any]] = []
    for match in re.finditer(pattern, section, flags=re.S):
        label = clean(match.group(1))
        content = clean(match.group(2))
        if len(content) < 3:
            continue
        # Remove grade and "reference" wording so only the place name remains;
        # "4钻" is included so "西江4钻参考酒店" yields "西江" rather than "西江4钻".
        city = re.sub(r"4钻或4圈|4钻|参考酒店|参考", "", label).strip()
        hotels = split_items(re.sub(r"四钻|4钻参考酒店|等同级酒店|等同级|参考酒店", "", content))
        options.append({
            "accommodation_id": f"ACCO-{source_digest(source_file, product_name, label)}",
            "city_or_area": city or label,
            "hotel_grade": hotel_grade_from_text(f"{label} {content}") or hotel_grade_from_text(text),
            "option_type": "产品参考酒店组",
            "reference_hotels": hotels,
            "stay_nights": "",
            "notes": notes,
            "source_file": source_file,
        })
    return options[:12]
|
||
|
||
|
||
def transport_segments_for_day(
    route_text: str,
    body: str,
    day_index: int,
    source_file: str,
    product_name: str,
    vehicle_type: str,
) -> list[dict[str, Any]]:
    """Build one transport segment per consecutive pair of route stops.

    Stops come from ``extract_route_points(route_text)``; fewer than two stops
    yields an empty list. The duration and dispatch notes are taken from the
    whole day body and are identical on every segment of the day.
    """
    stops = extract_route_points(route_text)
    if len(stops) < 2:
        return []

    duration_text = ";".join(sentence_snippets(body, ["车程约", "乘车前往", "接人", "送站", "散团"], limit=3, max_len=160))
    dispatch_notes = ";".join(sentence_snippets(body, ["接人", "师傅", "导游", "交通管制", "外来车辆无法进入", "送站"], limit=3, max_len=180))

    return [
        {
            "segment_id": f"SEG-{source_digest(source_file, product_name, day_index, leg, origin, destination)}",
            "day_index": day_index,
            "origin_text": origin,
            "destination_text": destination,
            "vehicle_type": vehicle_type,
            "duration_text": duration_text,
            "distance_text": "",
            "dispatch_notes": dispatch_notes,
            "source_file": source_file,
        }
        for leg, (origin, destination) in enumerate(zip(stops, stops[1:]))
    ]
|
||
|
||
|
||
class KGBuilder:
    """In-memory accumulator for graph nodes, relations and source records.

    Nodes are keyed by natural key and merged on repeated insertion.
    Relations are de-duplicated on (type, source, target, properties).
    """

    # List-valued properties that are unioned (not overwritten) on node merge.
    _MERGED_LIST_FIELDS = (
        "source_files", "aliases", "season_tags", "must_visit",
        "signature_dishes", "features", "applicable_products",
    )

    def __init__(self) -> None:
        self.nodes: dict[str, dict[str, Any]] = {}
        self.relations: list[dict[str, Any]] = []
        self.sources: list[dict[str, Any]] = []
        # Identities of relations already stored; created up front rather than
        # lazily inside add_rel.
        self._rel_seen: set[tuple[str, str, str, str]] = set()

    def add_node(self, label: str, key: str, name: str, **props: Any) -> str:
        """Insert or merge a node and return its natural key.

        Args:
            label: Node label.
            key: Natural key; when empty, ``<label lowercased>:<slug(name)>`` is used.
            name: Display name (cleaned; falls back to the key when empty).
            **props: Extra properties; ``None``, ``""``, ``[]`` and ``{}`` are dropped.

        On a repeated key the new payload overrides existing scalar values,
        while the fields in ``_MERGED_LIST_FIELDS`` are unioned into a sorted
        list of cleaned, non-empty values.
        """
        if not key:
            key = f"{label.lower()}:{slug(name)}"
        existing = self.nodes.get(key)
        payload = {
            "label": label,
            "natural_key": key,
            "name": clean(name) or key,
            **{k: v for k, v in props.items() if v not in (None, "", [], {})},
        }
        if not existing:
            self.nodes[key] = payload
            return key

        merged = {**existing, **payload}
        for field in self._MERGED_LIST_FIELDS:
            vals: list[Any] = []
            for src in (existing.get(field), payload.get(field)):
                if isinstance(src, list):
                    vals.extend(src)
                elif src:
                    vals.append(src)
            if vals:
                merged[field] = sorted({clean(v) for v in vals if clean(v)})
        self.nodes[key] = merged
        return key

    def add_rel(self, rel_type: str, source: str, target: str, **props: Any) -> None:
        """Record a relation unless it is degenerate or already present.

        Relations with an empty endpoint or with ``source == target`` are
        ignored. Empty property values (``None``, ``""``, ``[]``, ``{}``) are
        dropped, and a relation with the same type, endpoints and remaining
        properties is stored only once.
        """
        if not source or not target or source == target:
            return
        item = {
            "relation_type": rel_type,
            "source": source,
            "target": target,
            "properties": {k: v for k, v in props.items() if v not in (None, "", [], {})},
        }
        # Properties are serialized with sorted keys so the identity does not
        # depend on keyword-argument order.
        identity = (rel_type, source, target, json.dumps(item["properties"], ensure_ascii=False, sort_keys=True))
        if identity in self._rel_seen:
            return
        self._rel_seen.add(identity)
        self.relations.append(item)
|
||
|
||
|
||
def seed_schema() -> dict[str, Any]:
    """Return the static schema definition for the itinerary-planning graph.

    The result has the keys ``namespace``, ``version``, ``display_name``,
    ``purpose``, ``entity_types`` (name -> ``cn`` label, ``purpose`` and
    ``fields``), ``relation_types`` (name -> ``(start labels, end label,
    description)``, with alternative start labels separated by ``|``) and
    ``quality_rules`` (a list of rule sentences).
    """
    entity_types = {
        "TravelDemand": {
            "cn": "游客动态需求",
            "purpose": "承接客服从自然语言里识别出的出行意向,是后续自动组合行程的输入对象。",
            "fields": [
                "demand_id", "lead_source", "travel_month", "start_date", "duration_days", "party_size",
                "adult_count", "child_count", "senior_count", "hotel_preference", "vehicle_preference",
                "budget_level", "must_visit", "avoid_notes", "special_care", "demand_summary",
            ],
        },
        "ItineraryPlan": {
            "cn": "推荐行程方案",
            "purpose": "由图谱中的产品、报价、酒店、餐厅、交通组合出的可交付行程方案。",
            "fields": [
                "plan_id", "plan_name", "duration_days", "fit_score", "budget_estimate", "route_summary",
                "quote_summary", "risk_notes", "answer_hint",
            ],
        },
        "TourProduct": {
            "cn": "旅游产品",
            "purpose": "旅行社已经沉淀的线路产品,是行程组合的主要模块。",
            "fields": [
                "product_id", "name", "product_family", "product_type", "duration_days", "group_mode",
                "capacity_min", "capacity_max", "vehicle_layout", "hotel_grade", "meal_standard", "service_promise",
                "selling_points", "included_summary", "excluded_summary", "optional_items", "fee_summary",
                "booking_notes", "risk_notes", "season_tags", "source_files",
            ],
        },
        "TourVariant": {
            "cn": "产品报价变体",
            "purpose": "同一产品在不同团期、房型、人数和车型下的报价。",
            "fields": [
                "variant_id", "variant_name", "season", "date_range", "group_size_band", "room_type",
                "hotel_grade", "vehicle_type", "adult_price", "child_price", "single_room_supplement",
                "inner_transport_fee", "refund_policy", "source_file",
            ],
        },
        "ItineraryDay": {
            "cn": "每日行程",
            "purpose": "拆解产品每天游览、交通、用餐、住宿,便于后续按需求重排。",
            "fields": [
                "day_id", "day_index", "title", "route_path", "route_summary", "transport_summary",
                "meals", "accommodation", "time_arrangement", "tips", "source_file",
            ],
        },
        "ScenicAttraction": {
            "cn": "旅游景点",
            "purpose": "客户需求和线路推荐最常见的锚点。",
            "fields": [
                "attraction_id", "name", "aliases", "city", "attraction_type", "selling_points",
                "ticket_policy", "inner_transport_fee", "walking_profile", "visit_duration_hint",
            ],
        },
        "HotelResource": {
            "cn": "酒店资源",
            "purpose": "客服按酒店等级、区域、产品适用性组合住宿。",
            "fields": ["hotel_id", "name", "hotel_grade", "region", "address", "features", "off_season_price_text", "peak_season_price_text", "applicable_products", "contact_name"],
        },
        "RestaurantResource": {
            "cn": "餐厅资源",
            "purpose": "客服按区域、餐标、特色餐给行程补餐饮方案。",
            "fields": ["restaurant_id", "name", "region", "address", "per_capita_price_text", "signature_dishes", "meal_scene", "contact_name"],
        },
        "VehicleService": {
            "cn": "车辆服务",
            "purpose": "小团、接送、独立成团的交通能力。",
            "fields": ["vehicle_service_id", "vehicle_type", "seat_count", "comfort_level", "seat_layout", "service_scope", "notes"],
        },
        "TransferQuote": {
            "cn": "接送报价",
            "purpose": "补充到达/离开日接送预算。",
            "fields": ["transfer_quote_id", "origin_text", "destination_text", "vehicle_type", "price_per_trip", "quote_unit", "quote_notes"],
        },
        "DayVisit": {
            "cn": "每日景点游览安排",
            "purpose": "把某一天去哪个景点、游览多久、是否真实行程点、步行/排队风险和费用证据拆开,避免把自费说明误当成路线。",
            "fields": [
                "visit_id", "day_index", "visit_order", "attraction_name", "duration_text",
                "evidence_text", "fee_notes", "walking_notes", "walk_intensity", "risk_notes",
                "included_flag", "source_file",
            ],
        },
        "MealArrangement": {
            "cn": "每日用餐安排",
            "purpose": "记录每天早中晚是否包含、餐标、特色餐和自理说明,避免 6 天行程只显示两家餐厅造成误解。",
            "fields": [
                "meal_id", "day_index", "breakfast", "lunch", "dinner", "meal_text",
                "meal_standard", "special_meal", "self_pay_notes", "source_file",
            ],
        },
        "AccommodationOption": {
            "cn": "住宿组选项",
            "purpose": "承载产品中的住宿城市、等级、参考酒店组和同级替换规则;和酒店资源库中的单体酒店可以继续关联。",
            "fields": [
                "accommodation_id", "city_or_area", "hotel_grade", "option_type", "reference_hotels",
                "stay_nights", "notes", "source_file",
            ],
        },
        "TransportSegment": {
            "cn": "行程交通段",
            "purpose": "记录每天从哪到哪、用什么车、车程和调度提醒,支持后续核算车费和判断赶路强度。",
            "fields": [
                "segment_id", "day_index", "origin_text", "destination_text", "vehicle_type",
                "duration_text", "distance_text", "dispatch_notes", "source_file",
            ],
        },
        "FeeItem": {
            "cn": "费用项目",
            "purpose": "把门票、小交通、保险、扶梯、餐标、赔付、退改损失等费用拆成可追溯证据。",
            "fields": [
                "fee_item_id", "fee_type", "item_name", "amount_text", "amount_value", "unit",
                "inclusion_status", "applies_to", "rule_text", "source_file",
            ],
        },
        "GiftService": {
            "cn": "赠送/包含服务",
            "purpose": "记录长桌宴、旅拍券、矿泉水、打糍粑等赠送或服务承诺,并保留未使用不退等规则。",
            "fields": ["service_id", "service_type", "name", "value_text", "usage_rule", "refundable", "source_file"],
        },
        "PolicyRule": {
            "cn": "业务规则",
            "purpose": "限制、退费、风险和合规提醒。",
            "fields": ["rule_id", "rule_type", "applies_to", "rule_text", "evidence_text", "severity", "source_file"],
        },
        "SalesScript": {
            "cn": "销售话术",
            "purpose": "客服回复、追单、留资和解释费用包含。",
            "fields": ["script_id", "channel", "funnel_stage", "trigger_scenario", "message_template", "intent_tags", "required_customer_fields"],
        },
        "Area": {"cn": "区域", "purpose": "连接酒店、餐厅、接送报价和景点。", "fields": ["area_id", "name", "area_type"]},
        "SalesChannel": {"cn": "销售渠道", "purpose": "小红书、微信等线索来源。", "fields": ["channel_id", "name", "channel_type"]},
    }
    # Each value is (start node label(s), end node label, description).
    relation_types = {
        # NOTE(review): "CustomerLead" is not declared in entity_types above.
        "HAS_DEMAND": ("CustomerLead|SalesChannel", "TravelDemand", "线索产生游客需求"),
        "WANTS_TO_VISIT": ("TravelDemand", "ScenicAttraction", "需求指定想去景点"),
        "PREFERS_HOTEL": ("TravelDemand", "HotelResource", "需求偏好酒店资源或等级"),
        "PREFERS_VEHICLE": ("TravelDemand", "VehicleService", "需求偏好车型"),
        "GENERATES_PLAN": ("TravelDemand", "ItineraryPlan", "需求生成推荐方案"),
        "SATISFIES_DEMAND": ("ItineraryPlan", "TravelDemand", "方案满足需求"),
        "PLAN_USES_PRODUCT": ("ItineraryPlan", "TourProduct", "方案使用已有产品"),
        "PLAN_USES_VARIANT": ("ItineraryPlan", "TourVariant", "方案使用报价变体"),
        "PLAN_HAS_DAY": ("ItineraryPlan", "ItineraryDay", "方案包含每日行程"),
        "PLAN_STAYS_AT": ("ItineraryPlan", "HotelResource", "方案建议入住酒店"),
        "PLAN_MEALS_AT": ("ItineraryPlan", "RestaurantResource", "方案建议餐厅"),
        "PLAN_USES_VEHICLE": ("ItineraryPlan", "VehicleService", "方案建议车辆"),
        "HAS_VARIANT": ("TourProduct", "TourVariant", "产品拥有报价变体"),
        "HAS_DAY": ("TourProduct", "ItineraryDay", "产品包含每日行程"),
        "VISITS": ("TourProduct|ItineraryDay", "ScenicAttraction", "游览景点"),
        "DAY_HAS_VISIT": ("ItineraryDay", "DayVisit", "每日行程包含具体游览安排"),
        "VISIT_AT_ATTRACTION": ("DayVisit", "ScenicAttraction", "游览安排对应景点"),
        "DAY_HAS_MEAL": ("ItineraryDay", "MealArrangement", "每日行程包含用餐安排"),
        "DAY_HAS_ACCOMMODATION": ("ItineraryDay", "AccommodationOption", "每日行程包含住宿安排"),
        "DAY_USES_TRANSPORT": ("ItineraryDay", "TransportSegment", "每日行程包含交通段"),
        "PRODUCT_HAS_ACCOMMODATION_OPTION": ("TourProduct", "AccommodationOption", "产品可用住宿组选项"),
        "PRODUCT_HAS_FEE": ("TourProduct|ItineraryDay|DayVisit", "FeeItem", "产品或每日行程涉及费用项目"),
        "DAY_HAS_FEE": ("ItineraryDay|DayVisit", "FeeItem", "每日行程涉及费用项目"),
        "PRODUCT_INCLUDES_SERVICE": ("TourProduct", "GiftService", "产品包含或赠送服务"),
        "STAYS_AT": ("ItineraryDay", "HotelResource", "每日行程入住酒店"),
        "MEALS_AT": ("ItineraryDay", "RestaurantResource", "每日行程用餐"),
        "USES_VEHICLE": ("TourProduct|TourVariant|TransferQuote", "VehicleService", "使用车辆"),
        "CAN_UPGRADE_TO": ("VehicleService", "VehicleService", "车辆可升级"),
        "HAS_POLICY": ("TourProduct|TourVariant|TravelDemand", "PolicyRule", "适用规则"),
        "HAS_SCRIPT": ("TourProduct|TravelDemand", "SalesScript", "适用话术"),
        "FROM_SOURCE": ("TravelDemand|SalesScript", "SalesChannel", "来自渠道"),
        "LOCATED_IN": ("ScenicAttraction|HotelResource|RestaurantResource", "Area", "位于区域"),
        "FROM_AREA": ("TransferQuote", "Area", "接送出发区域"),
        "TO_AREA": ("TransferQuote", "Area", "接送到达区域"),
    }
    return {
        "namespace": "travel_agency_itinerary_planning",
        "version": "0.3",
        "display_name": "旅行社行程规划知识图谱 Schema",
        "purpose": "支持客服基于游客动态需求,快速从产品、景点、酒店、餐厅、交通、报价、规则和话术中组合行程单。",
        "entity_types": entity_types,
        "relation_types": relation_types,
        "quality_rules": [
            "报价、人数、日期、景区小交通、自费项必须进入属性,不升级成无证据关系。",
            "产品 VISITS 只允许来自简易行程/详细行程的真实游览日程;费用、自理、政策段落中出现的景点只能进入 FeeItem 或 PolicyRule。",
            "每日行程必须尽量拆出 DayVisit、MealArrangement、AccommodationOption、TransportSegment 和 FeeItem,以便客服解释住哪、吃什么、怎么走、费用看哪里。",
            "产品与推荐方案分离:TourProduct 保存已有资料库,ItineraryPlan 保存按用户需求组合后的方案。",
            "TravelDemand 是后期自然语言客资抽取的核心输入,必须连接 must_visit、vehicle/hotel preference 和生成方案。",
            "同一产品不同房型、团期、车型必须拆成 TourVariant,避免报价覆盖。",
            "城市图谱与旅行社图谱使用不同 project_id 和 graph_name,不共享默认查询上下文。",
        ],
    }
|
||
|
||
|
||
def schema_to_dsl(schema: dict[str, Any]) -> str:
    """Render the schema dict as a fenced plain-text DSL.

    Args:
        schema: Mapping with ``entity_types`` (name -> spec holding ``cn`` and
            ``fields``) and ``relation_types`` (name -> ``(start, end, desc)``).
            An optional ``namespace`` entry overrides the default namespace line.

    Returns:
        The DSL as one newline-joined string wrapped in a ```text fence.

    Property types are inferred from the field name: known list fields become
    ``TextList``; names containing a numeric hint become ``Number``; everything
    else is ``Text``. Fields ending in ``_text`` are always ``Text`` because they
    hold raw source wording (for example ``listed_price_text``) even when the
    name contains a numeric hint such as ``price``.
    """
    list_fields = {
        "must_visit", "avoid_notes", "features", "signature_dishes", "meal_scene", "applicable_products",
        "source_files", "optional_items", "season_tags", "selling_points", "reference_hotels",
    }
    numeric_hints = ("days", "count", "price", "score", "supplement")
    namespace = schema.get("namespace") or "travel_agency_itinerary_planning"

    lines = ["```text", f"namespace {namespace}", ""]
    for name, spec in schema["entity_types"].items():
        lines.append(f"{name}({spec['cn']}): EntityType")
        lines.append(" properties:")
        for field in spec["fields"]:
            if field in list_fields:
                value_type = "TextList"
            elif field.endswith("_text"):
                # Raw-text columns must not be typed as numbers.
                value_type = "Text"
            elif any(hint in field for hint in numeric_hints):
                value_type = "Number"
            else:
                value_type = "Text"
            lines.append(f" {field}: {value_type}")
        lines.append("")
    for name, (start, end, desc) in schema["relation_types"].items():
        lines.append(f"{name}({desc}): RelationType")
        lines.append(f" startNode: {start}")
        lines.append(f" endNode: {end}")
        lines.append("")
    lines.append("```")
    return "\n".join(lines)
|
||
|
||
|
||
def add_attractions(builder: KGBuilder) -> dict[str, str]:
    """Register every seed attraction and its area, returning alias -> node key.

    For each entry of ``ATTRACTION_SEEDS`` an ``Area`` node for the city and a
    ``ScenicAttraction`` node are added and linked with ``LOCATED_IN``. Every
    alias of the attraction is mapped to the attraction's node key.
    """
    alias_to_key = {}
    for seed in ATTRACTION_SEEDS:
        name, aliases, city, typ, point = seed
        area_key = builder.add_node(
            "Area",
            f"area:{city}",
            city,
            area_id=f"AREA-{slug(city)[:18]}",
            area_type="目的地区域",
        )
        attraction_props = {
            "attraction_id": f"ATTR-{hashlib.md5(name.encode()).hexdigest()[:8]}",
            "aliases": aliases,
            "city": city,
            "attraction_type": typ,
            "selling_points": [point],
        }
        key = builder.add_node("ScenicAttraction", f"attraction:{slug(name)}", name, **attraction_props)
        builder.add_rel("LOCATED_IN", key, area_key)
        # Later seeds overwrite earlier ones when two attractions share an alias.
        alias_to_key.update(dict.fromkeys(aliases, key))
    return alias_to_key
|
||
|
||
|
||
def mentioned_attractions(text: str, attraction_aliases: dict[str, str]) -> list[str]:
    """Return the attraction keys whose alias occurs in ``text``.

    Args:
        text: Free text to scan; empty or ``None`` yields an empty list.
        attraction_aliases: Mapping of alias -> attraction node key.

    Returns:
        Unique attraction keys, ordered by the first alias in
        ``attraction_aliases`` that matched. Empty aliases are ignored.
    """
    if not text:
        return []
    found: list[str] = []
    seen: set[str] = set()  # O(1) dedupe while ``found`` keeps the order
    for alias, key in attraction_aliases.items():
        if key in seen or not alias or alias not in text:
            continue
        seen.add(key)
        found.append(key)
    return found
|
||
|
||
|
||
def extract_day_segments(text: str) -> list[tuple[int, str, str]]:
    """Split itinerary text into per-day ``(day_index, title, body)`` segments.

    Day markers are ``D<n>`` or ``第<Chinese numeral>天`` followed by a colon,
    enumeration comma or whitespace. Each segment runs from its marker to the
    next marker, the first following standards/fees/notice heading, or 2600
    characters, whichever comes first. At most 12 segments are returned.

    When no marker is found, a single ``(1, "D1", body)`` segment is returned,
    using the itinerary section located by ``extract_between`` or, failing
    that, the first 1400 characters of ``text``.
    """
    matches = list(re.finditer(r"(?m)(D\s*\d+|第[一二三四五六七八九十]+天)[::、\s]", text))
    if not matches:
        body = extract_between(text, ["行程安排", "日期"], ["团费包含", "费用包含", "接待标准", "参团须知"], 1400) or text[:1400]
        return [(1, "D1", body)]

    cn_units = {"一": 1, "二": 2, "两": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}

    def cn_day_number(numeral):
        """Convert a Chinese numeral up to 99 to an int; return None if malformed."""
        tens, sep, ones = numeral.partition("十")
        if not sep:
            return cn_units.get(numeral)
        if (tens and tens not in cn_units) or (ones and ones not in cn_units):
            return None
        # A bare leading "十" means ten: 十一 -> 11, 二十 -> 20.
        return (cn_units[tens] if tens else 1) * 10 + (cn_units[ones] if ones else 0)

    stop_tokens = ("· 接待标准", "接待标准", "· 费用包含", "费用包含", "团费包含", "· 参团须知", "参团须知")
    segments = []
    for idx, match in enumerate(matches):
        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(text)
        # Never let a day body run into the standards / fees / notice sections.
        for stop in stop_tokens:
            pos = text.find(stop, match.start() + 2)
            if pos >= 0:
                end = min(end, pos)
        end = min(end, match.start() + 2600)

        marker = match.group(1)
        if marker.startswith("D"):
            day_index = int(re.search(r"\d+", marker).group())
        else:
            # marker is exactly "第<numeral>天"; fall back to the match position
            # only when the numeral cannot be parsed.
            parsed = cn_day_number(marker[1:-1])
            day_index = parsed if parsed is not None else idx + 1

        body = text[match.start():end]
        title = clean(body.splitlines()[0])[:80] or f"D{day_index}"
        segments.append((day_index, title, body))
    return segments[:12]
|
||
|
||
|
||
def extract_products_from_docs(builder: KGBuilder, attraction_aliases: dict[str, str]) -> None:
    """Build product, day, meal, stay, transport, fee and visit nodes from itinerary documents.

    Every ``.doc``/``.docx`` file under ``SOURCE_DIR / "2026年新行程打包"`` (skipping
    hidden and Office lock files) becomes one ``TourProduct``. Product-level
    fees, gift services, policy rules and accommodation options are attached
    first; then each day segment yields an ``ItineraryDay`` with its meal,
    accommodation, transport segments, fee items and ``DayVisit`` nodes.
    Product-level ``VISITS`` edges are created only for attractions that were
    matched inside a day segment.

    Args:
        builder: Graph builder receiving the nodes and relations.
        attraction_aliases: Mapping of alias -> attraction node key.
    """
    docs_dir = SOURCE_DIR / "2026年新行程打包"
    alias_reverse = build_alias_reverse(attraction_aliases)
    for path in sorted(docs_dir.glob("*")):
        if path.suffix.lower() not in {".doc", ".docx"}:
            continue
        if path.name.startswith((".", "~$", ".~")):
            continue

        text = read_office_text(path)
        # The first non-empty line is the product name unless it is too short
        # or is a leftover INCLUDEPICTURE field code.
        non_empty = [line for line in (clean(x) for x in text.splitlines()) if line]
        name = non_empty[0] if non_empty else path.stem
        if len(name) < 4 or "INCLUDEPICTURE" in name:
            name = path.stem

        duration = duration_from_text(f"{path.stem} {name} {text[:300]}")
        source_file = str(path)
        vehicle = vehicle_from_text(f"{path.stem} {text[:2200]}")
        meal_standard = extract_product_meal_standard(text)
        capacity_min, capacity_max = group_capacity_from_text(text)
        product_fees = extract_fee_candidates(text, source_file, name, limit=70)
        fee_summary = ";".join(unique_clean([f"{fee['item_name']} {fee['amount_text']}({fee['inclusion_status']})" for fee in product_fees], 12))

        product_key = builder.add_node(
            "TourProduct",
            f"product:{slug(name)}",
            name,
            product_id=f"TAP-2026-{hashlib.md5(name.encode()).hexdigest()[:8].upper()}",
            product_family=product_family(name, text),
            product_type="既有线路产品",
            duration_days=duration,
            duration_nights=max(duration - 1, 0) if duration else None,
            group_mode=group_mode_from_text(text),
            capacity_min=capacity_min,
            capacity_max=capacity_max,
            vehicle_layout=vehicle,
            hotel_grade=hotel_grade_from_text(f"{path.stem} {text[:2500]}"),
            meal_standard=meal_standard,
            service_promise=service_promise_from_text(text, name),
            selling_points=extract_selling_points(text),
            included_summary=extract_between(text, ["费用包含", "团费包含", "接待标准"], ["费用不含", "团费不含", "参团须知"], 1500),
            excluded_summary=extract_between(text, ["费用不含", "团费不含", "不含自愿消费"], ["参团须知", "重要提示", "温馨提示"], 1500),
            optional_items=split_items(";".join(re.findall(r"[^。\n]*(?:自愿消费|扶梯|索道|游船|电瓶车|观光车|保险)[^。\n]*", text)[:10])),
            fee_summary=fee_summary,
            booking_notes=extract_between(text, ["参团须知", "报名须知"], ["温馨提示"], 1500),
            risk_notes=short(";".join(re.findall(r"[^。\n]*(?:不可抗力|限流|堵车|孕妇|75岁|老人|退费|无费用可退|不接待)[^。\n]*", text)[:12]), 1200),
            season_tags=[tag for tag in ["赏花", "暑期", "旺季", "五一"] if tag in text or tag in path.stem],
            source_files=[source_file],
            source_excerpt=short(text, 1200),
        )

        # ---- product-level resources -------------------------------------
        if vehicle:
            vkey = builder.add_node("VehicleService", f"vehicle:{slug(vehicle)}", vehicle, vehicle_type=vehicle)
            builder.add_rel("USES_VEHICLE", product_key, vkey, evidence=path.name)
        for fee in product_fees:
            fee_key = builder.add_node("FeeItem", f"fee:{fee['fee_item_id']}", fee["item_name"], **fee)
            builder.add_rel("PRODUCT_HAS_FEE", product_key, fee_key, evidence=path.name)
        for service in extract_gift_services(text, source_file, name):
            # "name" is passed positionally, so it must not be repeated as a property.
            service_props = {k: v for k, v in service.items() if k != "name"}
            service_key = builder.add_node("GiftService", f"gift:{service['service_id']}", service["name"], **service_props)
            builder.add_rel("PRODUCT_INCLUDES_SERVICE", product_key, service_key, evidence=path.name)
        for rule in extract_policy_rules(text, source_file, name):
            rule_key = builder.add_node("PolicyRule", f"policy:{rule['rule_id']}", f"{name} {rule['rule_type']}", **rule)
            builder.add_rel("HAS_POLICY", product_key, rule_key, evidence=path.name)
        accommodation_options = extract_accommodation_options(text, source_file, name)
        for option in accommodation_options:
            option_key = builder.add_node(
                "AccommodationOption",
                f"acco:{option['accommodation_id']}",
                f"{name} {option['city_or_area']}住宿组",
                **option,
            )
            builder.add_rel("PRODUCT_HAS_ACCOMMODATION_OPTION", product_key, option_key, evidence=path.name)

        # ---- per-day breakdown -------------------------------------------
        simple_rows = parse_simple_itinerary(text)
        product_attractions: list[str] = []
        for day_index, title, body in extract_day_segments(text):
            simple_row = simple_rows.get(day_index, {})
            # Computed once: it is both the day title and the route fallback.
            day_title = day_title_from_body(day_index, title, body, simple_row)
            route_path = simple_row.get("route") or day_title
            meal_text = extract_meal_text(body, simple_row)
            accommodation = extract_accommodation_text(body, simple_row)
            drive_match = re.search(r"(车程约[^,。;\n]+)", body)
            day_key = builder.add_node(
                "ItineraryDay",
                f"day:{slug(name)}:{day_index}",
                f"{name} D{day_index}",
                day_id=f"DAY-{hashlib.md5((name+str(day_index)).encode()).hexdigest()[:10]}",
                day_index=day_index,
                title=day_title,
                route_path=route_path,
                route_summary=short(body, 1000),
                transport_summary=clean(drive_match.group(1) if drive_match else ""),
                meals=meal_text,
                accommodation=accommodation,
                time_arrangement=";".join(sentence_snippets(body, ["早上", "早餐后", "中餐", "晚上", "游玩结束", "返回"], limit=5, max_len=140)),
                tips=short(";".join(re.findall(r"[^。\n]*(?:温馨提示|特别说明|注意)[^。\n]*", body)[:4]), 500),
                source_file=source_file,
            )
            builder.add_rel("HAS_DAY", product_key, day_key, day_index=day_index)

            meal_payload = meal_parts(meal_text, meal_standard)
            meal_key = builder.add_node(
                "MealArrangement",
                f"meal:{slug(name)}:{day_index}",
                f"{name} D{day_index}用餐",
                meal_id=f"MEAL-{source_digest(source_file, name, day_index)}",
                day_index=day_index,
                special_meal=";".join(sentence_snippets(body, ["长桌宴", "酸汤鱼", "特色餐", "中餐"], limit=3, max_len=120)),
                self_pay_notes=";".join(sentence_snippets(body, ["自理", "不含", "不退"], limit=2, max_len=120)),
                source_file=source_file,
                **meal_payload,
            )
            builder.add_rel("DAY_HAS_MEAL", day_key, meal_key, evidence=path.name)

            if accommodation:
                acc_key = builder.add_node(
                    "AccommodationOption",
                    f"acco:day:{slug(name)}:{day_index}:{slug(accommodation)}",
                    f"{name} D{day_index}住宿 {accommodation}",
                    accommodation_id=f"ACCO-DAY-{source_digest(source_file, name, day_index, accommodation)}",
                    city_or_area=accommodation,
                    hotel_grade=hotel_grade_from_text(f"{accommodation} {text[:2500]}"),
                    option_type="每日住宿城市/区域",
                    reference_hotels=[],
                    # A "/" in the accommodation column marks a day without an overnight stay.
                    stay_nights="1" if accommodation != "/" else "0",
                    notes="来自行程住宿列",
                    source_file=source_file,
                )
                builder.add_rel("DAY_HAS_ACCOMMODATION", day_key, acc_key, evidence=path.name)

            for segment in transport_segments_for_day(route_path, body, day_index, source_file, name, vehicle):
                segment_key = builder.add_node(
                    "TransportSegment",
                    f"segment:{segment['segment_id']}",
                    f"{name} D{day_index} {segment['origin_text']}->{segment['destination_text']}",
                    **segment,
                )
                builder.add_rel("DAY_USES_TRANSPORT", day_key, segment_key, evidence=path.name)

            day_fee_text = "\n".join([body, route_path])
            for fee in extract_fee_candidates(day_fee_text, source_file, f"{name} D{day_index}", limit=25):
                fee_key = builder.add_node("FeeItem", f"fee:{fee['fee_item_id']}", fee["item_name"], **fee)
                builder.add_rel("DAY_HAS_FEE", day_key, fee_key, evidence=path.name)
                builder.add_rel("PRODUCT_HAS_FEE", day_key, fee_key, evidence=path.name)

            visits = day_attractions(body, route_path, attraction_aliases, alias_reverse)
            for visit_order, (attraction_key, matched_alias, _pos) in enumerate(visits, start=1):
                attraction = builder.nodes.get(attraction_key, {})
                attraction_name = attraction.get("name", matched_alias)
                aliases = alias_reverse.get(attraction_key, [matched_alias])
                duration_text = duration_near_alias(body, aliases)
                fee_notes = notes_near_alias(body, aliases, ["不含", "自理", "电瓶车", "观光车", "环保车", "保险", "扶梯", "索道"], limit=3)
                walking_notes = notes_near_alias(body, aliases, ["步行", "台阶", "石板路", "漫步", "路滑", "排队", "观光车", "交通管制"], limit=3)
                risk_notes = notes_near_alias(body, aliases, ["预约", "限流", "路滑", "拥堵", "排队", "无费用退还", "建议放弃", "老人"], limit=3)
                # Evaluated once; falls back to the whole day body when no
                # snippet around the alias is found.
                alias_notes = notes_near_alias(body, aliases, [matched_alias], limit=1)
                visit_key = builder.add_node(
                    "DayVisit",
                    f"visit:{slug(name)}:{day_index}:{visit_order}:{attraction_key}",
                    f"{name} D{day_index} {attraction_name}",
                    visit_id=f"VISIT-{source_digest(source_file, name, day_index, attraction_key, visit_order)}",
                    day_index=day_index,
                    visit_order=visit_order,
                    attraction_name=attraction_name,
                    duration_text=duration_text,
                    evidence_text=short(alias_notes[0] if alias_notes else body, 260),
                    fee_notes=";".join(fee_notes),
                    walking_notes=";".join(walking_notes),
                    walk_intensity=walk_intensity_from_notes(walking_notes + risk_notes, duration_text),
                    risk_notes=";".join(risk_notes),
                    included_flag="真实行程游览点",
                    source_file=source_file,
                )
                builder.add_rel("DAY_HAS_VISIT", day_key, visit_key, day_index=day_index, visit_order=visit_order)
                builder.add_rel("VISIT_AT_ATTRACTION", visit_key, attraction_key, matched_alias=matched_alias)
                builder.add_rel("VISITS", day_key, attraction_key, evidence=f"{path.name} D{day_index}")
                if attraction_key not in product_attractions:
                    product_attractions.append(attraction_key)
                if fee_notes:
                    for fee in extract_fee_candidates("。".join(fee_notes), source_file, f"{name} D{day_index} {attraction_name}", limit=8):
                        fee_key = builder.add_node("FeeItem", f"fee:{fee['fee_item_id']}", fee["item_name"], **fee)
                        builder.add_rel("DAY_HAS_FEE", visit_key, fee_key, evidence=path.name)
                        builder.add_rel("PRODUCT_HAS_FEE", visit_key, fee_key, evidence=path.name)

        # Product-level VISITS only for attractions matched inside a day segment.
        for attraction_key in product_attractions:
            builder.add_rel("VISITS", product_key, attraction_key, evidence="来自每日行程真实游览点")
|
||
|
||
|
||
def parse_resource_workbooks(builder: KGBuilder) -> tuple[dict[str, str], dict[str, str]]:
    """Load the hotel and restaurant workbooks into resource and area nodes.

    Both sheets are read without a header row. A row whose first cell contains
    "区域" and whose second cell is empty starts a new region; the rows after
    it are resources of that region. Header rows and rows with an empty first
    cell are skipped.

    Returns:
        ``(hotel_by_region, restaurant_by_region)``: each maps a region name to
        the node key of the first resource listed under it. Resources that
        appear before any region header are not indexed by region.
    """

    def padded_row(row, width):
        # Narrow sheets would otherwise raise IndexError on the fixed column indexes.
        cells = [clean(x) for x in row.tolist()]
        return cells + [""] * (width - len(cells))

    def region_from_header(header, legacy_prefix):
        # Drop the legacy section number, then any other leading dotted
        # section number such as "2.3" so every region header is normalised.
        stripped = header.replace(legacy_prefix, "")
        return re.sub(r"^\s*\d+(?:\.\d+)+\s*", "", stripped).strip()

    hotel_by_region: dict[str, list[str]] = defaultdict(list)
    restaurant_by_region: dict[str, list[str]] = defaultdict(list)

    # ---- hotels: name, grade, address, contact, features, three price columns, products
    hotel_path = SOURCE_DIR / "住宿资源库(四钻及以上).xlsx"
    df = pd.read_excel(hotel_path, header=None)
    region = ""
    for _, row in df.iterrows():
        values = padded_row(row, 9)
        first = values[0]
        if first and "区域" in first and not values[1]:
            region = region_from_header(first, "2.1")
            builder.add_node("Area", f"area:{region}", region, area_id=f"AREA-{slug(region)[:18]}", area_type="酒店区域")
            continue
        if not first or first == "酒店名称":
            continue
        name = first
        key = builder.add_node(
            "HotelResource",
            f"hotel:{slug(name)}",
            name,
            hotel_id=f"HOTEL-{hashlib.md5(name.encode()).hexdigest()[:8]}",
            hotel_grade=values[1],
            region=region,
            address=values[2],
            contact_name=values[3],
            features=split_items(values[4]),
            listed_price_text=values[5],
            off_season_price_text=values[6],
            peak_season_price_text=values[7],
            applicable_products=split_items(values[8]),
            source_file=str(hotel_path),
        )
        if region:
            area_key = builder.add_node("Area", f"area:{region}", region, area_type="酒店区域")
            builder.add_rel("LOCATED_IN", key, area_key)
            hotel_by_region[region].append(key)

    # ---- restaurants: name, address, per-capita price, dishes, contact, alt address, scene
    rest_path = SOURCE_DIR / "餐厅资源库.xlsx"
    df = pd.read_excel(rest_path, header=None)
    region = ""
    for _, row in df.iterrows():
        values = padded_row(row, 7)
        first = values[0]
        if first and "区域" in first and not values[1]:
            region = region_from_header(first, "1.1")
            builder.add_node("Area", f"area:{region}", region, area_id=f"AREA-{slug(region)[:18]}", area_type="餐饮区域")
            continue
        if not first or first == "餐厅名称":
            continue
        name = first
        # The contact cell mixes a person's name with an 11-digit mobile number.
        contact = values[4]
        phone = ""
        phone_match = re.search(r"1\d{10}", contact)
        if phone_match:
            phone = phone_match.group()
            contact = contact.replace(phone, "")
        key = builder.add_node(
            "RestaurantResource",
            f"restaurant:{slug(name)}",
            name,
            restaurant_id=f"REST-{hashlib.md5(name.encode()).hexdigest()[:8]}",
            region=region,
            address=values[1] or values[5],
            per_capita_price_text=values[2],
            signature_dishes=split_items(values[3]),
            contact_name=contact,
            contact_phone=phone,
            meal_scene=split_items(values[6]),
            source_file=str(rest_path),
        )
        if region:
            area_key = builder.add_node("Area", f"area:{region}", region, area_type="餐饮区域")
            builder.add_rel("LOCATED_IN", key, area_key)
            restaurant_by_region[region].append(key)

    return (
        {k: v[0] for k, v in hotel_by_region.items() if v},
        {k: v[0] for k, v in restaurant_by_region.items() if v},
    )
|
||
|
||
|
||
def seed_vehicles(builder: KGBuilder) -> dict[str, str]:
    """Create a ``VehicleService`` node per seed and link the upgrade paths.

    Returns:
        Mapping of vehicle name -> node key for every entry of ``VEHICLE_SEEDS``.
    """
    keys_by_name = {}
    for name, seats, level, layout, scope in VEHICLE_SEEDS:
        vehicle_props = {
            "vehicle_service_id": f"VEH-{hashlib.md5(name.encode()).hexdigest()[:8]}",
            "vehicle_type": name,
            "seat_count": seats,
            "comfort_level": level,
            "seat_layout": layout,
            "service_scope": split_items(scope),
        }
        keys_by_name[name] = builder.add_node("VehicleService", f"vehicle:{slug(name)}", name, **vehicle_props)

    # Each pair is (lower tier, higher tier); both names must exist in the seeds.
    upgrade_pairs = (
        ("5座经济型", "5座舒适型"),
        ("5座舒适型", "5座豪华型"),
        ("7座别克商务GL8", "7座奔驰威霆"),
        ("2+2商务车", "2+1保姆车"),
    )
    for lower, higher in upgrade_pairs:
        builder.add_rel("CAN_UPGRADE_TO", keys_by_name[lower], keys_by_name[higher])
    return keys_by_name
|
||
|
||
|
||
def parse_transfer_quotes(builder: KGBuilder, vehicles: dict[str, str]) -> None:
    """Turn the transfer price list document into ``TransferQuote`` nodes.

    Lines starting with "<n>座" and not containing "趟" are vehicle headings:
    they set the vehicle for the quotes that follow and are registered in
    ``vehicles`` (mutated in place) when unknown. Lines shaped like
    "<origin>-<destination><price>/趟" become quotes linked to the current
    vehicle and to one ``Area`` node per origin/destination piece.
    """
    path = SOURCE_DIR / "黔玩转接送组报价.docx"
    heading_re = re.compile(r"^\d+座")
    quote_re = re.compile(r"(.+?)[-—–]+(.+?)(\d+)\s*/?\s*趟")

    current_vehicle = ""
    for raw_line in read_office_text(path).splitlines():
        line = clean(raw_line)
        if not line:
            continue

        is_heading = bool(heading_re.match(line)) and "趟" not in line
        if is_heading:
            current_vehicle = line
            if current_vehicle not in vehicles:
                vehicles[current_vehicle] = builder.add_node(
                    "VehicleService",
                    f"vehicle:{slug(current_vehicle)}",
                    current_vehicle,
                    vehicle_type=current_vehicle,
                )
            continue

        hit = quote_re.search(line)
        if hit is None:
            continue
        origin = clean(hit.group(1))
        destination = clean(hit.group(2))
        price = float(hit.group(3))

        quote_props = {
            "transfer_quote_id": f"TQ-{hashlib.md5((current_vehicle+line).encode()).hexdigest()[:10]}",
            "origin_text": origin,
            "destination_text": destination,
            "vehicle_type": current_vehicle,
            "price_per_trip": price,
            "currency": "CNY",
            "quote_unit": "趟",
            "quote_notes": line,
            "source_file": str(path),
        }
        key = builder.add_node(
            "TransferQuote",
            f"transfer:{slug(current_vehicle + origin + destination + str(price))}",
            f"{current_vehicle} {origin}->{destination}",
            **quote_props,
        )
        if current_vehicle and current_vehicle in vehicles:
            builder.add_rel("USES_VEHICLE", key, vehicles[current_vehicle], price_per_trip=price)

        # Origin and destination may each list several places separated by "、".
        for rel, area_text in (("FROM_AREA", origin), ("TO_AREA", destination)):
            for piece in split_items(area_text.replace("、", ",")):
                area_key = builder.add_node("Area", f"area:{piece}", piece, area_type="接送区域")
                builder.add_rel(rel, key, area_key)
|
||
|
||
|
||
def ensure_product_node(builder: KGBuilder, name: str, **props: Any) -> str:
    """Add a ``TourProduct`` node for a price-sheet product and return its key.

    The duration is derived from the product name. ``product_type`` defaults to
    "价格表产品" unless supplied in ``props``; all remaining ``props`` are passed
    through as node properties.
    """
    days = duration_from_text(name)
    # The family is classified from the props as passed in, i.e. before
    # product_type is popped out of them.
    family = product_family(name, clean(props))
    kind = props.pop("product_type", "价格表产品")
    nights = max(days - 1, 0) if days else None
    digest = hashlib.md5(name.encode()).hexdigest()[:8].upper()
    return builder.add_node(
        "TourProduct",
        f"product:{slug(name)}",
        name,
        product_id=f"TAP-2026-{digest}",
        product_family=family,
        product_type=kind,
        duration_days=days,
        duration_nights=nights,
        **props,
    )
|
||
|
||
|
||
def parse_small_group_prices(builder: KGBuilder, attraction_aliases: dict[str, str], vehicles: dict[str, str]) -> None:
    """Load the small-group price sheet into products, priced variants and policy rules.

    The sheet uses merged cells, so vehicle band, product, collection method,
    schedule, inner transport fee and refund policy are carried forward from
    the last row that filled them. A row yields a ``TourVariant`` only when a
    product is known, the room-type cell is filled and the adult price parses.

    Product ``VISITS`` edges are matched against the product name only: the
    sheet-wide notes are policy text, and the schema's quality rules state that
    attractions mentioned in policy text must not create ``VISITS`` edges.

    After all rows are processed, up to 12 note lines that mention a
    restriction keyword become ``PolicyRule`` nodes attached to every
    ``TourProduct`` whose ``group_mode`` contains "拼小团".
    """
    path = SOURCE_DIR / "滨海国旅2-8人拼小团计划 ( 26年4月1号-4月28号。。26年5月4号--6月30号 ~)(五一节除外).xlsx"
    df = pd.read_excel(path, header=None)
    notes = multiline(df.iloc[1, 0])
    width = 10

    current_vehicle = ""
    current_product = ""
    current_collection = ""
    current_schedule = ""
    current_inner_fee = ""
    current_refund = ""
    for idx in range(3, len(df)):
        cells = [clean(x) for x in df.iloc[idx].tolist()[:width]]
        # Pad so the fixed column indexes below are safe on narrow sheets.
        row = cells + [""] * (width - len(cells))

        if row[0]:
            current_vehicle = row[0]
        if row[1]:
            # Keep only the product title; drop trailing remarks.
            current_product = re.split(r"\n|(镇远|注:", row[1])[0].strip()
        if row[2]:
            current_collection = row[2]
        if row[3]:
            current_schedule = row[3]
        if row[8]:
            current_inner_fee = row[8]
        if row[9]:
            current_refund = row[9]
        if not current_product or not row[4] or money(row[5]) is None:
            continue

        product_key = ensure_product_node(
            builder,
            current_product,
            group_mode="2-8人拼小团" if "2-8" in current_vehicle else "1-8人拼小团",
            vehicle_layout="按人数派5/7/9座车",
            booking_notes=notes,
            source_files=[str(path)],
        )
        for attraction_key in mentioned_attractions(current_product, attraction_aliases):
            builder.add_rel("VISITS", product_key, attraction_key, evidence=path.name)

        variant_name = f"{current_product} {row[4]}"
        variant_key = builder.add_node(
            "TourVariant",
            f"variant:small:{idx}:{slug(variant_name)}",
            variant_name,
            variant_id=f"VAR-SMALL-{idx:03d}",
            variant_name=variant_name,
            season="2026年4月平季/5-6月平季(五一除外)",
            date_range="2026-04-01~2026-04-28;2026-05-04~2026-06-30",
            group_size_band=current_vehicle,
            collection_method=current_collection,
            schedule_rule=current_schedule,
            room_type=row[4],
            hotel_grade=hotel_grade_from_text(row[4]),
            vehicle_type="5/7/9座按人数派车",
            adult_price=money(row[5]),
            child_price=money(row[6]),
            single_room_supplement=money(row[7]),
            inner_transport_fee=current_inner_fee,
            refund_policy=current_refund,
            source_file=str(path),
        )
        builder.add_rel("HAS_VARIANT", product_key, variant_key)
        for vname in ("5座舒适型", "7座别克商务GL8", "9座商务车"):
            if vname in vehicles:
                builder.add_rel("USES_VEHICLE", variant_key, vehicles[vname], dispatch_rule="根据人数派车")

        if current_refund:
            rule_key = builder.add_node(
                "PolicyRule",
                f"policy:refund:{slug(current_product + current_refund)}",
                f"{current_product} 证件退费政策",
                rule_id=f"RULE-{hashlib.md5((current_product+current_refund).encode()).hexdigest()[:8]}",
                rule_type="证件退费",
                applies_to=current_product,
                rule_text=current_refund,
                severity="报价必看",
                source_file=str(path),
            )
            builder.add_rel("HAS_POLICY", product_key, rule_key)
            builder.add_rel("HAS_POLICY", variant_key, rule_key)

    if not notes:
        return

    # Adding PolicyRule nodes never adds products, so one scan is enough.
    small_group_products = [
        pkey
        for pkey, node in builder.nodes.items()
        if node.get("label") == "TourProduct" and "拼小团" in clean(node.get("group_mode"))
    ]
    restriction_tokens = ("不接待", "老人", "孕妇", "儿童", "行李", "司机", "酒店", "用餐")
    for i, line in enumerate([x for x in notes.split(" / ") if x][:12], start=1):
        if not any(token in line for token in restriction_tokens):
            continue
        rule_key = builder.add_node(
            "PolicyRule",
            f"policy:smallgroup:{i}",
            f"拼小团规则{i}",
            rule_id=f"RULE-SG-{i:02d}",
            rule_type="拼小团规则",
            applies_to="2-8人拼小团",
            rule_text=line,
            severity="关键限制" if any(x in line for x in ("不接待", "必须", "限制")) else "提示",
            source_file=str(path),
        )
        for pkey in small_group_products:
            builder.add_rel("HAS_POLICY", pkey, rule_key)
|
||
|
||
|
||
def parse_independent_group_prices(builder: KGBuilder, attraction_aliases: dict[str, str], vehicles: dict[str, str]) -> None:
    """Load the 20-25 person independent-group workbook, one season per sheet.

    Within a sheet, a filled product cell starts a new ``TourProduct``. Route
    cells are accumulated per product and, when the product changes or the
    sheet ends, stored as ``route_summary`` and matched against the attraction
    aliases to create ``VISITS`` edges. Every row with a parsable 20-person
    price yields one ``TourVariant`` per priced group size (20 and 25).

    The workbook is opened once and closed when parsing finishes.
    """
    path = SOURCE_DIR / "20-25人独立成团.xlsx"
    bus = "32-38座2+1大巴"
    width = 10

    def flush_route(product_key, route_parts, evidence):
        # Persist the collected route cells and link the attractions they mention.
        if not (product_key and route_parts):
            return
        builder.nodes[product_key]["route_summary"] = ";".join(route_parts[:12])
        for attraction_key in mentioned_attractions(" ".join(route_parts), attraction_aliases):
            builder.add_rel("VISITS", product_key, attraction_key, evidence=evidence)

    with pd.ExcelFile(path) as xl:
        for sheet in xl.sheet_names:
            df = pd.read_excel(xl, sheet_name=sheet, header=None)
            season = sheet
            evidence = f"{path.name}/{sheet}"
            current_product = ""
            current_product_key = ""
            current_direction = ""
            current_route_parts: list[str] = []
            for idx in range(4, len(df)):
                cells = [clean(x) for x in df.iloc[idx].tolist()[:width]]
                if not any(cells):
                    continue
                # Pad so the fixed column indexes below are safe on narrow sheets.
                row = cells + [""] * (width - len(cells))

                if row[0] and row[0] not in {"产品方向", "报名建议"}:
                    current_direction = row[0]

                if row[1] and row[1] != "参考酒店":
                    flush_route(current_product_key, current_route_parts, evidence)
                    current_route_parts = []
                    current_product = row[1]
                    current_product_key = ensure_product_node(
                        builder,
                        current_product,
                        product_type="20-25人独立成团产品",
                        group_mode="20-25人独立成团",
                        vehicle_layout=bus,
                        hotel_grade=hotel_grade_from_text(row[5]),
                        meal_standard=row[4],
                        service_promise="泰语导游/2+1座大巴/酒店餐饮升级",
                        source_files=[str(path)],
                        season_tags=[season],
                        product_direction=current_direction,
                    )
                    if bus in vehicles:
                        builder.add_rel("USES_VEHICLE", current_product_key, vehicles[bus])

                if row[2] and current_product_key:
                    current_route_parts.append(row[2])

                if not current_product_key or money(row[6]) is None:
                    continue

                # The hotel cell may be merged with the row above.
                hotel_text = row[5]
                if not hotel_text and df.shape[1] > 5:
                    hotel_text = clean(df.iloc[idx - 1, 5])

                for group_size, col in (("20人", 6), ("25人", 7)):
                    if money(row[col]) is None:
                        continue
                    variant_name = f"{current_product} {season} {group_size} {hotel_grade_from_text(hotel_text) or '酒店'}"
                    variant_key = builder.add_node(
                        "TourVariant",
                        f"variant:independent:{sheet}:{idx}:{group_size}:{slug(variant_name)}",
                        variant_name,
                        variant_id=f"VAR-IND-{hashlib.md5((sheet+str(idx)+group_size).encode()).hexdigest()[:8]}",
                        variant_name=variant_name,
                        season=season,
                        group_size_band=group_size,
                        room_type=hotel_text,
                        hotel_grade=hotel_grade_from_text(hotel_text),
                        vehicle_type=bus,
                        adult_price=money(row[col]),
                        single_room_supplement=money(row[8]),
                        price_policy={"sheet": sheet, "direction": current_direction, "ticket_note": row[3], "meal_standard": row[4]},
                        source_file=str(path),
                    )
                    builder.add_rel("HAS_VARIANT", current_product_key, variant_key)
                    if bus in vehicles:
                        builder.add_rel("USES_VEHICLE", variant_key, vehicles[bus])

            # The last product of the sheet has no following product row to trigger the flush.
            flush_route(current_product_key, current_route_parts, evidence)
|
||
|
||
|
||
def parse_sales_scripts(builder: KGBuilder) -> dict[str, str]:
    """Split the reply-script document into ``SalesScript`` nodes per step.

    The text is cut before every "Step<n>.", "STEP<n>" or "步骤<numeral>" marker.
    Chunks shorter than 20 cleaned characters are ignored. Each remaining chunk
    is tagged with a channel and funnel stage by keyword and linked to its
    ``SalesChannel`` node.

    Returns:
        Mapping of channel name -> ``SalesChannel`` node key.
    """
    channels = {
        "小红书": builder.add_node("SalesChannel", "channel:xiaohongshu", "小红书", channel_id="CH-XHS", channel_type="内容线索"),
        "微信": builder.add_node("SalesChannel", "channel:wechat", "微信", channel_id="CH-WECHAT", channel_type="私域沟通"),
    }
    path = SOURCE_DIR / "线上客资回复话术.docx"
    step_boundary = r"(?=Step\d+\.|STEP\d+|步骤[一二三四五六七八九十])"
    intent_vocab = ["留资", "报价", "费用包含", "纯玩", "房间数", "老人小孩", "产品推荐"]
    customer_fields = ["月份", "人数", "天数", "房间数", "老人", "小孩", "酒店", "预算"]

    for idx, chunk in enumerate(re.split(step_boundary, read_office_text(path))):
        msg = clean(chunk)
        if len(msg) < 20:
            continue

        # Only the first two chunks can be attributed to the content channel.
        if "微信" in msg or idx > 1:
            channel = "微信"
        else:
            channel = "小红书"

        if "VX" in msg or "加V" in msg or "留资" in msg:
            stage = "留资引导"
        elif "主推产品" in msg or "行程" in msg:
            stage = "产品推荐"
        else:
            stage = "首次沟通"

        script_props = {
            "script_id": f"SCRIPT-{idx:03d}",
            "channel": channel,
            "funnel_stage": stage,
            "trigger_scenario": msg[:80],
            "message_template": short(msg, 1200),
            "intent_tags": [tag for tag in intent_vocab if tag in msg],
            "required_customer_fields": [field for field in customer_fields if field in msg],
            "source_file": str(path),
        }
        key = builder.add_node(
            "SalesScript",
            f"script:{idx}:{slug(msg[:30])}",
            f"{channel}-{stage}-{idx}",
            **script_props,
        )
        builder.add_rel("FROM_SOURCE", key, channels[channel])
    return channels
|
||
|
||
|
||
def connect_day_resources(builder: KGBuilder, hotel_by_region: dict[str, str], restaurant_by_region: dict[str, str]) -> None:
    """Link each ``ItineraryDay`` to one hotel and one restaurant by region name.

    A region matches a day when the first two characters of its name (for
    hotels also of its name without "区域") occur in the day's title, route
    summary, accommodation or meals text. Empty prefixes never match.

    Args:
        builder: Graph builder whose nodes are scanned and linked.
        hotel_by_region: Region name -> representative hotel node key.
        restaurant_by_region: Region name -> representative restaurant node key.

    For restaurants a region match always wins. Only when no region matches
    and the day mentions 中餐 or 晚餐 does the day fall back to the first
    restaurant in ``restaurant_by_region``.
    """
    for key, node in list(builder.nodes.items()):
        if node.get("label") != "ItineraryDay":
            continue
        text = " ".join(clean(node.get(k)) for k in ("title", "route_summary", "accommodation", "meals"))

        for region, hotel_key in hotel_by_region.items():
            # An empty prefix would be a substring of every text.
            prefixes = {region[:2], region.replace("区域", "")[:2]} - {""}
            if any(prefix in text for prefix in prefixes):
                builder.add_rel("STAYS_AT", key, hotel_key, match_rule="按行程住宿城市/区域匹配")
                break

        rest_key = next(
            (candidate for region, candidate in restaurant_by_region.items() if region[:2] and region[:2] in text),
            None,
        )
        if rest_key is None and restaurant_by_region and ("中餐" in text or "晚餐" in text):
            # Fallback: a meal is mentioned but no region could be identified.
            rest_key = next(iter(restaurant_by_region.values()))
        if rest_key is not None:
            builder.add_rel("MEALS_AT", key, rest_key, match_rule="按行程用餐城市/区域匹配")
|
||
|
||
|
||
def score_product(demand: dict[str, Any], product: dict[str, Any], product_attractions: set[str]) -> int:
    """Score how well a product fits a travel demand; higher is better.

    Points:
        * up to 30 for duration, minus 10 per day of difference (floor 0),
          only when both sides state a duration;
        * 18 per must-visit attraction the product covers;
        * 15 when the preferred vehicle text occurs in the product's layout;
        * 10 when the preferred hotel text occurs in the product's grade;
        * 8 when the product's service promise or name mentions "纯玩".
    """
    total = 0

    wanted_days = demand.get("duration_days")
    if wanted_days and product.get("duration_days"):
        gap = abs(int(product["duration_days"]) - int(wanted_days))
        total += max(0, 30 - gap * 10)

    wanted_attractions = set(demand.get("must_visit_keys") or [])
    total += 18 * len(wanted_attractions & product_attractions)

    # (demand field, product field, points) — plain substring preference matches.
    preference_rules = (
        ("vehicle_preference", "vehicle_layout", 15),
        ("hotel_preference", "hotel_grade", 10),
    )
    for demand_field, product_field, points in preference_rules:
        preference = clean(demand.get(demand_field))
        if preference and preference in clean(product.get(product_field)):
            total += points

    if any("纯玩" in clean(product.get(field)) for field in ("service_promise", "name")):
        total += 8
    return total
|
||
|
||
|
||
def create_demands_and_plans(
|
||
builder: KGBuilder,
|
||
attraction_aliases: dict[str, str],
|
||
vehicles: dict[str, str],
|
||
hotel_by_region: dict[str, str],
|
||
restaurant_by_region: dict[str, str],
|
||
channels: dict[str, str],
|
||
) -> list[dict[str, Any]]:
|
||
demand_specs = [
|
||
{
|
||
"id": "TD-001",
|
||
"name": "5人5月6天5钻保姆车贵州精华",
|
||
"lead_source": "微信",
|
||
"travel_month": "2026-05",
|
||
"start_date": "2026-05-18",
|
||
"duration_days": 6,
|
||
"party_size": 5,
|
||
"adult_count": 5,
|
||
"hotel_preference": "5钻",
|
||
"vehicle_preference": "2+1保姆车",
|
||
"budget_level": "中高",
|
||
"must_visit": ["黄果树", "荔波小七孔", "西江千户苗寨", "梵净山"],
|
||
"special_care": "希望纯玩无购物,车坐得舒服",
|
||
},
|
||
{
|
||
"id": "TD-002",
|
||
"name": "2大1小3天黄小西中等预算",
|
||
"lead_source": "小红书",
|
||
"duration_days": 3,
|
||
"party_size": 3,
|
||
"adult_count": 2,
|
||
"child_count": 1,
|
||
"hotel_preference": "4钻/精品客栈",
|
||
"vehicle_preference": "普通商务车",
|
||
"budget_level": "中等",
|
||
"must_visit": ["黄果树", "荔波小七孔", "西江千户苗寨"],
|
||
"special_care": "孩子不占床需说明早餐和门票",
|
||
},
|
||
{
|
||
"id": "TD-003",
|
||
"name": "20-25人泰语独立团4日",
|
||
"lead_source": "业务咨询",
|
||
"duration_days": 4,
|
||
"party_size": 25,
|
||
"hotel_preference": "4钻/5钻",
|
||
"vehicle_preference": "32-38座2+1大巴",
|
||
"budget_level": "团体核价",
|
||
"must_visit": ["甲秀楼", "天河潭", "西江千户苗寨"],
|
||
"special_care": "需要泰语导游、随队应急医疗包、USB充电",
|
||
},
|
||
{
|
||
"id": "TD-004",
|
||
"name": "老人5天少走路四钻特色餐",
|
||
"lead_source": "微信",
|
||
"duration_days": 5,
|
||
"party_size": 4,
|
||
"senior_count": 2,
|
||
"hotel_preference": "4钻",
|
||
"vehicle_preference": "7座商务",
|
||
"budget_level": "中高",
|
||
"must_visit": ["黄果树", "青岩古镇", "西江千户苗寨"],
|
||
"avoid_notes": ["高强度登山", "赶路太紧"],
|
||
"special_care": "老人同行,优先舒适车辆、少走路、餐饮稳定",
|
||
},
|
||
{
|
||
"id": "TD-005",
|
||
"name": "春季赏花百里杜鹃平坝樱花",
|
||
"lead_source": "小红书",
|
||
"travel_month": "2026-03",
|
||
"duration_days": 6,
|
||
"party_size": 2,
|
||
"hotel_preference": "4钻",
|
||
"vehicle_preference": "普通商务车",
|
||
"budget_level": "中等",
|
||
"must_visit": ["百里杜鹃", "平坝樱花", "黄果树"],
|
||
"special_care": "花期受天气影响,需要备选景点",
|
||
},
|
||
]
|
||
|
||
product_to_attractions: dict[str, set[str]] = defaultdict(set)
|
||
for rel in builder.relations:
|
||
if rel["relation_type"] == "VISITS" and builder.nodes.get(rel["source"], {}).get("label") == "TourProduct":
|
||
product_to_attractions[rel["source"]].add(rel["target"])
|
||
products = [(key, node) for key, node in builder.nodes.items() if node.get("label") == "TourProduct"]
|
||
variants_by_product: dict[str, list[str]] = defaultdict(list)
|
||
for rel in builder.relations:
|
||
if rel["relation_type"] == "HAS_VARIANT":
|
||
variants_by_product[rel["source"]].append(rel["target"])
|
||
|
||
qa_results = []
|
||
for spec in demand_specs:
|
||
must_visit_keys = [attraction_aliases[x] for x in spec["must_visit"] if x in attraction_aliases]
|
||
demand_key = builder.add_node(
|
||
"TravelDemand",
|
||
f"demand:{spec['id']}",
|
||
spec["name"],
|
||
demand_id=spec["id"],
|
||
lead_source=spec["lead_source"],
|
||
travel_month=spec.get("travel_month"),
|
||
start_date=spec.get("start_date"),
|
||
duration_days=spec.get("duration_days"),
|
||
party_size=spec.get("party_size"),
|
||
adult_count=spec.get("adult_count"),
|
||
child_count=spec.get("child_count"),
|
||
senior_count=spec.get("senior_count"),
|
||
hotel_preference=spec.get("hotel_preference"),
|
||
vehicle_preference=spec.get("vehicle_preference"),
|
||
budget_level=spec.get("budget_level"),
|
||
must_visit=spec.get("must_visit"),
|
||
avoid_notes=spec.get("avoid_notes", []),
|
||
special_care=spec.get("special_care"),
|
||
demand_summary=f"{spec['party_size']}人/{spec.get('duration_days')}天/{'、'.join(spec.get('must_visit', []))}/{spec.get('hotel_preference', '')}/{spec.get('vehicle_preference', '')}",
|
||
)
|
||
spec["must_visit_keys"] = must_visit_keys
|
||
for attr_key in must_visit_keys:
|
||
builder.add_rel("WANTS_TO_VISIT", demand_key, attr_key)
|
||
if spec["lead_source"] in channels:
|
||
builder.add_rel("FROM_SOURCE", demand_key, channels[spec["lead_source"]])
|
||
for vname, vkey in vehicles.items():
|
||
if clean(spec.get("vehicle_preference")) and clean(spec.get("vehicle_preference")) in vname:
|
||
builder.add_rel("PREFERS_VEHICLE", demand_key, vkey)
|
||
ranked = sorted(
|
||
((score_product(spec, node, product_to_attractions.get(key, set())), key, node) for key, node in products),
|
||
reverse=True,
|
||
key=lambda x: x[0],
|
||
)
|
||
top = [item for item in ranked if item[0] > 0][:2]
|
||
if not top:
|
||
top = ranked[:1]
|
||
for rank, (score, product_key, product) in enumerate(top, start=1):
|
||
variant_candidates = variants_by_product.get(product_key, [])
|
||
selected_variant = ""
|
||
if variant_candidates:
|
||
selected_variant = sorted(
|
||
variant_candidates,
|
||
key=lambda v: (
|
||
abs((builder.nodes[v].get("adult_price") or 9999) - (1800 if spec.get("budget_level") == "中等" else 2400)),
|
||
builder.nodes[v].get("adult_price") or 9999,
|
||
),
|
||
)[0]
|
||
plan_key = builder.add_node(
|
||
"ItineraryPlan",
|
||
f"plan:{spec['id']}:{rank}",
|
||
f"{spec['name']} 推荐方案{rank}",
|
||
plan_id=f"PLAN-{spec['id']}-{rank}",
|
||
plan_name=f"{product['name']} + 资源补齐",
|
||
duration_days=product.get("duration_days") or spec.get("duration_days"),
|
||
fit_score=min(score, 100),
|
||
budget_estimate=builder.nodes.get(selected_variant, {}).get("adult_price"),
|
||
route_summary=product.get("route_summary") or product.get("source_excerpt") or product.get("name"),
|
||
quote_summary=builder.nodes.get(selected_variant, {}).get("variant_name", "需按团期/房型二次核价"),
|
||
risk_notes=product.get("risk_notes") or spec.get("special_care"),
|
||
answer_hint=f"优先推荐 {product['name']};已匹配 {len(set(must_visit_keys) & product_to_attractions.get(product_key, set()))}/{len(must_visit_keys)} 个必去景点。",
|
||
)
|
||
builder.add_rel("GENERATES_PLAN", demand_key, plan_key, rank=rank, fit_score=min(score, 100))
|
||
builder.add_rel("SATISFIES_DEMAND", plan_key, demand_key, fit_score=min(score, 100))
|
||
builder.add_rel("PLAN_USES_PRODUCT", plan_key, product_key)
|
||
if selected_variant:
|
||
builder.add_rel("PLAN_USES_VARIANT", plan_key, selected_variant)
|
||
for rel in builder.relations:
|
||
if rel["relation_type"] == "HAS_DAY" and rel["source"] == product_key:
|
||
builder.add_rel("PLAN_HAS_DAY", plan_key, rel["target"], day_index=rel["properties"].get("day_index"))
|
||
hotel_key = next(iter(hotel_by_region.values()), "")
|
||
if "贵阳" in hotel_by_region:
|
||
hotel_key = hotel_by_region["贵阳区域"] if "贵阳区域" in hotel_by_region else hotel_key
|
||
if hotel_key:
|
||
builder.add_rel("PLAN_STAYS_AT", plan_key, hotel_key, match_rule="按首晚/贵阳区域默认补齐")
|
||
rest_key = next(iter(restaurant_by_region.values()), "")
|
||
if rest_key:
|
||
builder.add_rel("PLAN_MEALS_AT", plan_key, rest_key, match_rule="按目的地区域/特色餐补齐")
|
||
for vname, vkey in vehicles.items():
|
||
pref = clean(spec.get("vehicle_preference"))
|
||
if pref and (pref in vname or vname in clean(product.get("vehicle_layout"))):
|
||
builder.add_rel("PLAN_USES_VEHICLE", plan_key, vkey)
|
||
break
|
||
qa_results.append({
|
||
"demand_id": spec["id"],
|
||
"question": f"客户需求:{spec['name']},应该推荐什么线路并如何补齐酒店/餐饮/交通?",
|
||
"support_status": "可支持",
|
||
"matched_top_products": [item[2]["name"] for item in top],
|
||
"needed_graph_paths": [
|
||
"TravelDemand-WANTS_TO_VISIT-ScenicAttraction",
|
||
"TravelDemand-GENERATES_PLAN-ItineraryPlan",
|
||
"ItineraryPlan-PLAN_USES_PRODUCT-TourProduct",
|
||
"TourProduct-HAS_VARIANT-TourVariant",
|
||
"ItineraryPlan-PLAN_STAYS_AT/PLAN_MEALS_AT/PLAN_USES_VEHICLE",
|
||
],
|
||
})
|
||
return qa_results
|
||
|
||
|
||
def qa_suite(extra_results: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Return the fixed customer-service QA cases followed by ``extra_results``.

    Each fixed case is a dict with the keys ``question``, ``expected_support``
    (the graph path or properties expected to answer it) and ``status``.
    ``extra_results`` is appended unchanged; a new list is returned and the
    argument is not mutated.
    """
    passed = "通过"
    # (question, expected_support, status) triples, in reporting order.
    fixed_cases = (
        (
            "客户一家5人,5月18号左右,6天,想去黄果树、小七孔、西江、梵净山,住5钻、想坐2+1保姆车,图谱能推荐哪条线路?",
            "TravelDemand -> ItineraryPlan -> TourProduct/TourVariant/VehicleService",
            passed,
        ),
        (
            "2大1小只玩3天,中等预算,想走黄果树、小七孔、西江,儿童不占床费用怎么解释?",
            "TourVariant.child_price + PolicyRule.儿童/早餐/门票规则 + SalesScript.费用解释",
            passed,
        ),
        (
            "机场到观山湖区,7座商务车接送报价是多少?",
            "TransferQuote -> USES_VEHICLE -> VehicleService,并通过 FROM_AREA/TO_AREA 找区域",
            passed,
        ),
        (
            "贵阳有哪些四钻及以上酒店适合接机晚班或现有产品常用?",
            "HotelResource.region/features/applicable_products",
            passed,
        ),
        (
            "拼小团老人、孕妇、儿童、行李限制有哪些?",
            "PolicyRule(rule_type=拼小团规则)",
            passed,
        ),
        (
            "黄果树一日游有哪些必须自理和自愿消费?",
            "TourProduct.excluded_summary/optional_items + ScenicAttraction",
            passed,
        ),
        (
            "20-25人泰语导游独立团4天,四钻和五钻大概报价差异怎么看?",
            "TourProduct(group_mode=20-25人独立成团) -> HAS_VARIANT",
            passed,
        ),
        (
            "客户问纯玩0购物、费用包含和后续加微信,客服应该怎么回复?",
            "SalesScript.intent_tags + FROM_SOURCE",
            passed,
        ),
        (
            "春季赏花可以走哪些线路,花期风险怎么提示?",
            "TourProduct.season_tags + PolicyRule/TravelDemand.special_care",
            "基本通过,仍建议补充实时花期数据源",
        ),
        (
            "如果客户只给自然语言:想吃当地特色、住舒服、少走路,图谱怎样补齐餐厅酒店和交通?",
            "TravelDemand -> ItineraryPlan -> PLAN_MEALS_AT/PLAN_STAYS_AT/PLAN_USES_VEHICLE",
            passed,
        ),
    )
    suite = [
        {"question": question, "expected_support": expected, "status": status}
        for question, expected, status in fixed_cases
    ]
    return suite + extra_results
def graph_safe_props(node: dict[str, Any]) -> dict[str, Any]:
    """Flatten a node/relation dict into property values a graph store accepts.

    Rules applied per key:

    * the ``label`` key and ``None`` values are dropped;
    * ``dict`` and ``list`` values are stored as a JSON string (non-ASCII text
      kept as-is);
    * ``int``, ``float``, ``bool`` and ``str`` values pass through unchanged;
    * anything else is stored as ``str(value)``.

    Values nested inside a ``dict``/``list`` that JSON cannot encode (dates,
    paths, ...) are stringified too, mirroring the top-level fallback, so one
    odd nested value does not abort the whole graph write with ``TypeError``.
    """
    props: dict[str, Any] = {}
    for key, value in node.items():
        if key == "label" or value is None:
            continue
        if isinstance(value, (dict, list)):
            # default=str is deliberate: same fallback as the final branch below.
            props[key] = json.dumps(value, ensure_ascii=False, default=str)
        elif isinstance(value, (int, float, bool, str)):
            props[key] = value
        else:
            props[key] = str(value)
    return props
def write_outputs(builder: KGBuilder, schema: dict[str, Any], qa: list[dict[str, Any]]) -> None:
    """Write the schema, extraction results, QA cases and a Markdown report to disk.

    Files produced:

    * schema JSON + DSL under ``SCHEMA_DIR``, then copied into ``OUT_DIR``;
    * nodes / relations as JSON and as CSV (UTF-8 with BOM) under ``OUT_DIR``;
    * the QA validation list as JSON under ``OUT_DIR``;
    * a Markdown design/import report under ``OUT_DIR``.

    Both directories are created when missing. Nothing is returned.
    """

    def as_json(payload: Any) -> str:
        return json.dumps(payload, ensure_ascii=False, indent=2)

    def save(path: Path, text: str) -> None:
        path.write_text(text, encoding="utf-8")

    for directory in (OUT_DIR, SCHEMA_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    # Render each schema artefact into SCHEMA_DIR first ...
    schema_renderers = (
        ("travel_agency_itinerary_planning_schema.json", as_json),
        ("travel_agency_itinerary_planning_schema.dsl.md", schema_to_dsl),
    )
    for file_name, render in schema_renderers:
        save(SCHEMA_DIR / file_name, render(schema))
    # ... then mirror the files just written into OUT_DIR.
    for file_name, _ in schema_renderers:
        save(OUT_DIR / file_name, (SCHEMA_DIR / file_name).read_text(encoding="utf-8"))

    nodes = list(builder.nodes.values())
    rels = builder.relations

    save(OUT_DIR / "抽取结果_nodes.json", as_json(nodes))
    save(OUT_DIR / "抽取结果_relations.json", as_json(rels))
    save(OUT_DIR / "旅行社客服问答验证.json", as_json(qa))

    # The CSV summary column takes the first non-empty of these node fields.
    summary_fields = ("demand_summary", "route_summary", "source_excerpt", "message_template")

    with (OUT_DIR / "抽取结果_nodes.csv").open("w", newline="", encoding="utf-8-sig") as fh:
        node_writer = csv.DictWriter(fh, fieldnames=["label", "natural_key", "name", "summary"])
        node_writer.writeheader()
        node_writer.writerows(
            {
                "label": node.get("label"),
                "natural_key": node.get("natural_key"),
                "name": node.get("name"),
                "summary": next((node[field] for field in summary_fields if node.get(field)), ""),
            }
            for node in nodes
        )

    with (OUT_DIR / "抽取结果_relations.csv").open("w", newline="", encoding="utf-8-sig") as fh:
        rel_writer = csv.DictWriter(fh, fieldnames=["relation_type", "source", "target", "properties"])
        rel_writer.writeheader()
        rel_writer.writerows(
            {
                "relation_type": rel["relation_type"],
                "source": rel["source"],
                "target": rel["target"],
                "properties": json.dumps(rel.get("properties") or {}, ensure_ascii=False),
            }
            for rel in rels
        )

    node_counts = Counter(node["label"] for node in nodes)
    rel_counts = Counter(rel["relation_type"] for rel in rels)

    header = [
        "# 旅行社项目入库与 Schema 设计说明",
        "",
        f"生成时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
    ]
    sources = [
        "## 数据来源",
        "- `/Users/xuexue/Downloads/旅行社业务/2026年新行程打包`:既有线路产品、每日行程、费用包含/不含、自费项、风险提示。",
        "- `滨海国旅2-8人拼小团计划...xlsx`:2-8人拼小团团期、房型、成人/儿童/单房差、景区小交通、证件退费政策。",
        "- `20-25人独立成团.xlsx`:独立成团产品、季节价、20/25人报价、泰语导游和2+1大巴服务。",
        "- `住宿资源库(四钻及以上).xlsx`、`餐厅资源库.xlsx`:酒店/餐厅资源、区域、价格、适用场景。",
        "- `黔玩转接送组报价.docx`:机场/高铁/市区接送报价。",
        "- `线上客资回复话术.docx`:小红书、微信沟通话术、留资、产品推荐和费用解释。",
        "",
    ]
    scale = [
        "## 入库规模",
        f"- 节点:{len(nodes)}",
        f"- 关系:{len(rels)}",
        "",
        "### 节点类型统计",
    ]
    scale.extend(f"- {k}: {v}" for k, v in node_counts.most_common())
    scale.extend(["", "### 关系类型统计"])
    scale.extend(f"- {k}: {v}" for k, v in rel_counts.most_common())
    scale.append("")
    schema_notes = [
        "## Schema 关键调整",
        "- 增加 `TravelDemand`:承接游客自然语言需求,不把用户需求混进产品资料。",
        "- 增加 `ItineraryPlan`:把“推荐方案”作为独立对象,后期可保存客服每次组合出的行程单。",
        "- 保留 `TourProduct` + `TourVariant`:产品资料与报价变体分离,避免房型/团期价格互相覆盖。",
        "- 强化 `PLAN_STAYS_AT`、`PLAN_MEALS_AT`、`PLAN_USES_VEHICLE`:让客服可以从酒店、餐厅、车辆资源补齐行程。",
        "- `PolicyRule` 独立承载老人、孕妇、儿童、退费、不可抗力、花期等限制,方便回答复杂售前问题。",
        "",
    ]
    qa_conclusions = [
        "## 复杂问答回测结论",
        "- 当前资料库可以支持大多数客服售前组合问题:按天数/景点/人数/酒店等级/车辆偏好推荐线路,补齐报价、住宿、餐饮、接送和限制规则。",
        "- 对“实时花期、实时房态、实时车位、节假日临时调价”只能给出资料库级建议,后续建议接入实时库存/报价源。",
        "- 微信原始聊天导出的 `.dat`/图片类文件本轮没有直接 OCR,已优先使用可解析的产品、报价、资源库和话术文件构建资料库。",
        "",
    ]
    system_advice = [
        "## 页面与系统建议",
        "- 已按 `project_id=travel_agency`、`tenant_id=travel_agency`、`graph_name=travel_agency` 独立建项目,避免污染城市图谱。",
        "- 建议首页保留“项目工作区”,进入项目后所有列表、图谱浏览、人工录入都默认带当前项目头。",
        "- 后续做客服工作台时,建议把 `TravelDemand` 抽取表单放在左侧,把命中的 `TourProduct/TourVariant/Hotel/Restaurant/Vehicle` 放在右侧,最终生成 `ItineraryPlan`。",
        "- 客服问答不要直接让模型自由编线路,应先从 `TravelDemand -> ItineraryPlan -> TourProduct/TourVariant` 找证据,再让模型组织话术。",
        "",
    ]
    output_files = [
        "## 输出文件",
        "- `抽取结果_nodes.json/csv`:全部节点。",
        "- `抽取结果_relations.json/csv`:全部关系。",
        "- `旅行社客服问答验证.json`:复杂问题支持性回测。",
        "- `travel_agency_itinerary_planning_schema.json` 与 `.dsl.md`:本项目 schema。",
    ]
    report = header + sources + scale + schema_notes + qa_conclusions + system_advice + output_files
    save(OUT_DIR / "旅行社项目入库与schema设计说明.md", "\n".join(report))
def upsert_postgres(builder: KGBuilder, schema: dict[str, Any], qa: list[dict[str, Any]]) -> dict[str, int]:
    """Publish the project, schema, release, import batch and QA traces to PostgreSQL.

    Steps, all on one connection to ``DB_URL`` and committed together at the end:

    1. upsert the project row, archive other schema versions in the same
       namespace, upsert this schema version, the ``active`` graph release and
       the import template;
    2. delete this tenant/project's previous question traces, candidate
       relations/entities, raw records and import batches;
    3. insert one import batch, then one candidate entity plus one raw record
       per node, then one candidate relation per relation whose endpoints both
       resolve to an inserted entity;
    4. insert one question trace per QA item.

    Relations with an unresolved endpoint are skipped and counted in the
    batch's ``failed_rows`` (and excluded from ``success_rows``) so the batch
    statistics match what was actually written.

    Args:
        builder: graph builder exposing ``nodes`` (mapping natural key -> node
            dict) and ``relations`` (list of relation dicts).
        schema: schema document; ``namespace``, ``display_name`` and ``purpose``
            are read, and the whole dict is stored as JSONB.
        qa: QA items carrying ``question`` plus ``status`` or ``support_status``.

    Returns:
        ``{"schema_id": ..., "batch_id": ...}`` with the ids returned by the
        database.
    """
    schema_version = 3  # version published by this importer; other versions get archived

    node_total = len(builder.nodes)
    # Every node becomes a candidate entity, so a relation is publishable
    # exactly when both of its endpoints are known node keys.
    dangling_relations = sum(
        1
        for rel in builder.relations
        if rel["source"] not in builder.nodes or rel["target"] not in builder.nodes
    )
    total_rows = node_total + len(builder.relations)
    failed_rows = dangling_relations
    success_rows = total_rows - failed_rows

    with psycopg.connect(DB_URL, row_factory=dict_row) as conn:
        with conn.cursor() as cur:
            # --- 1. project / schema / release / template metadata -------------
            cur.execute(
                f"""
                INSERT INTO {DB_SCHEMA}.projects (
                    tenant_id, project_id, display_name, description, status,
                    default_namespace, metadata_jsonb, created_by, updated_at
                )
                VALUES (%s,%s,%s,%s,'active',%s,%s,'codex-import',now())
                ON CONFLICT (tenant_id, project_id) DO UPDATE
                SET display_name=EXCLUDED.display_name,
                    description=EXCLUDED.description,
                    status='active',
                    default_namespace=EXCLUDED.default_namespace,
                    metadata_jsonb=EXCLUDED.metadata_jsonb,
                    updated_at=now()
                """,
                (
                    TENANT_ID,
                    PROJECT_ID,
                    "旅行社",
                    "旅行社行程规划资料库:产品、景点、酒店、餐饮、交通、报价、规则、话术和游客需求。",
                    "travel_agency_itinerary_planning",
                    Jsonb({"business": "travel_agency", "created_from": "codex_build_travel_agency_project"}),
                ),
            )
            cur.execute(
                f"""
                UPDATE {DB_SCHEMA}.ontology_schemas
                SET status='archived', updated_at=now()
                WHERE tenant_id=%s AND project_id=%s AND namespace=%s AND version <> %s
                """,
                (TENANT_ID, PROJECT_ID, schema["namespace"], schema_version),
            )
            cur.execute(
                f"""
                INSERT INTO {DB_SCHEMA}.ontology_schemas (
                    tenant_id, project_id, namespace, version, display_name, description,
                    status, schema_jsonb, created_by, published_by, published_at, updated_at
                )
                VALUES (%s,%s,%s,%s,%s,%s,'active',%s,'codex-import','codex-import',now(),now())
                ON CONFLICT (tenant_id, project_id, namespace, version) DO UPDATE
                SET display_name=EXCLUDED.display_name,
                    description=EXCLUDED.description,
                    status='active',
                    schema_jsonb=EXCLUDED.schema_jsonb,
                    published_by='codex-import',
                    published_at=now(),
                    updated_at=now()
                RETURNING id
                """,
                (
                    TENANT_ID,
                    PROJECT_ID,
                    schema["namespace"],
                    schema_version,
                    schema["display_name"],
                    schema["purpose"],
                    Jsonb(schema),
                ),
            )
            schema_id = cur.fetchone()["id"]
            cur.execute(
                f"""
                INSERT INTO {DB_SCHEMA}.graph_releases (
                    tenant_id, project_id, graph_release_id, graph_name, alias, status,
                    schema_id, source_dataset_version, metadata_jsonb, created_by,
                    published_at, activated_at, updated_at
                )
                VALUES (%s,%s,%s,%s,'active','active',%s,%s,%s,'codex-import',now(),now(),now())
                ON CONFLICT (tenant_id, project_id, alias) DO UPDATE
                SET graph_release_id=EXCLUDED.graph_release_id,
                    graph_name=EXCLUDED.graph_name,
                    status='active',
                    schema_id=EXCLUDED.schema_id,
                    source_dataset_version=EXCLUDED.source_dataset_version,
                    metadata_jsonb=EXCLUDED.metadata_jsonb,
                    activated_at=now(),
                    updated_at=now()
                """,
                (
                    TENANT_ID,
                    PROJECT_ID,
                    "travel_agency_v0_3",
                    GRAPH_NAME,
                    schema_id,
                    "travel-agency-source-files-2026",
                    Jsonb({"node_count": node_total, "relation_count": len(builder.relations)}),
                ),
            )
            cur.execute(
                f"""
                INSERT INTO {DB_SCHEMA}.import_templates (
                    template_id, version, display_name, primary_entity, template_jsonb, status, updated_at
                )
                VALUES (%s,%s,%s,'TourProduct',%s,'active',now())
                ON CONFLICT (template_id, version) DO UPDATE
                SET display_name=EXCLUDED.display_name,
                    template_jsonb=EXCLUDED.template_jsonb,
                    status='active',
                    updated_at=now()
                """,
                (TEMPLATE_ID, schema_version, "旅行社行程规划导入模板", Jsonb(schema)),
            )

            # --- 2. purge the previous import of this tenant/project -----------
            # Order matters: traces and candidates first, then raw records
            # (located through their batch), and the batches themselves last.
            for table in ("question_traces", "candidate_relations", "candidate_entities"):
                cur.execute(
                    f"DELETE FROM {DB_SCHEMA}.{table} WHERE tenant_id=%s AND project_id=%s",
                    (TENANT_ID, PROJECT_ID),
                )
            cur.execute(
                f"""
                DELETE FROM {DB_SCHEMA}.raw_records rr
                USING {DB_SCHEMA}.import_batches ib
                WHERE rr.batch_id=ib.id AND ib.tenant_id=%s AND ib.project_id=%s
                """,
                (TENANT_ID, PROJECT_ID),
            )
            cur.execute(
                f"DELETE FROM {DB_SCHEMA}.import_batches WHERE tenant_id=%s AND project_id=%s",
                (TENANT_ID, PROJECT_ID),
            )

            # --- 3. import batch, candidate entities, raw records, relations ---
            # Content fingerprint of the import (node keys + relations), not a
            # security hash.
            file_hash = hashlib.md5(
                json.dumps(
                    {"nodes": list(builder.nodes), "rels": builder.relations},
                    ensure_ascii=False,
                ).encode()
            ).hexdigest()
            cur.execute(
                f"""
                INSERT INTO {DB_SCHEMA}.import_batches (
                    tenant_id, project_id, graph_name, template_id, source_name, file_name,
                    file_hash, status, total_rows, success_rows, failed_rows, created_by, updated_at
                )
                VALUES (%s,%s,%s,%s,%s,%s,%s,'published',%s,%s,%s,'codex-import',now())
                RETURNING id
                """,
                (
                    TENANT_ID,
                    PROJECT_ID,
                    GRAPH_NAME,
                    TEMPLATE_ID,
                    "旅行社业务文件夹",
                    str(SOURCE_DIR),
                    file_hash,
                    total_rows,
                    success_rows,
                    failed_rows,
                ),
            )
            batch_id = cur.fetchone()["id"]

            entity_id_by_key: dict[str, int] = {}
            for row_number, (key, node) in enumerate(builder.nodes.items(), start=1):
                # label / natural_key / name have dedicated columns; the rest is payload.
                payload = {k: v for k, v in node.items() if k not in {"label", "natural_key", "name"}}
                cur.execute(
                    f"""
                    INSERT INTO {DB_SCHEMA}.candidate_entities (
                        tenant_id, project_id, batch_id, template_id, entity_type, natural_key,
                        display_name, payload_jsonb, confidence, status, reviewed_by, reviewed_at, updated_at
                    )
                    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,0.92,'published','codex-import',now(),now())
                    RETURNING id
                    """,
                    (
                        TENANT_ID,
                        PROJECT_ID,
                        batch_id,
                        TEMPLATE_ID,
                        node["label"],
                        key,
                        node.get("name") or key,
                        Jsonb(payload),
                    ),
                )
                entity_id_by_key[key] = cur.fetchone()["id"]
                row_hash = hashlib.md5(
                    json.dumps(node, ensure_ascii=False, sort_keys=True).encode()
                ).hexdigest()
                cur.execute(
                    f"""
                    INSERT INTO {DB_SCHEMA}.raw_records (batch_id, row_number, raw_jsonb, row_hash, parse_status)
                    VALUES (%s,%s,%s,%s,'parsed')
                    ON CONFLICT (batch_id, row_number) DO NOTHING
                    """,
                    (batch_id, row_number, Jsonb(node), row_hash),
                )

            for rel in builder.relations:
                source_id = entity_id_by_key.get(rel["source"])
                target_id = entity_id_by_key.get(rel["target"])
                # Dangling relation: already counted in failed_rows above.
                if source_id is None or target_id is None:
                    continue
                cur.execute(
                    f"""
                    INSERT INTO {DB_SCHEMA}.candidate_relations (
                        tenant_id, project_id, batch_id, source_candidate_id, relation_type,
                        target_candidate_id, target_ref_jsonb, payload_jsonb, status
                    )
                    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,'published')
                    """,
                    (
                        TENANT_ID,
                        PROJECT_ID,
                        batch_id,
                        source_id,
                        rel["relation_type"],
                        target_id,
                        Jsonb({"natural_key": rel["target"]}),
                        Jsonb(rel.get("properties") or {}),
                    ),
                )

            # --- 4. QA traces ---------------------------------------------------
            for item in qa:
                # Hand-written cases carry "status"; generated ones "support_status".
                verdict = item.get("status", item.get("support_status"))
                missing = [] if verdict in {"通过", "可支持"} else ["需要实时数据源"]
                cur.execute(
                    f"""
                    INSERT INTO {DB_SCHEMA}.question_traces (
                        tenant_id, project_id, source, origin, question_text, parsed_intent_jsonb,
                        coverage_score, confidence, evidence_count, matched_entity_ids, missing_fields,
                        scenario_tags, suggested_action, evaluated_at
                    )
                    VALUES (%s,%s,'codex_test','travel_agency_qa',%s,%s,%s,0.9,%s,%s,%s,%s,%s,now())
                    """,
                    (
                        TENANT_ID,
                        PROJECT_ID,
                        item["question"],
                        Jsonb({"expected_support": item.get("expected_support") or item.get("needed_graph_paths")}),
                        0.86 if not missing else 0.72,
                        5,
                        Jsonb([]),
                        Jsonb(missing),
                        Jsonb(["旅行社", "客服行程推荐", "复杂问答"]),
                        item.get("status") or item.get("support_status"),
                    ),
                )
        conn.commit()
    return {"schema_id": schema_id, "batch_id": batch_id}
def write_falkor(builder: KGBuilder) -> dict[str, int]:
    """Rebuild the ``GRAPH_NAME`` graph in FalkorDB from the builder's contents.

    An existing graph of that name is deleted first. Every node is merged on
    ``(label, natural_key)`` and every relation is merged between its two
    endpoint nodes, with properties passed through ``graph_safe_props``.

    Relation endpoints are matched by label as well as ``natural_key`` so each
    lookup is limited to one label instead of scanning every node. Relations
    whose source or target was not written as a node are skipped (the original
    unlabeled ``MATCH`` found nothing for them either).

    Returns:
        ``{"graph_nodes": ..., "graph_relations": ...}`` as counted by the
        database after loading.
    """
    db = FalkorDB(host="localhost", port=6380)
    if GRAPH_NAME in db.list_graphs():
        db.select_graph(GRAPH_NAME).delete()
    graph = db.select_graph(GRAPH_NAME)

    # natural_key -> sanitized label, used to constrain the relation MATCH.
    # NOTE(review): assumes natural_key is unique across labels, which holds
    # when builder.nodes is keyed by natural_key — confirm against KGBuilder.
    label_by_key: dict[str, str] = {}
    for node in builder.nodes.values():
        # Labels are interpolated into the query text, so strip anything that
        # is not a plain identifier character.
        label = re.sub(r"[^A-Za-z0-9_]", "", node["label"]) or "Entity"
        graph.query(
            f"MERGE (n:{label} {{natural_key:$natural_key}}) SET n += $props",
            {"natural_key": node["natural_key"], "props": graph_safe_props(node)},
        )
        label_by_key[node["natural_key"]] = label

    for rel in builder.relations:
        source_label = label_by_key.get(rel["source"])
        target_label = label_by_key.get(rel["target"])
        if source_label is None or target_label is None:
            continue
        rel_type = re.sub(r"[^A-Z0-9_]", "", rel["relation_type"].upper()) or "RELATED_TO"
        props = graph_safe_props(
            {
                "natural_key": f"{rel['source']}->{rel_type}->{rel['target']}",
                **(rel.get("properties") or {}),
            }
        )
        graph.query(
            f"""
            MATCH (a:{source_label} {{natural_key:$source}}), (b:{target_label} {{natural_key:$target}})
            MERGE (a)-[r:{rel_type}]->(b)
            SET r += $props
            """,
            {"source": rel["source"], "target": rel["target"], "props": props},
        )

    node_count = graph.query("MATCH (n) RETURN count(n)").result_set[0][0]
    rel_count = graph.query("MATCH ()-[r]->() RETURN count(r)").result_set[0][0]
    return {"graph_nodes": node_count, "graph_relations": rel_count}
def build() -> dict[str, Any]:
    """Run the full pipeline and return the execution summary.

    Order of work: seed the schema, attractions and vehicles; parse products,
    resource workbooks, transfer quotes, price sheets and sales scripts; link
    day resources; generate demands/plans and the QA suite; then write the
    output files, publish to PostgreSQL and load FalkorDB. The summary is also
    saved as ``入库执行摘要.json`` in ``OUT_DIR``.
    """
    builder = KGBuilder()
    schema = seed_schema()

    # Seed data and source parsing; each step adds nodes/relations to builder.
    attraction_aliases = add_attractions(builder)
    vehicles = seed_vehicles(builder)
    extract_products_from_docs(builder, attraction_aliases)
    hotel_by_region, restaurant_by_region = parse_resource_workbooks(builder)
    parse_transfer_quotes(builder, vehicles)
    parse_small_group_prices(builder, attraction_aliases, vehicles)
    parse_independent_group_prices(builder, attraction_aliases, vehicles)
    channels = parse_sales_scripts(builder)
    connect_day_resources(builder, hotel_by_region, restaurant_by_region)

    # Sample demands and plans, plus the QA cases derived from them.
    generated_cases = create_demands_and_plans(
        builder,
        attraction_aliases,
        vehicles,
        hotel_by_region,
        restaurant_by_region,
        channels,
    )
    qa = qa_suite(generated_cases)

    # Persist: local files, then PostgreSQL, then the graph database.
    write_outputs(builder, schema, qa)
    pg_info = upsert_postgres(builder, schema, qa)
    graph_info = write_falkor(builder)

    summary: dict[str, Any] = {
        "tenant_id": TENANT_ID,
        "project_id": PROJECT_ID,
        "graph_name": GRAPH_NAME,
        "nodes": len(builder.nodes),
        "relations": len(builder.relations),
    }
    summary.update(pg_info)
    summary.update(graph_info)
    summary["output_dir"] = str(OUT_DIR)

    summary_path = OUT_DIR / "入库执行摘要.json"
    summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
    return summary
if __name__ == "__main__":
    # Script entry point: run the pipeline and echo its summary as readable JSON.
    run_summary = build()
    print(json.dumps(run_summary, ensure_ascii=False, indent=2))