Initial travel knowledge graph release

This commit is contained in:
2026-06-09 09:56:26 +08:00
commit 5f061295d8
402 changed files with 103877 additions and 0 deletions

View File

@@ -0,0 +1,544 @@
from __future__ import annotations
import hashlib
import json
import re
import subprocess
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Any
# NOTE(review): all three locations are absolute paths under one user's home
# directory; the script only works unchanged on that machine.
# Folder scanned by source_files() for the original .doc/.docx itineraries.
SOURCE_DIR = Path("/Users/xuexue/Downloads/旅行社业务/2026年新行程打包")
# Primary output folder: index files plus a "products" subfolder of Markdown pages.
OUT_DIR = Path("/Users/xuexue/Downloads/旅行社业务/2026年新行程打包_md整理")
# Second output folder; main() copies every generated file here as well.
GRAPH_OUT_DIR = Path("/Users/xuexue/Downloads/图谱数据/旅行社项目入库/已有路线产品Markdown")
# Canonical attraction name -> list of substrings that identify it.
# extract_attractions() reports a key as soon as any one of its aliases occurs
# anywhere in the examined text (plain substring test, no word boundaries), and
# returns the keys in the order they are written here.
ATTRACTION_ALIASES = {
"黄果树": ["黄果树", "黄果树瀑布", "黄果树大瀑布"],
"天星桥": ["天星桥", "天星桥景区"],
"陡坡塘瀑布": ["陡坡塘", "陡坡塘瀑布"],
"荔波小七孔": ["小七孔", "荔波小七孔", "小七孔景区"],
"西江千户苗寨": ["西江", "西江苗寨", "西江千户苗寨"],
"镇远古城": ["镇远", "镇远古镇", "镇远古城"],
"梵净山": ["梵净山"],
"青岩古镇": ["青岩", "青岩古镇"],
"百里杜鹃": ["百里杜鹃"],
"平坝樱花": ["平坝樱花", "平坝农场"],
"织金洞": ["织金洞"],
"中国天眼": ["天眼", "中国天眼", "FAST"],
"茅台镇": ["茅台", "茅台镇"],
"遵义会议会址": ["遵义会址", "遵义会议会址"],
"兴义万峰林": ["万峰林", "兴义万峰林"],
"万峰湖": ["万峰湖"],
"马岭河峡谷": ["马岭河", "马岭河峡谷"],
"花江大桥": ["花江大桥"],
"龙宫": ["龙宫"],
"天河潭": ["天河潭"],
"甲秀楼": ["甲秀楼"],
"黔灵山公园": ["黔灵公园", "黔灵山"],
"乌江寨": ["乌江寨"],
}
def clean(value: Any) -> str:
    """Return *value* as text with invisible characters and excess spacing removed.

    ``None`` becomes the empty string. Otherwise the value is stringified,
    NUL / zero-width-space / right-to-left-mark characters are dropped, each
    run of spaces and tabs is collapsed to one space, runs of four or more
    newlines are shortened to three, and the result is stripped.
    """
    if value is None:
        return ""
    text = str(value)
    for invisible in ("\x00", "\u200b", "\u200f"):
        text = text.replace(invisible, "")
    text = re.sub(r"[ \t]+", " ", text)
    return re.sub(r"\n{4,}", "\n\n\n", text).strip()
def compact(value: Any) -> str:
    """Return *value* cleaned by clean() and flattened onto a single line.

    Every run of whitespace (newlines included) becomes one space.
    """
    cleaned = clean(value)
    single_line = re.sub(r"\s+", " ", cleaned)
    return single_line.strip()
def chinese_count(text: str) -> int:
    """Count the characters of *text* that fall in the U+4E00..U+9FFF range."""
    return sum(1 for _ in re.finditer(r"[\u4e00-\u9fff]", text))
def read_legacy_doc_text(path: Path) -> str:
    """Fallback for old WPS/Word .doc files that macOS textutil cannot decode.

    The raw bytes are decoded as UTF-16LE (undecodable bytes are ignored), the
    text is cut to begin at the earliest known anchor phrase, capped at 50000
    characters, and then scrubbed of control characters and filler runs.

    Args:
        path: File to read.

    Returns:
        The recovered text after clean(); may be empty or noisy when the file
        holds no UTF-16LE text.
    """
    raw = path.read_bytes()
    decoded = raw.decode("utf-16le", errors="ignore").replace("\x00", "")
    # Phrases that mark the beginning of the real document body.
    # NOTE(review): SOURCE had an extra empty-string anchor in this list
    # (possibly a character lost in transit). An empty anchor is found at
    # offset 0 in every string, which silently disabled the trimming below, so
    # it is left out here and falsy anchors are skipped. Restore the intended
    # phrase if it is known.
    anchors = [
        path.stem,
        "贵客黔游",
        "推荐理由",
        "推 荐 理 由",
        "简易行程",
        "行程安排",
    ]
    positions = []
    for anchor in anchors:
        if not anchor:
            continue
        pos = decoded.find(anchor)
        if pos >= 0:
            positions.append(pos)
    if positions:
        # Drop everything in front of the earliest anchor.
        decoded = decoded[min(positions):]
    decoded = decoded[:50000]
    decoded = decoded.replace("\r", "\n")
    # Strip control characters; \x07 and \n are deliberately outside the class
    # (parse_simple_itinerary() later splits cells on \x07).
    decoded = re.sub(r"[\x01-\x06\x08-\x09\x0b-\x1f]", "", decoded)
    # Remove runs of 8+ characters from the U+4000..U+4FFF range and runs of
    # 2+ U+FFFF characters.
    decoded = re.sub(r"[䀀-俿]{8,}", "", decoded)
    decoded = re.sub(r"[￿]{2,}", "", decoded)
    decoded = re.sub(r"\n{3,}", "\n\n", decoded)
    return clean(decoded)
def read_office_text(path: Path) -> str:
    """Extract plain text from an office document, preferring ``textutil``.

    The ``textutil`` output is used when the command succeeds and yields at
    least 30 Chinese characters. Otherwise read_legacy_doc_text() is tried
    under the same 30-character threshold. If neither qualifies, the result is
    an error marker containing stderr (when the command failed) or whatever
    ``textutil`` printed (when it succeeded).
    """
    command = ["textutil", "-convert", "txt", "-stdout", str(path)]
    result = subprocess.run(command, check=False, capture_output=True, text=True)
    converted_ok = result.returncode == 0

    if converted_ok and chinese_count(result.stdout) >= 30:
        return result.stdout.replace("\x00", "")

    legacy_text = read_legacy_doc_text(path)
    if chinese_count(legacy_text) >= 30:
        return legacy_text

    if not converted_ok:
        return f"[textutil读取失败] {result.stderr}"
    return result.stdout.replace("\x00", "")
def safe_filename(text: str, index: int) -> str:
    """Build a Markdown filename of the form ``NN_<name>_<hash>.md``.

    The name part is *text* flattened by compact(), with runs of
    ``\\ / : * ? " < > |`` replaced by ``_``, all whitespace removed, leading
    and trailing ``.``, ``_`` and spaces stripped, and cut to 70 characters.
    The hash part is the first 8 hex digits of the MD5 of the unmodified
    *text*; *index* is zero-padded to two digits.
    """
    stem = compact(text)
    stem = re.sub(r"[\\/:*?\"<>|]+", "_", stem)
    stem = re.sub(r"\s+", "", stem).strip("._ ")
    digest = hashlib.md5(text.encode("utf-8")).hexdigest()[:8]
    return f"{index:02d}_{stem[:70]}_{digest}.md"
def duration_from_text(text: str) -> int | None:
    """Guess the trip length in days from *text*.

    Arabic-digit forms are tried first, in this order: ``N日游``, ``N天``,
    ``N日`` (optional whitespace after the digits). If none is present, a
    single Chinese numeral directly followed by ``日游`` is looked up.

    Returns:
        The number of days, or ``None`` when no pattern matches.
    """
    for pattern in (r"(\d+)\s*日游", r"(\d+)\s*天", r"(\d+)\s*日"):
        m = re.search(pattern, text)
        if m:
            return int(m.group(1))
    # One key per character accepted by the regex below. In SOURCE every key
    # was the empty string, so the dict collapsed to a single entry and the
    # lookup could never succeed.
    cn = {
        "一": 1,
        "二": 2,
        "两": 2,
        "三": 3,
        "四": 4,
        "五": 5,
        "六": 6,
        "七": 7,
        "八": 8,
        "九": 9,
        "十": 10,
    }
    m = re.search(r"([一二两三四五六七八九十])日游", text)
    if m:
        return cn.get(m.group(1))
    return None
def product_family(name: str, text: str) -> str:
    """Classify a product into a family label by keyword lookup.

    The product name plus the first 1500 characters of *text* are searched.
    Rules are evaluated top to bottom and the first rule with any keyword
    present wins; ``"常规纯玩"`` is returned when no rule matches.
    """
    haystack = f"{name} {text[:1500]}"
    rules = (
        (("高端", "5钻", "五钻"), "高端纯玩"),
        (("轻奢", "2+1", "保姆车", "头等舱"), "轻奢纯玩"),
        (("多彩",), "多彩贵州"),
        (("经典",), "经典纯玩"),
        (("1+1", "游黔途", "游黔程"), "游黔途/游黔程"),
        (("独立", "20-25"), "独立成团"),
    )
    for keywords, family in rules:
        if any(keyword in haystack for keyword in keywords):
            return family
    return "常规纯玩"
def hotel_grade_from_text(text: str) -> str:
    """Return a hotel-grade label inferred from keywords in *text*.

    Rules are checked from the highest grade down and the first hit wins; an
    empty string is returned when no keyword is present.
    """
    rules = (
        (("5钻", "五钻", "五星", "超五星"), "5钻/五星"),
        (("4钻", "四钻", "四星"), "4钻/四星"),
        (("商务",), "商务"),
        (("客栈",), "客栈"),
    )
    for keywords, grade in rules:
        for keyword in keywords:
            if keyword in text:
                return grade
    return ""
def vehicle_from_text(text: str) -> str:
    """Return a vehicle-type label inferred from keywords in *text*.

    Rules are evaluated in the listed order and the first rule with any
    keyword present wins; an empty string is returned when nothing matches.
    """
    rules = (
        (("1+1",), "1+1双排座"),
        (("2+1", "保姆车", "头等舱"), "2+1保姆车/头等舱"),
        (("2+2",), "2+2商务车"),
        (("32-38", "独立成团"), "32-38座2+1大巴"),
        (("旅游大巴", "大巴"), "旅游大巴"),
        (("9座",), "9座商务车"),
        (("7座",), "7座商务车"),
        (("5座",), "5座车"),
    )
    for keywords, vehicle in rules:
        if any(keyword in text for keyword in keywords):
            return vehicle
    return ""
def extract_between(text: str, starts: list[str], ends: list[str], limit: int | None = None) -> str:
    """Cut out the section of *text* between a start marker and an end marker.

    The section begins at the earliest occurrence of any *starts* token and
    runs up to the earliest *ends* token found at least two characters after
    that point, or to the end of *text* when no end token follows.

    Args:
        text: Full document text.
        starts: Candidate start markers; the marker itself is kept.
        ends: Candidate end markers; the marker itself is excluded.
        limit: Optional maximum length of the returned section. A falsy value
            (``None`` or ``0``) means no cap.

    Returns:
        The section after clean(), or ``""`` when no start marker is present.
    """
    start_hits = [hit for hit in (text.find(token) for token in starts) if hit >= 0]
    if not start_hits:
        return ""
    begin = min(start_hits)

    # End markers are searched from two characters past the start position.
    search_from = begin + 2
    end_hits = [hit for hit in (text.find(token, search_from) for token in ends) if hit >= 0]
    finish = min([len(text), *end_hits])

    section = clean(text[begin:finish])
    return section[:limit] if limit else section
def parse_simple_itinerary(text: str) -> list[dict[str, str]]:
    """Parse the per-day overview table into one dict per day.

    The "简易行程" section is preferred ("simple" layout); when it is missing
    the "行程安排" section is used instead ("schedule" layout). The section is
    split into cells, and every cell that is exactly a day marker (``D`` plus
    digits, case-insensitive) starts a row built from the cells after it.

    Returns:
        Rows with the keys ``day``, ``day_index``, ``route``, ``meals`` and
        ``accommodation``; an empty list when neither section exists.
    """
    layout = "simple"
    block = extract_between(text, ["简易行程"], ["详细行程", "接待标准", "费用包含", "团费包含"], 2600)
    if not block:
        layout = "schedule"
        block = extract_between(text, ["行程安排"], ["接待标准", "费用包含", "团费包含", "特别提醒"], 4200)
    if not block:
        return []

    # extract_between() runs its result through clean(), which turns runs of
    # tabs into a single space, so the \x07 marker is what separates cells
    # by the time the text gets here.
    cells: list[str] = []
    for piece in re.split(r"[\x07\t]+", block):
        flattened = compact(piece)
        if flattened:
            cells.append(flattened)
    total = len(cells)

    def cell_or_blank(position: int) -> str:
        # Cell at *position*, or "" when the table ends early.
        return cells[position] if position < total else ""

    rows: list[dict[str, str]] = []
    for position, cell in enumerate(cells):
        if re.fullmatch(r"D\s*\d+", cell, flags=re.I) is None:
            continue
        label = cell.upper().replace(" ", "")
        number = re.search(r"\d+", cell).group()
        if layout == "schedule" and position + 4 < total:
            # Schedule layout reads: marker, <skipped>, content, accommodation, meals.
            content = cells[position + 2]
            # NOTE(review): attraction names are concatenated with an empty
            # separator here; confirm whether a delimiter was intended.
            attraction_route = "".join(extract_attractions(content))
            route = attraction_route or content[:80]
            meals = cells[position + 4]
            accommodation = cells[position + 3]
        else:
            # Simple layout (and truncated schedule rows): marker, route,
            # meals, accommodation.
            route = cell_or_blank(position + 1)
            meals = cell_or_blank(position + 2)
            accommodation = cell_or_blank(position + 3)
        rows.append({
            "day": label,
            "day_index": number,
            "route": route,
            "meals": meals,
            "accommodation": accommodation,
        })
    return rows
def extract_day_segments(text: str) -> list[dict[str, str]]:
    """Split the detailed itinerary into one segment per day marker.

    The itinerary section is located with extract_between(); when that yields
    fewer than 50 characters the whole *text* is scanned instead. A segment
    starts at each ``D<digits>`` or ``第<numeral>天`` marker and runs to the
    next marker. The last segment runs to the first trailing-section heading,
    or to the end of the scanned text.

    Returns:
        At most 12 rows with the keys ``day``, ``day_index`` (as a string),
        ``title`` and ``body``; an empty list when no marker is found.
    """
    source = extract_between(
        text,
        ["详细行程", "行程安排"],
        ["· 接待标准", "接待标准", "· 费用包含", "费用包含", "团费包含", "参团须知", "· 温馨提示"],
    )
    if len(source) < 50:
        source = text
    matches = list(re.finditer(r"(?m)(D\s*\d+|第[一二三四五六七八九十]+天)[:、\s]*", source))
    if not matches:
        return []
    # Single-character numerals only. In SOURCE every key was the empty string,
    # which collapsed the dict to one entry, so "第三天"-style markers always
    # fell back to their positional index.
    cn_map = {
        "一": 1,
        "二": 2,
        "两": 2,
        "三": 3,
        "四": 4,
        "五": 5,
        "六": 6,
        "七": 7,
        "八": 8,
        "九": 9,
        "十": 10,
    }
    rows: list[dict[str, str]] = []
    for idx, match in enumerate(matches):
        if idx + 1 < len(matches):
            end = matches[idx + 1].start()
        else:
            # NOTE(review): SOURCE's indentation was lost; this trim is placed
            # in the last-segment branch. Confirm it was not meant to apply to
            # every segment.
            end = len(source)
            for stop in ["· 接待标准", "接待标准", "· 费用包含", "费用包含", "团费包含", "参团须知", "温馨提示"]:
                pos = source.find(stop, match.start() + 2)
                if pos >= 0:
                    end = min(end, pos)
        marker = match.group(1)
        if marker.startswith("D"):
            day_index = int(re.search(r"\d+", marker).group())
        else:
            # Multi-character numerals such as "十一" are not in the map and
            # fall back to the marker's position in the document.
            day_index = cn_map.get(re.search(r"第(.+?)天", marker).group(1), idx + 1)
        body = clean(source[match.start():end])
        lines = [compact(x) for x in body.splitlines() if compact(x)]
        # Title: first of the leading four lines that is more than a bare day
        # marker. SOURCE additionally required `not line.startswith("")`, which
        # is false for every string, so the fallback was always used.
        # NOTE(review): if a real prefix was meant to be excluded, add it back.
        title = next(
            (line for line in lines[:4] if not re.fullmatch(r"D\s*\d+", line, flags=re.I)),
            f"D{day_index}",
        )
        rows.append({"day": f"D{day_index}", "day_index": str(day_index), "title": title, "body": body})
    return rows[:12]
def extract_attractions(text: str) -> list[str]:
    """List the canonical attraction names whose aliases occur in *text*.

    Names come back in the order of ATTRACTION_ALIASES, each at most once.
    Matching is a plain substring test.
    """
    return [
        canonical
        for canonical, aliases in ATTRACTION_ALIASES.items()
        if any(alias in text for alias in aliases)
    ]
def sentence_snippets(text: str, keywords: list[str], limit: int = 20) -> list[str]:
    """Collect sentences of *text* that mention any of *keywords*.

    The text is split on sentence punctuation and newlines. Each fragment is
    flattened with compact(), fragments shorter than 4 characters are skipped,
    and matching fragments are cut to 260 characters.

    Collection stops once *limit* matches have been gathered. Duplicates are
    removed afterwards, so the result can hold fewer than *limit* items.
    """
    collected: list[str] = []
    for fragment in re.split(r"[。!?;;\n]+", text):
        sentence = compact(fragment)
        if len(sentence) >= 4 and any(word in sentence for word in keywords):
            collected.append(sentence[:260])
            if len(collected) >= limit:
                break
    # dict keys keep first-seen order, giving an order-preserving dedupe.
    return list(dict.fromkeys(collected))
def md_table(headers: list[str], rows: list[list[Any]]) -> str:
    """Render *headers* and *rows* as a Markdown pipe table.

    Body cells are flattened with compact() and have ``|`` escaped as ``\\|``;
    header texts are emitted unchanged.
    """
    def render(cells) -> str:
        # One table line from already-formatted cell texts.
        return "| " + " | ".join(cells) + " |"

    table = [render(headers), render(["---"] * len(headers))]
    for row in rows:
        table.append(render(compact(value).replace("|", "\\|") for value in row))
    return "\n".join(table)
def code_block(text: str) -> str:
    """Wrap *text* in a fenced ``text`` code block.

    The text goes through clean() first, and any triple backtick inside it is
    replaced by three single quotes so it cannot close the fence early.
    """
    body = clean(text).replace("```", "'''")
    return "".join(["```text\n", body, "\n```"])
def source_files() -> list[Path]:
    """Return the sorted .doc/.docx files sitting directly in SOURCE_DIR.

    Directories are skipped, and so are entries whose name starts with ``.``,
    ``~$`` or ``.~``.
    """
    wanted_suffixes = {".doc", ".docx"}
    skipped_prefixes = (".", "~$", ".~")
    found: list[Path] = []
    for entry in SOURCE_DIR.iterdir():
        if not entry.is_file():
            continue
        if entry.suffix.lower() not in wanted_suffixes:
            continue
        if entry.name.startswith(skipped_prefixes):
            continue
        found.append(entry)
    found.sort()
    return found
def build_markdown(path: Path, index: int) -> tuple[str, dict[str, Any]]:
    """Render one source document as a structured Markdown page plus index metadata.

    The document text is read with read_office_text(), several product
    attributes are inferred from keywords, and the result is laid out in six
    numbered sections: identification, route skeleton, daily detail,
    resource-slot candidates, fees and rules, and the full original text.

    Args:
        path: The .doc/.docx file to convert.
        index: Accepted for the caller's convenience; it is not referenced
            anywhere in this function body.

    Returns:
        A ``(markdown, meta)`` pair. ``meta`` holds every front-matter field
        plus ``markdown_filename`` (left as an empty string here),
        ``simple_day_count``, ``detailed_day_count`` and ``text_chars``.
    """
    text = read_office_text(path)
    lines = [compact(x) for x in text.splitlines() if compact(x)]
    # Product name: first non-empty line, unless it is shorter than 3
    # characters or contains "INCLUDEPICTURE"; then the file stem is used.
    name = lines[0] if lines else path.stem
    if len(name) < 3 or "INCLUDEPICTURE" in name:
        name = path.stem
    # Attribute guesses only look at the file stem plus a bounded prefix of the text.
    duration = duration_from_text(f"{path.stem} {name} {text[:300]}")
    family = product_family(name, text)
    vehicle = vehicle_from_text(f"{path.stem} {text[:2200]}")
    hotel_grade = hotel_grade_from_text(f"{path.stem} {text[:2600]}")
    simple_rows = parse_simple_itinerary(text)
    day_segments = extract_day_segments(text)
    # Attractions come from the overview routes, or from the day titles when
    # there are no overview routes; day bodies are the last resort.
    route_text = " ".join(row.get("route", "") for row in simple_rows) or " ".join(row.get("title", "") for row in day_segments)
    attractions = extract_attractions(f"{name} {path.stem} {route_text}")
    if not attractions:
        attractions = extract_attractions(" ".join(day.get("body", "")[:1200] for day in day_segments))
    # NOTE(review): selling_points is computed but not referenced again in this function.
    selling_points = sentence_snippets(text, ["核心卖点", "甄选", "赠送", "超值", "纯玩", "0购物", "车型"], limit=12)
    fees = sentence_snippets(text, ["费用", "不含", "自理", "电瓶车", "观光车", "环保车", "保险", "扶梯", "索道", "餐标", "单房差", "儿童价"], limit=30)
    rules = sentence_snippets(text, ["老人", "儿童", "学生", "军人", "退团", "退费", "不可抗力", "预约", "投诉", "意见单", "满房", "同级", "孕妇", "不接待"], limit=30)
    hotel_block = extract_between(text, ["酒店"], ["导游服务", "购物", "意见单填写", "温馨提示"], 3200)
    gift_block = extract_between(text, ["赠送服务", "赠送:"], ["简易行程", "详细行程", "接待标准"], 1800)
    reception_block = extract_between(text, ["接待标准"], ["温馨提示", "· 温馨提示"], 4200)
    # Written at the top of the page as JSON between two "---" lines, and
    # merged into the returned metadata.
    frontmatter = {
        "doc_type": "existing_route_product_markdown",
        "schema_target": "travel_agency_existing_product",
        "route_immutable": True,
        "source_file": str(path),
        "source_filename": path.name,
        "product_name": name,
        "duration_days": duration,
        "product_family": family,
        "default_vehicle_type": vehicle,
        "default_hotel_grade": hotel_grade,
        "core_attractions": attractions,
    }
    # Route skeleton rows: overview table if present, otherwise day titles
    # with empty meal/accommodation columns.
    rows = [[r["day"], r.get("route", ""), r.get("meals", ""), r.get("accommodation", "")] for r in simple_rows]
    if not rows:
        rows = [[r["day"], r.get("title", ""), "", ""] for r in day_segments]
    parts: list[str] = []
    parts.append("---")
    parts.append(json.dumps(frontmatter, ensure_ascii=False, indent=2))
    parts.append("---")
    parts.append(f"# {name}")
    parts.append("")
    # Section 1: product identification table.
    parts.append("## 1. 产品识别")
    # NOTE(review): the second column header is an empty string, and the
    # attraction names in the last row are joined with an empty separator;
    # confirm whether characters went missing here.
    parts.append(md_table(
        ["字段", ""],
        [
            ["源文件", str(path)],
            ["产品名称", name],
            ["天数", duration or ""],
            ["产品系列/类型", family],
            ["默认车型", vehicle],
            ["默认酒店等级", hotel_grade],
            ["路线是否固定", "是;客户微调只能改资源槽位,不能改天数、景点顺序、城市移动路径"],
            ["核心景点候选", "".join(attractions)],
        ],
    ))
    parts.append("")
    # Section 2: fixed route skeleton.
    parts.append("## 2. 固定路线骨架")
    parts.append("")
    parts.append("抽取目标:`TourProduct -> ProductDay -> RouteStop / RouteSegment`。")
    parts.append("")
    parts.append(md_table(["天数", "路线/标题", "用餐", "住宿"], rows))
    parts.append("")
    # Section 3: one subsection per detailed day segment.
    parts.append("## 3. 每日详细行程")
    parts.append("")
    parts.append("抽取目标:每天生成 `ProductDay`,从正文识别真实 `RouteStop` 和 `RouteSegment`;不要把费用说明里的景点当作真实停靠点。")
    for day in day_segments:
        idx = day["day_index"]
        # Pair the detailed segment with the overview row of the same day
        # index (both are strings); {} when the overview has no such day.
        simple = next((row for row in simple_rows if row["day_index"] == idx), {})
        parts.append("")
        parts.append(f"### {day['day']} {simple.get('route') or day.get('title') or ''}")
        parts.append("")
        parts.append(md_table(
            ["字段", ""],
            [
                ["day_index", idx],
                ["route_path", simple.get("route", "")],
                ["meal_text", simple.get("meals", "")],
                ["accommodation_text", simple.get("accommodation", "")],
            ],
        ))
        parts.append("")
        parts.append(code_block(day["body"]))
    parts.append("")
    # Section 4: candidate texts for configurable resource slots.
    parts.append("## 4. 可配置资源槽位候选")
    parts.append("")
    parts.append("抽取目标:`ResourceSlot -> ResourceOptionGroup -> Hotel/Restaurant/Vehicle/TicketFee/GiftService`。")
    parts.append("")
    parts.append("### 4.1 住宿槽位候选")
    parts.append("")
    if hotel_block:
        parts.append(code_block(hotel_block))
    else:
        parts.append("- 原文未明确独立酒店参考段;可从每日住宿列生成住宿槽位。")
    parts.append("")
    parts.append("### 4.2 餐饮槽位候选")
    parts.append("")
    # Each snippet list below falls back to a single placeholder bullet when empty.
    meal_lines = sentence_snippets(text, ["用餐", "餐标", "早餐", "中餐", "晚餐", "长桌宴", "酸汤鱼"], limit=20)
    parts.extend(f"- {line}" for line in meal_lines or ["原文未明确独立餐饮段;可从每日用餐列生成餐饮槽位。"])
    parts.append("")
    parts.append("### 4.3 车辆/交通槽位候选")
    parts.append("")
    traffic_lines = sentence_snippets(text, ["交通", "车型", "用车", "车辆", "保姆车", "商务车", "大巴", "接人", "送站", "接站"], limit=20)
    parts.extend(f"- {line}" for line in traffic_lines or ["原文未明确独立车辆段;可从产品名和行程交通描述识别默认车型。"])
    parts.append("")
    parts.append("### 4.4 门票/小交通/保险槽位候选")
    parts.append("")
    ticket_lines = sentence_snippets(text, ["门票", "观光车", "环保车", "电瓶车", "保险", "扶梯", "索道", "游船", "小交通", "景交"], limit=30)
    parts.extend(f"- {line}" for line in ticket_lines or ["原文未明确门票小交通段。"])
    parts.append("")
    parts.append("### 4.5 赠送服务槽位候选")
    parts.append("")
    if gift_block:
        parts.append(code_block(gift_block))
    else:
        gift_lines = sentence_snippets(text, ["赠送", "旅拍", "矿泉水", "长桌宴", "高山流水", "打糍粑"], limit=12)
        parts.extend(f"- {line}" for line in gift_lines or ["原文未明确赠送服务。"])
    parts.append("")
    # Section 5: fee and business-rule candidates.
    parts.append("## 5. 费用与规则候选")
    parts.append("")
    parts.append("抽取目标:`ProductPricePackage`、`TicketFee`、`FeeItem`、`BusinessRule`。")
    parts.append("")
    parts.append("### 5.1 费用候选")
    parts.extend(f"- {line}" for line in fees or ["未识别到明显费用候选。"])
    parts.append("")
    parts.append("### 5.2 业务规则候选")
    parts.extend(f"- {line}" for line in rules or ["未识别到明显规则候选。"])
    if reception_block:
        parts.append("")
        parts.append("### 5.3 接待标准/服务规则原文")
        parts.append(code_block(reception_block))
    parts.append("")
    # Section 6: the complete extracted text, kept verbatim as evidence.
    parts.append("## 6. 原文保留")
    parts.append("")
    parts.append("后续抽取如果结构化段落不够,可回到本段原文补证据。")
    parts.append("")
    parts.append(code_block(text))
    markdown = "\n".join(parts).strip() + "\n"
    meta = {
        **frontmatter,
        # Filled in by main() once the output filename is known.
        "markdown_filename": "",
        "simple_day_count": len(simple_rows),
        "detailed_day_count": len(day_segments),
        "text_chars": len(clean(text)),
    }
    return markdown, meta
def write_index(items: list[dict[str, Any]]) -> None:
    """Write the README index and the JSON product index into OUT_DIR.

    The README lists the generation time, source and output folders, product
    counts by duration and by family, a table with one row per product, and a
    fixed list of follow-up extraction notes. The JSON file is a dump of
    *items* as given.

    Args:
        items: Per-product metadata dicts. Each must contain ``product_name``
            and ``markdown_filename``; the other fields are optional.
    """
    summary_rows = []
    for item in items:
        summary_rows.append([
            item["product_name"],
            item.get("duration_days") or "",
            item.get("product_family") or "",
            item.get("default_vehicle_type") or "",
            item.get("default_hotel_grade") or "",
            "".join(item.get("core_attractions") or []),
            item["markdown_filename"],
        ])

    counts = Counter(item.get("duration_days") for item in items)
    family_counts = Counter(item.get("product_family") for item in items)

    def duration_order(entry):
        # Known durations ascending, unknown (None) last.
        return (entry[0] is None, entry[0] or 0)

    duration_lines = [
        f"- {day or '未知'}天:{count}"
        for day, count in sorted(counts.items(), key=duration_order)
    ]
    family_lines = [f"- {name}{count}" for name, count in family_counts.most_common()]

    lines = [
        "# 已有路线产品 Markdown 整理索引",
        "",
        f"生成时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        f"- 源目录:`{SOURCE_DIR}`",
        f"- 输出目录:`{OUT_DIR}`",
        f"- 产品文档数:{len(items)}",
        "",
        "## 按天数统计",
        "",
    ]
    lines.extend(duration_lines)
    lines.extend(["", "## 按产品类型统计", ""])
    lines.extend(family_lines)
    lines.extend(["", "## 产品索引", ""])
    lines.append(md_table(["产品名称", "天数", "产品类型", "默认车型", "默认酒店", "核心景点候选", "Markdown文件"], summary_rows))
    lines.extend([
        "",
        "## 后续抽取建议",
        "",
        "- 先抽取 `TourProduct`、`ProductDay`、`RouteStop`、`RouteSegment`,确保路线骨架稳定。",
        "- 再抽取 `ResourceSlot`,把住宿、餐饮、车辆、接送、门票小交通、赠送服务作为可配置槽位。",
        "- 产品文档里“费用说明/自理项目/不含”出现的景点名称,不要抽成真实路线停靠点。",
        "- 酒店参考段优先抽成 `ResourceOptionGroup`,不要强行抽成唯一入住酒店。",
        "- 规则要挂到影响对象:退改挂产品/价格包,优惠挂门票费用,少走路/预约风险挂景点或停靠点,可替换规则挂资源槽位。",
    ])

    readme_path = OUT_DIR / "README_已有路线产品md整理.md"
    readme_path.write_text("\n".join(lines), encoding="utf-8")
    index_path = OUT_DIR / "产品索引.json"
    index_path.write_text(json.dumps(items, ensure_ascii=False, indent=2), encoding="utf-8")
def main() -> dict[str, Any]:
    """Convert every source document and write all outputs.

    Steps: create the output folders, write one Markdown page per source file
    under ``OUT_DIR/products``, write the README and JSON indexes, copy the
    indexes and every product page into GRAPH_OUT_DIR, and write a run summary
    JSON into both output folders.

    Returns:
        The run summary: source, output and graph folders, the number of
        product pages, and the generation timestamp.
    """
    def mirror(src: Path, dest_dir: Path) -> None:
        # Copy a UTF-8 text file into dest_dir under the same name.
        (dest_dir / src.name).write_text(src.read_text(encoding="utf-8"), encoding="utf-8")

    product_dir = OUT_DIR / "products"
    for folder in (OUT_DIR, GRAPH_OUT_DIR, product_dir):
        folder.mkdir(parents=True, exist_ok=True)

    items: list[dict[str, Any]] = []
    for index, path in enumerate(source_files(), start=1):
        markdown, meta = build_markdown(path, index)
        filename = safe_filename(meta["product_name"] or path.stem, index)
        meta["markdown_filename"] = f"products/{filename}"
        (product_dir / filename).write_text(markdown, encoding="utf-8")
        items.append(meta)
    write_index(items)

    # Keep a synchronized copy under 图谱数据 for extraction experiments.
    for index_file in (OUT_DIR / "README_已有路线产品md整理.md", OUT_DIR / "产品索引.json"):
        mirror(index_file, GRAPH_OUT_DIR)
    graph_products = GRAPH_OUT_DIR / "products"
    graph_products.mkdir(parents=True, exist_ok=True)
    # The glob picks up every *.md in the folder, not only this run's files.
    for page in product_dir.glob("*.md"):
        mirror(page, graph_products)

    summary = {
        "source_dir": str(SOURCE_DIR),
        "output_dir": str(OUT_DIR),
        "graph_output_dir": str(GRAPH_OUT_DIR),
        "product_markdown_count": len(items),
        "generated_at": datetime.now().isoformat(timespec="seconds"),
    }
    summary_json = json.dumps(summary, ensure_ascii=False, indent=2)
    for folder in (OUT_DIR, GRAPH_OUT_DIR):
        (folder / "生成摘要.json").write_text(summary_json, encoding="utf-8")
    return summary
# Script entry point: run the full conversion and print the run summary as JSON.
if __name__ == "__main__":
    print(json.dumps(main(), ensure_ascii=False, indent=2))