Files
bxh/scripts/organize_existing_routes_to_md.py

546 lines
22 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
import hashlib
import json
import re
import subprocess
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Any
from common_paths import TRAVEL_AGENCY_SOURCE_ROOT, TRAVEL_KG_EXPORT_ROOT
# Folder that source_files() scans for the original .doc/.docx itineraries.
SOURCE_DIR = TRAVEL_AGENCY_SOURCE_ROOT / "2026年新行程打包"
# Primary output folder: main() writes products/*.md, the README index,
# the JSON index and the run summary here.
OUT_DIR = TRAVEL_AGENCY_SOURCE_ROOT / "2026年新行程打包_md整理"
# Second output folder: main() mirrors everything written to OUT_DIR here.
GRAPH_OUT_DIR = TRAVEL_KG_EXPORT_ROOT / "旅行社项目入库/已有路线产品Markdown"
# Canonical attraction name -> substrings that count as a mention of it.
# extract_attractions() uses a plain `alias in text` test, so short aliases
# (e.g. "西江", "茅台", "FAST") also match inside longer words.
# Dict order is the order in which detected attractions are reported.
ATTRACTION_ALIASES = {
"黄果树": ["黄果树", "黄果树瀑布", "黄果树大瀑布"],
"天星桥": ["天星桥", "天星桥景区"],
"陡坡塘瀑布": ["陡坡塘", "陡坡塘瀑布"],
"荔波小七孔": ["小七孔", "荔波小七孔", "小七孔景区"],
"西江千户苗寨": ["西江", "西江苗寨", "西江千户苗寨"],
"镇远古城": ["镇远", "镇远古镇", "镇远古城"],
"梵净山": ["梵净山"],
"青岩古镇": ["青岩", "青岩古镇"],
"百里杜鹃": ["百里杜鹃"],
"平坝樱花": ["平坝樱花", "平坝农场"],
"织金洞": ["织金洞"],
"中国天眼": ["天眼", "中国天眼", "FAST"],
"茅台镇": ["茅台", "茅台镇"],
"遵义会议会址": ["遵义会址", "遵义会议会址"],
"兴义万峰林": ["万峰林", "兴义万峰林"],
"万峰湖": ["万峰湖"],
"马岭河峡谷": ["马岭河", "马岭河峡谷"],
"花江大桥": ["花江大桥"],
"龙宫": ["龙宫"],
"天河潭": ["天河潭"],
"甲秀楼": ["甲秀楼"],
"黔灵山公园": ["黔灵公园", "黔灵山"],
"乌江寨": ["乌江寨"],
}
def clean(value: Any) -> str:
    """Return *value* as tidied text; ``None`` becomes the empty string.

    NUL, zero-width-space and right-to-left-mark characters are removed,
    each run of spaces/tabs becomes one space, runs of four or more
    newlines are cut down to three, and the result is stripped.
    """
    if value is None:
        return ""
    tidy = str(value)
    for unwanted in ("\x00", "\u200b", "\u200f"):
        tidy = tidy.replace(unwanted, "")
    tidy = re.sub(r"[ \t]+", " ", tidy)
    tidy = re.sub(r"\n{4,}", "\n\n\n", tidy)
    return tidy.strip()
def compact(value: Any) -> str:
    """Return ``clean(value)`` squeezed onto one line.

    Every run of whitespace (newlines included) becomes a single space and
    the ends are stripped.
    """
    cleaned = clean(value)
    single_line = re.sub(r"\s+", " ", cleaned)
    return single_line.strip()
def chinese_count(text: str) -> int:
    """Count the characters of *text* that lie in U+4E00..U+9FFF."""
    return sum(1 for _ in re.finditer(r"[\u4e00-\u9fff]", text))
def read_legacy_doc_text(path: Path) -> str:
    """Fallback for old WPS/Word .doc files that macOS textutil cannot decode.

    The raw bytes are decoded as UTF-16LE (undecodable bytes ignored), the
    text is cut to start at the earliest known heading anchor, capped at
    50,000 characters, and stripped of control characters and junk runs
    before being passed through ``clean``.

    Args:
        path: Location of the .doc file to read.

    Returns:
        The best-effort plain text; callers should check that it contains
        enough real content before trusting it.
    """
    decoded = path.read_bytes().decode("utf-16le", errors="ignore").replace("\x00", "")
    # NOTE(review): the reviewed copy also listed an empty-string anchor
    # (presumably a marker glyph that was lost).  ``str.find("")`` is always
    # 0, which pinned the cut position to 0 and disabled the trimming below,
    # so it is dropped here; restore the real glyph if it is known.
    anchors = [
        path.stem,
        "贵客黔游",
        "推荐理由",
        "推 荐 理 由",
        "简易行程",
        "行程安排",
    ]
    # Skip empty anchors defensively and search each anchor only once.
    hits = [pos for pos in (decoded.find(anchor) for anchor in anchors if anchor) if pos >= 0]
    if hits:
        decoded = decoded[min(hits):]
    decoded = decoded[:50000]
    decoded = decoded.replace("\r", "\n")
    # Drop control characters but keep \x07 and \n: parse_simple_itinerary()
    # splits table cells on \x07.
    decoded = re.sub(r"[\x01-\x06\x08-\x09\x0b-\x1f]", "", decoded)
    # Long runs in U+4000..U+4FFF are treated as mis-decoded binary, and
    # repeated U+FFFF as padding.  Written as escapes so the ranges stay
    # unambiguous in editors and web viewers.
    decoded = re.sub(r"[\u4000-\u4fff]{8,}", "", decoded)
    decoded = re.sub(r"[\uffff]{2,}", "", decoded)
    decoded = re.sub(r"\n{3,}", "\n\n", decoded)
    return clean(decoded)
def read_office_text(path: Path) -> str:
    """Extract plain text from a .doc/.docx file.

    ``textutil`` is tried first.  Its output is accepted when the command
    succeeds and yields at least 30 Chinese characters; otherwise the raw
    UTF-16 fallback reader is tried with the same threshold.  If neither
    is good enough, a ``[textutil读取失败] ...`` message is returned when
    the command failed (or could not be started), else whatever textutil
    printed.

    Args:
        path: Document to convert.

    Returns:
        Extracted text, or a failure message as described above.
    """
    command = ["textutil", "-convert", "txt", "-stdout", str(path)]
    try:
        proc = subprocess.run(
            command,
            check=False,
            capture_output=True,
            text=True,
            # Decode explicitly instead of relying on the locale encoding,
            # and never crash on a stray undecodable byte.
            encoding="utf-8",
            errors="replace",
        )
    except OSError as exc:
        # textutil is missing or not executable (e.g. not running on
        # macOS): treat it like a failed conversion so the fallback reader
        # still gets a chance.
        converted = ""
        failed = True
        failure_detail = str(exc)
    else:
        converted = proc.stdout.replace("\x00", "")
        failed = proc.returncode != 0
        failure_detail = proc.stderr
        if not failed and chinese_count(converted) >= 30:
            return converted
    fallback = read_legacy_doc_text(path)
    if chinese_count(fallback) >= 30:
        return fallback
    if failed:
        return f"[textutil读取失败] {failure_detail}"
    return converted
def safe_filename(text: str, index: int) -> str:
    """Build the Markdown file name ``NN_<name>_<hash>.md`` for a product.

    The name part is *text* on one line with path-hostile characters turned
    into ``_``, all whitespace removed, leading/trailing ``._ `` trimmed and
    at most 70 characters kept.  The hash part is the first 8 hex digits of
    the MD5 of the untouched *text*.
    """
    digest = hashlib.md5(text.encode("utf-8")).hexdigest()[:8]
    stem = compact(text)
    stem = re.sub(r"[\\/:*?\"<>|]+", "_", stem)
    stem = re.sub(r"\s+", "", stem).strip("._ ")
    return f"{index:02d}_{stem[:70]}_{digest}.md"
def duration_from_text(text: str) -> int | None:
    """Guess the trip length in days from free text.

    Arabic digits are tried first, in the forms ``N日游``, ``N天`` and
    ``N日``; then a single Chinese numeral followed by ``日游``.

    Args:
        text: Product name / file name / leading document text.

    Returns:
        The number of days, or ``None`` when nothing matches.
    """
    for pattern in (r"(\d+)\s*日游", r"(\d+)\s*天", r"(\d+)\s*日"):
        m = re.search(pattern, text)
        if m:
            return int(m.group(1))
    # The keys must be the same numerals the regex below accepts.  In the
    # reviewed copy every key was an empty string, so the dict collapsed to
    # one entry and Chinese-numeral durations always came back as None.
    cn = {
        "一": 1,
        "二": 2,
        "两": 2,
        "三": 3,
        "四": 4,
        "五": 5,
        "六": 6,
        "七": 7,
        "八": 8,
        "九": 9,
        "十": 10,
    }
    m = re.search(r"([一二两三四五六七八九十])日游", text)
    if m:
        return cn.get(m.group(1))
    return None
def product_family(name: str, text: str) -> str:
    """Classify a product into a family by keyword lookup.

    Only *name* plus the first 1500 characters of *text* are inspected.
    The rules are checked top to bottom and the first hit wins; with no
    hit the family is ``常规纯玩``.
    """
    haystack = f"{name} {text[:1500]}"
    rules = (
        (("高端", "5钻", "五钻"), "高端纯玩"),
        (("轻奢", "2+1", "保姆车", "头等舱"), "轻奢纯玩"),
        (("多彩",), "多彩贵州"),
        (("经典",), "经典纯玩"),
        (("1+1", "游黔途", "游黔程"), "游黔途/游黔程"),
        (("独立", "20-25"), "独立成团"),
    )
    for keywords, label in rules:
        if any(keyword in haystack for keyword in keywords):
            return label
    return "常规纯玩"
def hotel_grade_from_text(text: str) -> str:
    """Return the first hotel grade whose keyword occurs in *text*.

    Grades are checked from highest to lowest; the empty string means no
    keyword was found.
    """
    grades = (
        (("5钻", "五钻", "五星", "超五星"), "5钻/五星"),
        (("4钻", "四钻", "四星"), "4钻/四星"),
        (("商务",), "商务"),
        (("客栈",), "客栈"),
    )
    for keywords, grade in grades:
        for keyword in keywords:
            if keyword in text:
                return grade
    return ""
def vehicle_from_text(text: str) -> str:
    """Return the default vehicle label for the first matching keyword.

    The rule order is significant (for example ``2+1`` is tested before
    ``32-38``).  The empty string means nothing matched.
    """
    rules = (
        (("1+1",), "1+1双排座"),
        (("2+1", "保姆车", "头等舱"), "2+1保姆车/头等舱"),
        (("2+2",), "2+2商务车"),
        (("32-38", "独立成团"), "32-38座2+1大巴"),
        (("旅游大巴", "大巴"), "旅游大巴"),
        (("9座",), "9座商务车"),
        (("7座",), "7座商务车"),
        (("5座",), "5座车"),
    )
    for keywords, label in rules:
        if any(keyword in text for keyword in keywords):
            return label
    return ""
def extract_between(text: str, starts: list[str], ends: list[str], limit: int | None = None) -> str:
    """Cut out the section that begins at the earliest *starts* token.

    The section runs up to the nearest *ends* token found at least two
    characters after the start, or to the end of *text* when none is found.
    The slice is passed through ``clean`` and, when *limit* is truthy,
    truncated to that many characters.  Returns ``""`` when no start token
    occurs.
    """
    start_hits = [pos for pos in (text.find(token) for token in starts) if pos >= 0]
    if not start_hits:
        return ""
    begin = min(start_hits)
    search_from = begin + 2
    end_hits = [pos for pos in (text.find(token, search_from) for token in ends) if pos >= 0]
    finish = min(end_hits, default=len(text))
    section = clean(text[begin:finish])
    return section[:limit] if limit else section
def parse_simple_itinerary(text: str) -> list[dict[str, str]]:
    """Parse the per-day summary table of an itinerary document.

    The ``简易行程`` section is preferred ("simple" layout: the three cells
    after the day marker are route, meals, accommodation).  If it is absent
    the ``行程安排`` section is used ("schedule" layout: content is two
    cells after the marker, accommodation three, meals four).

    Args:
        text: Full plain text of the document.

    Returns:
        One dict per ``D<n>`` cell with the keys ``day``, ``day_index``,
        ``route``, ``meals`` and ``accommodation``; an empty list when
        neither section exists.
    """
    block = extract_between(text, ["简易行程"], ["详细行程", "接待标准", "费用包含", "团费包含"], 2600)
    layout = "simple"
    if not block:
        block = extract_between(text, ["行程安排"], ["接待标准", "费用包含", "团费包含", "特别提醒"], 4200)
        layout = "schedule"
    if not block:
        return []
    # NOTE(review): extract_between() runs clean(), which turns tabs into
    # spaces, so in practice only \x07 still separates cells here.
    cells = [cell for cell in (compact(piece) for piece in re.split(r"[\x07\t]+", block)) if cell]

    def cell_at(position: int) -> str:
        """Return the cell at *position*, or "" past the end of the table."""
        return cells[position] if position < len(cells) else ""

    rows: list[dict[str, str]] = []
    for idx, cell in enumerate(cells):
        if not re.fullmatch(r"D\s*\d+", cell, flags=re.I):
            continue
        # Normalise "01" -> "1" so the index compares equal to the
        # str(int) index produced by extract_day_segments().
        day_index = str(int(re.search(r"\d+", cell).group()))
        if layout == "schedule" and idx + 4 < len(cells):
            content = cells[idx + 2]
            # NOTE(review): the separator literal was empty in the reviewed
            # copy, which ran the attraction names together; "、" is an
            # assumed replacement - confirm the intended delimiter.
            attraction_route = "、".join(extract_attractions(content))
            route = attraction_route or content[:80]
            meals = cells[idx + 4]
            accommodation = cells[idx + 3]
        else:
            route = cell_at(idx + 1)
            meals = cell_at(idx + 2)
            accommodation = cell_at(idx + 3)
        rows.append({
            "day": cell.upper().replace(" ", ""),
            "day_index": day_index,
            "route": route,
            "meals": meals,
            "accommodation": accommodation,
        })
    return rows
def extract_day_segments(text: str) -> list[dict[str, str]]:
    """Split the detailed itinerary into one segment per day marker.

    Day markers are ``D<n>`` or ``第<numeral>天``.  Each segment runs from
    its marker to the next marker; the last one is additionally cut at the
    first following section heading.  When the detailed section is shorter
    than 50 characters the whole document is scanned instead.

    Args:
        text: Full plain text of the document.

    Returns:
        At most 12 dicts with the keys ``day``, ``day_index``, ``title``
        and ``body``; an empty list when no marker is found.
    """
    source = extract_between(
        text,
        ["详细行程", "行程安排"],
        ["· 接待标准", "接待标准", "· 费用包含", "费用包含", "团费包含", "参团须知", "· 温馨提示"],
    )
    if len(source) < 50:
        source = text
    matches = list(re.finditer(r"(?m)(D\s*\d+|第[一二三四五六七八九十]+天)[:、\s]*", source))
    if not matches:
        return []
    # In the reviewed copy every key was an empty string, so the dict
    # collapsed to one entry and "第N天" always fell back to the positional
    # index.  Compound numerals such as "十一" still use that fallback.
    cn_map = {
        "一": 1,
        "二": 2,
        "两": 2,
        "三": 3,
        "四": 4,
        "五": 5,
        "六": 6,
        "七": 7,
        "八": 8,
        "九": 9,
        "十": 10,
    }
    # NOTE(review): the original excluded title lines starting with a marker
    # literal that is empty in the reviewed copy (presumably a bullet glyph
    # that was lost).  str.startswith("") is always true, so that check
    # rejected every line and the title always fell back to "D<n>".  The
    # marker is only applied when non-empty; restore the glyph once known.
    skipped_title_prefix = ""
    last_segment_stops = ["· 接待标准", "接待标准", "· 费用包含", "费用包含", "团费包含", "参团须知", "温馨提示"]
    rows: list[dict[str, str]] = []
    for idx, match in enumerate(matches):
        if idx + 1 < len(matches):
            end = matches[idx + 1].start()
        else:
            # NOTE(review): indentation was lost in the reviewed copy; this
            # clipping loop is assumed to apply to the last segment only.
            end = len(source)
            for stop in last_segment_stops:
                pos = source.find(stop, match.start() + 2)
                if pos >= 0:
                    end = min(end, pos)
        label = match.group(1)
        if label.startswith("D"):
            day_index = int(re.search(r"\d+", label).group())
        else:
            day_index = cn_map.get(re.search(r"第(.+?)天", label).group(1), idx + 1)
        body = clean(source[match.start():end])
        lines = [line for line in (compact(piece) for piece in body.splitlines()) if line]
        title = f"D{day_index}"
        for line in lines[:4]:
            if skipped_title_prefix and line.startswith(skipped_title_prefix):
                continue
            if re.fullmatch(r"D\s*\d+", line, flags=re.I):
                # A bare day marker on its own line is not a title.
                continue
            title = line
            break
        rows.append({"day": f"D{day_index}", "day_index": str(day_index), "title": title, "body": body})
    return rows[:12]
def extract_attractions(text: str) -> list[str]:
    """List the canonical attractions mentioned in *text*.

    An attraction counts as mentioned when any of its aliases from
    ``ATTRACTION_ALIASES`` occurs as a substring.  The result follows the
    order of that table.
    """
    return [
        canonical
        for canonical, aliases in ATTRACTION_ALIASES.items()
        if any(alias in text for alias in aliases)
    ]
def sentence_snippets(text: str, keywords: list[str], limit: int = 20) -> list[str]:
    """Collect distinct sentences of *text* that mention any keyword.

    The text is split on sentence punctuation and newlines.  Fragments are
    compacted to one line, ignored when shorter than four characters, and
    truncated to 260 characters.

    Args:
        text: Document text to scan.
        keywords: Substrings; a sentence qualifies when it contains any.
        limit: Maximum number of distinct snippets to return.

    Returns:
        Matching snippets in document order, without duplicates.
    """
    # The reviewed copy listed only ASCII "!?;" (with ";" duplicated), so
    # full-width "!", "?" and ";" never ended a sentence.  Both widths are
    # accepted; the full-width ones are written as escapes so they cannot
    # be silently normalised to ASCII again.
    sentence_breaks = r"[。!?;\uff01\uff1f\uff1b\n]+"
    seen: set[str] = set()
    snippets: list[str] = []
    for raw in re.split(sentence_breaks, text):
        line = compact(raw)
        if len(line) < 4:
            continue
        if not any(keyword in line for keyword in keywords):
            continue
        snippet = line[:260]
        # De-duplicate while collecting so that *limit* counts distinct
        # snippets; previously duplicates used up the limit and were only
        # removed afterwards.
        if snippet in seen:
            continue
        seen.add(snippet)
        snippets.append(snippet)
        if len(snippets) >= limit:
            break
    return snippets
def md_table(headers: list[str], rows: list[list[Any]]) -> str:
    """Render *headers* and *rows* as a pipe-delimited Markdown table.

    Body cells are compacted to one line and ``|`` is backslash-escaped;
    header cells are emitted as given.
    """
    def cell(value: Any) -> str:
        return compact(value).replace("|", "\\|")

    def table_line(cells: list[str]) -> str:
        return "| " + " | ".join(cells) + " |"

    rendered = [table_line(headers), table_line(["---"] * len(headers))]
    rendered.extend(table_line([cell(value) for value in row]) for row in rows)
    return "\n".join(rendered)
def code_block(text: str) -> str:
    """Wrap cleaned *text* in a ``text`` fenced block.

    Any triple backtick inside the text is replaced by three single quotes
    so it cannot close the fence early.
    """
    body = clean(text).replace("```", "'''")
    return f"```text\n{body}\n```"
def source_files() -> list[Path]:
    """Return the sorted .doc/.docx files directly inside ``SOURCE_DIR``.

    Entries whose name starts with ``.``, ``~$`` or ``.~`` are skipped.
    """
    wanted_suffixes = {".doc", ".docx"}
    skipped_prefixes = (".", "~$", ".~")
    found: list[Path] = []
    for entry in SOURCE_DIR.iterdir():
        if not entry.is_file():
            continue
        if entry.suffix.lower() not in wanted_suffixes:
            continue
        if entry.name.startswith(skipped_prefixes):
            continue
        found.append(entry)
    found.sort()
    return found
def build_markdown(path: Path, index: int) -> tuple[str, dict[str, Any]]:
    """Render one source itinerary document as Markdown plus index metadata.

    The Markdown consists of a JSON front-matter block followed by six
    sections: product identification, route skeleton, per-day detail,
    resource-slot candidates, fee/rule candidates and the full original
    text.

    Args:
        path: Source .doc/.docx file.
        index: Position of the file in the run; accepted for interface
            compatibility and not used by this function.

    Returns:
        ``(markdown, meta)``.  ``meta`` is the front-matter dict extended
        with ``markdown_filename`` (left empty for the caller to fill in),
        ``simple_day_count``, ``detailed_day_count`` and ``text_chars``.
    """
    text = read_office_text(path)
    lines = [line for line in (compact(raw) for raw in text.splitlines()) if line]
    name = lines[0] if lines else path.stem
    # Fall back to the file name when the first line is too short or is an
    # INCLUDEPICTURE fragment rather than a usable title.
    if len(name) < 3 or "INCLUDEPICTURE" in name:
        name = path.stem
    duration = duration_from_text(f"{path.stem} {name} {text[:300]}")
    family = product_family(name, text)
    vehicle = vehicle_from_text(f"{path.stem} {text[:2200]}")
    hotel_grade = hotel_grade_from_text(f"{path.stem} {text[:2600]}")
    simple_rows = parse_simple_itinerary(text)
    day_segments = extract_day_segments(text)
    route_text = " ".join(row.get("route", "") for row in simple_rows) or " ".join(row.get("title", "") for row in day_segments)
    attractions = extract_attractions(f"{name} {path.stem} {route_text}")
    if not attractions:
        # Nothing in the titles or routes: look at the start of each day body.
        attractions = extract_attractions(" ".join(day.get("body", "")[:1200] for day in day_segments))
    fees = sentence_snippets(text, ["费用", "不含", "自理", "电瓶车", "观光车", "环保车", "保险", "扶梯", "索道", "餐标", "单房差", "儿童价"], limit=30)
    rules = sentence_snippets(text, ["老人", "儿童", "学生", "军人", "退团", "退费", "不可抗力", "预约", "投诉", "意见单", "满房", "同级", "孕妇", "不接待"], limit=30)
    hotel_block = extract_between(text, ["酒店"], ["导游服务", "购物", "意见单填写", "温馨提示"], 3200)
    gift_block = extract_between(text, ["赠送服务", "赠送:"], ["简易行程", "详细行程", "接待标准"], 1800)
    reception_block = extract_between(text, ["接待标准"], ["温馨提示", "· 温馨提示"], 4200)
    frontmatter = {
        "doc_type": "existing_route_product_markdown",
        "schema_target": "travel_agency_existing_product",
        "route_immutable": True,
        "source_file": str(path),
        "source_filename": path.name,
        "product_name": name,
        "duration_days": duration,
        "product_family": family,
        "default_vehicle_type": vehicle,
        "default_hotel_grade": hotel_grade,
        "core_attractions": attractions,
    }
    rows = [[r["day"], r.get("route", ""), r.get("meals", ""), r.get("accommodation", "")] for r in simple_rows]
    if not rows:
        rows = [[r["day"], r.get("title", ""), "", ""] for r in day_segments]
    # NOTE(review): the value-column header and the attraction separator
    # were empty strings in the reviewed copy, giving a blank header cell
    # and attraction names run together.  "值" and "、" are assumed
    # replacements - confirm against the intended output.
    field_headers = ["字段", "值"]
    attraction_list = "、".join(attractions)
    parts: list[str] = []
    parts.append("---")
    parts.append(json.dumps(frontmatter, ensure_ascii=False, indent=2))
    parts.append("---")
    parts.append(f"# {name}")
    parts.append("")
    parts.append("## 1. 产品识别")
    parts.append(md_table(
        field_headers,
        [
            ["源文件", str(path)],
            ["产品名称", name],
            ["天数", duration or ""],
            ["产品系列/类型", family],
            ["默认车型", vehicle],
            ["默认酒店等级", hotel_grade],
            ["路线是否固定", "是;客户微调只能改资源槽位,不能改天数、景点顺序、城市移动路径"],
            ["核心景点候选", attraction_list],
        ],
    ))
    parts.append("")
    parts.append("## 2. 固定路线骨架")
    parts.append("")
    parts.append("抽取目标:`TourProduct -> ProductDay -> RouteStop / RouteSegment`。")
    parts.append("")
    parts.append(md_table(["天数", "路线/标题", "用餐", "住宿"], rows))
    parts.append("")
    parts.append("## 3. 每日详细行程")
    parts.append("")
    parts.append("抽取目标:每天生成 `ProductDay`,从正文识别真实 `RouteStop` 和 `RouteSegment`;不要把费用说明里的景点当作真实停靠点。")
    for day in day_segments:
        idx = day["day_index"]
        # Pair the detailed segment with its summary-table row, if any.
        simple = next((row for row in simple_rows if row["day_index"] == idx), {})
        parts.append("")
        parts.append(f"### {day['day']} {simple.get('route') or day.get('title') or ''}")
        parts.append("")
        parts.append(md_table(
            field_headers,
            [
                ["day_index", idx],
                ["route_path", simple.get("route", "")],
                ["meal_text", simple.get("meals", "")],
                ["accommodation_text", simple.get("accommodation", "")],
            ],
        ))
        parts.append("")
        parts.append(code_block(day["body"]))
    parts.append("")
    parts.append("## 4. 可配置资源槽位候选")
    parts.append("")
    parts.append("抽取目标:`ResourceSlot -> ResourceOptionGroup -> Hotel/Restaurant/Vehicle/TicketFee/GiftService`。")
    parts.append("")
    parts.append("### 4.1 住宿槽位候选")
    parts.append("")
    if hotel_block:
        parts.append(code_block(hotel_block))
    else:
        parts.append("- 原文未明确独立酒店参考段;可从每日住宿列生成住宿槽位。")
    parts.append("")
    parts.append("### 4.2 餐饮槽位候选")
    parts.append("")
    meal_lines = sentence_snippets(text, ["用餐", "餐标", "早餐", "中餐", "晚餐", "长桌宴", "酸汤鱼"], limit=20)
    parts.extend(f"- {line}" for line in meal_lines or ["原文未明确独立餐饮段;可从每日用餐列生成餐饮槽位。"])
    parts.append("")
    parts.append("### 4.3 车辆/交通槽位候选")
    parts.append("")
    traffic_lines = sentence_snippets(text, ["交通", "车型", "用车", "车辆", "保姆车", "商务车", "大巴", "接人", "送站", "接站"], limit=20)
    parts.extend(f"- {line}" for line in traffic_lines or ["原文未明确独立车辆段;可从产品名和行程交通描述识别默认车型。"])
    parts.append("")
    parts.append("### 4.4 门票/小交通/保险槽位候选")
    parts.append("")
    ticket_lines = sentence_snippets(text, ["门票", "观光车", "环保车", "电瓶车", "保险", "扶梯", "索道", "游船", "小交通", "景交"], limit=30)
    parts.extend(f"- {line}" for line in ticket_lines or ["原文未明确门票小交通段。"])
    parts.append("")
    parts.append("### 4.5 赠送服务槽位候选")
    parts.append("")
    if gift_block:
        parts.append(code_block(gift_block))
    else:
        gift_lines = sentence_snippets(text, ["赠送", "旅拍", "矿泉水", "长桌宴", "高山流水", "打糍粑"], limit=12)
        parts.extend(f"- {line}" for line in gift_lines or ["原文未明确赠送服务。"])
    parts.append("")
    parts.append("## 5. 费用与规则候选")
    parts.append("")
    parts.append("抽取目标:`ProductPricePackage`、`TicketFee`、`FeeItem`、`BusinessRule`。")
    parts.append("")
    parts.append("### 5.1 费用候选")
    parts.extend(f"- {line}" for line in fees or ["未识别到明显费用候选。"])
    parts.append("")
    parts.append("### 5.2 业务规则候选")
    parts.extend(f"- {line}" for line in rules or ["未识别到明显规则候选。"])
    if reception_block:
        parts.append("")
        parts.append("### 5.3 接待标准/服务规则原文")
        parts.append(code_block(reception_block))
    parts.append("")
    parts.append("## 6. 原文保留")
    parts.append("")
    parts.append("后续抽取如果结构化段落不够,可回到本段原文补证据。")
    parts.append("")
    parts.append(code_block(text))
    markdown = "\n".join(parts).strip() + "\n"
    meta = {
        **frontmatter,
        "markdown_filename": "",
        "simple_day_count": len(simple_rows),
        "detailed_day_count": len(day_segments),
        "text_chars": len(clean(text)),
    }
    return markdown, meta
def write_index(items: list[dict[str, Any]]) -> None:
    """Write the README index and the JSON index for all products.

    Both files go to ``OUT_DIR``: ``README_已有路线产品md整理.md`` (counts
    per duration and per product family, a product table and extraction
    advice) and ``产品索引.json`` (the raw *items*).

    Args:
        items: Per-product metadata dicts as returned by
            ``build_markdown``, with ``markdown_filename`` filled in.
    """
    summary_rows = [
        [
            item["product_name"],
            item.get("duration_days") or "",
            item.get("product_family") or "",
            item.get("default_vehicle_type") or "",
            item.get("default_hotel_grade") or "",
            # NOTE(review): the separator literal was empty in the reviewed
            # copy, which ran the names together; "、" is an assumed
            # replacement - confirm the intended delimiter.
            "、".join(item.get("core_attractions") or []),
            item["markdown_filename"],
        ]
        for item in items
    ]
    counts = Counter(item.get("duration_days") for item in items)
    family_counts = Counter(item.get("product_family") for item in items)
    # Known durations ascending, unknown (None) last.
    duration_lines = [
        f"- {day or '未知'}天:{count}"
        for day, count in sorted(counts.items(), key=lambda pair: (pair[0] is None, pair[0] or 0))
    ]
    # A colon between name and count, matching the per-duration lines;
    # the reviewed copy printed them run together (e.g. "经典纯玩3").
    family_lines = [f"- {name}:{count}" for name, count in family_counts.most_common()]
    lines = [
        "# 已有路线产品 Markdown 整理索引",
        "",
        f"生成时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        f"- 源目录:`{SOURCE_DIR}`",
        f"- 输出目录:`{OUT_DIR}`",
        f"- 产品文档数:{len(items)}",
        "",
        "## 按天数统计",
        "",
        *duration_lines,
        "",
        "## 按产品类型统计",
        "",
        *family_lines,
        "",
        "## 产品索引",
        "",
        md_table(["产品名称", "天数", "产品类型", "默认车型", "默认酒店", "核心景点候选", "Markdown文件"], summary_rows),
        "",
        "## 后续抽取建议",
        "",
        "- 先抽取 `TourProduct`、`ProductDay`、`RouteStop`、`RouteSegment`,确保路线骨架稳定。",
        "- 再抽取 `ResourceSlot`,把住宿、餐饮、车辆、接送、门票小交通、赠送服务作为可配置槽位。",
        "- 产品文档里“费用说明/自理项目/不含”出现的景点名称,不要抽成真实路线停靠点。",
        "- 酒店参考段优先抽成 `ResourceOptionGroup`,不要强行抽成唯一入住酒店。",
        "- 规则要挂到影响对象:退改挂产品/价格包,优惠挂门票费用,少走路/预约风险挂景点或停靠点,可替换规则挂资源槽位。",
    ]
    (OUT_DIR / "README_已有路线产品md整理.md").write_text("\n".join(lines), encoding="utf-8")
    (OUT_DIR / "产品索引.json").write_text(json.dumps(items, ensure_ascii=False, indent=2), encoding="utf-8")
def main() -> dict[str, Any]:
    """Convert every source document and write all outputs.

    For each file from ``source_files()`` a Markdown file is written under
    ``OUT_DIR/products``.  The index files are then generated, everything
    is mirrored to ``GRAPH_OUT_DIR``, and a ``生成摘要.json`` run summary
    is stored in both folders.

    Returns:
        The run summary dict that was written.
    """
    product_dir = OUT_DIR / "products"
    for folder in (OUT_DIR, GRAPH_OUT_DIR, product_dir):
        folder.mkdir(parents=True, exist_ok=True)
    items: list[dict[str, Any]] = []
    for position, doc_path in enumerate(source_files(), start=1):
        markdown, meta = build_markdown(doc_path, position)
        md_name = safe_filename(meta["product_name"] or doc_path.stem, position)
        meta["markdown_filename"] = f"products/{md_name}"
        (product_dir / md_name).write_text(markdown, encoding="utf-8")
        items.append(meta)
    write_index(items)
    # Keep a synchronized copy under the graph export folder for
    # extraction experiments.
    for index_name in ("README_已有路线产品md整理.md", "产品索引.json"):
        mirrored = (OUT_DIR / index_name).read_text(encoding="utf-8")
        (GRAPH_OUT_DIR / index_name).write_text(mirrored, encoding="utf-8")
    graph_products = GRAPH_OUT_DIR / "products"
    graph_products.mkdir(parents=True, exist_ok=True)
    for md_file in product_dir.glob("*.md"):
        content = md_file.read_text(encoding="utf-8")
        (graph_products / md_file.name).write_text(content, encoding="utf-8")
    summary = {
        "source_dir": str(SOURCE_DIR),
        "output_dir": str(OUT_DIR),
        "graph_output_dir": str(GRAPH_OUT_DIR),
        "product_markdown_count": len(items),
        "generated_at": datetime.now().isoformat(timespec="seconds"),
    }
    payload = json.dumps(summary, ensure_ascii=False, indent=2)
    for folder in (OUT_DIR, GRAPH_OUT_DIR):
        (folder / "生成摘要.json").write_text(payload, encoding="utf-8")
    return summary
if __name__ == "__main__":
    # Run the conversion and echo the run summary as readable JSON.
    run_summary = main()
    print(json.dumps(run_summary, ensure_ascii=False, indent=2))