"""Convert travel-agency itinerary documents into structured Markdown.

Reads every ``.doc``/``.docx`` file in ``SOURCE_DIR``, extracts text, derives a
few heuristic product attributes (duration, product family, vehicle, hotel
grade, attractions, per-day segments) and writes one Markdown file per product
plus an index into ``OUT_DIR``. The output is then mirrored to
``GRAPH_OUT_DIR``.
"""

from __future__ import annotations

import hashlib
import json
import re
import subprocess
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Any

from common_paths import TRAVEL_AGENCY_SOURCE_ROOT, TRAVEL_KG_EXPORT_ROOT

SOURCE_DIR = TRAVEL_AGENCY_SOURCE_ROOT / "2026年新行程打包"
OUT_DIR = TRAVEL_AGENCY_SOURCE_ROOT / "2026年新行程打包_md整理"
GRAPH_OUT_DIR = TRAVEL_KG_EXPORT_ROOT / "旅行社项目入库/已有路线产品Markdown"

# Canonical attraction name -> substrings that count as a mention of it.
ATTRACTION_ALIASES = {
    "黄果树": ["黄果树", "黄果树瀑布", "黄果树大瀑布"],
    "天星桥": ["天星桥", "天星桥景区"],
    "陡坡塘瀑布": ["陡坡塘", "陡坡塘瀑布"],
    "荔波小七孔": ["小七孔", "荔波小七孔", "小七孔景区"],
    "西江千户苗寨": ["西江", "西江苗寨", "西江千户苗寨"],
    "镇远古城": ["镇远", "镇远古镇", "镇远古城"],
    "梵净山": ["梵净山"],
    "青岩古镇": ["青岩", "青岩古镇"],
    "百里杜鹃": ["百里杜鹃"],
    "平坝樱花": ["平坝樱花", "平坝农场"],
    "织金洞": ["织金洞"],
    "中国天眼": ["天眼", "中国天眼", "FAST"],
    "茅台镇": ["茅台", "茅台镇"],
    "遵义会议会址": ["遵义会址", "遵义会议会址"],
    "兴义万峰林": ["万峰林", "兴义万峰林"],
    "万峰湖": ["万峰湖"],
    "马岭河峡谷": ["马岭河", "马岭河峡谷"],
    "花江大桥": ["花江大桥"],
    "龙宫": ["龙宫"],
    "天河潭": ["天河潭"],
    "甲秀楼": ["甲秀楼"],
    "黔灵山公园": ["黔灵公园", "黔灵山"],
    "乌江寨": ["乌江寨"],
}

# Single-character Chinese numerals shared by the duration and day parsers.
_CN_DIGITS = {
    "一": 1, "二": 2, "两": 2, "三": 3, "四": 4, "五": 5,
    "六": 6, "七": 7, "八": 8, "九": 9, "十": 10,
}

# Sentence delimiters: ASCII and full-width (U+FF01 '!', U+FF1F '?', U+FF1B ';')
# forms, written as escapes so they survive Unicode normalisation of this file.
_SENTENCE_SPLIT_RE = re.compile(r"[。!\uff01?\uff1f;\uff1b\n]+")

# Section headings that terminate the last day's body in the detailed itinerary.
_DAY_END_TOKENS = [
    "· 接待标准", "接待标准", "· 费用包含", "费用包含",
    "团费包含", "参团须知", "温馨提示",
]


def _cn_number(token: str) -> int | None:
    """Parse a Chinese numeral between 1 and 99 (e.g. ``十二`` -> 12).

    Returns ``None`` when *token* is not a numeral of that shape.
    """
    if token in _CN_DIGITS:
        return _CN_DIGITS[token]
    tens, sep, units = token.partition("十")
    if not sep:
        return None
    # A leading "十" means one ten; a trailing "十" means zero units.
    tens_value = _CN_DIGITS.get(tens) if tens else 1
    units_value = _CN_DIGITS.get(units) if units else 0
    if tens_value is None or units_value is None:
        return None
    if tens_value >= 10 or units_value >= 10:
        return None
    return tens_value * 10 + units_value


def clean(value: Any) -> str:
    """Stringify *value* and normalise whitespace.

    Removes NUL, zero-width space and right-to-left mark characters, collapses
    runs of spaces/tabs into one space, caps consecutive newlines at three and
    strips the ends. ``None`` becomes the empty string.
    """
    if value is None:
        return ""
    text = str(value).replace("\x00", "").replace("\u200b", "").replace("\u200f", "")
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{4,}", "\n\n\n", text)
    return text.strip()


def compact(value: Any) -> str:
    """Return ``clean(value)`` with every whitespace run reduced to one space."""
    return re.sub(r"\s+", " ", clean(value)).strip()


def chinese_count(text: str) -> int:
    """Count characters of *text* in the range U+4E00..U+9FFF."""
    return len(re.findall(r"[\u4e00-\u9fff]", text))


def read_legacy_doc_text(path: Path) -> str:
    """Fallback for old WPS/Word .doc files that macOS textutil cannot decode.

    Decodes the raw bytes as UTF-16LE, skips ahead to the earliest known
    anchor string (if any), keeps at most 50000 characters and strips control
    characters and runs that look like decoding noise.
    """
    data = path.read_bytes()
    decoded = data.decode("utf-16le", errors="ignore").replace("\x00", "")
    anchors = [
        path.stem,
        "❀",
        "贵客黔游",
        "推荐理由",
        "推 荐 理 由",
        "简易行程",
        "行程安排",
    ]
    positions = [pos for pos in map(decoded.find, anchors) if pos >= 0]
    if positions:
        decoded = decoded[min(positions):]
    decoded = decoded[:50000]
    decoded = decoded.replace("\r", "\n")
    # Drop control characters but keep \x07 (used later as a cell separator)
    # and \n.
    decoded = re.sub(r"[\x01-\x06\x08-\x09\x0b-\x1f]", "", decoded)
    # NOTE(review): U+4000..U+4FFF also contains some common Han characters;
    # runs of 8+ are presumably treated as binary noise -- confirm.
    decoded = re.sub(r"[\u4000-\u4fff]{8,}", "", decoded)
    decoded = re.sub(r"[\uffff]{2,}", "", decoded)
    decoded = re.sub(r"\n{3,}", "\n\n", decoded)
    return clean(decoded)


def read_office_text(path: Path) -> str:
    """Extract plain text from a Word document.

    Tries ``textutil -convert txt -stdout`` first and accepts its output when
    it contains at least 30 Chinese characters. Otherwise falls back to
    :func:`read_legacy_doc_text`. If neither yields enough Chinese text, a
    ``[textutil读取失败] ...`` marker is returned when ``textutil`` failed or
    could not be started, else the raw ``textutil`` output.
    """
    proc = None
    launch_error = ""
    try:
        proc = subprocess.run(
            ["textutil", "-convert", "txt", "-stdout", str(path)],
            check=False,
            capture_output=True,
            text=True,
        )
    except OSError as exc:
        # textutil is not installed or not executable on this host; the raw
        # decoder below can still recover text from legacy .doc files.
        launch_error = str(exc)

    if proc is not None and proc.returncode == 0 and chinese_count(proc.stdout) >= 30:
        return proc.stdout.replace("\x00", "")

    fallback = read_legacy_doc_text(path)
    if chinese_count(fallback) >= 30:
        return fallback
    if proc is None:
        return f"[textutil读取失败] {launch_error}"
    if proc.returncode != 0:
        return f"[textutil读取失败] {proc.stderr}"
    return proc.stdout.replace("\x00", "")


def safe_filename(text: str, index: int) -> str:
    """Build ``NN_<sanitised text>_<md5 prefix>.md`` for a product title.

    Characters ``\\ / : * ? " < > |`` are replaced with ``_``, whitespace is
    removed, the title part is capped at 70 characters, and an 8-hex-digit MD5
    of the original *text* is appended.
    """
    base = re.sub(r"[\\/:*?\"<>|]+", "_", compact(text))
    base = re.sub(r"\s+", "", base)
    base = base.strip("._ ")
    digest = hashlib.md5(text.encode("utf-8")).hexdigest()[:8]
    return f"{index:02d}_{base[:70]}_{digest}.md"


def duration_from_text(text: str) -> int | None:
    """Guess the trip length in days from *text*.

    Arabic-digit forms are tried first, in order: ``N日游``, ``N天``, ``N日``.
    Failing that, a Chinese numeral before ``日游`` is parsed (multi-character
    numerals such as ``十二`` are supported). Returns ``None`` if nothing
    matches.
    """
    for pattern in (r"(\d+)\s*日游", r"(\d+)\s*天", r"(\d+)\s*日"):
        found = re.search(pattern, text)
        if found:
            return int(found.group(1))
    found = re.search(r"([一二两三四五六七八九十]+)日游", text)
    if found:
        numeral = found.group(1)
        value = _cn_number(numeral)
        if value is not None:
            return value
        # Not a well-formed numeral: use the character adjacent to "日游",
        # which is what a single-character match would have captured.
        return _CN_DIGITS.get(numeral[-1])
    return None


def product_family(name: str, text: str) -> str:
    """Classify the product by keywords in *name* and the first 1500 chars of *text*.

    Rules are checked in order and the first hit wins; the default is
    ``常规纯玩``.
    """
    joined = f"{name} {text[:1500]}"
    rules = (
        (("高端", "5钻", "五钻"), "高端纯玩"),
        (("轻奢", "2+1", "保姆车", "头等舱"), "轻奢纯玩"),
        (("多彩",), "多彩贵州"),
        (("经典",), "经典纯玩"),
        (("1+1", "游黔途", "游黔程"), "游黔途/游黔程"),
        (("独立", "20-25"), "独立成团"),
    )
    for keywords, label in rules:
        if any(keyword in joined for keyword in keywords):
            return label
    return "常规纯玩"


def hotel_grade_from_text(text: str) -> str:
    """Return a hotel-grade label found in *text*, or ``""`` if none matches."""
    if any(x in text for x in ("5钻", "五钻", "五星", "超五星")):
        return "5钻/五星"
    if any(x in text for x in ("4钻", "四钻", "四星")):
        return "4钻/四星"
    if "商务" in text:
        return "商务"
    if "客栈" in text:
        return "客栈"
    return ""


def vehicle_from_text(text: str) -> str:
    """Return a vehicle label found in *text*, or ``""`` if none matches.

    Rules are checked in order and the first hit wins.
    """
    rules = (
        (("1+1",), "1+1双排座"),
        (("2+1", "保姆车", "头等舱"), "2+1保姆车/头等舱"),
        (("2+2",), "2+2商务车"),
        (("32-38", "独立成团"), "32-38座2+1大巴"),
        (("旅游大巴", "大巴"), "旅游大巴"),
        (("9座",), "9座商务车"),
        (("7座",), "7座商务车"),
        (("5座",), "5座车"),
    )
    for keywords, label in rules:
        if any(keyword in text for keyword in keywords):
            return label
    return ""


def extract_between(text: str, starts: list[str], ends: list[str], limit: int | None = None) -> str:
    """Return the cleaned slice of *text* from the earliest start token to the
    nearest following end token.

    The slice includes the start token. End tokens are searched from two
    characters past the start position. If no start token occurs, ``""`` is
    returned; if no end token occurs, the slice runs to the end of *text*.
    A truthy *limit* caps the length of the returned string.
    """
    start_pos = -1
    for token in starts:
        pos = text.find(token)
        if pos >= 0 and (start_pos < 0 or pos < start_pos):
            start_pos = pos
    if start_pos < 0:
        return ""

    end_pos = len(text)
    for token in ends:
        pos = text.find(token, start_pos + 2)
        if pos >= 0:
            end_pos = min(end_pos, pos)

    block = clean(text[start_pos:end_pos])
    if limit:
        return block[:limit]
    return block


def parse_simple_itinerary(text: str) -> list[dict[str, str]]:
    """Parse the summary itinerary table into one row per ``D<n>`` cell.

    Looks for a ``简易行程`` block first ("simple" layout: day, route, meals,
    accommodation) and otherwise an ``行程安排`` block ("schedule" layout:
    content at +2, accommodation at +3, meals at +4). Each row has the keys
    ``day``, ``day_index``, ``route``, ``meals`` and ``accommodation``.
    """
    block = extract_between(text, ["简易行程"], ["详细行程", "接待标准", "费用包含", "团费包含"], 2600)
    layout = "simple"
    if not block:
        block = extract_between(text, ["行程安排"], ["接待标准", "费用包含", "团费包含", "特别提醒"], 4200)
        layout = "schedule"
    if not block:
        return []

    # NOTE(review): extract_between() returns clean()-ed text, in which tabs
    # have already been collapsed to spaces, so in practice only \x07 splits.
    cells = [compact(x) for x in re.split(r"[\x07\t]+", block) if compact(x)]

    def cell_at(position: int) -> str:
        return cells[position] if position < len(cells) else ""

    rows: list[dict[str, str]] = []
    for idx, cell in enumerate(cells):
        if not re.fullmatch(r"D\s*\d+", cell, flags=re.I):
            continue
        day = cell.upper().replace(" ", "")
        day_index = re.search(r"\d+", cell).group()
        if layout == "schedule" and idx + 4 < len(cells):
            content = cells[idx + 2]
            attraction_route = "→".join(extract_attractions(content))
            rows.append({
                "day": day,
                "day_index": day_index,
                "route": attraction_route or content[:80],
                "meals": cells[idx + 4],
                "accommodation": cells[idx + 3],
            })
        else:
            rows.append({
                "day": day,
                "day_index": day_index,
                "route": cell_at(idx + 1),
                "meals": cell_at(idx + 2),
                "accommodation": cell_at(idx + 3),
            })
    return rows


def extract_day_segments(text: str) -> list[dict[str, str]]:
    """Split the detailed itinerary into per-day segments.

    Day markers are ``D<n>`` or ``第<numeral>天``. Each returned dict has the
    keys ``day``, ``day_index`` (as a string), ``title`` and ``body``. At most
    12 segments are returned.
    """
    source = extract_between(
        text,
        ["详细行程", "行程安排"],
        ["· 接待标准", "接待标准", "· 费用包含", "费用包含", "团费包含", "参团须知", "· 温馨提示"],
    )
    if len(source) < 50:
        # No usable itinerary section was found; scan the whole document.
        source = text

    matches = list(re.finditer(r"(?m)(D\s*\d+|第[一二三四五六七八九十]+天)[:\uff1a、\s]*", source))
    if not matches:
        return []

    rows: list[dict[str, str]] = []
    for idx, match in enumerate(matches):
        if idx + 1 < len(matches):
            end = matches[idx + 1].start()
        else:
            # NOTE(review): the trailing-section cut is applied to the last
            # segment only -- confirm this matches the intended layout.
            end = len(source)
            for end_token in _DAY_END_TOKENS:
                pos = source.find(end_token, match.start() + 2)
                if pos >= 0:
                    end = min(end, pos)

        marker = match.group(1)
        if marker.startswith("D"):
            day_index = int(re.search(r"\d+", marker).group())
        else:
            numeral = re.search(r"第(.+?)天", marker).group(1)
            parsed = _cn_number(numeral)
            # Fall back to the positional index for numerals we cannot parse.
            day_index = parsed if parsed is not None else idx + 1

        body = clean(source[match.start():end])
        lines = [compact(x) for x in body.splitlines() if compact(x)]
        # The title is the first of the leading lines that is not itself a
        # day marker.
        title = next(
            (
                line
                for line in lines[:4]
                if not line.startswith("第") and not re.fullmatch(r"D\s*\d+", line, flags=re.I)
            ),
            f"D{day_index}",
        )
        rows.append({"day": f"D{day_index}", "day_index": str(day_index), "title": title, "body": body})
    return rows[:12]


def extract_attractions(text: str) -> list[str]:
    """Return canonical attraction names whose aliases occur in *text*.

    The result follows the key order of ``ATTRACTION_ALIASES``.
    """
    found: list[str] = []
    for name, aliases in ATTRACTION_ALIASES.items():
        if any(alias in text for alias in aliases) and name not in found:
            found.append(name)
    return found


def sentence_snippets(text: str, keywords: list[str], limit: int = 20) -> list[str]:
    """Collect distinct sentences of *text* that contain any of *keywords*.

    Sentences are split on Chinese/ASCII sentence punctuation and newlines,
    whitespace-compacted, skipped when shorter than 4 characters and truncated
    to 260 characters. Duplicates are dropped before they count toward
    *limit*, and document order is preserved.
    """
    seen: set[str] = set()
    out: list[str] = []
    for raw in _SENTENCE_SPLIT_RE.split(text):
        line = compact(raw)
        if len(line) < 4:
            continue
        if not any(keyword in line for keyword in keywords):
            continue
        snippet = line[:260]
        if snippet in seen:
            continue
        seen.add(snippet)
        out.append(snippet)
        if len(out) >= limit:
            break
    return out


def md_table(headers: list[str], rows: list[list[Any]]) -> str:
    """Render a Markdown pipe table.

    Cell values are whitespace-compacted and ``|`` is escaped; *headers* are
    emitted as given.
    """
    def cell(value: Any) -> str:
        return compact(value).replace("|", "\\|")

    lines = [
        "| " + " | ".join(headers) + " |",
        "| " + " | ".join(["---"] * len(headers)) + " |",
    ]
    for row in rows:
        lines.append("| " + " | ".join(cell(v) for v in row) + " |")
    return "\n".join(lines)


def code_block(text: str) -> str:
    """Wrap cleaned *text* in a ```text fence, neutralising inner fences."""
    return "```text\n" + clean(text).replace("```", "'''") + "\n```"


def source_files() -> list[Path]:
    """Return the sorted ``.doc``/``.docx`` files directly inside ``SOURCE_DIR``.

    Files whose names start with ``.``, ``~$`` or ``.~`` are skipped.
    """
    return sorted(
        p
        for p in SOURCE_DIR.iterdir()
        if p.is_file()
        and p.suffix.lower() in {".doc", ".docx"}
        and not p.name.startswith((".", "~$", ".~"))
    )


def build_markdown(path: Path, index: int) -> tuple[str, dict[str, Any]]:
    """Build the Markdown document and metadata for one source file.

    Args:
        path: Source Word document.
        index: Position of the file in the batch (not used when building the
            document).

    Returns:
        ``(markdown, meta)`` where *meta* is the front-matter dict extended
        with ``markdown_filename`` (left empty here), ``simple_day_count``,
        ``detailed_day_count`` and ``text_chars``.
    """
    text = read_office_text(path)
    lines = [compact(x) for x in text.splitlines() if compact(x)]
    name = lines[0] if lines else path.stem
    if len(name) < 3 or "INCLUDEPICTURE" in name:
        # First line is too short or is a Word field code; use the file name.
        name = path.stem

    duration = duration_from_text(f"{path.stem} {name} {text[:300]}")
    family = product_family(name, text)
    vehicle = vehicle_from_text(f"{path.stem} {text[:2200]}")
    hotel_grade = hotel_grade_from_text(f"{path.stem} {text[:2600]}")
    simple_rows = parse_simple_itinerary(text)
    day_segments = extract_day_segments(text)

    route_text = (
        " ".join(row.get("route", "") for row in simple_rows)
        or " ".join(row.get("title", "") for row in day_segments)
    )
    attractions = extract_attractions(f"{name} {path.stem} {route_text}")
    if not attractions:
        attractions = extract_attractions(" ".join(day.get("body", "")[:1200] for day in day_segments))

    fees = sentence_snippets(
        text,
        ["费用", "不含", "自理", "电瓶车", "观光车", "环保车", "保险", "扶梯", "索道", "餐标", "单房差", "儿童价"],
        limit=30,
    )
    rules = sentence_snippets(
        text,
        ["老人", "儿童", "学生", "军人", "退团", "退费", "不可抗力", "预约", "投诉", "意见单", "满房", "同级", "孕妇", "不接待"],
        limit=30,
    )
    hotel_block = extract_between(text, ["酒店"], ["导游服务", "购物", "意见单填写", "温馨提示"], 3200)
    gift_block = extract_between(text, ["赠送服务", "赠送:"], ["简易行程", "详细行程", "接待标准"], 1800)
    reception_block = extract_between(text, ["接待标准"], ["温馨提示", "· 温馨提示"], 4200)

    frontmatter = {
        "doc_type": "existing_route_product_markdown",
        "schema_target": "travel_agency_existing_product",
        "route_immutable": True,
        "source_file": str(path),
        "source_filename": path.name,
        "product_name": name,
        "duration_days": duration,
        "product_family": family,
        "default_vehicle_type": vehicle,
        "default_hotel_grade": hotel_grade,
        "core_attractions": attractions,
    }

    # Route skeleton rows: prefer the summary table, else the day titles.
    rows = [[r["day"], r.get("route", ""), r.get("meals", ""), r.get("accommodation", "")] for r in simple_rows]
    if not rows:
        rows = [[r["day"], r.get("title", ""), "", ""] for r in day_segments]

    parts: list[str] = []
    parts.append("---")
    parts.append(json.dumps(frontmatter, ensure_ascii=False, indent=2))
    parts.append("---")
    parts.append(f"# {name}")
    parts.append("")
    parts.append("## 1. 产品识别")
    parts.append(md_table(
        ["字段", "值"],
        [
            ["源文件", str(path)],
            ["产品名称", name],
            ["天数", duration or ""],
            ["产品系列/类型", family],
            ["默认车型", vehicle],
            ["默认酒店等级", hotel_grade],
            ["路线是否固定", "是;客户微调只能改资源槽位,不能改天数、景点顺序、城市移动路径"],
            ["核心景点候选", "、".join(attractions)],
        ],
    ))
    parts.append("")
    parts.append("## 2. 固定路线骨架")
    parts.append("")
    parts.append("抽取目标:`TourProduct -> ProductDay -> RouteStop / RouteSegment`。")
    parts.append("")
    parts.append(md_table(["天数", "路线/标题", "用餐", "住宿"], rows))
    parts.append("")
    parts.append("## 3. 每日详细行程")
    parts.append("")
    parts.append("抽取目标:每天生成 `ProductDay`,从正文识别真实 `RouteStop` 和 `RouteSegment`;不要把费用说明里的景点当作真实停靠点。")
    for day in day_segments:
        idx = day["day_index"]
        # Pair the detailed day with its summary-table row, when one exists.
        simple = next((row for row in simple_rows if row["day_index"] == idx), {})
        parts.append("")
        parts.append(f"### {day['day']} {simple.get('route') or day.get('title') or ''}")
        parts.append("")
        parts.append(md_table(
            ["字段", "值"],
            [
                ["day_index", idx],
                ["route_path", simple.get("route", "")],
                ["meal_text", simple.get("meals", "")],
                ["accommodation_text", simple.get("accommodation", "")],
            ],
        ))
        parts.append("")
        parts.append(code_block(day["body"]))
    parts.append("")
    parts.append("## 4. 可配置资源槽位候选")
    parts.append("")
    parts.append("抽取目标:`ResourceSlot -> ResourceOptionGroup -> Hotel/Restaurant/Vehicle/TicketFee/GiftService`。")
    parts.append("")
    parts.append("### 4.1 住宿槽位候选")
    parts.append("")
    if hotel_block:
        parts.append(code_block(hotel_block))
    else:
        parts.append("- 原文未明确独立酒店参考段;可从每日住宿列生成住宿槽位。")
    parts.append("")
    parts.append("### 4.2 餐饮槽位候选")
    parts.append("")
    meal_lines = sentence_snippets(text, ["用餐", "餐标", "早餐", "中餐", "晚餐", "长桌宴", "酸汤鱼"], limit=20)
    parts.extend(f"- {line}" for line in meal_lines or ["原文未明确独立餐饮段;可从每日用餐列生成餐饮槽位。"])
    parts.append("")
    parts.append("### 4.3 车辆/交通槽位候选")
    parts.append("")
    traffic_lines = sentence_snippets(
        text,
        ["交通", "车型", "用车", "车辆", "保姆车", "商务车", "大巴", "接人", "送站", "接站"],
        limit=20,
    )
    parts.extend(f"- {line}" for line in traffic_lines or ["原文未明确独立车辆段;可从产品名和行程交通描述识别默认车型。"])
    parts.append("")
    parts.append("### 4.4 门票/小交通/保险槽位候选")
    parts.append("")
    ticket_lines = sentence_snippets(
        text,
        ["门票", "观光车", "环保车", "电瓶车", "保险", "扶梯", "索道", "游船", "小交通", "景交"],
        limit=30,
    )
    parts.extend(f"- {line}" for line in ticket_lines or ["原文未明确门票小交通段。"])
    parts.append("")
    parts.append("### 4.5 赠送服务槽位候选")
    parts.append("")
    if gift_block:
        parts.append(code_block(gift_block))
    else:
        gift_lines = sentence_snippets(text, ["赠送", "旅拍", "矿泉水", "长桌宴", "高山流水", "打糍粑"], limit=12)
        parts.extend(f"- {line}" for line in gift_lines or ["原文未明确赠送服务。"])
    parts.append("")
    parts.append("## 5. 费用与规则候选")
    parts.append("")
    parts.append("抽取目标:`ProductPricePackage`、`TicketFee`、`FeeItem`、`BusinessRule`。")
    parts.append("")
    parts.append("### 5.1 费用候选")
    parts.extend(f"- {line}" for line in fees or ["未识别到明显费用候选。"])
    parts.append("")
    parts.append("### 5.2 业务规则候选")
    parts.extend(f"- {line}" for line in rules or ["未识别到明显规则候选。"])
    if reception_block:
        parts.append("")
        parts.append("### 5.3 接待标准/服务规则原文")
        parts.append(code_block(reception_block))
    parts.append("")
    parts.append("## 6. 原文保留")
    parts.append("")
    parts.append("后续抽取如果结构化段落不够,可回到本段原文补证据。")
    parts.append("")
    parts.append(code_block(text))

    markdown = "\n".join(parts).strip() + "\n"
    meta = {
        **frontmatter,
        "markdown_filename": "",
        "simple_day_count": len(simple_rows),
        "detailed_day_count": len(day_segments),
        "text_chars": len(clean(text)),
    }
    return markdown, meta


def write_index(items: list[dict[str, Any]]) -> None:
    """Write the README index and the JSON product index into ``OUT_DIR``.

    *items* are the metadata dicts produced by :func:`build_markdown`, with
    ``markdown_filename`` already filled in.
    """
    summary_rows = [
        [
            item["product_name"],
            item.get("duration_days") or "",
            item.get("product_family") or "",
            item.get("default_vehicle_type") or "",
            item.get("default_hotel_grade") or "",
            "、".join(item.get("core_attractions") or []),
            item["markdown_filename"],
        ]
        for item in items
    ]
    counts = Counter(item.get("duration_days") for item in items)
    family_counts = Counter(item.get("product_family") for item in items)
    # Sort known durations ascending and put the unknown (None) bucket last.
    sorted_counts = sorted(counts.items(), key=lambda x: (x[0] is None, x[0] or 0))
    lines = [
        "# 已有路线产品 Markdown 整理索引",
        "",
        f"生成时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        f"- 源目录:`{SOURCE_DIR}`",
        f"- 输出目录:`{OUT_DIR}`",
        f"- 产品文档数:{len(items)}",
        "",
        "## 按天数统计",
        "",
        *[f"- {day or '未知'}天:{count} 个" for day, count in sorted_counts],
        "",
        "## 按产品类型统计",
        "",
        *[f"- {name}:{count} 个" for name, count in family_counts.most_common()],
        "",
        "## 产品索引",
        "",
        md_table(["产品名称", "天数", "产品类型", "默认车型", "默认酒店", "核心景点候选", "Markdown文件"], summary_rows),
        "",
        "## 后续抽取建议",
        "",
        "- 先抽取 `TourProduct`、`ProductDay`、`RouteStop`、`RouteSegment`,确保路线骨架稳定。",
        "- 再抽取 `ResourceSlot`,把住宿、餐饮、车辆、接送、门票小交通、赠送服务作为可配置槽位。",
        "- 产品文档里“费用说明/自理项目/不含”出现的景点名称,不要抽成真实路线停靠点。",
        "- 酒店参考段优先抽成 `ResourceOptionGroup`,不要强行抽成唯一入住酒店。",
        "- 规则要挂到影响对象:退改挂产品/价格包,优惠挂门票费用,少走路/预约风险挂景点或停靠点,可替换规则挂资源槽位。",
    ]
    (OUT_DIR / "README_已有路线产品md整理.md").write_text("\n".join(lines), encoding="utf-8")
    (OUT_DIR / "产品索引.json").write_text(json.dumps(items, ensure_ascii=False, indent=2), encoding="utf-8")


def main() -> dict[str, Any]:
    """Convert every source document, write the index and mirror the output.

    Returns:
        A summary dict (source/output directories, product count, generation
        time) that is also written to ``生成摘要.json`` in both output roots.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    GRAPH_OUT_DIR.mkdir(parents=True, exist_ok=True)
    product_dir = OUT_DIR / "products"
    product_dir.mkdir(parents=True, exist_ok=True)

    items: list[dict[str, Any]] = []
    for index, path in enumerate(source_files(), start=1):
        markdown, meta = build_markdown(path, index)
        filename = safe_filename(meta["product_name"] or path.stem, index)
        meta["markdown_filename"] = f"products/{filename}"
        (product_dir / filename).write_text(markdown, encoding="utf-8")
        items.append(meta)
    write_index(items)

    # Keep a synchronized copy under 图谱数据 for extraction experiments.
    for src in [OUT_DIR / "README_已有路线产品md整理.md", OUT_DIR / "产品索引.json"]:
        (GRAPH_OUT_DIR / src.name).write_text(src.read_text(encoding="utf-8"), encoding="utf-8")
    graph_products = GRAPH_OUT_DIR / "products"
    graph_products.mkdir(parents=True, exist_ok=True)
    # Every *.md currently in products/ is copied, including files left over
    # from earlier runs.
    for md in product_dir.glob("*.md"):
        (graph_products / md.name).write_text(md.read_text(encoding="utf-8"), encoding="utf-8")

    summary = {
        "source_dir": str(SOURCE_DIR),
        "output_dir": str(OUT_DIR),
        "graph_output_dir": str(GRAPH_OUT_DIR),
        "product_markdown_count": len(items),
        "generated_at": datetime.now().isoformat(timespec="seconds"),
    }
    summary_json = json.dumps(summary, ensure_ascii=False, indent=2)
    (OUT_DIR / "生成摘要.json").write_text(summary_json, encoding="utf-8")
    (GRAPH_OUT_DIR / "生成摘要.json").write_text(summary_json, encoding="utf-8")
    return summary


if __name__ == "__main__":
    print(json.dumps(main(), ensure_ascii=False, indent=2))