# bxh/scripts/organize_existing_routes_to_md.py

from __future__ import annotations

import hashlib
import json
import re
import subprocess
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Any

from common_paths import TRAVEL_AGENCY_SOURCE_ROOT, TRAVEL_KG_EXPORT_ROOT

# Directory scanned by source_files() for the original .doc/.docx route documents.
SOURCE_DIR = TRAVEL_AGENCY_SOURCE_ROOT / "2026年新行程打包"
# Primary output directory: per-product Markdown, README index, JSON index and summary.
OUT_DIR = TRAVEL_AGENCY_SOURCE_ROOT / "2026年新行程打包_md整理"
# Secondary output directory; main() mirrors the generated files here.
GRAPH_OUT_DIR = TRAVEL_KG_EXPORT_ROOT / "旅行社项目入库/已有路线产品Markdown"


# Canonical attraction name -> substrings that count as a mention of it.
# extract_attractions() reports the canonical name (in this insertion order)
# when any of its aliases occurs in the scanned text.
ATTRACTION_ALIASES = {
    "黄果树": ["黄果树", "黄果树瀑布", "黄果树大瀑布"],
    "天星桥": ["天星桥", "天星桥景区"],
    "陡坡塘瀑布": ["陡坡塘", "陡坡塘瀑布"],
    "荔波小七孔": ["小七孔", "荔波小七孔", "小七孔景区"],
    "西江千户苗寨": ["西江", "西江苗寨", "西江千户苗寨"],
    "镇远古城": ["镇远", "镇远古镇", "镇远古城"],
    "梵净山": ["梵净山"],
    "青岩古镇": ["青岩", "青岩古镇"],
    "百里杜鹃": ["百里杜鹃"],
    "平坝樱花": ["平坝樱花", "平坝农场"],
    "织金洞": ["织金洞"],
    "中国天眼": ["天眼", "中国天眼", "FAST"],
    "茅台镇": ["茅台", "茅台镇"],
    "遵义会议会址": ["遵义会址", "遵义会议会址"],
    "兴义万峰林": ["万峰林", "兴义万峰林"],
    "万峰湖": ["万峰湖"],
    "马岭河峡谷": ["马岭河", "马岭河峡谷"],
    "花江大桥": ["花江大桥"],
    "龙宫": ["龙宫"],
    "天河潭": ["天河潭"],
    "甲秀楼": ["甲秀楼"],
    "黔灵山公园": ["黔灵公园", "黔灵山"],
    "乌江寨": ["乌江寨"],
}


def clean(value: Any) -> str:
    """Normalize a value to tidy text.

    ``None`` becomes the empty string. Otherwise the value is stringified,
    NUL / zero-width-space / right-to-left-mark characters are removed, runs
    of spaces and tabs collapse to one space, runs of four or more newlines
    shrink to three, and surrounding whitespace is stripped.
    """
    if value is None:
        return ""
    cleaned = str(value)
    for invisible in ("\x00", "\u200b", "\u200f"):
        cleaned = cleaned.replace(invisible, "")
    cleaned = re.sub(r"\n{4,}", "\n\n\n", re.sub(r"[ \t]+", " ", cleaned))
    return cleaned.strip()


def compact(value: Any) -> str:
    """Return the cleaned text with every whitespace run squeezed to one space."""
    single_spaced = re.sub(r"\s+", " ", clean(value))
    return single_spaced.strip()


def chinese_count(text: str) -> int:
    """Count the characters of *text* that fall in the U+4E00..U+9FFF range."""
    return sum(1 for _ in re.finditer(r"[\u4e00-\u9fff]", text))


def read_legacy_doc_text(path: Path) -> str:
    """Fallback for old WPS/Word .doc files that macOS textutil cannot decode.

    The raw bytes are decoded as UTF-16LE (undecodable bytes ignored), the
    text is trimmed to start at the earliest known anchor phrase, capped at
    50000 characters, and stripped of control characters and junk runs.

    Args:
        path: The .doc file to read.

    Returns:
        The best-effort extracted text, passed through ``clean``.
    """
    data = path.read_bytes()
    decoded = data.decode("utf-16le", errors="ignore").replace("\x00", "")
    anchors = [
        path.stem,
        "❀",
        "贵客黔游",
        "推荐理由",
        "推 荐 理 由",
        "简易行程",
        "行程安排",
    ]
    # Start from the earliest anchor that occurs, skipping the binary header.
    positions = [pos for pos in (decoded.find(anchor) for anchor in anchors) if pos >= 0]
    if positions:
        decoded = decoded[min(positions):]
    decoded = decoded[:50000]
    decoded = decoded.replace("\r", "\n")
    # Drop control characters but keep \x07 and \n; \x07 is later used as a
    # cell separator by parse_simple_itinerary.
    decoded = re.sub(r"[\x01-\x06\x08-\x09\x0b-\x1f]", "", decoded)
    # Long runs in this code-point range are treated as mis-decoded binary.
    decoded = re.sub(r"[䀀-俿]{8,}", "", decoded)
    # NOTE(review): the previous pattern here was an empty character class
    # (`[]{2,}`), which `re` rejects as an unterminated set. Runs of Private
    # Use Area characters are the presumed target -- confirm.
    decoded = re.sub(r"[\ue000-\uf8ff]{2,}", "", decoded)
    decoded = re.sub(r"\n{3,}", "\n\n", decoded)
    return clean(decoded)


def read_office_text(path: Path) -> str:
    """Extract plain text from an Office document.

    ``textutil`` is tried first. Its output is accepted when the command
    succeeds and yields at least 30 Chinese characters. Otherwise the raw
    UTF-16 fallback reader is tried with the same threshold. If both are
    insufficient, a ``[textutil读取失败]`` marker is returned when textutil
    failed or could not be started, else whatever textutil printed.

    Args:
        path: The .doc/.docx file to read.

    Returns:
        Extracted text, or a failure-marker string.
    """
    launch_error = ""
    try:
        proc = subprocess.run(
            ["textutil", "-convert", "txt", "-stdout", str(path)],
            check=False,
            capture_output=True,
            # Decode explicitly instead of relying on the locale encoding.
            # NOTE(review): textutil's plain-text output is assumed to be
            # UTF-8 (its documented default) -- confirm.
            encoding="utf-8",
            errors="replace",
        )
    except OSError as exc:
        # textutil missing or not executable: still give the raw decoder a
        # chance instead of aborting the whole run.
        proc = None
        launch_error = str(exc)

    if proc is not None and proc.returncode == 0 and chinese_count(proc.stdout) >= 30:
        return proc.stdout.replace("\x00", "")
    fallback = read_legacy_doc_text(path)
    if chinese_count(fallback) >= 30:
        return fallback
    if proc is None:
        return f"[textutil读取失败] {launch_error}"
    if proc.returncode != 0:
        return f"[textutil读取失败] {proc.stderr}"
    return proc.stdout.replace("\x00", "")


def safe_filename(text: str, index: int) -> str:
    """Build a ``NN_<name>_<hash>.md`` filename from free text.

    Characters that are unsafe in filenames become ``_``, all whitespace is
    removed, and the name part is capped at 70 characters. The 8-hex-digit
    suffix is the MD5 of the original *text*.
    """
    digest = hashlib.md5(text.encode("utf-8")).hexdigest()[:8]
    stem = re.sub(r"[\\/:*?\"<>|]+", "_", compact(text))
    stem = re.sub(r"\s+", "", stem).strip("._ ")
    return f"{index:02d}_{stem[:70]}_{digest}.md"


def duration_from_text(text: str) -> int | None:
    for pattern in [r"(\d+)\s*日游", r"(\d+)\s*天", r"(\d+)\s*日"]:
        m = re.search(pattern, text)
        if m:
            return int(m.group(1))
    cn = {"一": 1, "二": 2, "两": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9, "十": 10}
    m = re.search(r"([一二两三四五六七八九十])日游", text)
    if m:
        return cn.get(m.group(1))
    return None


def product_family(name: str, text: str) -> str:
    """Classify a product into a family by keyword.

    The product name plus the first 1500 characters of *text* are scanned.
    Rules are checked in order and the first rule with any matching keyword
    decides; ``常规纯玩`` is the default.
    """
    haystack = f"{name} {text[:1500]}"
    rules = (
        (("高端", "5钻", "五钻"), "高端纯玩"),
        (("轻奢", "2+1", "保姆车", "头等舱"), "轻奢纯玩"),
        (("多彩",), "多彩贵州"),
        (("经典",), "经典纯玩"),
        (("1+1", "游黔途", "游黔程"), "游黔途/游黔程"),
        (("独立", "20-25"), "独立成团"),
    )
    for keywords, label in rules:
        if any(keyword in haystack for keyword in keywords):
            return label
    return "常规纯玩"


def hotel_grade_from_text(text: str) -> str:
    """Return a hotel-grade label for the first matching keyword group, or ``""``."""
    grade_rules = (
        (("5钻", "五钻", "五星", "超五星"), "5钻/五星"),
        (("4钻", "四钻", "四星"), "4钻/四星"),
        (("商务",), "商务"),
        (("客栈",), "客栈"),
    )
    for markers, grade in grade_rules:
        for marker in markers:
            if marker in text:
                return grade
    return ""


def vehicle_from_text(text: str) -> str:
    """Return a vehicle-type label for the first matching keyword group, or ``""``.

    Groups are checked in the listed order, so e.g. ``1+1`` outranks ``2+1``.
    """
    vehicle_rules = (
        (("1+1",), "1+1双排座"),
        (("2+1", "保姆车", "头等舱"), "2+1保姆车/头等舱"),
        (("2+2",), "2+2商务车"),
        (("32-38", "独立成团"), "32-38座2+1大巴"),
        (("旅游大巴", "大巴"), "旅游大巴"),
        (("9座",), "9座商务车"),
        (("7座",), "7座商务车"),
        (("5座",), "5座车"),
    )
    for markers, label in vehicle_rules:
        if any(marker in text for marker in markers):
            return label
    return ""


def extract_between(text: str, starts: list[str], ends: list[str], limit: int | None = None) -> str:
    """Cut the span from the earliest start token to the nearest end token.

    The span begins at the smallest position where any of *starts* occurs.
    End tokens are searched from two characters past that position, and the
    closest one (or the end of *text*) closes the span. The span is cleaned
    and, when *limit* is truthy, truncated to *limit* characters. Returns
    ``""`` when no start token is present.
    """
    start_hits = [pos for pos in map(text.find, starts) if pos >= 0]
    if not start_hits:
        return ""
    begin = min(start_hits)
    end_hits = [pos for pos in (text.find(token, begin + 2) for token in ends) if pos >= 0]
    finish = min(end_hits, default=len(text))
    section = clean(text[begin:finish])
    return section[:limit] if limit else section


def parse_simple_itinerary(text: str) -> list[dict[str, str]]:
    """Parse the per-day overview table into row dicts.

    The ``简易行程`` section is preferred ("simple" layout: day, route, meals,
    accommodation). When absent, the ``行程安排`` section is used ("schedule"
    layout, where the cells at offsets 2, 3 and 4 after the day marker are
    read as content, accommodation and meals). A cell that is exactly
    ``D<number>`` starts a row.

    Args:
        text: Full document text.

    Returns:
        One dict per day marker with keys ``day``, ``day_index``, ``route``,
        ``meals`` and ``accommodation``; empty list when no section is found.
    """
    block = extract_between(text, ["简易行程"], ["详细行程", "接待标准", "费用包含", "团费包含"], 2600)
    layout = "simple"
    if not block:
        block = extract_between(text, ["行程安排"], ["接待标准", "费用包含", "团费包含", "特别提醒"], 4200)
        layout = "schedule"
    if not block:
        return []
    # NOTE(review): extract_between() returns clean()-ed text, in which tabs
    # have already been collapsed to spaces, so in practice only \x07 splits.
    cells = [cell for cell in (compact(x) for x in re.split(r"[\x07\t]+", block)) if cell]

    def cell_at(position: int) -> str:
        """Return the cell at *position*, or "" when the table is truncated."""
        return cells[position] if position < len(cells) else ""

    rows: list[dict[str, str]] = []
    for idx, cell in enumerate(cells):
        if not re.fullmatch(r"D\s*\d+", cell, flags=re.I):
            continue
        row = {
            "day": cell.upper().replace(" ", ""),
            "day_index": re.search(r"\d+", cell).group(),
        }
        if layout == "schedule":
            # Keep the schedule column order even for a truncated last row;
            # previously such rows fell back to the simple-layout columns and
            # the content cell ended up in "meals".
            content = cell_at(idx + 2)
            row["route"] = "→".join(extract_attractions(content)) or content[:80]
            row["meals"] = cell_at(idx + 4)
            row["accommodation"] = cell_at(idx + 3)
        else:
            row["route"] = cell_at(idx + 1)
            row["meals"] = cell_at(idx + 2)
            row["accommodation"] = cell_at(idx + 3)
        rows.append(row)
    return rows


def _cn_day_number(numeral: str) -> int | None:
    """Convert a Chinese numeral below 100 (e.g. 三, 十, 十二, 二十一) to an int, else None."""
    digits = {"一": 1, "二": 2, "两": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
    if numeral in digits:
        return digits[numeral]
    if "十" not in numeral:
        return None
    tens, _, units = numeral.partition("十")
    if (tens and tens not in digits) or (units and units not in digits):
        return None
    return (digits[tens] if tens else 1) * 10 + (digits[units] if units else 0)


def extract_day_segments(text: str) -> list[dict[str, str]]:
    """Split the detailed itinerary into per-day segments.

    The section between ``详细行程``/``行程安排`` and the first service/fee
    heading is used; when that section is shorter than 50 characters the
    whole text is scanned instead. Each ``D<number>`` or ``第<numeral>天``
    marker opens a segment that runs to the next marker (the last one stops
    at the first trailing section heading, if any).

    Args:
        text: Full document text.

    Returns:
        At most 12 dicts with keys ``day``, ``day_index``, ``title`` and
        ``body``; empty list when no day marker is found.
    """
    source = extract_between(
        text,
        ["详细行程", "行程安排"],
        ["· 接待标准", "接待标准", "· 费用包含", "费用包含", "团费包含", "参团须知", "· 温馨提示"],
    )
    if len(source) < 50:
        source = text
    matches = list(re.finditer(r"(?m)(D\s*\d+|第[一二三四五六七八九十]+天)[：:、\s]*", source))
    if not matches:
        return []
    tail_tokens = ("· 接待标准", "接待标准", "· 费用包含", "费用包含", "团费包含", "参团须知", "温馨提示")
    rows: list[dict[str, str]] = []
    for idx, match in enumerate(matches):
        if idx + 1 < len(matches):
            end = matches[idx + 1].start()
        else:
            # Last day: stop before any trailing section heading.
            end = len(source)
            for tail in tail_tokens:
                pos = source.find(tail, match.start() + 2)
                if pos >= 0:
                    end = min(end, pos)
        token = match.group(1)
        if token.startswith("D"):
            day_index = int(re.search(r"\d+", token).group())
        else:
            # Compound numerals such as 十一 are parsed too; the match
            # position remains the fallback for anything unparseable.
            parsed = _cn_day_number(re.search(r"第(.+?)天", token).group(1))
            day_index = parsed if parsed is not None else idx + 1
        body = clean(source[match.start():end])
        lines = [line for line in (compact(x) for x in body.splitlines()) if line]
        # Title: first of the leading four lines that is not a bare day marker.
        title = next((line for line in lines[:4] if not line.startswith("第") and not re.fullmatch(r"D\s*\d+", line, flags=re.I)), f"D{day_index}")
        rows.append({"day": f"D{day_index}", "day_index": str(day_index), "title": title, "body": body})
    return rows[:12]


def extract_attractions(text: str) -> list[str]:
    """List canonical attraction names whose aliases occur in *text*.

    Names come back in ``ATTRACTION_ALIASES`` order, each at most once.
    """
    return [
        canonical
        for canonical, aliases in ATTRACTION_ALIASES.items()
        if any(alias in text for alias in aliases)
    ]


def sentence_snippets(text: str, keywords: list[str], limit: int = 20) -> list[str]:
    """Collect distinct sentences that mention any keyword.

    *text* is split on Chinese/ASCII sentence punctuation and newlines.
    Fragments shorter than 4 characters after compaction are skipped, and
    each kept fragment is truncated to 260 characters.

    Args:
        text: Text to scan.
        keywords: Substrings; a fragment qualifies when it contains any.
        limit: Maximum number of distinct snippets to return.

    Returns:
        Up to *limit* unique snippets in document order.
    """
    snippets: list[str] = []
    seen: set[str] = set()
    for raw in re.split(r"[。！？；;\n]+", text):
        if len(snippets) >= limit:
            break
        line = compact(raw)
        if len(line) < 4:
            continue
        if not any(keyword in line for keyword in keywords):
            continue
        snippet = line[:260]
        # Deduplicate while collecting so repeated sentences no longer use up
        # the limit and crowd out later unique ones.
        if snippet in seen:
            continue
        seen.add(snippet)
        snippets.append(snippet)
    return snippets


def md_table(headers: list[str], rows: list[list[Any]]) -> str:
    """Render a Markdown pipe table.

    Header cells are emitted as given. Body cells are compacted to one line
    and have ``|`` escaped as ``\\|``.
    """
    def render(values: Any) -> str:
        return "| " + " | ".join(values) + " |"

    table = [render(headers), render(["---"] * len(headers))]
    table.extend(
        render(compact(value).replace("|", "\\|") for value in row)
        for row in rows
    )
    return "\n".join(table)


def code_block(text: str) -> str:
    """Wrap cleaned *text* in a ```text fence, replacing inner triple backticks with '''."""
    fenced_body = clean(text).replace("```", "'''")
    return f"```text\n{fenced_body}\n```"


def source_files() -> list[Path]:
    """Return the sorted .doc/.docx files directly inside ``SOURCE_DIR``.

    Entries whose name starts with ``.``, ``~$`` or ``.~`` are skipped.
    """
    wanted_suffixes = {".doc", ".docx"}
    skipped_prefixes = (".", "~$", ".~")
    documents: list[Path] = []
    for candidate in SOURCE_DIR.iterdir():
        if not candidate.is_file():
            continue
        if candidate.suffix.lower() not in wanted_suffixes:
            continue
        if candidate.name.startswith(skipped_prefixes):
            continue
        documents.append(candidate)
    documents.sort()
    return documents


def build_markdown(path: Path, index: int) -> tuple[str, dict[str, Any]]:
    """Convert one Office route document into a Markdown document plus metadata.

    The text is read with ``read_office_text``; product attributes, itinerary
    rows, day segments and keyword snippets are derived from it with the
    helper heuristics and laid out in six numbered sections, preceded by a
    JSON front-matter block fenced with ``---``.

    Args:
        path: Source .doc/.docx file.
        index: Position of the file in the batch (not used in this body).

    Returns:
        ``(markdown, meta)`` where ``meta`` is the front-matter dict extended
        with ``markdown_filename`` (empty here), ``simple_day_count``,
        ``detailed_day_count`` and ``text_chars``.
    """
    text = read_office_text(path)
    lines = [compact(x) for x in text.splitlines() if compact(x)]
    # Product name: first non-empty line, unless it is too short or contains
    # an INCLUDEPICTURE field, in which case the file stem is used.
    name = lines[0] if lines else path.stem
    if len(name) < 3 or "INCLUDEPICTURE" in name:
        name = path.stem
    # Classification heuristics look at the file stem/name plus a leading
    # slice of the text only.
    duration = duration_from_text(f"{path.stem} {name} {text[:300]}")
    family = product_family(name, text)
    vehicle = vehicle_from_text(f"{path.stem} {text[:2200]}")
    hotel_grade = hotel_grade_from_text(f"{path.stem} {text[:2600]}")
    simple_rows = parse_simple_itinerary(text)
    day_segments = extract_day_segments(text)
    # Prefer routes from the overview table; fall back to day-segment titles.
    route_text = " ".join(row.get("route", "") for row in simple_rows) or " ".join(row.get("title", "") for row in day_segments)
    attractions = extract_attractions(f"{name} {path.stem} {route_text}")
    if not attractions:
        # Last resort: scan the first 1200 characters of each day's body.
        attractions = extract_attractions(" ".join(day.get("body", "")[:1200] for day in day_segments))
    selling_points = sentence_snippets(text, ["核心卖点", "甄选", "赠送", "超值", "纯玩", "0购物", "车型"], limit=12)
    fees = sentence_snippets(text, ["费用", "不含", "自理", "电瓶车", "观光车", "环保车", "保险", "扶梯", "索道", "餐标", "单房差", "儿童价"], limit=30)
    rules = sentence_snippets(text, ["老人", "儿童", "学生", "军人", "退团", "退费", "不可抗力", "预约", "投诉", "意见单", "满房", "同级", "孕妇", "不接待"], limit=30)
    hotel_block = extract_between(text, ["酒店"], ["导游服务", "购物", "意见单填写", "温馨提示"], 3200)
    gift_block = extract_between(text, ["赠送服务", "赠送："], ["简易行程", "详细行程", "接待标准"], 1800)
    reception_block = extract_between(text, ["接待标准"], ["温馨提示", "· 温馨提示"], 4200)

    # Serialized below as indented JSON between the `---` fences.
    frontmatter = {
        "doc_type": "existing_route_product_markdown",
        "schema_target": "travel_agency_existing_product",
        "route_immutable": True,
        "source_file": str(path),
        "source_filename": path.name,
        "product_name": name,
        "duration_days": duration,
        "product_family": family,
        "default_vehicle_type": vehicle,
        "default_hotel_grade": hotel_grade,
        "core_attractions": attractions,
    }
    # Rows for the section-2 table; day-segment titles are the fallback when
    # no overview table was parsed.
    rows = [[r["day"], r.get("route", ""), r.get("meals", ""), r.get("accommodation", "")] for r in simple_rows]
    if not rows:
        rows = [[r["day"], r.get("title", ""), "", ""] for r in day_segments]

    parts: list[str] = []
    parts.append("---")
    parts.append(json.dumps(frontmatter, ensure_ascii=False, indent=2))
    parts.append("---")
    parts.append(f"# {name}")
    parts.append("")
    # Section 1: product identification table.
    parts.append("## 1. 产品识别")
    parts.append(md_table(
        ["字段", "值"],
        [
            ["源文件", str(path)],
            ["产品名称", name],
            ["天数", duration or ""],
            ["产品系列/类型", family],
            ["默认车型", vehicle],
            ["默认酒店等级", hotel_grade],
            ["路线是否固定", "是；客户微调只能改资源槽位，不能改天数、景点顺序、城市移动路径"],
            ["核心景点候选", "、".join(attractions)],
        ],
    ))
    parts.append("")
    # Section 2: fixed route skeleton table.
    parts.append("## 2. 固定路线骨架")
    parts.append("")
    parts.append("抽取目标：`TourProduct -> ProductDay -> RouteStop / RouteSegment`。")
    parts.append("")
    parts.append(md_table(["天数", "路线/标题", "用餐", "住宿"], rows))
    parts.append("")
    # Section 3: one subsection per detailed day segment.
    parts.append("## 3. 每日详细行程")
    parts.append("")
    parts.append("抽取目标：每天生成 `ProductDay`，从正文识别真实 `RouteStop` 和 `RouteSegment`；不要把费用说明里的景点当作真实停靠点。")
    for day in day_segments:
        idx = day["day_index"]
        # Pair the segment with its overview row by day_index (both strings).
        simple = next((row for row in simple_rows if row["day_index"] == idx), {})
        parts.append("")
        parts.append(f"### {day['day']} {simple.get('route') or day.get('title') or ''}")
        parts.append("")
        parts.append(md_table(
            ["字段", "值"],
            [
                ["day_index", idx],
                ["route_path", simple.get("route", "")],
                ["meal_text", simple.get("meals", "")],
                ["accommodation_text", simple.get("accommodation", "")],
            ],
        ))
        parts.append("")
        parts.append(code_block(day["body"]))
    parts.append("")
    # Section 4: configurable resource slot candidates.
    parts.append("## 4. 可配置资源槽位候选")
    parts.append("")
    parts.append("抽取目标：`ResourceSlot -> ResourceOptionGroup -> Hotel/Restaurant/Vehicle/TicketFee/GiftService`。")
    parts.append("")
    parts.append("### 4.1 住宿槽位候选")
    parts.append("")
    if hotel_block:
        parts.append(code_block(hotel_block))
    else:
        parts.append("- 原文未明确独立酒店参考段；可从每日住宿列生成住宿槽位。")
    parts.append("")
    parts.append("### 4.2 餐饮槽位候选")
    parts.append("")
    meal_lines = sentence_snippets(text, ["用餐", "餐标", "早餐", "中餐", "晚餐", "长桌宴", "酸汤鱼"], limit=20)
    parts.extend(f"- {line}" for line in meal_lines or ["原文未明确独立餐饮段；可从每日用餐列生成餐饮槽位。"])
    parts.append("")
    parts.append("### 4.3 车辆/交通槽位候选")
    parts.append("")
    traffic_lines = sentence_snippets(text, ["交通", "车型", "用车", "车辆", "保姆车", "商务车", "大巴", "接人", "送站", "接站"], limit=20)
    parts.extend(f"- {line}" for line in traffic_lines or ["原文未明确独立车辆段；可从产品名和行程交通描述识别默认车型。"])
    parts.append("")
    parts.append("### 4.4 门票/小交通/保险槽位候选")
    parts.append("")
    ticket_lines = sentence_snippets(text, ["门票", "观光车", "环保车", "电瓶车", "保险", "扶梯", "索道", "游船", "小交通", "景交"], limit=30)
    parts.extend(f"- {line}" for line in ticket_lines or ["原文未明确门票小交通段。"])
    parts.append("")
    parts.append("### 4.5 赠送服务槽位候选")
    parts.append("")
    if gift_block:
        parts.append(code_block(gift_block))
    else:
        gift_lines = sentence_snippets(text, ["赠送", "旅拍", "矿泉水", "长桌宴", "高山流水", "打糍粑"], limit=12)
        parts.extend(f"- {line}" for line in gift_lines or ["原文未明确赠送服务。"])
    parts.append("")
    # Section 5: fee and business-rule candidates.
    parts.append("## 5. 费用与规则候选")
    parts.append("")
    parts.append("抽取目标：`ProductPricePackage`、`TicketFee`、`FeeItem`、`BusinessRule`。")
    parts.append("")
    parts.append("### 5.1 费用候选")
    parts.extend(f"- {line}" for line in fees or ["未识别到明显费用候选。"])
    parts.append("")
    parts.append("### 5.2 业务规则候选")
    parts.extend(f"- {line}" for line in rules or ["未识别到明显规则候选。"])
    if reception_block:
        parts.append("")
        parts.append("### 5.3 接待标准/服务规则原文")
        parts.append(code_block(reception_block))
    parts.append("")
    # Section 6: the full source text, kept verbatim inside a code fence.
    parts.append("## 6. 原文保留")
    parts.append("")
    parts.append("后续抽取如果结构化段落不够，可回到本段原文补证据。")
    parts.append("")
    parts.append(code_block(text))
    markdown = "\n".join(parts).strip() + "\n"
    # markdown_filename is left empty here; main() assigns it after naming the file.
    meta = {
        **frontmatter,
        "markdown_filename": "",
        "simple_day_count": len(simple_rows),
        "detailed_day_count": len(day_segments),
        "text_chars": len(clean(text)),
    }
    return markdown, meta


def write_index(items: list[dict[str, Any]]) -> None:
    """Write the README index and the JSON index for all products into ``OUT_DIR``.

    The README lists generation time, source/output directories, counts by
    duration and by product family, a summary table, and extraction advice.
    The JSON file is a dump of *items*.
    """
    table_rows: list[list[Any]] = []
    for item in items:
        table_rows.append([
            item["product_name"],
            item.get("duration_days") or "",
            item.get("product_family") or "",
            item.get("default_vehicle_type") or "",
            item.get("default_hotel_grade") or "",
            "、".join(item.get("core_attractions") or []),
            item["markdown_filename"],
        ])
    day_counts = Counter(item.get("duration_days") for item in items)
    family_counts = Counter(item.get("product_family") for item in items)
    # Known durations ascending, unknown (None) last.
    ordered_days = sorted(day_counts.items(), key=lambda pair: (pair[0] is None, pair[0] or 0))

    report: list[str] = [
        "# 已有路线产品 Markdown 整理索引",
        "",
        f"生成时间：{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        f"- 源目录：`{SOURCE_DIR}`",
        f"- 输出目录：`{OUT_DIR}`",
        f"- 产品文档数：{len(items)}",
        "",
        "## 按天数统计",
        "",
    ]
    report.extend(f"- {day or '未知'}天：{count} 个" for day, count in ordered_days)
    report += ["", "## 按产品类型统计", ""]
    report.extend(f"- {name}：{count} 个" for name, count in family_counts.most_common())
    report += ["", "## 产品索引", ""]
    report.append(
        md_table(["产品名称", "天数", "产品类型", "默认车型", "默认酒店", "核心景点候选", "Markdown文件"], table_rows)
    )
    report += [
        "",
        "## 后续抽取建议",
        "",
        "- 先抽取 `TourProduct`、`ProductDay`、`RouteStop`、`RouteSegment`，确保路线骨架稳定。",
        "- 再抽取 `ResourceSlot`，把住宿、餐饮、车辆、接送、门票小交通、赠送服务作为可配置槽位。",
        "- 产品文档里“费用说明/自理项目/不含”出现的景点名称，不要抽成真实路线停靠点。",
        "- 酒店参考段优先抽成 `ResourceOptionGroup`，不要强行抽成唯一入住酒店。",
        "- 规则要挂到影响对象：退改挂产品/价格包，优惠挂门票费用，少走路/预约风险挂景点或停靠点，可替换规则挂资源槽位。",
    ]

    readme_path = OUT_DIR / "README_已有路线产品md整理.md"
    readme_path.write_text("\n".join(report), encoding="utf-8")
    index_path = OUT_DIR / "产品索引.json"
    index_path.write_text(json.dumps(items, ensure_ascii=False, indent=2), encoding="utf-8")


def main() -> dict[str, Any]:
    """Convert every source document, write the indexes, and mirror the output.

    Per-product Markdown goes to ``OUT_DIR/products``; the README, JSON index
    and all product files are then copied into ``GRAPH_OUT_DIR``. A summary
    JSON is written to both roots and also returned.
    """
    product_dir = OUT_DIR / "products"
    for directory in (OUT_DIR, GRAPH_OUT_DIR, product_dir):
        directory.mkdir(parents=True, exist_ok=True)

    items: list[dict[str, Any]] = []
    for index, path in enumerate(source_files(), start=1):
        markdown, meta = build_markdown(path, index)
        filename = safe_filename(meta["product_name"] or path.stem, index)
        meta["markdown_filename"] = f"products/{filename}"
        (product_dir / filename).write_text(markdown, encoding="utf-8")
        items.append(meta)
    write_index(items)

    # Keep a synchronized copy under the graph-data export directory for
    # extraction experiments.
    for mirrored_name in ("README_已有路线产品md整理.md", "产品索引.json"):
        content = (OUT_DIR / mirrored_name).read_text(encoding="utf-8")
        (GRAPH_OUT_DIR / mirrored_name).write_text(content, encoding="utf-8")
    graph_products = GRAPH_OUT_DIR / "products"
    graph_products.mkdir(parents=True, exist_ok=True)
    for md in product_dir.glob("*.md"):
        content = md.read_text(encoding="utf-8")
        (graph_products / md.name).write_text(content, encoding="utf-8")

    summary = {
        "source_dir": str(SOURCE_DIR),
        "output_dir": str(OUT_DIR),
        "graph_output_dir": str(GRAPH_OUT_DIR),
        "product_markdown_count": len(items),
        "generated_at": datetime.now().isoformat(timespec="seconds"),
    }
    summary_json = json.dumps(summary, ensure_ascii=False, indent=2)
    for root in (OUT_DIR, GRAPH_OUT_DIR):
        (root / "生成摘要.json").write_text(summary_json, encoding="utf-8")
    return summary


# Script entry point: run the conversion and print the summary dict as JSON.
if __name__ == "__main__":
    print(json.dumps(main(), ensure_ascii=False, indent=2))