546 lines
22 KiB
Python
546 lines
22 KiB
Python
from __future__ import annotations
|
||
|
||
import hashlib
|
||
import json
|
||
import re
|
||
import subprocess
|
||
from collections import Counter
|
||
from datetime import datetime
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
from common_paths import TRAVEL_AGENCY_SOURCE_ROOT, TRAVEL_KG_EXPORT_ROOT
|
||
|
||
# Directory scanned by source_files() for the original .doc/.docx itineraries.
SOURCE_DIR = TRAVEL_AGENCY_SOURCE_ROOT / "2026年新行程打包"
# Directory that receives the generated Markdown files, index and summary.
OUT_DIR = TRAVEL_AGENCY_SOURCE_ROOT / "2026年新行程打包_md整理"
# Second output location; main() mirrors everything written to OUT_DIR here.
GRAPH_OUT_DIR = TRAVEL_KG_EXPORT_ROOT / "旅行社项目入库/已有路线产品Markdown"
|
||
|
||
|
||
# Canonical attraction name -> substrings that identify it in itinerary text.
# extract_attractions() reports the key when ANY of its aliases occurs as a
# plain substring, and returns keys in the insertion order used below.
ATTRACTION_ALIASES = {
    "黄果树": ["黄果树", "黄果树瀑布", "黄果树大瀑布"],
    "天星桥": ["天星桥", "天星桥景区"],
    "陡坡塘瀑布": ["陡坡塘", "陡坡塘瀑布"],
    "荔波小七孔": ["小七孔", "荔波小七孔", "小七孔景区"],
    "西江千户苗寨": ["西江", "西江苗寨", "西江千户苗寨"],
    "镇远古城": ["镇远", "镇远古镇", "镇远古城"],
    "梵净山": ["梵净山"],
    "青岩古镇": ["青岩", "青岩古镇"],
    "百里杜鹃": ["百里杜鹃"],
    "平坝樱花": ["平坝樱花", "平坝农场"],
    "织金洞": ["织金洞"],
    "中国天眼": ["天眼", "中国天眼", "FAST"],
    "茅台镇": ["茅台", "茅台镇"],
    "遵义会议会址": ["遵义会址", "遵义会议会址"],
    "兴义万峰林": ["万峰林", "兴义万峰林"],
    "万峰湖": ["万峰湖"],
    "马岭河峡谷": ["马岭河", "马岭河峡谷"],
    "花江大桥": ["花江大桥"],
    "龙宫": ["龙宫"],
    "天河潭": ["天河潭"],
    "甲秀楼": ["甲秀楼"],
    "黔灵山公园": ["黔灵公园", "黔灵山"],
    "乌江寨": ["乌江寨"],
}
|
||
|
||
|
||
def clean(value: Any) -> str:
    """Normalize arbitrary input into tidy text.

    ``None`` becomes the empty string. Anything else is stringified, has NUL,
    zero-width-space and right-to-left-mark characters removed, runs of
    spaces/tabs squeezed to one space, runs of four or more newlines reduced
    to three, and surrounding whitespace stripped.
    """
    if value is None:
        return ""
    # Mapping a code point to None makes str.translate delete it.
    invisible = dict.fromkeys(map(ord, "\x00\u200b\u200f"))
    stripped = str(value).translate(invisible)
    single_spaced = re.sub(r"[ \t]+", " ", stripped)
    return re.sub(r"\n{4,}", "\n\n\n", single_spaced).strip()
|
||
|
||
|
||
def compact(value: Any) -> str:
    """Return ``clean(value)`` with every whitespace run collapsed to one space."""
    cleaned = clean(value)
    squeezed = re.sub(r"\s+", " ", cleaned)
    return squeezed.strip()
|
||
|
||
|
||
def chinese_count(text: str) -> int:
    """Count the characters of *text* that lie in U+4E00..U+9FFF."""
    return sum(1 for _ in re.finditer(r"[\u4e00-\u9fff]", text))
|
||
|
||
|
||
def read_legacy_doc_text(path: Path) -> str:
    """Fallback for old WPS/Word .doc files that macOS textutil cannot decode.

    The raw bytes are decoded as UTF-16LE (undecodable units dropped), the
    result is cut to start at the earliest known anchor phrase, capped at
    50000 characters, and scrubbed of control characters and binary residue
    before being passed through ``clean``.

    Args:
        path: The document to read.

    Returns:
        Best-effort plain text; may still contain noise.
    """
    data = path.read_bytes()
    decoded = data.decode("utf-16le", errors="ignore").replace("\x00", "")
    anchors = [
        path.stem,
        "❀",
        "贵客黔游",
        "推荐理由",
        "推 荐 理 由",
        "简易行程",
        "行程安排",
    ]
    # Search each anchor once and keep only the ones actually present.
    positions = [pos for pos in map(decoded.find, anchors) if pos >= 0]
    if positions:
        decoded = decoded[min(positions):]
    decoded = decoded[:50000]
    decoded = decoded.replace("\r", "\n")
    # Drop control characters but keep \x07 and \n; parse_simple_itinerary
    # later splits table cells on \x07.
    decoded = re.sub(r"[\x01-\x06\x08-\x09\x0b-\x1f]", "", decoded)
    # Long runs confined to U+4000..U+4FFF are treated as binary residue.
    decoded = re.sub(r"[䀀-俿]{8,}", "", decoded)
    # NOTE(review): in the reviewed copy this character class was empty
    # (`[]{2,}`), which is an invalid regex. It is assumed to have targeted
    # private-use / non-character residue -- confirm against the original.
    decoded = re.sub(r"[\ue000-\uf8ff\ufffe\uffff]{2,}", "", decoded)
    decoded = re.sub(r"\n{3,}", "\n\n", decoded)
    return clean(decoded)
|
||
|
||
|
||
def read_office_text(path: Path) -> str:
    """Extract plain text from a .doc/.docx file.

    ``textutil`` is tried first. Its result is accepted when it exits with
    status 0 and yields at least 30 Chinese characters. Otherwise
    ``read_legacy_doc_text`` is tried under the same 30-character threshold.
    If both fail, a ``[textutil读取失败] ...`` message is returned when the
    conversion itself failed, or whatever textutil printed when it succeeded
    with little text.

    Args:
        path: The document to convert.

    Returns:
        The extracted text, or a failure message string.
    """
    try:
        proc = subprocess.run(
            ["textutil", "-convert", "txt", "-stdout", str(path)],
            check=False,
            capture_output=True,
            text=True,
            # Decode as UTF-8 rather than the locale encoding so a C/POSIX
            # locale cannot corrupt or reject the Chinese text.
            encoding="utf-8",
            errors="replace",
        )
    except OSError as exc:
        # textutil missing or not executable: treat as a failed conversion
        # so the legacy reader still gets a chance.
        succeeded, converted, failure = False, "", str(exc)
    else:
        succeeded = proc.returncode == 0
        converted = proc.stdout.replace("\x00", "")
        failure = proc.stderr

    if succeeded and chinese_count(converted) >= 30:
        return converted
    fallback = read_legacy_doc_text(path)
    if chinese_count(fallback) >= 30:
        return fallback
    if not succeeded:
        return f"[textutil读取失败] {failure}"
    return converted
|
||
|
||
|
||
def safe_filename(text: str, index: int) -> str:
    """Build a Markdown file name of the form ``NN_<stem>_<digest>.md``.

    The stem is *text* compacted, with path-hostile characters replaced by
    ``_``, all whitespace removed, leading/trailing ``._ `` stripped and
    truncated to 70 characters. The digest is the first 8 hex digits of the
    MD5 of the untouched *text*.
    """
    digest = hashlib.md5(text.encode("utf-8")).hexdigest()[:8]
    sanitized = re.sub(r"[\\/:*?\"<>|]+", "_", compact(text))
    stem = re.sub(r"\s+", "", sanitized).strip("._ ")
    return "_".join((f"{index:02d}", stem[:70], digest)) + ".md"
|
||
|
||
|
||
def _cn_numeral_to_int(numeral: str) -> int | None:
    """Convert a Chinese numeral from 一 to 九十九 into an int, or None."""
    digits = {"一": 1, "二": 2, "两": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
    if numeral in digits:
        return digits[numeral]
    if numeral.count("十") != 1:
        return None
    tens, _, ones = numeral.partition("十")
    if len(tens) > 1 or len(ones) > 1:
        return None
    # A bare leading 十 means "one ten" (十二 == 12); a trailing 十 has no units.
    tens_value = digits.get(tens) if tens else 1
    ones_value = digits.get(ones) if ones else 0
    if tens_value is None or ones_value is None:
        return None
    return tens_value * 10 + ones_value


def duration_from_text(text: str) -> int | None:
    """Guess a trip length in days from free text.

    Arabic-digit forms are tried in priority order (``N日游``, ``N天``,
    ``N日``), then Chinese-numeral ``X日游``.

    Args:
        text: Product name and/or leading body text.

    Returns:
        The number of days, or ``None`` when nothing matches.
    """
    arabic_patterns = (
        r"(\d+)\s*日游",
        r"(\d+)\s*天",
        # Reject digits preceded by 月 or another digit, so the day-of-month
        # in a date such as "3月15日" is not read as a duration.
        r"(?<![月\d])(\d+)\s*日",
    )
    for pattern in arabic_patterns:
        m = re.search(pattern, text)
        if m:
            return int(m.group(1))
    m = re.search(r"([一二两三四五六七八九十]+)日游", text)
    if m:
        numeral = m.group(1)
        days = _cn_numeral_to_int(numeral)
        if days is None:
            # Malformed run: fall back to the character adjacent to 日游.
            days = _cn_numeral_to_int(numeral[-1])
        return days
    return None
|
||
|
||
|
||
def product_family(name: str, text: str) -> str:
    """Classify a product into a family label by keyword.

    Only *name* plus the first 1500 characters of *text* are inspected. Rules
    are evaluated top to bottom and the first hit wins; ``"常规纯玩"`` is the
    default when nothing matches.
    """
    haystack = f"{name} {text[:1500]}"
    rules = (
        (("高端", "5钻", "五钻"), "高端纯玩"),
        (("轻奢", "2+1", "保姆车", "头等舱"), "轻奢纯玩"),
        (("多彩",), "多彩贵州"),
        (("经典",), "经典纯玩"),
        (("1+1", "游黔途", "游黔程"), "游黔途/游黔程"),
        (("独立", "20-25"), "独立成团"),
    )
    for markers, label in rules:
        if any(marker in haystack for marker in markers):
            return label
    return "常规纯玩"
|
||
|
||
|
||
def hotel_grade_from_text(text: str) -> str:
    """Return the first hotel-grade label whose keyword occurs in *text*.

    Grades are checked from highest to lowest; an empty string means no
    keyword was found.
    """
    grades = (
        (("5钻", "五钻", "五星", "超五星"), "5钻/五星"),
        (("4钻", "四钻", "四星"), "4钻/四星"),
        (("商务",), "商务"),
        (("客栈",), "客栈"),
    )
    return next(
        (label for markers, label in grades if any(marker in text for marker in markers)),
        "",
    )
|
||
|
||
|
||
def vehicle_from_text(text: str) -> str:
    """Return the first vehicle label whose keyword occurs in *text*.

    Rules are ordered; earlier rules shadow later ones. An empty string means
    no keyword was found.
    """
    rules = (
        (("1+1",), "1+1双排座"),
        (("2+1", "保姆车", "头等舱"), "2+1保姆车/头等舱"),
        (("2+2",), "2+2商务车"),
        (("32-38", "独立成团"), "32-38座2+1大巴"),
        (("旅游大巴", "大巴"), "旅游大巴"),
        (("9座",), "9座商务车"),
        (("7座",), "7座商务车"),
        (("5座",), "5座车"),
    )
    for markers, label in rules:
        for marker in markers:
            if marker in text:
                return label
    return ""
|
||
|
||
|
||
def extract_between(text: str, starts: list[str], ends: list[str], limit: int | None = None) -> str:
    """Return the cleaned slice of *text* between a start and an end marker.

    The slice begins at the earliest occurrence of any token in *starts*
    (the token itself is included) and stops before the earliest token in
    *ends* found at least two characters after that start, or at the end of
    *text*. Returns ``""`` when no start token is present. A truthy *limit*
    truncates the cleaned result to that many characters.
    """
    start_hits = [pos for pos in (text.find(token) for token in starts) if pos >= 0]
    if not start_hits:
        return ""
    begin = min(start_hits)
    end_hits = [pos for pos in (text.find(token, begin + 2) for token in ends) if pos >= 0]
    finish = min(end_hits, default=len(text))
    block = clean(text[begin:finish])
    return block[:limit] if limit else block
|
||
|
||
|
||
def parse_simple_itinerary(text: str) -> list[dict[str, str]]:
    """Parse the summary itinerary table into one row dict per day.

    The "简易行程" section is preferred ("simple" layout: day, route, meals,
    accommodation). If it is absent, the "行程安排" section is used instead
    ("schedule" layout: the route is derived from the cell two positions
    after the day label, accommodation and meals from the next two cells).
    Each row has the keys ``day``, ``day_index``, ``route``, ``meals`` and
    ``accommodation``. Returns ``[]`` when neither section exists.
    """
    layout = "simple"
    block = extract_between(text, ["简易行程"], ["详细行程", "接待标准", "费用包含", "团费包含"], 2600)
    if not block:
        layout = "schedule"
        block = extract_between(text, ["行程安排"], ["接待标准", "费用包含", "团费包含", "特别提醒"], 4200)
    if not block:
        return []

    # NOTE(review): extract_between() runs clean(), which already turns tabs
    # into spaces, so in practice only \x07 separates cells here -- confirm.
    cells = [cell for cell in map(compact, re.split(r"[\x07\t]+", block)) if cell]
    total = len(cells)

    def cell_at(position: int) -> str:
        # Out-of-range lookups yield an empty cell instead of raising.
        return cells[position] if position < total else ""

    rows: list[dict[str, str]] = []
    for idx, cell in enumerate(cells):
        if re.fullmatch(r"D\s*\d+", cell, flags=re.I) is None:
            continue
        row = {
            "day": cell.upper().replace(" ", ""),
            "day_index": re.search(r"\d+", cell).group(),
        }
        if layout == "schedule" and idx + 4 < total:
            content = cells[idx + 2]
            attraction_route = "→".join(extract_attractions(content))
            row["route"] = attraction_route or content[:80]
            row["meals"] = cells[idx + 4]
            row["accommodation"] = cells[idx + 3]
        else:
            row["route"] = cell_at(idx + 1)
            row["meals"] = cell_at(idx + 2)
            row["accommodation"] = cell_at(idx + 3)
        rows.append(row)
    return rows
|
||
|
||
|
||
def extract_day_segments(text: str) -> list[dict[str, str]]:
    """Split the detailed itinerary into per-day segments.

    The detailed section is located with ``extract_between``; if that yields
    fewer than 50 characters the whole *text* is scanned instead. Every
    ``D<n>`` or ``第<numeral>天`` marker starts a segment that runs to the next
    marker, the first stop token after it, or the end of the source,
    whichever comes first. At most the first 12 segments are returned, each
    with the keys ``day``, ``day_index``, ``title`` and ``body``.
    """
    stop_tokens = ("· 接待标准", "接待标准", "· 费用包含", "费用包含", "团费包含", "参团须知", "温馨提示")
    cn_map = {"一": 1, "二": 2, "两": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9, "十": 10}

    source = extract_between(
        text,
        ["详细行程", "行程安排"],
        ["· 接待标准", "接待标准", "· 费用包含", "费用包含", "团费包含", "参团须知", "· 温馨提示"],
    )
    if len(source) < 50:
        source = text
    matches = list(re.finditer(r"(?m)(D\s*\d+|第[一二三四五六七八九十]+天)[::、\s]*", source))
    if not matches:
        return []

    # Default end of each segment: the start of the following marker.
    next_starts = [m.start() for m in matches[1:]] + [len(source)]
    rows: list[dict[str, str]] = []
    for idx, (match, next_start) in enumerate(zip(matches, next_starts)):
        begin = match.start()
        stop_hits = [pos for pos in (source.find(tok, begin + 2) for tok in stop_tokens) if pos >= 0]
        end = min([next_start, *stop_hits])

        label = match.group(1)
        if label.startswith("D"):
            day_index = int(re.search(r"\d+", label).group())
        else:
            # Numerals missing from cn_map fall back to the marker's ordinal.
            day_index = cn_map.get(re.search(r"第(.+?)天", label).group(1), idx + 1)

        body = clean(source[begin:end])
        lines = [line for line in map(compact, body.splitlines()) if line]
        # Title: first of the leading four lines that is not a day marker.
        title = f"D{day_index}"
        for line in lines[:4]:
            if line.startswith("第") or re.fullmatch(r"D\s*\d+", line, flags=re.I):
                continue
            title = line
            break
        rows.append({"day": f"D{day_index}", "day_index": str(day_index), "title": title, "body": body})
    return rows[:12]
|
||
|
||
|
||
def extract_attractions(text: str) -> list[str]:
    """Return canonical attraction names whose aliases occur in *text*.

    Matching is plain substring containment; results follow the key order of
    ``ATTRACTION_ALIASES`` and each name appears at most once.
    """
    return [
        name
        for name, aliases in ATTRACTION_ALIASES.items()
        if any(alias in text for alias in aliases)
    ]
|
||
|
||
|
||
def sentence_snippets(text: str, keywords: list[str], limit: int = 20) -> list[str]:
    """Collect distinct sentences of *text* that mention any keyword.

    The text is split on sentence punctuation (both ASCII and full-width
    ``!?;``, plus ``。`` and newlines). Each piece is compacted, pieces shorter
    than 4 characters are skipped, and matching pieces are truncated to 260
    characters.

    Args:
        text: Source text to scan.
        keywords: Substrings; a sentence qualifies if it contains any of them.
        limit: Maximum number of distinct snippets to return.

    Returns:
        Snippets in order of first appearance, without duplicates.
    """
    snippets: list[str] = []
    seen: set[str] = set()
    # Full-width marks are written as escapes so they survive any pipeline
    # that normalizes punctuation in source files.
    for raw in re.split(r"[。!?;\uff01\uff1f\uff1b\n]+", text):
        line = compact(raw)
        if len(line) < 4:
            continue
        if not any(keyword in line for keyword in keywords):
            continue
        snippet = line[:260]
        # De-duplicate while collecting so repeats do not use up the limit.
        if snippet in seen:
            continue
        seen.add(snippet)
        snippets.append(snippet)
        if len(snippets) >= limit:
            break
    return snippets
|
||
|
||
|
||
def md_table(headers: list[str], rows: list[list[Any]]) -> str:
    """Render a Markdown pipe table.

    Headers are emitted as given. Every body cell is compacted to one line
    and has literal ``|`` characters backslash-escaped.
    """
    def render(cells) -> str:
        return "| " + " | ".join(cells) + " |"

    def escape(value: Any) -> str:
        return compact(value).replace("|", "\\|")

    header_line = render(headers)
    divider = render(["---"] * len(headers))
    body = [render(escape(value) for value in row) for row in rows]
    return "\n".join([header_line, divider, *body])
|
||
|
||
|
||
def code_block(text: str) -> str:
    """Wrap cleaned *text* in a ```text fence.

    Embedded triple backticks are replaced with ``'''`` so they cannot close
    the fence early.
    """
    body = clean(text).replace("```", "'''")
    return f"```text\n{body}\n```"
|
||
|
||
|
||
def source_files() -> list[Path]:
    """List the .doc/.docx files directly inside ``SOURCE_DIR``, sorted.

    Entries whose names start with ``.``, ``~$`` or ``.~`` are skipped.
    """
    wanted_suffixes = {".doc", ".docx"}
    skipped_prefixes = (".", "~$", ".~")
    candidates: list[Path] = []
    for entry in SOURCE_DIR.iterdir():
        if not entry.is_file():
            continue
        if entry.suffix.lower() not in wanted_suffixes:
            continue
        if entry.name.startswith(skipped_prefixes):
            continue
        candidates.append(entry)
    candidates.sort()
    return candidates
|
||
|
||
|
||
def build_markdown(path: Path, index: int) -> tuple[str, dict[str, Any]]:
    """Convert one source document into a structured Markdown report.

    Args:
        path: The .doc/.docx file to read via ``read_office_text``.
        index: Position of the file in the batch; not used inside this
            function.

    Returns:
        A ``(markdown, meta)`` pair. ``markdown`` is the full document text
        ending in a newline. ``meta`` is the front-matter dict extended with
        ``markdown_filename`` (left empty here), ``simple_day_count``,
        ``detailed_day_count`` and ``text_chars``.
    """
    text = read_office_text(path)
    lines = [compact(x) for x in text.splitlines() if compact(x)]
    # Product name: first non-empty line, unless it is too short or is a
    # leftover Word field code, in which case the file stem is used.
    name = lines[0] if lines else path.stem
    if len(name) < 3 or "INCLUDEPICTURE" in name:
        name = path.stem
    # Classifiers look at the file stem plus a bounded prefix of the text.
    duration = duration_from_text(f"{path.stem} {name} {text[:300]}")
    family = product_family(name, text)
    vehicle = vehicle_from_text(f"{path.stem} {text[:2200]}")
    hotel_grade = hotel_grade_from_text(f"{path.stem} {text[:2600]}")
    simple_rows = parse_simple_itinerary(text)
    day_segments = extract_day_segments(text)
    # Prefer routes from the summary table; fall back to detailed-day titles.
    route_text = " ".join(row.get("route", "") for row in simple_rows) or " ".join(row.get("title", "") for row in day_segments)
    attractions = extract_attractions(f"{name} {path.stem} {route_text}")
    if not attractions:
        # Last resort: scan the first 1200 characters of each day's body.
        attractions = extract_attractions(" ".join(day.get("body", "")[:1200] for day in day_segments))
    # NOTE(review): selling_points is computed but never referenced below.
    selling_points = sentence_snippets(text, ["核心卖点", "甄选", "赠送", "超值", "纯玩", "0购物", "车型"], limit=12)
    fees = sentence_snippets(text, ["费用", "不含", "自理", "电瓶车", "观光车", "环保车", "保险", "扶梯", "索道", "餐标", "单房差", "儿童价"], limit=30)
    rules = sentence_snippets(text, ["老人", "儿童", "学生", "军人", "退团", "退费", "不可抗力", "预约", "投诉", "意见单", "满房", "同级", "孕妇", "不接待"], limit=30)
    hotel_block = extract_between(text, ["酒店"], ["导游服务", "购物", "意见单填写", "温馨提示"], 3200)
    gift_block = extract_between(text, ["赠送服务", "赠送:"], ["简易行程", "详细行程", "接待标准"], 1800)
    reception_block = extract_between(text, ["接待标准"], ["温馨提示", "· 温馨提示"], 4200)

    # Metadata emitted as a JSON block between the leading `---` fences.
    frontmatter = {
        "doc_type": "existing_route_product_markdown",
        "schema_target": "travel_agency_existing_product",
        "route_immutable": True,
        "source_file": str(path),
        "source_filename": path.name,
        "product_name": name,
        "duration_days": duration,
        "product_family": family,
        "default_vehicle_type": vehicle,
        "default_hotel_grade": hotel_grade,
        "core_attractions": attractions,
    }
    # Skeleton table rows: summary table first, detailed-day titles otherwise.
    rows = [[r["day"], r.get("route", ""), r.get("meals", ""), r.get("accommodation", "")] for r in simple_rows]
    if not rows:
        rows = [[r["day"], r.get("title", ""), "", ""] for r in day_segments]

    parts: list[str] = []
    parts.append("---")
    parts.append(json.dumps(frontmatter, ensure_ascii=False, indent=2))
    parts.append("---")
    parts.append(f"# {name}")
    parts.append("")
    # Section 1: product identification table.
    parts.append("## 1. 产品识别")
    parts.append(md_table(
        ["字段", "值"],
        [
            ["源文件", str(path)],
            ["产品名称", name],
            ["天数", duration or ""],
            ["产品系列/类型", family],
            ["默认车型", vehicle],
            ["默认酒店等级", hotel_grade],
            ["路线是否固定", "是;客户微调只能改资源槽位,不能改天数、景点顺序、城市移动路径"],
            ["核心景点候选", "、".join(attractions)],
        ],
    ))
    parts.append("")
    # Section 2: fixed route skeleton.
    parts.append("## 2. 固定路线骨架")
    parts.append("")
    parts.append("抽取目标:`TourProduct -> ProductDay -> RouteStop / RouteSegment`。")
    parts.append("")
    parts.append(md_table(["天数", "路线/标题", "用餐", "住宿"], rows))
    parts.append("")
    # Section 3: one subsection per detailed day, with its raw body text.
    parts.append("## 3. 每日详细行程")
    parts.append("")
    parts.append("抽取目标:每天生成 `ProductDay`,从正文识别真实 `RouteStop` 和 `RouteSegment`;不要把费用说明里的景点当作真实停靠点。")
    for day in day_segments:
        idx = day["day_index"]
        # Pair the detailed day with the summary row of the same day index
        # (both are stored as strings).
        simple = next((row for row in simple_rows if row["day_index"] == idx), {})
        parts.append("")
        parts.append(f"### {day['day']} {simple.get('route') or day.get('title') or ''}")
        parts.append("")
        parts.append(md_table(
            ["字段", "值"],
            [
                ["day_index", idx],
                ["route_path", simple.get("route", "")],
                ["meal_text", simple.get("meals", "")],
                ["accommodation_text", simple.get("accommodation", "")],
            ],
        ))
        parts.append("")
        parts.append(code_block(day["body"]))
    parts.append("")
    # Section 4: candidate resource slots (hotel, meals, vehicle, tickets, gifts).
    parts.append("## 4. 可配置资源槽位候选")
    parts.append("")
    parts.append("抽取目标:`ResourceSlot -> ResourceOptionGroup -> Hotel/Restaurant/Vehicle/TicketFee/GiftService`。")
    parts.append("")
    parts.append("### 4.1 住宿槽位候选")
    parts.append("")
    if hotel_block:
        parts.append(code_block(hotel_block))
    else:
        parts.append("- 原文未明确独立酒店参考段;可从每日住宿列生成住宿槽位。")
    parts.append("")
    parts.append("### 4.2 餐饮槽位候选")
    parts.append("")
    meal_lines = sentence_snippets(text, ["用餐", "餐标", "早餐", "中餐", "晚餐", "长桌宴", "酸汤鱼"], limit=20)
    # `x or [placeholder]` emits a single placeholder bullet when nothing matched.
    parts.extend(f"- {line}" for line in meal_lines or ["原文未明确独立餐饮段;可从每日用餐列生成餐饮槽位。"])
    parts.append("")
    parts.append("### 4.3 车辆/交通槽位候选")
    parts.append("")
    traffic_lines = sentence_snippets(text, ["交通", "车型", "用车", "车辆", "保姆车", "商务车", "大巴", "接人", "送站", "接站"], limit=20)
    parts.extend(f"- {line}" for line in traffic_lines or ["原文未明确独立车辆段;可从产品名和行程交通描述识别默认车型。"])
    parts.append("")
    parts.append("### 4.4 门票/小交通/保险槽位候选")
    parts.append("")
    ticket_lines = sentence_snippets(text, ["门票", "观光车", "环保车", "电瓶车", "保险", "扶梯", "索道", "游船", "小交通", "景交"], limit=30)
    parts.extend(f"- {line}" for line in ticket_lines or ["原文未明确门票小交通段。"])
    parts.append("")
    parts.append("### 4.5 赠送服务槽位候选")
    parts.append("")
    if gift_block:
        parts.append(code_block(gift_block))
    else:
        gift_lines = sentence_snippets(text, ["赠送", "旅拍", "矿泉水", "长桌宴", "高山流水", "打糍粑"], limit=12)
        parts.extend(f"- {line}" for line in gift_lines or ["原文未明确赠送服务。"])
    parts.append("")
    # Section 5: fee and business-rule candidates.
    parts.append("## 5. 费用与规则候选")
    parts.append("")
    parts.append("抽取目标:`ProductPricePackage`、`TicketFee`、`FeeItem`、`BusinessRule`。")
    parts.append("")
    parts.append("### 5.1 费用候选")
    parts.extend(f"- {line}" for line in fees or ["未识别到明显费用候选。"])
    parts.append("")
    parts.append("### 5.2 业务规则候选")
    parts.extend(f"- {line}" for line in rules or ["未识别到明显规则候选。"])
    if reception_block:
        parts.append("")
        parts.append("### 5.3 接待标准/服务规则原文")
        parts.append(code_block(reception_block))
    parts.append("")
    # Section 6: the complete extracted text, kept as evidence.
    parts.append("## 6. 原文保留")
    parts.append("")
    parts.append("后续抽取如果结构化段落不够,可回到本段原文补证据。")
    parts.append("")
    parts.append(code_block(text))
    markdown = "\n".join(parts).strip() + "\n"
    meta = {
        **frontmatter,
        # Filled in by the caller once the output file name is known.
        "markdown_filename": "",
        "simple_day_count": len(simple_rows),
        "detailed_day_count": len(day_segments),
        "text_chars": len(clean(text)),
    }
    return markdown, meta
|
||
|
||
|
||
def write_index(items: list[dict[str, Any]]) -> None:
    """Write the README index and the JSON product index into ``OUT_DIR``.

    The README lists counts per duration and per product family, a summary
    table with one row per product, and fixed extraction guidance. The JSON
    file is a dump of *items* as given.
    """
    summary_rows = []
    for item in items:
        summary_rows.append([
            item["product_name"],
            item.get("duration_days") or "",
            item.get("product_family") or "",
            item.get("default_vehicle_type") or "",
            item.get("default_hotel_grade") or "",
            "、".join(item.get("core_attractions") or []),
            item["markdown_filename"],
        ])

    counts = Counter(item.get("duration_days") for item in items)
    family_counts = Counter(item.get("product_family") for item in items)
    # Known durations ascending, unknown (None) last.
    ordered_durations = sorted(counts.items(), key=lambda pair: (pair[0] is None, pair[0] or 0))
    duration_lines = [f"- {day or '未知'}天:{count} 个" for day, count in ordered_durations]
    family_lines = [f"- {name}:{count} 个" for name, count in family_counts.most_common()]
    generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    lines = [
        "# 已有路线产品 Markdown 整理索引",
        "",
        f"生成时间:{generated_at}",
        "",
        f"- 源目录:`{SOURCE_DIR}`",
        f"- 输出目录:`{OUT_DIR}`",
        f"- 产品文档数:{len(items)}",
        "",
        "## 按天数统计",
        "",
    ]
    lines.extend(duration_lines)
    lines.extend(["", "## 按产品类型统计", ""])
    lines.extend(family_lines)
    lines.extend([
        "",
        "## 产品索引",
        "",
        md_table(["产品名称", "天数", "产品类型", "默认车型", "默认酒店", "核心景点候选", "Markdown文件"], summary_rows),
        "",
        "## 后续抽取建议",
        "",
        "- 先抽取 `TourProduct`、`ProductDay`、`RouteStop`、`RouteSegment`,确保路线骨架稳定。",
        "- 再抽取 `ResourceSlot`,把住宿、餐饮、车辆、接送、门票小交通、赠送服务作为可配置槽位。",
        "- 产品文档里“费用说明/自理项目/不含”出现的景点名称,不要抽成真实路线停靠点。",
        "- 酒店参考段优先抽成 `ResourceOptionGroup`,不要强行抽成唯一入住酒店。",
        "- 规则要挂到影响对象:退改挂产品/价格包,优惠挂门票费用,少走路/预约风险挂景点或停靠点,可替换规则挂资源槽位。",
    ])

    readme_path = OUT_DIR / "README_已有路线产品md整理.md"
    readme_path.write_text("\n".join(lines), encoding="utf-8")
    index_path = OUT_DIR / "产品索引.json"
    index_path.write_text(json.dumps(items, ensure_ascii=False, indent=2), encoding="utf-8")
|
||
|
||
|
||
def main() -> dict[str, Any]:
    """Convert every source document and write all outputs.

    For each file from ``source_files()`` a Markdown document is written to
    ``OUT_DIR/products``. The index files are then generated, everything is
    mirrored into ``GRAPH_OUT_DIR``, and a run summary is written to both
    locations.

    Returns:
        The run summary (directories, document count, timestamp).
    """
    product_dir = OUT_DIR / "products"
    for directory in (OUT_DIR, GRAPH_OUT_DIR, product_dir):
        directory.mkdir(parents=True, exist_ok=True)

    items: list[dict[str, Any]] = []
    for index, path in enumerate(source_files(), start=1):
        markdown, meta = build_markdown(path, index)
        filename = safe_filename(meta["product_name"] or path.stem, index)
        meta["markdown_filename"] = f"products/{filename}"
        (product_dir / filename).write_text(markdown, encoding="utf-8")
        items.append(meta)
    write_index(items)

    def mirror(src: Path, dest_dir: Path) -> None:
        # Copy one UTF-8 text file into dest_dir under the same name.
        (dest_dir / src.name).write_text(src.read_text(encoding="utf-8"), encoding="utf-8")

    # Keep a synchronized copy of the index files and product documents
    # under GRAPH_OUT_DIR for extraction experiments.
    for index_name in ("README_已有路线产品md整理.md", "产品索引.json"):
        mirror(OUT_DIR / index_name, GRAPH_OUT_DIR)
    graph_products = GRAPH_OUT_DIR / "products"
    graph_products.mkdir(parents=True, exist_ok=True)
    for md in product_dir.glob("*.md"):
        mirror(md, graph_products)

    summary = {
        "source_dir": str(SOURCE_DIR),
        "output_dir": str(OUT_DIR),
        "graph_output_dir": str(GRAPH_OUT_DIR),
        "product_markdown_count": len(items),
        "generated_at": datetime.now().isoformat(timespec="seconds"),
    }
    payload = json.dumps(summary, ensure_ascii=False, indent=2)
    for root in (OUT_DIR, GRAPH_OUT_DIR):
        (root / "生成摘要.json").write_text(payload, encoding="utf-8")
    return summary
|
||
|
||
|
||
if __name__ == "__main__":
    # Run the conversion and print the returned summary dict as indented JSON.
    print(json.dumps(main(), ensure_ascii=False, indent=2))
|