"""Crawl Baidu Baike raw Markdown samples for city POI schema scenes.

This script reads
schema搭建/city_poi_schema_v0_1/business_scene_seed_manifest.json
and writes per-scene raw Markdown evidence files.

It is intentionally a schema building helper: the output is evidence
material, not extracted facts.
"""

from __future__ import annotations

import argparse
import json
import random
import re
import sys
import time
from dataclasses import asdict
from datetime import datetime, timezone
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
SCHEMA_DIR = ROOT / "schema搭建" / "city_poi_schema_v0_1"
MANIFEST_PATH = SCHEMA_DIR / "business_scene_seed_manifest.json"
DEFAULT_OUT_DIR = SCHEMA_DIR / "baidu_baike_raw_md"

# Runs of characters replaced by "_" in directory names: path separators,
# the punctuation listed below, and any whitespace.
# NOTE: the previous pattern spelled the whitespace class as ``\\s`` inside a
# raw string, which the regex engine reads as "a backslash or the letter s";
# it therefore mangled every "s" and never matched whitespace.
_UNSAFE_PATH_CHARS = re.compile(r'[\\/:*?"<>|\s]+')


def _now_iso() -> str:
    """Return the current local time as an ISO-8601 string (second precision)."""
    return datetime.now(timezone.utc).astimezone().isoformat(timespec="seconds")


def _md_cell(value: object) -> str:
    """Render *value* as text that cannot break a Markdown table row.

    Line breaks are collapsed to single spaces and ``|`` is backslash-escaped.
    """
    text = " ".join(str(value).splitlines())
    return text.replace("|", "\\|")


def slugify(text: str) -> str:
    """Turn *text* into a directory-name slug.

    Every run of unsafe characters (see ``_UNSAFE_PATH_CHARS``) becomes a
    single underscore; leading/trailing underscores are stripped. Returns
    ``"scene"`` when nothing is left.
    """
    return _UNSAFE_PATH_CHARS.sub("_", text).strip("_") or "scene"


def load_scene_manifest(path: Path = MANIFEST_PATH) -> dict:
    """Read the UTF-8 JSON manifest at *path* and return the parsed object."""
    return json.loads(path.read_text(encoding="utf-8"))


def select_scenes(manifest: dict, scene_filter: str | None) -> list[dict]:
    """Return the manifest scenes selected by *scene_filter*.

    Args:
        manifest: Parsed manifest; its ``"scenes"`` entry is a list of dicts.
        scene_filter: Comma-separated scene names, or ``None``/empty for all.

    Returns:
        All scenes when no filter is given; otherwise the scenes whose
        ``"scene"`` value is one of the requested names, in manifest order.
        Requested names that do not exist are silently ignored.
    """
    scenes = manifest.get("scenes") or []
    if not scene_filter:
        return scenes
    wanted = {name.strip() for name in scene_filter.split(",") if name.strip()}
    return [scene for scene in scenes if scene.get("scene") in wanted]


def write_plan_index(manifest: dict, out_dir: Path) -> None:
    """Write ``crawl_plan.md`` into *out_dir*, listing every manifest scene.

    The plan always covers all scenes in the manifest; it takes no filter.
    *out_dir* is created if it does not exist.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    lines = [
        "# 城市 POI 业务场景百度百科原文采集计划",
        "",
        f"- 生成时间:{_now_iso()}",
        f"- Schema 版本:{manifest.get('version')}",
        "- 说明:本清单只是采集计划,事实抽取必须读取每个词条的原文 Markdown。",
        "",
        "| 场景 | 高德数量 | 网格进度 | 百度百科原文样本词条 |",
        "| --- | ---: | --- | --- |",
    ]
    for scene in manifest.get("scenes") or []:
        terms = "、".join(scene.get("baike_seed_terms") or [])
        # Cells are escaped so a "|" or newline in the data cannot split a row.
        lines.append(
            f"| {_md_cell(scene.get('scene'))} | {_md_cell(scene.get('amap_count'))} | "
            f"{_md_cell(scene.get('grid_progress'))} | {_md_cell(terms)} |"
        )
    (out_dir / "crawl_plan.md").write_text("\n".join(lines) + "\n", encoding="utf-8")


def crawl_scene(
    scene: dict,
    out_dir: Path,
    limit_per_scene: int | None,
    attempts: int,
    sleep: float,
    force: bool,
) -> list[dict]:
    """Crawl the seed terms of one scene and return one result dict per term.

    Args:
        scene: Manifest entry; ``"scene"`` is required, ``"baike_seed_terms"``
            and ``"domain_goal"`` are optional.
        out_dir: Base output directory; files go to ``out_dir/slugify(scene)``.
        limit_per_scene: Crawl at most this many terms (``None`` = no limit;
            negative values are treated as 0).
        attempts: Forwarded to ``crawl_one``.
        sleep: Base delay in seconds between two terms; up to 0.8 s of random
            jitter is added. Negative values are treated as 0.
        force: Forwarded to ``crawl_one``.

    Returns:
        For every crawled term, the dataclass fields of the row returned by
        ``crawl_one`` plus ``"scene"``, ``"scene_goal"`` and ``"ok"``.
    """
    # The crawler lives in the project-local ``scripts`` package, so the
    # repository root must be importable. Insert it once only; this function
    # runs once per scene and would otherwise keep growing sys.path.
    root_entry = str(ROOT)
    if root_entry not in sys.path:
        sys.path.insert(0, root_entry)
    from scripts.baike_to_markdown import crawl_one, write_manifest

    scene_name = scene["scene"]
    terms = list(scene.get("baike_seed_terms") or [])
    if limit_per_scene is not None:
        terms = terms[: max(0, limit_per_scene)]
    scene_dir = out_dir / slugify(scene_name)
    # time.sleep() raises ValueError on negative input, so clamp the base delay.
    base_delay = max(0.0, sleep)
    total = len(terms)

    rows = []
    for idx, term in enumerate(terms, 1):
        print(f"[crawl] {scene_name} {idx:02d}/{total} {term}", flush=True)
        row = crawl_one(term, scene_dir, idx=idx, force=force, attempts=attempts)
        rows.append(row)
        status = "FAIL" if row.error else "OK"
        print(
            f" [{status}] title={row.title} chars={row.markdown_chars} "
            f"headings={row.heading_count} paras={row.paragraph_count} file={row.markdown_file}",
            flush=True,
        )
        # Pause between requests, but not after the last one.
        if idx < total:
            time.sleep(base_delay + random.random() * 0.8)

    if rows:
        write_manifest(rows, scene_dir)

    # Key order matters: fields from the row may override "scene"/"scene_goal",
    # while "ok" is always computed here.
    return [
        {
            "scene": scene_name,
            "scene_goal": scene.get("domain_goal", ""),
            **asdict(row),
            "ok": not bool(row.error),
        }
        for row in rows
    ]


def write_global_index(rows: list[dict], out_dir: Path, manifest: dict) -> None:
    """Write ``crawl_manifest.json`` and ``index.md`` summarising *rows*.

    Args:
        rows: Result dicts as produced by ``crawl_scene``; each must provide
            ``scene``, ``name``, ``title``, ``markdown_file`` and
            ``markdown_chars``; ``ok`` and ``error`` are optional.
        out_dir: Output directory (created if missing).
        manifest: Parsed manifest; only ``"version"`` is read.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / "crawl_manifest.json").write_text(
        json.dumps(rows, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    ok_count = sum(1 for row in rows if row.get("ok"))
    lines = [
        "# 城市 POI 业务场景百度百科原文 Markdown",
        "",
        f"- 生成时间:{_now_iso()}",
        f"- Schema 版本:{manifest.get('version')}",
        f"- 抓取词条数:{len(rows)}",
        f"- 成功:{ok_count}",
        f"- 失败:{len(rows) - ok_count}",
        "",
        "| 场景 | 词条 | 页面标题 | Markdown | 字符 | 状态 |",
        "| --- | --- | --- | --- | ---: | --- |",
    ]
    for row in rows:
        scene_dir = slugify(row["scene"])
        if row.get("ok"):
            status = "OK"
        else:
            # "error" may be absent, None or a non-string; normalise before
            # truncating so the index is still written for such rows.
            error_text = str(row.get("error") or "")[:40]
            status = f"失败:{_md_cell(error_text)}"
        lines.append(
            f"| {_md_cell(row['scene'])} | {_md_cell(row['name'])} | {_md_cell(row['title'])} | "
            f"[{row['markdown_file']}](./{scene_dir}/{row['markdown_file']}) | "
            f"{row['markdown_chars']} | {status} |"
        )
    (out_dir / "index.md").write_text("\n".join(lines) + "\n", encoding="utf-8")


def main() -> int:
    """Parse the command line, write the crawl plan and crawl the scenes.

    Returns:
        Process exit code ``0``.

    Raises:
        SystemExit: If ``--scene`` matches no scene in the manifest.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--scene", help="Only crawl one or more scenes, comma separated. Example: 景点,美食")
    parser.add_argument("--limit-per-scene", type=int, help="Limit sample terms per scene")
    parser.add_argument("--out-dir", default=str(DEFAULT_OUT_DIR))
    parser.add_argument("--attempts", type=int, default=3)
    parser.add_argument("--sleep", type=float, default=1.2)
    parser.add_argument("--force", action="store_true")
    parser.add_argument("--plan-only", action="store_true", help="Only write crawl_plan.md; do not crawl pages")
    args = parser.parse_args()

    manifest = load_scene_manifest()
    out_dir = Path(args.out_dir)

    # The plan is written on every run, before any page is fetched.
    write_plan_index(manifest, out_dir)
    if args.plan_only:
        print(f"[plan] {out_dir / 'crawl_plan.md'}", flush=True)
        return 0

    scenes = select_scenes(manifest, args.scene)
    if not scenes:
        raise SystemExit(f"No matching scene: {args.scene}")

    rows: list[dict] = []
    for scene in scenes:
        rows.extend(
            crawl_scene(
                scene,
                out_dir,
                limit_per_scene=args.limit_per_scene,
                attempts=max(1, args.attempts),
                sleep=args.sleep,
                force=args.force,
            )
        )

    write_global_index(rows, out_dir, manifest)
    print(f"[done] {out_dir}", flush=True)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())