Initial travel knowledge graph release
This commit is contained in:
176
scripts/crawl_city_scene_baike_md.py
Normal file
176
scripts/crawl_city_scene_baike_md.py
Normal file
@@ -0,0 +1,176 @@
|
||||
"""Crawl Baidu Baike raw Markdown samples for city POI schema scenes.
|
||||
|
||||
This script reads schema搭建/city_poi_schema_v0_1/business_scene_seed_manifest.json
|
||||
and writes per-scene raw Markdown evidence files. It is intentionally a schema
|
||||
building helper: the output is evidence material, not extracted facts.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import asdict
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Repository root: two levels up from this file (the parent of its directory).
ROOT = Path(__file__).resolve().parents[1]
# Schema workspace that holds the seed manifest and is the default output root.
SCHEMA_DIR = ROOT.joinpath("schema搭建", "city_poi_schema_v0_1")
# JSON manifest describing the business scenes and their seed terms.
MANIFEST_PATH = SCHEMA_DIR.joinpath("business_scene_seed_manifest.json")
# Default directory that receives the crawled raw Markdown.
DEFAULT_OUT_DIR = SCHEMA_DIR.joinpath("baidu_baike_raw_md")
|
||||
|
||||
|
||||
def slugify(text: str) -> str:
    """Turn *text* into a name that is safe to use as a directory name.

    Every run of filesystem-reserved characters (backslash, slash, colon,
    asterisk, question mark, double quote, angle brackets, pipe) or
    whitespace collapses into a single underscore. Leading and trailing
    underscores are then stripped. Returns ``"scene"`` when nothing is left.
    """
    # In a raw string the whitespace class must be spelled with a single
    # backslash; a doubled backslash before "s" would instead match a literal
    # backslash and the letter "s", mangling names and leaving spaces intact.
    return re.sub(r'[\\/:*?"<>|\s]+', "_", text).strip("_") or "scene"
|
||||
|
||||
|
||||
def load_scene_manifest(path: Path = MANIFEST_PATH) -> dict:
    """Parse the UTF-8 JSON file at *path* and return the decoded object."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
|
||||
|
||||
|
||||
def select_scenes(manifest: dict, scene_filter: str | None) -> list[dict]:
    """Return the manifest scenes selected by *scene_filter*.

    Args:
        manifest: Mapping whose ``"scenes"`` entry is a list of scene dicts.
        scene_filter: Comma-separated scene names, or a falsy value to keep
            every scene. Names are stripped and blank items are ignored.

    Returns:
        All scenes when no filter is given; otherwise the scenes whose
        ``"scene"`` value is one of the requested names, in manifest order.
    """
    all_scenes = manifest.get("scenes") or []
    if scene_filter:
        requested = {part.strip() for part in scene_filter.split(",")}
        requested.discard("")
        return [entry for entry in all_scenes if entry.get("scene") in requested]
    return all_scenes
|
||||
|
||||
|
||||
def write_plan_index(manifest: dict, out_dir: Path) -> None:
    """Write ``crawl_plan.md`` into *out_dir*, one table row per manifest scene.

    The directory is created if needed. The file starts with a title, the
    local generation time and the manifest ``version``, followed by a Markdown
    table listing each scene's ``scene``, ``amap_count``, ``grid_progress``
    and its joined ``baike_seed_terms``. Missing keys render as ``None``
    (or an empty cell for the terms).
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    generated_at = datetime.now(timezone.utc).astimezone().isoformat(timespec="seconds")
    header = [
        "# 城市 POI 业务场景百度百科原文采集计划",
        "",
        f"- 生成时间:{generated_at}",
        f"- Schema 版本:{manifest.get('version')}",
        "- 说明:本清单只是采集计划,事实抽取必须读取每个词条的原文 Markdown。",
        "",
        "| 场景 | 高德数量 | 网格进度 | 百度百科原文样本词条 |",
        "| --- | ---: | --- | --- |",
    ]

    table_rows = []
    for entry in manifest.get("scenes") or []:
        seed_terms = "、".join(entry.get("baike_seed_terms") or [])
        cells = (
            entry.get("scene"),
            entry.get("amap_count"),
            entry.get("grid_progress"),
            seed_terms,
        )
        table_rows.append("| {} | {} | {} | {} |".format(*cells))

    plan_file = out_dir / "crawl_plan.md"
    plan_file.write_text("\n".join(header + table_rows) + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
def crawl_scene(
    scene: dict,
    out_dir: Path,
    limit_per_scene: int | None,
    attempts: int,
    sleep: float,
    force: bool,
) -> list[dict]:
    """Crawl every seed term of one scene and return one result dict per term.

    Args:
        scene: Scene entry from the manifest. ``"scene"`` is required;
            ``"baike_seed_terms"`` and ``"domain_goal"`` are optional.
        out_dir: Root output directory; results go into a sub-directory named
            after the slugified scene name.
        limit_per_scene: Keep only the first N seed terms (negative values are
            treated as 0). ``None`` keeps all of them.
        attempts: Passed through to ``crawl_one`` as ``attempts``.
        sleep: Base pause in seconds between consecutive terms; up to 0.8 s of
            random jitter is added.
        force: Passed through to ``crawl_one`` as ``force``.

    Returns:
        One dict per crawled term containing ``"scene"``, ``"scene_goal"``,
        every field of the row returned by ``crawl_one`` and an ``"ok"`` flag
        that is true when the row's ``error`` is falsy.

    Raises:
        KeyError: If *scene* has no ``"scene"`` key.
    """
    # The crawler is imported as ``scripts.baike_to_markdown``, so the
    # repository root must be importable. This function runs once per scene:
    # only insert the root when it is not already first, so sys.path does not
    # accumulate one duplicate entry per call.
    root_entry = str(ROOT)
    if not sys.path or sys.path[0] != root_entry:
        sys.path.insert(0, root_entry)
    from scripts.baike_to_markdown import crawl_one, write_manifest

    scene_name = scene["scene"]
    terms = list(scene.get("baike_seed_terms") or [])
    if limit_per_scene is not None:
        terms = terms[: max(0, limit_per_scene)]
    scene_dir = out_dir / slugify(scene_name)

    rows = []
    for idx, term in enumerate(terms, 1):
        print(f"[crawl] {scene_name} {idx:02d}/{len(terms)} {term}", flush=True)
        row = crawl_one(term, scene_dir, idx=idx, force=force, attempts=attempts)
        rows.append(row)
        status = "FAIL" if row.error else "OK"
        print(
            f" [{status}] title={row.title} chars={row.markdown_chars} "
            f"headings={row.heading_count} paras={row.paragraph_count} file={row.markdown_file}",
            flush=True,
        )
        # Pause between requests, but not after the last one. The delay is
        # clamped because time.sleep() raises ValueError on negative values.
        if idx < len(terms):
            time.sleep(max(0.0, sleep + random.random() * 0.8))

    # NOTE(review): write_manifest presumably records the per-scene results
    # inside scene_dir -- confirm against scripts.baike_to_markdown.
    if rows:
        write_manifest(rows, scene_dir)

    return [
        {
            "scene": scene_name,
            "scene_goal": scene.get("domain_goal", ""),
            **asdict(row),
            "ok": not bool(row.error),
        }
        for row in rows
    ]
|
||||
|
||||
|
||||
def write_global_index(rows: list[dict], out_dir: Path, manifest: dict) -> None:
    """Write the combined ``crawl_manifest.json`` and ``index.md`` into *out_dir*.

    Args:
        rows: Per-term result dicts. Each must provide ``"scene"``,
            ``"name"``, ``"title"``, ``"markdown_file"`` and
            ``"markdown_chars"``; ``"ok"`` and ``"error"`` are optional.
        out_dir: Output directory, created if it does not exist.
        manifest: Scene manifest; only its ``"version"`` is read here.

    ``crawl_manifest.json`` is *rows* dumped as indented, non-ASCII-escaped
    JSON. ``index.md`` holds summary counts and a table with one line per row
    that links to the Markdown file inside the scene's slugified directory.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / "crawl_manifest.json").write_text(
        json.dumps(rows, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    # Every row is either ok or not, so the failure count is the remainder.
    ok_count = sum(1 for row in rows if row.get("ok"))
    lines = [
        "# 城市 POI 业务场景百度百科原文 Markdown",
        "",
        f"- 生成时间:{datetime.now(timezone.utc).astimezone().isoformat(timespec='seconds')}",
        f"- Schema 版本:{manifest.get('version')}",
        f"- 抓取词条数:{len(rows)}",
        f"- 成功:{ok_count}",
        f"- 失败:{len(rows) - ok_count}",
        "",
        "| 场景 | 词条 | 页面标题 | Markdown | 字符 | 状态 |",
        "| --- | --- | --- | --- | ---: | --- |",
    ]
    for row in rows:
        scene_dir = slugify(row["scene"])
        if row.get("ok"):
            status = "OK"
        else:
            # "error" may be present but None (or not a string); a plain
            # row.get("error", "")[:40] would raise TypeError in that case.
            status = f"失败:{str(row.get('error') or '')[:40]}"
        lines.append(
            f"| {row['scene']} | {row['name']} | {row['title']} | "
            f"[{row['markdown_file']}](./{scene_dir}/{row['markdown_file']}) | "
            f"{row['markdown_chars']} | {status} |"
        )
    (out_dir / "index.md").write_text("\n".join(lines) + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point: write the crawl plan, then crawl the scenes.

    The plan file is always written. With ``--plan-only`` the function stops
    there; otherwise the selected scenes are crawled in manifest order and the
    global index is written. Returns 0 on completion and raises ``SystemExit``
    with a message when ``--scene`` matches nothing.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--scene", help="Only crawl one or more scenes, comma separated. Example: 景点,美食")
    cli.add_argument("--limit-per-scene", type=int, help="Limit sample terms per scene")
    cli.add_argument("--out-dir", default=str(DEFAULT_OUT_DIR))
    cli.add_argument("--attempts", type=int, default=3)
    cli.add_argument("--sleep", type=float, default=1.2)
    cli.add_argument("--force", action="store_true")
    cli.add_argument("--plan-only", action="store_true", help="Only write crawl_plan.md; do not crawl pages")
    opts = cli.parse_args()

    manifest = load_scene_manifest()
    target_dir = Path(opts.out_dir)
    write_plan_index(manifest, target_dir)
    if opts.plan_only:
        print(f"[plan] {target_dir / 'crawl_plan.md'}", flush=True)
        return 0

    chosen = select_scenes(manifest, opts.scene)
    if not chosen:
        raise SystemExit(f"No matching scene: {opts.scene}")

    # At least one attempt per term, whatever was passed on the command line.
    attempt_count = max(1, opts.attempts)
    collected: list[dict] = []
    for scene in chosen:
        scene_rows = crawl_scene(
            scene,
            target_dir,
            limit_per_scene=opts.limit_per_scene,
            attempts=attempt_count,
            sleep=opts.sleep,
            force=opts.force,
        )
        collected += scene_rows

    write_global_index(collected, target_dir, manifest)
    print(f"[done] {target_dir}", flush=True)
    return 0
|
||||
|
||||
|
||||
# Run the CLI when executed as a script; main()'s return value is the exit code.
if __name__ == "__main__":
    sys.exit(main())
|
||||
|
||||
Reference in New Issue
Block a user