Initial travel knowledge graph release
This commit is contained in:
635
scripts/crawl_guizhou_baike_scenic_md.py
Normal file
635
scripts/crawl_guizhou_baike_scenic_md.py
Normal file
@@ -0,0 +1,635 @@
|
||||
"""Crawl Baidu Baike pages for 20 well-known Guizhou scenic spots.

The output is a schema-building Markdown dataset: source metadata, basic-info
fields, short summary snippets, page outline, and candidate schema fields.
It intentionally does not store full article bodies.
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from urllib.parse import quote
|
||||
|
||||
|
||||
# Repository root: the directory one level above the folder holding this script.
ROOT = Path(__file__).resolve().parents[1]

# Default output folder for the generated Markdown dataset.
OUT_DIR = ROOT.joinpath("schema搭建", "baidu_baike_guizhou_scenic_20")
|
||||
|
||||
|
||||
# Scenic spots to crawl, in output order. The 1-based position in this list is
# used as the numeric prefix of each generated Markdown file name.
SCENIC_SPOTS = [
    "黄果树瀑布",
    "荔波小七孔景区",
    "梵净山",
    "西江千户苗寨",
    "青岩古镇",
    "镇远古城",
    "肇兴侗寨",
    "万峰林",
    "马岭河峡谷",
    "织金洞",
    "百里杜鹃风景名胜区",
    "赤水丹霞",
    "龙宫风景区",
    "遵义会议会址",
    "甲秀楼",
    "黔灵山公园",
    "花溪公园",
    "天河潭",
    "南江大峡谷",
    "乌蒙大草原",
]
|
||||
|
||||
|
||||
# Alternative Baike entry names to try, after the spot's own name, for spots
# listed here. Keys must match entries of SCENIC_SPOTS exactly.
BAIKE_QUERY_ALIASES = {
    "荔波小七孔景区": [
        "荔波樟江风景名胜区",
        "小七孔",
    ],
    "镇远古城": [
        "黔东南苗族侗族自治州镇远古城旅游景区",
    ],
    "百里杜鹃风景名胜区": [
        "百里杜鹃景区",
        "贵州百里杜鹃风景名胜区",
    ],
}
|
||||
|
||||
|
||||
# Basic-info labels that are extracted from a Baike page and offered as schema
# field candidates. Order here is the order candidates are emitted in.
SCHEMA_FIELD_HINTS = [
    "中文名",
    "外文名",
    "地理位置",
    "气候条件",
    "开放时间",
    "景点级别",
    "门票价格",
    "占地面积",
    "著名景点",
    "建议游玩时长",
    "适宜游玩季节",
    "所属国家",
    "所属城市",
    "保护级别",
    "主要景观",
    "最佳旅游时间",
]
|
||||
|
||||
|
||||
# Substrings that mark a snippet as unusable: a snippet containing any of these
# is discarded by usable_snippet().
BAD_SNIPPET_TOKENS = [
    "©",
    "使用百度前必读",
    "百科协议",
    "隐私政策",
    "百度百科合作平台",
    "京ICP",
    "营业执照",
]
|
||||
|
||||
|
||||
@dataclass
class PageData:
    """Extraction result for one scenic spot's Baike page.

    A non-empty ``error`` marks a fetch that failed or yielded nothing usable.
    """

    # Scenic-spot name the record belongs to.
    name: str
    # URL that was requested.
    requested_url: str
    # URL reported after loading (falls back to the requested one).
    final_url: str
    # Page title, or the spot name when none was found.
    title: str
    # Summary text extracted from the page.
    summary: str
    # Basic-info label -> value pairs.
    basic_info: dict[str, str]
    # Section headings in page order.
    headings: list[str]
    # Paragraph count as reported by the extractor that produced the record.
    paragraph_count: int
    # Character count of the text the extractor saw.
    text_char_count: int
    # Failure description; empty when the fetch produced data.
    error: str = ""
|
||||
|
||||
|
||||
def _import_web_agent_constants():
    """Return ``(user_agent, chrome_args, stealth_js)`` for the browser session.

    The values come from ``app.agents.web_agent`` when that module can be
    imported with the repository root on ``sys.path``; if the import fails for
    any reason, built-in fallback values are returned instead.
    """
    root = str(ROOT)
    # Register the repo root only once. Inserting unconditionally made sys.path
    # grow by one duplicate entry per call (this runs for every fetched page).
    if root not in sys.path:
        sys.path.insert(0, root)
    try:
        from app.agents.web_agent import _CHROME_ARGS, _STEALTH_JS, _UA
    except Exception:  # noqa: BLE001 - deliberate: any import-time failure uses defaults
        _UA = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        )
        _CHROME_ARGS = [
            "--disable-blink-features=AutomationControlled",
            "--no-first-run",
            "--no-default-browser-check",
            "--disable-sync",
            "--disable-default-apps",
            "--no-sandbox",
            "--disable-dev-shm-usage",
        ]
        _STEALTH_JS = "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
    return _UA, _CHROME_ARGS, _STEALTH_JS
|
||||
|
||||
|
||||
def slugify(name: str, idx: int) -> str:
    """Return a file-name stem ``NN_<name>`` that is safe on common filesystems.

    Runs of path separators, reserved characters (``\\ / : * ? " < > |``) and
    whitespace are collapsed into a single underscore, and leading/trailing
    underscores are trimmed. An empty result falls back to ``scenic_spot``.

    Args:
        name: Display name to convert.
        idx: Ordinal used as a zero-padded two-digit prefix.
    """
    # The previous pattern spelled the whitespace class as ``\\s`` inside a raw
    # string, which the regex engine reads as "backslash or the letter s": it
    # never matched whitespace and mangled every ASCII "s".
    safe = re.sub(r'[\\/:*?"<>|\s]+', "_", name).strip("_")
    return f"{idx:02d}_{safe or 'scenic_spot'}"
|
||||
|
||||
|
||||
def compact(text: str) -> str:
    """Collapse every whitespace run to one space and trim both ends.

    A falsy *text* (``None`` or ``""``) yields the empty string.
    """
    raw = text or ""
    single_spaced = re.sub(r"\s+", " ", raw)
    return single_spaced.strip()
|
||||
|
||||
|
||||
def short_snippet(text: str, limit: int = 120) -> str:
    """Return *text* whitespace-normalised and cut to at most *limit* characters.

    When truncation happens, trailing separator characters are stripped from
    the cut and ``...`` is appended; shorter text is returned unchanged.
    """
    cleaned = compact(text)
    if len(cleaned) > limit:
        head = cleaned[:limit]
        return head.rstrip(",。;、 ") + "..."
    return cleaned
|
||||
|
||||
|
||||
def usable_snippet(text: str, limit: int = 120) -> str:
    """Return a short snippet of *text*, or ``""`` if it contains a banned token.

    The check against ``BAD_SNIPPET_TOKENS`` is applied to the already
    shortened snippet, not to the full input.
    """
    candidate = short_snippet(text, limit)
    for banned in BAD_SNIPPET_TOKENS:
        if banned in candidate:
            return ""
    return candidate
|
||||
|
||||
|
||||
def baike_url(name: str) -> str:
    """Build the Baidu Baike item URL for *name* (percent-encoded)."""
    encoded = quote(name)
    return "https://baike.baidu.com/item/" + encoded
|
||||
|
||||
|
||||
def query_terms(name: str) -> list[str]:
    """Return the entry names to try for *name*: the name itself, then aliases.

    Aliases come from ``BAIKE_QUERY_ALIASES``; duplicates are dropped while the
    first-seen order is kept.
    """
    candidates = [name]
    candidates.extend(BAIKE_QUERY_ALIASES.get(name, []))
    # dict keys keep insertion order, giving an order-preserving de-duplication.
    return list(dict.fromkeys(candidates))
|
||||
|
||||
|
||||
def is_blocked_or_empty(data: PageData) -> bool:
    """Tell whether *data* is a failed, verification-page or near-empty result.

    True when an error is recorded, when the final URL contains ``anticrawl``
    or the title contains ``验证``, or when the page has fewer than 500
    characters together with no basic info and no headings.
    """
    if data.error:
        return True
    url_flagged = "anticrawl" in data.final_url
    if url_flagged or "验证" in data.title:
        return True
    has_structure = bool(data.basic_info or data.headings)
    return data.text_char_count < 500 and not has_structure
|
||||
|
||||
|
||||
def is_good_enough(data: PageData) -> bool:
    """Tell whether *data* is rich enough to stop retrying.

    A result that is blocked or empty never qualifies. Otherwise any one of
    these suffices: at least 1200 characters of text, a non-empty basic-info
    table, or at least five headings.
    """
    if is_blocked_or_empty(data):
        return False
    long_enough = data.text_char_count >= 1200
    has_infobox = bool(data.basic_info)
    has_outline = len(data.headings) >= 5
    return long_enough or has_infobox or has_outline
|
||||
|
||||
|
||||
def data_score(data: PageData) -> int:
    """Score *data* for ranking competing fetch results; higher is better.

    Blocked or empty results score ``-1``. Otherwise the score is the text
    length plus 600 per basic-info field plus 120 per heading.
    """
    if is_blocked_or_empty(data):
        return -1
    infobox_bonus = 600 * len(data.basic_info)
    outline_bonus = 120 * len(data.headings)
    return data.text_char_count + infobox_bonus + outline_bonus
|
||||
|
||||
|
||||
def parse_web_agent_text(name: str, text: str, final_url: str, requested_url: str) -> PageData:
    """Parse flattened ``label:value`` page text into a :class:`PageData`.

    Args:
        name: Scenic-spot name the record belongs to.
        text: Page text containing ``<label>:<value>`` fragments, an optional
            ``摘要:`` section and an optional ``正文:`` section.
        final_url: URL reported after loading.
        requested_url: URL that was requested.

    Returns:
        A record whose title is the ``词条名`` value when present (otherwise
        *name*), whose paragraph count is the number of ``。`` characters in
        *text*, and whose character count is ``len(text)``.
    """
    field_names = ["词条名", *SCHEMA_FIELD_HINTS]
    alternation = "|".join(map(re.escape, field_names))

    # A value runs (lazily, up to 260 chars) until the next known label, the
    # body marker, or the end of the text.
    extracted: dict[str, str] = {}
    for field in field_names:
        pattern = rf"{re.escape(field)}:(.{{1,260}}?)(?=\s(?:{alternation}|正文):|$)"
        found = re.search(pattern, text)
        if found is None:
            continue
        cleaned = compact(found.group(1))
        if cleaned and len(cleaned) <= 240:
            extracted[field] = cleaned

    # Prefer an explicit abstract; otherwise take the start of the body.
    abstract = re.search(r"摘要:(.{20,360}?)(?=\s(?:正文|词条名|中文名|地理位置|开放时间):|$)", text)
    if abstract is not None:
        summary = compact(abstract.group(1))
    elif "正文:" in text:
        body = text.split("正文:", 1)[1]
        summary = compact(body)[:360]
    else:
        summary = ""

    # Section titles are recognised by the "播报 编辑" text that follows them.
    outline: list[str] = []
    for raw_heading in re.findall(r"([\u4e00-\u9fa5A-Za-z0-9·、()()]{2,24})\s+播报\s+编辑", text):
        candidate = compact(raw_heading)
        if not candidate or candidate.startswith("参考资料"):
            continue
        if candidate not in outline:
            outline.append(candidate)

    # The entry name becomes the title and is not kept as a basic-info field.
    title = extracted.pop("词条名", None) or name
    return PageData(
        name=name,
        requested_url=requested_url,
        final_url=final_url,
        title=title,
        summary=summary,
        basic_info=extracted,
        headings=outline[:80],
        paragraph_count=text.count("。"),
        text_char_count=len(text),
    )
|
||||
|
||||
|
||||
def fetch_with_existing_baike_code(name: str, query_name: str) -> PageData:
    """Fetch *query_name* through the project's own Baike fetcher and parse it.

    Args:
        name: Scenic-spot name the record belongs to.
        query_name: Baike entry name to request.

    Returns:
        The parsed page, or an empty record with ``error`` set when the
        fetcher returns no text.

    Raises:
        ImportError: If ``app.agents.web_agent`` cannot be imported.
    """
    root = str(ROOT)
    # Register the repo root only once; inserting unconditionally added a
    # duplicate sys.path entry on every call.
    if root not in sys.path:
        sys.path.insert(0, root)
    from app.agents.web_agent import fetch_baidu_baike_text

    requested = baike_url(query_name)
    # The helper is unpacked as a (text, final_url) pair; either may be falsy.
    text, final_url = fetch_baidu_baike_text(requested)
    if not text:
        return PageData(
            name=name,
            requested_url=requested,
            final_url=final_url or requested,
            title=name,
            summary="",
            basic_info={},
            headings=[],
            paragraph_count=0,
            text_char_count=0,
            error="百度百科抓取为空或进入验证页",
        )
    return parse_web_agent_text(name, text, final_url or requested, requested)
|
||||
|
||||
|
||||
# In-page extractor evaluated by Playwright. It receives the list of wanted
# basic-info labels as its single argument and returns a JSON-serialisable
# object describing the page.
_BAIKE_EXTRACT_JS = r"""(wantedKeys) => {
  const clean = (s) => (s || '').replace(/\s+/g, ' ').trim();
  const pickText = (sel) => clean(document.querySelector(sel)?.innerText || '');
  const title = pickText('h1') || document.title;
  let summary = pickText('.lemmaSummary, .lemmaWgt-lemmaSummary');
  const basic = {};
  const wanted = new Set(wantedKeys);

  const addPair = (k, v) => {
    k = clean(k).replace(/[::]+$/, '');
    v = clean(v);
    if (!wanted.has(k)) return;
    if (!k || !v || k.length > 24 || v.length > 240) return;
    if (!basic[k]) basic[k] = v;
  };

  // Old Baike layout.
  const oldNames = Array.from(document.querySelectorAll('.basicInfo-item.name'));
  const oldVals = Array.from(document.querySelectorAll('.basicInfo-item.value'));
  for (let i = 0; i < Math.min(oldNames.length, oldVals.length); i++) {
    addPair(oldNames[i].innerText, oldVals[i].innerText);
  }

  // New Baike layout often uses dt/dd.
  const dts = Array.from(document.querySelectorAll('dt'));
  for (const dt of dts) {
    const dd = dt.nextElementSibling;
    if (dd && dd.tagName === 'DD') addPair(dt.innerText, dd.innerText);
  }

  const headings = Array.from(document.querySelectorAll('h2, h3'))
    .map(x => clean(x.innerText).replace(/\s*播报\s*编辑\s*$/, ''))
    .filter(Boolean)
    .filter((x, i, arr) => arr.indexOf(x) === i)
    .slice(0, 80);

  const ps = Array.from(document.querySelectorAll('div.J-lemma-content p, main p, article p, p'))
    .map(x => clean(x.innerText))
    .filter(x => x.length >= 8);
  if (!summary && ps.length) summary = ps[0];
  const body = clean(document.body?.innerText || '');
  return {
    title, summary, basic_info: basic, headings,
    paragraph_count: ps.length,
    text_char_count: body.length,
    final_url: location.href
  };
}"""


def fetch_page_data(
    name: str,
    query_name: str,
    timeout_ms: int = 45000,
    headless: bool = True,
    keep_open_seconds: int = 0,
) -> PageData:
    """Load a Baike entry in Chromium via Playwright and extract its structure.

    Args:
        name: Scenic-spot name the record belongs to.
        query_name: Baike entry name to request.
        timeout_ms: Navigation timeout passed to ``page.goto``.
        headless: Run without a visible window; a visible run is slowed down
            by 250 ms per action.
        keep_open_seconds: When positive, print extraction diagnostics and keep
            the browser open for that long before closing it.

    Returns:
        The extracted page data. Blocked or empty pages are *not* detected
        here; callers check with ``is_blocked_or_empty``.

    Raises:
        Exception: Playwright errors (launch failure, navigation timeout,
            evaluation failure) propagate to the caller.
    """
    from playwright.sync_api import sync_playwright

    ua, chrome_args, stealth_js = _import_web_agent_constants()
    requested = baike_url(query_name)
    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=headless,
            args=chrome_args,
            ignore_default_args=["--enable-automation"],
            slow_mo=250 if not headless else 0,
        )
        # Close the browser even when navigation or extraction raises.
        try:
            ctx = browser.new_context(
                user_agent=ua,
                locale="zh-CN",
                viewport={"width": 1440, "height": 900},
                extra_http_headers={"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"},
            )
            ctx.add_init_script(stealth_js)
            page = ctx.new_page()
            page.goto(requested, timeout=timeout_ms, wait_until="domcontentloaded")
            page.wait_for_timeout(random.randint(900, 1600))
            try:
                page.mouse.wheel(0, random.randint(600, 1400))
                page.wait_for_timeout(random.randint(400, 900))
            except Exception:  # noqa: BLE001 - scrolling is best-effort only
                pass
            # Pass the label list in, so the in-page filter cannot drift from
            # SCHEMA_FIELD_HINTS.
            payload = page.evaluate(_BAIKE_EXTRACT_JS, SCHEMA_FIELD_HINTS)
            if keep_open_seconds > 0:
                print(
                    "[debug] extracted "
                    f"title={compact(payload.get('title') or name)!r} "
                    f"final_url={payload.get('final_url') or requested} "
                    f"chars={int(payload.get('text_char_count') or 0)} "
                    f"basic={len(payload.get('basic_info') or {})} "
                    f"headings={len(payload.get('headings') or [])}",
                    flush=True,
                )
                print(
                    "[debug] browser kept open "
                    f"{keep_open_seconds}s; final_url={payload.get('final_url')}",
                    flush=True,
                )
                page.wait_for_timeout(keep_open_seconds * 1000)
        finally:
            browser.close()

    return PageData(
        name=name,
        requested_url=requested,
        final_url=payload.get("final_url") or requested,
        title=compact(payload.get("title") or name),
        summary=compact(payload.get("summary") or ""),
        basic_info={str(k): str(v) for k, v in (payload.get("basic_info") or {}).items()},
        headings=[str(x) for x in (payload.get("headings") or [])],
        paragraph_count=int(payload.get("paragraph_count") or 0),
        text_char_count=int(payload.get("text_char_count") or 0),
    )
|
||||
|
||||
|
||||
def debug_visible_flow(name: str, keep_open_seconds: int) -> int:
    """Fetch *name* in a visible browser and print extraction diagnostics.

    Every query term for *name* is tried in turn; a failing term is reported
    and skipped. Returns ``0`` as the process exit code.
    """
    print(f"[debug] target={name}", flush=True)
    attempt_no = 0
    for query_name in query_terms(name):
        attempt_no += 1
        print(f"[debug] try {attempt_no}: {query_name} -> {baike_url(query_name)}", flush=True)
        try:
            data = fetch_page_data(
                name,
                query_name,
                headless=False,
                keep_open_seconds=keep_open_seconds,
            )
        except Exception as exc:  # noqa: BLE001
            print(f"[debug] failed: {str(exc)[:300]}", flush=True)
            continue

        blocked = is_blocked_or_empty(data)
        print(
            "[debug] result "
            f"title={data.title!r} final_url={data.final_url} "
            f"chars={data.text_char_count} basic={len(data.basic_info)} "
            f"headings={len(data.headings)} blocked={blocked}",
            flush=True,
        )
        if data.summary:
            shown = usable_snippet(data.summary) or "<unusable/empty>"
            print(f"[debug] summary={shown}", flush=True)
        if data.basic_info:
            dumped = json.dumps(data.basic_info, ensure_ascii=False)
            print(f"[debug] basic_info={dumped}", flush=True)
    return 0
|
||||
|
||||
|
||||
def _failed_page(name: str, query_name: str, error: str) -> PageData:
    """Build an empty record noting why fetching *query_name* failed."""
    url = baike_url(query_name)
    return PageData(
        name=name,
        requested_url=url,
        final_url=url,
        title=name,
        summary="",
        basic_info={},
        headings=[],
        paragraph_count=0,
        text_char_count=0,
        error=error,
    )


def fetch_with_retries(name: str, attempts: int = 3) -> PageData:
    """Fetch *name* from Baike, retrying over aliases and two fetch strategies.

    In every round each query term is tried first with the Playwright fetcher
    and then with the project's existing Baike fetcher, with a pause after each
    try that grows with the round number. The first result that is good enough
    is returned immediately.

    Args:
        name: Scenic-spot name.
        attempts: Number of rounds over all query terms; must be at least 1.

    Returns:
        The first good-enough result; otherwise the best-scoring usable result;
        otherwise the last result seen, with ``error`` filled in.

    Raises:
        ValueError: If *attempts* is smaller than 1.
    """
    # Explicit validation instead of a trailing ``assert``: asserts vanish under
    # ``python -O`` and the function would then fail later on a ``None`` result.
    if attempts < 1:
        raise ValueError("attempts must be >= 1")

    # (fetcher, error-message prefix, max characters kept from the exception)
    strategies = (
        (fetch_page_data, "", 300),
        (fetch_with_existing_baike_code, "existing baike fallback failed: ", 260),
    )
    terms = query_terms(name)
    last: PageData | None = None
    best: PageData | None = None
    best_score = -1

    for attempt in range(1, attempts + 1):
        for query_name in terms:
            for fetch, prefix, keep in strategies:
                try:
                    data = fetch(name, query_name)
                except Exception as exc:  # noqa: BLE001 - any failure means "try the next option"
                    last = _failed_page(name, query_name, f"{prefix}{str(exc)[:keep]}")
                else:
                    last = data
                    score = data_score(data)
                    if score > best_score:
                        best, best_score = data, score
                    if is_good_enough(data):
                        return data
                # Back off a little longer on each round, with jitter.
                time.sleep(1.5 * attempt + random.random())

    result = best if best is not None and best_score >= 0 else last
    if result is None:
        raise RuntimeError(f"no fetch was attempted for {name!r}")
    if is_blocked_or_empty(result) and not result.error:
        result.error = f"百度百科页面为空、验证页或未抓到有效结构化信息;已尝试词条:{', '.join(terms)}"
    return result
|
||||
|
||||
|
||||
def schema_candidates(data: PageData) -> list[dict[str, str]]:
    """List schema field candidates derived from *data*, at most 40.

    Basic-info fields named in ``SCHEMA_FIELD_HINTS`` come first (in hint
    order, empty values skipped), followed by one entry per heading that
    mentions one of the topic keywords.
    """
    topic_tokens = ("历史", "地理", "景点", "景观", "交通", "文化", "保护", "荣誉")

    rows = [
        {
            "field_cn": hint,
            "value": data.basic_info[hint],
            "source": "百度百科基本信息",
        }
        for hint in SCHEMA_FIELD_HINTS
        if data.basic_info.get(hint)
    ]
    rows.extend(
        {
            "field_cn": f"章节:{heading}",
            "value": "页面存在该主题章节,可作为 schema/关系抽取候选",
            "source": "百度百科章节结构",
        }
        for heading in data.headings
        if any(token in heading for token in topic_tokens)
    )
    return rows[:40]
|
||||
|
||||
|
||||
def markdown_for(data: PageData, idx: int) -> str:
    """Render *data* as the Markdown document stored for one scenic spot.

    Sections, in order: source metadata, short summary, basic-info table,
    schema candidates, page outline and usage notes. *idx* is accepted but not
    used in the rendered text.
    """
    fetched_at = datetime.now(timezone.utc).astimezone().isoformat(timespec="seconds")
    snippet = usable_snippet(data.summary)

    doc = [
        f"# {data.name}",
        "",
        "## 来源",
        "",
        "- 数据源:百度百科",
        f"- 请求词条:{data.name}",
        f"- 页面标题:{data.title}",
        f"- 请求 URL:{data.requested_url}",
        f"- 最终 URL:{data.final_url}",
        f"- 抓取时间:{fetched_at}",
        f"- 页面文本规模:约 {data.text_char_count} 字符,段落 {data.paragraph_count} 个",
        "",
        "## 短摘要",
        "",
        "> " + (snippet or "未抓到可用短摘要。"),
        "",
        "## 基本信息",
        "",
    ]

    # Basic-info table; pipes inside values would break the table layout.
    if data.basic_info:
        doc += ["| 字段 | 值 |", "| --- | --- |"]
        doc += [
            f"| {key} | {value.replace('|', '/')} |"
            for key, value in data.basic_info.items()
        ]
    else:
        note = "未抓到基本信息表。"
        if data.error:
            note += f" 抓取备注:{data.error}"
        doc.append(note)

    doc += ["", "## Schema 搭建候选", ""]
    candidates = schema_candidates(data)
    if candidates:
        doc += ["| 候选字段/主题 | 值或说明 | 来源 |", "| --- | --- | --- |"]
        for item in candidates:
            shown_value = str(item["value"]).replace("|", "/")
            doc.append(f"| {item['field_cn']} | {shown_value} | {item['source']} |")
    else:
        doc.append("暂无。")

    doc += ["", "## 页面结构", ""]
    if data.headings:
        doc += [f"- {heading}" for heading in data.headings]
    else:
        doc.append("未抓到章节标题。")

    doc += [
        "",
        "## 使用说明",
        "",
        "- 本文件用于 schema 搭建和字段候选分析。",
        "- 为避免直接沉淀整篇百科长文,只保存结构化字段、短摘要和页面结构。",
        "- 需要复核事实时,请回到上方最终 URL 查看原页面。",
        "",
    ]
    return "\n".join(doc)
|
||||
|
||||
|
||||
def write_dataset(rows: list[PageData], out_dir: Path) -> None:
    """Write one Markdown file per row plus ``manifest.json`` and ``index.md``.

    The write is incremental. Entries from an existing manifest whose Markdown
    file is still on disk are kept, rows passed in replace or extend them, and
    only Markdown files referenced by no entry are removed. Previously every
    ``*.md`` was deleted and rows were numbered by their position in *rows*, so
    a run that skipped already-crawled spots destroyed their files and wrote
    the new ones under the wrong numeric prefix.

    Args:
        rows: Freshly crawled pages to write.
        out_dir: Destination folder; created when missing.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    manifest_path = out_dir / "manifest.json"

    # Carry over still-valid entries from an earlier run, keyed by file name.
    merged = {}
    if manifest_path.exists():
        try:
            previous = json.loads(manifest_path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            previous = []
        if isinstance(previous, list):
            for entry in previous:
                if not isinstance(entry, dict):
                    continue
                md_name = entry.get("markdown_file")
                if md_name and (out_dir / md_name).exists():
                    merged[md_name] = entry

    # File prefixes follow the spot's position in SCENIC_SPOTS, which is the
    # numbering the crawl loop uses when it checks for existing files. Names
    # outside that list fall back to their position in *rows*.
    canonical = {spot: number for number, spot in enumerate(SCENIC_SPOTS, 1)}
    for position, row in enumerate(rows, 1):
        idx = canonical.get(row.name, position)
        md_path = out_dir / f"{slugify(row.name, idx)}.md"
        md_path.write_text(markdown_for(row, idx), encoding="utf-8")
        merged[md_path.name] = {
            "index": idx,
            "name": row.name,
            "title": row.title,
            "requested_url": row.requested_url,
            "final_url": row.final_url,
            "markdown_file": md_path.name,
            "basic_info_count": len(row.basic_info),
            "heading_count": len(row.headings),
            "text_char_count": row.text_char_count,
            "error": row.error,
        }

    # Drop Markdown files no entry refers to (index.md is rewritten below).
    keep = set(merged) | {"index.md"}
    for leftover in out_dir.glob("*.md"):
        if leftover.name not in keep:
            leftover.unlink()

    manifest = sorted(
        merged.values(),
        key=lambda entry: (entry.get("index", 0), entry.get("markdown_file", "")),
    )
    manifest_path.write_text(
        json.dumps(manifest, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    index_lines = [
        "# 贵州著名景区百度百科 Schema 搭建数据集",
        "",
        f"- 生成时间:{datetime.now(timezone.utc).astimezone().isoformat(timespec='seconds')}",
        f"- 景区数量:{len(manifest)}",
        "- 说明:Markdown 文件保存结构化字段、短摘要、页面结构和 schema 候选,不保存整篇百科长文。",
        "",
        "| 序号 | 景区 | Markdown | 基本信息字段 | 章节数 | 来源 |",
        "| --- | --- | --- | ---: | ---: | --- |",
    ]
    for item in manifest:
        index_lines.append(
            f"| {item['index']} | {item['name']} | "
            f"[{item['markdown_file']}](./{item['markdown_file']}) | "
            f"{item['basic_info_count']} | {item['heading_count']} | "
            f"[百度百科]({item['final_url']}) |"
        )
    (out_dir / "index.md").write_text("\n".join(index_lines) + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point; returns the process exit code.

    With ``--debug-name`` a single spot is opened in a visible browser and
    diagnostics are printed. Otherwise the first ``--limit`` spots are crawled,
    skipping those whose Markdown file already exists unless ``--force`` is
    given, and the results are written to ``--out-dir``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--out-dir", default=str(OUT_DIR), help="Output folder")
    # Default to the whole list rather than a literal that must be kept in sync.
    parser.add_argument("--limit", type=int, default=len(SCENIC_SPOTS), help="Number of scenic spots to crawl")
    parser.add_argument("--sleep", type=float, default=1.5, help="Polite delay between pages")
    parser.add_argument("--force", action="store_true", help="Re-crawl even if files already exist")
    parser.add_argument("--debug-name", help="Open one scenic spot in a visible browser and print extraction diagnostics")
    parser.add_argument("--keep-open-seconds", type=int, default=600, help="Visible debug browser hold time")
    args = parser.parse_args()

    if args.debug_name:
        return debug_visible_flow(args.debug_name, args.keep_open_seconds)

    out_dir = Path(args.out_dir)
    # Clamp the limit to 1..len(SCENIC_SPOTS).
    names = SCENIC_SPOTS[: max(1, min(args.limit, len(SCENIC_SPOTS)))]
    rows: list[PageData] = []

    for idx, name in enumerate(names, 1):
        stem = slugify(name, idx)
        md_path = out_dir / f"{stem}.md"
        if md_path.exists() and not args.force:
            print(f"[skip] {idx:02d}/{len(names)} {name} exists", flush=True)
            continue
        print(f"[crawl] {idx:02d}/{len(names)} {name}", flush=True)
        try:
            row = fetch_with_retries(name)
        except Exception as exc:  # noqa: BLE001 - one failing spot must not stop the run
            row = PageData(
                name=name,
                requested_url=baike_url(name),
                final_url=baike_url(name),
                title=name,
                summary="",
                basic_info={},
                headings=[],
                paragraph_count=0,
                text_char_count=0,
                error=str(exc)[:300],
            )
            print(f"  [error] {row.error}", flush=True)
        else:
            # Only warn for rows that came back with an error recorded; a crash
            # was already reported above and used to be printed twice.
            if row.error:
                print(f"  [warn] {row.error}", flush=True)
        rows.append(row)
        time.sleep(args.sleep + random.random() * 0.8)

    if not rows and (out_dir / "manifest.json").exists():
        print(f"[done] no recrawl needed: {out_dir}", flush=True)
        return 0

    if rows:
        write_dataset(rows, out_dir)
        print(f"[done] wrote {len(rows)} markdown files to {out_dir}", flush=True)
    return 0
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    exit_code = main()
    raise SystemExit(exit_code)
|
||||
Reference in New Issue
Block a user