Files
NianToB/resources/skills/local/web-search/scripts/search-web.mjs
2026-05-12 19:44:44 +08:00

247 lines
7.6 KiB
JavaScript

#!/usr/bin/env node
import { writeFile } from 'node:fs/promises';
// Number of search results returned when --count is absent or not a number.
const DEFAULT_COUNT = 6;
// Upper bound that parseArgs clamps --count to.
const MAX_COUNT = 10;
// Default time budget, in milliseconds, before fetchText aborts a request.
const DEFAULT_TIMEOUT_MS = 9_000;
// Fetch-mode text length used when --max-chars is absent or not a number.
const DEFAULT_MAX_CHARS = 4_000;
// Upper bound that parseArgs clamps --max-chars to.
const MAX_CHARS = 10_000;
// Sent as the User-Agent header on every outgoing request made by fetchText.
const USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123 Safari/537.36 ZhinianAssistant/0.1';
/**
 * Turn the raw command-line tokens into a normalized options object.
 *
 * Recognized flags consume the token that follows them as their value; every
 * other token is treated as part of the search query. `--help` / `-h` prints
 * the usage text and exits the process with status 0.
 *
 * @param {string[]} argv - Command-line tokens (without the node/script prefix).
 * @returns {{query: string, count: number, fetchUrl: string, maxChars: number, output: string, language: string}}
 *   Parsed options, with `count` limited to 1..MAX_COUNT and `maxChars` limited
 *   to 500..MAX_CHARS; non-numeric values fall back to the defaults.
 */
function parseArgs(argv) {
  const parsed = {
    query: '',
    count: DEFAULT_COUNT,
    fetchUrl: '',
    maxChars: DEFAULT_MAX_CHARS,
    output: '',
    language: 'zh-CN',
  };
  const queryWords = [];
  let cursor = 0;

  // Advance to the token after the current flag; a missing value reads as ''.
  const takeValue = () => {
    cursor += 1;
    return argv[cursor] || '';
  };

  while (cursor < argv.length) {
    const token = argv[cursor];
    switch (token) {
      case '--count':
      case '-n':
        parsed.count = Number.parseInt(takeValue(), 10);
        break;
      case '--fetch-url':
        parsed.fetchUrl = takeValue();
        break;
      case '--max-chars':
        parsed.maxChars = Number.parseInt(takeValue(), 10);
        break;
      case '--output':
      case '-o':
        parsed.output = takeValue();
        break;
      case '--language':
      case '--lang':
        // An empty or missing value keeps the current language.
        parsed.language = takeValue() || parsed.language;
        break;
      case '--help':
      case '-h':
        printHelp();
        process.exit(0);
        break;
      default:
        queryWords.push(token);
    }
    cursor += 1;
  }

  // Clamp a numeric option into [low, high]; NaN/Infinity use the fallback.
  const clampOr = (value, low, high, fallback) => {
    if (!Number.isFinite(value)) {
      return fallback;
    }
    return Math.max(low, Math.min(high, value));
  };

  parsed.query = queryWords.join(' ').trim();
  parsed.count = clampOr(parsed.count, 1, MAX_COUNT, DEFAULT_COUNT);
  parsed.maxChars = clampOr(parsed.maxChars, 500, MAX_CHARS, DEFAULT_MAX_CHARS);
  return parsed;
}
/**
 * Write the command-line usage summary to stdout.
 *
 * The numeric limits shown in the text come from MAX_COUNT and MAX_CHARS.
 */
function printHelp() {
  const helpLines = [
    'Usage:',
    'node search-web.mjs "<query>" [--count 6] [--output <file>]',
    'node search-web.mjs --fetch-url "<url>" [--max-chars 4000] [--output <file>]',
    'Options:',
    `--count, -n Number of search results to return, 1-${MAX_COUNT}.`,
    '--fetch-url Fetch and summarize a single page URL.',
    `--max-chars Maximum returned text length in fetch mode, 500-${MAX_CHARS}.`,
    '--output, -o Write JSON result to a file in addition to stdout.',
    '--language Accept-Language value. Default: zh-CN.',
    // Trailing empty entry so the joined text ends with a newline.
    '',
  ];
  process.stdout.write(helpLines.join('\n'));
}
/**
 * Decode the HTML character references that commonly appear in scraped markup.
 *
 * Supports the named entities `&quot;`, `&apos;`, `&nbsp;`, `&amp;`, `&lt;`
 * and `&gt;`, plus decimal (`&#39;`) and hexadecimal (`&#x27;`) numeric
 * references. Anything else is left untouched.
 *
 * Decoding happens in a single pass, so text is unescaped exactly once:
 * `&amp;lt;` becomes the literal text `&lt;`, not `<`.
 *
 * @param {unknown} value - Text to decode; falsy values are treated as ''.
 * @returns {string} The decoded text.
 */
function decodeHtml(value) {
  const namedEntities = {
    quot: '"',
    apos: "'",
    nbsp: ' ',
    amp: '&',
    lt: '<',
    gt: '>',
  };
  const referencePattern = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(quot|apos|nbsp|amp|lt|gt));/g;

  return String(value || '').replace(referencePattern, (match, decimal, hex, name) => {
    if (name !== undefined) {
      return namedEntities[name];
    }
    const codePoint = Number.parseInt(decimal ?? hex, decimal === undefined ? 16 : 10);
    const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
    if (!(codePoint > 0 && codePoint <= 0x10ffff) || isSurrogate) {
      // Not a usable Unicode scalar value: keep the reference as written.
      return match;
    }
    // Numeric no-break space gets the same plain space as `&nbsp;`.
    return codePoint === 0xa0 ? ' ' : String.fromCodePoint(codePoint);
  });
}
/**
 * Reduce an HTML fragment to readable plain text.
 *
 * Script, style and noscript elements are dropped with their content, `<br>`
 * and the closing tags of common block elements become line breaks, and all
 * remaining tags are removed. Character references are decoded only AFTER the
 * markup has been removed, so escaped text such as `1 &lt; 2 and 3 &gt; 2` or
 * `&lt;b&gt;` survives as literal text instead of being mistaken for a tag.
 *
 * @param {unknown} value - HTML to convert; falsy values are treated as ''.
 * @returns {string} Trimmed text with runs of spaces/tabs collapsed and each
 *   line break followed directly by content.
 */
function stripTags(value) {
  const withoutMarkup = String(value || '')
    .replace(/<script[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[\s\S]*?<\/style>/gi, ' ')
    .replace(/<noscript[\s\S]*?<\/noscript>/gi, ' ')
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<\/(p|div|li|h[1-6]|tr)>/gi, '\n')
    .replace(/<[^>]+>/g, ' ');

  return decodeHtml(withoutMarkup)
    .replace(/[ \t]+/g, ' ')
    // Also swallows blank lines: a newline plus any following whitespace
    // (including further newlines) collapses to a single newline.
    .replace(/\n\s+/g, '\n')
    .trim();
}
/**
 * Report whether a value parses as an absolute URL using http or https.
 *
 * @param {unknown} value - Candidate URL.
 * @returns {boolean} `true` for a parseable http(s) URL, `false` for any other
 *   scheme or for input the URL parser rejects.
 */
function isHttpUrl(value) {
  let protocol;
  try {
    ({ protocol } = new URL(value));
  } catch {
    // Unparseable input is simply "not an http URL".
    return false;
  }
  return ['http:', 'https:'].includes(protocol);
}
/**
 * Build the Bing search-results URL for a query.
 *
 * The interface language (`setlang`) and market (`mkt`) are both fixed to
 * `zh-CN`.
 *
 * @param {string} query - Search terms; encoded as the `q` parameter.
 * @returns {string} Absolute URL of the results page.
 */
function buildBingSearchUrl(query) {
  const endpoint = new URL('https://www.bing.com/search');
  const parameters = [
    ['q', query],
    ['setlang', 'zh-CN'],
    ['mkt', 'zh-CN'],
  ];
  for (const [key, value] of parameters) {
    endpoint.searchParams.set(key, value);
  }
  return endpoint.href;
}
/**
 * Download a URL and return its body as text, following redirects.
 *
 * One timer bounds the whole operation (connecting, headers and reading the
 * body). The HTTP status is checked before the body is read, so an error
 * response is reported by its status without first downloading its payload.
 *
 * @param {string} url - Address to request.
 * @param {string} [language] - Accept-Language header value; 'zh-CN' when empty.
 * @param {number} [timeoutMs=DEFAULT_TIMEOUT_MS] - Time budget in milliseconds.
 * @returns {Promise<{text: string, contentType: string, finalUrl: string, status: number}>}
 *   Body text, the Content-Type header ('' when absent), the URL after
 *   redirects, and the HTTP status code.
 * @throws {Error} `HTTP <status> <statusText>` for a non-2xx response,
 *   `Request timed out after <n> ms: <url>` when the time budget is exceeded
 *   (the underlying abort error is attached as `cause`), or whatever error the
 *   underlying `fetch` raised.
 */
async function fetchText(url, language, timeoutMs = DEFAULT_TIMEOUT_MS) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const response = await fetch(url, {
      redirect: 'follow',
      signal: controller.signal,
      headers: {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml,application/xml,application/json;q=0.9,text/plain;q=0.8,*/*;q=0.7',
        'Accept-Language': language || 'zh-CN',
      },
    });
    if (!response.ok) {
      // Release the connection without downloading the error page. Not
      // awaited on purpose: the status error below must not be delayed or
      // replaced by a failure while discarding the body.
      void response.body?.cancel().catch(() => {});
      throw new Error(`HTTP ${response.status} ${response.statusText}`);
    }
    const contentType = response.headers.get('content-type') || '';
    const text = await response.text();
    return { text, contentType, finalUrl: response.url, status: response.status };
  } catch (error) {
    if (controller.signal.aborted) {
      // Only the timer above aborts this controller, so an aborted signal
      // means the time budget ran out.
      throw new Error(`Request timed out after ${timeoutMs} ms: ${url}`, { cause: error });
    }
    throw error;
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Turn an `href` taken from a Bing results page into the real destination URL.
 *
 * Bing click-tracking links (`/ck/a` on bing.com or one of its subdomains)
 * carry the destination in the `u` query parameter, either directly or as
 * `a1` followed by the base64-encoded URL. Those are unwrapped; every other
 * link is returned as-is.
 *
 * The host must be exactly `bing.com` or end in `.bing.com`. A plain suffix
 * match would also treat unrelated hosts such as `notbing.com` as Bing.
 *
 * @param {string} rawUrl - Attribute value as found in the HTML (may contain
 *   character references such as `&amp;`).
 * @returns {string} The destination URL; the tracking URL itself when its
 *   target is not a valid http(s) URL; or '' when the input is empty,
 *   unparseable, or not http(s).
 */
function normalizeBingUrl(rawUrl) {
  const url = decodeHtml(rawUrl).trim();
  if (!url) return '';

  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    // Relative or otherwise unparseable links cannot be followed.
    return '';
  }

  const { hostname } = parsed;
  const isBingHost = hostname === 'bing.com' || hostname.endsWith('.bing.com');
  if (isBingHost && parsed.pathname === '/ck/a') {
    const target = parsed.searchParams.get('u');
    if (target) {
      const decoded = target.startsWith('a1')
        ? Buffer.from(target.slice(2), 'base64').toString('utf8')
        : target;
      return isHttpUrl(decoded) ? decoded : url;
    }
  }
  return isHttpUrl(url) ? url : '';
}
/**
 * Extract organic results from the HTML of a Bing results page.
 *
 * Each `<li class="b_algo">` item contributes one result, taken from the link
 * inside its `<h2>` heading and its first `<p>` paragraph. Items without a
 * heading link, with an unusable URL, or repeating an already seen URL are
 * skipped.
 *
 * @param {string} html - Markup of the results page.
 * @param {number} count - Maximum number of results to return.
 * @returns {Array<{title: string, url: string, snippet: string, sourceName: string}>}
 *   Results in page order; `sourceName` is the host name without a leading
 *   `www.`.
 */
function parseBingResults(html, count) {
  const headingLink = /<h2[^>]*>[\s\S]*?<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>[\s\S]*?<\/h2>/i;
  const firstParagraph = /<p[^>]*>([\s\S]*?)<\/p>/i;

  // Host name of an address without a leading "www.", or '' if unparseable.
  const hostLabel = (address) => {
    try {
      return new URL(address).hostname.replace(/^www\./, '');
    } catch {
      return '';
    }
  };

  const collected = [];
  const visited = new Set();
  const entries = html.match(/<li class="b_algo"[\s\S]*?<\/li>/gi) ?? [];

  for (const entry of entries) {
    if (collected.length >= count) break;

    const heading = headingLink.exec(entry);
    if (heading === null) continue;
    const [, href, anchorHtml] = heading;

    const url = normalizeBingUrl(href);
    if (!url || visited.has(url)) continue;
    visited.add(url);

    const paragraph = firstParagraph.exec(entry);
    collected.push({
      title: stripTags(anchorHtml),
      url,
      snippet: paragraph === null ? '' : stripTags(paragraph[1]),
      sourceName: hostLabel(url),
    });
  }
  return collected;
}
/**
 * Extract a page title and a length-limited plain-text rendering from HTML.
 *
 * The document is converted to text once and that result is used both for the
 * returned excerpt and for the truncation flag.
 *
 * @param {string} html - Markup of the page.
 * @param {number} maxChars - Maximum length of the returned text.
 * @returns {{title: string, text: string, truncated: boolean}} The content of
 *   the first `<title>` element ('' when there is none), the first `maxChars`
 *   characters of the page text, and whether the page text was longer than
 *   that.
 */
function summarizeHtml(html, maxChars) {
  const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  const title = titleMatch ? stripTags(titleMatch[1]) : '';
  const fullText = stripTags(html);
  return {
    title,
    text: fullText.slice(0, maxChars),
    truncated: fullText.length > maxChars,
  };
}
/**
 * Run a web search and return the parsed results.
 *
 * @param {{query: string, language: string, count: number}} args - Parsed
 *   options: the search terms, the Accept-Language value for the request, and
 *   the maximum number of results.
 * @returns {Promise<{status: string, mode: string, query: string, provider: string, fetchedAt: string, results: Array}>}
 *   Result envelope; `fetchedAt` is an ISO-8601 timestamp.
 * @throws {Error} When the query is empty, or when the request fails.
 */
async function search(args) {
  const { query, language, count } = args;
  if (!query) {
    throw new Error('Search query is required.');
  }

  const page = await fetchText(buildBingSearchUrl(query), language);
  const fetchedAt = new Date().toISOString();
  const results = parseBingResults(page.text, count);

  return {
    status: 'ok',
    mode: 'search',
    query,
    provider: 'bing-html',
    fetchedAt,
    results,
  };
}
/**
 * Fetch a single page and return a length-limited text rendering of it.
 *
 * How the body is rendered depends on the response Content-Type:
 * - `application/json`: pretty-printed JSON, or the raw body if it does not
 *   parse;
 * - `text/plain`: the body as received;
 * - anything else: treated as markup and passed through `summarizeHtml`, which
 *   also supplies the title.
 *
 * `truncated` describes the rendered text that `text` was cut from, not the
 * raw response, so a short page wrapped in a lot of markup is not reported as
 * truncated.
 *
 * @param {{fetchUrl: string, language: string, maxChars: number}} args -
 *   Parsed options: the page URL, the Accept-Language value, and the maximum
 *   length of the returned text.
 * @returns {Promise<{status: string, mode: string, url: string, finalUrl: string, httpStatus: number, contentType: string, title: string, fetchedAt: string, text: string, truncated: boolean}>}
 *   Result envelope; `fetchedAt` is an ISO-8601 timestamp.
 * @throws {Error} When `fetchUrl` is not an http(s) URL, or when the request
 *   fails.
 */
async function fetchPage(args) {
  if (!isHttpUrl(args.fetchUrl)) {
    throw new Error('A valid http(s) --fetch-url is required.');
  }
  const fetched = await fetchText(args.fetchUrl, args.language);
  const lowerType = fetched.contentType.toLowerCase();
  const isJson = lowerType.includes('application/json');

  let title = '';
  let text;
  let truncated;
  if (isJson || lowerType.includes('text/plain')) {
    let rendered = fetched.text;
    if (isJson) {
      try {
        rendered = JSON.stringify(JSON.parse(fetched.text), null, 2);
      } catch {
        // Deliberate fallback: a server can label a non-JSON body as JSON.
        // Returning the raw body is more useful than failing the fetch.
      }
    }
    text = rendered.slice(0, args.maxChars);
    truncated = rendered.length > args.maxChars;
  } else {
    // A single summary supplies the title, the text and the truncation flag.
    ({ title, text, truncated } = summarizeHtml(fetched.text, args.maxChars));
  }

  return {
    status: 'ok',
    mode: 'fetch',
    url: args.fetchUrl,
    finalUrl: fetched.finalUrl,
    httpStatus: fetched.status,
    contentType: fetched.contentType,
    title: title || '',
    fetchedAt: new Date().toISOString(),
    text,
    truncated,
  };
}
/**
 * Command-line entry point: parse the arguments, run fetch mode when
 * `--fetch-url` was given and search mode otherwise, then emit the result as
 * pretty-printed JSON on stdout. When an output path was given, the same JSON
 * is written to that file first.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const options = parseArgs(process.argv.slice(2));

  let outcome;
  if (options.fetchUrl) {
    outcome = await fetchPage(options);
  } else {
    outcome = await search(options);
  }

  const serialized = `${JSON.stringify(outcome, null, 2)}\n`;
  if (options.output) {
    await writeFile(options.output, serialized, 'utf8');
  }
  process.stdout.write(serialized);
}
// Run the CLI. Any failure is reported on stdout as a JSON object with
// status "error", so output stays machine-readable, and the process exit code
// is set to 1.
main().catch((failure) => {
  const message = failure instanceof Error ? failure.message : String(failure);
  const report = {
    status: 'error',
    error: message,
    fetchedAt: new Date().toISOString(),
  };
  process.stdout.write(`${JSON.stringify(report, null, 2)}\n`);
  process.exitCode = 1;
});