247 lines
7.6 KiB
JavaScript
247 lines
7.6 KiB
JavaScript
#!/usr/bin/env node
|
|
import { writeFile } from 'node:fs/promises';
|
|
|
|
/** Number of search results used when --count is absent or not a number. */
const DEFAULT_COUNT = 6;

/** Upper bound that a requested result count is clamped to. */
const MAX_COUNT = 10;

/** Default request timeout, in milliseconds, for fetchText(). */
const DEFAULT_TIMEOUT_MS = 9000;

/** Text length used in fetch mode when --max-chars is absent or not a number. */
const DEFAULT_MAX_CHARS = 4000;

/** Upper bound that a requested --max-chars value is clamped to. */
const MAX_CHARS = 10000;

/** Value sent in the User-Agent request header. */
const USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123 Safari/537.36 ZhinianAssistant/0.1';
|
|
|
|
/**
 * Parses the command-line arguments of the tool.
 *
 * Recognised options may be written as `--name value` or, for long options,
 * as `--name=value`. Every argument that is not a recognised option is treated
 * as part of the search query; the pieces are joined with single spaces.
 * `--help` / `-h` prints the usage text and terminates the process with code 0.
 *
 * @param {string[]} argv - Arguments without the node binary and script path.
 * @returns {{query: string, count: number, fetchUrl: string, maxChars: number,
 *   output: string, language: string}} Normalised options. `count` is clamped
 *   to 1..MAX_COUNT and `maxChars` to 500..MAX_CHARS; values that are not
 *   numbers fall back to the defaults.
 */
function parseArgs(argv) {
  const args = {
    query: '',
    count: DEFAULT_COUNT,
    fetchUrl: '',
    maxChars: DEFAULT_MAX_CHARS,
    output: '',
    language: 'zh-CN',
  };
  const positional = [];

  for (let index = 0; index < argv.length; index += 1) {
    const raw = argv[index];

    // Split `--name=value` into its two halves; short flags and plain words
    // are left untouched so they keep their previous meaning.
    let name = raw;
    let inlineValue;
    const equalsAt = raw.startsWith('--') ? raw.indexOf('=') : -1;
    if (equalsAt > 2) {
      name = raw.slice(0, equalsAt);
      inlineValue = raw.slice(equalsAt + 1);
    }

    // Returns the option's value: the inline part when present, otherwise the
    // next argument (which is consumed), or '' when there is none.
    const takeValue = () => {
      if (inlineValue !== undefined) return inlineValue;
      index += 1;
      return argv[index] || '';
    };

    switch (name) {
      case '--count':
      case '-n':
        args.count = Number.parseInt(takeValue(), 10);
        break;
      case '--fetch-url':
        args.fetchUrl = takeValue();
        break;
      case '--max-chars':
        args.maxChars = Number.parseInt(takeValue(), 10);
        break;
      case '--output':
      case '-o':
        args.output = takeValue();
        break;
      case '--language':
      case '--lang':
        // An empty value keeps the default language.
        args.language = takeValue() || args.language;
        break;
      case '--help':
      case '-h':
        printHelp();
        process.exit(0);
        break;
      default:
        // Unknown tokens (including unknown `--x=y`) belong to the query verbatim.
        positional.push(raw);
    }
  }

  args.query = positional.join(' ').trim();
  args.count = Number.isFinite(args.count)
    ? Math.max(1, Math.min(MAX_COUNT, args.count))
    : DEFAULT_COUNT;
  args.maxChars = Number.isFinite(args.maxChars)
    ? Math.max(500, Math.min(MAX_CHARS, args.maxChars))
    : DEFAULT_MAX_CHARS;
  return args;
}
|
|
|
|
/**
 * Writes the usage text to stdout.
 *
 * The option table is built from [flags, description] pairs so the
 * description column stays aligned, and it lists every flag that the
 * argument parser accepts (including the `--lang` alias and `--help`).
 *
 * @returns {void}
 */
function printHelp() {
  const options = [
    ['--count, -n', `Number of search results to return, 1-${MAX_COUNT}.`],
    ['--fetch-url', 'Fetch and summarize a single page URL.'],
    ['--max-chars', `Maximum returned text length in fetch mode, 500-${MAX_CHARS}.`],
    ['--output, -o', 'Write JSON result to a file in addition to stdout.'],
    ['--language, --lang', 'Accept-Language value. Default: zh-CN.'],
    ['--help, -h', 'Show this help and exit.'],
  ];
  const flagWidth = Math.max(...options.map(([flags]) => flags.length));
  const optionLines = options.map(
    ([flags, description]) => `  ${flags.padEnd(flagWidth)}  ${description}`,
  );

  const lines = [
    'Usage:',
    '  node search-web.mjs "<query>" [--count 6] [--output <file>]',
    '  node search-web.mjs --fetch-url "<url>" [--max-chars 4000] [--output <file>]',
    '',
    'Options:',
    ...optionLines,
    '', // makes the output end with a newline
  ];
  process.stdout.write(lines.join('\n'));
}
|
|
|
|
/** Named character references understood by decodeHtml(). */
const HTML_NAMED_ENTITIES = new Map([
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', ' '],
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
]);

/**
 * Decodes HTML character references in a string.
 *
 * Handles the named references quot, apos, nbsp, amp, lt and gt plus any
 * decimal (`&#39;`) or hexadecimal (`&#x27;`) numeric reference. Non-breaking
 * spaces become ordinary spaces. Unknown names and out-of-range code points
 * are left exactly as written.
 *
 * The text is decoded in a single pass, so the output of one replacement is
 * never decoded again: `&amp;lt;` yields the literal text `&lt;`, not `<`.
 *
 * @param {*} value - Text to decode; falsy values are treated as ''.
 * @returns {string} The decoded text.
 */
function decodeHtml(value) {
  const referencePattern = /&(#[xX][0-9a-fA-F]+|#[0-9]+|[a-zA-Z][a-zA-Z0-9]*);/g;
  return String(value || '').replace(referencePattern, (entity, body) => {
    if (body[0] !== '#') {
      return HTML_NAMED_ENTITIES.get(body) ?? entity;
    }
    const isHex = body[1] === 'x' || body[1] === 'X';
    const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    if (codePoint === 0xa0) return ' '; // same treatment as &nbsp;
    const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
    const isValid = codePoint > 0 && codePoint <= 0x10ffff && !isSurrogate;
    return isValid ? String.fromCodePoint(codePoint) : entity;
  });
}
|
|
|
|
/**
 * Converts an HTML fragment to plain text.
 *
 * Script, style and noscript elements and HTML comments are dropped together
 * with their content; `<br>` and the closing tags of p, div, li, h1-h6 and tr
 * become line breaks; every other tag becomes a space. Character references
 * are decoded only AFTER the markup has been removed, so escaped text such as
 * `1 &lt; 2 and 3 &gt; 2` or `&lt;b&gt;` is kept as text instead of being
 * mistaken for a tag and deleted. Runs of blanks are collapsed at the end.
 *
 * @param {*} value - HTML text; falsy values are treated as ''.
 * @returns {string} Trimmed plain text.
 */
function stripTags(value) {
  const withoutMarkup = String(value || '')
    .replace(/<script\b[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style\b[\s\S]*?<\/style>/gi, ' ')
    .replace(/<noscript\b[\s\S]*?<\/noscript>/gi, ' ')
    .replace(/<!--[\s\S]*?-->/g, ' ')
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<\/(p|div|li|h[1-6]|tr)>/gi, '\n')
    .replace(/<[^>]+>/g, ' ');

  return decodeHtml(withoutMarkup)
    .replace(/[ \t]+/g, ' ')
    .replace(/\n\s+/g, '\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
|
|
|
|
/**
 * Tells whether a value parses as an absolute URL with the http or https scheme.
 *
 * @param {*} value - Candidate URL; anything `new URL()` rejects yields false.
 * @returns {boolean} True only for parseable http:/https: URLs.
 */
function isHttpUrl(value) {
  let protocol;
  try {
    ({ protocol } = new URL(value));
  } catch {
    // Not parseable as an absolute URL.
    return false;
  }
  return ['http:', 'https:'].includes(protocol);
}
|
|
|
|
/**
 * Builds the Bing web-search URL for a query.
 *
 * @param {string} query - Search terms; encoded by URLSearchParams.
 * @param {string} [language='zh-CN'] - Language for the `setlang` and `mkt`
 *   parameters. An Accept-Language style list such as `en-US,en;q=0.9` is
 *   accepted; only its first tag is used. Empty values fall back to zh-CN.
 * @returns {string} The absolute search URL.
 */
function buildBingSearchUrl(query, language = 'zh-CN') {
  // Reduce "en-US,en;q=0.9" to "en-US": the URL parameters take a single tag.
  const firstTag = String(language || '').split(',')[0].split(';')[0].trim();
  const market = firstTag || 'zh-CN';

  const url = new URL('https://www.bing.com/search');
  url.searchParams.set('q', query);
  url.searchParams.set('setlang', market);
  url.searchParams.set('mkt', market);
  return url.toString();
}
|
|
|
|
/**
 * Downloads a URL and returns its body as text.
 *
 * Redirects are followed. The request, including reading the body, is aborted
 * once `timeoutMs` has elapsed. The status is checked before the body is read
 * so that error pages are not buffered needlessly.
 *
 * @param {string} url - Address to request.
 * @param {string} [language] - Accept-Language header value; 'zh-CN' if empty.
 * @param {number} [timeoutMs=DEFAULT_TIMEOUT_MS] - Time limit in milliseconds.
 * @returns {Promise<{text: string, contentType: string, finalUrl: string,
 *   status: number}>} Body text, Content-Type header ('' when absent), the URL
 *   after redirects and the HTTP status code.
 * @throws {Error} `HTTP <status> <statusText>` for non-2xx responses, a
 *   "Request timed out" error when the time limit is hit, or the underlying
 *   fetch error otherwise.
 */
async function fetchText(url, language, timeoutMs = DEFAULT_TIMEOUT_MS) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const response = await fetch(url, {
      redirect: 'follow',
      signal: controller.signal,
      headers: {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml,application/xml,application/json;q=0.9,text/plain;q=0.8,*/*;q=0.7',
        'Accept-Language': language || 'zh-CN',
      },
    });
    if (!response.ok) {
      // Best effort: discard the unread error body; a failure to do so must
      // not mask the HTTP error reported below.
      await response.body?.cancel().catch(() => {});
      throw new Error(`HTTP ${response.status} ${response.statusText}`);
    }
    const contentType = response.headers.get('content-type') || '';
    const text = await response.text();
    return { text, contentType, finalUrl: response.url, status: response.status };
  } catch (error) {
    // The signal is only ever aborted by the timer above, so an aborted
    // signal means the time limit was reached.
    if (controller.signal.aborted) {
      throw new Error(`Request timed out after ${timeoutMs} ms: ${url}`, { cause: error });
    }
    throw error;
  } finally {
    clearTimeout(timer);
  }
}
|
|
|
|
/**
 * Turns an href taken from a Bing result page into the destination URL.
 *
 * Bing click-tracking links (`/ck/a` on bing.com or one of its subdomains)
 * carry the destination in the `u` query parameter, either verbatim or as
 * `a1` followed by base64 text; such links are unwrapped. Any other href is
 * returned unchanged when it is an http(s) URL.
 *
 * @param {string} rawUrl - The href attribute value, possibly HTML-escaped.
 * @returns {string} An http(s) URL, or '' when the input is empty, cannot be
 *   parsed as an absolute URL, or is not http(s).
 */
function normalizeBingUrl(rawUrl) {
  const url = decodeHtml(rawUrl).trim();
  if (!url) return '';

  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    // Relative or malformed href.
    return '';
  }

  // Match bing.com itself and real subdomains only; a plain suffix test would
  // also accept unrelated hosts such as "notbing.com".
  const { hostname } = parsed;
  const isBingHost = hostname === 'bing.com' || hostname.endsWith('.bing.com');
  if (isBingHost && parsed.pathname === '/ck/a') {
    const target = parsed.searchParams.get('u');
    if (target) {
      const decoded = target.startsWith('a1')
        ? Buffer.from(target.slice(2), 'base64').toString('utf8')
        : target;
      if (isHttpUrl(decoded)) return decoded;
    }
  }

  // Not a tracking link, or its target was unusable: keep the link itself.
  return isHttpUrl(url) ? url : '';
}
|
|
|
|
/**
 * Extracts organic search results from a Bing result page.
 *
 * A result is an `<li>` whose class list contains `b_algo`. Its first link
 * inside an `<h2>` supplies the URL and title, and its first `<p>` element
 * supplies the snippet. Results whose URL cannot be normalised, or was already
 * seen, are skipped.
 *
 * @param {string} html - Markup of the result page.
 * @param {number} count - Maximum number of results to return.
 * @returns {Array<{title: string, url: string, snippet: string,
 *   sourceName: string}>} Results in page order; `sourceName` is the URL's
 *   host name without a leading "www.".
 */
function parseBingResults(html, count) {
  // Accept extra classes and attributes around the b_algo marker.
  const blockPattern = /<li\b[^>]*\sclass="(?:[^"]*\s)?b_algo(?:\s[^"]*)?"[^>]*>[\s\S]*?<\/li>/gi;
  // href may be quoted with either quote character.
  const linkPattern = /<h2\b[^>]*>[\s\S]*?<a\b[^>]*?\shref=(["'])([\s\S]+?)\1[^>]*>([\s\S]*?)<\/a>[\s\S]*?<\/h2>/i;
  // \b keeps <pre>, <path>, <picture> ... from being taken for a paragraph.
  const snippetPattern = /<p\b[^>]*>([\s\S]*?)<\/p>/i;

  const results = [];
  const seen = new Set();
  const blocks = String(html || '').match(blockPattern) || [];

  for (const block of blocks) {
    if (results.length >= count) break;

    const linkMatch = block.match(linkPattern);
    if (!linkMatch) continue;

    const url = normalizeBingUrl(linkMatch[2]);
    if (!url || seen.has(url)) continue;
    seen.add(url);

    const title = stripTags(linkMatch[3]);
    const snippetMatch = block.match(snippetPattern);
    const snippet = snippetMatch ? stripTags(snippetMatch[1]) : '';

    let sourceName = '';
    try {
      sourceName = new URL(url).hostname.replace(/^www\./, '');
    } catch {
      // Leave the source name empty if the URL cannot be parsed.
      sourceName = '';
    }

    results.push({ title, url, snippet, sourceName });
  }
  return results;
}
|
|
|
|
/**
 * Reduces an HTML document to its title and a length-limited plain-text body.
 *
 * The document is converted to text once; both the returned text and the
 * truncation flag are derived from that single result.
 *
 * @param {string} html - Document markup.
 * @param {number} maxChars - Maximum length of the returned text.
 * @returns {{title: string, text: string, truncated: boolean}} `title` is the
 *   plain text of the first `<title>` element ('' if there is none), `text`
 *   the first `maxChars` characters of the document text, and `truncated`
 *   whether the document text was longer than `maxChars`.
 */
function summarizeHtml(html, maxChars) {
  const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  const title = titleMatch ? stripTags(titleMatch[1]) : '';
  const fullText = stripTags(html);
  return {
    title,
    text: fullText.slice(0, maxChars),
    truncated: fullText.length > maxChars,
  };
}
|
|
|
|
/**
 * Runs a web search and returns the parsed results.
 *
 * The requested language is passed both to the request (Accept-Language) and
 * to the search URL builder, so the language option also influences the
 * search itself when the builder supports a language argument.
 *
 * @param {{query: string, count: number, language: string}} args - Parsed
 *   command-line options.
 * @returns {Promise<{status: string, mode: string, query: string,
 *   provider: string, fetchedAt: string, results: Array<object>}>} The search
 *   report; `fetchedAt` is an ISO-8601 timestamp.
 * @throws {Error} When the query is empty, or when the download fails.
 */
async function search(args) {
  const { query, count, language } = args;
  if (!query) {
    throw new Error('Search query is required.');
  }

  const url = buildBingSearchUrl(query, language);
  const fetched = await fetchText(url, language);
  return {
    status: 'ok',
    mode: 'search',
    query,
    provider: 'bing-html',
    fetchedAt: new Date().toISOString(),
    results: parseBingResults(fetched.text, count),
  };
}
|
|
|
|
/**
 * Downloads a single page and returns a length-limited text rendering of it.
 *
 * - `application/json` bodies are pretty-printed; a body that does not parse
 *   as JSON is returned as received instead of failing the whole request.
 * - `text/plain` bodies are returned as received.
 * - Everything else is converted to text with summarizeHtml(); the title is
 *   reported only for `text/html` responses.
 *
 * `truncated` describes the returned text itself: it is true exactly when the
 * full rendering was longer than `args.maxChars`.
 *
 * @param {{fetchUrl: string, language: string, maxChars: number}} args -
 *   Parsed command-line options.
 * @returns {Promise<{status: string, mode: string, url: string,
 *   finalUrl: string, httpStatus: number, contentType: string, title: string,
 *   fetchedAt: string, text: string, truncated: boolean}>} The fetch report.
 * @throws {Error} When `fetchUrl` is not an http(s) URL, or the download fails.
 */
async function fetchPage(args) {
  if (!isHttpUrl(args.fetchUrl)) {
    throw new Error('A valid http(s) --fetch-url is required.');
  }

  const { maxChars } = args;
  const fetched = await fetchText(args.fetchUrl, args.language);
  const lowerType = fetched.contentType.toLowerCase();
  const isJson = lowerType.includes('application/json');

  let title = '';
  let text;
  let truncated;
  if (isJson || lowerType.includes('text/plain')) {
    let full = fetched.text;
    if (isJson) {
      try {
        full = JSON.stringify(JSON.parse(fetched.text), null, 2);
      } catch {
        // Mislabelled or malformed JSON: fall back to the raw body.
        full = fetched.text;
      }
    }
    text = full.slice(0, maxChars);
    truncated = full.length > maxChars;
  } else {
    // One pass yields the text, its truncation state and the title.
    const summary = summarizeHtml(fetched.text, maxChars);
    text = summary.text;
    truncated = summary.truncated;
    if (lowerType.includes('text/html')) {
      title = summary.title || '';
    }
  }

  return {
    status: 'ok',
    mode: 'fetch',
    url: args.fetchUrl,
    finalUrl: fetched.finalUrl,
    httpStatus: fetched.status,
    contentType: fetched.contentType,
    title,
    fetchedAt: new Date().toISOString(),
    text,
    truncated,
  };
}
|
|
|
|
/**
 * Program entry point: parses the command line, runs fetch mode when a
 * --fetch-url was given and search mode otherwise, then emits the result as
 * pretty-printed JSON on stdout (and to the --output file, when one is set).
 *
 * @returns {Promise<void>}
 */
async function main() {
  const options = parseArgs(process.argv.slice(2));

  let report;
  if (options.fetchUrl) {
    report = await fetchPage(options);
  } else {
    report = await search(options);
  }

  const serialized = `${JSON.stringify(report, null, 2)}\n`;
  if (options.output) {
    // The file is written before stdout.
    await writeFile(options.output, serialized, 'utf8');
  }
  process.stdout.write(serialized);
}
|
|
|
|
// Start the program. Any failure is reported as a JSON object on stdout and
// signalled to the caller through a non-zero exit code.
main().catch((error) => {
  const message = error instanceof Error ? error.message : String(error);
  const failure = {
    status: 'error',
    error: message,
    fetchedAt: new Date().toISOString(),
  };
  process.stdout.write(`${JSON.stringify(failure, null, 2)}\n`);
  process.exitCode = 1;
});
|