import type { IncomingMessage, ServerResponse } from 'node:http';
import crypto from 'node:crypto';
import { basename, extname, join } from 'node:path';
import { mkdir, readFile, stat, writeFile, copyFile, rm } from 'node:fs/promises';
import type { HostApiContext } from '../context';
import { parseJsonBody, sendJson } from '../route-utils';
import { getDataDir } from '../../utils/paths';

/** Root directory under which every workspace keeps its knowledge files and registry. */
const KNOWLEDGE_ROOT = join(getDataDir(), 'yinian', 'knowledge');

/** Files larger than this are rejected at import time (20 MiB). */
const MAX_KNOWLEDGE_FILE_BYTES = 20 * 1024 * 1024;

/** Upper bound on the characters a single document contributes to a context. */
const MAX_CONTEXT_CHARS_PER_FILE = 32_000;

/** Upper bound on the document characters across one whole context. */
const MAX_CONTEXT_TOTAL_CHARS = 96_000;

/** Lower-cased file extension -> MIME type for formats that are read as UTF-8 text. */
const TEXT_MIME_BY_EXT: Record<string, string> = {
  '.txt': 'text/plain',
  '.md': 'text/markdown',
  '.markdown': 'text/markdown',
  '.csv': 'text/csv',
  '.tsv': 'text/tab-separated-values',
  '.json': 'application/json',
  '.jsonl': 'application/x-ndjson',
  '.xml': 'application/xml',
  '.html': 'text/html',
  '.htm': 'text/html',
  '.yaml': 'application/yaml',
  '.yml': 'application/yaml',
  '.log': 'text/plain',
  '.ini': 'text/plain',
  '.conf': 'text/plain',
  '.css': 'text/css',
  '.js': 'text/javascript',
  '.jsx': 'text/javascript',
  '.ts': 'text/typescript',
  '.tsx': 'text/typescript',
  '.py': 'text/x-python',
  '.sql': 'application/sql',
};

/** Lower-cased file extension -> MIME type for Word formats whose text is extracted on import. */
const WORD_MIME_BY_EXT: Record<string, string> = {
  '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
};

/** One entry of a workspace's `registry.json`. */
export interface KnowledgeDocument {
  /** Random UUID assigned at import time; also the stem of the stored file name. */
  id: string;
  /** Sanitized workspace id the document belongs to. */
  workspaceId: string;
  /** Base name of the imported file. */
  name: string;
  /** MIME type derived from the file extension. */
  mimeType: string;
  /** Size in bytes of the source file, as reported by `stat` at import time. */
  size: number;
  /** Path of the copy kept under the workspace's `files` directory. */
  storedPath: string;
  /** Path of the extracted plain text; only set for `.docx` imports. */
  textPath?: string;
  /** Path the file was imported from. */
  originalPath?: string;
  /** `Date.now()` timestamp taken when the document was imported. */
  importedAt: number;
  status: 'stored';
}

/**
 * Turns an arbitrary workspace id into a safe single directory name.
 *
 * Every character outside `[a-zA-Z0-9_-]` becomes `_` (so path separators and
 * dots cannot escape the knowledge root) and the result is capped at 96 chars.
 * Missing, blank, or non-string input yields `'default'`.
 */
function sanitizeWorkspaceId(workspaceId?: string): string {
  // The value may come straight from an untyped JSON body, so do not trust the static type.
  const trimmed = typeof workspaceId === 'string' ? workspaceId.trim() : '';
  const value = trimmed || 'default';
  return value.replace(/[^a-zA-Z0-9_-]/g, '_').slice(0, 96) || 'default';
}

/** Returns the directory that holds the given workspace's knowledge data. */
function getWorkspaceDir(workspaceId?: string): string {
  return join(KNOWLEDGE_ROOT, sanitizeWorkspaceId(workspaceId));
}

/** Returns the path of the given workspace's `registry.json`. */
function getRegistryPath(workspaceId?: string): string {
  return join(getWorkspaceDir(workspaceId), 'registry.json');
}

/**
 * Loads the workspace registry.
 *
 * Best-effort by design: an unreadable, empty, malformed, or non-array
 * registry file is treated as "no documents" rather than an error.
 */
async function readRegistry(workspaceId?: string): Promise<KnowledgeDocument[]> {
  const raw = await readFile(getRegistryPath(workspaceId), 'utf8').catch(() => '');
  if (!raw.trim()) return [];
  try {
    const parsed = JSON.parse(raw) as unknown;
    return Array.isArray(parsed) ? (parsed as KnowledgeDocument[]) : [];
  } catch {
    return [];
  }
}

/** Writes the full registry for a workspace, creating the workspace directory if needed. */
async function writeRegistry(
  workspaceId: string | undefined,
  docs: KnowledgeDocument[],
): Promise<void> {
  await mkdir(getWorkspaceDir(workspaceId), { recursive: true });
  await writeFile(getRegistryPath(workspaceId), JSON.stringify(docs, null, 2), 'utf8');
}

/** Returns the MIME type for a supported file extension, or `null` when the file type is not supported. */
function getTextMimeType(filePath: string): string | null {
  const ext = extname(filePath).toLowerCase();
  return TEXT_MIME_BY_EXT[ext] ?? WORD_MIME_BY_EXT[ext] ?? null;
}

/** True when the path has a `.docx` extension (case-insensitive). */
function isDocx(filePath: string): boolean {
  return extname(filePath).toLowerCase() === '.docx';
}

/** Extracts the raw text of a `.docx` file via `mammoth` (loaded lazily) and trims it. */
async function extractDocxText(filePath: string): Promise<string> {
  const mammoth = await import('mammoth');
  const result = await mammoth.extractRawText({ path: filePath });
  return result.value.trim();
}

/** Deletes a file if it exists; failures are ignored because this is cleanup only. */
async function removeQuietly(path: string): Promise<void> {
  await rm(path, { force: true }).catch(() => undefined);
}

/**
 * Copies the given files into the workspace's knowledge store and registers them.
 *
 * Each path is handled independently: unsupported, missing, oversized,
 * uncopyable, or unparseable files are reported in `rejected` and never abort
 * the rest of the batch. A rejected file leaves nothing behind in the store.
 *
 * @param params.workspaceId Workspace to import into; sanitized before use.
 * @param params.filePaths Local paths of the files to import.
 * @returns The newly registered documents and the per-file rejections.
 */
export async function importKnowledgeFiles(params: {
  workspaceId?: string;
  filePaths: string[];
}): Promise<{ documents: KnowledgeDocument[]; rejected: Array<{ filePath: string; reason: string }> }> {
  const workspaceId = sanitizeWorkspaceId(params.workspaceId);
  const workspaceDir = getWorkspaceDir(workspaceId);
  const filesDir = join(workspaceDir, 'files');
  const textDir = join(workspaceDir, 'texts');
  await mkdir(filesDir, { recursive: true });
  await mkdir(textDir, { recursive: true });

  const currentDocs = await readRegistry(workspaceId);
  const importedDocs: KnowledgeDocument[] = [];
  const rejected: Array<{ filePath: string; reason: string }> = [];

  for (const filePath of params.filePaths) {
    // Entries originate from a JSON body; a non-string would make `extname` throw.
    if (typeof filePath !== 'string') {
      rejected.push({ filePath: String(filePath), reason: '文件不存在或不可读取' });
      continue;
    }

    const mimeType = getTextMimeType(filePath);
    if (!mimeType) {
      rejected.push({ filePath, reason: '仅支持文本类知识文件' });
      continue;
    }

    const fileStat = await stat(filePath).catch(() => null);
    if (!fileStat || !fileStat.isFile()) {
      rejected.push({ filePath, reason: '文件不存在或不可读取' });
      continue;
    }
    if (fileStat.size > MAX_KNOWLEDGE_FILE_BYTES) {
      rejected.push({ filePath, reason: '文件超过 20MB 限制' });
      continue;
    }

    const id = crypto.randomUUID();
    const ext = extname(filePath).toLowerCase();
    const storedPath = join(filesDir, `${id}${ext}`);

    // A failed copy must not abort the batch: documents copied earlier in this
    // loop would otherwise never reach the registry.
    try {
      await copyFile(filePath, storedPath);
    } catch {
      await removeQuietly(storedPath);
      rejected.push({ filePath, reason: '文件不存在或不可读取' });
      continue;
    }

    let textPath: string | undefined;
    if (isDocx(filePath)) {
      const extractedTextPath = join(textDir, `${id}.txt`);
      let failureReason: string | undefined;
      try {
        const extractedText = await extractDocxText(storedPath);
        if (extractedText) {
          await writeFile(extractedTextPath, extractedText, 'utf8');
          textPath = extractedTextPath;
        } else {
          failureReason = 'Word 文档未提取到可用文本';
        }
      } catch {
        failureReason = 'Word 文档解析失败,请确认文件为 .docx 格式';
      }

      if (failureReason !== undefined) {
        // The document is not registered, so drop the files created for it.
        await removeQuietly(storedPath);
        await removeQuietly(extractedTextPath);
        rejected.push({ filePath, reason: failureReason });
        continue;
      }
    }

    const doc: KnowledgeDocument = {
      id,
      workspaceId,
      name: basename(filePath),
      mimeType,
      size: fileStat.size,
      storedPath,
      ...(textPath ? { textPath } : {}),
      originalPath: filePath,
      importedAt: Date.now(),
      status: 'stored',
    };
    importedDocs.push(doc);
  }

  if (importedDocs.length > 0) {
    // New documents are placed ahead of the existing entries.
    await writeRegistry(workspaceId, [...importedDocs, ...currentDocs]);
  }

  return { documents: importedDocs, rejected };
}

/**
 * Builds the knowledge-base context text for the selected documents.
 *
 * Documents are taken in registry order. Each contributes at most
 * `MAX_CONTEXT_CHARS_PER_FILE` characters and the loop stops once
 * `MAX_CONTEXT_TOTAL_CHARS` is used up.
 *
 * @param params.workspaceId Workspace whose registry is consulted; sanitized before use.
 * @param params.documentIds Ids of the documents to include; blank and non-string ids are ignored.
 * @returns The assembled context (empty string when nothing is usable), the
 *   documents included, and the ids that are unregistered or have no readable text.
 */
export async function buildKnowledgeContext(params: {
  workspaceId?: string;
  documentIds: string[];
}): Promise<{ context: string; documents: KnowledgeDocument[]; missing: string[] }> {
  const workspaceId = sanitizeWorkspaceId(params.workspaceId);
  const selectedIds = new Set(
    params.documentIds.filter((id) => typeof id === 'string' && id.trim()),
  );
  if (selectedIds.size === 0) {
    return { context: '', documents: [], missing: [] };
  }

  const registry = await readRegistry(workspaceId);
  const docs = registry.filter((doc) => selectedIds.has(doc.id));
  const foundIds = new Set(docs.map((doc) => doc.id));
  const missing = [...selectedIds].filter((id) => !foundIds.has(id));

  const sections: string[] = [];
  const usedDocs: KnowledgeDocument[] = [];
  let totalChars = 0;

  for (const doc of docs) {
    // Prefer the extracted text (.docx imports); otherwise read the stored copy directly.
    const readablePath = doc.textPath || doc.storedPath;
    const raw = await readFile(readablePath, 'utf8').catch(() => '');
    const text = raw.trim();
    if (!text) {
      missing.push(doc.id);
      continue;
    }

    const remaining = MAX_CONTEXT_TOTAL_CHARS - totalChars;
    if (remaining <= 0) break;

    const content = text.slice(0, Math.min(MAX_CONTEXT_CHARS_PER_FILE, remaining));
    totalChars += content.length;
    usedDocs.push(doc);
    sections.push([
      `## ${doc.name}`,
      `类型:${doc.mimeType}`,
      content,
    ].join('\n'));
  }

  if (sections.length === 0) {
    return { context: '', documents: [], missing };
  }

  return {
    context: [
      '[知识库上下文]',
      '用户已选择在本轮对话中使用当前组织空间知识库。以下内容来自智念助手保存的本地备份文件;回答前请优先参考这些内容。',
      ...sections,
    ].join('\n\n'),
    documents: usedDocs,
    missing,
  };
}

/**
 * Dispatches the `/api/knowledge/*` routes.
 *
 * - `GET  /api/knowledge/files`        lists the workspace registry.
 * - `POST /api/knowledge/import-paths` imports local files by path.
 * - `POST /api/knowledge/context`      builds the context for selected documents.
 *
 * @returns `true` when the request matched one of these routes and a response
 *   was sent, `false` when it did not match.
 */
export async function handleKnowledgeRoutes(
  req: IncomingMessage,
  res: ServerResponse,
  url: URL,
  _ctx: HostApiContext,
): Promise<boolean> {
  if (url.pathname === '/api/knowledge/files' && req.method === 'GET') {
    const workspaceId = sanitizeWorkspaceId(url.searchParams.get('workspaceId') ?? undefined);
    const documents = await readRegistry(workspaceId);
    sendJson(res, 200, { documents });
    return true;
  }

  if (url.pathname === '/api/knowledge/import-paths' && req.method === 'POST') {
    try {
      const body = await parseJsonBody<{ workspaceId?: string; filePaths?: string[] }>(req);
      const filePaths = Array.isArray(body.filePaths) ? body.filePaths : [];
      if (filePaths.length === 0) {
        sendJson(res, 400, { success: false, error: 'No files selected' });
        return true;
      }
      const result = await importKnowledgeFiles({ workspaceId: body.workspaceId, filePaths });
      sendJson(res, 200, { success: true, ...result });
    } catch (error) {
      sendJson(res, 500, { success: false, error: String(error) });
    }
    return true;
  }

  if (url.pathname === '/api/knowledge/context' && req.method === 'POST') {
    try {
      const body = await parseJsonBody<{ workspaceId?: string; documentIds?: string[] }>(req);
      const documentIds = Array.isArray(body.documentIds) ? body.documentIds : [];
      const result = await buildKnowledgeContext({ workspaceId: body.workspaceId, documentIds });
      sendJson(res, 200, { success: true, ...result });
    } catch (error) {
      sendJson(res, 500, { success: false, error: String(error) });
    }
    return true;
  }

  return false;
}