Files
NianToB/electron/api/routes/knowledge.ts

272 lines
8.8 KiB
TypeScript

import type { IncomingMessage, ServerResponse } from 'node:http';
import crypto from 'node:crypto';
import { basename, extname, join } from 'node:path';
import { mkdir, readFile, stat, writeFile, copyFile, rm } from 'node:fs/promises';
import type { HostApiContext } from '../context';
import { parseJsonBody, sendJson } from '../route-utils';
import { getDataDir } from '../../utils/paths';
// Root directory for all workspace knowledge bases, located under the directory returned by getDataDir().
const KNOWLEDGE_ROOT = join(getDataDir(), 'yinian', 'knowledge');
// Largest file size (20 * 1024 * 1024 bytes) accepted on import; bigger files are rejected.
const MAX_KNOWLEDGE_FILE_BYTES = 20 * 1024 * 1024;
// Maximum number of characters taken from a single document when building a context.
const MAX_CONTEXT_CHARS_PER_FILE = 32_000;
// Maximum number of document content characters included across all documents of one context.
const MAX_CONTEXT_TOTAL_CHARS = 96_000;
// Accepted plain-text formats: maps a lower-cased file extension (with its leading dot)
// to the MIME type recorded for an imported document. Extensions absent from this table
// (and from WORD_MIME_BY_EXT) are rejected at import time.
const TEXT_MIME_BY_EXT: Record<string, string> = {
'.txt': 'text/plain',
'.md': 'text/markdown',
'.markdown': 'text/markdown',
'.csv': 'text/csv',
'.tsv': 'text/tab-separated-values',
'.json': 'application/json',
'.jsonl': 'application/x-ndjson',
'.xml': 'application/xml',
'.html': 'text/html',
'.htm': 'text/html',
'.yaml': 'application/yaml',
'.yml': 'application/yaml',
'.log': 'text/plain',
'.ini': 'text/plain',
'.conf': 'text/plain',
'.css': 'text/css',
'.js': 'text/javascript',
'.jsx': 'text/javascript',
'.ts': 'text/typescript',
'.tsx': 'text/typescript',
'.py': 'text/x-python',
'.sql': 'application/sql',
};
// Accepted Word formats, keyed like TEXT_MIME_BY_EXT. Files with these extensions are not
// read directly as text; their text is extracted into a separate .txt file during import.
const WORD_MIME_BY_EXT: Record<string, string> = {
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
};
/** Registry entry describing one file imported into a workspace knowledge base. */
export interface KnowledgeDocument {
/** Random UUID generated at import time; also used as the stored file's base name. */
id: string;
/** Sanitized id of the workspace the document belongs to. */
workspaceId: string;
/** Base name of the file the document was imported from. */
name: string;
/** MIME type looked up from the file extension at import time. */
mimeType: string;
/** Size of the source file as reported by `stat` at import time. */
size: number;
/** Path of the copy kept in the workspace's `files` directory. */
storedPath: string;
/** Path of the extracted plain text; only set for `.docx` imports. */
textPath?: string;
/** Path the file was imported from. */
originalPath?: string;
/** Import timestamp taken from `Date.now()`. */
importedAt: number;
/** Lifecycle state; `'stored'` is the only value currently defined. */
status: 'stored';
}
/**
 * Turns a caller-supplied workspace id into a name that is safe to use as a
 * single directory component.
 *
 * Leading/trailing whitespace is removed, every character outside
 * `[a-zA-Z0-9_-]` becomes `_`, and the result is capped at 96 characters.
 *
 * @param workspaceId Raw workspace id; may be omitted or blank.
 * @returns The sanitized id, or `'default'` when no usable id was given.
 */
function sanitizeWorkspaceId(workspaceId?: string): string {
  const fallback = 'default';
  const trimmed = workspaceId?.trim();
  const candidate = trimmed ? trimmed : fallback;
  const safe = candidate.replace(/[^a-zA-Z0-9_-]/g, '_').slice(0, 96);
  return safe === '' ? fallback : safe;
}
/**
 * Resolves the directory that holds one workspace's knowledge base.
 *
 * @param workspaceId Raw workspace id; it is sanitized before being used as a folder name.
 * @returns Path of the workspace folder beneath `KNOWLEDGE_ROOT`.
 */
function getWorkspaceDir(workspaceId?: string): string {
  const folderName = sanitizeWorkspaceId(workspaceId);
  return join(KNOWLEDGE_ROOT, folderName);
}
/**
 * Resolves the path of a workspace's document registry file.
 *
 * @param workspaceId Raw workspace id, forwarded to `getWorkspaceDir`.
 * @returns Path of `registry.json` inside the workspace folder.
 */
function getRegistryPath(workspaceId?: string): string {
  const workspaceDir = getWorkspaceDir(workspaceId);
  return join(workspaceDir, 'registry.json');
}
/**
 * Minimal structural check for a registry entry: a non-null object carrying the
 * string `id` and `storedPath` fields that the rest of this module reads.
 */
function isKnowledgeDocumentEntry(value: unknown): value is KnowledgeDocument {
  if (typeof value !== 'object' || value === null) return false;
  const entry = value as Record<string, unknown>;
  return typeof entry.id === 'string' && typeof entry.storedPath === 'string';
}

/**
 * Loads the document registry of a workspace.
 *
 * Reading is best-effort: a missing, unreadable, blank, or malformed registry
 * file yields an empty list instead of an error. Entries that are not objects
 * with a string `id` and `storedPath` are dropped, so callers never dereference
 * `null` or primitive values taken from a hand-edited or damaged file.
 *
 * @param workspaceId Raw workspace id; sanitized by the path helpers.
 * @returns The usable registry entries, in file order.
 */
async function readRegistry(workspaceId?: string): Promise<KnowledgeDocument[]> {
  const raw = await readFile(getRegistryPath(workspaceId), 'utf8').catch(() => '');
  if (!raw.trim()) return [];

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch {
    // Malformed JSON is treated the same as an empty registry.
    return [];
  }
  return Array.isArray(parsed) ? parsed.filter(isKnowledgeDocumentEntry) : [];
}
/**
 * Persists the document registry of a workspace as pretty-printed JSON,
 * creating the workspace folder first when it does not exist yet.
 *
 * @param workspaceId Raw workspace id; sanitized by the path helpers.
 * @param docs Complete list of entries to store; replaces the previous file content.
 */
async function writeRegistry(workspaceId: string | undefined, docs: KnowledgeDocument[]): Promise<void> {
  const workspaceDir = getWorkspaceDir(workspaceId);
  await mkdir(workspaceDir, { recursive: true });

  const registryPath = getRegistryPath(workspaceId);
  const serialized = JSON.stringify(docs, null, 2);
  await writeFile(registryPath, serialized, 'utf8');
}
/**
 * Looks up the MIME type of a supported knowledge file from its extension.
 *
 * The extension is compared case-insensitively; the plain-text table is
 * consulted before the Word table.
 *
 * @param filePath Path or file name to inspect.
 * @returns The MIME type, or `null` when the extension is not supported.
 */
function getTextMimeType(filePath: string): string | null {
  const extension = extname(filePath).toLowerCase();
  const textMime = TEXT_MIME_BY_EXT[extension];
  if (textMime != null) return textMime;
  const wordMime = WORD_MIME_BY_EXT[extension];
  return wordMime != null ? wordMime : null;
}
/**
 * Tells whether a path names a Word `.docx` file, judged by extension only
 * (case-insensitive).
 *
 * @param filePath Path or file name to inspect.
 * @returns `true` when the extension is `.docx`.
 */
function isDocx(filePath: string): boolean {
  const suffix = extname(filePath);
  return suffix.toLowerCase() === '.docx';
}
/**
 * Extracts the raw text of a `.docx` file using the lazily imported `mammoth` package.
 *
 * @param filePath Path of the `.docx` file to read.
 * @returns The extracted text with surrounding whitespace removed.
 */
async function extractDocxText(filePath: string): Promise<string> {
  // Loaded on demand so the dependency is only pulled in when a Word file is imported.
  const mammoth = await import('mammoth');
  const { value: rawText } = await mammoth.extractRawText({ path: filePath });
  return rawText.trim();
}
/** Best-effort removal of files created for an import that ended up rejected. */
async function discardImportArtifacts(...paths: string[]): Promise<void> {
  await Promise.all(paths.map((path) => rm(path, { force: true }).catch(() => undefined)));
}

/**
 * Imports local files into a workspace knowledge base.
 *
 * Each accepted file is copied into the workspace's `files` directory under a
 * fresh UUID; `.docx` files additionally get their text extracted into the
 * `texts` directory. Files are processed independently: a file that cannot be
 * imported is reported in `rejected` and does not stop the remaining files.
 * Copies made for a file that is later rejected are removed again so no
 * unreferenced files pile up in the workspace.
 *
 * @param params.workspaceId Raw workspace id; sanitized before use.
 * @param params.filePaths Paths of the files to import.
 * @returns The newly registered documents and the rejected paths with a reason each.
 */
export async function importKnowledgeFiles(params: {
  workspaceId?: string;
  filePaths: string[];
}): Promise<{ documents: KnowledgeDocument[]; rejected: Array<{ filePath: string; reason: string }> }> {
  const workspaceId = sanitizeWorkspaceId(params.workspaceId);
  const workspaceDir = getWorkspaceDir(workspaceId);
  const filesDir = join(workspaceDir, 'files');
  const textDir = join(workspaceDir, 'texts');
  await mkdir(filesDir, { recursive: true });
  await mkdir(textDir, { recursive: true });

  const importedDocs: KnowledgeDocument[] = [];
  const rejected: Array<{ filePath: string; reason: string }> = [];

  for (const filePath of params.filePaths) {
    // The list may come straight from a JSON request body, so entries are not
    // guaranteed to be strings at runtime; reject them instead of throwing.
    if (typeof filePath !== 'string') {
      rejected.push({ filePath: String(filePath), reason: '文件不存在或不可读取' });
      continue;
    }

    const mimeType = getTextMimeType(filePath);
    if (!mimeType) {
      rejected.push({ filePath, reason: '仅支持文本类知识文件' });
      continue;
    }

    const fileStat = await stat(filePath).catch(() => null);
    if (!fileStat || !fileStat.isFile()) {
      rejected.push({ filePath, reason: '文件不存在或不可读取' });
      continue;
    }
    if (fileStat.size > MAX_KNOWLEDGE_FILE_BYTES) {
      rejected.push({ filePath, reason: '文件超过 20MB 限制' });
      continue;
    }

    const id = crypto.randomUUID();
    const ext = extname(filePath).toLowerCase();
    const storedPath = join(filesDir, `${id}${ext}`);
    try {
      await copyFile(filePath, storedPath);
    } catch {
      // A failed copy only affects this file; drop any partial copy and move on.
      await discardImportArtifacts(storedPath);
      rejected.push({ filePath, reason: '文件复制失败' });
      continue;
    }

    let textPath: string | undefined;
    if (isDocx(filePath)) {
      const extractedTextPath = join(textDir, `${id}.txt`);
      try {
        const extractedText = await extractDocxText(storedPath);
        if (!extractedText) {
          await discardImportArtifacts(storedPath);
          rejected.push({ filePath, reason: 'Word 文档未提取到可用文本' });
          continue;
        }
        await writeFile(extractedTextPath, extractedText, 'utf8');
        textPath = extractedTextPath;
      } catch {
        await discardImportArtifacts(storedPath, extractedTextPath);
        rejected.push({ filePath, reason: 'Word 文档解析失败,请确认文件为 .docx 格式' });
        continue;
      }
    }

    const doc: KnowledgeDocument = {
      id,
      workspaceId,
      name: basename(filePath),
      mimeType,
      size: fileStat.size,
      storedPath,
      ...(textPath ? { textPath } : {}),
      originalPath: filePath,
      importedAt: Date.now(),
      status: 'stored',
    };
    importedDocs.push(doc);
  }

  if (importedDocs.length > 0) {
    // Read the registry only now, after the slow per-file work, so entries
    // written by another import in the meantime are less likely to be overwritten.
    const currentDocs = await readRegistry(workspaceId);
    await writeRegistry(workspaceId, [...importedDocs, ...currentDocs]);
  }
  return { documents: importedDocs, rejected };
}
/**
 * Cuts `text` to at most `maxLength` UTF-16 code units without ending on the
 * first half of a surrogate pair, which would leave an unpaired surrogate in
 * the output.
 */
function truncateOnCodePointBoundary(text: string, maxLength: number): string {
  if (text.length <= maxLength) return text;
  let end = Math.max(0, maxLength);
  if (end > 0) {
    const lastUnit = text.charCodeAt(end - 1);
    const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
    if (endsOnHighSurrogate) end -= 1;
  }
  return text.slice(0, end);
}

/**
 * Builds the knowledge-base context text for the selected documents of a workspace.
 *
 * Documents are taken in registry order. For each one the extracted text file
 * is preferred over the stored copy. Content is limited to
 * `MAX_CONTEXT_CHARS_PER_FILE` per document and `MAX_CONTEXT_TOTAL_CHARS`
 * overall; once the overall budget is used up, remaining documents are left out.
 *
 * @param params.workspaceId Raw workspace id; sanitized before use.
 * @param params.documentIds Ids of the documents to include; blank and non-string ids are ignored.
 * @returns The assembled context (empty when nothing could be included), the
 *   documents that contributed to it, and the ids that were not found in the
 *   registry or whose content was empty or unreadable.
 */
export async function buildKnowledgeContext(params: {
  workspaceId?: string;
  documentIds: string[];
}): Promise<{ context: string; documents: KnowledgeDocument[]; missing: string[] }> {
  const workspaceId = sanitizeWorkspaceId(params.workspaceId);
  const selectedIds = new Set(params.documentIds.filter((id) => typeof id === 'string' && id.trim()));
  if (selectedIds.size === 0) {
    return { context: '', documents: [], missing: [] };
  }

  const registry = await readRegistry(workspaceId);
  const docs = registry.filter((doc) => selectedIds.has(doc.id));
  const foundIds = new Set(docs.map((doc) => doc.id));
  const missing = [...selectedIds].filter((id) => !foundIds.has(id));

  const sections: string[] = [];
  const usedDocs: KnowledgeDocument[] = [];
  let totalChars = 0;
  for (const doc of docs) {
    const readablePath = doc.textPath || doc.storedPath;
    const raw = await readFile(readablePath, 'utf8').catch(() => '');
    const text = raw.trim();
    if (!text) {
      // Unreadable or empty content is reported the same way as an unknown id.
      missing.push(doc.id);
      continue;
    }

    const remaining = MAX_CONTEXT_TOTAL_CHARS - totalChars;
    if (remaining <= 0) break;
    const content = truncateOnCodePointBoundary(text, Math.min(MAX_CONTEXT_CHARS_PER_FILE, remaining));
    // Only possible when the last remaining unit would have split a surrogate pair:
    // the budget is effectively exhausted.
    if (!content) break;

    totalChars += content.length;
    usedDocs.push(doc);
    sections.push([
      `## ${doc.name}`,
      `类型:${doc.mimeType}`,
      content,
    ].join('\n'));
  }

  if (sections.length === 0) {
    return { context: '', documents: [], missing };
  }
  return {
    context: [
      '[知识库上下文]',
      '用户已选择在本轮对话中使用当前组织空间知识库。以下内容来自智念助手保存的本地备份文件;回答前请优先参考这些内容。',
      ...sections,
    ].join('\n\n'),
    documents: usedDocs,
    missing,
  };
}
/**
 * HTTP handler for the knowledge-base endpoints.
 *
 * Routes served:
 * - `GET /api/knowledge/files` — lists the registry of the workspace named by the `workspaceId` query parameter.
 * - `POST /api/knowledge/import-paths` — imports the files listed in the JSON body's `filePaths`.
 * - `POST /api/knowledge/context` — builds the context for the JSON body's `documentIds`.
 *
 * Failures inside the POST routes are answered with status 500 and the stringified error.
 *
 * @param req Incoming request.
 * @param res Response to write to.
 * @param url Parsed request URL.
 * @param _ctx Host API context; not used by these routes.
 * @returns `true` when the request was handled here, `false` when it matches none of the routes.
 */
export async function handleKnowledgeRoutes(
  req: IncomingMessage,
  res: ServerResponse,
  url: URL,
  _ctx: HostApiContext,
): Promise<boolean> {
  const { pathname } = url;
  const method = req.method;

  if (method === 'GET' && pathname === '/api/knowledge/files') {
    const requestedWorkspace = url.searchParams.get('workspaceId') ?? undefined;
    const documents = await readRegistry(sanitizeWorkspaceId(requestedWorkspace));
    sendJson(res, 200, { documents });
    return true;
  }

  if (method === 'POST' && pathname === '/api/knowledge/import-paths') {
    try {
      const payload = await parseJsonBody<{ workspaceId?: string; filePaths?: string[] }>(req);
      const requestedPaths = Array.isArray(payload.filePaths) ? payload.filePaths : [];
      if (requestedPaths.length > 0) {
        const outcome = await importKnowledgeFiles({
          workspaceId: payload.workspaceId,
          filePaths: requestedPaths,
        });
        sendJson(res, 200, { success: true, ...outcome });
      } else {
        sendJson(res, 400, { success: false, error: 'No files selected' });
      }
    } catch (failure) {
      sendJson(res, 500, { success: false, error: String(failure) });
    }
    return true;
  }

  if (method === 'POST' && pathname === '/api/knowledge/context') {
    try {
      const payload = await parseJsonBody<{ workspaceId?: string; documentIds?: string[] }>(req);
      const requestedIds = Array.isArray(payload.documentIds) ? payload.documentIds : [];
      const outcome = await buildKnowledgeContext({
        workspaceId: payload.workspaceId,
        documentIds: requestedIds,
      });
      sendJson(res, 200, { success: true, ...outcome });
    } catch (failure) {
      sendJson(res, 500, { success: false, error: String(failure) });
    }
    return true;
  }

  return false;
}