309 lines
10 KiB
TypeScript
309 lines
10 KiB
TypeScript
import type { IncomingMessage, ServerResponse } from 'node:http';
|
|
import crypto from 'node:crypto';
|
|
import { basename, extname, join } from 'node:path';
|
|
import { mkdir, readFile, stat, writeFile, copyFile, rm } from 'node:fs/promises';
|
|
import type { HostApiContext } from '../context';
|
|
import { parseJsonBody, sendJson } from '../route-utils';
|
|
import { getDataDir } from '../../utils/paths';
|
|
|
|
/**
 * Base directory for stored knowledge files: `<getDataDir()>/yinian/knowledge`.
 * Each workspace gets its own sub-directory beneath it.
 */
const KNOWLEDGE_ROOT = join(
  getDataDir(),
  'yinian',
  'knowledge',
);
|
|
/** Largest source file (in bytes) accepted by the importer: 20 MiB. */
const MAX_KNOWLEDGE_FILE_BYTES = 20 * 1024 ** 2;

/** Maximum number of characters taken from a single document when building a context. */
const MAX_CONTEXT_CHARS_PER_FILE = 32_000;

/** Maximum number of document characters across all sections of one context. */
const MAX_CONTEXT_TOTAL_CHARS = 96_000;
|
|
|
|
/**
 * MIME type reported for each plain-text file extension the importer accepts.
 * Keys are lower-case extensions including the leading dot.
 */
const TEXT_MIME_BY_EXT: Record<string, string> = {
  // Prose and tabular text
  '.txt': 'text/plain',
  '.md': 'text/markdown',
  '.markdown': 'text/markdown',
  '.csv': 'text/csv',
  '.tsv': 'text/tab-separated-values',

  // Structured data and markup
  '.json': 'application/json',
  '.jsonl': 'application/x-ndjson',
  '.xml': 'application/xml',
  '.html': 'text/html',
  '.htm': 'text/html',
  '.yaml': 'application/yaml',
  '.yml': 'application/yaml',

  // Logs and configuration
  '.log': 'text/plain',
  '.ini': 'text/plain',
  '.conf': 'text/plain',

  // Source code
  '.css': 'text/css',
  '.js': 'text/javascript',
  '.jsx': 'text/javascript',
  '.ts': 'text/typescript',
  '.tsx': 'text/typescript',
  '.py': 'text/x-python',
  '.sql': 'application/sql',
};
|
|
|
|
/** MIME type for the Word formats the importer accepts (currently only `.docx`). */
const WORD_MIME_BY_EXT: Record<string, string> = {
  '.docx':
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
};
|
|
|
|
/** One entry of a workspace's `registry.json`, describing an imported knowledge file. */
export interface KnowledgeDocument {
  /** Identifier assigned at import time (a random UUID). */
  id: string;
  /** Sanitized id of the workspace that owns the document. */
  workspaceId: string;
  /** Base name of the file that was imported. */
  name: string;
  /** MIME type derived from the file extension. */
  mimeType: string;
  /** Size of the source file in bytes, as reported at import time. */
  size: number;
  /** Path of the copy kept inside the workspace's `files` directory. */
  storedPath: string;
  /** Path of the extracted plain text; set only for documents that needed extraction (.docx). */
  textPath?: string;
  /** Path the file was imported from. */
  originalPath?: string;
  /** Import timestamp in milliseconds since the Unix epoch. */
  importedAt: number;
  /** Lifecycle marker; `'stored'` is the only state. */
  status: 'stored';
}
|
|
|
|
/**
 * Turns an arbitrary workspace id into a safe directory name.
 *
 * Surrounding whitespace is trimmed, every character outside `[a-zA-Z0-9_-]`
 * becomes `_`, and the result is capped at 96 characters. A missing or blank
 * id maps to `'default'`.
 *
 * @param workspaceId - Raw workspace id, possibly absent.
 * @returns The sanitized id.
 */
function sanitizeWorkspaceId(workspaceId?: string): string {
  const fallback = 'default';
  const trimmed = workspaceId?.trim();
  const candidate = trimmed ? trimmed : fallback;
  const safe = candidate.replace(/[^a-zA-Z0-9_-]/g, '_').slice(0, 96);
  return safe ? safe : fallback;
}
|
|
|
|
/**
 * Resolves the directory that holds a workspace's knowledge data.
 *
 * @param workspaceId - Raw workspace id; it is sanitized before being used as a path segment.
 * @returns `KNOWLEDGE_ROOT` joined with the sanitized workspace id.
 */
function getWorkspaceDir(workspaceId?: string): string {
  const safeId = sanitizeWorkspaceId(workspaceId);
  return join(KNOWLEDGE_ROOT, safeId);
}
|
|
|
|
/**
 * Resolves the path of a workspace's document registry.
 *
 * @param workspaceId - Raw workspace id.
 * @returns Path of `registry.json` inside the workspace directory.
 */
function getRegistryPath(workspaceId?: string): string {
  const workspaceDir = getWorkspaceDir(workspaceId);
  return join(workspaceDir, 'registry.json');
}
|
|
|
|
/**
 * Loads the document registry of a workspace.
 *
 * Never rejects: a missing/unreadable file, blank content, invalid JSON or a
 * non-array payload all yield an empty list. Array entries that are not
 * objects carrying a string `id` are dropped, so callers can safely access
 * `doc.id` on every returned element.
 *
 * @param workspaceId - Raw workspace id.
 * @returns The registered documents, in file order.
 */
async function readRegistry(workspaceId?: string): Promise<KnowledgeDocument[]> {
  // Read failures (e.g. the registry does not exist yet) are treated as "no documents".
  const raw = await readFile(getRegistryPath(workspaceId), 'utf8').catch(() => '');
  if (!raw.trim()) return [];

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch {
    return [];
  }
  if (!Array.isArray(parsed)) return [];

  // Guard against hand-edited or corrupted registries: a `null` or primitive
  // entry would otherwise make later `doc.id` lookups throw.
  return parsed.filter(
    (entry: unknown): entry is KnowledgeDocument =>
      typeof entry === 'object' &&
      entry !== null &&
      typeof (entry as { id?: unknown }).id === 'string',
  );
}
|
|
|
|
/**
 * Persists the document registry of a workspace as pretty-printed JSON,
 * creating the workspace directory first if needed.
 *
 * @param workspaceId - Raw workspace id.
 * @param docs - Complete list of documents to store; replaces the previous content.
 */
async function writeRegistry(workspaceId: string | undefined, docs: KnowledgeDocument[]): Promise<void> {
  const workspaceDir = getWorkspaceDir(workspaceId);
  await mkdir(workspaceDir, { recursive: true });

  const serialized = JSON.stringify(docs, null, 2);
  await writeFile(getRegistryPath(workspaceId), serialized, 'utf8');
}
|
|
|
|
/**
 * Looks up the MIME type for a file based on its extension (case-insensitive).
 *
 * @param filePath - Path or file name to inspect.
 * @returns The MIME type from the text table, else from the Word table, else `null`
 *          when the extension is not supported.
 */
function getTextMimeType(filePath: string): string | null {
  const extension = extname(filePath).toLowerCase();

  // The text table takes precedence over the Word table.
  for (const table of [TEXT_MIME_BY_EXT, WORD_MIME_BY_EXT]) {
    const mimeType = table[extension];
    if (mimeType != null) return mimeType;
  }
  return null;
}
|
|
|
|
/**
 * Tells whether a path has a `.docx` extension, ignoring case.
 *
 * @param filePath - Path or file name to inspect.
 */
function isDocx(filePath: string): boolean {
  const extension = extname(filePath);
  return extension.toLowerCase() === '.docx';
}
|
|
|
|
/**
 * Extracts the raw text of a Word document using `mammoth`.
 *
 * `mammoth` is imported on demand, so it is only loaded when a .docx file is
 * actually processed.
 *
 * @param filePath - Path of the .docx file to read.
 * @returns The extracted text with surrounding whitespace removed.
 */
async function extractDocxText(filePath: string): Promise<string> {
  const mammothModule = await import('mammoth');
  const { value: rawText } = await mammothModule.extractRawText({ path: filePath });
  return rawText.trim();
}
|
|
|
|
/**
 * Imports local files into a workspace's knowledge base.
 *
 * Each accepted file is copied to `<workspace>/files/<uuid><ext>`. For .docx
 * files the raw text is additionally extracted to `<workspace>/texts/<uuid>.txt`.
 * Newly imported documents are prepended to the workspace registry.
 *
 * Failures are reported per file in `rejected` instead of aborting the batch:
 * unsupported extension, missing/non-regular file, file larger than 20 MiB,
 * copy failure, and .docx files that cannot be parsed or contain no text.
 * Whenever a file is rejected after it was copied, the stored copy (and any
 * partially written text file) is removed again so no orphans are left on disk.
 *
 * @param params.workspaceId - Raw workspace id; sanitized before use.
 * @param params.filePaths - Paths of the files to import.
 * @returns The documents that were imported and the files that were rejected.
 * @throws If the workspace directories cannot be created or the registry cannot be written.
 */
export async function importKnowledgeFiles(params: {
  workspaceId?: string;
  filePaths: string[];
}): Promise<{ documents: KnowledgeDocument[]; rejected: Array<{ filePath: string; reason: string }> }> {
  const workspaceId = sanitizeWorkspaceId(params.workspaceId);
  const workspaceDir = getWorkspaceDir(workspaceId);
  const filesDir = join(workspaceDir, 'files');
  const textDir = join(workspaceDir, 'texts');
  await mkdir(filesDir, { recursive: true });
  await mkdir(textDir, { recursive: true });

  const currentDocs = await readRegistry(workspaceId);
  const importedDocs: KnowledgeDocument[] = [];
  const rejected: Array<{ filePath: string; reason: string }> = [];

  // Best-effort cleanup for artifacts of a rejected file; never throws.
  const removeQuietly = (target: string): Promise<void> =>
    rm(target, { force: true }).catch(() => undefined);

  for (const filePath of params.filePaths) {
    // The list may originate from an unvalidated JSON body; a non-string entry
    // would make the path helpers below throw and abort the whole batch.
    if (typeof filePath !== 'string') {
      rejected.push({ filePath: String(filePath), reason: '文件路径无效' });
      continue;
    }

    const mimeType = getTextMimeType(filePath);
    if (!mimeType) {
      rejected.push({ filePath, reason: '仅支持文本类知识文件' });
      continue;
    }

    const fileStat = await stat(filePath).catch(() => null);
    if (!fileStat || !fileStat.isFile()) {
      rejected.push({ filePath, reason: '文件不存在或不可读取' });
      continue;
    }
    if (fileStat.size > MAX_KNOWLEDGE_FILE_BYTES) {
      rejected.push({ filePath, reason: '文件超过 20MB 限制' });
      continue;
    }

    const id = crypto.randomUUID();
    const ext = extname(filePath).toLowerCase();
    const storedPath = join(filesDir, `${id}${ext}`);

    try {
      await copyFile(filePath, storedPath);
    } catch {
      // Reject only this file; throwing here would skip the registry write
      // and orphan every copy made earlier in the batch.
      await removeQuietly(storedPath);
      rejected.push({ filePath, reason: '文件复制失败' });
      continue;
    }

    let textPath: string | undefined;

    if (isDocx(filePath)) {
      const extractedTextPath = join(textDir, `${id}.txt`);
      try {
        const extractedText = await extractDocxText(storedPath);
        if (!extractedText) {
          await removeQuietly(storedPath);
          rejected.push({ filePath, reason: 'Word 文档未提取到可用文本' });
          continue;
        }
        await writeFile(extractedTextPath, extractedText, 'utf8');
        textPath = extractedTextPath;
      } catch {
        // Covers both parse failures and a failed text write.
        await Promise.all([removeQuietly(storedPath), removeQuietly(extractedTextPath)]);
        rejected.push({ filePath, reason: 'Word 文档解析失败,请确认文件为 .docx 格式' });
        continue;
      }
    }

    const doc: KnowledgeDocument = {
      id,
      workspaceId,
      name: basename(filePath),
      mimeType,
      size: fileStat.size,
      storedPath,
      ...(textPath ? { textPath } : {}),
      originalPath: filePath,
      importedAt: Date.now(),
      status: 'stored',
    };
    importedDocs.push(doc);
  }

  // Newest documents first; skip the write when nothing was imported.
  if (importedDocs.length > 0) {
    await writeRegistry(workspaceId, [...importedDocs, ...currentDocs]);
  }

  return { documents: importedDocs, rejected };
}
|
|
|
|
/**
 * Builds the prompt context for a set of selected knowledge documents.
 *
 * Documents are taken in registry order. For each one the extracted text
 * (`textPath`) is preferred over the stored file. At most
 * `MAX_CONTEXT_CHARS_PER_FILE` characters are used per document and
 * `MAX_CONTEXT_TOTAL_CHARS` in total; once the total budget is used up the
 * remaining documents are skipped.
 *
 * @param params.workspaceId - Raw workspace id; sanitized before use.
 * @param params.documentIds - Ids of the documents to include; blank or non-string ids are ignored.
 * @returns `context` (empty when nothing could be included), the `documents`
 *          that contributed a section, and `missing` ids — those absent from
 *          the registry plus those whose content was empty or unreadable.
 */
export async function buildKnowledgeContext(params: {
  workspaceId?: string;
  documentIds: string[];
}): Promise<{ context: string; documents: KnowledgeDocument[]; missing: string[] }> {
  const workspaceId = sanitizeWorkspaceId(params.workspaceId);

  const requestedIds = new Set<string>();
  for (const id of params.documentIds) {
    if (typeof id === 'string' && id.trim()) requestedIds.add(id);
  }
  if (requestedIds.size === 0) {
    return { context: '', documents: [], missing: [] };
  }

  const matchedDocs = (await readRegistry(workspaceId)).filter((doc) => requestedIds.has(doc.id));
  const matchedIds = new Set(matchedDocs.map((doc) => doc.id));
  const missing = [...requestedIds].filter((id) => !matchedIds.has(id));

  const sections: string[] = [];
  const usedDocs: KnowledgeDocument[] = [];
  let budget = MAX_CONTEXT_TOTAL_CHARS;

  for (const doc of matchedDocs) {
    const sourcePath = doc.textPath || doc.storedPath;
    // Unreadable files are handled like empty ones.
    const text = (await readFile(sourcePath, 'utf8').catch(() => '')).trim();
    if (text.length === 0) {
      missing.push(doc.id);
      continue;
    }

    if (budget <= 0) break;

    const excerpt = text.slice(0, Math.min(MAX_CONTEXT_CHARS_PER_FILE, budget));
    budget -= excerpt.length;
    usedDocs.push(doc);
    sections.push(`## ${doc.name}\n类型:${doc.mimeType}\n${excerpt}`);
  }

  if (sections.length === 0) {
    return { context: '', documents: [], missing };
  }

  const preamble = [
    '[知识库上下文]',
    '用户已选择在本轮对话中使用当前组织空间知识库。以下内容来自智念助手保存的本地备份文件;回答前请优先参考这些内容。',
  ];
  return {
    context: [...preamble, ...sections].join('\n\n'),
    documents: usedDocs,
    missing,
  };
}
|
|
|
|
/**
 * Removes a document from a workspace: deletes its stored file and extracted
 * text (best effort, failures are ignored) and rewrites the registry without it.
 *
 * @param params.workspaceId - Raw workspace id; sanitized before use.
 * @param params.documentId - Id of the document to delete; surrounding whitespace is ignored.
 * @returns The removed registry entry, or `null` when the id is blank or unknown.
 */
async function deleteKnowledgeDocument(params: {
  workspaceId?: string;
  documentId: string;
}): Promise<{ deleted: KnowledgeDocument | null }> {
  const workspaceId = sanitizeWorkspaceId(params.workspaceId);
  const documentId = params.documentId.trim();
  if (documentId.length === 0) {
    return { deleted: null };
  }

  const registry = await readRegistry(workspaceId);
  const target = registry.find((doc) => doc.id === documentId);
  if (!target) {
    return { deleted: null };
  }

  // Remove whichever on-disk artifacts the entry references; errors are swallowed on purpose.
  const removals = [target.storedPath, target.textPath]
    .filter((artifact): artifact is string => Boolean(artifact))
    .map((artifact) => rm(artifact, { force: true }).catch(() => undefined));
  await Promise.all(removals);

  const remainingDocs = registry.filter((doc) => doc.id !== documentId);
  await writeRegistry(workspaceId, remainingDocs);

  return { deleted: target };
}
|
|
|
|
/**
 * HTTP dispatcher for the knowledge-base API.
 *
 * Routes:
 * - `GET    /api/knowledge/files?workspaceId=…`      list registered documents
 * - `DELETE /api/knowledge/files/:id?workspaceId=…`  delete one document (404 if unknown)
 * - `POST   /api/knowledge/import-paths`             import files by local path (400 if none given)
 * - `POST   /api/knowledge/context`                  build prompt context for selected documents
 *
 * Errors thrown while handling DELETE/POST requests are answered with a 500
 * and `{ success: false, error }`.
 *
 * @param req - Incoming request.
 * @param res - Response to write to.
 * @param url - Parsed request URL.
 * @param _ctx - Host API context (unused here).
 * @returns `true` when the request matched one of the routes and a response was sent, `false` otherwise.
 */
export async function handleKnowledgeRoutes(
  req: IncomingMessage,
  res: ServerResponse,
  url: URL,
  _ctx: HostApiContext,
): Promise<boolean> {
  const filesPrefix = '/api/knowledge/files/';
  const { pathname } = url;
  const method = req.method;

  const workspaceFromQuery = (): string =>
    sanitizeWorkspaceId(url.searchParams.get('workspaceId') ?? undefined);

  // List documents.
  if (method === 'GET' && pathname === '/api/knowledge/files') {
    const workspaceId = workspaceFromQuery();
    sendJson(res, 200, { documents: await readRegistry(workspaceId) });
    return true;
  }

  // Delete a single document.
  if (method === 'DELETE' && pathname.startsWith(filesPrefix)) {
    try {
      const documentId = decodeURIComponent(pathname.slice(filesPrefix.length));
      const workspaceId = workspaceFromQuery();
      const { deleted } = await deleteKnowledgeDocument({ workspaceId, documentId });
      if (deleted) {
        sendJson(res, 200, { success: true, document: deleted });
      } else {
        sendJson(res, 404, { success: false, error: 'Knowledge document not found' });
      }
    } catch (error) {
      sendJson(res, 500, { success: false, error: String(error) });
    }
    return true;
  }

  // Import files by path.
  if (method === 'POST' && pathname === '/api/knowledge/import-paths') {
    try {
      const body = await parseJsonBody<{ workspaceId?: string; filePaths?: string[] }>(req);
      const filePaths = Array.isArray(body.filePaths) ? body.filePaths : [];
      if (filePaths.length === 0) {
        sendJson(res, 400, { success: false, error: 'No files selected' });
      } else {
        const result = await importKnowledgeFiles({ workspaceId: body.workspaceId, filePaths });
        sendJson(res, 200, { success: true, ...result });
      }
    } catch (error) {
      sendJson(res, 500, { success: false, error: String(error) });
    }
    return true;
  }

  // Build prompt context.
  if (method === 'POST' && pathname === '/api/knowledge/context') {
    try {
      const body = await parseJsonBody<{ workspaceId?: string; documentIds?: string[] }>(req);
      const documentIds = Array.isArray(body.documentIds) ? body.documentIds : [];
      const result = await buildKnowledgeContext({ workspaceId: body.workspaceId, documentIds });
      sendJson(res, 200, { success: true, ...result });
    } catch (error) {
      sendJson(res, 500, { success: false, error: String(error) });
    }
    return true;
  }

  return false;
}
|