diff --git a/dist-electron/main/main.js b/dist-electron/main/main.js index 2866ea8..48a3c8f 100644 --- a/dist-electron/main/main.js +++ b/dist-electron/main/main.js @@ -1,6 +1,6 @@ "use strict"; require("electron"); -require("./main-D-gZxrru.js"); +require("./main-BBEijEwg.js"); require("electron-squirrel-startup"); require("electron-log"); require("bytenode"); diff --git a/dist/index.html b/dist/index.html index d5b61b3..ac3ddba 100644 --- a/dist/index.html +++ b/dist/index.html @@ -8,8 +8,8 @@ http-equiv="Content-Security-Policy" content="default-src 'self'; script-src 'self'; style-src 'self' 'unsafe-inline'; img-src 'self' data: http://8.138.234.141 https://one-feel-bucket.oss-cn-guangzhou.aliyuncs.com; connect-src 'self' http://8.138.234.141 https://api.iconify.design wss://onefeel.brother7.cn" /> - - + +
diff --git a/docs/ClawX-Gateway-Launcher-Migration-Execution-Plan.md b/docs/ClawX-Gateway-Launcher-Migration-Execution-Plan.md new file mode 100644 index 0000000..e0ed0c4 --- /dev/null +++ b/docs/ClawX-Gateway-Launcher-Migration-Execution-Plan.md @@ -0,0 +1,509 @@ +# ClawX Gateway Launcher 迁移执行文档 + +## 1. 目标 + +`zn-ai` 当前已经完成 Gateway 启动链第一波迁移,解决重点是: + +- launch context 组装 +- launcher core 与 Windows 启动兼容 +- startup orchestrator +- startup stderr 诊断 +- supervisor 的基础端口与孤儿进程治理 + +下一波只继续迁移 `ClawX` 的 Gateway 生命周期与自愈闭环,范围严格限制在以下 4 组能力: + +- `connection-monitor` +- `restart-controller` / `restart-governor` +- `reload-policy` +- `doctor repair` + +本轮明确不做: + +- Chat 主链路改造 +- Skills 安装链路改造 +- Providers/Channels 非 Gateway 生命周期相关重构 +- UI 视觉层修改 + +## 2. 当前结论 + +`zn-ai` 与 `ClawX` 在 Gateway 启动能力上已经接近,但在 Gateway 完整生命周期能力上仍不一致。 + +`zn-ai` 已具备: + +- `prepareGatewayLaunchContext(...)` +- `launchGatewayProcess(...)` +- `runGatewayStartupSequence(...)` +- startup stderr 分类与缓存 +- `waitForPortFree(...)` +- 孤儿监听进程探测与清理 +- Windows `node-runtime` 启动策略 + +`zn-ai` 仍缺少: + +- 心跳监控与健康检查驱动的自愈 +- 启动中/重连中的 restart defer 机制 +- restart cooldown governor +- Gateway reload policy 解析与应用 +- OpenClaw doctor repair +- Python readiness warmup +- 与这些能力配套的 manager 生命周期接线 + +## 3. 第二波迁移目标 + +迁移完成后,`zn-ai` 的 Gateway 至少要补齐下面这组行为: + +1. Gateway 连接建立后有稳定的 ping/pong 与 message heartbeat 监控。 +2. 连接失活或健康检查失败时,manager 能按治理策略自愈,而不是单次掉线后长期停摆。 +3. 外部触发的 restart 请求在 `starting` / `reconnecting` 阶段不会打断在途启动,而是 defer 到合适时机。 +4. 连续 restart 受到 cooldown governor 约束,避免抖动和端口争抢。 +5. Gateway reload policy 能从 `~/.openclaw/openclaw.json` 读取,并决定走 `reload`、`restart`、`hybrid` 或 `off`。 +6. startup-orchestrator 在判定配置损坏时能触发 doctor repair,然后再试一次启动。 +7. Windows 下真实 Gateway 冒烟时,能区分: + - 启动慢但最终 ready + - 配置损坏后 repair 成功 + - repair 失败后停止重试 + +## 4. 建议 sub-agent 数量 + +建议 `4` 个开发 sub-agent,加 `1` 个主协调 agent。 + +理由: + +- 这 4 组能力的依赖闭包并不完全独立。 +- `manager.ts` 是热点文件,不能让多个 worker 同时改。 +- `doctor repair` 会额外牵动 `supervisor.ts` 与若干工具模块,适合单独收口。 +- 测试与真实回归要独立 ownership,避免实现者自己绕过风险。 + +不建议拆成超过 `4` 个开发 sub-agent,因为: + +- `manager.ts`、`startup-orchestrator.ts`、`supervisor.ts` 的写入冲突会明显增加。 +- 这轮目标是生命周期闭环,不是大范围模块化重构。 + +## 5. 分工方案 + +### 主协调 Agent + +职责: + +- 冻结第二波迁移边界,只做 Gateway 生命周期与自愈能力。 +- 冻结共享契约: + - lifecycle state + - reconnect / deferred restart policy + - reload policy result + - doctor repair hook +- 负责合并顺序、冲突协调、回归 checklist 与最终收口。 + +不直接负责大规模代码写入,重点是接口与顺序控制。 + +### SA-1 Lifecycle Primitives + +责任范围: + +- `zn-ai/electron/gateway/connection-monitor.ts` +- `zn-ai/electron/gateway/process-policy.ts` +- `zn-ai/electron/gateway/lifecycle-controller.ts` +- `zn-ai/electron/gateway/restart-controller.ts` +- `zn-ai/electron/gateway/restart-governor.ts` +- 如有必要,可补最小 `state` 辅助文件,但不负责 manager 接线 + +目标: + +- 迁入 ClawX 的心跳监控、健康检查定时器、restart defer 规则、reconnect 规则、restart cooldown 规则 +- 保持实现尽量独立,供 manager 后续接入 + +约束: + +- 不修改 `manager.ts` +- 不修改 UI / API / Chat / Skills + +### SA-2 Repair & Supervisor + +责任范围: + +- `zn-ai/electron/gateway/supervisor.ts` +- `zn-ai/electron/gateway/startup-orchestrator.ts` +- 如缺失则新增最小工具: + - `zn-ai/electron/utils/uv-env.ts` + - `zn-ai/electron/utils/uv-setup.ts` + - `zn-ai/electron/utils/env-path.ts` + +目标: + +- 迁入 `warmupManagedPythonReadiness()` +- 迁入 `runOpenClawDoctorRepair()` +- 把 doctor repair 接进 `runGatewayStartupSequence(...)` +- 保持对 Windows 的 process tree terminate 和 PATH 注入兼容 + +约束: + +- 不改 `manager.ts` +- 只为 doctor repair 引入最小依赖,不顺手扩散到其它 Python/uv 功能 + +### SA-3 Manager Lifecycle Integration + +责任范围: + +- `zn-ai/electron/gateway/manager.ts` +- 必要时小幅修改: + - `zn-ai/electron/gateway/ws-client.ts` + - `zn-ai/electron/gateway/types.ts` + +目标: + +- 接入 SA-1 输出的 lifecycle / reconnect / restart governance +- 接入 SA-2 输出的 doctor repair 与 startup recovery hook +- 为 `zn-ai` 补齐: + - `startHealthCheck()` + - `startPing()` + - `scheduleReconnect()` + - `debouncedRestart()` + - `reload()` + - `debouncedReload()` + - reload policy refresh + +约束: + +- 只动 Gateway 生命周期相关逻辑 +- 不改现有 chat payload 结构 +- 不借机重构无关的 runtime broadcast + +说明: + +`manager.ts` 由 SA-3 单独 owning,其他 sub-agent 不直接修改这个文件,避免冲突。 + +### SA-4 Verification & Regression + +责任范围: + +- `zn-ai/tests/*gateway*` +- Gateway 相关 smoke 脚本与文档 +- 必要时补最小测试夹具 + +目标: + +- 给新增的 lifecycle/reload/repair 行为补单测 +- 补真实 Windows 冒烟步骤 +- 输出失败定位矩阵,覆盖: + - heartbeat miss + - deferred restart + - governor suppress + - reload policy mode 分支 + - doctor repair success/failure + +约束: + +- 不修改产品功能逻辑,除非为了让测试可注入而做最小 seam + +## 6. 实施顺序 + +### Wave 2A + +并行推进: + +- SA-1 Lifecycle Primitives +- SA-2 Repair & Supervisor + +冻结输出契约: + +- `GatewayConnectionMonitor` +- `GatewayRestartController` +- `GatewayRestartGovernor` +- `GatewayReloadPolicy` +- `runOpenClawDoctorRepair()` +- `warmupManagedPythonReadiness()` + +### Wave 2B + +在 Wave 2A 契约冻结后推进: + +- SA-3 Manager Lifecycle Integration + +这一步只做接线和行为收口,不反向改 SA-1/SA-2 的模块边界。 + +### Wave 2C + +最后推进: + +- SA-4 Verification & Regression +- 主协调 Agent 汇总验收 + +## 7. 合并顺序 + +1. `process-policy.ts` / `lifecycle-controller.ts` / `connection-monitor.ts` +2. `restart-controller.ts` / `restart-governor.ts` +3. `reload-policy.ts` +4. `supervisor.ts` 的 doctor repair 与 Python warmup +5. `startup-orchestrator.ts` repair hook +6. `manager.ts` 生命周期接线 +7. tests / smoke / 文档收口 + +## 8. 关键依赖与注意事项 + +### 8.1 manager.ts 是单点热点 + +本轮很多能力最终都要落到 `manager.ts`。因此: + +- 只有 SA-3 可以写 `manager.ts` +- SA-1 / SA-2 只提供可接入模块和稳定接口 + +### 8.2 doctor repair 不是孤立功能 + +`ClawX` 的 doctor repair 依赖: + +- OpenClaw entry/runtime path +- bundled `bin` PATH 注入 +- `uv` 镜像环境 +- Python readiness 检查 + +`zn-ai` 当前还没有完整对应模块,所以必须按“最小依赖闭包”迁入,不能只拷贝 `runOpenClawDoctorRepair()`。 + +### 8.3 reload policy 只做 Gateway 进程策略 + +本轮 `reload-policy` 只决定 Gateway 进程层行为: + +- `off` +- `reload` +- `restart` +- `hybrid` + +不扩展到 UI 或配置编辑器逻辑。 + +### 8.4 Windows 冒烟必须保留长等待预算 + +之前真实诊断已经确认 Windows 下 OpenClaw ready 可能超过 100 秒,因此: + +- 不得把 ready wait 预算收回到旧值 +- 冒烟时要区分“慢启动”与“真失败” + +## 9. 验收标准 + +必须覆盖以下场景: + +- Gateway 连接后能持续 ping/pong,并在 message/pong 到达时恢复 heartbeat 状态 +- heartbeat 连续 miss 达阈值后能触发受控恢复 +- `restart()` 在 `starting` / `reconnecting` 阶段会 defer,而不是打断在途启动 +- restart governor 在 cooldown 内能 suppress 重复 restart +- `reload-policy` 可从 `~/.openclaw/openclaw.json` 读取并应用 +- `reload()` 失败时能 fallback 到 `restart()` +- startup 检测到配置损坏时能执行 doctor repair +- doctor repair 成功后,Gateway 能继续启动 +- doctor repair 失败后,错误信息可诊断,不会无限重试 +- Windows 真实启动回归中,不再频繁出现无诊断信息的 `exited before becoming ready` + +建议最少验证: + +- `pnpm typecheck` +- Gateway 生命周期相关单测 +- 一次 Windows 本机真实 Gateway 冒烟 +- 一次配置损坏后的 repair 回归 + +## 10. 当前执行状态 + +当前建议按以下顺序继续推进: + +1. 先完成 Wave 2 的 sub-agent 分工与 ownership 冻结 +2. 并行实施 SA-1 与 SA-2 +3. 由 SA-3 独占接入 `manager.ts` +4. 最后由 SA-4 负责真实回归与失败矩阵 + +## 11. 第三波补齐范围 + +在 `Wave 2A / 2B` 完成之后,`zn-ai` 和 `ClawX` 的 Gateway 差距主要收敛到下面这一组: + +- `state.ts` +- `protocol.ts` +- `event-dispatch.ts` +- `manager.ts` 里的 diagnostics +- `manager.ts` 里的 `gatewayReady fallback` + +这组能力的目标不是继续扩展生命周期治理,而是把 Gateway 的“状态模型、协议兼容、事件分发、诊断可观测性”补到接近 `ClawX`。 + +本轮明确不做: + +- Chat 业务语义改造 +- UI 新增诊断页 +- telemetry 上传体系迁移 +- 与本轮无关的 Skills / Channels / Providers 重构 + +## 12. 第三波建议 sub-agent 数量 + +最小可行配置是 `3` 个开发 sub-agent,加 `1` 个主协调 agent。 + +推荐配置是 `4` 个开发 sub-agent,加 `1` 个主协调 agent。 + +推荐按 `4` 个开发 sub-agent 推进,原因是: + +- `manager.ts` 仍然是单点热点文件,必须单 owner。 +- `protocol.ts` / `event-dispatch.ts` 与 `state.ts` / diagnostics 的依赖闭包并不相同,适合并行推进。 +- diagnostics / `gatewayReady fallback` 需要独立验证,不适合完全由实现者自测。 + +如果资源受限,也可以退化成 `3` 个开发 sub-agent: + +- 把验证工作并回主协调 Agent +- 或把 `protocol/event-dispatch` 与 `state/diagnostics primitives` 合并给同一个 sub-agent + +## 13. 第三波分工方案 + +### 主协调 Agent + +职责: + +- 冻结第三波范围,只做状态层、协议层、事件分发层和 manager diagnostics/fallback。 +- 冻结共享契约: + - GatewayStatus 状态结构 + - diagnostics snapshot 结构 + - protocol / notification 类型守卫 + - `gateway.ready` fallback 行为 +- 负责合并顺序、冲突协调和最终验收。 + +### SA-1 Protocol & Dispatch + +责任范围: + +- `zn-ai/electron/gateway/protocol.ts` +- `zn-ai/electron/gateway/event-dispatch.ts` +- 必要时小幅修改: + - `zn-ai/electron/gateway/types.ts` + +目标: + +- 迁入 `ClawX` 的 JSON-RPC type guards 与 protocol 类型定义 +- 迁入 protocol event 与 JSON-RPC notification 分发逻辑 +- 让 `zn-ai` 的 Gateway manager 不再只处理当前 OpenClaw event frame,而是具备和 `ClawX` 接近的 protocol fallback 面 + +约束: + +- 不修改 `manager.ts` +- 不修改 Chat store / renderer 事件消费逻辑 + +### SA-2 State & Diagnostics Primitives + +责任范围: + +- `zn-ai/electron/gateway/state.ts` +- `zn-ai/electron/gateway/diagnostics.ts` +- 如有必要,可补最小 diagnostics 类型文件 + +目标: + +- 迁入 `GatewayStateController` +- 定义 `getStatus()` / `isConnected()` / state transition hook 的统一实现 +- 为 manager diagnostics 提供稳定数据结构: + - `lastAliveAt` + - `lastRpcSuccessAt` + - `lastRpcFailureAt` + - `lastRpcFailureMethod` + - `lastHeartbeatTimeoutAt` + - `lastSocketCloseAt` + - `lastSocketCloseCode` + - `consecutiveHeartbeatMisses` + - `consecutiveRpcFailures` + +约束: + +- 不修改 `manager.ts` +- 不新增 UI 诊断入口 + +### SA-3 Manager State & Protocol Integration + +责任范围: + +- `zn-ai/electron/gateway/manager.ts` +- 必要时小幅修改: + - `zn-ai/electron/gateway/ws-client.ts` + +目标: + +- 接入 SA-1 的 protocol / event-dispatch +- 接入 SA-2 的 state controller 与 diagnostics snapshot +- 为 `zn-ai` 补齐: + - `getDiagnostics()` + - `gateway.ready` event handling + - `gatewayReady fallback` timer + - richer state transition handling + - RPC success / failure / socket close / heartbeat timeout 记录 + +约束: + +- `manager.ts` 由 SA-3 单独 owning +- 不借机改动 chat payload 结构 +- 不迁移 telemetry 上传逻辑,除非成为必需依赖 + +### SA-4 Verification & Regression + +责任范围: + +- `zn-ai/tests/*gateway*` +- Gateway 相关 smoke checklist 与文档 + +目标: + +- 给以下能力补测试: + - protocol type guards + - event dispatch mapping + - state transition callbacks + - diagnostics snapshot 更新 + - `gateway.ready` fallback 行为 +- 补一组回归清单,确认新 protocol fallback 不会破坏现有 `chat:*` 事件链路 + +约束: + +- 不修改产品逻辑,除非为了测试可注入而补最小 seam + +## 14. 第三波实施顺序 + +### Wave 3A + +并行推进: + +- SA-1 Protocol & Dispatch +- SA-2 State & Diagnostics Primitives + +冻结输出契约: + +- `protocol.ts` +- `event-dispatch.ts` +- `GatewayStateController` +- diagnostics snapshot 结构 + +### Wave 3B + +在 Wave 3A 契约冻结后推进: + +- SA-3 Manager State & Protocol Integration + +这一步必须保持 `manager.ts` 单 owner,不允许并行写入。 + +### Wave 3C + +最后推进: + +- SA-4 Verification & Regression +- 主协调 Agent 汇总验收 + +## 15. 第三波合并顺序 + +1. `protocol.ts` +2. `event-dispatch.ts` +3. `state.ts` +4. diagnostics types / helper +5. `manager.ts` 接线 +6. tests / smoke / 文档收口 + +## 16. 第三波验收标准 + +必须覆盖以下场景: + +- `zn-ai` 具备与 `ClawX` 接近的 JSON-RPC request/response/notification type guards +- protocol event 与 JSON-RPC notification 能走统一 dispatch 层 +- manager 状态更新不再完全依赖手工分支,而是通过 state controller 收口 +- diagnostics 能记录最近 alive/RPC/socket/heartbeat 关键时间点和失败原因 +- 收到 `gateway.ready` 时能更新 Gateway ready 状态 +- 未收到 `gateway.ready` 时,fallback timer 能在超时后兜底设置 ready +- 现有 `chat:delta` / `chat:final` / `chat:error` / `chat:aborted` 链路不回归 + +建议最少验证: + +- `pnpm typecheck` +- Gateway protocol / diagnostics 相关单测 +- 一次真实 Gateway 启动与 `gateway.ready` 兜底回归 diff --git a/electron/api/context.ts b/electron/api/context.ts index 2dc675a..5951905 100644 --- a/electron/api/context.ts +++ b/electron/api/context.ts @@ -2,10 +2,12 @@ import type { BrowserWindow } from 'electron'; import type { gatewayManager } from '@electron/gateway/manager'; import type { providerApiService } from '@electron/service/provider-api-service'; import type { ClawHubService } from '@electron/gateway/clawhub'; +import type { hostEventBus } from './event-bus'; export interface HostApiContext { gatewayManager: typeof gatewayManager; providerApiService: typeof providerApiService; mainWindow: BrowserWindow | null; clawHubService: ClawHubService; + eventBus: typeof hostEventBus; } diff --git a/electron/api/event-bus.ts b/electron/api/event-bus.ts new file mode 100644 index 0000000..259cf3c --- /dev/null +++ b/electron/api/event-bus.ts @@ -0,0 +1,64 @@ +import type { ServerResponse } from 'node:http'; + +type EventPayload = unknown; + +type EventListener = (payload: EventPayload) => void; + +export class HostEventBus { + private readonly listeners = new Map>(); + private readonly sseClients = new Set(); + + on(eventName: string, listener: EventListener): () => void { + const bucket = this.listeners.get(eventName) ?? new Set(); + bucket.add(listener); + this.listeners.set(eventName, bucket); + + return () => { + bucket.delete(listener); + if (bucket.size === 0) { + this.listeners.delete(eventName); + } + }; + } + + addSseClient(res: ServerResponse): void { + this.sseClients.add(res); + res.on('close', () => { + this.sseClients.delete(res); + }); + } + + emit(eventName: string, payload: EventPayload): void { + const bucket = this.listeners.get(eventName); + if (bucket) { + for (const listener of bucket) { + listener(payload); + } + } + + if (this.sseClients.size > 0) { + const message = `event: ${eventName}\ndata: ${JSON.stringify(payload)}\n\n`; + for (const client of this.sseClients) { + try { + client.write(message); + } catch { + this.sseClients.delete(client); + } + } + } + } + + closeAll(): void { + this.listeners.clear(); + for (const client of this.sseClients) { + try { + client.end(); + } catch { + // Ignore individual client close failures. + } + } + this.sseClients.clear(); + } +} + +export const hostEventBus = new HostEventBus(); diff --git a/electron/api/route-utils.ts b/electron/api/route-utils.ts index 0bac184..ddb799e 100644 --- a/electron/api/route-utils.ts +++ b/electron/api/route-utils.ts @@ -1,4 +1,5 @@ import type { HostApiResult } from '@src/types/runtime'; +import type { IncomingMessage, ServerResponse } from 'node:http'; export interface HostApiRequest { path: string; @@ -16,6 +17,20 @@ export interface NormalizedHostApiRequest { url: URL; } +export async function parseRawJsonBody(req: IncomingMessage): Promise { + const chunks: Buffer[] = []; + for await (const chunk of req) { + chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk)); + } + + const raw = Buffer.concat(chunks).toString('utf8').trim(); + if (!raw) { + return {} as T; + } + + return JSON.parse(raw) as T; +} + export function normalizeRequest(request: HostApiRequest): NormalizedHostApiRequest { const path = String(request.path || '/').trim() || '/'; @@ -61,3 +76,34 @@ export function fail(status: number, error: string, data?: T): Host data, }; } + +export function setCorsHeaders(res: ServerResponse, origin?: string): void { + if (origin) { + res.setHeader('Access-Control-Allow-Origin', origin); + res.setHeader('Vary', 'Origin'); + } else { + res.setHeader('Access-Control-Allow-Origin', '*'); + } + res.setHeader('Access-Control-Allow-Methods', 'GET,POST,PUT,DELETE,OPTIONS'); + res.setHeader('Access-Control-Allow-Headers', 'Content-Type, Authorization, X-Host-Api-Token'); +} + +export function requireJsonContentType(req: IncomingMessage): boolean { + if (req.method === 'GET' || req.method === 'OPTIONS' || req.method === 'HEAD') { + return true; + } + + const contentLength = req.headers['content-length']; + if (contentLength === '0' || contentLength === undefined) { + return true; + } + + const contentType = req.headers['content-type'] || ''; + return contentType.includes('application/json'); +} + +export function sendJsonResponse(res: ServerResponse, statusCode: number, payload: unknown): void { + res.statusCode = statusCode; + res.setHeader('Content-Type', 'application/json; charset=utf-8'); + res.end(JSON.stringify(payload)); +} diff --git a/electron/api/router.ts b/electron/api/router.ts index 5d5c36b..53ea663 100644 --- a/electron/api/router.ts +++ b/electron/api/router.ts @@ -2,6 +2,7 @@ import { BrowserWindow } from 'electron'; import { gatewayManager } from '@electron/gateway/manager'; import { providerApiService } from '@electron/service/provider-api-service'; import { ClawHubService } from '@electron/gateway/clawhub'; +import { hostEventBus } from './event-bus'; import type { HostApiContext } from './context'; import type { HostApiRequest } from './route-utils'; import { normalizeRequest } from './route-utils'; @@ -36,18 +37,19 @@ const routeHandlers: RouteHandler[] = [ handleSkillRoutes, ]; -function createContext(): HostApiContext { +export function createHostApiContext(): HostApiContext { return { gatewayManager, providerApiService, mainWindow: BrowserWindow.getAllWindows()[0] ?? null, clawHubService: new ClawHubService(), + eventBus: hostEventBus, }; } export async function dispatchLocalHostApi(request: HostApiRequest) { const normalized = normalizeRequest(request); - const ctx = createContext(); + const ctx = createHostApiContext(); for (const handler of routeHandlers) { const result = await handler(normalized, ctx); diff --git a/electron/api/server.ts b/electron/api/server.ts new file mode 100644 index 0000000..211d37d --- /dev/null +++ b/electron/api/server.ts @@ -0,0 +1,158 @@ +import { randomBytes } from 'node:crypto'; +import { createServer, type Server } from 'node:http'; +import log from 'electron-log'; +import type { HostApiContext } from './context'; +import type { HostApiRequest } from './route-utils'; +import { + parseRawJsonBody, + requireJsonContentType, + sendJsonResponse, + setCorsHeaders, +} from './route-utils'; + +const DEFAULT_HOST_API_PORT = 13210; + +type StartHostApiServerOptions = { + ctx: HostApiContext; + dispatchRequest: (request: HostApiRequest) => Promise; + fallbackRequest?: (request: HostApiRequest) => Promise; + port?: number; +}; + +let hostApiToken = ''; + +export function getHostApiPort(): number { + const raw = process.env['ZN_AI_HOST_API_PORT']; + const parsed = raw ? Number.parseInt(raw, 10) : NaN; + return Number.isFinite(parsed) && parsed > 0 ? parsed : DEFAULT_HOST_API_PORT; +} + +export function getHostApiToken(): string { + return hostApiToken; +} + +export function getHostApiBase(): string { + return `http://127.0.0.1:${getHostApiPort()}`; +} + +export function startHostApiServer(options: StartHostApiServerOptions): Server { + const port = options.port ?? getHostApiPort(); + hostApiToken = randomBytes(32).toString('hex'); + + const server = createServer(async (req, res) => { + try { + const requestUrl = new URL(req.url || '/', `http://127.0.0.1:${port}`); + setCorsHeaders(res, req.headers.origin); + + if (req.method === 'OPTIONS') { + res.statusCode = 204; + res.end(); + return; + } + + const bearerHeader = req.headers.authorization || ''; + const bearerToken = bearerHeader.startsWith('Bearer ') + ? bearerHeader.slice('Bearer '.length) + : ''; + const token = ( + req.headers['x-host-api-token'] + || requestUrl.searchParams.get('token') + || bearerToken + ); + + if (token !== hostApiToken) { + sendJsonResponse(res, 401, { + success: false, + ok: false, + error: 'Unauthorized', + }); + return; + } + + if (requestUrl.pathname === '/api/events' && req.method === 'GET') { + res.writeHead(200, { + 'Content-Type': 'text/event-stream; charset=utf-8', + 'Cache-Control': 'no-cache, no-transform', + Connection: 'keep-alive', + }); + res.write(': connected\n\n'); + options.ctx.eventBus.addSseClient(res); + res.write(`event: gateway:status\ndata: ${JSON.stringify(options.ctx.gatewayManager.getStatus())}\n\n`); + return; + } + + if (!requireJsonContentType(req)) { + sendJsonResponse(res, 415, { + success: false, + ok: false, + error: 'Content-Type must be application/json', + }); + return; + } + + const body = req.method && ['POST', 'PUT', 'PATCH', 'DELETE'].includes(req.method) + ? await parseRawJsonBody(req) + : null; + + const forwardedHeaders: Record = {}; + Object.entries(req.headers).forEach(([key, value]) => { + if (typeof value === 'string') { + forwardedHeaders[key] = value; + } + }); + delete forwardedHeaders['x-host-api-token']; + + const request: HostApiRequest = { + path: `${requestUrl.pathname}${requestUrl.search}`, + method: req.method, + headers: forwardedHeaders, + body, + }; + + const localResult = await options.dispatchRequest(request); + const response = localResult ?? ( + options.fallbackRequest + ? await options.fallbackRequest(request) + : null + ); + + if (response == null) { + sendJsonResponse(res, 404, { + success: false, + ok: false, + error: `No route for ${req.method} ${requestUrl.pathname}`, + }); + return; + } + + const result = response as { + status?: number; + data?: unknown; + json?: unknown; + success?: boolean; + ok?: boolean; + error?: string; + text?: string; + }; + + sendJsonResponse(res, result.status ?? 200, response); + } catch (error) { + sendJsonResponse(res, 500, { + success: false, + ok: false, + error: error instanceof Error ? error.message : String(error), + }); + } + }); + + server.on('error', (error) => { + log.error('Host API server failed:', error); + }); + + server.on('close', () => { + hostApiToken = ''; + }); + + server.listen(port, '127.0.0.1'); + return server; +} diff --git a/electron/gateway/config-sync.ts b/electron/gateway/config-sync.ts index 3512fad..b061ff8 100644 --- a/electron/gateway/config-sync.ts +++ b/electron/gateway/config-sync.ts @@ -1,5 +1,99 @@ +import { existsSync } from 'node:fs'; +import { delimiter, join } from 'node:path'; +import { app } from 'electron'; import { syncBrowserConfigToOpenClaw } from '@electron/utils/openclaw-auth'; +import { + getOpenClawEntryPath, + getOpenClawResolvedDir, +} from '@electron/utils/paths'; + +export interface GatewayLaunchContext { + openclawDir: string; + entryScript: string; + binDir: string; + gatewayArgs: string[]; + forkEnv: Record; + mode: 'dev' | 'packaged'; + binPathExists: boolean; +} + +export interface PrepareGatewayLaunchContextOptions { + port: number; + token: string; + skipChannels?: boolean; +} + +function prependPathEntry(pathValue: string | undefined, entry: string): string { + if (!pathValue?.trim()) { + return entry; + } + + const segments = pathValue.split(delimiter); + if (segments.includes(entry)) { + return pathValue; + } + + return `${entry}${delimiter}${pathValue}`; +} + +function getBundledBinDir(): string { + if (app.isPackaged) { + return join(process.resourcesPath, 'bin'); + } + + return join(process.cwd(), 'resources', 'bin', `${process.platform}-${process.arch}`); +} export async function syncGatewayConfigBeforeLaunch(): Promise { await syncBrowserConfigToOpenClaw(); } + +export async function prepareGatewayLaunchContext( + options: PrepareGatewayLaunchContextOptions, +): Promise { + await syncGatewayConfigBeforeLaunch(); + + const openclawDir = getOpenClawResolvedDir(); + const entryScript = getOpenClawEntryPath(); + + if (!existsSync(entryScript)) { + throw new Error(`OpenClaw entry script not found at ${entryScript}`); + } + + const gatewayArgs = [ + 'gateway', + '--port', + String(options.port), + '--token', + options.token, + '--allow-unconfigured', + ]; + + const skipChannels = options.skipChannels ?? true; + const binDir = getBundledBinDir(); + const binPathExists = existsSync(binDir); + + const { + NODE_OPTIONS: _nodeOptions, + ELECTRON_RUN_AS_NODE: _electronRunAsNode, + ...baseEnv + } = process.env; + const forkEnv: Record = { + ...baseEnv, + PATH: binPathExists ? prependPathEntry(baseEnv.PATH, binDir) : baseEnv.PATH, + OPENCLAW_GATEWAY_TOKEN: options.token, + OPENCLAW_SKIP_CHANNELS: skipChannels ? '1' : '', + CLAWDBOT_SKIP_CHANNELS: skipChannels ? '1' : '', + OPENCLAW_NO_RESPAWN: '1', + }; + + return { + openclawDir, + entryScript, + binDir, + gatewayArgs, + forkEnv, + mode: app.isPackaged ? 'packaged' : 'dev', + binPathExists, + }; +} diff --git a/electron/gateway/connection-monitor.ts b/electron/gateway/connection-monitor.ts new file mode 100644 index 0000000..2cd07b6 --- /dev/null +++ b/electron/gateway/connection-monitor.ts @@ -0,0 +1,122 @@ +import logManager from '@electron/service/logger'; + +type HealthResult = { ok: boolean; error?: string }; +type HeartbeatAliveReason = 'pong' | 'message'; + +type PingOptions = { + sendPing: () => void; + onHeartbeatTimeout: (context: { consecutiveMisses: number; timeoutMs: number }) => void; + intervalMs?: number; + timeoutMs?: number; + maxConsecutiveMisses?: number; +}; + +export class GatewayConnectionMonitor { + private pingInterval: NodeJS.Timeout | null = null; + private healthCheckInterval: NodeJS.Timeout | null = null; + private lastPingAt = 0; + private waitingForAlive = false; + private consecutiveMisses = 0; + private timeoutTriggered = false; + + startPing(options: PingOptions): void { + const intervalMs = options.intervalMs ?? 30000; + const timeoutMs = options.timeoutMs ?? 10000; + const maxConsecutiveMisses = Math.max(1, options.maxConsecutiveMisses ?? 3); + this.resetHeartbeatState(); + + if (this.pingInterval) { + clearInterval(this.pingInterval); + } + + this.pingInterval = setInterval(() => { + const now = Date.now(); + + if (this.waitingForAlive && now - this.lastPingAt >= timeoutMs) { + this.waitingForAlive = false; + this.consecutiveMisses += 1; + logManager.warn( + `Gateway heartbeat missed (${this.consecutiveMisses}/${maxConsecutiveMisses}, timeout=${timeoutMs}ms)`, + ); + if (this.consecutiveMisses >= maxConsecutiveMisses && !this.timeoutTriggered) { + this.timeoutTriggered = true; + options.onHeartbeatTimeout({ + consecutiveMisses: this.consecutiveMisses, + timeoutMs, + }); + return; + } + } + + options.sendPing(); + this.waitingForAlive = true; + this.lastPingAt = now; + }, intervalMs); + } + + markAlive(reason: HeartbeatAliveReason): void { + if (this.consecutiveMisses > 0) { + logManager.debug(`Gateway heartbeat recovered via ${reason} (misses=${this.consecutiveMisses})`); + } + this.waitingForAlive = false; + this.consecutiveMisses = 0; + this.timeoutTriggered = false; + } + + handlePong(): void { + this.markAlive('pong'); + } + + getConsecutiveMisses(): number { + return this.consecutiveMisses; + } + + startHealthCheck(options: { + shouldCheck: () => boolean; + checkHealth: () => Promise; + onUnhealthy: (errorMessage: string) => void; + onError: (error: unknown) => void; + intervalMs?: number; + }): void { + if (this.healthCheckInterval) { + clearInterval(this.healthCheckInterval); + } + + this.healthCheckInterval = setInterval(async () => { + if (!options.shouldCheck()) { + return; + } + + try { + const health = await options.checkHealth(); + if (!health.ok) { + const errorMessage = health.error ?? 'Health check failed'; + logManager.warn(`Gateway health check failed: ${errorMessage}`); + options.onUnhealthy(errorMessage); + } + } catch (error) { + logManager.error('Gateway health check error:', error); + options.onError(error); + } + }, options.intervalMs ?? 30000); + } + + clear(): void { + if (this.pingInterval) { + clearInterval(this.pingInterval); + this.pingInterval = null; + } + if (this.healthCheckInterval) { + clearInterval(this.healthCheckInterval); + this.healthCheckInterval = null; + } + this.resetHeartbeatState(); + } + + private resetHeartbeatState(): void { + this.lastPingAt = 0; + this.waitingForAlive = false; + this.consecutiveMisses = 0; + this.timeoutTriggered = false; + } +} diff --git a/electron/gateway/diagnostics.ts b/electron/gateway/diagnostics.ts index 9ac395d..0bafda5 100644 --- a/electron/gateway/diagnostics.ts +++ b/electron/gateway/diagnostics.ts @@ -1,5 +1,25 @@ import type { ChannelAccountCatalogGroup, ChannelConnectionStatus } from '@src/lib/channel-types'; import { buildChannelStatusSummary, type ChannelStatusSummary } from '@electron/utils/channel-status'; +import type { GatewayLifecycleState } from './process-policy'; + +export interface GatewayDiagnosticsSnapshot { + lastAliveAt?: number; + lastRpcSuccessAt?: number; + lastRpcFailureAt?: number; + lastRpcFailureMethod?: string; + lastHeartbeatTimeoutAt?: number; + consecutiveHeartbeatMisses: number; + lastSocketCloseAt?: number; + lastSocketCloseCode?: number; + consecutiveRpcFailures: number; +} + +export function createInitialGatewayDiagnostics(): GatewayDiagnosticsSnapshot { + return { + consecutiveHeartbeatMisses: 0, + consecutiveRpcFailures: 0, + }; +} export interface GatewayHealthSnapshot { ok: boolean; @@ -9,6 +29,9 @@ export interface GatewayHealthSnapshot { port?: number | null; pid?: number | null; lastError?: string; + lifecycleState?: GatewayLifecycleState; + gatewayReady?: boolean; + diagnostics?: GatewayDiagnosticsSnapshot; } export interface GatewayDiagnosticsSummary { diff --git a/electron/gateway/event-dispatch.ts b/electron/gateway/event-dispatch.ts new file mode 100644 index 0000000..e33667c --- /dev/null +++ b/electron/gateway/event-dispatch.ts @@ -0,0 +1,55 @@ +import logManager from '@electron/service/logger'; +import { GatewayEventType, type JsonRpcNotification } from './protocol'; + +type GatewayEventEmitter = { + emit: (event: string, payload: unknown) => boolean; +}; + +export function dispatchProtocolEvent( + emitter: GatewayEventEmitter, + event: string, + payload: unknown, +): void { + switch (event) { + case 'tick': + break; + case 'chat': + emitter.emit('chat:message', { message: payload }); + break; + case 'agent': + emitter.emit('notification', { method: event, params: payload }); + break; + case 'channel.status': + case 'channel.status_changed': + emitter.emit('channel:status', payload as { channelId: string; status: string }); + break; + case 'gateway.ready': + case 'ready': + emitter.emit('gateway:ready', payload); + break; + default: + emitter.emit('notification', { method: event, params: payload }); + } +} + +export function dispatchJsonRpcNotification( + emitter: GatewayEventEmitter, + notification: JsonRpcNotification, +): void { + emitter.emit('notification', notification); + switch (notification.method) { + case GatewayEventType.CHANNEL_STATUS_CHANGED: + emitter.emit('channel:status', notification.params as { channelId: string; status: string }); + break; + case GatewayEventType.MESSAGE_RECEIVED: + emitter.emit('chat:message', notification.params as { message: unknown }); + break; + case GatewayEventType.ERROR: { + const errorData = notification.params as { message?: string }; + emitter.emit('error', new Error(errorData.message || 'Gateway error')); + break; + } + default: + logManager.debug(`Unknown Gateway notification: ${notification.method}`); + } +} diff --git a/electron/gateway/launch-strategy.ts b/electron/gateway/launch-strategy.ts new file mode 100644 index 0000000..fdbe2e1 --- /dev/null +++ b/electron/gateway/launch-strategy.ts @@ -0,0 +1,36 @@ +export type GatewayLaunchStrategy = 'utility-process' | 'node-runtime'; + +function normalizeForcedStrategy( + forced: string | undefined, +): GatewayLaunchStrategy | null { + const normalized = forced?.trim().toLowerCase(); + switch (normalized) { + case 'utility': + case 'utility-process': + return 'utility-process'; + case 'node-runtime': + case 'node': + case 'cli': + case 'electron-run-as-node': + return 'node-runtime'; + default: + return null; + } +} + +export function resolveGatewayLaunchStrategy(options: { + platform: NodeJS.Platform; + mode: 'dev' | 'packaged'; + forced?: string | undefined; +}): GatewayLaunchStrategy { + const forced = normalizeForcedStrategy(options.forced); + if (forced) { + return forced; + } + + if (options.platform === 'win32' && options.mode === 'dev') { + return 'node-runtime'; + } + + return 'utility-process'; +} diff --git a/electron/gateway/lifecycle-controller.ts b/electron/gateway/lifecycle-controller.ts new file mode 100644 index 0000000..8d4db86 --- /dev/null +++ b/electron/gateway/lifecycle-controller.ts @@ -0,0 +1,31 @@ +import logManager from '@electron/service/logger'; +import { isLifecycleSuperseded, nextLifecycleEpoch } from './process-policy'; + +export class LifecycleSupersededError extends Error { + constructor(message: string) { + super(message); + this.name = 'LifecycleSupersededError'; + } +} + +export class GatewayLifecycleController { + private epoch = 0; + + getCurrentEpoch(): number { + return this.epoch; + } + + bump(reason: string): number { + this.epoch = nextLifecycleEpoch(this.epoch); + logManager.debug(`Gateway lifecycle epoch advanced to ${this.epoch} (${reason})`); + return this.epoch; + } + + assert(expectedEpoch: number, phase: string): void { + if (isLifecycleSuperseded(expectedEpoch, this.epoch)) { + throw new LifecycleSupersededError( + `Gateway ${phase} superseded (expectedEpoch=${expectedEpoch}, currentEpoch=${this.epoch})`, + ); + } + } +} diff --git a/electron/gateway/manager.ts b/electron/gateway/manager.ts index 8bfe015..ae7949e 100644 --- a/electron/gateway/manager.ts +++ b/electron/gateway/manager.ts @@ -1,4 +1,5 @@ import { randomUUID } from 'node:crypto'; +import { EventEmitter } from 'node:events'; import { createServer } from 'node:net'; import { join } from 'node:path'; import { BrowserWindow } from 'electron'; @@ -7,6 +8,8 @@ import logManager from '@electron/service/logger'; import configManager from '@electron/service/config-service'; import { updateTrayStatus } from '@electron/service/tray'; import { getUserDataDir } from '@electron/utils/paths'; +import { captureTelemetryEvent, trackMetric } from '@electron/utils/telemetry'; +import WebSocket from 'ws'; import { loadOrCreateDeviceIdentity, type DeviceIdentity, @@ -17,14 +20,49 @@ import type { ContentBlock, RawMessage } from '@runtime/shared/chat-model'; import type { GatewayEvent, GatewayRpcParams, RuntimeRefreshTopic } from './types'; import * as providerHandlers from './handlers/provider'; import * as skillHandlers from './handlers/skills'; +import { + createInitialGatewayDiagnostics, + type GatewayDiagnosticsSnapshot, +} from './diagnostics'; +import { dispatchJsonRpcNotification, dispatchProtocolEvent } from './event-dispatch'; import { OpenClawProcessOwner } from './openclaw-process-owner'; import { launchGatewayProcess } from './process-launcher'; +import { prepareGatewayLaunchContext } from './config-sync'; +import { + DEFAULT_RECONNECT_CONFIG, + type ReconnectConfig, + type GatewayLifecycleState, + getReconnectScheduleDecision, + getReconnectSkipReason, +} from './process-policy'; +import { isNotification, isResponse, type JsonRpcNotification } from './protocol'; import { clearPendingGatewayRequests, rejectPendingGatewayRequest, resolvePendingGatewayRequest, type PendingGatewayRequest, } from './request-store'; +import { classifyGatewayStderrMessage, recordGatewayStartupStderrLine } from './startup-stderr'; +import { runGatewayStartupSequence } from './startup-orchestrator'; +import { + findExistingGatewayProcess, + runOpenClawDoctorRepair, + terminateOwnedGatewayProcess, + unloadLaunchctlGatewayService, + waitForPortFree, + warmupManagedPythonReadiness, +} from './supervisor'; +import type { GatewayProcessHandle } from './process-handle'; +import { GatewayConnectionMonitor } from './connection-monitor'; +import { GatewayLifecycleController, LifecycleSupersededError } from './lifecycle-controller'; +import { GatewayRestartController } from './restart-controller'; +import { GatewayRestartGovernor } from './restart-governor'; +import { + DEFAULT_GATEWAY_RELOAD_POLICY, + loadGatewayReloadPolicy, + type GatewayReloadPolicy, +} from './reload-policy'; +import { GatewayStateController, type GatewayRuntimeStatus } from './state'; import { connectGatewaySocket, waitForGatewayReady } from './ws-client'; type RuntimeChangeBroadcast = { @@ -51,6 +89,24 @@ type GatewayEventFrame = { payload?: unknown; }; +type GatewayNotificationEvent = + | JsonRpcNotification + | { + method: string; + params?: unknown; + }; + +export interface GatewayManagerEvents { + status: (status: GatewayRuntimeStatus) => void; + message: (message: unknown) => void; + notification: (notification: GatewayNotificationEvent) => void; + exit: (code: number | null) => void; + error: (error: Error) => void; + 'channel:status': (data: { channelId: string; status: string }) => void; + 'chat:message': (data: { message: unknown }) => void; + 'gateway:ready': (payload: unknown) => void; +} + function isRecord(value: unknown): value is Record { return typeof value === 'object' && value !== null; } @@ -62,6 +118,14 @@ function toErrorMessage(error: unknown): string { return String(error); } +function isTransportRpcFailure(error: unknown): boolean { + const message = error instanceof Error ? error.message : String(error); + return message.includes('Gateway RPC timed out:') + || message.includes('OpenClaw Gateway socket is not connected') + || message.includes('Gateway request cancelled:') + || message.includes('Failed to send Gateway RPC'); +} + function normalizeTimestamp(value: unknown): number | undefined { if (typeof value === 'number' && Number.isFinite(value)) { return value; @@ -217,29 +281,226 @@ async function findAvailablePort(): Promise { }); } -class GatewayManager { +export class GatewayManager extends EventEmitter { private initialized = false; private initPromise: Promise | null = null; private startPromise: Promise | null = null; private stopPromise: Promise | null = null; private status: GatewayStatus = 'disconnected'; + private lifecycleState: GatewayLifecycleState = 'stopped'; + private gatewayStatusDetails: GatewayRuntimeStatus = { state: 'stopped', port: null }; private readonly mode = 'openclaw' as const; private readonly processOwner = new OpenClawProcessOwner(); private readonly pendingRequests = new Map(); private readonly deltaSnapshots = new Map(); private gatewayToken = randomUUID(); private socket: WebSocket | null = null; - private child: Electron.UtilityProcess | null = null; + private child: GatewayProcessHandle | null = null; private port: number | null = null; private exitCode: number | null = null; private lastError?: string; private stopping = false; private deviceIdentity: DeviceIdentity | null = null; + private ownsProcess = false; + private externalShutdownSupported: boolean | null = null; + private reconnectTimer: NodeJS.Timeout | null = null; + private reconnectAttempts = 0; + private reconnectAttemptsTotal = 0; + private reconnectSuccessTotal = 0; + private reconnectConfig: ReconnectConfig; + private shouldReconnect = true; + private startLock = false; + private restartInFlight: Promise | null = null; + private readonly connectionMonitor = new GatewayConnectionMonitor(); + private readonly lifecycleController = new GatewayLifecycleController(); + private readonly restartController = new GatewayRestartController(); + private readonly restartGovernor = new GatewayRestartGovernor(); + private reloadDebounceTimer: NodeJS.Timeout | null = null; + private reloadPolicy: GatewayReloadPolicy = { ...DEFAULT_GATEWAY_RELOAD_POLICY }; + private reloadPolicyLoadedAt = 0; + private reloadPolicyRefreshPromise: Promise | null = null; + private isAutoReconnectStart = false; + private pendingRuntimeChange: RuntimeChangeBroadcast | null = null; + private lastSpawnSummary: string | null = null; + private recentStartupStderrLines: string[] = []; + private readonly stateController: GatewayStateController; + private gatewayReadyFallbackTimer: NodeJS.Timeout | null = null; + private diagnostics: GatewayDiagnosticsSnapshot = createInitialGatewayDiagnostics(); - private setStatus(status: GatewayStatus): void { - this.status = status; - updateTrayStatus(status); - this.broadcast({ type: 'gateway:status', status }); + private static readonly RELOAD_POLICY_REFRESH_MS = 15_000; + private static readonly HEARTBEAT_INTERVAL_MS = 30_000; + private static readonly HEARTBEAT_TIMEOUT_MS = 12_000; + private static readonly HEARTBEAT_MAX_MISSES = 3; + private static readonly HEARTBEAT_INTERVAL_MS_WIN = 60_000; + private static readonly HEARTBEAT_TIMEOUT_MS_WIN = 25_000; + private static readonly HEARTBEAT_MAX_MISSES_WIN = 5; + private static readonly GATEWAY_READY_FALLBACK_MS = 30_000; + + override on(eventName: 'status', listener: GatewayManagerEvents['status']): this; + override on(eventName: 'message', listener: GatewayManagerEvents['message']): this; + override on(eventName: 'notification', listener: GatewayManagerEvents['notification']): this; + override on(eventName: 'exit', listener: GatewayManagerEvents['exit']): this; + override on(eventName: 'error', listener: GatewayManagerEvents['error']): this; + override on(eventName: 'channel:status', listener: GatewayManagerEvents['channel:status']): this; + override on(eventName: 'chat:message', listener: GatewayManagerEvents['chat:message']): this; + override on(eventName: 'gateway:ready', listener: GatewayManagerEvents['gateway:ready']): this; + override on(eventName: string | symbol, listener: (...args: any[]) => void): this; + override on(eventName: string | symbol, listener: (...args: any[]) => void): this { + return super.on(eventName, listener); + } + + override once(eventName: 'status', listener: GatewayManagerEvents['status']): this; + override once(eventName: 'message', listener: GatewayManagerEvents['message']): this; + override once(eventName: 'notification', listener: GatewayManagerEvents['notification']): this; + override once(eventName: 'exit', listener: GatewayManagerEvents['exit']): this; + override once(eventName: 'error', listener: GatewayManagerEvents['error']): this; + override once(eventName: 'channel:status', listener: GatewayManagerEvents['channel:status']): this; + override once(eventName: 'chat:message', listener: GatewayManagerEvents['chat:message']): this; + override once(eventName: 'gateway:ready', listener: GatewayManagerEvents['gateway:ready']): this; + override once(eventName: string | symbol, listener: (...args: any[]) => void): this; + override once(eventName: string | symbol, listener: (...args: any[]) => void): this { + return super.once(eventName, listener); + } + + override off(eventName: 'status', listener: GatewayManagerEvents['status']): this; + override off(eventName: 'message', listener: GatewayManagerEvents['message']): this; + override off(eventName: 'notification', listener: GatewayManagerEvents['notification']): this; + override off(eventName: 'exit', listener: GatewayManagerEvents['exit']): this; + override off(eventName: 'error', listener: GatewayManagerEvents['error']): this; + override off(eventName: 'channel:status', listener: GatewayManagerEvents['channel:status']): this; + override off(eventName: 'chat:message', listener: GatewayManagerEvents['chat:message']): this; + override off(eventName: 'gateway:ready', listener: GatewayManagerEvents['gateway:ready']): this; + override off(eventName: string | symbol, listener: (...args: any[]) => void): this; + override off(eventName: string | symbol, listener: (...args: any[]) => void): this { + return super.off(eventName, listener); + } + + constructor(config?: Partial) { + super(); + this.reconnectConfig = { ...DEFAULT_RECONNECT_CONFIG, ...config }; + this.stateController = new GatewayStateController({ + emitStatus: (status) => { + this.gatewayStatusDetails = status; + this.lifecycleState = status.state; + this.status = this.toGatewayStatus(status.state); + this.emit('status', { ...status }); + updateTrayStatus(this.status); + this.broadcast({ type: 'gateway:status', status: this.status }); + }, + onTransition: (previousState, nextState) => { + if (nextState === 'running') { + this.restartGovernor.onRunning(); + } + this.restartController.flushDeferredRestart( + `status:${previousState}->${nextState}`, + { + state: this.lifecycleState, + startLock: this.startLock, + shouldReconnect: this.shouldReconnect, + }, + () => { + void this.restart().catch((error) => { + logManager.warn('Deferred Gateway restart failed:', error); + }); + }, + ); + }, + }); + + this.on('gateway:ready', () => { + this.clearGatewayReadyFallback(); + if (this.lifecycleState === 'running' && !this.gatewayStatusDetails.gatewayReady) { + logManager.info('Gateway subsystems ready (event received)'); + this.setGatewayState({ gatewayReady: true }); + } + }); + + this.on('error', (error) => { + logManager.debug('GatewayManager emitted error event', error); + }); + } + + private sanitizeSpawnArgs(args: string[]): string[] { + const sanitized = [...args]; + const tokenIndex = sanitized.indexOf('--token'); + if (tokenIndex >= 0 && tokenIndex + 1 < sanitized.length) { + sanitized[tokenIndex + 1] = ''; + } + return sanitized; + } + + private isUnsupportedShutdownError(error: unknown): boolean { + const message = error instanceof Error ? error.message : String(error); + return /unknown method:\s*shutdown/i.test(message); + } + + private toGatewayStatus(state: GatewayLifecycleState): GatewayStatus { + switch (state) { + case 'running': + return 'connected'; + case 'starting': + case 'reconnecting': + return 'reconnecting'; + case 'stopped': + case 'error': + default: + return 'disconnected'; + } + } + + private setGatewayState(update: Partial): void { + this.stateController.setStatus(update); + } + + private queueRuntimeChange(change?: RuntimeChangeBroadcast): void { + if (!change) { + return; + } + + if (!this.pendingRuntimeChange) { + this.pendingRuntimeChange = { + topics: [...change.topics], + reason: change.reason, + warnings: change.warnings ? [...change.warnings] : undefined, + channelType: change.channelType, + accountId: change.accountId, + }; + return; + } + + this.pendingRuntimeChange = { + topics: Array.from(new Set([...this.pendingRuntimeChange.topics, ...change.topics])), + reason: change.reason ?? this.pendingRuntimeChange.reason, + warnings: Array.from(new Set([ + ...(this.pendingRuntimeChange.warnings ?? []), + ...(change.warnings ?? []), + ])), + channelType: change.channelType ?? this.pendingRuntimeChange.channelType, + accountId: change.accountId ?? this.pendingRuntimeChange.accountId, + }; + } + + private takeQueuedRuntimeChange(): RuntimeChangeBroadcast | undefined { + if (!this.pendingRuntimeChange) { + return undefined; + } + + const change = this.pendingRuntimeChange; + this.pendingRuntimeChange = null; + return change; + } + + private flushQueuedRuntimeChangeFailure(prefix: string, error: unknown): void { + const runtimeChange = this.takeQueuedRuntimeChange(); + if (!runtimeChange) { + return; + } + + const warning = `${prefix}: ${toErrorMessage(error)}`; + this.notifyRuntimeChanged({ + ...runtimeChange, + warnings: [...(runtimeChange.warnings ?? []), warning], + }); } private async initDeviceIdentity(): Promise { @@ -258,39 +519,18 @@ class GatewayManager { } } - private async terminateChild(child: Electron.UtilityProcess): Promise { - await new Promise((resolve) => { - let settled = false; - - const finish = () => { - if (settled) return; - settled = true; - resolve(); - }; - - child.once('exit', () => { - finish(); - }); - - setTimeout(() => { - finish(); - }, 1500); - - try { - child.kill(); - } catch { - finish(); - } - }); + private async terminateChild(child: GatewayProcessHandle): Promise { + await terminateOwnedGatewayProcess(child); } private async disposeTransport(reason: string): Promise { const socket = this.socket; this.socket = null; + this.clearGatewayReadyFallback(); if (socket) { try { - socket.close(); + socket.terminate(); } catch (error) { logManager.warn(`Failed to close OpenClaw Gateway socket during ${reason}:`, error); } @@ -299,6 +539,7 @@ class GatewayManager { const child = this.child; this.child = null; this.port = null; + this.ownsProcess = false; if (child && this.exitCode === null) { this.exitCode = -1; } else if (!child) { @@ -311,42 +552,154 @@ class GatewayManager { new Error(`Gateway request cancelled: ${reason}`), ); + this.setGatewayState({ + port: this.port, + pid: undefined, + connectedAt: undefined, + uptime: undefined, + gatewayReady: undefined, + }); + if (child) { await this.terminateChild(child); } } - private bindProcessLifecycle(child: Electron.UtilityProcess): void { - child.on('exit', (code) => { - if (this.child !== child) { - return; - } + private handleGatewayProcessExit(child: GatewayProcessHandle, code: number | null): void { + if (this.child !== child) { + return; + } - this.exitCode = code ?? -1; - this.child = null; - this.port = null; + this.exitCode = code ?? -1; + this.child = null; + this.ownsProcess = false; + this.connectionMonitor.clear(); + this.clearGatewayReadyFallback(); + this.emit('exit', code); - if (this.stopping) { - return; - } + if (this.stopping) { + return; + } - this.lastError = `OpenClaw Gateway exited unexpectedly (code=${code ?? 'unknown'})`; - this.socket = null; - this.deltaSnapshots.clear(); - clearPendingGatewayRequests( - this.pendingRequests, - new Error(this.lastError), - ); - this.setStatus('disconnected'); - logManager.warn(this.lastError); + this.lastError = `OpenClaw Gateway exited unexpectedly (code=${code ?? 'unknown'})`; + this.socket = null; + this.deltaSnapshots.clear(); + clearPendingGatewayRequests( + this.pendingRequests, + new Error(this.lastError), + ); + this.setGatewayState({ + state: 'stopped', + port: this.port, + pid: undefined, + error: this.lastError, + connectedAt: undefined, + uptime: undefined, + gatewayReady: undefined, + reconnectAttempts: this.reconnectAttempts, + }); + logManager.warn(this.lastError); + this.scheduleReconnect(); + } + + private handleGatewayProcessError(error: Error): void { + this.lastError = toErrorMessage(error); + logManager.error('OpenClaw Gateway process error:', error); + this.emit('error', error); + } + + private async spawnGatewayProcess(): Promise { + if (this.port === null) { + throw new Error('OpenClaw Gateway port not allocated'); + } + + const launchContext = await prepareGatewayLaunchContext({ + port: this.port, + token: this.gatewayToken, + }); + await unloadLaunchctlGatewayService(); + + const stderrDedup = new Map(); + const { child, lastSpawnSummary } = await launchGatewayProcess({ + port: this.port, + launchContext, + sanitizeSpawnArgs: (args) => this.sanitizeSpawnArgs(args), + onStdoutLine: (line) => { + logManager.debug(`[OpenClaw stdout] ${line}`); + }, + onStderrLine: (line) => { + recordGatewayStartupStderrLine(this.recentStartupStderrLines, line); + const classified = classifyGatewayStderrMessage(line); + if (classified.level === 'drop') { + return; + } + + const count = (stderrDedup.get(classified.normalized) ?? 0) + 1; + stderrDedup.set(classified.normalized, count); + if (count > 1) { + if (count % 50 === 0) { + logManager.debug(`[Gateway stderr] (suppressed ${count} repeats) ${classified.normalized}`); + } + return; + } + + if (classified.level === 'debug') { + logManager.debug(`[Gateway stderr] ${classified.normalized}`); + return; + } + + logManager.warn(`[Gateway stderr] ${classified.normalized}`); + }, + onExit: (exitedChild, code) => { + this.handleGatewayProcessExit(exitedChild, code); + }, + onError: (error) => { + this.handleGatewayProcessError(error); + }, }); - child.on('error', (error) => { - this.lastError = toErrorMessage(error); - logManager.error('OpenClaw Gateway process error:', error); + this.child = child; + this.ownsProcess = true; + this.lastSpawnSummary = lastSpawnSummary; + this.setGatewayState({ + port: this.port, + pid: child.pid ?? undefined, + error: undefined, + reconnectAttempts: this.reconnectAttempts, }); } + private async connectToGateway(port: number, externalToken?: string): Promise { + const token = externalToken ?? this.gatewayToken; + const socket = await connectGatewaySocket({ + port, + token, + deviceIdentity: this.deviceIdentity, + platform: process.platform, + onMessage: (message) => this.handleGatewayFrame(message), + onCloseAfterHandshake: (socket, code) => this.handleGatewaySocketClosed(socket, code), + }); + this.socket = socket; + socket.on('pong', () => { + this.connectionMonitor.markAlive('pong'); + this.recordGatewayAlive(); + }); + this.recordGatewayAlive(); + this.connectionMonitor.markAlive('message'); + this.setGatewayState({ + state: 'running', + port, + pid: this.child?.pid ?? this.gatewayStatusDetails.pid, + error: undefined, + connectedAt: Date.now(), + reconnectAttempts: this.reconnectAttempts, + gatewayReady: false, + }); + this.startPing(); + this.startHealthCheck(); + this.scheduleGatewayReadyFallback(); + } + private handleGatewaySocketClosed(socket: WebSocket, code: number): void { if (this.socket !== socket) { return; @@ -357,17 +710,38 @@ class GatewayManager { } this.socket = null; + this.connectionMonitor.clear(); + this.clearGatewayReadyFallback(); + this.recordSocketClose(code); + this.diagnostics.consecutiveHeartbeatMisses = 0; this.deltaSnapshots.clear(); this.lastError = `OpenClaw Gateway socket closed (code=${code})`; clearPendingGatewayRequests( this.pendingRequests, new Error(this.lastError), ); - this.setStatus('disconnected'); + const wasRunning = this.lifecycleState === 'running'; + this.setGatewayState({ + state: 'stopped', + port: this.port, + pid: this.child?.pid ?? undefined, + error: this.lastError, + connectedAt: undefined, + uptime: undefined, + gatewayReady: undefined, + reconnectAttempts: this.reconnectAttempts, + }); logManager.warn(this.lastError); + + if (wasRunning && (process.platform !== 'win32' || code === 1012)) { + this.scheduleReconnect(); + } } private handleGatewayFrame(frame: unknown): void { + this.connectionMonitor.markAlive('message'); + this.recordGatewayAlive(); + if (!isRecord(frame)) { return; } @@ -392,23 +766,33 @@ class GatewayManager { } if (frame.type === 'event' && typeof frame.event === 'string') { - this.handleGatewayEvent(frame as GatewayEventFrame); + const eventFrame = frame as GatewayEventFrame; + if (eventFrame.event === 'chat' && isRecord(eventFrame.payload)) { + this.handleChatEvent(eventFrame.payload); + } + dispatchProtocolEvent(this, eventFrame.event, eventFrame.payload); + return; } - } - private handleGatewayEvent(event: GatewayEventFrame): void { - switch (event.event) { - case 'chat': - if (isRecord(event.payload)) { - this.handleChatEvent(event.payload); - } - break; - case 'gateway.ready': - logManager.info('OpenClaw Gateway reported ready'); - break; - default: - break; + if (isResponse(frame) && frame.id && this.pendingRequests.has(String(frame.id))) { + if (frame.error) { + rejectPendingGatewayRequest( + this.pendingRequests, + String(frame.id), + buildGatewayRpcError(frame.error, `Gateway RPC failed: ${String(frame.id)}`), + ); + } else { + resolvePendingGatewayRequest(this.pendingRequests, String(frame.id), frame.result); + } + return; } + + if (isNotification(frame)) { + dispatchJsonRpcNotification(this, frame as JsonRpcNotification); + return; + } + + this.emit('message', frame); } private handleChatEvent(payload: Record): void { @@ -500,12 +884,17 @@ class GatewayManager { } } - private async rpcGateway( + private async sendGatewayRequest( method: string, - params: Record, - options?: { timeoutMs?: number | null }, + params?: Record, + options?: { + timeoutMs?: number | null; + skipStart?: boolean; + }, ): Promise { - await this.start(); + if (!options?.skipStart) { + await this.start(); + } const socket = this.socket; if (!socket || socket.readyState !== WebSocket.OPEN) { @@ -546,9 +935,361 @@ class GatewayManager { new Error(`Failed to send Gateway RPC ${method}: ${toErrorMessage(error)}`), ); } + }).then((result) => { + this.recordRpcSuccess(); + return result; + }).catch((error) => { + if (isTransportRpcFailure(error)) { + this.recordRpcFailure(method); + } + throw error; }); } + private async rpcGateway( + method: string, + params: Record, + options?: { timeoutMs?: number | null }, + ): Promise { + return await this.sendGatewayRequest(method, params, options); + } + + private async requestGatewayShutdown(timeoutMs = 5_000): Promise { + await this.sendGatewayRequest( + 'shutdown', + undefined, + { + timeoutMs, + skipStart: true, + }, + ); + } + + private startHealthCheck(): void { + this.connectionMonitor.startHealthCheck({ + shouldCheck: () => this.lifecycleState === 'running', + checkHealth: () => this.checkHealth(), + onUnhealthy: (errorMessage) => { + logManager.warn(`Gateway health check failed: ${errorMessage}`); + this.emit('error', new Error(errorMessage)); + }, + onError: () => { + // The monitor already logged the error. + }, + }); + } + + private startPing(): void { + const isWindows = process.platform === 'win32'; + this.connectionMonitor.startPing({ + intervalMs: isWindows + ? GatewayManager.HEARTBEAT_INTERVAL_MS_WIN + : GatewayManager.HEARTBEAT_INTERVAL_MS, + timeoutMs: isWindows + ? GatewayManager.HEARTBEAT_TIMEOUT_MS_WIN + : GatewayManager.HEARTBEAT_TIMEOUT_MS, + maxConsecutiveMisses: isWindows + ? GatewayManager.HEARTBEAT_MAX_MISSES_WIN + : GatewayManager.HEARTBEAT_MAX_MISSES, + sendPing: () => { + if (this.socket?.readyState === WebSocket.OPEN) { + this.socket.ping(); + } + }, + onHeartbeatTimeout: ({ consecutiveMisses, timeoutMs }) => { + this.recordHeartbeatTimeout(consecutiveMisses); + logManager.warn( + `Gateway heartbeat: ${consecutiveMisses} consecutive pong misses (timeout=${timeoutMs}ms, state=${this.lifecycleState}, autoReconnect=${this.shouldReconnect}).`, + ); + if (!this.shouldReconnect || this.lifecycleState !== 'running') { + return; + } + if (process.platform === 'win32') { + logManager.warn('Gateway heartbeat recovery skipped on Windows; waiting for process exit or socket close'); + return; + } + void this.restart().catch((error) => { + logManager.warn('Gateway heartbeat recovery failed:', error); + }); + }, + }); + } + + private async refreshReloadPolicy(force = false): Promise { + const now = Date.now(); + if (!force && now - this.reloadPolicyLoadedAt < GatewayManager.RELOAD_POLICY_REFRESH_MS) { + return; + } + + if (this.reloadPolicyRefreshPromise) { + await this.reloadPolicyRefreshPromise; + return; + } + + this.reloadPolicyRefreshPromise = (async () => { + const nextPolicy = await loadGatewayReloadPolicy(); + this.reloadPolicy = nextPolicy; + this.reloadPolicyLoadedAt = Date.now(); + })(); + + try { + await this.reloadPolicyRefreshPromise; + } finally { + this.reloadPolicyRefreshPromise = null; + } + } + + private clearAllTimers(): void { + if (this.reconnectTimer) { + clearTimeout(this.reconnectTimer); + this.reconnectTimer = null; + } + this.connectionMonitor.clear(); + this.restartController.clearDebounceTimer(); + if (this.reloadDebounceTimer) { + clearTimeout(this.reloadDebounceTimer); + this.reloadDebounceTimer = null; + } + this.clearGatewayReadyFallback(); + } + + private clearGatewayReadyFallback(): void { + if (this.gatewayReadyFallbackTimer) { + clearTimeout(this.gatewayReadyFallbackTimer); + this.gatewayReadyFallbackTimer = null; + } + } + + private scheduleGatewayReadyFallback(): void { + this.clearGatewayReadyFallback(); + this.gatewayReadyFallbackTimer = setTimeout(() => { + this.gatewayReadyFallbackTimer = null; + if (this.lifecycleState === 'running' && !this.gatewayStatusDetails.gatewayReady) { + logManager.info('Gateway ready fallback triggered (no gateway.ready event within timeout)'); + this.setGatewayState({ gatewayReady: true }); + } + }, GatewayManager.GATEWAY_READY_FALLBACK_MS); + } + + getDiagnostics(): GatewayDiagnosticsSnapshot { + return { ...this.diagnostics }; + } + + private recordGatewayAlive(): void { + this.diagnostics.lastAliveAt = Date.now(); + this.diagnostics.consecutiveHeartbeatMisses = 0; + } + + private recordRpcSuccess(): void { + this.diagnostics.lastRpcSuccessAt = Date.now(); + this.diagnostics.consecutiveRpcFailures = 0; + } + + private recordRpcFailure(method: string): void { + this.diagnostics.lastRpcFailureAt = Date.now(); + this.diagnostics.lastRpcFailureMethod = method; + this.diagnostics.consecutiveRpcFailures += 1; + } + + private recordHeartbeatTimeout(consecutiveMisses: number): void { + this.diagnostics.lastHeartbeatTimeoutAt = Date.now(); + this.diagnostics.consecutiveHeartbeatMisses = consecutiveMisses; + } + + private recordSocketClose(code: number): void { + this.diagnostics.lastSocketCloseAt = Date.now(); + this.diagnostics.lastSocketCloseCode = code; + } + + private emitReconnectMetric( + outcome: 'success' | 'failure', + payload: { + attemptNo: number; + maxAttempts: number; + delayMs: number; + error?: string; + }, + ): void { + const successRate = this.reconnectAttemptsTotal > 0 + ? this.reconnectSuccessTotal / this.reconnectAttemptsTotal + : 0; + + trackMetric('gateway.reconnect', { + outcome, + attemptNo: payload.attemptNo, + maxAttempts: payload.maxAttempts, + delayMs: payload.delayMs, + gateway_reconnect_success_count: this.reconnectSuccessTotal, + gateway_reconnect_attempt_count: this.reconnectAttemptsTotal, + gateway_reconnect_success_rate: Number(successRate.toFixed(4)), + ...(payload.error ? { error: payload.error } : {}), + }); + } + + private scheduleReconnect(): void { + const decision = getReconnectScheduleDecision({ + shouldReconnect: this.shouldReconnect, + hasReconnectTimer: this.reconnectTimer !== null, + reconnectAttempts: this.reconnectAttempts, + maxAttempts: this.reconnectConfig.maxAttempts, + baseDelay: this.reconnectConfig.baseDelay, + maxDelay: this.reconnectConfig.maxDelay, + }); + + if (decision.action === 'skip') { + return; + } + + if (decision.action === 'already-scheduled') { + return; + } + + if (decision.action === 'fail') { + this.lastError = 'Failed to reconnect after maximum attempts'; + this.setGatewayState({ + state: 'error', + port: this.port, + pid: this.child?.pid ?? this.gatewayStatusDetails.pid, + error: this.lastError, + reconnectAttempts: this.reconnectAttempts, + gatewayReady: undefined, + }); + return; + } + + const { delay, nextAttempt, maxAttempts } = decision; + this.reconnectAttempts = nextAttempt; + this.setGatewayState({ + state: 'reconnecting', + port: this.port, + pid: this.child?.pid ?? this.gatewayStatusDetails.pid, + error: this.lastError, + reconnectAttempts: nextAttempt, + gatewayReady: false, + }); + const scheduledEpoch = this.lifecycleController.getCurrentEpoch(); + + this.reconnectTimer = setTimeout(async () => { + this.reconnectTimer = null; + const skipReason = getReconnectSkipReason({ + scheduledEpoch, + currentEpoch: this.lifecycleController.getCurrentEpoch(), + shouldReconnect: this.shouldReconnect, + }); + if (skipReason) { + return; + } + + const attemptNo = this.reconnectAttempts; + this.reconnectAttemptsTotal += 1; + try { + this.isAutoReconnectStart = true; + await this.start(); + this.reconnectSuccessTotal += 1; + this.emitReconnectMetric('success', { + attemptNo, + maxAttempts, + delayMs: delay, + }); + this.reconnectAttempts = 0; + } catch (error) { + logManager.error( + `Gateway reconnection attempt ${nextAttempt}/${maxAttempts} failed:`, + error, + ); + this.emitReconnectMetric('failure', { + attemptNo, + maxAttempts, + delayMs: delay, + error: toErrorMessage(error), + }); + this.scheduleReconnect(); + } + }, delay); + } + + debouncedRestart(delayMs = 2000, options?: RuntimeChangeBroadcast): void { + this.queueRuntimeChange(options); + this.restartController.debouncedRestart(delayMs, () => { + void this.restart().catch((error) => { + logManager.warn('Debounced Gateway restart failed:', error); + this.flushQueuedRuntimeChangeFailure('Gateway restart failed', error); + }); + }); + } + + async reload(options?: RuntimeChangeBroadcast): Promise { + this.queueRuntimeChange(options); + await this.refreshReloadPolicy(); + + if (this.reloadPolicy.mode === 'off' || this.reloadPolicy.mode === 'restart') { + await this.restart(); + return; + } + + if (this.restartController.isRestartDeferred({ + state: this.lifecycleState, + startLock: this.startLock, + })) { + this.restartController.markDeferredRestart('reload', { + state: this.lifecycleState, + startLock: this.startLock, + }); + return; + } + + if (!this.child?.pid || this.lifecycleState !== 'running') { + await this.restart(); + return; + } + + if (process.platform === 'win32') { + await this.restart(); + return; + } + + try { + process.kill(this.child.pid, 'SIGUSR1'); + await new Promise((resolve) => setTimeout(resolve, 1500)); + + if (this.lifecycleState !== 'running' || !this.child?.pid) { + await this.restart(); + return; + } + + const runtimeChange = this.takeQueuedRuntimeChange(); + if (runtimeChange) { + this.notifyRuntimeChanged(runtimeChange); + } + } catch (error) { + logManager.warn('Gateway reload signal failed, falling back to restart:', error); + await this.restart(); + } + } + + debouncedReload(delayMs?: number, options?: RuntimeChangeBroadcast): void { + this.queueRuntimeChange(options); + void this.refreshReloadPolicy(); + + const effectiveDelay = delayMs ?? this.reloadPolicy.debounceMs; + if (this.reloadPolicy.mode === 'off' || this.reloadPolicy.mode === 'restart') { + this.debouncedRestart(effectiveDelay); + return; + } + + if (this.reloadDebounceTimer) { + clearTimeout(this.reloadDebounceTimer); + } + + this.reloadDebounceTimer = setTimeout(() => { + this.reloadDebounceTimer = null; + void this.reload().catch((error) => { + logManager.warn('Debounced Gateway reload failed:', error); + this.flushQueuedRuntimeChangeFailure('Gateway reload failed', error); + }); + }, effectiveDelay); + } + async init(): Promise { if (this.initialized) { return; @@ -564,7 +1305,13 @@ class GatewayManager { const autoStart = Boolean(configManager.get(CONFIG_KEYS.GATEWAY_AUTO_START)); if (!autoStart) { - this.setStatus('disconnected'); + this.setGatewayState({ + state: 'stopped', + port: this.port, + pid: this.child?.pid ?? this.gatewayStatusDetails.pid, + gatewayReady: undefined, + reconnectAttempts: this.reconnectAttempts, + }); return; } @@ -572,7 +1319,14 @@ class GatewayManager { await this.start(); } catch (error) { this.lastError = toErrorMessage(error); - this.setStatus('disconnected'); + this.setGatewayState({ + state: 'stopped', + port: this.port, + pid: this.child?.pid ?? this.gatewayStatusDetails.pid, + error: this.lastError, + gatewayReady: undefined, + reconnectAttempts: this.reconnectAttempts, + }); logManager.error('Failed to auto-start OpenClaw Gateway:', error); } })(); @@ -585,7 +1339,14 @@ class GatewayManager { } async start(): Promise { - if (this.status === 'connected' && this.socket?.readyState === WebSocket.OPEN) { + if (this.startLock) { + if (this.startPromise) { + return await this.startPromise; + } + return; + } + + if (this.lifecycleState === 'running' && this.socket?.readyState === WebSocket.OPEN) { return; } @@ -599,9 +1360,12 @@ class GatewayManager { await this.stopPromise; } + this.startLock = true; + const startEpoch = this.lifecycleController.bump('start'); this.stopping = false; this.lastError = undefined; - this.setStatus('reconnecting'); + this.shouldReconnect = true; + await this.refreshReloadPolicy(true); try { await this.initDeviceIdentity(); @@ -611,47 +1375,159 @@ class GatewayManager { throw new Error(runtimeStatus.lastError || `OpenClaw entry not found at ${runtimeStatus.runtimePaths.entryPath}`); } - await this.disposeTransport('starting OpenClaw Gateway'); + if (this.reconnectTimer) { + clearTimeout(this.reconnectTimer); + this.reconnectTimer = null; + } - this.port = await findAvailablePort(); + const wasAutoReconnectStart = this.isAutoReconnectStart; + if (!wasAutoReconnectStart) { + this.reconnectAttempts = 0; + } + this.isAutoReconnectStart = false; + + const reusingManagedProcess = + this.child?.pid != null && + this.ownsProcess && + this.port != null && + this.socket === null; + const shouldDisposeTransport = !reusingManagedProcess && (this.socket !== null || this.child !== null); + if (shouldDisposeTransport) { + await this.disposeTransport('starting OpenClaw Gateway'); + } + + if (this.port === null) { + this.port = await findAvailablePort(); + } this.exitCode = null; - this.gatewayToken = randomUUID(); - - const child = await launchGatewayProcess({ + if (!reusingManagedProcess) { + this.gatewayToken = randomUUID(); + } + this.setGatewayState({ + state: wasAutoReconnectStart ? 'reconnecting' : 'starting', port: this.port, - token: this.gatewayToken, - openclawDir: runtimeStatus.runtimePaths.resolvedDir, - entryScript: runtimeStatus.runtimePaths.entryPath, + pid: this.child?.pid ?? this.gatewayStatusDetails.pid, + error: undefined, + connectedAt: undefined, + uptime: undefined, + reconnectAttempts: this.reconnectAttempts, + gatewayReady: false, }); - this.child = child; - this.bindProcessLifecycle(child); + warmupManagedPythonReadiness(); - await waitForGatewayReady({ + await runGatewayStartupSequence({ port: this.port, - getProcessExitCode: () => this.exitCode, - }); - - this.socket = await connectGatewaySocket({ - port: this.port, - token: this.gatewayToken, - deviceIdentity: this.deviceIdentity, - platform: process.platform, - onMessage: (message) => this.handleGatewayFrame(message), - onCloseAfterHandshake: (socket, code) => this.handleGatewaySocketClosed(socket, code), + shouldWaitForPortFree: process.platform === 'win32', + hasOwnedProcess: () => this.child?.pid != null && this.ownsProcess, + assertLifecycle: (phase) => { + this.lifecycleController.assert(startEpoch, phase); + }, + resetStartupStderrLines: () => { + this.recentStartupStderrLines = []; + }, + getStartupStderrLines: () => this.recentStartupStderrLines, + findExistingGateway: async (port) => { + return await findExistingGatewayProcess({ port, ownedPid: this.child?.pid ?? null }); + }, + connect: async (port, externalToken) => { + await this.connectToGateway(port, externalToken); + }, + onConnectedToExistingGateway: () => { + const isOwnProcess = this.child?.pid != null && this.ownsProcess; + if (!isOwnProcess) { + this.ownsProcess = false; + this.externalShutdownSupported = null; + this.setGatewayState({ pid: undefined }); + } + if (isOwnProcess) { + this.restartController.recordRestartCompleted(); + } + }, + waitForPortFree: async (port) => { + await waitForPortFree(port); + }, + startProcess: async () => { + await this.spawnGatewayProcess(); + }, + waitForReady: async (port) => { + await waitForGatewayReady({ + port, + getProcessExitCode: () => this.exitCode, + }); + }, + onConnectedToManagedGateway: () => { + logManager.info('OpenClaw Gateway startup sequence completed', { + port: this.port, + pid: this.child?.pid, + spawn: this.lastSpawnSummary, + }); + }, + runDoctorRepair: async () => await runOpenClawDoctorRepair(), + onDoctorRepairSuccess: () => { + this.lastError = undefined; + this.setGatewayState({ + state: 'starting', + port: this.port, + pid: this.child?.pid ?? this.gatewayStatusDetails.pid, + error: undefined, + reconnectAttempts: this.reconnectAttempts, + gatewayReady: false, + }); + }, + delay: async (ms) => { + await new Promise((resolve) => setTimeout(resolve, ms)); + }, }); this.lastError = undefined; - this.setStatus('connected'); + this.reconnectAttempts = 0; + this.setGatewayState({ + state: 'running', + port: this.port, + pid: this.child?.pid ?? this.gatewayStatusDetails.pid, + error: undefined, + reconnectAttempts: 0, + }); logManager.info('OpenClaw Gateway connected', { port: this.port, pid: this.child?.pid, + spawn: this.lastSpawnSummary, }); } catch (error) { + if (error instanceof LifecycleSupersededError) { + logManager.debug(error.message); + return; + } + this.lastError = toErrorMessage(error); await this.disposeTransport('failed OpenClaw Gateway start'); - this.setStatus('disconnected'); + this.setGatewayState({ + state: 'error', + port: this.port, + pid: undefined, + error: this.lastError, + connectedAt: undefined, + uptime: undefined, + reconnectAttempts: this.reconnectAttempts, + gatewayReady: undefined, + }); throw error; + } finally { + this.startLock = false; + this.restartController.flushDeferredRestart( + 'start:finally', + { + state: this.lifecycleState, + startLock: this.startLock, + shouldReconnect: this.shouldReconnect, + }, + () => { + void this.restart().catch((error) => { + logManager.warn('Deferred Gateway restart failed:', error); + }); + }, + ); } })(); @@ -668,9 +1544,41 @@ class GatewayManager { } this.stopPromise = (async () => { + this.lifecycleController.bump('stop'); + this.shouldReconnect = false; this.stopping = true; + this.clearAllTimers(); + + if (!this.ownsProcess && this.socket?.readyState === WebSocket.OPEN && this.externalShutdownSupported !== false) { + try { + await this.requestGatewayShutdown(); + this.externalShutdownSupported = true; + } catch (error) { + if (this.isUnsupportedShutdownError(error)) { + this.externalShutdownSupported = false; + logManager.info('External Gateway does not support "shutdown"; skipping shutdown RPC for future stops'); + } else { + logManager.warn('Failed to request shutdown for externally managed Gateway:', error); + } + } + } + await this.disposeTransport('stopping OpenClaw Gateway'); - this.setStatus('disconnected'); + this.restartController.resetDeferredRestart(); + this.isAutoReconnectStart = false; + this.reconnectAttempts = 0; + this.lastError = undefined; + this.diagnostics.consecutiveHeartbeatMisses = 0; + this.setGatewayState({ + state: 'stopped', + port: this.port, + pid: undefined, + error: undefined, + connectedAt: undefined, + uptime: undefined, + reconnectAttempts: 0, + gatewayReady: undefined, + }); this.stopping = false; })(); @@ -681,13 +1589,132 @@ class GatewayManager { } } - async restart(options?: RuntimeChangeBroadcast): Promise { - this.setStatus('reconnecting'); - await this.stop(); - await this.start(); + async forceTerminateOwnedProcessForQuit(): Promise { + if (!this.child || !this.ownsProcess) { + return false; + } - if (options) { - this.notifyRuntimeChanged(options); + if (this.socket) { + try { + this.socket.terminate(); + } catch { + // ignore quit-time socket termination errors + } + this.socket = null; + } + this.clearGatewayReadyFallback(); + this.connectionMonitor.clear(); + clearPendingGatewayRequests( + this.pendingRequests, + new Error('Gateway terminated during app quit'), + ); + + const child = this.child; + await this.terminateChild(child); + if (this.child === child) { + this.child = null; + } + this.ownsProcess = false; + this.setGatewayState({ + pid: undefined, + connectedAt: undefined, + uptime: undefined, + gatewayReady: undefined, + }); + return true; + } + + async restart(options?: RuntimeChangeBroadcast): Promise { + this.queueRuntimeChange(options); + + if (this.restartController.isRestartDeferred({ + state: this.lifecycleState, + startLock: this.startLock, + })) { + this.restartController.markDeferredRestart('restart', { + state: this.lifecycleState, + startLock: this.startLock, + }); + return; + } + + if (this.restartInFlight) { + await this.restartInFlight; + return; + } + + const decision = this.restartGovernor.decide(); + if (!decision.allow) { + const observability = this.restartGovernor.getObservability(); + logManager.warn( + `[gateway-restart-governor] restart suppressed reason=${decision.reason} retryAfterMs=${decision.retryAfterMs} suppressed=${observability.suppressed_total} executed=${observability.executed_total} circuitOpenUntil=${observability.circuit_open_until}`, + ); + const props = { + reason: decision.reason, + retry_after_ms: decision.retryAfterMs, + gateway_restart_suppressed_total: observability.suppressed_total, + gateway_restart_executed_total: observability.executed_total, + gateway_restart_circuit_open_until: observability.circuit_open_until, + }; + trackMetric('gateway.restart.suppressed', props); + captureTelemetryEvent('gateway_restart_suppressed', props); + return; + } + + const pidBefore = this.gatewayStatusDetails.pid ?? this.child?.pid; + this.restartInFlight = (async () => { + this.setGatewayState({ + state: 'reconnecting', + port: this.port, + pid: this.child?.pid ?? this.gatewayStatusDetails.pid, + error: this.lastError, + reconnectAttempts: this.reconnectAttempts, + gatewayReady: false, + }); + await this.stop(); + try { + await this.start(); + } catch (error) { + logManager.warn('Gateway restart: start() failed after stop(), enabling auto-reconnect recovery', error); + this.shouldReconnect = true; + this.scheduleReconnect(); + throw error; + } + })(); + + try { + await this.restartInFlight; + this.restartGovernor.recordExecuted(); + this.restartController.recordRestartCompleted(); + const observability = this.restartGovernor.getObservability(); + const props = { + gateway_restart_executed_total: observability.executed_total, + gateway_restart_suppressed_total: observability.suppressed_total, + gateway_restart_circuit_open_until: observability.circuit_open_until, + pid_before: pidBefore ?? 'n/a', + pid_after: this.gatewayStatusDetails.pid ?? this.child?.pid ?? 'n/a', + }; + trackMetric('gateway.restart.executed', props); + captureTelemetryEvent('gateway_restart_executed', props); + const runtimeChange = this.takeQueuedRuntimeChange(); + if (runtimeChange) { + this.notifyRuntimeChanged(runtimeChange); + } + } finally { + this.restartInFlight = null; + this.restartController.flushDeferredRestart( + 'restart:finally', + { + state: this.lifecycleState, + startLock: this.startLock, + shouldReconnect: this.shouldReconnect, + }, + () => { + void this.restart().catch((error) => { + logManager.warn('Deferred Gateway restart failed:', error); + }); + }, + ); } } @@ -698,15 +1725,33 @@ class GatewayManager { port: number | null; pid: number | null; lastError?: string; + lifecycleState: GatewayLifecycleState; + gatewayReady?: boolean; + connectedAt?: number; + uptime?: number; + reconnectAttempts?: number; + diagnostics: GatewayDiagnosticsSnapshot; runtime: ReturnType; } { + const runtimeStatus = this.stateController.getStatus(); + const connectedAt = runtimeStatus.connectedAt; + const uptime = runtimeStatus.state === 'running' && connectedAt + ? Date.now() - connectedAt + : undefined; + return { status: this.status, initialized: this.initialized, mode: this.mode, - port: this.port, - pid: this.child?.pid ?? null, + port: runtimeStatus.port ?? this.port, + pid: runtimeStatus.pid ?? this.child?.pid ?? null, lastError: this.lastError, + lifecycleState: runtimeStatus.state, + gatewayReady: runtimeStatus.gatewayReady, + connectedAt, + uptime, + reconnectAttempts: runtimeStatus.reconnectAttempts, + diagnostics: this.getDiagnostics(), runtime: this.processOwner.getStatus(), }; } @@ -719,6 +1764,12 @@ class GatewayManager { port: number | null; pid: number | null; lastError?: string; + lifecycleState: GatewayLifecycleState; + gatewayReady?: boolean; + connectedAt?: number; + uptime?: number; + reconnectAttempts?: number; + diagnostics: GatewayDiagnosticsSnapshot; runtime: ReturnType; }> { const status = this.getStatus(); @@ -857,14 +1908,7 @@ class GatewayManager { }; if (this.initialized && (this.status === 'connected' || this.status === 'reconnecting')) { - void this.restart(runtimeChange).catch((error) => { - const warning = `Gateway restart after provider reload failed: ${toErrorMessage(error)}`; - logManager.error(warning, error); - this.notifyRuntimeChanged({ - ...runtimeChange, - warnings: [...(runtimeChange.warnings ?? []), warning], - }); - }); + this.debouncedReload(undefined, runtimeChange); return; } diff --git a/electron/gateway/process-handle.ts b/electron/gateway/process-handle.ts new file mode 100644 index 0000000..46b1d28 --- /dev/null +++ b/electron/gateway/process-handle.ts @@ -0,0 +1,7 @@ +export interface GatewayProcessHandle { + pid: number | undefined; + stdout?: NodeJS.ReadableStream | null; + stderr?: NodeJS.ReadableStream | null; + once(event: string, listener: (...args: any[]) => void): this; + kill(signal?: NodeJS.Signals | number): boolean; +} diff --git a/electron/gateway/process-launcher.ts b/electron/gateway/process-launcher.ts index 9bf8edc..aea02ae 100644 --- a/electron/gateway/process-launcher.ts +++ b/electron/gateway/process-launcher.ts @@ -1,53 +1,196 @@ -import { utilityProcess } from 'electron'; +import { spawn } from 'node:child_process'; +import { existsSync, writeFileSync } from 'node:fs'; +import path from 'node:path'; +import { app, utilityProcess } from 'electron'; import logManager from '@electron/service/logger'; +import { appendNodeRequireToNodeOptions } from '@electron/utils/paths'; +import type { GatewayLaunchContext } from './config-sync'; +import { resolveGatewayLaunchStrategy, type GatewayLaunchStrategy } from './launch-strategy'; +import type { GatewayProcessHandle } from './process-handle'; -export interface OpenClawGatewayLaunchContext { - port: number; - token: string; - openclawDir: string; - entryScript: string; +const GATEWAY_PRELOAD_SOURCE = `'use strict'; +(function () { + if (process.platform !== 'win32') return; + try { + var cp = require('child_process'); + if (cp.__znAiGatewayPatched) return; + cp.__znAiGatewayPatched = true; + ['spawn', 'exec', 'execFile', 'fork', 'spawnSync', 'execSync', 'execFileSync'].forEach(function (method) { + var original = cp[method]; + if (typeof original !== 'function') return; + cp[method] = function () { + var args = Array.prototype.slice.call(arguments); + var optIdx = -1; + for (var i = 1; i < args.length; i++) { + var candidate = args[i]; + if (candidate && typeof candidate === 'object' && !Array.isArray(candidate)) { + optIdx = i; + break; + } + } + if (optIdx >= 0) { + args[optIdx].windowsHide = true; + } else { + var opts = { windowsHide: true }; + if (typeof args[args.length - 1] === 'function') { + args.splice(args.length - 1, 0, opts); + } else { + args.push(opts); + } + } + return original.apply(this, args); + }; + }); + } catch { + // best-effort + } +})(); +`; + +function ensureGatewayPreload(): string { + const destination = path.join(app.getPath('userData'), 'gateway-preload.cjs'); + try { + writeFileSync(destination, GATEWAY_PRELOAD_SOURCE, 'utf-8'); + } catch { + // best-effort + } + return destination; } -export async function launchGatewayProcess( - context: OpenClawGatewayLaunchContext, -): Promise { - const gatewayArgs = [ - 'gateway', - '--port', - String(context.port), - '--token', - context.token, - '--allow-unconfigured', - ]; +export interface LaunchGatewayProcessOptions { + port: number; + launchContext: GatewayLaunchContext; + sanitizeSpawnArgs?: (args: string[]) => string[]; + onStdoutLine?: (line: string) => void; + onStderrLine?: (line: string) => void; + onSpawn?: (pid: number | undefined) => void; + onExit?: (child: GatewayProcessHandle, code: number | null) => void; + onError?: (error: Error) => void; +} - const env: NodeJS.ProcessEnv = { - ...process.env, - OPENCLAW_GATEWAY_TOKEN: context.token, - OPENCLAW_SKIP_CHANNELS: '1', - OPENCLAW_NO_RESPAWN: '1', - }; +function emitProcessOutput( + child: GatewayProcessHandle, + stream: 'stdout' | 'stderr', + onLine: ((line: string) => void) | undefined, +): void { + const readable = child[stream]; + if (!readable) { + return; + } - logManager.info('Starting OpenClaw Gateway process', { - port: context.port, - entryScript: context.entryScript, - cwd: context.openclawDir, - args: gatewayArgs, + readable.on('data', (data) => { + const raw = data.toString(); + for (const line of raw.split(/\r?\n/)) { + const trimmed = line.trim(); + if (!trimmed) { + continue; + } + + if (onLine) { + onLine(trimmed); + continue; + } + + if (stream === 'stdout') { + logManager.debug(`[OpenClaw stdout] ${trimmed}`); + continue; + } + + logManager.warn(`[OpenClaw] ${trimmed}`); + } }); +} - return await new Promise((resolve, reject) => { - const child = utilityProcess.fork(context.entryScript, gatewayArgs, { - cwd: context.openclawDir, - stdio: 'pipe', - env, - serviceName: 'OpenClaw Gateway', - }); +function buildGatewayRuntimeEnv( + launchContext: GatewayLaunchContext, +): Record { + const runtimeEnv = { ...launchContext.forkEnv }; + if (!app.isPackaged) { + try { + const preloadPath = ensureGatewayPreload(); + if (existsSync(preloadPath)) { + runtimeEnv.NODE_OPTIONS = appendNodeRequireToNodeOptions( + runtimeEnv.NODE_OPTIONS, + preloadPath, + ); + } + } catch (error) { + logManager.warn('Failed to prepare Gateway preload', error); + } + } + + return runtimeEnv; +} + +function resolvePreferredLaunchStrategy( + launchContext: GatewayLaunchContext, +): GatewayLaunchStrategy { + return resolveGatewayLaunchStrategy({ + platform: process.platform, + mode: launchContext.mode, + forced: process.env.ZN_AI_GATEWAY_LAUNCH_STRATEGY, + }); +} + +function resolveNodeRuntimeHost(options: { + launchContext: GatewayLaunchContext; + runtimeEnv: Record; +}): { + command: string; + env: NodeJS.ProcessEnv; + hostKind: 'bundled-node' | 'electron-run-as-node'; +} { + const bundledNodePath = process.platform === 'win32' + ? path.join(options.launchContext.binDir, 'node.exe') + : ''; + + if (bundledNodePath && existsSync(bundledNodePath)) { + const { ELECTRON_RUN_AS_NODE: _electronRunAsNode, ...envWithoutElectronNode } = options.runtimeEnv; + return { + command: bundledNodePath, + env: envWithoutElectronNode as NodeJS.ProcessEnv, + hostKind: 'bundled-node', + }; + } + + return { + command: process.execPath, + env: { + ...options.runtimeEnv, + ELECTRON_RUN_AS_NODE: '1', + } as NodeJS.ProcessEnv, + hostKind: 'electron-run-as-node', + }; +} + +async function launchWithUtilityProcess(options: { + launchContext: GatewayLaunchContext; + runtimeEnv: Record; + lastSpawnSummary: string; + onStdoutLine?: (line: string) => void; + onStderrLine?: (line: string) => void; + onSpawn?: (pid: number | undefined) => void; + onExit?: (child: GatewayProcessHandle, code: number | null) => void; + onError?: (error: Error) => void; +}): Promise<{ child: GatewayProcessHandle; lastSpawnSummary: string }> { + return await new Promise<{ child: GatewayProcessHandle; lastSpawnSummary: string }>((resolve, reject) => { + const child = utilityProcess.fork( + options.launchContext.entryScript, + options.launchContext.gatewayArgs, + { + cwd: options.launchContext.openclawDir, + stdio: 'pipe', + env: options.runtimeEnv as NodeJS.ProcessEnv, + serviceName: 'OpenClaw Gateway', + }, + ) as GatewayProcessHandle; let settled = false; const resolveOnce = () => { if (settled) return; settled = true; - resolve(child); + resolve({ child, lastSpawnSummary: options.lastSpawnSummary }); }; const rejectOnce = (error: Error) => { @@ -57,29 +200,135 @@ export async function launchGatewayProcess( }; child.once('spawn', () => { - logManager.info('OpenClaw Gateway process spawned', { pid: child.pid }); + logManager.info('OpenClaw Gateway process spawned', { + pid: child.pid, + launchStrategy: 'utility-process', + }); + options.onSpawn?.(child.pid); resolveOnce(); }); child.once('error', (error) => { logManager.error('OpenClaw Gateway process spawn error:', error); + options.onError?.(error); rejectOnce(error); }); child.once('exit', (code) => { + options.onExit?.(child, code ?? null); if (!settled) { rejectOnce(new Error(`OpenClaw Gateway exited before spawn completed (code=${code ?? 'unknown'})`)); } }); - child.stderr?.on('data', (data) => { - const raw = data.toString(); - for (const line of raw.split(/\r?\n/)) { - const trimmed = line.trim(); - if (trimmed) { - logManager.warn(`[OpenClaw] ${trimmed}`); - } - } - }); + emitProcessOutput(child, 'stdout', options.onStdoutLine); + emitProcessOutput(child, 'stderr', options.onStderrLine); }); } + +async function launchWithNodeRuntime(options: { + launchContext: GatewayLaunchContext; + runtimeEnv: Record; + lastSpawnSummary: string; + onStdoutLine?: (line: string) => void; + onStderrLine?: (line: string) => void; + onSpawn?: (pid: number | undefined) => void; + onExit?: (child: GatewayProcessHandle, code: number | null) => void; + onError?: (error: Error) => void; +}): Promise<{ child: GatewayProcessHandle; lastSpawnSummary: string }> { + const runtimeHost = resolveNodeRuntimeHost({ + launchContext: options.launchContext, + runtimeEnv: options.runtimeEnv, + }); + + return await new Promise<{ child: GatewayProcessHandle; lastSpawnSummary: string }>((resolve, reject) => { + const child = spawn( + runtimeHost.command, + [options.launchContext.entryScript, ...options.launchContext.gatewayArgs], + { + cwd: options.launchContext.openclawDir, + env: runtimeHost.env, + stdio: 'pipe', + windowsHide: true, + }, + ) as GatewayProcessHandle; + + let settled = false; + + const resolveOnce = () => { + if (settled) return; + settled = true; + resolve({ child, lastSpawnSummary: options.lastSpawnSummary }); + }; + + const rejectOnce = (error: Error) => { + if (settled) return; + settled = true; + reject(error); + }; + + child.once('spawn', () => { + logManager.info('OpenClaw Gateway process spawned', { + pid: child.pid, + launchStrategy: 'node-runtime', + runtimeHost: runtimeHost.hostKind, + execPath: runtimeHost.command, + }); + options.onSpawn?.(child.pid); + resolveOnce(); + }); + + child.once('error', (error) => { + logManager.error('OpenClaw Gateway process spawn error:', error); + options.onError?.(error); + rejectOnce(error); + }); + + child.once('exit', (code) => { + options.onExit?.(child, code ?? null); + if (!settled) { + rejectOnce(new Error(`OpenClaw Gateway exited before spawn completed (code=${code ?? 'unknown'})`)); + } + }); + + emitProcessOutput(child, 'stdout', options.onStdoutLine); + emitProcessOutput(child, 'stderr', options.onStderrLine); + }); +} + +export async function launchGatewayProcess( + options: LaunchGatewayProcessOptions, +): Promise<{ child: GatewayProcessHandle; lastSpawnSummary: string }> { + const { launchContext } = options; + const sanitizedArgs = options.sanitizeSpawnArgs?.(launchContext.gatewayArgs) ?? launchContext.gatewayArgs; + const launchStrategy = resolvePreferredLaunchStrategy(launchContext); + const lastSpawnSummary = `mode=${launchContext.mode}, launcher="${launchStrategy}", entry="${launchContext.entryScript}", args="${sanitizedArgs.join(' ')}", cwd="${launchContext.openclawDir}"`; + const runtimeEnv = buildGatewayRuntimeEnv(launchContext); + + logManager.info('Starting OpenClaw Gateway process', { + port: options.port, + entryScript: launchContext.entryScript, + cwd: launchContext.openclawDir, + args: sanitizedArgs, + mode: launchContext.mode, + bundledBin: launchContext.binPathExists, + launchStrategy, + }); + + const launchOptions = { + launchContext, + runtimeEnv, + lastSpawnSummary, + onStdoutLine: options.onStdoutLine, + onStderrLine: options.onStderrLine, + onSpawn: options.onSpawn, + onExit: options.onExit, + onError: options.onError, + }; + + if (launchStrategy === 'node-runtime') { + return await launchWithNodeRuntime(launchOptions); + } + + return await launchWithUtilityProcess(launchOptions); +} diff --git a/electron/gateway/process-policy.ts b/electron/gateway/process-policy.ts new file mode 100644 index 0000000..1d01286 --- /dev/null +++ b/electron/gateway/process-policy.ts @@ -0,0 +1,107 @@ +export interface ReconnectConfig { + maxAttempts: number; + baseDelay: number; + maxDelay: number; +} + +export const DEFAULT_RECONNECT_CONFIG: ReconnectConfig = { + maxAttempts: 10, + baseDelay: 1000, + maxDelay: 30000, +}; + +export function nextLifecycleEpoch(currentEpoch: number): number { + return currentEpoch + 1; +} + +export function isLifecycleSuperseded(expectedEpoch: number, currentEpoch: number): boolean { + return expectedEpoch !== currentEpoch; +} + +export interface ReconnectAttemptContext { + scheduledEpoch: number; + currentEpoch: number; + shouldReconnect: boolean; +} + +export function getReconnectSkipReason(context: ReconnectAttemptContext): string | null { + if (!context.shouldReconnect) { + return 'auto-reconnect disabled'; + } + if (isLifecycleSuperseded(context.scheduledEpoch, context.currentEpoch)) { + return `stale reconnect callback (scheduledEpoch=${context.scheduledEpoch}, currentEpoch=${context.currentEpoch})`; + } + return null; +} + +export interface ReconnectScheduleContext { + shouldReconnect: boolean; + hasReconnectTimer: boolean; + reconnectAttempts: number; + maxAttempts: number; + baseDelay: number; + maxDelay: number; +} + +export type ReconnectScheduleDecision = + | { action: 'skip'; reason: string } + | { action: 'already-scheduled' } + | { action: 'fail'; attempts: number; maxAttempts: number } + | { action: 'schedule'; nextAttempt: number; maxAttempts: number; delay: number }; + +export function getReconnectScheduleDecision( + context: ReconnectScheduleContext, +): ReconnectScheduleDecision { + if (!context.shouldReconnect) { + return { action: 'skip', reason: 'auto-reconnect disabled' }; + } + + if (context.hasReconnectTimer) { + return { action: 'already-scheduled' }; + } + + if (context.reconnectAttempts >= context.maxAttempts) { + return { + action: 'fail', + attempts: context.reconnectAttempts, + maxAttempts: context.maxAttempts, + }; + } + + const delay = Math.min( + context.baseDelay * Math.pow(2, context.reconnectAttempts), + context.maxDelay, + ); + + return { + action: 'schedule', + nextAttempt: context.reconnectAttempts + 1, + maxAttempts: context.maxAttempts, + delay, + }; +} + +export type GatewayLifecycleState = 'stopped' | 'starting' | 'running' | 'error' | 'reconnecting'; + +export interface RestartDeferralContext { + state: GatewayLifecycleState; + startLock: boolean; +} + +export function shouldDeferRestart(context: RestartDeferralContext): boolean { + return context.startLock || context.state === 'starting' || context.state === 'reconnecting'; +} + +export interface DeferredRestartActionContext extends RestartDeferralContext { + hasPendingRestart: boolean; + shouldReconnect: boolean; +} + +export type DeferredRestartAction = 'none' | 'wait' | 'drop' | 'execute'; + +export function getDeferredRestartAction(context: DeferredRestartActionContext): DeferredRestartAction { + if (!context.hasPendingRestart) return 'none'; + if (shouldDeferRestart(context)) return 'wait'; + if (!context.shouldReconnect) return 'drop'; + return 'execute'; +} diff --git a/electron/gateway/protocol.ts b/electron/gateway/protocol.ts new file mode 100644 index 0000000..52117c9 --- /dev/null +++ b/electron/gateway/protocol.ts @@ -0,0 +1,136 @@ +import { randomUUID } from 'node:crypto'; + +export interface JsonRpcRequest { + jsonrpc: '2.0'; + id: string | number; + method: string; + params?: unknown; +} + +export interface JsonRpcResponse { + jsonrpc: '2.0'; + id: string | number; + result?: T; + error?: JsonRpcError; +} + +export interface JsonRpcError { + code: number; + message: string; + data?: unknown; +} + +export interface JsonRpcNotification { + jsonrpc: '2.0'; + method: string; + params?: unknown; +} + +export enum JsonRpcErrorCode { + PARSE_ERROR = -32700, + INVALID_REQUEST = -32600, + METHOD_NOT_FOUND = -32601, + INVALID_PARAMS = -32602, + INTERNAL_ERROR = -32603, + SERVER_ERROR = -32000, +} + +export enum GatewayErrorCode { + NOT_CONNECTED = -32001, + AUTH_REQUIRED = -32002, + PERMISSION_DENIED = -32003, + NOT_FOUND = -32004, + TIMEOUT = -32005, + RATE_LIMITED = -32006, +} + +export enum GatewayEventType { + STATUS_CHANGED = 'gateway.status_changed', + CHANNEL_STATUS_CHANGED = 'channel.status_changed', + MESSAGE_RECEIVED = 'chat.message_received', + MESSAGE_SENT = 'chat.message_sent', + TOOL_CALL_STARTED = 'tool.call_started', + TOOL_CALL_COMPLETED = 'tool.call_completed', + ERROR = 'error', +} + +export interface GatewayProtocolEvent { + type: GatewayEventType; + timestamp: string; + data: T; +} + +export function createRequest( + method: string, + params?: unknown, + id?: string | number, +): JsonRpcRequest { + return { + jsonrpc: '2.0', + id: id ?? randomUUID(), + method, + params, + }; +} + +export function createSuccessResponse( + id: string | number, + result: T, +): JsonRpcResponse { + return { + jsonrpc: '2.0', + id, + result, + }; +} + +export function createErrorResponse( + id: string | number, + code: number, + message: string, + data?: unknown, +): JsonRpcResponse { + return { + jsonrpc: '2.0', + id, + error: { + code, + message, + data, + }, + }; +} + +export function isRequest(message: unknown): message is JsonRpcRequest { + return ( + typeof message === 'object' && + message !== null && + 'jsonrpc' in message && + message.jsonrpc === '2.0' && + 'method' in message && + typeof message.method === 'string' && + 'id' in message + ); +} + +export function isResponse(message: unknown): message is JsonRpcResponse { + return ( + typeof message === 'object' && + message !== null && + 'jsonrpc' in message && + message.jsonrpc === '2.0' && + 'id' in message && + ('result' in message || 'error' in message) + ); +} + +export function isNotification(message: unknown): message is JsonRpcNotification { + return ( + typeof message === 'object' && + message !== null && + 'jsonrpc' in message && + message.jsonrpc === '2.0' && + 'method' in message && + !('id' in message) + ); +} diff --git a/electron/gateway/reload-policy.ts b/electron/gateway/reload-policy.ts new file mode 100644 index 0000000..0def301 --- /dev/null +++ b/electron/gateway/reload-policy.ts @@ -0,0 +1,62 @@ +export type GatewayReloadMode = 'hybrid' | 'reload' | 'restart' | 'off'; + +export type GatewayReloadPolicy = { + mode: GatewayReloadMode; + debounceMs: number; +}; + +export const DEFAULT_GATEWAY_RELOAD_POLICY: GatewayReloadPolicy = { + mode: 'hybrid', + debounceMs: 1200, +}; + +const MAX_DEBOUNCE_MS = 60_000; + +function normalizeMode(value: unknown): GatewayReloadMode { + if (value === 'off' || value === 'reload' || value === 'restart' || value === 'hybrid') { + return value; + } + return DEFAULT_GATEWAY_RELOAD_POLICY.mode; +} + +function normalizeDebounceMs(value: unknown): number { + if (typeof value !== 'number' || !Number.isFinite(value)) { + return DEFAULT_GATEWAY_RELOAD_POLICY.debounceMs; + } + const rounded = Math.round(value); + if (rounded < 0) return 0; + if (rounded > MAX_DEBOUNCE_MS) return MAX_DEBOUNCE_MS; + return rounded; +} + +export function parseGatewayReloadPolicy(config: unknown): GatewayReloadPolicy { + if (!config || typeof config !== 'object') { + return { ...DEFAULT_GATEWAY_RELOAD_POLICY }; + } + const root = config as Record; + const gateway = (root.gateway && typeof root.gateway === 'object' + ? root.gateway + : {}) as Record; + const reload = (gateway.reload && typeof gateway.reload === 'object' + ? gateway.reload + : {}) as Record; + + return { + mode: normalizeMode(reload.mode), + debounceMs: normalizeDebounceMs(reload.debounceMs), + }; +} + +export async function loadGatewayReloadPolicy(): Promise { + try { + const [{ readFile }, { homedir }, { join }] = await Promise.all([ + import('node:fs/promises'), + import('node:os'), + import('node:path'), + ]); + const raw = await readFile(join(homedir(), '.openclaw', 'openclaw.json'), 'utf-8'); + return parseGatewayReloadPolicy(JSON.parse(raw)); + } catch { + return { ...DEFAULT_GATEWAY_RELOAD_POLICY }; + } +} diff --git a/electron/gateway/restart-controller.ts b/electron/gateway/restart-controller.ts new file mode 100644 index 0000000..850df4c --- /dev/null +++ b/electron/gateway/restart-controller.ts @@ -0,0 +1,111 @@ +import logManager from '@electron/service/logger'; +import { + getDeferredRestartAction, + shouldDeferRestart, + type GatewayLifecycleState, +} from './process-policy'; + +type RestartDeferralState = { + state: GatewayLifecycleState; + startLock: boolean; +}; + +type DeferredRestartContext = RestartDeferralState & { + shouldReconnect: boolean; +}; + +export class GatewayRestartController { + private deferredRestartPending = false; + private deferredRestartRequestedAt = 0; + private lastRestartCompletedAt = 0; + private restartDebounceTimer: NodeJS.Timeout | null = null; + + isRestartDeferred(context: RestartDeferralState): boolean { + return shouldDeferRestart(context); + } + + markDeferredRestart(reason: string, context: RestartDeferralState): void { + if (!this.deferredRestartPending) { + logManager.info( + `Deferring Gateway restart (${reason}) until startup/reconnect settles (state=${context.state}, startLock=${context.startLock})`, + ); + } else { + logManager.debug( + `Gateway restart already deferred; keeping pending request (${reason}, state=${context.state}, startLock=${context.startLock})`, + ); + } + this.deferredRestartPending = true; + if (this.deferredRestartRequestedAt === 0) { + this.deferredRestartRequestedAt = Date.now(); + } + } + + recordRestartCompleted(): void { + this.lastRestartCompletedAt = Date.now(); + } + + flushDeferredRestart( + trigger: string, + context: DeferredRestartContext, + executeRestart: () => void, + ): void { + const action = getDeferredRestartAction({ + hasPendingRestart: this.deferredRestartPending, + state: context.state, + startLock: context.startLock, + shouldReconnect: context.shouldReconnect, + }); + + if (action === 'none') return; + if (action === 'wait') { + logManager.debug( + `Deferred Gateway restart still waiting (${trigger}, state=${context.state}, startLock=${context.startLock})`, + ); + return; + } + + const requestedAt = this.deferredRestartRequestedAt; + this.deferredRestartPending = false; + this.deferredRestartRequestedAt = 0; + + if (action === 'drop') { + logManager.info( + `Dropping deferred Gateway restart (${trigger}) because lifecycle already recovered (state=${context.state}, shouldReconnect=${context.shouldReconnect})`, + ); + return; + } + + if (requestedAt > 0 && this.lastRestartCompletedAt >= requestedAt) { + logManager.info( + `Dropping deferred Gateway restart (${trigger}): a restart already completed after the request (requested=${requestedAt}, completed=${this.lastRestartCompletedAt})`, + ); + return; + } + + logManager.info(`Executing deferred Gateway restart now (${trigger})`); + executeRestart(); + } + + debouncedRestart(delayMs: number, executeRestart: () => void): void { + if (this.restartDebounceTimer) { + clearTimeout(this.restartDebounceTimer); + } + logManager.debug(`Gateway restart debounced (will fire in ${delayMs}ms)`); + this.restartDebounceTimer = setTimeout(() => { + this.restartDebounceTimer = null; + executeRestart(); + }, delayMs); + } + + clearDebounceTimer(): void { + if (this.restartDebounceTimer) { + clearTimeout(this.restartDebounceTimer); + this.restartDebounceTimer = null; + } + } + + resetDeferredRestart(): void { + this.deferredRestartPending = false; + this.deferredRestartRequestedAt = 0; + } +} diff --git a/electron/gateway/restart-governor.ts b/electron/gateway/restart-governor.ts new file mode 100644 index 0000000..cc1d54b --- /dev/null +++ b/electron/gateway/restart-governor.ts @@ -0,0 +1,75 @@ +export type RestartDecision = + | { allow: true } + | { + allow: false; + reason: 'cooldown_active'; + retryAfterMs: number; + }; + +type RestartGovernorOptions = { + cooldownMs: number; +}; + +const DEFAULT_OPTIONS: RestartGovernorOptions = { + cooldownMs: 2500, +}; + +export class GatewayRestartGovernor { + private readonly options: RestartGovernorOptions; + private lastRestartAt = 0; + private suppressedTotal = 0; + private executedTotal = 0; + + constructor(options?: Partial) { + this.options = { ...DEFAULT_OPTIONS, ...options }; + } + + onRunning(_now = Date.now()): void { + // Kept for interface compatibility with ClawX lifecycle wiring. + } + + decide(now = Date.now()): RestartDecision { + if (this.lastRestartAt > 0) { + const sinceLast = now - this.lastRestartAt; + if (sinceLast < this.options.cooldownMs) { + this.suppressedTotal = this.safeIncrement(this.suppressedTotal); + return { + allow: false, + reason: 'cooldown_active', + retryAfterMs: this.options.cooldownMs - sinceLast, + }; + } + } + + return { allow: true }; + } + + recordExecuted(now = Date.now()): void { + this.executedTotal = this.safeIncrement(this.executedTotal); + this.lastRestartAt = now; + } + + getCounters(): { executedTotal: number; suppressedTotal: number } { + return { + executedTotal: this.executedTotal, + suppressedTotal: this.suppressedTotal, + }; + } + + getObservability(): { + suppressed_total: number; + executed_total: number; + circuit_open_until: number; + } { + return { + suppressed_total: this.suppressedTotal, + executed_total: this.executedTotal, + circuit_open_until: 0, + }; + } + + private safeIncrement(current: number): number { + if (current >= Number.MAX_SAFE_INTEGER) return 0; + return current + 1; + } +} diff --git a/electron/gateway/startup-orchestrator.ts b/electron/gateway/startup-orchestrator.ts new file mode 100644 index 0000000..5554f68 --- /dev/null +++ b/electron/gateway/startup-orchestrator.ts @@ -0,0 +1,110 @@ +import logManager from '@electron/service/logger'; +import { LifecycleSupersededError } from './lifecycle-controller'; +import { getGatewayStartupRecoveryAction } from './startup-recovery'; + +export interface ExistingGatewayInfo { + port: number; + externalToken?: string; +} + +type StartupHooks = { + port: number; + shouldWaitForPortFree: boolean; + maxStartAttempts?: number; + hasOwnedProcess?: () => boolean; + assertLifecycle?: (phase: string) => void; + resetStartupStderrLines: () => void; + getStartupStderrLines: () => string[]; + findExistingGateway: (port: number) => Promise; + connect: (port: number, externalToken?: string) => Promise; + onConnectedToExistingGateway: () => void; + waitForPortFree: (port: number) => Promise; + startProcess: () => Promise; + waitForReady: (port: number) => Promise; + onConnectedToManagedGateway: () => void; + runDoctorRepair?: () => Promise; + onDoctorRepairSuccess?: () => void; + delay: (ms: number) => Promise; +}; + +export async function runGatewayStartupSequence(hooks: StartupHooks): Promise { + let configRepairAttempted = false; + let startAttempts = 0; + const maxStartAttempts = hooks.maxStartAttempts ?? 3; + + while (true) { + startAttempts += 1; + hooks.assertLifecycle?.('start'); + hooks.resetStartupStderrLines(); + + try { + const existing = await hooks.findExistingGateway(hooks.port); + hooks.assertLifecycle?.('start/find-existing'); + if (existing) { + logManager.debug(`Found existing Gateway on port ${existing.port}`); + await hooks.connect(existing.port, existing.externalToken); + hooks.assertLifecycle?.('start/connect-existing'); + hooks.onConnectedToExistingGateway(); + return; + } + + if (hooks.hasOwnedProcess?.()) { + logManager.info('Owned Gateway process still alive; waiting for it to become ready'); + await hooks.waitForReady(hooks.port); + hooks.assertLifecycle?.('start/wait-ready-owned'); + await hooks.connect(hooks.port); + hooks.assertLifecycle?.('start/connect-owned'); + hooks.onConnectedToExistingGateway(); + return; + } + + if (hooks.shouldWaitForPortFree) { + await hooks.waitForPortFree(hooks.port); + hooks.assertLifecycle?.('start/wait-port'); + } + + await hooks.startProcess(); + hooks.assertLifecycle?.('start/start-process'); + await hooks.waitForReady(hooks.port); + hooks.assertLifecycle?.('start/wait-ready'); + await hooks.connect(hooks.port); + hooks.assertLifecycle?.('start/connect'); + hooks.onConnectedToManagedGateway(); + return; + } catch (error) { + if (error instanceof LifecycleSupersededError) { + throw error; + } + + const recoveryAction = getGatewayStartupRecoveryAction({ + startupError: error, + startupStderrLines: hooks.getStartupStderrLines(), + configRepairAttempted, + attempt: startAttempts, + maxAttempts: maxStartAttempts, + }); + + if (recoveryAction === 'repair' && hooks.runDoctorRepair) { + configRepairAttempted = true; + logManager.warn( + 'Detected invalid OpenClaw config during Gateway startup; running doctor repair before retry', + ); + const repaired = await hooks.runDoctorRepair(); + if (repaired) { + logManager.info('OpenClaw doctor repair completed; retrying Gateway startup'); + hooks.onDoctorRepairSuccess?.(); + continue; + } + logManager.error('OpenClaw doctor repair failed; not retrying Gateway startup'); + } + + if (recoveryAction === 'retry') { + logManager.warn(`Transient start error: ${String(error)}. Retrying... (${startAttempts}/${maxStartAttempts})`); + await hooks.delay(1000); + continue; + } + + throw error; + } + } +} diff --git a/electron/gateway/startup-recovery.ts b/electron/gateway/startup-recovery.ts new file mode 100644 index 0000000..5e596ff --- /dev/null +++ b/electron/gateway/startup-recovery.ts @@ -0,0 +1,91 @@ +const INVALID_CONFIG_PATTERNS: RegExp[] = [ + /\binvalid config\b/i, + /\bconfig invalid\b/i, + /\bunrecognized key\b/i, + /\brun:\s*openclaw doctor --fix\b/i, +]; + +const TRANSIENT_START_ERROR_PATTERNS: RegExp[] = [ + /WebSocket closed before handshake/i, + /ECONNREFUSED/i, + /Gateway exited before spawn completed/i, + /Gateway exited before becoming ready/i, + /Gateway failed to become ready on port/i, + /Timed out waiting for connect\.challenge/i, + /Connect handshake timeout/i, + /Port \d+ still occupied after \d+ms/i, +]; + +function normalizeLogLine(value: string): string { + return value.trim(); +} + +export function isInvalidConfigSignal(text: string): boolean { + const normalized = normalizeLogLine(text); + if (!normalized) { + return false; + } + + return INVALID_CONFIG_PATTERNS.some((pattern) => pattern.test(normalized)); +} + +export function hasInvalidConfigFailureSignal( + startupError: unknown, + startupStderrLines: string[], +): boolean { + for (const line of startupStderrLines) { + if (isInvalidConfigSignal(line)) { + return true; + } + } + + const errorText = startupError instanceof Error + ? `${startupError.name}: ${startupError.message}` + : String(startupError ?? ''); + + return isInvalidConfigSignal(errorText); +} + +export function shouldAttemptConfigAutoRepair( + startupError: unknown, + startupStderrLines: string[], + alreadyAttempted: boolean, +): boolean { + if (alreadyAttempted) { + return false; + } + + return hasInvalidConfigFailureSignal(startupError, startupStderrLines); +} + +export function isTransientGatewayStartError(error: unknown): boolean { + const errorText = error instanceof Error + ? `${error.name}: ${error.message}` + : String(error ?? ''); + + return TRANSIENT_START_ERROR_PATTERNS.some((pattern) => pattern.test(errorText)); +} + +export type GatewayStartupRecoveryAction = 'repair' | 'retry' | 'fail'; + +export function getGatewayStartupRecoveryAction(options: { + startupError: unknown; + startupStderrLines: string[]; + configRepairAttempted: boolean; + attempt: number; + maxAttempts: number; +}): GatewayStartupRecoveryAction { + if (shouldAttemptConfigAutoRepair( + options.startupError, + options.startupStderrLines, + options.configRepairAttempted, + )) { + return 'repair'; + } + + if (options.attempt < options.maxAttempts && isTransientGatewayStartError(options.startupError)) { + return 'retry'; + } + + return 'fail'; +} diff --git a/electron/gateway/startup-stderr.ts b/electron/gateway/startup-stderr.ts new file mode 100644 index 0000000..ba76ce2 --- /dev/null +++ b/electron/gateway/startup-stderr.ts @@ -0,0 +1,52 @@ +export type GatewayStderrClassification = { + level: 'drop' | 'debug' | 'warn'; + normalized: string; +}; + +const MAX_STDERR_LINES = 120; + +export function classifyGatewayStderrMessage(message: string): GatewayStderrClassification { + const normalized = message.trim(); + if (!normalized) { + return { level: 'drop', normalized }; + } + + if (normalized.includes('openclaw-control-ui') && normalized.includes('token_mismatch')) { + return { level: 'drop', normalized }; + } + if (normalized.includes('closed before connect') && normalized.includes('token mismatch')) { + return { level: 'drop', normalized }; + } + if (normalized.includes('[ws] closed before connect') && normalized.includes('code=1005')) { + return { level: 'debug', normalized }; + } + if (normalized.includes('ExperimentalWarning')) { + return { level: 'debug', normalized }; + } + if (normalized.includes('DeprecationWarning')) { + return { level: 'debug', normalized }; + } + if (normalized.includes('Debugger attached')) { + return { level: 'debug', normalized }; + } + if (normalized.includes('Config warnings:')) { + return { level: 'debug', normalized }; + } + if (normalized.includes('node: --require is not allowed in NODE_OPTIONS')) { + return { level: 'debug', normalized }; + } + + return { level: 'warn', normalized }; +} + +export function recordGatewayStartupStderrLine(lines: string[], line: string): void { + const normalized = line.trim(); + if (!normalized) { + return; + } + + lines.push(normalized); + if (lines.length > MAX_STDERR_LINES) { + lines.splice(0, lines.length - MAX_STDERR_LINES); + } +} diff --git a/electron/gateway/state.ts b/electron/gateway/state.ts new file mode 100644 index 0000000..cb4934d --- /dev/null +++ b/electron/gateway/state.ts @@ -0,0 +1,57 @@ +import logManager from '@electron/service/logger'; +import type { GatewayLifecycleState } from './process-policy'; + +export interface GatewayRuntimeStatus { + state: GatewayLifecycleState; + port: number | null; + pid?: number; + uptime?: number; + error?: string; + connectedAt?: number; + reconnectAttempts?: number; + gatewayReady?: boolean; +} + +type GatewayStateHooks = { + emitStatus: (status: GatewayRuntimeStatus) => void; + onTransition?: ( + previousState: GatewayRuntimeStatus['state'], + nextState: GatewayRuntimeStatus['state'], + ) => void; +}; + +export class GatewayStateController { + private status: GatewayRuntimeStatus = { state: 'stopped', port: null }; + + constructor(private readonly hooks: GatewayStateHooks) {} + + getStatus(): GatewayRuntimeStatus { + const snapshot = { ...this.status }; + if (snapshot.state === 'running' && snapshot.connectedAt) { + snapshot.uptime = Date.now() - snapshot.connectedAt; + } + return snapshot; + } + + isConnected(isSocketOpen: boolean): boolean { + return this.status.state === 'running' && isSocketOpen; + } + + setStatus(update: Partial): void { + const previousState = this.status.state; + this.status = { ...this.status, ...update }; + + if (this.status.state === 'running' && this.status.connectedAt) { + this.status.uptime = Date.now() - this.status.connectedAt; + } else if (this.status.state !== 'running') { + this.status.uptime = undefined; + } + + this.hooks.emitStatus(this.status); + + if (previousState !== this.status.state) { + logManager.debug(`Gateway state changed: ${previousState} -> ${this.status.state}`); + this.hooks.onTransition?.(previousState, this.status.state); + } + } +} diff --git a/electron/gateway/supervisor.ts b/electron/gateway/supervisor.ts new file mode 100644 index 0000000..d94914c --- /dev/null +++ b/electron/gateway/supervisor.ts @@ -0,0 +1,340 @@ +import { exec } from 'node:child_process'; +import { once } from 'node:events'; +import { existsSync } from 'node:fs'; +import { createServer } from 'node:net'; +import { join } from 'node:path'; +import { app, utilityProcess } from 'electron'; +import logManager from '@electron/service/logger'; +import { getOpenClawDir, getOpenClawEntryPath } from '@electron/utils/paths'; +import { prependPathEntry } from '@electron/utils/env-path'; +import { getUvMirrorEnv } from '@electron/utils/uv-env'; +import { isPythonReady, setupManagedPython } from '@electron/utils/uv-setup'; +import type { GatewayProcessHandle } from './process-handle'; +import { probeGatewayReady } from './ws-client'; + +export function warmupManagedPythonReadiness(): void { + void isPythonReady() + .then((pythonReady) => { + if (!pythonReady) { + logManager.info('Python environment missing or incomplete, attempting background repair...'); + void setupManagedPython().catch((error) => { + logManager.error('Background Python repair failed:', error); + }); + } + }) + .catch((error) => { + logManager.error('Failed to check Python environment:', error); + }); +} + +async function getListeningProcessIds(port: number): Promise { + const command = process.platform === 'win32' + ? `netstat -ano | findstr :${port}` + : `lsof -i :${port} -sTCP:LISTEN -t`; + + const stdout = await new Promise((resolve) => { + exec(command, { timeout: 5000, windowsHide: true }, (error, result) => { + if (error) { + resolve(''); + return; + } + resolve(result); + }); + }); + + if (!stdout.trim()) { + return []; + } + + if (process.platform === 'win32') { + const pids: string[] = []; + for (const line of stdout.trim().split(/\r?\n/)) { + const parts = line.trim().split(/\s+/); + if (parts.length >= 5 && parts[3] === 'LISTENING') { + pids.push(parts[4]); + } + } + return [...new Set(pids)]; + } + + return [...new Set(stdout.trim().split(/\r?\n/).map((value) => value.trim()).filter(Boolean))]; +} + +async function terminateOrphanedProcessIds(port: number, pids: string[]): Promise { + logManager.warn(`Found orphaned Gateway listener on port ${port}; terminating PIDs: ${pids.join(', ')}`); + + for (const pid of pids) { + try { + if (process.platform === 'win32') { + await new Promise((resolve) => { + exec( + `taskkill /F /PID ${pid} /T`, + { timeout: 5000, windowsHide: true }, + () => resolve(), + ); + }); + continue; + } + + process.kill(Number.parseInt(pid, 10), 'SIGTERM'); + } catch { + // ignore already-exited processes + } + } + + await new Promise((resolve) => setTimeout(resolve, process.platform === 'win32' ? 1500 : 1000)); +} + +export async function terminateOwnedGatewayProcess(child: GatewayProcessHandle): Promise { + const pid = child.pid; + + await new Promise((resolve) => { + let exited = false; + let timeout: NodeJS.Timeout | null = null; + + child.once('exit', () => { + exited = true; + if (timeout) { + clearTimeout(timeout); + } + resolve(); + }); + + logManager.info(`Sending kill to Gateway process (pid=${pid ?? 'unknown'})`); + + if (process.platform === 'win32' && pid) { + exec(`taskkill /F /PID ${pid} /T`, { timeout: 5000, windowsHide: true }, () => { + if (!exited) { + resolve(); + } + }); + return; + } + + try { + child.kill(); + } catch { + resolve(); + } + + timeout = setTimeout(() => { + if (!exited) { + logManager.warn(`Gateway did not exit in time, force-killing (pid=${pid ?? 'unknown'})`); + if (pid) { + if (process.platform === 'win32') { + exec(`taskkill /F /PID ${pid} /T`, { timeout: 5000, windowsHide: true }, () => { + resolve(); + }); + return; + } + + try { + process.kill(pid, 'SIGKILL'); + } catch { + // ignore + } + } + } + resolve(); + }, 5000); + }); +} + +export async function unloadLaunchctlGatewayService(): Promise { + if (process.platform !== 'darwin') { + return; + } + + try { + const uid = process.getuid?.(); + if (uid === undefined) { + return; + } + + const launchdLabel = 'ai.openclaw.gateway'; + const serviceTarget = `gui/${uid}/${launchdLabel}`; + + const loaded = await new Promise((resolve) => { + exec(`launchctl print ${serviceTarget}`, { timeout: 5000 }, (error) => { + resolve(!error); + }); + }); + + if (!loaded) { + return; + } + + logManager.info(`Unloading launchctl service ${serviceTarget} to prevent auto-respawn`); + await new Promise((resolve) => { + exec(`launchctl bootout ${serviceTarget}`, { timeout: 10000 }, (error) => { + if (error) { + logManager.warn(`Failed to bootout launchctl service: ${error.message}`); + } else { + logManager.info('Successfully unloaded launchctl gateway service'); + } + resolve(); + }); + }); + } catch (error) { + logManager.warn('Error while unloading launchctl gateway service:', error); + } +} + +async function canListenOnPort(port: number): Promise { + const server = createServer(); + try { + server.listen(port, '127.0.0.1'); + await once(server, 'listening'); + return true; + } catch { + return false; + } finally { + try { + server.close(); + } catch { + // ignore + } + } +} + +export async function waitForPortFree( + port: number, + timeoutMs = 30_000, + intervalMs = 500, +): Promise { + const startAt = Date.now(); + let logged = false; + + while (Date.now() - startAt < timeoutMs) { + if (await canListenOnPort(port)) { + const elapsed = Date.now() - startAt; + if (elapsed > intervalMs) { + logManager.info(`Port ${port} became available after ${elapsed}ms`); + } + return; + } + + if (!logged) { + logManager.info(`Waiting for port ${port} to become available (Windows TCP TIME_WAIT)...`); + logged = true; + } + await new Promise((resolve) => setTimeout(resolve, intervalMs)); + } + + logManager.error(`Port ${port} still occupied after ${timeoutMs}ms; aborting startup to avoid port conflict`); + throw new Error(`Port ${port} still occupied after ${timeoutMs}ms`); +} + +export async function findExistingGatewayProcess(options: { + port: number; + ownedPid?: number | null; +}): Promise<{ port: number; externalToken?: string } | null> { + try { + const pids = await getListeningProcessIds(options.port); + if (pids.length > 0 && (!options.ownedPid || !pids.includes(String(options.ownedPid)))) { + await terminateOrphanedProcessIds(options.port, pids); + if (process.platform === 'win32') { + await waitForPortFree(options.port, 10_000); + } + return null; + } + + const ready = await probeGatewayReady(options.port, 5_000); + return ready ? { port: options.port } : null; + } catch { + return null; + } +} + +export async function runOpenClawDoctorRepair(): Promise { + const openclawDir = getOpenClawDir(); + const entryScript = getOpenClawEntryPath(); + if (!existsSync(entryScript)) { + logManager.error(`Cannot run OpenClaw doctor repair: entry script not found at ${entryScript}`); + return false; + } + + const platform = process.platform; + const arch = process.arch; + const target = `${platform}-${arch}`; + const binPath = app.isPackaged + ? join(process.resourcesPath, 'bin') + : join(process.cwd(), 'resources', 'bin', target); + const binPathExists = existsSync(binPath); + const baseProcessEnv = process.env as Record; + const baseEnvPatched = binPathExists + ? prependPathEntry(baseProcessEnv, binPath).env + : baseProcessEnv; + const uvEnv = await getUvMirrorEnv(); + const doctorArgs = ['doctor', '--fix', '--yes', '--non-interactive']; + + logManager.info( + `Running OpenClaw doctor repair (entry="${entryScript}", args="${doctorArgs.join(' ')}", cwd="${openclawDir}", bundledBin=${binPathExists ? 'yes' : 'no'})`, + ); + + return await new Promise((resolve) => { + const forkEnv: Record = { + ...baseEnvPatched, + ...uvEnv, + OPENCLAW_NO_RESPAWN: '1', + }; + + const child = utilityProcess.fork(entryScript, doctorArgs, { + cwd: openclawDir, + stdio: 'pipe', + env: forkEnv as NodeJS.ProcessEnv, + }); + + let settled = false; + const finish = (ok: boolean) => { + if (settled) return; + settled = true; + resolve(ok); + }; + + const timeout = setTimeout(() => { + logManager.error('OpenClaw doctor repair timed out after 120000ms'); + try { + child.kill(); + } catch { + // ignore + } + finish(false); + }, 120000); + + child.on('error', (error) => { + clearTimeout(timeout); + logManager.error('Failed to spawn OpenClaw doctor repair process:', error); + finish(false); + }); + + child.stdout?.on('data', (data) => { + const raw = data.toString(); + for (const line of raw.split(/\r?\n/)) { + const normalized = line.trim(); + if (!normalized) continue; + logManager.debug(`[Gateway doctor stdout] ${normalized}`); + } + }); + + child.stderr?.on('data', (data) => { + const raw = data.toString(); + for (const line of raw.split(/\r?\n/)) { + const normalized = line.trim(); + if (!normalized) continue; + logManager.warn(`[Gateway doctor stderr] ${normalized}`); + } + }); + + child.on('exit', (code) => { + clearTimeout(timeout); + if (code === 0) { + logManager.info('OpenClaw doctor repair completed successfully'); + finish(true); + return; + } + logManager.warn(`OpenClaw doctor repair exited (code=${code})`); + finish(false); + }); + }); +} diff --git a/electron/gateway/ws-client.ts b/electron/gateway/ws-client.ts index d31d3de..ba4feaf 100644 --- a/electron/gateway/ws-client.ts +++ b/electron/gateway/ws-client.ts @@ -1,3 +1,4 @@ +import WebSocket from 'ws'; import type { DeviceIdentity } from '@electron/utils/device-identity'; import { buildDeviceAuthPayload, @@ -19,32 +20,8 @@ type GatewayProtocolFrame = } | null; -function isBlobLike(value: unknown): value is Blob { - return typeof Blob !== 'undefined' && value instanceof Blob; -} - -async function dataToString(data: unknown): Promise { - if (typeof data === 'string') { - return data; - } - - if (data instanceof ArrayBuffer) { - return Buffer.from(data).toString('utf-8'); - } - - if (ArrayBuffer.isView(data)) { - return Buffer.from(data.buffer, data.byteOffset, data.byteLength).toString('utf-8'); - } - - if (isBlobLike(data)) { - return await data.text(); - } - - return String(data ?? ''); -} - -async function parseGatewayFrame(data: unknown): Promise { - const text = await dataToString(data); +function parseGatewayFrame(data: WebSocket.RawData): GatewayProtocolFrame { + const text = data.toString(); if (!text) { return null; } @@ -131,7 +108,7 @@ export async function probeGatewayReady( settled = true; clearTimeout(timeout); try { - ws.close(); + ws.terminate(); } catch { // ignore probe close errors } @@ -142,24 +119,22 @@ export async function probeGatewayReady( resolveOnce(false); }, timeoutMs); - ws.addEventListener('message', (event) => { - void (async () => { - try { - const message = await parseGatewayFrame(event.data); - if (message?.type === 'event' && message.event === 'connect.challenge') { - resolveOnce(true); - } - } catch { - // ignore malformed probe payloads + ws.on('message', (data) => { + try { + const message = parseGatewayFrame(data); + if (message?.type === 'event' && message.event === 'connect.challenge') { + resolveOnce(true); } - })(); + } catch { + // ignore malformed probe payloads + } }); - ws.addEventListener('error', () => { + ws.on('error', () => { resolveOnce(false); }); - ws.addEventListener('close', () => { + ws.on('close', () => { resolveOnce(false); }); }); @@ -168,19 +143,22 @@ export async function probeGatewayReady( export async function waitForGatewayReady(options: { port: number; getProcessExitCode: () => number | null; - retries?: number; + timeoutMs?: number; intervalMs?: number; + probeTimeoutMs?: number; }): Promise { - const retries = options.retries ?? 300; + const timeoutMs = options.timeoutMs ?? (process.platform === 'win32' ? 180_000 : 90_000); const intervalMs = options.intervalMs ?? 200; + const probeTimeoutMs = options.probeTimeoutMs ?? 1500; + const startedAt = Date.now(); - for (let i = 0; i < retries; i += 1) { + while (Date.now() - startedAt < timeoutMs) { const exitCode = options.getProcessExitCode(); if (exitCode !== null) { throw new Error(`OpenClaw Gateway exited before becoming ready (code=${exitCode})`); } - const ready = await probeGatewayReady(options.port, 1500); + const ready = await probeGatewayReady(options.port, probeTimeoutMs); if (ready) { return; } @@ -188,7 +166,7 @@ export async function waitForGatewayReady(options: { await new Promise((resolve) => setTimeout(resolve, intervalMs)); } - throw new Error(`OpenClaw Gateway failed to become ready on port ${options.port}`); + throw new Error(`OpenClaw Gateway failed to become ready on port ${options.port} within ${timeoutMs}ms`); } export async function connectGatewaySocket(options: { @@ -234,112 +212,105 @@ export async function connectGatewaySocket(options: { if (settled) return; settled = true; cleanup(); + try { + ws.terminate(); + } catch { + // ignore terminate errors + } reject(error instanceof Error ? error : new Error(String(error))); }; challengeTimer = setTimeout(() => { - try { - ws.close(); - } catch { - // ignore close error - } rejectOnce(new Error('Timed out waiting for connect.challenge from OpenClaw Gateway')); }, challengeTimeoutMs); - ws.addEventListener('message', (event) => { - void (async () => { - try { - const message = await parseGatewayFrame(event.data); - if (!message) { - return; - } - - if (!handshakeComplete && message.type === 'event' && message.event === 'connect.challenge') { - if (challengeTimer) { - clearTimeout(challengeTimer); - challengeTimer = null; - } - - const nonce = ( - typeof message.payload === 'object' && - message.payload !== null && - 'nonce' in message.payload && - typeof (message.payload as { nonce?: unknown }).nonce === 'string' - ) - ? (message.payload as { nonce: string }).nonce - : ''; - - if (!nonce) { - rejectOnce(new Error('OpenClaw Gateway connect.challenge missing nonce')); - return; - } - - const payload = buildGatewayConnectFrame({ - challengeNonce: nonce, - token: options.token, - deviceIdentity: options.deviceIdentity, - platform: options.platform, - }); - connectId = payload.connectId; - ws.send(JSON.stringify(payload.frame)); - - handshakeTimer = setTimeout(() => { - try { - ws.close(); - } catch { - // ignore close error - } - rejectOnce(new Error('Timed out waiting for OpenClaw Gateway connect response')); - }, connectTimeoutMs); - return; - } - - if (!handshakeComplete && message.type === 'res' && message.id === connectId) { - if (message.ok === false) { - const errorMessage = - typeof message.error === 'string' - ? message.error - : ( - typeof message.error === 'object' && - message.error !== null && - 'message' in message.error && - typeof (message.error as { message?: unknown }).message === 'string' - ) - ? (message.error as { message: string }).message - : 'OpenClaw Gateway connect handshake failed'; - rejectOnce(new Error(errorMessage)); - return; - } - - handshakeComplete = true; - resolveOnce(); - return; - } - - if (handshakeComplete) { - options.onMessage(message); - } - } catch (error) { - if (!handshakeComplete) { - rejectOnce(error); - } + ws.on('message', (data) => { + try { + const message = parseGatewayFrame(data); + if (!message) { + return; } - })(); + + if (!handshakeComplete && message.type === 'event' && message.event === 'connect.challenge') { + if (challengeTimer) { + clearTimeout(challengeTimer); + challengeTimer = null; + } + + const nonce = ( + typeof message.payload === 'object' && + message.payload !== null && + 'nonce' in message.payload && + typeof (message.payload as { nonce?: unknown }).nonce === 'string' + ) + ? (message.payload as { nonce: string }).nonce + : ''; + + if (!nonce) { + rejectOnce(new Error('OpenClaw Gateway connect.challenge missing nonce')); + return; + } + + const payload = buildGatewayConnectFrame({ + challengeNonce: nonce, + token: options.token, + deviceIdentity: options.deviceIdentity, + platform: options.platform, + }); + connectId = payload.connectId; + ws.send(JSON.stringify(payload.frame)); + + handshakeTimer = setTimeout(() => { + rejectOnce(new Error('Timed out waiting for OpenClaw Gateway connect response')); + }, connectTimeoutMs); + return; + } + + if (!handshakeComplete && message.type === 'res' && message.id === connectId) { + if (message.ok === false) { + const errorMessage = + typeof message.error === 'string' + ? message.error + : ( + typeof message.error === 'object' && + message.error !== null && + 'message' in message.error && + typeof (message.error as { message?: unknown }).message === 'string' + ) + ? (message.error as { message: string }).message + : 'OpenClaw Gateway connect handshake failed'; + rejectOnce(new Error(errorMessage)); + return; + } + + handshakeComplete = true; + resolveOnce(); + return; + } + + if (handshakeComplete) { + options.onMessage(message); + } + } catch (error) { + if (!handshakeComplete) { + rejectOnce(error); + } + } }); - ws.addEventListener('close', (event) => { + ws.on('close', (code) => { if (!handshakeComplete) { - rejectOnce(new Error(`OpenClaw Gateway socket closed before handshake (code=${event.code})`)); + rejectOnce(new Error(`OpenClaw Gateway socket closed before handshake (code=${code})`)); return; } cleanup(); - options.onCloseAfterHandshake(ws, event.code); + options.onCloseAfterHandshake(ws, code); }); - ws.addEventListener('error', () => { + ws.on('error', (error) => { if (!handshakeComplete) { - rejectOnce(new Error('OpenClaw Gateway socket connection failed')); + rejectOnce(error); } }); }); diff --git a/electron/main.ts b/electron/main.ts index 6c1bafc..f2aa6b9 100644 --- a/electron/main.ts +++ b/electron/main.ts @@ -1,3 +1,4 @@ +import type { Server } from 'node:http'; import { app, BrowserWindow, ipcMain } from 'electron' import { CONFIG_KEYS, IPC_EVENTS } from '@runtime/lib/constants' import { setupMainWindow } from './wins'; @@ -12,10 +13,13 @@ import { appUpdater } from '@electron/service/updater'; import axios from 'axios'; import { onProviderChange } from '@electron/service/provider-api-service'; import { gatewayManager } from '@electron/gateway/manager'; -import { dispatchLocalHostApi } from '@electron/api/router'; +import { createHostApiContext, dispatchLocalHostApi } from '@electron/api/router'; +import { hostEventBus } from '@electron/api/event-bus'; +import { getHostApiBase, getHostApiToken, startHostApiServer } from '@electron/api/server'; import { syncProviderRuntimeSnapshot } from '@electron/service/provider-runtime-sync'; import { applyLaunchAtStartupSetting, syncLaunchAtStartupSettingFromConfig } from '@electron/service/launch-at-startup'; import { ensureBuiltinSkillsInstalled, ensurePreinstalledSkillsInstalled } from '@electron/utils/skill-config'; +import { initTelemetry, shutdownTelemetry } from '@electron/utils/telemetry'; import { syncGatewayConfigBeforeLaunch } from '@electron/gateway/config-sync'; // 初始化 updater,确保在 app ready 之前或者之中注册好 IPC @@ -26,6 +30,19 @@ appUpdater.init(); const HOST_API_BASE_URL = process.env['ZN_AI_HOST_API_BASE_URL'] || process.env['VITE_SERVICE_URL'] || 'http://8.138.234.141/ingress'; +const GATEWAY_QUIT_TIMEOUT_MS = 5_000; + +let gatewayEventBridgeBound = false; +let gatewayQuitCleanupInProgress = false; +let gatewayQuitCleanupCompleted = false; +let hostApiServer: Server | null = null; + +type HostApiProxyRequest = { + path: string; + method?: string; + headers?: Record; + body?: unknown; +}; function refreshProviderRuntime(): { warnings: string[] } { try { @@ -54,6 +71,7 @@ async function requestUpstreamHostApi(path: string, method: string, headers: Rec return { success: true, ok: true, + status: response.status, json: response.data, data: response.data, }; @@ -76,20 +94,187 @@ async function requestUpstreamHostApi(path: string, method: string, headers: Rec } } -ipcMain.handle(IPC_EVENTS.HOST_API_FETCH, async (_event, { path, method, headers, body }) => { - const normalizedMethod = method || 'GET'; +async function closeHostApiServer(): Promise { + if (!hostApiServer) { + return; + } + + const server = hostApiServer; + hostApiServer = null; + + await new Promise((resolve) => { + server.close(() => resolve()); + }); +} + +function normalizeProxyBody(body: unknown): string | undefined { + if (body == null) { + return undefined; + } + + if (typeof body === 'string') { + return body; + } + + return JSON.stringify(body); +} + +async function proxyHostApiRequest(request: HostApiProxyRequest) { + const path = typeof request.path === 'string' ? request.path : ''; + if (!path || !path.startsWith('/')) { + return { + success: false, + ok: false, + status: 400, + error: `Invalid host API path: ${String(request.path)}`, + }; + } + + const hostApiToken = getHostApiToken(); + if (!hostApiServer || !hostApiToken) { + const localResult = await dispatchLocalHostApi(request); + if (localResult) { + return localResult; + } + + return requestUpstreamHostApi( + path, + request.method || 'GET', + request.headers, + request.body, + ); + } + + const method = (request.method || 'GET').toUpperCase(); + const headers: Record = { + ...(request.headers || {}), + 'X-Host-Api-Token': hostApiToken, + }; + const body = normalizeProxyBody(request.body); + + if (body !== undefined && !headers['Content-Type'] && !headers['content-type']) { + headers['Content-Type'] = 'application/json'; + } + + try { + const response = await fetch(`${getHostApiBase()}${path}`, { + method, + headers, + body, + }); + const contentType = response.headers.get('content-type') || ''; + + if (contentType.includes('application/json')) { + return await response.json(); + } + + const text = await response.text(); + return { + success: response.ok, + ok: response.ok, + status: response.status, + text, + ...(response.ok ? {} : { error: text || response.statusText }), + }; + } catch (error) { + return { + success: false, + ok: false, + error: error instanceof Error ? error.message : String(error), + }; + } +} + +function emitGatewayRendererEvent(channel: string, payload: unknown): void { + BrowserWindow.getAllWindows().forEach((window) => { + if (!window.isDestroyed()) { + window.webContents.send(channel, payload); + } + }); +} + +function bindGatewayLifecycleEvents(): void { + if (gatewayEventBridgeBound) { + return; + } + + gatewayEventBridgeBound = true; + + gatewayManager.on('status', (status) => { + hostEventBus.emit('gateway:status', status); + emitGatewayRendererEvent('gateway:status-changed', status); + }); + + gatewayManager.on('message', (message) => { + hostEventBus.emit('gateway:message', message); + emitGatewayRendererEvent('gateway:message', message); + }); + + gatewayManager.on('notification', (notification) => { + hostEventBus.emit('gateway:notification', notification); + emitGatewayRendererEvent('gateway:notification', notification); + }); + + gatewayManager.on('channel:status', (data) => { + hostEventBus.emit('gateway:channel-status', data); + emitGatewayRendererEvent('gateway:channel-status', data); + }); + + gatewayManager.on('chat:message', (data) => { + hostEventBus.emit('gateway:chat-message', data); + emitGatewayRendererEvent('gateway:chat-message', data); + }); + + gatewayManager.on('exit', (code) => { + hostEventBus.emit('gateway:exit', { code }); + emitGatewayRendererEvent('gateway:exit', code); + }); + + gatewayManager.on('error', (error) => { + hostEventBus.emit('gateway:error', { message: error.message }); + emitGatewayRendererEvent('gateway:error', error.message); + }); +} + +function requestQuitOnSignal(signal: NodeJS.Signals): void { + log.info(`Received ${signal}; requesting app quit`); + app.quit(); +} + +function emergencyGatewayCleanup(reason: string, error: unknown): void { + log.error(`${reason}:`, error); + hostEventBus.closeAll(); + void closeHostApiServer().catch(() => { + // ignore host API server close failures during emergency cleanup + }); + try { + void gatewayManager.stop().catch(() => { + // ignore stop failures during emergency cleanup + }); + } catch { + // ignore stop invocation failures if state is corrupted + } + + setTimeout(() => { + void shutdownTelemetry().catch(() => { + // ignore telemetry flush failures during crash shutdown + }); + void gatewayManager.forceTerminateOwnedProcessForQuit().catch(() => { + // ignore forced termination failures during crash shutdown + }).finally(() => { + process.exit(1); + }); + }, 3_000).unref(); +} + +ipcMain.handle(IPC_EVENTS.HOST_API_TOKEN, async () => getHostApiToken()); + +ipcMain.handle(IPC_EVENTS.HOST_API_FETCH, async (_event, request: HostApiProxyRequest) => { + return proxyHostApiRequest({ ...request, method: request.method || 'GET' }); // 1. 优先本地处理 Host API 路由(逐步对齐 ClawX) - const localResult = await dispatchLocalHostApi({ - path, - method: normalizedMethod, - headers, - body, - }); - if (localResult) return localResult; // 2. 其余接口代理到远端后端 - return await requestUpstreamHostApi(path, normalizedMethod, headers, body); }); // Gateway RPC IPC handler @@ -113,10 +298,97 @@ if (started) { // logManager.error('unhandledRejection', reason, promise); // }); +process.once('SIGINT', () => requestQuitOnSignal('SIGINT')); +process.once('SIGTERM', () => requestQuitOnSignal('SIGTERM')); +process.on('uncaughtException', (error) => { + emergencyGatewayCleanup('Uncaught exception in main process', error); +}); +process.on('unhandledRejection', (reason) => { + emergencyGatewayCleanup('Unhandled promise rejection in main process', reason); +}); + +app.on('before-quit', (event) => { + if (gatewayQuitCleanupCompleted) { + return; + } + + event.preventDefault(); + + if (gatewayQuitCleanupInProgress) { + return; + } + + gatewayQuitCleanupInProgress = true; + hostEventBus.closeAll(); + const closeServerPromise = closeHostApiServer().catch((error) => { + log.warn('Host API server close failed during quit:', error); + }); + + const stopPromise = Promise.all([ + closeServerPromise, + gatewayManager.stop(), + ]).catch((error) => { + log.warn('gatewayManager.stop() error during quit:', error); + }); + const timeoutPromise = new Promise<'timeout'>((resolve) => { + setTimeout(() => resolve('timeout'), GATEWAY_QUIT_TIMEOUT_MS); + }); + + void Promise.race([ + stopPromise.then(() => 'stopped' as const), + timeoutPromise, + ]).then(async (result) => { + if (result === 'timeout') { + log.warn('Gateway shutdown timed out during app quit; proceeding with forced quit'); + try { + const terminated = await gatewayManager.forceTerminateOwnedProcessForQuit(); + if (terminated) { + log.warn('Forced gateway process termination completed after quit timeout'); + } + } catch (error) { + log.warn('Forced gateway termination failed after quit timeout:', error); + } + } + + try { + await shutdownTelemetry(); + } catch (error) { + log.warn('Telemetry shutdown failed during app quit:', error); + } + + gatewayQuitCleanupCompleted = true; + app.quit(); + }).catch((error) => { + gatewayQuitCleanupInProgress = false; + log.warn('Gateway quit cleanup failed:', error); + gatewayQuitCleanupCompleted = true; + app.quit(); + }); +}); + app.whenReady().then(async () => { await configManager.init(); await syncLaunchAtStartupSettingFromConfig(); await themeManager.init(); + await initTelemetry(); + bindGatewayLifecycleEvents(); + + try { + hostApiServer = startHostApiServer({ + ctx: createHostApiContext(), + dispatchRequest: dispatchLocalHostApi, + fallbackRequest: async (request) => { + return requestUpstreamHostApi( + request.path, + request.method || 'GET', + request.headers, + request.body, + ); + }, + }); + } catch (error) { + log.error('Failed to start Host API server:', error); + } let launchAtStartup = Boolean(configManager.get(CONFIG_KEYS.LAUNCH_AT_STARTUP)); const stopLaunchAtStartupSync = configManager.onConfigChange((config) => { @@ -131,6 +403,9 @@ app.whenReady().then(async () => { app.once('will-quit', () => { stopLaunchAtStartupSync(); + void closeHostApiServer().catch(() => { + // ignore host API server close failures during final teardown + }); }); void ensureBuiltinSkillsInstalled().catch((error) => { diff --git a/electron/utils/env-path.ts b/electron/utils/env-path.ts new file mode 100644 index 0000000..ee0769f --- /dev/null +++ b/electron/utils/env-path.ts @@ -0,0 +1,56 @@ +type EnvMap = Record; + +function isPathKey(key: string): boolean { + return key.toLowerCase() === 'path'; +} + +function preferredPathKey(): string { + return process.platform === 'win32' ? 'Path' : 'PATH'; +} + +function pathDelimiter(): string { + return process.platform === 'win32' ? ';' : ':'; +} + +export function getPathEnvKey(env: EnvMap): string { + const keys = Object.keys(env).filter(isPathKey); + if (keys.length === 0) return preferredPathKey(); + + if (process.platform === 'win32') { + if (keys.includes('Path')) return 'Path'; + if (keys.includes('PATH')) return 'PATH'; + return keys[0]; + } + + if (keys.includes('PATH')) return 'PATH'; + return keys[0]; +} + +export function getPathEnvValue(env: EnvMap): string { + const key = getPathEnvKey(env); + return env[key] ?? ''; +} + +export function setPathEnvValue(env: EnvMap, nextPath: string): EnvMap { + const nextEnv: EnvMap = { ...env }; + for (const key of Object.keys(nextEnv)) { + if (isPathKey(key)) { + delete nextEnv[key]; + } + } + + nextEnv[getPathEnvKey(env)] = nextPath; + return nextEnv; +} + +export function prependPathEntry( + env: EnvMap, + entry: string, +): { env: EnvMap; path: string } { + const current = getPathEnvValue(env); + const nextPath = current ? `${entry}${pathDelimiter()}${current}` : entry; + return { + env: setPathEnvValue(env, nextPath), + path: nextPath, + }; +} diff --git a/electron/utils/paths.ts b/electron/utils/paths.ts index 1797882..a7f8f7a 100644 --- a/electron/utils/paths.ts +++ b/electron/utils/paths.ts @@ -72,6 +72,22 @@ export function getOpenClawBuildDir(): string { return join(app.getAppPath(), 'build', OPENCLAW_PACKAGE_DIR_NAME); } +export function normalizeNodeRequirePathForNodeOptions(modulePath: string): string { + if (process.platform !== 'win32') { + return modulePath; + } + + return modulePath.replace(/\\/g, '/'); +} + +export function appendNodeRequireToNodeOptions( + nodeOptions: string | undefined, + modulePath: string, +): string { + const normalizedPath = normalizeNodeRequirePathForNodeOptions(modulePath); + return `${nodeOptions ?? ''} --require "${normalizedPath}"`.trim(); +} + export function getOpenClawPackageStatus(): { dir: string; entryPath: string; diff --git a/electron/utils/telemetry.ts b/electron/utils/telemetry.ts new file mode 100644 index 0000000..39d60ef --- /dev/null +++ b/electron/utils/telemetry.ts @@ -0,0 +1,116 @@ +import { randomUUID } from 'node:crypto'; +import { app } from 'electron'; +import axios from 'axios'; +import logManager from '@electron/service/logger'; +import configManager from '@electron/service/config-service'; + +const POSTHOG_API_KEY = 'phc_aGNegeJQP5FzNiF2rEoKqQbkuCpiiETMttplibXpB0n'; +const POSTHOG_HOST = 'https://us.i.posthog.com'; +const TELEMETRY_REQUEST_TIMEOUT_MS = 2_500; +const TELEMETRY_SHUTDOWN_TIMEOUT_MS = 1_500; + +let telemetryEnabled = false; +let distinctId = ''; +const pendingCaptures = new Set>(); + +function getCommonProperties(): Record { + return { + $app_version: app.getVersion(), + $os: process.platform, + os_tag: process.platform, + arch: process.arch, + }; +} + +function queueCapture(event: string, properties: Record): void { + let capturePromise: Promise; + const request = axios.post( + `${POSTHOG_HOST}/capture/`, + { + api_key: POSTHOG_API_KEY, + event, + properties: { + distinct_id: distinctId, + ...properties, + }, + }, + { + headers: { + 'Content-Type': 'application/json', + }, + timeout: TELEMETRY_REQUEST_TIMEOUT_MS, + validateStatus: () => true, + }, + ).then((response) => { + if (response.status >= 400) { + logManager.debug(`Telemetry backend rejected event "${event}" with status ${response.status}`); + } + }).catch((error) => { + logManager.debug(`Failed to capture telemetry event "${event}":`, error); + }).finally(() => { + pendingCaptures.delete(capturePromise); + }); + + capturePromise = request.then(() => {}); + pendingCaptures.add(capturePromise); +} + +export async function initTelemetry(): Promise { + telemetryEnabled = Boolean(configManager.get('telemetryEnabled' as never)); + if (!telemetryEnabled) { + logManager.info('Telemetry is disabled; observability stays local-only'); + return; + } + + const storedDistinctId = configManager.get('machineId' as never); + distinctId = storedDistinctId && storedDistinctId.trim() + ? storedDistinctId + : randomUUID(); + if (!storedDistinctId) { + configManager.set('machineId' as never, distinctId); + } + + const hasReportedInstall = Boolean(configManager.get('hasReportedInstall' as never)); + if (!hasReportedInstall) { + captureTelemetryEvent('app_installed'); + configManager.set('hasReportedInstall' as never, true); + } + captureTelemetryEvent('app_opened'); +} + +export function trackMetric(event: string, properties: Record = {}): void { + logManager.info(`[metric] ${event}`, properties); +} + +export function captureTelemetryEvent( + event: string, + properties: Record = {}, +): void { + if (!telemetryEnabled || !distinctId) { + return; + } + + const mergedProperties = { + ...getCommonProperties(), + ...properties, + }; + queueCapture(event, mergedProperties); +} + +export async function shutdownTelemetry(): Promise { + telemetryEnabled = false; + if (pendingCaptures.size === 0) { + distinctId = ''; + return; + } + + const captures = Array.from(pendingCaptures); + await Promise.race([ + Promise.allSettled(captures).then(() => undefined), + new Promise((resolve) => { + setTimeout(resolve, TELEMETRY_SHUTDOWN_TIMEOUT_MS); + }), + ]); + + distinctId = ''; +} diff --git a/electron/utils/uv-env.ts b/electron/utils/uv-env.ts new file mode 100644 index 0000000..9da7fb0 --- /dev/null +++ b/electron/utils/uv-env.ts @@ -0,0 +1,123 @@ +import { request } from 'node:https'; +import { app } from 'electron'; +import logManager from '@electron/service/logger'; + +const UV_MIRROR_ENV: Record = { + UV_PYTHON_INSTALL_MIRROR: 'https://registry.npmmirror.com/-/binary/python-build-standalone/', + UV_INDEX_URL: 'https://pypi.tuna.tsinghua.edu.cn/simple/', +}; + +const GOOGLE_204_HOST = 'www.google.com'; +const GOOGLE_204_PATH = '/generate_204'; +const GOOGLE_204_TIMEOUT_MS = 2000; + +let cachedOptimized: boolean | null = null; +let cachedPromise: Promise | null = null; +let loggedOnce = false; + +function getLocaleAndTimezone(): { locale: string; timezone: string } { + const locale = app.getLocale?.() || ''; + const timezone = Intl.DateTimeFormat().resolvedOptions().timeZone || ''; + return { locale, timezone }; +} + +function isRegionOptimized(locale: string, timezone: string): boolean { + if (timezone) return timezone === 'Asia/Shanghai'; + return locale === 'zh-CN'; +} + +function probeGoogle204(timeoutMs: number): Promise { + return new Promise((resolve) => { + let done = false; + const finish = (value: boolean) => { + if (done) return; + done = true; + resolve(value); + }; + + const req = request( + { + method: 'GET', + hostname: GOOGLE_204_HOST, + path: GOOGLE_204_PATH, + }, + (res) => { + const status = res.statusCode || 0; + res.resume(); + finish(status >= 200 && status < 300); + }, + ); + + req.setTimeout(timeoutMs, () => { + req.destroy(new Error('google_204_timeout')); + }); + + req.on('error', () => finish(false)); + req.end(); + }); +} + +async function computeOptimization(): Promise { + const { locale, timezone } = getLocaleAndTimezone(); + + if (isRegionOptimized(locale, timezone)) { + if (!loggedOnce) { + logManager.info( + `Region optimization enabled via locale/timezone (locale=${locale || 'unknown'}, tz=${timezone || 'unknown'})`, + ); + loggedOnce = true; + } + return true; + } + + const reachable = await probeGoogle204(GOOGLE_204_TIMEOUT_MS); + const isOptimized = !reachable; + + if (!loggedOnce) { + const reason = reachable ? 'google_204_reachable' : 'google_204_unreachable'; + logManager.info( + `Network optimization probe: ${reason} (locale=${locale || 'unknown'}, tz=${timezone || 'unknown'})`, + ); + loggedOnce = true; + } + + return isOptimized; +} + +export async function shouldOptimizeNetwork(): Promise { + if (cachedOptimized !== null) return cachedOptimized; + if (cachedPromise) return cachedPromise; + + if (!app.isReady()) { + await app.whenReady(); + } + + cachedPromise = computeOptimization() + .then((result) => { + cachedOptimized = result; + return result; + }) + .catch((error) => { + logManager.warn('Network optimization check failed, defaulting to enabled:', error); + cachedOptimized = true; + return true; + }) + .finally(() => { + cachedPromise = null; + }); + + return cachedPromise; +} + +export async function getUvMirrorEnv(): Promise> { + const isOptimized = await shouldOptimizeNetwork(); + return isOptimized ? { ...UV_MIRROR_ENV } : {}; +} + +export async function warmupNetworkOptimization(): Promise { + try { + await shouldOptimizeNetwork(); + } catch { + // Ignore warmup failures. + } +} diff --git a/electron/utils/uv-setup.ts b/electron/utils/uv-setup.ts new file mode 100644 index 0000000..7e7b652 --- /dev/null +++ b/electron/utils/uv-setup.ts @@ -0,0 +1,192 @@ +import { execSync, spawn } from 'node:child_process'; +import { existsSync } from 'node:fs'; +import { join } from 'node:path'; +import { app } from 'electron'; +import logManager from '@electron/service/logger'; +import { getUvMirrorEnv } from './uv-env'; + +function getBundledUvPath(): string { + const platform = process.platform; + const arch = process.arch; + const target = `${platform}-${arch}`; + const binName = platform === 'win32' ? 'uv.exe' : 'uv'; + + if (app.isPackaged) { + return join(process.resourcesPath, 'bin', binName); + } + + return join(process.cwd(), 'resources', 'bin', target, binName); +} + +function findUvInPathSync(): boolean { + try { + const command = process.platform === 'win32' ? 'where.exe uv' : 'which uv'; + execSync(command, { stdio: 'ignore', timeout: 5000, windowsHide: true }); + return true; + } catch { + return false; + } +} + +function resolveUvBin(): { bin: string; source: 'bundled' | 'path' | 'bundled-fallback' } { + const bundled = getBundledUvPath(); + + if (app.isPackaged) { + if (existsSync(bundled)) { + return { bin: bundled, source: 'bundled' }; + } + logManager.warn(`Bundled uv binary not found at ${bundled}, falling back to system PATH`); + } + + if (findUvInPathSync()) { + return { bin: 'uv', source: 'path' }; + } + + if (existsSync(bundled)) { + return { bin: bundled, source: 'bundled-fallback' }; + } + + return { bin: 'uv', source: 'path' }; +} + +export async function checkUvInstalled(): Promise { + const { bin, source } = resolveUvBin(); + if (source === 'bundled' || source === 'bundled-fallback') { + return existsSync(bin); + } + return findUvInPathSync(); +} + +export async function installUv(): Promise { + const isAvailable = await checkUvInstalled(); + if (!isAvailable) { + const bin = getBundledUvPath(); + throw new Error(`uv not found in system PATH and bundled binary missing at ${bin}`); + } + logManager.info('uv is available and ready to use'); +} + +export async function isPythonReady(): Promise { + const { bin: uvBin } = resolveUvBin(); + + return await new Promise((resolve) => { + try { + const child = spawn(uvBin, ['python', 'find', '3.12'], { + windowsHide: true, + }); + child.on('close', (code) => resolve(code === 0)); + child.on('error', () => resolve(false)); + } catch { + resolve(false); + } + }); +} + +async function runPythonInstall( + uvBin: string, + env: Record, + label: string, +): Promise { + return await new Promise((resolve, reject) => { + const stderrChunks: string[] = []; + const stdoutChunks: string[] = []; + + const child = spawn(uvBin, ['python', 'install', '3.12'], { + env, + windowsHide: true, + }); + + child.stdout?.on('data', (data) => { + const line = data.toString().trim(); + if (line) { + stdoutChunks.push(line); + logManager.debug(`[python-setup:${label}] stdout: ${line}`); + } + }); + + child.stderr?.on('data', (data) => { + const line = data.toString().trim(); + if (line) { + stderrChunks.push(line); + logManager.info(`[python-setup:${label}] stderr: ${line}`); + } + }); + + child.on('close', (code) => { + if (code === 0) { + resolve(); + return; + } + + const stderr = stderrChunks.join('\n'); + const stdout = stdoutChunks.join('\n'); + const detail = stderr || stdout || '(no output captured)'; + reject(new Error( + `Python installation failed with code ${code} [${label}]\n` + + ` uv binary: ${uvBin}\n` + + ` platform: ${process.platform}/${process.arch}\n` + + ` output: ${detail}`, + )); + }); + + child.on('error', (error) => { + reject(new Error( + `Python installation spawn error [${label}]: ${error.message}\n` + + ` uv binary: ${uvBin}\n` + + ` platform: ${process.platform}/${process.arch}`, + )); + }); + }); +} + +export async function setupManagedPython(): Promise { + const { bin: uvBin, source } = resolveUvBin(); + const uvEnv = await getUvMirrorEnv(); + const hasMirror = Object.keys(uvEnv).length > 0; + + logManager.info( + `Setting up managed Python 3.12 ` + + `(uv=${uvBin}, source=${source}, arch=${process.arch}, mirror=${hasMirror})`, + ); + + const baseEnv: Record = { ...process.env }; + + try { + await runPythonInstall(uvBin, { ...baseEnv, ...uvEnv }, hasMirror ? 'mirror' : 'default'); + } catch (firstError) { + logManager.warn('Python install attempt 1 failed:', firstError); + + if (!hasMirror) { + throw firstError; + } + + logManager.info('Retrying Python install without mirror...'); + try { + await runPythonInstall(uvBin, baseEnv, 'no-mirror'); + } catch (secondError) { + logManager.error('Python install attempt 2 (no mirror) also failed:', secondError); + throw secondError; + } + } + + try { + const findPath = await new Promise((resolve) => { + const child = spawn(uvBin, ['python', 'find', '3.12'], { + env: { ...process.env, ...uvEnv }, + windowsHide: true, + }); + let output = ''; + child.stdout?.on('data', (data) => { + output += data; + }); + child.on('close', () => resolve(output.trim())); + child.on('error', () => resolve('')); + }); + + if (findPath) { + logManager.info(`Managed Python 3.12 installed at: ${findPath}`); + } + } catch (error) { + logManager.warn('Could not determine Python path after install:', error); + } +} diff --git a/global.d.ts b/global.d.ts index 982694a..2bf9e5f 100644 --- a/global.d.ts +++ b/global.d.ts @@ -47,6 +47,10 @@ declare global { params: [request: { path: string; method?: string; headers?: Record; body?: unknown }] return: Promise } + [IPC_EVENTS.HOST_API_TOKEN]: { + params: [] + return: Promise + } // 任务事件 [IPC_EVENTS.TASK_PROGRESS]: { params: [payload: any]; return: void; } diff --git a/runtime-shared/lib/constants.ts b/runtime-shared/lib/constants.ts index f8db9fc..d8a5d4c 100644 --- a/runtime-shared/lib/constants.ts +++ b/runtime-shared/lib/constants.ts @@ -1,5 +1,6 @@ export enum IPC_EVENTS { HOST_API_FETCH = 'hostapi:fetch', + HOST_API_TOKEN = 'hostapi:token', EXTERNAL_OPEN = 'external-open', APP_SET_FRAMELESS = 'app:set-frameless', APP_LOAD_PAGE = 'app:load-page', diff --git a/src/lib/host-api.ts b/src/lib/host-api.ts index 1cbc9c2..ec908a4 100644 --- a/src/lib/host-api.ts +++ b/src/lib/host-api.ts @@ -4,11 +4,16 @@ import { logout, readPersistedAuthToken } from '../router/auth-session'; type RequestInitLike = Pick; +const HOST_API_PORT = 13210; +const HOST_API_BASE = `http://127.0.0.1:${HOST_API_PORT}`; + type LooseIpcBridge = { invoke(channel: string, ...args: any[]): Promise; on?(channel: string, callback: (...args: any[]) => void): () => void; }; +let cachedHostApiToken: string | null = null; + function normalizeHeaders(headers?: HeadersInit): Headers { return new Headers(headers ?? {}); } @@ -67,6 +72,58 @@ function handleUnauthorized(): void { logout({ reason: 'unauthorized', from }); } +async function getHostApiToken(): Promise { + if (cachedHostApiToken) { + return cachedHostApiToken; + } + + cachedHostApiToken = await invokeIpc(IPC_EVENTS.HOST_API_TOKEN); + return cachedHostApiToken; +} + +async function fetchViaLocalHostApi( + path: string, + method: string, + headers: Headers, + body: BodyInit | null | undefined, +): Promise { + const hostApiToken = await getHostApiToken(); + const localHeaders = new Headers(headers); + if (hostApiToken && !localHeaders.has('X-Host-Api-Token')) { + localHeaders.set('X-Host-Api-Token', hostApiToken); + } + if (body != null && !localHeaders.has('Content-Type')) { + localHeaders.set('Content-Type', 'application/json'); + } + + const response = await fetch(`${HOST_API_BASE}${path}`, { + method, + headers: localHeaders, + body, + }); + + if (!response.ok) { + if (isUnauthorizedStatus(response.status)) { + handleUnauthorized(); + } + const text = await response.text(); + if (!isUnauthorizedStatus(response.status) && isUnauthorizedMessage(text)) { + handleUnauthorized(); + } + throw new Error(text || response.statusText || `Request failed with ${response.status}`); + } + + const contentType = response.headers.get('content-type') ?? ''; + if (response.status === 204) { + return undefined as T; + } + if (contentType.includes('application/json')) { + return (await response.json()) as T; + } + + return (await response.text()) as unknown as T; +} + export function hasHostApiBridge(): boolean { return typeof window !== 'undefined' && Boolean(window.api?.invoke); } @@ -89,6 +146,15 @@ export function onIpc(channel: string, callback: (...args: any[]) => void): () = return bridge.on ? bridge.on(channel, callback) : () => {}; } +function shouldFallbackToBrowser(message: string): boolean { + const normalized = message.toLowerCase(); + return normalized.includes('invalid ipc channel: hostapi:fetch') + || normalized.includes("no handler registered for 'hostapi:fetch'") + || normalized.includes('no handler registered for "hostapi:fetch"') + || normalized.includes('no handler registered for hostapi:fetch') + || normalized.includes('window is not defined'); +} + export async function hostApiFetch(path: string, init?: RequestInitLike): Promise { const method = init?.method ?? 'GET'; const headers = normalizeHeaders(init?.headers); @@ -112,8 +178,17 @@ export async function hostApiFetch(path: string, init?: RequestInitLike): Pro }; if (hasHostApiBridge()) { - const response = await invokeIpc(IPC_EVENTS.HOST_API_FETCH, request); - return extractResult(response); + try { + const response = await invokeIpc(IPC_EVENTS.HOST_API_FETCH, request); + return extractResult(response); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + if (!shouldFallbackToBrowser(message)) { + throw error; + } + + return fetchViaLocalHostApi(path, method, headers, normalizeBody(init?.body)); + } } if (typeof fetch === 'function') { @@ -135,6 +210,9 @@ export async function hostApiFetch(path: string, init?: RequestInitLike): Pro } const contentType = response.headers.get('content-type') ?? ''; + if (response.status === 204) { + return undefined as T; + } if (contentType.includes('application/json')) { return (await response.json()) as T; } @@ -144,3 +222,13 @@ export async function hostApiFetch(path: string, init?: RequestInitLike): Pro throw new Error(`No HTTP bridge available for ${path}`); } + +export async function createHostEventSource(path = '/api/events'): Promise { + const token = await getHostApiToken(); + const separator = path.includes('?') ? '&' : '?'; + return new EventSource(`${HOST_API_BASE}${path}${separator}token=${encodeURIComponent(token)}`); +} + +export function getHostApiBase(): string { + return HOST_API_BASE; +} diff --git a/tests/gateway-protocol-state.test.ts b/tests/gateway-protocol-state.test.ts new file mode 100644 index 0000000..98f6b05 --- /dev/null +++ b/tests/gateway-protocol-state.test.ts @@ -0,0 +1,249 @@ +// @vitest-environment node +import { describe, expect, it, vi, beforeEach, afterEach } from 'vitest'; + +vi.mock('@electron/service/logger', () => { + const logger = { + debug: vi.fn(), + info: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + }; + + return { + default: logger, + logManager: logger, + }; +}); + +import { + createErrorResponse, + createRequest, + createSuccessResponse, + GatewayEventType, + isNotification, + isRequest, + isResponse, +} from '../electron/gateway/protocol'; +import { + dispatchJsonRpcNotification, + dispatchProtocolEvent, +} from '../electron/gateway/event-dispatch'; +import { GatewayStateController } from '../electron/gateway/state'; +import { createInitialGatewayDiagnostics } from '../electron/gateway/diagnostics'; + +type EmittedEvent = { + event: string; + payload: unknown; +}; + +function createEmitterRecorder() { + const events: EmittedEvent[] = []; + + return { + events, + emitter: { + emit: vi.fn((event: string, payload: unknown) => { + events.push({ event, payload }); + return true; + }), + }, + }; +} + +describe('gateway protocol helpers', () => { + it('creates JSON-RPC request and responses', () => { + const request = createRequest('skills.status', { includeDisabled: true }, 'req-1'); + expect(request).toEqual({ + jsonrpc: '2.0', + id: 'req-1', + method: 'skills.status', + params: { includeDisabled: true }, + }); + + expect(createSuccessResponse('req-1', { ok: true })).toEqual({ + jsonrpc: '2.0', + id: 'req-1', + result: { ok: true }, + }); + + expect(createErrorResponse('req-1', -32001, 'not connected')).toEqual({ + jsonrpc: '2.0', + id: 'req-1', + error: { + code: -32001, + message: 'not connected', + data: undefined, + }, + }); + }); + + it('generates a request id when one is not supplied', () => { + const request = createRequest('gateway.ping'); + expect(request.jsonrpc).toBe('2.0'); + expect(typeof request.id).toBe('string'); + expect(request.id).not.toHaveLength(0); + }); + + it('detects requests, responses, and notifications', () => { + const request = createRequest('chat.send', { text: 'hello' }, 'req-2'); + const response = createSuccessResponse('req-2', { runId: 'run-1' }); + const notification = { + jsonrpc: '2.0' as const, + method: GatewayEventType.MESSAGE_RECEIVED, + params: { message: 'hi' }, + }; + + expect(isRequest(request)).toBe(true); + expect(isResponse(request)).toBe(false); + expect(isNotification(request)).toBe(false); + + expect(isResponse(response)).toBe(true); + expect(isRequest(response)).toBe(false); + expect(isNotification(response)).toBe(false); + + expect(isNotification(notification)).toBe(true); + expect(isRequest(notification)).toBe(false); + expect(isResponse(notification)).toBe(false); + }); +}); + +describe('gateway event dispatch', () => { + it('dispatches protocol chat and gateway.ready events', () => { + const { emitter, events } = createEmitterRecorder(); + + dispatchProtocolEvent(emitter, 'chat', { message: 'delta' }); + dispatchProtocolEvent(emitter, 'gateway.ready', { source: 'event' }); + + expect(events).toEqual([ + { event: 'chat:message', payload: { message: { message: 'delta' } } }, + { event: 'gateway:ready', payload: { source: 'event' } }, + ]); + }); + + it('dispatches unknown protocol events to notification listeners', () => { + const { emitter, events } = createEmitterRecorder(); + + dispatchProtocolEvent(emitter, 'skills.changed', { slug: 'minimax-xlsx' }); + + expect(events).toEqual([ + { + event: 'notification', + payload: { method: 'skills.changed', params: { slug: 'minimax-xlsx' } }, + }, + ]); + }); + + it('dispatches JSON-RPC notifications onto typed channels', () => { + const { emitter, events } = createEmitterRecorder(); + + dispatchJsonRpcNotification(emitter, { + jsonrpc: '2.0', + method: GatewayEventType.CHANNEL_STATUS_CHANGED, + params: { channelId: 'wx', status: 'connected' }, + }); + dispatchJsonRpcNotification(emitter, { + jsonrpc: '2.0', + method: GatewayEventType.MESSAGE_RECEIVED, + params: { message: { text: 'hello' } }, + }); + dispatchJsonRpcNotification(emitter, { + jsonrpc: '2.0', + method: GatewayEventType.ERROR, + params: { message: 'gateway boom' }, + }); + + expect(events[0]).toEqual({ + event: 'notification', + payload: { + jsonrpc: '2.0', + method: GatewayEventType.CHANNEL_STATUS_CHANGED, + params: { channelId: 'wx', status: 'connected' }, + }, + }); + expect(events[1]).toEqual({ + event: 'channel:status', + payload: { channelId: 'wx', status: 'connected' }, + }); + expect(events[2]).toEqual({ + event: 'notification', + payload: { + jsonrpc: '2.0', + method: GatewayEventType.MESSAGE_RECEIVED, + params: { message: { text: 'hello' } }, + }, + }); + expect(events[3]).toEqual({ + event: 'chat:message', + payload: { message: { text: 'hello' } }, + }); + expect(events[4].event).toBe('notification'); + expect(events[5].event).toBe('error'); + expect((events[5].payload as Error).message).toBe('gateway boom'); + }); +}); + +describe('gateway state controller', () => { + beforeEach(() => { + vi.useFakeTimers(); + vi.setSystemTime(new Date('2026-04-23T00:00:00.000Z')); + }); + + afterEach(() => { + vi.useRealTimers(); + }); + + it('tracks transitions, connectivity, and uptime', () => { + const emittedStatuses: Array> = []; + const transitions: Array<[string, string]> = []; + const controller = new GatewayStateController({ + emitStatus: (status) => { + emittedStatuses.push({ ...status }); + }, + onTransition: (previousState, nextState) => { + transitions.push([previousState, nextState]); + }, + }); + + controller.setStatus({ state: 'starting', port: 18789 }); + controller.setStatus({ state: 'running', port: 18789, connectedAt: Date.now() - 2_000 }); + + expect(controller.isConnected(false)).toBe(false); + expect(controller.isConnected(true)).toBe(true); + expect(emittedStatuses[0]).toMatchObject({ state: 'starting', port: 18789 }); + expect(emittedStatuses[1]).toMatchObject({ + state: 'running', + port: 18789, + connectedAt: Date.now() - 2_000, + }); + expect(emittedStatuses[1].uptime).toBe(2_000); + expect(transitions).toEqual([ + ['stopped', 'starting'], + ['starting', 'running'], + ]); + + vi.setSystemTime(new Date('2026-04-23T00:00:05.000Z')); + expect(controller.getStatus().uptime).toBe(7_000); + }); + + it('clears uptime when the gateway is no longer running', () => { + const controller = new GatewayStateController({ + emitStatus: () => {}, + }); + + controller.setStatus({ state: 'running', port: 18789, connectedAt: Date.now() - 1_000 }); + expect(controller.getStatus().uptime).toBe(1_000); + + controller.setStatus({ state: 'stopped' }); + expect(controller.getStatus().uptime).toBeUndefined(); + expect(controller.isConnected(true)).toBe(false); + }); +}); + +describe('gateway diagnostics helpers', () => { + it('creates an empty diagnostics snapshot', () => { + expect(createInitialGatewayDiagnostics()).toEqual({ + consecutiveHeartbeatMisses: 0, + consecutiveRpcFailures: 0, + }); + }); +}); diff --git a/tests/gateway-startup-helpers.test.ts b/tests/gateway-startup-helpers.test.ts new file mode 100644 index 0000000..b013fb7 --- /dev/null +++ b/tests/gateway-startup-helpers.test.ts @@ -0,0 +1,517 @@ +// @vitest-environment node +import { describe, expect, it, vi } from 'vitest'; + +vi.mock('@electron/service/logger', () => { + const logger = { + debug: vi.fn(), + info: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + }; + + return { + default: logger, + logManager: logger, + }; +}); + +import { + classifyGatewayStderrMessage, + recordGatewayStartupStderrLine, +} from '../electron/gateway/startup-stderr'; +import { resolveGatewayLaunchStrategy } from '../electron/gateway/launch-strategy'; +import { getGatewayStartupRecoveryAction } from '../electron/gateway/startup-recovery'; +import { runGatewayStartupSequence } from '../electron/gateway/startup-orchestrator'; +import { + DEFAULT_RECONNECT_CONFIG, + getDeferredRestartAction, + getReconnectScheduleDecision, + getReconnectSkipReason, + shouldDeferRestart, +} from '../electron/gateway/process-policy'; +import { GatewayLifecycleController, LifecycleSupersededError } from '../electron/gateway/lifecycle-controller'; +import { GatewayConnectionMonitor } from '../electron/gateway/connection-monitor'; +import { GatewayRestartController } from '../electron/gateway/restart-controller'; +import { GatewayRestartGovernor } from '../electron/gateway/restart-governor'; +import { + DEFAULT_GATEWAY_RELOAD_POLICY, + parseGatewayReloadPolicy, +} from '../electron/gateway/reload-policy'; + +describe('startup-stderr helpers', () => { + it('downgrades known config warnings to debug', () => { + expect( + classifyGatewayStderrMessage( + '2026-04-23T09:14:55.599+08:00 Config warnings: stale plugin ignored', + ), + ).toEqual({ + level: 'debug', + normalized: '2026-04-23T09:14:55.599+08:00 Config warnings: stale plugin ignored', + }); + }); + + it('keeps actionable stderr as warn', () => { + expect(classifyGatewayStderrMessage('fatal: failed to start gateway')).toEqual({ + level: 'warn', + normalized: 'fatal: failed to start gateway', + }); + }); + + it('caps startup stderr history', () => { + const lines: string[] = []; + for (let i = 0; i < 130; i += 1) { + recordGatewayStartupStderrLine(lines, `line-${i}`); + } + + expect(lines).toHaveLength(120); + expect(lines[0]).toBe('line-10'); + expect(lines.at(-1)).toBe('line-129'); + }); +}); + +describe('startup-recovery helpers', () => { + it('retries transient startup errors before max attempts', () => { + expect(getGatewayStartupRecoveryAction({ + startupError: new Error('OpenClaw Gateway exited before becoming ready (code=-1)'), + startupStderrLines: [], + configRepairAttempted: false, + attempt: 1, + maxAttempts: 3, + })).toBe('retry'); + }); + + it('prefers repair for invalid config signals', () => { + expect(getGatewayStartupRecoveryAction({ + startupError: new Error('startup failed'), + startupStderrLines: ['invalid config: unrecognized key browser.foo'], + configRepairAttempted: false, + attempt: 1, + maxAttempts: 3, + })).toBe('repair'); + }); + + it('fails once retries are exhausted', () => { + expect(getGatewayStartupRecoveryAction({ + startupError: new Error('OpenClaw Gateway failed to become ready on port 18789'), + startupStderrLines: [], + configRepairAttempted: false, + attempt: 3, + maxAttempts: 3, + })).toBe('fail'); + }); +}); + +describe('launch-strategy helpers', () => { + it('uses node-runtime on Windows dev by default', () => { + expect(resolveGatewayLaunchStrategy({ + platform: 'win32', + mode: 'dev', + })).toBe('node-runtime'); + }); + + it('keeps utility-process for packaged Windows builds', () => { + expect(resolveGatewayLaunchStrategy({ + platform: 'win32', + mode: 'packaged', + })).toBe('utility-process'); + }); + + it('honors forced strategy overrides', () => { + expect(resolveGatewayLaunchStrategy({ + platform: 'darwin', + mode: 'dev', + forced: 'utility', + })).toBe('utility-process'); + + expect(resolveGatewayLaunchStrategy({ + platform: 'linux', + mode: 'packaged', + forced: 'node', + })).toBe('node-runtime'); + }); +}); + +describe('runGatewayStartupSequence', () => { + it('connects to an existing gateway without starting a new process', async () => { + const connect = vi.fn(async () => {}); + const startProcess = vi.fn(async () => {}); + const onConnectedToExistingGateway = vi.fn(); + + await runGatewayStartupSequence({ + port: 18789, + shouldWaitForPortFree: false, + resetStartupStderrLines: vi.fn(), + getStartupStderrLines: () => [], + findExistingGateway: async () => ({ port: 18789 }), + connect, + onConnectedToExistingGateway, + waitForPortFree: vi.fn(async () => {}), + startProcess, + waitForReady: vi.fn(async () => {}), + onConnectedToManagedGateway: vi.fn(), + delay: vi.fn(async () => {}), + }); + + expect(connect).toHaveBeenCalledWith(18789, undefined); + expect(onConnectedToExistingGateway).toHaveBeenCalledTimes(1); + expect(startProcess).not.toHaveBeenCalled(); + }); + + it('starts and connects a managed gateway when none exists', async () => { + const events: string[] = []; + + await runGatewayStartupSequence({ + port: 19001, + shouldWaitForPortFree: true, + resetStartupStderrLines: vi.fn(() => { + events.push('reset'); + }), + getStartupStderrLines: () => [], + findExistingGateway: async () => null, + connect: async () => { + events.push('connect'); + }, + onConnectedToExistingGateway: vi.fn(), + waitForPortFree: async () => { + events.push('wait-port'); + }, + startProcess: async () => { + events.push('start'); + }, + waitForReady: async () => { + events.push('ready'); + }, + onConnectedToManagedGateway: () => { + events.push('managed-connected'); + }, + delay: vi.fn(async () => {}), + }); + + expect(events).toEqual(['reset', 'wait-port', 'start', 'ready', 'connect', 'managed-connected']); + }); + + it('retries once after a transient startup error', async () => { + let attempts = 0; + const delay = vi.fn(async () => {}); + const startProcess = vi.fn(async () => { + attempts += 1; + if (attempts === 1) { + throw new Error('OpenClaw Gateway exited before becoming ready (code=-1)'); + } + }); + const connect = vi.fn(async () => {}); + + await runGatewayStartupSequence({ + port: 19001, + shouldWaitForPortFree: false, + maxStartAttempts: 3, + resetStartupStderrLines: vi.fn(), + getStartupStderrLines: () => [], + findExistingGateway: async () => null, + connect, + onConnectedToExistingGateway: vi.fn(), + waitForPortFree: vi.fn(async () => {}), + startProcess, + waitForReady: vi.fn(async () => {}), + onConnectedToManagedGateway: vi.fn(), + delay, + }); + + expect(startProcess).toHaveBeenCalledTimes(2); + expect(delay).toHaveBeenCalledTimes(1); + expect(connect).toHaveBeenCalledTimes(1); + }); + + it('runs doctor repair once for invalid config signals before retrying startup', async () => { + let attempts = 0; + const events: string[] = []; + const runDoctorRepair = vi.fn(async () => true); + + await runGatewayStartupSequence({ + port: 19002, + shouldWaitForPortFree: false, + maxStartAttempts: 3, + resetStartupStderrLines: vi.fn(() => { + events.push('reset'); + }), + getStartupStderrLines: () => attempts === 1 ? ['invalid config: unrecognized key browser.foo'] : [], + findExistingGateway: async () => null, + connect: async () => { + events.push('connect'); + }, + onConnectedToExistingGateway: vi.fn(), + waitForPortFree: vi.fn(async () => {}), + startProcess: async () => { + attempts += 1; + events.push(`start-${attempts}`); + if (attempts === 1) { + throw new Error('startup failed'); + } + }, + waitForReady: async () => { + events.push('ready'); + }, + onConnectedToManagedGateway: () => { + events.push('managed-connected'); + }, + runDoctorRepair, + onDoctorRepairSuccess: () => { + events.push('repair-success'); + }, + delay: vi.fn(async () => {}), + }); + + expect(runDoctorRepair).toHaveBeenCalledTimes(1); + expect(events).toEqual([ + 'reset', + 'start-1', + 'repair-success', + 'reset', + 'start-2', + 'ready', + 'connect', + 'managed-connected', + ]); + }); + + it('bubbles lifecycle superseded errors without retrying', async () => { + await expect(runGatewayStartupSequence({ + port: 19003, + shouldWaitForPortFree: false, + resetStartupStderrLines: vi.fn(), + getStartupStderrLines: () => [], + findExistingGateway: async () => { + throw new LifecycleSupersededError('stale start'); + }, + connect: vi.fn(async () => {}), + onConnectedToExistingGateway: vi.fn(), + waitForPortFree: vi.fn(async () => {}), + startProcess: vi.fn(async () => {}), + waitForReady: vi.fn(async () => {}), + onConnectedToManagedGateway: vi.fn(), + delay: vi.fn(async () => {}), + })).rejects.toThrow('stale start'); + }); +}); + +describe('process-policy helpers', () => { + it('schedules reconnect attempts with exponential backoff', () => { + expect(getReconnectScheduleDecision({ + shouldReconnect: true, + hasReconnectTimer: false, + reconnectAttempts: 2, + maxAttempts: DEFAULT_RECONNECT_CONFIG.maxAttempts, + baseDelay: DEFAULT_RECONNECT_CONFIG.baseDelay, + maxDelay: DEFAULT_RECONNECT_CONFIG.maxDelay, + })).toEqual({ + action: 'schedule', + nextAttempt: 3, + maxAttempts: 10, + delay: 4000, + }); + }); + + it('returns skip reasons for reconnect callbacks', () => { + expect(getReconnectSkipReason({ + scheduledEpoch: 1, + currentEpoch: 1, + shouldReconnect: false, + })).toBe('auto-reconnect disabled'); + + expect(getReconnectSkipReason({ + scheduledEpoch: 1, + currentEpoch: 2, + shouldReconnect: true, + })).toContain('stale reconnect callback'); + }); + + it('classifies deferred restart actions', () => { + expect(shouldDeferRestart({ state: 'starting', startLock: false })).toBe(true); + expect(getDeferredRestartAction({ + hasPendingRestart: true, + state: 'running', + startLock: false, + shouldReconnect: true, + })).toBe('execute'); + expect(getDeferredRestartAction({ + hasPendingRestart: true, + state: 'running', + startLock: false, + shouldReconnect: false, + })).toBe('drop'); + }); +}); + +describe('lifecycle-controller helpers', () => { + it('tracks lifecycle epochs and rejects stale phases', () => { + const lifecycle = new GatewayLifecycleController(); + const firstEpoch = lifecycle.bump('start'); + expect(firstEpoch).toBe(1); + + lifecycle.bump('restart'); + expect(() => lifecycle.assert(firstEpoch, 'connect')).toThrow(LifecycleSupersededError); + }); +}); + +describe('connection-monitor helpers', () => { + it('triggers heartbeat timeout after consecutive misses', () => { + vi.useFakeTimers(); + try { + const sendPing = vi.fn(); + const onHeartbeatTimeout = vi.fn(); + const monitor = new GatewayConnectionMonitor(); + + monitor.startPing({ + sendPing, + onHeartbeatTimeout, + intervalMs: 1000, + timeoutMs: 500, + maxConsecutiveMisses: 2, + }); + + vi.advanceTimersByTime(1000); + vi.advanceTimersByTime(1000); + vi.advanceTimersByTime(1000); + + expect(sendPing).toHaveBeenCalledTimes(2); + expect(onHeartbeatTimeout).toHaveBeenCalledWith({ + consecutiveMisses: 2, + timeoutMs: 500, + }); + } finally { + vi.useRealTimers(); + } + }); + + it('resets heartbeat misses when alive messages arrive', () => { + vi.useFakeTimers(); + try { + const monitor = new GatewayConnectionMonitor(); + monitor.startPing({ + sendPing: vi.fn(), + onHeartbeatTimeout: vi.fn(), + intervalMs: 1000, + timeoutMs: 500, + maxConsecutiveMisses: 3, + }); + + vi.advanceTimersByTime(1000); + vi.advanceTimersByTime(1000); + expect(monitor.getConsecutiveMisses()).toBe(1); + + monitor.markAlive('message'); + expect(monitor.getConsecutiveMisses()).toBe(0); + } finally { + vi.useRealTimers(); + } + }); +}); + +describe('restart-controller helpers', () => { + it('flushes deferred restarts once lifecycle settles', () => { + const controller = new GatewayRestartController(); + const executeRestart = vi.fn(); + + controller.markDeferredRestart('restart', { + state: 'starting', + startLock: true, + }); + + controller.flushDeferredRestart('status:starting->running', { + state: 'running', + startLock: false, + shouldReconnect: true, + }, executeRestart); + + expect(executeRestart).toHaveBeenCalledTimes(1); + }); + + it('drops deferred restarts if another restart already completed later', () => { + vi.useFakeTimers(); + try { + vi.setSystemTime(new Date('2026-04-23T12:00:00Z')); + const controller = new GatewayRestartController(); + const executeRestart = vi.fn(); + + controller.markDeferredRestart('restart', { + state: 'starting', + startLock: true, + }); + + vi.setSystemTime(new Date('2026-04-23T12:00:01Z')); + controller.recordRestartCompleted(); + controller.flushDeferredRestart('start:finally', { + state: 'running', + startLock: false, + shouldReconnect: true, + }, executeRestart); + + expect(executeRestart).not.toHaveBeenCalled(); + } finally { + vi.useRealTimers(); + } + }); + + it('debounces repeated restart requests', () => { + vi.useFakeTimers(); + try { + const controller = new GatewayRestartController(); + const executeRestart = vi.fn(); + + controller.debouncedRestart(1000, executeRestart); + controller.debouncedRestart(1000, executeRestart); + vi.advanceTimersByTime(999); + expect(executeRestart).not.toHaveBeenCalled(); + + vi.advanceTimersByTime(1); + expect(executeRestart).toHaveBeenCalledTimes(1); + } finally { + vi.useRealTimers(); + } + }); +}); + +describe('restart-governor helpers', () => { + it('suppresses restart attempts during cooldown', () => { + const governor = new GatewayRestartGovernor({ cooldownMs: 5000 }); + + expect(governor.decide(1000)).toEqual({ allow: true }); + governor.recordExecuted(1000); + expect(governor.decide(3000)).toEqual({ + allow: false, + reason: 'cooldown_active', + retryAfterMs: 3000, + }); + expect(governor.getCounters()).toEqual({ + executedTotal: 1, + suppressedTotal: 1, + }); + }); +}); + +describe('reload-policy helpers', () => { + it('parses valid gateway reload config and clamps debounce', () => { + expect(parseGatewayReloadPolicy({ + gateway: { + reload: { + mode: 'restart', + debounceMs: 120000, + }, + }, + })).toEqual({ + mode: 'restart', + debounceMs: 60000, + }); + }); + + it('falls back to defaults for invalid values', () => { + expect(parseGatewayReloadPolicy({ + gateway: { + reload: { + mode: 'invalid', + debounceMs: 'oops', + }, + }, + })).toEqual(DEFAULT_GATEWAY_RELOAD_POLICY); + }); +});