diff --git a/electron/api/routes/app.ts b/electron/api/routes/app.ts index 92e0d79..7d4db9e 100644 --- a/electron/api/routes/app.ts +++ b/electron/api/routes/app.ts @@ -17,6 +17,9 @@ export async function handleAppRoutes( }); res.write(': connected\n\n'); ctx.eventBus.addSseClient(res); + // Send a current-state snapshot immediately so renderer subscribers do not + // miss lifecycle transitions that happened before the SSE connection opened. + res.write(`event: gateway:status\ndata: ${JSON.stringify(ctx.gatewayManager.getStatus())}\n\n`); return true; } diff --git a/electron/gateway/config-sync.ts b/electron/gateway/config-sync.ts index 3d973d1..6dadb74 100644 --- a/electron/gateway/config-sync.ts +++ b/electron/gateway/config-sync.ts @@ -6,6 +6,7 @@ import { getApiKey, getDefaultProvider, getProvider } from '../utils/secure-stor import { getProviderEnvVar, getKeyableProviderTypes } from '../utils/provider-registry'; import { getOpenClawDir, getOpenClawEntryPath, isOpenClawPresent } from '../utils/paths'; import { getUvMirrorEnv } from '../utils/uv-env'; +import { listConfiguredChannels } from '../utils/channel-config'; import { syncGatewayTokenToConfig, syncBrowserConfigToOpenClaw, sanitizeOpenClawConfig } from '../utils/openclaw-auth'; import { buildProxyEnv, resolveProxySettings } from '../utils/proxy'; import { syncProxyConfigToOpenClaw } from '../utils/openclaw-proxy'; @@ -21,6 +22,7 @@ export interface GatewayLaunchContext { binPathExists: boolean; loadedProviderKeyCount: number; proxySummary: string; + channelStartupSummary: string; } export async function syncGatewayConfigBeforeLaunch( @@ -88,6 +90,32 @@ async function loadProviderEnv(): Promise<{ providerEnv: Record; return { providerEnv, loadedProviderKeyCount }; } +async function resolveChannelStartupPolicy(): Promise<{ + skipChannels: boolean; + channelStartupSummary: string; +}> { + try { + const configuredChannels = await listConfiguredChannels(); + if (configuredChannels.length === 0) { + return { + skipChannels: true, + channelStartupSummary: 'skipped(no configured channels)', + }; + } + + return { + skipChannels: false, + channelStartupSummary: `enabled(${configuredChannels.join(',')})`, + }; + } catch (error) { + logger.warn('Failed to determine configured channels for gateway launch:', error); + return { + skipChannels: false, + channelStartupSummary: 'enabled(unknown)', + }; + } +} + export async function prepareGatewayLaunchContext(port: number): Promise { const openclawDir = getOpenClawDir(); const entryScript = getOpenClawEntryPath(); @@ -118,6 +146,7 @@ export async function prepareGatewayLaunchContext(port: number): Promise void; } -/** - * Reconnection configuration - */ -interface ReconnectConfig { - maxAttempts: number; - baseDelay: number; - maxDelay: number; -} - -const DEFAULT_RECONNECT_CONFIG: ReconnectConfig = { - maxAttempts: 10, - baseDelay: 1000, - maxDelay: 30000, -}; - // getNodeExecutablePath() removed: utilityProcess.fork() handles process isolation // natively on all platforms (no dock icon on macOS, no console on Windows). @@ -257,40 +252,6 @@ export class GatewayManager extends EventEmitter { return sanitized; } - private formatExit(code: number | null, signal: NodeJS.Signals | null): string { - if (code !== null) return `code=${code}`; - if (signal) return `signal=${signal}`; - return 'code=null signal=null'; - } - - private classifyStderrMessage(message: string): { level: 'drop' | 'debug' | 'warn'; normalized: string } { - const msg = message.trim(); - if (!msg) return { level: 'drop', normalized: msg }; - - // Known noisy lines that are not actionable for Gateway lifecycle debugging. - if (msg.includes('openclaw-control-ui') && msg.includes('token_mismatch')) return { level: 'drop', normalized: msg }; - if (msg.includes('closed before connect') && msg.includes('token mismatch')) return { level: 'drop', normalized: msg }; - - // Downgrade frequent non-fatal noise. - if (msg.includes('ExperimentalWarning')) return { level: 'debug', normalized: msg }; - if (msg.includes('DeprecationWarning')) return { level: 'debug', normalized: msg }; - if (msg.includes('Debugger attached')) return { level: 'debug', normalized: msg }; - // Electron restricts NODE_OPTIONS in packaged apps; this is expected and harmless. - if (msg.includes('NODE_OPTIONs are not supported in packaged apps')) return { level: 'debug', normalized: msg }; - - return { level: 'warn', normalized: msg }; - } - - private recordStartupStderrLine(line: string): void { - const normalized = line.trim(); - if (!normalized) return; - this.recentStartupStderrLines.push(normalized); - const MAX_STDERR_LINES = 120; - if (this.recentStartupStderrLines.length > MAX_STDERR_LINES) { - this.recentStartupStderrLines.splice(0, this.recentStartupStderrLines.length - MAX_STDERR_LINES); - } - } - private bumpLifecycleEpoch(reason: string): number { this.lifecycleEpoch = nextLifecycleEpoch(this.lifecycleEpoch); logger.debug(`Gateway lifecycle epoch advanced to ${this.lifecycleEpoch} (${reason})`); @@ -406,16 +367,7 @@ export class GatewayManager extends EventEmitter { // Check if Python environment is ready (self-healing) asynchronously. // Fire-and-forget: only needs to run once, not on every retry. - void isPythonReady().then(pythonReady => { - if (!pythonReady) { - logger.info('Python environment missing or incomplete, attempting background repair...'); - void setupManagedPython().catch(err => { - logger.error('Background Python repair failed:', err); - }); - } - }).catch(err => { - logger.error('Failed to check Python environment:', err); - }); + warmupManagedPythonReadiness(); try { let startAttempts = 0; @@ -428,7 +380,10 @@ export class GatewayManager extends EventEmitter { try { // Check if Gateway is already running logger.debug('Checking for existing Gateway...'); - const existing = await this.findExistingGateway(); + const existing = await findExistingGatewayProcess({ + port: this.status.port, + ownedPid: this.process?.pid, + }); this.assertLifecycleEpoch(startEpoch, 'start/find-existing'); if (existing) { logger.debug(`Found existing Gateway on port ${existing.port}`); @@ -446,7 +401,7 @@ export class GatewayManager extends EventEmitter { // after the previous Gateway process exits, preventing the new one // from binding. Wait for the port to be free before proceeding. if (process.platform === 'win32') { - await this.waitForPortFree(this.status.port); + await waitForPortFree(this.status.port); this.assertLifecycleEpoch(startEpoch, 'start/wait-port'); } @@ -475,7 +430,7 @@ export class GatewayManager extends EventEmitter { logger.warn( 'Detected invalid OpenClaw config during Gateway startup; running doctor repair before retry' ); - const repaired = await this.runOpenClawDoctorRepair(); + const repaired = await runOpenClawDoctorRepair(); if (repaired) { logger.info('OpenClaw doctor repair completed; retrying Gateway startup'); this.setStatus({ state: 'starting', error: undefined, reconnectAttempts: 0 }); @@ -486,12 +441,7 @@ export class GatewayManager extends EventEmitter { // Retry on transient connect errors const errMsg = String(error); - const isTransientError = - errMsg.includes('WebSocket closed before handshake') || - errMsg.includes('ECONNREFUSED') || - errMsg.includes('Gateway process exited before becoming ready') || - errMsg.includes('Timed out waiting for connect.challenge') || - errMsg.includes('Connect handshake timeout'); + const isTransientError = isTransientGatewayStartError(error); if (startAttempts < MAX_START_ATTEMPTS && isTransientError) { logger.warn(`Transient start error: ${errMsg}. Retrying... (${startAttempts}/${MAX_START_ATTEMPTS})`); @@ -551,34 +501,7 @@ export class GatewayManager extends EventEmitter { // Kill process if (this.process && this.ownsProcess) { const child = this.process; - // UtilityProcess doesn't expose exitCode/signalCode — track exit via event. - let exited = false; - - await new Promise((resolve) => { - child.once('exit', () => { - exited = true; - resolve(); - }); - - const pid = child.pid; - logger.info(`Sending kill to Gateway process (pid=${pid ?? 'unknown'})`); - try { child.kill(); } catch { /* ignore if already exited */ } - - // Force kill after timeout via OS-level kill on the PID - const timeout = setTimeout(() => { - if (!exited) { - logger.warn(`Gateway did not exit in time, force-killing (pid=${pid ?? 'unknown'})`); - if (pid) { - try { process.kill(pid, 'SIGKILL'); } catch { /* ignore */ } - } - } - resolve(); - }, 5000); - - child.once('exit', () => { - clearTimeout(timeout); - }); - }); + await terminateOwnedGatewayProcess(child); if (this.process === child) { this.process = null; @@ -747,322 +670,13 @@ export class GatewayManager extends EventEmitter { } } - /** - * Unload the system-managed openclaw gateway launchctl service if it is - * loaded. Without this, killing the process only causes launchctl to - * respawn it, leading to an infinite reconnect loop. - */ - private async unloadLaunchctlService(): Promise { - if (process.platform !== 'darwin') return; - - try { - const uid = process.getuid?.(); - if (uid === undefined) return; - - const LAUNCHD_LABEL = 'ai.openclaw.gateway'; - const serviceTarget = `gui/${uid}/${LAUNCHD_LABEL}`; - - const loaded = await new Promise((resolve) => { - import('child_process').then(cp => { - cp.exec(`launchctl print ${serviceTarget}`, { timeout: 5000 }, (err) => { - resolve(!err); - }); - }).catch(() => resolve(false)); - }); - - if (!loaded) return; - - logger.info(`Unloading launchctl service ${serviceTarget} to prevent auto-respawn`); - await new Promise((resolve) => { - import('child_process').then(cp => { - cp.exec(`launchctl bootout ${serviceTarget}`, { timeout: 10000 }, (err) => { - if (err) { - logger.warn(`Failed to bootout launchctl service: ${err.message}`); - } else { - logger.info('Successfully unloaded launchctl gateway service'); - } - resolve(); - }); - }).catch(() => resolve()); - }); - - await new Promise(r => setTimeout(r, 2000)); - - // Remove the plist so the service won't reload on next login. - try { - const { homedir } = await import('os'); - const plistPath = path.join(homedir(), 'Library', 'LaunchAgents', `${LAUNCHD_LABEL}.plist`); - const { access, unlink } = await import('fs/promises'); - await access(plistPath); - await unlink(plistPath); - logger.info(`Removed legacy launchd plist to prevent reload on next login: ${plistPath}`); - } catch { - // File doesn't exist or can't be removed -- not fatal - } - } catch (err) { - logger.warn('Error while unloading launchctl gateway service:', err); - } - } - - /** - * Find existing Gateway process by attempting a WebSocket connection - */ - private async findExistingGateway(): Promise<{ port: number, externalToken?: string } | null> { - try { - const port = PORTS.OPENCLAW_GATEWAY; - - try { - // Platform-specific command to find processes listening on the gateway port. - // We use native commands (netstat on Windows) to avoid triggering AV blocks - // that flag "powershell -WindowStyle Hidden" as malware behavior. - // windowsHide: true in cp.exec natively prevents the black command window. - const cmd = process.platform === 'win32' - ? `netstat -ano | findstr :${port}` - : `lsof -i :${port} -sTCP:LISTEN -t`; - - const { stdout } = await new Promise<{ stdout: string }>((resolve, reject) => { - import('child_process').then(cp => { - cp.exec(cmd, { timeout: 5000, windowsHide: true }, (err, stdout) => { - if (err) resolve({ stdout: '' }); - else resolve({ stdout }); - }); - }).catch(reject); - }); - - if (stdout.trim()) { - // Parse netstat or lsof output to extract PIDs - let pids: string[] = []; - if (process.platform === 'win32') { - // netstat -ano output format: - // TCP 127.0.0.1:3000 0.0.0.0:0 LISTENING 12345 - const lines = stdout.trim().split(/\r?\n/); - for (const line of lines) { - const parts = line.trim().split(/\s+/); - if (parts.length >= 5 && parts[3] === 'LISTENING') { - pids.push(parts[4]); - } - } - } else { - pids = stdout.trim().split(/\r?\n/).map(s => s.trim()).filter(Boolean); - } - // Remove duplicate PIDs - pids = [...new Set(pids)]; - - if (pids.length > 0) { - if (!this.process || !pids.includes(String(this.process.pid))) { - logger.info(`Found orphaned process listening on port ${port} (PIDs: ${pids.join(', ')}), attempting to kill...`); - - // Unload the launchctl service first so macOS doesn't auto- - // respawn the process we're about to kill. - if (process.platform === 'darwin') { - await this.unloadLaunchctlService(); - } - - // Terminate orphaned processes - for (const pid of pids) { - try { - if (process.platform === 'win32') { - // Use taskkill with windowsHide: true. This natively hides the console - // flash without needing PowerShell, avoiding AV alerts. - import('child_process').then(cp => { - cp.exec( - `taskkill /F /PID ${pid} /T`, - { timeout: 5000, windowsHide: true }, - () => { } - ); - }).catch(() => { }); - } else { - // SIGTERM first so the gateway can clean up its lock file. - process.kill(parseInt(pid), 'SIGTERM'); - } - } catch { /* ignore */ } - } - await new Promise(r => setTimeout(r, process.platform === 'win32' ? 2000 : 3000)); - - // SIGKILL any survivors (Unix only — Windows taskkill /F is already forceful) - if (process.platform !== 'win32') { - for (const pid of pids) { - try { process.kill(parseInt(pid), 0); process.kill(parseInt(pid), 'SIGKILL'); } catch { /* already exited */ } - } - await new Promise(r => setTimeout(r, 1000)); - } - return null; - } - } - } - } catch (err) { - logger.warn('Error checking for existing process on port:', err); - } - - // Try a quick WebSocket connection to check if gateway is listening - return await new Promise<{ port: number, externalToken?: string } | null>((resolve) => { - const testWs = new WebSocket(`ws://localhost:${port}/ws`); - const timeout = setTimeout(() => { - testWs.close(); - resolve(null); - }, 2000); - - testWs.on('open', () => { - clearTimeout(timeout); - testWs.close(); - resolve({ port }); - }); - - testWs.on('error', () => { - clearTimeout(timeout); - resolve(null); - }); - }); - } catch { - // Gateway not running - } - - return null; - } - - /** - * Attempt to repair invalid OpenClaw config using the built-in doctor command. - * Returns true when doctor exits successfully. - */ - private async runOpenClawDoctorRepair(): Promise { - const openclawDir = getOpenClawDir(); - const entryScript = getOpenClawEntryPath(); - if (!existsSync(entryScript)) { - logger.error(`Cannot run OpenClaw doctor repair: entry script not found at ${entryScript}`); - return false; - } - - const platform = process.platform; - const arch = process.arch; - const target = `${platform}-${arch}`; - const binPath = app.isPackaged - ? path.join(process.resourcesPath, 'bin') - : path.join(process.cwd(), 'resources', 'bin', target); - const binPathExists = existsSync(binPath); - const finalPath = binPathExists - ? `${binPath}${path.delimiter}${process.env.PATH || ''}` - : process.env.PATH || ''; - - const uvEnv = await getUvMirrorEnv(); - const doctorArgs = ['doctor', '--fix', '--yes', '--non-interactive']; - logger.info( - `Running OpenClaw doctor repair (entry="${entryScript}", args="${doctorArgs.join(' ')}", cwd="${openclawDir}", bundledBin=${binPathExists ? 'yes' : 'no'})` - ); - - return new Promise((resolve) => { - const forkEnv: Record = { - ...process.env, - PATH: finalPath, - ...uvEnv, - OPENCLAW_NO_RESPAWN: '1', - }; - - const child = utilityProcess.fork(entryScript, doctorArgs, { - cwd: openclawDir, - stdio: 'pipe', - env: forkEnv as NodeJS.ProcessEnv, - }); - - let settled = false; - const finish = (ok: boolean) => { - if (settled) return; - settled = true; - resolve(ok); - }; - - const timeout = setTimeout(() => { - logger.error('OpenClaw doctor repair timed out after 120000ms'); - try { - child.kill(); - } catch { - // ignore - } - finish(false); - }, 120000); - - child.on('error', (err) => { - clearTimeout(timeout); - logger.error('Failed to spawn OpenClaw doctor repair process:', err); - finish(false); - }); - - child.stdout?.on('data', (data) => { - const raw = data.toString(); - for (const line of raw.split(/\r?\n/)) { - const normalized = line.trim(); - if (!normalized) continue; - logger.debug(`[Gateway doctor stdout] ${normalized}`); - } - }); - - child.stderr?.on('data', (data) => { - const raw = data.toString(); - for (const line of raw.split(/\r?\n/)) { - const normalized = line.trim(); - if (!normalized) continue; - logger.warn(`[Gateway doctor stderr] ${normalized}`); - } - }); - - child.on('exit', (code: number) => { - clearTimeout(timeout); - if (code === 0) { - logger.info('OpenClaw doctor repair completed successfully'); - finish(true); - return; - } - logger.warn(`OpenClaw doctor repair exited (code=${code})`); - finish(false); - }); - }); - } - /** * Start Gateway process * Uses OpenClaw npm package from node_modules (dev) or resources (production) */ - /** - * Wait until the gateway port is no longer held by the OS. - * On Windows, TCP TIME_WAIT can keep a port occupied for up to 2 minutes - * after the owning process exits, causing the new Gateway to hang on bind. - */ - private async waitForPortFree(port: number, timeoutMs = 30000): Promise { - const net = await import('net'); - const start = Date.now(); - const pollInterval = 500; - let logged = false; - - while (Date.now() - start < timeoutMs) { - const available = await new Promise((resolve) => { - const server = net.createServer(); - server.once('error', () => resolve(false)); - server.once('listening', () => { - server.close(() => resolve(true)); - }); - server.listen(port, '127.0.0.1'); - }); - - if (available) { - const elapsed = Date.now() - start; - if (elapsed > pollInterval) { - logger.info(`Port ${port} became available after ${elapsed}ms`); - } - return; - } - - if (!logged) { - logger.info(`Waiting for port ${port} to become available (Windows TCP TIME_WAIT)...`); - logged = true; - } - await new Promise(r => setTimeout(r, pollInterval)); - } - - logger.warn(`Port ${port} still occupied after ${timeoutMs}ms, proceeding anyway`); - } - private async startProcess(): Promise { // Ensure no system-managed gateway service will compete with our process. - await this.unloadLaunchctlService(); + await unloadLaunchctlGatewayService(); const launchContext = await prepareGatewayLaunchContext(this.status.port); const { openclawDir, @@ -1073,10 +687,11 @@ export class GatewayManager extends EventEmitter { binPathExists, loadedProviderKeyCount, proxySummary, + channelStartupSummary, } = launchContext; logger.info( - `Starting Gateway process (mode=${mode}, port=${this.status.port}, entry="${entryScript}", args="${this.sanitizeSpawnArgs(gatewayArgs).join(' ')}", cwd="${openclawDir}", bundledBin=${binPathExists ? 'yes' : 'no'}, providerKeys=${loadedProviderKeyCount}, proxy=${proxySummary})` + `Starting Gateway process (mode=${mode}, port=${this.status.port}, entry="${entryScript}", args="${this.sanitizeSpawnArgs(gatewayArgs).join(' ')}", cwd="${openclawDir}", bundledBin=${binPathExists ? 'yes' : 'no'}, providerKeys=${loadedProviderKeyCount}, channels=${channelStartupSummary}, proxy=${proxySummary})` ); this.lastSpawnSummary = `mode=${mode}, entry="${entryScript}", args="${this.sanitizeSpawnArgs(gatewayArgs).join(' ')}", cwd="${openclawDir}"`; @@ -1144,8 +759,8 @@ export class GatewayManager extends EventEmitter { child.stderr?.on('data', (data) => { const raw = data.toString(); for (const line of raw.split(/\r?\n/)) { - this.recordStartupStderrLine(line); - const classified = this.classifyStderrMessage(line); + recordGatewayStartupStderrLine(this.recentStartupStderrLines, line); + const classified = classifyGatewayStderrMessage(line); if (classified.level === 'drop') continue; if (classified.level === 'debug') { logger.debug(`[Gateway stderr] ${classified.normalized}`); @@ -1168,9 +783,9 @@ export class GatewayManager extends EventEmitter { } /** - * Wait for Gateway to be ready by checking if the port is accepting connections + * Wait for Gateway to be ready by checking if it can issue connect challenges. */ - private async waitForReady(retries = 2400, interval = 250): Promise { + private async waitForReady(retries = 2400, interval = 200): Promise { const child = this.process; for (let i = 0; i < retries; i++) { // Early exit if the gateway process has already exited. @@ -1182,7 +797,7 @@ export class GatewayManager extends EventEmitter { } try { - const ready = await probeGatewayReady(this.status.port, 2000); + const ready = await probeGatewayReady(this.status.port, 1500); if (ready) { logger.debug(`Gateway ready after ${i + 1} attempt(s)`); @@ -1447,17 +1062,26 @@ export class GatewayManager extends EventEmitter { * Schedule reconnection attempt with exponential backoff */ private scheduleReconnect(): void { - if (!this.shouldReconnect) { - logger.debug('Gateway reconnect skipped (auto-reconnect disabled)'); + const decision = getReconnectScheduleDecision({ + shouldReconnect: this.shouldReconnect, + hasReconnectTimer: this.reconnectTimer !== null, + reconnectAttempts: this.reconnectAttempts, + maxAttempts: this.reconnectConfig.maxAttempts, + baseDelay: this.reconnectConfig.baseDelay, + maxDelay: this.reconnectConfig.maxDelay, + }); + + if (decision.action === 'skip') { + logger.debug(`Gateway reconnect skipped (${decision.reason})`); return; } - if (this.reconnectTimer) { + if (decision.action === 'already-scheduled') { return; } - if (this.reconnectAttempts >= this.reconnectConfig.maxAttempts) { - logger.error(`Gateway reconnect failed: max attempts reached (${this.reconnectConfig.maxAttempts})`); + if (decision.action === 'fail') { + logger.error(`Gateway reconnect failed: max attempts reached (${decision.maxAttempts})`); this.setStatus({ state: 'error', error: 'Failed to reconnect after maximum attempts', @@ -1466,14 +1090,9 @@ export class GatewayManager extends EventEmitter { return; } - // Calculate delay with exponential backoff - const delay = Math.min( - this.reconnectConfig.baseDelay * Math.pow(2, this.reconnectAttempts), - this.reconnectConfig.maxDelay - ); - - this.reconnectAttempts++; - logger.warn(`Scheduling Gateway reconnect attempt ${this.reconnectAttempts}/${this.reconnectConfig.maxAttempts} in ${delay}ms`); + const { delay, nextAttempt, maxAttempts } = decision; + this.reconnectAttempts = nextAttempt; + logger.warn(`Scheduling Gateway reconnect attempt ${nextAttempt}/${maxAttempts} in ${delay}ms`); this.setStatus({ state: 'reconnecting', diff --git a/electron/gateway/process-policy.ts b/electron/gateway/process-policy.ts index ece3b65..607e13f 100644 --- a/electron/gateway/process-policy.ts +++ b/electron/gateway/process-policy.ts @@ -1,3 +1,15 @@ +export interface ReconnectConfig { + maxAttempts: number; + baseDelay: number; + maxDelay: number; +} + +export const DEFAULT_RECONNECT_CONFIG: ReconnectConfig = { + maxAttempts: 10, + baseDelay: 1000, + maxDelay: 30000, +}; + export function nextLifecycleEpoch(currentEpoch: number): number { return currentEpoch + 1; } @@ -22,6 +34,53 @@ export function getReconnectSkipReason(context: ReconnectAttemptContext): string return null; } +export interface ReconnectScheduleContext { + shouldReconnect: boolean; + hasReconnectTimer: boolean; + reconnectAttempts: number; + maxAttempts: number; + baseDelay: number; + maxDelay: number; +} + +export type ReconnectScheduleDecision = + | { action: 'skip'; reason: string } + | { action: 'already-scheduled' } + | { action: 'fail'; attempts: number; maxAttempts: number } + | { action: 'schedule'; nextAttempt: number; maxAttempts: number; delay: number }; + +export function getReconnectScheduleDecision( + context: ReconnectScheduleContext, +): ReconnectScheduleDecision { + if (!context.shouldReconnect) { + return { action: 'skip', reason: 'auto-reconnect disabled' }; + } + + if (context.hasReconnectTimer) { + return { action: 'already-scheduled' }; + } + + if (context.reconnectAttempts >= context.maxAttempts) { + return { + action: 'fail', + attempts: context.reconnectAttempts, + maxAttempts: context.maxAttempts, + }; + } + + const delay = Math.min( + context.baseDelay * Math.pow(2, context.reconnectAttempts), + context.maxDelay, + ); + + return { + action: 'schedule', + nextAttempt: context.reconnectAttempts + 1, + maxAttempts: context.maxAttempts, + delay, + }; +} + export type GatewayLifecycleState = 'stopped' | 'starting' | 'running' | 'error' | 'reconnecting'; export interface RestartDeferralContext { diff --git a/electron/gateway/startup-stderr.ts b/electron/gateway/startup-stderr.ts new file mode 100644 index 0000000..b23f089 --- /dev/null +++ b/electron/gateway/startup-stderr.ts @@ -0,0 +1,42 @@ +export type GatewayStderrClassification = { + level: 'drop' | 'debug' | 'warn'; + normalized: string; +}; + +const MAX_STDERR_LINES = 120; + +export function classifyGatewayStderrMessage(message: string): GatewayStderrClassification { + const msg = message.trim(); + if (!msg) { + return { level: 'drop', normalized: msg }; + } + + // Known noisy lines that are not actionable for Gateway lifecycle debugging. + if (msg.includes('openclaw-control-ui') && msg.includes('token_mismatch')) { + return { level: 'drop', normalized: msg }; + } + if (msg.includes('closed before connect') && msg.includes('token mismatch')) { + return { level: 'drop', normalized: msg }; + } + + // Downgrade frequent non-fatal noise. + if (msg.includes('ExperimentalWarning')) return { level: 'debug', normalized: msg }; + if (msg.includes('DeprecationWarning')) return { level: 'debug', normalized: msg }; + if (msg.includes('Debugger attached')) return { level: 'debug', normalized: msg }; + + // Electron restricts NODE_OPTIONS in packaged apps; this is expected and harmless. + if (msg.includes('node: --require is not allowed in NODE_OPTIONS')) { + return { level: 'debug', normalized: msg }; + } + + return { level: 'warn', normalized: msg }; +} + +export function recordGatewayStartupStderrLine(lines: string[], line: string): void { + const normalized = line.trim(); + if (!normalized) return; + lines.push(normalized); + if (lines.length > MAX_STDERR_LINES) { + lines.splice(0, lines.length - MAX_STDERR_LINES); + } +} diff --git a/electron/gateway/supervisor.ts b/electron/gateway/supervisor.ts new file mode 100644 index 0000000..83c9523 --- /dev/null +++ b/electron/gateway/supervisor.ts @@ -0,0 +1,359 @@ +import { app, utilityProcess } from 'electron'; +import path from 'path'; +import { existsSync } from 'fs'; +import WebSocket from 'ws'; +import { getOpenClawDir, getOpenClawEntryPath } from '../utils/paths'; +import { getUvMirrorEnv } from '../utils/uv-env'; +import { isPythonReady, setupManagedPython } from '../utils/uv-setup'; +import { logger } from '../utils/logger'; + +export function warmupManagedPythonReadiness(): void { + void isPythonReady().then((pythonReady) => { + if (!pythonReady) { + logger.info('Python environment missing or incomplete, attempting background repair...'); + void setupManagedPython().catch((err) => { + logger.error('Background Python repair failed:', err); + }); + } + }).catch((err) => { + logger.error('Failed to check Python environment:', err); + }); +} + +export function isTransientGatewayStartError(error: unknown): boolean { + const errMsg = String(error); + return ( + errMsg.includes('WebSocket closed before handshake') || + errMsg.includes('ECONNREFUSED') || + errMsg.includes('Gateway process exited before becoming ready') || + errMsg.includes('Timed out waiting for connect.challenge') || + errMsg.includes('Connect handshake timeout') + ); +} + +export async function terminateOwnedGatewayProcess(child: Electron.UtilityProcess): Promise { + let exited = false; + + await new Promise((resolve) => { + child.once('exit', () => { + exited = true; + resolve(); + }); + + const pid = child.pid; + logger.info(`Sending kill to Gateway process (pid=${pid ?? 'unknown'})`); + try { + child.kill(); + } catch { + // ignore if already exited + } + + const timeout = setTimeout(() => { + if (!exited) { + logger.warn(`Gateway did not exit in time, force-killing (pid=${pid ?? 'unknown'})`); + if (pid) { + try { + process.kill(pid, 'SIGKILL'); + } catch { + // ignore + } + } + } + resolve(); + }, 5000); + + child.once('exit', () => { + clearTimeout(timeout); + }); + }); +} + +export async function unloadLaunchctlGatewayService(): Promise { + if (process.platform !== 'darwin') return; + + try { + const uid = process.getuid?.(); + if (uid === undefined) return; + + const launchdLabel = 'ai.openclaw.gateway'; + const serviceTarget = `gui/${uid}/${launchdLabel}`; + const cp = await import('child_process'); + const fsPromises = await import('fs/promises'); + const os = await import('os'); + + const loaded = await new Promise((resolve) => { + cp.exec(`launchctl print ${serviceTarget}`, { timeout: 5000 }, (err) => { + resolve(!err); + }); + }); + + if (!loaded) return; + + logger.info(`Unloading launchctl service ${serviceTarget} to prevent auto-respawn`); + await new Promise((resolve) => { + cp.exec(`launchctl bootout ${serviceTarget}`, { timeout: 10000 }, (err) => { + if (err) { + logger.warn(`Failed to bootout launchctl service: ${err.message}`); + } else { + logger.info('Successfully unloaded launchctl gateway service'); + } + resolve(); + }); + }); + + await new Promise((resolve) => setTimeout(resolve, 2000)); + + try { + const plistPath = path.join(os.homedir(), 'Library', 'LaunchAgents', `${launchdLabel}.plist`); + await fsPromises.access(plistPath); + await fsPromises.unlink(plistPath); + logger.info(`Removed legacy launchd plist to prevent reload on next login: ${plistPath}`); + } catch { + // File doesn't exist or can't be removed -- not fatal + } + } catch (err) { + logger.warn('Error while unloading launchctl gateway service:', err); + } +} + +export async function waitForPortFree(port: number, timeoutMs = 30000): Promise { + const net = await import('net'); + const start = Date.now(); + const pollInterval = 500; + let logged = false; + + while (Date.now() - start < timeoutMs) { + const available = await new Promise((resolve) => { + const server = net.createServer(); + server.once('error', () => resolve(false)); + server.once('listening', () => { + server.close(() => resolve(true)); + }); + server.listen(port, '127.0.0.1'); + }); + + if (available) { + const elapsed = Date.now() - start; + if (elapsed > pollInterval) { + logger.info(`Port ${port} became available after ${elapsed}ms`); + } + return; + } + + if (!logged) { + logger.info(`Waiting for port ${port} to become available (Windows TCP TIME_WAIT)...`); + logged = true; + } + await new Promise((resolve) => setTimeout(resolve, pollInterval)); + } + + logger.warn(`Port ${port} still occupied after ${timeoutMs}ms, proceeding anyway`); +} + +async function getListeningProcessIds(port: number): Promise { + const cmd = process.platform === 'win32' + ? `netstat -ano | findstr :${port}` + : `lsof -i :${port} -sTCP:LISTEN -t`; + + const cp = await import('child_process'); + const { stdout } = await new Promise<{ stdout: string }>((resolve) => { + cp.exec(cmd, { timeout: 5000, windowsHide: true }, (err, stdout) => { + if (err) { + resolve({ stdout: '' }); + } else { + resolve({ stdout }); + } + }); + }); + + if (!stdout.trim()) { + return []; + } + + if (process.platform === 'win32') { + const pids: string[] = []; + for (const line of stdout.trim().split(/\r?\n/)) { + const parts = line.trim().split(/\s+/); + if (parts.length >= 5 && parts[3] === 'LISTENING') { + pids.push(parts[4]); + } + } + return [...new Set(pids)]; + } + + return [...new Set(stdout.trim().split(/\r?\n/).map((value) => value.trim()).filter(Boolean))]; +} + +async function terminateOrphanedProcessIds(port: number, pids: string[]): Promise { + logger.info(`Found orphaned process listening on port ${port} (PIDs: ${pids.join(', ')}), attempting to kill...`); + + if (process.platform === 'darwin') { + await unloadLaunchctlGatewayService(); + } + + for (const pid of pids) { + try { + if (process.platform === 'win32') { + const cp = await import('child_process'); + await new Promise((resolve) => { + cp.exec( + `taskkill /F /PID ${pid} /T`, + { timeout: 5000, windowsHide: true }, + () => resolve(), + ); + }); + } else { + process.kill(parseInt(pid, 10), 'SIGTERM'); + } + } catch { + // Ignore processes that have already exited. + } + } + + await new Promise((resolve) => setTimeout(resolve, process.platform === 'win32' ? 2000 : 3000)); + + if (process.platform !== 'win32') { + for (const pid of pids) { + try { + process.kill(parseInt(pid, 10), 0); + process.kill(parseInt(pid, 10), 'SIGKILL'); + } catch { + // Already exited. + } + } + await new Promise((resolve) => setTimeout(resolve, 1000)); + } +} + +export async function findExistingGatewayProcess(options: { + port: number; + ownedPid?: number; +}): Promise<{ port: number; externalToken?: string } | null> { + const { port, ownedPid } = options; + + try { + try { + const pids = await getListeningProcessIds(port); + if (pids.length > 0 && (!ownedPid || !pids.includes(String(ownedPid)))) { + await terminateOrphanedProcessIds(port, pids); + return null; + } + } catch (err) { + logger.warn('Error checking for existing process on port:', err); + } + + return await new Promise<{ port: number; externalToken?: string } | null>((resolve) => { + const testWs = new WebSocket(`ws://localhost:${port}/ws`); + const timeout = setTimeout(() => { + testWs.close(); + resolve(null); + }, 2000); + + testWs.on('open', () => { + clearTimeout(timeout); + testWs.close(); + resolve({ port }); + }); + + testWs.on('error', () => { + clearTimeout(timeout); + resolve(null); + }); + }); + } catch { + return null; + } +} + +export async function runOpenClawDoctorRepair(): Promise { + const openclawDir = getOpenClawDir(); + const entryScript = getOpenClawEntryPath(); + if (!existsSync(entryScript)) { + logger.error(`Cannot run OpenClaw doctor repair: entry script not found at ${entryScript}`); + return false; + } + + const platform = process.platform; + const arch = process.arch; + const target = `${platform}-${arch}`; + const binPath = app.isPackaged + ? path.join(process.resourcesPath, 'bin') + : path.join(process.cwd(), 'resources', 'bin', target); + const binPathExists = existsSync(binPath); + const finalPath = binPathExists + ? `${binPath}${path.delimiter}${process.env.PATH || ''}` + : process.env.PATH || ''; + + const uvEnv = await getUvMirrorEnv(); + const doctorArgs = ['doctor', '--fix', '--yes', '--non-interactive']; + logger.info( + `Running OpenClaw doctor repair (entry="${entryScript}", args="${doctorArgs.join(' ')}", cwd="${openclawDir}", bundledBin=${binPathExists ? 'yes' : 'no'})`, + ); + + return await new Promise((resolve) => { + const forkEnv: Record = { + ...process.env, + PATH: finalPath, + ...uvEnv, + OPENCLAW_NO_RESPAWN: '1', + }; + + const child = utilityProcess.fork(entryScript, doctorArgs, { + cwd: openclawDir, + stdio: 'pipe', + env: forkEnv as NodeJS.ProcessEnv, + }); + + let settled = false; + const finish = (ok: boolean) => { + if (settled) return; + settled = true; + resolve(ok); + }; + + const timeout = setTimeout(() => { + logger.error('OpenClaw doctor repair timed out after 120000ms'); + try { + child.kill(); + } catch { + // ignore + } + finish(false); + }, 120000); + + child.on('error', (err) => { + clearTimeout(timeout); + logger.error('Failed to spawn OpenClaw doctor repair process:', err); + finish(false); + }); + + child.stdout?.on('data', (data) => { + const raw = data.toString(); + for (const line of raw.split(/\r?\n/)) { + const normalized = line.trim(); + if (!normalized) continue; + logger.debug(`[Gateway doctor stdout] ${normalized}`); + } + }); + + child.stderr?.on('data', (data) => { + const raw = data.toString(); + for (const line of raw.split(/\r?\n/)) { + const normalized = line.trim(); + if (!normalized) continue; + logger.warn(`[Gateway doctor stderr] ${normalized}`); + } + }); + + child.on('exit', (code: number) => { + clearTimeout(timeout); + if (code === 0) { + logger.info('OpenClaw doctor repair completed successfully'); + finish(true); + return; + } + logger.warn(`OpenClaw doctor repair exited (code=${code})`); + finish(false); + }); + }); +} diff --git a/electron/gateway/ws-client.ts b/electron/gateway/ws-client.ts index 2d1ff77..2addefe 100644 --- a/electron/gateway/ws-client.ts +++ b/electron/gateway/ws-client.ts @@ -8,24 +8,51 @@ import { export async function probeGatewayReady( port: number, - timeoutMs = 2000, + timeoutMs = 1500, ): Promise { return await new Promise((resolve) => { const testWs = new WebSocket(`ws://localhost:${port}/ws`); + let settled = false; + + const resolveOnce = (value: boolean) => { + if (settled) return; + settled = true; + clearTimeout(timeout); + try { + testWs.close(); + } catch { + // ignore + } + resolve(value); + }; + const timeout = setTimeout(() => { - testWs.close(); - resolve(false); + resolveOnce(false); }, timeoutMs); testWs.on('open', () => { - clearTimeout(timeout); - testWs.close(); - resolve(true); + // Do not resolve on plain socket open. The gateway can accept the TCP/WebSocket + // connection before it is ready to issue protocol challenges, which previously + // caused a false "ready" result and then a full connect() stall. + }); + + testWs.on('message', (data) => { + try { + const message = JSON.parse(data.toString()) as { type?: string; event?: string }; + if (message.type === 'event' && message.event === 'connect.challenge') { + resolveOnce(true); + } + } catch { + // ignore malformed probe payloads + } }); testWs.on('error', () => { - clearTimeout(timeout); - resolve(false); + resolveOnce(false); + }); + + testWs.on('close', () => { + resolveOnce(false); }); }); } diff --git a/electron/main/index.ts b/electron/main/index.ts index 2e150d7..55e825d 100644 --- a/electron/main/index.ts +++ b/electron/main/index.ts @@ -230,40 +230,8 @@ async function initialize(): Promise { logger.warn('Failed to install built-in skills:', error); }); - // Start Gateway automatically (this seeds missing bootstrap files with full templates) - const gatewayAutoStart = await getSetting('gatewayAutoStart'); - if (gatewayAutoStart) { - try { - logger.debug('Auto-starting Gateway...'); - await gatewayManager.start(); - logger.info('Gateway auto-start succeeded'); - } catch (error) { - logger.error('Gateway auto-start failed:', error); - mainWindow?.webContents.send('gateway:error', String(error)); - } - } else { - logger.info('Gateway auto-start disabled in settings'); - } - - // Merge ClawX context snippets into the workspace bootstrap files. - // The gateway seeds workspace files asynchronously after its HTTP server - // is ready, so ensureClawXContext will retry until the target files appear. - void ensureClawXContext().catch((error) => { - logger.warn('Failed to merge ClawX context into workspace:', error); - }); - - // Auto-install openclaw CLI and shell completions (non-blocking). - void autoInstallCliIfNeeded((installedPath) => { - mainWindow?.webContents.send('openclaw:cli-installed', installedPath); - }).then(() => { - generateCompletionCache(); - installCompletionToProfile(); - }).catch((error) => { - logger.warn('CLI auto-install failed:', error); - }); - - // Re-apply ClawX context after every gateway restart because the gateway - // may re-seed workspace files with clean templates (losing ClawX markers). + // Bridge gateway and host-side events before any auto-start logic runs, so + // renderer subscribers observe the full startup lifecycle. gatewayManager.on('status', (status: { state: string }) => { hostEventBus.emit('gateway:status', status); if (status.state === 'running') { @@ -320,6 +288,38 @@ async function initialize(): Promise { whatsAppLoginManager.on('error', (error) => { hostEventBus.emit('channel:whatsapp-error', error); }); + + // Start Gateway automatically (this seeds missing bootstrap files with full templates) + const gatewayAutoStart = await getSetting('gatewayAutoStart'); + if (gatewayAutoStart) { + try { + logger.debug('Auto-starting Gateway...'); + await gatewayManager.start(); + logger.info('Gateway auto-start succeeded'); + } catch (error) { + logger.error('Gateway auto-start failed:', error); + mainWindow?.webContents.send('gateway:error', String(error)); + } + } else { + logger.info('Gateway auto-start disabled in settings'); + } + + // Merge ClawX context snippets into the workspace bootstrap files. + // The gateway seeds workspace files asynchronously after its HTTP server + // is ready, so ensureClawXContext will retry until the target files appear. + void ensureClawXContext().catch((error) => { + logger.warn('Failed to merge ClawX context into workspace:', error); + }); + + // Auto-install openclaw CLI and shell completions (non-blocking). + void autoInstallCliIfNeeded((installedPath) => { + mainWindow?.webContents.send('openclaw:cli-installed', installedPath); + }).then(() => { + generateCompletionCache(); + installCompletionToProfile(); + }).catch((error) => { + logger.warn('CLI auto-install failed:', error); + }); } // When a second instance is launched, focus the existing window instead. diff --git a/electron/utils/channel-config.ts b/electron/utils/channel-config.ts index 1732e41..d68c1de 100644 --- a/electron/utils/channel-config.ts +++ b/electron/utils/channel-config.ts @@ -102,21 +102,6 @@ export async function saveChannelConfig( } } - // DingTalk is a channel plugin; make sure it's explicitly allowed. - // Newer OpenClaw versions may not load non-bundled plugins when allowlist is empty. - if (channelType === 'dingtalk') { - if (!currentConfig.plugins) { - currentConfig.plugins = {}; - } - currentConfig.plugins.enabled = true; - const allow = Array.isArray(currentConfig.plugins.allow) - ? currentConfig.plugins.allow as string[] - : []; - if (!allow.includes('dingtalk')) { - currentConfig.plugins.allow = [...allow, 'dingtalk']; - } - } - // Plugin-based channels (e.g. WhatsApp) go under plugins.entries, not channels if (PLUGIN_CHANNELS.includes(channelType)) { if (!currentConfig.plugins) {