This commit is contained in:
paisley
2026-03-07 17:27:31 +08:00
parent 990edec63c
commit c4f3749516
9 changed files with 611 additions and 487 deletions

View File

@@ -17,6 +17,9 @@ export async function handleAppRoutes(
});
res.write(': connected\n\n');
ctx.eventBus.addSseClient(res);
// Send a current-state snapshot immediately so renderer subscribers do not
// miss lifecycle transitions that happened before the SSE connection opened.
res.write(`event: gateway:status\ndata: ${JSON.stringify(ctx.gatewayManager.getStatus())}\n\n`);
return true;
}

View File

@@ -6,6 +6,7 @@ import { getApiKey, getDefaultProvider, getProvider } from '../utils/secure-stor
import { getProviderEnvVar, getKeyableProviderTypes } from '../utils/provider-registry';
import { getOpenClawDir, getOpenClawEntryPath, isOpenClawPresent } from '../utils/paths';
import { getUvMirrorEnv } from '../utils/uv-env';
import { listConfiguredChannels } from '../utils/channel-config';
import { syncGatewayTokenToConfig, syncBrowserConfigToOpenClaw, sanitizeOpenClawConfig } from '../utils/openclaw-auth';
import { buildProxyEnv, resolveProxySettings } from '../utils/proxy';
import { syncProxyConfigToOpenClaw } from '../utils/openclaw-proxy';
@@ -21,6 +22,7 @@ export interface GatewayLaunchContext {
binPathExists: boolean;
loadedProviderKeyCount: number;
proxySummary: string;
channelStartupSummary: string;
}
export async function syncGatewayConfigBeforeLaunch(
@@ -88,6 +90,32 @@ async function loadProviderEnv(): Promise<{ providerEnv: Record<string, string>;
return { providerEnv, loadedProviderKeyCount };
}
async function resolveChannelStartupPolicy(): Promise<{
skipChannels: boolean;
channelStartupSummary: string;
}> {
try {
const configuredChannels = await listConfiguredChannels();
if (configuredChannels.length === 0) {
return {
skipChannels: true,
channelStartupSummary: 'skipped(no configured channels)',
};
}
return {
skipChannels: false,
channelStartupSummary: `enabled(${configuredChannels.join(',')})`,
};
} catch (error) {
logger.warn('Failed to determine configured channels for gateway launch:', error);
return {
skipChannels: false,
channelStartupSummary: 'enabled(unknown)',
};
}
}
export async function prepareGatewayLaunchContext(port: number): Promise<GatewayLaunchContext> {
const openclawDir = getOpenClawDir();
const entryScript = getOpenClawEntryPath();
@@ -118,6 +146,7 @@ export async function prepareGatewayLaunchContext(port: number): Promise<Gateway
: process.env.PATH || '';
const { providerEnv, loadedProviderKeyCount } = await loadProviderEnv();
const { skipChannels, channelStartupSummary } = await resolveChannelStartupPolicy();
const uvEnv = await getUvMirrorEnv();
const proxyEnv = buildProxyEnv(appSettings);
const resolvedProxy = resolveProxySettings(appSettings);
@@ -133,8 +162,8 @@ export async function prepareGatewayLaunchContext(port: number): Promise<Gateway
...uvEnv,
...proxyEnv,
OPENCLAW_GATEWAY_TOKEN: appSettings.gatewayToken,
OPENCLAW_SKIP_CHANNELS: '',
CLAWDBOT_SKIP_CHANNELS: '',
OPENCLAW_SKIP_CHANNELS: skipChannels ? '1' : '',
CLAWDBOT_SKIP_CHANNELS: skipChannels ? '1' : '',
OPENCLAW_NO_RESPAWN: '1',
};
@@ -148,5 +177,6 @@ export async function prepareGatewayLaunchContext(port: number): Promise<Gateway
binPathExists,
loadedProviderKeyCount,
proxySummary,
channelStartupSummary,
};
}

View File

@@ -9,22 +9,22 @@ import { existsSync, writeFileSync } from 'fs';
import WebSocket from 'ws';
import { PORTS } from '../utils/config';
import {
getOpenClawDir,
getOpenClawEntryPath,
appendNodeRequireToNodeOptions,
} from '../utils/paths';
import { getSetting } from '../utils/store';
import { JsonRpcNotification, isNotification, isResponse } from './protocol';
import { logger } from '../utils/logger';
import { isPythonReady, setupManagedPython } from '../utils/uv-setup';
import {
loadOrCreateDeviceIdentity,
type DeviceIdentity,
} from '../utils/device-identity';
import { shouldAttemptConfigAutoRepair } from './startup-recovery';
import {
DEFAULT_RECONNECT_CONFIG,
type ReconnectConfig,
type GatewayLifecycleState,
getDeferredRestartAction,
getReconnectScheduleDecision,
getReconnectSkipReason,
isLifecycleSuperseded,
nextLifecycleEpoch,
@@ -40,6 +40,16 @@ import { dispatchJsonRpcNotification, dispatchProtocolEvent } from './event-disp
import { GatewayStateController } from './state';
import { prepareGatewayLaunchContext } from './config-sync';
import { buildGatewayConnectFrame, probeGatewayReady } from './ws-client';
import {
findExistingGatewayProcess,
isTransientGatewayStartError,
runOpenClawDoctorRepair,
terminateOwnedGatewayProcess,
unloadLaunchctlGatewayService,
waitForPortFree,
warmupManagedPythonReadiness,
} from './supervisor';
import { classifyGatewayStderrMessage, recordGatewayStartupStderrLine } from './startup-stderr';
/**
* Gateway connection status
@@ -68,21 +78,6 @@ export interface GatewayManagerEvents {
'chat:message': (data: { message: unknown }) => void;
}
/**
* Reconnection configuration
*/
interface ReconnectConfig {
maxAttempts: number;
baseDelay: number;
maxDelay: number;
}
const DEFAULT_RECONNECT_CONFIG: ReconnectConfig = {
maxAttempts: 10,
baseDelay: 1000,
maxDelay: 30000,
};
// getNodeExecutablePath() removed: utilityProcess.fork() handles process isolation
// natively on all platforms (no dock icon on macOS, no console on Windows).
@@ -257,40 +252,6 @@ export class GatewayManager extends EventEmitter {
return sanitized;
}
private formatExit(code: number | null, signal: NodeJS.Signals | null): string {
if (code !== null) return `code=${code}`;
if (signal) return `signal=${signal}`;
return 'code=null signal=null';
}
private classifyStderrMessage(message: string): { level: 'drop' | 'debug' | 'warn'; normalized: string } {
const msg = message.trim();
if (!msg) return { level: 'drop', normalized: msg };
// Known noisy lines that are not actionable for Gateway lifecycle debugging.
if (msg.includes('openclaw-control-ui') && msg.includes('token_mismatch')) return { level: 'drop', normalized: msg };
if (msg.includes('closed before connect') && msg.includes('token mismatch')) return { level: 'drop', normalized: msg };
// Downgrade frequent non-fatal noise.
if (msg.includes('ExperimentalWarning')) return { level: 'debug', normalized: msg };
if (msg.includes('DeprecationWarning')) return { level: 'debug', normalized: msg };
if (msg.includes('Debugger attached')) return { level: 'debug', normalized: msg };
// Electron restricts NODE_OPTIONS in packaged apps; this is expected and harmless.
if (msg.includes('NODE_OPTIONs are not supported in packaged apps')) return { level: 'debug', normalized: msg };
return { level: 'warn', normalized: msg };
}
private recordStartupStderrLine(line: string): void {
const normalized = line.trim();
if (!normalized) return;
this.recentStartupStderrLines.push(normalized);
const MAX_STDERR_LINES = 120;
if (this.recentStartupStderrLines.length > MAX_STDERR_LINES) {
this.recentStartupStderrLines.splice(0, this.recentStartupStderrLines.length - MAX_STDERR_LINES);
}
}
private bumpLifecycleEpoch(reason: string): number {
this.lifecycleEpoch = nextLifecycleEpoch(this.lifecycleEpoch);
logger.debug(`Gateway lifecycle epoch advanced to ${this.lifecycleEpoch} (${reason})`);
@@ -406,16 +367,7 @@ export class GatewayManager extends EventEmitter {
// Check if Python environment is ready (self-healing) asynchronously.
// Fire-and-forget: only needs to run once, not on every retry.
void isPythonReady().then(pythonReady => {
if (!pythonReady) {
logger.info('Python environment missing or incomplete, attempting background repair...');
void setupManagedPython().catch(err => {
logger.error('Background Python repair failed:', err);
});
}
}).catch(err => {
logger.error('Failed to check Python environment:', err);
});
warmupManagedPythonReadiness();
try {
let startAttempts = 0;
@@ -428,7 +380,10 @@ export class GatewayManager extends EventEmitter {
try {
// Check if Gateway is already running
logger.debug('Checking for existing Gateway...');
const existing = await this.findExistingGateway();
const existing = await findExistingGatewayProcess({
port: this.status.port,
ownedPid: this.process?.pid,
});
this.assertLifecycleEpoch(startEpoch, 'start/find-existing');
if (existing) {
logger.debug(`Found existing Gateway on port ${existing.port}`);
@@ -446,7 +401,7 @@ export class GatewayManager extends EventEmitter {
// after the previous Gateway process exits, preventing the new one
// from binding. Wait for the port to be free before proceeding.
if (process.platform === 'win32') {
await this.waitForPortFree(this.status.port);
await waitForPortFree(this.status.port);
this.assertLifecycleEpoch(startEpoch, 'start/wait-port');
}
@@ -475,7 +430,7 @@ export class GatewayManager extends EventEmitter {
logger.warn(
'Detected invalid OpenClaw config during Gateway startup; running doctor repair before retry'
);
const repaired = await this.runOpenClawDoctorRepair();
const repaired = await runOpenClawDoctorRepair();
if (repaired) {
logger.info('OpenClaw doctor repair completed; retrying Gateway startup');
this.setStatus({ state: 'starting', error: undefined, reconnectAttempts: 0 });
@@ -486,12 +441,7 @@ export class GatewayManager extends EventEmitter {
// Retry on transient connect errors
const errMsg = String(error);
const isTransientError =
errMsg.includes('WebSocket closed before handshake') ||
errMsg.includes('ECONNREFUSED') ||
errMsg.includes('Gateway process exited before becoming ready') ||
errMsg.includes('Timed out waiting for connect.challenge') ||
errMsg.includes('Connect handshake timeout');
const isTransientError = isTransientGatewayStartError(error);
if (startAttempts < MAX_START_ATTEMPTS && isTransientError) {
logger.warn(`Transient start error: ${errMsg}. Retrying... (${startAttempts}/${MAX_START_ATTEMPTS})`);
@@ -551,34 +501,7 @@ export class GatewayManager extends EventEmitter {
// Kill process
if (this.process && this.ownsProcess) {
const child = this.process;
// UtilityProcess doesn't expose exitCode/signalCode — track exit via event.
let exited = false;
await new Promise<void>((resolve) => {
child.once('exit', () => {
exited = true;
resolve();
});
const pid = child.pid;
logger.info(`Sending kill to Gateway process (pid=${pid ?? 'unknown'})`);
try { child.kill(); } catch { /* ignore if already exited */ }
// Force kill after timeout via OS-level kill on the PID
const timeout = setTimeout(() => {
if (!exited) {
logger.warn(`Gateway did not exit in time, force-killing (pid=${pid ?? 'unknown'})`);
if (pid) {
try { process.kill(pid, 'SIGKILL'); } catch { /* ignore */ }
}
}
resolve();
}, 5000);
child.once('exit', () => {
clearTimeout(timeout);
});
});
await terminateOwnedGatewayProcess(child);
if (this.process === child) {
this.process = null;
@@ -747,322 +670,13 @@ export class GatewayManager extends EventEmitter {
}
}
/**
* Unload the system-managed openclaw gateway launchctl service if it is
* loaded. Without this, killing the process only causes launchctl to
* respawn it, leading to an infinite reconnect loop.
*/
private async unloadLaunchctlService(): Promise<void> {
if (process.platform !== 'darwin') return;
try {
const uid = process.getuid?.();
if (uid === undefined) return;
const LAUNCHD_LABEL = 'ai.openclaw.gateway';
const serviceTarget = `gui/${uid}/${LAUNCHD_LABEL}`;
const loaded = await new Promise<boolean>((resolve) => {
import('child_process').then(cp => {
cp.exec(`launchctl print ${serviceTarget}`, { timeout: 5000 }, (err) => {
resolve(!err);
});
}).catch(() => resolve(false));
});
if (!loaded) return;
logger.info(`Unloading launchctl service ${serviceTarget} to prevent auto-respawn`);
await new Promise<void>((resolve) => {
import('child_process').then(cp => {
cp.exec(`launchctl bootout ${serviceTarget}`, { timeout: 10000 }, (err) => {
if (err) {
logger.warn(`Failed to bootout launchctl service: ${err.message}`);
} else {
logger.info('Successfully unloaded launchctl gateway service');
}
resolve();
});
}).catch(() => resolve());
});
await new Promise(r => setTimeout(r, 2000));
// Remove the plist so the service won't reload on next login.
try {
const { homedir } = await import('os');
const plistPath = path.join(homedir(), 'Library', 'LaunchAgents', `${LAUNCHD_LABEL}.plist`);
const { access, unlink } = await import('fs/promises');
await access(plistPath);
await unlink(plistPath);
logger.info(`Removed legacy launchd plist to prevent reload on next login: ${plistPath}`);
} catch {
// File doesn't exist or can't be removed -- not fatal
}
} catch (err) {
logger.warn('Error while unloading launchctl gateway service:', err);
}
}
/**
* Find existing Gateway process by attempting a WebSocket connection
*/
private async findExistingGateway(): Promise<{ port: number, externalToken?: string } | null> {
try {
const port = PORTS.OPENCLAW_GATEWAY;
try {
// Platform-specific command to find processes listening on the gateway port.
// We use native commands (netstat on Windows) to avoid triggering AV blocks
// that flag "powershell -WindowStyle Hidden" as malware behavior.
// windowsHide: true in cp.exec natively prevents the black command window.
const cmd = process.platform === 'win32'
? `netstat -ano | findstr :${port}`
: `lsof -i :${port} -sTCP:LISTEN -t`;
const { stdout } = await new Promise<{ stdout: string }>((resolve, reject) => {
import('child_process').then(cp => {
cp.exec(cmd, { timeout: 5000, windowsHide: true }, (err, stdout) => {
if (err) resolve({ stdout: '' });
else resolve({ stdout });
});
}).catch(reject);
});
if (stdout.trim()) {
// Parse netstat or lsof output to extract PIDs
let pids: string[] = [];
if (process.platform === 'win32') {
// netstat -ano output format:
// TCP 127.0.0.1:3000 0.0.0.0:0 LISTENING 12345
const lines = stdout.trim().split(/\r?\n/);
for (const line of lines) {
const parts = line.trim().split(/\s+/);
if (parts.length >= 5 && parts[3] === 'LISTENING') {
pids.push(parts[4]);
}
}
} else {
pids = stdout.trim().split(/\r?\n/).map(s => s.trim()).filter(Boolean);
}
// Remove duplicate PIDs
pids = [...new Set(pids)];
if (pids.length > 0) {
if (!this.process || !pids.includes(String(this.process.pid))) {
logger.info(`Found orphaned process listening on port ${port} (PIDs: ${pids.join(', ')}), attempting to kill...`);
// Unload the launchctl service first so macOS doesn't auto-
// respawn the process we're about to kill.
if (process.platform === 'darwin') {
await this.unloadLaunchctlService();
}
// Terminate orphaned processes
for (const pid of pids) {
try {
if (process.platform === 'win32') {
// Use taskkill with windowsHide: true. This natively hides the console
// flash without needing PowerShell, avoiding AV alerts.
import('child_process').then(cp => {
cp.exec(
`taskkill /F /PID ${pid} /T`,
{ timeout: 5000, windowsHide: true },
() => { }
);
}).catch(() => { });
} else {
// SIGTERM first so the gateway can clean up its lock file.
process.kill(parseInt(pid), 'SIGTERM');
}
} catch { /* ignore */ }
}
await new Promise(r => setTimeout(r, process.platform === 'win32' ? 2000 : 3000));
// SIGKILL any survivors (Unix only — Windows taskkill /F is already forceful)
if (process.platform !== 'win32') {
for (const pid of pids) {
try { process.kill(parseInt(pid), 0); process.kill(parseInt(pid), 'SIGKILL'); } catch { /* already exited */ }
}
await new Promise(r => setTimeout(r, 1000));
}
return null;
}
}
}
} catch (err) {
logger.warn('Error checking for existing process on port:', err);
}
// Try a quick WebSocket connection to check if gateway is listening
return await new Promise<{ port: number, externalToken?: string } | null>((resolve) => {
const testWs = new WebSocket(`ws://localhost:${port}/ws`);
const timeout = setTimeout(() => {
testWs.close();
resolve(null);
}, 2000);
testWs.on('open', () => {
clearTimeout(timeout);
testWs.close();
resolve({ port });
});
testWs.on('error', () => {
clearTimeout(timeout);
resolve(null);
});
});
} catch {
// Gateway not running
}
return null;
}
/**
* Attempt to repair invalid OpenClaw config using the built-in doctor command.
* Returns true when doctor exits successfully.
*/
private async runOpenClawDoctorRepair(): Promise<boolean> {
const openclawDir = getOpenClawDir();
const entryScript = getOpenClawEntryPath();
if (!existsSync(entryScript)) {
logger.error(`Cannot run OpenClaw doctor repair: entry script not found at ${entryScript}`);
return false;
}
const platform = process.platform;
const arch = process.arch;
const target = `${platform}-${arch}`;
const binPath = app.isPackaged
? path.join(process.resourcesPath, 'bin')
: path.join(process.cwd(), 'resources', 'bin', target);
const binPathExists = existsSync(binPath);
const finalPath = binPathExists
? `${binPath}${path.delimiter}${process.env.PATH || ''}`
: process.env.PATH || '';
const uvEnv = await getUvMirrorEnv();
const doctorArgs = ['doctor', '--fix', '--yes', '--non-interactive'];
logger.info(
`Running OpenClaw doctor repair (entry="${entryScript}", args="${doctorArgs.join(' ')}", cwd="${openclawDir}", bundledBin=${binPathExists ? 'yes' : 'no'})`
);
return new Promise<boolean>((resolve) => {
const forkEnv: Record<string, string | undefined> = {
...process.env,
PATH: finalPath,
...uvEnv,
OPENCLAW_NO_RESPAWN: '1',
};
const child = utilityProcess.fork(entryScript, doctorArgs, {
cwd: openclawDir,
stdio: 'pipe',
env: forkEnv as NodeJS.ProcessEnv,
});
let settled = false;
const finish = (ok: boolean) => {
if (settled) return;
settled = true;
resolve(ok);
};
const timeout = setTimeout(() => {
logger.error('OpenClaw doctor repair timed out after 120000ms');
try {
child.kill();
} catch {
// ignore
}
finish(false);
}, 120000);
child.on('error', (err) => {
clearTimeout(timeout);
logger.error('Failed to spawn OpenClaw doctor repair process:', err);
finish(false);
});
child.stdout?.on('data', (data) => {
const raw = data.toString();
for (const line of raw.split(/\r?\n/)) {
const normalized = line.trim();
if (!normalized) continue;
logger.debug(`[Gateway doctor stdout] ${normalized}`);
}
});
child.stderr?.on('data', (data) => {
const raw = data.toString();
for (const line of raw.split(/\r?\n/)) {
const normalized = line.trim();
if (!normalized) continue;
logger.warn(`[Gateway doctor stderr] ${normalized}`);
}
});
child.on('exit', (code: number) => {
clearTimeout(timeout);
if (code === 0) {
logger.info('OpenClaw doctor repair completed successfully');
finish(true);
return;
}
logger.warn(`OpenClaw doctor repair exited (code=${code})`);
finish(false);
});
});
}
/**
* Start Gateway process
* Uses OpenClaw npm package from node_modules (dev) or resources (production)
*/
/**
* Wait until the gateway port is no longer held by the OS.
* On Windows, TCP TIME_WAIT can keep a port occupied for up to 2 minutes
* after the owning process exits, causing the new Gateway to hang on bind.
*/
private async waitForPortFree(port: number, timeoutMs = 30000): Promise<void> {
const net = await import('net');
const start = Date.now();
const pollInterval = 500;
let logged = false;
while (Date.now() - start < timeoutMs) {
const available = await new Promise<boolean>((resolve) => {
const server = net.createServer();
server.once('error', () => resolve(false));
server.once('listening', () => {
server.close(() => resolve(true));
});
server.listen(port, '127.0.0.1');
});
if (available) {
const elapsed = Date.now() - start;
if (elapsed > pollInterval) {
logger.info(`Port ${port} became available after ${elapsed}ms`);
}
return;
}
if (!logged) {
logger.info(`Waiting for port ${port} to become available (Windows TCP TIME_WAIT)...`);
logged = true;
}
await new Promise(r => setTimeout(r, pollInterval));
}
logger.warn(`Port ${port} still occupied after ${timeoutMs}ms, proceeding anyway`);
}
private async startProcess(): Promise<void> {
// Ensure no system-managed gateway service will compete with our process.
await this.unloadLaunchctlService();
await unloadLaunchctlGatewayService();
const launchContext = await prepareGatewayLaunchContext(this.status.port);
const {
openclawDir,
@@ -1073,10 +687,11 @@ export class GatewayManager extends EventEmitter {
binPathExists,
loadedProviderKeyCount,
proxySummary,
channelStartupSummary,
} = launchContext;
logger.info(
`Starting Gateway process (mode=${mode}, port=${this.status.port}, entry="${entryScript}", args="${this.sanitizeSpawnArgs(gatewayArgs).join(' ')}", cwd="${openclawDir}", bundledBin=${binPathExists ? 'yes' : 'no'}, providerKeys=${loadedProviderKeyCount}, proxy=${proxySummary})`
`Starting Gateway process (mode=${mode}, port=${this.status.port}, entry="${entryScript}", args="${this.sanitizeSpawnArgs(gatewayArgs).join(' ')}", cwd="${openclawDir}", bundledBin=${binPathExists ? 'yes' : 'no'}, providerKeys=${loadedProviderKeyCount}, channels=${channelStartupSummary}, proxy=${proxySummary})`
);
this.lastSpawnSummary = `mode=${mode}, entry="${entryScript}", args="${this.sanitizeSpawnArgs(gatewayArgs).join(' ')}", cwd="${openclawDir}"`;
@@ -1144,8 +759,8 @@ export class GatewayManager extends EventEmitter {
child.stderr?.on('data', (data) => {
const raw = data.toString();
for (const line of raw.split(/\r?\n/)) {
this.recordStartupStderrLine(line);
const classified = this.classifyStderrMessage(line);
recordGatewayStartupStderrLine(this.recentStartupStderrLines, line);
const classified = classifyGatewayStderrMessage(line);
if (classified.level === 'drop') continue;
if (classified.level === 'debug') {
logger.debug(`[Gateway stderr] ${classified.normalized}`);
@@ -1168,9 +783,9 @@ export class GatewayManager extends EventEmitter {
}
/**
* Wait for Gateway to be ready by checking if the port is accepting connections
* Wait for Gateway to be ready by checking if it can issue connect challenges.
*/
private async waitForReady(retries = 2400, interval = 250): Promise<void> {
private async waitForReady(retries = 2400, interval = 200): Promise<void> {
const child = this.process;
for (let i = 0; i < retries; i++) {
// Early exit if the gateway process has already exited.
@@ -1182,7 +797,7 @@ export class GatewayManager extends EventEmitter {
}
try {
const ready = await probeGatewayReady(this.status.port, 2000);
const ready = await probeGatewayReady(this.status.port, 1500);
if (ready) {
logger.debug(`Gateway ready after ${i + 1} attempt(s)`);
@@ -1447,17 +1062,26 @@ export class GatewayManager extends EventEmitter {
* Schedule reconnection attempt with exponential backoff
*/
private scheduleReconnect(): void {
if (!this.shouldReconnect) {
logger.debug('Gateway reconnect skipped (auto-reconnect disabled)');
const decision = getReconnectScheduleDecision({
shouldReconnect: this.shouldReconnect,
hasReconnectTimer: this.reconnectTimer !== null,
reconnectAttempts: this.reconnectAttempts,
maxAttempts: this.reconnectConfig.maxAttempts,
baseDelay: this.reconnectConfig.baseDelay,
maxDelay: this.reconnectConfig.maxDelay,
});
if (decision.action === 'skip') {
logger.debug(`Gateway reconnect skipped (${decision.reason})`);
return;
}
if (this.reconnectTimer) {
if (decision.action === 'already-scheduled') {
return;
}
if (this.reconnectAttempts >= this.reconnectConfig.maxAttempts) {
logger.error(`Gateway reconnect failed: max attempts reached (${this.reconnectConfig.maxAttempts})`);
if (decision.action === 'fail') {
logger.error(`Gateway reconnect failed: max attempts reached (${decision.maxAttempts})`);
this.setStatus({
state: 'error',
error: 'Failed to reconnect after maximum attempts',
@@ -1466,14 +1090,9 @@ export class GatewayManager extends EventEmitter {
return;
}
// Calculate delay with exponential backoff
const delay = Math.min(
this.reconnectConfig.baseDelay * Math.pow(2, this.reconnectAttempts),
this.reconnectConfig.maxDelay
);
this.reconnectAttempts++;
logger.warn(`Scheduling Gateway reconnect attempt ${this.reconnectAttempts}/${this.reconnectConfig.maxAttempts} in ${delay}ms`);
const { delay, nextAttempt, maxAttempts } = decision;
this.reconnectAttempts = nextAttempt;
logger.warn(`Scheduling Gateway reconnect attempt ${nextAttempt}/${maxAttempts} in ${delay}ms`);
this.setStatus({
state: 'reconnecting',

View File

@@ -1,3 +1,15 @@
export interface ReconnectConfig {
maxAttempts: number;
baseDelay: number;
maxDelay: number;
}
export const DEFAULT_RECONNECT_CONFIG: ReconnectConfig = {
maxAttempts: 10,
baseDelay: 1000,
maxDelay: 30000,
};
export function nextLifecycleEpoch(currentEpoch: number): number {
return currentEpoch + 1;
}
@@ -22,6 +34,53 @@ export function getReconnectSkipReason(context: ReconnectAttemptContext): string
return null;
}
export interface ReconnectScheduleContext {
shouldReconnect: boolean;
hasReconnectTimer: boolean;
reconnectAttempts: number;
maxAttempts: number;
baseDelay: number;
maxDelay: number;
}
export type ReconnectScheduleDecision =
| { action: 'skip'; reason: string }
| { action: 'already-scheduled' }
| { action: 'fail'; attempts: number; maxAttempts: number }
| { action: 'schedule'; nextAttempt: number; maxAttempts: number; delay: number };
export function getReconnectScheduleDecision(
context: ReconnectScheduleContext,
): ReconnectScheduleDecision {
if (!context.shouldReconnect) {
return { action: 'skip', reason: 'auto-reconnect disabled' };
}
if (context.hasReconnectTimer) {
return { action: 'already-scheduled' };
}
if (context.reconnectAttempts >= context.maxAttempts) {
return {
action: 'fail',
attempts: context.reconnectAttempts,
maxAttempts: context.maxAttempts,
};
}
const delay = Math.min(
context.baseDelay * Math.pow(2, context.reconnectAttempts),
context.maxDelay,
);
return {
action: 'schedule',
nextAttempt: context.reconnectAttempts + 1,
maxAttempts: context.maxAttempts,
delay,
};
}
export type GatewayLifecycleState = 'stopped' | 'starting' | 'running' | 'error' | 'reconnecting';
export interface RestartDeferralContext {

View File

@@ -0,0 +1,42 @@
export type GatewayStderrClassification = {
level: 'drop' | 'debug' | 'warn';
normalized: string;
};
const MAX_STDERR_LINES = 120;
export function classifyGatewayStderrMessage(message: string): GatewayStderrClassification {
const msg = message.trim();
if (!msg) {
return { level: 'drop', normalized: msg };
}
// Known noisy lines that are not actionable for Gateway lifecycle debugging.
if (msg.includes('openclaw-control-ui') && msg.includes('token_mismatch')) {
return { level: 'drop', normalized: msg };
}
if (msg.includes('closed before connect') && msg.includes('token mismatch')) {
return { level: 'drop', normalized: msg };
}
// Downgrade frequent non-fatal noise.
if (msg.includes('ExperimentalWarning')) return { level: 'debug', normalized: msg };
if (msg.includes('DeprecationWarning')) return { level: 'debug', normalized: msg };
if (msg.includes('Debugger attached')) return { level: 'debug', normalized: msg };
// Electron restricts NODE_OPTIONS in packaged apps; this is expected and harmless.
if (msg.includes('node: --require is not allowed in NODE_OPTIONS')) {
return { level: 'debug', normalized: msg };
}
return { level: 'warn', normalized: msg };
}
export function recordGatewayStartupStderrLine(lines: string[], line: string): void {
const normalized = line.trim();
if (!normalized) return;
lines.push(normalized);
if (lines.length > MAX_STDERR_LINES) {
lines.splice(0, lines.length - MAX_STDERR_LINES);
}
}

View File

@@ -0,0 +1,359 @@
import { app, utilityProcess } from 'electron';
import path from 'path';
import { existsSync } from 'fs';
import WebSocket from 'ws';
import { getOpenClawDir, getOpenClawEntryPath } from '../utils/paths';
import { getUvMirrorEnv } from '../utils/uv-env';
import { isPythonReady, setupManagedPython } from '../utils/uv-setup';
import { logger } from '../utils/logger';
export function warmupManagedPythonReadiness(): void {
void isPythonReady().then((pythonReady) => {
if (!pythonReady) {
logger.info('Python environment missing or incomplete, attempting background repair...');
void setupManagedPython().catch((err) => {
logger.error('Background Python repair failed:', err);
});
}
}).catch((err) => {
logger.error('Failed to check Python environment:', err);
});
}
export function isTransientGatewayStartError(error: unknown): boolean {
const errMsg = String(error);
return (
errMsg.includes('WebSocket closed before handshake') ||
errMsg.includes('ECONNREFUSED') ||
errMsg.includes('Gateway process exited before becoming ready') ||
errMsg.includes('Timed out waiting for connect.challenge') ||
errMsg.includes('Connect handshake timeout')
);
}
export async function terminateOwnedGatewayProcess(child: Electron.UtilityProcess): Promise<void> {
let exited = false;
await new Promise<void>((resolve) => {
child.once('exit', () => {
exited = true;
resolve();
});
const pid = child.pid;
logger.info(`Sending kill to Gateway process (pid=${pid ?? 'unknown'})`);
try {
child.kill();
} catch {
// ignore if already exited
}
const timeout = setTimeout(() => {
if (!exited) {
logger.warn(`Gateway did not exit in time, force-killing (pid=${pid ?? 'unknown'})`);
if (pid) {
try {
process.kill(pid, 'SIGKILL');
} catch {
// ignore
}
}
}
resolve();
}, 5000);
child.once('exit', () => {
clearTimeout(timeout);
});
});
}
export async function unloadLaunchctlGatewayService(): Promise<void> {
if (process.platform !== 'darwin') return;
try {
const uid = process.getuid?.();
if (uid === undefined) return;
const launchdLabel = 'ai.openclaw.gateway';
const serviceTarget = `gui/${uid}/${launchdLabel}`;
const cp = await import('child_process');
const fsPromises = await import('fs/promises');
const os = await import('os');
const loaded = await new Promise<boolean>((resolve) => {
cp.exec(`launchctl print ${serviceTarget}`, { timeout: 5000 }, (err) => {
resolve(!err);
});
});
if (!loaded) return;
logger.info(`Unloading launchctl service ${serviceTarget} to prevent auto-respawn`);
await new Promise<void>((resolve) => {
cp.exec(`launchctl bootout ${serviceTarget}`, { timeout: 10000 }, (err) => {
if (err) {
logger.warn(`Failed to bootout launchctl service: ${err.message}`);
} else {
logger.info('Successfully unloaded launchctl gateway service');
}
resolve();
});
});
await new Promise((resolve) => setTimeout(resolve, 2000));
try {
const plistPath = path.join(os.homedir(), 'Library', 'LaunchAgents', `${launchdLabel}.plist`);
await fsPromises.access(plistPath);
await fsPromises.unlink(plistPath);
logger.info(`Removed legacy launchd plist to prevent reload on next login: ${plistPath}`);
} catch {
// File doesn't exist or can't be removed -- not fatal
}
} catch (err) {
logger.warn('Error while unloading launchctl gateway service:', err);
}
}
export async function waitForPortFree(port: number, timeoutMs = 30000): Promise<void> {
const net = await import('net');
const start = Date.now();
const pollInterval = 500;
let logged = false;
while (Date.now() - start < timeoutMs) {
const available = await new Promise<boolean>((resolve) => {
const server = net.createServer();
server.once('error', () => resolve(false));
server.once('listening', () => {
server.close(() => resolve(true));
});
server.listen(port, '127.0.0.1');
});
if (available) {
const elapsed = Date.now() - start;
if (elapsed > pollInterval) {
logger.info(`Port ${port} became available after ${elapsed}ms`);
}
return;
}
if (!logged) {
logger.info(`Waiting for port ${port} to become available (Windows TCP TIME_WAIT)...`);
logged = true;
}
await new Promise((resolve) => setTimeout(resolve, pollInterval));
}
logger.warn(`Port ${port} still occupied after ${timeoutMs}ms, proceeding anyway`);
}
async function getListeningProcessIds(port: number): Promise<string[]> {
const cmd = process.platform === 'win32'
? `netstat -ano | findstr :${port}`
: `lsof -i :${port} -sTCP:LISTEN -t`;
const cp = await import('child_process');
const { stdout } = await new Promise<{ stdout: string }>((resolve) => {
cp.exec(cmd, { timeout: 5000, windowsHide: true }, (err, stdout) => {
if (err) {
resolve({ stdout: '' });
} else {
resolve({ stdout });
}
});
});
if (!stdout.trim()) {
return [];
}
if (process.platform === 'win32') {
const pids: string[] = [];
for (const line of stdout.trim().split(/\r?\n/)) {
const parts = line.trim().split(/\s+/);
if (parts.length >= 5 && parts[3] === 'LISTENING') {
pids.push(parts[4]);
}
}
return [...new Set(pids)];
}
return [...new Set(stdout.trim().split(/\r?\n/).map((value) => value.trim()).filter(Boolean))];
}
async function terminateOrphanedProcessIds(port: number, pids: string[]): Promise<void> {
logger.info(`Found orphaned process listening on port ${port} (PIDs: ${pids.join(', ')}), attempting to kill...`);
if (process.platform === 'darwin') {
await unloadLaunchctlGatewayService();
}
for (const pid of pids) {
try {
if (process.platform === 'win32') {
const cp = await import('child_process');
await new Promise<void>((resolve) => {
cp.exec(
`taskkill /F /PID ${pid} /T`,
{ timeout: 5000, windowsHide: true },
() => resolve(),
);
});
} else {
process.kill(parseInt(pid, 10), 'SIGTERM');
}
} catch {
// Ignore processes that have already exited.
}
}
await new Promise((resolve) => setTimeout(resolve, process.platform === 'win32' ? 2000 : 3000));
if (process.platform !== 'win32') {
for (const pid of pids) {
try {
process.kill(parseInt(pid, 10), 0);
process.kill(parseInt(pid, 10), 'SIGKILL');
} catch {
// Already exited.
}
}
await new Promise((resolve) => setTimeout(resolve, 1000));
}
}
export async function findExistingGatewayProcess(options: {
port: number;
ownedPid?: number;
}): Promise<{ port: number; externalToken?: string } | null> {
const { port, ownedPid } = options;
try {
try {
const pids = await getListeningProcessIds(port);
if (pids.length > 0 && (!ownedPid || !pids.includes(String(ownedPid)))) {
await terminateOrphanedProcessIds(port, pids);
return null;
}
} catch (err) {
logger.warn('Error checking for existing process on port:', err);
}
return await new Promise<{ port: number; externalToken?: string } | null>((resolve) => {
const testWs = new WebSocket(`ws://localhost:${port}/ws`);
const timeout = setTimeout(() => {
testWs.close();
resolve(null);
}, 2000);
testWs.on('open', () => {
clearTimeout(timeout);
testWs.close();
resolve({ port });
});
testWs.on('error', () => {
clearTimeout(timeout);
resolve(null);
});
});
} catch {
return null;
}
}
export async function runOpenClawDoctorRepair(): Promise<boolean> {
const openclawDir = getOpenClawDir();
const entryScript = getOpenClawEntryPath();
if (!existsSync(entryScript)) {
logger.error(`Cannot run OpenClaw doctor repair: entry script not found at ${entryScript}`);
return false;
}
const platform = process.platform;
const arch = process.arch;
const target = `${platform}-${arch}`;
const binPath = app.isPackaged
? path.join(process.resourcesPath, 'bin')
: path.join(process.cwd(), 'resources', 'bin', target);
const binPathExists = existsSync(binPath);
const finalPath = binPathExists
? `${binPath}${path.delimiter}${process.env.PATH || ''}`
: process.env.PATH || '';
const uvEnv = await getUvMirrorEnv();
const doctorArgs = ['doctor', '--fix', '--yes', '--non-interactive'];
logger.info(
`Running OpenClaw doctor repair (entry="${entryScript}", args="${doctorArgs.join(' ')}", cwd="${openclawDir}", bundledBin=${binPathExists ? 'yes' : 'no'})`,
);
return await new Promise<boolean>((resolve) => {
const forkEnv: Record<string, string | undefined> = {
...process.env,
PATH: finalPath,
...uvEnv,
OPENCLAW_NO_RESPAWN: '1',
};
const child = utilityProcess.fork(entryScript, doctorArgs, {
cwd: openclawDir,
stdio: 'pipe',
env: forkEnv as NodeJS.ProcessEnv,
});
let settled = false;
const finish = (ok: boolean) => {
if (settled) return;
settled = true;
resolve(ok);
};
const timeout = setTimeout(() => {
logger.error('OpenClaw doctor repair timed out after 120000ms');
try {
child.kill();
} catch {
// ignore
}
finish(false);
}, 120000);
child.on('error', (err) => {
clearTimeout(timeout);
logger.error('Failed to spawn OpenClaw doctor repair process:', err);
finish(false);
});
child.stdout?.on('data', (data) => {
const raw = data.toString();
for (const line of raw.split(/\r?\n/)) {
const normalized = line.trim();
if (!normalized) continue;
logger.debug(`[Gateway doctor stdout] ${normalized}`);
}
});
child.stderr?.on('data', (data) => {
const raw = data.toString();
for (const line of raw.split(/\r?\n/)) {
const normalized = line.trim();
if (!normalized) continue;
logger.warn(`[Gateway doctor stderr] ${normalized}`);
}
});
child.on('exit', (code: number) => {
clearTimeout(timeout);
if (code === 0) {
logger.info('OpenClaw doctor repair completed successfully');
finish(true);
return;
}
logger.warn(`OpenClaw doctor repair exited (code=${code})`);
finish(false);
});
});
}

View File

@@ -8,24 +8,51 @@ import {
export async function probeGatewayReady(
port: number,
timeoutMs = 2000,
timeoutMs = 1500,
): Promise<boolean> {
return await new Promise<boolean>((resolve) => {
const testWs = new WebSocket(`ws://localhost:${port}/ws`);
let settled = false;
const resolveOnce = (value: boolean) => {
if (settled) return;
settled = true;
clearTimeout(timeout);
try {
testWs.close();
} catch {
// ignore
}
resolve(value);
};
const timeout = setTimeout(() => {
testWs.close();
resolve(false);
resolveOnce(false);
}, timeoutMs);
testWs.on('open', () => {
clearTimeout(timeout);
testWs.close();
resolve(true);
// Do not resolve on plain socket open. The gateway can accept the TCP/WebSocket
// connection before it is ready to issue protocol challenges, which previously
// caused a false "ready" result and then a full connect() stall.
});
testWs.on('message', (data) => {
try {
const message = JSON.parse(data.toString()) as { type?: string; event?: string };
if (message.type === 'event' && message.event === 'connect.challenge') {
resolveOnce(true);
}
} catch {
// ignore malformed probe payloads
}
});
testWs.on('error', () => {
clearTimeout(timeout);
resolve(false);
resolveOnce(false);
});
testWs.on('close', () => {
resolveOnce(false);
});
});
}

View File

@@ -230,40 +230,8 @@ async function initialize(): Promise<void> {
logger.warn('Failed to install built-in skills:', error);
});
// Start Gateway automatically (this seeds missing bootstrap files with full templates)
const gatewayAutoStart = await getSetting('gatewayAutoStart');
if (gatewayAutoStart) {
try {
logger.debug('Auto-starting Gateway...');
await gatewayManager.start();
logger.info('Gateway auto-start succeeded');
} catch (error) {
logger.error('Gateway auto-start failed:', error);
mainWindow?.webContents.send('gateway:error', String(error));
}
} else {
logger.info('Gateway auto-start disabled in settings');
}
// Merge ClawX context snippets into the workspace bootstrap files.
// The gateway seeds workspace files asynchronously after its HTTP server
// is ready, so ensureClawXContext will retry until the target files appear.
void ensureClawXContext().catch((error) => {
logger.warn('Failed to merge ClawX context into workspace:', error);
});
// Auto-install openclaw CLI and shell completions (non-blocking).
void autoInstallCliIfNeeded((installedPath) => {
mainWindow?.webContents.send('openclaw:cli-installed', installedPath);
}).then(() => {
generateCompletionCache();
installCompletionToProfile();
}).catch((error) => {
logger.warn('CLI auto-install failed:', error);
});
// Re-apply ClawX context after every gateway restart because the gateway
// may re-seed workspace files with clean templates (losing ClawX markers).
// Bridge gateway and host-side events before any auto-start logic runs, so
// renderer subscribers observe the full startup lifecycle.
gatewayManager.on('status', (status: { state: string }) => {
hostEventBus.emit('gateway:status', status);
if (status.state === 'running') {
@@ -320,6 +288,38 @@ async function initialize(): Promise<void> {
whatsAppLoginManager.on('error', (error) => {
hostEventBus.emit('channel:whatsapp-error', error);
});
// Start Gateway automatically (this seeds missing bootstrap files with full templates)
const gatewayAutoStart = await getSetting('gatewayAutoStart');
if (gatewayAutoStart) {
try {
logger.debug('Auto-starting Gateway...');
await gatewayManager.start();
logger.info('Gateway auto-start succeeded');
} catch (error) {
logger.error('Gateway auto-start failed:', error);
mainWindow?.webContents.send('gateway:error', String(error));
}
} else {
logger.info('Gateway auto-start disabled in settings');
}
// Merge ClawX context snippets into the workspace bootstrap files.
// The gateway seeds workspace files asynchronously after its HTTP server
// is ready, so ensureClawXContext will retry until the target files appear.
void ensureClawXContext().catch((error) => {
logger.warn('Failed to merge ClawX context into workspace:', error);
});
// Auto-install openclaw CLI and shell completions (non-blocking).
void autoInstallCliIfNeeded((installedPath) => {
mainWindow?.webContents.send('openclaw:cli-installed', installedPath);
}).then(() => {
generateCompletionCache();
installCompletionToProfile();
}).catch((error) => {
logger.warn('CLI auto-install failed:', error);
});
}
// When a second instance is launched, focus the existing window instead.

View File

@@ -102,21 +102,6 @@ export async function saveChannelConfig(
}
}
// DingTalk is a channel plugin; make sure it's explicitly allowed.
// Newer OpenClaw versions may not load non-bundled plugins when allowlist is empty.
if (channelType === 'dingtalk') {
if (!currentConfig.plugins) {
currentConfig.plugins = {};
}
currentConfig.plugins.enabled = true;
const allow = Array.isArray(currentConfig.plugins.allow)
? currentConfig.plugins.allow as string[]
: [];
if (!allow.includes('dingtalk')) {
currentConfig.plugins.allow = [...allow, 'dingtalk'];
}
}
// Plugin-based channels (e.g. WhatsApp) go under plugins.entries, not channels
if (PLUGIN_CHANNELS.includes(channelType)) {
if (!currentConfig.plugins) {