feat(workflow-executor): add graceful shutdown with in-flight step drain

matthv · claude · alban bertolini · commit bd7ac009a84f · 2026-03-26T18:39:35.000+01:00
- stop() now drains in-flight steps before closing resources
- Add Runner.state getter: idle → running → draining → stopped
- Add stopTimeoutMs config (default 30s) to prevent hanging on stuck steps
- Convert inFlightSteps from Set to Map to track step promises
- HTTP server stays up during drain for frontend access
- Add Logger.info optional method for drain status messages
- 7 new tests: drain, timeout, state transitions, log messages

fixes PRD-241

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/packages/workflow-executor/CLAUDE.md b/packages/workflow-executor/CLAUDE.md
@@ -43,7 +43,7 @@ Front  ◀──▶  Orchestrator  ◀──pull/push──▶  Executor  ──
 ```
 src/
 ├── errors.ts               # WorkflowExecutorError, MissingToolCallError, MalformedToolCallError, NoRecordsError, NoReadableFieldsError, NoWritableFieldsError, NoActionsError, StepPersistenceError, NoRelationshipFieldsError, RelatedRecordNotFoundError
-├── runner.ts               # Runner class — main entry point (start/stop/triggerPoll, HTTP server wiring)
+├── runner.ts               # Runner class — main entry point (start/stop/triggerPoll, HTTP server wiring, graceful drain)
 ├── types/                  # Core type definitions (@draft)
 │   ├── step-definition.ts  # StepType enum + step definition interfaces
 │   ├── step-outcome.ts     # Step outcome tracking types (StepOutcome, sent to orchestrator)
@@ -54,6 +54,10 @@ src/
 │   ├── agent-port.ts       # Interface to the Forest Admin agent (datasource)
 │   ├── workflow-port.ts    # Interface to the orchestrator
 │   └── run-store.ts        # Interface for persisting run state (scoped to a run)
+├── stores/                 # RunStore implementations
+│   ├── in-memory-store.ts  # InMemoryStore — Map-based, for tests
+│   ├── database-store.ts   # DatabaseStore — Sequelize + umzug migrations
+│   └── build-run-store.ts  # Factory functions: buildDatabaseRunStore, buildInMemoryRunStore
 ├── adapters/               # Port implementations
 │   ├── agent-client-agent-port.ts      # AgentPort via @forestadmin/agent-client
 │   └── forest-server-workflow-port.ts  # WorkflowPort via HTTP (forestadmin-client ServerUtils)
@@ -83,6 +87,7 @@ src/
 - **displayName in AI tools** — All `DynamicStructuredTool` schemas and system message prompts must use `displayName`, never `fieldName`. `displayName` is a Forest Admin frontend feature that replaces the technical field/relation/action name with a product-oriented label configured by the Forest Admin admin. End users write their workflow prompts using these display names, not the underlying technical names. After an AI tool call returns display names, map them back to `fieldName`/`name` before using them in datasource operations (e.g. filtering record values, calling `getRecord`).
 - **No recovery/retry** — Once the executor returns a step result to the orchestrator, the step is considered executed. There is no mechanism to re-dispatch a step, so executors must NOT include recovery checks (e.g. checking the RunStore for cached results before executing). Each step executes exactly once.
 - **Fetched steps must be executed** — Any step retrieved from the orchestrator via `getPendingStepExecutions()` must be executed. Silently discarding a fetched step (e.g. filtering it out by `runId` after fetching) violates the executor contract: the orchestrator assumes execution is guaranteed once the step is dispatched. The only valid filter before executing is deduplication via `inFlightSteps` (to avoid running the same step twice concurrently).
+- **Graceful shutdown** — `stop()` drains in-flight steps before closing resources. The `state` getter exposes the lifecycle: `idle → running → draining → stopped`. `stopTimeoutMs` (default 30s) prevents `stop()` from hanging forever if a step is stuck: once the timeout elapses, the steps still in flight are logged as an error and `stop()` proceeds to close resources. The HTTP server stays up during drain so the frontend can still query run status. Signal handling (`SIGTERM`/`SIGINT`) is the consumer's responsibility — the Runner is a library class.
 
 ## Commands
 
diff --git a/packages/workflow-executor/src/adapters/console-logger.ts b/packages/workflow-executor/src/adapters/console-logger.ts
@@ -4,4 +4,8 @@ export default class ConsoleLogger implements Logger {
   error(message: string, context: Record<string, unknown>): void {
     console.error(JSON.stringify({ message, timestamp: new Date().toISOString(), ...context }));
   }
+
+  info(message: string, context: Record<string, unknown>): void {
+    console.info(JSON.stringify({ message, timestamp: new Date().toISOString(), ...context }));
+  }
 }
diff --git a/packages/workflow-executor/src/index.ts b/packages/workflow-executor/src/index.ts
@@ -101,7 +101,7 @@ export { default as ForestServerWorkflowPort } from './adapters/forest-server-wo
 export { default as ExecutorHttpServer } from './http/executor-http-server';
 export type { ExecutorHttpServerOptions } from './http/executor-http-server';
 export { default as Runner } from './runner';
-export type { RunnerConfig } from './runner';
+export type { RunnerConfig, RunnerState } from './runner';
 export { default as validateSecrets } from './validate-secrets';
 export { default as SchemaCache } from './schema-cache';
 export { default as InMemoryStore } from './stores/in-memory-store';
diff --git a/packages/workflow-executor/src/ports/logger-port.ts b/packages/workflow-executor/src/ports/logger-port.ts
@@ -1,3 +1,4 @@
 export interface Logger {
   error(message: string, context: Record<string, unknown>): void;
+  info?(message: string, context: Record<string, unknown>): void;
 }
diff --git a/packages/workflow-executor/src/runner.ts b/packages/workflow-executor/src/runner.ts
@@ -21,6 +21,8 @@ import ExecutorHttpServer from './http/executor-http-server';
 import patchBodySchemas from './pending-data-validators';
 import validateSecrets from './validate-secrets';
 
+export type RunnerState = 'idle' | 'running' | 'draining' | 'stopped';
+
 export interface RunnerConfig {
   agentPort: AgentPort;
   workflowPort: WorkflowPort;
@@ -32,15 +34,19 @@ export interface RunnerConfig {
   authSecret: string;
   logger?: Logger;
   httpPort?: number;
+  stopTimeoutMs?: number;
 }
 
+const DEFAULT_STOP_TIMEOUT_MS = 30_000;
+
 export default class Runner {
   private readonly config: RunnerConfig;
   private httpServer: ExecutorHttpServer | null = null;
   private pollingTimer: NodeJS.Timeout | null = null;
-  private readonly inFlightSteps = new Set<string>();
+  private readonly inFlightSteps = new Map<string, Promise<void>>();
   private isRunning = false;
   private readonly logger: Logger;
+  private _state: RunnerState = 'idle';
 
   private static stepKey(step: PendingStepExecution): string {
     return `${step.runId}:${step.stepId}`;
@@ -51,12 +57,17 @@ export default class Runner {
     this.logger = config.logger ?? new ConsoleLogger();
   }
 
+  get state(): RunnerState {
+    return this._state;
+  }
+
   async start(): Promise<void> {
     if (this.isRunning) return;
 
     validateSecrets({ envSecret: this.config.envSecret, authSecret: this.config.authSecret });
 
     this.isRunning = true;
+    this._state = 'running';
 
     try {
       await this.config.runStore.init(this.logger);
@@ -74,20 +85,48 @@ export default class Runner {
       }
     } catch (error) {
       this.isRunning = false;
+      this._state = 'idle';
       throw error;
     }
 
     this.schedulePoll();
   }
 
   async stop(): Promise<void> {
+    this._state = 'draining';
     this.isRunning = false;
 
     if (this.pollingTimer !== null) {
       clearTimeout(this.pollingTimer);
       this.pollingTimer = null;
     }
 
+    // Drain in-flight steps
+    if (this.inFlightSteps.size > 0) {
+      this.logger.info?.('Draining in-flight steps', {
+        count: this.inFlightSteps.size,
+        steps: [...this.inFlightSteps.keys()],
+      });
+
+      const timeout = this.config.stopTimeoutMs ?? DEFAULT_STOP_TIMEOUT_MS;
+      const drainResult = await Promise.race([
+        Promise.allSettled(this.inFlightSteps.values()).then(() => 'drained' as const),
+        new Promise<'timeout'>(resolve => {
+          setTimeout(() => resolve('timeout'), timeout);
+        }),
+      ]);
+
+      if (drainResult === 'timeout') {
+        this.logger.error('Drain timeout — steps still in flight', {
+          remainingSteps: [...this.inFlightSteps.keys()],
+          timeoutMs: timeout,
+        });
+      } else {
+        this.logger.info?.('All in-flight steps drained', {});
+      }
+    }
+
+    // Close resources after drain
     if (this.httpServer) {
       await this.httpServer.stop();
       this.httpServer = null;
@@ -98,7 +137,7 @@ export default class Runner {
       this.config.runStore.close(this.logger),
     ]);
 
-    // TODO: graceful drain of in-flight steps (out of scope PRD-223)
+    this._state = 'stopped';
   }
 
   async getRunStepExecutions(runId: string): Promise<StepExecutionData[]> {
@@ -189,10 +228,18 @@ export default class Runner {
     return this.config.aiClient.loadRemoteTools(mergedConfig);
   }
 
-  private async executeStep(step: PendingStepExecution): Promise<void> {
+  private executeStep(step: PendingStepExecution): Promise<void> {
     const key = Runner.stepKey(step);
-    this.inFlightSteps.add(key);
+    const promise = this.doExecuteStep(step, key);
+    this.inFlightSteps.set(key, promise);
 
+    return promise;
+  }
+
+  private async doExecuteStep(
+    step: PendingStepExecution,
+    key: string,
+  ): Promise<void> {
     let result: StepExecutionResult;
 
     try {
diff --git a/packages/workflow-executor/test/runner.test.ts b/packages/workflow-executor/test/runner.test.ts
@@ -62,8 +62,8 @@ function createMockAiClient() {
   };
 }
 
-function createMockLogger(): jest.Mocked<Logger> {
-  return { error: jest.fn() };
+function createMockLogger(): jest.Mocked<Required<Logger>> {
+  return { error: jest.fn(), info: jest.fn() };
 }
 
 const VALID_ENV_SECRET = 'a'.repeat(64);
@@ -89,6 +89,7 @@ function createRunnerConfig(
     envSecret: string;
     authSecret: string;
     schemaCache: SchemaCache;
+    stopTimeoutMs: number;
   }> = {},
 ) {
   return {
@@ -178,7 +179,7 @@ beforeEach(() => {
 
 afterEach(async () => {
   // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
-  if (runner) {
+  if (runner && runner.state !== 'stopped') {
     await runner.stop();
     (runner as Runner | undefined) = undefined;
   }
@@ -285,6 +286,188 @@ describe('stop', () => {
   });
 });
 
+// ---------------------------------------------------------------------------
+// Graceful shutdown
+// ---------------------------------------------------------------------------
+
+describe('graceful shutdown', () => {
+  it('state transitions: idle → running → draining → stopped', async () => {
+    runner = new Runner(createRunnerConfig());
+
+    expect(runner.state).toBe('idle');
+
+    await runner.start();
+    expect(runner.state).toBe('running');
+
+    const stopPromise = runner.stop();
+    expect(runner.state).toBe('draining');
+
+    await stopPromise;
+    expect(runner.state).toBe('stopped');
+  });
+
+  it('state resets to idle on start failure', async () => {
+    const config = createRunnerConfig();
+    (config.runStore.init as jest.Mock).mockRejectedValueOnce(new Error('init failed'));
+    runner = new Runner(config);
+
+    await expect(runner.start()).rejects.toThrow('init failed');
+    expect(runner.state).toBe('idle');
+  });
+
+  it('stop() waits for in-flight steps before resolving', async () => {
+    let resolveStep!: () => void;
+    const stepPromise = new Promise<void>(resolve => {
+      resolveStep = resolve;
+    });
+
+    const workflowPort = createMockWorkflowPort();
+    workflowPort.getPendingStepExecutions.mockResolvedValueOnce([
+      makePendingStep({ runId: 'run-1', stepId: 'step-1' }),
+    ]);
+
+    jest.spyOn(StepExecutorFactory, 'create').mockResolvedValueOnce({
+      execute: () =>
+        stepPromise.then(() => ({
+          stepOutcome: { type: 'condition', stepId: 'step-1', stepIndex: 0, status: 'success' },
+        })),
+    } as never);
+
+    runner = new Runner(createRunnerConfig({ workflowPort }));
+    await runner.start();
+
+    jest.advanceTimersByTime(POLLING_INTERVAL_MS);
+    await flushPromises();
+
+    let stopResolved = false;
+    const stopPromise = runner.stop().then(() => {
+      stopResolved = true;
+    });
+
+    // stop() should not resolve while step is in flight
+    await flushPromises();
+    expect(stopResolved).toBe(false);
+
+    // Resolve the step
+    resolveStep();
+    await stopPromise;
+    expect(stopResolved).toBe(true);
+  });
+
+  it('stop() resolves after timeout when step is stuck', async () => {
+    const workflowPort = createMockWorkflowPort();
+    const logger = createMockLogger();
+    workflowPort.getPendingStepExecutions.mockResolvedValueOnce([
+      makePendingStep({ runId: 'run-1', stepId: 'stuck-step' }),
+    ]);
+
+    jest.spyOn(StepExecutorFactory, 'create').mockResolvedValueOnce({
+      execute: () => new Promise(() => {}), // never resolves
+    } as never);
+
+    runner = new Runner(createRunnerConfig({ workflowPort, logger, stopTimeoutMs: 50 }));
+    await runner.start();
+
+    jest.advanceTimersByTime(POLLING_INTERVAL_MS);
+    await flushPromises();
+
+    jest.useRealTimers();
+    await runner.stop();
+    jest.useFakeTimers();
+
+    expect(logger.error).toHaveBeenCalledWith(
+      'Drain timeout — steps still in flight',
+      expect.objectContaining({
+        remainingSteps: ['run-1:stuck-step'],
+        timeoutMs: 50,
+      }),
+    );
+    expect(runner.state).toBe('stopped');
+  });
+
+  it('stop() resolves immediately when no steps are in flight', async () => {
+    const logger = createMockLogger();
+    runner = new Runner(createRunnerConfig({ logger }));
+    await runner.start();
+    await runner.stop();
+
+    expect(logger.info).not.toHaveBeenCalledWith('Draining in-flight steps', expect.anything());
+    expect(runner.state).toBe('stopped');
+  });
+
+  it('HTTP server is closed after drain completes', async () => {
+    let resolveStep!: () => void;
+    const stepPromise = new Promise<void>(resolve => {
+      resolveStep = resolve;
+    });
+
+    const workflowPort = createMockWorkflowPort();
+    workflowPort.getPendingStepExecutions.mockResolvedValueOnce([
+      makePendingStep({ runId: 'run-1', stepId: 'step-1' }),
+    ]);
+
+    jest.spyOn(StepExecutorFactory, 'create').mockResolvedValueOnce({
+      execute: () =>
+        stepPromise.then(() => ({
+          stepOutcome: { type: 'condition', stepId: 'step-1', stepIndex: 0, status: 'success' },
+        })),
+    } as never);
+
+    runner = new Runner(createRunnerConfig({ workflowPort, httpPort: 3100 }));
+    await runner.start();
+
+    jest.advanceTimersByTime(POLLING_INTERVAL_MS);
+    await flushPromises();
+
+    const stopPromise = runner.stop();
+    await flushPromises();
+
+    // HTTP server should NOT have been stopped yet (drain in progress)
+    expect(MockedExecutorHttpServer.prototype.stop).not.toHaveBeenCalled();
+
+    resolveStep();
+    await stopPromise;
+
+    // Now HTTP server should be stopped
+    expect(MockedExecutorHttpServer.prototype.stop).toHaveBeenCalled();
+  });
+
+  it('logs drain info when steps are in flight', async () => {
+    let resolveStep!: () => void;
+    const stepPromise = new Promise<void>(resolve => {
+      resolveStep = resolve;
+    });
+
+    const workflowPort = createMockWorkflowPort();
+    const logger = createMockLogger();
+    workflowPort.getPendingStepExecutions.mockResolvedValueOnce([
+      makePendingStep({ runId: 'run-1', stepId: 'step-1' }),
+    ]);
+
+    jest.spyOn(StepExecutorFactory, 'create').mockResolvedValueOnce({
+      execute: () =>
+        stepPromise.then(() => ({
+          stepOutcome: { type: 'condition', stepId: 'step-1', stepIndex: 0, status: 'success' },
+        })),
+    } as never);
+
+    runner = new Runner(createRunnerConfig({ workflowPort, logger }));
+    await runner.start();
+
+    jest.advanceTimersByTime(POLLING_INTERVAL_MS);
+    await flushPromises();
+
+    resolveStep();
+    await runner.stop();
+
+    expect(logger.info).toHaveBeenCalledWith('Draining in-flight steps', {
+      count: 1,
+      steps: ['run-1:step-1'],
+    });
+    expect(logger.info).toHaveBeenCalledWith('All in-flight steps drained', {});
+  });
+});
+
 // ---------------------------------------------------------------------------
 // Polling loop
 // ---------------------------------------------------------------------------

Original file line number	Diff line number	Diff line change
`@@ -4,4 +4,8 @@ export default class ConsoleLogger implements Logger {`
`4`	`4`	`error(message: string, context: Record<string, unknown>): void {`
`5`	`5`	`console.error(JSON.stringify({ message, timestamp: new Date().toISOString(), ...context }));`
`6`	`6`	`}`
	`7`	`+`
	`8`	`+ info(message: string, context: Record<string, unknown>): void {`
	`9`	`+ console.info(JSON.stringify({ message, timestamp: new Date().toISOString(), ...context }));`
	`10`	`+ }`
`7`	`11`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
`1`	`1`	`export interface Logger {`
`2`	`2`	`error(message: string, context: Record<string, unknown>): void;`
	`3`	`+ info?(message: string, context: Record<string, unknown>): void;`
`3`	`4`	`}`