diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 56599d35..b835e6e5 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -79,14 +79,16 @@ jobs: name: codecov-unit-node-${{ matrix.node-version }} fail_ci_if_error: false - # Integration tests (v5): one job at a time (max-parallel: 1) to avoid 502/503 on shared Conductor. - # Sharding (--shard i/N) splits the suite so each job runs ~1/N of tests — keeps per-job under timeout. + # Integration tests (v5): lower max-parallel reduces 502/503 from the shared Conductor server + # but makes CI slower without eliminating flakes entirely — feel free to experiment. + # Sharding (--shard i/N) splits the suite so each job runs ~1/N of tests. + # fetchWithRetry now retries 502/503/504, so higher parallelism is more viable than before. integration-tests: runs-on: ubuntu-latest timeout-minutes: 25 strategy: fail-fast: false - max-parallel: 2 + max-parallel: 3 matrix: node-version: [20, 22, 24] shard: [1, 2, 3] diff --git a/AGENTS.md b/AGENTS.md index 87f2aa09..afb1584e 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -32,7 +32,7 @@ src/sdk/ # Main SDK source decorators/worker.ts # @worker decorator + dual-mode support decorators/registry.ts # Global registry (register/get/clear) context/TaskContext.ts # AsyncLocalStorage per-task context - metrics/ # MetricsCollector, MetricsServer, PrometheusRegistry + metrics/ # LegacyMetricsCollector, CanonicalMetricsCollector, metricsFactory, MetricsServer, PrometheusRegistry, CanonicalPrometheusRegistry, accumulators, httpObserver schema/ # jsonSchema, schemaField decorators generators/ # Legacy generators (pre-v3, still exported for compat) src/open-api/ # OpenAPI layer @@ -211,10 +211,10 @@ public async someMethod(args): Promise { ### Metrics Documentation (METRICS.md) -When adding, removing, or renaming metrics in `src/sdk/worker/metrics/MetricsCollector.ts`: -1. Update `METRICS.md` to reflect the change (name, type, labels, description) -2. Ensure both `MetricsCollector.toPrometheusText()` and `PrometheusRegistry.createMetrics()` are updated in sync — missing a summary/counter in either causes silent data loss -3. Update the metric count in the METRICS.md overview section +When adding, removing, or renaming metrics in `src/sdk/worker/metrics/`: +1. Update both `LegacyMetricsCollector.ts` and `CanonicalMetricsCollector.ts` (or add a no-op stub in the collector that does not emit the metric) +2. Ensure `toPrometheusText()` and the corresponding `PrometheusRegistry` / `CanonicalPrometheusRegistry` are updated in sync — missing a metric in either causes silent data loss +3. Update `METRICS.md` to reflect the change in both the legacy and canonical catalog tables 4. Add or update the corresponding direct recording method documentation if applicable ### SDK_NEW_LANGUAGE_GUIDE.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..1dacb8b7 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,26 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ +## [Unreleased] + +### Added + +- **Metrics harmonization** - canonical metric surface aligned with the cross-SDK catalog, opt-in via `WORKER_CANONICAL_METRICS=true` + - New `CanonicalMetricsCollector` and optional `CanonicalPrometheusRegistry` (prom-client adapter) emit the harmonized cross-SDK catalog: 12 counters (e.g. `task_poll_total`, `task_execution_started_total`, `task_paused_total`, `external_payload_used_total{entityName,operation,payloadType}`, `workflow_start_error_total{workflowType,exception}`), 4 time histograms (`task_poll_time_seconds`, `task_execute_time_seconds`, `task_update_time_seconds`, `http_api_client_request_seconds{method,uri,status}`) with buckets `0.001…10s`, 2 size histograms (`task_result_size_bytes`, `workflow_input_size_bytes{workflowType,version}`) with buckets `100…10_000_000` bytes, and `active_workers` gauge. Labels are camelCase; names are unprefixed. + - `createMetricsCollector()` factory selects `LegacyMetricsCollector` (default) or `CanonicalMetricsCollector` based on `WORKER_CANONICAL_METRICS` (truthy: `true`, `1`, `yes`, case-insensitive). `WORKER_LEGACY_METRICS` is also recognized; canonical wins when both are set. + - `HttpMetricsObserver` plus `fetchWithRetry` instrumentation records `http_api_client_request_seconds`; `WorkflowExecutor` records `workflow_input_size_bytes` and `workflow_start_error_total`. + - `Poller`, `TaskRunner`, and `EventDispatcher` emit a new `taskPaused` event when a poll cycle is skipped because the worker is paused. + - `fetchWithRetry` now retries HTTP 502/503/504 for idempotent methods (GET, HEAD, OPTIONS, PUT, DELETE). + - Harness deployment manifest sets `WORKER_CANONICAL_METRICS=true`; `harness/main.ts` logs which collector is active. + +### Changed + +- **Metrics harmonization** - defaults preserved; legacy metrics emit unchanged when `WORKER_CANONICAL_METRICS` is unset + - `src/sdk/worker/metrics/MetricsCollector.ts` was renamed to `LegacyMetricsCollector.ts`. The public symbol is preserved via `export { LegacyMetricsCollector as MetricsCollector }` in `src/sdk/worker/metrics/index.ts`, so existing imports keep working. + - Default behavior is unchanged: with no env var set, the metric names, labels, and `conductor_worker_*` prefix from `v3.0.3` are preserved byte-for-byte. + - Rewrote `METRICS.md` with both surfaces, the env-var gate, side-by-side migration table with PromQL replacements, and troubleshooting. + - Updated `README.md`, `AGENTS.md`, `SDK_DEVELOPMENT.md`, `SDK_COMPARISON.md`, and `WORKER_ARCHITECTURE_COMPARISON.md` to reference `createMetricsCollector()` and the env var. diff --git a/METRICS.md b/METRICS.md index d8770ed8..7d43e0c2 100644 --- a/METRICS.md +++ b/METRICS.md @@ -1,259 +1,388 @@ -# Metrics Reference +# JavaScript SDK Metrics -The Conductor JavaScript SDK provides built-in Prometheus metrics for monitoring worker performance, API latency, and task execution. +The Conductor JavaScript SDK can expose Prometheus metrics for worker polling, +task execution, task updates, workflow starts, external payload usage, and +API-client HTTP calls. -## Overview +The SDK currently has two mutually exclusive metric surfaces: -`MetricsCollector` implements `TaskRunnerEventsListener` and records **18 metric types** (12 counters + 6 summaries). Metrics are exposed in [Prometheus exposition format](https://prometheus.io/docs/instrumenting/exposition_formats/). +- **Legacy metrics** are the default. 
They preserve the original JavaScript SDK + names and shapes, including a `conductor_worker_` prefix, `task_type` labels, + millisecond time units, and Summary type for distributions. +- **Canonical metrics** are opt-in with `WORKER_CANONICAL_METRICS=true`. They + use the cross-SDK canonical names, labels, units, and Prometheus histogram + shapes. -- **Default prefix:** `conductor_worker` -- **Quantiles:** p50, p75, p90, p95, p99 (computed from a sliding window) -- **Sliding window:** Last 1,000 observations (configurable) +Only one collector is active at a time. The SDK does not emit legacy and +canonical metrics at the same time. -## Quick Start +Metrics are created lazily. A metric appears in `/metrics` only after the +corresponding worker event or collector method records it. Some low-level +surface metrics, such as ack, queue-full, paused, and uncaught-exception +counters, may not appear in normal worker runs unless that path is exercised. -### HTTP Server +## Usage -```typescript -import { MetricsCollector, MetricsServer, TaskHandler } from "@io-orkes/conductor-javascript"; +Create a metrics collector, start a scrape server, and wire the collector into +`TaskHandler` as an event listener: -const metrics = new MetricsCollector({ httpPort: 9090 }); +```typescript +import { + createMetricsCollector, + MetricsServer, + TaskHandler, +} from "@io-orkes/conductor-javascript"; + +const metrics = createMetricsCollector(); +const server = new MetricsServer(metrics, 9090); +await server.start(); const handler = new TaskHandler({ client, - scanForDecorated: true, eventListeners: [metrics], + scanForDecorated: true, }); await handler.startWorkers(); // GET http://localhost:9090/metrics — Prometheus text format -// GET http://localhost:9090/health — { "status": "UP" } +// GET http://localhost:9090/health — {"status":"UP"} +``` + +`createMetricsCollector()` reads `WORKER_CANONICAL_METRICS` and returns either +a `LegacyMetricsCollector` or a `CanonicalMetricsCollector`. Both implement +`MetricsCollectorInterface`, so call sites never need to know which variant is +active. + +You can also construct a collector directly if you need to pass configuration: + +```typescript +import { LegacyMetricsCollector } from "@io-orkes/conductor-javascript"; + +const metrics = new LegacyMetricsCollector({ + httpPort: 9090, + filePath: "/tmp/conductor_metrics.prom", + fileWriteIntervalMs: 10000, + usePromClient: true, +}); ``` ### File Output ```typescript -const metrics = new MetricsCollector({ +const metrics = createMetricsCollector({ filePath: "/tmp/conductor_metrics.prom", - fileWriteIntervalMs: 10000, // write every 10s + fileWriteIntervalMs: 10000, }); ``` -The file writer performs an immediate first write, then writes periodically at the configured interval. The timer is unreferenced so it does not prevent Node.js process exit. +The file writer performs an immediate first write, then writes periodically at +the configured interval. The timer is unreferenced so it does not prevent +Node.js process exit. ### prom-client Integration ```typescript -const metrics = new MetricsCollector({ usePromClient: true }); +const metrics = createMetricsCollector({ usePromClient: true }); // Metrics are registered in prom-client's default registry. // Use prom-client's register.metrics() for native scraping. ``` -Requires `npm install prom-client`. Falls back to built-in text format if not installed. +Requires `npm install prom-client`. Falls back to built-in text format if +prom-client is not installed. 
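+
+If you already run your own HTTP server, a minimal scrape handler against
+prom-client's default registry might look like the following sketch (Express
+is used here purely for illustration; any HTTP stack works):
+
+```typescript
+import express from "express";
+import { register } from "prom-client";
+
+const app = express();
+
+app.get("/metrics", async (_req, res) => {
+  // Serve prom-client's default registry, where the SDK registers
+  // its metrics when usePromClient is true.
+  res.set("Content-Type", register.contentType);
+  res.send(await register.metrics());
+});
+
+app.listen(3000);
+```
+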
-### All-in-One +## Selecting Canonical Metrics -```typescript -const metrics = new MetricsCollector({ - prefix: "myapp_worker", - httpPort: 9090, - filePath: "/tmp/metrics.prom", - fileWriteIntervalMs: 10000, - slidingWindowSize: 500, - usePromClient: true, -}); +Set `WORKER_CANONICAL_METRICS` before the worker starts: + +```shell +WORKER_CANONICAL_METRICS=true node my_worker.js ``` +Accepted true values are `true`, `1`, and `yes`, case-insensitive. Any other +value, or an unset variable, selects legacy metrics. The variable is read when +the metrics collector is created, so changing it requires a worker restart. + ## Configuration | Option | Type | Default | Description | -|--------|------|---------|-------------| -| `prefix` | `string` | `"conductor_worker"` | Prometheus metric name prefix | -| `httpPort` | `number` | — | Start built-in HTTP server on this port | -| `filePath` | `string` | — | Periodically write metrics to this file path | -| `fileWriteIntervalMs` | `number` | `5000` | File write interval in milliseconds | -| `slidingWindowSize` | `number` | `1000` | Max observations kept for quantile calculation | -| `usePromClient` | `boolean` | `false` | Use `prom-client` for native Prometheus integration | - ---- - -## Counter Metrics - -### Labeled by `task_type` - -| Prometheus Name | Internal Key | Description | -|----------------|-------------|-------------| -| `{prefix}_task_poll_total` | `pollTotal` | Total number of task polls initiated | -| `{prefix}_task_poll_error_total` | `pollErrorTotal` | Total number of failed task polls | -| `{prefix}_task_execute_total` | `taskExecutionTotal` | Total number of task executions completed | -| `{prefix}_task_execute_error_total` | `taskExecutionErrorTotal` | Total task execution errors. Label format: `taskType:ExceptionName` | -| `{prefix}_task_update_error_total` | `taskUpdateFailureTotal` | Total task result update failures (result lost from Conductor) | -| `{prefix}_task_ack_error_total` | `taskAckErrorTotal` | Total task acknowledgement errors | -| `{prefix}_task_execution_queue_full_total` | `taskExecutionQueueFullTotal` | Times the execution queue was full (concurrency limit reached) | -| `{prefix}_task_paused_total` | `taskPausedTotal` | Total task paused events | - -### Labeled by `payload_type` - -| Prometheus Name | Internal Key | Description | -|----------------|-------------|-------------| -| `{prefix}_external_payload_used_total` | `externalPayloadUsedTotal` | External payload storage usage (e.g., `"workflow_input"`, `"task_output"`) | - -### Global (no labels) - -| Prometheus Name | Internal Key | Description | -|----------------|-------------|-------------| -| `{prefix}_thread_uncaught_exceptions_total` | `uncaughtExceptionTotal` | Total uncaught exceptions in worker processes | -| `{prefix}_worker_restart_total` | `workerRestartTotal` | Total worker restart events | -| `{prefix}_workflow_start_error_total` | `workflowStartErrorTotal` | Total workflow start errors | - ---- - -## Summary Metrics - -Each summary emits quantile values, a count, and a sum: - +|---|---|---|---| +| `prefix` | `string` | `"conductor_worker"` | Prometheus metric name prefix. Legacy only; canonical metrics are unprefixed. | +| `httpPort` | `number` | — | Start built-in HTTP server on this port. | +| `filePath` | `string` | — | Periodically write metrics to this file path. | +| `fileWriteIntervalMs` | `number` | `5000` | File write interval in milliseconds. | +| `slidingWindowSize` | `number` | `1000` | Max observations for quantile calculation. 
Legacy only; canonical uses histogram buckets. | +| `usePromClient` | `boolean` | `false` | Use `prom-client` for native Prometheus integration. | + +## Canonical Metrics + +Canonical timing values are seconds. Canonical size values are bytes. Label +names use camelCase. + +### Canonical Counters + +| Metric | Labels | Description | +|---|---|---| +| `task_poll_total` | `taskType` | Incremented each time the worker issues a poll request. | +| `task_execution_started_total` | `taskType` | Incremented when a polled task is dispatched to the worker function. | +| `task_poll_error_total` | `taskType`, `exception` | Incremented when a poll request fails client-side. | +| `task_execute_error_total` | `taskType`, `exception` | Incremented when the worker function throws. | +| `task_update_error_total` | `taskType`, `exception` | Incremented when updating the task result fails. | +| `task_ack_error_total` | `taskType`, `exception` | Collector surface for task ack errors. The internal runner uses batch poll responses as ack and may not emit this during normal polling. | +| `task_ack_failed_total` | `taskType` | Collector surface for failed task ack responses. The internal runner uses batch poll responses as ack and may not emit this during normal polling. | +| `task_execution_queue_full_total` | `taskType` | Incremented when the worker execution queue is saturated. | +| `task_paused_total` | `taskType` | Incremented when a worker is paused and skips acting on a poll. | +| `thread_uncaught_exceptions_total` | `exception` | Incremented on uncaught exceptions in the worker process. | +| `external_payload_used_total` | `entityName`, `operation`, `payloadType` | Incremented when external payload storage is used for task or workflow payloads. | +| `workflow_start_error_total` | `workflowType`, `exception` | Incremented when starting a workflow fails client-side. | + +### Canonical Time Histograms + +All canonical time histograms use buckets: +`0.001`, `0.005`, `0.01`, `0.025`, `0.05`, `0.1`, `0.25`, `0.5`, `1`, `2.5`, +`5`, `10`. + +| Metric | Labels | Description | +|---|---|---| +| `task_poll_time_seconds` | `taskType`, `status` | Poll request latency. `status` is `SUCCESS` or `FAILURE`. | +| `task_execute_time_seconds` | `taskType`, `status` | Worker function execution duration. `status` is `SUCCESS` or `FAILURE`. | +| `task_update_time_seconds` | `taskType`, `status` | Task-result update latency. `status` is `SUCCESS` or `FAILURE`. | +| `http_api_client_request_seconds` | `method`, `uri`, `status` | API-client HTTP request latency. `status` is the HTTP status code as a string, or `"0"` on network failure. 
| + +Each histogram exposes Prometheus series such as: + +```prometheus +task_execute_time_seconds_bucket{taskType="my_task",status="SUCCESS",le="0.1"} 42 +task_execute_time_seconds_count{taskType="my_task",status="SUCCESS"} 50 +task_execute_time_seconds_sum{taskType="my_task",status="SUCCESS"} 2.3 ``` -{name}{task_type="myTask",quantile="0.5"} 12.3 -{name}{task_type="myTask",quantile="0.75"} 15.1 -{name}{task_type="myTask",quantile="0.9"} 18.7 -{name}{task_type="myTask",quantile="0.95"} 22.0 -{name}{task_type="myTask",quantile="0.99"} 45.2 -{name}_count{task_type="myTask"} 1000 -{name}_sum{task_type="myTask"} 14523.7 -``` - -### Labeled by `task_type` -| Prometheus Name | Internal Key | Unit | Description | -|----------------|-------------|------|-------------| -| `{prefix}_task_poll_time` | `pollDurationMs` | ms | Task poll round-trip duration | -| `{prefix}_task_execute_time` | `executionDurationMs` | ms | Worker function execution duration | -| `{prefix}_task_update_time` | `updateDurationMs` | ms | Task result update (SDK to server) duration | -| `{prefix}_task_result_size_bytes` | `outputSizeBytes` | bytes | Task result output payload size | +### Canonical Size Histograms -### Labeled by `workflow_type` +All canonical size histograms use buckets: +`100`, `1000`, `10000`, `100000`, `1000000`, `10000000`. -| Prometheus Name | Internal Key | Unit | Description | -|----------------|-------------|------|-------------| -| `{prefix}_workflow_input_size_bytes` | `workflowInputSizeBytes` | bytes | Workflow input payload size | +| Metric | Labels | Description | +|---|---|---| +| `task_result_size_bytes` | `taskType` | Serialized task result output size. | +| `workflow_input_size_bytes` | `workflowType`, `version` | Serialized workflow input size. `version` is an empty string when not provided. | -### Labeled by `endpoint` +### Canonical Gauges -| Prometheus Name | Internal Key | Unit | Description | -|----------------|-------------|------|-------------| -| `{prefix}_http_api_client_request` | `apiRequestDurationMs` | ms | API request duration. Label format: `METHOD:/api/path:STATUS` | +| Metric | Labels | Description | +|---|---|---| +| `active_workers` | `taskType` | Current number of workers actively executing tasks. | ---- +## Legacy Metrics -## Event Listener Methods +Legacy mode is the default so existing dashboards and alerts continue to work. +The default metric name prefix is `conductor_worker`. The prefix is configurable +via the `prefix` option on `MetricsCollectorConfig`. -These methods are called automatically by the `TaskRunner` when `MetricsCollector` is registered as an event listener: +Distribution metrics are sliding-window summaries over the latest 1,000 +observations (configurable via `slidingWindowSize`), exposing quantiles at +p50, p75, p90, p95, and p99. Legacy distribution metrics also expose `_count` +and `_sum` series. 
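+
+For example, a dashboard can read the legacy p95 execution latency directly
+from the exported quantile series (`max` is one reasonable aggregation when
+several worker processes are scraped):
+
+```promql
+# p95 task execution latency in ms, per task type (legacy summary)
+max by (task_type) (conductor_worker_task_execute_time{quantile="0.95"})
+```
+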
-| Method | Metrics Updated | -|--------|----------------| -| `onPollStarted(event)` | Increments `pollTotal` | -| `onPollCompleted(event)` | Records `pollDurationMs` | -| `onPollFailure(event)` | Increments `pollErrorTotal`, records `pollDurationMs` | -| `onTaskExecutionStarted(event)` | _(no-op, counted on completion)_ | -| `onTaskExecutionCompleted(event)` | Increments `taskExecutionTotal`, records `executionDurationMs` and `outputSizeBytes` | -| `onTaskExecutionFailure(event)` | Increments `taskExecutionErrorTotal`, records `executionDurationMs` | -| `onTaskUpdateCompleted(event)` | Records `updateDurationMs` | -| `onTaskUpdateFailure(event)` | Increments `taskUpdateFailureTotal` | +### Legacy Counters -## Direct Recording Methods +| Metric | Labels | Description | +|---|---|---| +| `conductor_worker_task_poll_total` | `task_type` | Incremented each time polling is done. | +| `conductor_worker_task_poll_error_total` | `task_type` | Incremented when a poll request fails. | +| `conductor_worker_task_execute_total` | `task_type` | Incremented when a task execution completes. | +| `conductor_worker_task_execute_error_total` | `task_type` | Task execution errors. Label format: `taskType:ExceptionName`. | +| `conductor_worker_task_update_error_total` | `task_type` | Incremented when updating the task result fails. | +| `conductor_worker_task_ack_error_total` | `task_type` | Collector surface for task ack errors. | +| `conductor_worker_task_execution_queue_full_total` | `task_type` | Incremented when the execution queue is saturated. | +| `conductor_worker_task_paused_total` | `task_type` | Incremented when a worker is paused and skips a poll. | +| `conductor_worker_external_payload_used_total` | `payload_type` | External payload storage usage. | +| `conductor_worker_thread_uncaught_exceptions_total` | none | Uncaught exceptions in the worker process. | +| `conductor_worker_worker_restart_total` | none | Worker restart events. | +| `conductor_worker_workflow_start_error_total` | none | Workflow start errors. | -For metrics outside the event listener system, call these methods directly: +Legacy mode does not emit `task_execution_started_total`, +`task_ack_failed_total`, or `active_workers`. -```typescript -const collector = new MetricsCollector(); - -collector.recordTaskExecutionQueueFull("my_task"); -collector.recordUncaughtException(); -collector.recordWorkerRestart(); -collector.recordTaskPaused("my_task"); -collector.recordTaskAckError("my_task"); -collector.recordWorkflowStartError(); -collector.recordExternalPayloadUsed("task_output"); -collector.recordWorkflowInputSize("my_workflow", 2048); -collector.recordApiRequestTime("POST", "/api/tasks", 200, 35); -``` +### Legacy Time Metrics -## Exposition Formats +Time values are milliseconds. Type is Summary. -### Built-in Prometheus Text - -```typescript -const text = collector.toPrometheusText(); -// Returns Prometheus text format (text/plain; version=0.0.4) -``` +| Metric | Labels | Description | +|---|---|---| +| `conductor_worker_task_poll_time` | `task_type` | Poll round-trip duration. | +| `conductor_worker_task_execute_time` | `task_type` | Worker function execution duration. | +| `conductor_worker_task_update_time` | `task_type` | Task result update duration. 
| -### Async (with prom-client support) +Each summary exposes quantile, count, and sum series: -```typescript -const text = await collector.toPrometheusTextAsync(); -// Uses prom-client registry when available, falls back to built-in +```prometheus +conductor_worker_task_execute_time{task_type="my_task",quantile="0.5"} 102 +conductor_worker_task_execute_time{task_type="my_task",quantile="0.95"} 250 +conductor_worker_task_execute_time_count{task_type="my_task"} 1000 +conductor_worker_task_execute_time_sum{task_type="my_task"} 120345 ``` -### HTTP Server (MetricsServer) - -```typescript -import { MetricsServer } from "@io-orkes/conductor-javascript"; - -const server = new MetricsServer(collector, 9090); -await server.start(); -// GET /metrics — Content-Type from collector.getContentType() -// GET /health — { "status": "UP" } -await server.stop(); +### Legacy Size Metrics + +Type is Summary. Values are bytes. + +| Metric | Labels | Description | +|---|---|---| +| `conductor_worker_task_result_size_bytes` | `task_type` | Task result output payload size. | +| `conductor_worker_workflow_input_size_bytes` | `workflow_type` | Workflow input payload size. | + +### Legacy HTTP Metrics + +| Metric | Labels | Description | +|---|---|---| +| `conductor_worker_http_api_client_request` | `endpoint` | API request duration in milliseconds. The `endpoint` label is a compound `"METHOD:/api/path:STATUS"` string. | + +## Labels + +| Label | Used by | Values | +|---|---|---| +| `task_type` | Legacy worker metrics | Task definition name. | +| `taskType` | Canonical worker metrics | Task definition name. | +| `workflowType` | Canonical workflow metrics | Workflow definition name. | +| `workflow_type` | Legacy `conductor_worker_workflow_input_size_bytes` | Workflow definition name. | +| `version` | Canonical `workflow_input_size_bytes` | Workflow version as a string. Empty string when not provided. | +| `status` | Canonical task time histograms | `SUCCESS` or `FAILURE`. For `http_api_client_request_seconds`, the HTTP status code as a string, or `"0"` on network failure. | +| `exception` | Canonical error counters | Exception type name, such as `TypeError`. Derived from `error.name` or `error.constructor.name`. | +| `entityName` | Canonical `external_payload_used_total` | Task type or workflow name associated with the external payload. | +| `operation` | Canonical `external_payload_used_total` | External payload operation, such as `READ` or `WRITE`. | +| `payload_type` | Legacy `conductor_worker_external_payload_used_total` | Payload type, such as `workflow_input` or `task_output`. | +| `payloadType` | Canonical `external_payload_used_total` | Payload type, such as `TASK_INPUT`, `TASK_OUTPUT`, `WORKFLOW_INPUT`, or `WORKFLOW_OUTPUT`. | +| `method` | Canonical HTTP metrics | HTTP verb. | +| `uri` | Canonical HTTP metrics | Request path. May contain interpolated identifiers. | +| `endpoint` | Legacy HTTP metrics | Compound `"METHOD:/api/path:STATUS"` string. | +| `quantile` | Legacy time and size metrics | `0.5`, `0.75`, `0.9`, `0.95`, or `0.99`. | + +## Migrating From Legacy to Canonical + +Canonical mode is opt-in during the deprecation period. Before switching a +production worker, update dashboards and alerts against a staging worker with +`WORKER_CANONICAL_METRICS=true`. + +Key changes: + +- The `conductor_worker_` prefix is removed. Canonical metric names are + unprefixed. +- Legacy task labels use `task_type`; canonical task labels use `taskType`. +- Legacy time metrics are millisecond summaries with quantiles. 
Canonical time + metrics are second-based histograms with bucket boundaries. Query `_bucket` + series with `histogram_quantile()` instead of reading `{quantile="..."}` + gauges. +- Legacy size metrics are summaries. Canonical size metrics are histograms. +- Canonical error counters add an `exception` label containing the exception + type name. +- Canonical time histograms add a `status` label (`SUCCESS` or `FAILURE`). +- Canonical mode adds metrics that legacy mode never emits: + `task_execution_started_total`, `task_ack_failed_total`, and + `active_workers`. +- Legacy `conductor_worker_worker_restart_total` is not emitted in canonical + mode (Node.js single-process model). +- Legacy uses `payload_type`; canonical uses `payloadType`. +- Legacy HTTP metrics use a compound `endpoint` label; canonical uses separate + `method`, `uri`, and `status` labels. +- Canonical and legacy collectors are mutually exclusive. During a migration, + compare scrape output by running separate worker instances or environments + with and without `WORKER_CANONICAL_METRICS=true`. + +Legacy-to-canonical replacements: + +| Legacy metric | Canonical replacement | +|---|---| +| `conductor_worker_task_poll_total{task_type}` | `task_poll_total{taskType}` | +| `conductor_worker_task_poll_error_total{task_type}` | `task_poll_error_total{taskType,exception}` | +| `conductor_worker_task_execute_total{task_type}` | `task_execute_time_seconds{taskType,status}` (count from histogram) and `task_execution_started_total{taskType}` | +| `conductor_worker_task_execute_error_total{task_type}` | `task_execute_error_total{taskType,exception}` | +| `conductor_worker_task_update_error_total{task_type}` | `task_update_error_total{taskType,exception}` | +| `conductor_worker_task_poll_time{task_type}` (summary, ms) | `task_poll_time_seconds{taskType,status}` (histogram, seconds) | +| `conductor_worker_task_execute_time{task_type}` (summary, ms) | `task_execute_time_seconds{taskType,status}` (histogram, seconds) | +| `conductor_worker_task_update_time{task_type}` (summary, ms) | `task_update_time_seconds{taskType,status}` (histogram, seconds) | +| `conductor_worker_task_result_size_bytes{task_type}` (summary) | `task_result_size_bytes{taskType}` (histogram) | +| `conductor_worker_workflow_input_size_bytes{workflow_type}` (summary) | `workflow_input_size_bytes{workflowType,version}` (histogram) | +| `conductor_worker_http_api_client_request{endpoint}` (summary, ms) | `http_api_client_request_seconds{method,uri,status}` (histogram, seconds) | +| `conductor_worker_external_payload_used_total{payload_type}` | `external_payload_used_total{entityName,operation,payloadType}` | +| `conductor_worker_workflow_start_error_total` (no labels) | `workflow_start_error_total{workflowType,exception}` | +| `conductor_worker_worker_restart_total` | — (not emitted in canonical mode) | + +Common PromQL replacements: + +| Legacy | Canonical | +|---|---| +| `conductor_worker_task_execute_time{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, taskType, status) (rate(task_execute_time_seconds_bucket[5m])))` | +| `conductor_worker_task_poll_time{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, taskType, status) (rate(task_poll_time_seconds_bucket[5m])))` | +| `conductor_worker_http_api_client_request{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, method, uri, status) (rate(http_api_client_request_seconds_bucket[5m])))` | +| `conductor_worker_task_result_size_bytes{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, taskType) 
(rate(task_result_size_bytes_bucket[5m])))` | + +Average latency queries continue to use `_sum` divided by `_count`, but the +canonical series are cumulative histogram counters: + +```promql +sum(rate(task_execute_time_seconds_sum[5m])) by (taskType) +/ +sum(rate(task_execute_time_seconds_count[5m])) by (taskType) ``` -### File Output - -Configured via `filePath` in `MetricsCollectorConfig`. Writes `toPrometheusText()` output to disk. The file writer performs an immediate first write on construction, then writes periodically at the configured interval. +## Troubleshooting ---- +### Metrics Are Empty -## Sliding Window and Quantile Calculation +- Verify that `createMetricsCollector()` or a collector constructor is called + and the collector is passed to `TaskHandler` via `eventListeners`. +- Verify workers have polled or executed tasks. Metrics are created lazily when + the relevant event occurs. +- Confirm the scrape endpoint is reachable at the expected host and port. -Summary metrics use a **sliding window** (default: 1,000 observations) to calculate percentiles. This provides: +### Missing HTTP or Workflow Metrics -- Accurate recent percentiles without unbounded memory growth -- No need to pre-configure histogram bucket boundaries -- Direct percentile values without interpolation artifacts +- `http_api_client_request_seconds` (canonical) or + `conductor_worker_http_api_client_request` (legacy) is recorded from the + `fetchWithRetry` HTTP layer. Verify the collector is constructed before HTTP + calls begin. +- `workflow_input_size_bytes` and `workflow_start_error_total` are recorded in + `WorkflowExecutor`. Verify the collector is active before starting workflows. -Quantiles are computed on-demand using linear interpolation on sorted observations when `toPrometheusText()` is called. +### High Cardinality -When using `prom-client` (`usePromClient: true`), summaries use prom-client's native implementation with `maxAgeSeconds: 600` and `ageBuckets: 5`. +- Watch the `uri` label (canonical) or `endpoint` label (legacy) on HTTP + metrics. The SDK records the interpolated request path, which may include + task type names or workflow IDs. +- Prefer canonical mode for bounded `exception` labels. Legacy error counters + encode exception names in the Map key, not as a proper Prometheus label. +- Avoid embedding user identifiers or unbounded values in task type, workflow + type, or external payload labels. ---- +### Recording Uncaught Exceptions -## Monitoring Best Practices - -- **Use p95/p99 for SLO monitoring** rather than averages. Percentile-based thresholds better capture user-impacting performance variations. -- **Alert on `task_update_error_total`** — a rising count indicates task results are being lost and workers are failing to report back to the Conductor server. -- **Alert on `task_execution_queue_full_total`** — indicates the concurrency limit is consistently reached. Consider increasing worker `concurrency`. -- **Monitor `task_poll_time` p99** — high poll latency suggests network issues or server overload. -- **Monitor `task_execute_time` p95** — watch for execution time regression in worker functions. -- **File output interval**: 10-60 seconds recommended for production. Lower intervals increase disk I/O. -- **Clean metrics directory on startup** when using file output with multiprocess workers to avoid stale data. - ---- - -## Programmatic Access +The `thread_uncaught_exceptions_total` metric is not wired automatically. 
In +Node.js, registering a `process.on("uncaughtException")` handler overrides the +default crash behavior, which can leave the process running in a corrupted +state. Instead, wire it yourself so you control the exit policy: ```typescript -const metrics = collector.getMetrics(); +const metrics = createMetricsCollector(); -// Counter values -metrics.pollTotal.get("my_task"); // number -metrics.taskExecutionTotal.get("my_task"); // number +process.on("uncaughtException", (err) => { + metrics.recordUncaughtException(err.name || "Error"); + console.error(err); + process.exit(1); +}); -// Summary observations (raw array) -metrics.pollDurationMs.get("my_task"); // number[] -metrics.executionDurationMs.get("my_task"); // number[] +process.on("unhandledRejection", (reason) => { + const name = reason instanceof Error ? reason.name || "Error" : "Error"; + metrics.recordUncaughtException(name); + console.error(reason); + process.exit(1); +}); +``` -// Reset all metrics -collector.reset(); +### prom-client Issues -// Stop file writer and HTTP server -await collector.stop(); -``` +- `MetricsCollector` uses `await import("./MetricsServer.js")` internally. The + `.js` extension does not resolve under Jest's TypeScript transform. Test + `MetricsServer` by importing it directly, not via the `httpPort` config + option. +- When `usePromClient: true` is set but `prom-client` is not installed, the + collector falls back to the built-in text format silently. diff --git a/README.md b/README.md index 7880908a..f5479ac5 100644 --- a/README.md +++ b/README.md @@ -386,12 +386,12 @@ await handler.startWorkers(); ## Monitoring Workers -Enable Prometheus metrics with the built-in `MetricsCollector`: +Enable Prometheus metrics with the built-in metrics collector: ```typescript -import { MetricsCollector, MetricsServer, TaskHandler } from "@io-orkes/conductor-javascript"; +import { createMetricsCollector, MetricsServer, TaskHandler } from "@io-orkes/conductor-javascript"; -const metrics = new MetricsCollector(); +const metrics = createMetricsCollector(); const server = new MetricsServer(metrics, 9090); await server.start(); @@ -405,7 +405,7 @@ await handler.startWorkers(); // GET http://localhost:9090/health — {"status":"UP"} ``` -Collects 18 metric types: poll counts, execution durations, error rates, output sizes, and more — with p50/p75/p90/p95/p99 quantiles. See [METRICS.md](METRICS.md) for the full reference. +The SDK has two metric surfaces: **legacy** (default, prefixed `conductor_worker_` names, Summary type) and **canonical** (opt-in via `WORKER_CANONICAL_METRICS=true`, unprefixed names, Histogram type). See [METRICS.md](METRICS.md) for the full reference. ## Managing Workflow Executions @@ -542,7 +542,7 @@ See [examples/agentic-workflows/](examples/agentic-workflows/) for all examples. 
| Document | Description | |----------|-------------| | [SDK Development Guide](SDK_DEVELOPMENT.md) | Architecture, patterns, pitfalls, testing | -| [Metrics Reference](METRICS.md) | All 18 Prometheus metrics with descriptions | +| [Metrics Reference](METRICS.md) | Legacy and canonical Prometheus metrics with migration guide | | [Breaking Changes](BREAKING_CHANGES.md) | v3.x migration guide | | [Workflow Management](docs/api-reference/workflow-executor.md) | Start, pause, resume, terminate, retry, search, signal | | [Task Management](docs/api-reference/task-client.md) | Task operations, logs, queue management | diff --git a/SDK_COMPARISON.md b/SDK_COMPARISON.md index 881800bc..a2fdbfd6 100644 --- a/SDK_COMPARISON.md +++ b/SDK_COMPARISON.md @@ -4,7 +4,7 @@ This document provides a detailed comparison between the **Python SDK** (golden reference, `conductor-python`) and the **JavaScript SDK** (`@io-orkes/conductor-javascript` v3.0.0), covering API surface, architecture, worker system, and feature parity. -**Verdict Summary**: The JavaScript SDK has **near-complete parity** with the Python SDK (~98%). All client classes (including rate limit CRUD), worker features (TaskContext, adaptive backoff, 19 Prometheus metrics with quantiles and optional prom-client, decorator-based JSON schema generation), workflow DSL (`ConductorWorkflow` with `toSubWorkflowTask()`), and all 34 task type builders (including 13 LLM/AI + 2 MCP) are implemented. +**Verdict Summary**: The JavaScript SDK has **near-complete parity** with the Python SDK (~98%). All client classes (including rate limit CRUD), worker features (TaskContext, adaptive backoff, Prometheus metrics with legacy and canonical modes, decorator-based JSON schema generation), workflow DSL (`ConductorWorkflow` with `toSubWorkflowTask()`), and all 34 task type builders (including 13 LLM/AI + 2 MCP) are implemented. --- @@ -580,22 +580,23 @@ Features: #### Metrics / Observability +Both SDKs support legacy and canonical metric surfaces via `WORKER_CANONICAL_METRICS`. See [METRICS.md](METRICS.md) for the JavaScript SDK catalog. 
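+
+For example, on the JavaScript side the collector is a single factory call,
+with the mode decided by the environment at construction time (a sketch;
+standard client setup assumed):
+
+```typescript
+import {
+  createMetricsCollector,
+  orkesConductorClient,
+  TaskHandler,
+} from "@io-orkes/conductor-javascript";
+
+const client = await orkesConductorClient();
+// WORKER_CANONICAL_METRICS=true selects the canonical collector;
+// unset (or any other value) selects the legacy collector.
+const metrics = createMetricsCollector();
+const handler = new TaskHandler({ client, eventListeners: [metrics] });
+await handler.startWorkers();
+```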
+ | # | Feature | Python | JavaScript | Status | |---|---------|--------|-----------|--------| -| 1 | MetricsCollector | Yes (19 metrics) | Yes (19 metrics) | Full | +| 1 | Legacy / canonical collectors | `create_metrics_collector()` | `createMetricsCollector()` | Full | | 2 | HTTP `/metrics` endpoint | Yes | `MetricsServer` | Full | | 3 | HTTP `/health` endpoint | Yes | `MetricsServer` | Full | | 4 | Prometheus text format | Yes | `toPrometheusText()` | Full | | 5 | File-based metrics | Yes (`.prom` files) | Yes (`filePath` config option) | Full | -| 6 | API request metrics | `http_api_client_request` | `recordApiRequestTime()` | Full | -| 7 | Queue full metric | `task_execution_queue_full` | `recordTaskExecutionQueueFull()` | Full | -| 8 | Uncaught exception metric | `thread_uncaught_exceptions` | `recordUncaughtException()` | Full | -| 9 | Workflow start error | `workflow_start_error` | `recordWorkflowStartError()` | Full | -| 10 | External payload used | `external_payload_used` | `recordExternalPayloadUsed()` | Full | -| 11 | Worker restart metric | `worker_restart` | `recordWorkerRestart()` | Full | -| 12 | Quantile calculation (p50-p99) | Yes (sliding window 1000) | Yes (sliding window, configurable) | Full | -| 13 | `prometheus_client` integration | Yes (native prom-client) | Optional `prom-client` integration via `usePromClient: true` + `PrometheusRegistry` | Full (optional peer dep) | -| 14 | Auto-start via `httpPort` config | Yes | Yes | Full | +| 6 | API request metrics | Yes | `recordApiRequestTime()` | Full | +| 7 | Queue full metric | Yes | `recordTaskExecutionQueueFull()` | Full | +| 8 | Uncaught exception metric | Yes | `recordUncaughtException()` | Full | +| 9 | Workflow start error | Yes | `recordWorkflowStartError()` | Full | +| 10 | External payload used | Yes | `recordExternalPayloadUsed()` | Full | +| 11 | Worker restart metric | Yes (Python-only, multi-process) | Legacy only (single-process model) | Full | +| 12 | `prometheus_client` / `prom-client` | Yes (native) | Optional via `usePromClient: true` | Full | +| 13 | Auto-start via `httpPort` config | Yes | Yes | Full | #### JSON Schema Generation @@ -692,7 +693,7 @@ Both SDKs support identical environment variable formats for worker configuratio | **Worker system** | **98%** | TaskContext, adaptive backoff, auth backoff, events, schema | | **Task type builders** | **100%** | 34/34 builders including LLM, MCP, HTTP Poll, Webhook, GetDocument | | **Workflow DSL** | **95%** | Full `ConductorWorkflow` with `toSubWorkflowTask()`. Missing only `__call__` (language limitation) | -| **Metrics/Observability** | **98%** | 19 metrics, quantiles (p50-p99), HTTP + file export, optional prom-client integration | +| **Metrics/Observability** | **98%** | Legacy + canonical modes, HTTP + file export, optional prom-client integration | | **AI module** | **95%** | All LLM task builders + `LLMProvider`/`VectorDB` enums + `IntegrationConfig` types. Missing only `AIOrchestrator` | | **JSON schema** | **95%** | `jsonSchema()` declarative helper + `@schemaField()` decorator with `reflect-metadata` type inference | | **Scheduling** | **100%** | Full parity including tags | @@ -712,7 +713,7 @@ Both SDKs support identical environment variable formats for worker configuratio 7. **Worker configuration hierarchy**: Identical env var format and precedence 8. **`TaskHandler` orchestrator**: Same architecture as Python 9. **TaskContext**: Full async-local context with `addLog()`, `setCallbackAfter()` -10. 
**Prometheus metrics**: 19 metrics with quantiles (p50-p99), `MetricsServer` (HTTP + file export), optional `prom-client` integration +10. **Prometheus metrics**: Legacy and canonical modes with `createMetricsCollector()`, `MetricsServer` (HTTP + file export), optional `prom-client` integration 11. **JSON schema generation**: `jsonSchema()` declarative helper + `@schemaField()` decorator with `reflect-metadata` runtime type inference + `inputType`/`outputType` on `@worker` 12. **AI types**: `LLMProvider`, `VectorDB` enums + typed `IntegrationConfig` + `withPromptVariable()` helpers 13. **OpenAPI-generated types**: Strong TypeScript types from spec diff --git a/SDK_DEVELOPMENT.md b/SDK_DEVELOPMENT.md index b40fd2a6..05bfd277 100644 --- a/SDK_DEVELOPMENT.md +++ b/SDK_DEVELOPMENT.md @@ -119,9 +119,15 @@ src/ context/ TaskContext.ts # AsyncLocalStorage-based per-task context + getTaskContext() metrics/ - MetricsCollector.ts # TaskRunnerEventsListener impl, 19 metric types, quantiles - MetricsServer.ts # HTTP server: /metrics (Prometheus) + /health (JSON) - PrometheusRegistry.ts # Optional prom-client bridge (lazy loaded) + LegacyMetricsCollector.ts # Default: prefixed names, Summary type, ms units + CanonicalMetricsCollector.ts # Opt-in: unprefixed canonical names, Histogram type, seconds + metricsFactory.ts # createMetricsCollector() reads WORKER_CANONICAL_METRICS + MetricsServer.ts # HTTP server: /metrics (Prometheus) + /health (JSON) + MetricsCollectorInterface.ts # Shared interface for both collectors + PrometheusRegistry.ts # Optional prom-client bridge for legacy collector + CanonicalPrometheusRegistry.ts # Optional prom-client bridge for canonical collector + accumulators.ts # HistogramAccumulator, MultiLabelCounter, GaugeMetric + httpObserver.ts # Global singleton for API request metrics schema/ # jsonSchema(), schemaField() decorator, generateSchemaFromClass() config/ # Worker configuration resolution helpers generators/ # Legacy task generators (pre-v3, still exported for backward compat) @@ -268,15 +274,13 @@ Returns `undefined` outside task execution. All 16 methods: `getTaskId()`, `getW ### 7. Metrics +`createMetricsCollector()` reads `WORKER_CANONICAL_METRICS` and returns a legacy or canonical collector. See [METRICS.md](METRICS.md) for the full catalog and migration guide. + ```typescript -const metrics = new MetricsCollector({ prefix: "my_app", slidingWindowSize: 1000 }); +const metrics = createMetricsCollector(); const handler = new TaskHandler({ client, eventListeners: [metrics], scanForDecorated: true }); await handler.startWorkers(); -// Programmatic access -const m = metrics.getMetrics(); -console.log(m.pollTotal.get("my_task")); - // Prometheus text const text = metrics.toPrometheusText(); ``` @@ -291,7 +295,7 @@ await server.start(); await server.stop(); ``` -**Do not use `MetricsCollector({ httpPort })` in Jest** - it uses a dynamic import with `.js` extension that doesn't resolve under Jest's TS transform. +**Do not use `MetricsCollector({ httpPort })` in Jest** -- it uses a dynamic import with `.js` extension that doesn't resolve under Jest's TS transform. ## Known Pitfalls @@ -616,7 +620,7 @@ Node 18+ required. 
Dual ESM/CJS via `tsup` with `exports` field in `package.json | Workflow DSL | `ConductorWorkflow` | `ConductorWorkflow` | Fluent builder | | Worker decorator | `@worker_task` | `@worker` | TS decorator syntax | | Task context | `get_task_context()` | `getTaskContext()` | AsyncLocalStorage vs contextvars | -| Metrics | `MetricsCollector` | `MetricsCollector` | 19 metric types, quantiles | +| Metrics | `MetricsCollector` | `createMetricsCollector()` | Legacy + canonical modes, see [METRICS.md](METRICS.md) | | Metrics server | HTTP endpoint | `MetricsServer` | `/metrics` + `/health` | | Non-retryable | `NonRetryableError` | `NonRetryableException` | `FAILED_WITH_TERMINAL_ERROR` | | LLM builders | 13 builders | 13 builders | Full parity | diff --git a/WORKER_ARCHITECTURE_COMPARISON.md b/WORKER_ARCHITECTURE_COMPARISON.md index dd0701b5..528921ee 100644 --- a/WORKER_ARCHITECTURE_COMPARISON.md +++ b/WORKER_ARCHITECTURE_COMPARISON.md @@ -545,37 +545,38 @@ handler = TaskHandler( ### JavaScript: Built-in MetricsCollector -The JavaScript SDK provides a built-in `MetricsCollector` that implements `TaskRunnerEventsListener`: +The JavaScript SDK provides `createMetricsCollector()` which returns a legacy or canonical collector based on `WORKER_CANONICAL_METRICS`. See [METRICS.md](METRICS.md) for the full catalog. ```typescript -import { MetricsCollector, TaskHandler } from "@io-orkes/conductor-javascript/worker"; +import { createMetricsCollector, MetricsServer, TaskHandler } from "@io-orkes/conductor-javascript"; -const metrics = new MetricsCollector(); +const metrics = createMetricsCollector(); +const server = new MetricsServer(metrics, 9090); +await server.start(); const handler = new TaskHandler({ client, eventListeners: [metrics], }); -handler.startWorkers(); - -// Read metrics -const snapshot = metrics.getMetrics(); -console.log("Poll total:", snapshot.pollTotal); -console.log("Execution durations:", snapshot.executionDurationMs); +await handler.startWorkers(); +// GET http://localhost:9090/metrics -> Prometheus text +// GET http://localhost:9090/health -> {"status":"UP"} ``` ### Comparison | Feature | Python | JavaScript | |---------|--------|-----------| -| Built-in metrics collector | Yes (`MetricsCollector`) | Yes (`MetricsCollector`) | Parity | -| HTTP metrics endpoint | Yes (`/metrics`, `/health`) | No (use prom-client separately) | Python richer | -| File-based metrics | Yes (`.prom` files) | No | Python richer | +| Built-in metrics collector | Yes (`create_metrics_collector()`) | Yes (`createMetricsCollector()`) | Parity | +| Legacy / canonical modes | Yes (`WORKER_CANONICAL_METRICS`) | Yes (`WORKER_CANONICAL_METRICS`) | Parity | +| HTTP metrics endpoint | Yes (`/metrics`, `/health`) | Yes (`MetricsServer`) | Parity | +| File-based metrics | Yes (`.prom` files) | Yes (`filePath` config) | Parity | | Multiprocess aggregation | Yes (SQLite) | N/A (single process) | N/A | -| API request metrics | Yes (`http_api_client_request`) | No | Python richer | +| API request metrics | Yes | Yes (`recordApiRequestTime()`) | Parity | | Event-based architecture | Yes (MetricsCollector is listener) | Yes (MetricsCollector is listener) | Parity | | Custom metrics via events | Yes | Yes | Parity | +| Optional `prom-client` | Yes (native) | Yes (`usePromClient: true`) | Parity | --- diff --git a/harness/main.ts b/harness/main.ts index 3240ce1a..182b8d50 100644 --- a/harness/main.ts +++ b/harness/main.ts @@ -3,12 +3,13 @@ import { ConductorWorkflow, TaskHandler, simpleTask, - MetricsCollector, + 
createMetricsCollector, } from "../src/sdk"; import { MetadataResource } from "../src/open-api/generated"; import type { ConductorWorker } from "../src/sdk/clients/worker/types"; import { SimulatedTaskWorker } from "./simulatedTaskWorker"; import { WorkflowGovernor } from "./workflowGovernor"; +import { WorkflowStatusProbe } from "./workflowStatusProbe"; const WORKFLOW_NAME = "js_simulated_tasks_workflow"; @@ -93,8 +94,8 @@ async function main(): Promise { }); const metricsPort = envIntOrDefault("HARNESS_METRICS_PORT", 9991); - const metricsCollector = new MetricsCollector({ httpPort: metricsPort }); - console.log(`Prometheus metrics server started on port ${metricsPort}`); + const metricsCollector = createMetricsCollector({ httpPort: metricsPort }); + console.log(`Prometheus metrics server started on port ${metricsPort} (${metricsCollector.collectorName()} metrics)`); const handler = new TaskHandler({ client, @@ -104,15 +105,27 @@ async function main(): Promise { }); await handler.startWorkers(); + const probeRate = envIntOrDefault("HARNESS_PROBE_RATE_PER_SEC", 0); + const probe = + probeRate > 0 ? new WorkflowStatusProbe(client, probeRate) : undefined; + const governor = new WorkflowGovernor( workflowClient, WORKFLOW_NAME, workflowsPerSec, + probe ? probe.offer.bind(probe) : undefined, ); governor.start(); + if (probe) { + probe.start(); + } + const shutdown = async () => { console.log("Shutting down..."); + if (probe) { + probe.stop(); + } governor.stop(); await handler.stopWorkers(); process.exit(0); diff --git a/harness/manifests/deployment.yaml b/harness/manifests/deployment.yaml index 878d6728..95218587 100644 --- a/harness/manifests/deployment.yaml +++ b/harness/manifests/deployment.yaml @@ -40,6 +40,8 @@ spec: value: "20" - name: HARNESS_POLL_INTERVAL_MS value: "100" + - name: WORKER_CANONICAL_METRICS + value: "true" ports: - name: metrics containerPort: 9991 diff --git a/harness/workflowGovernor.ts b/harness/workflowGovernor.ts index 9865f126..4ccd0d33 100644 --- a/harness/workflowGovernor.ts +++ b/harness/workflowGovernor.ts @@ -4,16 +4,19 @@ export class WorkflowGovernor { private readonly workflowExecutor: WorkflowExecutor; private readonly workflowName: string; private readonly workflowsPerSecond: number; + private readonly idSink?: (id: string) => void; private timer: ReturnType | undefined; constructor( workflowExecutor: WorkflowExecutor, workflowName: string, workflowsPerSecond: number, + idSink?: (id: string) => void, ) { this.workflowExecutor = workflowExecutor; this.workflowName = workflowName; this.workflowsPerSecond = workflowsPerSecond; + this.idSink = idSink; } start(): void { @@ -51,10 +54,15 @@ export class WorkflowGovernor { } Promise.all(promises) - .then(() => { + .then((ids) => { console.log( `Governor: started ${this.workflowsPerSecond} workflow(s)`, ); + if (this.idSink) { + for (const id of ids) { + this.idSink(id); + } + } }) .catch((err: unknown) => { console.error( diff --git a/harness/workflowStatusProbe.ts b/harness/workflowStatusProbe.ts new file mode 100644 index 00000000..741f4438 --- /dev/null +++ b/harness/workflowStatusProbe.ts @@ -0,0 +1,97 @@ +import type { Client } from "../src/open-api/generated/client"; +import { WorkflowResource } from "../src/open-api/generated"; + +const MAX_TRACKED_IDS = 256; + +/** + * Exercises UUID-bearing workflow lookup endpoints so + * http_api_client_request_seconds picks up entries with + * uri=/workflow/{workflowId} and uri=/workflow/{workflowId}/tasks. 
+ * + * Default harness traffic only hits bounded, no-path-param URLs (poll/update), + * making the high-cardinality concern on the uri label invisible without this + * probe. + * + * Default off. Runs only when HARNESS_PROBE_RATE_PER_SEC > 0. + * Side-effect-free: only issues read calls (getExecutionStatus, + * getExecutionStatusTaskList). + * Self-bounded: fixed-size FIFO of workflow IDs. + */ +export class WorkflowStatusProbe { + private readonly client: Client; + private readonly callsPerSecond: number; + private readonly recentIDs: string[] = []; + private timer: ReturnType | undefined; + + constructor(client: Client, callsPerSecond: number) { + this.client = client; + this.callsPerSecond = callsPerSecond; + } + + offer(workflowId: string): void { + if (!workflowId) return; + this.recentIDs.push(workflowId); + if (this.recentIDs.length > MAX_TRACKED_IDS) { + this.recentIDs.splice(0, this.recentIDs.length - MAX_TRACKED_IDS); + } + } + + start(): void { + if (this.callsPerSecond <= 0) { + console.log( + "WorkflowStatusProbe disabled (HARNESS_PROBE_RATE_PER_SEC<=0)", + ); + return; + } + console.log( + `WorkflowStatusProbe started: rate=${this.callsPerSecond}/sec, retainedIds<=${MAX_TRACKED_IDS}`, + ); + + this.timer = setInterval(() => { + this.tick(); + }, 1000); + + if (this.timer && typeof this.timer === "object" && "unref" in this.timer) { + this.timer.unref(); + } + } + + stop(): void { + if (this.timer) { + clearInterval(this.timer); + this.timer = undefined; + } + console.log("WorkflowStatusProbe stopped"); + } + + private tick(): void { + const budget = Math.min(this.callsPerSecond, this.recentIDs.length); + if (budget === 0) return; + + const ids: string[] = []; + for (let i = 0; i < budget; i++) { + ids.push( + this.recentIDs[Math.floor(Math.random() * this.recentIDs.length)], + ); + } + + for (const id of ids) { + const call = + Math.random() < 0.5 + ? WorkflowResource.getExecutionStatus({ + client: this.client, + path: { workflowId: id }, + }) + : WorkflowResource.getExecutionStatusTaskList({ + client: this.client, + path: { workflowId: id }, + }); + + call.catch((err: unknown) => { + console.error( + `probe: ${id}: ${err instanceof Error ? 
err.message : String(err)}`, + ); + }); + } + } +} diff --git a/src/integration-tests/LeaseExtension.validation.test.ts b/src/integration-tests/LeaseExtension.validation.test.ts index 015512c8..a977613b 100644 --- a/src/integration-tests/LeaseExtension.validation.test.ts +++ b/src/integration-tests/LeaseExtension.validation.test.ts @@ -36,7 +36,6 @@ import type { Client } from "../open-api"; import { MetadataClient, WorkflowExecutor, - TaskClient, orkesConductorClient, } from "../sdk"; import { LeaseTracker } from "../sdk/clients/worker/LeaseTracker"; @@ -56,7 +55,6 @@ describe("Lease Extension — end-to-end validation", () => { let client: Client; let executor: WorkflowExecutor; let metadataClient: MetadataClient; - let taskClient: TaskClient; const logger: ConductorLogger = new DefaultLogger(); const suffix = Date.now(); @@ -67,7 +65,6 @@ describe("Lease Extension — end-to-end validation", () => { client = await orkesConductorClient(); executor = new WorkflowExecutor(client); metadataClient = new MetadataClient(client); - taskClient = new TaskClient(client); await metadataClient.registerTask({ name: taskDefName, @@ -117,7 +114,7 @@ describe("Lease Extension — end-to-end validation", () => { const deadline = Date.now() + maxWaitMs; while (Date.now() < deadline) { const wf = await executor.getWorkflow(workflowId, false); - if (terminal.has(wf.status ?? "")) return wf.status!; + if (terminal.has(wf.status ?? "")) return wf.status ?? ""; await sleep(1_000); } return "STILL_RUNNING"; @@ -132,8 +129,8 @@ describe("Lease Extension — end-to-end validation", () => { const { data: tasks1 } = await TaskResource.batchPoll({ client, path: { tasktype: taskDefName }, query: { workerid: "val-worker-no-lease", count: 1, timeout: 200 } }); const [task] = tasks1 ?? []; expect(task).toBeDefined(); - const taskId1 = task.taskId!; - const wfId1 = task.workflowInstanceId!; + const taskId1 = task.taskId ?? ""; + const wfId1 = task.workflowInstanceId ?? ""; console.log(` Task polled: ${taskId1}`); console.log(` No LeaseTracker created — zero extendLease calls will be made`); @@ -177,8 +174,8 @@ describe("Lease Extension — end-to-end validation", () => { const { data: tasks2 } = await TaskResource.batchPoll({ client, path: { tasktype: taskDefName }, query: { workerid: "val-worker-with-lease", count: 1, timeout: 200 } }); const [task] = tasks2 ?? []; expect(task).toBeDefined(); - const taskId2 = task.taskId!; - const wfId2 = task.workflowInstanceId!; + const taskId2 = task.taskId ?? ""; + const wfId2 = task.workflowInstanceId ?? 
""; console.log(` Task polled: ${taskId2}`); // Track heartbeat calls via a spy that ALSO sends the real heartbeat diff --git a/src/sdk/builders/tasks/pullWorkflowMessages.ts b/src/sdk/builders/tasks/pullWorkflowMessages.ts index e4b2ebae..e0bcb3c3 100644 --- a/src/sdk/builders/tasks/pullWorkflowMessages.ts +++ b/src/sdk/builders/tasks/pullWorkflowMessages.ts @@ -16,7 +16,7 @@ import { TaskType, PullWorkflowMessagesTaskDef } from "../../../open-api"; */ export const pullWorkflowMessages = ( taskReferenceName: string, - batchSize: number = 1, + batchSize = 1, optional?: boolean ): PullWorkflowMessagesTaskDef => ({ name: taskReferenceName, diff --git a/src/sdk/clients/worker/Poller.ts b/src/sdk/clients/worker/Poller.ts index a7fb1787..414552ac 100644 --- a/src/sdk/clients/worker/Poller.ts +++ b/src/sdk/clients/worker/Poller.ts @@ -189,6 +189,7 @@ export class Poller { this.logger.debug( `Worker ${this._pollerId} is paused, skipping poll` ); + this.options.onPaused?.(); await this.sleep( this.options.pollInterval ?? DEFAULT_POLL_INTERVAL ); diff --git a/src/sdk/clients/worker/TaskRunner.ts b/src/sdk/clients/worker/TaskRunner.ts index a836ad5b..5be29c83 100644 --- a/src/sdk/clients/worker/TaskRunner.ts +++ b/src/sdk/clients/worker/TaskRunner.ts @@ -96,6 +96,12 @@ export class TaskRunner { { concurrency: worker.concurrency ?? this.options.concurrency, pollInterval: worker.pollInterval ?? this.options.pollInterval, + onPaused: () => { + void this.eventDispatcher.publishTaskPaused({ + taskType: this.worker.taskDefName, + timestamp: new Date(), + }); + }, }, this.logger ); @@ -221,15 +227,16 @@ export class TaskRunner { const { workerID } = this.options; let retryCount = 0; let lastError: Error | null = null; + let lastAttemptDurationMs = 0; while (retryCount < this.maxRetries) { + const updateStart = Date.now(); try { if (process.env.CI) { console.log( `[TaskRunner] Submitting task result taskId=${taskResult.taskId} workflowId=${taskResult.workflowInstanceId} taskType=${this.worker.taskDefName} attempt=${retryCount + 1}/${this.maxRetries}` ); } - const updateStart = Date.now(); if (TaskRunner.updateV2Available === false) { // Already detected a legacy server — skip the probe, call legacy directly. @@ -328,6 +335,7 @@ export class TaskRunner { }); return nextTask ?? undefined; } catch (error: unknown) { + lastAttemptDurationMs = Date.now() - updateStart; lastError = error as Error; this.errorHandler(lastError, task); this.logger.error( @@ -358,6 +366,7 @@ export class TaskRunner { workerId: workerID, workflowInstanceId: taskResult.workflowInstanceId, cause: lastError ?? 
new Error("Task update failed after all retries"), + durationMs: lastAttemptDurationMs, retryCount, taskResult, timestamp: new Date(), diff --git a/src/sdk/clients/worker/__tests__/LeaseTracker.test.ts b/src/sdk/clients/worker/__tests__/LeaseTracker.test.ts index 86cf4e3e..73b56823 100644 --- a/src/sdk/clients/worker/__tests__/LeaseTracker.test.ts +++ b/src/sdk/clients/worker/__tests__/LeaseTracker.test.ts @@ -1,5 +1,5 @@ -import { LeaseTracker, LeaseInfo } from "@/sdk/clients/worker/LeaseTracker"; -import { LEASE_EXTEND_DURATION_FACTOR, LEASE_EXTEND_RETRY_COUNT, HEARTBEAT_RETRY_DELAY_MS } from "@/sdk/clients/worker/constants"; +import { LeaseTracker } from "@/sdk/clients/worker/LeaseTracker"; +import { LEASE_EXTEND_RETRY_COUNT, HEARTBEAT_RETRY_DELAY_MS } from "@/sdk/clients/worker/constants"; import { afterEach, beforeEach, describe, expect, jest, test } from "@jest/globals"; import type { Task } from "@open-api/index"; import { mockLogger } from "@test-utils/mockLogger"; diff --git a/src/sdk/clients/worker/events/EventDispatcher.ts b/src/sdk/clients/worker/events/EventDispatcher.ts index e6e1557a..16a2cf3c 100644 --- a/src/sdk/clients/worker/events/EventDispatcher.ts +++ b/src/sdk/clients/worker/events/EventDispatcher.ts @@ -9,6 +9,7 @@ import type { TaskExecutionFailure, TaskUpdateCompleted, TaskUpdateFailure, + TaskPaused, } from "./types"; /** @@ -60,6 +61,11 @@ export interface TaskRunnerEventsListener { * This is a CRITICAL event that may require operational intervention. */ onTaskUpdateFailure?(event: TaskUpdateFailure): void | Promise; + + /** + * Called when a poll cycle is skipped because the worker is paused. + */ + onTaskPaused?(event: TaskPaused): void | Promise; } /** @@ -159,6 +165,13 @@ export class EventDispatcher { await this.publishEvent("onTaskUpdateFailure", event); } + /** + * Publish a TaskPaused event. + */ + async publishTaskPaused(event: TaskPaused): Promise { + await this.publishEvent("onTaskPaused", event); + } + /** * Internal method to publish events to all registered listeners. * Listener failures are caught and logged to prevent affecting task execution. 
diff --git a/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts b/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts index bc2f2dc6..c8b31634 100644 --- a/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts +++ b/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts @@ -9,6 +9,7 @@ import type { TaskExecutionFailure, TaskUpdateCompleted, TaskUpdateFailure, + TaskPaused, } from "../types"; describe("EventDispatcher", () => { @@ -143,6 +144,7 @@ describe("EventDispatcher", () => { onTaskExecutionFailure: jest.fn<() => void>(), onTaskUpdateCompleted: jest.fn<() => void>(), onTaskUpdateFailure: jest.fn<() => void>(), + onTaskPaused: jest.fn<() => void>(), }; dispatcher.register(listener); @@ -225,12 +227,21 @@ describe("EventDispatcher", () => { workerId: "worker-1", workflowInstanceId: "workflow-1", cause: new Error("Update failed"), + durationMs: 500, retryCount: 4, taskResult: { status: "COMPLETED" }, timestamp: new Date(), }; await dispatcher.publishTaskUpdateFailure(updateFailure); expect(listener.onTaskUpdateFailure).toHaveBeenCalledWith(updateFailure); + + // Test TaskPaused + const taskPaused: TaskPaused = { + taskType: "test-task", + timestamp: new Date(), + }; + await dispatcher.publishTaskPaused(taskPaused); + expect(listener.onTaskPaused).toHaveBeenCalledWith(taskPaused); }); test("should have zero overhead when no listeners registered", async () => { diff --git a/src/sdk/clients/worker/events/types.ts b/src/sdk/clients/worker/events/types.ts index 7f12b2bf..4bd5306c 100644 --- a/src/sdk/clients/worker/events/types.ts +++ b/src/sdk/clients/worker/events/types.ts @@ -114,6 +114,8 @@ export interface TaskUpdateFailure extends TaskRunnerEvent { workflowInstanceId?: string; /** The error that caused the final update failure */ cause: Error; + /** Time taken for the last update attempt in milliseconds */ + durationMs: number; /** Number of retry attempts made */ retryCount: number; /** The TaskResult object that failed to update (for recovery/logging) */ @@ -134,6 +136,14 @@ export interface TaskUpdateCompleted extends TaskRunnerEvent { durationMs: number; } +/** + * Event published when a poll cycle is skipped because the worker is paused. + */ +// eslint-disable-next-line @typescript-eslint/no-empty-object-type +export interface TaskPaused extends TaskRunnerEvent { + // No additional fields — taskType is inherited from TaskRunnerEvent. +} + /** * Union type of all task runner events. 
*/ @@ -145,4 +155,5 @@ export type TaskRunnerEventType = | TaskExecutionCompleted | TaskExecutionFailure | TaskUpdateCompleted - | TaskUpdateFailure; + | TaskUpdateFailure + | TaskPaused; diff --git a/src/sdk/clients/worker/types.ts b/src/sdk/clients/worker/types.ts index b56eed5c..3aa0db61 100644 --- a/src/sdk/clients/worker/types.ts +++ b/src/sdk/clients/worker/types.ts @@ -86,6 +86,8 @@ export interface PollerOptions { adaptiveBackoff?: boolean; /** Whether this poller is paused (default: false) */ paused?: boolean; + /** Callback invoked each time a poll cycle is skipped because the poller is paused */ + onPaused?: () => void; } /** diff --git a/src/sdk/clients/workflow/WorkflowExecutor.ts b/src/sdk/clients/workflow/WorkflowExecutor.ts index a34188e2..c527050e 100644 --- a/src/sdk/clients/workflow/WorkflowExecutor.ts +++ b/src/sdk/clients/workflow/WorkflowExecutor.ts @@ -30,6 +30,7 @@ import { enhanceSignalResponse } from "./helpers/enhanceSignalResponse"; import { reverseFind } from "./helpers/reverseFind"; import { isCompletedTaskMatchingType } from "./helpers/isCompletedTaskMatchingType"; import { RETRY_TIME_IN_MILLISECONDS } from "./constants"; +import { getHttpMetricsObserver } from "../../worker/metrics/httpObserver"; export class WorkflowExecutor { public readonly _client: Client; @@ -71,6 +72,23 @@ export class WorkflowExecutor { public async startWorkflow( workflowRequest: StartWorkflowRequest ): Promise<string> { + const observer = getHttpMetricsObserver(); + if (observer) { + try { + const inputBytes = workflowRequest.input + ? new TextEncoder().encode(JSON.stringify(workflowRequest.input)).length + : 0; + observer.recordWorkflowInputSize( + workflowRequest.name ?? "", + inputBytes, + workflowRequest.version != null + ? String(workflowRequest.version) + : undefined, + ); + } catch { + // Best-effort — don't let metrics break workflow start. + } + } try { const { data } = await WorkflowResource.startWorkflow({ body: workflowRequest, @@ -80,6 +98,16 @@ return data; } catch (error: unknown) { + if (observer) { + const excName = + error instanceof Error + ? error.name || error.constructor?.name || "Error" + : "Error"; + observer.recordWorkflowStartError( + workflowRequest.name ??
"", + excName, + ); + } handleSdkError(error, "Failed to start workflow"); } } diff --git a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts index 76f73f9b..84277b67 100644 --- a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts +++ b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts @@ -1,5 +1,6 @@ -import { jest, expect, describe, it, beforeEach } from "@jest/globals"; +import { jest, expect, describe, it, beforeEach, afterEach } from "@jest/globals"; import { retryFetch, wrapFetchWithRetry, applyTimeout } from "../fetchWithRetry"; +import * as httpObserver from "@/sdk/worker/metrics/httpObserver"; const createMockResponse = (status: number, body = ""): Response => new Response(body, { status, statusText: `Status ${status}` }); @@ -180,6 +181,118 @@ describe("fetchWithRetry", () => { }); }); + // ─── Server error (502/503/504) retry ─────────────────────────────── + + describe("server error (502/503/504) retry", () => { + it("should retry 502 and succeed on next attempt", async () => { + mockFetch + .mockResolvedValueOnce(createMockResponse(502, "Bad Gateway")) + .mockResolvedValueOnce(createMockResponse(200, "ok")); + + const result = await retryFetch("http://test.com", {}, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + }); + + expect(result.status).toBe(200); + expect(mockFetch).toHaveBeenCalledTimes(2); + }); + + it("should retry 503 and succeed on next attempt", async () => { + mockFetch + .mockResolvedValueOnce(createMockResponse(503, "Service Unavailable")) + .mockResolvedValueOnce(createMockResponse(200, "ok")); + + const result = await retryFetch("http://test.com", {}, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + }); + + expect(result.status).toBe(200); + expect(mockFetch).toHaveBeenCalledTimes(2); + }); + + it("should retry 504 and succeed on next attempt", async () => { + mockFetch + .mockResolvedValueOnce(createMockResponse(504, "Gateway Timeout")) + .mockResolvedValueOnce(createMockResponse(200, "ok")); + + const result = await retryFetch("http://test.com", {}, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + }); + + expect(result.status).toBe(200); + expect(mockFetch).toHaveBeenCalledTimes(2); + }); + + it("should exhaust retries and return last 5xx response", async () => { + mockFetch.mockResolvedValue(createMockResponse(502, "Bad Gateway")); + + const result = await retryFetch("http://test.com", {}, mockFetch, { + maxTransportRetries: 2, + initialRetryDelay: 1, + }); + + expect(result.status).toBe(502); + // 1 initial + 2 retries = 3 + expect(mockFetch).toHaveBeenCalledTimes(3); + }); + + it("should NOT retry 502 for POST requests (non-idempotent)", async () => { + mockFetch.mockResolvedValue(createMockResponse(502, "Bad Gateway")); + + const result = await retryFetch("http://test.com", { method: "POST" }, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + }); + + expect(result.status).toBe(502); + expect(mockFetch).toHaveBeenCalledTimes(1); + }); + + it("should NOT retry 503 for PATCH requests (non-idempotent)", async () => { + mockFetch.mockResolvedValue(createMockResponse(503, "Service Unavailable")); + + const result = await retryFetch("http://test.com", { method: "PATCH" }, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + }); + + expect(result.status).toBe(503); + expect(mockFetch).toHaveBeenCalledTimes(1); + }); + + it("should retry 502 for PUT 
requests (idempotent)", async () => { + mockFetch + .mockResolvedValueOnce(createMockResponse(502, "Bad Gateway")) + .mockResolvedValueOnce(createMockResponse(200, "ok")); + + const result = await retryFetch("http://test.com", { method: "PUT" }, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + }); + + expect(result.status).toBe(200); + expect(mockFetch).toHaveBeenCalledTimes(2); + }); + + it("should handle transport error then 502 then success", async () => { + mockFetch + .mockRejectedValueOnce(new Error("ECONNRESET")) + .mockResolvedValueOnce(createMockResponse(502, "Bad Gateway")) + .mockResolvedValueOnce(createMockResponse(200, "ok")); + + const result = await retryFetch("http://test.com", {}, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + }); + + expect(result.status).toBe(200); + expect(mockFetch).toHaveBeenCalledTimes(3); + }); + }); + // ─── Auth failure (401/403) retry ────────────────────────────────── describe("auth failure (401/403) retry", () => { @@ -525,4 +638,107 @@ describe("fetchWithRetry", () => { expect(result.status).toBe(200); }); }); + + // ─── wrapFetchWithRetry metrics recording ─────────────────────────── + + describe("wrapFetchWithRetry metrics", () => { + const mockRecordApiRequestTime = jest.fn< + (m: string, u: string, s: string, d: number) => void + >(); + + beforeEach(() => { + mockRecordApiRequestTime.mockClear(); + httpObserver.setHttpMetricsObserver({ + recordApiRequestTime: mockRecordApiRequestTime, + recordWorkflowInputSize: jest.fn(), + recordWorkflowStartError: jest.fn(), + }); + }); + + afterEach(() => { + httpObserver.setHttpMetricsObserver(undefined); + }); + + it("should record API request time on successful response", async () => { + mockFetch.mockResolvedValue(createMockResponse(200)); + + const wrappedFetch = wrapFetchWithRetry(mockFetch); + await wrappedFetch("http://test.com/api/tasks", { method: "POST" }); + + expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); + const [method, uri, status, duration] = + mockRecordApiRequestTime.mock.calls[0]; + expect(method).toBe("POST"); + expect(uri).toBe("/api/tasks"); + expect(status).toBe("200"); + expect(duration).toBeGreaterThanOrEqual(0); + }); + + it("should record status '0' on network failure", async () => { + mockFetch.mockRejectedValue(new Error("ECONNRESET")); + + const wrappedFetch = wrapFetchWithRetry(mockFetch, { + maxTransportRetries: 0, + }); + + await expect( + wrappedFetch("http://test.com/api/workflow") + ).rejects.toThrow("ECONNRESET"); + + expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); + const [method, uri, status] = + mockRecordApiRequestTime.mock.calls[0]; + expect(method).toBe("GET"); + expect(uri).toBe("/api/workflow"); + expect(status).toBe("0"); + }); + + it("should extract method and URI from string URL", async () => { + mockFetch.mockResolvedValue(createMockResponse(201)); + + const wrappedFetch = wrapFetchWithRetry(mockFetch); + await wrappedFetch("http://example.com/tasks/123", { method: "PUT" }); + + expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); + const [method, uri] = mockRecordApiRequestTime.mock.calls[0]; + expect(method).toBe("PUT"); + expect(uri).toBe("/tasks/123"); + }); + + it("should extract method and URI from Request object", async () => { + mockFetch.mockResolvedValue(createMockResponse(200)); + + const wrappedFetch = wrapFetchWithRetry(mockFetch); + const request = new Request("http://example.com/api/metadata", { + method: "DELETE", + }); + await wrappedFetch(request); + + 
expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); + const [method, uri] = mockRecordApiRequestTime.mock.calls[0]; + expect(method).toBe("DELETE"); + expect(uri).toBe("/api/metadata"); + }); + + it("should not break fetch when no observer is registered", async () => { + httpObserver.setHttpMetricsObserver(undefined); + + mockFetch.mockResolvedValue(createMockResponse(200)); + const wrappedFetch = wrapFetchWithRetry(mockFetch); + const result = await wrappedFetch("http://test.com"); + + expect(result.status).toBe(200); + }); + + it("should default method to GET when init has no method", async () => { + mockFetch.mockResolvedValue(createMockResponse(200)); + + const wrappedFetch = wrapFetchWithRetry(mockFetch); + await wrappedFetch("http://test.com/path"); + + expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); + const [method] = mockRecordApiRequestTime.mock.calls[0]; + expect(method).toBe("GET"); + }); + }); }); diff --git a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts index 733313cd..693289df 100644 --- a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts +++ b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts @@ -1,3 +1,5 @@ +import { getHttpMetricsObserver } from "../../worker/metrics/httpObserver"; + type Input = Parameters<typeof fetch>[0]; type Init = Parameters<typeof fetch>[1]; @@ -95,6 +97,8 @@ const withJitter = (delayMs: number): number => { return Math.max(0, Math.round(delayMs + jitter)); }; +const IDEMPOTENT_METHODS = new Set(["GET", "HEAD", "OPTIONS", "PUT", "DELETE"]); + export const retryFetch = async ( input: Input, init: Init, @@ -157,6 +161,23 @@ return rateLimitResponse; } + // Gateway error retry (502, 503, 504) -- only for idempotent methods to + // avoid duplicate side effects when the upstream may have processed the request. + if (response.status >= 502 && response.status <= 504) { + const reqMethod = (input instanceof Request + ? input.method + : init?.method ?? "GET" + ).toUpperCase(); + if (IDEMPOTENT_METHODS.has(reqMethod) && transportAttempt < maxTransportRetries) { + lastError = new Error(`Server error: HTTP ${response.status}`); + await new Promise((resolve) => + setTimeout(resolve, withJitter(initialRetryDelay * (transportAttempt + 1))) + ); + continue; + } + return response; + } + // Auth failure retry (401/403) - only refresh+retry when the error is a token // problem (EXPIRED_TOKEN or INVALID_TOKEN). Permission errors should propagate // immediately without wasting a token refresh + retry round-trip. @@ -186,9 +207,47 @@ export const wrapFetchWithRetry = ( fetchFn: typeof fetch, - options?: RetryFetchOptions + options?: RetryFetchOptions, ): typeof fetch => { - return (input: Input, init?: Init): Promise<Response> => { - return retryFetch(input, init, fetchFn, options); + return async (input: Input, init?: Init): Promise<Response> => { + const start = performance.now(); + let method = "GET"; + let uri = ""; + + try { + if (input instanceof Request) { + method = input.method; + uri = new URL(input.url).pathname; + } else if (typeof input === "string") { + method = init?.method ?? "GET"; + try { uri = new URL(input).pathname; } catch { uri = input; } + } else { + method = init?.method ??
"GET"; + try { uri = input.pathname; } catch { uri = String(input); } + } + } catch { + // Best-effort URI extraction + } + + try { + const response = await retryFetch(input, init, fetchFn, options); + const durationMs = performance.now() - start; + getHttpMetricsObserver()?.recordApiRequestTime( + method, + uri, + String(response.status), + durationMs, + ); + return response; + } catch (error) { + const durationMs = performance.now() - start; + getHttpMetricsObserver()?.recordApiRequestTime( + method, + uri, + "0", + durationMs, + ); + throw error; + } }; }; diff --git a/src/sdk/worker/core/TaskHandler.ts b/src/sdk/worker/core/TaskHandler.ts index 2c0dd083..5bfcda5f 100644 --- a/src/sdk/worker/core/TaskHandler.ts +++ b/src/sdk/worker/core/TaskHandler.ts @@ -140,6 +140,7 @@ export class TaskHandler { private restartAttempts = new Map(); // runner index → attempt count private healthMonitorConfig: HealthMonitorConfig; + /** * Create a TaskHandler instance with async module imports. * Use this instead of `new TaskHandler()` when using `importModules`. diff --git a/src/sdk/worker/index.ts b/src/sdk/worker/index.ts index 35e1fed9..7605d589 100644 --- a/src/sdk/worker/index.ts +++ b/src/sdk/worker/index.ts @@ -17,7 +17,11 @@ export { TaskContext, getTaskContext } from "./context"; // Metrics export { MetricsCollector, + LegacyMetricsCollector, + CanonicalMetricsCollector, + createMetricsCollector, MetricsServer, + type MetricsCollectorInterface, type MetricsCollectorConfig, type WorkerMetrics, } from "./metrics"; diff --git a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts new file mode 100644 index 00000000..6757bf08 --- /dev/null +++ b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts @@ -0,0 +1,523 @@ +import type { + PollStarted, + PollCompleted, + PollFailure, + TaskExecutionStarted, + TaskExecutionCompleted, + TaskExecutionFailure, + TaskUpdateCompleted, + TaskUpdateFailure, + TaskPaused, +} from "../../clients/worker/events"; +import type { MetricsCollectorInterface } from "./MetricsCollectorInterface"; +import type { MetricsCollectorConfig } from "./LegacyMetricsCollector"; +import { setHttpMetricsObserver } from "./httpObserver"; +import { + HistogramAccumulator, + MultiLabelCounter, + GaugeMetric, + TIME_BUCKETS, + SIZE_BUCKETS, + exceptionLabel, +} from "./accumulators"; + +interface CanonicalMetricState { + // Counters (12) + taskPollTotal: MultiLabelCounter; + taskExecutionStartedTotal: MultiLabelCounter; + taskPollErrorTotal: MultiLabelCounter; + taskExecuteErrorTotal: MultiLabelCounter; + taskUpdateErrorTotal: MultiLabelCounter; + taskAckErrorTotal: MultiLabelCounter; + taskAckFailedTotal: MultiLabelCounter; + taskExecutionQueueFullTotal: MultiLabelCounter; + taskPausedTotal: MultiLabelCounter; + threadUncaughtExceptionsTotal: MultiLabelCounter; + externalPayloadUsedTotal: MultiLabelCounter; + workflowStartErrorTotal: MultiLabelCounter; + + // Time histograms (4) — seconds + taskPollTimeSeconds: HistogramAccumulator; + taskExecuteTimeSeconds: HistogramAccumulator; + taskUpdateTimeSeconds: HistogramAccumulator; + httpApiClientRequestSeconds: HistogramAccumulator; + + // Size histograms (2) — bytes + taskResultSizeBytes: HistogramAccumulator; + workflowInputSizeBytes: HistogramAccumulator; + + // Gauge (1) + activeWorkers: GaugeMetric; +} + +/** + * Canonical metrics collector. 
+ * + * Emits unprefixed Prometheus metrics with camelCase labels, second-based + * time units, and Histogram type for distributions, per the cross-SDK + * canonical metric catalog. + * + * Selected when WORKER_CANONICAL_METRICS=true. + */ +export class CanonicalMetricsCollector implements MetricsCollectorInterface { + private state: CanonicalMetricState; + private _server?: import("./MetricsServer.js").MetricsServer; + private _fileTimer?: ReturnType<typeof setInterval>; + private _promRegistry?: import("./CanonicalPrometheusRegistry.js").CanonicalPrometheusRegistry; + private readonly _usePromClient: boolean; + + constructor(config?: MetricsCollectorConfig) { + this.state = this.createEmpty(); + this._usePromClient = config?.usePromClient ?? false; + if (this._usePromClient) { + void this.initPromClient(); + } + if (config?.httpPort) { + void this.startServer(config.httpPort); + } + if (config?.filePath) { + this.startFileWriter( + config.filePath, + config.fileWriteIntervalMs ?? 5000, + ); + } + } + + private async initPromClient(): Promise<void> { + const { CanonicalPrometheusRegistry } = await import( + "./CanonicalPrometheusRegistry.js" + ); + this._promRegistry = new CanonicalPrometheusRegistry(); + await this._promRegistry.initialize(); + } + + private async startServer(port: number): Promise<void> { + const { MetricsServer } = await import("./MetricsServer.js"); + this._server = new MetricsServer(this, port); + await this._server.start(); + } + + private startFileWriter(filePath: string, intervalMs: number): void { + const doWrite = async () => { + try { + const { writeFile } = await import("node:fs/promises"); + await writeFile(filePath, this.toPrometheusText(), "utf-8"); + } catch { + // Silently ignore file write errors + } + }; + void doWrite(); + this._fileTimer = setInterval(doWrite, intervalMs); + if (typeof this._fileTimer === "object" && "unref" in this._fileTimer) { + this._fileTimer.unref(); + } + } + + private createEmpty(): CanonicalMetricState { + return { + taskPollTotal: new MultiLabelCounter(), + taskExecutionStartedTotal: new MultiLabelCounter(), + taskPollErrorTotal: new MultiLabelCounter(), + taskExecuteErrorTotal: new MultiLabelCounter(), + taskUpdateErrorTotal: new MultiLabelCounter(), + taskAckErrorTotal: new MultiLabelCounter(), + taskAckFailedTotal: new MultiLabelCounter(), + taskExecutionQueueFullTotal: new MultiLabelCounter(), + taskPausedTotal: new MultiLabelCounter(), + threadUncaughtExceptionsTotal: new MultiLabelCounter(), + externalPayloadUsedTotal: new MultiLabelCounter(), + workflowStartErrorTotal: new MultiLabelCounter(), + + taskPollTimeSeconds: new HistogramAccumulator(TIME_BUCKETS), + taskExecuteTimeSeconds: new HistogramAccumulator(TIME_BUCKETS), + taskUpdateTimeSeconds: new HistogramAccumulator(TIME_BUCKETS), + httpApiClientRequestSeconds: new HistogramAccumulator(TIME_BUCKETS), + + taskResultSizeBytes: new HistogramAccumulator(SIZE_BUCKETS), + workflowInputSizeBytes: new HistogramAccumulator(SIZE_BUCKETS), + + activeWorkers: new GaugeMetric(), + }; + } + + // ── Event Listener Methods ────────────────────────────────────── + + onPollStarted(event: PollStarted): void { + this.state.taskPollTotal.increment({ taskType: event.taskType }); + this._promRegistry?.incrementCounter("task_poll_total", { + taskType: event.taskType, + }); + } + + onPollCompleted(event: PollCompleted): void { + const seconds = event.durationMs / 1000; + this.state.taskPollTimeSeconds.observe( + { taskType: event.taskType, status: "SUCCESS" }, + seconds, + ); +
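// Dual-write: the in-memory accumulator backs the built-in text renderer, while the optional prom-client registry mirrors into native output when it is loaded. +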
this._promRegistry?.observeHistogram("task_poll_time_seconds", { + taskType: event.taskType, + status: "SUCCESS", + }, seconds); + } + + onPollFailure(event: PollFailure): void { + const excName = exceptionLabel(event.cause); + this.state.taskPollErrorTotal.increment({ + taskType: event.taskType, + exception: excName, + }); + this._promRegistry?.incrementCounter("task_poll_error_total", { + taskType: event.taskType, + exception: excName, + }); + + const seconds = event.durationMs / 1000; + this.state.taskPollTimeSeconds.observe( + { taskType: event.taskType, status: "FAILURE" }, + seconds, + ); + this._promRegistry?.observeHistogram("task_poll_time_seconds", { + taskType: event.taskType, + status: "FAILURE", + }, seconds); + } + + onTaskExecutionStarted(event: TaskExecutionStarted): void { + this.state.taskExecutionStartedTotal.increment({ + taskType: event.taskType, + }); + this._promRegistry?.incrementCounter("task_execution_started_total", { + taskType: event.taskType, + }); + this.state.activeWorkers.inc({ taskType: event.taskType }); + this._promRegistry?.setGauge( + "active_workers", + { taskType: event.taskType }, + this.state.activeWorkers.getValue({ taskType: event.taskType }), + ); + } + + onTaskExecutionCompleted(event: TaskExecutionCompleted): void { + const seconds = event.durationMs / 1000; + this.state.taskExecuteTimeSeconds.observe( + { taskType: event.taskType, status: "SUCCESS" }, + seconds, + ); + this._promRegistry?.observeHistogram("task_execute_time_seconds", { + taskType: event.taskType, + status: "SUCCESS", + }, seconds); + + if (event.outputSizeBytes !== undefined) { + this.state.taskResultSizeBytes.observe( + { taskType: event.taskType }, + event.outputSizeBytes, + ); + this._promRegistry?.observeHistogram("task_result_size_bytes", { + taskType: event.taskType, + }, event.outputSizeBytes); + } + + this.state.activeWorkers.dec({ taskType: event.taskType }); + this._promRegistry?.setGauge( + "active_workers", + { taskType: event.taskType }, + Math.max(0, this.state.activeWorkers.getValue({ taskType: event.taskType })), + ); + } + + onTaskExecutionFailure(event: TaskExecutionFailure): void { + const excName = exceptionLabel(event.cause); + this.state.taskExecuteErrorTotal.increment({ + taskType: event.taskType, + exception: excName, + }); + this._promRegistry?.incrementCounter("task_execute_error_total", { + taskType: event.taskType, + exception: excName, + }); + + const seconds = event.durationMs / 1000; + this.state.taskExecuteTimeSeconds.observe( + { taskType: event.taskType, status: "FAILURE" }, + seconds, + ); + this._promRegistry?.observeHistogram("task_execute_time_seconds", { + taskType: event.taskType, + status: "FAILURE", + }, seconds); + + this.state.activeWorkers.dec({ taskType: event.taskType }); + this._promRegistry?.setGauge( + "active_workers", + { taskType: event.taskType }, + Math.max(0, this.state.activeWorkers.getValue({ taskType: event.taskType })), + ); + } + + onTaskUpdateCompleted(event: TaskUpdateCompleted): void { + const seconds = event.durationMs / 1000; + this.state.taskUpdateTimeSeconds.observe( + { taskType: event.taskType, status: "SUCCESS" }, + seconds, + ); + this._promRegistry?.observeHistogram("task_update_time_seconds", { + taskType: event.taskType, + status: "SUCCESS", + }, seconds); + } + + onTaskUpdateFailure(event: TaskUpdateFailure): void { + const excName = exceptionLabel(event.cause); + this.state.taskUpdateErrorTotal.increment({ + taskType: event.taskType, + exception: excName, + }); + 
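// Mirror the exception-labelled counter into prom-client when the optional registry is active. +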
this._promRegistry?.incrementCounter("task_update_error_total", { + taskType: event.taskType, + exception: excName, + }); + + const seconds = event.durationMs / 1000; + this.state.taskUpdateTimeSeconds.observe( + { taskType: event.taskType, status: "FAILURE" }, + seconds, + ); + this._promRegistry?.observeHistogram("task_update_time_seconds", { + taskType: event.taskType, + status: "FAILURE", + }, seconds); + } + + onTaskPaused(event: TaskPaused): void { + this.recordTaskPaused(event.taskType); + } + + // ── Direct Recording Methods ─────────────────────────────────── + + recordTaskExecutionQueueFull(taskType: string): void { + this.state.taskExecutionQueueFullTotal.increment({ taskType }); + this._promRegistry?.incrementCounter("task_execution_queue_full_total", { + taskType, + }); + } + + recordUncaughtException(exception?: string): void { + const excName = exception ?? "Error"; + this.state.threadUncaughtExceptionsTotal.increment({ + exception: excName, + }); + this._promRegistry?.incrementCounter( + "thread_uncaught_exceptions_total", + { exception: excName }, + ); + } + + recordWorkerRestart(): void { + // Noop: worker_restart_total is N/A for the JS SDK (single-process model) + } + + recordTaskPaused(taskType: string): void { + this.state.taskPausedTotal.increment({ taskType }); + this._promRegistry?.incrementCounter("task_paused_total", { taskType }); + } + + recordTaskAckError(taskType: string, exception?: string): void { + const excName = exception ?? "Error"; + this.state.taskAckErrorTotal.increment({ taskType, exception: excName }); + this._promRegistry?.incrementCounter("task_ack_error_total", { + taskType, + exception: excName, + }); + } + + recordTaskAckFailed(taskType: string): void { + this.state.taskAckFailedTotal.increment({ taskType }); + this._promRegistry?.incrementCounter("task_ack_failed_total", { + taskType, + }); + } + + recordWorkflowStartError(workflowType?: string, exception?: string): void { + const wfType = workflowType ?? ""; + const excName = exception ?? "Error"; + this.state.workflowStartErrorTotal.increment({ + workflowType: wfType, + exception: excName, + }); + this._promRegistry?.incrementCounter("workflow_start_error_total", { + workflowType: wfType, + exception: excName, + }); + } + + recordExternalPayloadUsed( + payloadType: string, + entityName?: string, + operation?: string, + ): void { + this.state.externalPayloadUsedTotal.increment({ + entityName: entityName ?? "", + operation: operation ?? "", + payloadType, + }); + this._promRegistry?.incrementCounter("external_payload_used_total", { + entityName: entityName ?? "", + operation: operation ?? "", + payloadType, + }); + } + + recordWorkflowInputSize( + workflowType: string, + sizeBytes: number, + version?: string, + ): void { + this.state.workflowInputSizeBytes.observe( + { workflowType, version: version ?? "" }, + sizeBytes, + ); + this._promRegistry?.observeHistogram("workflow_input_size_bytes", { + workflowType, + version: version ?? 
"", + }, sizeBytes); + } + + recordApiRequestTime( + method: string, + uri: string, + status: number | string, + durationMs: number, + ): void { + const statusStr = String(status); + const seconds = durationMs / 1000; + this.state.httpApiClientRequestSeconds.observe( + { method, uri, status: statusStr }, + seconds, + ); + this._promRegistry?.observeHistogram("http_api_client_request_seconds", { + method, + uri, + status: statusStr, + }, seconds); + } + + // ── Public API ────────────────────────────────────────────────── + + getMetrics(): Readonly { + return this.state; + } + + reset(): void { + this.state = this.createEmpty(); + } + + async stop(): Promise { + setHttpMetricsObserver(undefined); + if (this._fileTimer) { + clearInterval(this._fileTimer); + this._fileTimer = undefined; + } + if (this._server) { + await this._server.stop(); + this._server = undefined; + } + } + + collectorName(): string { + return "canonical"; + } + + getContentType(): string { + return ( + this._promRegistry?.contentType ?? + "text/plain; version=0.0.4; charset=utf-8" + ); + } + + async toPrometheusTextAsync(): Promise { + if (this._promRegistry?.available) { + return this._promRegistry.metrics(); + } + return this.toPrometheusText(); + } + + toPrometheusText(_prefix?: string): string { + const lines: string[] = []; + + // ── Counters ── + const counterDefs: { + name: string; + help: string; + counter: MultiLabelCounter; + }[] = [ + { name: "task_poll_total", help: "Total number of task polls", counter: this.state.taskPollTotal }, + { name: "task_execution_started_total", help: "Total number of task executions started", counter: this.state.taskExecutionStartedTotal }, + { name: "task_poll_error_total", help: "Total number of task poll errors", counter: this.state.taskPollErrorTotal }, + { name: "task_execute_error_total", help: "Total number of task execution errors", counter: this.state.taskExecuteErrorTotal }, + { name: "task_update_error_total", help: "Total number of task update errors", counter: this.state.taskUpdateErrorTotal }, + { name: "task_ack_error_total", help: "Total number of task ack errors", counter: this.state.taskAckErrorTotal }, + { name: "task_ack_failed_total", help: "Total number of task ack failures (server declined)", counter: this.state.taskAckFailedTotal }, + { name: "task_execution_queue_full_total", help: "Total number of task execution queue full events", counter: this.state.taskExecutionQueueFullTotal }, + { name: "task_paused_total", help: "Total number of task paused events", counter: this.state.taskPausedTotal }, + { name: "thread_uncaught_exceptions_total", help: "Total uncaught exceptions", counter: this.state.threadUncaughtExceptionsTotal }, + { name: "external_payload_used_total", help: "Total external payload usage", counter: this.state.externalPayloadUsedTotal }, + { name: "workflow_start_error_total", help: "Total workflow start errors", counter: this.state.workflowStartErrorTotal }, + ]; + + for (const def of counterDefs) { + const rendered = def.counter.render(def.name, def.help); + if (rendered) lines.push(rendered); + } + + // ── Time histograms ── + const timeHistogramDefs: { + name: string; + help: string; + histogram: HistogramAccumulator; + }[] = [ + { name: "task_poll_time_seconds", help: "Task poll duration in seconds", histogram: this.state.taskPollTimeSeconds }, + { name: "task_execute_time_seconds", help: "Task execution duration in seconds", histogram: this.state.taskExecuteTimeSeconds }, + { name: "task_update_time_seconds", help: "Task update duration 
in seconds", histogram: this.state.taskUpdateTimeSeconds }, + { name: "http_api_client_request_seconds", help: "HTTP API client request duration in seconds", histogram: this.state.httpApiClientRequestSeconds }, + ]; + + for (const def of timeHistogramDefs) { + const rendered = def.histogram.render(def.name, def.help); + if (rendered) lines.push(rendered); + } + + // ── Size histograms ── + const sizeHistogramDefs: { + name: string; + help: string; + histogram: HistogramAccumulator; + }[] = [ + { name: "task_result_size_bytes", help: "Task result output size in bytes", histogram: this.state.taskResultSizeBytes }, + { name: "workflow_input_size_bytes", help: "Workflow input payload size in bytes", histogram: this.state.workflowInputSizeBytes }, + ]; + + for (const def of sizeHistogramDefs) { + const rendered = def.histogram.render(def.name, def.help); + if (rendered) lines.push(rendered); + } + + // ── Gauges ── + const gaugeDefs: { + name: string; + help: string; + gauge: GaugeMetric; + }[] = [ + { name: "active_workers", help: "Number of workers actively executing tasks", gauge: this.state.activeWorkers }, + ]; + + for (const def of gaugeDefs) { + const rendered = def.gauge.render(def.name, def.help); + if (rendered) lines.push(rendered); + } + + lines.push(""); // trailing newline + return lines.join("\n"); + } +} diff --git a/src/sdk/worker/metrics/CanonicalPrometheusRegistry.ts b/src/sdk/worker/metrics/CanonicalPrometheusRegistry.ts new file mode 100644 index 00000000..c7ced41d --- /dev/null +++ b/src/sdk/worker/metrics/CanonicalPrometheusRegistry.ts @@ -0,0 +1,168 @@ +/** + * Optional prom-client adapter for canonical metrics. + * + * Registers Prometheus Counter, Histogram, and Gauge objects using the + * canonical metric names (unprefixed, camelCase labels, seconds/bytes units). + */ + +import { TIME_BUCKETS, SIZE_BUCKETS } from "./accumulators"; + +interface PromCounter { + inc(labels: Record, value?: number): void; +} +interface PromHistogram { + observe(labels: Record, value: number): void; +} +interface PromGauge { + set(labels: Record, value: number): void; +} +interface PromRegistry { + metrics(): Promise; + contentType: string; +} + +export class CanonicalPrometheusRegistry { + private _counters = new Map(); + private _histograms = new Map(); + private _gauges = new Map(); + private _registry?: PromRegistry; + private _available = false; + + async initialize(): Promise { + try { + const promClient = await import("prom-client"); + this._registry = promClient.register; + this.createMetrics(promClient); + this._available = true; + return true; + } catch { + this._available = false; + return false; + } + } + + get available(): boolean { + return this._available; + } + + get contentType(): string { + return ( + this._registry?.contentType ?? 
+ "text/plain; version=0.0.4; charset=utf-8" + ); + } + + async metrics(): Promise { + if (!this._registry) return ""; + return this._registry.metrics(); + } + + incrementCounter( + name: string, + labels: Record, + value = 1, + ): void { + this._counters.get(name)?.inc(labels, value); + } + + observeHistogram( + name: string, + labels: Record, + value: number, + ): void { + this._histograms.get(name)?.observe(labels, value); + } + + setGauge( + name: string, + labels: Record, + value: number, + ): void { + this._gauges.get(name)?.set(labels, value); + } + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + private createMetrics(promClient: any): void { + const Counter = promClient.Counter; + const Histogram = promClient.Histogram; + const Gauge = promClient.Gauge; + + const counterDefs: { + name: string; + help: string; + labels: string[]; + }[] = [ + { name: "task_poll_total", help: "Total task polls", labels: ["taskType"] }, + { name: "task_execution_started_total", help: "Total task executions started", labels: ["taskType"] }, + { name: "task_poll_error_total", help: "Total task poll errors", labels: ["taskType", "exception"] }, + { name: "task_execute_error_total", help: "Total task execution errors", labels: ["taskType", "exception"] }, + { name: "task_update_error_total", help: "Total task update errors", labels: ["taskType", "exception"] }, + { name: "task_ack_error_total", help: "Total task ack errors", labels: ["taskType", "exception"] }, + { name: "task_ack_failed_total", help: "Total task ack failures", labels: ["taskType"] }, + { name: "task_execution_queue_full_total", help: "Task execution queue full", labels: ["taskType"] }, + { name: "task_paused_total", help: "Task paused events", labels: ["taskType"] }, + { name: "thread_uncaught_exceptions_total", help: "Uncaught exceptions", labels: ["exception"] }, + { name: "external_payload_used_total", help: "External payload used", labels: ["entityName", "operation", "payloadType"] }, + { name: "workflow_start_error_total", help: "Workflow start errors", labels: ["workflowType", "exception"] }, + ]; + + for (const def of counterDefs) { + this._counters.set( + def.name, + new Counter({ + name: def.name, + help: def.help, + labelNames: def.labels, + }), + ); + } + + const timeBuckets = [...TIME_BUCKETS]; + const sizeBuckets = [...SIZE_BUCKETS]; + + const histogramDefs: { + name: string; + help: string; + labels: string[]; + buckets: number[]; + }[] = [ + { name: "task_poll_time_seconds", help: "Task poll duration (seconds)", labels: ["taskType", "status"], buckets: timeBuckets }, + { name: "task_execute_time_seconds", help: "Task execution duration (seconds)", labels: ["taskType", "status"], buckets: timeBuckets }, + { name: "task_update_time_seconds", help: "Task update duration (seconds)", labels: ["taskType", "status"], buckets: timeBuckets }, + { name: "http_api_client_request_seconds", help: "HTTP API client request duration (seconds)", labels: ["method", "uri", "status"], buckets: timeBuckets }, + { name: "task_result_size_bytes", help: "Task result size (bytes)", labels: ["taskType"], buckets: sizeBuckets }, + { name: "workflow_input_size_bytes", help: "Workflow input size (bytes)", labels: ["workflowType", "version"], buckets: sizeBuckets }, + ]; + + for (const def of histogramDefs) { + this._histograms.set( + def.name, + new Histogram({ + name: def.name, + help: def.help, + labelNames: def.labels, + buckets: def.buckets, + }), + ); + } + + const gaugeDefs: { + name: string; + help: string; + labels: string[]; 
+ }[] = [ + { name: "active_workers", help: "Workers actively executing tasks", labels: ["taskType"] }, + ]; + + for (const def of gaugeDefs) { + this._gauges.set( + def.name, + new Gauge({ + name: def.name, + help: def.help, + labelNames: def.labels, + }), + ); + } + } +} diff --git a/src/sdk/worker/metrics/MetricsCollector.ts b/src/sdk/worker/metrics/LegacyMetricsCollector.ts similarity index 85% rename from src/sdk/worker/metrics/MetricsCollector.ts rename to src/sdk/worker/metrics/LegacyMetricsCollector.ts index ae3fe542..8932a91b 100644 --- a/src/sdk/worker/metrics/MetricsCollector.ts +++ b/src/sdk/worker/metrics/LegacyMetricsCollector.ts @@ -1,5 +1,4 @@ import type { - TaskRunnerEventsListener, PollStarted, PollCompleted, PollFailure, @@ -9,6 +8,8 @@ import type { TaskUpdateCompleted, TaskUpdateFailure, } from "../../clients/worker/events"; +import type { MetricsCollectorInterface } from "./MetricsCollectorInterface"; +import { setHttpMetricsObserver } from "./httpObserver"; /** * Configuration for MetricsCollector. @@ -35,7 +36,7 @@ export interface MetricsCollectorConfig { } /** - * Collected worker metrics. + * Collected worker metrics (legacy shape). */ export interface WorkerMetrics { /** Total polls by taskType */ @@ -78,9 +79,6 @@ export interface WorkerMetrics { const QUANTILES = [0.5, 0.75, 0.9, 0.95, 0.99] as const; -/** - * Calculate quantiles from sorted array using linear interpolation. - */ function computeQuantile(sorted: number[], q: number): number { if (sorted.length === 0) return 0; if (sorted.length === 1) return sorted[0]; @@ -92,26 +90,13 @@ function computeQuantile(sorted: number[], q: number): number { } /** - * Built-in metrics collector implementing TaskRunnerEventsListener. - * - * Collects 19 metric types matching the Python SDK's MetricsCollector, - * with sliding-window quantile support (p50, p75, p90, p95, p99). + * Legacy metrics collector. * - * @example - * ```typescript - * const metrics = new MetricsCollector({ httpPort: 9090 }); - * - * const handler = new TaskHandler({ - * client, - * eventListeners: [metrics], - * }); - * - * await handler.startWorkers(); - * // GET http://localhost:9090/metrics — Prometheus format - * // GET http://localhost:9090/health — {"status":"UP"} - * ``` + * Emits prefixed Prometheus metrics with `task_type` labels, millisecond + * time units, and Summary type for distributions. This is the default + * implementation during the deprecation transition period. */ -export class MetricsCollector implements TaskRunnerEventsListener { +export class LegacyMetricsCollector implements MetricsCollectorInterface { private metrics: WorkerMetrics; private readonly _prefix: string; private readonly _slidingWindowSize: number; @@ -132,7 +117,7 @@ export class MetricsCollector implements TaskRunnerEventsListener { if (config?.filePath) { this.startFileWriter( config.filePath, - config.fileWriteIntervalMs ?? 5000 + config.fileWriteIntervalMs ?? 
5000, ); } } @@ -158,10 +143,8 @@ // Silently ignore file write errors } }; - // Immediate first write, then periodic void doWrite(); this._fileTimer = setInterval(doWrite, intervalMs); - // Unref so the timer doesn't prevent process exit if (typeof this._fileTimer === "object" && "unref" in this._fileTimer) { this._fileTimer.unref(); } @@ -197,7 +180,7 @@ private observe( map: Map<string, number[]>, key: string, - value: number + value: number, ): void { let arr = map.get(key); if (!arr) { @@ -205,7 +188,6 @@ map.set(key, arr); } arr.push(value); - // Sliding window: keep only the last N observations if (arr.length > this._slidingWindowSize) { arr.splice(0, arr.length - this._slidingWindowSize); } @@ -215,7 +197,7 @@ map: Map<string, number>, key: string, promKey: string, - labelName: string + labelName: string, ): void { this.increment(map, key); this._promRegistry?.incrementCounter(promKey, { [labelName]: key }); @@ -226,7 +208,7 @@ key: string, value: number, promKey: string, - labelName: string + labelName: string, ): void { this.observe(map, key, value); this._promRegistry?.observeSummary(promKey, { [labelName]: key }, value); @@ -248,7 +230,7 @@ } onTaskExecutionStarted(_event: TaskExecutionStarted): void { - // Counted on completion + // Legacy counts on completion, not start } onTaskExecutionCompleted(event: TaskExecutionCompleted): void { @@ -271,59 +253,68 @@ onTaskUpdateFailure(event: TaskUpdateFailure): void { this.incrementCounter(this.metrics.taskUpdateFailureTotal, event.taskType, "update_error_total", "task_type"); + this.observeSummary(this.metrics.updateDurationMs, event.taskType, event.durationMs, "update_time", "task_type"); + } + + // Canonical-only event — noop in legacy + onTaskPaused(): void { + // Noop: TaskPaused events are handled via recordTaskPaused() } - // ── Direct Recording Methods (for code outside event system) ─── + // ── Direct Recording Methods ─────────────────────────────────── - /** Record a task execution queue full event */ recordTaskExecutionQueueFull(taskType: string): void { this.incrementCounter(this.metrics.taskExecutionQueueFullTotal, taskType, "queue_full_total", "task_type"); } - /** Record an uncaught exception */ - recordUncaughtException(): void { + recordUncaughtException(_exception?: string): void { this.metrics.uncaughtExceptionTotal++; this._promRegistry?.incrementCounter("uncaught_total", {}); } - /** Record a worker restart */ recordWorkerRestart(): void { this.metrics.workerRestartTotal++; this._promRegistry?.incrementCounter("restart_total", {}); } - /** Record a task paused event */ recordTaskPaused(taskType: string): void { this.incrementCounter(this.metrics.taskPausedTotal, taskType, "paused_total", "task_type"); } - /** Record a task ack error */ - recordTaskAckError(taskType: string): void { + recordTaskAckError(taskType: string, _exception?: string): void { this.incrementCounter(this.metrics.taskAckErrorTotal, taskType, "ack_error_total", "task_type"); } - /** Record a workflow start error */ - recordWorkflowStartError(): void { + recordTaskAckFailed(_taskType: string): void { + // Noop:
canonical-only metric (server declined ack) + } + + recordWorkflowStartError(_workflowType?: string, _exception?: string): void { this.metrics.workflowStartErrorTotal++; this._promRegistry?.incrementCounter("wf_start_error_total", {}); } - /** Record external payload usage */ - recordExternalPayloadUsed(payloadType: string): void { + recordExternalPayloadUsed( + payloadType: string, + _entityName?: string, + _operation?: string, + ): void { this.incrementCounter(this.metrics.externalPayloadUsedTotal, payloadType, "external_payload_total", "payload_type"); } - /** Record workflow input size */ - recordWorkflowInputSize(workflowType: string, sizeBytes: number): void { + recordWorkflowInputSize( + workflowType: string, + sizeBytes: number, + _version?: string, + ): void { this.observeSummary(this.metrics.workflowInputSizeBytes, workflowType, sizeBytes, "wf_input_size", "workflow_type"); } - /** Record API request duration */ recordApiRequestTime( method: string, uri: string, - status: number, - durationMs: number + status: number | string, + durationMs: number, ): void { const key = `${method}:${uri}:${status}`; this.observeSummary(this.metrics.apiRequestDurationMs, key, durationMs, "api_request", "endpoint"); @@ -331,18 +322,16 @@ // ── Public API ────────────────────────────────────────────────── - /** Get a snapshot of all collected metrics */ getMetrics(): Readonly<WorkerMetrics> { return this.metrics; } - /** Reset all collected metrics */ reset(): void { this.metrics = this.createEmptyMetrics(); } - /** Stop the auto-started metrics HTTP server and file writer (if any) */ async stop(): Promise<void> { + setHttpMetricsObserver(undefined); if (this._fileTimer) { clearInterval(this._fileTimer); this._fileTimer = undefined; @@ -353,27 +342,17 @@ } } - /** - * Get the content type for the Prometheus metrics endpoint. - * Returns prom-client's content type when available, otherwise standard Prometheus text format. - */ + collectorName(): string { + return "legacy"; + } + getContentType(): string { - return this._promRegistry?.contentType ?? "text/plain; version=0.0.4; charset=utf-8"; + return ( + this._promRegistry?.contentType ?? + "text/plain; version=0.0.4; charset=utf-8" + ); } - /** - * Render all collected metrics in Prometheus exposition format. - * If prom-client is available and `usePromClient: true`, delegates to prom-client's registry. - * Otherwise uses built-in rendering with p50/p75/p90/p95/p99 quantiles. - * - * @param prefix - Metric name prefix (defaults to constructor config or "conductor_worker") - * @returns Prometheus text format string - */ - /** - * Async version of toPrometheusText. - * When prom-client is available, returns its native registry output. - * Otherwise falls back to the built-in text format.
- */ async toPrometheusTextAsync(): Promise<string> { if (this._promRegistry?.available) { return this._promRegistry.metrics(); } @@ -454,7 +433,7 @@ lines.push(`# TYPE ${counter.name} counter`); for (const [label, value] of counter.data) { lines.push( - `${counter.name}{${counter.labelName}="${label}"} ${value}` + `${counter.name}{${counter.labelName}="${label}"} ${value}`, ); } } @@ -545,14 +524,14 @@ for (const q of QUANTILES) { const val = computeQuantile(sorted, q); lines.push( - `${summary.name}{${summary.labelName}="${label}",quantile="${q}"} ${val}` + `${summary.name}{${summary.labelName}="${label}",quantile="${q}"} ${val}`, ); } lines.push( - `${summary.name}_count{${summary.labelName}="${label}"} ${count}` + `${summary.name}_count{${summary.labelName}="${label}"} ${count}`, ); lines.push( - `${summary.name}_sum{${summary.labelName}="${label}"} ${sum}` + `${summary.name}_sum{${summary.labelName}="${label}"} ${sum}`, ); } } diff --git a/src/sdk/worker/metrics/MetricsCollectorInterface.ts b/src/sdk/worker/metrics/MetricsCollectorInterface.ts new file mode 100644 index 00000000..4852b099 --- /dev/null +++ b/src/sdk/worker/metrics/MetricsCollectorInterface.ts @@ -0,0 +1,49 @@ +import type { + TaskRunnerEventsListener, +} from "../../clients/worker/events"; + +/** + * Unified metrics collector interface. + * + * Both LegacyMetricsCollector and CanonicalMetricsCollector implement this + * interface so call sites never need to know which variant is active. + * Methods that only apply to one variant are noops in the other. + */ +export interface MetricsCollectorInterface extends TaskRunnerEventsListener { + // ── Direct recording methods (superset signatures) ───────────── + + recordTaskExecutionQueueFull(taskType: string): void; + recordUncaughtException(exception?: string): void; + recordWorkerRestart(): void; + recordTaskPaused(taskType: string): void; + recordTaskAckError(taskType: string, exception?: string): void; + /** Canonical-only: server declined ack (no exception). Legacy noops. */ + recordTaskAckFailed(taskType: string): void; + recordWorkflowStartError(workflowType?: string, exception?: string): void; + recordExternalPayloadUsed( + payloadType: string, + entityName?: string, + operation?: string, + ): void; + recordWorkflowInputSize( + workflowType: string, + sizeBytes: number, + version?: string, + ): void; + recordApiRequestTime( + method: string, + uri: string, + status: number | string, + durationMs: number, + ): void; + + // ── Output / lifecycle ───────────────────────────────────────── + + collectorName(): string; + getMetrics(): unknown; + reset(): void; + stop(): Promise<void>; + getContentType(): string; + toPrometheusText(prefix?: string): string; + toPrometheusTextAsync(): Promise<string>; +} diff --git a/src/sdk/worker/metrics/MetricsServer.ts b/src/sdk/worker/metrics/MetricsServer.ts index 62f1fbcb..d575901b 100644 --- a/src/sdk/worker/metrics/MetricsServer.ts +++ b/src/sdk/worker/metrics/MetricsServer.ts @@ -1,5 +1,5 @@ import { createServer, type Server, type IncomingMessage, type ServerResponse } from "node:http"; -import type { MetricsCollector } from "./MetricsCollector"; +import type { MetricsCollectorInterface } from "./MetricsCollectorInterface"; /** * Lightweight HTTP server exposing Prometheus metrics and a health check endpoint.
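Reviewer note: with the constructor widened to `MetricsCollectorInterface`, the server no longer cares which collector variant is active. A sketch of the wiring (symbols as exported from `src/sdk/worker/index.ts` in this patch; the import path is illustrative):

```typescript
import { createMetricsCollector, MetricsServer } from "./src/sdk/worker";

// Legacy collector by default; canonical when WORKER_CANONICAL_METRICS=true.
const collector = createMetricsCollector();

// Serves whichever surface the factory selected on GET /metrics.
const server = new MetricsServer(collector, 9090);
await server.start();
```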
@@ -21,11 +21,11 @@ import type { MetricsCollector } from "./MetricsCollector"; * ``` */ export class MetricsServer { - private readonly _collector: MetricsCollector; + private readonly _collector: MetricsCollectorInterface; private readonly _port: number; private _server?: Server; - constructor(collector: MetricsCollector, port: number) { + constructor(collector: MetricsCollectorInterface, port: number) { this._collector = collector; this._port = port; } diff --git a/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts b/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts new file mode 100644 index 00000000..37c9300f --- /dev/null +++ b/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts @@ -0,0 +1,435 @@ +import { describe, it, expect, beforeEach } from "@jest/globals"; +import { CanonicalMetricsCollector } from "../CanonicalMetricsCollector"; + +describe("CanonicalMetricsCollector", () => { + let collector: CanonicalMetricsCollector; + + beforeEach(() => { + collector = new CanonicalMetricsCollector(); + }); + + describe("poll metrics", () => { + it("should emit task_poll_total counter on onPollStarted", () => { + collector.onPollStarted({ + taskType: "task_a", + workerId: "w1", + pollCount: 5, + timestamp: new Date(), + }); + collector.onPollStarted({ + taskType: "task_a", + workerId: "w1", + pollCount: 5, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain("# TYPE task_poll_total counter"); + expect(text).toContain('task_poll_total{taskType="task_a"} 2'); + }); + + it("should emit task_poll_time_seconds histogram on onPollCompleted", () => { + collector.onPollCompleted({ + taskType: "task_a", + durationMs: 100, + tasksReceived: 3, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain("# TYPE task_poll_time_seconds histogram"); + expect(text).toContain('task_poll_time_seconds_bucket{taskType="task_a",status="SUCCESS",le="0.1"} 1'); + expect(text).toContain('task_poll_time_seconds_sum{taskType="task_a",status="SUCCESS"} 0.1'); + expect(text).toContain('task_poll_time_seconds_count{taskType="task_a",status="SUCCESS"} 1'); + }); + + it("should emit task_poll_error_total with exception label on onPollFailure", () => { + collector.onPollFailure({ + taskType: "task_a", + durationMs: 5000, + cause: new TypeError("timeout"), + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain('task_poll_error_total{taskType="task_a",exception="TypeError"} 1'); + expect(text).toContain('task_poll_time_seconds_bucket{taskType="task_a",status="FAILURE"'); + }); + }); + + describe("execution metrics", () => { + it("should emit task_execution_started_total on onTaskExecutionStarted", () => { + collector.onTaskExecutionStarted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain('task_execution_started_total{taskType="task_a"} 1'); + }); + + it("should track active_workers gauge", () => { + collector.onTaskExecutionStarted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + timestamp: new Date(), + }); + collector.onTaskExecutionStarted({ + taskType: "task_a", + taskId: "t2", + workerId: "w1", + timestamp: new Date(), + }); + + let text = collector.toPrometheusText(); + expect(text).toContain('active_workers{taskType="task_a"} 2'); + + collector.onTaskExecutionCompleted({ + taskType: "task_a", + taskId: "t1", + 
workerId: "w1", + durationMs: 200, + timestamp: new Date(), + }); + + text = collector.toPrometheusText(); + expect(text).toContain('active_workers{taskType="task_a"} 1'); + }); + + it("should emit task_execute_time_seconds histogram on completion", () => { + collector.onTaskExecutionCompleted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + durationMs: 500, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain("# TYPE task_execute_time_seconds histogram"); + expect(text).toContain('task_execute_time_seconds_sum{taskType="task_a",status="SUCCESS"} 0.5'); + }); + + it("should emit task_result_size_bytes histogram on completion", () => { + collector.onTaskExecutionCompleted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + durationMs: 200, + outputSizeBytes: 5000, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain("# TYPE task_result_size_bytes histogram"); + expect(text).toContain('task_result_size_bytes_sum{taskType="task_a"} 5000'); + }); + + it("should emit task_execute_error_total with exception label on failure", () => { + const err = new TypeError("invalid input"); + collector.onTaskExecutionFailure({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + cause: err, + durationMs: 50, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain('task_execute_error_total{taskType="task_a",exception="TypeError"} 1'); + expect(text).toContain('task_execute_time_seconds_bucket{taskType="task_a",status="FAILURE"'); + }); + }); + + describe("task update metrics", () => { + it("should emit task_update_time_seconds histogram on completion", () => { + collector.onTaskUpdateCompleted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + durationMs: 25, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain("# TYPE task_update_time_seconds histogram"); + expect(text).toContain('task_update_time_seconds_sum{taskType="task_a",status="SUCCESS"} 0.025'); + }); + + it("should emit task_update_error_total and task_update_time_seconds on failure", () => { + collector.onTaskUpdateFailure({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + cause: new Error("server error"), + durationMs: 150, + retryCount: 4, + taskResult: {}, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain('task_update_error_total{taskType="task_a",exception="Error"} 1'); + expect(text).toContain('task_update_time_seconds_sum{taskType="task_a",status="FAILURE"} 0.15'); + }); + }); + + describe("direct recording methods", () => { + it("recordTaskExecutionQueueFull should emit counter", () => { + collector.recordTaskExecutionQueueFull("task_a"); + collector.recordTaskExecutionQueueFull("task_a"); + + const text = collector.toPrometheusText(); + expect(text).toContain('task_execution_queue_full_total{taskType="task_a"} 2'); + }); + + it("recordUncaughtException should emit counter with exception label", () => { + collector.recordUncaughtException("RangeError"); + collector.recordUncaughtException("RangeError"); + collector.recordUncaughtException("TypeError"); + + const text = collector.toPrometheusText(); + expect(text).toContain('thread_uncaught_exceptions_total{exception="RangeError"} 2'); + expect(text).toContain('thread_uncaught_exceptions_total{exception="TypeError"} 1'); + }); + + it("recordWorkerRestart should be a noop (N/A for JS)", () => { + 
collector.recordWorkerRestart(); + const text = collector.toPrometheusText(); + expect(text).not.toContain("worker_restart"); + }); + + it("recordTaskPaused should emit counter", () => { + collector.recordTaskPaused("paused_task"); + const text = collector.toPrometheusText(); + expect(text).toContain('task_paused_total{taskType="paused_task"} 1'); + }); + + it("recordTaskAckError should emit counter with exception label", () => { + collector.recordTaskAckError("task_a", "TimeoutError"); + const text = collector.toPrometheusText(); + expect(text).toContain('task_ack_error_total{taskType="task_a",exception="TimeoutError"} 1'); + }); + + it("recordTaskAckFailed should emit counter", () => { + collector.recordTaskAckFailed("task_a"); + const text = collector.toPrometheusText(); + expect(text).toContain('task_ack_failed_total{taskType="task_a"} 1'); + }); + + it("recordWorkflowStartError should emit counter with labels", () => { + collector.recordWorkflowStartError("my_workflow", "NetworkError"); + const text = collector.toPrometheusText(); + expect(text).toContain('workflow_start_error_total{workflowType="my_workflow",exception="NetworkError"} 1'); + }); + + it("recordExternalPayloadUsed should emit counter with labels", () => { + collector.recordExternalPayloadUsed("TASK_OUTPUT", "myEntity", "WRITE"); + const text = collector.toPrometheusText(); + expect(text).toContain('external_payload_used_total{entityName="myEntity",operation="WRITE",payloadType="TASK_OUTPUT"} 1'); + }); + + it("recordWorkflowInputSize should emit histogram", () => { + collector.recordWorkflowInputSize("order_flow", 50000, "1"); + const text = collector.toPrometheusText(); + expect(text).toContain("# TYPE workflow_input_size_bytes histogram"); + expect(text).toContain('workflow_input_size_bytes_sum{workflowType="order_flow",version="1"} 50000'); + }); + + it("recordApiRequestTime should emit histogram in seconds", () => { + collector.recordApiRequestTime("GET", "/api/workflow", 200, 45); + const text = collector.toPrometheusText(); + expect(text).toContain("# TYPE http_api_client_request_seconds histogram"); + expect(text).toContain('http_api_client_request_seconds_sum{method="GET",uri="/api/workflow",status="200"} 0.045'); + }); + }); + + describe("onTaskPaused event", () => { + it("should increment task_paused_total", () => { + collector.onTaskPaused({ + taskType: "paused_task", + timestamp: new Date(), + }); + const text = collector.toPrometheusText(); + expect(text).toContain('task_paused_total{taskType="paused_task"} 1'); + }); + }); + + describe("reset", () => { + it("should clear all canonical metrics", () => { + collector.onPollStarted({ + taskType: "task_a", + workerId: "w1", + pollCount: 1, + timestamp: new Date(), + }); + collector.recordUncaughtException("Error"); + collector.recordTaskAckFailed("task_a"); + + collector.reset(); + + const text = collector.toPrometheusText(); + expect(text).toBe(""); + }); + }); + + describe("output format", () => { + it("should not apply a prefix to canonical metric names", () => { + collector.onPollStarted({ + taskType: "t", + workerId: "w", + pollCount: 1, + timestamp: new Date(), + }); + const text = collector.toPrometheusText(); + expect(text).not.toContain("conductor_worker"); + expect(text).toContain("task_poll_total{"); + }); + + it("should use seconds for time histograms", () => { + collector.onPollCompleted({ + taskType: "t", + durationMs: 1000, + tasksReceived: 1, + timestamp: new Date(), + }); + const text = collector.toPrometheusText(); + 
expect(text).toContain("task_poll_time_seconds"); + expect(text).toContain('_sum{taskType="t",status="SUCCESS"} 1'); + }); + + it("should use camelCase taskType label", () => { + collector.onPollStarted({ + taskType: "my_task", + workerId: "w", + pollCount: 1, + timestamp: new Date(), + }); + const text = collector.toPrometheusText(); + expect(text).toContain("taskType="); + expect(text).not.toContain("task_type="); + }); + }); + + describe("stop", () => { + it("should not throw when no server is running", async () => { + await expect(collector.stop()).resolves.toBeUndefined(); + }); + }); + + describe("default argument handling", () => { + it("recordUncaughtException with no arg should default to 'Error'", () => { + collector.recordUncaughtException(); + const text = collector.toPrometheusText(); + expect(text).toContain('thread_uncaught_exceptions_total{exception="Error"} 1'); + }); + + it("recordWorkflowStartError with no args should default both labels", () => { + collector.recordWorkflowStartError(); + const text = collector.toPrometheusText(); + expect(text).toContain('workflow_start_error_total{workflowType="",exception="Error"} 1'); + }); + + it("recordWorkflowStartError with only workflowType should default exception", () => { + collector.recordWorkflowStartError("my_wf"); + const text = collector.toPrometheusText(); + expect(text).toContain('workflow_start_error_total{workflowType="my_wf",exception="Error"} 1'); + }); + + it("recordExternalPayloadUsed with missing optional args should default to empty strings", () => { + collector.recordExternalPayloadUsed("TASK_OUTPUT"); + const text = collector.toPrometheusText(); + expect(text).toContain('external_payload_used_total{entityName="",operation="",payloadType="TASK_OUTPUT"} 1'); + }); + + it("recordTaskAckError with no exception arg should default to 'Error'", () => { + collector.recordTaskAckError("task_a"); + const text = collector.toPrometheusText(); + expect(text).toContain('task_ack_error_total{taskType="task_a",exception="Error"} 1'); + }); + + it("recordWorkflowInputSize with no version should default to empty string", () => { + collector.recordWorkflowInputSize("my_wf", 1000); + const text = collector.toPrometheusText(); + expect(text).toContain('workflow_input_size_bytes_sum{workflowType="my_wf",version=""} 1000'); + }); + }); + + describe("execution completion without outputSizeBytes", () => { + it("should not emit task_result_size_bytes when outputSizeBytes is undefined", () => { + collector.onTaskExecutionCompleted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + durationMs: 200, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain("task_execute_time_seconds"); + expect(text).not.toContain("task_result_size_bytes"); + }); + }); + + describe("output helpers without prom-client", () => { + it("getContentType should return plain text fallback", () => { + expect(collector.getContentType()).toBe( + "text/plain; version=0.0.4; charset=utf-8" + ); + }); + + it("toPrometheusTextAsync should fall back to sync text", async () => { + collector.onPollStarted({ + taskType: "t", + workerId: "w", + pollCount: 1, + timestamp: new Date(), + }); + const asyncText = await collector.toPrometheusTextAsync(); + const syncText = collector.toPrometheusText(); + expect(asyncText).toBe(syncText); + }); + + it("toPrometheusText should ignore _prefix argument", () => { + collector.onPollStarted({ + taskType: "t", + workerId: "w", + pollCount: 1, + timestamp: new Date(), + }); + const 
text = collector.toPrometheusText("some_prefix"); + expect(text).not.toContain("some_prefix"); + expect(text).toContain("task_poll_total{"); + }); + }); + + describe("active_workers gauge on failure", () => { + it("should decrement active_workers on task execution failure", () => { + collector.onTaskExecutionStarted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + timestamp: new Date(), + }); + + let text = collector.toPrometheusText(); + expect(text).toContain('active_workers{taskType="task_a"} 1'); + + collector.onTaskExecutionFailure({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + cause: new Error("fail"), + durationMs: 100, + timestamp: new Date(), + }); + + text = collector.toPrometheusText(); + expect(text).toContain('active_workers{taskType="task_a"} 0'); + }); + }); +}); diff --git a/src/sdk/worker/metrics/__tests__/CanonicalPrometheusRegistry.test.ts b/src/sdk/worker/metrics/__tests__/CanonicalPrometheusRegistry.test.ts new file mode 100644 index 00000000..449955eb --- /dev/null +++ b/src/sdk/worker/metrics/__tests__/CanonicalPrometheusRegistry.test.ts @@ -0,0 +1,176 @@ +import { describe, it, expect, beforeEach } from "@jest/globals"; +import { CanonicalPrometheusRegistry } from "../CanonicalPrometheusRegistry"; + +describe("CanonicalPrometheusRegistry", () => { + let registry: CanonicalPrometheusRegistry; + + beforeEach(async () => { + try { + const promClient = await import("prom-client"); + promClient.register.clear(); + } catch { + // prom-client not installed — skip cleanup + } + registry = new CanonicalPrometheusRegistry(); + }); + + describe("before initialization", () => { + it("should not be available", () => { + expect(registry.available).toBe(false); + }); + + it("should return default content type", () => { + expect(registry.contentType).toBe( + "text/plain; version=0.0.4; charset=utf-8" + ); + }); + + it("should return empty string from metrics()", async () => { + expect(await registry.metrics()).toBe(""); + }); + + it("should not throw when incrementCounter is called", () => { + expect(() => + registry.incrementCounter("task_poll_total", { taskType: "t" }) + ).not.toThrow(); + }); + + it("should not throw when observeHistogram is called", () => { + expect(() => + registry.observeHistogram("task_poll_time_seconds", { taskType: "t" }, 0.1) + ).not.toThrow(); + }); + + it("should not throw when setGauge is called", () => { + expect(() => + registry.setGauge("active_workers", { taskType: "t" }, 5) + ).not.toThrow(); + }); + }); + + describe("initialize()", () => { + it("should return true when prom-client is installed", async () => { + const result = await registry.initialize(); + expect(result).toBe(true); + expect(registry.available).toBe(true); + }); + + it("should set contentType from prom-client registry", async () => { + await registry.initialize(); + expect(registry.contentType).toBeDefined(); + expect(registry.contentType.length).toBeGreaterThan(0); + }); + }); + + describe("after initialization", () => { + beforeEach(async () => { + await registry.initialize(); + }); + + it("should be available", () => { + expect(registry.available).toBe(true); + }); + + it("should return metrics text from prom-client", async () => { + const text = await registry.metrics(); + expect(typeof text).toBe("string"); + }); + + it("should increment a counter and see it in metrics output", async () => { + registry.incrementCounter("task_poll_total", { taskType: "test_t" }); + registry.incrementCounter("task_poll_total", { taskType: "test_t" }); + const text = 
await registry.metrics(); + expect(text).toContain("task_poll_total"); + expect(text).toContain("test_t"); + }); + + it("should observe a histogram and see it in metrics output", async () => { + registry.observeHistogram( + "task_poll_time_seconds", + { taskType: "test_t", status: "SUCCESS" }, + 0.05 + ); + const text = await registry.metrics(); + expect(text).toContain("task_poll_time_seconds"); + expect(text).toContain("test_t"); + }); + + it("should set a gauge and see it in metrics output", async () => { + registry.setGauge("active_workers", { taskType: "test_t" }, 3); + const text = await registry.metrics(); + expect(text).toContain("active_workers"); + expect(text).toContain("test_t"); + }); + + it("should handle unknown counter key as no-op", () => { + expect(() => + registry.incrementCounter("nonexistent", { x: "y" }) + ).not.toThrow(); + }); + + it("should handle unknown histogram key as no-op", () => { + expect(() => + registry.observeHistogram("nonexistent", { x: "y" }, 1) + ).not.toThrow(); + }); + + it("should handle unknown gauge key as no-op", () => { + expect(() => + registry.setGauge("nonexistent", { x: "y" }, 1) + ).not.toThrow(); + }); + + it("should accept custom increment value for counter", () => { + expect(() => + registry.incrementCounter("task_poll_total", { taskType: "t" }, 5) + ).not.toThrow(); + }); + + it("should record all counter types", async () => { + registry.incrementCounter("task_poll_total", { taskType: "t" }); + registry.incrementCounter("task_execution_started_total", { taskType: "t" }); + registry.incrementCounter("task_poll_error_total", { taskType: "t", exception: "Error" }); + registry.incrementCounter("task_execute_error_total", { taskType: "t", exception: "Error" }); + registry.incrementCounter("task_update_error_total", { taskType: "t", exception: "Error" }); + registry.incrementCounter("task_ack_error_total", { taskType: "t", exception: "Error" }); + registry.incrementCounter("task_ack_failed_total", { taskType: "t" }); + registry.incrementCounter("task_execution_queue_full_total", { taskType: "t" }); + registry.incrementCounter("task_paused_total", { taskType: "t" }); + registry.incrementCounter("thread_uncaught_exceptions_total", { exception: "Error" }); + registry.incrementCounter("external_payload_used_total", { entityName: "e", operation: "o", payloadType: "p" }); + registry.incrementCounter("workflow_start_error_total", { workflowType: "w", exception: "Error" }); + + const text = await registry.metrics(); + expect(text).toContain("task_poll_total"); + expect(text).toContain("task_execution_started_total"); + expect(text).toContain("task_poll_error_total"); + expect(text).toContain("task_ack_failed_total"); + expect(text).toContain("thread_uncaught_exceptions_total"); + expect(text).toContain("external_payload_used_total"); + expect(text).toContain("workflow_start_error_total"); + }); + + it("should record all histogram types", async () => { + registry.observeHistogram("task_poll_time_seconds", { taskType: "t", status: "SUCCESS" }, 0.1); + registry.observeHistogram("task_execute_time_seconds", { taskType: "t", status: "SUCCESS" }, 0.5); + registry.observeHistogram("task_update_time_seconds", { taskType: "t", status: "SUCCESS" }, 0.02); + registry.observeHistogram("http_api_client_request_seconds", { method: "GET", uri: "/api", status: "200" }, 0.1); + registry.observeHistogram("task_result_size_bytes", { taskType: "t" }, 1024); + registry.observeHistogram("workflow_input_size_bytes", { workflowType: "w", version: "1" }, 512); + + const 
text = await registry.metrics(); + expect(text).toContain("task_poll_time_seconds"); + expect(text).toContain("task_execute_time_seconds"); + expect(text).toContain("task_update_time_seconds"); + expect(text).toContain("http_api_client_request_seconds"); + expect(text).toContain("task_result_size_bytes"); + expect(text).toContain("workflow_input_size_bytes"); + }); + + it("should record gauge metric", async () => { + registry.setGauge("active_workers", { taskType: "t" }, 7); + const text = await registry.metrics(); + expect(text).toContain("active_workers"); + }); + }); +}); diff --git a/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts b/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts index 3c61671a..ffe9f567 100644 --- a/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts +++ b/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts @@ -1,11 +1,11 @@ import { describe, it, expect, beforeEach } from "@jest/globals"; -import { MetricsCollector } from "../MetricsCollector"; +import { LegacyMetricsCollector } from "../LegacyMetricsCollector"; -describe("MetricsCollector - Prometheus features", () => { - let collector: MetricsCollector; +describe("LegacyMetricsCollector - Prometheus features", () => { + let collector: LegacyMetricsCollector; beforeEach(() => { - collector = new MetricsCollector({ slidingWindowSize: 1000 }); + collector = new LegacyMetricsCollector({ slidingWindowSize: 1000 }); }); // ── toPrometheusText() ────────────────────────────────────────── @@ -36,7 +36,7 @@ describe("MetricsCollector - Prometheus features", () => { }); it("should use custom prefix", () => { - const c = new MetricsCollector({ prefix: "myapp" }); + const c = new LegacyMetricsCollector({ prefix: "myapp" }); c.onPollStarted({ taskType: "t", workerId: "w", pollCount: 1, timestamp: new Date() }); const text = c.toPrometheusText(); expect(text).toContain("myapp_task_poll_total"); @@ -167,7 +167,7 @@ describe("MetricsCollector - Prometheus features", () => { describe("sliding window", () => { it("should trim observations beyond window size", () => { - const small = new MetricsCollector({ slidingWindowSize: 5 }); + const small = new LegacyMetricsCollector({ slidingWindowSize: 5 }); for (let i = 0; i < 10; i++) { small.onPollCompleted({ taskType: "t", workerId: "w", durationMs: i, pollCount: i, taskCount: 1, timestamp: new Date() }); } diff --git a/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts b/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts index 691d9325..33787620 100644 --- a/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts +++ b/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts @@ -1,11 +1,11 @@ import { describe, it, expect, beforeEach } from "@jest/globals"; -import { MetricsCollector } from "../MetricsCollector"; +import { LegacyMetricsCollector } from "../LegacyMetricsCollector"; -describe("MetricsCollector", () => { - let collector: MetricsCollector; +describe("LegacyMetricsCollector", () => { + let collector: LegacyMetricsCollector; beforeEach(() => { - collector = new MetricsCollector(); + collector = new LegacyMetricsCollector(); }); describe("poll metrics", () => { @@ -133,13 +133,14 @@ describe("MetricsCollector", () => { expect(metrics.updateDurationMs.get("task_a")).toEqual([25, 30]); }); - it("should count update failures via onTaskUpdateFailure", () => { + it("should count update failures and record duration via onTaskUpdateFailure", () => { collector.onTaskUpdateFailure({ taskType: 
"task_a", taskId: "t1", workerId: "w1", workflowInstanceId: "wf1", cause: new Error("server error"), + durationMs: 200, retryCount: 4, taskResult: {}, timestamp: new Date(), @@ -147,6 +148,7 @@ describe("MetricsCollector", () => { const metrics = collector.getMetrics(); expect(metrics.taskUpdateFailureTotal.get("task_a")).toBe(1); + expect(metrics.updateDurationMs.get("task_a")).toEqual([200]); }); }); diff --git a/src/sdk/worker/metrics/__tests__/MetricsServer.test.ts b/src/sdk/worker/metrics/__tests__/MetricsServer.test.ts index bbe5153f..c0b423f7 100644 --- a/src/sdk/worker/metrics/__tests__/MetricsServer.test.ts +++ b/src/sdk/worker/metrics/__tests__/MetricsServer.test.ts @@ -1,6 +1,6 @@ import { describe, it, expect, afterEach } from "@jest/globals"; import { get as httpGet } from "node:http"; -import { MetricsCollector } from "../MetricsCollector"; +import { LegacyMetricsCollector } from "../LegacyMetricsCollector"; import { MetricsServer } from "../MetricsServer"; function fetchHttp(url: string): Promise<{ status: number; body: string; headers: Record }> { @@ -38,7 +38,7 @@ describe("MetricsServer", () => { it("should serve /metrics with Prometheus text format", async () => { const port = nextPort(); - const collector = new MetricsCollector(); + const collector = new LegacyMetricsCollector(); collector.onPollStarted({ taskType: "test_task", workerId: "w", pollCount: 1, timestamp: new Date() }); server = new MetricsServer(collector, port); @@ -51,7 +51,7 @@ describe("MetricsServer", () => { it("should serve /health with JSON status", async () => { const port = nextPort(); - const collector = new MetricsCollector(); + const collector = new LegacyMetricsCollector(); server = new MetricsServer(collector, port); await server.start(); @@ -62,7 +62,7 @@ describe("MetricsServer", () => { it("should return 404 for unknown paths", async () => { const port = nextPort(); - const collector = new MetricsCollector(); + const collector = new LegacyMetricsCollector(); server = new MetricsServer(collector, port); await server.start(); @@ -72,7 +72,7 @@ describe("MetricsServer", () => { it("should stop cleanly after start", async () => { const port = nextPort(); - const collector = new MetricsCollector(); + const collector = new LegacyMetricsCollector(); server = new MetricsServer(collector, port); await server.start(); await server.stop(); @@ -80,7 +80,7 @@ describe("MetricsServer", () => { }); it("should not throw when stop() called without start()", async () => { - const collector = new MetricsCollector(); + const collector = new LegacyMetricsCollector(); server = new MetricsServer(collector, nextPort()); await server.stop(); server = undefined; @@ -88,7 +88,7 @@ describe("MetricsServer", () => { it("should not throw when start() called twice", async () => { const port = nextPort(); - const collector = new MetricsCollector(); + const collector = new LegacyMetricsCollector(); server = new MetricsServer(collector, port); await server.start(); await server.start(); // second call should be a no-op diff --git a/src/sdk/worker/metrics/__tests__/accumulators.test.ts b/src/sdk/worker/metrics/__tests__/accumulators.test.ts new file mode 100644 index 00000000..d25c3181 --- /dev/null +++ b/src/sdk/worker/metrics/__tests__/accumulators.test.ts @@ -0,0 +1,283 @@ +import { describe, it, expect } from "@jest/globals"; +import { + labelKey, + renderLabels, + exceptionLabel, + HistogramAccumulator, + MultiLabelCounter, + GaugeMetric, + TIME_BUCKETS, + SIZE_BUCKETS, +} from "../accumulators"; + 
+describe("labelKey", () => { + it("should return empty string for empty labels", () => { + expect(labelKey({})).toBe(""); + }); + + it("should return key=value for a single label", () => { + expect(labelKey({ taskType: "my_task" })).toBe("taskType=my_task"); + }); + + it("should sort keys alphabetically", () => { + expect(labelKey({ z: "1", a: "2", m: "3" })).toBe("a=2,m=3,z=1"); + }); + + it("should produce the same key regardless of insertion order", () => { + const key1 = labelKey({ status: "SUCCESS", taskType: "t" }); + const key2 = labelKey({ taskType: "t", status: "SUCCESS" }); + expect(key1).toBe(key2); + }); +}); + +describe("renderLabels", () => { + it("should return empty string for empty labels", () => { + expect(renderLabels({})).toBe(""); + }); + + it("should format a single label with quotes", () => { + expect(renderLabels({ taskType: "my_task" })).toBe('taskType="my_task"'); + }); + + it("should format multiple labels comma-separated", () => { + const result = renderLabels({ method: "GET", uri: "/api" }); + expect(result).toBe('method="GET",uri="/api"'); + }); +}); + +describe("exceptionLabel", () => { + it("should return Error name for standard Error", () => { + expect(exceptionLabel(new Error("oops"))).toBe("Error"); + }); + + it("should return subclass name for TypeError", () => { + expect(exceptionLabel(new TypeError("bad type"))).toBe("TypeError"); + }); + + it("should return subclass name for RangeError", () => { + expect(exceptionLabel(new RangeError("out of range"))).toBe("RangeError"); + }); + + it("should return 'Error' for non-Error values", () => { + expect(exceptionLabel("string error")).toBe("Error"); + expect(exceptionLabel(42)).toBe("Error"); + expect(exceptionLabel(null)).toBe("Error"); + expect(exceptionLabel(undefined)).toBe("Error"); + expect(exceptionLabel({ message: "not an error" })).toBe("Error"); + }); + + it("should fall back to constructor name when .name is empty", () => { + const err = new TypeError("test"); + Object.defineProperty(err, "name", { value: "" }); + expect(exceptionLabel(err)).toBe("TypeError"); + }); +}); + +describe("HistogramAccumulator", () => { + it("should render empty string when no observations", () => { + const h = new HistogramAccumulator([1, 5, 10]); + expect(h.render("test_metric", "A test")).toBe(""); + }); + + it("should place value in correct buckets", () => { + const h = new HistogramAccumulator([1, 5, 10]); + h.observe({ taskType: "t" }, 3); + + const text = h.render("req_time", "Request time"); + expect(text).toContain("# HELP req_time Request time"); + expect(text).toContain("# TYPE req_time histogram"); + expect(text).toContain('req_time_bucket{taskType="t",le="1"} 0'); + expect(text).toContain('req_time_bucket{taskType="t",le="5"} 1'); + expect(text).toContain('req_time_bucket{taskType="t",le="10"} 1'); + expect(text).toContain('req_time_bucket{taskType="t",le="+Inf"} 1'); + expect(text).toContain('req_time_sum{taskType="t"} 3'); + expect(text).toContain('req_time_count{taskType="t"} 1'); + }); + + it("should increment all buckets at or above the value boundary", () => { + const h = new HistogramAccumulator([1, 5, 10]); + h.observe({}, 1); // exactly on boundary + + const text = h.render("m", "help"); + expect(text).toContain('m_bucket{le="1"} 1'); + expect(text).toContain('m_bucket{le="5"} 1'); + expect(text).toContain('m_bucket{le="10"} 1'); + }); + + it("should handle value above all boundaries", () => { + const h = new HistogramAccumulator([1, 5, 10]); + h.observe({}, 100); + + const text = h.render("m", 
"help"); + expect(text).toContain('m_bucket{le="1"} 0'); + expect(text).toContain('m_bucket{le="5"} 0'); + expect(text).toContain('m_bucket{le="10"} 0'); + expect(text).toContain('m_bucket{le="+Inf"} 1'); + expect(text).toContain("m_sum{} 100"); + }); + + it("should accumulate multiple observations", () => { + const h = new HistogramAccumulator([1, 5, 10]); + h.observe({ t: "a" }, 0.5); + h.observe({ t: "a" }, 3); + h.observe({ t: "a" }, 7); + + const text = h.render("m", "help"); + expect(text).toContain('m_bucket{t="a",le="1"} 1'); + expect(text).toContain('m_bucket{t="a",le="5"} 2'); + expect(text).toContain('m_bucket{t="a",le="10"} 3'); + expect(text).toContain('m_count{t="a"} 3'); + expect(text).toContain('m_sum{t="a"} 10.5'); + }); + + it("should track separate series for different label sets", () => { + const h = new HistogramAccumulator([10]); + h.observe({ status: "OK" }, 5); + h.observe({ status: "ERR" }, 15); + + const text = h.render("m", "help"); + expect(text).toContain('m_bucket{status="OK",le="10"} 1'); + expect(text).toContain('m_bucket{status="ERR",le="10"} 0'); + expect(text).toContain('m_count{status="OK"} 1'); + expect(text).toContain('m_count{status="ERR"} 1'); + }); + + it("should default to TIME_BUCKETS when no boundaries given", () => { + const h = new HistogramAccumulator(); + h.observe({}, 0.005); + const text = h.render("m", "help"); + // TIME_BUCKETS starts at 0.001, 0.005, ... + expect(text).toContain('le="0.001"'); + expect(text).toContain('le="0.005"'); + }); +}); + +describe("MultiLabelCounter", () => { + it("should render empty string when no increments", () => { + const c = new MultiLabelCounter(); + expect(c.render("test_counter", "A test")).toBe(""); + }); + + it("should increment and render a single-label counter", () => { + const c = new MultiLabelCounter(); + c.increment({ taskType: "t" }); + c.increment({ taskType: "t" }); + + const text = c.render("poll_total", "Total polls"); + expect(text).toContain("# HELP poll_total Total polls"); + expect(text).toContain("# TYPE poll_total counter"); + expect(text).toContain('poll_total{taskType="t"} 2'); + }); + + it("should support custom increment values", () => { + const c = new MultiLabelCounter(); + c.increment({ taskType: "t" }, 5); + c.increment({ taskType: "t" }, 3); + + const text = c.render("m", "help"); + expect(text).toContain('m{taskType="t"} 8'); + }); + + it("should track separate series for different label sets", () => { + const c = new MultiLabelCounter(); + c.increment({ taskType: "a" }); + c.increment({ taskType: "b" }); + c.increment({ taskType: "a" }); + + const text = c.render("m", "help"); + expect(text).toContain('m{taskType="a"} 2'); + expect(text).toContain('m{taskType="b"} 1'); + }); + + it("should handle multi-label counters", () => { + const c = new MultiLabelCounter(); + c.increment({ taskType: "t", exception: "TypeError" }); + + const text = c.render("err", "Errors"); + expect(text).toContain('err{taskType="t",exception="TypeError"} 1'); + }); +}); + +describe("GaugeMetric", () => { + it("should render empty string when no values set", () => { + const g = new GaugeMetric(); + expect(g.render("test_gauge", "A test")).toBe(""); + }); + + it("should set and render a gauge value", () => { + const g = new GaugeMetric(); + g.set({ taskType: "t" }, 42); + + const text = g.render("active", "Active workers"); + expect(text).toContain("# HELP active Active workers"); + expect(text).toContain("# TYPE active gauge"); + expect(text).toContain('active{taskType="t"} 42'); + }); + + it("should 
overwrite previous value on set", () => { + const g = new GaugeMetric(); + g.set({ taskType: "t" }, 10); + g.set({ taskType: "t" }, 99); + + const text = g.render("m", "help"); + expect(text).toContain('m{taskType="t"} 99'); + expect(text).not.toContain("10"); + }); + + it("should increment with inc()", () => { + const g = new GaugeMetric(); + g.inc({ taskType: "t" }); + g.inc({ taskType: "t" }); + g.inc({ taskType: "t" }, 3); + + expect(g.getValue({ taskType: "t" })).toBe(5); + }); + + it("should decrement with dec()", () => { + const g = new GaugeMetric(); + g.inc({ taskType: "t" }, 5); + g.dec({ taskType: "t" }); + g.dec({ taskType: "t" }, 2); + + expect(g.getValue({ taskType: "t" })).toBe(2); + }); + + it("should allow negative values after dec()", () => { + const g = new GaugeMetric(); + g.dec({ taskType: "t" }); + + expect(g.getValue({ taskType: "t" })).toBe(-1); + }); + + it("should return 0 for getValue on unknown labels", () => { + const g = new GaugeMetric(); + expect(g.getValue({ taskType: "unknown" })).toBe(0); + }); + + it("should track separate series for different label sets", () => { + const g = new GaugeMetric(); + g.set({ taskType: "a" }, 10); + g.set({ taskType: "b" }, 20); + + expect(g.getValue({ taskType: "a" })).toBe(10); + expect(g.getValue({ taskType: "b" })).toBe(20); + + const text = g.render("m", "help"); + expect(text).toContain('m{taskType="a"} 10'); + expect(text).toContain('m{taskType="b"} 20'); + }); +}); + +describe("bucket constants", () => { + it("TIME_BUCKETS should be sorted ascending", () => { + for (let i = 1; i < TIME_BUCKETS.length; i++) { + expect(TIME_BUCKETS[i]).toBeGreaterThan(TIME_BUCKETS[i - 1]); + } + }); + + it("SIZE_BUCKETS should be sorted ascending", () => { + for (let i = 1; i < SIZE_BUCKETS.length; i++) { + expect(SIZE_BUCKETS[i]).toBeGreaterThan(SIZE_BUCKETS[i - 1]); + } + }); +}); diff --git a/src/sdk/worker/metrics/__tests__/httpObserver.test.ts b/src/sdk/worker/metrics/__tests__/httpObserver.test.ts new file mode 100644 index 00000000..bf8adb42 --- /dev/null +++ b/src/sdk/worker/metrics/__tests__/httpObserver.test.ts @@ -0,0 +1,58 @@ +import { describe, it, expect, beforeEach, jest } from "@jest/globals"; +import { + getHttpMetricsObserver, + setHttpMetricsObserver, + type HttpMetricsObserver, +} from "../httpObserver"; + +describe("httpObserver", () => { + beforeEach(() => { + setHttpMetricsObserver(undefined); + }); + + it("should return undefined by default", () => { + expect(getHttpMetricsObserver()).toBeUndefined(); + }); + + it("should return the observer after setting one", () => { + const observer: HttpMetricsObserver = { + recordApiRequestTime: jest.fn(), + recordWorkflowInputSize: jest.fn(), + recordWorkflowStartError: jest.fn(), + }; + setHttpMetricsObserver(observer); + expect(getHttpMetricsObserver()).toBe(observer); + }); + + it("should clear the observer when set to undefined", () => { + const observer: HttpMetricsObserver = { + recordApiRequestTime: jest.fn(), + recordWorkflowInputSize: jest.fn(), + recordWorkflowStartError: jest.fn(), + }; + setHttpMetricsObserver(observer); + expect(getHttpMetricsObserver()).toBe(observer); + + setHttpMetricsObserver(undefined); + expect(getHttpMetricsObserver()).toBeUndefined(); + }); + + it("should replace the observer on subsequent set calls", () => { + const observer1: HttpMetricsObserver = { + recordApiRequestTime: jest.fn(), + recordWorkflowInputSize: jest.fn(), + recordWorkflowStartError: jest.fn(), + }; + const observer2: HttpMetricsObserver = { + recordApiRequestTime: 
jest.fn(), + recordWorkflowInputSize: jest.fn(), + recordWorkflowStartError: jest.fn(), + }; + + setHttpMetricsObserver(observer1); + expect(getHttpMetricsObserver()).toBe(observer1); + + setHttpMetricsObserver(observer2); + expect(getHttpMetricsObserver()).toBe(observer2); + }); +}); diff --git a/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts b/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts new file mode 100644 index 00000000..529fad17 --- /dev/null +++ b/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts @@ -0,0 +1,128 @@ +import { describe, it, expect, afterEach } from "@jest/globals"; +import { createMetricsCollector } from "../metricsFactory"; +import { LegacyMetricsCollector } from "../LegacyMetricsCollector"; +import { CanonicalMetricsCollector } from "../CanonicalMetricsCollector"; + +describe("createMetricsCollector", () => { + const originalEnv = { ...process.env }; + + afterEach(() => { + process.env = { ...originalEnv }; + }); + + it("should return LegacyMetricsCollector by default", () => { + delete process.env.WORKER_CANONICAL_METRICS; + delete process.env.WORKER_LEGACY_METRICS; + + const collector = createMetricsCollector(); + expect(collector).toBeInstanceOf(LegacyMetricsCollector); + }); + + it("should return LegacyMetricsCollector when WORKER_LEGACY_METRICS=true", () => { + process.env.WORKER_LEGACY_METRICS = "true"; + delete process.env.WORKER_CANONICAL_METRICS; + + const collector = createMetricsCollector(); + expect(collector).toBeInstanceOf(LegacyMetricsCollector); + }); + + it("should return CanonicalMetricsCollector when WORKER_CANONICAL_METRICS=true", () => { + process.env.WORKER_CANONICAL_METRICS = "true"; + + const collector = createMetricsCollector(); + expect(collector).toBeInstanceOf(CanonicalMetricsCollector); + }); + + it("should prefer canonical when both env vars are true", () => { + process.env.WORKER_CANONICAL_METRICS = "true"; + process.env.WORKER_LEGACY_METRICS = "true"; + + const collector = createMetricsCollector(); + expect(collector).toBeInstanceOf(CanonicalMetricsCollector); + }); + + it("should return LegacyMetricsCollector when WORKER_CANONICAL_METRICS=false", () => { + process.env.WORKER_CANONICAL_METRICS = "false"; + + const collector = createMetricsCollector(); + expect(collector).toBeInstanceOf(LegacyMetricsCollector); + }); + + it("should be case-insensitive for env var value", () => { + process.env.WORKER_CANONICAL_METRICS = "TRUE"; + + const collector = createMetricsCollector(); + expect(collector).toBeInstanceOf(CanonicalMetricsCollector); + }); + + it('should return CanonicalMetricsCollector when WORKER_CANONICAL_METRICS="1"', () => { + process.env.WORKER_CANONICAL_METRICS = "1"; + + const collector = createMetricsCollector(); + expect(collector).toBeInstanceOf(CanonicalMetricsCollector); + }); + + it('should return CanonicalMetricsCollector when WORKER_CANONICAL_METRICS="yes"', () => { + process.env.WORKER_CANONICAL_METRICS = "yes"; + + const collector = createMetricsCollector(); + expect(collector).toBeInstanceOf(CanonicalMetricsCollector); + }); + + it("should pass config through to the collector", () => { + delete process.env.WORKER_CANONICAL_METRICS; + + const collector = createMetricsCollector({ prefix: "custom_prefix" }); + expect(collector).toBeInstanceOf(LegacyMetricsCollector); + const text = collector.toPrometheusText(); + // No metrics recorded yet, so empty, but the collector was created successfully + expect(typeof text).toBe("string"); + }); + + it('legacy collector returns "legacy" from 
collectorName()', () => { + delete process.env.WORKER_CANONICAL_METRICS; + + const collector = createMetricsCollector(); + expect(collector.collectorName()).toBe("legacy"); + }); + + it('canonical collector returns "canonical" from collectorName()', () => { + process.env.WORKER_CANONICAL_METRICS = "true"; + + const collector = createMetricsCollector(); + expect(collector.collectorName()).toBe("canonical"); + }); + + it("both implementations satisfy MetricsCollectorInterface", () => { + const legacy = createMetricsCollector(); + const requiredMethods = [ + "recordTaskExecutionQueueFull", + "recordUncaughtException", + "recordWorkerRestart", + "recordTaskPaused", + "recordTaskAckError", + "recordTaskAckFailed", + "recordWorkflowStartError", + "recordExternalPayloadUsed", + "recordWorkflowInputSize", + "recordApiRequestTime", + "getMetrics", + "reset", + "stop", + "getContentType", + "toPrometheusText", + "collectorName", + "toPrometheusTextAsync", + ]; + + for (const method of requiredMethods) { + expect(typeof (legacy as unknown as Record<string, unknown>)[method]).toBe("function"); + } + + process.env.WORKER_CANONICAL_METRICS = "true"; + const canonical = createMetricsCollector(); + for (const method of requiredMethods) { + expect(typeof (canonical as unknown as Record<string, unknown>)[method]).toBe("function"); + } + }); +}); diff --git a/src/sdk/worker/metrics/accumulators.ts b/src/sdk/worker/metrics/accumulators.ts new file mode 100644 index 00000000..de8d48ae --- /dev/null +++ b/src/sdk/worker/metrics/accumulators.ts @@ -0,0 +1,177 @@ +/** + * In-memory Prometheus-compatible metric accumulators used by the + * canonical metrics implementation. + */ + +export const TIME_BUCKETS = [ + 0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, +] as const; + +export const SIZE_BUCKETS = [ + 100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000, +] as const; + +export function labelKey(labels: Record<string, string>): string { + const keys = Object.keys(labels).sort(); + return keys.map((k) => `${k}=${labels[k]}`).join(","); +} + +export function renderLabels(labels: Record<string, string>): string { + return Object.entries(labels) + .map(([k, v]) => `${k}="${v}"`) + .join(","); +} + +export function exceptionLabel(error: unknown): string { + if (error instanceof Error) { + return error.name || error.constructor?.name || "Error"; + } + return "Error"; +} + +// ── HistogramAccumulator ───────────────────────────────────────── + +interface HistogramSeries { + labels: Record<string, string>; + buckets: number[]; + count: number; + sum: number; +} + +export class HistogramAccumulator { + private readonly _boundaries: readonly number[]; + private _series = new Map<string, HistogramSeries>(); + + constructor(boundaries: readonly number[] = TIME_BUCKETS) { + this._boundaries = boundaries; + } + + observe(labels: Record<string, string>, value: number): void { + const key = labelKey(labels); + let s = this._series.get(key); + if (!s) { + s = { + labels, + buckets: new Array(this._boundaries.length).fill(0), + count: 0, + sum: 0, + }; + this._series.set(key, s); + } + for (let i = 0; i < this._boundaries.length; i++) { + if (value <= this._boundaries[i]) { + s.buckets[i]++; + } + } + s.count++; + s.sum += value; + } + + render(name: string, help: string): string { + if (this._series.size === 0) return ""; + const lines: string[] = []; + lines.push(`# HELP ${name} ${help}`); + lines.push(`# TYPE ${name} histogram`); + for (const s of this._series.values()) { + const lblStr = renderLabels(s.labels); + const sep = lblStr ?
"," : ""; + for (let i = 0; i < this._boundaries.length; i++) { + lines.push( + `${name}_bucket{${lblStr}${sep}le="${this._boundaries[i]}"} ${s.buckets[i]}`, + ); + } + lines.push(`${name}_bucket{${lblStr}${sep}le="+Inf"} ${s.count}`); + lines.push(`${name}_sum{${lblStr}} ${s.sum}`); + lines.push(`${name}_count{${lblStr}} ${s.count}`); + } + return lines.join("\n"); + } +} + +// ── MultiLabelCounter ──────────────────────────────────────────── + +interface CounterSeries { + labels: Record; + value: number; +} + +export class MultiLabelCounter { + private _series = new Map(); + + increment(labels: Record, value = 1): void { + const key = labelKey(labels); + let s = this._series.get(key); + if (!s) { + s = { labels, value: 0 }; + this._series.set(key, s); + } + s.value += value; + } + + render(name: string, help: string): string { + if (this._series.size === 0) return ""; + const lines: string[] = []; + lines.push(`# HELP ${name} ${help}`); + lines.push(`# TYPE ${name} counter`); + for (const s of this._series.values()) { + lines.push(`${name}{${renderLabels(s.labels)}} ${s.value}`); + } + return lines.join("\n"); + } +} + +// ── GaugeMetric ────────────────────────────────────────────────── + +interface GaugeSeries { + labels: Record; + value: number; +} + +export class GaugeMetric { + private _series = new Map(); + + set(labels: Record, value: number): void { + const key = labelKey(labels); + let s = this._series.get(key); + if (!s) { + s = { labels, value: 0 }; + this._series.set(key, s); + } + s.value = value; + } + + inc(labels: Record, delta = 1): void { + const key = labelKey(labels); + let s = this._series.get(key); + if (!s) { + s = { labels, value: 0 }; + this._series.set(key, s); + } + s.value += delta; + } + + dec(labels: Record, delta = 1): void { + const key = labelKey(labels); + let s = this._series.get(key); + if (!s) { + s = { labels, value: 0 }; + this._series.set(key, s); + } + s.value -= delta; + } + + getValue(labels: Record): number { + return this._series.get(labelKey(labels))?.value ?? 0; + } + + render(name: string, help: string): string { + if (this._series.size === 0) return ""; + const lines: string[] = []; + lines.push(`# HELP ${name} ${help}`); + lines.push(`# TYPE ${name} gauge`); + for (const s of this._series.values()) { + lines.push(`${name}{${renderLabels(s.labels)}} ${s.value}`); + } + return lines.join("\n"); + } +} diff --git a/src/sdk/worker/metrics/httpObserver.ts b/src/sdk/worker/metrics/httpObserver.ts new file mode 100644 index 00000000..fa2cbc76 --- /dev/null +++ b/src/sdk/worker/metrics/httpObserver.ts @@ -0,0 +1,40 @@ +/** + * Global metrics observer for recording API client request latency + * and workflow-level metrics from code outside the event system. + * + * Both LegacyMetricsCollector and CanonicalMetricsCollector register + * themselves here on construction; fetchWithRetry and WorkflowExecutor + * call the observer without needing a direct reference. 
+ */ + +export interface HttpMetricsObserver { + recordApiRequestTime( + method: string, + uri: string, + status: number | string, + durationMs: number, + ): void; + + recordWorkflowInputSize( + workflowType: string, + sizeBytes: number, + version?: string, + ): void; + + recordWorkflowStartError( + workflowType?: string, + exception?: string, + ): void; +} + +let _observer: HttpMetricsObserver | undefined; + +export function setHttpMetricsObserver( + observer: HttpMetricsObserver | undefined, +): void { + _observer = observer; +} + +export function getHttpMetricsObserver(): HttpMetricsObserver | undefined { + return _observer; +} diff --git a/src/sdk/worker/metrics/index.ts b/src/sdk/worker/metrics/index.ts index 9869a1c7..732e02f7 100644 --- a/src/sdk/worker/metrics/index.ts +++ b/src/sdk/worker/metrics/index.ts @@ -1,7 +1,37 @@ +// Interface +export type { MetricsCollectorInterface } from "./MetricsCollectorInterface"; + +// Implementations export { - MetricsCollector, + LegacyMetricsCollector, type MetricsCollectorConfig, type WorkerMetrics, -} from "./MetricsCollector"; +} from "./LegacyMetricsCollector"; +export { CanonicalMetricsCollector } from "./CanonicalMetricsCollector"; + +// Backward-compat alias: MetricsCollector → LegacyMetricsCollector +export { LegacyMetricsCollector as MetricsCollector } from "./LegacyMetricsCollector"; + +// Factory +export { createMetricsCollector } from "./metricsFactory"; + +// HTTP observer +export { + type HttpMetricsObserver, + getHttpMetricsObserver, + setHttpMetricsObserver, +} from "./httpObserver"; + +// Server & registries export { MetricsServer } from "./MetricsServer"; export { PrometheusRegistry } from "./PrometheusRegistry"; +export { CanonicalPrometheusRegistry } from "./CanonicalPrometheusRegistry"; + +// Accumulators (exposed for advanced usage / testing) +export { + HistogramAccumulator, + MultiLabelCounter, + GaugeMetric, + TIME_BUCKETS, + SIZE_BUCKETS, +} from "./accumulators"; diff --git a/src/sdk/worker/metrics/metricsFactory.ts b/src/sdk/worker/metrics/metricsFactory.ts new file mode 100644 index 00000000..dabe4810 --- /dev/null +++ b/src/sdk/worker/metrics/metricsFactory.ts @@ -0,0 +1,29 @@ +import type { MetricsCollectorConfig } from "./LegacyMetricsCollector"; +import type { MetricsCollectorInterface } from "./MetricsCollectorInterface"; +import { LegacyMetricsCollector } from "./LegacyMetricsCollector"; +import { CanonicalMetricsCollector } from "./CanonicalMetricsCollector"; +import { setHttpMetricsObserver } from "./httpObserver"; + +/** + * Create the appropriate MetricsCollector based on environment variables. + * + * - WORKER_CANONICAL_METRICS=true -> CanonicalMetricsCollector (default: false) + * - WORKER_LEGACY_METRICS=true -> LegacyMetricsCollector (default: true) + * + * WORKER_CANONICAL_METRICS takes priority when both are set. + * During the deprecation transition period the default is legacy. + */ +export function createMetricsCollector( + config?: MetricsCollectorConfig, +): MetricsCollectorInterface { + const useCanonical = ["true", "1", "yes"].includes( + (process.env.WORKER_CANONICAL_METRICS ?? "").toLowerCase(), + ); + + const collector = useCanonical + ? new CanonicalMetricsCollector(config) + : new LegacyMetricsCollector(config); + + setHttpMetricsObserver(collector); + return collector; +}
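Taken together, `metricsFactory.ts` and the barrel exports in `index.ts` give consumers a single opt-in switch. A minimal usage sketch — not part of this diff; the import path, port, and inline env assignment are illustrative only (in practice the variable comes from the deployment environment, as in the harness manifest):

```ts
import { createMetricsCollector, MetricsServer } from "./src/sdk/worker/metrics";

// createMetricsCollector() reads WORKER_CANONICAL_METRICS at call time,
// so the variable must be set before the factory runs.
process.env.WORKER_CANONICAL_METRICS = "true";

const collector = createMetricsCollector();
console.log(collector.collectorName()); // "canonical"; "legacy" when the var is unset

// MetricsServer now accepts any MetricsCollectorInterface, so the same
// wiring serves either surface on GET /metrics.
const server = new MetricsServer(collector, 9100);
server.start().catch(console.error);
```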
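The `httpObserver` indirection also gives tests and embedders a seam: any object satisfying `HttpMetricsObserver` can be swapped in. A sketch under that assumption — the logging sink below is invented for illustration, and note that replacing the observer diverts HTTP timings away from the active collector until one is re-registered:

```ts
import {
  setHttpMetricsObserver,
  getHttpMetricsObserver,
  type HttpMetricsObserver,
} from "./src/sdk/worker/metrics";

// Illustrative sink that logs instead of aggregating — handy for seeing
// which requests fetchWithRetry actually instruments.
const loggingObserver: HttpMetricsObserver = {
  recordApiRequestTime(method, uri, status, durationMs) {
    console.log(`${method} ${uri} -> ${status} (${durationMs}ms)`);
  },
  recordWorkflowInputSize(workflowType, sizeBytes, version) {
    console.log(`workflow input ${workflowType} v${version ?? "?"}: ${sizeBytes}B`);
  },
  recordWorkflowStartError(workflowType, exception) {
    console.warn(`workflow start failed: ${workflowType ?? ""} (${exception ?? "Error"})`);
  },
};

setHttpMetricsObserver(loggingObserver);
// ... exercise API calls ...
setHttpMetricsObserver(undefined); // cleared; the next createMetricsCollector() re-registers
console.assert(getHttpMetricsObserver() === undefined);
```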