diff --git a/.gitignore b/.gitignore index b3b772d43..09d0e1893 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,6 @@ bin/ .project .settings/ .factorypath +*.db +*.db-shm +*.db-wal diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f2595f53..85a7ba9d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,23 @@ All notable changes to this project will be documented in this file. +## [Unreleased] + +### Added + +- Canonical metrics mode: opt-in harmonized metric surface via `WORKER_CANONICAL_METRICS=true` — [details](conductor-client-metrics/README.md#detailed-technical-notes--unreleased) +- Automatic metrics wiring: `ConductorClient.Builder.withMetricsCollector(...)` installs the HTTP interceptor and auto-registers listeners on `TaskClient` and `WorkflowClient` (automatic in canonical mode; opt-in via `setAutoWiringEnabled(true)` for legacy) + +### Changed + +- Legacy metrics emit unchanged by default; no env var required +- `micrometer-registry-prometheus` is now a transitive (`api`) dependency + +### Deprecated + +- `PrometheusMetricsCollector` — use `MetricsCollectorFactory.create()` or `MetricsBundle.create()` +- `TaskClient.ack(String, String)` — use `ack(String taskType, String taskId, String workerId)` + ## [4.0.0] - 2024-10-09 - New major release – [Read more](https://orkes.io/blog/conductor-java-client-v4/) diff --git a/INTERCEPTOR.md b/INTERCEPTOR.md index e3b449cc7..4e7510019 100644 --- a/INTERCEPTOR.md +++ b/INTERCEPTOR.md @@ -240,26 +240,31 @@ public class ListenerRegister { } ``` -### 4. PrometheusMetricsCollector +### 4. MetricsCollectorFactory / LegacyPrometheusMetricsCollector / CanonicalPrometheusMetricsCollector -**Location**: `conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/PrometheusMetricsCollector.java` +**Location**: `conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/` Reference implementation of `MetricsCollector` using Micrometer Prometheus. 
**Features**: - Exposes HTTP endpoint for Prometheus scraping (default: `localhost:9991/metrics`) -- Records timers for poll duration (success/failure) -- Records timers for task execution duration (completed/failure) -- Records counters for poll started and task execution started -- All metrics tagged with task type - -**Metrics Exposed**: -- `poll_failure` (timer) - Duration of failed polls -- `poll_success` (timer) - Duration of successful polls -- `poll_started` (counter) - Count of poll attempts -- `task_execution_started` (counter) - Count of task executions started -- `task_execution_completed` (timer) - Duration of completed task executions -- `task_execution_failure` (timer) - Duration of failed task executions +- Selects either the legacy or canonical Prometheus collector at startup +- Records worker, task client, and workflow client metrics through the event listener system +- Records HTTP API client metrics through an OkHttp interceptor +- Keeps the metrics backend separated from task and workflow business logic + +For setup instructions, environment-variable selection, the complete legacy and canonical metric catalogs, and migration guidance, see [`conductor-client-metrics/README.md`](conductor-client-metrics/README.md). + +### Compatibility: `PrometheusMetricsCollector` + +`com.netflix.conductor.client.metrics.prometheus.PrometheusMetricsCollector` is retained as a deprecated alias for `LegacyPrometheusMetricsCollector`. Existing 4.0.x code that does: + +```java +PrometheusMetricsCollector metricsCollector = new PrometheusMetricsCollector(); +metricsCollector.startServer(9991, "/metrics"); +``` + +continues to compile and emit the same six legacy meter names (`poll_started`, `poll_success`, `poll_failure`, `task_execution_started`, `task_execution_completed`, `task_execution_failure`) byte-for-byte. 
The shim deliberately delegates to `LegacyPrometheusMetricsCollector`, **not** to `MetricsCollectorFactory.create()`, so an upgrader who already has `WORKER_CANONICAL_METRICS=true` in their environment is not silently flipped to the canonical surface. New code should use `MetricsCollectorFactory.create()` (or `MetricsBundle.create()`) to opt into env-var-driven selection. ## Event Lifecycle @@ -308,21 +313,20 @@ Reference implementation of `MetricsCollector` using Micrometer Prometheus. ``` ┌─────────────────────────────────────────────────────────────────┐ -│ 1. Check payload size │ +│ 1. Off-load oversized payload (only when │ +│ isEnforceThresholds() == true) │ │ WorkflowClient.checkAndUploadToExternalStorage() │ -│ └─→ eventDispatcher.publish( │ -│ new WorkflowInputPayloadSizeEvent(name, version, size)) │ +│ └─→ if size > threshold: │ +│ eventDispatcher.publish( │ +│ new WorkflowPayloadUsedEvent(name, version, │ +│ "WRITE", "WORKFLOW_INPUT")) │ └─────────────────────────────────────────────────────────────────┘ ▼ ┌─────────────────────────────────────────────────────────────────┐ -│ 2. Upload to external storage (if needed) │ -│ └─→ eventDispatcher.publish( │ -│ new WorkflowPayloadUsedEvent(name, version, │ -│ "WRITE", "WORKFLOW_INPUT")) │ -└─────────────────────────────────────────────────────────────────┘ - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ 3. Start workflow │ +│ 2. Start workflow (POST /workflow tagged with │ +│ PayloadKind.WorkflowInput so the ApiClientMetrics │ +│ OkHttp interceptor records workflow_input_size_bytes │ +│ from RequestBody.contentLength() at wire time) │ │ WorkflowClient.startWorkflow() │ │ • Success: eventDispatcher.publish( │ │ new WorkflowStartedEvent(name, version)) │ @@ -331,37 +335,54 @@ Reference implementation of `MetricsCollector` using Micrometer Prometheus. 
└─────────────────────────────────────────────────────────────────┘ ``` +> Note: `WorkflowInputPayloadSizeEvent` is no longer published from +> `WorkflowClient` — the canonical `workflow_input_size_bytes` histogram is +> populated at wire time by `ApiClientMetrics`, which avoids serializing the +> input twice. The event POJO and `consume(WorkflowInputPayloadSizeEvent)` +> hook are retained for third-party publishers and route through the same +> `PrometheusApiClientMetrics` helper. The same applies to +> `TaskResultPayloadSizeEvent` and `task_result_size_bytes`. + ## Usage Guide ### Basic Setup with Prometheus Metrics ```java -import com.netflix.conductor.client.http.TaskClient; import com.netflix.conductor.client.automator.TaskRunnerConfigurer; -import com.netflix.conductor.client.metrics.prometheus.PrometheusMetricsCollector; +import com.netflix.conductor.client.http.ConductorClient; +import com.netflix.conductor.client.http.TaskClient; +import com.netflix.conductor.client.http.WorkflowClient; +import com.netflix.conductor.client.metrics.prometheus.MetricsBundle; -// 1. Create TaskClient -TaskClient taskClient = new TaskClient("http://conductor-server:8080"); +// 1. Create and start metrics (factory-selected collector + Prometheus scrape server) +MetricsBundle bundle = MetricsBundle.create(); // port 9991, /metrics -// 2. Create and start PrometheusMetricsCollector -PrometheusMetricsCollector metricsCollector = new PrometheusMetricsCollector(); -metricsCollector.startServer(); // Starts HTTP server on port 9991 +// 2. Create ConductorClient — withMetricsCollector installs the HTTP interceptor +// and enables automatic listener registration on downstream clients +ConductorClient client = ConductorClient.builder() + .basePath("http://conductor-server:8080/api") + .withMetricsCollector(bundle.getCollector()) + .build(); + +// 3. 
Downstream clients auto-register as listeners +TaskClient taskClient = new TaskClient(client); +WorkflowClient workflowClient = new WorkflowClient(client); -// 3. Configure TaskRunner with metrics TaskRunnerConfigurer configurer = new TaskRunnerConfigurer.Builder(taskClient, workers) .withThreadCount(10) - .withMetricsCollector(metricsCollector) .build(); // 4. Start polling configurer.init(); ``` +For fine-grained control over which listeners are registered, use `withHttpMetrics` instead of `withMetricsCollector`. This installs only the HTTP interceptor and leaves all listener registration to you. See the [Manual Wiring](conductor-client-metrics/README.md#manual-wiring) section in the metrics README. + ### Custom Metrics Endpoint ```java // Start Prometheus server on custom port and endpoint -PrometheusMetricsCollector metricsCollector = new PrometheusMetricsCollector(); +AbstractPrometheusMetricsCollector metricsCollector = MetricsCollectorFactory.create(); metricsCollector.startServer(8080, "/custom-metrics"); ``` @@ -509,8 +530,11 @@ ListenerRegister.register(new TaskMonitor(), dispatcher); ### Workflow and Task Client Event Listeners ```java -WorkflowClient workflowClient = new WorkflowClient("http://conductor-server:8080"); -TaskClient taskClient = new TaskClient("http://conductor-server:8080"); +ConductorClient client = ConductorClient.builder() + .basePath("http://conductor-server:8080/api") + .build(); +WorkflowClient workflowClient = new WorkflowClient(client); +TaskClient taskClient = new TaskClient(client); // Register workflow listener workflowClient.registerListener(new WorkflowClientListener() { @@ -834,7 +858,7 @@ public class CloudWatchMetricsCollector implements MetricsCollector { ```java // Create multiple collectors -PrometheusMetricsCollector prometheus = new PrometheusMetricsCollector(); +AbstractPrometheusMetricsCollector prometheus = MetricsCollectorFactory.create(); prometheus.startServer(9991, "/metrics"); DatadogMetricsCollector datadog = 
new DatadogMetricsCollector( @@ -1192,16 +1216,21 @@ public class MetricsOverheadMonitor implements TaskRunnerEventsListener { package com.example.conductor; import com.netflix.conductor.client.http.TaskClient; +import com.netflix.conductor.client.http.ConductorClient; import com.netflix.conductor.client.automator.TaskRunnerConfigurer; import com.netflix.conductor.client.worker.Worker; -import com.netflix.conductor.client.metrics.prometheus.PrometheusMetricsCollector; +import com.netflix.conductor.client.metrics.prometheus.MetricsCollectorFactory; +import com.netflix.conductor.client.metrics.prometheus.AbstractPrometheusMetricsCollector; import java.util.List; public class ConductorMonitoringSetup { public static void main(String[] args) throws Exception { // 1. Create clients - TaskClient taskClient = new TaskClient("http://conductor-server:8080"); + ConductorClient client = ConductorClient.builder() + .basePath("http://conductor-server:8080/api") + .build(); + TaskClient taskClient = new TaskClient(client); // 2. Create workers List workers = List.of( @@ -1210,7 +1239,7 @@ public class ConductorMonitoringSetup { ); // 3. Setup Prometheus metrics - PrometheusMetricsCollector prometheus = new PrometheusMetricsCollector(); + AbstractPrometheusMetricsCollector prometheus = MetricsCollectorFactory.create(); prometheus.startServer(9991, "/metrics"); // 4. 
Setup custom monitoring diff --git a/README.md b/README.md index aa976699d..fb2707df5 100644 --- a/README.md +++ b/README.md @@ -298,9 +298,9 @@ executor.initWorkers("com.mycompany.workers"); // Package to scan for @WorkerTa ## Monitoring Workers -Enable metrics collection for monitoring workers: +Enable Prometheus metrics collection for monitoring workers: -```java +```groovy // Using conductor-client-metrics module dependencies { implementation 'org.conductoross:conductor-client-metrics:4.0.1' @@ -308,14 +308,19 @@ dependencies { ``` ```java -// Configure metrics with Prometheus +import com.netflix.conductor.client.metrics.prometheus.MetricsCollectorFactory; + TaskRunnerConfigurer configurer = new TaskRunnerConfigurer.Builder(taskClient, workers) .withThreadCount(10) - .withMetricsCollector(new PrometheusMetricsCollector()) + .withMetricsCollector(MetricsCollectorFactory.create()) .build(); ``` -See [conductor-client-metrics/README.md](conductor-client-metrics/README.md) for full metrics documentation. +`MetricsCollectorFactory.create()` uses the legacy Java SDK metric names by default. Set `WORKER_CANONICAL_METRICS=true` to opt in to the canonical cross-SDK metric names. + +> Migrating from 4.0.x? `PrometheusMetricsCollector` still works — it is now a deprecated alias for `LegacyPrometheusMetricsCollector` and emits the same six legacy meter names byte-for-byte. New code should use `MetricsCollectorFactory.create()` (or `MetricsBundle.create()`) so it can opt into canonical metrics via `WORKER_CANONICAL_METRICS=true`. + +See [conductor-client-metrics/README.md](conductor-client-metrics/README.md) for setup details, the complete legacy and canonical metric catalogs, and migration guidance. 
## Workflows diff --git a/conductor-client-metrics/README.md b/conductor-client-metrics/README.md index c2ef9336c..f058b8899 100644 --- a/conductor-client-metrics/README.md +++ b/conductor-client-metrics/README.md @@ -1,9 +1,336 @@ # Conductor Client Metrics -**Status: Incubating.** +**Status: Incubating.** -Provides metrics and monitoring capabilities for Conductor clients. +The `conductor-client-metrics` module provides Prometheus metrics for Java SDK clients and workers. It helps operators monitor worker polling, task execution, task result updates, payload sizes, workflow starts, and HTTP client latency. -It helps developers track the performance and health of their workers, offering insights into task execution times, error rates, and system throughput. +This document covers the Java SDK metrics emitted by `MetricsCollectorFactory`, `LegacyPrometheusMetricsCollector`, and `CanonicalPrometheusMetricsCollector`. It does not cover Conductor server metrics or metrics emitted by other SDKs. -As an incubating module, it's still under development and subject to changes. \ No newline at end of file +## Installation + +Add the metrics module to the worker application: + +```groovy +dependencies { + implementation 'org.conductoross:conductor-client-metrics:4.0.1' +} +``` + +## Usage + +The Java SDK offers two ways to wire metrics: automatic wiring (recommended) and manual wiring. Both produce the same metrics output. + +### Automatic Wiring + +Use `MetricsBundle` to create the collector and start the scrape server, then pass the collector to `ConductorClient.Builder`. All downstream clients and the task runner auto-register themselves as listeners. 
+ +```java +import com.netflix.conductor.client.metrics.prometheus.MetricsBundle; + +MetricsBundle bundle = MetricsBundle.create(); // port 9991, /metrics + +ConductorClient client = ConductorClient.builder() + .basePath("http://conductor-server:8080/api") + .withMetricsCollector(bundle.getCollector()) + .build(); + +TaskClient taskClient = new TaskClient(client); +WorkflowClient workflowClient = new WorkflowClient(client); + +TaskRunnerConfigurer configurer = new TaskRunnerConfigurer.Builder(taskClient, workers) + .withThreadCount(10) + .build(); +configurer.init(); +``` + +`MetricsBundle.create()` also accepts `(port)` and `(port, endpoint)` overloads for custom scrape configurations. The client builder accepts the usual timeouts, SSL, authentication, and other options alongside `withMetricsCollector` -- none of them change how metrics wiring works. + +In canonical mode, the `ConductorClient` automatically installs an OkHttp interceptor that records `http_api_client_request_seconds`. In legacy mode the interceptor is skipped entirely since there is nothing to record. + +### Manual Wiring + +For advanced use cases where you need fine-grained control over which listeners are registered where, or you want to mix the metrics collector with custom event listeners, use `withHttpMetrics` instead of `withMetricsCollector` on the builder. 
This installs only the HTTP interceptor (for `http_api_client_request_seconds`, `task_result_size_bytes`, `workflow_input_size_bytes`) without triggering automatic listener registration on downstream clients: + +```java +import com.netflix.conductor.client.metrics.prometheus.AbstractPrometheusMetricsCollector; +import com.netflix.conductor.client.metrics.prometheus.MetricsCollectorFactory; + +AbstractPrometheusMetricsCollector metricsCollector = MetricsCollectorFactory.create(); +metricsCollector.startServer(); // http://localhost:9991/metrics + +ConductorClient client = ConductorClient.builder() + .basePath("http://conductor-server:8080/api") + .withHttpMetrics(metricsCollector) + .build(); + +TaskClient taskClient = new TaskClient(client); +taskClient.registerListener(metricsCollector); +taskClient.registerTaskRunnerListener(metricsCollector); + +TaskRunnerConfigurer configurer = new TaskRunnerConfigurer.Builder(taskClient, workers) + .withThreadCount(10) + .withMetricsCollector(metricsCollector) + .build(); + +configurer.init(); + +WorkflowClient workflowClient = new WorkflowClient(client); +workflowClient.registerListener(metricsCollector); +``` + +Use this approach when you need to register the metrics collector on some clients but not others, or when mixing in custom event listeners alongside the metrics collector. With `withHttpMetrics`, none of the listener registrations happen automatically -- you choose exactly which clients get which listeners. + +### How Auto-Registration Works + +When a `MetricsCollector` is passed to `ConductorClient.Builder.withMetricsCollector()`: + +1. The `ConductorClient` installs an OkHttp interceptor that records `http_api_client_request_seconds`. In legacy mode the interceptor is not installed because `getApiClientMetrics()` returns `ApiClientMetrics.NOOP`. +2. `TaskClient` detects the collector from the `ConductorClient` it receives and calls `registerListener` and `registerTaskRunnerListener` on itself. +3. 
`WorkflowClient` detects the collector from the `ConductorClient` it receives and calls `registerListener` on itself. +4. `TaskRunnerConfigurer.Builder.build()` detects the collector from the `TaskClient`'s `ConductorClient` and registers task-runner events automatically, unless `withMetricsCollector` was called explicitly on the builder. + +All registrations are idempotent. If you call both `withMetricsCollector` on the builder and `registerListener` manually with the same collector, events are not duplicated. + +To install only the HTTP interceptor (step 1) without triggering automatic listener registration (steps 2-4), use `withHttpMetrics` instead of `withMetricsCollector` on the builder. See [Manual Wiring](#manual-wiring) above. + +The collector exposes Prometheus text format from the embedded HTTP server. Metrics are created lazily, so a metric family appears after the corresponding worker or client event has occurred. + +## Tuning + +Canonical mode enables several hot-path behaviors that legacy mode leaves off by default to preserve zero-overhead backward compatibility. These can be toggled per-collector or overridden per-`TaskRunnerConfigurer`. + +### Collector-level flags + +Call these on `MetricsCollector` (or `AbstractPrometheusMetricsCollector`) before passing it to the builder: + +| Flag | Canonical default | Legacy default | Effect | +|---|---|---|---| +| `setAutoWiringEnabled(boolean)` | `true` | `false` | When `true`, `TaskClient` / `WorkflowClient` constructors auto-register the collector as an event listener. When `false`, callers must register manually. | +| `setActiveWorkersTrackingEnabled(boolean)` | `true` | `false` | When `true`, `TaskRunner` publishes an `ActiveWorkersChanged` event on every task start and finish, driving the `active_workers` gauge. Adds two async event dispatches per task execution. 
| +| `setDiagnosticEventsEnabled(boolean)` | `true` | `false` | When `true`, `TaskRunner` publishes `TaskPaused` and `TaskExecutionQueueFull` per poll cycle, and `TaskClient` publishes `TaskAckFailure` / `TaskAckError` on ack outcomes. | + +### TaskRunnerConfigurer overrides + +These builder methods override the collector defaults for a single configurer instance: + +| Builder method | Overrides | +|---|---| +| `withActiveWorkersTracking(boolean)` | `MetricsCollector.isActiveWorkersTrackingEnabled()` | +| `withDiagnosticEvents(boolean)` | `MetricsCollector.isDiagnosticEventsEnabled()` | + +Example: opt a legacy-mode deployment into active-worker tracking without switching to canonical metrics: + +```java +MetricsCollector collector = new LegacyPrometheusMetricsCollector(); +collector.setAutoWiringEnabled(true); + +TaskRunnerConfigurer configurer = new TaskRunnerConfigurer.Builder(taskClient, workers) + .withActiveWorkersTracking(true) + .build(); +``` + +## Legacy and Canonical Modes + +The Java SDK currently supports two mutually exclusive metric surfaces: + +- **Legacy metrics** preserve the original Java SDK names and the `type` task label. This is the default. +- **Canonical metrics** use the cross-SDK worker metric catalog with harmonized names, labels, units, and bucket boundaries. + +`MetricsCollectorFactory.create()` reads `WORKER_CANONICAL_METRICS` when the collector is created: + +| Environment variable | Values | Effect | +|---|---|---| +| `WORKER_CANONICAL_METRICS` | `true`, `1`, or `yes` (case-insensitive, surrounding whitespace ignored) | Selects `CanonicalPrometheusMetricsCollector`. | +| `WORKER_CANONICAL_METRICS` | unset, blank, `false`, `0`, `no`, or any other value | Selects `LegacyPrometheusMetricsCollector`. | + +Only one implementation is active at a time. The Java SDK does not dual-emit legacy and canonical names from the same collector. Restart workers after changing `WORKER_CANONICAL_METRICS` so the factory creates the desired collector. 
+ +`WORKER_LEGACY_METRICS` is reserved for a future default-flip phase and is not currently read by the Java SDK factory. + +## Legacy Metrics Catalog + +Legacy mode emits the original Java SDK worker metrics. The table lists the Micrometer meter names registered by the collector. In Prometheus scrape output, Micrometer may add suffixes such as `_total`, `_seconds_count`, `_seconds_sum`, and `_seconds_max` depending on the meter type. + +| Meter | Micrometer type | Labels | Meaning | +|---|---|---|---| +| `poll_started` | Counter | `type` | Count of poll attempts for a task type. | +| `poll_success` | Timer | `type` | Duration of successful poll requests. | +| `poll_failure` | Timer | `type` | Duration of failed poll requests. | +| `task_execution_started` | Counter | `type` | Count of tasks dispatched to worker code. | +| `task_execution_completed` | Timer | `type` | Duration of successful task executions. | +| `task_execution_failure` | Timer | `type` | Duration of failed task executions. | + +Legacy mode intentionally does not emit canonical-only metrics for task update latency, task ack failures, queue saturation, paused workers, uncaught worker thread exceptions, external payload usage, task result size, workflow input size, workflow start errors, active workers, or HTTP API client latency. + +## Canonical Metrics Catalog + +Canonical mode emits the harmonized Java SDK metric surface. Time metrics use seconds and the standard canonical bucket boundaries. Size metrics use bytes and the standard canonical size bucket boundaries. Exception labels use bounded exception type names, not exception messages or stack traces. + +### Counters + +| Meter | Labels | Meaning | +|---|---|---| +| `task_poll_total` | `taskType` | Incremented each time a worker issues a poll request. | +| `task_execution_started_total` | `taskType` | Incremented when a polled task is dispatched to the worker function. 
| +| `task_poll_error_total` | `taskType`, `exception` | Incremented when polling fails with a client-side exception. | +| `task_execute_error_total` | `taskType`, `exception` | Incremented when worker code throws while executing a task. | +| `task_update_error_total` | `taskType`, `exception` | Incremented when reporting a task result back to Conductor fails. | +| `task_ack_failed_total` | `taskType` | Incremented when an explicit task ack response is unsuccessful. The internal task runner uses batch poll responses as ack and may not emit this during normal polling. | +| `task_ack_error_total` | `taskType`, `exception` | Incremented when an explicit task ack call throws. The internal task runner uses batch poll responses as ack and may not emit this during normal polling. | +| `task_execution_queue_full_total` | `taskType` | Incremented when a poll cycle is skipped because all worker threads are busy (zero permits available). | +| `task_paused_total` | `taskType` | Incremented when a worker is paused and skips acting on a poll. | +| `thread_uncaught_exceptions_total` | `exception` | Incremented when a worker thread raises an uncaught exception. | +| `external_payload_used_total` | `entityName`, `operation`, `payloadType` | Incremented when external payload storage is used for task or workflow payloads. | +| `workflow_start_error_total` | `workflowType`, `exception` | Incremented when starting a workflow fails client-side. | + +### Time Metrics + +| Meter | Labels | Meaning | +|---|---|---| +| `task_poll_time_seconds` | `taskType`, `status` | Poll request latency. `status` is `SUCCESS` or `FAILURE`. | +| `task_execute_time_seconds` | `taskType`, `status` | Worker function execution latency. `status` is `SUCCESS` or `FAILURE`. | +| `task_update_time_seconds` | `taskType`, `status` | Latency for reporting a task result back to Conductor. `status` is `SUCCESS` or `FAILURE`. 
| +| `http_api_client_request_seconds` | `method`, `uri`, `status` | Latency of HTTP requests made by the API client. `status` is the HTTP status code as a string, or `0` when no response status is available. | + +Time metrics use these service-level objective buckets, in seconds: + +```text +0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.250, 0.500, 1, 2.5, 5, 10 +``` + +The `uri` label for `http_api_client_request_seconds` uses the path template (e.g. `/workflow/{workflowId}`, `/tasks/poll/batch/{taskType}`) rather than the resolved path. This keeps the label space bounded regardless of how many unique workflow or task IDs are processed. + +### Size Metrics + +| Meter | Labels | Meaning | +|---|---|---| +| `task_result_size_bytes` | `taskType` | Serialized task result output size, captured from `RequestBody.contentLength()` of the outbound `POST /tasks` (or `POST /tasks/update-v2`) request. `taskType` is empty when the caller used the single-argument `TaskClient.updateTask(TaskResult)` overload. | +| `workflow_input_size_bytes` | `workflowType`, `version` | Serialized workflow input size, captured from `RequestBody.contentLength()` of the outbound `POST /workflow` request. `version` is an empty string when the workflow version is absent. | + +Both histograms are populated at wire time by the `ApiClientMetrics` OkHttp interceptor, reading a `PayloadKind` tag attached by `TaskClient`/`WorkflowClient`. The byte count is read off the request body the HTTP layer is about to send, so no extra JSON serialization is needed. + +Size metrics use these service-level objective buckets, in bytes: + +```text +100, 1000, 10000, 100000, 1000000, 10000000 +``` + +### Gauges + +| Meter | Labels | Meaning | +|---|---|---| +| `active_workers` | `taskType` | Current number of worker threads actively executing tasks. | + +## Labels + +| Label | Used by | Values | +|---|---|---| +| `type` | Legacy worker metrics | Task definition name. Replaced by `taskType` in canonical mode. 
| +| `taskType` | Canonical worker metrics | Task definition name. | +| `workflowType` | Workflow metrics | Workflow definition name. | +| `version` | `workflow_input_size_bytes` | Workflow version as a string. Empty string when the version is absent. | +| `status` | Task time metrics, HTTP metrics | For task time metrics, `SUCCESS` or `FAILURE`. For `http_api_client_request_seconds`, the HTTP status code as a string, or `0` when no response status is available. | +| `exception` | Canonical error counters | Exception type name, such as `SocketTimeoutException`. | +| `entityName` | `external_payload_used_total` | Task type or workflow name associated with the external payload. | +| `operation` | `external_payload_used_total` | External payload operation, such as `READ` or `WRITE`. | +| `payloadType` | `external_payload_used_total` | Payload type, such as `TASK_INPUT`, `TASK_OUTPUT`, `WORKFLOW_INPUT`, or `WORKFLOW_OUTPUT`. | +| `method` | HTTP metrics | HTTP verb. | +| `uri` | HTTP metrics | Path template from the Java HTTP client (e.g. `/workflow/{workflowId}`). Resolved identifiers are not included for requests tagged with a template, keeping cardinality bounded; untagged requests fall back to the resolved path. | + +## Migration from 4.0.x + +The 4.0.x entry point `PrometheusMetricsCollector` is retained as a deprecated alias for `LegacyPrometheusMetricsCollector`, so existing code keeps compiling and emits the same six legacy meter names byte-for-byte. 
Use this table to decide what to do at upgrade time: + +| 4.0.x usage | 4.x replacement | +|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------| +| `new PrometheusMetricsCollector()` | `MetricsCollectorFactory.create()` (or `MetricsBundle.create()`) — env-var-selected legacy or canonical | +| `new PrometheusMetricsCollector()` (force legacy names) | `new LegacyPrometheusMetricsCollector()` | +| `metricsCollector.startServer(port, "/metrics")` | unchanged — still on `AbstractPrometheusMetricsCollector` | + +The shim is intentionally pinned to `LegacyPrometheusMetricsCollector` rather than `MetricsCollectorFactory.create()`, so an upgrader who already has `WORKER_CANONICAL_METRICS=true` set in their environment is not silently flipped to the canonical metric surface just by upgrading the SDK. Switch to `MetricsCollectorFactory.create()` when you are ready to opt into env-var-driven selection. + +## Migration from Legacy to Canonical + +Switching to canonical metrics is an explicit metrics-surface cutover. Enable `WORKER_CANONICAL_METRICS=true` in a lower environment first, then update dashboards, recording rules, and alerts before enabling it in production. + +Important migration changes: + +- Legacy task labels use `type`; canonical task labels use `taskType`. +- Legacy success and failure timings are split across different meter names, such as `poll_success` and `poll_failure`; canonical timings use one metric name with `status=SUCCESS` or `status=FAILURE`. +- Legacy execution timings use `task_execution_completed` and `task_execution_failure`; canonical mode uses `task_execute_time_seconds` with `status`. +- Canonical error counters add an `exception` label that contains the exception type name. 
+- Canonical mode adds metrics that legacy mode never emitted, including task update latency, task result size, workflow input size, workflow start errors, active worker counts, and HTTP API client request latency. +- Canonical and legacy collectors are mutually exclusive. During a migration, compare scrape output by running separate worker instances or environments with and without `WORKER_CANONICAL_METRICS=true`. + +Common legacy-to-canonical replacements: + +| Legacy meter | Canonical replacement | +|---|---| +| `poll_started{type}` | `task_poll_total{taskType}` | +| `poll_success{type}` | `task_poll_time_seconds{taskType,status="SUCCESS"}` | +| `poll_failure{type}` | `task_poll_time_seconds{taskType,status="FAILURE"}` and `task_poll_error_total{taskType,exception}` | +| `task_execution_started{type}` | `task_execution_started_total{taskType}` | +| `task_execution_completed{type}` | `task_execute_time_seconds{taskType,status="SUCCESS"}` | +| `task_execution_failure{type}` | `task_execute_time_seconds{taskType,status="FAILURE"}` and `task_execute_error_total{taskType,exception}` | + +## Troubleshooting + +### Metrics Are Empty + +- Verify that the collector is wired into the client. The simplest check: was `withMetricsCollector` called on `ConductorClient.Builder`, or was `MetricsCollectorFactory.create()` called and registered manually? +- Verify workers have polled or executed tasks. Metrics are created lazily when the relevant event occurs. +- Confirm the scrape endpoint is reachable at the expected host and port. + +### Missing HTTP or Workflow Metrics + +- `http_api_client_request_seconds` requires the HTTP interceptor, which is installed automatically when `withMetricsCollector` is called on the builder. In canonical mode the interceptor records request latency; in legacy mode it is skipped because `getApiClientMetrics()` returns `ApiClientMetrics.NOOP`. 
+- `task_result_size_bytes` and `workflow_input_size_bytes` likewise require the HTTP interceptor — they are recorded at wire time from `RequestBody.contentLength()` for requests tagged with a `PayloadKind`. If the `ConductorClient` is built without `withMetricsCollector`, those histograms will be empty even when canonical mode is enabled. (`workflow_start_error_total` and workflow-side `external_payload_used_total` continue to flow through `workflowClient.registerListener(metricsCollector)`.) +- `task_ack_failed_total` and `task_ack_error_total` require `taskClient.registerTaskRunnerListener(metricsCollector)`. This is automatic when using `withMetricsCollector` on the builder. + +### High Cardinality + +- The `uri` label on `http_api_client_request_seconds` uses the path template, so it is bounded by the number of distinct API endpoints (not by request volume or unique IDs). The interceptor falls back to the resolved path for requests that are not tagged with a template, which may be unbounded. +- Prefer canonical mode for bounded `exception` labels. Legacy mode does not emit exception-labeled error counters. +- Avoid embedding user identifiers or unbounded values in task type, workflow type, or external payload labels. + +## Detailed Technical Notes — Unreleased + +Implementation details, internal design decisions, and migration notes for the +unreleased metrics harmonization work. For a summary, see the project +[CHANGELOG](../CHANGELOG.md). 
+ +### Added + +- **Metrics harmonization** - canonical metric surface aligned with the cross-SDK catalog, opt-in via `WORKER_CANONICAL_METRICS=true` + - New `CanonicalPrometheusMetricsCollector` emits the harmonized cross-SDK catalog: `task_poll_total`, `task_poll_time_seconds{status}`, `task_poll_error_total{exception}`, `task_execution_started_total`, `task_execute_time_seconds{status}`, `task_execute_error_total{exception}`, `task_update_time_seconds{status}`, `task_update_error_total{exception}`, `task_ack_failed_total`, `task_ack_error_total{exception}`, `task_execution_queue_full_total`, `task_paused_total`, `thread_uncaught_exceptions_total{exception}`, `external_payload_used_total{entityName,operation,payloadType}`, `task_result_size_bytes`, `workflow_input_size_bytes{workflowType,version}`, `workflow_start_error_total{workflowType,exception}`, `active_workers` (gauge), and `http_api_client_request_seconds{method,uri,status}`. Time histograms use buckets `0.001…10s`; size histograms use `100…10_000_000` bytes. + - `MetricsCollectorFactory.create()` selects between `LegacyPrometheusMetricsCollector` (default) and `CanonicalPrometheusMetricsCollector` based on `WORKER_CANONICAL_METRICS` (truthy values: `true`, `1`, `yes`, case-insensitive). `WORKER_LEGACY_METRICS` is reserved for a future default-flip phase and is not currently read. + - `MetricsBundle.create(port)` convenience that builds the factory-selected collector and starts the Prometheus scrape server in one call. + - `ApiClientMetrics` SPI with an internal OkHttp interceptor. `ConductorClient.Builder.withMetricsCollector(...)` installs the interceptor and enables automatic listener registration on downstream clients. `ConductorClient.Builder.withHttpMetrics(...)` installs only the interceptor, leaving listener registration to the caller. 
The interceptor records HTTP-client latency for every request and, for requests tagged with a `PayloadKind`, the body size from `RequestBody.contentLength()` — populating `task_result_size_bytes{taskType}` and `workflow_input_size_bytes{workflowType,version}` without an extra JSON serialization. The `uri` label uses the path template (e.g. `/workflow/{workflowId}`) rather than the resolved URL, so the label space is bounded by the number of distinct API endpoints regardless of how many unique workflow or task IDs are processed. + - New `PayloadKind` sealed interface (supported by Java 21 which the project targets) with `TaskResult` and `WorkflowInput` record implementations, used to tag outbound `ConductorClientRequest`s so the metrics interceptor can label payload-size histograms without a second serialization pass. + - `TaskClient` and `WorkflowClient` auto-register as event listeners when the underlying `ConductorClient` is built with a metrics collector **and** `MetricsCollector.isAutoWiringEnabled()` returns `true` (canonical defaults to `true`; legacy defaults to `false` for backward compatibility). Call `setAutoWiringEnabled(true)` on any collector to opt in. New event POJOs under `events/taskrunner/` and `events/listeners/` thread task-runner and workflow events into the metrics collector. + - `TaskRunnerConfigurer.Builder.withActiveWorkersTracking(boolean)` explicitly enables or disables `ActiveWorkersChanged` event publishing on the task-execution hot path. When not set, the default is derived from `MetricsCollector.isActiveWorkersTrackingEnabled()` (canonical: `true`, legacy: `false`). Legacy SDK upgraders see zero additional hot-path overhead by default. + - `TaskRunnerConfigurer.Builder.withDiagnosticEvents(boolean)` explicitly enables or disables per-poll-cycle diagnostic events (`TaskPaused`, `TaskExecutionQueueFull`). When not set, the default is derived from `MetricsCollector.isDiagnosticEventsEnabled()` (canonical: `true`, legacy: `false`). 
Legacy SDK upgraders see zero additional hot-path overhead by default. + - New `TaskClient.updateTask(TaskResult, String taskType)` and `TaskClient.updateTaskV2(TaskResult, String taskType)` overloads carry the task definition name through to the canonical size histogram (which `TaskResult` itself does not). The single-argument forms are retained and emit the histogram with `taskType=""`. + - Harness deployment manifest sets `WORKER_CANONICAL_METRICS=true` so certification runs exercise the canonical surface; `HarnessMain` logs which collector is active. + +### Changed + +- **Metrics harmonization** - defaults preserved; legacy metrics emit unchanged when `WORKER_CANONICAL_METRICS` is unset + - `conductor-client-metrics`: `micrometer-registry-prometheus` is now an `api` dependency so consumers see it transitively. `okhttp` is an `implementation` dependency (not leaked transitively). + - Default behavior is unchanged: with no env var set, `LegacyPrometheusMetricsCollector` emits the previously released six meters (`poll_started{type}`, `poll_success{type}`, `poll_failure{type}`, `task_execution_started{type}`, `task_execution_completed{type}`, `task_execution_failure{type}`) byte-for-byte identically. + - Rewrote `conductor-client-metrics/README.md` with full legacy and canonical catalogs, label conventions, a legacy → canonical migration table, and troubleshooting guidance. + - The Prometheus meter registry is now a `static final` singleton **per concrete subclass** (`LegacyPrometheusMetricsCollector` and `CanonicalPrometheusMetricsCollector` each own their own static registry). This preserves the 4.0.x safety property that multiple instances of the same collector type share a single registry (so accidentally creating two instances cannot cause silent metric loss), while keeping legacy and canonical meter names isolated from each other. 
The base class `AbstractPrometheusMetricsCollector` now receives the registry via its constructor rather than creating one per instance. + - `TaskRunnerConfigurer.shutdown()` now calls `taskClient.unregisterListeners()` to clean up auto-wired listener registrations from `ListenerRegister`'s static dedup map, preventing stale entries in long-lived JVMs that re-create configurer instances. + - Updated `README.md` "Monitoring Workers" and `INTERCEPTOR.md` to use `MetricsCollectorFactory.create()` and reference the env var. + - Removed `WorkflowInputPayloadSizeEvent` publish from `checkAndUploadToExternalStorage` and `TaskResultPayloadSizeEvent` publish from `handleExternalStorage`. These were dead code on `main`: the call site was gated behind `isEnforceThresholds()` (default `false`), the `PrometheusMetricsCollector.consume()` handlers were unimplemented `//TODO` stubs, and neither `workflowClient.registerListener()` nor `taskClient.registerListener()` was ever called in any wiring path — no deployment ever received these metrics. Payload-size observability is now provided unconditionally by the `PayloadKind` tag on outbound requests, recorded at wire time by the `ApiClientMetrics` OkHttp interceptor as `task_result_size_bytes{taskType}` and `workflow_input_size_bytes{workflowType,version}`, without double-serialization or manual listener registration. + - **Transitive dependency change**: `conductor-client-metrics` now exposes `micrometer-registry-prometheus` as an `api` (transitive) dependency instead of `implementation`. Consumers with their own Micrometer version on the classpath should verify compatibility with 1.15.1. + - **Graceful shutdown improvement**: `TaskRunner` now catches `RejectedExecutionException` when submitting a polled task to the executor during shutdown, logging a descriptive message and releasing the semaphore permit. 
Previously, the same exception was caught by a broad `catch (Throwable t)` that only logged the message string and did not release the permit. The task outcome is unchanged in both cases (polled but never executed; times out server-side). + - **OkHttp request tagging**: Outbound requests are tagged with `ConductorClient.PathTemplateTag` (not `String.class`) so the metrics interceptor can read the URI template without conflicting with user-installed interceptors. + - **ListenerRegister thread safety**: All `ListenerRegister.register()` and `unregister()` methods are now `synchronized`, closing a check-then-act race that could double-register listeners under concurrent initialization. + - **Diagnostic events gating**: Per-poll-cycle events (`TaskPaused`, `TaskExecutionQueueFull`) and ack diagnostic events (`TaskAckFailure`, `TaskAckError`) are now gated behind `MetricsCollector.isDiagnosticEventsEnabled()` (canonical: `true`, legacy: `false`). When disabled, no event objects are allocated on the hot path. + +### Deprecated + +- `TaskClient.ack(String taskId, String workerId)` is deprecated in favor of `ack(String taskType, String taskId, String workerId)`. The 2-arg form lacks the `taskType` parameter, so canonical ack metrics (`task_ack_failed_total{taskType}`, `task_ack_error_total{taskType}`) cannot be labeled. The deprecated overload delegates with `taskType=null`, preserving identical runtime behavior — no metrics are emitted and no try-catch wrapping takes effect. + +- `com.netflix.conductor.client.metrics.prometheus.PrometheusMetricsCollector` is deprecated and is now a thin alias for `LegacyPrometheusMetricsCollector`. Existing 4.0.x callers that wrote `new PrometheusMetricsCollector()` continue to compile and emit the same six legacy meter names (`poll_started{type}`, `poll_success{type}`, `poll_failure{type}`, `task_execution_started{type}`, `task_execution_completed{type}`, `task_execution_failure{type}`) byte-for-byte. 
The shim deliberately pins to the legacy collector rather than `MetricsCollectorFactory.create()`, so upgraders with `WORKER_CANONICAL_METRICS=true` already set in their environment are not silently flipped to the canonical surface. New code should use `MetricsCollectorFactory.create()` or `MetricsBundle.create()` to opt into env-var-driven selection. \ No newline at end of file diff --git a/conductor-client-metrics/build.gradle b/conductor-client-metrics/build.gradle index 3c87758f4..34ccbea4d 100644 --- a/conductor-client-metrics/build.gradle +++ b/conductor-client-metrics/build.gradle @@ -13,8 +13,9 @@ plugins { //apply plugin: 'publish-config' dependencies { - implementation 'io.micrometer:micrometer-registry-prometheus:1.15.1' + api 'io.micrometer:micrometer-registry-prometheus:1.15.1' implementation project(":conductor-client") + implementation "com.squareup.okhttp3:okhttp:${versions.okHttp}" testImplementation 'org.mockito:mockito-core:5.12.0' testImplementation 'org.junit.jupiter:junit-jupiter-api:5.10.3' diff --git a/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/AbstractPrometheusMetricsCollector.java b/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/AbstractPrometheusMetricsCollector.java new file mode 100644 index 000000000..1e8317027 --- /dev/null +++ b/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/AbstractPrometheusMetricsCollector.java @@ -0,0 +1,133 @@ +/* + * Copyright 2024 Conductor Authors. + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package com.netflix.conductor.client.metrics.prometheus; + +import java.io.IOException; +import java.net.InetSocketAddress; + +import com.netflix.conductor.client.metrics.MetricsCollector; + +import com.sun.net.httpserver.HttpServer; +import io.micrometer.prometheusmetrics.PrometheusMeterRegistry; + +/** + * Shared base for Prometheus-backed {@link MetricsCollector} implementations. + * Owns the reference to a {@link PrometheusMeterRegistry} and the built-in + * HTTP scrape server so both legacy and canonical implementations share the + * same plumbing. + * + *

Each concrete subclass holds its own {@code static final} registry so + * that multiple instances of the same subclass share a single registry (as + * the original 4.0.x {@code PrometheusMetricsCollector} did), while legacy + * and canonical registries remain isolated from each other. + */ +public abstract class AbstractPrometheusMetricsCollector implements MetricsCollector { + + protected final PrometheusMeterRegistry registry; + + private boolean autoWiringEnabled = false; + private boolean activeWorkersTrackingEnabled = false; + private boolean diagnosticEventsEnabled = false; + + protected AbstractPrometheusMetricsCollector(PrometheusMeterRegistry registry) { + this.registry = registry; + } + + @Override + public boolean isAutoWiringEnabled() { + return autoWiringEnabled; + } + + @Override + public void setAutoWiringEnabled(boolean enabled) { + this.autoWiringEnabled = enabled; + } + + @Override + public boolean isActiveWorkersTrackingEnabled() { + return activeWorkersTrackingEnabled; + } + + @Override + public void setActiveWorkersTrackingEnabled(boolean enabled) { + this.activeWorkersTrackingEnabled = enabled; + } + + @Override + public boolean isDiagnosticEventsEnabled() { + return diagnosticEventsEnabled; + } + + @Override + public void setDiagnosticEventsEnabled(boolean enabled) { + this.diagnosticEventsEnabled = enabled; + } + + private static final int DEFAULT_PORT = 9991; + private static final String DEFAULT_ENDPOINT = "/metrics"; + + public void startServer() throws IOException { + startServer(DEFAULT_PORT, DEFAULT_ENDPOINT); + } + + public void startServer(int port, String endpoint) throws IOException { + var server = HttpServer.create(new InetSocketAddress(port), 0); + server.createContext(endpoint, exchange -> { + var body = registry.scrape(); + exchange.getResponseHeaders().set("Content-Type", "text/plain"); + exchange.sendResponseHeaders(200, body.getBytes().length); + try (var os = exchange.getResponseBody()) { + os.write(body.getBytes()); + } + 
}); + server.start(); + } + + public abstract String collectorName(); + + public PrometheusMeterRegistry getRegistry() { + return registry; + } + + protected static String nullToEmpty(String s) { + return s == null ? "" : s; + } + + protected static String versionLabel(Integer v) { + return v == null ? "" : v.toString(); + } + + /** + * Produce a bounded-cardinality label value for an exception. Uses the + * simple class name so that the label space stays small. + */ + protected static String exceptionLabel(Throwable t) { + if (t == null) { + return ""; + } + Throwable cause = t; + if (cause.getCause() != null && ( + cause instanceof java.util.concurrent.ExecutionException + || cause instanceof java.util.concurrent.CompletionException + || cause instanceof java.lang.reflect.InvocationTargetException)) { + cause = cause.getCause(); + } + String simple = cause.getClass().getSimpleName(); + if (simple == null || simple.isEmpty()) { + String fqn = cause.getClass().getName(); + int dot = fqn.lastIndexOf('.'); + return dot < 0 ? fqn : fqn.substring(dot + 1); + } + return simple; + } +} diff --git a/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/ApiClientMetricsInterceptor.java b/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/ApiClientMetricsInterceptor.java new file mode 100644 index 000000000..ef0444307 --- /dev/null +++ b/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/ApiClientMetricsInterceptor.java @@ -0,0 +1,85 @@ +/* + * Copyright 2024 Conductor Authors. + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package com.netflix.conductor.client.metrics.prometheus; + +import java.io.IOException; +import java.time.Duration; + +import com.netflix.conductor.client.http.ConductorClient; +import com.netflix.conductor.client.metrics.ApiClientMetrics; +import com.netflix.conductor.client.metrics.PayloadKind; + +import okhttp3.Interceptor; +import okhttp3.Request; +import okhttp3.Response; + +/** + * OkHttp3 {@link Interceptor} that records every HTTP request made through + * the Conductor API client into a supplied {@link ApiClientMetrics}. + * + *

The interceptor never alters the request or response and never throws + * on metric-recording failures -- the underlying HTTP call always goes + * through unmodified. + */ +final class ApiClientMetricsInterceptor implements Interceptor { + + private final ApiClientMetrics metrics; + + ApiClientMetricsInterceptor(ApiClientMetrics metrics) { + this.metrics = metrics == null ? ApiClientMetrics.NOOP : metrics; + } + + @Override + public Response intercept(Chain chain) throws IOException { + Request request = chain.request(); + long startNanos = System.nanoTime(); + IOException ioError = null; + Response response = null; + try { + response = chain.proceed(request); + return response; + } catch (IOException e) { + ioError = e; + throw e; + } finally { + long elapsedNanos = System.nanoTime() - startNanos; + try { + String method = request.method(); + ConductorClient.PathTemplateTag tag = request.tag(ConductorClient.PathTemplateTag.class); + String uri = tag != null ? tag.path() : request.url().encodedPath(); + int status = response != null ? response.code() + : (ioError != null ? 
-1 : 0); + metrics.recordRequest(method, uri, status, Duration.ofNanos(elapsedNanos)); + recordPayloadSizeIfTagged(request); + } catch (Throwable ignored) { + } + } + } + + private void recordPayloadSizeIfTagged(Request request) { + PayloadKind kind = request.tag(PayloadKind.class); + if (kind == null || request.body() == null) { + return; + } + long len; + try { + len = request.body().contentLength(); + } catch (IOException e) { + return; + } + if (len < 0) { + return; + } + kind.recordSize(metrics, len); + } +} diff --git a/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/CanonicalPrometheusMetricsCollector.java b/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/CanonicalPrometheusMetricsCollector.java new file mode 100644 index 000000000..fdbb93a64 --- /dev/null +++ b/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/CanonicalPrometheusMetricsCollector.java @@ -0,0 +1,286 @@ +/* + * Copyright 2024 Conductor Authors. + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package com.netflix.conductor.client.metrics.prometheus; + +import java.time.Duration; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicInteger; + +import com.netflix.conductor.client.events.task.TaskPayloadUsedEvent; +import com.netflix.conductor.client.events.task.TaskResultPayloadSizeEvent; +import com.netflix.conductor.client.events.taskrunner.ActiveWorkersChanged; +import com.netflix.conductor.client.events.taskrunner.PollCompleted; +import com.netflix.conductor.client.events.taskrunner.PollFailure; +import com.netflix.conductor.client.events.taskrunner.PollStarted; +import com.netflix.conductor.client.events.taskrunner.TaskAckError; +import com.netflix.conductor.client.events.taskrunner.TaskAckFailure; +import com.netflix.conductor.client.events.taskrunner.TaskExecutionCompleted; +import com.netflix.conductor.client.events.taskrunner.TaskExecutionFailure; +import com.netflix.conductor.client.events.taskrunner.TaskExecutionQueueFull; +import com.netflix.conductor.client.events.taskrunner.TaskExecutionStarted; +import com.netflix.conductor.client.events.taskrunner.TaskPaused; +import com.netflix.conductor.client.events.taskrunner.TaskUpdateCompleted; +import com.netflix.conductor.client.events.taskrunner.TaskUpdateFailure; +import com.netflix.conductor.client.events.taskrunner.ThreadUncaughtException; +import com.netflix.conductor.client.events.workflow.WorkflowInputPayloadSizeEvent; +import com.netflix.conductor.client.events.workflow.WorkflowPayloadUsedEvent; +import com.netflix.conductor.client.events.workflow.WorkflowStartedEvent; +import com.netflix.conductor.client.metrics.ApiClientMetrics; + 
+import io.micrometer.core.instrument.Counter; +import io.micrometer.core.instrument.Gauge; +import io.micrometer.core.instrument.Timer; +import io.micrometer.prometheusmetrics.PrometheusConfig; +import io.micrometer.prometheusmetrics.PrometheusMeterRegistry; + +/** + * Canonical Prometheus metrics implementation emitting the harmonized metric + * names defined in the cross-SDK metrics catalog (sdk-metrics-harmonization.md). + * + *

This class is selected at runtime when {@code WORKER_CANONICAL_METRICS=true}. + * No legacy metric names are emitted. + */ +public class CanonicalPrometheusMetricsCollector extends AbstractPrometheusMetricsCollector { + + private static final PrometheusMeterRegistry SHARED_REGISTRY = + new PrometheusMeterRegistry(PrometheusConfig.DEFAULT); + + @Override + public String collectorName() { + return "canonical"; + } + + private static final Duration[] CANONICAL_TIME_BUCKETS = { + Duration.ofMillis(1), + Duration.ofMillis(5), + Duration.ofMillis(10), + Duration.ofMillis(25), + Duration.ofMillis(50), + Duration.ofMillis(100), + Duration.ofMillis(250), + Duration.ofMillis(500), + Duration.ofSeconds(1), + Duration.ofMillis(2500), + Duration.ofSeconds(5), + Duration.ofSeconds(10), + }; + + private static final String STATUS_SUCCESS = "SUCCESS"; + private static final String STATUS_FAILURE = "FAILURE"; + + private final PrometheusApiClientMetrics apiClientMetrics; + private final ConcurrentHashMap activeWorkerGauges = new ConcurrentHashMap<>(); + + public CanonicalPrometheusMetricsCollector() { + super(SHARED_REGISTRY); + this.apiClientMetrics = new PrometheusApiClientMetrics(registry); + setAutoWiringEnabled(true); + setActiveWorkersTrackingEnabled(true); + setDiagnosticEventsEnabled(true); + } + + /** Package-private constructor for test isolation. 
*/ + CanonicalPrometheusMetricsCollector(PrometheusMeterRegistry registry) { + super(registry); + this.apiClientMetrics = new PrometheusApiClientMetrics(registry); + setAutoWiringEnabled(true); + setActiveWorkersTrackingEnabled(true); + setDiagnosticEventsEnabled(true); + } + + @Override + public ApiClientMetrics getApiClientMetrics() { + return apiClientMetrics; + } + + // ----- Poll lifecycle ----- + + @Override + public void consume(PollStarted e) { + counter("task_poll_total", "Incremented each time polling is done", + "taskType", e.getTaskType()).increment(); + } + + @Override + public void consume(PollCompleted e) { + counter("task_poll_total", "Incremented each time polling is done", + "taskType", e.getTaskType()); // ensure counter exists even for success path + canonicalTaskTimer("task_poll_time_seconds", "Task poll latency in seconds", + e.getTaskType(), STATUS_SUCCESS).record(e.getDuration()); + } + + @Override + public void consume(PollFailure e) { + canonicalTaskTimer("task_poll_time_seconds", "Task poll latency in seconds", + e.getTaskType(), STATUS_FAILURE).record(e.getDuration()); + counter("task_poll_error_total", "Client error when polling for a task queue", + "taskType", e.getTaskType(), + "exception", exceptionLabel(e.getCause())).increment(); + } + + // ----- Task execution ----- + + @Override + public void consume(TaskExecutionStarted e) { + counter("task_execution_started_total", + "Incremented each time a polled task is dispatched to the worker function", + "taskType", e.getTaskType()).increment(); + } + + @Override + public void consume(TaskExecutionCompleted e) { + canonicalTaskTimer("task_execute_time_seconds", "Task execution latency in seconds", + e.getTaskType(), STATUS_SUCCESS).record(e.getDuration()); + } + + @Override + public void consume(TaskExecutionFailure e) { + canonicalTaskTimer("task_execute_time_seconds", "Task execution latency in seconds", + e.getTaskType(), STATUS_FAILURE).record(e.getDuration()); + 
counter("task_execute_error_total", "Execution error", + "taskType", e.getTaskType(), + "exception", exceptionLabel(e.getCause())).increment(); + } + + // ----- Active workers gauge ----- + + @Override + public void consume(ActiveWorkersChanged e) { + activeWorkerGauges + .computeIfAbsent(e.getTaskType(), t -> { + AtomicInteger val = new AtomicInteger(0); + Gauge.builder("active_workers", val, AtomicInteger::doubleValue) + .description("Current number of worker threads actively executing a task") + .tag("taskType", t) + .register(registry); + return val; + }) + .set(Math.max(0, e.getCount())); + } + + // ----- Task update ----- + + @Override + public void consume(TaskUpdateCompleted e) { + canonicalTaskTimer("task_update_time_seconds", "Task update (result-report) latency in seconds", + e.getTaskType(), STATUS_SUCCESS).record(e.getDuration()); + } + + @Override + public void consume(TaskUpdateFailure e) { + canonicalTaskTimer("task_update_time_seconds", "Task update (result-report) latency in seconds", + e.getTaskType(), STATUS_FAILURE).record(e.getDuration()); + counter("task_update_error_total", "Task status cannot be updated back to server", + "taskType", e.getTaskType(), + "exception", exceptionLabel(e.getCause())).increment(); + } + + // ----- Task ack / queueing / lifecycle ----- + + @Override + public void consume(TaskAckFailure e) { + counter("task_ack_failed_total", "Task ack failed", + "taskType", e.getTaskType()).increment(); + } + + @Override + public void consume(TaskAckError e) { + counter("task_ack_error_total", "Task ack has encountered an exception", + "taskType", e.getTaskType(), + "exception", exceptionLabel(e.getCause())).increment(); + } + + @Override + public void consume(TaskExecutionQueueFull e) { + counter("task_execution_queue_full_total", + "Incremented when a poll cycle is skipped because all worker threads are busy", + "taskType", e.getTaskType()).increment(); + } + + @Override + public void consume(TaskPaused e) { + 
counter("task_paused_total", + "Counter for number of times the task has been polled, when the worker has been paused", + "taskType", e.getTaskType()).increment(); + } + + @Override + public void consume(ThreadUncaughtException e) { + counter("thread_uncaught_exceptions_total", + "Uncaught exceptions raised inside worker threads", + "exception", exceptionLabel(e.getCause())).increment(); + } + + // ----- Payload / workflow events ----- + + @Override + public void consume(TaskPayloadUsedEvent e) { + counter("external_payload_used_total", + "Incremented each time external payload storage is used", + "entityName", nullToEmpty(e.getTaskType()), + "operation", nullToEmpty(e.getOperation()), + "payloadType", nullToEmpty(e.getPayloadType())).increment(); + } + + @Override + public void consume(TaskResultPayloadSizeEvent e) { + // Delegate to the same SPI used by the wire-time interceptor so the + // event publisher and the interceptor converge on the same series. + apiClientMetrics.recordTaskResultSize(e.getTaskType(), e.getSize()); + } + + @Override + public void consume(WorkflowPayloadUsedEvent event) { + counter("external_payload_used_total", + "Incremented each time external payload storage is used", + "entityName", nullToEmpty(event.getName()), + "operation", nullToEmpty(event.getOperation()), + "payloadType", nullToEmpty(event.getPayloadType())).increment(); + } + + @Override + public void consume(WorkflowInputPayloadSizeEvent event) { + // Delegate to the same SPI used by the wire-time interceptor so the + // event publisher and the interceptor converge on the same series. 
+ apiClientMetrics.recordWorkflowInputSize(event.getName(), event.getVersion(), event.getSize()); + } + + @Override + public void consume(WorkflowStartedEvent event) { + if (event.isSuccess()) { + return; + } + counter("workflow_start_error_total", "Counter for workflow start errors", + "workflowType", nullToEmpty(event.getName()), + "exception", exceptionLabel(event.getThrowable())).increment(); + } + + // ----- Helpers ----- + + private Counter counter(String name, String description, String... tagKv) { + return Counter.builder(name) + .description(description) + .tags(tagKv) + .register(registry); + } + + private Timer canonicalTaskTimer(String name, String description, String taskType, String status) { + return Timer.builder(name) + .description(description) + .tag("taskType", nullToEmpty(taskType)) + .tag("status", status) + .publishPercentileHistogram(false) + .serviceLevelObjectives(CANONICAL_TIME_BUCKETS) + .register(registry); + } +} diff --git a/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/LegacyPrometheusMetricsCollector.java b/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/LegacyPrometheusMetricsCollector.java new file mode 100644 index 000000000..72779d223 --- /dev/null +++ b/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/LegacyPrometheusMetricsCollector.java @@ -0,0 +1,109 @@ +/* + * Copyright 2024 Conductor Authors. + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+package com.netflix.conductor.client.metrics.prometheus;
+
+import com.netflix.conductor.client.events.task.TaskPayloadUsedEvent;
+import com.netflix.conductor.client.events.task.TaskResultPayloadSizeEvent;
+import com.netflix.conductor.client.events.taskrunner.PollCompleted;
+import com.netflix.conductor.client.events.taskrunner.PollFailure;
+import com.netflix.conductor.client.events.taskrunner.PollStarted;
+import com.netflix.conductor.client.events.taskrunner.TaskAckError;
+import com.netflix.conductor.client.events.taskrunner.TaskAckFailure;
+import com.netflix.conductor.client.events.taskrunner.TaskExecutionCompleted;
+import com.netflix.conductor.client.events.taskrunner.TaskExecutionFailure;
+import com.netflix.conductor.client.events.taskrunner.TaskExecutionQueueFull;
+import com.netflix.conductor.client.events.taskrunner.TaskExecutionStarted;
+import com.netflix.conductor.client.events.taskrunner.TaskPaused;
+import com.netflix.conductor.client.events.taskrunner.TaskUpdateCompleted;
+import com.netflix.conductor.client.events.taskrunner.TaskUpdateFailure;
+import com.netflix.conductor.client.events.taskrunner.ThreadUncaughtException;
+import com.netflix.conductor.client.events.workflow.WorkflowInputPayloadSizeEvent;
+import com.netflix.conductor.client.events.workflow.WorkflowPayloadUsedEvent;
+import com.netflix.conductor.client.events.workflow.WorkflowStartedEvent;
+
+import io.micrometer.prometheusmetrics.PrometheusConfig;
+import io.micrometer.prometheusmetrics.PrometheusMeterRegistry;
+
+/**
+ * Prometheus collector that keeps the metric surface of the original java-sdk:
+ * the six meters {@code poll_started}, {@code poll_success}, {@code poll_failure},
+ * {@code task_execution_started}, {@code task_execution_completed} and
+ * {@code task_execution_failure}, each carrying a single {@code type} tag whose
+ * value is the task type.
+ *
+ * <p>Every other event is accepted and ignored because it has no legacy metric.
+ * {@link MetricsCollectorFactory} returns this collector unless
+ * {@code WORKER_CANONICAL_METRICS} is set to a truthy value.
+ */
+public class LegacyPrometheusMetricsCollector extends AbstractPrometheusMetricsCollector {
+
+    /** Registry used by every collector built through the public no-arg constructor. */
+    private static final PrometheusMeterRegistry SHARED_REGISTRY =
+            new PrometheusMeterRegistry(PrometheusConfig.DEFAULT);
+
+    /** Key of the only tag attached to legacy meters. */
+    private static final String TYPE_TAG = "type";
+
+    /** Creates a collector that reports to the shared static registry. */
+    public LegacyPrometheusMetricsCollector() {
+        this(SHARED_REGISTRY);
+    }
+
+    /** Package-private constructor that lets tests supply an isolated registry. */
+    LegacyPrometheusMetricsCollector(PrometheusMeterRegistry registry) {
+        super(registry);
+    }
+
+    @Override
+    public String collectorName() {
+        return "legacy";
+    }
+
+    // --- Events that map onto one of the six legacy meters ---
+
+    @Override
+    public void consume(PollStarted e) {
+        var pollsStarted = registry.counter("poll_started", TYPE_TAG, e.getTaskType());
+        pollsStarted.increment();
+    }
+
+    @Override
+    public void consume(PollCompleted e) {
+        var pollSuccess = registry.timer("poll_success", TYPE_TAG, e.getTaskType());
+        pollSuccess.record(e.getDuration());
+    }
+
+    @Override
+    public void consume(PollFailure e) {
+        var pollFailure = registry.timer("poll_failure", TYPE_TAG, e.getTaskType());
+        pollFailure.record(e.getDuration());
+    }
+
+    @Override
+    public void consume(TaskExecutionStarted e) {
+        var executionsStarted = registry.counter("task_execution_started", TYPE_TAG, e.getTaskType());
+        executionsStarted.increment();
+    }
+
+    @Override
+    public void consume(TaskExecutionCompleted e) {
+        var executionCompleted = registry.timer("task_execution_completed", TYPE_TAG, e.getTaskType());
+        executionCompleted.record(e.getDuration());
+    }
+
+    @Override
+    public void consume(TaskExecutionFailure e) {
+        var executionFailure = registry.timer("task_execution_failure", TYPE_TAG, e.getTaskType());
+        executionFailure.record(e.getDuration());
+    }
+
+    // --- Events without a legacy meter: intentionally ignored ---
+
+    @Override
+    public void consume(TaskUpdateCompleted e) {
+        // no legacy metric
+    }
+
+    @Override
+    public void consume(TaskUpdateFailure e) {
+        // no legacy metric
+    }
+
+    @Override
+    public void consume(TaskAckFailure e) {
+        // no legacy metric
+    }
+
+    @Override
+    public void consume(TaskAckError e) {
+        // no legacy metric
+    }
+
+    @Override
+    public void consume(TaskExecutionQueueFull e) {
+        // no legacy metric
+    }
+
+    @Override
+    public void consume(TaskPaused e) {
+        // no legacy metric
+    }
+
+    @Override
+    public void consume(ThreadUncaughtException e) {
+        // no legacy metric
+    }
+
+    @Override
+    public void consume(TaskPayloadUsedEvent e) {
+        // no legacy metric
+    }
+
+    @Override
+    public void consume(TaskResultPayloadSizeEvent e) {
+        // no legacy metric
+    }
+
+    @Override
+    public void consume(WorkflowPayloadUsedEvent event) {
+        // no legacy metric
+    }
+
+    @Override
+    public void consume(WorkflowInputPayloadSizeEvent event) {
+        // no legacy metric
+    }
+
+    @Override
+    public void consume(WorkflowStartedEvent event) {
+        // no legacy metric
+    }
+}
diff --git a/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/MetricsBundle.java b/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/MetricsBundle.java
new file mode 100644
index 000000000..cc3f8072a
--- /dev/null
+++ b/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/MetricsBundle.java
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2024 Conductor Authors.
+ *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+package com.netflix.conductor.client.metrics.prometheus;
+
+import java.io.IOException;
+
+/**
+ * Convenience wrapper that obtains a Prometheus collector from
+ * {@link MetricsCollectorFactory#create()} (legacy or canonical, depending on
+ * {@code WORKER_CANONICAL_METRICS}) and immediately starts its scrape HTTP
+ * endpoint. The resulting collector is ready to be passed to
+ * {@link com.netflix.conductor.client.http.ConductorClient.Builder#withMetricsCollector}
+ * so downstream clients auto-register themselves.
+ *
+ * <p>Typical usage:
+ * <pre>{@code
+ * MetricsBundle bundle = MetricsBundle.create();
+ * ConductorClient client = ConductorClient.builder()
+ *         .basePath("...")
+ *         .withMetricsCollector(bundle.getCollector())
+ *         .build();
+ * }</pre>
+ */
+public final class MetricsBundle {
+
+    /** Port used by {@link #create()}. */
+    private static final int DEFAULT_PORT = 9991;
+
+    /** Endpoint path used by {@link #create()} and {@link #create(int)}. */
+    private static final String DEFAULT_ENDPOINT = "/metrics";
+
+    private final AbstractPrometheusMetricsCollector collector;
+    private final int port;
+
+    private MetricsBundle(AbstractPrometheusMetricsCollector collector, int port) {
+        this.collector = collector;
+        this.port = port;
+    }
+
+    /**
+     * Creates the bundle with the defaults: factory-selected collector, port
+     * 9991, endpoint {@code /metrics}.
+     *
+     * @throws IOException propagated from starting the scrape endpoint
+     */
+    public static MetricsBundle create() throws IOException {
+        return create(DEFAULT_PORT, DEFAULT_ENDPOINT);
+    }
+
+    /**
+     * Creates the bundle on the given port; the scrape endpoint is started
+     * immediately at {@code /metrics}.
+     *
+     * @param port port the scrape endpoint is started on
+     * @throws IOException propagated from starting the scrape endpoint
+     */
+    public static MetricsBundle create(int port) throws IOException {
+        return create(port, DEFAULT_ENDPOINT);
+    }
+
+    /**
+     * Creates the bundle on the given port and endpoint path; the scrape
+     * endpoint is started before this method returns.
+     *
+     * @param port     port the scrape endpoint is started on
+     * @param endpoint path the scrape endpoint is served at
+     * @throws IOException propagated from starting the scrape endpoint
+     */
+    public static MetricsBundle create(int port, String endpoint) throws IOException {
+        AbstractPrometheusMetricsCollector selected = MetricsCollectorFactory.create();
+        selected.startServer(port, endpoint);
+        return new MetricsBundle(selected, port);
+    }
+
+    /** Returns the collector chosen by {@link MetricsCollectorFactory} when the bundle was created. */
+    public AbstractPrometheusMetricsCollector getCollector() {
+        return collector;
+    }
+
+    /** Returns the port value that was passed to {@code create}. */
+    public int getPort() {
+        return port;
+    }
+}
diff --git a/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/MetricsCollectorFactory.java b/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/MetricsCollectorFactory.java
new file mode 100644
index 000000000..0eb4525e1
--- /dev/null
+++ b/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/MetricsCollectorFactory.java
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2024 Conductor Authors.
+ *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+package com.netflix.conductor.client.metrics.prometheus;
+
+import java.util.Locale;
+import java.util.Set;
+import java.util.function.Function;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Factory that selects the Prometheus {@link AbstractPrometheusMetricsCollector}
+ * implementation from the environment.
+ *
+ * <ul>
+ *   <li>{@code WORKER_CANONICAL_METRICS} set to {@code true}, {@code 1} or
+ *       {@code yes} (case-insensitive, surrounding whitespace ignored) selects
+ *       {@link CanonicalPrometheusMetricsCollector}.</li>
+ *   <li>Any other value, or an unset or blank variable, selects
+ *       {@link LegacyPrometheusMetricsCollector}.</li>
+ * </ul>
+ *
+ * <p>{@code WORKER_CANONICAL_METRICS} is the only variable consulted here; no
+ * other variable (such as {@code WORKER_LEGACY_METRICS}) is read, so it cannot
+ * influence the selection.
+ */
+public final class MetricsCollectorFactory {
+
+    private static final Logger log = LoggerFactory.getLogger(MetricsCollectorFactory.class);
+
+    /** Lower-case spellings that {@link #envBool} interprets as {@code true}. */
+    private static final Set<String> TRUTHY_VALUES = Set.of("true", "1", "yes");
+
+    private MetricsCollectorFactory() { }
+
+    /**
+     * Creates the metrics collector selected by the process environment.
+     *
+     * @return a new canonical collector when {@code WORKER_CANONICAL_METRICS}
+     *         is truthy, otherwise a new legacy collector
+     */
+    public static AbstractPrometheusMetricsCollector create() {
+        return create(System::getenv);
+    }
+
+    /**
+     * Same as {@link #create()}, but reads variables through {@code envReader}
+     * so tests can supply values without touching the real environment.
+     *
+     * @param envReader maps a variable name to its value, or {@code null} when unset
+     */
+    static AbstractPrometheusMetricsCollector create(Function<String, String> envReader) {
+        if (envBool("WORKER_CANONICAL_METRICS", false, envReader)) {
+            log.info("WORKER_CANONICAL_METRICS is true — using CanonicalPrometheusMetricsCollector");
+            return new CanonicalPrometheusMetricsCollector();
+        }
+        log.info("Using LegacyPrometheusMetricsCollector (set WORKER_CANONICAL_METRICS=true for canonical metrics)");
+        return new LegacyPrometheusMetricsCollector();
+    }
+
+    /**
+     * Reads a boolean flag from the environment.
+     *
+     * @param name         variable name
+     * @param defaultValue result when the variable is unset or blank
+     * @param envReader    maps a variable name to its value, or {@code null} when unset
+     * @return {@code defaultValue} for an unset or blank variable; otherwise
+     *         {@code true} only if the stripped, lower-cased value is one of
+     *         {@code true}, {@code 1}, {@code yes}
+     */
+    static boolean envBool(String name, boolean defaultValue, Function<String, String> envReader) {
+        String value = envReader.apply(name);
+        if (value == null || value.isBlank()) {
+            return defaultValue;
+        }
+        // Locale.ROOT keeps the comparison independent of the JVM's default locale;
+        // strip() removes the same whitespace that isBlank() recognises.
+        return TRUTHY_VALUES.contains(value.strip().toLowerCase(Locale.ROOT));
+    }
+}
diff --git a/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/PrometheusApiClientMetrics.java b/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/PrometheusApiClientMetrics.java
new file mode 100644
index 000000000..e4ee4216b
--- /dev/null
+++ b/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/PrometheusApiClientMetrics.java
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2024 Conductor Authors.
+ *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+package com.netflix.conductor.client.metrics.prometheus;
+
+import java.time.Duration;
+
+import com.netflix.conductor.client.metrics.ApiClientMetrics;
+
+import io.micrometer.core.instrument.DistributionSummary;
+import io.micrometer.core.instrument.Timer;
+import io.micrometer.prometheusmetrics.PrometheusMeterRegistry;
+
+/**
+ * {@link ApiClientMetrics} implementation backed by a Prometheus registry. It
+ * records the canonical {@code http_api_client_request_seconds} timer and the
+ * {@code task_result_size_bytes} and {@code workflow_input_size_bytes}
+ * distribution summaries, each with a fixed set of bucket boundaries.
+ */
+public final class PrometheusApiClientMetrics implements ApiClientMetrics {
+
+    /** Bucket boundaries for request latency, from 1 ms up to 10 s. */
+    private static final Duration[] CANONICAL_TIME_BUCKETS = {
+        Duration.ofMillis(1),
+        Duration.ofMillis(5),
+        Duration.ofMillis(10),
+        Duration.ofMillis(25),
+        Duration.ofMillis(50),
+        Duration.ofMillis(100),
+        Duration.ofMillis(250),
+        Duration.ofMillis(500),
+        Duration.ofSeconds(1),
+        Duration.ofMillis(2500),
+        Duration.ofSeconds(5),
+        Duration.ofSeconds(10),
+    };
+
+    /** Bucket boundaries for payload sizes, in bytes. */
+    private static final double[] CANONICAL_SIZE_BUCKETS = {
+        100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000
+    };
+
+    private final PrometheusMeterRegistry registry;
+
+    public PrometheusApiClientMetrics(PrometheusMeterRegistry registry) {
+        this.registry = registry;
+    }
+
+    /**
+     * Records one HTTP request in {@code http_api_client_request_seconds},
+     * tagged with {@code method}, {@code uri} and {@code status}. Null method
+     * or uri become empty tag values; any status code that is zero or negative
+     * is reported as {@code "0"}.
+     */
+    @Override
+    public void recordRequest(String method, String uri, int statusCode, Duration duration) {
+        String status = Integer.toString(Math.max(statusCode, 0));
+        Timer requestTimer = Timer.builder("http_api_client_request_seconds")
+                .tag("method", nullToEmpty(method))
+                .tag("uri", nullToEmpty(uri))
+                .tag("status", status)
+                .publishPercentileHistogram(false)
+                .serviceLevelObjectives(CANONICAL_TIME_BUCKETS)
+                .register(registry);
+        requestTimer.record(duration);
+    }
+
+    /**
+     * Records a task result payload size in {@code task_result_size_bytes},
+     * tagged with {@code taskType}. Negative sizes are ignored.
+     */
+    @Override
+    public void recordTaskResultSize(String taskType, long sizeBytes) {
+        if (sizeBytes >= 0) {
+            DistributionSummary resultSize = DistributionSummary.builder("task_result_size_bytes")
+                    .description("Records output payload size of a task in bytes")
+                    .tag("taskType", nullToEmpty(taskType))
+                    .serviceLevelObjectives(CANONICAL_SIZE_BUCKETS)
+                    .register(registry);
+            resultSize.record(sizeBytes);
+        }
+    }
+
+    /**
+     * Records a workflow input payload size in {@code workflow_input_size_bytes},
+     * tagged with {@code workflowType} and {@code version} (empty when the
+     * version is null). Negative sizes are ignored.
+     */
+    @Override
+    public void recordWorkflowInputSize(String workflowType, Integer version, long sizeBytes) {
+        if (sizeBytes >= 0) {
+            String versionTag = (version != null) ? version.toString() : "";
+            DistributionSummary inputSize = DistributionSummary.builder("workflow_input_size_bytes")
+                    .description("Records input payload size of a workflow in bytes")
+                    .tag("workflowType", nullToEmpty(workflowType))
+                    .tag("version", versionTag)
+                    .serviceLevelObjectives(CANONICAL_SIZE_BUCKETS)
+                    .register(registry);
+            inputSize.record(sizeBytes);
+        }
+    }
+
+    /** Returns {@code s}, or the empty string when {@code s} is null. */
+    private static String nullToEmpty(String s) {
+        if (s == null) {
+            return "";
+        }
+        return s;
+    }
+}
diff --git a/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/PrometheusMetricsCollector.java b/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/PrometheusMetricsCollector.java
index 97e7d644f..f4a4a0f2a 100644
--- a/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/PrometheusMetricsCollector.java
+++ b/conductor-client-metrics/src/main/java/com/netflix/conductor/client/metrics/prometheus/PrometheusMetricsCollector.java
@@ -12,109 +12,34 @@
  */
 package com.netflix.conductor.client.metrics.prometheus;
 
-import java.io.IOException;
-import java.net.InetSocketAddress;
-
-import com.netflix.conductor.client.events.task.TaskPayloadUsedEvent;
-import com.netflix.conductor.client.events.task.TaskResultPayloadSizeEvent;
-import com.netflix.conductor.client.events.taskrunner.PollCompleted;
-import com.netflix.conductor.client.events.taskrunner.PollFailure;
-import com.netflix.conductor.client.events.taskrunner.PollStarted;
-import com.netflix.conductor.client.events.taskrunner.TaskExecutionCompleted;
-import com.netflix.conductor.client.events.taskrunner.TaskExecutionFailure;
-import com.netflix.conductor.client.events.taskrunner.TaskExecutionStarted;
-import com.netflix.conductor.client.events.workflow.WorkflowInputPayloadSizeEvent;
-import com.netflix.conductor.client.events.workflow.WorkflowPayloadUsedEvent;
-import com.netflix.conductor.client.events.workflow.WorkflowStartedEvent;
-import com.netflix.conductor.client.metrics.MetricsCollector;
-
-import com.sun.net.httpserver.HttpServer;
-import io.micrometer.prometheusmetrics.PrometheusConfig;
-import io.micrometer.prometheusmetrics.PrometheusMeterRegistry;
-
-public class PrometheusMetricsCollector implements MetricsCollector {
-
-    private static final PrometheusMeterRegistry prometheusRegistry = new PrometheusMeterRegistry(PrometheusConfig.DEFAULT);
-
-    private static final int DEFAULT_PORT =
9991; - - private static final String DEFAULT_ENDPOINT = "/metrics"; - - public void startServer() throws IOException { - startServer(DEFAULT_PORT, DEFAULT_ENDPOINT); - } - - public void startServer(int port, String endpoint) throws IOException { - var server = HttpServer.create(new InetSocketAddress(port), 0); - server.createContext(endpoint, (exchange -> { - var body = prometheusRegistry.scrape(); - exchange.getResponseHeaders().set("Content-Type", "text/plain"); - exchange.sendResponseHeaders(200, body.getBytes().length); - try (var os = exchange.getResponseBody()) { - os.write(body.getBytes()); - } - })); - server.start(); - } - - @Override - public void consume(PollFailure e) { - var timer = prometheusRegistry.timer("poll_failure", "type", e.getTaskType()); - timer.record(e.getDuration()); - } - - @Override - public void consume(PollCompleted e) { - var timer = prometheusRegistry.timer("poll_success", "type", e.getTaskType()); - timer.record(e.getDuration()); - } - - @Override - public void consume(PollStarted e) { - var counter = prometheusRegistry.counter("poll_started", "type", e.getTaskType()); - counter.increment(); - } - - @Override - public void consume(TaskExecutionStarted e) { - var counter = prometheusRegistry.counter("task_execution_started", "type", e.getTaskType()); - counter.increment(); - } - - @Override - public void consume(TaskExecutionCompleted e) { - var timer = prometheusRegistry.timer("task_execution_completed", "type", e.getTaskType()); - timer.record(e.getDuration()); - } - - @Override - public void consume(TaskExecutionFailure e) { - var timer = prometheusRegistry.timer("task_execution_failure", "type", e.getTaskType()); - timer.record(e.getDuration()); - } - - @Override - public void consume(TaskPayloadUsedEvent e) { - //TODO implement - } - - @Override - public void consume(TaskResultPayloadSizeEvent e) { - //TODO implement - } - - @Override - public void consume(WorkflowPayloadUsedEvent event) { - //TODO implement - } - - @Override 
-    public void consume(WorkflowInputPayloadSizeEvent event) {
-        //TODO implement
-    }
-
-    @Override
-    public void consume(WorkflowStartedEvent event) {
-        //TODO implement
-    }
+/**
+ * Source-compatibility alias for {@link LegacyPrometheusMetricsCollector}, kept
+ * so callers that wrote {@code new PrometheusMetricsCollector()} against
+ * earlier 4.0.x releases continue to compile and emit the same legacy meter
+ * names byte-for-byte:
+ *
+ * <ul>
+ *   <li>{@code poll_started{type}}</li>
+ *   <li>{@code poll_success{type}}</li>
+ *   <li>{@code poll_failure{type}}</li>
+ *   <li>{@code task_execution_started{type}}</li>
+ *   <li>{@code task_execution_completed{type}}</li>
+ *   <li>{@code task_execution_failure{type}}</li>
+ * </ul>
+ *
+ * <p>This class deliberately extends {@link LegacyPrometheusMetricsCollector}
+ * (rather than going through {@link MetricsCollectorFactory#create()}) so that
+ * an upgrader who already has {@code WORKER_CANONICAL_METRICS=true} set in
+ * their environment does not silently flip to the canonical metric surface
+ * just by upgrading the SDK. Callers that want the env-var-driven selection
+ * should switch to {@link MetricsCollectorFactory#create()} or
+ * {@link MetricsBundle#create(int)}.
+ *
+ * @deprecated Use {@link MetricsCollectorFactory#create()} (or
+ *             {@link MetricsBundle#create(int)}) for new code, which selects
+ *             between {@link LegacyPrometheusMetricsCollector} and
+ *             {@link CanonicalPrometheusMetricsCollector} based on
+ *             {@code WORKER_CANONICAL_METRICS}.
+ */
+@Deprecated
+public class PrometheusMetricsCollector extends LegacyPrometheusMetricsCollector {
 }
diff --git a/conductor-client-metrics/src/test/java/com/netflix/conductor/client/metrics/prometheus/AbstractPrometheusMetricsCollectorTest.java b/conductor-client-metrics/src/test/java/com/netflix/conductor/client/metrics/prometheus/AbstractPrometheusMetricsCollectorTest.java
new file mode 100644
index 000000000..384fb737f
--- /dev/null
+++ b/conductor-client-metrics/src/test/java/com/netflix/conductor/client/metrics/prometheus/AbstractPrometheusMetricsCollectorTest.java
@@ -0,0 +1,146 @@
+/*
+ * Copyright 2025 Conductor Authors.
+ *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+package com.netflix.conductor.client.metrics.prometheus;
+
+import java.lang.reflect.InvocationTargetException;
+import java.util.concurrent.CompletionException;
+import java.util.concurrent.ExecutionException;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/**
+ * Tests for the static label helpers ({@code nullToEmpty}, {@code versionLabel},
+ * {@code exceptionLabel}) and the registry accessor of
+ * {@link AbstractPrometheusMetricsCollector}.
+ */
+class AbstractPrometheusMetricsCollectorTest {
+
+    /** Shorthand for the exception-label helper under test. */
+    private static String labelOf(Exception error) {
+        return AbstractPrometheusMetricsCollector.exceptionLabel(error);
+    }
+
+    // --- nullToEmpty ---
+
+    @Test
+    void nullToEmptyReturnsEmptyForNull() {
+        String actual = AbstractPrometheusMetricsCollector.nullToEmpty(null);
+        assertEquals("", actual);
+    }
+
+    @Test
+    void nullToEmptyReturnsOriginalForNonNull() {
+        String actual = AbstractPrometheusMetricsCollector.nullToEmpty("hello");
+        assertEquals("hello", actual);
+    }
+
+    @Test
+    void nullToEmptyReturnsEmptyStringAsIs() {
+        String actual = AbstractPrometheusMetricsCollector.nullToEmpty("");
+        assertEquals("", actual);
+    }
+
+    // --- versionLabel ---
+
+    @Test
+    void versionLabelReturnsEmptyForNull() {
+        String actual = AbstractPrometheusMetricsCollector.versionLabel(null);
+        assertEquals("", actual);
+    }
+
+    @Test
+    void versionLabelReturnsStringForInteger() {
+        String actual = AbstractPrometheusMetricsCollector.versionLabel(42);
+        assertEquals("42", actual);
+    }
+
+    @Test
+    void versionLabelReturnsStringForZero() {
+        String actual = AbstractPrometheusMetricsCollector.versionLabel(0);
+        assertEquals("0", actual);
+    }
+
+    @Test
+    void versionLabelReturnsStringForNegative() {
+        String actual = AbstractPrometheusMetricsCollector.versionLabel(-1);
+        assertEquals("-1", actual);
+    }
+
+    // --- exceptionLabel ---
+
+    @Test
+    void exceptionLabelReturnsEmptyForNull() {
+        assertEquals("", labelOf(null));
+    }
+
+    @Test
+    void exceptionLabelReturnsSimpleClassName() {
+        assertEquals("RuntimeException", labelOf(new RuntimeException()));
+    }
+
+    @Test
+    void exceptionLabelReturnsDifferentExceptionTypes() {
+        assertEquals("IllegalStateException", labelOf(new IllegalStateException()));
+        assertEquals("NullPointerException", labelOf(new NullPointerException()));
+    }
+
+    // --- exceptionLabel: wrapper unwrapping ---
+
+    @Test
+    void exceptionLabelUnwrapsExecutionException() {
+        Exception cause = new IllegalArgumentException("bad arg");
+        assertEquals("IllegalArgumentException", labelOf(new ExecutionException(cause)));
+    }
+
+    @Test
+    void exceptionLabelUnwrapsCompletionException() {
+        Exception cause = new ArithmeticException("divide by zero");
+        assertEquals("ArithmeticException", labelOf(new CompletionException(cause)));
+    }
+
+    @Test
+    void exceptionLabelUnwrapsInvocationTargetException() {
+        Exception cause = new UnsupportedOperationException("nope");
+        assertEquals("UnsupportedOperationException", labelOf(new InvocationTargetException(cause)));
+    }
+
+    @Test
+    void exceptionLabelDoesNotUnwrapNonWrapperExceptions() {
+        // An ordinary exception that merely has a cause must be reported under its own name.
+        IllegalStateException withCause = new IllegalStateException("outer", new RuntimeException("inner"));
+        assertEquals("IllegalStateException", labelOf(withCause));
+    }
+
+    @Test
+    void exceptionLabelDoesNotUnwrapWrapperWithNullCause() {
+        assertEquals("ExecutionException", labelOf(new ExecutionException(null)));
+    }
+
+    @Test
+    void exceptionLabelHandlesAnonymousExceptionClass() {
+        String actual = labelOf(new RuntimeException("anon") { });
+        assertNotNull(actual);
+        assertFalse(actual.isEmpty());
+    }
+
+    // --- getRegistry ---
+
+    @Test
+    void getRegistryReturnsNonNull() {
+        assertNotNull(new LegacyPrometheusMetricsCollector().getRegistry());
+    }
+}
diff --git a/conductor-client-metrics/src/test/java/com/netflix/conductor/client/metrics/prometheus/ApiClientMetricsInterceptorTest.java b/conductor-client-metrics/src/test/java/com/netflix/conductor/client/metrics/prometheus/ApiClientMetricsInterceptorTest.java
new file mode 100644
index 000000000..adad748f7
--- /dev/null
+++ b/conductor-client-metrics/src/test/java/com/netflix/conductor/client/metrics/prometheus/ApiClientMetricsInterceptorTest.java
@@ -0,0 +1,187 @@
+/*
+ * Copyright 2025 Conductor Authors.
+ *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+package com.netflix.conductor.client.metrics.prometheus;
+
+import java.io.IOException;
+import java.time.Duration;
+
+import org.junit.jupiter.api.Test;
+
+import com.netflix.conductor.client.http.ConductorClient;
+import com.netflix.conductor.client.metrics.ApiClientMetrics;
+
+import okhttp3.Interceptor;
+import okhttp3.Protocol;
+import okhttp3.Request;
+import okhttp3.Response;
+
+import static org.junit.jupiter.api.Assertions.*;
+import static org.mockito.Mockito.*;
+
+/**
+ * Tests for {@code ApiClientMetricsInterceptor}: what it reports to
+ * {@link ApiClientMetrics} on success and on {@link IOException}, and that it
+ * never changes the outcome of the intercepted call.
+ */
+class ApiClientMetricsInterceptorTest {
+
+    private static Request buildRequest(String url) {
+        return new Request.Builder().url(url).get().build();
+    }
+
+    private static Response buildResponse(Request request, int code) {
+        return new Response.Builder()
+                .request(request)
+                .protocol(Protocol.HTTP_1_1)
+                .code(code)
+                .message("OK")
+                .build();
+    }
+
+    /** Mocked chain that hands out {@code request} and answers it with {@code response}. */
+    private static Interceptor.Chain chainReturning(Request request, Response response) throws IOException {
+        Interceptor.Chain chain = mock(Interceptor.Chain.class);
+        when(chain.request()).thenReturn(request);
+        when(chain.proceed(request)).thenReturn(response);
+        return chain;
+    }
+
+    /** Mocked chain that hands out {@code request} and fails with {@code failure} on proceed. */
+    private static Interceptor.Chain chainThrowing(Request request, IOException failure) throws IOException {
+        Interceptor.Chain chain = mock(Interceptor.Chain.class);
+        when(chain.request()).thenReturn(request);
+        when(chain.proceed(request)).thenThrow(failure);
+        return chain;
+    }
+
+    @Test
+    void interceptRecordsMetricsOnSuccess() throws IOException {
+        ApiClientMetrics metrics = mock(ApiClientMetrics.class);
+        var interceptor = new ApiClientMetricsInterceptor(metrics);
+        Request request = buildRequest("http://localhost/api/tasks");
+        Interceptor.Chain chain = chainReturning(request, buildResponse(request, 200));
+
+        Response result = interceptor.intercept(chain);
+
+        assertEquals(200, result.code());
+        verify(metrics).recordRequest(eq("GET"), eq("/api/tasks"), eq(200), any(Duration.class));
+    }
+
+    @Test
+    void interceptRecordsNegativeStatusOnIOException() throws IOException {
+        ApiClientMetrics metrics = mock(ApiClientMetrics.class);
+        var interceptor = new ApiClientMetricsInterceptor(metrics);
+        Request request = buildRequest("http://localhost/api/tasks");
+        Interceptor.Chain chain = chainThrowing(request, new IOException("connection refused"));
+
+        assertThrows(IOException.class, () -> interceptor.intercept(chain));
+
+        verify(metrics).recordRequest(eq("GET"), eq("/api/tasks"), eq(-1), any(Duration.class));
+    }
+
+    @Test
+    void interceptDoesNotSwallowIOException() throws IOException {
+        ApiClientMetrics metrics = mock(ApiClientMetrics.class);
+        var interceptor = new ApiClientMetricsInterceptor(metrics);
+        Request request = buildRequest("http://localhost/api/tasks");
+        IOException expected = new IOException("timeout");
+        Interceptor.Chain chain = chainThrowing(request, expected);
+
+        IOException thrown = assertThrows(IOException.class, () -> interceptor.intercept(chain));
+
+        assertSame(expected, thrown);
+    }
+
+    @Test
+    void interceptWithNullMetricsFallsBackToNoop() throws IOException {
+        var interceptor = new ApiClientMetricsInterceptor(null);
+        Request request = buildRequest("http://localhost/api/workflows");
+        Interceptor.Chain chain = chainReturning(request, buildResponse(request, 201));
+
+        Response result = interceptor.intercept(chain);
+
+        assertNotNull(result);
+        assertEquals(201, result.code());
+    }
+
+    @Test
+    void interceptPassesRequestUnmodified() throws IOException {
+        ApiClientMetrics metrics = mock(ApiClientMetrics.class);
+        var interceptor = new ApiClientMetricsInterceptor(metrics);
+        Request request = buildRequest("http://localhost/api/tasks/123");
+        Interceptor.Chain chain = chainReturning(request, buildResponse(request, 200));
+
+        interceptor.intercept(chain);
+
+        verify(chain).proceed(request);
+    }
+
+    @Test
+    void interceptSwallowsMetricsRecordingFailure() throws IOException {
+        ApiClientMetrics metrics = mock(ApiClientMetrics.class);
+        doThrow(new RuntimeException("metrics broken"))
+                .when(metrics).recordRequest(any(), any(), anyInt(), any());
+        var interceptor = new ApiClientMetricsInterceptor(metrics);
+        Request request = buildRequest("http://localhost/api/tasks");
+        Interceptor.Chain chain = chainReturning(request, buildResponse(request, 200));
+
+        Response result = assertDoesNotThrow(() -> interceptor.intercept(chain));
+
+        assertEquals(200, result.code());
+    }
+
+    @Test
+    void interceptUsesPathTemplateTagOverResolvedUrl() throws IOException {
+        ApiClientMetrics metrics = mock(ApiClientMetrics.class);
+        var interceptor = new ApiClientMetricsInterceptor(metrics);
+        // The request carries a path-template tag alongside its resolved URL.
+        Request request = new Request.Builder()
+                .url("http://localhost/api/workflow/abc-123-def")
+                .get()
+                .tag(ConductorClient.PathTemplateTag.class,
+                        new ConductorClient.PathTemplateTag("/workflow/{workflowId}"))
+                .build();
+        Interceptor.Chain chain = chainReturning(request, buildResponse(request, 200));
+
+        interceptor.intercept(chain);
+
+        verify(metrics).recordRequest(
+                eq("GET"), eq("/workflow/{workflowId}"), eq(200), any(Duration.class));
+    }
+
+    @Test
+    void interceptExtractsEncodedPath() throws IOException {
+        ApiClientMetrics metrics = mock(ApiClientMetrics.class);
+        var interceptor = new ApiClientMetricsInterceptor(metrics);
+        Request request = buildRequest("http://localhost/api/tasks/batch?ids=1,2,3");
+        Interceptor.Chain chain = chainReturning(request, buildResponse(request, 200));
+
+        interceptor.intercept(chain);
+
+        verify(metrics).recordRequest(eq("GET"), eq("/api/tasks/batch"), eq(200), any(Duration.class));
+    }
+}
diff --git a/conductor-client-metrics/src/test/java/com/netflix/conductor/client/metrics/prometheus/CanonicalPrometheusMetricsCollectorTest.java b/conductor-client-metrics/src/test/java/com/netflix/conductor/client/metrics/prometheus/CanonicalPrometheusMetricsCollectorTest.java
new file mode 100644
index 000000000..bccdc82f7
--- /dev/null
+++ b/conductor-client-metrics/src/test/java/com/netflix/conductor/client/metrics/prometheus/CanonicalPrometheusMetricsCollectorTest.java
@@ -0,0 +1,322 @@
+/*
+ * Copyright 2025 Conductor Authors.
+ *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package com.netflix.conductor.client.metrics.prometheus; + +import java.util.concurrent.TimeUnit; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import com.netflix.conductor.client.events.task.TaskPayloadUsedEvent; +import com.netflix.conductor.client.events.task.TaskResultPayloadSizeEvent; +import com.netflix.conductor.client.events.taskrunner.ActiveWorkersChanged; +import com.netflix.conductor.client.events.taskrunner.PollCompleted; +import com.netflix.conductor.client.events.taskrunner.PollFailure; +import com.netflix.conductor.client.events.taskrunner.PollStarted; +import com.netflix.conductor.client.events.taskrunner.TaskAckError; +import com.netflix.conductor.client.events.taskrunner.TaskAckFailure; +import com.netflix.conductor.client.events.taskrunner.TaskExecutionCompleted; +import com.netflix.conductor.client.events.taskrunner.TaskExecutionFailure; +import com.netflix.conductor.client.events.taskrunner.TaskExecutionQueueFull; +import com.netflix.conductor.client.events.taskrunner.TaskExecutionStarted; +import com.netflix.conductor.client.events.taskrunner.TaskPaused; +import com.netflix.conductor.client.events.taskrunner.TaskUpdateCompleted; +import com.netflix.conductor.client.events.taskrunner.TaskUpdateFailure; +import com.netflix.conductor.client.events.taskrunner.ThreadUncaughtException; +import com.netflix.conductor.client.events.workflow.WorkflowInputPayloadSizeEvent; +import com.netflix.conductor.client.events.workflow.WorkflowPayloadUsedEvent; +import com.netflix.conductor.client.events.workflow.WorkflowStartedEvent; + +import io.micrometer.prometheusmetrics.PrometheusConfig; +import 
io.micrometer.prometheusmetrics.PrometheusMeterRegistry; + +import static org.junit.jupiter.api.Assertions.*; + +class CanonicalPrometheusMetricsCollectorTest { + + private CanonicalPrometheusMetricsCollector collector; + private PrometheusMeterRegistry registry; + + @BeforeEach + void setUp() { + registry = new PrometheusMeterRegistry(PrometheusConfig.DEFAULT); + collector = new CanonicalPrometheusMetricsCollector(registry); + } + + // --- Poll lifecycle --- + + @Test + void pollStartedIncrementsCounter() { + collector.consume(new PollStarted("HTTP")); + + double count = registry.get("task_poll_total").tag("taskType", "HTTP").counter().count(); + assertEquals(1.0, count); + } + + @Test + void pollCompletedRecordsTimer() { + collector.consume(new PollCompleted("HTTP", 250)); + + var timer = registry.get("task_poll_time_seconds") + .tag("taskType", "HTTP").tag("status", "SUCCESS").timer(); + assertEquals(1, timer.count()); + assertTrue(timer.totalTime(TimeUnit.MILLISECONDS) >= 250); + } + + @Test + void pollFailureRecordsTimerAndErrorCounter() { + RuntimeException cause = new RuntimeException("timeout"); + collector.consume(new PollFailure("HTTP", 300, cause)); + + var timer = registry.get("task_poll_time_seconds") + .tag("taskType", "HTTP").tag("status", "FAILURE").timer(); + assertEquals(1, timer.count()); + + double errorCount = registry.get("task_poll_error_total") + .tag("taskType", "HTTP").tag("exception", "RuntimeException").counter().count(); + assertEquals(1.0, errorCount); + } + + // --- Task execution --- + + @Test + void taskExecutionStartedIncrementsCounter() { + collector.consume(new TaskExecutionStarted("SIMPLE", "t1", "w1")); + + double count = registry.get("task_execution_started_total") + .tag("taskType", "SIMPLE").counter().count(); + assertEquals(1.0, count); + } + + @Test + void taskExecutionCompletedRecordsTimer() { + collector.consume(new TaskExecutionCompleted("SIMPLE", "t1", "w1", 500)); + + var timer = 
registry.get("task_execute_time_seconds") + .tag("taskType", "SIMPLE").tag("status", "SUCCESS").timer(); + assertEquals(1, timer.count()); + } + + @Test + void taskExecutionFailureRecordsTimerAndErrorCounter() { + IllegalStateException cause = new IllegalStateException("bad"); + collector.consume(new TaskExecutionFailure("SIMPLE", "t1", "w1", cause, 600)); + + var timer = registry.get("task_execute_time_seconds") + .tag("taskType", "SIMPLE").tag("status", "FAILURE").timer(); + assertEquals(1, timer.count()); + + double errorCount = registry.get("task_execute_error_total") + .tag("taskType", "SIMPLE").tag("exception", "IllegalStateException").counter().count(); + assertEquals(1.0, errorCount); + } + + // --- Active workers gauge --- + + @Test + void activeWorkersChangedSetsGauge() { + collector.consume(new ActiveWorkersChanged("SIMPLE", 7)); + + double val = registry.get("active_workers").tag("taskType", "SIMPLE").gauge().value(); + assertEquals(7.0, val); + } + + @Test + void activeWorkersChangedClampsNegativeToZero() { + collector.consume(new ActiveWorkersChanged("SIMPLE", -1)); + + double val = registry.get("active_workers").tag("taskType", "SIMPLE").gauge().value(); + assertEquals(0.0, val); + } + + @Test + void activeWorkersChangedUpdatesExistingGauge() { + collector.consume(new ActiveWorkersChanged("SIMPLE", 3)); + collector.consume(new ActiveWorkersChanged("SIMPLE", 10)); + + double val = registry.get("active_workers").tag("taskType", "SIMPLE").gauge().value(); + assertEquals(10.0, val); + } + + // --- Task update --- + + @Test + void taskUpdateCompletedRecordsTimer() { + collector.consume(new TaskUpdateCompleted("SIMPLE", "t1", "w1", "wf1", 400)); + + var timer = registry.get("task_update_time_seconds") + .tag("taskType", "SIMPLE").tag("status", "SUCCESS").timer(); + assertEquals(1, timer.count()); + } + + @Test + void taskUpdateFailureRecordsTimerAndErrorCounter() { + RuntimeException cause = new RuntimeException("update failed"); + collector.consume(new 
TaskUpdateFailure("SIMPLE", "t1", "w1", "wf1", cause, 500)); + + var timer = registry.get("task_update_time_seconds") + .tag("taskType", "SIMPLE").tag("status", "FAILURE").timer(); + assertEquals(1, timer.count()); + + double errorCount = registry.get("task_update_error_total") + .tag("taskType", "SIMPLE").tag("exception", "RuntimeException").counter().count(); + assertEquals(1.0, errorCount); + } + + // --- Ack / queueing / lifecycle --- + + @Test + void taskAckFailureIncrementsCounter() { + collector.consume(new TaskAckFailure("HTTP", "t1")); + + double count = registry.get("task_ack_failed_total") + .tag("taskType", "HTTP").counter().count(); + assertEquals(1.0, count); + } + + @Test + void taskAckErrorIncrementsCounter() { + collector.consume(new TaskAckError("HTTP", "t1", new RuntimeException("net"))); + + double count = registry.get("task_ack_error_total") + .tag("taskType", "HTTP").tag("exception", "RuntimeException").counter().count(); + assertEquals(1.0, count); + } + + @Test + void taskExecutionQueueFullIncrementsCounter() { + collector.consume(new TaskExecutionQueueFull("SIMPLE")); + + double count = registry.get("task_execution_queue_full_total") + .tag("taskType", "SIMPLE").counter().count(); + assertEquals(1.0, count); + } + + @Test + void taskPausedIncrementsCounter() { + collector.consume(new TaskPaused("SIMPLE")); + + double count = registry.get("task_paused_total") + .tag("taskType", "SIMPLE").counter().count(); + assertEquals(1.0, count); + } + + @Test + void threadUncaughtExceptionIncrementsCounter() { + collector.consume(new ThreadUncaughtException(new NullPointerException())); + + double count = registry.get("thread_uncaught_exceptions_total") + .tag("exception", "NullPointerException").counter().count(); + assertEquals(1.0, count); + } + + // --- Payload / workflow events --- + + @Test + void taskPayloadUsedIncrementsCounter() { + collector.consume(new TaskPayloadUsedEvent("HTTP", "WRITE", "output")); + + double count = 
registry.get("external_payload_used_total") + .tag("entityName", "HTTP").tag("operation", "WRITE").tag("payloadType", "output") + .counter().count(); + assertEquals(1.0, count); + } + + @Test + void taskResultPayloadSizeRecordsHistogram() { + collector.consume(new TaskResultPayloadSizeEvent("HTTP", 50_000L)); + + var summary = registry.get("task_result_size_bytes") + .tag("taskType", "HTTP").summary(); + assertEquals(1, summary.count()); + assertEquals(50_000.0, summary.totalAmount()); + } + + @Test + void workflowPayloadUsedIncrementsCounter() { + collector.consume(new WorkflowPayloadUsedEvent("myWf", 1, "READ", "input")); + + double count = registry.get("external_payload_used_total") + .tag("entityName", "myWf").tag("operation", "READ").tag("payloadType", "input") + .counter().count(); + assertEquals(1.0, count); + } + + @Test + void workflowInputPayloadSizeRecordsHistogram() { + collector.consume(new WorkflowInputPayloadSizeEvent("myWf", 2, 10_000L)); + + var summary = registry.get("workflow_input_size_bytes") + .tag("workflowType", "myWf").tag("version", "2").summary(); + assertEquals(1, summary.count()); + assertEquals(10_000.0, summary.totalAmount()); + } + + @Test + void workflowStartedSuccessDoesNotIncrementErrorCounter() { + collector.consume(new WorkflowStartedEvent("myWf", 1)); + + assertNull(registry.find("workflow_start_error_total").counter()); + } + + @Test + void workflowStartedFailureIncrementsErrorCounter() { + RuntimeException cause = new RuntimeException("start failed"); + collector.consume(new WorkflowStartedEvent("myWf", 1, false, cause)); + + double count = registry.get("workflow_start_error_total") + .tag("workflowType", "myWf").tag("exception", "RuntimeException").counter().count(); + assertEquals(1.0, count); + } + + // --- getApiClientMetrics --- + + @Test + void getApiClientMetricsReturnsNonNull() { + assertNotNull(collector.getApiClientMetrics()); + assertInstanceOf(PrometheusApiClientMetrics.class, collector.getApiClientMetrics()); + } 
+ + // --- Multiple increments accumulate --- + + @Test + void repeatedPollStartedAccumulates() { + collector.consume(new PollStarted("HTTP")); + collector.consume(new PollStarted("HTTP")); + collector.consume(new PollStarted("HTTP")); + + double count = registry.get("task_poll_total").tag("taskType", "HTTP").counter().count(); + assertEquals(3.0, count); + } + + // --- Null-safe label handling --- + + @Test + void taskPayloadUsedWithNullFieldsDoesNotThrow() { + assertDoesNotThrow(() -> collector.consume(new TaskPayloadUsedEvent(null, null, null))); + + double count = registry.get("external_payload_used_total") + .tag("entityName", "").tag("operation", "").tag("payloadType", "") + .counter().count(); + assertEquals(1.0, count); + } + + @Test + void workflowInputPayloadSizeWithNullVersion() { + assertDoesNotThrow(() -> collector.consume(new WorkflowInputPayloadSizeEvent("wf", null, 100L))); + + var summary = registry.get("workflow_input_size_bytes") + .tag("workflowType", "wf").tag("version", "").summary(); + assertEquals(1, summary.count()); + } +} diff --git a/conductor-client-metrics/src/test/java/com/netflix/conductor/client/metrics/prometheus/LegacyPrometheusMetricsCollectorTest.java b/conductor-client-metrics/src/test/java/com/netflix/conductor/client/metrics/prometheus/LegacyPrometheusMetricsCollectorTest.java new file mode 100644 index 000000000..33058c7fd --- /dev/null +++ b/conductor-client-metrics/src/test/java/com/netflix/conductor/client/metrics/prometheus/LegacyPrometheusMetricsCollectorTest.java @@ -0,0 +1,209 @@ +/* + * Copyright 2025 Conductor Authors. + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package com.netflix.conductor.client.metrics.prometheus; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import com.netflix.conductor.client.events.task.TaskPayloadUsedEvent; +import com.netflix.conductor.client.events.task.TaskResultPayloadSizeEvent; +import com.netflix.conductor.client.events.taskrunner.ActiveWorkersChanged; +import com.netflix.conductor.client.events.taskrunner.PollCompleted; +import com.netflix.conductor.client.events.taskrunner.PollFailure; +import com.netflix.conductor.client.events.taskrunner.PollStarted; +import com.netflix.conductor.client.events.taskrunner.TaskAckError; +import com.netflix.conductor.client.events.taskrunner.TaskAckFailure; +import com.netflix.conductor.client.events.taskrunner.TaskExecutionCompleted; +import com.netflix.conductor.client.events.taskrunner.TaskExecutionFailure; +import com.netflix.conductor.client.events.taskrunner.TaskExecutionQueueFull; +import com.netflix.conductor.client.events.taskrunner.TaskExecutionStarted; +import com.netflix.conductor.client.events.taskrunner.TaskPaused; +import com.netflix.conductor.client.events.taskrunner.TaskUpdateCompleted; +import com.netflix.conductor.client.events.taskrunner.TaskUpdateFailure; +import com.netflix.conductor.client.events.taskrunner.ThreadUncaughtException; +import com.netflix.conductor.client.events.workflow.WorkflowInputPayloadSizeEvent; +import com.netflix.conductor.client.events.workflow.WorkflowPayloadUsedEvent; +import com.netflix.conductor.client.events.workflow.WorkflowStartedEvent; +import com.netflix.conductor.client.metrics.ApiClientMetrics; + +import 
io.micrometer.prometheusmetrics.PrometheusConfig; +import io.micrometer.prometheusmetrics.PrometheusMeterRegistry; + +import static org.junit.jupiter.api.Assertions.*; + +class LegacyPrometheusMetricsCollectorTest { + + private LegacyPrometheusMetricsCollector collector; + private PrometheusMeterRegistry registry; + + @BeforeEach + void setUp() { + registry = new PrometheusMeterRegistry(PrometheusConfig.DEFAULT); + collector = new LegacyPrometheusMetricsCollector(registry); + } + + // --- Active legacy metrics --- + + @Test + void pollStartedRecordsLegacyCounter() { + collector.consume(new PollStarted("HTTP")); + + double count = registry.get("poll_started").tag("type", "HTTP").counter().count(); + assertEquals(1.0, count); + } + + @Test + void pollCompletedRecordsLegacyTimer() { + collector.consume(new PollCompleted("HTTP", 200)); + + var timer = registry.get("poll_success").tag("type", "HTTP").timer(); + assertEquals(1, timer.count()); + } + + @Test + void pollFailureRecordsLegacyTimer() { + collector.consume(new PollFailure("HTTP", 300, new RuntimeException())); + + var timer = registry.get("poll_failure").tag("type", "HTTP").timer(); + assertEquals(1, timer.count()); + } + + @Test + void taskExecutionStartedRecordsLegacyCounter() { + collector.consume(new TaskExecutionStarted("SIMPLE", "t1", "w1")); + + double count = registry.get("task_execution_started").tag("type", "SIMPLE").counter().count(); + assertEquals(1.0, count); + } + + @Test + void taskExecutionCompletedRecordsLegacyTimer() { + collector.consume(new TaskExecutionCompleted("SIMPLE", "t1", "w1", 400)); + + var timer = registry.get("task_execution_completed").tag("type", "SIMPLE").timer(); + assertEquals(1, timer.count()); + } + + @Test + void taskExecutionFailureRecordsLegacyTimer() { + collector.consume(new TaskExecutionFailure("SIMPLE", "t1", "w1", new RuntimeException(), 500)); + + var timer = registry.get("task_execution_failure").tag("type", "SIMPLE").timer(); + assertEquals(1, timer.count()); + 
} + + // --- No-op events produce no metrics --- + + @Test + void taskUpdateCompletedIsNoop() { + collector.consume(new TaskUpdateCompleted("SIMPLE", "t1", "w1", "wf1", 100)); + + assertNull(registry.find("task_update_time_seconds").timer()); + } + + @Test + void taskUpdateFailureIsNoop() { + collector.consume(new TaskUpdateFailure("SIMPLE", "t1", "w1", "wf1", new RuntimeException(), 100)); + + assertNull(registry.find("task_update_time_seconds").timer()); + assertNull(registry.find("task_update_error_total").counter()); + } + + @Test + void taskAckFailureIsNoop() { + collector.consume(new TaskAckFailure("HTTP", "t1")); + + assertNull(registry.find("task_ack_failed_total").counter()); + } + + @Test + void taskAckErrorIsNoop() { + collector.consume(new TaskAckError("HTTP", "t1", new RuntimeException())); + + assertNull(registry.find("task_ack_error_total").counter()); + } + + @Test + void taskExecutionQueueFullIsNoop() { + collector.consume(new TaskExecutionQueueFull("SIMPLE")); + + assertNull(registry.find("task_execution_queue_full_total").counter()); + } + + @Test + void taskPausedIsNoop() { + collector.consume(new TaskPaused("SIMPLE")); + + assertNull(registry.find("task_paused_total").counter()); + } + + @Test + void threadUncaughtExceptionIsNoop() { + collector.consume(new ThreadUncaughtException(new RuntimeException())); + + assertNull(registry.find("thread_uncaught_exceptions_total").counter()); + } + + @Test + void taskPayloadUsedIsNoop() { + collector.consume(new TaskPayloadUsedEvent("HTTP", "WRITE", "output")); + + assertNull(registry.find("external_payload_used_total").counter()); + } + + @Test + void taskResultPayloadSizeIsNoop() { + collector.consume(new TaskResultPayloadSizeEvent("HTTP", 1024L)); + + assertNull(registry.find("task_result_size_bytes").summary()); + } + + @Test + void workflowPayloadUsedIsNoop() { + collector.consume(new WorkflowPayloadUsedEvent("wf", 1, "READ", "input")); + + 
assertNull(registry.find("external_payload_used_total").counter()); + } + + @Test + void workflowInputPayloadSizeIsNoop() { + collector.consume(new WorkflowInputPayloadSizeEvent("wf", 1, 1024L)); + + assertNull(registry.find("workflow_input_size_bytes").summary()); + } + + @Test + void workflowStartedIsNoop() { + collector.consume(new WorkflowStartedEvent("wf", 1, false, new RuntimeException())); + + assertNull(registry.find("workflow_start_error_total").counter()); + } + + // --- ActiveWorkersChanged has no default implementation to override, + // but the interface provides a default no-op. Legacy doesn't override it, + // so calling it should not produce any metric. --- + + @Test + void activeWorkersChangedUsesDefaultNoop() { + collector.consume(new ActiveWorkersChanged("SIMPLE", 5)); + + assertNull(registry.find("active_workers").gauge()); + } + + // --- getApiClientMetrics returns NOOP --- + + @Test + void getApiClientMetricsReturnsNoop() { + assertSame(ApiClientMetrics.NOOP, collector.getApiClientMetrics()); + } +} diff --git a/conductor-client-metrics/src/test/java/com/netflix/conductor/client/metrics/prometheus/MetricsCollectorFactoryTest.java b/conductor-client-metrics/src/test/java/com/netflix/conductor/client/metrics/prometheus/MetricsCollectorFactoryTest.java new file mode 100644 index 000000000..5d95c6da4 --- /dev/null +++ b/conductor-client-metrics/src/test/java/com/netflix/conductor/client/metrics/prometheus/MetricsCollectorFactoryTest.java @@ -0,0 +1,186 @@ +/* + * Copyright 2025 Conductor Authors. + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package com.netflix.conductor.client.metrics.prometheus; + +import java.util.HashMap; +import java.util.Map; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class MetricsCollectorFactoryTest { + + // --- Factory selection --- + + @Test + void createReturnsCanonicalWhenEnvIsTrue() { + Map env = Map.of("WORKER_CANONICAL_METRICS", "true"); + var collector = MetricsCollectorFactory.create(env::get); + + assertInstanceOf(CanonicalPrometheusMetricsCollector.class, collector); + } + + @Test + void createReturnsLegacyWhenEnvNotSet() { + Map env = Map.of(); + var collector = MetricsCollectorFactory.create(env::get); + + assertInstanceOf(LegacyPrometheusMetricsCollector.class, collector); + } + + @Test + void createReturnsLegacyWhenEnvIsFalse() { + Map env = Map.of("WORKER_CANONICAL_METRICS", "false"); + var collector = MetricsCollectorFactory.create(env::get); + + assertInstanceOf(LegacyPrometheusMetricsCollector.class, collector); + } + + @Test + void createReturnsLegacyWhenEnvIsArbitraryString() { + Map env = Map.of("WORKER_CANONICAL_METRICS", "enabled"); + var collector = MetricsCollectorFactory.create(env::get); + + assertInstanceOf(LegacyPrometheusMetricsCollector.class, collector); + } + + @Test + void canonicalTakesPriorityOverLegacy() { + Map env = Map.of( + "WORKER_CANONICAL_METRICS", "true", + "WORKER_LEGACY_METRICS", "true"); + var collector = MetricsCollectorFactory.create(env::get); + + assertInstanceOf(CanonicalPrometheusMetricsCollector.class, collector); + } + + // --- collectorName() --- + + @Test + void legacyCollectorNameReturnsLegacy() { + Map env = Map.of(); + var collector = 
MetricsCollectorFactory.create(env::get); + + assertEquals("legacy", collector.collectorName()); + } + + @Test + void canonicalCollectorNameReturnsCanonical() { + Map env = Map.of("WORKER_CANONICAL_METRICS", "true"); + var collector = MetricsCollectorFactory.create(env::get); + + assertEquals("canonical", collector.collectorName()); + } + + // --- Env var truthiness --- + + @Test + void envBoolTrueString() { + assertTrue(MetricsCollectorFactory.envBool("X", false, name -> "true")); + } + + @Test + void envBoolOneString() { + assertTrue(MetricsCollectorFactory.envBool("X", false, name -> "1")); + } + + @Test + void envBoolYesString() { + assertTrue(MetricsCollectorFactory.envBool("X", false, name -> "yes")); + } + + @Test + void envBoolTrueUpperCase() { + assertTrue(MetricsCollectorFactory.envBool("X", false, name -> "TRUE")); + } + + @Test + void envBoolTrueMixedCase() { + assertTrue(MetricsCollectorFactory.envBool("X", false, name -> "True")); + } + + @Test + void envBoolYesUpperCase() { + assertTrue(MetricsCollectorFactory.envBool("X", false, name -> "YES")); + } + + @Test + void envBoolTrueWithWhitespace() { + assertTrue(MetricsCollectorFactory.envBool("X", false, name -> " true ")); + } + + @Test + void envBoolOneWithWhitespace() { + assertTrue(MetricsCollectorFactory.envBool("X", false, name -> " 1 ")); + } + + @Test + void envBoolFalseString() { + assertFalse(MetricsCollectorFactory.envBool("X", false, name -> "false")); + } + + @Test + void envBoolZeroString() { + assertFalse(MetricsCollectorFactory.envBool("X", false, name -> "0")); + } + + @Test + void envBoolNoString() { + assertFalse(MetricsCollectorFactory.envBool("X", false, name -> "no")); + } + + @Test + void envBoolArbitraryStringIsFalse() { + assertFalse(MetricsCollectorFactory.envBool("X", false, name -> "enabled")); + } + + @Test + void envBoolEmptyStringUsesDefault() { + assertFalse(MetricsCollectorFactory.envBool("X", false, name -> "")); + assertTrue(MetricsCollectorFactory.envBool("X", true, 
name -> "")); + } + + @Test + void envBoolBlankStringUsesDefault() { + assertFalse(MetricsCollectorFactory.envBool("X", false, name -> " ")); + assertTrue(MetricsCollectorFactory.envBool("X", true, name -> " ")); + } + + @Test + void envBoolNullUsesDefault() { + assertFalse(MetricsCollectorFactory.envBool("X", false, name -> null)); + assertTrue(MetricsCollectorFactory.envBool("X", true, name -> null)); + } + + @Test + void envBoolExplicitTrueOverridesDefaultFalse() { + assertTrue(MetricsCollectorFactory.envBool("X", false, name -> "true")); + } + + @Test + void envBoolExplicitFalseOverridesDefaultTrue() { + assertFalse(MetricsCollectorFactory.envBool("X", true, name -> "false")); + } + + @Test + void envBoolReadsCorrectEnvVarName() { + Map env = new HashMap<>(); + env.put("MY_VAR", "true"); + env.put("OTHER_VAR", "false"); + + assertTrue(MetricsCollectorFactory.envBool("MY_VAR", false, env::get)); + assertFalse(MetricsCollectorFactory.envBool("OTHER_VAR", false, env::get)); + assertFalse(MetricsCollectorFactory.envBool("MISSING_VAR", false, env::get)); + } +} diff --git a/conductor-client-metrics/src/test/java/com/netflix/conductor/client/metrics/prometheus/PrometheusApiClientMetricsTest.java b/conductor-client-metrics/src/test/java/com/netflix/conductor/client/metrics/prometheus/PrometheusApiClientMetricsTest.java new file mode 100644 index 000000000..2f1440937 --- /dev/null +++ b/conductor-client-metrics/src/test/java/com/netflix/conductor/client/metrics/prometheus/PrometheusApiClientMetricsTest.java @@ -0,0 +1,133 @@ +/* + * Copyright 2025 Conductor Authors. + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package com.netflix.conductor.client.metrics.prometheus; + +import java.time.Duration; +import java.util.concurrent.TimeUnit; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import io.micrometer.prometheusmetrics.PrometheusConfig; +import io.micrometer.prometheusmetrics.PrometheusMeterRegistry; + +import static org.junit.jupiter.api.Assertions.*; + +class PrometheusApiClientMetricsTest { + + private PrometheusMeterRegistry registry; + private PrometheusApiClientMetrics metrics; + + @BeforeEach + void setUp() { + registry = new PrometheusMeterRegistry(PrometheusConfig.DEFAULT); + metrics = new PrometheusApiClientMetrics(registry); + } + + @Test + void recordRequestCreatesTimerWithCorrectTags() { + metrics.recordRequest("GET", "/api/tasks", 200, Duration.ofMillis(150)); + + var timer = registry.get("http_api_client_request_seconds") + .tag("method", "GET") + .tag("uri", "/api/tasks") + .tag("status", "200") + .timer(); + assertEquals(1, timer.count()); + assertTrue(timer.totalTime(TimeUnit.MILLISECONDS) >= 150); + } + + @Test + void recordRequestWithPostMethod() { + metrics.recordRequest("POST", "/api/workflows", 201, Duration.ofMillis(50)); + + var timer = registry.get("http_api_client_request_seconds") + .tag("method", "POST") + .tag("uri", "/api/workflows") + .tag("status", "201") + .timer(); + assertEquals(1, timer.count()); + } + + @Test + void recordRequestNegativeStatusCodeBecomesZero() { + metrics.recordRequest("GET", "/api/tasks", -1, Duration.ofMillis(10)); + + var timer = registry.get("http_api_client_request_seconds") + .tag("method", "GET") + .tag("uri", "/api/tasks") + .tag("status", "0") + .timer(); 
+ assertEquals(1, timer.count()); + } + + @Test + void recordRequestZeroStatusCodeBecomesZero() { + metrics.recordRequest("GET", "/api/tasks", 0, Duration.ofMillis(10)); + + var timer = registry.get("http_api_client_request_seconds") + .tag("status", "0") + .timer(); + assertEquals(1, timer.count()); + } + + @Test + void recordRequestNullMethodBecomesEmpty() { + metrics.recordRequest(null, "/api/tasks", 200, Duration.ofMillis(10)); + + var timer = registry.get("http_api_client_request_seconds") + .tag("method", "") + .tag("uri", "/api/tasks") + .timer(); + assertEquals(1, timer.count()); + } + + @Test + void recordRequestNullUriBecomesEmpty() { + metrics.recordRequest("GET", null, 200, Duration.ofMillis(10)); + + var timer = registry.get("http_api_client_request_seconds") + .tag("method", "GET") + .tag("uri", "") + .timer(); + assertEquals(1, timer.count()); + } + + @Test + void multipleRequestsAccumulateInSameTimer() { + metrics.recordRequest("GET", "/api/tasks", 200, Duration.ofMillis(100)); + metrics.recordRequest("GET", "/api/tasks", 200, Duration.ofMillis(200)); + metrics.recordRequest("GET", "/api/tasks", 200, Duration.ofMillis(300)); + + var timer = registry.get("http_api_client_request_seconds") + .tag("method", "GET") + .tag("uri", "/api/tasks") + .tag("status", "200") + .timer(); + assertEquals(3, timer.count()); + } + + @Test + void differentStatusCodesCreateSeparateTimers() { + metrics.recordRequest("GET", "/api/tasks", 200, Duration.ofMillis(10)); + metrics.recordRequest("GET", "/api/tasks", 500, Duration.ofMillis(10)); + + var timer200 = registry.get("http_api_client_request_seconds") + .tag("status", "200").timer(); + var timer500 = registry.get("http_api_client_request_seconds") + .tag("status", "500").timer(); + + assertEquals(1, timer200.count()); + assertEquals(1, timer500.count()); + } +} diff --git a/conductor-client/src/main/java/com/netflix/conductor/client/automator/TaskRunner.java 
b/conductor-client/src/main/java/com/netflix/conductor/client/automator/TaskRunner.java index 7d19050f0..202ae2fb8 100644 --- a/conductor-client/src/main/java/com/netflix/conductor/client/automator/TaskRunner.java +++ b/conductor-client/src/main/java/com/netflix/conductor/client/automator/TaskRunner.java @@ -30,6 +30,7 @@ import java.util.concurrent.Semaphore; import java.util.concurrent.ThreadFactory; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Function; import org.apache.commons.lang3.concurrent.BasicThreadFactory; @@ -39,13 +40,19 @@ import com.netflix.conductor.client.automator.filters.PollFilter; import com.netflix.conductor.client.config.PropertyFactory; import com.netflix.conductor.client.events.dispatcher.EventDispatcher; +import com.netflix.conductor.client.events.taskrunner.ActiveWorkersChanged; import com.netflix.conductor.client.events.taskrunner.PollCompleted; import com.netflix.conductor.client.events.taskrunner.PollFailure; import com.netflix.conductor.client.events.taskrunner.PollStarted; import com.netflix.conductor.client.events.taskrunner.TaskExecutionCompleted; import com.netflix.conductor.client.events.taskrunner.TaskExecutionFailure; +import com.netflix.conductor.client.events.taskrunner.TaskExecutionQueueFull; import com.netflix.conductor.client.events.taskrunner.TaskExecutionStarted; +import com.netflix.conductor.client.events.taskrunner.TaskPaused; import com.netflix.conductor.client.events.taskrunner.TaskRunnerEvent; +import com.netflix.conductor.client.events.taskrunner.TaskUpdateCompleted; +import com.netflix.conductor.client.events.taskrunner.TaskUpdateFailure; +import com.netflix.conductor.client.events.taskrunner.ThreadUncaughtException; import com.netflix.conductor.client.http.TaskClient; import com.netflix.conductor.client.worker.Worker; import com.netflix.conductor.common.metadata.tasks.Task; @@ -77,6 +84,9 @@ class TaskRunner { private static final double 
LEASE_EXTEND_DURATION_FACTOR = 0.8; private final ScheduledExecutorService leaseExtendExecutorService; private Map> leaseExtendMap = new ConcurrentHashMap<>(); + private final boolean trackActiveWorkers; + private final boolean trackDiagnosticEvents; + private final AtomicInteger activeWorkerCount = new AtomicInteger(0); TaskRunner(Worker worker, TaskClient taskClient, @@ -87,6 +97,8 @@ class TaskRunner { int taskPollTimeout, List pollFilters, EventDispatcher eventDispatcher, + boolean trackActiveWorkers, + boolean trackDiagnosticEvents, boolean useVirtualThreads) { this.worker = worker; this.taskClient = taskClient; @@ -97,6 +109,8 @@ class TaskRunner { this.permits = new Semaphore(threadCount); this.pollFilters = pollFilters; this.eventDispatcher = eventDispatcher; + this.trackActiveWorkers = trackActiveWorkers; + this.trackDiagnosticEvents = trackDiagnosticEvents; this.tasksTobeExecuted = new LinkedBlockingQueue<>(); this.enableUpdateV2 = Boolean.parseBoolean(System.getProperty("taskUpdateV2", "false")) || Boolean.parseBoolean(System.getenv("taskUpdateV2")); @@ -122,6 +136,7 @@ class TaskRunner { } this.errorAt = errorInterval; LOGGER.info("Polling errors will be sampled at every {} error (after the first 100 errors) for taskType {}", this.errorAt, taskType); + this.uncaughtExceptionHandler = this::onUncaughtException; ThreadFactory threadFactory = null; if(useVirtualThreads) { threadFactory = Thread.ofVirtual().name(workerNamePrefix).uncaughtExceptionHandler(uncaughtExceptionHandler).factory(); @@ -171,7 +186,15 @@ public void pollAndExecute() { stopwatch = null; } tasks.forEach(task -> { - Future taskFuture = this.executorService.submit(() -> this.processTask(task)); + Future taskFuture; + try { + taskFuture = this.executorService.submit(() -> this.processTask(task)); + } catch (java.util.concurrent.RejectedExecutionException e) { + LOGGER.error("Task rejected by executor (likely shutting down); dropping task {} of type {}", + task.getTaskId(), taskType, e); + 
permits.release(); + return; + } if (task.getResponseTimeoutSeconds() > 0 && worker.leaseExtendEnabled()) { ScheduledFuture existingFuture = leaseExtendMap.remove(task.getTaskId()); @@ -228,6 +251,9 @@ private List pollTasksForWorker() { if (worker.paused()) { LOGGER.trace("Worker {} has been paused. Not polling anymore!", worker.getClass()); + if (trackDiagnosticEvents) { + eventDispatcher.publish(new TaskPaused(taskType)); + } return List.of(); } @@ -244,6 +270,9 @@ private List pollTasksForWorker() { } if (pollCount == 0) { + if (trackDiagnosticEvents) { + eventDispatcher.publish(new TaskExecutionQueueFull(taskType)); + } return List.of(); } @@ -306,14 +335,26 @@ private List pollTask(int count) { } @SuppressWarnings("FieldCanBeLocal") - private final Thread.UncaughtExceptionHandler uncaughtExceptionHandler = - (thread, error) -> { - // JVM may be in unstable state, try to send metrics then exit - LOGGER.error("Uncaught exception. Thread {} will exit now", thread, error); - }; + private final Thread.UncaughtExceptionHandler uncaughtExceptionHandler; + + private void onUncaughtException(Thread thread, Throwable error) { + // JVM may be in unstable state, try to send metrics then exit. + // Use publishSync (not publish) to avoid CompletableFuture.runAsync, + // which requires heap allocation and ForkJoinPool thread handoff -- + // unsafe when the trigger may be OutOfMemoryError. + LOGGER.error("Uncaught exception. 
Thread {} will exit now", thread, error); + try { + eventDispatcher.publishSync(new ThreadUncaughtException(taskType, error)); + } catch (Throwable t) { + LOGGER.warn("Failed to publish ThreadUncaughtException event", t); + } + } private Task processTask(Task task) { eventDispatcher.publish(new TaskExecutionStarted(taskType, task.getTaskId(), worker.getIdentity())); + if (trackActiveWorkers) { + eventDispatcher.publish(new ActiveWorkersChanged(taskType, activeWorkerCount.incrementAndGet())); + } // record execution start time for a task task.getExecutionMetadata().setExecutionStartTime(System.currentTimeMillis()); @@ -336,6 +377,9 @@ private Task processTask(Task task) { } finally { cancelLeaseExtension(task.getTaskId()); permits.release(); + if (trackActiveWorkers) { + eventDispatcher.publish(new ActiveWorkersChanged(taskType, activeWorkerCount.decrementAndGet())); + } } return task; } @@ -398,16 +442,11 @@ private void executeTask(Worker worker, Task task) { worker.getClass().getSimpleName(), worker.getIdentity(), result.getStatus()); - Stopwatch updateStopWatch = Stopwatch.createStarted(); updateTaskResult(updateRetryCount, task, result, worker); - updateStopWatch.stop(); - LOGGER.trace( - "Time taken to update the {} {} ms", - task.getTaskType(), - updateStopWatch.elapsed(TimeUnit.MILLISECONDS)); } private void updateTaskResult(int count, Task task, TaskResult result, Worker worker) { + Stopwatch updateStopwatch = Stopwatch.createStarted(); try { // upload if necessary Optional optionalExternalStorageLocation = @@ -428,14 +467,18 @@ private void updateTaskResult(int count, Task task, TaskResult result, Worker wo LOGGER.debug("Task {} outbound send time: {}", task.getTaskId(), clientSendTime); if(enableUpdateV2) { - Task nextTask = retryOperation(taskClient::updateTaskV2, count, result, "updateTaskV2"); + Task nextTask = retryOperation( + (TaskResult taskResult) -> taskClient.updateTaskV2(taskResult, taskType), + count, + result, + "updateTaskV2"); if (nextTask 
!= null) { tasksTobeExecuted.add(nextTask); } } else { retryOperation( (TaskResult taskResult) -> { - taskClient.updateTask(taskResult); + taskClient.updateTask(taskResult, taskType); return null; }, count, @@ -443,7 +486,24 @@ private void updateTaskResult(int count, Task task, TaskResult result, Worker wo "updateTask"); } + updateStopwatch.stop(); + eventDispatcher.publish(new TaskUpdateCompleted( + taskType, + task.getTaskId(), + worker.getIdentity(), + task.getWorkflowInstanceId(), + updateStopwatch.elapsed(TimeUnit.MILLISECONDS))); } catch (Exception e) { + if (updateStopwatch.isRunning()) { + updateStopwatch.stop(); + } + eventDispatcher.publish(new TaskUpdateFailure( + taskType, + task.getTaskId(), + worker.getIdentity(), + task.getWorkflowInstanceId(), + e, + updateStopwatch.elapsed(TimeUnit.MILLISECONDS))); worker.onErrorUpdate(task); LOGGER.error( String.format( @@ -531,7 +591,7 @@ public void updateTask(TaskResult taskResult) { */ retryOperation( (TaskResult taskResult) -> { - taskClient.updateTask(taskResult); + taskClient.updateTask(taskResult, task.getTaskDefName()); return null; }, LEASE_EXTEND_RETRY_COUNT, diff --git a/conductor-client/src/main/java/com/netflix/conductor/client/automator/TaskRunnerConfigurer.java b/conductor-client/src/main/java/com/netflix/conductor/client/automator/TaskRunnerConfigurer.java index 7c174b2dc..2b3e83692 100644 --- a/conductor-client/src/main/java/com/netflix/conductor/client/automator/TaskRunnerConfigurer.java +++ b/conductor-client/src/main/java/com/netflix/conductor/client/automator/TaskRunnerConfigurer.java @@ -53,6 +53,9 @@ public class TaskRunnerConfigurer { private ExecutorService scheduledExecutorService; private final List pollFilters; private final EventDispatcher eventDispatcher; + private final MetricsCollector metricsCollector; + private final boolean trackActiveWorkers; + private final boolean trackDiagnosticEvents; private final boolean useVirtualThreads; /** @@ -74,6 +77,9 @@ private 
TaskRunnerConfigurer(TaskRunnerConfigurer.Builder builder) { this.threadCount = builder.threadCount; this.pollFilters = builder.pollFilters; this.eventDispatcher = builder.eventDispatcher; + this.metricsCollector = builder.metricsCollector; + this.trackActiveWorkers = builder.resolveActiveWorkersTracking(); + this.trackDiagnosticEvents = builder.resolveDiagnosticEvents(); this.useVirtualThreads = builder.useVirtualThreads; builder.workers.forEach(this.workers::add); taskRunners = new LinkedList<>(); @@ -157,6 +163,10 @@ public void shutdown() { } } scheduledExecutorService.shutdown(); + if (metricsCollector != null) { + ListenerRegister.unregister(metricsCollector, eventDispatcher); + taskClient.unregisterListeners(); + } } private void startWorker(Worker worker) { @@ -173,6 +183,8 @@ private void startWorker(Worker worker) { taskPollTimeout, pollFilters, eventDispatcher, + trackActiveWorkers, + trackDiagnosticEvents, useVirtualThreads); // startWorker(worker) is executed by several threads. 
// taskRunners.add(taskRunner) without synchronization could lead to a race condition and unpredictable behavior, @@ -207,6 +219,11 @@ public static class Builder { private final EventDispatcher eventDispatcher = new EventDispatcher<>(); private boolean useVirtualThreads; + private MetricsCollector metricsCollector; + private boolean metricsCollectorExplicitlySet; + private Boolean activeWorkersTrackingOverride; + private Boolean diagnosticEventsOverride; + public Builder(TaskClient taskClient, Iterable workers) { Preconditions.checkNotNull(taskClient, "TaskClient cannot be null"); Preconditions.checkNotNull(workers, "Workers cannot be null"); @@ -317,6 +334,16 @@ public TaskRunnerConfigurer.Builder withTaskPollCount(int defaultPollCount) { * @return Builder instance */ public TaskRunnerConfigurer build() { + if (!metricsCollectorExplicitlySet) { + var conductorClient = taskClient.getConductorClient(); + if (conductorClient != null) { + MetricsCollector mc = conductorClient.getMetricsCollector(); + if (mc != null && mc.isAutoWiringEnabled()) { + this.metricsCollector = mc; + ListenerRegister.register(mc, eventDispatcher); + } + } + } return new TaskRunnerConfigurer(this); } @@ -344,10 +371,48 @@ public Builder withListener(Class eventType, Cons } public Builder withMetricsCollector(MetricsCollector metricsCollector) { + this.metricsCollector = metricsCollector; ListenerRegister.register(metricsCollector, eventDispatcher); + this.metricsCollectorExplicitlySet = true; + return this; + } + + /** + * Explicitly enable or disable {@code ActiveWorkersChanged} event + * publishing on every task start/finish. When set, this overrides the + * default derived from the {@link MetricsCollector}. Canonical + * metrics enable this by default; legacy metrics do not. 
+ */ + public Builder withActiveWorkersTracking(boolean enabled) { + this.activeWorkersTrackingOverride = enabled; return this; } + private boolean resolveActiveWorkersTracking() { + if (activeWorkersTrackingOverride != null) { + return activeWorkersTrackingOverride; + } + return metricsCollector != null && metricsCollector.isActiveWorkersTrackingEnabled(); + } + + /** + * Explicitly enable or disable per-poll-cycle diagnostic events + * ({@code TaskPaused}, {@code TaskExecutionQueueFull}). When set, + * this overrides the default derived from the {@link MetricsCollector}. + * Canonical metrics enable this by default; legacy metrics do not. + */ + public Builder withDiagnosticEvents(boolean enabled) { + this.diagnosticEventsOverride = enabled; + return this; + } + + private boolean resolveDiagnosticEvents() { + if (diagnosticEventsOverride != null) { + return diagnosticEventsOverride; + } + return metricsCollector != null && metricsCollector.isDiagnosticEventsEnabled(); + } + public Builder withUseVirtualThreads(boolean useVirtualThreads) { this.useVirtualThreads = useVirtualThreads; return this; diff --git a/conductor-client/src/main/java/com/netflix/conductor/client/events/dispatcher/EventDispatcher.java b/conductor-client/src/main/java/com/netflix/conductor/client/events/dispatcher/EventDispatcher.java index 5bfdbd7fd..ac20eb490 100644 --- a/conductor-client/src/main/java/com/netflix/conductor/client/events/dispatcher/EventDispatcher.java +++ b/conductor-client/src/main/java/com/netflix/conductor/client/events/dispatcher/EventDispatcher.java @@ -52,12 +52,30 @@ public void publish(T event) { return; } - CompletableFuture.runAsync(() -> { - List> eventListeners = getEventListeners(event); - for (Consumer listener : eventListeners) { - ((Consumer) listener).accept(event); - } - }); + CompletableFuture.runAsync(() -> dispatchToListeners(event)); + } + + /** + * Dispatches {@code event} to registered listeners on the calling + * thread. 
Use this instead of {@link #publish(ConductorClientEvent)} when + * the caller cannot tolerate heap allocation or thread-pool submission + * (e.g. inside an {@link Thread.UncaughtExceptionHandler} where the JVM + * may be in an unstable state). + */ + public void publishSync(T event) { + if (noListeners(event)) { + return; + } + + dispatchToListeners(event); + } + + @SuppressWarnings("unchecked") + private void dispatchToListeners(T event) { + List> eventListeners = getEventListeners(event); + for (Consumer listener : eventListeners) { + ((Consumer) listener).accept(event); + } } private boolean noListeners(T event) { diff --git a/conductor-client/src/main/java/com/netflix/conductor/client/events/listeners/ListenerRegister.java b/conductor-client/src/main/java/com/netflix/conductor/client/events/listeners/ListenerRegister.java index 2b0db419d..985b182f3 100644 --- a/conductor-client/src/main/java/com/netflix/conductor/client/events/listeners/ListenerRegister.java +++ b/conductor-client/src/main/java/com/netflix/conductor/client/events/listeners/ListenerRegister.java @@ -12,41 +12,165 @@ */ package com.netflix.conductor.client.events.listeners; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.function.Consumer; + +import com.netflix.conductor.client.events.ConductorClientEvent; import com.netflix.conductor.client.events.dispatcher.EventDispatcher; import com.netflix.conductor.client.events.task.TaskClientEvent; import com.netflix.conductor.client.events.task.TaskPayloadUsedEvent; import com.netflix.conductor.client.events.task.TaskResultPayloadSizeEvent; +import com.netflix.conductor.client.events.taskrunner.ActiveWorkersChanged; import com.netflix.conductor.client.events.taskrunner.PollCompleted; import com.netflix.conductor.client.events.taskrunner.PollFailure; import com.netflix.conductor.client.events.taskrunner.PollStarted; +import com.netflix.conductor.client.events.taskrunner.TaskAckError; 
+import com.netflix.conductor.client.events.taskrunner.TaskAckFailure; import com.netflix.conductor.client.events.taskrunner.TaskExecutionCompleted; import com.netflix.conductor.client.events.taskrunner.TaskExecutionFailure; +import com.netflix.conductor.client.events.taskrunner.TaskExecutionQueueFull; import com.netflix.conductor.client.events.taskrunner.TaskExecutionStarted; +import com.netflix.conductor.client.events.taskrunner.TaskPaused; import com.netflix.conductor.client.events.taskrunner.TaskRunnerEvent; +import com.netflix.conductor.client.events.taskrunner.TaskUpdateCompleted; +import com.netflix.conductor.client.events.taskrunner.TaskUpdateFailure; +import com.netflix.conductor.client.events.taskrunner.ThreadUncaughtException; import com.netflix.conductor.client.events.workflow.WorkflowClientEvent; import com.netflix.conductor.client.events.workflow.WorkflowInputPayloadSizeEvent; import com.netflix.conductor.client.events.workflow.WorkflowPayloadUsedEvent; import com.netflix.conductor.client.events.workflow.WorkflowStartedEvent; +/** + * Idempotent registration (and unregistration) of event listeners onto + * {@link EventDispatcher} instances. + * + *

Why the dedup key includes the dispatcher reference

+ * + *

A single {@link com.netflix.conductor.client.metrics.MetricsCollector + * MetricsCollector} is intentionally registered on multiple, distinct + * dispatchers because each dispatcher is an independent event source: + * + *

<ol>
+ * <li>{@code TaskClient.eventDispatcher} — + * {@link TaskClientEvent}s (payload size, external storage)</li>
+ * <li>{@code TaskClient.taskRunnerEventDispatcher} — + * {@link TaskRunnerEvent}s emitted from {@code TaskClient.ack()}</li>
+ * <li>{@code WorkflowClient.eventDispatcher} — + * {@link WorkflowClientEvent}s (workflow started, payload size)</li>
+ * <li>{@code TaskRunnerConfigurer.Builder.eventDispatcher} — + * {@link TaskRunnerEvent}s from the poll/execute/update cycle</li>
+ * </ol>
+ * + *

The key {@code (listener, dispatcher)} therefore correctly allows the same + * collector to be wired onto all four dispatchers while preventing the same + * collector from being registered onto the same dispatcher twice + * (which would double-count every event on that source). + * + *

Lifecycle

+ * + *

Call the matching {@code unregister} overload during shutdown to release + * the entry from the static set and detach the listener from the dispatcher. + * This prevents the static set from accumulating stale entries in long-lived + * JVMs that re-create client or configurer instances. + */ public class ListenerRegister { - public static void register(TaskRunnerEventsListener listener, EventDispatcher dispatcher) { - dispatcher.register(PollFailure.class, listener::consume); - dispatcher.register(PollCompleted.class, listener::consume); - dispatcher.register(PollStarted.class, listener::consume); - dispatcher.register(TaskExecutionStarted.class, listener::consume); - dispatcher.register(TaskExecutionCompleted.class, listener::consume); - dispatcher.register(TaskExecutionFailure.class, listener::consume); + private static final Map> registered = + new ConcurrentHashMap<>(); + + // --- TaskRunnerEventsListener --- + + public static synchronized void register(TaskRunnerEventsListener listener, EventDispatcher dispatcher) { + RegistrationKey key = new RegistrationKey(listener, dispatcher); + if (registered.containsKey(key)) { + return; + } + + List bindings = List.of( + bind(dispatcher, PollFailure.class, listener::consume), + bind(dispatcher, PollCompleted.class, listener::consume), + bind(dispatcher, PollStarted.class, listener::consume), + bind(dispatcher, TaskExecutionStarted.class, listener::consume), + bind(dispatcher, TaskExecutionCompleted.class, listener::consume), + bind(dispatcher, TaskExecutionFailure.class, listener::consume), + bind(dispatcher, TaskUpdateCompleted.class, listener::consume), + bind(dispatcher, TaskUpdateFailure.class, listener::consume), + bind(dispatcher, TaskAckFailure.class, listener::consume), + bind(dispatcher, TaskAckError.class, listener::consume), + bind(dispatcher, TaskExecutionQueueFull.class, listener::consume), + bind(dispatcher, TaskPaused.class, listener::consume), + bind(dispatcher, ThreadUncaughtException.class, 
listener::consume), + bind(dispatcher, ActiveWorkersChanged.class, listener::consume)); + + registered.put(key, bindings); } - public static void register(TaskClientListener listener, EventDispatcher dispatcher) { - dispatcher.register(TaskResultPayloadSizeEvent.class, listener::consume); - dispatcher.register(TaskPayloadUsedEvent.class, listener::consume); + public static synchronized void unregister(TaskRunnerEventsListener listener, EventDispatcher dispatcher) { + removeBindings(new RegistrationKey(listener, dispatcher)); } - public static void register(WorkflowClientListener listener, EventDispatcher dispatcher) { - dispatcher.register(WorkflowStartedEvent.class, listener::consume); - dispatcher.register(WorkflowInputPayloadSizeEvent.class, listener::consume); - dispatcher.register(WorkflowPayloadUsedEvent.class, listener::consume); + // --- TaskClientListener --- + + public static synchronized void register(TaskClientListener listener, EventDispatcher dispatcher) { + RegistrationKey key = new RegistrationKey(listener, dispatcher); + if (registered.containsKey(key)) { + return; + } + + List bindings = List.of( + bind(dispatcher, TaskResultPayloadSizeEvent.class, listener::consume), + bind(dispatcher, TaskPayloadUsedEvent.class, listener::consume)); + + registered.put(key, bindings); + } + + public static synchronized void unregister(TaskClientListener listener, EventDispatcher dispatcher) { + removeBindings(new RegistrationKey(listener, dispatcher)); } + + // --- WorkflowClientListener --- + + public static synchronized void register(WorkflowClientListener listener, EventDispatcher dispatcher) { + RegistrationKey key = new RegistrationKey(listener, dispatcher); + if (registered.containsKey(key)) { + return; + } + + List bindings = List.of( + bind(dispatcher, WorkflowStartedEvent.class, listener::consume), + bind(dispatcher, WorkflowInputPayloadSizeEvent.class, listener::consume), + bind(dispatcher, WorkflowPayloadUsedEvent.class, listener::consume)); + + 
registered.put(key, bindings); + } + + public static synchronized void unregister(WorkflowClientListener listener, EventDispatcher dispatcher) { + removeBindings(new RegistrationKey(listener, dispatcher)); + } + + // --- internals --- + + @SuppressWarnings("unchecked") + private static void removeBindings(RegistrationKey key) { + List bindings = registered.remove(key); + if (bindings == null) { + return; + } + for (ConsumerBinding b : bindings) { + b.dispatcher.unregister(b.eventType, b.consumer); + } + } + + private static ConsumerBinding bind( + EventDispatcher dispatcher, Class eventType, Consumer consumer) { + dispatcher.register(eventType, consumer); + return new ConsumerBinding(dispatcher, eventType, consumer); + } + + private record RegistrationKey(Object listener, Object dispatcher) { } + + @SuppressWarnings("rawtypes") + private record ConsumerBinding(EventDispatcher dispatcher, Class eventType, Consumer consumer) { } } diff --git a/conductor-client/src/main/java/com/netflix/conductor/client/events/listeners/TaskRunnerEventsListener.java b/conductor-client/src/main/java/com/netflix/conductor/client/events/listeners/TaskRunnerEventsListener.java index b5b0c4f40..43ebce6d1 100644 --- a/conductor-client/src/main/java/com/netflix/conductor/client/events/listeners/TaskRunnerEventsListener.java +++ b/conductor-client/src/main/java/com/netflix/conductor/client/events/listeners/TaskRunnerEventsListener.java @@ -12,12 +12,20 @@ */ package com.netflix.conductor.client.events.listeners; +import com.netflix.conductor.client.events.taskrunner.ActiveWorkersChanged; import com.netflix.conductor.client.events.taskrunner.PollCompleted; import com.netflix.conductor.client.events.taskrunner.PollFailure; import com.netflix.conductor.client.events.taskrunner.PollStarted; +import com.netflix.conductor.client.events.taskrunner.TaskAckError; +import com.netflix.conductor.client.events.taskrunner.TaskAckFailure; import 
com.netflix.conductor.client.events.taskrunner.TaskExecutionCompleted; import com.netflix.conductor.client.events.taskrunner.TaskExecutionFailure; +import com.netflix.conductor.client.events.taskrunner.TaskExecutionQueueFull; import com.netflix.conductor.client.events.taskrunner.TaskExecutionStarted; +import com.netflix.conductor.client.events.taskrunner.TaskPaused; +import com.netflix.conductor.client.events.taskrunner.TaskUpdateCompleted; +import com.netflix.conductor.client.events.taskrunner.TaskUpdateFailure; +import com.netflix.conductor.client.events.taskrunner.ThreadUncaughtException; public interface TaskRunnerEventsListener { @@ -33,4 +41,20 @@ public interface TaskRunnerEventsListener { void consume(TaskExecutionFailure e); + default void consume(TaskUpdateCompleted e) {} + + default void consume(TaskUpdateFailure e) {} + + default void consume(TaskAckFailure e) {} + + default void consume(TaskAckError e) {} + + default void consume(TaskExecutionQueueFull e) {} + + default void consume(TaskPaused e) {} + + default void consume(ThreadUncaughtException e) {} + + default void consume(ActiveWorkersChanged e) {} + } diff --git a/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/ActiveWorkersChanged.java b/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/ActiveWorkersChanged.java new file mode 100644 index 000000000..4b4c66498 --- /dev/null +++ b/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/ActiveWorkersChanged.java @@ -0,0 +1,27 @@ +/* + * Copyright 2024 Conductor Authors. + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package com.netflix.conductor.client.events.taskrunner; + +import lombok.Getter; +import lombok.ToString; + +@Getter +@ToString +public final class ActiveWorkersChanged extends TaskRunnerEvent { + private final int count; + + public ActiveWorkersChanged(String taskType, int count) { + super(taskType); + this.count = count; + } +} diff --git a/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/TaskAckError.java b/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/TaskAckError.java new file mode 100644 index 000000000..27086fd29 --- /dev/null +++ b/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/TaskAckError.java @@ -0,0 +1,35 @@ +/* + * Copyright 2024 Conductor Authors. + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package com.netflix.conductor.client.events.taskrunner; + +import lombok.Getter; +import lombok.ToString; + +/** + * Published when the task-ack call to the server threw an exception + * client-side (network error, deserialization error, etc.). Distinct from + * {@link TaskAckFailure} which is raised when the server returned a non- + * success ack. + */ +@Getter +@ToString +public final class TaskAckError extends TaskRunnerEvent { + private final String taskId; + private final Throwable cause; + + public TaskAckError(String taskType, String taskId, Throwable cause) { + super(taskType); + this.taskId = taskId; + this.cause = cause; + } +} diff --git a/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/TaskAckFailure.java b/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/TaskAckFailure.java new file mode 100644 index 000000000..5371be08a --- /dev/null +++ b/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/TaskAckFailure.java @@ -0,0 +1,32 @@ +/* + * Copyright 2024 Conductor Authors. + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package com.netflix.conductor.client.events.taskrunner; + +import lombok.Getter; +import lombok.ToString; + +/** + * Published when the server responded to a task-ack with a non-success result + * (the ack did not throw but was declined). Distinct from {@link TaskAckError} + * which is raised when the ack call itself throws. + */ +@Getter +@ToString +public final class TaskAckFailure extends TaskRunnerEvent { + private final String taskId; + + public TaskAckFailure(String taskType, String taskId) { + super(taskType); + this.taskId = taskId; + } +} diff --git a/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/TaskExecutionQueueFull.java b/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/TaskExecutionQueueFull.java new file mode 100644 index 000000000..866e54df5 --- /dev/null +++ b/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/TaskExecutionQueueFull.java @@ -0,0 +1,27 @@ +/* + * Copyright 2024 Conductor Authors. + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package com.netflix.conductor.client.events.taskrunner; + +import lombok.ToString; + +/** + * Published when a poll cycle finds zero available permits (all worker + * threads are busy), indicating the worker is at capacity for this + * task type. + */ +@ToString +public final class TaskExecutionQueueFull extends TaskRunnerEvent { + public TaskExecutionQueueFull(String taskType) { + super(taskType); + } +} diff --git a/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/TaskPaused.java b/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/TaskPaused.java new file mode 100644 index 000000000..fe6cb9c6e --- /dev/null +++ b/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/TaskPaused.java @@ -0,0 +1,26 @@ +/* + * Copyright 2024 Conductor Authors. + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package com.netflix.conductor.client.events.taskrunner; + +import lombok.ToString; + +/** + * Published when a poll cycle is skipped for a task type because the worker + * has been paused externally. + */ +@ToString +public final class TaskPaused extends TaskRunnerEvent { + public TaskPaused(String taskType) { + super(taskType); + } +} diff --git a/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/TaskUpdateCompleted.java b/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/TaskUpdateCompleted.java new file mode 100644 index 000000000..b31c1400d --- /dev/null +++ b/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/TaskUpdateCompleted.java @@ -0,0 +1,40 @@ +/* + * Copyright 2024 Conductor Authors. + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package com.netflix.conductor.client.events.taskrunner; + +import java.time.Duration; + +import lombok.Getter; +import lombok.ToString; + +/** + * Published when the worker successfully reports a task result back to the + * Conductor server (an UpdateTask / UpdateTaskV2 call that returned without + * throwing). + */ +@Getter +@ToString +public final class TaskUpdateCompleted extends TaskRunnerEvent { + private final String taskId; + private final String workerId; + private final String workflowInstanceId; + private final Duration duration; + + public TaskUpdateCompleted(String taskType, String taskId, String workerId, String workflowInstanceId, long durationInMillis) { + super(taskType); + this.taskId = taskId; + this.workerId = workerId; + this.workflowInstanceId = workflowInstanceId; + this.duration = Duration.ofMillis(durationInMillis); + } +} diff --git a/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/TaskUpdateFailure.java b/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/TaskUpdateFailure.java new file mode 100644 index 000000000..85c80c8f7 --- /dev/null +++ b/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/TaskUpdateFailure.java @@ -0,0 +1,43 @@ +/* + * Copyright 2024 Conductor Authors. + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package com.netflix.conductor.client.events.taskrunner; + +import java.time.Duration; + +import lombok.Getter; +import lombok.ToString; + +/** + * Published when reporting a task result back to the Conductor server failed. + * {@code TaskRunner.updateTaskResult} publishes it from the catch block that + * encloses the whole retried UpdateTask / UpdateTaskV2 call, so it is emitted + * at most once per update cycle, not once per individual failed attempt. + */ +@Getter +@ToString +public final class TaskUpdateFailure extends TaskRunnerEvent { + private final String taskId; + private final String workerId; + private final String workflowInstanceId; + private final Duration duration; + private final Throwable cause; + + public TaskUpdateFailure(String taskType, String taskId, String workerId, String workflowInstanceId, Throwable cause, long durationInMillis) { + super(taskType); + this.taskId = taskId; + this.workerId = workerId; + this.workflowInstanceId = workflowInstanceId; + this.cause = cause; + this.duration = Duration.ofMillis(durationInMillis); + } +} diff --git a/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/ThreadUncaughtException.java b/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/ThreadUncaughtException.java new file mode 100644 index 000000000..b425ba1b7 --- /dev/null +++ b/conductor-client/src/main/java/com/netflix/conductor/client/events/taskrunner/ThreadUncaughtException.java @@ -0,0 +1,39 @@ +/* + * Copyright 2024 Conductor Authors. + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package com.netflix.conductor.client.events.taskrunner; + +import lombok.Getter; +import lombok.ToString; + +/** + * Published when a worker thread terminates with an uncaught exception (the + * default {@link Thread.UncaughtExceptionHandler} path). + * + * The {@code taskType} is empty for threads that are not tied to a specific + * worker (e.g. internal executor threads shared by the runner). + */ +@Getter +@ToString +public final class ThreadUncaughtException extends TaskRunnerEvent { + private final Throwable cause; + + public ThreadUncaughtException(String taskType, Throwable cause) { + super(taskType); + this.cause = cause; + } + + public ThreadUncaughtException(Throwable cause) { + super(""); + this.cause = cause; + } +} diff --git a/conductor-client/src/main/java/com/netflix/conductor/client/http/ConductorClient.java b/conductor-client/src/main/java/com/netflix/conductor/client/http/ConductorClient.java index 2e198c017..b156efb03 100644 --- a/conductor-client/src/main/java/com/netflix/conductor/client/http/ConductorClient.java +++ b/conductor-client/src/main/java/com/netflix/conductor/client/http/ConductorClient.java @@ -48,6 +48,9 @@ import org.slf4j.LoggerFactory; import com.netflix.conductor.client.exception.ConductorClientException; +import com.netflix.conductor.client.metrics.ApiClientMetrics; +import com.netflix.conductor.client.metrics.MetricsCollector; +import com.netflix.conductor.client.metrics.PayloadKind; import com.netflix.conductor.common.config.ObjectMapperProvider; import com.fasterxml.jackson.core.JsonProcessingException; @@ -75,6 +78,7 @@ public class ConductorClient { private final InputStream sslCaCert; private final 
KeyManager[] keyManagers; private final List headerSuppliers; + private final MetricsCollector metricsCollector; public static Builder builder() { return new Builder<>(); @@ -90,6 +94,17 @@ protected ConductorClient(Builder builder) { this.sslCaCert = builder.sslCaCert; this.keyManagers = builder.keyManagers; this.headerSuppliers = builder.headerSupplier(); + this.metricsCollector = builder.metricsCollector; + + ApiClientMetrics apiClientMetrics = null; + if (this.metricsCollector != null) { + apiClientMetrics = this.metricsCollector.getApiClientMetrics(); + } else if (builder.httpMetrics != null) { + apiClientMetrics = builder.httpMetrics; + } + if (apiClientMetrics != null && apiClientMetrics != ApiClientMetrics.NOOP) { + okHttpBuilder.addInterceptor(new ApiClientMetricsOkHttpInterceptor(apiClientMetrics)); + } if (builder.connectTimeout > -1) { okHttpBuilder.connectTimeout(builder.connectTimeout, TimeUnit.MILLISECONDS); @@ -143,6 +158,16 @@ public String getBasePath() { return basePath; } + /** + * Returns the {@link MetricsCollector} associated with this client, or + * {@code null} if none was set at build time. Downstream clients + * ({@code TaskClient}, {@code WorkflowClient}) use this to auto-register + * themselves as listeners when metrics are wired through the builder. 
+ */ + public MetricsCollector getMetricsCollector() { + return metricsCollector; + } + public void shutdown() { okHttpClient.dispatcher().executorService().shutdown(); okHttpClient.connectionPool().evictAll(); @@ -234,7 +259,8 @@ public ConductorClientResponse execute(ConductorClientRequest req, TypeRe pathParams, queryParams, headerParams, - req.getBody()); + req.getBody(), + req.getPayloadKind()); Call call = okHttpClient.newCall(request); if (typeReference == null) { @@ -408,10 +434,25 @@ protected Request buildRequest(String method, List queryParams, Map headers, Object body) { + return buildRequest(method, path, pathParams, queryParams, headers, body, null); + } + + protected Request buildRequest(String method, + String path, + List pathParams, + List queryParams, + Map headers, + Object body, + PayloadKind payloadKind) { final HttpUrl url = buildUrl(replacePathParams(path, pathParams), queryParams); final Request.Builder requestBuilder = new Request.Builder().url(url); processHeaderParams(requestBuilder, addHeadersFromProviders(method, path, headers)); RequestBody reqBody = requestBody(method, getContentType(headers), body); + requestBuilder.tag(PathTemplateTag.class, new PathTemplateTag(path)); + if (payloadKind != null) { + // Read by ApiClientMetricsOkHttpInterceptor at wire time. + requestBuilder.tag(PayloadKind.class, payloadKind); + } return requestBuilder.method(method, reqBody).build(); } @@ -564,6 +605,8 @@ public static class Builder> { private ConnectionPoolConfig connectionPoolConfig; private Supplier objectMapperSupplier = () -> new ObjectMapperProvider().getObjectMapper(); private final List headerSuppliers = new ArrayList<>(); + MetricsCollector metricsCollector; + private ApiClientMetrics httpMetrics; private boolean useEnvVariables = false; @@ -650,6 +693,36 @@ public T addHeaderSupplier(HeaderSupplier headerSupplier) { return self(); } + /** + * Attach a {@link MetricsCollector} to the client. 
The collector's + * {@link ApiClientMetrics} will be wired as an OkHttp interceptor + * automatically, and downstream clients ({@code TaskClient}, + * {@code WorkflowClient}, {@code TaskRunnerConfigurer}) will + * auto-register as listeners when constructed with this client. + */ + public T withMetricsCollector(MetricsCollector metricsCollector) { + this.metricsCollector = metricsCollector; + return self(); + } + + /** + * Install only the HTTP metrics interceptor without triggering + * automatic listener registration on downstream clients. Use this + * when you want {@code http_api_client_request_seconds}, + * {@code task_result_size_bytes}, and + * {@code workflow_input_size_bytes} but prefer to wire event + * listeners explicitly via {@code registerListener} / + * {@code registerTaskRunnerListener}. + * + *

If {@link #withMetricsCollector} is also called, the full + * auto-registration takes effect and this method has no additional + * impact (the interceptor is installed either way). + */ + public T withHttpMetrics(MetricsCollector metricsCollector) { + this.httpMetrics = metricsCollector != null ? metricsCollector.getApiClientMetrics() : null; + return self(); + } + protected List headerSupplier() { return headerSuppliers; } @@ -689,4 +762,79 @@ protected void applyEnvVariables() { } } } + + /** + * Marker tag attached to outbound OkHttp requests so the metrics + * interceptor can read a bounded-cardinality URI label (the path + * template, e.g. {@code "/workflow/{workflowId}"}) instead of the + * resolved URL which contains per-request IDs. + * + *

Uses a dedicated class rather than {@code String.class} so the + * tag slot does not collide with user-installed interceptors that may + * use {@code request.tag(String.class)} for their own purposes. + */ + public static final class PathTemplateTag { + private final String path; + public PathTemplateTag(String path) { this.path = path; } + public String path() { return path; } + } + + /** + * Lightweight OkHttp interceptor that delegates to {@link ApiClientMetrics}. + * Lives in {@code conductor-client} so we don't need a dependency on the + * {@code conductor-client-metrics} module just for the builder integration. + * Safe to use with {@link ApiClientMetrics#NOOP}. + */ + private static final class ApiClientMetricsOkHttpInterceptor implements okhttp3.Interceptor { + private final ApiClientMetrics metrics; + + ApiClientMetricsOkHttpInterceptor(ApiClientMetrics metrics) { + this.metrics = metrics == null ? ApiClientMetrics.NOOP : metrics; + } + + @Override + public Response intercept(Chain chain) throws IOException { + Request request = chain.request(); + long startNanos = System.nanoTime(); + IOException ioError = null; + Response response = null; + try { + response = chain.proceed(request); + return response; + } catch (IOException e) { + ioError = e; + throw e; + } finally { + long elapsedNanos = System.nanoTime() - startNanos; + try { + String method = request.method(); + PathTemplateTag tag = request.tag(PathTemplateTag.class); + String uri = tag != null ? tag.path() : request.url().encodedPath(); + int status = response != null ? response.code() + : (ioError != null ? 
-1 : 0); + metrics.recordRequest(method, uri, status, + java.time.Duration.ofNanos(elapsedNanos)); + recordPayloadSizeIfTagged(request); + } catch (Throwable ignored) { + } + } + } + + private void recordPayloadSizeIfTagged(Request request) { + PayloadKind kind = request.tag(PayloadKind.class); + if (kind == null || request.body() == null) { + return; + } + long len; + try { + len = request.body().contentLength(); + } catch (IOException e) { + return; + } + if (len < 0) { + return; + } + kind.recordSize(metrics, len); + } + } } diff --git a/conductor-client/src/main/java/com/netflix/conductor/client/http/ConductorClientRequest.java b/conductor-client/src/main/java/com/netflix/conductor/client/http/ConductorClientRequest.java index a1e0ffa12..b7834617c 100644 --- a/conductor-client/src/main/java/com/netflix/conductor/client/http/ConductorClientRequest.java +++ b/conductor-client/src/main/java/com/netflix/conductor/client/http/ConductorClientRequest.java @@ -17,6 +17,8 @@ import java.util.List; import java.util.Map; +import com.netflix.conductor.client.metrics.PayloadKind; + import lombok.EqualsAndHashCode; import lombok.Getter; @@ -34,6 +36,14 @@ public enum Method { private final List queryParams; private final Map headerParams; private final Object body; + /** + * Optional discriminator read by the {@code ApiClientMetrics} OkHttp + * interceptor at wire time so it can record canonical payload-size + * histograms (e.g. {@code task_result_size_bytes}, + * {@code workflow_input_size_bytes}) without forcing the caller to + * serialize the body a second time. 
+ */ + private final PayloadKind payloadKind; private ConductorClientRequest(Builder builder) { this.method = builder.method; @@ -42,6 +52,7 @@ private ConductorClientRequest(Builder builder) { this.queryParams = builder.queryParams; this.headerParams = builder.headerParams; this.body = builder.body; + this.payloadKind = builder.payloadKind; } public static Builder builder() { @@ -55,6 +66,7 @@ public static class Builder { private final List queryParams = new ArrayList<>(); private final Map headerParams = new HashMap<>(); private Object body; + private PayloadKind payloadKind; public Builder method(Method method) { if (method == null) { @@ -143,6 +155,17 @@ public Builder body(Object body) { return this; } + /** + * Tag the request with a {@link PayloadKind} so the metrics + * interceptor can label canonical payload-size histograms at wire + * time. May be {@code null} (the default), in which case no size + * histogram is recorded for this request. + */ + public Builder payloadKind(PayloadKind payloadKind) { + this.payloadKind = payloadKind; + return this; + } + public ConductorClientRequest build() { return new ConductorClientRequest(this); } diff --git a/conductor-client/src/main/java/com/netflix/conductor/client/http/TaskClient.java b/conductor-client/src/main/java/com/netflix/conductor/client/http/TaskClient.java index ce15bd939..72652f42d 100644 --- a/conductor-client/src/main/java/com/netflix/conductor/client/http/TaskClient.java +++ b/conductor-client/src/main/java/com/netflix/conductor/client/http/TaskClient.java @@ -33,11 +33,16 @@ import com.netflix.conductor.client.events.dispatcher.EventDispatcher; import com.netflix.conductor.client.events.listeners.ListenerRegister; import com.netflix.conductor.client.events.listeners.TaskClientListener; +import com.netflix.conductor.client.events.listeners.TaskRunnerEventsListener; import com.netflix.conductor.client.events.task.TaskClientEvent; import com.netflix.conductor.client.events.task.TaskPayloadUsedEvent; 
-import com.netflix.conductor.client.events.task.TaskResultPayloadSizeEvent; +import com.netflix.conductor.client.events.taskrunner.TaskAckError; +import com.netflix.conductor.client.events.taskrunner.TaskAckFailure; +import com.netflix.conductor.client.events.taskrunner.TaskRunnerEvent; import com.netflix.conductor.client.exception.ConductorClientException; import com.netflix.conductor.client.http.ConductorClientRequest.Method; +import com.netflix.conductor.client.metrics.MetricsCollector; +import com.netflix.conductor.client.metrics.PayloadKind; import com.netflix.conductor.common.config.ObjectMapperProvider; import com.netflix.conductor.common.metadata.tasks.PollData; import com.netflix.conductor.common.metadata.tasks.Task; @@ -93,8 +98,12 @@ public class TaskClient { private final EventDispatcher eventDispatcher = new EventDispatcher<>(); + private final EventDispatcher taskRunnerEventDispatcher = new EventDispatcher<>(); + private PayloadStorage payloadStorage; + private MetricsCollector metricsCollector; + protected ConductorClient client; protected String localhost = "localhost"; @@ -118,6 +127,14 @@ public TaskClient(ConductorClient client, ConductorClientConfiguration config) { this.client = client; this.payloadStorage = new PayloadStorage(client); this.conductorClientConfiguration = config; + if (client != null) { + MetricsCollector mc = client.getMetricsCollector(); + if (mc != null && mc.isAutoWiringEnabled()) { + this.metricsCollector = mc; + registerListener(mc); + registerTaskRunnerListener(mc); + } + } } /** @@ -134,10 +151,35 @@ public void setRootURI(String rootUri) { payloadStorage = new PayloadStorage(client); } + public ConductorClient getConductorClient() { + return client; + } + public void registerListener(TaskClientListener listener) { ListenerRegister.register(listener, eventDispatcher); } + /** + * Register a {@link TaskRunnerEventsListener} with this {@code TaskClient}. + * Used for canonical task-runner events (e.g. 
{@code TaskAckFailure}/ + * {@code TaskAckError}) emitted from within {@code TaskClient} itself. + */ + public void registerTaskRunnerListener(TaskRunnerEventsListener listener) { + ListenerRegister.register(listener, taskRunnerEventDispatcher); + } + + /** + * Detaches the auto-wired {@link MetricsCollector} (if any) from this + * client's dispatchers. Called during shutdown to release the entries from + * {@link ListenerRegister}'s static registry. + */ + public void unregisterListeners() { + if (metricsCollector != null) { + ListenerRegister.unregister(metricsCollector, eventDispatcher); + ListenerRegister.unregister(metricsCollector, taskRunnerEventDispatcher); + } + } + /** * Perform a poll for a task of a specific task type. * @@ -225,11 +267,21 @@ public List batchPollTasksInDomain(String taskType, String domain, String * @param taskResult the {@link TaskResult} of the executed task to be updated. */ public void updateTask(TaskResult taskResult) { + updateTask(taskResult, null); + } + + /** + * Same as {@link #updateTask(TaskResult)} but propagates {@code taskType} + * to the canonical {@code task_result_size_bytes} histogram (since + * {@link TaskResult} does not carry the task definition name itself). + */ + public void updateTask(TaskResult taskResult, String taskType) { Validate.notNull(taskResult, "Task result cannot be null"); ConductorClientRequest request = ConductorClientRequest.builder() .method(Method.POST) .path("/tasks") .body(taskResult) + .payloadKind(new PayloadKind.TaskResult(taskType)) .build(); client.execute(request); @@ -244,11 +296,21 @@ public void updateTask(TaskResult taskResult) { * @param taskResult the {@link TaskResult} of the executed task to be updated. 
*/ public Task updateTaskV2(TaskResult taskResult) { + return updateTaskV2(taskResult, null); + } + + /** + * Same as {@link #updateTaskV2(TaskResult)} but propagates {@code taskType} + * to the canonical {@code task_result_size_bytes} histogram (since + * {@link TaskResult} does not carry the task definition name itself). + */ + public Task updateTaskV2(TaskResult taskResult, String taskType) { Validate.notNull(taskResult, "Task result cannot be null"); ConductorClientRequest request = ConductorClientRequest.builder() .method(Method.POST) .path("/tasks/update-v2") .body(taskResult) + .payloadKind(new PayloadKind.TaskResult(taskType)) .build(); ConductorClientResponse response = client.execute(request, TASK_TYPE); @@ -259,12 +321,10 @@ public Optional evaluateAndUploadLargePayload(Map taskOu if (!conductorClientConfiguration.isEnforceThresholds()) { return Optional.empty(); } - try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) { objectMapper.writeValue(byteArrayOutputStream, taskOutputData); byte[] taskOutputBytes = byteArrayOutputStream.toByteArray(); long taskResultSize = taskOutputBytes.length; - eventDispatcher.publish(new TaskResultPayloadSizeEvent(taskType, taskResultSize)); long payloadSizeThreshold = conductorClientConfiguration.getTaskOutputPayloadThresholdKB() * 1024L; if (taskResultSize > payloadSizeThreshold) { if (!conductorClientConfiguration.isExternalPayloadStorageEnabled() @@ -294,8 +354,21 @@ public Optional evaluateAndUploadLargePayload(Map taskOu * @return true if the task was found with the given ID and acknowledged. False * otherwise. If * the server returns false, the client should NOT attempt to ack again. + * @deprecated This overload lacks the {@code taskType} parameter, so canonical + * ack metrics ({@code task_ack_failed_total{taskType}}, + * {@code task_ack_error_total{taskType}}) cannot be labeled. + * Use {@link #ack(String, String, String)} instead. 
*/ + @Deprecated public Boolean ack(String taskId, String workerId) { + return ack(null, taskId, workerId); + } + + /** + * Ack variant that emits canonical task-runner metrics ({@code TaskAckFailure} + * / {@code TaskAckError}) when {@code taskType} is known. + */ + public Boolean ack(String taskType, String taskId, String workerId) { Validate.notBlank(taskId, "Task id cannot be blank"); ConductorClientRequest request = ConductorClientRequest.builder() .method(Method.POST) @@ -304,9 +377,20 @@ public Boolean ack(String taskId, String workerId) { .addQueryParam("workerid", workerId) .build(); - ConductorClientResponse response = client.execute(request, BOOLEAN_TYPE); - - return response.getData(); + boolean trackAck = taskType != null && metricsCollector != null; + try { + ConductorClientResponse response = client.execute(request, BOOLEAN_TYPE); + Boolean acked = response.getData(); + if (trackAck && !Boolean.TRUE.equals(acked)) { + taskRunnerEventDispatcher.publish(new TaskAckFailure(taskType, taskId)); + } + return acked; + } catch (Throwable t) { + if (trackAck) { + taskRunnerEventDispatcher.publish(new TaskAckError(taskType, taskId, t)); + } + throw t; + } } /** diff --git a/conductor-client/src/main/java/com/netflix/conductor/client/http/WorkflowClient.java b/conductor-client/src/main/java/com/netflix/conductor/client/http/WorkflowClient.java index 31694d288..f79bb4e1c 100644 --- a/conductor-client/src/main/java/com/netflix/conductor/client/http/WorkflowClient.java +++ b/conductor-client/src/main/java/com/netflix/conductor/client/http/WorkflowClient.java @@ -46,11 +46,12 @@ import com.netflix.conductor.client.events.listeners.ListenerRegister; import com.netflix.conductor.client.events.listeners.WorkflowClientListener; import com.netflix.conductor.client.events.workflow.WorkflowClientEvent; -import com.netflix.conductor.client.events.workflow.WorkflowInputPayloadSizeEvent; import com.netflix.conductor.client.events.workflow.WorkflowPayloadUsedEvent; import 
com.netflix.conductor.client.events.workflow.WorkflowStartedEvent; import com.netflix.conductor.client.exception.ConductorClientException; import com.netflix.conductor.client.http.ConductorClientRequest.Method; +import com.netflix.conductor.client.metrics.MetricsCollector; +import com.netflix.conductor.client.metrics.PayloadKind; import com.netflix.conductor.common.config.ObjectMapperProvider; import com.netflix.conductor.common.metadata.workflow.RerunWorkflowRequest; import com.netflix.conductor.common.metadata.workflow.SkipTaskRequest; @@ -102,6 +103,8 @@ public class WorkflowClient implements AutoCloseable { private final EventDispatcher eventDispatcher = new EventDispatcher<>(); + private MetricsCollector metricsCollector; + protected ConductorClient client; private PayloadStorage payloadStorage; @@ -140,6 +143,14 @@ public WorkflowClient(ConductorClient client, ConductorClientConfiguration confi } else { this.executorService = Executors.newFixedThreadPool(executorThreadCount, factory); } + + if (client != null) { + MetricsCollector mc = client.getMetricsCollector(); + if (mc != null && mc.isAutoWiringEnabled()) { + this.metricsCollector = mc; + registerListener(mc); + } + } } @Override @@ -147,6 +158,9 @@ public void close() { if (executorService != null) { executorService.shutdown(); } + if (metricsCollector != null) { + ListenerRegister.unregister(metricsCollector, eventDispatcher); + } } /** @@ -191,6 +205,8 @@ public String startWorkflow(StartWorkflowRequest startWorkflowRequest) { .method(Method.POST) .path("/workflow") .body(startWorkflowRequest) + .payloadKind(new PayloadKind.WorkflowInput( + startWorkflowRequest.getName(), startWorkflowRequest.getVersion())) .build(); ConductorClientResponse resp = client.execute(request, STRING_TYPE); @@ -205,8 +221,6 @@ public void checkAndUploadToExternalStorage(StartWorkflowRequest startWorkflowRe objectMapper.writeValue(byteArrayOutputStream, startWorkflowRequest.getInput()); byte[] workflowInputBytes = 
byteArrayOutputStream.toByteArray(); long workflowInputSize = workflowInputBytes.length; - eventDispatcher.publish(new WorkflowInputPayloadSizeEvent(startWorkflowRequest.getName(), - startWorkflowRequest.getVersion(), workflowInputSize)); if (workflowInputSize > conductorClientConfiguration.getWorkflowInputPayloadThresholdKB() * 1024L) { if (!conductorClientConfiguration.isExternalPayloadStorageEnabled() || diff --git a/conductor-client/src/main/java/com/netflix/conductor/client/metrics/ApiClientMetrics.java b/conductor-client/src/main/java/com/netflix/conductor/client/metrics/ApiClientMetrics.java new file mode 100644 index 000000000..94788a611 --- /dev/null +++ b/conductor-client/src/main/java/com/netflix/conductor/client/metrics/ApiClientMetrics.java @@ -0,0 +1,83 @@ +/* + * Copyright 2024 Conductor Authors. + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package com.netflix.conductor.client.metrics; + +import java.time.Duration; + +/** + * Hook for recording metrics about the HTTP calls made by the generated / + * handwritten Conductor API clients. + * + *

Canonical metrics emitted by implementations: + *

+ *   http_api_client_request_seconds{method, uri, status}                 (Histogram)
+ *   task_result_size_bytes{taskType}                                     (Histogram)
+ *   workflow_input_size_bytes{workflowType, version}                     (Histogram)
+ * 
+ * + *

The size histograms are populated at wire time from the OkHttp + * {@code RequestBody.contentLength()} of bodies tagged with a + * {@link PayloadKind}; this avoids the previous double-JSON-serialization + * cost in {@code TaskClient}/{@code WorkflowClient}, and decouples + * payload-size observability from {@code isEnforceThresholds}. + * + *

Keeping this as an interface (rather than wiring directly to any + * particular metrics backend) lets {@code conductor-client} stay free of a + * Micrometer / Prometheus dependency; the {@code conductor-client-metrics} + * module ships the {@code PrometheusApiClientMetrics} implementation. + */ +public interface ApiClientMetrics { + + /** + * Record a single HTTP request the SDK issued to the Conductor server. + * + * @param method HTTP verb (GET, POST, ...). Never null. + * @param uri Request path. + * @param statusCode HTTP status code of the response, or a negative + * value if the request failed before a status was + * received (network error, timeout). Implementations + * typically translate negative values to a + * {@code status="0"} label. + * @param duration Wall-clock time between request issue and response + * received (or error raised). Never null. + */ + void recordRequest(String method, String uri, int statusCode, Duration duration); + + /** + * Record the serialized size of a task-result update body. Default no-op + * so existing implementations stay source- and binary-compatible. + * + * @param taskType Task definition name. May be empty/null if unknown. + * @param sizeBytes Size of the JSON body in bytes (from + * {@code RequestBody.contentLength()}). Implementations + * should ignore negative values. + */ + default void recordTaskResultSize(String taskType, long sizeBytes) { } + + /** + * Record the serialized size of a workflow-start input body. Default no-op + * so existing implementations stay source- and binary-compatible. + * + * @param workflowType Workflow definition name. May be empty/null if unknown. + * @param version Workflow version. May be null. + * @param sizeBytes Size of the JSON body in bytes (from + * {@code RequestBody.contentLength()}). Implementations + * should ignore negative values. 
+ */ + default void recordWorkflowInputSize(String workflowType, Integer version, long sizeBytes) { } + + /** + * No-op instance for callers that want a non-null default. + */ + ApiClientMetrics NOOP = (method, uri, statusCode, duration) -> { }; +} diff --git a/conductor-client/src/main/java/com/netflix/conductor/client/metrics/MetricsCollector.java b/conductor-client/src/main/java/com/netflix/conductor/client/metrics/MetricsCollector.java index 82cc1d188..a7a3914b1 100644 --- a/conductor-client/src/main/java/com/netflix/conductor/client/metrics/MetricsCollector.java +++ b/conductor-client/src/main/java/com/netflix/conductor/client/metrics/MetricsCollector.java @@ -18,4 +18,71 @@ public interface MetricsCollector extends TaskRunnerEventsListener, WorkflowClientListener, TaskClientListener { + default ApiClientMetrics getApiClientMetrics() { + return ApiClientMetrics.NOOP; + } + + /** + * Whether downstream clients ({@code TaskClient}, {@code WorkflowClient}, + * {@code TaskRunnerConfigurer}) should automatically register this + * collector as an event listener when a {@code ConductorClient} built with + * {@link com.netflix.conductor.client.http.ConductorClient.Builder#withMetricsCollector} + * is detected. + * + *

Defaults to {@code false} so that legacy SDK upgraders see no + * constructor side-effects. The canonical collector overrides this to + * {@code true}. Call {@link #setAutoWiringEnabled(boolean)} to override + * the default for any implementation. + */ + default boolean isAutoWiringEnabled() { + return false; + } + + /** + * Override the default auto-wiring behavior. No-op by default; concrete + * implementations that support the toggle should override this. + */ + default void setAutoWiringEnabled(boolean enabled) { } + + /** + * Whether {@code TaskRunner} should publish {@code ActiveWorkersChanged} + * events on every task start/finish to drive the {@code active_workers} + * gauge. This adds two async event dispatches per task execution. + * + *

Defaults to {@code false} so legacy SDK upgraders see no additional + * hot-path overhead. The canonical collector overrides this to + * {@code true}. Call {@link #setActiveWorkersTrackingEnabled(boolean)} to + * override the default for any implementation. + */ + default boolean isActiveWorkersTrackingEnabled() { + return false; + } + + /** + * Override the default active-workers tracking behavior. No-op by + * default; concrete implementations that support the toggle should + * override this. + */ + default void setActiveWorkersTrackingEnabled(boolean enabled) { } + + /** + * Whether {@code TaskRunner} should publish per-poll-cycle diagnostic + * events ({@code TaskPaused}, {@code TaskExecutionQueueFull}) and + * whether {@code TaskClient} should emit ack diagnostic events + * ({@code TaskAckFailure}, {@code TaskAckError}). + * + *

Defaults to {@code false} so legacy SDK upgraders see no + * additional hot-path overhead. The canonical collector overrides this + * to {@code true}. Call {@link #setDiagnosticEventsEnabled(boolean)} + * to override the default for any implementation. + */ + default boolean isDiagnosticEventsEnabled() { + return false; + } + + /** + * Override the default diagnostic-events behavior. No-op by default; + * concrete implementations that support the toggle should override this. + */ + default void setDiagnosticEventsEnabled(boolean enabled) { } } diff --git a/conductor-client/src/main/java/com/netflix/conductor/client/metrics/PayloadKind.java b/conductor-client/src/main/java/com/netflix/conductor/client/metrics/PayloadKind.java new file mode 100644 index 000000000..f31a5fa6a --- /dev/null +++ b/conductor-client/src/main/java/com/netflix/conductor/client/metrics/PayloadKind.java @@ -0,0 +1,56 @@ +/* + * Copyright 2024 Conductor Authors. + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package com.netflix.conductor.client.metrics; + +/** + * Discriminator attached to outbound {@code ConductorClientRequest}s so the + * {@link ApiClientMetrics} interceptor can label payload-size histograms with + * the right canonical labels without serializing the body twice. + * + *

The tag is read at wire time from the OkHttp {@code Request} (set via + * {@code Request.Builder.tag(PayloadKind.class, …)}); the body's + * {@code contentLength()} is then exact and free, since the body has already + * been built by the HTTP layer. + * + *

Two kinds are recognised today: + *

    + *
  • {@link TaskResult} — body is a {@code TaskResult}; recorded as + * {@code task_result_size_bytes{taskType}}.
  • + *
  • {@link WorkflowInput} — body is a {@code StartWorkflowRequest}; + * recorded as {@code workflow_input_size_bytes{workflowType,version}}.
  • + *
+ * + *

Add a new kind by adding a permitted record below and a matching + * {@code recordSize(...)} hook on {@link ApiClientMetrics}. + */ +public sealed interface PayloadKind { + + /** Dispatch the recorded body size to the right canonical histogram. */ + void recordSize(ApiClientMetrics metrics, long sizeBytes); + + /** Marker for task-result update bodies (e.g. {@code POST /tasks}). */ + record TaskResult(String taskType) implements PayloadKind { + @Override + public void recordSize(ApiClientMetrics metrics, long sizeBytes) { + metrics.recordTaskResultSize(taskType, sizeBytes); + } + } + + /** Marker for workflow-start bodies (e.g. {@code POST /workflow}). */ + record WorkflowInput(String workflowType, Integer version) implements PayloadKind { + @Override + public void recordSize(ApiClientMetrics metrics, long sizeBytes) { + metrics.recordWorkflowInputSize(workflowType, version, sizeBytes); + } + } +} diff --git a/conductor-client/src/test/java/com/netflix/conductor/client/events/EventPojoTests.java b/conductor-client/src/test/java/com/netflix/conductor/client/events/EventPojoTests.java index fd53aba14..0568cf935 100644 --- a/conductor-client/src/test/java/com/netflix/conductor/client/events/EventPojoTests.java +++ b/conductor-client/src/test/java/com/netflix/conductor/client/events/EventPojoTests.java @@ -18,12 +18,20 @@ import com.netflix.conductor.client.events.task.TaskPayloadUsedEvent; import com.netflix.conductor.client.events.task.TaskResultPayloadSizeEvent; +import com.netflix.conductor.client.events.taskrunner.ActiveWorkersChanged; import com.netflix.conductor.client.events.taskrunner.PollCompleted; import com.netflix.conductor.client.events.taskrunner.PollFailure; import com.netflix.conductor.client.events.taskrunner.PollStarted; +import com.netflix.conductor.client.events.taskrunner.TaskAckError; +import com.netflix.conductor.client.events.taskrunner.TaskAckFailure; import com.netflix.conductor.client.events.taskrunner.TaskExecutionCompleted; import 
com.netflix.conductor.client.events.taskrunner.TaskExecutionFailure; +import com.netflix.conductor.client.events.taskrunner.TaskExecutionQueueFull; import com.netflix.conductor.client.events.taskrunner.TaskExecutionStarted; +import com.netflix.conductor.client.events.taskrunner.TaskPaused; +import com.netflix.conductor.client.events.taskrunner.TaskUpdateCompleted; +import com.netflix.conductor.client.events.taskrunner.TaskUpdateFailure; +import com.netflix.conductor.client.events.taskrunner.ThreadUncaughtException; import com.netflix.conductor.client.events.workflow.WorkflowInputPayloadSizeEvent; import com.netflix.conductor.client.events.workflow.WorkflowPayloadUsedEvent; import com.netflix.conductor.client.events.workflow.WorkflowStartedEvent; @@ -96,6 +104,15 @@ void testTaskExecutionFailure() { assertNotNull(event.getTime()); } + @Test + void testActiveWorkersChanged() { + ActiveWorkersChanged event = new ActiveWorkersChanged("SIMPLE", 3); + + assertEquals("SIMPLE", event.getTaskType()); + assertEquals(3, event.getCount()); + assertNotNull(event.getTime()); + } + // --- workflow package --- @Test @@ -152,6 +169,90 @@ void testTaskResultPayloadSizeEvent() { assertNotNull(event.getTime()); } + // --- taskrunner package (continued: new event types) --- + + @Test + void testTaskUpdateCompleted() { + TaskUpdateCompleted event = new TaskUpdateCompleted("SIMPLE", "task-100", "worker-1", "wf-abc", 750L); + + assertEquals("SIMPLE", event.getTaskType()); + assertEquals("task-100", event.getTaskId()); + assertEquals("worker-1", event.getWorkerId()); + assertEquals("wf-abc", event.getWorkflowInstanceId()); + assertEquals(Duration.ofMillis(750), event.getDuration()); + assertNotNull(event.getTime()); + } + + @Test + void testTaskUpdateFailure() { + Throwable cause = new RuntimeException("update failed"); + TaskUpdateFailure event = new TaskUpdateFailure("SIMPLE", "task-101", "worker-2", "wf-def", cause, 800L); + + assertEquals("SIMPLE", event.getTaskType()); + 
assertEquals("task-101", event.getTaskId()); + assertEquals("worker-2", event.getWorkerId()); + assertEquals("wf-def", event.getWorkflowInstanceId()); + assertSame(cause, event.getCause()); + assertEquals(Duration.ofMillis(800), event.getDuration()); + assertNotNull(event.getTime()); + } + + @Test + void testTaskAckFailure() { + TaskAckFailure event = new TaskAckFailure("HTTP_TASK", "task-200"); + + assertEquals("HTTP_TASK", event.getTaskType()); + assertEquals("task-200", event.getTaskId()); + assertNotNull(event.getTime()); + } + + @Test + void testTaskAckError() { + Throwable cause = new RuntimeException("network error"); + TaskAckError event = new TaskAckError("HTTP_TASK", "task-201", cause); + + assertEquals("HTTP_TASK", event.getTaskType()); + assertEquals("task-201", event.getTaskId()); + assertSame(cause, event.getCause()); + assertNotNull(event.getTime()); + } + + @Test + void testTaskExecutionQueueFull() { + TaskExecutionQueueFull event = new TaskExecutionQueueFull("SIMPLE"); + + assertEquals("SIMPLE", event.getTaskType()); + assertNotNull(event.getTime()); + } + + @Test + void testTaskPaused() { + TaskPaused event = new TaskPaused("SIMPLE"); + + assertEquals("SIMPLE", event.getTaskType()); + assertNotNull(event.getTime()); + } + + @Test + void testThreadUncaughtExceptionWithTaskType() { + Throwable cause = new OutOfMemoryError("heap space"); + ThreadUncaughtException event = new ThreadUncaughtException("SIMPLE", cause); + + assertEquals("SIMPLE", event.getTaskType()); + assertSame(cause, event.getCause()); + assertNotNull(event.getTime()); + } + + @Test + void testThreadUncaughtExceptionWithoutTaskType() { + Throwable cause = new RuntimeException("unexpected"); + ThreadUncaughtException event = new ThreadUncaughtException(cause); + + assertEquals("", event.getTaskType()); + assertSame(cause, event.getCause()); + assertNotNull(event.getTime()); + } + // --- cross-cutting tests --- @Test diff --git 
a/conductor-client/src/test/java/com/netflix/conductor/client/events/dispatcher/EventDispatcherTest.java b/conductor-client/src/test/java/com/netflix/conductor/client/events/dispatcher/EventDispatcherTest.java index 729807bf3..33895cd02 100644 --- a/conductor-client/src/test/java/com/netflix/conductor/client/events/dispatcher/EventDispatcherTest.java +++ b/conductor-client/src/test/java/com/netflix/conductor/client/events/dispatcher/EventDispatcherTest.java @@ -115,4 +115,29 @@ void testUnregisterFromEmpty() { // Unregister from a type that was never registered - should not throw assertDoesNotThrow(() -> dispatcher.unregister(PollStarted.class, listener)); } + + @Test + void testPublishSyncRunsOnCallingThread() { + Thread callingThread = Thread.currentThread(); + AtomicReference listenerThread = new AtomicReference<>(); + AtomicReference received = new AtomicReference<>(); + + Consumer listener = event -> { + listenerThread.set(Thread.currentThread()); + received.set(event); + }; + + dispatcher.register(PollStarted.class, listener); + PollStarted event = new PollStarted("syncTask"); + dispatcher.publishSync(event); + + assertSame(callingThread, listenerThread.get(), + "publishSync must invoke listener on the calling thread"); + assertSame(event, received.get()); + } + + @Test + void testPublishSyncNoListeners() { + assertDoesNotThrow(() -> dispatcher.publishSync(new PollStarted("orphanTask"))); + } } diff --git a/conductor-client/src/test/java/com/netflix/conductor/client/events/listeners/ListenerRegisterTest.java b/conductor-client/src/test/java/com/netflix/conductor/client/events/listeners/ListenerRegisterTest.java index f084b8c72..b387bd1f4 100644 --- a/conductor-client/src/test/java/com/netflix/conductor/client/events/listeners/ListenerRegisterTest.java +++ b/conductor-client/src/test/java/com/netflix/conductor/client/events/listeners/ListenerRegisterTest.java @@ -12,8 +12,10 @@ */ package com.netflix.conductor.client.events.listeners; +import 
java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; import org.junit.jupiter.api.Test; @@ -22,13 +24,21 @@ import com.netflix.conductor.client.events.task.TaskClientEvent; import com.netflix.conductor.client.events.task.TaskPayloadUsedEvent; import com.netflix.conductor.client.events.task.TaskResultPayloadSizeEvent; +import com.netflix.conductor.client.events.taskrunner.ActiveWorkersChanged; import com.netflix.conductor.client.events.taskrunner.PollCompleted; import com.netflix.conductor.client.events.taskrunner.PollFailure; import com.netflix.conductor.client.events.taskrunner.PollStarted; +import com.netflix.conductor.client.events.taskrunner.TaskAckError; +import com.netflix.conductor.client.events.taskrunner.TaskAckFailure; import com.netflix.conductor.client.events.taskrunner.TaskExecutionCompleted; import com.netflix.conductor.client.events.taskrunner.TaskExecutionFailure; +import com.netflix.conductor.client.events.taskrunner.TaskExecutionQueueFull; import com.netflix.conductor.client.events.taskrunner.TaskExecutionStarted; +import com.netflix.conductor.client.events.taskrunner.TaskPaused; import com.netflix.conductor.client.events.taskrunner.TaskRunnerEvent; +import com.netflix.conductor.client.events.taskrunner.TaskUpdateCompleted; +import com.netflix.conductor.client.events.taskrunner.TaskUpdateFailure; +import com.netflix.conductor.client.events.taskrunner.ThreadUncaughtException; import com.netflix.conductor.client.events.workflow.WorkflowClientEvent; import com.netflix.conductor.client.events.workflow.WorkflowInputPayloadSizeEvent; import com.netflix.conductor.client.events.workflow.WorkflowPayloadUsedEvent; @@ -77,6 +87,49 @@ public void consume(TaskExecutionFailure e) {} assertEquals("test_task", received.get().getTaskType()); } + @Test + void 
testRegisterActiveWorkersChangedListener() throws InterruptedException { + EventDispatcher dispatcher = new EventDispatcher<>(); + CountDownLatch latch = new CountDownLatch(1); + AtomicReference received = new AtomicReference<>(); + + TaskRunnerEventsListener listener = new TaskRunnerEventsListener() { + @Override + public void consume(PollStarted e) {} + + @Override + public void consume(PollCompleted e) {} + + @Override + public void consume(PollFailure e) {} + + @Override + public void consume(TaskExecutionStarted e) {} + + @Override + public void consume(TaskExecutionCompleted e) {} + + @Override + public void consume(TaskExecutionFailure e) {} + + @Override + public void consume(ActiveWorkersChanged e) { + received.set(e); + latch.countDown(); + } + }; + + ListenerRegister.register(listener, dispatcher); + + ActiveWorkersChanged event = new ActiveWorkersChanged("test_task", 5); + dispatcher.publish(event); + + assertTrue(latch.await(2, TimeUnit.SECONDS), "Listener should have received ActiveWorkersChanged event"); + assertSame(event, received.get()); + assertEquals("test_task", received.get().getTaskType()); + assertEquals(5, received.get().getCount()); + } + @Test void testRegisterTaskClientListener() throws InterruptedException { EventDispatcher dispatcher = new EventDispatcher<>(); @@ -136,4 +189,229 @@ public void consume(WorkflowPayloadUsedEvent event) {} assertEquals(1, received.get().getVersion()); assertTrue(received.get().isSuccess()); } + + @Test + void testAllTaskRunnerEventTypesDispatched() throws InterruptedException { + EventDispatcher dispatcher = new EventDispatcher<>(); + ConcurrentHashMap received = new ConcurrentHashMap<>(); + CountDownLatch latch = new CountDownLatch(14); + + TaskRunnerEventsListener listener = new TaskRunnerEventsListener() { + @Override public void consume(PollStarted e) { received.put("PollStarted", e); latch.countDown(); } + @Override public void consume(PollCompleted e) { received.put("PollCompleted", e); 
latch.countDown(); } + @Override public void consume(PollFailure e) { received.put("PollFailure", e); latch.countDown(); } + @Override public void consume(TaskExecutionStarted e) { received.put("TaskExecutionStarted", e); latch.countDown(); } + @Override public void consume(TaskExecutionCompleted e) { received.put("TaskExecutionCompleted", e); latch.countDown(); } + @Override public void consume(TaskExecutionFailure e) { received.put("TaskExecutionFailure", e); latch.countDown(); } + @Override public void consume(TaskUpdateCompleted e) { received.put("TaskUpdateCompleted", e); latch.countDown(); } + @Override public void consume(TaskUpdateFailure e) { received.put("TaskUpdateFailure", e); latch.countDown(); } + @Override public void consume(TaskAckFailure e) { received.put("TaskAckFailure", e); latch.countDown(); } + @Override public void consume(TaskAckError e) { received.put("TaskAckError", e); latch.countDown(); } + @Override public void consume(TaskExecutionQueueFull e) { received.put("TaskExecutionQueueFull", e); latch.countDown(); } + @Override public void consume(TaskPaused e) { received.put("TaskPaused", e); latch.countDown(); } + @Override public void consume(ThreadUncaughtException e) { received.put("ThreadUncaughtException", e); latch.countDown(); } + @Override public void consume(ActiveWorkersChanged e) { received.put("ActiveWorkersChanged", e); latch.countDown(); } + }; + + ListenerRegister.register(listener, dispatcher); + + dispatcher.publish(new PollStarted("t")); + dispatcher.publish(new PollCompleted("t", 100)); + dispatcher.publish(new PollFailure("t", 200, new RuntimeException())); + dispatcher.publish(new TaskExecutionStarted("t", "id", "w")); + dispatcher.publish(new TaskExecutionCompleted("t", "id", "w", 300)); + dispatcher.publish(new TaskExecutionFailure("t", "id", "w", new RuntimeException(), 400)); + dispatcher.publish(new TaskUpdateCompleted("t", "id", "w", "wfId", 500)); + dispatcher.publish(new TaskUpdateFailure("t", "id", "w", "wfId", 
new RuntimeException(), 600)); + dispatcher.publish(new TaskAckFailure("t", "id")); + dispatcher.publish(new TaskAckError("t", "id", new RuntimeException())); + dispatcher.publish(new TaskExecutionQueueFull("t")); + dispatcher.publish(new TaskPaused("t")); + dispatcher.publish(new ThreadUncaughtException(new RuntimeException())); + dispatcher.publish(new ActiveWorkersChanged("t", 5)); + + assertTrue(latch.await(5, TimeUnit.SECONDS), "All 14 task runner event types should be dispatched"); + assertEquals(14, received.size()); + } + + @Test + void testAllTaskClientEventTypesDispatched() throws InterruptedException { + EventDispatcher dispatcher = new EventDispatcher<>(); + ConcurrentHashMap received = new ConcurrentHashMap<>(); + CountDownLatch latch = new CountDownLatch(2); + + TaskClientListener listener = new TaskClientListener() { + @Override public void consume(TaskPayloadUsedEvent e) { received.put("TaskPayloadUsedEvent", e); latch.countDown(); } + @Override public void consume(TaskResultPayloadSizeEvent e) { received.put("TaskResultPayloadSizeEvent", e); latch.countDown(); } + }; + + ListenerRegister.register(listener, dispatcher); + + dispatcher.publish(new TaskPayloadUsedEvent("t", "WRITE", "output")); + dispatcher.publish(new TaskResultPayloadSizeEvent("t", 2048L)); + + assertTrue(latch.await(5, TimeUnit.SECONDS), "Both task client event types should be dispatched"); + assertEquals(2, received.size()); + } + + @Test + void testAllWorkflowClientEventTypesDispatched() throws InterruptedException { + EventDispatcher dispatcher = new EventDispatcher<>(); + ConcurrentHashMap received = new ConcurrentHashMap<>(); + CountDownLatch latch = new CountDownLatch(3); + + WorkflowClientListener listener = new WorkflowClientListener() { + @Override public void consume(WorkflowStartedEvent event) { received.put("WorkflowStartedEvent", event); latch.countDown(); } + @Override public void consume(WorkflowInputPayloadSizeEvent event) { 
received.put("WorkflowInputPayloadSizeEvent", event); latch.countDown(); } + @Override public void consume(WorkflowPayloadUsedEvent event) { received.put("WorkflowPayloadUsedEvent", event); latch.countDown(); } + }; + + ListenerRegister.register(listener, dispatcher); + + dispatcher.publish(new WorkflowStartedEvent("wf", 1)); + dispatcher.publish(new WorkflowInputPayloadSizeEvent("wf", 1, 1024L)); + dispatcher.publish(new WorkflowPayloadUsedEvent("wf", 1, "READ", "input")); + + assertTrue(latch.await(5, TimeUnit.SECONDS), "All 3 workflow client event types should be dispatched"); + assertEquals(3, received.size()); + } + + @Test + void testUnregisterTaskRunnerListenerStopsDelivery() throws InterruptedException { + EventDispatcher dispatcher = new EventDispatcher<>(); + AtomicBoolean called = new AtomicBoolean(false); + + TaskRunnerEventsListener listener = new TaskRunnerEventsListener() { + @Override public void consume(PollStarted e) { called.set(true); } + @Override public void consume(PollCompleted e) {} + @Override public void consume(PollFailure e) {} + @Override public void consume(TaskExecutionStarted e) {} + @Override public void consume(TaskExecutionCompleted e) {} + @Override public void consume(TaskExecutionFailure e) {} + }; + + ListenerRegister.register(listener, dispatcher); + ListenerRegister.unregister(listener, dispatcher); + + dispatcher.publish(new PollStarted("test_task")); + Thread.sleep(300); + + assertFalse(called.get(), "Listener should not receive events after unregister"); + } + + @Test + void testUnregisterAllowsReRegister() throws InterruptedException { + EventDispatcher dispatcher = new EventDispatcher<>(); + CountDownLatch latch = new CountDownLatch(1); + + TaskRunnerEventsListener listener = new TaskRunnerEventsListener() { + @Override public void consume(PollStarted e) { latch.countDown(); } + @Override public void consume(PollCompleted e) {} + @Override public void consume(PollFailure e) {} + @Override public void 
consume(TaskExecutionStarted e) {} + @Override public void consume(TaskExecutionCompleted e) {} + @Override public void consume(TaskExecutionFailure e) {} + }; + + ListenerRegister.register(listener, dispatcher); + ListenerRegister.unregister(listener, dispatcher); + + // Re-register the same pair — should succeed since unregister cleared the key + ListenerRegister.register(listener, dispatcher); + + dispatcher.publish(new PollStarted("test_task")); + assertTrue(latch.await(2, TimeUnit.SECONDS), + "Listener should receive events after re-registration"); + } + + @Test + void testUnregisterTaskClientListenerStopsDelivery() throws InterruptedException { + EventDispatcher dispatcher = new EventDispatcher<>(); + AtomicBoolean called = new AtomicBoolean(false); + + TaskClientListener listener = new TaskClientListener() { + @Override public void consume(TaskPayloadUsedEvent e) { called.set(true); } + @Override public void consume(TaskResultPayloadSizeEvent e) { called.set(true); } + }; + + ListenerRegister.register(listener, dispatcher); + ListenerRegister.unregister(listener, dispatcher); + + dispatcher.publish(new TaskResultPayloadSizeEvent("t", 100L)); + Thread.sleep(300); + + assertFalse(called.get(), "TaskClientListener should not receive events after unregister"); + } + + @Test + void testUnregisterWorkflowClientListenerStopsDelivery() throws InterruptedException { + EventDispatcher dispatcher = new EventDispatcher<>(); + AtomicBoolean called = new AtomicBoolean(false); + + WorkflowClientListener listener = new WorkflowClientListener() { + @Override public void consume(WorkflowStartedEvent event) { called.set(true); } + @Override public void consume(WorkflowInputPayloadSizeEvent event) { called.set(true); } + @Override public void consume(WorkflowPayloadUsedEvent event) { called.set(true); } + }; + + ListenerRegister.register(listener, dispatcher); + ListenerRegister.unregister(listener, dispatcher); + + dispatcher.publish(new WorkflowStartedEvent("wf", 1)); + 
Thread.sleep(300); + + assertFalse(called.get(), "WorkflowClientListener should not receive events after unregister"); + } + + @Test + void testUnregisterIsIdempotent() { + EventDispatcher dispatcher = new EventDispatcher<>(); + + TaskRunnerEventsListener listener = new TaskRunnerEventsListener() { + @Override public void consume(PollStarted e) {} + @Override public void consume(PollCompleted e) {} + @Override public void consume(PollFailure e) {} + @Override public void consume(TaskExecutionStarted e) {} + @Override public void consume(TaskExecutionCompleted e) {} + @Override public void consume(TaskExecutionFailure e) {} + }; + + ListenerRegister.register(listener, dispatcher); + + // Double unregister should not throw + ListenerRegister.unregister(listener, dispatcher); + assertDoesNotThrow(() -> ListenerRegister.unregister(listener, dispatcher)); + } + + @Test + void testSameListenerOnDifferentDispatchersIsIndependent() throws InterruptedException { + EventDispatcher dispatcher1 = new EventDispatcher<>(); + EventDispatcher dispatcher2 = new EventDispatcher<>(); + CountDownLatch latch = new CountDownLatch(1); + AtomicBoolean dispatcher1Called = new AtomicBoolean(false); + + TaskRunnerEventsListener listener = new TaskRunnerEventsListener() { + @Override public void consume(PollStarted e) { + dispatcher1Called.set(true); + latch.countDown(); + } + @Override public void consume(PollCompleted e) {} + @Override public void consume(PollFailure e) {} + @Override public void consume(TaskExecutionStarted e) {} + @Override public void consume(TaskExecutionCompleted e) {} + @Override public void consume(TaskExecutionFailure e) {} + }; + + ListenerRegister.register(listener, dispatcher1); + ListenerRegister.register(listener, dispatcher2); + + // Unregister only from dispatcher2 + ListenerRegister.unregister(listener, dispatcher2); + + // dispatcher1 should still deliver + dispatcher1.publish(new PollStarted("test_task")); + assertTrue(latch.await(2, TimeUnit.SECONDS), + 
"Unregistering from one dispatcher must not affect the other"); + assertTrue(dispatcher1Called.get()); + } } diff --git a/harness/manifests/deployment.yaml b/harness/manifests/deployment.yaml index 928e96bc6..22f6d2205 100644 --- a/harness/manifests/deployment.yaml +++ b/harness/manifests/deployment.yaml @@ -52,6 +52,18 @@ spec: - name: HARNESS_POLL_INTERVAL_MS value: "100" + # WorkflowStatusProbe rate (calls/sec) against /api/workflow/ and + # /api/workflow//status. 0 = disabled (default). Set to a small + # positive integer to surface UUID-bearing uri label values on the + # canonical http_api_client_request_seconds histogram. + - name: HARNESS_PROBE_RATE_PER_SEC + value: "0" + + # === METRICS IMPLEMENTATION SELECTION === + + - name: WORKER_CANONICAL_METRICS + value: "true" + ports: - name: metrics containerPort: 9991 diff --git a/harness/src/main/java/io/orkes/conductor/harness/HarnessMain.java b/harness/src/main/java/io/orkes/conductor/harness/HarnessMain.java index 88e216d96..cc6c7353b 100644 --- a/harness/src/main/java/io/orkes/conductor/harness/HarnessMain.java +++ b/harness/src/main/java/io/orkes/conductor/harness/HarnessMain.java @@ -25,7 +25,9 @@ import com.netflix.conductor.client.http.MetadataClient; import com.netflix.conductor.client.http.TaskClient; import com.netflix.conductor.client.http.WorkflowClient; -import com.netflix.conductor.client.metrics.prometheus.PrometheusMetricsCollector; +import com.netflix.conductor.client.metrics.prometheus.AbstractPrometheusMetricsCollector; +import com.netflix.conductor.client.metrics.prometheus.MetricsBundle; +import com.netflix.conductor.client.metrics.prometheus.MetricsCollectorFactory; import com.netflix.conductor.client.worker.Worker; import com.netflix.conductor.common.metadata.tasks.TaskDef; import com.netflix.conductor.common.metadata.tasks.TaskType; @@ -49,50 +51,130 @@ public class HarnessMain { }; public static void main(String[] args) throws Exception { - ConductorClient client = 
ApiClient.builder().useEnvVariables(true).readTimeout(10_000).connectTimeout(10_000) - .writeTimeout(10_000).build(); - int workflowsPerSec = envInt("HARNESS_WORKFLOWS_PER_SEC", 2); int batchSize = envInt("HARNESS_BATCH_SIZE", 20); int pollIntervalMs = envInt("HARNESS_POLL_INTERVAL_MS", 100); - - registerMetadata(client); - - PrometheusMetricsCollector metricsCollector = new PrometheusMetricsCollector(); int metricsPort = envInt("HARNESS_METRICS_PORT", 9991); - metricsCollector.startServer(metricsPort, "/metrics"); - log.info("Prometheus metrics server started on port {}", metricsPort); + String wiringMode = envString("METRICS_WIRING", "auto"); + // Opt-in: when > 0, WorkflowStatusProbe periodically calls + // /api/workflow/ and /api/workflow//status so the canonical + // http_api_client_request_seconds histogram picks up UUID-bearing + // uri label values (the realistic high-cardinality case). Default 0 + // = disabled, which keeps harness behavior unchanged. + int probeRatePerSec = envInt("HARNESS_PROBE_RATE_PER_SEC", 0); List workers = new ArrayList<>(); for (String[] entry : SIMULATED_WORKERS) { workers.add(new SimulatedTaskWorker(entry[0], entry[1], Integer.parseInt(entry[2]), batchSize, pollIntervalMs)); } - - TaskClient taskClient = new TaskClient(client); Map threadCounts = workers.stream().collect(Collectors.toMap(Worker::getTaskDefName, w -> batchSize)); - TaskRunnerConfigurer configurer = - new TaskRunnerConfigurer.Builder(taskClient, workers) - .withTaskThreadCount(threadCounts) - .withMetricsCollector(metricsCollector) - .build(); - configurer.init(); + WiringResult wiring = switch (wiringMode.toLowerCase().trim()) { + case "manual" -> wireManual(metricsPort, workers, threadCounts); + default -> wireAuto(metricsPort, workers, threadCounts); + }; - WorkflowClient workflowClient = new WorkflowClient(client); - WorkflowGovernor governor = new WorkflowGovernor(workflowClient, WORKFLOW_NAME, workflowsPerSec); + registerMetadata(wiring.client); + + 
wiring.configurer.init(); + + WorkflowStatusProbe probe = new WorkflowStatusProbe(wiring.workflowClient, probeRatePerSec); + WorkflowGovernor governor = new WorkflowGovernor( + wiring.workflowClient, WORKFLOW_NAME, workflowsPerSec, probe::offer); governor.start(); + probe.start(); Runtime.getRuntime().addShutdownHook(new Thread(() -> { log.info("Shutting down harness..."); governor.shutdown(); - configurer.shutdown(); + probe.shutdown(); + wiring.configurer.shutdown(); })); Thread.currentThread().join(); } + // ------------------------------------------------------------------------- + // METRICS_WIRING=auto — MetricsBundle + withMetricsCollector on builder; + // all listener registration is automatic + // ------------------------------------------------------------------------- + private static WiringResult wireAuto(int metricsPort, List workers, + Map threadCounts) throws Exception { + log.info("=== METRICS_WIRING=auto — automatic wiring via MetricsBundle ==="); + + // MetricsBundle.create() defaults to port 9991 and /metrics if called with no args + MetricsBundle bundle = MetricsBundle.create(metricsPort, "/metrics"); + log.info("Prometheus metrics server started on port {} ({} metrics)", + bundle.getPort(), bundle.getCollector().collectorName()); + + if (!bundle.getCollector().isAutoWiringEnabled()) { + bundle.getCollector().setAutoWiringEnabled(true); + log.info("Legacy collector does not auto-wire by default; " + + "explicitly enabling auto-wiring to honor METRICS_WIRING=auto"); + } + + ConductorClient client = ApiClient.builder() + .useEnvVariables(true) + .readTimeout(10_000) // optional — OkHttp default applies if omitted + .connectTimeout(10_000) // optional — OkHttp default applies if omitted + .writeTimeout(10_000) // optional — OkHttp default applies if omitted + .withMetricsCollector(bundle.getCollector()) + .build(); + + TaskClient taskClient = new TaskClient(client); + WorkflowClient workflowClient = new WorkflowClient(client); + + 
TaskRunnerConfigurer configurer = + new TaskRunnerConfigurer.Builder(taskClient, workers) + .withTaskThreadCount(threadCounts) + .build(); + + return new WiringResult(client, workflowClient, configurer); + } + + // ------------------------------------------------------------------------- + // METRICS_WIRING=manual — withHttpMetrics installs only the OkHttp + // interceptor; all listener registration is explicit + // ------------------------------------------------------------------------- + private static WiringResult wireManual(int metricsPort, List workers, + Map threadCounts) throws Exception { + log.info("=== METRICS_WIRING=manual — manual listener wiring ==="); + + AbstractPrometheusMetricsCollector metricsCollector = MetricsCollectorFactory.create(); + metricsCollector.startServer(metricsPort, "/metrics"); + log.info("Prometheus metrics server started on port {} ({} metrics)", metricsPort, metricsCollector.collectorName()); + + ConductorClient client = ApiClient.builder() + .useEnvVariables(true) + .readTimeout(10_000) + .connectTimeout(10_000) + .writeTimeout(10_000) + .withHttpMetrics(metricsCollector) + .build(); + + TaskClient taskClient = new TaskClient(client); + taskClient.registerListener(metricsCollector); + taskClient.registerTaskRunnerListener(metricsCollector); + + TaskRunnerConfigurer configurer = + new TaskRunnerConfigurer.Builder(taskClient, workers) + .withTaskThreadCount(threadCounts) + .withMetricsCollector(metricsCollector) + .build(); + + WorkflowClient workflowClient = new WorkflowClient(client); + workflowClient.registerListener(metricsCollector); + + return new WiringResult(client, workflowClient, configurer); + } + + private record WiringResult(ConductorClient client, WorkflowClient workflowClient, + TaskRunnerConfigurer configurer) { } + + // ------------------------------------------------------------------------- + private static void registerMetadata(ConductorClient client) { MetadataClient metadataClient = new 
MetadataClient(client); @@ -144,4 +226,12 @@ private static int envInt(String name, int defaultValue) { return defaultValue; } } + + private static String envString(String name, String defaultValue) { + String value = System.getenv(name); + if (value == null || value.isBlank()) { + return defaultValue; + } + return value.trim(); + } } diff --git a/harness/src/main/java/io/orkes/conductor/harness/SimulatedTaskWorker.java b/harness/src/main/java/io/orkes/conductor/harness/SimulatedTaskWorker.java index 02757884a..f633b8f59 100644 --- a/harness/src/main/java/io/orkes/conductor/harness/SimulatedTaskWorker.java +++ b/harness/src/main/java/io/orkes/conductor/harness/SimulatedTaskWorker.java @@ -79,6 +79,13 @@ public int getPollingInterval() { public TaskResult execute(Task task) { Map input = task.getInputData() != null ? task.getInputData() : new HashMap<>(); String taskId = task.getTaskId(); + // Stretch goal: feed taskId into a TaskStatusProbe (sibling of + // WorkflowStatusProbe) so it can periodically call + // TaskClient.getTaskDetails(taskId), which hits /api/tasks/. + // That exercises the UUID-bearing task path the v4 batch-poll-as-ack + // flow normally hides. Plumb it via a Consumer taskIdSink + // through this worker's constructor; default to a no-op so the + // probe stays opt-in. 
int taskIndex = getOrDefault(input, "taskIndex", -1); log.info("[{}] Starting simulated task [id={}, index={}, codename={}]", taskName, taskId, taskIndex, codename); diff --git a/harness/src/main/java/io/orkes/conductor/harness/WorkflowGovernor.java b/harness/src/main/java/io/orkes/conductor/harness/WorkflowGovernor.java index 0a152e248..6f3e2135e 100644 --- a/harness/src/main/java/io/orkes/conductor/harness/WorkflowGovernor.java +++ b/harness/src/main/java/io/orkes/conductor/harness/WorkflowGovernor.java @@ -15,6 +15,7 @@ import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; +import java.util.function.Consumer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -30,11 +31,14 @@ public class WorkflowGovernor { private final String workflowName; private final int workflowsPerSecond; private final ScheduledExecutorService scheduler; + private final Consumer idSink; - public WorkflowGovernor(WorkflowClient workflowClient, String workflowName, int workflowsPerSecond) { + public WorkflowGovernor(WorkflowClient workflowClient, String workflowName, int workflowsPerSecond, + Consumer idSink) { this.workflowClient = workflowClient; this.workflowName = workflowName; this.workflowsPerSecond = workflowsPerSecond; + this.idSink = idSink != null ? idSink : id -> { }; this.scheduler = Executors.newSingleThreadScheduledExecutor(r -> { Thread t = new Thread(r, "workflow-governor"); t.setDaemon(true); @@ -64,7 +68,16 @@ private void tick() { for (int i = 0; i < workflowsPerSecond; i++) { StartWorkflowRequest request = new StartWorkflowRequest(); request.setName(workflowName); - workflowClient.startWorkflow(request); + request.setVersion(1); + // Stretch goal: set a unique correlationId per request + // (e.g. 
UUID.randomUUID().toString()) so a future probe can + // call WorkflowClient.getWorkflows(name, correlationId, ...), + // which maps to /api/workflow//correlated/ + // — a third UUID-bearing endpoint useful for verifying that + // the OkHttp metrics interceptor templates *every* path + // segment, not just /workflow/. + String workflowId = workflowClient.startWorkflow(request); + idSink.accept(workflowId); } log.info("Governor: started {} workflow(s)", workflowsPerSecond); } catch (Exception e) { diff --git a/harness/src/main/java/io/orkes/conductor/harness/WorkflowStatusProbe.java b/harness/src/main/java/io/orkes/conductor/harness/WorkflowStatusProbe.java new file mode 100644 index 000000000..5567630eb --- /dev/null +++ b/harness/src/main/java/io/orkes/conductor/harness/WorkflowStatusProbe.java @@ -0,0 +1,154 @@ +/* + * Copyright 2024 Conductor Authors. + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package io.orkes.conductor.harness; + +import java.util.concurrent.ConcurrentLinkedDeque; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.TimeUnit; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.netflix.conductor.client.http.WorkflowClient; + +/** + * Opt-in control-plane probe that exercises the UUID-bearing workflow lookup + * endpoints, so {@code http_api_client_request_seconds} on the canonical + * Prometheus surface picks up entries with {@code uri=/api/workflow/<uuid>} + * and {@code uri=/api/workflow/<uuid>/status}. The default + * {@link HarnessMain} traffic only ever hits bounded, no-path-param URLs + * ({@code /api/tasks/poll/batch/<taskType>}, {@code /api/tasks}, + * {@code /api/workflow}, etc.), so the high-cardinality concern on the + * {@code uri} label is invisible without something like this probe. + * + *

<p>Default off. The probe runs only when + * {@code HARNESS_PROBE_RATE_PER_SEC} is set to a positive integer. Zero (the + * default) means the harness behaves exactly as it did before. + * + *

<p>Side-effect-free. The probe only issues read calls + * ({@link WorkflowClient#getWorkflow(String, boolean) getWorkflow} and + * {@link WorkflowClient#getWorkflowStatusSummary getWorkflowStatusSummary}) and + * does not pause, resume, terminate, or otherwise perturb the workflows the + * task workers are processing. + * + *

<p>Self-bounded. Recently-started workflow IDs are kept in a + * fixed-size FIFO; the probe rotates through them, so memory usage is + * constant regardless of how long the harness runs. + * + *

<p>Failure-tolerant. Workflows can be archived or swept while the + * probe still has their IDs; {@link Exception}s are logged at {@code debug} + * and otherwise ignored so the probe never crashes the harness. + * + *


+ * + *

Stretch goals not implemented yet — these are documented here so + * the next iteration has a single place to look: + * + *

    + *
  1. Per-task probe. Add a {@code TaskStatusProbe} that calls + * {@code TaskClient.getTaskDetails(taskId)} (which hits + * {@code /api/tasks/<taskId>}). Plumb a + * {@code Consumer<String> taskIdSink} through + * {@link SimulatedTaskWorker}'s constructor and feed it from + * {@code execute(...)} (the {@code task.getTaskId()} value is already + * in scope there). This is the cardinality vector on the task + * path that the v4 batch-poll-as-ack pattern normally hides.
  2. + *
  3. Unique-correlationId probe. Have {@link WorkflowGovernor} set a + * fresh {@code correlationId} (e.g. {@code UUID.randomUUID().toString()}) + * on each {@code StartWorkflowRequest}, and have this probe + * occasionally call {@code WorkflowClient.getWorkflows(name, + * correlationId, ...)} which maps to + * {@code /api/workflow/<name>/correlated/<correlationId>}. That + * is a third UUID-bearing path and a useful "does the interceptor + * template any path segment?" test.
  4. + *
+ */ +public final class WorkflowStatusProbe { + + private static final Logger log = LoggerFactory.getLogger(WorkflowStatusProbe.class); + + /** Cap on retained workflow IDs. Keeps memory constant regardless of harness uptime. */ + private static final int MAX_TRACKED_IDS = 256; + + private final WorkflowClient workflowClient; + private final int callsPerSecond; + private final ScheduledExecutorService scheduler; + private final ConcurrentLinkedDeque recentIds = new ConcurrentLinkedDeque<>(); + + public WorkflowStatusProbe(WorkflowClient workflowClient, int callsPerSecond) { + this.workflowClient = workflowClient; + this.callsPerSecond = callsPerSecond; + this.scheduler = Executors.newSingleThreadScheduledExecutor(r -> { + Thread t = new Thread(r, "workflow-status-probe"); + t.setDaemon(true); + return t; + }); + } + + /** + * Capture a workflow ID for later probing. Safe to call from any thread; + * intended to be wired as the {@code idSink} of {@link WorkflowGovernor}. + */ + public void offer(String workflowId) { + if (workflowId == null || workflowId.isBlank()) { + return; + } + recentIds.addFirst(workflowId); + while (recentIds.size() > MAX_TRACKED_IDS) { + recentIds.pollLast(); + } + } + + public void start() { + if (callsPerSecond <= 0) { + log.info("WorkflowStatusProbe disabled (HARNESS_PROBE_RATE_PER_SEC<=0)"); + return; + } + log.info("WorkflowStatusProbe started: rate={}/sec, retainedIds<={}", callsPerSecond, MAX_TRACKED_IDS); + scheduler.scheduleAtFixedRate(this::tick, 1, 1, TimeUnit.SECONDS); + } + + public void shutdown() { + scheduler.shutdown(); + try { + if (!scheduler.awaitTermination(5, TimeUnit.SECONDS)) { + scheduler.shutdownNow(); + } + } catch (InterruptedException e) { + scheduler.shutdownNow(); + Thread.currentThread().interrupt(); + } + } + + private void tick() { + int budget = Math.min(callsPerSecond, recentIds.size()); + for (int i = 0; i < budget; i++) { + String id = recentIds.pollFirst(); + if (id == null) { + return; + } + 
recentIds.addLast(id); + try { + if (ThreadLocalRandom.current().nextBoolean()) { + workflowClient.getWorkflow(id, false); + } else { + workflowClient.getWorkflowStatusSummary(id, false, false); + } + } catch (Exception e) { + log.debug("Probe: lookup failed for {}: {}", id, e.toString()); + } + } + } +}