Environment note: examples and deployment references assume development in Docker images with CUDA 12.9.1 (Ubuntu 22.04) and ROCm 7.0 (Ubuntu 24.04), keeping the NVIDIA and AMD environments in parity. The build system supports production-grade optimizations.
This comprehensive guide covers all aspects of deploying, maintaining, and scaling GPU applications in production environments, from architecture design to operational excellence.
- Production Architecture Design
- Error Handling and Resilience
- Deployment and DevOps
- Monitoring and Observability
- Scalability and Performance
- Security and Compliance
- Cost Optimization
- Production Best Practices
Service Decomposition Strategies:
┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
│  Data Ingestion │    │  GPU Processing │    │  Result Storage │
│     Service     │───▶│     Service     │───▶│     Service     │
└─────────────────┘    └─────────────────┘    └─────────────────┘
         │                      │                      │
         ▼                      ▼                      ▼
┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
│  Queue/Stream   │    │  GPU Resource   │    │  Notification   │
│   Management    │    │     Manager     │    │     Service     │
└─────────────────┘    └─────────────────┘    └─────────────────┘
Key Design Principles:
- Single Responsibility: Each service handles one GPU workload type
- Stateless Design: Services maintain no state between requests
- Resource Isolation: GPU resources are properly partitioned
- Fault Isolation: Service failures don't cascade
- Independent Scaling: Services scale based on individual demand
Event Flow Design:
Event Source → Event Queue → GPU Service → Result Queue → Consumer
      │             │             │              │            │
      ▼             ▼             ▼              ▼            ▼
 Monitoring    Dead Letter   GPU Metrics   Result Cache   Feedback
   Service        Queue       Collection      Service        Loop
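The dead-letter branch above keeps poison events from stalling the stream. A minimal generic sketch of that step, assuming the handler returns success as a bool and the queue type has a push method:
template <typename Event, typename Handler, typename Queue>
void consume_with_dlq(const Event& event, Handler&& handle, Queue& dead_letters) {
    constexpr int kMaxAttempts = 3;
    for (int attempt = 1; attempt <= kMaxAttempts; ++attempt) {
        if (handle(event)) {
            return;                   // handler reports success
        }
    }
    dead_letters.push(event);         // park the event for offline inspection
}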
Implementation Patterns:
// Event-driven GPU service interface
class GPUEventProcessor {
public:
struct ProcessingEvent {
std::string event_id;
std::string event_type;
std::vector<uint8_t> payload;
std::chrono::system_clock::time_point timestamp;
std::map<std::string, std::string> metadata;
};
struct ProcessingResult {
std::string event_id;
bool success;
std::vector<uint8_t> result_data;
std::chrono::milliseconds processing_time;
std::map<std::string, float> metrics;
};
virtual ~GPUEventProcessor() = default; // interfaces deleted via base pointers need a virtual destructor
virtual ProcessingResult process_event(const ProcessingEvent& event) = 0;
virtual bool healthcheck() = 0;
virtual std::map<std::string, float> get_metrics() = 0;
};
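To make the interface concrete, here is a minimal sketch of one implementation; the image-batch workload, the run_gpu_batch helper, and the trivial health probe are illustrative assumptions, not part of the interface above:
class ImageBatchProcessor : public GPUEventProcessor {
public:
    ProcessingResult process_event(const ProcessingEvent& event) override {
        ProcessingResult result;
        result.event_id = event.event_id;
        auto start = std::chrono::steady_clock::now();
        try {
            result.result_data = run_gpu_batch(event.payload); // hypothetical kernel wrapper
            result.success = true;
        } catch (const std::exception&) {
            result.success = false;   // a real service would classify and log here
        }
        result.processing_time = std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::steady_clock::now() - start);
        return result;
    }
    bool healthcheck() override { return true; } // e.g., a trivial kernel probe in practice
    std::map<std::string, float> get_metrics() override { return {}; }
private:
    std::vector<uint8_t> run_gpu_batch(const std::vector<uint8_t>& payload); // assumed helper
};
Note the processor keeps no state between calls, matching the stateless-design principle above.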
Resource Isolation Strategies:
- Hardware Isolation: Dedicated GPU devices per tenant
- Virtual GPU: GPU partitioning with MIG (Multi-Instance GPU)
- Time-Sharing: Scheduled access to GPU resources
- Memory Isolation: Separate memory spaces per tenant
- Container Isolation: Containerized GPU access control
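As a sketch of the hardware-isolation strategy from the list above, the registry below pins each tenant to a dedicated device index; the TenantDeviceRegistry name and tenant-to-device map are hypothetical. With MIG, the same pattern applies to MIG instance UUIDs exposed through CUDA_VISIBLE_DEVICES instead.
#include <cuda_runtime.h>
#include <stdexcept>
#include <string>
#include <unordered_map>

class TenantDeviceRegistry {
public:
    void assign(const std::string& tenant_id, int device_index) {
        tenant_to_device_[tenant_id] = device_index;
    }
    // Bind the calling thread to the tenant's device before any
    // allocation or kernel launch.
    void activate(const std::string& tenant_id) {
        auto it = tenant_to_device_.find(tenant_id);
        if (it == tenant_to_device_.end()) {
            throw std::runtime_error("No GPU assigned to tenant " + tenant_id);
        }
        if (cudaSetDevice(it->second) != cudaSuccess) {
            throw std::runtime_error("Failed to select device for tenant");
        }
    }
private:
    std::unordered_map<std::string, int> tenant_to_device_;
};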
Fair Scheduling Implementation:
class TenantGPUScheduler {
private:
struct TenantQuota {
std::string tenant_id;
float gpu_hours_allocated;
float gpu_hours_used;
int priority_level;
std::chrono::system_clock::time_point quota_reset_time;
};
std::unordered_map<std::string, TenantQuota> tenant_quotas;
std::priority_queue<SchedulingRequest> pending_requests;
public:
bool schedule_gpu_work(const std::string& tenant_id,
const GPUWorkload& workload);
void update_usage_metrics(const std::string& tenant_id,
float gpu_hours_consumed);
TenantQuota get_tenant_status(const std::string& tenant_id);
};
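A hedged sketch of how schedule_gpu_work could enforce quotas follows; the estimated_gpu_hours field on GPUWorkload, the SchedulingRequest field layout, the 24-hour window, and the dispatch_to_gpu helper are assumptions, not part of the declaration above:
bool TenantGPUScheduler::schedule_gpu_work(const std::string& tenant_id,
                                           const GPUWorkload& workload) {
    auto it = tenant_quotas.find(tenant_id);
    if (it == tenant_quotas.end()) {
        return false;                                 // unknown tenant
    }
    TenantQuota& quota = it->second;
    auto now = std::chrono::system_clock::now();
    if (now >= quota.quota_reset_time) {
        quota.gpu_hours_used = 0.0f;                  // start a new accounting window
        quota.quota_reset_time = now + std::chrono::hours(24);
    }
    if (quota.gpu_hours_used + workload.estimated_gpu_hours >
        quota.gpu_hours_allocated) {
        // Over quota: defer by priority rather than reject outright
        pending_requests.push({tenant_id, quota.priority_level, workload});
        return false;
    }
    dispatch_to_gpu(workload);                        // hypothetical dispatch helper
    quota.gpu_hours_used += workload.estimated_gpu_hours;
    return true;
}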
Circuit Breaker Implementation Strategy:
class GPUCircuitBreaker {
public:
enum class State { CLOSED, OPEN, HALF_OPEN };
struct Configuration {
int failure_threshold = 5;
std::chrono::seconds timeout = std::chrono::seconds(60);
int success_threshold = 3; // for half-open state
};
private:
State current_state = State::CLOSED;
int failure_count = 0;
int success_count = 0;
std::chrono::system_clock::time_point last_failure_time;
Configuration config;
public:
template<typename Func>
auto execute(Func&& func) -> decltype(func()) {
if (current_state == State::OPEN) {
if (should_attempt_reset()) {
current_state = State::HALF_OPEN;
} else {
throw CircuitBreakerOpenException();
}
}
try {
auto result = func();
on_success();
return result;
} catch (...) {
on_failure();
throw;
}
}
};
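A usage sketch for the breaker, assuming hypothetical Request and InferenceResult types and a launch_inference_kernel helper:
GPUCircuitBreaker breaker;

InferenceResult run_inference(const Request& request) {
    try {
        // Repeated GPU failures open the breaker and shed load fast
        return breaker.execute([&] {
            return launch_inference_kernel(request);  // hypothetical GPU call
        });
    } catch (const CircuitBreakerOpenException&) {
        // Fail fast while the device recovers; callers fall back to a
        // CPU path or a cached result
        return InferenceResult::unavailable();
    }
}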
Error Classification:
- Transient Errors: Temporary GPU state issues
  - Memory allocation failures
  - Thermal throttling
  - Driver communication timeouts
- Permanent Errors: Hardware or configuration issues
  - GPU hardware failures
  - Driver corruption
  - Invalid memory access patterns
- Resource Exhaustion: Capacity limitations
  - Out of GPU memory
  - Compute unit saturation
  - Bandwidth limitations
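For the transient category only, a small retry helper with exponential backoff is often enough; a minimal sketch (permanent errors should not be retried, so callers classify first):
#include <chrono>
#include <thread>

template <typename Op>
bool retry_transient(Op&& op, int max_attempts = 4,
                     std::chrono::milliseconds base_delay = std::chrono::milliseconds(100)) {
    for (int attempt = 0; attempt < max_attempts; ++attempt) {
        if (op()) {
            return true;              // op reports success
        }
        if (attempt + 1 < max_attempts) {
            std::this_thread::sleep_for(base_delay * (1 << attempt)); // 100ms, 200ms, 400ms, ...
        }
    }
    return false;
}
A fuller classification-driven framework looks like the following.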
Error Handling Framework:
class ProductionGPUErrorHandler {
public:
enum class ErrorCategory {
TRANSIENT_RECOVERABLE,
TRANSIENT_NON_RECOVERABLE,
PERMANENT_HARDWARE,
RESOURCE_EXHAUSTION,
CONFIGURATION_ERROR
};
struct ErrorContext {
ErrorCategory category;
std::string error_message;
std::string gpu_device_id;
std::chrono::system_clock::time_point timestamp;
std::map<std::string, std::string> diagnostic_info;
};
struct RecoveryStrategy {
int max_retry_attempts;
std::chrono::milliseconds retry_delay;
std::function<bool()> recovery_action;
std::function<void()> fallback_action;
};
private:
std::unordered_map<ErrorCategory, RecoveryStrategy> recovery_strategies;
public:
bool handle_gpu_error(const GPUException& error,
const std::string& operation_context);
void register_recovery_strategy(ErrorCategory category,
const RecoveryStrategy& strategy);
ErrorContext classify_error(const GPUException& error);
};
Memory Allocation with Recovery:
class GPUMemoryManager {
private:
struct MemoryPool {
size_t total_size;
size_t used_size;
std::vector<void*> free_blocks;
std::chrono::system_clock::time_point last_gc_time;
};
std::vector<MemoryPool> memory_pools;
public:
void* allocate_with_recovery(size_t size, int max_retries = 3) {
for (int attempt = 0; attempt < max_retries; ++attempt) {
try {
return allocate_from_pool(size);
} catch (const OutOfMemoryException& e) {
if (attempt < max_retries - 1) {
// Try recovery strategies
if (!try_garbage_collection()) {
if (!try_memory_defragmentation()) {
try_emergency_memory_cleanup();
}
}
std::this_thread::sleep_for(
std::chrono::milliseconds(100 * (attempt + 1)));
} else {
throw; // Final attempt failed
}
}
}
throw std::runtime_error("Memory allocation failed after retries");
}
};
Graceful Degradation:
class AdaptiveGPUService {
public:
enum class ProcessingMode {
FULL_GPU, // All processing on GPU
HYBRID_CPU_GPU, // Critical parts on GPU, rest on CPU
CPU_ONLY, // Full CPU fallback
CACHED_RESULTS, // Return cached/approximate results
SERVICE_UNAVAILABLE // Graceful service degradation
};
private:
ProcessingMode current_mode = ProcessingMode::FULL_GPU;
std::chrono::system_clock::time_point last_health_check;
public:
ProcessResult process_request(const ProcessRequest& request) {
auto mode = determine_processing_mode();
switch (mode) {
case ProcessingMode::FULL_GPU:
return process_on_gpu(request);
case ProcessingMode::HYBRID_CPU_GPU:
return process_hybrid(request);
case ProcessingMode::CPU_ONLY:
return process_on_cpu(request);
case ProcessingMode::CACHED_RESULTS:
return get_cached_result(request);
default:
return ProcessResult::create_error("Service temporarily unavailable");
}
}
};
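The mode-selection step is called but left abstract above; a hedged sketch of determine_processing_mode, where HealthSnapshot and probe_gpu_health are assumed helpers and the thresholds are illustrative:
AdaptiveGPUService::ProcessingMode AdaptiveGPUService::determine_processing_mode() {
    HealthSnapshot h = probe_gpu_health();            // hypothetical health probe
    if (!h.driver_responsive)      return ProcessingMode::SERVICE_UNAVAILABLE;
    if (!h.gpu_available)          return ProcessingMode::CPU_ONLY;
    if (h.memory_usage > 0.95f)    return ProcessingMode::CACHED_RESULTS;
    if (h.gpu_utilization > 0.90f) return ProcessingMode::HYBRID_CPU_GPU;
    return ProcessingMode::FULL_GPU;
}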
Production Dockerfile:
# Multi-stage build for production GPU applications
FROM nvidia/cuda:12.9.1-devel-ubuntu22.04 AS builder
# Install dependencies
RUN apt-get update && apt-get install -y \
cmake \
build-essential \
wget \
&& rm -rf /var/lib/apt/lists/*
# Build application
COPY . /src
WORKDIR /src
RUN mkdir build && cd build && \
cmake .. -DCMAKE_BUILD_TYPE=Release && \
make -j$(nproc)
# Production runtime image
FROM nvidia/cuda:12.9.1-runtime-ubuntu22.04
# Install runtime dependencies only
RUN apt-get update && apt-get install -y \
libgomp1 \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
# Create non-root user for security
RUN groupadd -r gpuapp && useradd -r -g gpuapp gpuapp
# Copy application binary
COPY --from=builder /src/build/gpu_app /usr/local/bin/
COPY --from=builder /src/config/ /opt/app/config/
# Set resource limits and security context
USER gpuapp
WORKDIR /opt/app
# Health check configuration
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
CMD ["/usr/local/bin/gpu_app", "--health-check"]
# Runtime configuration
EXPOSE 8080
CMD ["/usr/local/bin/gpu_app", "--config=/opt/app/config/production.yaml"]
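The HEALTHCHECK above assumes the binary exits non-zero when unhealthy. A minimal sketch of that entry-point flag (the run_health_check name is invented; the CUDA calls are real):
#include <cstring>
#include <cuda_runtime.h>

// Exit 0 when a GPU is visible and the driver answers queries,
// non-zero otherwise.
int run_health_check() {
    int device_count = 0;
    if (cudaGetDeviceCount(&device_count) != cudaSuccess || device_count == 0) {
        return 1;                     // unhealthy: no usable GPU
    }
    size_t free_bytes = 0, total_bytes = 0;
    if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) {
        return 1;                     // unhealthy: driver not responding
    }
    return 0;                         // healthy
}

int main(int argc, char** argv) {
    if (argc > 1 && std::strcmp(argv[1], "--health-check") == 0) {
        return run_health_check();
    }
    // ... normal service startup elided ...
    return 0;
}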
Kubernetes Deployment Manifest:
apiVersion: apps/v1
kind: Deployment
metadata:
name: gpu-service
namespace: production
spec:
replicas: 3
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
maxUnavailable: 0
selector:
matchLabels:
app: gpu-service
template:
metadata:
labels:
app: gpu-service
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9090"
prometheus.io/path: "/metrics"
spec:
nodeSelector:
accelerator: nvidia-tesla-v100
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
containers:
- name: gpu-service
image: company/gpu-service:v2.1.0
ports:
- containerPort: 8080
name: http
- containerPort: 9090
name: metrics
resources:
requests:
nvidia.com/gpu: 1
memory: 8Gi
cpu: 2
limits:
nvidia.com/gpu: 1
memory: 16Gi
cpu: 4
env:
- name: GPU_MEMORY_FRACTION
value: "0.8"
- name: LOG_LEVEL
value: "INFO"
readinessProbe:
httpGet:
path: /health/ready
port: 8080
initialDelaySeconds: 30
periodSeconds: 10
livenessProbe:
httpGet:
path: /health/live
port: 8080
initialDelaySeconds: 60
periodSeconds: 30
volumeMounts:
- name: config-volume
mountPath: /opt/app/config
readOnly: true
volumes:
- name: config-volume
configMap:
name: gpu-service-config
Jenkins CI/CD Pipeline:
pipeline {
agent {
label 'gpu-enabled'
}
environment {
DOCKER_REGISTRY = 'registry.company.com'
IMAGE_NAME = 'gpu-service'
KUBECONFIG = credentials('k8s-prod-config')
}
stages {
stage('Checkout') {
steps {
checkout scm
}
}
stage('Build and Test') {
parallel {
stage('Build Application') {
steps {
sh '''
mkdir -p build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Release
make -j$(nproc)
'''
}
}
stage('Run Unit Tests') {
steps {
sh '''
cd build
ctest --output-on-failure --parallel $(nproc)
'''
}
post {
always {
junit 'build/test-results/*.xml'
}
}
}
stage('GPU Integration Tests') {
steps {
sh '''
nvidia-smi
cd build
./integration_tests --gpu-enabled
'''
}
}
}
}
stage('Security Scanning') {
steps {
sh '''
# Scan the source tree and dependencies here; the container image
# itself is scanned after the "Build Docker Image" stage produces it
trivy fs --severity HIGH,CRITICAL .
'''
}
}
stage('Performance Benchmarking') {
steps {
sh '''
cd build
./performance_benchmark --iterations=10 --output=benchmark-results.json
'''
archiveArtifacts artifacts: 'build/benchmark-results.json'
}
}
stage('Build Docker Image') {
steps {
script {
def image = docker.build("${DOCKER_REGISTRY}/${IMAGE_NAME}:${BUILD_NUMBER}")
docker.withRegistry("https://${DOCKER_REGISTRY}", 'docker-registry-creds') {
image.push()
image.push('latest')
}
}
}
}
stage('Deploy to Staging') {
steps {
sh '''
helm upgrade --install gpu-service-staging ./helm-chart \
--namespace staging \
--set image.tag=${BUILD_NUMBER} \
--set replicaCount=1 \
--wait --timeout=300s
'''
}
}
stage('Staging Tests') {
steps {
sh '''
# Wait for deployment to be ready
kubectl wait --for=condition=available deployment/gpu-service-staging -n staging --timeout=300s
# Run integration tests against staging
./integration_tests --target=https://gpu-service-staging.company.com
'''
}
}
stage('Production Deployment') {
when {
branch 'main'
}
steps {
input message: 'Deploy to production?', ok: 'Deploy'
sh '''
# Blue-green deployment strategy
helm upgrade --install gpu-service-prod ./helm-chart \
--namespace production \
--set image.tag=${BUILD_NUMBER} \
--set replicaCount=5 \
--wait --timeout=600s
'''
}
}
stage('Production Smoke Tests') {
when {
branch 'main'
}
steps {
sh '''
# Basic smoke tests
curl -f https://gpu-service-prod.company.com/health
./smoke_tests --target=production
'''
}
}
}
post {
always {
cleanWs()
}
failure {
slackSend channel: '#gpu-ops',
color: 'danger',
message: "GPU Service pipeline failed: ${env.BUILD_URL}"
}
success {
slackSend channel: '#gpu-ops',
color: 'good',
message: "GPU Service deployed successfully: ${env.BUILD_URL}"
}
}
}
GPU Metrics Collection with NVML:
class GPUMetricsExporter {
private:
struct GPUMetrics {
float utilization_percent;
size_t memory_used_bytes;
size_t memory_total_bytes;
float temperature_celsius;
float power_watts;
int64_t compute_processes;
int64_t memory_processes;
};
std::chrono::system_clock::time_point last_collection_time;
std::vector<GPUMetrics> gpu_metrics;
public:
void collect_gpu_metrics() {
nvmlReturn_t result = nvmlInit(); // in production, initialize NVML once at startup and pair with nvmlShutdown()
if (result != NVML_SUCCESS) {
throw std::runtime_error("Failed to initialize NVML");
}
unsigned int device_count;
result = nvmlDeviceGetCount(&device_count);
gpu_metrics.clear();
for (unsigned int i = 0; i < device_count; ++i) {
nvmlDevice_t device;
nvmlDeviceGetHandleByIndex(i, &device);
GPUMetrics metrics;
// Utilization
nvmlUtilization_t utilization;
nvmlDeviceGetUtilizationRates(device, &utilization);
metrics.utilization_percent = utilization.gpu;
// Memory
nvmlMemory_t memory_info;
nvmlDeviceGetMemoryInfo(device, &memory_info);
metrics.memory_used_bytes = memory_info.used;
metrics.memory_total_bytes = memory_info.total;
// Temperature
unsigned int temperature;
nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature);
metrics.temperature_celsius = temperature;
// Power
unsigned int power;
nvmlDeviceGetPowerUsage(device, &power);
metrics.power_watts = power / 1000.0f;
// Process count
unsigned int info_count = 64;
nvmlProcessInfo_t process_infos[64];
nvmlDeviceGetComputeRunningProcesses(device, &info_count, process_infos);
metrics.compute_processes = info_count;
gpu_metrics.push_back(metrics);
}
last_collection_time = std::chrono::system_clock::now();
}
std::string export_prometheus_format() {
std::stringstream ss;
for (size_t i = 0; i < gpu_metrics.size(); ++i) {
const auto& metrics = gpu_metrics[i];
ss << "# HELP gpu_utilization_percent GPU utilization percentage\n";
ss << "# TYPE gpu_utilization_percent gauge\n";
ss << "gpu_utilization_percent{device=\"" << i << "\"} "
<< metrics.utilization_percent << "\n";
ss << "# HELP gpu_memory_used_bytes GPU memory used in bytes\n";
ss << "# TYPE gpu_memory_used_bytes gauge\n";
ss << "gpu_memory_used_bytes{device=\"" << i << "\"} "
<< metrics.memory_used_bytes << "\n";
ss << "# HELP gpu_temperature_celsius GPU temperature in Celsius\n";
ss << "# TYPE gpu_temperature_celsius gauge\n";
ss << "gpu_temperature_celsius{device=\"" << i << "\"} "
<< metrics.temperature_celsius << "\n";
// ... additional metrics
}
return ss.str();
}
};
Application-Level Metrics:
class ApplicationMetrics {
private:
std::atomic<uint64_t> request_count{0};
std::atomic<uint64_t> error_count{0};
std::atomic<double> total_processing_time{0.0};
mutable std::mutex histogram_mutex; // mutable: get_summary() is const and must lock
std::map<int, int> latency_histogram; // milliseconds -> count
public:
void record_request(std::chrono::milliseconds processing_time, bool success) {
request_count.fetch_add(1);
if (!success) {
error_count.fetch_add(1);
}
double time_ms = static_cast<double>(processing_time.count());
total_processing_time.fetch_add(time_ms); // std::atomic<double>::fetch_add requires C++20
// Update histogram; guard against log2(0) for sub-millisecond requests
std::lock_guard<std::mutex> lock(histogram_mutex);
int bucket = static_cast<int>(std::log2(std::max(time_ms, 1.0))) + 1;
latency_histogram[bucket]++;
}
struct MetricsSummary {
uint64_t total_requests;
uint64_t error_count;
double error_rate;
double average_latency_ms;
std::map<int, int> latency_distribution;
};
MetricsSummary get_summary() const {
MetricsSummary summary;
summary.total_requests = request_count.load();
summary.error_count = error_count.load();
summary.error_rate = summary.total_requests > 0 ?
static_cast<double>(summary.error_count) / summary.total_requests : 0.0;
summary.average_latency_ms = summary.total_requests > 0 ?
total_processing_time.load() / summary.total_requests : 0.0;
std::lock_guard<std::mutex> lock(histogram_mutex);
summary.latency_distribution = latency_histogram;
return summary;
}
};
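A usage sketch for the metrics object above; the Request type and the process handler are placeholders:
ApplicationMetrics app_metrics;

void handle_request(const Request& request) {
    auto start = std::chrono::steady_clock::now();
    bool ok = process(request);       // placeholder request handler
    app_metrics.record_request(
        std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::steady_clock::now() - start),
        ok);
}
A /metrics endpoint then calls get_summary() to export the aggregates.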
Distributed Tracing for GPU Operations:
class GPUOperationTracer {
private:
opentelemetry::nostd::shared_ptr<opentelemetry::trace::Tracer> tracer; // opentelemetry-cpp returns nostd::shared_ptr
public:
GPUOperationTracer() {
auto provider = opentelemetry::trace::Provider::GetTracerProvider();
tracer = provider->GetTracer("gpu-service", "1.0.0");
}
template<typename Func>
auto trace_gpu_operation(const std::string& operation_name, Func&& func) {
auto span = tracer->StartSpan(operation_name);
auto scope = tracer->WithActiveSpan(span);
// Set GPU-specific attributes
span->SetAttribute("gpu.device_id", get_current_device_id());
span->SetAttribute("gpu.memory_allocated", get_allocated_memory());
span->SetAttribute("gpu.compute_capability", get_compute_capability());
try {
auto start_time = std::chrono::high_resolution_clock::now();
auto result = func();
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(
end_time - start_time);
span->SetAttribute("gpu.execution_time_us", duration.count());
span->SetStatus(opentelemetry::trace::StatusCode::kOk);
return result;
} catch (const std::exception& e) {
span->SetStatus(opentelemetry::trace::StatusCode::kError, e.what());
span->SetAttribute("error.type", typeid(e).name());
span->SetAttribute("error.message", e.what());
throw;
}
}
};
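A usage sketch for the tracer; the launch_reduce_kernel wrapper is an assumed helper:
GPUOperationTracer tracer;

float reduce_on_gpu(const std::vector<float>& input) {
    // Each GPU operation runs inside its own span, so kernel-level
    // latency and device attributes land directly in the trace
    return tracer.trace_gpu_operation("reduce_sum", [&] {
        return launch_reduce_kernel(input);           // hypothetical kernel wrapper
    });
}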
Horizontal Pod Autoscaler Configuration:
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: gpu-service-hpa
namespace: production
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: gpu-service
minReplicas: 2
maxReplicas: 20
behavior:
scaleUp:
stabilizationWindowSeconds: 300
policies:
- type: Percent
value: 50
periodSeconds: 60
scaleDown:
stabilizationWindowSeconds: 600
policies:
- type: Percent
value: 10
periodSeconds: 60
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
- type: External
external:
metric:
name: gpu_utilization_percent
selector:
matchLabels:
service: gpu-service
target:
type: AverageValue
averageValue: "75"
- type: External
external:
metric:
name: gpu_memory_utilization_percent
selector:
matchLabels:
service: gpu-service
target:
type: AverageValue
averageValue: "80"
GPU-Aware Load Balancing:
class GPUAwareLoadBalancer {
private:
struct GPUNodeStatus {
std::string node_id;
int available_gpus;
float average_gpu_utilization;
float average_memory_utilization;
int active_connections;
std::chrono::system_clock::time_point last_health_check;
};
std::vector<GPUNodeStatus> gpu_nodes;
std::mutex nodes_mutex;
public:
std::string select_optimal_node(const WorkloadRequirements& requirements) {
std::lock_guard<std::mutex> lock(nodes_mutex);
// Filter nodes that can handle the workload
std::vector<GPUNodeStatus*> eligible_nodes;
for (auto& node : gpu_nodes) {
if (node.available_gpus >= requirements.required_gpus &&
node.average_gpu_utilization < 80.0f &&
is_node_healthy(node)) {
eligible_nodes.push_back(&node);
}
}
if (eligible_nodes.empty()) {
throw std::runtime_error("No eligible GPU nodes available");
}
// Select node with best resource availability
std::sort(eligible_nodes.begin(), eligible_nodes.end(),
[this](const GPUNodeStatus* a, const GPUNodeStatus* b) {
// capture `this`: calculate_node_score is a non-static member
float score_a = calculate_node_score(*a);
float score_b = calculate_node_score(*b);
return score_a > score_b; // Higher score is better
});
return eligible_nodes[0]->node_id;
}
private:
float calculate_node_score(const GPUNodeStatus& node) {
// Weighted scoring based on multiple factors
float gpu_availability = (100.0f - node.average_gpu_utilization) / 100.0f;
float memory_availability = (100.0f - node.average_memory_utilization) / 100.0f;
float connection_load = 1.0f / (1.0f + node.active_connections * 0.1f);
return gpu_availability * 0.5f +
memory_availability * 0.3f +
connection_load * 0.2f;
}
};
GPU Resource Pooling:
class GPUResourcePool {
private:
struct GPUResource {
void* gpu_memory;
size_t memory_size;
cudaStream_t stream;
bool in_use;
std::chrono::system_clock::time_point last_used;
};
std::vector<GPUResource> resource_pool;
std::mutex pool_mutex;
std::condition_variable resource_available;
size_t max_pool_size = 16; // cap on pool growth, referenced in acquire_resource (value illustrative)
public:
class ResourceLease {
GPUResourcePool* pool;
size_t resource_index;
public:
ResourceLease(GPUResourcePool* p, size_t idx) : pool(p), resource_index(idx) {}
~ResourceLease() {
if (pool) {
pool->return_resource(resource_index);
}
}
GPUResource& get() {
return pool->resource_pool[resource_index];
}
// Move semantics
ResourceLease(ResourceLease&& other) noexcept :
pool(other.pool), resource_index(other.resource_index) {
other.pool = nullptr;
}
};
ResourceLease acquire_resource(size_t required_memory,
std::chrono::milliseconds timeout) {
std::unique_lock<std::mutex> lock(pool_mutex);
auto deadline = std::chrono::system_clock::now() + timeout;
while (true) {
// Look for available resource
for (size_t i = 0; i < resource_pool.size(); ++i) {
auto& resource = resource_pool[i];
if (!resource.in_use && resource.memory_size >= required_memory) {
resource.in_use = true;
resource.last_used = std::chrono::system_clock::now();
return ResourceLease(this, i);
}
}
// Try to create new resource if pool not at capacity
if (resource_pool.size() < max_pool_size) {
if (create_new_resource(required_memory)) {
size_t new_index = resource_pool.size() - 1;
resource_pool[new_index].in_use = true;
return ResourceLease(this, new_index);
}
}
// Wait for resource to become available
if (resource_available.wait_until(lock, deadline) ==
std::cv_status::timeout) {
throw std::runtime_error("Timeout waiting for GPU resource");
}
}
}
private:
void return_resource(size_t index) {
std::lock_guard<std::mutex> lock(pool_mutex);
resource_pool[index].in_use = false;
resource_available.notify_one();
}
};
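A usage sketch for the pool; the launch_workload_on_stream helper is an assumption:
void run_pooled_workload(GPUResourcePool& pool) {
    // The RAII lease returns the resource to the pool automatically,
    // even if the workload throws
    auto lease = pool.acquire_resource(256ull << 20,  // 256 MiB
                                       std::chrono::milliseconds(5000));
    launch_workload_on_stream(lease.get().stream);    // placeholder launch
}   // lease destructor releases the resource here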
Secure GPU Memory Management:
class SecureGPUMemoryManager {
private:
struct SecureMemoryRegion {
void* gpu_ptr;
size_t size;
std::string tenant_id;
bool encrypted;
std::chrono::system_clock::time_point created_time;
std::chrono::system_clock::time_point last_access;
};
std::unordered_map<void*, SecureMemoryRegion> memory_regions;
std::mutex memory_mutex;
public:
void* allocate_secure_memory(size_t size, const std::string& tenant_id,
bool encrypt = true) {
std::lock_guard<std::mutex> lock(memory_mutex);
void* gpu_ptr = nullptr;
cudaError_t result = cudaMalloc(&gpu_ptr, size);
if (result != cudaSuccess) {
throw std::runtime_error("Failed to allocate GPU memory");
}
// Zero out memory for security
cudaMemset(gpu_ptr, 0, size);
// Apply memory protection if supported
if (encrypt) {
apply_memory_encryption(gpu_ptr, size);
}
SecureMemoryRegion region;
region.gpu_ptr = gpu_ptr;
region.size = size;
region.tenant_id = tenant_id;
region.encrypted = encrypt;
region.created_time = std::chrono::system_clock::now();
region.last_access = region.created_time;
memory_regions[gpu_ptr] = region;
// Log allocation for audit trail
log_memory_allocation(tenant_id, gpu_ptr, size, encrypt);
return gpu_ptr;
}
void deallocate_secure_memory(void* gpu_ptr, const std::string& tenant_id) {
std::lock_guard<std::mutex> lock(memory_mutex);
auto it = memory_regions.find(gpu_ptr);
if (it == memory_regions.end()) {
throw std::runtime_error("Invalid GPU memory pointer");
}
// Verify tenant ownership
if (it->second.tenant_id != tenant_id) {
log_security_violation(tenant_id, "Attempted to free memory owned by another tenant");
throw std::runtime_error("Access denied: memory not owned by tenant");
}
// Secure memory wipe before deallocation
secure_memory_wipe(gpu_ptr, it->second.size);
cudaFree(gpu_ptr);
memory_regions.erase(it);
log_memory_deallocation(tenant_id, gpu_ptr);
}
private:
void secure_memory_wipe(void* gpu_ptr, size_t size) {
// Multiple pass secure wipe
for (int pass = 0; pass < 3; ++pass) {
uint8_t pattern = (pass == 0) ? 0x00 : (pass == 1) ? 0xFF : 0xAA;
cudaMemset(gpu_ptr, pattern, size);
cudaDeviceSynchronize();
}
}
};
Compliance Audit Logging:
class ComplianceAuditLogger {
public:
enum class EventType {
GPU_RESOURCE_ACCESS,
DATA_PROCESSING,
MEMORY_ALLOCATION,
SECURITY_VIOLATION,
PERFORMANCE_ANOMALY,
CONFIGURATION_CHANGE
};
struct AuditEvent {
std::string event_id;
EventType event_type;
std::string tenant_id;
std::string user_id;
std::chrono::system_clock::time_point timestamp;
std::string description;
std::map<std::string, std::string> metadata;
std::string gpu_device_id;
};
private:
std::queue<AuditEvent> event_queue;
std::mutex queue_mutex;
std::condition_variable queue_not_empty;
std::thread audit_writer_thread;
bool shutdown_requested = false;
public:
void log_audit_event(const AuditEvent& event) {
std::lock_guard<std::mutex> lock(queue_mutex);
event_queue.push(event);
queue_not_empty.notify_one();
}
void log_gpu_access(const std::string& tenant_id, const std::string& user_id,
const std::string& operation, const std::string& gpu_device) {
AuditEvent event;
event.event_id = generate_uuid();
event.event_type = EventType::GPU_RESOURCE_ACCESS;
event.tenant_id = tenant_id;
event.user_id = user_id;
event.timestamp = std::chrono::system_clock::now();
event.description = "GPU resource access: " + operation;
event.gpu_device_id = gpu_device;
event.metadata["operation"] = operation;
log_audit_event(event);
}
void log_data_processing(const std::string& tenant_id,
const std::string& data_classification,
size_t data_size_bytes) {
AuditEvent event;
event.event_id = generate_uuid();
event.event_type = EventType::DATA_PROCESSING;
event.tenant_id = tenant_id;
event.timestamp = std::chrono::system_clock::now();
event.description = "Data processing operation";
event.metadata["data_classification"] = data_classification;
event.metadata["data_size_bytes"] = std::to_string(data_size_bytes);
log_audit_event(event);
}
private:
void audit_writer_loop() {
while (!shutdown_requested) {
std::unique_lock<std::mutex> lock(queue_mutex);
queue_not_empty.wait(lock, [this] {
return !event_queue.empty() || shutdown_requested;
});
while (!event_queue.empty()) {
auto event = event_queue.front();
event_queue.pop();
lock.unlock();
write_audit_event_to_storage(event);
lock.lock();
}
}
}
void write_audit_event_to_storage(const AuditEvent& event) {
// Write to tamper-proof audit log storage
// Implementation depends on compliance requirements
// (e.g., AWS CloudTrail, Azure Monitor, Splunk, etc.)
}
};
Production Configuration Management:
class ProductionConfigManager {
private:
struct GPUConfiguration {
float memory_pool_fraction = 0.8f;
int max_concurrent_streams = 8;
bool enable_peer_to_peer = true;
int compute_mode = 0; // Default compute mode
bool enable_profiling = false;
std::string logging_level = "INFO";
std::chrono::seconds health_check_interval{30};
};
GPUConfiguration config;
std::string config_file_path;
std::mutex config_mutex;
public:
bool load_configuration(const std::string& config_path) {
std::lock_guard<std::mutex> lock(config_mutex);
try {
// Load from YAML/JSON configuration file
YAML::Node config_node = YAML::LoadFile(config_path);
config.memory_pool_fraction = config_node["gpu"]["memory_pool_fraction"].as<float>();
config.max_concurrent_streams = config_node["gpu"]["max_concurrent_streams"].as<int>();
config.enable_peer_to_peer = config_node["gpu"]["enable_p2p"].as<bool>();
config.compute_mode = config_node["gpu"]["compute_mode"].as<int>();
config.enable_profiling = config_node["monitoring"]["enable_profiling"].as<bool>();
config.logging_level = config_node["logging"]["level"].as<std::string>();
config_file_path = config_path;
return true;
} catch (const std::exception& e) {
std::cerr << "Failed to load configuration: " << e.what() << std::endl;
return false;
}
}
void apply_gpu_configuration() {
std::lock_guard<std::mutex> lock(config_mutex);
// Set memory pool size
size_t total_memory, free_memory;
cudaMemGetInfo(&free_memory, &total_memory);
size_t pool_size = static_cast<size_t>(total_memory * config.memory_pool_fraction);
// Configure memory pool
cudaMemPoolProps pool_props = {};
pool_props.allocType = cudaMemAllocationTypePinned;
pool_props.handleTypes = cudaMemHandleTypeNone;
pool_props.location.type = cudaMemLocationTypeDevice;
pool_props.location.id = 0;
cudaMemPool_t memory_pool;
cudaMemPoolCreate(&memory_pool, &pool_props);
// cudaMemPoolAttrReservedMemCurrent is a read-only query; the settable
// sizing knob is the release threshold
cudaMemPoolSetAttribute(memory_pool, cudaMemPoolAttrReleaseThreshold, &pool_size);
// Set device limits (kernel stack size and device-side malloc heap)
cudaDeviceSetLimit(cudaLimitStackSize, 8192);
cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128 * 1024 * 1024);
// Configure P2P if multiple GPUs
if (config.enable_peer_to_peer) {
enable_p2p_access();
}
}
private:
void enable_p2p_access() {
int device_count;
cudaGetDeviceCount(&device_count);
for (int i = 0; i < device_count; ++i) {
for (int j = 0; j < device_count; ++j) {
if (i != j) {
int can_access_peer;
cudaDeviceCanAccessPeer(&can_access_peer, i, j);
if (can_access_peer) {
cudaSetDevice(i);
cudaDeviceEnablePeerAccess(j, 0);
}
}
}
}
}
};
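For reference, a production.yaml matching the keys parsed in load_configuration might look like this (values illustrative):
gpu:
  memory_pool_fraction: 0.8
  max_concurrent_streams: 8
  enable_p2p: true
  compute_mode: 0
monitoring:
  enable_profiling: false
logging:
  level: "INFO"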
Production Health Monitoring:
class ProductionHealthChecker {
private:
struct HealthStatus {
bool gpu_available = false;
bool memory_sufficient = false;
bool compute_functional = false;
bool driver_responsive = false;
float gpu_temperature = 0.0f;
float memory_usage_percent = 0.0f;
std::string status_message;
std::chrono::system_clock::time_point last_check;
};
HealthStatus current_status;
std::mutex status_mutex;
public:
struct HealthCheckResult {
enum class Status { HEALTHY, DEGRADED, UNHEALTHY };
Status overall_status;
std::map<std::string, std::string> component_status;
std::string message;
};
HealthCheckResult perform_health_check() {
std::lock_guard<std::mutex> lock(status_mutex);
HealthCheckResult result;
result.overall_status = HealthCheckResult::Status::HEALTHY;
// Check GPU availability
int device_count;
cudaError_t error = cudaGetDeviceCount(&device_count);
if (error != cudaSuccess || device_count == 0) {
result.overall_status = HealthCheckResult::Status::UNHEALTHY;
result.component_status["gpu_availability"] = "FAILED";
result.message += "No GPU devices available. ";
} else {
result.component_status["gpu_availability"] = "OK";
}
// Check memory availability
size_t free_memory, total_memory;
error = cudaMemGetInfo(&free_memory, &total_memory);
if (error == cudaSuccess) {
float memory_usage = 1.0f - (static_cast<float>(free_memory) / total_memory);
current_status.memory_usage_percent = memory_usage * 100.0f;
if (memory_usage > 0.95f) {
result.overall_status = HealthCheckResult::Status::DEGRADED;
result.component_status["memory"] = "WARNING";
result.message += "GPU memory usage > 95%. ";
} else {
result.component_status["memory"] = "OK";
}
} else {
result.overall_status = HealthCheckResult::Status::UNHEALTHY;
result.component_status["memory"] = "FAILED";
result.message += "Cannot query GPU memory. ";
}
// Check compute functionality with simple kernel
if (!test_compute_functionality()) {
result.overall_status = HealthCheckResult::Status::UNHEALTHY;
result.component_status["compute"] = "FAILED";
result.message += "GPU compute test failed. ";
} else {
result.component_status["compute"] = "OK";
}
// Check temperature (if NVML available)
float temperature = get_gpu_temperature();
if (temperature > 85.0f) {
result.overall_status = std::max(result.overall_status,
HealthCheckResult::Status::DEGRADED);
result.component_status["temperature"] = "WARNING";
result.message += "High GPU temperature. ";
} else if (temperature > 0.0f) {
result.component_status["temperature"] = "OK";
}
current_status.last_check = std::chrono::system_clock::now();
if (result.message.empty()) {
result.message = "All systems operational";
}
return result;
}
private:
bool test_compute_functionality() {
try {
// Simple compute test
const int N = 1024;
float *d_a, *d_b, *d_c;
float *h_c = new float[N];
cudaMalloc(&d_a, N * sizeof(float));
cudaMalloc(&d_b, N * sizeof(float));
cudaMalloc(&d_c, N * sizeof(float));
// Initialize test data on the host; cudaMemset writes bytes, not
// float values, so memset-ing 1 would NOT produce 1.0f
std::vector<float> h_a(N, 1.0f), h_b(N, 2.0f);
cudaMemcpy(d_a, h_a.data(), N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b.data(), N * sizeof(float), cudaMemcpyHostToDevice);
// Launch simple kernel (vector addition)
dim3 block(256);
dim3 grid((N + block.x - 1) / block.x);
vector_add_kernel<<<grid, block>>>(d_a, d_b, d_c, N);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) {
cleanup_test_memory(d_a, d_b, d_c, h_c);
return false;
}
error = cudaDeviceSynchronize();
if (error != cudaSuccess) {
cleanup_test_memory(d_a, d_b, d_c, h_c);
return false;
}
// Verify result
cudaMemcpy(h_c, d_c, N * sizeof(float), cudaMemcpyDeviceToHost);
bool test_passed = true;
for (int i = 0; i < 10; ++i) { // Check first 10 elements
if (std::abs(h_c[i] - 3.0f) > 1e-5) {
test_passed = false;
break;
}
}
cleanup_test_memory(d_a, d_b, d_c, h_c);
return test_passed;
} catch (...) {
return false;
}
}
void cleanup_test_memory(float* d_a, float* d_b, float* d_c, float* h_c) {
if (d_a) cudaFree(d_a);
if (d_b) cudaFree(d_b);
if (d_c) cudaFree(d_c);
if (h_c) delete[] h_c;
}
};
// Simple test kernel (in a real translation unit, declare it before test_compute_functionality uses it)
__global__ void vector_add_kernel(float* a, float* b, float* c, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
c[idx] = a[idx] + b[idx];
}
}
This comprehensive guide covers all essential aspects of production GPU programming:
- Architecture Design: Enterprise patterns for scalable, maintainable GPU applications
- Error Handling: Robust error recovery and resilience mechanisms
- Deployment: Modern DevOps practices for GPU applications
- Monitoring: Comprehensive observability for GPU workloads
- Scalability: Auto-scaling and load balancing strategies
- Security: Enterprise-grade security and compliance
- Best Practices: Professional configuration and health monitoring
These concepts enable the development of enterprise-grade GPU applications that meet the demanding requirements of production environments while maintaining high performance, reliability, and security standards.