Skip to content

Commit 2943ae0

Browse files
Add terminal error interceptor to prevent retrying non-retryable errors
When a gRPC call returns a status code that indicates a permanent failure (e.g. InvalidArgument, NotFound, PermissionDenied, Unauthenticated), retrying the request will not resolve the issue. Without marking these errors as terminal, the controller-runtime reconciler keeps requeuing the reconciliation indefinitely, wasting resources. Introduce a TerminalErrorInterceptor that wraps errors with non-retryable gRPC status codes as reconcile.TerminalError so the reconciler stops retrying immediately. Also fix error wrapping in GetDeviceByName which previously dropped the underlying error, and add documentation to GetDeviceBySerial.
1 parent f48174a commit 2943ae0

1 file changed

Lines changed: 43 additions & 5 deletions

File tree

internal/deviceutil/deviceutil.go

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,20 @@ import (
88
"crypto/x509"
99
"errors"
1010
"fmt"
11+
"slices"
1112
"time"
1213

1314
"google.golang.org/grpc"
15+
"google.golang.org/grpc/codes"
1416
"google.golang.org/grpc/credentials"
1517
"google.golang.org/grpc/credentials/insecure"
18+
"google.golang.org/grpc/status"
1619
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1720
"k8s.io/apimachinery/pkg/labels"
1821
"k8s.io/apimachinery/pkg/runtime/schema"
1922
ctrl "sigs.k8s.io/controller-runtime"
2023
"sigs.k8s.io/controller-runtime/pkg/client"
24+
"sigs.k8s.io/controller-runtime/pkg/reconcile"
2125

2226
"github.com/ironcore-dev/network-operator/api/core/v1alpha1"
2327
"github.com/ironcore-dev/network-operator/internal/clientutil"
@@ -58,17 +62,19 @@ func GetOwnerDevice(ctx context.Context, r client.Reader, obj metav1.Object) (*v
5862
func GetDeviceByName(ctx context.Context, r client.Reader, namespace, name string) (*v1alpha1.Device, error) {
5963
obj := new(v1alpha1.Device)
6064
if err := r.Get(ctx, client.ObjectKey{Namespace: namespace, Name: name}, obj); err != nil {
61-
return nil, fmt.Errorf("failed to get %s/%s", v1alpha1.GroupVersion.WithKind(v1alpha1.DeviceKind).String(), name)
65+
return nil, fmt.Errorf("failed to get %s/%s: %w", v1alpha1.GroupVersion.WithKind(v1alpha1.DeviceKind).String(), name, err)
6266
}
6367
return obj, nil
6468
}
6569

70+
// GetDeviceBySerial finds and returns a Device object using the specified serial number.
71+
// It returns an error if no device or multiple devices with the same serial number are found.
72+
// Note: This function assumes that the [v1alpha1.DeviceSerialLabel] is unique across all Device objects in the cluster.
6673
func GetDeviceBySerial(ctx context.Context, r client.Reader, namespace, serial string) (*v1alpha1.Device, error) {
6774
deviceList := &v1alpha1.DeviceList{}
6875
listOpts := &client.ListOptions{
6976
LabelSelector: labels.SelectorFromSet(labels.Set{v1alpha1.DeviceSerialLabel: serial}),
7077
}
71-
7278
if err := r.List(ctx, deviceList, listOpts); err != nil {
7379
return nil, fmt.Errorf("failed to list %s objects: %w", v1alpha1.GroupVersion.WithKind(v1alpha1.DeviceKind).String(), err)
7480
}
@@ -82,8 +88,6 @@ func GetDeviceBySerial(ctx context.Context, r client.Reader, namespace, serial s
8288
}
8389

8490
// Connection holds the necessary information to connect to a device's API.
85-
//
86-
// TODO(felix-kaestner): find a better place for this struct, maybe in a 'connection' package?
8791
type Connection struct {
8892
// Address is the API address of the device, in the format "host:port".
8993
Address string
@@ -150,7 +154,7 @@ func NewGrpcClient(ctx context.Context, conn *Connection, o ...Option) (*grpc.Cl
150154
creds = credentials.NewTLS(conn.TLS)
151155
}
152156

153-
opts := []grpc.DialOption{grpc.WithTransportCredentials(creds)}
157+
opts := []grpc.DialOption{grpc.WithTransportCredentials(creds), grpc.WithUnaryInterceptor(TerminalErrorInterceptor())}
154158
if conn.Username != "" && conn.Password != "" {
155159
opts = append(opts, grpc.WithPerRPCCredentials(&auth{
156160
Username: conn.Username,
@@ -216,3 +220,37 @@ func UnaryDefaultTimeoutInterceptor(timeout time.Duration) grpc.UnaryClientInter
216220
return invoker(ctx, method, req, reply, cc, opts...)
217221
}
218222
}
223+
224+
// TerminalErrorInterceptor returns a gRPC unary client interceptor that wraps errors returned by the gRPC invoker
225+
// as terminal errors if their gRPC status code is in the set of non-retryable codes defined in [terminalCodes].
226+
func TerminalErrorInterceptor() grpc.UnaryClientInterceptor {
227+
return func(ctx context.Context, method string, req, reply any, cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
228+
return WrapTerminalError(invoker(ctx, method, req, reply, cc, opts...))
229+
}
230+
}
231+
232+
// WrapTerminalError wraps the given error as a terminal error if its gRPC status error
233+
// with a non-retryable code.
234+
func WrapTerminalError(err error) error {
235+
if statusErr, ok := status.FromError(err); ok && slices.Contains(terminalCodes, statusErr.Code()) {
236+
return reconcile.TerminalError(err)
237+
}
238+
return err
239+
}
240+
241+
// terminalCodes holds the set of gRPC codes that are considered terminal.
242+
// That is, if an error has one of these codes, retrying the operation
243+
// is not expected to succeed.
244+
// This list is based on the gRPC documentation at https://grpc.io/docs/guides/status-codes.
245+
var terminalCodes = []codes.Code{
246+
codes.Unknown,
247+
codes.InvalidArgument,
248+
codes.NotFound,
249+
codes.AlreadyExists,
250+
codes.PermissionDenied,
251+
codes.FailedPrecondition,
252+
codes.OutOfRange,
253+
codes.Unimplemented,
254+
codes.DataLoss,
255+
codes.Unauthenticated,
256+
}

0 commit comments

Comments
 (0)