Skip to content

Commit

Permalink
Fix error when the operator metrics ServiceMonitor already exists (#3447
Browse files Browse the repository at this point in the history
)

* Fix error when the operator metrics ServiceMonitor already exists

Signed-off-by: Israel Blancas <[email protected]>

* Wrap the errors and log them

Signed-off-by: Israel Blancas <[email protected]>

---------

Signed-off-by: Israel Blancas <[email protected]>
  • Loading branch information
Israel Blancas authored Nov 12, 2024
1 parent 1108621 commit 917b096
Show file tree
Hide file tree
Showing 4 changed files with 167 additions and 22 deletions.
19 changes: 19 additions & 0 deletions .chloggen/3446.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: 'bug_fix'

# The name of the component, or a single word describing the area of concern, (e.g. collector, target allocator, auto-instrumentation, opamp, github action)
component: operator

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Operator pod crashed if the Service Monitor for the operator metrics was created before by another operator pod.

# One or more tracking issues related to the change
issues: [3446]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext: |
Operator fails when the pod is restarted and the Service Monitor for operator metrics was already created by another operator pod.
To fix this, the operator now sets the owner reference on the Service Monitor to itself and checks if the Service Monitor already exists.
82 changes: 67 additions & 15 deletions internal/operator-metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,11 @@ import (
"fmt"
"os"

"github.com/go-logr/logr"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/intstr"
Expand Down Expand Up @@ -51,26 +54,85 @@ var _ manager.Runnable = &OperatorMetrics{}

type OperatorMetrics struct {
kubeClient client.Client
log logr.Logger
}

func NewOperatorMetrics(config *rest.Config, scheme *runtime.Scheme) (OperatorMetrics, error) {
func NewOperatorMetrics(config *rest.Config, scheme *runtime.Scheme, log logr.Logger) (OperatorMetrics, error) {
kubeClient, err := client.New(config, client.Options{Scheme: scheme})
if err != nil {
return OperatorMetrics{}, err
}

return OperatorMetrics{
kubeClient: kubeClient,
log: log,
}, nil
}

func (om OperatorMetrics) Start(ctx context.Context) error {
err := om.createOperatorMetricsServiceMonitor(ctx)
if err != nil {
om.log.Error(err, "error creating Service Monitor for operator metrics")
}

return nil
}

func (om OperatorMetrics) NeedLeaderElection() bool {
return true
}

func (om OperatorMetrics) caConfigMapExists() bool {
return om.kubeClient.Get(context.Background(), client.ObjectKey{
Name: caBundleConfigMap,
Namespace: openshiftInClusterMonitoringNamespace,
}, &corev1.ConfigMap{},
) == nil
}

func (om OperatorMetrics) getOwnerReferences(ctx context.Context, namespace string) (metav1.OwnerReference, error) {
var deploymentList appsv1.DeploymentList

listOptions := []client.ListOption{
client.InNamespace(namespace),
client.MatchingLabels(map[string]string{
"app.kubernetes.io/name": "opentelemetry-operator",
"control-plane": "controller-manager",
}),
}

err := om.kubeClient.List(ctx, &deploymentList, listOptions...)
if err != nil {
return metav1.OwnerReference{}, err
}

if len(deploymentList.Items) == 0 {
return metav1.OwnerReference{}, fmt.Errorf("no deployments found with the specified label")
}
deployment := &deploymentList.Items[0]

ownerRef := metav1.OwnerReference{
APIVersion: "apps/v1",
Kind: "Deployment",
Name: deployment.Name,
UID: deployment.UID,
}

return ownerRef, nil
}

func (om OperatorMetrics) createOperatorMetricsServiceMonitor(ctx context.Context) error {
rawNamespace, err := os.ReadFile(namespaceFile)
if err != nil {
return fmt.Errorf("error reading namespace file: %w", err)
}
namespace := string(rawNamespace)

ownerRef, err := om.getOwnerReferences(ctx, namespace)
if err != nil {
return fmt.Errorf("error getting owner references: %w", err)
}

var tlsConfig *monitoringv1.TLSConfig

if om.caConfigMapExists() {
Expand Down Expand Up @@ -101,6 +163,7 @@ func (om OperatorMetrics) Start(ctx context.Context) error {
"app.kubernetes.io/part-of": "opentelemetry-operator",
"control-plane": "controller-manager",
},
OwnerReferences: []metav1.OwnerReference{ownerRef},
},
Spec: monitoringv1.ServiceMonitorSpec{
Selector: metav1.LabelSelector{
Expand All @@ -123,23 +186,12 @@ func (om OperatorMetrics) Start(ctx context.Context) error {
}

err = om.kubeClient.Create(ctx, &sm)
if err != nil {
return fmt.Errorf("error creating service monitor: %w", err)
// The ServiceMonitor can be already there if this is a restart
if err != nil && !apierrors.IsAlreadyExists(err) {
return err
}

<-ctx.Done()

return om.kubeClient.Delete(ctx, &sm)
}

func (om OperatorMetrics) NeedLeaderElection() bool {
return true
}

func (om OperatorMetrics) caConfigMapExists() bool {
return om.kubeClient.Get(context.Background(), client.ObjectKey{
Name: caBundleConfigMap,
Namespace: openshiftInClusterMonitoringNamespace,
}, &corev1.ConfigMap{},
) == nil
}
86 changes: 80 additions & 6 deletions internal/operator-metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,26 +17,30 @@ package operatormetrics
import (
"context"
"os"
"reflect"
"testing"
"time"

"github.com/go-logr/logr"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/rest"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

func TestNewOperatorMetrics(t *testing.T) {
config := &rest.Config{}
scheme := runtime.NewScheme()
metrics, err := NewOperatorMetrics(config, scheme)
metrics, err := NewOperatorMetrics(config, scheme, logr.Discard())
assert.NoError(t, err)
assert.NotNil(t, metrics.kubeClient)
}
Expand All @@ -53,12 +57,15 @@ func TestOperatorMetrics_Start(t *testing.T) {
namespaceFile = tmpFile.Name()

scheme := runtime.NewScheme()
err = corev1.AddToScheme(scheme)
require.NoError(t, err)
err = monitoringv1.AddToScheme(scheme)
require.NoError(t, err)
require.NoError(t, corev1.AddToScheme(scheme))
require.NoError(t, appsv1.AddToScheme(scheme))
require.NoError(t, monitoringv1.AddToScheme(scheme))

client := fake.NewClientBuilder().WithScheme(scheme).Build()
client := fake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects(
&appsv1.Deployment{
ObjectMeta: metav1.ObjectMeta{Name: "opentelemetry-operator", Namespace: "test-namespace", Labels: map[string]string{"app.kubernetes.io/name": "opentelemetry-operator", "control-plane": "controller-manager"}},
},
).Build()

metrics := OperatorMetrics{kubeClient: client}

Expand Down Expand Up @@ -125,3 +132,70 @@ func TestOperatorMetrics_caConfigMapExists(t *testing.T) {
metricsWithoutConfigMap := OperatorMetrics{kubeClient: clientWithoutConfigMap}
assert.False(t, metricsWithoutConfigMap.caConfigMapExists())
}

func TestOperatorMetrics_getOwnerReferences(t *testing.T) {
tests := []struct {
name string
namespace string
objects []client.Object
want metav1.OwnerReference
wantErr bool
}{
{
name: "successful owner reference retrieval",
namespace: "test-namespace",
objects: []client.Object{
&appsv1.Deployment{
ObjectMeta: metav1.ObjectMeta{
Name: "opentelemetry-operator",
Namespace: "test-namespace",
UID: "test-uid",
Labels: map[string]string{
"app.kubernetes.io/name": "opentelemetry-operator",
"control-plane": "controller-manager",
},
},
},
},
want: metav1.OwnerReference{
APIVersion: "apps/v1",
Kind: "Deployment",
Name: "opentelemetry-operator",
UID: "test-uid",
},
wantErr: false,
},
{
name: "no deployments found",
namespace: "test-namespace",
objects: []client.Object{},
want: metav1.OwnerReference{},
wantErr: true,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
scheme := runtime.NewScheme()
_ = appsv1.AddToScheme(scheme)
fakeClient := fake.NewClientBuilder().
WithScheme(scheme).
WithObjects(tt.objects...).
Build()

om := OperatorMetrics{
kubeClient: fakeClient,
log: logr.Discard(),
}

got, err := om.getOwnerReferences(context.Background(), tt.namespace)
if (err != nil) != tt.wantErr {
t.Errorf("getOwnerReferences() error = %v, wantErr %v", err, tt.wantErr)
return
}
if !reflect.DeepEqual(got, tt.want) {
t.Errorf("getOwnerReferences() got = %v, want %v", got, tt.want)
}
})
}
}
2 changes: 1 addition & 1 deletion main.go
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ func main() {
}

if cfg.PrometheusCRAvailability() == prometheus.Available {
operatorMetrics, opError := operatormetrics.NewOperatorMetrics(mgr.GetConfig(), scheme)
operatorMetrics, opError := operatormetrics.NewOperatorMetrics(mgr.GetConfig(), scheme, ctrl.Log.WithName("operator-metrics-sm"))
if opError != nil {
setupLog.Error(opError, "Failed to create the operator metrics SM")
}
Expand Down

0 comments on commit 917b096

Please sign in to comment.