-
Notifications
You must be signed in to change notification settings - Fork 52
Description
Hi, I have built a Django operator using github.com/operator-framework/helm-operator-plugins/pkg/reconciler. The operator works fine and is able to reconcile a Django app using the Helm chart embedded in the code. This is the relevant snippet of code:
// Build the reconciler for the DjangoApp kind from the chart held in chartObj.
// NOTE(review): err is not checked inside this excerpt — confirm the caller handles it.
r, err := reconciler.New(
reconciler.WithChart(*chartObj),
// Group/version/kind passed to the reconciler: django.djangooperator/v1alpha1, DjangoApp.
reconciler.WithGroupVersionKind(schema.GroupVersionKind{
Group: "django.djangooperator",
Version: "v1alpha1",
Kind: "DjangoApp",
}),
reconciler.SkipDependentWatches(true),
// Presumably limits the controller to one reconcile at a time — verify against the library docs.
reconciler.WithMaxConcurrentReconciles(1),
reconciler.SkipPrimaryGVKSchemeRegistration(true),
// Chart values come from specTranslator, which receives the manager's client.
reconciler.WithValueTranslator(specTranslator(mgr.GetClient())),
reconciler.WithLog(logf.Log.WithName("helm").WithName("DjangoApp")),
)
The only problem is that at the beginning of the reconciliation I see a lot of errors like this one:
ERROR Reconciler error {"controller": "djangoapp-controller", "object": {"name":"djangoapp-sample","namespace":"sample-django"}, "namespace": "sample-django", "name": "djangoapp-sample", "reconcileID": "32885e3f-f298-41c0-b746-a83b96639559", "error": "another operation (install/upgrade/rollback) is in progress", "errorVerbose": "another operation (install/upgrade/rollback) is in progress\nhelm.sh/helm/v3/pkg/action.init\n\t:1\nruntime.doInit1\n\t/usr/local/go/src/runtime/proc.go:7371\nruntime.doInit\n\t/usr/local/go/src/runtime/proc.go:7338\nruntime.main\n\t/usr/local/go/src/runtime/proc.go:254\nruntime.goexit\n\t/usr/local/go/src/runtime/asm_amd64.s:1700"}
sigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller[...]).reconcileHandler
/go/pkg/mod/sigs.k8s.io/[email protected]/pkg/internal/controller/controller.go:353
sigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller[...]).processNextWorkItem
/go/pkg/mod/sigs.k8s.io/[email protected]/pkg/internal/controller/controller.go:300
sigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller[...]).Start.func2.1
/go/pkg/mod/sigs.k8s.io/[email protected]/pkg/internal/controller/controller.go:202
But the reconciliation works, so I assume this is a clash between Helm and the reconciliation loop or some other external event. I would like to be able to handle the error gracefully. So far I have tried using a pre-hook to catch the error:
// buildPreHook returns a pre-hook that looks up the Helm status of the release
// named after the reconciled object and fails while that release is in a
// pending install, upgrade, or rollback state.
//
// A missing release is not treated as an error, so a first install can proceed.
func buildPreHook() hook.PreHook {
	checkRelease := func(obj *unstructured.Unstructured, vals chartutil.Values, log logr.Logger) error {
		namespace := obj.GetNamespace()
		releaseName := obj.GetName()

		// Obtain a Helm action.Configuration scoped to the object's namespace.
		actionCfg, cfgErr := buildHelmActionConfig(namespace, log)
		if cfgErr != nil {
			return fmt.Errorf("prehook: %w", cfgErr)
		}

		// Ask Helm for the current state of the release.
		current, statusErr := action.NewStatus(actionCfg).Run(releaseName)
		if statusErr != nil {
			// Anything other than "no release yet" is a real failure.
			if !strings.Contains(statusErr.Error(), "release: not found") {
				return fmt.Errorf("helm status check failed: %w", statusErr)
			}
			return nil
		}

		// Bail out while Helm reports an operation in flight.
		status := current.Info.Status
		busy := status == release.StatusPendingInstall ||
			status == release.StatusPendingUpgrade ||
			status == release.StatusPendingRollback
		if busy {
			return fmt.Errorf("helm release %s/%s is busy: %s", namespace, releaseName, status)
		}
		return nil
	}

	return hook.PreHookFunc(checkRelease)
}
But it did not work: the hook was only invoked once, and the status client was not able to detect the operation in progress. I have also tried using the action client machinery added in one of the library's PRs to make every action wait. Although that has reduced the number of errors in the logs, I still get several of them:
// actionClientBuilder creates an ActionClientGetter whose Helm install,
// upgrade, uninstall, and upgrade-failure rollback actions all wait for
// completion, each bounded by the same timeout.
//
// It uses the manager's REST config to build a discovery client and a
// deferred, memory-cached REST mapper, and returns a wrapped error if any
// construction step fails.
func actionClientBuilder(mgr ctrl.Manager) (actionclient.ActionClientGetter, error) {
	// One timeout for every waiting action keeps the options consistent.
	const waitTimeout = 5 * time.Minute

	cfg := mgr.GetConfig()

	configDiscovery, err := discovery.NewDiscoveryClientForConfig(cfg)
	if err != nil {
		return nil, fmt.Errorf("creating discovery client: %w", err)
	}

	mapper := restmapper.NewDeferredDiscoveryRESTMapper(
		memory.NewMemCacheClient(configDiscovery),
	)

	acg, err := actionclient.NewActionConfigGetter(cfg, mapper)
	if err != nil {
		return nil, fmt.Errorf("building custom ActionConfigGetter: %w", err)
	}

	actionClientGetter, err := actionclient.NewActionClientGetter(
		acg,
		// always wait for installs
		actionclient.AppendInstallOptions(func(i *action.Install) error {
			i.Wait = true
			i.Timeout = waitTimeout
			return nil
		}),
		// always wait for upgrades
		actionclient.AppendUpgradeOptions(func(u *action.Upgrade) error {
			u.Wait = true
			u.Timeout = waitTimeout
			return nil
		}),
		// always wait for uninstalls; the timeout must be set explicitly
		// because a zero Timeout gives the wait no time to observe deletion
		actionclient.AppendUninstallOptions(func(u *action.Uninstall) error {
			u.Wait = true
			u.Timeout = waitTimeout
			return nil
		}),
		// on upgrade failure, force the rollback and wait for it
		actionclient.AppendUpgradeFailureRollbackOptions(func(r *action.Rollback) error {
			r.Force = true
			r.Wait = true
			r.Timeout = waitTimeout
			return nil
		}),
	)
	if err != nil {
		return nil, fmt.Errorf("building custom ActionClientGetter: %w", err)
	}

	return actionClientGetter, nil
}
I do not know if I am missing something, if this is a bug or it is a problem with my setup. If you could point me in the right direction about how to catch and handle the error it would be great.