[ovhcloud] openstack application credentials, gpu flavors update, stressed downscale fix #8092

Open · wants to merge 3 commits into master
74 changes: 68 additions & 6 deletions cluster-autoscaler/cloudprovider/ovhcloud/ovh_cloud_manager.go
@@ -22,7 +22,7 @@ import (
"errors"
"fmt"
"io"
"io/ioutil"
"slices"
"sync"
"time"

@@ -62,7 +62,7 @@ type OvhCloudManager struct {
ClusterID string
ProjectID string

NodePools []sdk.NodePool
NodePoolsPerID map[string]*sdk.NodePool
NodeGroupPerProviderID map[string]*NodeGroup
NodeGroupPerProviderIDLock sync.RWMutex

@@ -88,6 +88,10 @@ type Config struct {
OpenStackPassword string `json:"openstack_password"`
OpenStackDomain string `json:"openstack_domain"`

// OpenStack application credentials if authentication type is set to openstack_application.
OpenStackApplicationCredentialID string `json:"openstack_application_credential_id"`
OpenStackApplicationCredentialSecret string `json:"openstack_application_credential_secret"`

// Application credentials if CA is run as API consumer without using OpenStack keystone.
// Tokens can be created here: https://api.ovh.com/createToken/
ApplicationEndpoint string `json:"application_endpoint"`
@@ -101,6 +105,9 @@ const (
// OpenStackAuthenticationType to request a keystone token credentials.
OpenStackAuthenticationType = "openstack"

// OpenStackApplicationCredentialsAuthenticationType to request keystone token credentials using keystone application credentials.
OpenStackApplicationCredentialsAuthenticationType = "openstack_application"

// ApplicationConsumerAuthenticationType to consume an application key credentials.
ApplicationConsumerAuthenticationType = "consumer"
)
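
For reference, a cloud config using the new openstack_application authentication type could look like the sketch below. The keys match the Config JSON tags and the validatePayload checks in this file; the values are placeholders rather than working credentials, and the auth URL is only an example that should come from your OpenStack RC file.

{
  "cluster_id": "<kubernetes-cluster-id>",
  "project_id": "<public-cloud-project-id>",
  "authentication_type": "openstack_application",
  "openstack_auth_url": "https://auth.cloud.ovh.net/v3",
  "openstack_application_credential_id": "<application-credential-id>",
  "openstack_application_credential_secret": "<application-credential-secret>",
  "openstack_domain": "Default"
}
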
@@ -130,6 +137,13 @@ func NewManager(configFile io.Reader) (*OvhCloudManager, error) {
return nil, fmt.Errorf("failed to create OpenStack provider: %w", err)
}

client, err = sdk.NewDefaultClientWithToken(openStackProvider.AuthUrl, openStackProvider.Token)
case OpenStackApplicationCredentialsAuthenticationType:
openStackProvider, err = sdk.NewOpenstackApplicationProvider(cfg.OpenStackAuthUrl, cfg.OpenStackApplicationCredentialID, cfg.OpenStackApplicationCredentialSecret, cfg.OpenStackDomain)
if err != nil {
return nil, fmt.Errorf("failed to create OpenStack provider: %w", err)
}

client, err = sdk.NewDefaultClientWithToken(openStackProvider.AuthUrl, openStackProvider.Token)
case ApplicationConsumerAuthenticationType:
client, err = sdk.NewClient(cfg.ApplicationEndpoint, cfg.ApplicationKey, cfg.ApplicationSecret, cfg.ApplicationConsumerKey)
@@ -148,7 +162,7 @@ func NewManager(configFile io.Reader) (*OvhCloudManager, error) {
ProjectID: cfg.ProjectID,
ClusterID: cfg.ClusterID,

NodePools: make([]sdk.NodePool, 0),
NodePoolsPerID: make(map[string]*sdk.NodePool),
NodeGroupPerProviderID: make(map[string]*NodeGroup),
NodeGroupPerProviderIDLock: sync.RWMutex{},

@@ -232,11 +246,45 @@ func (m *OvhCloudManager) ReAuthenticate() error {
return nil
}

// setNodePoolsState updates the local node pool information from the given list.
// It modifies NodePoolsPerID in place so that the references held in NodeGroupPerProviderID see the refreshed data:
//
// - Updates fields on node pools that are already referenced
// - Adds node pools that are not referenced yet
// - Removes node pools that are missing from the given list (they no longer exist)
func (m *OvhCloudManager) setNodePoolsState(pools []sdk.NodePool) {
m.NodeGroupPerProviderIDLock.Lock()
defer m.NodeGroupPerProviderIDLock.Unlock()

poolIDsToKeep := []string{}
for _, pool := range pools {
poolIDsToKeep = append(poolIDsToKeep, pool.ID)
}

// Update nodepools state
for _, pool := range pools {
poolRef, ok := m.NodePoolsPerID[pool.ID]
if ok {
*poolRef = pool // Update existing value
} else {
poolCopy := pool
m.NodePoolsPerID[pool.ID] = &poolCopy
}
}

// Remove node pools that no longer exist
for poolID := range m.NodePoolsPerID {
if !slices.Contains(poolIDsToKeep, poolID) {
delete(m.NodePoolsPerID, poolID)
}
}
}
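
The doc comment above describes the mechanism behind the "stressed downscale fix" in the PR title: NodeGroup now embeds *sdk.NodePool (see the node group changes further down), so rewriting a pool through its pointer is enough for already-cached node groups to see the refreshed sizes. A minimal aliasing sketch with simplified stand-in types, not the provider's real structs:

package main

import "fmt"

// Simplified stand-ins for sdk.NodePool and NodeGroup.
type NodePool struct {
	ID           string
	DesiredNodes uint32
}

type NodeGroup struct {
	*NodePool // node groups share the pool object by pointer
}

func main() {
	pool := &NodePool{ID: "np1", DesiredNodes: 3}

	// Both caches reference the same NodePool object.
	nodePoolsPerID := map[string]*NodePool{"np1": pool}
	nodeGroupPerProviderID := map[string]*NodeGroup{"openstack:///np1-node": {NodePool: pool}}

	// A refresh rewrites the pool in place, as setNodePoolsState does with *poolRef = pool.
	*nodePoolsPerID["np1"] = NodePool{ID: "np1", DesiredNodes: 1}

	// The cached node group observes the downscaled value without being rebuilt.
	fmt.Println(nodeGroupPerProviderID["openstack:///np1-node"].DesiredNodes) // prints 1
}
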

// readConfig read cloud provider configuration file into a struct
func readConfig(configFile io.Reader) (*Config, error) {
cfg := &Config{}
if configFile != nil {
body, err := ioutil.ReadAll(configFile)
body, err := io.ReadAll(configFile)
if err != nil {
return nil, fmt.Errorf("failed to read content: %w", err)
}
@@ -260,8 +308,8 @@ func validatePayload(cfg *Config) error {
return fmt.Errorf("`project_id` not found in config file")
}

if cfg.AuthenticationType != OpenStackAuthenticationType && cfg.AuthenticationType != ApplicationConsumerAuthenticationType {
return fmt.Errorf("`authentication_type` should only be `openstack` or `consumer`")
if cfg.AuthenticationType != OpenStackAuthenticationType && cfg.AuthenticationType != OpenStackApplicationCredentialsAuthenticationType && cfg.AuthenticationType != ApplicationConsumerAuthenticationType {
return fmt.Errorf("`authentication_type` should only be `openstack`, `openstack_application` or `consumer`")
}

if cfg.AuthenticationType == OpenStackAuthenticationType {
@@ -282,6 +330,20 @@ func validatePayload(cfg *Config) error {
}
}

if cfg.AuthenticationType == OpenStackApplicationCredentialsAuthenticationType {
if cfg.OpenStackAuthUrl == "" {
return fmt.Errorf("`openstack_auth_url` not found in config file")
}

if cfg.OpenStackApplicationCredentialID == "" {
return fmt.Errorf("`openstack_application_credential_id` not found in config file")
}

if cfg.OpenStackApplicationCredentialSecret == "" {
return fmt.Errorf("`openstack_application_credential_secret` not found in config file")
}
}

if cfg.AuthenticationType == ApplicationConsumerAuthenticationType {
if cfg.ApplicationEndpoint == "" {
return fmt.Errorf("`application_endpoint` not found in config file")
116 changes: 116 additions & 0 deletions cluster-autoscaler/cloudprovider/ovhcloud/ovh_cloud_manager_test.go
@@ -78,6 +78,26 @@ func newTestManager(t *testing.T) *OvhCloudManager {
return manager
}

func TestOvhCloudManager_validateConfig(t *testing.T) {
tests := []struct {
name string
configContent string
expectedErrorMessage string
}{
{
name: "Empty config",
configContent: "{}",
expectedErrorMessage: "config content validation failed: `cluster_id` not found in config file",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
_, err := NewManager(bytes.NewBufferString(tt.configContent))
assert.ErrorContains(t, err, tt.expectedErrorMessage)
})
}
}

func TestOvhCloudManager_getFlavorsByName(t *testing.T) {
expectedFlavorsByNameFromAPICall := map[string]sdk.Flavor{
"b2-7": {
@@ -290,3 +310,99 @@ func TestOvhCloudManager_cacheConcurrency(t *testing.T) {
manager.getNodeGroupPerProviderID("")
})
}

func TestOvhCloudManager_setNodePoolsState(t *testing.T) {
manager := newTestManager(t)
np1 := sdk.NodePool{ID: "np1", DesiredNodes: 1}
np2 := sdk.NodePool{ID: "np2", DesiredNodes: 2}
np3 := sdk.NodePool{ID: "np3", DesiredNodes: 3}

type fields struct {
NodePoolsPerID map[string]*sdk.NodePool
NodeGroupPerProviderID map[string]*NodeGroup
}
type args struct {
poolsList []sdk.NodePool

nodePoolsPerID map[string]*sdk.NodePool
nodeGroupPerProviderID map[string]*NodeGroup
}
tests := []struct {
name string
fields fields
args args
wantNodePoolsPerID map[string]uint32 // ID => desired nodes
wantNodeGroupPerProviderID map[string]uint32 // ID => desired nodes
}{
{
name: "NodePoolsPerID and NodeGroupPerProviderID empty",
fields: fields{
NodePoolsPerID: map[string]*sdk.NodePool{},
NodeGroupPerProviderID: map[string]*NodeGroup{},
},
args: args{
poolsList: []sdk.NodePool{
np1,
},
nodePoolsPerID: map[string]*sdk.NodePool{},
nodeGroupPerProviderID: map[string]*NodeGroup{},
},
wantNodePoolsPerID: map[string]uint32{"np1": 1},
wantNodeGroupPerProviderID: map[string]uint32{},
},
{
name: "NodePoolsPerID and NodeGroupPerProviderID already populated",
fields: fields{
NodePoolsPerID: map[string]*sdk.NodePool{
"np2": &np2,
"np3": &np3,
},
NodeGroupPerProviderID: map[string]*NodeGroup{
"np2-node-id": {NodePool: &np2},
"np3-node-id": {NodePool: &np3},
},
},
args: args{
poolsList: []sdk.NodePool{
{
ID: "np1",
DesiredNodes: 1,
},
{
ID: "np2",
DesiredNodes: 20,
},
},
nodePoolsPerID: map[string]*sdk.NodePool{},
nodeGroupPerProviderID: map[string]*NodeGroup{},
},
wantNodePoolsPerID: map[string]uint32{
"np1": 1, // np1 added
"np2": 20, // np2 updated
// np3 removed
},
wantNodeGroupPerProviderID: map[string]uint32{
"np2-node-id": 20,
"np3-node-id": 3, // Node reference that eventually stays in cache must not crash
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
manager.NodePoolsPerID = tt.fields.NodePoolsPerID
manager.NodeGroupPerProviderID = tt.fields.NodeGroupPerProviderID

manager.setNodePoolsState(tt.args.poolsList)

assert.Len(t, manager.NodePoolsPerID, len(tt.wantNodePoolsPerID))
for id, desiredNodes := range tt.wantNodePoolsPerID {
assert.Equal(t, desiredNodes, manager.NodePoolsPerID[id].DesiredNodes)
}

assert.Len(t, manager.NodeGroupPerProviderID, len(tt.wantNodeGroupPerProviderID))
for nodeID, desiredNodes := range tt.wantNodeGroupPerProviderID {
assert.Equal(t, desiredNodes, manager.NodeGroupPerProviderID[nodeID].DesiredNodes)
}
})
}
}
cluster-autoscaler/cloudprovider/ovhcloud/ovh_cloud_node_group.go
@@ -41,7 +41,7 @@ const providerIDPrefix = "openstack:///"

// NodeGroup implements cloudprovider.NodeGroup interface.
type NodeGroup struct {
sdk.NodePool
*sdk.NodePool

Manager *OvhCloudManager
CurrentSize int
@@ -294,7 +294,7 @@ func (ng *NodeGroup) Create() (cloudprovider.NodeGroup, error) {

// Forge a node group interface given the API response
return &NodeGroup{
NodePool: *np,
NodePool: np,
Manager: ng.Manager,
CurrentSize: int(ng.DesiredNodes),
}, nil
@@ -352,7 +352,11 @@ func (ng *NodeGroup) isGpu() bool {
if err != nil {
// Fallback when we are unable to get the flavor: refer to the only category
// known to be a GPU flavor category
return strings.HasPrefix(ng.Flavor, GPUMachineCategory)
for _, gpuCategoryPrefix := range GPUMachineCategoryPrefixes {
if strings.HasPrefix(ng.Flavor, gpuCategoryPrefix) {
return true
}
}
}

return flavor.GPUs > 0
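
With GPUMachineCategory replaced by the GPUMachineCategoryPrefixes list declared in ovh_cloud_provider.go below, the fallback above treats any flavor whose name starts with one of those prefixes as GPU when the flavor details cannot be fetched. A standalone sketch of that fallback path only; the flavor names are illustrative examples, not an authoritative list of OVHcloud flavors:

package main

import (
	"fmt"
	"strings"
)

// Mirrors the GPUMachineCategoryPrefixes values from ovh_cloud_provider.go.
var gpuPrefixes = []string{"t", "h", "a", "l"}

// isGpuByPrefix reproduces only the fallback path of NodeGroup.isGpu,
// used when the flavor details cannot be fetched from the API.
func isGpuByPrefix(flavor string) bool {
	for _, p := range gpuPrefixes {
		if strings.HasPrefix(flavor, p) {
			return true
		}
	}
	return false
}

func main() {
	for _, f := range []string{"t1-45", "h100-380", "a100-180", "l40s-90", "b2-7"} {
		fmt.Printf("%s => GPU by prefix fallback: %v\n", f, isGpuByPrefix(f))
	}
}
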
cluster-autoscaler/cloudprovider/ovhcloud/ovh_cloud_node_group_test.go
@@ -85,7 +85,7 @@ func newTestNodeGroup(t *testing.T, flavor string) *NodeGroup {

ng := &NodeGroup{
Manager: manager,
NodePool: sdk.NodePool{
NodePool: &sdk.NodePool{
ID: "id",
Name: fmt.Sprintf("pool-%s", flavor),
Flavor: flavor,
@@ -522,9 +522,19 @@ func TestOVHCloudNodeGroup_IsGpu(t *testing.T) {
})

t.Run("not found but belong to GPU category", func(t *testing.T) {
ng := newTestNodeGroup(t, GPUMachineCategory+"1-123")
ng := newTestNodeGroup(t, "t1-123")
assert.True(t, ng.isGpu())
})

t.Run("not found but belong to GPU category", func(t *testing.T) {
ng := newTestNodeGroup(t, "h1-123")
assert.True(t, ng.isGpu())
})

t.Run("not found and does not belong to GPU category", func(t *testing.T) {
ng := newTestNodeGroup(t, "n1-123")
assert.False(t, ng.isGpu())
})
}

func TestOVHCloudNodeGroup_GetOptions(t *testing.T) {
12 changes: 6 additions & 6 deletions cluster-autoscaler/cloudprovider/ovhcloud/ovh_cloud_provider.go
@@ -44,11 +44,11 @@ const (

// MachineAvailableState defines the state for available flavors for node resources.
MachineAvailableState = "available"

// GPUMachineCategory defines the default instance category for GPU resources.
GPUMachineCategory = "t"
)

// GPUMachineCategoryPrefixes defines the flavor name prefixes for GPU resources.
var GPUMachineCategoryPrefixes = [4]string{"t", "h", "a", "l"}

// OVHCloudProvider implements CloudProvider interface.
type OVHCloudProvider struct {
manager *OvhCloudManager
@@ -100,7 +100,7 @@ func (provider *OVHCloudProvider) NodeGroups() []cloudprovider.NodeGroup {
groups := make([]cloudprovider.NodeGroup, 0)

// Cast API node pools into CA node groups
for _, pool := range provider.manager.NodePools {
for _, pool := range provider.manager.NodePoolsPerID {
// Node pools without autoscaling are equivalent to node pools with autoscaling but no scale possible
if !pool.Autoscale {
pool.MaxNodes = pool.DesiredNodes
@@ -238,7 +238,7 @@ func (provider *OVHCloudProvider) GetAvailableMachineTypes() ([]string, error) {
// Implementation optional.
func (provider *OVHCloudProvider) NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string, taints []apiv1.Taint, extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
ng := &NodeGroup{
NodePool: sdk.NodePool{
NodePool: &sdk.NodePool{
Name: fmt.Sprintf("%s-%d", machineType, rand.Int63()),
Flavor: machineType,
MinNodes: 0,
@@ -314,7 +314,7 @@ func (provider *OVHCloudProvider) Refresh() error {
}

// Update the node pools cache
provider.manager.NodePools = pools
provider.manager.setNodePoolsState(pools)

return nil
}