
Commit 7fd8b42

HIVE-3067 infra_test: match machine pool e2e by Hive labels
1 parent e33d703

4 files changed: 54 additions & 46 deletions


hack/e2e-common.sh

Lines changed: 2 additions & 2 deletions
@@ -23,9 +23,9 @@ set -o monitor
 if [[ $0 == */e2e-pool-test.sh ]]; then
     # TODO: set this back to 148 when we figure out how to make the *test script*
     # timeout something other than 2h.
-    timeout_minutes=148
+    timeout_minutes=190
 else
-    timeout_minutes=148
+    timeout_minutes=190
 fi
 /usr/bin/bash -c "sleep $(($timeout_minutes*60)) && echo 'Timed out!' && kill -n 2 $$" &
 ###

hack/e2e-test.sh

Lines changed: 1 addition & 1 deletion
@@ -177,7 +177,7 @@ INSTALL_RESULT=""
 i=1
 while [ $i -le ${max_cluster_deployment_status_checks} ]; do
     CD_JSON=$(oc get cd ${CLUSTER_NAME} -n ${CLUSTER_NAMESPACE} -o json)
-    if [[ $(jq .spec.installed <<<"${CD_JSON}") == "true" ]] ; then
+    if [[ $(jq .spec.installed <<<"${CD_JSON}") == "true" ]] && [[ $(jq -r .status.powerState <<<"${CD_JSON}") == "Running" ]] ; then
         INSTALL_RESULT="success"
         break
     fi
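
The script now gates success on the cluster being both installed and powered on, since a ClusterDeployment can report spec.installed=true while its status.powerState is still settling. A minimal Go sketch of the same predicate, assuming the Hive v1 API's ClusterDeployment type and its ClusterPowerStateRunning constant:

package main

import (
	"fmt"

	hivev1 "github.com/openshift/hive/apis/hive/v1"
)

// isClusterReady mirrors the script's tightened gate: spec.installed must be
// true AND status.powerState must be Running before the e2e run proceeds.
func isClusterReady(cd *hivev1.ClusterDeployment) bool {
	return cd.Spec.Installed && cd.Status.PowerState == hivev1.ClusterPowerStateRunning
}

func main() {
	cd := &hivev1.ClusterDeployment{}
	cd.Spec.Installed = true
	cd.Status.PowerState = hivev1.ClusterPowerStateRunning
	fmt.Println(isClusterReady(cd)) // true
}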

test/e2e/common/client.go

Lines changed: 6 additions & 0 deletions
@@ -5,15 +5,21 @@ import (
 
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/client/config"
+	ctrl "sigs.k8s.io/controller-runtime/pkg/log"
 
 	"k8s.io/client-go/dynamic"
 	kclient "k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/rest"
 	apiregv1client "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/typed/apiregistration/v1"
 
+	"github.com/openshift/hive/pkg/util/logrus"
 	"github.com/openshift/hive/pkg/util/scheme"
 )
 
+func init() {
+	ctrl.SetLogger(logrus.NewLogr(log.StandardLogger()))
+}
+
 func MustGetClient() client.WithWatch {
 	return MustGetClientFromConfig(MustGetConfig())
 }
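
Recent controller-runtime releases (v0.15 and later) print a "log.SetLogger(...) was never called" warning and drop client-side log output when no logger is installed, so the test client package now sets one from an init hook, routing through hive's logrus-to-logr adapter. For comparison, a minimal sketch of the conventional alternative using controller-runtime's bundled zap logger (not what the commit does, just the stock pattern):

package main

import (
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/log/zap"
)

func main() {
	// Install a logr.Logger for controller-runtime before any client is
	// built; without this, controller-runtime >= v0.15 warns and discards
	// its log output. The commit instead wraps the process-wide logrus
	// logger (logrus.NewLogr) so all test output lands in one log stream.
	ctrl.SetLogger(zap.New(zap.UseDevMode(true)))
}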

test/e2e/postinstall/machinesets/infra_test.go

Lines changed: 45 additions & 43 deletions
@@ -15,6 +15,7 @@ import (
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/client-go/rest"
 	"k8s.io/client-go/util/retry"
 	"k8s.io/utils/ptr"
@@ -35,6 +36,11 @@ import (
 const (
 	workerMachinePoolName = "worker"
 	infraMachinePoolName  = "infra"
+
+	capiClusterKey      = "machine.openshift.io/cluster-api-cluster"
+	capiMachineTypeKey  = "machine.openshift.io/cluster-api-machine-type"
+	hiveMachinePoolKey  = "hive.openshift.io/machine-pool"
+	openshiftMachineAPI = "openshift-machine-api"
 )
 
 func TestScaleMachinePool(t *testing.T) {
@@ -52,27 +58,23 @@ func TestScaleMachinePool(t *testing.T) {
 	}
 
 	c := common.MustGetClient()
-	machinePrefix, err := machineNamePrefix(cd, workerMachinePoolName)
-	require.NoError(t, err, "cannot determine machine name prefix")
 
 	// Scale down
-	err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
+	err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
 		pool := common.GetMachinePool(c, cd, workerMachinePoolName)
 		require.NotNilf(t, pool, "worker machine pool does not exist: %s", workerMachinePoolName)
 
 		logger = logger.WithField("pool", pool.Name)
-		logger.Infof("expected Machine name prefix: %s", machinePrefix)
-
 		logger.Info("scaling pool to 1 replicas")
 		pool.Spec.Replicas = ptr.To(int64(1))
 		return c.Update(context.TODO(), pool)
 	})
 	require.NoError(t, err, "cannot update worker machine pool to reduce replicas")
 
-	err = waitForMachines(logger, cfg, cd, machinePrefix, 1)
+	err = waitForMachines(logger, cfg, cd, workerMachinePoolName, 1)
 	require.NoError(t, err, "timed out waiting for machines to be scaled down")
 
-	err = waitForNodes(logger, cfg, cd, machinePrefix, 1)
+	err = waitForNodes(logger, cfg, cd, workerMachinePoolName, 1)
 	require.NoError(t, err, "timed out waiting for nodes to be scaled down")
 
 	// Scale up
@@ -86,10 +88,10 @@ func TestScaleMachinePool(t *testing.T) {
 	})
 	require.NoError(t, err, "cannot update worker machine pool to increase replicas")
 
-	err = waitForMachines(logger, cfg, cd, machinePrefix, 3)
+	err = waitForMachines(logger, cfg, cd, workerMachinePoolName, 3)
 	require.NoError(t, err, "timed out waiting for machines to be scaled up")
 
-	err = waitForNodes(logger, cfg, cd, machinePrefix, 3)
+	err = waitForNodes(logger, cfg, cd, workerMachinePoolName, 3)
 	require.NoError(t, err, "timed out waiting for nodes to be scaled up")
 }
 
@@ -162,17 +164,11 @@ func TestNewMachinePool(t *testing.T) {
 	err := c.Create(context.TODO(), infraMachinePool)
 	require.NoError(t, err, "cannot create infra machine pool")
 
-	machinePrefix, err := machineNamePrefix(cd, infraMachinePoolName)
-	require.NoError(t, err, "cannot find/calculate machine name prefix")
-	logger.Infof("expected Machine name prefix: %s", machinePrefix)
-
-	// Wait for machines to be created
 	t.Logf("Waiting for 3 infra machines to be created")
-	err = waitForMachines(logger, cfg, cd, machinePrefix, 3)
+	err = waitForMachines(logger, cfg, cd, infraMachinePoolName, 3)
 	require.NoError(t, err, "timed out waiting for machines to be created")
 
-	err = waitForNodes(logger, cfg, cd, machinePrefix, 3,
-		// Ensure that labels were applied to the nodes
+	err = waitForNodes(logger, cfg, cd, infraMachinePoolName, 3,
 		func(node *corev1.Node) bool {
 			if machineType := node.Labels["openshift.io/machine-type"]; machineType != infraMachinePoolName {
 				t.Logf("Did not find expected label in node")
@@ -205,7 +201,7 @@ func TestNewMachinePool(t *testing.T) {
 		cfg,
 		func(machineSets []*machinev1.MachineSet) bool {
 			for _, ms := range machineSets {
-				if strings.HasPrefix(ms.Name, machinePrefix) {
+				if ms.Labels[hiveMachinePoolKey] == infraMachinePoolName {
 					return false
 				}
 			}
@@ -277,9 +273,6 @@ func TestAutoscalingMachinePool(t *testing.T) {
 	require.NoError(t, err, "cannot update worker machine pool to reduce replicas")
 	logger = logger.WithField("pool", pool.Name)
 
-	machinePrefix, err := machineNamePrefix(cd, workerMachinePoolName)
-	require.NoError(t, err, "cannot find/calculate machine name prefix")
-
 	logger.Info("lowering autoscaler delay so scaling down happens faster")
 	clusterAutoscaler := &autoscalingv1.ClusterAutoscaler{}
 poll:
@@ -368,6 +361,7 @@ poll:
 					},
 				}},
 				SecurityContext: &corev1.PodSecurityContext{
+					RunAsUser:    ptr.To(int64(1000)),
 					RunAsNonRoot: ptr.To(true),
 					SeccompProfile: &corev1.SeccompProfile{
 						Type: corev1.SeccompProfileTypeRuntimeDefault,
@@ -380,9 +374,9 @@ poll:
 	err = rc.Create(context.TODO(), busyboxDeployment)
 	require.NoError(t, err, "cannot create busybox deployment")
 
-	err = waitForMachines(logger, cfg, cd, machinePrefix, maxReplicas)
+	err = waitForMachines(logger, cfg, cd, workerMachinePoolName, maxReplicas)
 	require.NoError(t, err, "timed out waiting for machines to be created")
-	err = waitForNodes(logger, cfg, cd, machinePrefix, maxReplicas)
+	err = waitForNodes(logger, cfg, cd, workerMachinePoolName, maxReplicas)
 	require.NoError(t, err, "timed out waiting for nodes to be created")
 
 	// Scale down
@@ -392,9 +386,9 @@ poll:
 	logger.Info("deleting busybox deployment to relieve cpu pressure and scale down machines")
 	err = rc.Delete(context.TODO(), busyboxDeployment, client.PropagationPolicy(metav1.DeletePropagationForeground))
 	require.NoError(t, err, "could not delete busybox deployment")
-	err = waitForMachines(logger, cfg, cd, machinePrefix, minReplicas)
+	err = waitForMachines(logger, cfg, cd, workerMachinePoolName, minReplicas)
 	require.NoError(t, err, "timed out waiting for machine count")
-	err = waitForNodes(logger, cfg, cd, machinePrefix, minReplicas)
+	err = waitForNodes(logger, cfg, cd, workerMachinePoolName, minReplicas)
 	require.NoError(t, err, "timed out waiting for nodes to be created")
 
 	logger.Info("disabling autoscaling")
@@ -407,45 +401,57 @@ poll:
 		return c.Update(context.TODO(), pool)
 	})
 	require.NoError(t, err, "cannot update worker machine pool to turn off auto-scaling")
-	err = waitForMachines(logger, cfg, cd, machinePrefix, 3)
+	err = waitForMachines(logger, cfg, cd, workerMachinePoolName, 3)
 	require.NoError(t, err, "timed out waiting for machines to be created")
-	err = waitForNodes(logger, cfg, cd, machinePrefix, 3)
+	err = waitForNodes(logger, cfg, cd, workerMachinePoolName, 3)
 	require.NoError(t, err, "timed out waiting for nodes to be created")
 }
 
-func waitForMachines(logger log.FieldLogger, cfg *rest.Config, cd *hivev1.ClusterDeployment, machinePrefix string, expectedReplicas int) error {
-	logger.Infof("waiting for %d machines with prefix '%s'", expectedReplicas, machinePrefix)
+func waitForMachines(logger log.FieldLogger, cfg *rest.Config, cd *hivev1.ClusterDeployment, poolName string, expectedReplicas int) error {
+	infraID := cd.Spec.ClusterMetadata.InfraID
+	logger.Infof("waiting for %d machines (pool %s, infraID %s)", expectedReplicas, poolName, infraID)
 	lastCount := 0
 	return common.WaitForMachines(cfg, func(machines []*machinev1.Machine) bool {
 		count := 0
 		for _, m := range machines {
-			if strings.HasPrefix(m.Name, machinePrefix) {
+			if m.Labels[capiClusterKey] == infraID && m.Labels[capiMachineTypeKey] == poolName {
 				count++
 			}
 		}
 		if count != lastCount {
-			logger.Infof("found %d machines with prefix '%s'", count, machinePrefix)
+			logger.Infof("found %d machines for pool %s", count, poolName)
 			lastCount = count
 		}
 		return count == expectedReplicas
-	}, 20*time.Minute)
+	}, 30*time.Minute)
 }
 
-func waitForNodes(logger log.FieldLogger, cfg *rest.Config, cd *hivev1.ClusterDeployment, machinePrefix string, expectedReplicas int, extraChecks ...func(node *corev1.Node) bool) error {
-	logger.Infof("waiting for %d nodes with machine annotation prefix '%s'", expectedReplicas, machinePrefix)
+func waitForNodes(logger log.FieldLogger, cfg *rest.Config, cd *hivev1.ClusterDeployment, poolName string, expectedReplicas int, extraChecks ...func(node *corev1.Node) bool) error {
+	infraID := cd.Spec.ClusterMetadata.InfraID
+	logger.Infof("waiting for %d nodes (pool %s)", expectedReplicas, poolName)
+	rc := common.MustGetClientFromConfig(cfg)
 	return common.WaitForNodes(cfg, func(nodes []*corev1.Node) bool {
+		machineList := &machinev1.MachineList{}
+		err := rc.List(context.TODO(), machineList, client.InNamespace(openshiftMachineAPI),
+			client.MatchingLabels{capiClusterKey: infraID, capiMachineTypeKey: poolName})
+		if err != nil {
+			return false
+		}
+		poolMachineNames := sets.New[string]()
+		for i := range machineList.Items {
+			poolMachineNames.Insert(machineList.Items[i].Name)
+		}
 		poolNodes := []*corev1.Node{}
 		for _, n := range nodes {
 			if n.Annotations == nil {
 				continue
 			}
 			machineAnnotation := n.Annotations["machine.openshift.io/machine"]
-			name := strings.Split(machineAnnotation, "/")
-			if len(name) < 2 {
+			parts := strings.Split(machineAnnotation, "/")
+			if len(parts) < 2 {
 				continue
 			}
-			machineName := name[1]
-			if strings.HasPrefix(machineName, machinePrefix) {
+			if poolMachineNames.Has(parts[1]) {
 				poolNodes = append(poolNodes, n)
 			}
 		}
@@ -463,9 +469,5 @@ func waitForNodes(logger log.FieldLogger, cfg *rest.Config, cd *hivev1.ClusterDe
 		}
 
 		return true
-	}, 15*time.Minute)
-}
-
-func machineNamePrefix(cd *hivev1.ClusterDeployment, poolName string) (string, error) {
-	return fmt.Sprintf("%s-%s-", cd.Spec.ClusterMetadata.InfraID, poolName), nil
+	}, 30*time.Minute)
 }
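
The core change: instead of inferring pool membership from the "<infraID>-<poolName>-" Machine name prefix (which the deleted machineNamePrefix helper computed, and which breaks whenever generated names diverge from that pattern), the test selects Machines by the labels the Machine API puts on them. A minimal standalone sketch of that selection with a controller-runtime client, using the same label keys as the commit; the scheme setup and the "my-infra-id"/"infra" values are illustrative assumptions:

package main

import (
	"context"
	"fmt"

	machinev1 "github.com/openshift/api/machine/v1beta1"
	"k8s.io/apimachinery/pkg/runtime"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

func main() {
	// Register the Machine API types (the e2e helpers use hive's own scheme util).
	sch := runtime.NewScheme()
	if err := machinev1.Install(sch); err != nil {
		panic(err)
	}
	c, err := client.New(ctrl.GetConfigOrDie(), client.Options{Scheme: sch})
	if err != nil {
		panic(err)
	}

	// Select one pool's Machines by the labels MAPI stamps on them, rather
	// than parsing the Machine name:
	//   machine.openshift.io/cluster-api-cluster      -> the cluster's infraID
	//   machine.openshift.io/cluster-api-machine-type -> the pool name
	machines := &machinev1.MachineList{}
	err = c.List(context.TODO(), machines,
		client.InNamespace("openshift-machine-api"),
		client.MatchingLabels{
			"machine.openshift.io/cluster-api-cluster":      "my-infra-id", // assumption: substitute your infraID
			"machine.openshift.io/cluster-api-machine-type": "infra",       // assumption: pool under test
		})
	if err != nil {
		panic(err)
	}
	fmt.Printf("found %d machines in pool\n", len(machines.Items))
}

The same key pair drives the new waitForNodes: it lists the pool's Machines by label, then matches Nodes back to them through the machine.openshift.io/machine annotation, so no name arithmetic is needed anywhere.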
