From 1c2ec7ecb8ffcb3e4a63c6e760704da8f41348b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Julian=20T=C3=B6lle?= <julian.toelle@hetzner-cloud.de>
Date: Tue, 21 Feb 2023 12:45:45 +0100
Subject: [PATCH] ci(e2e): improve behaviour in spite of flakiness (#386)

Related to #381
---
 .github/workflows/test_e2e.yml | 3 +++
 e2etests/e2e_test.go           | 7 +++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test_e2e.yml b/.github/workflows/test_e2e.yml
index 6be487f..af4def8 100644
--- a/.github/workflows/test_e2e.yml
+++ b/.github/workflows/test_e2e.yml
@@ -5,6 +5,9 @@ jobs:
   test:
     runs-on: self-hosted
     strategy:
+      # The e2e tests are flaky and often one of the jobs fails. With the default setting,
+      # a single failure aborts all other running jobs, and every job has to be restarted.
+      fail-fast: false
       matrix:
         k8s: [ k8s-1.23.15, k8s-1.24.9, k8s-1.25.5, k8s-1.26.0 ]
     name: k8s ${{ matrix.k8s }}
diff --git a/e2etests/e2e_test.go b/e2etests/e2e_test.go
index 2d7124c..e61cf99 100644
--- a/e2etests/e2e_test.go
+++ b/e2etests/e2e_test.go
@@ -24,8 +24,11 @@ func TestMain(m *testing.M) {
 }
 
 func TestOfficialTestsuite(t *testing.T) {
+	// The e2e tests are a bit flaky, and at the moment in ~1/3 of the runs a test fails, causing the whole pipeline to
+	// fail. As the e2e tests take 15-20 minutes each, this is quite annoying. By setting -flakeAttempts=2, the pipeline
+	// will immediately retry any failed tests.
 	t.Run("parallel tests", func(t *testing.T) {
-		err := RunCommandVisibleOnServer(testCluster.setup.privKey, testCluster.setup.MainNode, "KUBECONFIG=/root/.kube/config ./ginkgo -nodes=6 -v -focus='External.Storage' -skip='\\[Feature:|\\[Disruptive\\]|\\[Serial\\]' ./e2e.test -- -storage.testdriver=test-driver.yml")
+		err := RunCommandVisibleOnServer(testCluster.setup.privKey, testCluster.setup.MainNode, "KUBECONFIG=/root/.kube/config ./ginkgo -nodes=6 -flakeAttempts=2 -v -focus='External.Storage' -skip='\\[Feature:|\\[Disruptive\\]|\\[Serial\\]' ./e2e.test -- -storage.testdriver=test-driver.yml")
 		if err != nil {
 			t.Error(err)
 		}
@@ -37,7 +40,7 @@ func TestOfficialTestsuite(t *testing.T) {
 		// Volume Access Mode in Kubernetes).
 		// This feature is being tracked in https://github.com/hetznercloud/csi-driver/issues/327
 		// and we should add the tests once we have implemented the capability.
-		err := RunCommandVisibleOnServer(testCluster.setup.privKey, testCluster.setup.MainNode, "KUBECONFIG=/root/.kube/config ./ginkgo -v -focus='External.Storage.*(\\[Feature:|\\[Serial\\])' -skip='\\[Feature:SELinuxMountReadWriteOncePod\\]' ./e2e.test -- -storage.testdriver=test-driver.yml")
+		err := RunCommandVisibleOnServer(testCluster.setup.privKey, testCluster.setup.MainNode, "KUBECONFIG=/root/.kube/config ./ginkgo -flakeAttempts=2 -v -focus='External.Storage.*(\\[Feature:|\\[Serial\\])' -skip='\\[Feature:SELinuxMountReadWriteOncePod\\]' ./e2e.test -- -storage.testdriver=test-driver.yml")
 		if err != nil {
 			t.Error(err)
 		}
-- 
GitLab