From 83b53f2b459d24bb7bbf5e0cc3287f5101c7cd0b Mon Sep 17 00:00:00 2001
From: Ole Langbehn <ole.langbehn@inoio.de>
Date: Thu, 26 Sep 2024 16:10:34 +0200
Subject: [PATCH] HOLI-10040 HOLI-10041 retry terraform destroy in CI for more
 reliable destroy

---
 .gitignore                                    |  1 +
 .gitlab-ci.yml                                |  7 ++++
 terraform/environments/scripts/destroy-env.sh | 33 ++++++++++++++++---
 3 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index 133ae09..3e3d850 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 .envrc.local
 coverage
+terraform*.log
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index f4d27c0..383459b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -120,6 +120,13 @@ review_destroy:
     name: review/$CI_COMMIT_REF_SLUG
     action: stop
   dependencies: [] # explicitly disable artifact usage
+  artifacts:
+    paths:
+      - "terraform/environments/crash.log" # optional, only available in case of a crash/panic
+      - "terraform/environments/terraform-*.log" # separate log for every step/command
+    name: "${CI_JOB_NAME}_${CI_JOB_ID}"
+    when: on_failure
+    expire_in: 1 week
   script:
     # branch may have been deleted, so we clone and checkout main
     - git clone $CI_REPOSITORY_URL main-clone
diff --git a/terraform/environments/scripts/destroy-env.sh b/terraform/environments/scripts/destroy-env.sh
index 8feb7be..d813769 100755
--- a/terraform/environments/scripts/destroy-env.sh
+++ b/terraform/environments/scripts/destroy-env.sh
@@ -1,4 +1,4 @@
-#!/usr/bin/env sh
+#!/usr/bin/env bash
 
 # exit when any command fails
 set -ex
@@ -6,11 +6,36 @@ set -ex
 # enable debug output in terraform
 export TF_LOG=DEBUG
 
-cd terraform/environments
+# retry logic for destroy: sometimes, a full workspace destroy does not work. This can be due to e.g.:
+# * implicit dependencies between terraform resources not declared with depends_on,
+# * unclean shutdown of resources, e.g. service does not close db connections, db still sees clients connected,
+# * GCP stuff not allowing our resources to be deleted.
+# Most of the time, retrying a destroy fixes these causes.
+# Usage: retry COMMAND [ARG...] -- runs COMMAND up to 3 times, 60 seconds apart.
+# Returns 0 on the first success; exits the script with COMMAND's last status otherwise.
+retry() {
+  local i retval # keep loop state out of the caller's scope
+  for i in {1..3}; do
+    retval=0
+    "$@" || retval=$? # '||' captures the status without toggling the caller's errexit
+    if [ "$retval" -eq 0 ]; then
+      return 0 # success
+    fi
+    if [ "$i" -lt 3 ]; then
+      echo "command '$*' failed in try $i, retrying after 60 seconds" >&2
+      sleep 60 # let things settle a bit
+    else
+      echo "command '$*' failed in try $i, giving up" >&2
+      exit "$retval"
+    fi
+  done
+}
+
+cd "$(dirname "$0")"/.. # go to the parent of the directory containing this script
 
 TF_LOG_PATH=terraform-init.log       terraform init
 TF_LOG_PATH=terraform-version.log    terraform version
-TF_LOG_PATH=terraform-workspace.log  terraform workspace new "$1" || terraform workspace select "$1"
-TF_LOG_PATH=terraform-destroy.log    terraform destroy -auto-approve -var="image_tag=dummy"
+TF_LOG_PATH=terraform-workspace.log  terraform workspace select -or-create=true "$1" # $1: workspace name; replaces the former 'new || select' fallback
+TF_LOG_PATH=terraform-destroy.log    retry terraform destroy -auto-approve -var="image_tag=dummy" # wrapped in retry(): up to 3 attempts
 TF_LOG_PATH=terraform-ws-default.log terraform workspace select default
 TF_LOG_PATH=terraform-ws-delete.log  terraform workspace delete "$1"
-- 
GitLab