From 83b53f2b459d24bb7bbf5e0cc3287f5101c7cd0b Mon Sep 17 00:00:00 2001
From: Ole Langbehn <ole.langbehn@inoio.de>
Date: Thu, 26 Sep 2024 16:10:34 +0200
Subject: [PATCH] HOLI-10040 HOLI-10041 retry terraform destroy in CI for more reliable destroy

---
Review note: retry() was revised after the original commit (function-local
variables, messages on stderr, "return" instead of "exit", try count and delay
overridable via RETRY_MAX_TRIES / RETRY_DELAY_SECONDS). The "index" blob hash
of destroy-env.sh below predates that revision.

 .gitignore                                    |  1 +
 .gitlab-ci.yml                                |  7 ++++
 terraform/environments/scripts/destroy-env.sh | 40 +++++++++++++++++++++++++++++++---
 3 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index 133ae09..3e3d850 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 .envrc.local
 coverage
+terraform*.log
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index f4d27c0..383459b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -120,6 +120,13 @@ review_destroy:
     name: review/$CI_COMMIT_REF_SLUG
     action: stop
   dependencies: [] # explicitly disable artifact usage
+  artifacts:
+    paths:
+      - "terraform/environments/crash.log" # optional, only available in case of a crash/panic
+      - "terraform/environments/terraform-*.log" # separate log for every step/command
+    name: "${CI_JOB_NAME}_${CI_JOB_ID}"
+    when: on_failure
+    expire_in: 1 week
   script:
     # branch may have been deleted, so we clone and checkout main
     - git clone $CI_REPOSITORY_URL main-clone
diff --git a/terraform/environments/scripts/destroy-env.sh b/terraform/environments/scripts/destroy-env.sh
index 8feb7be..d813769 100755
--- a/terraform/environments/scripts/destroy-env.sh
+++ b/terraform/environments/scripts/destroy-env.sh
@@ -1,4 +1,4 @@
-#!/usr/bin/env sh
+#!/usr/bin/env bash
 
 # exit when any command fails
 set -ex
@@ -6,11 +6,43 @@ set -ex
 # enable debug output in terraform
 export TF_LOG=DEBUG
 
-cd terraform/environments
+# retry logic for destroy: sometimes, a full workspace destroy does not work. This can be due to e.g.:
+# * implicit dependencies between terraform resources not declared with depends_on,
+# * unclean shutdown of resources, e.g. service does not close db connections, db still sees clients connected,
+# * GCP stuff not allowing our resources to be deleted.
+# Most of the time, retrying a destroy fixes these causes.
+#
+# Usage: retry <command> [args...]
+# Runs the command up to RETRY_MAX_TRIES times (default 3) and sleeps
+# RETRY_DELAY_SECONDS (default 60) between tries. Returns 0 as soon as a try
+# succeeds, otherwise the exit status of the last failed try; under "set -e"
+# that non-zero return still aborts the script. Messages go to stderr.
+retry() {
+  local max_tries="${RETRY_MAX_TRIES:-3}"
+  local delay="${RETRY_DELAY_SECONDS:-60}"
+  local try retval=1 # stays 1 if the loop never runs, so no false success
+  for ((try = 1; try <= max_tries; try++)); do
+    # "||" records the status without tripping errexit, so "set -e" does not
+    # have to be switched off and on around the call
+    retval=0
+    "$@" || retval=$?
+    if [ "$retval" -eq 0 ]; then
+      return 0
+    fi
+    if [ "$try" -lt "$max_tries" ]; then
+      echo "command '$*' failed in try $try, retrying after $delay seconds" >&2
+      sleep "$delay" # let things settle a bit
+    fi
+  done
+  echo "command '$*' failed in try $max_tries, giving up" >&2
+  return "$retval"
+}
+
+cd "$(dirname "$0")"/..
 
 TF_LOG_PATH=terraform-init.log terraform init
 TF_LOG_PATH=terraform-version.log terraform version
-TF_LOG_PATH=terraform-workspace.log terraform workspace new "$1" || terraform workspace select "$1"
-TF_LOG_PATH=terraform-destroy.log terraform destroy -auto-approve -var="image_tag=dummy"
+TF_LOG_PATH=terraform-workspace.log terraform workspace select -or-create=true "$1"
+TF_LOG_PATH=terraform-destroy.log retry terraform destroy -auto-approve -var="image_tag=dummy"
 TF_LOG_PATH=terraform-ws-default.log terraform workspace select default
 TF_LOG_PATH=terraform-ws-delete.log terraform workspace delete "$1"
-- 
GitLab