From ec701f6d300fca67bd19cca47488cd5186b5f24b Mon Sep 17 00:00:00 2001
From: "Dr. Jan-Philip Gehrcke" <jgehrcke@googlemail.com>
Date: Wed, 19 Apr 2023 12:05:36 +0200
Subject: [PATCH] run_operator_locally.sh: more retrying, debuggability (#2218)

actually retry kubectl port-forward
and better messages
---
 run_operator_locally.sh | 64 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 58 insertions(+), 6 deletions(-)

diff --git a/run_operator_locally.sh b/run_operator_locally.sh
index 9e3e082d..600cc2f6 100755
--- a/run_operator_locally.sh
+++ b/run_operator_locally.sh
@@ -30,8 +30,8 @@ function retry(){
     local -r retry_cmd="$1"
     local -r retry_msg="$2"
 
-    # times out after 1 minute
-    for i in {1..20}; do
+    # Time out after three minutes.
+    for i in {1..60}; do
         if  eval "$retry_cmd"; then
             return 0
         fi
@@ -165,11 +165,63 @@ function forward_ports(){
     local operator_pod
     operator_pod=$(kubectl get pod -l name=postgres-operator -o jsonpath={.items..metadata.name})
 
-    # runs in the background to keep current terminal responsive
-    # stdout redirect removes the info message about forwarded ports; the message sometimes garbles the cli prompt
-    kubectl port-forward "$operator_pod" "$LOCAL_PORT":"$OPERATOR_PORT" &> /dev/null &
+    # Spawn `kubectl port-forward` in the background to keep current terminal
+    # responsive. Hide stdout because otherwise there is a note about each TCP
+    # connection. Do not hide stderr so port-forward setup errors can be
+    # debugged. Sometimes the port-forward setup fails because expected k8s
+    # state isn't achieved yet. Try to detect that case and then run the
+    # command again (in a finite loop).
+    for _attempt in {1..20}; do
+        # Delay before every attempt, including the first one: the first
+        # attempt is intentionally delayed, too.
+        echo "soon: invoke kubectl port-forward command (attempt $_attempt)"
+        sleep 5
+
+        # With the --pod-running-timeout=4s argument the process is expected
+        # to terminate within about that time if the pod isn't ready yet.
+        kubectl port-forward --pod-running-timeout=4s "$operator_pod" "$LOCAL_PORT":"$OPERATOR_PORT" 1> /dev/null &
+        _kubectl_pid=$!
+        _pf_success=true
+
+        # A successful `kubectl port-forward` setup can pragmatically be
+        # detected with a time-based criterion: once set up, it is a
+        # long-running process. Observe the process for roughly 7 seconds,
+        # polling once per second. If it terminates within that time frame,
+        # the setup certainly failed (see the --pod-running-timeout note
+        # above). If it is still running after that time frame, consider the
+        # setup successful.
+        for ib in {1..7}; do
+            sleep 1
+            # Portable and non-blocking test: is process still running?
+            if kill -s 0 -- "${_kubectl_pid}" >/dev/null 2>&1; then
+                echo "port-forward process is still running"
+            else
+                # port-forward process seems to have terminated, reap zombie
+                set +e
+                # The process has already exited, so `wait` is expected to
+                # return immediately, yielding the exit status of that pid.
+                wait $_kubectl_pid
+                _kubectl_rc=$?
+                set -e
+                echo "port-forward process terminated with exit code ${_kubectl_rc}"
+                _pf_success=false
+                break
+            fi
+        done
+
+        if [ ${_pf_success} = true ]; then
+            echo "port-forward setup seems successful. leave retry loop."
+            break
+        fi
+
+    done
+
+    if [ "${_pf_success}" = false ]; then
+        echo "port-forward setup failed after retrying. exit."
+        exit 1
+    fi
 
-    echo $! > "$PATH_TO_PORT_FORWARED_KUBECTL_PID"
+    echo "${_kubectl_pid}" > "$PATH_TO_PORT_FORWARED_KUBECTL_PID"
 }
 
 
-- 
GitLab