From bfa7dd3f57717b8e5df07bbe8264c82844b27e96 Mon Sep 17 00:00:00 2001 From: Ivan Bodrov Date: Tue, 28 Jan 2025 14:07:38 -0500 Subject: [PATCH] concord-agent-operator: attempt to improve error logging --- .../{PodUtils.java => PodLabels.java} | 8 ++- .../planner/DeleteConfigMapChange.java | 20 ++----- .../agentoperator/planner/Planner.java | 8 +-- .../planner/TagForRemovalChange.java | 4 +- .../planner/TryToDeletePodChange.java | 4 +- .../resources/AgentConfigMap.java | 55 ++++++++++++++--- .../agentoperator/resources/AgentPod.java | 59 +++++++++++++------ .../agentoperator/scheduler/Scheduler.java | 4 +- 8 files changed, 105 insertions(+), 57 deletions(-) rename agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/{PodUtils.java => PodLabels.java} (90%) diff --git a/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/PodUtils.java b/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/PodLabels.java similarity index 90% rename from agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/PodUtils.java rename to agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/PodLabels.java index 183e2e40c2..3605ada09b 100644 --- a/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/PodUtils.java +++ b/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/PodLabels.java @@ -28,9 +28,9 @@ import java.util.Map; -public final class PodUtils { +public final class PodLabels { - private static final Logger log = LoggerFactory.getLogger(PodUtils.class); + private static final Logger log = LoggerFactory.getLogger(PodLabels.class); public static void applyTag(KubernetesClient client, String podName, String tagName, String tagValue) { Pod pod = client.pods().withName(podName).get(); @@ -51,10 +51,12 @@ public static void applyTag(KubernetesClient client, String podName, String tagN } catch (KubernetesClientException e) { if (e.getCode() == 404) { log.warn("['{}']: apply tag ['{}': '{}'] -> pod doesn't exist, nothing to do", podName, tagName, tagValue); + } else { + log.warn("['{}']: apply tag ['{}': '{}'] -> error", podName, tagName, tagValue, e); } } } - private PodUtils() { + private PodLabels() { } } diff --git a/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/planner/DeleteConfigMapChange.java b/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/planner/DeleteConfigMapChange.java index 02231d1805..12b9eb04c0 100644 --- a/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/planner/DeleteConfigMapChange.java +++ b/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/planner/DeleteConfigMapChange.java @@ -20,6 +20,7 @@ * ===== */ +import com.walmartlabs.concord.agentoperator.resources.AgentConfigMap; import io.fabric8.kubernetes.client.KubernetesClient; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -36,25 +37,14 @@ public DeleteConfigMapChange(String configMapName) { @Override public void apply(KubernetesClient client) { - var result = client.configMaps().withName(configMapName).delete(); - if (!result.isEmpty()) { - // wait till it's actually removed - while (client.configMaps().withName(configMapName).get() != null) { - try { - Thread.sleep(1000); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - } - } - - log.info("apply -> removed a configmap {}", configMapName); - } + AgentConfigMap.delete(client, configMapName); + log.info("apply -> removed a configmap {}", configMapName); } @Override public String toString() { return "DeleteConfigMapChange{" + - "configMapName='" + configMapName + '\'' + - '}'; + "configMapName='" + configMapName + '\'' + + '}'; } } diff --git a/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/planner/Planner.java b/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/planner/Planner.java index e88e642358..2da878aedb 100644 --- a/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/planner/Planner.java +++ b/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/planner/Planner.java @@ -20,9 +20,9 @@ * ===== */ +import com.walmartlabs.concord.agentoperator.HashUtils; import com.walmartlabs.concord.agentoperator.agent.AgentClient; import com.walmartlabs.concord.agentoperator.agent.AgentClientFactory; -import com.walmartlabs.concord.agentoperator.HashUtils; import com.walmartlabs.concord.agentoperator.resources.AgentConfigMap; import com.walmartlabs.concord.agentoperator.resources.AgentPod; import com.walmartlabs.concord.agentoperator.scheduler.AgentPoolInstance; @@ -56,11 +56,7 @@ public List plan(AgentPoolInstance poolInstance) throws IOException { List changes = new ArrayList<>(); // process pods marked for removal first - client.pods() - .withLabel(AgentPod.TAGGED_FOR_REMOVAL_LABEL) - .withLabel(AgentPod.POOL_NAME_LABEL, resourceName) - .list() - .getItems() + AgentPod.listMarkedForRemoval(client, resourceName) .forEach(n -> changes.add(new TryToDeletePodChange(n.getMetadata().getName(), agentClientFactory.create(n)))); List pods = AgentPod.list(client, resourceName); diff --git a/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/planner/TagForRemovalChange.java b/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/planner/TagForRemovalChange.java index 91b761970f..c5f7d23a33 100644 --- a/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/planner/TagForRemovalChange.java +++ b/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/planner/TagForRemovalChange.java @@ -21,7 +21,7 @@ */ import com.walmartlabs.concord.agentoperator.agent.AgentClient; -import com.walmartlabs.concord.agentoperator.PodUtils; +import com.walmartlabs.concord.agentoperator.PodLabels; import com.walmartlabs.concord.agentoperator.resources.AgentPod; import io.fabric8.kubernetes.client.KubernetesClient; import org.slf4j.Logger; @@ -48,7 +48,7 @@ public void apply(KubernetesClient client) { return; } - PodUtils.applyTag(client, podName, AgentPod.TAGGED_FOR_REMOVAL_LABEL, "true"); + PodLabels.applyTag(client, podName, AgentPod.TAGGED_FOR_REMOVAL_LABEL, "true"); } @Override diff --git a/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/planner/TryToDeletePodChange.java b/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/planner/TryToDeletePodChange.java index e0bd1a0a78..e4b85a8236 100644 --- a/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/planner/TryToDeletePodChange.java +++ b/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/planner/TryToDeletePodChange.java @@ -21,7 +21,7 @@ */ import com.walmartlabs.concord.agentoperator.agent.AgentClient; -import com.walmartlabs.concord.agentoperator.PodUtils; +import com.walmartlabs.concord.agentoperator.PodLabels; import com.walmartlabs.concord.agentoperator.resources.AgentPod; import io.fabric8.kubernetes.api.model.Pod; import io.fabric8.kubernetes.client.KubernetesClient; @@ -88,7 +88,7 @@ public void apply(KubernetesClient client) { // agent pod in maintenance mode and all workers done client.pods().withName(podName).delete(); - PodUtils.applyTag(client, podName, AgentPod.PRE_STOP_HOOK_TERMINATION_LABEL, "true"); + PodLabels.applyTag(client, podName, AgentPod.PRE_STOP_HOOK_TERMINATION_LABEL, "true"); log.info("apply ['{}'] -> Marked for termination (former phase: {})", podName, pod.getStatus().getPhase()); } diff --git a/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/resources/AgentConfigMap.java b/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/resources/AgentConfigMap.java index a09c32c242..fc40773c1e 100644 --- a/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/resources/AgentConfigMap.java +++ b/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/resources/AgentConfigMap.java @@ -27,21 +27,55 @@ import com.walmartlabs.concord.agentoperator.scheduler.AgentPoolInstance; import io.fabric8.kubernetes.api.model.ConfigMap; import io.fabric8.kubernetes.client.KubernetesClient; +import io.fabric8.kubernetes.client.KubernetesClientException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.ByteArrayInputStream; import java.io.IOException; public final class AgentConfigMap { + private static final Logger log = LoggerFactory.getLogger(AgentConfigMap.class); + private static final ObjectMapper objectMapper = new ObjectMapper(); public static ConfigMap get(KubernetesClient client, String configMapName) { - return client.configMaps().withName(configMapName).get(); + try { + return client.configMaps().withName(configMapName).get(); + } catch (KubernetesClientException e) { + log.warn("get ['{}'] -> error while getting a configmap: {}", configMapName, e.getMessage()); + throw e; + } } public static void create(KubernetesClient client, AgentPoolInstance poolInstance, String configMapName) throws IOException { - ConfigMap m = prepare(client, poolInstance, configMapName); - client.configMaps().resource(m).create(); + try { + ConfigMap m = prepare(client, poolInstance, configMapName); + client.configMaps().resource(m).create(); + } catch (KubernetesClientException e) { + log.warn("create ['{}', '{}'] -> error while creating a configmap: {}", poolInstance.getName(), configMapName, e.getMessage()); + throw e; + } + } + + public static void delete(KubernetesClient client, String configMapName) { + try { + client.configMaps().withName(configMapName).delete(); + + // wait till it's actually removed + while (client.configMaps().withName(configMapName).get() != null) { + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + return; + } + } + } catch (KubernetesClientException e) { + log.warn("delete ['{}'] -> error while deleting a configmap: {}", configMapName, e.getMessage()); + throw e; + } } public static boolean hasChanges(KubernetesClient client, AgentPoolInstance poolInstance, ConfigMap a) throws IOException { @@ -54,13 +88,18 @@ public static boolean hasChanges(KubernetesClient client, AgentPoolInstance pool } private static ConfigMap prepare(KubernetesClient client, AgentPoolInstance poolInstance, String configMapName) throws IOException { - AgentPoolConfiguration spec = poolInstance.getResource().getSpec(); + try { + AgentPoolConfiguration spec = poolInstance.getResource().getSpec(); - String configMapYaml = objectMapper.writeValueAsString(spec.getConfigMap()) - .replaceAll("%%configMapName%%", configMapName) - .replace("%%preStopHook%%", escape(Resources.get("/prestop-hook.sh"))); + String configMapYaml = objectMapper.writeValueAsString(spec.getConfigMap()) + .replaceAll("%%configMapName%%", configMapName) + .replace("%%preStopHook%%", escape(Resources.get("/prestop-hook.sh"))); - return client.configMaps().load(new ByteArrayInputStream(configMapYaml.getBytes())).item(); + return client.configMaps().load(new ByteArrayInputStream(configMapYaml.getBytes())).item(); + } catch (KubernetesClientException e) { + log.warn("prepare ['{}', '{}'] -> error while preparing a configmap: {}", poolInstance.getName(), configMapName, e.getMessage()); + throw e; + } } private static String escape(String str) throws JsonProcessingException { diff --git a/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/resources/AgentPod.java b/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/resources/AgentPod.java index 56ab386c78..d5fc989b30 100644 --- a/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/resources/AgentPod.java +++ b/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/resources/AgentPod.java @@ -26,6 +26,9 @@ import com.walmartlabs.concord.agentoperator.scheduler.AgentPoolInstance; import io.fabric8.kubernetes.api.model.Pod; import io.fabric8.kubernetes.client.KubernetesClient; +import io.fabric8.kubernetes.client.KubernetesClientException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.ByteArrayInputStream; import java.io.IOException; @@ -33,6 +36,8 @@ public final class AgentPod { + private static final Logger log = LoggerFactory.getLogger(AgentPod.class); + public static final String TAGGED_FOR_REMOVAL_LABEL = "concordTaggedForRemoval"; public static final String PRE_STOP_HOOK_TERMINATION_LABEL = "preStopHookTermination"; public static final String POOL_NAME_LABEL = "poolName"; @@ -40,11 +45,29 @@ public final class AgentPod { private static final ObjectMapper objectMapper = new ObjectMapper(); + public static List listMarkedForRemoval(KubernetesClient client, String resourceName) { + try { + return client.pods() + .withLabel(AgentPod.TAGGED_FOR_REMOVAL_LABEL) + .withLabel(AgentPod.POOL_NAME_LABEL, resourceName) + .list() + .getItems(); + } catch (KubernetesClientException e) { + log.warn("listMarkedForRemoval ['{}'] -> error while listing marked for removal pods: {}", resourceName, e.getMessage()); + throw e; + } + } + public static List list(KubernetesClient client, String resourceName) { - return client.pods() - .withLabel(POOL_NAME_LABEL, resourceName) - .list() - .getItems(); + try { + return client.pods() + .withLabel(POOL_NAME_LABEL, resourceName) + .list() + .getItems(); + } catch (KubernetesClientException e) { + log.warn("list ['{}'] -> error while listing pool pods: {}", resourceName, e.getMessage()); + throw e; + } } public static void create(KubernetesClient client, @@ -53,23 +76,21 @@ public static void create(KubernetesClient client, String configMapName, String hash) throws IOException { - AgentPoolConfiguration spec = poolInstance.getResource().getSpec(); + try { + AgentPoolConfiguration spec = poolInstance.getResource().getSpec(); - String podYaml = objectMapper.writeValueAsString(spec.getPod()) - .replaceAll("%%podName%%", podName) - .replaceAll("%%app%%", AgentPool.SERVICE_FULL_NAME) - .replaceAll("%%" + POOL_NAME_LABEL + "%%", poolInstance.getName()) - .replaceAll("%%configMapName%%", configMapName) - .replaceAll("%%" + CONFIG_HASH_LABEL + "%%", hash); - - client.pods().load(new ByteArrayInputStream(podYaml.getBytes())).create(); - } + String podYaml = objectMapper.writeValueAsString(spec.getPod()) + .replaceAll("%%podName%%", podName) + .replaceAll("%%app%%", AgentPool.SERVICE_FULL_NAME) + .replaceAll("%%" + POOL_NAME_LABEL + "%%", poolInstance.getName()) + .replaceAll("%%configMapName%%", configMapName) + .replaceAll("%%" + CONFIG_HASH_LABEL + "%%", hash); - public static List listPods(KubernetesClient client, String resourceName) { - return client.pods() - .withLabel(POOL_NAME_LABEL, resourceName) - .list() - .getItems(); + client.pods().load(new ByteArrayInputStream(podYaml.getBytes())).create(); + } catch (KubernetesClientException e) { + log.warn("create ['{}', '{}', '{}', '{}'] -> error while creating a pod: {}", poolInstance.getName(), podName, configMapName, hash, e.getMessage()); + throw e; + } } private AgentPod() { diff --git a/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/scheduler/Scheduler.java b/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/scheduler/Scheduler.java index ef4f292fad..86d575f02d 100644 --- a/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/scheduler/Scheduler.java +++ b/agent-operator/src/main/java/com/walmartlabs/concord/agentoperator/scheduler/Scheduler.java @@ -122,7 +122,7 @@ private void doRun() { throw new IllegalArgumentException("Unknown pool status: " + i.getStatus()); } } catch (IOException e) { - log.error("doRun -> error while processing a registered pool {}: {}", i.getName(), e.getMessage()); + log.error("doRun -> error while processing a registered pool {} ({}): {}", i.getName(), i.getStatus(), e.getMessage()); } }); } @@ -176,7 +176,7 @@ private void processDeleted(AgentPoolInstance i) throws IOException { apply(changes); // if no pods left - remove the pool - List pods = AgentPod.listPods(k8sClient, resourceName); + List pods = AgentPod.list(k8sClient, resourceName); if (pods.isEmpty()) { synchronized (pools) { pools.remove(resourceName);