Skip to content

Commit

Permalink
concord-agent-operator: attempt to improve error logging
Browse files Browse the repository at this point in the history
  • Loading branch information
ibodrov committed Jan 28, 2025
1 parent 113635f commit bfa7dd3
Show file tree
Hide file tree
Showing 8 changed files with 105 additions and 57 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@

import java.util.Map;

public final class PodUtils {
public final class PodLabels {

private static final Logger log = LoggerFactory.getLogger(PodUtils.class);
private static final Logger log = LoggerFactory.getLogger(PodLabels.class);

public static void applyTag(KubernetesClient client, String podName, String tagName, String tagValue) {
Pod pod = client.pods().withName(podName).get();
Expand All @@ -51,10 +51,12 @@ public static void applyTag(KubernetesClient client, String podName, String tagN
} catch (KubernetesClientException e) {
if (e.getCode() == 404) {
log.warn("['{}']: apply tag ['{}': '{}'] -> pod doesn't exist, nothing to do", podName, tagName, tagValue);
} else {
log.warn("['{}']: apply tag ['{}': '{}'] -> error", podName, tagName, tagValue, e);
}
}
}

private PodUtils() {
private PodLabels() {
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
* =====
*/

import com.walmartlabs.concord.agentoperator.resources.AgentConfigMap;
import io.fabric8.kubernetes.client.KubernetesClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -36,25 +37,14 @@ public DeleteConfigMapChange(String configMapName) {

@Override
public void apply(KubernetesClient client) {
var result = client.configMaps().withName(configMapName).delete();
if (!result.isEmpty()) {
// wait till it's actually removed
while (client.configMaps().withName(configMapName).get() != null) {
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}

log.info("apply -> removed a configmap {}", configMapName);
}
AgentConfigMap.delete(client, configMapName);
log.info("apply -> removed a configmap {}", configMapName);
}

@Override
public String toString() {
return "DeleteConfigMapChange{" +
"configMapName='" + configMapName + '\'' +
'}';
"configMapName='" + configMapName + '\'' +
'}';
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@
* =====
*/

import com.walmartlabs.concord.agentoperator.HashUtils;
import com.walmartlabs.concord.agentoperator.agent.AgentClient;
import com.walmartlabs.concord.agentoperator.agent.AgentClientFactory;
import com.walmartlabs.concord.agentoperator.HashUtils;
import com.walmartlabs.concord.agentoperator.resources.AgentConfigMap;
import com.walmartlabs.concord.agentoperator.resources.AgentPod;
import com.walmartlabs.concord.agentoperator.scheduler.AgentPoolInstance;
Expand Down Expand Up @@ -56,11 +56,7 @@ public List<Change> plan(AgentPoolInstance poolInstance) throws IOException {
List<Change> changes = new ArrayList<>();

// process pods marked for removal first
client.pods()
.withLabel(AgentPod.TAGGED_FOR_REMOVAL_LABEL)
.withLabel(AgentPod.POOL_NAME_LABEL, resourceName)
.list()
.getItems()
AgentPod.listMarkedForRemoval(client, resourceName)
.forEach(n -> changes.add(new TryToDeletePodChange(n.getMetadata().getName(), agentClientFactory.create(n))));

List<Pod> pods = AgentPod.list(client, resourceName);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
*/

import com.walmartlabs.concord.agentoperator.agent.AgentClient;
import com.walmartlabs.concord.agentoperator.PodUtils;
import com.walmartlabs.concord.agentoperator.PodLabels;
import com.walmartlabs.concord.agentoperator.resources.AgentPod;
import io.fabric8.kubernetes.client.KubernetesClient;
import org.slf4j.Logger;
Expand All @@ -48,7 +48,7 @@ public void apply(KubernetesClient client) {
return;
}

PodUtils.applyTag(client, podName, AgentPod.TAGGED_FOR_REMOVAL_LABEL, "true");
PodLabels.applyTag(client, podName, AgentPod.TAGGED_FOR_REMOVAL_LABEL, "true");
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
*/

import com.walmartlabs.concord.agentoperator.agent.AgentClient;
import com.walmartlabs.concord.agentoperator.PodUtils;
import com.walmartlabs.concord.agentoperator.PodLabels;
import com.walmartlabs.concord.agentoperator.resources.AgentPod;
import io.fabric8.kubernetes.api.model.Pod;
import io.fabric8.kubernetes.client.KubernetesClient;
Expand Down Expand Up @@ -88,7 +88,7 @@ public void apply(KubernetesClient client) {

// agent pod in maintenance mode and all workers done
client.pods().withName(podName).delete();
PodUtils.applyTag(client, podName, AgentPod.PRE_STOP_HOOK_TERMINATION_LABEL, "true");
PodLabels.applyTag(client, podName, AgentPod.PRE_STOP_HOOK_TERMINATION_LABEL, "true");
log.info("apply ['{}'] -> Marked for termination (former phase: {})", podName, pod.getStatus().getPhase());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,21 +27,55 @@
import com.walmartlabs.concord.agentoperator.scheduler.AgentPoolInstance;
import io.fabric8.kubernetes.api.model.ConfigMap;
import io.fabric8.kubernetes.client.KubernetesClient;
import io.fabric8.kubernetes.client.KubernetesClientException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.ByteArrayInputStream;
import java.io.IOException;

public final class AgentConfigMap {

private static final Logger log = LoggerFactory.getLogger(AgentConfigMap.class);

private static final ObjectMapper objectMapper = new ObjectMapper();

public static ConfigMap get(KubernetesClient client, String configMapName) {
return client.configMaps().withName(configMapName).get();
try {
return client.configMaps().withName(configMapName).get();
} catch (KubernetesClientException e) {
log.warn("get ['{}'] -> error while getting a configmap: {}", configMapName, e.getMessage());
throw e;
}
}

public static void create(KubernetesClient client, AgentPoolInstance poolInstance, String configMapName) throws IOException {
ConfigMap m = prepare(client, poolInstance, configMapName);
client.configMaps().resource(m).create();
try {
ConfigMap m = prepare(client, poolInstance, configMapName);
client.configMaps().resource(m).create();
} catch (KubernetesClientException e) {
log.warn("create ['{}', '{}'] -> error while creating a configmap: {}", poolInstance.getName(), configMapName, e.getMessage());
throw e;
}
}

public static void delete(KubernetesClient client, String configMapName) {
try {
client.configMaps().withName(configMapName).delete();

// wait till it's actually removed
while (client.configMaps().withName(configMapName).get() != null) {
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
return;
}
}
} catch (KubernetesClientException e) {
log.warn("delete ['{}'] -> error while deleting a configmap: {}", configMapName, e.getMessage());
throw e;
}
}

public static boolean hasChanges(KubernetesClient client, AgentPoolInstance poolInstance, ConfigMap a) throws IOException {
Expand All @@ -54,13 +88,18 @@ public static boolean hasChanges(KubernetesClient client, AgentPoolInstance pool
}

private static ConfigMap prepare(KubernetesClient client, AgentPoolInstance poolInstance, String configMapName) throws IOException {
AgentPoolConfiguration spec = poolInstance.getResource().getSpec();
try {
AgentPoolConfiguration spec = poolInstance.getResource().getSpec();

String configMapYaml = objectMapper.writeValueAsString(spec.getConfigMap())
.replaceAll("%%configMapName%%", configMapName)
.replace("%%preStopHook%%", escape(Resources.get("/prestop-hook.sh")));
String configMapYaml = objectMapper.writeValueAsString(spec.getConfigMap())
.replaceAll("%%configMapName%%", configMapName)
.replace("%%preStopHook%%", escape(Resources.get("/prestop-hook.sh")));

return client.configMaps().load(new ByteArrayInputStream(configMapYaml.getBytes())).item();
return client.configMaps().load(new ByteArrayInputStream(configMapYaml.getBytes())).item();
} catch (KubernetesClientException e) {
log.warn("prepare ['{}', '{}'] -> error while preparing a configmap: {}", poolInstance.getName(), configMapName, e.getMessage());
throw e;
}
}

private static String escape(String str) throws JsonProcessingException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,25 +26,48 @@
import com.walmartlabs.concord.agentoperator.scheduler.AgentPoolInstance;
import io.fabric8.kubernetes.api.model.Pod;
import io.fabric8.kubernetes.client.KubernetesClient;
import io.fabric8.kubernetes.client.KubernetesClientException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.List;

public final class AgentPod {

private static final Logger log = LoggerFactory.getLogger(AgentPod.class);

public static final String TAGGED_FOR_REMOVAL_LABEL = "concordTaggedForRemoval";
public static final String PRE_STOP_HOOK_TERMINATION_LABEL = "preStopHookTermination";
public static final String POOL_NAME_LABEL = "poolName";
public static final String CONFIG_HASH_LABEL = "concordCfgHash";

private static final ObjectMapper objectMapper = new ObjectMapper();

public static List<Pod> listMarkedForRemoval(KubernetesClient client, String resourceName) {
try {
return client.pods()
.withLabel(AgentPod.TAGGED_FOR_REMOVAL_LABEL)
.withLabel(AgentPod.POOL_NAME_LABEL, resourceName)
.list()
.getItems();
} catch (KubernetesClientException e) {
log.warn("listMarkedForRemoval ['{}'] -> error while listing marked for removal pods: {}", resourceName, e.getMessage());
throw e;
}
}

public static List<Pod> list(KubernetesClient client, String resourceName) {
return client.pods()
.withLabel(POOL_NAME_LABEL, resourceName)
.list()
.getItems();
try {
return client.pods()
.withLabel(POOL_NAME_LABEL, resourceName)
.list()
.getItems();
} catch (KubernetesClientException e) {
log.warn("list ['{}'] -> error while listing pool pods: {}", resourceName, e.getMessage());
throw e;
}
}

public static void create(KubernetesClient client,
Expand All @@ -53,23 +76,21 @@ public static void create(KubernetesClient client,
String configMapName,
String hash) throws IOException {

AgentPoolConfiguration spec = poolInstance.getResource().getSpec();
try {
AgentPoolConfiguration spec = poolInstance.getResource().getSpec();

String podYaml = objectMapper.writeValueAsString(spec.getPod())
.replaceAll("%%podName%%", podName)
.replaceAll("%%app%%", AgentPool.SERVICE_FULL_NAME)
.replaceAll("%%" + POOL_NAME_LABEL + "%%", poolInstance.getName())
.replaceAll("%%configMapName%%", configMapName)
.replaceAll("%%" + CONFIG_HASH_LABEL + "%%", hash);

client.pods().load(new ByteArrayInputStream(podYaml.getBytes())).create();
}
String podYaml = objectMapper.writeValueAsString(spec.getPod())
.replaceAll("%%podName%%", podName)
.replaceAll("%%app%%", AgentPool.SERVICE_FULL_NAME)
.replaceAll("%%" + POOL_NAME_LABEL + "%%", poolInstance.getName())
.replaceAll("%%configMapName%%", configMapName)
.replaceAll("%%" + CONFIG_HASH_LABEL + "%%", hash);

public static List<Pod> listPods(KubernetesClient client, String resourceName) {
return client.pods()
.withLabel(POOL_NAME_LABEL, resourceName)
.list()
.getItems();
client.pods().load(new ByteArrayInputStream(podYaml.getBytes())).create();
} catch (KubernetesClientException e) {
log.warn("create ['{}', '{}', '{}', '{}'] -> error while creating a pod: {}", poolInstance.getName(), podName, configMapName, hash, e.getMessage());
throw e;
}
}

private AgentPod() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ private void doRun() {
throw new IllegalArgumentException("Unknown pool status: " + i.getStatus());
}
} catch (IOException e) {
log.error("doRun -> error while processing a registered pool {}: {}", i.getName(), e.getMessage());
log.error("doRun -> error while processing a registered pool {} ({}): {}", i.getName(), i.getStatus(), e.getMessage());
}
});
}
Expand Down Expand Up @@ -176,7 +176,7 @@ private void processDeleted(AgentPoolInstance i) throws IOException {
apply(changes);

// if no pods left - remove the pool
List<Pod> pods = AgentPod.listPods(k8sClient, resourceName);
List<Pod> pods = AgentPod.list(k8sClient, resourceName);
if (pods.isEmpty()) {
synchronized (pools) {
pools.remove(resourceName);
Expand Down

0 comments on commit bfa7dd3

Please sign in to comment.