diff --git a/eks-aggregator/Makefile b/eks-aggregator/Makefile
new file mode 100644
index 0000000..0eff92f
--- /dev/null
+++ b/eks-aggregator/Makefile
@@ -0,0 +1,41 @@
+ifndef ACCOUNT_ID
+$(error ACCOUNT_ID is not set)
+endif
+.PHONY: cluster-up
+cluster-up:
+	eksctl create cluster --with-oidc --name vector-demo
+	aws iam create-policy \
+	  --policy-name AWSLoadBalancerControllerIAMPolicy \
+	  --policy-document file://iam_policy.json | jq -r .Policy.Arn
+	eksctl create iamserviceaccount \
+	  --cluster=vector-demo \
+	  --namespace=kube-system \
+	  --name=aws-load-balancer-controller \
+	  --attach-policy-arn=arn:aws:iam::$(ACCOUNT_ID):policy/AWSLoadBalancerControllerIAMPolicy \
+	  --override-existing-serviceaccounts \
+	  --approve
+	helm repo add eks https://aws.github.io/eks-charts && \
+	  helm repo update
+	kubectl apply -k "github.com/aws/eks-charts/stable/aws-load-balancer-controller/crds?ref=master"
+	helm upgrade --install aws-load-balancer-controller eks/aws-load-balancer-controller \
+	  --set clusterName=vector-demo \
+	  --set serviceAccount.create=false \
+	  --set serviceAccount.name=aws-load-balancer-controller \
+	  --namespace kube-system
+	kubectl apply -f "https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml"
+	kubectl create namespace vector
+	kubectl create namespace datadog
+
+.PHONY: cluster-down
+cluster-down:
+	helm uninstall aws-load-balancer-controller \
+	  --namespace kube-system
+	kubectl delete namespace vector
+	kubectl delete namespace datadog
+	eksctl delete iamserviceaccount \
+	  --cluster=vector-demo \
+	  --namespace=kube-system \
+	  --name=aws-load-balancer-controller
+	sleep 15
+	aws iam delete-policy --policy-arn arn:aws:iam::$(ACCOUNT_ID):policy/AWSLoadBalancerControllerIAMPolicy
+	eksctl delete cluster --name=vector-demo
diff --git a/eks-aggregator/README.md b/eks-aggregator/README.md
new file mode 100644
index 0000000..cab4676
--- /dev/null
+++ b/eks-aggregator/README.md
@@ -0,0 +1,77 @@
+# EKS Stateless Aggregator Demo
+
+## Prerequisites
+
+- [helm](https://helm.sh/docs/intro/install/)
+- [kubectl](https://kubernetes.io/docs/tasks/tools/)
+- [eksctl](https://eksctl.io/introduction/#installation) - if creating a new cluster with the Makefile
+
+Your EKS cluster will need the [AWS Load Balancer Controller](https://github.com/kubernetes-sigs/aws-load-balancer-controller) installed; the `make` targets will install it for you, or you
+can follow [Amazon's instructions](https://docs.aws.amazon.com/eks/latest/userguide/aws-load-balancer-controller.html) for your own cluster. Additionally, you'll need a Datadog API key for Vector and the Datadog Agents.
+
+## Getting started
+
+Add the necessary Helm repositories for the Vector and Datadog charts:
+
+```shell
+helm repo add datadog https://helm.datadoghq.com
+helm repo add vector https://helm.vector.dev
+helm repo update
+```
+
+If you need to provision an EKS cluster with the AWS Load Balancer Controller, you can use the included Makefile by running:
+
+```shell
+ACCOUNT_ID=<YOUR_ACCOUNT_ID> make cluster-up
+```
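+
+If you don't have your AWS account ID handy, one way to supply it (a sketch, assuming the AWS CLI is already configured for the target account) is to look it up with `aws sts get-caller-identity`:
+
+```shell
+# Look up the current account ID and pass it to the cluster-up target
+ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) make cluster-up
+```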
+
+The following command will install Vector as an Aggregator, using an Application Load Balancer to route requests from the Datadog Agents.
+Vector is configured to process Datadog Agent logs in a similar fashion to Datadog's [Pipelines](https://docs.datadoghq.com/logs/log_configuration/pipelines/)
+feature, allowing you to move your log processing onto your own hardware.
+
+```shell
+helm upgrade --install vector vector/vector --devel \
+  --namespace vector --values helm/vector.yaml \
+  --set secrets.generic.DATADOG_API_KEY=<DATADOG_API_KEY> \
+  --set ingress.hosts[0].host=DUMMY_VAL
+```
+
+Once your ALB is provisioned, you can run the following command to extract its generated hostname to replace the DUMMY_VAL above.
+
+```shell
+export ALB_HOSTNAME=$(kubectl --namespace vector get ingress vector \
+  --output go-template='{{(index .status.loadBalancer.ingress 0 ).hostname}}')
+```
+
+The following command will upgrade your `vector` release with the created ALB hostname.
+
+```shell
+helm upgrade --install vector vector/vector --devel \
+  --namespace vector --values helm/vector.yaml \
+  --set secrets.generic.DATADOG_API_KEY=<DATADOG_API_KEY> \
+  --set ingress.hosts[0].host=${ALB_HOSTNAME}
+```
+
+Then install your Datadog Agents, substituting the hostname from the previous step.
+
+```shell
+helm upgrade --install datadog datadog/datadog \
+  --namespace datadog --values helm/datadog.yaml \
+  --set datadog.apiKey=<DATADOG_API_KEY> \
+  --set agents.customAgentConfig.logs_config.logs_dd_url="${ALB_HOSTNAME}:8080"
+```
+
+Once all the pods have started, you should begin to see logs being ingested into your [Datadog account](https://app.datadoghq.com/logs), aggregated and parsed by Vector.
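+
+As a quick sanity check, you can confirm that the Agent and aggregator pods are running and that Vector is receiving traffic. This is a sketch that assumes the namespaces and release names used above; the aggregator's workload name may differ in your cluster.
+
+```shell
+# Check that the Datadog Agent and Vector aggregator pods are Running
+kubectl --namespace datadog get pods
+kubectl --namespace vector get pods
+
+# Tail the aggregator's own logs to confirm requests are arriving from the Agents
+kubectl --namespace vector logs deployment/vector --follow
+```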
+
+## Cleaning up
+
+The _cluster-down_ target will delete the Namespaces and Cluster created during this demo.
+
+```shell
+make cluster-down
+```
+
+## Notes
+
+- A nightly image is currently used to leverage our rewritten `datadog_logs` sink
+- The `--devel` option is used to access our currently _pre-release_ [`vector`](https://github.com/vectordotdev/helm-charts/blob/develop/charts/vector/README.md) chart
diff --git a/eks-aggregator/helm/datadog.yaml b/eks-aggregator/helm/datadog.yaml
new file mode 100644
index 0000000..3c5d1ee
--- /dev/null
+++ b/eks-aggregator/helm/datadog.yaml
@@ -0,0 +1,20 @@
+datadog:
+  logLevel: debug
+  clusterName: vector-demo
+  logs:
+    containerCollectAll: true
+    enabled: true
+  orchestratorExplorer:
+    enabled: true
+  processAgent:
+    processCollection: true
+  prometheusScrape:
+    enabled: true
+agents:
+  useConfigMap: true
+  customAgentConfig:
+    logs_config:
+      logs_dd_url: "vector.mycompany.tld:8080"
+      logs_no_ssl: true
+      use_http: true
+      use_v2_api: false
diff --git a/eks-aggregator/helm/vector.yaml b/eks-aggregator/helm/vector.yaml
new file mode 100644
index 0000000..b35a549
--- /dev/null
+++ b/eks-aggregator/helm/vector.yaml
@@ -0,0 +1,103 @@
+role: Stateless-Aggregator
+
+image:
+  tag: nightly-2021-09-16-distroless-libc
+
+autoscaling:
+  enabled: true
+  minReplicas: 2
+  maxReplicas: 5
+
+resources:
+  requests:
+    cpu: 200m
+    memory: 256Mi
+  limits:
+    cpu: 200m
+    memory: 256Mi
+
+env:
+  - name: DATADOG_API_KEY
+    valueFrom:
+      secretKeyRef:
+        name: vector
+        key: DATADOG_API_KEY
+
+podAnnotations:
+  prometheus.io/scrape: "true"
+  prometheus.io/port: "9090"
+
+customConfig:
+  api:
+    enabled: true
+    address: 0.0.0.0:8686
+  sources:
+    datadog_agents:
+      type: datadog_agent
+      acknowledgements: true
+      address: 0.0.0.0:8080
+    internal_metrics:
+      type: internal_metrics
+  transforms:
+    remap:
+      type: remap
+      inputs:
+        - datadog_agents
+      drop_on_abort: true
+      source: |
+        # Parse the received .ddtags field so we can more easily access the contained tags
+        .ddtags = parse_key_value!(.ddtags, key_value_delimiter: ":", field_delimiter: ",")
+        .ddtags.sender = "vector"
+        .ddtags.vector_aggregator = get_hostname!()
+
+        if .service == "agent" {
+          parsed, err =
+            parse_grok(.message, s'(?<timestamp>%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{TIME}) UTC \| %{NOTSPACE:agent} \| %{LOGLEVEL:status} \| \(%{NOTSPACE:filename}:%{NUMBER:lineno} in %{WORD:process}\) \|( %{NOTSPACE:kv} \|)?( - \|)?( \(%{NOTSPACE:pyFilename}:%{NUMBER:pyLineno}\) \|)?%{GREEDYDATA}', remove_empty: true) ??
+            parse_grok(.message, s'(?<timestamp>%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{TIME}) UTC \| %{LOGLEVEL:status} \| \(%{NOTSPACE:filename}:%{NUMBER:lineno} in %{WORD:process}\)%{GREEDYDATA}') ??
+            parse_grok(.message, s'(?<timestamp>%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{TIME}) UTC \| %{NOTSPACE:agent} \| %{LOGLEVEL:status}\s+\| %{WORD:class} \| %{GREEDYDATA}')
+          if err != null { log("Failed to parse agent log: " + string!(.message), level: "error"); abort }
+
+          parsed |= parse_key_value(del(parsed.kv), key_value_delimiter: ":", field_delimiter: ",") ?? {}
+
+          ts = parse_timestamp!(parsed.timestamp, format: "%F %T")
+          parsed.timestamp = to_unix_timestamp(ts, unit: "milliseconds")
+          parsed.lineno = to_int!(parsed.lineno)
+          if exists(parsed.pyLineno) { parsed.pyLineno = to_int!(parsed.pyLineno) }
+
+          . = merge(., parsed)
+        }
+
+        # Re-encode Datadog tags as a string for the `datadog_logs` sink
+        .ddtags = encode_key_value!(.ddtags, key_value_delimiter: ":", field_delimiter: ",")
+  sinks:
+    to_datadog:
+      type: datadog_logs
+      inputs:
+        - remap
+      default_api_key: "${DATADOG_API_KEY}"
+      batch:
+        timeout_secs: 5
+      encoding:
+        codec: json
+    prom_exporter:
+      type: prometheus_exporter
+      inputs:
+        - internal_metrics
+      address: 0.0.0.0:9090
+
+ingress:
+  enabled: true
+  annotations:
+    alb.ingress.kubernetes.io/scheme: internal
+    alb.ingress.kubernetes.io/healthcheck-port: '8686'
+    alb.ingress.kubernetes.io/healthcheck-path: /health
+    alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 8080}]'
+    alb.ingress.kubernetes.io/target-type: ip
+    kubernetes.io/ingress.class: alb
+  hosts:
+    - host: vector.mycompany.tld
+      paths:
+        - path: /
+          pathType: Prefix
+          port:
+            name: datadog-agents
diff --git a/eks-aggregator/iam_policy.json b/eks-aggregator/iam_policy.json
new file mode 100644
index 0000000..c11ff94
--- /dev/null
+++ b/eks-aggregator/iam_policy.json
@@ -0,0 +1,207 @@
+{
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Effect": "Allow",
+            "Action": [
+                "iam:CreateServiceLinkedRole",
+                "ec2:DescribeAccountAttributes",
+                "ec2:DescribeAddresses",
+                "ec2:DescribeAvailabilityZones",
+                "ec2:DescribeInternetGateways",
+                "ec2:DescribeVpcs",
+                "ec2:DescribeSubnets",
+                "ec2:DescribeSecurityGroups",
+                "ec2:DescribeInstances",
+                "ec2:DescribeNetworkInterfaces",
+                "ec2:DescribeTags",
+                "ec2:GetCoipPoolUsage",
+                "ec2:DescribeCoipPools",
+                "elasticloadbalancing:DescribeLoadBalancers",
+                "elasticloadbalancing:DescribeLoadBalancerAttributes",
+                "elasticloadbalancing:DescribeListeners",
+                "elasticloadbalancing:DescribeListenerCertificates",
+                "elasticloadbalancing:DescribeSSLPolicies",
+                "elasticloadbalancing:DescribeRules",
+                "elasticloadbalancing:DescribeTargetGroups",
+                "elasticloadbalancing:DescribeTargetGroupAttributes",
+                "elasticloadbalancing:DescribeTargetHealth",
+                "elasticloadbalancing:DescribeTags"
+            ],
+            "Resource": "*"
+        },
+        {
+            "Effect": "Allow",
+            "Action": [
+                "cognito-idp:DescribeUserPoolClient",
+                "acm:ListCertificates",
+                "acm:DescribeCertificate",
+                "iam:ListServerCertificates",
+                "iam:GetServerCertificate",
+                "waf-regional:GetWebACL",
+                "waf-regional:GetWebACLForResource",
+                "waf-regional:AssociateWebACL",
+                "waf-regional:DisassociateWebACL",
+                "wafv2:GetWebACL",
+                "wafv2:GetWebACLForResource",
+                "wafv2:AssociateWebACL",
+                "wafv2:DisassociateWebACL",
+                "shield:GetSubscriptionState",
"shield:DescribeProtection", + "shield:CreateProtection", + "shield:DeleteProtection" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "ec2:AuthorizeSecurityGroupIngress", + "ec2:RevokeSecurityGroupIngress" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "ec2:CreateSecurityGroup" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "ec2:CreateTags" + ], + "Resource": "arn:aws:ec2:*:*:security-group/*", + "Condition": { + "StringEquals": { + "ec2:CreateAction": "CreateSecurityGroup" + }, + "Null": { + "aws:RequestTag/elbv2.k8s.aws/cluster": "false" + } + } + }, + { + "Effect": "Allow", + "Action": [ + "ec2:CreateTags", + "ec2:DeleteTags" + ], + "Resource": "arn:aws:ec2:*:*:security-group/*", + "Condition": { + "Null": { + "aws:RequestTag/elbv2.k8s.aws/cluster": "true", + "aws:ResourceTag/elbv2.k8s.aws/cluster": "false" + } + } + }, + { + "Effect": "Allow", + "Action": [ + "ec2:AuthorizeSecurityGroupIngress", + "ec2:RevokeSecurityGroupIngress", + "ec2:DeleteSecurityGroup" + ], + "Resource": "*", + "Condition": { + "Null": { + "aws:ResourceTag/elbv2.k8s.aws/cluster": "false" + } + } + }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:CreateLoadBalancer", + "elasticloadbalancing:CreateTargetGroup" + ], + "Resource": "*", + "Condition": { + "Null": { + "aws:RequestTag/elbv2.k8s.aws/cluster": "false" + } + } + }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:CreateListener", + "elasticloadbalancing:DeleteListener", + "elasticloadbalancing:CreateRule", + "elasticloadbalancing:DeleteRule" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:AddTags", + "elasticloadbalancing:RemoveTags" + ], + "Resource": [ + "arn:aws:elasticloadbalancing:*:*:targetgroup/*/*", + "arn:aws:elasticloadbalancing:*:*:loadbalancer/net/*/*", + "arn:aws:elasticloadbalancing:*:*:loadbalancer/app/*/*" + ], + "Condition": { + "Null": { + "aws:RequestTag/elbv2.k8s.aws/cluster": "true", + "aws:ResourceTag/elbv2.k8s.aws/cluster": "false" + } + } + }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:AddTags", + "elasticloadbalancing:RemoveTags" + ], + "Resource": [ + "arn:aws:elasticloadbalancing:*:*:listener/net/*/*/*", + "arn:aws:elasticloadbalancing:*:*:listener/app/*/*/*", + "arn:aws:elasticloadbalancing:*:*:listener-rule/net/*/*/*", + "arn:aws:elasticloadbalancing:*:*:listener-rule/app/*/*/*" + ] + }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:ModifyLoadBalancerAttributes", + "elasticloadbalancing:SetIpAddressType", + "elasticloadbalancing:SetSecurityGroups", + "elasticloadbalancing:SetSubnets", + "elasticloadbalancing:DeleteLoadBalancer", + "elasticloadbalancing:ModifyTargetGroup", + "elasticloadbalancing:ModifyTargetGroupAttributes", + "elasticloadbalancing:DeleteTargetGroup" + ], + "Resource": "*", + "Condition": { + "Null": { + "aws:ResourceTag/elbv2.k8s.aws/cluster": "false" + } + } + }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:RegisterTargets", + "elasticloadbalancing:DeregisterTargets" + ], + "Resource": "arn:aws:elasticloadbalancing:*:*:targetgroup/*/*" + }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:SetWebAcl", + "elasticloadbalancing:ModifyListener", + "elasticloadbalancing:AddListenerCertificates", + "elasticloadbalancing:RemoveListenerCertificates", + "elasticloadbalancing:ModifyRule" + ], + "Resource": "*" + } + ] +}