Skip to content

Commit

Permalink
chore: add APP_INSIGHTS_ID to image build (#1266)
Browse files Browse the repository at this point in the history
# Description

* Add APP_INSIGHTS_ID to image build to allow monitoring through
heartbeat during test execution.
* Add test to configuration packages (for both operator and daemonset)
* Make heartbeat interval configurable
* New keys in
`deploy/standard/manifests/controller/helm/retina/values.yaml`

## Related Issue

If this pull request is related to any issue, please mention it here.
Additionally, make sure that the issue is assigned to you before
submitting this pull request.

## Checklist

- [ ] I have read the [contributing
documentation](https://retina.sh/docs/contributing).
- [ ] I signed and signed-off the commits (`git commit -S -s ...`). See
[this
documentation](https://docs.github.com/en/authentication/managing-commit-signature-verification/about-commit-signature-verification)
on signing commits.
- [ ] I have correctly attributed the author(s) of the code.
- [ ] I have tested the changes locally.
- [ ] I have followed the project's style guidelines.
- [ ] I have updated the documentation, if necessary.
- [ ] I have added tests, if applicable.

## Screenshots (if applicable) or Testing Completed

Manual test was also done to validate that the configuration was
propagated from helm chart `values.yaml` file to both operator and
agent. Value set was printed to log:

![image](https://github.com/user-attachments/assets/e2e80968-8874-4b31-a2d5-61eb2d3c39a7)

![image](https://github.com/user-attachments/assets/662b175a-072f-4806-a91b-864bd758b12d)


## Additional Notes

Add any additional notes or context about the pull request here.

---

Please refer to the [CONTRIBUTING.md](../CONTRIBUTING.md) file for more
information on how to contribute to this project.

---------

Signed-off-by: Alex Castilio dos Santos <[email protected]>
  • Loading branch information
alexcastilio authored Jan 31, 2025
1 parent 8543b7b commit 1fbf3b3
Show file tree
Hide file tree
Showing 16 changed files with 160 additions and 11 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/images.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ jobs:
IMAGE_NAMESPACE=${{ github.repository }} \
PLATFORM=${{ matrix.platform }}/${{ matrix.arch }} \
IMAGE_REGISTRY=${{ vars.ACR_NAME }} \
APP_INSIGHTS_ID=${{ secrets.AZURE_APP_INSIGHTS_ID }} \
BUILDX_ACTION=--push
else
make retina-image \
Expand Down Expand Up @@ -102,6 +103,7 @@ jobs:
IMAGE_NAMESPACE=${{ github.repository }} \
PLATFORM=${{ matrix.platform }}/${{ matrix.arch }} \
IMAGE_REGISTRY=${{ vars.ACR_NAME }} \
APP_INSIGHTS_ID=${{ secrets.AZURE_APP_INSIGHTS_ID }} \
WINDOWS_YEARS=${{ matrix.year }} \
BUILDX_ACTION=--push
else
Expand Down Expand Up @@ -153,6 +155,7 @@ jobs:
IMAGE_NAMESPACE=${{ github.repository }} \
PLATFORM=${{ matrix.platform }}/${{ matrix.arch }} \
IMAGE_REGISTRY=${{ vars.ACR_NAME }} \
APP_INSIGHTS_ID=${{ secrets.AZURE_APP_INSIGHTS_ID }} \
BUILDX_ACTION=--push
else
make retina-operator-image \
Expand Down
6 changes: 2 additions & 4 deletions cmd/standard/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import (
"fmt"
"os"
"strings"
"time"

"go.uber.org/zap"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -48,8 +47,7 @@ import (
)

const (
logFileName = "retina.log"
heartbeatInterval = 15 * time.Minute
logFileName = "retina.log"

nodeNameEnvKey = "NODE_NAME"
nodeIPEnvKey = "NODE_IP"
Expand Down Expand Up @@ -309,7 +307,7 @@ func (d *Daemon) Start() error {
defer controllerMgr.Stop(ctx)

// start heartbeat goroutine for application insights
go tel.Heartbeat(ctx, heartbeatInterval)
go tel.Heartbeat(ctx, daemonConfig.TelemetryInterval)

// Start controller manager, which will start http server and plugin manager.
go controllerMgr.Start(ctx)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ data:
enableAnnotations: {{ .Values.enableAnnotations }}
bypassLookupIPOfInterest: {{ .Values.bypassLookupIPOfInterest }}
dataAggregationLevel: {{ .Values.dataAggregationLevel }}
telemetryInterval: {{ .Values.daemonset.telemetryInterval }}
{{- end}}
---
{{- if .Values.os.windows}}
Expand All @@ -48,6 +49,7 @@ data:
enableTelemetry: {{ .Values.enableTelemetry }}
enablePodLevel: {{ .Values.enablePodLevel }}
remoteContext: {{ .Values.remoteContext }}
telemetryInterval: {{ .Values.daemonset.telemetryInterval }}
{{- end}}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,7 @@ data:
captureDebug: {{ .Values.capture.debug }}
captureJobNumLimit: {{ .Values.capture.jobNumLimit }}
enableManagedStorageAccount: {{ .Values.capture.enableManagedStorageAccount }}
telemetryInterval: {{ .Values.operator.telemetryInterval }}
{{- if .Values.capture.enableManagedStorageAccount }}
azureCredentialConfig: /etc/cloud-config/azure.json
{{- end }}
Expand Down
2 changes: 2 additions & 0 deletions deploy/standard/manifests/controller/helm/retina/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ operator:
args:
- "--config"
- "/retina/operator-config.yaml"
telemetryInterval: "5m"

image:
repository: ghcr.io/microsoft/retina/retina-agent
Expand Down Expand Up @@ -87,6 +88,7 @@ daemonset:
metricsBindAddress: ":18080"
ports:
containerPort: 10093
telemetryInterval: "15m"

# volume mounts with name and mountPath
volumeMounts:
Expand Down
5 changes: 1 addition & 4 deletions operator/cmd/standard/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ import (
"net/http"
"net/http/pprof"
"os"
"time"

"go.uber.org/zap/zapcore"

Expand Down Expand Up @@ -55,8 +54,6 @@ var (
MaxFileSizeMB = 100
MaxBackups = 3
MaxAgeDays = 30

HeartbeatFrequency = 5 * time.Minute
)

func init() {
Expand Down Expand Up @@ -255,7 +252,7 @@ func (o *Operator) Start() {
}

// start heartbeat goroutine for application insights
go tel.Heartbeat(ctx, HeartbeatFrequency)
go tel.Heartbeat(ctx, oconfig.TelemetryInterval)
}

func EnablePProf() {
Expand Down
22 changes: 20 additions & 2 deletions operator/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,30 @@ package config

import (
"fmt"
"log"
"time"

"github.com/microsoft/retina/pkg/config"
"github.com/spf13/viper"
)

const MinTelemetryInterval time.Duration = 2 * time.Minute

var (
DefaultTelemetryInterval = 5 * time.Minute
ErrorTelemetryIntervalTooSmall = fmt.Errorf("telemetryInterval smaller than %v is not allowed", MinTelemetryInterval)
)

type OperatorConfig struct {
config.CaptureConfig `mapstructure:",squash"`

InstallCRDs bool `yaml:"installCRDs"`
EnableTelemetry bool `yaml:"enableTelemetry"`
LogLevel string `yaml:"logLevel"`
// EnableRetinaEndpoint indicates whether to enable RetinaEndpoint
EnableRetinaEndpoint bool `yaml:"enableRetinaEndpoint"`
RemoteContext bool `yaml:"remoteContext"`
EnableRetinaEndpoint bool `yaml:"enableRetinaEndpoint"`
RemoteContext bool `yaml:"remoteContext"`
TelemetryInterval time.Duration `yaml:"telemetryInterval"`
}

func GetConfig(cfgFileName string) (*OperatorConfig, error) {
Expand All @@ -35,5 +45,13 @@ func GetConfig(cfgFileName string) (*OperatorConfig, error) {
return nil, fmt.Errorf("error unmarshalling config: %w", err)
}

// If unset, default telemetry interval to 5 minutes.
if cfg.TelemetryInterval == 0 {
log.Printf("telemetryInterval is not set, defaulting to %v", DefaultTelemetryInterval)
cfg.TelemetryInterval = DefaultTelemetryInterval
} else if cfg.TelemetryInterval < MinTelemetryInterval {
return nil, ErrorTelemetryIntervalTooSmall
}

return &cfg, nil
}
43 changes: 43 additions & 0 deletions operator/config/config_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package config_test

import (
"errors"
"testing"
"time"

"github.com/microsoft/retina/operator/config"
)

func TestGetConfig(t *testing.T) {
c, err := config.GetConfig("./testwith/config.yaml")
if err != nil {
t.Errorf("Expected no error, instead got %+v", err)
}

if !c.InstallCRDs ||
!c.EnableTelemetry ||
c.LogLevel != "info" ||
!c.EnableRetinaEndpoint ||
!c.RemoteContext ||
c.TelemetryInterval != 15*time.Minute {
t.Errorf("Expeted config should be same as ./testwith/config.yaml; instead got %+v", c)
}
}

func TestGetConfig_DefaultTelemetryInterval(t *testing.T) {
c, err := config.GetConfig("./testwith/config-without-telemetry-interval.yaml")
if err != nil {
t.Errorf("Expected no error, instead got %+v", err)
}

if c.TelemetryInterval != config.DefaultTelemetryInterval {
t.Errorf("Expected telemetry interval to be %v, instead got %v", config.DefaultTelemetryInterval, c.TelemetryInterval)
}
}

func TestGetConfig_SmallTelemetryInterval(t *testing.T) {
_, err := config.GetConfig("./testwith/config-small-telemetry-interval.yaml")
if !errors.Is(err, config.ErrorTelemetryIntervalTooSmall) {
t.Errorf("Expected error %s, instead got %s", config.ErrorTelemetryIntervalTooSmall, err)
}
}
9 changes: 9 additions & 0 deletions operator/config/testwith/config-small-telemetry-interval.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
apiServer:
host: "0.0.0.0"
port: 10093
installCRDs: true
enableTelemetry: true
logLevel: info
enableRetinaEndpoint: true
remoteContext: true
telemetryInterval: "10s"
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
apiServer:
host: "0.0.0.0"
port: 10093
installCRDs: true
enableTelemetry: true
logLevel: info
enableRetinaEndpoint: true
remoteContext: true
9 changes: 9 additions & 0 deletions operator/config/testwith/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
apiServer:
host: "0.0.0.0"
port: 10093
installCRDs: true
enableTelemetry: true
logLevel: info
enableRetinaEndpoint: true
remoteContext: true
telemetryInterval: "15m"
16 changes: 16 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,18 @@ import (
// Level defines the level of monitor aggregation.
type Level int

const MinTelemetryInterval time.Duration = 2 * time.Minute

const (
Low Level = iota
High
)

var (
ErrorTelemetryIntervalTooSmall = fmt.Errorf("telemetryInterval smaller than %v is not allowed", MinTelemetryInterval)
DefaultTelemetryInterval = 15 * time.Minute
)

func (l *Level) UnmarshalText(text []byte) error {
s := strings.ToLower(string(text))
switch s {
Expand Down Expand Up @@ -67,6 +74,7 @@ type Config struct {
BypassLookupIPOfInterest bool `yaml:"bypassLookupIPOfInterest"`
DataAggregationLevel Level `yaml:"dataAggregationLevel"`
MonitorSockPath string `yaml:"monitorSockPath"`
TelemetryInterval time.Duration `yaml:"telemetryInterval"`
}

func GetConfig(cfgFilename string) (*Config, error) {
Expand Down Expand Up @@ -107,6 +115,14 @@ func GetConfig(cfgFilename string) (*Config, error) {
log.Print("metricsInterval is deprecated, please use metricsIntervalDuration instead")
}

// If unset, default telemetry interval to 15 minutes.
if config.TelemetryInterval == 0 {
log.Printf("telemetryInterval is not set, defaulting to %v", DefaultTelemetryInterval)
config.TelemetryInterval = DefaultTelemetryInterval
} else if config.TelemetryInterval < MinTelemetryInterval {
return nil, ErrorTelemetryIntervalTooSmall
}

return &config, nil
}

Expand Down
21 changes: 20 additions & 1 deletion pkg/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package config

import (
"errors"
"reflect"
"testing"
"time"
Expand All @@ -26,8 +27,26 @@ func TestGetConfig(t *testing.T) {
!c.EnableRetinaEndpoint ||
c.RemoteContext ||
c.EnableAnnotations ||
c.TelemetryInterval != 15*time.Minute ||
c.DataAggregationLevel != Low {
t.Fatalf("Expeted config should be same as ./testwith/config.yaml; instead got %+v", c)
t.Errorf("Expeted config should be same as ./testwith/config.yaml; instead got %+v", c)
}
}

func TestGetConfig_SmallTelemetryInterval(t *testing.T) {
_, err := GetConfig("./testwith/config-small-telemetry-interval.yaml")
if !errors.Is(err, ErrorTelemetryIntervalTooSmall) {
t.Errorf("Expected error %s, instead got %s", ErrorTelemetryIntervalTooSmall, err)
}
}

func TestGetConfig_DefaultTelemetryInterval(t *testing.T) {
c, err := GetConfig("./testwith/config-without-telemetry-interval.yaml")
if err != nil {
t.Errorf("Expected no error, instead got %+v", err)
}
if c.TelemetryInterval != DefaultTelemetryInterval {
t.Errorf("Expected telemetry interval to be %v, instead got %v", DefaultTelemetryInterval, c.TelemetryInterval)
}
}

Expand Down
12 changes: 12 additions & 0 deletions pkg/config/testwith/config-small-telemetry-interval.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
apiServer:
host: "0.0.0.0"
port: 10093
# Supported - debug, info, error, warn, panic, fatal.
logLevel: info
enabledPlugin: ["dropreason", "packetforward", "linuxutil"]
# Interval, in seconds, to scrape/publish metrics.
metricsIntervalDuration: "10s"
# used to export telemetry to AppInsights
telemetryEnabled: true
dataAggregationLevel: "low"
telemetryInterval: "10s"
11 changes: 11 additions & 0 deletions pkg/config/testwith/config-without-telemetry-interval.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
apiServer:
host: "0.0.0.0"
port: 10093
# Supported - debug, info, error, warn, panic, fatal.
logLevel: info
enabledPlugin: ["dropreason", "packetforward", "linuxutil"]
# Interval, in seconds, to scrape/publish metrics.
metricsIntervalDuration: "10s"
# used to export telemetry to AppInsights
telemetryEnabled: true
dataAggregationLevel: "low"
1 change: 1 addition & 0 deletions pkg/config/testwith/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ metricsIntervalDuration: "10s"
# used to export telemetry to AppInsights
telemetryEnabled: true
dataAggregationLevel: "low"
telemetryInterval: "15m"

0 comments on commit 1fbf3b3

Please sign in to comment.