feat: rename DisableTraceLocality to TraceLocalityMode #1450

Merged
merged 5 commits on Dec 2, 2024
Changes from all commits
2 changes: 1 addition & 1 deletion app/app_test.go
@@ -766,7 +766,7 @@ func TestPeerRouting_TraceLocalityDisabled(t *testing.T) {
 		redisDB := 12 + i
 		cfg := defaultConfig(basePort, redisDB)
 		collectionCfg := cfg.GetCollectionConfig()
-		collectionCfg.DisableTraceLocality = true
+		collectionCfg.TraceLocalityMode = "distributed"
 		cfg.GetCollectionConfigVal = collectionCfg

 		apps[i], graph = newStartedApp(t, senders[i], nil, peers, cfg)
40 changes: 20 additions & 20 deletions collect/collect.go
@@ -210,7 +210,7 @@ func (i *InMemCollector) Start() error {
 		i.Peers.RegisterUpdatedPeersCallback(i.redistributeTimer.Reset)
 	}

-	if i.Config.GetCollectionConfig().DisableTraceLocality {
+	if !i.Config.GetCollectionConfig().TraceLocalityEnabled() {
 		i.keptDecisionMessages = make(chan string, decisionMessageBufferSize)
 		i.dropDecisionMessages = make(chan string, decisionMessageBufferSize)
 		i.PubSub.Subscribe(context.Background(), keptTraceDecisionTopic, i.signalKeptTraceDecisions)
@@ -524,7 +524,7 @@ func (i *InMemCollector) redistributeTraces(ctx context.Context) {

 	for _, sp := range trace.GetSpans() {

-		if i.Config.GetCollectionConfig().DisableTraceLocality {
+		if !i.Config.GetCollectionConfig().TraceLocalityEnabled() {
 			dc := i.createDecisionSpan(sp, trace, newTarget)
 			i.PeerTransmission.EnqueueEvent(dc)
 			continue
@@ -669,21 +669,21 @@ func (i *InMemCollector) processSpan(ctx context.Context, sp *types.Span, source
 		isMyTrace bool
 	)
 	// if trace locality is enabled, we should forward all spans to its correct peer
-	if i.Config.GetCollectionConfig().DisableTraceLocality {
-		targetShard, isMyTrace = i.IsMyTrace(sp.TraceID)
-		// if the span is a decision span and the trace no longer belong to us, we should not forward it to the peer
-		if !isMyTrace && sp.IsDecisionSpan() {
-			return
-		}
-
-	} else {
+	if i.Config.GetCollectionConfig().TraceLocalityEnabled() {
 		targetShard = i.Sharder.WhichShard(sp.TraceID)
 		isMyTrace = true
 		if !targetShard.Equals(i.Sharder.MyShard()) {
 			sp.APIHost = targetShard.GetAddress()
 			i.PeerTransmission.EnqueueSpan(sp)
 			return
 		}
+	} else {
+		targetShard, isMyTrace = i.IsMyTrace(sp.TraceID)
+		// if the span is a decision span and the trace no longer belongs to us, we should not forward it to the peer
+		if !isMyTrace && sp.IsDecisionSpan() {
+			return
+		}
+
 	}

 	tcfg := i.Config.GetTracesConfig()
@@ -1069,7 +1069,7 @@ func (i *InMemCollector) Stop() error {
 	close(i.fromPeer)
 	close(i.outgoingTraces)

-	if i.Config.GetCollectionConfig().DisableTraceLocality {
+	if !i.Config.GetCollectionConfig().TraceLocalityEnabled() {
 		close(i.dropDecisionBuffer)
 		close(i.keptDecisionBuffer)
 	}
@@ -1527,23 +1527,23 @@ func (i *InMemCollector) makeDecision(ctx context.Context, trace *types.Trace, s
 		HasRoot: hasRoot,
 	}

-	if i.Config.GetCollectionConfig().DisableTraceLocality {
+	if !i.Config.GetCollectionConfig().TraceLocalityEnabled() {
 		i.publishTraceDecision(ctx, td)
 	}

 	return &td, nil
 }

 func (i *InMemCollector) IsMyTrace(traceID string) (sharder.Shard, bool) {
+	if i.Config.GetCollectionConfig().TraceLocalityEnabled() {
+		return i.Sharder.MyShard(), true
+	}
+
 	// if trace locality is disabled, we should only process
 	// traces that belong to the current refinery
-	if i.Config.GetCollectionConfig().DisableTraceLocality {
-		targeShard := i.Sharder.WhichShard(traceID)
-
-		return targeShard, i.Sharder.MyShard().Equals(targeShard)
-	}
+	targetShard := i.Sharder.WhichShard(traceID)

-	return i.Sharder.MyShard(), true
+	return targetShard, i.Sharder.MyShard().Equals(targetShard)

 }
@@ -1578,7 +1578,7 @@ func (i *InMemCollector) publishTraceDecision(ctx context.Context, td TraceDecis
 }

 func (i *InMemCollector) sendKeptDecisions() {
-	if !i.Config.GetCollectionConfig().DisableTraceLocality {
+	if i.Config.GetCollectionConfig().TraceLocalityEnabled() {
 		return
 	}
 	interval := time.Duration(i.Config.GetCollectionConfig().KeptDecisionSendInterval)
@@ -1589,7 +1589,7 @@
 }

 func (i *InMemCollector) sendDropDecisions() {
-	if !i.Config.GetCollectionConfig().DisableTraceLocality {
+	if i.Config.GetCollectionConfig().TraceLocalityEnabled() {
 		return
 	}
 	interval := time.Duration(i.Config.GetCollectionConfig().DropDecisionSendInterval)
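Taken together, the collect.go changes invert the branching: the "concentrated" (trace locality) path now comes first in processSpan, and the distributed path reuses IsMyTrace. The following is a standalone sketch of the resulting routing rules; Shard, Span, and whichShard are simplified stand-ins for illustration, not Refinery's actual types.

package main

import "fmt"

// Simplified stand-ins for Refinery's sharder and span types (illustration only).
type Shard string

type Span struct {
	TraceID    string
	IsDecision bool // true for lightweight decision spans
}

// whichShard is a toy deterministic hash: route by the first byte of the trace ID.
func whichShard(traceID string, shards []Shard) Shard {
	return shards[int(traceID[0])%len(shards)]
}

// route mirrors the rewritten processSpan branch. In concentrated mode, every
// span of a trace is forwarded to the owning peer; in distributed mode, spans
// stay on the node that received them, and stray decision spans for traces
// this node no longer owns are dropped.
func route(concentrated bool, me Shard, shards []Shard, sp Span) string {
	target := whichShard(sp.TraceID, shards)
	if concentrated {
		if target != me {
			return "forward to " + string(target)
		}
		return "process locally"
	}
	if target != me && sp.IsDecision {
		return "drop decision span"
	}
	return "process locally"
}

func main() {
	shards := []Shard{"peer-a", "peer-b"}
	fmt.Println(route(true, "peer-a", shards, Span{TraceID: "c1"}))  // forward to peer-b
	fmt.Println(route(false, "peer-a", shards, Span{TraceID: "c1"})) // process locally
}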
14 changes: 7 additions & 7 deletions collect/collect_test.go
@@ -98,7 +98,7 @@ func newTestCollector(conf config.Config, transmission transmit.Transmission, pe
 		redistributeTimer: redistributeNotifier,
 	}

-	if conf.GetCollectionConfig().DisableTraceLocality {
+	if !conf.GetCollectionConfig().TraceLocalityEnabled() {
 		localPubSub.Subscribe(context.Background(), keptTraceDecisionTopic, c.signalKeptTraceDecisions)
 	}
@@ -196,7 +196,7 @@ func TestAddRootSpan(t *testing.T) {

 	conf.Mux.Lock()
 	collectionCfg := conf.GetCollectionConfigVal
-	collectionCfg.DisableTraceLocality = true
+	collectionCfg.TraceLocalityMode = "distributed"
 	conf.GetCollectionConfigVal = collectionCfg
 	conf.Mux.Unlock()
@@ -527,8 +527,8 @@ func TestDryRunMode(t *testing.T) {
 		DryRun:             true,
 		ParentIdFieldNames: []string{"trace.parent_id", "parentId"},
 		GetCollectionConfigVal: config.CollectionConfig{
-			ShutdownDelay:        config.Duration(1 * time.Millisecond),
-			DisableTraceLocality: true,
+			ShutdownDelay:     config.Duration(1 * time.Millisecond),
+			TraceLocalityMode: "distributed",
 		},
 	}
 	transmission := &transmit.MockTransmission{}
@@ -2233,8 +2233,8 @@ func TestSendDropDecisions(t *testing.T) {
 		GetSamplerTypeVal:  &config.DeterministicSamplerConfig{SampleRate: 1},
 		ParentIdFieldNames: []string{"trace.parent_id", "parentId"},
 		GetCollectionConfigVal: config.CollectionConfig{
-			ShutdownDelay:        config.Duration(1 * time.Millisecond),
-			DisableTraceLocality: true,
+			ShutdownDelay:     config.Duration(1 * time.Millisecond),
+			TraceLocalityMode: "distributed",
 		},
 	}
 	transmission := &transmit.MockTransmission{}
@@ -2318,7 +2318,7 @@ func TestExpiredTracesCleanup(t *testing.T) {
 			MaxBatchSize: 1500,
 		},
 		GetCollectionConfigVal: config.CollectionConfig{
-			DisableTraceLocality: true,
+			TraceLocalityMode: "distributed",
 		},
 		GetSamplerTypeVal:  &config.DeterministicSamplerConfig{SampleRate: 1},
 		AddSpanCountToRoot: true,
2 changes: 1 addition & 1 deletion collect/stressRelief.go
@@ -428,7 +428,7 @@ func (s *StressRelief) Recalc() uint {
 	// If a single node is under significant stress, it can activate stress relief mode
 	overallStressLevel := uint(math.Max(float64(clusterStressLevel), float64(localLevel)))

-	if s.Config.GetCollectionConfig().DisableTraceLocality {
+	if !s.Config.GetCollectionConfig().TraceLocalityEnabled() {
 		overallStressLevel = clusterStressLevel
 	}
 	s.overallStressLevel = overallStressLevel
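A condensed sketch of the Recalc change above (illustrative numbers only): in distributed mode, only the cluster-wide stress level counts, so a spike on a single node no longer activates stress relief by itself.

package main

import (
	"fmt"
	"math"
)

// overallStress mirrors the branch in Recalc: in concentrated mode a single
// heavily stressed node can trip stress relief; in distributed mode the
// cluster-wide level is used instead.
func overallStress(traceLocalityEnabled bool, clusterLevel, localLevel uint) uint {
	overall := uint(math.Max(float64(clusterLevel), float64(localLevel)))
	if !traceLocalityEnabled {
		overall = clusterLevel
	}
	return overall
}

func main() {
	fmt.Println(overallStress(true, 20, 90))  // 90: the hot node dominates
	fmt.Println(overallStress(false, 20, 90)) // 20: local spikes are ignored
}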
2 changes: 1 addition & 1 deletion collect/stress_relief_test.go
@@ -158,7 +158,7 @@ func TestStressRelief_OverallStressLevel_DisableTraceLocality(t *testing.T) {
 			MinimumActivationDuration: config.Duration(5 * time.Second),
 		},
 		GetCollectionConfigVal: config.CollectionConfig{
-			DisableTraceLocality: true,
+			TraceLocalityMode: "distributed",
 		},
 	}
 	// On startup, the stress relief should not be active
22 changes: 12 additions & 10 deletions config.md
@@ -1,7 +1,7 @@
 # Honeycomb Refinery Configuration Documentation

 This is the documentation for the configuration file for Honeycomb's Refinery.
-It was automatically generated on 2024-11-26 at 18:43:37 UTC.
+It was automatically generated on 2024-12-02 at 16:52:55 UTC.

 ## The Config file

@@ -402,7 +402,7 @@ This is useful for evaluating sampling rules.
 When DryRun is enabled, traces are decorated with `meta.refinery.dryrun.kept` that is set to `true` or `false`, based on whether the trace would be kept or dropped.
 In addition, `SampleRate` will be set to the incoming rate for all traces, and the field `meta.refinery.dryrun.sample_rate` will be set to the sample rate that would have been used.
-NOTE: This setting is not compatible with `DisableTraceLocality=true`, because drop trace decisions shared among peers do not contain all the relevant information needed to send traces to Honeycomb.
+NOTE: This setting is not compatible with `TraceLocalityMode=distributed`, because drop trace decisions shared among peers do not contain all the relevant information needed to send traces to Honeycomb.

 - Eligible for live reload.
 - Type: `bool`
@@ -1021,21 +1021,23 @@ This value should be set to a bit less than the normal timeout period for shutti
 - Type: `duration`
 - Default: `15s`

-### `DisableTraceLocality`
+### `TraceLocalityMode`

-DisableTraceLocality controls whether all spans that belongs to the same trace are sent to a single Refinery for processing.
+TraceLocalityMode controls how Refinery handles spans that belong to the same trace in a clustered environment.

-When `false`, Refinery will route all spans that belong to the same trace to a single peer.
+When `concentrated`, Refinery will route all spans that belong to the same trace to a single peer.
 This is the default behavior ("Trace Locality") and the way Refinery has worked in the past.
-When `true`, Refinery will instead keep spans on the node where they were received, and forward proxy spans that contain only the key information needed to make a trace decision.
-This can reduce the amount of traffic between peers in most cases, and can help avoid a situation where a single large trace can cause a memory overrun on a single node.
-If `true`, the amount of traffic between peers will be reduced, but the amount of traffic between Refinery and Redis will significantly increase, because Refinery uses Redis to distribute the trace decisions to all nodes in the cluster.
+When `distributed`, Refinery will instead keep spans on the node where they were received, and forward proxy spans that contain only the key information needed to make a trace decision.
+This can reduce the amount of traffic between peers, and can help avoid a situation where a single large trace can cause a memory overrun on a single node.
+If `distributed`, the amount of traffic between peers will be reduced, but the amount of traffic between Refinery and Redis will significantly increase, because Refinery uses Redis to distribute the trace decisions to all nodes in the cluster.
 It is important to adjust the size of the Redis cluster in this case.
-NOTE: This setting is not compatible with `DryRun` when set to true.
+The total volume of network traffic in `distributed` mode should be expected to decrease unless the cluster size is very large (hundreds of nodes).
+NOTE: This setting is not compatible with `DryRun` when set to `distributed`.
 See `DryRun` for more information.

 - Not eligible for live reload.
-- Type: `bool`
+- Type: `string`
+- Default: `concentrated`

 ### `HealthCheckTimeout`

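As a usage sketch (not part of this diff), opting into the new behavior is a one-line change in the Collection section of the config file:

Collection:
  TraceLocalityMode: distributed # default is "concentrated"

Per the note above, `DryRun` must stay disabled in this mode, and the Redis cluster should be sized for the extra decision traffic.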
17 changes: 15 additions & 2 deletions config/file_config.go
@@ -317,8 +317,8 @@ type CollectionConfig struct {
 	DisableRedistribution bool     `yaml:"DisableRedistribution"`
 	RedistributionDelay   Duration `yaml:"RedistributionDelay" default:"30s"`

-	ShutdownDelay        Duration `yaml:"ShutdownDelay" default:"15s"`
-	DisableTraceLocality bool     `yaml:"DisableTraceLocality"`
+	ShutdownDelay     Duration `yaml:"ShutdownDelay" default:"15s"`
+	TraceLocalityMode string   `yaml:"TraceLocalityMode" default:"concentrated"`

 	MaxDropDecisionBatchSize int      `yaml:"MaxDropDecisionBatchSize" default:"1000"`
 	DropDecisionSendInterval Duration `yaml:"DropDecisionSendInterval" default:"1s"`
@@ -355,6 +355,19 @@ func (c CollectionConfig) GetIncomingQueueSize() int {
 	return c.IncomingQueueSize
 }

+// TraceLocalityEnabled returns whether trace locality is enabled.
+func (c CollectionConfig) TraceLocalityEnabled() bool {
+	switch c.TraceLocalityMode {
+	case "concentrated":
+		return true
+	case "distributed":
+		return false
+	default:
+		// Default to true for backwards compatibility
+		return true
+	}
+}
+
 type BufferSizeConfig struct {
 	UpstreamBufferSize int `yaml:"UpstreamBufferSize" default:"10_000"`
 	PeerBufferSize     int `yaml:"PeerBufferSize" default:"100_000"`
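A minimal sketch (not part of the PR) showing how the new helper maps the string setting back onto the old boolean semantics, including the backwards-compatible default for configs that never set the key:

package main

import "fmt"

// CollectionConfig is reduced here to the single field under discussion.
type CollectionConfig struct {
	TraceLocalityMode string
}

// TraceLocalityEnabled mirrors the helper added in the diff above.
func (c CollectionConfig) TraceLocalityEnabled() bool {
	switch c.TraceLocalityMode {
	case "concentrated":
		return true
	case "distributed":
		return false
	default:
		// Unknown or unset values fall back to the historical behavior.
		return true
	}
}

func main() {
	fmt.Println(CollectionConfig{}.TraceLocalityEnabled())                                 // true: unset behaves like "concentrated"
	fmt.Println(CollectionConfig{TraceLocalityMode: "distributed"}.TraceLocalityEnabled()) // false
}

This inversion is why so many call sites change from `if ...DisableTraceLocality` to `if !...TraceLocalityEnabled()`.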
24 changes: 12 additions & 12 deletions config/metadata/configMeta.yaml
@@ -489,7 +489,7 @@ groups:
       `meta.refinery.dryrun.sample_rate` will be set to the sample rate
       that would have been used.

-      NOTE: This setting is not compatible with `DisableTraceLocality=true`,
+      NOTE: This setting is not compatible with `TraceLocalityMode=distributed`,
       because drop trace decisions shared among peers do not contain all
       the relevant information needed to send traces to Honeycomb.
@@ -1340,26 +1340,26 @@ groups:
       This value should be set to a bit less than the normal timeout period
       for shutting down without forcibly terminating the process.

-  - name: DisableTraceLocality
-    type: bool
+  - name: TraceLocalityMode
+    type: string
     valuetype: nondefault
     firstversion: v2.9
-    default: false
+    default: "concentrated"
     reload: false
-    summary: controls whether all spans that belongs to the same trace are sent to a single Refinery for processing.
+    summary: controls how Refinery handles spans that belong to the same trace in a clustered environment.
     description: >
-      When `false`, Refinery will route all spans that belong to the same trace to a single peer. This is the
-      default behavior ("Trace Locality") and the way Refinery has worked in the past. When `true`, Refinery
+      When `concentrated`, Refinery will route all spans that belong to the same trace to a single peer. This is the
+      default behavior ("Trace Locality") and the way Refinery has worked in the past. When `distributed`, Refinery
       will instead keep spans on the node where they were received, and forward proxy spans that contain only
-      the key information needed to make a trace decision. This can reduce the amount of traffic between peers
-      in most cases, and can help avoid a situation where a single large trace can cause a memory overrun on
-      a single node.
+      the key information needed to make a trace decision. This can reduce the amount of traffic between peers,
+      and can help avoid a situation where a single large trace can cause a memory overrun on a single node.

-      If `true`, the amount of traffic between peers will be reduced, but the amount of traffic between Refinery
+      If `distributed`, the amount of traffic between peers will be reduced, but the amount of traffic between Refinery
       and Redis will significantly increase, because Refinery uses Redis to distribute the trace decisions to all
       nodes in the cluster. It is important to adjust the size of the Redis cluster in this case.

-      NOTE: This setting is not compatible with `DryRun` when set to true. See `DryRun` for more information.
+      The total volume of network traffic in `distributed` mode should be expected to decrease unless the cluster size is very large (hundreds of nodes).
+      NOTE: This setting is not compatible with `DryRun` when set to `distributed`. See `DryRun` for more information.

   - name: HealthCheckTimeout
     type: duration
49 changes: 26 additions & 23 deletions config_complete.yaml
@@ -2,7 +2,7 @@
 ## Honeycomb Refinery Configuration ##
 ######################################
 #
-# created on 2024-11-22 at 17:59:57 UTC from ../../config.yaml using a template generated on 2024-11-22 at 17:59:50 UTC
+# created on 2024-12-02 at 16:40:51 UTC from ../../config.yaml using a template generated on 2024-12-02 at 16:40:45 UTC

 # This file contains a configuration for the Honeycomb Refinery. It is in YAML
 # format, organized into named groups, each of which contains a set of
@@ -427,7 +427,7 @@ Debugging:
     ## to the incoming rate for all traces, and the field
     ## `meta.refinery.dryrun.sample_rate` will be set to the sample rate that
     ## would have been used.
-    ## NOTE: This setting is not compatible with `DisableTraceLocality=true`,
+    ## NOTE: This setting is not compatible with `TraceLocalityMode=distributed`,
     ## because drop trace decisions shared among peers do not contain all the
     ## relevant information needed to send traces to Honeycomb.
     ##
@@ -1083,27 +1083,30 @@ Collection:
     ## Eligible for live reload.
     # ShutdownDelay: 15s

-    ## DisableTraceLocality controls whether all spans that belongs to the
-    ## same trace are sent to a single Refinery for processing.
-    ##
-    ## When `false`, Refinery will route all spans that belong to the same
-    ## trace to a single peer. This is the default behavior ("Trace
-    ## Locality") and the way Refinery has worked in the past. When `true`,
-    ## Refinery will instead keep spans on the node where they were received,
-    ## and forward proxy spans that contain only the key information needed
-    ## to make a trace decision. This can reduce the amount of traffic
-    ## between peers in most cases, and can help avoid a situation where a
-    ## single large trace can cause a memory overrun on a single node.
-    ## If `true`, the amount of traffic between peers will be reduced, but
-    ## the amount of traffic between Refinery and Redis will significantly
-    ## increase, because Refinery uses Redis to distribute the trace
-    ## decisions to all nodes in the cluster. It is important to adjust the
-    ## size of the Redis cluster in this case.
-    ## NOTE: This setting is not compatible with `DryRun` when set to true.
-    ## See `DryRun` for more information.
-    ##
-    ## Not eligible for live reload.
-    # DisableTraceLocality: false
+    ## TraceLocalityMode controls how Refinery handles spans that belong to
+    ## the same trace in a clustered environment.
+    ##
+    ## When `concentrated`, Refinery will route all spans that belong to the
+    ## same trace to a single peer. This is the default behavior ("Trace
+    ## Locality") and the way Refinery has worked in the past. When
+    ## `distributed`, Refinery will instead keep spans on the node where they
+    ## were received, and forward proxy spans that contain only the key
+    ## information needed to make a trace decision. This can reduce the
+    ## amount of traffic between peers, and can help avoid a situation where
+    ## a single large trace can cause a memory overrun on a single node.
+    ## If `distributed`, the amount of traffic between peers will be reduced,
+    ## but the amount of traffic between Refinery and Redis will
+    ## significantly increase, because Refinery uses Redis to distribute the
+    ## trace decisions to all nodes in the cluster. It is important to adjust
+    ## the size of the Redis cluster in this case.
+    ## The total volume of network traffic in `distributed` mode should be
+    ## expected to decrease unless the cluster size is very large (hundreds
+    ## of nodes). NOTE: This setting is not compatible with `DryRun` when set
+    ## to `distributed`. See `DryRun` for more information.
+    ##
+    ## default: concentrated
+    ## Not eligible for live reload.
+    # TraceLocalityMode: concentrated

     ## HealthCheckTimeout controls the maximum duration allowed for
     ## collection health checks to complete.