diff --git a/error.go b/error.go index cd67e9488..c9753ac49 100644 --- a/error.go +++ b/error.go @@ -32,6 +32,9 @@ type ( // CanceledError returned when operation was canceled. CanceledError = internal.CanceledError + + // NonDeterministicError is returned when a workflow's replay was non-deterministic, and it could not be resumed safely. + NonDeterministicError = internal.NonDeterministicError ) // ErrNoData is returned when trying to extract strong typed data while there is no data available. diff --git a/internal/client.go b/internal/client.go index c42c8b61e..505660648 100644 --- a/internal/client.go +++ b/internal/client.go @@ -26,8 +26,6 @@ import ( "fmt" "time" - "go.uber.org/cadence/internal/common/isolationgroup" - "github.com/opentracing/opentracing-go" "github.com/uber-go/tally" "go.uber.org/zap" @@ -35,6 +33,7 @@ import ( "go.uber.org/cadence/.gen/go/cadence/workflowserviceclient" s "go.uber.org/cadence/.gen/go/shared" "go.uber.org/cadence/internal/common/auth" + "go.uber.org/cadence/internal/common/isolationgroup" "go.uber.org/cadence/internal/common/metrics" ) diff --git a/internal/common/isolationgroup/service_wrapper_test.go b/internal/common/isolationgroup/service_wrapper_test.go index 56aa1ec33..5504526c4 100644 --- a/internal/common/isolationgroup/service_wrapper_test.go +++ b/internal/common/isolationgroup/service_wrapper_test.go @@ -25,14 +25,12 @@ import ( "testing" "time" - "github.com/stretchr/testify/assert" - - "go.uber.org/cadence/.gen/go/cadence/workflowserviceclient" - "github.com/golang/mock/gomock" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/suite" "github.com/uber/tchannel-go/thrift" + "go.uber.org/cadence/.gen/go/cadence/workflowserviceclient" "go.uber.org/cadence/.gen/go/cadence/workflowservicetest" "go.uber.org/cadence/.gen/go/shared" ) diff --git a/internal/error.go b/internal/error.go index 0519d55d1..12dc9f654 100644 --- a/internal/error.go +++ b/internal/error.go @@ -28,6 +28,7 @@ import ( "strings" "go.uber.org/cadence/.gen/go/shared" + "go.uber.org/cadence/internal/common/util" ) /* @@ -125,6 +126,51 @@ type ( stackTrace string } + // NonDeterministicError contains some structured data related to a non-deterministic + // replay failure, and is primarily intended for allowing richer error reporting. + // + // WorkflowType, WorkflowID, RunID, TaskList, and DomainName will likely be long-term stable + // and included in some form in future library versions, but the rest of these fields may + // change at any time, or be removed in a future major version change. + NonDeterministicError struct { + + // Reason is a relatively free-form description of what kind of non-determinism + // was detected. + // + // You are *strongly* encouraged to not rely on these strings for behavior, only + // explanation, for a few reasons. More will likely appear in the future, they may + // change, and there is little that can be safely decided on in an automated way. + // + // Currently, values roughly match the historical error strings, and are: + // - "missing replay decision" (The error will contain HistoryEventText, as there + // is at least one history event that has no matching replayed decision) + // - "extra replay decision" (The error will contain DecisionText, as there is + // at least one decision from replay that has no matching history event) + // - "mismatch" (Both HistoryEventText and DecisionText will exist, as there + // are issues with both. This was previously shown as "history event is ..., + // replay decision is ..." error text.) + Reason string + + WorkflowType string + WorkflowID string + RunID string + TaskList string + DomainName string + + // intentionally avoiding "history event" and "decision" names + // because we *do* have types for them, but they are in thrift and should + // not be exposed directly. + // we should consider doing that eventually though, or providing a + // simplified object for richer failure information. + + // HistoryEventText contains a String() representation of a history + // event (i.e. previously recorded) that is related to the problem. + HistoryEventText string + // DecisionText contains a String() representation of a replay decision + // event (i.e. created during replay) that is related to the problem. + DecisionText string + } + // ContinueAsNewError contains information about how to continue the workflow as new. ContinueAsNewError struct { wfn interface{} @@ -419,3 +465,57 @@ func (b ErrorDetailsValues) Get(valuePtr ...interface{}) error { } return nil } + +// NewNonDeterminsticError constructs a new *NonDeterministicError. +// +// - reason should be a documented NonDeterminsticError.Reason value +// - info is always required. only a portion of it is used, but it is a convenient +// and currently always-available object. +// - history and decision may each be present or nil at any time +func NewNonDeterminsticError(reason string, info *WorkflowInfo, history *shared.HistoryEvent, decision *shared.Decision) error { + var historyText string + if history != nil { + historyText = util.HistoryEventToString(history) + } + var decisionText string + if decision != nil { + decisionText = util.DecisionToString(decision) + } + return &NonDeterministicError{ + Reason: reason, + + WorkflowType: info.WorkflowType.Name, + WorkflowID: info.WorkflowExecution.ID, + RunID: info.WorkflowExecution.RunID, + TaskList: info.TaskListName, + DomainName: info.Domain, + + HistoryEventText: historyText, + DecisionText: decisionText, + } +} + +func (e *NonDeterministicError) Error() string { + switch e.Reason { + case "missing replay decision": + // historical text + return "nondeterministic workflow: " + + "missing replay decision for " + e.HistoryEventText + case "extra replay decision": + // historical text + return "nondeterministic workflow: " + + "extra replay decision for " + e.DecisionText + case "mismatch": + // historical text + return "nondeterministic workflow: " + + "history event is " + e.HistoryEventText + ", " + + "replay decision is " + e.DecisionText + default: + // should not occur in practice, but it's basically fine if it does. + // ideally this should crash in internal builds / tests, to prevent mismatched values. + return fmt.Sprintf( + "unknown reason %q, history event is: %s, replay decision is: %s", + e.Reason, e.HistoryEventText, e.DecisionText, + ) + } +} diff --git a/internal/internal_task_handlers.go b/internal/internal_task_handlers.go index ff0c0affc..078ac1c98 100644 --- a/internal/internal_task_handlers.go +++ b/internal/internal_task_handlers.go @@ -932,7 +932,7 @@ ProcessEvents: var nonDeterministicErr error if !skipReplayCheck && !w.isWorkflowCompleted || isReplayTest { // check if decisions from reply matches to the history events - if err := matchReplayWithHistory(replayDecisions, respondEvents); err != nil { + if err := matchReplayWithHistory(w.workflowInfo, replayDecisions, respondEvents); err != nil { nonDeterministicErr = err } } @@ -947,7 +947,7 @@ ProcessEvents: nonDeterministicErr = panicErr } else { // Since we know there is an error, we do the replay check to give more context in the log - replayErr := matchReplayWithHistory(replayDecisions, respondEvents) + replayErr := matchReplayWithHistory(w.workflowInfo, replayDecisions, respondEvents) w.wth.logger.Error("Ignored workflow panic error", zap.String(tagWorkflowType, task.WorkflowType.GetName()), zap.String(tagWorkflowID, task.WorkflowExecution.GetWorkflowId()), diff --git a/internal/workflow_replayer_utils.go b/internal/workflow_replayer_utils.go index 5563ffc45..796cf0abd 100644 --- a/internal/workflow_replayer_utils.go +++ b/internal/workflow_replayer_utils.go @@ -22,15 +22,13 @@ package internal import ( "bytes" - "fmt" "reflect" "strings" s "go.uber.org/cadence/.gen/go/shared" - "go.uber.org/cadence/internal/common/util" ) -func matchReplayWithHistory(replayDecisions []*s.Decision, historyEvents []*s.HistoryEvent) error { +func matchReplayWithHistory(info *WorkflowInfo, replayDecisions []*s.Decision, historyEvents []*s.HistoryEvent) error { di := 0 hi := 0 hSize := len(historyEvents) @@ -60,16 +58,15 @@ matchLoop: } if d == nil { - return fmt.Errorf("nondeterministic workflow: missing replay decision for %s", util.HistoryEventToString(e)) + return NewNonDeterminsticError("missing replay decision", info, e, nil) } if e == nil { - return fmt.Errorf("nondeterministic workflow: extra replay decision for %s", util.DecisionToString(d)) + return NewNonDeterminsticError("extra replay decision", info, nil, d) } if !isDecisionMatchEvent(d, e, false) { - return fmt.Errorf("nondeterministic workflow: history event is %s, replay decision is %s", - util.HistoryEventToString(e), util.DecisionToString(d)) + return NewNonDeterminsticError("mismatch", info, e, d) } di++