Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: formalize gRPC errors in case of UDF exceptions #166

Merged
merged 8 commits into from
Feb 13, 2025
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions src/main/java/io/numaproj/numaflow/batchmapper/Service.java
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
package io.numaproj.numaflow.batchmapper;

import com.google.protobuf.Any;
import com.google.protobuf.ByteString;
import com.google.protobuf.Empty;
import com.google.rpc.Code;
import com.google.rpc.DebugInfo;
import io.grpc.Status;
import io.grpc.protobuf.StatusProto;
import io.grpc.stub.StreamObserver;
import io.numaproj.numaflow.map.v1.MapGrpc;
import io.numaproj.numaflow.map.v1.MapOuterClass;
import io.numaproj.numaflow.shared.ExceptionUtils;
import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;

Expand Down Expand Up @@ -98,10 +103,15 @@ public void onNext(MapOuterClass.MapRequest mapRequest) {
} catch (Exception e) {
log.error("Encountered an error in batch map onNext", e);
shutdownSignal.completeExceptionally(e);
responseObserver.onError(Status.INTERNAL
.withDescription(e.getMessage())
.withCause(e)
.asException());
// Build gRPC Status
com.google.rpc.Status status = com.google.rpc.Status.newBuilder()
.setCode(Code.INTERNAL.getNumber())
.setMessage(ExceptionUtils.ERR_BATCH_MAP_EXCEPTION + ": " + (e.getMessage() != null ? e.getMessage() : ""))
.addDetails(Any.pack(DebugInfo.newBuilder()
.setDetail(ExceptionUtils.getStackTrace(e))
.build()))
.build();
responseObserver.onError(StatusProto.toStatusRuntimeException(status));
}
}

Expand Down
18 changes: 14 additions & 4 deletions src/main/java/io/numaproj/numaflow/mapper/MapSupervisorActor.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,13 @@
import akka.japi.pf.DeciderBuilder;
import akka.japi.pf.ReceiveBuilder;
import io.grpc.Status;
import com.google.protobuf.Any;
import com.google.rpc.Code;
import com.google.rpc.DebugInfo;
import io.grpc.protobuf.StatusProto;
import io.grpc.stub.StreamObserver;
import io.numaproj.numaflow.map.v1.MapOuterClass;
import io.numaproj.numaflow.shared.ExceptionUtils;
import lombok.extern.slf4j.Slf4j;

import java.util.Optional;
Expand Down Expand Up @@ -106,10 +111,15 @@ private void handleFailure(Exception e) {
userException = e;
// only send the very first exception to the client
// one exception should trigger a container restart
responseObserver.onError(Status.INTERNAL
.withDescription(e.getMessage())
.withCause(e)
.asException());
// Build gRPC Status
com.google.rpc.Status status = com.google.rpc.Status.newBuilder()
.setCode(Code.INTERNAL.getNumber())
.setMessage(ExceptionUtils.ERR_MAP_EXCEPTION + ": " + (e.getMessage() != null ? e.getMessage() : ""))
.addDetails(Any.pack(DebugInfo.newBuilder()
.setDetail(ExceptionUtils.getStackTrace(e))
.build()))
.build();
responseObserver.onError(StatusProto.toStatusRuntimeException(status));
}
activeMapperCount--;
}
Expand Down
18 changes: 14 additions & 4 deletions src/main/java/io/numaproj/numaflow/mapstreamer/Service.java
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
package io.numaproj.numaflow.mapstreamer;

import com.google.protobuf.Any;
import com.google.protobuf.Empty;
import com.google.rpc.Code;
import com.google.rpc.DebugInfo;
import io.grpc.Status;
import io.grpc.protobuf.StatusProto;
import io.grpc.stub.StreamObserver;
import io.numaproj.numaflow.map.v1.MapGrpc;
import io.numaproj.numaflow.map.v1.MapOuterClass;
import io.numaproj.numaflow.shared.ExceptionUtils;
import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;

Expand Down Expand Up @@ -61,10 +66,15 @@ public void onNext(MapOuterClass.MapRequest request) {
} catch (Exception e) {
log.error("Encountered error in mapFn onNext", e);
shutdownSignal.completeExceptionally(e);
responseObserver.onError(Status.INTERNAL
.withDescription(e.getMessage())
.withCause(e)
.asException());
// Build gRPC Status
com.google.rpc.Status status = com.google.rpc.Status.newBuilder()
.setCode(Code.INTERNAL.getNumber())
.setMessage(ExceptionUtils.ERR_MAP_STREAM_EXCEPTION + ": " + (e.getMessage() != null ? e.getMessage() : ""))
.addDetails(Any.pack(DebugInfo.newBuilder()
.setDetail(ExceptionUtils.getStackTrace(e))
.build()))
.build();
responseObserver.onError(StatusProto.toStatusRuntimeException(status));
return;
}

Expand Down
31 changes: 31 additions & 0 deletions src/main/java/io/numaproj/numaflow/shared/ExceptionUtils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package io.numaproj.numaflow.shared;

import java.io.PrintWriter;
import java.io.StringWriter;

public class ExceptionUtils {

Check warning on line 6 in src/main/java/io/numaproj/numaflow/shared/ExceptionUtils.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/io/numaproj/numaflow/shared/ExceptionUtils.java#L6

Added line #L6 was not covered by tests
/**
 * Formalized exception error strings.
 * <p>
 * Each constant has the form {@code UDF_EXECUTION_ERROR(<component>)}. The service code
 * uses it as the prefix of the gRPC status message, followed by {@code ": "} and the
 * exception message. Note that this Javadoc block attaches only to the first field;
 * the line comments below describe the remaining ones.
 */
// Prefix for failures reported by the source (sourcer) service.
public static final String ERR_SOURCE_EXCEPTION = "UDF_EXECUTION_ERROR(source)";
// Prefix for failures reported by the source transformer.
public static final String ERR_TRANSFORMER_EXCEPTION = "UDF_EXECUTION_ERROR(transformer)";
// Prefix for failures reported by the sink (sinker) service.
public static final String ERR_SINK_EXCEPTION = "UDF_EXECUTION_ERROR(sink)";
// Prefix for failures reported by the map-stream service.
public static final String ERR_MAP_STREAM_EXCEPTION = "UDF_EXECUTION_ERROR(mapstream)";
// Prefix for failures reported by the map service.
public static final String ERR_MAP_EXCEPTION = "UDF_EXECUTION_ERROR(map)";
// Prefix for failures reported by the batch-map service.
public static final String ERR_BATCH_MAP_EXCEPTION = "UDF_EXECUTION_ERROR(batchmap)";
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we skipping reducers?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, for now (to be taken up later across the SDKs): numaproj/numaflow-go#173 (comment)


/**
 * Converts the stack trace of a throwable into a String.
 *
 * @param t the throwable to extract the stack trace from; may be {@code null}
 * @return the stack trace exactly as {@link Throwable#printStackTrace(PrintWriter)} prints it,
 *         or the literal {@code "No exception provided."} when {@code t} is {@code null}
 */
public static String getStackTrace(Throwable t) {
    if (t == null) {
        return "No exception provided.";
    }
    StringWriter sw = new StringWriter();
    // try-with-resources closes (and therefore flushes) the PrintWriter before the
    // buffer is read; closing the underlying StringWriter has no effect.
    try (PrintWriter pw = new PrintWriter(sw)) {
        t.printStackTrace(pw);
    }
    return sw.toString();
}
}
20 changes: 12 additions & 8 deletions src/main/java/io/numaproj/numaflow/sideinput/Server.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ public Server(SideInputRetriever sideInputRetriever) {
/**
* constructor to create gRPC server with gRPC config.
*
* @param grpcConfig to configure the max message size for grpc
* @param grpcConfig to configure the max message size for grpc
* @param sideInputRetriever to retrieve the side input
*/
public Server(SideInputRetriever sideInputRetriever, GRPCConfig grpcConfig) {
Expand All @@ -41,7 +41,8 @@ public Server(SideInputRetriever sideInputRetriever, GRPCConfig grpcConfig) {
}

@VisibleForTesting
protected Server(GRPCConfig grpcConfig, SideInputRetriever service, ServerInterceptor interceptor, String serverName) {
protected Server(GRPCConfig grpcConfig, SideInputRetriever service, ServerInterceptor interceptor,
String serverName) {
this.grpcConfig = grpcConfig;
this.server = new GrpcServerWrapper(
interceptor,
Expand All @@ -67,8 +68,7 @@ public void start() throws Exception {

log.info(
"server started, listening on {}",
this.grpcConfig.isLocal() ?
"localhost:" + this.grpcConfig.getPort() : this.grpcConfig.getSocketPath());
this.grpcConfig.isLocal() ? "localhost:" + this.grpcConfig.getPort() : this.grpcConfig.getSocketPath());

// register shutdown hook to gracefully shut down the server
Runtime.getRuntime().addShutdownHook(new Thread(() -> {
Expand All @@ -83,11 +83,14 @@ public void start() throws Exception {
}

/**
* Blocks until the server has terminated. If the server is already terminated, this method
* will return immediately. If the server is not yet terminated, this method will block the
* Blocks until the server has terminated. If the server is already terminated,
* this method
* will return immediately. If the server is not yet terminated, this method
* will block the
* calling thread until the server has terminated.
*
* @throws InterruptedException if the current thread is interrupted while waiting
* @throws InterruptedException if the current thread is interrupted while
* waiting
*/
public void awaitTermination() throws InterruptedException {
log.info("side input server is waiting for termination");
Expand All @@ -96,7 +99,8 @@ public void awaitTermination() throws InterruptedException {
}

/**
* Stop serving requests and shutdown resources. Await termination on the main thread since the
* Stop serving requests and shutdown resources. Await termination on the main
* thread since the
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the reformatting right? It seems we are getting more, but shorter, lines.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to check why this was happening; reverted in most places.

* grpc library uses daemon threads.
*
* @throws InterruptedException if shutdown is interrupted
Expand Down
11 changes: 4 additions & 7 deletions src/main/java/io/numaproj/numaflow/sideinput/Service.java
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
package io.numaproj.numaflow.sideinput;

import com.google.protobuf.ByteString;
import com.google.protobuf.Empty;
import io.grpc.stub.StreamObserver;
import com.google.protobuf.ByteString;
import io.numaproj.numaflow.sideinput.v1.SideInputGrpc;
import io.numaproj.numaflow.sideinput.v1.Sideinput;
import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;


@Slf4j
@AllArgsConstructor
class Service extends SideInputGrpc.SideInputImplBase {
Expand All @@ -29,11 +28,8 @@ public void retrieveSideInput(
responseObserver);
return;
}


// process request
Message message = sideInputRetriever.retrieveSideInput();

// set response
responseObserver.onNext(buildResponse(message));
responseObserver.onCompleted();
Expand All @@ -50,8 +46,9 @@ public void isReady(Empty request, StreamObserver<Sideinput.ReadyResponse> respo

private Sideinput.SideInputResponse buildResponse(Message message) {
return Sideinput.SideInputResponse.newBuilder()
.setValue(message.getValue() == null ? ByteString.EMPTY : ByteString.copyFrom(
message.getValue()))
.setValue(message.getValue() == null ? ByteString.EMPTY
: ByteString.copyFrom(
message.getValue()))
.setNoBroadcast(message.isNoBroadcast())
.build();
}
Expand Down
14 changes: 14 additions & 0 deletions src/main/java/io/numaproj/numaflow/sinker/Service.java
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
package io.numaproj.numaflow.sinker;

import com.google.protobuf.Any;
import com.google.protobuf.Empty;
import com.google.rpc.Code;
import com.google.rpc.DebugInfo;
import io.grpc.Status;
import io.grpc.protobuf.StatusProto;
import io.grpc.stub.StreamObserver;
import io.numaproj.numaflow.shared.ExceptionUtils;
import io.numaproj.numaflow.sink.v1.SinkGrpc;
import io.numaproj.numaflow.sink.v1.SinkOuterClass;
import lombok.AllArgsConstructor;
Expand Down Expand Up @@ -100,6 +105,15 @@
responseObserver.onError(Status.INTERNAL
.withDescription(e.getMessage())
.asException());
// Build gRPC Status
com.google.rpc.Status status = com.google.rpc.Status.newBuilder()
.setCode(Code.INTERNAL.getNumber())
.setMessage(ExceptionUtils.ERR_SINK_EXCEPTION + ": " + (e.getMessage() != null ? e.getMessage() : ""))
.addDetails(Any.pack(DebugInfo.newBuilder()
.setDetail(ExceptionUtils.getStackTrace(e))
.build()))
.build();
responseObserver.onError(StatusProto.toStatusRuntimeException(status));

Check warning on line 116 in src/main/java/io/numaproj/numaflow/sinker/Service.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/io/numaproj/numaflow/sinker/Service.java#L116

Added line #L116 was not covered by tests
}
}

Expand Down
29 changes: 20 additions & 9 deletions src/main/java/io/numaproj/numaflow/sourcer/Service.java
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
package io.numaproj.numaflow.sourcer;

import com.google.protobuf.Any;
import com.google.protobuf.Empty;
import com.google.rpc.Code;
import com.google.rpc.DebugInfo;
import io.grpc.Status;
import io.grpc.protobuf.StatusProto;
import io.grpc.stub.StreamObserver;
import io.numaproj.numaflow.shared.ExceptionUtils;
import io.numaproj.numaflow.source.v1.SourceGrpc;
import io.numaproj.numaflow.source.v1.SourceOuterClass;
import lombok.AllArgsConstructor;
Expand All @@ -15,7 +20,6 @@

import static io.numaproj.numaflow.source.v1.SourceGrpc.getPendingFnMethod;


/**
* Implementation of the gRPC service for the sourcer.
*/
Expand All @@ -31,7 +35,8 @@
* @param responseObserver the response observer
*/
@Override
public StreamObserver<SourceOuterClass.ReadRequest> readFn(final StreamObserver<SourceOuterClass.ReadResponse> responseObserver) {
public StreamObserver<SourceOuterClass.ReadRequest> readFn(
final StreamObserver<SourceOuterClass.ReadResponse> responseObserver) {
return new StreamObserver<>() {
private boolean handshakeDone = false;

Expand Down Expand Up @@ -80,10 +85,15 @@
} catch (Exception e) {
log.error("Encountered error in readFn onNext", e);
shutdownSignal.completeExceptionally(e);
responseObserver.onError(Status.INTERNAL
.withDescription(e.getMessage())
.withCause(e)
.asException());
// Build gRPC Status
com.google.rpc.Status status = com.google.rpc.Status.newBuilder()
.setCode(Code.INTERNAL.getNumber())

Check warning on line 90 in src/main/java/io/numaproj/numaflow/sourcer/Service.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/io/numaproj/numaflow/sourcer/Service.java#L89-L90

Added lines #L89 - L90 were not covered by tests
.setMessage(ExceptionUtils.ERR_SOURCE_EXCEPTION + ": " + (e.getMessage() != null ? e.getMessage() : ""))
.addDetails(Any.pack(DebugInfo.newBuilder()
.setDetail(ExceptionUtils.getStackTrace(e))
.build()))
.build();
responseObserver.onError(StatusProto.toStatusRuntimeException(status));

Check warning on line 96 in src/main/java/io/numaproj/numaflow/sourcer/Service.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/io/numaproj/numaflow/sourcer/Service.java#L92-L96

Added lines #L92 - L96 were not covered by tests
}
}

Expand Down Expand Up @@ -201,7 +211,8 @@
SourceOuterClass.PendingResponse.Result
.newBuilder()
.setCount(this.sourcer.getPending())
.build()).build());
.build())
.build());
responseObserver.onCompleted();
}

Expand Down Expand Up @@ -236,8 +247,8 @@
responseObserver.onNext(SourceOuterClass.PartitionsResponse.newBuilder()
.setResult(
SourceOuterClass.PartitionsResponse.Result.newBuilder()
.addAllPartitions(partitions)).
build());
.addAllPartitions(partitions))
.build());
responseObserver.onCompleted();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,13 @@
import akka.actor.SupervisorStrategy;
import akka.japi.pf.DeciderBuilder;
import akka.japi.pf.ReceiveBuilder;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import io.grpc.Status;
import com.google.protobuf.Any;
import com.google.rpc.Code;
import com.google.rpc.DebugInfo;
import io.grpc.protobuf.StatusProto;
import io.numaproj.numaflow.shared.ExceptionUtils;
import io.numaproj.numaflow.sourcetransformer.v1.Sourcetransformer;
import lombok.extern.slf4j.Slf4j;

Expand Down Expand Up @@ -144,10 +149,16 @@ private void handleFailure(Exception e) {
userException = e;
// only send the very first exception to the client
// one exception should trigger a container restart
responseObserver.onError(Status.INTERNAL
.withDescription(e.getMessage())
.withCause(e)
.asException());

// Build gRPC Status
com.google.rpc.Status status = com.google.rpc.Status.newBuilder()
.setCode(Code.INTERNAL.getNumber())
.setMessage(ExceptionUtils.ERR_TRANSFORMER_EXCEPTION + ": " + (e.getMessage() != null ? e.getMessage() : ""))
.addDetails(Any.pack(DebugInfo.newBuilder()
.setDetail(ExceptionUtils.getStackTrace(e))
.build()))
.build();
responseObserver.onError(StatusProto.toStatusRuntimeException(status));
}
activeTransformersCount--;
}
Expand Down Expand Up @@ -217,7 +228,6 @@ public SupervisorStrategy supervisorStrategy() {
.asException());
return SupervisorStrategy.stop();
})
.build()
);
.build());
}
}
Loading
Loading