From 5ad7f92d4a4d06b733cd7854e0782ca4475649d5 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Wed, 20 Dec 2023 15:18:38 -0800 Subject: [PATCH 01/83] Clean up RTI Prevent exiting via SIGPIPE on socket write failure and instead handle the error. Distinguish normal termination from interrupted termination and avoid mutexes in the latter and avoid lf_print in the former. Remove unnecessary absent messages using EIMT and EIMT_strict. Eliminate the bogus port search algorithm (which never worked) and just use DEFAULT_PORT or an override given on the command line or an `at` clause. Also, calloc instead of malloc federate info so that pointers are reliably NULL. Also, added lf_print_error_system_failure utility to print system call error information. --- core/federated/RTI/main.c | 45 ++++++++++--- core/federated/RTI/rti_common.c | 59 +++++++++++++---- core/federated/RTI/rti_common.h | 14 +++- core/federated/RTI/rti_local.c | 4 +- core/federated/RTI/rti_remote.c | 50 +++++++-------- core/federated/RTI/rti_remote.h | 4 +- core/federated/network/net_util.c | 47 ++++++++------ core/reactor.c | 1 + include/core/federated/network/net_common.h | 71 ++++++++------------- include/core/federated/network/net_util.h | 7 ++ include/core/reactor_common.h | 3 + include/core/trace.h | 4 ++ include/core/utils/util.h | 15 +++-- 13 files changed, 203 insertions(+), 121 deletions(-) diff --git a/core/federated/RTI/main.c b/core/federated/RTI/main.c index fdc234ced..b17bb41d5 100644 --- a/core/federated/RTI/main.c +++ b/core/federated/RTI/main.c @@ -67,16 +67,26 @@ static rti_remote_t rti; */ const char *rti_trace_file_name = "rti.lft"; +/** Indicator that normal termination has occurred. */ +bool normal_termination = false; + /** - * @brief A clean termination of the RTI will write the trace file, if tracing is - * enabled, before exiting. + * @brief Function to run upon termination. 
+ * This function will be invoked both after main() returns and when a signal + * that results in terminating the process, such as SIGINT. In the former + * case, it should do nothing. In the latter case, it will attempt to write + * the trace file, but without acquiring a mutex lock, so the resulting files + * may be incomplete or even corrupted. But this is better than just failing + * to write the data we have collected so far. */ void termination() { - if (rti.base.tracing_enabled) { - stop_trace(rti.base.trace); - lf_print("RTI trace file saved."); + if (!normal_termination) { + if (rti.base.tracing_enabled) { + stop_trace_locked(rti.base.trace); + lf_print("RTI trace file saved."); + } + lf_print("RTI is exiting abnormally."); } - lf_print("RTI is exiting."); } void usage(int argc, const char* argv[]) { @@ -86,7 +96,7 @@ void usage(int argc, const char* argv[]) { lf_print(" -n, --number_of_federates "); lf_print(" The number of federates in the federation that this RTI will control.\n"); lf_print(" -p, --port "); - lf_print(" The port number to use for the RTI. Must be larger than 0 and smaller than %d. Default is %d.\n", UINT16_MAX, STARTING_PORT); + lf_print(" The port number to use for the RTI. Must be larger than 0 and smaller than %d. Default is %d.\n", UINT16_MAX, DEFAULT_PORT); lf_print(" -c, --clock_sync [off|init|on] [period ] [exchanges-per-interval ]"); lf_print(" The status of clock synchronization for this federate."); lf_print(" - off: Clock synchronization is off."); @@ -254,6 +264,16 @@ int main(int argc, const char* argv[]) { // Catch the Ctrl-C signal, for a clean exit that does not lose the trace information signal(SIGINT, exit); +#ifdef SIGPIPE + // Ignore SIGPIPE errors, which terminate the entire application if + // socket write() fails because the reader has closed the socket. + // Instead, cause an EPIPE error to be set when write() fails. 
+ // NOTE: The reason for a broken socket causing a SIGPIPE signal + // instead of just having write() return an error is to robustly + // handle a foo | bar pipeline where bar crashes. The default behavior + // is for foo to also exit. + signal(SIGPIPE, SIG_IGN); +#endif // SIGPIPE if (atexit(termination) != 0) { lf_print_warning("Failed to register termination function!"); } @@ -277,15 +297,22 @@ int main(int argc, const char* argv[]) { // Allocate memory for the federates rti.base.scheduling_nodes = (scheduling_node_t**)calloc(rti.base.number_of_scheduling_nodes, sizeof(scheduling_node_t*)); for (uint16_t i = 0; i < rti.base.number_of_scheduling_nodes; i++) { - federate_info_t *fed_info = (federate_info_t *) malloc(sizeof(federate_info_t)); + federate_info_t *fed_info = (federate_info_t *) calloc(1, sizeof(federate_info_t)); initialize_federate(fed_info, i); rti.base.scheduling_nodes[i] = (scheduling_node_t *) fed_info; } int socket_descriptor = start_rti_server(rti.user_specified_port); wait_for_federates(socket_descriptor); + normal_termination = true; + if (rti.base.tracing_enabled) { + // No need for a mutex lock because all threads have exited. + stop_trace_locked(rti.base.trace); + lf_print("RTI trace file saved."); + } + + lf_print("RTI is exiting."); // Do this before freeing scheduling nodes. free_scheduling_nodes(rti.base.scheduling_nodes, rti.base.number_of_scheduling_nodes); - lf_print("RTI is exiting."); return 0; } #endif // STANDALONE_RTI diff --git a/core/federated/RTI/rti_common.c b/core/federated/RTI/rti_common.c index a6554195e..a3f996bd5 100644 --- a/core/federated/RTI/rti_common.c +++ b/core/federated/RTI/rti_common.c @@ -113,6 +113,40 @@ tag_t earliest_future_incoming_message_tag(scheduling_node_t* e) { return t_d; } +tag_t eimt_strict(scheduling_node_t* e) { + // Find the tag of the earliest possible incoming message from immediately upstream + // enclaves or federates that are not part of a zero-delay cycle.
+ // This will be the smallest upstream NET plus the least delay. + // This could be NEVER_TAG if the RTI has not seen a NET from some upstream node. + tag_t t_d = FOREVER_TAG; + for (int i = 0; i < e->num_upstream; i++) { + scheduling_node_t* upstream = rti_common->scheduling_nodes[e->upstream[i]]; + // Skip this node if it is part of a zero-delay cycle. + if (is_in_zero_delay_cycle(upstream)) continue; + // If we haven't heard from the upstream node, then assume it can send an event at the start time. + if (lf_tag_compare(upstream->next_event, NEVER_TAG) == 0) { + tag_t start_tag = {.time = start_time, .microstep = 0}; + upstream->next_event = start_tag; + } + // Need to consider nodes that are upstream of the upstream node because those + // nodes may send messages to the upstream node. + tag_t earliest = earliest_future_incoming_message_tag(upstream); + // If the next event of the upstream node is earlier, then use that. + if (lf_tag_compare(upstream->next_event, earliest) < 0) { + earliest = upstream->next_event; + } + tag_t earliest_tag_from_upstream = lf_delay_tag(earliest, e->upstream_delay[i]); + LF_PRINT_DEBUG("RTI: Strict EIMT of fed/encl %d at fed/encl %d has tag " PRINTF_TAG ".", + e->id, + upstream->id, + earliest_tag_from_upstream.time - start_time, earliest_tag_from_upstream.microstep); + if (lf_tag_compare(earliest_tag_from_upstream, t_d) < 0) { + t_d = earliest_tag_from_upstream; + } + } + return t_d; +} + tag_advance_grant_t tag_advance_grant_if_safe(scheduling_node_t* e) { tag_advance_grant_t result = {.tag = NEVER_TAG, .is_provisional = false}; @@ -152,24 +186,26 @@ tag_advance_grant_t tag_advance_grant_if_safe(scheduling_node_t* e) { // Find the tag of the earliest event that may be later received from an upstream enclave // or federate (which includes any after delays on the connections). tag_t t_d = earliest_future_incoming_message_tag(e); + // Strict version of the above. 
This is a tag that must be strictly greater than + // that of any granted PTAG. + tag_t t_d_strict = eimt_strict(e); LF_PRINT_LOG("RTI: Earliest next event upstream of node %d has tag " PRINTF_TAG ".", e->id, t_d.time - start_time, t_d.microstep); // Given an EIMT (earliest incoming message tag) there are these possible scenarios: // 1) The EIMT is greater than the NET we want to advance to. Grant a TAG. - // 2) The EIMT is equal to the NET and the federate is part of a zero-delay cycle (ZDC). - // 3) The EIMT is equal to the NET and the federate is not part of a ZDC. - // 4) The EIMT is less than the NET - // In (1) we can give a TAG to NET. In (2) we can give a PTAG. - // In (3) and (4), we wait for further updates from upstream federates. + // 2) The EIMT is equal to the NET and the strict EIMT is greater than the net + // and the federate is part of a zero-delay cycle (ZDC). Grant a PTAG. + // 3) Otherwise, grant nothing and wait for further updates. if ( // Scenario (1) above lf_tag_compare(t_d, e->next_event) > 0 // EIMT greater than NET + && lf_tag_compare(e->next_event, NEVER_TAG) > 0 // NET is not NEVER_TAG && lf_tag_compare(t_d, e->last_provisionally_granted) >= 0 // The grant is not redundant - // (equal is important to override any previous - // PTAGs). - && lf_tag_compare(t_d, e->last_granted) > 0 // The grant is not redundant. + // (equal is important to override any previous + // PTAGs). + && lf_tag_compare(t_d, e->last_granted) > 0 // The grant is not redundant. ) { // No upstream node can send events that will be received with a tag less than or equal to // e->next_event, so it is safe to send a TAG. 
@@ -180,9 +216,10 @@ tag_advance_grant_t tag_advance_grant_if_safe(scheduling_node_t* e) { e->next_event.time - lf_time_start(), e->next_event.microstep); result.tag = e->next_event; - } else if( // Scenario (2) or (3) above + } else if( // Scenario (2) above lf_tag_compare(t_d, e->next_event) == 0 // EIMT equal to NET && is_in_zero_delay_cycle(e) // The node is part of a ZDC + && lf_tag_compare(t_d_strict, e->next_event) > 0 // The strict EIMT is greater than the NET && lf_tag_compare(t_d, e->last_provisionally_granted) > 0 // The grant is not redundant && lf_tag_compare(t_d, e->last_granted) > 0 // The grant is not redundant. ) { @@ -317,8 +354,8 @@ void update_min_delays_upstream(scheduling_node_t* node) { // Put the results onto the node's struct. node->num_min_delays = count; - node->min_delays = (minimum_delay_t*)malloc(count * sizeof(minimum_delay_t)); - LF_PRINT_DEBUG("++++ Node %hu(is in ZDC: %d\n", node->id, node->flags & IS_IN_ZERO_DELAY_CYCLE); + node->min_delays = (minimum_delay_t*)calloc(count, sizeof(minimum_delay_t)); + LF_PRINT_DEBUG("++++ Node %hu is in ZDC: %d", node->id, is_in_zero_delay_cycle(node)); int k = 0; for (int i = 0; i < rti_common->number_of_scheduling_nodes; i++) { if (lf_tag_compare(path_delays[i], FOREVER_TAG) < 0) { diff --git a/core/federated/RTI/rti_common.h b/core/federated/RTI/rti_common.h index d71751a98..010b5f2f2 100644 --- a/core/federated/RTI/rti_common.h +++ b/core/federated/RTI/rti_common.h @@ -230,7 +230,7 @@ void update_scheduling_node_next_event_tag_locked(scheduling_node_t* e, tag_t ne /** * Given a node (enclave or federate), find the tag of the earliest possible incoming - * message from upstream enclaves or federates, which will be the smallest upstream NET + * message (EIMT) from upstream enclaves or federates, which will be the smallest upstream NET * plus the least delay. This could be NEVER_TAG if the RTI has not seen a NET from some * upstream node. * @param e The target node. 
@@ -238,6 +238,18 @@ void update_scheduling_node_next_event_tag_locked(scheduling_node_t* e, tag_t ne */ tag_t earliest_future_incoming_message_tag(scheduling_node_t* e); +/** + * Given a node (enclave or federate), find the earliest incoming message tag (EIMT) from + * any immediately upstream node that is not part of a zero-delay cycle (ZDC). + * These tags are treated strictly by the RTI when deciding whether to grant a PTAG. + * Since the upstream node is not part of a ZDC, there is no need to block on the input + * from that node since we can simply wait for it to complete its tag without chance of + * introducing a deadlock. This will return FOREVER_TAG if there are no non-ZDC upstream nodes. + * @param e The target node. + * @return The earliest possible incoming message tag from a non-ZDC upstream node. + */ +tag_t eimt_strict(scheduling_node_t* e); + /** * Return true if the node is in a zero-delay cycle. * @param node The node. diff --git a/core/federated/RTI/rti_local.c b/core/federated/RTI/rti_local.c index 1f6cc0928..57af1047d 100644 --- a/core/federated/RTI/rti_local.c +++ b/core/federated/RTI/rti_local.c @@ -35,7 +35,7 @@ static rti_local_t * rti_local; lf_mutex_t rti_mutex; void initialize_local_rti(environment_t *envs, int num_envs) { - rti_local = (rti_local_t*)malloc(sizeof(rti_local_t)); + rti_local = (rti_local_t*)calloc(1, sizeof(rti_local_t)); LF_ASSERT(rti_local, "Out of memory"); initialize_rti_common(&rti_local->base); @@ -47,7 +47,7 @@ void initialize_local_rti(environment_t *envs, int num_envs) { // Allocate memory for the enclave_info objects rti_local->base.scheduling_nodes = (scheduling_node_t**)calloc(num_envs, sizeof(scheduling_node_t*)); for (int i = 0; i < num_envs; i++) { - enclave_info_t *enclave_info = (enclave_info_t *) malloc(sizeof(enclave_info_t)); + enclave_info_t *enclave_info = (enclave_info_t *) calloc(1, sizeof(enclave_info_t)); initialize_enclave_info(enclave_info, i, &envs[i]); rti_local->base.scheduling_nodes[i] = 
(scheduling_node_t *) enclave_info; diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 2fce8b1bf..608875df3 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -68,7 +68,7 @@ int create_server(int32_t specified_port, uint16_t port, socket_type_t socket_ty timeout_time = (struct timeval){.tv_sec = UDP_TIMEOUT_TIME / BILLION, .tv_usec = (UDP_TIMEOUT_TIME % BILLION) / 1000}; } if (socket_descriptor < 0) { - lf_print_error_and_exit("Failed to create RTI socket."); + lf_print_error_system_failure("Failed to create RTI socket."); } // Set the option for this socket to reuse the same address @@ -86,8 +86,8 @@ int create_server(int32_t specified_port, uint16_t port, socket_type_t socket_ty /* * The following used to permit reuse of a port that an RTI has previously - * used that has not been released. We no longer do this, but instead - * increment the port number until an available port is found. + * used that has not been released. We no longer do this, and instead retry + * some number of times after waiting. // SO_REUSEPORT (since Linux 3.9) // Permits multiple AF_INET or AF_INET6 sockets to be bound to an @@ -127,14 +127,11 @@ int create_server(int32_t specified_port, uint16_t port, socket_type_t socket_ty (struct sockaddr *) &server_fd, sizeof(server_fd)); - // If the binding fails with this port and no particular port was specified - // in the LF program, then try the next few ports in sequence. - while (result != 0 - && specified_port == 0 - && port >= STARTING_PORT - && port <= STARTING_PORT + PORT_RANGE_LIMIT) { - lf_print("RTI failed to get port %d. Trying %d.", port, port + 1); - port++; + // Try repeatedly to bind to the specified port. + int count = 1; + while (result != 0 && count++ < PORT_BIND_RETRY_LIMIT) { + lf_print("RTI failed to get port %d. 
Will try again.", port); + lf_sleep(PORT_BIND_RETRY_INTERVAL); server_fd.sin_port = htons(port); result = bind( socket_descriptor, @@ -142,13 +139,7 @@ int create_server(int32_t specified_port, uint16_t port, socket_type_t socket_ty sizeof(server_fd)); } if (result != 0) { - if (specified_port == 0) { - lf_print_error_and_exit("Failed to bind the RTI socket. Cannot find a usable port. " - "Consider increasing PORT_RANGE_LIMIT in net_common.h."); - } else { - lf_print_error_and_exit("Failed to bind the RTI socket. Specified port is not available. " - "Consider leaving the port unspecified"); - } + lf_print_error_and_exit("Failed to bind the RTI socket. Port %d is not available. ", port); } char* type = "TCP"; if (socket_type == UDP) { @@ -251,9 +242,9 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { // a later or equal PTAG or TAG sent previously and if their transitive // NET is greater than or equal to the tag. // This is needed to stimulate absent messages from upstream and break deadlocks. - // NOTE: This could later be replaced with a TNET mechanism once - // we have an available encoding of causality interfaces. - // That might be more efficient. + // The scenario this deals with is illustrated in `test/C/src/federated/FeedbackDelay2.lf` + // and `test/C/src/federated/FeedbackDelay4.lf`. + // Note that this is transitive. // NOTE: This is not needed for enclaves because zero-delay loops are prohibited. // It's only needed for federates, which is why this is implemented here. for (int j = 0; j < e->num_upstream; j++) { @@ -263,10 +254,13 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { if (upstream->state == NOT_CONNECTED) continue; tag_t earliest = earliest_future_incoming_message_tag(upstream); + tag_t strict_earliest = eimt_strict(upstream); // If these tags are equal, then a TAG or PTAG should have already been granted, // in which case, another will not be sent. 
But it may not have been already granted. - if (lf_tag_compare(earliest, tag) >= 0) { + if (lf_tag_compare(earliest, tag) > 0) { + notify_tag_advance_grant(upstream, tag); + } else if(lf_tag_compare(earliest, tag) == 0 && lf_tag_compare(strict_earliest, tag) > 0) { notify_provisional_tag_advance_grant(upstream, tag); } } @@ -1016,7 +1010,7 @@ void handle_federate_resign(federate_info_t *my_fed) { // an orderly shutdown. // close(my_fed->socket); // from unistd.h - lf_print("Federate %d has resigned.", my_fed->enclave.id); + lf_print("RTI: Federate %d has resigned.", my_fed->enclave.id); // Check downstream federates to see whether they should now be granted a TAG. // To handle cycles, need to create a boolean array to keep @@ -1093,8 +1087,10 @@ void* federate_info_thread_TCP(void* fed) { } // Nothing more to do. Close the socket and exit. + // Prevent multiple threads from closing the same socket at the same time. + lf_mutex_lock(&rti_mutex); close(my_fed->socket); // from unistd.h - + lf_mutex_unlock(&rti_mutex); return NULL; } @@ -1458,7 +1454,7 @@ void connect_to_federates(int socket_descriptor) { // Got a socket break; } else if (socket_id < 0 && (errno != EAGAIN || errno != EWOULDBLOCK)) { - lf_print_error_and_exit("RTI failed to accept the socket. %s.", strerror(errno)); + lf_print_error_system_failure("RTI failed to accept the socket."); } else { // Try again lf_print_warning("RTI failed to accept the socket. %s. Trying again.", strerror(errno)); @@ -1558,8 +1554,8 @@ void initialize_federate(federate_info_t* fed, uint16_t id) { int32_t start_rti_server(uint16_t port) { int32_t specified_port = port; if (port == 0) { - // Use the default starting port. - port = STARTING_PORT; + // Use the default port. 
+ port = DEFAULT_PORT; } _lf_initialize_clock(); // Create the TCP socket server diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index b3249ec30..21264e76a 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -187,9 +187,7 @@ extern int lf_critical_section_exit(environment_t* env); /** * Create a server and enable listening for socket connections. * - * @note This function is similar to create_server(...) in - * federate.c. However, it contains logs that are specific - * to the RTI. + * @note This function is different from create_server(...) in federate.c. * * @param port The port number to use. * @param socket_type The type of the socket for the server (TCP or UDP). diff --git a/core/federated/network/net_util.c b/core/federated/network/net_util.c index 99c6e6cf8..8f5f46178 100644 --- a/core/federated/network/net_util.c +++ b/core/federated/network/net_util.c @@ -56,10 +56,14 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /** Number of nanoseconds to sleep before retrying a socket read. */ #define SOCKET_READ_RETRY_INTERVAL 1000000 +// Mutex lock held while performing socket close operations. +// A deadlock can occur if two threads simultaneously attempt to close the same socket. +lf_mutex_t socket_mutex; + int create_real_time_tcp_socket_errexit() { int sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); if (sock < 0) { - lf_print_error_and_exit("Could not open TCP socket. 
Err=%d", sock); + lf_print_error_system_failure("Could not open TCP socket."); } // Disable Nagle's algorithm which bundles together small TCP messages to // reduce network traffic @@ -69,7 +73,7 @@ int create_real_time_tcp_socket_errexit() { int result = setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, &flag, sizeof(int)); if (result < 0) { - lf_print_error_and_exit("Failed to disable Nagle algorithm on socket server."); + lf_print_error_system_failure("Failed to disable Nagle algorithm on socket server."); } // Disable delayed ACKs. Only possible on Linux @@ -77,7 +81,7 @@ int create_real_time_tcp_socket_errexit() { result = setsockopt(sock, IPPROTO_TCP, TCP_QUICKACK, &flag, sizeof(int)); if (result < 0) { - lf_print_error_and_exit("Failed to disable Nagle algorithm on socket server."); + lf_print_error_system_failure("Failed to disable Nagle algorithm on socket server."); } #endif @@ -96,27 +100,30 @@ ssize_t read_from_socket_errexit( lf_print_error_and_exit(format, args); } ssize_t bytes_read = 0; + int retry_count = 0; while (bytes_read < (ssize_t)num_bytes) { ssize_t more = read(socket, buffer + bytes_read, num_bytes - (size_t)bytes_read); - if(more <= 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) { - // The error code set by the socket indicates + if(more < 0 && retry_count++ < NUM_SOCKET_RETRIES) { + // Used to retry only on: (errno == EAGAIN || errno == EWOULDBLOCK) + // Those error codes set by the socket indicates // that we should try again (@see man errno). - LF_PRINT_DEBUG("Reading from socket was blocked. Will try again."); + // Now we retry on all errors, but a bounded number of times. + LF_PRINT_DEBUG("Reading from socket failed. Will try again."); + lf_sleep(DELAY_BETWEEN_SOCKET_RETRIES); continue; - } else if (more <= 0) { - if (format != NULL) { - shutdown(socket, SHUT_RDWR); - close(socket); - lf_print_error("Read %ld bytes, but expected %zu. 
errno=%d", - more + bytes_read, num_bytes, errno); - lf_print_error_and_exit(format, args); - } else if (more == 0) { - // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, - // upon receiving a zero length packet or an error, we can close the socket. - // If there are any pending outgoing messages, this will attempt to send those - // followed by an EOF. - close(socket); - } + } else if (more < 0) { + // Retries are exhausted. + lf_print_error_system_failure("Socket read failed after %d tries. Read %ld bytes, but expected %zu.", + retry_count, more + bytes_read, num_bytes); + } else if (more == 0) { + // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, + // upon receiving a zero length packet or an error, we can close the socket. + // If there are any pending outgoing messages, this will attempt to send those + // followed by an EOF. + LF_PRINT_DEBUG("EOF received from client. Closing socket."); + lf_mutex_lock(&socket_mutex); + close(socket); + lf_mutex_unlock(&socket_mutex); return more; } bytes_read += more; diff --git a/core/reactor.c b/core/reactor.c index ce95b057d..7f5c3e344 100644 --- a/core/reactor.c +++ b/core/reactor.c @@ -397,6 +397,7 @@ int lf_reactor_c_main(int argc, const char* argv[]) { if (_lf_do_step(env)) { while (next(env) != 0); } + _lf_normal_termination = true; return 0; } else { return -1; diff --git a/include/core/federated/network/net_common.h b/include/core/federated/network/net_common.h index 38001cc0b..47bb644e8 100644 --- a/include/core/federated/network/net_common.h +++ b/include/core/federated/network/net_common.h @@ -37,12 +37,9 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Each federate attempts to connect with an RTI at the IP address * put into its code by the code generator (i.e., it attempts to - * open a TCP connection). 
It starts by trying the - * port number given by STARTING_PORT and increments the port number - * from there until it successfully connects. The maximum port number - * it will try before giving up is STARTING_PORT + PORT_RANGE_LIMIT. - * - * FIXME: What if a port is specified in the "at" of the federated statement? + * open a TCP connection). If an explicit port is given in the `at` clause + * on the `federated reactor` statement, it will use that port. Otherwise, it will + * use DEFAULT_PORT. * * When it has successfully opened a TCP connection, the first message it sends * to the RTI is a MSG_TYPE_FED_IDS message, which contains the ID of this federate @@ -137,9 +134,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * parameter of the target is "decentralized" and the federate has * inbound connections from other federates, then it starts a socket * server to listen for incoming connections from those federates. - * It attempts to create the server at the port given by STARTING_PORT, - * and if this fails, increments the port number from there until a - * port is available. It then sends to the RTI an MSG_TYPE_ADDRESS_ADVERTISEMENT message + * It then sends to the RTI an MSG_TYPE_ADDRESS_ADVERTISEMENT message * with the port number as a payload. The federate then creates a thread * to listen for incoming socket connections and messages. * @@ -208,7 +203,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #define UDP_TIMEOUT_TIME SEC(1) - /** * Size of the buffer used for messages sent between federates. * This is used by both the federates and the rti, so message lengths @@ -217,63 +211,52 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FED_COM_BUFFER_SIZE 256u /** - * Number of nanoseconds that elapse between a federate's attempts - * to connect to the RTI. + * Time between a federate's attempts to connect to the RTI. 
*/ -#define CONNECT_RETRY_INTERVAL 2000000000LL +#define CONNECT_RETRY_INTERVAL SEC(1) /** * Bound on the number of retries to connect to the RTI. * A federate will retry every CONNECT_RETRY_INTERVAL seconds - * this many times before giving up. E.g., 500 retries every - * 2 seconds results in retrying for about 16 minutes. + * this many times before giving up. E.g., 600 retries every + * 1 second results in retrying for about 10 minutes. + * This allows time to start federates before the RTI. */ -#define CONNECT_NUM_RETRIES 500 +#define CONNECT_MAX_RETRIES 600 /** - * Number of nanoseconds that a federate waits before asking + * Time that a federate waits before asking * the RTI again for the port and IP address of a federate * (an MSG_TYPE_ADDRESS_QUERY message) after the RTI responds that it - * does not know. + * does not know. This allows time for federates to start separately. */ -#define ADDRESS_QUERY_RETRY_INTERVAL 100000000LL +#define ADDRESS_QUERY_RETRY_INTERVAL SEC(1) /** - * Number of nanoseconds that a federate waits before trying - * another port for the RTI. This is to avoid overwhelming - * the OS and the socket with too many calls. - * FIXME: Is this too small? + * Time to wait before re-attempting to bind to a port. */ -#define PORT_KNOCKING_RETRY_INTERVAL 10000LL +#define PORT_BIND_RETRY_INTERVAL MSEC(10) /** - * Default starting port number for the RTI and federates' socket server. - * Unless a specific port has been specified by the LF program in the "at" - * for the RTI, when the federates start up, they will attempt - * to open a socket server - * on this port, and, if this fails, increment the port number and - * try again. The number of increments is limited by PORT_RANGE_LIMIT. - * FIXME: Clarify what happens if a specific port has been given in "at". + * Number of attempts to bind to a port before giving up. */ -#define STARTING_PORT 15045u +#define PORT_BIND_RETRY_LIMIT 100 /** - * Number of ports to try to connect to. 
Unless the LF program specifies - * a specific port number to use, the RTI or federates will attempt to start - * a socket server on port STARTING_PORT. If that port is not available (e.g., - * another RTI is running or has recently exited), then it will try the - * next port, STARTING_PORT+1, and keep incrementing the port number up to this - * limit. If no port between STARTING_PORT and STARTING_PORT + PORT_RANGE_LIMIT - * is available, then the RTI or the federate will fail to start. This number, therefore, - * limits the number of RTIs and federates that can be simultaneously - * running on any given machine without assigning specific port numbers. + * Default port number for the RTI. + * Unless a specific port has been specified by the LF program in the "at" + * for the RTI or on the command line, when the RTI starts up, it will attempt + * to open a socket server on this port. */ -#define PORT_RANGE_LIMIT 1024 +#define DEFAULT_PORT 15045u /** * Delay the start of all federates by this amount. - * FIXME: More. - * FIXME: Should use the latency estimates that were + * This helps ensure that the federates do not start at the same time. + * Each federate has provided its current physical time to the RTI, and + * the RTI has picked the largest of these. It will add this quantity + * and declare that to be the start time. + * FIXME: This could use the latency estimates that were * acquired during initial clock synchronization. */ #define DELAY_START SEC(1) diff --git a/include/core/federated/network/net_util.h b/include/core/federated/network/net_util.h index 5c6bcb966..651ffdda1 100644 --- a/include/core/federated/network/net_util.h +++ b/include/core/federated/network/net_util.h @@ -51,6 +51,9 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "../../platform.h" #include "../../tag.h" +#define NUM_SOCKET_RETRIES 10 +#define DELAY_BETWEEN_SOCKET_RETRIES MSEC(100) + #define HOST_LITTLE_ENDIAN 1 #define HOST_BIG_ENDIAN 2 @@ -62,6 +65,10 @@ int host_is_big_endian(void); #ifdef FEDERATED +/** + * Mutex protecting socket close operations. + */ +extern lf_mutex_t socket_mutex; /** * @brief Create an IPv4 TCP socket with Nagle's algorithm disabled diff --git a/include/core/reactor_common.h b/include/core/reactor_common.h index be74165b7..1ec8082b6 100644 --- a/include/core/reactor_common.h +++ b/include/core/reactor_common.h @@ -19,6 +19,9 @@ extern bool _lf_execution_started; extern bool keepalive_specified; extern interval_t _lf_fed_STA_offset; +/** Flag used to disable cleanup operations on normal termination. */ +extern bool _lf_normal_termination; + extern int default_argc; extern const char** default_argv; diff --git a/include/core/trace.h b/include/core/trace.h index 598c669bf..5946e16dc 100644 --- a/include/core/trace.h +++ b/include/core/trace.h @@ -435,6 +435,10 @@ void tracepoint_reaction_deadline_missed(trace_t* trace, reaction_t *reaction, i * close the files. */ void stop_trace(trace_t* trace); + +/** + * Version of stop_trace() that does not lock the trace mutex. + */ void stop_trace_locked(trace_t* trace); //////////////////////////////////////////////////////////// diff --git a/include/core/utils/util.h b/include/core/utils/util.h index d93f5de24..a41561e0a 100644 --- a/include/core/utils/util.h +++ b/include/core/utils/util.h @@ -35,6 +35,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include // Defines va_list #include +#include // Defines int64_t // To silence warnings about a function being a candidate for format checking // with gcc, add an attribute. @@ -50,10 +51,10 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* Holds generic statistical data */ typedef struct lf_stat_ll { - long long average; - long long standard_deviation; - long long variance; - long long max; + int64_t average; + int64_t standard_deviation; + int64_t variance; + int64_t max; } lf_stat_ll; /** @@ -238,6 +239,12 @@ void lf_vprint_warning(const char* format, va_list args) ATTRIBUTE_FORMAT_PRINTF */ void lf_print_error_and_exit(const char* format, ...) ATTRIBUTE_FORMAT_PRINTF(1, 2); +/** + * Report an error and exit just like lf_print_error_and_exit(), but + * also print the system error message associated with the error. + */ +void lf_print_error_system_failure(const char* format, ...); + /** * varargs alternative of "lf_print_error_and_exit" */ From e41eb636d118414701c2be5e99e41ac957c1f34b Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Wed, 20 Dec 2023 15:20:17 -0800 Subject: [PATCH 02/83] Use lf_print_error_system_failure --- core/federated/clock-sync.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/core/federated/clock-sync.c b/core/federated/clock-sync.c index e438d83ac..2e7240454 100644 --- a/core/federated/clock-sync.c +++ b/core/federated/clock-sync.c @@ -169,16 +169,14 @@ uint16_t setup_clock_synchronization_with_rti() { _lf_rti_socket_UDP, (struct sockaddr *) &federate_UDP_addr, sizeof(federate_UDP_addr)) < 0) { - lf_print_error_and_exit("Failed to bind its UDP socket: %s.", - strerror(errno)); + lf_print_error_system_failure("Failed to bind its UDP socket."); } // Retrieve the port number that was assigned by the operating system socklen_t addr_length = sizeof(federate_UDP_addr); if (getsockname(_lf_rti_socket_UDP, (struct sockaddr *)&federate_UDP_addr, &addr_length) == -1) { // FIXME: Send 0 UDP_PORT message instead of exiting. // That will disable clock synchronization. 
- lf_print_error_and_exit("Failed to retrieve UDP port: %s.", - strerror(errno)); + lf_print_error_system_failure("Failed to retrieve UDP port."); } LF_PRINT_DEBUG("Assigned UDP port number %u to its socket.", ntohs(federate_UDP_addr.sin_port)); From 028e15a2ee35b012b8e6eb84051ed13dfe3e418d Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Wed, 20 Dec 2023 15:25:05 -0800 Subject: [PATCH 03/83] Clean up federates Distinguish normal termination from interrupted termination and avoid mutexes in the latter and avoid lf_print in the former. Remove unnecessary absent messages using EIMT and EIMT_strict. Eliminate the bogus port search algorithm (which never worked) and just use DEFAULT_PORT or an override given on the command line or an `at` clause. Replace last_time field on a trigger with last_tag. Also, implement and use lf_print_error_system_failure to report system call errors. --- core/federated/federate.c | 611 +++++++++++++++--------------- core/reactor_common.c | 173 +++++---- core/threaded/reactor_threaded.c | 47 +-- core/utils/util.c | 18 +- include/core/federated/federate.h | 44 +-- include/core/lf_types.h | 2 +- include/core/reactor.h | 24 +- 7 files changed, 476 insertions(+), 443 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index c57553464..ca8bc1fc9 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -46,7 +46,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include // Defined perror(), errno #include -#include // Defines sigaction. #include #include #include @@ -78,6 +77,9 @@ char* ERROR_SENDING_MESSAGE = "ERROR sending message to federate via RTI"; // Mutex lock held while performing socket write and close operations. lf_mutex_t outbound_socket_mutex; + +// The following two condition variables are initialized in generated code and associated +// with the top-level environment's mutex.
lf_cond_t port_status_changed; lf_cond_t logical_time_changed; @@ -115,48 +117,10 @@ federation_metadata_t federation_metadata = { .rti_user = NULL }; - -/** - * Create a server to listen to incoming physical - * connections from remote federates. This function - * only handles the creation of the server socket. - * The reserved port for the server socket is then - * sent to the RTI by sending an MSG_TYPE_ADDRESS_ADVERTISEMENT message - * (@see net_common.h). This function expects no response - * from the RTI. - * - * If a port is specified by the user, that will be used - * as the only possibility for the server. This function - * will fail if that port is not available. If a port is not - * specified, the STARTING_PORT (@see net_common.h) will be used. - * The function will keep incrementing the port in this case - * until the number of tries reaches PORT_RANGE_LIMIT. - * - * @note This function is similar to create_server(...) in rti.c. - * However, it contains specific log messages for the peer to - * peer connections between federates. It also additionally - * sends an address advertisement (MSG_TYPE_ADDRESS_ADVERTISEMENT) message to the - * RTI informing it of the port. - * - * @param specified_port The specified port by the user. - */ void create_server(int specified_port) { - if (specified_port > UINT16_MAX || - specified_port < 0) { - lf_print_error( - "create_server(): The specified port (%d) is out of range." - " Starting with %d instead.", - specified_port, - STARTING_PORT - ); - specified_port = 0; - } + assert(specified_port <= UINT16_MAX && specified_port >= 0); uint16_t port = (uint16_t)specified_port; - if (specified_port == 0) { - // Use the default starting port. - port = STARTING_PORT; - } - LF_PRINT_DEBUG("Creating a socket server on port %d.", port); + LF_PRINT_LOG("Creating a socket server on port %d.", port); // Create an IPv4 socket for TCP (not UDP) communication over IP (0). 
int socket_descriptor = create_real_time_tcp_socket_errexit(); @@ -174,38 +138,37 @@ void create_server(int specified_port) { socket_descriptor, (struct sockaddr *) &server_fd, sizeof(server_fd)); - // If the binding fails with this port and no particular port was specified - // in the LF program, then try the next few ports in sequence. - while (result != 0 - && specified_port == 0 - && port >= STARTING_PORT - && port <= STARTING_PORT + PORT_RANGE_LIMIT) { - LF_PRINT_DEBUG("Failed to get port %d. Trying %d.", port, port + 1); - port++; - server_fd.sin_port = htons(port); + int count = 0; + while (result < 0 && count++ < PORT_BIND_RETRY_LIMIT) { + lf_sleep(PORT_BIND_RETRY_INTERVAL); result = bind( socket_descriptor, (struct sockaddr *) &server_fd, sizeof(server_fd)); } - if (result != 0) { - if (specified_port == 0) { - lf_print_error_and_exit("Failed to bind socket. Cannot find a usable port. \ - Consider increasing PORT_RANGE_LIMIT in federate.c"); - } else { - lf_print_error_and_exit("Failed to bind socket. Specified port is not available. \ - Consider leaving the port unspecified"); + if (result < 0) { + lf_print_error_and_exit("Failed to bind socket on port %d.", port); + } + + // Set the global server port. + if (specified_port == 0) { + // Need to retrieve the port number assigned by the OS. + struct sockaddr_in assigned; + socklen_t addr_len = sizeof(assigned); + if (getsockname(socket_descriptor, (struct sockaddr *) &assigned, &addr_len) < 0) { + lf_print_error_and_exit("Failed to retrieve assigned port number."); } + _fed.server_port = ntohs(assigned.sin_port); + } else { + _fed.server_port = port; } - LF_PRINT_LOG("Server for communicating with other federates started using port %d.", port); // Enable listening for socket connections. // The second argument is the maximum number of queued socket requests, // which according to the Mac man page is limited to 128. 
listen(socket_descriptor, 128); - // Set the global server port - _fed.server_port = port; + LF_PRINT_LOG("Server for communicating with other federates started using port %d.", _fed.server_port); // Send the server port number to the RTI // on an MSG_TYPE_ADDRESS_ADVERTISEMENT message (@see net_common.h). @@ -526,12 +489,9 @@ void _lf_send_tag(unsigned char type, tag_t tag, bool exit_on_error) { return; } else { lf_mutex_unlock(&outbound_socket_mutex); - lf_print_error_and_exit("Failed to send tag " PRINTF_TAG " to the RTI." - " Error code %d: %s", + lf_print_error_system_failure("Failed to send tag " PRINTF_TAG " to the RTI.", tag.time - start_time, - tag.microstep, - errno, - strerror(errno) + tag.microstep ); } } @@ -545,7 +505,6 @@ void _lf_send_tag(unsigned char type, tag_t tag, bool exit_on_error) { * sockets, exits. * @param env_arg pointer to the environment of this federate. */ - void* handle_p2p_connections_from_federates(void* env_arg) { assert(env_arg); environment_t* env = (environment_t *) env_arg; @@ -611,10 +570,10 @@ void* handle_p2p_connections_from_federates(void* env_arg) { tracepoint_federate_to_federate(_fed.trace, receive_FED_ID, _lf_my_fed_id, remote_fed_id, NULL); // Once we record the socket_id here, all future calls to close() on - // the socket should be done while holding a mutex, and this array + // the socket should be done while holding the socket_mutex, and this array // element should be reset to -1 during that critical section. // Otherwise, there can be race condition where, during termination, - // two threads attempt to simultaneously access the socket. + // two threads attempt to simultaneously close the socket. _fed.sockets_for_inbound_p2p_connections[remote_fed_id] = socket_id; // Send an MSG_TYPE_ACK message. @@ -634,8 +593,12 @@ void* handle_p2p_connections_from_federates(void* env_arg) { fed_id_arg); if (result != 0) { // Failed to create a listening thread. 
- close(socket_id); - _fed.sockets_for_inbound_p2p_connections[remote_fed_id] = -1; + lf_mutex_lock(&socket_mutex); + if (_fed.sockets_for_inbound_p2p_connections[remote_fed_id] != -1) { + close(socket_id); + _fed.sockets_for_inbound_p2p_connections[remote_fed_id] = -1; + } + lf_mutex_unlock(&socket_mutex); lf_print_error_and_exit( "Failed to create a thread to listen for incoming physical connection. Error code: %d.", result @@ -704,12 +667,10 @@ void* listen_for_upstream_messages_from_downstream_federates(void* fed_id_ptr) { LF_PRINT_DEBUG("Received EOF from federate %d.", fed_id); _lf_close_outbound_socket(fed_id); break; - } - if (bytes_read < 0) { - // EOF. - LF_PRINT_DEBUG("Error on socket from federate %d.", fed_id); + } else if (bytes_read < 0) { + // Error. _lf_close_outbound_socket(fed_id); - break; + lf_print_error_system_failure("Error on socket from federate %d.", fed_id); } } lf_mutex_unlock(&outbound_socket_mutex); @@ -767,7 +728,7 @@ void connect_to_federate(uint16_t remote_federate_id) { // remote federate has not yet sent an MSG_TYPE_ADDRESS_ADVERTISEMENT message to the RTI. // Sleep for some time before retrying. if (port == -1) { - if (count_tries++ >= CONNECT_NUM_RETRIES) { + if (count_tries++ >= CONNECT_MAX_RETRIES) { lf_print_error_and_exit("TIMEOUT obtaining IP/port for federate %d from the RTI.", remote_federate_id); } @@ -794,7 +755,7 @@ void connect_to_federate(uint16_t remote_federate_id) { #endif // Iterate until we either successfully connect or exceed the number of - // attempts given by CONNECT_NUM_RETRIES. + // attempts given by CONNECT_MAX_RETRIES. int socket_id = -1; while (result < 0) { // Create an IPv4 socket for TCP (not UDP) communication over IP (0). @@ -824,11 +785,11 @@ void connect_to_federate(uint16_t remote_federate_id) { // accepting socket connections. But possibly it will be busy (in process of accepting // another socket connection?). Hence, we retry. 
count_retries++; - if (count_retries > CONNECT_NUM_RETRIES) { - // If the remote federate is not accepting the connection after CONNECT_NUM_RETRIES + if (count_retries > CONNECT_MAX_RETRIES) { + // If the remote federate is not accepting the connection after CONNECT_MAX_RETRIES // treat it as a soft error condition and return. lf_print_error("Failed to connect to federate %d after %d retries. Giving up.", - remote_federate_id, CONNECT_NUM_RETRIES); + remote_federate_id, CONNECT_MAX_RETRIES); return; } lf_print_warning("Could not connect to federate %d. Will try again every %lld nanoseconds.\n", @@ -885,7 +846,7 @@ void connect_to_federate(uint16_t remote_federate_id) { // this downstream federate. uint16_t* remote_fed_id_copy = (uint16_t*)malloc(sizeof(uint16_t)); if (remote_fed_id_copy == NULL) { - lf_print_error_and_exit("malloc failed."); + lf_print_error_system_failure("malloc failed."); } *remote_fed_id_copy = remote_federate_id; lf_thread_t thread_id; @@ -969,18 +930,10 @@ void perform_hmac_authentication(int rti_socket) { } #endif -/** - * Connect to the RTI at the specified host and port and return - * the socket descriptor for the connection. If this fails, the - * program exits. If it succeeds, it sets the _fed.socket_TCP_RTI global - * variable to refer to the socket for communicating with the RTI. - * @param hostname A hostname, such as "localhost". - * @param port_number A port number. - */ void connect_to_rti(const char* hostname, int port) { LF_PRINT_LOG("Connecting to the RTI."); - // override passed hostname and port if passed as runtime arguments + // Override passed hostname and port if passed as runtime arguments. hostname = federation_metadata.rti_host ? federation_metadata.rti_host : hostname; port = federation_metadata.rti_port >= 0 ? 
federation_metadata.rti_port : port; @@ -989,25 +942,16 @@ void connect_to_rti(const char* hostname, int port) { port > INT16_MAX) { lf_print_error( "connect_to_rti(): Specified port (%d) is out of range," - " using zero instead.", - port + " using the default port %d instead.", + port, DEFAULT_PORT ); + uport = DEFAULT_PORT; } else { uport = (uint16_t)port; } - - // Repeatedly try to connect, one attempt every 2 seconds, until - // either the program is killed, the sleep is interrupted, - // or the connection succeeds. - // If the specified port is 0, set it instead to the start of the - // port range. - bool specific_port_given = true; if (uport == 0) { - uport = STARTING_PORT; - specific_port_given = false; + uport = DEFAULT_PORT; } - int result = -1; - int count_retries = 0; struct addrinfo hints; struct addrinfo *res; @@ -1020,144 +964,114 @@ void connect_to_rti(const char* hostname, int port) { hints.ai_next = NULL; hints.ai_flags = AI_NUMERICSERV; /* Allow only numeric port numbers */ - while (result < 0) { - // Convert port number to string - char str[6]; - sprintf(str,"%u",uport); - - // Get address structure matching hostname and hints criteria, and - // set port to the port number provided in str. There should only - // ever be one matching address structure, and we connect to that. - int server = getaddrinfo(hostname, (const char*)&str, &hints, &res); - if (server != 0) { - lf_print_error_and_exit("No host for RTI matching given hostname: %s", hostname); - } + // Convert port number to string + char str[6]; + sprintf(str,"%u",uport); - // Create a socket - _fed.socket_TCP_RTI = create_real_time_tcp_socket_errexit(); + // Get address structure matching hostname and hints criteria, and + // set port to the port number provided in str. There should only + // ever be one matching address structure, and we connect to that. 
+ int server = getaddrinfo(hostname, (const char*)&str, &hints, &res); + if (server != 0) { + lf_print_error_and_exit("No host for RTI matching given hostname: %s", hostname); + } + + // Create a socket + _fed.socket_TCP_RTI = create_real_time_tcp_socket_errexit(); + int result = connect(_fed.socket_TCP_RTI, res->ai_addr, res->ai_addrlen);; + int count_retries = 1; + + while (result < 0 && count_retries++ < CONNECT_MAX_RETRIES) { + lf_print("Failed to connect to RTI on port %d. Will try again.", uport); + lf_sleep(CONNECT_RETRY_INTERVAL); result = connect(_fed.socket_TCP_RTI, res->ai_addr, res->ai_addrlen); - if (result == 0) { - lf_print("Successfully connected to RTI."); - } + } + freeaddrinfo(res); /* No longer needed */ - freeaddrinfo(res); /* No longer needed */ + if (result != 0) { + lf_print_error_and_exit("Failed to connect to RTI on port %d after %d tries.", uport, CONNECT_MAX_RETRIES); + } + lf_print("Successfully connected to an RTI."); - // If this failed, try more ports, unless a specific port was given. - if (result != 0 - && !specific_port_given - && uport >= STARTING_PORT - && uport <= STARTING_PORT + PORT_RANGE_LIMIT - ) { - lf_print("Failed to connect to RTI on port %d. Trying %d.", uport, uport + 1); - uport++; - // Wait PORT_KNOCKING_RETRY_INTERVAL seconds. - if (lf_sleep(PORT_KNOCKING_RETRY_INTERVAL) != 0) { - // Sleep was interrupted. - continue; - } - } - // If this still failed, try again with the original port after some time. - if (result < 0) { - if (!specific_port_given && uport == STARTING_PORT + PORT_RANGE_LIMIT + 1) { - uport = STARTING_PORT; - } - count_retries++; - if (count_retries > CONNECT_NUM_RETRIES) { - lf_print_error_and_exit("Failed to connect to the RTI after %d retries. Giving up.", - CONNECT_NUM_RETRIES); - } - lf_print("Could not connect to RTI at %s. Will try again every %lld seconds.", - hostname, CONNECT_RETRY_INTERVAL / BILLION); - // Wait CONNECT_RETRY_INTERVAL nanoseconds. 
- if (lf_sleep(CONNECT_RETRY_INTERVAL) != 0) { - // Sleep was interrupted. - continue; - } - } else { - // Have connected to an RTI, but not sure it's the right RTI. - // Send a MSG_TYPE_FED_IDS message and wait for a reply. - // Notify the RTI of the ID of this federate and its federation. - unsigned char buffer[4]; + // Have connected to an RTI, but not sure it's the right RTI. + // Send a MSG_TYPE_FED_IDS message and wait for a reply. + // Notify the RTI of the ID of this federate and its federation. + unsigned char buffer[4]; #ifdef FEDERATED_AUTHENTICATED - LF_PRINT_LOG("Connected to an RTI. Performing HMAC-based authentication using federation ID."); - perform_hmac_authentication(_fed.socket_TCP_RTI); + LF_PRINT_LOG("Connected to an RTI. Performing HMAC-based authentication using federation ID."); + perform_hmac_authentication(_fed.socket_TCP_RTI); #else - LF_PRINT_LOG("Connected to an RTI. Sending federation ID for authentication."); + LF_PRINT_LOG("Connected to an RTI. Sending federation ID for authentication."); #endif - // Send the message type first. - buffer[0] = MSG_TYPE_FED_IDS; - // Next send the federate ID. - if (_lf_my_fed_id > UINT16_MAX) { - lf_print_error_and_exit("Too many federates! More than %d.", UINT16_MAX); - } - encode_uint16((uint16_t)_lf_my_fed_id, &buffer[1]); - // Next send the federation ID length. - // The federation ID is limited to 255 bytes. - size_t federation_id_length = strnlen(federation_metadata.federation_id, 255); - buffer[1 + sizeof(uint16_t)] = (unsigned char)(federation_id_length & 0xff); + // Send the message type first. + buffer[0] = MSG_TYPE_FED_IDS; + // Next send the federate ID. + if (_lf_my_fed_id > UINT16_MAX) { + lf_print_error_and_exit("Too many federates! More than %d.", UINT16_MAX); + } + encode_uint16((uint16_t)_lf_my_fed_id, &buffer[1]); + // Next send the federation ID length. + // The federation ID is limited to 255 bytes. 
+ size_t federation_id_length = strnlen(federation_metadata.federation_id, 255); + buffer[1 + sizeof(uint16_t)] = (unsigned char)(federation_id_length & 0xff); - // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_FED_ID, _lf_my_fed_id, NULL); + // Trace the event when tracing is enabled + tracepoint_federate_to_rti(_fed.trace, send_FED_ID, _lf_my_fed_id, NULL); - write_to_socket_errexit(_fed.socket_TCP_RTI, 2 + sizeof(uint16_t), buffer, - "Failed to send federate ID to RTI."); + write_to_socket_errexit(_fed.socket_TCP_RTI, 2 + sizeof(uint16_t), buffer, + "Failed to send federate ID to RTI."); - // Next send the federation ID itself. - write_to_socket_errexit(_fed.socket_TCP_RTI, federation_id_length, (unsigned char*)federation_metadata.federation_id, - "Failed to send federation ID to RTI."); + // Next send the federation ID itself. + write_to_socket_errexit(_fed.socket_TCP_RTI, federation_id_length, (unsigned char*)federation_metadata.federation_id, + "Failed to send federation ID to RTI."); - // Wait for a response. - // The response will be MSG_TYPE_REJECT if the federation ID doesn't match. - // Otherwise, it will be either MSG_TYPE_ACK or MSG_TYPE_UDP_PORT, where the latter - // is used if clock synchronization will be performed. - unsigned char response; + // Wait for a response. + // The response will be MSG_TYPE_REJECT if the federation ID doesn't match. + // Otherwise, it will be either MSG_TYPE_ACK or MSG_TYPE_UDP_PORT, where the latter + // is used if clock synchronization will be performed. 
+ unsigned char response; - LF_PRINT_DEBUG("Waiting for response to federation ID from the RTI."); + LF_PRINT_DEBUG("Waiting for response to federation ID from the RTI."); - read_from_socket_errexit(_fed.socket_TCP_RTI, 1, &response, "Failed to read response from RTI."); - if (response == MSG_TYPE_REJECT) { - // Trace the event when tracing is enabled - tracepoint_federate_from_rti(_fed.trace, receive_REJECT, _lf_my_fed_id, NULL); - // Read one more byte to determine the cause of rejection. - unsigned char cause; - read_from_socket_errexit(_fed.socket_TCP_RTI, 1, &cause, "Failed to read the cause of rejection by the RTI."); - if (cause == FEDERATION_ID_DOES_NOT_MATCH || cause == WRONG_SERVER) { - lf_print("Connected to the wrong RTI on port %d. Trying %d.", uport, uport + 1); - uport++; - result = -1; - continue; - } - lf_print_error_and_exit("RTI Rejected MSG_TYPE_FED_IDS message with response (see net_common.h): " - "%d. Error code: %d. Federate quits.\n", response, cause); - } else if (response == MSG_TYPE_ACK) { - // Trace the event when tracing is enabled - tracepoint_federate_from_rti(_fed.trace, receive_ACK, _lf_my_fed_id, NULL); - LF_PRINT_LOG("Received acknowledgment from the RTI."); - - // Call a generated (external) function that sends information - // about connections between this federate and other federates - // where messages are routed through the RTI. 
- // @see MSG_TYPE_NEIGHBOR_STRUCTURE in net_common.h - send_neighbor_structure_to_RTI(_fed.socket_TCP_RTI); - - uint16_t udp_port = setup_clock_synchronization_with_rti(); - - // Write the returned port number to the RTI - unsigned char UDP_port_number[1 + sizeof(uint16_t)]; - UDP_port_number[0] = MSG_TYPE_UDP_PORT; - encode_uint16(udp_port, &(UDP_port_number[1])); - write_to_socket_errexit(_fed.socket_TCP_RTI, 1 + sizeof(uint16_t), UDP_port_number, - "Failed to send the UDP port number to the RTI."); - } else { - lf_print_error_and_exit("Received unexpected response %u from the RTI (see net_common.h).", - response); - } - lf_print("Connected to RTI at %s:%d.", hostname, uport); + read_from_socket_errexit(_fed.socket_TCP_RTI, 1, &response, "Failed to read response from RTI."); + if (response == MSG_TYPE_REJECT) { + // Trace the event when tracing is enabled + tracepoint_federate_from_rti(_fed.trace, receive_REJECT, _lf_my_fed_id, NULL); + // Read one more byte to determine the cause of rejection. + unsigned char cause; + read_from_socket_errexit(_fed.socket_TCP_RTI, 1, &cause, "Failed to read the cause of rejection by the RTI."); + if (cause == FEDERATION_ID_DOES_NOT_MATCH || cause == WRONG_SERVER) { + lf_print_error_and_exit("Connected to the wrong RTI on port %d.", uport); } + lf_print_error_and_exit("RTI Rejected MSG_TYPE_FED_IDS message with response (see net_common.h): " + "%d. Error code: %d. Federate quits.\n", response, cause); + } else if (response == MSG_TYPE_ACK) { + // Trace the event when tracing is enabled + tracepoint_federate_from_rti(_fed.trace, receive_ACK, _lf_my_fed_id, NULL); + LF_PRINT_LOG("Received acknowledgment from the RTI."); + + // Call a generated (external) function that sends information + // about connections between this federate and other federates + // where messages are routed through the RTI. 
+ // @see MSG_TYPE_NEIGHBOR_STRUCTURE in net_common.h + send_neighbor_structure_to_RTI(_fed.socket_TCP_RTI); + + uint16_t udp_port = setup_clock_synchronization_with_rti(); + + // Write the returned port number to the RTI + unsigned char UDP_port_number[1 + sizeof(uint16_t)]; + UDP_port_number[0] = MSG_TYPE_UDP_PORT; + encode_uint16(udp_port, &(UDP_port_number[1])); + write_to_socket_errexit(_fed.socket_TCP_RTI, 1 + sizeof(uint16_t), UDP_port_number, + "Failed to send the UDP port number to the RTI."); + } else { + lf_print_error_and_exit("Received unexpected response %u from the RTI (see net_common.h).", + response); } + lf_print("Connected to RTI at %s:%d.", hostname, uport); } /** @@ -1204,12 +1118,12 @@ instant_t get_start_time_from_rti(instant_t my_physical_time) { extern lf_action_base_t* _lf_action_table[]; extern interval_t _lf_action_delay_table[]; extern size_t _lf_action_table_size; -extern lf_action_base_t* _lf_zero_delay_action_table[]; -extern size_t _lf_zero_delay_action_table_size; +extern lf_action_base_t* _lf_zero_delay_cycle_action_table[]; +extern size_t _lf_zero_delay_cycle_action_table_size; extern reaction_t* network_input_reactions[]; extern size_t num_network_input_reactions; extern reaction_t* port_absent_reaction[]; -extern size_t num_sender_reactions; +extern size_t num_port_absent_reactions; #ifdef FEDERATED_DECENTRALIZED extern staa_t* staa_lst[]; extern size_t staa_lst_size; @@ -1361,11 +1275,11 @@ void enqueue_port_absent_reactions(environment_t* env){ } #endif LF_PRINT_DEBUG("Enqueueing port absent reactions at time %lld.", (long long) (env->current_tag.time - start_time)); - if (num_sender_reactions == 0) { + if (num_port_absent_reactions == 0) { LF_PRINT_DEBUG("No port absent reactions."); return; } - for (int i = 0; i < num_sender_reactions; i++) { + for (int i = 0; i < num_port_absent_reactions; i++) { reaction_t* reaction = port_absent_reaction[i]; if (reaction && reaction->status == inactive) { LF_PRINT_DEBUG("Inserting port 
absent reaction on reaction queue."); @@ -1471,7 +1385,8 @@ static trigger_handle_t schedule_message_received_from_network_locked( // federate. By default, assume it is not. bool message_tag_is_in_the_future = lf_tag_compare(tag, env->current_tag) > 0; - // Assign the intended tag + // Assign the intended tag temporarily to restore later. + tag_t previous_intended_tag = trigger->intended_tag; trigger->intended_tag = tag; // Calculate the extra_delay required to be passed @@ -1503,6 +1418,7 @@ static trigger_handle_t schedule_message_received_from_network_locked( "in the future.", extra_delay, tag.microstep - env->current_tag.microstep); return_value = _lf_schedule_at_tag(env, trigger, tag, token); } + trigger->intended_tag = previous_intended_tag; // Notify the main thread in case it is waiting for physical time to elapse. LF_PRINT_DEBUG("Broadcasting notification that event queue changed."); lf_cond_broadcast(&env->event_q_changed); @@ -1515,27 +1431,31 @@ static trigger_handle_t schedule_message_received_from_network_locked( * requesting that it close the socket. If the message is sent successfully, * this returns 1. Otherwise it returns 0, which presumably means that the * socket is already closed. + * + * This function assumes that the caller holds the socket_mutex lock. * * @param The ID of the peer federate sending messages to this federate. * * @return 1 if the MSG_TYPE_CLOSE_REQUEST message is sent successfully, 0 otherwise. */ -int _lf_request_close_inbound_socket(int fed_id) { +static int _lf_request_close_inbound_socket(int fed_id) { assert(fed_id >= 0 && fed_id < NUMBER_OF_FEDERATES); - if (_fed.sockets_for_inbound_p2p_connections[fed_id] < 1) return 0; + if (_fed.sockets_for_inbound_p2p_connections[fed_id] < 0) { + return 0; + } // Send a MSG_TYPE_CLOSE_REQUEST message. 
unsigned char message_marker = MSG_TYPE_CLOSE_REQUEST; - LF_PRINT_LOG("Sending MSG_TYPE_CLOSE_REQUEST message to upstream federate."); - - // Trace the event when tracing is enabled - tracepoint_federate_to_federate(_fed.trace, send_CLOSE_RQ, _lf_my_fed_id, fed_id, NULL); ssize_t written = write_to_socket( _fed.sockets_for_inbound_p2p_connections[fed_id], 1, &message_marker); _fed.sockets_for_inbound_p2p_connections[fed_id] = -1; + + // Trace the event when tracing is enabled + tracepoint_federate_to_federate(_fed.trace, send_CLOSE_RQ, _lf_my_fed_id, fed_id, NULL); + if (written == 1) { LF_PRINT_LOG("Sent MSG_TYPE_CLOSE_REQUEST message to upstream federate."); return 1; @@ -1553,6 +1473,7 @@ int _lf_request_close_inbound_socket(int fed_id) { * federate, or -1 if the RTI. */ void _lf_close_inbound_socket(int fed_id) { + lf_mutex_lock(&socket_mutex); if (fed_id < 0) { // socket connection is to the RTI. int socket = _fed.socket_TCP_RTI; @@ -1562,10 +1483,13 @@ void _lf_close_inbound_socket(int fed_id) { shutdown(socket, SHUT_RDWR); close(socket); } else if (_fed.sockets_for_inbound_p2p_connections[fed_id] >= 0) { - shutdown(_fed.sockets_for_inbound_p2p_connections[fed_id], SHUT_RDWR); - close(_fed.sockets_for_inbound_p2p_connections[fed_id]); - _fed.sockets_for_inbound_p2p_connections[fed_id] = -1; + if (_fed.sockets_for_inbound_p2p_connections[fed_id] >= 0) { + shutdown(_fed.sockets_for_inbound_p2p_connections[fed_id], SHUT_RDWR); + close(_fed.sockets_for_inbound_p2p_connections[fed_id]); + _fed.sockets_for_inbound_p2p_connections[fed_id] = -1; + } } + lf_mutex_unlock(&socket_mutex); } /** @@ -1674,7 +1598,7 @@ void stall_advance_level_federation(environment_t* env, size_t level) { } /** - * Handle a timed message being received from a remote federate via the RTI + * Handle a tagged message being received from a remote federate via the RTI * or directly from other federates. 
* This will read the tag encoded in the header * and calculate an offset to pass to the schedule function. @@ -1726,7 +1650,7 @@ void handle_tagged_message(int socket, int fed_id) { if (action->trigger->is_physical) { // Messages sent on physical connections should be handled via handle_message(). - lf_print_error_and_exit("Received a timed message on a physical connection."); + lf_print_error_and_exit("Received a tagged message on a physical connection."); } #ifdef FEDERATED_DECENTRALIZED @@ -1760,9 +1684,6 @@ void handle_tagged_message(int socket, int fed_id) { // Create a token for the message lf_token_t* message_token = _lf_new_token((token_type_t*)action, message_contents, length); - // FIXME: It might be enough to just check this field and not the status at all - update_last_known_status_on_input_port(intended_tag, port_id); - // Check whether reactions need to be inserted directly into the reaction // queue or a call to schedule is needed. This checks if the intended // tag of the message is for the current tag or a tag that is already @@ -1781,13 +1702,23 @@ void handle_tagged_message(int socket, int fed_id) { // to exit. The port status is on the other hand changed in this thread, and thus, // can be checked in this scenario without this race condition. The message with // intended_tag of 9 in this case needs to wait one microstep to be processed. - if (lf_tag_compare(intended_tag, lf_tag(env)) <= 0 && // The event is meant for the current or a previous tag. - (action->trigger->status == unknown || // if the status of the port is still unknown. - _lf_execution_started == false) // Or, execution hasn't even started, so it's safe to handle this event. + if (lf_tag_compare(intended_tag, lf_tag(env)) == 0 // The event is meant for the current tag. +#if defined FEDERATED_DECENTRALIZED + // Not sure why this test is only needed for decentralized coordination. 
+ && _lf_execution_started +#endif // FEDERATED_DECENTRALIZED + // Check that MLAA is blocking at the right level. Otherwise, data can be lost. + && action->trigger->reactions[0]->index >= max_level_allowed_to_advance + && !action->trigger->is_physical + && lf_tag_compare(intended_tag, action->trigger->last_tag) > 0 // Not already enabled at the current tag. + && lf_tag_compare(intended_tag, action->trigger->last_known_status_tag) > 0 ) { // Since the message is intended for the current tag and a port absent reaction // was waiting for the message, trigger the corresponding reactions for this // message. + + update_last_known_status_on_input_port(intended_tag, port_id); + LF_PRINT_LOG( "Inserting reactions directly at tag " PRINTF_TAG ". " "Intended tag: " PRINTF_TAG ".", @@ -1812,6 +1743,8 @@ void handle_tagged_message(int socket, int fed_id) { // If no port absent reaction is waiting for this message, or if the intended // tag is in the future, use schedule functions to process the message. + update_last_known_status_on_input_port(intended_tag, port_id); + // Before that, if the current time >= stop time, discard the message. // But only if the stop time is not equal to the start time! 
if (lf_tag_compare(env->current_tag, env->stop_tag) >= 0) { @@ -1823,7 +1756,6 @@ void handle_tagged_message(int socket, int fed_id) { goto release; } - LF_PRINT_LOG("Calling schedule with tag " PRINTF_TAG ".", intended_tag.time - start_time, intended_tag.microstep); schedule_message_received_from_network_locked(env, action->trigger, intended_tag, message_token); } @@ -1937,8 +1869,8 @@ bool update_max_level(tag_t tag, bool is_provisional) { size_t action_table_size = _lf_action_table_size; lf_action_base_t** action_table = _lf_action_table; #else - size_t action_table_size = _lf_zero_delay_action_table_size; - lf_action_base_t** action_table = _lf_zero_delay_action_table; + size_t action_table_size = _lf_zero_delay_cycle_action_table_size; + lf_action_base_t** action_table = _lf_zero_delay_cycle_action_table; #endif // FEDERATED_DECENTRALIZED for (int i = 0; i < action_table_size; i++) { lf_action_base_t* input_port_action = action_table[i]; @@ -1957,6 +1889,12 @@ bool update_max_level(tag_t tag, bool is_provisional) { continue; } #endif // FEDERATED_DECENTRALIZED + // If the current tag is greater than the last known status tag of the input port, + // and the input port is not physical, then block on that port by ensuring + // the MLAA is no greater than the level of that port. + // For centralized coordination, this is applied only to input ports coming from + // federates that are in a ZDC. For decentralized coordination, this is applied + // to all input ports. 
if (lf_tag_compare(env->current_tag, input_port_action->trigger->last_known_status_tag) > 0 && !input_port_action->trigger->is_physical) { @@ -1981,7 +1919,7 @@ bool update_max_level(tag_t tag, bool is_provisional) { */ static bool a_port_is_unknown(staa_t* staa_elem) { bool do_wait = false; - for (int j = 0; j < staa_elem->numActions; ++j) { + for (int j = 0; j < staa_elem->num_actions; ++j) { if (staa_elem->actions[j]->trigger->status == unknown) { do_wait = true; break; @@ -2002,57 +1940,89 @@ static int id_of_action(lf_action_base_t* input_port_action) { } /** - * @brief Given a list of staa offsets and its associated triggers, - * have a single thread work to set ports to absent at a given logical time - * + * @brief Thread handling setting the known absent status of input ports. + * For the code-generated array of staa offsets `staa_lst`, which is sorted by STAA offset, + * wait for physical time to advance to the current time plus the STAA offset, + * then set the absent status of the input ports associated with the STAA. + * Then wait for current time to advance and start over. */ #ifdef FEDERATED_DECENTRALIZED static void* update_ports_from_staa_offsets(void* args) { + if (staa_lst_size == 0) return NULL; // Nothing to do. + // NOTE: Using only the top-level environment, which is the one that deals with network + // input ports. 
environment_t *env; int num_envs = _lf_get_environments(&env); + lf_mutex_lock(&env->mutex); while (1) { bool restart = false; tag_t tag_when_started_waiting = lf_tag(env); for (int i = 0; i < staa_lst_size; ++i) { staa_t* staa_elem = staa_lst[i]; - interval_t wait_until_time = env->current_tag.time + staa_elem->STAA + _lf_fed_STA_offset - _lf_action_delay_table[i]; - lf_mutex_lock(&env->mutex); - // Both before and after the wait, check that the tag has not changed - if (a_port_is_unknown(staa_elem) && lf_tag_compare(lf_tag(env), tag_when_started_waiting) == 0 && wait_until(env, wait_until_time, &port_status_changed) && lf_tag_compare(lf_tag(env), tag_when_started_waiting) == 0) { - for (int j = 0; j < staa_elem->numActions; ++j) { - lf_action_base_t* input_port_action = staa_elem->actions[j]; - if (input_port_action->trigger->status == unknown) { - input_port_action->trigger->status = absent; - LF_PRINT_DEBUG("Assuming port absent at time %lld.", (long long) (lf_tag(env).time - start_time)); - update_last_known_status_on_input_port(lf_tag(env), id_of_action(input_port_action)); - update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); - lf_cond_broadcast(&port_status_changed); + // The staa_elem is adjusted in the code generator to have subtracted the delay on the connection. + // The list is sorted in increasing order of adjusted STAA offsets. + // The wait_until function automatically adds the _lf_fed_STA_offset to the wait time. + interval_t wait_until_time = env->current_tag.time + staa_elem->STAA; + // The wait_until call will release the env->mutex while it is waiting. + // However, it will not release the env->mutex if the wait time is too small. + // At the cost of a small additional delay in deciding a port is absent, + // we require a minimum wait time here. Otherwise, if both the STAA and STA are + // zero, this thread will fail to ever release the environment mutex. + // This causes chaos. 
The MIN_SLEEP_DURATION is the smallest amount of time + // that wait_until will actually wait. Note that this strategy does not + // block progress of any execution that is actually processing events. + // It only slightly delays the decision that an event is absent, and only + // if the STAA and STA are extremely small. + if (_lf_fed_STA_offset + staa_elem->STAA < 5 * MIN_SLEEP_DURATION) { + wait_until_time += 5 * MIN_SLEEP_DURATION; + } + while (a_port_is_unknown(staa_elem)) { + if (wait_until(env, wait_until_time, &port_status_changed)) { + if (lf_tag_compare(lf_tag(env), tag_when_started_waiting) != 0) { + // Wait was not interrupted and we have committed to a new tag before we + // finished processing the list. Start over. + restart = true; + break; + } + /* Possibly useful for debugging: + tag_t current_tag = lf_tag(env); + lf_print("--------------------- FIXME: assuming absent! " PRINTF_TAG, current_tag.time - lf_time_start(), current_tag.microstep); + lf_print("--------------------- Lag is " PRINTF_TIME, current_tag.time - lf_time_physical()); + lf_print("--------------------- Wait until time is " PRINTF_TIME, wait_until_time - lf_time_start()); + */ + + // Wait went to completion. Mark any ports with this STAA that remain unknown as absent. + for (int j = 0; j < staa_elem->num_actions; ++j) { + lf_action_base_t* input_port_action = staa_elem->actions[j]; + if (input_port_action->trigger->status == unknown) { + input_port_action->trigger->status = absent; + LF_PRINT_DEBUG("Assuming port absent at time " PRINTF_TIME, lf_tag(env).time - start_time); + update_last_known_status_on_input_port(lf_tag(env), id_of_action(input_port_action)); + lf_cond_broadcast(&port_status_changed); + } } + } else if (lf_tag_compare(lf_tag(env), tag_when_started_waiting) != 0) { + // Wait was interrupted and we have committed to a new tag before we + // finished processing the list. Start over. 
+ restart = true; + break; } - lf_mutex_unlock(&env->mutex); - } else if (lf_tag_compare(lf_tag(env), tag_when_started_waiting) != 0) { - // We have committed to a new tag before we finish processing the list. Start over. - restart = true; - lf_mutex_unlock(&env->mutex); - break; - } else { - lf_mutex_unlock(&env->mutex); } + if (restart) break; // No need to check the rest of the STAAs. } - if (restart) continue; + if (restart) continue; // No need to wait for a new tag. - lf_mutex_lock(&env->mutex); + // Wait until we progress to a new tag. while (lf_tag_compare(lf_tag(env), tag_when_started_waiting) == 0) { + // The following will release the env->mutex while waiting. lf_cond_wait(&logical_time_changed); } - lf_mutex_unlock(&env->mutex); } } /** - * @brief Spawns a thread to iterate through STAA structs, setting its associated ports absent + * @brief Spawn a thread to iterate through STAA structs, setting their associated ports absent * at an offset if the port is not present with a value by a certain physical time. - * */ void spawn_staa_thread(){ lf_thread_create(&_fed.staaSetter, update_ports_from_staa_offsets, NULL); @@ -2337,6 +2307,23 @@ void handle_stop_request_message() { lf_mutex_unlock(&outbound_socket_mutex); } +/** + * Send a resign signal to the RTI. + */ +static void send_resign_signal(environment_t* env) { + size_t bytes_to_write = 1 + sizeof(tag_t); + unsigned char buffer[bytes_to_write]; + buffer[0] = MSG_TYPE_RESIGN; + tag_t tag = env->current_tag; + encode_tag(&(buffer[1]), tag); + ssize_t written = write_to_socket(_fed.socket_TCP_RTI, bytes_to_write, &(buffer[0])); + if (written == bytes_to_write) { + LF_PRINT_LOG("Resigned."); + } + // Trace the event when tracing is enabled + tracepoint_federate_to_rti(_fed.trace, send_RESIGN, _lf_my_fed_id, &tag); +} + /** * Close sockets used to communicate with other federates, if they are open, * and send a MSG_TYPE_RESIGN message to the RTI. 
This implements the function @@ -2347,52 +2334,49 @@ void handle_stop_request_message() { void terminate_execution(environment_t* env) { assert(env != GLOBAL_ENVIRONMENT); + // For an abnormal termination (e.g. a SIGINT), we need to send a + // MSG_TYPE_RESIGN message to the RTI, but we should not acquire a mutex. + if (_fed.socket_TCP_RTI >= 0) { + if (_lf_normal_termination) { + lf_mutex_lock(&outbound_socket_mutex); + send_resign_signal(env); + lf_mutex_unlock(&outbound_socket_mutex); + } else { + // Do not acquire mutex and do not trace. + send_resign_signal(env); + } + } + + LF_PRINT_DEBUG("Requesting closing of incoming P2P sockets."); + // Request closing the incoming P2P sockets. + for (int i=0; i < NUMBER_OF_FEDERATES; i++) { + _lf_request_close_inbound_socket(i); + // Ignore errors. Mark the socket closed. + _fed.sockets_for_inbound_p2p_connections[i] = -1; + } + + // For abnormal termination, skip the rest, letting the threads be terminated + // and sockets be closed by the OS. + if (!_lf_normal_termination) return; + // Check for all outgoing physical connections in // _fed.sockets_for_outbound_p2p_connections and // if the socket ID is not -1, the connection is still open. // Send an EOF by closing the socket here. - // NOTE: It is dangerous to acquire a mutex in a termination - // process because it can block program exit if a deadlock occurs. - // Hence, it is paramount that these mutexes not allow for any - // possibility of deadlock. To ensure this, this - // function should NEVER be called while holding any mutex lock. - lf_mutex_lock(&outbound_socket_mutex); for (int i=0; i < NUMBER_OF_FEDERATES; i++) { + // Close outbound connections, in case they have not closed themselves. // This will result in EOF being sent to the remote federate, I think. _lf_close_outbound_socket(i); } - // Resign the federation, which will close the socket to the RTI. 
- if (_fed.socket_TCP_RTI >= 0) { - size_t bytes_to_write = 1 + sizeof(tag_t); - unsigned char buffer[bytes_to_write]; - buffer[0] = MSG_TYPE_RESIGN; - tag_t tag = env->current_tag; - encode_tag(&(buffer[1]), tag); - // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_RESIGN, _lf_my_fed_id, &tag); - ssize_t written = write_to_socket(_fed.socket_TCP_RTI, bytes_to_write, &(buffer[0])); - if (written == bytes_to_write) { - LF_PRINT_LOG("Resigned."); - } - } - lf_mutex_unlock(&outbound_socket_mutex); - - LF_PRINT_DEBUG("Requesting closing of incoming P2P sockets."); - // Request closing the incoming P2P sockets. - for (int i=0; i < NUMBER_OF_FEDERATES; i++) { - if (_lf_request_close_inbound_socket(i) == 0) { - // Sending the close request failed. Mark the socket closed. - _fed.sockets_for_inbound_p2p_connections[i] = -1; - } - } LF_PRINT_DEBUG("Waiting for inbound p2p socket listener threads."); // Wait for each inbound socket listener thread to close. - if (_fed.number_of_inbound_p2p_connections > 0) { + if (_fed.number_of_inbound_p2p_connections > 0 && _fed.inbound_socket_listeners != NULL) { LF_PRINT_LOG("Waiting for %zu threads listening for incoming messages to exit.", _fed.number_of_inbound_p2p_connections); for (int i=0; i < _fed.number_of_inbound_p2p_connections; i++) { + if (_fed.inbound_socket_listeners[i] == NULL) continue; // Ignoring errors here. lf_thread_join(_fed.inbound_socket_listeners[i], NULL); } @@ -2790,9 +2774,10 @@ tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply } if (tag.time != FOREVER) { - // Create a dummy event that will force this federate to advance time and subsequently enable progress for - // downstream federates. - event_t* dummy = _lf_create_dummy_events(env, NULL, tag.time, NULL, 0); + // Create a dummy event that will force this federate to advance time and subsequently + // enable progress for downstream federates. 
Increment the time by ADVANCE_MESSAGE_INTERVAL + // to prevent too frequent dummy events. + event_t* dummy = _lf_create_dummy_events(env, NULL, tag.time + ADVANCE_MESSAGE_INTERVAL, NULL, 0); pqueue_insert(env->event_q, dummy); } @@ -2800,7 +2785,7 @@ tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply tag.time - lf_time_start()); if (!wait_for_reply) { - LF_PRINT_LOG("Not waiting physical time to advance further."); + LF_PRINT_LOG("Not waiting for physical time to advance further."); return tag; } diff --git a/core/reactor_common.c b/core/reactor_common.c index 6984d920b..db1d97146 100644 --- a/core/reactor_common.c +++ b/core/reactor_common.c @@ -116,6 +116,16 @@ bool keepalive_specified = false; */ interval_t _lf_fed_STA_offset = 0LL; +void _lf_print_event(void* event) { + if (event == NULL) { + printf("NULL"); + } else { + event_t* ev = (event_t*)event; + lf_print("Event: Time=" PRINTF_TIME ", dummy=%d, timer=%d", + ev->time - start_time, ev->is_dummy, ev->trigger->is_timer); + } +} + /** * Allocate memory using calloc (so the allocated memory is zeroed out) * and record the allocated memory on the specified self struct so that @@ -265,6 +275,11 @@ void _lf_trigger_reaction(environment_t* env, reaction_t* reaction, int worker_n * counts between time steps and at the end of execution. */ void _lf_start_time_step(environment_t *env) { + if (_lf_execution_started == false) { + // Execution hasn't started, so this is probably being invoked in termination + // due to an error. + return; + } assert(env != GLOBAL_ENVIRONMENT); LF_PRINT_LOG("--------- Start time step at tag " PRINTF_TAG ".", env->current_tag.time - start_time, env->current_tag.microstep); // Handle dynamically created tokens for mutable inputs. @@ -349,8 +364,8 @@ void _lf_pop_events(environment_t *env) { } #ifdef MODAL_REACTORS - // If this event is associated with an incative it should haven been suspended and no longer on the event queue. 
- // FIXME This should not be possible + // If this event is associated with an inactive mode it should have been suspended and should no longer be on the event queue. + // NOTE: This should not be possible if (!_lf_mode_is_active(event->trigger->mode)) { lf_print_warning("Assumption violated. There is an event on the event queue that is associated to an inactive mode."); } @@ -366,8 +381,8 @@ void _lf_pop_events(environment_t *env) { #ifdef FEDERATED_DECENTRALIZED // In federated execution, an intended tag that is not (NEVER, 0) // indicates that this particular event is triggered by a network message. - // The intended tag is set in handle_timed_message in federate.c whenever - // a timed message arrives from another federate. + // The intended tag is set in handle_tagged_message in federate.c whenever + // a tagged message arrives from another federate. if (event->intended_tag.time != NEVER) { // If the intended tag of the event is actually set, // transfer the intended tag to the trigger so that @@ -918,7 +933,8 @@ trigger_handle_t _lf_schedule(environment_t *env, trigger_t* trigger, interval_t if (!trigger->is_timer) { delay += trigger->offset; } - interval_t intended_time = env->current_tag.time + delay; + tag_t intended_tag = (tag_t){.time = env->current_tag.time + delay, .microstep = 0}; + LF_PRINT_DEBUG("_lf_schedule: env->current_tag.time = " PRINTF_TIME ". Total logical delay = " PRINTF_TIME "", env->current_tag.time, delay); interval_t min_spacing = trigger->period; @@ -940,7 +956,7 @@ trigger_handle_t _lf_schedule(environment_t *env, trigger_t* trigger, interval_t // modify the intended time. if (trigger->is_physical) { // Get the current physical time and assign it as the intended time. - intended_time = lf_time_physical() + delay; + intended_tag.time = lf_time_physical() + delay; } else { // FIXME: We need to verify that we are executing within a reaction? // See reactor_threaded.
@@ -951,11 +967,11 @@ trigger_handle_t _lf_schedule(environment_t *env, trigger_t* trigger, interval_t // - we have eliminated the possibility to have a negative additional delay; and // - we detect the asynchronous use of logical actions #ifndef NDEBUG - if (intended_time < env->current_tag.time) { + if (intended_tag.time < env->current_tag.time) { lf_print_warning("Attempting to schedule an event earlier than current time by " PRINTF_TIME " nsec! " "Revising to the current time " PRINTF_TIME ".", - env->current_tag.time - intended_time, env->current_tag.time); - intended_time = env->current_tag.time; + env->current_tag.time - intended_tag.time, env->current_tag.time); + intended_tag.time = env->current_tag.time; } #endif } @@ -969,7 +985,6 @@ trigger_handle_t _lf_schedule(environment_t *env, trigger_t* trigger, interval_t // Check for conflicts (a queued event with the same trigger and time). if (min_spacing <= 0) { // No minimum spacing defined. - tag_t intended_tag = (tag_t) {.time = intended_time, .microstep = 0u}; e->time = intended_tag.time; event_t* found = (event_t *)pqueue_find_equal_same_priority(env->event_q, e); // Check for conflicts. Let events pile up in super dense time. @@ -989,23 +1004,23 @@ trigger_handle_t _lf_schedule(environment_t *env, trigger_t* trigger, interval_t } // Hook the event into the list. found->next = e; - trigger->last_time = intended_tag.time; + trigger->last_tag = intended_tag; return(0); // FIXME: return value } // If there are not conflicts, schedule as usual. If intended time is // equal to the current logical time, the event will effectively be // scheduled at the next microstep. - } else if (!trigger->is_timer && trigger->last_time != NEVER) { + } else if (!trigger->is_timer && trigger->last_tag.time != NEVER) { // There is a min_spacing and there exists a previously // scheduled event. It determines the // earliest time at which the new event can be scheduled. // Check to see whether the event is too early. 
- instant_t earliest_time = trigger->last_time + min_spacing; + instant_t earliest_time = trigger->last_tag.time + min_spacing; LF_PRINT_DEBUG("There is a previously scheduled event; earliest possible time " "with min spacing: " PRINTF_TIME, earliest_time); // If the event is early, see which policy applies. - if (earliest_time > intended_time) { + if (earliest_time > intended_tag.time) { LF_PRINT_DEBUG("Event is early."); switch(trigger->policy) { case drop: @@ -1023,7 +1038,7 @@ trigger_handle_t _lf_schedule(environment_t *env, trigger_t* trigger, interval_t event_t* dummy = _lf_get_new_event(env); dummy->next = NULL; dummy->trigger = trigger; - dummy->time = trigger->last_time; + dummy->time = trigger->last_tag.time; event_t* found = (event_t *)pqueue_find_equal_same_priority(env->event_q, dummy); if (found != NULL) { @@ -1032,18 +1047,18 @@ trigger_handle_t _lf_schedule(environment_t *env, trigger_t* trigger, interval_t _lf_replace_token(found, token); _lf_recycle_event(env, e); _lf_recycle_event(env, dummy); - // Leave the last_time the same. + // Leave the last_tag the same. return(0); } _lf_recycle_event(env, dummy); // If the preceding event _has_ been handled, then adjust // the tag to defer the event. - intended_time = earliest_time; + intended_tag = (tag_t){.time = earliest_time, .microstep = 0}; break; default: // Default policy is defer - intended_time = earliest_time; + intended_tag = (tag_t){.time = earliest_time, .microstep = 0}; break; } } @@ -1054,16 +1069,16 @@ trigger_handle_t _lf_schedule(environment_t *env, trigger_t* trigger, interval_t // FIXME: This is a development assertion and might // not be necessary for end-user LF programs #ifndef NDEBUG - if (intended_time < env->current_tag.time) { + if (intended_tag.time < env->current_tag.time) { lf_print_error("Attempting to schedule an event earlier than current time by " PRINTF_TIME " nsec! 
" "Revising to the current time " PRINTF_TIME ".", - env->current_tag.time - intended_time, env->current_tag.time); - intended_time = env->current_tag.time; + env->current_tag.time - intended_tag.time, env->current_tag.time); + intended_tag.time = env->current_tag.time; } #endif // Set the tag of the event. - e->time = intended_time; + e->time = intended_tag.time; // Do not schedule events if if the event time is past the stop time // (current microsteps are checked earlier). @@ -1077,7 +1092,7 @@ trigger_handle_t _lf_schedule(environment_t *env, trigger_t* trigger, interval_t // Store the time in order to check the min spacing // between this and any following event. - trigger->last_time = intended_time; + trigger->last_tag = intended_tag; // Queue the event. // NOTE: There is no need for an explicit microstep because @@ -1367,7 +1382,7 @@ void schedule_output_reactions(environment_t *env, reaction_t* reaction, int wor #ifdef FEDERATED_DECENTRALIZED // Only pass down STP violation for federated programs that use decentralized coordination. // Extract the inherited STP violation bool inherited_STP_violation = reaction->is_STP_violated; - LF_PRINT_LOG("Reaction %s has STP violation status: %d.", reaction->name, reaction->is_STP_violated); + LF_PRINT_DEBUG("Reaction %s has STP violation status: %d.", reaction->name, reaction->is_STP_violated); #endif LF_PRINT_DEBUG("There are %zu outputs from reaction %s.", reaction->num_outputs, reaction->name); for (size_t i=0; i < reaction->num_outputs; i++) { @@ -1714,80 +1729,96 @@ void initialize_global(void) { _lf_initialize_trigger_objects() ; } +/** Flag to prevent termination function from executing twice. */ +bool _lf_termination_executed = false; + +/** Flag used to disable cleanup operations on normal termination. */ +bool _lf_normal_termination = false; + /** * Report elapsed logical and physical times and report if any * memory allocated by set_new, set_new_array, or lf_writable_copy * has not been freed. 
*/ void termination(void) { + if (_lf_termination_executed) return; + _lf_termination_executed = true; + environment_t *env; int num_envs = _lf_get_environments(&env); // Invoke the code generated termination function. It terminates the federated related services. - // It should only be called for the top-level environment, which, after convention, is the first environment. + // It should only be called for the top-level environment, which, by convention, is the first environment. terminate_execution(env); - // In order to free tokens, we perform the same actions we would have for a new time step. for (int i = 0; iid); - if (!env->initialized) { + // NOTE: env pointer is incremented at the end of this loop. + if (env == NULL || !env->initialized) { lf_print_warning("---- Environment %u was never initialized", env->id); continue; } + lf_print("---- Terminating environment %u", env->id); // Stop any tracing, if it is running. + // No need to acquire a mutex because if this is normal termination, all + // other threads have stopped, and if it's not, then acquiring a mutex could + // lead to a deadlock. stop_trace_locked(env->trace); - _lf_start_time_step(env); + // Skip most cleanup on abnormal termination. + if (_lf_normal_termination) { + _lf_start_time_step(env); #ifdef MODAL_REACTORS - // Free events and tokens suspended by modal reactors. - _lf_terminate_modal_reactors(env); + // Free events and tokens suspended by modal reactors. + _lf_terminate_modal_reactors(env); #endif - - // If the event queue still has events on it, report that. - if (env->event_q != NULL && pqueue_size(env->event_q) > 0) { - lf_print_warning("---- There are %zu unprocessed future events on the event queue.", pqueue_size(env->event_q)); - event_t* event = (event_t*)pqueue_peek(env->event_q); - interval_t event_time = event->time - start_time; - lf_print_warning("---- The first future event has timestamp " PRINTF_TIME " after start time.", event_time); - } - // Print elapsed times. 
- // If these are negative, then the program failed to start up. - interval_t elapsed_time = lf_time_logical_elapsed(env); - if (elapsed_time >= 0LL) { - char time_buffer[29]; // 28 bytes is enough for the largest 64 bit number: 9,223,372,036,854,775,807 - lf_comma_separated_time(time_buffer, elapsed_time); - printf("---- Elapsed logical time (in nsec): %s\n", time_buffer); - - // If start_time is 0, then execution didn't get far enough along - // to initialize this. - if (start_time > 0LL) { - lf_comma_separated_time(time_buffer, lf_time_physical_elapsed()); - printf("---- Elapsed physical time (in nsec): %s\n", time_buffer); + // If the event queue still has events on it, report that. + if (env->event_q != NULL && pqueue_size(env->event_q) > 0) { + lf_print_warning("---- There are %zu unprocessed future events on the event queue.", pqueue_size(env->event_q)); + event_t* event = (event_t*)pqueue_peek(env->event_q); + interval_t event_time = event->time - start_time; + lf_print_warning("---- The first future event has timestamp " PRINTF_TIME " after start time.", event_time); + } + // Print elapsed times. + // If these are negative, then the program failed to start up. + interval_t elapsed_time = lf_time_logical_elapsed(env); + if (elapsed_time >= 0LL) { + char time_buffer[29]; // 28 bytes is enough for the largest 64 bit number: 9,223,372,036,854,775,807 + lf_comma_separated_time(time_buffer, elapsed_time); + printf("---- Elapsed logical time (in nsec): %s\n", time_buffer); + + // If start_time is 0, then execution didn't get far enough along + // to initialize this. + if (start_time > 0LL) { + lf_comma_separated_time(time_buffer, lf_time_physical_elapsed()); + printf("---- Elapsed physical time (in nsec): %s\n", time_buffer); + } } - } - // Free up memory associated with environment - environment_free(env); - + // Free up memory associated with environment + environment_free(env); + } env++; } - _lf_free_all_tokens(); // Must be done before freeing reactors. 
- // Issue a warning if a memory leak has been detected. - if (_lf_count_payload_allocations > 0) { - lf_print_warning("Memory allocated for messages has not been freed."); - lf_print_warning("Number of unfreed messages: %d.", _lf_count_payload_allocations); - } - if (_lf_count_token_allocations > 0) { - lf_print_warning("Memory allocated for tokens has not been freed!"); - lf_print_warning("Number of unfreed tokens: %d.", _lf_count_token_allocations); - } + // Skip most cleanup on abnormal termination. + if (_lf_normal_termination) { + _lf_free_all_tokens(); // Must be done before freeing reactors. + // Issue a warning if a memory leak has been detected. + if (_lf_count_payload_allocations > 0) { + lf_print_warning("Memory allocated for messages has not been freed."); + lf_print_warning("Number of unfreed messages: %d.", _lf_count_payload_allocations); + } + if (_lf_count_token_allocations > 0) { + lf_print_warning("Memory allocated for tokens has not been freed!"); + lf_print_warning("Number of unfreed tokens: %d.", _lf_count_token_allocations); + } #if !defined(LF_SINGLE_THREADED) - for (int i = 0; i < _lf_watchdog_count; i++) { - if (_lf_watchdogs[i].base->reactor_mutex != NULL) { - free(_lf_watchdogs[i].base->reactor_mutex); + for (int i = 0; i < _lf_watchdog_count; i++) { + if (_lf_watchdogs[i].base->reactor_mutex != NULL) { + free(_lf_watchdogs[i].base->reactor_mutex); + } } - } #endif - _lf_free_all_reactors(); + _lf_free_all_reactors(); + } } diff --git a/core/threaded/reactor_threaded.c b/core/threaded/reactor_threaded.c index a462dd77c..cfe2384cf 100644 --- a/core/threaded/reactor_threaded.c +++ b/core/threaded/reactor_threaded.c @@ -65,16 +65,6 @@ extern instant_t start_time; */ #define MAX_STALL_INTERVAL MSEC(1) -/** - * Unless the "fast" option is given, an LF program will wait until - * physical time matches logical time before handling an event with - * a given logical time. 
The amount of time is less than this given - * threshold, then no wait will occur. The purpose of this is - * to prevent unnecessary delays caused by simply setting up and - * performing the wait. - */ -#define MIN_SLEEP_DURATION USEC(10) - /** * Global mutex, used for synchronizing across environments. Mainly used for token-management and tracing */ @@ -246,23 +236,24 @@ void synchronize_with_other_federates(void); /** * Wait until physical time matches or exceeds the specified logical time, - * unless -fast is given. + * unless -fast is given. For decentralized coordination, this function will + * add the STA offset to the wait time. * * If an event is put on the event queue during the wait, then the wait is * interrupted and this function returns false. It also returns false if the - * timeout time is reached before the wait has completed. + * timeout time is reached before the wait has completed. Note that this could + * return true even if a new event was placed on the queue if that event + * time matches or exceeds the specified time. * - * The mutex lock is assumed to be held by the calling thread. - * Note this this could return true even if the a new event - * was placed on the queue if that event time matches or exceeds - * the specified time. + * The mutex lock associated with the condition argument is assumed to be held by + * the calling thread. This mutex is released while waiting. If the wait time is + * too small to actually wait (less than MIN_SLEEP_DURATION), then this function + * immediately returns true and the mutex is not released. * * @param env Environment within which we are executing. * @param logical_time Logical time to wait until physical time matches it. - * @param return_if_interrupted If this is false, then wait_util will wait - * until physical time matches the logical time regardless of whether new - * events get put on the event queue. This is useful, for example, for - * synchronizing the start of the program. 
+ * @param condition A condition variable that can interrupt the wait. The mutex + * associated with this condition variable will be released during the wait. * * @return Return false if the wait is interrupted either because of an event * queue signal or if the wait time was interrupted early by reaching @@ -846,7 +837,7 @@ bool _lf_worker_handle_deadline_violation_for_reaction(environment_t *env, int w * @param worker_number The ID of the worker. * @param reaction The reaction whose STP offset has been violated. * - * @return true if an STP violation occurred. false otherwise. + * @return true if an STP violation occurred and was handled. false otherwise. */ bool _lf_worker_handle_STP_violation_for_reaction(environment_t* env, int worker_number, reaction_t* reaction) { bool violation_occurred = false; @@ -877,6 +868,10 @@ bool _lf_worker_handle_STP_violation_for_reaction(environment_t* env, int worker violation_occurred = true; (*handler)(reaction->self); + // Reset the STP violation flag because it has been dealt with. + // Downstream handlers should not be invoked. + reaction->is_STP_violated = false; + // If the reaction produced outputs, put the resulting // triggered reactions into the queue or execute them directly if possible. schedule_output_reactions(env, reaction, worker_number); @@ -907,7 +902,7 @@ bool _lf_worker_handle_STP_violation_for_reaction(environment_t* env, int worker * @param worker_number The ID of the worker. * @param reaction The reaction. * - * @return true if a violation occurred. false otherwise. + * @return true if a violation occurred and was handled. false otherwise. */ bool _lf_worker_handle_violations(environment_t *env, int worker_number, reaction_t* reaction) { bool violation = false; @@ -1151,6 +1146,10 @@ int lf_reactor_c_main(int argc, const char* argv[]) { // Ignore SIGPIPE errors, which terminate the entire application if // socket write() fails because the reader has closed the socket. 
// Instead, cause an EPIPE error to be set when write() fails. + // NOTE: The reason for a broken socket causing a SIGPIPE signal + // instead of just having write() return an error is to robustly + // handle a foo | bar pipeline where bar crashes. The default behavior + // is for foo to also exit. signal(SIGPIPE, SIG_IGN); #endif // SIGPIPE @@ -1239,6 +1238,10 @@ int lf_reactor_c_main(int argc, const char* argv[]) { LF_PRINT_LOG("---- All worker threads exited successfully."); } } + _lf_normal_termination = true; + // Invoke termination function here before freeing the local RTI. + termination(); + #if defined LF_ENCLAVES free_local_rti(); #endif diff --git a/core/utils/util.c b/core/utils/util.c index 23daef364..302d716e4 100644 --- a/core/utils/util.c +++ b/core/utils/util.c @@ -79,6 +79,13 @@ void _lf_message_print( int is_error, const char* prefix, const char* format, va_list args, int log_level ) ATTRIBUTE_FORMAT_PRINTF(3, 0); +/** + * Print a fatal error message. Internal function. + */ +static void lf_vprint_fatal_error(const char* format, va_list args) { + _lf_message_print(1, "FATAL ERROR: ", format, args, LOG_LEVEL_ERROR); +} + /** * Internal implementation of the next few reporting functions. */ @@ -204,13 +211,18 @@ void lf_vprint_warning(const char* format, va_list args) { void lf_print_error_and_exit(const char* format, ...) { va_list args; va_start (args, format); - lf_vprint_error_and_exit(format, args); + lf_vprint_fatal_error(format, args); va_end (args); exit(EXIT_FAILURE); } -void lf_vprint_error_and_exit(const char* format, va_list args) { - _lf_message_print(1, "FATAL ERROR: ", format, args, LOG_LEVEL_ERROR); +void lf_print_error_system_failure(const char* format, ...) 
{ + va_list args; + va_start (args, format); + lf_vprint_error(format, args); + va_end (args); + lf_print_error_and_exit("Error %d: %s", errno, strerror(errno)); + exit(EXIT_FAILURE); } void lf_register_print_function(print_message_function_t* function, int log_level) { diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index 880408ec6..7dc061ca8 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -212,10 +212,14 @@ typedef struct federate_instance_t { } federate_instance_t; #ifdef FEDERATED_DECENTRALIZED +/** + * An array of actions associated with network input ports that have STAA offsets. + */ typedef struct staa { - lf_action_base_t** actions; - size_t STAA; - size_t numActions; + lf_action_base_t** actions; // Array of pointers to actions with the same STAA offset. + interval_t* action_delays; // Array of delays on the network connections for these actions. + interval_t STAA; // The STAA offset. + size_t num_actions; // The length of the arrays. } staa_t; #endif @@ -239,12 +243,11 @@ extern lf_cond_t logical_time_changed; */ void send_neighbor_structure_to_RTI(int); +#ifdef FEDERATED_DECENTRALIZED /** - * @brief Spawns a thread to iterate through STAA structs, setting its associated ports absent + * @brief Spawn a thread to iterate through STAA structs, setting their associated ports absent * at an offset if the port is not present with a value by a certain physical time. - * */ -#ifdef FEDERATED_DECENTRALIZED void spawn_staa_thread(void); #endif @@ -273,11 +276,12 @@ void _lf_logical_tag_complete(tag_t); /** * Connect to the RTI at the specified host and port and return - * the socket descriptor for the connection. If this fails, the + * the socket descriptor for the connection. If this fails, wait CONNECT_RETRY_INTERVAL + * and try again. If it fails after CONNECT_MAX_RETRIES, the * program exits. 
If it succeeds, it sets the _fed.socket_TCP_RTI global * variable to refer to the socket for communicating with the RTI. * @param hostname A hostname, such as "localhost". - * @param port_number A port number. + * @param port_number A port number, or 0 to use the default port. */ void connect_to_rti(const char*, int); @@ -296,28 +300,22 @@ void connect_to_rti(const char*, int); void* listen_to_federates(void*); /** - * Create a server to listen to incoming physical - * connections from remote federates. This function + * Create a server to listen to incoming p2p connection (physical + * connections or decentralized connections) from remote federates. This function * only handles the creation of the server socket. - * The reserved port for the server socket is then + * The bound port for the server socket is then * sent to the RTI by sending an MSG_TYPE_ADDRESS_ADVERTISEMENT message * (@see net_common.h). This function expects no response * from the RTI. * - * If a port is specified by the user, that will be used - * as the only possibility for the server. This function - * will fail if that port is not available. If a port is not - * specified, the STARTING_PORT (@see net_common.h) will be used. - * The function will keep incrementing the port in this case - * until the number of tries reaches PORT_RANGE_LIMIT. + * If a port is specified by the user, that will be used. + * Otherwise, a random port will be assigned. If the bind fails, + * it will retry after PORT_BIND_RETRY_INTERVAL until it has tried + * PORT_BIND_RETRY_LIMIT times. Then it will fail. * - * @note This function is similar to create_server(...) in rti.c. - * However, it contains specific log messages for the peer to - * peer connections between federates. It also additionally - * sends an address advertisement (MSG_TYPE_ADDRESS_ADVERTISEMENT) message to the - * RTI informing it of the port. + * @note This function is different from create_server(...) in rti.c. 
* - * @param specified_port The specified port by the user. + * @param specified_port The specified port by the user or 0 to use a random port. */ void create_server(int specified_port); diff --git a/include/core/lf_types.h b/include/core/lf_types.h index 9cbcea9b3..eb626658e 100644 --- a/include/core/lf_types.h +++ b/include/core/lf_types.h @@ -239,7 +239,7 @@ struct trigger_t { interval_t offset; // Minimum delay of an action. For a timer, this is also the maximum delay. interval_t period; // Minimum interarrival time of an action. For a timer, this is also the maximal interarrival time. bool is_physical; // Indicator that this denotes a physical action. - instant_t last_time; // Time of the last event that was scheduled for this action. + tag_t last_tag; // Tag of the last event that was scheduled for this action. // This is only used for actions and will otherwise be NEVER. lf_spacing_policy_t policy; // Indicates which policy to use when an event is scheduled too early. port_status_t status; // Determines the status of the port at the current logical time. Therefore, this diff --git a/include/core/reactor.h b/include/core/reactor.h index d9ee515b5..0d4f84022 100644 --- a/include/core/reactor.h +++ b/include/core/reactor.h @@ -60,17 +60,21 @@ #define CONSTRUCTOR(classname) (new_ ## classname) #define SELF_STRUCT_T(classname) (classname ## _self_t) -//////////////////////////////////////////////////////////// -//// Macros for producing outputs. - -// NOTE: According to the "Swallowing the Semicolon" section on this page: -// https://gcc.gnu.org/onlinedocs/gcc-3.0.1/cpp_3.html -// the following macros should use an odd do-while construct to avoid -// problems with if ... else statements that do not use braces around the -// two branches. - -// Declarations for functions used by the macros. +/** + * Unless the "fast" option is given, an LF program will wait until + * physical time matches logical time before handling an event with + * a given logical time. 
The amount of time is less than this given + * threshold, then no wait will occur. The purpose of this is + * to prevent unnecessary delays caused by simply setting up and + * performing the wait. + */ +#define MIN_SLEEP_DURATION USEC(10) +/** + * Print an event from the event queue. + * This is a function of type pqueue_print_entry_f. + */ +void _lf_print_event(void* event); /** * Mark the given port's is_present field as true. This is_present field * will later be cleaned up by _lf_start_time_step. From fb47b2bfa036f76907c354af2addc099f7959ab5 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Wed, 20 Dec 2023 15:26:12 -0800 Subject: [PATCH 04/83] Point to lingua-franca/federated-cleanup --- lingua-franca-ref.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingua-franca-ref.txt b/lingua-franca-ref.txt index 1f7391f92..e47bc9016 100644 --- a/lingua-franca-ref.txt +++ b/lingua-franca-ref.txt @@ -1 +1 @@ -master +federated-cleanup From c1c755da6ffad4f451f354c26a27f10d3346e568 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Wed, 20 Dec 2023 16:44:52 -0800 Subject: [PATCH 05/83] Fix possible segfault on tracing termination --- core/trace.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/core/trace.c b/core/trace.c index 6fffbc7bf..e45cbe909 100644 --- a/core/trace.c +++ b/core/trace.c @@ -53,7 +53,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fprintf(stderr, "WARNING: Access to trace file failed.\n"); \ fclose(trace->_lf_trace_file); \ trace->_lf_trace_file = NULL; \ - lf_critical_section_exit(trace->env); \ return -1; \ } while(0) @@ -196,7 +195,7 @@ void flush_trace_locked(trace_t* trace, int worker) { // This is deferred to here so that user trace objects can be // registered in startup reactions. 
if (!trace->_lf_trace_header_written) { - write_trace_header(trace); + if (write_trace_header(trace) < 0) return; trace->_lf_trace_header_written = true; } @@ -482,8 +481,10 @@ void stop_trace_locked(trace_t* trace) { flush_trace_locked(trace, 0); } trace->_lf_trace_stop = 1; - fclose(trace->_lf_trace_file); - trace->_lf_trace_file = NULL; + if (trace->_lf_trace_file != NULL) { + fclose(trace->_lf_trace_file); + trace->_lf_trace_file = NULL; + } LF_PRINT_DEBUG("Stopped tracing."); } From 548e27895b1571a5d648f55ab3c2528f40d15533 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Thu, 21 Dec 2023 06:59:25 -0800 Subject: [PATCH 06/83] Remove one more deadlock risk --- core/federated/federate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index ca8bc1fc9..272ef0dfb 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2320,8 +2320,6 @@ static void send_resign_signal(environment_t* env) { if (written == bytes_to_write) { LF_PRINT_LOG("Resigned."); } - // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_RESIGN, _lf_my_fed_id, &tag); } /** @@ -2341,6 +2339,8 @@ void terminate_execution(environment_t* env) { lf_mutex_lock(&outbound_socket_mutex); send_resign_signal(env); lf_mutex_unlock(&outbound_socket_mutex); + // Trace the event when tracing is enabled + tracepoint_federate_to_rti(_fed.trace, send_RESIGN, _lf_my_fed_id, &tag); } else { // Do not acquire mutex and do not trace. send_resign_signal(env); From 444ebeeaf35a8f69c5e7bc84d6ec51eae8f60268 Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Thu, 21 Dec 2023 08:20:04 -0800 Subject: [PATCH 07/83] Fix compile error and bogus comparison --- core/federated/federate.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 272ef0dfb..6ec584add 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2340,7 +2340,7 @@ void terminate_execution(environment_t* env) { send_resign_signal(env); lf_mutex_unlock(&outbound_socket_mutex); // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_RESIGN, _lf_my_fed_id, &tag); + tracepoint_federate_to_rti(_fed.trace, send_RESIGN, _lf_my_fed_id, &env->current_tag); } else { // Do not acquire mutex and do not trace. send_resign_signal(env); @@ -2376,7 +2376,6 @@ void terminate_execution(environment_t* env) { LF_PRINT_LOG("Waiting for %zu threads listening for incoming messages to exit.", _fed.number_of_inbound_p2p_connections); for (int i=0; i < _fed.number_of_inbound_p2p_connections; i++) { - if (_fed.inbound_socket_listeners[i] == NULL) continue; // Ignoring errors here. lf_thread_join(_fed.inbound_socket_listeners[i], NULL); } From ab7605e2c34e6b3957a8497981919c9dde2e0822 Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Thu, 21 Dec 2023 10:19:52 -0800 Subject: [PATCH 08/83] Prevent sending redundant reply to stop request --- core/federated/RTI/rti_remote.c | 11 +++++------ core/federated/federate.c | 26 ++++++++++++++------------ core/reactor.c | 2 +- core/threaded/reactor_threaded.c | 4 ++-- 4 files changed, 22 insertions(+), 21 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 608875df3..8e33c54e1 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -605,6 +605,10 @@ void handle_stop_request_message(federate_info_t* fed) { read_from_socket_errexit(fed->socket, bytes_to_read, buffer, "RTI failed to read the MSG_TYPE_STOP_REQUEST payload from federate %d.", fed->enclave.id); + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(rti_remote->base.trace, receive_STOP_REQ, fed->enclave.id, &proposed_stop_tag); + } + // Acquire a mutex lock to ensure that this state does change while a // message is in transport or being used to determine a TAG. lf_mutex_lock(&rti_mutex); @@ -620,10 +624,6 @@ void handle_stop_request_message(federate_info_t* fed) { // Extract the proposed stop tag for the federate tag_t proposed_stop_tag = extract_tag(buffer); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(rti_remote->base.trace, receive_STOP_REQ, fed->enclave.id, &proposed_stop_tag); - } - // Update the maximum stop tag received from federates if (lf_tag_compare(proposed_stop_tag, rti_remote->base.max_stop_tag) > 0) { rti_remote->base.max_stop_tag = proposed_stop_tag; @@ -638,8 +638,7 @@ void handle_stop_request_message(federate_info_t* fed) { if (rti_remote->base.num_scheduling_nodes_handling_stop == rti_remote->base.number_of_scheduling_nodes) { // We now have information about the stop time of all - // federates. This is extremely unlikely, but it can occur - // all federates call lf_request_stop() at the same tag. 
+ // federates, and mark_federate_requesting_stop has sent out stop time to all. lf_mutex_unlock(&rti_mutex); return; } diff --git a/core/federated/federate.c b/core/federated/federate.c index 6ec584add..f28172b76 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2165,12 +2165,12 @@ int _lf_fd_send_stop_request_to_rti(tag_t stop_tag) { lf_mutex_unlock(&outbound_socket_mutex); return -1; } - // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_STOP_REQ, _lf_my_fed_id, &stop_tag); write_to_socket_with_mutex(_fed.socket_TCP_RTI, MSG_TYPE_STOP_REQUEST_LENGTH, buffer, &outbound_socket_mutex, "Failed to send stop time " PRINTF_TIME " to the RTI.", stop_tag.time - start_time); lf_mutex_unlock(&outbound_socket_mutex); + // Trace the event when tracing is enabled + tracepoint_federate_to_rti(_fed.trace, send_STOP_REQ, _lf_my_fed_id, &stop_tag); return 0; } else { lf_mutex_unlock(&outbound_socket_mutex); @@ -2221,10 +2221,7 @@ void handle_stop_granted_message() { env[i].stop_tag.microstep); if (env[i].barrier.requestors) _lf_decrement_tag_barrier_locked(&env[i]); - // We signal instead of broadcast under the assumption that only - // one worker thread can call wait_until at a given time because - // the call to wait_until is protected by a mutex lock - lf_cond_signal(&env[i].event_q_changed); + lf_cond_broadcast(&env[i].event_q_changed); lf_mutex_unlock(&env[i].mutex); } } @@ -2268,6 +2265,12 @@ void handle_stop_request_message() { } lf_mutex_unlock(&global_mutex); + if (already_blocked) { + // Either we have sent a stop request to the RTI ourselves, + // or we have previously received a stop request from the RTI. + // Nothing more to do. Tag advance is already blocked on enclaves. + return; + } // Iterate over the scheduling enclaves to find the maximum current tag // and adjust the tag_to_stop if any of those is greater than tag_to_stop. 
@@ -2281,10 +2284,9 @@ void handle_stop_request_message() { tag_to_stop = env->current_tag; tag_to_stop.microstep++; } - if (!already_blocked) { - // Set a barrier to prevent the enclave from advancing past the so-far tag to stop. - _lf_increment_tag_barrier_locked(&env[i], tag_to_stop); - } + // Set a barrier to prevent the enclave from advancing past the so-far tag to stop. + _lf_increment_tag_barrier_locked(&env[i], tag_to_stop); + lf_mutex_unlock(&env[i].mutex); } // Send the reply, which is the least tag at which we can stop. @@ -2297,14 +2299,14 @@ void handle_stop_request_message() { lf_mutex_unlock(&outbound_socket_mutex); return; } - // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_STOP_REQ_REP, _lf_my_fed_id, &tag_to_stop); // Send the current logical time to the RTI. This message does not have an identifying byte // since the RTI is waiting for a response from this federate. write_to_socket_with_mutex( _fed.socket_TCP_RTI, MSG_TYPE_STOP_REQUEST_REPLY_LENGTH, outgoing_buffer, &outbound_socket_mutex, "Failed to send the answer to MSG_TYPE_STOP_REQUEST to RTI."); lf_mutex_unlock(&outbound_socket_mutex); + // Trace the event when tracing is enabled + tracepoint_federate_to_rti(_fed.trace, send_STOP_REQ_REP, _lf_my_fed_id, &tag_to_stop); } /** diff --git a/core/reactor.c b/core/reactor.c index 7f5c3e344..d0bf6a0cd 100644 --- a/core/reactor.c +++ b/core/reactor.c @@ -377,7 +377,7 @@ int lf_reactor_c_main(int argc, const char* argv[]) { NEVER_TAG.time - start_time, 0); environment_init_tags(env, start_time, duration); - // Start tracing if enalbed + // Start tracing if enabled. 
start_trace(env->trace); #ifdef MODAL_REACTORS // Set up modal infrastructure diff --git a/core/threaded/reactor_threaded.c b/core/threaded/reactor_threaded.c index cfe2384cf..f3b44bea4 100644 --- a/core/threaded/reactor_threaded.c +++ b/core/threaded/reactor_threaded.c @@ -611,9 +611,9 @@ void lf_request_stop() { } #ifdef FEDERATED - // In the federated case, do not set lf_stop_requested because the RTI might grant a + // In the federated case, the RTI might grant a // later stop tag than the current tag. The above code has raised - // a barrier no greater than the requested stop tag for each enclave. + // a barrier no greater than max_current_tag. if (_lf_fd_send_stop_request_to_rti(max_current_tag) != 0) { // Message was not sent to the RTI. // Decrement the barriers to reverse our previous increment. From b9b17af731784779bd47c4cf4c24540756ab260b Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Thu, 21 Dec 2023 10:26:04 -0800 Subject: [PATCH 09/83] Fixed compile error --- core/federated/RTI/rti_remote.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 8e33c54e1..71c126891 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -605,6 +605,9 @@ void handle_stop_request_message(federate_info_t* fed) { read_from_socket_errexit(fed->socket, bytes_to_read, buffer, "RTI failed to read the MSG_TYPE_STOP_REQUEST payload from federate %d.", fed->enclave.id); + // Extract the proposed stop tag for the federate + tag_t proposed_stop_tag = extract_tag(buffer); + if (rti_remote->base.tracing_enabled) { tracepoint_rti_from_federate(rti_remote->base.trace, receive_STOP_REQ, fed->enclave.id, &proposed_stop_tag); } @@ -621,9 +624,6 @@ void handle_stop_request_message(federate_info_t* fed) { return; } - // Extract the proposed stop tag for the federate - tag_t proposed_stop_tag = extract_tag(buffer); - // Update the maximum stop tag received from 
federates if (lf_tag_compare(proposed_stop_tag, rti_remote->base.max_stop_tag) > 0) { rti_remote->base.max_stop_tag = proposed_stop_tag; @@ -638,7 +638,7 @@ void handle_stop_request_message(federate_info_t* fed) { if (rti_remote->base.num_scheduling_nodes_handling_stop == rti_remote->base.number_of_scheduling_nodes) { // We now have information about the stop time of all - // federates, and mark_federate_requesting_stop has sent out stop time to all. + // federates, and mark_federate_requesting_stop has sent out stop time to. lf_mutex_unlock(&rti_mutex); return; } From d4512732ecf6d3c40e5fa7c5efdf85e1eae684db Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Thu, 21 Dec 2023 13:53:42 -0800 Subject: [PATCH 10/83] Treat the stop request from the RTI as if a local stop request had been received --- core/federated/federate.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/federated/federate.c b/core/federated/federate.c index f28172b76..4fb05158b 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2263,6 +2263,8 @@ void handle_stop_request_message() { if (lf_stop_requested) { already_blocked = true; } + // Treat the stop request from the RTI as if a local stop request had been received. + lf_stop_requested = true; lf_mutex_unlock(&global_mutex); if (already_blocked) { From 5bad8b95647eb90bafd446796e32266322fe6b05 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Fri, 22 Dec 2023 08:52:06 -0800 Subject: [PATCH 11/83] Adjust port binding retries to realistic times --- include/core/federated/network/net_common.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/core/federated/network/net_common.h b/include/core/federated/network/net_common.h index 47bb644e8..5be4d50d1 100644 --- a/include/core/federated/network/net_common.h +++ b/include/core/federated/network/net_common.h @@ -234,13 +234,17 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/** * Time to wait before re-attempting to bind to a port. + * When a process closes, the network stack typically waits between 30 and 120 + * seconds before releasing the port. This is to allow for delayed packets so + * that a new process does not receive packets from a previous process. + * Here, we limit the retries to 120 seconds. */ -#define PORT_BIND_RETRY_INTERVAL MSEC(10) +#define PORT_BIND_RETRY_INTERVAL SEC(2) /** * Number of attempts to bind to a port before giving up. */ -#define PORT_BIND_RETRY_LIMIT 100 +#define PORT_BIND_RETRY_LIMIT 60 /** * Default port number for the RTI. From 4875564ada7622b9fee132daa80e7342588fabe6 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Fri, 22 Dec 2023 10:29:38 -0800 Subject: [PATCH 12/83] RTI sends RESIGN on abnormal termination --- core/federated/RTI/main.c | 37 ++++++++++++++++++--- core/federated/federate.c | 28 +++++++++++----- include/core/federated/network/net_common.h | 2 +- 3 files changed, 54 insertions(+), 13 deletions(-) diff --git a/core/federated/RTI/main.c b/core/federated/RTI/main.c index b17bb41d5..ab6b04845 100644 --- a/core/federated/RTI/main.c +++ b/core/federated/RTI/main.c @@ -48,6 +48,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "rti_remote.h" +#include "net_util.h" #include // To trap ctrl-c and invoke a clean stop to save the trace file, if needed. #include @@ -70,17 +71,45 @@ const char *rti_trace_file_name = "rti.lft"; /** Indicator that normal termination has occurred. */ bool normal_termination = false; +/** + * Send a resign signal to the specified federate. The tag payload is the tag + * of the most recently received LTC from the federate or NEVER + * if no LTC has been received. 
+ */ +static void send_resign_signal(federate_info_t* fed) { + size_t bytes_to_write = 1 + sizeof(tag_t); + unsigned char buffer[bytes_to_write]; + buffer[0] = MSG_TYPE_RESIGN; + tag_t tag = fed->enclave.completed; + encode_tag(&(buffer[1]), tag); + ssize_t written = write_to_socket(fed->socket, bytes_to_write, &(buffer[0])); + if (written == bytes_to_write) { + LF_PRINT_LOG("RTI has sent resign signal to federate %d due to abnormal termination.", fed->enclave.id); + } else { + LF_PRINT_LOG("RTI failed to send resign signal to federate %d.", fed->enclave.id); + } + if (rti.base.tracing_enabled) { + tracepoint_rti_to_federate(rti.base.trace, send_RESIGN, fed->enclave.id, &tag); + } +} + /** * @brief Function to run upon termination. * This function will be invoked both after main() returns and when a signal * that results in terminating the process, such as SIGINT. In the former - * case, it should do nothing. In the latter case, it will attempt to write - * the trace file, but without acquiring a mutex lock, so the resulting files - * may be incomplete or even corrupted. But this is better than just failing - * to write the data we have collected so far. + * case, it should do nothing. In the latter case, it will send a MSG_TYPE_RESIGN + * signal to each federate and attempt to write the trace file, but without + * acquiring a mutex lock, so the resulting files may be incomplete or even + * corrupted. But this is better than just failing to write the data we have + * collected so far. 
*/ void termination() { if (!normal_termination) { + for (int i = 0; i < rti.base.number_of_scheduling_nodes; i++) { + federate_info_t *f = rti.base.scheduling_nodes[i]; + if (!f || f->enclave.state == NOT_CONNECTED) continue; + send_resign_signal(f); + } if (rti.base.tracing_enabled) { stop_trace_locked(rti.base.trace); lf_print("RTI trace file saved."); diff --git a/core/federated/federate.c b/core/federated/federate.c index 4fb05158b..e463d9322 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2359,10 +2359,6 @@ void terminate_execution(environment_t* env) { _fed.sockets_for_inbound_p2p_connections[i] = -1; } - // For abnormal termination, skip the rest, letting the threads be terminated - // and sockets be closed by the OS. - if (!_lf_normal_termination) return; - // Check for all outgoing physical connections in // _fed.sockets_for_outbound_p2p_connections and // if the socket ID is not -1, the connection is still open. @@ -2389,10 +2385,13 @@ void terminate_execution(environment_t* env) { // Wait for the thread listening for messages from the RTI to close. lf_thread_join(_fed.RTI_socket_listener, NULL); - LF_PRINT_DEBUG("Freeing memory occupied by the federate."); - free(_fed.inbound_socket_listeners); - free(federation_metadata.rti_host); - free(federation_metadata.rti_user); + // For abnormal termination, there is no need to free memory. + if (_lf_normal_termination) { + LF_PRINT_DEBUG("Freeing memory occupied by the federate."); + free(_fed.inbound_socket_listeners); + free(federation_metadata.rti_host); + free(federation_metadata.rti_user); + } } /** @@ -2476,6 +2475,16 @@ static void stop_all_traces() { } } +/** + * Handle a resign signal from the RTI. The RTI will only resign + * if it is forced to exit, e.g. by a SIG_INT. Hence, this federate + * will exit immediately with an error condition, counting on the + * termination functions to handle any cleanup needed. 
+ */ +void handle_rti_resign_message(void) { + exit(1); +} + /** * Thread that listens for TCP inputs from the RTI. * When messages arrive, this calls the appropriate handler. @@ -2541,6 +2550,9 @@ void* listen_to_rti_TCP(void* args) { case MSG_TYPE_PORT_ABSENT: handle_port_absent_message(_fed.socket_TCP_RTI, -1); break; + case MSG_TYPE_RESIGN: + handle_rti_resign_message(); + break; case MSG_TYPE_CLOCK_SYNC_T1: case MSG_TYPE_CLOCK_SYNC_T4: lf_print_error("Federate %d received unexpected clock sync message from RTI on TCP socket.", diff --git a/include/core/federated/network/net_common.h b/include/core/federated/network/net_common.h index 5be4d50d1..21ed69141 100644 --- a/include/core/federated/network/net_common.h +++ b/include/core/federated/network/net_common.h @@ -384,7 +384,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #define MSG_TYPE_MESSAGE 3 -/** Byte identifying that the federate is ending its execution. */ +/** Byte identifying that the federate or the RTI is ending its execution. */ #define MSG_TYPE_RESIGN 4 /** Byte identifying a timestamped message to forward to another federate. From 59ab5d25f903cf348e2aa98c4877ecdc67873b10 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Sat, 23 Dec 2023 15:38:23 -0800 Subject: [PATCH 13/83] Free environment only after all logging and debug statements --- core/reactor_common.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/core/reactor_common.c b/core/reactor_common.c index db1d97146..022f903ff 100644 --- a/core/reactor_common.c +++ b/core/reactor_common.c @@ -1751,37 +1751,36 @@ void termination(void) { terminate_execution(env); // In order to free tokens, we perform the same actions we would have for a new time step. 
- for (int i = 0; iinitialized) { - lf_print_warning("---- Environment %u was never initialized", env->id); + for (int i = 0; i < num_envs; i++) { + if (!env[i].initialized) { + lf_print_warning("---- Environment %u was never initialized", env[i].id); continue; } - lf_print("---- Terminating environment %u", env->id); + LF_PRINT_LOG("---- Terminating environment %u, normal termination: %d", env[i].id, _lf_normal_termination); // Stop any tracing, if it is running. // No need to acquire a mutex because if this is normal termination, all // other threads have stopped, and if it's not, then acquiring a mutex could // lead to a deadlock. - stop_trace_locked(env->trace); + stop_trace_locked(env[i].trace); // Skip most cleanup on abnormal termination. if (_lf_normal_termination) { - _lf_start_time_step(env); + _lf_start_time_step(&env[i]); #ifdef MODAL_REACTORS // Free events and tokens suspended by modal reactors. - _lf_terminate_modal_reactors(env); + _lf_terminate_modal_reactors(env[i]); #endif // If the event queue still has events on it, report that. - if (env->event_q != NULL && pqueue_size(env->event_q) > 0) { - lf_print_warning("---- There are %zu unprocessed future events on the event queue.", pqueue_size(env->event_q)); - event_t* event = (event_t*)pqueue_peek(env->event_q); + if (env[i].event_q != NULL && pqueue_size(env[i].event_q) > 0) { + lf_print_warning("---- There are %zu unprocessed future events on the event queue.", pqueue_size(env[i].event_q)); + event_t* event = (event_t*)pqueue_peek(env[i].event_q); interval_t event_time = event->time - start_time; lf_print_warning("---- The first future event has timestamp " PRINTF_TIME " after start time.", event_time); } // Print elapsed times. // If these are negative, then the program failed to start up. 
- interval_t elapsed_time = lf_time_logical_elapsed(env); + interval_t elapsed_time = lf_time_logical_elapsed(&env[i]); if (elapsed_time >= 0LL) { char time_buffer[29]; // 28 bytes is enough for the largest 64 bit number: 9,223,372,036,854,775,807 lf_comma_separated_time(time_buffer, elapsed_time); @@ -1794,11 +1793,7 @@ void termination(void) { printf("---- Elapsed physical time (in nsec): %s\n", time_buffer); } } - - // Free up memory associated with environment - environment_free(env); } - env++; } // Skip most cleanup on abnormal termination. if (_lf_normal_termination) { @@ -1820,5 +1815,11 @@ void termination(void) { } #endif _lf_free_all_reactors(); + + // Free up memory associated with environment. + // Do this last so that printed warnings don't access freed memory. + for (int i = 0; i < num_envs; i++) { + environment_free(&env[i]); + } } } From e043239022bb0d6c0c85e0426012f665c237bee5 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Sat, 23 Dec 2023 15:53:30 -0800 Subject: [PATCH 14/83] Better handling of socket shutdown. * Federate sends NEVER_TAG in RESIGN to indicate error and RTI returns an error code. * RTI reads the tag on the RESIGN message always, not just if tracing is enabled. * Avoid attempting to close stdin, which results from faulty initialization of array. * More systematic socket shutdown process that ensures acknowledgements are received. 
--- core/federated/RTI/main.c | 11 +-- core/federated/RTI/rti_remote.c | 43 ++++++++---- core/federated/RTI/rti_remote.h | 9 +++ core/federated/federate.c | 107 ++++++++++++++++++------------ core/federated/network/net_util.c | 26 ++++---- include/core/federated/federate.h | 18 ++--- 6 files changed, 129 insertions(+), 85 deletions(-) diff --git a/core/federated/RTI/main.c b/core/federated/RTI/main.c index ab6b04845..f799effac 100644 --- a/core/federated/RTI/main.c +++ b/core/federated/RTI/main.c @@ -68,11 +68,11 @@ static rti_remote_t rti; */ const char *rti_trace_file_name = "rti.lft"; -/** Indicator that normal termination has occurred. */ +/** Indicator that normal termination of the RTI has occurred. */ bool normal_termination = false; /** - * Send a resign signal to the RTI. The tag payload is the tag + * Send a resign signal to the specified federate. The tag payload is the tag * of the most recently received LTC from the federate or NEVER * if no LTC has been received. */ @@ -106,7 +106,7 @@ static void send_resign_signal(federate_info_t* fed) { void termination() { if (!normal_termination) { for (int i = 0; i < rti.base.number_of_scheduling_nodes; i++) { - federate_info_t *f = rti.base.scheduling_nodes[i]; + federate_info_t *f = (federate_info_t*)rti.base.scheduling_nodes[i]; if (!f || f->enclave.state == NOT_CONNECTED) continue; send_resign_signal(f); } @@ -342,7 +342,10 @@ int main(int argc, const char* argv[]) { lf_print("RTI is exiting."); // Do this before freeing scheduling nodes. free_scheduling_nodes(rti.base.scheduling_nodes, rti.base.number_of_scheduling_nodes); - return 0; + + // Even if the RTI is exiting normally, it should report an error code if one of the + // federates has reported an error. 
+ return (int)_lf_federate_reports_error; } #endif // STANDALONE_RTI diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 71c126891..66404cf57 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -39,6 +39,8 @@ extern instant_t start_time; */ static rti_remote_t *rti_remote; +bool _lf_federate_reports_error = false; + // A convenient macro for getting the `federate_info_t *` at index `_idx` // and casting it. #define GET_FED_INFO(_idx) (federate_info_t *) rti_remote->base.scheduling_nodes[_idx] @@ -983,18 +985,27 @@ void* clock_synchronization_thread(void* noargs) { void handle_federate_resign(federate_info_t *my_fed) { // Nothing more to do. Close the socket and exit. lf_mutex_lock(&rti_mutex); + // Extract the tag + size_t header_size = 1 + sizeof(tag_t); + unsigned char buffer[header_size]; + // Read the header, minus the first byte which has already been read. + read_from_socket_errexit(my_fed->socket, header_size - 1, &(buffer[1]), + "RTI failed to read the resign tag from remote federate."); + // Extract the tag sent by the resigning federate + tag_t tag = extract_tag(&(buffer[1])); + if (rti_remote->base.tracing_enabled) { - // Extract the tag, for tracing purposes - size_t header_size = 1 + sizeof(tag_t); - unsigned char buffer[header_size]; - // Read the header, minus the first byte which has already been read. - read_from_socket_errexit(my_fed->socket, header_size - 1, &(buffer[1]), - "RTI failed to read the timed message header from remote federate."); - // Extract the tag sent by the resigning federate - tag_t tag = extract_tag(&(buffer[1])); tracepoint_rti_from_federate(rti_remote->base.trace, receive_RESIGN, my_fed->enclave.id, &tag); } + if (lf_tag_compare(tag, NEVER_TAG) == 0) { + // The federate is reporting an error. 
+ _lf_federate_reports_error = true; + lf_print("RTI: Federate %d reports an error and has resigned.", my_fed->enclave.id); + } else { + lf_print("RTI: Federate %d has resigned.", my_fed->enclave.id); + } + my_fed->enclave.state = NOT_CONNECTED; // Indicate that there will no further events from this federate. @@ -1005,9 +1016,13 @@ void handle_federate_resign(federate_info_t *my_fed) { // Here, we just signal the other side that no further writes to the socket are // forthcoming, which should result in the other end getting a zero-length reception. shutdown(my_fed->socket, SHUT_WR); - // Do not close because this results in an error on the other side rather than - // an orderly shutdown. - // close(my_fed->socket); // from unistd.h + + // Wait for the federate to send an EOF or a socket error to occur. + // Discard any incoming bytes. Normally, this read should return 0 because + // the federate is resigning and should itself invoke shutdown. + while (read(my_fed->socket, buffer, header_size) > 0); + // We can now safely close the socket. + close(my_fed->socket); // from unistd.h lf_print("RTI: Federate %d has resigned.", my_fed->enclave.id); @@ -1594,9 +1609,9 @@ void wait_for_federates(int socket_descriptor) { rti_remote->all_federates_exited = true; - // Shutdown and close the socket so that the accept() call in - // respond_to_erroneous_connections returns. That thread should then - // check rti->all_federates_exited and it should exit. + // Shutdown and close the socket that is listening for incoming connections + // so that the accept() call in respond_to_erroneous_connections returns. + // That thread should then check rti->all_federates_exited and it should exit. 
if (shutdown(socket_descriptor, SHUT_RDWR)) { LF_PRINT_LOG("On shut down TCP socket, received reply: %s", strerror(errno)); } diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index 21264e76a..b1434fcbd 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -184,6 +184,11 @@ extern int lf_critical_section_enter(environment_t* env); */ extern int lf_critical_section_exit(environment_t* env); +/** + * Indicator that one or more federates have reported an error on resigning. + */ +extern bool _lf_federate_reports_error; + /** * Create a server and enable listening for socket connections. * @@ -363,6 +368,10 @@ void* clock_synchronization_thread(void* noargs); * message is sent at the time of termination * after all shutdown events are processed * on the federate. + * + * If the tag on the resign message is NEVER, then the RTI assumes that + * the federate is terminating abnormally. In this case, the RTI will + * also terminate abnormally, returning a non-zero exit code. * * This function assumes the caller does not hold the mutex. * diff --git a/core/federated/federate.c b/core/federated/federate.c index e463d9322..6cf4ced58 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -94,8 +94,6 @@ federate_instance_t _fed = { .number_of_inbound_p2p_connections = 0, .inbound_socket_listeners = NULL, .number_of_outbound_p2p_connections = 0, - .sockets_for_inbound_p2p_connections = { -1 }, - .sockets_for_outbound_p2p_connections = { -1 }, .inbound_p2p_handling_thread_id = 0, .server_socket = -1, .server_port = -1, @@ -615,14 +613,27 @@ void* handle_p2p_connections_from_federates(void* env_arg) { /** * Close the socket that sends outgoing messages to the * specified federate ID. This function assumes the caller holds - * the outbound_socket_mutex mutex lock. + * the outbound_socket_mutex mutex lock, at least during normal termination. 
* @param fed_id The ID of the peer federate receiving messages from this * federate, or -1 if the RTI (centralized coordination). + * @param flag 0 if the socket has received EOF, 1 if not, -1 if abnormal termination. */ -void _lf_close_outbound_socket(int fed_id) { +static void _lf_close_outbound_socket(int fed_id, int flag) { assert (fed_id >= 0 && fed_id < NUMBER_OF_FEDERATES); if (_fed.sockets_for_outbound_p2p_connections[fed_id] >= 0) { - shutdown(_fed.sockets_for_outbound_p2p_connections[fed_id], SHUT_RDWR); + // Close the socket by sending a FIN packet indicating that no further writes + // are expected. Then read until we get an EOF indication. + if (flag >= 0) { + // SHUT_WR indicates no further outgoing messages. + shutdown(_fed.sockets_for_outbound_p2p_connections[fed_id], SHUT_WR); + if (flag > 0) { + // Have not received EOF yet. read until we get an EOF or error indication. + // This compensates for delayed ACKs and disabling of Nagles algorithm + // by delaying exiting until the shutdown is complete. + unsigned char message[32]; + while (read(_fed.sockets_for_outbound_p2p_connections[fed_id], &message, 32) > 0); + } + } close(_fed.sockets_for_outbound_p2p_connections[fed_id]); _fed.sockets_for_outbound_p2p_connections[fed_id] = -1; } @@ -659,17 +670,17 @@ void* listen_for_upstream_messages_from_downstream_federates(void* fed_id_ptr) { LF_PRINT_DEBUG("Received MSG_TYPE_CLOSE_REQUEST from federate %d.", fed_id); // Trace the event when tracing is enabled tracepoint_federate_from_federate(_fed.trace, receive_CLOSE_RQ, _lf_my_fed_id, fed_id, NULL); - _lf_close_outbound_socket(fed_id); + _lf_close_outbound_socket(fed_id, bytes_read); break; } if (bytes_read == 0) { // EOF. LF_PRINT_DEBUG("Received EOF from federate %d.", fed_id); - _lf_close_outbound_socket(fed_id); + _lf_close_outbound_socket(fed_id, bytes_read); break; } else if (bytes_read < 0) { // Error. 
- _lf_close_outbound_socket(fed_id); + _lf_close_outbound_socket(fed_id, bytes_read); lf_print_error_system_failure("Error on socket from federate %d.", fed_id); } } @@ -1425,6 +1436,34 @@ static trigger_handle_t schedule_message_received_from_network_locked( return return_value; } +/** + * Close the socket that receives incoming messages from the + * specified federate ID. This function should be called when a read + * of incoming socket fails or when an EOF is received. + * + * @param fed_id The ID of the peer federate sending messages to this + * federate, or -1 if the RTI. + * @param flag 0 if an EOF was received, -1 if a socket error occurred, 1 otherwise. + */ +static void _lf_close_inbound_socket(int fed_id, int flag) { + lf_mutex_lock(&socket_mutex); + if (_fed.sockets_for_inbound_p2p_connections[fed_id] >= 0) { + if (_fed.sockets_for_inbound_p2p_connections[fed_id] >= 0) { + if (flag >= 0) { + shutdown(_fed.sockets_for_inbound_p2p_connections[fed_id], SHUT_WR); + if (flag > 0) { + // Flag indicates that there could still be incoming data. + unsigned char message[32]; + while (read(_fed.sockets_for_inbound_p2p_connections[fed_id], &message, 32) > 0); + } + } + close(_fed.sockets_for_inbound_p2p_connections[fed_id]); + _fed.sockets_for_inbound_p2p_connections[fed_id] = -1; + } + } + lf_mutex_unlock(&socket_mutex); +} + /** * Request to close the socket that receives incoming messages from the * specified federate ID. This sends a message to the upstream federate @@ -1451,7 +1490,8 @@ static int _lf_request_close_inbound_socket(int fed_id) { ssize_t written = write_to_socket( _fed.sockets_for_inbound_p2p_connections[fed_id], 1, &message_marker); - _fed.sockets_for_inbound_p2p_connections[fed_id] = -1; + // Close the socket upon receiving EOF. 
+ _lf_close_inbound_socket(fed_id, 1); // Trace the event when tracing is enabled tracepoint_federate_to_federate(_fed.trace, send_CLOSE_RQ, _lf_my_fed_id, fed_id, NULL); @@ -1464,34 +1504,6 @@ static int _lf_request_close_inbound_socket(int fed_id) { } } -/** - * Close the socket that receives incoming messages from the - * specified federate ID or RTI. This function should be called when a read - * of incoming socket fails or when an EOF is received. - * - * @param The ID of the peer federate sending messages to this - * federate, or -1 if the RTI. - */ -void _lf_close_inbound_socket(int fed_id) { - lf_mutex_lock(&socket_mutex); - if (fed_id < 0) { - // socket connection is to the RTI. - int socket = _fed.socket_TCP_RTI; - // First, set the global socket to -1. - _fed.socket_TCP_RTI = -1; - // Then shutdown and close the socket. - shutdown(socket, SHUT_RDWR); - close(socket); - } else if (_fed.sockets_for_inbound_p2p_connections[fed_id] >= 0) { - if (_fed.sockets_for_inbound_p2p_connections[fed_id] >= 0) { - shutdown(_fed.sockets_for_inbound_p2p_connections[fed_id], SHUT_RDWR); - close(_fed.sockets_for_inbound_p2p_connections[fed_id]); - _fed.sockets_for_inbound_p2p_connections[fed_id] = -1; - } - } - lf_mutex_unlock(&socket_mutex); -} - /** * Handle a port absent message received from a remote federate. * This just sets the last known status tag of the port specified @@ -2312,14 +2324,19 @@ void handle_stop_request_message() { } /** - * Send a resign signal to the RTI. + * Send a resign signal to the RTI. The tag payload will be the current + * tag of the specified environment or, if there has been an error that + * will lead to an abnormal termination, the tag NEVER_TAG. 
*/ static void send_resign_signal(environment_t* env) { size_t bytes_to_write = 1 + sizeof(tag_t); unsigned char buffer[bytes_to_write]; buffer[0] = MSG_TYPE_RESIGN; - tag_t tag = env->current_tag; - encode_tag(&(buffer[1]), tag); + if (_lf_normal_termination) { + encode_tag(&(buffer[1]), env->current_tag); + } else { + encode_tag(&(buffer[1]), NEVER_TAG); + } ssize_t written = write_to_socket(_fed.socket_TCP_RTI, bytes_to_write, &(buffer[0])); if (written == bytes_to_write) { LF_PRINT_LOG("Resigned."); @@ -2366,8 +2383,10 @@ void terminate_execution(environment_t* env) { for (int i=0; i < NUMBER_OF_FEDERATES; i++) { // Close outbound connections, in case they have not closed themselves. - // This will result in EOF being sent to the remote federate, I think. - _lf_close_outbound_socket(i); + // This will result in EOF being sent to the remote federate, except for + // abnormal termination, in which case it will just close the socket. + int flag = _lf_normal_termination? 1 : -1; + _lf_close_outbound_socket(i, flag); } LF_PRINT_DEBUG("Waiting for inbound p2p socket listener threads."); @@ -2427,11 +2446,11 @@ void* listen_to_federates(void* _args) { if (bytes_read == 0) { // EOF occurred. This breaks the connection. lf_print("Received EOF from peer federate %d. 
Closing the socket.", fed_id); - _lf_close_inbound_socket(fed_id); + _lf_close_inbound_socket(fed_id, bytes_read); break; } else if (bytes_read < 0) { lf_print_error("P2P socket to federate %d is broken.", fed_id); - _lf_close_inbound_socket(fed_id); + _lf_close_inbound_socket(fed_id, bytes_read); break; } LF_PRINT_DEBUG("Received a P2P message on socket %d of type %d.", diff --git a/core/federated/network/net_util.c b/core/federated/network/net_util.c index 8f5f46178..2048e02a1 100644 --- a/core/federated/network/net_util.c +++ b/core/federated/network/net_util.c @@ -66,9 +66,9 @@ int create_real_time_tcp_socket_errexit() { lf_print_error_system_failure("Could not open TCP socket."); } // Disable Nagle's algorithm which bundles together small TCP messages to - // reduce network traffic + // reduce network traffic. // TODO: Re-consider if we should do this, and whether disabling delayed ACKs - // is enough. + // is enough. int flag = 1; int result = setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, &flag, sizeof(int)); @@ -76,14 +76,14 @@ int create_real_time_tcp_socket_errexit() { lf_print_error_system_failure("Failed to disable Nagle algorithm on socket server."); } +#if defined(PLATFORM_Linux) // Disable delayed ACKs. Only possible on Linux - #if defined(PLATFORM_Linux) result = setsockopt(sock, IPPROTO_TCP, TCP_QUICKACK, &flag, sizeof(int)); if (result < 0) { lf_print_error_system_failure("Failed to disable Nagle algorithm on socket server."); } - #endif +#endif // Linux return sock; } @@ -150,15 +150,17 @@ ssize_t write_to_socket_with_mutex( // that we should try again (@see man errno). LF_PRINT_DEBUG("Writing to socket was blocked. Will try again."); continue; - } else if (more <= 0) { + } else if (more < 0) { + // An error occurred. 
+ shutdown(socket, SHUT_RDWR); + close(socket); + if (mutex != NULL) { + lf_mutex_unlock(mutex); + } if (format != NULL) { - shutdown(socket, SHUT_RDWR); - close(socket); - if (mutex != NULL) { - lf_mutex_unlock(mutex); - } - lf_print_error(format, args); - lf_print_error("Code %d: %s.", errno, strerror(errno)); + lf_print_error_system_failure(format, args); + } else { + lf_print_error("Failed to write to socket. Closing it."); } return more; } diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index 7dc061ca8..f164fdb07 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -212,14 +212,10 @@ typedef struct federate_instance_t { } federate_instance_t; #ifdef FEDERATED_DECENTRALIZED -/** - * An array of actions associated with network input ports that have STAA offsets. - */ typedef struct staa { - lf_action_base_t** actions; // Array of pointers to actions with the same STAA offset. - interval_t* action_delays; // Array of delays on the network connections for these actions. - interval_t STAA; // The STAA offset. - size_t num_actions; // The length of the arrays. + lf_action_base_t** actions; + size_t STAA; + size_t num_actions; } staa_t; #endif @@ -245,7 +241,7 @@ void send_neighbor_structure_to_RTI(int); #ifdef FEDERATED_DECENTRALIZED /** - * @brief Spawn a thread to iterate through STAA structs, setting their associated ports absent + * Spawn a thread to iterate through STAA structs, setting their associated ports absent * at an offset if the port is not present with a value by a certain physical time. */ void spawn_staa_thread(void); @@ -281,9 +277,9 @@ void _lf_logical_tag_complete(tag_t); * program exits. If it succeeds, it sets the _fed.socket_TCP_RTI global * variable to refer to the socket for communicating with the RTI. * @param hostname A hostname, such as "localhost". - * @param port_number A port number, or 0 to use the default port. + * @param port_number A port number. 
*/ -void connect_to_rti(const char*, int); +void connect_to_rti(const char* hostname, int port_number); /** * Thread that listens for inputs from other federates. @@ -315,7 +311,7 @@ void* listen_to_federates(void*); * * @note This function is different from create_server(...) in rti.c. * - * @param specified_port The specified port by the user or 0 to use a random port. + * @param specified_port The port specified by the user or 0 to use a random port. */ void create_server(int specified_port); From 24dab5a14fe3d9ebd248ca4d000bc83960991b80 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Tue, 26 Dec 2023 13:40:39 -0800 Subject: [PATCH 15/83] Major refactoring of network functions --- core/federated/RTI/main.c | 6 +- core/federated/RTI/rti_remote.c | 1113 +++++++++++-------- core/federated/RTI/rti_remote.h | 46 +- core/federated/federate.c | 802 ++++++------- core/federated/network/net_util.c | 104 +- include/core/federated/federate.h | 79 +- include/core/federated/network/net_common.h | 20 - include/core/federated/network/net_util.h | 96 +- include/core/utils/util.h | 28 + 9 files changed, 1143 insertions(+), 1151 deletions(-) diff --git a/core/federated/RTI/main.c b/core/federated/RTI/main.c index f799effac..225a3522a 100644 --- a/core/federated/RTI/main.c +++ b/core/federated/RTI/main.c @@ -82,11 +82,11 @@ static void send_resign_signal(federate_info_t* fed) { buffer[0] = MSG_TYPE_RESIGN; tag_t tag = fed->enclave.completed; encode_tag(&(buffer[1]), tag); - ssize_t written = write_to_socket(fed->socket, bytes_to_write, &(buffer[0])); - if (written == bytes_to_write) { + int failed = write_to_socket(fed->socket, bytes_to_write, &(buffer[0])); + if (failed == 0) { LF_PRINT_LOG("RTI has sent resign signal to federate %d due to abnormal termination.", fed->enclave.id); } else { - LF_PRINT_LOG("RTI failed to send resign signal to federate %d.", fed->enclave.id); + LF_PRINT_LOG("RTI failed to send resign signal to federate %d on socket ID %d.", fed->enclave.id, 
fed->socket); } if (rti.base.tracing_enabled) { tracepoint_rti_to_federate(rti.base.trace, send_RESIGN, fed->enclave.id, &tag); diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 66404cf57..a2781af56 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -4,7 +4,7 @@ * @author Edward A. Lee * @author Soroush Bateni * @author Erling Jellum - * @author Chadlia Jerad + * @author Chadlia Jerad * @copyright (c) 2020-2023, The University of California at Berkeley * License in [BSD 2-clause](https://github.com/lf-lang/reactor-c/blob/main/LICENSE.md) * @brief Runtime infrastructure (RTI) for distributed Lingua Franca programs. @@ -30,7 +30,6 @@ #include "net_util.h" #include - // Global variables defined in tag.c: extern instant_t start_time; @@ -42,47 +41,55 @@ static rti_remote_t *rti_remote; bool _lf_federate_reports_error = false; // A convenient macro for getting the `federate_info_t *` at index `_idx` -// and casting it. -#define GET_FED_INFO(_idx) (federate_info_t *) rti_remote->base.scheduling_nodes[_idx] +// and casting it. 
+#define GET_FED_INFO(_idx) (federate_info_t *)rti_remote->base.scheduling_nodes[_idx] lf_mutex_t rti_mutex; lf_cond_t received_start_times; lf_cond_t sent_start_time; -extern int lf_critical_section_enter(environment_t* env) { +extern int lf_critical_section_enter(environment_t *env) { return lf_mutex_lock(&rti_mutex); } -extern int lf_critical_section_exit(environment_t* env) { +extern int lf_critical_section_exit(environment_t *env) { return lf_mutex_unlock(&rti_mutex); } -int create_server(int32_t specified_port, uint16_t port, socket_type_t socket_type) { +int create_server(int32_t specified_port, uint16_t port, socket_type_t socket_type) +{ // Timeout time for the communications of the server struct timeval timeout_time = {.tv_sec = TCP_TIMEOUT_TIME / BILLION, .tv_usec = (TCP_TIMEOUT_TIME % BILLION) / 1000}; // Create an IPv4 socket for TCP (not UDP) communication over IP (0). int socket_descriptor = -1; - if (socket_type == TCP) { + if (socket_type == TCP) + { socket_descriptor = create_real_time_tcp_socket_errexit(); - } else if (socket_type == UDP) { + } + else if (socket_type == UDP) + { socket_descriptor = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); // Set the appropriate timeout time timeout_time = (struct timeval){.tv_sec = UDP_TIMEOUT_TIME / BILLION, .tv_usec = (UDP_TIMEOUT_TIME % BILLION) / 1000}; } - if (socket_descriptor < 0) { + if (socket_descriptor < 0) + { lf_print_error_system_failure("Failed to create RTI socket."); } // Set the option for this socket to reuse the same address int true_variable = 1; // setsockopt() requires a reference to the value assigned to an option - if (setsockopt(socket_descriptor, SOL_SOCKET, SO_REUSEADDR, &true_variable, sizeof(int32_t)) < 0) { + if (setsockopt(socket_descriptor, SOL_SOCKET, SO_REUSEADDR, &true_variable, sizeof(int32_t)) < 0) + { lf_print_error("RTI failed to set SO_REUSEADDR option on the socket: %s.", strerror(errno)); } // Set the timeout on the socket so that read and write operations don't block 
for too long - if (setsockopt(socket_descriptor, SOL_SOCKET, SO_RCVTIMEO, (const char*)&timeout_time, sizeof(timeout_time)) < 0) { + if (setsockopt(socket_descriptor, SOL_SOCKET, SO_RCVTIMEO, (const char *)&timeout_time, sizeof(timeout_time)) < 0) + { lf_print_error("RTI failed to set SO_RCVTIMEO option on the socket: %s.", strerror(errno)); } - if (setsockopt(socket_descriptor, SOL_SOCKET, SO_SNDTIMEO, (const char*)&timeout_time, sizeof(timeout_time)) < 0) { + if (setsockopt(socket_descriptor, SOL_SOCKET, SO_SNDTIMEO, (const char *)&timeout_time, sizeof(timeout_time)) < 0) + { lf_print_error("RTI failed to set SO_SNDTIMEO option on the socket: %s.", strerror(errno)); } @@ -117,45 +124,51 @@ int create_server(int32_t specified_port, uint16_t port, socket_type_t socket_ty // Server file descriptor. struct sockaddr_in server_fd; // Zero out the server address structure. - bzero((char *) &server_fd, sizeof(server_fd)); + bzero((char *)&server_fd, sizeof(server_fd)); - server_fd.sin_family = AF_INET; // IPv4 - server_fd.sin_addr.s_addr = INADDR_ANY; // All interfaces, 0.0.0.0. + server_fd.sin_family = AF_INET; // IPv4 + server_fd.sin_addr.s_addr = INADDR_ANY; // All interfaces, 0.0.0.0. // Convert the port number from host byte order to network byte order. server_fd.sin_port = htons(port); int result = bind( - socket_descriptor, - (struct sockaddr *) &server_fd, - sizeof(server_fd)); + socket_descriptor, + (struct sockaddr *)&server_fd, + sizeof(server_fd)); // Try repeatedly to bind to the specified port. int count = 1; - while (result != 0 && count++ < PORT_BIND_RETRY_LIMIT) { + while (result != 0 && count++ < PORT_BIND_RETRY_LIMIT) + { lf_print("RTI failed to get port %d. 
Will try again.", port); lf_sleep(PORT_BIND_RETRY_INTERVAL); server_fd.sin_port = htons(port); result = bind( - socket_descriptor, - (struct sockaddr *) &server_fd, - sizeof(server_fd)); + socket_descriptor, + (struct sockaddr *)&server_fd, + sizeof(server_fd)); } - if (result != 0) { + if (result != 0) + { lf_print_error_and_exit("Failed to bind the RTI socket. Port %d is not available. ", port); } - char* type = "TCP"; - if (socket_type == UDP) { + char *type = "TCP"; + if (socket_type == UDP) + { type = "UDP"; } lf_print("RTI using %s port %d for federation %s.", type, port, rti_remote->federation_id); - if (socket_type == TCP) { + if (socket_type == TCP) + { rti_remote->final_port_TCP = port; // Enable listening for socket connections. // The second argument is the maximum number of queued socket requests, // which according to the Mac man page is limited to 128. listen(socket_descriptor, 128); - } else if (socket_type == UDP) { + } + else if (socket_type == UDP) + { rti_remote->final_port_UDP = port; // No need to listen on the UDP socket } @@ -163,16 +176,16 @@ int create_server(int32_t specified_port, uint16_t port, socket_type_t socket_ty return socket_descriptor; } -void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { - if (e->state == NOT_CONNECTED - || lf_tag_compare(tag, e->last_granted) <= 0 - || lf_tag_compare(tag, e->last_provisionally_granted) < 0 - ) { +void notify_tag_advance_grant(scheduling_node_t *e, tag_t tag) +{ + if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || lf_tag_compare(tag, e->last_provisionally_granted) < 0) + { return; } // Need to make sure that the destination federate's thread has already // sent the starting MSG_TYPE_TIMESTAMP message. - while (e->state == PENDING) { + while (e->state == PENDING) + { // Need to wait here. 
lf_cond_wait(&sent_start_time); } @@ -182,36 +195,34 @@ void notify_tag_advance_grant(scheduling_node_t* e, tag_t tag) { encode_int64(tag.time, &(buffer[1])); encode_int32((int32_t)tag.microstep, &(buffer[1 + sizeof(int64_t)])); - if (rti_remote->base.tracing_enabled) { + if (rti_remote->base.tracing_enabled) + { tracepoint_rti_to_federate(rti_remote->base.trace, send_TAG, e->id, &tag); } // This function is called in notify_advance_grant_if_safe(), which is a long // function. During this call, the socket might close, causing the following write_to_socket // to fail. Consider a failure here a soft failure and update the federate's status. - ssize_t bytes_written = write_to_socket(((federate_info_t*)e)->socket, message_length, buffer); - if (bytes_written < (ssize_t)message_length) { + if (write_to_socket(((federate_info_t *)e)->socket, message_length, buffer)) { lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); - if (bytes_written < 0) { - e->state = NOT_CONNECTED; - // FIXME: We need better error handling, but don't stop other execution here. - } + e->state = NOT_CONNECTED; + // FIXME: We need better error handling, but don't stop other execution here. } else { e->last_granted = tag; LF_PRINT_LOG("RTI sent to federate %d the tag advance grant (TAG) " PRINTF_TAG ".", - e->id, tag.time - start_time, tag.microstep); + e->id, tag.time - start_time, tag.microstep); } } -void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { - if (e->state == NOT_CONNECTED - || lf_tag_compare(tag, e->last_granted) <= 0 - || lf_tag_compare(tag, e->last_provisionally_granted) <= 0 - ) { +void notify_provisional_tag_advance_grant(scheduling_node_t *e, tag_t tag) +{ + if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || lf_tag_compare(tag, e->last_provisionally_granted) <= 0) + { return; } // Need to make sure that the destination federate's thread has already // sent the starting MSG_TYPE_TIMESTAMP message. 
- while (e->state == PENDING) { + while (e->state == PENDING) + { // Need to wait here. lf_cond_wait(&sent_start_time); } @@ -221,21 +232,20 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { encode_int64(tag.time, &(buffer[1])); encode_int32((int32_t)tag.microstep, &(buffer[1 + sizeof(int64_t)])); - if (rti_remote->base.tracing_enabled){ + if (rti_remote->base.tracing_enabled) + { tracepoint_rti_to_federate(rti_remote->base.trace, send_PTAG, e->id, &tag); } // This function is called in notify_advance_grant_if_safe(), which is a long // function. During this call, the socket might close, causing the following write_to_socket // to fail. Consider a failure here a soft failure and update the federate's status. - ssize_t bytes_written = write_to_socket(((federate_info_t*)e)->socket, message_length, buffer); - - if (bytes_written < (ssize_t)message_length) { + if (write_to_socket(((federate_info_t *)e)->socket, message_length, buffer)) { lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); - if (bytes_written < 0) { - e->state = NOT_CONNECTED; - // FIXME: We need better error handling, but don't stop other execution here. - } - } else { + e->state = NOT_CONNECTED; + // FIXME: We need better error handling, but don't stop other execution here. + } + else + { e->last_provisionally_granted = tag; LF_PRINT_LOG("RTI sent to federate %d the Provisional Tag Advance Grant (PTAG) " PRINTF_TAG ".", e->id, tag.time - start_time, tag.microstep); @@ -249,105 +259,115 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag) { // Note that this is transitive. // NOTE: This is not needed for enclaves because zero-delay loops are prohibited. // It's only needed for federates, which is why this is implemented here. 
- for (int j = 0; j < e->num_upstream; j++) { - scheduling_node_t* upstream = rti_remote->base.scheduling_nodes[e->upstream[j]]; + for (int j = 0; j < e->num_upstream; j++) + { + scheduling_node_t *upstream = rti_remote->base.scheduling_nodes[e->upstream[j]]; // Ignore this federate if it has resigned. - if (upstream->state == NOT_CONNECTED) continue; + if (upstream->state == NOT_CONNECTED) + continue; tag_t earliest = earliest_future_incoming_message_tag(upstream); tag_t strict_earliest = eimt_strict(upstream); // If these tags are equal, then a TAG or PTAG should have already been granted, // in which case, another will not be sent. But it may not have been already granted. - if (lf_tag_compare(earliest, tag) > 0) { + if (lf_tag_compare(earliest, tag) > 0) + { notify_tag_advance_grant(upstream, tag); - } else if(lf_tag_compare(earliest, tag) == 0 && lf_tag_compare(strict_earliest, tag) > 0) { + } + else if (lf_tag_compare(earliest, tag) == 0 && lf_tag_compare(strict_earliest, tag) > 0) + { notify_provisional_tag_advance_grant(upstream, tag); } } } } -void update_federate_next_event_tag_locked(uint16_t federate_id, tag_t next_event_tag) { - federate_info_t* fed = GET_FED_INFO(federate_id); +void update_federate_next_event_tag_locked(uint16_t federate_id, tag_t next_event_tag) +{ + federate_info_t *fed = GET_FED_INFO(federate_id); tag_t min_in_transit_tag = get_minimum_in_transit_message_tag(fed->in_transit_message_tags); if (lf_tag_compare( min_in_transit_tag, - next_event_tag - ) < 0 - ) { + next_event_tag) < 0) + { next_event_tag = min_in_transit_tag; } update_scheduling_node_next_event_tag_locked(&(fed->enclave), next_event_tag); } -void handle_port_absent_message(federate_info_t* sending_federate, unsigned char* buffer) { +void handle_port_absent_message(federate_info_t *sending_federate, unsigned char *buffer) +{ size_t message_size = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int64_t) + sizeof(uint32_t); read_from_socket_errexit(sending_federate->socket, 
message_size, &(buffer[1]), - " RTI failed to read port absent message from federate %u.", - sending_federate->enclave.id); + " RTI failed to read port absent message from federate %u.", + sending_federate->enclave.id); uint16_t reactor_port_id = extract_uint16(&(buffer[1])); uint16_t federate_id = extract_uint16(&(buffer[1 + sizeof(uint16_t)])); tag_t tag = extract_tag(&(buffer[1 + 2 * sizeof(uint16_t)])); - if (rti_remote->base.tracing_enabled) { + if (rti_remote->base.tracing_enabled) + { tracepoint_rti_from_federate(rti_remote->base.trace, receive_PORT_ABS, sending_federate->enclave.id, &tag); } // Need to acquire the mutex lock to ensure that the thread handling // messages coming from the socket connected to the destination does not // issue a TAG before this message has been forwarded. - lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(&rti_mutex); // If the destination federate is no longer connected, issue a warning // and return. - federate_info_t* fed = GET_FED_INFO(federate_id); - if (fed->enclave.state == NOT_CONNECTED) { - lf_mutex_unlock(&rti_mutex); + federate_info_t *fed = GET_FED_INFO(federate_id); + if (fed->enclave.state == NOT_CONNECTED) + { + LF_MUTEX_UNLOCK(&rti_mutex); lf_print_warning("RTI: Destination federate %d is no longer connected. 
Dropping message.", - federate_id); + federate_id); LF_PRINT_LOG("Fed status: next_event (" PRINTF_TIME ", %d), " - "completed (" PRINTF_TIME ", %d), " - "last_granted (" PRINTF_TIME ", %d), " - "last_provisionally_granted (" PRINTF_TIME ", %d).", - fed->enclave.next_event.time - start_time, - fed->enclave.next_event.microstep, - fed->enclave.completed.time - start_time, - fed->enclave.completed.microstep, - fed->enclave.last_granted.time - start_time, - fed->enclave.last_granted.microstep, - fed->enclave.last_provisionally_granted.time - start_time, - fed->enclave.last_provisionally_granted.microstep - ); + "completed (" PRINTF_TIME ", %d), " + "last_granted (" PRINTF_TIME ", %d), " + "last_provisionally_granted (" PRINTF_TIME ", %d).", + fed->enclave.next_event.time - start_time, + fed->enclave.next_event.microstep, + fed->enclave.completed.time - start_time, + fed->enclave.completed.microstep, + fed->enclave.last_granted.time - start_time, + fed->enclave.last_granted.microstep, + fed->enclave.last_provisionally_granted.time - start_time, + fed->enclave.last_provisionally_granted.microstep); return; } LF_PRINT_LOG("RTI forwarding port absent message for port %u to federate %u.", - reactor_port_id, - federate_id); + reactor_port_id, + federate_id); // Need to make sure that the destination federate's thread has already // sent the starting MSG_TYPE_TIMESTAMP message. - while (fed->enclave.state == PENDING) { + while (fed->enclave.state == PENDING) + { // Need to wait here. lf_cond_wait(&sent_start_time); } // Forward the message. 
- int destination_socket = fed->socket; - if (rti_remote->base.tracing_enabled) { + write_to_socket_fail_on_error(&fed->socket, message_size + 1, buffer, &rti_mutex, + "RTI failed to forward message to federate %d.", federate_id); + + LF_MUTEX_UNLOCK(&rti_mutex); + + if (rti_remote->base.tracing_enabled) + { tracepoint_rti_to_federate(rti_remote->base.trace, send_PORT_ABS, federate_id, &tag); } - write_to_socket_errexit(destination_socket, message_size + 1, buffer, - "RTI failed to forward message to federate %d.", federate_id); - - lf_mutex_unlock(&rti_mutex); } -void handle_timed_message(federate_info_t* sending_federate, unsigned char* buffer) { +void handle_timed_message(federate_info_t *sending_federate, unsigned char *buffer) +{ size_t header_size = 1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t) + sizeof(int64_t) + sizeof(uint32_t); // Read the header, minus the first byte which has already been read. read_from_socket_errexit(sending_federate->socket, header_size - 1, &(buffer[1]), "RTI failed to read the timed message header from remote federate."); @@ -362,84 +382,83 @@ void handle_timed_message(federate_info_t* sending_federate, unsigned char* buff size_t total_bytes_to_read = length + header_size; size_t bytes_to_read = length; - if (FED_COM_BUFFER_SIZE < header_size + 1) { + if (FED_COM_BUFFER_SIZE < header_size + 1) + { lf_print_error_and_exit("Buffer size (%d) is not large enough to " - "read the header plus one byte.", - FED_COM_BUFFER_SIZE); + "read the header plus one byte.", + FED_COM_BUFFER_SIZE); } // Cut up the payload in chunks. - if (bytes_to_read > FED_COM_BUFFER_SIZE - header_size) { + if (bytes_to_read > FED_COM_BUFFER_SIZE - header_size) + { bytes_to_read = FED_COM_BUFFER_SIZE - header_size; } - LF_PRINT_LOG("RTI received message from federate %d for federate %u port %u with intended tag " - PRINTF_TAG ". 
Forwarding.", - sending_federate->enclave.id, federate_id, reactor_port_id, - intended_tag.time - lf_time_start(), intended_tag.microstep); + LF_PRINT_LOG("RTI received message from federate %d for federate %u port %u with intended tag " PRINTF_TAG ". Forwarding.", + sending_federate->enclave.id, federate_id, reactor_port_id, + intended_tag.time - lf_time_start(), intended_tag.microstep); read_from_socket_errexit(sending_federate->socket, bytes_to_read, &(buffer[header_size]), - "RTI failed to read timed message from federate %d.", federate_id); + "RTI failed to read timed message from federate %d.", federate_id); size_t bytes_read = bytes_to_read + header_size; // Following only works for string messages. // LF_PRINT_DEBUG("Message received by RTI: %s.", buffer + header_size); - if (rti_remote->base.tracing_enabled) { + if (rti_remote->base.tracing_enabled) + { tracepoint_rti_from_federate(rti_remote->base.trace, receive_TAGGED_MSG, sending_federate->enclave.id, &intended_tag); } // Need to acquire the mutex lock to ensure that the thread handling // messages coming from the socket connected to the destination does not // issue a TAG before this message has been forwarded. - lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(&rti_mutex); // If the destination federate is no longer connected, issue a warning // and return. federate_info_t *fed = GET_FED_INFO(federate_id); - if (fed->enclave.state == NOT_CONNECTED) { - lf_mutex_unlock(&rti_mutex); + if (fed->enclave.state == NOT_CONNECTED) + { + LF_MUTEX_UNLOCK(&rti_mutex); lf_print_warning("RTI: Destination federate %d is no longer connected. 
Dropping message.", - federate_id); + federate_id); LF_PRINT_LOG("Fed status: next_event (" PRINTF_TIME ", %d), " - "completed (" PRINTF_TIME ", %d), " - "last_granted (" PRINTF_TIME ", %d), " - "last_provisionally_granted (" PRINTF_TIME ", %d).", - fed->enclave.next_event.time - start_time, - fed->enclave.next_event.microstep, - fed->enclave.completed.time - start_time, - fed->enclave.completed.microstep, - fed->enclave.last_granted.time - start_time, - fed->enclave.last_granted.microstep, - fed->enclave.last_provisionally_granted.time - start_time, - fed->enclave.last_provisionally_granted.microstep - ); + "completed (" PRINTF_TIME ", %d), " + "last_granted (" PRINTF_TIME ", %d), " + "last_provisionally_granted (" PRINTF_TIME ", %d).", + fed->enclave.next_event.time - start_time, + fed->enclave.next_event.microstep, + fed->enclave.completed.time - start_time, + fed->enclave.completed.microstep, + fed->enclave.last_granted.time - start_time, + fed->enclave.last_granted.microstep, + fed->enclave.last_provisionally_granted.time - start_time, + fed->enclave.last_provisionally_granted.microstep); return; } - // Forward the message or message chunk. - int destination_socket = fed->socket; - LF_PRINT_DEBUG( "RTI forwarding message to port %d of federate %hu of length %zu.", reactor_port_id, federate_id, - length - ); + length); // Record this in-transit message in federate's in-transit message queue. - if (lf_tag_compare(fed->enclave.completed, intended_tag) < 0) { + if (lf_tag_compare(fed->enclave.completed, intended_tag) < 0) + { // Add a record of this message to the list of in-transit messages to this federate. 
add_in_transit_message_record( fed->in_transit_message_tags, - intended_tag - ); + intended_tag); LF_PRINT_DEBUG( "RTI: Adding a message with tag " PRINTF_TAG " to the list of in-transit messages for federate %d.", intended_tag.time - lf_time_start(), intended_tag.microstep, - federate_id - ); - } else { + federate_id); + } + else + { lf_print_error( "RTI: Federate %d has already completed tag " PRINTF_TAG ", but there is an in-transit message with tag " PRINTF_TAG " from federate %hu. " @@ -449,93 +468,98 @@ void handle_timed_message(federate_info_t* sending_federate, unsigned char* buff fed->enclave.completed.microstep, intended_tag.time - lf_time_start(), intended_tag.microstep, - sending_federate->enclave.id - ); + sending_federate->enclave.id); // FIXME: Drop the federate? } // Need to make sure that the destination federate's thread has already // sent the starting MSG_TYPE_TIMESTAMP message. - while (fed->enclave.state == PENDING) { + while (fed->enclave.state == PENDING) + { // Need to wait here. lf_cond_wait(&sent_start_time); } - if (rti_remote->base.tracing_enabled) { + if (rti_remote->base.tracing_enabled) + { tracepoint_rti_to_federate(rti_remote->base.trace, send_TAGGED_MSG, federate_id, &intended_tag); } - write_to_socket_errexit(destination_socket, bytes_read, buffer, - "RTI failed to forward message to federate %d.", federate_id); + write_to_socket_fail_on_error(&fed->socket, bytes_read, buffer, &rti_mutex, + "RTI failed to forward message to federate %d.", federate_id); // The message length may be longer than the buffer, // in which case we have to handle it in chunks. 
size_t total_bytes_read = bytes_read; - while (total_bytes_read < total_bytes_to_read) { + while (total_bytes_read < total_bytes_to_read) + { LF_PRINT_DEBUG("Forwarding message in chunks."); bytes_to_read = total_bytes_to_read - total_bytes_read; - if (bytes_to_read > FED_COM_BUFFER_SIZE) { + if (bytes_to_read > FED_COM_BUFFER_SIZE) + { bytes_to_read = FED_COM_BUFFER_SIZE; } read_from_socket_errexit(sending_federate->socket, bytes_to_read, buffer, - "RTI failed to read message chunks."); + "RTI failed to read message chunks."); total_bytes_read += bytes_to_read; // FIXME: a mutex needs to be held for this so that other threads // do not write to destination_socket and cause interleaving. However, // holding the rti_mutex might be very expensive. Instead, each outgoing // socket should probably have its own mutex. - write_to_socket_errexit(destination_socket, bytes_to_read, buffer, - "RTI failed to send message chunks."); + write_to_socket_fail_on_error(&fed->socket, bytes_to_read, buffer, &rti_mutex, + "RTI failed to send message chunks."); } update_federate_next_event_tag_locked(federate_id, intended_tag); - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(&rti_mutex); } -void handle_logical_tag_complete(federate_info_t* fed) { +void handle_logical_tag_complete(federate_info_t *fed) +{ unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; read_from_socket_errexit(fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, - "RTI failed to read the content of the logical tag complete from federate %d.", fed->enclave.id); + "RTI failed to read the content of the logical tag complete from federate %d.", fed->enclave.id); tag_t completed = extract_tag(buffer); - if (rti_remote->base.tracing_enabled) { + if (rti_remote->base.tracing_enabled) + { tracepoint_rti_from_federate(rti_remote->base.trace, receive_LTC, fed->enclave.id, &completed); } _logical_tag_complete(&(fed->enclave), completed); // FIXME: Should this function be in the enclave version? 
- lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(&rti_mutex); // See if we can remove any of the recorded in-transit messages for this. clean_in_transit_message_record_up_to_tag(fed->in_transit_message_tags, fed->enclave.completed); - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(&rti_mutex); } -void handle_next_event_tag(federate_info_t* fed) { +void handle_next_event_tag(federate_info_t *fed) +{ unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; read_from_socket_errexit(fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, - "RTI failed to read the content of the next event tag from federate %d.", fed->enclave.id); + "RTI failed to read the content of the next event tag from federate %d.", fed->enclave.id); // Acquire a mutex lock to ensure that this state does not change while a // message is in transport or being used to determine a TAG. - lf_mutex_lock(&rti_mutex); // FIXME: Instead of using a mutex, - // it might be more efficient to use a - // select() mechanism to read and process - // federates' buffers in an orderly fashion. - + LF_MUTEX_LOCK(&rti_mutex); // FIXME: Instead of using a mutex, + // it might be more efficient to use a + // select() mechanism to read and process + // federates' buffers in an orderly fashion. 
tag_t intended_tag = extract_tag(buffer); - if (rti_remote->base.tracing_enabled) { + if (rti_remote->base.tracing_enabled) + { tracepoint_rti_from_federate(rti_remote->base.trace, receive_NET, fed->enclave.id, &intended_tag); } LF_PRINT_LOG("RTI received from federate %d the Next Event Tag (NET) " PRINTF_TAG, - fed->enclave.id, intended_tag.time - start_time, - intended_tag.microstep); + fed->enclave.id, intended_tag.time - start_time, + intended_tag.microstep); update_federate_next_event_tag_locked( fed->enclave.id, - intended_tag - ); - lf_mutex_unlock(&rti_mutex); + intended_tag); + LF_MUTEX_UNLOCK(&rti_mutex); } /////////////////// STOP functions //////////////////// @@ -552,10 +576,12 @@ bool _lf_rti_stop_granted_already_sent_to_federates = false; * This function also checks the most recently received NET from * each federate and resets that be no greater than the _RTI.max_stop_tag. * - * This function assumes the caller holds the _RTI.rti_mutex lock. + * This function assumes the caller holds the rti_mutex lock. */ -void _lf_rti_broadcast_stop_time_to_federates_locked() { - if (_lf_rti_stop_granted_already_sent_to_federates == true) { +void _lf_rti_broadcast_stop_time_to_federates_locked() +{ + if (_lf_rti_stop_granted_already_sent_to_federates == true) + { return; } // Reply with a stop granted to all federates @@ -563,85 +589,97 @@ void _lf_rti_broadcast_stop_time_to_federates_locked() { ENCODE_STOP_GRANTED(outgoing_buffer, rti_remote->base.max_stop_tag.time, rti_remote->base.max_stop_tag.microstep); // Iterate over federates and send each the message. 
- for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) + { federate_info_t *fed = GET_FED_INFO(i); - if (fed->enclave.state == NOT_CONNECTED) { + if (fed->enclave.state == NOT_CONNECTED) + { continue; } - if (lf_tag_compare(fed->enclave.next_event, rti_remote->base.max_stop_tag) >= 0) { + if (lf_tag_compare(fed->enclave.next_event, rti_remote->base.max_stop_tag) >= 0) + { // Need the next_event to be no greater than the stop tag. fed->enclave.next_event = rti_remote->base.max_stop_tag; } - if (rti_remote->base.tracing_enabled) { + if (rti_remote->base.tracing_enabled) + { tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP_GRN, fed->enclave.id, &rti_remote->base.max_stop_tag); } - write_to_socket_errexit(fed->socket, MSG_TYPE_STOP_GRANTED_LENGTH, outgoing_buffer, - "RTI failed to send MSG_TYPE_STOP_GRANTED message to federate %d.", fed->enclave.id); + write_to_socket_fail_on_error(&fed->socket, MSG_TYPE_STOP_GRANTED_LENGTH, outgoing_buffer, &rti_mutex, + "RTI failed to send MSG_TYPE_STOP_GRANTED message to federate %d.", fed->enclave.id); } LF_PRINT_LOG("RTI sent to federates MSG_TYPE_STOP_GRANTED with tag (" PRINTF_TIME ", %u).", - rti_remote->base.max_stop_tag.time - start_time, - rti_remote->base.max_stop_tag.microstep); + rti_remote->base.max_stop_tag.time - start_time, + rti_remote->base.max_stop_tag.microstep); _lf_rti_stop_granted_already_sent_to_federates = true; } -void mark_federate_requesting_stop(federate_info_t* fed) { - if (!fed->requested_stop) { +void mark_federate_requesting_stop(federate_info_t *fed) +{ + if (!fed->requested_stop) + { // Assume that the federate // has requested stop rti_remote->base.num_scheduling_nodes_handling_stop++; fed->requested_stop = true; } - if (rti_remote->base.num_scheduling_nodes_handling_stop == rti_remote->base.number_of_scheduling_nodes) { + if (rti_remote->base.num_scheduling_nodes_handling_stop == 
rti_remote->base.number_of_scheduling_nodes) + { // We now have information about the stop time of all // federates. _lf_rti_broadcast_stop_time_to_federates_locked(); } } -void handle_stop_request_message(federate_info_t* fed) { +void handle_stop_request_message(federate_info_t *fed) +{ LF_PRINT_DEBUG("RTI handling stop_request from federate %d.", fed->enclave.id); size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_LENGTH - 1; unsigned char buffer[bytes_to_read]; read_from_socket_errexit(fed->socket, bytes_to_read, buffer, - "RTI failed to read the MSG_TYPE_STOP_REQUEST payload from federate %d.", fed->enclave.id); + "RTI failed to read the MSG_TYPE_STOP_REQUEST payload from federate %d.", fed->enclave.id); // Extract the proposed stop tag for the federate tag_t proposed_stop_tag = extract_tag(buffer); - if (rti_remote->base.tracing_enabled) { + if (rti_remote->base.tracing_enabled) + { tracepoint_rti_from_federate(rti_remote->base.trace, receive_STOP_REQ, fed->enclave.id, &proposed_stop_tag); } // Acquire a mutex lock to ensure that this state does change while a // message is in transport or being used to determine a TAG. 
- lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(&rti_mutex); // Check whether we have already received a stop_tag // from this federate - if (fed->requested_stop) { + if (fed->requested_stop) + { // Ignore this request - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(&rti_mutex); return; } // Update the maximum stop tag received from federates - if (lf_tag_compare(proposed_stop_tag, rti_remote->base.max_stop_tag) > 0) { + if (lf_tag_compare(proposed_stop_tag, rti_remote->base.max_stop_tag) > 0) + { rti_remote->base.max_stop_tag = proposed_stop_tag; } LF_PRINT_LOG("RTI received from federate %d a MSG_TYPE_STOP_REQUEST message with tag " PRINTF_TAG ".", - fed->enclave.id, proposed_stop_tag.time - start_time, proposed_stop_tag.microstep); + fed->enclave.id, proposed_stop_tag.time - start_time, proposed_stop_tag.microstep); // If this federate has not already asked // for a stop, add it to the tally. mark_federate_requesting_stop(fed); - if (rti_remote->base.num_scheduling_nodes_handling_stop == rti_remote->base.number_of_scheduling_nodes) { + if (rti_remote->base.num_scheduling_nodes_handling_stop == rti_remote->base.number_of_scheduling_nodes) + { // We now have information about the stop time of all // federates, and mark_federate_requesting_stop has sent out stop time to. - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(&rti_mutex); return; } // Forward the stop request to all other federates that have not @@ -651,58 +689,67 @@ void handle_stop_request_message(federate_info_t* fed) { // Iterate over federates and send each the MSG_TYPE_STOP_REQUEST message // if we do not have a stop_time already for them. Do not do this more than once. 
- if (rti_remote->stop_in_progress) { - lf_mutex_unlock(&rti_mutex); + if (rti_remote->stop_in_progress) + { + LF_MUTEX_UNLOCK(&rti_mutex); return; } rti_remote->stop_in_progress = true; - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) + { federate_info_t *f = GET_FED_INFO(i); - if (f->enclave.id != fed->enclave.id && f->requested_stop == false) { - if (f->enclave.state == NOT_CONNECTED) { + if (f->enclave.id != fed->enclave.id && f->requested_stop == false) + { + if (f->enclave.state == NOT_CONNECTED) + { mark_federate_requesting_stop(f); continue; } - if (rti_remote->base.tracing_enabled) { + if (rti_remote->base.tracing_enabled) + { tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP_REQ, f->enclave.id, &rti_remote->base.max_stop_tag); } - write_to_socket_errexit(f->socket, MSG_TYPE_STOP_REQUEST_LENGTH, stop_request_buffer, - "RTI failed to forward MSG_TYPE_STOP_REQUEST message to federate %d.", f->enclave.id); - if (rti_remote->base.tracing_enabled) { + write_to_socket_fail_on_error(&f->socket, MSG_TYPE_STOP_REQUEST_LENGTH, stop_request_buffer, &rti_mutex, + "RTI failed to forward MSG_TYPE_STOP_REQUEST message to federate %d.", f->enclave.id); + if (rti_remote->base.tracing_enabled) + { tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP_REQ, f->enclave.id, &rti_remote->base.max_stop_tag); } } } LF_PRINT_LOG("RTI forwarded to federates MSG_TYPE_STOP_REQUEST with tag (" PRINTF_TIME ", %u).", - rti_remote->base.max_stop_tag.time - start_time, - rti_remote->base.max_stop_tag.microstep); - lf_mutex_unlock(&rti_mutex); + rti_remote->base.max_stop_tag.time - start_time, + rti_remote->base.max_stop_tag.microstep); + LF_MUTEX_UNLOCK(&rti_mutex); } -void handle_stop_request_reply(federate_info_t* fed) { +void handle_stop_request_reply(federate_info_t *fed) +{ size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_REPLY_LENGTH - 1; unsigned char 
buffer_stop_time[bytes_to_read]; read_from_socket_errexit(fed->socket, bytes_to_read, buffer_stop_time, - "RTI failed to read the reply to MSG_TYPE_STOP_REQUEST message from federate %d.", fed->enclave.id); + "RTI failed to read the reply to MSG_TYPE_STOP_REQUEST message from federate %d.", fed->enclave.id); tag_t federate_stop_tag = extract_tag(buffer_stop_time); - if (rti_remote->base.tracing_enabled) { + if (rti_remote->base.tracing_enabled) + { tracepoint_rti_from_federate(rti_remote->base.trace, receive_STOP_REQ_REP, fed->enclave.id, &federate_stop_tag); } LF_PRINT_LOG("RTI received from federate %d STOP reply tag " PRINTF_TAG ".", fed->enclave.id, - federate_stop_tag.time - start_time, - federate_stop_tag.microstep); + federate_stop_tag.time - start_time, + federate_stop_tag.microstep); // Acquire the mutex lock so that we can change the state of the RTI - lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(&rti_mutex); // If the federate has not requested stop before, count the reply - if (lf_tag_compare(federate_stop_tag, rti_remote->base.max_stop_tag) > 0) { + if (lf_tag_compare(federate_stop_tag, rti_remote->base.max_stop_tag) > 0) + { rti_remote->base.max_stop_tag = federate_stop_tag; } mark_federate_requesting_stop(fed); - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(&rti_mutex); } ////////////////////////////////////////////////// @@ -711,14 +758,14 @@ void handle_address_query(uint16_t fed_id) { federate_info_t *fed = GET_FED_INFO(fed_id); // Use buffer both for reading and constructing the reply. // The length is what is needed for the reply. 
- unsigned char buffer[sizeof(int32_t)]; - ssize_t bytes_read = read_from_socket(fed->socket, sizeof(uint16_t), (unsigned char*)buffer); + unsigned char buffer[1 + sizeof(int32_t)]; + ssize_t bytes_read = read_from_socket(fed->socket, sizeof(uint16_t), (unsigned char *)buffer); if (bytes_read == 0) { lf_print_error_and_exit("Failed to read address query."); } uint16_t remote_fed_id = extract_uint16(buffer); - if (rti_remote->base.tracing_enabled){ + if (rti_remote->base.tracing_enabled) { tracepoint_rti_from_federate(rti_remote->base.trace, receive_ADR_QR, fed_id, NULL); } @@ -728,22 +775,28 @@ void handle_address_query(uint16_t fed_id) { // the port number because it has not yet received an MSG_TYPE_ADDRESS_ADVERTISEMENT message // from this federate. In that case, it will respond by sending -1. + // Response message is also of type MSG_TYPE_ADDRESS_QUERY. + buffer[0] = MSG_TYPE_ADDRESS_QUERY; + // Encode the port number. federate_info_t *remote_fed = GET_FED_INFO(remote_fed_id); - encode_int32(remote_fed->server_port, (unsigned char*)buffer); + // Send the port number (which could be -1). - write_to_socket_errexit(fed->socket, sizeof(int32_t), (unsigned char*)buffer, - "Failed to write port number to socket of federate %d.", fed_id); + LF_MUTEX_LOCK(&rti_mutex); + encode_int32(remote_fed->server_port, (unsigned char *)&buffer[1]); + write_to_socket_fail_on_error( + &fed->socket, sizeof(int32_t) + 1, (unsigned char *)buffer, &rti_mutex, + "Failed to write port number to socket of federate %d.", fed_id); // Send the server IP address to federate. 
- write_to_socket_errexit(fed->socket, sizeof(remote_fed->server_ip_addr), - (unsigned char *)&remote_fed->server_ip_addr, - "Failed to write ip address to socket of federate %d.", fed_id); - - if (remote_fed->server_port != -1) { - LF_PRINT_DEBUG("Replied to address query from federate %d with address %s:%d.", - fed_id, remote_fed->server_hostname, remote_fed->server_port); - } + write_to_socket_fail_on_error( + &fed->socket, sizeof(remote_fed->server_ip_addr), + (unsigned char *)&remote_fed->server_ip_addr, &rti_mutex, + "Failed to write ip address to socket of federate %d.", fed_id); + LF_MUTEX_UNLOCK(&rti_mutex); + + LF_PRINT_DEBUG("Replied to address query from federate %d with address %s:%d.", + fed_id, remote_fed->server_hostname, remote_fed->server_port); } void handle_address_ad(uint16_t federate_id) { @@ -764,48 +817,57 @@ void handle_address_ad(uint16_t federate_id) { assert(server_port < 65536); - lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(&rti_mutex); fed->server_port = server_port; + LF_MUTEX_UNLOCK(&rti_mutex); + + LF_PRINT_LOG("Received address advertisement with port %d from federate %d.", server_port, federate_id); if (rti_remote->base.tracing_enabled) { tracepoint_rti_from_federate(rti_remote->base.trace, receive_ADR_AD, federate_id, NULL); } - LF_PRINT_LOG("Received address advertisement from federate %d.", federate_id); - lf_mutex_unlock(&rti_mutex); } -void handle_timestamp(federate_info_t *my_fed) { +void handle_timestamp(federate_info_t *my_fed) +{ unsigned char buffer[sizeof(int64_t)]; // Read bytes from the socket. We need 8 bytes. 
- ssize_t bytes_read = read_from_socket(my_fed->socket, sizeof(int64_t), (unsigned char*)&buffer); - if (bytes_read < (ssize_t)sizeof(int64_t)) { + ssize_t bytes_read = read_from_socket(my_fed->socket, sizeof(int64_t), (unsigned char *)&buffer); + if (bytes_read < (ssize_t)sizeof(int64_t)) + { lf_print_error("ERROR reading timestamp from federate %d.\n", my_fed->enclave.id); } int64_t timestamp = swap_bytes_if_big_endian_int64(*((int64_t *)(&buffer))); - if (rti_remote->base.tracing_enabled) { + if (rti_remote->base.tracing_enabled) + { tag_t tag = {.time = timestamp, .microstep = 0}; tracepoint_rti_from_federate(rti_remote->base.trace, receive_TIMESTAMP, my_fed->enclave.id, &tag); } LF_PRINT_DEBUG("RTI received timestamp message with time: " PRINTF_TIME ".", timestamp); - lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(&rti_mutex); rti_remote->num_feds_proposed_start++; - if (timestamp > rti_remote->max_start_time) { + if (timestamp > rti_remote->max_start_time) + { rti_remote->max_start_time = timestamp; } - if (rti_remote->num_feds_proposed_start == rti_remote->base.number_of_scheduling_nodes) { + if (rti_remote->num_feds_proposed_start == rti_remote->base.number_of_scheduling_nodes) + { // All federates have proposed a start time. lf_cond_broadcast(&received_start_times); - } else { + } + else + { // Some federates have not yet proposed a start time. // wait for a notification. - while (rti_remote->num_feds_proposed_start < rti_remote->base.number_of_scheduling_nodes) { + while (rti_remote->num_feds_proposed_start < rti_remote->base.number_of_scheduling_nodes) + { // FIXME: Should have a timeout here? lf_cond_wait(&received_start_times); } } - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(&rti_mutex); // Send back to the federate the maximum time plus an offset on a TIMESTAMP // message. 
@@ -815,32 +877,31 @@ void handle_timestamp(federate_info_t *my_fed) { start_time = rti_remote->max_start_time + DELAY_START; encode_int64(swap_bytes_if_big_endian_int64(start_time), &start_time_buffer[1]); - if (rti_remote->base.tracing_enabled) { + if (rti_remote->base.tracing_enabled) + { tag_t tag = {.time = start_time, .microstep = 0}; tracepoint_rti_to_federate(rti_remote->base.trace, send_TIMESTAMP, my_fed->enclave.id, &tag); } - ssize_t bytes_written = write_to_socket( - my_fed->socket, MSG_TYPE_TIMESTAMP_LENGTH, - start_time_buffer - ); - if (bytes_written < MSG_TYPE_TIMESTAMP_LENGTH) { + if (write_to_socket(my_fed->socket, MSG_TYPE_TIMESTAMP_LENGTH, start_time_buffer)) { lf_print_error("Failed to send the starting time to federate %d.", my_fed->enclave.id); } - lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(&rti_mutex); // Update state for the federate to indicate that the MSG_TYPE_TIMESTAMP // message has been sent. That MSG_TYPE_TIMESTAMP message grants time advance to // the federate to the start time. my_fed->enclave.state = GRANTED; lf_cond_broadcast(&sent_start_time); LF_PRINT_LOG("RTI sent start time " PRINTF_TIME " to federate %d.", start_time, my_fed->enclave.id); - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(&rti_mutex); } -void send_physical_clock(unsigned char message_type, federate_info_t* fed, socket_type_t socket_type) { - if (fed->enclave.state == NOT_CONNECTED) { +void send_physical_clock(unsigned char message_type, federate_info_t *fed, socket_type_t socket_type) +{ + if (fed->enclave.state == NOT_CONNECTED) + { lf_print_warning("Clock sync: RTI failed to send physical time to federate %d. 
Socket not connected.\n", - fed->enclave.id); + fed->enclave.id); return; } unsigned char buffer[sizeof(int64_t) + 1]; @@ -849,80 +910,95 @@ void send_physical_clock(unsigned char message_type, federate_info_t* fed, socke encode_int64(current_physical_time, &(buffer[1])); // Send the message - if (socket_type == UDP) { + if (socket_type == UDP) + { // FIXME: UDP_addr is never initialized. LF_PRINT_DEBUG("Clock sync: RTI sending UDP message type %u.", buffer[0]); ssize_t bytes_written = sendto(rti_remote->socket_descriptor_UDP, buffer, 1 + sizeof(int64_t), 0, - (struct sockaddr*)&fed->UDP_addr, sizeof(fed->UDP_addr)); - if (bytes_written < (ssize_t)sizeof(int64_t) + 1) { + (struct sockaddr *)&fed->UDP_addr, sizeof(fed->UDP_addr)); + if (bytes_written < (ssize_t)sizeof(int64_t) + 1) + { lf_print_warning("Clock sync: RTI failed to send physical time to federate %d: %s\n", - fed->enclave.id, - strerror(errno)); + fed->enclave.id, + strerror(errno)); return; } - } else if (socket_type == TCP) { + } + else if (socket_type == TCP) + { LF_PRINT_DEBUG("Clock sync: RTI sending TCP message type %u.", buffer[0]); - write_to_socket_errexit(fed->socket, 1 + sizeof(int64_t), buffer, - "Clock sync: RTI failed to send physical time to federate %d: %s.", - fed->enclave.id, - strerror(errno)); + LF_MUTEX_LOCK(&rti_mutex); + write_to_socket_fail_on_error(&fed->socket, 1 + sizeof(int64_t), buffer, &rti_mutex, + "Clock sync: RTI failed to send physical time to federate %d.", + fed->enclave.id); + LF_MUTEX_UNLOCK(&rti_mutex); } LF_PRINT_DEBUG("Clock sync: RTI sent PHYSICAL_TIME_SYNC_MESSAGE with timestamp " PRINTF_TIME " to federate %d.", - current_physical_time, - fed->enclave.id); + current_physical_time, + fed->enclave.id); } -void handle_physical_clock_sync_message(federate_info_t* my_fed, socket_type_t socket_type) { +void handle_physical_clock_sync_message(federate_info_t *my_fed, socket_type_t socket_type) +{ // Lock the mutex to prevent interference between sending the two // 
coded probe messages. - lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(&rti_mutex); // Reply with a T4 type message send_physical_clock(MSG_TYPE_CLOCK_SYNC_T4, my_fed, socket_type); // Send the corresponding coded probe immediately after, // but only if this is a UDP channel. - if (socket_type == UDP) { + if (socket_type == UDP) + { send_physical_clock(MSG_TYPE_CLOCK_SYNC_CODED_PROBE, my_fed, socket_type); } - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(&rti_mutex); } -void* clock_synchronization_thread(void* noargs) { +void *clock_synchronization_thread(void *noargs) +{ // Wait until all federates have been notified of the start time. // FIXME: Use lf_ version of this when merged with master. - lf_mutex_lock(&rti_mutex); - while (rti_remote->num_feds_proposed_start < rti_remote->base.number_of_scheduling_nodes) { + LF_MUTEX_LOCK(&rti_mutex); + while (rti_remote->num_feds_proposed_start < rti_remote->base.number_of_scheduling_nodes) + { lf_cond_wait(&received_start_times); } - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(&rti_mutex); // Wait until the start time before starting clock synchronization. // The above wait ensures that start_time has been set. 
interval_t ns_to_wait = start_time - lf_time_physical(); - if (ns_to_wait > 0LL) { + if (ns_to_wait > 0LL) + { lf_sleep(ns_to_wait); } // Initiate a clock synchronization every rti->clock_sync_period_ns // Initiate a clock synchronization every rti->clock_sync_period_ns - struct timespec sleep_time = {(time_t) rti_remote->clock_sync_period_ns / BILLION, + struct timespec sleep_time = {(time_t)rti_remote->clock_sync_period_ns / BILLION, rti_remote->clock_sync_period_ns % BILLION}; struct timespec remaining_time; bool any_federates_connected = true; - while (any_federates_connected) { + while (any_federates_connected) + { // Sleep lf_sleep(rti_remote->clock_sync_period_ns); // Can be interrupted any_federates_connected = false; - for (int fed_id = 0; fed_id < rti_remote->base.number_of_scheduling_nodes; fed_id++) { - federate_info_t* fed = GET_FED_INFO(fed_id); - if (fed->enclave.state == NOT_CONNECTED) { + for (int fed_id = 0; fed_id < rti_remote->base.number_of_scheduling_nodes; fed_id++) + { + federate_info_t *fed = GET_FED_INFO(fed_id); + if (fed->enclave.state == NOT_CONNECTED) + { // FIXME: We need better error handling here, but clock sync failure // should not stop execution. lf_print_error("Clock sync failed with federate %d. Not connected.", fed_id); continue; - } else if (!fed->clock_synchronization_enabled) { + } + else if (!fed->clock_synchronization_enabled) + { continue; } // Send the RTI's current physical time to the federate @@ -937,44 +1013,53 @@ void* clock_synchronization_thread(void* noargs) { // If the T3 message from this federate does not arrive and we keep receiving // other messages, then give up on this federate and move to the next federate. int remaining_attempts = 5; - while (remaining_attempts > 0) { + while (remaining_attempts > 0) + { remaining_attempts--; int bytes_read = read_from_socket(rti_remote->socket_descriptor_UDP, message_size, buffer); // If any errors occur, either discard the message or the clock sync round. 
- if (bytes_read == message_size) { - if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) { + if (bytes_read == message_size) + { + if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) + { int32_t fed_id_2 = extract_int32(&(buffer[1])); // Check that this message came from the correct federate. - if (fed_id_2 != fed->enclave.id) { + if (fed_id_2 != fed->enclave.id) + { // Message is from the wrong federate. Discard the message. lf_print_warning("Clock sync: Received T3 message from federate %d, " - "but expected one from %d. Discarding message.", - fed_id_2, fed->enclave.id); + "but expected one from %d. Discarding message.", + fed_id_2, fed->enclave.id); continue; } LF_PRINT_DEBUG("Clock sync: RTI received T3 message from federate %d.", fed_id_2); handle_physical_clock_sync_message(GET_FED_INFO(fed_id_2), UDP); break; - } else { + } + else + { // The message is not a T3 message. Discard the message and // continue waiting for the T3 message. This is possibly a message // from a previous cycle that was discarded. lf_print_warning("Clock sync: Unexpected UDP message %u. Expected %u from federate %d. " - "Discarding message.", - buffer[0], - MSG_TYPE_CLOCK_SYNC_T3, - fed->enclave.id); + "Discarding message.", + buffer[0], + MSG_TYPE_CLOCK_SYNC_T3, + fed->enclave.id); continue; } - } else { + } + else + { lf_print_warning("Clock sync: Read from UDP socket failed: %s. " - "Skipping clock sync round for federate %d.", - strerror(errno), - fed->enclave.id); + "Skipping clock sync round for federate %d.", + strerror(errno), + fed->enclave.id); remaining_attempts = -1; } } - if (remaining_attempts > 0) { + if (remaining_attempts > 0) + { any_federates_connected = true; } } @@ -982,27 +1067,32 @@ void* clock_synchronization_thread(void* noargs) { return NULL; } -void handle_federate_resign(federate_info_t *my_fed) { +void handle_federate_resign(federate_info_t *my_fed) +{ // Nothing more to do. Close the socket and exit. 
- lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(&rti_mutex); // Extract the tag size_t header_size = 1 + sizeof(tag_t); unsigned char buffer[header_size]; // Read the header, minus the first byte which has already been read. read_from_socket_errexit(my_fed->socket, header_size - 1, &(buffer[1]), - "RTI failed to read the resign tag from remote federate."); + "RTI failed to read the resign tag from remote federate."); // Extract the tag sent by the resigning federate tag_t tag = extract_tag(&(buffer[1])); - if (rti_remote->base.tracing_enabled) { + if (rti_remote->base.tracing_enabled) + { tracepoint_rti_from_federate(rti_remote->base.trace, receive_RESIGN, my_fed->enclave.id, &tag); } - if (lf_tag_compare(tag, NEVER_TAG) == 0) { + if (lf_tag_compare(tag, NEVER_TAG) == 0) + { // The federate is reporting an error. _lf_federate_reports_error = true; lf_print("RTI: Federate %d reports an error and has resigned.", my_fed->enclave.id); - } else { + } + else + { lf_print("RTI: Federate %d has resigned.", my_fed->enclave.id); } @@ -1020,7 +1110,8 @@ void handle_federate_resign(federate_info_t *my_fed) { // Wait for the federate to send an EOF or a socket error to occur. // Discard any incoming bytes. Normally, this read should return 0 because // the federate is resigning and should itself invoke shutdown. - while (read(my_fed->socket, buffer, header_size) > 0); + while (read(my_fed->socket, buffer, header_size) > 0) + ; // We can now safely close the socket. close(my_fed->socket); // from unistd.h @@ -1029,15 +1120,15 @@ void handle_federate_resign(federate_info_t *my_fed) { // Check downstream federates to see whether they should now be granted a TAG. // To handle cycles, need to create a boolean array to keep // track of which upstream federates have been visited. - bool* visited = (bool*)calloc(rti_remote->base.number_of_scheduling_nodes, sizeof(bool)); // Initializes to 0. 
+ bool *visited = (bool *)calloc(rti_remote->base.number_of_scheduling_nodes, sizeof(bool)); // Initializes to 0. notify_downstream_advance_grant_if_safe(&(my_fed->enclave), visited); free(visited); - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(&rti_mutex); } -void* federate_info_thread_TCP(void* fed) { - federate_info_t* my_fed = (federate_info_t*)fed; +void *federate_info_thread_TCP(void *fed) { + federate_info_t *my_fed = (federate_info_t *)fed; // Buffer for incoming messages. // This does not constrain the message size because messages @@ -1057,82 +1148,100 @@ void* federate_info_thread_TCP(void* fed) { break; } LF_PRINT_DEBUG("RTI: Received message type %u from federate %d.", buffer[0], my_fed->enclave.id); - switch(buffer[0]) { - case MSG_TYPE_TIMESTAMP: - handle_timestamp(my_fed); - break; - case MSG_TYPE_ADDRESS_QUERY: - handle_address_query(my_fed->enclave.id); - break; - case MSG_TYPE_ADDRESS_ADVERTISEMENT: - handle_address_ad(my_fed->enclave.id); - break; - case MSG_TYPE_TAGGED_MESSAGE: - handle_timed_message(my_fed, buffer); - break; - case MSG_TYPE_RESIGN: - handle_federate_resign(my_fed); - return NULL; - break; - case MSG_TYPE_NEXT_EVENT_TAG: - handle_next_event_tag(my_fed); - break; - case MSG_TYPE_LOGICAL_TAG_COMPLETE: - handle_logical_tag_complete(my_fed); - break; - case MSG_TYPE_STOP_REQUEST: - handle_stop_request_message(my_fed); // FIXME: Reviewed until here. 
- // Need to also look at - // notify_advance_grant_if_safe() - // and notify_downstream_advance_grant_if_safe() - break; - case MSG_TYPE_STOP_REQUEST_REPLY: - handle_stop_request_reply(my_fed); - break; - case MSG_TYPE_PORT_ABSENT: - handle_port_absent_message(my_fed, buffer); - break; - default: - lf_print_error("RTI received from federate %d an unrecognized TCP message type: %u.", my_fed->enclave.id, buffer[0]); - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(rti_remote->base.trace, receive_UNIDENTIFIED, my_fed->enclave.id, NULL); - } + switch (buffer[0]) { + case MSG_TYPE_TIMESTAMP: + handle_timestamp(my_fed); + break; + case MSG_TYPE_ADDRESS_QUERY: + handle_address_query(my_fed->enclave.id); + break; + case MSG_TYPE_ADDRESS_ADVERTISEMENT: + handle_address_ad(my_fed->enclave.id); + break; + case MSG_TYPE_TAGGED_MESSAGE: + handle_timed_message(my_fed, buffer); + break; + case MSG_TYPE_RESIGN: + handle_federate_resign(my_fed); + return NULL; + break; + case MSG_TYPE_NEXT_EVENT_TAG: + handle_next_event_tag(my_fed); + break; + case MSG_TYPE_LOGICAL_TAG_COMPLETE: + handle_logical_tag_complete(my_fed); + break; + case MSG_TYPE_STOP_REQUEST: + handle_stop_request_message(my_fed); // FIXME: Reviewed until here. + // Need to also look at + // notify_advance_grant_if_safe() + // and notify_downstream_advance_grant_if_safe() + break; + case MSG_TYPE_STOP_REQUEST_REPLY: + handle_stop_request_reply(my_fed); + break; + case MSG_TYPE_PORT_ABSENT: + handle_port_absent_message(my_fed, buffer); + break; + default: + lf_print_error("RTI received from federate %d an unrecognized TCP message type: %u.", my_fed->enclave.id, buffer[0]); + if (rti_remote->base.tracing_enabled) + { + tracepoint_rti_from_federate(rti_remote->base.trace, receive_UNIDENTIFIED, my_fed->enclave.id, NULL); + } } } // Nothing more to do. Close the socket and exit. // Prevent multiple threads from closing the same socket at the same time. 
- lf_mutex_lock(&rti_mutex); + LF_MUTEX_LOCK(&rti_mutex); close(my_fed->socket); // from unistd.h - lf_mutex_unlock(&rti_mutex); + LF_MUTEX_UNLOCK(&rti_mutex); return NULL; } -void send_reject(int socket_id, unsigned char error_code) { +void send_reject(int *socket_id, unsigned char error_code) +{ LF_PRINT_DEBUG("RTI sending MSG_TYPE_REJECT."); unsigned char response[2]; response[0] = MSG_TYPE_REJECT; response[1] = error_code; + LF_MUTEX_LOCK(&rti_mutex); // NOTE: Ignore errors on this response. - write_to_socket_errexit(socket_id, 2, response, "RTI failed to write MSG_TYPE_REJECT message on the socket."); + if (write_to_socket(*socket_id, 2, response)) { + lf_print_warning("RTI failed to write MSG_TYPE_REJECT message on the socket."); + } // Close the socket. - close(socket_id); + shutdown(*socket_id, SHUT_RDWR); + close(*socket_id); + *socket_id = -1; + LF_MUTEX_UNLOCK(&rti_mutex); } -int32_t receive_and_check_fed_id_message(int socket_id, struct sockaddr_in* client_fd) { +/** + * Listen for a MSG_TYPE_FED_IDS message, which includes as a payload + * a federate ID and a federation ID. If the federation ID + * matches this federation, send an MSG_TYPE_ACK and otherwise send + * a MSG_TYPE_REJECT message. Return the federate ID if the federate is accepted to + * the federation and -1 otherwise. + * @param socket_id The socket on which to listen. + * @param client_fd The socket address. + * @return The federate ID for success or -1 for failure. + */ +static int32_t receive_and_check_fed_id_message(int *socket_id, struct sockaddr_in *client_fd) { // Buffer for message ID, federate ID, and federation ID length. size_t length = 1 + sizeof(uint16_t) + 1; // Message ID, federate ID, length of fedration ID. unsigned char buffer[length]; // Read bytes from the socket. We need 4 bytes. // FIXME: This should not exit with error but rather should just reject the connection.
- read_from_socket_errexit(socket_id, length, buffer, "RTI failed to read from accepted socket."); + read_from_socket_errexit(*socket_id, length, buffer, "RTI failed to read from accepted socket."); uint16_t fed_id = rti_remote->base.number_of_scheduling_nodes; // Initialize to an invalid value. // First byte received is the message type. if (buffer[0] != MSG_TYPE_FED_IDS) { - if(buffer[0] == MSG_TYPE_P2P_SENDING_FED_ID || buffer[0] == MSG_TYPE_P2P_TAGGED_MESSAGE) { + if (buffer[0] == MSG_TYPE_P2P_SENDING_FED_ID || buffer[0] == MSG_TYPE_P2P_TAGGED_MESSAGE) { // The federate is trying to connect to a peer, not to the RTI. // It has connected to the RTI instead. // FIXME: This should not happen, but apparently has been observed. @@ -1144,7 +1253,7 @@ int32_t receive_and_check_fed_id_message(int socket_id, struct sockaddr_in* clie } else { send_reject(socket_id, UNEXPECTED_MESSAGE); } - if (rti_remote->base.tracing_enabled){ + if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); } lf_print_error("RTI expected a MSG_TYPE_FED_IDS message. Got %u (see net_common.h).", buffer[0]); @@ -1159,9 +1268,9 @@ int32_t receive_and_check_fed_id_message(int socket_id, struct sockaddr_in* clie char federation_id_received[federation_id_length + 1]; // One extra for null terminator. // Next read the actual federation ID. // FIXME: This should not exit on error, but rather just reject the connection. - read_from_socket_errexit(socket_id, federation_id_length, - (unsigned char*)federation_id_received, - "RTI failed to read federation id from federate %d.", fed_id); + read_from_socket_errexit(*socket_id, federation_id_length, + (unsigned char *)federation_id_received, + "RTI failed to read federation id from federate %d.", fed_id); // Terminate the string with a null. 
federation_id_received[federation_id_length] = 0; @@ -1175,10 +1284,10 @@ int32_t receive_and_check_fed_id_message(int socket_id, struct sockaddr_in* clie if (strncmp(rti_remote->federation_id, federation_id_received, federation_id_length) != 0) { // Federation IDs do not match. Send back a MSG_TYPE_REJECT message. lf_print_error("WARNING: Federate from another federation %s attempted to connect to RTI in federation %s.\n", - federation_id_received, - rti_remote->federation_id); + federation_id_received, + rti_remote->federation_id); if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); + tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); } send_reject(socket_id, FEDERATION_ID_DOES_NOT_MATCH); return -1; @@ -1186,14 +1295,13 @@ int32_t receive_and_check_fed_id_message(int socket_id, struct sockaddr_in* clie if (fed_id >= rti_remote->base.number_of_scheduling_nodes) { // Federate ID is out of range. lf_print_error("RTI received federate ID %d, which is out of range.", fed_id); - if (rti_remote->base.tracing_enabled){ + if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); } send_reject(socket_id, FEDERATE_ID_OUT_OF_RANGE); return -1; } else { if ((rti_remote->base.scheduling_nodes[fed_id])->state != NOT_CONNECTED) { - lf_print_error("RTI received duplicate federate ID: %d.", fed_id); if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); @@ -1204,26 +1312,26 @@ int32_t receive_and_check_fed_id_message(int socket_id, struct sockaddr_in* clie } } } - federate_info_t* fed = GET_FED_INFO(fed_id); + federate_info_t *fed = GET_FED_INFO(fed_id); // The MSG_TYPE_FED_IDS message has the right federation ID. // Assign the address information for federate. 
// The IP address is stored here as an in_addr struct (in .server_ip_addr) that can be useful // to create sockets and can be efficiently sent over the network. // First, convert the sockaddr structure into a sockaddr_in that contains an internet address. - struct sockaddr_in* pV4_addr = client_fd; + struct sockaddr_in *pV4_addr = client_fd; // Then extract the internet address (which is in IPv4 format) and assign it as the federate's socket server fed->server_ip_addr = pV4_addr->sin_addr; #if LOG_LEVEL >= LOG_LEVEL_DEBUG // Create the human readable format and copy that into // the .server_hostname field of the federate. - char str[INET_ADDRSTRLEN]; - inet_ntop( AF_INET, &fed->server_ip_addr, str, INET_ADDRSTRLEN ); - strncpy (fed->server_hostname, str, INET_ADDRSTRLEN); + char str[INET_ADDRSTRLEN + 1]; + inet_ntop(AF_INET, &fed->server_ip_addr, str, INET_ADDRSTRLEN); + strncpy(fed->server_hostname, str, INET_ADDRSTRLEN); LF_PRINT_DEBUG("RTI got address %s from federate %d.", fed->server_hostname, fed_id); #endif - fed->socket = socket_id; + fed->socket = *socket_id; // Set the federate's state as pending // because it is waiting for the start time to be @@ -1236,105 +1344,130 @@ int32_t receive_and_check_fed_id_message(int socket_id, struct sockaddr_in* clie if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_ACK, fed_id, NULL); } - write_to_socket_errexit(socket_id, 1, &ack_message, + LF_MUTEX_LOCK(&rti_mutex); + write_to_socket_fail_on_error(&fed->socket, 1, &ack_message, &rti_mutex, "RTI failed to write MSG_TYPE_ACK message to federate %d.", fed_id); + LF_MUTEX_UNLOCK(&rti_mutex); + + LF_PRINT_DEBUG("RTI sent MSG_TYPE_ACK to federate %d.", fed_id); return (int32_t)fed_id; } -int receive_connection_information(int socket_id, uint16_t fed_id) { +/** + * Listen for a MSG_TYPE_NEIGHBOR_STRUCTURE message, and upon receiving it, fill + * out the relevant information in the federate's struct. 
+ */ +static int receive_connection_information(int *socket_id, uint16_t fed_id) { LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_NEIGHBOR_STRUCTURE from federate %d.", fed_id); unsigned char connection_info_header[MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE]; read_from_socket_errexit( - socket_id, - MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE, - connection_info_header, - "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message header from federate %d.", - fed_id - ); + *socket_id, + MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE, + connection_info_header, + "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message header from federate %d.", + fed_id); if (connection_info_header[0] != MSG_TYPE_NEIGHBOR_STRUCTURE) { - lf_print_error("RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. " - "Rejecting federate.", fed_id, connection_info_header[0]); + lf_print_error( + "RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. " + "Rejecting federate.", + fed_id, connection_info_header[0]); send_reject(socket_id, UNEXPECTED_MESSAGE); return 0; } else { - federate_info_t* fed = GET_FED_INFO(fed_id); + federate_info_t *fed = GET_FED_INFO(fed_id); // Read the number of upstream and downstream connections fed->enclave.num_upstream = extract_int32(&(connection_info_header[1])); fed->enclave.num_downstream = extract_int32(&(connection_info_header[1 + sizeof(int32_t)])); LF_PRINT_DEBUG( - "RTI got %d upstreams and %d downstreams from federate %d.", - fed->enclave.num_upstream, - fed->enclave.num_downstream, - fed_id); + "RTI got %d upstreams and %d downstreams from federate %d.", + fed->enclave.num_upstream, + fed->enclave.num_downstream, + fed_id); // Allocate memory for the upstream and downstream pointers if (fed->enclave.num_upstream > 0) { - fed->enclave.upstream = (int*)malloc(sizeof(uint16_t) * fed->enclave.num_upstream); + fed->enclave.upstream = (int *)malloc(sizeof(uint16_t) * fed->enclave.num_upstream); // Allocate memory for the upstream 
delay pointers - fed->enclave.upstream_delay = - (interval_t*)malloc( - sizeof(interval_t) * fed->enclave.num_upstream - ); + fed->enclave.upstream_delay = (interval_t *)malloc( + sizeof(interval_t) * fed->enclave.num_upstream); } else { - fed->enclave.upstream = (int*)NULL; - fed->enclave.upstream_delay = (interval_t*)NULL; + fed->enclave.upstream = (int *)NULL; + fed->enclave.upstream_delay = (interval_t *)NULL; } if (fed->enclave.num_downstream > 0) { - fed->enclave.downstream = (int*)malloc(sizeof(uint16_t) * fed->enclave.num_downstream); + fed->enclave.downstream = (int *)malloc(sizeof(uint16_t) * fed->enclave.num_downstream); } else { - fed->enclave.downstream = (int*)NULL; + fed->enclave.downstream = (int *)NULL; } - size_t connections_info_body_size = ((sizeof(uint16_t) + sizeof(int64_t)) * - fed->enclave.num_upstream) + (sizeof(uint16_t) * fed->enclave.num_downstream); - unsigned char* connections_info_body = (unsigned char*)malloc(connections_info_body_size); - read_from_socket_errexit( - socket_id, - connections_info_body_size, - connections_info_body, - "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message body from federate %d.", - fed_id - ); - - // Keep track of where we are in the buffer - size_t message_head = 0; - // First, read the info about upstream federates - for (int i=0; ienclave.num_upstream; i++) { - fed->enclave.upstream[i] = extract_uint16(&(connections_info_body[message_head])); - message_head += sizeof(uint16_t); - fed->enclave.upstream_delay[i] = extract_int64(&(connections_info_body[message_head])); - message_head += sizeof(int64_t); - } + size_t connections_info_body_size = ( + (sizeof(uint16_t) + sizeof(int64_t)) * fed->enclave.num_upstream) + + (sizeof(uint16_t) * fed->enclave.num_downstream); + unsigned char *connections_info_body = NULL; + if (connections_info_body_size > 0) { + connections_info_body = (unsigned char *)malloc(connections_info_body_size); + read_from_socket_errexit( + *socket_id, + connections_info_body_size, 
+ connections_info_body, + "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message body from federate %d.", + fed_id); + // Keep track of where we are in the buffer + size_t message_head = 0; + // First, read the info about upstream federates + for (int i = 0; i < fed->enclave.num_upstream; i++) { + fed->enclave.upstream[i] = extract_uint16(&(connections_info_body[message_head])); + message_head += sizeof(uint16_t); + fed->enclave.upstream_delay[i] = extract_int64(&(connections_info_body[message_head])); + message_head += sizeof(int64_t); + } - // Next, read the info about downstream federates - for (int i=0; ienclave.num_downstream; i++) { - fed->enclave.downstream[i] = extract_uint16(&(connections_info_body[message_head])); - message_head += sizeof(uint16_t); - } + // Next, read the info about downstream federates + for (int i = 0; i < fed->enclave.num_downstream; i++) { + fed->enclave.downstream[i] = extract_uint16(&(connections_info_body[message_head])); + message_head += sizeof(uint16_t); + } - free(connections_info_body); - return 1; + free(connections_info_body); + } } + LF_PRINT_DEBUG("RTI received neighbor structure from federate %d.", fed_id); + return 1; } -int receive_udp_message_and_set_up_clock_sync(int socket_id, uint16_t fed_id) { +/** + * Listen for a MSG_TYPE_UDP_PORT message, and upon receiving it, set up + * clock synchronization and perform the initial clock synchronization. + * Initial clock synchronization is performed only if the MSG_TYPE_UDP_PORT message + * payload is not UINT16_MAX. If it is also not 0, then this function sets + * up to perform runtime clock synchronization using the UDP port number + * specified in the payload to communicate with the federate's clock + * synchronization logic. + * @param socket_id The socket on which to listen. + * @param fed_id The federate ID. + * @return 1 for success, 0 for failure. 
+ */ +static int receive_udp_message_and_set_up_clock_sync(int *socket_id, uint16_t fed_id) { // Read the MSG_TYPE_UDP_PORT message from the federate regardless of the status of // clock synchronization. This message will tell the RTI whether the federate // is doing clock synchronization, and if it is, what port to use for UDP. LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_UDP_PORT from federate %d.", fed_id); unsigned char response[1 + sizeof(uint16_t)]; - read_from_socket_errexit(socket_id, 1 + sizeof(uint16_t) , response, + read_from_socket_errexit(*socket_id, 1 + sizeof(uint16_t), response, "RTI failed to read MSG_TYPE_UDP_PORT message from federate %d.", fed_id); if (response[0] != MSG_TYPE_UDP_PORT) { - lf_print_error("RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. " - "Rejecting federate.", fed_id, response[0]); + lf_print_error( + "RTI was expecting a MSG_TYPE_UDP_PORT message from federate %d. Got %u instead. " + "Rejecting federate.", + fed_id, response[0]); send_reject(socket_id, UNEXPECTED_MESSAGE); return 0; } else { federate_info_t *fed = GET_FED_INFO(fed_id); - if (rti_remote->clock_sync_global_status >= clock_sync_init) {// If no initial clock sync, no need perform initial clock sync. + if (rti_remote->clock_sync_global_status >= clock_sync_init) { + // If no initial clock sync, no need to perform initial clock sync. uint16_t federate_UDP_port_number = extract_uint16(&(response[1])); LF_PRINT_DEBUG("RTI got MSG_TYPE_UDP_PORT %u from federate %d.", federate_UDP_port_number, fed_id); @@ -1343,15 +1476,15 @@ int receive_udp_message_and_set_up_clock_sync(int socket_id, uint16_t fed_id) { if (federate_UDP_port_number != UINT16_MAX) { // Perform the initialization clock synchronization with the federate.
// Send the required number of messages for clock synchronization - for (int i=0; i < rti_remote->clock_sync_exchanges_per_interval; i++) { + for (int i = 0; i < rti_remote->clock_sync_exchanges_per_interval; i++) { // Send the RTI's current physical time T1 to the federate. send_physical_clock(MSG_TYPE_CLOCK_SYNC_T1, fed, TCP); // Listen for reply message, which should be T3. size_t message_size = 1 + sizeof(int32_t); unsigned char buffer[message_size]; - read_from_socket_errexit(socket_id, message_size, buffer, - "Socket to federate %d unexpectedly closed.", fed_id); + read_from_socket_errexit(*socket_id, message_size, buffer, + "Socket to federate %d unexpectedly closed.", fed_id); if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) { int32_t fed_id = extract_int32(&(buffer[1])); assert(fed_id > -1); @@ -1366,21 +1499,25 @@ int receive_udp_message_and_set_up_clock_sync(int socket_id, uint16_t fed_id) { } LF_PRINT_DEBUG("RTI finished initial clock synchronization with federate %d.", fed_id); } - if (rti_remote->clock_sync_global_status >= clock_sync_on) { // If no runtime clock sync, no need to set up the UDP port. - if (federate_UDP_port_number > 0) { - // Initialize the UDP_addr field of the federate struct - fed->UDP_addr.sin_family = AF_INET; - fed->UDP_addr.sin_port = htons(federate_UDP_port_number); - fed->UDP_addr.sin_addr = fed->server_ip_addr; - } + if (rti_remote->clock_sync_global_status >= clock_sync_on) { + // If no runtime clock sync, no need to set up the UDP port. + if (federate_UDP_port_number > 0) { + // Initialize the UDP_addr field of the federate struct + fed->UDP_addr.sin_family = AF_INET; + fed->UDP_addr.sin_port = htons(federate_UDP_port_number); + fed->UDP_addr.sin_addr = fed->server_ip_addr; + } } else { - // Disable clock sync after initial round. - fed->clock_synchronization_enabled = false; + // Disable clock sync after initial round. + fed->clock_synchronization_enabled = false; } - } else { // No clock synchronization at all. 
+ } else { + // No clock synchronization at all. + LF_PRINT_DEBUG("RTI: No clock synchronization for federate %d.", fed_id); // Clock synchronization is universally disabled via the clock-sync command-line parameter // (-c off was passed to the RTI). - // Note that the federates are still going to send a MSG_TYPE_UDP_PORT message but with a payload (port) of -1. + // Note that the federates are still going to send a + // MSG_TYPE_UDP_PORT message but with a payload (port) of -1. fed->clock_synchronization_enabled = false; } } @@ -1388,13 +1525,20 @@ int receive_udp_message_and_set_up_clock_sync(int socket_id, uint16_t fed_id) { } #ifdef __RTI_AUTH__ -bool authenticate_federate(int socket) { +/** + * Authenticate incoming federate by performing HMAC-based authentication. + * + * @param socket Socket for the incoming federate tryting to authenticate. + * @return True if authentication is successful and false otherwise. + */ +static bool authenticate_federate(int *socket) { // Wait for MSG_TYPE_FED_NONCE from federate. 
size_t fed_id_length = sizeof(uint16_t); unsigned char buffer[1 + fed_id_length + NONCE_LENGTH]; - read_from_socket_errexit(socket, 1 + fed_id_length + NONCE_LENGTH, buffer, + read_from_socket_errexit(*socket, 1 + fed_id_length + NONCE_LENGTH, buffer, "Failed to read MSG_TYPE_FED_NONCE"); - if (buffer[0] != MSG_TYPE_FED_NONCE) { + if (buffer[0] != MSG_TYPE_FED_NONCE) + { lf_print_error_and_exit( "Received unexpected response %u from the FED (see net_common.h).", buffer[0]); @@ -1407,9 +1551,9 @@ bool authenticate_federate(int socket) { memcpy(&mac_buf[1], &buffer[1], fed_id_length); memcpy(&mac_buf[1 + fed_id_length], &buffer[1 + fed_id_length], NONCE_LENGTH); unsigned char hmac_tag[hmac_length]; - unsigned char * ret = HMAC(EVP_sha256(), rti_remote->federation_id, - federation_id_length, mac_buf, 1 + fed_id_length + NONCE_LENGTH, - hmac_tag, &hmac_length); + unsigned char *ret = HMAC(EVP_sha256(), rti_remote->federation_id, + federation_id_length, mac_buf, 1 + fed_id_length + NONCE_LENGTH, + hmac_tag, &hmac_length); if (ret == NULL) { lf_print_error_and_exit("HMAC construction failed for MSG_TYPE_RTI_RESPONSE."); } @@ -1420,11 +1564,13 @@ bool authenticate_federate(int socket) { RAND_bytes(rti_nonce, NONCE_LENGTH); memcpy(&sender[1], rti_nonce, NONCE_LENGTH); memcpy(&sender[1 + NONCE_LENGTH], hmac_tag, hmac_length); - write_to_socket(socket, 1 + NONCE_LENGTH + hmac_length, sender); + if (write_to_socket(*socket, 1 + NONCE_LENGTH + hmac_length, sender)) { + lf_print_error("Failed to send nonce to federate."); + } // Wait for MSG_TYPE_FED_RESPONSE unsigned char received[1 + hmac_length]; - read_from_socket_errexit(socket, 1 + hmac_length, received, + read_from_socket_errexit(*socket, 1 + hmac_length, received, "Failed to read federate response."); if (received[0] != MSG_TYPE_FED_RESPONSE) { lf_print_error_and_exit( @@ -1438,7 +1584,7 @@ bool authenticate_federate(int socket) { memcpy(&mac_buf2[1], rti_nonce, NONCE_LENGTH); unsigned char rti_tag[hmac_length]; ret = 
HMAC(EVP_sha256(), rti_remote->federation_id, federation_id_length, - mac_buf2, 1 + NONCE_LENGTH, rti_tag, &hmac_length); + mac_buf2, 1 + NONCE_LENGTH, rti_tag, &hmac_length); if (ret == NULL) { lf_print_error_and_exit("HMAC construction failed for MSG_TYPE_FED_RESPONSE."); } @@ -1462,7 +1608,7 @@ void connect_to_federates(int socket_descriptor) { uint32_t client_length = sizeof(client_fd); // The following blocks until a federate connects. int socket_id = -1; - while(1) { + while (1) { socket_id = accept(rti_remote->socket_descriptor_TCP, &client_fd, &client_length); if (socket_id >= 0) { // Got a socket @@ -1476,23 +1622,27 @@ void connect_to_federates(int socket_descriptor) { } } - // Wait for the first message from the federate when RTI -a option is on. - #ifdef __RTI_AUTH__ +// Wait for the first message from the federate when RTI -a option is on. +#ifdef __RTI_AUTH__ if (rti_remote->authentication_enabled) { - if (!authenticate_federate(socket_id)) { + if (!authenticate_federate(&socket_id)) { lf_print_warning("RTI failed to authenticate the incoming federate."); + // Close the socket. + shutdown(socket_id, SHUT_RDWR); + close(socket_id); + socket_id = -1; // Ignore the federate that failed authentication. i--; continue; } } - #endif - +#endif + // The first message from the federate should contain its ID and the federation ID. - int32_t fed_id = receive_and_check_fed_id_message(socket_id, (struct sockaddr_in*)&client_fd); - if (fed_id >= 0 - && receive_connection_information(socket_id, (uint16_t)fed_id) - && receive_udp_message_and_set_up_clock_sync(socket_id, (uint16_t)fed_id)) { + int32_t fed_id = receive_and_check_fed_id_message(&socket_id, (struct sockaddr_in *)&client_fd); + if (fed_id >= 0 && socket_id >= 0 + && receive_connection_information(&socket_id, (uint16_t)fed_id) + && receive_udp_message_and_set_up_clock_sync(&socket_id, (uint16_t)fed_id)) { // Create a thread to communicate with the federate. 
// This has to be done after clock synchronization is finished @@ -1500,7 +1650,6 @@ void connect_to_federates(int socket_descriptor) { // synchronization messages. federate_info_t *fed = GET_FED_INFO(fed_id); lf_thread_create(&(fed->thread_id), federate_info_thread_TCP, fed); - } else { // Received message was rejected. Try again. i--; @@ -1515,7 +1664,7 @@ void connect_to_federates(int socket_descriptor) { // federate is performing runtime clock synchronization. bool clock_sync_enabled = false; for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed_info = GET_FED_INFO(i); + federate_info_t *fed_info = GET_FED_INFO(i); if (fed_info->clock_synchronization_enabled) { clock_sync_enabled = true; break; @@ -1527,7 +1676,7 @@ void connect_to_federates(int socket_descriptor) { } } -void* respond_to_erroneous_connections(void* nothing) { +void *respond_to_erroneous_connections(void *nothing) { while (true) { // Wait for an incoming connection request. struct sockaddr client_fd; @@ -1546,28 +1695,33 @@ void* respond_to_erroneous_connections(void* nothing) { response[0] = MSG_TYPE_REJECT; response[1] = FEDERATION_ID_DOES_NOT_MATCH; // Ignore errors on this response. - write_to_socket_errexit(socket_id, 2, response, - "RTI failed to write FEDERATION_ID_DOES_NOT_MATCH to erroneous incoming connection."); + if (write_to_socket(socket_id, 2, response)) { + lf_print_warning("RTI failed to write FEDERATION_ID_DOES_NOT_MATCH to erroneous incoming connection."); + } // Close the socket. + shutdown(socket_id, SHUT_RDWR); close(socket_id); } return NULL; } -void initialize_federate(federate_info_t* fed, uint16_t id) { +void initialize_federate(federate_info_t *fed, uint16_t id) +{ initialize_scheduling_node(&(fed->enclave), id); fed->requested_stop = false; - fed->socket = -1; // No socket. + fed->socket = -1; // No socket. 
fed->clock_synchronization_enabled = true; fed->in_transit_message_tags = initialize_in_transit_message_q(); - strncpy(fed->server_hostname ,"localhost", INET_ADDRSTRLEN); + strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); fed->server_ip_addr.s_addr = 0; fed->server_port = -1; } -int32_t start_rti_server(uint16_t port) { +int32_t start_rti_server(uint16_t port) +{ int32_t specified_port = port; - if (port == 0) { + if (port == 0) + { // Use the default port. port = DEFAULT_PORT; } @@ -1577,7 +1731,8 @@ int32_t start_rti_server(uint16_t port) { lf_print("RTI: Listening for federates."); // Create the UDP socket server // Try to get the rti->final_port_TCP + 1 port - if (rti_remote->clock_sync_global_status >= clock_sync_on) { + if (rti_remote->clock_sync_global_status >= clock_sync_on) + { rti_remote->socket_descriptor_UDP = create_server(specified_port, rti_remote->final_port_TCP + 1, UDP); } return rti_remote->socket_descriptor_TCP; @@ -1598,9 +1753,9 @@ void wait_for_federates(int socket_descriptor) { lf_thread_create(&responder_thread, respond_to_erroneous_connections, NULL); // Wait for federate threads to exit. 
- void* thread_exit_status; + void *thread_exit_status; for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { - federate_info_t* fed = GET_FED_INFO(i); + federate_info_t *fed = GET_FED_INFO(i); lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); lf_thread_join(fed->thread_id, &thread_exit_status); free_in_transit_message_q(fed->in_transit_message_tags); @@ -1630,14 +1785,14 @@ void wait_for_federates(int socket_descriptor) { } } - -void initialize_RTI(rti_remote_t *rti){ +void initialize_RTI(rti_remote_t *rti) +{ rti_remote = rti; - // Initialize thread synchronization primitives - LF_ASSERT(lf_mutex_init(&rti_mutex) == 0, "Failed to initialize Mutex"); - LF_ASSERT(lf_cond_init(&received_start_times, &rti_mutex) == 0, "Failed to initialize Condition Variable"); - LF_ASSERT(lf_cond_init(&sent_start_time, &rti_mutex) == 0, "Failed to initialize Condition Variable"); + // Initialize thread synchronization primitives + LF_MUTEX_INIT(&rti_mutex); + LF_COND_INIT(&received_start_times, &rti_mutex); + LF_COND_INIT(&sent_start_time, &rti_mutex); initialize_rti_common(&rti_remote->base); rti_remote->base.mutex = &rti_mutex; @@ -1660,12 +1815,16 @@ void initialize_RTI(rti_remote_t *rti){ rti_remote->stop_in_progress = false; } -void free_scheduling_nodes(scheduling_node_t** scheduling_nodes, uint16_t number_of_scheduling_nodes) { - for (uint16_t i = 0; i < number_of_scheduling_nodes; i++) { +void free_scheduling_nodes(scheduling_node_t **scheduling_nodes, uint16_t number_of_scheduling_nodes) +{ + for (uint16_t i = 0; i < number_of_scheduling_nodes; i++) + { // FIXME: Gives error freeing memory not allocated!!!! 
- scheduling_node_t* node = scheduling_nodes[i]; - if (node->upstream != NULL) free(node->upstream); - if (node->downstream != NULL) free(node->downstream); + scheduling_node_t *node = scheduling_nodes[i]; + if (node->upstream != NULL) + free(node->upstream); + if (node->downstream != NULL) + free(node->downstream); } free(scheduling_nodes); } diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index b1434fcbd..01a6edee2 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -403,52 +403,10 @@ void* federate_info_thread_TCP(void* fed); /** * Send a MSG_TYPE_REJECT message to the specified socket and close the socket. - * @param socket_id The socket. + * @param socket_id Pointer to the socket ID. * @param error_code An error code. */ -void send_reject(int socket_id, unsigned char error_code); - -/** - * Listen for a MSG_TYPE_FED_IDS message, which includes as a payload - * a federate ID and a federation ID. If the federation ID - * matches this federation, send an MSG_TYPE_ACK and otherwise send - * a MSG_TYPE_REJECT message. Return 1 if the federate is accepted to - * the federation and 0 otherwise. - * @param socket_id The socket on which to listen. - * @param client_fd The socket address. - * @return The federate ID for success or -1 for failure. - */ -int32_t receive_and_check_fed_id_message(int socket_id, struct sockaddr_in* client_fd); - -/** - * Listen for a MSG_TYPE_NEIGHBOR_STRUCTURE message, and upon receiving it, fill - * out the relevant information in the federate's struct. - */ -int receive_connection_information(int socket_id, uint16_t fed_id); - -/** - * Listen for a MSG_TYPE_UDP_PORT message, and upon receiving it, set up - * clock synchronization and perform the initial clock synchronization. - * Initial clock synchronization is performed only if the MSG_TYPE_UDP_PORT message - * payload is not UINT16_MAX. 
If it is also not 0, then this function sets - * up to perform runtime clock synchronization using the UDP port number - * specified in the payload to communicate with the federate's clock - * synchronization logic. - * @param socket_id The socket on which to listen. - * @param fed_id The federate ID. - * @return 1 for success, 0 for failure. - */ -int receive_udp_message_and_set_up_clock_sync(int socket_id, uint16_t fed_id); - -#ifdef __RTI_AUTH__ -/** - * Authenticate incoming federate by performing HMAC-based authentication. - * - * @param socket Socket for the incoming federate tryting to authenticate. - * @return True if authentication is successful and false otherwise. - */ -bool authenticate_federate(int socket); -#endif +void send_reject(int* socket_id, unsigned char error_code); /** * Wait for one incoming connection request from each federate, diff --git a/core/federated/federate.c b/core/federated/federate.c index 6cf4ced58..d82144ea0 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -71,6 +71,9 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. extern instant_t _lf_last_reported_unadjusted_physical_time_ns; extern instant_t start_time; +// Global variable defined in reactor_common.c: +extern bool _lf_termination_executed; + // Error messages. char* ERROR_SENDING_HEADER = "ERROR sending header information to federate via RTI"; char* ERROR_SENDING_MESSAGE = "ERROR sending message to federate via RTI"; @@ -173,41 +176,19 @@ void create_server(int specified_port) { unsigned char buffer[sizeof(int32_t) + 1]; buffer[0] = MSG_TYPE_ADDRESS_ADVERTISEMENT; encode_int32(_fed.server_port, &(buffer[1])); + + // No need for a mutex because we have the only handle on this socket. 
+ write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, sizeof(int32_t) + 1, (unsigned char*)buffer, NULL, + "Failed to send address advertisement."); + // Trace the event when tracing is enabled tracepoint_federate_to_rti(_fed.trace, send_ADR_AD, _lf_my_fed_id, NULL); - write_to_socket_errexit(_fed.socket_TCP_RTI, sizeof(int32_t) + 1, (unsigned char*)buffer, - "Failed to send address advertisement."); LF_PRINT_DEBUG("Sent port %d to the RTI.", _fed.server_port); // Set the global server socket _fed.server_socket = socket_descriptor; } -/** - * Send a message to another federate directly or via the RTI. - * This method assumes that the caller does not hold the outbound_socket_mutex lock, - * which it acquires to perform the send. - * - * If the socket connection to the remote federate or the RTI has been broken, - * then this returns 0 without sending. Otherwise, it returns 1. - * - * @note This function is similar to send_timed_message() except that it - * does not deal with time and timed_messages. - * - * @param message_type The type of the message being sent. - * Currently can be MSG_TYPE_MESSAGE for messages sent via - * RTI or MSG_TYPE_P2P_MESSAGE for messages sent between - * federates. - * @param port The ID of the destination port. - * @param federate The ID of the destination federate. - * @param next_destination_str The name of the next destination in string format - * @param length The message length. - * @param message The message. - * @return 1 if the message has been sent, 0 otherwise. - * FIXME: Currently, federates can send untimed messages to RTI, but there is no - * handling mechanism of MSG_TYPE_MESSAGE at the RTI side. - * Is it really needed? Or should the RTI be updated? 
- */ int send_message(int message_type, unsigned short port, unsigned short federate, @@ -216,18 +197,13 @@ int send_message(int message_type, unsigned char* message) { unsigned char header_buffer[1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t)]; // First byte identifies this as a timed message. - if (message_type != MSG_TYPE_MESSAGE && - message_type != MSG_TYPE_P2P_MESSAGE - ) { - lf_print_error( - "send_message() was called with an invalid message type (%d).", - message_type - ); - return 0; + if (message_type != MSG_TYPE_P2P_MESSAGE ) { + lf_print_error("send_message: Unsupported message type (%d).", message_type); + return -1; } header_buffer[0] = (unsigned char)message_type; // Next two bytes identify the destination port. - // NOTE: Send messages little endian, not big endian. + // NOTE: Send messages little endian (network order), not big endian. encode_uint16(port, &(header_buffer[1])); // Next two bytes identify the destination federate. @@ -236,75 +212,64 @@ int send_message(int message_type, // The next four bytes are the message length. encode_int32((int32_t)length, &(header_buffer[1 + sizeof(uint16_t) + sizeof(uint16_t)])); - LF_PRINT_LOG("Sending untimed message to %s.", next_destination_str); + LF_PRINT_LOG("Sending untagged message to %s.", next_destination_str); // Header: message_type + port_id + federate_id + length of message + timestamp + microstep const int header_length = 1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t); + // Use a mutex lock to prevent multiple threads from simultaneously sending. - lf_mutex_lock(&outbound_socket_mutex); - // First, check that the socket is still connected. This must done - // while holding the mutex lock. 
- int socket = -1; - if (message_type == MSG_TYPE_P2P_MESSAGE) { - socket = _fed.sockets_for_outbound_p2p_connections[federate]; - } else { - socket = _fed.socket_TCP_RTI; + LF_MUTEX_LOCK(&outbound_socket_mutex); + + int* socket = &_fed.sockets_for_outbound_p2p_connections[federate]; + + // Trace the event when tracing is enabled + tracepoint_federate_to_federate(_fed.trace, send_P2P_MSG, _lf_my_fed_id, federate, NULL); + + int result = write_to_socket_close_on_error(socket, header_length, header_buffer); + if (result == 0) { + // Header sent successfully. Send the body. + result = write_to_socket_close_on_error(socket, length, message); } - if (socket < 0) { - lf_print_warning("Socket is no longer connected. Dropping message."); - lf_mutex_unlock(&outbound_socket_mutex); - return 0; + if (result != 0) { + // Message did not send. Since this is used for physical connections, this is not critical. + lf_print_warning("Failed to send message to %s. Dropping the message.", next_destination_str); } - // Trace the event when tracing is enabled - if (message_type == MSG_TYPE_P2P_MESSAGE) { - tracepoint_federate_to_federate(_fed.trace, send_P2P_MSG, _lf_my_fed_id, federate, NULL); - } else { // message_type == MSG_TYPE_MESSAGE) - tracepoint_federate_to_rti(_fed.trace, send_MSG, _lf_my_fed_id, NULL); - } - write_to_socket_with_mutex(socket, header_length, header_buffer, &outbound_socket_mutex, - "Failed to send message header to to %s.", next_destination_str); - write_to_socket_with_mutex(socket, length, message, &outbound_socket_mutex, - "Failed to send message body to to %s.", next_destination_str); - lf_mutex_unlock(&outbound_socket_mutex); - return 1; + LF_MUTEX_UNLOCK(&outbound_socket_mutex); + return result; } /** - * Send the specified timestamped message to the specified port in the - * specified federate via the RTI or directly to a federate depending on - * the given socket. 
The timestamp is calculated as current_logical_time + - * additional delay which is greater than or equal to zero. - * The port should be an input port of a reactor in - * the destination federate. This version does include the timestamp - * in the message. The caller can reuse or free the memory after this returns. - * - * If the socket connection to the remote federate or the RTI has been broken, - * then this returns 0 without sending. Otherwise, it returns 1. - * - * This method assumes that the caller does not hold the outbound_socket_mutex lock, - * which it acquires to perform the send. - * - * @note This function is similar to send_message() except that it - * sends timed messages and also contains logics related to time. + * Close the socket that receives incoming messages from the + * specified federate ID. This function should be called when a read + * of incoming socket fails or when an EOF is received. + * It can also be called when the receiving end wants to stop communication, + * in which case, flag should be 1. * - * @param env The environment of the federate - * @param additional_delay The offset applied to the timestamp - * using after. The additional delay will be greater or equal to zero - * if an after is used on the connection. If no after is given in the - * program, -1 is passed. - * @param message_type The type of the message being sent. - * Currently can be MSG_TYPE_TAGGED_MESSAGE for messages sent via - * RTI or MSG_TYPE_P2P_TAGGED_MESSAGE for messages sent between - * federates. - * @param port The ID of the destination port. - * @param federate The ID of the destination federate. - * @param next_destination_str The next destination in string format (RTI or federate) - * (used for reporting errors). - * @param length The message length. - * @param message The message. - * @return 1 if the message has been sent, 0 otherwise. + * @param fed_id The ID of the peer federate sending messages to this + * federate, or -1 if the RTI. 
+ * @param flag 0 if an EOF was received, -1 if a socket error occurred, 1 otherwise. */ -int send_timed_message(environment_t* env, +static void _lf_close_inbound_socket(int fed_id, int flag) { + LF_MUTEX_LOCK(&socket_mutex); + if (_fed.sockets_for_inbound_p2p_connections[fed_id] >= 0) { + if (flag >= 0) { + if (flag > 0) { + shutdown(_fed.sockets_for_inbound_p2p_connections[fed_id], SHUT_RDWR); + // Flag indicates that there could still be incoming data. + unsigned char message[32]; + while (read(_fed.sockets_for_inbound_p2p_connections[fed_id], &message, 32) > 0); + } else { + // Have received EOF from the other end. Send EOF to the other end. + shutdown(_fed.sockets_for_inbound_p2p_connections[fed_id], SHUT_WR); + } + } + close(_fed.sockets_for_inbound_p2p_connections[fed_id]); + _fed.sockets_for_inbound_p2p_connections[fed_id] = -1; + } + LF_MUTEX_UNLOCK(&socket_mutex); +} + +int send_tagged_message(environment_t* env, interval_t additional_delay, int message_type, unsigned short port, @@ -314,19 +279,17 @@ int send_timed_message(environment_t* env, unsigned char* message) { assert(env != GLOBAL_ENVIRONMENT); - unsigned char header_buffer[1 + sizeof(uint16_t) + sizeof(uint16_t) - + sizeof(int32_t) + sizeof(instant_t) + sizeof(microstep_t)]; - // First byte identifies this as a timed message. - if (message_type != MSG_TYPE_TAGGED_MESSAGE && - message_type != MSG_TYPE_P2P_TAGGED_MESSAGE - ) { - lf_print_error( - "send_message() was called with an invalid message type (%d).", - message_type - ); - return 0; + size_t header_length = 1 + sizeof(uint16_t) + sizeof(uint16_t) + + sizeof(int32_t) + sizeof(instant_t) + sizeof(microstep_t); + unsigned char header_buffer[header_length]; + + if (message_type != MSG_TYPE_TAGGED_MESSAGE && message_type != MSG_TYPE_P2P_TAGGED_MESSAGE) { + lf_print_error("send_message: Unsupported message type (%d).", message_type); + return -1; } + size_t buffer_head = 0; + // First byte is the message type. 
header_buffer[buffer_head] = (unsigned char)message_type; buffer_head += sizeof(unsigned char); // Next two bytes identify the destination port. @@ -343,157 +306,115 @@ int send_timed_message(environment_t* env, buffer_head += sizeof(int32_t); // Apply the additional delay to the current tag and use that as the intended - // tag of the outgoing message - tag_t current_message_intended_tag = lf_delay_tag(env->current_tag, - additional_delay); + // tag of the outgoing message. + tag_t current_message_intended_tag = lf_delay_tag(env->current_tag, additional_delay); + + if (_lf_is_tag_after_stop_tag(env, current_message_intended_tag)) { + // Message tag is past the timeout time (the stop time) so it should not be sent. + LF_PRINT_LOG("Dropping message because it will be after the timeout time."); + return -1; + } // Next 8 + 4 will be the tag (timestamp, microstep) encode_tag( &(header_buffer[buffer_head]), current_message_intended_tag ); - buffer_head += sizeof(int64_t) + sizeof(uint32_t); LF_PRINT_LOG("Sending message with tag " PRINTF_TAG " to %s.", - current_message_intended_tag.time - start_time, current_message_intended_tag.microstep, next_destination_str); - - // Header: message_type + port_id + federate_id + length of message + timestamp + microstep - size_t header_length = buffer_head; - - if (_lf_is_tag_after_stop_tag(env, current_message_intended_tag)) { - // Message tag is past the timeout time (the stop time) so it should - // not be sent. - return 0; - } + current_message_intended_tag.time - start_time, + current_message_intended_tag.microstep, + next_destination_str); // Use a mutex lock to prevent multiple threads from simultaneously sending. - lf_mutex_lock(&outbound_socket_mutex); - // First, check that the socket is still connected. This must done - // while holding the mutex lock. 
- int socket = -1; + LF_MUTEX_LOCK(&outbound_socket_mutex); + + int* socket; if (message_type == MSG_TYPE_P2P_TAGGED_MESSAGE) { - socket = _fed.sockets_for_outbound_p2p_connections[federate]; + socket = &_fed.sockets_for_outbound_p2p_connections[federate]; + tracepoint_federate_to_federate(_fed.trace, send_P2P_TAGGED_MSG, _lf_my_fed_id, federate, &current_message_intended_tag); } else { - socket = _fed.socket_TCP_RTI; + socket = &_fed.socket_TCP_RTI; + tracepoint_federate_to_rti(_fed.trace, send_TAGGED_MSG, _lf_my_fed_id, &current_message_intended_tag); } - if (socket < 0) { - lf_print_warning("Socket is no longer connected. Dropping message."); - lf_mutex_unlock(&outbound_socket_mutex); - return 0; + + int result = write_to_socket_close_on_error(socket, header_length, header_buffer); + if (result == 0) { + // Header sent successfully. Send the body. + result = write_to_socket_close_on_error(socket, length, message); } - // Trace the event when tracing is enabled - if (message_type == MSG_TYPE_TAGGED_MESSAGE) { - tracepoint_federate_to_rti(_fed.trace, send_TAGGED_MSG, _lf_my_fed_id, &current_message_intended_tag); - } else { // message_type == MSG_TYPE_P2P_TAGGED_MESSAGE - tracepoint_federate_to_federate(_fed.trace, send_P2P_TAGGED_MSG, _lf_my_fed_id, federate, &current_message_intended_tag); + if (result != 0) { + // Message did not send. Handling depends on message type. + if (message_type == MSG_TYPE_P2P_TAGGED_MESSAGE) { + lf_print_warning("Failed to send message to %s. Dropping the message.", next_destination_str); + } else { + lf_print_error_system_failure("Failed to send message to %s. 
Connection lost to the RTI.", + next_destination_str); + } } - write_to_socket_with_mutex(socket, header_length, header_buffer, &outbound_socket_mutex, - "Failed to send timed message header to %s.", next_destination_str); - write_to_socket_with_mutex(socket, length, message, &outbound_socket_mutex, - "Failed to send timed message body to %s.", next_destination_str); - lf_mutex_unlock(&outbound_socket_mutex); - return 1; + LF_MUTEX_UNLOCK(&outbound_socket_mutex); + return result; } /** - * Send a time to the RTI. - * This is not synchronized. - * It assumes the caller is. + * Send a time to the RTI. This acquires the outbound_socket_mutex. * @param type The message type (MSG_TYPE_TIMESTAMP). * @param time The time. - * @param exit_on_error If set to true, exit the program if sending 'time' fails. - * Print a soft error message otherwise */ -void _lf_send_time(unsigned char type, instant_t time, bool exit_on_error) { +void _lf_send_time(unsigned char type, instant_t time) { LF_PRINT_DEBUG("Sending time " PRINTF_TIME " to the RTI.", time); size_t bytes_to_write = 1 + sizeof(instant_t); unsigned char buffer[bytes_to_write]; buffer[0] = type; encode_int64(time, &(buffer[1])); - lf_mutex_lock(&outbound_socket_mutex); - if (_fed.socket_TCP_RTI < 0) { - lf_print_warning("Socket is no longer connected. 
Dropping message."); - lf_mutex_unlock(&outbound_socket_mutex); - return; - } + + LF_MUTEX_LOCK(&outbound_socket_mutex); + write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_write, buffer, &outbound_socket_mutex, + "Failed to send time " PRINTF_TIME " to the RTI.", time - start_time); + LF_MUTEX_UNLOCK(&outbound_socket_mutex); tag_t tag = {.time = time, .microstep = 0}; // Trace the event when tracing is enabled tracepoint_federate_to_rti(_fed.trace, send_TIMESTAMP, _lf_my_fed_id, &tag); - - ssize_t bytes_written = write_to_socket(_fed.socket_TCP_RTI, bytes_to_write, buffer); - if (bytes_written < (ssize_t)bytes_to_write) { - if (!exit_on_error) { - lf_print_error("Failed to send time " PRINTF_TIME " to the RTI." - " Error code %d: %s", - time - start_time, - errno, - strerror(errno) - ); - - } else if (errno == ENOTCONN) { - // FIXME: Shutdown is probably not working properly because the socket gets disconnected. - lf_print_error("Socket to the RTI is no longer connected. Considering this a soft error."); - } else { - lf_print_error_and_exit("Failed to send time " PRINTF_TIME " to the RTI." - " Error code %d: %s", - time - start_time, - errno, - strerror(errno) - ); - } - } - lf_mutex_unlock(&outbound_socket_mutex); } /** * Send a tag to the RTI. - * This is not synchronized. - * It assumes the caller is. + * This function acquires the outbound_socket_mutex. * @param type The message type (MSG_TYPE_NEXT_EVENT_TAG or MSG_TYPE_LOGICAL_TAG_COMPLETE). * @param tag The tag. - * @param exit_on_error If set to true, exit the program if sending 'tag' fails. 
- * Print a soft error message otherwise */ -void _lf_send_tag(unsigned char type, tag_t tag, bool exit_on_error) { +void _lf_send_tag(unsigned char type, tag_t tag) { LF_PRINT_DEBUG("Sending tag " PRINTF_TAG " to the RTI.", tag.time - start_time, tag.microstep); size_t bytes_to_write = 1 + sizeof(instant_t) + sizeof(microstep_t); unsigned char buffer[bytes_to_write]; buffer[0] = type; encode_tag(&(buffer[1]), tag); - lf_mutex_lock(&outbound_socket_mutex); + LF_MUTEX_LOCK(&outbound_socket_mutex); if (_fed.socket_TCP_RTI < 0) { lf_print_warning("Socket is no longer connected. Dropping message."); - lf_mutex_unlock(&outbound_socket_mutex); + LF_MUTEX_UNLOCK(&outbound_socket_mutex); return; } trace_event_t event_type = (type == MSG_TYPE_NEXT_EVENT_TAG) ? send_NET : send_LTC; // Trace the event when tracing is enabled tracepoint_federate_to_rti(_fed.trace, event_type, _lf_my_fed_id, &tag); - ssize_t bytes_written = write_to_socket(_fed.socket_TCP_RTI, bytes_to_write, buffer); - if (bytes_written < (ssize_t)bytes_to_write) { - if (!exit_on_error) { - lf_print_error("Failed to send tag " PRINTF_TAG " to the RTI." - " Error code %d: %s", - tag.time - start_time, - tag.microstep, - errno, - strerror(errno) - ); - return; - } else if (errno == ENOTCONN) { - lf_print_error("Socket to the RTI is no longer connected. 
Considering this a soft error."); - return; - } else { - lf_mutex_unlock(&outbound_socket_mutex); - lf_print_error_system_failure("Failed to send tag " PRINTF_TAG " to the RTI.", - tag.time - start_time, - tag.microstep - ); - } - } - lf_mutex_unlock(&outbound_socket_mutex); + write_to_socket_fail_on_error( + &_fed.socket_TCP_RTI, bytes_to_write, buffer, &outbound_socket_mutex, + "Failed to send tag " PRINTF_TAG " to the RTI.", tag.time - start_time, tag.microstep); + LF_MUTEX_UNLOCK(&outbound_socket_mutex); +} + +/** + * Return true if either the socket to the RTI is broken or the socket is + * alive and the first unread byte on the socket's queue is MSG_TYPE_RESIGN. + */ +static bool rti_resigned() { + unsigned char first_byte; + ssize_t bytes = peek_from_socket(_fed.socket_TCP_RTI, &first_byte); + if (bytes < 0 || (bytes == 1 && first_byte == MSG_TYPE_RESIGN)) return true; + else return false; } /** @@ -509,16 +430,21 @@ void* handle_p2p_connections_from_federates(void* env_arg) { int received_federates = 0; // Allocate memory to store thread IDs. _fed.inbound_socket_listeners = (lf_thread_t*)calloc(_fed.number_of_inbound_p2p_connections, sizeof(lf_thread_t)); - while (received_federates < _fed.number_of_inbound_p2p_connections) { + while (received_federates < _fed.number_of_inbound_p2p_connections && !_lf_termination_executed) { // Wait for an incoming connection request. struct sockaddr client_fd; uint32_t client_length = sizeof(client_fd); int socket_id = accept(_fed.server_socket, &client_fd, &client_length); - // FIXME: Error handling here is too harsh maybe? - if (socket_id < 0 && errno != EAGAIN && errno != EWOULDBLOCK) { - lf_print_error("A fatal error occurred while accepting a new socket. " - "Federate will not accept connections anymore."); - return NULL; + + if (socket_id < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { + if (rti_resigned()) break; + else continue; // Try again. 
+ } else if (errno == EPERM) { + lf_print_error_system_failure("Firewall permissions prohibit connection."); + } else { + lf_print_error_system_failure("A fatal error occurred while accepting a new socket."); + } } LF_PRINT_LOG("Accepted new connection from remote federate."); @@ -576,11 +502,18 @@ void* handle_p2p_connections_from_federates(void* env_arg) { // Send an MSG_TYPE_ACK message. unsigned char response = MSG_TYPE_ACK; - // Trace the event when tracing is enabled - tracepoint_federate_to_federate(_fed.trace, send_ACK, _lf_my_fed_id, remote_fed_id, NULL); - write_to_socket_errexit(socket_id, 1, (unsigned char*)&response, + + LF_MUTEX_LOCK(&outbound_socket_mutex); + write_to_socket_fail_on_error( + &_fed.sockets_for_inbound_p2p_connections[remote_fed_id], + 1, (unsigned char*)&response, + &outbound_socket_mutex, "Failed to write MSG_TYPE_ACK in response to federate %d.", remote_fed_id); + LF_MUTEX_UNLOCK(&outbound_socket_mutex); + + // Trace the event when tracing is enabled + tracepoint_federate_to_federate(_fed.trace, send_ACK, _lf_my_fed_id, remote_fed_id, NULL); // Start a thread to listen for incoming messages from other federates. // The fed_id is a uint16_t, which we assume can be safely cast to and from void*. @@ -591,12 +524,12 @@ void* handle_p2p_connections_from_federates(void* env_arg) { fed_id_arg); if (result != 0) { // Failed to create a listening thread. - lf_mutex_lock(&socket_mutex); + LF_MUTEX_LOCK(&socket_mutex); if (_fed.sockets_for_inbound_p2p_connections[remote_fed_id] != -1) { close(socket_id); _fed.sockets_for_inbound_p2p_connections[remote_fed_id] = -1; } - lf_mutex_unlock(&socket_mutex); + LF_MUTEX_UNLOCK(&socket_mutex); lf_print_error_and_exit( "Failed to create a thread to listen for incoming physical connection. 
Error code: %d.", result @@ -606,7 +539,7 @@ void* handle_p2p_connections_from_federates(void* env_arg) { received_federates++; } - LF_PRINT_LOG("All remote federates are connected."); + LF_PRINT_LOG("All %zu remote federates are connected.", _fed.number_of_inbound_p2p_connections); return NULL; } @@ -620,6 +553,7 @@ void* handle_p2p_connections_from_federates(void* env_arg) { */ static void _lf_close_outbound_socket(int fed_id, int flag) { assert (fed_id >= 0 && fed_id < NUMBER_OF_FEDERATES); + LF_MUTEX_LOCK(&outbound_socket_mutex); if (_fed.sockets_for_outbound_p2p_connections[fed_id] >= 0) { // Close the socket by sending a FIN packet indicating that no further writes // are expected. Then read until we get an EOF indication. @@ -637,60 +571,12 @@ static void _lf_close_outbound_socket(int fed_id, int flag) { close(_fed.sockets_for_outbound_p2p_connections[fed_id]); _fed.sockets_for_outbound_p2p_connections[fed_id] = -1; } -} - -/** - * For each incoming message socket, we create this thread that listens - * for upstream messages. Currently, the only possible upstream message - * is MSG_TYPE_CLOSE_REQUEST. If this thread receives that message, then closes - * the socket. The idea here is that a peer-to-peer socket connection - * is always closed from the sending end, never from the receiving end. - * This way, any sends in progress complete before the socket is actually - * closed. - */ -void* listen_for_upstream_messages_from_downstream_federates(void* fed_id_ptr) { - uint16_t fed_id = *((uint16_t*)fed_id_ptr); - unsigned char message; - - lf_mutex_lock(&outbound_socket_mutex); - while(_fed.sockets_for_outbound_p2p_connections[fed_id] >= 0) { - // Unlock the mutex before performing a blocking read. - // Note that there is a race condition here, but the read will return - // a failure if the socket gets closed. 
- lf_mutex_unlock(&outbound_socket_mutex); - - LF_PRINT_DEBUG("Thread listening for MSG_TYPE_CLOSE_REQUEST from federate %d", fed_id); - ssize_t bytes_read = read_from_socket( - _fed.sockets_for_outbound_p2p_connections[fed_id], 1, &message); - // Reacquire the mutex lock before closing or reading the socket again. - lf_mutex_lock(&outbound_socket_mutex); - - if (bytes_read == 1 && message == MSG_TYPE_CLOSE_REQUEST) { - // Received a request to close the socket. - LF_PRINT_DEBUG("Received MSG_TYPE_CLOSE_REQUEST from federate %d.", fed_id); - // Trace the event when tracing is enabled - tracepoint_federate_from_federate(_fed.trace, receive_CLOSE_RQ, _lf_my_fed_id, fed_id, NULL); - _lf_close_outbound_socket(fed_id, bytes_read); - break; - } - if (bytes_read == 0) { - // EOF. - LF_PRINT_DEBUG("Received EOF from federate %d.", fed_id); - _lf_close_outbound_socket(fed_id, bytes_read); - break; - } else if (bytes_read < 0) { - // Error. - _lf_close_outbound_socket(fed_id, bytes_read); - lf_print_error_system_failure("Error on socket from federate %d.", fed_id); - } - } - lf_mutex_unlock(&outbound_socket_mutex); - return NULL; + LF_MUTEX_UNLOCK(&outbound_socket_mutex); } /** * Connect to the federate with the specified id. This established - * connection will then be used in functions such as send_timed_message() + * connection will then be used in functions such as send_tagged_message() * to send messages directly to the specified federate. * This function first sends an MSG_TYPE_ADDRESS_QUERY message to the RTI to obtain * the IP address and port number of the specified federate. It then attempts @@ -707,11 +593,11 @@ void connect_to_federate(uint16_t remote_federate_id) { // Ask the RTI for port number of the remote federate. // The buffer is used for both sending and receiving replies. // The size is what is needed for receiving replies. 
- unsigned char buffer[sizeof(int32_t) + INET_ADDRSTRLEN]; + unsigned char buffer[sizeof(int32_t) + INET_ADDRSTRLEN + 1]; int port = -1; struct in_addr host_ip_addr; int count_tries = 0; - while (port == -1) { + while (port == -1 && !_lf_termination_executed) { buffer[0] = MSG_TYPE_ADDRESS_QUERY; // NOTE: Sending messages in little endian. encode_uint16(remote_federate_id, &(buffer[1])); @@ -719,16 +605,28 @@ void connect_to_federate(uint16_t remote_federate_id) { LF_PRINT_DEBUG("Sending address query for federate %d.", remote_federate_id); // Trace the event when tracing is enabled tracepoint_federate_to_rti(_fed.trace, send_ADR_QR, _lf_my_fed_id, NULL); - write_to_socket_errexit(_fed.socket_TCP_RTI, sizeof(uint16_t) + 1, buffer, + + LF_MUTEX_LOCK(&outbound_socket_mutex); + write_to_socket_fail_on_error( + &_fed.socket_TCP_RTI, sizeof(uint16_t) + 1, buffer, &outbound_socket_mutex, "Failed to send address query for federate %d to RTI.", remote_federate_id); + LF_MUTEX_UNLOCK(&outbound_socket_mutex); // Read RTI's response. - read_from_socket_errexit(_fed.socket_TCP_RTI, sizeof(int32_t), buffer, + read_from_socket_errexit(_fed.socket_TCP_RTI, sizeof(int32_t) + 1, buffer, "Failed to read the requested port number for federate %d from RTI.", remote_federate_id); - port = extract_int32(buffer); + if (buffer[0] != MSG_TYPE_ADDRESS_QUERY) { + // Unexpected reply. Could be that RTI has failed and sent a resignation. 
+ if (buffer[0] == MSG_TYPE_RESIGN) { + lf_print_error_and_exit("RTI has resigned."); + } else { + lf_print_error_and_exit("Unexpected reply of type %hhu from RTI (see net_common.h).", buffer[0]); + } + } + port = extract_int32(&buffer[1]); read_from_socket_errexit(_fed.socket_TCP_RTI, sizeof(host_ip_addr), (unsigned char*)&host_ip_addr, "Failed to read the IP address for federate %d from RTI.", @@ -768,7 +666,7 @@ void connect_to_federate(uint16_t remote_federate_id) { // Iterate until we either successfully connect or exceed the number of // attempts given by CONNECT_MAX_RETRIES. int socket_id = -1; - while (result < 0) { + while (result < 0 && !_lf_termination_executed) { // Create an IPv4 socket for TCP (not UDP) communication over IP (0). socket_id = create_real_time_tcp_socket_errexit(); @@ -805,11 +703,12 @@ void connect_to_federate(uint16_t remote_federate_id) { } lf_print_warning("Could not connect to federate %d. Will try again every %lld nanoseconds.\n", remote_federate_id, ADDRESS_QUERY_RETRY_INTERVAL); + + // Check whether the RTI is still there. + if (rti_resigned()) break; + // Wait ADDRESS_QUERY_RETRY_INTERVAL nanoseconds. - if (lf_sleep(ADDRESS_QUERY_RETRY_INTERVAL) != 0) { - // Sleep was interrupted. - continue; - } + lf_sleep(ADDRESS_QUERY_RETRY_INTERVAL); } else { // Connect was successful. size_t buffer_length = 1 + sizeof(uint16_t) + 1; @@ -824,11 +723,13 @@ void connect_to_federate(uint16_t remote_federate_id) { buffer[sizeof(uint16_t) + 1] = federation_id_length; // Trace the event when tracing is enabled tracepoint_federate_to_federate(_fed.trace, send_FED_ID, _lf_my_fed_id, remote_federate_id, NULL); - write_to_socket_errexit(socket_id, - buffer_length, buffer, + + // No need for a mutex because we have the only handle on the socket. 
+ write_to_socket_fail_on_error(&socket_id, + buffer_length, buffer, NULL, "Failed to send fed_id to federate %d.", remote_federate_id); - write_to_socket_errexit(socket_id, - federation_id_length, (unsigned char*)federation_metadata.federation_id, + write_to_socket_fail_on_error(&socket_id, + federation_id_length, (unsigned char*)federation_metadata.federation_id, NULL, "Failed to send federation id to federate %d.", remote_federate_id); @@ -852,36 +753,14 @@ void connect_to_federate(uint16_t remote_federate_id) { // Once we set this variable, then all future calls to close() on this // socket ID should reset it to -1 within a critical section. _fed.sockets_for_outbound_p2p_connections[remote_federate_id] = socket_id; - - // Start a thread to listen for upstream messages (MSG_TYPE_CLOSE_REQUEST) from - // this downstream federate. - uint16_t* remote_fed_id_copy = (uint16_t*)malloc(sizeof(uint16_t)); - if (remote_fed_id_copy == NULL) { - lf_print_error_system_failure("malloc failed."); - } - *remote_fed_id_copy = remote_federate_id; - lf_thread_t thread_id; - result = lf_thread_create( - &thread_id, - listen_for_upstream_messages_from_downstream_federates, - remote_fed_id_copy); - if (result != 0) { - // Failed to create a listening thread. - lf_print_error_and_exit( - "Failed to create a thread to listen for upstream message. Error code: %d.", - result - ); - } } #ifdef FEDERATED_AUTHENTICATED /** * Perform HMAC-based authentication with the RTI, using the federation ID * as an HMAC key. - * - * @param rti_socket TCP socket for connection with the RTI. */ -void perform_hmac_authentication(int rti_socket) { +void perform_hmac_authentication() { // Send buffer including message type, federate ID, federate's nonce. 
size_t fed_id_length = sizeof(uint16_t); @@ -892,17 +771,27 @@ void perform_hmac_authentication(int rti_socket) { unsigned char fed_nonce[NONCE_LENGTH]; RAND_bytes(fed_nonce, NONCE_LENGTH); memcpy(&fed_hello_buf[1 + fed_id_length], fed_nonce, NONCE_LENGTH); - write_to_socket(rti_socket, message_length, fed_hello_buf); + + LF_MUTEX_LOCK(&outbound_socket_mutex); + write_to_socket_fail_on_error( + &_fed.socket_TCP_RTI, message_length, fed_hello_buf, &outbound_socket_mutex, + "Failed to write nonce."); + LF_MUTEX_UNLOCK(&outbound_socket_mutex); // Check HMAC of received FED_RESPONSE message. unsigned int hmac_length = SHA256_HMAC_LENGTH; size_t federation_id_length = strnlen(federation_metadata.federation_id, 255); unsigned char received[1 + NONCE_LENGTH + hmac_length]; - read_from_socket_errexit(rti_socket, 1 + NONCE_LENGTH + hmac_length, received, "Failed to read RTI response."); + read_from_socket_errexit(_fed.socket_TCP_RTI, 1 + NONCE_LENGTH + hmac_length, received, "Failed to read RTI response."); if (received[0] != MSG_TYPE_RTI_RESPONSE) { - lf_print_error("Received unexpected response %u from the RTI (see net_common.h).", - received[0]); + if (received[0] == MSG_TYPE_RESIGN) { + lf_print_error_and_exit("RTI has resigned."); + } else { + lf_print_error_and_exit( + "Received unexpected response %u from the RTI (see net_common.h).", + received[0]); + } } // Create tag to compare to received tag. 
unsigned char buf_to_check[1 + fed_id_length + NONCE_LENGTH]; @@ -920,12 +809,16 @@ void perform_hmac_authentication(int rti_socket) { unsigned char response[2]; response[0] = MSG_TYPE_REJECT; response[1] = HMAC_DOES_NOT_MATCH; - write_to_socket_errexit( - rti_socket, 2, response, - "Federate failed to write MSG_TYPE_REJECT message on the socket."); - close(rti_socket); - } - else { + + LF_MUTEX_LOCK(&outbound_socket_mutex); + write_to_socket_fail_on_error( + &_fed.socket_TCP_RTI, 2, response, &outbound_socket_mutex, + "Federate failed to write MSG_TYPE_REJECT message on the socket."); + shutdown(_fed.socket_TCP_RTI, SHUT_RDWR); + close(_fed.socket_TCP_RTI); + _fed.socket_TCP_RTI = -1; + LF_MUTEX_LOCK(&outbound_socket_mutex); + } else { LF_PRINT_LOG("HMAC verified."); // HMAC tag is created with MSG_TYPE_FED_RESPONSE and received federate nonce. unsigned char mac_buf[1 + NONCE_LENGTH]; @@ -936,7 +829,12 @@ void perform_hmac_authentication(int rti_socket) { sender[0] = MSG_TYPE_FED_RESPONSE; HMAC(EVP_sha256(), federation_metadata.federation_id, federation_id_length, mac_buf, 1 + NONCE_LENGTH, &sender[1], &hmac_length); - write_to_socket(rti_socket, 1 + hmac_length, sender); + + LF_MUTEX_LOCK(&outbound_socket_mutex); + write_to_socket_fail_on_error( + &_fed.socket_TCP_RTI, 1 + hmac_length, sender, &outbound_socket_mutex, + "Failed to write fed response."); + LF_MUTEX_UNLOCK(&outbound_socket_mutex); } } #endif @@ -993,7 +891,7 @@ void connect_to_rti(const char* hostname, int port) { int result = connect(_fed.socket_TCP_RTI, res->ai_addr, res->ai_addrlen);; int count_retries = 1; - while (result < 0 && count_retries++ < CONNECT_MAX_RETRIES) { + while (result < 0 && count_retries++ < CONNECT_MAX_RETRIES && !_lf_termination_executed) { lf_print("Failed to connect to RTI on port %d. 
Will try again.", uport); lf_sleep(CONNECT_RETRY_INTERVAL); result = connect(_fed.socket_TCP_RTI, res->ai_addr, res->ai_addrlen); @@ -1012,7 +910,7 @@ void connect_to_rti(const char* hostname, int port) { #ifdef FEDERATED_AUTHENTICATED LF_PRINT_LOG("Connected to an RTI. Performing HMAC-based authentication using federation ID."); - perform_hmac_authentication(_fed.socket_TCP_RTI); + perform_hmac_authentication(); #else LF_PRINT_LOG("Connected to an RTI. Sending federation ID for authentication."); #endif @@ -1032,12 +930,14 @@ void connect_to_rti(const char* hostname, int port) { // Trace the event when tracing is enabled tracepoint_federate_to_rti(_fed.trace, send_FED_ID, _lf_my_fed_id, NULL); - write_to_socket_errexit(_fed.socket_TCP_RTI, 2 + sizeof(uint16_t), buffer, + // No need for a mutex here because no other threads are writing to this socket. + write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, 2 + sizeof(uint16_t), buffer, NULL, "Failed to send federate ID to RTI."); // Next send the federation ID itself. - write_to_socket_errexit(_fed.socket_TCP_RTI, federation_id_length, (unsigned char*)federation_metadata.federation_id, - "Failed to send federation ID to RTI."); + write_to_socket_fail_on_error( + &_fed.socket_TCP_RTI, federation_id_length, (unsigned char*)federation_metadata.federation_id, NULL, + "Failed to send federation ID to RTI."); // Wait for a response. // The response will be MSG_TYPE_REJECT if the federation ID doesn't match. 
@@ -1076,8 +976,10 @@ void connect_to_rti(const char* hostname, int port) { unsigned char UDP_port_number[1 + sizeof(uint16_t)]; UDP_port_number[0] = MSG_TYPE_UDP_PORT; encode_uint16(udp_port, &(UDP_port_number[1])); - write_to_socket_errexit(_fed.socket_TCP_RTI, 1 + sizeof(uint16_t), UDP_port_number, + write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, 1 + sizeof(uint16_t), UDP_port_number, NULL, "Failed to send the UDP port number to the RTI."); + } else if (response == MSG_TYPE_RESIGN) { + lf_print_error_and_exit("RTI has resigned."); } else { lf_print_error_and_exit("Received unexpected response %u from the RTI (see net_common.h).", response); @@ -1096,7 +998,7 @@ void connect_to_rti(const char* hostname, int port) { */ instant_t get_start_time_from_rti(instant_t my_physical_time) { // Send the timestamp marker first. - _lf_send_time(MSG_TYPE_TIMESTAMP, my_physical_time, true); + _lf_send_time(MSG_TYPE_TIMESTAMP, my_physical_time); // Read bytes from the socket. We need 9 bytes. // Buffer for message ID plus timestamp. @@ -1301,20 +1203,19 @@ void enqueue_port_absent_reactions(environment_t* env){ /** * Send a port absent message to federate with fed_ID, informing the - * remote federate that the current federate will not produce an event - * on this network port at the current logical time. + * remote federate that it will not receive a message with tag less than the + * current tag of the specified environment delayed by the additional_delay. * - * @param env The environment of the federate - * @param additional_delay The offset applied to the timestamp - * using after. The additional delay will be greater or equal to zero - * if an after is used on the connection. If no after is given in the - * program, -1 is passed. + * @param env The environment from which to get the current tag. + * @param additional_delay The after delay of the connection or NEVER if none. * @param port_ID The ID of the receiving port. 
* @param fed_ID The fed ID of the receiving federate. */ -void send_port_absent_to_federate(environment_t* env, interval_t additional_delay, - unsigned short port_ID, - unsigned short fed_ID) { +void send_port_absent_to_federate( + environment_t* env, + interval_t additional_delay, + unsigned short port_ID, + unsigned short fed_ID) { assert(env != GLOBAL_ENVIRONMENT); // Construct the message @@ -1326,8 +1227,7 @@ void send_port_absent_to_federate(environment_t* env, interval_t additional_dela // then we cannot promise no message with tag = current_tag + delay because a // subsequent reaction might produce such a message. But we can promise no // message with a tag strictly less than current_tag + delay. - tag_t current_message_intended_tag = lf_delay_strict(env->current_tag, - additional_delay); + tag_t current_message_intended_tag = lf_delay_strict(env->current_tag, additional_delay); LF_PRINT_LOG("Sending port " "absent for tag " PRINTF_TAG " for port %d to federate %d.", @@ -1340,23 +1240,39 @@ void send_port_absent_to_federate(environment_t* env, interval_t additional_dela encode_uint16(fed_ID, &(buffer[1+sizeof(port_ID)])); encode_tag(&(buffer[1+sizeof(port_ID)+sizeof(fed_ID)]), current_message_intended_tag); - lf_mutex_lock(&outbound_socket_mutex); #ifdef FEDERATED_CENTRALIZED // Send the absent message through the RTI - int socket = _fed.socket_TCP_RTI; + int* socket = &_fed.socket_TCP_RTI; #else // Send the absent message directly to the federate - int socket = _fed.sockets_for_outbound_p2p_connections[fed_ID]; + int* socket = &_fed.sockets_for_outbound_p2p_connections[fed_ID]; #endif - // Do not write if the socket is closed. 
- if (socket >= 0) { - // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_PORT_ABS, _lf_my_fed_id, &current_message_intended_tag); - write_to_socket_with_mutex(socket, message_length, buffer, &outbound_socket_mutex, - "Failed to send port absent message for port %hu to federate %hu.", - port_ID, fed_ID); + + LF_MUTEX_LOCK(&outbound_socket_mutex); + int result = write_to_socket_close_on_error(socket, message_length, buffer); + LF_MUTEX_UNLOCK(&outbound_socket_mutex); + + if (result != 0) { + // Write failed. Response depends on whether coordination is centralized. + if (socket == &_fed.socket_TCP_RTI) { + // Centralized coordination. This is a critical error. + lf_print_error_system_failure("Failed to send port absent message for port %hu to federate %hu.", + port_ID, fed_ID); + } else { + // Decentralized coordination. This is not a critical error. + lf_print_warning("Failed to send port absent message for port %hu to federate %hu.", + port_ID, fed_ID); + } + } else { + // Message sent correctly. Trace it. + if (socket == &_fed.socket_TCP_RTI) { + tracepoint_federate_to_rti( + _fed.trace, send_PORT_ABS, _lf_my_fed_id, &current_message_intended_tag); + } else { + tracepoint_federate_to_federate( + _fed.trace, send_PORT_ABS, _lf_my_fed_id, fed_ID, &current_message_intended_tag); + } } - lf_mutex_unlock(&outbound_socket_mutex); } ///////////////////////////////////////////////////////////////////////////////////////// @@ -1436,74 +1352,6 @@ static trigger_handle_t schedule_message_received_from_network_locked( return return_value; } -/** - * Close the socket that receives incoming messages from the - * specified federate ID. This function should be called when a read - * of incoming socket fails or when an EOF is received. - * - * @param fed_id The ID of the peer federate sending messages to this - * federate, or -1 if the RTI. - * @param flag 0 if an EOF was received, -1 if a socket error occurred, 1 otherwise.
- */ -static void _lf_close_inbound_socket(int fed_id, int flag) { - lf_mutex_lock(&socket_mutex); - if (_fed.sockets_for_inbound_p2p_connections[fed_id] >= 0) { - if (_fed.sockets_for_inbound_p2p_connections[fed_id] >= 0) { - if (flag >= 0) { - shutdown(_fed.sockets_for_inbound_p2p_connections[fed_id], SHUT_WR); - if (flag > 0) { - // Flag indicates that there could still be incoming data. - unsigned char message[32]; - while (read(_fed.sockets_for_inbound_p2p_connections[fed_id], &message, 32) > 0); - } - } - close(_fed.sockets_for_inbound_p2p_connections[fed_id]); - _fed.sockets_for_inbound_p2p_connections[fed_id] = -1; - } - } - lf_mutex_unlock(&socket_mutex); -} - -/** - * Request to close the socket that receives incoming messages from the - * specified federate ID. This sends a message to the upstream federate - * requesting that it close the socket. If the message is sent successfully, - * this returns 1. Otherwise it returns 0, which presumably means that the - * socket is already closed. - * - * This function assumes that the caller holds the socket_mutex lock. - * - * @param The ID of the peer federate sending messages to this federate. - * - * @return 1 if the MSG_TYPE_CLOSE_REQUEST message is sent successfully, 0 otherwise. - */ -static int _lf_request_close_inbound_socket(int fed_id) { - assert(fed_id >= 0 && fed_id < NUMBER_OF_FEDERATES); - - if (_fed.sockets_for_inbound_p2p_connections[fed_id] < 0) { - return 0; - } - - // Send a MSG_TYPE_CLOSE_REQUEST message. - unsigned char message_marker = MSG_TYPE_CLOSE_REQUEST; - - ssize_t written = write_to_socket( - _fed.sockets_for_inbound_p2p_connections[fed_id], - 1, &message_marker); - // Close the socket upon receiving EOF. 
- _lf_close_inbound_socket(fed_id, 1); - - // Trace the event when tracing is enabled - tracepoint_federate_to_federate(_fed.trace, send_CLOSE_RQ, _lf_my_fed_id, fed_id, NULL); - - if (written == 1) { - LF_PRINT_LOG("Sent MSG_TYPE_CLOSE_REQUEST message to upstream federate."); - return 1; - } else { - return 0; - } -} - /** * Handle a port absent message received from a remote federate. * This just sets the last known status tag of the port specified @@ -1542,19 +1390,19 @@ static void handle_port_absent_message(int socket, int fed_id) { environment_t *env; _lf_get_environments(&env); - lf_mutex_lock(&env->mutex); + LF_MUTEX_LOCK(&env->mutex); #ifdef FEDERATED_DECENTRALIZED trigger_t* network_input_port_action = _lf_action_for_port(port_id)->trigger; if (lf_tag_compare(intended_tag, network_input_port_action->last_known_status_tag) < 0) { - lf_mutex_unlock(&env->mutex); + LF_MUTEX_UNLOCK(&env->mutex); } #endif // In centralized coordination, a TAG message from the RTI // can set the last_known_status_tag to a future tag where messages // have not arrived yet. 
// Set the mutex status as absent update_last_known_status_on_input_port(intended_tag, port_id); - lf_mutex_unlock(&env->mutex); + LF_MUTEX_UNLOCK(&env->mutex); } /** @@ -1600,13 +1448,13 @@ void handle_message(int socket, int fed_id) { void stall_advance_level_federation(environment_t* env, size_t level) { LF_PRINT_DEBUG("Acquiring the environment mutex."); - lf_mutex_lock(&env->mutex); + LF_MUTEX_LOCK(&env->mutex); LF_PRINT_DEBUG("Waiting on MLAA with next_reaction_level %zu and MLAA %d.", level, max_level_allowed_to_advance); while (((int) level) >= max_level_allowed_to_advance) { lf_cond_wait(&port_status_changed); }; LF_PRINT_DEBUG("Exiting wait with MLAA %d and next_reaction_level %zu.", max_level_allowed_to_advance, level); - lf_mutex_unlock(&env->mutex); + LF_MUTEX_UNLOCK(&env->mutex); } /** @@ -1689,7 +1537,7 @@ void handle_tagged_message(int socket, int fed_id) { // The following is only valid for string messages. // LF_PRINT_DEBUG("Message received: %s.", message_contents); - lf_mutex_lock(&env->mutex); + LF_MUTEX_LOCK(&env->mutex); action->trigger->physical_time_of_arrival = time_of_arrival; @@ -1761,18 +1609,19 @@ void handle_tagged_message(int socket, int fed_id) { // But only if the stop time is not equal to the start time! if (lf_tag_compare(env->current_tag, env->stop_tag) >= 0) { lf_print_error("Received message too late. Already at stop tag.\n" - "Current tag is " PRINTF_TAG " and intended tag is " PRINTF_TAG ".\n" - "Discarding message.", + " Current tag is " PRINTF_TAG " and intended tag is " PRINTF_TAG ".\n" + " Discarding message and closing the socket.", env->current_tag.time - start_time, env->current_tag.microstep, intended_tag.time - start_time, intended_tag.microstep); - goto release; + // Close socket, reading any incoming data and discarding it. 
+ _lf_close_inbound_socket(fed_id, 1); + } else { + schedule_message_received_from_network_locked(env, action->trigger, intended_tag, message_token); } - - schedule_message_received_from_network_locked(env, action->trigger, intended_tag, message_token); } - release: -#ifdef FEDERATED_DECENTRALIZED // Only applicable for federated programs with decentralized coordination +#ifdef FEDERATED_DECENTRALIZED + // Only applicable for federated programs with decentralized coordination // Finally, decrement the barrier to allow the execution to continue // past the raised barrier _lf_decrement_tag_barrier_locked(env); @@ -1782,7 +1631,7 @@ void handle_tagged_message(int socket, int fed_id) { // logical time has been removed to avoid // the need for unecessary lock and unlock // operations. - lf_mutex_unlock(&env->mutex); + LF_MUTEX_UNLOCK(&env->mutex); } /** @@ -1812,7 +1661,7 @@ void handle_tag_advance_grant(void) { // Trace the event when tracing is enabled tracepoint_federate_from_rti(_fed.trace, receive_TAG, _lf_my_fed_id, &TAG); - lf_mutex_lock(&env->mutex); + LF_MUTEX_LOCK(&env->mutex); // Update the last known status tag of all network input ports // to the TAG received from the RTI. Here we assume that the RTI @@ -1830,16 +1679,17 @@ void handle_tag_advance_grant(void) { LF_PRINT_LOG("Received Time Advance Grant (TAG): " PRINTF_TAG ".", _fed.last_TAG.time - start_time, _fed.last_TAG.microstep); } else { - lf_mutex_unlock(&env->mutex); + LF_MUTEX_UNLOCK(&env->mutex); lf_print_error("Received a TAG " PRINTF_TAG " that wasn't larger " "than the previous TAG or PTAG " PRINTF_TAG ". Ignoring the TAG.", TAG.time - start_time, TAG.microstep, _fed.last_TAG.time - start_time, _fed.last_TAG.microstep); + return; } // Notify everything that is blocked. 
lf_cond_broadcast(&env->event_q_changed); - lf_mutex_unlock(&env->mutex); + LF_MUTEX_UNLOCK(&env->mutex); } /** @@ -1857,7 +1707,7 @@ void _lf_logical_tag_complete(tag_t tag_to_send) { LF_PRINT_LOG("Sending Logical Time Complete (LTC) " PRINTF_TAG " to the RTI.", tag_to_send.time - start_time, tag_to_send.microstep); - _lf_send_tag(MSG_TYPE_LOGICAL_TAG_COMPLETE, tag_to_send, true); + _lf_send_tag(MSG_TYPE_LOGICAL_TAG_COMPLETE, tag_to_send); _fed.last_sent_LTC = tag_to_send; } @@ -1965,7 +1815,7 @@ static void* update_ports_from_staa_offsets(void* args) { // input ports. environment_t *env; int num_envs = _lf_get_environments(&env); - lf_mutex_lock(&env->mutex); + LF_MUTEX_LOCK(&env->mutex); while (1) { bool restart = false; tag_t tag_when_started_waiting = lf_tag(env); @@ -2074,12 +1924,12 @@ void handle_provisional_tag_advance_grant() { // get updated to a PTAG value because a PTAG does not indicate that // the RTI knows about the status of all ports up to and _including_ // the value of PTAG. Only a TAG message indicates that. - lf_mutex_lock(&env->mutex); + LF_MUTEX_LOCK(&env->mutex); // Sanity check if (lf_tag_compare(PTAG, _fed.last_TAG) < 0 || (lf_tag_compare(PTAG, _fed.last_TAG) == 0 && !_fed.is_last_TAG_provisional)) { - lf_mutex_unlock(&env->mutex); + LF_MUTEX_UNLOCK(&env->mutex); lf_print_error_and_exit("Received a PTAG " PRINTF_TAG " that is equal or earlier " "than an already received TAG " PRINTF_TAG ".", PTAG.time, PTAG.microstep, @@ -2113,7 +1963,7 @@ void handle_provisional_tag_advance_grant() { // it is already treating the current tag as PTAG cycle (e.g. at the // start time) or it will be completing the current cycle and sending // a LTC message shortly. In either case, there is nothing more to do. - lf_mutex_unlock(&env->mutex); + LF_MUTEX_UNLOCK(&env->mutex); return; } else if (lf_tag_compare(env->current_tag, PTAG) > 0) { // Current tag is greater than the PTAG. 
@@ -2127,7 +1977,7 @@ void handle_provisional_tag_advance_grant() { // Send an LTC to indicate absent outputs. _lf_logical_tag_complete(PTAG); // Nothing more to do. - lf_mutex_unlock(&env->mutex); + LF_MUTEX_UNLOCK(&env->mutex); return; } else if (PTAG.time == env->current_tag.time) { // We now know env->current_tag < PTAG, but the times are equal. @@ -2148,7 +1998,7 @@ void handle_provisional_tag_advance_grant() { pqueue_insert(env->event_q, dummy); } - lf_mutex_unlock(&env->mutex); + LF_MUTEX_UNLOCK(&env->mutex); } /** @@ -2165,7 +2015,7 @@ int _lf_fd_send_stop_request_to_rti(tag_t stop_tag) { // Stop at the next microstep ENCODE_STOP_REQUEST(buffer, stop_tag.time, stop_tag.microstep + 1); - lf_mutex_lock(&outbound_socket_mutex); + LF_MUTEX_LOCK(&outbound_socket_mutex); // Do not send a stop request if a stop request has been previously received from the RTI. if (!_fed.received_stop_request_from_rti) { LF_PRINT_LOG("Sending to RTI a MSG_TYPE_STOP_REQUEST message with tag " PRINTF_TAG ".", @@ -2174,18 +2024,18 @@ int _lf_fd_send_stop_request_to_rti(tag_t stop_tag) { if (_fed.socket_TCP_RTI < 0) { lf_print_warning("Socket is no longer connected. 
Dropping message."); - lf_mutex_unlock(&outbound_socket_mutex); + LF_MUTEX_UNLOCK(&outbound_socket_mutex); return -1; } - write_to_socket_with_mutex(_fed.socket_TCP_RTI, MSG_TYPE_STOP_REQUEST_LENGTH, + write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, MSG_TYPE_STOP_REQUEST_LENGTH, buffer, &outbound_socket_mutex, "Failed to send stop time " PRINTF_TIME " to the RTI.", stop_tag.time - start_time); - lf_mutex_unlock(&outbound_socket_mutex); + LF_MUTEX_UNLOCK(&outbound_socket_mutex); // Trace the event when tracing is enabled tracepoint_federate_to_rti(_fed.trace, send_STOP_REQ, _lf_my_fed_id, &stop_tag); return 0; } else { - lf_mutex_unlock(&outbound_socket_mutex); + LF_MUTEX_UNLOCK(&outbound_socket_mutex); return 1; } } @@ -2216,7 +2066,7 @@ void handle_stop_granted_message() { int num_environments = _lf_get_environments(&env); for (int i = 0; i < num_environments; i++) { - lf_mutex_lock(&env[i].mutex); + LF_MUTEX_LOCK(&env[i].mutex); // Sanity check. if (lf_tag_compare(received_stop_tag, env[i].current_tag) <= 0) { @@ -2234,7 +2084,7 @@ void handle_stop_granted_message() { if (env[i].barrier.requestors) _lf_decrement_tag_barrier_locked(&env[i]); lf_cond_broadcast(&env[i].event_q_changed); - lf_mutex_unlock(&env[i].mutex); + LF_MUTEX_UNLOCK(&env[i].mutex); } } @@ -2261,23 +2111,23 @@ void handle_stop_request_message() { // is guarded by the outbound socket mutex. // The second is guarded by the global mutex. // Note that the RTI should not send stop requests more than once to federates. 
- lf_mutex_lock(&outbound_socket_mutex); + LF_MUTEX_LOCK(&outbound_socket_mutex); bool already_blocked = false; if (_fed.received_stop_request_from_rti) { already_blocked = true; } _fed.received_stop_request_from_rti = true; - lf_mutex_unlock(&outbound_socket_mutex); + LF_MUTEX_UNLOCK(&outbound_socket_mutex); extern lf_mutex_t global_mutex; extern bool lf_stop_requested; - lf_mutex_lock(&global_mutex); + LF_MUTEX_LOCK(&global_mutex); if (lf_stop_requested) { already_blocked = true; } // Treat the stop request from the RTI as if a local stop request had been received. lf_stop_requested = true; - lf_mutex_unlock(&global_mutex); + LF_MUTEX_UNLOCK(&global_mutex); if (already_blocked) { // Either we have sent a stop request to the RTI ourselves, @@ -2292,7 +2142,7 @@ void handle_stop_request_message() { environment_t *env; int num_environments = _lf_get_environments(&env); for (int i = 0; i < num_environments; i++) { - lf_mutex_lock(&env[i].mutex); + LF_MUTEX_LOCK(&env[i].mutex); if (lf_tag_compare(tag_to_stop, env[i].current_tag) <= 0) { // Can't stop at the requested tag. Make a counteroffer. tag_to_stop = env->current_tag; @@ -2301,24 +2151,18 @@ void handle_stop_request_message() { // Set a barrier to prevent the enclave from advancing past the so-far tag to stop. _lf_increment_tag_barrier_locked(&env[i], tag_to_stop); - lf_mutex_unlock(&env[i].mutex); + LF_MUTEX_UNLOCK(&env[i].mutex); } // Send the reply, which is the least tag at which we can stop. unsigned char outgoing_buffer[MSG_TYPE_STOP_REQUEST_REPLY_LENGTH]; ENCODE_STOP_REQUEST_REPLY(outgoing_buffer, tag_to_stop.time, tag_to_stop.microstep); - lf_mutex_lock(&outbound_socket_mutex); - if (_fed.socket_TCP_RTI < 0) { - lf_print_warning("Socket is no longer connected. Dropping message."); - lf_mutex_unlock(&outbound_socket_mutex); - return; - } - // Send the current logical time to the RTI. This message does not have an identifying byte - // since the RTI is waiting for a response from this federate. 
- write_to_socket_with_mutex( - _fed.socket_TCP_RTI, MSG_TYPE_STOP_REQUEST_REPLY_LENGTH, outgoing_buffer, &outbound_socket_mutex, + // Send the current logical time to the RTI. + LF_MUTEX_LOCK(&outbound_socket_mutex); + write_to_socket_fail_on_error( + &_fed.socket_TCP_RTI, MSG_TYPE_STOP_REQUEST_REPLY_LENGTH, outgoing_buffer, &outbound_socket_mutex, "Failed to send the answer to MSG_TYPE_STOP_REQUEST to RTI."); - lf_mutex_unlock(&outbound_socket_mutex); + LF_MUTEX_UNLOCK(&outbound_socket_mutex); // Trace the event when tracing is enabled tracepoint_federate_to_rti(_fed.trace, send_STOP_REQ_REP, _lf_my_fed_id, &tag_to_stop); } @@ -2337,10 +2181,12 @@ static void send_resign_signal(environment_t* env) { } else { encode_tag(&(buffer[1]), NEVER_TAG); } - ssize_t written = write_to_socket(_fed.socket_TCP_RTI, bytes_to_write, &(buffer[0])); - if (written == bytes_to_write) { - LF_PRINT_LOG("Resigned."); - } + LF_MUTEX_LOCK(&outbound_socket_mutex); + write_to_socket_fail_on_error( + &_fed.socket_TCP_RTI, bytes_to_write, &(buffer[0]), &outbound_socket_mutex, + "Failed to send RESIGN."); + LF_MUTEX_UNLOCK(&outbound_socket_mutex); + LF_PRINT_LOG("Resigned."); } /** @@ -2357,9 +2203,9 @@ void terminate_execution(environment_t* env) { // MSG_TYPE_RESIGN message to the RTI, but we should not acquire a mutex. if (_fed.socket_TCP_RTI >= 0) { if (_lf_normal_termination) { - lf_mutex_lock(&outbound_socket_mutex); + LF_MUTEX_LOCK(&outbound_socket_mutex); send_resign_signal(env); - lf_mutex_unlock(&outbound_socket_mutex); + LF_MUTEX_UNLOCK(&outbound_socket_mutex); // Trace the event when tracing is enabled tracepoint_federate_to_rti(_fed.trace, send_RESIGN, _lf_my_fed_id, &env->current_tag); } else { @@ -2368,10 +2214,10 @@ void terminate_execution(environment_t* env) { } } - LF_PRINT_DEBUG("Requesting closing of incoming P2P sockets."); - // Request closing the incoming P2P sockets. 
+ LF_PRINT_DEBUG("Closing incoming P2P sockets."); + // Close any incoming P2P sockets that are still open. for (int i=0; i < NUMBER_OF_FEDERATES; i++) { - _lf_request_close_inbound_socket(i); + _lf_close_inbound_socket(i, 1); // Ignore errors. Mark the socket closed. _fed.sockets_for_inbound_p2p_connections[i] = -1; } @@ -2444,12 +2290,14 @@ void* listen_to_federates(void* _args) { LF_PRINT_DEBUG("Waiting for a P2P message on socket %d.", socket_id); ssize_t bytes_read = read_from_socket(socket_id, 1, buffer); if (bytes_read == 0) { - // EOF occurred. This breaks the connection. - lf_print("Received EOF from peer federate %d. Closing the socket.", fed_id); - _lf_close_inbound_socket(fed_id, bytes_read); + // EOF occurred. Socket has been closed by read_from_socket. + lf_print("Received EOF from peer federate %d.", fed_id); + // Stop listening to this federate. break; } else if (bytes_read < 0) { lf_print_error("P2P socket to federate %d is broken.", fed_id); + // Stop listening to this federate. + // Mark the socket closed. _lf_close_inbound_socket(fed_id, bytes_read); break; } @@ -2475,9 +2323,9 @@ void* listen_to_federates(void* _args) { if (bad_message) { // FIXME: Better error handling needed. lf_print_error("Received erroneous message type: %d. Closing the socket.", buffer[0]); - break; // Trace the event when tracing is enabled tracepoint_federate_from_federate(_fed.trace, receive_UNIDENTIFIED, _lf_my_fed_id, fed_id, NULL); + break; } } return NULL; @@ -2760,7 +2608,7 @@ tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply // This if statement does not fall through but rather returns. // NET is not bounded by physical time or has no downstream federates. // Normal case. 
- _lf_send_tag(MSG_TYPE_NEXT_EVENT_TAG, tag, wait_for_reply); + _lf_send_tag(MSG_TYPE_NEXT_EVENT_TAG, tag); _fed.last_sent_NET = tag; LF_PRINT_LOG("Sent next event tag (NET) " PRINTF_TAG " to RTI.", tag.time - start_time, tag.microstep); @@ -2799,7 +2647,7 @@ tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply return _fed.last_TAG; } if (lf_tag_compare(next_tag, tag) != 0) { - _lf_send_tag(MSG_TYPE_NEXT_EVENT_TAG, next_tag, wait_for_reply); + _lf_send_tag(MSG_TYPE_NEXT_EVENT_TAG, next_tag); _fed.last_sent_NET = next_tag; LF_PRINT_LOG("Sent next event tag (NET) " PRINTF_TAG " to RTI from loop.", next_tag.time - lf_time_start(), next_tag.microstep); diff --git a/core/federated/network/net_util.c b/core/federated/network/net_util.c index 2048e02a1..665928cf9 100644 --- a/core/federated/network/net_util.c +++ b/core/federated/network/net_util.c @@ -103,25 +103,29 @@ ssize_t read_from_socket_errexit( int retry_count = 0; while (bytes_read < (ssize_t)num_bytes) { ssize_t more = read(socket, buffer + bytes_read, num_bytes - (size_t)bytes_read); - if(more < 0 && retry_count++ < NUM_SOCKET_RETRIES) { - // Used to retry only on: (errno == EAGAIN || errno == EWOULDBLOCK) + if(more < 0 && retry_count++ < NUM_SOCKET_RETRIES + && (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)) { // Those error codes set by the socket indicates // that we should try again (@see man errno). - // Now we retry on all errors, but a bounded number of times. - LF_PRINT_DEBUG("Reading from socket failed. Will try again."); + lf_print_warning("Reading from socket failed. Will try again."); lf_sleep(DELAY_BETWEEN_SOCKET_RETRIES); continue; } else if (more < 0) { - // Retries are exhausted. - lf_print_error_system_failure("Socket read failed after %d tries. Read %ld bytes, but expected %zu.", - retry_count, more + bytes_read, num_bytes); + // Socket failure. Probably closed. 
+ if (format != NULL) { + lf_print_error_system_failure(format, args); + } else { + lf_print_error("Socket read failed."); + return more; + } } else if (more == 0) { // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, // upon receiving a zero length packet or an error, we can close the socket. - // If there are any pending outgoing messages, this will attempt to send those - // followed by an EOF. LF_PRINT_DEBUG("EOF received from client. Closing socket."); lf_mutex_lock(&socket_mutex); + // If there are any pending outgoing messages, this will attempt to send those + // followed by an EOF. + shutdown(socket, SHUT_WR); close(socket); lf_mutex_unlock(&socket_mutex); return more; @@ -135,25 +139,68 @@ ssize_t read_from_socket(int socket, size_t num_bytes, unsigned char* buffer) { return read_from_socket_errexit(socket, num_bytes, buffer, NULL); } -ssize_t write_to_socket_with_mutex( - int socket, - size_t num_bytes, - unsigned char* buffer, - lf_mutex_t* mutex, - char* format, ...) { +ssize_t peek_from_socket(int socket, unsigned char* result) { + ssize_t bytes_read = recv(socket, result, 1, MSG_DONTWAIT | MSG_PEEK); + if (bytes_read < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) return 0; + else return bytes_read; +} + +int write_to_socket(int socket, size_t num_bytes, unsigned char* buffer) { + if (socket < 0) { + // Socket is not open. + errno = EBADF; + return -1; + } ssize_t bytes_written = 0; va_list args; while (bytes_written < (ssize_t)num_bytes) { ssize_t more = write(socket, buffer + bytes_written, num_bytes - (size_t)bytes_written); - if (more <= 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) { - // The error code set by the socket indicates - // that we should try again (@see man errno). + if (more <= 0 && (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)) { + // The error codes EAGAIN or EWOULDBLOCK indicate + // that we should try again (@see man errno). 
+ // The error code EINTR means the system call was interrupted before completing. LF_PRINT_DEBUG("Writing to socket was blocked. Will try again."); + lf_sleep(DELAY_BETWEEN_SOCKET_RETRIES); continue; } else if (more < 0) { - // An error occurred. - shutdown(socket, SHUT_RDWR); - close(socket); + // A more serious error occurred. + return -1; + } + bytes_written += more; + } + return 0; +} + +int write_to_socket_close_on_error(int* socket, size_t num_bytes, unsigned char* buffer) { + assert(socket); + if (*socket >= 0) { + int result = write_to_socket(*socket, num_bytes, buffer); + if (result) { + // Write failed. + // Socket has probably been closed from the other side. + // Shut down and close the socket from this side. + shutdown(*socket, SHUT_RDWR); + close(*socket); + // Mark the socket closed. + *socket = -1; + } + return result; + } + return -1; +} + +void write_to_socket_fail_on_error( + int* socket, + size_t num_bytes, + unsigned char* buffer, + lf_mutex_t* mutex, + char* format, ...) { + va_list args; + assert(socket); + if (*socket >= 0) { + int result = write_to_socket_close_on_error(socket, num_bytes, buffer); + if (result) { + // Write failed. if (mutex != NULL) { lf_mutex_unlock(mutex); } @@ -162,23 +209,8 @@ ssize_t write_to_socket_with_mutex( } else { lf_print_error("Failed to write to socket. Closing it."); } - return more; } - bytes_written += more; } - return bytes_written; -} - -ssize_t write_to_socket_errexit( - int socket, - size_t num_bytes, - unsigned char* buffer, - char* format, ...) 
{ - return write_to_socket_with_mutex(socket, num_bytes, buffer, NULL, format); -} - -ssize_t write_to_socket(int socket, size_t num_bytes, unsigned char* buffer) { - return write_to_socket_with_mutex(socket, num_bytes, buffer, NULL, NULL); } #endif // FEDERATED diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index f164fdb07..78129e708 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -249,7 +249,7 @@ void spawn_staa_thread(void); /** * Connect to the federate with the specified id. This established - * connection will then be used in functions such as send_timed_message() + * connection will then be used in functions such as send_tagged_message() * to send messages directly to the specified federate. * This function first sends an MSG_TYPE_ADDRESS_QUERY message to the RTI to obtain * the IP address and port number of the specified federate. It then attempts @@ -371,26 +371,20 @@ void stall_advance_level_federation(environment_t* env, size_t level); bool update_max_level(tag_t tag, bool is_provisional); /** - * Send a message to another federate directly or via the RTI. + * Send a message to another federate. This function is used for physical connections + * between federates. If the socket connection to the remote federate or the RTI has been broken, + * then this returns -1 without sending. Otherwise, it returns 0. + * * This method assumes that the caller does not hold the outbound_socket_mutex lock, * which it acquires to perform the send. * - * If the socket connection to the remote federate or the RTI has been broken, - * then this returns 0 without sending. Otherwise, it returns 1. - * - * @note This function is similar to send_timed_message() except that it - * does not deal with time and timed_messages. - * - * @param message_type The type of the message being sent. 
- * Currently can be MSG_TYPE_TAGGED_MESSAGE for messages sent via - * RTI or MSG_TYPE_P2P_TAGGED_MESSAGE for messages sent between - * federates. + * @param message_type The type of the message being sent (currently only MSG_TYPE_P2P_MESSAGE). * @param port The ID of the destination port. * @param federate The ID of the destination federate. - * @param next_destination_str The name of the next destination in string format + * @param next_destination_str The name of the next destination in string format (for reporting). * @param length The message length. * @param message The message. - * @return 1 if the message has been sent, 0 otherwise. + * @return 0 if the message has been sent, -1 otherwise. */ int send_message(int message_type, unsigned short port, @@ -400,48 +394,45 @@ int send_message(int message_type, unsigned char* message); /** - * Send the specified timestamped message to the specified port in the - * specified federate via the RTI or directly to a federate depending on - * the given socket. The timestamp is calculated as current_logical_time + - * additional delay which is greater than or equal to zero. - * The port should be an input port of a reactor in - * the destination federate. This version does include the timestamp - * in the message. The caller can reuse or free the memory after this returns. + * Send a tagged message to the specified port of the specified federate. + * The tag will be the current tag of the specified environment delayed by the specified additional_delay. + * If the delayed tag falls after the timeout time, then the message is not sent and -1 is returned. + * The caller can reuse or free the memory storing the message after this returns. * - * If the socket connection to the remote federate or the RTI has been broken, - * then this returns 0 without sending. Otherwise, it returns 1. + * If the message fails to send (e.g. the socket connection is broken), then the + * response depends on the message_type. 
For MSG_TYPE_TAGGED_MESSAGE, the message is + * supposed to go via the RTI, and failure to communicate with the RTI is a critical failure. + * In this case, the program will exit with an error message. If the message type is + * MSG_TYPE_P2P_TAGGED_MESSAGE, then the failure is not critical. It may be due to the + * remote federate having exited, for example, because its safe-to-process offset led it + * to believe that there were no messages forthcoming. In this case, on failure to send + * the message, this function returns -1. * * This method assumes that the caller does not hold the outbound_socket_mutex lock, * which it acquires to perform the send. * - * @note This function is similar to send_message() except that it - * sends timed messages and also contains logics related to time. - * - * @param env The environment in which we are executing - * @param additional_delay The offset applied to the timestamp - * using after. The additional delay will be greater or equal to zero - * if an after is used on the connection. If no after is given in the - * program, -1 is passed. - * @param message_type The type of the message being sent. - * Currently can be MSG_TYPE_TAGGED_MESSAGE for messages sent via - * RTI or MSG_TYPE_P2P_TAGGED_MESSAGE for messages sent between - * federates. + * @param env The environment from which to get the current tag. + * @param additional_delay The after delay on the connection or NEVER if there is none. + * @param message_type The type of the message being sent. Currently can be + * MSG_TYPE_TAGGED_MESSAGE for messages sent via the RTI or MSG_TYPE_P2P_TAGGED_MESSAGE + * for messages sent directly between federates. * @param port The ID of the destination port. * @param federate The ID of the destination federate. * @param next_destination_str The next destination in string format (RTI or federate) * (used for reporting errors). * @param length The message length. * @param message The message.
- * @return 1 if the message has been sent, 0 otherwise. + * @return 0 if the message has been sent, -1 otherwise. */ -int send_timed_message(environment_t*, - interval_t, - int, - unsigned short, - unsigned short, - const char*, - size_t, - unsigned char*); +int send_tagged_message( + environment_t* env, + interval_t additional_delay, + int message_type, + unsigned short port, + unsigned short federate, + const char* next_destination_str, + size_t length, + unsigned char* message); /** * Synchronize the start with other federates via the RTI. diff --git a/include/core/federated/network/net_common.h b/include/core/federated/network/net_common.h index 21ed69141..382037e8d 100644 --- a/include/core/federated/network/net_common.h +++ b/include/core/federated/network/net_common.h @@ -151,18 +151,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Physical connections also use the above P2P sockets between * federates even if the coordination is centralized. * - * Note: Peer-to-peer sockets can be closed by the downstream federate. - * For example, when a downstream federate reaches its stop time, then - * it will stop accepting physical messages. To achieve an orderly shutdown, - * the downstream federate sends a MSG_TYPE_CLOSE_REQUEST message to the upstream - * one and the upstream federate handles closing the socket. This way, any - * messages that are in the middle of being sent while the downstream - * federate shuts down will successfully traverse the socket, even if - * only to be ignored by the downstream federate. It is valid to ignore - * such messages if the connection is physical or if the coordination is - * decentralized and the messages arrive after the STP offset of the - * downstream federate (i.e., they are "tardy"). - * * Afterward, the federates and the RTI decide on a common start time by having * each federate report a reading of its physical clock to the RTI on a * `MSG_TYPE_TIMESTAMP`.
The RTI broadcasts the maximum of these readings plus @@ -585,14 +573,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #define MSG_TYPE_P2P_TAGGED_MESSAGE 17 -/** - * Byte identifying a message that a downstream federate sends to its - * upstream counterpart to request that the socket connection be closed. - * This is the only message that should flow upstream on such socket - * connections. - */ -#define MSG_TYPE_CLOSE_REQUEST 18 - //////////////////////////////////////////////// /** * Physical clock synchronization messages according to PTP. diff --git a/include/core/federated/network/net_util.h b/include/core/federated/network/net_util.h index 651ffdda1..2883b7a55 100644 --- a/include/core/federated/network/net_util.h +++ b/include/core/federated/network/net_util.h @@ -101,81 +101,77 @@ ssize_t read_from_socket_errexit( unsigned char* buffer, char* format, ...); -ssize_t write_to_socket(int socket, size_t num_bytes, unsigned char* buffer); - /** - * Read the specified number of bytes from the specified socket into the - * specified buffer. If a disconnect occurs during this - * reading, return a negative number. If an EOF occurs during this - * reading, return 0. Otherwise, return the number of bytes read. - * This is a version of read_from_socket_errexit() that does not error out. + * Without blocking, peek at the specified socket and, if there is + * anything on the queue, put its first byte at the specified address and return 1. + * If there is nothing on the queue, return 0, and if an error occurs, + * return -1. * @param socket The socket ID. - * @param num_bytes The number of bytes to read. - * @param buffer The buffer into which to put the bytes. - * @return The number of bytes read or 0 when EOF is received or negative for an error. + * @param result Pointer to where to put the first byte available on the socket. 
*/ -ssize_t read_from_socket(int socket, size_t num_bytes, unsigned char* buffer); +ssize_t peek_from_socket(int socket, unsigned char* result); /** * Write the specified number of bytes to the specified socket from the - * specified buffer. If a disconnect or an EOF occurs during this - * reading, report an error and exit, unless the format string is NULL, - * in which case, report an error and return. This function takes a formatted - * string and additional optional arguments similar to printf(format, ...) - * that is appended to the error messages. + * specified buffer. If an error occurs, return -1 and set errno to indicate + * the cause of the error. If the write succeeds, return 0. + * This function repeats the attempt until the specified number of bytes + * have been written or an error occurs. Specifically, errors EAGAIN, + * EWOULDBLOCK, and EINTR are not considered errors and instead trigger + * another attempt. A delay between attempts is given by + * DELAY_BETWEEN_SOCKET_RETRIES. * @param socket The socket ID. * @param num_bytes The number of bytes to write. * @param buffer The buffer from which to get the bytes. - * @param mutex If non-NULL, the mutex to unlock before exiting. - * @param format A format string for error messages, followed by any number of - * fields that will be used to fill the format string as in printf, or NULL - * to prevent exit on error. - * @return The number of bytes written, or 0 if an EOF was received, or a negative - * number if an error occurred. + * @return 0 for success, -1 for failure. */ -ssize_t write_to_socket_with_mutex( - int socket, - size_t num_bytes, - unsigned char* buffer, - lf_mutex_t* mutex, - char* format, ...); +int write_to_socket(int socket, size_t num_bytes, unsigned char* buffer); /** - * Write the specified number of bytes to the specified socket from the - * specified buffer. 
If a disconnect or an EOF occurs during this - * reading, report an error and exit, unless the format string is NULL, - * in which case, report an error and return. This function takes a formatted - * string and additional optional arguments similar to printf(format, ...) - * that is appended to the error messages. - * @param socket The socket ID. + * Write the specified number of bytes to the specified socket using write_to_socket + * and close the socket if an error occurs. If an error occurs, this will change the + * socket ID pointed to by the first argument to -1 and will return -1. + * @param socket Pointer to the socket ID. + * @param num_bytes The number of bytes to write. + * @param buffer The buffer from which to get the bytes. + * @return 0 for success, -1 for failure. + */ +int write_to_socket_close_on_error(int* socket, size_t num_bytes, unsigned char* buffer); + +/** + * Write the specified number of bytes to the specified socket using + * write_to_socket_close_on_error and exit with an error code if an error occurs. + * If the mutex argument is non-NULL, release the mutex before exiting. If the + * format argument is non-null, then use it and any additional arguments to form + * the error message using printf conventions. Otherwise, print a generic error + * message. + * @param socket Pointer to the socket ID. * @param num_bytes The number of bytes to write. * @param buffer The buffer from which to get the bytes. * @param mutex If non-NULL, the mutex to unlock before exiting. * @param format A format string for error messages, followed by any number of * fields that will be used to fill the format string as in printf, or NULL - * to prevent exit on error. - * @return The number of bytes written, or 0 if an EOF was received, or a negative - * number if an error occurred. + * to print a generic error message.
*/ -ssize_t write_to_socket_errexit( - int socket, +void write_to_socket_fail_on_error( + int* socket, size_t num_bytes, unsigned char* buffer, + lf_mutex_t* mutex, char* format, ...); /** - * Write the specified number of bytes to the specified socket from the - * specified buffer. If a disconnect or an EOF occurs during this - * reading, return a negative number or 0 respectively. Otherwise, - * return the number of bytes written. - * This is a version of write_to_socket() that does not error out. + * Read the specified number of bytes from the specified socket into the + * specified buffer. If a disconnect occurs during this + * reading, return a negative number. If an EOF occurs during this + * reading, return 0. Otherwise, return the number of bytes read. + * This is a version of read_from_socket_errexit() that does not error out. * @param socket The socket ID. - * @param num_bytes The number of bytes to write. - * @param buffer The buffer from which to get the bytes. - * @return The number of bytes written, or 0 if an EOF was received, or a negative - * number if an error occurred. + * @param num_bytes The number of bytes to read. + * @param buffer The buffer into which to put the bytes. + * @return The number of bytes read or 0 when EOF is received or negative for an error. */ -int write_to_socket2(int socket, int num_bytes, unsigned char* buffer); +ssize_t read_from_socket(int socket, size_t num_bytes, unsigned char* buffer); #endif // FEDERATED diff --git a/include/core/utils/util.h b/include/core/utils/util.h index a41561e0a..66b9fe504 100644 --- a/include/core/utils/util.h +++ b/include/core/utils/util.h @@ -289,4 +289,32 @@ void lf_register_print_function(print_message_function_t* function, int log_leve } \ } while(0) #endif // LF_NOASSERT + +/** + * Checking mutex locking and unlocking. 
+ */ +#define LF_MUTEX_INIT(mutex) \ + do { \ + int result = lf_mutex_init(mutex); \ + LF_ASSERT(result == 0, "Mutex init failed."); \ + } while (0) + +#define LF_MUTEX_LOCK(mutex) \ + do { \ + int result = lf_mutex_lock(mutex); \ + LF_ASSERT(result == 0, "Mutex lock failed."); \ + } while (0) + +#define LF_MUTEX_UNLOCK(mutex) \ + do { \ + int result = lf_mutex_unlock(mutex); \ + LF_ASSERT(result == 0, "Mutex unlock failed."); \ + } while (0) + +#define LF_COND_INIT(cond, mutex) \ + do { \ + int result = lf_cond_init(cond, mutex); \ + LF_ASSERT(result == 0, "Condition variable init failed."); \ + } while (0) + #endif /* UTIL_H */ From 6a1e31335b645e12a32c948a526a3d38c7bed6c7 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Tue, 26 Dec 2023 17:12:31 -0800 Subject: [PATCH 16/83] Send all messages to stdout, not stderr --- core/utils/util.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/core/utils/util.c b/core/utils/util.c index 302d716e4..f03403eaf 100644 --- a/core/utils/util.c +++ b/core/utils/util.c @@ -141,11 +141,8 @@ void _lf_message_print( #endif // STANDALONE_RTI } if (print_message_function == NULL) { - if (is_error) { - vfprintf(stderr, message, args); - } else { - vfprintf(stdout, message, args); - } + // NOTE: Send all messages to stdout, not to stderr, so that ordering makes sense. + vfprintf(stdout, message, args); } else { (*print_message_function)(message, args); } @@ -213,6 +210,7 @@ void lf_print_error_and_exit(const char* format, ...) { va_start (args, format); lf_vprint_fatal_error(format, args); va_end (args); + fflush(stdout); exit(EXIT_FAILURE); } From a31a5d4441acfdd6868a6cd4ae75a02697ab7b78 Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Tue, 26 Dec 2023 17:13:31 -0800 Subject: [PATCH 17/83] Allow scheduling at current time before execution starts --- core/reactor_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/reactor_common.c b/core/reactor_common.c index 022f903ff..a282e0346 100644 --- a/core/reactor_common.c +++ b/core/reactor_common.c @@ -690,7 +690,7 @@ int _lf_schedule_at_tag(environment_t* env, trigger_t* trigger, tag_t tag, lf_to LF_PRINT_DEBUG("_lf_schedule_at_tag() called with tag " PRINTF_TAG " at tag " PRINTF_TAG ".", tag.time - start_time, tag.microstep, current_logical_tag.time - start_time, current_logical_tag.microstep); - if (lf_tag_compare(tag, current_logical_tag) <= 0) { + if (lf_tag_compare(tag, current_logical_tag) <= 0 && _lf_execution_started) { lf_print_warning("_lf_schedule_at_tag(): requested to schedule an event in the past."); return -1; } From b849176cca9d812639a1419ecbf4e7a217c3b63a Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Tue, 26 Dec 2023 17:13:40 -0800 Subject: [PATCH 18/83] Better handling of startup --- core/federated/federate.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index d82144ea0..0fe000713 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -1128,16 +1128,14 @@ void update_last_known_status_on_input_ports(tag_t tag) { */ void update_last_known_status_on_input_port(tag_t tag, int port_id) { trigger_t* input_port_action = _lf_action_for_port(port_id)->trigger; - if (lf_tag_compare(tag, - input_port_action->last_known_status_tag) >= 0) { - if (lf_tag_compare(tag, - input_port_action->last_known_status_tag) == 0) { - // If the intended tag for an input port is equal to the last known status, we need - // to increment the microstep. This is a direct result of the behavior of the lf_delay_tag() - // semantics in tag.h. 
- tag.microstep++; - } - LF_PRINT_DEBUG( + if (lf_tag_compare(tag, input_port_action->last_known_status_tag) >= 0) { + if (lf_tag_compare(tag, input_port_action->last_known_status_tag) == 0) { + // If the intended tag for an input port is equal to the last known status, we need + // to increment the microstep. This is a direct result of the behavior of the lf_delay_tag() + // semantics in tag.h. + tag.microstep++; + } + LF_PRINT_LOG( "Updating the last known status tag of port %d from " PRINTF_TAG " to " PRINTF_TAG ".", port_id, input_port_action->last_known_status_tag.time - lf_time_start(), @@ -1311,7 +1309,6 @@ static trigger_handle_t schedule_message_received_from_network_locked( // in the future relative to the tag of this // federate. By default, assume it is not. bool message_tag_is_in_the_future = lf_tag_compare(tag, env->current_tag) > 0; - // Assign the intended tag temporarily to restore later. tag_t previous_intended_tag = trigger->intended_tag; trigger->intended_tag = tag; @@ -1319,17 +1316,19 @@ static trigger_handle_t schedule_message_received_from_network_locked( // Calculate the extra_delay required to be passed // to the schedule function. interval_t extra_delay = tag.time - env->current_tag.time; - if (!message_tag_is_in_the_future) { + if (!message_tag_is_in_the_future && _lf_execution_started) { #ifdef FEDERATED_CENTRALIZED // If the coordination is centralized, receiving a message // that does not carry a timestamp that is in the future // would indicate a critical condition, showing that the // time advance mechanism is not working correctly. - lf_print_error_and_exit("Received a message at tag " PRINTF_TAG " that" - " has a tag " PRINTF_TAG " that has violated the STP offset. 
" - "Centralized coordination should not have these types of messages.", - env->current_tag.time - start_time, env->current_tag.microstep, - tag.time - start_time, tag.microstep); + LF_MUTEX_UNLOCK(&env->mutex); + lf_print_error_and_exit( + "Received a message at tag " PRINTF_TAG " that has a tag " PRINTF_TAG + " that has violated the STP offset. " + "Centralized coordination should not have these types of messages.", + env->current_tag.time - start_time, env->current_tag.microstep, + tag.time - start_time, tag.microstep); #else // Set the delay back to 0 extra_delay = 0LL; @@ -1563,10 +1562,7 @@ void handle_tagged_message(int socket, int fed_id) { // can be checked in this scenario without this race condition. The message with // intended_tag of 9 in this case needs to wait one microstep to be processed. if (lf_tag_compare(intended_tag, lf_tag(env)) == 0 // The event is meant for the current tag. -#if defined FEDERATED_DECENTRALIZED - // Not sure why this test is only needed for decentralized coordination. && _lf_execution_started -#endif // FEDERATED_DECENTRALIZED // Check that MLAA is blocking at the right level. Otherwise, data can be lost. && action->trigger->reactions[0]->index >= max_level_allowed_to_advance && !action->trigger->is_physical @@ -1602,7 +1598,6 @@ void handle_tagged_message(int socket, int fed_id) { } else { // If no port absent reaction is waiting for this message, or if the intended // tag is in the future, use schedule functions to process the message. - update_last_known_status_on_input_port(intended_tag, port_id); // Before that, if the current time >= stop time, discard the message. From 535cacb735ea8eb4a1f5b2e4e2baf6f0c0594074 Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Wed, 27 Dec 2023 06:52:03 -0800 Subject: [PATCH 19/83] Made execution_started an environment flag --- core/federated/federate.c | 4 ++-- core/reactor.c | 2 +- core/reactor_common.c | 14 +++++--------- core/threaded/reactor_threaded.c | 2 +- include/core/environment.h | 1 + include/core/reactor_common.h | 1 - 6 files changed, 10 insertions(+), 14 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 0fe000713..f98515448 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -1316,7 +1316,7 @@ static trigger_handle_t schedule_message_received_from_network_locked( // Calculate the extra_delay required to be passed // to the schedule function. interval_t extra_delay = tag.time - env->current_tag.time; - if (!message_tag_is_in_the_future && _lf_execution_started) { + if (!message_tag_is_in_the_future && env->execution_started) { #ifdef FEDERATED_CENTRALIZED // If the coordination is centralized, receiving a message // that does not carry a timestamp that is in the future @@ -1562,7 +1562,7 @@ void handle_tagged_message(int socket, int fed_id) { // can be checked in this scenario without this race condition. The message with // intended_tag of 9 in this case needs to wait one microstep to be processed. if (lf_tag_compare(intended_tag, lf_tag(env)) == 0 // The event is meant for the current tag. - && _lf_execution_started + && env->execution_started // Check that MLAA is blocking at the right level. Otherwise, data can be lost. 
&& action->trigger->reactions[0]->index >= max_level_allowed_to_advance && !action->trigger->is_physical diff --git a/core/reactor.c b/core/reactor.c index d0bf6a0cd..ceace5125 100644 --- a/core/reactor.c +++ b/core/reactor.c @@ -383,7 +383,6 @@ int lf_reactor_c_main(int argc, const char* argv[]) { // Set up modal infrastructure _lf_initialize_modes(env); #endif - _lf_execution_started = true; _lf_trigger_startup_reactions(env); _lf_initialize_timers(env); // If the stop_tag is (0,0), also insert the shutdown @@ -394,6 +393,7 @@ int lf_reactor_c_main(int argc, const char* argv[]) { } LF_PRINT_DEBUG("Running the program's main loop."); // Handle reactions triggered at time (T,m). + env->execution_started = true; if (_lf_do_step(env)) { while (next(env) != 0); } diff --git a/core/reactor_common.c b/core/reactor_common.c index a282e0346..b8e52e4d7 100644 --- a/core/reactor_common.c +++ b/core/reactor_common.c @@ -99,12 +99,6 @@ unsigned int _lf_number_of_workers = 0u; */ instant_t duration = -1LL; -/** - * Indicates whether or not the execution - * has started. - */ -bool _lf_execution_started = false; - /** Indicator of whether the keepalive command-line option was given. */ bool keepalive_specified = false; @@ -275,7 +269,7 @@ void _lf_trigger_reaction(environment_t* env, reaction_t* reaction, int worker_n * counts between time steps and at the end of execution. */ void _lf_start_time_step(environment_t *env) { - if (_lf_execution_started == false) { + if (!env->execution_started) { // Execution hasn't started, so this is probably being invoked in termination // due to an error. 
return; @@ -690,8 +684,8 @@ int _lf_schedule_at_tag(environment_t* env, trigger_t* trigger, tag_t tag, lf_to LF_PRINT_DEBUG("_lf_schedule_at_tag() called with tag " PRINTF_TAG " at tag " PRINTF_TAG ".", tag.time - start_time, tag.microstep, current_logical_tag.time - start_time, current_logical_tag.microstep); - if (lf_tag_compare(tag, current_logical_tag) <= 0 && _lf_execution_started) { - lf_print_warning("_lf_schedule_at_tag(): requested to schedule an event in the past."); + if (lf_tag_compare(tag, current_logical_tag) <= 0 && env->execution_started) { + lf_print_warning("_lf_schedule_at_tag(): requested to schedule an event at the current or past tag."); return -1; } @@ -1540,6 +1534,8 @@ void usage(int argc, const char* argv[]) { #ifdef FEDERATED printf(" -r, --rti \n"); printf(" The address of the RTI, which can be in the form of user@host:port or ip:port.\n\n"); + printf(" -l\n"); + printf(" Send stdout to individual log files for each federate.\n\n"); #endif printf("Command given:\n"); diff --git a/core/threaded/reactor_threaded.c b/core/threaded/reactor_threaded.c index f3b44bea4..91e53cbed 100644 --- a/core/threaded/reactor_threaded.c +++ b/core/threaded/reactor_threaded.c @@ -776,7 +776,7 @@ void _lf_initialize_start_tag(environment_t *env) { // Set the following boolean so that other thread(s), including federated threads, // know that the execution has started - _lf_execution_started = true; + env->execution_started = true; } /** For logging and debugging, each worker thread is numbered. */ diff --git a/include/core/environment.h b/include/core/environment.h index d4852ddca..8670b8213 100644 --- a/include/core/environment.h +++ b/include/core/environment.h @@ -67,6 +67,7 @@ typedef struct enclave_info_t enclave_info_t; */ typedef struct environment_t { bool initialized; + bool execution_started; // Events at the start tag have been pulled from the event queue. 
char *name; int id; tag_t current_tag; diff --git a/include/core/reactor_common.h b/include/core/reactor_common.h index 1ec8082b6..1010fc65e 100644 --- a/include/core/reactor_common.h +++ b/include/core/reactor_common.h @@ -15,7 +15,6 @@ extern unsigned int _lf_number_of_workers; extern bool fast; extern instant_t duration; -extern bool _lf_execution_started; extern bool keepalive_specified; extern interval_t _lf_fed_STA_offset; From e17ee9a3002e9dff900c3dfee674d56cdef38314 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Wed, 27 Dec 2023 09:50:39 -0800 Subject: [PATCH 20/83] Prevent spurious error at start --- core/federated/federate.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index f98515448..ae0372dc6 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -255,9 +255,6 @@ static void _lf_close_inbound_socket(int fed_id, int flag) { if (flag >= 0) { if (flag > 0) { shutdown(_fed.sockets_for_inbound_p2p_connections[fed_id], SHUT_RDWR); - // Flag indicates that there could still be incoming data. - unsigned char message[32]; - while (read(_fed.sockets_for_inbound_p2p_connections[fed_id], &message, 32) > 0); } else { // Have received EOF from the other end. Send EOF to the other end. shutdown(_fed.sockets_for_inbound_p2p_connections[fed_id], SHUT_WR); @@ -1602,7 +1599,7 @@ void handle_tagged_message(int socket, int fed_id) { // Before that, if the current time >= stop time, discard the message. // But only if the stop time is not equal to the start time! - if (lf_tag_compare(env->current_tag, env->stop_tag) >= 0) { + if (lf_tag_compare(env->current_tag, env->stop_tag) >= 0 && env->execution_started) { lf_print_error("Received message too late. 
Already at stop tag.\n" " Current tag is " PRINTF_TAG " and intended tag is " PRINTF_TAG ".\n" " Discarding message and closing the socket.", @@ -2629,7 +2626,9 @@ tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply while (true) { // Wait until either something changes on the event queue or // the RTI has responded with a TAG. - LF_PRINT_DEBUG("Waiting for a TAG from the RTI with _fed.last_TAG.time=%lld, %lld and net=%lld, %lld", (long long) _fed.last_TAG.time - start_time, (long long) _fed.last_TAG.microstep, (long long) tag.time - start_time, (long long) tag.microstep); + LF_PRINT_DEBUG("Waiting for a TAG from the RTI with _fed.last_TAG= " PRINTF_TAG " and net=" PRINTF_TAG, + _fed.last_TAG.time - start_time, _fed.last_TAG.microstep, + tag.time - start_time, tag.microstep); if (lf_cond_wait(&env->event_q_changed) != 0) { lf_print_error("Wait error."); } From f406b26c899d7c060d086869b310fc2c4c5fcde4 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Wed, 27 Dec 2023 09:50:54 -0800 Subject: [PATCH 21/83] Comment only --- core/reactor_common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/reactor_common.c b/core/reactor_common.c index b8e52e4d7..43dde86d1 100644 --- a/core/reactor_common.c +++ b/core/reactor_common.c @@ -652,8 +652,8 @@ static void _lf_replace_token(event_t* event, lf_token_t* token) { /** * Schedule events at a specific tag (time, microstep), provided - * that the tag is in the future relative to the current tag. - * The input time values are absolute. + * that the tag is in the future relative to the current tag (or the + * environment has not started executing). The input time values are absolute. * * If there is an event found at the requested tag, the payload * is replaced and 0 is returned. From b5718260249f19988b1a59cf79ef4a0a27ea84b7 Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Wed, 27 Dec 2023 09:51:22 -0800 Subject: [PATCH 22/83] Pop events after wait, not before --- core/threaded/reactor_threaded.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/core/threaded/reactor_threaded.c b/core/threaded/reactor_threaded.c index 91e53cbed..c8dae9b15 100644 --- a/core/threaded/reactor_threaded.c +++ b/core/threaded/reactor_threaded.c @@ -736,10 +736,6 @@ void _lf_initialize_start_tag(environment_t *env) { // Restore the current tag to match the start time. env->current_tag = (tag_t){.time = start_time, .microstep = 0u}; - // For messages that may have arrived while we were waiting, put - // reactions on the reaction queue. - _lf_pop_events(env); - // If the stop_tag is (0,0), also insert the shutdown // reactions. This can only happen if the timeout time // was set to 0. @@ -763,6 +759,11 @@ void _lf_initialize_start_tag(environment_t *env) { // tag). Inform the RTI of this if needed. send_next_event_tag(env, env->current_tag, true); #endif // NOT FEDERATED_DECENTRALIZED + + // For messages that may have arrived while we were waiting, put + // reactions on the reaction queue. + _lf_pop_events(env); + #else // NOT FEDERATED _lf_initialize_timers(env); From dfde25f801b73fe1fbcca7840cacdd30e6b744a1 Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Wed, 27 Dec 2023 10:10:37 -0800 Subject: [PATCH 23/83] Ensure dummy events before start of execution --- core/reactor_common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/reactor_common.c b/core/reactor_common.c index 43dde86d1..b278ec0c4 100644 --- a/core/reactor_common.c +++ b/core/reactor_common.c @@ -836,10 +836,11 @@ int _lf_schedule_at_tag(environment_t* env, trigger_t* trigger, tag_t tag, lf_to if (tag.time == current_logical_tag.time) { relative_microstep -= current_logical_tag.microstep; } - if (((tag.time == current_logical_tag.time) && (relative_microstep == 1)) || + if ((tag.time == current_logical_tag.time && relative_microstep == 1 && env->execution_started) || tag.microstep == 0) { // Do not need a dummy event if we are scheduling at 1 microstep // in the future at current time or at microstep 0 in a future time. + // Note that if execution hasn't started, then we have to insert dummy events. pqueue_insert(env->event_q, e); } else { // Create a dummy event. Insert it into the queue, and let its next From 88986310baac6b796209a287a7d845393726b186 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Wed, 27 Dec 2023 10:10:49 -0800 Subject: [PATCH 24/83] Typo in comment --- include/core/utils/pqueue_tag.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/core/utils/pqueue_tag.h b/include/core/utils/pqueue_tag.h index aef72d507..3eb8d4a5e 100644 --- a/include/core/utils/pqueue_tag.h +++ b/include/core/utils/pqueue_tag.h @@ -35,7 +35,7 @@ * pqueue_tag_insert_tag, pqueue_tag_insert_if_no_match, and pqueue_tag_pop_tag. * * To customize the element you put onto the queue, for example to carry - * a pyaload, you can create your own element struct type by simply declaring + * a payload, you can create your own element struct type by simply declaring * the first field to be a pqueue_tag_element_t. 
For example, if you want an * element of the queue to include a pointer to your own payload, you can * declare the following struct type: From be4a03adefc1d0adad9a66fd80fcbb7eb64d38af Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Wed, 27 Dec 2023 10:18:16 -0800 Subject: [PATCH 25/83] Fixed modal reactors --- core/reactor_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/reactor_common.c b/core/reactor_common.c index b278ec0c4..bf060a8af 100644 --- a/core/reactor_common.c +++ b/core/reactor_common.c @@ -1766,7 +1766,7 @@ void termination(void) { #ifdef MODAL_REACTORS // Free events and tokens suspended by modal reactors. - _lf_terminate_modal_reactors(env[i]); + _lf_terminate_modal_reactors(&env[i]); #endif // If the event queue still has events on it, report that. if (env[i].event_q != NULL && pqueue_size(env[i].event_q) > 0) { From 95b7dd8ca64816f155c1f582fb232404077140a4 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Wed, 27 Dec 2023 17:35:11 -0800 Subject: [PATCH 26/83] Reworked socket send functions --- core/federated/RTI/rti_remote.c | 169 +++++++++----------- core/federated/clock-sync.c | 30 +--- core/federated/federate.c | 122 +++++++------- core/federated/network/net_util.c | 123 +++++++------- include/core/federated/clock-sync.h | 4 +- include/core/federated/network/net_common.h | 2 +- include/core/federated/network/net_util.h | 51 +++--- 7 files changed, 242 insertions(+), 259 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index a2781af56..b81dac6e1 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -301,9 +301,10 @@ void handle_port_absent_message(federate_info_t *sending_federate, unsigned char { size_t message_size = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int64_t) + sizeof(uint32_t); - read_from_socket_errexit(sending_federate->socket, message_size, &(buffer[1]), - " RTI failed to read port absent message from federate %u.", 
- sending_federate->enclave.id); + read_from_socket_fail_on_error( + &sending_federate->socket, message_size, &(buffer[1]), NULL, + " RTI failed to read port absent message from federate %u.", + sending_federate->enclave.id); uint16_t reactor_port_id = extract_uint16(&(buffer[1])); uint16_t federate_id = extract_uint16(&(buffer[1 + sizeof(uint16_t)])); @@ -370,7 +371,9 @@ void handle_timed_message(federate_info_t *sending_federate, unsigned char *buff { size_t header_size = 1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t) + sizeof(int64_t) + sizeof(uint32_t); // Read the header, minus the first byte which has already been read. - read_from_socket_errexit(sending_federate->socket, header_size - 1, &(buffer[1]), "RTI failed to read the timed message header from remote federate."); + read_from_socket_fail_on_error( + &sending_federate->socket, header_size - 1, &(buffer[1]), NULL, + "RTI failed to read the timed message header from remote federate."); // Extract the header information. of the sender uint16_t reactor_port_id; uint16_t federate_id; @@ -399,8 +402,9 @@ void handle_timed_message(federate_info_t *sending_federate, unsigned char *buff sending_federate->enclave.id, federate_id, reactor_port_id, intended_tag.time - lf_time_start(), intended_tag.microstep); - read_from_socket_errexit(sending_federate->socket, bytes_to_read, &(buffer[header_size]), - "RTI failed to read timed message from federate %d.", federate_id); + read_from_socket_fail_on_error( + &sending_federate->socket, bytes_to_read, &(buffer[header_size]), NULL, + "RTI failed to read timed message from federate %d.", federate_id); size_t bytes_read = bytes_to_read + header_size; // Following only works for string messages. 
// LF_PRINT_DEBUG("Message received by RTI: %s.", buffer + header_size); @@ -499,8 +503,8 @@ void handle_timed_message(federate_info_t *sending_federate, unsigned char *buff { bytes_to_read = FED_COM_BUFFER_SIZE; } - read_from_socket_errexit(sending_federate->socket, bytes_to_read, buffer, - "RTI failed to read message chunks."); + read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, buffer, NULL, + "RTI failed to read message chunks."); total_bytes_read += bytes_to_read; // FIXME: a mutex needs to be held for this so that other threads @@ -519,8 +523,9 @@ void handle_timed_message(federate_info_t *sending_federate, unsigned char *buff void handle_logical_tag_complete(federate_info_t *fed) { unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; - read_from_socket_errexit(fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, - "RTI failed to read the content of the logical tag complete from federate %d.", fed->enclave.id); + read_from_socket_fail_on_error(&fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, NULL, + "RTI failed to read the content of the logical tag complete from federate %d.", + fed->enclave.id); tag_t completed = extract_tag(buffer); if (rti_remote->base.tracing_enabled) { @@ -538,8 +543,9 @@ void handle_logical_tag_complete(federate_info_t *fed) void handle_next_event_tag(federate_info_t *fed) { unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; - read_from_socket_errexit(fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, - "RTI failed to read the content of the next event tag from federate %d.", fed->enclave.id); + read_from_socket_fail_on_error(&fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, NULL, + "RTI failed to read the content of the next event tag from federate %d.", + fed->enclave.id); // Acquire a mutex lock to ensure that this state does not change while a // message is in transport or being used to determine a TAG. 
@@ -638,8 +644,9 @@ void handle_stop_request_message(federate_info_t *fed) size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_LENGTH - 1; unsigned char buffer[bytes_to_read]; - read_from_socket_errexit(fed->socket, bytes_to_read, buffer, - "RTI failed to read the MSG_TYPE_STOP_REQUEST payload from federate %d.", fed->enclave.id); + read_from_socket_fail_on_error(&fed->socket, bytes_to_read, buffer, NULL, + "RTI failed to read the MSG_TYPE_STOP_REQUEST payload from federate %d.", + fed->enclave.id); // Extract the proposed stop tag for the federate tag_t proposed_stop_tag = extract_tag(buffer); @@ -727,8 +734,9 @@ void handle_stop_request_reply(federate_info_t *fed) { size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_REPLY_LENGTH - 1; unsigned char buffer_stop_time[bytes_to_read]; - read_from_socket_errexit(fed->socket, bytes_to_read, buffer_stop_time, - "RTI failed to read the reply to MSG_TYPE_STOP_REQUEST message from federate %d.", fed->enclave.id); + read_from_socket_fail_on_error(&fed->socket, bytes_to_read, buffer_stop_time, NULL, + "RTI failed to read the reply to MSG_TYPE_STOP_REQUEST message from federate %d.", + fed->enclave.id); tag_t federate_stop_tag = extract_tag(buffer_stop_time); @@ -759,10 +767,8 @@ void handle_address_query(uint16_t fed_id) { // Use buffer both for reading and constructing the reply. // The length is what is needed for the reply. 
unsigned char buffer[1 + sizeof(int32_t)]; - ssize_t bytes_read = read_from_socket(fed->socket, sizeof(uint16_t), (unsigned char *)buffer); - if (bytes_read == 0) { - lf_print_error_and_exit("Failed to read address query."); - } + read_from_socket_fail_on_error(&fed->socket, sizeof(uint16_t), (unsigned char *)buffer, NULL, + "Failed to read address query."); uint16_t remote_fed_id = extract_uint16(buffer); if (rti_remote->base.tracing_enabled) { @@ -805,13 +811,8 @@ void handle_address_ad(uint16_t federate_id) { // connections to other federates int32_t server_port = -1; unsigned char buffer[sizeof(int32_t)]; - ssize_t bytes_read = read_from_socket(fed->socket, sizeof(int32_t), (unsigned char *)buffer); - - if (bytes_read < (ssize_t)sizeof(int32_t)) { - LF_PRINT_DEBUG("Error reading port data from federate %d.", federate_id); - // Leave the server port at -1, which means "I don't know". - return; - } + read_from_socket_fail_on_error(&fed->socket, sizeof(int32_t), (unsigned char *)buffer, NULL, + "Error reading port data from federate %d.", federate_id); server_port = extract_int32(buffer); @@ -831,11 +832,8 @@ void handle_timestamp(federate_info_t *my_fed) { unsigned char buffer[sizeof(int64_t)]; // Read bytes from the socket. We need 8 bytes. 
- ssize_t bytes_read = read_from_socket(my_fed->socket, sizeof(int64_t), (unsigned char *)&buffer); - if (bytes_read < (ssize_t)sizeof(int64_t)) - { - lf_print_error("ERROR reading timestamp from federate %d.\n", my_fed->enclave.id); - } + read_from_socket_fail_on_error(&my_fed->socket, sizeof(int64_t), (unsigned char *)&buffer, NULL, + "ERROR reading timestamp from federate %d.\n", my_fed->enclave.id); int64_t timestamp = swap_bytes_if_big_endian_int64(*((int64_t *)(&buffer))); if (rti_remote->base.tracing_enabled) @@ -954,14 +952,12 @@ void handle_physical_clock_sync_message(federate_info_t *my_fed, socket_type_t s LF_MUTEX_UNLOCK(&rti_mutex); } -void *clock_synchronization_thread(void *noargs) -{ +void *clock_synchronization_thread(void *noargs) { // Wait until all federates have been notified of the start time. // FIXME: Use lf_ version of this when merged with master. LF_MUTEX_LOCK(&rti_mutex); - while (rti_remote->num_feds_proposed_start < rti_remote->base.number_of_scheduling_nodes) - { + while (rti_remote->num_feds_proposed_start < rti_remote->base.number_of_scheduling_nodes) { lf_cond_wait(&received_start_times); } LF_MUTEX_UNLOCK(&rti_mutex); @@ -970,35 +966,29 @@ void *clock_synchronization_thread(void *noargs) // The above wait ensures that start_time has been set. 
interval_t ns_to_wait = start_time - lf_time_physical(); - if (ns_to_wait > 0LL) - { + if (ns_to_wait > 0LL) { lf_sleep(ns_to_wait); } // Initiate a clock synchronization every rti->clock_sync_period_ns // Initiate a clock synchronization every rti->clock_sync_period_ns struct timespec sleep_time = {(time_t)rti_remote->clock_sync_period_ns / BILLION, - rti_remote->clock_sync_period_ns % BILLION}; + rti_remote->clock_sync_period_ns % BILLION}; struct timespec remaining_time; bool any_federates_connected = true; - while (any_federates_connected) - { + while (any_federates_connected) { // Sleep lf_sleep(rti_remote->clock_sync_period_ns); // Can be interrupted any_federates_connected = false; - for (int fed_id = 0; fed_id < rti_remote->base.number_of_scheduling_nodes; fed_id++) - { + for (int fed_id = 0; fed_id < rti_remote->base.number_of_scheduling_nodes; fed_id++) { federate_info_t *fed = GET_FED_INFO(fed_id); - if (fed->enclave.state == NOT_CONNECTED) - { + if (fed->enclave.state == NOT_CONNECTED) { // FIXME: We need better error handling here, but clock sync failure // should not stop execution. lf_print_error("Clock sync failed with federate %d. Not connected.", fed_id); continue; - } - else if (!fed->clock_synchronization_enabled) - { + } else if (!fed->clock_synchronization_enabled) { continue; } // Send the RTI's current physical time to the federate @@ -1013,19 +1003,15 @@ void *clock_synchronization_thread(void *noargs) // If the T3 message from this federate does not arrive and we keep receiving // other messages, then give up on this federate and move to the next federate. int remaining_attempts = 5; - while (remaining_attempts > 0) - { + while (remaining_attempts > 0) { remaining_attempts--; - int bytes_read = read_from_socket(rti_remote->socket_descriptor_UDP, message_size, buffer); + int read_failed = read_from_socket(rti_remote->socket_descriptor_UDP, message_size, buffer); // If any errors occur, either discard the message or the clock sync round. 
- if (bytes_read == message_size) - { - if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) - { + if (!read_failed) { + if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) { int32_t fed_id_2 = extract_int32(&(buffer[1])); // Check that this message came from the correct federate. - if (fed_id_2 != fed->enclave.id) - { + if (fed_id_2 != fed->enclave.id) { // Message is from the wrong federate. Discard the message. lf_print_warning("Clock sync: Received T3 message from federate %d, " "but expected one from %d. Discarding message.", @@ -1035,9 +1021,7 @@ void *clock_synchronization_thread(void *noargs) LF_PRINT_DEBUG("Clock sync: RTI received T3 message from federate %d.", fed_id_2); handle_physical_clock_sync_message(GET_FED_INFO(fed_id_2), UDP); break; - } - else - { + } else { // The message is not a T3 message. Discard the message and // continue waiting for the T3 message. This is possibly a message // from a previous cycle that was discarded. @@ -1048,9 +1032,7 @@ void *clock_synchronization_thread(void *noargs) fed->enclave.id); continue; } - } - else - { + } else { lf_print_warning("Clock sync: Read from UDP socket failed: %s. " "Skipping clock sync round for federate %d.", strerror(errno), @@ -1058,8 +1040,7 @@ void *clock_synchronization_thread(void *noargs) remaining_attempts = -1; } } - if (remaining_attempts > 0) - { + if (remaining_attempts > 0) { any_federates_connected = true; } } @@ -1067,32 +1048,27 @@ void *clock_synchronization_thread(void *noargs) return NULL; } -void handle_federate_resign(federate_info_t *my_fed) -{ +void handle_federate_resign(federate_info_t *my_fed) { // Nothing more to do. Close the socket and exit. LF_MUTEX_LOCK(&rti_mutex); // Extract the tag size_t header_size = 1 + sizeof(tag_t); unsigned char buffer[header_size]; // Read the header, minus the first byte which has already been read. 
- read_from_socket_errexit(my_fed->socket, header_size - 1, &(buffer[1]), - "RTI failed to read the resign tag from remote federate."); + read_from_socket_fail_on_error(&my_fed->socket, header_size - 1, &(buffer[1]), NULL, + "RTI failed to read the resign tag from remote federate."); // Extract the tag sent by the resigning federate tag_t tag = extract_tag(&(buffer[1])); - if (rti_remote->base.tracing_enabled) - { + if (rti_remote->base.tracing_enabled) { tracepoint_rti_from_federate(rti_remote->base.trace, receive_RESIGN, my_fed->enclave.id, &tag); } - if (lf_tag_compare(tag, NEVER_TAG) == 0) - { + if (lf_tag_compare(tag, NEVER_TAG) == 0) { // The federate is reporting an error. _lf_federate_reports_error = true; lf_print("RTI: Federate %d reports an error and has resigned.", my_fed->enclave.id); - } - else - { + } else { lf_print("RTI: Federate %d has resigned.", my_fed->enclave.id); } @@ -1115,8 +1091,6 @@ void handle_federate_resign(federate_info_t *my_fed) // We can now safely close the socket. close(my_fed->socket); // from unistd.h - lf_print("RTI: Federate %d has resigned.", my_fed->enclave.id); - // Check downstream federates to see whether they should now be granted a TAG. // To handle cycles, need to create a boolean array to keep // track of which upstream federates have been visited. @@ -1138,8 +1112,8 @@ void *federate_info_thread_TCP(void *fed) { // Listen for messages from the federate. while (my_fed->enclave.state != NOT_CONNECTED) { // Read no more than one byte to get the message type. - ssize_t bytes_read = read_from_socket(my_fed->socket, 1, buffer); - if (bytes_read < 1) { + int read_failed = read_from_socket(my_fed->socket, 1, buffer); + if (read_failed) { // Socket is closed lf_print_warning("RTI: Socket to federate %d is closed. 
Exiting the thread.", my_fed->enclave.id); my_fed->enclave.state = NOT_CONNECTED; @@ -1224,7 +1198,7 @@ void send_reject(int *socket_id, unsigned char error_code) * matches this federation, send an MSG_TYPE_ACK and otherwise send * a MSG_TYPE_REJECT message. Return 1 if the federate is accepted to * the federation and 0 otherwise. - * @param socket_id The socket on which to listen. + * @param socket_id Pointer to the socket on which to listen. * @param client_fd The socket address. * @return The federate ID for success or -1 for failure. */ @@ -1235,7 +1209,8 @@ static int32_t receive_and_check_fed_id_message(int *socket_id, struct sockaddr_ // Read bytes from the socket. We need 4 bytes. // FIXME: This should not exit with error but rather should just reject the connection. - read_from_socket_errexit(*socket_id, length, buffer, "RTI failed to read from accepted socket."); + read_from_socket_fail_on_error(socket_id, length, buffer, NULL, + "RTI failed to read from accepted socket."); uint16_t fed_id = rti_remote->base.number_of_scheduling_nodes; // Initialize to an invalid value. @@ -1268,9 +1243,9 @@ static int32_t receive_and_check_fed_id_message(int *socket_id, struct sockaddr_ char federation_id_received[federation_id_length + 1]; // One extra for null terminator. // Next read the actual federation ID. // FIXME: This should not exit on error, but rather just reject the connection. - read_from_socket_errexit(*socket_id, federation_id_length, - (unsigned char *)federation_id_received, - "RTI failed to read federation id from federate %d.", fed_id); + read_from_socket_fail_on_error(socket_id, federation_id_length, + (unsigned char *)federation_id_received, NULL, + "RTI failed to read federation id from federate %d.", fed_id); // Terminate the string with a null. 
federation_id_received[federation_id_length] = 0; @@ -1361,10 +1336,11 @@ static int32_t receive_and_check_fed_id_message(int *socket_id, struct sockaddr_ static int receive_connection_information(int *socket_id, uint16_t fed_id) { LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_NEIGHBOR_STRUCTURE from federate %d.", fed_id); unsigned char connection_info_header[MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE]; - read_from_socket_errexit( - *socket_id, + read_from_socket_fail_on_error( + socket_id, MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE, connection_info_header, + NULL, "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message header from federate %d.", fed_id); @@ -1408,10 +1384,11 @@ static int receive_connection_information(int *socket_id, uint16_t fed_id) { unsigned char *connections_info_body = NULL; if (connections_info_body_size > 0) { connections_info_body = (unsigned char *)malloc(connections_info_body_size); - read_from_socket_errexit( - *socket_id, + read_from_socket_fail_on_error( + socket_id, connections_info_body_size, connections_info_body, + NULL, "RTI failed to read MSG_TYPE_NEIGHBOR_STRUCTURE message body from federate %d.", fed_id); // Keep track of where we are in the buffer @@ -1455,7 +1432,7 @@ static int receive_udp_message_and_set_up_clock_sync(int *socket_id, uint16_t fe // is doing clock synchronization, and if it is, what port to use for UDP. LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_UDP_PORT from federate %d.", fed_id); unsigned char response[1 + sizeof(uint16_t)]; - read_from_socket_errexit(*socket_id, 1 + sizeof(uint16_t), response, + read_from_socket_fail_on_error(socket_id, 1 + sizeof(uint16_t), response, NULL, "RTI failed to read MSG_TYPE_UDP_PORT message from federate %d.", fed_id); if (response[0] != MSG_TYPE_UDP_PORT) { lf_print_error( @@ -1483,8 +1460,8 @@ static int receive_udp_message_and_set_up_clock_sync(int *socket_id, uint16_t fe // Listen for reply message, which should be T3. 
size_t message_size = 1 + sizeof(int32_t); unsigned char buffer[message_size]; - read_from_socket_errexit(*socket_id, message_size, buffer, - "Socket to federate %d unexpectedly closed.", fed_id); + read_from_socket_fail_on_error(socket_id, message_size, buffer, NULL, + "Socket to federate %d unexpectedly closed.", fed_id); if (buffer[0] == MSG_TYPE_CLOCK_SYNC_T3) { int32_t fed_id = extract_int32(&(buffer[1])); assert(fed_id > -1); @@ -1535,8 +1512,8 @@ static bool authenticate_federate(int *socket) { // Wait for MSG_TYPE_FED_NONCE from federate. size_t fed_id_length = sizeof(uint16_t); unsigned char buffer[1 + fed_id_length + NONCE_LENGTH]; - read_from_socket_errexit(*socket, 1 + fed_id_length + NONCE_LENGTH, buffer, - "Failed to read MSG_TYPE_FED_NONCE"); + read_from_socket_fail_on_error(socket, 1 + fed_id_length + NONCE_LENGTH, buffer, NULL, + "Failed to read MSG_TYPE_FED_NONCE"); if (buffer[0] != MSG_TYPE_FED_NONCE) { lf_print_error_and_exit( @@ -1570,8 +1547,8 @@ static bool authenticate_federate(int *socket) { // Wait for MSG_TYPE_FED_RESPONSE unsigned char received[1 + hmac_length]; - read_from_socket_errexit(*socket, 1 + hmac_length, received, - "Failed to read federate response."); + read_from_socket_fail_on_error(socket, 1 + hmac_length, received, NULL, + "Failed to read federate response."); if (received[0] != MSG_TYPE_FED_RESPONSE) { lf_print_error_and_exit( "Received unexpected response %u from the federate (see net_common.h).", diff --git a/core/federated/clock-sync.c b/core/federated/clock-sync.c index 2e7240454..74e1ed5f1 100644 --- a/core/federated/clock-sync.c +++ b/core/federated/clock-sync.c @@ -203,22 +203,7 @@ uint16_t setup_clock_synchronization_with_rti() { return port_to_return; } -/** - * Synchronize the initial physical clock with the RTI. - * A call to this function is inserted into the startup - * sequence by the code generator if initial clock synchronization - * is required. 
- * - * This is a blocking function that expects - * to read a MSG_TYPE_CLOCK_SYNC_T1 from the RTI TCP socket. - * It will then follow the PTP protocol to synchronize the local - * physical clock with the RTI. - * Failing to complete this protocol is treated as a catastrophic - * error that causes the federate to exit. - * - * @param rti_socket_TCP The rti's socket - */ -void synchronize_initial_physical_clock_with_rti(int rti_socket_TCP) { +void synchronize_initial_physical_clock_with_rti(int* rti_socket_TCP) { LF_PRINT_DEBUG("Waiting for initial clock synchronization messages from the RTI."); size_t message_size = 1 + sizeof(instant_t); @@ -226,7 +211,7 @@ void synchronize_initial_physical_clock_with_rti(int rti_socket_TCP) { for (int i=0; i < _LF_CLOCK_SYNC_EXCHANGES_PER_INTERVAL; i++) { // The first message expected from the RTI is MSG_TYPE_CLOCK_SYNC_T1 - read_from_socket_errexit(rti_socket_TCP, message_size, buffer, + read_from_socket_fail_on_error(rti_socket_TCP, message_size, buffer, NULL, "Federate %d did not get the initial clock synchronization message T1 from the RTI.", _lf_my_fed_id); @@ -240,12 +225,12 @@ void synchronize_initial_physical_clock_with_rti(int rti_socket_TCP) { // Handle the message and send a reply T3 message. // NOTE: No need to acquire the mutex lock during initialization because only // one thread is running. 
- if (handle_T1_clock_sync_message(buffer, rti_socket_TCP, receive_time) != 0) { + if (handle_T1_clock_sync_message(buffer, *rti_socket_TCP, receive_time) != 0) { lf_print_error_and_exit("Initial clock sync: Failed to send T3 reply to RTI."); } // Next message from the RTI is required to be MSG_TYPE_CLOCK_SYNC_T4 - read_from_socket_errexit(rti_socket_TCP, message_size, buffer, + read_from_socket_fail_on_error(rti_socket_TCP, message_size, buffer, NULL, "Federate %d did not get the clock synchronization message T4 from the RTI.", _lf_my_fed_id); @@ -255,7 +240,7 @@ void synchronize_initial_physical_clock_with_rti(int rti_socket_TCP) { } // Handle the message. - handle_T4_clock_sync_message(buffer, rti_socket_TCP, receive_time); + handle_T4_clock_sync_message(buffer, *rti_socket_TCP, receive_time); } LF_PRINT_LOG("Finished initial clock synchronization with the RTI."); @@ -359,12 +344,11 @@ void handle_T4_clock_sync_message(unsigned char* buffer, int socket, instant_t r if (socket == _lf_rti_socket_UDP) { // Read the coded probe message. // We can reuse the same buffer. - ssize_t bytes_read = read_from_socket(socket, 1 + sizeof(instant_t), buffer); + int read_failed = read_from_socket(socket, 1 + sizeof(instant_t), buffer); instant_t r5 = lf_time_physical(); - if ((bytes_read < 1 + (ssize_t)sizeof(instant_t)) - || buffer[0] != MSG_TYPE_CLOCK_SYNC_CODED_PROBE) { + if (read_failed || buffer[0] != MSG_TYPE_CLOCK_SYNC_CODED_PROBE) { lf_print_warning("Clock sync: Did not get the expected coded probe message from the RTI. 
" "Skipping clock synchronization round."); return; diff --git a/core/federated/federate.c b/core/federated/federate.c index ae0372dc6..f1c6df746 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -447,10 +447,11 @@ void* handle_p2p_connections_from_federates(void* env_arg) { size_t header_length = 1 + sizeof(uint16_t) + 1; unsigned char buffer[header_length]; - ssize_t bytes_read = read_from_socket(socket_id, header_length, (unsigned char*)&buffer); - if (bytes_read != (ssize_t)header_length || buffer[0] != MSG_TYPE_P2P_SENDING_FED_ID) { + int read_failed = read_from_socket(socket_id, header_length, (unsigned char*)&buffer); + if (read_failed || buffer[0] != MSG_TYPE_P2P_SENDING_FED_ID) { lf_print_warning("Federate received invalid first message on P2P socket. Closing socket."); - if (bytes_read >= 0) { + if (read_failed == 0) { + // Wrong message received. unsigned char response[2]; response[0] = MSG_TYPE_REJECT; response[1] = WRONG_SERVER; @@ -466,11 +467,10 @@ void* handle_p2p_connections_from_federates(void* env_arg) { // Get the federation ID and check it. unsigned char federation_id_length = buffer[header_length - 1]; char remote_federation_id[federation_id_length]; - bytes_read = read_from_socket(socket_id, federation_id_length, (unsigned char*)remote_federation_id); - if (bytes_read != federation_id_length - || (strncmp(federation_metadata.federation_id, remote_federation_id, strnlen(federation_metadata.federation_id, 255)) != 0)) { + read_failed = read_from_socket(socket_id, federation_id_length, (unsigned char*)remote_federation_id); + if (read_failed || (strncmp(federation_metadata.federation_id, remote_federation_id, strnlen(federation_metadata.federation_id, 255)) != 0)) { lf_print_warning("Received invalid federation ID. 
Closing socket."); - if (bytes_read >= 0) { + if (read_failed == 0) { unsigned char response[2]; response[0] = MSG_TYPE_REJECT; response[1] = FEDERATION_ID_DOES_NOT_MATCH; @@ -611,7 +611,7 @@ void connect_to_federate(uint16_t remote_federate_id) { LF_MUTEX_UNLOCK(&outbound_socket_mutex); // Read RTI's response. - read_from_socket_errexit(_fed.socket_TCP_RTI, sizeof(int32_t) + 1, buffer, + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, sizeof(int32_t) + 1, buffer, NULL, "Failed to read the requested port number for federate %d from RTI.", remote_federate_id); @@ -625,7 +625,8 @@ void connect_to_federate(uint16_t remote_federate_id) { } port = extract_int32(&buffer[1]); - read_from_socket_errexit(_fed.socket_TCP_RTI, sizeof(host_ip_addr), (unsigned char*)&host_ip_addr, + read_from_socket_fail_on_error( + &_fed.socket_TCP_RTI, sizeof(host_ip_addr), (unsigned char*)&host_ip_addr, NULL, "Failed to read the IP address for federate %d from RTI.", remote_federate_id); @@ -639,10 +640,7 @@ void connect_to_federate(uint16_t remote_federate_id) { remote_federate_id); } // Wait ADDRESS_QUERY_RETRY_INTERVAL nanoseconds. - if (lf_sleep(ADDRESS_QUERY_RETRY_INTERVAL) != 0) { - // Sleep was interrupted. - continue; - } + lf_sleep(ADDRESS_QUERY_RETRY_INTERVAL); } } assert(port < 65536); @@ -730,12 +728,12 @@ void connect_to_federate(uint16_t remote_federate_id) { "Failed to send federation id to federate %d.", remote_federate_id); - read_from_socket_errexit(socket_id, 1, (unsigned char*)buffer, + read_from_socket_fail_on_error(&socket_id, 1, (unsigned char*)buffer, NULL, "Failed to read MSG_TYPE_ACK from federate %d in response to sending fed_id.", remote_federate_id); if (buffer[0] != MSG_TYPE_ACK) { // Get the error code. 
- read_from_socket_errexit(socket_id, 1, (unsigned char*)buffer, + read_from_socket_fail_on_error(&socket_id, 1, (unsigned char*)buffer, NULL, "Failed to read error code from federate %d in response to sending fed_id.", remote_federate_id); lf_print_error("Received MSG_TYPE_REJECT message from remote federate (%d).", buffer[0]); result = -1; @@ -780,7 +778,8 @@ void perform_hmac_authentication() { size_t federation_id_length = strnlen(federation_metadata.federation_id, 255); unsigned char received[1 + NONCE_LENGTH + hmac_length]; - read_from_socket_errexit(_fed.socket_TCP_RTI, 1 + NONCE_LENGTH + hmac_length, received, "Failed to read RTI response."); + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, 1 + NONCE_LENGTH + hmac_length, received, NULL, + "Failed to read RTI response."); if (received[0] != MSG_TYPE_RTI_RESPONSE) { if (received[0] == MSG_TYPE_RESIGN) { lf_print_error_and_exit("RTI has resigned."); @@ -944,13 +943,15 @@ void connect_to_rti(const char* hostname, int port) { LF_PRINT_DEBUG("Waiting for response to federation ID from the RTI."); - read_from_socket_errexit(_fed.socket_TCP_RTI, 1, &response, "Failed to read response from RTI."); + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, 1, &response, NULL, + "Failed to read response from RTI."); if (response == MSG_TYPE_REJECT) { // Trace the event when tracing is enabled tracepoint_federate_from_rti(_fed.trace, receive_REJECT, _lf_my_fed_id, NULL); // Read one more byte to determine the cause of rejection. 
unsigned char cause; - read_from_socket_errexit(_fed.socket_TCP_RTI, 1, &cause, "Failed to read the cause of rejection by the RTI."); + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, 1, &cause, NULL, + "Failed to read the cause of rejection by the RTI."); if (cause == FEDERATION_ID_DOES_NOT_MATCH || cause == WRONG_SERVER) { lf_print_error_and_exit("Connected to the wrong RTI on port %d.", uport); } @@ -1002,14 +1003,18 @@ instant_t get_start_time_from_rti(instant_t my_physical_time) { size_t buffer_length = 1 + sizeof(instant_t); unsigned char buffer[buffer_length]; - read_from_socket_errexit(_fed.socket_TCP_RTI, buffer_length, buffer, + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, buffer_length, buffer, NULL, "Failed to read MSG_TYPE_TIMESTAMP message from RTI."); LF_PRINT_DEBUG("Read 9 bytes."); // First byte received is the message ID. if (buffer[0] != MSG_TYPE_TIMESTAMP) { - lf_print_error_and_exit("Expected a MSG_TYPE_TIMESTAMP message from the RTI. Got %u (see net_common.h).", - buffer[0]); + if (buffer[0] == MSG_TYPE_RESIGN) { + lf_print_error_and_exit("RTI has unexpectedly resigned."); + } + lf_print_error_and_exit( + "Expected a MSG_TYPE_TIMESTAMP message from the RTI. Got %u (see net_common.h).", + buffer[0]); } instant_t timestamp = extract_int64(&(buffer[1])); @@ -1353,14 +1358,14 @@ static trigger_handle_t schedule_message_received_from_network_locked( * This just sets the last known status tag of the port specified * in the message. * - * @param socket The socket to read the message from + * @param socket Pointer to the socket to read the message from * @param buffer The buffer to read * @param fed_id The sending federate ID or -1 if the centralized coordination. 
*/ -static void handle_port_absent_message(int socket, int fed_id) { +static void handle_port_absent_message(int* socket, int fed_id) { size_t bytes_to_read = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(instant_t) + sizeof(microstep_t); unsigned char buffer[bytes_to_read]; - read_from_socket_errexit(socket, bytes_to_read, buffer, + read_from_socket_fail_on_error(socket, bytes_to_read, buffer, NULL, "Failed to read port absent message."); // Extract the header information. @@ -1405,16 +1410,15 @@ static void handle_port_absent_message(int socket, int fed_id) { * Handle a message being received from a remote federate. * * This function assumes the caller does not hold the mutex lock. - * @param socket The socket to read the message from - * @param buffer The buffer to read + * @param socket Pointer to the socket to read the message from. + * @param buffer The buffer to read. * @param fed_id The sending federate ID or -1 if the centralized coordination. */ -void handle_message(int socket, int fed_id) { - // FIXME: Need better error handling? +void handle_message(int* socket, int fed_id) { // Read the header. size_t bytes_to_read = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t); unsigned char buffer[bytes_to_read]; - read_from_socket_errexit(socket, bytes_to_read, buffer, + read_from_socket_fail_on_error(socket, bytes_to_read, buffer, NULL, "Failed to read message header."); // Extract the header information. @@ -1432,7 +1436,7 @@ void handle_message(int socket, int fed_id) { // Read the payload. // Allocate memory for the message contents. 
unsigned char* message_contents = (unsigned char*)malloc(length); - read_from_socket_errexit(socket, length, message_contents, + read_from_socket_fail_on_error(socket, length, message_contents, NULL, "Failed to read message body."); // Trace the event when tracing is enabled tracepoint_federate_from_federate(_fed.trace, receive_P2P_MSG, _lf_my_fed_id, federate_id, NULL); @@ -1465,11 +1469,11 @@ void stall_advance_level_federation(environment_t* env, size_t level) { * will not advance to the tag of the message if it is in the future, or * the tag will not advance at all if the tag of the message is * now or in the past. - * @param socket The socket to read the message from. + * @param socket Pointer to the socket to read the message from. * @param buffer The buffer to read. * @param fed_id The sending federate ID or -1 if the centralized coordination. */ -void handle_tagged_message(int socket, int fed_id) { +void handle_tagged_message(int* socket, int fed_id) { // Environment is always the one corresponding to the top-level scheduling enclave. environment_t *env; _lf_get_environments(&env); @@ -1479,7 +1483,7 @@ void handle_tagged_message(int socket, int fed_id) { size_t bytes_to_read = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t) + sizeof(instant_t) + sizeof(microstep_t); unsigned char buffer[bytes_to_read]; - read_from_socket_errexit(socket, bytes_to_read, buffer, + read_from_socket_fail_on_error(socket, bytes_to_read, buffer, NULL, "Failed to read timed message header"); // Extract the header information. @@ -1527,7 +1531,7 @@ void handle_tagged_message(int socket, int fed_id) { // Read the payload. // Allocate memory for the message contents. unsigned char* message_contents = (unsigned char*)malloc(length); - read_from_socket_errexit(socket, length, message_contents, + read_from_socket_fail_on_error(socket, length, message_contents, NULL, "Failed to read message body."); // The following is only valid for string messages. 
@@ -1646,7 +1650,7 @@ void handle_tag_advance_grant(void) { size_t bytes_to_read = sizeof(instant_t) + sizeof(microstep_t); unsigned char buffer[bytes_to_read]; - read_from_socket_errexit(_fed.socket_TCP_RTI, bytes_to_read, buffer, + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, "Failed to read tag advance grant from RTI."); tag_t TAG = extract_tag(buffer); @@ -1905,7 +1909,7 @@ void handle_provisional_tag_advance_grant() { size_t bytes_to_read = sizeof(instant_t) + sizeof(microstep_t); unsigned char buffer[bytes_to_read]; - read_from_socket_errexit(_fed.socket_TCP_RTI, bytes_to_read, buffer, + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, "Failed to read provisional tag advance grant from RTI."); tag_t PTAG = extract_tag(buffer); @@ -2043,7 +2047,7 @@ void handle_stop_granted_message() { size_t bytes_to_read = MSG_TYPE_STOP_GRANTED_LENGTH - 1; unsigned char buffer[bytes_to_read]; - read_from_socket_errexit(_fed.socket_TCP_RTI, bytes_to_read, buffer, + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, "Failed to read stop granted from RTI."); tag_t received_stop_tag = extract_tag(buffer); @@ -2086,7 +2090,7 @@ void handle_stop_granted_message() { void handle_stop_request_message() { size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_LENGTH - 1; unsigned char buffer[bytes_to_read]; - read_from_socket_errexit(_fed.socket_TCP_RTI, bytes_to_read, buffer, + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, "Failed to read stop request from RTI."); tag_t tag_to_stop = extract_tag(buffer); @@ -2269,7 +2273,7 @@ void* listen_to_federates(void* _args) { LF_PRINT_LOG("Listening to federate %d.", fed_id); - int socket_id = _fed.sockets_for_inbound_p2p_connections[fed_id]; + int* socket_id = &_fed.sockets_for_inbound_p2p_connections[fed_id]; // Buffer for incoming messages. 
// This does not constrain the message size @@ -2279,22 +2283,15 @@ void* listen_to_federates(void* _args) { // Listen for messages from the federate. while (1) { // Read one byte to get the message type. - LF_PRINT_DEBUG("Waiting for a P2P message on socket %d.", socket_id); - ssize_t bytes_read = read_from_socket(socket_id, 1, buffer); - if (bytes_read == 0) { - // EOF occurred. Socket has been closed by read_from_socket. - lf_print("Received EOF from peer federate %d.", fed_id); - // Stop listening to this federate. - break; - } else if (bytes_read < 0) { - lf_print_error("P2P socket to federate %d is broken.", fed_id); + LF_PRINT_DEBUG("Waiting for a P2P message on socket %d.", *socket_id); + if (read_from_socket_close_on_error(socket_id, 1, buffer)) { + // Socket has been closed. + lf_print("Socket from federate %d is close.", fed_id); // Stop listening to this federate. - // Mark the socket closed. - _lf_close_inbound_socket(fed_id, bytes_read); break; } LF_PRINT_DEBUG("Received a P2P message on socket %d of type %d.", - socket_id, buffer[0]); + *socket_id, buffer[0]); bool bad_message = false; switch (buffer[0]) { case MSG_TYPE_P2P_MESSAGE: @@ -2364,26 +2361,25 @@ void* listen_to_rti_TCP(void* args) { } // Read one byte to get the message type. // This will exit if the read fails. - ssize_t bytes_read = read_from_socket(_fed.socket_TCP_RTI, 1, buffer); - if (bytes_read < 0) { + int read_failed = read_from_socket(_fed.socket_TCP_RTI, 1, buffer); + if (read_failed < 0) { if (errno == ECONNRESET) { lf_print_error("Socket connection to the RTI was closed by the RTI without" - " properly sending an EOF first. Considering this a soft error."); + " properly sending an EOF first. Considering this a soft error."); // FIXME: If this happens, possibly a new RTI must be elected. _fed.socket_TCP_RTI = -1; return NULL; } else { - lf_print_error("Socket connection to the RTI has been broken" - " with error %d: %s. The RTI should" - " close connections with an EOF first." 
- " Considering this a soft error.", - errno, - strerror(errno)); + lf_print_error("Socket connection to the RTI has been broken with error %d: %s." + " The RTI should close connections with an EOF first." + " Considering this a soft error.", + errno, + strerror(errno)); // FIXME: If this happens, possibly a new RTI must be elected. _fed.socket_TCP_RTI = -1; return NULL; } - } else if (bytes_read == 0) { + } else if (read_failed > 0) { // EOF received. lf_print("Connection to the RTI closed with an EOF."); _fed.socket_TCP_RTI = -1; @@ -2392,7 +2388,7 @@ void* listen_to_rti_TCP(void* args) { } switch (buffer[0]) { case MSG_TYPE_TAGGED_MESSAGE: - handle_tagged_message(_fed.socket_TCP_RTI, -1); + handle_tagged_message(&_fed.socket_TCP_RTI, -1); break; case MSG_TYPE_TAG_ADVANCE_GRANT: handle_tag_advance_grant(); @@ -2407,7 +2403,7 @@ void* listen_to_rti_TCP(void* args) { handle_stop_granted_message(); break; case MSG_TYPE_PORT_ABSENT: - handle_port_absent_message(_fed.socket_TCP_RTI, -1); + handle_port_absent_message(&_fed.socket_TCP_RTI, -1); break; case MSG_TYPE_RESIGN: handle_rti_resign_message(); diff --git a/core/federated/network/net_util.c b/core/federated/network/net_util.c index 665928cf9..754a28ada 100644 --- a/core/federated/network/net_util.c +++ b/core/federated/network/net_util.c @@ -88,17 +88,12 @@ int create_real_time_tcp_socket_errexit() { return sock; } -ssize_t read_from_socket_errexit( - int socket, - size_t num_bytes, - unsigned char* buffer, - char* format, ...) { - va_list args; - // Error checking first - if (socket < 0 && format != NULL) { - lf_print_error("Socket is no longer open."); - lf_print_error_and_exit(format, args); - } +int read_from_socket(int socket, size_t num_bytes, unsigned char* buffer) { + if (socket < 0) { + // Socket is not open. 
+ errno = EBADF; + return -1; + } ssize_t bytes_read = 0; int retry_count = 0; while (bytes_read < (ssize_t)num_bytes) { @@ -111,32 +106,53 @@ ssize_t read_from_socket_errexit( lf_sleep(DELAY_BETWEEN_SOCKET_RETRIES); continue; } else if (more < 0) { - // Socket failure. Probably closed. - if (format != NULL) { - lf_print_error_system_failure(format, args); - } else { - lf_print_error("Socket read failed."); - return more; - } + // A more serious error occurred. + return -1; } else if (more == 0) { - // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, - // upon receiving a zero length packet or an error, we can close the socket. - LF_PRINT_DEBUG("EOF received from client. Closing socket."); - lf_mutex_lock(&socket_mutex); - // If there are any pending outgoing messages, this will attempt to send those - // followed by an EOF. - shutdown(socket, SHUT_WR); - close(socket); - lf_mutex_unlock(&socket_mutex); - return more; + // EOF received. + return 1; } bytes_read += more; } - return bytes_read; + return 0; } -ssize_t read_from_socket(int socket, size_t num_bytes, unsigned char* buffer) { - return read_from_socket_errexit(socket, num_bytes, buffer, NULL); +int read_from_socket_close_on_error(int* socket, size_t num_bytes, unsigned char* buffer) { + assert(socket); + int read_failed = read_from_socket(*socket, num_bytes, buffer); + if (read_failed) { + // Read failed. + // Socket has probably been closed from the other side. + // Shut down and close the socket from this side. + shutdown(*socket, SHUT_RDWR); + close(*socket); + // Mark the socket closed. + *socket = -1; + return -1; + } + return 0; +} + +void read_from_socket_fail_on_error( + int* socket, + size_t num_bytes, + unsigned char* buffer, + lf_mutex_t* mutex, + char* format, ...) { + va_list args; + assert(socket); + int read_failed = read_from_socket_close_on_error(socket, num_bytes, buffer); + if (read_failed) { + // Read failed. 
+ if (mutex != NULL) { + lf_mutex_unlock(mutex); + } + if (format != NULL) { + lf_print_error_system_failure(format, args); + } else { + lf_print_error_system_failure("Failed to read from socket."); + } + } } ssize_t peek_from_socket(int socket, unsigned char* result) { @@ -173,20 +189,17 @@ int write_to_socket(int socket, size_t num_bytes, unsigned char* buffer) { int write_to_socket_close_on_error(int* socket, size_t num_bytes, unsigned char* buffer) { assert(socket); - if (*socket >= 0) { - int result = write_to_socket(*socket, num_bytes, buffer); - if (result) { - // Write failed. - // Socket has probably been closed from the other side. - // Shut down and close the socket from this side. - shutdown(*socket, SHUT_RDWR); - close(*socket); - // Mark the socket closed. - *socket = -1; - } - return result; + int result = write_to_socket(*socket, num_bytes, buffer); + if (result) { + // Write failed. + // Socket has probably been closed from the other side. + // Shut down and close the socket from this side. + shutdown(*socket, SHUT_RDWR); + close(*socket); + // Mark the socket closed. + *socket = -1; } - return -1; + return result; } void write_to_socket_fail_on_error( @@ -197,18 +210,16 @@ void write_to_socket_fail_on_error( char* format, ...) { va_list args; assert(socket); - if (*socket >= 0) { - int result = write_to_socket_close_on_error(socket, num_bytes, buffer); - if (result) { - // Write failed. - if (mutex != NULL) { - lf_mutex_unlock(mutex); - } - if (format != NULL) { - lf_print_error_system_failure(format, args); - } else { - lf_print_error("Failed to write to socket. Closing it."); - } + int result = write_to_socket_close_on_error(socket, num_bytes, buffer); + if (result) { + // Write failed. + if (mutex != NULL) { + lf_mutex_unlock(mutex); + } + if (format != NULL) { + lf_print_error_system_failure(format, args); + } else { + lf_print_error("Failed to write to socket. 
Closing it."); } } } diff --git a/include/core/federated/clock-sync.h b/include/core/federated/clock-sync.h index eb3e4c341..0106afc54 100644 --- a/include/core/federated/clock-sync.h +++ b/include/core/federated/clock-sync.h @@ -149,9 +149,9 @@ uint16_t setup_clock_synchronization_with_rti(void); * Failing to complete this protocol is treated as a catastrophic * error that causes the federate to exit. * - * @param rti_socket_TCP The rti's socket + * @param rti_socket_TCP Pointer to the RTI's socket */ -void synchronize_initial_physical_clock_with_rti(int rti_socket_TCP); +void synchronize_initial_physical_clock_with_rti(int* rti_socket_TCP); /** * Handle a clock synchroninzation message T1 coming from the RTI. diff --git a/include/core/federated/network/net_common.h b/include/core/federated/network/net_common.h index 382037e8d..656a01f97 100644 --- a/include/core/federated/network/net_common.h +++ b/include/core/federated/network/net_common.h @@ -218,7 +218,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * (an MSG_TYPE_ADDRESS_QUERY message) after the RTI responds that it * does not know. This allows time for federates to start separately. */ -#define ADDRESS_QUERY_RETRY_INTERVAL SEC(1) +#define ADDRESS_QUERY_RETRY_INTERVAL MSEC(100) /** * Time to wait before re-attempting to bind to a port. diff --git a/include/core/federated/network/net_util.h b/include/core/federated/network/net_util.h index 2883b7a55..544e86d46 100644 --- a/include/core/federated/network/net_util.h +++ b/include/core/federated/network/net_util.h @@ -79,14 +79,41 @@ extern lf_mutex_t socket_mutex; */ int create_real_time_tcp_socket_errexit(); +/** + * Read the specified number of bytes from the specified socket into the specified buffer. + * If an error occurs during this reading, return -1 and set errno to indicate + * the cause of the error. If the read succeeds in reading the specified number of bytes, + * return 0. 
If an EOF occurs before reading the specified number of bytes, return 1. + * This function repeats the read attempt until the specified number of bytes + * have been read, an EOF is read, or an error occurs. Specifically, errors EAGAIN, + * EWOULDBLOCK, and EINTR are not considered errors and instead trigger + * another attempt. A delay between attempts is given by DELAY_BETWEEN_SOCKET_RETRIES. + * @param socket The socket ID. + * @param num_bytes The number of bytes to read. + * @param buffer The buffer into which to put the bytes. + * @return 0 for success, 1 for EOF, and -1 for an error. + */ +int read_from_socket(int socket, size_t num_bytes, unsigned char* buffer); + +/** + * Read the specified number of bytes to the specified socket using read_from_socket + * and close the socket if an error occurs. If an error occurs, this will change the + * socket ID pointed to by the first argument to -1 and will return -1. + * @param socket Pointer to the socket ID. + * @param num_bytes The number of bytes to write. + * @param buffer The buffer from which to get the bytes. + * @return 0 for success, -1 for failure. + */ +int read_from_socket_close_on_error(int* socket, size_t num_bytes, unsigned char* buffer); + /** * Read the specified number of bytes from the specified socket into the * specified buffer. If a disconnect or an EOF occurs during this * reading, then if format is non-null, report an error and exit. + * If the mutex argument is non-NULL, release the mutex before exiting. * If format is null, then report the error, but do not exit. - * This function takes a formatted - * string and additional optional arguments similar to printf(format, ...) - * that is appended to the error messages. + * This function takes a formatted string and additional optional arguments + * similar to printf(format, ...) that is appended to the error messages. * @param socket The socket ID. * @param num_bytes The number of bytes to read. 
* @param buffer The buffer into which to put the bytes. @@ -95,10 +122,11 @@ int create_real_time_tcp_socket_errexit(); * @return The number of bytes read, or 0 if an EOF is received, or * a negative number for an error. */ -ssize_t read_from_socket_errexit( - int socket, +void read_from_socket_fail_on_error( + int* socket, size_t num_bytes, unsigned char* buffer, + lf_mutex_t* mutex, char* format, ...); /** @@ -160,19 +188,6 @@ void write_to_socket_fail_on_error( lf_mutex_t* mutex, char* format, ...); -/** - * Read the specified number of bytes from the specified socket into the - * specified buffer. If a disconnect occurs during this - * reading, return a negative number. If an EOF occurs during this - * reading, return 0. Otherwise, return the number of bytes read. - * This is a version of read_from_socket_errexit() that does not error out. - * @param socket The socket ID. - * @param num_bytes The number of bytes to read. - * @param buffer The buffer into which to put the bytes. - * @return The number of bytes read or 0 when EOF is received or negative for an error. - */ -ssize_t read_from_socket(int socket, size_t num_bytes, unsigned char* buffer); - #endif // FEDERATED /** From 3f8ee2015957a26a32503d1e5def3ca5c773b520 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Thu, 28 Dec 2023 07:49:43 -0800 Subject: [PATCH 27/83] Typo --- core/federated/federate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index f1c6df746..ab0b1da4e 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2286,7 +2286,7 @@ void* listen_to_federates(void* _args) { LF_PRINT_DEBUG("Waiting for a P2P message on socket %d.", *socket_id); if (read_from_socket_close_on_error(socket_id, 1, buffer)) { // Socket has been closed. - lf_print("Socket from federate %d is close.", fed_id); + lf_print("Socket from federate %d is closed.", fed_id); // Stop listening to this federate. 
break; } From 9f6600fbc526d00b345d06dd0bb889515b7f33ee Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Thu, 28 Dec 2023 07:50:07 -0800 Subject: [PATCH 28/83] Better timing --- include/core/federated/network/net_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/core/federated/network/net_common.h b/include/core/federated/network/net_common.h index 656a01f97..d3f6d7ff2 100644 --- a/include/core/federated/network/net_common.h +++ b/include/core/federated/network/net_common.h @@ -218,7 +218,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * (an MSG_TYPE_ADDRESS_QUERY message) after the RTI responds that it * does not know. This allows time for federates to start separately. */ -#define ADDRESS_QUERY_RETRY_INTERVAL MSEC(100) +#define ADDRESS_QUERY_RETRY_INTERVAL MSEC(250) /** * Time to wait before re-attempting to bind to a port. From 6d77482a3bec116eb8eaa4989c444bd257cd1ded Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Thu, 28 Dec 2023 07:50:56 -0800 Subject: [PATCH 29/83] Fix bug that can lead to deadlock on STP violation --- core/reactor_common.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/core/reactor_common.c b/core/reactor_common.c index bf060a8af..cf150c9ef 100644 --- a/core/reactor_common.c +++ b/core/reactor_common.c @@ -383,14 +383,16 @@ void _lf_pop_events(environment_t *env) { // the reaction can access the value. event->trigger->intended_tag = event->intended_tag; // And check if it is in the past compared to the current tag. - if (lf_tag_compare(event->intended_tag, - env->current_tag) < 0) { + if (lf_tag_compare(event->intended_tag, env->current_tag) < 0) { // Mark the triggered reaction with a STP violation reaction->is_STP_violated = true; LF_PRINT_LOG("Trigger %p has violated the reaction's STP offset. Intended tag: " PRINTF_TAG ". 
Current tag: " PRINTF_TAG, event->trigger, event->intended_tag.time - start_time, event->intended_tag.microstep, env->current_tag.time - start_time, env->current_tag.microstep); + // Need to update the last_known_status_tag of the port because otherwise, + // the MLAA could get stuck, causing the program to lock up. + event->trigger->last_known_status_tag = env->current_tag; } } #endif From 47cce7123a741bb4e884fb634d79aa90df1afa0b Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Fri, 29 Dec 2023 10:41:27 -0800 Subject: [PATCH 30/83] Comments only --- core/reactor_common.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/reactor_common.c b/core/reactor_common.c index cf150c9ef..307627a6e 100644 --- a/core/reactor_common.c +++ b/core/reactor_common.c @@ -392,6 +392,8 @@ void _lf_pop_events(environment_t *env) { env->current_tag.time - start_time, env->current_tag.microstep); // Need to update the last_known_status_tag of the port because otherwise, // the MLAA could get stuck, causing the program to lock up. + // This should not call update_last_known_status_on_input_port because we + // are starting a new tag step execution, so there are no reactions blocked on this input. event->trigger->last_known_status_tag = env->current_tag; } } From 2b0796eadaadd02aaf3c9babfa29d5357dc4e0f1 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Fri, 29 Dec 2023 10:41:45 -0800 Subject: [PATCH 31/83] Comments only --- include/core/federated/federate.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index 78129e708..aff78ed10 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -144,11 +144,9 @@ typedef struct federate_instance_t { int server_port; /** - * Most recent TIME_ADVANCE_GRANT received from the RTI, or NEVER if none - * has been received. 
- * This is used to communicate between the listen_to_rti_TCP thread and the - * main federate thread. - * This variable should only be accessed while holding the mutex lock. + * Most recent tag advance grant (TAG) received from the RTI, or NEVER if none + * has been received. This variable should only be accessed while holding the + * mutex lock on the top-level environment. */ tag_t last_TAG; From 089489f3d37f74aa3f99fae7aaae61301edff9f2 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Fri, 29 Dec 2023 12:31:31 -0800 Subject: [PATCH 32/83] Last known status of a port doesn't lag current tag --- core/federated/federate.c | 102 ++++++++++++++++++++------------------ 1 file changed, 55 insertions(+), 47 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 15ab6e57b..6e4fe746f 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -1072,16 +1072,15 @@ void set_network_port_status(int portID, port_status_t status) { * Update the last known status tag of all network input ports * to the value of `tag`, unless that the provided `tag` is less * than the last_known_status_tag of the port. This is called when - * all inputs to network ports with tags up to and including `tag` - * have been received by those ports. If any update occurs, - * then this broadcasts on `port_status_changed`. + * a TAG signal is received from the RTI in centralized coordination. + * If any update occurs, then this broadcasts on `port_status_changed`. * * This assumes the caller holds the mutex. * * @param tag The tag on which the latest status of all network input * ports is known. 
*/ -void update_last_known_status_on_input_ports(tag_t tag) { +static void update_last_known_status_on_input_ports(tag_t tag) { LF_PRINT_DEBUG("In update_last_known_status_on_input ports."); bool notify = false; for (int i = 0; i < _lf_action_table_size; i++) { @@ -1116,26 +1115,38 @@ void update_last_known_status_on_input_ports(tag_t tag) { } /** - * Update the last known status tag of a network input port - * to the value of "tag". This is the largest tag at which the status - * (present or absent) of the port is known. + * @brief Update the last known status tag of a network input port. + * + * First, if the specified tag is less than the current_tag of the top-level + * environment, then ignore the specified tag and use the current_tag. This + * situation can arise if a message has arrived late (an STP violation has occurred). + * + * If the specified tag is greater than the previous last_known_status_tag + * of the port, then update the last_known_status_tag to the new tag. + * + * If the tag is equal to the previous last_known_status_tag, then + * increment the microstep of the last_known_status_tag. This situation can + * occur if a sequence of late messages (STP violations) are occurring all at + * once during an execution of a logical tag. + * + * This function is called when a message or absent message arrives. For decentralized + * coordination, it is also called by the background thread update_ports_from_staa_offsets + * which uses physical time to determine when an input port can be assumed to be absent + * if a message has not been received. * - * This function assumes the caller holds the mutex, and, if the tag - * actually increases, it broadcasts on `port_status_changed`. + * This function assumes the caller holds the mutex on the top-level environment, + * and, if the tag actually increases, it broadcasts on `port_status_changed`. * - * @param tag The tag on which the latest status of network input - * ports is known. 
- * @param portID The port ID + * @param env The top-level environment, whose mutex is assumed to be held. + * @param tag The tag on which the latest status of the specified network input port is known. + * @param portID The port ID. */ -void update_last_known_status_on_input_port(tag_t tag, int port_id) { +static void update_last_known_status_on_input_port(environment_t* env, tag_t tag, int port_id) { + if (lf_tag_compare(tag, env->current_tag) < 0) tag = env->current_tag; trigger_t* input_port_action = _lf_action_for_port(port_id)->trigger; - if (lf_tag_compare(tag, input_port_action->last_known_status_tag) >= 0) { - if (lf_tag_compare(tag, input_port_action->last_known_status_tag) == 0) { - // If the intended tag for an input port is equal to the last known status, we need - // to increment the microstep. This is a direct result of the behavior of the lf_delay_tag() - // semantics in tag.h. - tag.microstep++; - } + int comparison = lf_tag_compare(tag, input_port_action->last_known_status_tag); + if (comparison == 0) tag.microstep++; + if (comparison >= 0) { LF_PRINT_LOG( "Updating the last known status tag of port %d from " PRINTF_TAG " to " PRINTF_TAG ".", port_id, @@ -1145,14 +1156,19 @@ void update_last_known_status_on_input_port(tag_t tag, int port_id) { tag.microstep ); input_port_action->last_known_status_tag = tag; - // There is no guarantee that there is either a TAG or a PTAG for this time. + + // Check whether this port update implies a change to MLAA, which may unblock reactions. + // For decentralized coordination, the first argument is NEVER, so it has no effect. + // For centralized, the arguments probably also have no effect, but the port update may. + // Note that it would not be correct to pass `tag` as the first argument because + // there is no guarantee that there is either a TAG or a PTAG for this time. 
// The message that triggered this to be called could be from an upstream // federate that is far ahead of other upstream federates in logical time. - // Therefore, do not pass `tag` to `update_max_level`. update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); lf_cond_broadcast(&port_status_changed); } else { - LF_PRINT_DEBUG("Attempt to update the last known status tag " + // Message arrivals should be monotonic, so this should not occur. + lf_print_warning("Attempt to update the last known status tag " "of network input port %d to an earlier tag was ignored.", port_id); } } @@ -1274,8 +1290,6 @@ void send_port_absent_to_federate( } } -///////////////////////////////////////////////////////////////////////////////////////// - /** * Version of schedule_value() similar to that in reactor_common.c * except that it does not acquire the mutex lock and has a special @@ -1391,17 +1405,7 @@ static void handle_port_absent_message(int* socket, int fed_id) { _lf_get_environments(&env); LF_MUTEX_LOCK(env->mutex); -#ifdef FEDERATED_DECENTRALIZED - trigger_t* network_input_port_action = _lf_action_for_port(port_id)->trigger; - if (lf_tag_compare(intended_tag, - network_input_port_action->last_known_status_tag) < 0) { - LF_MUTEX_UNLOCK(env->mutex); - } -#endif // In centralized coordination, a TAG message from the RTI - // can set the last_known_status_tag to a future tag where messages - // have not arrived yet. - // Set the mutex status as absent - update_last_known_status_on_input_port(intended_tag, port_id); + update_last_known_status_on_input_port(env, intended_tag, port_id); LF_MUTEX_UNLOCK(env->mutex); } @@ -1573,7 +1577,7 @@ void handle_tagged_message(int* socket, int fed_id) { // was waiting for the message, trigger the corresponding reactions for this // message. - update_last_known_status_on_input_port(intended_tag, port_id); + update_last_known_status_on_input_port(env, intended_tag, port_id); LF_PRINT_LOG( "Inserting reactions directly at tag " PRINTF_TAG ". 
" @@ -1598,10 +1602,11 @@ void handle_tagged_message(int* socket, int fed_id) { } else { // If no port absent reaction is waiting for this message, or if the intended // tag is in the future, use schedule functions to process the message. - update_last_known_status_on_input_port(intended_tag, port_id); - // Before that, if the current time >= stop time, discard the message. // But only if the stop time is not equal to the start time! + + update_last_known_status_on_input_port(env, intended_tag, port_id); + if (lf_tag_compare(env->current_tag, env->stop_tag) >= 0 && env->execution_started) { lf_print_error("Received message too late. Already at stop tag.\n" " Current tag is " PRINTF_TAG " and intended tag is " PRINTF_TAG ".\n" @@ -1668,8 +1673,7 @@ void handle_tag_advance_grant(void) { // It is possible for this federate to have received a PTAG // earlier with the same tag as this TAG. if (lf_tag_compare(TAG, _fed.last_TAG) >= 0) { - _fed.last_TAG.time = TAG.time; - _fed.last_TAG.microstep = TAG.microstep; + _fed.last_TAG = TAG; _fed.is_last_TAG_provisional = false; LF_PRINT_LOG("Received Time Advance Grant (TAG): " PRINTF_TAG ".", _fed.last_TAG.time - start_time, _fed.last_TAG.microstep); @@ -1712,6 +1716,12 @@ bool update_max_level(tag_t tag, bool is_provisional) { _lf_get_environments(&env); int prev_max_level_allowed_to_advance = max_level_allowed_to_advance; max_level_allowed_to_advance = INT_MAX; +#ifdef FEDERATED_DECENTRALIZED + size_t action_table_size = _lf_action_table_size; + lf_action_base_t** action_table = _lf_action_table; +#else + // Note that the following test is never true for decentralized coordination, + // where tag always is NEVER_TAG. 
if ((lf_tag_compare(env->current_tag, tag) < 0) || ( lf_tag_compare(env->current_tag, tag) == 0 && !is_provisional )) { @@ -1722,10 +1732,7 @@ bool update_max_level(tag_t tag, bool is_provisional) { // Safe to complete the current tag return (prev_max_level_allowed_to_advance != max_level_allowed_to_advance); } -#ifdef FEDERATED_DECENTRALIZED - size_t action_table_size = _lf_action_table_size; - lf_action_base_t** action_table = _lf_action_table; -#else + size_t action_table_size = _lf_zero_delay_cycle_action_table_size; lf_action_base_t** action_table = _lf_zero_delay_cycle_action_table; #endif // FEDERATED_DECENTRALIZED @@ -1735,7 +1742,8 @@ bool update_max_level(tag_t tag, bool is_provisional) { // In decentralized execution, if the current_tag is close enough to the // start tag and there is a large enough delay on an incoming // connection, then there is no need to block progress waiting for this - // port status. + // port status. This is irrelevant for centralized because blocking only + // occurs on zero-delay cycles. if ( (_lf_action_delay_table[i] == 0 && env->current_tag.time == start_time && env->current_tag.microstep == 0) || (_lf_action_delay_table[i] > 0 && lf_tag_compare( @@ -1854,7 +1862,7 @@ static void* update_ports_from_staa_offsets(void* args) { if (input_port_action->trigger->status == unknown) { input_port_action->trigger->status = absent; LF_PRINT_DEBUG("Assuming port absent at time " PRINTF_TIME, lf_tag(env).time - start_time); - update_last_known_status_on_input_port(lf_tag(env), id_of_action(input_port_action)); + update_last_known_status_on_input_port(env, lf_tag(env), id_of_action(input_port_action)); lf_cond_broadcast(&port_status_changed); } } From 96adc51ce2085486bd5c1ac90a4a414c394c404e Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Fri, 29 Dec 2023 15:08:45 -0800 Subject: [PATCH 33/83] Resurrected port search for RTI. 
Set maximum number of RTIs on a host to 16 --- core/federated/RTI/rti_remote.c | 125 +++++---- core/federated/RTI/rti_remote.h | 11 - core/federated/federate.c | 289 ++++++++++++-------- include/core/federated/federate.h | 13 +- include/core/federated/network/net_common.h | 19 +- include/core/federated/network/net_util.h | 2 +- 6 files changed, 266 insertions(+), 193 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index fbe363779..c9e8015d4 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -56,40 +56,68 @@ extern int lf_critical_section_exit(environment_t *env) { return lf_mutex_unlock(&rti_mutex); } -int create_server(int32_t specified_port, uint16_t port, socket_type_t socket_type) -{ +/** + * Create a server and enable listening for socket connections. + * If the specified port if it is non-zero, it will attempt to acquire that port. + * If it fails, it will repeatedly attempt up to PORT_BIND_RETRY_LIMIT times with + * a delay of PORT_BIND_RETRY_INTERVAL in between. If the specified port is + * zero, then it will attempt to acquire DEFAULT_PORT first. If this fails, then it + * will repeatedly attempt up to PORT_BIND_RETRY_LIMIT times, incrementing the port + * number between attempts, with no delay between attempts. Once it has incremented + * the port number MAX_NUM_PORT_ADDRESSES times, it will cycle around and begin again + * with DEFAULT_PORT. + * + * @param port The port number to use or 0 to start trying at DEFAULT_PORT. + * @param socket_type The type of the socket for the server (TCP or UDP). + * @return The socket descriptor on which to accept connections. 
+ */ +static int create_rti_server(uint16_t port, socket_type_t socket_type) { // Timeout time for the communications of the server - struct timeval timeout_time = {.tv_sec = TCP_TIMEOUT_TIME / BILLION, .tv_usec = (TCP_TIMEOUT_TIME % BILLION) / 1000}; + struct timeval timeout_time = { + .tv_sec = TCP_TIMEOUT_TIME / BILLION, + .tv_usec = (TCP_TIMEOUT_TIME % BILLION) / 1000 + }; // Create an IPv4 socket for TCP (not UDP) communication over IP (0). int socket_descriptor = -1; - if (socket_type == TCP) - { + if (socket_type == TCP) { socket_descriptor = create_real_time_tcp_socket_errexit(); - } - else if (socket_type == UDP) - { + } else if (socket_type == UDP) { socket_descriptor = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); // Set the appropriate timeout time - timeout_time = (struct timeval){.tv_sec = UDP_TIMEOUT_TIME / BILLION, .tv_usec = (UDP_TIMEOUT_TIME % BILLION) / 1000}; + timeout_time = (struct timeval){ + .tv_sec = UDP_TIMEOUT_TIME / BILLION, + .tv_usec = (UDP_TIMEOUT_TIME % BILLION) / 1000 + }; } - if (socket_descriptor < 0) - { + if (socket_descriptor < 0) { lf_print_error_system_failure("Failed to create RTI socket."); } // Set the option for this socket to reuse the same address int true_variable = 1; // setsockopt() requires a reference to the value assigned to an option - if (setsockopt(socket_descriptor, SOL_SOCKET, SO_REUSEADDR, &true_variable, sizeof(int32_t)) < 0) - { + if (setsockopt( + socket_descriptor, + SOL_SOCKET, + SO_REUSEADDR, + &true_variable, + sizeof(int32_t)) < 0) { lf_print_error("RTI failed to set SO_REUSEADDR option on the socket: %s.", strerror(errno)); } // Set the timeout on the socket so that read and write operations don't block for too long - if (setsockopt(socket_descriptor, SOL_SOCKET, SO_RCVTIMEO, (const char *)&timeout_time, sizeof(timeout_time)) < 0) - { + if (setsockopt( + socket_descriptor, + SOL_SOCKET, + SO_RCVTIMEO, + (const char *)&timeout_time, + sizeof(timeout_time)) < 0) { lf_print_error("RTI failed to set 
SO_RCVTIMEO option on the socket: %s.", strerror(errno)); } - if (setsockopt(socket_descriptor, SOL_SOCKET, SO_SNDTIMEO, (const char *)&timeout_time, sizeof(timeout_time)) < 0) - { + if (setsockopt( + socket_descriptor, + SOL_SOCKET, + SO_SNDTIMEO, + (const char *)&timeout_time, + sizeof(timeout_time)) < 0) { lf_print_error("RTI failed to set SO_SNDTIMEO option on the socket: %s.", strerror(errno)); } @@ -108,11 +136,6 @@ int create_server(int32_t specified_port, uint16_t port, socket_type_t socket_ty // UDP sockets. int reuse = 1; - if (setsockopt(socket_descriptor, SOL_SOCKET, SO_REUSEADDR, - (const char*)&reuse, sizeof(reuse)) < 0) { - perror("setsockopt(SO_REUSEADDR) failed"); - } - #ifdef SO_REUSEPORT if (setsockopt(socket_descriptor, SOL_SOCKET, SO_REUSEPORT, (const char*)&reuse, sizeof(reuse)) < 0) { @@ -126,23 +149,35 @@ int create_server(int32_t specified_port, uint16_t port, socket_type_t socket_ty // Zero out the server address structure. bzero((char *)&server_fd, sizeof(server_fd)); + uint16_t specified_port = port; + if (specified_port == 0) port = DEFAULT_PORT; + server_fd.sin_family = AF_INET; // IPv4 server_fd.sin_addr.s_addr = INADDR_ANY; // All interfaces, 0.0.0.0. // Convert the port number from host byte order to network byte order. server_fd.sin_port = htons(port); int result = bind( - socket_descriptor, - (struct sockaddr *)&server_fd, - sizeof(server_fd)); + socket_descriptor, + (struct sockaddr *)&server_fd, + sizeof(server_fd)); + + // Try repeatedly to bind to a port. If no specific port is specified, then + // increment the port number each time. - // Try repeatedly to bind to the specified port. int count = 1; - while (result != 0 && count++ < PORT_BIND_RETRY_LIMIT) - { - lf_print("RTI failed to get port %d. 
Will try again.", port); - lf_sleep(PORT_BIND_RETRY_INTERVAL); - server_fd.sin_port = htons(port); + while (result != 0 && count++ < PORT_BIND_RETRY_LIMIT) { + if (specified_port == 0) { + lf_print_warning("RTI failed to get port %d.", port); + port++; + if (port >= DEFAULT_PORT + MAX_NUM_PORT_ADDRESSES) port = DEFAULT_PORT; + lf_print_warning("RTI will try again with port %d.", port); + server_fd.sin_port = htons(port); + // Do not sleep. + } else { + lf_print("RTI failed to get port %d. Will try again.", port); + lf_sleep(PORT_BIND_RETRY_INTERVAL); + } result = bind( socket_descriptor, (struct sockaddr *)&server_fd, @@ -153,22 +188,18 @@ int create_server(int32_t specified_port, uint16_t port, socket_type_t socket_ty lf_print_error_and_exit("Failed to bind the RTI socket. Port %d is not available. ", port); } char *type = "TCP"; - if (socket_type == UDP) - { + if (socket_type == UDP) { type = "UDP"; } lf_print("RTI using %s port %d for federation %s.", type, port, rti_remote->federation_id); - if (socket_type == TCP) - { + if (socket_type == TCP) { rti_remote->final_port_TCP = port; // Enable listening for socket connections. // The second argument is the maximum number of queued socket requests, // which according to the Mac man page is limited to 128. listen(socket_descriptor, 128); - } - else if (socket_type == UDP) - { + } else if (socket_type == UDP) { rti_remote->final_port_UDP = port; // No need to listen on the UDP socket } @@ -1694,23 +1725,15 @@ void initialize_federate(federate_info_t *fed, uint16_t id) fed->server_port = -1; } -int32_t start_rti_server(uint16_t port) -{ - int32_t specified_port = port; - if (port == 0) - { - // Use the default port. 
- port = DEFAULT_PORT; - } +int32_t start_rti_server(uint16_t port) { _lf_initialize_clock(); // Create the TCP socket server - rti_remote->socket_descriptor_TCP = create_server(specified_port, port, TCP); + rti_remote->socket_descriptor_TCP = create_rti_server(port, TCP); lf_print("RTI: Listening for federates."); // Create the UDP socket server - // Try to get the rti->final_port_TCP + 1 port - if (rti_remote->clock_sync_global_status >= clock_sync_on) - { - rti_remote->socket_descriptor_UDP = create_server(specified_port, rti_remote->final_port_TCP + 1, UDP); + // Try to get the rti_remote->final_port_TCP + 1 port + if (rti_remote->clock_sync_global_status >= clock_sync_on) { + rti_remote->socket_descriptor_UDP = create_rti_server(rti_remote->final_port_TCP + 1, UDP); } return rti_remote->socket_descriptor_TCP; } diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index 01a6edee2..c78d4bf6a 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -189,17 +189,6 @@ extern int lf_critical_section_exit(environment_t* env); */ extern bool _lf_federate_reports_error; -/** - * Create a server and enable listening for socket connections. - * - * @note This function is different from create_server(...) in federate.c. - * - * @param port The port number to use. - * @param socket_type The type of the socket for the server (TCP or UDP). - * @return The socket descriptor on which to accept connections. - */ -int create_server(int32_t specified_port, uint16_t port, socket_type_t socket_type); - /** * @brief Update the next event tag of federate `federate_id`. * diff --git a/core/federated/federate.c b/core/federated/federate.c index 6e4fe746f..3ddb674c6 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -754,8 +754,9 @@ void connect_to_federate(uint16_t remote_federate_id) { /** * Perform HMAC-based authentication with the RTI, using the federation ID * as an HMAC key. 
+ * @return 0 for success, -1 for failure. */ -void perform_hmac_authentication() { +static int perform_hmac_authentication() { // Send buffer including message type, federate ID, federate's nonce. size_t fed_id_length = sizeof(uint16_t); @@ -782,11 +783,13 @@ void perform_hmac_authentication() { "Failed to read RTI response."); if (received[0] != MSG_TYPE_RTI_RESPONSE) { if (received[0] == MSG_TYPE_RESIGN) { - lf_print_error_and_exit("RTI has resigned."); + lf_print_error("RTI has resigned."); + return -1; } else { - lf_print_error_and_exit( + lf_print_error( "Received unexpected response %u from the RTI (see net_common.h).", received[0]); + return -1; } } // Create tag to compare to received tag. @@ -807,13 +810,10 @@ void perform_hmac_authentication() { response[1] = HMAC_DOES_NOT_MATCH; LF_MUTEX_LOCK(outbound_socket_mutex); - write_to_socket_fail_on_error( - &_fed.socket_TCP_RTI, 2, response, &outbound_socket_mutex, - "Federate failed to write MSG_TYPE_REJECT message on the socket."); - shutdown(_fed.socket_TCP_RTI, SHUT_RDWR); - close(_fed.socket_TCP_RTI); - _fed.socket_TCP_RTI = -1; - LF_MUTEX_LOCK(outbound_socket_mutex); + // Ignore errors on writing back. + write_to_socket(&_fed.socket_TCP_RTI, 2, response, &outbound_socket_mutex); + LF_MUTEX_UNLOCK(outbound_socket_mutex); + return -1; } else { LF_PRINT_LOG("HMAC verified."); // HMAC tag is created with MSG_TYPE_FED_RESPONSE and received federate nonce. @@ -832,9 +832,46 @@ void perform_hmac_authentication() { "Failed to write fed response."); LF_MUTEX_UNLOCK(outbound_socket_mutex); } + return 0; } #endif +static void close_rti_socket() { + shutdown(_fed.socket_TCP_RTI, SHUT_RDWR); + close(_fed.socket_TCP_RTI); + _fed.socket_TCP_RTI = -1; +} + +/** + * Return in the result a struct with the address info for the specified hostname and port. + * The memory for the result is dynamically allocated and must be freed using freeaddrinfo. + * @param hostname The host name. + * @param port The port number. 
+ * @param result The struct into which to write. + */ +static void rti_address(const char* hostname, uint16_t port, struct addrinfo** result) { + struct addrinfo hints; + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_INET; /* Allow IPv4 */ + hints.ai_socktype = SOCK_STREAM; /* Stream socket */ + hints.ai_protocol = IPPROTO_TCP; /* TCP protocol */ + hints.ai_addr = NULL; + hints.ai_next = NULL; + hints.ai_flags = AI_NUMERICSERV; /* Allow only numeric port numbers */ + + // Convert port number to string. + char str[6]; + sprintf(str, "%u", port); + + // Get address structure matching hostname and hints criteria, and + // set port to the port number provided in str. There should only + // ever be one matching address structure, and we connect to that. + if (getaddrinfo(hostname, (const char*)&str, &hints, result)) { + lf_print_error_and_exit("No host for RTI matching given hostname: %s", hostname); + } +} + void connect_to_rti(const char* hostname, int port) { LF_PRINT_LOG("Connecting to the RTI."); @@ -842,15 +879,16 @@ void connect_to_rti(const char* hostname, int port) { hostname = federation_metadata.rti_host ? federation_metadata.rti_host : hostname; port = federation_metadata.rti_port >= 0 ? federation_metadata.rti_port : port; + // Adjust the port. uint16_t uport = 0; - if (port < 0 || - port > INT16_MAX) { + if (port < 0 || port > INT16_MAX) { lf_print_error( - "connect_to_rti(): Specified port (%d) is out of range," - " using the default port %d instead.", - port, DEFAULT_PORT + "connect_to_rti(): Specified port (%d) is out of range," + " using the default port %d instead.", + port, DEFAULT_PORT ); uport = DEFAULT_PORT; + port = 0; // Mark so that increments occur between tries. 
} else { uport = (uint16_t)port; } @@ -858,129 +896,146 @@ void connect_to_rti(const char* hostname, int port) { uport = DEFAULT_PORT; } - struct addrinfo hints; - struct addrinfo *res; + // Create a socket + _fed.socket_TCP_RTI = create_real_time_tcp_socket_errexit(); - memset(&hints, 0, sizeof(hints)); - hints.ai_family = AF_INET; /* Allow IPv4 */ - hints.ai_socktype = SOCK_STREAM; /* Stream socket */ - hints.ai_protocol = IPPROTO_TCP; /* TCP protocol */ - hints.ai_addr = NULL; - hints.ai_next = NULL; - hints.ai_flags = AI_NUMERICSERV; /* Allow only numeric port numbers */ + int result = -1; + int count_retries = 0; + struct addrinfo* res = NULL; - // Convert port number to string - char str[6]; - sprintf(str,"%u",uport); + while (count_retries++ < CONNECT_MAX_RETRIES && !_lf_termination_executed) { + if (res != NULL) { + // This is a repeated attempt. + if (_fed.socket_TCP_RTI >= 0) close_rti_socket(); - // Get address structure matching hostname and hints criteria, and - // set port to the port number provided in str. There should only - // ever be one matching address structure, and we connect to that. - if (getaddrinfo(hostname, (const char*)&str, &hints, &res)) { - lf_print_error_and_exit("No host for RTI matching given hostname: %s", hostname); - } + lf_sleep(CONNECT_RETRY_INTERVAL); - // Create a socket - _fed.socket_TCP_RTI = create_real_time_tcp_socket_errexit(); + // Create a new socket. + _fed.socket_TCP_RTI = create_real_time_tcp_socket_errexit(); - int result = connect(_fed.socket_TCP_RTI, res->ai_addr, res->ai_addrlen);; - int count_retries = 1; + if (port == 0) { + // Free previously allocated address info. + freeaddrinfo(res); + // Increment the port number. + uport++; + if (uport >= DEFAULT_PORT + MAX_NUM_PORT_ADDRESSES) uport = DEFAULT_PORT; - while (result < 0 && count_retries++ < CONNECT_MAX_RETRIES && !_lf_termination_executed) { - lf_print("Failed to connect to RTI on port %d. 
Will try again.", uport); - lf_sleep(CONNECT_RETRY_INTERVAL); - result = connect(_fed.socket_TCP_RTI, res->ai_addr, res->ai_addrlen); - } - freeaddrinfo(res); /* No longer needed */ + // Reconstruct the address info. + rti_address(hostname, uport, &res); + } + lf_print("Trying RTI again on port %d (attempt %d).", uport, count_retries); + } else { + // This is the first attempt. + rti_address(hostname, uport, &res); + } - if (result != 0) { - lf_print_error_and_exit("Failed to connect to RTI on port %d after %d tries.", uport, CONNECT_MAX_RETRIES); - } - lf_print("Successfully connected to an RTI."); + result = connect(_fed.socket_TCP_RTI, res->ai_addr, res->ai_addrlen); + if (result < 0) continue; // Connect failed. - // Have connected to an RTI, but not sure it's the right RTI. - // Send a MSG_TYPE_FED_IDS message and wait for a reply. - // Notify the RTI of the ID of this federate and its federation. - unsigned char buffer[4]; + // Have connected to an RTI, but not sure it's the right RTI. + // Send a MSG_TYPE_FED_IDS message and wait for a reply. + // Notify the RTI of the ID of this federate and its federation. #ifdef FEDERATED_AUTHENTICATED - LF_PRINT_LOG("Connected to an RTI. Performing HMAC-based authentication using federation ID."); - perform_hmac_authentication(); + LF_PRINT_LOG("Connected to an RTI. Performing HMAC-based authentication using federation ID."); + if (perform_hmac_authentication()) { + if (port == 0) { + continue; // Try again with a new port. + } else { + // No point in trying again because it will be the same port. + close_rti_socket(); + lf_print_error_and_exit("Authentication failed."); + } + } #else - LF_PRINT_LOG("Connected to an RTI. Sending federation ID for authentication."); + LF_PRINT_LOG("Connected to an RTI. Sending federation ID for authentication."); #endif - // Send the message type first. - buffer[0] = MSG_TYPE_FED_IDS; - // Next send the federate ID. 
- if (_lf_my_fed_id > UINT16_MAX) { - lf_print_error_and_exit("Too many federates! More than %d.", UINT16_MAX); - } - encode_uint16((uint16_t)_lf_my_fed_id, &buffer[1]); - // Next send the federation ID length. - // The federation ID is limited to 255 bytes. - size_t federation_id_length = strnlen(federation_metadata.federation_id, 255); - buffer[1 + sizeof(uint16_t)] = (unsigned char)(federation_id_length & 0xff); + // Send the message type first. + unsigned char buffer[4]; + buffer[0] = MSG_TYPE_FED_IDS; + // Next send the federate ID. + if (_lf_my_fed_id > UINT16_MAX) { + lf_print_error_and_exit("Too many federates! More than %d.", UINT16_MAX); + } + encode_uint16((uint16_t)_lf_my_fed_id, &buffer[1]); + // Next send the federation ID length. + // The federation ID is limited to 255 bytes. + size_t federation_id_length = strnlen(federation_metadata.federation_id, 255); + buffer[1 + sizeof(uint16_t)] = (unsigned char)(federation_id_length & 0xff); - // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_FED_ID, _lf_my_fed_id, NULL); + // Trace the event when tracing is enabled + tracepoint_federate_to_rti(_fed.trace, send_FED_ID, _lf_my_fed_id, NULL); - // No need for a mutex here because no other threads are writing to this socket. - write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, 2 + sizeof(uint16_t), buffer, NULL, - "Failed to send federate ID to RTI."); + // No need for a mutex here because no other threads are writing to this socket. + if (write_to_socket(_fed.socket_TCP_RTI, 2 + sizeof(uint16_t), buffer)) { + continue; // Try again, possibly on a new port. + } - // Next send the federation ID itself. - write_to_socket_fail_on_error( - &_fed.socket_TCP_RTI, federation_id_length, (unsigned char*)federation_metadata.federation_id, NULL, - "Failed to send federation ID to RTI."); + // Next send the federation ID itself. 
+ if (write_to_socket( + _fed.socket_TCP_RTI, + federation_id_length, + (unsigned char*)federation_metadata.federation_id)) { + continue; // Try again. + } - // Wait for a response. - // The response will be MSG_TYPE_REJECT if the federation ID doesn't match. - // Otherwise, it will be either MSG_TYPE_ACK or MSG_TYPE_UDP_PORT, where the latter - // is used if clock synchronization will be performed. - unsigned char response; + // Wait for a response. + // The response will be MSG_TYPE_REJECT if the federation ID doesn't match. + // Otherwise, it will be either MSG_TYPE_ACK or MSG_TYPE_UDP_PORT, where the latter + // is used if clock synchronization will be performed. + unsigned char response; - LF_PRINT_DEBUG("Waiting for response to federation ID from the RTI."); + LF_PRINT_DEBUG("Waiting for response to federation ID from the RTI."); - read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, 1, &response, NULL, - "Failed to read response from RTI."); - if (response == MSG_TYPE_REJECT) { - // Trace the event when tracing is enabled - tracepoint_federate_from_rti(_fed.trace, receive_REJECT, _lf_my_fed_id, NULL); - // Read one more byte to determine the cause of rejection. - unsigned char cause; - read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, 1, &cause, NULL, - "Failed to read the cause of rejection by the RTI."); - if (cause == FEDERATION_ID_DOES_NOT_MATCH || cause == WRONG_SERVER) { - lf_print_error_and_exit("Connected to the wrong RTI on port %d.", uport); + if (read_from_socket(_fed.socket_TCP_RTI, 1, &response)) { + continue; // Try again. + } + if (response == MSG_TYPE_REJECT) { + // Trace the event when tracing is enabled + tracepoint_federate_from_rti(_fed.trace, receive_REJECT, _lf_my_fed_id, NULL); + // Read one more byte to determine the cause of rejection. 
+ unsigned char cause; + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, 1, &cause, NULL, + "Failed to read the cause of rejection by the RTI."); + if (cause == FEDERATION_ID_DOES_NOT_MATCH || cause == WRONG_SERVER) { + lf_print_warning("Connected to the wrong RTI on port %d. Will try again", uport); + continue; + } + } else if (response == MSG_TYPE_ACK) { + // Trace the event when tracing is enabled + tracepoint_federate_from_rti(_fed.trace, receive_ACK, _lf_my_fed_id, NULL); + LF_PRINT_LOG("Received acknowledgment from the RTI."); + break; + } else if (response == MSG_TYPE_RESIGN) { + lf_print_warning("RTI on port %d resigned. Will try again", uport); + continue; + } else { + lf_print_warning("RTI on port %d gave unexpect response %u. Will try again", uport, response); + continue; } - lf_print_error_and_exit("RTI Rejected MSG_TYPE_FED_IDS message with response (see net_common.h): " - "%d. Error code: %d. Federate quits.\n", response, cause); - } else if (response == MSG_TYPE_ACK) { - // Trace the event when tracing is enabled - tracepoint_federate_from_rti(_fed.trace, receive_ACK, _lf_my_fed_id, NULL); - LF_PRINT_LOG("Received acknowledgment from the RTI."); - - // Call a generated (external) function that sends information - // about connections between this federate and other federates - // where messages are routed through the RTI. 
- // @see MSG_TYPE_NEIGHBOR_STRUCTURE in net_common.h - send_neighbor_structure_to_RTI(_fed.socket_TCP_RTI); - - uint16_t udp_port = setup_clock_synchronization_with_rti(); - - // Write the returned port number to the RTI - unsigned char UDP_port_number[1 + sizeof(uint16_t)]; - UDP_port_number[0] = MSG_TYPE_UDP_PORT; - encode_uint16(udp_port, &(UDP_port_number[1])); - write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, 1 + sizeof(uint16_t), UDP_port_number, NULL, - "Failed to send the UDP port number to the RTI."); - } else if (response == MSG_TYPE_RESIGN) { - lf_print_error_and_exit("RTI has resigned."); - } else { - lf_print_error_and_exit("Received unexpected response %u from the RTI (see net_common.h).", - response); } + if (result < 0) { + lf_print_error_and_exit("Failed to connect to RTI after %d tries.", CONNECT_MAX_RETRIES); + } + + freeaddrinfo(res); /* No longer needed */ + + // Call a generated (external) function that sends information + // about connections between this federate and other federates + // where messages are routed through the RTI. 
+ // @see MSG_TYPE_NEIGHBOR_STRUCTURE in net_common.h + send_neighbor_structure_to_RTI(_fed.socket_TCP_RTI); + + uint16_t udp_port = setup_clock_synchronization_with_rti(); + + // Write the returned port number to the RTI + unsigned char UDP_port_number[1 + sizeof(uint16_t)]; + UDP_port_number[0] = MSG_TYPE_UDP_PORT; + encode_uint16(udp_port, &(UDP_port_number[1])); + write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, 1 + sizeof(uint16_t), UDP_port_number, NULL, + "Failed to send the UDP port number to the RTI."); + lf_print("Connected to RTI at %s:%d.", hostname, uport); } diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index aff78ed10..6e9d018be 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -269,13 +269,14 @@ void connect_to_federate(uint16_t); void _lf_logical_tag_complete(tag_t); /** - * Connect to the RTI at the specified host and port and return - * the socket descriptor for the connection. If this fails, wait CONNECT_RETRY_INTERVAL - * and try again. If it fails after CONNECT_MAX_RETRIES, the - * program exits. If it succeeds, it sets the _fed.socket_TCP_RTI global - * variable to refer to the socket for communicating with the RTI. + * Connect to the RTI at the specified host and port and return the socket descriptor + * for the connection. If port_number is 0, then start at DEFAULT_PORT and increment + * the port number on each attempt. If an attempt fails, wait CONNECT_RETRY_INTERVAL + * and try again. If it fails after CONNECT_MAX_RETRIES, the program exits. + * If it succeeds, it sets the _fed.socket_TCP_RTI global variable to refer to + * the socket for communicating with the RTI. * @param hostname A hostname, such as "localhost". - * @param port_number A port number. + * @param port_number A port number or 0 to start with the default. 
*/ void connect_to_rti(const char* hostname, int port_number); diff --git a/include/core/federated/network/net_common.h b/include/core/federated/network/net_common.h index d3f6d7ff2..4e55216ed 100644 --- a/include/core/federated/network/net_common.h +++ b/include/core/federated/network/net_common.h @@ -201,16 +201,21 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /** * Time between a federate's attempts to connect to the RTI. */ -#define CONNECT_RETRY_INTERVAL SEC(1) +#define CONNECT_RETRY_INTERVAL MSEC(500) /** * Bound on the number of retries to connect to the RTI. * A federate will retry every CONNECT_RETRY_INTERVAL seconds - * this many times before giving up. E.g., 600 retries every - * 1 seconds results in retrying for about 10 minutes. - * This allows time to start federates before the RTI. + * this many times before giving up. */ -#define CONNECT_MAX_RETRIES 600 +#define CONNECT_MAX_RETRIES 100 + +/** + * Maximum number of port addresses that a federate will try to connect to the RTI on. + * If you are using automatic ports beginning at DEFAULT_PORT, this puts an upper bound + * on the number of RTIs that can be running on the same host. + */ +#define MAX_NUM_PORT_ADDRESSES 16 /** * Time that a federate waits before asking @@ -225,9 +230,9 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * When a process closes, the network stack typically waits between 30 and 120 * seconds before releasing the port. This is to allow for delayed packets so * that a new process does not receive packets from a previous process. - * Here, we limit the retries to 120 seconds. + * Here, we limit the retries to 60 seconds. */ -#define PORT_BIND_RETRY_INTERVAL SEC(2) +#define PORT_BIND_RETRY_INTERVAL SEC(1) /** * Number of attempts to bind to a port before giving up.
diff --git a/include/core/federated/network/net_util.h b/include/core/federated/network/net_util.h index 544e86d46..097127e25 100644 --- a/include/core/federated/network/net_util.h +++ b/include/core/federated/network/net_util.h @@ -75,7 +75,7 @@ extern lf_mutex_t socket_mutex; * (TCP_NODELAY) and Delayed ACKs disabled (TCP_QUICKACK). Exits application * on any error. * - * @return int + * @return The socket ID (a file descriptor). */ int create_real_time_tcp_socket_errexit(); From 6261541df8980f0d2dc93527b0b7b062098eaa3d Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Fri, 29 Dec 2023 15:54:11 -0800 Subject: [PATCH 34/83] Fixed authenticated --- core/federated/federate.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 3ddb674c6..d41172d95 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -768,11 +768,9 @@ static int perform_hmac_authentication() { RAND_bytes(fed_nonce, NONCE_LENGTH); memcpy(&fed_hello_buf[1 + fed_id_length], fed_nonce, NONCE_LENGTH); - LF_MUTEX_LOCK(outbound_socket_mutex); write_to_socket_fail_on_error( - &_fed.socket_TCP_RTI, message_length, fed_hello_buf, &outbound_socket_mutex, + &_fed.socket_TCP_RTI, message_length, fed_hello_buf, NULL, "Failed to write nonce."); - LF_MUTEX_UNLOCK(outbound_socket_mutex); // Check HMAC of received FED_RESPONSE message. unsigned int hmac_length = SHA256_HMAC_LENGTH; @@ -809,10 +807,8 @@ static int perform_hmac_authentication() { response[0] = MSG_TYPE_REJECT; response[1] = HMAC_DOES_NOT_MATCH; - LF_MUTEX_LOCK(outbound_socket_mutex); // Ignore errors on writing back. 
- write_to_socket(&_fed.socket_TCP_RTI, 2, response, &outbound_socket_mutex); - LF_MUTEX_UNLOCK(outbound_socket_mutex); + write_to_socket(_fed.socket_TCP_RTI, 2, response); return -1; } else { LF_PRINT_LOG("HMAC verified."); @@ -826,11 +822,9 @@ static int perform_hmac_authentication() { HMAC(EVP_sha256(), federation_metadata.federation_id, federation_id_length, mac_buf, 1 + NONCE_LENGTH, &sender[1], &hmac_length); - LF_MUTEX_LOCK(outbound_socket_mutex); write_to_socket_fail_on_error( - &_fed.socket_TCP_RTI, 1 + hmac_length, sender, &outbound_socket_mutex, + &_fed.socket_TCP_RTI, 1 + hmac_length, sender, NULL, "Failed to write fed response."); - LF_MUTEX_UNLOCK(outbound_socket_mutex); } return 0; } From 7ddc477551635374956554e62be6a6175bbcb0c8 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Fri, 29 Dec 2023 17:58:20 -0800 Subject: [PATCH 35/83] Fixed debug message --- core/threaded/scheduler_NP.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/threaded/scheduler_NP.c b/core/threaded/scheduler_NP.c index 9e032017b..aa3549e0d 100644 --- a/core/threaded/scheduler_NP.c +++ b/core/threaded/scheduler_NP.c @@ -128,7 +128,7 @@ int _lf_sched_distribute_ready_reactions(lf_scheduler_t* scheduler) { scheduler->next_reaction_level - 1 ]; - LF_PRINT_DEBUG("DEBUG: start of rxn queue at %lu is %p", scheduler->next_reaction_level - 1, ((reaction_t**)scheduler->executing_reactions)[0]); + LF_PRINT_DEBUG("Start of rxn queue at %lu is %p", scheduler->next_reaction_level - 1, ((reaction_t**)scheduler->executing_reactions)[0]); if (((reaction_t**)scheduler->executing_reactions)[0] != NULL) { // There is at least one reaction to execute return 1; From 191a261bc73c1930edc5fcd0eacd1889e58f9278 Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Fri, 29 Dec 2023 17:58:53 -0800 Subject: [PATCH 36/83] Fixed tracing of stop request messages --- core/federated/federate.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index d41172d95..1fb1fceaf 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2065,7 +2065,8 @@ int _lf_fd_send_stop_request_to_rti(tag_t stop_tag) { // Send a stop request with the specified tag to the RTI unsigned char buffer[MSG_TYPE_STOP_REQUEST_LENGTH]; // Stop at the next microstep - ENCODE_STOP_REQUEST(buffer, stop_tag.time, stop_tag.microstep + 1); + stop_tag.microstep++; + ENCODE_STOP_REQUEST(buffer, stop_tag.time, stop_tag.microstep); LF_MUTEX_LOCK(outbound_socket_mutex); // Do not send a stop request if a stop request has been previously received from the RTI. @@ -2215,6 +2216,9 @@ void handle_stop_request_message() { &_fed.socket_TCP_RTI, MSG_TYPE_STOP_REQUEST_REPLY_LENGTH, outgoing_buffer, &outbound_socket_mutex, "Failed to send the answer to MSG_TYPE_STOP_REQUEST to RTI."); LF_MUTEX_UNLOCK(outbound_socket_mutex); + + LF_PRINT_DEBUG("Sent MSG_TYPE_STOP_REQUEST_REPLY to RTI with tag " PRINTF_TAG, + tag_to_stop.time, tag_to_stop.microstep); // Trace the event when tracing is enabled tracepoint_federate_to_rti(_fed.trace, send_STOP_REQ_REP, _lf_my_fed_id, &tag_to_stop); } From c042ddd0d80a69190053da251c0e03af6f329023 Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Sat, 30 Dec 2023 08:13:09 -0800 Subject: [PATCH 37/83] Comment out too-verbose debug message --- core/federated/RTI/rti_common.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/federated/RTI/rti_common.c b/core/federated/RTI/rti_common.c index a3f996bd5..68e8608cc 100644 --- a/core/federated/RTI/rti_common.c +++ b/core/federated/RTI/rti_common.c @@ -102,10 +102,12 @@ tag_t earliest_future_incoming_message_tag(scheduling_node_t* e) { upstream->next_event = start_tag; } tag_t earliest_tag_from_upstream = lf_tag_add(upstream->next_event, e->min_delays[i].min_delay); + /* Following debug message is too verbose for normal use: LF_PRINT_DEBUG("RTI: Earliest next event upstream of fed/encl %d at fed/encl %d has tag " PRINTF_TAG ".", e->id, upstream->id, earliest_tag_from_upstream.time - start_time, earliest_tag_from_upstream.microstep); + */ if (lf_tag_compare(earliest_tag_from_upstream, t_d) < 0) { t_d = earliest_tag_from_upstream; } From 61c310f95ac5b67049a8b2a5db3f19348473e5c8 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Sat, 30 Dec 2023 08:14:18 -0800 Subject: [PATCH 38/83] Fixed deadlock with race in lf_request_stop --- core/federated/RTI/rti_remote.c | 130 ++++++++++++++----------------- core/federated/RTI/rti_remote.h | 12 --- core/federated/federate.c | 35 +++++---- core/threaded/reactor_threaded.c | 2 + 4 files changed, 82 insertions(+), 97 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index c9e8015d4..7e92c03c3 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -605,7 +605,7 @@ void handle_next_event_tag(federate_info_t *fed) * Boolean used to prevent the RTI from sending the * MSG_TYPE_STOP_GRANTED message multiple times. 
*/ -bool _lf_rti_stop_granted_already_sent_to_federates = false; +bool stop_granted_already_sent_to_federates = false; /** * Once the RTI has seen proposed tags from all connected federates, @@ -615,62 +615,62 @@ bool _lf_rti_stop_granted_already_sent_to_federates = false; * * This function assumes the caller holds the rti_mutex lock. */ -void _lf_rti_broadcast_stop_time_to_federates_locked() -{ - if (_lf_rti_stop_granted_already_sent_to_federates == true) - { +static void broadcast_stop_time_to_federates_locked() { + if (stop_granted_already_sent_to_federates == true) { return; } + stop_granted_already_sent_to_federates = true; + // Reply with a stop granted to all federates unsigned char outgoing_buffer[MSG_TYPE_STOP_GRANTED_LENGTH]; ENCODE_STOP_GRANTED(outgoing_buffer, rti_remote->base.max_stop_tag.time, rti_remote->base.max_stop_tag.microstep); // Iterate over federates and send each the message. - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) - { + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { federate_info_t *fed = GET_FED_INFO(i); - if (fed->enclave.state == NOT_CONNECTED) - { + if (fed->enclave.state == NOT_CONNECTED) { continue; } - if (lf_tag_compare(fed->enclave.next_event, rti_remote->base.max_stop_tag) >= 0) - { + if (lf_tag_compare(fed->enclave.next_event, rti_remote->base.max_stop_tag) >= 0) { // Need the next_event to be no greater than the stop tag. 
fed->enclave.next_event = rti_remote->base.max_stop_tag; } - if (rti_remote->base.tracing_enabled) - { + if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP_GRN, fed->enclave.id, &rti_remote->base.max_stop_tag); } - write_to_socket_fail_on_error(&fed->socket, MSG_TYPE_STOP_GRANTED_LENGTH, outgoing_buffer, &rti_mutex, - "RTI failed to send MSG_TYPE_STOP_GRANTED message to federate %d.", fed->enclave.id); + write_to_socket_fail_on_error( + &fed->socket, MSG_TYPE_STOP_GRANTED_LENGTH, outgoing_buffer, &rti_mutex, + "RTI failed to send MSG_TYPE_STOP_GRANTED message to federate %d.", fed->enclave.id); } - LF_PRINT_LOG("RTI sent to federates MSG_TYPE_STOP_GRANTED with tag (" PRINTF_TIME ", %u).", - rti_remote->base.max_stop_tag.time - start_time, - rti_remote->base.max_stop_tag.microstep); - _lf_rti_stop_granted_already_sent_to_federates = true; + LF_PRINT_LOG("RTI sent to federates MSG_TYPE_STOP_GRANTED with tag " PRINTF_TAG, + rti_remote->base.max_stop_tag.time - start_time, + rti_remote->base.max_stop_tag.microstep); } -void mark_federate_requesting_stop(federate_info_t *fed) -{ - if (!fed->requested_stop) - { - // Assume that the federate - // has requested stop +/** + * Mark a federate requesting stop. If the number of federates handling stop reaches the + * NUM_OF_FEDERATES, broadcast MSG_TYPE_STOP_GRANTED to every federate. + * This function assumes the _RTI.mutex is already locked. + * @param fed The federate that has requested a stop. + * @return 1 if stop time has been sent to all federates and 0 otherwise. 
+ */ +static int mark_federate_requesting_stop(federate_info_t *fed) { + if (!fed->requested_stop) { rti_remote->base.num_scheduling_nodes_handling_stop++; fed->requested_stop = true; } - if (rti_remote->base.num_scheduling_nodes_handling_stop == rti_remote->base.number_of_scheduling_nodes) - { + if (rti_remote->base.num_scheduling_nodes_handling_stop + == rti_remote->base.number_of_scheduling_nodes) { // We now have information about the stop time of all // federates. - _lf_rti_broadcast_stop_time_to_federates_locked(); + broadcast_stop_time_to_federates_locked(); + return 1; } + return 0; } -void handle_stop_request_message(federate_info_t *fed) -{ +void handle_stop_request_message(federate_info_t *fed) { LF_PRINT_DEBUG("RTI handling stop_request from federate %d.", fed->enclave.id); size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_LENGTH - 1; @@ -682,75 +682,64 @@ void handle_stop_request_message(federate_info_t *fed) // Extract the proposed stop tag for the federate tag_t proposed_stop_tag = extract_tag(buffer); - if (rti_remote->base.tracing_enabled) - { + if (rti_remote->base.tracing_enabled) { tracepoint_rti_from_federate(rti_remote->base.trace, receive_STOP_REQ, fed->enclave.id, &proposed_stop_tag); } + LF_PRINT_LOG("RTI received from federate %d a MSG_TYPE_STOP_REQUEST message with tag " PRINTF_TAG ".", + fed->enclave.id, proposed_stop_tag.time - start_time, proposed_stop_tag.microstep); + // Acquire a mutex lock to ensure that this state does change while a // message is in transport or being used to determine a TAG. LF_MUTEX_LOCK(rti_mutex); // Check whether we have already received a stop_tag // from this federate - if (fed->requested_stop) - { - // Ignore this request + if (fed->requested_stop) { + // If stop request messages have already been broadcast, treat this as if it were a reply. 
+ if (rti_remote->stop_in_progress) { + mark_federate_requesting_stop(fed); + } LF_MUTEX_UNLOCK(rti_mutex); return; } // Update the maximum stop tag received from federates - if (lf_tag_compare(proposed_stop_tag, rti_remote->base.max_stop_tag) > 0) - { + if (lf_tag_compare(proposed_stop_tag, rti_remote->base.max_stop_tag) > 0) { rti_remote->base.max_stop_tag = proposed_stop_tag; } - LF_PRINT_LOG("RTI received from federate %d a MSG_TYPE_STOP_REQUEST message with tag " PRINTF_TAG ".", - fed->enclave.id, proposed_stop_tag.time - start_time, proposed_stop_tag.microstep); - - // If this federate has not already asked - // for a stop, add it to the tally. - mark_federate_requesting_stop(fed); - - if (rti_remote->base.num_scheduling_nodes_handling_stop == rti_remote->base.number_of_scheduling_nodes) - { - // We now have information about the stop time of all - // federates, and mark_federate_requesting_stop has sent out stop time to. + // If all federates have replied, send stop request granted. + if (mark_federate_requesting_stop(fed)) { + // Have sent stop request granted to all federates. Nothing more to do. LF_MUTEX_UNLOCK(rti_mutex); return; } + // Forward the stop request to all other federates that have not // also issued a stop request. unsigned char stop_request_buffer[MSG_TYPE_STOP_REQUEST_LENGTH]; - ENCODE_STOP_REQUEST(stop_request_buffer, rti_remote->base.max_stop_tag.time, rti_remote->base.max_stop_tag.microstep); + ENCODE_STOP_REQUEST(stop_request_buffer, + rti_remote->base.max_stop_tag.time, rti_remote->base.max_stop_tag.microstep); // Iterate over federates and send each the MSG_TYPE_STOP_REQUEST message // if we do not have a stop_time already for them. Do not do this more than once.
- if (rti_remote->stop_in_progress) - { + if (rti_remote->stop_in_progress) { LF_MUTEX_UNLOCK(rti_mutex); return; } rti_remote->stop_in_progress = true; - for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) - { + // FIXME: Need a timeout here in case a federate never replies. + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { federate_info_t *f = GET_FED_INFO(i); - if (f->enclave.id != fed->enclave.id && f->requested_stop == false) - { - if (f->enclave.state == NOT_CONNECTED) - { + if (f->enclave.id != fed->enclave.id && f->requested_stop == false) { + if (f->enclave.state == NOT_CONNECTED) { mark_federate_requesting_stop(f); continue; } - if (rti_remote->base.tracing_enabled) - { - tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP_REQ, f->enclave.id, &rti_remote->base.max_stop_tag); - } write_to_socket_fail_on_error(&f->socket, MSG_TYPE_STOP_REQUEST_LENGTH, stop_request_buffer, &rti_mutex, - "RTI failed to forward MSG_TYPE_STOP_REQUEST message to federate %d.", f->enclave.id); - if (rti_remote->base.tracing_enabled) - { + "RTI failed to forward MSG_TYPE_STOP_REQUEST message to federate %d.", f->enclave.id); + if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP_REQ, f->enclave.id, &rti_remote->base.max_stop_tag); } } @@ -761,8 +750,7 @@ void handle_stop_request_message(federate_info_t *fed) LF_MUTEX_UNLOCK(rti_mutex); } -void handle_stop_request_reply(federate_info_t *fed) -{ +void handle_stop_request_reply(federate_info_t *fed) { size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_REPLY_LENGTH - 1; unsigned char buffer_stop_time[bytes_to_read]; read_from_socket_fail_on_error(&fed->socket, bytes_to_read, buffer_stop_time, NULL, @@ -771,20 +759,18 @@ void handle_stop_request_reply(federate_info_t *fed) tag_t federate_stop_tag = extract_tag(buffer_stop_time); - if (rti_remote->base.tracing_enabled) - { + if (rti_remote->base.tracing_enabled) { 
tracepoint_rti_from_federate(rti_remote->base.trace, receive_STOP_REQ_REP, fed->enclave.id, &federate_stop_tag); } LF_PRINT_LOG("RTI received from federate %d STOP reply tag " PRINTF_TAG ".", fed->enclave.id, - federate_stop_tag.time - start_time, - federate_stop_tag.microstep); + federate_stop_tag.time - start_time, + federate_stop_tag.microstep); // Acquire the mutex lock so that we can change the state of the RTI LF_MUTEX_LOCK(rti_mutex); // If the federate has not requested stop before, count the reply - if (lf_tag_compare(federate_stop_tag, rti_remote->base.max_stop_tag) > 0) - { + if (lf_tag_compare(federate_stop_tag, rti_remote->base.max_stop_tag) > 0) { rti_remote->base.max_stop_tag = federate_stop_tag; } mark_federate_requesting_stop(fed); diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index c78d4bf6a..ff219e996 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -242,18 +242,6 @@ void handle_logical_tag_complete(federate_info_t* fed); void handle_next_event_tag(federate_info_t* fed); /////////////////// STOP functions //////////////////// -/** - * Mark a federate requesting stop. - * - * If the number of federates handling stop reaches the - * NUM_OF_FEDERATES, broadcast MSG_TYPE_STOP_GRANTED to every federate. - * - * This function assumes the _RTI.mutex is already locked. - * - * @param fed The federate that has requested a stop or has suddenly - * stopped (disconnected). - */ -void mark_federate_requesting_stop(federate_info_t* fed); /** * Handle a MSG_TYPE_STOP_REQUEST message. 
diff --git a/core/federated/federate.c b/core/federated/federate.c index 1fb1fceaf..5ad41bd13 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2083,6 +2083,9 @@ int _lf_fd_send_stop_request_to_rti(tag_t stop_tag) { write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, MSG_TYPE_STOP_REQUEST_LENGTH, buffer, &outbound_socket_mutex, "Failed to send stop time " PRINTF_TIME " to the RTI.", stop_tag.time - start_time); + + // Treat this sending as equivalent to having received a stop request from the RTI. + _fed.received_stop_request_from_rti = true; LF_MUTEX_UNLOCK(outbound_socket_mutex); // Trace the event when tracing is enabled tracepoint_federate_to_rti(_fed.trace, send_STOP_REQ, _lf_my_fed_id, &stop_tag); @@ -2153,10 +2156,23 @@ void handle_stop_request_message() { // Trace the event when tracing is enabled tracepoint_federate_from_rti(_fed.trace, receive_STOP_REQ, _lf_my_fed_id, &tag_to_stop); - LF_PRINT_LOG("Received from RTI a MSG_TYPE_STOP_REQUEST message with tag " PRINTF_TAG ".", + LF_PRINT_LOG("Received from RTI a MSG_TYPE_STOP_REQUEST signal with tag " PRINTF_TAG ".", tag_to_stop.time - start_time, tag_to_stop.microstep); + extern lf_mutex_t global_mutex; + extern bool lf_stop_requested; + bool already_blocked = false; + + LF_MUTEX_LOCK(global_mutex); + if (lf_stop_requested) { + LF_PRINT_LOG("Ignoring MSG_TYPE_STOP_REQUEST from RTI because lf_request_stop has been called locally."); + already_blocked = true; + } + // Treat the stop request from the RTI as if a local stop request had been received. + lf_stop_requested = true; + LF_MUTEX_UNLOCK(global_mutex); + // If we have previously received from the RTI a stop request, // or we have previously sent a stop request to the RTI, // then we have already blocked tag advance in enclaves. @@ -2165,23 +2181,16 @@ void handle_stop_request_message() { // The second is guarded by the global mutex. // Note that the RTI should not send stop requests more than once to federates. 
LF_MUTEX_LOCK(outbound_socket_mutex); - bool already_blocked = false; if (_fed.received_stop_request_from_rti) { + LF_PRINT_LOG("Redundant MSG_TYPE_STOP_REQUEST from RTI. Ignoring it."); already_blocked = true; + } else if (!already_blocked) { + // Do this only if lf_request_stop has not been called because it will + // prevent lf_request_stop from sending. + _fed.received_stop_request_from_rti = true; } - _fed.received_stop_request_from_rti = true; LF_MUTEX_UNLOCK(outbound_socket_mutex); - extern lf_mutex_t global_mutex; - extern bool lf_stop_requested; - LF_MUTEX_LOCK(global_mutex); - if (lf_stop_requested) { - already_blocked = true; - } - // Treat the stop request from the RTI as if a local stop request had been received. - lf_stop_requested = true; - LF_MUTEX_UNLOCK(global_mutex); - if (already_blocked) { // Either we have sent a stop request to the RTI ourselves, // or we have previously received a stop request from the RTI. diff --git a/core/threaded/reactor_threaded.c b/core/threaded/reactor_threaded.c index c8dae9b15..b8858763b 100644 --- a/core/threaded/reactor_threaded.c +++ b/core/threaded/reactor_threaded.c @@ -587,9 +587,11 @@ bool lf_stop_requested = false; // See reactor.h for docs. void lf_request_stop() { // If a requested stop is pending, return without doing anything. + LF_PRINT_LOG("lf_request_stop() has been called."); lf_mutex_lock(&global_mutex); if (lf_stop_requested) { lf_mutex_unlock(&global_mutex); + LF_PRINT_LOG("Ignoring redundant lf_request_stop() call."); return; } lf_stop_requested = true; From 283ff040d56d2da71954dd22ca958d3e2adf6ab5 Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Sat, 30 Dec 2023 10:49:30 -0800 Subject: [PATCH 39/83] Impose a time out for response to stop requests --- core/federated/RTI/rti_remote.c | 24 +++++++++++++++++++++++- core/federated/RTI/rti_remote.h | 3 +++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 7e92c03c3..2dab501af 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -670,6 +670,25 @@ static int mark_federate_requesting_stop(federate_info_t *fed) { return 0; } +/** + * Thread to time out if federates do not reply to stop request. + */ +static void* wait_for_stop_request_reply(void* args) { + // Divide the time into small chunks and check periodically. + interval_t chunk = MAX_TIME_FOR_REPLY_TO_STOP_REQUEST/30; + int count = 0; + while (count++ < 30) { + if (stop_granted_already_sent_to_federates) return NULL; + lf_sleep(chunk); + } + // If we reach here, then error out. + lf_print_error_and_exit("Received only %d stop request replies within timeout " + PRINTF_TIME "ns. RTI is exiting.", + rti_remote->base.num_scheduling_nodes_handling_stop, + MAX_TIME_FOR_REPLY_TO_STOP_REQUEST + ); +} + void handle_stop_request_message(federate_info_t *fed) { LF_PRINT_DEBUG("RTI handling stop_request from federate %d.", fed->enclave.id); @@ -729,7 +748,10 @@ void handle_stop_request_message(federate_info_t *fed) { return; } rti_remote->stop_in_progress = true; - // FIXME: Need a timeout here in case a federate never replies. + // Need a timeout here in case a federate never replies. 
+ lf_thread_t timeout_thread; + lf_thread_create(&timeout_thread, wait_for_stop_request_reply, NULL); + for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { federate_info_t *f = GET_FED_INFO(i); if (f->enclave.id != fed->enclave.id && f->requested_stop == false) { diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index ff219e996..0a6896bb0 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -32,6 +32,9 @@ #include "lf_types.h" #include "message_record/message_record.h" +/** Time allowed for federates to reply to stop request. */ +#define MAX_TIME_FOR_REPLY_TO_STOP_REQUEST SEC(30) + ///////////////////////////////////////////// //// Data structures From 1d3a2e1b31571af12c56bad97554e7e076926be1 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Sat, 30 Dec 2023 14:43:44 -0800 Subject: [PATCH 40/83] Tolerate socket closing during reading physical connection --- core/federated/federate.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 5ad41bd13..6916ef069 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -1470,8 +1470,14 @@ void handle_message(int* socket, int fed_id) { // Read the header. size_t bytes_to_read = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t); unsigned char buffer[bytes_to_read]; - read_from_socket_fail_on_error(socket, bytes_to_read, buffer, NULL, - "Failed to read message header."); + if (read_from_socket_close_on_error(socket, bytes_to_read, buffer)) { + // Read failed, which means the socket has been closed between reading the + // message ID byte and here. Issue a warning only. This is a physical + // connection, so likely the message is just late. If it's a serious failure, + // it should be caught in another thread. + lf_print_warning("Failed to read message header."); + return; + } // Extract the header information. 
unsigned short port_id; @@ -1488,8 +1494,9 @@ void handle_message(int* socket, int fed_id) { // Read the payload. // Allocate memory for the message contents. unsigned char* message_contents = (unsigned char*)malloc(length); - read_from_socket_fail_on_error(socket, length, message_contents, NULL, - "Failed to read message body."); + if (read_from_socket_close_on_error(socket, length, message_contents)) { + lf_print_warning("Failed to read message body."); + } // Trace the event when tracing is enabled tracepoint_federate_from_federate(_fed.trace, receive_P2P_MSG, _lf_my_fed_id, federate_id, NULL); LF_PRINT_LOG("Message received by federate: %s. Length: %zu.", message_contents, length); From 99edcf974516470d2ea4c4202555e0027649594f Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Sat, 30 Dec 2023 17:56:27 -0800 Subject: [PATCH 41/83] Have _lf_schedule_at_tag return trigger_handle_t --- core/reactor_common.c | 15 ++++++++++----- include/core/reactor_common.h | 2 +- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/core/reactor_common.c b/core/reactor_common.c index 307627a6e..e16d58a12 100644 --- a/core/reactor_common.c +++ b/core/reactor_common.c @@ -678,10 +678,11 @@ static void _lf_replace_token(event_t* event, lf_token_t* token) { * @param tag Logical tag of the event * @param token The token wrapping the payload or NULL for no payload. * - * @return 1 for success, 0 if no new event was scheduled (instead, the payload was updated), - * or -1 for error (the tag is equal to or less than the current tag). + * @return A positive trigger handle for success, 0 if no new event was scheduled + * (instead, the payload was updated), or -1 for error (the tag is equal to or less + * than the current tag). 
*/ -int _lf_schedule_at_tag(environment_t* env, trigger_t* trigger, tag_t tag, lf_token_t* token) { +trigger_handle_t _lf_schedule_at_tag(environment_t* env, trigger_t* trigger, tag_t tag, lf_token_t* token) { assert(env != GLOBAL_ENVIRONMENT); tag_t current_logical_tag = env->current_tag; @@ -852,7 +853,11 @@ int _lf_schedule_at_tag(environment_t* env, trigger_t* trigger, tag_t tag, lf_to pqueue_insert(env->event_q, _lf_create_dummy_events(env, trigger, tag.time, e, relative_microstep)); } } - return 1; + trigger_handle_t return_value = env->_lf_handle++; + if (env->_lf_handle < 0) { + env->_lf_handle = 1; + } + return return_value; } /** @@ -1110,7 +1115,7 @@ trigger_handle_t _lf_schedule(environment_t *env, trigger_t* trigger, interval_t // NOTE: Rather than wrapping around to get a negative number, // we reset the handle on the assumption that much earlier // handles are irrelevant. - int return_value = env->_lf_handle++; + trigger_handle_t return_value = env->_lf_handle++; if (env->_lf_handle < 0) { env->_lf_handle = 1; } diff --git a/include/core/reactor_common.h b/include/core/reactor_common.h index 1010fc65e..b9c9d86ea 100644 --- a/include/core/reactor_common.h +++ b/include/core/reactor_common.h @@ -63,7 +63,7 @@ event_t* _lf_create_dummy_events( event_t* next, microstep_t offset ); -int _lf_schedule_at_tag(environment_t* env, trigger_t* trigger, tag_t tag, lf_token_t* token); +trigger_handle_t _lf_schedule_at_tag(environment_t* env, trigger_t* trigger, tag_t tag, lf_token_t* token); trigger_handle_t _lf_schedule(environment_t* env, trigger_t* trigger, interval_t extra_delay, lf_token_t* token); trigger_handle_t _lf_insert_reactions_for_trigger(environment_t* env, trigger_t* trigger, lf_token_t* token); From 3c95c2f78850cb1af731d206b0b1266eb7ae9fe2 Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Sat, 30 Dec 2023 18:07:20 -0800 Subject: [PATCH 42/83] Allow tardy messages to unblock reactions and clarify docs --- core/federated/federate.c | 83 +++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 29 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 6916ef069..1f7c577b9 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -1366,7 +1366,7 @@ static trigger_handle_t schedule_message_received_from_network_locked( assert(env != GLOBAL_ENVIRONMENT); // Return value of the function - int return_value = 0; + trigger_handle_t return_value = 0; // Indicates whether or not the intended tag // of the message (timestamp, microstep) is @@ -1516,6 +1516,52 @@ void stall_advance_level_federation(environment_t* env, size_t level) { LF_MUTEX_UNLOCK(env->mutex); } +/** + * Return true if reactions need to be inserted directly into the reaction queue and + * false if a call to schedule is needed (the normal case). This function handles zero-delay + * cycles, where processing at a tag must be able to begin before all messages have arrived + * at that tag. This returns true if the following conditions are all true: + * + * 1. the first reaction triggered has a level >= MLAA (a port is or will be blocked on this trigger); + * 2. the intended_tag is less than or equal to the current tag of the environment; + * 3. the intended_tag is greater than the last_tag of the trigger; + * 4. the intended_tag is greater than the last_known_status_tag of the trigger; + * 5. the execution has started (the event queue has been examined); + * 6. the trigger is not physical; + * + * The comparison against the MLAA (condition 1), if true, means that there is a blocking port + * waiting for this trigger (or possibly an earlier blocking port). For condition (2), if the + * intended tag is less than the current tag, then the message is tardy. 
A tardy message can + * unblock a port, although it will trigger an STP violation handler if one is defined or an + * error if not (or if centralized coordination is being used). The comparison against the + * last_tag of the trigger (condition 3) ensures that if the message is tardy but there is + * already an earlier tardy message that has been handled (or is being handled), then we + * don't try to handle two messages in the same tag, which is not allowed. For example, there + * could be a case where current tag is 10 with a port absent reaction waiting, and a message + * has arrived with intended_tag 8. This message will eventually cause the port absent reaction + * to exit, but before that, a message with intended_tag of 9 could arrive before the port absent + * reaction has had a chance to exit. The port status is on the other hand changed in this thread, + * and thus, can be checked in this scenario without this race condition. The message with + * intended_tag of 9 in this case needs to wait one microstep to be processed. The check with + * last_known_status_tag (condition 4) deals with messages arriving with identical intended + * tags (which should not happen). This one will be handled late (one microstep later than + * the current tag if 1 and 2 are true). + * + * This function assumes the mutex is held on the environment. + * + * @param env The environment. + * @param trigger The trigger. + * @param intended_tag The intended tag. + */ +static bool handle_message_now(environment_t* env, trigger_t* trigger, tag_t intended_tag) { + return trigger->reactions[0]->index >= max_level_allowed_to_advance + && lf_tag_compare(intended_tag, lf_tag(env)) <= 0 + && lf_tag_compare(intended_tag, trigger->last_tag) > 0 + && lf_tag_compare(intended_tag, trigger->last_known_status_tag) > 0 + && env->execution_started + && !trigger->is_physical; +} + /** * Handle a tagged message being received from a remote federate via the RTI * or directly from other federates. 
@@ -1603,35 +1649,9 @@ void handle_tagged_message(int* socket, int fed_id) { // Create a token for the message lf_token_t* message_token = _lf_new_token((token_type_t*)action, message_contents, length); - // Check whether reactions need to be inserted directly into the reaction - // queue or a call to schedule is needed. This checks if the intended - // tag of the message is for the current tag or a tag that is already - // passed and if any port absent reaction is waiting on this port (or the - // execution hasn't even started). - // If the tag is intended for a tag that is passed, the port absent reactions - // would need to exit because only one message can be processed per tag, - // and that message is going to be a tardy message. The actual tardiness - // handling is done inside _lf_insert_reactions_for_trigger. - // To prevent multiple processing of messages per tag, - // we also need to check the port status. - // For example, there could be a case where current tag is - // 10 with a port absent reaction waiting, and a message has arrived with intended_tag 8. - // This message will eventually cause the port absent reaction to exit, but before that, - // a message with intended_tag of 9 could arrive before the port absent reaction has had a chance - // to exit. The port status is on the other hand changed in this thread, and thus, - // can be checked in this scenario without this race condition. The message with - // intended_tag of 9 in this case needs to wait one microstep to be processed. - if (lf_tag_compare(intended_tag, lf_tag(env)) == 0 // The event is meant for the current tag. - && env->execution_started - // Check that MLAA is blocking at the right level. Otherwise, data can be lost. - && action->trigger->reactions[0]->index >= max_level_allowed_to_advance - && !action->trigger->is_physical - && lf_tag_compare(intended_tag, action->trigger->last_tag) > 0 // Not already enabled at the current tag. 
- && lf_tag_compare(intended_tag, action->trigger->last_known_status_tag) > 0 - ) { + if (handle_message_now(env, action->trigger, intended_tag)) { // Since the message is intended for the current tag and a port absent reaction - // was waiting for the message, trigger the corresponding reactions for this - // message. + // was waiting for the message, trigger the corresponding reactions for this message. update_last_known_status_on_input_port(env, intended_tag, port_id); @@ -1643,7 +1663,12 @@ void handle_tagged_message(int* socket, int fed_id) { intended_tag.time - lf_time_start(), intended_tag.microstep ); + // Only set the intended tag of the trigger if it is being executed now + // because otherwise this may preempt the intended_tag of a previous activation + // of the trigger. action->trigger->intended_tag = intended_tag; + + // This will mark the STP violation in the reaction if the message is tardy. _lf_insert_reactions_for_trigger(env, action->trigger, message_token); // Set the status of the port as present here to inform the network input From 0d4d997942db576830668285a7ad97d10a431f4a Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Sun, 31 Dec 2023 17:07:48 -0800 Subject: [PATCH 43/83] General cleanup --- core/federated/RTI/rti_common.c | 2 +- core/federated/RTI/rti_common.h | 2 +- core/federated/RTI/rti_remote.c | 229 +- core/federated/RTI/rti_remote.h | 12 +- core/federated/clock-sync.c | 2 +- core/federated/federate.c | 4544 +++++++++---------- core/reactor_common.c | 41 +- core/threaded/reactor_threaded.c | 28 +- core/threaded/scheduler_sync_tag_advance.c | 6 +- include/core/federated/federate.h | 354 +- include/core/federated/network/net_common.h | 8 +- include/core/federated/network/net_util.h | 2 +- include/core/reactor.h | 3 - include/core/reactor_common.h | 15 - include/core/threaded/reactor_threaded.h | 4 +- 15 files changed, 2569 insertions(+), 2683 deletions(-) diff --git a/core/federated/RTI/rti_common.c b/core/federated/RTI/rti_common.c index 68e8608cc..424e48135 100644 --- a/core/federated/RTI/rti_common.c +++ b/core/federated/RTI/rti_common.c @@ -66,7 +66,7 @@ void _logical_tag_complete(scheduling_node_t* enclave, tag_t completed) { enclave->completed = completed; - LF_PRINT_LOG("RTI received from federate/enclave %d the Logical Tag Complete (LTC) " PRINTF_TAG ".", + LF_PRINT_LOG("RTI received from federate/enclave %d the latest tag complete (LTC) " PRINTF_TAG ".", enclave->id, enclave->completed.time - start_time, enclave->completed.microstep); // Check downstream scheduling_nodes to see whether they should now be granted a TAG. diff --git a/core/federated/RTI/rti_common.h b/core/federated/RTI/rti_common.h index 010b5f2f2..d80775e05 100644 --- a/core/federated/RTI/rti_common.h +++ b/core/federated/RTI/rti_common.h @@ -204,7 +204,7 @@ void notify_provisional_tag_advance_grant(scheduling_node_t* e, tag_t tag); * If M is equal to the NET of the federate, then return PTAG(M). 
* * This should be called whenever an immediately upstream federate sends to - * the RTI an LTC (Logical Tag Complete), or when a transitive upstream + * the RTI an LTC (latest tag complete), or when a transitive upstream * federate sends a NET (Next Event Tag) message. * It is also called when an upstream federate resigns from the federation. * diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 2dab501af..6c6f11f90 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -183,8 +183,7 @@ static int create_rti_server(uint16_t port, socket_type_t socket_type) { (struct sockaddr *)&server_fd, sizeof(server_fd)); } - if (result != 0) - { + if (result != 0) { lf_print_error_and_exit("Failed to bind the RTI socket. Port %d is not available. ", port); } char *type = "TCP"; @@ -209,14 +208,14 @@ static int create_rti_server(uint16_t port, socket_type_t socket_type) { void notify_tag_advance_grant(scheduling_node_t *e, tag_t tag) { - if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || lf_tag_compare(tag, e->last_provisionally_granted) < 0) - { + if (e->state == NOT_CONNECTED + || lf_tag_compare(tag, e->last_granted) <= 0 + || lf_tag_compare(tag, e->last_provisionally_granted) < 0) { return; } // Need to make sure that the destination federate's thread has already // sent the starting MSG_TYPE_TIMESTAMP message. - while (e->state == PENDING) - { + while (e->state == PENDING) { // Need to wait here. 
lf_cond_wait(&sent_start_time); } @@ -226,8 +225,7 @@ void notify_tag_advance_grant(scheduling_node_t *e, tag_t tag) encode_int64(tag.time, &(buffer[1])); encode_int32((int32_t)tag.microstep, &(buffer[1 + sizeof(int64_t)])); - if (rti_remote->base.tracing_enabled) - { + if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_TAG, e->id, &tag); } // This function is called in notify_advance_grant_if_safe(), which is a long @@ -244,16 +242,15 @@ void notify_tag_advance_grant(scheduling_node_t *e, tag_t tag) } } -void notify_provisional_tag_advance_grant(scheduling_node_t *e, tag_t tag) -{ - if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || lf_tag_compare(tag, e->last_provisionally_granted) <= 0) - { +void notify_provisional_tag_advance_grant(scheduling_node_t *e, tag_t tag) { + if (e->state == NOT_CONNECTED + || lf_tag_compare(tag, e->last_granted) <= 0 + || lf_tag_compare(tag, e->last_provisionally_granted) <= 0) { return; } // Need to make sure that the destination federate's thread has already // sent the starting MSG_TYPE_TIMESTAMP message. - while (e->state == PENDING) - { + while (e->state == PENDING) { // Need to wait here. lf_cond_wait(&sent_start_time); } @@ -263,8 +260,7 @@ void notify_provisional_tag_advance_grant(scheduling_node_t *e, tag_t tag) encode_int64(tag.time, &(buffer[1])); encode_int32((int32_t)tag.microstep, &(buffer[1 + sizeof(int64_t)])); - if (rti_remote->base.tracing_enabled) - { + if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_PTAG, e->id, &tag); } // This function is called in notify_advance_grant_if_safe(), which is a long @@ -274,9 +270,7 @@ void notify_provisional_tag_advance_grant(scheduling_node_t *e, tag_t tag) lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); e->state = NOT_CONNECTED; // FIXME: We need better error handling, but don't stop other execution here. 
- } - else - { + } else { e->last_provisionally_granted = tag; LF_PRINT_LOG("RTI sent to federate %d the Provisional Tag Advance Grant (PTAG) " PRINTF_TAG ".", e->id, tag.time - start_time, tag.microstep); @@ -290,8 +284,7 @@ void notify_provisional_tag_advance_grant(scheduling_node_t *e, tag_t tag) // Note that this is transitive. // NOTE: This is not needed for enclaves because zero-delay loops are prohibited. // It's only needed for federates, which is why this is implemented here. - for (int j = 0; j < e->num_upstream; j++) - { + for (int j = 0; j < e->num_upstream; j++) { scheduling_node_t *upstream = rti_remote->base.scheduling_nodes[e->upstream[j]]; // Ignore this federate if it has resigned. @@ -303,33 +296,27 @@ void notify_provisional_tag_advance_grant(scheduling_node_t *e, tag_t tag) // If these tags are equal, then a TAG or PTAG should have already been granted, // in which case, another will not be sent. But it may not have been already granted. - if (lf_tag_compare(earliest, tag) > 0) - { + if (lf_tag_compare(earliest, tag) > 0) { notify_tag_advance_grant(upstream, tag); - } - else if (lf_tag_compare(earliest, tag) == 0 && lf_tag_compare(strict_earliest, tag) > 0) - { + } else if (lf_tag_compare(earliest, tag) == 0 && lf_tag_compare(strict_earliest, tag) > 0) { notify_provisional_tag_advance_grant(upstream, tag); } } } } -void update_federate_next_event_tag_locked(uint16_t federate_id, tag_t next_event_tag) -{ +void update_federate_next_event_tag_locked(uint16_t federate_id, tag_t next_event_tag) { federate_info_t *fed = GET_FED_INFO(federate_id); tag_t min_in_transit_tag = get_minimum_in_transit_message_tag(fed->in_transit_message_tags); if (lf_tag_compare( min_in_transit_tag, - next_event_tag) < 0) - { + next_event_tag) < 0) { next_event_tag = min_in_transit_tag; } update_scheduling_node_next_event_tag_locked(&(fed->enclave), next_event_tag); } -void handle_port_absent_message(federate_info_t *sending_federate, unsigned char *buffer) -{ +void 
handle_port_absent_message(federate_info_t *sending_federate, unsigned char *buffer) { size_t message_size = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int64_t) + sizeof(uint32_t); read_from_socket_fail_on_error( @@ -341,8 +328,7 @@ void handle_port_absent_message(federate_info_t *sending_federate, unsigned char uint16_t federate_id = extract_uint16(&(buffer[1 + sizeof(uint16_t)])); tag_t tag = extract_tag(&(buffer[1 + 2 * sizeof(uint16_t)])); - if (rti_remote->base.tracing_enabled) - { + if (rti_remote->base.tracing_enabled) { tracepoint_rti_from_federate(rti_remote->base.trace, receive_PORT_ABS, sending_federate->enclave.id, &tag); } @@ -354,8 +340,7 @@ void handle_port_absent_message(federate_info_t *sending_federate, unsigned char // If the destination federate is no longer connected, issue a warning // and return. federate_info_t *fed = GET_FED_INFO(federate_id); - if (fed->enclave.state == NOT_CONNECTED) - { + if (fed->enclave.state == NOT_CONNECTED) { LF_MUTEX_UNLOCK(rti_mutex); lf_print_warning("RTI: Destination federate %d is no longer connected. Dropping message.", federate_id); @@ -380,8 +365,7 @@ void handle_port_absent_message(federate_info_t *sending_federate, unsigned char // Need to make sure that the destination federate's thread has already // sent the starting MSG_TYPE_TIMESTAMP message. - while (fed->enclave.state == PENDING) - { + while (fed->enclave.state == PENDING) { // Need to wait here. 
lf_cond_wait(&sent_start_time); } @@ -392,15 +376,14 @@ void handle_port_absent_message(federate_info_t *sending_federate, unsigned char LF_MUTEX_UNLOCK(rti_mutex); - if (rti_remote->base.tracing_enabled) - { + if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_PORT_ABS, federate_id, &tag); } } -void handle_timed_message(federate_info_t *sending_federate, unsigned char *buffer) -{ - size_t header_size = 1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t) + sizeof(int64_t) + sizeof(uint32_t); +void handle_timed_message(federate_info_t *sending_federate, unsigned char *buffer) { + size_t header_size = 1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t) + + sizeof(int64_t) + sizeof(uint32_t); // Read the header, minus the first byte which has already been read. read_from_socket_fail_on_error( &sending_federate->socket, header_size - 1, &(buffer[1]), NULL, @@ -416,16 +399,14 @@ void handle_timed_message(federate_info_t *sending_federate, unsigned char *buff size_t total_bytes_to_read = length + header_size; size_t bytes_to_read = length; - if (FED_COM_BUFFER_SIZE < header_size + 1) - { + if (FED_COM_BUFFER_SIZE < header_size + 1) { lf_print_error_and_exit("Buffer size (%d) is not large enough to " "read the header plus one byte.", FED_COM_BUFFER_SIZE); } // Cut up the payload in chunks. - if (bytes_to_read > FED_COM_BUFFER_SIZE - header_size) - { + if (bytes_to_read > FED_COM_BUFFER_SIZE - header_size) { bytes_to_read = FED_COM_BUFFER_SIZE - header_size; } @@ -440,8 +421,7 @@ void handle_timed_message(federate_info_t *sending_federate, unsigned char *buff // Following only works for string messages. 
// LF_PRINT_DEBUG("Message received by RTI: %s.", buffer + header_size); - if (rti_remote->base.tracing_enabled) - { + if (rti_remote->base.tracing_enabled) { tracepoint_rti_from_federate(rti_remote->base.trace, receive_TAGGED_MSG, sending_federate->enclave.id, &intended_tag); } @@ -453,8 +433,7 @@ void handle_timed_message(federate_info_t *sending_federate, unsigned char *buff // If the destination federate is no longer connected, issue a warning // and return. federate_info_t *fed = GET_FED_INFO(federate_id); - if (fed->enclave.state == NOT_CONNECTED) - { + if (fed->enclave.state == NOT_CONNECTED) { LF_MUTEX_UNLOCK(rti_mutex); lf_print_warning("RTI: Destination federate %d is no longer connected. Dropping message.", federate_id); @@ -480,8 +459,7 @@ void handle_timed_message(federate_info_t *sending_federate, unsigned char *buff length); // Record this in-transit message in federate's in-transit message queue. - if (lf_tag_compare(fed->enclave.completed, intended_tag) < 0) - { + if (lf_tag_compare(fed->enclave.completed, intended_tag) < 0) { // Add a record of this message to the list of in-transit messages to this federate. add_in_transit_message_record( fed->in_transit_message_tags, @@ -491,9 +469,7 @@ void handle_timed_message(federate_info_t *sending_federate, unsigned char *buff intended_tag.time - lf_time_start(), intended_tag.microstep, federate_id); - } - else - { + } else { lf_print_error( "RTI: Federate %d has already completed tag " PRINTF_TAG ", but there is an in-transit message with tag " PRINTF_TAG " from federate %hu. " @@ -509,29 +485,25 @@ void handle_timed_message(federate_info_t *sending_federate, unsigned char *buff // Need to make sure that the destination federate's thread has already // sent the starting MSG_TYPE_TIMESTAMP message. - while (fed->enclave.state == PENDING) - { + while (fed->enclave.state == PENDING) { // Need to wait here. 
lf_cond_wait(&sent_start_time); } - if (rti_remote->base.tracing_enabled) - { + if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_TAGGED_MSG, federate_id, &intended_tag); } write_to_socket_fail_on_error(&fed->socket, bytes_read, buffer, &rti_mutex, - "RTI failed to forward message to federate %d.", federate_id); + "RTI failed to forward message to federate %d.", federate_id); // The message length may be longer than the buffer, // in which case we have to handle it in chunks. size_t total_bytes_read = bytes_read; - while (total_bytes_read < total_bytes_to_read) - { + while (total_bytes_read < total_bytes_to_read) { LF_PRINT_DEBUG("Forwarding message in chunks."); bytes_to_read = total_bytes_to_read - total_bytes_read; - if (bytes_to_read > FED_COM_BUFFER_SIZE) - { + if (bytes_to_read > FED_COM_BUFFER_SIZE) { bytes_to_read = FED_COM_BUFFER_SIZE; } read_from_socket_fail_on_error(&sending_federate->socket, bytes_to_read, buffer, NULL, @@ -551,15 +523,13 @@ void handle_timed_message(federate_info_t *sending_federate, unsigned char *buff LF_MUTEX_UNLOCK(rti_mutex); } -void handle_logical_tag_complete(federate_info_t *fed) -{ +void handle_latest_tag_complete(federate_info_t *fed) { unsigned char buffer[sizeof(int64_t) + sizeof(uint32_t)]; read_from_socket_fail_on_error(&fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, NULL, "RTI failed to read the content of the logical tag complete from federate %d.", fed->enclave.id); tag_t completed = extract_tag(buffer); - if (rti_remote->base.tracing_enabled) - { + if (rti_remote->base.tracing_enabled) { tracepoint_rti_from_federate(rti_remote->base.trace, receive_LTC, fed->enclave.id, &completed); } _logical_tag_complete(&(fed->enclave), completed); @@ -571,8 +541,7 @@ void handle_logical_tag_complete(federate_info_t *fed) LF_MUTEX_UNLOCK(rti_mutex); } -void handle_next_event_tag(federate_info_t *fed) -{ +void handle_next_event_tag(federate_info_t *fed) { unsigned char 
buffer[sizeof(int64_t) + sizeof(uint32_t)]; read_from_socket_fail_on_error(&fed->socket, sizeof(int64_t) + sizeof(uint32_t), buffer, NULL, "RTI failed to read the content of the next event tag from federate %d.", @@ -580,19 +549,16 @@ void handle_next_event_tag(federate_info_t *fed) // Acquire a mutex lock to ensure that this state does not change while a // message is in transport or being used to determine a TAG. - LF_MUTEX_LOCK(rti_mutex); // FIXME: Instead of using a mutex, - // it might be more efficient to use a - // select() mechanism to read and process - // federates' buffers in an orderly fashion. + LF_MUTEX_LOCK(rti_mutex); // FIXME: Instead of using a mutex, it might be more efficient to use a + // select() mechanism to read and process federates' buffers in an orderly fashion. tag_t intended_tag = extract_tag(buffer); - if (rti_remote->base.tracing_enabled) - { + if (rti_remote->base.tracing_enabled) { tracepoint_rti_from_federate(rti_remote->base.trace, receive_NET, fed->enclave.id, &intended_tag); } LF_PRINT_LOG("RTI received from federate %d the Next Event Tag (NET) " PRINTF_TAG, - fed->enclave.id, intended_tag.time - start_time, - intended_tag.microstep); + fed->enclave.id, intended_tag.time - start_time, + intended_tag.microstep); update_federate_next_event_tag_locked( fed->enclave.id, intended_tag); @@ -687,6 +653,7 @@ static void* wait_for_stop_request_reply(void* args) { rti_remote->base.num_scheduling_nodes_handling_stop, MAX_TIME_FOR_REPLY_TO_STOP_REQUEST ); + return NULL; } void handle_stop_request_message(federate_info_t *fed) { @@ -867,16 +834,14 @@ void handle_address_ad(uint16_t federate_id) { } } -void handle_timestamp(federate_info_t *my_fed) -{ +void handle_timestamp(federate_info_t *my_fed) { unsigned char buffer[sizeof(int64_t)]; // Read bytes from the socket. We need 8 bytes. 
read_from_socket_fail_on_error(&my_fed->socket, sizeof(int64_t), (unsigned char *)&buffer, NULL, "ERROR reading timestamp from federate %d.\n", my_fed->enclave.id); int64_t timestamp = swap_bytes_if_big_endian_int64(*((int64_t *)(&buffer))); - if (rti_remote->base.tracing_enabled) - { + if (rti_remote->base.tracing_enabled) { tag_t tag = {.time = timestamp, .microstep = 0}; tracepoint_rti_from_federate(rti_remote->base.trace, receive_TIMESTAMP, my_fed->enclave.id, &tag); } @@ -884,21 +849,16 @@ void handle_timestamp(federate_info_t *my_fed) LF_MUTEX_LOCK(rti_mutex); rti_remote->num_feds_proposed_start++; - if (timestamp > rti_remote->max_start_time) - { + if (timestamp > rti_remote->max_start_time) { rti_remote->max_start_time = timestamp; } - if (rti_remote->num_feds_proposed_start == rti_remote->base.number_of_scheduling_nodes) - { + if (rti_remote->num_feds_proposed_start == rti_remote->base.number_of_scheduling_nodes) { // All federates have proposed a start time. lf_cond_broadcast(&received_start_times); - } - else - { + } else { // Some federates have not yet proposed a start time. // wait for a notification. - while (rti_remote->num_feds_proposed_start < rti_remote->base.number_of_scheduling_nodes) - { + while (rti_remote->num_feds_proposed_start < rti_remote->base.number_of_scheduling_nodes) { // FIXME: Should have a timeout here? 
lf_cond_wait(&received_start_times); } @@ -914,8 +874,7 @@ void handle_timestamp(federate_info_t *my_fed) start_time = rti_remote->max_start_time + DELAY_START; encode_int64(swap_bytes_if_big_endian_int64(start_time), &start_time_buffer[1]); - if (rti_remote->base.tracing_enabled) - { + if (rti_remote->base.tracing_enabled) { tag_t tag = {.time = start_time, .microstep = 0}; tracepoint_rti_to_federate(rti_remote->base.trace, send_TIMESTAMP, my_fed->enclave.id, &tag); } @@ -933,10 +892,8 @@ void handle_timestamp(federate_info_t *my_fed) LF_MUTEX_UNLOCK(rti_mutex); } -void send_physical_clock(unsigned char message_type, federate_info_t *fed, socket_type_t socket_type) -{ - if (fed->enclave.state == NOT_CONNECTED) - { +void send_physical_clock(unsigned char message_type, federate_info_t *fed, socket_type_t socket_type) { + if (fed->enclave.state == NOT_CONNECTED) { lf_print_warning("Clock sync: RTI failed to send physical time to federate %d. Socket not connected.\n", fed->enclave.id); return; @@ -947,36 +904,33 @@ void send_physical_clock(unsigned char message_type, federate_info_t *fed, socke encode_int64(current_physical_time, &(buffer[1])); // Send the message - if (socket_type == UDP) - { + if (socket_type == UDP) { // FIXME: UDP_addr is never initialized. 
LF_PRINT_DEBUG("Clock sync: RTI sending UDP message type %u.", buffer[0]); ssize_t bytes_written = sendto(rti_remote->socket_descriptor_UDP, buffer, 1 + sizeof(int64_t), 0, - (struct sockaddr *)&fed->UDP_addr, sizeof(fed->UDP_addr)); - if (bytes_written < (ssize_t)sizeof(int64_t) + 1) - { + (struct sockaddr *)&fed->UDP_addr, sizeof(fed->UDP_addr)); + if (bytes_written < (ssize_t)sizeof(int64_t) + 1) { lf_print_warning("Clock sync: RTI failed to send physical time to federate %d: %s\n", - fed->enclave.id, - strerror(errno)); + fed->enclave.id, + strerror(errno)); return; } } - else if (socket_type == TCP) - { + else if (socket_type == TCP) { LF_PRINT_DEBUG("Clock sync: RTI sending TCP message type %u.", buffer[0]); LF_MUTEX_LOCK(rti_mutex); write_to_socket_fail_on_error(&fed->socket, 1 + sizeof(int64_t), buffer, &rti_mutex, - "Clock sync: RTI failed to send physical time to federate %d.", - fed->enclave.id); + "Clock sync: RTI failed to send physical time to federate %d.", + fed->enclave.id); LF_MUTEX_UNLOCK(rti_mutex); } - LF_PRINT_DEBUG("Clock sync: RTI sent PHYSICAL_TIME_SYNC_MESSAGE with timestamp " PRINTF_TIME " to federate %d.", - current_physical_time, - fed->enclave.id); + LF_PRINT_DEBUG("Clock sync: RTI sent PHYSICAL_TIME_SYNC_MESSAGE with timestamp " PRINTF_TIME + " to federate %d.", + current_physical_time, + fed->enclave.id); } -void handle_physical_clock_sync_message(federate_info_t *my_fed, socket_type_t socket_type) -{ +void handle_physical_clock_sync_message(federate_info_t *my_fed, socket_type_t socket_type) { // Lock the mutex to prevent interference between sending the two // coded probe messages. LF_MUTEX_LOCK(rti_mutex); @@ -984,8 +938,7 @@ void handle_physical_clock_sync_message(federate_info_t *my_fed, socket_type_t s send_physical_clock(MSG_TYPE_CLOCK_SYNC_T4, my_fed, socket_type); // Send the corresponding coded probe immediately after, // but only if this is a UDP channel. 
- if (socket_type == UDP) - { + if (socket_type == UDP) { send_physical_clock(MSG_TYPE_CLOCK_SYNC_CODED_PROBE, my_fed, socket_type); } LF_MUTEX_UNLOCK(rti_mutex); @@ -1064,18 +1017,19 @@ void *clock_synchronization_thread(void *noargs) { // The message is not a T3 message. Discard the message and // continue waiting for the T3 message. This is possibly a message // from a previous cycle that was discarded. - lf_print_warning("Clock sync: Unexpected UDP message %u. Expected %u from federate %d. " - "Discarding message.", - buffer[0], - MSG_TYPE_CLOCK_SYNC_T3, - fed->enclave.id); + lf_print_warning( + "Clock sync: Unexpected UDP message %u. Expected %u from federate %d. " + "Discarding message.", + buffer[0], + MSG_TYPE_CLOCK_SYNC_T3, + fed->enclave.id); continue; } } else { lf_print_warning("Clock sync: Read from UDP socket failed: %s. " - "Skipping clock sync round for federate %d.", - strerror(errno), - fed->enclave.id); + "Skipping clock sync round for federate %d.", + strerror(errno), + fed->enclave.id); remaining_attempts = -1; } } @@ -1181,8 +1135,8 @@ void *federate_info_thread_TCP(void *fed) { case MSG_TYPE_NEXT_EVENT_TAG: handle_next_event_tag(my_fed); break; - case MSG_TYPE_LOGICAL_TAG_COMPLETE: - handle_logical_tag_complete(my_fed); + case MSG_TYPE_LATEST_TAG_COMPLETE: + handle_latest_tag_complete(my_fed); break; case MSG_TYPE_STOP_REQUEST: handle_stop_request_message(my_fed); // FIXME: Reviewed until here. 
@@ -1213,8 +1167,7 @@ void *federate_info_thread_TCP(void *fed) { return NULL; } -void send_reject(int *socket_id, unsigned char error_code) -{ +void send_reject(int *socket_id, unsigned char error_code) { LF_PRINT_DEBUG("RTI sending MSG_TYPE_REJECT."); unsigned char response[2]; response[0] = MSG_TYPE_REJECT; @@ -1553,11 +1506,10 @@ static bool authenticate_federate(int *socket) { unsigned char buffer[1 + fed_id_length + NONCE_LENGTH]; read_from_socket_fail_on_error(socket, 1 + fed_id_length + NONCE_LENGTH, buffer, NULL, "Failed to read MSG_TYPE_FED_NONCE"); - if (buffer[0] != MSG_TYPE_FED_NONCE) - { + if (buffer[0] != MSG_TYPE_FED_NONCE) { lf_print_error_and_exit( - "Received unexpected response %u from the FED (see net_common.h).", - buffer[0]); + "Received unexpected response %u from the FED (see net_common.h).", + buffer[0]); } unsigned int hmac_length = SHA256_HMAC_LENGTH; size_t federation_id_length = strnlen(rti_remote->federation_id, 255); @@ -1617,7 +1569,7 @@ static bool authenticate_federate(int *socket) { } #endif -void connect_to_federates(int socket_descriptor) { +void lf_connect_to_federates(int socket_descriptor) { for (int i = 0; i < rti_remote->base.number_of_scheduling_nodes; i++) { // Wait for an incoming connection request. struct sockaddr client_fd; @@ -1748,7 +1700,7 @@ int32_t start_rti_server(uint16_t port) { void wait_for_federates(int socket_descriptor) { // Wait for connections from federates and create a thread for each. - connect_to_federates(socket_descriptor); + lf_connect_to_federates(socket_descriptor); // All federates have connected. lf_print("RTI: All expected federates have connected. 
Starting execution."); @@ -1793,8 +1745,7 @@ void wait_for_federates(int socket_descriptor) { } } -void initialize_RTI(rti_remote_t *rti) -{ +void initialize_RTI(rti_remote_t *rti) { rti_remote = rti; // Initialize thread synchronization primitives @@ -1823,10 +1774,8 @@ void initialize_RTI(rti_remote_t *rti) rti_remote->stop_in_progress = false; } -void free_scheduling_nodes(scheduling_node_t **scheduling_nodes, uint16_t number_of_scheduling_nodes) -{ - for (uint16_t i = 0; i < number_of_scheduling_nodes; i++) - { +void free_scheduling_nodes(scheduling_node_t **scheduling_nodes, uint16_t number_of_scheduling_nodes) { + for (uint16_t i = 0; i < number_of_scheduling_nodes; i++) { // FIXME: Gives error freeing memory not allocated!!!! scheduling_node_t *node = scheduling_nodes[i]; if (node->upstream != NULL) diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index 0a6896bb0..2bb00ba93 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -226,14 +226,14 @@ void handle_port_absent_message(federate_info_t* sending_federate, unsigned char void handle_timed_message(federate_info_t* sending_federate, unsigned char* buffer); /** - * Handle a logical tag complete (LTC) message. @see - * MSG_TYPE_LOGICAL_TAG_COMPLETE in rti.h. + * Handle a latest tag complete (LTC) message. @see + * MSG_TYPE_LATEST_TAG_COMPLETE in rti.h. * * This function assumes the caller does not hold the mutex. * * @param fed The federate that has completed a logical tag. */ -void handle_logical_tag_complete(federate_info_t* fed); +void handle_latest_tag_complete(federate_info_t* fed); /** * Handle a next event tag (NET) message. @see MSG_TYPE_NEXT_EVENT_TAG in rti.h. @@ -274,7 +274,7 @@ void handle_stop_request_reply(federate_info_t* fed); * are initialized to -1. If no MSG_TYPE_ADDRESS_ADVERTISEMENT message has been received from * the destination federate, the RTI will simply reply with -1 for the port. 
* The sending federate is responsible for checking back with the RTI after a - * period of time. @see connect_to_federate() in federate.c. * + * period of time. * @param fed_id The federate sending a MSG_TYPE_ADDRESS_QUERY message. */ void handle_address_query(uint16_t fed_id); @@ -286,7 +286,7 @@ void handle_address_query(uint16_t fed_id); * field of the _RTI.federates[federate_id] array of structs. * * The server_hostname and server_ip_addr fields are assigned - * in connect_to_federates() upon accepting the socket + * in lf_connect_to_federates() upon accepting the socket * from the remote federate. * * This function assumes the caller does not hold the mutex. @@ -394,7 +394,7 @@ void send_reject(int* socket_id, unsigned char error_code); * that federate. Return when all federates have connected. * @param socket_descriptor The socket on which to accept connections. */ -void connect_to_federates(int socket_descriptor); +void lf_connect_to_federates(int socket_descriptor); /** * Thread to respond to new connections, which could be federates of other diff --git a/core/federated/clock-sync.c b/core/federated/clock-sync.c index 74e1ed5f1..f718ace4e 100644 --- a/core/federated/clock-sync.c +++ b/core/federated/clock-sync.c @@ -71,7 +71,7 @@ instant_t _lf_last_clock_sync_instant = 0LL; /** * The UDP socket descriptor for this federate to communicate with the RTI. - * This is set by setup_clock_synchronization_with_rti() in connect_to_rti() + * This is set by setup_clock_synchronization_with_rti() in lf_connect_to_rti() * in federate.c, which must be called before other * functions that communicate with the rti are called. */ diff --git a/core/federated/federate.c b/core/federated/federate.c index 1f7c577b9..de1b56ac3 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -1,67 +1,42 @@ /** * @file - * @author Edward A. Lee (eal@berkeley.edu) + * @author Soroush Bateni + * @author Peter Donovan + * @author Edward A. 
Lee + * @author Anirudh Rengarajsm * * @section LICENSE -Copyright (c) 2020, The University of California at Berkeley. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY -EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL -THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF -THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - + * See LICENSE.md in the root directory. * @section DESCRIPTION * Utility functions for a federate in a federated execution. - * The main entry point is synchronize_with_other_federates(). + * The main entry point is lf_synchronize_with_other_federates(). */ #ifdef FEDERATED -#ifdef PLATFORM_ARDUINO -#error To be implemented. No support for federation on Arduino yet. -#else +#if !defined(PLATFORM_Linux) && !defined(PLATFORM_Darwin) +#error No support for federated execution on this platform. +#endif + #include // inet_ntop & inet_pton #include // Defines getaddrinfo(), freeaddrinfo() and struct addrinfo. 
#include // Defines struct sockaddr_in - -#include -#include // Defines bzero(). #include -#endif +#include // Defines read(), write(), and close() #include #include // Defined perror(), errno -#include -#include -#include -#include -#include // Defines read(), write(), and close() +#include // Defines bzero(). #include "clock-sync.h" #include "federate.h" -#include "lf_types.h" #include "net_common.h" #include "net_util.h" -#include "platform.h" #include "reactor.h" #include "reactor_common.h" #include "reactor_threaded.h" #include "scheduler.h" #include "trace.h" + #ifdef FEDERATED_AUTHENTICATED #include // For secure random number generation. #include // For HMAC-based authentication of federates. @@ -74,23 +49,33 @@ extern instant_t start_time; // Global variable defined in reactor_common.c: extern bool _lf_termination_executed; -// Error messages. -char* ERROR_SENDING_HEADER = "ERROR sending header information to federate via RTI"; -char* ERROR_SENDING_MESSAGE = "ERROR sending message to federate via RTI"; - -// Mutex lock held while performing socket write and close operations. -lf_mutex_t outbound_socket_mutex; +// Global variables referenced in federate.h +lf_mutex_t lf_outbound_socket_mutex; +lf_cond_t lf_port_status_changed; +lf_cond_t lf_current_tag_changed; -// The following two mutexes are initialized in generated code and associated -// with the top-level environment's mutex. -lf_cond_t port_status_changed; -lf_cond_t logical_time_changed; - -// Variable to track how far in the reaction queue we can go until we need to wait for more network port statuses to be known. +/** + * The max level allowed to advance (MLAA) is a variable that tracks how far in the reaction + * queue we can go until we need to wait for more network port statuses to be known. 
+ * Specifically, when an input port status is unknown at a tag (we don't know whether the upstream + * federate has sent or will send a message at that tag), then the downstream federate must + * pause before executing any reaction that depends on that port. A "level" is assigned to that + * port by the code generator based on the overall topology of the federation. Reactions that + * depend on the port have higher levels, whereas those with no dependence on that port have + * lower levels. The MLAA is a level at which the federate must block until the MLAA is + * incremented. It will be incremented as port statuses become known, and when all are known, + * it will become INT_MAX and all reactions will be unblocked. In decentralized execution, the + * MLAA is incremented by a background thread that monitors the local physical clock and + * increments the MLAA when it is safe to assume that the port is absent, if it has not already + * been incremented by the arrival of a message. In centralized execution, the MLAA is used + * only for ports that are involved in a zero-delay cycle (ZDC), and it is incremented when + * either a message or an absent message arrives. + */ int max_level_allowed_to_advance; /** - * The state of this federate instance. + * The state of this federate instance. Each executable has exactly one federate instance, + * and the _fed global variable refers to that instance. 
*/ federate_instance_t _fed = { .socket_TCP_RTI = -1, @@ -110,7 +95,6 @@ federate_instance_t _fed = { .min_delay_from_physical_action_to_federate_output = NEVER }; - federation_metadata_t federation_metadata = { .federation_id = "Unidentified Federation", .rti_host = NULL, @@ -118,257 +102,26 @@ federation_metadata_t federation_metadata = { .rti_user = NULL }; -void create_server(int specified_port) { - assert(specified_port <= UINT16_MAX && specified_port >= 0); - uint16_t port = (uint16_t)specified_port; - LF_PRINT_LOG("Creating a socket server on port %d.", port); - // Create an IPv4 socket for TCP (not UDP) communication over IP (0). - int socket_descriptor = create_real_time_tcp_socket_errexit(); - - // Server file descriptor. - struct sockaddr_in server_fd; - // Zero out the server address structure. - bzero((char*)&server_fd, sizeof(server_fd)); - - server_fd.sin_family = AF_INET; // IPv4 - server_fd.sin_addr.s_addr = INADDR_ANY; // All interfaces, 0.0.0.0. - // Convert the port number from host byte order to network byte order. - server_fd.sin_port = htons(port); - - int result = bind( - socket_descriptor, - (struct sockaddr *) &server_fd, - sizeof(server_fd)); - int count = 0; - while (result < 0 && count++ < PORT_BIND_RETRY_LIMIT) { - lf_sleep(PORT_BIND_RETRY_INTERVAL); - result = bind( - socket_descriptor, - (struct sockaddr *) &server_fd, - sizeof(server_fd)); - } - if (result < 0) { - lf_print_error_and_exit("Failed to bind socket on port %d.", port); - } - - // Set the global server port. - if (specified_port == 0) { - // Need to retrieve the port number assigned by the OS. - struct sockaddr_in assigned; - socklen_t addr_len = sizeof(assigned); - if (getsockname(socket_descriptor, (struct sockaddr *) &assigned, &addr_len) < 0) { - lf_print_error_and_exit("Failed to retrieve assigned port number."); - } - _fed.server_port = ntohs(assigned.sin_port); - } else { - _fed.server_port = port; - } - - // Enable listening for socket connections. 
- // The second argument is the maximum number of queued socket requests, - // which according to the Mac man page is limited to 128. - listen(socket_descriptor, 128); - - LF_PRINT_LOG("Server for communicating with other federates started using port %d.", _fed.server_port); - - // Send the server port number to the RTI - // on an MSG_TYPE_ADDRESS_ADVERTISEMENT message (@see net_common.h). - unsigned char buffer[sizeof(int32_t) + 1]; - buffer[0] = MSG_TYPE_ADDRESS_ADVERTISEMENT; - encode_int32(_fed.server_port, &(buffer[1])); - - // No need for a mutex because we have the only handle on this socket. - write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, sizeof(int32_t) + 1, (unsigned char*)buffer, NULL, - "Failed to send address advertisement."); - - // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_ADR_AD, _lf_my_fed_id, NULL); - LF_PRINT_DEBUG("Sent port %d to the RTI.", _fed.server_port); - - // Set the global server socket - _fed.server_socket = socket_descriptor; -} - -int send_message(int message_type, - unsigned short port, - unsigned short federate, - const char* next_destination_str, - size_t length, - unsigned char* message) { - unsigned char header_buffer[1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t)]; - // First byte identifies this as a timed message. - if (message_type != MSG_TYPE_P2P_MESSAGE ) { - lf_print_error("send_message: Unsupported message type (%d).", message_type); - return -1; - } - header_buffer[0] = (unsigned char)message_type; - // Next two bytes identify the destination port. - // NOTE: Send messages little endian (network order), not big endian. - encode_uint16(port, &(header_buffer[1])); - - // Next two bytes identify the destination federate. - encode_uint16(federate, &(header_buffer[1 + sizeof(uint16_t)])); - - // The next four bytes are the message length. 
- encode_int32((int32_t)length, &(header_buffer[1 + sizeof(uint16_t) + sizeof(uint16_t)])); - - LF_PRINT_LOG("Sending untagged message to %s.", next_destination_str); - - // Header: message_type + port_id + federate_id + length of message + timestamp + microstep - const int header_length = 1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t); - - // Use a mutex lock to prevent multiple threads from simultaneously sending. - LF_MUTEX_LOCK(outbound_socket_mutex); - - int* socket = &_fed.sockets_for_outbound_p2p_connections[federate]; - - // Trace the event when tracing is enabled - tracepoint_federate_to_federate(_fed.trace, send_P2P_MSG, _lf_my_fed_id, federate, NULL); - - int result = write_to_socket_close_on_error(socket, header_length, header_buffer); - if (result == 0) { - // Header sent successfully. Send the body. - result = write_to_socket_close_on_error(socket, length, message); - } - if (result != 0) { - // Message did not send. Since this is used for physical connections, this is not critical. - lf_print_warning("Failed to send message to %s. Dropping the message.", next_destination_str); - } - LF_MUTEX_UNLOCK(outbound_socket_mutex); - return result; -} - -/** - * Close the socket that receives incoming messages from the - * specified federate ID. This function should be called when a read - * of incoming socket fails or when an EOF is received. - * It can also be called when the receiving end wants to stop communication, - * in which case, flag should be 1. - * - * @param fed_id The ID of the peer federate sending messages to this - * federate, or -1 if the RTI. - * @param flag 0 if an EOF was received, -1 if a socket error occurred, 1 otherwise. 
- */ -static void _lf_close_inbound_socket(int fed_id, int flag) { - LF_MUTEX_LOCK(socket_mutex); - if (_fed.sockets_for_inbound_p2p_connections[fed_id] >= 0) { - if (flag >= 0) { - if (flag > 0) { - shutdown(_fed.sockets_for_inbound_p2p_connections[fed_id], SHUT_RDWR); - } else { - // Have received EOF from the other end. Send EOF to the other end. - shutdown(_fed.sockets_for_inbound_p2p_connections[fed_id], SHUT_WR); - } - } - close(_fed.sockets_for_inbound_p2p_connections[fed_id]); - _fed.sockets_for_inbound_p2p_connections[fed_id] = -1; - } - LF_MUTEX_UNLOCK(socket_mutex); -} - -int send_tagged_message(environment_t* env, - interval_t additional_delay, - int message_type, - unsigned short port, - unsigned short federate, - const char* next_destination_str, - size_t length, - unsigned char* message) { - assert(env != GLOBAL_ENVIRONMENT); - - size_t header_length = 1 + sizeof(uint16_t) + sizeof(uint16_t) - + sizeof(int32_t) + sizeof(instant_t) + sizeof(microstep_t); - unsigned char header_buffer[header_length]; - - if (message_type != MSG_TYPE_TAGGED_MESSAGE && message_type != MSG_TYPE_P2P_TAGGED_MESSAGE) { - lf_print_error("send_message: Unsupported message type (%d).", message_type); - return -1; - } - - size_t buffer_head = 0; - // First byte is the message type. - header_buffer[buffer_head] = (unsigned char)message_type; - buffer_head += sizeof(unsigned char); - // Next two bytes identify the destination port. - // NOTE: Send messages little endian, not big endian. - encode_uint16(port, &(header_buffer[buffer_head])); - buffer_head += sizeof(uint16_t); - - // Next two bytes identify the destination federate. - encode_uint16(federate, &(header_buffer[buffer_head])); - buffer_head += sizeof(uint16_t); - - // The next four bytes are the message length. - encode_int32((int32_t)length, &(header_buffer[buffer_head])); - buffer_head += sizeof(int32_t); - - // Apply the additional delay to the current tag and use that as the intended - // tag of the outgoing message. 
- tag_t current_message_intended_tag = lf_delay_tag(env->current_tag, additional_delay); - - if (_lf_is_tag_after_stop_tag(env, current_message_intended_tag)) { - // Message tag is past the timeout time (the stop time) so it should not be sent. - LF_PRINT_LOG("Dropping message because it will be after the timeout time."); - return -1; - } - - // Next 8 + 4 will be the tag (timestamp, microstep) - encode_tag( - &(header_buffer[buffer_head]), - current_message_intended_tag - ); - - LF_PRINT_LOG("Sending message with tag " PRINTF_TAG " to %s.", - current_message_intended_tag.time - start_time, - current_message_intended_tag.microstep, - next_destination_str); - - // Use a mutex lock to prevent multiple threads from simultaneously sending. - LF_MUTEX_LOCK(outbound_socket_mutex); - - int* socket; - if (message_type == MSG_TYPE_P2P_TAGGED_MESSAGE) { - socket = &_fed.sockets_for_outbound_p2p_connections[federate]; - tracepoint_federate_to_federate(_fed.trace, send_P2P_TAGGED_MSG, _lf_my_fed_id, federate, &current_message_intended_tag); - } else { - socket = &_fed.socket_TCP_RTI; - tracepoint_federate_to_rti(_fed.trace, send_TAGGED_MSG, _lf_my_fed_id, &current_message_intended_tag); - } - - int result = write_to_socket_close_on_error(socket, header_length, header_buffer); - if (result == 0) { - // Header sent successfully. Send the body. - result = write_to_socket_close_on_error(socket, length, message); - } - if (result != 0) { - // Message did not send. Handling depends on message type. - if (message_type == MSG_TYPE_P2P_TAGGED_MESSAGE) { - lf_print_warning("Failed to send message to %s. Dropping the message.", next_destination_str); - } else { - lf_print_error_system_failure("Failed to send message to %s. 
Connection lost to the RTI.", - next_destination_str); - } - } - LF_MUTEX_UNLOCK(outbound_socket_mutex); - return result; -} +////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////// +// Static functions (used only internally) /** - * Send a time to the RTI. This acquires the outbound_socket_mutex. + * Send a time to the RTI. This acquires the lf_outbound_socket_mutex. * @param type The message type (MSG_TYPE_TIMESTAMP). * @param time The time. */ -void _lf_send_time(unsigned char type, instant_t time) { +static void send_time(unsigned char type, instant_t time) { LF_PRINT_DEBUG("Sending time " PRINTF_TIME " to the RTI.", time); size_t bytes_to_write = 1 + sizeof(instant_t); unsigned char buffer[bytes_to_write]; buffer[0] = type; encode_int64(time, &(buffer[1])); - LF_MUTEX_LOCK(outbound_socket_mutex); - write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_write, buffer, &outbound_socket_mutex, + LF_MUTEX_LOCK(lf_outbound_socket_mutex); + write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_write, buffer, &lf_outbound_socket_mutex, "Failed to send time " PRINTF_TIME " to the RTI.", time - start_time); - LF_MUTEX_UNLOCK(outbound_socket_mutex); + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); tag_t tag = {.time = time, .microstep = 0}; // Trace the event when tracing is enabled @@ -377,30 +130,30 @@ void _lf_send_time(unsigned char type, instant_t time) { /** * Send a tag to the RTI. - * This function acquires the outbound_socket_mutex. - * @param type The message type (MSG_TYPE_NEXT_EVENT_TAG or MSG_TYPE_LOGICAL_TAG_COMPLETE). + * This function acquires the lf_outbound_socket_mutex. + * @param type The message type (MSG_TYPE_NEXT_EVENT_TAG or MSG_TYPE_LATEST_TAG_COMPLETE). * @param tag The tag. 
*/ -void _lf_send_tag(unsigned char type, tag_t tag) { +static void send_tag(unsigned char type, tag_t tag) { LF_PRINT_DEBUG("Sending tag " PRINTF_TAG " to the RTI.", tag.time - start_time, tag.microstep); size_t bytes_to_write = 1 + sizeof(instant_t) + sizeof(microstep_t); unsigned char buffer[bytes_to_write]; buffer[0] = type; encode_tag(&(buffer[1]), tag); - LF_MUTEX_LOCK(outbound_socket_mutex); + LF_MUTEX_LOCK(lf_outbound_socket_mutex); if (_fed.socket_TCP_RTI < 0) { lf_print_warning("Socket is no longer connected. Dropping message."); - LF_MUTEX_UNLOCK(outbound_socket_mutex); + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); return; } trace_event_t event_type = (type == MSG_TYPE_NEXT_EVENT_TAG) ? send_NET : send_LTC; // Trace the event when tracing is enabled tracepoint_federate_to_rti(_fed.trace, event_type, _lf_my_fed_id, &tag); write_to_socket_fail_on_error( - &_fed.socket_TCP_RTI, bytes_to_write, buffer, &outbound_socket_mutex, + &_fed.socket_TCP_RTI, bytes_to_write, buffer, &lf_outbound_socket_mutex, "Failed to send tag " PRINTF_TAG " to the RTI.", tag.time - start_time, tag.microstep); - LF_MUTEX_UNLOCK(outbound_socket_mutex); + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); } /** @@ -414,2240 +167,2123 @@ static bool rti_resigned() { else return false; } -/** - * Thread to accept connections from other federates that send this federate - * messages directly (not through the RTI). This thread starts a thread for - * each accepted socket connection and, once it has opened all expected - * sockets, exits. - * @param env_arg pointer to the environment of this federate. - */ -void* handle_p2p_connections_from_federates(void* env_arg) { - assert(env_arg); - environment_t* env = (environment_t *) env_arg; - int received_federates = 0; - // Allocate memory to store thread IDs. 
- _fed.inbound_socket_listeners = (lf_thread_t*)calloc(_fed.number_of_inbound_p2p_connections, sizeof(lf_thread_t)); - while (received_federates < _fed.number_of_inbound_p2p_connections && !_lf_termination_executed) { - // Wait for an incoming connection request. - struct sockaddr client_fd; - uint32_t client_length = sizeof(client_fd); - int socket_id = accept(_fed.server_socket, &client_fd, &client_length); - - if (socket_id < 0) { - if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { - if (rti_resigned()) break; - else continue; // Try again. - } else if (errno == EPERM) { - lf_print_error_system_failure("Firewall permissions prohibit connection."); - } else { - lf_print_error_system_failure("A fatal error occurred while accepting a new socket."); - } - } - LF_PRINT_LOG("Accepted new connection from remote federate."); - - size_t header_length = 1 + sizeof(uint16_t) + 1; - unsigned char buffer[header_length]; - int read_failed = read_from_socket(socket_id, header_length, (unsigned char*)&buffer); - if (read_failed || buffer[0] != MSG_TYPE_P2P_SENDING_FED_ID) { - lf_print_warning("Federate received invalid first message on P2P socket. Closing socket."); - if (read_failed == 0) { - // Wrong message received. - unsigned char response[2]; - response[0] = MSG_TYPE_REJECT; - response[1] = WRONG_SERVER; - // Trace the event when tracing is enabled - tracepoint_federate_to_federate(_fed.trace, send_REJECT, _lf_my_fed_id, -3, NULL); - // Ignore errors on this response. - write_to_socket(socket_id, 2, response); - } - close(socket_id); - continue; - } - - // Get the federation ID and check it. 
- unsigned char federation_id_length = buffer[header_length - 1]; - char remote_federation_id[federation_id_length]; - read_failed = read_from_socket(socket_id, federation_id_length, (unsigned char*)remote_federation_id); - if (read_failed || (strncmp(federation_metadata.federation_id, remote_federation_id, strnlen(federation_metadata.federation_id, 255)) != 0)) { - lf_print_warning("Received invalid federation ID. Closing socket."); - if (read_failed == 0) { - unsigned char response[2]; - response[0] = MSG_TYPE_REJECT; - response[1] = FEDERATION_ID_DOES_NOT_MATCH; - // Trace the event when tracing is enabled - tracepoint_federate_to_federate(_fed.trace, send_REJECT, _lf_my_fed_id, -3, NULL); - // Ignore errors on this response. - write_to_socket(socket_id, 2, response); - } - close(socket_id); - continue; - } - - // Extract the ID of the sending federate. - uint16_t remote_fed_id = extract_uint16((unsigned char*)&(buffer[1])); - LF_PRINT_DEBUG("Received sending federate ID %d.", remote_fed_id); - - // Trace the event when tracing is enabled - tracepoint_federate_to_federate(_fed.trace, receive_FED_ID, _lf_my_fed_id, remote_fed_id, NULL); - - // Once we record the socket_id here, all future calls to close() on - // the socket should be done while holding the socket_mutex, and this array - // element should be reset to -1 during that critical section. - // Otherwise, there can be race condition where, during termination, - // two threads attempt to simultaneously close the socket. - _fed.sockets_for_inbound_p2p_connections[remote_fed_id] = socket_id; - - // Send an MSG_TYPE_ACK message. 
- unsigned char response = MSG_TYPE_ACK; - - LF_MUTEX_LOCK(outbound_socket_mutex); - write_to_socket_fail_on_error( - &_fed.sockets_for_inbound_p2p_connections[remote_fed_id], - 1, (unsigned char*)&response, - &outbound_socket_mutex, - "Failed to write MSG_TYPE_ACK in response to federate %d.", - remote_fed_id); - LF_MUTEX_UNLOCK(outbound_socket_mutex); - - // Trace the event when tracing is enabled - tracepoint_federate_to_federate(_fed.trace, send_ACK, _lf_my_fed_id, remote_fed_id, NULL); +//////////////////////////////// Port Status Handling /////////////////////////////////////// - // Start a thread to listen for incoming messages from other federates. - // The fed_id is a uint16_t, which we assume can be safely cast to and from void*. - void* fed_id_arg = (void*)(uintptr_t)remote_fed_id; - int result = lf_thread_create( - &_fed.inbound_socket_listeners[received_federates], - listen_to_federates, - fed_id_arg); - if (result != 0) { - // Failed to create a listening thread. - LF_MUTEX_LOCK(socket_mutex); - if (_fed.sockets_for_inbound_p2p_connections[remote_fed_id] != -1) { - close(socket_id); - _fed.sockets_for_inbound_p2p_connections[remote_fed_id] = -1; - } - LF_MUTEX_UNLOCK(socket_mutex); - lf_print_error_and_exit( - "Failed to create a thread to listen for incoming physical connection. 
Error code: %d.", - result - ); - } +extern lf_action_base_t* _lf_action_table[]; +extern interval_t _lf_action_delay_table[]; +extern size_t _lf_action_table_size; +extern lf_action_base_t* _lf_zero_delay_cycle_action_table[]; +extern size_t _lf_zero_delay_cycle_action_table_size; +extern reaction_t* network_input_reactions[]; +extern size_t num_network_input_reactions; +extern reaction_t* port_absent_reaction[]; +extern size_t num_port_absent_reactions; +#ifdef FEDERATED_DECENTRALIZED +extern staa_t* staa_lst[]; +extern size_t staa_lst_size; +#endif - received_federates++; +/** + * Return a pointer to the action struct for the action + * corresponding to the specified port ID. + * @param port_id The port ID. + * @return A pointer to an action struct or null if the ID is out of range. + */ +static lf_action_base_t* action_for_port(int port_id) { + if (port_id < _lf_action_table_size) { + return _lf_action_table[port_id]; } - - LF_PRINT_LOG("All %zu remote federates are connected.", _fed.number_of_inbound_p2p_connections); + lf_print_error("Invalid port ID: %d", port_id); return NULL; } /** - * Close the socket that sends outgoing messages to the - * specified federate ID. This function assumes the caller holds - * the outbound_socket_mutex mutex lock, at least during normal termination. - * @param fed_id The ID of the peer federate receiving messages from this - * federate, or -1 if the RTI (centralized coordination). - * @param flag 0 if the socket has received EOF, 1 if not, -1 if abnormal termination. + * Update the last known status tag of all network input ports + * to the value of `tag`, unless the provided `tag` is less + * than the last_known_status_tag of the port. This is called when + * a TAG signal is received from the RTI in centralized coordination. + * If any update occurs, then this broadcasts on `lf_port_status_changed`. + * + * This assumes the caller holds the mutex. 
+ * + * @param tag The tag on which the latest status of all network input + * ports is known. */ -static void _lf_close_outbound_socket(int fed_id, int flag) { - assert (fed_id >= 0 && fed_id < NUMBER_OF_FEDERATES); - LF_MUTEX_LOCK(outbound_socket_mutex); - if (_fed.sockets_for_outbound_p2p_connections[fed_id] >= 0) { - // Close the socket by sending a FIN packet indicating that no further writes - // are expected. Then read until we get an EOF indication. - if (flag >= 0) { - // SHUT_WR indicates no further outgoing messages. - shutdown(_fed.sockets_for_outbound_p2p_connections[fed_id], SHUT_WR); - if (flag > 0) { - // Have not received EOF yet. read until we get an EOF or error indication. - // This compensates for delayed ACKs and disabling of Nagles algorithm - // by delaying exiting until the shutdown is complete. - unsigned char message[32]; - while (read(_fed.sockets_for_outbound_p2p_connections[fed_id], &message, 32) > 0); - } +static void update_last_known_status_on_input_ports(tag_t tag) { + LF_PRINT_DEBUG("In update_last_known_status_on_input ports."); + bool notify = false; + for (int i = 0; i < _lf_action_table_size; i++) { + lf_action_base_t* input_port_action = action_for_port(i); + // This is called when a TAG is received. + // But it is possible for an input port to have received already + // a message with a larger tag (if there is an after delay on the + // connection), in which case, the last known status tag of the port + // is in the future and should not be rolled back. So in that case, + // we do not update the last known status tag. 
+ if (lf_tag_compare(tag, + input_port_action->trigger->last_known_status_tag) >= 0) { + LF_PRINT_DEBUG( + "Updating the last known status tag of port %d from " PRINTF_TAG " to " PRINTF_TAG ".", + i, + input_port_action->trigger->last_known_status_tag.time - lf_time_start(), + input_port_action->trigger->last_known_status_tag.microstep, + tag.time - lf_time_start(), + tag.microstep + ); + input_port_action->trigger->last_known_status_tag = tag; + notify = true; } - close(_fed.sockets_for_outbound_p2p_connections[fed_id]); - _fed.sockets_for_outbound_p2p_connections[fed_id] = -1; } - LF_MUTEX_UNLOCK(outbound_socket_mutex); + // FIXME: We could put a condition variable into the trigger_t + // struct for each network input port, in which case this won't + // be a broadcast but rather a targeted signal. + if (notify && lf_update_max_level(tag, false)) { + // Notify network input reactions + lf_cond_broadcast(&lf_port_status_changed); + } } /** - * Connect to the federate with the specified id. This established - * connection will then be used in functions such as send_tagged_message() - * to send messages directly to the specified federate. - * This function first sends an MSG_TYPE_ADDRESS_QUERY message to the RTI to obtain - * the IP address and port number of the specified federate. It then attempts - * to establish a socket connection to the specified federate. - * If this fails, the program exits. If it succeeds, it sets element [id] of - * the _fed.sockets_for_outbound_p2p_connections global array to - * refer to the socket for communicating directly with the federate. - * @param remote_federate_id The ID of the remote federate. + * @brief Update the last known status tag of a network input port. + * + * First, if the specified tag is less than the current_tag of the top-level + * environment, then ignore the specified tag and use the current_tag. This + * situation can arise if a message has arrived late (an STP violation has occurred). 
+ * + * If the specified tag is greater than the previous last_known_status_tag + * of the port, then update the last_known_status_tag to the new tag. + * + * If the tag is equal to the previous last_known_status_tag, then + * increment the microstep of the last_known_status_tag. This situation can + * occur if a sequence of late messages (STP violations) is occurring all at + * once during an execution of a logical tag. + * + * This function is called when a message or absent message arrives. For decentralized + * coordination, it is also called by the background thread update_ports_from_staa_offsets + * which uses physical time to determine when an input port can be assumed to be absent + * if a message has not been received. + * + * This function assumes the caller holds the mutex on the top-level environment, + * and, if the tag actually increases, it broadcasts on `lf_port_status_changed`. + * + * @param env The top-level environment, whose mutex is assumed to be held. + * @param tag The tag on which the latest status of the specified network input port is known. + * @param port_id The port ID. 
*/ -void connect_to_federate(uint16_t remote_federate_id) { - int result = -1; - int count_retries = 0; +static void update_last_known_status_on_input_port(environment_t* env, tag_t tag, int port_id) { + if (lf_tag_compare(tag, env->current_tag) < 0) tag = env->current_tag; + trigger_t* input_port_action = action_for_port(port_id)->trigger; + int comparison = lf_tag_compare(tag, input_port_action->last_known_status_tag); + if (comparison == 0) tag.microstep++; + if (comparison >= 0) { + LF_PRINT_LOG( + "Updating the last known status tag of port %d from " PRINTF_TAG " to " PRINTF_TAG ".", + port_id, + input_port_action->last_known_status_tag.time - lf_time_start(), + input_port_action->last_known_status_tag.microstep, + tag.time - lf_time_start(), + tag.microstep + ); + input_port_action->last_known_status_tag = tag; - // Ask the RTI for port number of the remote federate. - // The buffer is used for both sending and receiving replies. - // The size is what is needed for receiving replies. - unsigned char buffer[sizeof(int32_t) + INET_ADDRSTRLEN + 1]; - int port = -1; - struct in_addr host_ip_addr; - int count_tries = 0; - while (port == -1 && !_lf_termination_executed) { - buffer[0] = MSG_TYPE_ADDRESS_QUERY; - // NOTE: Sending messages in little endian. - encode_uint16(remote_federate_id, &(buffer[1])); + // Check whether this port update implies a change to MLAA, which may unblock reactions. + // For decentralized coordination, the first argument is NEVER, so it has no effect. + // For centralized, the arguments probably also have no effect, but the port update may. + // Note that it would not be correct to pass `tag` as the first argument because + // there is no guarantee that there is either a TAG or a PTAG for this time. + // The message that triggered this to be called could be from an upstream + // federate that is far ahead of other upstream federates in logical time. 
+ lf_update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); + lf_cond_broadcast(&lf_port_status_changed); + } else { + // Message arrivals should be monotonic, so this should not occur. + lf_print_warning("Attempt to update the last known status tag " + "of network input port %d to an earlier tag was ignored.", port_id); + } +} - LF_PRINT_DEBUG("Sending address query for federate %d.", remote_federate_id); - // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_ADR_QR, _lf_my_fed_id, NULL); +/** + * Set the status of network port with id portID. + * + * @param portID The network port ID + * @param status The network port status (port_status_t) + */ +static void set_network_port_status(int portID, port_status_t status) { + lf_action_base_t* network_input_port_action = action_for_port(portID); + network_input_port_action->trigger->status = status; +} - LF_MUTEX_LOCK(outbound_socket_mutex); - write_to_socket_fail_on_error( - &_fed.socket_TCP_RTI, sizeof(uint16_t) + 1, buffer, &outbound_socket_mutex, - "Failed to send address query for federate %d to RTI.", - remote_federate_id); - LF_MUTEX_UNLOCK(outbound_socket_mutex); - - // Read RTI's response. - read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, sizeof(int32_t) + 1, buffer, NULL, - "Failed to read the requested port number for federate %d from RTI.", - remote_federate_id); - - if (buffer[0] != MSG_TYPE_ADDRESS_QUERY) { - // Unexpected reply. Could be that RTI has failed and sent a resignation. 
- if (buffer[0] == MSG_TYPE_RESIGN) { - lf_print_error_and_exit("RTI has resigned."); - } else { - lf_print_error_and_exit("Unexpected reply of type %hhu from RTI (see net_common.h).", buffer[0]); - } - } - port = extract_int32(&buffer[1]); - - read_from_socket_fail_on_error( - &_fed.socket_TCP_RTI, sizeof(host_ip_addr), (unsigned char*)&host_ip_addr, NULL, - "Failed to read the IP address for federate %d from RTI.", - remote_federate_id); - - // A reply of -1 for the port means that the RTI does not know - // the port number of the remote federate, presumably because the - // remote federate has not yet sent an MSG_TYPE_ADDRESS_ADVERTISEMENT message to the RTI. - // Sleep for some time before retrying. - if (port == -1) { - if (count_tries++ >= CONNECT_MAX_RETRIES) { - lf_print_error_and_exit("TIMEOUT obtaining IP/port for federate %d from the RTI.", - remote_federate_id); - } - // Wait ADDRESS_QUERY_RETRY_INTERVAL nanoseconds. - lf_sleep(ADDRESS_QUERY_RETRY_INTERVAL); - } - } - assert(port < 65536); - assert(port > 0); - uint16_t uport = (uint16_t)port; - -#if LOG_LEVEL > 3 - // Print the received IP address in a human readable format - // Create the human readable format of the received address. - // This is avoided unless LOG_LEVEL is high enough to - // subdue the overhead caused by inet_ntop(). - char hostname[INET_ADDRSTRLEN]; - inet_ntop(AF_INET, &host_ip_addr, hostname, INET_ADDRSTRLEN); - LF_PRINT_LOG("Received address %s port %d for federate %d from RTI.", - hostname, uport, remote_federate_id); -#endif - - // Iterate until we either successfully connect or exceed the number of - // attempts given by CONNECT_MAX_RETRIES. - int socket_id = -1; - while (result < 0 && !_lf_termination_executed) { - // Create an IPv4 socket for TCP (not UDP) communication over IP (0). - socket_id = create_real_time_tcp_socket_errexit(); - - // Server file descriptor. - struct sockaddr_in server_fd; - // Zero out the server_fd struct. 
- bzero((char*)&server_fd, sizeof(server_fd)); - - // Set up the server_fd fields. - server_fd.sin_family = AF_INET; // IPv4 - server_fd.sin_addr = host_ip_addr; // Received from the RTI - - // Convert the port number from host byte order to network byte order. - server_fd.sin_port = htons(uport); - result = connect( - socket_id, - (struct sockaddr *)&server_fd, - sizeof(server_fd)); - - if (result != 0) { - lf_print_error("Failed to connect to federate %d on port %d.", remote_federate_id, uport); - - // Try again after some time if the connection failed. - // Note that this should not really happen since the remote federate should be - // accepting socket connections. But possibly it will be busy (in process of accepting - // another socket connection?). Hence, we retry. - count_retries++; - if (count_retries > CONNECT_MAX_RETRIES) { - // If the remote federate is not accepting the connection after CONNECT_MAX_RETRIES - // treat it as a soft error condition and return. - lf_print_error("Failed to connect to federate %d after %d retries. Giving up.", - remote_federate_id, CONNECT_MAX_RETRIES); - return; - } - lf_print_warning("Could not connect to federate %d. Will try again every %lld nanoseconds.\n", - remote_federate_id, ADDRESS_QUERY_RETRY_INTERVAL); - - // Check whether the RTI is still there. - if (rti_resigned()) break; - - // Wait ADDRESS_QUERY_RETRY_INTERVAL nanoseconds. - lf_sleep(ADDRESS_QUERY_RETRY_INTERVAL); - } else { - // Connect was successful. - size_t buffer_length = 1 + sizeof(uint16_t) + 1; - unsigned char buffer[buffer_length]; - buffer[0] = MSG_TYPE_P2P_SENDING_FED_ID; - if (_lf_my_fed_id > UINT16_MAX) { - // This error is very unlikely to occur. - lf_print_error_and_exit("Too many federates! 
More than %d.", UINT16_MAX); - } - encode_uint16((uint16_t)_lf_my_fed_id, (unsigned char*)&(buffer[1])); - unsigned char federation_id_length = (unsigned char)strnlen(federation_metadata.federation_id, 255); - buffer[sizeof(uint16_t) + 1] = federation_id_length; - // Trace the event when tracing is enabled - tracepoint_federate_to_federate(_fed.trace, send_FED_ID, _lf_my_fed_id, remote_federate_id, NULL); - - // No need for a mutex because we have the only handle on the socket. - write_to_socket_fail_on_error(&socket_id, - buffer_length, buffer, NULL, - "Failed to send fed_id to federate %d.", remote_federate_id); - write_to_socket_fail_on_error(&socket_id, - federation_id_length, (unsigned char*)federation_metadata.federation_id, NULL, - "Failed to send federation id to federate %d.", - remote_federate_id); - - read_from_socket_fail_on_error(&socket_id, 1, (unsigned char*)buffer, NULL, - "Failed to read MSG_TYPE_ACK from federate %d in response to sending fed_id.", - remote_federate_id); - if (buffer[0] != MSG_TYPE_ACK) { - // Get the error code. - read_from_socket_fail_on_error(&socket_id, 1, (unsigned char*)buffer, NULL, - "Failed to read error code from federate %d in response to sending fed_id.", remote_federate_id); - lf_print_error("Received MSG_TYPE_REJECT message from remote federate (%d).", buffer[0]); - result = -1; - continue; - } else { - lf_print("Connected to federate %d, port %d.", remote_federate_id, port); - // Trace the event when tracing is enabled - tracepoint_federate_to_federate(_fed.trace, receive_ACK, _lf_my_fed_id, remote_federate_id, NULL); - } - } - } - // Once we set this variable, then all future calls to close() on this - // socket ID should reset it to -1 within a critical section. - _fed.sockets_for_outbound_p2p_connections[remote_federate_id] = socket_id; -} - -#ifdef FEDERATED_AUTHENTICATED /** - * Perform HMAC-based authentication with the RTI, using the federation ID - * as an HMAC key. - * @return 0 for success, -1 for failure. 
- */ -static int perform_hmac_authentication() { - - // Send buffer including message type, federate ID, federate's nonce. - size_t fed_id_length = sizeof(uint16_t); - size_t message_length = 1 + fed_id_length + NONCE_LENGTH; - unsigned char fed_hello_buf[message_length]; - fed_hello_buf[0] = MSG_TYPE_FED_NONCE; - encode_uint16((uint16_t)_lf_my_fed_id, &fed_hello_buf[1]); - unsigned char fed_nonce[NONCE_LENGTH]; - RAND_bytes(fed_nonce, NONCE_LENGTH); - memcpy(&fed_hello_buf[1 + fed_id_length], fed_nonce, NONCE_LENGTH); - - write_to_socket_fail_on_error( - &_fed.socket_TCP_RTI, message_length, fed_hello_buf, NULL, - "Failed to write nonce."); - - // Check HMAC of received FED_RESPONSE message. - unsigned int hmac_length = SHA256_HMAC_LENGTH; - size_t federation_id_length = strnlen(federation_metadata.federation_id, 255); - - unsigned char received[1 + NONCE_LENGTH + hmac_length]; - read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, 1 + NONCE_LENGTH + hmac_length, received, NULL, - "Failed to read RTI response."); - if (received[0] != MSG_TYPE_RTI_RESPONSE) { - if (received[0] == MSG_TYPE_RESIGN) { - lf_print_error("RTI has resigned."); - return -1; - } else { - lf_print_error( - "Received unexpected response %u from the RTI (see net_common.h).", - received[0]); - return -1; - } - } - // Create tag to compare to received tag. - unsigned char buf_to_check[1 + fed_id_length + NONCE_LENGTH]; - buf_to_check[0] = MSG_TYPE_RTI_RESPONSE; - encode_uint16((uint16_t)_lf_my_fed_id, &buf_to_check[1]); - memcpy(&buf_to_check[1 + fed_id_length], fed_nonce, NONCE_LENGTH); - unsigned char fed_tag[hmac_length]; - HMAC(EVP_sha256(), federation_metadata.federation_id, federation_id_length, buf_to_check, 1 + fed_id_length + NONCE_LENGTH, - fed_tag, &hmac_length); - - // Compare received tag and created tag. - if (memcmp(&received[1 + NONCE_LENGTH], fed_tag, hmac_length) != 0) { - // HMAC does not match. Send back a MSG_TYPE_REJECT message. 
- lf_print_error("HMAC authentication failed."); - unsigned char response[2]; - response[0] = MSG_TYPE_REJECT; - response[1] = HMAC_DOES_NOT_MATCH; - - // Ignore errors on writing back. - write_to_socket(_fed.socket_TCP_RTI, 2, response); - return -1; - } else { - LF_PRINT_LOG("HMAC verified."); - // HMAC tag is created with MSG_TYPE_FED_RESPONSE and received federate nonce. - unsigned char mac_buf[1 + NONCE_LENGTH]; - mac_buf[0] = MSG_TYPE_FED_RESPONSE; - memcpy(&mac_buf[1], &received[1], NONCE_LENGTH); - // Buffer for message type and HMAC tag. - unsigned char sender[1 + hmac_length]; - sender[0] = MSG_TYPE_FED_RESPONSE; - HMAC(EVP_sha256(), federation_metadata.federation_id, federation_id_length, mac_buf, 1 + NONCE_LENGTH, - &sender[1], &hmac_length); - - write_to_socket_fail_on_error( - &_fed.socket_TCP_RTI, 1 + hmac_length, sender, NULL, - "Failed to write fed response."); - } - return 0; -} -#endif - -static void close_rti_socket() { - shutdown(_fed.socket_TCP_RTI, SHUT_RDWR); - close(_fed.socket_TCP_RTI); - _fed.socket_TCP_RTI = -1; -} - -/** - * Return in the result a struct with the address info for the specified hostname and port. - * The memory for the result is dynamically allocated and must be freed using freeaddrinfo. - * @param hostname The host name. - * @param port The port number. - * @param result The struct into which to write. - */ -static void rti_address(const char* hostname, uint16_t port, struct addrinfo** result) { - struct addrinfo hints; - - memset(&hints, 0, sizeof(hints)); - hints.ai_family = AF_INET; /* Allow IPv4 */ - hints.ai_socktype = SOCK_STREAM; /* Stream socket */ - hints.ai_protocol = IPPROTO_TCP; /* TCP protocol */ - hints.ai_addr = NULL; - hints.ai_next = NULL; - hints.ai_flags = AI_NUMERICSERV; /* Allow only numeric port numbers */ - - // Convert port number to string. 
- char str[6]; - sprintf(str, "%u", port); - - // Get address structure matching hostname and hints criteria, and - // set port to the port number provided in str. There should only - // ever be one matching address structure, and we connect to that. - if (getaddrinfo(hostname, (const char*)&str, &hints, result)) { - lf_print_error_and_exit("No host for RTI matching given hostname: %s", hostname); - } -} - -void connect_to_rti(const char* hostname, int port) { - LF_PRINT_LOG("Connecting to the RTI."); - - // Override passed hostname and port if passed as runtime arguments. - hostname = federation_metadata.rti_host ? federation_metadata.rti_host : hostname; - port = federation_metadata.rti_port >= 0 ? federation_metadata.rti_port : port; - - // Adjust the port. - uint16_t uport = 0; - if (port < 0 || port > INT16_MAX) { - lf_print_error( - "connect_to_rti(): Specified port (%d) is out of range," - " using the default port %d instead.", - port, DEFAULT_PORT - ); - uport = DEFAULT_PORT; - port = 0; // Mark so that increments occur between tries. - } else { - uport = (uint16_t)port; - } - if (uport == 0) { - uport = DEFAULT_PORT; - } - - // Create a socket - _fed.socket_TCP_RTI = create_real_time_tcp_socket_errexit(); - - int result = -1; - int count_retries = 0; - struct addrinfo* res = NULL; - - while (count_retries++ < CONNECT_MAX_RETRIES && !_lf_termination_executed) { - if (res != NULL) { - // This is a repeated attempt. - if (_fed.socket_TCP_RTI >= 0) close_rti_socket(); - - lf_sleep(CONNECT_RETRY_INTERVAL); - - // Create a new socket. - _fed.socket_TCP_RTI = create_real_time_tcp_socket_errexit(); - - if (port == 0) { - // Free previously allocated address info. - freeaddrinfo(res); - // Increment the port number. 
- uport++; - if (uport >= DEFAULT_PORT + MAX_NUM_PORT_ADDRESSES) uport = DEFAULT_PORT; + * Version of schedule_value() similar to that in reactor_common.c + * except that it does not acquire the mutex lock and has a special + * behavior during startup where it can inject reactions to the reaction + * queue if execution has not started yet. + * It is also responsible for setting the intended tag of the + * network message based on the calculated delay. + * This function assumes that the caller holds the mutex lock. + * + * This is used for handling incoming timed messages to a federate. + * + * @param env The environment of the federate + * @param action The action or timer to be triggered. + * @param tag The tag of the message received over the network. + * @param value Dynamically allocated memory containing the value to send. + * @param length The length of the array, if it is an array, or 1 for a + * scalar and 0 for no payload. + * @return A handle to the event, or 0 if no event was scheduled, or -1 for error. + */ +static trigger_handle_t schedule_message_received_from_network_locked( + environment_t* env, + trigger_t* trigger, + tag_t tag, + lf_token_t* token) { + assert(env != GLOBAL_ENVIRONMENT); - // Reconstruct the address info. - rti_address(hostname, uport, &res); - } - lf_print("Trying RTI again on port %d (attempt %d).", uport, count_retries); - } else { - // This is the first attempt. - rti_address(hostname, uport, &res); - } + // Return value of the function + trigger_handle_t return_value = 0; - result = connect(_fed.socket_TCP_RTI, res->ai_addr, res->ai_addrlen); - if (result < 0) continue; // Connect failed. + // Indicates whether or not the intended tag + // of the message (timestamp, microstep) is + // in the future relative to the tag of this + // federate. By default, assume it is not. + bool message_tag_is_in_the_future = lf_tag_compare(tag, env->current_tag) > 0; + // Assign the intended tag temporarily to restore later. 
+ tag_t previous_intended_tag = trigger->intended_tag; + trigger->intended_tag = tag; - // Have connected to an RTI, but not sure it's the right RTI. - // Send a MSG_TYPE_FED_IDS message and wait for a reply. - // Notify the RTI of the ID of this federate and its federation. + // Calculate the extra_delay required to be passed + // to the schedule function. + interval_t extra_delay = tag.time - env->current_tag.time; + if (!message_tag_is_in_the_future && env->execution_started) { +#ifdef FEDERATED_CENTRALIZED + // If the coordination is centralized, receiving a message + // that does not carry a timestamp that is in the future + // would indicate a critical condition, showing that the + // time advance mechanism is not working correctly. + LF_MUTEX_UNLOCK(env->mutex); + lf_print_error_and_exit( + "Received a message at tag " PRINTF_TAG " that has a tag " PRINTF_TAG + " that has violated the STP offset. " + "Centralized coordination should not have these types of messages.", + env->current_tag.time - start_time, env->current_tag.microstep, + tag.time - start_time, tag.microstep); +#else + // Set the delay back to 0 + extra_delay = 0LL; + LF_PRINT_LOG("Calling schedule with 0 delay and intended tag " PRINTF_TAG ".", + trigger->intended_tag.time - start_time, + trigger->intended_tag.microstep); + return_value = _lf_schedule(env, trigger, extra_delay, token); +#endif + } else { + // In case the message is in the future, call + // _lf_schedule_at_tag() so that the microstep is respected. + LF_PRINT_LOG("Received a message that is (" PRINTF_TIME " nanoseconds, " PRINTF_MICROSTEP " microsteps) " + "in the future.", extra_delay, tag.microstep - env->current_tag.microstep); + return_value = _lf_schedule_at_tag(env, trigger, tag, token); + } + trigger->intended_tag = previous_intended_tag; + // Notify the main thread in case it is waiting for physical time to elapse. 
+ LF_PRINT_DEBUG("Broadcasting notification that event queue changed."); + lf_cond_broadcast(&env->event_q_changed); + return return_value; +} -#ifdef FEDERATED_AUTHENTICATED - LF_PRINT_LOG("Connected to an RTI. Performing HMAC-based authentication using federation ID."); - if (perform_hmac_authentication()) { - if (port == 0) { - continue; // Try again with a new port. +/** + * Close the socket that receives incoming messages from the + * specified federate ID. This function should be called when a read + * of incoming socket fails or when an EOF is received. + * It can also be called when the receiving end wants to stop communication, + * in which case, flag should be 1. + * + * @param fed_id The ID of the peer federate sending messages to this + * federate, or -1 if the RTI. + * @param flag 0 if an EOF was received, -1 if a socket error occurred, 1 otherwise. + */ +static void close_inbound_socket(int fed_id, int flag) { + LF_MUTEX_LOCK(socket_mutex); + if (_fed.sockets_for_inbound_p2p_connections[fed_id] >= 0) { + if (flag >= 0) { + if (flag > 0) { + shutdown(_fed.sockets_for_inbound_p2p_connections[fed_id], SHUT_RDWR); } else { - // No point in trying again because it will be the same port. - close_rti_socket(); - lf_print_error_and_exit("Authentication failed."); + // Have received EOF from the other end. Send EOF to the other end. + shutdown(_fed.sockets_for_inbound_p2p_connections[fed_id], SHUT_WR); } } -#else - LF_PRINT_LOG("Connected to an RTI. Sending federation ID for authentication."); -#endif + close(_fed.sockets_for_inbound_p2p_connections[fed_id]); + _fed.sockets_for_inbound_p2p_connections[fed_id] = -1; + } + LF_MUTEX_UNLOCK(socket_mutex); +} - // Send the message type first. - unsigned char buffer[4]; - buffer[0] = MSG_TYPE_FED_IDS; - // Next send the federate ID. - if (_lf_my_fed_id > UINT16_MAX) { - lf_print_error_and_exit("Too many federates! 
More than %d.", UINT16_MAX); - } - encode_uint16((uint16_t)_lf_my_fed_id, &buffer[1]); - // Next send the federation ID length. - // The federation ID is limited to 255 bytes. - size_t federation_id_length = strnlen(federation_metadata.federation_id, 255); - buffer[1 + sizeof(uint16_t)] = (unsigned char)(federation_id_length & 0xff); +/** + * Return true if reactions need to be inserted directly into the reaction queue and + * false if a call to schedule is needed (the normal case). This function handles zero-delay + * cycles, where processing at a tag must be able to begin before all messages have arrived + * at that tag. This returns true if the following conditions are all true: + * + * 1. the first reaction triggered has a level >= MLAA (a port is or will be blocked on this trigger); + * 2. the intended_tag is less than or equal to the current tag of the environment; + * 3. the intended_tag is greater than the last_tag of the trigger; + * 4. the intended_tag is greater than the last_known_status_tag of the trigger; + * 5. the execution has started (the event queue has been examined); + * 6. the trigger is not physical; + * + * The comparison against the MLAA (condition 1), if true, means that there is a blocking port + * waiting for this trigger (or possibly an earlier blocking port). For condition (2), if the + * intended tag is less than the current tag, then the message is tardy. A tardy message can + * unblock a port, although it will trigger an STP violation handler if one is defined or an + * error if not (or if centralized coordination is being used). The comparison against the + * last_tag of the trigger (condition 3) ensures that if the message is tardy but there is + * already an earlier tardy message that has been handled (or is being handled), then we + * don't try to handle two messages in the same tag, which is not allowed. 
For example, there + * could be a case where current tag is 10 with a port absent reaction waiting, and a message + * has arrived with intended_tag 8. This message will eventually cause the port absent reaction + * to exit, but before that, a message with intended_tag of 9 could arrive before the port absent + * reaction has had a chance to exit. The port status is on the other hand changed in this thread, + * and thus, can be checked in this scenario without this race condition. The message with + * intended_tag of 9 in this case needs to wait one microstep to be processed. The check with + * last_known_status_tag (condition 4) deals with messages arriving with identical intended + * tags (which should not happen). This one will be handled late (one microstep later than + * the current tag if 1 and 2 are true). + * + * This function assumes the mutex is held on the environment. + * + * @param env The environment. + * @param trigger The trigger. + * @param intended_tag The intended tag. + */ +static bool handle_message_now(environment_t* env, trigger_t* trigger, tag_t intended_tag) { + return trigger->reactions[0]->index >= max_level_allowed_to_advance + && lf_tag_compare(intended_tag, lf_tag(env)) <= 0 + && lf_tag_compare(intended_tag, trigger->last_tag) > 0 + && lf_tag_compare(intended_tag, trigger->last_known_status_tag) > 0 + && env->execution_started + && !trigger->is_physical; +} - // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_FED_ID, _lf_my_fed_id, NULL); +/** + * Handle a message being received from a remote federate. + * + * This function assumes the caller does not hold the mutex lock. + * @param socket Pointer to the socket to read the message from. + * @param buffer The buffer to read. + * @param fed_id The sending federate ID or -1 if the centralized coordination. + */ +static void handle_message(int* socket, int fed_id) { + // Read the header. 
+ size_t bytes_to_read = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t); + unsigned char buffer[bytes_to_read]; + if (read_from_socket_close_on_error(socket, bytes_to_read, buffer)) { + // Read failed, which means the socket has been closed between reading the + // message ID byte and here. Issue a warning only. This is a physical + // connection, so likely the message is just late. If it's a serious failure, + // it should be caught in another thread. + lf_print_warning("Failed to read message header."); + return; + } - // No need for a mutex here because no other threads are writing to this socket. - if (write_to_socket(_fed.socket_TCP_RTI, 2 + sizeof(uint16_t), buffer)) { - continue; // Try again, possibly on a new port. - } + // Extract the header information. + unsigned short port_id; + unsigned short federate_id; + size_t length; + extract_header(buffer, &port_id, &federate_id, &length); + // Check if the message is intended for this federate + assert(_lf_my_fed_id == federate_id); + LF_PRINT_DEBUG("Receiving message to port %d of length %zu.", port_id, length); - // Next send the federation ID itself. - if (write_to_socket( - _fed.socket_TCP_RTI, - federation_id_length, - (unsigned char*)federation_metadata.federation_id)) { - continue; // Try again. - } + // Get the triggering action for the corresponding port + lf_action_base_t* action = action_for_port(port_id); - // Wait for a response. - // The response will be MSG_TYPE_REJECT if the federation ID doesn't match. - // Otherwise, it will be either MSG_TYPE_ACK or MSG_TYPE_UDP_PORT, where the latter - // is used if clock synchronization will be performed. - unsigned char response; + // Read the payload. + // Allocate memory for the message contents. 
+ unsigned char* message_contents = (unsigned char*)malloc(length); + if (read_from_socket_close_on_error(socket, length, message_contents)) { + lf_print_warning("Failed to read message body."); + } + // Trace the event when tracing is enabled + tracepoint_federate_from_federate(_fed.trace, receive_P2P_MSG, _lf_my_fed_id, federate_id, NULL); + LF_PRINT_LOG("Message received by federate: %s. Length: %zu.", message_contents, length); - LF_PRINT_DEBUG("Waiting for response to federation ID from the RTI."); + LF_PRINT_DEBUG("Calling schedule for message received on a physical connection."); + _lf_schedule_value(action, 0, message_contents, length); +} - if (read_from_socket(_fed.socket_TCP_RTI, 1, &response)) { - continue; // Try again. - } - if (response == MSG_TYPE_REJECT) { - // Trace the event when tracing is enabled - tracepoint_federate_from_rti(_fed.trace, receive_REJECT, _lf_my_fed_id, NULL); - // Read one more byte to determine the cause of rejection. - unsigned char cause; - read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, 1, &cause, NULL, - "Failed to read the cause of rejection by the RTI."); - if (cause == FEDERATION_ID_DOES_NOT_MATCH || cause == WRONG_SERVER) { - lf_print_warning("Connected to the wrong RTI on port %d. Will try again", uport); - continue; - } - } else if (response == MSG_TYPE_ACK) { - // Trace the event when tracing is enabled - tracepoint_federate_from_rti(_fed.trace, receive_ACK, _lf_my_fed_id, NULL); - LF_PRINT_LOG("Received acknowledgment from the RTI."); - break; - } else if (response == MSG_TYPE_RESIGN) { - lf_print_warning("RTI on port %d resigned. Will try again", uport); - continue; - } else { - lf_print_warning("RTI on port %d gave unexpect response %u. 
Will try again", uport, response); - continue; - } - } - if (result < 0) { - lf_print_error_and_exit("Failed to connect to RTI after %d tries.", CONNECT_MAX_RETRIES); +/** + * Handle a tagged message being received from a remote federate via the RTI + * or directly from other federates. + * This will read the tag encoded in the header + * and calculate an offset to pass to the schedule function. + * This function assumes the caller does not hold the mutex lock. + * Instead of holding the mutex lock, this function calls + * _lf_increment_tag_barrier with the tag carried in + * the message header as an argument. This ensures that the current tag + * will not advance to the tag of the message if it is in the future, or + * the tag will not advance at all if the tag of the message is + * now or in the past. + * @param socket Pointer to the socket to read the message from. + * @param buffer The buffer to read. + * @param fed_id The sending federate ID or -1 if the centralized coordination. + */ +static void handle_tagged_message(int* socket, int fed_id) { + // Environment is always the one corresponding to the top-level scheduling enclave. + environment_t *env; + _lf_get_environments(&env); + + // FIXME: Need better error handling? + // Read the header which contains the timestamp. + size_t bytes_to_read = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t) + + sizeof(instant_t) + sizeof(microstep_t); + unsigned char buffer[bytes_to_read]; + read_from_socket_fail_on_error(socket, bytes_to_read, buffer, NULL, + "Failed to read timed message header"); + + // Extract the header information. 
+ unsigned short port_id; + unsigned short federate_id; + size_t length; + tag_t intended_tag; + extract_timed_header(buffer, &port_id, &federate_id, &length, &intended_tag); + // Trace the event when tracing is enabled + if (fed_id == -1) { + tracepoint_federate_from_rti(_fed.trace, receive_TAGGED_MSG, _lf_my_fed_id, &intended_tag); + } else { + tracepoint_federate_from_federate(_fed.trace, receive_P2P_TAGGED_MSG, _lf_my_fed_id, fed_id, &intended_tag); } + // Check if the message is intended for this federate + assert(_lf_my_fed_id == federate_id); + LF_PRINT_DEBUG("Receiving message to port %d of length %zu.", port_id, length); - freeaddrinfo(res); /* No longer needed */ + // Get the triggering action for the corresponding port + lf_action_base_t* action = action_for_port(port_id); - // Call a generated (external) function that sends information - // about connections between this federate and other federates - // where messages are routed through the RTI. - // @see MSG_TYPE_NEIGHBOR_STRUCTURE in net_common.h - send_neighbor_structure_to_RTI(_fed.socket_TCP_RTI); + // Record the physical time of arrival of the message + instant_t time_of_arrival = lf_time_physical(); - uint16_t udp_port = setup_clock_synchronization_with_rti(); + if (action->trigger->is_physical) { + // Messages sent on physical connections should be handled via handle_message(). 
+ lf_print_error_and_exit("Received a tagged message on a physical connection."); + } - // Write the returned port number to the RTI - unsigned char UDP_port_number[1 + sizeof(uint16_t)]; - UDP_port_number[0] = MSG_TYPE_UDP_PORT; - encode_uint16(udp_port, &(UDP_port_number[1])); - write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, 1 + sizeof(uint16_t), UDP_port_number, NULL, - "Failed to send the UDP port number to the RTI."); +#ifdef FEDERATED_DECENTRALIZED + // Only applicable for federated programs with decentralized coordination: + // For logical connections in decentralized coordination, + // increment the barrier to prevent advancement of tag beyond + // the received tag if possible. The following function call + // suggests that the tag barrier be raised to the tag provided + // by the message. If this tag is in the past, the function will cause + // the tag to freeze at the current level. + // If something happens, make sure to release the barrier. + _lf_increment_tag_barrier(env, intended_tag); +#endif + LF_PRINT_LOG("Received message on port %d with tag: " PRINTF_TAG ", Current tag: " PRINTF_TAG ".", + port_id, intended_tag.time - start_time, intended_tag.microstep, + lf_time_logical_elapsed(env), env->current_tag.microstep); - lf_print("Connected to RTI at %s:%d.", hostname, uport); -} + // Read the payload. + // Allocate memory for the message contents. + unsigned char* message_contents = (unsigned char*)malloc(length); + read_from_socket_fail_on_error(socket, length, message_contents, NULL, + "Failed to read message body."); -/** - * Send the specified timestamp to the RTI and wait for a response. - * The specified timestamp should be current physical time of the - * federate, and the response will be the designated start time for - * the federate. This procedure blocks until the response is - * received from the RTI. - * @param my_physical_time The physical time at this federate. - * @return The designated start time for the federate. 
- */ -instant_t get_start_time_from_rti(instant_t my_physical_time) { - // Send the timestamp marker first. - _lf_send_time(MSG_TYPE_TIMESTAMP, my_physical_time); + // The following is only valid for string messages. + // LF_PRINT_DEBUG("Message received: %s.", message_contents); - // Read bytes from the socket. We need 9 bytes. - // Buffer for message ID plus timestamp. - size_t buffer_length = 1 + sizeof(instant_t); - unsigned char buffer[buffer_length]; + LF_MUTEX_LOCK(env->mutex); - read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, buffer_length, buffer, NULL, - "Failed to read MSG_TYPE_TIMESTAMP message from RTI."); - LF_PRINT_DEBUG("Read 9 bytes."); + action->trigger->physical_time_of_arrival = time_of_arrival; - // First byte received is the message ID. - if (buffer[0] != MSG_TYPE_TIMESTAMP) { - if (buffer[0] == MSG_TYPE_RESIGN) { - lf_print_error_and_exit("RTI has unexpectedly resigned."); - } - lf_print_error_and_exit( - "Expected a MSG_TYPE_TIMESTAMP message from the RTI. Got %u (see net_common.h).", - buffer[0]); - } + // Create a token for the message + lf_token_t* message_token = _lf_new_token((token_type_t*)action, message_contents, length); - instant_t timestamp = extract_int64(&(buffer[1])); + if (handle_message_now(env, action->trigger, intended_tag)) { + // Since the message is intended for the current tag and a port absent reaction + // was waiting for the message, trigger the corresponding reactions for this message. - tag_t tag = {.time = timestamp, .microstep = 0}; - // Trace the event when tracing is enabled - tracepoint_federate_from_rti(_fed.trace, receive_TIMESTAMP, _lf_my_fed_id, &tag); - lf_print("Starting timestamp is: " PRINTF_TIME ".", timestamp); - LF_PRINT_LOG("Current physical time is: " PRINTF_TIME ".", lf_time_physical()); + update_last_known_status_on_input_port(env, intended_tag, port_id); - return timestamp; -} + LF_PRINT_LOG( + "Inserting reactions directly at tag " PRINTF_TAG ". 
" + "Intended tag: " PRINTF_TAG ".", + env->current_tag.time - lf_time_start(), + env->current_tag.microstep, + intended_tag.time - lf_time_start(), + intended_tag.microstep + ); + // Only set the intended tag of the trigger if it is being executed now + // because otherwise this may preempt the intended_tag of a previous activation + // of the trigger. + action->trigger->intended_tag = intended_tag; -//////////////////////////////// Port Status Handling /////////////////////////////////////// + // This will mark the STP violation in the reaction if the message is tardy. + _lf_insert_reactions_for_trigger(env, action->trigger, message_token); -extern lf_action_base_t* _lf_action_table[]; -extern interval_t _lf_action_delay_table[]; -extern size_t _lf_action_table_size; -extern lf_action_base_t* _lf_zero_delay_cycle_action_table[]; -extern size_t _lf_zero_delay_cycle_action_table_size; -extern reaction_t* network_input_reactions[]; -extern size_t num_network_input_reactions; -extern reaction_t* port_absent_reaction[]; -extern size_t num_port_absent_reactions; -#ifdef FEDERATED_DECENTRALIZED -extern staa_t* staa_lst[]; -extern size_t staa_lst_size; -#endif + // Set the status of the port as present here to inform the network input + // port absent reactions know that they no longer need to block. The reason for + // that is because the network receiver reaction is now in the reaction queue + // keeping the precedence order intact. + set_network_port_status(port_id, present); -/** - * Return a pointer to the action struct for the action - * corresponding to the specified port ID. - * @param port_id The port ID. - * @return A pointer to an action struct or null if the ID is out of range. - */ -lf_action_base_t* _lf_action_for_port(int port_id) { - if (port_id < _lf_action_table_size) { - return _lf_action_table[port_id]; - } - lf_print_error("Invalid port ID: %d", port_id); - return NULL; -} + // Port is now present. 
Therefore, notify the level advancer to proceed + lf_update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); + lf_cond_broadcast(&lf_port_status_changed); + } else { + // If no port absent reaction is waiting for this message, or if the intended + // tag is in the future, use schedule functions to process the message. + // Before that, if the current time >= stop time, discard the message. + // But only if the stop time is not equal to the start time! -/** - * Set the status of network port with id portID. - * - * @param portID The network port ID - * @param status The network port status (port_status_t) - */ -void set_network_port_status(int portID, port_status_t status) { - lf_action_base_t* network_input_port_action = _lf_action_for_port(portID); - network_input_port_action->trigger->status = status; -} + update_last_known_status_on_input_port(env, intended_tag, port_id); -/** - * Update the last known status tag of all network input ports - * to the value of `tag`, unless that the provided `tag` is less - * than the last_known_status_tag of the port. This is called when - * a TAG signal is received from the RTI in centralized coordination. - * If any update occurs, then this broadcasts on `port_status_changed`. - * - * This assumes the caller holds the mutex. - * - * @param tag The tag on which the latest status of all network input - * ports is known. - */ -static void update_last_known_status_on_input_ports(tag_t tag) { - LF_PRINT_DEBUG("In update_last_known_status_on_input ports."); - bool notify = false; - for (int i = 0; i < _lf_action_table_size; i++) { - lf_action_base_t* input_port_action = _lf_action_for_port(i); - // This is called when a TAG is received. - // But it is possible for an input port to have received already - // a message with a larger tag (if there is an after delay on the - // connection), in which case, the last known status tag of the port - // is in the future and should not be rolled back. 
So in that case, - // we do not update the last known status tag. - if (lf_tag_compare(tag, - input_port_action->trigger->last_known_status_tag) >= 0) { - LF_PRINT_DEBUG( - "Updating the last known status tag of port %d from " PRINTF_TAG " to " PRINTF_TAG ".", - i, - input_port_action->trigger->last_known_status_tag.time - lf_time_start(), - input_port_action->trigger->last_known_status_tag.microstep, - tag.time - lf_time_start(), - tag.microstep - ); - input_port_action->trigger->last_known_status_tag = tag; - notify = true; + if (lf_tag_compare(env->current_tag, env->stop_tag) >= 0 && env->execution_started) { + lf_print_error("Received message too late. Already at stop tag.\n" + " Current tag is " PRINTF_TAG " and intended tag is " PRINTF_TAG ".\n" + " Discarding message and closing the socket.", + env->current_tag.time - start_time, env->current_tag.microstep, + intended_tag.time - start_time, intended_tag.microstep); + // Close socket, reading any incoming data and discarding it. + close_inbound_socket(fed_id, 1); + } else { + schedule_message_received_from_network_locked(env, action->trigger, intended_tag, message_token); } } - // FIXME: We could put a condition variable into the trigger_t - // struct for each network input port, in which case this won't - // be a broadcast but rather a targetted signal. - if (notify && update_max_level(tag, false)) { - // Notify network input reactions - lf_cond_broadcast(&port_status_changed); - } -} -/** - * @brief Update the last known status tag of a network input port. - * - * First, if the specified tag is less than the current_tag of the top-level - * environment, then ignore the specified tag and use the current_tag. This - * situation can arise if a message has arrived late (an STP violation has occurred). - * - * If the specified tag is greater than the previous last_known_status_tag - * of the port, then update the last_known_status_tag to the new tag. 
- * - * If the tag is equal to the previous last_known_status_tag, then - * increment the microstep of the last_known_status_tag. This situation can - * occur if a sequence of late messages (STP violations) are occurring all at - * once during an execution of a logical tag. - * - * This function is called when a message or absent message arrives. For decentralized - * coordination, it is also called by the background thread update_ports_from_staa_offsets - * which uses physical time to determine when an input port can be assumed to be absent - * if a message has not been received. - * - * This function assumes the caller holds the mutex on the top-level environment, - * and, if the tag actually increases, it broadcasts on `port_status_changed`. +#ifdef FEDERATED_DECENTRALIZED + // Only applicable for federated programs with decentralized coordination + // Finally, decrement the barrier to allow the execution to continue + // past the raised barrier + _lf_decrement_tag_barrier_locked(env); +#endif + + // The mutex is unlocked here after the barrier on + // logical time has been removed to avoid + // the need for unecessary lock and unlock + // operations. + LF_MUTEX_UNLOCK(env->mutex); +} + +/** + * Handle a port absent message received from a remote federate. + * This just sets the last known status tag of the port specified + * in the message. * - * @param env The top-level environment, whose mutex is assumed to be held. - * @param tag The tag on which the latest status of the specified network input port is known. - * @param portID The port ID. + * @param socket Pointer to the socket to read the message from + * @param buffer The buffer to read + * @param fed_id The sending federate ID or -1 if the centralized coordination. 
*/ -static void update_last_known_status_on_input_port(environment_t* env, tag_t tag, int port_id) { - if (lf_tag_compare(tag, env->current_tag) < 0) tag = env->current_tag; - trigger_t* input_port_action = _lf_action_for_port(port_id)->trigger; - int comparison = lf_tag_compare(tag, input_port_action->last_known_status_tag); - if (comparison == 0) tag.microstep++; - if (comparison >= 0) { - LF_PRINT_LOG( - "Updating the last known status tag of port %d from " PRINTF_TAG " to " PRINTF_TAG ".", - port_id, - input_port_action->last_known_status_tag.time - lf_time_start(), - input_port_action->last_known_status_tag.microstep, - tag.time - lf_time_start(), - tag.microstep - ); - input_port_action->last_known_status_tag = tag; +static void handle_port_absent_message(int* socket, int fed_id) { + size_t bytes_to_read = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(instant_t) + sizeof(microstep_t); + unsigned char buffer[bytes_to_read]; + read_from_socket_fail_on_error(socket, bytes_to_read, buffer, NULL, + "Failed to read port absent message."); - // Check whether this port update implies a change to MLAA, which may unblock reactions. - // For decentralized coordination, the first argument is NEVER, so it has no effect. - // For centralized, the arguments probably also have no effect, but the port update may. - // Note that it would not be correct to pass `tag` as the first argument because - // there is no guarantee that there is either a TAG or a PTAG for this time. - // The message that triggered this to be called could be from an upstream - // federate that is far ahead of other upstream federates in logical time. - update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); - lf_cond_broadcast(&port_status_changed); + // Extract the header information. + unsigned short port_id = extract_uint16(buffer); + // The next part of the message is the federate_id, but we don't need it. 
+ // unsigned short federate_id = extract_uint16(&(buffer[sizeof(uint16_t)])); + tag_t intended_tag = extract_tag(&(buffer[sizeof(uint16_t)+sizeof(uint16_t)])); + + // Trace the event when tracing is enabled + if (fed_id == -1) { + tracepoint_federate_from_rti(_fed.trace, receive_PORT_ABS, _lf_my_fed_id, &intended_tag); } else { - // Message arrivals should be monotonic, so this should not occur. - lf_print_warning("Attempt to update the last known status tag " - "of network input port %d to an earlier tag was ignored.", port_id); + tracepoint_federate_from_federate(_fed.trace, receive_PORT_ABS, _lf_my_fed_id, fed_id, &intended_tag); } + LF_PRINT_LOG("Handling port absent for tag " PRINTF_TAG " for port %hu of fed %d.", + intended_tag.time - lf_time_start(), + intended_tag.microstep, + port_id, + fed_id + ); + + // Environment is always the one corresponding to the top-level scheduling enclave. + environment_t *env; + _lf_get_environments(&env); + + LF_MUTEX_LOCK(env->mutex); + update_last_known_status_on_input_port(env, intended_tag, port_id); + LF_MUTEX_UNLOCK(env->mutex); } /** - * Reset the status fields on network input ports to unknown. - * - * @note This function must be called at the beginning of each - * logical time. + * Thread that listens for inputs from other federates. + * This thread listens for messages of type MSG_TYPE_P2P_MESSAGE, + * MSG_TYPE_P2P_TAGGED_MESSAGE, or MSG_TYPE_PORT_ABSENT (@see net_common.h) from the specified + * peer federate and calls the appropriate handling function for + * each message type. If an error occurs or an EOF is received + * from the peer, then this procedure sets the corresponding + * socket in _fed.sockets_for_inbound_p2p_connections + * to -1 and returns, terminating the thread. + * @param _args The remote federate ID (cast to void*). + * @param fed_id_ptr A pointer to a uint16_t containing federate ID being listened to. + * This procedure frees the memory pointed to before returning. 
*/ -void reset_status_fields_on_input_port_triggers() { - for (int i = 0; i < _lf_action_table_size; i++) { - set_network_port_status(i, unknown); +static void* listen_to_federates(void* _args) { + uint16_t fed_id = (uint16_t)(uintptr_t)_args; + + LF_PRINT_LOG("Listening to federate %d.", fed_id); + + int* socket_id = &_fed.sockets_for_inbound_p2p_connections[fed_id]; + + // Buffer for incoming messages. + // This does not constrain the message size + // because the message will be put into malloc'd memory. + unsigned char buffer[FED_COM_BUFFER_SIZE]; + + // Listen for messages from the federate. + while (1) { + // Read one byte to get the message type. + LF_PRINT_DEBUG("Waiting for a P2P message on socket %d.", *socket_id); + if (read_from_socket_close_on_error(socket_id, 1, buffer)) { + // Socket has been closed. + lf_print("Socket from federate %d is closed.", fed_id); + // Stop listening to this federate. + break; + } + LF_PRINT_DEBUG("Received a P2P message on socket %d of type %d.", + *socket_id, buffer[0]); + bool bad_message = false; + switch (buffer[0]) { + case MSG_TYPE_P2P_MESSAGE: + LF_PRINT_LOG("Received untimed message from federate %d.", fed_id); + handle_message(socket_id, fed_id); + break; + case MSG_TYPE_P2P_TAGGED_MESSAGE: + LF_PRINT_LOG("Received timed message from federate %d.", fed_id); + handle_tagged_message(socket_id, fed_id); + break; + case MSG_TYPE_PORT_ABSENT: + LF_PRINT_LOG("Received port absent message from federate %d.", fed_id); + handle_port_absent_message(socket_id, fed_id); + break; + default: + bad_message = true; + } + if (bad_message) { + // FIXME: Better error handling needed. + lf_print_error("Received erroneous message type: %d. 
Closing the socket.", buffer[0]); + // Trace the event when tracing is enabled + tracepoint_federate_from_federate(_fed.trace, receive_UNIDENTIFIED, _lf_my_fed_id, fed_id, NULL); + break; + } } - LF_PRINT_DEBUG("Resetting port status fields."); - update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); + return NULL; } /** - * Enqueue port absent reactions that will send a MSG_TYPE_PORT_ABSENT - * message to downstream federates if a given network output port is not present. - * @param env The environment of the federate + * Close the socket that sends outgoing messages to the + * specified federate ID. This function assumes the caller holds + * the lf_outbound_socket_mutex mutex lock, at least during normal termination. + * @param fed_id The ID of the peer federate receiving messages from this + * federate, or -1 if the RTI (centralized coordination). + * @param flag 0 if the socket has received EOF, 1 if not, -1 if abnormal termination. */ -void enqueue_port_absent_reactions(environment_t* env){ - assert(env != GLOBAL_ENVIRONMENT); -#ifdef FEDERATED_CENTRALIZED - if (!_fed.has_downstream) { - // This federate is not connected to any downstream federates via a - // logical connection. No need to trigger port absent - // reactions. 
- return; - } -#endif - LF_PRINT_DEBUG("Enqueueing port absent reactions at time %lld.", (long long) (env->current_tag.time - start_time)); - if (num_port_absent_reactions == 0) { - LF_PRINT_DEBUG("No port absent reactions."); - return; - } - for (int i = 0; i < num_port_absent_reactions; i++) { - reaction_t* reaction = port_absent_reaction[i]; - if (reaction && reaction->status == inactive) { - LF_PRINT_DEBUG("Inserting port absent reaction on reaction queue."); - lf_scheduler_trigger_reaction(env->scheduler, reaction, -1); +static void close_outbound_socket(int fed_id, int flag) { + assert (fed_id >= 0 && fed_id < NUMBER_OF_FEDERATES); + LF_MUTEX_LOCK(lf_outbound_socket_mutex); + if (_fed.sockets_for_outbound_p2p_connections[fed_id] >= 0) { + // Close the socket by sending a FIN packet indicating that no further writes + // are expected. Then read until we get an EOF indication. + if (flag >= 0) { + // SHUT_WR indicates no further outgoing messages. + shutdown(_fed.sockets_for_outbound_p2p_connections[fed_id], SHUT_WR); + if (flag > 0) { + // Have not received EOF yet. read until we get an EOF or error indication. + // This compensates for delayed ACKs and disabling of Nagles algorithm + // by delaying exiting until the shutdown is complete. + unsigned char message[32]; + while (read(_fed.sockets_for_outbound_p2p_connections[fed_id], &message, 32) > 0); + } } + close(_fed.sockets_for_outbound_p2p_connections[fed_id]); + _fed.sockets_for_outbound_p2p_connections[fed_id] = -1; } + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); } +#ifdef FEDERATED_AUTHENTICATED /** - * Send a port absent message to federate with fed_ID, informing the - * remote federate that it will not receive a message with tag less than the - * current tag of the specified environment delayed by the additional_delay. - * - * @param env The environment from which to get the current tag. - * @param additional_delay The after delay of the connection or NEVER if none. 
- * @param port_ID The ID of the receiving port. - * @param fed_ID The fed ID of the receiving federate. + * Perform HMAC-based authentication with the RTI, using the federation ID + * as an HMAC key. + * @return 0 for success, -1 for failure. */ -void send_port_absent_to_federate( - environment_t* env, - interval_t additional_delay, - unsigned short port_ID, - unsigned short fed_ID) { - assert(env != GLOBAL_ENVIRONMENT); - - // Construct the message - size_t message_length = 1 + sizeof(port_ID) + sizeof(fed_ID) + sizeof(instant_t) + sizeof(microstep_t); - unsigned char buffer[message_length]; +static int perform_hmac_authentication() { - // Apply the additional delay to the current tag and use that as the intended - // tag of the outgoing message. Note that if there is delay on the connection, - // then we cannot promise no message with tag = current_tag + delay because a - // subsequent reaction might produce such a message. But we can promise no - // message with a tag strictly less than current_tag + delay. - tag_t current_message_intended_tag = lf_delay_strict(env->current_tag, additional_delay); + // Send buffer including message type, federate ID, federate's nonce. 
+ size_t fed_id_length = sizeof(uint16_t); + size_t message_length = 1 + fed_id_length + NONCE_LENGTH; + unsigned char fed_hello_buf[message_length]; + fed_hello_buf[0] = MSG_TYPE_FED_NONCE; + encode_uint16((uint16_t)_lf_my_fed_id, &fed_hello_buf[1]); + unsigned char fed_nonce[NONCE_LENGTH]; + RAND_bytes(fed_nonce, NONCE_LENGTH); + memcpy(&fed_hello_buf[1 + fed_id_length], fed_nonce, NONCE_LENGTH); - LF_PRINT_LOG("Sending port " - "absent for tag " PRINTF_TAG " for port %d to federate %d.", - current_message_intended_tag.time - start_time, - current_message_intended_tag.microstep, - port_ID, fed_ID); + write_to_socket_fail_on_error( + &_fed.socket_TCP_RTI, message_length, fed_hello_buf, NULL, + "Failed to write nonce."); - buffer[0] = MSG_TYPE_PORT_ABSENT; - encode_uint16(port_ID, &(buffer[1])); - encode_uint16(fed_ID, &(buffer[1+sizeof(port_ID)])); - encode_tag(&(buffer[1+sizeof(port_ID)+sizeof(fed_ID)]), current_message_intended_tag); + // Check HMAC of received FED_RESPONSE message. + unsigned int hmac_length = SHA256_HMAC_LENGTH; + size_t federation_id_length = strnlen(federation_metadata.federation_id, 255); -#ifdef FEDERATED_CENTRALIZED - // Send the absent message through the RTI - int* socket = &_fed.socket_TCP_RTI; -#else - // Send the absent message directly to the federate - int* socket = &_fed.sockets_for_outbound_p2p_connections[fed_ID]; -#endif + unsigned char received[1 + NONCE_LENGTH + hmac_length]; + if (read_from_socket_close_on_error(&_fed.socket_TCP_RTI, 1 + NONCE_LENGTH + hmac_length, received)) { + lf_print_warning("Failed to read RTI response."); + return -1; + } + if (received[0] != MSG_TYPE_RTI_RESPONSE) { + if (received[0] == MSG_TYPE_RESIGN) { + lf_print_error("RTI has resigned."); + return -1; + } else { + lf_print_error( + "Received unexpected response %u from the RTI (see net_common.h).", + received[0]); + return -1; + } + } + // Create tag to compare to received tag. 
+ unsigned char buf_to_check[1 + fed_id_length + NONCE_LENGTH]; + buf_to_check[0] = MSG_TYPE_RTI_RESPONSE; + encode_uint16((uint16_t)_lf_my_fed_id, &buf_to_check[1]); + memcpy(&buf_to_check[1 + fed_id_length], fed_nonce, NONCE_LENGTH); + unsigned char fed_tag[hmac_length]; + HMAC(EVP_sha256(), federation_metadata.federation_id, federation_id_length, buf_to_check, 1 + fed_id_length + NONCE_LENGTH, + fed_tag, &hmac_length); - LF_MUTEX_LOCK(outbound_socket_mutex); - int result = write_to_socket_close_on_error(socket, message_length, buffer); - LF_MUTEX_UNLOCK(outbound_socket_mutex); + // Compare received tag and created tag. + if (memcmp(&received[1 + NONCE_LENGTH], fed_tag, hmac_length) != 0) { + // HMAC does not match. Send back a MSG_TYPE_REJECT message. + lf_print_error("HMAC authentication failed."); + unsigned char response[2]; + response[0] = MSG_TYPE_REJECT; + response[1] = HMAC_DOES_NOT_MATCH; - if (result != 0) { - // Write failed. Response depends on whether coordination is centralized. - if (socket == &_fed.socket_TCP_RTI) { - // Centralized coordination. This is a critical error. - lf_print_error_system_failure("Failed to send port absent message for port %hu to federate %hu.", - port_ID, fed_ID); - } else { - // Decentralized coordination. This is not a critical error. - lf_print_warning("Failed to send port absent message for port %hu to federate %hu.", - port_ID, fed_ID); - } + // Ignore errors on writing back. + write_to_socket(_fed.socket_TCP_RTI, 2, response); + return -1; } else { - // Message sent correctly. Trace it. - if (socket == &_fed.socket_TCP_RTI) { - tracepoint_federate_to_rti( - _fed.trace, send_PORT_ABS, _lf_my_fed_id, ¤t_message_intended_tag); - } else { - tracepoint_federate_to_federate( - _fed.trace, send_PORT_ABS, _lf_my_fed_id, fed_ID, ¤t_message_intended_tag); - } + LF_PRINT_LOG("HMAC verified."); + // HMAC tag is created with MSG_TYPE_FED_RESPONSE and received federate nonce. 
+ unsigned char mac_buf[1 + NONCE_LENGTH]; + mac_buf[0] = MSG_TYPE_FED_RESPONSE; + memcpy(&mac_buf[1], &received[1], NONCE_LENGTH); + // Buffer for message type and HMAC tag. + unsigned char sender[1 + hmac_length]; + sender[0] = MSG_TYPE_FED_RESPONSE; + HMAC(EVP_sha256(), federation_metadata.federation_id, federation_id_length, mac_buf, 1 + NONCE_LENGTH, + &sender[1], &hmac_length); + + write_to_socket_fail_on_error( + &_fed.socket_TCP_RTI, 1 + hmac_length, sender, NULL, + "Failed to write fed response."); } + return 0; +} +#endif + +static void close_rti_socket() { + shutdown(_fed.socket_TCP_RTI, SHUT_RDWR); + close(_fed.socket_TCP_RTI); + _fed.socket_TCP_RTI = -1; } /** - * Version of schedule_value() similar to that in reactor_common.c - * except that it does not acquire the mutex lock and has a special - * behavior during startup where it can inject reactions to the reaction - * queue if execution has not started yet. - * It is also responsible for setting the intended tag of the - * network message based on the calculated delay. - * This function assumes that the caller holds the mutex lock. - * - * This is used for handling incoming timed messages to a federate. - * - * @param env The environment of the federate - * @param action The action or timer to be triggered. - * @param tag The tag of the message received over the network. - * @param value Dynamically allocated memory containing the value to send. - * @param length The length of the array, if it is an array, or 1 for a - * scalar and 0 for no payload. - * @return A handle to the event, or 0 if no event was scheduled, or -1 for error. + * Return in the result a struct with the address info for the specified hostname and port. + * The memory for the result is dynamically allocated and must be freed using freeaddrinfo. + * @param hostname The host name. + * @param port The port number. + * @param result The struct into which to write. 
*/ -static trigger_handle_t schedule_message_received_from_network_locked( - environment_t* env, - trigger_t* trigger, - tag_t tag, - lf_token_t* token) { - assert(env != GLOBAL_ENVIRONMENT); +static void rti_address(const char* hostname, uint16_t port, struct addrinfo** result) { + struct addrinfo hints; - // Return value of the function - trigger_handle_t return_value = 0; + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_INET; /* Allow IPv4 */ + hints.ai_socktype = SOCK_STREAM; /* Stream socket */ + hints.ai_protocol = IPPROTO_TCP; /* TCP protocol */ + hints.ai_addr = NULL; + hints.ai_next = NULL; + hints.ai_flags = AI_NUMERICSERV; /* Allow only numeric port numbers */ - // Indicates whether or not the intended tag - // of the message (timestamp, microstep) is - // in the future relative to the tag of this - // federate. By default, assume it is not. - bool message_tag_is_in_the_future = lf_tag_compare(tag, env->current_tag) > 0; - // Assign the intended tag temporarily to restore later. - tag_t previous_intended_tag = trigger->intended_tag; - trigger->intended_tag = tag; + // Convert port number to string. + char str[6]; + sprintf(str, "%u", port); - // Calculate the extra_delay required to be passed - // to the schedule function. - interval_t extra_delay = tag.time - env->current_tag.time; - if (!message_tag_is_in_the_future && env->execution_started) { -#ifdef FEDERATED_CENTRALIZED - // If the coordination is centralized, receiving a message - // that does not carry a timestamp that is in the future - // would indicate a critical condition, showing that the - // time advance mechanism is not working correctly. - LF_MUTEX_UNLOCK(env->mutex); - lf_print_error_and_exit( - "Received a message at tag " PRINTF_TAG " that has a tag " PRINTF_TAG - " that has violated the STP offset. 
" - "Centralized coordination should not have these types of messages.", - env->current_tag.time - start_time, env->current_tag.microstep, - tag.time - start_time, tag.microstep); -#else - // Set the delay back to 0 - extra_delay = 0LL; - LF_PRINT_LOG("Calling schedule with 0 delay and intended tag " PRINTF_TAG ".", - trigger->intended_tag.time - start_time, - trigger->intended_tag.microstep); - return_value = _lf_schedule(env, trigger, extra_delay, token); -#endif - } else { - // In case the message is in the future, call - // _lf_schedule_at_tag() so that the microstep is respected. - LF_PRINT_LOG("Received a message that is (" PRINTF_TIME " nanoseconds, " PRINTF_MICROSTEP " microsteps) " - "in the future.", extra_delay, tag.microstep - env->current_tag.microstep); - return_value = _lf_schedule_at_tag(env, trigger, tag, token); + // Get address structure matching hostname and hints criteria, and + // set port to the port number provided in str. There should only + // ever be one matching address structure, and we connect to that. + if (getaddrinfo(hostname, (const char*)&str, &hints, result)) { + lf_print_error_and_exit("No host for RTI matching given hostname: %s", hostname); } - trigger->intended_tag = previous_intended_tag; - // Notify the main thread in case it is waiting for physical time to elapse. - LF_PRINT_DEBUG("Broadcasting notification that event queue changed."); - lf_cond_broadcast(&env->event_q_changed); - return return_value; } /** - * Handle a port absent message received from a remote federate. - * This just sets the last known status tag of the port specified - * in the message. - * - * @param socket Pointer to the socket to read the message from - * @param buffer The buffer to read - * @param fed_id The sending federate ID or -1 if the centralized coordination. + * Send the specified timestamp to the RTI and wait for a response. 
+ * The specified timestamp should be current physical time of the + * federate, and the response will be the designated start time for + * the federate. This procedure blocks until the response is + * received from the RTI. + * @param my_physical_time The physical time at this federate. + * @return The designated start time for the federate. */ -static void handle_port_absent_message(int* socket, int fed_id) { - size_t bytes_to_read = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(instant_t) + sizeof(microstep_t); - unsigned char buffer[bytes_to_read]; - read_from_socket_fail_on_error(socket, bytes_to_read, buffer, NULL, - "Failed to read port absent message."); +static instant_t get_start_time_from_rti(instant_t my_physical_time) { + // Send the timestamp marker first. + send_time(MSG_TYPE_TIMESTAMP, my_physical_time); - // Extract the header information. - unsigned short port_id = extract_uint16(buffer); - // The next part of the message is the federate_id, but we don't need it. - // unsigned short federate_id = extract_uint16(&(buffer[sizeof(uint16_t)])); - tag_t intended_tag = extract_tag(&(buffer[sizeof(uint16_t)+sizeof(uint16_t)])); + // Read bytes from the socket. We need 9 bytes. + // Buffer for message ID plus timestamp. + size_t buffer_length = 1 + sizeof(instant_t); + unsigned char buffer[buffer_length]; - // Trace the event when tracing is enabled - if (fed_id == -1) { - tracepoint_federate_from_rti(_fed.trace, receive_PORT_ABS, _lf_my_fed_id, &intended_tag); - } else { - tracepoint_federate_from_federate(_fed.trace, receive_PORT_ABS, _lf_my_fed_id, fed_id, &intended_tag); + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, buffer_length, buffer, NULL, + "Failed to read MSG_TYPE_TIMESTAMP message from RTI."); + LF_PRINT_DEBUG("Read 9 bytes."); + + // First byte received is the message ID. 
+ if (buffer[0] != MSG_TYPE_TIMESTAMP) { + if (buffer[0] == MSG_TYPE_RESIGN) { + lf_print_error_and_exit("RTI has unexpectedly resigned."); + } + lf_print_error_and_exit( + "Expected a MSG_TYPE_TIMESTAMP message from the RTI. Got %u (see net_common.h).", + buffer[0]); } - LF_PRINT_LOG("Handling port absent for tag " PRINTF_TAG " for port %hu of fed %d.", - intended_tag.time - lf_time_start(), - intended_tag.microstep, - port_id, - fed_id - ); - // Environment is always the one corresponding to the top-level scheduling enclave. - environment_t *env; - _lf_get_environments(&env); + instant_t timestamp = extract_int64(&(buffer[1])); - LF_MUTEX_LOCK(env->mutex); - update_last_known_status_on_input_port(env, intended_tag, port_id); - LF_MUTEX_UNLOCK(env->mutex); + tag_t tag = {.time = timestamp, .microstep = 0}; + // Trace the event when tracing is enabled + tracepoint_federate_from_rti(_fed.trace, receive_TIMESTAMP, _lf_my_fed_id, &tag); + lf_print("Starting timestamp is: " PRINTF_TIME ".", timestamp); + LF_PRINT_LOG("Current physical time is: " PRINTF_TIME ".", lf_time_physical()); + + return timestamp; } /** - * Handle a message being received from a remote federate. + * Handle a time advance grant (TAG) message from the RTI. + * This updates the last known status tag for each network input + * port, and broadcasts a signal, which may cause a blocking + * port absent reaction to unblock. * - * This function assumes the caller does not hold the mutex lock. - * @param socket Pointer to the socket to read the message from. - * @param buffer The buffer to read. - * @param fed_id The sending federate ID or -1 if the centralized coordination. + * In addition, this updates the last known TAG/PTAG and broadcasts + * a notification of this update, which may unblock whichever worker + * thread is trying to advance time. + * + * @note This function is very similar to handle_provisinal_tag_advance_grant() except that + * it sets last_TAG_was_provisional to false. 
*/ -void handle_message(int* socket, int fed_id) { - // Read the header. - size_t bytes_to_read = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t); +static void handle_tag_advance_grant(void) { + // Environment is always the one corresponding to the top-level scheduling enclave. + environment_t *env; + _lf_get_environments(&env); + + size_t bytes_to_read = sizeof(instant_t) + sizeof(microstep_t); unsigned char buffer[bytes_to_read]; - if (read_from_socket_close_on_error(socket, bytes_to_read, buffer)) { - // Read failed, which means the socket has been closed between reading the - // message ID byte and here. Issue a warning only. This is a physical - // connection, so likely the message is just late. If it's a serious failure, - // it should be caught in another thread. - lf_print_warning("Failed to read message header."); + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, + "Failed to read tag advance grant from RTI."); + tag_t TAG = extract_tag(buffer); + + // Trace the event when tracing is enabled + tracepoint_federate_from_rti(_fed.trace, receive_TAG, _lf_my_fed_id, &TAG); + + LF_MUTEX_LOCK(env->mutex); + + // Update the last known status tag of all network input ports + // to the TAG received from the RTI. Here we assume that the RTI + // knows the status of network ports up to and including the granted tag, + // so by extension, we assume that the federate can safely rely + // on the RTI to handle port statuses up until the granted tag. + update_last_known_status_on_input_ports(TAG); + + // It is possible for this federate to have received a PTAG + // earlier with the same tag as this TAG. 
+ if (lf_tag_compare(TAG, _fed.last_TAG) >= 0) { + _fed.last_TAG = TAG; + _fed.is_last_TAG_provisional = false; + LF_PRINT_LOG("Received Time Advance Grant (TAG): " PRINTF_TAG ".", + _fed.last_TAG.time - start_time, _fed.last_TAG.microstep); + } else { + LF_MUTEX_UNLOCK(env->mutex); + lf_print_error("Received a TAG " PRINTF_TAG " that wasn't larger " + "than the previous TAG or PTAG " PRINTF_TAG ". Ignoring the TAG.", + TAG.time - start_time, TAG.microstep, + _fed.last_TAG.time - start_time, _fed.last_TAG.microstep); return; } + // Notify everything that is blocked. + lf_cond_broadcast(&env->event_q_changed); - // Extract the header information. - unsigned short port_id; - unsigned short federate_id; - size_t length; - extract_header(buffer, &port_id, &federate_id, &length); - // Check if the message is intended for this federate - assert(_lf_my_fed_id == federate_id); - LF_PRINT_DEBUG("Receiving message to port %d of length %zu.", port_id, length); - - // Get the triggering action for the corresponding port - lf_action_base_t* action = _lf_action_for_port(port_id); + LF_MUTEX_UNLOCK(env->mutex); +} - // Read the payload. - // Allocate memory for the message contents. - unsigned char* message_contents = (unsigned char*)malloc(length); - if (read_from_socket_close_on_error(socket, length, message_contents)) { - lf_print_warning("Failed to read message body."); +#ifdef FEDERATED_DECENTRALIZED +/** + * @brief Return whether there exists an input port whose status is unknown. + * + * @param staa_elem A record of all input port actions. + */ +static bool a_port_is_unknown(staa_t* staa_elem) { + bool do_wait = false; + for (int j = 0; j < staa_elem->num_actions; ++j) { + if (staa_elem->actions[j]->trigger->status == unknown) { + do_wait = true; + break; + } } - // Trace the event when tracing is enabled - tracepoint_federate_from_federate(_fed.trace, receive_P2P_MSG, _lf_my_fed_id, federate_id, NULL); - LF_PRINT_LOG("Message received by federate: %s. 
Length: %zu.", message_contents, length); - - LF_PRINT_DEBUG("Calling schedule for message received on a physical connection."); - _lf_schedule_value(action, 0, message_contents, length); + return do_wait; } +#endif -void stall_advance_level_federation(environment_t* env, size_t level) { - LF_PRINT_DEBUG("Acquiring the environment mutex."); - LF_MUTEX_LOCK(env->mutex); - LF_PRINT_DEBUG("Waiting on MLAA with next_reaction_level %zu and MLAA %d.", level, max_level_allowed_to_advance); - while (((int) level) >= max_level_allowed_to_advance) { - lf_cond_wait(&port_status_changed); - }; - LF_PRINT_DEBUG("Exiting wait with MLAA %d and next_reaction_level %zu.", max_level_allowed_to_advance, level); - LF_MUTEX_UNLOCK(env->mutex); +/** + * @brief Return the port ID of the port associated with the given action. + */ +static int id_of_action(lf_action_base_t* input_port_action) { + for (int i = 0; 1; i++) { + if (action_for_port(i) == input_port_action) return i; + } + // There will be no UB buffer overrun because action_for_port(i) has a check. } /** - * Return true if reactions need to be inserted directly into the reaction queue and - * false if a call to schedule is needed (the normal case). This function handles zero-delay - * cycles, where processing at a tag must be able to begin before all messages have arrived - * at that tag. This returns true if the following conditions are all true: - * - * 1. the first reaction triggered has a level >= MLAA (a port is or will be blocked on this trigger); - * 2. the intended_tag is less than or equal to the current tag of the environment; - * 3. the intended_tag is greater than the last_tag of the trigger; - * 4. the intended_tag is greater than the last_known_status_tag of the trigger; - * 5. the execution has started (the event queue has been examined); - * 6. 
the trigger is not physical; - * - * The comparison against the MLAA (condition 1), if true, means that there is a blocking port - * waiting for this trigger (or possibly an earlier blocking port). For condition (2), if the - * intended tag is less than the current tag, then the message is tardy. A tardy message can - * unblock a port, although it will trigger an STP violation handler if one is defined or an - * error if not (or if centralized coordination is being used). The comparison against the - * last_tag of the trigger (condition 3) ensures that if the message is tardy but there is - * already an earlier tardy message that has been handled (or is being handled), then we - * don't try to handle two messages in the same tag, which is not allowed. For example, there - * could be a case where current tag is 10 with a port absent reaction waiting, and a message - * has arrived with intended_tag 8. This message will eventually cause the port absent reaction - * to exit, but before that, a message with intended_tag of 9 could arrive before the port absent - * reaction has had a chance to exit. The port status is on the other hand changed in this thread, - * and thus, can be checked in this scenario without this race condition. The message with - * intended_tag of 9 in this case needs to wait one microstep to be processed. The check with - * last_known_status_tag (condition 4) deals with messages arriving with identical intended - * tags (which should not happen). This one will be handled late (one microstep later than - * the current tag if 1 and 2 are true). - * - * This function assumes the mutex is held on the environment. - * - * @param env The environment. - * @param trigger The trigger. - * @param intended_tag The intended tag. + * @brief Thread handling setting the known absent status of input ports. 
+ * For the code-generated array of staa offsets `staa_lst`, which is sorted by STAA offset, + * wait for physical time to advance to the current time plus the STAA offset, + * then set the absent status of the input ports associated with the STAA. + * Then wait for current time to advance and start over. */ -static bool handle_message_now(environment_t* env, trigger_t* trigger, tag_t intended_tag) { - return trigger->reactions[0]->index >= max_level_allowed_to_advance - && lf_tag_compare(intended_tag, lf_tag(env)) <= 0 - && lf_tag_compare(intended_tag, trigger->last_tag) > 0 - && lf_tag_compare(intended_tag, trigger->last_known_status_tag) > 0 - && env->execution_started - && !trigger->is_physical; +#ifdef FEDERATED_DECENTRALIZED +static void* update_ports_from_staa_offsets(void* args) { + if (staa_lst_size == 0) return NULL; // Nothing to do. + // NOTE: Using only the top-level environment, which is the one that deals with network + // input ports. + environment_t *env; + int num_envs = _lf_get_environments(&env); + LF_MUTEX_LOCK(env->mutex); + while (1) { + bool restart = false; + tag_t tag_when_started_waiting = lf_tag(env); + for (int i = 0; i < staa_lst_size; ++i) { + staa_t* staa_elem = staa_lst[i]; + // The staa_elem is adjusted in the code generator to have subtracted the delay on the connection. + // The list is sorted in increasing order of adjusted STAA offsets. + // The wait_until function automatically adds the _lf_fed_STA_offset to the wait time. + interval_t wait_until_time = env->current_tag.time + staa_elem->STAA; + // The wait_until call will release the env->mutex while it is waiting. + // However, it will not release the env->mutex if the wait time is too small. + // At the cost of a small additional delay in deciding a port is absent, + // we require a minimum wait time here. Otherwise, if both the STAA and STA are + // zero, this thread will fail to ever release the environment mutex. + // This causes chaos. 
The MIN_SLEEP_DURATION is the smallest amount of time + // that wait_until will actually wait. Note that this strategy does not + // block progress of any execution that is actually processing events. + // It only slightly delays the decision that an event is absent, and only + // if the STAA and STA are extremely small. + if (_lf_fed_STA_offset + staa_elem->STAA < 5 * MIN_SLEEP_DURATION) { + wait_until_time += 5 * MIN_SLEEP_DURATION; + } + while (a_port_is_unknown(staa_elem)) { + if (wait_until(env, wait_until_time, &lf_port_status_changed)) { + if (lf_tag_compare(lf_tag(env), tag_when_started_waiting) != 0) { + // Wait was not interrupted and we have committed to a new tag before we + // finished processing the list. Start over. + restart = true; + break; + } + /* Possibly useful for debugging: + tag_t current_tag = lf_tag(env); + lf_print("--------------------- FIXME: assuming absent! " PRINTF_TAG, current_tag.time - lf_time_start(), current_tag.microstep); + lf_print("--------------------- Lag is " PRINTF_TIME, current_tag.time - lf_time_physical()); + lf_print("--------------------- Wait until time is " PRINTF_TIME, wait_until_time - lf_time_start()); + */ + + // Wait went to completion. Mark any ports with this STAA that remain unknown as absent. + for (int j = 0; j < staa_elem->num_actions; ++j) { + lf_action_base_t* input_port_action = staa_elem->actions[j]; + if (input_port_action->trigger->status == unknown) { + input_port_action->trigger->status = absent; + LF_PRINT_DEBUG("Assuming port absent at time " PRINTF_TIME, lf_tag(env).time - start_time); + update_last_known_status_on_input_port(env, lf_tag(env), id_of_action(input_port_action)); + lf_cond_broadcast(&lf_port_status_changed); + } + } + } else if (lf_tag_compare(lf_tag(env), tag_when_started_waiting) != 0) { + // Wait was interrupted and we have committed to a new tag before we + // finished processing the list. Start over. 
+ restart = true; + break; + } + } + if (restart) break; // No need to check the rest of the STAAs. + } + if (restart) continue; // No need to wait for a new tag. + + // Wait until we progress to a new tag. + while (lf_tag_compare(lf_tag(env), tag_when_started_waiting) == 0) { + // The following will release the env->mutex while waiting. + lf_cond_wait(&lf_current_tag_changed); + } + } } +#endif // FEDERATED_DECENTRALIZED /** - * Handle a tagged message being received from a remote federate via the RTI - * or directly from other federates. - * This will read the tag encoded in the header - * and calculate an offset to pass to the schedule function. - * This function assumes the caller does not hold the mutex lock. - * Instead of holding the mutex lock, this function calls - * _lf_increment_tag_barrier with the tag carried in - * the message header as an argument. This ensures that the current tag - * will not advance to the tag of the message if it is in the future, or - * the tag will not advance at all if the tag of the message is - * now or in the past. - * @param socket Pointer to the socket to read the message from. - * @param buffer The buffer to read. - * @param fed_id The sending federate ID or -1 if the centralized coordination. + * Handle a provisional tag advance grant (PTAG) message from the RTI. + * This updates the last known TAG/PTAG and broadcasts + * a notification of this update, which may unblock whichever worker + * thread is trying to advance time. + * If current_time is less than the specified PTAG, then this will + * also insert into the event_q a dummy event with the specified tag. + * This will ensure that the federate advances time to the specified + * tag and, for centralized coordination, stimulates null-message-sending + * output reactions at that tag. 
+ * + * @note This function is similar to handle_tag_advance_grant() except that + * it sets last_TAG_was_provisional to true and also it does not update the + * last known tag for input ports. */ -void handle_tagged_message(int* socket, int fed_id) { +static void handle_provisional_tag_advance_grant() { // Environment is always the one corresponding to the top-level scheduling enclave. environment_t *env; _lf_get_environments(&env); - // FIXME: Need better error handling? - // Read the header which contains the timestamp. - size_t bytes_to_read = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t) - + sizeof(instant_t) + sizeof(microstep_t); + size_t bytes_to_read = sizeof(instant_t) + sizeof(microstep_t); unsigned char buffer[bytes_to_read]; - read_from_socket_fail_on_error(socket, bytes_to_read, buffer, NULL, - "Failed to read timed message header"); + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, + "Failed to read provisional tag advance grant from RTI."); + tag_t PTAG = extract_tag(buffer); - // Extract the header information. 
- unsigned short port_id; - unsigned short federate_id; - size_t length; - tag_t intended_tag; - extract_timed_header(buffer, &port_id, &federate_id, &length, &intended_tag); // Trace the event when tracing is enabled - if (fed_id == -1) { - tracepoint_federate_from_rti(_fed.trace, receive_TAGGED_MSG, _lf_my_fed_id, &intended_tag); - } else { - tracepoint_federate_from_federate(_fed.trace, receive_P2P_TAGGED_MSG, _lf_my_fed_id, fed_id, &intended_tag); - } - // Check if the message is intended for this federate - assert(_lf_my_fed_id == federate_id); - LF_PRINT_DEBUG("Receiving message to port %d of length %zu.", port_id, length); - - // Get the triggering action for the corresponding port - lf_action_base_t* action = _lf_action_for_port(port_id); - - // Record the physical time of arrival of the message - instant_t time_of_arrival = lf_time_physical(); + tracepoint_federate_from_rti(_fed.trace, receive_PTAG, _lf_my_fed_id, &PTAG); - if (action->trigger->is_physical) { - // Messages sent on physical connections should be handled via handle_message(). - lf_print_error_and_exit("Received a tagged message on a physical connection."); - } + // Note: it is important that last_known_status_tag of ports does not + // get updated to a PTAG value because a PTAG does not indicate that + // the RTI knows about the status of all ports up to and _including_ + // the value of PTAG. Only a TAG message indicates that. + LF_MUTEX_LOCK(env->mutex); -#ifdef FEDERATED_DECENTRALIZED - // Only applicable for federated programs with decentralized coordination: - // For logical connections in decentralized coordination, - // increment the barrier to prevent advancement of tag beyond - // the received tag if possible. The following function call - // suggests that the tag barrier be raised to the tag provided - // by the message. If this tag is in the past, the function will cause - // the tag to freeze at the current level. - // If something happens, make sure to release the barrier. 
- _lf_increment_tag_barrier(env, intended_tag); -#endif - LF_PRINT_LOG("Received message on port %d with tag: " PRINTF_TAG ", Current tag: " PRINTF_TAG ".", - port_id, intended_tag.time - start_time, intended_tag.microstep, - lf_time_logical_elapsed(env), env->current_tag.microstep); + // Sanity check + if (lf_tag_compare(PTAG, _fed.last_TAG) < 0 + || (lf_tag_compare(PTAG, _fed.last_TAG) == 0 && !_fed.is_last_TAG_provisional)) { + LF_MUTEX_UNLOCK(env->mutex); + lf_print_error_and_exit("Received a PTAG " PRINTF_TAG " that is equal or earlier " + "than an already received TAG " PRINTF_TAG ".", + PTAG.time, PTAG.microstep, + _fed.last_TAG.time, _fed.last_TAG.microstep); + } - // Read the payload. - // Allocate memory for the message contents. - unsigned char* message_contents = (unsigned char*)malloc(length); - read_from_socket_fail_on_error(socket, length, message_contents, NULL, - "Failed to read message body."); + _fed.last_TAG = PTAG; + _fed.is_last_TAG_provisional = true; + LF_PRINT_LOG("At tag " PRINTF_TAG ", received Provisional Tag Advance Grant (PTAG): " PRINTF_TAG ".", + env->current_tag.time - start_time, env->current_tag.microstep, + _fed.last_TAG.time - start_time, _fed.last_TAG.microstep); - // The following is only valid for string messages. - // LF_PRINT_DEBUG("Message received: %s.", message_contents); + // Even if we don't modify the event queue, we need to broadcast a change + // because we do not need to continue to wait for a TAG. + lf_cond_broadcast(&env->event_q_changed); + // Notify level advance thread which is blocked. + lf_update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); + lf_cond_broadcast(&lf_port_status_changed); - LF_MUTEX_LOCK(env->mutex); + // Possibly insert a dummy event into the event queue if current time is behind + // (which it should be). Do not do this if the federate has not fully + // started yet. 
- action->trigger->physical_time_of_arrival = time_of_arrival; + instant_t dummy_event_time = PTAG.time; + microstep_t dummy_event_relative_microstep = PTAG.microstep; - // Create a token for the message - lf_token_t* message_token = _lf_new_token((token_type_t*)action, message_contents, length); + if (lf_tag_compare(env->current_tag, PTAG) == 0) { + // The current tag can equal the PTAG if we are at the start time + // or if this federate has been able to advance time to the current + // tag (e.g., it has no upstream federates). In either case, either + // it is already treating the current tag as PTAG cycle (e.g. at the + // start time) or it will be completing the current cycle and sending + // a LTC message shortly. In either case, there is nothing more to do. + LF_MUTEX_UNLOCK(env->mutex); + return; + } else if (lf_tag_compare(env->current_tag, PTAG) > 0) { + // Current tag is greater than the PTAG. + // It could be that we have sent an LTC that crossed with the incoming + // PTAG or that we have advanced to a tag greater than the PTAG. + // In the former case, there is nothing more to do. + // In the latter case, we may be blocked processing a PTAG cycle at + // a greater tag or we may be in the middle of processing a regular + // TAG. In either case, we know that at the PTAG tag, all outputs + // have either been sent or are absent, so we can send an LTC. + // Send an LTC to indicate absent outputs. + lf_latest_tag_complete(PTAG); + // Nothing more to do. + LF_MUTEX_UNLOCK(env->mutex); + return; + } else if (PTAG.time == env->current_tag.time) { + // We now know env->current_tag < PTAG, but the times are equal. + // Adjust the microstep for scheduling the dummy event. + dummy_event_relative_microstep -= env->current_tag.microstep; + } + // We now know env->current_tag < PTAG. 
- if (handle_message_now(env, action->trigger, intended_tag)) { - // Since the message is intended for the current tag and a port absent reaction - // was waiting for the message, trigger the corresponding reactions for this message. + if (dummy_event_time != FOREVER) { + // Schedule a dummy event at the specified time and (relative) microstep. + LF_PRINT_DEBUG("At tag " PRINTF_TAG ", inserting into the event queue a dummy event " + "with time " PRINTF_TIME " and (relative) microstep " PRINTF_MICROSTEP ".", + env->current_tag.time - start_time, env->current_tag.microstep, + dummy_event_time - start_time, dummy_event_relative_microstep); + // Dummy event points to a NULL trigger and NULL real event. + event_t* dummy = _lf_create_dummy_events(env, + NULL, dummy_event_time, NULL, dummy_event_relative_microstep); + pqueue_insert(env->event_q, dummy); + } - update_last_known_status_on_input_port(env, intended_tag, port_id); + LF_MUTEX_UNLOCK(env->mutex); +} - LF_PRINT_LOG( - "Inserting reactions directly at tag " PRINTF_TAG ". " - "Intended tag: " PRINTF_TAG ".", - env->current_tag.time - lf_time_start(), - env->current_tag.microstep, - intended_tag.time - lf_time_start(), - intended_tag.microstep - ); - // Only set the intended tag of the trigger if it is being executed now - // because otherwise this may preempt the intended_tag of a previous activation - // of the trigger. - action->trigger->intended_tag = intended_tag; +/** + * Handle a MSG_TYPE_STOP_GRANTED message from the RTI. + * + * This function removes the global barrier on + * logical time raised when lf_request_stop() was + * called in the environment for each enclave. + */ +static void handle_stop_granted_message() { - // This will mark the STP violation in the reaction if the message is tardy. 
- _lf_insert_reactions_for_trigger(env, action->trigger, message_token); + size_t bytes_to_read = MSG_TYPE_STOP_GRANTED_LENGTH - 1; + unsigned char buffer[bytes_to_read]; + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, + "Failed to read stop granted from RTI."); - // Set the status of the port as present here to inform the network input - // port absent reactions know that they no longer need to block. The reason for - // that is because the network receiver reaction is now in the reaction queue - // keeping the precedence order intact. - set_network_port_status(port_id, present); + tag_t received_stop_tag = extract_tag(buffer); - // Port is now present. Therefore, notify the level advancer to proceed - update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); - lf_cond_broadcast(&port_status_changed); - } else { - // If no port absent reaction is waiting for this message, or if the intended - // tag is in the future, use schedule functions to process the message. - // Before that, if the current time >= stop time, discard the message. - // But only if the stop time is not equal to the start time! + // Trace the event when tracing is enabled + tracepoint_federate_from_rti(_fed.trace, receive_STOP_GRN, _lf_my_fed_id, &received_stop_tag); - update_last_known_status_on_input_port(env, intended_tag, port_id); + LF_PRINT_LOG("Received from RTI a MSG_TYPE_STOP_GRANTED message with elapsed tag " PRINTF_TAG ".", + received_stop_tag.time - start_time, received_stop_tag.microstep); - if (lf_tag_compare(env->current_tag, env->stop_tag) >= 0 && env->execution_started) { - lf_print_error("Received message too late. 
Already at stop tag.\n" - " Current tag is " PRINTF_TAG " and intended tag is " PRINTF_TAG ".\n" - " Discarding message and closing the socket.", - env->current_tag.time - start_time, env->current_tag.microstep, - intended_tag.time - start_time, intended_tag.microstep); - // Close socket, reading any incoming data and discarding it. - _lf_close_inbound_socket(fed_id, 1); - } else { - schedule_message_received_from_network_locked(env, action->trigger, intended_tag, message_token); + environment_t *env; + int num_environments = _lf_get_environments(&env); + + for (int i = 0; i < num_environments; i++) { + LF_MUTEX_LOCK(env[i].mutex); + + // Sanity check. + if (lf_tag_compare(received_stop_tag, env[i].current_tag) <= 0) { + lf_print_error("RTI granted a MSG_TYPE_STOP_GRANTED tag that is equal to or less than this federate's current tag " PRINTF_TAG ". " + "Stopping at the next microstep instead.", + env[i].current_tag.time - start_time, env[i].current_tag.microstep); + received_stop_tag = env[i].current_tag; + received_stop_tag.microstep++; } - } -#ifdef FEDERATED_DECENTRALIZED - // Only applicable for federated programs with decentralized coordination - // Finally, decrement the barrier to allow the execution to continue - // past the raised barrier - _lf_decrement_tag_barrier_locked(env); -#endif + _lf_set_stop_tag(&env[i], received_stop_tag); + LF_PRINT_DEBUG("Setting the stop tag to " PRINTF_TAG ".", + env[i].stop_tag.time - start_time, + env[i].stop_tag.microstep); - // The mutex is unlocked here after the barrier on - // logical time has been removed to avoid - // the need for unecessary lock and unlock - // operations. - LF_MUTEX_UNLOCK(env->mutex); + if (env[i].barrier.requestors) _lf_decrement_tag_barrier_locked(&env[i]); + lf_cond_broadcast(&env[i].event_q_changed); + LF_MUTEX_UNLOCK(env[i].mutex); + } } /** - * Handle a time advance grant (TAG) message from the RTI. 
- * This updates the last known status tag for each network input - * port, and broadcasts a signal, which may cause a blocking - * port absent reaction to unblock. - * - * In addition, this updates the last known TAG/PTAG and broadcasts - * a notification of this update, which may unblock whichever worker - * thread is trying to advance time. - * - * @note This function is very similar to handle_provisinal_tag_advance_grant() except that - * it sets last_TAG_was_provisional to false. + * Handle a MSG_TYPE_STOP_REQUEST message from the RTI. */ -void handle_tag_advance_grant(void) { - // Environment is always the one corresponding to the top-level scheduling enclave. - environment_t *env; - _lf_get_environments(&env); - - size_t bytes_to_read = sizeof(instant_t) + sizeof(microstep_t); +static void handle_stop_request_message() { + size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_LENGTH - 1; unsigned char buffer[bytes_to_read]; read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, - "Failed to read tag advance grant from RTI."); - tag_t TAG = extract_tag(buffer); + "Failed to read stop request from RTI."); + tag_t tag_to_stop = extract_tag(buffer); // Trace the event when tracing is enabled - tracepoint_federate_from_rti(_fed.trace, receive_TAG, _lf_my_fed_id, &TAG); + tracepoint_federate_from_rti(_fed.trace, receive_STOP_REQ, _lf_my_fed_id, &tag_to_stop); + LF_PRINT_LOG("Received from RTI a MSG_TYPE_STOP_REQUEST signal with tag " PRINTF_TAG ".", + tag_to_stop.time - start_time, + tag_to_stop.microstep); - LF_MUTEX_LOCK(env->mutex); + extern lf_mutex_t global_mutex; + extern bool lf_stop_requested; + bool already_blocked = false; - // Update the last known status tag of all network input ports - // to the TAG received from the RTI. 
Here we assume that the RTI - // knows the status of network ports up to and including the granted tag, - // so by extension, we assume that the federate can safely rely - // on the RTI to handle port statuses up until the granted tag. - update_last_known_status_on_input_ports(TAG); + LF_MUTEX_LOCK(global_mutex); + if (lf_stop_requested) { + LF_PRINT_LOG("Ignoring MSG_TYPE_STOP_REQUEST from RTI because lf_request_stop has been called locally."); + already_blocked = true; + } + // Treat the stop request from the RTI as if a local stop request had been received. + lf_stop_requested = true; + LF_MUTEX_UNLOCK(global_mutex); - // It is possible for this federate to have received a PTAG - // earlier with the same tag as this TAG. - if (lf_tag_compare(TAG, _fed.last_TAG) >= 0) { - _fed.last_TAG = TAG; - _fed.is_last_TAG_provisional = false; - LF_PRINT_LOG("Received Time Advance Grant (TAG): " PRINTF_TAG ".", - _fed.last_TAG.time - start_time, _fed.last_TAG.microstep); - } else { - LF_MUTEX_UNLOCK(env->mutex); - lf_print_error("Received a TAG " PRINTF_TAG " that wasn't larger " - "than the previous TAG or PTAG " PRINTF_TAG ". Ignoring the TAG.", - TAG.time - start_time, TAG.microstep, - _fed.last_TAG.time - start_time, _fed.last_TAG.microstep); + // If we have previously received from the RTI a stop request, + // or we have previously sent a stop request to the RTI, + // then we have already blocked tag advance in enclaves. + // Do not do this twice. The record of whether the first has occurred + // is guarded by the outbound socket mutex. + // The second is guarded by the global mutex. + // Note that the RTI should not send stop requests more than once to federates. + LF_MUTEX_LOCK(lf_outbound_socket_mutex); + if (_fed.received_stop_request_from_rti) { + LF_PRINT_LOG("Redundant MSG_TYPE_STOP_REQUEST from RTI. 
Ignoring it."); + already_blocked = true; + } else if (!already_blocked) { + // Do this only if lf_request_stop has not been called because it will + // prevent lf_request_stop from sending. + _fed.received_stop_request_from_rti = true; + } + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); + + if (already_blocked) { + // Either we have sent a stop request to the RTI ourselves, + // or we have previously received a stop request from the RTI. + // Nothing more to do. Tag advance is already blocked on enclaves. return; } - // Notify everything that is blocked. - lf_cond_broadcast(&env->event_q_changed); - LF_MUTEX_UNLOCK(env->mutex); + // Iterate over the scheduling enclaves to find the maximum current tag + // and adjust the tag_to_stop if any of those is greater than tag_to_stop. + // If not done previously, block tag advance in the enclave. + environment_t *env; + int num_environments = _lf_get_environments(&env); + for (int i = 0; i < num_environments; i++) { + LF_MUTEX_LOCK(env[i].mutex); + if (lf_tag_compare(tag_to_stop, env[i].current_tag) <= 0) { + // Can't stop at the requested tag. Make a counteroffer. + tag_to_stop = env->current_tag; + tag_to_stop.microstep++; + } + // Set a barrier to prevent the enclave from advancing past the so-far tag to stop. + _lf_increment_tag_barrier_locked(&env[i], tag_to_stop); + + LF_MUTEX_UNLOCK(env[i].mutex); + } + // Send the reply, which is the least tag at which we can stop. + unsigned char outgoing_buffer[MSG_TYPE_STOP_REQUEST_REPLY_LENGTH]; + ENCODE_STOP_REQUEST_REPLY(outgoing_buffer, tag_to_stop.time, tag_to_stop.microstep); + + // Send the current logical time to the RTI. 
+ LF_MUTEX_LOCK(lf_outbound_socket_mutex); + write_to_socket_fail_on_error( + &_fed.socket_TCP_RTI, MSG_TYPE_STOP_REQUEST_REPLY_LENGTH, outgoing_buffer, &lf_outbound_socket_mutex, + "Failed to send the answer to MSG_TYPE_STOP_REQUEST to RTI."); + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); + + LF_PRINT_DEBUG("Sent MSG_TYPE_STOP_REQUEST_REPLY to RTI with tag " PRINTF_TAG, + tag_to_stop.time, tag_to_stop.microstep); + // Trace the event when tracing is enabled + tracepoint_federate_to_rti(_fed.trace, send_STOP_REQ_REP, _lf_my_fed_id, &tag_to_stop); } /** - * Send a logical tag complete (LTC) message to the RTI - * unless an equal or later LTC has previously been sent. - * This function assumes the caller holds the mutex lock. - * - * @param tag_to_send The tag to send. + * Send a resign signal to the RTI. The tag payload will be the current + * tag of the specified environment or, if there has been an error that + * will lead to an abnormal termination, the tag NEVER_TAG. */ -void _lf_logical_tag_complete(tag_t tag_to_send) { - int compare_with_last_tag = lf_tag_compare(_fed.last_sent_LTC, tag_to_send); - if (compare_with_last_tag >= 0) { - return; - } - LF_PRINT_LOG("Sending Logical Time Complete (LTC) " PRINTF_TAG " to the RTI.", - tag_to_send.time - start_time, - tag_to_send.microstep); - _lf_send_tag(MSG_TYPE_LOGICAL_TAG_COMPLETE, tag_to_send); - _fed.last_sent_LTC = tag_to_send; -} - -bool update_max_level(tag_t tag, bool is_provisional) { - // This always needs the top-level environment, which will be env[0]. - environment_t *env; - _lf_get_environments(&env); - int prev_max_level_allowed_to_advance = max_level_allowed_to_advance; - max_level_allowed_to_advance = INT_MAX; -#ifdef FEDERATED_DECENTRALIZED - size_t action_table_size = _lf_action_table_size; - lf_action_base_t** action_table = _lf_action_table; -#else - // Note that the following test is never true for decentralized coordination, - // where tag always is NEVER_TAG. 
- if ((lf_tag_compare(env->current_tag, tag) < 0) || ( - lf_tag_compare(env->current_tag, tag) == 0 && !is_provisional - )) { - LF_PRINT_DEBUG("Updated MLAA to %d at time " PRINTF_TIME ".", - max_level_allowed_to_advance, - lf_time_logical_elapsed(env) - ); - // Safe to complete the current tag - return (prev_max_level_allowed_to_advance != max_level_allowed_to_advance); - } - - size_t action_table_size = _lf_zero_delay_cycle_action_table_size; - lf_action_base_t** action_table = _lf_zero_delay_cycle_action_table; -#endif // FEDERATED_DECENTRALIZED - for (int i = 0; i < action_table_size; i++) { - lf_action_base_t* input_port_action = action_table[i]; -#ifdef FEDERATED_DECENTRALIZED - // In decentralized execution, if the current_tag is close enough to the - // start tag and there is a large enough delay on an incoming - // connection, then there is no need to block progress waiting for this - // port status. This is irrelevant for centralized because blocking only - // occurs on zero-delay cycles. - if ( - (_lf_action_delay_table[i] == 0 && env->current_tag.time == start_time && env->current_tag.microstep == 0) - || (_lf_action_delay_table[i] > 0 && lf_tag_compare( - env->current_tag, - lf_delay_strict((tag_t) {.time=start_time, .microstep=0}, _lf_action_delay_table[i]) - ) <= 0) - ) { - continue; - } -#endif // FEDERATED_DECENTRALIZED - // If the current tag is greater than the last known status tag of the input port, - // and the input port is not physical, then block on that port by ensuring - // the MLAA is no greater than the level of that port. - // For centralized coordination, this is applied only to input ports coming from - // federates that are in a ZDC. For decentralized coordination, this is applied - // to all input ports. 
- if (lf_tag_compare(env->current_tag, - input_port_action->trigger->last_known_status_tag) > 0 - && !input_port_action->trigger->is_physical) { - max_level_allowed_to_advance = LF_MIN( - max_level_allowed_to_advance, - ((int) LF_LEVEL(input_port_action->trigger->reactions[0]->index)) - ); - } +static void send_resign_signal(environment_t* env) { + size_t bytes_to_write = 1 + sizeof(tag_t); + unsigned char buffer[bytes_to_write]; + buffer[0] = MSG_TYPE_RESIGN; + if (_lf_normal_termination) { + encode_tag(&(buffer[1]), env->current_tag); + } else { + encode_tag(&(buffer[1]), NEVER_TAG); } - LF_PRINT_DEBUG("Updated MLAA to %d at time " PRINTF_TIME ".", - max_level_allowed_to_advance, - lf_time_logical_elapsed(env) - ); - return (prev_max_level_allowed_to_advance != max_level_allowed_to_advance); + LF_MUTEX_LOCK(lf_outbound_socket_mutex); + write_to_socket_fail_on_error( + &_fed.socket_TCP_RTI, bytes_to_write, &(buffer[0]), &lf_outbound_socket_mutex, + "Failed to send RESIGN."); + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); + LF_PRINT_LOG("Resigned."); } -#ifdef FEDERATED_DECENTRALIZED /** - * @brief Return whether there exists an input port whose status is unknown. - * - * @param staa_elem A record of all input port actions. + * @brief Stop the traces associated with all environments in the program. */ -static bool a_port_is_unknown(staa_t* staa_elem) { - bool do_wait = false; - for (int j = 0; j < staa_elem->num_actions; ++j) { - if (staa_elem->actions[j]->trigger->status == unknown) { - do_wait = true; - break; - } +static void stop_all_traces() { + environment_t *env; + int num_envs = _lf_get_environments(&env); + for (int i = 0; i < num_envs; i++) { + stop_trace(env[i].trace); } - return do_wait; } -#endif /** - * @brief Return the port ID of the port associated with the given action. + * Handle a resign signal from the RTI. The RTI will only resign + * if it is forced to exit, e.g. by a SIGINT.
Hence, this federate + * will exit immediately with an error condition, counting on the + * termination functions to handle any cleanup needed. */ -static int id_of_action(lf_action_base_t* input_port_action) { - for (int i = 0; 1; i++) { - if (_lf_action_for_port(i) == input_port_action) return i; - } - // There will be no UB buffer overrun because _lf_action_for_port(i) has a check. +static void handle_rti_resign_message(void) { + exit(1); } /** - * @brief Thread handling setting the known absent status of input ports. - * For the code-generated array of staa offsets `staa_lst`, which is sorted by STAA offset, - * wait for physical time to advance to the current time plus the STAA offset, - * then set the absent status of the input ports associated with the STAA. - * Then wait for current time to advance and start over. + * Thread that listens for TCP inputs from the RTI. + * When messages arrive, this calls the appropriate handler. + * @param args Ignored */ -#ifdef FEDERATED_DECENTRALIZED -static void* update_ports_from_staa_offsets(void* args) { - if (staa_lst_size == 0) return NULL; // Nothing to do. - // NOTE: Using only the top-level environment, which is the one that deals with network - // input ports. - environment_t *env; - int num_envs = _lf_get_environments(&env); - LF_MUTEX_LOCK(env->mutex); +static void* listen_to_rti_TCP(void* args) { + // Buffer for incoming messages. + // This does not constrain the message size + // because the message will be put into malloc'd memory. + unsigned char buffer[FED_COM_BUFFER_SIZE]; + + // Listen for messages from the RTI. while (1) { - bool restart = false; - tag_t tag_when_started_waiting = lf_tag(env); - for (int i = 0; i < staa_lst_size; ++i) { - staa_t* staa_elem = staa_lst[i]; - // The staa_elem is adjusted in the code generator to have subtracted the delay on the connection. - // The list is sorted in increasing order of adjusted STAA offsets.
- // The wait_until function automatically adds the _lf_fed_STA_offset to the wait time. - interval_t wait_until_time = env->current_tag.time + staa_elem->STAA; - // The wait_until call will release the env->mutex while it is waiting. - // However, it will not release the env->mutex if the wait time is too small. - // At the cost of a small additional delay in deciding a port is absent, - // we require a minimum wait time here. Otherwise, if both the STAA and STA are - // zero, this thread will fail to ever release the environment mutex. - // This causes chaos. The MIN_SLEEP_DURATION is the smallest amount of time - // that wait_until will actually wait. Note that this strategy does not - // block progress of any execution that is actually processing events. - // It only slightly delays the decision that an event is absent, and only - // if the STAA and STA are extremely small. - if (_lf_fed_STA_offset + staa_elem->STAA < 5 * MIN_SLEEP_DURATION) { - wait_until_time += 5 * MIN_SLEEP_DURATION; + // Check whether the RTI socket is still valid + if (_fed.socket_TCP_RTI < 0) { + lf_print_warning("Socket to the RTI unexpectedly closed."); + return NULL; + } + // Read one byte to get the message type. + // This will exit if the read fails. + int read_failed = read_from_socket(_fed.socket_TCP_RTI, 1, buffer); + if (read_failed < 0) { + if (errno == ECONNRESET) { + lf_print_error("Socket connection to the RTI was closed by the RTI without" + " properly sending an EOF first. Considering this a soft error."); + // FIXME: If this happens, possibly a new RTI must be elected. + _fed.socket_TCP_RTI = -1; + return NULL; + } else { + lf_print_error("Socket connection to the RTI has been broken with error %d: %s." + " The RTI should close connections with an EOF first." + " Considering this a soft error.", + errno, + strerror(errno)); + // FIXME: If this happens, possibly a new RTI must be elected. 
+ _fed.socket_TCP_RTI = -1; + return NULL; } - while (a_port_is_unknown(staa_elem)) { - if (wait_until(env, wait_until_time, &port_status_changed)) { - if (lf_tag_compare(lf_tag(env), tag_when_started_waiting) != 0) { - // Wait was not interrupted and we have committed to a new tag before we - // finished processing the list. Start over. - restart = true; - break; - } - /* Possibly useful for debugging: - tag_t current_tag = lf_tag(env); - lf_print("--------------------- FIXME: assuming absent! " PRINTF_TAG, current_tag.time - lf_time_start(), current_tag.microstep); - lf_print("--------------------- Lag is " PRINTF_TIME, current_tag.time - lf_time_physical()); - lf_print("--------------------- Wait until time is " PRINTF_TIME, wait_until_time - lf_time_start()); - */ - - // Wait went to completion. Mark any ports with this STAA that remain unknown as absent. - for (int j = 0; j < staa_elem->num_actions; ++j) { - lf_action_base_t* input_port_action = staa_elem->actions[j]; - if (input_port_action->trigger->status == unknown) { - input_port_action->trigger->status = absent; - LF_PRINT_DEBUG("Assuming port absent at time " PRINTF_TIME, lf_tag(env).time - start_time); - update_last_known_status_on_input_port(env, lf_tag(env), id_of_action(input_port_action)); - lf_cond_broadcast(&port_status_changed); - } - } - } else if (lf_tag_compare(lf_tag(env), tag_when_started_waiting) != 0) { - // Wait was interrupted and we have committed to a new tag before we - // finished processing the list. Start over. - restart = true; - break; - } + } else if (read_failed > 0) { + // EOF received. 
+ lf_print("Connection to the RTI closed with an EOF."); + _fed.socket_TCP_RTI = -1; + stop_all_traces(); + return NULL; + } + switch (buffer[0]) { + case MSG_TYPE_TAGGED_MESSAGE: + handle_tagged_message(&_fed.socket_TCP_RTI, -1); + break; + case MSG_TYPE_TAG_ADVANCE_GRANT: + handle_tag_advance_grant(); + break; + case MSG_TYPE_PROVISIONAL_TAG_ADVANCE_GRANT: + handle_provisional_tag_advance_grant(); + break; + case MSG_TYPE_STOP_REQUEST: + handle_stop_request_message(); + break; + case MSG_TYPE_STOP_GRANTED: + handle_stop_granted_message(); + break; + case MSG_TYPE_PORT_ABSENT: + handle_port_absent_message(&_fed.socket_TCP_RTI, -1); + break; + case MSG_TYPE_RESIGN: + handle_rti_resign_message(); + break; + case MSG_TYPE_CLOCK_SYNC_T1: + case MSG_TYPE_CLOCK_SYNC_T4: + lf_print_error("Federate %d received unexpected clock sync message from RTI on TCP socket.", + _lf_my_fed_id); + break; + default: + lf_print_error_and_exit("Received from RTI an unrecognized TCP message type: %hhx.", buffer[0]); + // Trace the event when tracing is enabled + tracepoint_federate_from_rti(_fed.trace, receive_UNIDENTIFIED, _lf_my_fed_id, NULL); } - if (restart) break; // No need to check the rest of the STAAs. - } - if (restart) continue; // No need to wait for a new tag. - - // Wait until we progress to a new tag. - while (lf_tag_compare(lf_tag(env), tag_when_started_waiting) == 0) { - // The following will release the env->mutex while waiting. - lf_cond_wait(&logical_time_changed); - } } + return NULL; } /** - * @brief Spawn a thread to iterate through STAA structs, setting their associated ports absent - * at an offset if the port is not present with a value by a certain physical time. + * Modify the specified tag, if necessary, to be an earlier tag based + * on the current physical time. The earlier tag is necessary if this federate + * has downstream federates and also has physical actions that may trigger + * outputs. 
In that case, the earlier tag will be the current physical time + * plus the minimum delay on all such physical actions plus any other delays + * along the path from the triggering physical action to the output port + * minus one nanosecond. The modified tag is assured of being less than any + * output tag that might later be produced. + * @param tag A pointer to the proposed NET. + * @return True if this federate requires this modification and the tag was + * modified. */ -void spawn_staa_thread(){ - lf_thread_create(&_fed.staaSetter, update_ports_from_staa_offsets, NULL); +static bool bounded_NET(tag_t* tag) { + // The tag sent by this function is a promise that, absent + // inputs from another federate, this federate will not produce events + // earlier than t. But if there are downstream federates and there is + // a physical action (not counting receivers from upstream federates), + // then we can only promise up to current physical time (plus the minimum + // of all minimum delays on the physical actions). + // In this case, we send a NET message with the current physical time + // to permit downstream federates to advance. To avoid + // overwhelming the network, this NET message should be sent periodically + // at specified intervals controlled by the target parameter + // coordination-options: {advance-message-interval: time units}. + // The larger the interval, the more downstream federates will lag + // behind real time, but the less network traffic. If this option is + // missing, we issue a warning message suggesting that a redesign + // might be in order so that outputs don't depend on physical actions. + LF_PRINT_DEBUG("Checking NET to see whether it should be bounded by physical time." 
+ " Min delay from physical action: " PRINTF_TIME ".", + _fed.min_delay_from_physical_action_to_federate_output); + if (_fed.min_delay_from_physical_action_to_federate_output >= 0LL + && _fed.has_downstream + ) { + // There is a physical action upstream of some output from this + // federate, and there is at least one downstream federate. + // Compare the tag to the current physical time. + instant_t physical_time = lf_time_physical(); + if (physical_time + _fed.min_delay_from_physical_action_to_federate_output < tag->time) { + // Can only promise up and not including this new time: + tag->time = physical_time + _fed.min_delay_from_physical_action_to_federate_output - 1L; + tag->microstep = 0; + LF_PRINT_LOG("Has physical actions that bound NET to " PRINTF_TAG ".", + tag->time - start_time, tag->microstep); + return true; + } + } + return false; } -#endif + +////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////// +// Public functions (declared in reactor.h) +// An empty version of this function is code generated for unfederated execution. /** - * Handle a provisional tag advance grant (PTAG) message from the RTI. - * This updates the last known TAG/PTAG and broadcasts - * a notification of this update, which may unblock whichever worker - * thread is trying to advance time. - * If current_time is less than the specified PTAG, then this will - * also insert into the event_q a dummy event with the specified tag. - * This will ensure that the federate advances time to the specified - * tag and, for centralized coordination, stimulates null-message-sending - * output reactions at that tag. - * - * @note This function is similar to handle_tag_advance_grant() except that - * it sets last_TAG_was_provisional to true and also it does not update the - * last known tag for input ports. 
+ * Close sockets used to communicate with other federates, if they are open, + * and send a MSG_TYPE_RESIGN message to the RTI. This implements the function + * defined in reactor.h. For unfederated execution, the code generator + * generates an empty implementation. + * @param env The environment of the federate */ -void handle_provisional_tag_advance_grant() { - // Environment is always the one corresponding to the top-level scheduling enclave. - environment_t *env; - _lf_get_environments(&env); +void terminate_execution(environment_t* env) { + assert(env != GLOBAL_ENVIRONMENT); - size_t bytes_to_read = sizeof(instant_t) + sizeof(microstep_t); - unsigned char buffer[bytes_to_read]; - read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, - "Failed to read provisional tag advance grant from RTI."); - tag_t PTAG = extract_tag(buffer); + // For an abnormal termination (e.g. a SIGINT), we need to send a + // MSG_TYPE_RESIGN message to the RTI, but we should not acquire a mutex. + if (_fed.socket_TCP_RTI >= 0) { + if (_lf_normal_termination) { + LF_MUTEX_LOCK(lf_outbound_socket_mutex); + send_resign_signal(env); + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); + // Trace the event when tracing is enabled + tracepoint_federate_to_rti(_fed.trace, send_RESIGN, _lf_my_fed_id, &env->current_tag); + } else { + // Do not acquire mutex and do not trace. + send_resign_signal(env); + } + } - // Trace the event when tracing is enabled - tracepoint_federate_from_rti(_fed.trace, receive_PTAG, _lf_my_fed_id, &PTAG); + LF_PRINT_DEBUG("Closing incoming P2P sockets."); + // Close any incoming P2P sockets that are still open. + for (int i=0; i < NUMBER_OF_FEDERATES; i++) { + close_inbound_socket(i, 1); + // Ignore errors. Mark the socket closed. 
+ _fed.sockets_for_inbound_p2p_connections[i] = -1; + } - // Note: it is important that last_known_status_tag of ports does not - // get updated to a PTAG value because a PTAG does not indicate that - // the RTI knows about the status of all ports up to and _including_ - // the value of PTAG. Only a TAG message indicates that. - LF_MUTEX_LOCK(env->mutex); + // Check for all outgoing physical connections in + // _fed.sockets_for_outbound_p2p_connections and + // if the socket ID is not -1, the connection is still open. + // Send an EOF by closing the socket here. + for (int i=0; i < NUMBER_OF_FEDERATES; i++) { - // Sanity check - if (lf_tag_compare(PTAG, _fed.last_TAG) < 0 - || (lf_tag_compare(PTAG, _fed.last_TAG) == 0 && !_fed.is_last_TAG_provisional)) { - LF_MUTEX_UNLOCK(env->mutex); - lf_print_error_and_exit("Received a PTAG " PRINTF_TAG " that is equal or earlier " - "than an already received TAG " PRINTF_TAG ".", - PTAG.time, PTAG.microstep, - _fed.last_TAG.time, _fed.last_TAG.microstep); + // Close outbound connections, in case they have not closed themselves. + // This will result in EOF being sent to the remote federate, except for + // abnormal termination, in which case it will just close the socket. + int flag = _lf_normal_termination? 1 : -1; + close_outbound_socket(i, flag); } - _fed.last_TAG = PTAG; - _fed.is_last_TAG_provisional = true; - LF_PRINT_LOG("At tag " PRINTF_TAG ", received Provisional Tag Advance Grant (PTAG): " PRINTF_TAG ".", - env->current_tag.time - start_time, env->current_tag.microstep, - _fed.last_TAG.time - start_time, _fed.last_TAG.microstep); + LF_PRINT_DEBUG("Waiting for inbound p2p socket listener threads."); + // Wait for each inbound socket listener thread to close. 
+ if (_fed.number_of_inbound_p2p_connections > 0 && _fed.inbound_socket_listeners != NULL) { + LF_PRINT_LOG("Waiting for %zu threads listening for incoming messages to exit.", + _fed.number_of_inbound_p2p_connections); + for (int i=0; i < _fed.number_of_inbound_p2p_connections; i++) { + // Ignoring errors here. + lf_thread_join(_fed.inbound_socket_listeners[i], NULL); + } + } - // Even if we don't modify the event queue, we need to broadcast a change - // because we do not need to continue to wait for a TAG. - lf_cond_broadcast(&env->event_q_changed); - // Notify level advance thread which is blocked. - update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); - lf_cond_broadcast(&port_status_changed); + LF_PRINT_DEBUG("Waiting for RTI's socket listener threads."); + // Wait for the thread listening for messages from the RTI to close. + lf_thread_join(_fed.RTI_socket_listener, NULL); - // Possibly insert a dummy event into the event queue if current time is behind - // (which it should be). Do not do this if the federate has not fully - // started yet. + // For abnormal termination, there is no need to free memory. + if (_lf_normal_termination) { + LF_PRINT_DEBUG("Freeing memory occupied by the federate."); + free(_fed.inbound_socket_listeners); + free(federation_metadata.rti_host); + free(federation_metadata.rti_user); + } +} - instant_t dummy_event_time = PTAG.time; - microstep_t dummy_event_relative_microstep = PTAG.microstep; - if (lf_tag_compare(env->current_tag, PTAG) == 0) { - // The current tag can equal the PTAG if we are at the start time - // or if this federate has been able to advance time to the current - // tag (e.g., it has no upstream federates). In either case, either - // it is already treating the current tag as PTAG cycle (e.g. at the - // start time) or it will be completing the current cycle and sending - // a LTC message shortly. In either case, there is nothing more to do. 
- LF_MUTEX_UNLOCK(env->mutex); - return; - } else if (lf_tag_compare(env->current_tag, PTAG) > 0) { - // Current tag is greater than the PTAG. - // It could be that we have sent an LTC that crossed with the incoming - // PTAG or that we have advanced to a tag greater than the PTAG. - // In the former case, there is nothing more to do. - // In the latter case, we may be blocked processing a PTAG cycle at - // a greater tag or we may be in the middle of processing a regular - // TAG. In either case, we know that at the PTAG tag, all outputs - // have either been sent or are absent, so we can send an LTC. - // Send an LTC to indicate absent outputs. - _lf_logical_tag_complete(PTAG); - // Nothing more to do. - LF_MUTEX_UNLOCK(env->mutex); - return; - } else if (PTAG.time == env->current_tag.time) { - // We now know env->current_tag < PTAG, but the times are equal. - // Adjust the microstep for scheduling the dummy event. - dummy_event_relative_microstep -= env->current_tag.microstep; - } - // We now know env->current_tag < PTAG. +////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////// +// Public functions (declared in federate.h, in alphabetical order) - if (dummy_event_time != FOREVER) { - // Schedule a dummy event at the specified time and (relative) microstep. - LF_PRINT_DEBUG("At tag " PRINTF_TAG ", inserting into the event queue a dummy event " - "with time " PRINTF_TIME " and (relative) microstep " PRINTF_MICROSTEP ".", - env->current_tag.time - start_time, env->current_tag.microstep, - dummy_event_time - start_time, dummy_event_relative_microstep); - // Dummy event points to a NULL trigger and NULL real event. 
- event_t* dummy = _lf_create_dummy_events(env, - NULL, dummy_event_time, NULL, dummy_event_relative_microstep); - pqueue_insert(env->event_q, dummy); - } +void lf_connect_to_federate(uint16_t remote_federate_id) { + int result = -1; + int count_retries = 0; - LF_MUTEX_UNLOCK(env->mutex); -} + // Ask the RTI for port number of the remote federate. + // The buffer is used for both sending and receiving replies. + // The size is what is needed for receiving replies. + unsigned char buffer[sizeof(int32_t) + INET_ADDRSTRLEN + 1]; + int port = -1; + struct in_addr host_ip_addr; + int count_tries = 0; + while (port == -1 && !_lf_termination_executed) { + buffer[0] = MSG_TYPE_ADDRESS_QUERY; + // NOTE: Sending messages in little endian. + encode_uint16(remote_federate_id, &(buffer[1])); -/** - * Send a MSG_TYPE_STOP_REQUEST message to the RTI with payload equal - * to the specified tag plus one microstep. If this federate has previously - * received a stop request from the RTI, then do not send the message and - * return 1. Return -1 if the socket is disconnected. Otherwise, return 0. - * @return 0 if the message is sent. 
- */ -int _lf_fd_send_stop_request_to_rti(tag_t stop_tag) { + LF_PRINT_DEBUG("Sending address query for federate %d.", remote_federate_id); + // Trace the event when tracing is enabled + tracepoint_federate_to_rti(_fed.trace, send_ADR_QR, _lf_my_fed_id, NULL); - // Send a stop request with the specified tag to the RTI - unsigned char buffer[MSG_TYPE_STOP_REQUEST_LENGTH]; - // Stop at the next microstep - stop_tag.microstep++; - ENCODE_STOP_REQUEST(buffer, stop_tag.time, stop_tag.microstep); + LF_MUTEX_LOCK(lf_outbound_socket_mutex); + write_to_socket_fail_on_error( + &_fed.socket_TCP_RTI, sizeof(uint16_t) + 1, buffer, &lf_outbound_socket_mutex, + "Failed to send address query for federate %d to RTI.", + remote_federate_id); + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); - LF_MUTEX_LOCK(outbound_socket_mutex); - // Do not send a stop request if a stop request has been previously received from the RTI. - if (!_fed.received_stop_request_from_rti) { - LF_PRINT_LOG("Sending to RTI a MSG_TYPE_STOP_REQUEST message with tag " PRINTF_TAG ".", - stop_tag.time - start_time, - stop_tag.microstep); + // Read RTI's response. + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, sizeof(int32_t) + 1, buffer, NULL, + "Failed to read the requested port number for federate %d from RTI.", + remote_federate_id); + + if (buffer[0] != MSG_TYPE_ADDRESS_QUERY) { + // Unexpected reply. Could be that RTI has failed and sent a resignation. + if (buffer[0] == MSG_TYPE_RESIGN) { + lf_print_error_and_exit("RTI has resigned."); + } else { + lf_print_error_and_exit("Unexpected reply of type %hhu from RTI (see net_common.h).", buffer[0]); + } + } + port = extract_int32(&buffer[1]); + + read_from_socket_fail_on_error( + &_fed.socket_TCP_RTI, sizeof(host_ip_addr), (unsigned char*)&host_ip_addr, NULL, + "Failed to read the IP address for federate %d from RTI.", + remote_federate_id); - if (_fed.socket_TCP_RTI < 0) { - lf_print_warning("Socket is no longer connected. 
Dropping message."); - LF_MUTEX_UNLOCK(outbound_socket_mutex); - return -1; + // A reply of -1 for the port means that the RTI does not know + // the port number of the remote federate, presumably because the + // remote federate has not yet sent an MSG_TYPE_ADDRESS_ADVERTISEMENT message to the RTI. + // Sleep for some time before retrying. + if (port == -1) { + if (count_tries++ >= CONNECT_MAX_RETRIES) { + lf_print_error_and_exit("TIMEOUT obtaining IP/port for federate %d from the RTI.", + remote_federate_id); + } + // Wait ADDRESS_QUERY_RETRY_INTERVAL nanoseconds. + lf_sleep(ADDRESS_QUERY_RETRY_INTERVAL); } - write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, MSG_TYPE_STOP_REQUEST_LENGTH, - buffer, &outbound_socket_mutex, - "Failed to send stop time " PRINTF_TIME " to the RTI.", stop_tag.time - start_time); - - // Treat this sending as equivalent to having received a stop request from the RTI. - _fed.received_stop_request_from_rti = true; - LF_MUTEX_UNLOCK(outbound_socket_mutex); - // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_STOP_REQ, _lf_my_fed_id, &stop_tag); - return 0; - } else { - LF_MUTEX_UNLOCK(outbound_socket_mutex); - return 1; } -} + assert(port < 65536); + assert(port > 0); + uint16_t uport = (uint16_t)port; -/** - * Handle a MSG_TYPE_STOP_GRANTED message from the RTI. - * - * This function removes the global barrier on - * logical time raised when lf_request_stop() was - * called in the environment for each enclave. - */ -void handle_stop_granted_message() { +#if LOG_LEVEL > 3 + // Print the received IP address in a human readable format + // Create the human readable format of the received address. + // This is avoided unless LOG_LEVEL is high enough to + // subdue the overhead caused by inet_ntop(). 
+ char hostname[INET_ADDRSTRLEN]; + inet_ntop(AF_INET, &host_ip_addr, hostname, INET_ADDRSTRLEN); + LF_PRINT_LOG("Received address %s port %d for federate %d from RTI.", + hostname, uport, remote_federate_id); +#endif - size_t bytes_to_read = MSG_TYPE_STOP_GRANTED_LENGTH - 1; - unsigned char buffer[bytes_to_read]; - read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, - "Failed to read stop granted from RTI."); + // Iterate until we either successfully connect or exceed the number of + // attempts given by CONNECT_MAX_RETRIES. + int socket_id = -1; + while (result < 0 && !_lf_termination_executed) { + // Create an IPv4 socket for TCP (not UDP) communication over IP (0). + socket_id = create_real_time_tcp_socket_errexit(); - tag_t received_stop_tag = extract_tag(buffer); + // Server file descriptor. + struct sockaddr_in server_fd; + // Zero out the server_fd struct. + bzero((char*)&server_fd, sizeof(server_fd)); - // Trace the event when tracing is enabled - tracepoint_federate_from_rti(_fed.trace, receive_STOP_GRN, _lf_my_fed_id, &received_stop_tag); + // Set up the server_fd fields. + server_fd.sin_family = AF_INET; // IPv4 + server_fd.sin_addr = host_ip_addr; // Received from the RTI - LF_PRINT_LOG("Received from RTI a MSG_TYPE_STOP_GRANTED message with elapsed tag " PRINTF_TAG ".", - received_stop_tag.time - start_time, received_stop_tag.microstep); + // Convert the port number from host byte order to network byte order. + server_fd.sin_port = htons(uport); + result = connect( + socket_id, + (struct sockaddr *)&server_fd, + sizeof(server_fd)); - environment_t *env; - int num_environments = _lf_get_environments(&env); + if (result != 0) { + lf_print_error("Failed to connect to federate %d on port %d.", remote_federate_id, uport); - for (int i = 0; i < num_environments; i++) { - LF_MUTEX_LOCK(env[i].mutex); + // Try again after some time if the connection failed. 
+ // Note that this should not really happen since the remote federate should be + // accepting socket connections. But possibly it will be busy (in process of accepting + // another socket connection?). Hence, we retry. + count_retries++; + if (count_retries > CONNECT_MAX_RETRIES) { + // If the remote federate is not accepting the connection after CONNECT_MAX_RETRIES + // treat it as a soft error condition and return. + lf_print_error("Failed to connect to federate %d after %d retries. Giving up.", + remote_federate_id, CONNECT_MAX_RETRIES); + return; + } + lf_print_warning("Could not connect to federate %d. Will try again every %lld nanoseconds.\n", + remote_federate_id, ADDRESS_QUERY_RETRY_INTERVAL); + + // Check whether the RTI is still there. + if (rti_resigned()) break; - // Sanity check. - if (lf_tag_compare(received_stop_tag, env[i].current_tag) <= 0) { - lf_print_error("RTI granted a MSG_TYPE_STOP_GRANTED tag that is equal to or less than this federate's current tag " PRINTF_TAG ". " - "Stopping at the next microstep instead.", - env[i].current_tag.time - start_time, env[i].current_tag.microstep); - received_stop_tag = env[i].current_tag; - received_stop_tag.microstep++; + // Wait ADDRESS_QUERY_RETRY_INTERVAL nanoseconds. + lf_sleep(ADDRESS_QUERY_RETRY_INTERVAL); + } else { + // Connect was successful. + size_t buffer_length = 1 + sizeof(uint16_t) + 1; + unsigned char buffer[buffer_length]; + buffer[0] = MSG_TYPE_P2P_SENDING_FED_ID; + if (_lf_my_fed_id > UINT16_MAX) { + // This error is very unlikely to occur. + lf_print_error_and_exit("Too many federates! 
More than %d.", UINT16_MAX); + } + encode_uint16((uint16_t)_lf_my_fed_id, (unsigned char*)&(buffer[1])); + unsigned char federation_id_length = (unsigned char)strnlen(federation_metadata.federation_id, 255); + buffer[sizeof(uint16_t) + 1] = federation_id_length; + // Trace the event when tracing is enabled + tracepoint_federate_to_federate(_fed.trace, send_FED_ID, _lf_my_fed_id, remote_federate_id, NULL); + + // No need for a mutex because we have the only handle on the socket. + write_to_socket_fail_on_error(&socket_id, + buffer_length, buffer, NULL, + "Failed to send fed_id to federate %d.", remote_federate_id); + write_to_socket_fail_on_error(&socket_id, + federation_id_length, (unsigned char*)federation_metadata.federation_id, NULL, + "Failed to send federation id to federate %d.", + remote_federate_id); + + read_from_socket_fail_on_error(&socket_id, 1, (unsigned char*)buffer, NULL, + "Failed to read MSG_TYPE_ACK from federate %d in response to sending fed_id.", + remote_federate_id); + if (buffer[0] != MSG_TYPE_ACK) { + // Get the error code. + read_from_socket_fail_on_error(&socket_id, 1, (unsigned char*)buffer, NULL, + "Failed to read error code from federate %d in response to sending fed_id.", remote_federate_id); + lf_print_error("Received MSG_TYPE_REJECT message from remote federate (%d).", buffer[0]); + result = -1; + continue; + } else { + lf_print("Connected to federate %d, port %d.", remote_federate_id, port); + // Trace the event when tracing is enabled + tracepoint_federate_to_federate(_fed.trace, receive_ACK, _lf_my_fed_id, remote_federate_id, NULL); + } } + } + // Once we set this variable, then all future calls to close() on this + // socket ID should reset it to -1 within a critical section. 
+ _fed.sockets_for_outbound_p2p_connections[remote_federate_id] = socket_id; +} - _lf_set_stop_tag(&env[i], received_stop_tag); - LF_PRINT_DEBUG("Setting the stop tag to " PRINTF_TAG ".", - env[i].stop_tag.time - start_time, - env[i].stop_tag.microstep); +void lf_connect_to_rti(const char* hostname, int port) { + LF_PRINT_LOG("Connecting to the RTI."); - if (env[i].barrier.requestors) _lf_decrement_tag_barrier_locked(&env[i]); - lf_cond_broadcast(&env[i].event_q_changed); - LF_MUTEX_UNLOCK(env[i].mutex); + // Override passed hostname and port if passed as runtime arguments. + hostname = federation_metadata.rti_host ? federation_metadata.rti_host : hostname; + port = federation_metadata.rti_port >= 0 ? federation_metadata.rti_port : port; + + // Adjust the port. + uint16_t uport = 0; + if (port < 0 || port > INT16_MAX) { + lf_print_error( + "lf_connect_to_rti(): Specified port (%d) is out of range," + " using the default port %d instead.", + port, DEFAULT_PORT + ); + uport = DEFAULT_PORT; + port = 0; // Mark so that increments occur between tries. + } else { + uport = (uint16_t)port; + } + if (uport == 0) { + uport = DEFAULT_PORT; } -} -/** - * Handle a MSG_TYPE_STOP_REQUEST message from the RTI. 
- */ -void handle_stop_request_message() { - size_t bytes_to_read = MSG_TYPE_STOP_REQUEST_LENGTH - 1; - unsigned char buffer[bytes_to_read]; - read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_read, buffer, NULL, - "Failed to read stop request from RTI."); - tag_t tag_to_stop = extract_tag(buffer); + // Create a socket + _fed.socket_TCP_RTI = create_real_time_tcp_socket_errexit(); - // Trace the event when tracing is enabled - tracepoint_federate_from_rti(_fed.trace, receive_STOP_REQ, _lf_my_fed_id, &tag_to_stop); - LF_PRINT_LOG("Received from RTI a MSG_TYPE_STOP_REQUEST signal with tag " PRINTF_TAG ".", - tag_to_stop.time - start_time, - tag_to_stop.microstep); + int result = -1; + int count_retries = 0; + struct addrinfo* res = NULL; - extern lf_mutex_t global_mutex; - extern bool lf_stop_requested; - bool already_blocked = false; + while (count_retries++ < CONNECT_MAX_RETRIES && !_lf_termination_executed) { + if (res != NULL) { + // This is a repeated attempt. + if (_fed.socket_TCP_RTI >= 0) close_rti_socket(); - LF_MUTEX_LOCK(global_mutex); - if (lf_stop_requested) { - LF_PRINT_LOG("Ignoring MSG_TYPE_STOP_REQUEST from RTI because lf_request_stop has been called locally."); - already_blocked = true; - } - // Treat the stop request from the RTI as if a local stop request had been received. - lf_stop_requested = true; - LF_MUTEX_UNLOCK(global_mutex); + lf_sleep(CONNECT_RETRY_INTERVAL); - // If we have previously received from the RTI a stop request, - // or we have previously sent a stop request to the RTI, - // then we have already blocked tag advance in enclaves. - // Do not do this twice. The record of whether the first has occurred - // is guarded by the outbound socket mutex. - // The second is guarded by the global mutex. - // Note that the RTI should not send stop requests more than once to federates. - LF_MUTEX_LOCK(outbound_socket_mutex); - if (_fed.received_stop_request_from_rti) { - LF_PRINT_LOG("Redundant MSG_TYPE_STOP_REQUEST from RTI. 
Ignoring it."); - already_blocked = true; - } else if (!already_blocked) { - // Do this only if lf_request_stop has not been called because it will - // prevent lf_request_stop from sending. - _fed.received_stop_request_from_rti = true; - } - LF_MUTEX_UNLOCK(outbound_socket_mutex); + // Create a new socket. + _fed.socket_TCP_RTI = create_real_time_tcp_socket_errexit(); - if (already_blocked) { - // Either we have sent a stop request to the RTI ourselves, - // or we have previously received a stop request from the RTI. - // Nothing more to do. Tag advance is already blocked on enclaves. - return; - } + if (port == 0) { + // Free previously allocated address info. + freeaddrinfo(res); + // Increment the port number. + uport++; + if (uport >= DEFAULT_PORT + MAX_NUM_PORT_ADDRESSES) uport = DEFAULT_PORT; + + // Reconstruct the address info. + rti_address(hostname, uport, &res); + } + lf_print("Trying RTI again on port %d (attempt %d).", uport, count_retries); + } else { + // This is the first attempt. + rti_address(hostname, uport, &res); + } + + result = connect(_fed.socket_TCP_RTI, res->ai_addr, res->ai_addrlen); + if (result < 0) continue; // Connect failed. + + // Have connected to an RTI, but not sure it's the right RTI. + // Send a MSG_TYPE_FED_IDS message and wait for a reply. + // Notify the RTI of the ID of this federate and its federation. - // Iterate over the scheduling enclaves to find the maximum current tag - // and adjust the tag_to_stop if any of those is greater than tag_to_stop. - // If not done previously, block tag advance in the enclave. - environment_t *env; - int num_environments = _lf_get_environments(&env); - for (int i = 0; i < num_environments; i++) { - LF_MUTEX_LOCK(env[i].mutex); - if (lf_tag_compare(tag_to_stop, env[i].current_tag) <= 0) { - // Can't stop at the requested tag. Make a counteroffer. - tag_to_stop = env->current_tag; - tag_to_stop.microstep++; +#ifdef FEDERATED_AUTHENTICATED + LF_PRINT_LOG("Connected to an RTI. 
Performing HMAC-based authentication using federation ID."); + if (perform_hmac_authentication()) { + if (port == 0) { + continue; // Try again with a new port. + } else { + // No point in trying again because it will be the same port. + close_rti_socket(); + lf_print_error_and_exit("Authentication failed."); + } } - // Set a barrier to prevent the enclave from advancing past the so-far tag to stop. - _lf_increment_tag_barrier_locked(&env[i], tag_to_stop); +#else + LF_PRINT_LOG("Connected to an RTI. Sending federation ID for authentication."); +#endif - LF_MUTEX_UNLOCK(env[i].mutex); - } - // Send the reply, which is the least tag at which we can stop. - unsigned char outgoing_buffer[MSG_TYPE_STOP_REQUEST_REPLY_LENGTH]; - ENCODE_STOP_REQUEST_REPLY(outgoing_buffer, tag_to_stop.time, tag_to_stop.microstep); + // Send the message type first. + unsigned char buffer[4]; + buffer[0] = MSG_TYPE_FED_IDS; + // Next send the federate ID. + if (_lf_my_fed_id > UINT16_MAX) { + lf_print_error_and_exit("Too many federates! More than %d.", UINT16_MAX); + } + encode_uint16((uint16_t)_lf_my_fed_id, &buffer[1]); + // Next send the federation ID length. + // The federation ID is limited to 255 bytes. + size_t federation_id_length = strnlen(federation_metadata.federation_id, 255); + buffer[1 + sizeof(uint16_t)] = (unsigned char)(federation_id_length & 0xff); - // Send the current logical time to the RTI. 
- LF_MUTEX_LOCK(outbound_socket_mutex); - write_to_socket_fail_on_error( - &_fed.socket_TCP_RTI, MSG_TYPE_STOP_REQUEST_REPLY_LENGTH, outgoing_buffer, &outbound_socket_mutex, - "Failed to send the answer to MSG_TYPE_STOP_REQUEST to RTI."); - LF_MUTEX_UNLOCK(outbound_socket_mutex); + // Trace the event when tracing is enabled + tracepoint_federate_to_rti(_fed.trace, send_FED_ID, _lf_my_fed_id, NULL); - LF_PRINT_DEBUG("Sent MSG_TYPE_STOP_REQUEST_REPLY to RTI with tag " PRINTF_TAG, - tag_to_stop.time, tag_to_stop.microstep); - // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_STOP_REQ_REP, _lf_my_fed_id, &tag_to_stop); -} + // No need for a mutex here because no other threads are writing to this socket. + if (write_to_socket(_fed.socket_TCP_RTI, 2 + sizeof(uint16_t), buffer)) { + continue; // Try again, possibly on a new port. + } -/** - * Send a resign signal to the RTI. The tag payload will be the current - * tag of the specified environment or, if there has been an error that - * will lead to an abnormal termination, the tag NEVER_TAG. - */ -static void send_resign_signal(environment_t* env) { - size_t bytes_to_write = 1 + sizeof(tag_t); - unsigned char buffer[bytes_to_write]; - buffer[0] = MSG_TYPE_RESIGN; - if (_lf_normal_termination) { - encode_tag(&(buffer[1]), env->current_tag); - } else { - encode_tag(&(buffer[1]), NEVER_TAG); - } - LF_MUTEX_LOCK(outbound_socket_mutex); - write_to_socket_fail_on_error( - &_fed.socket_TCP_RTI, bytes_to_write, &(buffer[0]), &outbound_socket_mutex, - "Failed to send RESIGN."); - LF_MUTEX_UNLOCK(outbound_socket_mutex); - LF_PRINT_LOG("Resigned."); -} + // Next send the federation ID itself. + if (write_to_socket( + _fed.socket_TCP_RTI, + federation_id_length, + (unsigned char*)federation_metadata.federation_id)) { + continue; // Try again. + } -/** - * Close sockets used to communicate with other federates, if they are open, - * and send a MSG_TYPE_RESIGN message to the RTI. 
This implements the function - * defined in reactor.h. For unfederated execution, the code generator - * generates an empty implementation. - * @param env The environment of the federate - */ -void terminate_execution(environment_t* env) { - assert(env != GLOBAL_ENVIRONMENT); + // Wait for a response. + // The response will be MSG_TYPE_REJECT if the federation ID doesn't match. + // Otherwise, it will be either MSG_TYPE_ACK or MSG_TYPE_UDP_PORT, where the latter + // is used if clock synchronization will be performed. + unsigned char response; - // For an abnormal termination (e.g. a SIGINT), we need to send a - // MSG_TYPE_RESIGN message to the RTI, but we should not acquire a mutex. - if (_fed.socket_TCP_RTI >= 0) { - if (_lf_normal_termination) { - LF_MUTEX_LOCK(outbound_socket_mutex); - send_resign_signal(env); - LF_MUTEX_UNLOCK(outbound_socket_mutex); + LF_PRINT_DEBUG("Waiting for response to federation ID from the RTI."); + + if (read_from_socket(_fed.socket_TCP_RTI, 1, &response)) { + continue; // Try again. + } + if (response == MSG_TYPE_REJECT) { // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_RESIGN, _lf_my_fed_id, &env->current_tag); + tracepoint_federate_from_rti(_fed.trace, receive_REJECT, _lf_my_fed_id, NULL); + // Read one more byte to determine the cause of rejection. + unsigned char cause; + read_from_socket_fail_on_error(&_fed.socket_TCP_RTI, 1, &cause, NULL, + "Failed to read the cause of rejection by the RTI."); + if (cause == FEDERATION_ID_DOES_NOT_MATCH || cause == WRONG_SERVER) { + lf_print_warning("Connected to the wrong RTI on port %d. Will try again", uport); + continue; + } + } else if (response == MSG_TYPE_ACK) { + // Trace the event when tracing is enabled + tracepoint_federate_from_rti(_fed.trace, receive_ACK, _lf_my_fed_id, NULL); + LF_PRINT_LOG("Received acknowledgment from the RTI."); + break; + } else if (response == MSG_TYPE_RESIGN) { + lf_print_warning("RTI on port %d resigned. 
Will try again", uport); + continue; } else { - // Do not acquire mutex and do not trace. - send_resign_signal(env); + lf_print_warning("RTI on port %d gave unexpect response %u. Will try again", uport, response); + continue; } } - - LF_PRINT_DEBUG("Closing incoming P2P sockets."); - // Close any incoming P2P sockets that are still open. - for (int i=0; i < NUMBER_OF_FEDERATES; i++) { - _lf_close_inbound_socket(i, 1); - // Ignore errors. Mark the socket closed. - _fed.sockets_for_inbound_p2p_connections[i] = -1; + if (result < 0) { + lf_print_error_and_exit("Failed to connect to RTI after %d tries.", CONNECT_MAX_RETRIES); } - // Check for all outgoing physical connections in - // _fed.sockets_for_outbound_p2p_connections and - // if the socket ID is not -1, the connection is still open. - // Send an EOF by closing the socket here. - for (int i=0; i < NUMBER_OF_FEDERATES; i++) { + freeaddrinfo(res); /* No longer needed */ - // Close outbound connections, in case they have not closed themselves. - // This will result in EOF being sent to the remote federate, except for - // abnormal termination, in which case it will just close the socket. - int flag = _lf_normal_termination? 1 : -1; - _lf_close_outbound_socket(i, flag); - } + // Call a generated (external) function that sends information + // about connections between this federate and other federates + // where messages are routed through the RTI. + // @see MSG_TYPE_NEIGHBOR_STRUCTURE in net_common.h + lf_send_neighbor_structure_to_RTI(_fed.socket_TCP_RTI); - LF_PRINT_DEBUG("Waiting for inbound p2p socket listener threads."); - // Wait for each inbound socket listener thread to close. - if (_fed.number_of_inbound_p2p_connections > 0 && _fed.inbound_socket_listeners != NULL) { - LF_PRINT_LOG("Waiting for %zu threads listening for incoming messages to exit.", - _fed.number_of_inbound_p2p_connections); - for (int i=0; i < _fed.number_of_inbound_p2p_connections; i++) { - // Ignoring errors here. 
- lf_thread_join(_fed.inbound_socket_listeners[i], NULL); - } - } + uint16_t udp_port = setup_clock_synchronization_with_rti(); - LF_PRINT_DEBUG("Waiting for RTI's socket listener threads."); - // Wait for the thread listening for messages from the RTI to close. - lf_thread_join(_fed.RTI_socket_listener, NULL); + // Write the returned port number to the RTI + unsigned char UDP_port_number[1 + sizeof(uint16_t)]; + UDP_port_number[0] = MSG_TYPE_UDP_PORT; + encode_uint16(udp_port, &(UDP_port_number[1])); + write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, 1 + sizeof(uint16_t), UDP_port_number, NULL, + "Failed to send the UDP port number to the RTI."); - // For abnormal termination, there is no need to free memory. - if (_lf_normal_termination) { - LF_PRINT_DEBUG("Freeing memory occupied by the federate."); - free(_fed.inbound_socket_listeners); - free(federation_metadata.rti_host); - free(federation_metadata.rti_user); - } + lf_print("Connected to RTI at %s:%d.", hostname, uport); } -/** - * Thread that listens for inputs from other federates. - * This thread listens for messages of type MSG_TYPE_P2P_MESSAGE, - * MSG_TYPE_P2P_TAGGED_MESSAGE, or MSG_TYPE_PORT_ABSENT (@see net_common.h) from the specified - * peer federate and calls the appropriate handling function for - * each message type. If an error occurs or an EOF is received - * from the peer, then this procedure sets the corresponding - * socket in _fed.sockets_for_inbound_p2p_connections - * to -1 and returns, terminating the thread. - * @param _args The remote federate ID (cast to void*). - * @param fed_id_ptr A pointer to a uint16_t containing federate ID being listened to. - * This procedure frees the memory pointed to before returning. 
- */ -void* listen_to_federates(void* _args) { - uint16_t fed_id = (uint16_t)(uintptr_t)_args; - - LF_PRINT_LOG("Listening to federate %d.", fed_id); +void lf_create_server(int specified_port) { + assert(specified_port <= UINT16_MAX && specified_port >= 0); + uint16_t port = (uint16_t)specified_port; + LF_PRINT_LOG("Creating a socket server on port %d.", port); + // Create an IPv4 socket for TCP (not UDP) communication over IP (0). + int socket_descriptor = create_real_time_tcp_socket_errexit(); - int* socket_id = &_fed.sockets_for_inbound_p2p_connections[fed_id]; + // Server file descriptor. + struct sockaddr_in server_fd; + // Zero out the server address structure. + bzero((char*)&server_fd, sizeof(server_fd)); - // Buffer for incoming messages. - // This does not constrain the message size - // because the message will be put into malloc'd memory. - unsigned char buffer[FED_COM_BUFFER_SIZE]; + server_fd.sin_family = AF_INET; // IPv4 + server_fd.sin_addr.s_addr = INADDR_ANY; // All interfaces, 0.0.0.0. + // Convert the port number from host byte order to network byte order. + server_fd.sin_port = htons(port); - // Listen for messages from the federate. - while (1) { - // Read one byte to get the message type. - LF_PRINT_DEBUG("Waiting for a P2P message on socket %d.", *socket_id); - if (read_from_socket_close_on_error(socket_id, 1, buffer)) { - // Socket has been closed. - lf_print("Socket from federate %d is closed.", fed_id); - // Stop listening to this federate. 
- break; - } - LF_PRINT_DEBUG("Received a P2P message on socket %d of type %d.", - *socket_id, buffer[0]); - bool bad_message = false; - switch (buffer[0]) { - case MSG_TYPE_P2P_MESSAGE: - LF_PRINT_LOG("Received untimed message from federate %d.", fed_id); - handle_message(socket_id, fed_id); - break; - case MSG_TYPE_P2P_TAGGED_MESSAGE: - LF_PRINT_LOG("Received timed message from federate %d.", fed_id); - handle_tagged_message(socket_id, fed_id); - break; - case MSG_TYPE_PORT_ABSENT: - LF_PRINT_LOG("Received port absent message from federate %d.", fed_id); - handle_port_absent_message(socket_id, fed_id); - break; - default: - bad_message = true; - } - if (bad_message) { - // FIXME: Better error handling needed. - lf_print_error("Received erroneous message type: %d. Closing the socket.", buffer[0]); - // Trace the event when tracing is enabled - tracepoint_federate_from_federate(_fed.trace, receive_UNIDENTIFIED, _lf_my_fed_id, fed_id, NULL); - break; - } + int result = bind( + socket_descriptor, + (struct sockaddr *) &server_fd, + sizeof(server_fd)); + int count = 0; + while (result < 0 && count++ < PORT_BIND_RETRY_LIMIT) { + lf_sleep(PORT_BIND_RETRY_INTERVAL); + result = bind( + socket_descriptor, + (struct sockaddr *) &server_fd, + sizeof(server_fd)); + } + if (result < 0) { + lf_print_error_and_exit("Failed to bind socket on port %d.", port); } - return NULL; -} -/** - * @brief Stop the traces associated with all environments in the program. - */ -static void stop_all_traces() { - environment_t *env; - int num_envs = _lf_get_environments(&env); - for (int i = 0; i < num_envs; i++) { - stop_trace(env[i].trace); + // Set the global server port. + if (specified_port == 0) { + // Need to retrieve the port number assigned by the OS. 
+ struct sockaddr_in assigned; + socklen_t addr_len = sizeof(assigned); + if (getsockname(socket_descriptor, (struct sockaddr *) &assigned, &addr_len) < 0) { + lf_print_error_and_exit("Failed to retrieve assigned port number."); + } + _fed.server_port = ntohs(assigned.sin_port); + } else { + _fed.server_port = port; } + + // Enable listening for socket connections. + // The second argument is the maximum number of queued socket requests, + // which according to the Mac man page is limited to 128. + listen(socket_descriptor, 128); + + LF_PRINT_LOG("Server for communicating with other federates started using port %d.", _fed.server_port); + + // Send the server port number to the RTI + // on an MSG_TYPE_ADDRESS_ADVERTISEMENT message (@see net_common.h). + unsigned char buffer[sizeof(int32_t) + 1]; + buffer[0] = MSG_TYPE_ADDRESS_ADVERTISEMENT; + encode_int32(_fed.server_port, &(buffer[1])); + + // No need for a mutex because we have the only handle on this socket. + write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, sizeof(int32_t) + 1, (unsigned char*)buffer, NULL, + "Failed to send address advertisement."); + + // Trace the event when tracing is enabled + tracepoint_federate_to_rti(_fed.trace, send_ADR_AD, _lf_my_fed_id, NULL); + LF_PRINT_DEBUG("Sent port %d to the RTI.", _fed.server_port); + + // Set the global server socket + _fed.server_socket = socket_descriptor; } -/** - * Handle a resign signal from the RTI. The RTI will only resign - * if it is forced to exit, e.g. by a SIG_INT. Hence, this federate - * will exit immediately with an error condition, counting on the - * termination functions to handle any cleanup needed. - */ -void handle_rti_resign_message(void) { - exit(1); +void lf_enqueue_port_absent_reactions(environment_t* env){ + assert(env != GLOBAL_ENVIRONMENT); +#ifdef FEDERATED_CENTRALIZED + if (!_fed.has_downstream) { + // This federate is not connected to any downstream federates via a + // logical connection. 
No need to trigger port absent + // reactions. + return; + } +#endif + LF_PRINT_DEBUG("Enqueueing port absent reactions at time %lld.", (long long) (env->current_tag.time - start_time)); + if (num_port_absent_reactions == 0) { + LF_PRINT_DEBUG("No port absent reactions."); + return; + } + for (int i = 0; i < num_port_absent_reactions; i++) { + reaction_t* reaction = port_absent_reaction[i]; + if (reaction && reaction->status == inactive) { + LF_PRINT_DEBUG("Inserting port absent reaction on reaction queue."); + lf_scheduler_trigger_reaction(env->scheduler, reaction, -1); + } + } } -/** - * Thread that listens for TCP inputs from the RTI. - * When messages arrive, this calls the appropriate handler. - * @param args Ignored - */ -void* listen_to_rti_TCP(void* args) { - // Buffer for incoming messages. - // This does not constrain the message size - // because the message will be put into malloc'd memory. - unsigned char buffer[FED_COM_BUFFER_SIZE]; +void* lf_handle_p2p_connections_from_federates(void* env_arg) { + assert(env_arg); + environment_t* env = (environment_t *) env_arg; + int received_federates = 0; + // Allocate memory to store thread IDs. + _fed.inbound_socket_listeners = (lf_thread_t*)calloc(_fed.number_of_inbound_p2p_connections, sizeof(lf_thread_t)); + while (received_federates < _fed.number_of_inbound_p2p_connections && !_lf_termination_executed) { + // Wait for an incoming connection request. + struct sockaddr client_fd; + uint32_t client_length = sizeof(client_fd); + int socket_id = accept(_fed.server_socket, &client_fd, &client_length); - // Listen for messages from the federate. - while (1) { - // Check whether the RTI socket is still valid - if (_fed.socket_TCP_RTI < 0) { - lf_print_warning("Socket to the RTI unexpectedly closed."); - return NULL; - } - // Read one byte to get the message type. - // This will exit if the read fails. 
- int read_failed = read_from_socket(_fed.socket_TCP_RTI, 1, buffer); - if (read_failed < 0) { - if (errno == ECONNRESET) { - lf_print_error("Socket connection to the RTI was closed by the RTI without" - " properly sending an EOF first. Considering this a soft error."); - // FIXME: If this happens, possibly a new RTI must be elected. - _fed.socket_TCP_RTI = -1; - return NULL; + if (socket_id < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { + if (rti_resigned()) break; + else continue; // Try again. + } else if (errno == EPERM) { + lf_print_error_system_failure("Firewall permissions prohibit connection."); } else { - lf_print_error("Socket connection to the RTI has been broken with error %d: %s." - " The RTI should close connections with an EOF first." - " Considering this a soft error.", - errno, - strerror(errno)); - // FIXME: If this happens, possibly a new RTI must be elected. - _fed.socket_TCP_RTI = -1; - return NULL; + lf_print_error_system_failure("A fatal error occurred while accepting a new socket."); } - } else if (read_failed > 0) { - // EOF received. 
- lf_print("Connection to the RTI closed with an EOF."); - _fed.socket_TCP_RTI = -1; - stop_all_traces(); - return NULL; } - switch (buffer[0]) { - case MSG_TYPE_TAGGED_MESSAGE: - handle_tagged_message(&_fed.socket_TCP_RTI, -1); - break; - case MSG_TYPE_TAG_ADVANCE_GRANT: - handle_tag_advance_grant(); - break; - case MSG_TYPE_PROVISIONAL_TAG_ADVANCE_GRANT: - handle_provisional_tag_advance_grant(); - break; - case MSG_TYPE_STOP_REQUEST: - handle_stop_request_message(); - break; - case MSG_TYPE_STOP_GRANTED: - handle_stop_granted_message(); - break; - case MSG_TYPE_PORT_ABSENT: - handle_port_absent_message(&_fed.socket_TCP_RTI, -1); - break; - case MSG_TYPE_RESIGN: - handle_rti_resign_message(); - break; - case MSG_TYPE_CLOCK_SYNC_T1: - case MSG_TYPE_CLOCK_SYNC_T4: - lf_print_error("Federate %d received unexpected clock sync message from RTI on TCP socket.", - _lf_my_fed_id); - break; - default: - lf_print_error_and_exit("Received from RTI an unrecognized TCP message type: %hhx.", buffer[0]); + LF_PRINT_LOG("Accepted new connection from remote federate."); + + size_t header_length = 1 + sizeof(uint16_t) + 1; + unsigned char buffer[header_length]; + int read_failed = read_from_socket(socket_id, header_length, (unsigned char*)&buffer); + if (read_failed || buffer[0] != MSG_TYPE_P2P_SENDING_FED_ID) { + lf_print_warning("Federate received invalid first message on P2P socket. Closing socket."); + if (read_failed == 0) { + // Wrong message received. + unsigned char response[2]; + response[0] = MSG_TYPE_REJECT; + response[1] = WRONG_SERVER; // Trace the event when tracing is enabled - tracepoint_federate_from_rti(_fed.trace, receive_UNIDENTIFIED, _lf_my_fed_id, NULL); + tracepoint_federate_to_federate(_fed.trace, send_REJECT, _lf_my_fed_id, -3, NULL); + // Ignore errors on this response. + write_to_socket(socket_id, 2, response); + } + close(socket_id); + continue; + } + + // Get the federation ID and check it. 
+ unsigned char federation_id_length = buffer[header_length - 1]; + char remote_federation_id[federation_id_length]; + read_failed = read_from_socket(socket_id, federation_id_length, (unsigned char*)remote_federation_id); + if (read_failed || (strncmp(federation_metadata.federation_id, remote_federation_id, strnlen(federation_metadata.federation_id, 255)) != 0)) { + lf_print_warning("Received invalid federation ID. Closing socket."); + if (read_failed == 0) { + unsigned char response[2]; + response[0] = MSG_TYPE_REJECT; + response[1] = FEDERATION_ID_DOES_NOT_MATCH; + // Trace the event when tracing is enabled + tracepoint_federate_to_federate(_fed.trace, send_REJECT, _lf_my_fed_id, -3, NULL); + // Ignore errors on this response. + write_to_socket(socket_id, 2, response); + } + close(socket_id); + continue; + } + + // Extract the ID of the sending federate. + uint16_t remote_fed_id = extract_uint16((unsigned char*)&(buffer[1])); + LF_PRINT_DEBUG("Received sending federate ID %d.", remote_fed_id); + + // Trace the event when tracing is enabled + tracepoint_federate_to_federate(_fed.trace, receive_FED_ID, _lf_my_fed_id, remote_fed_id, NULL); + + // Once we record the socket_id here, all future calls to close() on + // the socket should be done while holding the socket_mutex, and this array + // element should be reset to -1 during that critical section. + // Otherwise, there can be race condition where, during termination, + // two threads attempt to simultaneously close the socket. + _fed.sockets_for_inbound_p2p_connections[remote_fed_id] = socket_id; + + // Send an MSG_TYPE_ACK message. 
+ unsigned char response = MSG_TYPE_ACK; + + LF_MUTEX_LOCK(lf_outbound_socket_mutex); + write_to_socket_fail_on_error( + &_fed.sockets_for_inbound_p2p_connections[remote_fed_id], + 1, (unsigned char*)&response, + &lf_outbound_socket_mutex, + "Failed to write MSG_TYPE_ACK in response to federate %d.", + remote_fed_id); + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); + + // Trace the event when tracing is enabled + tracepoint_federate_to_federate(_fed.trace, send_ACK, _lf_my_fed_id, remote_fed_id, NULL); + + // Start a thread to listen for incoming messages from other federates. + // The fed_id is a uint16_t, which we assume can be safely cast to and from void*. + void* fed_id_arg = (void*)(uintptr_t)remote_fed_id; + int result = lf_thread_create( + &_fed.inbound_socket_listeners[received_federates], + listen_to_federates, + fed_id_arg); + if (result != 0) { + // Failed to create a listening thread. + LF_MUTEX_LOCK(socket_mutex); + if (_fed.sockets_for_inbound_p2p_connections[remote_fed_id] != -1) { + close(socket_id); + _fed.sockets_for_inbound_p2p_connections[remote_fed_id] = -1; } + LF_MUTEX_UNLOCK(socket_mutex); + lf_print_error_and_exit( + "Failed to create a thread to listen for incoming physical connection. 
Error code: %d.", + result + ); + } + + received_federates++; } + + LF_PRINT_LOG("All %zu remote federates are connected.", _fed.number_of_inbound_p2p_connections); return NULL; } -void synchronize_with_other_federates(void) { - - LF_PRINT_DEBUG("Synchronizing with other federates."); +void lf_latest_tag_complete(tag_t tag_to_send) { + int compare_with_last_tag = lf_tag_compare(_fed.last_sent_LTC, tag_to_send); + if (compare_with_last_tag >= 0) { + return; + } + LF_PRINT_LOG("Sending Latest Time Complete (LTC) " PRINTF_TAG " to the RTI.", + tag_to_send.time - start_time, + tag_to_send.microstep); + send_tag(MSG_TYPE_LATEST_TAG_COMPLETE, tag_to_send); + _fed.last_sent_LTC = tag_to_send; +} - // Reset the start time to the coordinated start time for all federates. - // Note that this does not grant execution to this federate. - start_time = get_start_time_from_rti(lf_time_physical()); +parse_rti_code_t lf_parse_rti_addr(const char* rti_addr) { + bool has_host = false, has_port = false, has_user = false; + rti_addr_info_t rti_addr_info = {0}; + extract_rti_addr_info(rti_addr, &rti_addr_info); + if (!rti_addr_info.has_host && !rti_addr_info.has_port && !rti_addr_info.has_user) { + return FAILED_TO_PARSE; + } + if (rti_addr_info.has_host) { + if (validate_host(rti_addr_info.rti_host_str)) { + char* rti_host = (char*) calloc(256, sizeof(char)); + strncpy(rti_host, rti_addr_info.rti_host_str, 255); + federation_metadata.rti_host = rti_host; + } else { + return INVALID_HOST; + } + } + if (rti_addr_info.has_port) { + if (validate_port(rti_addr_info.rti_port_str)) { + federation_metadata.rti_port = atoi(rti_addr_info.rti_port_str); + } else { + return INVALID_PORT; + } + } + if (rti_addr_info.has_user) { + if (validate_user(rti_addr_info.rti_user_str)) { + char* rti_user = (char*) calloc(256, sizeof(char)); + strncpy(rti_user, rti_addr_info.rti_user_str, 255); + federation_metadata.rti_user = rti_user; + } else { + return INVALID_USER; + } + } + return SUCCESS; +} - // Start 
a thread to listen for incoming TCP messages from the RTI. - // @note Up until this point, the federate has been listening for messages - // from the RTI in a sequential manner in the main thread. From now on, a - // separate thread is created to allow for asynchronous communication. - lf_thread_create(&_fed.RTI_socket_listener, listen_to_rti_TCP, NULL); - lf_thread_t thread_id; - if (create_clock_sync_thread(&thread_id)) { - lf_print_warning("Failed to create thread to handle clock synchronization."); +void lf_reset_status_fields_on_input_port_triggers() { + for (int i = 0; i < _lf_action_table_size; i++) { + set_network_port_status(i, unknown); } + LF_PRINT_DEBUG("Resetting port status fields."); + lf_update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); } -/** - * Modify the specified tag, if necessary, to be an earlier tag based - * on the current physical time. The earlier tag is necessary if this federate - * has downstream federates and also has physical actions that may trigger - * outputs. In that case, the earlier tag will be the current physical time - * plus the minimum delay on all such physical actions plus any other delays - * along the path from the triggering physical action to the output port - * minus one nanosecond. The modified tag is assured of being less than any - * output tag that might later be produced. - * @param tag A pointer to the proposed NET. - * @return True if this federate requires this modification and the tag was - * modified. - */ -bool _lf_bounded_NET(tag_t* tag) { - // The tag sent by this function is a promise that, absent - // inputs from another federate, this federate will not produce events - // earlier than t. But if there are downstream federates and there is - // a physical action (not counting receivers from upstream federates), - // then we can only promise up to current physical time (plus the minimum - // of all minimum delays on the physical actions). 
- // In this case, we send a NET message with the current physical time - // to permit downstream federates to advance. To avoid - // overwhelming the network, this NET message should be sent periodically - // at specified intervals controlled by the target parameter - // coordination-options: {advance-message-interval: time units}. - // The larger the interval, the more downstream federates will lag - // behind real time, but the less network traffic. If this option is - // missing, we issue a warning message suggesting that a redesign - // might be in order so that outputs don't depend on physical actions. - LF_PRINT_DEBUG("Checking NET to see whether it should be bounded by physical time." - " Min delay from physical action: " PRINTF_TIME ".", - _fed.min_delay_from_physical_action_to_federate_output); - if (_fed.min_delay_from_physical_action_to_federate_output >= 0LL - && _fed.has_downstream - ) { - // There is a physical action upstream of some output from this - // federate, and there is at least one downstream federate. - // Compare the tag to the current physical time. - instant_t physical_time = lf_time_physical(); - if (physical_time + _fed.min_delay_from_physical_action_to_federate_output < tag->time) { - // Can only promise up and not including this new time: - tag->time = physical_time + _fed.min_delay_from_physical_action_to_federate_output - 1L; - tag->microstep = 0; - LF_PRINT_LOG("Has physical actions that bound NET to " PRINTF_TAG ".", - tag->time - start_time, tag->microstep); - return true; - } +int lf_send_message(int message_type, + unsigned short port, + unsigned short federate, + const char* next_destination_str, + size_t length, + unsigned char* message) { + unsigned char header_buffer[1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t)]; + // First byte identifies this as a timed message. 
+ if (message_type != MSG_TYPE_P2P_MESSAGE ) { + lf_print_error("lf_send_message: Unsupported message type (%d).", message_type); + return -1; } - return false; + header_buffer[0] = (unsigned char)message_type; + // Next two bytes identify the destination port. + // NOTE: Send messages little endian (network order), not big endian. + encode_uint16(port, &(header_buffer[1])); + + // Next two bytes identify the destination federate. + encode_uint16(federate, &(header_buffer[1 + sizeof(uint16_t)])); + + // The next four bytes are the message length. + encode_int32((int32_t)length, &(header_buffer[1 + sizeof(uint16_t) + sizeof(uint16_t)])); + + LF_PRINT_LOG("Sending untagged message to %s.", next_destination_str); + + // Header: message_type + port_id + federate_id + length of message + timestamp + microstep + const int header_length = 1 + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t); + + // Use a mutex lock to prevent multiple threads from simultaneously sending. + LF_MUTEX_LOCK(lf_outbound_socket_mutex); + + int* socket = &_fed.sockets_for_outbound_p2p_connections[federate]; + + // Trace the event when tracing is enabled + tracepoint_federate_to_federate(_fed.trace, send_P2P_MSG, _lf_my_fed_id, federate, NULL); + + int result = write_to_socket_close_on_error(socket, header_length, header_buffer); + if (result == 0) { + // Header sent successfully. Send the body. + result = write_to_socket_close_on_error(socket, length, message); + } + if (result != 0) { + // Message did not send. Since this is used for physical connections, this is not critical. + lf_print_warning("Failed to send message to %s. 
Dropping the message.", next_destination_str); + } + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); + return result; } -/** - * If this federate depends on upstream federates or sends data to downstream - * federates, then send to the RTI a NET, which will give the tag of the - * earliest event on the event queue, or, if the queue is empty, the timeout - * time, or, if there is no timeout, FOREVER. - * - * If there are network outputs that - * depend on physical actions, then insert a dummy event to ensure this federate - * advances its tag so that downstream federates can make progress. - * - * A NET is a promise saying that, absent network inputs, this federate will - * not produce an output message with tag earlier than the NET value. - * - * If there are upstream federates, then after sending a NET, this will block - * until either the RTI grants the advance to the requested time or the wait - * for the response from the RTI is interrupted by a change in the event queue - * (e.g., a physical action triggered or a network message arrived). - * If there are no upstream federates, then it will not wait for a TAG - * (which won't be forthcoming anyway) and returns the earliest tag on the event queue. - * - * If the federate has neither upstream nor downstream federates, then this - * returns the specified tag immediately without sending anything to the RTI. - * - * If there is at least one physical action somewhere in the federate that can - * trigger an output to a downstream federate, then the NET is required to be - * less than the current physical time. If physical time is less than the - * earliest event in the event queue (or the event queue is empty), then this - * function will insert a dummy event with a tag equal to the current physical - * time (and a microstep of 0). 
This will enforce advancement of tag for this - * federate and causes a NET message to be sent repeatedly as physical time - * advances with the time interval between messages controlled by the target - * parameter coordination-options: {advance-message-interval timevalue}. It will - * stop creating dummy events if and when its event queue has an event with a - * timestamp less than physical time. - * - * If wait_for_reply is false, then this function will simply send the - * specified tag and return that tag immediately. This is useful when a - * federate is shutting down and will not be sending any more messages at all. - * - * In all cases, this returns either the specified tag or - * another tag when it is safe to advance logical time to the returned tag. - * The returned tag may be less than the specified tag if there are upstream - * federates and either the RTI responds with a lesser tag or - * the wait for a response from the RTI is interrupted by a - * change in the event queue. - * - * This function is used in centralized coordination only. - * - * This function assumes the caller holds the mutex lock. - * - * @param env The environment of the federate - * @param tag The tag. - * @param wait_for_reply If true, wait for a reply. - */ -tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply) { +tag_t lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply) { assert(env != GLOBAL_ENVIRONMENT); while (true) { if (!_fed.has_downstream && !_fed.has_upstream) { @@ -2677,7 +2313,7 @@ tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply return _fed.last_TAG; } - // Copy the tag because _lf_bounded_NET() may modify it. + // Copy the tag because bounded_NET() may modify it. 
tag_t original_tag = tag; // A NET sent by this function is a promise that, absent @@ -2688,7 +2324,7 @@ tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply // of all minimum delays on the physical actions). // If wait_for_reply is false, leave the tag alone. bool tag_bounded_by_physical_time = wait_for_reply ? - _lf_bounded_NET(&tag) + bounded_NET(&tag) : false; // What we do next depends on whether the NET has been bounded by @@ -2697,7 +2333,7 @@ tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply // This if statement does not fall through but rather returns. // NET is not bounded by physical time or has no downstream federates. // Normal case. - _lf_send_tag(MSG_TYPE_NEXT_EVENT_TAG, tag); + send_tag(MSG_TYPE_NEXT_EVENT_TAG, tag); _fed.last_sent_NET = tag; LF_PRINT_LOG("Sent next event tag (NET) " PRINTF_TAG " to RTI.", tag.time - start_time, tag.microstep); @@ -2738,7 +2374,7 @@ tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply return _fed.last_TAG; } if (lf_tag_compare(next_tag, tag) != 0) { - _lf_send_tag(MSG_TYPE_NEXT_EVENT_TAG, next_tag); + send_tag(MSG_TYPE_NEXT_EVENT_TAG, next_tag); _fed.last_sent_NET = next_tag; LF_PRINT_LOG("Sent next event tag (NET) " PRINTF_TAG " to RTI from loop.", next_tag.time - lf_time_start(), next_tag.microstep); @@ -2754,92 +2390,340 @@ tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply pqueue_insert(env->event_q, dummy); } - LF_PRINT_DEBUG("Inserted a dummy event for logical time " PRINTF_TIME ".", - tag.time - lf_time_start()); - - if (!wait_for_reply) { - LF_PRINT_LOG("Not waiting for physical time to advance further."); - return tag; - } + LF_PRINT_DEBUG("Inserted a dummy event for logical time " PRINTF_TIME ".", + tag.time - lf_time_start()); + + if (!wait_for_reply) { + LF_PRINT_LOG("Not waiting for physical time to advance further."); + return tag; + } + + // This federate should repeatedly advance its 
tag to ensure downstream federates can make progress. + // Before advancing to the next tag, we need to wait some time so that we don't overwhelm the network and the + // RTI. That amount of time will be no greater than ADVANCE_MESSAGE_INTERVAL in the future. + LF_PRINT_DEBUG("Waiting for physical time to elapse or an event on the event queue."); + + // The above call to bounded_NET called lf_time_physical() + // set _lf_last_reported_unadjusted_physical_time_ns, the + // time obtained using CLOCK_REALTIME before adjustment for + // clock synchronization. Since that is the clock used by + // lf_cond_timedwait, this is the clock we want to use. + instant_t wait_until_time_ns = + _lf_last_reported_unadjusted_physical_time_ns + ADVANCE_MESSAGE_INTERVAL; + + // Regardless of the ADVANCE_MESSAGE_INTERVAL, do not let this + // wait exceed the time of the next tag. + if (wait_until_time_ns > original_tag.time) { + wait_until_time_ns = original_tag.time; + } + + lf_cond_timedwait(&env->event_q_changed, wait_until_time_ns); + + LF_PRINT_DEBUG("Wait finished or interrupted."); + + // Either the timeout expired or the wait was interrupted by an event being + // put onto the event queue. In either case, we can just loop around. + // The next iteration will determine whether another + // NET should be sent or not. + tag = get_next_event_tag(env); + } +} + +void lf_send_port_absent_to_federate( + environment_t* env, + interval_t additional_delay, + unsigned short port_ID, + unsigned short fed_ID) { + assert(env != GLOBAL_ENVIRONMENT); + + // Construct the message + size_t message_length = 1 + sizeof(port_ID) + sizeof(fed_ID) + sizeof(instant_t) + sizeof(microstep_t); + unsigned char buffer[message_length]; + + // Apply the additional delay to the current tag and use that as the intended + // tag of the outgoing message. 
Note that if there is delay on the connection, + // then we cannot promise no message with tag = current_tag + delay because a + // subsequent reaction might produce such a message. But we can promise no + // message with a tag strictly less than current_tag + delay. + tag_t current_message_intended_tag = lf_delay_strict(env->current_tag, additional_delay); + + LF_PRINT_LOG("Sending port " + "absent for tag " PRINTF_TAG " for port %d to federate %d.", + current_message_intended_tag.time - start_time, + current_message_intended_tag.microstep, + port_ID, fed_ID); + + buffer[0] = MSG_TYPE_PORT_ABSENT; + encode_uint16(port_ID, &(buffer[1])); + encode_uint16(fed_ID, &(buffer[1+sizeof(port_ID)])); + encode_tag(&(buffer[1+sizeof(port_ID)+sizeof(fed_ID)]), current_message_intended_tag); + +#ifdef FEDERATED_CENTRALIZED + // Send the absent message through the RTI + int* socket = &_fed.socket_TCP_RTI; +#else + // Send the absent message directly to the federate + int* socket = &_fed.sockets_for_outbound_p2p_connections[fed_ID]; +#endif + + LF_MUTEX_LOCK(lf_outbound_socket_mutex); + int result = write_to_socket_close_on_error(socket, message_length, buffer); + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); + + if (result != 0) { + // Write failed. Response depends on whether coordination is centralized. + if (socket == &_fed.socket_TCP_RTI) { + // Centralized coordination. This is a critical error. + lf_print_error_system_failure("Failed to send port absent message for port %hu to federate %hu.", + port_ID, fed_ID); + } else { + // Decentralized coordination. This is not a critical error. + lf_print_warning("Failed to send port absent message for port %hu to federate %hu.", + port_ID, fed_ID); + } + } else { + // Message sent correctly. Trace it. 
+ if (socket == &_fed.socket_TCP_RTI) { + tracepoint_federate_to_rti( + _fed.trace, send_PORT_ABS, _lf_my_fed_id, ¤t_message_intended_tag); + } else { + tracepoint_federate_to_federate( + _fed.trace, send_PORT_ABS, _lf_my_fed_id, fed_ID, ¤t_message_intended_tag); + } + } +} + +int lf_send_stop_request_to_rti(tag_t stop_tag) { - // This federate should repeatedly advance its tag to ensure downstream federates can make progress. - // Before advancing to the next tag, we need to wait some time so that we don't overwhelm the network and the - // RTI. That amount of time will be no greater than ADVANCE_MESSAGE_INTERVAL in the future. - LF_PRINT_DEBUG("Waiting for physical time to elapse or an event on the event queue."); + // Send a stop request with the specified tag to the RTI + unsigned char buffer[MSG_TYPE_STOP_REQUEST_LENGTH]; + // Stop at the next microstep + stop_tag.microstep++; + ENCODE_STOP_REQUEST(buffer, stop_tag.time, stop_tag.microstep); - // The above call to _lf_bounded_NET called lf_time_physical() - // set _lf_last_reported_unadjusted_physical_time_ns, the - // time obtained using CLOCK_REALTIME before adjustment for - // clock synchronization. Since that is the clock used by - // lf_cond_timedwait, this is the clock we want to use. - instant_t wait_until_time_ns = - _lf_last_reported_unadjusted_physical_time_ns + ADVANCE_MESSAGE_INTERVAL; + LF_MUTEX_LOCK(lf_outbound_socket_mutex); + // Do not send a stop request if a stop request has been previously received from the RTI. + if (!_fed.received_stop_request_from_rti) { + LF_PRINT_LOG("Sending to RTI a MSG_TYPE_STOP_REQUEST message with tag " PRINTF_TAG ".", + stop_tag.time - start_time, + stop_tag.microstep); - // Regardless of the ADVANCE_MESSAGE_INTERVAL, do not let this - // wait exceed the time of the next tag. - if (wait_until_time_ns > original_tag.time) { - wait_until_time_ns = original_tag.time; + if (_fed.socket_TCP_RTI < 0) { + lf_print_warning("Socket is no longer connected. 
Dropping message."); + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); + return -1; } + write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, MSG_TYPE_STOP_REQUEST_LENGTH, + buffer, &lf_outbound_socket_mutex, + "Failed to send stop time " PRINTF_TIME " to the RTI.", stop_tag.time - start_time); - lf_cond_timedwait(&env->event_q_changed, wait_until_time_ns); + // Treat this sending as equivalent to having received a stop request from the RTI. + _fed.received_stop_request_from_rti = true; + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); + // Trace the event when tracing is enabled + tracepoint_federate_to_rti(_fed.trace, send_STOP_REQ, _lf_my_fed_id, &stop_tag); + return 0; + } else { + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); + return 1; + } +} - LF_PRINT_DEBUG("Wait finished or interrupted."); +int lf_send_tagged_message(environment_t* env, + interval_t additional_delay, + int message_type, + unsigned short port, + unsigned short federate, + const char* next_destination_str, + size_t length, + unsigned char* message) { + assert(env != GLOBAL_ENVIRONMENT); - // Either the timeout expired or the wait was interrupted by an event being - // put onto the event queue. In either case, we can just loop around. - // The next iteration will determine whether another - // NET should be sent or not. - tag = get_next_event_tag(env); + size_t header_length = 1 + sizeof(uint16_t) + sizeof(uint16_t) + + sizeof(int32_t) + sizeof(instant_t) + sizeof(microstep_t); + unsigned char header_buffer[header_length]; + + if (message_type != MSG_TYPE_TAGGED_MESSAGE && message_type != MSG_TYPE_P2P_TAGGED_MESSAGE) { + lf_print_error("lf_send_message: Unsupported message type (%d).", message_type); + return -1; } -} -/** - * Parse the address of the RTI and store them into the global federation_metadata struct. - * @return a parse_rti_code_t indicating the result of the parse. 
- */ -parse_rti_code_t parse_rti_addr(const char* rti_addr) { - bool has_host = false, has_port = false, has_user = false; - rti_addr_info_t rti_addr_info = {0}; - extract_rti_addr_info(rti_addr, &rti_addr_info); - if (!rti_addr_info.has_host && !rti_addr_info.has_port && !rti_addr_info.has_user) { - return FAILED_TO_PARSE; + size_t buffer_head = 0; + // First byte is the message type. + header_buffer[buffer_head] = (unsigned char)message_type; + buffer_head += sizeof(unsigned char); + // Next two bytes identify the destination port. + // NOTE: Send messages little endian, not big endian. + encode_uint16(port, &(header_buffer[buffer_head])); + buffer_head += sizeof(uint16_t); + + // Next two bytes identify the destination federate. + encode_uint16(federate, &(header_buffer[buffer_head])); + buffer_head += sizeof(uint16_t); + + // The next four bytes are the message length. + encode_int32((int32_t)length, &(header_buffer[buffer_head])); + buffer_head += sizeof(int32_t); + + // Apply the additional delay to the current tag and use that as the intended + // tag of the outgoing message. + tag_t current_message_intended_tag = lf_delay_tag(env->current_tag, additional_delay); + + if (_lf_is_tag_after_stop_tag(env, current_message_intended_tag)) { + // Message tag is past the timeout time (the stop time) so it should not be sent. 
+ LF_PRINT_LOG("Dropping message because it will be after the timeout time."); + return -1; } - if (rti_addr_info.has_host) { - if (validate_host(rti_addr_info.rti_host_str)) { - char* rti_host = (char*) calloc(256, sizeof(char)); - strncpy(rti_host, rti_addr_info.rti_host_str, 255); - federation_metadata.rti_host = rti_host; - } else { - return INVALID_HOST; - } + + // Next 8 + 4 will be the tag (timestamp, microstep) + encode_tag( + &(header_buffer[buffer_head]), + current_message_intended_tag + ); + + LF_PRINT_LOG("Sending message with tag " PRINTF_TAG " to %s.", + current_message_intended_tag.time - start_time, + current_message_intended_tag.microstep, + next_destination_str); + + // Use a mutex lock to prevent multiple threads from simultaneously sending. + LF_MUTEX_LOCK(lf_outbound_socket_mutex); + + int* socket; + if (message_type == MSG_TYPE_P2P_TAGGED_MESSAGE) { + socket = &_fed.sockets_for_outbound_p2p_connections[federate]; + tracepoint_federate_to_federate(_fed.trace, send_P2P_TAGGED_MSG, _lf_my_fed_id, federate, ¤t_message_intended_tag); + } else { + socket = &_fed.socket_TCP_RTI; + tracepoint_federate_to_rti(_fed.trace, send_TAGGED_MSG, _lf_my_fed_id, ¤t_message_intended_tag); } - if (rti_addr_info.has_port) { - if (validate_port(rti_addr_info.rti_port_str)) { - federation_metadata.rti_port = atoi(rti_addr_info.rti_port_str); - } else { - return INVALID_PORT; - } + + int result = write_to_socket_close_on_error(socket, header_length, header_buffer); + if (result == 0) { + // Header sent successfully. Send the body. + result = write_to_socket_close_on_error(socket, length, message); } - if (rti_addr_info.has_user) { - if (validate_user(rti_addr_info.rti_user_str)) { - char* rti_user = (char*) calloc(256, sizeof(char)); - strncpy(rti_user, rti_addr_info.rti_user_str, 255); - federation_metadata.rti_user = rti_user; + if (result != 0) { + // Message did not send. Handling depends on message type. 
+ if (message_type == MSG_TYPE_P2P_TAGGED_MESSAGE) { + lf_print_warning("Failed to send message to %s. Dropping the message.", next_destination_str); } else { - return INVALID_USER; + lf_print_error_system_failure("Failed to send message to %s. Connection lost to the RTI.", + next_destination_str); } } - return SUCCESS; + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); + return result; } -/** - * Sets the federation_id of this federate to fid. - */ -void set_federation_id(const char* fid) { +void lf_set_federation_id(const char* fid) { federation_metadata.federation_id = fid; } -void set_federation_trace_object(trace_t * trace) { +void lf_set_federation_trace_object(trace_t * trace) { _fed.trace = trace; } + +#ifdef FEDERATED_DECENTRALIZED +void lf_spawn_staa_thread(){ + lf_thread_create(&_fed.staaSetter, update_ports_from_staa_offsets, NULL); +} +#endif // FEDERATED_DECENTRALIZED + +void lf_stall_advance_level_federation(environment_t* env, size_t level) { + LF_PRINT_DEBUG("Acquiring the environment mutex."); + LF_MUTEX_LOCK(env->mutex); + LF_PRINT_DEBUG("Waiting on MLAA with next_reaction_level %zu and MLAA %d.", level, max_level_allowed_to_advance); + while (((int) level) >= max_level_allowed_to_advance) { + lf_cond_wait(&lf_port_status_changed); + }; + LF_PRINT_DEBUG("Exiting wait with MLAA %d and next_reaction_level %zu.", max_level_allowed_to_advance, level); + LF_MUTEX_UNLOCK(env->mutex); +} + +void lf_synchronize_with_other_federates(void) { + + LF_PRINT_DEBUG("Synchronizing with other federates."); + + // Reset the start time to the coordinated start time for all federates. + // Note that this does not grant execution to this federate. + start_time = get_start_time_from_rti(lf_time_physical()); + + // Start a thread to listen for incoming TCP messages from the RTI. + // @note Up until this point, the federate has been listening for messages + // from the RTI in a sequential manner in the main thread. 
From now on, a + // separate thread is created to allow for asynchronous communication. + lf_thread_create(&_fed.RTI_socket_listener, listen_to_rti_TCP, NULL); + lf_thread_t thread_id; + if (create_clock_sync_thread(&thread_id)) { + lf_print_warning("Failed to create thread to handle clock synchronization."); + } +} + +bool lf_update_max_level(tag_t tag, bool is_provisional) { + // This always needs the top-level environment, which will be env[0]. + environment_t *env; + _lf_get_environments(&env); + int prev_max_level_allowed_to_advance = max_level_allowed_to_advance; + max_level_allowed_to_advance = INT_MAX; +#ifdef FEDERATED_DECENTRALIZED + size_t action_table_size = _lf_action_table_size; + lf_action_base_t** action_table = _lf_action_table; +#else + // Note that the following test is never true for decentralized coordination, + // where tag always is NEVER_TAG. + if ((lf_tag_compare(env->current_tag, tag) < 0) || ( + lf_tag_compare(env->current_tag, tag) == 0 && !is_provisional + )) { + LF_PRINT_DEBUG("Updated MLAA to %d at time " PRINTF_TIME ".", + max_level_allowed_to_advance, + lf_time_logical_elapsed(env) + ); + // Safe to complete the current tag + return (prev_max_level_allowed_to_advance != max_level_allowed_to_advance); + } + + size_t action_table_size = _lf_zero_delay_cycle_action_table_size; + lf_action_base_t** action_table = _lf_zero_delay_cycle_action_table; +#endif // FEDERATED_DECENTRALIZED + for (int i = 0; i < action_table_size; i++) { + lf_action_base_t* input_port_action = action_table[i]; +#ifdef FEDERATED_DECENTRALIZED + // In decentralized execution, if the current_tag is close enough to the + // start tag and there is a large enough delay on an incoming + // connection, then there is no need to block progress waiting for this + // port status. This is irrelevant for centralized because blocking only + // occurs on zero-delay cycles. 
+ if ( + (_lf_action_delay_table[i] == 0 && env->current_tag.time == start_time && env->current_tag.microstep == 0) + || (_lf_action_delay_table[i] > 0 && lf_tag_compare( + env->current_tag, + lf_delay_strict((tag_t) {.time=start_time, .microstep=0}, _lf_action_delay_table[i]) + ) <= 0) + ) { + continue; + } +#endif // FEDERATED_DECENTRALIZED + // If the current tag is greater than the last known status tag of the input port, + // and the input port is not physical, then block on that port by ensuring + // the MLAA is no greater than the level of that port. + // For centralized coordination, this is applied only to input ports coming from + // federates that are in a ZDC. For decentralized coordination, this is applied + // to all input ports. + if (lf_tag_compare(env->current_tag, + input_port_action->trigger->last_known_status_tag) > 0 + && !input_port_action->trigger->is_physical) { + max_level_allowed_to_advance = LF_MIN( + max_level_allowed_to_advance, + ((int) LF_LEVEL(input_port_action->trigger->reactions[0]->index)) + ); + } + } + LF_PRINT_DEBUG("Updated MLAA to %d at time " PRINTF_TIME ".", + max_level_allowed_to_advance, + lf_time_logical_elapsed(env) + ); + return (prev_max_level_allowed_to_advance != max_level_allowed_to_advance); +} + #endif diff --git a/core/reactor_common.c b/core/reactor_common.c index e16d58a12..dcc951369 100644 --- a/core/reactor_common.c +++ b/core/reactor_common.c @@ -269,12 +269,12 @@ void _lf_trigger_reaction(environment_t* env, reaction_t* reaction, int worker_n * counts between time steps and at the end of execution. */ void _lf_start_time_step(environment_t *env) { + assert(env != GLOBAL_ENVIRONMENT); if (!env->execution_started) { // Execution hasn't started, so this is probably being invoked in termination // due to an error. 
return; } - assert(env != GLOBAL_ENVIRONMENT); LF_PRINT_LOG("--------- Start time step at tag " PRINTF_TAG ".", env->current_tag.time - start_time, env->current_tag.microstep); // Handle dynamically created tokens for mutable inputs. _lf_free_token_copies(env); @@ -300,22 +300,29 @@ void _lf_start_time_step(environment_t *env) { } } } + env->is_present_fields_abbreviated_size = 0; + +#ifdef FEDERATED + // If the environment is the top-level one, we have some work to do. + environment_t *envs; + int num_envs = _lf_get_environments(&envs); + if (env == envs) { + // This is the top-level environment. #ifdef FEDERATED_DECENTRALIZED - for (int i = 0; i < env->is_present_fields_size; i++) { - // FIXME: For now, an intended tag of (NEVER, 0) - // indicates that it has never been set. - *env->_lf_intended_tag_fields[i] = (tag_t) {NEVER, 0}; + for (int i = 0; i < env->is_present_fields_size; i++) { + // An intended tag of NEVER_TAG indicates that it has never been set. + *env->_lf_intended_tag_fields[i] = NEVER_TAG; + } +#endif // FEDERATED_DECENTRALIZED + + // Reset absent fields on network ports because + // their status is unknown + lf_reset_status_fields_on_input_port_triggers(); + // Signal the helper thread to reset its progress since the logical time has changed. + lf_cond_signal(&lf_current_tag_changed); } -#endif -#ifdef FEDERATED - // Reset absent fields on network ports because - // their status is unknown - reset_status_fields_on_input_port_triggers(); - // Signal the helper thread to reset its progress since the logical time has changed. 
- lf_cond_signal(&logical_time_changed); -#endif - env->is_present_fields_abbreviated_size = 0; +#endif // FEDERATED } /** @@ -1669,7 +1676,7 @@ int process_args(int argc, const char* argv[]) { return 0; } const char* fid = argv[i++]; - set_federation_id(fid); + lf_set_federation_id(fid); lf_print("Federation ID for executable %s: %s", argv[0], fid); } else if (strcmp(arg, "-r") == 0 || strcmp(arg, "--rti") == 0) { if (argc < i + 1) { @@ -1677,7 +1684,7 @@ int process_args(int argc, const char* argv[]) { usage(argc, argv); return 0; } - parse_rti_code_t code = parse_rti_addr(argv[i++]); + parse_rti_code_t code = lf_parse_rti_addr(argv[i++]); if (code != SUCCESS) { switch (code) { case INVALID_HOST: @@ -1728,7 +1735,7 @@ void initialize_global(void) { // Federation trace object must be set before `initialize_trigger_objects` is called because it // uses tracing functionality depending on that pointer being set. #ifdef FEDERATED - set_federation_trace_object(envs->trace); + lf_set_federation_trace_object(envs->trace); #endif // Call the code-generated function to initialize all actions, timers, and ports // This is done for all environments/enclaves at the same time. diff --git a/core/threaded/reactor_threaded.c b/core/threaded/reactor_threaded.c index b8858763b..c7f1ca362 100644 --- a/core/threaded/reactor_threaded.c +++ b/core/threaded/reactor_threaded.c @@ -231,9 +231,6 @@ void _lf_set_present(lf_port_base_t* port) { } } -// Forward declaration. See federate.h -void synchronize_with_other_federates(void); - /** * Wait until physical time matches or exceeds the specified logical time, * unless -fast is given. For decentralized coordination, this function will @@ -384,11 +381,6 @@ tag_t get_next_event_tag(environment_t *env) { return next_tag; } -#ifdef FEDERATED_CENTRALIZED -// The following is defined in federate.c and used in the following function. 
-tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply); -#endif - /** * In a federated execution with centralized coordination, this function returns * a tag that is less than or equal to the specified tag when, as far @@ -405,7 +397,7 @@ tag_t _lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply */ tag_t send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply) { #if defined(FEDERATED_CENTRALIZED) - return _lf_send_next_event_tag(env, tag, wait_for_reply); + return lf_send_next_event_tag(env, tag, wait_for_reply); #elif defined(LF_ENCLAVES) return rti_next_event_tag_locked(env->enclave_info, tag); #else @@ -572,10 +564,10 @@ void _lf_next_locked(environment_t *env) { // stick them into the reaction queue. _lf_pop_events(env); #ifdef FEDERATED - enqueue_port_absent_reactions(env); + lf_enqueue_port_absent_reactions(env); // _lf_pop_events may have set some triggers present. extern federate_instance_t _fed; - update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); + lf_update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); #endif } @@ -616,7 +608,7 @@ void lf_request_stop() { // In the federated case, the RTI might grant a // later stop tag than the current tag. The above code has raised // a barrier no greater than max_current_tag. - if (_lf_fd_send_stop_request_to_rti(max_current_tag) != 0) { + if (lf_send_stop_request_to_rti(max_current_tag) != 0) { // Message was not sent to the RTI. // Decrement the barriers to reverse our previous increment. 
for (int i = 0; i < num_environments; i++) { @@ -691,10 +683,10 @@ void _lf_initialize_start_tag(environment_t *env) { if (env == top_level_env) { // Reset status fields before talking to the RTI to set network port // statuses to unknown - reset_status_fields_on_input_port_triggers(); + lf_reset_status_fields_on_input_port_triggers(); // Get a start_time from the RTI - synchronize_with_other_federates(); // Resets start_time in federated execution according to the RTI. + lf_synchronize_with_other_federates(); // Resets start_time in federated execution according to the RTI. } // The start time will likely have changed. Adjust the current tag and stop tag. @@ -712,7 +704,7 @@ void _lf_initialize_start_tag(environment_t *env) { env->current_tag = (tag_t){.time = start_time - _lf_fed_STA_offset, .microstep = 0u}; // Call wait_until if federated. This is required because the startup procedure - // in synchronize_with_other_federates() can decide on a new start_time that is + // in lf_synchronize_with_other_federates() can decide on a new start_time that is // larger than the current physical time. // Therefore, if --fast was not specified, wait until physical time matches // or exceeds the start time. Microstep is ignored. @@ -754,7 +746,7 @@ void _lf_initialize_start_tag(environment_t *env) { // once the complete message has been read. Here, we wait for that barrier // to be removed, if appropriate before proceeding to executing tag (0,0). 
_lf_wait_on_tag_barrier(env, (tag_t){.time=start_time,.microstep=0}); - spawn_staa_thread(); + lf_spawn_staa_thread(); #else // NOT FEDERATED_DECENTRALIZED // Each federate executes the start tag (which is the current @@ -942,7 +934,7 @@ void _lf_worker_invoke_reaction(environment_t *env, int worker_number, reaction_ void try_advance_level(environment_t* env, volatile size_t* next_reaction_level) { #ifdef FEDERATED - stall_advance_level_federation(env, *next_reaction_level); + lf_stall_advance_level_federation(env, *next_reaction_level); #endif if (*next_reaction_level < SIZE_MAX) *next_reaction_level += 1; } @@ -964,7 +956,7 @@ void _lf_worker_do_work(environment_t *env, int worker_number) { // lf_print_snapshot(); // This is quite verbose (but very useful in debugging reaction deadlocks). reaction_t* current_reaction_to_execute = NULL; #ifdef FEDERATED - stall_advance_level_federation(env, 0); + lf_stall_advance_level_federation(env, 0); #endif while ((current_reaction_to_execute = lf_sched_get_ready_reaction(env->scheduler, worker_number)) diff --git a/core/threaded/scheduler_sync_tag_advance.c b/core/threaded/scheduler_sync_tag_advance.c index 017dda77d..28d3fa458 100644 --- a/core/threaded/scheduler_sync_tag_advance.c +++ b/core/threaded/scheduler_sync_tag_advance.c @@ -52,7 +52,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /** * @brief Indicator that execution of at least one tag has completed. */ -static bool _lf_logical_tag_completed = false; +static bool _latest_tag_completed = false; /** * Return true if the worker should stop now; false otherwise. @@ -60,7 +60,7 @@ static bool _lf_logical_tag_completed = false; */ bool should_stop_locked(lf_scheduler_t * sched) { // If this is not the very first step, check against the stop tag to see whether this is the last step. 
- if (_lf_logical_tag_completed) { + if (_latest_tag_completed) { // If we are at the stop tag, do not call _lf_next_locked() // to prevent advancing the logical time. if (lf_tag_compare(sched->env->current_tag, sched->env->stop_tag) >= 0) { @@ -92,7 +92,7 @@ bool _lf_sched_advance_tag_locked(lf_scheduler_t * sched) { return true; } - _lf_logical_tag_completed = true; + _latest_tag_completed = true; // Advance time. // _lf_next_locked() may block waiting for real time to pass or events to appear. diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index 6e9d018be..d20f3a407 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -1,30 +1,12 @@ /** * @file - * @author Edward A. Lee (eal@berkeley.edu) + * @author Soroush Bateni + * @author Peter Donovan + * @author Edward A. Lee + * @author Anirudh Rengarajsm * * @section LICENSE -Copyright (c) 2020, The University of California at Berkeley. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY -EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL -THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF -THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - + * See LICENSE.md in the root directory. * @section DESCRIPTION * Data structures and functions used and defined in federate.c. */ @@ -43,13 +25,16 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ADVANCE_MESSAGE_INTERVAL MSEC(10) #endif +////////////////////////////////////////////////////////////////////////////////// +// Data types + /** * Structure that a federate instance uses to keep track of its own state. */ typedef struct federate_instance_t { /** * The TCP socket descriptor for this federate to communicate with the RTI. - * This is set by connect_to_rti(), which must be called before other + * This is set by lf_connect_to_rti(), which must be called before other * functions that communicate with the rti are called. */ int socket_TCP_RTI; @@ -59,14 +44,6 @@ typedef struct federate_instance_t { */ lf_thread_t RTI_socket_listener; - /** - * Thread responsible for setting ports to absent by an STAA offset if they - * aren't already known. - */ - #ifdef FEDERATED_DECENTRALIZED - lf_thread_t staaSetter; - #endif - /** * Number of inbound physical connections to the federate. * This can be either physical connections, or logical connections @@ -92,7 +69,7 @@ typedef struct federate_instance_t { * An array that holds the socket descriptors for inbound * connections from each federate. The index will be the federate * ID of the remote sending federate. 
This is initialized at startup - * to -1 and is set to a socket ID by handle_p2p_connections_from_federates() + * to -1 and is set to a socket ID by lf_handle_p2p_connections_from_federates() * when the socket is opened. * * @note There will not be an inbound socket unless a physical connection @@ -107,7 +84,7 @@ typedef struct federate_instance_t { * An array that holds the socket descriptors for outbound direct * connections to each remote federate. The index will be the federate * ID of the remote receiving federate. This is initialized at startup - * to -1 and is set to a socket ID by connect_to_federate() + * to -1 and is set to a socket ID by lf_connect_to_federate() * when the socket is opened. * * @note This federate will not open an outbound socket unless a physical @@ -126,7 +103,7 @@ typedef struct federate_instance_t { /** * A socket descriptor for the socket server of the federate. - * This is assigned in create_server(). + * This is assigned in lf_create_server(). * This socket is used to listen to incoming physical connections from * remote federates. Once an incoming connection is accepted, the * opened socket will be stored in @@ -135,11 +112,9 @@ typedef struct federate_instance_t { int server_socket; /** - * The port used for the server socket - * to listen for messages from other federates. - * The federate informs the RTI of this port once - * it has created its socket server by sending - * an ADDRESS_AD message (@see rti.h). + * The port used for the server socket to listen for messages from other federates. + * The federate informs the RTI of this port once it has created its socket server by + * sending an ADDRESS_AD message (@see rti.h). */ int server_port; @@ -151,8 +126,7 @@ typedef struct federate_instance_t { tag_t last_TAG; /** - * Indicates whether the last TAG received is provisional or an ordinary - * TAG. + * Indicates whether the last TAG received is provisional or an ordinary TAG. 
* If the last TAG has been provisional, network port absent reactions must be inserted. * This variable should only be accessed while holding the mutex lock. */ @@ -178,13 +152,12 @@ typedef struct federate_instance_t { bool received_stop_request_from_rti; /** - * A record of the most recently sent LTC (logical tag complete) message. + * A record of the most recently sent LTC (latest tag complete) message. * In some situations, federates can send logical_tag_complete for * the same tag twice or more in-a-row to the RTI. For example, when * _lf_next() returns without advancing tag. To prevent overwhelming * the RTI with extra messages, record the last sent logical tag - * complete message and check against it in - * _lf_logical_tag_complete(). + * complete message and check against it in lf_latest_tag_complete(). * * @note Here, the underlying assumption is that the TCP stack will * deliver the Logical TAG Complete message to the RTI eventually @@ -205,12 +178,22 @@ typedef struct federate_instance_t { */ instant_t min_delay_from_physical_action_to_federate_output; - // Trace object + /** + * Trace object for this federate, used if tracing is enabled. + */ trace_t* trace; + + #ifdef FEDERATED_DECENTRALIZED + /** + * Thread responsible for setting ports to absent by an STAA offset if they + * aren't already known. 
+ */ + lf_thread_t staaSetter; + #endif } federate_instance_t; #ifdef FEDERATED_DECENTRALIZED -typedef struct staa { +typedef struct staa_t { lf_action_base_t** actions; size_t STAA; size_t num_actions; @@ -224,30 +207,38 @@ typedef struct federation_metadata_t { char* rti_user; } federation_metadata_t; -extern lf_mutex_t outbound_socket_mutex; -extern lf_cond_t port_status_changed; -extern lf_cond_t logical_time_changed; +typedef enum parse_rti_code_t { + SUCCESS, + INVALID_PORT, + INVALID_HOST, + INVALID_USER, + FAILED_TO_PARSE +} parse_rti_code_t; + +////////////////////////////////////////////////////////////////////////////////// +// Global variables /** -* Generated function that sends information about connections between this federate and -* other federates where messages are routed through the RTI. Currently, this -* only includes logical connections when the coordination is centralized. This -* information is needed for the RTI to perform the centralized coordination. -* @see MSG_TYPE_NEIGHBOR_STRUCTURE in net_common.h -*/ -void send_neighbor_structure_to_RTI(int); + * Mutex lock held while performing socket write and close operations. + */ +extern lf_mutex_t lf_outbound_socket_mutex; -#ifdef FEDERATED_DECENTRALIZED /** - * Spawn a thread to iterate through STAA structs, setting their associated ports absent - * at an offset if the port is not present with a value by a certain physical time. + * Condition variable for blocking on unknown federate input ports. */ -void spawn_staa_thread(void); -#endif +extern lf_cond_t lf_port_status_changed; + +/** + * Condition variable for blocking on tag advance. + */ +extern lf_cond_t lf_current_tag_changed; + +////////////////////////////////////////////////////////////////////////////////// +// Public functions (in alphabetical order) /** * Connect to the federate with the specified id.
This established - * connection will then be used in functions such as send_tagged_message() + * connection will then be used in functions such as lf_send_tagged_message() * to send messages directly to the specified federate. * This function first sends an MSG_TYPE_ADDRESS_QUERY message to the RTI to obtain * the IP address and port number of the specified federate. It then attempts @@ -257,16 +248,7 @@ void spawn_staa_thread(void); * refer to the socket for communicating directly with the federate. * @param remote_federate_id The ID of the remote federate. */ -void connect_to_federate(uint16_t); - -/** - * Send a logical tag complete (LTC) message to the RTI - * unless an equal or later LTC has previously been sent. - * This function assumes the caller holds the mutex lock. - * - * @param tag_to_send The tag to send. - */ -void _lf_logical_tag_complete(tag_t); +void lf_connect_to_federate(uint16_t); /** * Connect to the RTI at the specified host and port and return the socket descriptor @@ -278,21 +260,7 @@ void _lf_logical_tag_complete(tag_t); * @param hostname A hostname, such as "localhost". * @param port_number A port number or 0 to start with the default. */ -void connect_to_rti(const char* hostname, int port_number); - -/** - * Thread that listens for inputs from other federates. - * This thread listens for messages of type MSG_TYPE_P2P_MESSAGE, - * MSG_TYPE_P2P_TAGGED_MESSAGE, or MSG_TYPE_PORT_ABSENT (@see net_common.h) from the specified - * peer federate and calls the appropriate handling function for - * each message type. If an error occurs or an EOF is received - * from the peer, then this procedure sets the corresponding - * socket in _fed.sockets_for_inbound_p2p_connections - * to -1 and returns, terminating the thread. - * @param fed_id_ptr A pointer to a uint16_t containing federate ID being listened to. - * This procedure frees the memory pointed to before returning. 
- */ -void* listen_to_federates(void*); +void lf_connect_to_rti(const char* hostname, int port_number); /** * Create a server to listen to incoming p2p connection (physical @@ -308,11 +276,16 @@ void* listen_to_federates(void*); * it will retry after PORT_BIND_RETRY_INTERVAL until it has tried * PORT_BIND_RETRY_LIMIT times. Then it will fail. * - * @note This function is different from create_server(...) in rti.c. - * * @param specified_port The port specified by the user or 0 to use a random port. */ -void create_server(int specified_port); +void lf_create_server(int specified_port); + +/** + * Enqueue port absent reactions that will send a MSG_TYPE_PORT_ABSENT + * message to downstream federates if a given network output port is not present. + * @param env The environment of the federate + */ +void lf_enqueue_port_absent_reactions(environment_t* env); /** * Thread to accept connections from other federates that send this federate @@ -321,60 +294,38 @@ void create_server(int specified_port); * sockets, exits. * @param ignored No argument needed for this thread. */ -void* handle_p2p_connections_from_federates(void*); +void* lf_handle_p2p_connections_from_federates(void*); /** - * Send a port absent message to federate with fed_ID, informing the - * remote federate that the current federate will not produce an event - * on this network port at the current logical time. + * Send a latest tag complete (LTC) message to the RTI + * unless an equal or later LTC has previously been sent. + * This function assumes the caller holds the mutex lock + * on the top-level environment. * - * @param env The environment in which we are executing - * @param additional_delay The offset applied to the timestamp - * using after. The additional delay will be greater or equal to zero - * if an after is used on the connection. If no after is given in the - * program, -1 is passed. - * @param port_ID The ID of the receiving port. - * @param fed_ID The fed ID of the receiving federate. 
+ * @param tag_to_send The tag to send. */ -void send_port_absent_to_federate(environment_t* env, interval_t, unsigned short, unsigned short); +void lf_latest_tag_complete(tag_t); /** - * Enqueue port absent reactions that will send a PORT_ABSENT - * message to downstream federates if a given network output port is not present. + * Parse the address of the RTI and store them into the global federation_metadata struct. + * @return a parse_rti_code_t indicating the result of the parse. */ -void enqueue_port_absent_reactions(environment_t* env); +parse_rti_code_t lf_parse_rti_addr(const char* rti_addr); /** - * @brief Wait until inputs statuses are known up to and including the specified level. - * Specifically, wait until the specified level is less that the max level allowed to - * advance (MLAA). - * @param env The environment (which should always be the top-level environment). - * @param level The level to which we would like to advance. - */ -void stall_advance_level_federation(environment_t* env, size_t level); - -/** - * @brief Update the max level allowed to advance (MLAA). - * If the specified tag is greater than the current_tag of the top-level environment - * (or equal and is_provisional is false), then set the MLAA to MAX_INT and return. - * This removes any barriers on execution at the current tag due to network inputs. - * Otherwise, set the MLAA to the minimum level over all (non-physical) network input ports - * where the status of the input port is not known at that current_tag. - * - * This function assumes that the caller holds the mutex. + * Reset the status fields on network input ports to unknown. * - * @param tag The latest TAG or PTAG received by this federate. - * @param is_provisional Whether the tag was provisional. - * @return True if the MLAA changed. + * @note This function must be called at the beginning of each + * logical time. 
*/ -bool update_max_level(tag_t tag, bool is_provisional); +void lf_reset_status_fields_on_input_port_triggers(); /** * Send a message to another federate. This function is used for physical connections * between federates. If the socket connection to the remote federate or the RTI has been broken, * then this returns -1 without sending. Otherwise, it returns 0. * - * This method assumes that the caller does not hold the outbound_socket_mutex lock, + * This method assumes that the caller does not hold the lf_outbound_socket_mutex lock, * which it acquires to perform the send. * * @param message_type The type of the message being sent (currently only MSG_TYPE_P2P_MESSAGE). @@ -385,13 +336,103 @@ bool update_max_level(tag_t tag, bool is_provisional); * @param message The message. * @return 0 if the message has been sent, -1 otherwise. */ -int send_message(int message_type, +int lf_send_message(int message_type, unsigned short port, unsigned short federate, const char* next_destination_str, size_t length, unsigned char* message); +/** + * Generated function that sends information about connections between this federate and + * other federates where messages are routed through the RTI. Currently, this + * only includes logical connections when the coordination is centralized. This + * information is needed for the RTI to perform the centralized coordination. + * @see MSG_TYPE_NEIGHBOR_STRUCTURE in net_common.h + */ +void lf_send_neighbor_structure_to_RTI(int); + +/** + * If this federate depends on upstream federates or sends data to downstream + * federates, then send to the RTI a NET, which will give the tag of the + * earliest event on the event queue, or, if the queue is empty, the timeout + * time, or, if there is no timeout, FOREVER. + * + * If there are network outputs that + * depend on physical actions, then insert a dummy event to ensure this federate + * advances its tag so that downstream federates can make progress. 
+ * + * A NET is a promise saying that, absent network inputs, this federate will + * not produce an output message with tag earlier than the NET value. + * + * If there are upstream federates, then after sending a NET, this will block + * until either the RTI grants the advance to the requested time or the wait + * for the response from the RTI is interrupted by a change in the event queue + * (e.g., a physical action triggered or a network message arrived). + * If there are no upstream federates, then it will not wait for a TAG + * (which won't be forthcoming anyway) and returns the earliest tag on the event queue. + * + * If the federate has neither upstream nor downstream federates, then this + * returns the specified tag immediately without sending anything to the RTI. + * + * If there is at least one physical action somewhere in the federate that can + * trigger an output to a downstream federate, then the NET is required to be + * less than the current physical time. If physical time is less than the + * earliest event in the event queue (or the event queue is empty), then this + * function will insert a dummy event with a tag equal to the current physical + * time (and a microstep of 0). This will enforce advancement of tag for this + * federate and causes a NET message to be sent repeatedly as physical time + * advances with the time interval between messages controlled by the target + * parameter coordination-options: {advance-message-interval timevalue}. It will + * stop creating dummy events if and when its event queue has an event with a + * timestamp less than physical time. + * + * If wait_for_reply is false, then this function will simply send the + * specified tag and return that tag immediately. This is useful when a + * federate is shutting down and will not be sending any more messages at all. + * + * In all cases, this returns either the specified tag or + * another tag when it is safe to advance logical time to the returned tag. 
+ * The returned tag may be less than the specified tag if there are upstream + * federates and either the RTI responds with a lesser tag or + * the wait for a response from the RTI is interrupted by a + * change in the event queue. + * + * This function is used in centralized coordination only. + * + * This function assumes the caller holds the mutex lock. + * + * @param env The environment of the federate + * @param tag The tag. + * @param wait_for_reply If true, wait for a reply. + */ +tag_t lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply); + +/** + * Send a port absent message to federate with fed_ID, informing the + * remote federate that it will not receive a message with tag less than the + * current tag of the specified environment delayed by the additional_delay. + * + * @param env The environment from which to get the current tag. + * @param additional_delay The after delay of the connection or NEVER if none. + * @param port_ID The ID of the receiving port. + * @param fed_ID The fed ID of the receiving federate. + */ +void lf_send_port_absent_to_federate( + environment_t* env, + interval_t additional_delay, + unsigned short port_ID, + unsigned short fed_ID); + +/** + * Send a MSG_TYPE_STOP_REQUEST message to the RTI with payload equal + * to the specified tag plus one microstep. If this federate has previously + * received a stop request from the RTI, then do not send the message and + * return 1. Return -1 if the socket is disconnected. Otherwise, return 0. + * @return 0 if the message is sent. + */ +int lf_send_stop_request_to_rti(tag_t stop_tag); + /** * Send a tagged message to the specified port of the specified federate. * The tag will be the current tag of the specified environment delayed by the specified additional_delay. @@ -407,7 +448,7 @@ int send_message(int message_type, * to believe that there were no messages forthcoming. In this case, on failure to send * the message, this function returns -11. 
* - * This method assumes that the caller does not hold the outbound_socket_mutex lock, + * This method assumes that the caller does not hold the lf_outbound_socket_mutex lock, * which it acquires to perform the send. * * @param env The environment from which to get the current tag. @@ -423,7 +464,7 @@ int send_message(int message_type, * @param message The message. * @return 0 if the message has been sent, 1 otherwise. */ -int send_tagged_message( +int lf_send_tagged_message( environment_t* env, interval_t additional_delay, int message_type, @@ -433,6 +474,35 @@ int send_tagged_message( size_t length, unsigned char* message); +/** + * Set the federation_id of this federate. + * @param fid The federation ID. + */ +void lf_set_federation_id(const char* fid); + +/** + * Set the trace object for this federate (used when tracing is enabled). + * @param trace The trace object. + */ +void lf_set_federation_trace_object(trace_t * trace); + +#ifdef FEDERATED_DECENTRALIZED +/** + * Spawn a thread to iterate through STAA structs, setting their associated ports absent + * at an offset if the port is not present with a value by a certain physical time. + */ +void lf_spawn_staa_thread(void); +#endif + +/** + * @brief Wait until input statuses are known up to and including the specified level. + * Specifically, wait until the specified level is less than the max level allowed to + * advance (MLAA). + * @param env The environment (which should always be the top-level environment). + * @param level The level to which we would like to advance. + */ +void lf_stall_advance_level_federation(environment_t* env, size_t level); + /** * Synchronize the start with other federates via the RTI. * This assumes that a connection to the RTI is already made @@ -440,20 +510,22 @@ int send_tagged_message( * time to the RTI and waits for the RTI to respond with a specified * time. It starts a thread to listen for messages from the RTI.
*/ -void synchronize_with_other_federates(); +void lf_synchronize_with_other_federates(); /** - * Wait until the status of network port "port_ID" is known. - * - * In decentralized coordination mode, the wait time is capped by STAA + STA, - * after which the status of the port is presumed to be absent. + * @brief Update the max level allowed to advance (MLAA). + * If the specified tag is greater than the current_tag of the top-level environment + * (or equal and is_provisional is false), then set the MLAA to INT_MAX and return. + * This removes any barriers on execution at the current tag due to network inputs. + * Otherwise, set the MLAA to the minimum level over all (non-physical) network input ports + * where the status of the input port is not known at that current_tag. * - * This function assumes the holder does not hold a mutex. + * This function assumes that the caller holds the mutex. * - * @param env The environment in which we are executing - * @param port_ID The ID of the network port - * @param STAA The safe-to-assume-absent threshold for the port + * @param tag The latest TAG or PTAG received by this federate. + * @param is_provisional Whether the tag was provisional. + * @return True if the MLAA changed. */ -void wait_until_port_status_known(environment_t* env, int portID, interval_t STAA); +bool lf_update_max_level(tag_t tag, bool is_provisional); #endif // FEDERATE_H diff --git a/include/core/federated/network/net_common.h b/include/core/federated/network/net_common.h index 4e55216ed..f428bcc93 100644 --- a/include/core/federated/network/net_common.h +++ b/include/core/federated/network/net_common.h @@ -163,7 +163,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * each federate has a valid event at the start tag (start time, 0) and it will * inform the RTI of this event. 
* Subsequently, at the conclusion of each tag, each federate will send a - * `MSG_TYPE_LOGICAL_TAG_COMPLETE` followed by a `MSG_TYPE_NEXT_EVENT_TAG` (see + * `MSG_TYPE_LATEST_TAG_COMPLETE` followed by a `MSG_TYPE_NEXT_EVENT_TAG` (see * the comment for each message for further explanation). Each federate would * have to wait for a `MSG_TYPE_TAG_ADVANCE_GRANT` or a * `MSG_TYPE_PROVISIONAL_TAG_ADVANCE_GRANT` before it can advance to a @@ -303,7 +303,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * to the RTI. This is its first message to the RTI. * The RTI will respond with either MSG_TYPE_REJECT, MSG_TYPE_ACK, or MSG_TYPE_UDP_PORT. * If the federate is a C target LF program, the generated federate - * code does this by calling synchronize_with_other_federates(), + * code does this by calling lf_synchronize_with_other_federates(), * passing to it its federate ID. */ #define MSG_TYPE_FED_IDS 1 @@ -431,12 +431,12 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define MSG_TYPE_PROVISIONAL_TAG_ADVANCE_GRANT 8 /** - * Byte identifying a logical tag complete (LTC) message sent by a federate + * Byte identifying a latest tag complete (LTC) message sent by a federate * to the RTI. * The next eight bytes will be the timestep of the completed tag. * The next four bytes will be the microsteps of the completed tag. 
*/ -#define MSG_TYPE_LOGICAL_TAG_COMPLETE 9 +#define MSG_TYPE_LATEST_TAG_COMPLETE 9 /////////// Messages used in lf_request_stop() /////////////// //// Overview of the algorithm: diff --git a/include/core/federated/network/net_util.h b/include/core/federated/network/net_util.h index 097127e25..6346e21d3 100644 --- a/include/core/federated/network/net_util.h +++ b/include/core/federated/network/net_util.h @@ -350,7 +350,7 @@ void encode_tag( ); /** - * A helper struct for passing rti_addr information between parse_rti_addr and extract_rti_addr_info + * A helper struct for passing rti_addr information between lf_parse_rti_addr and extract_rti_addr_info */ typedef struct rti_addr_info_t { char rti_host_str[256]; diff --git a/include/core/reactor.h b/include/core/reactor.h index 0d4f84022..9d36c6627 100644 --- a/include/core/reactor.h +++ b/include/core/reactor.h @@ -559,9 +559,6 @@ trigger_handle_t _lf_schedule_value(lf_action_base_t* action, interval_t extra_d */ trigger_handle_t _lf_schedule_copy(lf_action_base_t* action, interval_t offset, void* value, size_t length); -// See reactor.h for doc. 
-int _lf_fd_send_stop_request_to_rti(tag_t stop_tag); - /** * @brief Will create and initialize the required number of environments for the program * @note Will be code generated by the compiler diff --git a/include/core/reactor_common.h b/include/core/reactor_common.h index b9c9d86ea..29fb73c58 100644 --- a/include/core/reactor_common.h +++ b/include/core/reactor_common.h @@ -24,21 +24,6 @@ extern bool _lf_normal_termination; extern int default_argc; extern const char** default_argv; -#ifdef FEDERATED -void reset_status_fields_on_input_port_triggers(); -port_status_t determine_port_status_if_possible(int portID); -typedef enum parse_rti_code_t { - SUCCESS, - INVALID_PORT, - INVALID_HOST, - INVALID_USER, - FAILED_TO_PARSE -} parse_rti_code_t; -parse_rti_code_t parse_rti_addr(const char* rti_addr); -void set_federation_id(const char* fid); -void set_federation_trace_object(trace_t * trace); -#endif - extern struct allocation_record_t* _lf_reactors_to_free; void* _lf_new_reactor(size_t size); void _lf_free(struct allocation_record_t** head); diff --git a/include/core/threaded/reactor_threaded.h b/include/core/threaded/reactor_threaded.h index 0053112d0..f0f3d424b 100644 --- a/include/core/threaded/reactor_threaded.h +++ b/include/core/threaded/reactor_threaded.h @@ -17,7 +17,7 @@ void try_advance_level(environment_t* env, volatile size_t* next_reaction_level) * message to downstream federates if a given network output port is not present. 
* @param env The environment in which we are executing */ -void enqueue_port_absent_reactions(environment_t* env); +void lf_enqueue_port_absent_reactions(environment_t* env); /** * Raise a barrier to prevent the current tag for the specified environment from advancing @@ -79,7 +79,7 @@ void _lf_increment_tag_barrier_locked(environment_t *env, tag_t future_tag); void _lf_decrement_tag_barrier_locked(environment_t* env); int _lf_wait_on_tag_barrier(environment_t* env, tag_t proposed_tag); -void synchronize_with_other_federates(void); +void lf_synchronize_with_other_federates(void); bool wait_until(environment_t* env, instant_t logical_time_ns, lf_cond_t* condition); tag_t get_next_event_tag(environment_t* env); tag_t send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply); From 8de37dd8bdf1dc80aeb950a9ed08bbca3b807c15 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Sun, 31 Dec 2023 17:30:36 -0800 Subject: [PATCH 44/83] Clean up doxygen docs --- core/federated/federate.c | 9 +-- include/core/federated/federate.h | 94 +++++++++++++++++++------------ 2 files changed, 61 insertions(+), 42 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index de1b56ac3..545fe348e 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -4,12 +4,9 @@ * @author Peter Donovan * @author Edward A. Lee * @author Anirudh Rengarajsm - * - * @section LICENSE - * See LICENSE.md in the root directory. - * @section DESCRIPTION - * Utility functions for a federate in a federated execution. - * The main entry point is lf_synchronize_with_other_federates(). + * @copyright (c) 2020-2023, The University of California at Berkeley. + * License: BSD 2-clause + * @brief Utility functions for a federate in a federated execution. 
*/ #ifdef FEDERATED diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index d20f3a407..be5e59609 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -4,11 +4,9 @@ * @author Peter Donovan * @author Edward A. Lee * @author Anirudh Rengarajsm - * - * @section LICENSE - * See LICENSE.md in the root directory. - * @section DESCRIPTION - * Data structures and functions used and defined in federate.c. + * @copyright (c) 2020-2023, The University of California at Berkeley. + * License: BSD 2-clause + * @brief Data structures and functions used and defined in federate.c. */ #ifndef FEDERATE_H @@ -237,8 +235,9 @@ extern lf_cond_t lf_current_tag_changed; // Public functions (in alphabetical order) /** - * Connect to the federate with the specified id. This established - * connection will then be used in functions such as lf_send_tagged_message() + * @brief Connect to the federate with the specified id. + * + * The established connection will then be used in functions such as lf_send_tagged_message() * to send messages directly to the specified federate. * This function first sends an MSG_TYPE_ADDRESS_QUERY message to the RTI to obtain * the IP address and port number of the specified federate. It then attempts @@ -251,8 +250,10 @@ extern lf_cond_t lf_current_tag_changed; void lf_connect_to_federate(uint16_t); /** - * Connect to the RTI at the specified host and port and return the socket descriptor - * for the connection. If port_number is 0, then start at DEFAULT_PORT and increment + * @brief Connect to the RTI at the specified host and port. + * + * This will return the socket descriptor for the connection. + * If port_number is 0, then start at DEFAULT_PORT and increment * the port number on each attempt. If an attempt fails, wait CONNECT_RETRY_INTERVAL * and try again. If it fails after CONNECT_MAX_RETRIES, the program exits. 
* If it succeeds, it sets the _fed.socket_TCP_RTI global variable to refer to @@ -263,13 +264,13 @@ void lf_connect_to_federate(uint16_t); void lf_connect_to_rti(const char* hostname, int port_number); /** - * Create a server to listen to incoming p2p connection (physical - * connections or decentralized connections) from remote federates. This function - * only handles the creation of the server socket. - * The bound port for the server socket is then - * sent to the RTI by sending an MSG_TYPE_ADDRESS_ADVERTISEMENT message - * (@see net_common.h). This function expects no response - * from the RTI. + * @brief Create a server to listen to incoming P2P connections. + * + * Such connections are used for physical connections or any connection if using + * decentralized coordination. This function only handles the creation of the server socket. + * The bound port for the server socket is then sent to the RTI by sending an + * MSG_TYPE_ADDRESS_ADVERTISEMENT message (@see net_common.h). + * This function expects no response from the RTI. * * If a port is specified by the user, that will be used. * Otherwise, a random port will be assigned. If the bind fails, @@ -281,24 +282,30 @@ void lf_connect_to_rti(const char* hostname, int port_number); void lf_create_server(int specified_port); /** - * Enqueue port absent reactions that will send a MSG_TYPE_PORT_ABSENT + * @brief Enqueue port absent reactions. + * + * These reactions will send a MSG_TYPE_PORT_ABSENT * message to downstream federates if a given network output port is not present. * @param env The environment of the federate */ void lf_enqueue_port_absent_reactions(environment_t* env); /** - * Thread to accept connections from other federates that send this federate - * messages directly (not through the RTI). This thread starts a thread for - * each accepted socket connection and, once it has opened all expected + * @brief Thread to accept connections from other federates. 
+ * + * This thread accepts connections from federates that send messages directly + * to this one (not through the RTI). This thread starts a thread for + * each accepted socket connection to read messages and, once it has opened all expected * sockets, exits. * @param ignored No argument needed for this thread. */ void* lf_handle_p2p_connections_from_federates(void*); /** - * Send a latest tag complete (LTC) message to the RTI - * unless an equal or later LTC has previously been sent. + * @brief Send a latest tag complete (LTC) signal to the RTI. + * + * This avoids the send if an equal or later LTC has previously been sent. + * * This function assumes the caller holds the mutex lock * on the top-level environment. * @@ -307,13 +314,13 @@ void* lf_handle_p2p_connections_from_federates(void*); void lf_latest_tag_complete(tag_t); /** - * Parse the address of the RTI and store them into the global federation_metadata struct. + * @brief Parse the address of the RTI and store them into the global federation_metadata struct. * @return a parse_rti_code_t indicating the result of the parse. */ parse_rti_code_t lf_parse_rti_addr(const char* rti_addr); /** - * Reset the status fields on network input ports to unknown. + * @brief Reset the status fields on network input ports to unknown. * * @note This function must be called at the beginning of each * logical time. @@ -321,7 +328,9 @@ parse_rti_code_t lf_parse_rti_addr(const char* rti_addr); void lf_reset_status_fields_on_input_port_triggers(); /** - * Send a message to another federate. This function is used for physical connections + * @brief Send a message to another federate. + * + * This function is used for physical connections * between federates. If the socket connection to the remote federate or the RTI has been broken, * then this returns -1 without sending. Otherwise, it returns 0. 
* @@ -344,8 +353,10 @@ int lf_send_message(int message_type, unsigned char* message); /** - * Generated function that sends information about connections between this federate and - * other federates where messages are routed through the RTI. Currently, this + * @brief Send information about connections to the RTI. + * + * This is a generated function that sends information about connections between this federate + * and other federates where messages are routed through the RTI. Currently, this * only includes logical connections when the coordination is centralized. This * information is needed for the RTI to perform the centralized coordination. * @see MSG_TYPE_NEIGHBOR_STRUCTURE in net_common.h @@ -353,6 +364,8 @@ int lf_send_message(int message_type, void lf_send_neighbor_structure_to_RTI(int); /** + * @brief Send a next event tag (NET) signal. + * * If this federate depends on upstream federates or sends data to downstream * federates, then send to the RTI a NET, which will give the tag of the * earliest event on the event queue, or, if the queue is empty, the timeout @@ -409,8 +422,9 @@ void lf_send_neighbor_structure_to_RTI(int); tag_t lf_send_next_event_tag(environment_t* env, tag_t tag, bool wait_for_reply); /** - * Send a port absent message to federate with fed_ID, informing the - * remote federate that it will not receive a message with tag less than the + * @brief Send a port absent message. + * + * This informs the remote federate that it will not receive a message with tag less than the * current tag of the specified environment delayed by the additional_delay. * * @param env The environment from which to get the current tag. @@ -425,8 +439,9 @@ void lf_send_port_absent_to_federate( unsigned short fed_ID); /** - * Send a MSG_TYPE_STOP_REQUEST message to the RTI with payload equal - * to the specified tag plus one microstep. If this federate has previously + * @brief Send a MSG_TYPE_STOP_REQUEST message to the RTI. 
+ * + * The payload is the specified tag plus one microstep. If this federate has previously * received a stop request from the RTI, then do not send the message and * return 1. Return -1 if the socket is disconnected. Otherwise, return 0. * @return 0 if the message is sent. @@ -434,7 +449,8 @@ void lf_send_port_absent_to_federate( int lf_send_stop_request_to_rti(tag_t stop_tag); /** - * Send a tagged message to the specified port of the specified federate. + * @brief Send a tagged message to the specified port of the specified federate. + * * The tag will be the current tag of the specified environment delayed by the specified additional_delay. * If the delayed tag falls after the timeout time, then the message is not sent and -1 is returned. * The caller can reuse or free the memory storing the message after this returns. @@ -475,20 +491,23 @@ int lf_send_tagged_message( unsigned char* message); /** - * Set the federation_id of this federate. + * @brief Set the federation_id of this federate. * @param fid The federation ID. */ void lf_set_federation_id(const char* fid); /** - * Set the trace object for this federate (used when tracing is enabled). + * @brief Set the trace object for this federate (used when tracing is enabled). + * * @param The trace object. */ void lf_set_federation_trace_object(trace_t * trace); #ifdef FEDERATED_DECENTRALIZED /** - * Spawn a thread to iterate through STAA structs, setting their associated ports absent + * @brief Spawn a thread to iterate through STAA structs. + * + * This will set their associated ports absent * at an offset if the port is not present with a value by a certain physical time. */ void lf_spawn_staa_thread(void); @@ -496,6 +515,7 @@ void lf_spawn_staa_thread(void); /** * @brief Wait until inputs statuses are known up to and including the specified level. + * * Specifically, wait until the specified level is less that the max level allowed to * advance (MLAA). 
* @param env The environment (which should always be the top-level environment). @@ -504,7 +524,8 @@ void lf_spawn_staa_thread(void); void lf_stall_advance_level_federation(environment_t* env, size_t level); /** - * Synchronize the start with other federates via the RTI. + * @brief Synchronize the start with other federates via the RTI. + * * This assumes that a connection to the RTI is already made * and _lf_rti_socket_TCP is valid. It then sends the current logical * time to the RTI and waits for the RTI to respond with a specified @@ -514,6 +535,7 @@ void lf_synchronize_with_other_federates(); /** * @brief Update the max level allowed to advance (MLAA). + * * If the specified tag is greater than the current_tag of the top-level environment * (or equal and is_provisional is false), then set the MLAA to INT_MAX and return. * This removes any barriers on execution at the current tag due to network inputs. From b9014186a290833037138ffa49148ef5442e33f0 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Mon, 1 Jan 2024 11:41:17 -0800 Subject: [PATCH 45/83] Probably uncessary precaution on connection failure --- core/federated/RTI/main.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/core/federated/RTI/main.c b/core/federated/RTI/main.c index 225a3522a..9d5f8a437 100644 --- a/core/federated/RTI/main.c +++ b/core/federated/RTI/main.c @@ -332,12 +332,14 @@ int main(int argc, const char* argv[]) { } int socket_descriptor = start_rti_server(rti.user_specified_port); - wait_for_federates(socket_descriptor); - normal_termination = true; - if (rti.base.tracing_enabled) { - // No need for a mutex lock because all threads have exited. - stop_trace_locked(rti.base.trace); - lf_print("RTI trace file saved."); + if (socket_descriptor >= 0) { + wait_for_federates(socket_descriptor); + normal_termination = true; + if (rti.base.tracing_enabled) { + // No need for a mutex lock because all threads have exited. 
+ stop_trace_locked(rti.base.trace); + lf_print("RTI trace file saved."); + } } lf_print("RTI is exiting."); // Do this before freeing scheduling nodes. From a07f78b06e60e4ee68c72e05ef2e5d28e05ee0b8 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Mon, 1 Jan 2024 11:47:53 -0800 Subject: [PATCH 46/83] Fix a bug in EIMT on microstep/after delay interaction --- core/federated/RTI/rti_common.c | 11 ++++++++--- core/federated/RTI/rti_remote.c | 3 +-- core/federated/federate.c | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/core/federated/RTI/rti_common.c b/core/federated/RTI/rti_common.c index 424e48135..560827344 100644 --- a/core/federated/RTI/rti_common.c +++ b/core/federated/RTI/rti_common.c @@ -101,13 +101,18 @@ tag_t earliest_future_incoming_message_tag(scheduling_node_t* e) { tag_t start_tag = {.time = start_time, .microstep = 0}; upstream->next_event = start_tag; } - tag_t earliest_tag_from_upstream = lf_tag_add(upstream->next_event, e->min_delays[i].min_delay); - /* Following debug message is too verbose for normal use: + // The min_delay here is a tag_t, not an interval_t. + // No delay at all is represented by (0,0). A delay of 0 is represented by (0,1). + // If the time part of the delay is greater than 0, then we want to ignore the microstep in + // upstream->next_event. Otherwise, we want preserve it and add to it. 
+ tag_t next_event = upstream->next_event; + if (e->min_delays[i].min_delay.time > 0) next_event.microstep = 0; + tag_t earliest_tag_from_upstream = lf_tag_add(next_event, e->min_delays[i].min_delay); + /* Following debug message is too verbose for normal use: */ LF_PRINT_DEBUG("RTI: Earliest next event upstream of fed/encl %d at fed/encl %d has tag " PRINTF_TAG ".", e->id, upstream->id, earliest_tag_from_upstream.time - start_time, earliest_tag_from_upstream.microstep); - */ if (lf_tag_compare(earliest_tag_from_upstream, t_d) < 0) { t_d = earliest_tag_from_upstream; } diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 6c6f11f90..9d4794419 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1673,8 +1673,7 @@ void *respond_to_erroneous_connections(void *nothing) { return NULL; } -void initialize_federate(federate_info_t *fed, uint16_t id) -{ +void initialize_federate(federate_info_t *fed, uint16_t id) { initialize_scheduling_node(&(fed->enclave), id); fed->requested_stop = false; fed->socket = -1; // No socket. diff --git a/core/federated/federate.c b/core/federated/federate.c index 545fe348e..677c9ef2c 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -2181,7 +2181,7 @@ void lf_latest_tag_complete(tag_t tag_to_send) { if (compare_with_last_tag >= 0) { return; } - LF_PRINT_LOG("Sending Latest Time Complete (LTC) " PRINTF_TAG " to the RTI.", + LF_PRINT_LOG("Sending Latest Tag Complete (LTC) " PRINTF_TAG " to the RTI.", tag_to_send.time - start_time, tag_to_send.microstep); send_tag(MSG_TYPE_LATEST_TAG_COMPLETE, tag_to_send); From 9de19abb7038ebc39f687472972bcc709f6844b8 Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Mon, 1 Jan 2024 17:23:05 -0800 Subject: [PATCH 47/83] Comments and formatting only --- core/federated/RTI/rti_common.c | 17 ++++++++++------- core/federated/RTI/rti_common.h | 5 ++--- core/federated/RTI/rti_local.c | 13 +++++++------ 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/core/federated/RTI/rti_common.c b/core/federated/RTI/rti_common.c index 560827344..afc48526e 100644 --- a/core/federated/RTI/rti_common.c +++ b/core/federated/RTI/rti_common.c @@ -26,8 +26,6 @@ void initialize_rti_common(rti_common_t * _rti_common) { rti_common->num_scheduling_nodes_handling_stop = 0; } -// FIXME: For log and debug message in this file, what sould be kept: 'enclave', -// 'federate', or 'enlcave/federate'? Currently its is 'enclave/federate'. // FIXME: Should scheduling_nodes tracing use the same mechanism as federates? // It needs to account a federate having itself a number of scheduling_nodes. // Currently, all calls to tracepoint_from_federate() and @@ -101,10 +99,11 @@ tag_t earliest_future_incoming_message_tag(scheduling_node_t* e) { tag_t start_tag = {.time = start_time, .microstep = 0}; upstream->next_event = start_tag; } - // The min_delay here is a tag_t, not an interval_t. - // No delay at all is represented by (0,0). A delay of 0 is represented by (0,1). - // If the time part of the delay is greater than 0, then we want to ignore the microstep in - // upstream->next_event. Otherwise, we want preserve it and add to it. + // The min_delay here is a tag_t, not an interval_t because it may account for more than + // one connection. No delay at all is represented by (0,0). A delay of 0 is represented + // by (0,1). If the time part of the delay is greater than 0, then we want to ignore + // the microstep in upstream->next_event because that microstep will have been lost. + // Otherwise, we want preserve it and add to it. 
tag_t next_event = upstream->next_event; if (e->min_delays[i].min_delay.time > 0) next_event.microstep = 0; tag_t earliest_tag_from_upstream = lf_tag_add(next_event, e->min_delays[i].min_delay); @@ -294,7 +293,11 @@ void notify_advance_grant_if_safe(scheduling_node_t* e) { // Local function used recursively to find minimum delays upstream. // Return in count the number of non-FOREVER_TAG entries in path_delays[]. -static void _update_min_delays_upstream(scheduling_node_t* end, scheduling_node_t* intermediate, tag_t path_delays[], size_t* count) { +static void _update_min_delays_upstream( + scheduling_node_t* end, + scheduling_node_t* intermediate, + tag_t path_delays[], + size_t* count) { // On first call, intermediate will be NULL, so the path delay is initialized to zero. tag_t delay_from_intermediate_so_far = ZERO_TAG; if (intermediate == NULL) { diff --git a/core/federated/RTI/rti_common.h b/core/federated/RTI/rti_common.h index d80775e05..deca5caa2 100644 --- a/core/federated/RTI/rti_common.h +++ b/core/federated/RTI/rti_common.h @@ -6,9 +6,8 @@ * @author Chadlia Jerad (chadlia.jerad@ensi-uma.tn) * @copyright (c) 2020-2023, The University of California at Berkeley * License in [BSD 2-clause](https://github.com/lf-lang/reactor-c/blob/main/LICENSE.md) - * @brief Common declarations for runtime infrastructure (RTI) for scheduling enclaves and distributed Lingua Franca programs. - * This file declares RTI features that are used by scheduling enclaves as well as federated - * LF programs. + * @brief Common declarations for runtime infrastructure (RTI) for scheduling enclaves + * and distributed Lingua Franca programs. 
*/ #if defined STANDALONE_RTI || defined LF_ENCLAVES #ifndef RTI_COMMON_H diff --git a/core/federated/RTI/rti_local.c b/core/federated/RTI/rti_local.c index 57af1047d..c75605426 100644 --- a/core/federated/RTI/rti_local.c +++ b/core/federated/RTI/rti_local.c @@ -7,17 +7,18 @@ * @copyright (c) 2020-2023, The University of California at Berkeley * License in [BSD 2-clause](https://github.com/lf-lang/reactor-c/blob/main/LICENSE.md) * - * This files implements the enclave coordination logic. + * This file implements the enclave coordination logic. * Here we are dealing with multiple mutexes. To avoid deadlocking we follow the * following rules: * 1) Mutexes are always locked in the following order: - * Enclave mutexes -> RTI mutex. + * Enclave mutexes followed by RTI mutex. * This means that we never lock an enclave mutex while holding the RTI mutex. * 2) Mutexes are always unlocked in the following order: - * RTI mutex -> Enclave mutex. - * 3) If the coordination logic might block. We unlock the enclave mutex - * -*/ + * RTI mutex followed by Enclave mutex. + * 3) If the coordination logic might block. We unlock the enclave mutex while + * blocking, using a condition variable to unblock. + * 4) When blocking on the coordination logic, never hold the RTI mutex. + */ #ifdef LF_ENCLAVES #include "rti_local.h" From 2f1c9df23fa2ab0ae38accfd041fa15201ca8b9b Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Tue, 2 Jan 2024 11:40:11 -0800 Subject: [PATCH 48/83] Removed message_record, replace with pqueue_tag --- core/federated/RTI/CMakeLists.txt | 1 - .../RTI/message_record/message_record.c | 176 ------------------ .../RTI/message_record/message_record.h | 86 --------- .../RTI/message_record/rti_pqueue_support.h | 101 ---------- core/federated/RTI/rti_common.h | 7 +- core/federated/RTI/rti_remote.c | 14 +- core/federated/RTI/rti_remote.h | 8 +- core/utils/pqueue_tag.c | 24 ++- include/core/utils/pqueue_tag.h | 62 ++++-- 9 files changed, 76 insertions(+), 403 deletions(-) delete mode 100644 core/federated/RTI/message_record/message_record.c delete mode 100644 core/federated/RTI/message_record/message_record.h delete mode 100644 core/federated/RTI/message_record/rti_pqueue_support.h diff --git a/core/federated/RTI/CMakeLists.txt b/core/federated/RTI/CMakeLists.txt index 75d812ab7..73b1b0d4e 100644 --- a/core/federated/RTI/CMakeLists.txt +++ b/core/federated/RTI/CMakeLists.txt @@ -73,7 +73,6 @@ add_executable( ${CoreLib}/utils/pqueue_base.c ${CoreLib}/utils/pqueue_tag.c ${CoreLib}/utils/pqueue.c - message_record/message_record.c ) IF(CMAKE_BUILD_TYPE MATCHES DEBUG) diff --git a/core/federated/RTI/message_record/message_record.c b/core/federated/RTI/message_record/message_record.c deleted file mode 100644 index bbea99b9b..000000000 --- a/core/federated/RTI/message_record/message_record.c +++ /dev/null @@ -1,176 +0,0 @@ -#if defined STANDALONE_RTI -/** - * @file message_record.c - * @author Soroush Bateni (soroush@berkeley.edu) - * @brief Record-keeping for in-transit messages. - * @version 0.1 - * @date 2022-06-02 - * - * @copyright Copyright (c) 2022, The University of California at Berkeley. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - -1. 
Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY -EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL -THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF -THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -***************/ - -#include "message_record.h" -#include "platform.h" -#include - -/** - * @brief Initialize the in-transit message record queue. - * - * @return in_transit_message_record_q - */ -in_transit_message_record_q_t* initialize_in_transit_message_q() { - in_transit_message_record_q_t* queue = - (in_transit_message_record_q_t*)calloc( - 1, - sizeof(in_transit_message_record_q_t) - ); - queue->main_queue = pqueue_init( - 10, - in_reverse_order, - get_message_record_index, - get_message_record_position, - set_message_record_position, - tags_match, - print_message_record - ); - - queue->transfer_queue = pqueue_init( - 10, - in_reverse_order, - get_message_record_index, - get_message_record_position, - set_message_record_position, - tags_match, - print_message_record - ); - - return queue; -} - -/** - * @brief Free the memory occupied by the `queue`. 
- * - * @param queue The queue to free. - */ -void free_in_transit_message_q(in_transit_message_record_q_t* queue) { - pqueue_free(queue->main_queue); - pqueue_free(queue->transfer_queue); - free(queue); -} - -/** - * @brief Add a record of the in-transit message. - * - * @param queue The queue to add to. - * @param tag The tag of the in-transit message. - * @return 0 on success. - */ -int add_in_transit_message_record(in_transit_message_record_q_t* queue, tag_t tag) { - in_transit_message_record_t* in_transit_record = malloc(sizeof(in_transit_message_record_t)); - in_transit_record->tag = tag; - return pqueue_insert( - queue->main_queue, - (void*)in_transit_record - ); -} - -/** - * @brief Clean the record of in-transit messages up to and including `tag`. - * - * @param queue The queue to clean. - * @param tag Will clean all messages with tags <= tag. - */ -void clean_in_transit_message_record_up_to_tag(in_transit_message_record_q_t* queue, tag_t tag) { - in_transit_message_record_t* head_of_in_transit_messages = (in_transit_message_record_t*)pqueue_peek(queue->main_queue); - while ( - head_of_in_transit_messages != NULL && // Queue is not empty - head_of_in_transit_messages->tag.time <= tag.time // The head message record has a time less than or equal to - // `tag.time`. - ) { - // Now compare the tags. The message record queue is ordered according to the `time` field, so we need to check - // all records with that `time` and find those that have smaller or equal full tags. 
- if (lf_tag_compare( - head_of_in_transit_messages->tag, - tag - ) <= 0 - ) { - LF_PRINT_DEBUG( - "RTI: Removed a message with tag (" PRINTF_TIME ", %u) from the list of in-transit messages.", - head_of_in_transit_messages->tag.time - lf_time_start(), - head_of_in_transit_messages->tag.microstep - ); - - free(pqueue_pop(queue->main_queue)); - } else { - // Add it to the transfer queue - pqueue_insert(queue->transfer_queue, pqueue_pop(queue->main_queue)); - } - head_of_in_transit_messages = (in_transit_message_record_t*)pqueue_peek(queue->main_queue); - } - // Empty the transfer queue (which holds messages with equal time but larger microstep) into the main queue. - pqueue_empty_into(&queue->main_queue, &queue->transfer_queue); -} - -/** - * @brief Get the minimum tag of all currently recorded in-transit messages. - * - * @param queue The queue to search in (of type `in_transit_message_record_q`). - * @return tag_t The minimum tag of all currently recorded in-transit messages. Return `FOREVER_TAG` if the queue is empty. - */ -tag_t get_minimum_in_transit_message_tag(in_transit_message_record_q_t* queue) { - tag_t minimum_tag = FOREVER_TAG; - - in_transit_message_record_t* head_of_in_transit_messages = (in_transit_message_record_t*)pqueue_peek(queue->main_queue); - while (head_of_in_transit_messages != NULL) { // Queue is not empty - // The message record queue is ordered according to the `time` field, so we need to check - // all records with the minimum `time` and find those that have the smallest tag. - if (lf_tag_compare( - head_of_in_transit_messages->tag, - minimum_tag - ) <= 0 - ) { - minimum_tag = head_of_in_transit_messages->tag; - } else if (head_of_in_transit_messages->tag.time > minimum_tag.time) { - break; - } - - // Add the head to the transfer queue. 
- pqueue_insert(queue->transfer_queue, pqueue_pop(queue->main_queue)); - - head_of_in_transit_messages = (in_transit_message_record_t*)pqueue_peek(queue->main_queue); - } - // Empty the transfer queue (which holds messages with equal time but larger microstep) into the main queue. - pqueue_empty_into(&queue->main_queue, &queue->transfer_queue); - - if (head_of_in_transit_messages != NULL) { - LF_PRINT_DEBUG( - "RTI: Minimum tag of all in-transit messages: " PRINTF_TAG, - head_of_in_transit_messages->tag.time - lf_time_start(), - head_of_in_transit_messages->tag.microstep - ); - } - - return minimum_tag; -} - -#endif // STANDALONE_RTI diff --git a/core/federated/RTI/message_record/message_record.h b/core/federated/RTI/message_record/message_record.h deleted file mode 100644 index d57f81f64..000000000 --- a/core/federated/RTI/message_record/message_record.h +++ /dev/null @@ -1,86 +0,0 @@ -#if defined STANDALONE_RTI -/** - * @file message_record.h - * @author Soroush Bateni (soroush@berkeley.edu) - * @brief Record-keeping for in-transit messages. - * @version 0.1 - * @date 2022-06-02 - * - * @copyright Copyright (c) 2022, The University of California at Berkeley. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY -EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL -THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF -THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -***************/ - -#ifndef RTI_MESSAGE_RECORD_H -#define RTI_MESSAGE_RECORD_H - -#include "rti_pqueue_support.h" - -/** - * @brief Queue to keep a record of in-transit messages. - * - */ -typedef struct { - pqueue_t* main_queue; // The primary queue. - pqueue_t* transfer_queue; // Queue used for housekeeping. -} in_transit_message_record_q_t; - -/** - * @brief Initialize the in-transit message record queue. - * - * @return in_transit_message_record_q - */ -in_transit_message_record_q_t* initialize_in_transit_message_q(); - -/** - * @brief Free the memory occupied by the `queue`. - * - * @param queue The queue to free. - */ -void free_in_transit_message_q(in_transit_message_record_q_t* queue); - -/** - * @brief Add a record of the in-transit message. - * - * @param queue The queue to add to (of type `in_transit_message_record_q`). - * @param tag The tag of the in-transit message. - * @return 0 on success. - */ -int add_in_transit_message_record(in_transit_message_record_q_t* queue, tag_t tag); - -/** - * @brief Clean the record of in-transit messages up to and including `tag`. - * - * @param queue The queue to clean (of type `in_transit_message_record_q`). - * @param tag Will clean all messages with tags <= tag. - */ -void clean_in_transit_message_record_up_to_tag(in_transit_message_record_q_t* queue, tag_t tag); - -/** - * @brief Get the minimum tag of all currently recorded in-transit messages. 
- * - * @param queue The queue to search in (of type `in_transit_message_record_q`). - * @return tag_t The minimum tag of all currently recorded in-transit messages. Return `FOREVER_TAG` if the queue is empty. - */ -tag_t get_minimum_in_transit_message_tag(in_transit_message_record_q_t* queue); - -#endif // RTI_MESSAGE_RECORD_H -#endif // STANDALONE_RTI diff --git a/core/federated/RTI/message_record/rti_pqueue_support.h b/core/federated/RTI/message_record/rti_pqueue_support.h deleted file mode 100644 index 09a35183a..000000000 --- a/core/federated/RTI/message_record/rti_pqueue_support.h +++ /dev/null @@ -1,101 +0,0 @@ -#if defined STANDALONE_RTI -/** - * @file rti_pqueue_support.h - * @author Soroush Bateni (soroush@berkeley.edu) - * @brief Header-only support functions for pqueue (in the RTI). - * @version 0.1 - * @date 2022-06-02 - * - * @copyright Copyright (c) 2022, The University of California at Berkeley. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY -EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL -THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF -THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -***************/ - -#ifndef RTI_PQUEUE_SUPPORT_H -#define RTI_PQUEUE_SUPPORT_H - -#include "tag.h" -#include "utils/pqueue.h" -#include "utils/util.h" -#include "platform.h" - -// ********** Priority Queue Support Start -/** - * @brief Represent an in-transit message. - * - */ -typedef struct in_transit_message_record { - tag_t tag; // Tag of the in-transit message. - size_t pos; // Position in the priority queue. -} in_transit_message_record_t; - -/** - * Return whether or not the given `in_transit_message_record_t` types have the same tag. - */ -static int tags_match(void* next, void* curr) { - return (lf_tag_compare( - ((in_transit_message_record_t*)next)->tag, - ((in_transit_message_record_t*)curr)->tag - ) == 0); -} - -/** - * Report a priority equal to the time of the given in-transit message. - * Used for sorting pointers to in_transit_message_record_t structs. - */ -static pqueue_pri_t get_message_record_index(void *a) { - return (pqueue_pri_t)(((in_transit_message_record_t*) a)->tag.time); -} - -/** - * Return the given in_transit_message_record_t's position in the queue. - */ -static size_t get_message_record_position(void *a) { - return ((in_transit_message_record_t*) a)->pos; -} - -/** - * Set the given in_transit_message_record_t's position in the queue. 
- */ -static void set_message_record_position(void *a, size_t pos) { - ((in_transit_message_record_t*) a)->pos = pos; -} - -/** - * Print some information about the given in-transit message. - * - * DEBUG function only. - */ -static void print_message_record(void *message) { - in_transit_message_record_t *r = (in_transit_message_record_t*)message; - LF_PRINT_DEBUG( - "Tag of the in_transit_message_record_t: (" PRINTF_TIME ", %u). " - "Its position in the priority queue: %zu", - r->tag.time - lf_time_start(), - r->tag.microstep, - r->pos - ); -} - -// ********** Priority Queue Support End -#endif - -#endif // STANDALONE_RTI diff --git a/core/federated/RTI/rti_common.h b/core/federated/RTI/rti_common.h index deca5caa2..770918d5b 100644 --- a/core/federated/RTI/rti_common.h +++ b/core/federated/RTI/rti_common.h @@ -111,9 +111,10 @@ typedef struct { void initialize_rti_common(rti_common_t * rti_common); /** - * An scheduling node calls this function after it completed a tag. - * The function updates the completed tag and check if the downstream scheduling nodes - * are eligible for receiving TAGs. + * @brief Update the completed tag for the specified node. + * + * This checks whether any downstream nodes become eligible to receive TAG + * or PTAG, and sends those signals if appropriate. * * The function is prepended with an underscore because a function called * `logical_tag_complete` is code-generated by the compiler. 
diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 9d4794419..afcb9d0b9 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -307,10 +307,8 @@ void notify_provisional_tag_advance_grant(scheduling_node_t *e, tag_t tag) { void update_federate_next_event_tag_locked(uint16_t federate_id, tag_t next_event_tag) { federate_info_t *fed = GET_FED_INFO(federate_id); - tag_t min_in_transit_tag = get_minimum_in_transit_message_tag(fed->in_transit_message_tags); - if (lf_tag_compare( - min_in_transit_tag, - next_event_tag) < 0) { + tag_t min_in_transit_tag = pqueue_tag_peek_tag(fed->in_transit_message_tags)->tag; + if (lf_tag_compare(min_in_transit_tag, next_event_tag) < 0) { next_event_tag = min_in_transit_tag; } update_scheduling_node_next_event_tag_locked(&(fed->enclave), next_event_tag); @@ -461,7 +459,7 @@ void handle_timed_message(federate_info_t *sending_federate, unsigned char *buff // Record this in-transit message in federate's in-transit message queue. if (lf_tag_compare(fed->enclave.completed, intended_tag) < 0) { // Add a record of this message to the list of in-transit messages to this federate. - add_in_transit_message_record( + pqueue_tag_insert_if_no_match( fed->in_transit_message_tags, intended_tag); LF_PRINT_DEBUG( @@ -537,7 +535,7 @@ void handle_latest_tag_complete(federate_info_t *fed) { // FIXME: Should this function be in the enclave version? LF_MUTEX_LOCK(rti_mutex); // See if we can remove any of the recorded in-transit messages for this. - clean_in_transit_message_record_up_to_tag(fed->in_transit_message_tags, fed->enclave.completed); + pqueue_tag_remove_up_to(fed->in_transit_message_tags, completed); LF_MUTEX_UNLOCK(rti_mutex); } @@ -1678,7 +1676,7 @@ void initialize_federate(federate_info_t *fed, uint16_t id) { fed->requested_stop = false; fed->socket = -1; // No socket. 
fed->clock_synchronization_enabled = true; - fed->in_transit_message_tags = initialize_in_transit_message_q(); + fed->in_transit_message_tags = pqueue_tag_init(); strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); fed->server_ip_addr.s_addr = 0; fed->server_port = -1; @@ -1717,7 +1715,7 @@ void wait_for_federates(int socket_descriptor) { federate_info_t *fed = GET_FED_INFO(i); lf_print("RTI: Waiting for thread handling federate %d.", fed->enclave.id); lf_thread_join(fed->thread_id, &thread_exit_status); - free_in_transit_message_q(fed->in_transit_message_tags); + pqueue_tag_free(fed->in_transit_message_tags); lf_print("RTI: Federate %d thread exited.", fed->enclave.id); } diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index 2bb00ba93..91897639d 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -30,7 +30,7 @@ #endif #include "lf_types.h" -#include "message_record/message_record.h" +#include "pqueue_tag.h" /** Time allowed for federates to reply to stop request. */ #define MAX_TIME_FOR_REPLY_TO_STOP_REQUEST SEC(30) @@ -61,9 +61,9 @@ typedef struct federate_info_t { struct sockaddr_in UDP_addr; // The UDP address for the federate. bool clock_synchronization_enabled; // Indicates the status of clock synchronization // for this federate. Enabled by default. - in_transit_message_record_q_t* in_transit_message_tags; // Record of in-transit messages to this federate that are not - // yet processed. This record is ordered based on the time - // value of each message for a more efficient access. + pqueue_tag_t* in_transit_message_tags; // Record of in-transit messages to this federate that are not + // yet processed. This record is ordered based on the time + // value of each message for a more efficient access. 
char server_hostname[INET_ADDRSTRLEN]; // Human-readable IP address and int32_t server_port; // port number of the socket server of the federate // if it has any incoming direct connections from other federates. diff --git a/core/utils/pqueue_tag.c b/core/utils/pqueue_tag.c index 579926f99..2d05af7bc 100644 --- a/core/utils/pqueue_tag.c +++ b/core/utils/pqueue_tag.c @@ -132,6 +132,16 @@ int pqueue_tag_insert_if_no_match(pqueue_tag_t* q, tag_t t) { } } +pqueue_tag_element_t* pqueue_tag_peek(pqueue_tag_t* q) { + return (pqueue_tag_element_t*) pqueue_peek((pqueue_t*)q); +} + +tag_t pqueue_tag_peek_tag(pqueue_tag_t* q) { + pqueue_tag_element_t* element = (pqueue_tag_element_t*)pqueue_tag_peek(q); + if (element == NULL) return FOREVER_TAG; + else return element->tag; +} + pqueue_tag_element_t* pqueue_tag_pop(pqueue_tag_t* q) { return (pqueue_tag_element_t*)pqueue_pop((pqueue_t*)q); } @@ -146,10 +156,14 @@ tag_t pqueue_tag_pop_tag(pqueue_tag_t* q) { } } -int pqueue_tag_remove(pqueue_tag_t* q, pqueue_tag_element_t* e) { - return pqueue_remove((pqueue_t*) q, (void*) e); +void pqueue_tag_remove(pqueue_tag_t* q, pqueue_tag_element_t* e) { + pqueue_remove((pqueue_t*) q, (void*) e); } -pqueue_tag_element_t* pqueue_tag_peek(pqueue_tag_t* q) { - return (pqueue_tag_element_t*) pqueue_peek((pqueue_t*)q); -} +void pqueue_tag_remove_up_to(pqueue_tag_t* q, tag_t t){ + tag_t head = pqueue_tag_peek_tag(q); + while (lf_tag_compare(head, FOREVER_TAG) < 0 && lf_tag_compare(head, t) <= 0) { + pqueue_tag_pop(q); + head = pqueue_tag_peek_tag(q); + } +} \ No newline at end of file diff --git a/include/core/utils/pqueue_tag.h b/include/core/utils/pqueue_tag.h index 3eb8d4a5e..ad4ac84d1 100644 --- a/include/core/utils/pqueue_tag.h +++ b/include/core/utils/pqueue_tag.h @@ -56,12 +56,13 @@ typedef struct { } pqueue_tag_element_t; /** - * Type of a priority queue sorted by tags. + * @brief Type of a priority queue sorted by tags. 
*/ typedef pqueue_t pqueue_tag_t; /** * @brief Create a priority queue sorted by tags. + * * The elements of the priority queue will be of type pqueue_tag_element_t. * The caller should call pqueue_tag_free() when finished with the queue. * @return A dynamically allocated priority queue or NULL if memory allocation fails. @@ -69,19 +70,22 @@ typedef pqueue_t pqueue_tag_t; pqueue_tag_t* pqueue_tag_init(size_t initial_size); /** - * Free all memory used by the queue including any elements that are marked is_dynamic. + * @brief Free all memory used by the queue including elements that are marked dynamic. + * * @param q The queue. */ void pqueue_tag_free(pqueue_tag_t *q); /** - * Return the size of the queue. + * @brief Return the size of the queue. + * * @param q The queue. */ size_t pqueue_tag_size(pqueue_tag_t *q); /** - * Insert an element into the queue. + * @brief Insert an element into the queue. + * * @param q The queue. * @param e The element to insert. * @return 0 on success @@ -90,6 +94,7 @@ int pqueue_tag_insert(pqueue_tag_t* q, pqueue_tag_element_t* d); /** * @brief Insert a tag into the queue. + * * This automatically creates a dynamically allocated element in the queue * and ensures that if the element is still on the queue when pqueue_tag_free * is called, then that memory will be freed. @@ -101,6 +106,7 @@ int pqueue_tag_insert_tag(pqueue_tag_t* q, tag_t t); /** * @brief Insert a tag into the queue if the tag is not already in the queue. + * * This automatically creates a dynamically allocated element in the queue * and ensures that if the element is still on the queue when pqueue_tag_free * is called, then that memory will be freed. @@ -111,16 +117,30 @@ int pqueue_tag_insert_tag(pqueue_tag_t* q, tag_t t); int pqueue_tag_insert_if_no_match(pqueue_tag_t* q, tag_t t); /** - * @brief Pop the least-tag element from the queue and return its tag. - * If the queue is empty, return FOREVER_TAG. 
This function handles freeing - * the element struct if it was dynamically allocated. + * @brief Return the first item with the specified tag or NULL if there is none. * @param q The queue. - * @return NULL on error, otherwise the entry + * @param t The tag. + * @return An entry with the specified tag or NULL if there isn't one. */ -tag_t pqueue_tag_pop_tag(pqueue_tag_t* q); +pqueue_tag_element_t* pqueue_tag_find_with_tag(pqueue_tag_t *q, tag_t t); + +/** + * @brief Return highest-ranking item (the one with the least tag) without removing it. + * @param q The queue. + * @return NULL on if the queue is empty, otherwise the entry. + */ +pqueue_tag_element_t* pqueue_tag_peek(pqueue_tag_t* q); + +/** + * @brief Return the least tag in the queue or FOREVER if the queue is empty. + * @param q The queue. + * @return The least tag in the queue or FOREVER if the queue is empty. + */ +tag_t pqueue_tag_peek_tag(pqueue_tag_t* q); /** * @brief Pop the least-tag element from the queue. + * * If the entry was dynamically allocated, then it is now up to the caller * to ensure that it is freed. It will not be freed by pqueue_tag_free. * @param q The queue. @@ -129,26 +149,30 @@ tag_t pqueue_tag_pop_tag(pqueue_tag_t* q); pqueue_tag_element_t* pqueue_tag_pop(pqueue_tag_t* q); /** - * Return the first item with the specified tag or NULL if there is none. + * @brief Pop the least-tag element from the queue and return its tag. + * + * If the queue is empty, return FOREVER_TAG. This function handles freeing + * the element struct if it was dynamically allocated. * @param q The queue. - * @param t The tag. - * @return An entry with the specified tag or NULL if there isn't one. + * @return NULL on error, otherwise the entry */ -pqueue_tag_element_t* pqueue_tag_find_with_tag(pqueue_tag_t *q, tag_t t); +tag_t pqueue_tag_pop_tag(pqueue_tag_t* q); /** - * Remove an item from the queue. + * @brief Remove an item from the queue. + * * @param q The queue. * @param e The entry to remove. 
- * @return 0 on success */ -int pqueue_tag_remove(pqueue_tag_t* q, pqueue_tag_element_t* e); +void pqueue_tag_remove(pqueue_tag_t* q, pqueue_tag_element_t* e); /** - * Access highest-ranking item without removing it. + * @brief Remove items from the queue with tags up to and including the specified tag. + * + * If the specified tag is FOREVER_TAG, then all items will be removed. * @param q The queue. - * @return NULL on error, otherwise the entry. + * @param t The specified tag. */ -pqueue_tag_element_t* pqueue_tag_peek(pqueue_tag_t* q); +void pqueue_tag_remove_up_to(pqueue_tag_t* q, tag_t t); #endif // PQUEUE_TAG_H From 0ba93dab5231e67311eab9b7674351363122096c Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Tue, 2 Jan 2024 12:13:17 -0800 Subject: [PATCH 49/83] Fixed RTI compile errors --- core/federated/RTI/rti_remote.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index afcb9d0b9..487b0cb7c 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -307,7 +307,7 @@ void notify_provisional_tag_advance_grant(scheduling_node_t *e, tag_t tag) { void update_federate_next_event_tag_locked(uint16_t federate_id, tag_t next_event_tag) { federate_info_t *fed = GET_FED_INFO(federate_id); - tag_t min_in_transit_tag = pqueue_tag_peek_tag(fed->in_transit_message_tags)->tag; + tag_t min_in_transit_tag = pqueue_tag_peek_tag(fed->in_transit_message_tags); if (lf_tag_compare(min_in_transit_tag, next_event_tag) < 0) { next_event_tag = min_in_transit_tag; } @@ -1676,7 +1676,7 @@ void initialize_federate(federate_info_t *fed, uint16_t id) { fed->requested_stop = false; fed->socket = -1; // No socket. 
fed->clock_synchronization_enabled = true; - fed->in_transit_message_tags = pqueue_tag_init(); + fed->in_transit_message_tags = pqueue_tag_init(10); strncpy(fed->server_hostname, "localhost", INET_ADDRSTRLEN); fed->server_ip_addr.s_addr = 0; fed->server_port = -1; From 3c5a96bf2b902e6e036032585abb8a0e189c9bbd Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Tue, 2 Jan 2024 16:51:28 -0800 Subject: [PATCH 50/83] Update test for void return value --- test/general/utils/pqueue_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/general/utils/pqueue_test.c b/test/general/utils/pqueue_test.c index f95492799..e0f252c7d 100644 --- a/test/general/utils/pqueue_test.c +++ b/test/general/utils/pqueue_test.c @@ -79,7 +79,7 @@ static void pop_empty(pqueue_tag_t* q) { static void remove_from_queue(pqueue_tag_t* q, pqueue_tag_element_t* e1, pqueue_tag_element_t* e2) { assert(pqueue_tag_insert(q, e1) == 0); assert(pqueue_tag_insert(q, e2) == 0); - assert(pqueue_tag_remove(q, e1) == 0); + pqueue_tag_remove(q, e1); assert(pqueue_tag_peek(q) == e2); assert(pqueue_tag_size(q) == 1); } From c374f3172c50d1c8e2bef4e7cd3f7351c7eefc5b Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Tue, 2 Jan 2024 21:27:37 -0800 Subject: [PATCH 51/83] Make resign messages backward compatible --- core/federated/RTI/main.c | 22 +++--- core/federated/RTI/rti_remote.c | 84 ++++++++++++++++----- core/federated/RTI/rti_remote.h | 32 -------- core/federated/federate.c | 63 +++++++++------- include/core/federated/network/net_common.h | 30 +++++--- include/core/trace.h | 2 + 6 files changed, 129 insertions(+), 104 deletions(-) diff --git a/core/federated/RTI/main.c b/core/federated/RTI/main.c index 9d5f8a437..89fe8d62b 100644 --- a/core/federated/RTI/main.c +++ b/core/federated/RTI/main.c @@ -72,24 +72,20 @@ const char *rti_trace_file_name = "rti.lft"; bool normal_termination = false; /** - * Send a resign signal to the specified federate. 
The tag payload is the tag - * of the most recently received LTC from the federate or NEVER - * if no LTC has been received. + * Send a failed signal to the specified federate. */ -static void send_resign_signal(federate_info_t* fed) { - size_t bytes_to_write = 1 + sizeof(tag_t); +static void send_failed_signal(federate_info_t* fed) { + size_t bytes_to_write = 1; unsigned char buffer[bytes_to_write]; - buffer[0] = MSG_TYPE_RESIGN; - tag_t tag = fed->enclave.completed; - encode_tag(&(buffer[1]), tag); + buffer[0] = MSG_TYPE_FAILED; int failed = write_to_socket(fed->socket, bytes_to_write, &(buffer[0])); if (failed == 0) { - LF_PRINT_LOG("RTI has sent resign signal to federate %d due to abnormal termination.", fed->enclave.id); + LF_PRINT_LOG("RTI has sent failed signal to federate %d due to abnormal termination.", fed->enclave.id); } else { - LF_PRINT_LOG("RTI failed to send resign signal to federate %d on socket ID %d.", fed->enclave.id, fed->socket); + LF_PRINT_LOG("RTI failed to send failed signal to federate %d on socket ID %d.", fed->enclave.id, fed->socket); } if (rti.base.tracing_enabled) { - tracepoint_rti_to_federate(rti.base.trace, send_RESIGN, fed->enclave.id, &tag); + tracepoint_rti_to_federate(rti.base.trace, send_FAILED, fed->enclave.id, NULL); } } @@ -97,7 +93,7 @@ static void send_resign_signal(federate_info_t* fed) { * @brief Function to run upon termination. * This function will be invoked both after main() returns and when a signal * that results in terminating the process, such as SIGINT. In the former - * case, it should do nothing. In the latter case, it will send a MSG_TYPE_RESIGN + * case, it should do nothing. In the latter case, it will send a MSG_TYPE_FAILED * signal to each federate and attempt to write the trace file, but without * acquiring a mutex lock, so the resulting files may be incomplete or even * corrupted. 
But this is better than just failing to write the data we have @@ -108,7 +104,7 @@ void termination() { for (int i = 0; i < rti.base.number_of_scheduling_nodes; i++) { federate_info_t *f = (federate_info_t*)rti.base.scheduling_nodes[i]; if (!f || f->enclave.state == NOT_CONNECTED) continue; - send_resign_signal(f); + send_failed_signal(f); } if (rti.base.tracing_enabled) { stop_trace_locked(rti.base.trace); diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 487b0cb7c..514b7aa69 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1039,30 +1039,73 @@ void *clock_synchronization_thread(void *noargs) { return NULL; } -void handle_federate_resign(federate_info_t *my_fed) { +/** + * Handle MSG_TYPE_FAILED sent by a federate. This message is sent by a federate + * that is exiting in failure. In this case, the RTI will + * also terminate abnormally, returning a non-zero exit code when it exits. + * But it does not immediately exit. It does close the socket connection to the federate. + * + * This function assumes the caller does not hold the mutex. + * + * @param my_fed The federate sending a MSG_TYPE_FAILED message. + */ +static void handle_federate_failed(federate_info_t *my_fed) { // Nothing more to do. Close the socket and exit. LF_MUTEX_LOCK(rti_mutex); - // Extract the tag - size_t header_size = 1 + sizeof(tag_t); - unsigned char buffer[header_size]; - // Read the header, minus the first byte which has already been read. 
- read_from_socket_fail_on_error(&my_fed->socket, header_size - 1, &(buffer[1]), NULL, - "RTI failed to read the resign tag from remote federate."); - // Extract the tag sent by the resigning federate - tag_t tag = extract_tag(&(buffer[1])); if (rti_remote->base.tracing_enabled) { - tracepoint_rti_from_federate(rti_remote->base.trace, receive_RESIGN, my_fed->enclave.id, &tag); + tracepoint_rti_from_federate(rti_remote->base.trace, receive_FAILED, my_fed->enclave.id, NULL); } - if (lf_tag_compare(tag, NEVER_TAG) == 0) { - // The federate is reporting an error. - _lf_federate_reports_error = true; - lf_print("RTI: Federate %d reports an error and has resigned.", my_fed->enclave.id); - } else { - lf_print("RTI: Federate %d has resigned.", my_fed->enclave.id); + _lf_federate_reports_error = true; + lf_print_warning("RTI: Federate %d reports an error and has exited.", my_fed->enclave.id); + + my_fed->enclave.state = NOT_CONNECTED; + + // Indicate that there will no further events from this federate. + my_fed->enclave.next_event = FOREVER_TAG; + + // According to this: https://stackoverflow.com/questions/4160347/close-vs-shutdown-socket, + // the close should happen when receiving a 0 length message from the other end. + // Here, we just signal the other side that no further writes to the socket are + // forthcoming, which should result in the other end getting a zero-length reception. + shutdown(my_fed->socket, SHUT_RDWR); + + // We can now safely close the socket. + close(my_fed->socket); // from unistd.h + + // Check downstream federates to see whether they should now be granted a TAG. + // To handle cycles, need to create a boolean array to keep + // track of which upstream federates have been visited. + bool *visited = (bool *)calloc(rti_remote->base.number_of_scheduling_nodes, sizeof(bool)); // Initializes to 0. 
+ notify_downstream_advance_grant_if_safe(&(my_fed->enclave), visited); + free(visited); + + LF_MUTEX_UNLOCK(rti_mutex); +} + +/** + * Handle MSG_TYPE_RESIGN sent by a federate. This message is sent at the time of termination + * after all shutdown events are processed on the federate. + * + * This function assumes the caller does not hold the mutex. + * + * @note At this point, the RTI might have outgoing messages to the federate. This + * function thus first performs a shutdown on the socket, which sends an EOF. It then + * waits for the remote socket to be closed before closing the socket itself. + * + * @param my_fed The federate sending a MSG_TYPE_RESIGN message. + */ +static void handle_federate_resign(federate_info_t *my_fed) { + // Nothing more to do. Close the socket and exit. + LF_MUTEX_LOCK(rti_mutex); + + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_from_federate(rti_remote->base.trace, receive_RESIGN, my_fed->enclave.id, NULL); } + lf_print("RTI: Federate %d has resigned.", my_fed->enclave.id); + my_fed->enclave.state = NOT_CONNECTED; // Indicate that there will no further events from this federate. @@ -1077,8 +1120,9 @@ void handle_federate_resign(federate_info_t *my_fed) { // Wait for the federate to send an EOF or a socket error to occur. // Discard any incoming bytes. Normally, this read should return 0 because // the federate is resigning and should itself invoke shutdown. - while (read(my_fed->socket, buffer, header_size) > 0) - ; + unsigned char buffer[10]; + while (read(my_fed->socket, buffer, 10) > 0); + // We can now safely close the socket. 
close(my_fed->socket); // from unistd.h @@ -1129,7 +1173,6 @@ void *federate_info_thread_TCP(void *fed) { case MSG_TYPE_RESIGN: handle_federate_resign(my_fed); return NULL; - break; case MSG_TYPE_NEXT_EVENT_TAG: handle_next_event_tag(my_fed); break; @@ -1148,6 +1191,9 @@ void *federate_info_thread_TCP(void *fed) { case MSG_TYPE_PORT_ABSENT: handle_port_absent_message(my_fed, buffer); break; + case MSG_TYPE_FAILED: + handle_federate_failed(my_fed); + return NULL; default: lf_print_error("RTI received from federate %d an unrecognized TCP message type: %u.", my_fed->enclave.id, buffer[0]); if (rti_remote->base.tracing_enabled) diff --git a/core/federated/RTI/rti_remote.h b/core/federated/RTI/rti_remote.h index 91897639d..9303da42d 100644 --- a/core/federated/RTI/rti_remote.h +++ b/core/federated/RTI/rti_remote.h @@ -342,38 +342,6 @@ void handle_physical_clock_sync_message(federate_info_t* my_fed, socket_type_t s */ void* clock_synchronization_thread(void* noargs); -/** - * A function to handle messages labeled - * as MSG_TYPE_RESIGN sent by a federate. This - * message is sent at the time of termination - * after all shutdown events are processed - * on the federate. - * - * If the tag on the resign message is NEVER, then the RTI assumes that - * the federate is terminating abnormally. In this case, the RTI will - * also terminate abnormally, returning a non-zero exit code. - * - * This function assumes the caller does not hold the mutex. - * - * @note At this point, the RTI might have - * outgoing messages to the federate. This - * function thus first performs a shutdown - * on the socket which sends an EOF. It then - * waits for the remote socket to be closed - * before closing the socket itself. - * - * Assumptions: - * - We assume that the other side (the federates) - * are in charge of closing the socket (by calling - * close() on the socket), and then wait for the RTI - * to shutdown the socket. 
- * - We assume that calling shutdown() follows the same - * shutdown procedure as stated in the TCP/IP specification. - * - * @param my_fed The federate sending a MSG_TYPE_RESIGN message. - **/ -void handle_federate_resign(federate_info_t *my_fed); - /** * Thread handling TCP communication with a federate. * @param fed A pointer to the federate's struct that has the diff --git a/core/federated/federate.c b/core/federated/federate.c index 677c9ef2c..6330adfec 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -155,12 +155,12 @@ static void send_tag(unsigned char type, tag_t tag) { /** * Return true if either the socket to the RTI is broken or the socket is - * alive and the first unread byte on the socket's queue is MSG_TYPE_RESIGN. + * alive and the first unread byte on the socket's queue is MSG_TYPE_FAILED. */ -static bool rti_resigned() { +static bool rti_failed() { unsigned char first_byte; ssize_t bytes = peek_from_socket(_fed.socket_TCP_RTI, &first_byte); - if (bytes < 0 || (bytes == 1 && first_byte == MSG_TYPE_RESIGN)) return true; + if (bytes < 0 || (bytes == 1 && first_byte == MSG_TYPE_FAILED)) return true; else return false; } @@ -831,8 +831,8 @@ static int perform_hmac_authentication() { return -1; } if (received[0] != MSG_TYPE_RTI_RESPONSE) { - if (received[0] == MSG_TYPE_RESIGN) { - lf_print_error("RTI has resigned."); + if (received[0] == MSG_TYPE_FAILED) { + lf_print_error("RTI has failed."); return -1; } else { lf_print_error( @@ -941,8 +941,8 @@ static instant_t get_start_time_from_rti(instant_t my_physical_time) { // First byte received is the message ID. if (buffer[0] != MSG_TYPE_TIMESTAMP) { - if (buffer[0] == MSG_TYPE_RESIGN) { - lf_print_error_and_exit("RTI has unexpectedly resigned."); + if (buffer[0] == MSG_TYPE_FAILED) { + lf_print_error_and_exit("RTI has failed."); } lf_print_error_and_exit( "Expected a MSG_TYPE_TIMESTAMP message from the RTI. 
Got %u (see net_common.h).", @@ -1374,27 +1374,33 @@ static void handle_stop_request_message() { } /** - * Send a resign signal to the RTI. The tag payload will be the current - * tag of the specified environment or, if there has been an error that - * will lead to an abnormal termination, the tag NEVER_TAG. + * Send a resign signal to the RTI. */ static void send_resign_signal(environment_t* env) { - size_t bytes_to_write = 1 + sizeof(tag_t); + size_t bytes_to_write = 1; unsigned char buffer[bytes_to_write]; buffer[0] = MSG_TYPE_RESIGN; - if (_lf_normal_termination) { - encode_tag(&(buffer[1]), env->current_tag); - } else { - encode_tag(&(buffer[1]), NEVER_TAG); - } LF_MUTEX_LOCK(lf_outbound_socket_mutex); write_to_socket_fail_on_error( &_fed.socket_TCP_RTI, bytes_to_write, &(buffer[0]), &lf_outbound_socket_mutex, - "Failed to send RESIGN."); + "Failed to send MSG_TYPE_RESIGN."); LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); LF_PRINT_LOG("Resigned."); } +/** + * Send a failed signal to the RTI. + */ +static void send_failed_signal(environment_t* env) { + size_t bytes_to_write = 1; + unsigned char buffer[bytes_to_write]; + buffer[0] = MSG_TYPE_FAILED; + write_to_socket_fail_on_error( + &_fed.socket_TCP_RTI, bytes_to_write, &(buffer[0]), NULL, + "Failed to send MSG_TYPE_FAILED."); + LF_PRINT_LOG("Failed."); +} + /** * @brief Stop the traces associated with all environments in the program. */ @@ -1407,12 +1413,12 @@ static void stop_all_traces() { } /** - * Handle a resign signal from the RTI. The RTI will only resign + * Handle a failed signal from the RTI. The RTI will only fail * if it is forced to exit, e.g. by a SIG_INT. Hence, this federate * will exit immediately with an error condition, counting on the * termination functions to handle any cleanup needed. 
*/ -static void handle_rti_resign_message(void) { +static void handle_rti_failed_message(void) { exit(1); } @@ -1480,8 +1486,8 @@ static void* listen_to_rti_TCP(void* args) { case MSG_TYPE_PORT_ABSENT: handle_port_absent_message(&_fed.socket_TCP_RTI, -1); break; - case MSG_TYPE_RESIGN: - handle_rti_resign_message(); + case MSG_TYPE_FAILED: + handle_rti_failed_message(); break; case MSG_TYPE_CLOCK_SYNC_T1: case MSG_TYPE_CLOCK_SYNC_T4: @@ -1564,17 +1570,16 @@ void terminate_execution(environment_t* env) { assert(env != GLOBAL_ENVIRONMENT); // For an abnormal termination (e.g. a SIGINT), we need to send a - // MSG_TYPE_RESIGN message to the RTI, but we should not acquire a mutex. + // MSG_TYPE_FAILED message to the RTI, but we should not acquire a mutex. if (_fed.socket_TCP_RTI >= 0) { if (_lf_normal_termination) { - LF_MUTEX_LOCK(lf_outbound_socket_mutex); send_resign_signal(env); - LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); // Trace the event when tracing is enabled tracepoint_federate_to_rti(_fed.trace, send_RESIGN, _lf_my_fed_id, &env->current_tag); } else { // Do not acquire mutex and do not trace. - send_resign_signal(env); + send_failed_signal(env); + tracepoint_federate_to_rti(_fed.trace, send_FAILED, _lf_my_fed_id, &env->current_tag); } } @@ -1662,8 +1667,8 @@ void lf_connect_to_federate(uint16_t remote_federate_id) { if (buffer[0] != MSG_TYPE_ADDRESS_QUERY) { // Unexpected reply. Could be that RTI has failed and sent a resignation. - if (buffer[0] == MSG_TYPE_RESIGN) { - lf_print_error_and_exit("RTI has resigned."); + if (buffer[0] == MSG_TYPE_FAILED) { + lf_print_error_and_exit("RTI has failed."); } else { lf_print_error_and_exit("Unexpected reply of type %hhu from RTI (see net_common.h).", buffer[0]); } @@ -1745,7 +1750,7 @@ void lf_connect_to_federate(uint16_t remote_federate_id) { remote_federate_id, ADDRESS_QUERY_RETRY_INTERVAL); // Check whether the RTI is still there. 
- if (rti_resigned()) break; + if (rti_failed()) break; // Wait ADDRESS_QUERY_RETRY_INTERVAL nanoseconds. lf_sleep(ADDRESS_QUERY_RETRY_INTERVAL); @@ -2071,7 +2076,7 @@ void* lf_handle_p2p_connections_from_federates(void* env_arg) { if (socket_id < 0) { if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { - if (rti_resigned()) break; + if (rti_failed()) break; else continue; // Try again. } else if (errno == EPERM) { lf_print_error_system_failure("Firewall permissions prohibit connection."); diff --git a/include/core/federated/network/net_common.h b/include/core/federated/network/net_common.h index f428bcc93..9ea720fd7 100644 --- a/include/core/federated/network/net_common.h +++ b/include/core/federated/network/net_common.h @@ -377,20 +377,23 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #define MSG_TYPE_MESSAGE 3 -/** Byte identifying that the federate or the RTI is ending its execution. */ +/** + * Byte identifying that the federate or the RTI is ending its execution. + */ #define MSG_TYPE_RESIGN 4 -/** Byte identifying a timestamped message to forward to another federate. - * The next two bytes will be the ID of the destination reactor port. - * The next two bytes are the destination federate ID. - * The four bytes after that will be the length of the message. - * The next eight bytes will be the timestamp of the message. - * The next four bytes will be the microstep of the message. - * The remaining bytes are the message. +/** + * Byte identifying a timestamped message to forward to another federate. + * The next two bytes will be the ID of the destination reactor port. + * The next two bytes are the destination federate ID. + * The four bytes after that will be the length of the message. + * The next eight bytes will be the timestamp of the message. + * The next four bytes will be the microstep of the message. + * The remaining bytes are the message. 
* - * With centralized coordination, all such messages flow through the RTI. - * With decentralized coordination, tagged messages are sent peer-to-peer - * between federates and are marked with MSG_TYPE_P2P_TAGGED_MESSAGE. + * With centralized coordination, all such messages flow through the RTI. + * With decentralized coordination, tagged messages are sent peer-to-peer + * between federates and are marked with MSG_TYPE_P2P_TAGGED_MESSAGE. */ #define MSG_TYPE_TAGGED_MESSAGE 5 @@ -657,6 +660,11 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define MSG_TYPE_NEIGHBOR_STRUCTURE 24 #define MSG_TYPE_NEIGHBOR_STRUCTURE_HEADER_SIZE 9 +/** + * Byte identifying that the federate or the RTI has failed. + */ +#define MSG_TYPE_FAILED 25 + ///////////////////////////////////////////// //// Rejection codes diff --git a/include/core/trace.h b/include/core/trace.h index 5946e16dc..e8c69f3ab 100644 --- a/include/core/trace.h +++ b/include/core/trace.h @@ -79,6 +79,7 @@ typedef enum federated, // Everything above this is tracing federated interactions. // Sending messages send_ACK, + send_FAILED, send_TIMESTAMP, send_NET, send_LTC, @@ -100,6 +101,7 @@ typedef enum send_ADR_QR, // Receiving messages receive_ACK, + receive_FAILED, receive_TIMESTAMP, receive_NET, receive_LTC, From 37635350c8439d8b542eba03ccb1b9522e451c26 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Wed, 3 Jan 2024 07:49:25 -0800 Subject: [PATCH 52/83] Fixed tracing for FAILED message --- include/core/trace.h | 4 +++- util/tracing/visualization/fedsd.py | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/core/trace.h b/include/core/trace.h index e8c69f3ab..d88abc291 100644 --- a/include/core/trace.h +++ b/include/core/trace.h @@ -76,7 +76,7 @@ typedef enum worker_wait_ends, scheduler_advancing_time_starts, scheduler_advancing_time_ends, - federated, // Everything above this is tracing federated interactions. 
+ federated, // Everything below this is for tracing federated interactions. // Sending messages send_ACK, send_FAILED, @@ -144,6 +144,7 @@ static const char *trace_event_names[] = { "Federated marker", // Sending messages "Sending ACK", + "Sending FAILED", "Sending TIMESTAMP", "Sending NET", "Sending LTC", @@ -165,6 +166,7 @@ static const char *trace_event_names[] = { "Sending ADR_QR", // Receiving messages "Receiving ACK", + "Receiving FAILED", "Receiving TIMESTAMP", "Receiving NET", "Receiving LTC", diff --git a/util/tracing/visualization/fedsd.py b/util/tracing/visualization/fedsd.py index d9d44253b..7254d6b2c 100644 --- a/util/tracing/visualization/fedsd.py +++ b/util/tracing/visualization/fedsd.py @@ -41,6 +41,7 @@ # communication rendering prune_event_name = { "Sending ACK": "ACK", + "Sending FAILED": "FAILED", "Sending TIMESTAMP": "TIMESTAMP", "Sending NET": "NET", "Sending LTC": "LTC", @@ -61,6 +62,7 @@ "Sending ADR_AD": "ADR_AD", "Sending ADR_QR": "ADR_QR", "Receiving ACK": "ACK", + "Receiving FAILED": "FAILED", "Receiving TIMESTAMP": "TIMESTAMP", "Receiving NET": "NET", "Receiving LTC": "LTC", @@ -104,7 +106,7 @@ # Events matching at the sender and receiver ends depend on whether they are tagged # (the elapsed logical time and microstep have to be the same) or not. # Set of tagged events (messages) -non_tagged_messages = {'FED_ID', 'ACK', 'REJECT', 'ADR_RQ', 'ADR_AD', 'MSG', 'P2P_MSG'} +non_tagged_messages = {'FED_ID', 'ACK', 'FAILED', 'REJECT', 'ADR_RQ', 'ADR_AD', 'MSG', 'P2P_MSG'} ################################################################################ @@ -659,7 +661,7 @@ def get_and_convert_lft_files(rti_lft_file, federates_lft_files): # FIXME: Using microseconds is hardwired here. 
physical_time = f'{int(row["physical_time"]/1000):,}' - if (row['event'] in {'FED_ID', 'ACK', 'REJECT', 'ADR_RQ', 'ADR_AD', 'MSG', 'P2P_MSG'}): + if (row['event'] in {'FED_ID', 'ACK', 'FAILED', 'REJECT', 'ADR_RQ', 'ADR_AD', 'MSG', 'P2P_MSG'}): label = row['event'] else: label = row['event'] + '(' + f'{int(row["logical_time"]):,}' + ', ' + str(row['microstep']) + ')' From 67db29f0f86f7376843c17bfd4493d2f8a416b87 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Wed, 3 Jan 2024 14:43:02 -0800 Subject: [PATCH 53/83] Fixed tracing of RESIGN --- util/tracing/visualization/fedsd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/tracing/visualization/fedsd.py b/util/tracing/visualization/fedsd.py index 7254d6b2c..79e222d0a 100644 --- a/util/tracing/visualization/fedsd.py +++ b/util/tracing/visualization/fedsd.py @@ -106,7 +106,7 @@ # Events matching at the sender and receiver ends depend on whether they are tagged # (the elapsed logical time and microstep have to be the same) or not. # Set of tagged events (messages) -non_tagged_messages = {'FED_ID', 'ACK', 'FAILED', 'REJECT', 'ADR_RQ', 'ADR_AD', 'MSG', 'P2P_MSG'} +non_tagged_messages = {'FED_ID', 'ACK', 'RESIGN', 'FAILED', 'REJECT', 'ADR_RQ', 'ADR_AD', 'MSG', 'P2P_MSG'} ################################################################################ From 3d0e8deb736204556924dbd23da8e5f90ef6da86 Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Wed, 3 Jan 2024 14:45:07 -0800 Subject: [PATCH 54/83] make clean removes executables --- util/tracing/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/tracing/Makefile b/util/tracing/Makefile index b20292d00..15fd0c13e 100644 --- a/util/tracing/Makefile +++ b/util/tracing/Makefile @@ -37,4 +37,4 @@ install: trace_to_csv trace_to_chrome trace_to_influxdb chmod +x $(BIN_INSTALL_PATH)/fedsd clean: - rm -f *.o + rm -f *.o trace_to_chrome trace_to_influxdb trace_to_csv From a0e1c22421bf8b2da8626f20246c3a7c6d0ba280 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Fri, 5 Jan 2024 17:40:08 -0800 Subject: [PATCH 55/83] Tolerate incomplete message reads for decentralized --- core/federated/federate.c | 70 ++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 20 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 6330adfec..b4beb2e9a 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -467,18 +467,16 @@ static bool handle_message_now(environment_t* env, trigger_t* trigger, tag_t int * @param socket Pointer to the socket to read the message from. * @param buffer The buffer to read. * @param fed_id The sending federate ID or -1 if the centralized coordination. + * @return 0 for success, -1 for failure. */ -static void handle_message(int* socket, int fed_id) { +static int handle_message(int* socket, int fed_id) { // Read the header. size_t bytes_to_read = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t); unsigned char buffer[bytes_to_read]; if (read_from_socket_close_on_error(socket, bytes_to_read, buffer)) { // Read failed, which means the socket has been closed between reading the - // message ID byte and here. Issue a warning only. This is a physical - // connection, so likely the message is just late. If it's a serious failure, - // it should be caught in another thread. 
- lf_print_warning("Failed to read message header."); - return; + // message ID byte and here. + return -1; } // Extract the header information. @@ -497,7 +495,7 @@ static void handle_message(int* socket, int fed_id) { // Allocate memory for the message contents. unsigned char* message_contents = (unsigned char*)malloc(length); if (read_from_socket_close_on_error(socket, length, message_contents)) { - lf_print_warning("Failed to read message body."); + return -1; } // Trace the event when tracing is enabled tracepoint_federate_from_federate(_fed.trace, receive_P2P_MSG, _lf_my_fed_id, federate_id, NULL); @@ -505,6 +503,7 @@ static void handle_message(int* socket, int fed_id) { LF_PRINT_DEBUG("Calling schedule for message received on a physical connection."); _lf_schedule_value(action, 0, message_contents, length); + return 0; } /** @@ -522,8 +521,9 @@ static void handle_message(int* socket, int fed_id) { * @param socket Pointer to the socket to read the message from. * @param buffer The buffer to read. * @param fed_id The sending federate ID or -1 if the centralized coordination. + * @return 0 on successfully reading the message, -1 on failure (e.g. due to socket closed). */ -static void handle_tagged_message(int* socket, int fed_id) { +static int handle_tagged_message(int* socket, int fed_id) { // Environment is always the one corresponding to the top-level scheduling enclave. environment_t *env; _lf_get_environments(&env); @@ -533,8 +533,9 @@ static void handle_tagged_message(int* socket, int fed_id) { size_t bytes_to_read = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t) + sizeof(instant_t) + sizeof(microstep_t); unsigned char buffer[bytes_to_read]; - read_from_socket_fail_on_error(socket, bytes_to_read, buffer, NULL, - "Failed to read timed message header"); + if (read_from_socket_close_on_error(socket, bytes_to_read, buffer)) { + return -1; // Read failed. + } // Extract the header information. 
unsigned short port_id; @@ -581,8 +582,9 @@ static void handle_tagged_message(int* socket, int fed_id) { // Read the payload. // Allocate memory for the message contents. unsigned char* message_contents = (unsigned char*)malloc(length); - read_from_socket_fail_on_error(socket, length, message_contents, NULL, - "Failed to read message body."); + if (read_from_socket_close_on_error(socket, length, message_contents)) { + return -1; // Read failed. + } // The following is only valid for string messages. // LF_PRINT_DEBUG("Message received: %s.", message_contents); @@ -658,6 +660,8 @@ static void handle_tagged_message(int* socket, int fed_id) { // the need for unecessary lock and unlock // operations. LF_MUTEX_UNLOCK(env->mutex); + + return 0; } /** @@ -668,12 +672,14 @@ static void handle_tagged_message(int* socket, int fed_id) { * @param socket Pointer to the socket to read the message from * @param buffer The buffer to read * @param fed_id The sending federate ID or -1 if the centralized coordination. + * @return 0 for success, -1 for failure to complete the read. */ -static void handle_port_absent_message(int* socket, int fed_id) { +static int handle_port_absent_message(int* socket, int fed_id) { size_t bytes_to_read = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(instant_t) + sizeof(microstep_t); unsigned char buffer[bytes_to_read]; - read_from_socket_fail_on_error(socket, bytes_to_read, buffer, NULL, - "Failed to read port absent message."); + if (read_from_socket_close_on_error(socket, bytes_to_read, buffer)) { + return -1; + } // Extract the header information. 
unsigned short port_id = extract_uint16(buffer); @@ -701,6 +707,8 @@ static void handle_port_absent_message(int* socket, int fed_id) { LF_MUTEX_LOCK(env->mutex); update_last_known_status_on_input_port(env, intended_tag, port_id); LF_MUTEX_UNLOCK(env->mutex); + + return 0; } /** @@ -744,15 +752,31 @@ static void* listen_to_federates(void* _args) { switch (buffer[0]) { case MSG_TYPE_P2P_MESSAGE: LF_PRINT_LOG("Received untimed message from federate %d.", fed_id); - handle_message(socket_id, fed_id); + if (handle_message(socket_id, fed_id)) { + // Failed to complete the reading of a message on a physical connection. + lf_print_warning("Failed to complete reading of message on physical connection."); + return NULL; + } break; case MSG_TYPE_P2P_TAGGED_MESSAGE: LF_PRINT_LOG("Received timed message from federate %d.", fed_id); - handle_tagged_message(socket_id, fed_id); + if (handle_tagged_message(socket_id, fed_id)) { + // P2P tagged messages are only used in decentralized coordination, and + // it is not a fatal error if the socket is closed before the whole message is read. + // But this thread should exit. + lf_print_warning("Failed to complete reading of tagged message."); + return NULL; + } break; case MSG_TYPE_PORT_ABSENT: LF_PRINT_LOG("Received port absent message from federate %d.", fed_id); - handle_port_absent_message(socket_id, fed_id); + if (handle_port_absent_message(socket_id, fed_id)) { + // P2P tagged messages are only used in decentralized coordination, and + // it is not a fatal error if the socket is closed before the whole message is read. + // But this thread should exit. 
+ lf_print_warning("Failed to complete reading of tagged message."); + return NULL; + } break; default: bad_message = true; @@ -1469,7 +1493,10 @@ static void* listen_to_rti_TCP(void* args) { } switch (buffer[0]) { case MSG_TYPE_TAGGED_MESSAGE: - handle_tagged_message(&_fed.socket_TCP_RTI, -1); + if (handle_tagged_message(&_fed.socket_TCP_RTI, -1)) { + // Failures to complete the read of messages from the RTI are fatal. + lf_print_error_and_exit("Failed to complete the reading of a message from the RTI."); + } break; case MSG_TYPE_TAG_ADVANCE_GRANT: handle_tag_advance_grant(); @@ -1484,7 +1511,10 @@ static void* listen_to_rti_TCP(void* args) { handle_stop_granted_message(); break; case MSG_TYPE_PORT_ABSENT: - handle_port_absent_message(&_fed.socket_TCP_RTI, -1); + if (handle_port_absent_message(&_fed.socket_TCP_RTI, -1)) { + // Failures to complete the read of absent messages from the RTI are fatal. + lf_print_error_and_exit("Failed to complete the reading of an absent message from the RTI."); + } break; case MSG_TYPE_FAILED: handle_rti_failed_message(); From 78970c82ca6d7641606303413af2df4e2b2e98f5 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Sun, 7 Jan 2024 10:04:16 -0800 Subject: [PATCH 56/83] Formatting only --- core/federated/RTI/rti_remote.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 514b7aa69..7027db61d 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -206,8 +206,7 @@ static int create_rti_server(uint16_t port, socket_type_t socket_type) { return socket_descriptor; } -void notify_tag_advance_grant(scheduling_node_t *e, tag_t tag) -{ +void notify_tag_advance_grant(scheduling_node_t *e, tag_t tag) { if (e->state == NOT_CONNECTED || lf_tag_compare(tag, e->last_granted) <= 0 || lf_tag_compare(tag, e->last_provisionally_granted) < 0) { From 8ba8a7870e3e2b959f35ec59107cc71ad36ee4ce Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Sun, 7 Jan 2024 10:38:09 -0800 Subject: [PATCH 57/83] Removed noisy debug message --- core/tag.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/tag.c b/core/tag.c index eee10699f..3e43db47c 100644 --- a/core/tag.c +++ b/core/tag.c @@ -99,13 +99,14 @@ instant_t _lf_physical_time() { _lf_last_reported_physical_time_ns = adjusted_clock_ns; } + /* Possibly useful, but usually noisy: LF_PRINT_DEBUG("Physical time: " PRINTF_TIME ". Elapsed: " PRINTF_TIME ". Offset: " PRINTF_TIME, _lf_last_reported_physical_time_ns, _lf_last_reported_physical_time_ns - start_time, _lf_time_physical_clock_offset + _lf_time_test_physical_clock_offset); - + */ return _lf_last_reported_physical_time_ns; } From 48bccefc796761c58df706bbd8bfff8e272b4aad Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Tue, 9 Jan 2024 15:13:32 -0800 Subject: [PATCH 58/83] Exit RTI immediately if federate fails --- core/federated/RTI/rti_remote.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 7027db61d..e0bbd6eb6 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1042,7 +1042,6 @@ void *clock_synchronization_thread(void *noargs) { * Handle MSG_TYPE_FAILED sent by a federate. This message is sent by a federate * that is exiting in failure. In this case, the RTI will * also terminate abnormally, returning a non-zero exit code when it exits. - * But it does not immediately exit. It does close the socket connection to the federate. * * This function assumes the caller does not hold the mutex. 
* @@ -1057,7 +1056,7 @@ static void handle_federate_failed(federate_info_t *my_fed) { } _lf_federate_reports_error = true; - lf_print_warning("RTI: Federate %d reports an error and has exited.", my_fed->enclave.id); + lf_print_error_and_exit("RTI: Federate %d reports an error and has exited.", my_fed->enclave.id); my_fed->enclave.state = NOT_CONNECTED; From 0106dfeb0e551c7e98af3a4524cdab59daf80af6 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Tue, 9 Jan 2024 19:39:00 -0800 Subject: [PATCH 59/83] Improve the handling of tardy messages. With the decentralized controller, it is possible for messages with a tag g to arrive after the destination federate has advanced its current tag past g. Such messages are tardy. The messages will be handled at a later tag g' > g, including the invocation of any STP violation handlers. Previously, the record of the last known tag for a trigger was being set to g. It should be set to g', and it is now. This commit also improves the handling of deadlocks that can arise from the MLAA mechanism. Now they error out instead of just locking up. --- core/federated/federate.c | 154 ++++++++++++++++++++---------- include/core/federated/federate.h | 6 +- 2 files changed, 110 insertions(+), 50 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index b4beb2e9a..0b961fb04 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -187,10 +187,10 @@ extern size_t staa_lst_size; * @return A pointer to an action struct or null if the ID is out of range.
*/ static lf_action_base_t* action_for_port(int port_id) { - if (port_id < _lf_action_table_size) { + if (port_id >= 0 && port_id < _lf_action_table_size) { return _lf_action_table[port_id]; } - lf_print_error("Invalid port ID: %d", port_id); + lf_print_error_and_exit("Invalid port ID: %d", port_id); return NULL; } @@ -210,7 +210,7 @@ static void update_last_known_status_on_input_ports(tag_t tag) { LF_PRINT_DEBUG("In update_last_known_status_on_input ports."); bool notify = false; for (int i = 0; i < _lf_action_table_size; i++) { - lf_action_base_t* input_port_action = action_for_port(i); + lf_action_base_t* input_port_action = _lf_action_table[i]; // This is called when a TAG is received. // But it is possible for an input port to have received already // a message with a larger tag (if there is an after delay on the @@ -421,17 +421,16 @@ static void close_inbound_socket(int fed_id, int flag) { * at that tag. This returns true if the following conditions are all true: * * 1. the first reaction triggered has a level >= MLAA (a port is or will be blocked on this trigger); - * 2. the intended_tag is less than or equal to the current tag of the environment; + * 2. the intended_tag is equal to the current tag of the environment; * 3. the intended_tag is greater than the last_tag of the trigger; * 4. the intended_tag is greater than the last_known_status_tag of the trigger; * 5. the execution has started (the event queue has been examined); * 6. the trigger is not physical; * * The comparison against the MLAA (condition 1), if true, means that there is a blocking port - * waiting for this trigger (or possibly an earlier blocking port). For condition (2), if the - * intended tag is less than the current tag, then the message is tardy. A tardy message can - * unblock a port, although it will trigger an STP violation handler if one is defined or an - * error if not (or if centralized coordination is being used). 
The comparison against the + * waiting for this trigger (or possibly an earlier blocking port). For condition (2), tardy + * messages are not scheduled now (they are already late), so if a reaction is blocked on + * unknown status of this port, it will be unblocked with an absent. The comparison against the * last_tag of the trigger (condition 3) ensures that if the message is tardy but there is * already an earlier tardy message that has been handled (or is being handled), then we * don't try to handle two messages in the same tag, which is not allowed. For example, there @@ -444,7 +443,7 @@ static void close_inbound_socket(int fed_id, int flag) { * last_known_status_tag (condition 4) deals with messages arriving with identical intended * tags (which should not happen). This one will be handled late (one microstep later than * the current tag if 1 and 2 are true). - * + * * This function assumes the mutex is held on the environment. * * @param env The environment. @@ -453,7 +452,7 @@ static void close_inbound_socket(int fed_id, int flag) { */ static bool handle_message_now(environment_t* env, trigger_t* trigger, tag_t intended_tag) { return trigger->reactions[0]->index >= max_level_allowed_to_advance - && lf_tag_compare(intended_tag, lf_tag(env)) <= 0 + && lf_tag_compare(intended_tag, lf_tag(env)) == 0 && lf_tag_compare(intended_tag, trigger->last_tag) > 0 && lf_tag_compare(intended_tag, trigger->last_known_status_tag) > 0 && env->execution_started @@ -465,7 +464,6 @@ static bool handle_message_now(environment_t* env, trigger_t* trigger, tag_t int * * This function assumes the caller does not hold the mutex lock. * @param socket Pointer to the socket to read the message from. - * @param buffer The buffer to read. * @param fed_id The sending federate ID or -1 if the centralized coordination. * @return 0 for success, -1 for failure. 
*/ @@ -519,7 +517,6 @@ static int handle_message(int* socket, int fed_id) { * the tag will not advance at all if the tag of the message is * now or in the past. * @param socket Pointer to the socket to read the message from. - * @param buffer The buffer to read. * @param fed_id The sending federate ID or -1 if the centralized coordination. * @return 0 on successfully reading the message, -1 on failure (e.g. due to socket closed). */ @@ -575,7 +572,7 @@ static int handle_tagged_message(int* socket, int fed_id) { // If something happens, make sure to release the barrier. _lf_increment_tag_barrier(env, intended_tag); #endif - LF_PRINT_LOG("Received message on port %d with tag: " PRINTF_TAG ", Current tag: " PRINTF_TAG ".", + LF_PRINT_LOG("Received message on port %d with intended tag: " PRINTF_TAG ", Current tag: " PRINTF_TAG ".", port_id, intended_tag.time - start_time, intended_tag.microstep, lf_time_logical_elapsed(env), env->current_tag.microstep); @@ -583,6 +580,9 @@ static int handle_tagged_message(int* socket, int fed_id) { // Allocate memory for the message contents. unsigned char* message_contents = (unsigned char*)malloc(length); if (read_from_socket_close_on_error(socket, length, message_contents)) { +#ifdef FEDERATED_DECENTRALIZED + _lf_decrement_tag_barrier_locked(env); +#endif return -1; // Read failed. } @@ -623,18 +623,32 @@ static int handle_tagged_message(int* socket, int fed_id) { // that is because the network receiver reaction is now in the reaction queue // keeping the precedence order intact. set_network_port_status(port_id, present); - - // Port is now present. Therefore, notify the level advancer to proceed - lf_update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); - lf_cond_broadcast(&lf_port_status_changed); } else { // If no port absent reaction is waiting for this message, or if the intended - // tag is in the future, use schedule functions to process the message. 
- // Before that, if the current time >= stop time, discard the message. - // But only if the stop time is not equal to the start time! + // tag is in the future, or the message is tardy, use schedule functions to process the message. - update_last_known_status_on_input_port(env, intended_tag, port_id); + tag_t actual_tag = intended_tag; +#ifdef FEDERATED_DECENTRALIZED + // For tardy messages in decentralized coordination, we need to figure out what the actual tag will be. + // (Centralized coordination errors out with tardy messages). + if (lf_tag_compare(intended_tag, env->current_tag) <= 0) { + // Message is tardy. + actual_tag = env->current_tag; + actual_tag.microstep++; + // Check that this is greater than any previously scheduled event for this port. + trigger_t* input_port_action = action_for_port(port_id)->trigger; + if (lf_tag_compare(actual_tag, input_port_action->last_known_status_tag) <= 0) { + actual_tag = input_port_action->last_known_status_tag; + actual_tag.microstep++; + } + } + // The following will update the input_port_action->last_known_status_tag. + // For decentralized coordination, this is needed for the thread implementing STAA. + update_last_known_status_on_input_port(env, actual_tag, port_id); +#endif // FEDERATED_DECENTRALIZED + // If the current time >= stop time, discard the message. + // But only if the stop time is not equal to the start time! if (lf_tag_compare(env->current_tag, env->stop_tag) >= 0 && env->execution_started) { lf_print_error("Received message too late. Already at stop tag.\n" " Current tag is " PRINTF_TAG " and intended tag is " PRINTF_TAG ".\n" @@ -644,6 +658,8 @@ static int handle_tagged_message(int* socket, int fed_id) { // Close socket, reading any incoming data and discarding it. close_inbound_socket(fed_id, 1); } else { + // Need to use intended_tag here, not actual_tag, so that STP violations are detected. + // It will become actual_tag (that is when the reactions will be invoked). 
schedule_message_received_from_network_locked(env, action->trigger, intended_tag, message_token); } } @@ -670,7 +686,6 @@ static int handle_tagged_message(int* socket, int fed_id) { * in the message. * * @param socket Pointer to the socket to read the message from - * @param buffer The buffer to read * @param fed_id The sending federate ID or -1 if the centralized coordination. * @return 0 for success, -1 for failure to complete the read. */ @@ -738,12 +753,14 @@ static void* listen_to_federates(void* _args) { // Listen for messages from the federate. while (1) { + bool socket_closed = false; // Read one byte to get the message type. LF_PRINT_DEBUG("Waiting for a P2P message on socket %d.", *socket_id); if (read_from_socket_close_on_error(socket_id, 1, buffer)) { // Socket has been closed. lf_print("Socket from federate %d is closed.", fed_id); // Stop listening to this federate. + socket_closed = true; break; } LF_PRINT_DEBUG("Received a P2P message on socket %d of type %d.", @@ -755,17 +772,17 @@ static void* listen_to_federates(void* _args) { if (handle_message(socket_id, fed_id)) { // Failed to complete the reading of a message on a physical connection. lf_print_warning("Failed to complete reading of message on physical connection."); - return NULL; + socket_closed = true; } break; case MSG_TYPE_P2P_TAGGED_MESSAGE: - LF_PRINT_LOG("Received timed message from federate %d.", fed_id); + LF_PRINT_LOG("Received tagged message from federate %d.", fed_id); if (handle_tagged_message(socket_id, fed_id)) { // P2P tagged messages are only used in decentralized coordination, and // it is not a fatal error if the socket is closed before the whole message is read. // But this thread should exit. 
lf_print_warning("Failed to complete reading of tagged message."); - return NULL; + socket_closed = true; } break; case MSG_TYPE_PORT_ABSENT: @@ -775,18 +792,27 @@ static void* listen_to_federates(void* _args) { // it is not a fatal error if the socket is closed before the whole message is read. // But this thread should exit. lf_print_warning("Failed to complete reading of tagged message."); - return NULL; + socket_closed = true; } break; default: bad_message = true; } if (bad_message) { - // FIXME: Better error handling needed. lf_print_error("Received erroneous message type: %d. Closing the socket.", buffer[0]); // Trace the event when tracing is enabled tracepoint_federate_from_federate(_fed.trace, receive_UNIDENTIFIED, _lf_my_fed_id, fed_id, NULL); - break; + break; // while loop + } + if (socket_closed) { + // NOTE: For decentralized execution, once this socket is closed, we could + // update last known tags of all ports connected to the specified federate to FOREVER_TAG, + // which would eliminate the need to wait for STAA to assume an input is absent. + // However, at this time, we don't know which ports correspond to which upstream federates. + // The code generator would have to encode this information. Once that is done, + // we could call update_last_known_status_on_input_port with FOREVER_TAG. + + break; // while loop } } return NULL; @@ -1061,12 +1087,13 @@ static bool a_port_is_unknown(staa_t* staa_elem) { /** * @brief Return the port ID of the port associated with the given action. + * @return The port ID or -1 if there is no match. */ static int id_of_action(lf_action_base_t* input_port_action) { - for (int i = 0; 1; i++) { - if (action_for_port(i) == input_port_action) return i; + for (int i = 0; i < _lf_action_table_size; i++) { + if (_lf_action_table[i] == input_port_action) return i; } - // There will be no UB buffer overrun because action_for_port(i) has a check. 
+ return -1; } /** @@ -1085,7 +1112,7 @@ static void* update_ports_from_staa_offsets(void* args) { int num_envs = _lf_get_environments(&env); LF_MUTEX_LOCK(env->mutex); while (1) { - bool restart = false; + LF_PRINT_DEBUG("**** (update thread) starting"); tag_t tag_when_started_waiting = lf_tag(env); for (int i = 0; i < staa_lst_size; ++i) { staa_t* staa_elem = staa_lst[i]; @@ -1093,6 +1120,8 @@ static void* update_ports_from_staa_offsets(void* args) { // The list is sorted in increasing order of adjusted STAA offsets. // The wait_until function automatically adds the _lf_fed_STA_offset to the wait time. interval_t wait_until_time = env->current_tag.time + staa_elem->STAA; + LF_PRINT_DEBUG("**** (update thread) original wait_until_time: " PRINTF_TIME, wait_until_time - lf_time_start()); + // The wait_until call will release the env->mutex while it is waiting. // However, it will not release the env->mutex if the wait time is too small. // At the cost of a small additional delay in deciding a port is absent, @@ -1107,47 +1136,66 @@ static void* update_ports_from_staa_offsets(void* args) { wait_until_time += 5 * MIN_SLEEP_DURATION; } while (a_port_is_unknown(staa_elem)) { + LF_PRINT_DEBUG("**** (update thread) waiting until: " PRINTF_TIME, wait_until_time - lf_time_start()); if (wait_until(env, wait_until_time, &lf_port_status_changed)) { if (lf_tag_compare(lf_tag(env), tag_when_started_waiting) != 0) { - // Wait was not interrupted and we have committed to a new tag before we - // finished processing the list. Start over. - restart = true; break; } /* Possibly useful for debugging: tag_t current_tag = lf_tag(env); - lf_print("--------------------- FIXME: assuming absent! 
" PRINTF_TAG, current_tag.time - lf_time_start(), current_tag.microstep); - lf_print("--------------------- Lag is " PRINTF_TIME, current_tag.time - lf_time_physical()); - lf_print("--------------------- Wait until time is " PRINTF_TIME, wait_until_time - lf_time_start()); + LF_PRINT_DEBUG("**** (update thread) Assuming absent! " PRINTF_TAG, current_tag.time - lf_time_start(), current_tag.microstep); + LF_PRINT_DEBUG("**** (update thread) Lag is " PRINTF_TIME, current_tag.time - lf_time_physical()); + LF_PRINT_DEBUG("**** (update thread) Wait until time is " PRINTF_TIME, wait_until_time - lf_time_start()); */ - // Wait went to completion. Mark any ports with this STAA that remain unknown as absent. for (int j = 0; j < staa_elem->num_actions; ++j) { lf_action_base_t* input_port_action = staa_elem->actions[j]; if (input_port_action->trigger->status == unknown) { input_port_action->trigger->status = absent; - LF_PRINT_DEBUG("Assuming port absent at time " PRINTF_TIME, lf_tag(env).time - start_time); + LF_PRINT_DEBUG("**** (update thread) Assuming port absent at time " PRINTF_TIME, lf_tag(env).time - start_time); update_last_known_status_on_input_port(env, lf_tag(env), id_of_action(input_port_action)); lf_cond_broadcast(&lf_port_status_changed); } } - } else if (lf_tag_compare(lf_tag(env), tag_when_started_waiting) != 0) { - // Wait was interrupted and we have committed to a new tag before we - // finished processing the list. Start over. - restart = true; - break; } + // If the tag has advanced, start over. + if (lf_tag_compare(lf_tag(env), tag_when_started_waiting) != 0) break; } - if (restart) break; // No need to check the rest of the STAAs. + // If the tag has advanced, start over. + if (lf_tag_compare(lf_tag(env), tag_when_started_waiting) != 0) break; + } + // If the tag has advanced, start over. 
+ if (lf_tag_compare(lf_tag(env), tag_when_started_waiting) != 0) continue; + + // At this point, the current tag is the same as when we started waiting + // and all ports should be known, and hence max_level_allowed_to_advance + // should be INT_MAX. Check this to prevent an infinite wait. + if (max_level_allowed_to_advance != INT_MAX) { + // If this error occurs, then there is a mismatch between ports being known + // the max_level_allowed_to_advance. Perhaps max_level_allowed_to_advance is + // not being set when a port becomes known? + LF_MUTEX_UNLOCK(env->mutex); + lf_print_error_and_exit("**** (update thread) All port statuses are known at tag " PRINTF_TAG + ", but the MLAA of %d indicates otherwise!", + tag_when_started_waiting.time - start_time, + tag_when_started_waiting.microstep, + max_level_allowed_to_advance); } - if (restart) continue; // No need to wait for a new tag. // Wait until we progress to a new tag. while (lf_tag_compare(lf_tag(env), tag_when_started_waiting) == 0) { // The following will release the env->mutex while waiting. - lf_cond_wait(&lf_current_tag_changed); + LF_PRINT_DEBUG("**** (update thread) Waiting for tags to not match: " PRINTF_TAG ", " PRINTF_TAG, + lf_tag(env).time - lf_time_start(), lf_tag(env).microstep, + tag_when_started_waiting.time -lf_time_start(), tag_when_started_waiting.microstep); + // Ports are reset to unknown at the start of new tag, so that will wake this up. 
+ lf_cond_wait(&lf_port_status_changed); } + LF_PRINT_DEBUG("**** (update thread) Tags after wait: " PRINTF_TAG ", " PRINTF_TAG, + lf_tag(env).time - lf_time_start(), lf_tag(env).microstep, + tag_when_started_waiting.time -lf_time_start(), tag_when_started_waiting.microstep); } + LF_MUTEX_UNLOCK(env->mutex); } #endif // FEDERATED_DECENTRALIZED @@ -2259,11 +2307,19 @@ parse_rti_code_t lf_parse_rti_addr(const char* rti_addr) { } void lf_reset_status_fields_on_input_port_triggers() { + environment_t *env; + _lf_get_environments(&env); + tag_t now = lf_tag(env); for (int i = 0; i < _lf_action_table_size; i++) { - set_network_port_status(i, unknown); + if (lf_tag_compare(_lf_action_table[i]->trigger->last_known_status_tag, now) >= 0) { + set_network_port_status(i, absent); // Default may be overriden to become present. + } else { + set_network_port_status(i, unknown); + } } LF_PRINT_DEBUG("Resetting port status fields."); lf_update_max_level(_fed.last_TAG, _fed.is_last_TAG_provisional); + lf_cond_broadcast(&lf_port_status_changed); } int lf_send_message(int message_type, diff --git a/include/core/federated/federate.h b/include/core/federated/federate.h index be5e59609..e035d94c0 100644 --- a/include/core/federated/federate.h +++ b/include/core/federated/federate.h @@ -320,8 +320,12 @@ void lf_latest_tag_complete(tag_t); parse_rti_code_t lf_parse_rti_addr(const char* rti_addr); /** - * @brief Reset the status fields on network input ports to unknown. + * @brief Reset the status fields on network input ports to unknown or absent. * + * This will reset to absent if the last_known_status_tag field of the port + * is greater than or equal to the current tag of the top-level environment. + * This should be overriden to present if an event gets scheduled. + * Otherwise, set the status to unknown. * @note This function must be called at the beginning of each * logical time. */ From 6712510af3ad9409d564ad8cbff44f20b05539b8 Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Tue, 9 Jan 2024 23:01:27 -0800 Subject: [PATCH 60/83] Update last known status also for centralized coordination --- core/federated/federate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 0b961fb04..60abaaf41 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -642,10 +642,10 @@ static int handle_tagged_message(int* socket, int fed_id) { actual_tag.microstep++; } } +#endif // FEDERATED_DECENTRALIZED // The following will update the input_port_action->last_known_status_tag. // For decentralized coordination, this is needed for the thread implementing STAA. update_last_known_status_on_input_port(env, actual_tag, port_id); -#endif // FEDERATED_DECENTRALIZED // If the current time >= stop time, discard the message. // But only if the stop time is not equal to the start time! From 5de7c5e56f98c3aeb6358f585c925f1fd2e2b1f6 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Wed, 10 Jan 2024 08:09:11 -0800 Subject: [PATCH 61/83] Do not wait for tag to advance if MLAA is finite --- core/federated/federate.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 60abaaf41..0696bf3c0 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -1171,15 +1171,12 @@ static void* update_ports_from_staa_offsets(void* args) { // and all ports should be known, and hence max_level_allowed_to_advance // should be INT_MAX. Check this to prevent an infinite wait. if (max_level_allowed_to_advance != INT_MAX) { - // If this error occurs, then there is a mismatch between ports being known - // the max_level_allowed_to_advance. Perhaps max_level_allowed_to_advance is - // not being set when a port becomes known? 
- LF_MUTEX_UNLOCK(env->mutex); - lf_print_error_and_exit("**** (update thread) All port statuses are known at tag " PRINTF_TAG - ", but the MLAA of %d indicates otherwise!", - tag_when_started_waiting.time - start_time, - tag_when_started_waiting.microstep, - max_level_allowed_to_advance); + // If this occurs, then the current tag advanced during a wait. + // Some ports may have been reset to uknown during that wait, in which case, + // it would be huge mistake to enter the wait for a new tag below because the + // program will freeze. Hence, we start over rather than wait for the current + // tag to advance. + continue; } // Wait until we progress to a new tag. From a8fa18fcbe19884f9304157db3bf37a240616940 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Wed, 10 Jan 2024 09:22:31 -0800 Subject: [PATCH 62/83] Added includes (why weren't these needed before?) --- core/federated/federate.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/federated/federate.c b/core/federated/federate.c index 0696bf3c0..a64100238 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -19,6 +19,8 @@ #include // Defines struct sockaddr_in #include #include // Defines read(), write(), and close() +#include // Defines memset(), strnlen(), strncmp(), strncpy() +#include // Defines strerror() #include #include // Defined perror(), errno From 9da509469054a0bbfe22f9a2b97577be95bedba7 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Wed, 10 Jan 2024 10:35:12 -0800 Subject: [PATCH 63/83] Check that ports are in fact unknown before looping --- core/federated/federate.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index a64100238..df776161f 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -1176,8 +1176,24 @@ static void* update_ports_from_staa_offsets(void* args) { // If this occurs, then the current tag advanced during a wait. 
// Some ports may have been reset to uknown during that wait, in which case, // it would be huge mistake to enter the wait for a new tag below because the - // program will freeze. Hence, we start over rather than wait for the current - // tag to advance. + // program will freeze. First, check whether any ports are unknown: + bool port_unkonwn = false; + for (int i = 0; i < staa_lst_size; ++i) { + staa_t* staa_elem = staa_lst[i]; + if (a_port_is_unknown(staa_elem)) { + port_unkonwn = true; + break; + } + } + if (!port_unkonwn) { + // If this occurs, then there is a race condition that can lead to deadlocks. + lf_print_error_and_exit("**** (update thread) Inconsistency: All ports are known, but MLAA is blocking."); + } + + // Since max_level_allowed_to_advance will block advancement of time, we cannot follow + // through to the next step without deadlocking. Wait some time, then continue. + // The wait is necessary to prevent a busy wait. + lf_sleep(2 * MIN_SLEEP_DURATION); continue; } From a961d9cada57c336bb36f537fe469f738d7a1422 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Thu, 11 Jan 2024 07:31:18 -0800 Subject: [PATCH 64/83] Fixed use of write_to_socket --- core/federated/clock-sync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/federated/clock-sync.c b/core/federated/clock-sync.c index f718ace4e..eea5e753c 100644 --- a/core/federated/clock-sync.c +++ b/core/federated/clock-sync.c @@ -277,7 +277,7 @@ int handle_T1_clock_sync_message(unsigned char* buffer, int socket, instant_t t2 // Write the reply to the socket. 
LF_PRINT_DEBUG("Sending T3 message to RTI."); - if (write_to_socket(socket, 1 + sizeof(int), reply_buffer) != 1 + sizeof(int)) { + if (write_to_socket(socket, 1 + sizeof(int), reply_buffer)) { lf_print_error("Clock sync: Failed to send T3 message to RTI."); return -1; } From 9a797bba35235dd73944140668fd1c42ff3fabc1 Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Fri, 12 Jan 2024 18:56:53 -0800 Subject: [PATCH 65/83] Fix deadlock caused by STP violation --- core/reactor_common.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/core/reactor_common.c b/core/reactor_common.c index dcc951369..7a151ec99 100644 --- a/core/reactor_common.c +++ b/core/reactor_common.c @@ -401,7 +401,9 @@ void _lf_pop_events(environment_t *env) { // the MLAA could get stuck, causing the program to lock up. // This should not call update_last_known_status_on_input_port because we // are starting a new tag step execution, so there are no reactions blocked on this input. - event->trigger->last_known_status_tag = env->current_tag; + if (lf_tag_compare(env->current_tag, event->trigger->last_known_status_tag) > 0) { + event->trigger->last_known_status_tag = env->current_tag; + } } } #endif From 5e1d9bb9ae3b0ab2b58ce01616bd8915835b2edf Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Sat, 13 Jan 2024 13:50:53 -0800 Subject: [PATCH 66/83] Removed outdated comments --- core/federated/RTI/rti_remote.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index e0bbd6eb6..b1c1ce9b0 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -233,7 +233,6 @@ void notify_tag_advance_grant(scheduling_node_t *e, tag_t tag) { if (write_to_socket(((federate_info_t *)e)->socket, message_length, buffer)) { lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); e->state = NOT_CONNECTED; - // FIXME: We need better error handling, but don't stop other execution here. 
} else { e->last_granted = tag; LF_PRINT_LOG("RTI sent to federate %d the tag advance grant (TAG) " PRINTF_TAG ".", @@ -268,7 +267,6 @@ void notify_provisional_tag_advance_grant(scheduling_node_t *e, tag_t tag) { if (write_to_socket(((federate_info_t *)e)->socket, message_length, buffer)) { lf_print_error("RTI failed to send tag advance grant to federate %d.", e->id); e->state = NOT_CONNECTED; - // FIXME: We need better error handling, but don't stop other execution here. } else { e->last_provisionally_granted = tag; LF_PRINT_LOG("RTI sent to federate %d the Provisional Tag Advance Grant (PTAG) " PRINTF_TAG ".", From f6e090d1e371deccb9ae8c42fb3adb3027c5220d Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Sun, 14 Jan 2024 09:57:55 -0800 Subject: [PATCH 67/83] Update core/federated/RTI/main.c Co-authored-by: Peter Donovan <33707478+petervdonovan@users.noreply.github.com> --- core/federated/RTI/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/federated/RTI/main.c b/core/federated/RTI/main.c index 89fe8d62b..f727c9ea1 100644 --- a/core/federated/RTI/main.c +++ b/core/federated/RTI/main.c @@ -82,7 +82,7 @@ static void send_failed_signal(federate_info_t* fed) { if (failed == 0) { LF_PRINT_LOG("RTI has sent failed signal to federate %d due to abnormal termination.", fed->enclave.id); } else { - LF_PRINT_LOG("RTI failed to send failed signal to federate %d on socket ID %d.", fed->enclave.id, fed->socket); + lf_print_error("RTI failed to send failed signal to federate %d on socket ID %d.", fed->enclave.id, fed->socket); } if (rti.base.tracing_enabled) { tracepoint_rti_to_federate(rti.base.trace, send_FAILED, fed->enclave.id, NULL); From f6e685e39ffdb9cb0c406245f1fecaa53b8b96dd Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Sun, 14 Jan 2024 10:33:52 -0800 Subject: [PATCH 68/83] Update core/federated/RTI/rti_common.c Co-authored-by: Peter Donovan <33707478+petervdonovan@users.noreply.github.com> --- core/federated/RTI/rti_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/federated/RTI/rti_common.c b/core/federated/RTI/rti_common.c index afc48526e..45cd8416f 100644 --- a/core/federated/RTI/rti_common.c +++ b/core/federated/RTI/rti_common.c @@ -193,7 +193,7 @@ tag_advance_grant_t tag_advance_grant_if_safe(scheduling_node_t* e) { // or federate (which includes any after delays on the connections). tag_t t_d = earliest_future_incoming_message_tag(e); // Strict version of the above. This is a tag that must be strictly greater than - // that of any granted PTAG. + // that of the next granted PTAG. tag_t t_d_strict = eimt_strict(e); LF_PRINT_LOG("RTI: Earliest next event upstream of node %d has tag " PRINTF_TAG ".", From edfde091499a4e2a0a513456ae7f4d9a6438dfa4 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Sun, 14 Jan 2024 10:21:25 -0800 Subject: [PATCH 69/83] Absorb delay functionality into lf_tag_add() --- core/federated/RTI/rti_common.c | 10 +++++----- core/tag.c | 1 + include/core/tag.h | 5 +++++ 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/core/federated/RTI/rti_common.c b/core/federated/RTI/rti_common.c index 45cd8416f..362dd5fa6 100644 --- a/core/federated/RTI/rti_common.c +++ b/core/federated/RTI/rti_common.c @@ -103,15 +103,15 @@ tag_t earliest_future_incoming_message_tag(scheduling_node_t* e) { // one connection. No delay at all is represented by (0,0). A delay of 0 is represented // by (0,1). If the time part of the delay is greater than 0, then we want to ignore // the microstep in upstream->next_event because that microstep will have been lost. - // Otherwise, we want preserve it and add to it. 
- tag_t next_event = upstream->next_event; - if (e->min_delays[i].min_delay.time > 0) next_event.microstep = 0; - tag_t earliest_tag_from_upstream = lf_tag_add(next_event, e->min_delays[i].min_delay); - /* Following debug message is too verbose for normal use: */ + // Otherwise, we want preserve it and add to it. This is handled by lf_tag_add(). + tag_t earliest_tag_from_upstream = lf_tag_add(upstream->next_event, e->min_delays[i].min_delay); + + /* Following debug message is too verbose for normal use: LF_PRINT_DEBUG("RTI: Earliest next event upstream of fed/encl %d at fed/encl %d has tag " PRINTF_TAG ".", e->id, upstream->id, earliest_tag_from_upstream.time - start_time, earliest_tag_from_upstream.microstep); + */ if (lf_tag_compare(earliest_tag_from_upstream, t_d) < 0) { t_d = earliest_tag_from_upstream; } diff --git a/core/tag.c b/core/tag.c index 3e43db47c..c632476c0 100644 --- a/core/tag.c +++ b/core/tag.c @@ -120,6 +120,7 @@ tag_t lf_tag(void *env) { tag_t lf_tag_add(tag_t a, tag_t b) { if (a.time == NEVER || b.time == NEVER) return NEVER_TAG; if (a.time == FOREVER || b.time == FOREVER) return FOREVER_TAG; + if (b.time > 0) a.microstep = 0; // Ignore microstep of first arg if time of second is > 0. tag_t result = {.time = a.time + b.time, .microstep = a.microstep + b.microstep}; if (result.microstep < a.microstep) return FOREVER_TAG; if (result.time < a.time && b.time > 0) return FOREVER_TAG; diff --git a/include/core/tag.h b/include/core/tag.h index 0a480a40b..e38ea7de5 100644 --- a/include/core/tag.h +++ b/include/core/tag.h @@ -88,6 +88,11 @@ tag_t lf_tag(void* env); * return NEVER_TAG or FOREVER_TAG, respectively. Also return NEVER_TAG or FOREVER_TAG * if the result underflows or overflows when adding the times. * If the microstep overflows, also return FOREVER_TAG. + * If the time field of the second tag is greater than 0, then the microstep of the first tag + * is reset to 0 before adding. 
This models the delay semantics in LF and makes this + * addition operation non-commutative. + * @param a The first tag. + * @param b The second tag. */ tag_t lf_tag_add(tag_t a, tag_t b); From 161f00a7e84df6f9d69bc7bbb2105402a6d49127 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Sun, 14 Jan 2024 10:39:48 -0800 Subject: [PATCH 70/83] Clarify comments for eimt_strict() --- core/federated/RTI/rti_common.c | 2 +- core/federated/RTI/rti_remote.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/core/federated/RTI/rti_common.c b/core/federated/RTI/rti_common.c index 362dd5fa6..33049db50 100644 --- a/core/federated/RTI/rti_common.c +++ b/core/federated/RTI/rti_common.c @@ -192,7 +192,7 @@ tag_advance_grant_t tag_advance_grant_if_safe(scheduling_node_t* e) { // Find the tag of the earliest event that may be later received from an upstream enclave // or federate (which includes any after delays on the connections). tag_t t_d = earliest_future_incoming_message_tag(e); - // Strict version of the above. This is a tag that must be strictly greater than + // Non-ZDC version of the above. This is a tag that must be strictly greater than // that of the next granted PTAG. tag_t t_d_strict = eimt_strict(e); diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index b1c1ce9b0..c09cf4624 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -289,7 +289,7 @@ void notify_provisional_tag_advance_grant(scheduling_node_t *e, tag_t tag) { continue; tag_t earliest = earliest_future_incoming_message_tag(upstream); - tag_t strict_earliest = eimt_strict(upstream); + tag_t strict_earliest = eimt_strict(upstream); // Non-ZDC version. // If these tags are equal, then a TAG or PTAG should have already been granted, // in which case, another will not be sent. But it may not have been already granted. From fd05adaca420e6803d3556a5490271659914cec3 Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Sun, 14 Jan 2024 10:42:07 -0800 Subject: [PATCH 71/83] Print error on failure to write trace file --- core/trace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/core/trace.c b/core/trace.c index e45cbe909..34b7cd5d2 100644 --- a/core/trace.c +++ b/core/trace.c @@ -195,7 +195,10 @@ void flush_trace_locked(trace_t* trace, int worker) { // This is deferred to here so that user trace objects can be // registered in startup reactions. if (!trace->_lf_trace_header_written) { - if (write_trace_header(trace) < 0) return; + if (write_trace_header(trace) < 0) { + lf_print_error("Failed to write trace header. Trace file will be incomplete."); + return; + } trace->_lf_trace_header_written = true; } From f4ab3d88cad18f16559f05d01fe7f7ae74bf6667 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Sun, 14 Jan 2024 11:20:58 -0800 Subject: [PATCH 72/83] Comment only --- core/reactor_common.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/core/reactor_common.c b/core/reactor_common.c index 7a151ec99..83a9532a9 100644 --- a/core/reactor_common.c +++ b/core/reactor_common.c @@ -1744,7 +1744,10 @@ void initialize_global(void) { _lf_initialize_trigger_objects() ; } -/** Flag to prevent termination function from executing twice. */ +/** + * Flag to prevent termination function from executing twice and to signal to background + * threads to terminate. + */ bool _lf_termination_executed = false; /** Flag used to disable cleanup operations on normal termination. */ From e1783f11481c466c470cd6aee1c5ec155e38cc80 Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Sun, 14 Jan 2024 16:29:39 -0800 Subject: [PATCH 73/83] Update core/federated/RTI/rti_remote.c Co-authored-by: Peter Donovan <33707478+petervdonovan@users.noreply.github.com> --- core/federated/RTI/rti_remote.c | 1 + 1 file changed, 1 insertion(+) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index c09cf4624..8a912d3c0 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1364,6 +1364,7 @@ static int32_t receive_and_check_fed_id_message(int *socket_id, struct sockaddr_ /** * Listen for a MSG_TYPE_NEIGHBOR_STRUCTURE message, and upon receiving it, fill * out the relevant information in the federate's struct. + * @return 1 on success and 0 on failure. */ static int receive_connection_information(int *socket_id, uint16_t fed_id) { LF_PRINT_DEBUG("RTI waiting for MSG_TYPE_NEIGHBOR_STRUCTURE from federate %d.", fed_id); From 4b7c9409e98e45122598357cc220ccd7f7492de1 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Sun, 14 Jan 2024 15:51:39 -0800 Subject: [PATCH 74/83] Comment only --- core/federated/federate.c | 1 - 1 file changed, 1 deletion(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index df776161f..28fb1aa5c 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -1670,7 +1670,6 @@ void terminate_execution(environment_t* env) { // Trace the event when tracing is enabled tracepoint_federate_to_rti(_fed.trace, send_RESIGN, _lf_my_fed_id, &env->current_tag); } else { - // Do not acquire mutex and do not trace. send_failed_signal(env); tracepoint_federate_to_rti(_fed.trace, send_FAILED, _lf_my_fed_id, &env->current_tag); } From 5ff00a2a8472c1995f593bd4676afba78730eb1e Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Sun, 14 Jan 2024 15:52:42 -0800 Subject: [PATCH 75/83] Comment only --- core/reactor_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/reactor_common.c b/core/reactor_common.c index 83a9532a9..4dd1f2a9e 100644 --- a/core/reactor_common.c +++ b/core/reactor_common.c @@ -1750,7 +1750,7 @@ void initialize_global(void) { */ bool _lf_termination_executed = false; -/** Flag used to disable cleanup operations on normal termination. */ +/** Flag used to disable cleanup operations on abnormal termination. */ bool _lf_normal_termination = false; /** From 193bd66ac6602fa45ef9679080998d29f203c7f3 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Sun, 14 Jan 2024 16:00:55 -0800 Subject: [PATCH 76/83] Move freeing of local RTI to termination function --- core/reactor_common.c | 3 +++ core/threaded/reactor_threaded.c | 6 ------ 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/core/reactor_common.c b/core/reactor_common.c index 4dd1f2a9e..938202909 100644 --- a/core/reactor_common.c +++ b/core/reactor_common.c @@ -1839,5 +1839,8 @@ void termination(void) { for (int i = 0; i < num_envs; i++) { environment_free(&env[i]); } +#if defined LF_ENCLAVES + free_local_rti(); +#endif } } diff --git a/core/threaded/reactor_threaded.c b/core/threaded/reactor_threaded.c index c7f1ca362..b24d70390 100644 --- a/core/threaded/reactor_threaded.c +++ b/core/threaded/reactor_threaded.c @@ -1234,12 +1234,6 @@ int lf_reactor_c_main(int argc, const char* argv[]) { } } _lf_normal_termination = true; - // Invoke termination function here before freeing the local RTI. - termination(); - -#if defined LF_ENCLAVES - free_local_rti(); -#endif return 0; } From 7f84a33555d810c3911d88deeef51ebcb4ceee56 Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Sun, 14 Jan 2024 16:28:15 -0800 Subject: [PATCH 77/83] Don't exit immediately on federate failure --- core/federated/RTI/rti_remote.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index 8a912d3c0..d1423197a 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1053,8 +1053,9 @@ static void handle_federate_failed(federate_info_t *my_fed) { tracepoint_rti_from_federate(rti_remote->base.trace, receive_FAILED, my_fed->enclave.id, NULL); } + // Set the flag telling the RTI to exit with an error code when it exits. _lf_federate_reports_error = true; - lf_print_error_and_exit("RTI: Federate %d reports an error and has exited.", my_fed->enclave.id); + lf_print_error("RTI: Federate %d reports an error and has exited.", my_fed->enclave.id); my_fed->enclave.state = NOT_CONNECTED; From 753d79cc3db77bb169f932ca86a99bab1d14c15e Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Sun, 14 Jan 2024 16:49:10 -0800 Subject: [PATCH 78/83] Clean up error handling in receive_and_check_fed_id_message --- core/federated/RTI/rti_remote.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index d1423197a..f86301d32 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -1229,8 +1229,7 @@ void send_reject(int *socket_id, unsigned char error_code) { * Listen for a MSG_TYPE_FED_IDS message, which includes as a payload * a federate ID and a federation ID. If the federation ID * matches this federation, send an MSG_TYPE_ACK and otherwise send - * a MSG_TYPE_REJECT message. Return 1 if the federate is accepted to - * the federation and 0 otherwise. + * a MSG_TYPE_REJECT message. * @param socket_id Pointer to the socket on which to listen. * @param client_fd The socket address. 
* @return The federate ID for success or -1 for failure. @@ -1241,9 +1240,10 @@ static int32_t receive_and_check_fed_id_message(int *socket_id, struct sockaddr_ unsigned char buffer[length]; // Read bytes from the socket. We need 4 bytes. - // FIXME: This should not exit with error but rather should just reject the connection. - read_from_socket_fail_on_error(socket_id, length, buffer, NULL, - "RTI failed to read from accepted socket."); + if (read_from_socket_close_on_error(socket_id, length, buffer)) { + lf_print_error("RTI failed to read from accepted socket."); + return -1; + } uint16_t fed_id = rti_remote->base.number_of_scheduling_nodes; // Initialize to an invalid value. @@ -1275,10 +1275,11 @@ static int32_t receive_and_check_fed_id_message(int *socket_id, struct sockaddr_ size_t federation_id_length = (size_t)buffer[sizeof(uint16_t) + 1]; char federation_id_received[federation_id_length + 1]; // One extra for null terminator. // Next read the actual federation ID. - // FIXME: This should not exit on error, but rather just reject the connection. - read_from_socket_fail_on_error(socket_id, federation_id_length, - (unsigned char *)federation_id_received, NULL, - "RTI failed to read federation id from federate %d.", fed_id); + if (read_from_socket_close_on_error(socket_id, federation_id_length, + (unsigned char *)federation_id_received)) { + lf_print_error("RTI failed to read federation id from federate %d.", fed_id); + return -1; + } // Terminate the string with a null. federation_id_received[federation_id_length] = 0; @@ -1291,9 +1292,9 @@ static int32_t receive_and_check_fed_id_message(int *socket_id, struct sockaddr_ // Compare the received federation ID to mine. if (strncmp(rti_remote->federation_id, federation_id_received, federation_id_length) != 0) { // Federation IDs do not match. Send back a MSG_TYPE_REJECT message. 
- lf_print_error("WARNING: Federate from another federation %s attempted to connect to RTI in federation %s.\n", - federation_id_received, - rti_remote->federation_id); + lf_print_warning("Federate from another federation %s attempted to connect to RTI in federation %s.", + federation_id_received, + rti_remote->federation_id); if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); } @@ -1353,8 +1354,11 @@ static int32_t receive_and_check_fed_id_message(int *socket_id, struct sockaddr_ tracepoint_rti_to_federate(rti_remote->base.trace, send_ACK, fed_id, NULL); } LF_MUTEX_LOCK(rti_mutex); - write_to_socket_fail_on_error(&fed->socket, 1, &ack_message, &rti_mutex, - "RTI failed to write MSG_TYPE_ACK message to federate %d.", fed_id); + if (write_to_socket_close_on_error(&fed->socket, 1, &ack_message)) { + LF_MUTEX_UNLOCK(rti_mutex); + lf_print_error("RTI failed to write MSG_TYPE_ACK message to federate %d.", fed_id); + return -1; + } LF_MUTEX_UNLOCK(rti_mutex); LF_PRINT_DEBUG("RTI sent MSG_TYPE_ACK to federate %d.", fed_id); From e44d2840b3a8cedbf4f455509e87802ab60248d3 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Mon, 15 Jan 2024 12:55:08 -0800 Subject: [PATCH 79/83] Do not overwrite NET with message tag unless less --- core/federated/RTI/rti_remote.c | 56 ++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/core/federated/RTI/rti_remote.c b/core/federated/RTI/rti_remote.c index f86301d32..2a71642b4 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -453,31 +453,6 @@ void handle_timed_message(federate_info_t *sending_federate, unsigned char *buff federate_id, length); - // Record this in-transit message in federate's in-transit message queue. - if (lf_tag_compare(fed->enclave.completed, intended_tag) < 0) { - // Add a record of this message to the list of in-transit messages to this federate. 
- pqueue_tag_insert_if_no_match( - fed->in_transit_message_tags, - intended_tag); - LF_PRINT_DEBUG( - "RTI: Adding a message with tag " PRINTF_TAG " to the list of in-transit messages for federate %d.", - intended_tag.time - lf_time_start(), - intended_tag.microstep, - federate_id); - } else { - lf_print_error( - "RTI: Federate %d has already completed tag " PRINTF_TAG - ", but there is an in-transit message with tag " PRINTF_TAG " from federate %hu. " - "This is going to cause an STP violation under centralized coordination.", - federate_id, - fed->enclave.completed.time - lf_time_start(), - fed->enclave.completed.microstep, - intended_tag.time - lf_time_start(), - intended_tag.microstep, - sending_federate->enclave.id); - // FIXME: Drop the federate? - } - // Need to make sure that the destination federate's thread has already // sent the starting MSG_TYPE_TIMESTAMP message. while (fed->enclave.state == PENDING) { @@ -513,7 +488,36 @@ void handle_timed_message(federate_info_t *sending_federate, unsigned char *buff "RTI failed to send message chunks."); } - update_federate_next_event_tag_locked(federate_id, intended_tag); + // Record this in-transit message in federate's in-transit message queue. + if (lf_tag_compare(fed->enclave.completed, intended_tag) < 0) { + // Add a record of this message to the list of in-transit messages to this federate. + pqueue_tag_insert_if_no_match( + fed->in_transit_message_tags, + intended_tag); + LF_PRINT_DEBUG( + "RTI: Adding a message with tag " PRINTF_TAG " to the list of in-transit messages for federate %d.", + intended_tag.time - lf_time_start(), + intended_tag.microstep, + federate_id); + } else { + lf_print_error( + "RTI: Federate %d has already completed tag " PRINTF_TAG + ", but there is an in-transit message with tag " PRINTF_TAG " from federate %hu. 
" + "This is going to cause an STP violation under centralized coordination.", + federate_id, + fed->enclave.completed.time - lf_time_start(), + fed->enclave.completed.microstep, + intended_tag.time - lf_time_start(), + intended_tag.microstep, + sending_federate->enclave.id); + // FIXME: Drop the federate? + } + + // If the message tag is less than the most recently received NET from the federate, + // then update the federate's next event tag to match the message tag. + if (lf_tag_compare(intended_tag, fed->enclave.next_event) < 0) { + update_federate_next_event_tag_locked(federate_id, intended_tag); + } LF_MUTEX_UNLOCK(rti_mutex); } From c37968d22ea4552f503287ca7fe1ac553c794c91 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Thu, 18 Jan 2024 08:34:47 -0800 Subject: [PATCH 80/83] Comments only --- core/federated/federate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 28fb1aa5c..f69c9ddb4 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -396,7 +396,7 @@ static trigger_handle_t schedule_message_received_from_network_locked( * in which case, flag should be 1. * * @param fed_id The ID of the peer federate sending messages to this - * federate, or -1 if the RTI. + * federate. * @param flag 0 if an EOF was received, -1 if a socket error occurred, 1 otherwise. */ static void close_inbound_socket(int fed_id, int flag) { From 30601a85322a41997dbca3702e3c4275a183b3f9 Mon Sep 17 00:00:00 2001 From: "Edward A. 
Lee" Date: Sat, 20 Jan 2024 14:54:16 -0800 Subject: [PATCH 81/83] Comments only --- core/federated/federate.c | 1 - 1 file changed, 1 deletion(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index f69c9ddb4..378dc1238 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -527,7 +527,6 @@ static int handle_tagged_message(int* socket, int fed_id) { environment_t *env; _lf_get_environments(&env); - // FIXME: Need better error handling? // Read the header which contains the timestamp. size_t bytes_to_read = sizeof(uint16_t) + sizeof(uint16_t) + sizeof(int32_t) + sizeof(instant_t) + sizeof(microstep_t); From ea398c78e3545c8649cb79abc12d96a3e614ef1d Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Sun, 21 Jan 2024 09:45:13 -0800 Subject: [PATCH 82/83] Trace before write and after read --- core/federated/RTI/main.c | 6 ++-- core/federated/RTI/rti_remote.c | 21 +++++++------- core/federated/federate.c | 51 +++++++++++++++++---------------- 3 files changed, 39 insertions(+), 39 deletions(-) diff --git a/core/federated/RTI/main.c b/core/federated/RTI/main.c index f727c9ea1..700304aea 100644 --- a/core/federated/RTI/main.c +++ b/core/federated/RTI/main.c @@ -78,15 +78,15 @@ static void send_failed_signal(federate_info_t* fed) { size_t bytes_to_write = 1; unsigned char buffer[bytes_to_write]; buffer[0] = MSG_TYPE_FAILED; + if (rti.base.tracing_enabled) { + tracepoint_rti_to_federate(rti.base.trace, send_FAILED, fed->enclave.id, NULL); + } int failed = write_to_socket(fed->socket, bytes_to_write, &(buffer[0])); if (failed == 0) { LF_PRINT_LOG("RTI has sent failed signal to federate %d due to abnormal termination.", fed->enclave.id); } else { lf_print_error("RTI failed to send failed signal to federate %d on socket ID %d.", fed->enclave.id, fed->socket); } - if (rti.base.tracing_enabled) { - tracepoint_rti_to_federate(rti.base.trace, send_FAILED, fed->enclave.id, NULL); - } } /** diff --git a/core/federated/RTI/rti_remote.c 
b/core/federated/RTI/rti_remote.c index 2a71642b4..373a109df 100644 --- a/core/federated/RTI/rti_remote.c +++ b/core/federated/RTI/rti_remote.c @@ -365,15 +365,15 @@ void handle_port_absent_message(federate_info_t *sending_federate, unsigned char lf_cond_wait(&sent_start_time); } + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(rti_remote->base.trace, send_PORT_ABS, federate_id, &tag); + } + // Forward the message. write_to_socket_fail_on_error(&fed->socket, message_size + 1, buffer, &rti_mutex, "RTI failed to forward message to federate %d.", federate_id); LF_MUTEX_UNLOCK(rti_mutex); - - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_PORT_ABS, federate_id, &tag); - } } void handle_timed_message(federate_info_t *sending_federate, unsigned char *buffer) { @@ -725,11 +725,11 @@ void handle_stop_request_message(federate_info_t *fed) { mark_federate_requesting_stop(f); continue; } - write_to_socket_fail_on_error(&f->socket, MSG_TYPE_STOP_REQUEST_LENGTH, stop_request_buffer, &rti_mutex, - "RTI failed to forward MSG_TYPE_STOP_REQUEST message to federate %d.", f->enclave.id); if (rti_remote->base.tracing_enabled) { tracepoint_rti_to_federate(rti_remote->base.trace, send_STOP_REQ, f->enclave.id, &rti_remote->base.max_stop_tag); } + write_to_socket_fail_on_error(&f->socket, MSG_TYPE_STOP_REQUEST_LENGTH, stop_request_buffer, &rti_mutex, + "RTI failed to forward MSG_TYPE_STOP_REQUEST message to federate %d.", f->enclave.id); } } LF_PRINT_LOG("RTI forwarded to federates MSG_TYPE_STOP_REQUEST with tag (" PRINTF_TIME ", %u).", @@ -1197,8 +1197,7 @@ void *federate_info_thread_TCP(void *fed) { return NULL; default: lf_print_error("RTI received from federate %d an unrecognized TCP message type: %u.", my_fed->enclave.id, buffer[0]); - if (rti_remote->base.tracing_enabled) - { + if (rti_remote->base.tracing_enabled) { tracepoint_rti_from_federate(rti_remote->base.trace, receive_UNIDENTIFIED, my_fed->enclave.id, 
NULL); } } @@ -1253,6 +1252,9 @@ static int32_t receive_and_check_fed_id_message(int *socket_id, struct sockaddr_ // First byte received is the message type. if (buffer[0] != MSG_TYPE_FED_IDS) { + if (rti_remote->base.tracing_enabled) { + tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); + } if (buffer[0] == MSG_TYPE_P2P_SENDING_FED_ID || buffer[0] == MSG_TYPE_P2P_TAGGED_MESSAGE) { // The federate is trying to connect to a peer, not to the RTI. // It has connected to the RTI instead. @@ -1265,9 +1267,6 @@ static int32_t receive_and_check_fed_id_message(int *socket_id, struct sockaddr_ } else { send_reject(socket_id, UNEXPECTED_MESSAGE); } - if (rti_remote->base.tracing_enabled) { - tracepoint_rti_to_federate(rti_remote->base.trace, send_REJECT, fed_id, NULL); - } lf_print_error("RTI expected a MSG_TYPE_FED_IDS message. Got %u (see net_common.h).", buffer[0]); return -1; } else { diff --git a/core/federated/federate.c b/core/federated/federate.c index 378dc1238..950039138 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -117,14 +117,14 @@ static void send_time(unsigned char type, instant_t time) { buffer[0] = type; encode_int64(time, &(buffer[1])); + // Trace the event when tracing is enabled + tag_t tag = {.time = time, .microstep = 0}; + tracepoint_federate_to_rti(_fed.trace, send_TIMESTAMP, _lf_my_fed_id, &tag); + LF_MUTEX_LOCK(lf_outbound_socket_mutex); write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, bytes_to_write, buffer, &lf_outbound_socket_mutex, "Failed to send time " PRINTF_TIME " to the RTI.", time - start_time); LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); - - tag_t tag = {.time = time, .microstep = 0}; - // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_TIMESTAMP, _lf_my_fed_id, &tag); } /** @@ -1446,6 +1446,9 @@ static void handle_stop_request_message() { unsigned char outgoing_buffer[MSG_TYPE_STOP_REQUEST_REPLY_LENGTH]; 
ENCODE_STOP_REQUEST_REPLY(outgoing_buffer, tag_to_stop.time, tag_to_stop.microstep); + // Trace the event when tracing is enabled + tracepoint_federate_to_rti(_fed.trace, send_STOP_REQ_REP, _lf_my_fed_id, &tag_to_stop); + // Send the current logical time to the RTI. LF_MUTEX_LOCK(lf_outbound_socket_mutex); write_to_socket_fail_on_error( @@ -1455,8 +1458,6 @@ static void handle_stop_request_message() { LF_PRINT_DEBUG("Sent MSG_TYPE_STOP_REQUEST_REPLY to RTI with tag " PRINTF_TAG, tag_to_stop.time, tag_to_stop.microstep); - // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_STOP_REQ_REP, _lf_my_fed_id, &tag_to_stop); } /** @@ -1665,12 +1666,11 @@ void terminate_execution(environment_t* env) { // MSG_TYPE_FAILED message to the RTI, but we should not acquire a mutex. if (_fed.socket_TCP_RTI >= 0) { if (_lf_normal_termination) { - send_resign_signal(env); - // Trace the event when tracing is enabled tracepoint_federate_to_rti(_fed.trace, send_RESIGN, _lf_my_fed_id, &env->current_tag); + send_resign_signal(env); } else { - send_failed_signal(env); tracepoint_federate_to_rti(_fed.trace, send_FAILED, _lf_my_fed_id, &env->current_tag); + send_failed_signal(env); } } @@ -2117,12 +2117,13 @@ void lf_create_server(int specified_port) { buffer[0] = MSG_TYPE_ADDRESS_ADVERTISEMENT; encode_int32(_fed.server_port, &(buffer[1])); + // Trace the event when tracing is enabled + tracepoint_federate_to_rti(_fed.trace, send_ADR_AD, _lf_my_fed_id, NULL); + // No need for a mutex because we have the only handle on this socket. 
write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, sizeof(int32_t) + 1, (unsigned char*)buffer, NULL, "Failed to send address advertisement."); - // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_ADR_AD, _lf_my_fed_id, NULL); LF_PRINT_DEBUG("Sent port %d to the RTI.", _fed.server_port); // Set the global server socket @@ -2232,6 +2233,9 @@ void* lf_handle_p2p_connections_from_federates(void* env_arg) { // Send an MSG_TYPE_ACK message. unsigned char response = MSG_TYPE_ACK; + // Trace the event when tracing is enabled + tracepoint_federate_to_federate(_fed.trace, send_ACK, _lf_my_fed_id, remote_fed_id, NULL); + LF_MUTEX_LOCK(lf_outbound_socket_mutex); write_to_socket_fail_on_error( &_fed.sockets_for_inbound_p2p_connections[remote_fed_id], @@ -2241,9 +2245,6 @@ void* lf_handle_p2p_connections_from_federates(void* env_arg) { remote_fed_id); LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); - // Trace the event when tracing is enabled - tracepoint_federate_to_federate(_fed.trace, send_ACK, _lf_my_fed_id, remote_fed_id, NULL); - // Start a thread to listen for incoming messages from other federates. // The fed_id is a uint16_t, which we assume can be safely cast to and from void*. 
void* fed_id_arg = (void*)(uintptr_t)remote_fed_id; @@ -2567,6 +2568,14 @@ void lf_send_port_absent_to_federate( int* socket = &_fed.sockets_for_outbound_p2p_connections[fed_ID]; #endif + if (socket == &_fed.socket_TCP_RTI) { + tracepoint_federate_to_rti( + _fed.trace, send_PORT_ABS, _lf_my_fed_id, &current_message_intended_tag); + } else { + tracepoint_federate_to_federate( + _fed.trace, send_PORT_ABS, _lf_my_fed_id, fed_ID, &current_message_intended_tag); + } + LF_MUTEX_LOCK(lf_outbound_socket_mutex); int result = write_to_socket_close_on_error(socket, message_length, buffer); LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); @@ -2582,15 +2591,6 @@ void lf_send_port_absent_to_federate( lf_print_warning("Failed to send port absent message for port %hu to federate %hu.", port_ID, fed_ID); } - } else { - // Message sent correctly. Trace it. - if (socket == &_fed.socket_TCP_RTI) { - tracepoint_federate_to_rti( - _fed.trace, send_PORT_ABS, _lf_my_fed_id, &current_message_intended_tag); - } else { - tracepoint_federate_to_federate( - _fed.trace, send_PORT_ABS, _lf_my_fed_id, fed_ID, &current_message_intended_tag); - } } } @@ -2614,6 +2614,9 @@ int lf_send_stop_request_to_rti(tag_t stop_tag) { LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); return -1; } + // Trace the event when tracing is enabled + tracepoint_federate_to_rti(_fed.trace, send_STOP_REQ, _lf_my_fed_id, &stop_tag); + write_to_socket_fail_on_error(&_fed.socket_TCP_RTI, MSG_TYPE_STOP_REQUEST_LENGTH, buffer, &lf_outbound_socket_mutex, "Failed to send stop time " PRINTF_TIME " to the RTI.", stop_tag.time - start_time); @@ -2621,8 +2624,6 @@ int lf_send_stop_request_to_rti(tag_t stop_tag) { // Treat this sending as equivalent to having received a stop request from the RTI.
_fed.received_stop_request_from_rti = true; LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); - // Trace the event when tracing is enabled - tracepoint_federate_to_rti(_fed.trace, send_STOP_REQ, _lf_my_fed_id, &stop_tag); return 0; } else { LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); From 6e4af8e6534832babbd63ad26b7e9a6cef091124 Mon Sep 17 00:00:00 2001 From: "Edward A. Lee" Date: Sun, 21 Jan 2024 10:40:27 -0800 Subject: [PATCH 83/83] Do not acquire mutex during abnormal termination --- core/federated/federate.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/core/federated/federate.c b/core/federated/federate.c index 950039138..cd9149e9e 100644 --- a/core/federated/federate.c +++ b/core/federated/federate.c @@ -821,15 +821,17 @@ static void* listen_to_federates(void* _args) { /** * Close the socket that sends outgoing messages to the - * specified federate ID. This function assumes the caller holds - * the lf_outbound_socket_mutex mutex lock, at least during normal termination. + * specified federate ID. This function acquires the lf_outbound_socket_mutex mutex lock + * if _lf_normal_termination is true and otherwise proceeds without the lock. * @param fed_id The ID of the peer federate receiving messages from this * federate, or -1 if the RTI (centralized coordination). * @param flag 0 if the socket has received EOF, 1 if not, -1 if abnormal termination. */ static void close_outbound_socket(int fed_id, int flag) { assert (fed_id >= 0 && fed_id < NUMBER_OF_FEDERATES); - LF_MUTEX_LOCK(lf_outbound_socket_mutex); + if (_lf_normal_termination) { + LF_MUTEX_LOCK(lf_outbound_socket_mutex); + } if (_fed.sockets_for_outbound_p2p_connections[fed_id] >= 0) { // Close the socket by sending a FIN packet indicating that no further writes // are expected. Then read until we get an EOF indication. 
@@ -847,7 +849,9 @@ static void close_outbound_socket(int fed_id, int flag) { close(_fed.sockets_for_outbound_p2p_connections[fed_id]); _fed.sockets_for_outbound_p2p_connections[fed_id] = -1; } - LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); + if (_lf_normal_termination) { + LF_MUTEX_UNLOCK(lf_outbound_socket_mutex); + } } #ifdef FEDERATED_AUTHENTICATED