diff --git a/Makefile b/Makefile index 11f1fa95f6..7a5f6001c3 100644 --- a/Makefile +++ b/Makefile @@ -73,6 +73,7 @@ DBG_WS ?= 0 DBG_APM ?= 0 DBG_GFSM ?= 0 DBG_HTTP ?= 0 +DBG_HTTP2 ?= 0 DBG_HTTP_FRAME ?= 0 DBG_HTTP_SESS ?= 0 DBG_HTTP_STREAM ?= 0 @@ -89,6 +90,7 @@ TFW_CFLAGS += -DDBG_HTTP_SESS=$(DBG_HTTP_SESS) TFW_CFLAGS += -DDBG_HTTP_STREAM=$(DBG_HTTP_STREAM) TFW_CFLAGS += -DDBG_HPACK=$(DBG_HPACK) -DDBG_CACHE=$(DBG_CACHE) TFW_CFLAGS += -DDBG_SRV=$(DBG_SRV) -DDBG_VHOST=$(DBG_VHOST) -DDBG_TEST=$(DBG_TEST) +TFW_CFLAGS += -DDBG_HTTP2=$(DBG_HTTP2) # By default Tempesta TLS randomizes elliptic curve points using RDRAND # instruction, which provides a high speed random numbers generator. diff --git a/fw/cache.c b/fw/cache.c index 8aa30f67a3..13cb7c61c7 100644 --- a/fw/cache.c +++ b/fw/cache.c @@ -1018,8 +1018,7 @@ tfw_cache_send_304(TfwHttpReq *req, TfwCacheEntry *ce) resp->mit.start_off = FRAME_HEADER_SIZE; - r = tfw_h2_resp_status_write(resp, 304, false, true, - stream_id); + r = tfw_h2_resp_status_write(resp, 304, false, true); if (unlikely(r)) goto err_setup; /* account for :status field itself */ @@ -1060,7 +1059,7 @@ tfw_cache_send_304(TfwHttpReq *req, TfwCacheEntry *ce) return; } - if (tfw_h2_frame_local_resp(resp, stream_id, h_len, NULL)) + if (tfw_h2_frame_local_resp(resp, h_len, NULL)) goto err_setup; tfw_h2_req_unlink_stream(req); @@ -2666,7 +2665,7 @@ tfw_cache_add_body_page(TfwMsgIter *it, char *p, int sz, bool h2, */ static int tfw_cache_build_resp_body(TDB *db, TdbVRec *trec, TfwMsgIter *it, char *p, - unsigned long body_sz, bool h2, unsigned int stream_id) + unsigned long body_sz, bool h2) { int r; bool sh_frag = h2 ? false : true; @@ -2702,10 +2701,6 @@ tfw_cache_build_resp_body(TDB *db, TdbVRec *trec, TfwMsgIter *it, char *p, !body_sz); if (r) return r; - if (stream_id) { - skb_set_tfw_flags(it->skb, SS_F_HTTT2_FRAME_DATA); - skb_set_tfw_cb(it->skb, stream_id); - } } if (!body_sz || !(trec = tdb_next_rec_chunk(db, trec))) break; @@ -2728,8 +2723,7 @@ tfw_cache_build_resp_body(TDB *db, TdbVRec *trec, TfwMsgIter *it, char *p, } static int -tfw_cache_set_hdr_age(TfwHttpResp *resp, TfwCacheEntry *ce, - unsigned int stream_id) +tfw_cache_set_hdr_age(TfwHttpResp *resp, TfwCacheEntry *ce) { int r; size_t digs; @@ -2760,8 +2754,7 @@ tfw_cache_set_hdr_age(TfwHttpResp *resp, TfwCacheEntry *ce, if (to_h2) { h_age.hpack_idx = 21; - if ((r = tfw_hpack_encode(resp, &h_age, false, false, - stream_id))) + if ((r = tfw_hpack_encode(resp, &h_age, false, false))) goto err; } else { if ((r = tfw_http_msg_expand_data(&mit->iter, skb_head, @@ -2803,8 +2796,7 @@ tfw_cache_set_hdr_age(TfwHttpResp *resp, TfwCacheEntry *ce, * TODO use iterator and passed skbs to be called from net_tx_action. */ static TfwHttpResp * -tfw_cache_build_resp(TfwHttpReq *req, TfwCacheEntry *ce, long lifetime, - unsigned int stream_id) +tfw_cache_build_resp(TfwHttpReq *req, TfwCacheEntry *ce, long lifetime) { int h; TfwStr dummy_body = { 0 }; @@ -2863,14 +2855,14 @@ tfw_cache_build_resp(TfwHttpReq *req, TfwCacheEntry *ce, long lifetime, * Set 'set-cookie' header if needed, for HTTP/2 or HTTP/1.1 * response. */ - if (tfw_http_sess_resp_process(resp, true, stream_id)) + if (tfw_http_sess_resp_process(resp, true)) goto free; /* * RFC 7234 p.4 Constructing Responses from Caches: * When a stored response is used to satisfy a request without * validation, a cache MUST generate an Age header field. 
*/ - if (tfw_cache_set_hdr_age(resp, ce, stream_id)) + if (tfw_cache_set_hdr_age(resp, ce)) goto free; if (!TFW_MSG_H2(req)) { @@ -2898,11 +2890,11 @@ tfw_cache_build_resp(TfwHttpReq *req, TfwCacheEntry *ce, long lifetime, } /* Set additional headers for HTTP/2 response. */ - if (tfw_h2_resp_add_loc_hdrs(resp, h_mods, true, stream_id) + if (tfw_h2_resp_add_loc_hdrs(resp, h_mods, true) || (lifetime > ce->lifetime - && tfw_h2_set_stale_warn(resp, stream_id)) + && tfw_h2_set_stale_warn(resp)) || (!test_bit(TFW_HTTP_B_HDR_DATE, resp->flags) - && tfw_h2_add_hdr_date(resp, true, stream_id))) + && tfw_h2_add_hdr_date(resp, true))) goto free; h_len += mit->acc_len; @@ -2923,7 +2915,7 @@ tfw_cache_build_resp(TfwHttpReq *req, TfwCacheEntry *ce, long lifetime, * send content in the response. */ dummy_body.len = req->method != TFW_HTTP_METH_HEAD ? ce->body_len : 0; - if (tfw_h2_frame_local_resp(resp, stream_id, h_len, &dummy_body)) + if (tfw_h2_frame_local_resp(resp, h_len, &dummy_body)) goto free; it->skb = ss_skb_peek_tail(&it->skb_head); it->frag = skb_shinfo(it->skb)->nr_frags - 1; @@ -2933,7 +2925,7 @@ tfw_cache_build_resp(TfwHttpReq *req, TfwCacheEntry *ce, long lifetime, BUG_ON(p != TDB_PTR(db->hdr, ce->body)); if (ce->body_len && req->method != TFW_HTTP_METH_HEAD) { if (tfw_cache_build_resp_body(db, trec, it, p, ce->body_len, - TFW_MSG_H2(req), stream_id)) + TFW_MSG_H2(req))) goto free; } resp->content_length = ce->body_len; @@ -2994,8 +2986,7 @@ cache_req_process_node(TfwHttpReq *req, tfw_http_cache_cb_t action) } } - resp = tfw_cache_build_resp(req, ce, lifetime, id); - + resp = tfw_cache_build_resp(req, ce, lifetime); /* * The stream of HTTP/2-request should be closed here since we have * successfully created the resulting response from cache and will diff --git a/fw/connection.h b/fw/connection.h index 7a69955bb4..bf1bc34222 100644 --- a/fw/connection.h +++ b/fw/connection.h @@ -28,7 +28,7 @@ #include "gfsm.h" #include "peer.h" #include "sync_socket.h" -#include "http_frame.h" +#include "http2.h" #include "tls.h" /* We account users with FRANG_FREQ frequency per second. */ @@ -292,7 +292,7 @@ typedef struct { */ #define tfw_h2_context_unsafe(conn) ((TfwH2Ctx *)(&((TfwH2Conn *)conn)->h2)) #define tfw_h2_context_safe(conn) \ - ttls_hs_done(tfw_tls_context(conn)) ? tfw_h2_context_unsafe(conn) : NULL; + ttls_hs_done(tfw_tls_context(conn)) ? tfw_h2_context_unsafe(conn) : NULL /* Callbacks used by l5-l7 protocols to operate on connection level. 
*/ @@ -548,8 +548,8 @@ tfw_connection_unlink_from_sk(struct sock *sk) sk->sk_data_ready = NULL; sk->sk_state_change = NULL; - sk->sk_prepare_xmit = NULL; sk->sk_write_xmit = NULL; + sk->sk_fill_write_queue = NULL; sk->sk_destroy_cb = NULL; sk->sk_user_data = NULL; diff --git a/fw/hpack.c b/fw/hpack.c index ff085de011..c8b9a7b15b 100644 --- a/fw/hpack.c +++ b/fw/hpack.c @@ -1116,7 +1116,6 @@ tfw_hpack_init(TfwHPack *__restrict hp, unsigned int htbl_sz) goto err_dt; et->window = htbl_sz; - spin_lock_init(&et->lock); et->rb_size = HPACK_ENC_TABLE_MAX_SIZE; if (!(et->pool = __tfw_pool_new(HPACK_ENC_TABLE_MAX_SIZE))) goto err_et; @@ -3162,12 +3161,6 @@ tfw_hpack_encoder_index(TfwHPackETbl *__restrict tbl, if (WARN_ON_ONCE(!hdr)) return -EINVAL; - spin_lock(&tbl->lock); - - if (!test_bit(TFW_HTTP_B_H2_TRANS_ENTERED, flags) - && atomic64_read(&tbl->guard) < 0) - goto out; - tfw_http_hdr_split(hdr, &h_name, &h_val, spcolon); if (WARN_ON_ONCE(TFW_STR_EMPTY(&h_name))) return -EINVAL; @@ -3177,76 +3170,13 @@ tfw_hpack_encoder_index(TfwHPackETbl *__restrict tbl, *out_index = HPACK_NODE_GET_INDEX(tbl, node); - /* - * Encoder dynamic index can be in three states: initial state (@guard - * is zero), read state (@guard is 1 or greater), and write state - * (@guard is -1); in read state any thread can search in index, but - * nobody can add or evict entries in index; if index in the write state - * only one thread (current writer) can add/evict entries in index and - * nobody can search in index; index can be switched to write state - * only from initial state (in general case) or from read state (if - * current reader is the sole read owner of the index). - */ - if (!test_bit(TFW_HTTP_B_H2_TRANS_ENTERED, flags)) { - if(res != HPACK_IDX_ST_FOUND - && !atomic64_read(&tbl->guard) - && !tfw_hpack_add_node(tbl, hdr, &place, spcolon)) - { - res |= HPACK_IDX_FLAG_ADD; - atomic64_set(&tbl->guard, -1); - __set_bit(TFW_HTTP_B_H2_TRANS_ENTERED, flags); - } - else if (res != HPACK_IDX_ST_NOT_FOUND) - { - atomic64_inc(&tbl->guard); - __set_bit(TFW_HTTP_B_H2_TRANS_ENTERED, flags); - } - } - else { - /* - * If value of guard is 1, we are the sole owner of the encoder - * dynamic index with read rights, thus we can write to it. - * Note, that @guard cannot be zero here, since we are already - * owning encoder index with read or write rights (i.e. the flag - * @TFW_HTTP_B_H2_TRANS_ENTERED is set for the corrently - * processed message), thus we have already set the @guard - * equal to 1 (or greater) or to -1 before. 
- */ - WARN_ON_ONCE(!atomic64_read(&tbl->guard)); - if (res != HPACK_IDX_ST_FOUND - && atomic64_read(&tbl->guard) <= 1 - && !tfw_hpack_add_node(tbl, hdr, &place, spcolon)) - { - res |= HPACK_IDX_FLAG_ADD; - atomic64_set(&tbl->guard, -1); - } - } - -out: - spin_unlock(&tbl->lock); + if(res != HPACK_IDX_ST_FOUND + && !tfw_hpack_add_node(tbl, hdr, &place, spcolon)) + res |= HPACK_IDX_FLAG_ADD; return res; } -void -tfw_hpack_enc_release(TfwHPack *__restrict hp, unsigned long *flags) -{ - TfwHPackETbl *tbl = &hp->enc_tbl; - - if (!test_bit(TFW_HTTP_B_H2_TRANS_ENTERED, flags)) - return; - - if (atomic64_read(&tbl->guard) < 0) { - atomic64_set(&tbl->guard, 0); - } - else { - WARN_ON_ONCE(!atomic64_read(&tbl->guard)); - atomic64_dec(&tbl->guard); - } - - __clear_bit(TFW_HTTP_B_H2_TRANS_ENTERED, flags); -} - static unsigned long tfw_huffman_encode_string_len(TfwStr *str) { @@ -3617,17 +3547,16 @@ tfw_hpack_hdr_expand(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr, */ static int __tfw_hpack_encode(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr, - bool use_pool, bool dyn_indexing, bool trans, - unsigned int stream_id) + bool use_pool, bool dyn_indexing, bool trans) { TfwHPackInt idx; bool st_full_index; unsigned short st_index, index = 0; - TfwH2Ctx *ctx = tfw_h2_context_unsafe(resp->req->conn); + TfwConn *conn = resp->req->conn; + TfwH2Ctx *ctx = tfw_h2_context_unsafe(conn); TfwHPackETbl *tbl = &ctx->hpack.enc_tbl; int r = HPACK_IDX_ST_NOT_FOUND; bool name_indexed = true; - struct sk_buff *skb = resp->mit.iter.skb; if (WARN_ON_ONCE(!hdr || TFW_STR_EMPTY(hdr))) return -EINVAL; @@ -3640,6 +3569,7 @@ __tfw_hpack_encode(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr, T_DBG_PRINT_HPACK_RBTREE(tbl); if (!st_full_index && dyn_indexing) { + assert_spin_locked(&conn->sk->sk_lock.slock); r = tfw_hpack_encoder_index(tbl, hdr, &index, resp->flags, trans); if (r < 0) @@ -3664,7 +3594,7 @@ __tfw_hpack_encode(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr, return r; resp->mit.acc_len += idx.sz * !use_pool; - goto set_skb_priv; + return 0; } if (st_index || HPACK_IDX_RES(r) == HPACK_IDX_ST_NM_FOUND) { @@ -3697,30 +3627,14 @@ __tfw_hpack_encode(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr, r = tfw_hpack_hdr_add(resp, hdr, &idx, name_indexed, trans); else r = tfw_hpack_hdr_expand(resp, hdr, &idx, name_indexed); -set_skb_priv: - if (likely(!r) && stream_id) { - /* - * Very long headers can be located in several skbs, - * mark them all. - */ - while(skb && unlikely(skb != resp->mit.iter.skb)) { - skb_set_tfw_flags(skb, SS_F_HTTT2_FRAME_HEADERS); - skb_set_tfw_cb(skb, stream_id); - skb = skb->next; - } - - skb_set_tfw_flags(resp->mit.iter.skb, SS_F_HTTT2_FRAME_HEADERS); - skb_set_tfw_cb(resp->mit.iter.skb, stream_id); - } return r; } int tfw_hpack_encode(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr, - bool use_pool, bool dyn_indexing, unsigned int stream_id) + bool use_pool, bool dyn_indexing) { - return __tfw_hpack_encode(resp, hdr, use_pool, dyn_indexing, false, - stream_id); + return __tfw_hpack_encode(resp, hdr, use_pool, dyn_indexing, false); } /* @@ -3728,10 +3642,9 @@ tfw_hpack_encode(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr, * into the HTTP/2 HPACK format. 
*/ int -tfw_hpack_transform(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr, - unsigned int stream_id) +tfw_hpack_transform(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr) { - return __tfw_hpack_encode(resp, hdr, true, true, true, stream_id); + return __tfw_hpack_encode(resp, hdr, true, true, true); } void @@ -3743,8 +3656,6 @@ tfw_hpack_set_rbuf_size(TfwHPackETbl *__restrict tbl, unsigned short new_size) new_size = HPACK_ENC_TABLE_MAX_SIZE; } - spin_lock(&tbl->lock); - T_DBG3("%s: tbl->rb_len=%hu, tbl->size=%hu, tbl->window=%hu," " new_size=%hu\n", __func__, tbl->rb_len, tbl->size, tbl->window, new_size); @@ -3757,7 +3668,7 @@ tfw_hpack_set_rbuf_size(TfwHPackETbl *__restrict tbl, unsigned short new_size) * size that occurs in that interval MUST be signaled in a dynamic * table size update. */ - if (tbl->window != new_size && (likely(!atomic_read(&tbl->wnd_changed)) + if (tbl->window != new_size && (likely(!tbl->wnd_changed) || unlikely(!tbl->window) || new_size < tbl->window)) { if (tbl->size > new_size) @@ -3766,20 +3677,16 @@ tfw_hpack_set_rbuf_size(TfwHPackETbl *__restrict tbl, unsigned short new_size) WARN_ON_ONCE(tbl->rb_len > tbl->size); tbl->window = new_size; - atomic_set(&tbl->wnd_changed, 1); + tbl->wnd_changed = true; } - - spin_unlock(&tbl->lock); } int -tfw_hpack_enc_tbl_write_sz(TfwHPackETbl *__restrict tbl, struct sock *sk, - struct sk_buff *skb, TfwStream *stream, - unsigned int mss_now, unsigned int *t_tz) +tfw_hpack_enc_tbl_write_sz(TfwHPackETbl *__restrict tbl, TfwStream *stream) { TfwMsgIter it = { - .skb = skb, - .skb_head = ((struct sk_buff *)&sk->sk_write_queue), + .skb = stream->xmit.skb_head, + .skb_head = stream->xmit.skb_head, .frag = -1 }; TfwStr new_size = {}; @@ -3787,51 +3694,23 @@ tfw_hpack_enc_tbl_write_sz(TfwHPackETbl *__restrict tbl, struct sock *sk, char *data; int r = 0; - /* - * We should encode hpack dynamic table size, only in case when - * it was changed and only once. - */ - if (unlikely(atomic_cmpxchg(&tbl->wnd_changed, 1, -1) == 1)) { - write_int(tbl->window, 0x1F, 0x20, &tmp); - new_size.data = tmp.buf; - new_size.len = tmp.sz; - - data = tfw_http_iter_set_at_skb(&it, skb, FRAME_HEADER_SIZE); - if (!data) { - r = -E2BIG; - goto finish; - } + WARN_ON_ONCE(!tbl->wnd_changed); - r = tfw_h2_insert_frame_header(sk, skb, stream, mss_now, &it, - &data, &new_size, t_tz); - if (unlikely(r)) - goto finish; + write_int(tbl->window, 0x1F, 0x20, &tmp); + new_size.data = tmp.buf; + new_size.len = tmp.sz; - stream->xmit.h_len += tmp.sz; - } + data = ss_skb_data_ptr_by_offset(stream->xmit.skb_head, + FRAME_HEADER_SIZE); + BUG_ON(!data); -finish: + r = tfw_http_msg_insert(&it, &data, &new_size); if (unlikely(r)) - /* - * In case of error we should restore value of `wnd_changed` - * flag. - */ - atomic_set(&tbl->wnd_changed, 1); - return r; -} + return r; -void -tfw_hpack_enc_tbl_write_sz_release(TfwHPackETbl *__restrict tbl, int r) -{ - /* - * Before calling this function, we should check that we encode - * new dynamic table size into the frame, so `old` can have only - * two values (-1 in most of all cases, since we set it previosly - * or 1 if changing of dynamic table size was occured, before this - * function is called). - * We should change this flag only if it wasn't changed by - * `tfw_hpack_set_rbuf_size` function. - */ - int old = atomic_cmpxchg(&tbl->wnd_changed, -1, r == 0 ? 
0 : 1); - WARN_ON_ONCE(!old); + stream->xmit.h_len += tmp.sz; + tbl->wnd_changed = false; + + return 0; } + diff --git a/fw/hpack.h b/fw/hpack.h index 558810d578..010e2f9a16 100644 --- a/fw/hpack.h +++ b/fw/hpack.h @@ -90,32 +90,21 @@ typedef struct { * * @window - maximum pseudo-length of the dynamic table (in bytes); this * value used as threshold to flushing old entries; - * @wnd_changed - flag indicates, that window was changed by settings update, - * - can be in three states: - * - 0 in case when window size isn't changed. - * - 1 in case when window size is changed and it should be written - * into the first response, before the first header block. - * - -1 in case when window size is written into the first response, - * but this response was not sent to a client yet. + * @wnd_changed - flag indicates, that window was changed by settings update; * @rbuf - pointer to the ring buffer; * @root - pointer to the root node of binary tree; * @pool - memory pool for dynamic table; * @idx_acc - current accumulated index, intended for real indexes * calculation; - * @guard - atomic protection against races during entries - * addition/eviction in encoder dynamic index; - * @lock - spinlock to synchronize concurrent access to encoder index. */ typedef struct { TFW_HPACK_ETBL_COMMON; unsigned short window; - atomic_t wnd_changed; + bool wnd_changed; char *rbuf; TfwHPackNode *root; TfwPool *pool; unsigned long idx_acc; - atomic64_t guard; - spinlock_t lock; } TfwHPackETbl; /** @@ -314,10 +303,9 @@ void write_int(unsigned long index, unsigned short max, unsigned short mask, TfwHPackInt *__restrict res_idx); int tfw_hpack_init(TfwHPack *__restrict hp, unsigned int htbl_sz); void tfw_hpack_clean(TfwHPack *__restrict hp); -int tfw_hpack_transform(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr, - unsigned int stream_id); +int tfw_hpack_transform(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr); int tfw_hpack_encode(TfwHttpResp *__restrict resp, TfwStr *__restrict hdr, - bool use_pool, bool dyn_indexing, unsigned int stream_id); + bool use_pool, bool dyn_indexing); void tfw_hpack_set_rbuf_size(TfwHPackETbl *__restrict tbl, unsigned short new_size); int tfw_hpack_decode(TfwHPack *__restrict hp, unsigned char *__restrict src, @@ -327,11 +315,7 @@ int tfw_hpack_cache_decode_expand(TfwHPack *__restrict hp, TfwHttpResp *__restrict resp, unsigned char *__restrict src, unsigned long n, TfwDecodeCacheIter *__restrict cd_iter); -void tfw_hpack_enc_release(TfwHPack *__restrict hp, unsigned long *flags); -int tfw_hpack_enc_tbl_write_sz(TfwHPackETbl *__restrict tbl, struct sock *sk, - struct sk_buff *skb, TfwStream *stream, - unsigned int mss_now, unsigned int *t_tz); -void tfw_hpack_enc_tbl_write_sz_release(TfwHPackETbl *__restrict tbl, int r); +int tfw_hpack_enc_tbl_write_sz(TfwHPackETbl *__restrict tbl, TfwStream *stream); static inline unsigned int tfw_hpack_int_size(unsigned long index, unsigned short max) diff --git a/fw/http.c b/fw/http.c index b1bc9d4782..f3049a1b4b 100644 --- a/fw/http.c +++ b/fw/http.c @@ -586,10 +586,14 @@ tfw_http_resp_status_line(int status, size_t *len) /* * Preparing custom HTTP2 response to a client. + * We don't use hpack dynamic indexing in this function, because + * this function is used only for local responses and redirections + * which are used quite rarely. Also we don't use dynamic indexing + * for cache responses, which is much more significant (#1801). The + * behaviour may be changed during solving #1801. 
*/ static int -tfw_h2_prep_resp(TfwHttpResp *resp, unsigned short status, TfwStr *msg, - unsigned int stream_id) +tfw_h2_prep_resp(TfwHttpResp *resp, unsigned short status, TfwStr *msg) { int r, i; unsigned long hdrs_len = 0; @@ -601,16 +605,9 @@ tfw_h2_prep_resp(TfwHttpResp *resp, unsigned short status, TfwStr *msg, }; TfwStr *body = NULL; - BUG_ON(!resp->req); - if (!stream_id) { - stream_id = tfw_h2_req_stream_id(resp->req); - if (unlikely(!stream_id)) - return -EPIPE; - } - /* Set HTTP/2 ':status' pseudo-header. */ mit->start_off = FRAME_HEADER_SIZE; - r = tfw_h2_resp_status_write(resp, status, false, false, stream_id); + r = tfw_h2_resp_status_write(resp, status, false, true); if (unlikely(r)) goto out; @@ -636,7 +633,7 @@ tfw_h2_prep_resp(TfwHttpResp *resp, unsigned short status, TfwStr *msg, __TFW_STR_CH(&hdr, 0)->hpack_idx = name->hpack_idx; r = tfw_hpack_encode(resp, __TFW_STR_CH(&hdr, 0), - false, false, stream_id); + false, false); if (unlikely(r)) goto out; @@ -661,8 +658,7 @@ tfw_h2_prep_resp(TfwHttpResp *resp, unsigned short status, TfwStr *msg, __TFW_STR_CH(&hdr, 1)->len; hdr.hpack_idx = name->hpack_idx; - if ((r = tfw_hpack_encode(resp, &hdr, false, true, - stream_id))) + if ((r = tfw_hpack_encode(resp, &hdr, false, false))) goto out; } } @@ -677,7 +673,7 @@ tfw_h2_prep_resp(TfwHttpResp *resp, unsigned short status, TfwStr *msg, body = TFW_STR_BODY_CH(msg); - r = tfw_h2_frame_local_resp(resp, stream_id, hdrs_len, body); + r = tfw_h2_frame_local_resp(resp, hdrs_len, body); out: if (r) @@ -867,7 +863,7 @@ do { \ }; r = TFW_MSG_H2(req) - ? tfw_h2_prep_resp(resp, status, &msg, 0) + ? tfw_h2_prep_resp(resp, status, &msg) : tfw_h1_prep_resp(resp, status, &msg); return r; @@ -992,6 +988,17 @@ tfw_http_resp_pair_free(TfwHttpReq *req) tfw_http_conn_msg_free((TfwHttpMsg *)req); } +void +tfw_http_resp_pair_free_and_put_conn(void *opaque_data) +{ + TfwHttpResp *resp = (TfwHttpResp *)(opaque_data); + TfwHttpReq *req = resp->req; + + BUG_ON(!req || !req->conn); + tfw_connection_put(req->conn); + tfw_http_resp_pair_free(req); +} + /* * Close the client connection and free unpaired request. This function * is needed for cases when we cannot prepare response for this request. 
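/*
 * Editor's note (illustrative sketch, not part of the patch): the helper above
 * takes a void pointer because, once a response is handed to the per-stream
 * xmit machinery, the response pointer travels with the skb head as
 * TFW_SKB_CB(...)->opaque_data (see tfw_http_on_send_resp() and
 * tfw_h2_resp_fwd() in the hunks below). The ownership contract is roughly:
 *
 *	if (TFW_SKB_CB(resp->msg.skb_head)->opaque_data == resp)
 *		;	// xmit path owns the pair; it is released later via
 *			// tfw_http_resp_pair_free_and_put_conn(resp)
 *	else
 *		tfw_http_resp_pair_free_and_put_conn(resp);
 *
 * Where exactly the release callback is registered is an assumption here; it
 * is expected to happen in the stream xmit setup code outside this hunk.
 */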
@@ -1052,7 +1059,7 @@ tfw_http_enum_resp_code(int status) */ int tfw_h2_resp_status_write(TfwHttpResp *resp, unsigned short status, - bool use_pool, bool cache, unsigned int stream_id) + bool use_pool, bool cache) { int ret; unsigned short index = tfw_h2_pseudo_index(status); @@ -1078,8 +1085,7 @@ tfw_h2_resp_status_write(TfwHttpResp *resp, unsigned short status, if (!tfw_ultoa(status, __TFW_STR_CH(&s_hdr, 1)->data, H2_STAT_VAL_LEN)) return -E2BIG; - if ((ret = tfw_hpack_encode(resp, &s_hdr, use_pool, !cache, - stream_id))) + if ((ret = tfw_hpack_encode(resp, &s_hdr, use_pool, !cache))) return ret; /* set status on response for access logging */ @@ -1091,27 +1097,27 @@ tfw_h2_resp_status_write(TfwHttpResp *resp, unsigned short status, void tfw_h2_resp_fwd(TfwHttpResp *resp) { + bool resp_in_xmit = + (TFW_SKB_CB(resp->msg.skb_head)->opaque_data == resp); TfwHttpReq *req = resp->req; - TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); + TfwConn *conn = req->conn; - tfw_connection_get(req->conn); + tfw_connection_get(conn); do_access_log(resp); - if (tfw_cli_conn_send((TfwCliConn *)req->conn, (TfwMsg *)resp)) { + if (tfw_cli_conn_send((TfwCliConn *)conn, (TfwMsg *)resp)) { T_DBG("%s: cannot send data to client via HTTP/2\n", __func__); TFW_INC_STAT_BH(serv.msgs_otherr); - tfw_connection_close(req->conn, true); - } - else { + tfw_connection_close(conn, true); + /* We can't send response, so we should free it here. */ + resp_in_xmit = false; + } else { TFW_INC_STAT_BH(serv.msgs_forwarded); tfw_inc_global_hm_stats(resp->status); } - tfw_connection_put(req->conn); - - tfw_hpack_enc_release(&ctx->hpack, resp->flags); - - tfw_http_resp_pair_free(req); + if (!resp_in_xmit) + tfw_http_resp_pair_free_and_put_conn(resp); } /* @@ -1127,14 +1133,16 @@ tfw_h2_resp_fwd(TfwHttpResp *resp) */ static void tfw_h2_send_resp(TfwHttpReq *req, TfwStr *msg, int status, - unsigned int stream_id) + bool close_after_send) { - TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); TfwHttpResp *resp = tfw_http_msg_alloc_resp_light(req); if (unlikely(!resp)) goto err; - if (tfw_h2_prep_resp(resp, status, msg, stream_id)) + if (close_after_send) + set_bit(TFW_HTTP_B_CLOSE_ERROR_RESPONSE, resp->flags); + + if (tfw_h2_prep_resp(resp, status, msg)) goto err_setup; /* Send resulting HTTP/2 response and release HPACK encoder index. */ @@ -1146,8 +1154,6 @@ tfw_h2_send_resp(TfwHttpReq *req, TfwStr *msg, int status, T_DBG("%s: HTTP/2 response message transformation error: conn=[%p]\n", __func__, req->conn); - tfw_hpack_enc_release(&ctx->hpack, resp->flags); - tfw_http_msg_free((TfwHttpMsg *)resp); err: tfw_http_resp_build_error(req); @@ -1219,12 +1225,12 @@ tfw_http_prep_err_resp(TfwHttpReq *req, int status, TfwStr *msg) * pairing for pipelined requests is violated. 
*/ static void -tfw_h2_send_err_resp(TfwHttpReq *req, int status, unsigned int stream_id) +tfw_h2_send_err_resp(TfwHttpReq *req, int status, bool close_after_send) { TfwStr msg = MAX_PREDEF_RESP; tfw_http_prep_err_resp(req, status, &msg); - tfw_h2_send_resp(req, &msg, status, stream_id); + tfw_h2_send_resp(req, &msg, status, close_after_send); } /* @@ -1485,7 +1491,7 @@ tfw_http_send_err_resp(TfwHttpReq *req, int status, const char *reason) TFW_NO_PORT, status); if (TFW_MSG_H2(req)) - tfw_h2_send_err_resp(req, status, 0); + tfw_h2_send_err_resp(req, status, false); else tfw_h1_send_err_resp(req, status); } @@ -1494,7 +1500,7 @@ static void tfw_http_send_resp(TfwHttpReq *req, TfwStr *msg, int status) { if (TFW_MSG_H2(req)) { - tfw_h2_send_resp(req, msg, status, 0); + tfw_h2_send_resp(req, msg, status, false); } else { TfwCliConn *cli_conn = (TfwCliConn *)req->conn; @@ -4101,7 +4107,7 @@ tfw_http_adjust_resp(TfwHttpResp *resp) return r; } - r = tfw_http_sess_resp_process(resp, false, 0); + r = tfw_http_sess_resp_process(resp, false); if (r < 0) return r; @@ -4334,7 +4340,7 @@ tfw_h2_hdr_map(TfwHttpResp *resp, const TfwStr *hdr, unsigned int id) * transformation. */ static int -tfw_h2_add_hdr_via(TfwHttpResp *resp, unsigned int stream_id) +tfw_h2_add_hdr_via(TfwHttpResp *resp) { int r; TfwGlobal *g_vhost = tfw_vhost_get_global(); @@ -4354,7 +4360,7 @@ tfw_h2_add_hdr_via(TfwHttpResp *resp, unsigned int stream_id) via.hpack_idx = 60; - r = tfw_hpack_encode(resp, &via, true, true, stream_id); + r = tfw_hpack_encode(resp, &via, true, true); if (unlikely(r)) T_ERR("HTTP/2: unable to add 'via' header (resp=[%p])\n", resp); else @@ -4367,7 +4373,7 @@ tfw_h2_add_hdr_via(TfwHttpResp *resp, unsigned int stream_id) * transformation and for building response from cache. */ int -tfw_h2_add_hdr_date(TfwHttpResp *resp, bool cache, unsigned int stream_id) +tfw_h2_add_hdr_date(TfwHttpResp *resp, bool cache) { int r; char *s_date = *this_cpu_ptr(&g_buf); @@ -4384,7 +4390,7 @@ tfw_h2_add_hdr_date(TfwHttpResp *resp, bool cache, unsigned int stream_id) hdr.hpack_idx = 33; - r = tfw_hpack_encode(resp, &hdr, !cache, !cache, stream_id); + r = tfw_hpack_encode(resp, &hdr, !cache, !cache); if (unlikely(r)) T_ERR("HTTP/2: unable to add 'date' header to response" " [%p]\n", resp); @@ -4398,7 +4404,7 @@ tfw_h2_add_hdr_date(TfwHttpResp *resp, bool cache, unsigned int stream_id) * Add 'Content-Length:' header field to an HTTP message. */ static int -tfw_h2_add_hdr_clen(TfwHttpResp *resp, unsigned int stream_id) +tfw_h2_add_hdr_clen(TfwHttpResp *resp) { int r; char* buf = *this_cpu_ptr(&g_buf); @@ -4408,7 +4414,7 @@ tfw_h2_add_hdr_clen(TfwHttpResp *resp, unsigned int stream_id) r = tfw_h2_msg_hdr_add(resp, "content-length", SLEN("content-length"), buf, - cl_valsize, 28, stream_id); + cl_valsize, 28); if (unlikely(r)) T_ERR("%s: unable to add 'content-length' header (resp=[%p])\n", @@ -4426,7 +4432,7 @@ tfw_h2_add_hdr_clen(TfwHttpResp *resp, unsigned int stream_id) * from transfer encoding. 
*/ static int -tfw_h2_add_hdr_cenc(TfwHttpResp *resp, TfwStr *value, unsigned int stream_id) +tfw_h2_add_hdr_cenc(TfwHttpResp *resp, TfwStr *value) { int r; TfwStr name = { .data = "content-encoding", @@ -4441,7 +4447,7 @@ tfw_h2_add_hdr_cenc(TfwHttpResp *resp, TfwStr *value, unsigned int stream_id) .hpack_idx = 26 }; - r = tfw_hpack_encode(resp, &hdr, true, true, stream_id); + r = tfw_hpack_encode(resp, &hdr, true, true); if (unlikely(r)) goto err; @@ -4511,7 +4517,7 @@ tfw_http_resp_copy_encodings(TfwHttpResp *resp, TfwStr* dst, size_t max_len) * In case if response is stale, we should pass it with a warning. */ int -tfw_h2_set_stale_warn(TfwHttpResp *resp, unsigned int stream_id) +tfw_h2_set_stale_warn(TfwHttpResp *resp) { TfwStr wh = { .chunks = (TfwStr []){ @@ -4522,7 +4528,7 @@ tfw_h2_set_stale_warn(TfwHttpResp *resp, unsigned int stream_id) .nchunks = 2 }; - return tfw_hpack_encode(resp, &wh, false, false, stream_id); + return tfw_hpack_encode(resp, &wh, false, false); } /* @@ -4663,7 +4669,7 @@ tfw_h2_hdr_size(unsigned long n_len, unsigned long v_len, int tfw_h2_resp_add_loc_hdrs(TfwHttpResp *resp, const TfwHdrMods *h_mods, - bool cache, unsigned int stream_id) + bool cache) { unsigned int i; TfwHttpHdrTbl *ht = resp->h_tbl; @@ -4687,8 +4693,7 @@ tfw_h2_resp_add_loc_hdrs(TfwHttpResp *resp, const TfwHdrMods *h_mods, continue; } - r = tfw_hpack_encode(resp, desc->hdr, !cache, !cache, - stream_id); + r = tfw_hpack_encode(resp, desc->hdr, !cache, !cache); if (unlikely(r)) return r; } @@ -4730,8 +4735,7 @@ tfw_h2_hdr_sub(unsigned short hid, const TfwStr *hdr, const TfwHdrMods *h_mods) } static int -tfw_h2_hpack_encode_headers(TfwHttpResp *resp, const TfwHdrMods *h_mods, - unsigned int stream_id) +tfw_h2_hpack_encode_headers(TfwHttpResp *resp, const TfwHdrMods *h_mods) { int r; unsigned int i; @@ -4778,7 +4782,7 @@ tfw_h2_hpack_encode_headers(TfwHttpResp *resp, const TfwHdrMods *h_mods, if (hid == TFW_HTTP_HDR_SERVER) continue; - r = tfw_hpack_transform(resp, tgt, stream_id); + r = tfw_hpack_transform(resp, tgt); if (unlikely(r)) return r; } @@ -4796,8 +4800,7 @@ tfw_h2_hpack_encode_headers(TfwHttpResp *resp, const TfwHdrMods *h_mods, * processing thus no chunked body allowed, only plain TfwStr is accepted there. */ static int -tfw_h2_append_predefined_body(TfwHttpResp *resp, unsigned int stream_id, - const TfwStr *body) +tfw_h2_append_predefined_body(TfwHttpResp *resp, const TfwStr *body) { TfwHttpTransIter *mit = &resp->mit; TfwMsgIter *it = &mit->iter; @@ -4842,10 +4845,6 @@ tfw_h2_append_predefined_body(TfwHttpResp *resp, unsigned int stream_id, skb_fill_page_desc(it->skb, it->frag, page, 0, copy); ss_skb_adjust_data_len(it->skb, copy); - BUG_ON(!stream_id); - skb_set_tfw_flags(it->skb, SS_F_HTTT2_FRAME_DATA); - skb_set_tfw_cb(it->skb, stream_id); - if (it->frag + 1 == MAX_SKB_FRAGS && (r = tfw_msg_iter_append_skb(it))) { @@ -4856,54 +4855,51 @@ tfw_h2_append_predefined_body(TfwHttpResp *resp, unsigned int stream_id, return 0; } -/** - * Frame forwarded response. 
- */ -static int -tfw_h2_frame_fwd_resp(TfwHttpResp *resp, unsigned int stream_id, - unsigned long h_len) +int +tfw_http_on_send_resp(void *conn, struct sk_buff **skb_head) { - unsigned long b_len = TFW_HTTP_RESP_CUT_BODY_SZ(resp); - TfwMsgIter iter = {.frag = -1, .skb_head = resp->msg.skb_head}; - int r = 0; - - r = tfw_h2_stream_init_for_xmit(resp->req, h_len, b_len); - if (unlikely(r)) - return r; - - if (test_bit(TFW_HTTP_B_CHUNKED, resp->flags)) { - r = tfw_http_msg_cutoff_body_chunks(resp); - if (unlikely(r)) - return r; - } + TfwH2Ctx *ctx = tfw_h2_context_unsafe((TfwConn *)conn); + struct tfw_skb_cb *tfw_cb = TFW_SKB_CB(*skb_head); + TfwStream *stream; - if (b_len) { - if (test_bit(TFW_HTTP_B_CHUNKED, resp->flags)) - iter.skb = resp->body_start_skb; - else - iter.skb = resp->body.skb; - tfw_msg_iter_set_skb_priv(&iter, stream_id, - SS_F_HTTT2_FRAME_DATA); - } + stream = tfw_h2_find_not_closed_stream(ctx, tfw_cb->stream_id, false); + /* + * Very unlikely case. We check that stream is active, before + * calling ss_send, but there is a very small chance, that + * stream was canceled by RST STREAM from the client + * before ss_do_send was called. + */ + if (unlikely(!stream)) + return -EPIPE; + + BUG_ON(stream->xmit.skb_head); + stream->xmit.resp = (TfwHttpResp *)tfw_cb->opaque_data; + if (test_bit(TFW_HTTP_B_CLOSE_ERROR_RESPONSE, stream->xmit.resp->flags)) + ctx->error = stream; + swap(stream->xmit.skb_head, *skb_head); + sock_set_flag(((TfwConn *)conn)->sk, SOCK_TEMPESTA_HAS_DATA); + if (!stream->xmit.is_blocked) + tfw_h2_sched_activate_stream(&ctx->sched, stream); - return r; + return 0; } /** * Frame response generated locally. */ int -tfw_h2_frame_local_resp(TfwHttpResp *resp, unsigned int stream_id, - unsigned long h_len, const TfwStr *body) +tfw_h2_frame_local_resp(TfwHttpResp *resp, unsigned long h_len, + const TfwStr *body) { unsigned long b_len = body ? body->len : 0; int r; - r = tfw_h2_stream_init_for_xmit(resp->req, h_len, b_len); + r = tfw_h2_append_predefined_body(resp, body); if (unlikely(r)) return r; - return tfw_h2_append_predefined_body(resp, stream_id, body); + return tfw_h2_stream_init_for_xmit(resp, HTTP2_RELEASE_RESPONSE, + h_len, b_len); } static void @@ -4997,6 +4993,8 @@ tfw_h2_error_resp(TfwHttpReq *req, int status, bool reply, ErrorType type, { TfwStream *stream; TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); + bool close_after_send = (type == TFW_ERROR_TYPE_ATTACK || + type == TFW_ERROR_TYPE_BAD); /* * block_action attack/error drop - Tempesta FW must block message @@ -5036,9 +5034,8 @@ tfw_h2_error_resp(TfwHttpReq *req, int status, bool reply, ErrorType type, * and GOAWAY frame should be sent (RFC 7540 section 6.8) after * error response. */ - tfw_h2_send_err_resp(req, status, stream->id); - if (type == TFW_ERROR_TYPE_ATTACK - || type == TFW_ERROR_TYPE_BAD) { + tfw_h2_send_err_resp(req, status, close_after_send); + if (close_after_send) { tfw_h2_conn_terminate_close(ctx, err_code, !on_req_recv_event, type == TFW_ERROR_TYPE_ATTACK); } else { @@ -5051,8 +5048,7 @@ tfw_h2_error_resp(TfwHttpReq *req, int status, bool reply, ErrorType type, goto out; skip_stream: - if (type == TFW_ERROR_TYPE_ATTACK - || type == TFW_ERROR_TYPE_BAD) { + if (close_after_send) { tfw_h2_conn_terminate_close(ctx, err_code, !on_req_recv_event, type == TFW_ERROR_TYPE_ATTACK); } @@ -5266,13 +5262,11 @@ __tfw_h2_resp_cleanup(TfwHttpRespCleanup *cleanup) * Major browsers and curl ignore that RFC requirement an work well. 
But * that is definitely an RFC violation and implementation specific behaviour. */ -static void -tfw_h2_resp_adjust_fwd(TfwHttpResp *resp) +int +tfw_h2_resp_encode_headers(TfwHttpResp *resp) { int r; - unsigned int stream_id; TfwHttpReq *req = resp->req; - TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); TfwHttpTransIter *mit = &resp->mit; TfwHttpRespCleanup cleanup = {}; TfwStr codings = {.data = *this_cpu_ptr(&g_te_buf), .len = 0}; @@ -5280,10 +5274,6 @@ tfw_h2_resp_adjust_fwd(TfwHttpResp *resp) req->vhost, TFW_VHOST_HDRMOD_RESP); - stream_id = tfw_h2_req_stream_id(req); - if (unlikely(!stream_id)) - goto out; - /* * Accordingly to RFC 9113 8.2.2 connection-specific headers can't * be used in HTTP/2. @@ -5315,7 +5305,8 @@ tfw_h2_resp_adjust_fwd(TfwHttpResp *resp) WARN_ON_ONCE(mit->acc_len); tfw_h2_msg_transform_setup(mit, resp->msg.skb_head, true); - if (tfw_h2_msg_cutoff_headers(resp, &cleanup)) + r = tfw_h2_msg_cutoff_headers(resp, &cleanup); + if (unlikely(r)) goto clean; /* @@ -5326,12 +5317,11 @@ tfw_h2_resp_adjust_fwd(TfwHttpResp *resp) if (unlikely(r)) goto clean; - r = tfw_h2_resp_status_write(resp, resp->status, true, false, - stream_id); + r = tfw_h2_resp_status_write(resp, resp->status, true, false); if (unlikely(r)) goto clean; - r = tfw_h2_hpack_encode_headers(resp, h_mods, stream_id); + r = tfw_h2_hpack_encode_headers(resp, h_mods); if (unlikely(r)) goto clean; @@ -5341,42 +5331,38 @@ tfw_h2_resp_adjust_fwd(TfwHttpResp *resp) * processed above and which have non-empty value (i.e. configured * not for deletion). */ - r = tfw_http_sess_resp_process(resp, false, stream_id); + r = tfw_http_sess_resp_process(resp, false); if (unlikely(r)) goto clean; - r = tfw_h2_add_hdr_via(resp, stream_id); + r = tfw_h2_add_hdr_via(resp); if (unlikely(r)) goto clean; if (!test_bit(TFW_HTTP_B_HDR_DATE, resp->flags)) { - r = tfw_h2_add_hdr_date(resp, false, stream_id); + r = tfw_h2_add_hdr_date(resp, false); if (unlikely(r)) goto clean; } if (test_bit(TFW_HTTP_B_CHUNKED, resp->flags)) { - if (unlikely(tfw_h2_add_hdr_clen(resp, stream_id))) + if (unlikely(tfw_h2_add_hdr_clen(resp))) goto clean; } if (test_bit(TFW_HTTP_B_TE_EXTRA, resp->flags)) { - r = tfw_h2_add_hdr_cenc(resp, &codings, stream_id); + r = tfw_h2_add_hdr_cenc(resp, &codings); if (unlikely(r)) goto clean; TFW_STR_INIT(&codings); } - r = TFW_H2_MSG_HDR_ADD(resp, "server", TFW_SERVER, 54, stream_id); + r = TFW_H2_MSG_HDR_ADD(resp, "server", TFW_SERVER, 54); if (unlikely(r)) goto clean; - r = tfw_h2_resp_add_loc_hdrs(resp, h_mods, false, stream_id); - if (unlikely(r)) - goto clean; - - r = tfw_h2_frame_fwd_resp(resp, stream_id, mit->acc_len); + r = tfw_h2_resp_add_loc_hdrs(resp, h_mods, false); if (unlikely(r)) goto clean; @@ -5384,26 +5370,31 @@ tfw_h2_resp_adjust_fwd(TfwHttpResp *resp) req, resp); SS_SKB_QUEUE_DUMP(&resp->msg.skb_head); - tfw_h2_req_unlink_stream(req); - tfw_h2_resp_fwd(resp); - __tfw_h2_resp_cleanup(&cleanup); + return 0; - return; clean: __tfw_h2_resp_cleanup(&cleanup); - tfw_http_conn_msg_free((TfwHttpMsg *)resp); - if (!(tfw_blk_flags & TFW_BLK_ERR_NOLOG)) - T_WARN_ADDR_STATUS("response dropped: processing error", - &req->conn->peer->addr, - TFW_NO_PORT, 500); - tfw_h2_send_err_resp(req, 500, stream_id); - tfw_hpack_enc_release(&ctx->hpack, resp->flags); - TFW_INC_STAT_BH(serv.msgs_otherr); + return r; +} - return; -out: - tfw_http_resp_pair_free(req); +static void +tfw_h2_resp_adjust_fwd(TfwHttpResp *resp) +{ + TfwHttpReq *req = resp->req; + int r; + + /* + * This function can be failed only if stream is + * 
already closed and deleted. + */ + r = tfw_h2_stream_init_for_xmit(resp, HTTP2_ENCODE_HEADERS, 0, 0); + if (unlikely(r)) { + tfw_http_resp_pair_free(req); + } else { + tfw_h2_req_unlink_stream(req); + tfw_h2_resp_fwd(resp); + } } /** diff --git a/fw/http.h b/fw/http.h index 9cc5a872ad..53b4402e71 100644 --- a/fw/http.h +++ b/fw/http.h @@ -741,13 +741,13 @@ int tfw_http_expand_hbh(TfwHttpResp *resp, unsigned short status); int tfw_http_expand_hdr_via(TfwHttpResp *resp); void tfw_h2_resp_fwd(TfwHttpResp *resp); int tfw_h2_hdr_map(TfwHttpResp *resp, const TfwStr *hdr, unsigned int id); -int tfw_h2_add_hdr_date(TfwHttpResp *resp, bool cache, unsigned int stream_id); -int tfw_h2_set_stale_warn(TfwHttpResp *resp, unsigned int stream_id); +int tfw_h2_add_hdr_date(TfwHttpResp *resp, bool cache); +int tfw_h2_set_stale_warn(TfwHttpResp *resp); int tfw_h2_resp_add_loc_hdrs(TfwHttpResp *resp, const TfwHdrMods *h_mods, - bool cache, unsigned int stream_id); + bool cache); int tfw_h2_resp_status_write(TfwHttpResp *resp, unsigned short status, - bool use_pool, bool cache, - unsigned int stream_id); + bool use_pool, bool cache); +int tfw_h2_resp_encode_headers(TfwHttpResp *resp); /* * Functions to send an HTTP error response to a client. */ @@ -756,6 +756,7 @@ int tfw_http_prep_redir(TfwHttpResp *resp, unsigned short status, int tfw_http_prep_304(TfwHttpReq *req, struct sk_buff **skb_head, TfwMsgIter *it); void tfw_http_conn_msg_free(TfwHttpMsg *hm); +void tfw_http_resp_pair_free_and_put_conn(void *opaque_data); void tfw_http_send_err_resp(TfwHttpReq *req, int status, const char *reason); /* Helper functions */ @@ -764,12 +765,13 @@ unsigned long tfw_http_hdr_split(TfwStr *hdr, TfwStr *name_out, TfwStr *val_out, bool inplace); unsigned long tfw_h2_hdr_size(unsigned long n_len, unsigned long v_len, unsigned short st_index); -int tfw_h2_frame_local_resp(TfwHttpResp *resp, unsigned int stream_id, - unsigned long h_len, const TfwStr *body); +int tfw_h2_frame_local_resp(TfwHttpResp *resp, unsigned long h_len, + const TfwStr *body); int tfw_http_resp_copy_encodings(TfwHttpResp *resp, TfwStr* dst, size_t max_len); void tfw_http_extract_request_authority(TfwHttpReq *req); bool tfw_http_mark_is_in_whitlist(unsigned int mark); char *tfw_http_resp_status_line(int status, size_t *len); +int tfw_http_on_send_resp(void *conn, struct sk_buff **skb_head); #endif /* __TFW_HTTP_H__ */ diff --git a/fw/http2.c b/fw/http2.c new file mode 100644 index 0000000000..8188b36b21 --- /dev/null +++ b/fw/http2.c @@ -0,0 +1,556 @@ +/** + * Tempesta FW + * + * Copyright (C) 2024 Tempesta Technologies, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ +#undef DEBUG +#if DBG_HTTP2 > 0 +#define DEBUG DBG_HTTP2 +#endif + +#include "connection.h" +#include "http.h" +#include "http2.h" +#include "http_frame.h" +#include "http_msg.h" + +#define TFW_MAX_CLOSED_STREAMS 5 + +/** + * Usually a client sends a SETTINGS frame to a server first, so: + * - we don't have many streams to iterate over in this function + * (usually we have no streams at all). + * - typically only one SETTINGS_INITIAL_WINDOW_SIZE + * frame is sent from the client side. + */ +static void +tfw_h2_apply_wnd_sz_change(TfwH2Ctx *ctx, long int delta) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + TfwStream *stream, *next; + + /* + * Order doesn't matter, use the default function from the Linux kernel. + * According to RFC 9113 6.9.2: + * When the value of SETTINGS_INITIAL_WINDOW_SIZE changes, a receiver + * MUST adjust the size of all stream flow-control windows that it + * maintains by the difference between the new value and the old value. + * A change to SETTINGS_INITIAL_WINDOW_SIZE can cause the available + * space in a flow-control window to become negative. + */ + rbtree_postorder_for_each_entry_safe(stream, next, + &ctx->sched.streams, node) { + TfwStreamState state = tfw_h2_get_stream_state(stream); + if (state == HTTP2_STREAM_OPENED || + state == HTTP2_STREAM_REM_HALF_CLOSED) { + stream->rem_wnd += delta; + tfw_h2_stream_try_unblock(&ctx->sched, stream); + if (stream->rem_wnd > 0) { + sock_set_flag(((TfwConn *)conn)->sk, + SOCK_TEMPESTA_HAS_DATA); + } + } + } +} + +static void +tfw_h2_apply_settings_entry(TfwH2Ctx *ctx, unsigned short id, + unsigned int val) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + TfwSettings *dest = &ctx->rsettings; + long int delta; + + switch (id) { + case HTTP2_SETTINGS_TABLE_SIZE: + assert_spin_locked(&((TfwConn *)conn)->sk->sk_lock.slock); + dest->hdr_tbl_sz = min_t(unsigned int, + val, HPACK_ENC_TABLE_MAX_SIZE); + tfw_hpack_set_rbuf_size(&ctx->hpack.enc_tbl, dest->hdr_tbl_sz); + break; + + case HTTP2_SETTINGS_ENABLE_PUSH: + BUG_ON(val > 1); + dest->push = val; + break; + + case HTTP2_SETTINGS_MAX_STREAMS: + dest->max_streams = val; + break; + + case HTTP2_SETTINGS_INIT_WND_SIZE: + BUG_ON(val > MAX_WND_SIZE); + delta = (long int)val - (long int)dest->wnd_sz; + tfw_h2_apply_wnd_sz_change(ctx, delta); + dest->wnd_sz = val; + break; + + case HTTP2_SETTINGS_MAX_FRAME_SIZE: + BUG_ON(val < FRAME_DEF_LENGTH || val > FRAME_MAX_LENGTH); + dest->max_frame_sz = val; + break; + + case HTTP2_SETTINGS_MAX_HDR_LIST_SIZE: + dest->max_lhdr_sz = val; + break; + + default: + /* + * We should silently ignore unknown identifiers (see + * RFC 9113 section 6.5.2) + */ + break; + } +} + +int +tfw_h2_check_settings_entry(TfwH2Ctx *ctx, unsigned short id, unsigned int val) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + + assert_spin_locked(&((TfwConn *)conn)->sk->sk_lock.slock); + + switch (id) { + case HTTP2_SETTINGS_TABLE_SIZE: + break; + + case HTTP2_SETTINGS_ENABLE_PUSH: + if (val > 1) + return -EINVAL; + break; + + case HTTP2_SETTINGS_MAX_STREAMS: + break; + + case HTTP2_SETTINGS_INIT_WND_SIZE: + if (val > MAX_WND_SIZE) + return -EINVAL; + break; + + case HTTP2_SETTINGS_MAX_FRAME_SIZE: + if (val < FRAME_DEF_LENGTH || val > FRAME_MAX_LENGTH) + return -EINVAL; + break; + + case HTTP2_SETTINGS_MAX_HDR_LIST_SIZE: + break; + + default: + /* + * We should silently ignore unknown identifiers (see + * RFC 9113 section 6.5.2) + */ + break; + } + + return 0; +} + +void +tfw_h2_save_settings_entry(TfwH2Ctx *ctx, unsigned short id,
unsigned int val) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + + assert_spin_locked(&((TfwConn *)conn)->sk->sk_lock.slock); + + if (id > 0 && id < _HTTP2_SETTINGS_MAX) { + ctx->new_settings[id - 1] = val; + __set_bit(id, ctx->settings_to_apply); + __set_bit(HTTP2_SETTINGS_NEED_TO_APPLY, + ctx->settings_to_apply); + } +} + +void +tfw_h2_apply_new_settings(TfwH2Ctx *ctx) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + unsigned int id; + + assert_spin_locked(&((TfwConn *)conn)->sk->sk_lock.slock); + + for (id = HTTP2_SETTINGS_TABLE_SIZE; id < _HTTP2_SETTINGS_MAX; id++) { + if (test_bit(id, ctx->settings_to_apply)) { + unsigned int val = ctx->new_settings[id - 1]; + tfw_h2_apply_settings_entry(ctx, id, val); + } + } + clear_bit(HTTP2_SETTINGS_NEED_TO_APPLY, ctx->settings_to_apply); +} + +int +tfw_h2_init(void) +{ + return tfw_h2_stream_cache_create(); +} + +void +tfw_h2_cleanup(void) +{ + tfw_h2_stream_cache_destroy(); +} + +int +tfw_h2_context_init(TfwH2Ctx *ctx) +{ + TfwStreamQueue *closed_streams = &ctx->closed_streams; + TfwStreamQueue *idle_streams = &ctx->idle_streams; + TfwSettings *lset = &ctx->lsettings; + TfwSettings *rset = &ctx->rsettings; + + bzero_fast(ctx, sizeof(*ctx)); + + ctx->state = HTTP2_RECV_CLI_START_SEQ; + ctx->loc_wnd = DEF_WND_SIZE; + ctx->rem_wnd = DEF_WND_SIZE; + + spin_lock_init(&ctx->lock); + INIT_LIST_HEAD(&closed_streams->list); + INIT_LIST_HEAD(&idle_streams->list); + + tfw_h2_init_stream_sched(&ctx->sched); + + lset->hdr_tbl_sz = rset->hdr_tbl_sz = HPACK_TABLE_DEF_SIZE; + lset->push = rset->push = 1; + lset->max_streams = tfw_cli_max_concurrent_streams; + rset->max_streams = 0xffffffff; + lset->max_frame_sz = rset->max_frame_sz = FRAME_DEF_LENGTH; + lset->max_lhdr_sz = max_header_list_size ? + max_header_list_size : UINT_MAX; + rset->max_lhdr_sz = UINT_MAX; + + lset->wnd_sz = DEF_WND_SIZE; + rset->wnd_sz = DEF_WND_SIZE; + + return tfw_hpack_init(&ctx->hpack, HPACK_TABLE_DEF_SIZE); +} +ALLOW_ERROR_INJECTION(tfw_h2_context_init, ERRNO); + +void +tfw_h2_context_clear(TfwH2Ctx *ctx) +{ + WARN_ON_ONCE(ctx->streams_num); + /* + * Free POSTPONED SKBs. This is necessary when h2 context has + * postponed frames and connection closing initiated. + */ + ss_skb_queue_purge(&ctx->skb_head); + tfw_hpack_clean(&ctx->hpack); +} + +void +tfw_h2_conn_terminate_close(TfwH2Ctx *ctx, TfwH2Err err_code, bool close, + bool attack) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + + if (tfw_h2_send_goaway(ctx, err_code, attack) && close) { + if (attack) + tfw_connection_close((TfwConn *)conn, true); + else + tfw_connection_shutdown((TfwConn *)conn, true); + } +} + +/** + * According to RFC 9113 section 5.1.1: + * The first use of a new stream identifier implicitly closes all + * streams in the "idle" state that might have been initiated by that + * peer with a lower-valued stream identifier. + */ +void +tfw_h2_remove_idle_streams(TfwH2Ctx *ctx, unsigned int id) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + TfwStream *stream, *tmp; + + /* + * We add and remove streams from idle queue under + * socket lock. 
+ */ + assert_spin_locked(&((TfwConn *)conn)->sk->sk_lock.slock); + + list_for_each_entry_safe_reverse(stream, tmp, &ctx->idle_streams.list, + hcl_node) + { + if (id <= stream->id) + break; + + tfw_h2_stream_del_from_queue_nolock(stream); + tfw_h2_set_stream_state(stream, HTTP2_STREAM_CLOSED); + tfw_h2_stream_add_closed(ctx, stream); + } +} + +void +tfw_h2_conn_streams_cleanup(TfwH2Ctx *ctx) +{ + TfwStream *cur, *next; + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + TfwStreamSched *sched = &ctx->sched; + + WARN_ON_ONCE(((TfwConn *)conn)->stream.msg); + + T_DBG3("%s: ctx [%p] conn %p sched %p\n", __func__, ctx, conn, sched); + + tfw_h2_remove_idle_streams(ctx, UINT_MAX); + + rbtree_postorder_for_each_entry_safe(cur, next, &sched->streams, node) { + tfw_h2_stream_purge_all_and_free_response(cur); + tfw_h2_stream_unlink_lock(ctx, cur); + + /* The streams tree is about to be destroyed and + * we don't want to trigger rebalancing. + * No further actions regarding streams dependencies/prio + * is required at this stage. + */ + tfw_h2_delete_stream(cur); + --ctx->streams_num; + } + sched->streams = RB_ROOT; +} + +void +tfw_h2_current_stream_remove(TfwH2Ctx *ctx) +{ + T_DBG3("%s: ctx [%p] ctx->cur_stream %p\n", __func__, + ctx, ctx->cur_stream); + tfw_h2_stream_unlink_lock(ctx, ctx->cur_stream); + tfw_h2_stream_clean(ctx, ctx->cur_stream); + ctx->cur_stream = NULL; +} + +/* + * Clean the queue of closed streams if its size has exceeded a certain + * value. + */ +void +tfw_h2_closed_streams_shrink(TfwH2Ctx *ctx) +{ + TfwStream *cur; + TfwStreamQueue *closed_streams = &ctx->closed_streams; + + T_DBG3("%s: ctx [%p] closed streams num %lu\n", __func__, ctx, + closed_streams->num); + + while (1) { + spin_lock(&ctx->lock); + + if (closed_streams->num <= TFW_MAX_CLOSED_STREAMS) { + spin_unlock(&ctx->lock); + break; + } + + BUG_ON(list_empty(&closed_streams->list)); + cur = list_first_entry(&closed_streams->list, TfwStream, + hcl_node); + tfw_h2_stream_unlink_nolock(ctx, cur); + + spin_unlock(&ctx->lock); + + T_DBG3("%s: ctx [%p] cur stream [%p]\n", __func__, ctx, cur); + + tfw_h2_stream_clean(ctx, cur); + } +} + +void +tfw_h2_check_current_stream_is_closed(TfwH2Ctx *ctx) +{ + BUG_ON(!ctx->cur_stream); + + T_DBG3("%s: strm [%p] id %u state %d(%s), streams_num %lu\n", + __func__, ctx->cur_stream, ctx->cur_stream->id, + tfw_h2_get_stream_state(ctx->cur_stream), + __h2_strm_st_n(ctx->cur_stream), ctx->streams_num); + + if (tfw_h2_stream_is_closed(ctx->cur_stream)) + tfw_h2_current_stream_remove(ctx); +} + +TfwStream * +tfw_h2_find_not_closed_stream(TfwH2Ctx *ctx, unsigned int id, bool recv) +{ + TfwStream *stream; + + stream = tfw_h2_find_stream(&ctx->sched, id); + return stream && !tfw_h2_stream_is_closed(stream) ? stream : NULL; +} + +/* + * Get stream ID for upper layer to create frames info. + */ +unsigned int +tfw_h2_req_stream_id(TfwHttpReq *req) +{ + unsigned int id = 0; + TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); + + spin_lock(&ctx->lock); + + if (req->stream) + id = req->stream->id; + + spin_unlock(&ctx->lock); + + return id; +} + +/* + * Unlink request from corresponding stream (if linked). 
+ */ +void +tfw_h2_req_unlink_stream(TfwHttpReq *req) +{ + TfwStream *stream; + TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); + + spin_lock(&ctx->lock); + + stream = req->stream; + if (!stream) { + spin_unlock(&ctx->lock); + return; + } + + req->stream = NULL; + stream->msg = NULL; + + spin_unlock(&ctx->lock); +} + +/* + * Unlink request from corresponding stream (if linked), + * send RST STREAM and add stream to closed queue. + */ +void +tfw_h2_req_unlink_stream_with_rst(TfwHttpReq *req) +{ + TfwStreamFsmRes r; + TfwStream *stream; + TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); + + spin_lock(&ctx->lock); + + stream = req->stream; + if (!stream) { + spin_unlock(&ctx->lock); + return; + } + + req->stream = NULL; + stream->msg = NULL; + + r = tfw_h2_stream_fsm_ignore_err(ctx, stream, HTTP2_RST_STREAM, 0); + WARN_ON_ONCE(r != STREAM_FSM_RES_OK && r != STREAM_FSM_RES_IGNORE); + + tfw_h2_stream_add_to_queue_nolock(&ctx->closed_streams, stream); + + spin_unlock(&ctx->lock); +} + +int +tfw_h2_stream_xmit_prepare_resp(TfwStream *stream) +{ + TfwHttpResp *resp = stream->xmit.resp; + unsigned char tls_type; + unsigned int mark; + int r = 0; + + BUG_ON(!resp || resp->msg.skb_head || !resp->req + || !resp->req->conn || !stream->xmit.skb_head); + + tls_type = skb_tfw_tls_type(stream->xmit.skb_head); + mark = stream->xmit.skb_head->mark; + swap(resp->msg.skb_head, stream->xmit.skb_head); + + r = tfw_h2_resp_encode_headers(resp); + if (unlikely(r)) { + T_WARN("Failed to encode headers"); + goto finish; + } + + stream->xmit.h_len = resp->mit.acc_len; + stream->xmit.b_len = TFW_HTTP_RESP_CUT_BODY_SZ(resp); + if (test_bit(TFW_HTTP_B_CHUNKED, resp->flags)) + r = tfw_http_msg_cutoff_body_chunks(resp); + +finish: + swap(stream->xmit.skb_head, resp->msg.skb_head); + ss_skb_setup_head_of_list(stream->xmit.skb_head, mark, tls_type); + + return r; +} + +int +tfw_h2_entail_stream_skb(struct sock *sk, TfwH2Ctx *ctx, TfwStream *stream, + unsigned int *len, bool should_split) +{ + unsigned char tls_type = skb_tfw_tls_type(stream->xmit.skb_head); + unsigned int mark = stream->xmit.skb_head->mark; + struct sk_buff *skb, *split; + int r = 0; + + BUG_ON(!TFW_SKB_CB(stream->xmit.skb_head)->is_head); + while (*len) { + skb = ss_skb_dequeue(&stream->xmit.skb_head); + BUG_ON(!skb); + + if (unlikely(!skb->len)) { + T_DBG3("[%d]: %s: drop skb=%px data_len=%u len=%u\n", + smp_processor_id(), __func__, + skb, skb->data_len, skb->len); + kfree_skb(skb); + continue; + } + + BUG_ON(!tls_type); + BUG_ON(!skb->len); + + if (skb->len > *len) { + if (should_split) { + split = ss_skb_split(skb, *len); + if (!split) { + ss_skb_queue_head(&stream->xmit.skb_head, + skb); + r = -ENOMEM; + break; + } + + ss_skb_queue_head(&stream->xmit.skb_head, split); + } else { + ss_skb_queue_head(&stream->xmit.skb_head, skb); + break; + } + } + *len -= skb->len; + ss_skb_tcp_entail(sk, skb, mark, tls_type); + } + + /* + * We use tls_type and mark from skb_head when we entail data in + * socket write queue. So we should set tls_type and mark for the + * new skb_head. + */ + if (stream->xmit.skb_head + && !TFW_SKB_CB(stream->xmit.skb_head)->is_head) { + ss_skb_setup_head_of_list(stream->xmit.skb_head, mark, + tls_type); + } + + return r; +} diff --git a/fw/http2.h b/fw/http2.h new file mode 100644 index 0000000000..69e5a55dcc --- /dev/null +++ b/fw/http2.h @@ -0,0 +1,167 @@ +/** + * Tempesta FW + * + * Copyright (C) 2024 Tempesta Technologies, Inc. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#ifndef __HTTP2__ +#define __HTTP2__ + +#include "http_frame.h" + +/** + * Representation of SETTINGS parameters for HTTP/2 connection (RFC 7540 + * section 6.5.2). + * + * @hdr_tbl_sz - maximum size of the endpoint's header compression + * table used to decode header blocks; + * @push - enable/disable indicator for server push; + * @max_streams - maximum number of streams that the endpoint will + * allow; + * @wnd_sz - endpoint's initial window size for stream-level + * flow control; + * @max_frame_sz - size of the largest frame payload the endpoint wishes + * to receive; + * @max_lhdr_sz - maximum size of header list the endpoint is prepared + * to accept; + */ +typedef struct { + unsigned int hdr_tbl_sz; + unsigned int push; + unsigned int max_streams; + unsigned int wnd_sz; + unsigned int max_frame_sz; + unsigned int max_lhdr_sz; +} TfwSettings; + +/** + * Context for HTTP/2 frames processing. + * + * @lock - spinlock to protect stream-request linkage; + * @lsettings - local settings for HTTP/2 connection; + * @rsettings - settings for HTTP/2 connection received from the + * remote endpoint; + * @lstream_id - ID of last stream initiated by client and processed + * on the server side; + * @streams_num - number of the streams initiated by client; + * @sched - streams' priority scheduler; + * @closed_streams - queue of closed streams (in HTTP2_STREAM_CLOSED or + * HTTP2_STREAM_REM_CLOSED state), which are waiting + * for removal; + * @idle_streams - queue of idle streams (in HTTP2_STREAM_IDLE state); + * @loc_wnd - connection's current flow controlled window; + * @rem_wnd - remote peer's current flow controlled window; + * @hpack - HPACK context, used in processing of + * HEADERS/CONTINUATION frames; + * @cur_send_headers - stream for which we have already started sending + * headers, but have not yet sent the END_HEADERS flag; + * @cur_recv_headers - stream for which we have already started receiving + * headers, but have not yet received the END_HEADERS + * flag; + * @error - the stream where the error occurred; + * @new_settings - new settings to apply when the ack is pushed to the + * socket write queue; + * @settings_to_apply - bitmap to save what settings we should apply. The first + * bit is used as a fast check that we should apply new + * settings. Bits 1 - _HTTP2_SETTINGS_MAX - 1 are used + * to save which @new_settings should be applied.
bits + * from _HTTP2_SETTINGS_MAX are used to save what + * settings we sent to the client; + * @__off - offset to reinitialize processing context; + * @skb_head - collected list of processed skbs containing HTTP/2 + * frames; + * @cur_stream - found stream for the frame currently being processed; + * @priority - unpacked data from priority part of payload of + * processed HEADERS or PRIORITY frames; + * @hdr - unpacked data from header of currently processed + * frame; + * @plen - payload length of currently processed frame + * (HEADERS/CONTINUATION/DATA frames); + * @state - current FSM state of HTTP/2 processing context; + * @to_read - indicates how much data of HTTP/2 frame should + * be read on next FSM @state; + * @rlen - length of accumulated data in @rbuf + * or length of the payload read in current FSM state; + * @rbuf - buffer for data accumulation from frames headers and + * payloads (for service frames) during frames + * processing; + * @padlen - length of current frame's padding (if exists); + * @data_off - offset of app data in HEADERS, CONTINUATION and DATA + * frames (after all service payloads); + * + * NOTE: we can keep HPACK context in general connection-wide HTTP/2 context + * (instead of separate HPACK context for each stream), since frames from other + * streams cannot occur between the HEADERS/CONTINUATION frames of particular + * stream (RFC 7540, sections 6.2, 6.10, 8.1). + */ +typedef struct tfw_h2_ctx_t { + spinlock_t lock; + TfwSettings lsettings; + TfwSettings rsettings; + unsigned int lstream_id; + unsigned long streams_num; + TfwStreamSched sched; + TfwStreamQueue closed_streams; + TfwStreamQueue idle_streams; + long int loc_wnd; + long int rem_wnd; + TfwHPack hpack; + TfwStream *cur_send_headers; + TfwStream *cur_recv_headers; + TfwStream *error; + unsigned int new_settings[_HTTP2_SETTINGS_MAX - 1]; + DECLARE_BITMAP (settings_to_apply, 2 * _HTTP2_SETTINGS_MAX - 1); + char __off[0]; + struct sk_buff *skb_head; + TfwStream *cur_stream; + TfwFramePri priority; + TfwFrameHdr hdr; + unsigned int plen; + int state; + int to_read; + int rlen; + unsigned char rbuf[FRAME_HEADER_SIZE]; + unsigned char padlen; + unsigned char data_off; +} TfwH2Ctx; + +int tfw_h2_init(void); +void tfw_h2_cleanup(void); +int tfw_h2_context_init(TfwH2Ctx *ctx); +void tfw_h2_context_clear(TfwH2Ctx *ctx); +int tfw_h2_check_settings_entry(TfwH2Ctx *ctx, unsigned short id, + unsigned int val); +void tfw_h2_save_settings_entry(TfwH2Ctx *ctx, unsigned short id, + unsigned int val); +void tfw_h2_apply_new_settings(TfwH2Ctx *ctx); +void tfw_h2_conn_terminate_close(TfwH2Ctx *ctx, TfwH2Err err_code, bool close, + bool attack); +void tfw_h2_conn_streams_cleanup(TfwH2Ctx *ctx); +void tfw_h2_current_stream_remove(TfwH2Ctx *ctx); +void tfw_h2_remove_idle_streams(TfwH2Ctx *ctx, unsigned int id); +void tfw_h2_closed_streams_shrink(TfwH2Ctx *ctx); +void tfw_h2_check_current_stream_is_closed(TfwH2Ctx *ctx); +TfwStream *tfw_h2_find_not_closed_stream(TfwH2Ctx *ctx, unsigned int id, + bool recv); + +unsigned int tfw_h2_req_stream_id(TfwHttpReq *req); +void tfw_h2_req_unlink_stream(TfwHttpReq *req); +void tfw_h2_req_unlink_stream_with_rst(TfwHttpReq *req); +int tfw_h2_stream_xmit_prepare_resp(TfwStream *stream); +int tfw_h2_entail_stream_skb(struct sock *sk, TfwH2Ctx *ctx, TfwStream *stream, + unsigned int *len, bool should_split); + +#endif /* __HTTP2__ */ diff --git a/fw/http_frame.c b/fw/http_frame.c index 4cb11d4903..77e672dcbc 100644 --- a/fw/http_frame.c +++ b/fw/http_frame.c @@ -26,6 +26,7 @@ #include 
"lib/str.h" #include "procfs.h" #include "http.h" +#include "http2.h" #include "http_frame.h" #include "http_msg.h" #include "tcp.h" @@ -45,34 +46,6 @@ #define STREAM_ID_SIZE 4 #define ERR_CODE_SIZE 4 -#define MAX_WND_SIZE ((1U << 31) - 1) -#define DEF_WND_SIZE ((1U << 16) - 1) - -#define TFW_MAX_CLOSED_STREAMS 5 - -/** - * FSM states for HTTP/2 frames processing. - */ -typedef enum { - HTTP2_RECV_FRAME_HEADER, - HTTP2_RECV_CLI_START_SEQ, - HTTP2_RECV_FIRST_SETTINGS, - HTTP2_RECV_FRAME_PRIORITY, - HTTP2_RECV_FRAME_WND_UPDATE, - HTTP2_RECV_FRAME_PING, - HTTP2_RECV_FRAME_RST_STREAM, - HTTP2_RECV_FRAME_SETTINGS, - HTTP2_RECV_FRAME_GOAWAY, - HTTP2_RECV_FRAME_PADDED, - HTTP2_RECV_HEADER_PRI, - HTTP2_IGNORE_FRAME_DATA, - __HTTP2_RECV_FRAME_APP, - HTTP2_RECV_HEADER = __HTTP2_RECV_FRAME_APP, - HTTP2_RECV_CONT, - HTTP2_RECV_DATA, - HTTP2_RECV_APP_DATA_POST -} TfwFrameState; - typedef enum { TFW_FRAME_DEFAULT, TFW_FRAME_SHUTDOWN, @@ -162,13 +135,9 @@ do { \ #define SET_TO_READ_VERIFY(ctx, next_state) \ do { \ - typeof(next_state) state = (!ctx->cur_stream || \ - tfw_h2_get_stream_state(ctx->cur_stream) < \ - HTTP2_STREAM_LOC_CLOSED) ? \ - next_state : HTTP2_IGNORE_FRAME_DATA; \ if ((ctx)->hdr.length) { \ SET_TO_READ(ctx); \ - (ctx)->state = state; \ + (ctx)->state = next_state; \ } else { \ (ctx)->state = HTTP2_IGNORE_FRAME_DATA; \ } \ @@ -194,77 +163,102 @@ do { \ tfw_h2_conn_terminate((ctx), err); \ return T_BAD; \ } else if (res == STREAM_FSM_RES_TERM_STREAM) { \ - return tfw_h2_stream_close((ctx), \ - (hdr)->stream_id, \ - &(ctx)->cur_stream, \ - err); \ + ctx->cur_stream = NULL; \ + return tfw_h2_send_rst_stream((ctx), \ + (hdr)->stream_id, \ + err); \ } \ return T_OK; \ } \ }) -int -tfw_h2_init(void) +static inline void +tfw_h2_unpack_priority(TfwFramePri *pri, const unsigned char *buf) { - return tfw_h2_stream_cache_create(); + pri->stream_id = ntohl(*(unsigned int *)buf) & FRAME_STREAM_ID_MASK; + pri->exclusive = (buf[0] & 0x80) > 0; + pri->weight = buf[4] + 1; } -void -tfw_h2_cleanup(void) -{ - tfw_h2_stream_cache_destroy(); -} +/** + * The flags indicate that an appropriate SETTINGS parameter is waited for an + * update. + */ +static const unsigned char +ctx_new_settings_flags[] = { + [HTTP2_SETTINGS_TABLE_SIZE] = 0x01, + [HTTP2_SETTINGS_ENABLE_PUSH] = 0x02, + [HTTP2_SETTINGS_MAX_STREAMS] = 0x04, + [HTTP2_SETTINGS_INIT_WND_SIZE] = 0x08, + [HTTP2_SETTINGS_MAX_FRAME_SIZE] = 0x10, + [HTTP2_SETTINGS_MAX_HDR_LIST_SIZE] = 0x20 +}; -int -tfw_h2_context_init(TfwH2Ctx *ctx) +static void +tfw_h2_on_tcp_entail_ack(void *conn, struct sk_buff *skb_head) { - TfwStreamQueue *closed_streams = &ctx->closed_streams; - TfwSettings *lset = &ctx->lsettings; - TfwSettings *rset = &ctx->rsettings; - - bzero_fast(ctx, sizeof(*ctx)); + TfwH2Ctx *ctx = tfw_h2_context_unsafe((TfwConn *)conn); - ctx->state = HTTP2_RECV_CLI_START_SEQ; - ctx->loc_wnd = DEF_WND_SIZE; - ctx->rem_wnd = DEF_WND_SIZE; - - spin_lock_init(&ctx->lock); - INIT_LIST_HEAD(&closed_streams->list); + if (test_bit(HTTP2_SETTINGS_NEED_TO_APPLY, ctx->settings_to_apply)) + tfw_h2_apply_new_settings(ctx); +} - lset->hdr_tbl_sz = rset->hdr_tbl_sz = HPACK_TABLE_DEF_SIZE; - lset->push = rset->push = 1; - lset->max_streams = tfw_cli_max_concurrent_streams; - rset->max_streams = 0xffffffff; - lset->max_frame_sz = rset->max_frame_sz = FRAME_DEF_LENGTH; - lset->max_lhdr_sz = max_header_list_size ? 
- max_header_list_size : UINT_MAX; - rset->max_lhdr_sz = UINT_MAX; +static int +tfw_h2_on_send_goaway(void *conn, struct sk_buff **skb_head) +{ + TfwH2Ctx *ctx = tfw_h2_context_unsafe((TfwConn *)conn); - lset->wnd_sz = DEF_WND_SIZE; - rset->wnd_sz = DEF_WND_SIZE; + if (ctx->error && ctx->error->xmit.skb_head) { + ss_skb_queue_splice(&ctx->error->xmit.skb_head, skb_head); + } else if (ctx->cur_send_headers) { + /* + * Other frames (from any stream) MUST NOT occur between + * the HEADERS frame and any CONTINUATION frames that might + * follow. Send goaway later. + */ + ctx->error = ctx->cur_send_headers; + ss_skb_queue_splice(&ctx->error->xmit.skb_head, skb_head); + } - return tfw_hpack_init(&ctx->hpack, HPACK_TABLE_DEF_SIZE); + return 0; } -ALLOW_ERROR_INJECTION(tfw_h2_context_init, ERRNO); -void -tfw_h2_context_clear(TfwH2Ctx *ctx) +static int +tfw_h2_on_send_rst_stream(void *conn, struct sk_buff **skb_head) { - WARN_ON_ONCE(ctx->streams_num); + TfwH2Ctx *ctx = tfw_h2_context_unsafe((TfwConn *)conn); + unsigned int stream_id = TFW_SKB_CB(*skb_head)->stream_id; + TfwStream *stream; + + stream = tfw_h2_find_not_closed_stream(ctx, stream_id, false); + /* - * Free POSTPONED SKBs. This is necessary when h2 context has - * postponed frames and connection closing initiated. + * Send RST STREAM after all pending data otherwise directly push it + * to socket write queue. + * Stream can not exist in case when we send RST stream because a + * remote peer exceeded max_concurrent_streams limit. */ - ss_skb_queue_purge(&ctx->skb_head); - tfw_hpack_clean(&ctx->hpack); + if (stream && stream->xmit.skb_head) { + ss_skb_queue_splice(&stream->xmit.skb_head, skb_head); + } else if (ctx->cur_send_headers) { + ss_skb_queue_splice(&ctx->cur_send_headers->xmit.postponed, + skb_head); + } + + return 0; } -static inline void -tfw_h2_unpack_priority(TfwFramePri *pri, const unsigned char *buf) +static int +tfw_h2_on_send_dflt(void *conn, struct sk_buff **skb_head) { - pri->stream_id = ntohl(*(unsigned int *)buf) & FRAME_STREAM_ID_MASK; - pri->exclusive = (buf[0] & 0x80) > 0; - pri->weight = buf[4] + 1; + TfwH2Ctx *ctx = tfw_h2_context_unsafe((TfwConn *)conn); + + if (ctx->cur_send_headers) { + ss_skb_queue_splice(&ctx->cur_send_headers->xmit.postponed, + skb_head); + } + + return 0; } /** @@ -316,12 +310,18 @@ __tfw_h2_send_frame(TfwH2Ctx *ctx, TfwFrameHdr *hdr, TfwStr *data, break; } - if (hdr->flags == HTTP2_F_ACK && - (ctx->new_settings.flags & SS_F_HTTP2_ACK_FOR_HPACK_TBL_RESIZING)) - { - skb_set_tfw_flags(it.skb, SS_F_HTTP2_ACK_FOR_HPACK_TBL_RESIZING); - skb_set_tfw_cb(it.skb, ctx->new_settings.hdr_tbl_sz); - ctx->new_settings.flags &= ~SS_F_HTTP2_ACK_FOR_HPACK_TBL_RESIZING; + if (hdr->type == HTTP2_GOAWAY) { + TFW_SKB_CB(msg.skb_head)->on_send = tfw_h2_on_send_goaway; + } else if (hdr->type == HTTP2_RST_STREAM) { + TFW_SKB_CB(msg.skb_head)->on_send = tfw_h2_on_send_rst_stream; + TFW_SKB_CB(msg.skb_head)->stream_id = hdr->stream_id; + } else { + TFW_SKB_CB(msg.skb_head)->on_send = tfw_h2_on_send_dflt; + } + + if (hdr->type == HTTP2_SETTINGS && hdr->flags == HTTP2_F_ACK) { + TFW_SKB_CB(msg.skb_head)->on_tcp_entail = + tfw_h2_on_tcp_entail_ack; } if ((r = tfw_connection_send((TfwConn *)conn, &msg))) @@ -444,12 +444,14 @@ tfw_h2_send_settings_init(TfwH2Ctx *ctx) field[0].key = htons(HTTP2_SETTINGS_TABLE_SIZE); field[0].value = htonl(HPACK_ENC_TABLE_MAX_SIZE); - ctx->sent_settings[HTTP2_SETTINGS_TABLE_SIZE] = true; + __set_bit(_HTTP2_SETTINGS_MAX - 1 + HTTP2_SETTINGS_TABLE_SIZE, + ctx->settings_to_apply); 
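/*
 * Illustrative sketch (hypothetical helpers, not introduced by this patch),
 * restating the bit layout behind the __set_bit() calls in this hunk:
 * bit 0 is HTTP2_SETTINGS_NEED_TO_APPLY; bits 1 .. _HTTP2_SETTINGS_MAX - 1
 * mark which @new_settings entries, received from the peer, must be applied
 * once our SETTINGS ACK is entailed into the socket write queue
 * (tfw_h2_on_tcp_entail_ack); bit _HTTP2_SETTINGS_MAX - 1 + id marks that we
 * sent setting @id and apply it when the peer's ACK arrives
 * (tfw_h2_settings_ack_process).
 */
static inline void
tfw_h2_mark_setting_sent(TfwH2Ctx *ctx, unsigned short id)
{
	__set_bit(_HTTP2_SETTINGS_MAX - 1 + id, ctx->settings_to_apply);
}

static inline bool
tfw_h2_setting_was_sent(TfwH2Ctx *ctx, unsigned short id)
{
	return test_bit(_HTTP2_SETTINGS_MAX - 1 + id, ctx->settings_to_apply);
}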
BUILD_BUG_ON(SETTINGS_VAL_SIZE != sizeof(ctx->lsettings.wnd_sz)); field[1].key = htons(HTTP2_SETTINGS_INIT_WND_SIZE); field[1].value = htonl(ctx->lsettings.wnd_sz); - ctx->sent_settings[HTTP2_SETTINGS_INIT_WND_SIZE] = true; + __set_bit(_HTTP2_SETTINGS_MAX -1 + HTTP2_SETTINGS_INIT_WND_SIZE, + ctx->settings_to_apply); field[2].key = htons(HTTP2_SETTINGS_MAX_STREAMS); field[2].value = htonl(ctx->lsettings.max_streams); @@ -459,7 +461,9 @@ tfw_h2_send_settings_init(TfwH2Ctx *ctx) htons(HTTP2_SETTINGS_MAX_HDR_LIST_SIZE); field[required_fields].value = htonl(ctx->lsettings.max_lhdr_sz); - ctx->sent_settings[HTTP2_SETTINGS_MAX_HDR_LIST_SIZE] = true; + __set_bit(_HTTP2_SETTINGS_MAX - 1 + + HTTP2_SETTINGS_MAX_HDR_LIST_SIZE, + ctx->settings_to_apply); data.chunks[1].len += sizeof(field[0]); hdr.length += sizeof(field[0]); } @@ -481,7 +485,7 @@ tfw_h2_send_settings_ack(TfwH2Ctx *ctx) return tfw_h2_send_frame(ctx, &hdr, &data); } -static inline int +int tfw_h2_send_goaway(TfwH2Ctx *ctx, TfwH2Err err_code, bool attack) { unsigned char id_buf[STREAM_ID_SIZE]; @@ -539,20 +543,6 @@ tfw_h2_send_rst_stream(TfwH2Ctx *ctx, unsigned int id, TfwH2Err err_code) return tfw_h2_send_frame(ctx, &hdr, &data); } -void -tfw_h2_conn_terminate_close(TfwH2Ctx *ctx, TfwH2Err err_code, bool close, - bool attack) -{ - TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); - - if (tfw_h2_send_goaway(ctx, err_code, attack) && close) { - if (attack) - tfw_connection_close((TfwConn *)conn, true); - else - tfw_connection_shutdown((TfwConn *)conn, true); - } -} - static inline void tfw_h2_conn_terminate(TfwH2Ctx *ctx, TfwH2Err err_code) { @@ -609,160 +599,6 @@ tfw_h2_headers_pri_process(TfwH2Ctx *ctx) return T_OK; } -static inline void -tfw_h2_current_stream_remove(TfwH2Ctx *ctx) -{ - T_DBG3("%s: ctx [%p] ctx->cur_stream %p\n", __func__, ctx, ctx->cur_stream); - tfw_h2_stream_unlink_lock(ctx, ctx->cur_stream); - tfw_h2_stream_clean(ctx, ctx->cur_stream); - ctx->cur_stream = NULL; -} - -void -tfw_h2_conn_streams_cleanup(TfwH2Ctx *ctx) -{ - TfwStream *cur, *next; - TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); - TfwStreamSched *sched = &ctx->sched; - - WARN_ON_ONCE(((TfwConn *)conn)->stream.msg); - - T_DBG3("%s: ctx [%p] conn %p sched %p\n", __func__, ctx, conn, sched); - - rbtree_postorder_for_each_entry_safe(cur, next, &sched->streams, node) { - tfw_h2_stream_unlink_lock(ctx, cur); - - /* The streams tree is about to be destroyed and - * we don't want to trigger rebalancing. - * No further actions regarding streams dependencies/prio - * is required at this stage. - */ - tfw_h2_delete_stream(cur); - --ctx->streams_num; - } - sched->streams = RB_ROOT; -} - -/* - * Get stream ID for upper layer to create frames info. - */ -unsigned int -tfw_h2_req_stream_id(TfwHttpReq *req) -{ - unsigned int id = 0; - TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); - - spin_lock(&ctx->lock); - - if (req->stream) - id = req->stream->id; - - spin_unlock(&ctx->lock); - - return id; -} - -/* - * Unlink request from corresponding stream (if linked). - */ -void -tfw_h2_req_unlink_stream(TfwHttpReq *req) -{ - TfwStream *stream; - TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); - - spin_lock(&ctx->lock); - - stream = req->stream; - if (!stream) { - spin_unlock(&ctx->lock); - return; - } - - req->stream = NULL; - stream->msg = NULL; - - spin_unlock(&ctx->lock); -} - -/* - * Unlink request from corresponding stream (if linked), - * send RST STREAM and add stream to closed queue. 
- */ -void -tfw_h2_req_unlink_stream_with_rst(TfwHttpReq *req) -{ - TfwStreamFsmRes r; - TfwStream *stream; - TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); - - spin_lock(&ctx->lock); - - stream = req->stream; - if (!stream) { - spin_unlock(&ctx->lock); - return; - } - - req->stream = NULL; - stream->msg = NULL; - - r = tfw_h2_stream_fsm_ignore_err(ctx, stream, HTTP2_RST_STREAM, 0); - WARN_ON_ONCE(r != STREAM_FSM_RES_OK && r != STREAM_FSM_RES_IGNORE); - - tfw_h2_stream_add_to_queue_nolock(&ctx->closed_streams, stream); - - spin_unlock(&ctx->lock); -} - -/* - * Clean the queue of closed streams if its size has exceeded a certain - * value. - */ -static void -tfw_h2_closed_streams_shrink(TfwH2Ctx *ctx) -{ - TfwStream *cur; - TfwStreamQueue *closed_streams = &ctx->closed_streams; - - T_DBG3("%s: ctx [%p] closed streams num %lu\n", __func__, ctx, - closed_streams->num); - - while (1) { - spin_lock(&ctx->lock); - - if (closed_streams->num <= TFW_MAX_CLOSED_STREAMS) { - spin_unlock(&ctx->lock); - break; - } - - BUG_ON(list_empty(&closed_streams->list)); - cur = list_first_entry(&closed_streams->list, TfwStream, - hcl_node); - tfw_h2_stream_unlink_nolock(ctx, cur); - - spin_unlock(&ctx->lock); - - T_DBG3("%s: ctx [%p] cur stream [%p]\n", __func__, ctx, cur); - - tfw_h2_stream_clean(ctx, cur); - } -} - -static inline void -tfw_h2_check_closed_stream(TfwH2Ctx *ctx) -{ - BUG_ON(!ctx->cur_stream); - - T_DBG3("%s: strm [%p] id %u state %d(%s), streams_num %lu\n", - __func__, ctx->cur_stream, ctx->cur_stream->id, - tfw_h2_get_stream_state(ctx->cur_stream), - __h2_strm_st_n(ctx->cur_stream), ctx->streams_num); - - if (tfw_h2_stream_is_closed(ctx->cur_stream)) - tfw_h2_current_stream_remove(ctx); -} - static inline int tfw_h2_current_stream_state_process(TfwH2Ctx *ctx) { @@ -770,7 +606,7 @@ tfw_h2_current_stream_state_process(TfwH2Ctx *ctx) STREAM_RECV_PROCESS(ctx, hdr); - tfw_h2_check_closed_stream(ctx); + tfw_h2_check_current_stream_is_closed(ctx); return T_OK; } @@ -796,8 +632,9 @@ tfw_h2_headers_process(TfwH2Ctx *ctx) HTTP2_RST_STREAM, 0)) return -EPERM; - return tfw_h2_stream_close(ctx, hdr->stream_id, &ctx->cur_stream, - HTTP2_ECODE_PROTO); + ctx->cur_stream = NULL; + return tfw_h2_send_rst_stream(ctx, hdr->stream_id, + HTTP2_ECODE_PROTO); } if (!ctx->cur_stream) { @@ -805,6 +642,9 @@ tfw_h2_headers_process(TfwH2Ctx *ctx) if (!ctx->cur_stream) return -ENOMEM; ctx->lstream_id = hdr->stream_id; + } else if (ctx->cur_stream->state == HTTP2_STREAM_IDLE) { + tfw_h2_stream_remove_idle(ctx, ctx->cur_stream); + ctx->lstream_id = hdr->stream_id; } /* * Since the same received HEADERS frame can cause the stream to become @@ -848,19 +688,23 @@ tfw_h2_wnd_update_process(TfwH2Ctx *ctx) TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); long int *window = ctx->cur_stream ? 
&ctx->cur_stream->rem_wnd : &ctx->rem_wnd; - int size, mss; if (tfw_h2_increment_wnd_sz(window, wnd_incr)) { err_code = HTTP2_ECODE_FLOW; goto fail; } + if (ctx->cur_stream) + tfw_h2_stream_try_unblock(&ctx->sched, ctx->cur_stream); + if (*window > 0) { - mss = tcp_send_mss(((TfwConn *)conn)->sk, &size, - MSG_DONTWAIT); - tcp_push(((TfwConn *)conn)->sk, MSG_DONTWAIT, mss, - TCP_NAGLE_OFF|TCP_NAGLE_PUSH, size); + if (tfw_h2_stream_sched_is_active(&ctx->sched.root)) { + sock_set_flag(((TfwConn *)conn)->sk, + SOCK_TEMPESTA_HAS_DATA); + tcp_push_pending_frames(((TfwConn *)conn)->sk); + } } + return T_OK; } @@ -874,8 +718,9 @@ tfw_h2_wnd_update_process(TfwH2Ctx *ctx) HTTP2_RST_STREAM, 0)) return -EPERM; - return tfw_h2_stream_close(ctx, hdr->stream_id, &ctx->cur_stream, - err_code); + ctx->cur_stream = NULL; + return tfw_h2_send_rst_stream(ctx, hdr->stream_id, + err_code); } static inline int @@ -897,6 +742,18 @@ tfw_h2_priority_process(TfwH2Ctx *ctx) return T_OK; } + if (ctx->cur_stream->state == HTTP2_STREAM_IDLE) { + /* + * According to RFC 9113 we should response with stream + * error of type PROTOCOL ERROR here, but we can't send + * RST_STREAM for idle stream. + * RFC 9113 doesn't describe this case, so terminate + * connection. + */ + tfw_h2_conn_terminate(ctx, HTTP2_ECODE_PROTO); + return T_BAD; + } + /* * Stream cannot depend on itself (see RFC 7540 section 5.1.2 for * details). @@ -908,8 +765,9 @@ tfw_h2_priority_process(TfwH2Ctx *ctx) HTTP2_RST_STREAM, 0)) return -EPERM; - return tfw_h2_stream_close(ctx, hdr->stream_id, &ctx->cur_stream, - HTTP2_ECODE_PROTO); + ctx->cur_stream = NULL; + return tfw_h2_send_rst_stream(ctx, hdr->stream_id, + HTTP2_ECODE_PROTO); } static inline void @@ -923,93 +781,19 @@ tfw_h2_rst_stream_process(TfwH2Ctx *ctx) tfw_h2_current_stream_remove(ctx); } -static void -tfw_h2_apply_wnd_sz_change(TfwH2Ctx *ctx, long int delta) -{ - TfwStream *stream, *next; - - /* - * Order is no matter, use default funtion from the Linux kernel. - * According to RFC 9113 6.9.2 - * When the value of SETTINGS_INITIAL_WINDOW_SIZE changes, a receiver - * MUST adjust the size of all stream flow-control windows that it - * maintains by the difference between the new value and the old value. - * A change to SETTINGS_INITIAL_WINDOW_SIZE can cause the available - * space in a flow-control window to become negative. 
- */ - rbtree_postorder_for_each_entry_safe(stream, next, - &ctx->sched.streams, node) { - TfwStreamState state = tfw_h2_get_stream_state(stream); - if (state == HTTP2_STREAM_OPENED || - state == HTTP2_STREAM_REM_HALF_CLOSED) - stream->rem_wnd += delta; - } -} - -static int -tfw_h2_apply_settings_entry(TfwH2Ctx *ctx, unsigned short id, - unsigned int val) -{ - TfwSettings *dest = &ctx->rsettings; - long int delta; - - switch (id) { - case HTTP2_SETTINGS_TABLE_SIZE: - ctx->new_settings.flags |= SS_F_HTTP2_ACK_FOR_HPACK_TBL_RESIZING; - ctx->new_settings.hdr_tbl_sz = min_t(unsigned int, - val, HPACK_ENC_TABLE_MAX_SIZE); - break; - - case HTTP2_SETTINGS_ENABLE_PUSH: - if (val > 1) - return -EINVAL; - dest->push = val; - break; - - case HTTP2_SETTINGS_MAX_STREAMS: - dest->max_streams = val; - break; - - case HTTP2_SETTINGS_INIT_WND_SIZE: - if (val > MAX_WND_SIZE) - return -EINVAL; - - delta = (long int)val - (long int)dest->wnd_sz; - tfw_h2_apply_wnd_sz_change(ctx, delta); - dest->wnd_sz = val; - break; - - case HTTP2_SETTINGS_MAX_FRAME_SIZE: - if (val < FRAME_DEF_LENGTH || val > FRAME_MAX_LENGTH) - return -EINVAL; - dest->max_frame_sz = val; - break; - - case HTTP2_SETTINGS_MAX_HDR_LIST_SIZE: - dest->max_lhdr_sz = val; - break; - - default: - /* - * We should silently ignore unknown identifiers (see - * RFC 7540 section 6.5.2) - */ - return 0; - } - - return 0; -} - static void tfw_h2_settings_ack_process(TfwH2Ctx *ctx) { T_DBG3("%s: parsed, stream_id=%u, flags=%hhu\n", __func__, ctx->hdr.stream_id, ctx->hdr.flags); - if (ctx->sent_settings[HTTP2_SETTINGS_TABLE_SIZE]) { + if (test_bit(_HTTP2_SETTINGS_MAX - 1 + HTTP2_SETTINGS_TABLE_SIZE, + ctx->settings_to_apply)) + { ctx->hpack.max_window = ctx->lsettings.hdr_tbl_sz; ctx->hpack.dec_tbl.wnd_update = true; - ctx->sent_settings[HTTP2_SETTINGS_TABLE_SIZE] = false; + clear_bit(_HTTP2_SETTINGS_MAX -1 + HTTP2_SETTINGS_TABLE_SIZE, + ctx->settings_to_apply); } } @@ -1023,9 +807,11 @@ tfw_h2_settings_process(TfwH2Ctx *ctx) T_DBG3("%s: entry parsed, id=%hu, val=%u\n", __func__, id, val); - if ((r = tfw_h2_apply_settings_entry(ctx, id, val))) + if ((r = tfw_h2_check_settings_entry(ctx, id, val))) return r; + tfw_h2_save_settings_entry(ctx, id, val); + ctx->to_read = hdr->length ? FRAME_SETTINGS_ENTRY_SIZE : 0; hdr->length -= ctx->to_read; @@ -1237,9 +1023,6 @@ tfw_h2_frame_type_process(TfwH2Ctx *ctx) (hdr->type <= _HTTP2_UNDEFINED ? hdr->type : _HTTP2_UNDEFINED); TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); -/* - * TODO: Use this macro for processing PRIORITY frame. - */ #define VERIFY_MAX_CONCURRENT_STREAMS(ctx, ACTION) \ do { \ unsigned int max_streams = ctx->lsettings.max_streams; \ @@ -1248,7 +1031,7 @@ do { \ \ if (max_streams == ctx->streams_num) { \ T_WARN("Max streams number exceeded: %lu\n", \ - ctx->streams_num); \ + ctx->streams_num); \ SET_TO_READ_VERIFY(ctx, HTTP2_IGNORE_FRAME_DATA); \ ACTION; \ } \ @@ -1314,13 +1097,7 @@ do { \ if (hdr->flags & HTTP2_F_PADDED) return tfw_h2_recv_padded(ctx); - /* TODO: #1196 Rework this part. 
*/ - if (tfw_h2_get_stream_state(ctx->cur_stream) >= - HTTP2_STREAM_LOC_CLOSED) - ctx->state = HTTP2_IGNORE_FRAME_DATA; - else - ctx->state = HTTP2_RECV_DATA; - + ctx->state = HTTP2_RECV_DATA; SET_TO_READ(ctx); return 0; @@ -1338,6 +1115,9 @@ do { \ err_code = HTTP2_ECODE_PROTO; goto conn_term; } + + tfw_h2_remove_idle_streams(ctx, hdr->stream_id); + /* * Endpoints must not exceed the limit set by their peer for * maximum number of concurrent streams (see RFC 7540 section @@ -1358,17 +1138,9 @@ do { \ if (hdr->flags & HTTP2_F_PRIORITY) return tfw_h2_recv_priority(ctx); - /* TODO: #1196 Rework this part. */ - if (ctx->cur_stream && - tfw_h2_get_stream_state(ctx->cur_stream) >= - HTTP2_STREAM_LOC_CLOSED) - { - ctx->state = HTTP2_IGNORE_FRAME_DATA; - } else { - ctx->state = HTTP2_RECV_HEADER; - } - + ctx->state = HTTP2_RECV_HEADER; SET_TO_READ(ctx); + return 0; case HTTP2_PRIORITY: @@ -1378,26 +1150,39 @@ do { \ } ctx->cur_stream = - tfw_h2_find_not_closed_stream(ctx, hdr->stream_id, - true); + tfw_h2_find_stream(&ctx->sched, hdr->stream_id); if (hdr->length != FRAME_PRIORITY_SIZE) goto conn_term; - /* TODO: #1196 Rework this part. */ - if (!ctx->cur_stream || - tfw_h2_get_stream_state(ctx->cur_stream) >= - HTTP2_STREAM_LOC_CLOSED) - { + if (ctx->cur_stream) { + STREAM_RECV_PROCESS(ctx, hdr); + ctx->state = HTTP2_RECV_FRAME_PRIORITY; + } else if (hdr->stream_id > ctx->lstream_id) { + VERIFY_MAX_CONCURRENT_STREAMS(ctx, { + err_code = HTTP2_ECODE_PROTO; + goto conn_term; + }); + /* + * According to RFC 9113 section 6.3: + * Priority frame can be sent in any stream state, + * including idle or closed streams. + */ + ctx->cur_stream = + tfw_h2_stream_create(ctx, hdr->stream_id); + if (!ctx->cur_stream) + return -ENOMEM; + + tfw_h2_stream_add_idle(ctx, ctx->cur_stream); + STREAM_RECV_PROCESS(ctx, hdr); + ctx->state = HTTP2_RECV_FRAME_PRIORITY; + } else { /* * According to RFC 9113 section 5.1: * PRIORITY frames are allowed in the `closed` state, - * but if the stream was moved to closed queue or was - * already removed from memory, just ignore this frame. + * but if the stream was already removed from memory, + * just ignore this frame. 
*/ ctx->state = HTTP2_IGNORE_FRAME_DATA; - } else { - STREAM_RECV_PROCESS(ctx, hdr); - ctx->state = HTTP2_RECV_FRAME_PRIORITY; } SET_TO_READ(ctx); @@ -1773,8 +1558,14 @@ tfw_h2_frame_recv(void *data, unsigned char *buf, unsigned int len, if ((ret = tfw_h2_current_stream_state_process(ctx))) FRAME_FSM_EXIT(ret); + if (unlikely(ctx->state == HTTP2_IGNORE_FRAME_DATA)) + __fsm_const_state = ctx->state; + if (unlikely(ctx->to_read)) { - FRAME_FSM_MOVE(HTTP2_RECV_APP_DATA_POST); + if (unlikely(ctx->state == HTTP2_IGNORE_FRAME_DATA)) + FRAME_FSM_MOVE(HTTP2_IGNORE_FRAME_DATA); + else + FRAME_FSM_MOVE(HTTP2_RECV_APP_DATA_POST); } FRAME_FSM_EXIT(T_OK); @@ -1786,8 +1577,14 @@ tfw_h2_frame_recv(void *data, unsigned char *buf, unsigned int len, if ((ret = tfw_h2_headers_process(ctx))) FRAME_FSM_EXIT(ret); + if (unlikely(ctx->state == HTTP2_IGNORE_FRAME_DATA)) + __fsm_const_state = ctx->state; + if (unlikely(ctx->to_read)) { - FRAME_FSM_MOVE(HTTP2_RECV_APP_DATA_POST); + if (unlikely(ctx->state == HTTP2_IGNORE_FRAME_DATA)) + FRAME_FSM_MOVE(HTTP2_IGNORE_FRAME_DATA); + else + FRAME_FSM_MOVE(HTTP2_RECV_APP_DATA_POST); } FRAME_FSM_EXIT(T_OK); @@ -1799,8 +1596,14 @@ tfw_h2_frame_recv(void *data, unsigned char *buf, unsigned int len, if ((ret = tfw_h2_current_stream_state_process(ctx))) FRAME_FSM_EXIT(ret); + if (unlikely(ctx->state == HTTP2_IGNORE_FRAME_DATA)) + __fsm_const_state = ctx->state; + if (unlikely(ctx->to_read)) { - FRAME_FSM_MOVE(HTTP2_RECV_APP_DATA_POST); + if (unlikely(ctx->state == HTTP2_IGNORE_FRAME_DATA)) + FRAME_FSM_MOVE(HTTP2_IGNORE_FRAME_DATA); + else + FRAME_FSM_MOVE(HTTP2_RECV_APP_DATA_POST); } FRAME_FSM_EXIT(T_OK); @@ -2073,240 +1876,334 @@ tfw_h2_frame_process(TfwConn *c, struct sk_buff *skb, struct sk_buff **next) return r; } -int -tfw_h2_insert_frame_header(struct sock *sk, struct sk_buff *skb, - TfwStream *stream, unsigned int mss_now, - TfwMsgIter *it, char **data, - const TfwStr *frame_hdr_str, - unsigned int *t_tz) +static inline unsigned int +tfw_h2_calc_frame_length(TfwH2Ctx *ctx, TfwStream *stream, TfwFrameType type, + unsigned int len, unsigned int max_len) { - struct sk_buff *next = NULL; - unsigned len = skb->len, next_len = 0; - unsigned int truesize = skb->truesize, next_truesize = 0; - unsigned long clear_mask; - long tmp_t_tz, delta; - int r; - -#define __ADJUST_SKB_LEN_CHANGE(sk, skb, olen) \ - delta = (long)skb->len - (long)olen; \ - TCP_SKB_CB(skb)->end_seq += delta; \ - tcp_sk(sk)->write_seq += delta; - - if (likely(!tcp_skb_is_last(sk, skb))) { - next = skb_queue_next(&sk->sk_write_queue, skb); - next_len = next->len; - next_truesize = next->truesize; - } - - r = tfw_http_msg_insert(it, data, frame_hdr_str); - if (unlikely(r)) - return r; + unsigned int length; - __ADJUST_SKB_LEN_CHANGE(sk, skb, len); - tmp_t_tz = (long)skb->truesize - (long)truesize; - /* - * If all HEADERS are located in current skb, we should clear - * an appropriate flag in the next skb. - */ - clear_mask = (skb->len - FRAME_HEADER_SIZE >= stream->xmit.h_len ? - SS_F_HTTT2_FRAME_HEADERS : 0); - - if (!tcp_skb_is_last(sk, skb) - && skb->next != next) { - /* New skb was allocated during data insertion. */ - next = skb->next; - /* Remove skb since it must be inserted into sk write queue. 
*/ - ss_skb_remove(next); - - tcp_sk(sk)->write_seq += next->len; - - tfw_tcp_setup_new_skb(sk, skb, next, mss_now); - skb_copy_tfw_cb(next, skb); - skb_clear_tfw_flag(next, clear_mask); - tmp_t_tz += next->truesize; - } else if (next && next->len != next_len) { - /* Some frags from current skb was moved to the next skb. */ - BUG_ON(next->len < next_len); - __ADJUST_SKB_LEN_CHANGE(sk, next, next_len); - - tfw_tcp_propagate_dseq(sk, skb); - skb_copy_tfw_cb(next, skb); - skb_clear_tfw_flag(next, clear_mask); - tmp_t_tz += (long)next->truesize - (long)next_truesize; - } else { - tfw_tcp_propagate_dseq(sk, skb); + length = min3(ctx->rsettings.max_frame_sz, len, max_len); + if (type == HTTP2_DATA) { + length = min3(length, (unsigned int)ctx->rem_wnd, + (unsigned int)stream->rem_wnd); } - BUG_ON(tmp_t_tz < 0); - *t_tz = tmp_t_tz; - - return r; - -#undef __ADJUST_SKB_LEN_CHANGE + return length; } -static unsigned char -tfw_h2_prepare_frame_flags(TfwStream *stream, TfwFrameType type, bool end) +static inline char +tf2_h2_calc_frame_flags(TfwStream *stream, TfwFrameType type) { - unsigned char flags; - switch (type) { case HTTP2_HEADERS: + return stream->xmit.h_len ? + (stream->xmit.b_len ? 0 : HTTP2_F_END_STREAM) : + (stream->xmit.b_len ? HTTP2_F_END_HEADERS : + HTTP2_F_END_HEADERS | HTTP2_F_END_STREAM); case HTTP2_CONTINUATION: - flags = stream->xmit.b_len ? 0 : HTTP2_F_END_STREAM; - flags |= end ? HTTP2_F_END_HEADERS : 0; - break; + return stream->xmit.h_len ? 0 : HTTP2_F_END_HEADERS; case HTTP2_DATA: - flags = end ? HTTP2_F_END_STREAM : 0; - break; + return stream->xmit.b_len ? 0 : HTTP2_F_END_STREAM; default: BUG(); }; - return flags; + return 0; } -static unsigned int -tfw_h2_calc_frame_length(struct sock *sk, struct sk_buff *skb, TfwH2Ctx *ctx, - TfwStream *stream, TfwFrameType type, - unsigned int limit, unsigned int len) +static inline int +tfw_h2_insert_frame_header(struct sock *sk, TfwH2Ctx *ctx, TfwStream *stream, + TfwFrameType type, unsigned long *snd_wnd, + unsigned long len) { - unsigned char tls_type = skb_tfw_tls_type(skb); - unsigned short clear_flag = (type == HTTP2_DATA ? - SS_F_HTTT2_FRAME_DATA : SS_F_HTTT2_FRAME_HEADERS); - unsigned short other_flag = (type == HTTP2_DATA ? - SS_F_HTTT2_FRAME_HEADERS : SS_F_HTTT2_FRAME_DATA); - unsigned int max_sz = min3(ctx->rsettings.max_frame_sz, limit, len); - unsigned int frame_sz = skb->len - FRAME_HEADER_SIZE - - stream->xmit.processed; - struct sk_buff *next = skb, *skb_tail = skb; + TfwMsgIter it = { + .skb_head = stream->xmit.skb_head, + .skb = stream->xmit.skb_head, + .frag = -1 + }; + unsigned char buf[FRAME_HEADER_SIZE]; + const TfwStr frame_hdr_str = { .data = buf, .len = sizeof(buf)}; + TfwFrameHdr frame_hdr = {}; + unsigned char tls_type = skb_tfw_tls_type(stream->xmit.skb_head); + unsigned int mark = stream->xmit.skb_head->mark; + unsigned int max_len = (*snd_wnd > TLS_MAX_PAYLOAD_SIZE + TLS_MAX_OVERHEAD) ? + TLS_MAX_PAYLOAD_SIZE : *snd_wnd - TLS_MAX_OVERHEAD; + unsigned int length; + char *data; + int r; + + data = ss_skb_data_ptr_by_offset(stream->xmit.skb_head, + stream->xmit.frame_length); + if(unlikely(!data)) + data = stream->xmit.skb_head->data; + if (type == HTTP2_CONTINUATION || type == HTTP2_DATA) { + it.skb = it.skb_head = stream->xmit.skb_head; + if ((r = tfw_http_msg_insert(&it, &data, &frame_hdr_str))) + return r; + stream->xmit.skb_head = it.skb_head; + } + + /* + * Set tls_type and mark, because skb_head could be changed + * during previous operations. 
+ */ + ss_skb_setup_head_of_list(stream->xmit.skb_head, mark, tls_type); + + length = tfw_h2_calc_frame_length(ctx, stream, type, len, + max_len - FRAME_HEADER_SIZE); if (type == HTTP2_DATA) { - BUG_ON(ctx->rem_wnd <= 0 || stream->rem_wnd <= 0); - max_sz = min3(max_sz, (unsigned int)ctx->rem_wnd, - (unsigned int)stream->rem_wnd); + ctx->rem_wnd -= length; + stream->rem_wnd -= length; + stream->xmit.b_len -= length; + } else { + stream->xmit.h_len -= length; } - while (!tcp_skb_is_last(sk, skb_tail)) { - next = skb_queue_next(&sk->sk_write_queue, skb_tail); + *snd_wnd -= length; - if (frame_sz + next->len > max_sz) - break; - /* Don't put different message types into the same frame. */ - if (skb_tfw_tls_type(next) != tls_type) - break; - /* Don't agregate skbs with different frame types. */ - if (skb_tfw_flags(next) & other_flag) - break; - skb_clear_tfw_flag(next, clear_flag); - skb_set_tfw_flags(next, SS_F_HTTP2_FRAME_PREPARED); - stream->xmit.nskbs++; - frame_sz += next->len; - skb_tail = next; + frame_hdr.length = length; + frame_hdr.stream_id = stream->id; + frame_hdr.type = type; + frame_hdr.flags = tf2_h2_calc_frame_flags(stream, type); + tfw_h2_pack_frame_header(data, &frame_hdr); + + stream->xmit.frame_length += length + FRAME_HEADER_SIZE; + switch (tfw_h2_stream_send_process(ctx, stream, type)) { + case STREAM_FSM_RES_OK: + case STREAM_FSM_RES_IGNORE: + break; + case STREAM_FSM_RES_TERM_STREAM: + /* Send previosly successfully prepared frames if exist. */ + stream->xmit.frame_length -= length + FRAME_HEADER_SIZE; + if (stream->xmit.frame_length) { + r = tfw_h2_entail_stream_skb(sk, ctx, stream, + &stream->xmit.frame_length, + true); + } + stream->xmit.frame_length += length + FRAME_HEADER_SIZE; + /* + * Purge stream send queue, but leave postponed + * skbs and rst stream/goaway/tls alert if exist. + */ + tfw_h2_stream_purge_send_queue(stream); + return r; + case STREAM_FSM_RES_TERM_CONN: + return -EPIPE; } - return min(max_sz, frame_sz); + return 0; } static int -tfw_h2_make_frames(struct sock *sk, struct sk_buff *skb, TfwH2Ctx *ctx, - TfwStream *stream, TfwFrameType type, unsigned int mss_now, - unsigned int limit, unsigned int *t_tz) +tfw_h2_stream_xmit_process(struct sock *sk, TfwH2Ctx *ctx, TfwStream *stream, + int ss_action, unsigned long *snd_wnd) { int r = 0; - char *data; - unsigned char buf[FRAME_HEADER_SIZE]; - const TfwStr frame_hdr_str = { .data = buf, .len = sizeof(buf)}; - TfwMsgIter it = { - .skb = skb, - .skb_head = ((struct sk_buff *)&sk->sk_write_queue), - .frag = -1 - }; - TfwFrameHdr frame_hdr = {}; - unsigned long *len = (type == HTTP2_DATA ? - &stream->xmit.b_len : &stream->xmit.h_len); + TfwFrameType frame_type; + T_FSM_INIT(stream->xmit.state, "HTTP/2 make frames"); - if (WARN_ON_ONCE(limit <= FRAME_HEADER_SIZE)) - return -EINVAL; +#define CALC_SND_WND_AND_SET_FRAME_TYPE(type) \ +do { \ + if (*snd_wnd <= FRAME_HEADER_SIZE + TLS_MAX_OVERHEAD) \ + T_FSM_EXIT(); \ + frame_type = type; \ +} while(0) - data = tfw_http_iter_set_at_skb(&it, skb, stream->xmit.processed); - if (!data) - return -E2BIG; - if (type != HTTP2_HEADERS) { - /* - * Insert empty header first, because if some fragments will - * be moved from current skb to the next one, skb length will - * be changed. 
- */ - r = tfw_h2_insert_frame_header(sk, skb, stream, mss_now, &it, - &data, &frame_hdr_str, t_tz); + T_FSM_START(stream->xmit.state) { + + T_FSM_STATE(HTTP2_ENCODE_HEADERS) { + r = tfw_h2_stream_xmit_prepare_resp(stream); + fallthrough; + } + + T_FSM_STATE(HTTP2_RELEASE_RESPONSE) { + TfwHttpResp *resp = stream->xmit.resp; + + BUG_ON(!resp || !resp->req || !resp->req->conn); + tfw_http_resp_pair_free_and_put_conn(resp); + stream->xmit.resp = NULL; + /* Error during headers encoding. */ if (unlikely(r)) return r; + fallthrough; } - limit -= FRAME_HEADER_SIZE; + T_FSM_STATE(HTTP2_MAKE_HEADERS_FRAMES) { + CALC_SND_WND_AND_SET_FRAME_TYPE(HTTP2_HEADERS); + if (unlikely(ctx->hpack.enc_tbl.wnd_changed)) { + r = tfw_hpack_enc_tbl_write_sz(&ctx->hpack.enc_tbl, + stream); + if (unlikely(r < 0)) { + T_WARN("Failed to encode hpack dynamic" + "table size %d", r); + return r; + } + } - frame_hdr.stream_id = stream->id; - frame_hdr.type = type; - frame_hdr.length = tfw_h2_calc_frame_length(sk, skb, ctx, stream, - type, limit, *len); - frame_hdr.flags = tfw_h2_prepare_frame_flags(stream, type, - *len == frame_hdr.length); - tfw_h2_pack_frame_header(data, &frame_hdr); + r = tfw_h2_insert_frame_header(sk, ctx, stream, frame_type, + snd_wnd, stream->xmit.h_len); + if (unlikely(r)) { + T_WARN("Failed to make headers frame %d", r); + return r; + } - if (type == HTTP2_DATA) { - ctx->rem_wnd -= frame_hdr.length; - stream->rem_wnd -= frame_hdr.length; + T_FSM_JMP(HTTP2_SEND_FRAMES); } - stream->xmit.processed += frame_hdr.length + FRAME_HEADER_SIZE; - *len -= frame_hdr.length; - return 0; -} + T_FSM_STATE(HTTP2_MAKE_CONTINUATION_FRAMES) { + CALC_SND_WND_AND_SET_FRAME_TYPE(HTTP2_CONTINUATION); + r = tfw_h2_insert_frame_header(sk, ctx, stream, frame_type, + snd_wnd, stream->xmit.h_len); + if (unlikely(r)) { + T_WARN("Failed to make continuation frame %d", r); + return r; + } -int -tfw_h2_make_headers_frames(struct sock *sk, struct sk_buff *skb, - TfwH2Ctx *ctx, TfwStream *stream, - unsigned int mss_now, unsigned int limit, - unsigned int *t_tz) -{ - TfwFrameType type = skb_tfw_flags(skb) & SS_F_HTTP2_FRAME_START ? 
- HTTP2_HEADERS : HTTP2_CONTINUATION; + T_FSM_JMP(HTTP2_SEND_FRAMES); + } - return tfw_h2_make_frames(sk, skb, ctx, stream, type, - mss_now, limit, t_tz); -} + T_FSM_STATE(HTTP2_MAKE_DATA_FRAMES) { + if (ctx->rem_wnd <= 0 || stream->rem_wnd <= 0) { + ctx->sched.blocked_streams += + (stream->rem_wnd <= 0 + && !stream->xmit.is_blocked); + stream->xmit.is_blocked = stream->rem_wnd <= 0; + T_FSM_EXIT(); + } -int -tfw_h2_make_data_frames(struct sock *sk, struct sk_buff *skb, - TfwH2Ctx *ctx, TfwStream *stream, - unsigned int mss_now, unsigned int limit, - unsigned int *t_tz) -{ - return tfw_h2_make_frames(sk, skb, ctx, stream, HTTP2_DATA, - mss_now, limit, t_tz); + CALC_SND_WND_AND_SET_FRAME_TYPE(HTTP2_DATA); + r = tfw_h2_insert_frame_header(sk, ctx, stream, frame_type, + snd_wnd, stream->xmit.b_len); + if (unlikely (r)) { + T_WARN("Failed to make data frame %d", r); + return r; + } + + fallthrough; + } + + T_FSM_STATE(HTTP2_SEND_FRAMES) { + r = tfw_h2_entail_stream_skb(sk, ctx, stream, + &stream->xmit.frame_length, + false); + if (unlikely(r)) { + T_WARN("Failed to send frame %d", r); + return r; + } + + if (stream->xmit.h_len) { + T_FSM_JMP(HTTP2_MAKE_CONTINUATION_FRAMES); + } else { + if (unlikely(stream->xmit.postponed) && + !stream->xmit.frame_length) + ss_skb_tcp_entail_list(sk, + &stream->xmit.postponed); + if (stream->xmit.b_len) { + T_FSM_JMP(HTTP2_MAKE_DATA_FRAMES); + } else { + fallthrough; + } + } + } + + T_FSM_STATE(HTTP2_MAKE_FRAMES_FINISH) { + BUG_ON(stream->xmit.resp); + /* + * skb_head is not empty because RST stream or + * GOAWAY and TLS ALERT are pending until error + * response is sent. + */ + if (unlikely(stream->xmit.skb_head)) { + ss_skb_tcp_entail_list(sk, &stream->xmit.skb_head); + /* + * We set ctx->error only when we close connection + * after sending error response. If ss_action is + * SS_CLOSE we don't need to shutdown socket, because + * we will done it from `ss_do_close`. + */ + if (stream == ctx->error && ss_action != SS_CLOSE) + tcp_shutdown(sk, SEND_SHUTDOWN); + } + tfw_h2_stream_add_closed(ctx, stream); + if (stream == ctx->error) + ctx->error = NULL; + T_FSM_EXIT(); + } + + } + + T_FSM_FINISH(r, stream->xmit.state); + + if (stream->xmit.frame_length) { + r = tfw_h2_entail_stream_skb(sk, ctx, stream, + &stream->xmit.frame_length, + true); + } + + + return r; + +#undef CALC_SND_WND_AND_SET_FRAME_TYPE } -TfwStream * -tfw_h2_find_not_closed_stream(TfwH2Ctx *ctx, unsigned int id, - bool recv) +int +tfw_h2_make_frames(struct sock *sk, TfwH2Ctx *ctx, unsigned long snd_wnd, + int ss_action, bool *data_is_available) { + TfwStreamSched *sched = &ctx->sched; + TfwStreamSchedEntry *parent; TfwStream *stream; + u64 deficit; + bool error_was_sent = false; + int r = 0; + + while (tfw_h2_stream_sched_is_active(&sched->root) + && snd_wnd > FRAME_HEADER_SIZE + TLS_MAX_OVERHEAD + && ctx->rem_wnd > 0) + { + if (ctx->cur_send_headers) { + stream = ctx->cur_send_headers; + parent = stream->sched.parent; + tfw_h2_stream_sched_remove(sched, stream); + } else if (ctx->error) { + stream = ctx->error; + parent = stream->sched.parent; + tfw_h2_stream_sched_remove(sched, stream); + error_was_sent = true; + } else { + stream = tfw_h2_sched_stream_dequeue(sched, &parent); + } + + /* + * If root scheduler is active we always can find + * active stream. 
+ */ + BUG_ON(!stream); + r = tfw_h2_stream_xmit_process(sk, ctx, stream, ss_action, + &snd_wnd); + deficit = tfw_h2_stream_recalc_deficit(stream); + tfw_h2_sched_stream_enqueue(sched, stream, parent, deficit); + + /* + * If we send error response we stop to send any data + * from other streams, so we either sent all error response + * or blocked by window size. + */ + if (error_was_sent || r) + break; + } + + *data_is_available = + tfw_h2_stream_sched_is_active(&sched->root) && ctx->rem_wnd; - stream = tfw_h2_find_stream(&ctx->sched, id); /* - * RFC 9113 section 5.1: - * An endpoint that sends a RST_STREAM frame on a stream that is in - * the "open" or "half-closed (local)" state could receive any type - * of frame. The peer might have sent or enqueued for sending these - * frames before processing the RST_STREAM frame. - * It is HTTP2_STREAM_LOC_CLOSED state in our implementation. + * Send shutdown if there is no pending error response in our scheduler + * and this function is called from `ss_do_shutdown`. */ - if (!stream || (stream->queue == &ctx->closed_streams - && (!recv || tfw_h2_get_stream_state(stream) > - HTTP2_STREAM_LOC_CLOSED))) - return NULL; + if ((!ctx->error || r) && ss_action == SS_SHUTDOWN) + tcp_shutdown(sk, SEND_SHUTDOWN); - return stream; + return r; } diff --git a/fw/http_frame.h b/fw/http_frame.h index 57378ed03c..bca9e1f1ba 100644 --- a/fw/http_frame.h +++ b/fw/http_frame.h @@ -53,6 +53,7 @@ typedef enum { * section 6.5.2). */ typedef enum { + HTTP2_SETTINGS_NEED_TO_APPLY = 0x00, HTTP2_SETTINGS_TABLE_SIZE = 0x01, HTTP2_SETTINGS_ENABLE_PUSH, HTTP2_SETTINGS_MAX_STREAMS, @@ -127,149 +128,39 @@ typedef struct { } TfwFramePri; /** - * Representation of SETTINGS parameters for HTTP/2 connection (RFC 7540 - * section 6.5.2). - * - * @hdr_tbl_sz - maximum size of the endpoint's header compression - * table used to decode header blocks; - * @push - enable/disable indicator for server push; - * @max_streams - maximum number of streams that the endpoint will - * allow; - * @wnd_sz - endpoint's initial window size for stream-level - * flow control; - * @max_frame_sz - size of the largest frame payload the endpoint wish - * to receive; - * @max_lhdr_sz - maximum size of header list the endpoint prepared - * to accept; - */ -typedef struct { - unsigned int hdr_tbl_sz; - unsigned int push; - unsigned int max_streams; - unsigned int wnd_sz; - unsigned int max_frame_sz; - unsigned int max_lhdr_sz; -} TfwSettings; - -/** - * Context for HTTP/2 frames processing. 
- * - * @lock - spinlock to protect stream-request linkage; - * @lsettings - local settings for HTTP/2 connection; - * @rsettings - settings for HTTP/2 connection received from the - * remote endpoint; - * @streams_num - number of the streams initiated by client; - * @sched - streams' priority scheduler; - * @closed_streams - queue of closed streams (in HTTP2_STREAM_CLOSED or - * HTTP2_STREAM_REM_CLOSED state), which are waiting - * for removal; - * @lstream_id - ID of last stream initiated by client and processed on - * the server side; - * @loc_wnd - connection's current flow controlled window; - * @rem_wnd - remote peer current flow controlled window; - * @hpack - HPACK context, used in processing of - * HEADERS/CONTINUATION frames; - * @cur_send_headers - stream for which we have already started sending - * headers, but have not yet sent the END_HEADERS flag; - * @cur_recv_headers - stream for which we have already started receiving - * headers, but have not yet received the END_HEADERS - * flag; - * @sent_settings - the settings were sent, when ack will be received - * we should apply these local settings. - * @__off - offset to reinitialize processing context; - * @skb_head - collected list of processed skbs containing HTTP/2 - * frames; - * @cur_stream - found stream for the frame currently being processed; - * @priority - unpacked data from priority part of payload of - * processed HEADERS or PRIORITY frames; - * @hdr - unpacked data from header of currently processed - * frame; - * @plen - payload length of currently processed frame - * (HEADERS/CONTINUATION/DATA frames); - * @state - current FSM state of HTTP/2 processing context; - * @to_read - indicates how much data of HTTP/2 frame should - * be read on next FSM @state; - * @rlen - length of accumulated data in @rbuf - * or length of the payload read in current FSM state; - * @rbuf - buffer for data accumulation from frames headers and - * payloads (for service frames) during frames - * processing; - * @padlen - length of current frame's padding (if exists); - * @data_off - offset of app data in HEADERS, CONTINUATION and DATA - * frames (after all service payloads); - * @new_settings - struct which contains flags and new settings, which - * should be applyed in `xmit` callback. Currently it - * is used only for new hpack dynamic table size, but - * can be wide later. - * - * NOTE: we can keep HPACK context in general connection-wide HTTP/2 context - * (instead of separate HPACK context for each stream), since frames from other - * streams cannot occur between the HEADERS/CONTINUATION frames of particular - * stream (RFC 7540, sections 6.2, 6.10, 8.1). + * FSM states for HTTP/2 frames processing. 
*/ -typedef struct tfw_h2_ctx_t { - spinlock_t lock; - TfwSettings lsettings; - TfwSettings rsettings; - unsigned long streams_num; - TfwStreamSched sched; - TfwStreamQueue closed_streams; - unsigned int lstream_id; - long int loc_wnd; - long int rem_wnd; - TfwHPack hpack; - TfwStream *cur_send_headers; - TfwStream *cur_recv_headers; - bool sent_settings[_HTTP2_SETTINGS_MAX]; - char __off[0]; - struct sk_buff *skb_head; - TfwStream *cur_stream; - TfwFramePri priority; - TfwFrameHdr hdr; - unsigned int plen; - int state; - int to_read; - int rlen; - unsigned char rbuf[FRAME_HEADER_SIZE]; - unsigned char padlen; - unsigned char data_off; - struct { - unsigned short flags; - unsigned int hdr_tbl_sz; - } new_settings; -} TfwH2Ctx; +typedef enum { + HTTP2_RECV_FRAME_HEADER, + HTTP2_RECV_CLI_START_SEQ, + HTTP2_RECV_FIRST_SETTINGS, + HTTP2_RECV_FRAME_PRIORITY, + HTTP2_RECV_FRAME_WND_UPDATE, + HTTP2_RECV_FRAME_PING, + HTTP2_RECV_FRAME_RST_STREAM, + HTTP2_RECV_FRAME_SETTINGS, + HTTP2_RECV_FRAME_GOAWAY, + HTTP2_RECV_FRAME_PADDED, + HTTP2_RECV_HEADER_PRI, + HTTP2_IGNORE_FRAME_DATA, + __HTTP2_RECV_FRAME_APP, + HTTP2_RECV_HEADER = __HTTP2_RECV_FRAME_APP, + HTTP2_RECV_CONT, + HTTP2_RECV_DATA, + HTTP2_RECV_APP_DATA_POST +} TfwFrameState; + +#define MAX_WND_SIZE ((1U << 31) - 1) +#define DEF_WND_SIZE ((1U << 16) - 1) typedef struct tfw_conn_t TfwConn; -int tfw_h2_init(void); -void tfw_h2_cleanup(void); -int tfw_h2_context_init(TfwH2Ctx *ctx); -void tfw_h2_context_clear(TfwH2Ctx *ctx); int tfw_h2_frame_process(TfwConn *c, struct sk_buff *skb, struct sk_buff **next); -void tfw_h2_conn_streams_cleanup(TfwH2Ctx *ctx); -TfwStream *tfw_h2_find_not_closed_stream(TfwH2Ctx *ctx, unsigned int id, - bool recv); -unsigned int tfw_h2_req_stream_id(TfwHttpReq *req); -void tfw_h2_req_unlink_stream(TfwHttpReq *req); -void tfw_h2_req_unlink_stream_with_rst(TfwHttpReq *req); -void tfw_h2_conn_terminate_close(TfwH2Ctx *ctx, TfwH2Err err_code, bool close, - bool attack); int tfw_h2_send_rst_stream(TfwH2Ctx *ctx, unsigned int id, TfwH2Err err_code); - -int tfw_h2_make_headers_frames(struct sock *sk, struct sk_buff *skb, - TfwH2Ctx *ctx, TfwStream *stream, - unsigned int mss_now, unsigned int limit, - unsigned int *t_tz); -int tfw_h2_make_data_frames(struct sock *sk, struct sk_buff *skb, - TfwH2Ctx *ctx, TfwStream *stream, - unsigned int mss_now, unsigned int limit, - unsigned int *t_tz); -int tfw_h2_insert_frame_header(struct sock *sk, struct sk_buff *skb, - TfwStream *stream, unsigned int mss_now, - TfwMsgIter *it, char **data, - const TfwStr *frame_hdr_str, - unsigned int *t_tz); +int tfw_h2_send_goaway(TfwH2Ctx *ctx, TfwH2Err err_code, bool attack); +int tfw_h2_make_frames(struct sock *sk, TfwH2Ctx *ctx, unsigned long smd_wnd, + int ss_action, bool *data_is_available); static inline void tfw_h2_pack_frame_header(unsigned char *p, const TfwFrameHdr *hdr) @@ -299,13 +190,4 @@ tfw_h2_unpack_frame_header(TfwFrameHdr *hdr, const unsigned char *buf) __func__, hdr->length, hdr->stream_id, hdr->type, hdr->flags); } -static inline void -tfw_h2_conn_reset_stream_on_close(TfwH2Ctx *ctx, TfwStream *stream) -{ - if (ctx->cur_send_headers == stream) - ctx->cur_send_headers = NULL; - if (ctx->cur_recv_headers == stream) - ctx->cur_recv_headers = NULL; -} - #endif /* __HTTP_FRAME__ */ diff --git a/fw/http_msg.c b/fw/http_msg.c index e9a119e436..f4449501da 100644 --- a/fw/http_msg.c +++ b/fw/http_msg.c @@ -1472,8 +1472,15 @@ __tfw_http_msg_move_body(TfwHttpResp *resp, struct sk_buff *nskb) return 0; } -static inline int 
-__tfw_http_msg_linear_transform(TfwMsgIter *it) +/* + * Move linear data to paged fragment before inserting data into skb. + * We must do it, because we want to insert new data "before" linear. + * For instance: We want to insert headers. Linear data contains part + * of the body, if we insert headers without moving linear part, + * headers will be inserted after the body or between the body chunks. + */ +int +tfw_http_msg_linear_transform(TfwMsgIter *it) { /* * There is no sense to move linear part if next skb has linear @@ -1519,15 +1526,8 @@ __tfw_http_msg_expand_from_pool(TfwHttpResp *resp, const TfwStr *str, BUG_ON(it->skb->len > SS_SKB_MAX_DATA_LEN); - /* - * Move linear data to paged fragment before inserting data into skb. - * We must do it, because we want to insert new data "before" linear. - * For instance: We want to insert headers. Linear data contains part - * of the body, if we insert headers without moving linear part, - * headers will be inserted after the body or between the body chunks. - */ if (skb_headlen(it->skb)) { - if (unlikely((r = __tfw_http_msg_linear_transform(it)))) + if (unlikely((r = tfw_http_msg_linear_transform(it)))) return r; } diff --git a/fw/http_msg.h b/fw/http_msg.h index 1fcb028833..d8350dd438 100644 --- a/fw/http_msg.h +++ b/fw/http_msg.h @@ -122,7 +122,7 @@ tfw_h2_msg_transform_setup(TfwHttpTransIter *mit, struct sk_buff *skb, static inline int tfw_h2_msg_hdr_add(TfwHttpResp *resp, char *name, size_t nlen, char *val, - size_t vlen, unsigned short idx, unsigned int stream_id) + size_t vlen, unsigned short idx) { TfwStr hdr = { .chunks = (TfwStr []){ @@ -134,7 +134,7 @@ tfw_h2_msg_hdr_add(TfwHttpResp *resp, char *name, size_t nlen, char *val, .hpack_idx = idx }; - return tfw_hpack_encode(resp, &hdr, true, true, stream_id); + return tfw_hpack_encode(resp, &hdr, true, true); } int __must_check __tfw_http_msg_add_str_data(TfwHttpMsg *hm, TfwStr *str, @@ -178,9 +178,10 @@ int __hdr_name_cmp(const TfwStr *hdr, const TfwStr *cmp_hdr); int __http_hdr_lookup(TfwHttpMsg *hm, const TfwStr *hdr); int tfw_h2_msg_cutoff_headers(TfwHttpResp *resp, TfwHttpRespCleanup* cleanup); int tfw_http_msg_insert(TfwMsgIter *it, char **off, const TfwStr *data); +int tfw_http_msg_linear_transform(TfwMsgIter *it); -#define TFW_H2_MSG_HDR_ADD(hm, name, val, idx, stream_id) \ +#define TFW_H2_MSG_HDR_ADD(hm, name, val, idx) \ tfw_h2_msg_hdr_add(hm, name, sizeof(name) - 1, val, \ - sizeof(val) - 1, idx, stream_id) + sizeof(val) - 1, idx) #endif /* __TFW_HTTP_MSG_H__ */ diff --git a/fw/http_sess.c b/fw/http_sess.c index 426508f278..8e8392e1ee 100644 --- a/fw/http_sess.c +++ b/fw/http_sess.c @@ -329,7 +329,7 @@ tfw_http_sticky_calc(TfwHttpReq *req, StickyVal *sv) * to the HTTP response' header block. */ static int -tfw_http_sticky_add(TfwHttpResp *resp, bool cache, unsigned int stream_id) +tfw_http_sticky_add(TfwHttpResp *resp, bool cache) { int r; static const unsigned int len = sizeof(StickyVal) * 2; @@ -362,8 +362,7 @@ tfw_http_sticky_add(TfwHttpResp *resp, bool cache, unsigned int stream_id) if (to_h2) { set_cookie.hpack_idx = 55; - r = tfw_hpack_encode(resp, &set_cookie, !cache, !cache, - stream_id); + r = tfw_hpack_encode(resp, &set_cookie, !cache, !cache); } else if (cache) { TfwHttpTransIter *mit = &resp->mit; @@ -577,8 +576,7 @@ tfw_http_sticky_req_process(TfwHttpReq *req, StickyVal *sv, TfwStr *cookie_val) * Add Tempesta sticky cookie to an HTTP response if needed. 
*/ int -tfw_http_sess_resp_process(TfwHttpResp *resp, bool cache, - unsigned int stream_id) +tfw_http_sess_resp_process(TfwHttpResp *resp, bool cache) { TfwHttpReq *req = resp->req; TfwStickyCookie *sticky = req->vhost->cookie; @@ -600,7 +598,7 @@ tfw_http_sess_resp_process(TfwHttpResp *resp, bool cache, */ if (test_bit(TFW_HTTP_B_HAS_STICKY, req->flags)) return 0; - return tfw_http_sticky_add(resp, cache, stream_id); + return tfw_http_sticky_add(resp, cache); } /** diff --git a/fw/http_sess.h b/fw/http_sess.h index ea996b2861..802dc6e384 100644 --- a/fw/http_sess.h +++ b/fw/http_sess.h @@ -178,8 +178,7 @@ enum { int tfw_http_sess_obtain(TfwHttpReq *req); int tfw_http_sess_learn(TfwHttpResp *resp); -int tfw_http_sess_resp_process(TfwHttpResp *resp, bool cache, - unsigned int stream_id); +int tfw_http_sess_resp_process(TfwHttpResp *resp, bool cache); void tfw_http_sess_put(TfwHttpSess *sess); void tfw_http_sess_pin_vhost(TfwHttpSess *sess, TfwVhost *vhost); diff --git a/fw/http_stream.c b/fw/http_stream.c index 1104fedb82..e44792b31c 100644 --- a/fw/http_stream.c +++ b/fw/http_stream.c @@ -47,33 +47,22 @@ tfw_h2_stream_cache_destroy(void) kmem_cache_destroy(stream_cache); } -static int -tfw_h2_find_stream_dep(TfwStreamSched *sched, unsigned int id, TfwStream **dep) -{ - /* - * TODO: implement dependency/priority logic (according to RFC 7540 - * section 5.3) in context of #1196. - */ - return 0; -} -static void -tfw_h2_add_stream_dep(TfwStreamSched *sched, TfwStream *stream, TfwStream *dep, - bool excl) +static inline void +tfw_h2_conn_reset_stream_on_close(TfwH2Ctx *ctx, TfwStream *stream) { - /* - * TODO: implement dependency/priority logic (according to RFC 7540 - * section 5.3) in context of #1196. - */ + if (ctx->cur_send_headers == stream) + ctx->cur_send_headers = NULL; + if (ctx->cur_recv_headers == stream) + ctx->cur_recv_headers = NULL; } -static void -tfw_h2_remove_stream_dep(TfwStreamSched *sched, TfwStream *stream) +static inline void +tfw_h2_stream_purge_all(TfwStream *stream) { - /* - * TODO: implement dependency/priority logic (according to RFC 7540 - * section 5.3) in context of #1196. - */ + ss_skb_queue_purge(&stream->xmit.skb_head); + ss_skb_queue_purge(&stream->xmit.postponed); + stream->xmit.h_len = stream->xmit.b_len = 0; } static void @@ -81,8 +70,15 @@ tfw_h2_stop_stream(TfwStreamSched *sched, TfwStream *stream) { TfwH2Ctx *ctx = container_of(sched, TfwH2Ctx, sched); - tfw_h2_conn_reset_stream_on_close(ctx, stream); + /* + * Should be done before purging stream send queue, + * to correct adjusting count of active streams in + * the scheduler. 
+ */ tfw_h2_remove_stream_dep(sched, stream); + tfw_h2_stream_purge_all_and_free_response(stream); + + tfw_h2_conn_reset_stream_on_close(ctx, stream); rb_erase(&stream->node, &sched->streams); } @@ -91,6 +87,9 @@ tfw_h2_init_stream(TfwStream *stream, unsigned int id, unsigned short weight, long int loc_wnd, long int rem_wnd) { RB_CLEAR_NODE(&stream->node); + bzero_fast(&stream->sched_node, sizeof(stream->sched_node)); + stream->sched_state = HTTP2_STREAM_SCHED_STATE_UNKNOWN; + tfw_h2_init_stream_sched_entry(&stream->sched); INIT_LIST_HEAD(&stream->hcl_node); spin_lock_init(&stream->st_lock); stream->id = id; @@ -134,6 +133,84 @@ tfw_h2_add_stream(TfwStreamSched *sched, unsigned int id, unsigned short weight, return new_stream; } +void +tfw_h2_stream_purge_send_queue(TfwStream *stream) +{ + unsigned long len = stream->xmit.h_len + stream->xmit.b_len + + stream->xmit.frame_length; + struct sk_buff *skb; + + while (len) { + skb = ss_skb_dequeue(&stream->xmit.skb_head); + BUG_ON(!skb); + + len -= skb->len; + kfree_skb(skb); + } + stream->xmit.h_len = stream->xmit.b_len = stream->xmit.frame_length = 0; +} + +void +tfw_h2_stream_purge_all_and_free_response(TfwStream *stream) +{ + TfwHttpResp*resp = stream->xmit.resp; + + if (resp) { + tfw_http_resp_pair_free_and_put_conn(resp); + stream->xmit.resp = NULL; + } + tfw_h2_stream_purge_all(stream); +} + +void +tfw_h2_stream_add_idle(TfwH2Ctx *ctx, TfwStream *idle) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + struct list_head *pos, *prev = &ctx->idle_streams.list; + bool found = false; + + /* + * We add and remove streams from idle queue under the + * socket lock. + */ + assert_spin_locked(&((TfwConn *)conn)->sk->sk_lock.slock); + + /* + * Found first idle stream with id less than new idle + * stream, then insert new stream before this stream. + */ + list_for_each(pos, &ctx->idle_streams.list) { + TfwStream *stream = list_entry(pos, TfwStream, hcl_node); + + if (idle->id > stream->id) { + found = true; + break; + } + prev = &stream->hcl_node; + } + + if (found) { + list_add(&idle->hcl_node, prev); + idle->queue = &ctx->idle_streams; + ++idle->queue->num; + } else { + tfw_h2_stream_add_to_queue_nolock(&ctx->idle_streams, idle); + } +} + +void +tfw_h2_stream_remove_idle(TfwH2Ctx *ctx, TfwStream *stream) +{ + TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2); + + /* + * We add and remove streams from idle queue under the + * socket lock. + */ + assert_spin_locked(&((TfwConn *)conn)->sk->sk_lock.slock); + tfw_h2_stream_del_from_queue_nolock(stream); +} + /* * Create a new stream and add it to the streams storage and to the dependency * tree. 
Note, that we do not need to protect the streams storage in @sched from @@ -143,13 +220,12 @@ tfw_h2_add_stream(TfwStreamSched *sched, unsigned int id, unsigned short weight, TfwStream * tfw_h2_stream_create(TfwH2Ctx *ctx, unsigned int id) { - TfwStream *stream, *dep = NULL; + TfwStream *stream; + TfwStreamSchedEntry *dep = NULL; TfwFramePri *pri = &ctx->priority; bool excl = pri->exclusive; - if (tfw_h2_find_stream_dep(&ctx->sched, pri->stream_id, &dep)) - return NULL; - + dep = tfw_h2_find_stream_dep(&ctx->sched, pri->stream_id); stream = tfw_h2_add_stream(&ctx->sched, id, pri->weight, ctx->lsettings.wnd_sz, ctx->rsettings.wnd_sz); @@ -160,10 +236,10 @@ tfw_h2_stream_create(TfwH2Ctx *ctx, unsigned int id) ++ctx->streams_num; - T_DBG3("%s: ctx [%p] (streams_num %lu, dep strm id %u, dep strm [%p], excl %u)\n" - "added strm [%p] id %u weight %u\n", - __func__, ctx, ctx->streams_num, pri->stream_id, dep, pri->exclusive, - stream, id, stream->weight); + T_DBG3("%s: ctx [%p] (streams_num %lu, dep strm id %u, dep strm [%p]," + "excl %u) added strm [%p] id %u weight %u\n", + __func__, ctx, ctx->streams_num, pri->stream_id, dep, + pri->exclusive, stream, id, stream->weight); return stream; } @@ -228,40 +304,6 @@ tfw_h2_stream_add_closed(TfwH2Ctx *ctx, TfwStream *stream) spin_unlock(&ctx->lock); } -/* - * Stream closing procedure: move the stream into special queue of closed - * streams and send RST_STREAM frame to peer. This procedure is intended - * for usage only in receiving flow of Framing layer, thus the stream is - * definitely alive here and we need not any unlinking operations since - * all the unlinking and cleaning work will be made later, during shrinking - * the queue of closed streams; thus, we just move the stream into the - * closed queue here. - * We also reset the current stream of the H2 context here. - */ -int -tfw_h2_stream_close(TfwH2Ctx *ctx, unsigned int id, TfwStream **stream, - TfwH2Err err_code) -{ - if (stream && *stream) { - T_DBG3("%s: ctx [%p] strm %p id %d err %u\n", __func__, - ctx, *stream, id, err_code); - tfw_h2_conn_reset_stream_on_close(ctx, *stream); - if (tfw_h2_get_stream_state(*stream) > - HTTP2_STREAM_REM_HALF_CLOSED) { - tfw_h2_stream_add_closed(ctx, *stream); - } else { - /* - * This function is always called after processing - * RST STREAM or stream error. - */ - BUG(); - } - *stream = NULL; - } - - return tfw_h2_send_rst_stream(ctx, id, err_code); -} - /* * Stream FSM processing during frames receipt (see RFC 7540 section * 5.1 for details). 
@@ -434,12 +476,6 @@ do { \ break; } - if (send) { - TFW_H2_FSM_TYPE_CHECK(ctx, stream, send, type); - } else { - TFW_H2_FSM_TYPE_CHECK(ctx, stream, recv, type); - } - if (type == HTTP2_CONTINUATION) { /* * Empty CONTINUATION frames without END_HEADERS flag @@ -572,6 +608,7 @@ do { \ HTTP2_F_END_STREAM)) { case HTTP2_F_END_HEADERS | HTTP2_F_END_STREAM: + ctx->cur_recv_headers = NULL; SET_STATE(HTTP2_STREAM_CLOSED); break; case HTTP2_F_END_HEADERS: @@ -582,8 +619,9 @@ do { \ ctx->cur_recv_headers = NULL; break; case HTTP2_F_END_STREAM: - SET_STATE(HTTP2_STREAM_CLOSED); - ctx->cur_recv_headers = NULL; + ctx->cur_recv_headers = stream; + stream->state |= + HTTP2_STREAM_RECV_END_OF_STREAM; break; default: ctx->cur_recv_headers = stream; @@ -616,9 +654,46 @@ do { \ case HTTP2_STREAM_REM_HALF_CLOSED: if (send) { - if (type == HTTP2_RST_STREAM - || flags & HTTP2_F_END_STREAM) + if (type == HTTP2_HEADERS || + type == HTTP2_CONTINUATION) { + switch (flags + & (HTTP2_F_END_HEADERS | + HTTP2_F_END_STREAM)) + { + /* + * RFC 9113 5.1 (half-closed (remote) state): + * A stream can transition from this state to + * "closed" by sending a frame with the + * END_STREAM flag set. + */ + case HTTP2_F_END_STREAM: + ctx->cur_send_headers = stream; + stream->state |= + HTTP2_STREAM_SEND_END_OF_STREAM; + break; + case HTTP2_F_END_HEADERS | HTTP2_F_END_STREAM: + ctx->cur_send_headers = NULL; + SET_STATE(HTTP2_STREAM_CLOSED); + break; + case HTTP2_F_END_HEADERS: + /* + * Headers are ended, next frame in the + * stream should be DATA frame. + */ + ctx->cur_send_headers = NULL; + break; + + default: + ctx->cur_send_headers = stream; + break; + } + } else if (type == HTTP2_DATA) { + if (flags & HTTP2_F_END_STREAM) + SET_STATE(HTTP2_STREAM_CLOSED); + } else if (type == HTTP2_RST_STREAM) { SET_STATE(HTTP2_STREAM_REM_CLOSED); + } + break; } @@ -639,9 +714,9 @@ do { \ /* * We always send RST_STREAM to the peer in this case; * thus, the stream should be switched to the - * 'closed (remote)' state. + * 'closed' state. */ - SET_STATE(HTTP2_STREAM_REM_CLOSED); + SET_STATE(HTTP2_STREAM_CLOSED); *err = HTTP2_ECODE_CLOSED; res = STREAM_FSM_RES_TERM_STREAM; } @@ -654,19 +729,23 @@ do { \ * frame on a stream in the "open" or "half-closed (local)" state. */ case HTTP2_STREAM_LOC_CLOSED: + if (send) { + res = STREAM_FSM_RES_IGNORE; + break; + } + /* * RFC 9113 section 5.1: * An endpoint that sends a RST_STREAM frame on a stream * that is in the "open" or "half-closed (local)" state * could receive any type of frame. + * An endpoint MUST minimally process and then discard + * any frames it receives in this state. */ - if (send) { - res = STREAM_FSM_RES_IGNORE; - break; - } - if (type == HTTP2_RST_STREAM) SET_STATE(HTTP2_STREAM_CLOSED); + else if (type != HTTP2_WINDOW_UPDATE) + res = STREAM_FSM_RES_IGNORE; break; @@ -701,20 +780,22 @@ do { \ " flags=0x%hhx\n", __func__, stream->id, type, flags); if (send) { res = STREAM_FSM_RES_IGNORE; - break; + } else { + if (type != HTTP2_PRIORITY) { + *err = HTTP2_ECODE_PROTO; + res = STREAM_FSM_RES_TERM_CONN; + } } - /* - * In moment when the final 'closed' state is achieved, stream - * actually must be removed from stream's storage (and from - * memory), thus the receive execution flow must not reach this - * point. 
- */ - fallthrough; + + break; default: BUG(); } finish: + if (type == HTTP2_RST_STREAM || res == STREAM_FSM_RES_TERM_STREAM) + tfw_h2_conn_reset_stream_on_close(ctx, stream); + T_DBG3("exit %s: strm [%p] state %d(%s), res %d\n", __func__, stream, tfw_h2_get_stream_state(stream), __h2_strm_st_n(stream), res); @@ -749,38 +830,38 @@ tfw_h2_find_stream(TfwStreamSched *sched, unsigned int id) void tfw_h2_delete_stream(TfwStream *stream) { + BUG_ON(stream->xmit.resp || stream->xmit.skb_head); kmem_cache_free(stream_cache, stream); } -void -tfw_h2_change_stream_dep(TfwStreamSched *sched, unsigned int stream_id, - unsigned int new_dep, unsigned short new_weight, - bool excl) -{ - /* - * TODO: implement dependency/priority logic (according to RFC 7540 - * section 5.3) in context of #1196. - */ -} - int -tfw_h2_stream_init_for_xmit(TfwHttpReq *req, unsigned long h_len, - unsigned long b_len) +tfw_h2_stream_init_for_xmit(TfwHttpResp *resp, TfwStreamXmitState state, + unsigned long h_len, unsigned long b_len) { - TfwH2Ctx *ctx = tfw_h2_context_unsafe(req->conn); + TfwH2Ctx *ctx = tfw_h2_context_unsafe(resp->req->conn); + struct sk_buff *skb_head = resp->msg.skb_head; TfwStream *stream; spin_lock(&ctx->lock); - stream = req->stream; + stream = resp->req->stream; if (!stream) { spin_unlock(&ctx->lock); return -EPIPE; } + TFW_SKB_CB(skb_head)->opaque_data = resp; + TFW_SKB_CB(skb_head)->destructor = tfw_http_resp_pair_free_and_put_conn; + TFW_SKB_CB(skb_head)->on_send = tfw_http_on_send_resp; + TFW_SKB_CB(skb_head)->stream_id = stream->id; + + stream->xmit.resp = NULL; + stream->xmit.skb_head = NULL; stream->xmit.h_len = h_len; stream->xmit.b_len = b_len; - tfw_h2_stream_xmit_reinit(&stream->xmit); + stream->xmit.state = state; + stream->xmit.frame_length = 0; + stream->xmit.is_blocked = false; spin_unlock(&ctx->lock); @@ -790,21 +871,18 @@ tfw_h2_stream_init_for_xmit(TfwHttpReq *req, unsigned long h_len, TfwStreamFsmRes tfw_h2_stream_send_process(TfwH2Ctx *ctx, TfwStream *stream, unsigned char type) { - TfwStreamFsmRes r; unsigned char flags = 0; - BUG_ON(stream->xmit.h_len && stream->xmit.b_len); + if (stream->xmit.h_len && !stream->xmit.b_len + && type == HTTP2_HEADERS) + flags |= HTTP2_F_END_STREAM; if (!stream->xmit.h_len && type != HTTP2_DATA) flags |= HTTP2_F_END_HEADERS; - if (!stream->xmit.b_len) + if (!stream->xmit.h_len && !stream->xmit.b_len + && !tfw_h2_stream_is_eos_sent(stream)) flags |= HTTP2_F_END_STREAM; - r = tfw_h2_stream_fsm_ignore_err(ctx, stream, type, flags); - if (flags & HTTP2_F_END_STREAM - || (r && r != STREAM_FSM_RES_IGNORE)) - tfw_h2_stream_add_closed(ctx, stream); - - return r != STREAM_FSM_RES_IGNORE ? r : STREAM_FSM_RES_OK; + return tfw_h2_stream_fsm_ignore_err(ctx, stream, type, flags); } diff --git a/fw/http_stream.h b/fw/http_stream.h index 7d1de29d32..0064ce0742 100644 --- a/fw/http_stream.h +++ b/fw/http_stream.h @@ -20,11 +20,11 @@ #ifndef __HTTP_STREAM__ #define __HTTP_STREAM__ -#include - #include "msg.h" #include "http_parser.h" +#include "http_stream_sched.h" #include "lib/str.h" +#include "ss_skb.h" /** * States for HTTP/2 streams processing. @@ -32,9 +32,7 @@ * NOTE: there is no exact matching between these states and states from * RFC 7540 (section 5.1), since several intermediate states were added in * current implementation to handle some edge states which are not mentioned - * explicitly in RFC (special kinds of closed state). 
Besides, there is no - * explicit 'idle' state here, since in current implementation idle stream - * is just a stream that has not been created yet. + * explicitly in RFC (special kinds of closed state). */ typedef enum { HTTP2_STREAM_IDLE, @@ -55,6 +53,21 @@ enum { HTTP2_STREAM_RECV_END_OF_STREAM = 0x2 << HTTP2_STREAM_FLAGS_OFFSET, }; +/* + * We use 3 bits for this state in TfwHttpXmit structure. + * If you add some new state here, do not forget to increase + * count of bits used for this state. + */ +typedef enum { + HTTP2_ENCODE_HEADERS, + HTTP2_RELEASE_RESPONSE, + HTTP2_MAKE_HEADERS_FRAMES, + HTTP2_MAKE_CONTINUATION_FRAMES, + HTTP2_MAKE_DATA_FRAMES, + HTTP2_SEND_FRAMES, + HTTP2_MAKE_FRAMES_FINISH, +} TfwStreamXmitState; + static const char *__tfw_strm_st_names[] = { [HTTP2_STREAM_IDLE] = "HTTP2_STREAM_IDLE", [HTTP2_STREAM_LOC_RESERVED] = "HTTP2_STREAM_LOC_RESERVED", @@ -103,31 +116,36 @@ typedef enum { * Last http2 response info, used to prepare frames * in `xmit` callbacks. * + * @resp - responce, that should be sent; + * @skb_head - head of skb list that must be sent; + * @postponed - head of skb list that must be sent + * after sending headers for this stream; * @h_len - length of headers in http2 response; + * @frame_length - length of current sending frame, or 0 + * if we send some service frames (for + * example RST STREAM after all pending data); * @b_len - length of body in http2 response; - * @__off - offset to reinitialize processing context; - * @processed - count of bytes, processed during prepare xmit - * callback; - * @nskbs - count of skbs processed during prepare xmit callback; + * @is_blocked - stream is blocked; + * @state - current stream xmit state (what type of + * frame should be made for this stream); */ typedef struct { - unsigned long h_len; - unsigned long b_len; - char __off[0]; - unsigned int processed; - unsigned int nskbs; + TfwHttpResp *resp; + struct sk_buff *skb_head; + struct sk_buff *postponed; + unsigned int h_len; + unsigned int frame_length; + u64 b_len : 60; + u64 is_blocked : 1; + u64 state : 3; } TfwHttpXmit; /** - * Limited queue for temporary storage of half-closed or pending half-closed - * streams. + * Limited queue for temporary storage of idle or closed streams * This structure provides the possibility of temporary existing in memory - - * for streams which are in HTTP2_STREAM_LOC_CLOSED or HTTP2_STREAM_REM_CLOSED - * states (see RFC 7540, section 5.1, the 'closed' paragraph). Note, that - * streams in HTTP2_STREAM_CLOSED state are not stored in this queue and must - * be removed right away. + * for streams which are in HTTP2_STREAM_CLOSED state. * - * @list - list of streams which are in closed state; + * @list - list of streams; * @num - number of streams in the list; */ typedef struct { @@ -135,10 +153,19 @@ typedef struct { unsigned long num; } TfwStreamQueue; +typedef enum { + HTTP2_STREAM_SCHED_STATE_UNKNOWN, + HTTP2_STREAM_SCHED_STATE_BLOCKED, + HTTP2_STREAM_SCHED_STATE_ACTIVE, +} TfwStreamSchedState; + /** * Representation of HTTP/2 stream entity. 
* * @node - entry in per-connection storage of streams (red-black tree); + * @sched_node - entry in per-connection priority storage of active streams; + * sched_state - state of stream in the per-connection scheduler; + * @sched - scheduler for child streams; * @hcl_node - entry in queue of half-closed or closed streams; * @id - stream ID; * @state - stream's current state; @@ -153,6 +180,9 @@ typedef struct { */ struct tfw_http_stream_t { struct rb_node node; + struct eb64_node sched_node; + TfwStreamSchedState sched_state; + TfwStreamSchedEntry sched; struct list_head hcl_node; unsigned int id; int state; @@ -166,26 +196,13 @@ struct tfw_http_stream_t { TfwHttpXmit xmit; }; -/** - * Scheduler for stream's processing distribution based on dependency/priority - * values. - * TODO: the structure is not completed yet and should be finished in context - * of #1196. - * - * @streams - root red-black tree entry for per-connection streams' storage; - */ -typedef struct { - struct rb_root streams; -} TfwStreamSched; - typedef struct tfw_h2_ctx_t TfwH2Ctx; int tfw_h2_stream_cache_create(void); void tfw_h2_stream_cache_destroy(void); -TfwStream * tfw_h2_stream_create(TfwH2Ctx *ctx, unsigned int id); +TfwStream *tfw_h2_stream_create(TfwH2Ctx *ctx, unsigned int id); +void tfw_h2_stream_remove_idle(TfwH2Ctx *ctx, TfwStream *stream); void tfw_h2_stream_clean(TfwH2Ctx *ctx, TfwStream *stream); -int tfw_h2_stream_close(TfwH2Ctx *ctx, unsigned int id, TfwStream **stream, - TfwH2Err err_code); void tfw_h2_stream_unlink_nolock(TfwH2Ctx *ctx, TfwStream *stream); void tfw_h2_stream_unlink_lock(TfwH2Ctx *ctx, TfwStream *stream); TfwStreamFsmRes tfw_h2_stream_fsm(TfwH2Ctx *ctx, TfwStream *stream, @@ -193,14 +210,14 @@ TfwStreamFsmRes tfw_h2_stream_fsm(TfwH2Ctx *ctx, TfwStream *stream, bool send, TfwH2Err *err); TfwStream *tfw_h2_find_stream(TfwStreamSched *sched, unsigned int id); void tfw_h2_delete_stream(TfwStream *stream); -void tfw_h2_change_stream_dep(TfwStreamSched *sched, unsigned int stream_id, - unsigned int new_dep, unsigned short new_weight, - bool excl); -int tfw_h2_stream_init_for_xmit(TfwHttpReq *req, unsigned long h_len, - unsigned long b_len); +int tfw_h2_stream_init_for_xmit(TfwHttpResp *resp, TfwStreamXmitState state, + unsigned long h_len, unsigned long b_len); void tfw_h2_stream_add_closed(TfwH2Ctx *ctx, TfwStream *stream); +void tfw_h2_stream_add_idle(TfwH2Ctx *ctx, TfwStream *idle); TfwStreamFsmRes tfw_h2_stream_send_process(TfwH2Ctx *ctx, TfwStream *stream, unsigned char type); +void tfw_h2_stream_purge_send_queue(TfwStream *stream); +void tfw_h2_stream_purge_all_and_free_response(TfwStream *stream); static inline TfwStreamState tfw_h2_get_stream_state(TfwStream *stream) @@ -233,10 +250,25 @@ __h2_strm_st_n(TfwStream *stream) return __tfw_strm_st_names[tfw_h2_get_stream_state(stream)]; } +static inline bool +tfw_h2_stream_is_active(TfwStream *stream) +{ + return stream->xmit.skb_head && !stream->xmit.is_blocked; +} + static inline void -tfw_h2_stream_xmit_reinit(TfwHttpXmit *xmit) +tfw_h2_stream_try_unblock(TfwStreamSched *sched, TfwStream *stream) { - bzero_fast(xmit->__off, sizeof(*xmit) - offsetof(TfwHttpXmit, __off)); + bool stream_was_blocked = stream->xmit.is_blocked; + + if (stream->rem_wnd > 0) { + stream->xmit.is_blocked = false; + if (stream->xmit.skb_head && stream_was_blocked) { + sched->blocked_streams--; + BUG_ON(sched->blocked_streams < 0); + tfw_h2_sched_activate_stream(sched, stream); + } + } } static inline bool @@ -297,4 +329,55 @@ 
tfw_h2_stream_del_from_queue_nolock(TfwStream *stream) stream->queue = NULL; } +static inline u64 +tfw_h2_stream_default_deficit(TfwStream *stream) +{ + static const unsigned tbl[256] = { + 65536, 32768, 21845, 16384, 13107, 10922, 9362, 8192, 7281, + 6553, 5957, 5461, 5041, 4681, 4369, 4096, 3855, 3640, 3449, + 3276, 3120, 2978, 2849, 2730, 2621, 2520, 2427, 2340, 2259, + 2184, 2114, 2048, 1985, 1927, 1872, 1820, 1771, 1724, 1680, + 1638, 1598, 1560, 1524, 1489, 1456, 1424, 1394, 1365, 1337, + 1310, 1285, 1260, 1236, 1213, 1191, 1170, 1149, 1129, 1110, + 1092, 1074, 1057, 1040, 1024, 1008, 992, 978, 963, 949, 936, + 923, 910, 897, 885, 873, 862, 851, 840, 829, 819, 809, 799, + 789, 780, 771, 762, 753, 744, 736, 728, 720, 712, 704, 697, + 689, 682, 675, 668, 661, 655, 648, 642, 636, 630, 624, 618, + 612, 606, 601, 595, 590, 585, 579, 574, 569, 564, 560, 555, + 550, 546, 541, 537, 532, 528, 524, 520, 516, 512, 508, 504, + 500, 496, 492, 489, 485, 481, 478, 474, 471, 468, 464, 461, + 458, 455, 451, 448, 445, 442, 439, 436, 434, 431, 428, 425, + 422, 420, 417, 414, 412, 409, 407, 404, 402, 399, 397, 394, + 392, 390, 387, 385, 383, 381, 378, 376, 374, 372, 370, 368, + 366, 364, 362, 360, 358, 356, 354, 352, 350, 348, 346, 344, + 343, 341, 339, 337, 336, 334, 332, 330, 329, 327, 326, 324, + 322, 321, 319, 318, 316, 315, 313, 312, 310, 309, 307, 306, + 304, 303, 302, 300, 299, 297, 296, 295, 293, 292, 291, 289, + 288, 287, 286, 284, 283, 282, 281, 280, 278, 277, 276, 275, + 274, 273, 271, 270, 269, 268, 267, 266, 265, 264, 263, 262, + 261, 260, 259, 258, 257, 256 + }; + + return tbl[stream->weight - 1]; +} + +static inline u64 +tfw_h2_stream_recalc_deficit(TfwStream *stream) +{ + /* + * This function should be called only for streams, + * which were removed from scheduler. + */ + BUG_ON(stream->sched_node.node.leaf_p || + stream->sched_state != HTTP2_STREAM_SCHED_STATE_UNKNOWN); + /* deficit = last_deficit + constant / weight */ + return stream->sched_node.key + tfw_h2_stream_default_deficit(stream); +} + +static inline bool +tfw_h2_stream_has_default_deficit(TfwStream *stream) +{ + return stream->sched_node.key == tfw_h2_stream_default_deficit(stream); +} + #endif /* __HTTP_STREAM__ */ diff --git a/fw/http_stream_sched.c b/fw/http_stream_sched.c new file mode 100644 index 0000000000..f4805e497d --- /dev/null +++ b/fw/http_stream_sched.c @@ -0,0 +1,616 @@ +/** + * Tempesta FW + * + * HTTP2 stream scheduler which implements stream prioritization + * accoring RFC 7540 5.3. + * + * There are two algorithm of stream prioritization which are described + * in RFC 7540 5.3 and RFC 9218. RFC 7540 5.3 is deprecated, but we + * implement our scheduler according to RFC 7540, because all modern + * browsers use RFC 7540 for HTTP2 stream prioritization and use modern + * RFC 9218 only for HTTP3. + * + * Before developing of our own HTTP streams scheduling logic, we analyzed + * how other open source HTTP servers implement this. + * Nginx not fully support RFC 7540. A frame is inserted into the sending list + * according to the rank (the level in the priority tree) of the stream and + * weight. But it does not correspond to the RFC: a server should not send data + * for a stream which depends on other streams. Also the algorithm can lead to + * O(n) complexity (linear scan) if each next frame has higher priority than + * the previous one. + * H20 uses an O(1) approach described as an Array of Queue. 
This is the very
+ * fast scheduler, but it has two main disadvantages: it consumes a lot of
+ * memory and is not fair.
+ * We decided to implement a WFQ algorithm. There are a lot of data structures
+ * which can be used for this purpose (lists, different types of heaps and
+ * different types of trees). We analyzed some of them (e.g. Fibonacci heap,
+ * RB-tree, insertion sorted array etc) and found that the HAProxy ebtree
+ * provides the best performance (at least x2 faster than the closest in
+ * performance Fibonacci heap) on small data (about 100 to 1000 streams in a
+ * queue) to pick a minimum item and reinsert it.
+ *
+ * We use deficit as a key in our priority ebtree. The deficit of a stream
+ * is calculated as described below:
+ * new:      deficit = min_deficit_in_heap + constant / weight
+ * existing: deficit = last_deficit + constant / weight
+ *
+ * When we search for the highest-priority stream we iterate over the levels
+ * of the priority tree. For example:
+ *                  1 (256)
+ *          3 (256)        5 (1)
+ *     7 (256)  9 (1)  11 (256)  13 (1)
+ *
+ * In this example we have streams 3 and 5 which depend on stream 1,
+ * streams 7 and 9 which depend on stream 3, and streams 11 and 13, which
+ * depend on stream 5. We start from stream 1 and if it is active (has data
+ * to send and is not blocked by an exceeded HTTP window) we return it. If it
+ * is not active but has active children we move to the next level of the
+ * tree (streams 3 and 5) and choose the stream (which is active or has
+ * active children) with the lowest deficit. We remove it from the tree and
+ * if it is active return it. Later, after sending data for this stream, we
+ * recalculate its deficit (deficit = deficit + constant / weight) and insert
+ * it back into the tree.
+ *
+ * Copyright (C) 2024 Tempesta Technologies, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include "http_stream_sched.h"
+#include "http_stream.h"
+#include "connection.h"
+
+static inline void
+tfw_h2_stream_sched_spin_lock_assert(TfwStreamSched *sched)
+{
+	TfwH2Ctx *ctx = container_of(sched, TfwH2Ctx, sched);
+	TfwH2Conn *conn = container_of(ctx, TfwH2Conn, h2);
+
+	/*
+	 * All scheduler functions should be called under the
+	 * socket lock.
+	 */
+	assert_spin_locked(&((TfwConn *)conn)->sk->sk_lock.slock);
+}
+
+/**
+ * Remove stream from the ebtree of the blocked streams and insert
+ * it in the ebtree of active streams. Should be called only for
+ * active streams or the streams with active children, which are
+ * not already in the ebtree of active streams.
+ */ +static void +tfw_h2_stream_sched_insert_active(TfwStream *stream, u64 deficit) +{ + TfwStreamSchedEntry *parent = stream->sched.parent; + + BUG_ON(!parent || (!tfw_h2_stream_is_active(stream) && + !tfw_h2_stream_sched_is_active(&stream->sched))); + BUG_ON(stream->sched_state == HTTP2_STREAM_SCHED_STATE_ACTIVE); + + eb64_delete(&stream->sched_node); + stream->sched_node.key = deficit; + eb64_insert(&parent->active, &stream->sched_node); + stream->sched_state = HTTP2_STREAM_SCHED_STATE_ACTIVE; +} + +/** + * Remove stream from the ebtree of active streams and insert + * it in the ebtree of the blocked streams. Should be called + * only for the blocked streams and the streams without active + * children, which are not already in the ebtree of the blocked + * streams. + */ +static void +tfw_h2_stream_sched_insert_blocked(TfwStream *stream, u64 deficit) +{ + TfwStreamSchedEntry *parent = stream->sched.parent; + + BUG_ON(!parent || tfw_h2_stream_is_active(stream) + || tfw_h2_stream_sched_is_active(&stream->sched)); + BUG_ON(stream->sched_state == HTTP2_STREAM_SCHED_STATE_BLOCKED); + + eb64_delete(&stream->sched_node); + stream->sched_node.key = deficit; + eb64_insert(&parent->blocked, &stream->sched_node); + stream->sched_state = HTTP2_STREAM_SCHED_STATE_BLOCKED; +} + +/** + * Calculate minimum deficit for the current scheduler. + * New stream is inserted in the scheduler with + * deficit = min_deficit + 65536 / stream->weight. + */ +static u64 +tfw_h2_stream_sched_min_deficit(TfwStreamSchedEntry *parent) +{ + TfwStream *prio; + + /* + * First of all check active streams in the scheduler. + * If there are any active streams new stream is inserted + * with deficit = min_deficit + 65536 / stream->weight. + * Where min_deficit is a deficit of a most prio stream, + * if it was scheduled at least one time. + */ + prio = !eb_is_empty(&parent->active) ? + eb64_entry(eb64_first(&parent->active), TfwStream, sched_node) : + NULL; + if (prio) { + return tfw_h2_stream_has_default_deficit(prio) ? + 0 : prio->sched_node.key; + } + + /* Same for blocked streams. */ + prio = !eb_is_empty(&parent->blocked) ? + eb64_entry(eb64_first(&parent->blocked), TfwStream, sched_node) : + NULL; + if (prio) { + return tfw_h2_stream_has_default_deficit(prio) ? + 0 : prio->sched_node.key; + } + + return 0; +} + +/** + * Recalculate count of active streams for parent schedulers, when + * new stream is added to the priority tree. If parent scheduler + * is activated in this function, insert appropriate parent stream + * in the tree of active streams. + */ +static void +tfw_h2_stream_sched_propagate_add_active_cnt(TfwStreamSched *sched, + TfwStream *stream) +{ + TfwStreamSchedEntry *parent = stream->sched.parent; + bool stream_is_active = tfw_h2_stream_is_active(stream); + long int active_cnt = + stream->sched.active_cnt + (stream_is_active ? 1 : 0); + + if (!active_cnt) + return; + + while (true) { + bool need_activate = !tfw_h2_stream_sched_is_active(parent); + parent->active_cnt += active_cnt; + if (parent == &sched->root) + break; + + stream = container_of(parent, TfwStream, sched); + parent = stream->sched.parent; + /* + * Stream can have no parent if it is removed from + * the scheduler due to priority tree rebuilding. 
+ */ + if (!parent) + break; + + if (need_activate && !tfw_h2_stream_is_active(stream)) { + BUG_ON(stream->sched_state != HTTP2_STREAM_SCHED_STATE_BLOCKED); + tfw_h2_stream_sched_insert_active(stream, + stream->sched_node.key); + } + } +} + +/** + * Recalculate count of active streams for parent schedulers, when + * new stream is removed from the priority tree. If parent scheduler + * is deactivated in this function, remove appropriate parent stream + * from the tree of active streams. + */ +static void +tfw_h2_stream_sched_propagate_dec_active_cnt(TfwStreamSched *sched, + TfwStream *stream) +{ + TfwStreamSchedEntry *parent = stream->sched.parent; + bool stream_is_active = tfw_h2_stream_is_active(stream); + long int active_cnt = + stream->sched.active_cnt + (stream_is_active ? 1 : 0); + + if (!active_cnt) + return; + + while (true) { + parent->active_cnt -= active_cnt; + if (parent == &sched->root) + break; + + stream = container_of(parent, TfwStream, sched); + parent = stream->sched.parent; + /* + * Stream can have no parent if it is removed from + * the scheduler due to priority tree rebuilding. + */ + if (!parent) + break; + + if (tfw_h2_stream_is_active(stream) + || tfw_h2_stream_sched_is_active(&stream->sched)) + continue; + + BUG_ON(stream->sched_state != HTTP2_STREAM_SCHED_STATE_ACTIVE); + tfw_h2_stream_sched_insert_blocked(stream, stream->sched_node.key); + } +} + +/** + * Remove stream from the scheduler. Since this function is + * used when we delete stream also we should explicitly remove + * stream both from the tree. It is a caller responsibility + * to add stream again to the scheduler if it is necessary + * with appropriate deficite. + */ +void +tfw_h2_stream_sched_remove(TfwStreamSched *sched, TfwStream *stream) +{ + TfwStreamSchedEntry *parent = stream->sched.parent; + + tfw_h2_stream_sched_spin_lock_assert(sched); + + eb64_delete(&stream->sched_node); + stream->sched_state = HTTP2_STREAM_SCHED_STATE_UNKNOWN; + tfw_h2_stream_sched_propagate_dec_active_cnt(sched, stream); + stream->sched.parent = NULL; + parent->total_weight -= stream->weight; +} + +/** + * Find parent scheduler by id of the parent stream. If id == 0 or + * we can't find parent stream return root scheduler according to + * RFC 7540 5.3.1. + */ +TfwStreamSchedEntry * +tfw_h2_find_stream_dep(TfwStreamSched *sched, unsigned int id) +{ + tfw_h2_stream_sched_spin_lock_assert(sched); + + if (id) { + TfwStream *stream = tfw_h2_find_stream(sched, id); + if (stream) + return &stream->sched; + } + /* + * RFC 7540 5.3.1: + * A dependency on a stream that is not currently in the tree -- such + * as a stream in the "idle" state -- results in that stream being + * given a default priority. + */ + return &sched->root; +} + +static inline bool +tfw_h2_stream_sched_has_children(TfwStreamSchedEntry *entry) +{ + return !eb_is_empty(&entry->active) || !eb_is_empty(&entry->blocked); +} + +static inline void +tfw_h2_stream_sched_move_child(TfwStreamSched *sched, TfwStream *child, + TfwStreamSchedEntry *parent, u64 deficit) +{ + tfw_h2_stream_sched_remove(sched, child); + tfw_h2_sched_stream_enqueue(sched, child, parent, deficit); +} + +/** + * Add stream to the scheduler tree. @dep is a parent of new + * added stream. 
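+ * If @excl is set, the existing children of @dep are re-parented under
+ * @stream first (RFC 7540 5.3.1): e.g. adding stream 9 exclusively under
+ * stream 3, which already has children 5 and 7, makes 5 and 7 depend on 9,
+ * and leaves 9 as the sole child of 3.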
+ */
+void
+tfw_h2_add_stream_dep(TfwStreamSched *sched, TfwStream *stream,
+		       TfwStreamSchedEntry *dep, bool excl)
+{
+	u64 deficit, min_deficit;
+	bool stream_has_children;
+
+	tfw_h2_stream_sched_spin_lock_assert(sched);
+
+	if (!excl) {
+		deficit = tfw_h2_stream_sched_min_deficit(dep) +
+			tfw_h2_stream_default_deficit(stream);
+		tfw_h2_sched_stream_enqueue(sched, stream, dep, deficit);
+		return;
+	}
+
+	/*
+	 * Here we move the children of the @dep scheduler to the current
+	 * stream scheduler. If the current stream scheduler has no children,
+	 * we move the children of @dep as is (keeping their deficit in the
+	 * priority WFQ). Otherwise we calculate the minimal deficit of the
+	 * scheduler and use it as a base for the new children's deficit.
+	 */
+	stream_has_children = tfw_h2_stream_sched_has_children(&stream->sched);
+	min_deficit = !stream_has_children ? 0 :
+		tfw_h2_stream_sched_min_deficit(&stream->sched);
+
+	/*
+	 * RFC 7540 5.3.1:
+	 * An exclusive flag allows for the insertion of a new level of
+	 * dependencies. The exclusive flag causes the stream to become the
+	 * sole dependency of its parent stream, causing other dependencies
+	 * to become dependent on the exclusive stream.
+	 */
+	while (!eb_is_empty(&dep->blocked)) {
+		struct eb64_node *node = eb64_first(&dep->blocked);
+		TfwStream *child = eb64_entry(node, TfwStream, sched_node);
+
+		deficit = !stream_has_children ? child->sched_node.key :
+			min_deficit + tfw_h2_stream_default_deficit(child);
+		tfw_h2_stream_sched_move_child(sched, child, &stream->sched,
+					       deficit);
+	}
+
+	while (!eb_is_empty(&dep->active)) {
+		struct eb64_node *node = eb64_first(&dep->active);
+		TfwStream *child = eb64_entry(node, TfwStream, sched_node);
+
+		deficit = !stream_has_children ? child->sched_node.key :
+			min_deficit + tfw_h2_stream_default_deficit(child);
+		tfw_h2_stream_sched_move_child(sched, child, &stream->sched,
+					       deficit);
+	}
+
+	BUG_ON(tfw_h2_stream_sched_has_children(dep));
+	/* Stream is the only one in the @dep scheduler, use default deficit. */
+	tfw_h2_sched_stream_enqueue(sched, stream, dep,
+				    tfw_h2_stream_default_deficit(stream));
+}
+
+/**
+ * Remove stream from the dependency tree. Move its children to its
+ * parent scheduler according to RFC 7540.
+ */
+void
+tfw_h2_remove_stream_dep(TfwStreamSched *sched, TfwStream *stream)
+{
+	TfwStreamSchedEntry *parent = stream->sched.parent;
+	size_t total_weight = stream->sched.total_weight;
+	unsigned short new_weight;
+	bool parent_has_children;
+	u64 deficit;
+
+	tfw_h2_stream_sched_spin_lock_assert(sched);
+
+	/* Remove stream from the parent scheduler. */
+	tfw_h2_stream_sched_remove(sched, stream);
+
+	/*
+	 * Here we move the children of the removed stream to the parent
+	 * scheduler. If the parent scheduler has no children, we move the
+	 * removed stream's children as is (keeping their deficit in the
+	 * priority WFQ). Otherwise we put them into the parent scheduler
+	 * with the removed stream's deficit. We can't keep the children's
+	 * deficit, because it is meaningless for the parent scheduler's WFQ.
+	 */
+	parent_has_children = tfw_h2_stream_sched_has_children(parent);
+
+	/*
+	 * According to RFC 7540 section 5.3.4:
+	 * If the parent stream is removed from the tree, the weight of the
+	 * parent stream is divided between its children according to their
+	 * weights.
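+	 * For example, if the removed stream has weight 10 and two children
+	 * with weights 30 and 10 (total_weight = 40), the children get new
+	 * weights 30 * 10 / 40 = 7 and 10 * 10 / 40 = 2 (integer division,
+	 * clamped below to 1).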
+	 */
+	while (!eb_is_empty(&stream->sched.blocked)) {
+		struct eb64_node *node = eb64_first(&stream->sched.blocked);
+		TfwStream *child = eb64_entry(node, TfwStream, sched_node);
+
+		/*
+		 * Remove the children of the removed stream, recalculate
+		 * their weights and add them to the scheduler of the parent
+		 * of the removed stream.
+		 */
+		new_weight = child->weight *
+			stream->weight / total_weight;
+		child->weight = new_weight > 0 ? new_weight : 1;
+		deficit = !parent_has_children ?
+			child->sched_node.key : stream->sched_node.key;
+		tfw_h2_stream_sched_move_child(sched, child, parent, deficit);
+	}
+
+	while (!eb_is_empty(&stream->sched.active)) {
+		struct eb64_node *node = eb64_first(&stream->sched.active);
+		TfwStream *child = eb64_entry(node, TfwStream, sched_node);
+
+		/*
+		 * Remove the children of the removed stream, recalculate
+		 * their weights and add them to the scheduler of the parent
+		 * of the removed stream.
+		 */
+		new_weight = child->weight *
+			stream->weight / total_weight;
+		child->weight = new_weight > 0 ? new_weight : 1;
+		deficit = !parent_has_children ?
+			child->sched_node.key : stream->sched_node.key;
+		tfw_h2_stream_sched_move_child(sched, child, parent, deficit);
+	}
+
+	BUG_ON(stream->sched.active_cnt);
+}
+
+/**
+ * Check whether the stream would now depend on its own child.
+ */
+static bool
+tfw_h2_is_stream_depend_on_child(TfwStreamSched *sched, TfwStream *stream,
+				 TfwStreamSchedEntry *new_parent)
+{
+	TfwStreamSchedEntry *parent = new_parent->parent;
+	TfwStream *next;
+
+	while (parent && parent != &sched->root) {
+		next = container_of(parent, TfwStream, sched);
+		if (next == stream)
+			return true;
+		parent = parent->parent;
+	}
+
+	return false;
+}
+
+void
+tfw_h2_change_stream_dep(TfwStreamSched *sched, unsigned int stream_id,
+			 unsigned int new_dep, unsigned short new_weight,
+			 bool excl)
+{
+	TfwStreamSchedEntry *old_parent, *new_parent;
+	TfwStream *stream, *np;
+	bool is_stream_depends_on_child;
+
+	tfw_h2_stream_sched_spin_lock_assert(sched);
+
+	stream = tfw_h2_find_stream(sched, stream_id);
+	BUG_ON(!stream);
+	old_parent = stream->sched.parent;
+	BUG_ON(!old_parent);
+
+	new_parent = tfw_h2_find_stream_dep(sched, new_dep);
+
+	is_stream_depends_on_child =
+		tfw_h2_is_stream_depend_on_child(sched, stream, new_parent);
+
+	if (!is_stream_depends_on_child) {
+		/*
+		 * If the stream does not depend on its child, just remove
+		 * the stream, change its weight and add it to the new
+		 * parent.
+		 * The order of the following calls is important:
+		 * 1. First we remove the current stream from the
+		 *    dependency tree (with recalculation of the total
+		 *    weight of the parent schedulers).
+		 * 2. Change the stream weight.
+		 * 3. Insert the stream into the dependency tree as a
+		 *    child of the new parent.
+		 */
+		tfw_h2_stream_sched_remove(sched, stream);
+		stream->weight = new_weight;
+		tfw_h2_add_stream_dep(sched, stream, new_parent, excl);
+	} else {
+		/*
+		 * If the stream depends on its child, remove this child
+		 * from the dependency tree, put it in the place of the
+		 * current stream and then add the current stream as a
+		 * child of the new parent (which was a child of the
+		 * current stream).
+		 * (See RFC 7540 section 5.3.3).
+		 * The order of the following calls is important:
+		 * 1. Remove the new parent, which is a child of the current
+		 *    stream (with recalculation of the weight and active
+		 *    count of the current stream's scheduler).
+		 * 2. Remove the current stream from the dependency tree.
+		 * 3. Change the stream weight and insert the new parent and
+		 *    the stream according to RFC 7540.
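+		 * For example, if stream 5 currently depends on stream 3
+		 * and stream 3 depends on stream 1, a PRIORITY frame that
+		 * makes stream 3 depend on stream 5 first moves stream 5
+		 * under stream 1 (the old parent of stream 3) and only then
+		 * attaches stream 3 under stream 5.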
+ */ + BUG_ON(new_parent == &sched->root); + np = container_of(new_parent, TfwStream, sched); + + tfw_h2_stream_sched_remove(sched, np); + tfw_h2_stream_sched_remove(sched, stream); + stream->weight = new_weight; + tfw_h2_add_stream_dep(sched, np, old_parent, false); + tfw_h2_add_stream_dep(sched, stream, new_parent, excl); + } + +} + +void +tfw_h2_sched_stream_enqueue(TfwStreamSched *sched, TfwStream *stream, + TfwStreamSchedEntry *parent, u64 deficit) +{ + tfw_h2_stream_sched_spin_lock_assert(sched); + + parent->total_weight += stream->weight; + stream->sched.parent = parent; + + /* + * This function should be called only for new created streams or + * streams which were previously removed from the scheduler. + */ + BUG_ON(stream->sched_node.node.leaf_p); + + if (tfw_h2_stream_is_active(stream) + || tfw_h2_stream_sched_is_active(&stream->sched)) + tfw_h2_stream_sched_insert_active(stream, deficit); + else + tfw_h2_stream_sched_insert_blocked(stream, deficit); + + tfw_h2_stream_sched_propagate_add_active_cnt(sched, stream); +} + +TfwStream * +tfw_h2_sched_stream_dequeue(TfwStreamSched *sched, TfwStreamSchedEntry **parent) +{ + TfwStreamSchedEntry *entry = &sched->root; + struct eb64_node *node = eb64_first(&entry->active); + u64 deficit; + + while (node) { + TfwStream *stream = eb64_entry(node, TfwStream, sched_node); + + if (tfw_h2_stream_is_active(stream)) { + *parent = entry; + tfw_h2_stream_sched_remove(sched, stream); + return stream; + } else if (tfw_h2_stream_sched_is_active(&stream->sched)) { + /* + * This stream is blocked, but have active children, try + * to use one of them. + */ + *parent = stream->sched.parent; + tfw_h2_stream_sched_remove(sched, stream); + deficit = tfw_h2_stream_recalc_deficit(stream); + tfw_h2_sched_stream_enqueue(sched, stream, *parent, + deficit); + entry = &stream->sched; + node = eb64_first(&entry->active); + } else { + /* + * Since node is in active tree it should be active or + * has active children. + */ + BUG(); + } + } + + return NULL; +} + +void +tfw_h2_sched_activate_stream(TfwStreamSched *sched, TfwStream *stream) +{ + TfwStreamSchedEntry *parent = stream->sched.parent; + + tfw_h2_stream_sched_spin_lock_assert(sched); + BUG_ON(!tfw_h2_stream_is_active(stream)); + BUG_ON(!parent); + + if (!tfw_h2_stream_sched_is_active(&stream->sched)) + tfw_h2_stream_sched_insert_active(stream, stream->sched_node.key); + + while (true) { + bool need_activate = !tfw_h2_stream_sched_is_active(parent); + parent->active_cnt += 1; + if (parent == &sched->root) + break; + + stream = container_of(parent, TfwStream, sched); + parent = stream->sched.parent; + BUG_ON(!parent); + + if (need_activate && !tfw_h2_stream_is_active(stream)) + tfw_h2_stream_sched_insert_active(stream, stream->sched_node.key); + } +} diff --git a/fw/http_stream_sched.h b/fw/http_stream_sched.h new file mode 100644 index 0000000000..0767e5a11e --- /dev/null +++ b/fw/http_stream_sched.h @@ -0,0 +1,95 @@ +/** + * Tempesta FW + * + * Copyright (C) 2024 Tempesta Technologies, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#ifndef __HTTP_STREAM_SCHED__ +#define __HTTP_STREAM_SCHED__ + +#include +#include + +#include "lib/eb64tree.h" +#include "http_types.h" + +/** + * @total_weight - total weight of the streams for this scheduler; + * @active_cnt - count of active child streams for this scheduler; + * @parent - parent scheduler; + * @active - root of the active streams scheduler ebtree; + * @blocked - root of the blocked streams scheduler ebtree; + */ +typedef struct tfw_stream_sched_entry_t { + u64 total_weight; + long int active_cnt; + struct tfw_stream_sched_entry_t *parent; + struct eb_root active; + struct eb_root blocked; +} TfwStreamSchedEntry; + +/** + * Scheduler for stream's processing distribution based on dependency/priority + * values. + * + * @streams - root red-black tree entry for per-connection streams storage; + * @root - root scheduler of per-connection priority tree; + * @blocked_streams - count of blocked streams; + */ +typedef struct tfw_stream_sched_t { + struct rb_root streams; + TfwStreamSchedEntry root; + long int blocked_streams; +} TfwStreamSched; + +TfwStreamSchedEntry *tfw_h2_find_stream_dep(TfwStreamSched *sched, + unsigned int id); +void tfw_h2_add_stream_dep(TfwStreamSched *sched, TfwStream *stream, + TfwStreamSchedEntry *dep, bool excl); +void tfw_h2_remove_stream_dep(TfwStreamSched *sched, TfwStream *stream); +void tfw_h2_change_stream_dep(TfwStreamSched *sched, unsigned int stream_id, + unsigned int new_dep, unsigned short new_weight, + bool excl); + +void tfw_h2_stream_sched_remove(TfwStreamSched *sched, TfwStream *stream); +void tfw_h2_sched_stream_enqueue(TfwStreamSched *sched, TfwStream *stream, + TfwStreamSchedEntry *parent, u64 deficit); +TfwStream *tfw_h2_sched_stream_dequeue(TfwStreamSched *sched, + TfwStreamSchedEntry **parent); +void tfw_h2_sched_activate_stream(TfwStreamSched *sched, TfwStream *stream); + +static inline bool +tfw_h2_stream_sched_is_active(TfwStreamSchedEntry *sched) +{ + return sched->active_cnt; +} + +static inline void +tfw_h2_init_stream_sched_entry(TfwStreamSchedEntry *entry) +{ + entry->total_weight = entry->active_cnt = 0; + entry->parent = NULL; + entry->blocked = entry->active = EB_ROOT; +} + +static inline void +tfw_h2_init_stream_sched(TfwStreamSched *sched) +{ + sched->streams = RB_ROOT; + tfw_h2_init_stream_sched_entry(&sched->root); +} + +#endif /* __HTTP_STREAM_SCHED__ */ diff --git a/fw/http_types.h b/fw/http_types.h index d70f2337c0..f767592e7d 100644 --- a/fw/http_types.h +++ b/fw/http_types.h @@ -79,8 +79,6 @@ enum { TFW_HTTP_B_H2, /* Message has all mandatory pseudo-headers (applicable for HTTP/2 mode only) */ TFW_HTTP_B_H2_HDRS_FULL, - /* Message in HTTP/2 transformation (applicable for HTTP/2 mode only). */ - TFW_HTTP_B_H2_TRANS_ENTERED, /* Request flags. */ TFW_HTTP_FLAGS_REQ, @@ -126,6 +124,11 @@ enum { TFW_HTTP_B_HDR_ETAG_HAS_NO_QOUTES, /* Request URI is absolute (HTTP/1.x only) */ TFW_HTTP_B_ABSOLUTE_URI, + /* + * This is the error response, connection + * will be closed after sending it. 
+ */ + TFW_HTTP_B_CLOSE_ERROR_RESPONSE, _TFW_HTTP_FLAGS_NUM }; diff --git a/fw/msg.c b/fw/msg.c index 90f035dabf..19e66960fe 100644 --- a/fw/msg.c +++ b/fw/msg.c @@ -97,42 +97,6 @@ int tfw_http_iter_set_at(TfwMsgIter *it, char *off) return -E2BIG; } -char * -tfw_http_iter_set_at_skb(TfwMsgIter *it, struct sk_buff *skb, - unsigned long off) -{ - char *begin, *end; - unsigned long d; - unsigned char i; - - if (skb_headlen(it->skb)) { - begin = it->skb->data; - end = begin + skb_headlen(it->skb); - - if (begin + off <= end) { - it->frag = -1; - return begin + off; - } - off -= skb_headlen(it->skb); - } - - for (i = 0; i < skb_shinfo(it->skb)->nr_frags; i++) { - skb_frag_t *f = &skb_shinfo(it->skb)->frags[i]; - - begin = skb_frag_address(f); - end = begin + skb_frag_size(f); - d = end - begin; - if (off >= d) { - off -= d; - continue; - } - it->frag = i; - return begin + off; - } - - return NULL; -} - /** * Move message iterator from @data pointer by @sz symbols right. * @sz must be less than remaining message size, otherwise an error will be diff --git a/fw/msg.h b/fw/msg.h index bb604456d2..f4bcd15bcc 100644 --- a/fw/msg.h +++ b/fw/msg.h @@ -25,7 +25,7 @@ #include -#include "sync_socket.h" +#include "str.h" /** * @seq_list - member in the ordered queue of messages; @@ -100,24 +100,8 @@ int tfw_msg_iter_setup(TfwMsgIter *it, struct sk_buff **skb_head, size_t data_len, unsigned int tx_flags); int tfw_msg_iter_append_skb(TfwMsgIter *it); int tfw_http_iter_set_at(TfwMsgIter *it, char *off); -char *tfw_http_iter_set_at_skb(TfwMsgIter *it, struct sk_buff *skb, - unsigned long off); int tfw_msg_iter_move(TfwMsgIter *it, unsigned char **data, unsigned long sz); -static inline void -tfw_msg_iter_set_skb_priv(TfwMsgIter *it, unsigned int priv, - unsigned short flags) -{ - struct sk_buff *skb = it->skb; - do { - if (flags) - skb_set_tfw_flags(skb, flags); - if (priv) - skb_set_tfw_cb(skb, priv); - skb = skb->next; - } while (skb != it->skb_head); -} - static inline int tfw_msg_iter_next_data_frag(TfwMsgIter *it) { diff --git a/fw/sock.c b/fw/sock.c index 5cc313b395..eaff708049 100644 --- a/fw/sock.c +++ b/fw/sock.c @@ -39,12 +39,6 @@ #include "work_queue.h" #include "http_limits.h" -typedef enum { - SS_SEND, - SS_CLOSE, - SS_SHUTDOWN, -} SsAction; - typedef struct { struct sock *sk; struct sk_buff *skb_head; @@ -195,7 +189,7 @@ ss_active_guard_exit(unsigned long val) static void ss_conn_drop_guard_exit(struct sock *sk) { - SS_CONN_TYPE(sk) &= ~(Conn_Closing | Conn_Shutdown); + SS_CONN_TYPE(sk) &= ~Conn_Closing; SS_CALL(connection_drop, sk); if (sk->sk_security) tfw_classify_conn_close(sk); @@ -370,30 +364,48 @@ ss_forced_mem_schedule(struct sock *sk, int size) sk_memory_allocated_add(sk, amt); } -/** - * @skb_head can be invalid after the function call, don't try to use it. 
- */ -static void -ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags) +void +ss_skb_tcp_entail(struct sock *sk, struct sk_buff *skb, unsigned int mark, + unsigned char tls_type) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb, *head = *skb_head; - int size, mss = tcp_send_mss(sk, &size, MSG_DONTWAIT); - unsigned int mark = (*skb_head)->mark; - T_DBG3("[%d]: %s: sk=%pK queue_empty=%d send_head=%pK" - " sk_state=%d mss=%d size=%d\n", - smp_processor_id(), __func__, - sk, tcp_write_queue_empty(sk), tcp_send_head(sk), - sk->sk_state, mss, size); + ss_skb_on_tcp_entail(sk->sk_user_data, skb); + ss_skb_init_for_xmit(skb); + skb->mark = mark; + if (tls_type) + skb_set_tfw_tls_type(skb, tls_type); + ss_forced_mem_schedule(sk, skb->truesize); + skb_entail(sk, skb); + tp->write_seq += skb->len; + TCP_SKB_CB(skb)->end_seq += skb->len; + + T_DBG3("[%d]: %s: entail sk=%pK skb=%pK data_len=%u len=%u" + " truesize=%u mark=%u tls_type=%x\n", + smp_processor_id(), __func__, sk, skb, skb->data_len, + skb->len, skb->truesize, skb->mark, + skb_tfw_tls_type(skb)); +} - /* If the socket is inactive, there's no recourse. Drop the data. */ - if (unlikely(!ss_sock_active(sk))) { - ss_skb_queue_purge(skb_head); - return; - } +void +ss_skb_tcp_entail_list(struct sock *sk, struct sk_buff **skb_head) +{ + struct sk_buff *skb; + unsigned char tls_type = 0; + unsigned int mark = 0; while ((skb = ss_skb_dequeue(skb_head))) { + /* + * @skb_head can be the head of several different skb + * lists. We set tls type for the head of each new + * skb list and we should entail each skb with mark + * and tls_type of the head of the list to which it + * belongs. + */ + if (TFW_SKB_CB(skb)->is_head) { + tls_type = skb_tfw_tls_type(skb); + mark = skb->mark; + } /* * Zero-sized SKBs may appear when the message headers (or any * other contents) are modified or deleted by Tempesta. Drop @@ -406,28 +418,42 @@ ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags) kfree_skb(skb); continue; } + ss_skb_tcp_entail(sk, skb, mark, tls_type); + } +} - ss_skb_init_for_xmit(skb); - if (flags & SS_F_ENCRYPT) { - skb_set_tfw_tls_type(skb, SS_SKB_F2TYPE(flags)); - if (skb == head) - skb_set_tfw_flags(skb, SS_F_HTTP2_FRAME_START); - } - /* Propagate mark of message head skb.*/ - skb->mark = mark; +/** + * @skb_head can be invalid after the function call, don't try to use it. + */ +static void +ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags) +{ + int size, mss = tcp_send_mss(sk, &size, MSG_DONTWAIT); + void *conn = sk->sk_user_data; + unsigned char tls_type = flags & SS_F_ENCRYPT ? + SS_SKB_F2TYPE(flags) : 0; - T_DBG3("[%d]: %s: entail sk=%pK skb=%pK data_len=%u len=%u" - " truesize=%u mark=%u tls_type=%x\n", - smp_processor_id(), __func__, sk, - skb, skb->data_len, skb->len, skb->truesize, skb->mark, - skb_tfw_tls_type(skb)); + T_DBG3("[%d]: %s: sk=%pK queue_empty=%d send_head=%pK" + " sk_state=%d mss=%d size=%d\n", + smp_processor_id(), __func__, + sk, tcp_write_queue_empty(sk), tcp_send_head(sk), + sk->sk_state, mss, size); - ss_forced_mem_schedule(sk, skb->truesize); - skb_entail(sk, skb); + /* If the socket is inactive, there's no recourse. Drop the data. 
*/ + if (unlikely(!conn || !ss_sock_active(sk))) + goto cleanup; - tp->write_seq += skb->len; - TCP_SKB_CB(skb)->end_seq += skb->len; - } + ss_skb_setup_head_of_list(*skb_head, (*skb_head)->mark, tls_type); + + if (ss_skb_on_send(conn, skb_head)) + goto cleanup; + + /* + * If skbs were pushed to scheuler tree, @skb_head is + * empty and `ss_skb_tcp_entail_list` doesn't make + * any job. + */ + ss_skb_tcp_entail_list(sk, skb_head); T_DBG3("[%d]: %s: sk=%p send_head=%p sk_state=%d flags=%x\n", smp_processor_id(), __func__, @@ -440,7 +466,34 @@ ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags) if (flags & SS_F_CONN_CLOSE) return; - tcp_push(sk, MSG_DONTWAIT, mss, TCP_NAGLE_OFF|TCP_NAGLE_PUSH, size); + /* + * We set SOCK_TEMPESTA_HAS_DATA when we add some skb in our + * scheduler tree. + * So there are two cases here: + * - packets out is equal to zero and sock flag is set, + * this means that we should call `tcp_push_pending_frames`. + * In this function our scheduler choose the most priority + * stream, make frames for this stream and push them to the + * socket write queue. + * - socket flag is not set, this means that we push skb directly + * to the socket write queue so we call `tcp_push` and don't + * run scheduler. + * If packets_out is not equal to zero `tcp_push_pending_frames` + * will be called later from `tcp_data_snd_check` when we receive + * ack from the peer. + */ + if (sock_flag(sk, SOCK_TEMPESTA_HAS_DATA)) { + tcp_push_pending_frames(sk); + } else { + tcp_push(sk, MSG_DONTWAIT, mss, TCP_NAGLE_OFF | TCP_NAGLE_PUSH, + size); + } + + return; + +cleanup: + ss_skb_destroy_opaque_data(*skb_head); + ss_skb_queue_purge(skb_head); } /** @@ -604,6 +657,9 @@ ss_do_close(struct sock *sk, int flags) tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, sk->sk_allocation); } else if (tcp_close_state(sk)) { + int size, mss = tcp_send_mss(sk, &size, MSG_DONTWAIT); + if (sk->sk_fill_write_queue) + sk->sk_fill_write_queue(sk, mss, SS_CLOSE); tcp_send_fin(sk); } @@ -789,6 +845,7 @@ do { \ * own flags, thus clear it. */ skb->dev = NULL; + memset(skb->cb, 0, sizeof(skb->cb)); if (unlikely(offset >= skb->len)) { offset -= skb->len; @@ -1017,6 +1074,7 @@ ss_tcp_state_change(struct sock *sk) { T_DBG3("[%d]: %s: sk=%p state=%s\n", smp_processor_id(), __func__, sk, ss_statename[sk->sk_state]); + ss_sk_incoming_cpu_update(sk); assert_spin_locked(&sk->sk_lock.slock); TFW_VALIDATE_SK_LOCK_OWNER(sk); @@ -1445,7 +1503,16 @@ __sk_close_locked(struct sock *sk, int flags) static inline void ss_do_shutdown(struct sock *sk) { - tcp_shutdown(sk, SEND_SHUTDOWN); + int size, mss = tcp_send_mss(sk, &size, MSG_DONTWAIT); + /* + * We send `tcp_shutdown` from `sk_fill_write_queue` if + * there is no pending data in our sceduler and SS_SHUTDOWN + * is passed as ss_action. 
+ */ + if (sk->sk_fill_write_queue) + sk->sk_fill_write_queue(sk, mss, SS_SHUTDOWN); + else + tcp_shutdown(sk, SEND_SHUTDOWN); SS_CONN_TYPE(sk) |= Conn_Shutdown; } @@ -1571,6 +1638,9 @@ ss_tx_action(void) } dead_sock: sock_put(sk); /* paired with push() calls */ + if (sw.skb_head) + ss_skb_destroy_opaque_data(sw.skb_head); + while ((skb = ss_skb_dequeue(&sw.skb_head))) kfree_skb(skb); } diff --git a/fw/sock_clnt.c b/fw/sock_clnt.c index 479d998457..b1819e91fe 100644 --- a/fw/sock_clnt.c +++ b/fw/sock_clnt.c @@ -34,6 +34,7 @@ #include "server.h" #include "sync_socket.h" #include "tls.h" +#include "tcp.h" /* * ------------------------------------------------------------------------ @@ -172,266 +173,14 @@ tfw_cli_conn_send(TfwCliConn *cli_conn, TfwMsg *msg) return r; } -/** - * First `xmit` callback, which is used to add headers for HTTP2 - * HEADERS and DATA frames. Also used to add hpack dynamic table - * size at the beginning of the first header block according to - * RFC 7541. Implemented in separate function, because we use - * `tso_fragment` with new limit to split skb before passing it - * to the second `xmit` callback. - */ -static int -tfw_h2_sk_prepare_xmit(struct sock *sk, struct sk_buff *skb, - unsigned int mss_now, unsigned int *limit, - unsigned int *nskbs) -{ - TfwConn *conn = sk->sk_user_data; - unsigned short flags = skb_tfw_flags(skb); - unsigned int skb_priv = skb_tfw_cb(skb); - unsigned int truesize = 0, tmp_truesize = 0; - bool headers_was_done = false; - TfwH2Ctx *h2 = NULL; - TfwHPackETbl *tbl = NULL; - TfwStream *stream = NULL; - int r = 0; - -#define FRAME_HEADERS_SHOULD_BE_MADE(flags) \ - (flags & SS_F_HTTT2_FRAME_HEADERS) - -#define FRAME_DATA_SHOULD_BE_MADE(flags) \ - (flags & SS_F_HTTT2_FRAME_DATA) - -#define FRAME_HEADERS_OR_DATA_SHOULD_BE_MADE(flags) \ - (FRAME_HEADERS_SHOULD_BE_MADE(flags) \ - || FRAME_DATA_SHOULD_BE_MADE(flags)) - -#define FRAME_ALREADY_PREPARED(flags) \ - (flags & SS_F_HTTP2_FRAME_PREPARED) - -#define CHECK_STREAM_IS_PRESENT(stream) \ -do { \ - h2 = tfw_h2_context_unsafe(conn); \ - tbl = &h2->hpack.enc_tbl; \ - stream = tfw_h2_find_not_closed_stream(h2, skb_priv, false); \ - if (!stream) { \ - T_WARN("%s: stream with id (%u) already closed", \ - __func__, skb_priv); \ - /* \ - * TODO #1196: \ - * Don't purge tcp queue and don't close connection, \ - * because we can still send data for other streams. \ - */ \ - r = -EPIPE; \ - goto ret; \ - } \ -} while (0); - -#define TFW_H2_STREAM_SEND_PROCESS(h2, stream, type) \ - r = tfw_h2_stream_send_process(h2, stream, type); \ - if (unlikely(r != STREAM_FSM_RES_OK)) { \ - T_WARN("Failed to process stream %d", (int)r); \ - /* \ - * TODO #1196: \ - * drop all skbs for corresponding stream if \ - * r == STREAM_FSM_RES_TERM_STREAM. \ - */ \ - if (r == STREAM_FSM_RES_TERM_CONN) { \ - r = -EPIPE; \ - goto ret; \ - } \ - } - - BUG_ON(FRAME_ALREADY_PREPARED(flags)); - - /* - * If some error occurs between `tcp_tfw_sk_prepare_xmit` and - * `tcp_tfw_sk_write_xmit`, skb which was already processed will - * be passed to this function again. We should not process this - * skb, just update limit according to already processed bytes. - */ - if (FRAME_HEADERS_OR_DATA_SHOULD_BE_MADE(flags)) { - CHECK_STREAM_IS_PRESENT(stream); - tfw_h2_stream_xmit_reinit(&stream->xmit); - stream->xmit.nskbs = 1; - } else { - struct sk_buff *next = skb; - unsigned short flags; - - /* - * Here we deal with skbs which do not contain HEADERS or - * DATA frames. They should be encrypted in separate tls - * record. 
- */ - *nskbs = 1; - while (!tcp_skb_is_last(sk, next)) { - next = skb_queue_next(&sk->sk_write_queue, next); - flags = skb_tfw_flags(next); - - if (FRAME_HEADERS_OR_DATA_SHOULD_BE_MADE(flags)) - break; - (*nskbs)++; - } - } - - if (flags & SS_F_HTTP2_ACK_FOR_HPACK_TBL_RESIZING) { - h2 = tfw_h2_context_unsafe(conn); - tbl = &h2->hpack.enc_tbl; - - tfw_hpack_set_rbuf_size(tbl, skb_priv); - h2->rsettings.hdr_tbl_sz = tbl->window; - skb_clear_tfw_flag(skb, SS_F_HTTP2_ACK_FOR_HPACK_TBL_RESIZING); - } - - /* - * We should write new hpack dynamic table size at the - * beginning of the first header block. - */ - if (flags & SS_F_HTTP2_FRAME_START && - !(flags & SS_F_HTTT2_HPACK_TBL_SZ_ENCODED) - && FRAME_HEADERS_SHOULD_BE_MADE(flags)) - { - r = tfw_hpack_enc_tbl_write_sz(tbl, sk, skb, stream, - mss_now, &tmp_truesize); - if (unlikely(r)) { - T_WARN("%s: failed to encode new hpack dynamic " - "table size (%d)", __func__, r); - goto ret; - } - - flags |= (tmp_truesize ? SS_F_HTTT2_HPACK_TBL_SZ_ENCODED : 0); - skb_set_tfw_flags(skb, flags); - } - - truesize += tmp_truesize; - tmp_truesize = 0; - - if (FRAME_HEADERS_SHOULD_BE_MADE(flags)) { - if (*limit - stream->xmit.processed <= FRAME_HEADER_SIZE) { - r = -ENOMEM; - goto ret; - } - - r = tfw_h2_make_headers_frames(sk, skb, h2, stream, mss_now, - *limit - stream->xmit.processed, - &tmp_truesize); - if (unlikely(r)) { - T_WARN("%s: failed to make headers frames (%d)", - __func__, r); - goto ret; - } - - truesize += tmp_truesize; - tmp_truesize = 0; - headers_was_done = true; - - /* - * We clear this flag to prevent it's copying - * during skb splitting. - */ - if (!stream->xmit.h_len) { - skb_clear_tfw_flag(skb, SS_F_HTTT2_FRAME_HEADERS); - TFW_H2_STREAM_SEND_PROCESS(h2, stream, HTTP2_HEADERS); - } - } - - if (FRAME_DATA_SHOULD_BE_MADE(flags)) { - if (stream->rem_wnd <= 0 || h2->rem_wnd <= 0 - || *limit - stream->xmit.processed <= FRAME_HEADER_SIZE) { - if (headers_was_done) - goto update_limit; - r = -ENOMEM; - goto ret; - } - - r = tfw_h2_make_data_frames(sk, skb, h2, stream, mss_now, - *limit - stream->xmit.processed, - &tmp_truesize); - if (unlikely(r)) { - T_WARN("%s: failed to make data frames (%d)", - __func__, r); - if (r == -ENOMEM && headers_was_done) { - r = 0; - goto update_limit; - } - goto ret; - } - - truesize += tmp_truesize; - tmp_truesize = 0; - - /* - * We clear this flag to prevent it's copying - * during skb splitting. - */ - if (!stream->xmit.b_len) { - skb_clear_tfw_flag(skb, SS_F_HTTT2_FRAME_DATA); - TFW_H2_STREAM_SEND_PROCESS(h2, stream, HTTP2_DATA); - } - } - -update_limit: - if (FRAME_HEADERS_OR_DATA_SHOULD_BE_MADE(flags) - && stream && stream->xmit.nskbs == 1) - *limit = stream->xmit.processed; - - if (skb->len > *limit) { - unsigned short saved_flags = skb_tfw_flags(skb); - - /* - * Hacky way to clear flags of skb that will be created after - * splitting such skb must be with cleared flags, but - * current skb must be with already set flags. - */ - skb->tfw_cb.flags &= (unsigned short)(~TEMPESTA_SKB_FLAG_CLEAR_MASK); - r = tso_fragment(sk, skb, *limit, mss_now, - sk_gfp_mask(sk, GFP_ATOMIC)); - skb->tfw_cb.flags = saved_flags; - } - -ret: - /* Reinit stream xmit context. */ - if (stream) - *nskbs = !r ? stream->xmit.nskbs : 0; - - /* - * Since we add some data to skb, we should adjust the socket write - * memory both in case of success and in case of failure. 
- */ - if (unlikely(ss_add_overhead(sk, truesize))) { - T_WARN("%s: failed to add overhead to current TCP " - "socket control data.", __func__); - /* - * In case of previous error return it, - * otherwise return -ENOMEM. - */ - r = r ? r : -ENOMEM; - } - - if (unlikely(r) && r != -ENOMEM) { - if (stream) - tfw_h2_stream_add_closed(h2, stream); - } - - if (likely(!r)) - skb_set_tfw_flags(skb, SS_F_HTTP2_FRAME_PREPARED); - - return r; - -#undef TFW_H2_STREAM_SEND_PROCESS -#undef CHECK_STREAM_IS_PRESENT -#undef FRAME_ALREADY_PREPARED -#undef FRAME_HEADERS_OR_DATA_SHOULD_BE_MADE -#undef FRAME_DATA_SHOULD_BE_MADE -#undef FRAME_HEADERS_SHOULD_BE_MADE -} - static int -tfw_sk_prepare_xmit(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, - unsigned int *limit, unsigned int *nskbs) +tfw_sk_fill_write_queue(struct sock *sk, unsigned int mss_now, int ss_action) { TfwConn *conn = sk->sk_user_data; - bool h2_mode; - int r = 0; + TfwH2Ctx *h2; + bool data_is_available = false; + unsigned long snd_wnd; + int r; assert_spin_locked(&sk->sk_lock.slock); /* @@ -443,38 +192,28 @@ tfw_sk_prepare_xmit(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, */ BUG_ON(!conn); - *nskbs = UINT_MAX; - h2_mode = TFW_CONN_PROTO(conn) == TFW_FSM_H2; - if (h2_mode) - r = tfw_h2_sk_prepare_xmit(sk, skb, mss_now, limit, nskbs); - - return r; -} - -static int -tfw_sk_write_xmit(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, - unsigned int limit, unsigned int nskbs) -{ - TfwConn *conn = sk->sk_user_data; - unsigned short flags; - bool h2_mode; - int r = 0; - - assert_spin_locked(&sk->sk_lock.slock); - /* Same as for tfw_sk_prepare_xmit(). */ - BUG_ON(!conn); + /* + * This function can be called both for HTTP1 and HTTP2 connections. + * Moreover this function can be called when HTTP2 connection is + * shutdowned before TLS hadshake was finished. + */ + h2 = TFW_CONN_PROTO(conn) == TFW_FSM_H2 ? + tfw_h2_context_safe(conn) : NULL; + if (!h2) { + if (ss_action == SS_SHUTDOWN) + tcp_shutdown(sk, SEND_SHUTDOWN); + return 0; + } - h2_mode = TFW_CONN_PROTO(conn) == TFW_FSM_H2; - flags = skb_tfw_flags(skb); + snd_wnd = tfw_tcp_calc_snd_wnd(sk, mss_now); //TRY ULONG MAX ALSO - r = tfw_tls_encrypt(sk, skb, mss_now, limit, nskbs); + r = tfw_h2_make_frames(sk, h2, snd_wnd, ss_action, &data_is_available); + if (unlikely(r < 0)) + return r; - if (h2_mode && r != -ENOMEM && (flags & SS_F_HTTT2_HPACK_TBL_SZ_ENCODED)) { - TfwH2Ctx *h2 = tfw_h2_context_unsafe(conn); - TfwHPackETbl *tbl = &h2->hpack.enc_tbl; + if (!data_is_available) + sock_reset_flag(sk, SOCK_TEMPESTA_HAS_DATA); - tfw_hpack_enc_tbl_write_sz_release(tbl, r); - } return r; } @@ -543,8 +282,8 @@ tfw_sock_clnt_new(struct sock *sk) * upcall beside GFSM and SS, but that's efficient and I didn't * find a simple and better solution. */ - sk->sk_prepare_xmit = tfw_sk_prepare_xmit; - sk->sk_write_xmit = tfw_sk_write_xmit; + sk->sk_write_xmit = tfw_tls_encrypt; + sk->sk_fill_write_queue = tfw_sk_fill_write_queue; } /* Activate keepalive timer. */ @@ -944,7 +683,7 @@ tfw_cfgop_keepalive_timeout(TfwCfgSpec *cs, TfwCfgEntry *ce) if (tfw_cli_cfg_ka_timeout < 0) { T_ERR_NL("Unable to parse 'keepalive_timeout' value: '%s'\n", - "Value less the zero"); + "Value less then zero"); return -EINVAL; } diff --git a/fw/ss_skb.c b/fw/ss_skb.c index 4091e0c737..3bf00b49a5 100644 --- a/fw/ss_skb.c +++ b/fw/ss_skb.c @@ -231,7 +231,7 @@ __extend_pgfrags(struct sk_buff *skb_head, struct sk_buff *skb, int from, int n) /* No fragments to shift. 
*/ if (!tail_frags) - return 0; + goto finish; /* * Move @n_excess number of page fragments to new SKB. We @@ -262,6 +262,8 @@ __extend_pgfrags(struct sk_buff *skb_head, struct sk_buff *skb, int from, int n) if (n_shift > 0) memmove(&si->frags[from + n], &si->frags[from], n_shift * sizeof(skb_frag_t)); + +finish: si->nr_frags += n - n_excess; return 0; @@ -1308,11 +1310,15 @@ ss_skb_init_for_xmit(struct sk_buff *skb) struct skb_shared_info *shinfo = skb_shinfo(skb); __u8 pfmemalloc = skb->pfmemalloc; - WARN_ON_ONCE(skb->next || skb->prev); WARN_ON_ONCE(skb->sk); skb_dst_drop(skb); INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); + /* + * Since we use skb->sb for our purpose we should + * zeroed it before pass skb to the kernel. + */ + memset(skb->cb, 0, sizeof(skb->cb)); if (!skb_transport_header_was_set(skb)) { /* Quick path for new skbs. */ @@ -1321,7 +1327,6 @@ ss_skb_init_for_xmit(struct sk_buff *skb) } skb->skb_mstamp_ns = 0; - bzero_fast(skb->cb, sizeof(skb->cb)); nf_reset_ct(skb); skb->mac_len = 0; skb->queue_mapping = 0; diff --git a/fw/ss_skb.h b/fw/ss_skb.h index cb07c90ef9..b4ed84f9c4 100644 --- a/fw/ss_skb.h +++ b/fw/ss_skb.h @@ -23,6 +23,7 @@ #define __TFW_SS_SKB_H__ #include +#include #include "str.h" #include "lib/log.h" @@ -51,6 +52,77 @@ enum { SS_OK = T_OK, }; +typedef int (*on_send_cb_t)(void *conn, struct sk_buff **skb_head); +typedef void (*on_tcp_entail_t)(void *conn, struct sk_buff *skb_head); + +/* + * Tempesta FW sk_buff private data. + * @opaque_data - pointer to some private data (typically http response); + * @destructor - destructor of the opaque data, should be set if data is + * not NULL + * @on_send - callback to special handling this skb before sending; + * @on_tcp_entail - callback to special handling this skb before pushing + * to socket write queue; + * @stream_id - id of sender stream; + * @is_head - flag indicates that this is a head of skb list; + */ +struct tfw_skb_cb { + void *opaque_data; + void (*destructor)(void *opaque_data); + on_send_cb_t on_send; + on_tcp_entail_t on_tcp_entail; + unsigned int stream_id; + bool is_head; +}; + +#define TFW_SKB_CB(skb) ((struct tfw_skb_cb *)&((skb)->cb[0])) + +static inline void +ss_skb_setup_head_of_list(struct sk_buff *skb_head, unsigned int mark, + unsigned char tls_type) +{ + if (tls_type) + skb_set_tfw_tls_type(skb_head, tls_type); + skb_head->mark = mark; + TFW_SKB_CB(skb_head)->is_head = true; +} + +static inline void +ss_skb_destroy_opaque_data(struct sk_buff *skb_head) +{ + void *opaque_data = TFW_SKB_CB(skb_head)->opaque_data; + void (*destructor)(void *) = TFW_SKB_CB(skb_head)->destructor; + + BUILD_BUG_ON(sizeof(struct tfw_skb_cb) > + sizeof(((struct sk_buff *)(0))->cb)); + + if (opaque_data) { + BUG_ON(!destructor); + destructor(opaque_data); + } +} + +static inline int +ss_skb_on_send(void *conn, struct sk_buff **skb_head) +{ + on_send_cb_t on_send = TFW_SKB_CB(*skb_head)->on_send; + int r = 0; + + if (on_send) + r = on_send(conn, skb_head); + + return r; +} + +static inline void +ss_skb_on_tcp_entail(void *conn, struct sk_buff *skb_head) +{ + on_tcp_entail_t on_tcp_entail = TFW_SKB_CB(skb_head)->on_tcp_entail; + + if (on_tcp_entail) + on_tcp_entail(conn, skb_head); +} + typedef int ss_skb_actor_t(void *conn, unsigned char *data, unsigned int len, unsigned int *read); @@ -92,6 +164,25 @@ ss_skb_queue_append(struct sk_buff **skb_head, struct sk_buff *skb) tail->next = skb; } +static inline void +ss_skb_queue_splice(struct sk_buff **skb_head, struct sk_buff **skb) +{ + struct sk_buff *tail; + + if 
((!*skb_head)) { + swap(*skb_head, *skb); + return; + } + + tail = (*skb_head)->prev; + (*skb_head)->prev = (*skb)->prev; + (*skb)->prev->next = (*skb_head); + tail->next = *skb; + (*skb)->prev = tail; + + *skb = NULL; +} + static inline void ss_skb_remove(struct sk_buff *skb) { @@ -169,6 +260,19 @@ ss_skb_insert_before(struct sk_buff **skb_head, struct sk_buff *skb, *skb_head = nskb; } +static inline void +ss_skb_queue_head(struct sk_buff **skb_head, struct sk_buff *skb) +{ + /* The skb shouldn't be in any other queue. */ + WARN_ON_ONCE(skb->next || skb->prev); + if (!*skb_head) { + *skb_head = skb; + skb->prev = skb->next = skb; + return; + } + ss_skb_insert_before(skb_head, *skb_head, skb); +} + /** * Almost a copy of standard skb_dequeue() except it works with skb list * instead of sk_buff_head. Several crucial data include skb list and we don't @@ -294,6 +398,39 @@ ss_skb_move_frags(struct sk_buff *skb, struct sk_buff *nskb, int from, ss_skb_adjust_data_len(nskb, e_size); } +static inline char * +ss_skb_data_ptr_by_offset(struct sk_buff *skb, unsigned int off) +{ + char *begin, *end; + unsigned long d; + unsigned char i; + + if (skb_headlen(skb)) { + begin = skb->data; + end = begin + skb_headlen(skb); + + if (begin + off <= end) + return begin + off; + off -= skb_headlen(skb); + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *f = &skb_shinfo(skb)->frags[i]; + + begin = skb_frag_address(f); + end = begin + skb_frag_size(f); + d = end - begin; + + if (off >= d) { + off -= d; + continue; + } + return begin + off; + } + + return NULL; +} + #define SS_SKB_MAX_DATA_LEN (SKB_MAX_HEADER + MAX_SKB_FRAGS * PAGE_SIZE) char *ss_skb_fmt_src_addr(const struct sk_buff *skb, char *out_buf); diff --git a/fw/sync_socket.h b/fw/sync_socket.h index ee23a70aff..f9986a1fd0 100644 --- a/fw/sync_socket.h +++ b/fw/sync_socket.h @@ -34,6 +34,12 @@ typedef struct ss_proto_t { int type; } SsProto; +typedef enum { + SS_SEND, + SS_CLOSE, + SS_SHUTDOWN, +} SsAction; + /* * Flag bits definition for SsProto.type field. * NOTE: There are also flags definition for this @@ -49,7 +55,7 @@ enum { * requests longer accepted (flag is intended * only for client connections). */ - Conn_Stop = 0x1 << __Flag_Bits, + Conn_Stop = (0x1 << __Flag_Bits), /* * Connection is in special state: we send FIN to * the client and wait until ACK to our FIN is come. @@ -60,7 +66,7 @@ enum { * Connection is in special state: it socket is DEAD * and wait until ACK to our FIN is come. */ - Conn_Closing = 0x3 << __Flag_Bits, + Conn_Closing = (0x3 << __Flag_Bits), }; typedef struct tfw_conn_t TfwConn; @@ -71,11 +77,11 @@ typedef struct ss_hooks { int (*connection_new)(struct sock *sk); /* - * Intentional socket closing when the socket is already closed (i.e. there - * could not be ingress data on it) and we can safely do some cleanup stuff - * or error on TCP connection (on Linux TCP socket layer) associated with - * the socket or at application (data processing) layer, i.e. unintentional - * connection closing. + * Intentional socket closing when the socket is already closed (i.e. + * there could not be ingress data on it) and we can safely do some + * cleanup stuff or error on TCP connection (on Linux TCP socket layer) + * associated with the socket or at application (data processing) + * layer, i.e. unintentional connection closing. * We need the callback since socket closing always has a chance to run * asynchronously on another CPU and a caller doesn't know when it * completes. 
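The tfw_skb_cb block added to fw/ss_skb.h above turns the head skb of a list into a carrier for per-response state: an opaque object plus its destructor, and two hooks the send path invokes (ss_skb_on_send() before the list is handed to TCP, ss_skb_on_tcp_entail() right before it is entailed into the socket write queue). Below is a minimal sketch of how a sender could wire those fields; the my_resp_free/my_on_send/my_setup_list names are invented for illustration, and only the helpers declared in the hunk above come from the patch.

	/*
	 * Illustration only, not part of the patch: park per-list state and
	 * callbacks in the head skb's tfw_skb_cb.
	 */
	static void
	my_resp_free(void *opaque_data)
	{
		/* Release whatever object was parked in cb->opaque_data. */
	}

	static int
	my_on_send(void *conn, struct sk_buff **skb_head)
	{
		/* Last chance to re-queue or postpone the list for this connection. */
		return 0;
	}

	static void
	my_setup_list(struct sk_buff *skb_head, void *my_resp,
		      unsigned int stream_id, unsigned char tls_type)
	{
		struct tfw_skb_cb *cb = TFW_SKB_CB(skb_head);

		/* TLS record type and mark are propagated from the head skb. */
		ss_skb_setup_head_of_list(skb_head, skb_head->mark, tls_type);

		cb->opaque_data = my_resp;
		cb->destructor = my_resp_free;	/* required whenever opaque_data is set */
		cb->on_send = my_on_send;
		cb->stream_id = stream_id;

		/*
		 * Later, the send path runs ss_skb_on_send(conn, &skb_head)
		 * before entailing the list into the TCP write queue, and
		 * ss_skb_destroy_opaque_data(skb_head) when the list is released.
		 */
	}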
@@ -177,6 +183,9 @@ void ss_start(void); void ss_stop(void); bool ss_active(void); void ss_get_stat(SsStat *stat); +void ss_skb_tcp_entail(struct sock *sk, struct sk_buff *skb, unsigned int mark, + unsigned char tls_type); +void ss_skb_tcp_entail_list(struct sock *sk, struct sk_buff **skb_head); #define SS_CALL(f, ...) \ (sk->sk_user_data && ((SsProto *)(sk)->sk_user_data)->hooks->f \ diff --git a/fw/t/unit/helpers.c b/fw/t/unit/helpers.c index 3d5a1cf5b2..91d7c8dbe3 100644 --- a/fw/t/unit/helpers.c +++ b/fw/t/unit/helpers.c @@ -174,6 +174,15 @@ ss_stop(void) { } +void ss_skb_tcp_entail(struct sock *sk, struct sk_buff *skb, unsigned int mark, + unsigned char tls_type) +{ +} + +void ss_skb_tcp_entail_list(struct sock *sk, struct sk_buff **skb) +{ +} + void tfw_client_set_expires_time(unsigned int expires_time) { @@ -433,4 +442,10 @@ ttls_hs_done(TlsCtx *tls) return true; } +bool +ttls_xfrm_need_encrypt(TlsCtx *tls) +{ + return true; +} + unsigned int cache_default_ttl = 60; diff --git a/fw/t/unit/http2.c b/fw/t/unit/http2.c new file mode 120000 index 0000000000..e7e99861e9 --- /dev/null +++ b/fw/t/unit/http2.c @@ -0,0 +1 @@ +../../http2.c \ No newline at end of file diff --git a/fw/t/unit/test.c b/fw/t/unit/test.c index fb6c82fb18..4b951eba73 100644 --- a/fw/t/unit/test.c +++ b/fw/t/unit/test.c @@ -99,6 +99,7 @@ TEST_SUITE(wq); TEST_SUITE(tls); TEST_SUITE(hpack); TEST_SUITE(pool); +TEST_SUITE(ebtree); extern int tfw_pool_init(void); extern void tfw_pool_exit(void); @@ -156,6 +157,9 @@ test_run_all(void) TEST_SUITE_RUN(pool); __fpu_schedule(); + TEST_SUITE_RUN(ebtree); + __fpu_schedule(); + kernel_fpu_end(); tfw_pool_exit(); diff --git a/fw/t/unit/test_ebtree.c b/fw/t/unit/test_ebtree.c new file mode 100644 index 0000000000..de50327db8 --- /dev/null +++ b/fw/t/unit/test_ebtree.c @@ -0,0 +1,52 @@ +#include "test.h" +#include "helpers.h" + +#include "lib/eb64tree.h" + +#define EB64_NODES_MAX 1000 +static struct eb64_node nodes[EB64_NODES_MAX]; + +static unsigned long +find_min_key(struct eb64_node *nodes, int size) +{ + unsigned long min = nodes[0].key; + unsigned int i; + + for (i = 1; i < size; i++) { + if (nodes[i].key < min) + min = nodes[i].key; + } + + return min; +} + +TEST(ebtree, extract_min) +{ + struct eb_root tree = EB_ROOT; + struct eb64_node *root; + unsigned long min; + unsigned int i; + + for (i = 0; i < EB64_NODES_MAX; i++) { + nodes[i].key = get_random_long(); + eb64_insert(&tree, &nodes[i]); + } + + for (i = 0; i < EB64_NODES_MAX; i++) { + /* + * Find minimal node using linear search and compare + * it with the minimal value from the tree. 
+ */ + min = find_min_key(nodes, EB64_NODES_MAX); + root = eb64_first(&tree); + EXPECT_EQ(root->key, min); + eb64_delete(root); + root->key = get_random_long(); + eb64_insert(&tree, root); + } +} + +TEST_SUITE(ebtree) +{ + TEST_RUN(ebtree, extract_min); +} diff --git a/fw/t/unit/test_hpack.c b/fw/t/unit/test_hpack.c index 64ea1e9f5a..8ee320e78e 100644 --- a/fw/t/unit/test_hpack.c +++ b/fw/t/unit/test_hpack.c @@ -20,6 +20,7 @@ #include "hpack.c" #define tfw_connection_send(a, b) 0 #include "http_stream.c" +#include "http_stream_sched.c" #include "http_frame.c" #include "http.c" diff --git a/fw/tcp.h b/fw/tcp.h index 3461cb9c37..5f50e347ce 100644 --- a/fw/tcp.h +++ b/fw/tcp.h @@ -24,6 +24,41 @@ void tfw_tcp_propagate_dseq(struct sock *sk, struct sk_buff *skb); void tfw_tcp_setup_new_skb(struct sock *sk, struct sk_buff *skb, - struct sk_buff *nskb, unsigned int mss_now); + struct sk_buff *nskb, unsigned int mss_now); + +/* + * Calculate window size to send in bytes. We calculate the sender + * and receiver window and select the smallest of them. + * We ajust also @not_account_in_flight counf of skbs, which were + * previously pushed to socket write queue. In `tcp_write_xmit` + * main loop cong_win is calculated on each loop iteration and + * if we calculate `cong_win` for making frames without taking + * into account previously pushed skbs we push more data into + * socket write queue then we can send. + */ +static inline unsigned long +tfw_tcp_calc_snd_wnd(struct sock *sk, unsigned int mss_now) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int in_flight = tcp_packets_in_flight(tp); + unsigned int qlen = skb_queue_len(&sk->sk_write_queue); + unsigned int send_win, cong_win; + + /* + * Update snd_cwnd if nedeed, to correct caclulation + * of count of bytes to send. + */ + tcp_slow_start_after_idle_check(sk); + + if (in_flight + qlen >= tp->snd_cwnd) + return 0; + + if (after(tp->write_seq, tcp_wnd_end(tp))) + return 0; + + cong_win = (tp->snd_cwnd - in_flight - qlen) * mss_now; + send_win = tcp_wnd_end(tp) - tp->write_seq; + return min(cong_win, send_win); +} #endif /* __TFW_TCP_H__ */ diff --git a/fw/tls.c b/fw/tls.c index 1a4749e2ab..579c07e91a 100644 --- a/fw/tls.c +++ b/fw/tls.c @@ -237,16 +237,17 @@ tfw_tls_connection_recv(TfwConn *conn, struct sk_buff *skb) */ int tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, - unsigned int limit, unsigned int nskbs) + unsigned int limit) { /* * TODO #1103 currently even trivial 500-bytes HTTP message generates * 6 segment skb. After the fix the number probably should be decreased. 
*/ #define AUTO_SEGS_N 8 +#define MAX_SEG_N 64 int r = -ENOMEM; - unsigned int head_sz, len, frags, t_sz, out_frags, i = 0; + unsigned int head_sz, len, frags, t_sz, out_frags, next_nents; unsigned char type; struct sk_buff *next = skb, *skb_tail = skb; struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -267,7 +268,7 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, xfrm = &tls->xfrm; T_DBG3("%s: sk=%pK(snd_una=%u snd_nxt=%u limit=%u)" - " skb=%pK(len=%u data_len=%u type=%u frags=%u headlen=%u" + " skb=%px(len=%u data_len=%u type=%u frags=%u headlen=%u" " seq=%u:%u)\n", __func__, sk, tcp_sk(sk)->snd_una, tcp_sk(sk)->snd_nxt, limit, skb, skb->len, skb->data_len, skb_tfw_tls_type(skb), @@ -279,7 +280,7 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, != tcb->end_seq); head_sz = ttls_payload_off(xfrm); - len = head_sz + skb->len + TTLS_TAG_LEN; + len = skb->len; type = skb_tfw_tls_type(skb); if (!type) { T_WARN("%s: bad skb type %u\n", __func__, type); @@ -291,10 +292,11 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, tcb->end_seq += head_sz; /* Try to aggregate several skbs into one TLS record. */ - while (!tcp_skb_is_last(sk, skb_tail) && i++ < nskbs - 1) { + while (!tcp_skb_is_last(sk, skb_tail)) { next = skb_queue_next(&sk->sk_write_queue, skb_tail); + next_nents = skb_shinfo(next)->nr_frags + !!skb_headlen(next); - T_DBG3("next skb (%pK) in write queue: len=%u frags=%u/%u" + T_DBG3("next skb (%px) in write queue: len=%u frags=%u/%u" " type=%u seq=%u:%u\n", next, next->len, skb_shinfo(next)->nr_frags, !!skb_headlen(next), skb_tfw_tls_type(next), @@ -302,6 +304,8 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, if (len + next->len > limit) break; + if (unlikely(sgt.nents + next_nents > MAX_SEG_N)) + break; /* Don't put different message types into the same record. */ if (type != skb_tfw_tls_type(next)) break; @@ -313,11 +317,13 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, tfw_tcp_propagate_dseq(sk, skb_tail); len += next->len; - sgt.nents += skb_shinfo(next)->nr_frags + !!skb_headlen(next); - out_sgt.nents += skb_shinfo(next)->nr_frags + !!skb_headlen(next); + sgt.nents += next_nents; + out_sgt.nents += next_nents; skb_tail = next; } + len += head_sz + TTLS_TAG_LEN; + /* * Use skb_tail->next as skb_head in __extend_pgfrags() to not try to * put TAG to the next skb, which is out of our limit. In worst case, @@ -485,6 +491,7 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, __func__, r); return r; #undef AUTO_SEGS_N +#undef MAX_SEG_N } static inline int @@ -513,6 +520,30 @@ tfw_tls_close_msg_flags(TlsIOCtx *io) return flags; } +static inline int +tfw_tls_on_send_alert(void *conn, struct sk_buff **skb_head) +{ + TfwH2Ctx *ctx; + + ctx = tfw_h2_context_safe((TfwConn *)conn); + if (!ctx) + return 0; + + if (ctx->error && ctx->error->xmit.skb_head) { + ss_skb_queue_splice(&ctx->error->xmit.skb_head, skb_head); + } else if (ctx->cur_send_headers) { + /* + * Other frames (from any stream) MUST NOT occur between + * the HEADERS frame and any CONTINUATION frames that might + * follow. Send TLS alert later. + */ + ctx->error = ctx->cur_send_headers; + ss_skb_queue_splice(&ctx->error->xmit.skb_head, skb_head); + } + + return 0; +} + /** * Callback function which is called by TLS module under tls->lock when it * initiates a record transmission, e.g. alert or a handshake message. 
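The comment on tfw_tcp_calc_snd_wnd() added to fw/tcp.h above is easiest to read with numbers plugged in. The snippet below is a standalone userspace model of that arithmetic (made-up values, no kernel types); it shows why the skbs already sitting in the write queue (qlen) must be subtracted from the congestion window together with the in-flight segments, so that framing does not overfill the queue.

	/* Standalone illustration of the snd_wnd math; values are invented. */
	#include <stdio.h>

	int
	main(void)
	{
		unsigned int mss_now = 1460;
		unsigned int snd_cwnd = 10;	/* congestion window, in segments */
		unsigned int in_flight = 4;	/* sent but not yet ACKed */
		unsigned int qlen = 3;		/* skbs already in sk_write_queue */
		unsigned long send_win = 8000;	/* tcp_wnd_end(tp) - tp->write_seq */
		unsigned long cong_win, snd_wnd;

		if (in_flight + qlen >= snd_cwnd) {
			puts("cwnd already consumed, snd_wnd = 0");
			return 0;
		}

		cong_win = (unsigned long)(snd_cwnd - in_flight - qlen) * mss_now;
		snd_wnd = cong_win < send_win ? cong_win : send_win;

		/* (10 - 4 - 3) * 1460 = 4380 < 8000, so 4380 bytes may be framed. */
		printf("snd_wnd = %lu bytes\n", snd_wnd);
		return 0;
	}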
@@ -589,6 +620,7 @@ tfw_tls_send(TlsCtx *tls, struct sg_table *sgt) io->alert[0] == TTLS_ALERT_LEVEL_FATAL)) { TFW_CONN_TYPE(((TfwConn *)conn)) |= Conn_Stop; flags |= tfw_tls_close_msg_flags(io); + TFW_SKB_CB(io->skb_list)->on_send = tfw_tls_on_send_alert; } r = ss_send(conn->cli_conn.sk, &io->skb_list, flags); diff --git a/fw/tls.h b/fw/tls.h index db2536258b..a9cb243489 100644 --- a/fw/tls.h +++ b/fw/tls.h @@ -31,7 +31,7 @@ void tfw_tls_cfg_configured(bool global); void tfw_tls_set_allow_any_sni(bool match); int tfw_tls_cfg_alpn_protos(const char *cfg_str); int tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, - unsigned int limit, unsigned int nskbs); + unsigned int limit); typedef struct tfw_conn_t TfwConn; int tfw_tls_connection_recv(TfwConn *conn, struct sk_buff *skb); diff --git a/lib/Makefile b/lib/Makefile index f7991a4203..daa009f2b1 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -22,7 +22,7 @@ obj-m = tempesta_lib.o GCOV_PROFILE := $(TFW_GCOV) -tempesta_lib-objs = hash.o main.o +tempesta_lib-objs = hash.o main.o ebtree.o eb64tree.o ifdef AVX2 tempesta_lib-objs += str_simd.o endif diff --git a/lib/eb64tree.c b/lib/eb64tree.c new file mode 100755 index 0000000000..a6a4215088 --- /dev/null +++ b/lib/eb64tree.c @@ -0,0 +1,35 @@ +/* + * Elastic Binary Trees - exported functions for operations on 64bit nodes. + * + * Copyright (C) 2000-2015 Willy Tarreau - w@1wt.eu + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* Consult eb64tree.h for more details about those functions */ + +#include "eb64tree.h" + +struct eb64_node *eb64_insert(struct eb_root *root, struct eb64_node *new) +{ + return __eb64_insert(root, new); +} +EXPORT_SYMBOL(eb64_insert); diff --git a/lib/eb64tree.h b/lib/eb64tree.h new file mode 100755 index 0000000000..81ba27bec0 --- /dev/null +++ b/lib/eb64tree.h @@ -0,0 +1,273 @@ +/* + * Elastic Binary Trees - macros and structures for operations on 64bit nodes. 
+ * + * Copyright (C) 2000-2015 Willy Tarreau - w@1wt.eu + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _EB64TREE_H +#define _EB64TREE_H + +#include "ebtree.h" + +#include + + +/* Return the structure of type whose member points to */ +#define eb64_entry(ptr, type, member) container_of(ptr, type, member) + +#define EB64_ROOT EB_ROOT +#define EB64_TREE_HEAD EB_TREE_HEAD + +/* This structure carries a node, a leaf, and a key. It must start with the + * eb_node so that it can be cast into an eb_node. We could also have put some + * sort of transparent union here to reduce the indirection level, but the fact + * is, the end user is not meant to manipulate internals, so this is pointless. + * In case sizeof(void*)>=sizeof(u64), we know there will be some padding after + * the key if it's unaligned. In this case we force the alignment on void* so + * that we prefer to have the padding before for more efficient accesses. + */ +struct eb64_node { + struct eb_node node; /* the tree node, must be at the beginning */ + MAYBE_ALIGN(sizeof(u64)); + ALWAYS_ALIGN(sizeof(void*)); + u64 key; +} ALIGNED(sizeof(void*)); + +/* + * Exported functions and macros. + * Many of them are always inlined because they are extremely small, and + * are generally called at most once or twice in a program. + */ + +/* Return leftmost node in the tree, or NULL if none */ +static inline struct eb64_node *eb64_first(struct eb_root *root) +{ + return eb64_entry(eb_first(root), struct eb64_node, node); +} + +/* Delete node from the tree if it was linked in. Mark the node unused. Note + * that this function relies on a non-inlined generic function: eb_delete. + */ +static inline void eb64_delete(struct eb64_node *eb64) +{ + eb_delete(&eb64->node); +} + +/* + * The following functions are not inlined by default. They are declared + * in eb64tree.c, which simply relies on their inline version. + */ +struct eb64_node *eb64_insert(struct eb_root *root, struct eb64_node *new); + +/* + * The following functions are less likely to be used directly, because their + * code is larger. The non-inlined version is preferred. + */ + +/* Delete node from the tree if it was linked in. Mark the node unused. */ +static forceinline void __eb64_delete(struct eb64_node *eb64) +{ + __eb_delete(&eb64->node); +} + +/* Insert eb64_node into subtree starting at node root . + * Only new->key needs be set with the key. The eb64_node is returned. 
+ * If root->b[EB_RGHT]==1, the tree may only contain unique keys. + */ +static forceinline struct eb64_node * +__eb64_insert(struct eb_root *root, struct eb64_node *new) { + struct eb64_node *old; + unsigned int side; + eb_troot_t *troot; + u64 newkey; /* caching the key saves approximately one cycle */ + eb_troot_t *root_right; + int old_node_bit; + + side = EB_LEFT; + troot = root->b[EB_LEFT]; + root_right = root->b[EB_RGHT]; + if (unlikely(troot == NULL)) { + /* Tree is empty, insert the leaf part below the left branch */ + root->b[EB_LEFT] = eb_dotag(&new->node.branches, EB_LEAF); + new->node.leaf_p = eb_dotag(root, EB_LEFT); + new->node.node_p = NULL; /* node part unused */ + return new; + } + + /* The tree descent is fairly easy : + * - first, check if we have reached a leaf node + * - second, check if we have gone too far + * - third, reiterate + * Everywhere, we use for the node node we are inserting, + * for the node we attach it to, and for the node we are + * displacing below . will always point to the future node + * (tagged with its type). carries the side the node is + * attached to below its parent, which is also where previous node + * was attached. carries the key being inserted. + */ + newkey = new->key; + + while (1) { + if (unlikely(eb_gettag(troot) == EB_LEAF)) { + eb_troot_t *new_left, *new_rght; + eb_troot_t *new_leaf, *old_leaf; + + old = container_of(eb_untag(troot, EB_LEAF), + struct eb64_node, node.branches); + + new_left = eb_dotag(&new->node.branches, EB_LEFT); + new_rght = eb_dotag(&new->node.branches, EB_RGHT); + new_leaf = eb_dotag(&new->node.branches, EB_LEAF); + old_leaf = eb_dotag(&old->node.branches, EB_LEAF); + + new->node.node_p = old->node.leaf_p; + + /* Right here, we have 3 possibilities : + - the tree does not contain the key, and we have + new->key < old->key. We insert new above old, on + the left ; + + - the tree does not contain the key, and we have + new->key > old->key. We insert new above old, on + the right ; + + - the tree does contain the key, which implies it + is alone. We add the new key next to it as a + first duplicate. + + The last two cases can easily be partially merged. + */ + + if (new->key < old->key) { + new->node.leaf_p = new_left; + old->node.leaf_p = new_rght; + new->node.branches.b[EB_LEFT] = new_leaf; + new->node.branches.b[EB_RGHT] = old_leaf; + } else { + /* we may refuse to duplicate this key if the tree is + * tagged as containing only unique keys. + */ + if ((new->key == old->key) && eb_gettag(root_right)) + return old; + + /* new->key >= old->key, new goes the right */ + old->node.leaf_p = new_left; + new->node.leaf_p = new_rght; + new->node.branches.b[EB_LEFT] = old_leaf; + new->node.branches.b[EB_RGHT] = new_leaf; + + if (new->key == old->key) { + new->node.bit = -1; + root->b[side] = eb_dotag(&new->node.branches, EB_NODE); + return new; + } + } + break; + } + + /* OK we're walking down this link */ + old = container_of(eb_untag(troot, EB_NODE), + struct eb64_node, node.branches); + old_node_bit = old->node.bit; + + /* Stop going down when we don't have common bits anymore. We + * also stop in front of a duplicates tree because it means we + * have to insert above. + */ + + if ((old_node_bit < 0) || /* we're above a duplicate tree, stop here */ + (((new->key ^ old->key) >> old_node_bit) >= EB_NODE_BRANCHES)) { + /* The tree did not contain the key, so we insert before the node + * , and set ->bit to designate the lowest bit position in + * which applies to ->branches.b[]. 
+ */ + eb_troot_t *new_left, *new_rght; + eb_troot_t *new_leaf, *old_node; + + new_left = eb_dotag(&new->node.branches, EB_LEFT); + new_rght = eb_dotag(&new->node.branches, EB_RGHT); + new_leaf = eb_dotag(&new->node.branches, EB_LEAF); + old_node = eb_dotag(&old->node.branches, EB_NODE); + + new->node.node_p = old->node.node_p; + + if (new->key < old->key) { + new->node.leaf_p = new_left; + old->node.node_p = new_rght; + new->node.branches.b[EB_LEFT] = new_leaf; + new->node.branches.b[EB_RGHT] = old_node; + } + else if (new->key > old->key) { + old->node.node_p = new_left; + new->node.leaf_p = new_rght; + new->node.branches.b[EB_LEFT] = old_node; + new->node.branches.b[EB_RGHT] = new_leaf; + } + else { + struct eb_node *ret; + ret = eb_insert_dup(&old->node, &new->node); + return container_of(ret, struct eb64_node, node); + } + break; + } + + /* walk down */ + root = &old->node.branches; + + if (sizeof(long) >= 8) { + side = newkey >> old_node_bit; + } else { + /* note: provides the best code on low-register count archs + * such as i386. + */ + side = newkey; + side >>= old_node_bit; + if (old_node_bit >= 32) { + side = newkey >> 32; + side >>= old_node_bit & 0x1F; + } + } + side &= EB_NODE_BRANCH_MASK; + troot = root->b[side]; + } + + /* Ok, now we are inserting between and . 's + * parent is already set to , and the 's branch is still in + * . Update the root's leaf till we have it. Note that we can also + * find the side by checking the side of new->node.node_p. + */ + + /* We need the common higher bits between new->key and old->key. + * What differences are there between new->key and the node here ? + * NOTE that bit(new) is always < bit(root) because highest + * bit of new->key and old->key are identical here (otherwise they + * would sit on different branches). + */ + /* note that if EB_NODE_BITS > 1, we should check that it's still >= 0 */ + new->node.bit = fls64(new->key ^ old->key) - EB_NODE_BITS; + root->b[side] = eb_dotag(&new->node.branches, EB_NODE); + + return new; +} + +#endif /* _EB64_TREE_H */ diff --git a/lib/ebtree.c b/lib/ebtree.c new file mode 100755 index 0000000000..c9f9953dd2 --- /dev/null +++ b/lib/ebtree.c @@ -0,0 +1,40 @@ +/* + * Elastic Binary Trees - exported generic functions + * + * Copyright (C) 2000-2015 Willy Tarreau - w@1wt.eu + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include "ebtree.h" + +void eb_delete(struct eb_node *node) +{ + __eb_delete(node); +} +EXPORT_SYMBOL(eb_delete); + +/* used by insertion primitives */ +struct eb_node *eb_insert_dup(struct eb_node *sub, struct eb_node *new) +{ + return __eb_insert_dup(sub, new); +} +EXPORT_SYMBOL(eb_insert_dup); diff --git a/lib/ebtree.h b/lib/ebtree.h new file mode 100755 index 0000000000..affddd145c --- /dev/null +++ b/lib/ebtree.h @@ -0,0 +1,597 @@ +/* + * Elastic Binary Trees - generic macros and structures. + * + * Copyright (C) 2000-2015 Willy Tarreau - w@1wt.eu + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + + + +/* + General idea: + ------------- + In a radix binary tree, we may have up to 2N-1 nodes for N keys if all of + them are leaves. If we find a way to differentiate intermediate nodes (later + called "nodes") and final nodes (later called "leaves"), and we associate + them by two, it is possible to build sort of a self-contained radix tree with + intermediate nodes always present. It will not be as cheap as the ultree for + optimal cases as shown below, but the optimal case almost never happens : + + Eg, to store 8, 10, 12, 13, 14 : + + ultree this theorical tree + + 8 8 + / \ / \ + 10 12 10 12 + / \ / \ + 13 14 12 14 + / \ + 12 13 + + Note that on real-world tests (with a scheduler), is was verified that the + case with data on an intermediate node never happens. This is because the + data spectrum is too large for such coincidences to happen. It would require + for instance that a task has its expiration time at an exact second, with + other tasks sharing that second. This is too rare to try to optimize for it. + + What is interesting is that the node will only be added above the leaf when + necessary, which implies that it will always remain somewhere above it. So + both the leaf and the node can share the exact value of the leaf, because + when going down the node, the bit mask will be applied to comparisons. So we + are tempted to have one single key shared between the node and the leaf. + + The bit only serves the nodes, and the dups only serve the leaves. So we can + put a lot of information in common. 
This results in one single entity with + two branch pointers and two parent pointers, one for the node part, and one + for the leaf part : + + node's leaf's + parent parent + | | + [node] [leaf] + / \ + left right + branch branch + + The node may very well refer to its leaf counterpart in one of its branches, + indicating that its own leaf is just below it : + + node's + parent + | + [node] + / \ + left [leaf] + branch + + Adding keys in such a tree simply consists in inserting nodes between + other nodes and/or leaves : + + [root] + | + [node2] + / \ + [leaf1] [node3] + / \ + [leaf2] [leaf3] + + On this diagram, we notice that [node2] and [leaf2] have been pulled away + from each other due to the insertion of [node3], just as if there would be + an elastic between both parts. This elastic-like behaviour gave its name to + the tree : "Elastic Binary Tree", or "EBtree". The entity which associates a + node part and a leaf part will be called an "EB node". + + We also notice on the diagram that there is a root entity required to attach + the tree. It only contains two branches and there is nothing above it. This + is an "EB root". Some will note that [leaf1] has no [node1]. One property of + the EBtree is that all nodes have their branches filled, and that if a node + has only one branch, it does not need to exist. Here, [leaf1] was added + below [root] and did not need any node. + + An EB node contains : + - a pointer to the node's parent (node_p) + - a pointer to the leaf's parent (leaf_p) + - two branches pointing to lower nodes or leaves (branches) + - a bit position (bit) + - an optional key. + + The key here is optional because it's used only during insertion, in order + to classify the nodes. Nothing else in the tree structure requires knowledge + of the key. This makes it possible to write type-agnostic primitives for + everything, and type-specific insertion primitives. This has led to consider + two types of EB nodes. The type-agnostic ones will serve as a header for the + other ones, and will simply be called "struct eb_node". The other ones will + have their type indicated in the structure name. Eg: "struct eb32_node" for + nodes carrying 32 bit keys. + + We will also node that the two branches in a node serve exactly the same + purpose as an EB root. For this reason, a "struct eb_root" will be used as + well inside the struct eb_node. In order to ease pointer manipulation and + ROOT detection when walking upwards, all the pointers inside an eb_node will + point to the eb_root part of the referenced EB nodes, relying on the same + principle as the linked lists in Linux. + + Another important point to note, is that when walking inside a tree, it is + very convenient to know where a node is attached in its parent, and what + type of branch it has below it (leaf or node). In order to simplify the + operations and to speed up the processing, it was decided in this specific + implementation to use the lowest bit from the pointer to designate the side + of the upper pointers (left/right) and the type of a branch (leaf/node). + This practise is not mandatory by design, but an implementation-specific + optimisation permitted on all platforms on which data must be aligned. All + known 32 bit platforms align their integers and pointers to 32 bits, leaving + the two lower bits unused. So, we say that the pointers are "tagged". And + since they designate pointers to root parts, we simply call them + "tagged root pointers", or "eb_troot" in the code. 
+ + Duplicate keys are stored in a special manner. When inserting a key, if + the same one is found, then an incremental binary tree is built at this + place from these keys. This ensures that no special case has to be written + to handle duplicates when walking through the tree or when deleting entries. + It also guarantees that duplicates will be walked in the exact same order + they were inserted. This is very important when trying to achieve fair + processing distribution for instance. + + Algorithmic complexity can be derived from 3 variables : + - the number of possible different keys in the tree : P + - the number of entries in the tree : N + - the number of duplicates for one key : D + + Note that this tree is deliberately NOT balanced. For this reason, the worst + case may happen with a small tree (eg: 32 distinct keys of one bit). BUT, + the operations required to manage such data are so much cheap that they make + it worth using it even under such conditions. For instance, a balanced tree + may require only 6 levels to store those 32 keys when this tree will + require 32. But if per-level operations are 5 times cheaper, it wins. + + Minimal, Maximal and Average times are specified in number of operations. + Minimal is given for best condition, Maximal for worst condition, and the + average is reported for a tree containing random keys. An operation + generally consists in jumping from one node to the other. + + Complexity : + - lookup : min=1, max=log(P), avg=log(N) + - insertion from root : min=1, max=log(P), avg=log(N) + - insertion of dups : min=1, max=log(D), avg=log(D)/2 after lookup + - deletion : min=1, max=1, avg=1 + - prev/next : min=1, max=log(P), avg=2 : + N/2 nodes need 1 hop => 1*N/2 + N/4 nodes need 2 hops => 2*N/4 + N/8 nodes need 3 hops => 3*N/8 + ... + N/x nodes need log(x) hops => log2(x)*N/x + Total cost for all N nodes : sum[i=1..N](log2(i)*N/i) = N*sum[i=1..N](log2(i)/i) + Average cost across N nodes = total / N = sum[i=1..N](log2(i)/i) = 2 + + This design is currently limited to only two branches per node. Most of the + tree descent algorithm would be compatible with more branches (eg: 4, to cut + the height in half), but this would probably require more complex operations + and the deletion algorithm would be problematic. + + Useful properties : + - a node is always added above the leaf it is tied to, and never can get + below nor in another branch. This implies that leaves directly attached + to the root do not use their node part, which is indicated by a NULL + value in node_p. This also enhances the cache efficiency when walking + down the tree, because when the leaf is reached, its node part will + already have been visited (unless it's the first leaf in the tree). + + - pointers to lower nodes or leaves are stored in "branch" pointers. Only + the root node may have a NULL in either branch, it is not possible for + other branches. Since the nodes are attached to the left branch of the + root, it is not possible to see a NULL left branch when walking up a + tree. Thus, an empty tree is immediately identified by a NULL left + branch at the root. Conversely, the one and only way to identify the + root node is to check that it right branch is NULL. Note that the + NULL pointer may have a few low-order bits set. + + - a node connected to its own leaf will have branch[0|1] pointing to + itself, and leaf_p pointing to itself. + + - a node can never have node_p pointing to itself. + + - a node is linked in a tree if and only if it has a non-null leaf_p. 
+ + - a node can never have both branches equal, except for the root which can + have them both NULL. + + - deletion only applies to leaves. When a leaf is deleted, its parent must + be released too (unless it's the root), and its sibling must attach to + the grand-parent, replacing the parent. Also, when a leaf is deleted, + the node tied to this leaf will be removed and must be released too. If + this node is different from the leaf's parent, the freshly released + leaf's parent will be used to replace the node which must go. A released + node will never be used anymore, so there's no point in tracking it. + + - the bit index in a node indicates the bit position in the key which is + represented by the branches. That means that a node with (bit == 0) is + just above two leaves. Negative bit values are used to build a duplicate + tree. The first node above two identical leaves gets (bit == -1). This + value logarithmically decreases as the duplicate tree grows. During + duplicate insertion, a node is inserted above the highest bit value (the + lowest absolute value) in the tree during the right-sided walk. If bit + -1 is not encountered (highest < -1), we insert above last leaf. + Otherwise, we insert above the node with the highest value which was not + equal to the one of its parent + 1. + + - the "eb_next" primitive walks from left to right, which means from lower + to higher keys. It returns duplicates in the order they were inserted. + The "eb_first" primitive returns the left-most entry. + + - the "eb_prev" primitive walks from right to left, which means from + higher to lower keys. It returns duplicates in the opposite order they + were inserted. The "eb_last" primitive returns the right-most entry. + + - a tree which has 1 in the lower bit of its root's right branch is a + tree with unique nodes. This means that when a node is inserted with + a key which already exists will not be inserted, and the previous + entry will be returned. + + */ +#ifndef __EBTREE_H__ +#define __EBTREE_H__ + +#include + +/* By default, gcc does not inline large chunks of code, but we want it to + * respect our choices. + */ +#if !defined(forceinline) +#if __GNUC__ < 3 +#define forceinline inline +#else +#define forceinline inline __attribute__((always_inline)) +#endif +#endif + +/* sets alignment for current field or variable */ +#ifndef ALIGNED +#define ALIGNED(x) __attribute__((aligned(x))) +#endif + +/* add a mandatory alignment for next fields in a structure */ +#ifndef ALWAYS_ALIGN +#define ALWAYS_ALIGN(x) union { } ALIGNED(x) +#endif + +/* add an optional alignment for next fields in a structure, only for archs + * which do not support unaligned accesses. + */ +#ifndef MAYBE_ALIGN +#define MAYBE_ALIGN(x) union { } ALIGNED(x) +#else +#define MAYBE_ALIGN(x) +#endif + +/* Number of bits per node, and number of leaves per node */ +#define EB_NODE_BITS 1 +#define EB_NODE_BRANCHES (1 << EB_NODE_BITS) +#define EB_NODE_BRANCH_MASK (EB_NODE_BRANCHES - 1) + +/* Be careful not to tweak those values. The walking code is optimized for NULL + * detection on the assumption that the following values are intact. + */ +#define EB_LEFT 0 +#define EB_RGHT 1 +#define EB_LEAF 0 +#define EB_NODE 1 + +/* Tags to set in root->b[EB_RGHT] : + * - EB_NORMAL is a normal tree which stores duplicate keys. + * - EB_UNIQUE is a tree which stores unique keys. + */ +#define EB_NORMAL 0 +#define EB_UNIQUE 1 + +/* This is the same as an eb_node pointer, except that the lower bit embeds + * a tag. See eb_dotag()/eb_untag()/eb_gettag(). 
This tag has two meanings : + * - 0=left, 1=right to designate the parent's branch for leaf_p/node_p + * - 0=link, 1=leaf to designate the branch's type for branch[] + */ +typedef void eb_troot_t; + +/* The eb_root connects the node which contains it, to two nodes below it, one + * of which may be the same node. At the top of the tree, we use an eb_root + * too, which always has its right branch NULL (+/1 low-order bits). + */ +struct eb_root { + eb_troot_t *b[EB_NODE_BRANCHES]; /* left and right branches */ +}; + +/* The eb_node contains the two parts, one for the leaf, which always exists, + * and one for the node, which remains unused in the very first node inserted + * into the tree. This structure is 20 bytes per node on 32-bit machines. Do + * not change the order, benchmarks have shown that it's optimal this way. + * Note: be careful about this struct's alignment if it gets included into + * another struct and some atomic ops are expected on the keys or the node. + */ +struct eb_node { + struct eb_root branches; /* branches, must be at the beginning */ + eb_troot_t *node_p; /* link node's parent */ + eb_troot_t *leaf_p; /* leaf node's parent */ + short int bit; /* link's bit position. */ + short unsigned int pfx; /* data prefix length, always related to leaf */ +} __attribute__((packed)); + +/* Return the structure of type whose member points to */ +#define eb_entry(ptr, type, member) container_of(ptr, type, member) + +/* The root of a tree is an eb_root initialized with both pointers NULL. + * During its life, only the left pointer will change. The right one will + * always remain NULL, which is the way we detect it. + */ +#define EB_ROOT \ + (struct eb_root) { \ + .b = {[0] = NULL, [1] = NULL }, \ + } + +#define EB_ROOT_UNIQUE \ + (struct eb_root) { \ + .b = {[0] = NULL, [1] = (void *)1 }, \ + } + +#define EB_TREE_HEAD(name) \ + struct eb_root name = EB_ROOT + + +/***************************************\ + * Private functions. Not for end-user * +\***************************************/ + +/* Converts a root pointer to its equivalent eb_troot_t pointer, + * ready to be stored in ->branch[], leaf_p or node_p. NULL is not + * conserved. To be used with EB_LEAF, EB_NODE, EB_LEFT or EB_RGHT in . + */ +static inline eb_troot_t *eb_dotag(const struct eb_root *root, const int tag) +{ + return (eb_troot_t *)((char *)root + tag); +} + +/* Converts an eb_troot_t pointer pointer to its equivalent eb_root pointer, + * for use with pointers from ->branch[], leaf_p or node_p. NULL is conserved + * as long as the tree is not corrupted. To be used with EB_LEAF, EB_NODE, + * EB_LEFT or EB_RGHT in . + */ +static inline struct eb_root *eb_untag(const eb_troot_t *troot, const int tag) +{ + return (struct eb_root *)((char *)troot - tag); +} + +/* returns the tag associated with an eb_troot_t pointer */ +static inline int eb_gettag(eb_troot_t *troot) +{ + return (unsigned long)troot & 1; +} + +/* Converts a root pointer to its equivalent eb_troot_t pointer and clears the + * tag, no matter what its value was. + */ +static inline struct eb_root *eb_clrtag(const eb_troot_t *troot) +{ + return (struct eb_root *)((unsigned long)troot & ~1UL); +} + +/* Returns a pointer to the eb_node holding */ +static inline struct eb_node *eb_root_to_node(struct eb_root *root) +{ + return container_of(root, struct eb_node, branches); +} + +/* Walks down starting at root pointer , and always walking on side + * . It either returns the node hosting the first leaf on that side, + * or NULL if no leaf is found. 
may either be NULL or a branch pointer. + * The pointer to the leaf (or NULL) is returned. + */ +static inline struct eb_node *eb_walk_down(eb_troot_t *start, unsigned int side) +{ + /* A NULL pointer on an empty tree root will be returned as-is */ + while (eb_gettag(start) == EB_NODE) + start = (eb_untag(start, EB_NODE))->b[side]; + /* NULL is left untouched (root==eb_node, EB_LEAF==0) */ + return eb_root_to_node(eb_untag(start, EB_LEAF)); +} + +/* This function is used to build a tree of duplicates by adding a new node to + * a subtree of at least 2 entries. It will probably never be needed inlined, + * and it is not for end-user. + */ +static forceinline struct eb_node * +__eb_insert_dup(struct eb_node *sub, struct eb_node *new) +{ + struct eb_node *head = sub; + + eb_troot_t *new_left = eb_dotag(&new->branches, EB_LEFT); + eb_troot_t *new_rght = eb_dotag(&new->branches, EB_RGHT); + eb_troot_t *new_leaf = eb_dotag(&new->branches, EB_LEAF); + + /* first, identify the deepest hole on the right branch */ + while (eb_gettag(head->branches.b[EB_RGHT]) != EB_LEAF) { + struct eb_node *last = head; + head = container_of(eb_untag(head->branches.b[EB_RGHT], EB_NODE), + struct eb_node, branches); + if (head->bit > last->bit + 1) + sub = head; /* there's a hole here */ + } + + /* Here we have a leaf attached to (head)->b[EB_RGHT] */ + if (head->bit < -1) { + /* A hole exists just before the leaf, we insert there */ + new->bit = -1; + sub = container_of(eb_untag(head->branches.b[EB_RGHT], EB_LEAF), + struct eb_node, branches); + head->branches.b[EB_RGHT] = eb_dotag(&new->branches, EB_NODE); + + new->node_p = sub->leaf_p; + new->leaf_p = new_rght; + sub->leaf_p = new_left; + new->branches.b[EB_LEFT] = eb_dotag(&sub->branches, EB_LEAF); + new->branches.b[EB_RGHT] = new_leaf; + return new; + } else { + int side; + /* No hole was found before a leaf. We have to insert above + * . Note that we cannot be certain that is attached + * to the right of its parent, as this is only true if + * is inside the dup tree, not at the head. + */ + new->bit = sub->bit - 1; /* install at the lowest level */ + side = eb_gettag(sub->node_p); + head = container_of(eb_untag(sub->node_p, side), struct eb_node, branches); + head->branches.b[side] = eb_dotag(&new->branches, EB_NODE); + + new->node_p = sub->node_p; + new->leaf_p = new_rght; + sub->node_p = new_left; + new->branches.b[EB_LEFT] = eb_dotag(&sub->branches, EB_NODE); + new->branches.b[EB_RGHT] = new_leaf; + return new; + } +} + + +/**************************************\ + * Public functions, for the end-user * +\**************************************/ + +/* Return non-zero if the tree is empty, otherwise zero */ +static inline int eb_is_empty(struct eb_root *root) +{ + return !root->b[EB_LEFT]; +} + +/* Return the first leaf in the tree starting at , or NULL if none */ +static inline struct eb_node *eb_first(struct eb_root *root) +{ + return eb_walk_down(root->b[0], EB_LEFT); +} + +/* Return the last leaf in the tree starting at , or NULL if none */ +static inline struct eb_node *eb_last(struct eb_root *root) +{ + return eb_walk_down(root->b[0], EB_RGHT); +} + +/* Removes a leaf node from the tree if it was still in it. Marks the node + * as unlinked. 
+ */ +static forceinline void __eb_delete(struct eb_node *node) +{ + unsigned int pside, gpside, sibtype; + struct eb_node *parent; + struct eb_root *gparent; + + if (!node->leaf_p) + return; + + /* we need the parent, our side, and the grand parent */ + pside = eb_gettag(node->leaf_p); + parent = eb_root_to_node(eb_untag(node->leaf_p, pside)); + + /* We likely have to release the parent link, unless it's the root, + * in which case we only set our branch to NULL. Note that we can + * only be attached to the root by its left branch. + */ + + if (eb_clrtag(parent->branches.b[EB_RGHT]) == NULL) { + /* we're just below the root, it's trivial. */ + parent->branches.b[EB_LEFT] = NULL; + goto delete_unlink; + } + + /* To release our parent, we have to identify our sibling, and reparent + * it directly to/from the grand parent. Note that the sibling can + * either be a link or a leaf. + */ + + gpside = eb_gettag(parent->node_p); + gparent = eb_untag(parent->node_p, gpside); + + gparent->b[gpside] = parent->branches.b[!pside]; + sibtype = eb_gettag(gparent->b[gpside]); + + if (sibtype == EB_LEAF) { + eb_root_to_node(eb_untag(gparent->b[gpside], EB_LEAF))->leaf_p = + eb_dotag(gparent, gpside); + } else { + eb_root_to_node(eb_untag(gparent->b[gpside], EB_NODE))->node_p = + eb_dotag(gparent, gpside); + } + /* Mark the parent unused. Note that we do not check if the parent is + * our own node, but that's not a problem because if it is, it will be + * marked unused at the same time, which we'll use below to know we can + * safely remove it. + */ + parent->node_p = NULL; + + /* The parent node has been detached, and is currently unused. It may + * belong to another node, so we cannot remove it that way. Also, our + * own node part might still be used. so we can use this spare node + * to replace ours if needed. + */ + + /* If our link part is unused, we can safely exit now */ + if (!node->node_p) + goto delete_unlink; + + /* From now on, and are necessarily different, and the + * 's node part is in use. By definition, is at least + * below , so keeping its key for the bit string is OK. + */ + + parent->node_p = node->node_p; + parent->branches = node->branches; + parent->bit = node->bit; + + /* We must now update the new node's parent... */ + gpside = eb_gettag(parent->node_p); + gparent = eb_untag(parent->node_p, gpside); + gparent->b[gpside] = eb_dotag(&parent->branches, EB_NODE); + + /* ... 
and its branches */ + for (pside = 0; pside <= 1; pside++) { + if (eb_gettag(parent->branches.b[pside]) == EB_NODE) { + eb_root_to_node(eb_untag(parent->branches.b[pside], EB_NODE))->node_p = + eb_dotag(&parent->branches, pside); + } else { + eb_root_to_node(eb_untag(parent->branches.b[pside], EB_LEAF))->leaf_p = + eb_dotag(&parent->branches, pside); + } + } + delete_unlink: + /* Now the node has been completely unlinked */ + node->leaf_p = NULL; + return; /* tree is not empty yet */ +} + +/* These functions are declared in ebtree.c */ +void eb_delete(struct eb_node *node); +struct eb_node *eb_insert_dup(struct eb_node *sub, struct eb_node *new); + +#endif /* __EBTREE_H__ */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/linux-5.10.35.patch b/linux-5.10.35.patch index dbc293dc1c..a0a4a9a7f4 100644 --- a/linux-5.10.35.patch +++ b/linux-5.10.35.patch @@ -535,7 +535,7 @@ index e37480b5f..8236d5929 100644 /* diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h -index a828cf99c..ea837130d 100644 +index a828cf99c..5d416997a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -232,6 +232,12 @@ @@ -551,22 +551,20 @@ index a828cf99c..ea837130d 100644 /* return minimum truesize of one skb containing X bytes of data */ #define SKB_TRUESIZE(X) ((X) + \ -@@ -724,6 +730,14 @@ struct sk_buff { +@@ -724,6 +730,12 @@ struct sk_buff { * UDP receive path is one user. */ unsigned long dev_scratch; +#ifdef CONFIG_SECURITY_TEMPESTA -+ struct { -+ __u8 present : 1; -+ __u8 tls_type : 7; -+ __u16 flags : 16; -+ unsigned int cb; -+ } tfw_cb; ++ struct { ++ __u8 present : 1; ++ __u8 tls_type : 7; ++ } tfw_cb; +#endif }; }; struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */ -@@ -784,9 +798,15 @@ struct sk_buff { +@@ -784,9 +796,15 @@ struct sk_buff { fclone:2, peeked:1, head_frag:1, @@ -578,11 +576,11 @@ index a828cf99c..ea837130d 100644 __u8 active_extensions; +#endif +#ifdef CONFIG_SECURITY_TEMPESTA -+ __u8 tail_lock:1; ++ __u8 tail_lock:1; #endif /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() -@@ -839,7 +859,6 @@ struct sk_buff { +@@ -839,7 +857,6 @@ struct sk_buff { #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif @@ -590,32 +588,11 @@ index a828cf99c..ea837130d 100644 __u8 ipvs_property:1; __u8 inner_protocol_type:1; __u8 remcsum_offload:1; -@@ -931,6 +950,96 @@ struct sk_buff { +@@ -931,6 +948,43 @@ struct sk_buff { #define SKB_ALLOC_RX 0x02 #define SKB_ALLOC_NAPI 0x04 +#ifdef CONFIG_SECURITY_TEMPESTA -+enum { -+ /* This skb contains start of http2 frame. */ -+ SS_F_HTTP2_FRAME_START = 0x01, -+ /* This skb contains new hpack dynamic table size. */ -+ SS_F_HTTT2_HPACK_TBL_SZ_ENCODED = 0x02, -+ /* This skb contains headers frame. */ -+ SS_F_HTTT2_FRAME_HEADERS = 0x04, -+ /* This skb contains data frame. */ -+ SS_F_HTTT2_FRAME_DATA = 0x08, -+ /* This skb was already prepared. */ -+ SS_F_HTTP2_FRAME_PREPARED = 0x10, -+ /* This skb acks new hpack dynamic tbl size. */ -+ SS_F_HTTP2_ACK_FOR_HPACK_TBL_RESIZING = 0x20, -+ /* -+ * These flags should be cleared when we copy flags -+ * from one skb to another one. 
-+ */ -+ TEMPESTA_SKB_FLAG_CLEAR_MASK = SS_F_HTTP2_ACK_FOR_HPACK_TBL_RESIZING | -+ SS_F_HTTT2_HPACK_TBL_SZ_ENCODED | -+ SS_F_HTTP2_FRAME_START, -+}; + +static inline unsigned long +skb_tfw_is_present(struct sk_buff *skb) @@ -626,9 +603,9 @@ index a828cf99c..ea837130d 100644 +static inline void +skb_set_tfw_tls_type(struct sk_buff *skb, unsigned char tls_type) +{ -+ BUG_ON(tls_type > 0x7F); -+ skb->tfw_cb.present = 1; -+ skb->tfw_cb.tls_type = tls_type; ++ BUG_ON(tls_type > 0x7F); ++ skb->tfw_cb.present = 1; ++ skb->tfw_cb.tls_type = tls_type; +} + +static inline unsigned char @@ -638,38 +615,6 @@ index a828cf99c..ea837130d 100644 +} + +static inline void -+skb_set_tfw_flags(struct sk_buff *skb, unsigned short flags) -+{ -+ skb->tfw_cb.present = 1; -+ skb->tfw_cb.flags |= flags; -+} -+ -+static inline void -+skb_clear_tfw_flag(struct sk_buff *skb, unsigned short flag) -+{ -+ skb->tfw_cb.flags &= ~flag; -+} -+ -+static inline unsigned short -+skb_tfw_flags(struct sk_buff *skb) -+{ -+ return skb->tfw_cb.present ? skb->tfw_cb.flags : 0; -+} -+ -+static inline void -+skb_set_tfw_cb(struct sk_buff *skb, unsigned int cb) -+{ -+ skb->tfw_cb.present = 1; -+ skb->tfw_cb.cb = cb; -+} -+ -+static inline unsigned int -+skb_tfw_cb(struct sk_buff *skb) -+{ -+ return skb->tfw_cb.present ? skb->tfw_cb.cb : 0; -+} -+ -+static inline void +skb_copy_tfw_cb(struct sk_buff *dst, struct sk_buff *src) +{ + dst->dev = src->dev; @@ -687,7 +632,7 @@ index a828cf99c..ea837130d 100644 /** * skb_pfmemalloc - Test if the skb was allocated from PFMEMALLOC reserves * @skb: buffer -@@ -1074,6 +1183,7 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen); +@@ -1074,6 +1128,7 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen); bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, bool *fragstolen, int *delta_truesize); @@ -695,7 +640,7 @@ index a828cf99c..ea837130d 100644 struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, int node); struct sk_buff *__build_skb(void *data, unsigned int frag_size); -@@ -2104,7 +2214,11 @@ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list); +@@ -2104,7 +2159,11 @@ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list); static inline bool skb_is_nonlinear(const struct sk_buff *skb) { @@ -707,7 +652,7 @@ index a828cf99c..ea837130d 100644 } static inline unsigned int skb_headlen(const struct sk_buff *skb) -@@ -2341,6 +2455,20 @@ static inline unsigned int skb_headroom(const struct sk_buff *skb) +@@ -2341,6 +2400,20 @@ static inline unsigned int skb_headroom(const struct sk_buff *skb) return skb->data - skb->head; } @@ -738,7 +683,7 @@ index 000000000..90eedcba5 + * Linux interface for Tempesta FW. + * + * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). -+ * Copyright (C) 2015-2024 Tempesta Technologies, Inc. ++ * Copyright (C) 2015-2023 Tempesta Technologies, Inc. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by @@ -807,40 +752,53 @@ index 89163ef8c..49ad1ddc9 100644 union { struct ip_options_rcu __rcu *ireq_opt; diff --git a/include/net/sock.h b/include/net/sock.h -index 261195598..456f6bd50 100644 +index 261195598..6b7910c55 100644 --- a/include/net/sock.h +++ b/include/net/sock.h -@@ -506,6 +506,19 @@ struct sock { +@@ -506,6 +506,31 @@ struct sock { void (*sk_state_change)(struct sock *sk); void (*sk_data_ready)(struct sock *sk); void (*sk_write_space)(struct sock *sk); +#ifdef CONFIG_SECURITY_TEMPESTA -+ int (*sk_prepare_xmit)(struct sock *sk, -+ struct sk_buff *skb, -+ unsigned int mss_now, -+ unsigned int *limit, -+ unsigned int *skbs); ++ /* ++ * Tempesta FW callback to ecrypt one ++ * or more skb in socket write queue ++ * before sending. ++ */ + int (*sk_write_xmit)(struct sock *sk, + struct sk_buff *skb, + unsigned int mss_now, -+ unsigned int limit, -+ unsigned int skbs); ++ unsigned int limit); ++ /* ++ * Tempesta FW callback to prepare and push ++ * skbs from Tempesta FW private scheduler ++ * to socket write queue according sender ++ * and receiver window. ++ */ ++ int (*sk_fill_write_queue)(struct sock *sk, ++ unsigned int mss_now, ++ int ss_action); ++ /* ++ * Tempesta FW callback to free all private ++ * resources associated with socket. ++ */ + void (*sk_destroy_cb)(struct sock *sk); +#endif void (*sk_error_report)(struct sock *sk); int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); -@@ -861,6 +874,9 @@ enum sock_flags { +@@ -861,6 +886,10 @@ enum sock_flags { SOCK_TXTIME, SOCK_XDP, /* XDP is attached */ SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ +#ifdef CONFIG_SECURITY_TEMPESTA + SOCK_TEMPESTA, /* The socket is managed by Tempesta FW */ ++ SOCK_TEMPESTA_HAS_DATA /* The socket has data in Tempesta FW write queue */ +#endif }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) -@@ -1081,6 +1097,16 @@ static inline void sock_rps_reset_rxhash(struct sock *sk) +@@ -1081,6 +1110,16 @@ static inline void sock_rps_reset_rxhash(struct sock *sk) __rc; \ }) @@ -857,7 +815,7 @@ index 261195598..456f6bd50 100644 int sk_stream_wait_connect(struct sock *sk, long *timeo_p); int sk_stream_wait_memory(struct sock *sk, long *timeo_p); void sk_stream_wait_close(struct sock *sk, long timeo_p); -@@ -1915,8 +1941,7 @@ static inline bool sk_rethink_txhash(struct sock *sk) +@@ -1915,8 +1954,7 @@ static inline bool sk_rethink_txhash(struct sock *sk) static inline struct dst_entry * __sk_dst_get(struct sock *sk) { @@ -868,7 +826,7 @@ index 261195598..456f6bd50 100644 static inline struct dst_entry * diff --git a/include/net/tcp.h b/include/net/tcp.h -index 7d66c61d2..8ec3cbbfb 100644 +index 7d66c61d2..7785fc8a6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -307,6 +307,7 @@ bool tcp_check_oom(struct sock *sk, int shift); @@ -879,16 +837,7 @@ index 7d66c61d2..8ec3cbbfb 100644 #define TCP_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.tcp_statistics, field) #define __TCP_INC_STATS(net, field) __SNMP_INC_STATS((net)->mib.tcp_statistics, field) -@@ -584,6 +585,8 @@ enum tcp_queue { - TCP_FRAG_IN_WRITE_QUEUE, - TCP_FRAG_IN_RTX_QUEUE, - }; -+int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, -+ unsigned int mss_now, gfp_t gfp); - int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, - struct sk_buff *skb, u32 len, - unsigned int mss_now, gfp_t gfp); -@@ 
-653,6 +656,22 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) +@@ -653,6 +654,22 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) /* tcp.c */ void tcp_get_info(struct sock *, struct tcp_info *); @@ -911,6 +860,58 @@ index 7d66c61d2..8ec3cbbfb 100644 /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); +@@ -1858,11 +1875,51 @@ static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct soc + sk_wmem_free_skb(sk, skb); + } + ++#ifdef CONFIG_SECURITY_TEMPESTA ++/** ++ * This function is similar to `tcp_write_err` except that we send ++ * TCP RST to remote peer. We call this function when an error occurs ++ * while sending data from which we cannot recover, so we close the ++ * connection with TCP RST. ++ */ ++static inline void ++tcp_tfw_handle_error(struct sock *sk, int error) ++{ ++ tcp_send_active_reset(sk, GFP_ATOMIC); ++ sk->sk_err = error; ++ sk->sk_error_report(sk); ++ tcp_write_queue_purge(sk); ++ tcp_done(sk); ++} ++#endif ++ + static inline void tcp_push_pending_frames(struct sock *sk) + { ++#ifdef CONFIG_SECURITY_TEMPESTA ++ unsigned int mss_now = 0; ++ ++ if (sock_flag(sk, SOCK_TEMPESTA_HAS_DATA) ++ && sk->sk_fill_write_queue) ++ { ++ int result; ++ ++ mss_now = tcp_current_mss(sk); ++ result = sk->sk_fill_write_queue(sk, mss_now, 0); ++ if (unlikely(result < 0 && result != -ENOMEM)) { ++ tcp_tfw_handle_error(sk, result); ++ return; ++ } ++ } ++#endif + if (tcp_send_head(sk)) { + struct tcp_sock *tp = tcp_sk(sk); + ++#ifdef CONFIG_SECURITY_TEMPESTA ++ if (mss_now != 0) { ++ int nonagle = TCP_NAGLE_OFF | TCP_NAGLE_PUSH; ++ __tcp_push_pending_frames(sk, mss_now, nonagle); ++ } else ++#endif + __tcp_push_pending_frames(sk, tcp_current_mss(sk), tp->nonagle); + } + } diff --git a/include/net/tls.h b/include/net/tls.h index 2bdd80221..356850dda 100644 --- a/include/net/tls.h @@ -1051,7 +1052,7 @@ index 000000000..7ee3ead54 +/** + * Tempesta Memory Reservation + * -+ * Copyright (C) 2015-2024 Tempesta Technologies, Inc. ++ * Copyright (C) 2015-2022 Tempesta Technologies, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by @@ -1408,7 +1409,7 @@ index f35c2e998..6ec40ac3c 100644 } +EXPORT_SYMBOL(reqsk_fastopen_remove); diff --git a/net/core/skbuff.c b/net/core/skbuff.c -index 1301ea694..02ff44569 100644 +index 1301ea694..42fc8a110 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -80,7 +80,9 @@ @@ -2422,7 +2423,7 @@ index f0f67b25c..58fbfb071 100644 return NULL; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index f99494637..6364d7c5f 100644 +index f99494637..14d28bcca 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -39,6 +39,9 @@ @@ -2435,7 +2436,17 @@ index f99494637..6364d7c5f 100644 #include #include -@@ -389,7 +392,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, +@@ -155,6 +158,9 @@ void tcp_cwnd_restart(struct sock *sk, s32 delta) + tp->snd_cwnd_stamp = tcp_jiffies32; + tp->snd_cwnd_used = 0; + } ++#ifdef CONFIG_SECURITY_TEMPESTA ++EXPORT_SYMBOL(tcp_cwnd_restart); ++#endif + + /* Congestion state accounting after a packet has been sent. */ + static void tcp_event_data_sent(struct tcp_sock *tp, +@@ -389,7 +395,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, /* Constructs common control bits of non-data skb. 
If SYN/FIN is present, * auto increment end seqno. */ @@ -2444,7 +2455,7 @@ index f99494637..6364d7c5f 100644 { skb->ip_summed = CHECKSUM_PARTIAL; -@@ -403,6 +406,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) +@@ -403,6 +409,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) seq++; TCP_SKB_CB(skb)->end_seq = seq; } @@ -2452,7 +2463,7 @@ index f99494637..6364d7c5f 100644 static inline bool tcp_urg_mode(const struct tcp_sock *tp) { -@@ -1428,7 +1432,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, +@@ -1428,7 +1435,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, * otherwise socket can stall. */ @@ -2461,7 +2472,7 @@ index f99494637..6364d7c5f 100644 { struct tcp_sock *tp = tcp_sk(sk); -@@ -1439,9 +1443,10 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) +@@ -1439,9 +1446,10 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); } @@ -2473,7 +2484,7 @@ index f99494637..6364d7c5f 100644 { if (skb->len <= mss_now) { /* Avoid the costly divide in the normal -@@ -1454,11 +1459,12 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) +@@ -1454,11 +1462,12 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) TCP_SKB_CB(skb)->tcp_gso_size = mss_now; } } @@ -2487,7 +2498,7 @@ index f99494637..6364d7c5f 100644 { struct tcp_sock *tp = tcp_sk(sk); -@@ -1482,6 +1488,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de +@@ -1482,6 +1491,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de tcp_verify_left_out(tp); } @@ -2495,7 +2506,7 @@ index f99494637..6364d7c5f 100644 static bool tcp_has_tx_tstamp(const struct sk_buff *skb) { -@@ -1489,7 +1496,7 @@ static bool tcp_has_tx_tstamp(const struct sk_buff *skb) +@@ -1489,7 +1499,7 @@ static bool tcp_has_tx_tstamp(const struct sk_buff *skb) (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP); } @@ -2504,7 +2515,7 @@ index f99494637..6364d7c5f 100644 { struct skb_shared_info *shinfo = skb_shinfo(skb); -@@ -1505,12 +1512,14 @@ static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2) +@@ -1505,12 +1515,14 @@ static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2) TCP_SKB_CB(skb)->txstamp_ack = 0; } } @@ -2520,7 +2531,7 @@ index f99494637..6364d7c5f 100644 /* Insert buff after skb on the write or rtx queue of sk. */ static void tcp_insert_write_queue_after(struct sk_buff *skb, -@@ -1518,12 +1527,39 @@ static void tcp_insert_write_queue_after(struct sk_buff *skb, +@@ -1518,12 +1530,39 @@ static void tcp_insert_write_queue_after(struct sk_buff *skb, struct sock *sk, enum tcp_queue tcp_queue) { @@ -2560,7 +2571,7 @@ index f99494637..6364d7c5f 100644 /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. -@@ -1561,7 +1597,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, +@@ -1561,7 +1600,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, return -ENOMEM; } @@ -2569,7 +2580,7 @@ index f99494637..6364d7c5f 100644 return -ENOMEM; /* Get a new skb... force flag on. 
*/ -@@ -1575,6 +1611,9 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, +@@ -1575,6 +1614,9 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, nlen = skb->len - len - nsize; buff->truesize += nlen; skb->truesize -= nlen; @@ -2579,7 +2590,7 @@ index f99494637..6364d7c5f 100644 /* Correct the sequence numbers. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; -@@ -1670,7 +1709,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) +@@ -1670,7 +1712,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { u32 delta_truesize; @@ -2588,7 +2599,7 @@ index f99494637..6364d7c5f 100644 return -ENOMEM; delta_truesize = __pskb_trim_head(skb, len); -@@ -1848,6 +1887,7 @@ unsigned int tcp_current_mss(struct sock *sk) +@@ -1848,6 +1890,7 @@ unsigned int tcp_current_mss(struct sock *sk) return mss_now; } @@ -2596,18 +2607,7 @@ index f99494637..6364d7c5f 100644 /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. * As additional protections, we do not touch cwnd in retransmission phases, -@@ -2108,8 +2148,8 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, - * know that all the data is in scatter-gather pages, and that the - * packet has never been sent out before (and thus is not cloned). - */ --static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, -- unsigned int mss_now, gfp_t gfp) -+int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, -+ unsigned int mss_now, gfp_t gfp) - { - int nlen = skb->len - len; - struct sk_buff *buff; -@@ -2129,6 +2169,9 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, +@@ -2129,6 +2172,9 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, sk_mem_charge(sk, buff->truesize); buff->truesize += nlen; skb->truesize -= nlen; @@ -2617,15 +2617,7 @@ index f99494637..6364d7c5f 100644 /* Correct the sequence numbers. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; -@@ -2159,6 +2202,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, - - return 0; - } -+EXPORT_SYMBOL(tso_fragment); - - /* Try to defer sending, if possible, in order to minimize the amount - * of TSO splitting we do. View it as a kind of TSO Nagle test. -@@ -2303,6 +2347,14 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) +@@ -2303,6 +2349,14 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb)) return false; @@ -2640,57 +2632,46 @@ index f99494637..6364d7c5f 100644 len -= skb->len; } -@@ -2577,6 +2629,78 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) +@@ -2577,6 +2631,66 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) tcp_chrono_set(tp, TCP_CHRONO_BUSY); } +#ifdef CONFIG_SECURITY_TEMPESTA + +/** -+ * The next two functions are called from places: from `tcp_write_xmit` ++ * The next funtion is called from places: from `tcp_write_xmit` + * (a usual case) and from `tcp_write_wakeup`. In other places where + * `tcp_transmit_skb` is called we deal with special TCP skbs or skbs + * not from tcp send queue. 
+ */ +static int -+tcp_tfw_sk_prepare_xmit(struct sock *sk, struct sk_buff *skb, -+ unsigned int mss_now, unsigned int *limit, -+ unsigned int *nskbs) -+{ -+ if (!sk->sk_prepare_xmit || !skb_tfw_tls_type(skb)) -+ return 0; -+ -+ if (unlikely(*limit <= TLS_MAX_OVERHEAD)) { -+ net_warn_ratelimited("%s: too small MSS %u" -+ " for TLS\n", -+ __func__, mss_now); -+ return -ENOMEM; -+ } -+ -+ if (*limit > TLS_MAX_PAYLOAD_SIZE + TLS_MAX_OVERHEAD) -+ *limit = TLS_MAX_PAYLOAD_SIZE; -+ else -+ *limit -= TLS_MAX_OVERHEAD; -+ -+ if (unlikely(skb_tfw_flags(skb) & SS_F_HTTP2_FRAME_PREPARED)) { -+ *nskbs = 1; -+ return 0; -+ } -+ -+ return sk->sk_prepare_xmit(sk, skb, mss_now, limit, nskbs); -+} -+ -+static int +tcp_tfw_sk_write_xmit(struct sock *sk, struct sk_buff *skb, -+ unsigned int mss_now, unsigned int limit, -+ unsigned int nskbs) ++ unsigned int mss_now) +{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ unsigned int in_flight = tcp_packets_in_flight(tp); ++ unsigned int send_win, cong_win; ++ unsigned int limit; + int result; + + if (!sk->sk_write_xmit || !skb_tfw_tls_type(skb)) + return 0; + -+ result = sk->sk_write_xmit(sk, skb, mss_now, limit, nskbs); ++ /* Should be checked early. */ ++ BUG_ON(after(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))); ++ cong_win = (tp->snd_cwnd - in_flight) * mss_now; ++ send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; ++ /* ++ * A receive side doesn’t start to process a TLS recod until ++ * it’s fully read from a socket. Too small record size causes ++ * too much overhead. On the other side too large record size ++ * can lead to significant delays on receive side if current ++ * TCP congestion and/or the receiver’s advertised window are ++ * smaller than a TLS record size. ++ */ ++ limit = min3(cong_win, send_win, (unsigned int)TLS_MAX_PAYLOAD_SIZE); ++ ++ result = sk->sk_write_xmit(sk, skb, mss_now, limit); + if (unlikely(result)) + return result; + @@ -2699,61 +2680,51 @@ index f99494637..6364d7c5f 100644 + return 0; +} + -+/** -+ * This function is similar to `tcp_write_err` except that we send -+ * TCP RST to remote peer. We call this function when an error occurs -+ * while sending data from which we cannot recover, so we close the -+ * connection with TCP RST. ++/* ++ * We should recalculate max_size, and split skb according ++ * new limit, because we add extra TLS_MAX_OVERHEAD bytes ++ * during tls encription. If we don't adjust it, we push ++ * skb with incorrect length to network. + */ -+static void -+tcp_tfw_handle_error(struct sock *sk, int error) -+{ -+ tcp_send_active_reset(sk, GFP_ATOMIC); -+ sk->sk_err = error; -+ sk->sk_error_report(sk); -+ tcp_write_queue_purge(sk); -+ tcp_done(sk); -+} ++#define TFW_ADJUST_TLS_OVERHEAD(max_size) \ ++do { \ ++ if (max_size > TLS_MAX_PAYLOAD_SIZE + TLS_MAX_OVERHEAD) \ ++ max_size = TLS_MAX_PAYLOAD_SIZE; \ ++ else \ ++ max_size -= TLS_MAX_OVERHEAD; \ ++} while(0) ++ +#endif + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. 
-@@ -2601,6 +2725,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, - int result; - bool is_cwnd_limited = false, is_rwnd_limited = false; - u32 max_segs; -+#ifdef CONFIG_SECURITY_TEMPESTA -+ unsigned int nskbs = UINT_MAX; -+#endif - - sent_pkts = 0; - -@@ -2666,7 +2793,16 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +@@ -2666,7 +2780,17 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, cwnd_quota, max_segs), nonagle); - +#ifdef CONFIG_SECURITY_TEMPESTA -+ result = tcp_tfw_sk_prepare_xmit(sk, skb, mss_now, &limit, -+ &nskbs); -+ if (unlikely(result)) { -+ if (result == -ENOMEM) -+ break; /* try again next time */ -+ tcp_tfw_handle_error(sk, result); -+ return false; ++ if (sk->sk_write_xmit && skb_tfw_tls_type(skb)) { ++ if (unlikely(limit <= TLS_MAX_OVERHEAD)) { ++ net_warn_ratelimited("%s: too small MSS %u" ++ " for TLS\n", ++ __func__, mss_now); ++ break; ++ } ++ TFW_ADJUST_TLS_OVERHEAD(limit); + } +#endif if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; -@@ -2681,7 +2817,15 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +@@ -2681,7 +2805,15 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, */ if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) break; - +#ifdef CONFIG_SECURITY_TEMPESTA -+ result = tcp_tfw_sk_write_xmit(sk, skb, mss_now, limit, nskbs); ++ result = tcp_tfw_sk_write_xmit(sk, skb, mss_now); + if (unlikely(result)) { + if (result == -ENOMEM) + break; /* try again next time */ @@ -2764,7 +2735,7 @@ index f99494637..6364d7c5f 100644 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; -@@ -2866,6 +3010,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, +@@ -2866,6 +2998,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, sk_gfp_mask(sk, GFP_ATOMIC))) tcp_check_probe_timer(sk); } @@ -2772,7 +2743,7 @@ index f99494637..6364d7c5f 100644 /* Send _single_ skb sitting at the send head. This function requires * true push pending frames to setup probe timer etc. -@@ -3183,7 +3328,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) +@@ -3183,7 +3316,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { @@ -2781,7 +2752,7 @@ index f99494637..6364d7c5f 100644 return -ENOMEM; diff = tcp_skb_pcount(skb); -@@ -3421,6 +3566,7 @@ void tcp_send_fin(struct sock *sk) +@@ -3421,6 +3554,7 @@ void tcp_send_fin(struct sock *sk) } __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF); } @@ -2789,7 +2760,7 @@ index f99494637..6364d7c5f 100644 /* We get here when a process closes a file descriptor (either due to * an explicit close() or as a byproduct of exit()'ing) and there -@@ -3454,6 +3600,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) +@@ -3454,6 +3588,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) */ trace_tcp_send_reset(sk, NULL); } @@ -2797,39 +2768,31 @@ index f99494637..6364d7c5f 100644 /* Send a crossed SYN-ACK during socket establishment. 
* WARNING: This routine must only be called when we have already sent -@@ -4030,6 +4177,9 @@ int tcp_write_wakeup(struct sock *sk, int mib) - - skb = tcp_send_head(sk); - if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { -+#ifdef CONFIG_SECURITY_TEMPESTA -+ unsigned int nskbs = UINT_MAX; -+#endif - int err; - unsigned int mss = tcp_current_mss(sk); - unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; -@@ -4037,6 +4187,15 @@ int tcp_write_wakeup(struct sock *sk, int mib) - if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) - tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; - -+#ifdef CONFIG_SECURITY_TEMPESTA -+ err = tcp_tfw_sk_prepare_xmit(sk, skb, mss, &seg_size, &nskbs); -+ if (unlikely(err)) { -+ if (err != -ENOMEM) -+ tcp_tfw_handle_error(sk, err); -+ return err; -+ } +@@ -4044,6 +4179,17 @@ int tcp_write_wakeup(struct sock *sk, int mib) + if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || + skb->len > mss) { + seg_size = min(seg_size, mss); ++#ifdef CONFIG_SECURITY_TEMPESTA ++ if (sk->sk_write_xmit && skb_tfw_tls_type(skb)) { ++ if (unlikely(seg_size <= TLS_MAX_OVERHEAD)) { ++ net_warn_ratelimited("%s: too small" ++ " MSS %u for TLS\n", ++ __func__, mss); ++ return -ENOMEM; ++ } ++ TFW_ADJUST_TLS_OVERHEAD(seg_size); ++ } +#endif -+ - /* We are probing the opening of a window - * but the window size is != 0 - * must have been a result SWS avoidance ( sender ) -@@ -4052,6 +4211,16 @@ int tcp_write_wakeup(struct sock *sk, int mib) + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; + if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, + skb, seg_size, mss, GFP_ATOMIC)) +@@ -4052,6 +4198,16 @@ int tcp_write_wakeup(struct sock *sk, int mib) tcp_set_skb_tso_segs(skb, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; + +#ifdef CONFIG_SECURITY_TEMPESTA -+ err = tcp_tfw_sk_write_xmit(sk, skb, mss, seg_size, nskbs); ++ err = tcp_tfw_sk_write_xmit(sk, skb, mss); + if (unlikely(err)) { + if (err != -ENOMEM) + tcp_tfw_handle_error(sk, err); @@ -3010,7 +2973,7 @@ index 000000000..313101304 + * Tempesta FW + * + * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). -+ * Copyright (C) 2015-2024 Tempesta Technologies, Inc. ++ * Copyright (C) 2015-2023 Tempesta Technologies, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by