diff --git a/README.md b/README.md index 8f492eeb4..5d31a0e73 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ Dynomite can be configured through a YAML file specified by the -c or --conf-fil + **gos_interval**: The sleeping time in milliseconds at the end of a gossip round. + **tokens**: The token(s) owned by a node. Currently, we don't support vnode yet so this only works with one token for the time being. + **dyn_seed_provider**: A seed provider implementation to provide a list of seed nodes. -+ **dyn_seeds**: A list of seed nodes in the format: address:port:rack:dc:tokens (node that vnode is not supported yet) ++ **dyn_seeds**: A list of seed nodes in the format: address:port:rack:dc:tokens (note that vnode is not supported yet) + **listen**: The listening address and port (name:port or ip:port) for this server pool. + **timeout**: The timeout value in msec that we wait for to establish a connection to the server or receive a response from a server. By default, we wait indefinitely. + **preconnect**: A boolean value that controls if dynomite should preconnect to all the servers in this pool on process start. Defaults to false. diff --git a/src/dyn_array.c b/src/dyn_array.c index f03a383a7..961b80cdf 100644 --- a/src/dyn_array.c +++ b/src/dyn_array.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -20,209 +20,186 @@ * limitations under the License. 
*/ +#include "dyn_array.h" + #include #include "dyn_core.h" +#include "dyn_util.h" -struct array * -array_create(uint32_t n, size_t size) -{ - struct array *a; +struct array *array_create(uint32_t n, size_t size) { + struct array *a; - ASSERT(n != 0 && size != 0); + ASSERT(n != 0 && size != 0); - a = dn_alloc(sizeof(*a)); - if (a == NULL) { - return NULL; - } + a = dn_alloc(sizeof(*a)); + if (a == NULL) { + return NULL; + } - a->elem = dn_alloc(n * size); - if (a->elem == NULL) { - dn_free(a); - return NULL; - } + a->elem = dn_alloc(n * size); + if (a->elem == NULL) { + dn_free(a); + return NULL; + } - a->nelem = 0; - a->size = size; - a->nalloc = n; + a->nelem = 0; + a->size = size; + a->nalloc = n; - return a; + return a; } -void -array_destroy(struct array *a) -{ - array_deinit(a); - dn_free(a); +void array_destroy(struct array *a) { + array_deinit(a); + dn_free(a); } -rstatus_t -array_init(struct array *a, uint32_t n, size_t size) -{ - ASSERT(n != 0 && size != 0); +rstatus_t array_init(struct array *a, uint32_t n, size_t size) { + ASSERT(n != 0 && size != 0); - a->elem = dn_alloc(n * size); - if (a->elem == NULL) { - return DN_ENOMEM; - } + a->elem = dn_alloc(n * size); + if (a->elem == NULL) { + return DN_ENOMEM; + } - a->nelem = 0; - a->size = size; - a->nalloc = n; + a->nelem = 0; + a->size = size; + a->nalloc = n; - return DN_OK; + return DN_OK; } -void -array_deinit(struct array *a) -{ - //ASSERT(a->nelem == 0); +void array_deinit(struct array *a) { + // ASSERT(a->nelem == 0); - if (a->elem != NULL) { - dn_free(a->elem); - } + if (a->elem != NULL) { + dn_free(a->elem); + } } -uint32_t -array_idx(struct array *a, void *elem) -{ - uint8_t *p, *q; - uint32_t off, idx; +uint32_t array_idx(struct array *a, void *elem) { + uint8_t *p, *q; + uint32_t off, idx; - ASSERT(elem >= a->elem); + ASSERT(elem >= a->elem); - p = a->elem; - q = elem; - off = (uint32_t)(q - p); + p = a->elem; + q = elem; + off = (uint32_t)(q - p); - ASSERT(off % (uint32_t)a->size == 0); + 
ASSERT(off % (uint32_t)a->size == 0); - idx = off / (uint32_t)a->size; + idx = off / (uint32_t)a->size; - return idx; + return idx; } -void * -array_push(struct array *a) -{ - void *elem, *new; - size_t size; - - if (a->nelem == a->nalloc) { +void *array_push(struct array *a) { + void *elem, *new; + size_t size; - /* the array is full; allocate new array */ - size = a->size * a->nalloc; - new = dn_realloc(a->elem, 2 * size); - if (new == NULL) { - return NULL; - } - - a->elem = new; - a->nalloc *= 2; + if (a->nelem == a->nalloc) { + /* the array is full; allocate new array */ + size = a->size * a->nalloc; + new = dn_realloc(a->elem, 2 * size); + if (new == NULL) { + return NULL; } - elem = (uint8_t *)a->elem + a->size * a->nelem; - a->nelem++; + a->elem = new; + a->nalloc *= 2; + } + + elem = (uint8_t *)a->elem + a->size * a->nelem; + a->nelem++; - return elem; + return elem; } -void * -array_pop(struct array *a) -{ - void *elem; +void *array_pop(struct array *a) { + void *elem; - ASSERT(a->nelem != 0); + ASSERT(a->nelem != 0); - a->nelem--; - elem = (uint8_t *)a->elem + a->size * a->nelem; + a->nelem--; + elem = (uint8_t *)a->elem + a->size * a->nelem; - return elem; + return elem; } -void * -array_get(struct array *a, uint32_t idx) -{ - void *elem; +void *array_get(struct array *a, uint32_t idx) { + void *elem; - ASSERT(a->nelem != 0); - ASSERT(idx < a->nelem); + ASSERT(a->nelem != 0); + ASSERT(idx < a->nelem); - elem = (uint8_t *)a->elem + (a->size * idx); + elem = (uint8_t *)a->elem + (a->size * idx); - return elem; + return elem; } -void * -array_top(struct array *a) -{ - ASSERT(a->nelem != 0); +void *array_top(struct array *a) { + ASSERT(a->nelem != 0); - return array_get(a, a->nelem - 1); + return array_get(a, a->nelem - 1); } -void -array_swap(struct array *a, struct array *b) -{ - struct array tmp; +void array_swap(struct array *a, struct array *b) { + struct array tmp; - tmp = *a; - *a = *b; - *b = tmp; + tmp = *a; + *a = *b; + *b = tmp; } /* * Sort nelem 
elements of the array in ascending order based on the * compare comparator. */ -void -array_sort(struct array *a, array_compare_t compare) -{ - ASSERT(a->nelem != 0); +void array_sort(struct array *a, array_compare_t compare) { + ASSERT(a->nelem != 0); - qsort(a->elem, a->nelem, a->size, compare); + qsort(a->elem, a->nelem, a->size, compare); } /* * Calls the func once for each element in the array as long as func returns * success. On failure short-circuits and returns the error status. */ -rstatus_t -array_each(struct array *a, array_each_t func) -{ - uint32_t i, nelem; +rstatus_t array_each(struct array *a, array_each_t func) { + uint32_t i, nelem; - ASSERT(func != NULL); + ASSERT(func != NULL); - for (i = 0, nelem = array_n(a); i < nelem; i++) { - void *elem = array_get(a, i); - rstatus_t status; + for (i = 0, nelem = array_n(a); i < nelem; i++) { + void *elem = array_get(a, i); + rstatus_t status; - status = func(elem); - if (status != DN_OK) { - return status; - } + status = func(elem); + if (status != DN_OK) { + return status; } + } - return DN_OK; + return DN_OK; } -rstatus_t -array_each_2(struct array *a, array_each_2_t func, void *data1, void *data2) -{ - uint32_t i, nelem; +rstatus_t array_each_2(struct array *a, array_each_2_t func, void *data1, + void *data2) { + uint32_t i, nelem; - ASSERT(func != NULL); + ASSERT(func != NULL); - for (i = 0, nelem = array_n(a); i < nelem; i++) { - void *elem = array_get(a, i); - rstatus_t status; + for (i = 0, nelem = array_n(a); i < nelem; i++) { + void *elem = array_get(a, i); + rstatus_t status; - status = func(elem, data1, data2); - if (status != DN_OK) { - return status; - } + status = func(elem, data1, data2); + if (status != DN_OK) { + return status; } + } - return DN_OK; + return DN_OK; } diff --git a/src/dyn_array.h b/src/dyn_array.h index 7bc356ae1..a29d4b82e 100644 --- a/src/dyn_array.h +++ b/src/dyn_array.h @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed 
storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -20,55 +20,43 @@ * limitations under the License. */ -#include "dyn_core.h" - - #ifndef _DYN_ARRAY_H_ #define _DYN_ARRAY_H_ +#include "dyn_types.h" typedef int (*array_compare_t)(const void *, const void *); typedef rstatus_t (*array_each_t)(void *elem); typedef rstatus_t (*array_each_2_t)(void *elem, void *data1, void *data2); struct array { - uint32_t nelem; /* # element */ - void *elem; /* element */ - size_t size; /* element size */ - uint32_t nalloc; /* # allocated element */ + uint32_t nelem; /* # element */ + void *elem; /* element */ + size_t size; /* element size */ + uint32_t nalloc; /* # allocated element */ }; -#define null_array { 0, NULL, 0, 0 } +#define null_array \ + { 0, NULL, 0, 0 } -static inline void -array_null(struct array *a) -{ - a->nelem = 0; - a->elem = NULL; - a->size = 0; - a->nalloc = 0; +static inline void array_null(struct array *a) { + a->nelem = 0; + a->elem = NULL; + a->size = 0; + a->nalloc = 0; } -static inline void -array_set(struct array *a, void *elem, size_t size, uint32_t nalloc) -{ - a->nelem = 0; - a->elem = elem; - a->size = size; - a->nalloc = nalloc; +static inline void array_set(struct array *a, void *elem, size_t size, + uint32_t nalloc) { + a->nelem = 0; + a->elem = elem; + a->size = size; + a->nalloc = nalloc; } -static inline uint32_t -array_n(const struct array *a) -{ - return a->nelem; -} +static inline uint32_t array_n(const struct array *a) { return a->nelem; } -static inline void -array_reset(struct array *a) -{ - a->nelem = 0; -} +static inline void array_reset(struct array *a) { a->nelem = 0; } struct array *array_create(uint32_t n, size_t size); void array_destroy(struct array *a); @@ -83,6 +71,7 @@ void *array_top(struct array *a); void array_swap(struct 
array *a, struct array *b); void array_sort(struct array *a, array_compare_t compare); rstatus_t array_each(struct array *a, array_each_t func); -rstatus_t array_each_2(struct array *a, array_each_2_t func, void *data1, void *data2); +rstatus_t array_each_2(struct array *a, array_each_2_t func, void *data1, + void *data2); #endif diff --git a/src/dyn_asciilogo.h b/src/dyn_asciilogo.h index e355599d6..fe2e228b8 100644 --- a/src/dyn_asciilogo.h +++ b/src/dyn_asciilogo.h @@ -1,19 +1,17 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2015 Netflix, Inc. + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2015 Netflix, Inc. * * author: Ioannis Papapanagiotou - */ - + */ char *ascii_logo = -" \n" -" # \ m \n" -" mmm# m m mmmm mmm mmmmm mmm mm#mm mmm \n" -" # # \\m m/ # # # # # # # # # # # \n" -" # # #m# # # # # # # # # # #'''' \n" -" \\#m## \\# # # #m# # # # mm#mm mm #mm \n" -" m/\n" -" ##\n"; + " \n" + " # \ m \n" + " mmm# m m mmmm mmm mmmmm mmm mm#mm mmm \n" + " # # \\m m/ # # # # # # # # # # # \n" + " # # #m# # # # # # # # # # #'''' \n" + " \\#m## \\# # # #m# # # # mm#mm mm #mm \n" + " m/\n" + " ##\n"; //" \n\n"; - diff --git a/src/dyn_cbuf.h b/src/dyn_cbuf.h index ef2f85f14..46d0c32ac 100644 --- a/src/dyn_cbuf.h +++ b/src/dyn_cbuf.h @@ -1,182 +1,185 @@ /**************************************************************************** -* -* Since this code originated from code which is public domain, I -* hereby declare this code to be public domain as well. -* -* Dave Hylands - dhylands@gmail.com -* -****************************************************************************/ + * + * Since this code originated from code which is public domain, I + * hereby declare this code to be public domain as well. 
+ * + * Dave Hylands - dhylands@gmail.com + * + ****************************************************************************/ /** -* -* @file CBUF.h -* -* @defgroup CBUF Circular Buffer -* @{ -* -* @brief A simple and efficient set of circular buffer manipulations. -* -* These macros implement a circular buffer which employs get and put -* pointers, in such a way that mutual exclusion is not required -* (assumes one reader & one writer). -* -* It requires that the circular buffer size be a power of two, and the -* size of the buffer needs to smaller than the index. So an 8 bit index -* supports a circular buffer up to ( 1 << 7 ) = 128 entries, and a 16 bit -* index supports a circular buffer up to ( 1 << 15 ) = 32768 entries. -* -* The basis for these routines came from an article in Jack Ganssle's -* Embedded Muse: http://www.ganssle.com/tem/tem110.pdf -* -* In order to offer the most amount of flexibility for embedded environments -* you need to define a macro for the size. -* -* First, you need to name your circular buffer. For this example, we'll -* call it @c myQ. -* -* The size macro that needs to be defined will be the name of the -* circular buffer followed by @c _SIZE. The size must be a power of two -* and it needs to fit in the get/put indicies. i.e. if you use an -* 8 bit index, then the maximum supported size would be 128. -* -* The structure which defines the circular buffer needs to have 3 members -* m_getIdx, m_putIdx, and @c m_entry. -* -* @c m_getIdx and @c m_putIdx need to be unsigned integers of the same size. -* -* @c m_entry needs to be an array of @c xxx_SIZE entries, or a pointer to an -* array of @c xxx_SIZE entries. The type of each entry is entirely up to the -* caller. 
-* -* @code -* #define myQ_SIZE 64 -* -* volatile struct -* { -* uint8_t m_getIdx; -* uint8_t m_putIdx; -* uint8_t m_entry[ myQ_SIZE ]; -* -* } myQ; -* @endcode -* -* You could then use CBUF_Push to add a character to the circular buffer: -* -* @code -* CBUF_Push( myQ, 'x' ); -* @endcode -* -* And CBUF_Pop to retrieve an element from the buffer: -* -* @code -* ch = CBUF_Pop( myQ ); -* @endcode -* -* -****************************************************************************/ - -#if !defined( DYN_CBUF_H ) -#define DYN_CBUF_H - - - -/** -* Initializes the circular buffer for use. -*/ - -#define CBUF_Init( cbuf ) cbuf.m_getIdx = cbuf.m_putIdx = 0 + * + * @file CBUF.h + * + * @defgroup CBUF Circular Buffer + * @{ + * + * @brief A simple and efficient set of circular buffer manipulations. + * + * These macros implement a circular buffer which employs get and put + * pointers, in such a way that mutual exclusion is not required + * (assumes one reader & one writer). + * + * It requires that the circular buffer size be a power of two, and the + * size of the buffer needs to smaller than the index. So an 8 bit index + * supports a circular buffer up to ( 1 << 7 ) = 128 entries, and a 16 bit + * index supports a circular buffer up to ( 1 << 15 ) = 32768 entries. + * + * The basis for these routines came from an article in Jack Ganssle's + * Embedded Muse: http://www.ganssle.com/tem/tem110.pdf + * + * In order to offer the most amount of flexibility for embedded environments + * you need to define a macro for the size. + * + * First, you need to name your circular buffer. For this example, we'll + * call it @c myQ. + * + * The size macro that needs to be defined will be the name of the + * circular buffer followed by @c _SIZE. The size must be a power of two + * and it needs to fit in the get/put indicies. i.e. if you use an + * 8 bit index, then the maximum supported size would be 128. 
+ * + * The structure which defines the circular buffer needs to have 3 members + * m_getIdx, m_putIdx, and @c m_entry. + * + * @c m_getIdx and @c m_putIdx need to be unsigned integers of the same size. + * + * @c m_entry needs to be an array of @c xxx_SIZE entries, or a pointer to an + * array of @c xxx_SIZE entries. The type of each entry is entirely up to the + * caller. + * + * @code + * #define myQ_SIZE 64 + * + * volatile struct + * { + * uint8_t m_getIdx; + * uint8_t m_putIdx; + * uint8_t m_entry[ myQ_SIZE ]; + * + * } myQ; + * @endcode + * + * You could then use CBUF_Push to add a character to the circular buffer: + * + * @code + * CBUF_Push( myQ, 'x' ); + * @endcode + * + * And CBUF_Pop to retrieve an element from the buffer: + * + * @code + * ch = CBUF_Pop( myQ ); + * @endcode + * + * + ****************************************************************************/ + +#if !defined(DYN_CBUF_H) +#define DYN_CBUF_H /** -* Returns the number of elements which are currently -* contained in the circular buffer. -*/ + * Initializes the circular buffer for use. + */ -#define CBUF_Len( cbuf ) ((typeof( cbuf.m_putIdx ))(( cbuf.m_putIdx ) - ( cbuf.m_getIdx ))) +#define CBUF_Init(cbuf) cbuf.m_getIdx = cbuf.m_putIdx = 0 /** -* Appends an element to the end of the circular buffer. The -* element is expected to be of the same type as the @c m_entry -* member. -*/ + * Returns the number of elements which are currently + * contained in the circular buffer. + */ -#define CBUF_Push( cbuf, elem ) (cbuf.m_entry)[ cbuf.m_putIdx++ & (( cbuf##_SIZE ) - 1 )] = (elem) +#define CBUF_Len(cbuf) \ + ((typeof(cbuf.m_putIdx))((cbuf.m_putIdx) - (cbuf.m_getIdx))) /** -* Retrieves an element from the beginning of the circular buffer -*/ + * Appends an element to the end of the circular buffer. The + * element is expected to be of the same type as the @c m_entry + * member. 
+ */ -#define CBUF_Pop( cbuf ) (cbuf.m_entry)[ cbuf.m_getIdx++ & (( cbuf##_SIZE ) - 1 )] +#define CBUF_Push(cbuf, elem) \ + (cbuf.m_entry)[cbuf.m_putIdx++ & ((cbuf##_SIZE) - 1)] = (elem) /** -* Returns a pointer to the last spot that was pushed. -*/ + * Retrieves an element from the beginning of the circular buffer + */ -#define CBUF_GetLastEntryPtr( cbuf ) &(cbuf.m_entry)[ ( cbuf.m_putIdx - 1 ) & (( cbuf##_SIZE ) - 1 )] +#define CBUF_Pop(cbuf) (cbuf.m_entry)[cbuf.m_getIdx++ & ((cbuf##_SIZE) - 1)] /** -* Returns a pointer to the next spot to push. This can be used -* in conjunction with CBUF_AdvancePushIdx to fill out an entry -* before indicating that it's available. It is the caller's -* responsibility to enure that space is available, and that no -* other items are pushed to overwrite the entry returned. -*/ + * Returns a pointer to the last spot that was pushed. + */ -#define CBUF_GetPushEntryPtr( cbuf ) &(cbuf.m_entry)[ cbuf.m_putIdx & (( cbuf##_SIZE ) - 1 )] +#define CBUF_GetLastEntryPtr(cbuf) \ + &(cbuf.m_entry)[(cbuf.m_putIdx - 1) & ((cbuf##_SIZE) - 1)] /** -* Advances the put index. This is useful if you need to -* reserve space for an item but can't fill in the contents -* yet. CBUG_GetLastEntryPtr can be used to get a pointer to -* the item. It is the caller's responsibility to ensure that -* the item isn't popped before the contents are filled in. -*/ + * Returns a pointer to the next spot to push. This can be used + * in conjunction with CBUF_AdvancePushIdx to fill out an entry + * before indicating that it's available. It is the caller's + * responsibility to ensure that space is available, and that no + * other items are pushed to overwrite the entry returned. + */ -#define CBUF_AdvancePushIdx( cbuf ) cbuf.m_putIdx++ +#define CBUF_GetPushEntryPtr(cbuf) \ + &(cbuf.m_entry)[cbuf.m_putIdx & ((cbuf##_SIZE) - 1)] /** -* Advances the get index. This is slightly more efficient than -* popping and tossing the result. -*/ + * Advances the put index.
This is useful if you need to + * reserve space for an item but can't fill in the contents + * yet. CBUF_GetLastEntryPtr can be used to get a pointer to + * the item. It is the caller's responsibility to ensure that + * the item isn't popped before the contents are filled in. + */ -#define CBUF_AdvancePopIdx( cbuf ) cbuf.m_getIdx++ +#define CBUF_AdvancePushIdx(cbuf) cbuf.m_putIdx++ /** -* Retrieves the idx'th element from the beginning of -* the circular buffer -*/ + * Advances the get index. This is slightly more efficient than + * popping and tossing the result. + */ -#define CBUF_Get( cbuf, idx ) (cbuf.m_entry)[( cbuf.m_getIdx + idx ) & (( cbuf##_SIZE ) - 1 )] +#define CBUF_AdvancePopIdx(cbuf) cbuf.m_getIdx++ /** -* Retrieves the idx'th element from the end of the -* circular buffer. -*/ + * Retrieves the idx'th element from the beginning of + * the circular buffer + */ -#define CBUF_GetEnd( cbuf, idx ) (cbuf.m_entry)[( cbuf.m_putIdx - idx - 1 ) & (( cbuf##_SIZE ) - 1 )] +#define CBUF_Get(cbuf, idx) \ + (cbuf.m_entry)[(cbuf.m_getIdx + idx) & ((cbuf##_SIZE) - 1)] /** -* Returns a pointer to the next spot to push. -*/ + * Retrieves the idx'th element from the end of the + * circular buffer. + */ -#define CBUF_GetPopEntryPtr( cbuf ) &(cbuf.m_entry)[ cbuf.m_getIdx & (( cbuf##_SIZE ) - 1 )] +#define CBUF_GetEnd(cbuf, idx) \ + (cbuf.m_entry)[(cbuf.m_putIdx - idx - 1) & ((cbuf##_SIZE) - 1)] /** -* Determines if the circular buffer is empty. -*/ + * Returns a pointer to the next spot to pop. + */ -#define CBUF_IsEmpty( cbuf ) ( CBUF_Len( cbuf ) == 0 ) +#define CBUF_GetPopEntryPtr(cbuf) \ + &(cbuf.m_entry)[cbuf.m_getIdx & ((cbuf##_SIZE) - 1)] /** -* Determines if the circular buffer is full. -*/ + * Determines if the circular buffer is empty. + */ -#define CBUF_IsFull( cbuf ) ( CBUF_Len( cbuf ) == ( cbuf##_SIZE )) +#define CBUF_IsEmpty(cbuf) (CBUF_Len(cbuf) == 0) /** -* Determines if the circular buffer is currenly overflowed or underflowed.
-*/ + * Determines if the circular buffer is full. + */ -#define CBUF_Error( cbuf ) ( CBUF_Len( cbuf ) > cbuf##_SIZE ) +#define CBUF_IsFull(cbuf) (CBUF_Len(cbuf) == (cbuf##_SIZE)) +/** + * Determines if the circular buffer is currently overflowed or underflowed. + */ +#define CBUF_Error(cbuf) (CBUF_Len(cbuf) > cbuf##_SIZE) -#endif // DYN_CBUF_H +#endif // DYN_CBUF_H diff --git a/src/dyn_client.c b/src/dyn_client.c index dda48dc0c..b93067226 100644 --- a/src/dyn_client.c +++ b/src/dyn_client.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -24,143 +24,130 @@ * This is the client connection. It receives requests from the client, and * forwards it to the corresponding peers and local data store server (if this * node owns the token). - * There is fair amount of machinery involved here mainly for consistency feature - * It acts more of a co-ordinator than a mere client connection handler. + * There is a fair amount of machinery involved here mainly for consistency + * feature. It acts more as a coordinator than a mere client connection handler. * - outstanding_msgs_dict : This is a hash table (HT) of request id to request * mapping. When it receives a request, it adds the message to the HT, and * removes it when it finished responding. We need a hash table mainly for - * implementing consistency. When a response is received from a peer, it is + * implementing consistency. When a response is received from a peer, it is * handed over to the client connection. It uses this HT to get the request & * calls the request's response handler.
- * - waiting_to_unref: Now that we distribute messages to multiple nodes and that - * we have consistency, there is a need for the responses to refer back to the - * original requests. This makes cleaning up and connection tear down fairly - * complex. The client connection has to wait for all responses (either a good - * response or a error response due to timeout). Hence the client connection - * should wait for the above HT outstanding_msgs_dict to get empty. This flag - * waiting_to_unref indicates that the client connection is ready to close and - * just waiting for the outstanding messages to finish. + * - waiting_to_unref: Now that we distribute messages to multiple nodes and + * that we have consistency, there is a need for the responses to refer back to + * the original requests. This makes cleaning up and connection tear down + * fairly complex. The client connection has to wait for all responses (either + * a good response or an error response due to timeout). Hence the client + * connection should wait for the above HT outstanding_msgs_dict to get empty. + * This flag waiting_to_unref indicates that the client connection is ready to + * close and just waiting for the outstanding messages to finish.
*/ -#include "dyn_core.h" -#include "dyn_server.h" #include "dyn_client.h" -#include "dyn_dnode_peer.h" +#include "dyn_core.h" #include "dyn_dict_msg_id.h" +#include "dyn_dnode_peer.h" +#include "dyn_server.h" static rstatus_t msg_quorum_rsp_handler(struct msg *req, struct msg *rsp); static msg_response_handler_t msg_get_rsp_handler(struct msg *req); -static rstatus_t rewrite_query_if_necessary(struct msg** req, struct context* ctx); -static rstatus_t fragment_query_if_necessary(struct msg* req, struct conn* conn, - struct msg_tqh* frag_msgq); +static rstatus_t rewrite_query_if_necessary(struct msg **req, + struct context *ctx); +static rstatus_t fragment_query_if_necessary(struct msg *req, struct conn *conn, + struct msg_tqh *frag_msgq); -static void -client_ref(struct conn *conn, void *owner) -{ - struct server_pool *pool = owner; +static void client_ref(struct conn *conn, void *owner) { + struct server_pool *pool = owner; - ASSERT(conn->type == CONN_CLIENT); - ASSERT(conn->owner == NULL); + ASSERT(conn->type == CONN_CLIENT); + ASSERT(conn->owner == NULL); - /* - * We use null pointer as the sockaddr argument in the accept() call as - * we are not interested in the address of the peer for the accepted - * connection - */ - conn->family = 0; - conn->addrlen = 0; - conn->addr = NULL; + /* + * We use null pointer as the sockaddr argument in the accept() call as + * we are not interested in the address of the peer for the accepted + * connection + */ + conn->family = 0; + conn->addrlen = 0; + conn->addr = NULL; - TAILQ_INSERT_TAIL(&pool->c_conn_q, conn, conn_tqe); + TAILQ_INSERT_TAIL(&pool->c_conn_q, conn, conn_tqe); - /* owner of the client connection is the server pool */ - conn->owner = owner; - conn->outstanding_msgs_dict = dictCreate(&msg_table_dict_type, NULL); - conn->waiting_to_unref = 0; + /* owner of the client connection is the server pool */ + conn->owner = owner; + conn->outstanding_msgs_dict = dictCreate(&msg_table_dict_type, NULL); + 
conn->waiting_to_unref = 0; - log_debug(LOG_VVERB, "%s ref owner %p into pool '%.*s'", print_obj(conn), pool, - pool->name.len, pool->name.data); + log_debug(LOG_VVERB, "%s ref owner %p into pool '%.*s'", print_obj(conn), + pool, pool->name.len, pool->name.data); } -static void -client_unref_internal_try_put(struct conn *conn) -{ - ASSERT(conn->waiting_to_unref); - unsigned long msgs = dictSize(conn->outstanding_msgs_dict); - if (msgs != 0) { - log_warn("%s Waiting for %lu outstanding messages", print_obj(conn), msgs); - return; - } - ASSERT(conn->owner != NULL); - conn_event_del_conn(conn); - log_warn("%s unref owner %s", print_obj(conn), print_obj(conn->owner)); - conn->owner = NULL; - dictRelease(conn->outstanding_msgs_dict); - conn->outstanding_msgs_dict = NULL; - conn->waiting_to_unref = 0; - conn_put(conn); +static void client_unref_internal_try_put(struct conn *conn) { + ASSERT(conn->waiting_to_unref); + unsigned long msgs = dictSize(conn->outstanding_msgs_dict); + if (msgs != 0) { + log_warn("%s Waiting for %lu outstanding messages", print_obj(conn), msgs); + return; + } + ASSERT(conn->owner != NULL); + conn_event_del_conn(conn); + log_warn("%s unref owner %s", print_obj(conn), print_obj(conn->owner)); + conn->owner = NULL; + dictRelease(conn->outstanding_msgs_dict); + conn->outstanding_msgs_dict = NULL; + conn->waiting_to_unref = 0; + conn_put(conn); } -static void -client_unref_and_try_put(struct conn *conn) -{ - ASSERT(conn->type == CONN_CLIENT); - - struct server_pool *pool; - pool = conn->owner; - ASSERT(conn->owner != NULL); - ASSERT(TAILQ_COUNT(&pool->c_conn_q) != 0); - TAILQ_REMOVE(&pool->c_conn_q, conn, conn_tqe); - conn->waiting_to_unref = 1; - client_unref_internal_try_put(conn); -} +static void client_unref_and_try_put(struct conn *conn) { + ASSERT(conn->type == CONN_CLIENT); -static void -client_unref(struct conn *conn) -{ - client_unref_and_try_put(conn); + struct server_pool *pool; + pool = conn->owner; + ASSERT(conn->owner != NULL); + 
ASSERT(TAILQ_COUNT(&pool->c_conn_q) != 0); + TAILQ_REMOVE(&pool->c_conn_q, conn, conn_tqe); + conn->waiting_to_unref = 1; + client_unref_internal_try_put(conn); } -static bool -client_active(struct conn *conn) -{ - ASSERT(conn->type == CONN_CLIENT); +static void client_unref(struct conn *conn) { client_unref_and_try_put(conn); } - ASSERT(TAILQ_EMPTY(&conn->imsg_q)); +static bool client_active(struct conn *conn) { + ASSERT(conn->type == CONN_CLIENT); - if (!TAILQ_EMPTY(&conn->omsg_q)) { - log_debug(LOG_VVERB, "%s is active", print_obj(conn)); - return true; - } + ASSERT(TAILQ_EMPTY(&conn->imsg_q)); - if (conn->rmsg != NULL) { - log_debug(LOG_VVERB, "%s is active", print_obj(conn)); - return true; - } + if (!TAILQ_EMPTY(&conn->omsg_q)) { + log_debug(LOG_VVERB, "%s is active", print_obj(conn)); + return true; + } - if (conn->smsg != NULL) { - log_debug(LOG_VVERB, "%s is active", print_obj(conn)); - return true; - } + if (conn->rmsg != NULL) { + log_debug(LOG_VVERB, "%s is active", print_obj(conn)); + return true; + } + + if (conn->smsg != NULL) { + log_debug(LOG_VVERB, "%s is active", print_obj(conn)); + return true; + } - log_debug(LOG_VVERB, "%s is inactive", print_obj(conn)); + log_debug(LOG_VVERB, "%s is inactive", print_obj(conn)); - return false; + return false; } -static void -client_close_stats(struct context *ctx, struct server_pool *pool, err_t err, - unsigned eof) -{ - stats_pool_decr(ctx, client_connections); +static void client_close_stats(struct context *ctx, struct server_pool *pool, + err_t err, unsigned eof) { + stats_pool_decr(ctx, client_connections); - if (eof) { - stats_pool_incr(ctx, client_eof); - return; - } + if (eof) { + stats_pool_incr(ctx, client_eof); + return; + } - switch (err) { + switch (err) { case EPIPE: case ETIMEDOUT: case ECONNRESET: @@ -171,72 +158,71 @@ client_close_stats(struct context *ctx, struct server_pool *pool, err_t err, case EHOSTDOWN: case EHOSTUNREACH: default: - stats_pool_incr(ctx, client_err); - break; - } + 
stats_pool_incr(ctx, client_err); + break; + } } -static void -client_close(struct context *ctx, struct conn *conn) -{ - rstatus_t status; - struct msg *req, *nreq; /* current and next message */ +static void client_close(struct context *ctx, struct conn *conn) { + rstatus_t status; + struct msg *req, *nreq; /* current and next message */ - ASSERT(conn->type == CONN_CLIENT); + ASSERT(conn->type == CONN_CLIENT); - client_close_stats(ctx, conn->owner, conn->err, conn->eof); + client_close_stats(ctx, conn->owner, conn->err, conn->eof); - if (conn->sd < 0) { - client_unref(conn); - return; - } - - req = conn->rmsg; - if (req != NULL) { - conn->rmsg = NULL; + if (conn->sd < 0) { + client_unref(conn); + return; + } - ASSERT(req->selected_rsp == NULL); - ASSERT(req->is_request && !req->done); + req = conn->rmsg; + if (req != NULL) { + conn->rmsg = NULL; - log_info("%s close, discarding pending %s len %"PRIu32, - print_obj(conn), print_obj(req), req->mlen); + ASSERT(req->selected_rsp == NULL); + ASSERT(req->is_request && !req->done); - req_put(req); - } + log_info("%s close, discarding pending %s len %" PRIu32, print_obj(conn), + print_obj(req), req->mlen); - ASSERT(conn->smsg == NULL); - ASSERT(TAILQ_EMPTY(&conn->imsg_q)); + req_put(req); + } - for (req = TAILQ_FIRST(&conn->omsg_q); req != NULL; req = nreq) { - nreq = TAILQ_NEXT(req, c_tqe); + ASSERT(conn->smsg == NULL); + ASSERT(TAILQ_EMPTY(&conn->imsg_q)); - /* dequeue the message (request) from client outq */ - conn_dequeue_outq(ctx, conn, req); + for (req = TAILQ_FIRST(&conn->omsg_q); req != NULL; req = nreq) { + nreq = TAILQ_NEXT(req, c_tqe); - if (req->done || req->selected_rsp) { - log_info("%s close, discarding %s %s len %"PRIu32, print_obj(conn), - req->is_error ? 
"error": "completed", print_obj(req), req->mlen); - req_put(req); - } else { - req->swallow = 1; + /* dequeue the message (request) from client outq */ + conn_dequeue_outq(ctx, conn, req); - ASSERT(req->is_request); - ASSERT(req->selected_rsp == NULL); + if (req->done || req->selected_rsp) { + log_info("%s close, discarding %s %s len %" PRIu32, print_obj(conn), + req->is_error ? "error" : "completed", print_obj(req), + req->mlen); + req_put(req); + } else { + req->swallow = 1; - log_info("%s close, schedule swallow of %s len %"PRIu32, print_obj(conn), - print_obj(req), req->mlen); - } + ASSERT(req->is_request); + ASSERT(req->selected_rsp == NULL); - stats_pool_incr(ctx, client_dropped_requests); + log_info("%s close, schedule swallow of %s len %" PRIu32, print_obj(conn), + print_obj(req), req->mlen); } - ASSERT(TAILQ_EMPTY(&conn->omsg_q)); - status = close(conn->sd); - if (status < 0) { - log_error("close %s failed, ignored: %s", print_obj(conn), strerror(errno)); - } - conn->sd = -1; - client_unref(conn); + stats_pool_incr(ctx, client_dropped_requests); + } + ASSERT(TAILQ_EMPTY(&conn->omsg_q)); + + status = close(conn->sd); + if (status < 0) { + log_error("close %s failed, ignored: %s", print_obj(conn), strerror(errno)); + } + conn->sd = -1; + client_unref(conn); } /* Handle a response to a given request. if this is a quorum setting, choose the @@ -244,146 +230,140 @@ client_close(struct context *ctx, struct conn *conn) * request scenario and then use the post coalesce logic to cook up a combined * response */ -static rstatus_t -client_handle_response(struct conn *conn, msgid_t reqid, struct msg *rsp) -{ - // now the handler owns the response. 
- ASSERT(conn->type == CONN_CLIENT); - // Fetch the original request - struct msg *req = dictFetchValue(conn->outstanding_msgs_dict, &reqid); - if (!req) { - log_notice("looks like we already cleanedup the request for %d", reqid); - rsp_put(rsp); - return DN_OK; - } - // we have to submit the response irrespective of the unref status. - rstatus_t status = msg_handle_response(req, rsp); - if (conn->waiting_to_unref) { - // don't care about the status. - if (req->awaiting_rsps) - return DN_OK; - // all responses received +static rstatus_t client_handle_response(struct conn *conn, msgid_t reqid, + struct msg *rsp) { + // now the handler owns the response. + ASSERT(conn->type == CONN_CLIENT); + // Fetch the original request + struct msg *req = dictFetchValue(conn->outstanding_msgs_dict, &reqid); + if (!req) { + log_notice("looks like we already cleanedup the request for %d", reqid); + rsp_put(rsp); + return DN_OK; + } + // we have to submit the response irrespective of the unref status. + rstatus_t status = msg_handle_response(req, rsp); + if (conn->waiting_to_unref) { + // don't care about the status. + if (req->awaiting_rsps) return DN_OK; + // all responses received + dictDelete(conn->outstanding_msgs_dict, &reqid); + log_info("%s Putting %s", print_obj(conn), print_obj(req)); + req_put(req); + client_unref_internal_try_put(conn); + return DN_OK; + } + if (status == DN_NOOPS) { + // by now the response is dropped + if (!req->awaiting_rsps) { + // if we have sent the response for this request or the connection + // is closed and we are just waiting to drain off the messages. 
+ if (req->rsp_sent) { dictDelete(conn->outstanding_msgs_dict, &reqid); log_info("%s Putting %s", print_obj(conn), print_obj(req)); req_put(req); - client_unref_internal_try_put(conn); - return DN_OK; + } } - if (status == DN_NOOPS) { - // by now the response is dropped - if (!req->awaiting_rsps) { - // if we have sent the response for this request or the connection - // is closed and we are just waiting to drain off the messages. - if (req->rsp_sent) { - dictDelete(conn->outstanding_msgs_dict, &reqid); - log_info("%s Putting %s", print_obj(conn), print_obj(req)); - req_put(req); - } - } - } else if (status == DN_OK) { - g_pre_coalesce(req->selected_rsp); - if (req_done(conn, req)) { - status = conn_event_add_out(conn); - if (status != DN_OK) { - conn->err = errno; - } - } + } else if (status == DN_OK) { + g_pre_coalesce(req->selected_rsp); + if (req_done(conn, req)) { + status = conn_event_add_out(conn); + if (status != DN_OK) { + conn->err = errno; + } } - return status; + } + return status; } -struct msg * -req_recv_next(struct context *ctx, struct conn *conn, bool alloc) -{ - struct msg *req; +struct msg *req_recv_next(struct context *ctx, struct conn *conn, bool alloc) { + struct msg *req; - ASSERT((conn->type == CONN_DNODE_PEER_CLIENT) || - (conn->type = CONN_CLIENT)); + ASSERT((conn->type == CONN_DNODE_PEER_CLIENT) || (conn->type = CONN_CLIENT)); - if (conn->eof) { - req = conn->rmsg; - - //if (conn->dyn_mode) { - // if (conn->non_bytes_recv > MAX_CONN_ALLOWABLE_NON_RECV) { - // conn->err = EPIPE; - // return NULL; - // } - // conn->eof = 0; - // return req; - //} - - /* client sent eof before sending the entire request */ - if (req != NULL) { - conn->rmsg = NULL; + if (conn->eof) { + req = conn->rmsg; - ASSERT(req->selected_rsp == NULL); - ASSERT(req->is_request && !req->done); + // if (conn->dyn_mode) { + // if (conn->non_bytes_recv > MAX_CONN_ALLOWABLE_NON_RECV) { + // conn->err = EPIPE; + // return NULL; + // } + // conn->eof = 0; + // return req; + //} 
- log_error("%s EOF discarding incomplete %s %"PRIu32"", print_obj(conn), print_obj(req), - req->mlen); + /* client sent eof before sending the entire request */ + if (req != NULL) { + conn->rmsg = NULL; - req_put(req); - } + ASSERT(req->selected_rsp == NULL); + ASSERT(req->is_request && !req->done); - /* - * TCP half-close enables the client to terminate its half of the - * connection (i.e. the client no longer sends data), but it still - * is able to receive data from the proxy. The proxy closes its - * half (by sending the second FIN) when the client has no - * outstanding requests - */ - if (!conn_active(conn)) { - conn->done = 1; - log_debug(LOG_INFO, "%s DONE", print_obj(conn)); - } + log_error("%s EOF discarding incomplete %s %" PRIu32 "", print_obj(conn), + print_obj(req), req->mlen); - return NULL; + req_put(req); } - req = conn->rmsg; - if (req != NULL) { - ASSERT(req->is_request); - return req; + /* + * TCP half-close enables the client to terminate its half of the + * connection (i.e. the client no longer sends data), but it still + * is able to receive data from the proxy. 
The proxy closes its + * half (by sending the second FIN) when the client has no + * outstanding requests + */ + if (!conn_active(conn)) { + conn->done = 1; + log_debug(LOG_INFO, "%s DONE", print_obj(conn)); } - if (!alloc) { - return NULL; - } - - req = req_get(conn); - if (req != NULL) { - conn->rmsg = req; - } + return NULL; + } + req = conn->rmsg; + if (req != NULL) { + ASSERT(req->is_request); return req; -} + } -static bool -req_filter(struct context *ctx, struct conn *conn, struct msg *req) -{ - ASSERT(conn->type == CONN_CLIENT); + if (!alloc) { + return NULL; + } - if (msg_empty(req)) { - ASSERT(conn->rmsg == NULL); - log_debug(LOG_VERB, "%s filter empty %s", print_obj(conn), print_obj(req)); - req_put(req); - return true; - } + req = req_get(conn); + if (req != NULL) { + conn->rmsg = req; + } - /* - * Handle "quit\r\n", which is the protocol way of doing a - * passive close - */ - if (req->quit) { - ASSERT(conn->rmsg == NULL); - log_debug(LOG_VERB, "%s filter quit %s", print_obj(conn), print_obj(req)); - conn->eof = 1; - conn->recv_ready = 0; - req_put(req); - return true; - } + return req; +} - return false; +static bool req_filter(struct context *ctx, struct conn *conn, + struct msg *req) { + ASSERT(conn->type == CONN_CLIENT); + + if (msg_empty(req)) { + ASSERT(conn->rmsg == NULL); + log_debug(LOG_VERB, "%s filter empty %s", print_obj(conn), print_obj(req)); + req_put(req); + return true; + } + + /* + * Handle "quit\r\n", which is the protocol way of doing a + * passive close + */ + if (req->quit) { + ASSERT(conn->rmsg == NULL); + log_debug(LOG_VERB, "%s filter quit %s", print_obj(conn), print_obj(req)); + conn->eof = 1; + conn->recv_ready = 0; + req_put(req); + return true; + } + + return false; } /* @@ -404,85 +384,78 @@ send_rsp_integer(struct context *ctx, struct conn *c_conn, struct msg *req) */ /* Expects req to be in the conn's outq already */ -void -req_forward_error(struct context *ctx, struct conn *conn, struct msg *req, - err_t error_code, 
err_t dyn_error_code) -{ - log_info("%s FORWARD FAILED %s len %"PRIu32": %d:%s", print_obj(conn), print_obj(req), req->mlen, - error_code, dn_strerror(error_code)); - - // Nothing to do if request is not expecting a reply. - // The higher layer will take care of freeing the request - if (!req->expect_datastore_reply) { - return; - } - - // Create an appropriate response for the request so its propagated up; - // This response gets dropped in rsp_make_error anyways. But since this is - // an error path its ok with the overhead. - struct msg *rsp = msg_get_error(conn, dyn_error_code, error_code); - rsp->peer = req; - //TODO: Check if this is required in response - rsp->dmsg = dmsg_get(); - rsp->dmsg->id = req->id; - - rstatus_t status = - conn_handle_response(conn, req->parent_id ? req->parent_id : req->id, - rsp); - IGNORE_RET_VAL(status); +void req_forward_error(struct context *ctx, struct conn *conn, struct msg *req, + err_t error_code, err_t dyn_error_code) { + log_info("%s FORWARD FAILED %s len %" PRIu32 ": %d:%s", print_obj(conn), + print_obj(req), req->mlen, error_code, dn_strerror(error_code)); + + // Nothing to do if request is not expecting a reply. + // The higher layer will take care of freeing the request + if (!req->expect_datastore_reply) { + return; + } + + // Create an appropriate response for the request so its propagated up; + // This response gets dropped in rsp_make_error anyways. But since this is + // an error path its ok with the overhead. + struct msg *rsp = msg_get_error(conn, dyn_error_code, error_code); + rsp->peer = req; + // TODO: Check if this is required in response + rsp->dmsg = dmsg_get(); + rsp->dmsg->id = req->id; + + rstatus_t status = conn_handle_response( + conn, req->parent_id ? 
req->parent_id : req->id, rsp); + IGNORE_RET_VAL(status); } -static void -req_redis_stats(struct context *ctx, struct msg *req) -{ - - switch (req->type) { - +static void req_redis_stats(struct context *ctx, struct msg *req) { + switch (req->type) { case MSG_REQ_REDIS_GET: - stats_server_incr(ctx, redis_req_get); - break; + stats_server_incr(ctx, redis_req_get); + break; case MSG_REQ_REDIS_SET: - stats_server_incr(ctx, redis_req_set); - break; + stats_server_incr(ctx, redis_req_set); + break; case MSG_REQ_REDIS_DEL: - stats_server_incr(ctx, redis_req_del); - break; + stats_server_incr(ctx, redis_req_del); + break; case MSG_REQ_REDIS_INCR: case MSG_REQ_REDIS_DECR: - stats_server_incr(ctx, redis_req_incr_decr); - break; + stats_server_incr(ctx, redis_req_incr_decr); + break; case MSG_REQ_REDIS_KEYS: - stats_server_incr(ctx, redis_req_keys); - break; + stats_server_incr(ctx, redis_req_keys); + break; case MSG_REQ_REDIS_MGET: - stats_server_incr(ctx, redis_req_mget); - break; + stats_server_incr(ctx, redis_req_mget); + break; case MSG_REQ_REDIS_SCAN: - stats_server_incr(ctx, redis_req_scan); - break; + stats_server_incr(ctx, redis_req_scan); + break; case MSG_REQ_REDIS_SORT: - stats_server_incr(ctx, redis_req_sort); - break; + stats_server_incr(ctx, redis_req_sort); + break; case MSG_REQ_REDIS_PING: - stats_server_incr(ctx, redis_req_ping); - break; + stats_server_incr(ctx, redis_req_ping); + break; case MSG_REQ_REDIS_LREM: - stats_server_incr(ctx, redis_req_lreqm); - /* do not break as this is a list operation as the following. - * We count twice the LREM because it is an intensive operation/ - * */ + stats_server_incr(ctx, redis_req_lreqm); + /* do not break as this is a list operation as the following. 
+ * We count twice the LREM because it is an intensive operation/ + * */ case MSG_REQ_REDIS_LRANGE: case MSG_REQ_REDIS_LSET: case MSG_REQ_REDIS_LTRIM: case MSG_REQ_REDIS_LINDEX: case MSG_REQ_REDIS_LPUSHX: - stats_server_incr(ctx, redis_req_lists); - break; + stats_server_incr(ctx, redis_req_lists); + break; case MSG_REQ_REDIS_SUNION: - stats_server_incr(ctx, redis_req_sunion); - /* do not break as this is a set operation as the following. - * We count twice the SUNION because it is an intensive operation/ - * */ + stats_server_incr(ctx, redis_req_sunion); + /* do not break as this is a set operation as the following. + * We count twice the SUNION because it is an intensive operation/ + * */ case MSG_REQ_REDIS_SETBIT: case MSG_REQ_REDIS_SETEX: case MSG_REQ_REDIS_SETRANGE: @@ -494,8 +467,8 @@ req_redis_stats(struct context *ctx, struct msg *req) case MSG_REQ_REDIS_SREM: case MSG_REQ_REDIS_SUNIONSTORE: case MSG_REQ_REDIS_SSCAN: - stats_server_incr(ctx, redis_req_set); - break; + stats_server_incr(ctx, redis_req_set); + break; case MSG_REQ_REDIS_ZADD: case MSG_REQ_REDIS_ZINTERSTORE: case MSG_REQ_REDIS_ZRANGE: @@ -513,623 +486,600 @@ req_redis_stats(struct context *ctx, struct msg *req) case MSG_REQ_REDIS_ZREMRANGEBYRANK: case MSG_REQ_REDIS_ZREMRANGEBYSCORE: case MSG_REQ_REDIS_ZREVRANGEBYLEX: - stats_server_incr(ctx, redis_req_sortedsets); - break; + stats_server_incr(ctx, redis_req_sortedsets); + break; case MSG_REQ_REDIS_HINCRBY: case MSG_REQ_REDIS_HINCRBYFLOAT: case MSG_REQ_REDIS_HSET: case MSG_REQ_REDIS_HSETNX: - stats_server_incr(ctx, redis_req_hashes); - break; + stats_server_incr(ctx, redis_req_hashes); + break; default: - stats_server_incr(ctx, redis_req_other); - break; - } + stats_server_incr(ctx, redis_req_other); + break; + } } -static void -req_forward_stats(struct context *ctx, struct msg *req) -{ - ASSERT(req->is_request); +static void req_forward_stats(struct context *ctx, struct msg *req) { + ASSERT(req->is_request); - if (req->is_read) { - 
stats_server_incr(ctx, read_requests); - stats_server_incr_by(ctx, read_request_bytes, req->mlen); - } else { - stats_server_incr(ctx, write_requests); - stats_server_incr_by(ctx, write_request_bytes, req->mlen); - } + if (req->is_read) { + stats_server_incr(ctx, read_requests); + stats_server_incr_by(ctx, read_request_bytes, req->mlen); + } else { + stats_server_incr(ctx, write_requests); + stats_server_incr_by(ctx, write_request_bytes, req->mlen); + } } -rstatus_t -local_req_forward(struct context *ctx, struct conn *c_conn, struct msg *req, - uint8_t *key, uint32_t keylen, dyn_error_t *dyn_error_code) -{ - rstatus_t status; - struct conn * s_conn; - - ASSERT((c_conn->type == CONN_CLIENT) || - (c_conn->type == CONN_DNODE_PEER_CLIENT)); - - s_conn = get_datastore_conn(ctx, c_conn->owner, c_conn->sd); - log_debug(LOG_VERB, "c_conn %p got server conn %p", c_conn, s_conn); - if (s_conn == NULL) { - *dyn_error_code = STORAGE_CONNECTION_REFUSE; - return errno; - } - ASSERT(s_conn->type == CONN_SERVER); - - log_info("%s FORWARD %s to storage conn %s", print_obj(c_conn), print_obj(req), print_obj(s_conn)); - - if (ctx->dyn_state == NORMAL) { - /* enqueue the message (request) into server inq */ - if (TAILQ_EMPTY(&s_conn->imsg_q)) { - status = conn_event_add_out(s_conn); - - if (status != DN_OK) { - *dyn_error_code = DYNOMITE_UNKNOWN_ERROR; - s_conn->err = errno; - return DN_ERROR; - } - } - } else if (ctx->dyn_state == STANDBY) { //no reads/writes from peers/clients - log_debug(LOG_INFO, "Node is in STANDBY state. Drop write/read requests"); - *dyn_error_code = DYNOMITE_INVALID_STATE; - return DN_ERROR; - } else if (ctx->dyn_state == WRITES_ONLY && req->is_read) { - //no reads from peers/clients but allow writes from peers/clients - log_debug(LOG_INFO, "Node is in WRITES_ONLY state. 
Drop read requests"); - *dyn_error_code = DYNOMITE_INVALID_STATE; +rstatus_t local_req_forward(struct context *ctx, struct conn *c_conn, + struct msg *req, uint8_t *key, uint32_t keylen, + dyn_error_t *dyn_error_code) { + rstatus_t status; + struct conn *s_conn; + + ASSERT((c_conn->type == CONN_CLIENT) || + (c_conn->type == CONN_DNODE_PEER_CLIENT)); + + s_conn = get_datastore_conn(ctx, c_conn->owner, c_conn->sd); + log_debug(LOG_VERB, "c_conn %p got server conn %p", c_conn, s_conn); + if (s_conn == NULL) { + *dyn_error_code = STORAGE_CONNECTION_REFUSE; + return errno; + } + ASSERT(s_conn->type == CONN_SERVER); + + log_info("%s FORWARD %s to storage conn %s", print_obj(c_conn), + print_obj(req), print_obj(s_conn)); + + if (ctx->dyn_state == NORMAL) { + /* enqueue the message (request) into server inq */ + if (TAILQ_EMPTY(&s_conn->imsg_q)) { + status = conn_event_add_out(s_conn); + + if (status != DN_OK) { + *dyn_error_code = DYNOMITE_UNKNOWN_ERROR; + s_conn->err = errno; return DN_ERROR; - } else if (ctx->dyn_state == RESUMING) { - log_debug(LOG_INFO, "Node is in RESUMING state. Still drop read requests and flush out all the queued writes"); - if (req->is_read) { - *dyn_error_code = DYNOMITE_INVALID_STATE; - return DN_ERROR; - } - - status = conn_event_add_out(s_conn); - - if (status != DN_OK) { - *dyn_error_code = DYNOMITE_UNKNOWN_ERROR; - s_conn->err = errno; - return DN_ERROR; - } + } } - - conn_enqueue_inq(ctx, s_conn, req); - req_forward_stats(ctx, req); - if(g_data_store == DATA_REDIS){ - req_redis_stats(ctx, req); + } else if (ctx->dyn_state == STANDBY) { // no reads/writes from peers/clients + log_debug(LOG_INFO, "Node is in STANDBY state. Drop write/read requests"); + *dyn_error_code = DYNOMITE_INVALID_STATE; + return DN_ERROR; + } else if (ctx->dyn_state == WRITES_ONLY && req->is_read) { + // no reads from peers/clients but allow writes from peers/clients + log_debug(LOG_INFO, "Node is in WRITES_ONLY state. 
Drop read requests"); + *dyn_error_code = DYNOMITE_INVALID_STATE; + return DN_ERROR; + } else if (ctx->dyn_state == RESUMING) { + log_debug(LOG_INFO, + "Node is in RESUMING state. Still drop read requests and flush " + "out all the queued writes"); + if (req->is_read) { + *dyn_error_code = DYNOMITE_INVALID_STATE; + return DN_ERROR; } + status = conn_event_add_out(s_conn); - log_debug(LOG_VERB, "%s local forward %s to %s len %"PRIu32" key '%.*s'", - print_obj(c_conn), print_obj(req), print_obj(s_conn), req->mlen, keylen, key); - *dyn_error_code = 0; - return DN_OK; -} - - -static rstatus_t -admin_local_req_forward(struct context *ctx, struct conn *c_conn, struct msg *req, - struct rack *rack, uint8_t *key, uint32_t keylen, - dyn_error_t *dyn_error_code) -{ - ASSERT((c_conn->type == CONN_CLIENT) || - (c_conn->type == CONN_DNODE_PEER_CLIENT)); - - struct node *peer = dnode_peer_pool_server(ctx, c_conn->owner, rack, key, keylen, req->msg_routing); - if (!peer->is_local) { - *dyn_error_code = DYNOMITE_INVALID_ADMIN_REQ; - return DN_ERROR; + if (status != DN_OK) { + *dyn_error_code = DYNOMITE_UNKNOWN_ERROR; + s_conn->err = errno; + return DN_ERROR; } + } + + conn_enqueue_inq(ctx, s_conn, req); + req_forward_stats(ctx, req); + if (g_data_store == DATA_REDIS) { + req_redis_stats(ctx, req); + } + + log_debug(LOG_VERB, "%s local forward %s to %s len %" PRIu32 " key '%.*s'", + print_obj(c_conn), print_obj(req), print_obj(s_conn), req->mlen, + keylen, key); + *dyn_error_code = 0; + return DN_OK; +} - log_debug(LOG_NOTICE, "%s Need to delete [%.*s] ", print_obj(c_conn), keylen, key); - return local_req_forward(ctx, c_conn, req, key, keylen, dyn_error_code); +static rstatus_t admin_local_req_forward(struct context *ctx, + struct conn *c_conn, struct msg *req, + struct rack *rack, uint8_t *key, + uint32_t keylen, + dyn_error_t *dyn_error_code) { + ASSERT((c_conn->type == CONN_CLIENT) || + (c_conn->type == CONN_DNODE_PEER_CLIENT)); + + struct node *peer = 
dnode_peer_pool_server(ctx, c_conn->owner, rack, key, + keylen, req->msg_routing); + if (!peer->is_local) { + *dyn_error_code = DYNOMITE_INVALID_ADMIN_REQ; + return DN_ERROR; + } + + log_debug(LOG_NOTICE, "%s Need to delete [%.*s] ", print_obj(c_conn), keylen, + key); + return local_req_forward(ctx, c_conn, req, key, keylen, dyn_error_code); } /* On Success, the request is placed in the other connection's inq. Otherwise * it is the caller's responsibility to take care of freeing it. */ -rstatus_t -remote_req_forward(struct context *ctx, struct conn *c_conn, struct msg *req, - struct rack *rack, uint8_t *key, uint32_t keylen, - dyn_error_t *dyn_error_code) -{ - ASSERT((c_conn->type == CONN_CLIENT) || - (c_conn->type == CONN_DNODE_PEER_CLIENT)); - - struct node * peer = dnode_peer_pool_server(ctx, c_conn->owner, rack, key, - keylen, req->msg_routing); - if (peer->is_local) { - return local_req_forward(ctx, c_conn, req, key, keylen, dyn_error_code); - } +rstatus_t remote_req_forward(struct context *ctx, struct conn *c_conn, + struct msg *req, struct rack *rack, uint8_t *key, + uint32_t keylen, dyn_error_t *dyn_error_code) { + ASSERT((c_conn->type == CONN_CLIENT) || + (c_conn->type == CONN_DNODE_PEER_CLIENT)); + + struct node *peer = dnode_peer_pool_server(ctx, c_conn->owner, rack, key, + keylen, req->msg_routing); + if (peer->is_local) { + return local_req_forward(ctx, c_conn, req, key, keylen, dyn_error_code); + } + + // now get a peer connection + struct conn *p_conn = dnode_peer_get_conn(ctx, peer, c_conn->sd); + if (!p_conn) { + // No active connection. return error + *dyn_error_code = PEER_HOST_NOT_CONNECTED; + return DN_ERROR; + } + + return dnode_peer_req_forward(ctx, c_conn, p_conn, req, rack, key, keylen, + dyn_error_code); +} - // now get a peer connection - struct conn *p_conn = dnode_peer_get_conn(ctx, peer, c_conn->sd); - if (!p_conn) { - // No active connection. 
return error - *dyn_error_code = PEER_HOST_NOT_CONNECTED; - return DN_ERROR; - } +void req_forward_all_local_racks(struct context *ctx, struct conn *c_conn, + struct msg *req, struct mbuf *orig_mbuf, + uint8_t *key, uint32_t keylen, + struct datacenter *dc) { + uint8_t rack_cnt = (uint8_t)array_n(&dc->racks); + uint8_t rack_index; + init_response_mgr(&req->rspmgr, req, req->is_read, rack_cnt, c_conn); + log_info("%s %s same DC racks:%d expect replies %d", print_obj(c_conn), + print_obj(req), rack_cnt, req->rspmgr.max_responses); + for (rack_index = 0; rack_index < rack_cnt; rack_index++) { + struct rack *rack = array_get(&dc->racks, rack_index); + struct server_pool *pool = c_conn->owner; + dyn_error_t dyn_error_code = 0; + rstatus_t s = DN_OK; - return dnode_peer_req_forward(ctx, c_conn, p_conn, req, rack, key, keylen, dyn_error_code); -} + if (string_compare(rack->name, &pool->rack) == 0) { + // Local Rack + s = remote_req_forward(ctx, c_conn, req, rack, key, keylen, + &dyn_error_code); + if (s != DN_OK) { + req_forward_error(ctx, c_conn, req, s, dyn_error_code); + } -void -req_forward_all_local_racks(struct context *ctx, struct conn *c_conn, - struct msg *req, struct mbuf *orig_mbuf, - uint8_t *key, uint32_t keylen, struct datacenter *dc) -{ - uint8_t rack_cnt = (uint8_t)array_n(&dc->racks); - uint8_t rack_index; - init_response_mgr(&req->rspmgr, req, req->is_read, rack_cnt, c_conn); - log_info("%s %s same DC racks:%d expect replies %d", print_obj(c_conn), print_obj(req), - rack_cnt, req->rspmgr.max_responses); - for(rack_index = 0; rack_index < rack_cnt; rack_index++) { - - struct rack *rack = array_get(&dc->racks, rack_index); - struct server_pool *pool = c_conn->owner; - dyn_error_t dyn_error_code = 0; - rstatus_t s = DN_OK; - - if (string_compare(rack->name, &pool->rack) == 0 ) { - - // Local Rack - s = remote_req_forward(ctx, c_conn, req, rack, key, - keylen, &dyn_error_code); - if (s != DN_OK) { - req_forward_error(ctx, c_conn, req, s, dyn_error_code); - } 
- - } else { - // Remote Rack - struct msg *rack_msg = msg_get(c_conn, req->is_request, __FUNCTION__); - if (rack_msg == NULL) { - log_error("whelp, looks like yer screwed " - "now, buddy. no inter-rack messages for you!"); - if (req->consistency != DC_ONE) { - // expecting a reply to form a quorum - req_forward_error(ctx, c_conn, req, DN_ENOMEM, DYNOMITE_UNKNOWN_ERROR); - } - continue; - } - - msg_clone(req, orig_mbuf, rack_msg); - rack_msg->swallow = true; - - log_info("%s forwarding cloned %s to same dc rack '%.*s'", - print_obj(c_conn), print_obj(rack_msg), rack->name->len, rack->name->data); - - s = remote_req_forward(ctx, c_conn, rack_msg, rack, key, keylen, &dyn_error_code); - if (s != DN_OK) { - if (req->consistency != DC_ONE) { - // expecting a reply to form a quorum - req_forward_error(ctx, c_conn, rack_msg, s, dyn_error_code); - } - req_put(rack_msg); - continue; - } + } else { + // Remote Rack + struct msg *rack_msg = msg_get(c_conn, req->is_request, __FUNCTION__); + if (rack_msg == NULL) { + log_error( + "whelp, looks like yer screwed " + "now, buddy. 
no inter-rack messages for you!"); + if (req->consistency != DC_ONE) { + // expecting a reply to form a quorum + req_forward_error(ctx, c_conn, req, DN_ENOMEM, + DYNOMITE_UNKNOWN_ERROR); + } + continue; + } + + msg_clone(req, orig_mbuf, rack_msg); + rack_msg->swallow = true; + + log_info("%s forwarding cloned %s to same dc rack '%.*s'", + print_obj(c_conn), print_obj(rack_msg), rack->name->len, + rack->name->data); + + s = remote_req_forward(ctx, c_conn, rack_msg, rack, key, keylen, + &dyn_error_code); + if (s != DN_OK) { + if (req->consistency != DC_ONE) { + // expecting a reply to form a quorum + req_forward_error(ctx, c_conn, rack_msg, s, dyn_error_code); } + req_put(rack_msg); + continue; + } } + } } -static bool -request_send_to_all_dcs(struct msg *req) -{ - // There is a routing override - if (req->msg_routing != ROUTING_NORMAL) - return false; +static bool request_send_to_all_dcs(struct msg *req) { + // There is a routing override + if (req->msg_routing != ROUTING_NORMAL) return false; - // Reads are not propagated - if (req->is_read) - return false; + // Reads are not propagated + if (req->is_read) return false; - return true; + return true; } /** * Determine if a request should be forwarded to all replicas within the local * DC. * @param[in] req Message. - * @return bool True if message should be forwarded to all local replicas, else false + * @return bool True if message should be forwarded to all local replicas, else + * false */ -static bool -request_send_to_all_local_racks(struct msg *req) -{ - /* There is a routing override set by the parser on this message. 
Do not - * propagate it to other racks irrespective of the consistency setting */ - if (req->msg_routing != ROUTING_NORMAL) - return false; - - // A write should go to all racks - if (!req->is_read) - return true; - - if ((req->consistency == DC_QUORUM) || - (req->consistency == DC_SAFE_QUORUM)) - return true; - return false; +static bool request_send_to_all_local_racks(struct msg *req) { + /* There is a routing override set by the parser on this message. Do not + * propagate it to other racks irrespective of the consistency setting */ + if (req->msg_routing != ROUTING_NORMAL) return false; + + // A write should go to all racks + if (!req->is_read) return true; + + if ((req->consistency == DC_QUORUM) || (req->consistency == DC_SAFE_QUORUM)) + return true; + return false; } -static void -req_forward_remote_dc(struct context *ctx, struct conn *c_conn, struct msg *req, - struct mbuf *orig_mbuf, uint8_t *key, uint32_t keylen, - struct datacenter *dc) -{ - const uint32_t rack_cnt = array_n(&dc->racks); - if (rack_cnt == 0) - return; +static void req_forward_remote_dc(struct context *ctx, struct conn *c_conn, + struct msg *req, struct mbuf *orig_mbuf, + uint8_t *key, uint32_t keylen, + struct datacenter *dc) { + const uint32_t rack_cnt = array_n(&dc->racks); + if (rack_cnt == 0) return; + + struct rack *rack = dc->preselected_rack_for_replication; + if (rack == NULL) rack = array_get(&dc->racks, 0); + + struct msg *rack_msg = msg_get(c_conn, req->is_request, __FUNCTION__); + if (rack_msg == NULL) { + log_debug(LOG_VERB, + "whelp, looks like yer screwed now, buddy. 
no inter-rack " + "messages for you!"); + return; + } + + msg_clone(req, orig_mbuf, rack_msg); + rack_msg->swallow = true; - struct rack *rack = dc->preselected_rack_for_replication; - if (rack == NULL) - rack = array_get(&dc->racks, 0); + log_info("%s forwarding cloned %s on remote dc rack '%.*s'", + print_obj(c_conn), print_obj(rack_msg), rack->name->len, + rack->name->data); - struct msg *rack_msg = msg_get(c_conn, req->is_request, __FUNCTION__); + dyn_error_t dyn_error_code = 0; + rstatus_t s = remote_req_forward(ctx, c_conn, rack_msg, rack, key, keylen, + &dyn_error_code); + if (s == DN_OK) { + return; + } + req_put(rack_msg); + // Start over with another rack. + uint8_t rack_index; + for (rack_index = 0; rack_index < rack_cnt; rack_index++) { + rack = array_get(&dc->racks, rack_index); + + if (rack == dc->preselected_rack_for_replication) continue; + rack_msg = msg_get(c_conn, req->is_request, __FUNCTION__); if (rack_msg == NULL) { - log_debug(LOG_VERB, "whelp, looks like yer screwed now, buddy. no inter-rack messages for you!"); - return; + log_debug(LOG_VERB, + "whelp, looks like yer screwed now, buddy. no inter-rack " + "messages for you!"); + return; } msg_clone(req, orig_mbuf, rack_msg); rack_msg->swallow = true; - log_info("%s forwarding cloned %s on remote dc rack '%.*s'", - print_obj(c_conn), print_obj(rack_msg), rack->name->len, rack->name->data); + log_info("%s FAILOVER forwarding cloned %s to remote dc rack '%.*s'", + print_obj(c_conn), print_obj(rack_msg), rack->name->len, + rack->name->data); - dyn_error_t dyn_error_code = 0; - rstatus_t s = remote_req_forward(ctx, c_conn, rack_msg, rack, key, keylen, - &dyn_error_code); + dyn_error_code = DYNOMITE_OK; + s = remote_req_forward(ctx, c_conn, rack_msg, rack, key, keylen, + &dyn_error_code); if (s == DN_OK) { - return; + stats_pool_incr(ctx, remote_peer_failover_requests); + return; } req_put(rack_msg); - // Start over with another rack. 
- uint8_t rack_index; - for(rack_index = 0; rack_index < rack_cnt; rack_index++) { - rack = array_get(&dc->racks, rack_index); - - if (rack == dc->preselected_rack_for_replication) - continue; - rack_msg = msg_get(c_conn, req->is_request, __FUNCTION__); - if (rack_msg == NULL) { - log_debug(LOG_VERB, "whelp, looks like yer screwed now, buddy. no inter-rack messages for you!"); - return; - } - - msg_clone(req, orig_mbuf, rack_msg); - rack_msg->swallow = true; - - log_info("%s FAILOVER forwarding cloned %s to remote dc rack '%.*s'", - print_obj(c_conn), print_obj(rack_msg), rack->name->len, rack->name->data); - - dyn_error_code = DYNOMITE_OK; - s = remote_req_forward(ctx, c_conn, rack_msg, rack, key, keylen, - &dyn_error_code); - if (s == DN_OK) { - stats_pool_incr(ctx, remote_peer_failover_requests); - return; - } - req_put(rack_msg); - } - stats_pool_incr(ctx, remote_peer_dropped_requests); -} - -static void -req_forward_local_dc(struct context *ctx, struct conn *c_conn, struct msg *req, - struct mbuf *orig_mbuf, uint8_t *key, uint32_t keylen, - struct datacenter *dc) -{ - struct server_pool *pool = c_conn->owner; - req->rsp_handler = msg_get_rsp_handler(req); - if (request_send_to_all_local_racks(req)) { - // send request to all local racks - req_forward_all_local_racks(ctx, c_conn, req, orig_mbuf, key, keylen, dc); - } else { - // send request to only local token owner - struct rack * rack = server_get_rack_by_dc_rack(pool, &pool->rack, - &pool->dc); - dyn_error_t dyn_error_code = 0; - rstatus_t s = remote_req_forward(ctx, c_conn, req, rack, key, keylen, - &dyn_error_code); - if (s != DN_OK) { - req_forward_error(ctx, c_conn, req, s, dyn_error_code); - } - } + } + stats_pool_incr(ctx, remote_peer_dropped_requests); } -static void -req_forward(struct context *ctx, struct conn *c_conn, struct msg *req) -{ - struct server_pool *pool = c_conn->owner; - dyn_error_t dyn_error_code = DYNOMITE_OK; - rstatus_t s = DN_OK; - - ASSERT(c_conn->type == CONN_CLIENT); - - if 
(req->is_read) { - if (req->type != MSG_REQ_REDIS_PING) - stats_pool_incr(ctx, client_read_requests); - } else - stats_pool_incr(ctx, client_write_requests); - - uint32_t keylen = 0; - uint8_t *key = msg_get_tagged_key(req, 0, &keylen); - uint32_t full_keylen = 0; - uint8_t *full_key = msg_get_full_key(req, 0, &full_keylen); - - log_info(">>>>>>>>>>>>>>>>>>>>>>> %s RECEIVED %s key '%.*s' tagged key '%.*s'", - print_obj(c_conn), print_obj(req), full_keylen, full_key, keylen, key); - // add the message to the dict - dictAdd(c_conn->outstanding_msgs_dict, &req->id, req); - - s = g_verify_request(req, pool, server_get_rack_by_dc_rack(pool, &pool->rack, &pool->dc)); +static void req_forward_local_dc(struct context *ctx, struct conn *c_conn, + struct msg *req, struct mbuf *orig_mbuf, + uint8_t *key, uint32_t keylen, + struct datacenter *dc) { + struct server_pool *pool = c_conn->owner; + req->rsp_handler = msg_get_rsp_handler(req); + if (request_send_to_all_local_racks(req)) { + // send request to all local racks + req_forward_all_local_racks(ctx, c_conn, req, orig_mbuf, key, keylen, dc); + } else { + // send request to only local token owner + struct rack *rack = + server_get_rack_by_dc_rack(pool, &pool->rack, &pool->dc); + dyn_error_t dyn_error_code = 0; + rstatus_t s = remote_req_forward(ctx, c_conn, req, rack, key, keylen, + &dyn_error_code); if (s != DN_OK) { - if (req->expect_datastore_reply) { - conn_enqueue_outq(ctx, c_conn, req); - } - req_forward_error(ctx, c_conn, req, DN_OK, s); - return; + req_forward_error(ctx, c_conn, req, s, dyn_error_code); } + } +} - - // need to capture the initial mbuf location as once we add in the dynomite - // headers (as mbufs to the src req), that will bork the request sent to - // secondary racks - struct mbuf *orig_mbuf = STAILQ_FIRST(&req->mhdr); - - /* enqueue message (request) into client outq, if response is expected */ +static void req_forward(struct context *ctx, struct conn *c_conn, + struct msg *req) { + struct 
server_pool *pool = c_conn->owner; + dyn_error_t dyn_error_code = DYNOMITE_OK; + rstatus_t s = DN_OK; + + ASSERT(c_conn->type == CONN_CLIENT); + + if (req->is_read) { + if (req->type != MSG_REQ_REDIS_PING) + stats_pool_incr(ctx, client_read_requests); + } else + stats_pool_incr(ctx, client_write_requests); + + uint32_t keylen = 0; + uint8_t *key = msg_get_tagged_key(req, 0, &keylen); + uint32_t full_keylen = 0; + uint8_t *full_key = msg_get_full_key(req, 0, &full_keylen); + + log_info( + ">>>>>>>>>>>>>>>>>>>>>>> %s RECEIVED %s key '%.*s' tagged key '%.*s'", + print_obj(c_conn), print_obj(req), full_keylen, full_key, keylen, key); + // add the message to the dict + dictAdd(c_conn->outstanding_msgs_dict, &req->id, req); + + s = g_verify_request( + req, pool, server_get_rack_by_dc_rack(pool, &pool->rack, &pool->dc)); + if (s != DN_OK) { if (req->expect_datastore_reply) { - conn_enqueue_outq(ctx, c_conn, req); + conn_enqueue_outq(ctx, c_conn, req); } - - if (ctx->admin_opt == 1) { - if (req->type == MSG_REQ_REDIS_DEL || req->type == MSG_REQ_MC_DELETE) { - struct rack * rack = server_get_rack_by_dc_rack(pool, &pool->rack, &pool->dc); - s = admin_local_req_forward(ctx, c_conn, req, rack, key, - keylen, &dyn_error_code); - if (s != DN_OK) { - req_forward_error(ctx, c_conn, req, s, dyn_error_code); - } - return; - } + req_forward_error(ctx, c_conn, req, DN_OK, s); + return; + } + + // need to capture the initial mbuf location as once we add in the dynomite + // headers (as mbufs to the src req), that will bork the request sent to + // secondary racks + struct mbuf *orig_mbuf = STAILQ_FIRST(&req->mhdr); + + /* enqueue message (request) into client outq, if response is expected */ + if (req->expect_datastore_reply) { + conn_enqueue_outq(ctx, c_conn, req); + } + + if (ctx->admin_opt == 1) { + if (req->type == MSG_REQ_REDIS_DEL || req->type == MSG_REQ_MC_DELETE) { + struct rack *rack = + server_get_rack_by_dc_rack(pool, &pool->rack, &pool->dc); + s = 
admin_local_req_forward(ctx, c_conn, req, rack, key, keylen, + &dyn_error_code); + if (s != DN_OK) { + req_forward_error(ctx, c_conn, req, s, dyn_error_code); + } + return; } + } - if (req->msg_routing == ROUTING_LOCAL_NODE_ONLY) { - // Strictly local host only - req->consistency = DC_ONE; - req->rsp_handler = msg_local_one_rsp_handler; + if (req->msg_routing == ROUTING_LOCAL_NODE_ONLY) { + // Strictly local host only + req->consistency = DC_ONE; + req->rsp_handler = msg_local_one_rsp_handler; - s = local_req_forward(ctx, c_conn, req, key, keylen, &dyn_error_code); - if (s != DN_OK) { - req_forward_error(ctx, c_conn, req, s, dyn_error_code); - } - return; + s = local_req_forward(ctx, c_conn, req, key, keylen, &dyn_error_code); + if (s != DN_OK) { + req_forward_error(ctx, c_conn, req, s, dyn_error_code); } + return; + } - req->consistency = req->is_read ? conn_get_read_consistency(c_conn) : - conn_get_write_consistency(c_conn); - - /* forward the request */ - uint32_t dc_cnt = array_n(&pool->datacenters); - uint32_t dc_index; + req->consistency = req->is_read ? 
conn_get_read_consistency(c_conn) + : conn_get_write_consistency(c_conn); - for(dc_index = 0; dc_index < dc_cnt; dc_index++) { + /* forward the request */ + uint32_t dc_cnt = array_n(&pool->datacenters); + uint32_t dc_index; - struct datacenter *dc = array_get(&pool->datacenters, dc_index); - if (dc == NULL) { - log_error("Wow, this is very bad, dc is NULL"); - return; - } + for (dc_index = 0; dc_index < dc_cnt; dc_index++) { + struct datacenter *dc = array_get(&pool->datacenters, dc_index); + if (dc == NULL) { + log_error("Wow, this is very bad, dc is NULL"); + return; + } - if (string_compare(dc->name, &pool->dc) == 0) - req_forward_local_dc(ctx, c_conn, req, orig_mbuf, key, keylen, dc); - else if (request_send_to_all_dcs(req)) { - req_forward_remote_dc(ctx, c_conn, req, orig_mbuf, key, keylen, dc); - } + if (string_compare(dc->name, &pool->dc) == 0) + req_forward_local_dc(ctx, c_conn, req, orig_mbuf, key, keylen, dc); + else if (request_send_to_all_dcs(req)) { + req_forward_remote_dc(ctx, c_conn, req, orig_mbuf, key, keylen, dc); } + } } /* * Rewrites a query if necessary. * - * If a rewrite occured, it will replace '*req' with the new 'msg' that contains the new query - * and free up the original msg. + * If a rewrite occured, it will replace '*req' with the new 'msg' that contains + * the new query and free up the original msg. * */ -rstatus_t rewrite_query_if_necessary(struct msg** req, struct context* ctx) { - bool did_rewrite = false; - struct msg* new_req = NULL; - rstatus_t ret_status = g_rewrite_query(*req, ctx, &did_rewrite, &new_req); - THROW_STATUS(ret_status); - - if (did_rewrite) { - // If we successfully did a rewrite, we need to recycle the memory used by the original - // request and point it to the 'new_req'. 
- msg_put(*req); - *req = new_req; - } - return DN_OK; +rstatus_t rewrite_query_if_necessary(struct msg **req, struct context *ctx) { + bool did_rewrite = false; + struct msg *new_req = NULL; + rstatus_t ret_status = g_rewrite_query(*req, ctx, &did_rewrite, &new_req); + THROW_STATUS(ret_status); + + if (did_rewrite) { + // If we successfully did a rewrite, we need to recycle the memory used by + // the original request and point it to the 'new_req'. + msg_put(*req); + *req = new_req; + } + return DN_OK; } /* * Fragments a query if applicable. * 'frag_msgq' will be non-empty if the query is fragmented. */ -rstatus_t fragment_query_if_necessary(struct msg* req, struct conn* conn, struct msg_tqh* frag_msgq) { - struct server_pool *pool = conn->owner; - struct rack *rack = server_get_rack_by_dc_rack(pool, &pool->rack, &pool->dc); - return g_fragment(req, pool, rack, frag_msgq); +rstatus_t fragment_query_if_necessary(struct msg *req, struct conn *conn, + struct msg_tqh *frag_msgq) { + struct server_pool *pool = conn->owner; + struct rack *rack = server_get_rack_by_dc_rack(pool, &pool->rack, &pool->dc); + return g_fragment(req, pool, rack, frag_msgq); } -void -req_recv_done(struct context *ctx, struct conn *conn, - struct msg *req, struct msg *nreq) -{ - ASSERT(conn->type == CONN_CLIENT); - ASSERT(req->is_request); - ASSERT(req->owner == conn); - ASSERT(conn->rmsg == req); - ASSERT(nreq == NULL || nreq->is_request); +void req_recv_done(struct context *ctx, struct conn *conn, struct msg *req, + struct msg *nreq) { + ASSERT(conn->type == CONN_CLIENT); + ASSERT(req->is_request); + ASSERT(req->owner == conn); + ASSERT(conn->rmsg == req); + ASSERT(nreq == NULL || nreq->is_request); - if (!req->is_read) - stats_histo_add_payloadsize(ctx, req->mlen); + if (!req->is_read) stats_histo_add_payloadsize(ctx, req->mlen); - /* enqueue next message (request), if any */ - conn->rmsg = nreq; + /* enqueue next message (request), if any */ + conn->rmsg = nreq; - if (req_filter(ctx, conn, 
req)) { - return; - } + if (req_filter(ctx, conn, req)) { + return; + } - req->stime_in_microsec = dn_usec_now(); - struct msg_tqh frag_msgq; - TAILQ_INIT(&frag_msgq); + req->stime_in_microsec = dn_usec_now(); + struct msg_tqh frag_msgq; + TAILQ_INIT(&frag_msgq); - rstatus_t status = rewrite_query_if_necessary(&req, ctx); - if (status != DN_OK) goto error; + rstatus_t status = rewrite_query_if_necessary(&req, ctx); + if (status != DN_OK) goto error; - status = fragment_query_if_necessary(req, conn, &frag_msgq); - if (status != DN_OK) goto error; + status = fragment_query_if_necessary(req, conn, &frag_msgq); + if (status != DN_OK) goto error; - /* if no fragment happened */ - if (TAILQ_EMPTY(&frag_msgq)) { - req_forward(ctx, conn, req); - return; - } + /* if no fragment happened */ + if (TAILQ_EMPTY(&frag_msgq)) { + req_forward(ctx, conn, req); + return; + } - status = req_make_reply(ctx, conn, req); - if (status != DN_OK) goto error; + status = req_make_reply(ctx, conn, req); + if (status != DN_OK) goto error; - struct msg *sub_msg, *tmsg; - for (sub_msg = TAILQ_FIRST(&frag_msgq); sub_msg != NULL; sub_msg = tmsg) { - tmsg = TAILQ_NEXT(sub_msg, m_tqe); + struct msg *sub_msg, *tmsg; + for (sub_msg = TAILQ_FIRST(&frag_msgq); sub_msg != NULL; sub_msg = tmsg) { + tmsg = TAILQ_NEXT(sub_msg, m_tqe); - TAILQ_REMOVE(&frag_msgq, sub_msg, m_tqe); - log_info("Forwarding split request %s", print_obj(sub_msg)); - req_forward(ctx, conn, sub_msg); - } - ASSERT(TAILQ_EMPTY(&frag_msgq)); - return; + TAILQ_REMOVE(&frag_msgq, sub_msg, m_tqe); + log_info("Forwarding split request %s", print_obj(sub_msg)); + req_forward(ctx, conn, sub_msg); + } + ASSERT(TAILQ_EMPTY(&frag_msgq)); + return; error: - if (req->expect_datastore_reply) { - conn_enqueue_outq(ctx, conn, req); - } - req_forward_error(ctx, conn, req, DN_OK, status); //TODO: CHeck error code - return; + if (req->expect_datastore_reply) { + conn_enqueue_outq(ctx, conn, req); + } + req_forward_error(ctx, conn, req, DN_OK, status); 
// TODO: CHeck error code + return; } - -static msg_response_handler_t -msg_get_rsp_handler(struct msg *req) -{ - if (request_send_to_all_local_racks(req)) { - // Request is being braoadcasted - // Check if its quorum - if ((req->consistency == DC_QUORUM) || - (req->consistency == DC_SAFE_QUORUM)) - return msg_quorum_rsp_handler; - } - return msg_local_one_rsp_handler; +static msg_response_handler_t msg_get_rsp_handler(struct msg *req) { + if (request_send_to_all_local_racks(req)) { + // Request is being braoadcasted + // Check if its quorum + if ((req->consistency == DC_QUORUM) || (req->consistency == DC_SAFE_QUORUM)) + return msg_quorum_rsp_handler; + } + return msg_local_one_rsp_handler; } -rstatus_t -msg_local_one_rsp_handler(struct msg *req, struct msg *rsp) -{ - ASSERT_LOG(!req->selected_rsp, "Received more than one response for dc_one.\ - %s prev %s new rsp %s", print_obj(req), print_obj(req->selected_rsp), print_obj(rsp)); - req->awaiting_rsps = 0; - rsp->peer = req; - req->is_error = rsp->is_error; - req->error_code = rsp->error_code; - req->dyn_error_code = rsp->dyn_error_code; - req->selected_rsp = rsp; - log_info("%d SELECTED %d", print_obj(req), print_obj(rsp)); - return DN_OK; +rstatus_t msg_local_one_rsp_handler(struct msg *req, struct msg *rsp) { + ASSERT_LOG(!req->selected_rsp, + "Received more than one response for dc_one.\ + %s prev %s new rsp %s", + print_obj(req), print_obj(req->selected_rsp), print_obj(rsp)); + req->awaiting_rsps = 0; + rsp->peer = req; + req->is_error = rsp->is_error; + req->error_code = rsp->error_code; + req->dyn_error_code = rsp->dyn_error_code; + req->selected_rsp = rsp; + log_info("%d SELECTED %d", print_obj(req), print_obj(rsp)); + return DN_OK; } -static rstatus_t -swallow_extra_rsp(struct msg *req, struct msg *rsp) -{ - log_info("%s SWALLOW %s awaiting %d", print_obj(req), print_obj(rsp), - req->awaiting_rsps); - ASSERT_LOG(req->awaiting_rsps, "%s has no awaiting rsps, received %s", print_obj(req), print_obj(rsp)); - 
// drop this response. - rsp_put(rsp); - msg_decr_awaiting_rsps(req); - return DN_NOOPS; +static rstatus_t swallow_extra_rsp(struct msg *req, struct msg *rsp) { + log_info("%s SWALLOW %s awaiting %d", print_obj(req), print_obj(rsp), + req->awaiting_rsps); + ASSERT_LOG(req->awaiting_rsps, "%s has no awaiting rsps, received %s", + print_obj(req), print_obj(rsp)); + // drop this response. + rsp_put(rsp); + msg_decr_awaiting_rsps(req); + return DN_NOOPS; } -static rstatus_t -msg_quorum_rsp_handler(struct msg *req, struct msg *rsp) -{ - if (req->rspmgr.done) - return swallow_extra_rsp(req, rsp); - rspmgr_submit_response(&req->rspmgr, rsp); - if (!rspmgr_check_is_done(&req->rspmgr)) - return DN_EAGAIN; - // rsp is absorbed by rspmgr. so we can use that variable - rsp = rspmgr_get_response(&req->rspmgr); - ASSERT(rsp); - rspmgr_free_other_responses(&req->rspmgr, rsp); - rsp->peer = req; - req->selected_rsp = rsp; - req->error_code = rsp->error_code; - req->is_error = rsp->is_error; - req->dyn_error_code = rsp->dyn_error_code; - return DN_OK; +static rstatus_t msg_quorum_rsp_handler(struct msg *req, struct msg *rsp) { + if (req->rspmgr.done) return swallow_extra_rsp(req, rsp); + rspmgr_submit_response(&req->rspmgr, rsp); + if (!rspmgr_check_is_done(&req->rspmgr)) return DN_EAGAIN; + // rsp is absorbed by rspmgr. 
so we can use that variable + rsp = rspmgr_get_response(&req->rspmgr); + ASSERT(rsp); + rspmgr_free_other_responses(&req->rspmgr, rsp); + rsp->peer = req; + req->selected_rsp = rsp; + req->error_code = rsp->error_code; + req->is_error = rsp->is_error; + req->dyn_error_code = rsp->dyn_error_code; + return DN_OK; } -static void -req_client_enqueue_omsgq(struct context *ctx, struct conn *conn, struct msg *req) -{ - ASSERT(req->is_request); - ASSERT(conn->type == CONN_CLIENT); +static void req_client_enqueue_omsgq(struct context *ctx, struct conn *conn, + struct msg *req) { + ASSERT(req->is_request); + ASSERT(conn->type == CONN_CLIENT); - TAILQ_INSERT_TAIL(&conn->omsg_q, req, c_tqe); - histo_add(&ctx->stats->client_out_queue, TAILQ_COUNT(&conn->omsg_q)); - log_debug(LOG_VERB, "%s enqueue outq %s", print_obj(conn), print_obj(req)); + TAILQ_INSERT_TAIL(&conn->omsg_q, req, c_tqe); + histo_add(&ctx->stats->client_out_queue, TAILQ_COUNT(&conn->omsg_q)); + log_debug(LOG_VERB, "%s enqueue outq %s", print_obj(conn), print_obj(req)); } -static void -req_client_dequeue_omsgq(struct context *ctx, struct conn *conn, struct msg *req) -{ - ASSERT(req->is_request); - ASSERT(conn->type == CONN_CLIENT); - - if (req->stime_in_microsec) { - usec_t latency = dn_usec_now() - req->stime_in_microsec; - stats_histo_add_latency(ctx, latency); - } - TAILQ_REMOVE(&conn->omsg_q, req, c_tqe); - histo_add(&ctx->stats->client_out_queue, TAILQ_COUNT(&conn->omsg_q)); - log_debug(LOG_VERB, "%s dequeue outq %s", print_obj(conn), print_obj(req)); +static void req_client_dequeue_omsgq(struct context *ctx, struct conn *conn, + struct msg *req) { + ASSERT(req->is_request); + ASSERT(conn->type == CONN_CLIENT); + + if (req->stime_in_microsec) { + usec_t latency = dn_usec_now() - req->stime_in_microsec; + stats_histo_add_latency(ctx, latency); + } + TAILQ_REMOVE(&conn->omsg_q, req, c_tqe); + histo_add(&ctx->stats->client_out_queue, TAILQ_COUNT(&conn->omsg_q)); + log_debug(LOG_VERB, "%s dequeue outq %s", 
print_obj(conn), print_obj(req)); } -struct conn_ops client_ops = { - msg_recv, - req_recv_next, - req_recv_done, - msg_send, - rsp_send_next, - rsp_send_done, - client_close, - client_active, - client_ref, - client_unref, - NULL, - NULL, - req_client_enqueue_omsgq, - req_client_dequeue_omsgq, - client_handle_response -}; - -void -init_client_conn(struct conn *conn) -{ - conn->dyn_mode = 0; - conn->type = CONN_CLIENT; - conn->ops = &client_ops; +struct conn_ops client_ops = {msg_recv, + req_recv_next, + req_recv_done, + msg_send, + rsp_send_next, + rsp_send_done, + client_close, + client_active, + client_ref, + client_unref, + NULL, + NULL, + req_client_enqueue_omsgq, + req_client_dequeue_omsgq, + client_handle_response}; + +void init_client_conn(struct conn *conn) { + conn->dyn_mode = 0; + conn->type = CONN_CLIENT; + conn->ops = &client_ops; } diff --git a/src/dyn_client.h b/src/dyn_client.h index 649bfd181..c8e1a6172 100644 --- a/src/dyn_client.h +++ b/src/dyn_client.h @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. diff --git a/src/dyn_conf.c b/src/dyn_conf.c index 1de0f68f1..93242e363 100644 --- a/src/dyn_conf.c +++ b/src/dyn_conf.c @@ -2,7 +2,7 @@ * Dynomite - A thin, distributed replication layer for multi non-distributed * storage engines. * Copyright (C) 2014 Netflix, Inc. - */ + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -28,78 +28,78 @@ * Set default configuration values, parse dynomite.yaml, and update the various * configuration structs including connections and server pool. 
*/ -#include "dyn_core.h" #include "dyn_conf.h" -#include "dyn_server.h" +#include "dyn_core.h" #include "dyn_dnode_peer.h" +#include "dyn_server.h" #include "dyn_token.h" -#include "proto/dyn_proto.h" #include "hashkit/dyn_hashkit.h" +#include "proto/dyn_proto.h" -#define CONF_OK (void *) NULL -#define CONF_ERROR (void *) "has an invalid value" -#define CONF_ROOT_DEPTH 1 -#define CONF_MAX_DEPTH CONF_ROOT_DEPTH + 1 -#define CONF_DEFAULT_ARGS 3 +#define CONF_OK (void *)NULL +#define CONF_ERROR (void *)"has an invalid value" +#define CONF_ROOT_DEPTH 1 +#define CONF_MAX_DEPTH CONF_ROOT_DEPTH + 1 +#define CONF_DEFAULT_ARGS 3 #define CONF_UNSET_BOOL false -#define CONF_UNSET_NUM UNSET_NUM +#define CONF_UNSET_NUM UNSET_NUM #define CONF_DEFAULT_CONNECTIONS 1 -#define CONF_UNSET_PTR NULL -#define CONF_DEFAULT_SERVERS 8 -#define CONF_UNSET_HASH (hash_type_t) -1 - -#define CONF_DEFAULT_HASH HASH_MURMUR -#define CONF_DEFAULT_DIST DIST_VNODE -#define CONF_DEFAULT_TIMEOUT 5000 -#define CONF_DEFAULT_LISTEN_BACKLOG 512 -#define CONF_DEFAULT_CLIENT_CONNECTIONS 0 -#define CONF_DEFAULT_DATASTORE DATA_REDIS -#define CONF_DEFAULT_PRECONNECT true -#define CONF_DEFAULT_AUTO_EJECT_HOSTS true -#define CONF_DEFAULT_SERVER_RETRY_TIMEOUT 10 * 1000 /* in msec */ -#define CONF_DEFAULT_SERVER_FAILURE_LIMIT 3 -#define CONF_DEFAULT_KETAMA_PORT 11211 - -#define CONF_DEFAULT_SEEDS 5 -#define CONF_DEFAULT_DYN_READ_TIMEOUT 10000 -#define CONF_DEFAULT_DYN_WRITE_TIMEOUT 10000 -#define CONF_DEFAULT_DYN_CONNECTIONS 100 -#define CONF_DEFAULT_VNODE_TOKENS 1 -#define CONF_DEFAULT_GOS_INTERVAL 30000 //in millisec - -#define CONF_DEFAULT_MBUF_SIZE MBUF_SIZE -#define CONF_DEFAULT_MBUF_MIN_SIZE MBUF_MIN_SIZE -#define CONF_DEFAULT_MBUF_MAX_SIZE MBUF_MAX_SIZE - -#define CONF_DEFAULT_ALLOC_MSGS ALLOC_MSGS -#define CONF_DEFAULT_MIN_ALLOC_MSGS MIN_ALLOC_MSGS -#define CONF_DEFAULT_MAX_ALLOC_MSGS MAX_ALLOC_MSGS - -#define CONF_SECURE_OPTION_NONE "none" -#define CONF_SECURE_OPTION_DC "datacenter" -#define 
CONF_SECURE_OPTION_RACK "rack" -#define CONF_SECURE_OPTION_ALL "all" - -#define CONF_DEFAULT_RACK "localrack" -#define CONF_DEFAULT_DC "localdc" -#define CONF_DEFAULT_SECURE_SERVER_OPTION CONF_SECURE_OPTION_NONE - -#define CONF_DEFAULT_SEED_PROVIDER "simple_provider" - -#define CONF_DEFAULT_STATS_PNAME "0.0.0.0:22222" // default stats port -#define CONF_DEFAULT_STATS_PORT 22222 -#define CONF_DEFAULT_STATS_INTERVAL_MS (30 * 1000) /* in msec */ - -#define PEM_KEY_FILE "conf/dynomite.pem" -#define RECON_KEY_FILE "conf/recon_key.pem" -#define RECON_IV_FILE "conf/recon_iv.pem" +#define CONF_UNSET_PTR NULL +#define CONF_DEFAULT_SERVERS 8 +#define CONF_UNSET_HASH (hash_type_t) - 1 + +#define CONF_DEFAULT_HASH HASH_MURMUR +#define CONF_DEFAULT_DIST DIST_VNODE +#define CONF_DEFAULT_TIMEOUT 5000 +#define CONF_DEFAULT_LISTEN_BACKLOG 512 +#define CONF_DEFAULT_CLIENT_CONNECTIONS 0 +#define CONF_DEFAULT_DATASTORE DATA_REDIS +#define CONF_DEFAULT_PRECONNECT true +#define CONF_DEFAULT_AUTO_EJECT_HOSTS true +#define CONF_DEFAULT_SERVER_RETRY_TIMEOUT 10 * 1000 /* in msec */ +#define CONF_DEFAULT_SERVER_FAILURE_LIMIT 3 +#define CONF_DEFAULT_KETAMA_PORT 11211 + +#define CONF_DEFAULT_SEEDS 5 +#define CONF_DEFAULT_DYN_READ_TIMEOUT 10000 +#define CONF_DEFAULT_DYN_WRITE_TIMEOUT 10000 +#define CONF_DEFAULT_DYN_CONNECTIONS 100 +#define CONF_DEFAULT_VNODE_TOKENS 1 +#define CONF_DEFAULT_GOS_INTERVAL 30000 // in millisec + +#define CONF_DEFAULT_MBUF_SIZE MBUF_SIZE +#define CONF_DEFAULT_MBUF_MIN_SIZE MBUF_MIN_SIZE +#define CONF_DEFAULT_MBUF_MAX_SIZE MBUF_MAX_SIZE + +#define CONF_DEFAULT_ALLOC_MSGS ALLOC_MSGS +#define CONF_DEFAULT_MIN_ALLOC_MSGS MIN_ALLOC_MSGS +#define CONF_DEFAULT_MAX_ALLOC_MSGS MAX_ALLOC_MSGS + +#define CONF_SECURE_OPTION_NONE "none" +#define CONF_SECURE_OPTION_DC "datacenter" +#define CONF_SECURE_OPTION_RACK "rack" +#define CONF_SECURE_OPTION_ALL "all" + +#define CONF_DEFAULT_RACK "localrack" +#define CONF_DEFAULT_DC "localdc" +#define CONF_DEFAULT_SECURE_SERVER_OPTION 
CONF_SECURE_OPTION_NONE + +#define CONF_DEFAULT_SEED_PROVIDER "simple_provider" + +#define CONF_DEFAULT_STATS_PNAME "0.0.0.0:22222" // default stats port +#define CONF_DEFAULT_STATS_PORT 22222 +#define CONF_DEFAULT_STATS_INTERVAL_MS (30 * 1000) /* in msec */ + +#define PEM_KEY_FILE "conf/dynomite.pem" +#define RECON_KEY_FILE "conf/recon_key.pem" +#define RECON_IV_FILE "conf/recon_iv.pem" data_store_t g_data_store = CONF_DEFAULT_DATASTORE; struct command { - struct string name; - char *(*set)(struct conf *cf, struct command *cmd, void *data); - int offset; + struct string name; + char *(*set)(struct conf *cf, struct command *cmd, void *data); + int offset; }; /** @@ -107,73 +107,67 @@ struct command { * @param[in,out] cs Server configuration. * @return rstatus_t Return status code. */ -static rstatus_t -conf_server_init(struct conf_server *cs) -{ - string_init(&cs->pname); - string_init(&cs->name); - string_init(&cs->rack); - string_init(&cs->dc); - - rstatus_t status = array_init(&cs->tokens, CONF_DEFAULT_VNODE_TOKENS, - sizeof(struct dyn_token)); - if (status != DN_OK) { - string_deinit(&cs->pname); - string_deinit(&cs->name); - string_deinit(&cs->rack); - string_deinit(&cs->dc); - return status; - } +static rstatus_t conf_server_init(struct conf_server *cs) { + string_init(&cs->pname); + string_init(&cs->name); + string_init(&cs->rack); + string_init(&cs->dc); + + rstatus_t status = array_init(&cs->tokens, CONF_DEFAULT_VNODE_TOKENS, + sizeof(struct dyn_token)); + if (status != DN_OK) { + string_deinit(&cs->pname); + string_deinit(&cs->name); + string_deinit(&cs->rack); + string_deinit(&cs->dc); + return status; + } - cs->port = 0; + cs->port = 0; - memset(&cs->info, 0, sizeof(cs->info)); + memset(&cs->info, 0, sizeof(cs->info)); - cs->valid = 0; + cs->valid = 0; - log_debug(LOG_VVERB, "init conf server %p", cs); - return DN_OK; + log_debug(LOG_VVERB, "init conf server %p", cs); + return DN_OK; } /** * Deinitialize the server configuration and free memory. 
* @param[in,out] cs Server configuration. */ -static void -conf_server_deinit(struct conf_server *cs) -{ - string_deinit(&cs->pname); - string_deinit(&cs->name); - string_deinit(&cs->rack); - string_deinit(&cs->dc); - array_deinit(&cs->tokens); - cs->valid = 0; - log_debug(LOG_VVERB, "deinit conf server %p", cs); +static void conf_server_deinit(struct conf_server *cs) { + string_deinit(&cs->pname); + string_deinit(&cs->name); + string_deinit(&cs->rack); + string_deinit(&cs->dc); + array_deinit(&cs->tokens); + cs->valid = 0; + log_debug(LOG_VVERB, "deinit conf server %p", cs); } // copy from struct conf_server to struct server -rstatus_t -conf_datastore_transform(struct datastore *s, struct conf_pool *cp, - struct conf_server *cs) -{ - ASSERT(cs->valid); - ASSERT(s != NULL); - s->owner = NULL; - s->endpoint.pname = cs->pname; - s->name = cs->name; - s->endpoint.port = (uint16_t)cs->port; - - s->endpoint.family = cs->info.family; - s->endpoint.addrlen = cs->info.addrlen; - s->endpoint.addr = (struct sockaddr *)&cs->info.addr; - s->conn_pool = NULL; - s->max_connections = cp->datastore_connections; - s->next_retry_ms = 0ULL; - s->failure_count = 0; - - log_debug(LOG_NOTICE, "Created %s", print_obj(s)); - - return DN_OK; +rstatus_t conf_datastore_transform(struct datastore *s, struct conf_pool *cp, + struct conf_server *cs) { + ASSERT(cs->valid); + ASSERT(s != NULL); + s->owner = NULL; + s->endpoint.pname = cs->pname; + s->name = cs->name; + s->endpoint.port = (uint16_t)cs->port; + + s->endpoint.family = cs->info.family; + s->endpoint.addrlen = cs->info.addrlen; + s->endpoint.addr = (struct sockaddr *)&cs->info.addr; + s->conn_pool = NULL; + s->max_connections = cp->datastore_connections; + s->next_retry_ms = 0ULL; + s->failure_count = 0; + + log_debug(LOG_NOTICE, "Created %s", print_obj(s)); + + return DN_OK; } /** @@ -182,2066 +176,1958 @@ conf_datastore_transform(struct datastore *s, struct conf_pool *cp, * @param name Pool name. 
* @return rstatus_t Return status code. */ -//TODOs: make sure to do a mem release for all these -static rstatus_t -conf_pool_init(struct conf_pool *cp, struct string *name) -{ - rstatus_t status; - memset(cp, 0, sizeof(*cp)); - - string_init(&cp->name); - - string_init(&cp->listen.pname); - string_init(&cp->listen.name); - - string_init(&cp->rack); - - cp->listen.port = 0; - memset(&cp->listen.info, 0, sizeof(cp->listen.info)); - cp->listen.valid = 0; - - cp->hash = CONF_UNSET_HASH; - string_init(&cp->hash_tag); - - cp->timeout = CONF_UNSET_NUM; - cp->backlog = CONF_UNSET_NUM; - - cp->client_connections = CONF_UNSET_NUM; - - cp->data_store = CONF_UNSET_NUM; - cp->preconnect = CONF_UNSET_NUM; - cp->auto_eject_hosts = CONF_UNSET_NUM; - cp->server_retry_timeout_ms = CONF_UNSET_NUM; - cp->server_failure_limit = CONF_UNSET_NUM; - cp->datastore_connections = CONF_UNSET_NUM; - cp->local_peer_connections = CONF_UNSET_NUM; - cp->remote_peer_connections = CONF_UNSET_NUM; - cp->stats_interval = CONF_UNSET_NUM; - - //initialization for dynomite - string_init(&cp->dyn_seed_provider); - string_init(&cp->dyn_listen.pname); - string_init(&cp->dyn_listen.name); - string_init(&cp->secure_server_option); - string_init(&cp->read_consistency); - string_init(&cp->write_consistency); - string_init(&cp->pem_key_file); - string_init(&cp->recon_key_file); - string_init(&cp->recon_iv_file); - string_init(&cp->stats_listen.pname); - string_init(&cp->stats_listen.name); - string_init(&cp->dc); - string_init(&cp->env); - cp->dyn_listen.port = 0; - memset(&cp->dyn_listen.info, 0, sizeof(cp->dyn_listen.info)); - cp->dyn_listen.valid = 0; - - cp->stats_listen.port = 0; - memset(&cp->stats_listen.info, 0, sizeof(cp->stats_listen.info)); - cp->stats_listen.valid = 0; - - cp->dyn_read_timeout = CONF_UNSET_NUM; - cp->dyn_write_timeout = CONF_UNSET_NUM; - cp->dyn_port = CONF_UNSET_NUM; - cp->dyn_connections = CONF_UNSET_NUM; - - cp->gos_interval = CONF_UNSET_NUM; - - cp->conn_msg_rate = 
CONF_UNSET_NUM; - - array_null(&cp->dyn_seeds); - - cp->valid = 0; - cp->enable_gossip = CONF_UNSET_BOOL; - cp->mbuf_size = CONF_UNSET_NUM; - cp->alloc_msgs_max = CONF_UNSET_NUM; - - status = string_duplicate(&cp->name, name); - if (status != DN_OK) { - return status; - } - - cp->conf_datastore = NULL; - - status = array_init(&cp->dyn_seeds, CONF_DEFAULT_SEEDS, - sizeof(struct conf_server)); - if (status != DN_OK) { - string_deinit(&cp->name); - return status; - } +// TODOs: make sure to do a mem release for all these +static rstatus_t conf_pool_init(struct conf_pool *cp, struct string *name) { + rstatus_t status; + memset(cp, 0, sizeof(*cp)); + + string_init(&cp->name); + + string_init(&cp->listen.pname); + string_init(&cp->listen.name); + + string_init(&cp->rack); + + cp->listen.port = 0; + memset(&cp->listen.info, 0, sizeof(cp->listen.info)); + cp->listen.valid = 0; + + cp->hash = CONF_UNSET_HASH; + string_init(&cp->hash_tag); + + cp->timeout = CONF_UNSET_NUM; + cp->backlog = CONF_UNSET_NUM; + + cp->client_connections = CONF_UNSET_NUM; + + cp->data_store = CONF_UNSET_NUM; + cp->preconnect = CONF_UNSET_NUM; + cp->auto_eject_hosts = CONF_UNSET_NUM; + cp->server_retry_timeout_ms = CONF_UNSET_NUM; + cp->server_failure_limit = CONF_UNSET_NUM; + cp->datastore_connections = CONF_UNSET_NUM; + cp->local_peer_connections = CONF_UNSET_NUM; + cp->remote_peer_connections = CONF_UNSET_NUM; + cp->stats_interval = CONF_UNSET_NUM; + + // initialization for dynomite + string_init(&cp->dyn_seed_provider); + string_init(&cp->dyn_listen.pname); + string_init(&cp->dyn_listen.name); + string_init(&cp->secure_server_option); + string_init(&cp->read_consistency); + string_init(&cp->write_consistency); + string_init(&cp->pem_key_file); + string_init(&cp->recon_key_file); + string_init(&cp->recon_iv_file); + string_init(&cp->stats_listen.pname); + string_init(&cp->stats_listen.name); + string_init(&cp->dc); + string_init(&cp->env); + cp->dyn_listen.port = 0; + memset(&cp->dyn_listen.info, 
0, sizeof(cp->dyn_listen.info)); + cp->dyn_listen.valid = 0; + + cp->stats_listen.port = 0; + memset(&cp->stats_listen.info, 0, sizeof(cp->stats_listen.info)); + cp->stats_listen.valid = 0; + + cp->dyn_read_timeout = CONF_UNSET_NUM; + cp->dyn_write_timeout = CONF_UNSET_NUM; + cp->dyn_port = CONF_UNSET_NUM; + cp->dyn_connections = CONF_UNSET_NUM; + + cp->gos_interval = CONF_UNSET_NUM; + + cp->conn_msg_rate = CONF_UNSET_NUM; + + array_null(&cp->dyn_seeds); + + cp->valid = 0; + cp->enable_gossip = CONF_UNSET_BOOL; + cp->mbuf_size = CONF_UNSET_NUM; + cp->alloc_msgs_max = CONF_UNSET_NUM; + + status = string_duplicate(&cp->name, name); + if (status != DN_OK) { + return status; + } + + cp->conf_datastore = NULL; + + status = array_init(&cp->dyn_seeds, CONF_DEFAULT_SEEDS, + sizeof(struct conf_server)); + if (status != DN_OK) { + string_deinit(&cp->name); + return status; + } - status = array_init(&cp->tokens, CONF_DEFAULT_VNODE_TOKENS, - sizeof(struct dyn_token)); - if (status != DN_OK) { - string_deinit(&cp->name); - array_deinit(&cp->dyn_seeds); - return status; - } + status = array_init(&cp->tokens, CONF_DEFAULT_VNODE_TOKENS, + sizeof(struct dyn_token)); + if (status != DN_OK) { + string_deinit(&cp->name); + array_deinit(&cp->dyn_seeds); + return status; + } - log_debug(LOG_VVERB, "init conf pool %p, '%.*s'", cp, name->len, name->data); + log_debug(LOG_VVERB, "init conf pool %p, '%.*s'", cp, name->len, name->data); - return DN_OK; + return DN_OK; } /** * De-initialize the connection pool configuration and free memory. * @param[in,out] cp Connection pool configuration. 
*/ -static void -conf_pool_deinit(struct conf_pool *cp) -{ - string_deinit(&cp->name); - - string_deinit(&cp->listen.pname); - string_deinit(&cp->listen.name); - - conf_server_deinit(cp->conf_datastore); - dn_free(cp->conf_datastore); - cp->conf_datastore = NULL; - - //deinit dynomite - string_deinit(&cp->dyn_seed_provider); - string_deinit(&cp->dyn_listen.pname); - string_deinit(&cp->dyn_listen.name); - string_deinit(&cp->secure_server_option); - string_deinit(&cp->read_consistency); - string_deinit(&cp->write_consistency); - string_deinit(&cp->pem_key_file); - string_deinit(&cp->recon_key_file); - string_deinit(&cp->recon_iv_file); - string_deinit(&cp->stats_listen.pname); - string_deinit(&cp->stats_listen.name); - string_deinit(&cp->dc); - string_deinit(&cp->env); - - if (array_n(&cp->dyn_seeds) != 0) - array_deinit(&cp->dyn_seeds); - - array_deinit(&cp->tokens); - - log_debug(LOG_VVERB, "deinit conf pool %p", cp); +static void conf_pool_deinit(struct conf_pool *cp) { + string_deinit(&cp->name); + + string_deinit(&cp->listen.pname); + string_deinit(&cp->listen.name); + + conf_server_deinit(cp->conf_datastore); + dn_free(cp->conf_datastore); + cp->conf_datastore = NULL; + + // deinit dynomite + string_deinit(&cp->dyn_seed_provider); + string_deinit(&cp->dyn_listen.pname); + string_deinit(&cp->dyn_listen.name); + string_deinit(&cp->secure_server_option); + string_deinit(&cp->read_consistency); + string_deinit(&cp->write_consistency); + string_deinit(&cp->pem_key_file); + string_deinit(&cp->recon_key_file); + string_deinit(&cp->recon_iv_file); + string_deinit(&cp->stats_listen.pname); + string_deinit(&cp->stats_listen.name); + string_deinit(&cp->dc); + string_deinit(&cp->env); + + if (array_n(&cp->dyn_seeds) != 0) array_deinit(&cp->dyn_seeds); + + array_deinit(&cp->tokens); + + log_debug(LOG_VVERB, "deinit conf pool %p", cp); } -secure_server_option_t -get_secure_server_option(struct string *option) -{ - if (dn_strcmp(option->data, CONF_SECURE_OPTION_NONE) == 0) { 
- return SECURE_OPTION_NONE; - } - if (dn_strcmp(option->data, CONF_SECURE_OPTION_RACK) == 0) { - return SECURE_OPTION_RACK; - } - if (dn_strcmp(option->data, CONF_SECURE_OPTION_DC) == 0) { - return SECURE_OPTION_DC; - } - if (dn_strcmp(option->data, CONF_SECURE_OPTION_ALL) == 0) { - return SECURE_OPTION_ALL; - } +secure_server_option_t get_secure_server_option(struct string *option) { + if (dn_strcmp(option->data, CONF_SECURE_OPTION_NONE) == 0) { return SECURE_OPTION_NONE; + } + if (dn_strcmp(option->data, CONF_SECURE_OPTION_RACK) == 0) { + return SECURE_OPTION_RACK; + } + if (dn_strcmp(option->data, CONF_SECURE_OPTION_DC) == 0) { + return SECURE_OPTION_DC; + } + if (dn_strcmp(option->data, CONF_SECURE_OPTION_ALL) == 0) { + return SECURE_OPTION_ALL; + } + return SECURE_OPTION_NONE; } /** * Output the entire configuration into the log file. * @param[in] cf Dynomite configuration. */ -static void -conf_dump(struct conf *cf) -{ - uint32_t j; - struct string *s; - - log_debug(LOG_VVERB, "pool in configuration file '%s'", cf->fname); - - struct conf_pool *cp = &cf->pool; - - log_debug(LOG_VVERB, "%.*s", cp->name.len, cp->name.data); - log_debug(LOG_VVERB, " listen: %.*s", - cp->listen.pname.len, cp->listen.pname.data); - log_debug(LOG_VVERB, " timeout: %d", cp->timeout); - log_debug(LOG_VVERB, " backlog: %d", cp->backlog); - log_debug(LOG_VVERB, " hash: %d", cp->hash); - log_debug(LOG_VVERB, " hash_tag: \"%.*s\"", cp->hash_tag.len, +static void conf_dump(struct conf *cf) { + uint32_t j; + struct string *s; + + log_debug(LOG_VVERB, "pool in configuration file '%s'", cf->fname); + + struct conf_pool *cp = &cf->pool; + + log_debug(LOG_VVERB, "%.*s", cp->name.len, cp->name.data); + log_debug(LOG_VVERB, " listen: %.*s", cp->listen.pname.len, + cp->listen.pname.data); + log_debug(LOG_VVERB, " timeout: %d", cp->timeout); + log_debug(LOG_VVERB, " backlog: %d", cp->backlog); + log_debug(LOG_VVERB, " hash: %d", cp->hash); + log_debug(LOG_VVERB, " hash_tag: \"%.*s\"", 
cp->hash_tag.len, cp->hash_tag.data); - log_debug(LOG_VVERB, " client_connections: %d", - cp->client_connections); - const char * temp_log = "unknown"; - if(g_data_store == DATA_REDIS){ - temp_log = "redis"; - } - else if(g_data_store == DATA_MEMCACHE){ - temp_log = "memcache"; - } - log_debug(LOG_VVERB, " data_store: %d (%s)", g_data_store, temp_log); - log_debug(LOG_VVERB, " preconnect: %d", cp->preconnect); - log_debug(LOG_VVERB, " auto_eject_hosts: %d", cp->auto_eject_hosts); - log_debug(LOG_VVERB, " server_retry_timeout: %d (msec)", + log_debug(LOG_VVERB, " client_connections: %d", cp->client_connections); + const char *temp_log = "unknown"; + if (g_data_store == DATA_REDIS) { + temp_log = "redis"; + } else if (g_data_store == DATA_MEMCACHE) { + temp_log = "memcache"; + } + log_debug(LOG_VVERB, " data_store: %d (%s)", g_data_store, temp_log); + log_debug(LOG_VVERB, " preconnect: %d", cp->preconnect); + log_debug(LOG_VVERB, " auto_eject_hosts: %d", cp->auto_eject_hosts); + log_debug(LOG_VVERB, " server_retry_timeout: %d (msec)", cp->server_retry_timeout_ms); - log_debug(LOG_VVERB, " server_failure_limit: %d", - cp->server_failure_limit); + log_debug(LOG_VVERB, " server_failure_limit: %d", cp->server_failure_limit); - log_debug(LOG_VVERB, " datastore: "); - log_debug(LOG_VVERB, " %.*s", - cp->conf_datastore->name.len, cp->conf_datastore->name.data); + log_debug(LOG_VVERB, " datastore: "); + log_debug(LOG_VVERB, " %.*s", cp->conf_datastore->name.len, + cp->conf_datastore->name.data); - log_debug(LOG_VVERB, " dyn_seed_provider: \"%.*s\"", cp->dyn_seed_provider.len, cp->dyn_seed_provider.data); + log_debug(LOG_VVERB, " dyn_seed_provider: \"%.*s\"", + cp->dyn_seed_provider.len, cp->dyn_seed_provider.data); - uint32_t nseeds = array_n(&cp->dyn_seeds); - log_debug(LOG_VVERB, " dyn_seeds: %"PRIu32"", nseeds); - for (j = 0; j < nseeds; j++) { - s = array_get(&cp->dyn_seeds, j); - log_debug(LOG_VVERB, " %.*s", s->len, s->data); - } + uint32_t nseeds = 
array_n(&cp->dyn_seeds); + log_debug(LOG_VVERB, " dyn_seeds: %" PRIu32 "", nseeds); + for (j = 0; j < nseeds; j++) { + s = array_get(&cp->dyn_seeds, j); + log_debug(LOG_VVERB, " %.*s", s->len, s->data); + } - log_debug(LOG_VVERB, " env: %.*s", cp->env.len, cp->env.data); - log_debug(LOG_VVERB, " rack: %.*s", cp->rack.len, cp->rack.data); - log_debug(LOG_VVERB, " dc: %.*s", cp->dc.len, cp->dc.data); + log_debug(LOG_VVERB, " env: %.*s", cp->env.len, cp->env.data); + log_debug(LOG_VVERB, " rack: %.*s", cp->rack.len, cp->rack.data); + log_debug(LOG_VVERB, " dc: %.*s", cp->dc.len, cp->dc.data); - log_debug(LOG_VVERB, " dyn_listen: %.*s", - cp->dyn_listen.pname.len, cp->dyn_listen.pname.data); - log_debug(LOG_VVERB, " dyn_read_timeout: %d", cp->dyn_read_timeout); - log_debug(LOG_VVERB, " dyn_write_timeout: %d", cp->dyn_write_timeout); - log_debug(LOG_VVERB, " dyn_connections: %d", cp->dyn_connections); + log_debug(LOG_VVERB, " dyn_listen: %.*s", cp->dyn_listen.pname.len, + cp->dyn_listen.pname.data); + log_debug(LOG_VVERB, " dyn_read_timeout: %d", cp->dyn_read_timeout); + log_debug(LOG_VVERB, " dyn_write_timeout: %d", cp->dyn_write_timeout); + log_debug(LOG_VVERB, " dyn_connections: %d", cp->dyn_connections); - log_debug(LOG_VVERB, " gos_interval: %lu", cp->gos_interval); - log_debug(LOG_VVERB, " conn_msg_rate: %d", cp->conn_msg_rate); + log_debug(LOG_VVERB, " gos_interval: %lu", cp->gos_interval); + log_debug(LOG_VVERB, " conn_msg_rate: %d", cp->conn_msg_rate); - log_debug(LOG_VVERB, " secure_server_option: \"%.*s\"", - cp->secure_server_option.len, - cp->secure_server_option.data); + log_debug(LOG_VVERB, " secure_server_option: \"%.*s\"", + cp->secure_server_option.len, cp->secure_server_option.data); - log_debug(LOG_VVERB, " read_consistency: \"%.*s\"", - cp->read_consistency.len, + log_debug(LOG_VVERB, " read_consistency: \"%.*s\"", cp->read_consistency.len, cp->read_consistency.data); - log_debug(LOG_VVERB, " write_consistency: \"%.*s\"", - 
cp->write_consistency.len, - cp->write_consistency.data); + log_debug(LOG_VVERB, " write_consistency: \"%.*s\"", + cp->write_consistency.len, cp->write_consistency.data); - log_debug(LOG_VVERB, " stats_interval: %lu", cp->stats_interval); - log_debug(LOG_VVERB, " stats_listen: %.*s", - cp->stats_listen.pname.len, cp->stats_listen.pname.data); + log_debug(LOG_VVERB, " stats_interval: %lu", cp->stats_interval); + log_debug(LOG_VVERB, " stats_listen: %.*s", cp->stats_listen.pname.len, + cp->stats_listen.pname.data); - log_debug(LOG_VVERB, " enable_gossip: %s", cp->enable_gossip ? "true" : "false"); + log_debug(LOG_VVERB, " enable_gossip: %s", + cp->enable_gossip ? "true" : "false"); - log_debug(LOG_VVERB, " mbuf_size: %d", cp->mbuf_size); - log_debug(LOG_VVERB, " max_msgs: %d", cp->alloc_msgs_max); + log_debug(LOG_VVERB, " mbuf_size: %d", cp->mbuf_size); + log_debug(LOG_VVERB, " max_msgs: %d", cp->alloc_msgs_max); - log_debug(LOG_VVERB, " dc: \"%.*s\"", cp->dc.len, cp->dc.data); - log_debug(LOG_VVERB, " datastore_connections: %d", + log_debug(LOG_VVERB, " dc: \"%.*s\"", cp->dc.len, cp->dc.data); + log_debug(LOG_VVERB, " datastore_connections: %d", cp->datastore_connections); - log_debug(LOG_VVERB, " local_peer_connections: %d", + log_debug(LOG_VVERB, " local_peer_connections: %d", cp->local_peer_connections); - log_debug(LOG_VVERB, " remote_peer_connections: %d", + log_debug(LOG_VVERB, " remote_peer_connections: %d", cp->remote_peer_connections); } -static rstatus_t -conf_yaml_init(struct conf *cf) -{ - int rv; +static rstatus_t conf_yaml_init(struct conf *cf) { + int rv; - ASSERT(!cf->valid_parser); + ASSERT(!cf->valid_parser); - rv = fseek(cf->fh, 0L, SEEK_SET); - if (rv < 0) { - log_error("conf: failed to seek to the beginning of file '%s': %s", - cf->fname, strerror(errno)); - return DN_ERROR; - } + rv = fseek(cf->fh, 0L, SEEK_SET); + if (rv < 0) { + log_error("conf: failed to seek to the beginning of file '%s': %s", + cf->fname, strerror(errno)); + return 
DN_ERROR; + } - rv = yaml_parser_initialize(&cf->parser); - if (!rv) { - log_error("conf: failed (err %d) to initialize yaml parser", - cf->parser.error); - return DN_ERROR; - } + rv = yaml_parser_initialize(&cf->parser); + if (!rv) { + log_error("conf: failed (err %d) to initialize yaml parser", + cf->parser.error); + return DN_ERROR; + } - yaml_parser_set_input_file(&cf->parser, cf->fh); - cf->valid_parser = 1; + yaml_parser_set_input_file(&cf->parser, cf->fh); + cf->valid_parser = 1; - return DN_OK; + return DN_OK; } -static void -conf_yaml_deinit(struct conf *cf) -{ - if (cf->valid_parser) { - yaml_parser_delete(&cf->parser); - cf->valid_parser = 0; - } +static void conf_yaml_deinit(struct conf *cf) { + if (cf->valid_parser) { + yaml_parser_delete(&cf->parser); + cf->valid_parser = 0; + } } -static rstatus_t -conf_token_next(struct conf *cf) -{ - int rv; +static rstatus_t conf_token_next(struct conf *cf) { + int rv; - ASSERT(cf->valid_parser && !cf->valid_token); + ASSERT(cf->valid_parser && !cf->valid_token); - rv = yaml_parser_scan(&cf->parser, &cf->token); - if (!rv) { - log_error("conf: failed (err %d) to scan next token", cf->parser.error); - return DN_ERROR; - } - cf->valid_token = 1; + rv = yaml_parser_scan(&cf->parser, &cf->token); + if (!rv) { + log_error("conf: failed (err %d) to scan next token", cf->parser.error); + return DN_ERROR; + } + cf->valid_token = 1; - return DN_OK; + return DN_OK; } -static void -conf_token_done(struct conf *cf) -{ - ASSERT(cf->valid_parser); +static void conf_token_done(struct conf *cf) { + ASSERT(cf->valid_parser); - if (cf->valid_token) { - yaml_token_delete(&cf->token); - cf->valid_token = 0; - } + if (cf->valid_token) { + yaml_token_delete(&cf->token); + cf->valid_token = 0; + } } -static rstatus_t -conf_event_next(struct conf *cf) -{ - int rv; +static rstatus_t conf_event_next(struct conf *cf) { + int rv; - ASSERT(cf->valid_parser && !cf->valid_event); + ASSERT(cf->valid_parser && !cf->valid_event); - rv = 
yaml_parser_parse(&cf->parser, &cf->event); - if (!rv) { - log_error("conf: failed (err %d) to get next event", cf->parser.error); - return DN_ERROR; - } - cf->valid_event = 1; + rv = yaml_parser_parse(&cf->parser, &cf->event); + if (!rv) { + log_error("conf: failed (err %d) to get next event", cf->parser.error); + return DN_ERROR; + } + cf->valid_event = 1; - return DN_OK; + return DN_OK; } -static void -conf_event_done(struct conf *cf) -{ - if (cf->valid_event) { - yaml_event_delete(&cf->event); - cf->valid_event = 0; - } +static void conf_event_done(struct conf *cf) { + if (cf->valid_event) { + yaml_event_delete(&cf->event); + cf->valid_event = 0; + } } -static rstatus_t -conf_push_scalar(struct conf *cf) -{ - rstatus_t status; - struct string *value; - uint8_t *scalar; - uint32_t scalar_len; - - scalar = cf->event.data.scalar.value; - scalar_len = (uint32_t)cf->event.data.scalar.length; - if (scalar_len == 0) { - return DN_ERROR; - } - log_debug(LOG_VVERB, "push '%.*s'", scalar_len, scalar); +static rstatus_t conf_push_scalar(struct conf *cf) { + rstatus_t status; + struct string *value; + uint8_t *scalar; + uint32_t scalar_len; - value = array_push(&cf->arg); - if (value == NULL) { - return DN_ENOMEM; - } - string_init(value); - - status = string_copy(value, scalar, scalar_len); - if (status != DN_OK) { - array_pop(&cf->arg); - return status; - } - - return DN_OK; + scalar = cf->event.data.scalar.value; + scalar_len = (uint32_t)cf->event.data.scalar.length; + if (scalar_len == 0) { + return DN_ERROR; + } + log_debug(LOG_VVERB, "push '%.*s'", scalar_len, scalar); + + value = array_push(&cf->arg); + if (value == NULL) { + return DN_ENOMEM; + } + string_init(value); + + status = string_copy(value, scalar, scalar_len); + if (status != DN_OK) { + array_pop(&cf->arg); + return status; + } + + return DN_OK; } -static void -conf_pop_scalar(struct conf *cf) -{ - struct string *value; +static void conf_pop_scalar(struct conf *cf) { + struct string *value; - value = 
array_pop(&cf->arg); - log_debug(LOG_VVERB, "pop '%.*s'", value->len, value->data); - string_deinit(value); + value = array_pop(&cf->arg); + log_debug(LOG_VVERB, "pop '%.*s'", value->len, value->data); + string_deinit(value); } -static char * -conf_set_string(struct conf *cf, struct command *cmd, void *conf) -{ - rstatus_t status; - uint8_t *p; - struct string *field, *value; +static char *conf_set_string(struct conf *cf, struct command *cmd, void *conf) { + rstatus_t status; + uint8_t *p; + struct string *field, *value; - p = conf; - field = (struct string *)(p + cmd->offset); + p = conf; + field = (struct string *)(p + cmd->offset); - if (field->data != CONF_UNSET_PTR) { - return "is a duplicate"; - } + if (field->data != CONF_UNSET_PTR) { + return "is a duplicate"; + } - value = array_top(&cf->arg); + value = array_top(&cf->arg); - status = string_duplicate(field, value); - if (status != DN_OK) { - return CONF_ERROR; - } + status = string_duplicate(field, value); + if (status != DN_OK) { + return CONF_ERROR; + } - return CONF_OK; + return CONF_OK; } -static char * -conf_set_listen(struct conf *cf, struct command *cmd, void *conf) -{ - rstatus_t status; - struct string *value; - struct conf_listen *field; - uint8_t *p, *name; - uint32_t namelen; +static char *conf_set_listen(struct conf *cf, struct command *cmd, void *conf) { + rstatus_t status; + struct string *value; + struct conf_listen *field; + uint8_t *p, *name; + uint32_t namelen; - p = conf; - field = (struct conf_listen *)(p + cmd->offset); + p = conf; + field = (struct conf_listen *)(p + cmd->offset); - if (field->valid == 1) { - return "is a duplicate"; - } + if (field->valid == 1) { + return "is a duplicate"; + } - value = array_top(&cf->arg); + value = array_top(&cf->arg); - status = string_duplicate(&field->pname, value); - if (status != DN_OK) { - return CONF_ERROR; - } + status = string_duplicate(&field->pname, value); + if (status != DN_OK) { + return CONF_ERROR; + } - if (value->data[0] == '/') 
{ - name = value->data; - namelen = value->len; - } else { - uint8_t *q, *start, *port; - uint32_t portlen; - - /* parse "hostname:port" from the end */ - p = value->data + value->len - 1; - start = value->data; - q = dn_strrchr(p, start, ':'); - if (q == NULL) { - return "has an invalid \"hostname:port\" format string"; - } + if (value->data[0] == '/') { + name = value->data; + namelen = value->len; + } else { + uint8_t *q, *start, *port; + uint32_t portlen; - port = q + 1; - portlen = (uint32_t)(p - port + 1); + /* parse "hostname:port" from the end */ + p = value->data + value->len - 1; + start = value->data; + q = dn_strrchr(p, start, ':'); + if (q == NULL) { + return "has an invalid \"hostname:port\" format string"; + } - p = q - 1; + port = q + 1; + portlen = (uint32_t)(p - port + 1); - name = start; - namelen = (uint32_t)(p - start + 1); + p = q - 1; - field->port = dn_atoi(port, portlen); - if (field->port < 0 || !dn_valid_port(field->port)) { - return "has an invalid port in \"hostname:port\" format string"; - } - } + name = start; + namelen = (uint32_t)(p - start + 1); - status = string_copy(&field->name, name, namelen); - if (status != DN_OK) { - return CONF_ERROR; + field->port = dn_atoi(port, portlen); + if (field->port < 0 || !dn_valid_port(field->port)) { + return "has an invalid port in \"hostname:port\" format string"; } + } - status = dn_resolve(&field->name, field->port, &field->info); - if (status != DN_OK) { - return CONF_ERROR; - } + status = string_copy(&field->name, name, namelen); + if (status != DN_OK) { + return CONF_ERROR; + } + + status = dn_resolve(&field->name, field->port, &field->info); + if (status != DN_OK) { + return CONF_ERROR; + } - field->valid = 1; + field->valid = 1; - return CONF_OK; + return CONF_OK; } /* Parses server:port:data_store from yaml */ -static char * -conf_add_server(struct conf *cf, struct command *cmd, void *conf) -{ - rstatus_t status; - struct string *value; - struct conf_server *field; - uint8_t *p, *q, 
*start; - uint8_t *pname, *addr, *port, *name; - uint32_t k, delimlen, pnamelen, addrlen, portlen, namelen; - struct string address; - char delim[] = " ::"; - - string_init(&address); - p = conf; - struct conf_server **pfield = (struct conf_server **)(p + cmd->offset); - ASSERT(*pfield == NULL); - *pfield = (struct conf_server *)dn_zalloc(sizeof(struct conf_server)); - field = *pfield; - status = conf_server_init(field); - if (status != DN_OK) { - dn_free(*pfield); - *pfield = NULL; - return CONF_ERROR; +static char *conf_add_server(struct conf *cf, struct command *cmd, void *conf) { + rstatus_t status; + struct string *value; + struct conf_server *field; + uint8_t *p, *q, *start; + uint8_t *pname, *addr, *port, *name; + uint32_t k, delimlen, pnamelen, addrlen, portlen, namelen; + struct string address; + char delim[] = " ::"; + + string_init(&address); + p = conf; + struct conf_server **pfield = (struct conf_server **)(p + cmd->offset); + ASSERT(*pfield == NULL); + *pfield = (struct conf_server *)dn_zalloc(sizeof(struct conf_server)); + field = *pfield; + status = conf_server_init(field); + if (status != DN_OK) { + dn_free(*pfield); + *pfield = NULL; + return CONF_ERROR; + } + + value = array_top(&cf->arg); + + /* parse "hostname:port:weight [name]" or "/path/unix_socket:weight [name]" + * from the end */ + p = value->data + value->len - 1; + start = value->data; + addr = NULL; + addrlen = 0; + port = NULL; + portlen = 0; + name = NULL; + namelen = 0; + + delimlen = value->data[0] == '/' ? 
2 : 3; + + for (k = 0; k < sizeof(delim); k++) { + q = dn_strrchr(p, start, delim[k]); + if (q == NULL) { + if (k == 0) { + /* + * name in "hostname:port:weight [name]" format string is + * optional + */ + continue; + } + break; } - value = array_top(&cf->arg); - - /* parse "hostname:port:weight [name]" or "/path/unix_socket:weight [name]" from the end */ - p = value->data + value->len - 1; - start = value->data; - addr = NULL; - addrlen = 0; - port = NULL; - portlen = 0; - name = NULL; - namelen = 0; - - delimlen = value->data[0] == '/' ? 2 : 3; - - for (k = 0; k < sizeof(delim); k++) { - q = dn_strrchr(p, start, delim[k]); - if (q == NULL) { - if (k == 0) { - /* - * name in "hostname:port:weight [name]" format string is - * optional - */ - continue; - } - break; - } - - switch (k) { - case 0: - name = q + 1; - namelen = (uint32_t)(p - name + 1); - break; - - case 1: - // ignore the weight portion, we never use it. - // But parse it nevertheless for backward compatibility - break; + switch (k) { + case 0: + name = q + 1; + namelen = (uint32_t)(p - name + 1); + break; - case 2: - port = q + 1; - portlen = (uint32_t)(p - port + 1); - break; + case 1: + // ignore the weight portion, we never use it. + // But parse it nevertheless for backward compatibility + break; - default: - NOT_REACHED(); - } + case 2: + port = q + 1; + portlen = (uint32_t)(p - port + 1); + break; - p = q - 1; + default: + NOT_REACHED(); } - if (k != delimlen) { - return "has an invalid \"hostname:port:weight [name]\"or \"/path/unix_socket:weight [name]\" format string"; - } + p = q - 1; + } - pname = value->data; - pnamelen = namelen > 0 ? 
value->len - (namelen + 1) : value->len; - status = string_copy(&field->pname, pname, pnamelen); - if (status != DN_OK) { - dn_free(*pfield); - *pfield = NULL; - return CONF_ERROR; - } + if (k != delimlen) { + return "has an invalid \"hostname:port:weight [name]\"or " + "\"/path/unix_socket:weight [name]\" format string"; + } - // addr: hostname - addr = start; - // addrlen: hostname length - addrlen = (uint32_t)(p - start + 1); + pname = value->data; + pnamelen = namelen > 0 ? value->len - (namelen + 1) : value->len; + status = string_copy(&field->pname, pname, pnamelen); + if (status != DN_OK) { + dn_free(*pfield); + *pfield = NULL; + return CONF_ERROR; + } - // port is relevant only for non unix socket address - if (value->data[0] != '/') { - field->port = dn_atoi(port, portlen); - if (field->port < 0 || !dn_valid_port(field->port)) { - return "has an invalid port in \"hostname:port:weight [name]\" format string"; - } - } + // addr: hostname + addr = start; + // addrlen: hostname length + addrlen = (uint32_t)(p - start + 1); - if (name == NULL) { - /* - * To maintain backward compatibility with libmemcached, we don't - * include the port as the part of the input string to the consistent - * hashing algorithm, when it is equal to 11211. 
- */ - if (field->port == CONF_DEFAULT_KETAMA_PORT) { - name = addr; - namelen = addrlen; - } else { - name = addr; - namelen = addrlen + 1 + portlen; - } - } - - status = string_copy(&field->name, name, namelen); - if (status != DN_OK) { - return CONF_ERROR; + // port is relevant only for non unix socket address + if (value->data[0] != '/') { + field->port = dn_atoi(port, portlen); + if (field->port < 0 || !dn_valid_port(field->port)) { + return "has an invalid port in \"hostname:port:weight [name]\" format " + "string"; } + } - status = string_copy(&address, addr, addrlen); - if (status != DN_OK) { - return CONF_ERROR; + if (name == NULL) { + /* + * To maintain backward compatibility with libmemcached, we don't + * include the port as the part of the input string to the consistent + * hashing algorithm, when it is equal to 11211. + */ + if (field->port == CONF_DEFAULT_KETAMA_PORT) { + name = addr; + namelen = addrlen; + } else { + name = addr; + namelen = addrlen + 1 + portlen; } + } + status = string_copy(&field->name, name, namelen); + if (status != DN_OK) { + return CONF_ERROR; + } - status = dn_resolve(&address, field->port, &field->info); - if (status != DN_OK) { - string_deinit(&address); - return CONF_ERROR; - } + status = string_copy(&address, addr, addrlen); + if (status != DN_OK) { + return CONF_ERROR; + } + status = dn_resolve(&address, field->port, &field->info); + if (status != DN_OK) { string_deinit(&address); - field->valid = 1; + return CONF_ERROR; + } - return CONF_OK; -} + string_deinit(&address); + field->valid = 1; + return CONF_OK; +} /* - * Well, this just blows. Copied from conf_add_server() there is a colon delimited - * string in the yaml that requires a few levels of magic in order to guess the - * structure of, rather than doing the right thing and making proper fields. - * Need to fix however, there's bigger fish to deep fry at this point. + * Well, this just blows. 
Copied from conf_add_server() there is a colon + * delimited string in the yaml that requires a few levels of magic in order to + * guess the structure of, rather than doing the right thing and making proper + * fields. Need to fix however, there's bigger fish to deep fry at this point. */ -static char * -conf_add_dyn_server(struct conf *cf, struct command *cmd, void *conf) -{ - rstatus_t status; - struct array *a; - struct string *value; - struct conf_server *field; - uint8_t *p, *q, *start; - uint8_t *pname, *addr, *port, *rack, *tokens, *name, *dc; - uint32_t k, delimlen, pnamelen, addrlen, portlen, racklen, tokenslen, namelen, dclen; - struct string address; - char delim[] = " ::::"; - - string_init(&address); - p = conf; // conf_pool - a = (struct array *)(p + cmd->offset); // a is conf_server array - - field = array_push(a); - if (field == NULL) { - return CONF_ERROR; - } - - status = conf_server_init(field); // field is conf_server - if (status != DN_OK) { - return CONF_ERROR; - } - - value = array_top(&cf->arg); - - /* parse "hostname:port:rack:dc:tokens [name]" */ - p = value->data + value->len - 1; // p is now pointing to a string - start = value->data; - addr = NULL; - addrlen = 0; - rack = NULL; - racklen = 0; - tokens = NULL; - tokenslen = 0; - port = NULL; - portlen = 0; - name = NULL; - namelen = 0; - dc = NULL; - dclen = 0; - - delimlen = 5; - - for (k = 0; k < sizeof(delim); k++) { - q = dn_strrchr(p, start, delim[k]); - if (q == NULL) { - if (k == 0) { - /* - * name in "hostname:port:rack:dc:tokens [name]" format string is - * optional - */ - continue; - } - break; - } - - switch (k) { - case 0: - name = q + 1; - namelen = (uint32_t)(p - name + 1); - break; - - case 1: - tokens = q + 1; - tokenslen = (uint32_t)(p - tokens + 1); - break; - - case 2: - dc = q + 1; - dclen = (uint32_t)(p - dc + 1); - break; - - case 3: - rack = q + 1; - racklen = (uint32_t)(p - rack + 1); - break; - - case 4: - port = q + 1; - portlen = (uint32_t)(p - port + 1); - 
break; - - default: - NOT_REACHED(); - } - - p = q - 1; - } - - if (k != delimlen) { - return "has an invalid format must match \"hostname:port:rack:dc:tokens [name]\""; - } - - pname = value->data; // seed node config string. - pnamelen = namelen > 0 ? value->len - (namelen + 1) : value->len; - status = string_copy(&field->pname, pname, pnamelen); - if (status != DN_OK) { - array_pop(a); - return CONF_ERROR; +static char *conf_add_dyn_server(struct conf *cf, struct command *cmd, + void *conf) { + rstatus_t status; + struct array *a; + struct string *value; + struct conf_server *field; + uint8_t *p, *q, *start; + uint8_t *pname, *addr, *port, *rack, *tokens, *name, *dc; + uint32_t k, delimlen, pnamelen, addrlen, portlen, racklen, tokenslen, namelen, + dclen; + struct string address; + char delim[] = " ::::"; + + string_init(&address); + p = conf; // conf_pool + a = (struct array *)(p + cmd->offset); // a is conf_server array + + field = array_push(a); + if (field == NULL) { + return CONF_ERROR; + } + + status = conf_server_init(field); // field is conf_server + if (status != DN_OK) { + return CONF_ERROR; + } + + value = array_top(&cf->arg); + + /* parse "hostname:port:rack:dc:tokens [name]" */ + p = value->data + value->len - 1; // p is now pointing to a string + start = value->data; + addr = NULL; + addrlen = 0; + rack = NULL; + racklen = 0; + tokens = NULL; + tokenslen = 0; + port = NULL; + portlen = 0; + name = NULL; + namelen = 0; + dc = NULL; + dclen = 0; + + delimlen = 5; + + for (k = 0; k < sizeof(delim); k++) { + q = dn_strrchr(p, start, delim[k]); + if (q == NULL) { + if (k == 0) { + /* + * name in "hostname:port:rack:dc:tokens [name]" format string is + * optional + */ + continue; + } + break; } - status = string_copy(&field->dc, dc, dclen); - if (status != DN_OK) { - array_pop(a); - return CONF_ERROR; - } + switch (k) { + case 0: + name = q + 1; + namelen = (uint32_t)(p - name + 1); + break; - status = string_copy(&field->rack, rack, racklen); - if 
(status != DN_OK) { - array_pop(a); - return CONF_ERROR; - } + case 1: + tokens = q + 1; + tokenslen = (uint32_t)(p - tokens + 1); + break; - uint8_t *t_end = tokens + tokenslen; - status = derive_tokens(&field->tokens, tokens, t_end); - if (status != DN_OK) { - array_pop(a); - return CONF_ERROR; - } + case 2: + dc = q + 1; + dclen = (uint32_t)(p - dc + 1); + break; - // addr is hostname - addr = start; - addrlen = (uint32_t)(p - start + 1); + case 3: + rack = q + 1; + racklen = (uint32_t)(p - rack + 1); + break; - field->port = dn_atoi(port, portlen); - if (field->port < 0 || !dn_valid_port(field->port)) { - return "has an invalid port in \"hostname:port:weight [name]\" format string"; - } + case 4: + port = q + 1; + portlen = (uint32_t)(p - port + 1); + break; - if (name == NULL) { - /* - * To maintain backward compatibility with libmemcached, we don't - * include the port as the part of the input string to the consistent - * hashing algorithm, when it is equal to 11211. - */ - if (field->port == CONF_DEFAULT_KETAMA_PORT) { - name = addr; - namelen = addrlen; - } else { - name = addr; - namelen = addrlen + 1 + portlen; - } + default: + NOT_REACHED(); } - status = string_copy(&field->name, name, namelen); - if (status != DN_OK) { - return CONF_ERROR; + p = q - 1; + } + + if (k != delimlen) { + return "has an invalid format must match \"hostname:port:rack:dc:tokens " + "[name]\""; + } + + pname = value->data; // seed node config string. + pnamelen = namelen > 0 ? 
value->len - (namelen + 1) : value->len; + status = string_copy(&field->pname, pname, pnamelen); + if (status != DN_OK) { + array_pop(a); + return CONF_ERROR; + } + + status = string_copy(&field->dc, dc, dclen); + if (status != DN_OK) { + array_pop(a); + return CONF_ERROR; + } + + status = string_copy(&field->rack, rack, racklen); + if (status != DN_OK) { + array_pop(a); + return CONF_ERROR; + } + + uint8_t *t_end = tokens + tokenslen; + status = derive_tokens(&field->tokens, tokens, t_end); + if (status != DN_OK) { + array_pop(a); + return CONF_ERROR; + } + + // addr is hostname + addr = start; + addrlen = (uint32_t)(p - start + 1); + + field->port = dn_atoi(port, portlen); + if (field->port < 0 || !dn_valid_port(field->port)) { + return "has an invalid port in \"hostname:port:weight [name]\" format " + "string"; + } + + if (name == NULL) { + /* + * To maintain backward compatibility with libmemcached, we don't + * include the port as the part of the input string to the consistent + * hashing algorithm, when it is equal to 11211. 
+ */ + if (field->port == CONF_DEFAULT_KETAMA_PORT) { + name = addr; + namelen = addrlen; + } else { + name = addr; + namelen = addrlen + 1 + portlen; } + } - status = string_copy(&address, addr, addrlen); - if (status != DN_OK) { - return CONF_ERROR; - } + status = string_copy(&field->name, name, namelen); + if (status != DN_OK) { + return CONF_ERROR; + } - status = dn_resolve(&address, field->port, &field->info); - if (status != DN_OK) { - string_deinit(&address); - return CONF_ERROR; - } + status = string_copy(&address, addr, addrlen); + if (status != DN_OK) { + return CONF_ERROR; + } + status = dn_resolve(&address, field->port, &field->info); + if (status != DN_OK) { string_deinit(&address); - field->valid = 1; + return CONF_ERROR; + } - return CONF_OK; + string_deinit(&address); + field->valid = 1; + + return CONF_OK; } -static char * -conf_set_tokens(struct conf *cf, struct command *cmd, void *conf) -{ - uint8_t *p = conf; - struct array *tokens = (struct array *)(p + cmd->offset); - struct string *value = array_top(&cf->arg); - p = value->data + value->len; +static char *conf_set_tokens(struct conf *cf, struct command *cmd, void *conf) { + uint8_t *p = conf; + struct array *tokens = (struct array *)(p + cmd->offset); + struct string *value = array_top(&cf->arg); + p = value->data + value->len; - rstatus_t status = derive_tokens(tokens, value->data, p); - if (status != DN_OK) { - //TODO: should we dealloc the tokens/array? - return CONF_ERROR; - } + rstatus_t status = derive_tokens(tokens, value->data, p); + if (status != DN_OK) { + // TODO: should we dealloc the tokens/array? 
+ return CONF_ERROR; + } - return CONF_OK; + return CONF_OK; } -static char * -conf_set_num(struct conf *cf, struct command *cmd, void *conf) -{ - uint8_t *p; - int num, *np; - struct string *value; +static char *conf_set_num(struct conf *cf, struct command *cmd, void *conf) { + uint8_t *p; + int num, *np; + struct string *value; - p = conf; - np = (int *)(p + cmd->offset); + p = conf; + np = (int *)(p + cmd->offset); - if (*np != CONF_UNSET_NUM) { - return "is a duplicate"; - } + if (*np != CONF_UNSET_NUM) { + return "is a duplicate"; + } - value = array_top(&cf->arg); + value = array_top(&cf->arg); - num = dn_atoi(value->data, value->len); - if (num < 0) { - return "is not a number"; - } + num = dn_atoi(value->data, value->len); + if (num < 0) { + return "is not a number"; + } - *np = num; + *np = num; - return CONF_OK; + return CONF_OK; } -static char * -conf_set_bool(struct conf *cf, struct command *cmd, void *conf) -{ - uint8_t *p; - int *bp; - struct string *value, true_str, false_str; +static char *conf_set_bool(struct conf *cf, struct command *cmd, void *conf) { + uint8_t *p; + int *bp; + struct string *value, true_str, false_str; - p = conf; - bp = (int *)(p + cmd->offset); + p = conf; + bp = (int *)(p + cmd->offset); - if (*bp != CONF_UNSET_NUM) { - return "is a duplicate"; - } + if (*bp != CONF_UNSET_NUM) { + return "is a duplicate"; + } - value = array_top(&cf->arg); - string_set_text(&true_str, "true"); - string_set_text(&false_str, "false"); + value = array_top(&cf->arg); + string_set_text(&true_str, "true"); + string_set_text(&false_str, "false"); - if (string_compare(value, &true_str) == 0) { - *bp = 1; - } else if (string_compare(value, &false_str) == 0) { - *bp = 0; - } else { - return "is not \"true\" or \"false\""; - } + if (string_compare(value, &true_str) == 0) { + *bp = 1; + } else if (string_compare(value, &false_str) == 0) { + *bp = 0; + } else { + return "is not \"true\" or \"false\""; + } - return CONF_OK; + return CONF_OK; } -static char 
* -conf_set_hash(struct conf *cf, struct command *cmd, void *conf) -{ - uint8_t *p; - hash_type_t *hp; - struct string *value; +static char *conf_set_hash(struct conf *cf, struct command *cmd, void *conf) { + uint8_t *p; + hash_type_t *hp; + struct string *value; - p = conf; - hp = (hash_type_t *)(p + cmd->offset); + p = conf; + hp = (hash_type_t *)(p + cmd->offset); - if (*hp != CONF_UNSET_HASH) { - return "is a duplicate"; - } + if (*hp != CONF_UNSET_HASH) { + return "is a duplicate"; + } - value = array_top(&cf->arg); + value = array_top(&cf->arg); - *hp = get_hash_type(value); - if (*hp == HASH_INVALID) - return "is not a valid hash"; - return CONF_OK; + *hp = get_hash_type(value); + if (*hp == HASH_INVALID) return "is not a valid hash"; + return CONF_OK; } -static char * -conf_set_deprecated(struct conf *cf, struct command *cmd, void *conf) -{ - log_warn("******** Field \"%.*s\" in the conf file is DEPRECATED *********", - cmd->name.len, cmd->name.data); - return CONF_OK; +static char *conf_set_deprecated(struct conf *cf, struct command *cmd, + void *conf) { + log_warn("******** Field \"%.*s\" in the conf file is DEPRECATED *********", + cmd->name.len, cmd->name.data); + return CONF_OK; } -static char * -conf_set_hashtag(struct conf *cf, struct command *cmd, void *conf) -{ - rstatus_t status; - uint8_t *p; - struct string *field, *value; +static char *conf_set_hashtag(struct conf *cf, struct command *cmd, + void *conf) { + rstatus_t status; + uint8_t *p; + struct string *field, *value; - p = conf; - field = (struct string *)(p + cmd->offset); + p = conf; + field = (struct string *)(p + cmd->offset); - if (field->data != CONF_UNSET_PTR) { - return "is a duplicate"; - } + if (field->data != CONF_UNSET_PTR) { + return "is a duplicate"; + } - value = array_top(&cf->arg); + value = array_top(&cf->arg); - if (value->len != 2) { - return "is not a valid hash tag string with two characters"; - } + if (value->len != 2) { + return "is not a valid hash tag string with 
two characters"; + } - status = string_duplicate(field, value); - if (status != DN_OK) { - return CONF_ERROR; - } + status = string_duplicate(field, value); + if (status != DN_OK) { + return CONF_ERROR; + } - return CONF_OK; + return CONF_OK; } static struct command conf_commands[] = { - { string("listen"), - conf_set_listen, - offsetof(struct conf_pool, listen) }, + {string("listen"), conf_set_listen, offsetof(struct conf_pool, listen)}, - { string("hash"), - conf_set_hash, - offsetof(struct conf_pool, hash) }, + {string("hash"), conf_set_hash, offsetof(struct conf_pool, hash)}, - { string("hash_tag"), - conf_set_hashtag, - offsetof(struct conf_pool, hash_tag) }, + {string("hash_tag"), conf_set_hashtag, + offsetof(struct conf_pool, hash_tag)}, - { string("distribution"), - conf_set_deprecated, - offsetof(struct conf_pool, deprecated) }, + {string("distribution"), conf_set_deprecated, + offsetof(struct conf_pool, deprecated)}, - { string("timeout"), - conf_set_num, - offsetof(struct conf_pool, timeout) }, + {string("timeout"), conf_set_num, offsetof(struct conf_pool, timeout)}, - { string("backlog"), - conf_set_num, - offsetof(struct conf_pool, backlog) }, + {string("backlog"), conf_set_num, offsetof(struct conf_pool, backlog)}, - { string("client_connections"), - conf_set_num, - offsetof(struct conf_pool, client_connections) }, + {string("client_connections"), conf_set_num, + offsetof(struct conf_pool, client_connections)}, - { string("data_store"), - conf_set_num, - offsetof(struct conf_pool, data_store) }, + {string("data_store"), conf_set_num, + offsetof(struct conf_pool, data_store)}, - { string("preconnect"), - conf_set_bool, - offsetof(struct conf_pool, preconnect) }, + {string("preconnect"), conf_set_bool, + offsetof(struct conf_pool, preconnect)}, - { string("auto_eject_hosts"), - conf_set_bool, - offsetof(struct conf_pool, auto_eject_hosts) }, + {string("auto_eject_hosts"), conf_set_bool, + offsetof(struct conf_pool, auto_eject_hosts)}, - { 
string("server_connections"), - conf_set_deprecated, - offsetof(struct conf_pool, deprecated) }, + {string("server_connections"), conf_set_deprecated, + offsetof(struct conf_pool, deprecated)}, - { string("server_retry_timeout"), - conf_set_num, - offsetof(struct conf_pool, server_retry_timeout_ms) }, + {string("server_retry_timeout"), conf_set_num, + offsetof(struct conf_pool, server_retry_timeout_ms)}, - { string("server_failure_limit"), - conf_set_num, - offsetof(struct conf_pool, server_failure_limit) }, + {string("server_failure_limit"), conf_set_num, + offsetof(struct conf_pool, server_failure_limit)}, - { string("servers"), - conf_add_server, - offsetof(struct conf_pool, conf_datastore) }, + {string("servers"), conf_add_server, + offsetof(struct conf_pool, conf_datastore)}, - { string("dyn_read_timeout"), - conf_set_num, - offsetof(struct conf_pool, dyn_read_timeout) }, + {string("dyn_read_timeout"), conf_set_num, + offsetof(struct conf_pool, dyn_read_timeout)}, - { string("dyn_write_timeout"), - conf_set_num, - offsetof(struct conf_pool, dyn_write_timeout) }, + {string("dyn_write_timeout"), conf_set_num, + offsetof(struct conf_pool, dyn_write_timeout)}, - { string("dyn_listen"), - conf_set_listen, - offsetof(struct conf_pool, dyn_listen) }, + {string("dyn_listen"), conf_set_listen, + offsetof(struct conf_pool, dyn_listen)}, - { string("dyn_seed_provider"), - conf_set_string, - offsetof(struct conf_pool, dyn_seed_provider) }, + {string("dyn_seed_provider"), conf_set_string, + offsetof(struct conf_pool, dyn_seed_provider)}, - { string("dyn_seeds"), - conf_add_dyn_server, - offsetof(struct conf_pool, dyn_seeds) }, + {string("dyn_seeds"), conf_add_dyn_server, + offsetof(struct conf_pool, dyn_seeds)}, - { string("dyn_port"), - conf_set_num, - offsetof(struct conf_pool, dyn_port) }, + {string("dyn_port"), conf_set_num, offsetof(struct conf_pool, dyn_port)}, - { string("dyn_connections"), - conf_set_num, - offsetof(struct conf_pool, dyn_connections) }, + 
{string("dyn_connections"), conf_set_num, + offsetof(struct conf_pool, dyn_connections)}, - { string("rack"), - conf_set_string, - offsetof(struct conf_pool, rack) }, + {string("rack"), conf_set_string, offsetof(struct conf_pool, rack)}, - { string("tokens"), - conf_set_tokens, - offsetof(struct conf_pool, tokens) }, + {string("tokens"), conf_set_tokens, offsetof(struct conf_pool, tokens)}, - { string("gos_interval"), - conf_set_num, - offsetof(struct conf_pool, gos_interval) }, + {string("gos_interval"), conf_set_num, + offsetof(struct conf_pool, gos_interval)}, - { string("secure_server_option"), - conf_set_string, - offsetof(struct conf_pool, secure_server_option) }, + {string("secure_server_option"), conf_set_string, + offsetof(struct conf_pool, secure_server_option)}, - { string("pem_key_file"), - conf_set_string, - offsetof(struct conf_pool, pem_key_file) }, + {string("pem_key_file"), conf_set_string, + offsetof(struct conf_pool, pem_key_file)}, - { string("recon_key_file"), - conf_set_string, - offsetof(struct conf_pool, recon_key_file) }, + {string("recon_key_file"), conf_set_string, + offsetof(struct conf_pool, recon_key_file)}, - { string("recon_iv_file"), - conf_set_string, - offsetof(struct conf_pool, recon_iv_file) }, + {string("recon_iv_file"), conf_set_string, + offsetof(struct conf_pool, recon_iv_file)}, - { string("datacenter"), - conf_set_string, - offsetof(struct conf_pool, dc) }, + {string("datacenter"), conf_set_string, offsetof(struct conf_pool, dc)}, - { string("env"), - conf_set_string, - offsetof(struct conf_pool, env) }, + {string("env"), conf_set_string, offsetof(struct conf_pool, env)}, - { string("conn_msg_rate"), - conf_set_num, - offsetof(struct conf_pool, conn_msg_rate)}, + {string("conn_msg_rate"), conf_set_num, + offsetof(struct conf_pool, conn_msg_rate)}, - { string("read_consistency"), - conf_set_string, - offsetof(struct conf_pool, read_consistency) }, + {string("read_consistency"), conf_set_string, + offsetof(struct conf_pool, 
read_consistency)}, - { string("write_consistency"), - conf_set_string, - offsetof(struct conf_pool, write_consistency) }, + {string("write_consistency"), conf_set_string, + offsetof(struct conf_pool, write_consistency)}, - { string("stats_listen"), - conf_set_listen, - offsetof(struct conf_pool, stats_listen) }, + {string("stats_listen"), conf_set_listen, + offsetof(struct conf_pool, stats_listen)}, - { string("stats_interval"), - conf_set_num, - offsetof(struct conf_pool, stats_interval) }, + {string("stats_interval"), conf_set_num, + offsetof(struct conf_pool, stats_interval)}, - { string("enable_gossip"), - conf_set_bool, - offsetof(struct conf_pool, enable_gossip) }, + {string("enable_gossip"), conf_set_bool, + offsetof(struct conf_pool, enable_gossip)}, - { string("mbuf_size"), - conf_set_num, - offsetof(struct conf_pool, mbuf_size) }, + {string("mbuf_size"), conf_set_num, offsetof(struct conf_pool, mbuf_size)}, - { string("max_msgs"), - conf_set_num, - offsetof(struct conf_pool, alloc_msgs_max) }, + {string("max_msgs"), conf_set_num, + offsetof(struct conf_pool, alloc_msgs_max)}, - { string("datastore_connections"), - conf_set_num, - offsetof(struct conf_pool, datastore_connections) }, + {string("datastore_connections"), conf_set_num, + offsetof(struct conf_pool, datastore_connections)}, - { string("local_peer_connections"), - conf_set_num, - offsetof(struct conf_pool, local_peer_connections) }, + {string("local_peer_connections"), conf_set_num, + offsetof(struct conf_pool, local_peer_connections)}, - { string("remote_peer_connections"), - conf_set_num, - offsetof(struct conf_pool, remote_peer_connections) }, + {string("remote_peer_connections"), conf_set_num, + offsetof(struct conf_pool, remote_peer_connections)}, - null_command -}; + null_command}; -static rstatus_t -conf_handler(struct conf *cf, void *data) -{ - struct command *cmd; - struct string *key, *value; - uint32_t narg; - - if (array_n(&cf->arg) == 1) { - value = array_top(&cf->arg); - 
log_debug(LOG_VVERB, "conf handler on '%.*s'", value->len, value->data); - return conf_pool_init(data, value); - } +static rstatus_t conf_handler(struct conf *cf, void *data) { + struct command *cmd; + struct string *key, *value; + uint32_t narg; - narg = array_n(&cf->arg); - value = array_get(&cf->arg, narg - 1); - key = array_get(&cf->arg, narg - 2); + if (array_n(&cf->arg) == 1) { + value = array_top(&cf->arg); + log_debug(LOG_VVERB, "conf handler on '%.*s'", value->len, value->data); + return conf_pool_init(data, value); + } - log_debug(LOG_VVERB, "conf handler on %.*s: %.*s", key->len, key->data, - value->len, value->data); + narg = array_n(&cf->arg); + value = array_get(&cf->arg, narg - 1); + key = array_get(&cf->arg, narg - 2); - for (cmd = conf_commands; cmd->name.len != 0; cmd++) { - char *rv; + log_debug(LOG_VVERB, "conf handler on %.*s: %.*s", key->len, key->data, + value->len, value->data); - if (string_compare(key, &cmd->name) != 0) { - continue; - } + for (cmd = conf_commands; cmd->name.len != 0; cmd++) { + char *rv; - rv = cmd->set(cf, cmd, data); - if (rv != CONF_OK) { - log_error("conf: directive \"%.*s\" %s", key->len, key->data, rv); - return DN_ERROR; - } + if (string_compare(key, &cmd->name) != 0) { + continue; + } - return DN_OK; + rv = cmd->set(cf, cmd, data); + if (rv != CONF_OK) { + log_error("conf: directive \"%.*s\" %s", key->len, key->data, rv); + return DN_ERROR; } - log_error("conf: directive \"%.*s\" is unknown", key->len, key->data); + return DN_OK; + } - return DN_ERROR; + log_error("conf: directive \"%.*s\" is unknown", key->len, key->data); + + return DN_ERROR; } -static rstatus_t -conf_begin_parse(struct conf *cf) -{ - rstatus_t status; - bool done; +static rstatus_t conf_begin_parse(struct conf *cf) { + rstatus_t status; + bool done; - ASSERT(cf->sound && !cf->parsed); - ASSERT(cf->depth == 0); + ASSERT(cf->sound && !cf->parsed); + ASSERT(cf->depth == 0); - status = conf_yaml_init(cf); + status = conf_yaml_init(cf); + if (status 
!= DN_OK) { + return status; + } + + done = false; + do { + status = conf_event_next(cf); if (status != DN_OK) { - return status; + return status; } - done = false; - do { - status = conf_event_next(cf); - if (status != DN_OK) { - return status; - } - - log_debug(LOG_VVERB, "next begin event %d", cf->event.type); + log_debug(LOG_VVERB, "next begin event %d", cf->event.type); - switch (cf->event.type) { - case YAML_STREAM_START_EVENT: - case YAML_DOCUMENT_START_EVENT: - break; + switch (cf->event.type) { + case YAML_STREAM_START_EVENT: + case YAML_DOCUMENT_START_EVENT: + break; - case YAML_MAPPING_START_EVENT: - ASSERT(cf->depth < CONF_MAX_DEPTH); - cf->depth++; - done = true; - break; + case YAML_MAPPING_START_EVENT: + ASSERT(cf->depth < CONF_MAX_DEPTH); + cf->depth++; + done = true; + break; - default: - NOT_REACHED(); - } + default: + NOT_REACHED(); + } - conf_event_done(cf); + conf_event_done(cf); - } while (!done); + } while (!done); - return DN_OK; + return DN_OK; } -static rstatus_t -conf_end_parse(struct conf *cf) -{ - rstatus_t status; - bool done; +static rstatus_t conf_end_parse(struct conf *cf) { + rstatus_t status; + bool done; - ASSERT(cf->sound && !cf->parsed); - ASSERT(cf->depth == 0); + ASSERT(cf->sound && !cf->parsed); + ASSERT(cf->depth == 0); - done = false; - do { - status = conf_event_next(cf); - if (status != DN_OK) { - return status; - } + done = false; + do { + status = conf_event_next(cf); + if (status != DN_OK) { + return status; + } - log_debug(LOG_VVERB, "next end event %d", cf->event.type); + log_debug(LOG_VVERB, "next end event %d", cf->event.type); - switch (cf->event.type) { - case YAML_STREAM_END_EVENT: - done = true; - break; + switch (cf->event.type) { + case YAML_STREAM_END_EVENT: + done = true; + break; - case YAML_DOCUMENT_END_EVENT: - break; + case YAML_DOCUMENT_END_EVENT: + break; - default: - NOT_REACHED(); - } + default: + NOT_REACHED(); + } - conf_event_done(cf); - } while (!done); + conf_event_done(cf); + } while (!done); 
- conf_yaml_deinit(cf); + conf_yaml_deinit(cf); - return DN_OK; + return DN_OK; } -static rstatus_t -conf_parse_core(struct conf *cf, void *data) -{ - rstatus_t status; - bool done, leaf, new_pool; +static rstatus_t conf_parse_core(struct conf *cf, void *data) { + rstatus_t status; + bool done, leaf, new_pool; - ASSERT(cf->sound); + ASSERT(cf->sound); - status = conf_event_next(cf); - if (status != DN_OK) { - return status; - } + status = conf_event_next(cf); + if (status != DN_OK) { + return status; + } - log_debug(LOG_VVERB, "next event %d depth %"PRIu32" seq %d", cf->event.type, - cf->depth, cf->seq); + log_debug(LOG_VVERB, "next event %d depth %" PRIu32 " seq %d", cf->event.type, + cf->depth, cf->seq); - done = false; - leaf = false; - new_pool = false; + done = false; + leaf = false; + new_pool = false; - switch (cf->event.type) { + switch (cf->event.type) { case YAML_MAPPING_END_EVENT: - cf->depth--; - if (cf->depth == 1) { - conf_pop_scalar(cf); - } else if (cf->depth == 0) { - done = true; - } - break; + cf->depth--; + if (cf->depth == 1) { + conf_pop_scalar(cf); + } else if (cf->depth == 0) { + done = true; + } + break; case YAML_MAPPING_START_EVENT: - cf->depth++; - break; + cf->depth++; + break; case YAML_SEQUENCE_START_EVENT: - cf->seq = 1; - break; + cf->seq = 1; + break; case YAML_SEQUENCE_END_EVENT: - conf_pop_scalar(cf); - cf->seq = 0; - break; + conf_pop_scalar(cf); + cf->seq = 0; + break; case YAML_SCALAR_EVENT: - status = conf_push_scalar(cf); - if (status != DN_OK) { - break; - } - - /* take appropriate action */ - if (cf->seq) { - /* for a sequence, leaf is at CONF_MAX_DEPTH */ - ASSERT(cf->depth == CONF_MAX_DEPTH); - leaf = true; - } else if (cf->depth == CONF_ROOT_DEPTH) { - data = &cf->pool; - new_pool = true; - } else if (array_n(&cf->arg) == cf->depth + 1) { - /* for {key: value}, leaf is at CONF_MAX_DEPTH */ - ASSERT(cf->depth == CONF_MAX_DEPTH); - leaf = true; - } + status = conf_push_scalar(cf); + if (status != DN_OK) { break; + } + + 
/* take appropriate action */ + if (cf->seq) { + /* for a sequence, leaf is at CONF_MAX_DEPTH */ + ASSERT(cf->depth == CONF_MAX_DEPTH); + leaf = true; + } else if (cf->depth == CONF_ROOT_DEPTH) { + data = &cf->pool; + new_pool = true; + } else if (array_n(&cf->arg) == cf->depth + 1) { + /* for {key: value}, leaf is at CONF_MAX_DEPTH */ + ASSERT(cf->depth == CONF_MAX_DEPTH); + leaf = true; + } + break; default: - NOT_REACHED(); - break; - } + NOT_REACHED(); + break; + } - conf_event_done(cf); + conf_event_done(cf); - if (status != DN_OK) { - return status; - } + if (status != DN_OK) { + return status; + } - if (done) { - /* terminating condition */ - return DN_OK; - } + if (done) { + /* terminating condition */ + return DN_OK; + } - if (leaf || new_pool) { - status = conf_handler(cf, data); + if (leaf || new_pool) { + status = conf_handler(cf, data); - if (leaf) { - conf_pop_scalar(cf); - if (!cf->seq) { - conf_pop_scalar(cf); - } - } + if (leaf) { + conf_pop_scalar(cf); + if (!cf->seq) { + conf_pop_scalar(cf); + } + } - if (status != DN_OK) { - return status; - } + if (status != DN_OK) { + return status; } + } - return conf_parse_core(cf, data); + return conf_parse_core(cf, data); } -static rstatus_t -conf_parse(struct conf *cf) -{ - rstatus_t status; +static rstatus_t conf_parse(struct conf *cf) { + rstatus_t status; - ASSERT(cf->sound && !cf->parsed); - ASSERT(array_n(&cf->arg) == 0); + ASSERT(cf->sound && !cf->parsed); + ASSERT(array_n(&cf->arg) == 0); - status = conf_begin_parse(cf); - if (status != DN_OK) { - return status; - } + status = conf_begin_parse(cf); + if (status != DN_OK) { + return status; + } - status = conf_parse_core(cf, NULL); - if (status != DN_OK) { - return status; - } + status = conf_parse_core(cf, NULL); + if (status != DN_OK) { + return status; + } - status = conf_end_parse(cf); - if (status != DN_OK) { - return status; - } + status = conf_end_parse(cf); + if (status != DN_OK) { + return status; + } - cf->parsed = 1; + cf->parsed = 1; - 
return DN_OK; + return DN_OK; } -static struct conf * -conf_open(char *filename) -{ - rstatus_t status; - struct conf *cf; - FILE *fh; - - fh = fopen(filename, "r"); - if (fh == NULL) { - log_error("conf: failed to open configuration '%s': %s", filename, - strerror(errno)); - return NULL; - } - - cf = dn_zalloc(sizeof(*cf)); - if (cf == NULL) { - fclose(fh); - return NULL; - } - - status = array_init(&cf->arg, CONF_DEFAULT_ARGS, sizeof(struct string)); - if (status != DN_OK) { - dn_free(cf); - fclose(fh); - return NULL; - } +static struct conf *conf_open(char *filename) { + rstatus_t status; + struct conf *cf; + FILE *fh; - cf->fname = filename; - cf->fh = fh; - cf->depth = 0; - /* parser, event, and token are initialized later */ - cf->seq = 0; - cf->valid_parser = 0; - cf->valid_event = 0; - cf->valid_token = 0; - cf->sound = 0; - cf->parsed = 0; - cf->valid = 0; + fh = fopen(filename, "r"); + if (fh == NULL) { + log_error("conf: failed to open configuration '%s': %s", filename, + strerror(errno)); + return NULL; + } - log_debug(LOG_VVERB, "opened conf '%s'", filename); + cf = dn_zalloc(sizeof(*cf)); + if (cf == NULL) { + fclose(fh); + return NULL; + } - return cf; + status = array_init(&cf->arg, CONF_DEFAULT_ARGS, sizeof(struct string)); + if (status != DN_OK) { + dn_free(cf); + fclose(fh); + return NULL; + } + + cf->fname = filename; + cf->fh = fh; + cf->depth = 0; + /* parser, event, and token are initialized later */ + cf->seq = 0; + cf->valid_parser = 0; + cf->valid_event = 0; + cf->valid_token = 0; + cf->sound = 0; + cf->parsed = 0; + cf->valid = 0; + + log_debug(LOG_VVERB, "opened conf '%s'", filename); + + return cf; } -static rstatus_t -conf_validate_document(struct conf *cf) -{ - rstatus_t status; - uint32_t count; - bool done; +static rstatus_t conf_validate_document(struct conf *cf) { + rstatus_t status; + uint32_t count; + bool done; + + status = conf_yaml_init(cf); + if (status != DN_OK) { + return status; + } + + count = 0; + done = false; + do { + 
yaml_document_t document; + yaml_node_t *node; + int rv; - status = conf_yaml_init(cf); - if (status != DN_OK) { - return status; + rv = yaml_parser_load(&cf->parser, &document); + if (!rv) { + log_error("conf: failed (err %d) to get the next yaml document", + cf->parser.error); + conf_yaml_deinit(cf); + return DN_ERROR; } - count = 0; - done = false; - do { - yaml_document_t document; - yaml_node_t *node; - int rv; - - rv = yaml_parser_load(&cf->parser, &document); - if (!rv) { - log_error("conf: failed (err %d) to get the next yaml document", - cf->parser.error); - conf_yaml_deinit(cf); - return DN_ERROR; - } - - node = yaml_document_get_root_node(&document); - if (node == NULL) { - done = true; - } else { - count++; - } + node = yaml_document_get_root_node(&document); + if (node == NULL) { + done = true; + } else { + count++; + } - yaml_document_delete(&document); - } while (!done); + yaml_document_delete(&document); + } while (!done); - conf_yaml_deinit(cf); + conf_yaml_deinit(cf); - if (count != 1) { - log_error("conf: '%s' must contain only 1 document; found %"PRIu32" " - "documents", cf->fname, count); - return DN_ERROR; - } + if (count != 1) { + log_error("conf: '%s' must contain only 1 document; found %" PRIu32 + " " + "documents", + cf->fname, count); + return DN_ERROR; + } - return DN_OK; + return DN_OK; } -static rstatus_t -conf_validate_tokens(struct conf *cf) -{ - rstatus_t status; - bool done, error; - int type; +static rstatus_t conf_validate_tokens(struct conf *cf) { + rstatus_t status; + bool done, error; + int type; + + status = conf_yaml_init(cf); + if (status != DN_OK) { + return status; + } - status = conf_yaml_init(cf); + done = false; + error = false; + do { + status = conf_token_next(cf); if (status != DN_OK) { - return status; + return status; } + type = cf->token.type; - done = false; - error = false; - do { - status = conf_token_next(cf); - if (status != DN_OK) { - return status; - } - type = cf->token.type; - - switch (type) { - case 
YAML_NO_TOKEN: - error = true; - log_error("conf: no token (%d) is disallowed", type); - break; - - case YAML_VERSION_DIRECTIVE_TOKEN: - error = true; - log_error("conf: version directive token (%d) is disallowed", type); - break; - - case YAML_TAG_DIRECTIVE_TOKEN: - error = true; - log_error("conf: tag directive token (%d) is disallowed", type); - break; - - case YAML_DOCUMENT_START_TOKEN: - error = true; - log_error("conf: document start token (%d) is disallowed", type); - break; - - case YAML_DOCUMENT_END_TOKEN: - error = true; - log_error("conf: document end token (%d) is disallowed", type); - break; - - case YAML_FLOW_SEQUENCE_START_TOKEN: - error = true; - log_error("conf: flow sequence start token (%d) is disallowed", type); - break; - - case YAML_FLOW_SEQUENCE_END_TOKEN: - error = true; - log_error("conf: flow sequence end token (%d) is disallowed", type); - break; - - case YAML_FLOW_MAPPING_START_TOKEN: - error = true; - log_error("conf: flow mapping start token (%d) is disallowed", type); - break; - - case YAML_FLOW_MAPPING_END_TOKEN: - error = true; - log_error("conf: flow mapping end token (%d) is disallowed", type); - break; - - case YAML_FLOW_ENTRY_TOKEN: - error = true; - log_error("conf: flow entry token (%d) is disallowed", type); - break; - - case YAML_ALIAS_TOKEN: - error = true; - log_error("conf: alias token (%d) is disallowed", type); - break; - - case YAML_ANCHOR_TOKEN: - error = true; - log_error("conf: anchor token (%d) is disallowed", type); - break; - - case YAML_TAG_TOKEN: - error = true; - log_error("conf: tag token (%d) is disallowed", type); - break; - - case YAML_BLOCK_SEQUENCE_START_TOKEN: - case YAML_BLOCK_MAPPING_START_TOKEN: - case YAML_BLOCK_END_TOKEN: - case YAML_BLOCK_ENTRY_TOKEN: - break; - - case YAML_KEY_TOKEN: - case YAML_VALUE_TOKEN: - case YAML_SCALAR_TOKEN: - break; - - case YAML_STREAM_START_TOKEN: - break; - - case YAML_STREAM_END_TOKEN: - done = true; - log_debug(LOG_VVERB, "conf '%s' has valid tokens", cf->fname); - 
break; - - default: - error = true; - log_error("conf: unknown token (%d) is disallowed", type); - break; - } + switch (type) { + case YAML_NO_TOKEN: + error = true; + log_error("conf: no token (%d) is disallowed", type); + break; - conf_token_done(cf); - } while (!done && !error); + case YAML_VERSION_DIRECTIVE_TOKEN: + error = true; + log_error("conf: version directive token (%d) is disallowed", type); + break; - conf_yaml_deinit(cf); + case YAML_TAG_DIRECTIVE_TOKEN: + error = true; + log_error("conf: tag directive token (%d) is disallowed", type); + break; - return !error ? DN_OK : DN_ERROR; -} + case YAML_DOCUMENT_START_TOKEN: + error = true; + log_error("conf: document start token (%d) is disallowed", type); + break; -static rstatus_t -conf_validate_structure(struct conf *cf) -{ - rstatus_t status; - int type, depth; - uint32_t i, count[CONF_MAX_DEPTH + 1]; - bool done, error, seq; + case YAML_DOCUMENT_END_TOKEN: + error = true; + log_error("conf: document end token (%d) is disallowed", type); + break; - status = conf_yaml_init(cf); - if (status != DN_OK) { - return status; - } + case YAML_FLOW_SEQUENCE_START_TOKEN: + error = true; + log_error("conf: flow sequence start token (%d) is disallowed", type); + break; - done = false; - error = false; - seq = false; - depth = 0; - for (i = 0; i < CONF_MAX_DEPTH + 1; i++) { - count[i] = 0; - } + case YAML_FLOW_SEQUENCE_END_TOKEN: + error = true; + log_error("conf: flow sequence end token (%d) is disallowed", type); + break; - /* - * Validate that the configuration conforms roughly to the following - * yaml tree structure: - * - * keyx: - * key1: value1 - * key2: value2 - * seq: - * - elem1 - * - elem2 - * - elem3 - * key3: value3 - * - * keyy: - * key1: value1 - * key2: value2 - * seq: - * - elem1 - * - elem2 - * - elem3 - * key3: value3 - */ - do { - status = conf_event_next(cf); - if (status != DN_OK) { - return status; - } + case YAML_FLOW_MAPPING_START_TOKEN: + error = true; + log_error("conf: flow mapping start 
token (%d) is disallowed", type); + break; - type = cf->event.type; - - log_debug(LOG_VVERB, "next event %d depth %d seq %d", type, depth, seq); - - switch (type) { - case YAML_STREAM_START_EVENT: - case YAML_DOCUMENT_START_EVENT: - break; - - case YAML_DOCUMENT_END_EVENT: - break; - - case YAML_STREAM_END_EVENT: - done = true; - break; - - case YAML_MAPPING_START_EVENT: - if (depth == CONF_ROOT_DEPTH && count[depth] != 1) { - error = true; - log_error("conf: '%s' has more than one \"key:value\" at depth" - " %d", cf->fname, depth); - } else if (depth >= CONF_MAX_DEPTH) { - error = true; - log_error("conf: '%s' has a depth greater than %d", cf->fname, - CONF_MAX_DEPTH); - } - depth++; - break; - - case YAML_MAPPING_END_EVENT: - if (depth == CONF_MAX_DEPTH) { - if (seq) { - seq = false; - //} else { - // error = true; - // log_error("conf: '%s' missing sequence directive at depth " - // "%d", cf->fname, depth); - } - } - depth--; - count[depth] = 0; - break; - - case YAML_SEQUENCE_START_EVENT: - if (seq) { - error = true; - log_error("conf: '%s' has more than one sequence directive", - cf->fname); - } else if (depth != CONF_MAX_DEPTH) { - error = true; - log_error("conf: '%s' has sequence at depth %d instead of %d", - cf->fname, depth, CONF_MAX_DEPTH); - } else if (count[depth] != 1) { - error = true; - log_error("conf: '%s' has invalid \"key:value\" at depth %d", - cf->fname, depth); - } - seq = true; - break; - - case YAML_SEQUENCE_END_EVENT: - ASSERT(depth == CONF_MAX_DEPTH); - count[depth] = 0; - seq = false; - break; - - case YAML_SCALAR_EVENT: - if (depth == 0) { - error = true; - log_error("conf: '%s' has invalid empty \"key:\" at depth %d", - cf->fname, depth); - } else if (depth == CONF_ROOT_DEPTH && count[depth] != 0) { - error = true; - log_error("conf: '%s' has invalid mapping \"key:\" at depth %d", - cf->fname, depth); - } else if (depth == CONF_MAX_DEPTH && count[depth] == 2) { - /* found a "key: value", resetting! 
*/ - count[depth] = 0; - } - count[depth]++; - break; - - default: - NOT_REACHED(); - } + case YAML_FLOW_MAPPING_END_TOKEN: + error = true; + log_error("conf: flow mapping end token (%d) is disallowed", type); + break; - conf_event_done(cf); - } while (!done && !error); + case YAML_FLOW_ENTRY_TOKEN: + error = true; + log_error("conf: flow entry token (%d) is disallowed", type); + break; - conf_yaml_deinit(cf); + case YAML_ALIAS_TOKEN: + error = true; + log_error("conf: alias token (%d) is disallowed", type); + break; - return !error ? DN_OK : DN_ERROR; -} + case YAML_ANCHOR_TOKEN: + error = true; + log_error("conf: anchor token (%d) is disallowed", type); + break; -static rstatus_t -conf_pre_validate(struct conf *cf) -{ - rstatus_t status; + case YAML_TAG_TOKEN: + error = true; + log_error("conf: tag token (%d) is disallowed", type); + break; + case YAML_BLOCK_SEQUENCE_START_TOKEN: + case YAML_BLOCK_MAPPING_START_TOKEN: + case YAML_BLOCK_END_TOKEN: + case YAML_BLOCK_ENTRY_TOKEN: + break; - status = conf_validate_document(cf); - if (status != DN_OK) { - return status; - } + case YAML_KEY_TOKEN: + case YAML_VALUE_TOKEN: + case YAML_SCALAR_TOKEN: + break; - status = conf_validate_tokens(cf); - if (status != DN_OK) { - return status; - } + case YAML_STREAM_START_TOKEN: + break; - status = conf_validate_structure(cf); - if (status != DN_OK) { - return status; + case YAML_STREAM_END_TOKEN: + done = true; + log_debug(LOG_VVERB, "conf '%s' has valid tokens", cf->fname); + break; + + default: + error = true; + log_error("conf: unknown token (%d) is disallowed", type); + break; } - cf->sound = 1; + conf_token_done(cf); + } while (!done && !error); - return DN_OK; + conf_yaml_deinit(cf); + + return !error ? 
DN_OK : DN_ERROR; } -static rstatus_t -conf_validate_server(struct conf *cf, struct conf_pool *cp) -{ - if (cp->conf_datastore == NULL) { - log_error("conf: pool '%.*s' has no datastores", cp->name.len, - cp->name.data); - return DN_ERROR; +static rstatus_t conf_validate_structure(struct conf *cf) { + rstatus_t status; + int type, depth; + uint32_t i, count[CONF_MAX_DEPTH + 1]; + bool done, error, seq; + + status = conf_yaml_init(cf); + if (status != DN_OK) { + return status; + } + + done = false; + error = false; + seq = false; + depth = 0; + for (i = 0; i < CONF_MAX_DEPTH + 1; i++) { + count[i] = 0; + } + + /* + * Validate that the configuration conforms roughly to the following + * yaml tree structure: + * + * keyx: + * key1: value1 + * key2: value2 + * seq: + * - elem1 + * - elem2 + * - elem3 + * key3: value3 + * + * keyy: + * key1: value1 + * key2: value2 + * seq: + * - elem1 + * - elem2 + * - elem3 + * key3: value3 + */ + do { + status = conf_event_next(cf); + if (status != DN_OK) { + return status; } - return DN_OK; -} + type = cf->event.type; -/* Validate pool config and set defaults. 
*/ -static rstatus_t -conf_validate_pool(struct conf *cf, struct conf_pool *cp) -{ - rstatus_t status; + log_debug(LOG_VVERB, "next event %d depth %d seq %d", type, depth, seq); - ASSERT(!cp->valid); - ASSERT(!string_empty(&cp->name)); + switch (type) { + case YAML_STREAM_START_EVENT: + case YAML_DOCUMENT_START_EVENT: + break; - if (!cp->listen.valid) { - log_error("conf: directive \"listen:\" is missing"); - return DN_ERROR; - } + case YAML_DOCUMENT_END_EVENT: + break; - /* set default values for unset directives */ + case YAML_STREAM_END_EVENT: + done = true; + break; - if (string_empty(&cp->dyn_seed_provider)) { - string_copy_c(&cp->dyn_seed_provider, (const uint8_t *)CONF_DEFAULT_SEED_PROVIDER); - } + case YAML_MAPPING_START_EVENT: + if (depth == CONF_ROOT_DEPTH && count[depth] != 1) { + error = true; + log_error( + "conf: '%s' has more than one \"key:value\" at depth" + " %d", + cf->fname, depth); + } else if (depth >= CONF_MAX_DEPTH) { + error = true; + log_error("conf: '%s' has a depth greater than %d", cf->fname, + CONF_MAX_DEPTH); + } + depth++; + break; - if (cp->hash == CONF_UNSET_HASH) { - cp->hash = CONF_DEFAULT_HASH; - } + case YAML_MAPPING_END_EVENT: + if (depth == CONF_MAX_DEPTH) { + if (seq) { + seq = false; + //} else { + // error = true; + // log_error("conf: '%s' missing sequence directive at depth " + // "%d", cf->fname, depth); + } + } + depth--; + count[depth] = 0; + break; - if (cp->timeout == CONF_UNSET_NUM) { - cp->timeout = CONF_DEFAULT_TIMEOUT; - } + case YAML_SEQUENCE_START_EVENT: + if (seq) { + error = true; + log_error("conf: '%s' has more than one sequence directive", + cf->fname); + } else if (depth != CONF_MAX_DEPTH) { + error = true; + log_error("conf: '%s' has sequence at depth %d instead of %d", + cf->fname, depth, CONF_MAX_DEPTH); + } else if (count[depth] != 1) { + error = true; + log_error("conf: '%s' has invalid \"key:value\" at depth %d", + cf->fname, depth); + } + seq = true; + break; - if (cp->backlog == CONF_UNSET_NUM) { 
- cp->backlog = CONF_DEFAULT_LISTEN_BACKLOG; - } + case YAML_SEQUENCE_END_EVENT: + ASSERT(depth == CONF_MAX_DEPTH); + count[depth] = 0; + seq = false; + break; - cp->client_connections = CONF_DEFAULT_CLIENT_CONNECTIONS; + case YAML_SCALAR_EVENT: + if (depth == 0) { + error = true; + log_error("conf: '%s' has invalid empty \"key:\" at depth %d", + cf->fname, depth); + } else if (depth == CONF_ROOT_DEPTH && count[depth] != 0) { + error = true; + log_error("conf: '%s' has invalid mapping \"key:\" at depth %d", + cf->fname, depth); + } else if (depth == CONF_MAX_DEPTH && count[depth] == 2) { + /* found a "key: value", resetting! */ + count[depth] = 0; + } + count[depth]++; + break; - if (cp->data_store == CONF_UNSET_NUM) { - cp->data_store = CONF_DEFAULT_DATASTORE; + default: + NOT_REACHED(); } - if (cp->preconnect == CONF_UNSET_NUM) { - cp->preconnect = CONF_DEFAULT_PRECONNECT; - } + conf_event_done(cf); + } while (!done && !error); - if (cp->auto_eject_hosts == CONF_UNSET_NUM) { - cp->auto_eject_hosts = CONF_DEFAULT_AUTO_EJECT_HOSTS; - } + conf_yaml_deinit(cf); - if (cp->server_retry_timeout_ms == CONF_UNSET_NUM) { - cp->server_retry_timeout_ms = CONF_DEFAULT_SERVER_RETRY_TIMEOUT; - } + return !error ? 
DN_OK : DN_ERROR; +} - if (cp->server_failure_limit == CONF_UNSET_NUM) { - cp->server_failure_limit = CONF_DEFAULT_SERVER_FAILURE_LIMIT; - } +static rstatus_t conf_pre_validate(struct conf *cf) { + rstatus_t status; - if (cp->dyn_read_timeout == CONF_UNSET_NUM) { - cp->dyn_read_timeout = CONF_DEFAULT_DYN_READ_TIMEOUT; - } + status = conf_validate_document(cf); + if (status != DN_OK) { + return status; + } - if (cp->dyn_write_timeout == CONF_UNSET_NUM) { - cp->dyn_write_timeout = CONF_DEFAULT_DYN_WRITE_TIMEOUT; - } + status = conf_validate_tokens(cf); + if (status != DN_OK) { + return status; + } - if (cp->dyn_connections == CONF_UNSET_NUM) { - cp->dyn_connections = CONF_DEFAULT_DYN_CONNECTIONS; - } else if (cp->dyn_connections == 0) { - log_error("conf: directive \"dyn_connections:\" cannot be 0"); - return DN_ERROR; - } + status = conf_validate_structure(cf); + if (status != DN_OK) { + return status; + } - if (cp->gos_interval == CONF_UNSET_NUM) { - cp->gos_interval = CONF_DEFAULT_GOS_INTERVAL; - } + cf->sound = 1; - if (cp->conn_msg_rate == CONF_UNSET_NUM) { - cp->conn_msg_rate = CONF_DEFAULT_CONN_MSG_RATE; - } + return DN_OK; +} - if (cp->mbuf_size == CONF_UNSET_NUM) { - log_debug(LOG_INFO,"setting mbuf_size to default value:%d", CONF_DEFAULT_MBUF_SIZE); - /* - * After backward compatibility is supported, enable this - * cp->mbuf_size = CONF_DEFAULT_MBUF_SIZE; - */ - } - else { - /* Validating mbuf_size correctness */ - if (cp->mbuf_size <= 0) { - log_stderr("mbuf_size: requires a positive number"); - return DN_ERROR; - } - - if (cp->mbuf_size < CONF_DEFAULT_MBUF_MIN_SIZE || cp->mbuf_size > CONF_DEFAULT_MBUF_MAX_SIZE) { - log_stderr("mbuf_size: mbuf chunk size must be between %zu and" - " %zu bytes", CONF_DEFAULT_MBUF_MIN_SIZE, CONF_DEFAULT_MBUF_MAX_SIZE); - return DN_ERROR; - } - - if ((cp->mbuf_size / 16) * 16 != cp->mbuf_size) { - log_stderr("mbuf_size: mbuf size must be a multiple of 16"); - return DN_ERROR; - } - } +static rstatus_t 
conf_validate_server(struct conf *cf, struct conf_pool *cp) { + if (cp->conf_datastore == NULL) { + log_error("conf: pool '%.*s' has no datastores", cp->name.len, + cp->name.data); + return DN_ERROR; + } - if (cp->alloc_msgs_max == CONF_UNSET_NUM) { - log_debug(LOG_INFO,"setting max_msgs to default value:%d",CONF_DEFAULT_MAX_ALLOC_MSGS); - /* - * After backward compatibility is supported, enable this - * cp->alloc_msgs_max = CONF_DEFAULT_MAX_ALLOC_MSGS; - */ + return DN_OK; +} - } - else { - if (cp->alloc_msgs_max <= 0) { - log_stderr("dynomite: option -M requires a non-zero number"); - return DN_ERROR; - } +/* Validate pool config and set defaults. */ +static rstatus_t conf_validate_pool(struct conf *cf, struct conf_pool *cp) { + rstatus_t status; - if (cp->alloc_msgs_max < CONF_DEFAULT_MIN_ALLOC_MSGS || cp->alloc_msgs_max > CONF_DEFAULT_MAX_ALLOC_MSGS) { - log_stderr("max_msgs: max allocated messages buffer must be between %zu and" - " %zu messages", CONF_DEFAULT_MIN_ALLOC_MSGS, CONF_DEFAULT_MAX_ALLOC_MSGS); - return DN_ERROR; - } - } + ASSERT(!cp->valid); + ASSERT(!string_empty(&cp->name)); - if (string_empty(&cp->rack)) { - string_copy_c(&cp->rack, (const uint8_t *)CONF_DEFAULT_RACK); - log_debug(LOG_INFO, "setting rack to default value:%s", CONF_DEFAULT_RACK); - } + if (!cp->listen.valid) { + log_error("conf: directive \"listen:\" is missing"); + return DN_ERROR; + } - if (string_empty(&cp->dc)) { - string_copy_c(&cp->dc, (const uint8_t *)CONF_DEFAULT_DC); - log_debug(LOG_INFO, "setting dc to default value:%s", CONF_DEFAULT_DC); - } + /* set default values for unset directives */ - if (string_empty(&cp->secure_server_option)) { - string_copy_c(&cp->secure_server_option, - (const uint8_t *)CONF_DEFAULT_SECURE_SERVER_OPTION); - log_debug(LOG_INFO, "setting secure_server_option to default value:%s", - CONF_DEFAULT_SECURE_SERVER_OPTION); - } + if (string_empty(&cp->dyn_seed_provider)) { + string_copy_c(&cp->dyn_seed_provider, + (const uint8_t 
*)CONF_DEFAULT_SEED_PROVIDER); + } - if (string_empty(&cp->read_consistency)) { - string_copy_c(&cp->read_consistency, - (const uint8_t *)CONF_STR_DC_ONE); - log_debug(LOG_INFO, "setting read_consistency to default value:%s", - CONF_STR_DC_ONE); - } + if (cp->hash == CONF_UNSET_HASH) { + cp->hash = CONF_DEFAULT_HASH; + } - if (string_empty(&cp->write_consistency)) { - string_copy_c(&cp->write_consistency, - (const uint8_t *)CONF_STR_DC_ONE); - log_debug(LOG_INFO, "setting write_consistency to default value:%s", - CONF_STR_DC_ONE); - } + if (cp->timeout == CONF_UNSET_NUM) { + cp->timeout = CONF_DEFAULT_TIMEOUT; + } - if (cp->stats_interval == CONF_UNSET_NUM) { - log_debug(LOG_INFO,"setting stats_interval to default value:%d",CONF_DEFAULT_STATS_INTERVAL_MS); - cp->stats_interval = CONF_DEFAULT_STATS_INTERVAL_MS; - } + if (cp->backlog == CONF_UNSET_NUM) { + cp->backlog = CONF_DEFAULT_LISTEN_BACKLOG; + } - if (!cp->stats_listen.valid) { - log_error("conf: directive \"stats_listen:\" is missing - using defaults %s", - CONF_DEFAULT_STATS_PNAME, CONF_DEFAULT_STATS_PORT); - cp->stats_listen.port=CONF_DEFAULT_STATS_PORT; - string_copy_c(&cp->stats_listen.pname, - (const uint8_t *)CONF_DEFAULT_STATS_PNAME); - } + cp->client_connections = CONF_DEFAULT_CLIENT_CONNECTIONS; - if (dn_strcmp(cp->secure_server_option.data, CONF_SECURE_OPTION_NONE) && - dn_strcmp(cp->secure_server_option.data, CONF_SECURE_OPTION_RACK) && - dn_strcmp(cp->secure_server_option.data, CONF_SECURE_OPTION_DC) && - dn_strcmp(cp->secure_server_option.data, CONF_SECURE_OPTION_ALL)) - { - log_error("conf: directive \"secure_server_option:\"must be one of 'none' 'rack' 'datacenter' 'all'"); - } + if (cp->data_store == CONF_UNSET_NUM) { + cp->data_store = CONF_DEFAULT_DATASTORE; + } - if (!dn_strcasecmp(cp->read_consistency.data, CONF_STR_DC_ONE)) - g_read_consistency = DC_ONE; - else if (!dn_strcasecmp(cp->read_consistency.data, CONF_STR_DC_SAFE_QUORUM)) - g_read_consistency = DC_SAFE_QUORUM; - else if 
(!dn_strcasecmp(cp->read_consistency.data, CONF_STR_DC_QUORUM)) - g_read_consistency = DC_QUORUM; - else { - log_error("conf: directive \"read_consistency:\"must be one of 'DC_ONE' 'DC_QUORUM' 'DC_SAFE_QUORUM'"); - return DN_ERROR; - } + if (cp->preconnect == CONF_UNSET_NUM) { + cp->preconnect = CONF_DEFAULT_PRECONNECT; + } - if (!dn_strcasecmp(cp->write_consistency.data, CONF_STR_DC_ONE)) - g_write_consistency = DC_ONE; - else if (!dn_strcasecmp(cp->write_consistency.data, CONF_STR_DC_SAFE_QUORUM)) - g_write_consistency = DC_SAFE_QUORUM; - else if (!dn_strcasecmp(cp->write_consistency.data, CONF_STR_DC_QUORUM)) - g_write_consistency = DC_QUORUM; - else { - log_error("conf: directive \"write_consistency:\"must be one of 'DC_ONE' 'DC_QUORUM' 'DC_SAFE_QUORUM'"); - return DN_ERROR; - } + if (cp->auto_eject_hosts == CONF_UNSET_NUM) { + cp->auto_eject_hosts = CONF_DEFAULT_AUTO_EJECT_HOSTS; + } - if (string_empty(&cp->env)) { - string_copy_c(&cp->env, (const uint8_t *)CONF_DEFAULT_ENV); - log_debug(LOG_INFO, "setting env to default value:%s", CONF_DEFAULT_ENV); - } + if (cp->server_retry_timeout_ms == CONF_UNSET_NUM) { + cp->server_retry_timeout_ms = CONF_DEFAULT_SERVER_RETRY_TIMEOUT; + } - if (string_empty(&cp->pem_key_file)) { - string_copy_c(&cp->pem_key_file, (const uint8_t *)PEM_KEY_FILE); - log_debug(LOG_INFO, "setting pem key file to default value:%s", PEM_KEY_FILE); - } + if (cp->server_failure_limit == CONF_UNSET_NUM) { + cp->server_failure_limit = CONF_DEFAULT_SERVER_FAILURE_LIMIT; + } - if (string_empty(&cp->recon_key_file)) { - string_copy_c(&cp->recon_key_file, (const uint8_t *)RECON_KEY_FILE); - log_debug(LOG_INFO, "setting reconciliation key file to default value:%s", RECON_KEY_FILE); - } + if (cp->dyn_read_timeout == CONF_UNSET_NUM) { + cp->dyn_read_timeout = CONF_DEFAULT_DYN_READ_TIMEOUT; + } - if (string_empty(&cp->recon_iv_file)) { - string_copy_c(&cp->recon_iv_file, (const uint8_t *)RECON_IV_FILE); - log_debug(LOG_INFO, "setting reconciliation IV file 
to default value:%s", RECON_IV_FILE); - } + if (cp->dyn_write_timeout == CONF_UNSET_NUM) { + cp->dyn_write_timeout = CONF_DEFAULT_DYN_WRITE_TIMEOUT; + } - if (cp->datastore_connections == CONF_UNSET_NUM) { - cp->datastore_connections = CONF_DEFAULT_CONNECTIONS; - } + if (cp->dyn_connections == CONF_UNSET_NUM) { + cp->dyn_connections = CONF_DEFAULT_DYN_CONNECTIONS; + } else if (cp->dyn_connections == 0) { + log_error("conf: directive \"dyn_connections:\" cannot be 0"); + return DN_ERROR; + } - if (cp->local_peer_connections == CONF_UNSET_NUM) { - cp->local_peer_connections = CONF_DEFAULT_CONNECTIONS; + if (cp->gos_interval == CONF_UNSET_NUM) { + cp->gos_interval = CONF_DEFAULT_GOS_INTERVAL; + } + + if (cp->conn_msg_rate == CONF_UNSET_NUM) { + cp->conn_msg_rate = CONF_DEFAULT_CONN_MSG_RATE; + } + + if (cp->mbuf_size == CONF_UNSET_NUM) { + log_debug(LOG_INFO, "setting mbuf_size to default value:%d", + CONF_DEFAULT_MBUF_SIZE); + /* + * After backward compatibility is supported, enable this + * cp->mbuf_size = CONF_DEFAULT_MBUF_SIZE; + */ + } else { + /* Validating mbuf_size correctness */ + if (cp->mbuf_size <= 0) { + log_stderr("mbuf_size: requires a positive number"); + return DN_ERROR; } - if (cp->remote_peer_connections == CONF_UNSET_NUM) { - cp->remote_peer_connections = CONF_DEFAULT_CONNECTIONS; + if (cp->mbuf_size < CONF_DEFAULT_MBUF_MIN_SIZE || + cp->mbuf_size > CONF_DEFAULT_MBUF_MAX_SIZE) { + log_stderr( + "mbuf_size: mbuf chunk size must be between %zu and" + " %zu bytes", + CONF_DEFAULT_MBUF_MIN_SIZE, CONF_DEFAULT_MBUF_MAX_SIZE); + return DN_ERROR; } - status = conf_validate_server(cf, cp); - if (status != DN_OK) { - return status; + if ((cp->mbuf_size / 16) * 16 != cp->mbuf_size) { + log_stderr("mbuf_size: mbuf size must be a multiple of 16"); + return DN_ERROR; } + } - cp->valid = 1; + if (cp->alloc_msgs_max == CONF_UNSET_NUM) { + log_debug(LOG_INFO, "setting max_msgs to default value:%d", + CONF_DEFAULT_MAX_ALLOC_MSGS); + /* + * After backward 
compatibility is supported, enable this + * cp->alloc_msgs_max = CONF_DEFAULT_MAX_ALLOC_MSGS; + */ - return DN_OK; + } else { + if (cp->alloc_msgs_max <= 0) { + log_stderr("dynomite: option -M requires a non-zero number"); + return DN_ERROR; + } + + if (cp->alloc_msgs_max < CONF_DEFAULT_MIN_ALLOC_MSGS || + cp->alloc_msgs_max > CONF_DEFAULT_MAX_ALLOC_MSGS) { + log_stderr( + "max_msgs: max allocated messages buffer must be between %zu and" + " %zu messages", + CONF_DEFAULT_MIN_ALLOC_MSGS, CONF_DEFAULT_MAX_ALLOC_MSGS); + return DN_ERROR; + } + } + + if (string_empty(&cp->rack)) { + string_copy_c(&cp->rack, (const uint8_t *)CONF_DEFAULT_RACK); + log_debug(LOG_INFO, "setting rack to default value:%s", CONF_DEFAULT_RACK); + } + + if (string_empty(&cp->dc)) { + string_copy_c(&cp->dc, (const uint8_t *)CONF_DEFAULT_DC); + log_debug(LOG_INFO, "setting dc to default value:%s", CONF_DEFAULT_DC); + } + + if (string_empty(&cp->secure_server_option)) { + string_copy_c(&cp->secure_server_option, + (const uint8_t *)CONF_DEFAULT_SECURE_SERVER_OPTION); + log_debug(LOG_INFO, "setting secure_server_option to default value:%s", + CONF_DEFAULT_SECURE_SERVER_OPTION); + } + + if (string_empty(&cp->read_consistency)) { + string_copy_c(&cp->read_consistency, (const uint8_t *)CONF_STR_DC_ONE); + log_debug(LOG_INFO, "setting read_consistency to default value:%s", + CONF_STR_DC_ONE); + } + + if (string_empty(&cp->write_consistency)) { + string_copy_c(&cp->write_consistency, (const uint8_t *)CONF_STR_DC_ONE); + log_debug(LOG_INFO, "setting write_consistency to default value:%s", + CONF_STR_DC_ONE); + } + + if (cp->stats_interval == CONF_UNSET_NUM) { + log_debug(LOG_INFO, "setting stats_interval to default value:%d", + CONF_DEFAULT_STATS_INTERVAL_MS); + cp->stats_interval = CONF_DEFAULT_STATS_INTERVAL_MS; + } + + if (!cp->stats_listen.valid) { + log_error( + "conf: directive \"stats_listen:\" is missing - using defaults %s", + CONF_DEFAULT_STATS_PNAME, CONF_DEFAULT_STATS_PORT); + 
cp->stats_listen.port = CONF_DEFAULT_STATS_PORT; + string_copy_c(&cp->stats_listen.pname, + (const uint8_t *)CONF_DEFAULT_STATS_PNAME); + } + + if (dn_strcmp(cp->secure_server_option.data, CONF_SECURE_OPTION_NONE) && + dn_strcmp(cp->secure_server_option.data, CONF_SECURE_OPTION_RACK) && + dn_strcmp(cp->secure_server_option.data, CONF_SECURE_OPTION_DC) && + dn_strcmp(cp->secure_server_option.data, CONF_SECURE_OPTION_ALL)) { + log_error( + "conf: directive \"secure_server_option:\"must be one of 'none' 'rack' " + "'datacenter' 'all'"); + } + + if (!dn_strcasecmp(cp->read_consistency.data, CONF_STR_DC_ONE)) + g_read_consistency = DC_ONE; + else if (!dn_strcasecmp(cp->read_consistency.data, CONF_STR_DC_SAFE_QUORUM)) + g_read_consistency = DC_SAFE_QUORUM; + else if (!dn_strcasecmp(cp->read_consistency.data, CONF_STR_DC_QUORUM)) + g_read_consistency = DC_QUORUM; + else { + log_error( + "conf: directive \"read_consistency:\"must be one of 'DC_ONE' " + "'DC_QUORUM' 'DC_SAFE_QUORUM'"); + return DN_ERROR; + } + + if (!dn_strcasecmp(cp->write_consistency.data, CONF_STR_DC_ONE)) + g_write_consistency = DC_ONE; + else if (!dn_strcasecmp(cp->write_consistency.data, CONF_STR_DC_SAFE_QUORUM)) + g_write_consistency = DC_SAFE_QUORUM; + else if (!dn_strcasecmp(cp->write_consistency.data, CONF_STR_DC_QUORUM)) + g_write_consistency = DC_QUORUM; + else { + log_error( + "conf: directive \"write_consistency:\"must be one of 'DC_ONE' " + "'DC_QUORUM' 'DC_SAFE_QUORUM'"); + return DN_ERROR; + } + + if (string_empty(&cp->env)) { + string_copy_c(&cp->env, (const uint8_t *)CONF_DEFAULT_ENV); + log_debug(LOG_INFO, "setting env to default value:%s", CONF_DEFAULT_ENV); + } + + if (string_empty(&cp->pem_key_file)) { + string_copy_c(&cp->pem_key_file, (const uint8_t *)PEM_KEY_FILE); + log_debug(LOG_INFO, "setting pem key file to default value:%s", + PEM_KEY_FILE); + } + + if (string_empty(&cp->recon_key_file)) { + string_copy_c(&cp->recon_key_file, (const uint8_t *)RECON_KEY_FILE); + 
log_debug(LOG_INFO, "setting reconciliation key file to default value:%s", + RECON_KEY_FILE); + } + + if (string_empty(&cp->recon_iv_file)) { + string_copy_c(&cp->recon_iv_file, (const uint8_t *)RECON_IV_FILE); + log_debug(LOG_INFO, "setting reconciliation IV file to default value:%s", + RECON_IV_FILE); + } + + if (cp->datastore_connections == CONF_UNSET_NUM) { + cp->datastore_connections = CONF_DEFAULT_CONNECTIONS; + } + + if (cp->local_peer_connections == CONF_UNSET_NUM) { + cp->local_peer_connections = CONF_DEFAULT_CONNECTIONS; + } + + if (cp->remote_peer_connections == CONF_UNSET_NUM) { + cp->remote_peer_connections = CONF_DEFAULT_CONNECTIONS; + } + + status = conf_validate_server(cf, cp); + if (status != DN_OK) { + return status; + } + + cp->valid = 1; + + return DN_OK; } -bool -is_secure(secure_server_option_t option, struct string *this_dc, struct string *this_rack, - struct string *that_dc, struct string *that_rack) -{ - // if dc then communication only between nodes in different dc is secured - if (option == SECURE_OPTION_DC) { - if (string_compare(this_dc, that_dc)) { - return true; - } - } - // if rack then communication only between nodes in different rack is secured. - // communication secured between nodes if they are in rack with same name across dcs. - else if (option == SECURE_OPTION_RACK) { - // if not same rack or dc - if (string_compare(this_rack, that_rack) - || string_compare(this_dc, that_dc)) { - return true; - } - } - // if all then all communication between nodes will be secured. 
- else if (option == SECURE_OPTION_ALL) { - return true; - } - return false; +bool is_secure(secure_server_option_t option, struct string *this_dc, + struct string *this_rack, struct string *that_dc, + struct string *that_rack) { + // if dc then communication only between nodes in different dc is secured + if (option == SECURE_OPTION_DC) { + if (string_compare(this_dc, that_dc)) { + return true; + } + } + // if rack then communication only between nodes in different rack is secured. + // communication secured between nodes if they are in rack with same name + // across dcs. + else if (option == SECURE_OPTION_RACK) { + // if not same rack or dc + if (string_compare(this_rack, that_rack) || + string_compare(this_dc, that_dc)) { + return true; + } + } + // if all then all communication between nodes will be secured. + else if (option == SECURE_OPTION_ALL) { + return true; + } + return false; } -static rstatus_t -conf_post_validate(struct conf *cf) -{ - ASSERT(cf->sound && cf->parsed); - ASSERT(!cf->valid); +static rstatus_t conf_post_validate(struct conf *cf) { + ASSERT(cf->sound && cf->parsed); + ASSERT(!cf->valid); - THROW_STATUS(conf_validate_pool(cf, &cf->pool)); - return DN_OK; + THROW_STATUS(conf_validate_pool(cf, &cf->pool)); + return DN_OK; } -struct conf * -conf_create(char *filename) -{ - rstatus_t status; - struct conf *cf; +struct conf *conf_create(char *filename) { + rstatus_t status; + struct conf *cf; - cf = conf_open(filename); - if (cf == NULL) { - return NULL; - } + cf = conf_open(filename); + if (cf == NULL) { + return NULL; + } - /* validate configuration file before parsing */ - status = conf_pre_validate(cf); - if (status != DN_OK) { - goto error; - } + /* validate configuration file before parsing */ + status = conf_pre_validate(cf); + if (status != DN_OK) { + goto error; + } - /* parse the configuration file */ - status = conf_parse(cf); - if (status != DN_OK) { - goto error; - } + /* parse the configuration file */ + status = conf_parse(cf); + 
if (status != DN_OK) { + goto error; + } - /* validate parsed configuration */ - status = conf_post_validate(cf); - if (status != DN_OK) { - goto error; - } + /* validate parsed configuration */ + status = conf_post_validate(cf); + if (status != DN_OK) { + goto error; + } - conf_dump(cf); + conf_dump(cf); - fclose(cf->fh); - cf->fh = NULL; + fclose(cf->fh); + cf->fh = NULL; - return cf; + return cf; error: - log_stderr("dynomite: configuration file '%s' syntax is invalid", filename); - fclose(cf->fh); - cf->fh = NULL; - conf_destroy(cf); - return NULL; + log_stderr("dynomite: configuration file '%s' syntax is invalid", filename); + fclose(cf->fh); + cf->fh = NULL; + conf_destroy(cf); + return NULL; } -void -conf_destroy(struct conf *cf) -{ - while (array_n(&cf->arg) != 0) { - conf_pop_scalar(cf); - } - array_deinit(&cf->arg); +void conf_destroy(struct conf *cf) { + while (array_n(&cf->arg) != 0) { + conf_pop_scalar(cf); + } + array_deinit(&cf->arg); - conf_pool_deinit(&cf->pool); - dn_free(cf); + conf_pool_deinit(&cf->pool); + dn_free(cf); } - diff --git a/src/dyn_conf.h b/src/dyn_conf.h index 34b038362..b82c1ab80 100644 --- a/src/dyn_conf.h +++ b/src/dyn_conf.h @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -27,125 +27,132 @@ * Set default configuration values, parse dynomite.yaml, and update the various * configuration structs including connections and server pool. 
*/ -#include +#ifndef _DYN_CONF_H_ +#define _DYN_CONF_H_ + #include #include +#include #include -#include "dyn_core.h" +#include "dyn_array.h" +#include "dyn_string.h" +#include "dyn_util.h" #include "hashkit/dyn_hashkit.h" +#define CONF_DEFAULT_PEERS 200 +#define CONF_DEFAULT_ENV "aws" +#define CONF_DEFAULT_CONN_MSG_RATE 50000 // conn msgs per sec -#ifndef _DYN_CONF_H_ -#define _DYN_CONF_H_ - -#define CONF_DEFAULT_PEERS 200 -#define CONF_DEFAULT_ENV "aws" -#define CONF_DEFAULT_CONN_MSG_RATE 50000 //conn msgs per sec +#define CONF_STR_DC_ONE "dc_one" +#define CONF_STR_DC_QUORUM "dc_quorum" +#define CONF_STR_DC_SAFE_QUORUM "dc_safe_quorum" -#define CONF_STR_DC_ONE "dc_one" -#define CONF_STR_DC_QUORUM "dc_quorum" -#define CONF_STR_DC_SAFE_QUORUM "dc_safe_quorum" +#define UNSET_NUM 0 -#define UNSET_NUM 0 +// Forward declarations +struct datastore; struct conf_listen { - struct string pname; /* listen: as "name:port" */ - struct string name; /* name */ - int port; /* port */ - struct sockinfo info; /* listen socket info */ - unsigned valid:1; /* valid? */ + struct string pname; /* listen: as "name:port" */ + struct string name; /* name */ + int port; /* port */ + struct sockinfo info; /* listen socket info */ + unsigned valid : 1; /* valid? */ }; /** \struct conf_server * Server configuration. */ struct conf_server { - struct string pname; /* server: as "name:port:weight" or "hostname:port:rack:dc:tokens" */ - struct string name; /* name if given or the hostname */ - int port; /* port */ - struct sockinfo info; /* connect socket info */ - struct array tokens; /* tokens for this server, empty for local server */ - struct string rack; /* peer node or server's rack */ - struct string dc; /* peer node's dc */ - unsigned valid:1; /* valid? 
*/ + struct string pname; /* server: as "name:port:weight" or + "hostname:port:rack:dc:tokens" */ + struct string name; /* name if given or the hostname */ + int port; /* port */ + struct sockinfo info; /* connect socket info */ + struct array tokens; /* tokens for this server, empty for local server */ + struct string rack; /* peer node or server's rack */ + struct string dc; /* peer node's dc */ + unsigned valid : 1; /* valid? */ }; /** \struct conf_pool * Connection pool configuration. */ struct conf_pool { - struct string name; /* pool name (root node) */ - struct conf_listen listen; /* listen: */ - hash_type_t hash; /* hash: */ - struct string hash_tag; /* hash_tag: */ - void *deprecated; /* Deprecated: distribution, server_connections */ - msec_t timeout; /* timeout: */ - int backlog; /* backlog: */ - int client_connections; /* client_connections: */ - int data_store; /* data_store: */ - int preconnect; /* preconnect: */ - int auto_eject_hosts; /* auto_eject_hosts: */ - msec_t server_retry_timeout_ms; /* server_retry_timeout: in msec */ - int server_failure_limit; /* server_failure_limit: */ - struct conf_server *conf_datastore; /* This is the underlying datastore */ - unsigned valid:1; /* valid? */ - struct conf_listen dyn_listen; /* dyn_listen */ - int dyn_read_timeout; /* inter dyn nodes' read timeout in ms */ - int dyn_write_timeout; /* inter dyn nodes' write timeout in ms */ - struct string dyn_seed_provider; /* seed provider */ - struct array dyn_seeds; /* seed nodes: conf_server array */ - int dyn_port; - int dyn_connections; /* dyn connections */ - struct string rack; /* this node's logical rack */ - struct array tokens; /* this node's token: dyn_token array */ - msec_t gos_interval; /* wake up interval in ms */ - - /* none | datacenter | rack | all in order of increasing number of connections. 
(default is datacenter) */ - struct string secure_server_option; - struct string read_consistency; - struct string write_consistency; - struct string pem_key_file; - struct string recon_key_file; /* file with Key encryption in reconciliation */ - struct string recon_iv_file; /* file with Initialization Vector encryption in reconciliation */ - struct string dc; /* this node's dc */ - struct string env; /* AWS, Google, network, ... */ - uint32_t conn_msg_rate; /* conn msg per sec */ - bool enable_gossip; /* enable/disable gossip */ - size_t mbuf_size; /* mbuf chunk size */ - size_t alloc_msgs_max; /* allocated messages buffer size */ - - /* stats info */ - msec_t stats_interval; /* stats aggregation interval */ - struct conf_listen stats_listen; /* stats_listen: socket info for stats */ - - /* connection pool details */ - uint8_t datastore_connections; - uint8_t local_peer_connections; - uint8_t remote_peer_connections; - + struct string name; /* pool name (root node) */ + struct conf_listen listen; /* listen: */ + hash_type_t hash; /* hash: */ + struct string hash_tag; /* hash_tag: */ + void *deprecated; /* Deprecated: distribution, server_connections */ + msec_t timeout; /* timeout: */ + int backlog; /* backlog: */ + int client_connections; /* client_connections: */ + int data_store; /* data_store: */ + int preconnect; /* preconnect: */ + int auto_eject_hosts; /* auto_eject_hosts: */ + msec_t server_retry_timeout_ms; /* server_retry_timeout: in msec */ + int server_failure_limit; /* server_failure_limit: */ + struct conf_server *conf_datastore; /* This is the underlying datastore */ + unsigned valid : 1; /* valid? 
*/ + struct conf_listen dyn_listen; /* dyn_listen */ + int dyn_read_timeout; /* inter dyn nodes' read timeout in ms */ + int dyn_write_timeout; /* inter dyn nodes' write timeout in ms */ + struct string dyn_seed_provider; /* seed provider */ + struct array dyn_seeds; /* seed nodes: conf_server array */ + int dyn_port; + int dyn_connections; /* dyn connections */ + struct string rack; /* this node's logical rack */ + struct array tokens; /* this node's token: dyn_token array */ + msec_t gos_interval; /* wake up interval in ms */ + + /* none | datacenter | rack | all in order of increasing number of + * connections. (default is datacenter) */ + struct string secure_server_option; + struct string read_consistency; + struct string write_consistency; + struct string pem_key_file; + struct string recon_key_file; /* file with Key encryption in reconciliation */ + struct string + recon_iv_file; /* file with Initialization Vector encryption in + reconciliation */ + struct string dc; /* this node's dc */ + struct string env; /* AWS, Google, network, ... */ + uint32_t conn_msg_rate; /* conn msg per sec */ + bool enable_gossip; /* enable/disable gossip */ + size_t mbuf_size; /* mbuf chunk size */ + size_t alloc_msgs_max; /* allocated messages buffer size */ + + /* stats info */ + msec_t stats_interval; /* stats aggregation interval */ + struct conf_listen stats_listen; /* stats_listen: socket info for stats */ + + /* connection pool details */ + uint8_t datastore_connections; + uint8_t local_peer_connections; + uint8_t remote_peer_connections; }; - struct conf { - char *fname; /* file name (ref in argv[]) */ - FILE *fh; /* file handle */ - struct array arg; /* string[] (parsed {key, value} pairs) */ - struct conf_pool pool; /* conf_pool[] (parsed pools) */ - uint32_t depth; /* parsed tree depth */ - yaml_parser_t parser; /* yaml parser */ - yaml_event_t event; /* yaml event */ - yaml_token_t token; /* yaml token */ - unsigned seq:1; /* sequence? 
*/ - unsigned valid_parser:1; /* valid parser? */ - unsigned valid_event:1; /* valid event? */ - unsigned valid_token:1; /* valid token? */ - unsigned sound:1; /* sound? */ - unsigned parsed:1; /* parsed? */ - unsigned valid:1; /* valid? */ + char *fname; /* file name (ref in argv[]) */ + FILE *fh; /* file handle */ + struct array arg; /* string[] (parsed {key, value} pairs) */ + struct conf_pool pool; /* conf_pool[] (parsed pools) */ + uint32_t depth; /* parsed tree depth */ + yaml_parser_t parser; /* yaml parser */ + yaml_event_t event; /* yaml event */ + yaml_token_t token; /* yaml token */ + unsigned seq : 1; /* sequence? */ + unsigned valid_parser : 1; /* valid parser? */ + unsigned valid_event : 1; /* valid event? */ + unsigned valid_token : 1; /* valid token? */ + unsigned sound : 1; /* sound? */ + unsigned parsed : 1; /* parsed? */ + unsigned valid : 1; /* valid? */ }; -#define null_command { null_string, NULL, 0 } +#define null_command \ + { null_string, NULL, 0 } struct conf *conf_create(char *filename); void conf_destroy(struct conf *cf); diff --git a/src/dyn_connection.c b/src/dyn_connection.c index 68e9270d0..c07ede7c9 100644 --- a/src/dyn_connection.c +++ b/src/dyn_connection.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -20,11 +20,10 @@ * limitations under the License. */ -#include "dyn_core.h" #include "dyn_connection_internal.h" +#include "dyn_core.h" #include "event/dyn_event.h" - /* * dyn_connection.[ch] * Connection (struct conn) @@ -49,11 +48,11 @@ * Request received over the client connection are forwarded to the server by * enqueuing the request in the chosen server's in_q. 
From the client's * perspective once the request is forwarded, it is outstanding and is tracked - * in the client's out_q (unless the request was tagged as !expect_datastore_reply). The server - * in turn picks up requests from its own in_q in fifo order and puts them on - * the wire. Once the request is outstanding on the wire, and a response is - * expected for it, the server keeps track of outstanding requests it in its - * own out_q. + * in the client's out_q (unless the request was tagged as + * !expect_datastore_reply). The server in turn picks up requests from its own + * in_q in fifo order and puts them on the wire. Once the request is outstanding + * on the wire, and a response is expected for it, the server keeps track of + * outstanding requests it in its own out_q. * * The server's out_q enables us to pair a request with a response while the * client's out_q enables us to pair request and response in the order in @@ -89,398 +88,349 @@ consistency_t g_read_consistency = DEFAULT_READ_CONSISTENCY; consistency_t g_write_consistency = DEFAULT_WRITE_CONSISTENCY; -bool -conn_is_req_first_in_outqueue(struct conn *conn, struct msg *req) -{ - struct msg *first_req_in_outqueue = TAILQ_FIRST(&conn->omsg_q); - return req == first_req_in_outqueue; +bool conn_is_req_first_in_outqueue(struct conn *conn, struct msg *req) { + struct msg *first_req_in_outqueue = TAILQ_FIRST(&conn->omsg_q); + return req == first_req_in_outqueue; } /* * Return the context associated with this connection. */ -struct context * -conn_to_ctx(struct conn *conn) -{ - struct datastore *server; - struct node *peer; - struct server_pool *pool; - switch(conn->type) { - case CONN_PROXY: - case CONN_CLIENT: - case CONN_DNODE_PEER_PROXY: - case CONN_DNODE_PEER_CLIENT: - pool = conn->owner; - break; - case CONN_SERVER: - server = conn->owner; - pool = server ? server->owner : NULL; - break; - case CONN_DNODE_PEER_SERVER: - peer = conn->owner; - pool = peer ? 
peer->owner : NULL; - break; - default: - return NULL; - } - - return pool ? pool->ctx : NULL; +struct context *conn_to_ctx(struct conn *conn) { + struct datastore *server; + struct node *peer; + struct server_pool *pool; + switch (conn->type) { + case CONN_PROXY: + case CONN_CLIENT: + case CONN_DNODE_PEER_PROXY: + case CONN_DNODE_PEER_CLIENT: + pool = conn->owner; + break; + case CONN_SERVER: + server = conn->owner; + pool = server ? server->owner : NULL; + break; + case CONN_DNODE_PEER_SERVER: + peer = conn->owner; + pool = peer ? peer->owner : NULL; + break; + default: + return NULL; + } + + return pool ? pool->ctx : NULL; } -inline void -conn_set_read_consistency(struct conn *conn, consistency_t cons) -{ - conn->read_consistency = cons; +inline void conn_set_read_consistency(struct conn *conn, consistency_t cons) { + conn->read_consistency = cons; } -inline consistency_t -conn_get_read_consistency(struct conn *conn) -{ - //return conn->read_consistency; - return g_read_consistency; +inline consistency_t conn_get_read_consistency(struct conn *conn) { + // return conn->read_consistency; + return g_read_consistency; } -inline void -conn_set_write_consistency(struct conn *conn, consistency_t cons) -{ - conn->write_consistency = cons; +inline void conn_set_write_consistency(struct conn *conn, consistency_t cons) { + conn->write_consistency = cons; } -inline consistency_t -conn_get_write_consistency(struct conn *conn) -{ - //return conn->write_consistency; - return g_write_consistency; +inline consistency_t conn_get_write_consistency(struct conn *conn) { + // return conn->write_consistency; + return g_write_consistency; } -rstatus_t -conn_event_del_conn(struct conn *conn) -{ - struct context *ctx = conn_to_ctx(conn); - _remove_from_ready_q(ctx, conn); - if (conn->sd != -1) - return event_del_conn(ctx->evb, conn); - return DN_OK; +rstatus_t conn_event_del_conn(struct conn *conn) { + struct context *ctx = conn_to_ctx(conn); + _remove_from_ready_q(ctx, conn); + if 
(conn->sd != -1) return event_del_conn(ctx->evb, conn); + return DN_OK; } -rstatus_t -conn_event_add_out(struct conn *conn) -{ - struct context *ctx = conn_to_ctx(conn); - _add_to_ready_q(ctx, conn); - return event_add_out(ctx->evb, conn); +rstatus_t conn_event_add_out(struct conn *conn) { + struct context *ctx = conn_to_ctx(conn); + _add_to_ready_q(ctx, conn); + return event_add_out(ctx->evb, conn); } -rstatus_t -conn_event_add_conn(struct conn *conn) -{ - struct context *ctx = conn_to_ctx(conn); - return event_add_conn(ctx->evb, conn); +rstatus_t conn_event_add_conn(struct conn *conn) { + struct context *ctx = conn_to_ctx(conn); + return event_add_conn(ctx->evb, conn); } -rstatus_t -conn_event_del_out(struct conn *conn) -{ - struct context *ctx = conn_to_ctx(conn); - _remove_from_ready_q(ctx, conn); - return event_del_out(ctx->evb, conn); +rstatus_t conn_event_del_out(struct conn *conn) { + struct context *ctx = conn_to_ctx(conn); + _remove_from_ready_q(ctx, conn); + return event_del_out(ctx->evb, conn); } -struct conn * -conn_get(void *owner, func_conn_init_t func_conn_init) -{ - struct conn *conn; +struct conn *conn_get(void *owner, func_conn_init_t func_conn_init) { + struct conn *conn; - conn = _conn_get(); - if (conn == NULL) { - return NULL; - } + conn = _conn_get(); + if (conn == NULL) { + return NULL; + } - /* connection handles the data store messages (redis, memcached or other) */ + /* connection handles the data store messages (redis, memcached or other) */ - func_conn_init(conn); - - conn_ref(conn, owner); + func_conn_init(conn); - log_debug(LOG_VVERB, "get conn %p %s", conn, _conn_get_type_string(conn)); + conn_ref(conn, owner); - return conn; -} + log_debug(LOG_VVERB, "get conn %p %s", conn, _conn_get_type_string(conn)); -void -conn_put(struct conn *conn) -{ - ASSERT(conn->sd < 0); - ASSERT(conn->owner == NULL); - log_debug(LOG_VVERB, "putting %s", print_obj(conn)); - _conn_put(conn); + return conn; } -void -conn_init(void) -{ - _conn_init(); +void 
conn_put(struct conn *conn) { + ASSERT(conn->sd < 0); + ASSERT(conn->owner == NULL); + log_debug(LOG_VVERB, "putting %s", print_obj(conn)); + _conn_put(conn); } -void -conn_deinit(void) -{ - _conn_deinit(); -} +void conn_init(void) { _conn_init(); } -rstatus_t -conn_listen(struct context *ctx, struct conn *p) -{ - rstatus_t status; - struct server_pool *pool = &ctx->pool; +void conn_deinit(void) { _conn_deinit(); } - ASSERT((p->type == CONN_PROXY) || - (p->type == CONN_DNODE_PEER_PROXY)); +rstatus_t conn_listen(struct context *ctx, struct conn *p) { + rstatus_t status; + struct server_pool *pool = &ctx->pool; - p->sd = socket(p->family, SOCK_STREAM, 0); - if (p->sd < 0) { - log_error("socket failed: %s", strerror(errno)); - return DN_ERROR; - } + ASSERT((p->type == CONN_PROXY) || (p->type == CONN_DNODE_PEER_PROXY)); - status = _conn_reuse(p); - if (status < 0) { - log_error("reuse of addr '%.*s' for listening on p %d failed: %s", - p->pname.len, p->pname.data, p->sd, - strerror(errno)); - return DN_ERROR; - } + p->sd = socket(p->family, SOCK_STREAM, 0); + if (p->sd < 0) { + log_error("socket failed: %s", strerror(errno)); + return DN_ERROR; + } - status = bind(p->sd, p->addr, p->addrlen); - if (status < 0) { - log_error("bind on p %d to addr '%.*s' failed: %s", p->sd, - p->pname.len, p->pname.data, strerror(errno)); - return DN_ERROR; - } + status = _conn_reuse(p); + if (status < 0) { + log_error("reuse of addr '%.*s' for listening on p %d failed: %s", + p->pname.len, p->pname.data, p->sd, strerror(errno)); + return DN_ERROR; + } - status = listen(p->sd, pool->backlog); - if (status < 0) { - log_error("listen on p %d on addr '%.*s' failed: %s", p->sd, - p->pname.len, p->pname.data, strerror(errno)); - return DN_ERROR; - } + status = bind(p->sd, p->addr, p->addrlen); + if (status < 0) { + log_error("bind on p %d to addr '%.*s' failed: %s", p->sd, p->pname.len, + p->pname.data, strerror(errno)); + return DN_ERROR; + } - status = dn_set_nonblocking(p->sd); - if 
(status < 0) { - log_error("set nonblock on p %d on addr '%.*s' failed: %s", p->sd, - p->pname.len, p->pname.data, strerror(errno)); - return DN_ERROR; - } + status = listen(p->sd, pool->backlog); + if (status < 0) { + log_error("listen on p %d on addr '%.*s' failed: %s", p->sd, p->pname.len, + p->pname.data, strerror(errno)); + return DN_ERROR; + } - status = conn_event_add_conn(p); - if (status < 0) { - log_error("event add conn p %d on addr '%.*s' failed: %s", - p->sd, p->pname.len, p->pname.data, - strerror(errno)); - return DN_ERROR; - } + status = dn_set_nonblocking(p->sd); + if (status < 0) { + log_error("set nonblock on p %d on addr '%.*s' failed: %s", p->sd, + p->pname.len, p->pname.data, strerror(errno)); + return DN_ERROR; + } - status = conn_event_del_out(p); - if (status < 0) { - log_error("event del out p %d on addr '%.*s' failed: %s", - p->sd, p->pname.len, p->pname.data, - strerror(errno)); - return DN_ERROR; - } + status = conn_event_add_conn(p); + if (status < 0) { + log_error("event add conn p %d on addr '%.*s' failed: %s", p->sd, + p->pname.len, p->pname.data, strerror(errno)); + return DN_ERROR; + } - return DN_OK; + status = conn_event_del_out(p); + if (status < 0) { + log_error("event del out p %d on addr '%.*s' failed: %s", p->sd, + p->pname.len, p->pname.data, strerror(errno)); + return DN_ERROR; + } + + return DN_OK; } -rstatus_t -conn_connect(struct context *ctx, struct conn *conn) -{ - rstatus_t status; +rstatus_t conn_connect(struct context *ctx, struct conn *conn) { + rstatus_t status; - // Outgoing connection to another Dynomite node and admin mode is disabled - if ((conn->type == CONN_DNODE_PEER_SERVER) && (ctx->admin_opt > 0)) - return DN_OK; + // Outgoing connection to another Dynomite node and admin mode is disabled + if ((conn->type == CONN_DNODE_PEER_SERVER) && (ctx->admin_opt > 0)) + return DN_OK; - // Only continue if the connection type is: - // 1. CONN_DNODE_PEER_SERVER: Outbound connection to another Dynomite node - // 2. 
CONN_SERVER: Outbound connection to backend datastore (Redis, ARDB) - ASSERT((conn->type == CONN_DNODE_PEER_SERVER) || - (conn->type == CONN_SERVER)); + // Only continue if the connection type is: + // 1. CONN_DNODE_PEER_SERVER: Outbound connection to another Dynomite node + // 2. CONN_SERVER: Outbound connection to backend datastore (Redis, ARDB) + ASSERT((conn->type == CONN_DNODE_PEER_SERVER) || (conn->type == CONN_SERVER)); - if (conn->sd > 0) { - /* already connected on peer connection */ - return DN_OK; + if (conn->sd > 0) { + /* already connected on peer connection */ + return DN_OK; + } + + conn->sd = socket(conn->family, SOCK_STREAM, 0); + if (conn->sd < 0) { + log_error("dyn: socket for '%.*s' failed: %s", conn->pname.len, + conn->pname.data, strerror(errno)); + status = DN_ERROR; + goto error; + } + log_warn("%s connecting.....", print_obj(conn)); + + status = dn_set_nonblocking(conn->sd); + if (status != DN_OK) { + log_error("set nonblock on s %d for '%.*s' failed: %s", conn->sd, + conn->pname.len, conn->pname.data, strerror(errno)); + goto error; + } + status = dn_set_keepalive(conn->sd, DYN_KEEPALIVE_INTERVAL_S); + if (status != DN_OK) { + log_error("set keepalive on s %d for '%.*s' failed: %s", conn->sd, + conn->pname.len, conn->pname.data, strerror(errno)); + // Continue since this is not catastrophic + } + + if (conn->pname.data[0] != '/') { + status = dn_set_tcpnodelay(conn->sd); + if (status != DN_OK) { + log_warn("set tcpnodelay on s %d for '%.*s' failed, ignored: %s", + conn->sd, conn->pname.len, conn->pname.data, strerror(errno)); } + } - conn->sd = socket(conn->family, SOCK_STREAM, 0); - if (conn->sd < 0) { - log_error("dyn: socket for '%.*s' failed: %s", conn->pname.len, - conn->pname.data, strerror(errno)); - status = DN_ERROR; - goto error; - } - log_warn("%s connecting.....", print_obj(conn)); + status = conn_event_add_conn(conn); + if (status != DN_OK) { + log_error("event add conn s %d for '%.*s' failed: %s", conn->sd, + conn->pname.len, 
conn->pname.data, strerror(errno)); + goto error; + } - status = dn_set_nonblocking(conn->sd); - if (status != DN_OK) { - log_error("set nonblock on s %d for '%.*s' failed: %s", - conn->sd, conn->pname.len, conn->pname.data, - strerror(errno)); - goto error; - } - status = dn_set_keepalive(conn->sd, DYN_KEEPALIVE_INTERVAL_S); - if (status != DN_OK) { - log_error("set keepalive on s %d for '%.*s' failed: %s", - conn->sd, conn->pname.len, conn->pname.data, - strerror(errno)); - // Continue since this is not catastrophic - } + ASSERT(!conn->connecting && !conn->connected); - if (conn->pname.data[0] != '/') { - status = dn_set_tcpnodelay(conn->sd); - if (status != DN_OK) { - log_warn("set tcpnodelay on s %d for '%.*s' failed, ignored: %s", - conn->sd, conn->pname.len, conn->pname.data, - strerror(errno)); - } - } + status = connect(conn->sd, conn->addr, conn->addrlen); - status = conn_event_add_conn(conn); - if (status != DN_OK) { - log_error("event add conn s %d for '%.*s' failed: %s", - conn->sd, conn->pname.len, conn->pname.data, - strerror(errno)); - goto error; + if (status != DN_OK) { + if (errno == EINPROGRESS) { + conn->connecting = 1; + return DN_OK; } - ASSERT(!conn->connecting && !conn->connected); + log_error("connect on s %d to '%.*s' failed: %s", conn->sd, conn->pname.len, + conn->pname.data, strerror(errno)); - status = connect(conn->sd, conn->addr, conn->addrlen); + goto error; + } - if (status != DN_OK) { - if (errno == EINPROGRESS) { - conn->connecting = 1; - return DN_OK; - } + ASSERT(!conn->connecting); + conn->connected = 1; + conn_pool_connected(conn->conn_pool, conn); + log_debug(LOG_WARN, "%s connected to '%.*s'", print_obj(conn), + conn->pname.len, conn->pname.data); - log_error("connect on s %d to '%.*s' failed: %s", conn->sd, - conn->pname.len, conn->pname.data, strerror(errno)); + return DN_OK; - goto error; - } +error: + conn->err = errno; + return status; +} - ASSERT(!conn->connecting); - conn->connected = 1; - 
conn_pool_connected(conn->conn_pool, conn); - log_debug(LOG_WARN, "%s connected to '%.*s'", print_obj(conn), - conn->pname.len, conn->pname.data); +ssize_t conn_recv_data(struct conn *conn, void *buf, size_t size) { + ssize_t n; - return DN_OK; + ASSERT(buf != NULL); + ASSERT(size > 0); + ASSERT(conn->recv_ready); - error: - conn->err = errno; - return status; -} + for (;;) { + n = dn_read(conn->sd, buf, size); + + log_debug(LOG_VERB, "%s recv %zd of %zu", print_obj(conn), n, size); -ssize_t -conn_recv_data(struct conn *conn, void *buf, size_t size) -{ - ssize_t n; - - ASSERT(buf != NULL); - ASSERT(size > 0); - ASSERT(conn->recv_ready); - - for (;;) { - n = dn_read(conn->sd, buf, size); - - log_debug(LOG_VERB, "%s recv %zd of %zu", print_obj(conn), n, size); - - if (n > 0) { - if (n < (ssize_t) size) { - conn->recv_ready = 0; - } - conn->recv_bytes += (size_t)n; - return n; - } - - if (n == 0) { - conn->recv_ready = 0; - conn->eof = 1; - log_debug(LOG_NOTICE, "%s recv eof rb %zu sb %zu", print_obj(conn), - conn->recv_bytes, conn->send_bytes); - return n; - } - - if (errno == EINTR) { - log_debug(LOG_VERB, "%s recv not ready - eintr", print_obj(conn)); - continue; - } else if (errno == EAGAIN || errno == EWOULDBLOCK) { - conn->recv_ready = 0; - log_debug(LOG_VERB, "%s recv not ready - eagain", print_obj(conn)); - return DN_EAGAIN; - } else { - conn->recv_ready = 0; - conn->err = errno; - log_error("%s recv failed: %s", print_obj(conn), strerror(errno)); - return DN_ERROR; - } + if (n > 0) { + if (n < (ssize_t)size) { + conn->recv_ready = 0; + } + conn->recv_bytes += (size_t)n; + return n; } - NOT_REACHED(); + if (n == 0) { + conn->recv_ready = 0; + conn->eof = 1; + log_debug(LOG_NOTICE, "%s recv eof rb %zu sb %zu", print_obj(conn), + conn->recv_bytes, conn->send_bytes); + return n; + } - return DN_ERROR; + if (errno == EINTR) { + log_debug(LOG_VERB, "%s recv not ready - eintr", print_obj(conn)); + continue; + } else if (errno == EAGAIN || errno == EWOULDBLOCK) { + 
conn->recv_ready = 0; + log_debug(LOG_VERB, "%s recv not ready - eagain", print_obj(conn)); + return DN_EAGAIN; + } else { + conn->recv_ready = 0; + conn->err = errno; + log_error("%s recv failed: %s", print_obj(conn), strerror(errno)); + return DN_ERROR; + } + } + + NOT_REACHED(); + + return DN_ERROR; } -ssize_t -conn_sendv_data(struct conn *conn, struct array *sendv, size_t nsend) -{ - ssize_t n; - - ASSERT(array_n(sendv) > 0); - ASSERT(nsend != 0); - ASSERT(conn->send_ready); - - for (;;) { - n = dn_writev(conn->sd, sendv->elem, sendv->nelem); - - log_debug(LOG_VERB, "sendv on sd %d %zd of %zu in %"PRIu32" buffers", - conn->sd, n, nsend, sendv->nelem); - - if (n > 0) { - if (n < (ssize_t) nsend) { - conn->send_ready = 0; - } - conn->send_bytes += (size_t)n; - //conn->non_bytes_send = 0; - return n; - } - - if (n == 0) { - log_warn("sendv on sd %d returned zero", conn->sd); - conn->send_ready = 0; - //conn->non_bytes_send++; - //if (conn->dyn_mode && conn->non_bytes_send > MAX_CONN_ALLOWABLE_NON_SEND) { - // conn->err = ENOTRECOVERABLE; - //} - return 0; - } - - if (errno == EINTR) { - log_debug(LOG_VERB, "sendv on sd %d not ready - eintr", conn->sd); - continue; - } else if (errno == EAGAIN || errno == EWOULDBLOCK) { - conn->send_ready = 0; - log_debug(LOG_VERB, "sendv on sd %d not ready - eagain", conn->sd); - return DN_EAGAIN; - } else { - conn->send_ready = 0; - conn->err = errno; - log_error("sendv on sd %d failed: %s", conn->sd, strerror(errno)); - return DN_ERROR; - } +ssize_t conn_sendv_data(struct conn *conn, struct array *sendv, size_t nsend) { + ssize_t n; + + ASSERT(array_n(sendv) > 0); + ASSERT(nsend != 0); + ASSERT(conn->send_ready); + + for (;;) { + n = dn_writev(conn->sd, sendv->elem, sendv->nelem); + + log_debug(LOG_VERB, "sendv on sd %d %zd of %zu in %" PRIu32 " buffers", + conn->sd, n, nsend, sendv->nelem); + + if (n > 0) { + if (n < (ssize_t)nsend) { + conn->send_ready = 0; + } + conn->send_bytes += (size_t)n; + // conn->non_bytes_send = 0; + 
return n; } - NOT_REACHED(); + if (n == 0) { + log_warn("sendv on sd %d returned zero", conn->sd); + conn->send_ready = 0; + // conn->non_bytes_send++; + // if (conn->dyn_mode && conn->non_bytes_send > + // MAX_CONN_ALLOWABLE_NON_SEND) { + // conn->err = ENOTRECOVERABLE; + //} + return 0; + } - return DN_ERROR; + if (errno == EINTR) { + log_debug(LOG_VERB, "sendv on sd %d not ready - eintr", conn->sd); + continue; + } else if (errno == EAGAIN || errno == EWOULDBLOCK) { + conn->send_ready = 0; + log_debug(LOG_VERB, "sendv on sd %d not ready - eagain", conn->sd); + return DN_EAGAIN; + } else { + conn->send_ready = 0; + conn->err = errno; + log_error("sendv on sd %d failed: %s", conn->sd, strerror(errno)); + return DN_ERROR; + } + } + + NOT_REACHED(); + + return DN_ERROR; } diff --git a/src/dyn_connection.h b/src/dyn_connection.h index c6253388c..a34ee8c3f 100644 --- a/src/dyn_connection.h +++ b/src/dyn_connection.h @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -20,7 +20,6 @@ * limitations under the License. */ - /** * In twemproxy there are 3 types of connections: * PROXY - listens for client connections (default: 8102) @@ -28,26 +27,33 @@ * SERVER - outgoing connection to the underlying data store. * * Dynomite extended this same concept and added 3 other types of connections - * DNODE_PEER_PROXY - listens to connections from other dynomite node (default 8101) - * DNODE_PEER_CLIENT - incoming connection from other dnode + * DNODE_PEER_PROXY - listens to connections from other dynomite node (default + * 8101) DNODE_PEER_CLIENT - incoming connection from other dnode * DNODE_PEER_SERVER - outgoing connection to other dnode. 
* */ - + #ifndef _DYN_CONNECTION_H_ #define _DYN_CONNECTION_H_ -#include "dyn_core.h" -#define MAX_CONN_QUEUE_SIZE 20000 -#define MAX_CONN_ALLOWABLE_NON_RECV 5 -#define MAX_CONN_ALLOWABLE_NON_SEND 5 +#include + +#include "dyn_message.h" +#include "dyn_queue.h" +#include "dyn_string.h" +#include "dyn_types.h" + +#define MAX_CONN_QUEUE_SIZE 20000 +#define MAX_CONN_ALLOWABLE_NON_RECV 5 +#define MAX_CONN_ALLOWABLE_NON_SEND 5 -typedef rstatus_t (*func_recv_t)(struct context *, struct conn*); -typedef struct msg* (*func_recv_next_t)(struct context *, struct conn *, bool); -typedef void (*func_recv_done_t)(struct context *, struct conn *, struct msg *, struct msg *); +typedef rstatus_t (*func_recv_t)(struct context *, struct conn *); +typedef struct msg *(*func_recv_next_t)(struct context *, struct conn *, bool); +typedef void (*func_recv_done_t)(struct context *, struct conn *, struct msg *, + struct msg *); -typedef rstatus_t (*func_send_t)(struct context *, struct conn*); -typedef struct msg* (*func_send_next_t)(struct context *, struct conn *); +typedef rstatus_t (*func_send_t)(struct context *, struct conn *); +typedef struct msg *(*func_send_next_t)(struct context *, struct conn *); typedef void (*func_send_done_t)(struct context *, struct conn *, struct msg *); typedef void (*func_close_t)(struct context *, struct conn *); @@ -62,127 +68,121 @@ typedef rstatus_t (*func_response_handler)(struct conn *, msgid_t reqid, struct conn_pool; struct conn_ops { - func_recv_t recv; /* recv (read) handler */ - func_recv_next_t recv_next; /* recv next message handler */ - func_recv_done_t recv_done; /* read done handler */ - func_send_t send; /* send (write) handler */ - func_send_next_t send_next; /* write next message handler */ - func_send_done_t send_done; /* write done handler */ - func_close_t close; /* close handler */ - func_active_t active; /* active? 
handler */ - - func_ref_t ref; /* connection reference handler */ - func_unref_t unref; /* connection unreference handler */ - - func_msgq_t enqueue_inq; /* connection inq msg enqueue handler */ - func_msgq_t dequeue_inq; /* connection inq msg dequeue handler */ - func_msgq_t enqueue_outq; /* connection outq msg enqueue handler */ - func_msgq_t dequeue_outq; /* connection outq msg dequeue handler */ - func_response_handler rsp_handler; + func_recv_t recv; /* recv (read) handler */ + func_recv_next_t recv_next; /* recv next message handler */ + func_recv_done_t recv_done; /* read done handler */ + func_send_t send; /* send (write) handler */ + func_send_next_t send_next; /* write next message handler */ + func_send_done_t send_done; /* write done handler */ + func_close_t close; /* close handler */ + func_active_t active; /* active? handler */ + + func_ref_t ref; /* connection reference handler */ + func_unref_t unref; /* connection unreference handler */ + + func_msgq_t enqueue_inq; /* connection inq msg enqueue handler */ + func_msgq_t dequeue_inq; /* connection inq msg dequeue handler */ + func_msgq_t enqueue_outq; /* connection outq msg enqueue handler */ + func_msgq_t dequeue_outq; /* connection outq msg dequeue handler */ + func_response_handler rsp_handler; }; typedef enum connection_type { - CONN_UNSPECIFIED, - CONN_PROXY, // a dynomite proxy (listening) connection - CONN_CLIENT, // this is connected to a client connection - CONN_SERVER, // this is connected to underlying datastore ...redis/memcache - CONN_DNODE_PEER_PROXY, // this is a dnode (listening) connection...default 8101 - CONN_DNODE_PEER_CLIENT, // this is connected to a dnode peer client - CONN_DNODE_PEER_SERVER, // this is connected to a dnode peer server + CONN_UNSPECIFIED, + CONN_PROXY, // a dynomite proxy (listening) connection + CONN_CLIENT, // this is connected to a client connection + CONN_SERVER, // this is connected to underlying datastore ...redis/memcache + CONN_DNODE_PEER_PROXY, // 
this is a dnode (listening) connection...default + // 8101 + CONN_DNODE_PEER_CLIENT, // this is connected to a dnode peer client + CONN_DNODE_PEER_SERVER, // this is connected to a dnode peer server } connection_type_t; struct conn { - object_t object; - TAILQ_ENTRY(conn) conn_tqe; /* link in server_pool / server / free q */ - TAILQ_ENTRY(conn) ready_tqe; /* link in ready connection q */ - void *owner; /* connection owner - server_pool / server */ - struct conn_pool *conn_pool; - - int sd; /* socket descriptor */ - struct string pname; - int family; /* socket address family */ - socklen_t addrlen; /* socket length */ - struct sockaddr *addr; /* socket address (ref in server or server_pool) */ - - struct msg_tqh imsg_q; /* incoming request Q */ - struct msg_tqh omsg_q; /* outstanding request Q */ - - struct msg *rmsg; /* current message being rcvd */ - struct msg *smsg; /* current message being sent */ - - struct conn_ops *ops; - size_t recv_bytes; /* received (read) bytes */ - size_t send_bytes; /* sent (written) bytes */ - - uint32_t events; /* connection io events */ - err_t err; /* connection errno */ - unsigned recv_active:1; /* recv active? */ - unsigned recv_ready:1; /* recv ready? */ - unsigned send_active:1; /* send active? */ - unsigned send_ready:1; /* send ready? */ - - unsigned connecting:1; /* connecting? */ - unsigned connected:1; /* connected? */ - unsigned eof:1; /* eof? aka passive close? */ - unsigned waiting_to_unref:1; /* eof? aka passive close? */ - unsigned done:1; /* done? aka close? */ - unsigned dyn_mode:1; /* is a dyn connection? */ - unsigned dnode_secured:1; /* is a secured connection? 
*/ - unsigned crypto_key_sent:1; /* crypto state */ - unsigned char aes_key[50]; //aes_key[34]; /* a place holder for AES key */ - unsigned same_dc:1; /* bit to indicate whether a peer conn is same DC */ - uint32_t avail_tokens; /* used to throttle the traffics */ - uint32_t last_sent; /* ts in sec used to determine the last sent time */ - //uint32_t non_bytes_send; /* #times or epoll triggers that we are not able to send any bytes */ - consistency_t read_consistency; - consistency_t write_consistency; - dict *outstanding_msgs_dict; - connection_type_t type; + object_t object; + TAILQ_ENTRY(conn) conn_tqe; /* link in server_pool / server / free q */ + TAILQ_ENTRY(conn) ready_tqe; /* link in ready connection q */ + void *owner; /* connection owner - server_pool / server */ + struct conn_pool *conn_pool; + + int sd; /* socket descriptor */ + struct string pname; + int family; /* socket address family */ + socklen_t addrlen; /* socket length */ + struct sockaddr *addr; /* socket address (ref in server or server_pool) */ + + struct msg_tqh imsg_q; /* incoming request Q */ + struct msg_tqh omsg_q; /* outstanding request Q */ + + struct msg *rmsg; /* current message being rcvd */ + struct msg *smsg; /* current message being sent */ + + struct conn_ops *ops; + size_t recv_bytes; /* received (read) bytes */ + size_t send_bytes; /* sent (written) bytes */ + + uint32_t events; /* connection io events */ + err_t err; /* connection errno */ + unsigned recv_active : 1; /* recv active? */ + unsigned recv_ready : 1; /* recv ready? */ + unsigned send_active : 1; /* send active? */ + unsigned send_ready : 1; /* send ready? */ + + unsigned connecting : 1; /* connecting? */ + unsigned connected : 1; /* connected? */ + unsigned eof : 1; /* eof? aka passive close? */ + unsigned waiting_to_unref : 1; /* eof? aka passive close? */ + unsigned done : 1; /* done? aka close? */ + unsigned dyn_mode : 1; /* is a dyn connection? */ + unsigned dnode_secured : 1; /* is a secured connection? 
*/ + unsigned crypto_key_sent : 1; /* crypto state */ + unsigned char aes_key[50]; // aes_key[34]; /* a place holder for + // AES key */ + unsigned same_dc : 1; /* bit to indicate whether a peer conn is same DC */ + uint32_t avail_tokens; /* used to throttle the traffics */ + uint32_t last_sent; /* ts in sec used to determine the last sent time */ + // uint32_t non_bytes_send; /* #times or epoll triggers that + // we are not able to send any bytes */ + consistency_t read_consistency; + consistency_t write_consistency; + dict *outstanding_msgs_dict; + connection_type_t type; }; -static inline rstatus_t -conn_cant_handle_response(struct conn *conn, msgid_t reqid, struct msg *resp) -{ - return DN_ENO_IMPL; +static inline rstatus_t conn_cant_handle_response(struct conn *conn, + msgid_t reqid, + struct msg *resp) { + return DN_ENO_IMPL; } -static inline rstatus_t -conn_handle_response(struct conn *conn, msgid_t msgid, struct msg *rsp) -{ - return conn->ops->rsp_handler(conn, msgid, rsp); +static inline rstatus_t conn_handle_response(struct conn *conn, msgid_t msgid, + struct msg *rsp) { + return conn->ops->rsp_handler(conn, msgid, rsp); } -#define conn_recv(ctx, conn) \ - (conn)->ops->recv(ctx, conn) -#define conn_recv_next(ctx, conn, alloc) \ - (conn)->ops->recv_next(ctx, conn, alloc) -#define conn_recv_done(ctx, conn, msg, nmsg) \ - (conn)->ops->recv_done(ctx, conn, msg, nmsg) - -#define conn_send(ctx, conn) \ - (conn)->ops->send(ctx, conn) -#define conn_send_next(ctx, conn) \ - (conn)->ops->send_next(ctx, conn) -#define conn_send_done(ctx, conn, msg) \ - (conn)->ops->send_done(ctx, conn, msg) - -#define conn_close(ctx, conn) \ - (conn)->ops->close(ctx, conn) -#define conn_active(conn) \ - (conn)->ops->active(conn) -#define conn_ref(conn, owner) \ - (conn)->ops->ref(conn, owner) -#define conn_unref(conn) \ - (conn)->ops->unref(conn) - -#define conn_enqueue_inq(ctx, conn, msg) \ - (conn)->ops->enqueue_inq(ctx, conn, msg) -#define conn_dequeue_inq(ctx, conn, msg) \ - 
(conn)->ops->dequeue_inq(ctx, conn, msg) -#define conn_enqueue_outq(ctx, conn, msg) \ - (conn)->ops->enqueue_outq(ctx, conn, msg) -#define conn_dequeue_outq(ctx, conn, msg) \ - (conn)->ops->dequeue_outq(ctx, conn, msg) +#define conn_recv(ctx, conn) (conn)->ops->recv(ctx, conn) +#define conn_recv_next(ctx, conn, alloc) \ + (conn)->ops->recv_next(ctx, conn, alloc) +#define conn_recv_done(ctx, conn, msg, nmsg) \ + (conn)->ops->recv_done(ctx, conn, msg, nmsg) + +#define conn_send(ctx, conn) (conn)->ops->send(ctx, conn) +#define conn_send_next(ctx, conn) (conn)->ops->send_next(ctx, conn) +#define conn_send_done(ctx, conn, msg) (conn)->ops->send_done(ctx, conn, msg) + +#define conn_close(ctx, conn) (conn)->ops->close(ctx, conn) +#define conn_active(conn) (conn)->ops->active(conn) +#define conn_ref(conn, owner) (conn)->ops->ref(conn, owner) +#define conn_unref(conn) (conn)->ops->unref(conn) + +#define conn_enqueue_inq(ctx, conn, msg) \ + (conn)->ops->enqueue_inq(ctx, conn, msg) +#define conn_dequeue_inq(ctx, conn, msg) \ + (conn)->ops->dequeue_inq(ctx, conn, msg) +#define conn_enqueue_outq(ctx, conn, msg) \ + (conn)->ops->enqueue_outq(ctx, conn, msg) +#define conn_dequeue_outq(ctx, conn, msg) \ + (conn)->ops->dequeue_outq(ctx, conn, msg) TAILQ_HEAD(conn_tqh, conn); void conn_set_write_consistency(struct conn *conn, consistency_t cons); @@ -212,8 +212,8 @@ void conn_init(void); void conn_deinit(void); bool conn_is_req_first_in_outqueue(struct conn *conn, struct msg *req); -rstatus_t conn_event_add_conn(struct conn * conn); -rstatus_t conn_event_add_out(struct conn * conn); -rstatus_t conn_event_del_conn(struct conn * conn); -rstatus_t conn_event_del_out(struct conn * conn); +rstatus_t conn_event_add_conn(struct conn *conn); +rstatus_t conn_event_add_out(struct conn *conn); +rstatus_t conn_event_del_conn(struct conn *conn); +rstatus_t conn_event_del_out(struct conn *conn); #endif diff --git a/src/dyn_connection_internal.c b/src/dyn_connection_internal.c index 
00f4d6ca3..1bea4b738 100644 --- a/src/dyn_connection_internal.c +++ b/src/dyn_connection_internal.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * Licensed under the Apache License, Version 2.0 (the "License"); @@ -19,232 +19,217 @@ #include "dyn_connection_internal.h" #include "dyn_connection_pool.h" +#include "dyn_core.h" +#include "dyn_setting.h" +#include "dyn_util.h" #include "event/dyn_event.h" static uint32_t nfree_connq; /* # free conn q */ static struct conn_tqh free_connq; /* free conn q */ -inline char * -_conn_get_type_string(struct conn *conn) -{ - switch(conn->type) { - case CONN_UNSPECIFIED: return "UNSPEC"; - case CONN_PROXY : return "CONN_PROXY"; - case CONN_CLIENT: return "CONN_CLIENT"; - case CONN_SERVER: return "CONN_SERVER"; - case CONN_DNODE_PEER_PROXY: return "CONN_PEER_PROXY"; - case CONN_DNODE_PEER_CLIENT: return conn->same_dc ? - "CONN_LOCAL_PEER_CLIENT" : "CONN_REMOTE_PEER_CLIENT"; - case CONN_DNODE_PEER_SERVER: return conn->same_dc ? - "CONN_LOCAL_PEER_SERVER" : "CONN_REMOTE_PEER_SERVER"; - } - return "INVALID"; +inline char *_conn_get_type_string(struct conn *conn) { + switch (conn->type) { + case CONN_UNSPECIFIED: + return "UNSPEC"; + case CONN_PROXY: + return "CONN_PROXY"; + case CONN_CLIENT: + return "CONN_CLIENT"; + case CONN_SERVER: + return "CONN_SERVER"; + case CONN_DNODE_PEER_PROXY: + return "CONN_PEER_PROXY"; + case CONN_DNODE_PEER_CLIENT: + return conn->same_dc ? "CONN_LOCAL_PEER_CLIENT" + : "CONN_REMOTE_PEER_CLIENT"; + case CONN_DNODE_PEER_SERVER: + return conn->same_dc ? 
"CONN_LOCAL_PEER_SERVER" + : "CONN_REMOTE_PEER_SERVER"; + } + return "INVALID"; } -static char* -_print_conn(const struct object *obj) -{ - ASSERT(obj->type == OBJ_CONN); - struct conn *conn = (struct conn *)obj; - if ((conn->type == CONN_DNODE_PEER_PROXY) || - (conn->type == CONN_PROXY)) { - snprintf(obj->print_buff, PRINT_BUF_SIZE, "<%s %p %d listening on '%.*s'>", - _conn_get_type_string(conn), conn, conn->sd, - conn->pname.len, conn->pname.data); - return obj->print_buff; - } - if ((conn->type == CONN_DNODE_PEER_CLIENT) || - (conn->type == CONN_CLIENT)) { - snprintf(obj->print_buff, PRINT_BUF_SIZE, "<%s %p %d from '%.*s'>", - _conn_get_type_string(conn), conn, conn->sd, - conn->pname.len, conn->pname.data); - return obj->print_buff; - } - if ((conn->type == CONN_DNODE_PEER_SERVER) || - (conn->type == CONN_SERVER)) { - snprintf(obj->print_buff, PRINT_BUF_SIZE, "<%s %p %d to '%.*s'>", - _conn_get_type_string(conn), conn, conn->sd, - conn->pname.len, conn->pname.data); - return obj->print_buff; - } - - snprintf(obj->print_buff, PRINT_BUF_SIZE, "<%s %p %d>", - _conn_get_type_string(conn), conn, conn->sd); +static char *_print_conn(const struct object *obj) { + ASSERT(obj->type == OBJ_CONN); + struct conn *conn = (struct conn *)obj; + if ((conn->type == CONN_DNODE_PEER_PROXY) || (conn->type == CONN_PROXY)) { + snprintf(obj->print_buff, PRINT_BUF_SIZE, "<%s %p %d listening on '%.*s'>", + _conn_get_type_string(conn), conn, conn->sd, conn->pname.len, + conn->pname.data); return obj->print_buff; -} - - -struct conn * -_conn_get(void) -{ - struct conn *conn; + } + if ((conn->type == CONN_DNODE_PEER_CLIENT) || (conn->type == CONN_CLIENT)) { + snprintf(obj->print_buff, PRINT_BUF_SIZE, "<%s %p %d from '%.*s'>", + _conn_get_type_string(conn), conn, conn->sd, conn->pname.len, + conn->pname.data); + return obj->print_buff; + } + if ((conn->type == CONN_DNODE_PEER_SERVER) || (conn->type == CONN_SERVER)) { + snprintf(obj->print_buff, PRINT_BUF_SIZE, "<%s %p %d to '%.*s'>", + 
_conn_get_type_string(conn), conn, conn->sd, conn->pname.len, + conn->pname.data); + return obj->print_buff; + } - // Generate a new key for each connection - unsigned char *aes_key = generate_aes_key(); - if (aes_key == NULL) { - return NULL; - } + snprintf(obj->print_buff, PRINT_BUF_SIZE, "<%s %p %d>", + _conn_get_type_string(conn), conn, conn->sd); + return obj->print_buff; +} - if (!TAILQ_EMPTY(&free_connq)) { - ASSERT(nfree_connq > 0); - - conn = TAILQ_FIRST(&free_connq); - nfree_connq--; - TAILQ_REMOVE(&free_connq, conn, conn_tqe); - } else { - conn = dn_alloc(sizeof(*conn)); - if (conn == NULL) { - return NULL; - } - memset(conn, 0, sizeof(*conn)); +struct conn *_conn_get(void) { + struct conn *conn; + + // Generate a new key for each connection + unsigned char *aes_key = generate_aes_key(); + if (aes_key == NULL) { + return NULL; + } + + if (!TAILQ_EMPTY(&free_connq)) { + ASSERT(nfree_connq > 0); + + conn = TAILQ_FIRST(&free_connq); + nfree_connq--; + TAILQ_REMOVE(&free_connq, conn, conn_tqe); + } else { + conn = dn_alloc(sizeof(*conn)); + if (conn == NULL) { + return NULL; } - - init_object(&conn->object, OBJ_CONN, _print_conn); - conn->owner = NULL; - conn->conn_pool = NULL; - - // Save a key generated earlier within the connection - memcpy(conn->aes_key, aes_key, AES_KEYLEN); - - conn->sd = -1; - string_init(&conn->pname); - /* {family, addrlen, addr} are initialized in enqueue handler */ - - TAILQ_INIT(&conn->imsg_q); - - TAILQ_INIT(&conn->omsg_q); - - conn->rmsg = NULL; - conn->smsg = NULL; - - /* - * Callbacks {recv, recv_next, recv_done}, {send, send_next, send_done}, - * {close, active}, parse, {ref, unref}, {enqueue_inq, dequeue_inq} and - * {enqueue_outq, dequeue_outq} are initialized by the wrapper. 
- */ - - conn->send_bytes = 0; - conn->recv_bytes = 0; - - conn->events = 0; - conn->err = 0; - conn->recv_active = 0; - conn->recv_ready = 0; - conn->send_active = 0; - conn->send_ready = 0; - - conn->connecting = 0; - conn->connected = 0; - conn->eof = 0; - conn->done = 0; - conn->waiting_to_unref = 0; - - /* for dynomite */ - conn->dyn_mode = 0; - conn->dnode_secured = 0; - conn->crypto_key_sent = 0; - - conn->same_dc = 1; - conn->avail_tokens = msgs_per_sec(); - conn->last_sent = 0; - //conn->non_bytes_send = 0; - conn_set_read_consistency(conn, g_read_consistency); - conn_set_write_consistency(conn, g_write_consistency); - conn->type = CONN_UNSPECIFIED; - - return conn; + memset(conn, 0, sizeof(*conn)); + } + + init_object(&conn->object, OBJ_CONN, _print_conn); + conn->owner = NULL; + conn->conn_pool = NULL; + + // Save a key generated earlier within the connection + memcpy(conn->aes_key, aes_key, AES_KEYLEN); + + conn->sd = -1; + string_init(&conn->pname); + /* {family, addrlen, addr} are initialized in enqueue handler */ + + TAILQ_INIT(&conn->imsg_q); + + TAILQ_INIT(&conn->omsg_q); + + conn->rmsg = NULL; + conn->smsg = NULL; + + /* + * Callbacks {recv, recv_next, recv_done}, {send, send_next, send_done}, + * {close, active}, parse, {ref, unref}, {enqueue_inq, dequeue_inq} and + * {enqueue_outq, dequeue_outq} are initialized by the wrapper. 
+ */ + + conn->send_bytes = 0; + conn->recv_bytes = 0; + + conn->events = 0; + conn->err = 0; + conn->recv_active = 0; + conn->recv_ready = 0; + conn->send_active = 0; + conn->send_ready = 0; + + conn->connecting = 0; + conn->connected = 0; + conn->eof = 0; + conn->done = 0; + conn->waiting_to_unref = 0; + + /* for dynomite */ + conn->dyn_mode = 0; + conn->dnode_secured = 0; + conn->crypto_key_sent = 0; + + conn->same_dc = 1; + conn->avail_tokens = msgs_per_sec(); + conn->last_sent = 0; + // conn->non_bytes_send = 0; + conn_set_read_consistency(conn, g_read_consistency); + conn_set_write_consistency(conn, g_write_consistency); + conn->type = CONN_UNSPECIFIED; + + return conn; } -void -_add_to_ready_q(struct context *ctx, struct conn *conn) -{ - // This check is required to check if the connection is already - // on the ready queue - if (conn->ready_tqe.tqe_prev == NULL) { - struct server_pool *pool = &ctx->pool; - TAILQ_INSERT_TAIL(&pool->ready_conn_q, conn, ready_tqe); - } +void _add_to_ready_q(struct context *ctx, struct conn *conn) { + // This check is required to check if the connection is already + // on the ready queue + if (conn->ready_tqe.tqe_prev == NULL) { + struct server_pool *pool = &ctx->pool; + TAILQ_INSERT_TAIL(&pool->ready_conn_q, conn, ready_tqe); + } } -void -_remove_from_ready_q(struct context *ctx, struct conn *conn) -{ - // This check is required to check if the connection is already - // on the ready queue - if (conn->ready_tqe.tqe_prev != NULL) { - struct server_pool *pool = &ctx->pool; - TAILQ_REMOVE(&pool->ready_conn_q, conn, ready_tqe); - } +void _remove_from_ready_q(struct context *ctx, struct conn *conn) { + // This check is required to check if the connection is already + // on the ready queue + if (conn->ready_tqe.tqe_prev != NULL) { + struct server_pool *pool = &ctx->pool; + TAILQ_REMOVE(&pool->ready_conn_q, conn, ready_tqe); + } } -static void -_conn_free(struct conn *conn) -{ - log_debug(LOG_VVERB, "free conn %p", conn); - 
dn_free(conn); +static void _conn_free(struct conn *conn) { + log_debug(LOG_VVERB, "free conn %p", conn); + dn_free(conn); } -void -_conn_put(struct conn *conn) -{ - nfree_connq++; - TAILQ_INSERT_HEAD(&free_connq, conn, conn_tqe); - if (conn->conn_pool) - conn_pool_notify_conn_close(conn->conn_pool, conn); +void _conn_put(struct conn *conn) { + nfree_connq++; + TAILQ_INSERT_HEAD(&free_connq, conn, conn_tqe); + if (conn->conn_pool) conn_pool_notify_conn_close(conn->conn_pool, conn); } /** * Initialize connections. */ -void -_conn_init(void) -{ - log_debug(LOG_DEBUG, "conn size %d", sizeof(struct conn)); - nfree_connq = 0; - TAILQ_INIT(&free_connq); +void _conn_init(void) { + log_debug(LOG_DEBUG, "conn size %d", sizeof(struct conn)); + nfree_connq = 0; + TAILQ_INIT(&free_connq); } -void -_conn_deinit(void) -{ - struct conn *conn, *nconn; /* current and next connection */ +void _conn_deinit(void) { + struct conn *conn, *nconn; /* current and next connection */ - for (conn = TAILQ_FIRST(&free_connq); conn != NULL; - conn = nconn, nfree_connq--) { - ASSERT(nfree_connq > 0); - nconn = TAILQ_NEXT(conn, conn_tqe); - _conn_free(conn); - } - ASSERT(nfree_connq == 0); + for (conn = TAILQ_FIRST(&free_connq); conn != NULL; + conn = nconn, nfree_connq--) { + ASSERT(nfree_connq > 0); + nconn = TAILQ_NEXT(conn, conn_tqe); + _conn_free(conn); + } + ASSERT(nfree_connq == 0); } -rstatus_t -_conn_reuse(struct conn *p) -{ - rstatus_t status; - struct sockaddr_un *un; +rstatus_t _conn_reuse(struct conn *p) { + rstatus_t status; + struct sockaddr_un *un; - switch (p->family) { + switch (p->family) { case AF_INET: case AF_INET6: - status = dn_set_reuseaddr(p->sd); - break; + status = dn_set_reuseaddr(p->sd); + break; case AF_UNIX: - /* - * bind() will fail if the pathname already exist. So, we call unlink() - * to delete the pathname, in case it already exists. 
If it does not - * exist, unlink() returns error, which we ignore - */ - un = (struct sockaddr_un *) p->addr; - unlink(un->sun_path); - status = DN_OK; - break; + /* + * bind() will fail if the pathname already exist. So, we call unlink() + * to delete the pathname, in case it already exists. If it does not + * exist, unlink() returns error, which we ignore + */ + un = (struct sockaddr_un *)p->addr; + unlink(un->sun_path); + status = DN_OK; + break; default: - NOT_REACHED(); - status = DN_ERROR; - } + NOT_REACHED(); + status = DN_ERROR; + } - return status; + return status; } diff --git a/src/dyn_connection_internal.h b/src/dyn_connection_internal.h index e734b64b7..f8cdc1466 100644 --- a/src/dyn_connection_internal.h +++ b/src/dyn_connection_internal.h @@ -1,5 +1,11 @@ #pragma once -#include "dyn_core.h" + +#include "dyn_types.h" + +// Forward declarations. +struct conn; +struct context; + extern void _conn_deinit(void); extern void _conn_init(void); extern struct conn *_conn_get(void); diff --git a/src/dyn_connection_pool.c b/src/dyn_connection_pool.c index 413c37104..4f5716f7f 100644 --- a/src/dyn_connection_pool.c +++ b/src/dyn_connection_pool.c @@ -1,233 +1,214 @@ -#include "dyn_core.h" #include "dyn_connection_pool.h" +#include "dyn_core.h" #include "dyn_task.h" -#define MIN_WAIT_BEFORE_RECONNECT_IN_SECS 1ULL +#define MIN_WAIT_BEFORE_RECONNECT_IN_SECS 1ULL struct conn_pool { - struct object obj; - uint8_t max_connections; // connections this conn_pool owns - void *owner; // the owner of this conn pool, this gets passed - // to each connection - struct context *ctx; - func_conn_init_t func_conn_init; // initializtion function for each connection - - // connection state - struct array active_connections; /* pool connections */ - uint8_t active_conn_count; /* Count of currently good connections */ - - // backoff logic - uint8_t failure_count; - uint8_t max_failure_count; - msec_t current_timeout_sec; - msec_t max_timeout_sec; - struct task 
*scheduled_reconnect_task; + struct object obj; + uint8_t max_connections; // connections this conn_pool owns + void *owner; // the owner of this conn pool, this gets passed + // to each connection + struct context *ctx; + func_conn_init_t + func_conn_init; // initializtion function for each connection + + // connection state + struct array active_connections; /* pool connections */ + uint8_t active_conn_count; /* Count of currently good connections */ + + // backoff logic + uint8_t failure_count; + uint8_t max_failure_count; + msec_t current_timeout_sec; + msec_t max_timeout_sec; + struct task *scheduled_reconnect_task; }; -static char* -_print_conn_pool(const struct object *obj) -{ - ASSERT(obj->type == OBJ_CONN_POOL); - conn_pool_t *cp = (conn_pool_t *)obj; - snprintf(obj->print_buff, PRINT_BUF_SIZE, "", cp, - cp->active_conn_count, array_n(&cp->active_connections), cp->max_connections); - return obj->print_buff; -} - -static void -_create_missing_connections(conn_pool_t *cp) -{ - // create connections if they are less than required. 
- uint8_t idx = 0, failures = 0; - uint32_t count = array_n(&cp->active_connections); - while (idx < count) { - struct conn** pconn = array_get(&cp->active_connections, idx); - if (*pconn != NULL) { - idx++; - continue; - } - struct conn *conn = conn_get(cp->owner, cp->func_conn_init); - if (conn != NULL) { - conn->conn_pool = cp; - log_notice("%s %s created %s", print_obj(cp->owner), print_obj(cp), print_obj(conn)); - *pconn = conn; // set that in the array - cp->active_conn_count++; - idx++; - } else { - if (++failures == 3) { - return; - } - } - } +static char *_print_conn_pool(const struct object *obj) { + ASSERT(obj->type == OBJ_CONN_POOL); + conn_pool_t *cp = (conn_pool_t *)obj; + snprintf(obj->print_buff, PRINT_BUF_SIZE, + "", cp, + cp->active_conn_count, array_n(&cp->active_connections), + cp->max_connections); + return obj->print_buff; } -conn_pool_t * -conn_pool_create(struct context *ctx, void *owner, uint8_t max_connections, - func_conn_init_t func_conn_init, uint8_t max_failures, - sec_t max_timeout) -{ - conn_pool_t *cp = dn_alloc(sizeof(struct conn_pool)); - if (!cp) - return NULL; - init_object(&cp->obj, OBJ_CONN_POOL, _print_conn_pool); - cp->max_connections = max_connections; - cp->owner = owner; - cp->ctx = ctx; - cp->func_conn_init = func_conn_init; - - cp->active_conn_count = 0; - if (array_init(&cp->active_connections, max_connections, sizeof(struct conn *)) - != DN_OK) - { - log_notice("%s Failed to initialize conn array", print_obj(owner)); - dn_free(cp); - return NULL; +static void _create_missing_connections(conn_pool_t *cp) { + // create connections if they are less than required. 
+ uint8_t idx = 0, failures = 0; + uint32_t count = array_n(&cp->active_connections); + while (idx < count) { + struct conn **pconn = array_get(&cp->active_connections, idx); + if (*pconn != NULL) { + idx++; + continue; } - cp->failure_count = 0; - cp->max_failure_count = max_failures; - cp->current_timeout_sec = 0; - cp->max_timeout_sec = max_timeout; - cp->scheduled_reconnect_task = NULL; - - log_notice("%s Creating %s", print_obj(cp->owner), print_obj(cp)); - uint8_t idx = 0; - for (idx = 0; idx < max_connections; idx++) { - struct conn **pconn = array_push(&cp->active_connections); - *pconn = NULL; + struct conn *conn = conn_get(cp->owner, cp->func_conn_init); + if (conn != NULL) { + conn->conn_pool = cp; + log_notice("%s %s created %s", print_obj(cp->owner), print_obj(cp), + print_obj(conn)); + *pconn = conn; // set that in the array + cp->active_conn_count++; + idx++; + } else { + if (++failures == 3) { + return; + } } - _create_missing_connections(cp); - return cp; + } } -rstatus_t -conn_pool_preconnect(conn_pool_t *cp) -{ - log_notice("%s %s Preconnecting", print_obj(cp->owner), print_obj(cp)); - _create_missing_connections(cp); - // for each conn in array, call conn_connect - rstatus_t overall_status = DN_OK; - uint8_t idx = 0; - uint32_t count = array_n(&cp->active_connections); - for (idx = 0; idx < count; idx++) { - struct conn **pconn = array_get(&cp->active_connections, idx); - if (*pconn == NULL) - continue; - struct conn *conn = *pconn; - rstatus_t s = conn_connect(cp->ctx, conn); - if (s == DN_OK) { - continue; - } - // this will remove the connection from the array - conn_close(cp->ctx, conn); - overall_status = s; - } - return overall_status; +conn_pool_t *conn_pool_create(struct context *ctx, void *owner, + uint8_t max_connections, + func_conn_init_t func_conn_init, + uint8_t max_failures, sec_t max_timeout) { + conn_pool_t *cp = dn_alloc(sizeof(struct conn_pool)); + if (!cp) return NULL; + init_object(&cp->obj, OBJ_CONN_POOL, _print_conn_pool); 
+ cp->max_connections = max_connections; + cp->owner = owner; + cp->ctx = ctx; + cp->func_conn_init = func_conn_init; + + cp->active_conn_count = 0; + if (array_init(&cp->active_connections, max_connections, + sizeof(struct conn *)) != DN_OK) { + log_notice("%s Failed to initialize conn array", print_obj(owner)); + dn_free(cp); + return NULL; + } + cp->failure_count = 0; + cp->max_failure_count = max_failures; + cp->current_timeout_sec = 0; + cp->max_timeout_sec = max_timeout; + cp->scheduled_reconnect_task = NULL; + + log_notice("%s Creating %s", print_obj(cp->owner), print_obj(cp)); + uint8_t idx = 0; + for (idx = 0; idx < max_connections; idx++) { + struct conn **pconn = array_push(&cp->active_connections); + *pconn = NULL; + } + _create_missing_connections(cp); + return cp; } -struct conn * -conn_pool_get(conn_pool_t *cp, int tag) -{ - struct conn **pconn = array_get(&cp->active_connections, - (uint32_t)tag % array_n(&cp->active_connections)); - if (*pconn) { - if ((*pconn)->connected) { - return *pconn; - } else { - return NULL; - } +rstatus_t conn_pool_preconnect(conn_pool_t *cp) { + log_notice("%s %s Preconnecting", print_obj(cp->owner), print_obj(cp)); + _create_missing_connections(cp); + // for each conn in array, call conn_connect + rstatus_t overall_status = DN_OK; + uint8_t idx = 0; + uint32_t count = array_n(&cp->active_connections); + for (idx = 0; idx < count; idx++) { + struct conn **pconn = array_get(&cp->active_connections, idx); + if (*pconn == NULL) continue; + struct conn *conn = *pconn; + rstatus_t s = conn_connect(cp->ctx, conn); + if (s == DN_OK) { + continue; } - return *pconn; + // this will remove the connection from the array + conn_close(cp->ctx, conn); + overall_status = s; + } + return overall_status; } -rstatus_t -conn_pool_destroy(conn_pool_t *cp) -{ - uint8_t idx = 0; - uint32_t count = array_n(&cp->active_connections); - for (idx = 0; idx < count; idx++) { - struct conn** pconn = array_get(&cp->active_connections, idx); - if 
(*pconn == NULL) { - continue; - } - struct conn *conn = *pconn; - log_notice("%s Closing %s", print_obj(cp), print_obj(conn)); - conn_close(cp->ctx, conn); - *pconn = NULL; - } - if (cp->scheduled_reconnect_task) { - log_notice("%s %s Cancelling task %p", print_obj(cp->owner), print_obj(cp), - cp->scheduled_reconnect_task); - cancel_task(cp->scheduled_reconnect_task); - cp->scheduled_reconnect_task = NULL; +struct conn *conn_pool_get(conn_pool_t *cp, int tag) { + struct conn **pconn = + array_get(&cp->active_connections, + (uint32_t)tag % array_n(&cp->active_connections)); + if (*pconn) { + if ((*pconn)->connected) { + return *pconn; + } else { + return NULL; } - log_notice("%s Destroying", print_obj(cp)); - dn_free(cp); - return DN_OK; + } + return *pconn; } -void -conn_pool_notify_conn_close(conn_pool_t *cp, struct conn *conn) -{ - log_notice("%s Removing %s", print_obj(cp), print_obj(conn)); - if (conn == NULL) - return; - - uint8_t idx = 0; - uint32_t count = array_n(&cp->active_connections); - for (idx = 0; idx < count; idx++) { - struct conn** pconn = array_get(&cp->active_connections, idx); - if (*pconn == conn) { - *pconn = NULL; - cp->active_conn_count--; - return; - } +rstatus_t conn_pool_destroy(conn_pool_t *cp) { + uint8_t idx = 0; + uint32_t count = array_n(&cp->active_connections); + for (idx = 0; idx < count; idx++) { + struct conn **pconn = array_get(&cp->active_connections, idx); + if (*pconn == NULL) { + continue; } - log_warn("%s did not find %s", print_obj(cp), print_obj(conn)); -} - -static void -_conn_pool_reconnect_task(void *arg1) -{ - conn_pool_t *cp = arg1; + struct conn *conn = *pconn; + log_notice("%s Closing %s", print_obj(cp), print_obj(conn)); + conn_close(cp->ctx, conn); + *pconn = NULL; + } + if (cp->scheduled_reconnect_task) { + log_notice("%s %s Cancelling task %p", print_obj(cp->owner), print_obj(cp), + cp->scheduled_reconnect_task); + cancel_task(cp->scheduled_reconnect_task); cp->scheduled_reconnect_task = NULL; - 
conn_pool_preconnect(cp); + } + log_notice("%s Destroying", print_obj(cp)); + dn_free(cp); + return DN_OK; } -void -conn_pool_notify_conn_errored(conn_pool_t *cp) -{ - // check if reconnect task is active - // if so , never mind - if (cp->scheduled_reconnect_task) { - log_notice("%s already have a reconnect task %p", print_obj(cp), cp->scheduled_reconnect_task); - return; +void conn_pool_notify_conn_close(conn_pool_t *cp, struct conn *conn) { + log_notice("%s Removing %s", print_obj(cp), print_obj(conn)); + if (conn == NULL) return; + + uint8_t idx = 0; + uint32_t count = array_n(&cp->active_connections); + for (idx = 0; idx < count; idx++) { + struct conn **pconn = array_get(&cp->active_connections, idx); + if (*pconn == conn) { + *pconn = NULL; + cp->active_conn_count--; + return; } - // else increase error count, and schedule a task after the backoff wait - cp->failure_count++; - - if (cp->current_timeout_sec < (MIN_WAIT_BEFORE_RECONNECT_IN_SECS)) - cp->current_timeout_sec = MIN_WAIT_BEFORE_RECONNECT_IN_SECS; + } + log_warn("%s did not find %s", print_obj(cp), print_obj(conn)); +} - cp->scheduled_reconnect_task = schedule_task_1(_conn_pool_reconnect_task, - cp, cp->current_timeout_sec * 1000); - log_notice("%s %s Scheduled reconnect task %p after %u secs", - print_obj(cp->owner), print_obj(cp), cp->scheduled_reconnect_task, - cp->current_timeout_sec); +static void _conn_pool_reconnect_task(void *arg1) { + conn_pool_t *cp = arg1; + cp->scheduled_reconnect_task = NULL; + conn_pool_preconnect(cp); +} - cp->current_timeout_sec = 2 * cp->current_timeout_sec; - if (cp->current_timeout_sec > cp->max_timeout_sec) - cp->current_timeout_sec = cp->max_timeout_sec; +void conn_pool_notify_conn_errored(conn_pool_t *cp) { + // check if reconnect task is active + // if so , never mind + if (cp->scheduled_reconnect_task) { + log_notice("%s already have a reconnect task %p", print_obj(cp), + cp->scheduled_reconnect_task); + return; + } + // else increase error count, and schedule 
a task after the backoff wait + cp->failure_count++; + + if (cp->current_timeout_sec < (MIN_WAIT_BEFORE_RECONNECT_IN_SECS)) + cp->current_timeout_sec = MIN_WAIT_BEFORE_RECONNECT_IN_SECS; + + cp->scheduled_reconnect_task = schedule_task_1( + _conn_pool_reconnect_task, cp, cp->current_timeout_sec * 1000); + log_notice("%s %s Scheduled reconnect task %p after %u secs", + print_obj(cp->owner), print_obj(cp), cp->scheduled_reconnect_task, + cp->current_timeout_sec); + + cp->current_timeout_sec = 2 * cp->current_timeout_sec; + if (cp->current_timeout_sec > cp->max_timeout_sec) + cp->current_timeout_sec = cp->max_timeout_sec; } - -void -conn_pool_connected(conn_pool_t *cp, struct conn *conn) -{ - cp->failure_count = 0; - cp->current_timeout_sec = 0; + +void conn_pool_connected(conn_pool_t *cp, struct conn *conn) { + cp->failure_count = 0; + cp->current_timeout_sec = 0; } -uint8_t -conn_pool_active_count(conn_pool_t *cp) -{ - return cp->active_conn_count; +uint8_t conn_pool_active_count(conn_pool_t *cp) { + return cp->active_conn_count; } diff --git a/src/dyn_connection_pool.h b/src/dyn_connection_pool.h index abdf30ffc..ef4b25ade 100644 --- a/src/dyn_connection_pool.h +++ b/src/dyn_connection_pool.h @@ -2,13 +2,14 @@ #include "dyn_connection.h" #include "dyn_types.h" -//struct conn_pool; + +// struct conn_pool; typedef struct conn_pool conn_pool_t; /** * Creates a connection pool with max_connections in it. - * creates connection objects using conn_get and uses func_conn_init to initialize - * them. + * creates connection objects using conn_get and uses func_conn_init to + * initialize them. */ conn_pool_t *conn_pool_create(struct context *ctx, void *owner, uint8_t max_connections, @@ -16,17 +17,17 @@ conn_pool_t *conn_pool_create(struct context *ctx, void *owner, uint8_t max_failures, sec_t max_timeout); /** - * This function starts a preconnect process for every underlying connection object - * but does not wait for it to finish. 
The conn_connect function automatically - * adds the connection to the event loop + * This function starts a preconnect process for every underlying connection + * object but does not wait for it to finish. The conn_connect function + * automatically adds the connection to the event loop */ rstatus_t conn_pool_preconnect(conn_pool_t *cp); /** * Given a tag (just an int number), get a connection from the connection pool. - * The purpose of the tag is to get the same underlying connection for a given tag. - * If the tag is not seen before, a new random connection is allocated. - * And all subsequent conn_pool_get with the same tag should yield the same + * The purpose of the tag is to get the same underlying connection for a given + * tag. If the tag is not seen before, a new random connection is allocated. And + * all subsequent conn_pool_get with the same tag should yield the same * underlying connection. The tag could be as simple as the socket number of the * client connection to map to the underlying resource in the connection pool. * If this association is missing then the request from the client connections @@ -37,21 +38,21 @@ struct conn *conn_pool_get(conn_pool_t *cp, int tag); /** * This function, tears down all the connection in the pool, clears up its state - * + * */ rstatus_t conn_pool_destroy(conn_pool_t *cp); /** * If a connection that is part of a pool is being closed, this function should * called so the pool can do its cleanup. - * + * */ void conn_pool_notify_conn_close(conn_pool_t *cp, struct conn *conn); /** * If a connection that is part of a pool got errored, this function should * called before closing so the pool can schedule proper reconnection logic - * + * */ void conn_pool_notify_conn_errored(conn_pool_t *cp); diff --git a/src/dyn_core.c b/src/dyn_core.c index af7eae710..4c5023ad1 100644 --- a/src/dyn_core.c +++ b/src/dyn_core.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. 
- * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -23,151 +23,126 @@ #include #include -#include "dyn_core.h" #include "dyn_conf.h" -#include "dyn_server.h" -#include "dyn_proxy.h" -#include "dyn_dnode_proxy.h" +#include "dyn_core.h" #include "dyn_dnode_peer.h" +#include "dyn_dnode_proxy.h" #include "dyn_gossip.h" -#include "event/dyn_event.h" +#include "dyn_proxy.h" +#include "dyn_server.h" #include "dyn_task.h" +#include "event/dyn_event.h" uint32_t admin_opt = 0; -static void -core_print_peer_status(void *arg1) -{ - struct context *ctx = arg1; - struct server_pool *sp = &ctx->pool; - // iterate over all peers - uint32_t dc_cnt = array_n(&sp->datacenters); - uint32_t dc_index; - for(dc_index = 0; dc_index < dc_cnt; dc_index++) { - struct datacenter *dc = array_get(&sp->datacenters, dc_index); - if (!dc) - log_panic("DC is null. Topology not inited proerly"); - uint8_t rack_cnt = (uint8_t)array_n(&dc->racks); - uint8_t rack_index; - for(rack_index = 0; rack_index < rack_cnt; rack_index++) { - struct rack *rack = array_get(&dc->racks, rack_index); - uint8_t i = 0; - for (i = 0; i< rack->ncontinuum; i++) { - struct continuum *c = &rack->continuum[i]; - uint32_t peer_index = c->index; - struct node *peer = *(struct node **)array_get(&sp->peers, peer_index); - if (!peer) - log_panic("peer is null. 
Topology not inited proerly"); - - log_notice("%u)%p %.*s %.*s %.*s %s", peer_index, peer,dc->name->len, dc->name->data, - rack->name->len, rack->name->data, peer->endpoint.pname.len, - peer->endpoint.pname.data, get_state(peer->state)); - } - } +static void core_print_peer_status(void *arg1) { + struct context *ctx = arg1; + struct server_pool *sp = &ctx->pool; + // iterate over all peers + uint32_t dc_cnt = array_n(&sp->datacenters); + uint32_t dc_index; + for (dc_index = 0; dc_index < dc_cnt; dc_index++) { + struct datacenter *dc = array_get(&sp->datacenters, dc_index); + if (!dc) log_panic("DC is null. Topology not inited proerly"); + uint8_t rack_cnt = (uint8_t)array_n(&dc->racks); + uint8_t rack_index; + for (rack_index = 0; rack_index < rack_cnt; rack_index++) { + struct rack *rack = array_get(&dc->racks, rack_index); + uint8_t i = 0; + for (i = 0; i < rack->ncontinuum; i++) { + struct continuum *c = &rack->continuum[i]; + uint32_t peer_index = c->index; + struct node *peer = *(struct node **)array_get(&sp->peers, peer_index); + if (!peer) log_panic("peer is null. 
Topology not inited proerly"); + + log_notice("%u)%p %.*s %.*s %.*s %s", peer_index, peer, dc->name->len, + dc->name->data, rack->name->len, rack->name->data, + peer->endpoint.pname.len, peer->endpoint.pname.data, + get_state(peer->state)); + } } + } } -void -core_set_local_state(struct context *ctx, dyn_state_t state) -{ - struct server_pool *sp = &ctx->pool; - struct node *peer = *(struct node **)array_get(&sp->peers, 0); - ctx->dyn_state = state; - peer->state = state; +void core_set_local_state(struct context *ctx, dyn_state_t state) { + struct server_pool *sp = &ctx->pool; + struct node *peer = *(struct node **)array_get(&sp->peers, 0); + ctx->dyn_state = state; + peer->state = state; } -static rstatus_t -core_init_last(struct context *ctx) -{ - core_debug(ctx); - preselect_remote_rack_for_replication(ctx); - // Print the network health once after 30 secs - schedule_task_1(core_print_peer_status, ctx, 30000); - return DN_OK; +static rstatus_t core_init_last(struct context *ctx) { + core_debug(ctx); + preselect_remote_rack_for_replication(ctx); + // Print the network health once after 30 secs + schedule_task_1(core_print_peer_status, ctx, 30000); + return DN_OK; } -static rstatus_t -core_gossip_pool_init(struct context *ctx) -{ - //init ring msg queue - CBUF_Init(C2G_InQ); - CBUF_Init(C2G_OutQ); +static rstatus_t core_gossip_pool_init(struct context *ctx) { + // init ring msg queue + CBUF_Init(C2G_InQ); + CBUF_Init(C2G_OutQ); - THROW_STATUS(gossip_pool_init(ctx)); - THROW_STATUS(core_init_last(ctx)); - return DN_OK; + THROW_STATUS(gossip_pool_init(ctx)); + THROW_STATUS(core_init_last(ctx)); + return DN_OK; } -static rstatus_t -core_dnode_peer_pool_preconnect(struct context *ctx) -{ - rstatus_t status = dnode_peer_pool_preconnect(ctx); - IGNORE_RET_VAL(status); - status = core_gossip_pool_init(ctx); - //if (status != DN_OK) - // gossip_pool_deinit(ctx); - return status; +static rstatus_t core_dnode_peer_pool_preconnect(struct context *ctx) { + rstatus_t status = 
dnode_peer_pool_preconnect(ctx); + IGNORE_RET_VAL(status); + status = core_gossip_pool_init(ctx); + // if (status != DN_OK) + // gossip_pool_deinit(ctx); + return status; } -static rstatus_t -core_dnode_peer_init(struct context *ctx) -{ - /* initialize peers */ - THROW_STATUS(dnode_initialize_peers(ctx)); - rstatus_t status = core_dnode_peer_pool_preconnect(ctx); - if (status != DN_OK) - dnode_peer_pool_disconnect(ctx); - return status; +static rstatus_t core_dnode_peer_init(struct context *ctx) { + /* initialize peers */ + THROW_STATUS(dnode_initialize_peers(ctx)); + rstatus_t status = core_dnode_peer_pool_preconnect(ctx); + if (status != DN_OK) dnode_peer_pool_disconnect(ctx); + return status; } -static rstatus_t -core_dnode_proxy_init(struct context *ctx) -{ - /* initialize dnode listener per server pool */ - THROW_STATUS(dnode_proxy_init(ctx)); +static rstatus_t core_dnode_proxy_init(struct context *ctx) { + /* initialize dnode listener per server pool */ + THROW_STATUS(dnode_proxy_init(ctx)); - ctx->dyn_state = JOINING; //TODOS: change this to JOINING - rstatus_t status = core_dnode_peer_init(ctx); - if (status != DN_OK) { - dnode_peer_deinit(&ctx->pool.peers); - } - return status; + ctx->dyn_state = JOINING; // TODOS: change this to JOINING + rstatus_t status = core_dnode_peer_init(ctx); + if (status != DN_OK) { + dnode_peer_deinit(&ctx->pool.peers); + } + return status; } -static rstatus_t -core_proxy_init(struct context *ctx) -{ - /* initialize proxy per server pool */ - THROW_STATUS(proxy_init(ctx)); - rstatus_t status = core_dnode_proxy_init(ctx); - if (status != DN_OK) - dnode_proxy_deinit(ctx); - return status; +static rstatus_t core_proxy_init(struct context *ctx) { + /* initialize proxy per server pool */ + THROW_STATUS(proxy_init(ctx)); + rstatus_t status = core_dnode_proxy_init(ctx); + if (status != DN_OK) dnode_proxy_deinit(ctx); + return status; } -static rstatus_t -core_server_pool_preconnect(struct context *ctx) -{ - rstatus_t status = 
server_pool_preconnect(ctx); - IGNORE_RET_VAL(status); +static rstatus_t core_server_pool_preconnect(struct context *ctx) { + rstatus_t status = server_pool_preconnect(ctx); + IGNORE_RET_VAL(status); - status = core_proxy_init(ctx); - if (status != DN_OK) - proxy_deinit(ctx); - return status; + status = core_proxy_init(ctx); + if (status != DN_OK) proxy_deinit(ctx); + return status; } -static rstatus_t -core_event_base_create(struct context *ctx) -{ - /* initialize event handling for client, proxy and server */ - ctx->evb = event_base_create(EVENT_SIZE, &core_core); - if (ctx->evb == NULL) { - log_error("Failed to create socket event handling!!!"); - return DN_ERROR; - } - rstatus_t status = core_server_pool_preconnect(ctx); - if (status != DN_OK) - server_pool_disconnect(ctx); - return status; +static rstatus_t core_event_base_create(struct context *ctx) { + /* initialize event handling for client, proxy and server */ + ctx->evb = event_base_create(EVENT_SIZE, &core_core); + if (ctx->evb == NULL) { + log_error("Failed to create socket event handling!!!"); + return DN_ERROR; + } + rstatus_t status = core_server_pool_preconnect(ctx); + if (status != DN_OK) server_pool_disconnect(ctx); + return status; } /** @@ -175,21 +150,18 @@ core_event_base_create(struct context *ctx) * @param[in,out] ctx Context. * @return rstatus_t Return status code. 
*/ -static rstatus_t -core_entropy_init(struct context *ctx) -{ - struct instance *nci = ctx->instance; - /* initializing anti-entropy */ - ctx->entropy = entropy_init(ctx, nci->entropy_port, nci->entropy_addr); - if (ctx->entropy == NULL) { - log_error("Failed to create entropy!!!"); - } - - rstatus_t status = core_event_base_create(ctx); - if (status != DN_OK) - event_base_destroy(ctx->evb); - - return status; +static rstatus_t core_entropy_init(struct context *ctx) { + struct instance *nci = ctx->instance; + /* initializing anti-entropy */ + ctx->entropy = entropy_init(ctx, nci->entropy_port, nci->entropy_addr); + if (ctx->entropy == NULL) { + log_error("Failed to create entropy!!!"); + } + + rstatus_t status = core_event_base_create(ctx); + if (status != DN_OK) event_base_destroy(ctx->evb); + + return status; } /** @@ -198,24 +170,21 @@ core_entropy_init(struct context *ctx) * @param[in,out] ctx Context. * @return rstatus_t Return status code. */ -static rstatus_t -core_stats_create(struct context *ctx) -{ - struct instance *nci = ctx->instance; - struct server_pool *sp = &ctx->pool; - - ctx->stats = stats_create(sp->stats_endpoint.port, sp->stats_endpoint.pname, sp->stats_interval, - nci->hostname, &ctx->pool, ctx); - if (ctx->stats == NULL) { - log_error("Failed to create stats!!!"); - return DN_ERROR; - } +static rstatus_t core_stats_create(struct context *ctx) { + struct instance *nci = ctx->instance; + struct server_pool *sp = &ctx->pool; - rstatus_t status = core_entropy_init(ctx); - if (status != DN_OK) - entropy_conn_destroy(ctx->entropy); - - return status; + ctx->stats = stats_create(sp->stats_endpoint.port, sp->stats_endpoint.pname, + sp->stats_interval, nci->hostname, &ctx->pool, ctx); + if (ctx->stats == NULL) { + log_error("Failed to create stats!!!"); + return DN_ERROR; + } + + rstatus_t status = core_entropy_init(ctx); + if (status != DN_OK) entropy_conn_destroy(ctx->entropy); + + return status; } /** @@ -223,17 +192,15 @@ 
core_stats_create(struct context *ctx) * @param[in,out] ctx Dynomite server context. * @return rstatus_t Return status code. */ -static rstatus_t -core_crypto_init(struct context *ctx) -{ - /* crypto init */ - THROW_STATUS(crypto_init(&ctx->pool)); - rstatus_t status = core_stats_create(ctx); - if (status != DN_OK) { - if (ctx->stats) stats_destroy(ctx->stats); - } - - return status; +static rstatus_t core_crypto_init(struct context *ctx) { + /* crypto init */ + THROW_STATUS(crypto_init(&ctx->pool)); + rstatus_t status = core_stats_create(ctx); + if (status != DN_OK) { + if (ctx->stats) stats_destroy(ctx->stats); + } + + return status; } /** @@ -241,14 +208,11 @@ core_crypto_init(struct context *ctx) * @param[in,out] ctx Context. * @return rstatus_t Return status code. */ -static rstatus_t -core_server_pool_init(struct context *ctx) -{ - THROW_STATUS(server_pool_init(&ctx->pool, &ctx->cf->pool, ctx)); - rstatus_t status = core_crypto_init(ctx); - if (status != DN_OK) - crypto_deinit(); - return status; +static rstatus_t core_server_pool_init(struct context *ctx) { + THROW_STATUS(server_pool_init(&ctx->pool, &ctx->cf->pool, ctx)); + rstatus_t status = core_crypto_init(ctx); + if (status != DN_OK) crypto_deinit(); + return status; } /** @@ -256,59 +220,55 @@ core_server_pool_init(struct context *ctx) * @param[in,out] nci Dynomite instance. * @return rstatus_t Return status code. 
*/ -static rstatus_t -core_ctx_create(struct instance *nci) -{ - struct context *ctx; - - srand((unsigned) time(NULL)); - - ctx = dn_alloc(sizeof(*ctx)); - if (ctx == NULL) { - loga("Failed to create context!!!"); - return DN_ERROR; - } - - nci->ctx = ctx; - ctx->instance = nci; - ctx->cf = NULL; - ctx->stats = NULL; - ctx->evb = NULL; - ctx->dyn_state = INIT; - ctx->admin_opt = admin_opt; - - /* parse and create configuration */ - ctx->cf = conf_create(nci->conf_filename); - if (ctx->cf == NULL) { - loga("Failed to create conf!!!"); - dn_free(ctx); - return DN_ERROR; - } +static rstatus_t core_ctx_create(struct instance *nci) { + struct context *ctx; + + srand((unsigned)time(NULL)); + + ctx = dn_alloc(sizeof(*ctx)); + if (ctx == NULL) { + loga("Failed to create context!!!"); + return DN_ERROR; + } + + nci->ctx = ctx; + ctx->instance = nci; + ctx->cf = NULL; + ctx->stats = NULL; + ctx->evb = NULL; + ctx->dyn_state = INIT; + ctx->admin_opt = admin_opt; + + /* parse and create configuration */ + ctx->cf = conf_create(nci->conf_filename); + if (ctx->cf == NULL) { + loga("Failed to create conf!!!"); + dn_free(ctx); + return DN_ERROR; + } - struct conf_pool *cp = &ctx->cf->pool; - ctx->max_timeout = cp->stats_interval; - ctx->timeout = ctx->max_timeout; + struct conf_pool *cp = &ctx->cf->pool; + ctx->max_timeout = cp->stats_interval; + ctx->timeout = ctx->max_timeout; - rstatus_t status = core_server_pool_init(ctx); - if (status != DN_OK) { - server_pool_deinit(&ctx->pool); - conf_destroy(ctx->cf); - dn_free(ctx); - return DN_ERROR; - } - return status; -} - -static void -core_ctx_destroy(struct context *ctx) -{ - proxy_deinit(ctx); - server_pool_disconnect(ctx); - event_base_destroy(ctx->evb); - stats_destroy(ctx->stats); + rstatus_t status = core_server_pool_init(ctx); + if (status != DN_OK) { server_pool_deinit(&ctx->pool); conf_destroy(ctx->cf); dn_free(ctx); + return DN_ERROR; + } + return status; +} + +static void core_ctx_destroy(struct context *ctx) { + 
proxy_deinit(ctx); + server_pool_disconnect(ctx); + event_base_destroy(ctx->evb); + stats_destroy(ctx->stats); + server_pool_deinit(&ctx->pool); + conf_destroy(ctx->cf); + dn_free(ctx); } /** @@ -316,52 +276,50 @@ core_ctx_destroy(struct context *ctx) * @param[in] nci Dynomite instance. * @return rstatus_t Return status code. */ -rstatus_t -core_start(struct instance *nci) -{ - conn_init(); - task_mgr_init(); - - rstatus_t status = core_ctx_create(nci); - if (status != DN_OK) { - conn_deinit(); - return status; - } - - /** - * Providing mbuf_size and alloc_msgs through the command line - * has been deprecated. For backward compatibility - * we support both ways here: One through nci (command line) - * and one through the YAML file (server_pool). - */ - struct context *ctx = nci->ctx; - struct server_pool *sp = &ctx->pool; - - if(sp->mbuf_size == UNSET_NUM) { - loga("mbuf_size not in YAML: using deprecated way %d", nci->mbuf_chunk_size); - mbuf_init(nci->mbuf_chunk_size); - } else { - loga("YAML provided mbuf_size: %d", sp->mbuf_size); - mbuf_init(sp->mbuf_size); - } - if(sp->alloc_msgs_max == UNSET_NUM) { - loga("max_msgs not in YAML: using deprecated way %d", nci->alloc_msgs_max); - msg_init(nci->alloc_msgs_max); - } else { - loga("YAML provided max_msgs: %d", sp->alloc_msgs_max); - msg_init(sp->alloc_msgs_max); - } +rstatus_t core_start(struct instance *nci) { + conn_init(); + task_mgr_init(); + rstatus_t status = core_ctx_create(nci); + if (status != DN_OK) { + conn_deinit(); return status; + } + + /** + * Providing mbuf_size and alloc_msgs through the command line + * has been deprecated. For backward compatibility + * we support both ways here: One through nci (command line) + * and one through the YAML file (server_pool). 
+ */ + struct context *ctx = nci->ctx; + struct server_pool *sp = &ctx->pool; + + if (sp->mbuf_size == UNSET_NUM) { + loga("mbuf_size not in YAML: using deprecated way %d", + nci->mbuf_chunk_size); + mbuf_init(nci->mbuf_chunk_size); + } else { + loga("YAML provided mbuf_size: %d", sp->mbuf_size); + mbuf_init(sp->mbuf_size); + } + if (sp->alloc_msgs_max == UNSET_NUM) { + loga("max_msgs not in YAML: using deprecated way %d", nci->alloc_msgs_max); + msg_init(nci->alloc_msgs_max); + } else { + loga("YAML provided max_msgs: %d", sp->alloc_msgs_max); + msg_init(sp->alloc_msgs_max); + } + + return status; } -char* -print_server_pool(const struct object *obj) -{ - ASSERT(obj->type == OBJ_POOL); - struct server_pool *sp = (struct server_pool *)obj; - snprintf(obj->print_buff, PRINT_BUF_SIZE, "", sp, sp->name.len, sp->name.data); - return obj->print_buff; +char *print_server_pool(const struct object *obj) { + ASSERT(obj->type == OBJ_POOL); + struct server_pool *sp = (struct server_pool *)obj; + snprintf(obj->print_buff, PRINT_BUF_SIZE, "", sp, + sp->name.len, sp->name.data); + return obj->print_buff; } /** @@ -369,268 +327,264 @@ print_server_pool(const struct object *obj) * context. * @param[in] ctx Dynomite process context. 
*/ -void -core_stop(struct context *ctx) -{ - conn_deinit(); - msg_deinit(); - dmsg_deinit(); - mbuf_deinit(); - core_ctx_destroy(ctx); +void core_stop(struct context *ctx) { + conn_deinit(); + msg_deinit(); + dmsg_deinit(); + mbuf_deinit(); + core_ctx_destroy(ctx); } -static rstatus_t -core_recv(struct context *ctx, struct conn *conn) -{ - rstatus_t status; +static rstatus_t core_recv(struct context *ctx, struct conn *conn) { + rstatus_t status; - status = conn_recv(ctx, conn); - if (status != DN_OK) { - log_info("%s recv failed: %s", print_obj(conn), strerror(errno)); - } + status = conn_recv(ctx, conn); + if (status != DN_OK) { + log_info("%s recv failed: %s", print_obj(conn), strerror(errno)); + } - return status; + return status; } -static rstatus_t -core_send(struct context *ctx, struct conn *conn) -{ - rstatus_t status; +static rstatus_t core_send(struct context *ctx, struct conn *conn) { + rstatus_t status; - status = conn_send(ctx, conn); - if (status != DN_OK) { - log_info("%s send failed: %s", print_obj(conn), strerror(errno)); - } + status = conn_send(ctx, conn); + if (status != DN_OK) { + log_info("%s send failed: %s", print_obj(conn), strerror(errno)); + } - return status; + return status; } -static void -core_close(struct context *ctx, struct conn *conn) -{ - rstatus_t status; +static void core_close(struct context *ctx, struct conn *conn) { + rstatus_t status; - ASSERT(conn->sd > 0); + ASSERT(conn->sd > 0); - log_debug(LOG_NOTICE, "close %s on event %04"PRIX32" eof %d done " - "%d rb %zu sb %zu%c %s", print_obj(conn), - conn->events, conn->eof, conn->done, conn->recv_bytes, - conn->send_bytes, - conn->err ? ':' : ' ', conn->err ? strerror(conn->err) : ""); + log_debug(LOG_NOTICE, + "close %s on event %04" PRIX32 + " eof %d done " + "%d rb %zu sb %zu%c %s", + print_obj(conn), conn->events, conn->eof, conn->done, + conn->recv_bytes, conn->send_bytes, conn->err ? ':' : ' ', + conn->err ? 
strerror(conn->err) : ""); - status = conn_event_del_conn(conn); - if (status < 0) { - log_warn("event del conn %d failed, ignored: %s", - conn->sd, strerror(errno)); - } + status = conn_event_del_conn(conn); + if (status < 0) { + log_warn("event del conn %d failed, ignored: %s", conn->sd, + strerror(errno)); + } - conn_close(ctx, conn); + conn_close(ctx, conn); } -static void -core_error(struct context *ctx, struct conn *conn) -{ - rstatus_t status; +static void core_error(struct context *ctx, struct conn *conn) { + rstatus_t status; - status = dn_get_soerror(conn->sd); - if (status < 0) { - log_warn("get soerr on %s failed, ignored: %s", print_obj(conn), strerror(errno)); - } - conn->err = errno; + status = dn_get_soerror(conn->sd); + if (status < 0) { + log_warn("get soerr on %s failed, ignored: %s", print_obj(conn), + strerror(errno)); + } + conn->err = errno; - core_close(ctx, conn); + core_close(ctx, conn); } -static void -core_timeout(struct context *ctx) -{ - for (;;) { - struct msg *req; - struct conn *conn; - msec_t now, then; - - req = msg_tmo_min(); - if (req == NULL) { - ctx->timeout = ctx->max_timeout; - return; - } - - /* skip over req that are in-error or done */ - - if (req->is_error || req->done) { - msg_tmo_delete(req); - continue; - } - - /* - * timeout expired req and all the outstanding req on the timing - * out server - */ - - conn = req->tmo_rbe.data; - then = req->tmo_rbe.key; - - now = dn_msec_now(); - if (now < then) { - msec_t delta = (msec_t)(then - now); - ctx->timeout = MIN(delta, ctx->max_timeout); - return; - } - - log_warn("%s on %s timedout, timeout was %d", print_obj(req), print_obj(conn), req->tmo_rbe.timeout); - - msg_tmo_delete(req); - - if (conn->dyn_mode) { - if (conn->type == CONN_DNODE_PEER_SERVER) { //outgoing peer requests - if (conn->same_dc) - stats_pool_incr(ctx, peer_timedout_requests); - else - stats_pool_incr(ctx, remote_peer_timedout_requests); - } - } else { - if (conn->type == CONN_SERVER) { //storage server 
requests - stats_server_incr(ctx, server_dropped_requests); - } - } - - conn->err = ETIMEDOUT; - - core_close(ctx, conn); - } -} +static void core_timeout(struct context *ctx) { + for (;;) { + struct msg *req; + struct conn *conn; + msec_t now, then; + + req = msg_tmo_min(); + if (req == NULL) { + ctx->timeout = ctx->max_timeout; + return; + } + + /* skip over req that are in-error or done */ + + if (req->is_error || req->done) { + msg_tmo_delete(req); + continue; + } + + /* + * timeout expired req and all the outstanding req on the timing + * out server + */ + + conn = req->tmo_rbe.data; + then = req->tmo_rbe.key; + + now = dn_msec_now(); + if (now < then) { + msec_t delta = (msec_t)(then - now); + ctx->timeout = MIN(delta, ctx->max_timeout); + return; + } + + log_warn("%s on %s timedout, timeout was %d", print_obj(req), + print_obj(conn), req->tmo_rbe.timeout); + + msg_tmo_delete(req); + + if (conn->dyn_mode) { + if (conn->type == CONN_DNODE_PEER_SERVER) { // outgoing peer requests + if (conn->same_dc) + stats_pool_incr(ctx, peer_timedout_requests); + else + stats_pool_incr(ctx, remote_peer_timedout_requests); + } + } else { + if (conn->type == CONN_SERVER) { // storage server requests + stats_server_incr(ctx, server_dropped_requests); + } + } -rstatus_t -core_core(void *arg, uint32_t events) -{ - rstatus_t status; - struct conn *conn = arg; - struct context *ctx = conn_to_ctx(conn); - - log_debug(LOG_VVERB, "event %04"PRIX32" on %s", events, print_obj(conn)); - - conn->events = events; - - /* error takes precedence over read | write */ - if (events & EVENT_ERR) { - if (conn->err && conn->dyn_mode) { - loga("conn err on dnode EVENT_ERR: %d", conn->err); - } - core_error(ctx, conn); - - return DN_ERROR; - } - - /* read takes precedence over write */ - if (events & EVENT_READ) { - status = core_recv(ctx, conn); - - if (status != DN_OK || conn->done || conn->err) { - if (conn->dyn_mode) { - if (conn->err) { - loga("conn err on dnode EVENT_READ: %d", conn->err); - 
core_close(ctx, conn); - return DN_ERROR; - } - core_close(ctx, conn); - return DN_OK; - } - - core_close(ctx, conn); - return DN_ERROR; - } - } - - if (events & EVENT_WRITE) { - status = core_send(ctx, conn); - if (status != DN_OK || conn->done || conn->err) { - if (conn->dyn_mode) { - if (conn->err) { - loga("conn err on dnode EVENT_WRITE: %d", conn->err); - core_close(ctx, conn); - return DN_ERROR; - } - return DN_OK; - } - - core_close(ctx, conn); - return DN_ERROR; - } - } - - return DN_OK; + conn->err = ETIMEDOUT; + + core_close(ctx, conn); + } } +rstatus_t core_core(void *arg, uint32_t events) { + rstatus_t status; + struct conn *conn = arg; + struct context *ctx = conn_to_ctx(conn); -void -core_debug(struct context *ctx) -{ - log_debug(LOG_VERB, "=====================Peers info====================="); - struct server_pool *sp = &ctx->pool; - log_debug(LOG_VERB, "Server pool : '%.*s'", sp->name); - uint32_t j, n; - for (j = 0, n = array_n(&sp->peers); j < n; j++) { - log_debug(LOG_VERB, "=============================================="); - struct node *peer = *(struct node **) array_get(&sp->peers, j); - log_debug(LOG_VERB, "\tPeer DC : '%.*s'",peer ->dc); - log_debug(LOG_VERB, "\tPeer Rack : '%.*s'", peer->rack); - - log_debug(LOG_VERB, "\tPeer name : '%.*s'", peer->name); - log_debug(LOG_VERB, "\tPeer pname : '%.*s'", peer->endpoint.pname); - - log_debug(LOG_VERB, "\tPeer state : %s", get_state(peer->state)); - log_debug(LOG_VERB, "\tPeer port : %"PRIu32"", peer->endpoint.port); - log_debug(LOG_VERB, "\tPeer is_local : %"PRIu32" ", peer->is_local); - log_debug(LOG_VERB, "\tPeer failure_count : %"PRIu32" ", peer->failure_count); - log_debug(LOG_VERB, "\tPeer num tokens : %d", array_n(&peer->tokens)); - - uint32_t k; - for (k = 0; k < array_n(&peer->tokens); k++) { - struct dyn_token *token = (struct dyn_token *) array_get(&peer->tokens, k); - print_dyn_token(token, 12); - } + log_debug(LOG_VVERB, "event %04" PRIX32 " on %s", events, print_obj(conn)); + + 
conn->events = events; + + /* error takes precedence over read | write */ + if (events & EVENT_ERR) { + if (conn->err && conn->dyn_mode) { + loga("conn err on dnode EVENT_ERR: %d", conn->err); } + core_error(ctx, conn); + + return DN_ERROR; + } + + /* read takes precedence over write */ + if (events & EVENT_READ) { + status = core_recv(ctx, conn); - log_debug(LOG_VERB, "Peers Datacenters/racks/nodes ................................................."); - uint32_t dc_index, dc_len; - for(dc_index = 0, dc_len = array_n(&sp->datacenters); dc_index < dc_len; dc_index++) { - struct datacenter *dc = array_get(&sp->datacenters, dc_index); - log_debug(LOG_VERB, "Peer datacenter........'%.*s'", dc->name->len, dc->name->data); - uint32_t rack_index, rack_len; - for(rack_index=0, rack_len = array_n(&dc->racks); rack_index < rack_len; rack_index++) { - struct rack *rack = array_get(&dc->racks, rack_index); - log_debug(LOG_VERB, "\tPeer rack........'%.*s'", rack->name->len, rack->name->data); - log_debug(LOG_VERB, "\tPeer rack ncontinuumm : %d", rack->ncontinuum); - log_debug(LOG_VERB, "\tPeer rack nserver_continuum : %d", rack->nserver_continuum); + if (status != DN_OK || conn->done || conn->err) { + if (conn->dyn_mode) { + if (conn->err) { + loga("conn err on dnode EVENT_READ: %d", conn->err); + core_close(ctx, conn); + return DN_ERROR; } + core_close(ctx, conn); + return DN_OK; + } + + core_close(ctx, conn); + return DN_ERROR; } - log_debug(LOG_VERB, "..............................................................................."); + } + + if (events & EVENT_WRITE) { + status = core_send(ctx, conn); + if (status != DN_OK || conn->done || conn->err) { + if (conn->dyn_mode) { + if (conn->err) { + loga("conn err on dnode EVENT_WRITE: %d", conn->err); + core_close(ctx, conn); + return DN_ERROR; + } + return DN_OK; + } + + core_close(ctx, conn); + return DN_ERROR; + } + } + + return DN_OK; +} + +void core_debug(struct context *ctx) { + log_debug(LOG_VERB, 
"=====================Peers info====================="); + struct server_pool *sp = &ctx->pool; + log_debug(LOG_VERB, "Server pool : '%.*s'", sp->name); + uint32_t j, n; + for (j = 0, n = array_n(&sp->peers); j < n; j++) { + log_debug(LOG_VERB, "=============================================="); + struct node *peer = *(struct node **)array_get(&sp->peers, j); + log_debug(LOG_VERB, "\tPeer DC : '%.*s'", peer->dc); + log_debug(LOG_VERB, "\tPeer Rack : '%.*s'", peer->rack); + + log_debug(LOG_VERB, "\tPeer name : '%.*s'", peer->name); + log_debug(LOG_VERB, "\tPeer pname : '%.*s'", peer->endpoint.pname); + + log_debug(LOG_VERB, "\tPeer state : %s", get_state(peer->state)); + log_debug(LOG_VERB, "\tPeer port : %" PRIu32 "", + peer->endpoint.port); + log_debug(LOG_VERB, "\tPeer is_local : %" PRIu32 " ", peer->is_local); + log_debug(LOG_VERB, "\tPeer failure_count : %" PRIu32 " ", + peer->failure_count); + log_debug(LOG_VERB, "\tPeer num tokens : %d", array_n(&peer->tokens)); + + uint32_t k; + for (k = 0; k < array_n(&peer->tokens); k++) { + struct dyn_token *token = (struct dyn_token *)array_get(&peer->tokens, k); + print_dyn_token(token, 12); + } + } + + log_debug(LOG_VERB, + "Peers Datacenters/racks/nodes " + "................................................."); + uint32_t dc_index, dc_len; + for (dc_index = 0, dc_len = array_n(&sp->datacenters); dc_index < dc_len; + dc_index++) { + struct datacenter *dc = array_get(&sp->datacenters, dc_index); + log_debug(LOG_VERB, "Peer datacenter........'%.*s'", dc->name->len, + dc->name->data); + uint32_t rack_index, rack_len; + for (rack_index = 0, rack_len = array_n(&dc->racks); rack_index < rack_len; + rack_index++) { + struct rack *rack = array_get(&dc->racks, rack_index); + log_debug(LOG_VERB, "\tPeer rack........'%.*s'", rack->name->len, + rack->name->data); + log_debug(LOG_VERB, "\tPeer rack ncontinuumm : %d", rack->ncontinuum); + log_debug(LOG_VERB, "\tPeer rack nserver_continuum : %d", + rack->nserver_continuum); + } + } + 
log_debug(LOG_VERB, + ".................................................................." + "............."); } /** * Process elements in the circular buffer. * @return rstatus_t Return status code. */ -static rstatus_t -core_process_messages(void) -{ - log_debug(LOG_VVVERB, "length of C2G_OutQ : %d", CBUF_Len( C2G_OutQ )); - - // Continue to process messages while the circular buffer is not empty - while (!CBUF_IsEmpty(C2G_OutQ)) { - // Get an element from the beginning of the circular buffer - struct ring_msg *ring_msg = (struct ring_msg *) CBUF_Pop(C2G_OutQ); - if (ring_msg != NULL && ring_msg->cb != NULL) { - // CBUF_Push - // ./src/dyn_dnode_msg.c - // ./src/dyn_gossip.c - ring_msg->cb(ring_msg); - core_debug(ring_msg->sp->ctx); - ring_msg_deinit(ring_msg); - } - } - - return DN_OK; +static rstatus_t core_process_messages(void) { + log_debug(LOG_VVVERB, "length of C2G_OutQ : %d", CBUF_Len(C2G_OutQ)); + + // Continue to process messages while the circular buffer is not empty + while (!CBUF_IsEmpty(C2G_OutQ)) { + // Get an element from the beginning of the circular buffer + struct ring_msg *ring_msg = (struct ring_msg *)CBUF_Pop(C2G_OutQ); + if (ring_msg != NULL && ring_msg->cb != NULL) { + // CBUF_Push + // ./src/dyn_dnode_msg.c + // ./src/dyn_gossip.c + ring_msg->cb(ring_msg); + core_debug(ring_msg->sp->ctx); + ring_msg_deinit(ring_msg); + } + } + + return DN_OK; } /** @@ -638,34 +592,32 @@ core_process_messages(void) * @param[in] ctx Dynomite process context. * @return rstatus_t Return status code. 
*/ -rstatus_t -core_loop(struct context *ctx) -{ - int nsd; - - core_process_messages(); - - core_timeout(ctx); - execute_expired_tasks(0); - ctx->timeout = MIN(ctx->timeout, time_to_next_task()); - nsd = event_wait(ctx->evb, (int)ctx->timeout); - if (nsd < 0) { - return nsd; - } - - // go through all the ready queue and send each of them - /*struct server_pool *sp = &ctx->pool; - struct conn *conn, *nconn; - TAILQ_FOREACH_SAFE(conn, &sp->ready_conn_q, ready_tqe, nconn) { - rstatus_t status = core_send(ctx, conn); - if (status == DN_OK) { - log_debug(LOG_VVERB, "Flushing writes on %s", print_obj(conn)); - conn_event_del_out(conn); - } else { - TAILQ_REMOVE(&sp->ready_conn_q, conn, ready_tqe); - } - }*/ - stats_swap(ctx->stats); - - return DN_OK; +rstatus_t core_loop(struct context *ctx) { + int nsd; + + core_process_messages(); + + core_timeout(ctx); + execute_expired_tasks(0); + ctx->timeout = MIN(ctx->timeout, time_to_next_task()); + nsd = event_wait(ctx->evb, (int)ctx->timeout); + if (nsd < 0) { + return nsd; + } + + // go through all the ready queue and send each of them + /*struct server_pool *sp = &ctx->pool; + struct conn *conn, *nconn; + TAILQ_FOREACH_SAFE(conn, &sp->ready_conn_q, ready_tqe, nconn) { + rstatus_t status = core_send(ctx, conn); + if (status == DN_OK) { + log_debug(LOG_VVERB, "Flushing writes on %s", print_obj(conn)); + conn_event_del_out(conn); + } else { + TAILQ_REMOVE(&sp->ready_conn_q, conn, ready_tqe); + } + }*/ + stats_swap(ctx->stats); + + return DN_OK; } diff --git a/src/dyn_core.h b/src/dyn_core.h index e322aa1d4..cebf43a1f 100644 --- a/src/dyn_core.h +++ b/src/dyn_core.h @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. 
@@ -24,148 +24,133 @@ #define _DYN_CORE_H_ #ifdef HAVE_CONFIG_H -# include +#include #endif #ifdef HAVE_DEBUG_LOG -# define DN_DEBUG_LOG 1 +#define DN_DEBUG_LOG 1 #endif #ifdef HAVE_ASSERT_PANIC -# define DN_ASSERT_PANIC 1 +#define DN_ASSERT_PANIC 1 #endif #ifdef HAVE_ASSERT_LOG -# define DN_ASSERT_LOG 1 +#define DN_ASSERT_LOG 1 #endif #ifdef HAVE_STATS -# define DN_STATS 1 +#define DN_STATS 1 #else -# define DN_STATS 0 +#define DN_STATS 0 #endif #ifdef HAVE_EPOLL -# define DN_HAVE_EPOLL 1 +#define DN_HAVE_EPOLL 1 #elif HAVE_KQUEUE -# define DN_HAVE_KQUEUE 1 +#define DN_HAVE_KQUEUE 1 #elif HAVE_EVENT_PORTS -# define DN_HAVE_EVENT_PORTS 1 +#define DN_HAVE_EVENT_PORTS 1 #else -# error missing scalable I/O event notification mechanism +#error missing scalable I/O event notification mechanism #endif #ifdef HAVE_LITTLE_ENDIAN -# define DN_LITTLE_ENDIAN 1 +#define DN_LITTLE_ENDIAN 1 #endif #ifdef HAVE_BACKTRACE -# define DN_HAVE_BACKTRACE 1 +#define DN_HAVE_BACKTRACE 1 #endif -#define DN_NOOPS 1 -#define DN_OK 0 -#define DN_ERROR -1 -#define DN_EAGAIN -2 -#define DN_ENOMEM -3 -#define DN_ENO_IMPL -4 - - -typedef int rstatus_t; /* return type */ -typedef int err_t; /* error type */ - -#define THROW_STATUS(s) \ - { \ - rstatus_t __ret = (s); \ - if (__ret != DN_OK) { \ - log_debug(LOG_WARN, "failed "#s); \ - return __ret; \ - } \ - } - -#define IGNORE_RET_VAL(x) x; - +#include +#include +#include +#include +#include #include #include #include -#include -#include #include -#include -#include #include #include -#include -#include +#include #include -#include #include -#include +#include +#include -#include "dyn_types.h" #include "dyn_array.h" +#include "dyn_cbuf.h" +#include "dyn_connection.h" +#include "dyn_connection_pool.h" +#include "dyn_crypto.h" #include "dyn_dict.h" -#include "dyn_string.h" -#include "dyn_queue.h" -#include "dyn_rbtree.h" #include "dyn_log.h" -#include "dyn_util.h" -#include "dyn_stats.h" #include "dyn_mbuf.h" #include "dyn_message.h" -#include 
"dyn_connection.h" -#include "dyn_connection_pool.h" -#include "dyn_cbuf.h" +#include "dyn_queue.h" +#include "dyn_rbtree.h" #include "dyn_ring_queue.h" -#include "dyn_crypto.h" #include "dyn_setting.h" +#include "dyn_stats.h" +#include "dyn_string.h" +#include "dyn_types.h" +#include "dyn_util.h" +#include "hashkit/dyn_hashkit.h" #include "entropy/dyn_entropy.h" #define ENCRYPTION 1 -typedef rstatus_t (*hash_func_t)(const unsigned char *, size_t, struct dyn_token *); typedef enum dyn_state { - INIT = 0, - STANDBY = 1, - WRITES_ONLY = 2, - RESUMING = 3, - NORMAL = 4, - //SUSPENDING = 5, - //LEAVING = 6, - JOINING = 7, - DOWN = 8, - //REMOVED = 9, - //EXITING = 10, - RESET = 11, - UNKNOWN = 12 + INIT = 0, + STANDBY = 1, + WRITES_ONLY = 2, + RESUMING = 3, + NORMAL = 4, + // SUSPENDING = 5, + // LEAVING = 6, + JOINING = 7, + DOWN = 8, + // REMOVED = 9, + // EXITING = 10, + RESET = 11, + UNKNOWN = 12 } dyn_state_t; -static inline char* -get_state(dyn_state_t s) { - switch(s) - { - case INIT: return "INIT"; - case STANDBY: return "STANDBY"; - case WRITES_ONLY: return "WRITES_ONLY"; - case RESUMING: return "RESUMING"; - case NORMAL: return "NORMAL"; - //case SUSPENDING: return "SUSPENDING"; - //case LEAVING: return "LEAVING"; - case JOINING: return "JOINING"; - case DOWN: return "DOWN"; - //case REMOVED: return "REMOVED"; - //case EXITING: return "EXITING"; - case RESET: return "RESET"; - case UNKNOWN: return "Unknown"; - } - return "INVALID STATE"; +static inline char *get_state(dyn_state_t s) { + switch (s) { + case INIT: + return "INIT"; + case STANDBY: + return "STANDBY"; + case WRITES_ONLY: + return "WRITES_ONLY"; + case RESUMING: + return "RESUMING"; + case NORMAL: + return "NORMAL"; + // case SUSPENDING: return "SUSPENDING"; + // case LEAVING: return "LEAVING"; + case JOINING: + return "JOINING"; + case DOWN: + return "DOWN"; + // case REMOVED: return "REMOVED"; + // case EXITING: return "EXITING"; + case RESET: + return "RESET"; + case UNKNOWN: + return 
"Unknown"; + } + return "INVALID STATE"; } typedef enum data_store { - DATA_REDIS = 0, /* Data store is Redis */ - DATA_MEMCACHE = 1 /* Data store is Memcache */ + DATA_REDIS = 0, /* Data store is Redis */ + DATA_MEMCACHE = 1 /* Data store is Memcache */ } data_store_t; extern data_store_t g_data_store; @@ -179,88 +164,87 @@ extern uint32_t admin_opt; * pid file and various other properties. */ struct instance { - struct context *ctx; /* active context */ - int log_level; /* log level */ - char *log_filename; /* log filename */ - char *conf_filename; /* configuration filename */ - char hostname[DN_MAXHOSTNAMELEN]; /* hostname */ - uint16_t entropy_port; /* send reconciliation port */ - char *entropy_addr; /* send reconciliation addr */ - size_t mbuf_chunk_size; /* mbuf chunk size */ - size_t alloc_msgs_max; /* allocated messages buffer size */ - pid_t pid; /* process id */ - char *pid_filename; /* pid filename */ - unsigned pidfile:1; /* pid file created? */ + struct context *ctx; /* active context */ + int log_level; /* log level */ + char *log_filename; /* log filename */ + char *conf_filename; /* configuration filename */ + char hostname[DN_MAXHOSTNAMELEN]; /* hostname */ + uint16_t entropy_port; /* send reconciliation port */ + char *entropy_addr; /* send reconciliation addr */ + size_t mbuf_chunk_size; /* mbuf chunk size */ + size_t alloc_msgs_max; /* allocated messages buffer size */ + pid_t pid; /* process id */ + char *pid_filename; /* pid filename */ + unsigned pidfile : 1; /* pid file created? 
*/ }; - struct continuum { - uint32_t index; /* dyn_peer index */ - uint32_t value; /* hash value, used by ketama */ - struct dyn_token *token; /* used in vnode/dyn_token situations */ + uint32_t index; /* dyn_peer index */ + uint32_t value; /* hash value, used by ketama */ + struct dyn_token *token; /* used in vnode/dyn_token situations */ }; struct rack { - struct string *name; - struct string *dc; - uint32_t ncontinuum; /* # continuum points */ - uint32_t nserver_continuum; /* # servers - live and dead on continuum (const) */ - struct continuum *continuum; /* continuum */ + struct string *name; + struct string *dc; + uint32_t ncontinuum; /* # continuum points */ + uint32_t + nserver_continuum; /* # servers - live and dead on continuum (const) */ + struct continuum *continuum; /* continuum */ }; - struct datacenter { - struct string *name; /* datacenter name */ - struct array racks; /* list of racks in a datacenter */ - struct rack *preselected_rack_for_replication; - dict *dict_rack; + struct string *name; /* datacenter name */ + struct array racks; /* list of racks in a datacenter */ + struct rack *preselected_rack_for_replication; + dict *dict_rack; }; struct endpoint { - struct string pname; /* name:port:weight (ref in conf_server) */ - uint16_t port; /* port */ - int family; /* socket family */ - socklen_t addrlen; /* socket length */ - struct sockaddr *addr; /* socket address (ref in conf_server) */ + struct string pname; /* name:port:weight (ref in conf_server) */ + uint16_t port; /* port */ + int family; /* socket family */ + socklen_t addrlen; /* socket length */ + struct sockaddr *addr; /* socket address (ref in conf_server) */ }; struct datastore { - struct object obj; - uint32_t idx; /* server index */ - struct server_pool *owner; /* owner pool */ - struct endpoint endpoint; - struct string name; /* name (ref in conf_server) */ + struct object obj; + uint32_t idx; /* server index */ + struct server_pool *owner; /* owner pool */ + struct endpoint 
endpoint; + struct string name; /* name (ref in conf_server) */ - conn_pool_t *conn_pool; - uint8_t max_connections; + conn_pool_t *conn_pool; + uint8_t max_connections; - msec_t next_retry_ms; /* next retry time in msec */ - uint32_t failure_count; /* # consecutive failures */ + msec_t next_retry_ms; /* next retry time in msec */ + uint32_t failure_count; /* # consecutive failures */ }; /** \struct node * @brief Dynomite server node. */ struct node { - struct object obj; - uint32_t idx; /* server index */ - struct server_pool *owner; /* owner pool */ - struct endpoint endpoint; - struct string name; /* name (ref in conf_server) */ - - conn_pool_t *conn_pool; /* the only peer connection */ - - msec_t next_retry_ms; /* next retry time in msec */ - uint32_t failure_count; /* # consecutive failures */ - - struct string rack; /* logical rack */ - struct string dc; /* server's dc */ - struct array tokens; /* DHT tokens this peer owns */ - bool is_local; /* is this peer the current running node? */ - bool is_same_dc; /* is this peer the current running node? */ - unsigned processed:1; /* flag to indicate whether this has been processed */ - unsigned is_secure:1; /* is the connection to the server secure? */ - dyn_state_t state; /* state of the server - used mainly in peers */ + struct object obj; + uint32_t idx; /* server index */ + struct server_pool *owner; /* owner pool */ + struct endpoint endpoint; + struct string name; /* name (ref in conf_server) */ + + conn_pool_t *conn_pool; /* the only peer connection */ + + msec_t next_retry_ms; /* next retry time in msec */ + uint32_t failure_count; /* # consecutive failures */ + + struct string rack; /* logical rack */ + struct string dc; /* server's dc */ + struct array tokens; /* DHT tokens this peer owns */ + bool is_local; /* is this peer the current running node? */ + bool is_same_dc; /* is this peer the current running node? 
*/ + unsigned processed : 1; /* flag to indicate whether this has been processed */ + unsigned is_secure : 1; /* is the connection to the server secure? */ + dyn_state_t state; /* state of the server - used mainly in peers */ }; /** \struct server_pool @@ -273,58 +257,61 @@ struct node { * information such as dc, rack, node token and runtime environment. */ struct server_pool { - object_t object; - struct context *ctx; /* owner context */ - struct conf_pool *conf_pool; /* back reference to conf_pool */ - - struct conn *p_conn; /* proxy connection (listener) */ - struct conn_tqh c_conn_q; /* client connection q */ - struct conn_tqh ready_conn_q; /* ready connection q */ - - struct datastore *datastore; /* underlying datastore */ - struct array datacenters; /* racks info */ - uint64_t next_rebuild; /* next distribution rebuild time in usec */ - - struct string name; /* pool name (ref in conf_pool) */ - struct endpoint proxy_endpoint; - int key_hash_type; /* key hash type (hash_type_t) */ - hash_func_t key_hash; /* key hasher */ - struct string hash_tag; /* key hash tag (ref in conf_pool) */ - msec_t timeout; /* timeout in msec */ - int backlog; /* listen backlog */ - uint32_t client_connections; /* maximum # client connection */ - msec_t server_retry_timeout_ms; /* server retry timeout in msec */ - uint8_t server_failure_limit; /* server failure limit */ - unsigned auto_eject_hosts:1; /* auto_eject_hosts? */ - unsigned preconnect:1; /* preconnect? 
*/ - - /* dynomite */ - struct string seed_provider; - struct array peers; - struct conn *d_conn; /* dnode connection (listener) */ - struct endpoint dnode_proxy_endpoint; - int d_timeout; /* peer timeout in msec */ - int d_backlog; /* listen backlog */ - int64_t d_retry_timeout; /* peer retry timeout in usec */ - uint32_t d_failure_limit; /* peer failure limit */ - uint8_t max_local_peer_connections; - uint8_t max_remote_peer_connections; - struct string rack; /* the rack for this node */ - struct array tokens; /* the DHT tokens for this server */ - - msec_t g_interval; /* gossip interval */ - struct string dc; /* server's dc */ - struct string env; /* aws, network, etc */ - /* none | datacenter | rack | all in order of increasing number of connections. (default is datacenter) */ - secure_server_option_t secure_server_option; - struct string pem_key_file; - struct string recon_key_file; /* file with Key encryption in reconciliation */ - struct string recon_iv_file; /* file with Initialization Vector encryption in reconciliation */ - struct endpoint stats_endpoint; /* stats_listen: socket info for stats */ - msec_t stats_interval; /* stats aggregation interval */ - bool enable_gossip; /* enable/disable gossip */ - size_t mbuf_size; /* mbuf chunk size */ - size_t alloc_msgs_max; /* allocated messages buffer size */ + object_t object; + struct context *ctx; /* owner context */ + struct conf_pool *conf_pool; /* back reference to conf_pool */ + + struct conn *p_conn; /* proxy connection (listener) */ + struct conn_tqh c_conn_q; /* client connection q */ + struct conn_tqh ready_conn_q; /* ready connection q */ + + struct datastore *datastore; /* underlying datastore */ + struct array datacenters; /* racks info */ + uint64_t next_rebuild; /* next distribution rebuild time in usec */ + + struct string name; /* pool name (ref in conf_pool) */ + struct endpoint proxy_endpoint; + int key_hash_type; /* key hash type (hash_type_t) */ + hash_func_t key_hash; /* key hasher */ + 
struct string hash_tag; /* key hash tag (ref in conf_pool) */ + msec_t timeout; /* timeout in msec */ + int backlog; /* listen backlog */ + uint32_t client_connections; /* maximum # client connection */ + msec_t server_retry_timeout_ms; /* server retry timeout in msec */ + uint8_t server_failure_limit; /* server failure limit */ + unsigned auto_eject_hosts : 1; /* auto_eject_hosts? */ + unsigned preconnect : 1; /* preconnect? */ + + /* dynomite */ + struct string seed_provider; + struct array peers; + struct conn *d_conn; /* dnode connection (listener) */ + struct endpoint dnode_proxy_endpoint; + int d_timeout; /* peer timeout in msec */ + int d_backlog; /* listen backlog */ + int64_t d_retry_timeout; /* peer retry timeout in usec */ + uint32_t d_failure_limit; /* peer failure limit */ + uint8_t max_local_peer_connections; + uint8_t max_remote_peer_connections; + struct string rack; /* the rack for this node */ + struct array tokens; /* the DHT tokens for this server */ + + msec_t g_interval; /* gossip interval */ + struct string dc; /* server's dc */ + struct string env; /* aws, network, etc */ + /* none | datacenter | rack | all in order of increasing number of + * connections. (default is datacenter) */ + secure_server_option_t secure_server_option; + struct string pem_key_file; + struct string recon_key_file; /* file with Key encryption in reconciliation */ + struct string + recon_iv_file; /* file with Initialization Vector encryption in + reconciliation */ + struct endpoint stats_endpoint; /* stats_listen: socket info for stats */ + msec_t stats_interval; /* stats aggregation interval */ + bool enable_gossip; /* enable/disable gossip */ + size_t mbuf_size; /* mbuf chunk size */ + size_t alloc_msgs_max; /* allocated messages buffer size */ }; /** \struct context @@ -336,27 +323,25 @@ struct server_pool { * functionality is enabled/disabled. 
*/ struct context { - struct instance *instance; /* back pointer to instance */ - struct conf *cf; /* configuration */ - struct stats *stats; /* stats */ - struct entropy *entropy; /* reconciliation connection */ - struct server_pool pool; /* server_pool[] */ - struct event_base *evb; /* event base */ - msec_t max_timeout; /* max timeout in msec */ - msec_t timeout; /* timeout in msec */ - dyn_state_t dyn_state; /* state of the node. Don't need volatile as - it is ok to eventually get its new value */ - uint32_t admin_opt; /* admin mode */ + struct instance *instance; /* back pointer to instance */ + struct conf *cf; /* configuration */ + struct stats *stats; /* stats */ + struct entropy *entropy; /* reconciliation connection */ + struct server_pool pool; /* server_pool[] */ + struct event_base *evb; /* event base */ + msec_t max_timeout; /* max timeout in msec */ + msec_t timeout; /* timeout in msec */ + dyn_state_t dyn_state; /* state of the node. Don't need volatile as + it is ok to eventually get its new value */ + uint32_t admin_opt; /* admin mode */ }; - - rstatus_t core_start(struct instance *nci); void core_stop(struct context *ctx); rstatus_t core_core(void *arg, uint32_t events); rstatus_t core_loop(struct context *ctx); void core_debug(struct context *ctx); void core_set_local_state(struct context *ctx, dyn_state_t state); -char* print_server_pool(const struct object *obj); +char *print_server_pool(const struct object *obj); #endif diff --git a/src/dyn_crypto.c b/src/dyn_crypto.c index 6aba54a44..830cef770 100644 --- a/src/dyn_crypto.c +++ b/src/dyn_crypto.c @@ -1,17 +1,16 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. 
*/ -#include -#include -#include -#include #include #include +#include +#include +#include +#include #include - #include "dyn_core.h" #include "dyn_crypto.h" #include "dyn_server.h" @@ -20,7 +19,7 @@ static const EVP_CIPHER *aes_cipher; static RSA *rsa; static int rsa_size = 0; -static unsigned char aes_key[AES_KEYLEN+1]; +static unsigned char aes_key[AES_KEYLEN + 1]; static EVP_CIPHER_CTX *aes_encrypt_ctx; static EVP_CIPHER_CTX *aes_decrypt_ctx; @@ -30,75 +29,73 @@ static EVP_CIPHER_CTX *aes_decrypt_ctx; * @param[in,out] pem_key_file PEM key file. * @return rstatus_t Return status code. */ -static rstatus_t load_private_rsa_key_by_file(const struct string *pem_key_file) -{ - FILE * fp; - - if (string_empty(pem_key_file)) { - log_error("Error: PEM key file name is empty. Unable to read file."); - return DN_ERROR; +static rstatus_t load_private_rsa_key_by_file( + const struct string *pem_key_file) { + FILE *fp; + + if (string_empty(pem_key_file)) { + log_error("Error: PEM key file name is empty. 
Unable to read file."); + return DN_ERROR; + } + + char file_name[pem_key_file->len + 1]; + memcpy(file_name, pem_key_file->data, pem_key_file->len); + file_name[pem_key_file->len] = '\0'; + + if (access(file_name, F_OK) < 0) { + log_error("Error: file %s does not exist", file_name); + return DN_ERROR; + } + + if (NULL != (fp = fopen(file_name, "r"))) { + rsa = PEM_read_RSAPrivateKey(fp, NULL, NULL, NULL); + if (rsa == NULL) { + log_error("Error: could NOT read RSA pem key file at %s", file_name); + return DN_ERROR; } - char file_name[pem_key_file->len + 1]; - memcpy(file_name, pem_key_file->data, pem_key_file->len); - file_name[pem_key_file->len] = '\0'; - - if( access(file_name, F_OK ) < 0 ) { - log_error("Error: file %s does not exist", file_name); - return DN_ERROR; - } - - if(NULL != (fp= fopen(file_name, "r")) ) - { - rsa = PEM_read_RSAPrivateKey(fp, NULL, NULL, NULL); - if(rsa == NULL) - { - log_error("Error: could NOT read RSA pem key file at %s", file_name); - return DN_ERROR; - } - - } else { - log_error("Error: could NOT locate RSA pem key file at %s", file_name); - return DN_ERROR; - } + } else { + log_error("Error: could NOT locate RSA pem key file at %s", file_name); + return DN_ERROR; + } - rsa_size = RSA_size(rsa); + rsa_size = RSA_size(rsa); - log_debug(LOG_INFO, "Private RSA structure filled with size %d", rsa_size); - /* - char *pri_key; // Private key - char *pub_key; // Public key - size_t pri_len; // Length of private key - size_t pub_len; // Length of public key + log_debug(LOG_INFO, "Private RSA structure filled with size %d", rsa_size); + /* + char *pri_key; // Private key + char *pub_key; // Public key + size_t pri_len; // Length of private key + size_t pub_len; // Length of public key - // To get the C-string PEM form: - BIO *pri = BIO_new(BIO_s_mem()); - BIO *pub = BIO_new(BIO_s_mem()); + // To get the C-string PEM form: + BIO *pri = BIO_new(BIO_s_mem()); + BIO *pub = BIO_new(BIO_s_mem()); - PEM_write_bio_RSAPrivateKey(pri, rsa, NULL, 
NULL, 0, NULL, NULL); - PEM_write_bio_RSAPublicKey(pub, rsa); + PEM_write_bio_RSAPrivateKey(pri, rsa, NULL, NULL, 0, NULL, NULL); + PEM_write_bio_RSAPublicKey(pub, rsa); - pri_len = BIO_pending(pri); - pub_len = BIO_pending(pub); + pri_len = BIO_pending(pri); + pub_len = BIO_pending(pub); - pri_key = malloc(pri_len + 1); - pub_key = malloc(pub_len + 1); + pri_key = malloc(pri_len + 1); + pub_key = malloc(pub_len + 1); - BIO_read(pri, pri_key, pri_len); - BIO_read(pub, pub_key, pub_len); + BIO_read(pri, pri_key, pri_len); + BIO_read(pub, pub_key, pub_len); - pri_key[pri_len] = '\0'; - pub_key[pub_len] = '\0'; + pri_key[pri_len] = '\0'; + pub_key[pub_len] = '\0'; - //log_debug(LOG_VERB, ("pri_key %s", pri_key); - //log_debug(LOG_VERB, "pub_key %s", pub_key); + //log_debug(LOG_VERB, ("pri_key %s", pri_key); + //log_debug(LOG_VERB, "pub_key %s", pub_key); - BIO_free_all(pub); - BIO_free_all(pri); + BIO_free_all(pub); + BIO_free_all(pri); - */ + */ - return DN_OK; + return DN_OK; } /** @@ -106,56 +103,52 @@ static rstatus_t load_private_rsa_key_by_file(const struct string *pem_key_file) * @param[in] sp Server pool. * @return rstatus_t Return status code. */ -static rstatus_t -load_private_rsa_key(struct server_pool *sp) -{ - if (sp == NULL || string_empty(&sp->pem_key_file)) { - log_error("Could NOT read RSA pem key file due to bad context or configuration"); - return DN_ERROR; - } - - THROW_STATUS(load_private_rsa_key_by_file(&sp->pem_key_file)); - return DN_OK; +static rstatus_t load_private_rsa_key(struct server_pool *sp) { + if (sp == NULL || string_empty(&sp->pem_key_file)) { + log_error( + "Could NOT read RSA pem key file due to bad context or configuration"); + return DN_ERROR; + } + + THROW_STATUS(load_private_rsa_key_by_file(&sp->pem_key_file)); + return DN_OK; } /** * Initialize AES. * @return rstatus_t Return status code. 
*/ -static rstatus_t -aes_init(void) -{ - // Initialize contexts +static rstatus_t aes_init(void) { +// Initialize contexts #if OPENSSL_VERSION_NUMBER < 0x10100000L - aes_encrypt_ctx = (EVP_CIPHER_CTX*) malloc(sizeof(EVP_CIPHER_CTX)); - aes_decrypt_ctx = (EVP_CIPHER_CTX*) malloc(sizeof(EVP_CIPHER_CTX)); + aes_encrypt_ctx = (EVP_CIPHER_CTX *)malloc(sizeof(EVP_CIPHER_CTX)); + aes_decrypt_ctx = (EVP_CIPHER_CTX *)malloc(sizeof(EVP_CIPHER_CTX)); - EVP_CIPHER_CTX_init(aes_encrypt_ctx); - EVP_CIPHER_CTX_init(aes_decrypt_ctx); + EVP_CIPHER_CTX_init(aes_encrypt_ctx); + EVP_CIPHER_CTX_init(aes_decrypt_ctx); #else - aes_encrypt_ctx = EVP_CIPHER_CTX_new(); - aes_decrypt_ctx = EVP_CIPHER_CTX_new(); + aes_encrypt_ctx = EVP_CIPHER_CTX_new(); + aes_decrypt_ctx = EVP_CIPHER_CTX_new(); - EVP_CIPHER_CTX_reset(aes_encrypt_ctx); - EVP_CIPHER_CTX_reset(aes_decrypt_ctx); + EVP_CIPHER_CTX_reset(aes_encrypt_ctx); + EVP_CIPHER_CTX_reset(aes_decrypt_ctx); #endif - //EVP_CIPHER_CTX_set_padding(aes_encrypt_ctx, RSA_PKCS1_PADDING); - EVP_CIPHER_CTX_set_padding(aes_encrypt_ctx, RSA_NO_PADDING); + // EVP_CIPHER_CTX_set_padding(aes_encrypt_ctx, RSA_PKCS1_PADDING); + EVP_CIPHER_CTX_set_padding(aes_encrypt_ctx, RSA_NO_PADDING); - //EVP_CIPHER_CTX_set_padding(aes_decrypt_ctx, RSA_PKCS1_PADDING); - EVP_CIPHER_CTX_set_padding(aes_decrypt_ctx, RSA_NO_PADDING); + // EVP_CIPHER_CTX_set_padding(aes_decrypt_ctx, RSA_PKCS1_PADDING); + EVP_CIPHER_CTX_set_padding(aes_decrypt_ctx, RSA_NO_PADDING); - // Init AES - aes_cipher = EVP_aes_128_cbc(); + // Init AES + aes_cipher = EVP_aes_128_cbc(); + if (RAND_bytes(aes_key, AES_KEYLEN) == 0) { + return DN_ERROR; + } - if(RAND_bytes(aes_key, AES_KEYLEN) == 0) { - return DN_ERROR; - } - - return DN_OK; + return DN_OK; } /** @@ -163,64 +156,58 @@ aes_init(void) * @param[in] sp Server pool. * @return rstatus_t Return status code. 
*/ -rstatus_t -crypto_init(struct server_pool *sp) -{ - if (sp->secure_server_option == SECURE_OPTION_NONE) { - log_debug(LOG_NOTICE, "secure_server_option is none, skipping crypto_init()"); - return DN_OK; - } - - //init AES - THROW_STATUS(aes_init()); - //init RSA - THROW_STATUS(load_private_rsa_key(sp)); +rstatus_t crypto_init(struct server_pool *sp) { + if (sp->secure_server_option == SECURE_OPTION_NONE) { + log_debug(LOG_NOTICE, + "secure_server_option is none, skipping crypto_init()"); return DN_OK; + } + + // init AES + THROW_STATUS(aes_init()); + // init RSA + THROW_STATUS(load_private_rsa_key(sp)); + return DN_OK; } -rstatus_t -crypto_deinit(void) -{ +rstatus_t crypto_deinit(void) { #if OPENSSL_VERSION_NUMBER < 0x10100000L - EVP_CIPHER_CTX_cleanup(aes_encrypt_ctx); - EVP_CIPHER_CTX_cleanup(aes_decrypt_ctx); - free(aes_encrypt_ctx); - free(aes_decrypt_ctx); + EVP_CIPHER_CTX_cleanup(aes_encrypt_ctx); + EVP_CIPHER_CTX_cleanup(aes_decrypt_ctx); + free(aes_encrypt_ctx); + free(aes_decrypt_ctx); #else - EVP_CIPHER_CTX_free(aes_encrypt_ctx); - EVP_CIPHER_CTX_free(aes_decrypt_ctx); + EVP_CIPHER_CTX_free(aes_encrypt_ctx); + EVP_CIPHER_CTX_free(aes_decrypt_ctx); #endif - return DN_OK; + return DN_OK; } - -char* -base64_encode(const unsigned char *message, const size_t length) -{ - BIO *bio; - BIO *b64; - FILE* stream; - - size_t encodedSize = (size_t)(4 * ceil((double)length/3)); - char *buffer = (char*)malloc(encodedSize + 1); - if(buffer == NULL) { - fprintf(stderr, "Failed to allocate memory\n"); - exit(1); - } - - stream = fmemopen(buffer, (size_t)encodedSize + 1, "w"); - b64 = BIO_new(BIO_f_base64()); - bio = BIO_new_fp(stream, BIO_NOCLOSE); - bio = BIO_push(b64, bio); - BIO_set_flags(bio, BIO_FLAGS_BASE64_NO_NL); - BIO_write(bio, message, (int)length); - (void)BIO_flush(bio); - BIO_free_all(bio); - fclose(stream); - - return buffer; +char *base64_encode(const unsigned char *message, const size_t length) { + BIO *bio; + BIO *b64; + FILE *stream; + + size_t 
encodedSize = (size_t)(4 * ceil((double)length / 3)); + char *buffer = (char *)malloc(encodedSize + 1); + if (buffer == NULL) { + fprintf(stderr, "Failed to allocate memory\n"); + exit(1); + } + + stream = fmemopen(buffer, (size_t)encodedSize + 1, "w"); + b64 = BIO_new(BIO_f_base64()); + bio = BIO_new_fp(stream, BIO_NOCLOSE); + bio = BIO_push(b64, bio); + BIO_set_flags(bio, BIO_FLAGS_BASE64_NO_NL); + BIO_write(bio, message, (int)length); + (void)BIO_flush(bio); + BIO_free_all(bio); + fclose(stream); + + return buffer; } /* @@ -239,7 +226,8 @@ calc_decode_length(const char *b64input, const size_t length) } static int -base64_decode(const char *b64message, const size_t length, unsigned char **buffer) +base64_decode(const char *b64message, const size_t length, unsigned char +**buffer) { BIO *bio; BIO *b64; @@ -266,265 +254,285 @@ base64_decode(const char *b64message, const size_t length, unsigned char **buffe } */ -rstatus_t -aes_encrypt(const unsigned char *msg, size_t msg_len, unsigned char **enc_msg, unsigned char *arg_aes_key) -{ - int block_len = 0; - int enc_msg_len = 0; - - *enc_msg = (unsigned char*)malloc(msg_len + AES_BLOCK_SIZE); - if(enc_msg == NULL) - return DN_ERROR; - - //if(!EVP_EncryptInit_ex(aes_encrypt_ctx, aes_cipher, NULL, arg_aes_key, aes_iv)) { - if(!EVP_EncryptInit_ex(aes_encrypt_ctx, aes_cipher, NULL, arg_aes_key, arg_aes_key)) { - log_debug(LOG_VERB, "This is bad data in EVP_EncryptInit_ex : '%.*s'", msg_len, msg); - return DN_ERROR; - } - - if(!EVP_EncryptUpdate(aes_encrypt_ctx, *enc_msg, &block_len, (unsigned char*)msg, (int)msg_len)) { - log_debug(LOG_VERB, "This is bad data in EVP_EncryptUpdate : '%.*s'", msg_len, msg); - return DN_ERROR; - } - enc_msg_len += block_len; - - if(!EVP_EncryptFinal_ex(aes_encrypt_ctx, *enc_msg + enc_msg_len, &block_len)) { - log_debug(LOG_VERB, "This is bad data in EVP_EncryptFinal_ex : '%.*s'", msg_len, msg); - return DN_ERROR; - } - - //EVP_CIPHER_CTX_cleanup(aesEncryptCtx); - - return enc_msg_len + 
block_len; +rstatus_t aes_encrypt(const unsigned char *msg, size_t msg_len, + unsigned char **enc_msg, unsigned char *arg_aes_key) { + int block_len = 0; + int enc_msg_len = 0; + + *enc_msg = (unsigned char *)malloc(msg_len + AES_BLOCK_SIZE); + if (enc_msg == NULL) return DN_ERROR; + + // if(!EVP_EncryptInit_ex(aes_encrypt_ctx, aes_cipher, NULL, arg_aes_key, + // aes_iv)) { + if (!EVP_EncryptInit_ex(aes_encrypt_ctx, aes_cipher, NULL, arg_aes_key, + arg_aes_key)) { + log_debug(LOG_VERB, "This is bad data in EVP_EncryptInit_ex : '%.*s'", + msg_len, msg); + return DN_ERROR; + } + + if (!EVP_EncryptUpdate(aes_encrypt_ctx, *enc_msg, &block_len, + (unsigned char *)msg, (int)msg_len)) { + log_debug(LOG_VERB, "This is bad data in EVP_EncryptUpdate : '%.*s'", + msg_len, msg); + return DN_ERROR; + } + enc_msg_len += block_len; + + if (!EVP_EncryptFinal_ex(aes_encrypt_ctx, *enc_msg + enc_msg_len, + &block_len)) { + log_debug(LOG_VERB, "This is bad data in EVP_EncryptFinal_ex : '%.*s'", + msg_len, msg); + return DN_ERROR; + } + + // EVP_CIPHER_CTX_cleanup(aesEncryptCtx); + + return enc_msg_len + block_len; } - -rstatus_t -dyn_aes_encrypt(const unsigned char *msg, size_t msg_len, struct mbuf *mbuf, unsigned char *arg_aes_key) -{ - int block_len = 0; - int enc_msg_len = 0; - - ASSERT(mbuf != NULL && mbuf->last == mbuf->pos); - - //if(!EVP_EncryptInit_ex(aes_encrypt_ctx, aes_cipher, NULL, arg_aes_key, aes_iv)) { - if(!EVP_EncryptInit_ex(aes_encrypt_ctx, aes_cipher, NULL, arg_aes_key, arg_aes_key)) { - loga_hexdump(msg, msg_len, "Bad data in EVP_EncryptInit_ex, crypto data with %ld bytes of data", msg_len); - return DN_ERROR; - } - - if(!EVP_EncryptUpdate(aes_encrypt_ctx, mbuf->start, &block_len, (unsigned char*) msg, (int)msg_len)) { - loga_hexdump(msg, msg_len, "Bad data in EVP_EncryptUpdate, crypto data with %ld bytes of data", msg_len); - return DN_ERROR; - } - enc_msg_len += block_len; - - if(!EVP_EncryptFinal_ex(aes_encrypt_ctx, mbuf->start + enc_msg_len, &block_len)) { - 
loga_hexdump(msg, msg_len, "Bad data in EVP_EncryptFinal_ex, crypto data with %ld bytes of data", msg_len); - return DN_ERROR; - } +rstatus_t dyn_aes_encrypt(const unsigned char *msg, size_t msg_len, + struct mbuf *mbuf, unsigned char *arg_aes_key) { + int block_len = 0; + int enc_msg_len = 0; + + ASSERT(mbuf != NULL && mbuf->last == mbuf->pos); + + // if(!EVP_EncryptInit_ex(aes_encrypt_ctx, aes_cipher, NULL, arg_aes_key, + // aes_iv)) { + if (!EVP_EncryptInit_ex(aes_encrypt_ctx, aes_cipher, NULL, arg_aes_key, + arg_aes_key)) { + loga_hexdump( + msg, msg_len, + "Bad data in EVP_EncryptInit_ex, crypto data with %ld bytes of data", + msg_len); + return DN_ERROR; + } + + if (!EVP_EncryptUpdate(aes_encrypt_ctx, mbuf->start, &block_len, + (unsigned char *)msg, (int)msg_len)) { + loga_hexdump( + msg, msg_len, + "Bad data in EVP_EncryptUpdate, crypto data with %ld bytes of data", + msg_len); + return DN_ERROR; + } + enc_msg_len += block_len; + + if (!EVP_EncryptFinal_ex(aes_encrypt_ctx, mbuf->start + enc_msg_len, + &block_len)) { + loga_hexdump( + msg, msg_len, + "Bad data in EVP_EncryptFinal_ex, crypto data with %ld bytes of data", + msg_len); + return DN_ERROR; + } #if OPENSSL_VERSION_NUMBER < 0x10100000L - EVP_CIPHER_CTX_cleanup(aes_encrypt_ctx); + EVP_CIPHER_CTX_cleanup(aes_encrypt_ctx); #else - EVP_CIPHER_CTX_reset(aes_encrypt_ctx); + EVP_CIPHER_CTX_reset(aes_encrypt_ctx); #endif - //for encrypt, we allow to use up to the extra space - if (enc_msg_len + block_len > mbuf->end_extra - mbuf->last) { - return DN_ERROR; - } + // for encrypt, we allow to use up to the extra space + if (enc_msg_len + block_len > mbuf->end_extra - mbuf->last) { + return DN_ERROR; + } - mbuf->last = mbuf->pos + enc_msg_len + block_len; + mbuf->last = mbuf->pos + enc_msg_len + block_len; - return enc_msg_len + block_len; + return enc_msg_len + block_len; } +rstatus_t dyn_aes_decrypt(unsigned char *enc_msg, size_t enc_msg_len, + struct mbuf *mbuf, unsigned char *arg_aes_key) { + if (ENCRYPTION) 
{ + size_t dec_len = 0; + size_t block_len = 0; -rstatus_t -dyn_aes_decrypt(unsigned char *enc_msg, size_t enc_msg_len, struct mbuf *mbuf, unsigned char *arg_aes_key) -{ - if (ENCRYPTION) { - size_t dec_len = 0; - size_t block_len = 0; - - ASSERT(mbuf != NULL && mbuf->start == mbuf->pos); - - //if(!EVP_DecryptInit_ex(aes_decrypt_ctx, aes_cipher, NULL, arg_aes_key, aes_iv)) { - if(!EVP_DecryptInit_ex(aes_decrypt_ctx, aes_cipher, NULL, arg_aes_key, arg_aes_key)) { - loga_hexdump(enc_msg, enc_msg_len, "Bad data in EVP_DecryptInit_ex, crypto data with %ld bytes of data", enc_msg_len); - return DN_ERROR; - } + ASSERT(mbuf != NULL && mbuf->start == mbuf->pos); + + // if(!EVP_DecryptInit_ex(aes_decrypt_ctx, aes_cipher, NULL, arg_aes_key, + // aes_iv)) { + if (!EVP_DecryptInit_ex(aes_decrypt_ctx, aes_cipher, NULL, arg_aes_key, + arg_aes_key)) { + loga_hexdump( + enc_msg, enc_msg_len, + "Bad data in EVP_DecryptInit_ex, crypto data with %ld bytes of data", + enc_msg_len); + return DN_ERROR; + } - if(!EVP_DecryptUpdate(aes_decrypt_ctx, mbuf->pos, (int*) &block_len, enc_msg, (int)enc_msg_len)) { - loga_hexdump(enc_msg, enc_msg_len, "Bad data in EVP_DecryptUpdate, crypto data with %ld bytes of data", enc_msg_len); - return DN_ERROR; - } - dec_len += block_len; + if (!EVP_DecryptUpdate(aes_decrypt_ctx, mbuf->pos, (int *)&block_len, + enc_msg, (int)enc_msg_len)) { + loga_hexdump( + enc_msg, enc_msg_len, + "Bad data in EVP_DecryptUpdate, crypto data with %ld bytes of data", + enc_msg_len); + return DN_ERROR; + } + dec_len += block_len; - if(!EVP_DecryptFinal_ex(aes_decrypt_ctx, mbuf->pos + dec_len, (int*) &block_len)) { - loga_hexdump(enc_msg, enc_msg_len, "Bad data in EVP_DecryptFinal_ex, crypto data with %ld bytes of data", enc_msg_len); - return DN_ERROR; - } + if (!EVP_DecryptFinal_ex(aes_decrypt_ctx, mbuf->pos + dec_len, + (int *)&block_len)) { + loga_hexdump( + enc_msg, enc_msg_len, + "Bad data in EVP_DecryptFinal_ex, crypto data with %ld bytes of data", + enc_msg_len); + 
return DN_ERROR; + } - dec_len += block_len; - mbuf->last = mbuf->pos + dec_len; + dec_len += block_len; + mbuf->last = mbuf->pos + dec_len; #if OPENSSL_VERSION_NUMBER < 0x10100000L - EVP_CIPHER_CTX_cleanup(aes_decrypt_ctx); + EVP_CIPHER_CTX_cleanup(aes_decrypt_ctx); #else - EVP_CIPHER_CTX_reset(aes_decrypt_ctx); + EVP_CIPHER_CTX_reset(aes_decrypt_ctx); #endif - return (int) dec_len; - } + return (int)dec_len; + } - mbuf_copy(mbuf, enc_msg, enc_msg_len); - return (int) enc_msg_len; + mbuf_copy(mbuf, enc_msg, enc_msg_len); + return (int)enc_msg_len; } /* * AES encrypt a msg with one or more buffers * */ -rstatus_t -dyn_aes_encrypt_msg(struct msg *msg, unsigned char *arg_aes_key, size_t* outlen) -{ - struct mhdr mhdr_tem; - int count = 0; - - if (STAILQ_EMPTY(&msg->mhdr)) { - // 'msg' is empty. Nothing to encrypt. - return DN_ERROR; - } - - STAILQ_INIT(&mhdr_tem); - - struct mbuf *mbuf; - while (!STAILQ_EMPTY(&msg->mhdr)) { - mbuf = STAILQ_FIRST(&msg->mhdr); - //STAILQ_REMOVE_HEAD(&msg->mhdr, next); - mbuf_remove(&msg->mhdr, mbuf); - - //mbuf_dump(mbuf); - - struct mbuf *nbuf = mbuf_get(); - if (nbuf == NULL) { - // Unable to obtain an 'mbuf'. - mbuf_put(mbuf); - return DN_ENOMEM; - } - - int n = dyn_aes_encrypt(mbuf->pos, mbuf_length(mbuf), nbuf, arg_aes_key); - if (n > 0) - count += n; - - mbuf_put(mbuf); - //mbuf_dump(nbuf); - if (STAILQ_EMPTY(&mhdr_tem)) { - STAILQ_INSERT_HEAD(&mhdr_tem, nbuf, next); - } else { - STAILQ_INSERT_TAIL(&mhdr_tem, nbuf, next); - } - } - - while (!STAILQ_EMPTY(&mhdr_tem)) { - mbuf = STAILQ_FIRST(&mhdr_tem); - //STAILQ_REMOVE_HEAD(&mhdr_tem, next); - mbuf_remove(&mhdr_tem, mbuf); - - if (STAILQ_EMPTY(&msg->mhdr)) { - STAILQ_INSERT_HEAD(&msg->mhdr, mbuf, next); - } else { - STAILQ_INSERT_TAIL(&msg->mhdr, mbuf, next); - } +rstatus_t dyn_aes_encrypt_msg(struct msg *msg, unsigned char *arg_aes_key, + size_t *outlen) { + struct mhdr mhdr_tem; + int count = 0; + + if (STAILQ_EMPTY(&msg->mhdr)) { + // 'msg' is empty. Nothing to encrypt. 
+ return DN_ERROR; + } + + STAILQ_INIT(&mhdr_tem); + + struct mbuf *mbuf; + while (!STAILQ_EMPTY(&msg->mhdr)) { + mbuf = STAILQ_FIRST(&msg->mhdr); + // STAILQ_REMOVE_HEAD(&msg->mhdr, next); + mbuf_remove(&msg->mhdr, mbuf); + + // mbuf_dump(mbuf); + + struct mbuf *nbuf = mbuf_get(); + if (nbuf == NULL) { + // Unable to obtain an 'mbuf'. + mbuf_put(mbuf); + return DN_ENOMEM; } - *outlen = count; - return DN_OK; -} + int n = dyn_aes_encrypt(mbuf->pos, mbuf_length(mbuf), nbuf, arg_aes_key); + if (n > 0) count += n; - -rstatus_t -aes_decrypt(unsigned char *enc_msg, size_t enc_msg_len, unsigned char **dec_msg, unsigned char *arg_aes_key) -{ - size_t dec_len = 0; - size_t block_len = 0; - - *dec_msg = (unsigned char*) malloc(enc_msg_len); - if(*dec_msg == NULL) - return DN_ERROR; - - //if(!EVP_DecryptInit_ex(aes_decrypt_ctx, aes_cipher, NULL, arg_aes_key, aes_iv)) { - if(!EVP_DecryptInit_ex(aes_decrypt_ctx, aes_cipher, NULL, arg_aes_key, arg_aes_key)) { - log_debug(LOG_VERB, "This is bad data in EVP_DecryptInit_ex : '%.*s'", enc_msg_len, enc_msg); - return DN_ERROR; + mbuf_put(mbuf); + // mbuf_dump(nbuf); + if (STAILQ_EMPTY(&mhdr_tem)) { + STAILQ_INSERT_HEAD(&mhdr_tem, nbuf, next); + } else { + STAILQ_INSERT_TAIL(&mhdr_tem, nbuf, next); } + } - if(!EVP_DecryptUpdate(aes_decrypt_ctx, (unsigned char*) *dec_msg, (int*) &block_len, enc_msg, (int) enc_msg_len)) { - log_debug(LOG_VERB, "This is bad data in EVP_DecryptUpdate : '%.*s'", enc_msg_len, enc_msg); - return DN_ERROR; - } - dec_len += block_len; + while (!STAILQ_EMPTY(&mhdr_tem)) { + mbuf = STAILQ_FIRST(&mhdr_tem); + // STAILQ_REMOVE_HEAD(&mhdr_tem, next); + mbuf_remove(&mhdr_tem, mbuf); - if(!EVP_DecryptFinal_ex(aes_decrypt_ctx, (unsigned char*) *dec_msg + dec_len, (int*) &block_len)) { - log_debug(LOG_VERB, "This is bad data in EVP_DecryptFinal_ex : '%.*s'", enc_msg_len, enc_msg); - return DN_ERROR; + if (STAILQ_EMPTY(&msg->mhdr)) { + STAILQ_INSERT_HEAD(&msg->mhdr, mbuf, next); + } else { + 
STAILQ_INSERT_TAIL(&msg->mhdr, mbuf, next); } - dec_len += block_len; + } - //EVP_CIPHER_CTX_cleanup(aesDecryptCtx); - - return (int)dec_len; + *outlen = count; + return DN_OK; } - -unsigned char* generate_aes_key(void) -{ - if(RAND_bytes(aes_key, AES_KEYLEN) == 0) { - return NULL; - } - aes_key[AES_KEYLEN] = '\0'; - - return aes_key; +rstatus_t aes_decrypt(unsigned char *enc_msg, size_t enc_msg_len, + unsigned char **dec_msg, unsigned char *arg_aes_key) { + size_t dec_len = 0; + size_t block_len = 0; + + *dec_msg = (unsigned char *)malloc(enc_msg_len); + if (*dec_msg == NULL) return DN_ERROR; + + // if(!EVP_DecryptInit_ex(aes_decrypt_ctx, aes_cipher, NULL, arg_aes_key, + // aes_iv)) { + if (!EVP_DecryptInit_ex(aes_decrypt_ctx, aes_cipher, NULL, arg_aes_key, + arg_aes_key)) { + log_debug(LOG_VERB, "This is bad data in EVP_DecryptInit_ex : '%.*s'", + enc_msg_len, enc_msg); + return DN_ERROR; + } + + if (!EVP_DecryptUpdate(aes_decrypt_ctx, (unsigned char *)*dec_msg, + (int *)&block_len, enc_msg, (int)enc_msg_len)) { + log_debug(LOG_VERB, "This is bad data in EVP_DecryptUpdate : '%.*s'", + enc_msg_len, enc_msg); + return DN_ERROR; + } + dec_len += block_len; + + if (!EVP_DecryptFinal_ex(aes_decrypt_ctx, (unsigned char *)*dec_msg + dec_len, + (int *)&block_len)) { + log_debug(LOG_VERB, "This is bad data in EVP_DecryptFinal_ex : '%.*s'", + enc_msg_len, enc_msg); + return DN_ERROR; + } + dec_len += block_len; + + // EVP_CIPHER_CTX_cleanup(aesDecryptCtx); + + return (int)dec_len; } +unsigned char *generate_aes_key(void) { + if (RAND_bytes(aes_key, AES_KEYLEN) == 0) { + return NULL; + } + aes_key[AES_KEYLEN] = '\0'; -int dyn_rsa_size(void) { - return rsa_size; + return aes_key; } -rstatus_t -dyn_rsa_encrypt(unsigned char *plain_msg, unsigned char *encrypted_buf) -{ - if(RSA_public_encrypt(AES_KEYLEN, plain_msg, encrypted_buf, rsa, RSA_PKCS1_OAEP_PADDING) != RSA_size(rsa)) { +int dyn_rsa_size(void) { return rsa_size; } + +rstatus_t dyn_rsa_encrypt(unsigned char *plain_msg, 
+ unsigned char *encrypted_buf) { + if (RSA_public_encrypt(AES_KEYLEN, plain_msg, encrypted_buf, rsa, + RSA_PKCS1_OAEP_PADDING) != RSA_size(rsa)) { #if OPENSSL_VERSION_NUMBER < 0x10100000L - ERR_load_crypto_strings(); + ERR_load_crypto_strings(); #endif - char err[130]; - ERR_error_string(ERR_get_error(), err); - log_debug(LOG_VERB, "Error in encrypting message: %s\n", err); - return DN_ERROR; - } - return RSA_size(rsa); + char err[130]; + ERR_error_string(ERR_get_error(), err); + log_debug(LOG_VERB, "Error in encrypting message: %s\n", err); + return DN_ERROR; + } + return RSA_size(rsa); } -rstatus_t -dyn_rsa_decrypt(unsigned char *encrypted_msg, unsigned char *decrypted_buf) -{ - if(RSA_private_decrypt(RSA_size(rsa), - encrypted_msg, - decrypted_buf, - rsa, RSA_PKCS1_OAEP_PADDING) != AES_KEYLEN) { +rstatus_t dyn_rsa_decrypt(unsigned char *encrypted_msg, + unsigned char *decrypted_buf) { + if (RSA_private_decrypt(RSA_size(rsa), encrypted_msg, decrypted_buf, rsa, + RSA_PKCS1_OAEP_PADDING) != AES_KEYLEN) { #if OPENSSL_VERSION_NUMBER < 0x10100000L - ERR_load_crypto_strings(); + ERR_load_crypto_strings(); #endif - char err[130]; - ERR_error_string(ERR_get_error(), err); - log_debug(LOG_VERB, "Error in decrypting message: %s\n", err); - return DN_ERROR; - } + char err[130]; + ERR_error_string(ERR_get_error(), err); + log_debug(LOG_VERB, "Error in decrypting message: %s\n", err); + return DN_ERROR; + } - return AES_KEYLEN; + return AES_KEYLEN; } - diff --git a/src/dyn_crypto.h b/src/dyn_crypto.h index a0c7c9e65..b76f90f21 100644 --- a/src/dyn_crypto.h +++ b/src/dyn_crypto.h @@ -1,54 +1,59 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. 
*/ #ifndef DYN_CRYPTO_H_ #define DYN_CRYPTO_H_ - +#include #include #include -#include #include #include -#include -#include #include #include +#include +#include #include - -#include "dyn_core.h" - +#include "dyn_types.h" #define AES_KEYLEN 32 +// Forward declarations +struct mbuf; +struct msg; +struct server_pool; rstatus_t crypto_init(struct server_pool *sp); rstatus_t crypto_deinit(void); -char* base64_encode(const unsigned char *message, const size_t length); +char *base64_encode(const unsigned char *message, const size_t length); -rstatus_t aes_encrypt(const unsigned char *msg, size_t msgLen, unsigned char **encMsg, unsigned char *aes_key); -rstatus_t aes_decrypt(unsigned char *encMsg, size_t encMsgLen, unsigned char **decMsg, unsigned char *aes_key); +rstatus_t aes_encrypt(const unsigned char *msg, size_t msgLen, + unsigned char **encMsg, unsigned char *aes_key); +rstatus_t aes_decrypt(unsigned char *encMsg, size_t encMsgLen, + unsigned char **decMsg, unsigned char *aes_key); rstatus_t dyn_aes_encrypt(const unsigned char *msg, size_t msgLen, - struct mbuf *mbuf, unsigned char *aes_key); + struct mbuf *mbuf, unsigned char *aes_key); rstatus_t dyn_aes_decrypt(unsigned char *encMsg, size_t encMsgLen, - struct mbuf *mbuf, unsigned char *aes_key); + struct mbuf *mbuf, unsigned char *aes_key); -rstatus_t dyn_aes_encrypt_msg(struct msg *msg, unsigned char *aes_key, size_t* outlen); -unsigned char* generate_aes_key(void); +rstatus_t dyn_aes_encrypt_msg(struct msg *msg, unsigned char *aes_key, + size_t *outlen); +unsigned char *generate_aes_key(void); int dyn_rsa_size(void); -rstatus_t dyn_rsa_encrypt(unsigned char *plain_msg, unsigned char *encrypted_buf); - -rstatus_t dyn_rsa_decrypt(unsigned char *encrypted_msg, unsigned char *decrypted_buf); +rstatus_t dyn_rsa_encrypt(unsigned char *plain_msg, + unsigned char *encrypted_buf); +rstatus_t dyn_rsa_decrypt(unsigned char *encrypted_msg, + unsigned char *decrypted_buf); #endif /* DYN_CRYPTO_H_ */ diff --git 
a/src/dyn_dict.c b/src/dyn_dict.c index 6598ea2df..339e4d5f7 100644 --- a/src/dyn_dict.c +++ b/src/dyn_dict.c @@ -35,16 +35,16 @@ //#include "fmacros.h" +#include +#include +#include #include #include #include -#include -#include #include -#include -#include "dyn_dict.h" #include "dyn_core.h" +#include "dyn_dict.h" /* Using dictEnableResize() / dictDisableResize() we make possible to * enable/disable resizing of the hash table as needed. This is very important @@ -88,13 +88,9 @@ dictIdentityHashFunction(unsigned int key) static uint32_t dict_hash_function_seed = 5381; -void dictSetHashFunctionSeed(uint32_t seed) { - dict_hash_function_seed = seed; -} +void dictSetHashFunctionSeed(uint32_t seed) { dict_hash_function_seed = seed; } -uint32_t dictGetHashFunctionSeed(void) { - return dict_hash_function_seed; -} +uint32_t dictGetHashFunctionSeed(void) { return dict_hash_function_seed; } /* MurmurHash2, by Austin Appleby * Note - This code makes a few assumptions about how your machine behaves - @@ -108,133 +104,128 @@ uint32_t dictGetHashFunctionSeed(void) { * machines. */ unsigned int dictGenHashFunction(const void *key, uint32_t len) { - /* 'm' and 'r' are mixing constants generated offline. - They're not really 'magic', they just happen to work well. */ - uint32_t seed = dict_hash_function_seed; - const uint32_t m = 0x5bd1e995; - const int r = 24; - - /* Initialize the hash to a 'random' value */ - uint32_t h = seed ^ len; - - /* Mix 4 bytes at a time into the hash */ - const unsigned char *data = (const unsigned char *)key; + /* 'm' and 'r' are mixing constants generated offline. + They're not really 'magic', they just happen to work well. 
*/ + uint32_t seed = dict_hash_function_seed; + const uint32_t m = 0x5bd1e995; + const int r = 24; - while(len >= 4) { - uint32_t k = *(uint32_t*)data; + /* Initialize the hash to a 'random' value */ + uint32_t h = seed ^ len; - k *= m; - k ^= k >> r; - k *= m; + /* Mix 4 bytes at a time into the hash */ + const unsigned char *data = (const unsigned char *)key; - h *= m; - h ^= k; + while (len >= 4) { + uint32_t k = *(uint32_t *)data; - data += 4; - len -= 4; - } - - /* Handle the last few bytes of the input array */ - switch(len) { - case 3: h ^= data[2] << 16; - case 2: h ^= data[1] << 8; - case 1: h ^= data[0]; h *= m; - }; + k *= m; + k ^= k >> r; + k *= m; - /* Do a few final mixes of the hash to ensure the last few - * bytes are well-incorporated. */ - h ^= h >> 13; h *= m; - h ^= h >> 15; - - return (unsigned int)h; + h ^= k; + + data += 4; + len -= 4; + } + + /* Handle the last few bytes of the input array */ + switch (len) { + case 3: + h ^= data[2] << 16; + case 2: + h ^= data[1] << 8; + case 1: + h ^= data[0]; + h *= m; + }; + + /* Do a few final mixes of the hash to ensure the last few + * bytes are well-incorporated. */ + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return (unsigned int)h; } /* And a case insensitive hash function (based on djb hash) */ unsigned int dictGenCaseHashFunction(const unsigned char *buf, uint32_t len) { - unsigned int hash = (unsigned int)dict_hash_function_seed; + unsigned int hash = (unsigned int)dict_hash_function_seed; - while (len--) - hash = ((hash << 5) + hash) + (tolower(*buf++)); /* hash * 33 + c */ - return hash; + while (len--) + hash = ((hash << 5) + hash) + (tolower(*buf++)); /* hash * 33 + c */ + return hash; } /* ----------------------------- API implementation ------------------------- */ /* Reset a hash table already initialized with ht_init(). * NOTE: This function should only be called by ht_destroy(). 
*/ -static void _dictReset(dictht *ht) -{ - ht->table = NULL; - ht->size = 0; - ht->sizemask = 0; - ht->used = 0; +static void _dictReset(dictht *ht) { + ht->table = NULL; + ht->size = 0; + ht->sizemask = 0; + ht->used = 0; } /* Create a new hash table */ -dict *dictCreate(dictType *type, - void *privDataPtr) -{ - dict *d = dn_zalloc(sizeof(*d)); +dict *dictCreate(dictType *type, void *privDataPtr) { + dict *d = dn_zalloc(sizeof(*d)); - _dictInit(d,type,privDataPtr); - return d; + _dictInit(d, type, privDataPtr); + return d; } /* Initialize the hash table */ -int _dictInit(dict *d, dictType *type, - void *privDataPtr) -{ - _dictReset(&d->ht[0]); - _dictReset(&d->ht[1]); - d->type = type; - d->privdata = privDataPtr; - d->rehashidx = -1; - d->iterators = 0; - return DICT_OK; +int _dictInit(dict *d, dictType *type, void *privDataPtr) { + _dictReset(&d->ht[0]); + _dictReset(&d->ht[1]); + d->type = type; + d->privdata = privDataPtr; + d->rehashidx = -1; + d->iterators = 0; + return DICT_OK; } /* Resize the table to the minimal size that contains all the elements, * but with the invariant of a USED/BUCKETS ratio near to <= 1 */ -int dictResize(dict *d) -{ - int minimal; +int dictResize(dict *d) { + int minimal; - if (!dict_can_resize || dictIsRehashing(d)) return DICT_ERR; - minimal = d->ht[0].used; - if (minimal < DICT_HT_INITIAL_SIZE) - minimal = DICT_HT_INITIAL_SIZE; - return dictExpand(d, minimal); + if (!dict_can_resize || dictIsRehashing(d)) return DICT_ERR; + minimal = d->ht[0].used; + if (minimal < DICT_HT_INITIAL_SIZE) minimal = DICT_HT_INITIAL_SIZE; + return dictExpand(d, minimal); } /* Expand or create the hash table */ -int dictExpand(dict *d, unsigned long size) -{ - dictht n; /* the new hash table */ - unsigned long realsize = _dictNextPower(size); - - /* the size is invalid if it is smaller than the number of - * elements already inside the hash table */ - if (dictIsRehashing(d) || d->ht[0].used > size) - return DICT_ERR; - - /* Allocate the new hash 
table and initialize all pointers to NULL */ - n.size = realsize; - n.sizemask = realsize-1; - n.table = dn_zalloc(realsize*sizeof(dictEntry*)); - n.used = 0; - - /* Is this the first initialization? If so it's not really a rehashing - * we just set the first hash table so that it can accept keys. */ - if (d->ht[0].table == NULL) { - d->ht[0] = n; - return DICT_OK; - } - - /* Prepare a second hash table for incremental rehashing */ - d->ht[1] = n; - d->rehashidx = 0; +int dictExpand(dict *d, unsigned long size) { + dictht n; /* the new hash table */ + unsigned long realsize = _dictNextPower(size); + + /* the size is invalid if it is smaller than the number of + * elements already inside the hash table */ + if (dictIsRehashing(d) || d->ht[0].used > size) return DICT_ERR; + + /* Allocate the new hash table and initialize all pointers to NULL */ + n.size = realsize; + n.sizemask = realsize - 1; + n.table = dn_zalloc(realsize * sizeof(dictEntry *)); + n.used = 0; + + /* Is this the first initialization? If so it's not really a rehashing + * we just set the first hash table so that it can accept keys. */ + if (d->ht[0].table == NULL) { + d->ht[0] = n; return DICT_OK; + } + + /* Prepare a second hash table for incremental rehashing */ + d->ht[1] = n; + d->rehashidx = 0; + return DICT_OK; } /* Performs N steps of incremental rehashing. Returns 1 if there are still @@ -242,43 +233,42 @@ int dictExpand(dict *d, unsigned long size) * Note that a rehashing step consists in moving a bucket (that may have more * than one key as we use chaining) from the old to the new hash table. */ int dictRehash(dict *d, int n) { - if (!dictIsRehashing(d)) return 0; - - while(n--) { - dictEntry *de, *nextde; - - /* Check if we already rehashed the whole table... 
*/ - if (d->ht[0].used == 0) { - if (d->ht[0].table) - dn_free(d->ht[0].table); - d->ht[0] = d->ht[1]; - _dictReset(&d->ht[1]); - d->rehashidx = -1; - return 0; - } + if (!dictIsRehashing(d)) return 0; + + while (n--) { + dictEntry *de, *nextde; + + /* Check if we already rehashed the whole table... */ + if (d->ht[0].used == 0) { + if (d->ht[0].table) dn_free(d->ht[0].table); + d->ht[0] = d->ht[1]; + _dictReset(&d->ht[1]); + d->rehashidx = -1; + return 0; + } - /* Note that rehashidx can't overflow as we are sure there are more - * elements because ht[0].used != 0 */ - ASSERT(d->ht[0].size > (unsigned)d->rehashidx); - while(d->ht[0].table[d->rehashidx] == NULL) d->rehashidx++; - de = d->ht[0].table[d->rehashidx]; - /* Move all the keys in this bucket from the old to the new hash HT */ - while(de) { - unsigned int h; - - nextde = de->next; - /* Get the index in the new hash table */ - h = dictHashKey(d, de->key) & d->ht[1].sizemask; - de->next = d->ht[1].table[h]; - d->ht[1].table[h] = de; - d->ht[0].used--; - d->ht[1].used++; - de = nextde; - } - d->ht[0].table[d->rehashidx] = NULL; - d->rehashidx++; + /* Note that rehashidx can't overflow as we are sure there are more + * elements because ht[0].used != 0 */ + ASSERT(d->ht[0].size > (unsigned)d->rehashidx); + while (d->ht[0].table[d->rehashidx] == NULL) d->rehashidx++; + de = d->ht[0].table[d->rehashidx]; + /* Move all the keys in this bucket from the old to the new hash HT */ + while (de) { + unsigned int h; + + nextde = de->next; + /* Get the index in the new hash table */ + h = dictHashKey(d, de->key) & d->ht[1].sizemask; + de->next = d->ht[1].table[h]; + d->ht[1].table[h] = de; + d->ht[0].used--; + d->ht[1].used++; + de = nextde; } - return 1; + d->ht[0].table[d->rehashidx] = NULL; + d->rehashidx++; + } + return 1; } /*static long long @@ -313,17 +303,16 @@ dictRehashMilliseconds(dict *d, int ms) * dictionary so that the hash table automatically migrates from H1 to H2 * while it is actively used. 
*/ static void _dictRehashStep(dict *d) { - if (d->iterators == 0) dictRehash(d,1); + if (d->iterators == 0) dictRehash(d, 1); } /* Add an element to the target hash table */ -int dictAdd(dict *d, void *key, void *val) -{ - dictEntry *entry = dictAddRaw(d,key); +int dictAdd(dict *d, void *key, void *val) { + dictEntry *entry = dictAddRaw(d, key); - if (!entry) return DICT_ERR; - dictSetVal(d, entry, val); - return DICT_OK; + if (!entry) return DICT_ERR; + dictSetVal(d, entry, val); + return DICT_OK; } /* Low level add. This function adds the entry but instead of setting @@ -341,54 +330,50 @@ int dictAdd(dict *d, void *key, void *val) * If key already exists NULL is returned. * If key was added, the hash entry is returned to be manipulated by the caller. */ -dictEntry *dictAddRaw(dict *d, void *key) -{ - int index; - dictEntry *entry; - dictht *ht; - - if (dictIsRehashing(d)) _dictRehashStep(d); - - /* Get the index of the new element, or -1 if - * the element already exists. */ - if ((index = _dictKeyIndex(d, key)) == -1) - return NULL; - - /* Allocate the memory and store the new entry */ - ht = dictIsRehashing(d) ? &d->ht[1] : &d->ht[0]; - entry = dn_zalloc(sizeof(*entry)); - entry->next = ht->table[index]; - ht->table[index] = entry; - ht->used++; - - /* Set the hash entry fields. */ - dictSetKey(d, entry, key); - return entry; +dictEntry *dictAddRaw(dict *d, void *key) { + int index; + dictEntry *entry; + dictht *ht; + + if (dictIsRehashing(d)) _dictRehashStep(d); + + /* Get the index of the new element, or -1 if + * the element already exists. */ + if ((index = _dictKeyIndex(d, key)) == -1) return NULL; + + /* Allocate the memory and store the new entry */ + ht = dictIsRehashing(d) ? &d->ht[1] : &d->ht[0]; + entry = dn_zalloc(sizeof(*entry)); + entry->next = ht->table[index]; + ht->table[index] = entry; + ht->used++; + + /* Set the hash entry fields. 
*/ + dictSetKey(d, entry, key); + return entry; } /* Add an element, discarding the old if the key already exists. * Return 1 if the key was added from scratch, 0 if there was already an * element with such key and dictReplace() just performed a value update * operation. */ -int dictReplace(dict *d, void *key, void *val) -{ - dictEntry *entry, auxentry; - - /* Try to add the element. If the key - * does not exists dictAdd will succeed. */ - if (dictAdd(d, key, val) == DICT_OK) - return 1; - /* It already exists, get the entry */ - entry = dictFind(d, key); - /* Set the new value and free the old one. Note that it is important - * to do that in this order, as the value may just be exactly the same - * as the previous one. In this context, think to reference counting, - * you want to increment (set), and then decrement (free), and not the - * reverse. */ - auxentry = *entry; - dictSetVal(d, entry, val); - dictFreeVal(d, &auxentry); - return 0; +int dictReplace(dict *d, void *key, void *val) { + dictEntry *entry, auxentry; + + /* Try to add the element. If the key + * does not exists dictAdd will succeed. */ + if (dictAdd(d, key, val) == DICT_OK) return 1; + /* It already exists, get the entry */ + entry = dictFind(d, key); + /* Set the new value and free the old one. Note that it is important + * to do that in this order, as the value may just be exactly the same + * as the previous one. In this context, think to reference counting, + * you want to increment (set), and then decrement (free), and not the + * reverse. */ + auxentry = *entry; + dictSetVal(d, entry, val); + dictFreeVal(d, &auxentry); + return 0; } /* dictReplaceRaw() is simply a version of dictAddRaw() that always @@ -398,271 +383,258 @@ int dictReplace(dict *d, void *key, void *val) * * See dictAddRaw() for more information. */ dictEntry *dictReplaceRaw(dict *d, void *key) { - dictEntry *entry = dictFind(d,key); + dictEntry *entry = dictFind(d, key); - return entry ? 
entry : dictAddRaw(d,key); + return entry ? entry : dictAddRaw(d, key); } /* Search and remove an element */ -static int dictGenericDelete(dict *d, const void *key, int nofree) -{ - unsigned int h, idx; - dictEntry *he, *prevHe; - int table; - - if (d->ht[0].size == 0) return DICT_ERR; /* d->ht[0].table is NULL */ - if (dictIsRehashing(d)) _dictRehashStep(d); - h = dictHashKey(d, key); - - for (table = 0; table <= 1; table++) { - idx = h & d->ht[table].sizemask; - he = d->ht[table].table[idx]; - prevHe = NULL; - while(he) { - if (dictCompareKeys(d, key, he->key)) { - /* Unlink the element from the list */ - if (prevHe) - prevHe->next = he->next; - else - d->ht[table].table[idx] = he->next; - if (!nofree) { - dictFreeKey(d, he); - dictFreeVal(d, he); - } - //zfree(he); - dn_free(he); - d->ht[table].used--; - return DICT_OK; - } - prevHe = he; - he = he->next; +static int dictGenericDelete(dict *d, const void *key, int nofree) { + unsigned int h, idx; + dictEntry *he, *prevHe; + int table; + + if (d->ht[0].size == 0) return DICT_ERR; /* d->ht[0].table is NULL */ + if (dictIsRehashing(d)) _dictRehashStep(d); + h = dictHashKey(d, key); + + for (table = 0; table <= 1; table++) { + idx = h & d->ht[table].sizemask; + he = d->ht[table].table[idx]; + prevHe = NULL; + while (he) { + if (dictCompareKeys(d, key, he->key)) { + /* Unlink the element from the list */ + if (prevHe) + prevHe->next = he->next; + else + d->ht[table].table[idx] = he->next; + if (!nofree) { + dictFreeKey(d, he); + dictFreeVal(d, he); } - if (!dictIsRehashing(d)) break; + // zfree(he); + dn_free(he); + d->ht[table].used--; + return DICT_OK; + } + prevHe = he; + he = he->next; } - return DICT_ERR; /* not found */ + if (!dictIsRehashing(d)) break; + } + return DICT_ERR; /* not found */ } int dictDelete(dict *ht, const void *key) { - return dictGenericDelete(ht,key,0); + return dictGenericDelete(ht, key, 0); } int dictDeleteNoFree(dict *ht, const void *key) { - return dictGenericDelete(ht,key,1); + return 
dictGenericDelete(ht, key, 1); } /* Destroy an entire dictionary */ -static int -_dictClear(dict *d, dictht *ht, void(callback)(void *)) -{ - unsigned long i; - - /* Free all the elements */ - for (i = 0; i < ht->size && ht->used > 0; i++) { - dictEntry *he, *nextHe; - - if (callback && (i & 65535) == 0) callback(d->privdata); - - if ((he = ht->table[i]) == NULL) continue; - while(he) { - nextHe = he->next; - dictFreeKey(d, he); - dictFreeVal(d, he); - //zfree(he); - dn_free(he); - ht->used--; - he = nextHe; - } +static int _dictClear(dict *d, dictht *ht, void(callback)(void *)) { + unsigned long i; + + /* Free all the elements */ + for (i = 0; i < ht->size && ht->used > 0; i++) { + dictEntry *he, *nextHe; + + if (callback && (i & 65535) == 0) callback(d->privdata); + + if ((he = ht->table[i]) == NULL) continue; + while (he) { + nextHe = he->next; + dictFreeKey(d, he); + dictFreeVal(d, he); + // zfree(he); + dn_free(he); + ht->used--; + he = nextHe; } - /* Free the table and the allocated cache structure */ - //zfree(ht->table); - if (ht->size && ht->table) - dn_free(ht->table); - /* Re-initialize the table */ - _dictReset(ht); - return DICT_OK; /* never fails */ + } + /* Free the table and the allocated cache structure */ + // zfree(ht->table); + if (ht->size && ht->table) dn_free(ht->table); + /* Re-initialize the table */ + _dictReset(ht); + return DICT_OK; /* never fails */ } /* Clear & Release the hash table */ -void dictRelease(dict *d) -{ - _dictClear(d,&d->ht[0],NULL); - _dictClear(d,&d->ht[1],NULL); - //zfree(d); - dn_free(d); +void dictRelease(dict *d) { + _dictClear(d, &d->ht[0], NULL); + _dictClear(d, &d->ht[1], NULL); + // zfree(d); + dn_free(d); } -dictEntry *dictFind(dict *d, const void *key) -{ - dictEntry *he; - unsigned int h, idx, table; - - if (d->ht[0].size == 0) return NULL; /* We don't have a table at all */ - if (dictIsRehashing(d)) _dictRehashStep(d); - h = dictHashKey(d, key); - for (table = 0; table <= 1; table++) { - idx = h & 
d->ht[table].sizemask; - he = d->ht[table].table[idx]; - while(he) { - if (dictCompareKeys(d, key, he->key)) - return he; - he = he->next; - } - if (!dictIsRehashing(d)) return NULL; +dictEntry *dictFind(dict *d, const void *key) { + dictEntry *he; + unsigned int h, idx, table; + + if (d->ht[0].size == 0) return NULL; /* We don't have a table at all */ + if (dictIsRehashing(d)) _dictRehashStep(d); + h = dictHashKey(d, key); + for (table = 0; table <= 1; table++) { + idx = h & d->ht[table].sizemask; + he = d->ht[table].table[idx]; + while (he) { + if (dictCompareKeys(d, key, he->key)) return he; + he = he->next; } - return NULL; + if (!dictIsRehashing(d)) return NULL; + } + return NULL; } void *dictFetchValue(dict *d, const void *key) { - dictEntry *he; + dictEntry *he; - he = dictFind(d,key); - return he ? dictGetVal(he) : NULL; + he = dictFind(d, key); + return he ? dictGetVal(he) : NULL; } /* A fingerprint is a 64 bit number that represents the state of the dictionary * at a given time, it's just a few dict properties xored together. - * When an unsafe iterator is initialized, we get the dict fingerprint, and check - * the fingerprint again when the iterator is released. - * If the two fingerprints are different it means that the user of the iterator - * performed forbidden operations against the dictionary while iterating. */ -long long -dictFingerprint(dict *d) -{ - long long integers[6], hash = 0; - int j; - - integers[0] = (long) d->ht[0].table; - integers[1] = d->ht[0].size; - integers[2] = d->ht[0].used; - integers[3] = (long) d->ht[1].table; - integers[4] = d->ht[1].size; - integers[5] = d->ht[1].used; - - /* We hash N integers by summing every successive integer with the integer - * hashing of the previous sum. Basically: - * - * Result = hash(hash(hash(int1)+int2)+int3) ... - * - * This way the same set of integers in a different order will (likely) hash - * to a different number. 
*/ - for (j = 0; j < 6; j++) { - hash += integers[j]; - /* For the hashing step we use Tomas Wang's 64 bit integer hash. */ - hash = (~hash) + (hash << 21); // hash = (hash << 21) - hash - 1; - hash = hash ^ (hash >> 24); - hash = (hash + (hash << 3)) + (hash << 8); // hash * 265 - hash = hash ^ (hash >> 14); - hash = (hash + (hash << 2)) + (hash << 4); // hash * 21 - hash = hash ^ (hash >> 28); - hash = hash + (hash << 31); - } - return hash; + * When an unsafe iterator is initialized, we get the dict fingerprint, and + * check the fingerprint again when the iterator is released. If the two + * fingerprints are different it means that the user of the iterator performed + * forbidden operations against the dictionary while iterating. */ +long long dictFingerprint(dict *d) { + long long integers[6], hash = 0; + int j; + + integers[0] = (long)d->ht[0].table; + integers[1] = d->ht[0].size; + integers[2] = d->ht[0].used; + integers[3] = (long)d->ht[1].table; + integers[4] = d->ht[1].size; + integers[5] = d->ht[1].used; + + /* We hash N integers by summing every successive integer with the integer + * hashing of the previous sum. Basically: + * + * Result = hash(hash(hash(int1)+int2)+int3) ... + * + * This way the same set of integers in a different order will (likely) hash + * to a different number. */ + for (j = 0; j < 6; j++) { + hash += integers[j]; + /* For the hashing step we use Tomas Wang's 64 bit integer hash. 
*/ + hash = (~hash) + (hash << 21); // hash = (hash << 21) - hash - 1; + hash = hash ^ (hash >> 24); + hash = (hash + (hash << 3)) + (hash << 8); // hash * 265 + hash = hash ^ (hash >> 14); + hash = (hash + (hash << 2)) + (hash << 4); // hash * 21 + hash = hash ^ (hash >> 28); + hash = hash + (hash << 31); + } + return hash; } -dictIterator *dictGetIterator(dict *d) -{ - dictIterator *iter = dn_zalloc(sizeof(*iter)); - - iter->d = d; - iter->table = 0; - iter->index = -1; - iter->safe = 0; - iter->entry = NULL; - iter->nextEntry = NULL; - return iter; +dictIterator *dictGetIterator(dict *d) { + dictIterator *iter = dn_zalloc(sizeof(*iter)); + + iter->d = d; + iter->table = 0; + iter->index = -1; + iter->safe = 0; + iter->entry = NULL; + iter->nextEntry = NULL; + return iter; } dictIterator *dictGetSafeIterator(dict *d) { - dictIterator *i = dictGetIterator(d); + dictIterator *i = dictGetIterator(d); - i->safe = 1; - return i; + i->safe = 1; + return i; } -dictEntry *dictNext(dictIterator *iter) -{ - while (1) { - if (iter->entry == NULL) { - dictht *ht = &iter->d->ht[iter->table]; - if (iter->index == -1 && iter->table == 0) { - if (iter->safe) - iter->d->iterators++; - else - iter->fingerprint = dictFingerprint(iter->d); - } - iter->index++; - if (iter->index >= (signed) ht->size) { - if (dictIsRehashing(iter->d) && iter->table == 0) { - iter->table++; - iter->index = 0; - ht = &iter->d->ht[1]; - } else { - break; - } - } - iter->entry = ht->table[iter->index]; +dictEntry *dictNext(dictIterator *iter) { + while (1) { + if (iter->entry == NULL) { + dictht *ht = &iter->d->ht[iter->table]; + if (iter->index == -1 && iter->table == 0) { + if (iter->safe) + iter->d->iterators++; + else + iter->fingerprint = dictFingerprint(iter->d); + } + iter->index++; + if (iter->index >= (signed)ht->size) { + if (dictIsRehashing(iter->d) && iter->table == 0) { + iter->table++; + iter->index = 0; + ht = &iter->d->ht[1]; } else { - iter->entry = iter->nextEntry; - } - if (iter->entry) 
{ - /* We need to save the 'next' here, the iterator user - * may delete the entry we are returning. */ - iter->nextEntry = iter->entry->next; - return iter->entry; + break; } + } + iter->entry = ht->table[iter->index]; + } else { + iter->entry = iter->nextEntry; } - return NULL; + if (iter->entry) { + /* We need to save the 'next' here, the iterator user + * may delete the entry we are returning. */ + iter->nextEntry = iter->entry->next; + return iter->entry; + } + } + return NULL; } -void dictReleaseIterator(dictIterator *iter) -{ - if (!(iter->index == -1 && iter->table == 0)) { - if (iter->safe) - iter->d->iterators--; - else - ASSERT(iter->fingerprint == dictFingerprint(iter->d)); - } - //zfree(iter); - dn_free(iter); +void dictReleaseIterator(dictIterator *iter) { + if (!(iter->index == -1 && iter->table == 0)) { + if (iter->safe) + iter->d->iterators--; + else + ASSERT(iter->fingerprint == dictFingerprint(iter->d)); + } + // zfree(iter); + dn_free(iter); } /* Return a random entry from the hash table. Useful to * implement randomized algorithms */ -dictEntry *dictGetRandomKey(dict *d) -{ - dictEntry *he, *orighe; - unsigned int h; - int listlen, listele; - - if (dictSize(d) == 0) return NULL; - if (dictIsRehashing(d)) _dictRehashStep(d); - if (dictIsRehashing(d)) { - do { - h = random() % (d->ht[0].size+d->ht[1].size); - he = (h >= d->ht[0].size) ? d->ht[1].table[h - d->ht[0].size] : - d->ht[0].table[h]; - } while(he == NULL); - } else { - do { - h = random() & d->ht[0].sizemask; - he = d->ht[0].table[h]; - } while(he == NULL); - } - - /* Now we found a non empty bucket, but it is a linked - * list and we need to get a random element from the list. - * The only sane way to do so is counting the elements and - * select a random index. 
*/ - listlen = 0; - orighe = he; - while(he) { - he = he->next; - listlen++; - } - listele = random() % listlen; - he = orighe; - while(listele--) he = he->next; - return he; +dictEntry *dictGetRandomKey(dict *d) { + dictEntry *he, *orighe; + unsigned int h; + int listlen, listele; + + if (dictSize(d) == 0) return NULL; + if (dictIsRehashing(d)) _dictRehashStep(d); + if (dictIsRehashing(d)) { + do { + h = random() % (d->ht[0].size + d->ht[1].size); + he = (h >= d->ht[0].size) ? d->ht[1].table[h - d->ht[0].size] + : d->ht[0].table[h]; + } while (he == NULL); + } else { + do { + h = random() & d->ht[0].sizemask; + he = d->ht[0].table[h]; + } while (he == NULL); + } + + /* Now we found a non empty bucket, but it is a linked + * list and we need to get a random element from the list. + * The only sane way to do so is counting the elements and + * select a random index. */ + listlen = 0; + orighe = he; + while (he) { + he = he->next; + listlen++; + } + listele = random() % listlen; + he = orighe; + while (listele--) he = he->next; + return he; } /* This is a version of dictGetRandomKey() that is modified in order to @@ -685,48 +657,48 @@ dictEntry *dictGetRandomKey(dict *d) * at producing N elements, and the elements are guaranteed to be non * repeating. */ int dictGetRandomKeys(dict *d, dictEntry **des, int count) { - int j; /* internal hash table id, 0 or 1. */ - int stored = 0; - - if (dictSize(d) < count) count = dictSize(d); - while(stored < count) { - for (j = 0; j < 2; j++) { - /* Pick a random point inside the hash table 0 or 1. */ - unsigned int i = random() & d->ht[j].sizemask; - int size = d->ht[j].size; - - /* Make sure to visit every bucket by iterating 'size' times. */ - while(size--) { - dictEntry *he = d->ht[j].table[i]; - while (he) { - /* Collect all the elements of the buckets found non - * empty while iterating. 
*/ - *des = he; - des++; - he = he->next; - stored++; - if (stored == count) return stored; - } - i = (i+1) & d->ht[j].sizemask; - } - /* If there is only one table and we iterated it all, we should - * already have 'count' elements. Assert this condition. */ - ASSERT(dictIsRehashing(d) != 0); + int j; /* internal hash table id, 0 or 1. */ + int stored = 0; + + if (dictSize(d) < count) count = dictSize(d); + while (stored < count) { + for (j = 0; j < 2; j++) { + /* Pick a random point inside the hash table 0 or 1. */ + unsigned int i = random() & d->ht[j].sizemask; + int size = d->ht[j].size; + + /* Make sure to visit every bucket by iterating 'size' times. */ + while (size--) { + dictEntry *he = d->ht[j].table[i]; + while (he) { + /* Collect all the elements of the buckets found non + * empty while iterating. */ + *des = he; + des++; + he = he->next; + stored++; + if (stored == count) return stored; } + i = (i + 1) & d->ht[j].sizemask; + } + /* If there is only one table and we iterated it all, we should + * already have 'count' elements. Assert this condition. */ + ASSERT(dictIsRehashing(d) != 0); } - return stored; /* Never reached. */ + } + return stored; /* Never reached. */ } /* Function to reverse bits. Algorithm from: * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel */ static unsigned long rev(unsigned long v) { - unsigned long s = 8 * sizeof(v); // bit size; must be power of 2 - unsigned long mask = ~0; - while ((s >>= 1) > 0) { - mask ^= (mask << s); - v = ((v >> s) & mask) | ((v << s) & ~mask); - } - return v; + unsigned long s = 8 * sizeof(v); // bit size; must be power of 2 + unsigned long mask = ~0; + while ((s >>= 1) > 0) { + mask ^= (mask << s); + v = ((v >> s) & mask) | ((v << s) & ~mask); + } + return v; } /* dictScan() is used to iterate over the elements of a dictionary. 
@@ -813,112 +785,105 @@ static unsigned long rev(unsigned long v) { * 3) The reverse cursor is somewhat hard to understand at first, but this * comment is supposed to help. */ -unsigned long dictScan(dict *d, - unsigned long v, - dictScanFunction *fn, - void *privdata) -{ - dictht *t0, *t1; - const dictEntry *de; - unsigned long m0, m1; - - if (dictSize(d) == 0) return 0; - - if (!dictIsRehashing(d)) { - t0 = &(d->ht[0]); - m0 = t0->sizemask; - - /* Emit entries at cursor */ - de = t0->table[v & m0]; - while (de) { - fn(privdata, de); - de = de->next; - } - - } else { - t0 = &d->ht[0]; - t1 = &d->ht[1]; - - /* Make sure t0 is the smaller and t1 is the bigger table */ - if (t0->size > t1->size) { - t0 = &d->ht[1]; - t1 = &d->ht[0]; - } - - m0 = t0->sizemask; - m1 = t1->sizemask; +unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, + void *privdata) { + dictht *t0, *t1; + const dictEntry *de; + unsigned long m0, m1; + + if (dictSize(d) == 0) return 0; + + if (!dictIsRehashing(d)) { + t0 = &(d->ht[0]); + m0 = t0->sizemask; + + /* Emit entries at cursor */ + de = t0->table[v & m0]; + while (de) { + fn(privdata, de); + de = de->next; + } - /* Emit entries at cursor */ - de = t0->table[v & m0]; - while (de) { - fn(privdata, de); - de = de->next; - } + } else { + t0 = &d->ht[0]; + t1 = &d->ht[1]; - /* Iterate over indices in larger table that are the expansion - * of the index pointed to by the cursor in the smaller table */ - do { - /* Emit entries at cursor */ - de = t1->table[v & m1]; - while (de) { - fn(privdata, de); - de = de->next; - } - - /* Increment bits not covered by the smaller mask */ - v = (((v | m0) + 1) & ~m0) | (v & m0); - - /* Continue while bits covered by mask difference is non-zero */ - } while (v & (m0 ^ m1)); + /* Make sure t0 is the smaller and t1 is the bigger table */ + if (t0->size > t1->size) { + t0 = &d->ht[1]; + t1 = &d->ht[0]; } - /* Set unmasked bits so incrementing the reversed cursor - * operates on the masked bits of 
the smaller table */ - v |= ~m0; + m0 = t0->sizemask; + m1 = t1->sizemask; - /* Increment the reverse cursor */ - v = rev(v); - v++; - v = rev(v); + /* Emit entries at cursor */ + de = t0->table[v & m0]; + while (de) { + fn(privdata, de); + de = de->next; + } - return v; + /* Iterate over indices in larger table that are the expansion + * of the index pointed to by the cursor in the smaller table */ + do { + /* Emit entries at cursor */ + de = t1->table[v & m1]; + while (de) { + fn(privdata, de); + de = de->next; + } + + /* Increment bits not covered by the smaller mask */ + v = (((v | m0) + 1) & ~m0) | (v & m0); + + /* Continue while bits covered by mask difference is non-zero */ + } while (v & (m0 ^ m1)); + } + + /* Set unmasked bits so incrementing the reversed cursor + * operates on the masked bits of the smaller table */ + v |= ~m0; + + /* Increment the reverse cursor */ + v = rev(v); + v++; + v = rev(v); + + return v; } /* ------------------------- private functions ------------------------------ */ /* Expand the hash table if needed */ -static int _dictExpandIfNeeded(dict *d) -{ - /* Incremental rehashing already in progress. Return. */ - if (dictIsRehashing(d)) return DICT_OK; - - /* If the hash table is empty expand it to the initial size. */ - if (d->ht[0].size == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE); - - /* If we reached the 1:1 ratio, and we are allowed to resize the hash - * table (global setting) or we should avoid it but the ratio between - * elements/buckets is over the "safe" threshold, we resize doubling - * the number of buckets. */ - if (d->ht[0].used >= d->ht[0].size && - (dict_can_resize || - d->ht[0].used/d->ht[0].size > dict_force_resize_ratio)) - { - return dictExpand(d, d->ht[0].used*2); - } - return DICT_OK; +static int _dictExpandIfNeeded(dict *d) { + /* Incremental rehashing already in progress. Return. */ + if (dictIsRehashing(d)) return DICT_OK; + + /* If the hash table is empty expand it to the initial size. 
*/ + if (d->ht[0].size == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE); + + /* If we reached the 1:1 ratio, and we are allowed to resize the hash + * table (global setting) or we should avoid it but the ratio between + * elements/buckets is over the "safe" threshold, we resize doubling + * the number of buckets. */ + if (d->ht[0].used >= d->ht[0].size && + (dict_can_resize || + d->ht[0].used / d->ht[0].size > dict_force_resize_ratio)) { + return dictExpand(d, d->ht[0].used * 2); + } + return DICT_OK; } /* Our hash table capability is a power of two */ -static unsigned long _dictNextPower(unsigned long size) -{ - unsigned long i = DICT_HT_INITIAL_SIZE; - - if (size >= LONG_MAX) return LONG_MAX; - while(1) { - if (i >= size) - return i; - i *= 2; - } +static unsigned long _dictNextPower(unsigned long size) { + unsigned long i = DICT_HT_INITIAL_SIZE; + + if (size >= LONG_MAX) return LONG_MAX; + while (1) { + if (i >= size) return i; + i *= 2; + } } /* Returns the index of a free slot that can be populated with @@ -927,45 +892,38 @@ static unsigned long _dictNextPower(unsigned long size) * * Note that if we are in the process of rehashing the hash table, the * index is always returned in the context of the second (new) hash table. 
*/ -static int _dictKeyIndex(dict *d, const void *key) -{ - unsigned int h, table; - int idx; - dictEntry *he; - - /* Expand the hash table if needed */ - if (_dictExpandIfNeeded(d) == DICT_ERR) - return -1; - /* Compute the key hash value */ - h = dictHashKey(d, key); - for (table = 0; table <= 1; table++) { - idx = h & d->ht[table].sizemask; - /* Search if this slot does not already contain the given key */ - he = d->ht[table].table[idx]; - while(he) { - if (dictCompareKeys(d, key, he->key)) - return -1; - he = he->next; - } - if (!dictIsRehashing(d)) break; +static int _dictKeyIndex(dict *d, const void *key) { + unsigned int h, table; + int idx; + dictEntry *he; + + /* Expand the hash table if needed */ + if (_dictExpandIfNeeded(d) == DICT_ERR) return -1; + /* Compute the key hash value */ + h = dictHashKey(d, key); + for (table = 0; table <= 1; table++) { + idx = h & d->ht[table].sizemask; + /* Search if this slot does not already contain the given key */ + he = d->ht[table].table[idx]; + while (he) { + if (dictCompareKeys(d, key, he->key)) return -1; + he = he->next; } - return idx; + if (!dictIsRehashing(d)) break; + } + return idx; } -void dictEmpty(dict *d, void(callback)(void*)) { - _dictClear(d,&d->ht[0],callback); - _dictClear(d,&d->ht[1],callback); - d->rehashidx = -1; - d->iterators = 0; +void dictEmpty(dict *d, void(callback)(void *)) { + _dictClear(d, &d->ht[0], callback); + _dictClear(d, &d->ht[1], callback); + d->rehashidx = -1; + d->iterators = 0; } -void dictEnableResize(void) { - dict_can_resize = 1; -} +void dictEnableResize(void) { dict_can_resize = 1; } -void dictDisableResize(void) { - dict_can_resize = 0; -} +void dictDisableResize(void) { dict_can_resize = 0; } #if 0 diff --git a/src/dyn_dict.h b/src/dyn_dict.h index 5779ee82e..50a03af69 100644 --- a/src/dyn_dict.h +++ b/src/dyn_dict.h @@ -42,42 +42,42 @@ #define DICT_ERR 1 /* Unused arguments generate annoying warnings... 
*/ -#define DICT_NOTUSED(V) ((void) V) +#define DICT_NOTUSED(V) ((void)V) typedef struct dictEntry { - void *key; - union { - void *val; - uint64_t u64; - int64_t s64; - } v; - struct dictEntry *next; + void *key; + union { + void *val; + uint64_t u64; + int64_t s64; + } v; + struct dictEntry *next; } dictEntry; typedef struct dictType { - unsigned int (*hashFunction)(const void *key); - void *(*keyDup)(void *privdata, const void *key); - void *(*valDup)(void *privdata, const void *obj); - int (*keyCompare)(void *privdata, const void *key1, const void *key2); - void (*keyDestructor)(void *privdata, void *key); - void (*valDestructor)(void *privdata, void *obj); + unsigned int (*hashFunction)(const void *key); + void *(*keyDup)(void *privdata, const void *key); + void *(*valDup)(void *privdata, const void *obj); + int (*keyCompare)(void *privdata, const void *key1, const void *key2); + void (*keyDestructor)(void *privdata, void *key); + void (*valDestructor)(void *privdata, void *obj); } dictType; /* This is our hash table structure. Every dictionary has two of this as we * implement incremental rehashing, for the old to the new table. */ typedef struct dictht { - dictEntry **table; - unsigned long size; - unsigned long sizemask; - unsigned long used; + dictEntry **table; + unsigned long size; + unsigned long sizemask; + unsigned long used; } dictht; typedef struct dict { - dictType *type; - void *privdata; - dictht ht[2]; - int rehashidx; /* rehashing not in progress if rehashidx == -1 */ - int iterators; /* number of iterators currently running */ + dictType *type; + void *privdata; + dictht ht[2]; + int rehashidx; /* rehashing not in progress if rehashidx == -1 */ + int iterators; /* number of iterators currently running */ } dict; /* If safe is set to 1 this is a safe iterator, that means, you can call @@ -85,50 +85,55 @@ typedef struct dict { * iterating. Otherwise it is a non safe iterator, and only dictNext() * should be called while iterating. 
*/ typedef struct dictIterator { - dict *d; - int table, index, safe; - dictEntry *entry, *nextEntry; - long long fingerprint; /* unsafe iterator fingerprint for misuse detection */ + dict *d; + int table, index, safe; + dictEntry *entry, *nextEntry; + long long fingerprint; /* unsafe iterator fingerprint for misuse detection */ } dictIterator; -typedef void (dictScanFunction)(void *privdata, const dictEntry *de); +typedef void(dictScanFunction)(void *privdata, const dictEntry *de); /* This is the initial size of every hash table */ -#define DICT_HT_INITIAL_SIZE 4 +#define DICT_HT_INITIAL_SIZE 4 /* ------------------------------- Macros ------------------------------------*/ -#define dictFreeVal(d, entry) \ - if ((d)->type->valDestructor) \ - (d)->type->valDestructor((d)->privdata, (entry)->v.val) - -#define dictSetVal(d, entry, _val_) do { \ - if ((d)->type->valDup) \ - entry->v.val = (d)->type->valDup((d)->privdata, _val_); \ - else \ - entry->v.val = (_val_); \ -} while(0) +#define dictFreeVal(d, entry) \ + if ((d)->type->valDestructor) \ + (d)->type->valDestructor((d)->privdata, (entry)->v.val) + +#define dictSetVal(d, entry, _val_) \ + do { \ + if ((d)->type->valDup) \ + entry->v.val = (d)->type->valDup((d)->privdata, _val_); \ + else \ + entry->v.val = (_val_); \ + } while (0) #define dictSetSignedIntegerVal(entry, _val_) \ - do { entry->v.s64 = _val_; } while(0) + do { \ + entry->v.s64 = _val_; \ + } while (0) #define dictSetUnsignedIntegerVal(entry, _val_) \ - do { entry->v.u64 = _val_; } while(0) - -#define dictFreeKey(d, entry) \ - if ((d)->type->keyDestructor) \ - (d)->type->keyDestructor((d)->privdata, (entry)->key) - -#define dictSetKey(d, entry, _key_) do { \ - if ((d)->type->keyDup) \ - entry->key = (d)->type->keyDup((d)->privdata, _key_); \ - else \ - entry->key = (_key_); \ -} while(0) - -#define dictCompareKeys(d, key1, key2) \ - (((d)->type->keyCompare) ? 
\ - (d)->type->keyCompare((d)->privdata, key1, key2) : \ - (key1) == (key2)) + do { \ + entry->v.u64 = _val_; \ + } while (0) + +#define dictFreeKey(d, entry) \ + if ((d)->type->keyDestructor) \ + (d)->type->keyDestructor((d)->privdata, (entry)->key) + +#define dictSetKey(d, entry, _key_) \ + do { \ + if ((d)->type->keyDup) \ + entry->key = (d)->type->keyDup((d)->privdata, _key_); \ + else \ + entry->key = (_key_); \ + } while (0) + +#define dictCompareKeys(d, key1, key2) \ + (((d)->type->keyCompare) ? (d)->type->keyCompare((d)->privdata, key1, key2) \ + : (key1) == (key2)) #define dictHashKey(d, key) (d)->type->hashFunction(key) #define dictGetKey(he) ((he)->key) @@ -137,8 +142,8 @@ typedef void (dictScanFunction)(void *privdata, const dictEntry *de); #define dictGetEntryVal(he) ((he)->val) #define dictGetSignedIntegerVal(he) ((he)->v.s64) #define dictGetUnsignedIntegerVal(he) ((he)->v.u64) -#define dictSlots(d) ((d)->ht[0].size+(d)->ht[1].size) -#define dictSize(d) ((d)->ht[0].used+(d)->ht[1].used) +#define dictSlots(d) ((d)->ht[0].size + (d)->ht[1].size) +#define dictSize(d) ((d)->ht[0].used + (d)->ht[1].used) #define dictIsRehashing(ht) ((ht)->rehashidx != -1) /* API */ @@ -151,7 +156,7 @@ dictEntry *dictReplaceRaw(dict *d, void *key); int dictDelete(dict *d, const void *key); int dictDeleteNoFree(dict *d, const void *key); void dictRelease(dict *d); -dictEntry * dictFind(dict *d, const void *key); +dictEntry *dictFind(dict *d, const void *key); void *dictFetchValue(dict *d, const void *key); int dictResize(dict *d); dictIterator *dictGetIterator(dict *d); @@ -163,14 +168,15 @@ int dictGetRandomKeys(dict *d, dictEntry **des, int count); void dictPrintStats(dict *d); unsigned int dictGenHashFunction(const void *key, uint32_t len); unsigned int dictGenCaseHashFunction(const unsigned char *buf, uint32_t len); -void dictEmpty(dict *d, void(callback)(void*)); +void dictEmpty(dict *d, void(callback)(void *)); void dictEnableResize(void); void dictDisableResize(void); 
int dictRehash(dict *d, int n); void dictSetHashFunctionSeed(unsigned int initval); long long dictFingerprint(dict *d); unsigned int dictGetHashFunctionSeed(void); -unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, void *privdata); +unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, + void *privdata); /* Hash table types */ extern dictType dictTypeHeapStringCopyKey; diff --git a/src/dyn_dict_msg_id.c b/src/dyn_dict_msg_id.c index 7522243b0..0b39af953 100644 --- a/src/dyn_dict_msg_id.c +++ b/src/dyn_dict_msg_id.c @@ -1,29 +1,23 @@ -# include // For NULL -#include "dyn_types.h" #include "dyn_dict_msg_id.h" +#include // For NULL +#include "dyn_types.h" -static unsigned int -dict_msg_id_hash(const void *key) -{ - msgid_t id = *(msgid_t*)key; - return dictGenHashFunction(key, sizeof(id)); +static unsigned int dict_msg_id_hash(const void *key) { + msgid_t id = *(msgid_t *)key; + return dictGenHashFunction(key, sizeof(id)); } -static int -dict_msg_id_cmp(void *privdata, const void *key1, const void *key2) -{ - msgid_t id1 = *(msgid_t*)key1; - msgid_t id2 = *(msgid_t*)key2; - return id1 == id2; +static int dict_msg_id_cmp(void *privdata, const void *key1, const void *key2) { + msgid_t id1 = *(msgid_t *)key1; + msgid_t id2 = *(msgid_t *)key2; + return id1 == id2; } dictType msg_table_dict_type = { - dict_msg_id_hash, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - dict_msg_id_cmp, /* key compare */ - NULL, /* key destructor */ - NULL /* val destructor */ + dict_msg_id_hash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dict_msg_id_cmp, /* key compare */ + NULL, /* key destructor */ + NULL /* val destructor */ }; - - diff --git a/src/dyn_dnode_client.c b/src/dyn_dnode_client.c index 1d5a51fd0..cdabe915a 100644 --- a/src/dyn_dnode_client.c +++ b/src/dyn_dnode_client.c @@ -1,125 +1,113 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. 
- * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ -#include "dyn_core.h" -#include "dyn_server.h" #include "dyn_dnode_client.h" +#include "dyn_core.h" #include "dyn_dict_msg_id.h" #include "dyn_response_mgr.h" +#include "dyn_server.h" -static void -dnode_client_ref(struct conn *conn, void *owner) -{ - struct server_pool *pool = owner; - - ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); - ASSERT(conn->owner == NULL); - - /* - * We use null pointer as the sockaddr argument in the accept() call as - * we are not interested in the address of the peer for the accepted - * connection - */ - conn->family = 0; - conn->addrlen = 0; - conn->addr = NULL; - - TAILQ_INSERT_TAIL(&pool->c_conn_q, conn, conn_tqe); - - /* owner of the client connection is the server pool */ - conn->owner = owner; - conn->outstanding_msgs_dict = dictCreate(&msg_table_dict_type, NULL); - log_debug(LOG_VVERB, "dyn: ref conn %p owner %p into pool '%.*s'", conn, pool, - pool->name.len, pool->name.data); -} +static void dnode_client_ref(struct conn *conn, void *owner) { + struct server_pool *pool = owner; -static void -dnode_client_unref_internal_try_put(struct conn *conn) -{ - ASSERT(conn->waiting_to_unref); - unsigned long msgs = dictSize(conn->outstanding_msgs_dict); - if (msgs != 0) { - log_warn("%s Waiting for %lu outstanding messages", print_obj(conn), msgs); - return; - } - struct server_pool *pool; - ASSERT(conn->owner != NULL); - conn_event_del_conn(conn); - pool = conn->owner; - conn->owner = NULL; - dictRelease(conn->outstanding_msgs_dict); - conn->outstanding_msgs_dict = NULL; - conn->waiting_to_unref = 0; - log_warn("unref %s owner %p from pool '%.*s'", - print_obj(conn), pool, pool->name.len, pool->name.data); - conn_put(conn); -} + ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); + ASSERT(conn->owner == NULL); -static void -dnode_client_unref_and_try_put(struct conn *conn) 
-{ + /* + * We use null pointer as the sockaddr argument in the accept() call as + * we are not interested in the address of the peer for the accepted + * connection + */ + conn->family = 0; + conn->addrlen = 0; + conn->addr = NULL; - ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); + TAILQ_INSERT_TAIL(&pool->c_conn_q, conn, conn_tqe); - struct server_pool *pool; - pool = conn->owner; - ASSERT(conn->owner != NULL); - ASSERT(TAILQ_COUNT(&pool->c_conn_q) != 0); - TAILQ_REMOVE(&pool->c_conn_q, conn, conn_tqe); - conn->waiting_to_unref = 1; - dnode_client_unref_internal_try_put(conn); - log_debug(LOG_VVERB, "dyn: unref conn %p owner %p from pool '%.*s'", conn, - pool, pool->name.len, pool->name.data); + /* owner of the client connection is the server pool */ + conn->owner = owner; + conn->outstanding_msgs_dict = dictCreate(&msg_table_dict_type, NULL); + log_debug(LOG_VVERB, "dyn: ref conn %p owner %p into pool '%.*s'", conn, pool, + pool->name.len, pool->name.data); } -static void -dnode_client_unref(struct conn *conn) -{ - dnode_client_unref_and_try_put(conn); +static void dnode_client_unref_internal_try_put(struct conn *conn) { + ASSERT(conn->waiting_to_unref); + unsigned long msgs = dictSize(conn->outstanding_msgs_dict); + if (msgs != 0) { + log_warn("%s Waiting for %lu outstanding messages", print_obj(conn), msgs); + return; + } + struct server_pool *pool; + ASSERT(conn->owner != NULL); + conn_event_del_conn(conn); + pool = conn->owner; + conn->owner = NULL; + dictRelease(conn->outstanding_msgs_dict); + conn->outstanding_msgs_dict = NULL; + conn->waiting_to_unref = 0; + log_warn("unref %s owner %p from pool '%.*s'", print_obj(conn), pool, + pool->name.len, pool->name.data); + conn_put(conn); } -static bool -dnode_client_active(struct conn *conn) -{ - ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); +static void dnode_client_unref_and_try_put(struct conn *conn) { + ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); + + struct server_pool *pool; + pool = conn->owner; + 
ASSERT(conn->owner != NULL); + ASSERT(TAILQ_COUNT(&pool->c_conn_q) != 0); + TAILQ_REMOVE(&pool->c_conn_q, conn, conn_tqe); + conn->waiting_to_unref = 1; + dnode_client_unref_internal_try_put(conn); + log_debug(LOG_VVERB, "dyn: unref conn %p owner %p from pool '%.*s'", conn, + pool, pool->name.len, pool->name.data); +} - ASSERT(TAILQ_EMPTY(&conn->imsg_q)); +static void dnode_client_unref(struct conn *conn) { + dnode_client_unref_and_try_put(conn); +} - if (!TAILQ_EMPTY(&conn->omsg_q)) { - log_debug(LOG_VVERB, "dyn: c %d is active", conn->sd); - return true; - } +static bool dnode_client_active(struct conn *conn) { + ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); - if (conn->rmsg != NULL) { - log_debug(LOG_VVERB, "dyn: c %d is active", conn->sd); - return true; - } + ASSERT(TAILQ_EMPTY(&conn->imsg_q)); - if (conn->smsg != NULL) { - log_debug(LOG_VVERB, "dyn: c %d is active", conn->sd); - return true; - } + if (!TAILQ_EMPTY(&conn->omsg_q)) { + log_debug(LOG_VVERB, "dyn: c %d is active", conn->sd); + return true; + } - log_debug(LOG_VVERB, "dyn: c %d is inactive", conn->sd); + if (conn->rmsg != NULL) { + log_debug(LOG_VVERB, "dyn: c %d is active", conn->sd); + return true; + } - return false; + if (conn->smsg != NULL) { + log_debug(LOG_VVERB, "dyn: c %d is active", conn->sd); + return true; + } + + log_debug(LOG_VVERB, "dyn: c %d is inactive", conn->sd); + + return false; } -static void -dnode_client_close_stats(struct context *ctx, struct server_pool *pool, err_t err, - unsigned eof) -{ - stats_pool_decr(ctx, dnode_client_connections); +static void dnode_client_close_stats(struct context *ctx, + struct server_pool *pool, err_t err, + unsigned eof) { + stats_pool_decr(ctx, dnode_client_connections); - if (eof) { - //fix this also - stats_pool_incr(ctx, dnode_client_eof); - return; - } + if (eof) { + // fix this also + stats_pool_incr(ctx, dnode_client_eof); + return; + } - switch (err) { + switch (err) { case EPIPE: case ETIMEDOUT: case ECONNRESET: @@ -130,422 +118,420 
@@ dnode_client_close_stats(struct context *ctx, struct server_pool *pool, err_t er case EHOSTDOWN: case EHOSTUNREACH: default: - //fix this also - stats_pool_incr(ctx, dnode_client_err); - break; - } + // fix this also + stats_pool_incr(ctx, dnode_client_err); + break; + } } -static void -dnode_client_close(struct context *ctx, struct conn *conn) -{ - rstatus_t status; - struct msg *req, *nmsg; /* current and next message */ +static void dnode_client_close(struct context *ctx, struct conn *conn) { + rstatus_t status; + struct msg *req, *nmsg; /* current and next message */ - ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); + ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); - dnode_client_close_stats(ctx, conn->owner, conn->err, conn->eof); + dnode_client_close_stats(ctx, conn->owner, conn->err, conn->eof); - if (conn->sd < 0) { - conn_unref(conn); - return; + if (conn->sd < 0) { + conn_unref(conn); + return; + } + + req = conn->rmsg; + if (req != NULL) { + conn->rmsg = NULL; + + ASSERT(req->selected_rsp == NULL); + ASSERT(req->is_request && !req->done); + + if (log_loggable(LOG_INFO)) { + log_debug(LOG_INFO, + "dyn: close c %d discarding pending req %" PRIu64 + " len " + "%" PRIu32 " type %d", + conn->sd, req->id, req->mlen, req->type); } - req = conn->rmsg; - if (req != NULL) { - conn->rmsg = NULL; - - ASSERT(req->selected_rsp == NULL); - ASSERT(req->is_request && !req->done); - - if (log_loggable(LOG_INFO)) { - log_debug(LOG_INFO, "dyn: close c %d discarding pending req %"PRIu64" len " - "%"PRIu32" type %d", conn->sd, req->id, req->mlen, - req->type); - } + dictDelete(conn->outstanding_msgs_dict, &req->id); + req_put(req); + } - dictDelete(conn->outstanding_msgs_dict, &req->id); - req_put(req); - } + ASSERT(conn->smsg == NULL); + ASSERT(TAILQ_EMPTY(&conn->imsg_q)); - ASSERT(conn->smsg == NULL); - ASSERT(TAILQ_EMPTY(&conn->imsg_q)); - - for (req = TAILQ_FIRST(&conn->omsg_q); req != NULL; req = nmsg) { - nmsg = TAILQ_NEXT(req, c_tqe); - - /* dequeue the message 
(request) from client outq */ - conn_dequeue_outq(ctx, conn, req); - - if (req->done || req->selected_rsp) { - if (log_loggable(LOG_INFO)) { - log_debug(LOG_INFO, "dyn: close c %d discarding %s req %"PRIu64" len " - "%"PRIu32" type %d", conn->sd, - req->is_error ? "error": "completed", req->id, req->mlen, - req->type); - } - dictDelete(conn->outstanding_msgs_dict, &req->id); - req_put(req); - } else { - req->swallow = 1; - - ASSERT(req->is_request); - - if (log_loggable(LOG_INFO)) { - log_debug(LOG_INFO, "dyn: close c %d schedule swallow of req %"PRIu64" " - "len %"PRIu32" type %d", conn->sd, req->id, req->mlen, - req->type); - } - } - } - ASSERT(TAILQ_EMPTY(&conn->omsg_q)); + for (req = TAILQ_FIRST(&conn->omsg_q); req != NULL; req = nmsg) { + nmsg = TAILQ_NEXT(req, c_tqe); + /* dequeue the message (request) from client outq */ + conn_dequeue_outq(ctx, conn, req); - status = close(conn->sd); - if (status < 0) { - log_error("dyn: close c %d failed, ignored: %s", conn->sd, strerror(errno)); + if (req->done || req->selected_rsp) { + if (log_loggable(LOG_INFO)) { + log_debug(LOG_INFO, + "dyn: close c %d discarding %s req %" PRIu64 + " len " + "%" PRIu32 " type %d", + conn->sd, req->is_error ? 
"error" : "completed", req->id, + req->mlen, req->type); + } + dictDelete(conn->outstanding_msgs_dict, &req->id); + req_put(req); + } else { + req->swallow = 1; + + ASSERT(req->is_request); + + if (log_loggable(LOG_INFO)) { + log_debug(LOG_INFO, + "dyn: close c %d schedule swallow of req %" PRIu64 + " " + "len %" PRIu32 " type %d", + conn->sd, req->id, req->mlen, req->type); + } } - conn->sd = -1; - conn_unref(conn); + } + ASSERT(TAILQ_EMPTY(&conn->omsg_q)); + + status = close(conn->sd); + if (status < 0) { + log_error("dyn: close c %d failed, ignored: %s", conn->sd, strerror(errno)); + } + conn->sd = -1; + conn_unref(conn); } -static rstatus_t -dnode_client_handle_response(struct conn *conn, msgid_t reqid, struct msg *rsp) -{ - // Forward the response to the caller which is client connection. - rstatus_t status = DN_OK; - - ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); - // Fetch the original request - struct msg *req = dictFetchValue(conn->outstanding_msgs_dict, &reqid); - if (!req) { - log_notice("looks like we already cleanedup the request for %d", reqid); - rsp_put(rsp); - return DN_OK; - } - - // dnode client has no extra logic of coalescing etc like the client/coordinator. - // Hence all work for this request is done at this time - ASSERT_LOG(!req->selected_rsp, "req %lu:%lu has selected_rsp set", req->id, req->parent_id); - status = msg_handle_response(req, rsp); - if (conn->waiting_to_unref) { - dictDelete(conn->outstanding_msgs_dict, &reqid); - log_info("Putting %s", print_obj(req)); - req_put(req); - dnode_client_unref_internal_try_put(conn); - return DN_OK; - } - - // Remove the message from the hash table. +static rstatus_t dnode_client_handle_response(struct conn *conn, msgid_t reqid, + struct msg *rsp) { + // Forward the response to the caller which is client connection. 
+ rstatus_t status = DN_OK; + + ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); + // Fetch the original request + struct msg *req = dictFetchValue(conn->outstanding_msgs_dict, &reqid); + if (!req) { + log_notice("looks like we already cleanedup the request for %d", reqid); + rsp_put(rsp); + return DN_OK; + } + + // dnode client has no extra logic of coalescing etc like the + // client/coordinator. Hence all work for this request is done at this time + ASSERT_LOG(!req->selected_rsp, "req %lu:%lu has selected_rsp set", req->id, + req->parent_id); + status = msg_handle_response(req, rsp); + if (conn->waiting_to_unref) { dictDelete(conn->outstanding_msgs_dict, &reqid); - - // If this request is first in the out queue, then the connection is ready, - // add the connection to epoll for writing - if (conn_is_req_first_in_outqueue(conn, req)) { - status = conn_event_add_out(conn); - if (status != DN_OK) { - conn->err = errno; - } + log_info("Putting %s", print_obj(req)); + req_put(req); + dnode_client_unref_internal_try_put(conn); + return DN_OK; + } + + // Remove the message from the hash table. 
+ dictDelete(conn->outstanding_msgs_dict, &reqid); + + // If this request is first in the out queue, then the connection is ready, + // add the connection to epoll for writing + if (conn_is_req_first_in_outqueue(conn, req)) { + status = conn_event_add_out(conn); + if (status != DN_OK) { + conn->err = errno; } - return status; + } + return status; } -static bool -dnode_req_filter(struct context *ctx, struct conn *conn, struct msg *req) -{ - ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); +static bool dnode_req_filter(struct context *ctx, struct conn *conn, + struct msg *req) { + ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); - if (msg_empty(req)) { - ASSERT(conn->rmsg == NULL); - if (log_loggable(LOG_VERB)) { - log_debug(LOG_VERB, "dyn: filter empty req %"PRIu64" from c %d", req->id, - conn->sd); - } - req_put(req); - return true; + if (msg_empty(req)) { + ASSERT(conn->rmsg == NULL); + if (log_loggable(LOG_VERB)) { + log_debug(LOG_VERB, "dyn: filter empty req %" PRIu64 " from c %d", + req->id, conn->sd); } - - /* dynomite handler */ - if (req->dmsg != NULL) { - if (dmsg_process(ctx, conn, req->dmsg)) { - req_put(req); - return true; - } - + req_put(req); + return true; + } + + /* dynomite handler */ + if (req->dmsg != NULL) { + if (dmsg_process(ctx, conn, req->dmsg)) { + req_put(req); + return true; } + } - return false; + return false; } -static void -dnode_req_forward(struct context *ctx, struct conn *conn, struct msg *req) -{ - struct server_pool *pool; - - log_debug(LOG_DEBUG, "%s DNODE REQ RECEIVED dmsg->id %u", print_obj(conn), req->dmsg->id); - - ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); - - pool = conn->owner; - - log_debug(LOG_DEBUG, "%s adding message %d:%d", print_obj(conn), req->id, req->parent_id); - dictAdd(conn->outstanding_msgs_dict, &req->id, req); - - uint32_t keylen = 0; - uint8_t *key = msg_get_tagged_key(req, 0, &keylen); - - ASSERT(req->dmsg != NULL); - /* enqueue message (request) into client outq, if response is expected - * and its not 
marked for swallow */ - if (req->expect_datastore_reply && !req->swallow) { - conn_enqueue_outq(ctx, conn, req); - req->rsp_handler = msg_local_one_rsp_handler; - } - if (req->dmsg->type == DMSG_REQ) { - // This is a request received from a peer rack in the same DC, just forward - // it to the local datastore - dyn_error_t dyn_error_code = DN_OK; - rstatus_t s = local_req_forward(ctx, conn, req, key, keylen, &dyn_error_code); - if (s != DN_OK) { - req_forward_error(ctx, conn, req, s, dyn_error_code); - } - } else if (req->dmsg->type == DMSG_REQ_FORWARD) { - // This is a request received from a remote DC. Forward it to all local racks - struct mbuf *orig_mbuf = STAILQ_FIRST(&req->mhdr); - struct datacenter *dc = server_get_dc(pool, &pool->dc); - req_forward_all_local_racks(ctx, conn, req, orig_mbuf, key, keylen, dc); +static void dnode_req_forward(struct context *ctx, struct conn *conn, + struct msg *req) { + struct server_pool *pool; + + log_debug(LOG_DEBUG, "%s DNODE REQ RECEIVED dmsg->id %u", print_obj(conn), + req->dmsg->id); + + ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); + + pool = conn->owner; + + log_debug(LOG_DEBUG, "%s adding message %d:%d", print_obj(conn), req->id, + req->parent_id); + dictAdd(conn->outstanding_msgs_dict, &req->id, req); + + uint32_t keylen = 0; + uint8_t *key = msg_get_tagged_key(req, 0, &keylen); + + ASSERT(req->dmsg != NULL); + /* enqueue message (request) into client outq, if response is expected + * and its not marked for swallow */ + if (req->expect_datastore_reply && !req->swallow) { + conn_enqueue_outq(ctx, conn, req); + req->rsp_handler = msg_local_one_rsp_handler; + } + if (req->dmsg->type == DMSG_REQ) { + // This is a request received from a peer rack in the same DC, just forward + // it to the local datastore + dyn_error_t dyn_error_code = DN_OK; + rstatus_t s = + local_req_forward(ctx, conn, req, key, keylen, &dyn_error_code); + if (s != DN_OK) { + req_forward_error(ctx, conn, req, s, dyn_error_code); } + } else if 
(req->dmsg->type == DMSG_REQ_FORWARD) { + // This is a request received from a remote DC. Forward it to all local + // racks + struct mbuf *orig_mbuf = STAILQ_FIRST(&req->mhdr); + struct datacenter *dc = server_get_dc(pool, &pool->dc); + req_forward_all_local_racks(ctx, conn, req, orig_mbuf, key, keylen, dc); + } } -static struct msg * -dnode_req_recv_next(struct context *ctx, struct conn *conn, bool alloc) -{ - ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); - return req_recv_next(ctx, conn, alloc); +static struct msg *dnode_req_recv_next(struct context *ctx, struct conn *conn, + bool alloc) { + ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); + return req_recv_next(ctx, conn, alloc); } -static void -dnode_req_recv_done(struct context *ctx, struct conn *conn, - struct msg *req, struct msg *nmsg) -{ - ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); - ASSERT(req->is_request); - ASSERT(req->owner == conn); - ASSERT(conn->rmsg == req); - ASSERT(nmsg == NULL || nmsg->is_request); - - /* enqueue next message (request), if any */ - conn->rmsg = nmsg; - - if (dnode_req_filter(ctx, conn, req)) { - return; - } +static void dnode_req_recv_done(struct context *ctx, struct conn *conn, + struct msg *req, struct msg *nmsg) { + ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); + ASSERT(req->is_request); + ASSERT(req->owner == conn); + ASSERT(conn->rmsg == req); + ASSERT(nmsg == NULL || nmsg->is_request); - log_debug(LOG_VERB, "received req %d:%d", req->id, req->parent_id); - dnode_req_forward(ctx, conn, req); + /* enqueue next message (request), if any */ + conn->rmsg = nmsg; + + if (dnode_req_filter(ctx, conn, req)) { + return; + } + + log_debug(LOG_VERB, "received req %d:%d", req->id, req->parent_id); + dnode_req_forward(ctx, conn, req); } -static void -dnode_req_client_enqueue_omsgq(struct context *ctx, struct conn *conn, struct msg *req) -{ - ASSERT(req->is_request); - ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); +static void dnode_req_client_enqueue_omsgq(struct context *ctx, + struct 
conn *conn, struct msg *req) { + ASSERT(req->is_request); + ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); - log_debug(LOG_VERB, "conn %p enqueue outq %p", conn, req); - TAILQ_INSERT_TAIL(&conn->omsg_q, req, c_tqe); + log_debug(LOG_VERB, "conn %p enqueue outq %p", conn, req); + TAILQ_INSERT_TAIL(&conn->omsg_q, req, c_tqe); - histo_add(&ctx->stats->dnode_client_out_queue, TAILQ_COUNT(&conn->omsg_q)); - stats_pool_incr(ctx, dnode_client_out_queue); - stats_pool_incr_by(ctx, dnode_client_out_queue_bytes, req->mlen); + histo_add(&ctx->stats->dnode_client_out_queue, TAILQ_COUNT(&conn->omsg_q)); + stats_pool_incr(ctx, dnode_client_out_queue); + stats_pool_incr_by(ctx, dnode_client_out_queue_bytes, req->mlen); } -static void -dnode_req_client_dequeue_omsgq(struct context *ctx, struct conn *conn, struct msg *req) -{ - ASSERT(req->is_request); - ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); +static void dnode_req_client_dequeue_omsgq(struct context *ctx, + struct conn *conn, struct msg *req) { + ASSERT(req->is_request); + ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); - TAILQ_REMOVE(&conn->omsg_q, req, c_tqe); - log_debug(LOG_VERB, "conn %p dequeue outq %p", conn, req); + TAILQ_REMOVE(&conn->omsg_q, req, c_tqe); + log_debug(LOG_VERB, "conn %p dequeue outq %p", conn, req); - histo_add(&ctx->stats->dnode_client_out_queue, TAILQ_COUNT(&conn->omsg_q)); - stats_pool_decr(ctx, dnode_client_out_queue); - stats_pool_decr_by(ctx, dnode_client_out_queue_bytes, req->mlen); + histo_add(&ctx->stats->dnode_client_out_queue, TAILQ_COUNT(&conn->omsg_q)); + stats_pool_decr(ctx, dnode_client_out_queue); + stats_pool_decr_by(ctx, dnode_client_out_queue_bytes, req->mlen); } /* dnode sends a response back to a peer */ -static struct msg * -dnode_rsp_send_next(struct context *ctx, struct conn *conn) -{ - rstatus_t status; - - // SMB: There is some non trivial thing happening here. And I think it is very - // important to read this before anything is changed in here. 
There is also a - // bug that exists which I will mention briefly: - // A message is a structure that has a list of mbufs which hold the actual data. - // Each mbuf has start, pos, last as pointers (amongst others) which indicate start of the - // buffer, current read position and end of the buffer respectively. - // - // Every time a message is sent to a peer within dynomite, a DNODE header is - // prepended which is created using dmsg_write. A message remembers this case - // in dnode_header_prepended, so that if the message is sent in parts, the - // header is not prepended again for the subsequent parts. - // - // Like I said earlier there is a pos pointer in mbuf. If a message is sent - // partially (or it is parsed partially too I think) the pos reflects that - // case such that things can be resumed where it left off. - // - // dmsg_write has a parameter which reflects the payload length following the - // dnode header calculated by msg_length. msg_length is a summation of all - // mbuf sizes (last - start). Which I think is wrong. - // - // +------------+ +---------------+ - // | DC1N1 +---------> | DC2N1 | - // +------------+ +-------+-------+ - // | - // | - // | - // | - // +-------v-------+ - // | DC2N2 | - // +---------------+ - // - // Consider the case where - // a node DC1N1 in region DC1 sends a request to DC2N1 which forwards it to - // to local token owner DC2N2. Now DC2N1 receives a response from DC2N2 which - // has to be relayed back to DC1N1. This response from DC2N2 already has a - // dnode header but for the link between DC2N1 and DC2N2. DC2N1 should strip - // this header and prepend its own header for sending it back to DC1N1. This - // gets handled in encryption case since we overwrite all mbufs in the response - // However if the encryption is off, the message length sent to dmsg_write - // consists of the header from DC2N2 also which is wrong. So this relaying - // of responses will not work for the case where encryption is disabled. 
- // - // So msg_length should really be from mbuf->pos and not mbuf->start. This - // is a problem only with remote region replication since that is the only - // case where we CAN have 2 hops to send the request/response. This is also - // not a problem if encryption is ON. - ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); - - struct msg *rsp = rsp_send_next(ctx, conn); - - if (rsp != NULL && conn->dyn_mode) { - struct msg *req = rsp->peer; - - //need to deal with multi-block later - uint64_t msg_id = req->dmsg->id; - if (rsp->dnode_header_prepended) { - return rsp; - } +static struct msg *dnode_rsp_send_next(struct context *ctx, struct conn *conn) { + rstatus_t status; + + // SMB: There is some non trivial thing happening here. And I think it is very + // important to read this before anything is changed in here. There is also a + // bug that exists which I will mention briefly: + // A message is a structure that has a list of mbufs which hold the actual + // data. Each mbuf has start, pos, last as pointers (amongst others) which + // indicate start of the buffer, current read position and end of the buffer + // respectively. + // + // Every time a message is sent to a peer within dynomite, a DNODE header is + // prepended which is created using dmsg_write. A message remembers this case + // in dnode_header_prepended, so that if the message is sent in parts, the + // header is not prepended again for the subsequent parts. + // + // Like I said earlier there is a pos pointer in mbuf. If a message is sent + // partially (or it is parsed partially too I think) the pos reflects that + // case such that things can be resumed where it left off. + // + // dmsg_write has a parameter which reflects the payload length following the + // dnode header calculated by msg_length. msg_length is a summation of all + // mbuf sizes (last - start). Which I think is wrong. 
+ // + // +------------+ +---------------+ + // | DC1N1 +---------> | DC2N1 | + // +------------+ +-------+-------+ + // | + // | + // | + // | + // +-------v-------+ + // | DC2N2 | + // +---------------+ + // + // Consider the case where + // a node DC1N1 in region DC1 sends a request to DC2N1 which forwards it to + // to local token owner DC2N2. Now DC2N1 receives a response from DC2N2 which + // has to be relayed back to DC1N1. This response from DC2N2 already has a + // dnode header but for the link between DC2N1 and DC2N2. DC2N1 should strip + // this header and prepend its own header for sending it back to DC1N1. This + // gets handled in encryption case since we overwrite all mbufs in the + // response However if the encryption is off, the message length sent to + // dmsg_write consists of the header from DC2N2 also which is wrong. So this + // relaying of responses will not work for the case where encryption is + // disabled. + // + // So msg_length should really be from mbuf->pos and not mbuf->start. This + // is a problem only with remote region replication since that is the only + // case where we CAN have 2 hops to send the request/response. This is also + // not a problem if encryption is ON. 
+ ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); + + struct msg *rsp = rsp_send_next(ctx, conn); + + if (rsp != NULL && conn->dyn_mode) { + struct msg *req = rsp->peer; + + // need to deal with multi-block later + uint64_t msg_id = req->dmsg->id; + if (rsp->dnode_header_prepended) { + return rsp; + } - struct mbuf *header_buf = mbuf_get(); - if (header_buf == NULL) { - loga("Unable to obtain an mbuf for header!"); - return NULL; //need to address error here properly - } - dmsg_type_t msg_type = DMSG_RES; - //TODOs: need to set the outcoming conn to be secured too if the incoming conn is secured - if (req->owner->dnode_secured || conn->dnode_secured) { - if (log_loggable(LOG_VVERB)) { - log_debug(LOG_VVERB, "Encrypting response ..."); - SCOPED_CHARPTR(encoded_aes_key) = base64_encode(conn->aes_key, AES_KEYLEN); - if (encoded_aes_key) - loga("AES encryption key: %s\n", (char *)encoded_aes_key); - } - - if (ENCRYPTION) { - size_t encrypted_bytes; - status = dyn_aes_encrypt_msg(rsp, conn->aes_key, &encrypted_bytes); - if (status != DN_OK) { - if (status == DN_ENOMEM) { - loga("OOM to obtain an mbuf for encryption!"); - } else if (status == DN_ERROR) { - loga("Encryption failed: Empty message"); - } - mbuf_put(header_buf); - rsp_put(rsp); - return NULL; - } - - if (log_loggable(LOG_VVERB)) { - log_debug(LOG_VERB, "#encrypted bytes : %d", encrypted_bytes); - } - - dmsg_write(header_buf, msg_id, msg_type, conn, msg_length(rsp)); - } else { - if (log_loggable(LOG_VVERB)) { - log_debug(LOG_VERB, "no encryption on the rsp payload"); - } - dmsg_write(header_buf, msg_id, msg_type, conn, msg_length(rsp)); - } - - } else { - //write dnode header - log_debug(LOG_VERB, "sending dnode response with msg_id %u", msg_id); - dmsg_write(header_buf, msg_id, msg_type, conn, msg_length(rsp)); + struct mbuf *header_buf = mbuf_get(); + if (header_buf == NULL) { + loga("Unable to obtain an mbuf for header!"); + return NULL; // need to address error here properly + } + dmsg_type_t msg_type = 
DMSG_RES; + // TODOs: need to set the outcoming conn to be secured too if the incoming + // conn is secured + if (req->owner->dnode_secured || conn->dnode_secured) { + if (log_loggable(LOG_VVERB)) { + log_debug(LOG_VVERB, "Encrypting response ..."); + SCOPED_CHARPTR(encoded_aes_key) = + base64_encode(conn->aes_key, AES_KEYLEN); + if (encoded_aes_key) + loga("AES encryption key: %s\n", (char *)encoded_aes_key); + } + + if (ENCRYPTION) { + size_t encrypted_bytes; + status = dyn_aes_encrypt_msg(rsp, conn->aes_key, &encrypted_bytes); + if (status != DN_OK) { + if (status == DN_ENOMEM) { + loga("OOM to obtain an mbuf for encryption!"); + } else if (status == DN_ERROR) { + loga("Encryption failed: Empty message"); + } + mbuf_put(header_buf); + rsp_put(rsp); + return NULL; } - rsp->dnode_header_prepended = 1; - mbuf_insert_head(&rsp->mhdr, header_buf); + if (log_loggable(LOG_VVERB)) { + log_debug(LOG_VERB, "#encrypted bytes : %d", encrypted_bytes); + } + dmsg_write(header_buf, msg_id, msg_type, conn, msg_length(rsp)); + } else { if (log_loggable(LOG_VVERB)) { - log_hexdump(LOG_VVERB, header_buf->pos, mbuf_length(header_buf), "resp dyn message - header: "); - msg_dump(LOG_VVERB, rsp); + log_debug(LOG_VERB, "no encryption on the rsp payload"); } + dmsg_write(header_buf, msg_id, msg_type, conn, msg_length(rsp)); + } + } else { + // write dnode header + log_debug(LOG_VERB, "sending dnode response with msg_id %u", msg_id); + dmsg_write(header_buf, msg_id, msg_type, conn, msg_length(rsp)); } - return rsp; -} + rsp->dnode_header_prepended = 1; + mbuf_insert_head(&rsp->mhdr, header_buf); -static void -dnode_rsp_send_done(struct context *ctx, struct conn *conn, struct msg *rsp) -{ if (log_loggable(LOG_VVERB)) { - log_debug(LOG_VVERB, "dnode_rsp_send_done entering"); - } + log_hexdump(LOG_VVERB, header_buf->pos, mbuf_length(header_buf), + "resp dyn message - header: "); + msg_dump(LOG_VVERB, rsp); + } + } + + return rsp; +} - struct msg *req; /* peer message (request) */ +static 
void dnode_rsp_send_done(struct context *ctx, struct conn *conn, + struct msg *rsp) { + if (log_loggable(LOG_VVERB)) { + log_debug(LOG_VVERB, "dnode_rsp_send_done entering"); + } - ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); - ASSERT(conn->smsg == NULL); + struct msg *req; /* peer message (request) */ - log_debug(LOG_VERB, "dyn: send done rsp %"PRIu64" on c %d", rsp->id, conn->sd); + ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); + ASSERT(conn->smsg == NULL); - req = rsp->peer; + log_debug(LOG_VERB, "dyn: send done rsp %" PRIu64 " on c %d", rsp->id, + conn->sd); - ASSERT(!rsp->is_request && req->is_request); - ASSERT(req->selected_rsp == rsp); - log_debug(LOG_DEBUG, "%s DNODE RSP SENT dmsg->id %u", print_obj(conn), req->dmsg->id); + req = rsp->peer; - /* dequeue request from client outq */ - conn_dequeue_outq(ctx, conn, req); + ASSERT(!rsp->is_request && req->is_request); + ASSERT(req->selected_rsp == rsp); + log_debug(LOG_DEBUG, "%s DNODE RSP SENT dmsg->id %u", print_obj(conn), + req->dmsg->id); - req_put(req); + /* dequeue request from client outq */ + conn_dequeue_outq(ctx, conn, req); + + req_put(req); } -struct conn_ops dnode_client_ops = { - msg_recv, - dnode_req_recv_next, - dnode_req_recv_done, - msg_send, - dnode_rsp_send_next, - dnode_rsp_send_done, - dnode_client_close, - dnode_client_active, - dnode_client_ref, - dnode_client_unref, - NULL, - NULL, - dnode_req_client_enqueue_omsgq, - dnode_req_client_dequeue_omsgq, - dnode_client_handle_response -}; - -void -init_dnode_client_conn(struct conn *conn) -{ - conn->dyn_mode = 1; - conn->type = CONN_DNODE_PEER_CLIENT; - conn->ops = &dnode_client_ops; +struct conn_ops dnode_client_ops = {msg_recv, + dnode_req_recv_next, + dnode_req_recv_done, + msg_send, + dnode_rsp_send_next, + dnode_rsp_send_done, + dnode_client_close, + dnode_client_active, + dnode_client_ref, + dnode_client_unref, + NULL, + NULL, + dnode_req_client_enqueue_omsgq, + dnode_req_client_dequeue_omsgq, + dnode_client_handle_response}; + +void 
init_dnode_client_conn(struct conn *conn) { + conn->dyn_mode = 1; + conn->type = CONN_DNODE_PEER_CLIENT; + conn->ops = &dnode_client_ops; } diff --git a/src/dyn_dnode_client.h b/src/dyn_dnode_client.h index 338db1884..fba1b6000 100644 --- a/src/dyn_dnode_client.h +++ b/src/dyn_dnode_client.h @@ -1,14 +1,14 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ - -#include "dyn_core.h" - + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ #ifndef _DYN_DNODE_CLIENT_H_ #define _DYN_DNODE_CLIENT_H_ +// Forward declarations +struct conn; + void init_dnode_client_conn(struct conn *conn); #endif diff --git a/src/dyn_dnode_msg.c b/src/dyn_dnode_msg.c index 7d35730db..5b2e22664 100644 --- a/src/dyn_dnode_msg.c +++ b/src/dyn_dnode_msg.c @@ -1,19 +1,20 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. 
+ */ #include +#include "dyn_dnode_msg.h" + #include "dyn_core.h" #include "dyn_crypto.h" -#include "dyn_dnode_msg.h" #include "dyn_server.h" #include "proto/dyn_proto.h" static uint8_t version = VERSION_10; -static uint64_t dmsg_id; /* message id counter */ +static uint64_t dmsg_id; /* message id counter */ static struct dmsg_tqh free_dmsgq; /* free msg q */ static const struct string MAGIC_STR = string(" $2014$ "); @@ -24,1046 +25,994 @@ static unsigned char aes_decrypted_buf[34]; static rstatus_t dmsg_to_gossip(struct ring_msg *rmsg); -static bool -dyn_parse_core(struct msg *r) -{ - struct dmsg *dmsg; - struct mbuf *b; - uint8_t *p = r->pos, *token; - uint8_t ch = ' '; - uint64_t num = 0; - - dyn_parse_state_t dyn_state = r->dyn_parse_state; - log_debug(LOG_VVERB, "dyn_parse_state: %d", r->dyn_parse_state); - - if (r->dyn_parse_state == DYN_DONE || r->dyn_parse_state == DYN_POST_DONE) - return true; - - b = STAILQ_LAST(&r->mhdr, mbuf, next); - - dmsg = r->dmsg; - if (dmsg == NULL) { - r->dmsg = dmsg_get(); - dmsg = r->dmsg; - dmsg->owner = r; - if (dmsg == NULL) {//should track this as a dropped message - loga("unable to create a new dmsg"); - goto error; //should count as OOM error - } - } +static bool dyn_parse_core(struct msg *r) { + struct dmsg *dmsg; + struct mbuf *b; + uint8_t *p = r->pos, *token; + uint8_t ch = ' '; + uint64_t num = 0; + + dyn_parse_state_t dyn_state = r->dyn_parse_state; + log_debug(LOG_VVERB, "dyn_parse_state: %d", r->dyn_parse_state); + + if (r->dyn_parse_state == DYN_DONE || r->dyn_parse_state == DYN_POST_DONE) + return true; + + b = STAILQ_LAST(&r->mhdr, mbuf, next); + + dmsg = r->dmsg; + if (dmsg == NULL) { + r->dmsg = dmsg_get(); + dmsg = r->dmsg; + if (dmsg == NULL) { // should track this as a dropped message + loga("unable to create a new dmsg"); + goto error; // should count as OOM error + } + dmsg->owner = r; // set only once dmsg is known to be non-NULL + } - token = NULL; + token = NULL; - for (p = r->pos; p < b->last; p++) { - ch = *p; - switch (dyn_state) { + for (p = 
r->pos; p < b->last; p++) { + ch = *p; + switch (dyn_state) { case DYN_START: - log_debug(LOG_VVERB, "DYN_START"); - if (ch != ' ' && ch != '$') { - break; - } - - if (ch == ' ') { - if (token == NULL) - token = p; - - break; - } - - if (ch == '$') { - if (p + 5 < b->last) { - if ((*(p+1) == '2') && - (*(p+2) == '0') && - (*(p+3) == '1') && - (*(p+4) == '4') && - (*(p+5) == '$')) { - dyn_state = DYN_MAGIC_STRING; - p += 5; - } else { - //goto skip; - token = NULL; //reset - } - } else { - goto split; - } - } else { - loga("Facing a weird char %c", p); - //goto skip; - token = NULL; //reset - } - - break; + log_debug(LOG_VVERB, "DYN_START"); + if (ch != ' ' && ch != '$') { + break; + } + + if (ch == ' ') { + if (token == NULL) token = p; + + break; + } + + if (ch == '$') { + if (p + 5 < b->last) { + if ((*(p + 1) == '2') && (*(p + 2) == '0') && (*(p + 3) == '1') && + (*(p + 4) == '4') && (*(p + 5) == '$')) { + dyn_state = DYN_MAGIC_STRING; + p += 5; + } else { + // goto skip; + token = NULL; // reset + } + } else { + goto split; + } + } else { + loga("Facing a weird char %c", *p); // %c takes the byte, not the pointer + // goto skip; + token = NULL; // reset + } + + break; case DYN_MAGIC_STRING: - log_debug(LOG_VVERB, "DYN_MAGIC_STRING"); - if (ch == ' ') { - dyn_state = DYN_MSG_ID; - num = 0; - break; - } else { - //loga("char is '%c %c %c %c'", *(p-2), *(p-1), ch, *(p+1)); - token = NULL; - loga("Facing a weird char %c", p); - //goto skip; - dyn_state = DYN_START; - } - - break; + log_debug(LOG_VVERB, "DYN_MAGIC_STRING"); + if (ch == ' ') { + dyn_state = DYN_MSG_ID; + num = 0; + break; + } else { + // loga("char is '%c %c %c %c'", *(p-2), *(p-1), ch, *(p+1)); + token = NULL; + loga("Facing a weird char %c", *p); // %c takes the byte, not the pointer + // goto skip; + dyn_state = DYN_START; + } + + break; case DYN_MSG_ID: - log_debug(LOG_VVERB, "DYN_MSG_ID num = %d", num); - if (isdigit(ch)) { - num = num*10 + (uint64_t)(ch - '0'); - } else if (ch == ' ' && isdigit(*(p-1))) { - log_debug(LOG_VERB, "MSG ID : %d", num); - dmsg->id = num; - 
dyn_state = DYN_TYPE_ID; - num = 0; - } else { - //loga("char is '%c %c %c %c'", *(p-2), *(p-1), ch, *(p+1)); - //goto skip; - token = NULL; //reset - dyn_state = DYN_START; - if (ch == '$') - p -= 1; - } - break; + log_debug(LOG_VVERB, "DYN_MSG_ID num = %d", num); + if (isdigit(ch)) { + num = num * 10 + (uint64_t)(ch - '0'); + } else if (ch == ' ' && isdigit(*(p - 1))) { + log_debug(LOG_VERB, "MSG ID : %d", num); + dmsg->id = num; + dyn_state = DYN_TYPE_ID; + num = 0; + } else { + // loga("char is '%c %c %c %c'", *(p-2), *(p-1), ch, *(p+1)); + // goto skip; + token = NULL; // reset + dyn_state = DYN_START; + if (ch == '$') p -= 1; + } + break; case DYN_TYPE_ID: - log_debug(LOG_VVERB, "DYN_TYPE_ID: num = %d", num); - if (isdigit(ch)) { - num = num*10 + (uint64_t)(ch - '0'); - } else if (ch == ' ' && isdigit(*(p-1))) { - log_debug(LOG_VERB, "Type Id: %d", num); - dmsg->type = num; - dyn_state = DYN_BIT_FIELD; - num = 0; - } else { - //loga("char is '%c %c %c %c'", *(p-2), *(p-1), ch, *(p+1)); - token = NULL; - dyn_state = DYN_START; - if (ch == '$') - p -= 1; - } - - break; + log_debug(LOG_VVERB, "DYN_TYPE_ID: num = %d", num); + if (isdigit(ch)) { + num = num * 10 + (uint64_t)(ch - '0'); + } else if (ch == ' ' && isdigit(*(p - 1))) { + log_debug(LOG_VERB, "Type Id: %d", num); + dmsg->type = num; + dyn_state = DYN_BIT_FIELD; + num = 0; + } else { + // loga("char is '%c %c %c %c'", *(p-2), *(p-1), ch, *(p+1)); + token = NULL; + dyn_state = DYN_START; + if (ch == '$') p -= 1; + } + + break; case DYN_BIT_FIELD: - log_debug(LOG_VVERB, "DYN_FLAGS_FIELD, num = %d", num); - if (isdigit(ch)) { - num = num*10 + (uint64_t)(ch - '0'); - } else if (ch == ' ' && isdigit(*(p-1))) { - log_debug(LOG_VERB, "DYN_FLAGS_FIELD: %d", num); - dmsg->flags = num & 0xF; - dyn_state = DYN_VERSION; - num = 0; - } else { - token = NULL; - //loga("char is '%c %c %c %c'", *(p-2), *(p-1), ch, *(p+1)); - dyn_state = DYN_START; - if (ch == '$') - p -= 1; - } - - break; + log_debug(LOG_VVERB, 
"DYN_FLAGS_FIELD, num = %d", num); + if (isdigit(ch)) { + num = num * 10 + (uint64_t)(ch - '0'); + } else if (ch == ' ' && isdigit(*(p - 1))) { + log_debug(LOG_VERB, "DYN_FLAGS_FIELD: %d", num); + dmsg->flags = num & 0xF; + dyn_state = DYN_VERSION; + num = 0; + } else { + token = NULL; + // loga("char is '%c %c %c %c'", *(p-2), *(p-1), ch, *(p+1)); + dyn_state = DYN_START; + if (ch == '$') p -= 1; + } + + break; case DYN_VERSION: - log_debug(LOG_VVERB, "DYN_VERSION: num = %d", num); - if (isdigit(ch)) { - num = num*10 + (uint64_t)(ch - '0'); - } else if (ch == ' ' && isdigit(*(p-1))) { - log_debug(LOG_VERB, "VERSION : %d", num); - dmsg->version = num; - dyn_state = DYN_SAME_DC; - num = 0; - } else { - token = NULL; - //loga("char is '%c %c %c %c'", *(p-2), *(p-1), ch, *(p+1)); - dyn_state = DYN_START; - if (ch == '$') - p -= 1; - } - - break; + log_debug(LOG_VVERB, "DYN_VERSION: num = %d", num); + if (isdigit(ch)) { + num = num * 10 + (uint64_t)(ch - '0'); + } else if (ch == ' ' && isdigit(*(p - 1))) { + log_debug(LOG_VERB, "VERSION : %d", num); + dmsg->version = num; + dyn_state = DYN_SAME_DC; + num = 0; + } else { + token = NULL; + // loga("char is '%c %c %c %c'", *(p-2), *(p-1), ch, *(p+1)); + dyn_state = DYN_START; + if (ch == '$') p -= 1; + } + + break; case DYN_SAME_DC: - if (isdigit(ch)) { - dmsg->same_dc = (uint8_t)(ch - '0'); - log_debug(LOG_VERB, "DYN_SAME_DC %d", dmsg->same_dc); - } else if (ch == ' ' && isdigit(*(p-1))) { - dyn_state = DYN_DATA_LEN; - num = 0; - } else { - token = NULL; - //loga("char is '%c %c %c %c'", *(p-2), *(p-1), ch, *(p+1)); - dyn_state = DYN_START; - if (ch == '$') - p -= 1; - } - - break; + if (isdigit(ch)) { + dmsg->same_dc = (uint8_t)(ch - '0'); + log_debug(LOG_VERB, "DYN_SAME_DC %d", dmsg->same_dc); + } else if (ch == ' ' && isdigit(*(p - 1))) { + dyn_state = DYN_DATA_LEN; + num = 0; + } else { + token = NULL; + // loga("char is '%c %c %c %c'", *(p-2), *(p-1), ch, *(p+1)); + dyn_state = DYN_START; + if (ch == '$') p -= 1; + 
} + + break; case DYN_DATA_LEN: - log_debug(LOG_VVERB, "DYN_DATA_LEN: num = %d", num); - if (ch == '*') { - break; - } else if (isdigit(ch)) { - num = num*10 + (uint64_t)(ch - '0'); - } else if (ch == ' ' && isdigit(*(p-1))) { - log_debug(LOG_VERB, "Data len: %d", num); - dmsg->mlen = (uint32_t)num; - dyn_state = DYN_DATA; - num = 0; - } else { - token = NULL; - //loga("char is '%c %c %c %c'", *(p-2), *(p-1), ch, *(p+1)); - dyn_state = DYN_START; - if (ch == '$') - p -= 1; - } - break; + log_debug(LOG_VVERB, "DYN_DATA_LEN: num = %d", num); + if (ch == '*') { + break; + } else if (isdigit(ch)) { + num = num * 10 + (uint64_t)(ch - '0'); + } else if (ch == ' ' && isdigit(*(p - 1))) { + log_debug(LOG_VERB, "Data len: %d", num); + dmsg->mlen = (uint32_t)num; + dyn_state = DYN_DATA; + num = 0; + } else { + token = NULL; + // loga("char is '%c %c %c %c'", *(p-2), *(p-1), ch, *(p+1)); + dyn_state = DYN_START; + if (ch == '$') p -= 1; + } + break; case DYN_DATA: - log_debug(LOG_VVERB, "DYN_DATA"); - if (p + dmsg->mlen < b->last) { - dmsg->data = p; - p += dmsg->mlen - 1; - dyn_state = DYN_SPACES_BEFORE_PAYLOAD_LEN; - } else { - //loga("char is '%c %c %c %c'", *(p-2), *(p-1), ch, *(p+1)); - goto split; - } - - break; + log_debug(LOG_VVERB, "DYN_DATA"); + if (p + dmsg->mlen < b->last) { + dmsg->data = p; + p += dmsg->mlen - 1; + dyn_state = DYN_SPACES_BEFORE_PAYLOAD_LEN; + } else { + // loga("char is '%c %c %c %c'", *(p-2), *(p-1), ch, *(p+1)); + goto split; + } + + break; case DYN_SPACES_BEFORE_PAYLOAD_LEN: - log_debug(LOG_VVERB, "DYN_SPACES_BEFORE_PAYLOAD_LEN"); - if (ch == ' ') { - break; - } else if (ch == '*') { - dyn_state = DYN_PAYLOAD_LEN; - num = 0; - } + log_debug(LOG_VVERB, "DYN_SPACES_BEFORE_PAYLOAD_LEN"); + if (ch == ' ') { + break; + } else if (ch == '*') { + dyn_state = DYN_PAYLOAD_LEN; + num = 0; + } - break; + break; case DYN_PAYLOAD_LEN: - if (isdigit(ch)) { - num = num*10 + (uint64_t)(ch - '0'); - } else if (ch == CR) { - log_debug(LOG_VERB, "Payload len: 
%d", num); - dmsg->plen = (uint32_t)num; - num = 0; - dyn_state = DYN_CRLF_BEFORE_DONE; - } else { - token = NULL; - dyn_state = DYN_START; - if (ch == '$') - p -= 1; - } - break; + if (isdigit(ch)) { + num = num * 10 + (uint64_t)(ch - '0'); + } else if (ch == CR) { + log_debug(LOG_VERB, "Payload len: %d", num); + dmsg->plen = (uint32_t)num; + num = 0; + dyn_state = DYN_CRLF_BEFORE_DONE; + } else { + token = NULL; + dyn_state = DYN_START; + if (ch == '$') p -= 1; + } + break; case DYN_CRLF_BEFORE_DONE: - log_debug(LOG_VVERB, "DYN_CRLF_BEFORE_DONE"); - if (*p == LF) { - dyn_state = DYN_DONE; - } else { - token = NULL; - dyn_state = DYN_START; - if (ch == '$') - p -= 1; - } - - break; + log_debug(LOG_VVERB, "DYN_CRLF_BEFORE_DONE"); + if (*p == LF) { + dyn_state = DYN_DONE; + } else { + token = NULL; + dyn_state = DYN_START; + if (ch == '$') p -= 1; + } + + break; case DYN_DONE: - log_debug(LOG_VVERB, "DYN_DONE"); - r->mlen -= (uint32_t)(p - r->pos); - r->pos = p; - dmsg->payload = p; - r->dyn_parse_state = DYN_DONE; - b->pos = p; - goto done; - break; + log_debug(LOG_VVERB, "DYN_DONE"); + r->mlen -= (uint32_t)(p - r->pos); + r->pos = p; + dmsg->payload = p; + r->dyn_parse_state = DYN_DONE; + b->pos = p; + goto done; + break; default: - NOT_REACHED(); - break; - + NOT_REACHED(); + break; + } + } + + log_debug(LOG_DEBUG, "Not fully parsed yet!!!!!!"); +split: + // this is an attempt recovery when we got a bad message + // we try to look for the start the next good one and throw away the bad part + if (r->dyn_parse_state == DYN_START) { + r->result = MSG_PARSE_AGAIN; + if (b->last == b->end) { + struct mbuf *nbuf = mbuf_get(); + if (nbuf == NULL) { + loga("Unable to obtain a new mbuf for replacement!"); + mbuf_put(b); + nbuf = mbuf_get(); + mbuf_insert_head(&r->mhdr, nbuf); + r->pos = nbuf->pos; + return false; } - } - - log_debug(LOG_DEBUG, "Not fully parsed yet!!!!!!"); - split: - //this is an attempt recovery when we got a bad message - //we try to look for the start 
the next good one and throw away the bad part - if (r->dyn_parse_state == DYN_START) { - r->result = MSG_PARSE_AGAIN; - if (b->last == b->end) { - struct mbuf *nbuf = mbuf_get(); - if (nbuf == NULL) { - loga("Unable to obtain a new mbuf for replacement!"); - mbuf_put(b); - nbuf = mbuf_get(); - mbuf_insert_head(&r->mhdr, nbuf); - r->pos = nbuf->pos; - return false; - } - - //replacing the bad mbuf with a new and empty mbuf - mbuf_insert(&r->mhdr, nbuf); - mbuf_remove(&r->mhdr, b); - mbuf_put(b); - r->pos = nbuf->pos; - return false; - } else { //split it and throw away the bad portion - struct mbuf *nbuf; - - nbuf = mbuf_split(&r->mhdr, r->pos, NULL, NULL); - if (nbuf == NULL) { - return DN_ENOMEM; - } - mbuf_insert(&r->mhdr, nbuf); - mbuf_remove(&r->mhdr, b); - mbuf_put(b); - r->pos = nbuf->pos; - return false; - } - - } - - if (mbuf_length(b) == 0 || b->last == b->end) { - log_debug(LOG_DEBUG, "Would this case ever happen?"); - r->result = MSG_PARSE_AGAIN; + // replacing the bad mbuf with a new and empty mbuf + mbuf_insert(&r->mhdr, nbuf); + mbuf_remove(&r->mhdr, b); + mbuf_put(b); + r->pos = nbuf->pos; + return false; + } else { // split it and throw away the bad portion + struct mbuf *nbuf; + + nbuf = mbuf_split(&r->mhdr, r->pos, NULL, NULL); + if (nbuf == NULL) { + return false; // bool function: report failure, not an rstatus_t code + } + mbuf_insert(&r->mhdr, nbuf); + mbuf_remove(&r->mhdr, b); + mbuf_put(b); + r->pos = nbuf->pos; return false; - } - - if (r->pos == b->last) { - log_debug(LOG_DEBUG, "Forward to reading the new block of data"); - r->dyn_parse_state = DYN_START; - r->result = MSG_PARSE_AGAIN; - token = NULL; - return false; - } - - if (log_loggable(LOG_VVERB)) { - log_debug(LOG_VVERB, "in split"); - } - r->dyn_parse_state = DYN_START; - r->pos = token; - r->result = MSG_PARSE_REPAIR; - if (log_loggable(LOG_VVERB)) { - log_hexdump(LOG_VVERB, b->pos, mbuf_length(b), "split and inspecting req %"PRIu64" " - "res %d type %d state %d", r->id, r->result, r->type, - r->dyn_parse_state); - - 
log_hexdump(LOG_VVERB, b->start, b->last - b->start, "split and inspecting full req %"PRIu64" " - "res %d type %d state %d", r->id, r->result, r->type, - r->dyn_parse_state); - } - return false; + } + } + + if (mbuf_length(b) == 0 || b->last == b->end) { + log_debug(LOG_DEBUG, "Would this case ever happen?"); + r->result = MSG_PARSE_AGAIN; + return false; + } + + if (r->pos == b->last) { + log_debug(LOG_DEBUG, "Forward to reading the new block of data"); + r->dyn_parse_state = DYN_START; + r->result = MSG_PARSE_AGAIN; + token = NULL; + return false; + } + + if (log_loggable(LOG_VVERB)) { + log_debug(LOG_VVERB, "in split"); + } + r->dyn_parse_state = DYN_START; + r->pos = token; + r->result = MSG_PARSE_REPAIR; + if (log_loggable(LOG_VVERB)) { + log_hexdump(LOG_VVERB, b->pos, mbuf_length(b), + "split and inspecting req %" PRIu64 + " " + "res %d type %d state %d", + r->id, r->result, r->type, r->dyn_parse_state); + + log_hexdump(LOG_VVERB, b->start, b->last - b->start, + "split and inspecting full req %" PRIu64 + " " + "res %d type %d state %d", + r->id, r->result, r->type, r->dyn_parse_state); + } + return false; done: - r->pos = p; - dmsg->source_address = r->owner->addr; - log_debug(LOG_DEBUG, "MSG ID: %d, type: %d, secured %d, version %d, "\ - "same_dc %d, datalen %u, payload len: %u", dmsg->id, - dmsg->type, dmsg->flags & 0x1, dmsg->version, dmsg->same_dc, - dmsg->mlen, dmsg->plen); - - if (log_loggable(LOG_VVERB)) { - log_debug(LOG_VVERB, "at done with p at %d", p); - log_hexdump(LOG_VVERB, r->pos, b->last - r->pos, "done and inspecting req %"PRIu64" " - "res %d type %d state %d", r->id, r->result, r->type, - r->dyn_parse_state); - log_hexdump(LOG_VVERB, b->start, b->last - b->start, "inspecting req %"PRIu64" " - "res %d type %d state %d", r->id, r->result, r->type, - r->dyn_parse_state); - } - - return true; + r->pos = p; + dmsg->source_address = r->owner->addr; + log_debug(LOG_DEBUG, + "MSG ID: %d, type: %d, secured %d, version %d, " + "same_dc %d, datalen %u, 
payload len: %u", + dmsg->id, dmsg->type, dmsg->flags & 0x1, dmsg->version, + dmsg->same_dc, dmsg->mlen, dmsg->plen); + + if (log_loggable(LOG_VVERB)) { + log_debug(LOG_VVERB, "at done with p at %d", p); + log_hexdump(LOG_VVERB, r->pos, b->last - r->pos, + "done and inspecting req %" PRIu64 + " " + "res %d type %d state %d", + r->id, r->result, r->type, r->dyn_parse_state); + log_hexdump(LOG_VVERB, b->start, b->last - b->start, + "inspecting req %" PRIu64 + " " + "res %d type %d state %d", + r->id, r->result, r->type, r->dyn_parse_state); + } + + return true; error: - log_debug(LOG_ERR, "at error for state %d and c %c", dyn_state, *p); - r->result = MSG_PARSE_ERROR; - r->pos = p; - errno = EINVAL; - - if (log_loggable(LOG_ERR)) { - log_hexdump(LOG_ERR, b->pos, mbuf_length(b), "parsed bad req %"PRIu64" " - "res %d type %d state %d", r->id, r->result, r->type, - dyn_state); - log_hexdump(LOG_ERR, p, b->last - p, "inspecting req %"PRIu64" " - "res %d type %d state %d", r->id, r->result, r->type, - dyn_state); - } - r->dyn_parse_state = dyn_state; - - return false; + log_debug(LOG_ERR, "at error for state %d and c %c", dyn_state, *p); + r->result = MSG_PARSE_ERROR; + r->pos = p; + errno = EINVAL; + + if (log_loggable(LOG_ERR)) { + log_hexdump(LOG_ERR, b->pos, mbuf_length(b), + "parsed bad req %" PRIu64 + " " + "res %d type %d state %d", + r->id, r->result, r->type, dyn_state); + log_hexdump(LOG_ERR, p, b->last - p, + "inspecting req %" PRIu64 + " " + "res %d type %d state %d", + r->id, r->result, r->type, dyn_state); + } + r->dyn_parse_state = dyn_state; + + return false; } -static void -data_store_parse_req(struct msg *r, const struct string *hash_tag) -{ - if (g_data_store == DATA_REDIS) { - return redis_parse_req(r, hash_tag); - } - else if (g_data_store == DATA_MEMCACHE){ - return memcache_parse_req(r, hash_tag); - } - else{ - ASSERT_LOG(false, "invalid datastore"); - exit(1); - } +static void data_store_parse_req(struct msg *r, const struct string *hash_tag) { + 
if (g_data_store == DATA_REDIS) { + return redis_parse_req(r, hash_tag); + } else if (g_data_store == DATA_MEMCACHE) { + return memcache_parse_req(r, hash_tag); + } else { + ASSERT_LOG(false, "invalid datastore"); + exit(1); + } } -void -dyn_parse_req(struct msg *r, const struct string *hash_tag) -{ - bool done_parsing = false; - struct mbuf *b = STAILQ_LAST(&r->mhdr, mbuf, next); - - if (dyn_parse_core(r)) { - struct dmsg *dmsg = r->dmsg; - struct conn *conn = r->owner; - conn->same_dc = !!dmsg->same_dc; - - if (dmsg->type != DMSG_UNKNOWN && dmsg->type != DMSG_REQ && - dmsg->type != DMSG_REQ_FORWARD && dmsg->type != GOSSIP_SYN) { - r->state = 0; - r->result = MSG_PARSE_OK; - r->dyn_parse_state = DYN_DONE; - return; - } - - if (r->dyn_parse_state == DYN_DONE && dmsg->flags == 1) { - dmsg->owner->owner->dnode_secured = 1; - r->owner->crypto_key_sent = 1; - r->dyn_parse_state = DYN_POST_DONE; - r->result = MSG_PARSE_REPAIR; - - if (dmsg->mlen > 1) { - //Decrypt AES key - dyn_rsa_decrypt(dmsg->data, aes_decrypted_buf); - memcpy(r->owner->aes_key, aes_decrypted_buf, AES_KEYLEN); - SCOPED_CHARPTR(encoded_aes_key) = base64_encode(r->owner->aes_key, AES_KEYLEN); - if (encoded_aes_key) - loga("AES decryption key: %s\n", (char*)encoded_aes_key); - } - - if (dmsg->plen + b->pos <= b->last) { - struct mbuf *decrypted_buf = mbuf_get(); - if (decrypted_buf == NULL) { - loga("Unable to obtain an mbuf for dnode msg's header!"); - r->result = MSG_OOM_ERROR; - return; - } - - dyn_aes_decrypt(b->pos, dmsg->plen, decrypted_buf, r->owner->aes_key); - decrypted_buf->flags |= MBUF_FLAGS_JUST_DECRYPTED; - - b->pos = b->pos + dmsg->plen; - r->pos = decrypted_buf->start; - mbuf_copy(decrypted_buf, b->pos, mbuf_length(b)); - - mbuf_insert(&r->mhdr, decrypted_buf); - mbuf_remove(&r->mhdr, b); - mbuf_put(b); - - r->mlen = mbuf_length(decrypted_buf); - - data_store_parse_req(r, hash_tag); - return; - - } - - //substract already received bytes - dmsg->plen -= (uint32_t)(b->last - b->pos); - - 
return; - } else if (r->dyn_parse_state == DYN_POST_DONE) { - struct mbuf *last_buf = STAILQ_LAST(&r->mhdr, mbuf, next); - if (last_buf->flags & MBUF_FLAGS_READ_FLIP) { - data_store_parse_req(r, hash_tag); - } else { - r->result = MSG_PARSE_AGAIN; - } - return; - } - - if (dmsg->type == GOSSIP_SYN) { - //TODOs: need to address multi-buffer msg later - dmsg->payload = b->pos; - - b->pos = b->pos + dmsg->plen; - r->pos = b->pos; - - done_parsing = true; - } - - if (done_parsing) - return; - - return data_store_parse_req(r, hash_tag); - } - - //bad case - if (log_loggable(LOG_VVERB)) { - log_debug(LOG_VVERB, "Bad or splitted message"); //fix me to do something - msg_dump(LOG_VVERB, r); - } - r->result = MSG_PARSE_AGAIN; -} - -static void -data_store_parse_rsp(struct msg *r, const struct string *hash_tag) -{ - if (g_data_store == DATA_REDIS) { - return redis_parse_rsp(r, hash_tag); - } - else if (g_data_store == DATA_MEMCACHE){ - return memcache_parse_rsp(r, hash_tag); +void dyn_parse_req(struct msg *r, const struct string *hash_tag) { + bool done_parsing = false; + struct mbuf *b = STAILQ_LAST(&r->mhdr, mbuf, next); + + if (dyn_parse_core(r)) { + struct dmsg *dmsg = r->dmsg; + struct conn *conn = r->owner; + conn->same_dc = !!dmsg->same_dc; + + if (dmsg->type != DMSG_UNKNOWN && dmsg->type != DMSG_REQ && + dmsg->type != DMSG_REQ_FORWARD && dmsg->type != GOSSIP_SYN) { + r->state = 0; + r->result = MSG_PARSE_OK; + r->dyn_parse_state = DYN_DONE; + return; } - else{ - ASSERT_LOG(false, "invalid datastore"); - exit(1); - } -} -void dyn_parse_rsp(struct msg *r, const struct string *UNUSED) -{ - if (log_loggable(LOG_VVERB)) { - log_debug(LOG_VVERB, ":::::::::::::::::::::: In dyn_parse_rsp, start to process response :::::::::::::::::::::::: "); - msg_dump(LOG_VVERB, r); - } - - bool done_parsing = false; - struct mbuf *b = STAILQ_LAST(&r->mhdr, mbuf, next); - if (dyn_parse_core(r)) { - struct dmsg *dmsg = r->dmsg; - struct conn *conn = r->owner; - conn->same_dc = 
!!dmsg->same_dc; - - if (dmsg->type != DMSG_UNKNOWN && dmsg->type != DMSG_RES) { - log_debug(LOG_DEBUG, "Resp parser: I got a dnode msg of type %d", dmsg->type); - r->state = 0; - r->result = MSG_PARSE_OK; - r->dyn_parse_state = DYN_DONE; - return; - } - - if (r->dyn_parse_state == DYN_DONE && dmsg->flags == 1) { - dmsg->owner->owner->dnode_secured = 1; - r->owner->crypto_key_sent = 1; - r->dyn_parse_state = DYN_POST_DONE; - r->result = MSG_PARSE_REPAIR; - - if (dmsg->mlen > 1) { - //Decrypt AES key - dyn_rsa_decrypt(dmsg->data, aes_decrypted_buf); - memcpy(r->owner->aes_key, aes_decrypted_buf, AES_KEYLEN); - } - - // we have received all the remaining ecrypted data - if (dmsg->plen + b->pos <= b->last) { - struct mbuf *decrypted_buf = mbuf_get(); - if (decrypted_buf == NULL) { - loga("Unable to obtain an mbuf for dnode msg's header!"); - r->result = MSG_OOM_ERROR; - return; - } - - dyn_aes_decrypt(b->pos, dmsg->plen, decrypted_buf, r->owner->aes_key); - decrypted_buf->flags |= MBUF_FLAGS_JUST_DECRYPTED; - - b->pos = b->pos + dmsg->plen; - r->pos = decrypted_buf->start; - mbuf_copy(decrypted_buf, b->pos, mbuf_length(b)); - - mbuf_insert(&r->mhdr, decrypted_buf); - mbuf_remove(&r->mhdr, b); - mbuf_put(b); - - r->mlen = mbuf_length(decrypted_buf); - - return data_store_parse_rsp(r, UNUSED); - } - - //Subtract already received bytes - dmsg->plen -= (uint32_t)(b->last - b->pos); - return; - - } else if (r->dyn_parse_state == DYN_POST_DONE) { - struct mbuf *last_buf = STAILQ_LAST(&r->mhdr, mbuf, next); - if (last_buf->flags & MBUF_FLAGS_READ_FLIP) { - data_store_parse_rsp(r, UNUSED); - } else { - r->result = MSG_PARSE_AGAIN; - } - return; - } - - if (done_parsing) - return; - - return data_store_parse_rsp(r, UNUSED); - } - - //bad case - if (log_loggable(LOG_DEBUG)) { - log_debug(LOG_DEBUG, "Resp: bad message - cannot parse"); //fix me to do something - msg_dump(LOG_DEBUG, r); - } - - r->result = MSG_PARSE_AGAIN; + if (r->dyn_parse_state == DYN_DONE && dmsg->flags == 1) 
{ + dmsg->owner->owner->dnode_secured = 1; + r->owner->crypto_key_sent = 1; + r->dyn_parse_state = DYN_POST_DONE; + r->result = MSG_PARSE_REPAIR; + + if (dmsg->mlen > 1) { + // Decrypt AES key + dyn_rsa_decrypt(dmsg->data, aes_decrypted_buf); + memcpy(r->owner->aes_key, aes_decrypted_buf, AES_KEYLEN); + if (log_loggable(LOG_VVERB)) { // dump key material only at the most verbose level + SCOPED_CHARPTR(encoded_aes_key) = + base64_encode(r->owner->aes_key, AES_KEYLEN); + if (encoded_aes_key) + loga("AES decryption key: %s\n", (char *)encoded_aes_key); + } + } -} + if (dmsg->plen + b->pos <= b->last) { + struct mbuf *decrypted_buf = mbuf_get(); + if (decrypted_buf == NULL) { + loga("Unable to obtain an mbuf for dnode msg's header!"); + r->result = MSG_OOM_ERROR; + return; + } + dyn_aes_decrypt(b->pos, dmsg->plen, decrypted_buf, r->owner->aes_key); + decrypted_buf->flags |= MBUF_FLAGS_JUST_DECRYPTED; -void -dmsg_free(struct dmsg *dmsg) -{ - log_debug(LOG_VVVERB, "free dmsg %p id %"PRIu64"", dmsg, dmsg->id); - dn_free(dmsg); -} + b->pos = b->pos + dmsg->plen; + r->pos = decrypted_buf->start; + mbuf_copy(decrypted_buf, b->pos, mbuf_length(b)); + mbuf_insert(&r->mhdr, decrypted_buf); + mbuf_remove(&r->mhdr, b); + mbuf_put(b); -void -dmsg_put(struct dmsg *dmsg) -{ - if (log_loggable(LOG_VVVERB)) { - log_debug(LOG_VVVERB, "put dmsg %p id %"PRIu64"", dmsg, dmsg->id); - } - TAILQ_INSERT_HEAD(&free_dmsgq, dmsg, m_tqe); -} - -void -dmsg_dump(struct dmsg *dmsg) -{ - log_debug(LOG_VVVERB, "dmsg dump: id %"PRIu64" version %d flags %d type %d len %"PRIu32" plen %"PRIu32" ", - dmsg->id, dmsg->version, dmsg->flags , dmsg->type, dmsg->mlen, dmsg->plen); -} + r->mlen = mbuf_length(decrypted_buf); + data_store_parse_req(r, hash_tag); + return; + } -void -dmsg_init(void) -{ - log_debug(LOG_VVVERB, "dmsg size %d", sizeof(struct dmsg)); + // substract already received bytes + dmsg->plen -= (uint32_t)(b->last - b->pos); - dmsg_id = 0; - TAILQ_INIT(&free_dmsgq); -} + return; + } else if (r->dyn_parse_state == DYN_POST_DONE) { + struct mbuf *last_buf = STAILQ_LAST(&r->mhdr, mbuf, 
next); + if (last_buf->flags & MBUF_FLAGS_READ_FLIP) { + data_store_parse_req(r, hash_tag); + } else { + r->result = MSG_PARSE_AGAIN; + } + return; + } + if (dmsg->type == GOSSIP_SYN) { + // TODOs: need to address multi-buffer msg later + dmsg->payload = b->pos; -void -dmsg_deinit(void) -{ - struct dmsg *dmsg, *ndmsg; + b->pos = b->pos + dmsg->plen; + r->pos = b->pos; - for (dmsg = TAILQ_FIRST(&free_dmsgq); dmsg != NULL; - dmsg = ndmsg) { - ASSERT(TAILQ_COUNT(&free_dmsgq) > 0); - ndmsg = TAILQ_NEXT(dmsg, m_tqe); - dmsg_free(dmsg); + done_parsing = true; } - ASSERT(TAILQ_COUNT(&free_dmsgq) == 0); -} + if (done_parsing) return; -bool -dmsg_empty(struct dmsg *dmsg) -{ - return dmsg->mlen == 0 ? true : false; -} - + return data_store_parse_req(r, hash_tag); + } -struct dmsg * -dmsg_get(void) -{ - struct dmsg *dmsg; + // bad case + if (log_loggable(LOG_VVERB)) { + log_debug(LOG_VVERB, "Bad or splitted message"); // fix me to do something + msg_dump(LOG_VVERB, r); + } + r->result = MSG_PARSE_AGAIN; +} - if (!TAILQ_EMPTY(&free_dmsgq)) { - ASSERT(TAILQ_COUNT(&free_dmsgq) > 0); +static void data_store_parse_rsp(struct msg *r, const struct string *hash_tag) { + if (g_data_store == DATA_REDIS) { + return redis_parse_rsp(r, hash_tag); + } else if (g_data_store == DATA_MEMCACHE) { + return memcache_parse_rsp(r, hash_tag); + } else { + ASSERT_LOG(false, "invalid datastore"); + exit(1); + } +} - dmsg = TAILQ_FIRST(&free_dmsgq); - TAILQ_REMOVE(&free_dmsgq, dmsg, m_tqe); - goto done; +void dyn_parse_rsp(struct msg *r, const struct string *UNUSED) { + if (log_loggable(LOG_VVERB)) { + log_debug(LOG_VVERB, + ":::::::::::::::::::::: In dyn_parse_rsp, start to process " + "response :::::::::::::::::::::::: "); + msg_dump(LOG_VVERB, r); + } + + bool done_parsing = false; + struct mbuf *b = STAILQ_LAST(&r->mhdr, mbuf, next); + if (dyn_parse_core(r)) { + struct dmsg *dmsg = r->dmsg; + struct conn *conn = r->owner; + conn->same_dc = !!dmsg->same_dc; + + if (dmsg->type != DMSG_UNKNOWN && 
dmsg->type != DMSG_RES) { + log_debug(LOG_DEBUG, "Resp parser: I got a dnode msg of type %d", + dmsg->type); + r->state = 0; + r->result = MSG_PARSE_OK; + r->dyn_parse_state = DYN_DONE; + return; } - dmsg = dn_alloc(sizeof(*dmsg)); - if (dmsg == NULL) { - return NULL; - } + if (r->dyn_parse_state == DYN_DONE && dmsg->flags == 1) { + dmsg->owner->owner->dnode_secured = 1; + r->owner->crypto_key_sent = 1; + r->dyn_parse_state = DYN_POST_DONE; + r->result = MSG_PARSE_REPAIR; -done: - // STAILQ_INIT(&dmsg->mhdr); - dmsg->mlen = 0; - dmsg->data = NULL; - - dmsg->plen = 0; - dmsg->payload = NULL; - - dmsg->type = DMSG_UNKNOWN; - dmsg->version = VERSION_10; - dmsg->id = 0; - dmsg->source_address = NULL; - dmsg->owner = NULL; - dmsg->flags = 0; - dmsg->same_dc = 1; - - return dmsg; -} + if (dmsg->mlen > 1) { + // Decrypt AES key + dyn_rsa_decrypt(dmsg->data, aes_decrypted_buf); + memcpy(r->owner->aes_key, aes_decrypted_buf, AES_KEYLEN); + } + // we have received all the remaining ecrypted data + if (dmsg->plen + b->pos <= b->last) { + struct mbuf *decrypted_buf = mbuf_get(); + if (decrypted_buf == NULL) { + loga("Unable to obtain an mbuf for dnode msg's header!"); + r->result = MSG_OOM_ERROR; + return; + } -rstatus_t -dmsg_write(struct mbuf *mbuf, uint64_t msg_id, uint8_t type, - struct conn *conn, uint32_t payload_len) -{ + dyn_aes_decrypt(b->pos, dmsg->plen, decrypted_buf, r->owner->aes_key); + decrypted_buf->flags |= MBUF_FLAGS_JUST_DECRYPTED; - mbuf_write_string(mbuf, &MAGIC_STR); - mbuf_write_uint64(mbuf, msg_id); + b->pos = b->pos + dmsg->plen; + r->pos = decrypted_buf->start; + mbuf_copy(decrypted_buf, b->pos, mbuf_length(b)); - //type - mbuf_write_char(mbuf, ' '); - mbuf_write_uint8(mbuf, type); + mbuf_insert(&r->mhdr, decrypted_buf); + mbuf_remove(&r->mhdr, b); + mbuf_put(b); - //bit field - mbuf_write_char(mbuf, ' '); - //encryption bit - uint8_t flags = 0; - - if (conn->dnode_secured) { - flags |= 0x1; - } - mbuf_write_uint8(mbuf, flags); + r->mlen = 
mbuf_length(decrypted_buf); - //version - mbuf_write_char(mbuf, ' '); - mbuf_write_uint8(mbuf, version); + return data_store_parse_rsp(r, UNUSED); + } - //same-dc - mbuf_write_char(mbuf, ' '); - if (conn->same_dc) - mbuf_write_uint8(mbuf, 1); - else - mbuf_write_uint8(mbuf, 0); + // Subtract already received bytes + dmsg->plen -= (uint32_t)(b->last - b->pos); + return; - //data - mbuf_write_char(mbuf, ' '); - mbuf_write_char(mbuf, '*'); - - //write aes key - unsigned char *aes_key = conn->aes_key; - - if (conn->dnode_secured && !conn->crypto_key_sent) { - mbuf_write_uint32(mbuf, (uint32_t)dyn_rsa_size()); - //payload - mbuf_write_char(mbuf, ' '); - dyn_rsa_encrypt(aes_key, aes_encrypted_buf); - mbuf_write_bytes(mbuf, aes_encrypted_buf, dyn_rsa_size()); - conn->crypto_key_sent = 1; - } else { - mbuf_write_uint32(mbuf, 1); - //payload - mbuf_write_char(mbuf, ' '); - mbuf_write_char(mbuf, 'd'); //TODOs: replace with another string + } else if (r->dyn_parse_state == DYN_POST_DONE) { + struct mbuf *last_buf = STAILQ_LAST(&r->mhdr, mbuf, next); + if (last_buf->flags & MBUF_FLAGS_READ_FLIP) { + data_store_parse_rsp(r, UNUSED); + } else { + r->result = MSG_PARSE_AGAIN; + } + return; } - mbuf_write_char(mbuf, ' '); - mbuf_write_char(mbuf, '*'); - mbuf_write_uint32(mbuf, payload_len); - mbuf_write_string(mbuf, &CRLF_STR); - - return DN_OK; -} + if (done_parsing) return; -//Used in gossip forwarding msg only for now -rstatus_t -dmsg_write_mbuf(struct mbuf *mbuf, uint64_t msg_id, uint8_t type, struct conn *conn, uint32_t plen) -{ - mbuf_write_string(mbuf, &MAGIC_STR); - mbuf_write_uint64(mbuf, msg_id); - mbuf_write_char(mbuf, ' '); - mbuf_write_uint8(mbuf, type); - mbuf_write_char(mbuf, ' '); + return data_store_parse_rsp(r, UNUSED); + } - //encryption bit - if (conn->dnode_secured) { - mbuf_write_uint8(mbuf, 1); - } else { - mbuf_write_uint8(mbuf, 0); - } + // bad case + if (log_loggable(LOG_DEBUG)) { + log_debug(LOG_DEBUG, + "Resp: bad message - cannot parse"); // fix me to 
do something + msg_dump(LOG_DEBUG, r); + } - mbuf_write_char(mbuf, ' '); - mbuf_write_uint8(mbuf, version); + r->result = MSG_PARSE_AGAIN; +} - //same-dc - mbuf_write_char(mbuf, ' '); - if (conn->same_dc) - mbuf_write_uint8(mbuf, 1); - else - mbuf_write_uint8(mbuf, 0); +void dmsg_free(struct dmsg *dmsg) { + log_debug(LOG_VVVERB, "free dmsg %p id %" PRIu64 "", dmsg, dmsg->id); + dn_free(dmsg); +} - //mbuf_write_string(mbuf, &CRLF_STR); - mbuf_write_char(mbuf, ' '); - mbuf_write_char(mbuf, '*'); +void dmsg_put(struct dmsg *dmsg) { + if (log_loggable(LOG_VVVERB)) { + log_debug(LOG_VVVERB, "put dmsg %p id %" PRIu64 "", dmsg, dmsg->id); + } + TAILQ_INSERT_HEAD(&free_dmsgq, dmsg, m_tqe); +} - //write aes key - unsigned char *aes_key = conn->aes_key; - if (conn->dnode_secured) { - mbuf_write_uint32(mbuf, (uint32_t)dyn_rsa_size()); - } else { - mbuf_write_uint32(mbuf, 1); - } +void dmsg_dump(struct dmsg *dmsg) { + log_debug(LOG_VVVERB, + "dmsg dump: id %" PRIu64 + " version %d flags %d type %d len %" PRIu32 " plen %" PRIu32 " ", + dmsg->id, dmsg->version, dmsg->flags, dmsg->type, dmsg->mlen, + dmsg->plen); +} - mbuf_write_char(mbuf, ' '); - //mbuf_write_mbuf(mbuf, data); - if (conn->dnode_secured) { - dyn_rsa_encrypt(aes_key, aes_encrypted_buf); - mbuf_write_bytes(mbuf, aes_encrypted_buf, dyn_rsa_size()); - } else { - mbuf_write_char(mbuf, 'a'); //TODOs: replace with another string - } +void dmsg_init(void) { + log_debug(LOG_VVVERB, "dmsg size %d", sizeof(struct dmsg)); - //mbuf_write_string(mbuf, &CRLF_STR); - mbuf_write_char(mbuf, ' '); - mbuf_write_char(mbuf, '*'); - mbuf_write_uint32(mbuf, plen); + dmsg_id = 0; + TAILQ_INIT(&free_dmsgq); +} - mbuf_write_string(mbuf, &CRLF_STR); +void dmsg_deinit(void) { + struct dmsg *dmsg, *ndmsg; - return DN_OK; + for (dmsg = TAILQ_FIRST(&free_dmsgq); dmsg != NULL; dmsg = ndmsg) { + ASSERT(TAILQ_COUNT(&free_dmsgq) > 0); + ndmsg = TAILQ_NEXT(dmsg, m_tqe); + dmsg_free(dmsg); + } + ASSERT(TAILQ_COUNT(&free_dmsgq) == 0); } +bool 
dmsg_empty(struct dmsg *dmsg) { return dmsg->mlen == 0 ? true : false; } -static rstatus_t -dmsg_to_gossip(struct ring_msg *rmsg) -{ - CBUF_Push(C2G_InQ, rmsg); +struct dmsg *dmsg_get(void) { + struct dmsg *dmsg; - return DN_OK; -} + if (!TAILQ_EMPTY(&free_dmsgq)) { + ASSERT(TAILQ_COUNT(&free_dmsgq) > 0); -static void -dmsg_parse_host_id(uint8_t *start, uint32_t len, - struct string *dc, struct string *rack, struct dyn_token *token) -{ - uint8_t *p, *q; - uint8_t *dc_p, *rack_p, *token_p; - uint32_t k, delimlen, dc_len, rack_len, token_len; - char delim[] = "$$"; - delimlen = 2; - - /* parse "dc$rack$token : don't support vnode for now */ - p = start + len - 1; - dc_p = NULL; - rack_p = NULL; - token_p = NULL; - - dc_len = rack_len = token_len = 0; - - for (k = 0; k < sizeof(delim)-1; k++) { - q = dn_strrchr(p, start, delim[k]); + dmsg = TAILQ_FIRST(&free_dmsgq); + TAILQ_REMOVE(&free_dmsgq, dmsg, m_tqe); + goto done; + } - switch (k) { - case 0: - //no support for vnode at this time - token_p = q + 1; - token_len = (uint32_t)(p - token_p + 1); - parse_dyn_token(token_p, token_len, token); - break; - case 1: - rack_p = q + 1; - rack_len = (uint32_t)(p - rack_p + 1); + dmsg = dn_alloc(sizeof(*dmsg)); + if (dmsg == NULL) { + return NULL; + } - string_copy(rack, rack_p, rack_len); - break; +done: + // STAILQ_INIT(&dmsg->mhdr); + dmsg->mlen = 0; + dmsg->data = NULL; + + dmsg->plen = 0; + dmsg->payload = NULL; + + dmsg->type = DMSG_UNKNOWN; + dmsg->version = VERSION_10; + dmsg->id = 0; + dmsg->source_address = NULL; + dmsg->owner = NULL; + dmsg->flags = 0; + dmsg->same_dc = 1; + + return dmsg; +} - default: - NOT_REACHED(); - } - p = q - 1; - } +rstatus_t dmsg_write(struct mbuf *mbuf, uint64_t msg_id, uint8_t type, + struct conn *conn, uint32_t payload_len) { + mbuf_write_string(mbuf, &MAGIC_STR); + mbuf_write_uint64(mbuf, msg_id); + + // type + mbuf_write_char(mbuf, ' '); + mbuf_write_uint8(mbuf, type); + + // bit field + mbuf_write_char(mbuf, ' '); + // encryption bit 
+ uint8_t flags = 0; + + if (conn->dnode_secured) { + flags |= 0x1; + } + mbuf_write_uint8(mbuf, flags); + + // version + mbuf_write_char(mbuf, ' '); + mbuf_write_uint8(mbuf, version); + + // same-dc + mbuf_write_char(mbuf, ' '); + if (conn->same_dc) + mbuf_write_uint8(mbuf, 1); + else + mbuf_write_uint8(mbuf, 0); + + // data + mbuf_write_char(mbuf, ' '); + mbuf_write_char(mbuf, '*'); + + // write aes key + unsigned char *aes_key = conn->aes_key; + + if (conn->dnode_secured && !conn->crypto_key_sent) { + mbuf_write_uint32(mbuf, (uint32_t)dyn_rsa_size()); + // payload + mbuf_write_char(mbuf, ' '); + dyn_rsa_encrypt(aes_key, aes_encrypted_buf); + mbuf_write_bytes(mbuf, aes_encrypted_buf, dyn_rsa_size()); + conn->crypto_key_sent = 1; + } else { + mbuf_write_uint32(mbuf, 1); + // payload + mbuf_write_char(mbuf, ' '); + mbuf_write_char(mbuf, 'd'); // TODOs: replace with another string + } - if (k != delimlen) { - loga("Error: this should not happen"); - return;// DN_ERROR; - } + mbuf_write_char(mbuf, ' '); + mbuf_write_char(mbuf, '*'); + mbuf_write_uint32(mbuf, payload_len); + mbuf_write_string(mbuf, &CRLF_STR); - dc_p = start; - dc_len = len - (token_len + rack_len + 2); - string_copy(dc, dc_p, dc_len); + return DN_OK; +} +// Used in gossip forwarding msg only for now +rstatus_t dmsg_write_mbuf(struct mbuf *mbuf, uint64_t msg_id, uint8_t type, + struct conn *conn, uint32_t plen) { + mbuf_write_string(mbuf, &MAGIC_STR); + mbuf_write_uint64(mbuf, msg_id); + mbuf_write_char(mbuf, ' '); + mbuf_write_uint8(mbuf, type); + mbuf_write_char(mbuf, ' '); + + // encryption bit + if (conn->dnode_secured) { + mbuf_write_uint8(mbuf, 1); + } else { + mbuf_write_uint8(mbuf, 0); + } + + mbuf_write_char(mbuf, ' '); + mbuf_write_uint8(mbuf, version); + + // same-dc + mbuf_write_char(mbuf, ' '); + if (conn->same_dc) + mbuf_write_uint8(mbuf, 1); + else + mbuf_write_uint8(mbuf, 0); + + // mbuf_write_string(mbuf, &CRLF_STR); + mbuf_write_char(mbuf, ' '); + mbuf_write_char(mbuf, '*'); + + // 
write aes key + unsigned char *aes_key = conn->aes_key; + if (conn->dnode_secured) { + mbuf_write_uint32(mbuf, (uint32_t)dyn_rsa_size()); + } else { + mbuf_write_uint32(mbuf, 1); + } + + mbuf_write_char(mbuf, ' '); + // mbuf_write_mbuf(mbuf, data); + if (conn->dnode_secured) { + dyn_rsa_encrypt(aes_key, aes_encrypted_buf); + mbuf_write_bytes(mbuf, aes_encrypted_buf, dyn_rsa_size()); + } else { + mbuf_write_char(mbuf, 'a'); // TODOs: replace with another string + } + + // mbuf_write_string(mbuf, &CRLF_STR); + mbuf_write_char(mbuf, ' '); + mbuf_write_char(mbuf, '*'); + mbuf_write_uint32(mbuf, plen); + + mbuf_write_string(mbuf, &CRLF_STR); + + return DN_OK; } +static rstatus_t dmsg_to_gossip(struct ring_msg *rmsg) { + CBUF_Push(C2G_InQ, rmsg); -static struct ring_msg * -dmsg_parse(struct dmsg *dmsg) -{ - //rstatus_t status; - uint8_t *p, *q, *start, *end, *pipe_p; - uint8_t *host_id, *host_addr, *ts, *node_state; - uint32_t k, delimlen, host_id_len, host_addr_len, ts_len, node_state_len; - char delim[] = ",,,"; - delimlen = 3; - - - /* parse "host_id1,generation_ts1,host_state1,host_broadcast_address1|host_id2,generation_ts2,host_state2,host_broadcast_address2" */ - /* host_id = dc-rack-token */ - //p = dmsg->data + dmsg->mlen - 1; - //p = dmsg->owner->pos + dmsg->owner->mlen - 1; - p = dmsg->payload + dmsg->plen - 1; - end = p; - - //start = dmsg->data; - //start = dmsg->owner->pos; - start = dmsg->payload; - - host_id = NULL; - host_addr = NULL; - ts = NULL; - node_state = NULL; - - host_id_len = 0; - host_addr_len = 0; - ts_len = 0; - node_state_len = 0; - pipe_p = start; - uint32_t count = 0; - - do { - q = dn_strrchr(p, start, '|'); - count++; - p = q - 1; - } while (q != NULL); + return DN_OK; +} - struct ring_msg *ring_msg = create_ring_msg_with_size(count, true); - if (ring_msg == NULL) { - log_debug(LOG_ERR, "Error: unable to create a new ring msg!"); - //we just drop this msg - return NULL; - } +static void dmsg_parse_host_id(uint8_t *start, uint32_t len, 
struct string *dc, + struct string *rack, struct dyn_token *token) { + uint8_t *p, *q; + uint8_t *dc_p, *rack_p, *token_p; + uint32_t k, delimlen, dc_len, rack_len, token_len; + char delim[] = "$$"; + delimlen = 2; - struct server_pool *sp = (struct server_pool *) dmsg->owner->owner->owner; - ring_msg->sp = sp; + /* parse "dc$rack$token : don't support vnode for now */ + p = start + len - 1; + dc_p = NULL; + rack_p = NULL; + token_p = NULL; - ring_msg->cb = gossip_msg_peer_update; + dc_len = rack_len = token_len = 0; - count = 0; - //p = dmsg->data + dmsg->mlen - 1; - p = dmsg->payload + dmsg->plen - 1; + for (k = 0; k < sizeof(delim) - 1; k++) { + q = dn_strrchr(p, start, delim[k]); - do { + switch (k) { + case 0: + // no support for vnode at this time + token_p = q + 1; + token_len = (uint32_t)(p - token_p + 1); + parse_dyn_token(token_p, token_len, token); + break; + case 1: + rack_p = q + 1; + rack_len = (uint32_t)(p - rack_p + 1); - for (k = 0; k < sizeof(delim)-1; k++) { - q = dn_strrchr(p, start, delim[k]); + string_copy(rack, rack_p, rack_len); + break; - switch (k) { - case 0: - host_addr = q + 1; - host_addr_len = (uint32_t)(p - host_addr + 1); - break; - case 1: - node_state = q + 1; - node_state_len = (uint32_t)(p - node_state + 1); + default: + NOT_REACHED(); + } + p = q - 1; + } - break; - case 2: - ts = q + 1; - ts_len = (uint32_t)(p - ts + 1); + if (k != delimlen) { + loga("Error: this should not happen"); + return; // DN_ERROR; + } - break; + dc_p = start; + dc_len = len - (token_len + rack_len + 2); + string_copy(dc, dc_p, dc_len); +} - default: - NOT_REACHED(); - } - p = q - 1; +static struct ring_msg *dmsg_parse(struct dmsg *dmsg) { + // rstatus_t status; + uint8_t *p, *q, *start, *end, *pipe_p; + uint8_t *host_id, *host_addr, *ts, *node_state; + uint32_t k, delimlen, host_id_len, host_addr_len, ts_len, node_state_len; + char delim[] = ",,,"; + delimlen = 3; + + /* parse + * 
"host_id1,generation_ts1,host_state1,host_broadcast_address1|host_id2,generation_ts2,host_state2,host_broadcast_address2" + */ + /* host_id = dc-rack-token */ + // p = dmsg->data + dmsg->mlen - 1; + // p = dmsg->owner->pos + dmsg->owner->mlen - 1; + p = dmsg->payload + dmsg->plen - 1; + end = p; + + // start = dmsg->data; + // start = dmsg->owner->pos; + start = dmsg->payload; + + host_id = NULL; + host_addr = NULL; + ts = NULL; + node_state = NULL; + + host_id_len = 0; + host_addr_len = 0; + ts_len = 0; + node_state_len = 0; + pipe_p = start; + uint32_t count = 0; + + do { + q = dn_strrchr(p, start, '|'); + count++; + p = q - 1; + } while (q != NULL); + + struct ring_msg *ring_msg = create_ring_msg_with_size(count, true); + if (ring_msg == NULL) { + log_debug(LOG_ERR, "Error: unable to create a new ring msg!"); + // we just drop this msg + return NULL; + } + + struct server_pool *sp = (struct server_pool *)dmsg->owner->owner->owner; + ring_msg->sp = sp; + + ring_msg->cb = gossip_msg_peer_update; + + count = 0; + // p = dmsg->data + dmsg->mlen - 1; + p = dmsg->payload + dmsg->plen - 1; + + do { + for (k = 0; k < sizeof(delim) - 1; k++) { + q = dn_strrchr(p, start, delim[k]); - } + switch (k) { + case 0: + host_addr = q + 1; + host_addr_len = (uint32_t)(p - host_addr + 1); + break; + case 1: + node_state = q + 1; + node_state_len = (uint32_t)(p - node_state + 1); - if (k != delimlen) { - loga("Error: this is insanely bad"); - return NULL;// DN_ERROR; - } + break; + case 2: + ts = q + 1; + ts_len = (uint32_t)(p - ts + 1); - pipe_p = dn_strrchr(p, start, '|'); + break; - if (pipe_p == NULL) { - pipe_p = start; - } else { - pipe_p = pipe_p + 1; - p = pipe_p - 2; + default: + NOT_REACHED(); } + p = q - 1; + } - //host_id = dmsg->data; - //host_id_len = dmsg->mlen - (host_addr_len + node_state_len + ts_len + 3); - host_id = pipe_p; - host_id_len = (uint32_t)(end - pipe_p - (host_addr_len + node_state_len + ts_len + 3) - + 1); + if (k != delimlen) { + loga("Error: this is 
insanely bad"); + return NULL; // DN_ERROR; + } - end = p; + pipe_p = dn_strrchr(p, start, '|'); + if (pipe_p == NULL) { + pipe_p = start; + } else { + pipe_p = pipe_p + 1; + p = pipe_p - 2; + } - struct gossip_node *rnode = (struct gossip_node *) array_get(&ring_msg->nodes, count); - dmsg_parse_host_id(host_id, host_id_len, &rnode->dc, &rnode->rack, &rnode->token); + // host_id = dmsg->data; + // host_id_len = dmsg->mlen - (host_addr_len + node_state_len + ts_len + 3); + host_id = pipe_p; + host_id_len = (uint32_t)(end - pipe_p - + (host_addr_len + node_state_len + ts_len + 3) + 1); + end = p; - string_copy(&rnode->name, host_addr, host_addr_len); - string_copy(&rnode->pname, host_addr, host_addr_len); //need to add port + struct gossip_node *rnode = + (struct gossip_node *)array_get(&ring_msg->nodes, count); + dmsg_parse_host_id(host_id, host_id_len, &rnode->dc, &rnode->rack, + &rnode->token); - rnode->port = sp->dnode_proxy_endpoint.port; - rnode->is_local = false; + string_copy(&rnode->name, host_addr, host_addr_len); + string_copy(&rnode->pname, host_addr, host_addr_len); // need to add port - ts[ts_len] = '\0'; - rnode->ts = (uint64_t)atol((char*)ts); + rnode->port = sp->dnode_proxy_endpoint.port; + rnode->is_local = false; - node_state[node_state_len] = '\0'; - rnode->state = (uint8_t) atoi((char*)node_state); + ts[ts_len] = '\0'; + rnode->ts = (uint64_t)atol((char *)ts); - count++; - } while (pipe_p != start); + node_state[node_state_len] = '\0'; + rnode->state = (uint8_t)atoi((char *)node_state); - //TODOs: should move this outside - dmsg_to_gossip(ring_msg); + count++; + } while (pipe_p != start); - return ring_msg; -} + // TODOs: should move this outside + dmsg_to_gossip(ring_msg); + return ring_msg; +} /* * return : true to bypass all processing from the stack * false to go through the whole stack to process a given msg */ -bool -dmsg_process(struct context *ctx, struct conn *conn, struct dmsg *dmsg) -{ - ASSERT(dmsg != NULL); - ASSERT(conn->dyn_mode); 
- - switch(dmsg->type) { - - case GOSSIP_SYN: - log_debug(LOG_DEBUG, "I have got a GOSSIP_SYN!!!!!!"); - dmsg_dump(dmsg); - //TODOs: fix this to reply the 1st time sender - //dnode_rsp_gos_syn(ctx, conn, dmsg->owner); - dmsg_parse(dmsg); - return true; - - case CRYPTO_HANDSHAKE: - log_debug(LOG_DEBUG, "I have a crypto handshake msg and processing it now"); - //TODOs: will work on this to optimize the performance - return true; - - case GOSSIP_SYN_REPLY: - log_debug(LOG_DEBUG, "I have got a GOSSIP_SYN_REPLY!!!!!!"); - - return true; - default: - log_debug(LOG_DEBUG, "nothing to do"); - } - - return false; +bool dmsg_process(struct context *ctx, struct conn *conn, struct dmsg *dmsg) { + ASSERT(dmsg != NULL); + ASSERT(conn->dyn_mode); + + switch (dmsg->type) { + case GOSSIP_SYN: + log_debug(LOG_DEBUG, "I have got a GOSSIP_SYN!!!!!!"); + dmsg_dump(dmsg); + // TODOs: fix this to reply the 1st time sender + // dnode_rsp_gos_syn(ctx, conn, dmsg->owner); + dmsg_parse(dmsg); + return true; + + case CRYPTO_HANDSHAKE: + log_debug(LOG_DEBUG, + "I have a crypto handshake msg and processing it now"); + // TODOs: will work on this to optimize the performance + return true; + + case GOSSIP_SYN_REPLY: + log_debug(LOG_DEBUG, "I have got a GOSSIP_SYN_REPLY!!!!!!"); + + return true; + default: + log_debug(LOG_DEBUG, "nothing to do"); + } + + return false; } diff --git a/src/dyn_dnode_msg.h b/src/dyn_dnode_msg.h index 169efc409..9cb91c5bc 100644 --- a/src/dyn_dnode_msg.h +++ b/src/dyn_dnode_msg.h @@ -1,86 +1,82 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ - -#include "dyn_core.h" - + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. 
+ */ #ifndef _DYN_DNODE_MSG_H_ #define _DYN_DNODE_MSG_H_ +#include -typedef enum dmsg_version { - VERSION_10 = 1 -} dmsg_version_t; +#include "dyn_queue.h" +#include "dyn_types.h" + +typedef enum dmsg_version { VERSION_10 = 1 } dmsg_version_t; typedef enum { - DYN_START = 0, - DYN_MAGIC_STRING = 1000, - DYN_MSG_ID, - DYN_TYPE_ID, - DYN_BIT_FIELD, - DYN_VERSION, - DYN_SAME_DC, - DYN_STAR, - DYN_DATA_LEN, - DYN_DATA, - DYN_SPACES_BEFORE_PAYLOAD_LEN, - DYN_PAYLOAD_LEN, - DYN_CRLF_BEFORE_DONE, - DYN_DONE, - DYN_POST_DONE, - DYN_UNKNOWN + DYN_START = 0, + DYN_MAGIC_STRING = 1000, + DYN_MSG_ID, + DYN_TYPE_ID, + DYN_BIT_FIELD, + DYN_VERSION, + DYN_SAME_DC, + DYN_STAR, + DYN_DATA_LEN, + DYN_DATA, + DYN_SPACES_BEFORE_PAYLOAD_LEN, + DYN_PAYLOAD_LEN, + DYN_CRLF_BEFORE_DONE, + DYN_DONE, + DYN_POST_DONE, + DYN_UNKNOWN } dyn_parse_state_t; typedef enum dmsg_type { - DMSG_UNKNOWN = 0, - DMSG_DEBUG, - DMSG_PARSE_ERROR, - DMSG_REQ, - DMSG_REQ_FORWARD, - DMSG_RES, - CRYPTO_HANDSHAKE, - GOSSIP_SYN, - GOSSIP_SYN_REPLY, - GOSSIP_ACK, - GOSSIP_DIGEST_SYN, - GOSSIP_DIGEST_ACK, - GOSSIP_DIGEST_ACK2, - GOSSIP_SHUTDOWN + DMSG_UNKNOWN = 0, + DMSG_DEBUG, + DMSG_PARSE_ERROR, + DMSG_REQ, + DMSG_REQ_FORWARD, + DMSG_RES, + CRYPTO_HANDSHAKE, + GOSSIP_SYN, + GOSSIP_SYN_REPLY, + GOSSIP_ACK, + GOSSIP_DIGEST_SYN, + GOSSIP_DIGEST_ACK, + GOSSIP_DIGEST_ACK2, + GOSSIP_SHUTDOWN } dmsg_type_t; - struct dval { - uint8_t type; - uint32_t len; /* length */ - uint8_t *data; /* data */ + uint8_t type; + uint32_t len; /* length */ + uint8_t *data; /* data */ }; - struct dmsg { - TAILQ_ENTRY(dmsg) m_tqe; /* link in free q */ - struct msg *owner; - - uint64_t id; /* message id */ - dmsg_type_t type; /* message type */ - uint8_t flags; /* bits to indicate encryption or decryption. Right most bit indicates encryption. 
- 2nd right most bit indicates compression */ - dmsg_version_t version; /* version of the message sender */ - uint8_t same_dc; /* indicate it is an inter_dc */ - - struct sockaddr *source_address; /* source ip */ - uint32_t mlen; /* length */ - uint8_t *data; /* data */ - - uint32_t plen; /* payload length */ - uint8_t *payload; /* pointer to payload */ + TAILQ_ENTRY(dmsg) m_tqe; /* link in free q */ + struct msg *owner; + + uint64_t id; /* message id */ + dmsg_type_t type; /* message type */ + uint8_t flags; /* bits to indicate encryption or decryption. Right most bit + indicates encryption. 2nd right most bit indicates + compression */ + dmsg_version_t version; /* version of the message sender */ + uint8_t same_dc; /* indicate it is an inter_dc */ + + struct sockaddr *source_address; /* source ip */ + uint32_t mlen; /* length */ + uint8_t *data; /* data */ + + uint32_t plen; /* payload length */ + uint8_t *payload; /* pointer to payload */ }; - TAILQ_HEAD(dmsg_tqh, dmsg); - void dyn_parse_req(struct msg *r, const struct string *hash_tag); void dyn_parse_rsp(struct msg *r, const struct string *UNUSED); @@ -92,10 +88,10 @@ void dmsg_deinit(void); bool dmsg_empty(struct dmsg *msg); struct dmsg *dmsg_get(void); rstatus_t dmsg_write(struct mbuf *mbuf, uint64_t msg_id, uint8_t type, - struct conn *conn, uint32_t payload_len); + struct conn *conn, uint32_t payload_len); rstatus_t dmsg_write_mbuf(struct mbuf *mbuf, uint64_t msg_id, uint8_t type, - struct conn *conn, uint32_t plen); + struct conn *conn, uint32_t plen); bool dmsg_process(struct context *ctx, struct conn *conn, struct dmsg *dmsg); #endif diff --git a/src/dyn_dnode_peer.c b/src/dyn_dnode_peer.c index 2e2e7e261..f134bd924 100644 --- a/src/dyn_dnode_peer.c +++ b/src/dyn_dnode_peer.c @@ -1,379 +1,359 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. 
- */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ -#include #include +#include #include -#include "dyn_core.h" #include "dyn_conf.h" -#include "dyn_server.h" +#include "dyn_core.h" #include "dyn_dnode_peer.h" #include "dyn_node_snitch.h" +#include "dyn_server.h" #include "dyn_task.h" #include "dyn_token.h" #include "dyn_vnode.h" static rstatus_t dnode_peer_pool_update(struct server_pool *pool); -static void -dnode_peer_ref(struct conn *conn, void *owner) -{ - struct node *peer = owner; +static void dnode_peer_ref(struct conn *conn, void *owner) { + struct node *peer = owner; - ASSERT(conn->type == CONN_DNODE_PEER_SERVER); - ASSERT(conn->owner == NULL); + ASSERT(conn->type == CONN_DNODE_PEER_SERVER); + ASSERT(conn->owner == NULL); - conn->family = peer->endpoint.family; - conn->addrlen = peer->endpoint.addrlen; - conn->addr = peer->endpoint.addr; - string_duplicate(&conn->pname, &peer->endpoint.pname); + conn->family = peer->endpoint.family; + conn->addrlen = peer->endpoint.addrlen; + conn->addr = peer->endpoint.addr; + string_duplicate(&conn->pname, &peer->endpoint.pname); - conn->owner = peer; + conn->owner = peer; - conn->dnode_secured = peer->is_secure; - conn->crypto_key_sent = 0; - conn->same_dc = peer->is_same_dc; + conn->dnode_secured = peer->is_secure; + conn->crypto_key_sent = 0; + conn->same_dc = peer->is_same_dc; - if (log_loggable(LOG_VVERB)) { - log_debug(LOG_VVERB, "dyn: ref peer conn %p owner %p into '%.*s", conn, peer, - peer->endpoint.pname.len, peer->endpoint.pname.data); - } + if (log_loggable(LOG_VVERB)) { + log_debug(LOG_VVERB, "dyn: ref peer conn %p owner %p into '%.*s", conn, + peer, peer->endpoint.pname.len, peer->endpoint.pname.data); + } } -static void -dnode_peer_unref(struct conn *conn) -{ - struct node *peer; - - ASSERT(conn->type == CONN_DNODE_PEER_SERVER); - ASSERT(conn->owner != NULL); - conn_event_del_conn(conn); - - peer = conn->owner; - 
conn->owner = NULL; - // if this is the last connection, mark the peer as down. - if (conn_pool_active_count(peer->conn_pool) == 1) { - log_notice("Marking %s as down", print_obj(peer)); - peer->state = DOWN; - } +static void dnode_peer_unref(struct conn *conn) { + struct node *peer; + + ASSERT(conn->type == CONN_DNODE_PEER_SERVER); + ASSERT(conn->owner != NULL); + conn_event_del_conn(conn); - log_debug(LOG_VVERB, "dyn: unref peer conn %p owner %p from '%.*s'", conn, peer, - peer->endpoint.pname.len, peer->endpoint.pname.data); + peer = conn->owner; + conn->owner = NULL; + // if this is the last connection, mark the peer as down. + if (conn_pool_active_count(peer->conn_pool) == 1) { + log_notice("Marking %s as down", print_obj(peer)); + peer->state = DOWN; + } + + log_debug(LOG_VVERB, "dyn: unref peer conn %p owner %p from '%.*s'", conn, + peer, peer->endpoint.pname.len, peer->endpoint.pname.data); } -msec_t -dnode_peer_timeout(struct msg *req, struct conn *conn) -{ - ASSERT(conn->type == CONN_DNODE_PEER_SERVER); +msec_t dnode_peer_timeout(struct msg *req, struct conn *conn) { + ASSERT(conn->type == CONN_DNODE_PEER_SERVER); - struct node *peer = conn->owner; - struct server_pool *pool = peer->owner; - msec_t additional_timeout = 0; + struct node *peer = conn->owner; + struct server_pool *pool = peer->owner; + msec_t additional_timeout = 0; - if (peer->is_same_dc) - additional_timeout = 200; - else - additional_timeout = 5000; + if (peer->is_same_dc) + additional_timeout = 200; + else + additional_timeout = 5000; - if (!req->is_read) //make sure write request has a longer timeout so we almost never want to drop it - additional_timeout += 20000; + if (!req->is_read) // make sure write request has a longer timeout so we + // almost never want to drop it + additional_timeout += 20000; - return pool->timeout + additional_timeout; + return pool->timeout + additional_timeout; } -static bool -dnode_peer_active(struct conn *conn) -{ - ASSERT(conn->type == 
CONN_DNODE_PEER_SERVER); +static bool dnode_peer_active(struct conn *conn) { + ASSERT(conn->type == CONN_DNODE_PEER_SERVER); - if (!TAILQ_EMPTY(&conn->imsg_q)) { - log_debug(LOG_VVERB, "dyn: s %d is active", conn->sd); - return true; - } + if (!TAILQ_EMPTY(&conn->imsg_q)) { + log_debug(LOG_VVERB, "dyn: s %d is active", conn->sd); + return true; + } - if (!TAILQ_EMPTY(&conn->omsg_q)) { - log_debug(LOG_VVERB, "dyn: s %d is active", conn->sd); - return true; - } + if (!TAILQ_EMPTY(&conn->omsg_q)) { + log_debug(LOG_VVERB, "dyn: s %d is active", conn->sd); + return true; + } - if (conn->rmsg != NULL) { - log_debug(LOG_VVERB, "dyn: s %d is active", conn->sd); - return true; - } + if (conn->rmsg != NULL) { + log_debug(LOG_VVERB, "dyn: s %d is active", conn->sd); + return true; + } - if (conn->smsg != NULL) { - log_debug(LOG_VVERB, "dyn: s %d is active", conn->sd); - return true; - } + if (conn->smsg != NULL) { + log_debug(LOG_VVERB, "dyn: s %d is active", conn->sd); + return true; + } - log_debug(LOG_VVERB, "dyn: s %d is inactive", conn->sd); + log_debug(LOG_VVERB, "dyn: s %d is inactive", conn->sd); - return false; + return false; } -static char* -_print_node(const struct object *obj) -{ - ASSERT(obj->type == OBJ_NODE); - struct node *node = (struct node *)obj; - snprintf(obj->print_buff, PRINT_BUF_SIZE, "", - node, node->name.len, node->name.data, node->dc.len, node->dc.data, - node->rack.len, node->rack.data, node->is_secure); - return obj->print_buff; +static char *_print_node(const struct object *obj) { + ASSERT(obj->type == OBJ_NODE); + struct node *node = (struct node *)obj; + snprintf(obj->print_buff, PRINT_BUF_SIZE, + "", node, node->name.len, + node->name.data, node->dc.len, node->dc.data, node->rack.len, + node->rack.data, node->is_secure); + return obj->print_buff; } -static void -_init_peer_struct(struct node *node) -{ - memset(node, 0, sizeof(*node)); - init_object(&node->obj, OBJ_NODE, _print_node); +static void _init_peer_struct(struct node *node) { + 
memset(node, 0, sizeof(*node)); + init_object(&node->obj, OBJ_NODE, _print_node); } -static rstatus_t -dnode_peer_add_local(struct server_pool *pool, struct node *self) -{ - ASSERT(self != NULL); +static rstatus_t dnode_peer_add_local(struct server_pool *pool, + struct node *self) { + ASSERT(self != NULL); - _init_peer_struct(self); - self->owner = pool; - - // Initialize the endpoint - struct string *p_pname = &pool->dnode_proxy_endpoint.pname; - string_duplicate(&self->endpoint.pname, p_pname); - self->endpoint.port = pool->dnode_proxy_endpoint.port; - self->endpoint.family = pool->dnode_proxy_endpoint.family; - self->endpoint.addrlen = pool->dnode_proxy_endpoint.addrlen; - self->endpoint.addr = pool->dnode_proxy_endpoint.addr; + _init_peer_struct(self); + self->owner = pool; - uint8_t *p = p_pname->data + p_pname->len - 1; - uint8_t *start = p_pname->data; - string_copy(&self->name, start, (uint32_t)(dn_strrchr(p, start, ':') - start)); + // Initialize the endpoint + struct string *p_pname = &pool->dnode_proxy_endpoint.pname; + string_duplicate(&self->endpoint.pname, p_pname); + self->endpoint.port = pool->dnode_proxy_endpoint.port; + self->endpoint.family = pool->dnode_proxy_endpoint.family; + self->endpoint.addrlen = pool->dnode_proxy_endpoint.addrlen; + self->endpoint.addr = pool->dnode_proxy_endpoint.addr; - string_duplicate(&self->rack, &pool->rack); - string_duplicate(&self->dc, &pool->dc); - self->tokens = pool->tokens; + uint8_t *p = p_pname->data + p_pname->len - 1; + uint8_t *start = p_pname->data; + string_copy(&self->name, start, + (uint32_t)(dn_strrchr(p, start, ':') - start)); + string_duplicate(&self->rack, &pool->rack); + string_duplicate(&self->dc, &pool->dc); + self->tokens = pool->tokens; - self->is_local = true; - self->is_same_dc = true; - self->processed = 0; - self->is_secure = false; - self->state = JOINING; + self->is_local = true; + self->is_same_dc = true; + self->processed = 0; + self->is_secure = false; + self->state = JOINING; - 
log_notice("Initialized local peer: %s", print_obj(self)); + log_notice("Initialized local peer: %s", print_obj(self)); - return DN_OK; + return DN_OK; } -void -dnode_peer_deinit(struct array *nodes) -{ - uint32_t i, nnode; +void dnode_peer_deinit(struct array *nodes) { + uint32_t i, nnode; - for (i = 0, nnode = array_n(nodes); i < nnode; i++) { - struct node *s = *(struct node **)array_pop(nodes); - if (s->conn_pool) { - conn_pool_destroy(s->conn_pool); - s->conn_pool = NULL; - } + for (i = 0, nnode = array_n(nodes); i < nnode; i++) { + struct node *s = *(struct node **)array_pop(nodes); + if (s->conn_pool) { + conn_pool_destroy(s->conn_pool); + s->conn_pool = NULL; } - array_deinit(nodes); + } + array_deinit(nodes); } -static rstatus_t -dnode_peer_pool_run(struct server_pool *pool) -{ - ASSERT(array_n(&pool->peers) != 0); - return vnode_update(pool); +static rstatus_t dnode_peer_pool_run(struct server_pool *pool) { + ASSERT(array_n(&pool->peers) != 0); + return vnode_update(pool); } -static void -dnode_create_connection_pool(struct server_pool *sp, struct node *peer) -{ - if (peer->conn_pool) - return; - struct context *ctx = sp->ctx; - if (!peer->is_local) { - uint8_t max_connections = peer->is_same_dc ? sp->max_local_peer_connections : - sp->max_remote_peer_connections; - peer->conn_pool = conn_pool_create(ctx, peer, max_connections, - init_dnode_peer_conn, sp->server_failure_limit, - MAX_WAIT_BEFORE_RECONNECT_IN_SECS); - } - +static void dnode_create_connection_pool(struct server_pool *sp, + struct node *peer) { + if (peer->conn_pool) return; + struct context *ctx = sp->ctx; + if (!peer->is_local) { + uint8_t max_connections = peer->is_same_dc + ? 
sp->max_local_peer_connections + : sp->max_remote_peer_connections; + peer->conn_pool = conn_pool_create( + ctx, peer, max_connections, init_dnode_peer_conn, + sp->server_failure_limit, MAX_WAIT_BEFORE_RECONNECT_IN_SECS); + } } -static rstatus_t -dnode_initialize_peer_each(void *elem, void *data1, void *data2) -{ - struct context *ctx = data1; - struct server_pool *sp = &ctx->pool; - - struct conf_server *cseed = elem; - ASSERT(cseed->valid); - struct array *peers = data2; - struct node **sptr = array_push(peers); - struct node *s = dn_zalloc(sizeof(struct node)); - if (!s || !sptr) - return DN_ENOMEM; - *sptr = s; - _init_peer_struct(s); - - s->idx = array_idx(peers, sptr); - s->owner = sp; - - string_copy(&s->endpoint.pname, cseed->pname.data, cseed->pname.len); - s->endpoint.port = (uint16_t)cseed->port; - s->endpoint.family = cseed->info.family; - s->endpoint.addrlen = cseed->info.addrlen; - s->endpoint.addr = (struct sockaddr *)&cseed->info.addr; //TODOs: fix this by copying, not reference - - uint8_t *p = cseed->name.data + cseed->name.len - 1; - uint8_t *start = cseed->name.data; - string_copy(&s->name, start, (uint32_t)(dn_strrchr(p, start, ':') - start)); - - string_copy(&s->rack, cseed->rack.data, cseed->rack.len); - string_copy(&s->dc, cseed->dc.data, cseed->dc.len); - - s->tokens = cseed->tokens; - - s->is_local = false; - s->is_same_dc = (string_compare(&sp->dc, &s->dc) == 0); - s->processed = 0; - - s->is_secure = is_secure(sp->secure_server_option, &sp->dc, &sp->rack, &s->dc, - &s->rack); - s->state = DOWN;//assume peers are down initially - dnode_create_connection_pool(sp, s); - log_notice("added peer %s", print_obj(s)); - - return DN_OK; +static rstatus_t dnode_initialize_peer_each(void *elem, void *data1, + void *data2) { + struct context *ctx = data1; + struct server_pool *sp = &ctx->pool; + + struct conf_server *cseed = elem; + ASSERT(cseed->valid); + struct array *peers = data2; + struct node **sptr = array_push(peers); + struct node *s = 
dn_zalloc(sizeof(struct node)); + if (!s || !sptr) return DN_ENOMEM; + *sptr = s; + _init_peer_struct(s); + + s->idx = array_idx(peers, sptr); + s->owner = sp; + + string_copy(&s->endpoint.pname, cseed->pname.data, cseed->pname.len); + s->endpoint.port = (uint16_t)cseed->port; + s->endpoint.family = cseed->info.family; + s->endpoint.addrlen = cseed->info.addrlen; + s->endpoint.addr = (struct sockaddr *)&cseed->info + .addr; // TODOs: fix this by copying, not reference + + uint8_t *p = cseed->name.data + cseed->name.len - 1; + uint8_t *start = cseed->name.data; + string_copy(&s->name, start, (uint32_t)(dn_strrchr(p, start, ':') - start)); + + string_copy(&s->rack, cseed->rack.data, cseed->rack.len); + string_copy(&s->dc, cseed->dc.data, cseed->dc.len); + + s->tokens = cseed->tokens; + + s->is_local = false; + s->is_same_dc = (string_compare(&sp->dc, &s->dc) == 0); + s->processed = 0; + + s->is_secure = + is_secure(sp->secure_server_option, &sp->dc, &sp->rack, &s->dc, &s->rack); + s->state = DOWN; // assume peers are down initially + dnode_create_connection_pool(sp, s); + log_notice("added peer %s", print_obj(s)); + + return DN_OK; } -rstatus_t -dnode_initialize_peers(struct context *ctx) -{ - struct server_pool *sp = &ctx->pool; - struct array *conf_seeds = &sp->conf_pool->dyn_seeds; +rstatus_t dnode_initialize_peers(struct context *ctx) { + struct server_pool *sp = &ctx->pool; + struct array *conf_seeds = &sp->conf_pool->dyn_seeds; - struct array *peers = &sp->peers; - uint32_t nseed; + struct array *peers = &sp->peers; + uint32_t nseed; - /* initialize peers list = seeds list */ - ASSERT(array_n(peers) == 0); + /* initialize peers list = seeds list */ + ASSERT(array_n(peers) == 0); - /* init seeds list */ - nseed = array_n(conf_seeds); + /* init seeds list */ + nseed = array_n(conf_seeds); - log_debug(LOG_INFO, "Adding local node to the peer list"); + log_debug(LOG_INFO, "Adding local node to the peer list"); - THROW_STATUS(array_init(peers, nseed + 1, 
sizeof(struct node *))); + THROW_STATUS(array_init(peers, nseed + 1, sizeof(struct node *))); - // Add self node - struct node **selfptr = array_push(peers); - struct node *self = dn_zalloc(sizeof(struct node)); - if (!self || !selfptr) - return DN_ENOMEM; - *selfptr = self; - THROW_STATUS(dnode_peer_add_local(sp, self)); + // Add self node + struct node **selfptr = array_push(peers); + struct node *self = dn_zalloc(sizeof(struct node)); + if (!self || !selfptr) return DN_ENOMEM; + *selfptr = self; + THROW_STATUS(dnode_peer_add_local(sp, self)); - // Add the peer nodes - THROW_STATUS(array_each_2(conf_seeds, dnode_initialize_peer_each, ctx, peers)); + // Add the peer nodes + THROW_STATUS( + array_each_2(conf_seeds, dnode_initialize_peer_each, ctx, peers)); - ASSERT(array_n(peers) == (nseed + 1)); + ASSERT(array_n(peers) == (nseed + 1)); - THROW_STATUS(dnode_peer_pool_run(sp)); + THROW_STATUS(dnode_peer_pool_run(sp)); - log_debug(LOG_DEBUG, "init %"PRIu32" peers in pool %s'", nseed, print_obj(sp)); + log_debug(LOG_DEBUG, "init %" PRIu32 " peers in pool %s'", nseed, + print_obj(sp)); - return DN_OK; + return DN_OK; } -static struct conn * -dnode_peer_conn(struct node *peer, int tag) -{ - return conn_pool_get(peer->conn_pool, tag); +static struct conn *dnode_peer_conn(struct node *peer, int tag) { + return conn_pool_get(peer->conn_pool, tag); } -static void -dnode_peer_ack_err(struct context *ctx, struct conn *conn, struct msg *req) -{ - if ((req->swallow && !req->expect_datastore_reply) || // no reply - (req->swallow && (req->consistency == DC_ONE)) || // dc one - (req->swallow && ((req->consistency == DC_QUORUM) || (req->consistency == DC_SAFE_QUORUM)) // remote dc request - && (!conn->same_dc)) || - (req->owner == conn)) // a gossip message that originated on this conn - { - log_info("%s Closing, swallow req %u:%u len %"PRIu32" type %d", - print_obj(conn), req->id, req->parent_id, req->mlen, req->type); - req_put(req); - return; - } - struct conn *c_conn = 
req->owner; - // At other connections, these responses would be swallowed. - ASSERT_LOG((c_conn->type == CONN_CLIENT) || - (c_conn->type == CONN_DNODE_PEER_CLIENT), - "conn:%s c_conn:%s, req %d:%d", print_obj(conn), print_obj(c_conn), req->id, req->parent_id); - - // Create an appropriate response for the request so its propagated up; - // This response gets dropped in rsp_make_error anyways. But since this is - // an error path its ok with the overhead. - struct msg *rsp = msg_get(conn, false, __FUNCTION__); - req->done = 1; - rsp->peer = req; - rsp->is_error = req->is_error = 1; - rsp->error_code = req->error_code = conn->err; - rsp->dyn_error_code = req->dyn_error_code = PEER_CONNECTION_REFUSE; - rsp->dmsg = dmsg_get(); - rsp->dmsg->id = req->id; - - log_info("%s Closing req %u:%u len %"PRIu32" type %d %c %s", print_obj(conn), - req->id, req->parent_id, req->mlen, req->type, - conn->err ? ':' : ' ', conn->err ? strerror(conn->err): " "); - rstatus_t status = - conn_handle_response(c_conn, req->parent_id ? req->parent_id : req->id, - rsp); - IGNORE_RET_VAL(status); - if (req->swallow) - req_put(req); +static void dnode_peer_ack_err(struct context *ctx, struct conn *conn, + struct msg *req) { + if ((req->swallow && !req->expect_datastore_reply) || // no reply + (req->swallow && (req->consistency == DC_ONE)) || // dc one + (req->swallow && + ((req->consistency == DC_QUORUM) || + (req->consistency == DC_SAFE_QUORUM)) // remote dc request + && (!conn->same_dc)) || + (req->owner == conn)) // a gossip message that originated on this conn + { + log_info("%s Closing, swallow req %u:%u len %" PRIu32 " type %d", + print_obj(conn), req->id, req->parent_id, req->mlen, req->type); + req_put(req); + return; + } + struct conn *c_conn = req->owner; + // At other connections, these responses would be swallowed. 
+ ASSERT_LOG( + (c_conn->type == CONN_CLIENT) || (c_conn->type == CONN_DNODE_PEER_CLIENT), + "conn:%s c_conn:%s, req %d:%d", print_obj(conn), print_obj(c_conn), + req->id, req->parent_id); + + // Create an appropriate response for the request so its propagated up; + // This response gets dropped in rsp_make_error anyways. But since this is + // an error path its ok with the overhead. + struct msg *rsp = msg_get_error(conn, PEER_CONNECTION_REFUSE, conn->err); + if (rsp == NULL) { + // TODO: It's not clear how the peer should behave if we hit this error + // condition. Return an appropriate error instead. + log_warn("Could not allocate msg for notifying an error to peer"); + return; + } + req->done = 1; + rsp->peer = req; + rsp->is_error = req->is_error = 1; + rsp->error_code = req->error_code = conn->err; + rsp->dyn_error_code = req->dyn_error_code = PEER_CONNECTION_REFUSE; + rsp->dmsg = dmsg_get(); + rsp->dmsg->id = req->id; + + log_info("%s Closing req %u:%u len %" PRIu32 " type %d %c %s", + print_obj(conn), req->id, req->parent_id, req->mlen, req->type, + conn->err ? ':' : ' ', conn->err ? strerror(conn->err) : " "); + rstatus_t status = conn_handle_response( + c_conn, req->parent_id ? 
req->parent_id : req->id, rsp); + IGNORE_RET_VAL(status); + if (req->swallow) req_put(req); } +static void dnode_peer_failure(struct context *ctx, struct node *peer) { + struct server_pool *pool = peer->owner; + conn_pool_notify_conn_errored(peer->conn_pool); + stats_pool_set_ts(ctx, peer_ejected_at, (int64_t)dn_msec_now()); -static void -dnode_peer_failure(struct context *ctx, struct node *peer) -{ - struct server_pool *pool = peer->owner; - conn_pool_notify_conn_errored(peer->conn_pool); - stats_pool_set_ts(ctx, peer_ejected_at, (int64_t)dn_msec_now()); - - if (dnode_peer_pool_update(peer->owner) != DN_OK) { - log_error("dyn: updating peer pool '%.*s' failed: %s", - pool->name.len, pool->name.data, strerror(errno)); - } + if (dnode_peer_pool_update(peer->owner) != DN_OK) { + log_error("dyn: updating peer pool '%.*s' failed: %s", pool->name.len, + pool->name.data, strerror(errno)); + } } -static void -dnode_peer_close_stats(struct context *ctx, struct conn *conn) -{ - if (conn->connected) { - stats_pool_decr(ctx, peer_connections); - } +static void dnode_peer_close_stats(struct context *ctx, struct conn *conn) { + if (conn->connected) { + stats_pool_decr(ctx, peer_connections); + } - if (conn->eof) { - stats_pool_incr(ctx, peer_eof); - return; - } + if (conn->eof) { + stats_pool_incr(ctx, peer_eof); + return; + } - switch (conn->err) { + switch (conn->err) { case ETIMEDOUT: - if (conn->same_dc) - stats_pool_incr(ctx, peer_timedout); - else - stats_pool_incr(ctx, remote_peer_timedout); - break; + if (conn->same_dc) + stats_pool_incr(ctx, peer_timedout); + else + stats_pool_incr(ctx, remote_peer_timedout); + break; case EPIPE: case ECONNRESET: case ECONNABORTED: @@ -384,309 +364,302 @@ dnode_peer_close_stats(struct context *ctx, struct conn *conn) case EHOSTDOWN: case EHOSTUNREACH: default: - stats_pool_incr(ctx, peer_err); - break; - } + stats_pool_incr(ctx, peer_err); + break; + } } +static void dnode_peer_close(struct context *ctx, struct conn *conn) { + 
rstatus_t status; + struct msg *req, *nmsg; /* current and next message */ -static void -dnode_peer_close(struct context *ctx, struct conn *conn) -{ - rstatus_t status; - struct msg *req, *nmsg; /* current and next message */ - - ASSERT(conn->type == CONN_DNODE_PEER_SERVER); - struct node *peer = conn->owner; - - dnode_peer_close_stats(ctx, conn); - - if (conn->sd < 0) { - conn_unref(conn); - conn_put(conn); - dnode_peer_failure(ctx, peer); - return; - } - uint32_t out_counter = 0; - for (req = TAILQ_FIRST(&conn->omsg_q); req != NULL; req = nmsg) { - nmsg = TAILQ_NEXT(req, s_tqe); - - /* dequeue the message (request) from peer outq */ - conn_dequeue_outq(ctx, conn, req); - dnode_peer_ack_err(ctx, conn, req); - out_counter++; - } + ASSERT(conn->type == CONN_DNODE_PEER_SERVER); + struct node *peer = conn->owner; - ASSERT(TAILQ_EMPTY(&conn->omsg_q)); + dnode_peer_close_stats(ctx, conn); - uint32_t in_counter = 0; - for (req = TAILQ_FIRST(&conn->imsg_q); req != NULL; req = nmsg) { - nmsg = TAILQ_NEXT(req, s_tqe); + if (conn->sd < 0) { + conn_unref(conn); + conn_put(conn); + dnode_peer_failure(ctx, peer); + return; + } + uint32_t out_counter = 0; + for (req = TAILQ_FIRST(&conn->omsg_q); req != NULL; req = nmsg) { + nmsg = TAILQ_NEXT(req, s_tqe); + + /* dequeue the message (request) from peer outq */ + conn_dequeue_outq(ctx, conn, req); + dnode_peer_ack_err(ctx, conn, req); + out_counter++; + } + + ASSERT(TAILQ_EMPTY(&conn->omsg_q)); + + uint32_t in_counter = 0; + for (req = TAILQ_FIRST(&conn->imsg_q); req != NULL; req = nmsg) { + nmsg = TAILQ_NEXT(req, s_tqe); + + /* dequeue the message (request) from peer inq */ + conn_dequeue_inq(ctx, conn, req); + // We should also remove the req from the timeout rbtree. + // for outq, its already taken care of + msg_tmo_delete(req); + dnode_peer_ack_err(ctx, conn, req); + in_counter++; - /* dequeue the message (request) from peer inq */ - conn_dequeue_inq(ctx, conn, req); - // We should also remove the req from the timeout rbtree. 
- // for outq, its already taken care of - msg_tmo_delete(req); - dnode_peer_ack_err(ctx, conn, req); - in_counter++; + if (conn->same_dc) + stats_pool_incr(ctx, peer_dropped_requests); + else + stats_pool_incr(ctx, remote_peer_dropped_requests); + } - if (conn->same_dc) - stats_pool_incr(ctx, peer_dropped_requests); - else - stats_pool_incr(ctx, remote_peer_dropped_requests); - } + ASSERT(TAILQ_EMPTY(&conn->imsg_q)); - ASSERT(TAILQ_EMPTY(&conn->imsg_q)); + log_warn("%s Closing, Dropped %u outqueue & %u inqueue requests", + print_obj(conn), out_counter, in_counter); - log_warn("%s Closing, Dropped %u outqueue & %u inqueue requests", - print_obj(conn), out_counter, in_counter); + struct msg *rsp = conn->rmsg; + if (rsp != NULL) { + conn->rmsg = NULL; - struct msg *rsp = conn->rmsg; - if (rsp != NULL) { - conn->rmsg = NULL; + ASSERT(!rsp->is_request); + ASSERT(rsp->peer == NULL); - ASSERT(!rsp->is_request); - ASSERT(rsp->peer == NULL); + rsp_put(rsp); - rsp_put(rsp); + log_debug(LOG_INFO, + "dyn: close s %d discarding rsp %" PRIu64 " len %" PRIu32 + " " + "in error", + conn->sd, rsp->id, rsp->mlen); + } - log_debug(LOG_INFO, "dyn: close s %d discarding rsp %"PRIu64" len %"PRIu32" " - "in error", conn->sd, rsp->id, rsp->mlen); - } + ASSERT(conn->smsg == NULL); - ASSERT(conn->smsg == NULL); + conn_unref(conn); - conn_unref(conn); - - status = close(conn->sd); - if (status < 0) { - log_error("dyn: close s %d failed, ignored: %s", conn->sd, strerror(errno)); - } - conn->sd = -1; - - conn_put(conn); - dnode_peer_failure(ctx, peer); + status = close(conn->sd); + if (status < 0) { + log_error("dyn: close s %d failed, ignored: %s", conn->sd, strerror(errno)); + } + conn->sd = -1; + conn_put(conn); + dnode_peer_failure(ctx, peer); } -static rstatus_t -dnode_peer_each_preconnect(void *elem) -{ - struct node *peer = *(struct node **)elem; +static rstatus_t dnode_peer_each_preconnect(void *elem) { + struct node *peer = *(struct node **)elem; - if (peer->is_local) //don't bother 
to connect if it is a self-connection - return DN_OK; + if (peer->is_local) // don't bother to connect if it is a self-connection + return DN_OK; - return conn_pool_preconnect(peer->conn_pool); + return conn_pool_preconnect(peer->conn_pool); } -static rstatus_t -dnode_peer_each_disconnect(void *elem) -{ - struct node *peer = *(struct node **)elem; +static rstatus_t dnode_peer_each_disconnect(void *elem) { + struct node *peer = *(struct node **)elem; - if (peer->conn_pool) { - conn_pool_destroy(peer->conn_pool); - peer->conn_pool = NULL; - } + if (peer->conn_pool) { + conn_pool_destroy(peer->conn_pool); + peer->conn_pool = NULL; + } - return DN_OK; + return DN_OK; } -rstatus_t -dnode_peer_forward_state(void *rmsg) -{ - rstatus_t status; - struct ring_msg *msg = rmsg; - struct server_pool *sp = msg->sp; +rstatus_t dnode_peer_forward_state(void *rmsg) { + rstatus_t status; + struct ring_msg *msg = rmsg; + struct server_pool *sp = msg->sp; - log_debug(LOG_VVERB, "dnode_peer_forward_state: forwarding"); + log_debug(LOG_VVERB, "dnode_peer_forward_state: forwarding"); - //we assume one mbuf is enough for now - will enhance with multiple mbufs later - struct mbuf *mbuf = mbuf_get(); - if (mbuf == NULL) { - log_debug(LOG_VVERB, "Too bad, not enough memory!"); - return DN_ENOMEM; - } + // we assume one mbuf is enough for now - will enhance with multiple mbufs + // later + struct mbuf *mbuf = mbuf_get(); + if (mbuf == NULL) { + log_debug(LOG_VVERB, "Too bad, not enough memory!"); + return DN_ENOMEM; + } - mbuf_copy(mbuf, msg->data, msg->len); + mbuf_copy(mbuf, msg->data, msg->len); - struct array *peers = &sp->peers; - uint32_t nelem = array_n(peers); + struct array *peers = &sp->peers; + uint32_t nelem = array_n(peers); - //pick a random peer - uint32_t ran_index = (uint32_t)rand() % nelem; + // pick a random peer + uint32_t ran_index = (uint32_t)rand() % nelem; - if (ran_index == 0) - ran_index += 1; + if (ran_index == 0) ran_index += 1; - struct node *peer = *(struct node 
**) array_get(peers, ran_index); + struct node *peer = *(struct node **)array_get(peers, ran_index); - //log_debug(LOG_VVERB, "Gossiping to node '%.*s'", peer->name.len, peer->name.data); + // log_debug(LOG_VVERB, "Gossiping to node '%.*s'", peer->name.len, + // peer->name.data); - struct conn * conn = dnode_peer_conn(peer, 0); - if (conn == NULL) { - //running out of connection due to memory exhaust - log_debug(LOG_ERR, "Unable to obtain a connection object"); - return DN_ERROR; - } + struct conn *conn = dnode_peer_conn(peer, 0); + if (conn == NULL) { + // running out of connection due to memory exhaust + log_debug(LOG_ERR, "Unable to obtain a connection object"); + mbuf_put(mbuf); + return DN_ERROR; + } - status = conn_connect(sp->ctx, conn); - if (status != DN_OK ) { - conn_close(sp->ctx, conn); - log_debug(LOG_ERR, "Error happened in connecting on conn %d", conn->sd); - return DN_ERROR; - } + status = conn_connect(sp->ctx, conn); + if (status != DN_OK) { + conn_close(sp->ctx, conn); + mbuf_put(mbuf); + log_debug(LOG_ERR, "Error happened in connecting on conn %d", conn->sd); + return DN_ERROR; + } - dnode_peer_gossip_forward(sp->ctx, conn, mbuf); + dnode_peer_gossip_forward(sp->ctx, conn, mbuf); - //free this as nobody else will do - //mbuf_put(mbuf); + // free this as nobody else will do + // mbuf_put(mbuf); - return status; + return status; } - -rstatus_t -dnode_peer_handshake_announcing(void *rmsg) -{ - rstatus_t status; - struct ring_msg *msg = rmsg; - struct server_pool *sp = msg->sp; - log_debug(LOG_VVERB, "dyn: handshaking peers"); - struct array *peers = &sp->peers; - - uint32_t i,nelem; - nelem = array_n(peers); - - //we assume one mbuf is enough for now - will enhance with multiple mbufs later - struct mbuf *mbuf = mbuf_get(); - if (mbuf == NULL) { - log_debug(LOG_VVERB, "Too bad, not enough memory!"); - return DN_ENOMEM; +rstatus_t dnode_peer_handshake_announcing(void *rmsg) { + rstatus_t status; + struct ring_msg *msg = rmsg; + struct server_pool *sp 
= msg->sp; + log_debug(LOG_VVERB, "dyn: handshaking peers"); + struct array *peers = &sp->peers; + + uint32_t i, nelem; + nelem = array_n(peers); + + // we assume one mbuf is enough for now - will enhance with multiple mbufs + // later + struct mbuf *mbuf = mbuf_get(); + if (mbuf == NULL) { + log_debug(LOG_VVERB, "Too bad, not enough memory!"); + return DN_ENOMEM; + } + + // annoucing myself by sending msg: + // 'dc$rack$token,started_ts,node_state,node_dns' + mbuf_write_string(mbuf, &sp->dc); + mbuf_write_char(mbuf, '$'); + mbuf_write_string(mbuf, &sp->rack); + mbuf_write_char(mbuf, '$'); + struct dyn_token *token = (struct dyn_token *)array_get(&sp->tokens, 0); + if (token == NULL) { + log_debug(LOG_VVERB, "Why? This should not be null!"); + mbuf_put(mbuf); + return DN_ERROR; + } + + mbuf_write_uint32(mbuf, token->mag[0]); + mbuf_write_char(mbuf, ','); + int64_t cur_ts = (int64_t)time(NULL); + mbuf_write_uint64(mbuf, (uint64_t)cur_ts); + mbuf_write_char(mbuf, ','); + mbuf_write_uint8(mbuf, sp->ctx->dyn_state); + mbuf_write_char(mbuf, ','); + + unsigned char *broadcast_addr = get_broadcast_address(sp); + mbuf_write_bytes(mbuf, broadcast_addr, (int)dn_strlen(broadcast_addr)); + + // for each peer, send a registered msg + for (i = 0; i < nelem; i++) { + struct node *peer = *(struct node **)array_get(peers, i); + if (peer->is_local) continue; + + log_debug(LOG_VVERB, "Gossiping to node '%.*s'", peer->name.len, + peer->name.data); + + struct conn *conn = dnode_peer_conn(peer, 0); + if (conn == NULL) { + // running out of connection due to memory exhaust + log_debug(LOG_DEBUG, "Unable to obtain a connection object"); + mbuf_put(mbuf); + return DN_ERROR; } - //annoucing myself by sending msg: 'dc$rack$token,started_ts,node_state,node_dns' - mbuf_write_string(mbuf, &sp->dc); - mbuf_write_char(mbuf, '$'); - mbuf_write_string(mbuf, &sp->rack); - mbuf_write_char(mbuf, '$'); - struct dyn_token *token = (struct dyn_token *) array_get(&sp->tokens, 0); - if (token == NULL) { - 
log_debug(LOG_VVERB, "Why? This should not be null!"); - mbuf_put(mbuf); - return DN_ERROR; + status = conn_connect(sp->ctx, conn); + if (status != DN_OK) { + conn_close(sp->ctx, conn); + log_debug(LOG_DEBUG, "Error happened in connecting on conn %d", conn->sd); + mbuf_put(mbuf); + return DN_ERROR; } - mbuf_write_uint32(mbuf, token->mag[0]); - mbuf_write_char(mbuf, ','); - int64_t cur_ts = (int64_t)time(NULL); - mbuf_write_uint64(mbuf, (uint64_t)cur_ts); - mbuf_write_char(mbuf, ','); - mbuf_write_uint8(mbuf, sp->ctx->dyn_state); - mbuf_write_char(mbuf, ','); - - unsigned char *broadcast_addr = get_broadcast_address(sp); - mbuf_write_bytes(mbuf, broadcast_addr, (int)dn_strlen(broadcast_addr)); - - //for each peer, send a registered msg - for (i = 0; i < nelem; i++) { - struct node *peer = *(struct node **) array_get(peers, i); - if (peer->is_local) - continue; - - log_debug(LOG_VVERB, "Gossiping to node '%.*s'", peer->name.len, peer->name.data); - - struct conn * conn = dnode_peer_conn(peer, 0); - if (conn == NULL) { - //running out of connection due to memory exhaust - log_debug(LOG_DEBUG, "Unable to obtain a connection object"); - return DN_ERROR; - } - - - status = conn_connect(sp->ctx, conn); - if (status != DN_OK ) { - conn_close(sp->ctx, conn); - log_debug(LOG_DEBUG, "Error happened in connecting on conn %d", conn->sd); - return DN_ERROR; - } - - //conn-> - - dnode_peer_gossip_forward(sp->ctx, conn, mbuf); - //peer_gossip_forward1(sp->ctx, conn, sp->data_store, &data); - } + struct mbuf *peer_mbuf = mbuf_get(); + mbuf_write_mbuf(peer_mbuf, mbuf); + dnode_peer_gossip_forward(sp->ctx, conn, mbuf); + // peer_gossip_forward1(sp->ctx, conn, sp->data_store, &data); + } - //free this as nobody else will do - //mbuf_put(mbuf); + // free this as nobody else will do + mbuf_put(mbuf); - return DN_OK; + return DN_OK; } -static rstatus_t -dnode_peer_add_node(struct server_pool *sp, struct gossip_node *node) -{ - rstatus_t status; - struct array *peers = &sp->peers; - 
struct node **sptr = array_push(peers); - struct node *s = dn_zalloc(sizeof(struct node)); - if (!s || !sptr) - return DN_ENOMEM; - *sptr = s; - _init_peer_struct(s); - - s->idx = array_idx(peers, sptr); - s->owner = sp; - - string_copy(&s->endpoint.pname, node->pname.data, node->pname.len); - s->endpoint.port = (uint16_t) node->port; - string_copy(&s->name, node->name.data, node->name.len); - struct sockinfo *info = dn_alloc(sizeof(*info)); //need to free this - dn_resolve(&s->name, s->endpoint.port, info); - s->endpoint.family = info->family; - s->endpoint.addrlen = info->addrlen; - s->endpoint.addr = (struct sockaddr *)&info->addr; //TODOs: fix this by copying, not reference - - string_copy(&s->rack, node->rack.data, node->rack.len); - string_copy(&s->dc, node->dc.data, node->dc.len); - - array_init(&s->tokens, 1, sizeof(struct dyn_token)); - struct dyn_token *dst_token = array_push(&s->tokens); - copy_dyn_token(&node->token, dst_token); - - s->is_local = node->is_local; - s->is_same_dc = (string_compare(&sp->dc, &s->dc) == 0); - s->processed = 0; - - s->is_secure = is_secure(sp->secure_server_option, &sp->dc, &sp->rack, &s->dc, - &s->rack); - s->state = node->state; - - dnode_create_connection_pool(sp, s); - log_notice("added peer %s", print_obj(s)); - - status = dnode_peer_pool_run(sp); - if (status != DN_OK) - return status; - - status = dnode_peer_each_preconnect(&s); - - return status; +static rstatus_t dnode_peer_add_node(struct server_pool *sp, + struct gossip_node *node) { + rstatus_t status; + struct array *peers = &sp->peers; + struct node **sptr = array_push(peers); + struct node *s = dn_zalloc(sizeof(struct node)); + if (!s || !sptr) return DN_ENOMEM; + *sptr = s; + _init_peer_struct(s); + + s->idx = array_idx(peers, sptr); + s->owner = sp; + + string_copy(&s->endpoint.pname, node->pname.data, node->pname.len); + s->endpoint.port = (uint16_t)node->port; + string_copy(&s->name, node->name.data, node->name.len); + struct sockinfo *info = 
dn_alloc(sizeof(*info)); // need to free this + dn_resolve(&s->name, s->endpoint.port, info); + s->endpoint.family = info->family; + s->endpoint.addrlen = info->addrlen; + s->endpoint.addr = (struct sockaddr *)&info + ->addr; // TODOs: fix this by copying, not reference + + string_copy(&s->rack, node->rack.data, node->rack.len); + string_copy(&s->dc, node->dc.data, node->dc.len); + + array_init(&s->tokens, 1, sizeof(struct dyn_token)); + struct dyn_token *dst_token = array_push(&s->tokens); + copy_dyn_token(&node->token, dst_token); + + s->is_local = node->is_local; + s->is_same_dc = (string_compare(&sp->dc, &s->dc) == 0); + s->processed = 0; + + s->is_secure = + is_secure(sp->secure_server_option, &sp->dc, &sp->rack, &s->dc, &s->rack); + s->state = node->state; + + dnode_create_connection_pool(sp, s); + log_notice("added peer %s", print_obj(s)); + + status = dnode_peer_pool_run(sp); + if (status != DN_OK) return status; + + status = dnode_peer_each_preconnect(&s); + + return status; } -rstatus_t -dnode_peer_add(void *rmsg) -{ - rstatus_t status; - struct ring_msg *msg = rmsg; - struct server_pool *sp = msg->sp; - struct gossip_node *node = array_get(&msg->nodes, 0); - log_debug(LOG_NOTICE, "dyn: peer has an added message '%.*s'", node->name.len, node->name.data); - status = dnode_peer_add_node(sp, node); +rstatus_t dnode_peer_add(void *rmsg) { + rstatus_t status; + struct ring_msg *msg = rmsg; + struct server_pool *sp = msg->sp; + struct gossip_node *node = array_get(&msg->nodes, 0); + log_debug(LOG_NOTICE, "dyn: peer has an added message '%.*s'", node->name.len, + node->name.data); + status = dnode_peer_add_node(sp, node); - return status; + return status; } /* @@ -695,509 +668,495 @@ dnode_peer_add(struct server_pool *sp, struct gossip_node *node) { rstatus_t status; - log_debug(LOG_VVERB, "dyn: peer has an added message '%.*s'", node->name.len, node->name.data); - status = dnode_peer_add_node(sp, node); + log_debug(LOG_VVERB, "dyn: peer has an added message 
'%.*s'", +node->name.len, node->name.data); status = dnode_peer_add_node(sp, node); return status; } */ -rstatus_t -dnode_peer_replace(void *rmsg) -{ - //rstatus_t status; - struct ring_msg *msg = rmsg; - struct server_pool *sp = msg->sp; - struct gossip_node *node = array_get(&msg->nodes, 0); - log_debug(LOG_VVERB, "dyn: peer has a replaced message '%.*s'", node->name.len, node->name.data); - struct array *peers = &sp->peers; - struct node *s = NULL; - - uint32_t i,nelem; - //bool node_exist = false; - //TODOs: use hash table here - for (i=1, nelem = array_n(peers); i< nelem; i++) { - struct node * peer = *(struct node **) array_get(peers, i); - if (string_compare(&peer->rack, &node->rack) == 0) { - //TODOs: now only compare 1st token and support vnode later - use hash string on a tokens for comparison - struct dyn_token *ptoken = (struct dyn_token *) array_get(&peer->tokens, 0); - struct dyn_token *ntoken = &node->token; - - if (cmp_dyn_token(ptoken, ntoken) == 0) { - s = peer; //found a node to replace - break; - } - } +rstatus_t dnode_peer_replace(void *rmsg) { + // rstatus_t status; + struct ring_msg *msg = rmsg; + struct server_pool *sp = msg->sp; + struct gossip_node *node = array_get(&msg->nodes, 0); + log_debug(LOG_VVERB, "dyn: peer has a replaced message '%.*s'", + node->name.len, node->name.data); + struct array *peers = &sp->peers; + struct node *s = NULL; + + uint32_t i, nelem; + // bool node_exist = false; + // TODOs: use hash table here + for (i = 1, nelem = array_n(peers); i < nelem; i++) { + struct node *peer = *(struct node **)array_get(peers, i); + if (string_compare(&peer->rack, &node->rack) == 0) { + // TODOs: now only compare 1st token and support vnode later - use hash + // string on a tokens for comparison + struct dyn_token *ptoken = + (struct dyn_token *)array_get(&peer->tokens, 0); + struct dyn_token *ntoken = &node->token; + + if (cmp_dyn_token(ptoken, ntoken) == 0) { + s = peer; // found a node to replace + break; + } } + } + if (s != 
NULL) { + log_notice("Found an old node to replace '%.*s'", s->name.len, + s->name.data); + log_notice("Replace with address '%.*s'", node->name.len, node->name.data); - if (s != NULL) { - log_notice("Found an old node to replace '%.*s'", s->name.len, s->name.data); - log_notice("Replace with address '%.*s'", node->name.len, node->name.data); - - dnode_peer_each_disconnect(&s); - string_deinit(&s->endpoint.pname); - string_deinit(&s->name); - string_copy(&s->endpoint.pname, node->pname.data, node->pname.len); - string_copy(&s->name, node->name.data, node->name.len); + dnode_peer_each_disconnect(&s); + string_deinit(&s->endpoint.pname); + string_deinit(&s->name); + string_copy(&s->endpoint.pname, node->pname.data, node->pname.len); + string_copy(&s->name, node->name.data, node->name.len); - //TODOs: need to free the previous s->endpoint.addr? - //if (s->endpoint.addr != NULL) { - // dn_free(s->endpoint.addr); - //} + // TODOs: need to free the previous s->endpoint.addr? + // if (s->endpoint.addr != NULL) { + // dn_free(s->endpoint.addr); + //} - struct sockinfo *info = dn_alloc(sizeof(*info)); //need to free this - dn_resolve(&s->name, s->endpoint.port, info); - s->endpoint.family = info->family; - s->endpoint.addrlen = info->addrlen; - s->endpoint.addr = (struct sockaddr *)&info->addr; //TODOs: fix this by copying, not reference + struct sockinfo *info = dn_alloc(sizeof(*info)); // need to free this + dn_resolve(&s->name, s->endpoint.port, info); + s->endpoint.family = info->family; + s->endpoint.addrlen = info->addrlen; + s->endpoint.addr = (struct sockaddr *)&info + ->addr; // TODOs: fix this by copying, not reference - dnode_create_connection_pool(sp, s); + dnode_create_connection_pool(sp, s); - dnode_peer_each_preconnect(&s); - } else { - log_debug(LOG_INFO, "Unable to find any node matched the token"); - } + dnode_peer_each_preconnect(&s); + } else { + log_debug(LOG_INFO, "Unable to find any node matched the token"); + } - return DN_OK; + return DN_OK; } -void 
-dnode_peer_connected(struct context *ctx, struct conn *conn) -{ - struct node *peer = conn->owner; +void dnode_peer_connected(struct context *ctx, struct conn *conn) { + struct node *peer = conn->owner; - ASSERT(conn->type == CONN_DNODE_PEER_SERVER); - ASSERT(conn->connecting && !conn->connected); + ASSERT(conn->type == CONN_DNODE_PEER_SERVER); + ASSERT(conn->connecting && !conn->connected); - stats_pool_incr(ctx, peer_connections); + stats_pool_incr(ctx, peer_connections); - conn->connecting = 0; - conn->connected = 1; - peer->state = NORMAL; - conn_pool_connected(peer->conn_pool, conn); + conn->connecting = 0; + conn->connected = 1; + peer->state = NORMAL; + conn_pool_connected(peer->conn_pool, conn); - log_notice("%s connected", print_obj(conn)); + log_notice("%s connected", print_obj(conn)); } -static void -dnode_peer_ok(struct context *ctx, struct conn *conn) -{ - struct node *server = conn->owner; +static void dnode_peer_ok(struct context *ctx, struct conn *conn) { + struct node *server = conn->owner; - ASSERT(conn->type == CONN_DNODE_PEER_SERVER); - ASSERT(conn->connected); + ASSERT(conn->type == CONN_DNODE_PEER_SERVER); + ASSERT(conn->connected); - log_debug(LOG_VERB, "dyn: reset peer '%.*s' failure count from %"PRIu32 - " to 0", server->endpoint.pname.len, server->endpoint.pname.data, + log_debug(LOG_VERB, + "dyn: reset peer '%.*s' failure count from %" PRIu32 " to 0", + server->endpoint.pname.len, server->endpoint.pname.data, server->failure_count); - server->failure_count = 0; + server->failure_count = 0; } -static rstatus_t -dnode_peer_pool_update(struct server_pool *pool) -{ - msec_t now = dn_msec_now(); - if (now < 0) { - return DN_ERROR; - } - - if (now <= pool->next_rebuild) { - return DN_OK; - } +static rstatus_t dnode_peer_pool_update(struct server_pool *pool) { + msec_t now = dn_msec_now(); + if (now < 0) { + return DN_ERROR; + } - pool->next_rebuild = now + WAIT_BEFORE_UPDATE_PEERS_IN_MILLIS; - return dnode_peer_pool_run(pool); + if (now <= 
pool->next_rebuild) { + return DN_OK; + } + pool->next_rebuild = now + WAIT_BEFORE_UPDATE_PEERS_IN_MILLIS; + return dnode_peer_pool_run(pool); } -uint32_t -dnode_peer_idx_for_key_on_rack(struct server_pool *pool, struct rack *rack, - uint8_t *key, uint32_t keylen) -{ - struct dyn_token token; - pool->key_hash(key, keylen, &token); - return vnode_dispatch(rack->continuum, rack->ncontinuum, &token); +uint32_t dnode_peer_idx_for_key_on_rack(struct server_pool *pool, + struct rack *rack, uint8_t *key, + uint32_t keylen) { + struct dyn_token token; + pool->key_hash(key, keylen, &token); + return vnode_dispatch(rack->continuum, rack->ncontinuum, &token); } -static struct node * -dnode_peer_for_key_on_rack(struct server_pool *pool, struct rack *rack, - uint8_t *key, uint32_t keylen) -{ - struct node *server; - uint32_t idx; +static struct node *dnode_peer_for_key_on_rack(struct server_pool *pool, + struct rack *rack, uint8_t *key, + uint32_t keylen) { + struct node *server; + uint32_t idx; - ASSERT(array_n(&pool->peers) != 0); + ASSERT(array_n(&pool->peers) != 0); - if (keylen == 0) { - idx = 0; //for no argument command - } else { - idx = dnode_peer_idx_for_key_on_rack(pool, rack, key, keylen); - } + if (keylen == 0) { + idx = 0; // for no argument command + } else { + idx = dnode_peer_idx_for_key_on_rack(pool, rack, key, keylen); + } - ASSERT(idx < array_n(&pool->peers)); + ASSERT(idx < array_n(&pool->peers)); - server = *(struct node **)array_get(&pool->peers, idx); + server = *(struct node **)array_get(&pool->peers, idx); - if (log_loggable(LOG_VERB)) { - log_debug(LOG_VERB, "dyn: key '%.*s' maps to server '%.*s'", keylen, - key, server->endpoint.pname.len, server->endpoint.pname.data); - } + if (log_loggable(LOG_VERB)) { + log_debug(LOG_VERB, "dyn: key '%.*s' maps to server '%.*s'", keylen, key, + server->endpoint.pname.len, server->endpoint.pname.data); + } - return server; + return server; } -struct node * -dnode_peer_pool_server(struct context *ctx, struct 
server_pool *pool, - struct rack *rack, uint8_t *key, uint32_t keylen, - msg_routing_t msg_routing) -{ - rstatus_t status; - struct node *peer; - - log_debug(LOG_VERB, "Entering dnode_peer_pool_conn ................................"); - - status = dnode_peer_pool_update(pool); - if (status != DN_OK) { - loga("status is not OK"); - return NULL; +struct node *dnode_peer_pool_server(struct context *ctx, + struct server_pool *pool, struct rack *rack, + uint8_t *key, uint32_t keylen, + msg_routing_t msg_routing) { + rstatus_t status; + struct node *peer; + + log_debug(LOG_VERB, + "Entering dnode_peer_pool_conn ................................"); + + status = dnode_peer_pool_update(pool); + if (status != DN_OK) { + loga("status is not OK"); + return NULL; + } + + if (msg_routing == ROUTING_LOCAL_NODE_ONLY) { // always local + peer = *(struct node **)array_get(&pool->peers, 0); + } else { + /* from a given {key, keylen} pick a peer from pool */ + peer = dnode_peer_for_key_on_rack(pool, rack, key, keylen); + if (peer == NULL) { + log_debug(LOG_VERB, + "What? There is no such peer in rack '%.*s' for key '%.*s'", + rack->name, keylen, key); + return NULL; } - - if (msg_routing == ROUTING_LOCAL_NODE_ONLY) { //always local - peer = *(struct node **)array_get(&pool->peers, 0); - } else { - /* from a given {key, keylen} pick a peer from pool */ - peer = dnode_peer_for_key_on_rack(pool, rack, key, keylen); - if (peer == NULL) { - log_debug(LOG_VERB, "What? 
There is no such peer in rack '%.*s' for key '%.*s'", - rack->name, keylen, key); - return NULL; - } - } - return peer; + } + return peer; } -struct conn * -dnode_peer_get_conn(struct context *ctx, struct node *peer, int tag) -{ - ASSERT(!peer->is_local); +struct conn *dnode_peer_get_conn(struct context *ctx, struct node *peer, + int tag) { + ASSERT(!peer->is_local); - if (peer->state == RESET) { - log_debug(LOG_WARN, "Detecting peer '%.*s' is set with state Reset", peer->name); - if (peer->conn_pool) { - conn_pool_destroy(peer->conn_pool); - peer->conn_pool = NULL; - } - - dnode_create_connection_pool(&ctx->pool, peer); - if (conn_pool_preconnect(peer->conn_pool) != DN_OK) - return NULL; - } - /* pick a connection to a given peer */ - struct conn *conn = dnode_peer_conn(peer, tag); - if (conn == NULL) { - return NULL; - } - - if (conn_connect(ctx, conn) != DN_OK) { - conn_close(ctx, conn); - return NULL; + if (peer->state == RESET) { + log_debug(LOG_WARN, "Detecting peer '%.*s' is set with state Reset", + peer->name); + if (peer->conn_pool) { + conn_pool_destroy(peer->conn_pool); + peer->conn_pool = NULL; } - return conn; + dnode_create_connection_pool(&ctx->pool, peer); + if (conn_pool_preconnect(peer->conn_pool) != DN_OK) return NULL; + } + /* pick a connection to a given peer */ + struct conn *conn = dnode_peer_conn(peer, tag); + if (conn == NULL) { + return NULL; + } + + if (conn_connect(ctx, conn) != DN_OK) { + conn_close(ctx, conn); + return NULL; + } + + return conn; } +rstatus_t dnode_peer_pool_preconnect(struct context *ctx) { + rstatus_t status; + struct server_pool *sp = &ctx->pool; -rstatus_t -dnode_peer_pool_preconnect(struct context *ctx) -{ - rstatus_t status; - struct server_pool *sp = &ctx->pool; - - if (!sp->preconnect) { - return DN_OK; - } - - status = array_each(&sp->peers, dnode_peer_each_preconnect); - if (status != DN_OK) { - return status; - } - + if (!sp->preconnect) { return DN_OK; -} - + } -void -dnode_peer_pool_disconnect(struct 
context *ctx) -{ - rstatus_t status; - struct server_pool *sp = &ctx->pool; + status = array_each(&sp->peers, dnode_peer_each_preconnect); + if (status != DN_OK) { + return status; + } - status = array_each(&sp->peers, dnode_peer_each_disconnect); - IGNORE_RET_VAL(status); + return DN_OK; } -static bool -dnode_rsp_filter(struct context *ctx, struct conn *conn, struct msg *rsp) -{ - struct msg *req; +void dnode_peer_pool_disconnect(struct context *ctx) { + rstatus_t status; + struct server_pool *sp = &ctx->pool; - ASSERT(conn->type == CONN_DNODE_PEER_SERVER); - - if (msg_empty(rsp)) { - ASSERT(conn->rmsg == NULL); - log_debug(LOG_VERB, "dyn: filter empty rsp %"PRIu64" on s %d", rsp->id, - conn->sd); - rsp_put(rsp); - return true; - } - - req = TAILQ_FIRST(&conn->omsg_q); - if (req == NULL) { - log_debug(LOG_INFO, "dyn: filter stray rsp %"PRIu64" len %"PRIu32" on s %d", - rsp->id, rsp->mlen, conn->sd); - rsp_put(rsp); - return true; - } - ASSERT(req->is_request && !req->done); + status = array_each(&sp->peers, dnode_peer_each_disconnect); + IGNORE_RET_VAL(status); +} - return false; +static bool dnode_rsp_filter(struct context *ctx, struct conn *conn, + struct msg *rsp) { + struct msg *req; + + ASSERT(conn->type == CONN_DNODE_PEER_SERVER); + + if (msg_empty(rsp)) { + ASSERT(conn->rmsg == NULL); + log_debug(LOG_VERB, "dyn: filter empty rsp %" PRIu64 " on s %d", rsp->id, + conn->sd); + rsp_put(rsp); + return true; + } + + req = TAILQ_FIRST(&conn->omsg_q); + if (req == NULL) { + log_debug(LOG_INFO, + "dyn: filter stray rsp %" PRIu64 " len %" PRIu32 " on s %d", + rsp->id, rsp->mlen, conn->sd); + rsp_put(rsp); + return true; + } + ASSERT(req->is_request && !req->done); + + return false; } -static void -dnode_rsp_forward_stats(struct context *ctx, struct msg *rsp) -{ - ASSERT(!rsp->is_request); - stats_pool_incr(ctx, peer_responses); - stats_pool_incr_by(ctx, peer_response_bytes, rsp->mlen); +static void dnode_rsp_forward_stats(struct context *ctx, struct msg *rsp) { + 
ASSERT(!rsp->is_request); + stats_pool_incr(ctx, peer_responses); + stats_pool_incr_by(ctx, peer_response_bytes, rsp->mlen); } -static void -dnode_rsp_swallow(struct context *ctx, struct conn *peer_conn, - struct msg *req, struct msg *rsp) -{ - conn_dequeue_outq(ctx, peer_conn, req); - req->done = 1; - log_debug(LOG_VERB, "conn %p swallow %p", peer_conn, req); - if (rsp) { - log_debug(LOG_INFO, "%s %s SWALLOW %s len %"PRIu32, - print_obj(peer_conn), print_obj(req), print_obj(rsp), rsp->mlen); - rsp_put(rsp); - } - req_put(req); +static void dnode_rsp_swallow(struct context *ctx, struct conn *peer_conn, + struct msg *req, struct msg *rsp) { + conn_dequeue_outq(ctx, peer_conn, req); + req->done = 1; + log_debug(LOG_VERB, "conn %p swallow %p", peer_conn, req); + if (rsp) { + log_debug(LOG_INFO, "%s %s SWALLOW %s len %" PRIu32, print_obj(peer_conn), + print_obj(req), print_obj(rsp), rsp->mlen); + rsp_put(rsp); + } + req_put(req); } /* Description: link data from a peer connection to a client-facing connection * peer_conn: a peer connection * msg : msg with data from the peer connection after parsing */ -static void -dnode_rsp_forward_match(struct context *ctx, struct conn *peer_conn, struct msg *rsp) -{ - rstatus_t status; - struct msg *req; - struct conn *c_conn; +static void dnode_rsp_forward_match(struct context *ctx, struct conn *peer_conn, + struct msg *rsp) { + rstatus_t status; + struct msg *req; + struct conn *c_conn; + + req = TAILQ_FIRST(&peer_conn->omsg_q); + c_conn = req->owner; + + /* if client consistency is dc_one forward the response from only the + local node. Since dyn_dnode_peer is always a remote node, drop the rsp */ + if (req->consistency == DC_ONE) { + if (req->swallow) { + dnode_rsp_swallow(ctx, peer_conn, req, rsp); + return; + } + // log_warn("req %d:%d with DC_ONE consistency is not being swallowed"); + } + + /* if client consistency is dc_quorum or dc_safe_quorum, forward the response + from only the local region/DC. 
*/ + if (((req->consistency == DC_QUORUM) || + (req->consistency == DC_SAFE_QUORUM)) && + !peer_conn->same_dc) { + if (req->swallow) { + dnode_rsp_swallow(ctx, peer_conn, req, rsp); + return; + } + } + + log_debug(LOG_DEBUG, + "%s DNODE RSP RECEIVED dmsg->id %u req %u:%u rsp %u:%u, ", + print_obj(peer_conn), rsp->dmsg->id, req->id, req->parent_id, + rsp->id, rsp->parent_id); + ASSERT(req != NULL); + ASSERT(req->is_request); + + if (log_loggable(LOG_VVERB)) { + loga("%s Dumping content:", print_obj(rsp)); + msg_dump(LOG_VVERB, rsp); + + loga("%d Dumping content:", print_obj(req)); + msg_dump(LOG_VVERB, req); + } + + conn_dequeue_outq(ctx, peer_conn, req); + req->done = 1; + + log_info("%s %s RECEIVED %s", print_obj(c_conn), print_obj(req), + print_obj(rsp)); + + ASSERT_LOG( + (c_conn->type == CONN_CLIENT) || (c_conn->type == CONN_DNODE_PEER_CLIENT), + "c_conn %s", print_obj(c_conn)); + + dnode_rsp_forward_stats(ctx, rsp); + // c_conn owns respnse now + status = conn_handle_response(c_conn, + req->parent_id ? req->parent_id : req->id, rsp); + IGNORE_RET_VAL(status); + if (req->swallow) { + log_info("swallow request %d:%d", req->id, req->parent_id); + req_put(req); + } +} + +/* There are chances that the request to the remote peer or its response got + * dropped. Hence we may not always receive a response to the request at the + * head of the FIFO. Hence what we do is we mark that request as errored and + * move on the next one in the outgoing queue. This works since we always have + * message ids in monotonically increasing order. 
+ */ +static void dnode_rsp_forward(struct context *ctx, struct conn *peer_conn, + struct msg *rsp) { + struct msg *req; + struct conn *c_conn; + + ASSERT(peer_conn->type == CONN_DNODE_PEER_SERVER); + + /* response from a peer implies that peer is ok and heartbeating */ + dnode_peer_ok(ctx, peer_conn); + /* dequeue peer message (request) from peer conn */ + while (true) { req = TAILQ_FIRST(&peer_conn->omsg_q); + log_debug(LOG_VERB, "dnode_rsp_forward entering req %p rsp %p...", req, + rsp); c_conn = req->owner; - /* if client consistency is dc_one forward the response from only the - local node. Since dyn_dnode_peer is always a remote node, drop the rsp */ + if (req->request_send_time) { + struct stats *st = ctx->stats; + uint64_t delay = dn_usec_now() - req->request_send_time; + if (!peer_conn->same_dc) + histo_add(&st->cross_region_latency_histo, delay); + else + histo_add(&st->cross_zone_latency_histo, delay); + } + + if (req->id == rsp->dmsg->id) { + dnode_rsp_forward_match(ctx, peer_conn, rsp); + return; + } + // Report a mismatch and try to rectify + log_error("%s MISMATCH: rsp_dmsg_id %u req %u:%u dnode rsp %u:%u", + print_obj(peer_conn), rsp->dmsg->id, req->id, req->parent_id, + rsp->id, rsp->parent_id); + if (c_conn && conn_to_ctx(c_conn)) + stats_pool_incr(conn_to_ctx(c_conn), peer_mismatch_requests); + + // TODO : should you be worried about message id getting wrapped around to + // 0? + if (rsp->dmsg->id < req->id) { + // We received a response from the past. This indeed proves out of order + // responses. A blunder to the architecture. Log it and drop the response. + log_error("MISMATCH: received response from the past. 
Dropping it"); + rsp_put(rsp); + return; + } + if (req->consistency == DC_ONE) { - if (req->swallow) { - dnode_rsp_swallow(ctx, peer_conn, req, rsp); - return; - } - //log_warn("req %d:%d with DC_ONE consistency is not being swallowed"); + if (req->swallow) { + // swallow the request and move on the next one + dnode_rsp_swallow(ctx, peer_conn, req, NULL); + continue; + } + log_warn("req %d:%d with DC_ONE consistency is not being swallowed"); } - /* if client consistency is dc_quorum or dc_safe_quorum, forward the response from only the - local region/DC. */ - if (((req->consistency == DC_QUORUM) || (req->consistency == DC_SAFE_QUORUM)) - && !peer_conn->same_dc) { - if (req->swallow) { - dnode_rsp_swallow(ctx, peer_conn, req, rsp); - return; - } + if (((req->consistency == DC_QUORUM) || + (req->consistency == DC_SAFE_QUORUM)) && + !peer_conn->same_dc) { + if (req->swallow) { + // swallow the request and move on the next one + dnode_rsp_swallow(ctx, peer_conn, req, NULL); + continue; + } } - log_debug(LOG_DEBUG, "%s DNODE RSP RECEIVED dmsg->id %u req %u:%u rsp %u:%u, ", - print_obj(peer_conn), rsp->dmsg->id, req->id, req->parent_id, rsp->id, rsp->parent_id); + log_error( + "%s MISMATCHED DNODE RSP RECEIVED dmsg->id %u req %u:%u rsp %u:%u, " + "skipping....", + print_obj(peer_conn), rsp->dmsg->id, req->id, req->parent_id, rsp->id, + rsp->parent_id); ASSERT(req != NULL); - ASSERT(req->is_request); + ASSERT(req->is_request && !req->done); if (log_loggable(LOG_VVERB)) { - loga("%s Dumping content:", print_obj(rsp)); - msg_dump(LOG_VVERB, rsp); - - loga("%d Dumping content:", print_obj(req)); - msg_dump(LOG_VVERB, req); + loga("skipping req: "); + msg_dump(LOG_VVERB, req); } conn_dequeue_outq(ctx, peer_conn, req); req->done = 1; - log_info("%s %s RECEIVED %s", print_obj(c_conn), print_obj(req), print_obj(rsp)); - - ASSERT_LOG((c_conn->type == CONN_CLIENT) || - (c_conn->type == CONN_DNODE_PEER_CLIENT), "c_conn %s", print_obj(c_conn)); - - dnode_rsp_forward_stats(ctx, rsp); 
- // c_conn owns respnse now - status = conn_handle_response(c_conn, req->parent_id ? req->parent_id : req->id, - rsp); + // Create an appropriate response for the request so its propagated up; + struct msg *err_rsp = msg_get(peer_conn, false, __FUNCTION__); + err_rsp->is_error = req->is_error = 1; + err_rsp->error_code = req->error_code = BAD_FORMAT; + err_rsp->dyn_error_code = req->dyn_error_code = BAD_FORMAT; + err_rsp->dmsg = dmsg_get(); + err_rsp->dmsg->id = req->id; + log_debug(LOG_VERB, "%p <-> %p", req, err_rsp); + /* establish err_rsp <-> req (response <-> request) link */ + err_rsp->peer = req; + + log_error( + "Peer connection s %d skipping request %u:%u, dummy err_rsp %u:%u", + peer_conn->sd, req->id, req->parent_id, err_rsp->id, + err_rsp->parent_id); + rstatus_t status = conn_handle_response( + c_conn, req->parent_id ? req->parent_id : req->id, err_rsp); IGNORE_RET_VAL(status); if (req->swallow) { - log_info("swallow request %d:%d", req->id, req->parent_id); - req_put(req); + log_debug(LOG_INFO, "swallow request %d:%d", req->id, req->parent_id); + req_put(req); } + } } -/* There are chances that the request to the remote peer or its response got dropped. - * Hence we may not always receive a response to the request at the head of the FIFO. - * Hence what we do is we mark that request as errored and move on the next one - * in the outgoing queue. This works since we always have message ids in monotonically - * increasing order. 
- */ -static void -dnode_rsp_forward(struct context *ctx, struct conn *peer_conn, struct msg *rsp) -{ - struct msg *req; - struct conn *c_conn; - - ASSERT(peer_conn->type == CONN_DNODE_PEER_SERVER); - - /* response from a peer implies that peer is ok and heartbeating */ - dnode_peer_ok(ctx, peer_conn); - - /* dequeue peer message (request) from peer conn */ - while (true) { - req = TAILQ_FIRST(&peer_conn->omsg_q); - log_debug(LOG_VERB, "dnode_rsp_forward entering req %p rsp %p...", req, rsp); - c_conn = req->owner; - - if (req->request_send_time) { - struct stats *st = ctx->stats; - uint64_t delay = dn_usec_now() - req->request_send_time; - if (!peer_conn->same_dc) - histo_add(&st->cross_region_latency_histo, delay); - else - histo_add(&st->cross_zone_latency_histo, delay); - } +static void dnode_rsp_recv_done(struct context *ctx, struct conn *conn, + struct msg *rsp, struct msg *nmsg) { + log_debug(LOG_VERB, "dnode_rsp_recv_done entering ..."); - if (req->id == rsp->dmsg->id) { - dnode_rsp_forward_match(ctx, peer_conn, rsp); - return; - } - // Report a mismatch and try to rectify - log_error("%s MISMATCH: rsp_dmsg_id %u req %u:%u dnode rsp %u:%u", - print_obj(peer_conn), - rsp->dmsg->id, req->id, req->parent_id, rsp->id, - rsp->parent_id); - if (c_conn && conn_to_ctx(c_conn)) - stats_pool_incr(conn_to_ctx(c_conn), - peer_mismatch_requests); - - // TODO : should you be worried about message id getting wrapped around to 0? - if (rsp->dmsg->id < req->id) { - // We received a response from the past. This indeed proves out of order - // responses. A blunder to the architecture. Log it and drop the response. - log_error("MISMATCH: received response from the past. 
Dropping it"); - rsp_put(rsp); - return; - } - - if (req->consistency == DC_ONE) { - if (req->swallow) { - // swallow the request and move on the next one - dnode_rsp_swallow(ctx, peer_conn, req, NULL); - continue; - } - log_warn("req %d:%d with DC_ONE consistency is not being swallowed"); - } + ASSERT(conn->type == CONN_DNODE_PEER_SERVER); + ASSERT(rsp != NULL && conn->rmsg == rsp); + ASSERT(!rsp->is_request); + ASSERT(rsp->owner == conn); + ASSERT(nmsg == NULL || !nmsg->is_request); - if (((req->consistency == DC_QUORUM) || (req->consistency == DC_SAFE_QUORUM)) - && !peer_conn->same_dc) { - if (req->swallow) { - // swallow the request and move on the next one - dnode_rsp_swallow(ctx, peer_conn, req, NULL); - continue; - } - } - - log_error("%s MISMATCHED DNODE RSP RECEIVED dmsg->id %u req %u:%u rsp %u:%u, skipping....", - print_obj(peer_conn), rsp->dmsg->id, - req->id, req->parent_id, rsp->id, rsp->parent_id); - ASSERT(req != NULL); - ASSERT(req->is_request && !req->done); - - if (log_loggable(LOG_VVERB)) { - loga("skipping req: "); - msg_dump(LOG_VVERB, req); - } + if (log_loggable(LOG_VVERB)) { + loga("Dumping content for rsp: "); + msg_dump(LOG_VVERB, rsp); - - conn_dequeue_outq(ctx, peer_conn, req); - req->done = 1; - - // Create an appropriate response for the request so its propagated up; - struct msg *err_rsp = msg_get(peer_conn, false, __FUNCTION__); - err_rsp->is_error = req->is_error = 1; - err_rsp->error_code = req->error_code = BAD_FORMAT; - err_rsp->dyn_error_code = req->dyn_error_code = BAD_FORMAT; - err_rsp->dmsg = dmsg_get(); - err_rsp->dmsg->id = req->id; - log_debug(LOG_VERB, "%p <-> %p", req, err_rsp); - /* establish err_rsp <-> req (response <-> request) link */ - err_rsp->peer = req; - - log_error("Peer connection s %d skipping request %u:%u, dummy err_rsp %u:%u", - peer_conn->sd, req->id, req->parent_id, err_rsp->id, err_rsp->parent_id); - rstatus_t status = - conn_handle_response(c_conn, req->parent_id ? 
req->parent_id : req->id, - err_rsp); - IGNORE_RET_VAL(status); - if (req->swallow) { - log_debug(LOG_INFO, "swallow request %d:%d", req->id, req->parent_id); - req_put(req); - } - } -} - -static void -dnode_rsp_recv_done(struct context *ctx, struct conn *conn, - struct msg *rsp, struct msg *nmsg) -{ - log_debug(LOG_VERB, "dnode_rsp_recv_done entering ..."); - - ASSERT(conn->type == CONN_DNODE_PEER_SERVER); - ASSERT(rsp != NULL && conn->rmsg == rsp); - ASSERT(!rsp->is_request); - ASSERT(rsp->owner == conn); - ASSERT(nmsg == NULL || !nmsg->is_request); - - if (log_loggable(LOG_VVERB)) { - loga("Dumping content for rsp: "); - msg_dump(LOG_VVERB, rsp); - - if (nmsg != NULL) { - loga("Dumping content for nmsg :"); - msg_dump(LOG_VVERB, nmsg); - } + if (nmsg != NULL) { + loga("Dumping content for nmsg :"); + msg_dump(LOG_VVERB, nmsg); } + } - /* enqueue next message (response), if any */ - conn->rmsg = nmsg; + /* enqueue next message (response), if any */ + conn->rmsg = nmsg; - if (dnode_rsp_filter(ctx, conn, rsp)) { - return; - } - dnode_rsp_forward(ctx, conn, rsp); + if (dnode_rsp_filter(ctx, conn, rsp)) { + return; + } + dnode_rsp_forward(ctx, conn, rsp); } - - -//TODOs: fix this in using dmsg_write with encrypted msgs +// TODOs: fix this in using dmsg_write with encrypted msgs // It is not in use now. 
/* void @@ -1246,7 +1205,8 @@ dnode_rsp_gos_syn(struct context *ctx, struct conn *p_conn, struct msg *msg) //p_conn->enqueue_outq(ctx, p_conn, pmsg); - //if (TAILQ_FIRST(&p_conn->omsg_q) != NULL && req_done(p_conn, TAILQ_FIRST(&p_conn->omsg_q))) { + //if (TAILQ_FIRST(&p_conn->omsg_q) != NULL && req_done(p_conn, +TAILQ_FIRST(&p_conn->omsg_q))) { // status = conn_event_add_out(p_conn); // if (status != DN_OK) { // p_conn->err = errno; @@ -1254,10 +1214,9 @@ dnode_rsp_gos_syn(struct context *ctx, struct conn *p_conn, struct msg *msg) //} - if (TAILQ_FIRST(&p_conn->omsg_q) != NULL && req_done(p_conn, TAILQ_FIRST(&p_conn->omsg_q))) { - status = conn_event_add_out(p_conn); - if (status != DN_OK) { - p_conn->err = errno; + if (TAILQ_FIRST(&p_conn->omsg_q) != NULL && req_done(p_conn, +TAILQ_FIRST(&p_conn->omsg_q))) { status = conn_event_add_out(p_conn); if (status +!= DN_OK) { p_conn->err = errno; } } @@ -1265,231 +1224,215 @@ dnode_rsp_gos_syn(struct context *ctx, struct conn *p_conn, struct msg *msg) } */ -static struct msg * -dnode_req_send_next(struct context *ctx, struct conn *conn) -{ - rstatus_t status; - - ASSERT(conn->type == CONN_DNODE_PEER_SERVER); - - // TODO: Not the right way to use time_t directly. FIXME - uint32_t now = (uint32_t)time(NULL); - //throttling the sending traffics here - if (!conn->same_dc) { - if (conn->last_sent != 0) { - uint32_t elapsed_time = now - conn->last_sent; - uint32_t earned_tokens = elapsed_time * msgs_per_sec(); - conn->avail_tokens = (conn->avail_tokens + earned_tokens) < msgs_per_sec()? 
- conn->avail_tokens + earned_tokens : msgs_per_sec(); - - } - - conn->last_sent = now; - if (conn->avail_tokens > 0) { - conn->avail_tokens--; - return req_send_next(ctx, conn); - } - - //requeue - status = conn_event_add_out(conn); - IGNORE_RET_VAL(status); - - return NULL; +static struct msg *dnode_req_send_next(struct context *ctx, struct conn *conn) { + rstatus_t status; + + ASSERT(conn->type == CONN_DNODE_PEER_SERVER); + + // TODO: Not the right way to use time_t directly. FIXME + uint32_t now = (uint32_t)time(NULL); + // throttling the sending traffics here + if (!conn->same_dc) { + if (conn->last_sent != 0) { + uint32_t elapsed_time = now - conn->last_sent; + uint32_t earned_tokens = elapsed_time * msgs_per_sec(); + conn->avail_tokens = (conn->avail_tokens + earned_tokens) < msgs_per_sec() + ? conn->avail_tokens + earned_tokens + : msgs_per_sec(); } conn->last_sent = now; - return req_send_next(ctx, conn); -} + if (conn->avail_tokens > 0) { + conn->avail_tokens--; + return req_send_next(ctx, conn); + } -static void -dnode_req_peer_enqueue_imsgq(struct context *ctx, struct conn *conn, struct msg *req) -{ - ASSERT(req->is_request); - ASSERT(conn->type == CONN_DNODE_PEER_SERVER); - req->request_inqueue_enqueue_time_us = dn_usec_now(); + // requeue + status = conn_event_add_out(conn); + IGNORE_RET_VAL(status); - if (req->expect_datastore_reply) { - msg_tmo_insert(req, conn); - } - TAILQ_INSERT_TAIL(&conn->imsg_q, req, s_tqe); - log_debug(LOG_VERB, "conn %p enqueue inq %d:%d", conn, req->id, req->parent_id); - - if (conn->same_dc) { - histo_add(&ctx->stats->peer_in_queue, TAILQ_COUNT(&conn->imsg_q)); - stats_pool_incr(ctx, peer_in_queue); - stats_pool_incr_by(ctx, peer_in_queue_bytes, req->mlen); - } else { - histo_add(&ctx->stats->remote_peer_in_queue, TAILQ_COUNT(&conn->imsg_q)); - stats_pool_incr(ctx, remote_peer_in_queue); - stats_pool_incr_by(ctx, remote_peer_in_queue_bytes, req->mlen); - } + return NULL; + } + conn->last_sent = now; + return 
req_send_next(ctx, conn); } -static void -dnode_req_peer_dequeue_imsgq(struct context *ctx, struct conn *conn, struct msg *req) -{ - ASSERT(req->is_request); - ASSERT(conn->type == CONN_DNODE_PEER_SERVER); - - usec_t delay_us = 0; - if (req->request_inqueue_enqueue_time_us) { - delay_us = dn_usec_now() - req->request_inqueue_enqueue_time_us; - if (conn->same_dc) - histo_add(&ctx->stats->cross_zone_queue_wait_time_histo, delay_us); - else - histo_add(&ctx->stats->cross_region_queue_wait_time_histo, delay_us); - } - TAILQ_REMOVE(&conn->imsg_q, req, s_tqe); - log_debug(LOG_VERB, "conn %p dequeue inq %d:%d", conn, req->id, req->parent_id); - - if (conn->same_dc) { - histo_add(&ctx->stats->peer_in_queue, TAILQ_COUNT(&conn->imsg_q)); - stats_pool_decr(ctx, peer_in_queue); - stats_pool_decr_by(ctx, peer_in_queue_bytes, req->mlen); - } else { - histo_add(&ctx->stats->remote_peer_in_queue, TAILQ_COUNT(&conn->imsg_q)); - stats_pool_decr(ctx, remote_peer_in_queue); - stats_pool_decr_by(ctx, remote_peer_in_queue_bytes, req->mlen); - } +static void dnode_req_peer_enqueue_imsgq(struct context *ctx, struct conn *conn, + struct msg *req) { + ASSERT(req->is_request); + ASSERT(conn->type == CONN_DNODE_PEER_SERVER); + req->request_inqueue_enqueue_time_us = dn_usec_now(); + + if (req->expect_datastore_reply) { + msg_tmo_insert(req, conn); + } + TAILQ_INSERT_TAIL(&conn->imsg_q, req, s_tqe); + log_debug(LOG_VERB, "conn %p enqueue inq %d:%d", conn, req->id, + req->parent_id); + + if (conn->same_dc) { + histo_add(&ctx->stats->peer_in_queue, TAILQ_COUNT(&conn->imsg_q)); + stats_pool_incr(ctx, peer_in_queue); + stats_pool_incr_by(ctx, peer_in_queue_bytes, req->mlen); + } else { + histo_add(&ctx->stats->remote_peer_in_queue, TAILQ_COUNT(&conn->imsg_q)); + stats_pool_incr(ctx, remote_peer_in_queue); + stats_pool_incr_by(ctx, remote_peer_in_queue_bytes, req->mlen); + } } -static void -dnode_req_peer_enqueue_omsgq(struct context *ctx, struct conn *conn, struct msg *req) -{ - 
ASSERT(req->is_request); - ASSERT(conn->type == CONN_DNODE_PEER_SERVER); - - TAILQ_INSERT_TAIL(&conn->omsg_q, req, s_tqe); - log_debug(LOG_VERB, "conn %p enqueue outq %d:%d", conn, req->id, req->parent_id); - - if (conn->same_dc) { - histo_add(&ctx->stats->peer_out_queue, TAILQ_COUNT(&conn->omsg_q)); - stats_pool_incr(ctx, peer_out_queue); - stats_pool_incr_by(ctx, peer_out_queue_bytes, req->mlen); - } else { - histo_add(&ctx->stats->remote_peer_out_queue, TAILQ_COUNT(&conn->omsg_q)); - stats_pool_incr(ctx, remote_peer_out_queue); - stats_pool_incr_by(ctx, remote_peer_out_queue_bytes, req->mlen); - } +static void dnode_req_peer_dequeue_imsgq(struct context *ctx, struct conn *conn, + struct msg *req) { + ASSERT(req->is_request); + ASSERT(conn->type == CONN_DNODE_PEER_SERVER); + + usec_t delay_us = 0; + if (req->request_inqueue_enqueue_time_us) { + delay_us = dn_usec_now() - req->request_inqueue_enqueue_time_us; + if (conn->same_dc) + histo_add(&ctx->stats->cross_zone_queue_wait_time_histo, delay_us); + else + histo_add(&ctx->stats->cross_region_queue_wait_time_histo, delay_us); + } + TAILQ_REMOVE(&conn->imsg_q, req, s_tqe); + log_debug(LOG_VERB, "conn %p dequeue inq %d:%d", conn, req->id, + req->parent_id); + + if (conn->same_dc) { + histo_add(&ctx->stats->peer_in_queue, TAILQ_COUNT(&conn->imsg_q)); + stats_pool_decr(ctx, peer_in_queue); + stats_pool_decr_by(ctx, peer_in_queue_bytes, req->mlen); + } else { + histo_add(&ctx->stats->remote_peer_in_queue, TAILQ_COUNT(&conn->imsg_q)); + stats_pool_decr(ctx, remote_peer_in_queue); + stats_pool_decr_by(ctx, remote_peer_in_queue_bytes, req->mlen); + } } -static void -dnode_req_peer_dequeue_omsgq(struct context *ctx, struct conn *conn, struct msg *req) -{ - ASSERT(req->is_request); - ASSERT(conn->type == CONN_DNODE_PEER_SERVER); - - msg_tmo_delete(req); - - TAILQ_REMOVE(&conn->omsg_q, req, s_tqe); - log_debug(LOG_VVERB, "conn %p dequeue outq %p", conn, req); - - if (conn->same_dc) { - histo_add(&ctx->stats->peer_out_queue, 
TAILQ_COUNT(&conn->omsg_q)); - stats_pool_decr(ctx, peer_out_queue); - stats_pool_decr_by(ctx, peer_out_queue_bytes, req->mlen); - } else { - histo_add(&ctx->stats->remote_peer_out_queue, TAILQ_COUNT(&conn->omsg_q)); - stats_pool_decr(ctx, remote_peer_out_queue); - stats_pool_decr_by(ctx, remote_peer_out_queue_bytes, req->mlen); - } +static void dnode_req_peer_enqueue_omsgq(struct context *ctx, struct conn *conn, + struct msg *req) { + ASSERT(req->is_request); + ASSERT(conn->type == CONN_DNODE_PEER_SERVER); + + TAILQ_INSERT_TAIL(&conn->omsg_q, req, s_tqe); + log_debug(LOG_VERB, "conn %p enqueue outq %d:%d", conn, req->id, + req->parent_id); + + if (conn->same_dc) { + histo_add(&ctx->stats->peer_out_queue, TAILQ_COUNT(&conn->omsg_q)); + stats_pool_incr(ctx, peer_out_queue); + stats_pool_incr_by(ctx, peer_out_queue_bytes, req->mlen); + } else { + histo_add(&ctx->stats->remote_peer_out_queue, TAILQ_COUNT(&conn->omsg_q)); + stats_pool_incr(ctx, remote_peer_out_queue); + stats_pool_incr_by(ctx, remote_peer_out_queue_bytes, req->mlen); + } } +static void dnode_req_peer_dequeue_omsgq(struct context *ctx, struct conn *conn, + struct msg *req) { + ASSERT(req->is_request); + ASSERT(conn->type == CONN_DNODE_PEER_SERVER); + + msg_tmo_delete(req); + + TAILQ_REMOVE(&conn->omsg_q, req, s_tqe); + log_debug(LOG_VVERB, "conn %p dequeue outq %p", conn, req); + + if (conn->same_dc) { + histo_add(&ctx->stats->peer_out_queue, TAILQ_COUNT(&conn->omsg_q)); + stats_pool_decr(ctx, peer_out_queue); + stats_pool_decr_by(ctx, peer_out_queue_bytes, req->mlen); + } else { + histo_add(&ctx->stats->remote_peer_out_queue, TAILQ_COUNT(&conn->omsg_q)); + stats_pool_decr(ctx, remote_peer_out_queue); + stats_pool_decr_by(ctx, remote_peer_out_queue_bytes, req->mlen); + } +} -struct conn_ops dnode_peer_ops = { - msg_recv, - rsp_recv_next, - dnode_rsp_recv_done, - msg_send, - dnode_req_send_next, - req_send_done, - dnode_peer_close, - dnode_peer_active, - dnode_peer_ref, - dnode_peer_unref, - 
dnode_req_peer_enqueue_imsgq, - dnode_req_peer_dequeue_imsgq, - dnode_req_peer_enqueue_omsgq, - dnode_req_peer_dequeue_omsgq, - conn_cant_handle_response -}; - -void -init_dnode_peer_conn(struct conn *conn) -{ - conn->dyn_mode = 1; - conn->type = CONN_DNODE_PEER_SERVER; - conn->ops = &dnode_peer_ops; +struct conn_ops dnode_peer_ops = {msg_recv, + rsp_recv_next, + dnode_rsp_recv_done, + msg_send, + dnode_req_send_next, + req_send_done, + dnode_peer_close, + dnode_peer_active, + dnode_peer_ref, + dnode_peer_unref, + dnode_req_peer_enqueue_imsgq, + dnode_req_peer_dequeue_imsgq, + dnode_req_peer_enqueue_omsgq, + dnode_req_peer_dequeue_omsgq, + conn_cant_handle_response}; + +void init_dnode_peer_conn(struct conn *conn) { + conn->dyn_mode = 1; + conn->type = CONN_DNODE_PEER_SERVER; + conn->ops = &dnode_peer_ops; } -static int -rack_name_cmp(const void *t1, const void *t2) -{ - const struct rack *s1 = t1, *s2 = t2; +static int rack_name_cmp(const void *t1, const void *t2) { + const struct rack *s1 = t1, *s2 = t2; - return string_compare(s1->name, s2->name); + return string_compare(s1->name, s2->name); } // The idea here is to have a designated rack in each remote region to replicate // data to. This is used to replicate writes to remote regions -void -preselect_remote_rack_for_replication(struct context *ctx) -{ - struct server_pool *sp = &ctx->pool; - uint32_t dc_cnt = array_n(&sp->datacenters); - uint32_t dc_index; - uint32_t my_rack_index = 0; - - // Sort the racks in the dcs - for(dc_index = 0; dc_index < dc_cnt; dc_index++) { - struct datacenter *dc = array_get(&sp->datacenters, dc_index); - // sort the racks. 
- array_sort(&dc->racks, rack_name_cmp); - } - - // Find the rack index for the local rack - for(dc_index = 0; dc_index < dc_cnt; dc_index++) { - struct datacenter *dc = array_get(&sp->datacenters, dc_index); - - if (string_compare(dc->name, &sp->dc) != 0) - continue; - - // if the dc is a local dc, get the rack_idx - uint32_t rack_index; - uint32_t rack_cnt = array_n(&dc->racks); - for(rack_index = 0; rack_index < rack_cnt; rack_index++) { - struct rack *rack = array_get(&dc->racks, rack_index); - if (string_compare(rack->name, &sp->rack) == 0) { - my_rack_index = rack_index; - log_notice("my rack index %u", my_rack_index); - break; - } - } - } - - // For every remote DC, find the corresponding rack to replicate to. - for(dc_index = 0; dc_index < dc_cnt; dc_index++) { - struct datacenter *dc = array_get(&sp->datacenters, dc_index); - dc->preselected_rack_for_replication = NULL; - - // Nothing to do for local DC, continue; - if (string_compare(dc->name, &sp->dc) == 0) - continue; - - // if no racks, keep preselected_rack_for_replication as NULL - uint32_t rack_cnt = array_n(&dc->racks); - if (rack_cnt == 0) - continue; - - // if the dc is a remote dc, get the rack at rack_idx - // use that as preselected rack for replication - uint32_t this_rack_index = my_rack_index % rack_cnt; - dc->preselected_rack_for_replication = array_get(&dc->racks, - this_rack_index); - log_notice("Selected rack %.*s for replication to remote region %.*s", - dc->preselected_rack_for_replication->name->len, - dc->preselected_rack_for_replication->name->data, - dc->name->len, dc->name->data); +void preselect_remote_rack_for_replication(struct context *ctx) { + struct server_pool *sp = &ctx->pool; + uint32_t dc_cnt = array_n(&sp->datacenters); + uint32_t dc_index; + uint32_t my_rack_index = 0; + + // Sort the racks in the dcs + for (dc_index = 0; dc_index < dc_cnt; dc_index++) { + struct datacenter *dc = array_get(&sp->datacenters, dc_index); + // sort the racks. 
+ array_sort(&dc->racks, rack_name_cmp); + } + + // Find the rack index for the local rack + for (dc_index = 0; dc_index < dc_cnt; dc_index++) { + struct datacenter *dc = array_get(&sp->datacenters, dc_index); + + if (string_compare(dc->name, &sp->dc) != 0) continue; + + // if the dc is a local dc, get the rack_idx + uint32_t rack_index; + uint32_t rack_cnt = array_n(&dc->racks); + for (rack_index = 0; rack_index < rack_cnt; rack_index++) { + struct rack *rack = array_get(&dc->racks, rack_index); + if (string_compare(rack->name, &sp->rack) == 0) { + my_rack_index = rack_index; + log_notice("my rack index %u", my_rack_index); + break; + } } + } + + // For every remote DC, find the corresponding rack to replicate to. + for (dc_index = 0; dc_index < dc_cnt; dc_index++) { + struct datacenter *dc = array_get(&sp->datacenters, dc_index); + dc->preselected_rack_for_replication = NULL; + + // Nothing to do for local DC, continue; + if (string_compare(dc->name, &sp->dc) == 0) continue; + + // if no racks, keep preselected_rack_for_replication as NULL + uint32_t rack_cnt = array_n(&dc->racks); + if (rack_cnt == 0) continue; + + // if the dc is a remote dc, get the rack at rack_idx + // use that as preselected rack for replication + uint32_t this_rack_index = my_rack_index % rack_cnt; + dc->preselected_rack_for_replication = + array_get(&dc->racks, this_rack_index); + log_notice("Selected rack %.*s for replication to remote region %.*s", + dc->preselected_rack_for_replication->name->len, + dc->preselected_rack_for_replication->name->data, dc->name->len, + dc->name->data); + } } diff --git a/src/dyn_dnode_peer.h b/src/dyn_dnode_peer.h index accb528f7..a8d8cc0cb 100644 --- a/src/dyn_dnode_peer.h +++ b/src/dyn_dnode_peer.h @@ -1,30 +1,38 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. 
- */ -#include "dyn_core.h" -#include "dyn_server.h" - + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ #ifndef _DYN_DNODE_PEER_H_ #define _DYN_DNODE_PEER_H_ -#define MAX_WAIT_BEFORE_RECONNECT_IN_SECS 10 -#define WAIT_BEFORE_UPDATE_PEERS_IN_MILLIS 30000 +#include "dyn_message.h" +#include "dyn_types.h" + +#define MAX_WAIT_BEFORE_RECONNECT_IN_SECS 10 +#define WAIT_BEFORE_UPDATE_PEERS_IN_MILLIS 30000 + +// Forward declarations +struct context; +struct msg; +struct rack; msec_t dnode_peer_timeout(struct msg *msg, struct conn *conn); rstatus_t dnode_initialize_peers(struct context *ctx); void dnode_peer_deinit(struct array *nodes); void dnode_peer_connected(struct context *ctx, struct conn *conn); -struct node *dnode_peer_pool_server(struct context *ctx, struct server_pool *pool, - struct rack *rack, uint8_t *key, uint32_t keylen, +struct node *dnode_peer_pool_server(struct context *ctx, + struct server_pool *pool, struct rack *rack, + uint8_t *key, uint32_t keylen, msg_routing_t msg_routing); -struct conn *dnode_peer_get_conn(struct context *ctx, struct node *server, int tag); +struct conn *dnode_peer_get_conn(struct context *ctx, struct node *server, + int tag); rstatus_t dnode_peer_pool_preconnect(struct context *ctx); void dnode_peer_pool_disconnect(struct context *ctx); -uint32_t dnode_peer_idx_for_key_on_rack(struct server_pool *pool, struct rack *rack, - uint8_t *key, uint32_t keylen); +uint32_t dnode_peer_idx_for_key_on_rack(struct server_pool *pool, + struct rack *rack, uint8_t *key, + uint32_t keylen); rstatus_t dnode_peer_forward_state(void *rmsg); rstatus_t dnode_peer_add(void *rmsg); rstatus_t dnode_peer_replace(void *rmsg); @@ -32,4 +40,4 @@ rstatus_t dnode_peer_handshake_announcing(void *rmsg); void init_dnode_peer_conn(struct conn *conn); void preselect_remote_rack_for_replication(struct context *ctx); -#endif +#endif diff --git a/src/dyn_dnode_proxy.c 
b/src/dyn_dnode_proxy.c index 36d26ab75..505f04a88 100644 --- a/src/dyn_dnode_proxy.c +++ b/src/dyn_dnode_proxy.c @@ -1,257 +1,231 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ -#include #include +#include #include "dyn_core.h" -#include "dyn_server.h" -#include "dyn_dnode_peer.h" #include "dyn_dnode_client.h" +#include "dyn_dnode_peer.h" #include "dyn_dnode_proxy.h" +#include "dyn_server.h" -static void -dnode_ref(struct conn *conn, void *owner) -{ - struct server_pool *pool = owner; +static void dnode_ref(struct conn *conn, void *owner) { + struct server_pool *pool = owner; - ASSERT(conn->type == CONN_DNODE_PEER_PROXY); - ASSERT(conn->owner == NULL); + ASSERT(conn->type == CONN_DNODE_PEER_PROXY); + ASSERT(conn->owner == NULL); - conn->family = pool->dnode_proxy_endpoint.family; - conn->addrlen = pool->dnode_proxy_endpoint.addrlen; - conn->addr = pool->dnode_proxy_endpoint.addr; - string_duplicate(&conn->pname, &pool->dnode_proxy_endpoint.pname); + conn->family = pool->dnode_proxy_endpoint.family; + conn->addrlen = pool->dnode_proxy_endpoint.addrlen; + conn->addr = pool->dnode_proxy_endpoint.addr; + string_duplicate(&conn->pname, &pool->dnode_proxy_endpoint.pname); - pool->d_conn = conn; + pool->d_conn = conn; - /* owner of the proxy connection is the server pool */ - conn->owner = owner; + /* owner of the proxy connection is the server pool */ + conn->owner = owner; - log_debug(LOG_VVERB, "ref conn %p owner %p", conn, pool); + log_debug(LOG_VVERB, "ref conn %p owner %p", conn, pool); } -static void -dnode_unref(struct conn *conn) -{ - struct server_pool *pool; +static void dnode_unref(struct conn *conn) { + struct server_pool *pool; - ASSERT(conn->type == CONN_DNODE_PEER_PROXY); - ASSERT(conn->owner != NULL); + ASSERT(conn->type == 
CONN_DNODE_PEER_PROXY); + ASSERT(conn->owner != NULL); - conn_event_del_conn(conn); - pool = conn->owner; - conn->owner = NULL; + conn_event_del_conn(conn); + pool = conn->owner; + conn->owner = NULL; - pool->d_conn = NULL; + pool->d_conn = NULL; - log_debug(LOG_VVERB, "unref conn %p owner %p", conn, pool); + log_debug(LOG_VVERB, "unref conn %p owner %p", conn, pool); } -static void -dnode_close(struct context *ctx, struct conn *conn) -{ - rstatus_t status; - - ASSERT(conn->type == CONN_DNODE_PEER_PROXY); - - if (conn->sd < 0) { - conn_unref(conn); - conn_put(conn); - return; - } +static void dnode_close(struct context *ctx, struct conn *conn) { + rstatus_t status; - ASSERT(conn->rmsg == NULL); - ASSERT(conn->smsg == NULL); - ASSERT(TAILQ_EMPTY(&conn->imsg_q)); - ASSERT(TAILQ_EMPTY(&conn->omsg_q)); + ASSERT(conn->type == CONN_DNODE_PEER_PROXY); + if (conn->sd < 0) { conn_unref(conn); + conn_put(conn); + return; + } - status = close(conn->sd); - if (status < 0) { - log_error("close p %d failed, ignored: %s", conn->sd, strerror(errno)); - } - conn->sd = -1; + ASSERT(conn->rmsg == NULL); + ASSERT(conn->smsg == NULL); + ASSERT(TAILQ_EMPTY(&conn->imsg_q)); + ASSERT(TAILQ_EMPTY(&conn->omsg_q)); - conn_put(conn); + conn_unref(conn); + + status = close(conn->sd); + if (status < 0) { + log_error("close p %d failed, ignored: %s", conn->sd, strerror(errno)); + } + conn->sd = -1; + + conn_put(conn); } /* * Initialize the Dynomite node. 
Check the connection and backend data store, * then log a message with the socket descriptor, the Dynomite */ -rstatus_t -dnode_proxy_init(struct context *ctx) -{ - rstatus_t status; - struct server_pool *pool = &ctx->pool; - struct conn *p = conn_get(pool, init_dnode_proxy_conn); - if (p == NULL) { - return DN_ENOMEM; - } - - status = conn_listen(pool->ctx, p); - if (status != DN_OK) { - conn_close(pool->ctx, p); - return status; - } - - const char * log_datastore = "not selected data store"; - if (g_data_store == DATA_REDIS){ - log_datastore = "redis"; - } - else if (g_data_store == DATA_MEMCACHE){ - log_datastore = "memcache"; - } - - log_debug(LOG_NOTICE, "%s inited in %s %s", - print_obj(p), log_datastore, print_obj(pool)); - return DN_OK; +rstatus_t dnode_proxy_init(struct context *ctx) { + rstatus_t status; + struct server_pool *pool = &ctx->pool; + struct conn *p = conn_get(pool, init_dnode_proxy_conn); + if (p == NULL) { + return DN_ENOMEM; + } + + status = conn_listen(pool->ctx, p); + if (status != DN_OK) { + conn_close(pool->ctx, p); + return status; + } + + const char *log_datastore = "not selected data store"; + if (g_data_store == DATA_REDIS) { + log_datastore = "redis"; + } else if (g_data_store == DATA_MEMCACHE) { + log_datastore = "memcache"; + } + + log_debug(LOG_NOTICE, "%s inited in %s %s", print_obj(p), log_datastore, + print_obj(pool)); + return DN_OK; } -void -dnode_proxy_deinit(struct context *ctx) -{ - struct server_pool *pool = &ctx->pool; - struct conn *p = pool->d_conn; - if (p != NULL) { - conn_close(pool->ctx, p); - } +void dnode_proxy_deinit(struct context *ctx) { + struct server_pool *pool = &ctx->pool; + struct conn *p = pool->d_conn; + if (p != NULL) { + conn_close(pool->ctx, p); + } - log_debug(LOG_VVERB, "deinit dnode"); + log_debug(LOG_VVERB, "deinit dnode"); } -static rstatus_t -dnode_accept(struct context *ctx, struct conn *p) -{ - rstatus_t status; - struct conn *c; - struct sockaddr_in client_address; - socklen_t client_len 
= sizeof(client_address); - int sd = 0; - - ASSERT(p->type == CONN_DNODE_PEER_PROXY); - ASSERT(p->sd > 0); - ASSERT(p->recv_active && p->recv_ready); - - - for (;;) { - sd = accept(p->sd, (struct sockaddr *)&client_address, &client_len); - if (sd < 0) { - if (errno == EINTR) { - log_warn("accept on %s not ready - eintr", print_obj(p)); - continue; - } - - if (errno == EAGAIN || errno == EWOULDBLOCK) { - p->recv_ready = 0; - return DN_OK; - } - - /* - * FIXME: On EMFILE or ENFILE mask out IN event on the proxy; mask - * it back in when some existing connection gets closed - */ - - log_error("accept on %s failed: %s", print_obj(p), strerror(errno)); - return DN_ERROR; - } - - break; +static rstatus_t dnode_accept(struct context *ctx, struct conn *p) { + rstatus_t status; + struct conn *c; + struct sockaddr_in client_address; + socklen_t client_len = sizeof(client_address); + int sd = 0; + + ASSERT(p->type == CONN_DNODE_PEER_PROXY); + ASSERT(p->sd > 0); + ASSERT(p->recv_active && p->recv_ready); + + for (;;) { + sd = accept(p->sd, (struct sockaddr *)&client_address, &client_len); + if (sd < 0) { + if (errno == EINTR) { + log_warn("accept on %s not ready - eintr", print_obj(p)); + continue; + } + + if (errno == EAGAIN || errno == EWOULDBLOCK) { + p->recv_ready = 0; + return DN_OK; + } + + /* + * FIXME: On EMFILE or ENFILE mask out IN event on the proxy; mask + * it back in when some existing connection gets closed + */ + + log_error("accept on %s failed: %s", print_obj(p), strerror(errno)); + return DN_ERROR; } - char clntName[INET_ADDRSTRLEN]; - - if(inet_ntop(AF_INET, &client_address.sin_addr.s_addr, clntName, sizeof(clntName))!=NULL){ - loga("Accepting client connection from %s:%d on sd %d",clntName, ntohs(client_address.sin_port), sd); - } else { - loga("Unable to get client's address for accept on sd %d\n", sd); - } + break; + } - c = conn_get(p->owner, init_dnode_client_conn); - if (c == NULL) { - log_error("get conn for PEER_CLIENT %d from %s failed: %s", sd, 
print_obj(p), - strerror(errno)); - status = close(sd); - if (status < 0) { - log_error("dyn: close c %d failed, ignored: %s", sd, strerror(errno)); - } - return DN_ENOMEM; - } - c->sd = sd; - string_copy_c(&c->pname, (unsigned char *)dn_unresolve_peer_desc(c->sd)); + char clntName[INET_ADDRSTRLEN]; - stats_pool_incr(ctx, dnode_client_connections); + if (inet_ntop(AF_INET, &client_address.sin_addr.s_addr, clntName, + sizeof(clntName)) != NULL) { + loga("Accepting client connection from %s:%d on sd %d", clntName, + ntohs(client_address.sin_port), sd); + } else { + loga("Unable to get client's address for accept on sd %d\n", sd); + } - status = dn_set_nonblocking(c->sd); + c = conn_get(p->owner, init_dnode_client_conn); + if (c == NULL) { + log_error("get conn for PEER_CLIENT %d from %s failed: %s", sd, + print_obj(p), strerror(errno)); + status = close(sd); if (status < 0) { - log_error("%s Failed to set nonblock on %s: %s", print_obj(p), print_obj(c), strerror(errno)); - conn_close(ctx, c); - return status; - } - - if (p->family == AF_INET || p->family == AF_INET6) { - status = dn_set_tcpnodelay(c->sd); - if (status < 0) { - log_warn("%s Failed to set tcpnodelay on %s: %s", - print_obj(p), print_obj(c), strerror(errno)); - } + log_error("dyn: close c %d failed, ignored: %s", sd, strerror(errno)); } - - status = conn_event_add_conn(c); + return DN_ENOMEM; + } + c->sd = sd; + string_copy_c(&c->pname, (unsigned char *)dn_unresolve_peer_desc(c->sd)); + + stats_pool_incr(ctx, dnode_client_connections); + + status = dn_set_nonblocking(c->sd); + if (status < 0) { + log_error("%s Failed to set nonblock on %s: %s", print_obj(p), print_obj(c), + strerror(errno)); + conn_close(ctx, c); + return status; + } + + if (p->family == AF_INET || p->family == AF_INET6) { + status = dn_set_tcpnodelay(c->sd); if (status < 0) { - log_error("%s Failed to add %s to event loop: %s", print_obj(p), print_obj(c), strerror(errno)); - conn_close(ctx, c); - return status; + log_warn("%s Failed to 
set tcpnodelay on %s: %s", print_obj(p), + print_obj(c), strerror(errno)); } + } - log_notice("%s accepted %s", print_obj(p), print_obj(c)); + status = conn_event_add_conn(c); + if (status < 0) { + log_error("%s Failed to add %s to event loop: %s", print_obj(p), + print_obj(c), strerror(errno)); + conn_close(ctx, c); + return status; + } - return DN_OK; + log_notice("%s accepted %s", print_obj(p), print_obj(c)); + + return DN_OK; } -static rstatus_t -dnode_recv(struct context *ctx, struct conn *conn) -{ - ASSERT(conn->type == CONN_DNODE_PEER_PROXY); - ASSERT(conn->recv_active); - - conn->recv_ready = 1; - do { - if (dnode_accept(ctx, conn) != DN_OK) { - log_error("%s Failed to accept a connection. Continuing", print_obj(conn)); - continue; - } - } while (conn->recv_ready); - - return DN_OK; +static rstatus_t dnode_recv(struct context *ctx, struct conn *conn) { + ASSERT(conn->type == CONN_DNODE_PEER_PROXY); + ASSERT(conn->recv_active); + + conn->recv_ready = 1; + do { + if (dnode_accept(ctx, conn) != DN_OK) { + log_error("%s Failed to accept a connection. 
Continuing", + print_obj(conn)); + continue; + } + } while (conn->recv_ready); + + return DN_OK; } struct conn_ops dnode_server_ops = { - dnode_recv, - NULL, - NULL, - NULL, - NULL, - NULL, - dnode_close, - NULL, - dnode_ref, - dnode_unref, - NULL, - NULL, - NULL, - NULL, - conn_cant_handle_response -}; - -void -init_dnode_proxy_conn(struct conn *conn) -{ - conn->dyn_mode = 1; - conn->type = CONN_DNODE_PEER_PROXY; - conn->ops = &dnode_server_ops; + dnode_recv, NULL, NULL, NULL, NULL, + NULL, dnode_close, NULL, dnode_ref, dnode_unref, + NULL, NULL, NULL, NULL, conn_cant_handle_response}; + +void init_dnode_proxy_conn(struct conn *conn) { + conn->dyn_mode = 1; + conn->type = CONN_DNODE_PEER_PROXY; + conn->ops = &dnode_server_ops; } diff --git a/src/dyn_dnode_proxy.h b/src/dyn_dnode_proxy.h index e852f17cb..27e1fff79 100644 --- a/src/dyn_dnode_proxy.h +++ b/src/dyn_dnode_proxy.h @@ -1,16 +1,19 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ - - -#include "dyn_core.h" + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ #ifndef _DYN_DNODE_SERVER_H_ #define _DYN_DNODE_SERVER_H_ +#include "dyn_types.h" + +// Forward declarations +struct conn; +struct context; + rstatus_t dnode_proxy_init(struct context *ctx); void dnode_proxy_deinit(struct context *ctx); void init_dnode_proxy_conn(struct conn *conn); -#endif +#endif diff --git a/src/dyn_dnode_request.c b/src/dyn_dnode_request.c index 9a4cd1234..a4627569b 100644 --- a/src/dyn_dnode_request.c +++ b/src/dyn_dnode_request.c @@ -1,155 +1,153 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. 
+ */ #include "dyn_core.h" #include "dyn_dnode_peer.h" #include "dyn_mbuf.h" #include "dyn_server.h" - -//static struct string client_request_dyn_msg = string("Client_request"); +// static struct string client_request_dyn_msg = string("Client_request"); static uint64_t peer_msg_id = 0; -static void -dnode_req_forward_error(struct context *ctx, struct conn *conn, struct msg *req) -{ - rstatus_t status; +static void dnode_req_forward_error(struct context *ctx, struct conn *conn, + struct msg *req) { + rstatus_t status; - ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); + ASSERT(conn->type == CONN_DNODE_PEER_CLIENT); - log_debug(LOG_INFO, "dyn: forward req %"PRIu64" len %"PRIu32" type %d from " - "c %d failed: %s", req->id, req->mlen, req->type, conn->sd, - strerror(errno)); + log_debug(LOG_INFO, + "dyn: forward req %" PRIu64 " len %" PRIu32 + " type %d from " + "c %d failed: %s", + req->id, req->mlen, req->type, conn->sd, strerror(errno)); - req->done = 1; - req->is_error = 1; - req->error_code = errno; + req->done = 1; + req->is_error = 1; + req->error_code = errno; - if (!req->expect_datastore_reply || req->swallow) { - req_put(req); - return; - } + if (!req->expect_datastore_reply || req->swallow) { + req_put(req); + return; + } - if (req_done(conn, TAILQ_FIRST(&conn->omsg_q))) { - status = conn_event_add_out(conn); - if (status != DN_OK) { - conn->err = errno; - } + if (req_done(conn, TAILQ_FIRST(&conn->omsg_q))) { + status = conn_event_add_out(conn); + if (status != DN_OK) { + conn->err = errno; } - + } } -static void -dnode_peer_req_forward_stats(struct context *ctx, struct node *server, struct msg *req) -{ - ASSERT(req->is_request); - stats_pool_incr(ctx, peer_requests); - stats_pool_incr_by(ctx, peer_request_bytes, req->mlen); +static void dnode_peer_req_forward_stats(struct context *ctx, + struct node *server, struct msg *req) { + ASSERT(req->is_request); + stats_pool_incr(ctx, peer_requests); + stats_pool_incr_by(ctx, peer_request_bytes, req->mlen); } - /* 
Forward a client request over to a peer */ -rstatus_t -dnode_peer_req_forward(struct context *ctx, struct conn *c_conn, - struct conn *p_conn, struct msg *req, - struct rack *rack, uint8_t *key, uint32_t keylen, - dyn_error_t *dyn_error_code) -{ - - struct node *server = p_conn->owner; - log_info("%s FORWARD %s to peer %s on rack '%.*s' dc '%.*s' ", - print_obj(c_conn), print_obj(req), print_obj(p_conn), rack->name->len, rack->name->data, - server->dc.len, server->dc.data); - - struct string *dc = rack->dc; - rstatus_t status; - - ASSERT(p_conn->type == CONN_DNODE_PEER_SERVER); - ASSERT((c_conn->type == CONN_CLIENT) || - (c_conn->type == CONN_DNODE_PEER_CLIENT)); - - /* enqueue the message (request) into peer inq */ - status = conn_event_add_out(p_conn); - if (status != DN_OK) { - *dyn_error_code = DYNOMITE_UNKNOWN_ERROR; - p_conn->err = errno; - return DN_ERROR; - } - - struct mbuf *header_buf = mbuf_get(); - if (header_buf == NULL) { - loga("Unable to obtain an mbuf for dnode msg's header!"); - *dyn_error_code = DYNOMITE_OK; - return DN_ENOMEM; +rstatus_t dnode_peer_req_forward(struct context *ctx, struct conn *c_conn, + struct conn *p_conn, struct msg *req, + struct rack *rack, uint8_t *key, + uint32_t keylen, dyn_error_t *dyn_error_code) { + struct node *server = p_conn->owner; + log_info("%s FORWARD %s to peer %s on rack '%.*s' dc '%.*s' ", + print_obj(c_conn), print_obj(req), print_obj(p_conn), + rack->name->len, rack->name->data, server->dc.len, server->dc.data); + + struct string *dc = rack->dc; + rstatus_t status; + + ASSERT(p_conn->type == CONN_DNODE_PEER_SERVER); + ASSERT((c_conn->type == CONN_CLIENT) || + (c_conn->type == CONN_DNODE_PEER_CLIENT)); + + /* enqueue the message (request) into peer inq */ + status = conn_event_add_out(p_conn); + if (status != DN_OK) { + *dyn_error_code = DYNOMITE_UNKNOWN_ERROR; + p_conn->err = errno; + return DN_ERROR; + } + + struct mbuf *header_buf = mbuf_get(); + if (header_buf == NULL) { + loga("Unable to obtain an mbuf 
for dnode msg's header!"); + *dyn_error_code = DYNOMITE_OK; + return DN_ENOMEM; + } + + struct server_pool *pool = c_conn->owner; + dmsg_type_t msg_type = + (string_compare(&pool->dc, dc) != 0) ? DMSG_REQ_FORWARD : DMSG_REQ; + + // SMB: THere is some non trivial business happening here. Better refer to the + // comment in dnode_rsp_send_next to understand the stuff here. + // Note: THere MIGHT BE A NEED TO PORT THE dnode_header_prepended FIX FROM + // THERE TO HERE. especially when a message is being sent in parts + if (p_conn->dnode_secured) { + // Encrypting and adding header for a request + if (log_loggable(LOG_VVERB)) { + SCOPED_CHARPTR(encoded_aes_key) = + base64_encode(p_conn->aes_key, AES_KEYLEN); + if (encoded_aes_key) + log_debug(LOG_VVERB, "AES encryption key: %s\n", encoded_aes_key); } - struct server_pool *pool = c_conn->owner; - dmsg_type_t msg_type = (string_compare(&pool->dc, dc) != 0)? DMSG_REQ_FORWARD : DMSG_REQ; - - // SMB: THere is some non trivial business happening here. Better refer to the - // comment in dnode_rsp_send_next to understand the stuff here. - // Note: THere MIGHT BE A NEED TO PORT THE dnode_header_prepended FIX FROM THERE - // TO HERE. 
especially when a message is being sent in parts - if (p_conn->dnode_secured) { - //Encrypting and adding header for a request - if (log_loggable(LOG_VVERB)) { - SCOPED_CHARPTR(encoded_aes_key) = base64_encode(p_conn->aes_key, AES_KEYLEN); - if (encoded_aes_key) - log_debug(LOG_VVERB, "AES encryption key: %s\n", encoded_aes_key); + // write dnode header + if (ENCRYPTION) { + size_t encrypted_bytes; + status = dyn_aes_encrypt_msg(req, p_conn->aes_key, &encrypted_bytes); + if (status != DN_OK) { + if (status == DN_ENOMEM) { + loga("OOM to obtain an mbuf for encryption!"); + } else if (status == DN_ERROR) { + loga("Encryption failed: Empty message"); } + *dyn_error_code = status; + mbuf_put(header_buf); + return status; + } - //write dnode header - if (ENCRYPTION) { - size_t encrypted_bytes; - status = dyn_aes_encrypt_msg(req, p_conn->aes_key, &encrypted_bytes); - if (status != DN_OK) { - if (status == DN_ENOMEM) { - loga("OOM to obtain an mbuf for encryption!"); - } else if (status == DN_ERROR) { - loga("Encryption failed: Empty message"); - } - *dyn_error_code = status; - mbuf_put(header_buf); - return status; - } - - log_debug(LOG_VVERB, "#encrypted bytes : %d", encrypted_bytes); - - dmsg_write(header_buf, req->id, msg_type, p_conn, msg_length(req)); - } else { - log_debug(LOG_VVERB, "no encryption on the msg payload"); - dmsg_write(header_buf, req->id, msg_type, p_conn, msg_length(req)); - } + log_debug(LOG_VVERB, "#encrypted bytes : %d", encrypted_bytes); + dmsg_write(header_buf, req->id, msg_type, p_conn, msg_length(req)); } else { - //write dnode header - dmsg_write(header_buf, req->id, msg_type, p_conn, msg_length(req)); + log_debug(LOG_VVERB, "no encryption on the msg payload"); + dmsg_write(header_buf, req->id, msg_type, p_conn, msg_length(req)); } - mbuf_insert_head(&req->mhdr, header_buf); + } else { + // write dnode header + dmsg_write(header_buf, req->id, msg_type, p_conn, msg_length(req)); + } - if (log_loggable(LOG_VVERB)) { - log_hexdump(LOG_VVERB, 
header_buf->pos, mbuf_length(header_buf), "dyn message header: "); - msg_dump(LOG_VVERB, req); - } + mbuf_insert_head(&req->mhdr, header_buf); - conn_enqueue_inq(ctx, p_conn, req); + if (log_loggable(LOG_VVERB)) { + log_hexdump(LOG_VVERB, header_buf->pos, mbuf_length(header_buf), + "dyn message header: "); + msg_dump(LOG_VVERB, req); + } - dnode_peer_req_forward_stats(ctx, p_conn->owner, req); + conn_enqueue_inq(ctx, p_conn, req); - log_debug(LOG_VVERB, "remote forward from c %d to s %d req %"PRIu64" len %"PRIu32 - " type %d with key '%.*s'", c_conn->sd, p_conn->sd, req->id, - req->mlen, req->type, keylen, key); - return DN_OK; -} + dnode_peer_req_forward_stats(ctx, p_conn->owner, req); + log_debug(LOG_VVERB, + "remote forward from c %d to s %d req %" PRIu64 " len %" PRIu32 + " type %d with key '%.*s'", + c_conn->sd, p_conn->sd, req->id, req->mlen, req->type, keylen, key); + return DN_OK; +} /* //for sending a string over to a peer void -peer_gossip_forward1(struct context *ctx, struct conn *conn, bool redis, struct string *data) +peer_gossip_forward1(struct context *ctx, struct conn *conn, bool redis, struct +string *data) { rstatus_t status; struct msg *msg = msg_get(conn, 1, redis); @@ -183,8 +181,8 @@ peer_gossip_forward1(struct context *ctx, struct conn *conn, bool redis, struct conn->enqueue_inq(ctx, conn, msg); - log_debug(LOG_VERB, "gossip to peer %d with msg_id %"PRIu64" '%.*s'", conn->sd, msg->id, - data->len, data->data); + log_debug(LOG_VERB, "gossip to peer %d with msg_id %"PRIu64" '%.*s'", +conn->sd, msg->id, data->len, data->data); } */ @@ -192,95 +190,108 @@ peer_gossip_forward1(struct context *ctx, struct conn *conn, bool redis, struct /* * Sending a mbuf of gossip data over the wire to a peer */ -void -dnode_peer_gossip_forward(struct context *ctx, struct conn *conn, struct mbuf *data_buf) -{ - rstatus_t status; - struct msg *msg = msg_get(conn, 1, __FUNCTION__); - - if (msg == NULL) { - log_debug(LOG_DEBUG, "Unable to obtain a msg"); - return; 
+void dnode_peer_gossip_forward(struct context *ctx, struct conn *conn, + struct mbuf *data_buf) { + rstatus_t status; + struct msg *msg = msg_get(conn, 1, __FUNCTION__); + + if (msg == NULL) { + log_debug(LOG_DEBUG, "Unable to obtain a msg"); + mbuf_put(data_buf); + return; + } + + struct mbuf *header_buf = mbuf_get(); + if (header_buf == NULL) { + log_debug(LOG_DEBUG, "Unable to obtain a header_buf"); + rsp_put(msg); + mbuf_put(data_buf); + return; + } + + uint64_t msg_id = peer_msg_id++; + + if (conn->dnode_secured) { + if (log_loggable(LOG_VERB)) { + log_debug(LOG_VERB, "Assemble a secured msg to send"); + SCOPED_CHARPTR(encoded_aes_key) = + base64_encode(conn->aes_key, AES_KEYLEN); + if (encoded_aes_key) + log_debug(LOG_VERB, "AES encryption key: %s\n", encoded_aes_key); } - struct mbuf *header_buf = mbuf_get(); - if (header_buf == NULL) { - log_debug(LOG_DEBUG, "Unable to obtain a data_buf"); + if (ENCRYPTION) { + struct mbuf *encrypted_buf = mbuf_get(); + if (encrypted_buf == NULL) { + loga("Unable to obtain an encrypted_buf for encryption!"); rsp_put(msg); - return; - } - - uint64_t msg_id = peer_msg_id++; - - if (conn->dnode_secured) { - if (log_loggable(LOG_VERB)) { - log_debug(LOG_VERB, "Assemble a secured msg to send"); - SCOPED_CHARPTR(encoded_aes_key) = base64_encode(conn->aes_key, AES_KEYLEN); - if (encoded_aes_key) - log_debug(LOG_VERB, "AES encryption key: %s\n", encoded_aes_key); - } - - if (ENCRYPTION) { - struct mbuf *encrypted_buf = mbuf_get(); - if (encrypted_buf == NULL) { - loga("Unable to obtain an data_buf for encryption!"); - return; //TODOs: need to clean up - } - - status = dyn_aes_encrypt(data_buf->pos, mbuf_length(data_buf), encrypted_buf, conn->aes_key); - if (log_loggable(LOG_VERB)) { - log_debug(LOG_VERB, "#encrypted bytes : %d", status); - } - - //write dnode header - dmsg_write(header_buf, msg_id, GOSSIP_SYN, conn, mbuf_length(encrypted_buf)); - - if (log_loggable(LOG_VVERB)) { - log_hexdump(LOG_VVERB, data_buf->pos, 
mbuf_length(data_buf), "dyn message original payload: "); - log_hexdump(LOG_VVERB, encrypted_buf->pos, mbuf_length(encrypted_buf), "dyn message encrypted payload: "); - } - - mbuf_remove(&msg->mhdr, data_buf); - mbuf_insert(&msg->mhdr, encrypted_buf); - //free data_buf as no one will need it again - mbuf_put(data_buf); //TODOS: need to remove this from the msg->mhdr as in the other method - - } else { - if (log_loggable(LOG_VVERB)) { - log_debug(LOG_VVERB, "No encryption"); - } - dmsg_write_mbuf(header_buf, msg_id, GOSSIP_SYN, conn, mbuf_length(data_buf)); - mbuf_insert(&msg->mhdr, data_buf); - } + mbuf_put(data_buf); + mbuf_put(header_buf); + return; + } + + status = dyn_aes_encrypt(data_buf->pos, mbuf_length(data_buf), + encrypted_buf, conn->aes_key); + if (log_loggable(LOG_VERB)) { + log_debug(LOG_VERB, "#encrypted bytes : %d", status); + } + + // write dnode header + dmsg_write(header_buf, msg_id, GOSSIP_SYN, conn, + mbuf_length(encrypted_buf)); + + if (log_loggable(LOG_VVERB)) { + log_hexdump(LOG_VVERB, data_buf->pos, mbuf_length(data_buf), + "dyn message original payload: "); + log_hexdump(LOG_VVERB, encrypted_buf->pos, mbuf_length(encrypted_buf), + "dyn message encrypted payload: "); + } + + //mbuf_remove(&msg->mhdr, data_buf); + mbuf_insert(&msg->mhdr, encrypted_buf); + // free data_buf as no one will need it again + mbuf_put(data_buf); // TODOS: need to remove this from the msg->mhdr as + // in the other method } else { - if (log_loggable(LOG_VVERB)) { - log_debug(LOG_VVERB, "Assemble a non-secured msg to send"); - } - dmsg_write_mbuf(header_buf, msg_id, GOSSIP_SYN, conn, mbuf_length(data_buf)); - mbuf_insert(&msg->mhdr, data_buf); + if (log_loggable(LOG_VVERB)) { + log_debug(LOG_VVERB, "No encryption"); + } + dmsg_write_mbuf(header_buf, msg_id, GOSSIP_SYN, conn, + mbuf_length(data_buf)); + mbuf_insert(&msg->mhdr, data_buf); } - mbuf_insert_head(&msg->mhdr, header_buf); - + } else { if (log_loggable(LOG_VVERB)) { - log_hexdump(LOG_VVERB, header_buf->pos, 
mbuf_length(header_buf), "dyn gossip message header: "); - msg_dump(LOG_VVERB, msg); + log_debug(LOG_VVERB, "Assemble a non-secured msg to send"); } - - /* enqueue the message (request) into peer inq */ - if (TAILQ_EMPTY(&conn->imsg_q)) { - status = conn_event_add_out(conn); - if (status != DN_OK) { - dnode_req_forward_error(ctx, conn, msg); - conn->err = errno; - return; - } + dmsg_write_mbuf(header_buf, msg_id, GOSSIP_SYN, conn, + mbuf_length(data_buf)); + mbuf_insert(&msg->mhdr, data_buf); + } + + mbuf_insert_head(&msg->mhdr, header_buf); + + if (log_loggable(LOG_VVERB)) { + log_hexdump(LOG_VVERB, header_buf->pos, mbuf_length(header_buf), + "dyn gossip message header: "); + msg_dump(LOG_VVERB, msg); + } + + /* enqueue the message (request) into peer inq */ + if (TAILQ_EMPTY(&conn->imsg_q)) { + status = conn_event_add_out(conn); + if (status != DN_OK) { + dnode_req_forward_error(ctx, conn, msg); + conn->err = errno; + return; } + } - //need to handle a reply - //conn->enqueue_outq(ctx, conn, msg); + // need to handle a reply + // conn->enqueue_outq(ctx, conn, msg); - msg->expect_datastore_reply = 0; - conn_enqueue_inq(ctx, conn, msg); + msg->expect_datastore_reply = 0; + conn_enqueue_inq(ctx, conn, msg); } diff --git a/src/dyn_gossip.c b/src/dyn_gossip.c index 58bcdb2a0..1e324c89d 100644 --- a/src/dyn_gossip.c +++ b/src/dyn_gossip.c @@ -1,21 +1,18 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. 
*/ - -#include +#include #include #include -#include -#include #include "dyn_core.h" #include "dyn_dict.h" #include "dyn_dnode_peer.h" #include "dyn_gossip.h" -#include "dyn_node_snitch.h" #include "dyn_mbuf.h" +#include "dyn_node_snitch.h" #include "dyn_ring_queue.h" #include "dyn_server.h" #include "dyn_string.h" @@ -27,764 +24,731 @@ static const struct string PEER_PORT = string("8101"); static const struct string PEER_SSL_PORT = string("8103"); - static void gossip_debug(void); struct gossip_node_pool gn_pool; static uint32_t node_count = 0; static struct gossip_node *current_node = NULL; -static struct mbuf * seeds_buf = NULL; - -static unsigned int -dict_node_hash(const void *key) -{ - const struct gossip_node *node = key; - if (node == NULL) - return 0; - return dictGenHashFunction((unsigned char*)node->dc.data, node->dc.len) + - dictGenHashFunction((unsigned char*)node->rack.data, node->rack.len) + - node->token.mag[0]; +static struct mbuf *seeds_buf = NULL; + +static unsigned int dict_node_hash(const void *key) { + const struct gossip_node *node = key; + if (node == NULL) return 0; + return dictGenHashFunction((unsigned char *)node->dc.data, node->dc.len) + + dictGenHashFunction((unsigned char *)node->rack.data, node->rack.len) + + node->token.mag[0]; } -static int -dict_node_key_compare(void *privdata, const void *key1, const void *key2) -{ - DICT_NOTUSED(privdata); - const struct gossip_node *node1 = key1; - const struct gossip_node *node2 = key2; - - ASSERT(node1 == NULL || node2 == NULL); +static int dict_node_key_compare(void *privdata, const void *key1, + const void *key2) { + DICT_NOTUSED(privdata); + const struct gossip_node *node1 = key1; + const struct gossip_node *node2 = key2; - return (string_compare(&node1->dc, &node2->dc) == 0) && - (string_compare(&node1->rack, &node2->rack) == 0) && - (cmp_dyn_token(&node1->token, &node2->token) == 0); + ASSERT(node1 == NULL || node2 == NULL); + return (string_compare(&node1->dc, &node2->dc) == 0) && + 
(string_compare(&node1->rack, &node2->rack) == 0) && + (cmp_dyn_token(&node1->token, &node2->token) == 0); } -int -dict_string_key_compare(void *privdata, const void *key1, const void *key2) -{ - DICT_NOTUSED(privdata); - const struct string *s1 = key1; - const struct string *s2 = key2; +int dict_string_key_compare(void *privdata, const void *key1, + const void *key2) { + DICT_NOTUSED(privdata); + const struct string *s1 = key1; + const struct string *s2 = key2; - //return (s1->len != s2->len)? 0 : strncmp(s1->data, s2->data, s1->len) == 0; - return string_compare(s1, s2) == 0; + // return (s1->len != s2->len)? 0 : strncmp(s1->data, s2->data, s1->len) == 0; + return string_compare(s1, s2) == 0; } -static void -dict_node_destructor(void *privdata, void *val) -{ - DICT_NOTUSED(privdata); +static void dict_node_destructor(void *privdata, void *val) { + DICT_NOTUSED(privdata); - struct gossip_node *node = val; - node_deinit(node); - dn_free(node); + struct gossip_node *node = val; + node_deinit(node); + dn_free(node); } -void -dict_string_destructor(void *privdata, void *val) -{ - DICT_NOTUSED(privdata); +void dict_string_destructor(void *privdata, void *val) { + DICT_NOTUSED(privdata); - struct string *s = val; - string_deinit(s); - dn_free(s); + struct string *s = val; + string_deinit(s); + dn_free(s); } -unsigned int -dict_string_hash(const void *key) -{ - const struct string *s = key; - //return dictGenHashFunction((unsigned char*)key, dn_strlen((char*)key)); - if (s == NULL) - return 0; - return dictGenHashFunction(s->data, s->len); +unsigned int dict_string_hash(const void *key) { + const struct string *s = key; + // return dictGenHashFunction((unsigned char*)key, dn_strlen((char*)key)); + if (s == NULL) return 0; + return dictGenHashFunction(s->data, s->len); } - - - dictType token_table_dict_type = { - dict_string_hash, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - dict_string_key_compare, /* key compare */ - dict_string_destructor, /* key 
destructor */ - NULL /* val destructor */ + dict_string_hash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dict_string_key_compare, /* key compare */ + dict_string_destructor, /* key destructor */ + NULL /* val destructor */ }; - dictType string_table_dict_type = { - dict_string_hash, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - dict_string_key_compare, /* key compare */ - dict_string_destructor, /* key destructor */ - NULL /* val destructor */ + dict_string_hash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dict_string_key_compare, /* key compare */ + dict_string_destructor, /* key destructor */ + NULL /* val destructor */ }; - dictType gossip_table_dict_type = { - dict_node_hash, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - dict_node_key_compare, /* key compare */ - dict_node_destructor, /* key destructor */ - NULL /* val destructor */ + dict_node_hash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dict_node_key_compare, /* key compare */ + dict_node_destructor, /* key destructor */ + NULL /* val destructor */ }; +static rstatus_t gossip_process_msgs(void) { + // TODOs: fix this to process an array of nodes + while (!CBUF_IsEmpty(C2G_InQ)) { + struct ring_msg *msg = (struct ring_msg *)CBUF_Pop(C2G_InQ); + msg->cb(msg); + ring_msg_deinit(msg); + } - -static rstatus_t -gossip_process_msgs(void) -{ - //TODOs: fix this to process an array of nodes - while (!CBUF_IsEmpty(C2G_InQ)) { - struct ring_msg *msg = (struct ring_msg *) CBUF_Pop(C2G_InQ); - msg->cb(msg); - ring_msg_deinit(msg); - } - - return DN_OK; + return DN_OK; } +static rstatus_t gossip_msg_to_core(struct server_pool *sp, + struct gossip_node *node, void *cb) { + struct ring_msg *msg = create_ring_msg(); + struct gossip_node *rnode = (struct gossip_node *)array_get(&msg->nodes, 0); + node_copy(node, rnode); -static rstatus_t -gossip_msg_to_core(struct server_pool *sp, struct gossip_node *node, void *cb) 
-{ - struct ring_msg *msg = create_ring_msg(); - struct gossip_node *rnode = (struct gossip_node *) array_get(&msg->nodes, 0); - node_copy(node, rnode); - - msg->cb = cb; - msg->sp = sp; - CBUF_Push(C2G_OutQ, msg); + msg->cb = cb; + msg->sp = sp; + CBUF_Push(C2G_OutQ, msg); - return DN_OK; + return DN_OK; } -static rstatus_t -gossip_ring_msg_to_core(struct server_pool *sp, struct ring_msg *msg, void *cb) -{ - msg->cb = cb; - msg->sp = sp; - CBUF_Push(C2G_OutQ, msg); +static rstatus_t gossip_ring_msg_to_core(struct server_pool *sp, + struct ring_msg *msg, void *cb) { + msg->cb = cb; + msg->sp = sp; + CBUF_Push(C2G_OutQ, msg); - return DN_OK; + return DN_OK; } - -static void -write_char(uint8_t *pos, unsigned char ch) -{ - *pos = ch; - pos += 1; +static void write_char(uint8_t *pos, unsigned char ch) { + *pos = ch; + pos += 1; } -static int -write_number(uint8_t *pos, uint64_t num, int *count) -{ - if (num < 10) { - write_char(pos, (unsigned char)('0' + num)); - *count = 1; - return 1; - } - - write_number(pos, num / 10, count); - write_char(pos + (*count), (unsigned char)('0' + (num % 10))); - *count = *count + 1; - return *count; +static int write_number(uint8_t *pos, uint64_t num, int *count) { + if (num < 10) { + write_char(pos, (unsigned char)('0' + num)); + *count = 1; + return 1; + } + + write_number(pos, num / 10, count); + write_char(pos + (*count), (unsigned char)('0' + (num % 10))); + *count = *count + 1; + return *count; } +static void string_write_uint32(struct string *str, uint32_t num, + uint32_t pos) { + if (num < 10) { + *(str->data + pos) = '0' + num; + return; + } -static void -string_write_uint32(struct string *str, uint32_t num, uint32_t pos) -{ - if (num < 10) { - *(str->data + pos) = '0' + num; - return; - } - - *(str->data + pos) = '0' + (num % 10); - string_write_uint32(str, num / 10, pos - 1); + *(str->data + pos) = '0' + (num % 10); + string_write_uint32(str, num / 10, pos - 1); } static uint32_t num_len(uint32_t num) { - if (num < 10) { - 
return 1; - } + if (num < 10) { + return 1; + } - return 1 + num_len(num / 10); + return 1 + num_len(num / 10); } static struct string *token_to_string(struct dyn_token *token) { - uint32_t num = token->mag[0]; - uint32_t len = num_len(num); - struct string *result = dn_alloc(sizeof(*result)); - string_init(result); + uint32_t num = token->mag[0]; + uint32_t len = num_len(num); + struct string *result = dn_alloc(sizeof(*result)); + string_init(result); - result->data = dn_alloc(sizeof(uint8_t) * len); - result->len = len; + result->data = dn_alloc(sizeof(uint8_t) * len); + result->len = len; - string_write_uint32(result, num, result->len - 1); - return result; + string_write_uint32(result, num, result->len - 1); + return result; } +// Simple failure detector - will have an advanced version later +static uint8_t gossip_failure_detector(struct gossip_node *node) { + log_debug(LOG_VERB, "In gossip_failure_detector"); -//Simple failure detector - will have an advanced version later -static uint8_t -gossip_failure_detector(struct gossip_node *node) -{ - log_debug(LOG_VERB, "In gossip_failure_detector"); - - if (node == NULL) - return UNKNOWN; + if (node == NULL) return UNKNOWN; - if (node->is_local) - return NORMAL; + if (node->is_local) return NORMAL; - uint64_t cur_ts = (uint64_t) time(NULL); - // Not sure why divide by 1000 - sec_t delta = gn_pool.g_interval/1000 * 40; //g_internal is in milliseconds + uint64_t cur_ts = (uint64_t)time(NULL); + // Not sure why divide by 1000 + sec_t delta = gn_pool.g_interval / 1000 * 40; // g_internal is in + // milliseconds - //loga("cur_ts %d", cur_ts); - //loga("delta %d", delta); - //loga("node->ts = %d", node->ts); - //loga("node state = %d", node->state); + // loga("cur_ts %d", cur_ts); + // loga("delta %d", delta); + // loga("node->ts = %d", node->ts); + // loga("node state = %d", node->state); - if (cur_ts - node->ts > delta) { //if there is no update for delta time - return DOWN; - } + if (cur_ts - node->ts > delta) { // if 
there is no update for delta time + return DOWN; + } - return node->state; + return node->state; } +static rstatus_t gossip_forward_state(struct server_pool *sp) { + // assume each record needs maximum 256 bytes + struct ring_msg *msg = create_ring_msg_with_data(256 * node_count); + uint8_t *data = + msg->data; // dn_zalloc(sizeof(uint8_t) * 256 * node_count);//msg->data; + uint8_t *pos = data; + int i = 0; + + dictIterator *dc_it; + dictEntry *dc_de; + dc_it = dictGetIterator(gn_pool.dict_dc); + while ((dc_de = dictNext(dc_it)) != NULL) { + struct gossip_dc *g_dc = dictGetVal(dc_de); + // log_debug(LOG_VERB, "\tDC name : '%.*s'", g_dc->name.len, + // g_dc->name.data); + dictIterator *rack_it = dictGetIterator(g_dc->dict_rack); + dictEntry *rack_de; + while ((rack_de = dictNext(rack_it)) != NULL) { + struct gossip_rack *g_rack = dictGetVal(rack_de); + // log_debug(LOG_VERB, "\tRack name : '%.*s'", g_rack->name.len, + // g_rack->name.data); + + dictIterator *node_it = dictGetIterator(g_rack->dict_token_nodes); + dictEntry *node_de; + while ((node_de = dictNext(node_it)) != NULL) { + struct gossip_node *gnode = dictGetVal(node_de); + // log_debug(LOG_VERB, "\tNode name : '%.*s'", + // gnode->name.len, gnode->name.data); + + if (i++ > 0) { + // pipe separator + *pos = '|'; + pos += 1; + } -static rstatus_t -gossip_forward_state(struct server_pool *sp) -{ - //assume each record needs maximum 256 bytes - struct ring_msg *msg = create_ring_msg_with_data(256 * node_count); - uint8_t *data = msg->data; //dn_zalloc(sizeof(uint8_t) * 256 * node_count);//msg->data; - uint8_t *pos = data; - int i = 0; - - dictIterator *dc_it; - dictEntry *dc_de; - dc_it = dictGetIterator(gn_pool.dict_dc); - while ((dc_de = dictNext(dc_it)) != NULL) { - struct gossip_dc *g_dc = dictGetVal(dc_de); - //log_debug(LOG_VERB, "\tDC name : '%.*s'", g_dc->name.len, g_dc->name.data); - dictIterator *rack_it = dictGetIterator(g_dc->dict_rack); - dictEntry *rack_de; - while ((rack_de = dictNext(rack_it)) 
!= NULL) { - struct gossip_rack *g_rack = dictGetVal(rack_de); - //log_debug(LOG_VERB, "\tRack name : '%.*s'", g_rack->name.len, g_rack->name.data); - - dictIterator *node_it = dictGetIterator(g_rack->dict_token_nodes); - dictEntry *node_de; - while ((node_de = dictNext(node_it)) != NULL) { - struct gossip_node *gnode = dictGetVal(node_de); - //log_debug(LOG_VERB, "\tNode name : '%.*s'", gnode->name.len, gnode->name.data); - - if (i++ > 0) { - //pipe separator - *pos = '|'; - pos += 1; - } - - //write dc name - dn_memcpy(pos, g_dc->name.data, g_dc->name.len); - pos += g_dc->name.len; - - //$ separator - *pos = '$'; - pos += 1; - - //write rack name - dn_memcpy(pos, g_rack->name.data, g_rack->name.len); - pos += g_rack->name.len; - - //$ separator - *pos = '$'; - pos += 1; - - //write node token - struct string *token_str = dictGetKey(node_de); - //log_debug(LOG_VERB, "\tToken string : '%.*s'", token_str->len, token_str->data); - uint32_t k; - for(k=0; klen;k++, pos++) { - *pos = *(token_str->data + k); - } - - //comma separator - *pos = ','; - pos += 1; - - //write ts - int count = 0; - uint64_t ts; - if (gnode->is_local) //only update my own timestamp - ts = (uint64_t) time(NULL); - else ts = gnode->ts; - - count = 0; - write_number(pos, ts, &count); - pos += count; - - //comma separator - *pos = ','; - pos += 1; - - //write state - uint8_t new_state = gossip_failure_detector(gnode); - gnode->state = new_state; - count = 0; - write_number(pos, gnode->state, &count); - pos += count; - - //comma separator - *pos = ','; - pos += 1; - - //write address - for(k=0; kname.len; k++, pos++) { - *pos = *(gnode->name.data + k); - } + // write dc name + dn_memcpy(pos, g_dc->name.data, g_dc->name.len); + pos += g_dc->name.len; + + //$ separator + *pos = '$'; + pos += 1; + + // write rack name + dn_memcpy(pos, g_rack->name.data, g_rack->name.len); + pos += g_rack->name.len; + + //$ separator + *pos = '$'; + pos += 1; + + // write node token + struct string *token_str = 
dictGetKey(node_de); + // log_debug(LOG_VERB, "\tToken string : '%.*s'", + // token_str->len, token_str->data); + uint32_t k; + for (k = 0; k < token_str->len; k++, pos++) { + *pos = *(token_str->data + k); + } - } - dictReleaseIterator(node_it); + // comma separator + *pos = ','; + pos += 1; + + // write ts + int count = 0; + uint64_t ts; + if (gnode->is_local) // only update my own timestamp + ts = (uint64_t)time(NULL); + else + ts = gnode->ts; + + count = 0; + write_number(pos, ts, &count); + pos += count; + + // comma separator + *pos = ','; + pos += 1; + + // write state + uint8_t new_state = gossip_failure_detector(gnode); + gnode->state = new_state; + count = 0; + write_number(pos, gnode->state, &count); + pos += count; + + // comma separator + *pos = ','; + pos += 1; + + // write address + for (k = 0; k < gnode->name.len; k++, pos++) { + *pos = *(gnode->name.data + k); } - dictReleaseIterator(rack_it); + } + dictReleaseIterator(node_it); } + dictReleaseIterator(rack_it); + } - msg->len = pos-data; + msg->len = pos - data; - log_debug(LOG_VERB, "\tForwarding my current gossip states : '%.*s'", (pos-data), data); + log_debug(LOG_VERB, + "\tForwarding my current gossip states : '%.*s'", + (pos - data), data); - dictReleaseIterator(dc_it); + dictReleaseIterator(dc_it); - return gossip_ring_msg_to_core(sp, msg, dnode_peer_forward_state); + return gossip_ring_msg_to_core(sp, msg, dnode_peer_forward_state); } - -static rstatus_t -gossip_announce_joining(struct server_pool *sp) -{ - return gossip_msg_to_core(sp, NULL, dnode_peer_handshake_announcing); +static rstatus_t gossip_announce_joining(struct server_pool *sp) { + return gossip_msg_to_core(sp, NULL, dnode_peer_handshake_announcing); } -static rstatus_t -parse_seeds(struct string *seeds, struct string *dc_name, struct string *rack_name, - struct string *port_str, struct string *address, struct string *name, - struct dyn_token *ptoken) -{ - rstatus_t status; - uint8_t *p, *q, *start; - uint8_t *pname, *port, 
*rack, *dc, *token, *addr; - uint32_t k, delimlen, pnamelen, portlen, racklen, dclen, tokenlen, addrlen; - char delim[] = "::::"; - - /* parse "hostname:port:rack:dc:tokens" */ - p = seeds->data + seeds->len - 1; - start = seeds->data; - rack = NULL; - dc = NULL; - racklen = 0; - dclen = 0; - token = NULL; - tokenlen = 0; - port = NULL; - portlen = 0; - delimlen = 4; - - for (k = 0; k < sizeof(delim)-1; k++) { - q = dn_strrchr(p, start, delim[k]); - - if (q == NULL) { - break; - } +static rstatus_t parse_seeds(struct string *seeds, struct string *dc_name, + struct string *rack_name, struct string *port_str, + struct string *address, struct string *name, + struct dyn_token *ptoken) { + rstatus_t status; + uint8_t *p, *q, *start; + uint8_t *pname, *port, *rack, *dc, *token, *addr; + uint32_t k, delimlen, pnamelen, portlen, racklen, dclen, tokenlen, addrlen; + char delim[] = "::::"; + + /* parse "hostname:port:rack:dc:tokens" */ + p = seeds->data + seeds->len - 1; + start = seeds->data; + rack = NULL; + dc = NULL; + racklen = 0; + dclen = 0; + token = NULL; + tokenlen = 0; + port = NULL; + portlen = 0; + delimlen = 4; + + for (k = 0; k < sizeof(delim) - 1; k++) { + q = dn_strrchr(p, start, delim[k]); - switch (k) { - case 0: - token = q + 1; - tokenlen = (uint32_t)(p - token + 1); - if (tokenlen == 0) { - return GOS_ERROR; - } - break; - case 1: - dc = q + 1; - dclen = (uint32_t)(p - dc + 1); - if (dclen == 0) { - return GOS_ERROR; - } - string_copy(dc_name, dc, dclen); - break; - case 2: - rack = q + 1; - racklen = (uint32_t)(p - rack + 1); - if (racklen == 0) { - return GOS_ERROR; - } - string_copy(rack_name, rack, racklen); - break; - - case 3: - port = q + 1; - portlen = (uint32_t)(p - port + 1); - if (portlen == 0) { - return GOS_ERROR; - } - string_copy(port_str, port, portlen); - break; - - default: - NOT_REACHED(); - } - - p = q - 1; - } - - if (k != delimlen) { - return GOS_ERROR; - } - - //pname = hostname:port:rack:dc:token - pname = seeds->data; - 
log_debug(LOG_VERB, "pname %s", pname); - pnamelen = seeds->len - (tokenlen + racklen + dclen + 3); - if (pnamelen == 0) { - return GOS_ERROR; + if (q == NULL) { + break; } - // address = hostname:port - status = string_copy(address, pname, pnamelen); - - //addr = hostname or ip only - addr = start; - addrlen = (uint32_t)(p - start + 1); - if (addrlen == 0) { - return GOS_ERROR; - } - //if it is a dns name, convert to IP or otherwise keep that IP - if (!isdigit( (char) addr[0])) { - addr[addrlen] = '\0'; - unsigned char *local_ip4 = (unsigned char *)hostname_to_private_ip4( (char *) addr); - if (local_ip4 != NULL) { - status = string_copy_c(name, local_ip4); - } else - status = string_copy(name, addr, addrlen); - } else { - status = string_copy(name, addr, addrlen); - } - log_debug(LOG_VERB, "name: %.*s", name->len, name->data); - if (status != DN_OK) { - return GOS_ERROR; - } + switch (k) { + case 0: + token = q + 1; + tokenlen = (uint32_t)(p - token + 1); + if (tokenlen == 0) { + return GOS_ERROR; + } + break; + case 1: + dc = q + 1; + dclen = (uint32_t)(p - dc + 1); + if (dclen == 0) { + return GOS_ERROR; + } + string_copy(dc_name, dc, dclen); + break; + case 2: + rack = q + 1; + racklen = (uint32_t)(p - rack + 1); + if (racklen == 0) { + return GOS_ERROR; + } + string_copy(rack_name, rack, racklen); + break; + + case 3: + port = q + 1; + portlen = (uint32_t)(p - port + 1); + if (portlen == 0) { + return GOS_ERROR; + } + string_copy(port_str, port, portlen); + break; - uint8_t *t_end = token + tokenlen; - status = derive_token(ptoken, token, t_end); - if (status != DN_OK) { - return GOS_ERROR; + default: + NOT_REACHED(); } - //status = dn_resolve(&address, field->port, &field->info); - //if (status != DN_OK) { - // string_deinit(&address); - // return CONF_ERROR; - //} - - return GOS_OK; + p = q - 1; + } + + if (k != delimlen) { + return GOS_ERROR; + } + + // pname = hostname:port:rack:dc:token + pname = seeds->data; + log_debug(LOG_VERB, "pname %s", pname); + 
pnamelen = seeds->len - (tokenlen + racklen + dclen + 3); + if (pnamelen == 0) { + return GOS_ERROR; + } + // address = hostname:port + status = string_copy(address, pname, pnamelen); + + // addr = hostname or ip only + addr = start; + addrlen = (uint32_t)(p - start + 1); + if (addrlen == 0) { + return GOS_ERROR; + } + // if it is a dns name, convert to IP or otherwise keep that IP + if (!isdigit((char)addr[0])) { + addr[addrlen] = '\0'; + unsigned char *local_ip4 = + (unsigned char *)hostname_to_private_ip4((char *)addr); + if (local_ip4 != NULL) { + status = string_copy_c(name, local_ip4); + } else + status = string_copy(name, addr, addrlen); + } else { + status = string_copy(name, addr, addrlen); + } + log_debug(LOG_VERB, "name: %.*s", name->len, name->data); + if (status != DN_OK) { + return GOS_ERROR; + } + + uint8_t *t_end = token + tokenlen; + status = derive_token(ptoken, token, t_end); + if (status != DN_OK) { + return GOS_ERROR; + } + + // status = dn_resolve(&address, field->port, &field->info); + // if (status != DN_OK) { + // string_deinit(&address); + // return CONF_ERROR; + //} + + return GOS_OK; } -static rstatus_t -gossip_dc_init(struct gossip_dc *g_dc, struct string *dc) -{ - rstatus_t status; - g_dc->dict_rack = dictCreate(&string_table_dict_type, NULL); - string_copy(&g_dc->name, dc->data, dc->len); - status = array_init(&g_dc->racks, 50, sizeof(struct gossip_rack)); - return status; +static rstatus_t gossip_dc_init(struct gossip_dc *g_dc, struct string *dc) { + rstatus_t status; + g_dc->dict_rack = dictCreate(&string_table_dict_type, NULL); + string_copy(&g_dc->name, dc->data, dc->len); + status = array_init(&g_dc->racks, 50, sizeof(struct gossip_rack)); + return status; } - -static rstatus_t -gossip_rack_init(struct gossip_rack *g_rack, struct string *dc, struct string *rack) -{ - rstatus_t status; - g_rack->dict_name_nodes = dictCreate(&string_table_dict_type, NULL); - g_rack->dict_token_nodes = dictCreate(&token_table_dict_type, NULL); - 
string_copy(&g_rack->name, rack->data, rack->len); - string_copy(&g_rack->dc, dc->data, dc->len); - g_rack->nnodes = 0; - g_rack->nlive_nodes = 0; - status = array_init(&g_rack->nodes, 200, sizeof(struct gossip_node)); - - return status; +static rstatus_t gossip_rack_init(struct gossip_rack *g_rack, struct string *dc, + struct string *rack) { + rstatus_t status; + g_rack->dict_name_nodes = dictCreate(&string_table_dict_type, NULL); + g_rack->dict_token_nodes = dictCreate(&token_table_dict_type, NULL); + string_copy(&g_rack->name, rack->data, rack->len); + string_copy(&g_rack->dc, dc->data, dc->len); + g_rack->nnodes = 0; + g_rack->nlive_nodes = 0; + status = array_init(&g_rack->nodes, 200, sizeof(struct gossip_node)); + + return status; } - -static struct gossip_node * -gossip_add_node_to_rack(struct server_pool *sp, struct string *dc, struct gossip_rack *g_rack, - struct string *address, struct string *ip, struct string *port, struct dyn_token *token) -{ - rstatus_t status; - log_debug(LOG_VERB, "gossip_add_node_to_rack : dc[%.*s] rack[%.*s] address[%.*s] ip[%.*s] port[%.*s]", - dc->len, dc->data, g_rack->name, address->len, address->data, ip->len, ip->data, port->len, port->data); - - - int port_i = dn_atoi(port->data, port->len); - if (port_i == 0) { - return NULL; //bad data - } - - struct gossip_node *gnode = (struct gossip_node *) array_push(&g_rack->nodes); - node_init(gnode); - status = string_copy(&gnode->dc, dc->data, dc->len); - status = string_copy(&gnode->rack, g_rack->name.data, g_rack->name.len); - status = string_copy(&gnode->name, ip->data, ip->len); - status = string_copy(&gnode->pname, address->data, address->len); //ignore the port for now - IGNORE_RET_VAL(status); - gnode->port = port_i; - - struct dyn_token * gtoken = &gnode->token; - copy_dyn_token(token, gtoken); - - g_rack->nnodes++; - - //add into dicts - dictAdd(g_rack->dict_name_nodes, &gnode->name, gnode); - dictAdd(g_rack->dict_token_nodes, token_to_string(token), gnode); - - return 
gnode; +static struct gossip_node *gossip_add_node_to_rack( + struct server_pool *sp, struct string *dc, struct gossip_rack *g_rack, + struct string *address, struct string *ip, struct string *port, + struct dyn_token *token) { + rstatus_t status; + log_debug(LOG_VERB, + "gossip_add_node_to_rack : dc[%.*s] rack[%.*s] address[%.*s] " + "ip[%.*s] port[%.*s]", + dc->len, dc->data, g_rack->name, address->len, address->data, + ip->len, ip->data, port->len, port->data); + + int port_i = dn_atoi(port->data, port->len); + if (port_i == 0) { + return NULL; // bad data + } + + struct gossip_node *gnode = (struct gossip_node *)array_push(&g_rack->nodes); + node_init(gnode); + status = string_copy(&gnode->dc, dc->data, dc->len); + status = string_copy(&gnode->rack, g_rack->name.data, g_rack->name.len); + status = string_copy(&gnode->name, ip->data, ip->len); + status = string_copy(&gnode->pname, address->data, + address->len); // ignore the port for now + IGNORE_RET_VAL(status); + gnode->port = port_i; + + struct dyn_token *gtoken = &gnode->token; + copy_dyn_token(token, gtoken); + + g_rack->nnodes++; + + // add into dicts + dictAdd(g_rack->dict_name_nodes, &gnode->name, gnode); + dictAdd(g_rack->dict_token_nodes, token_to_string(token), gnode); + + return gnode; } - -static rstatus_t -gossip_add_node(struct server_pool *sp, struct string *dc, struct gossip_rack *g_rack, - struct string *address, struct string *ip, struct string *port, struct dyn_token *token, uint8_t state) -{ - rstatus_t status; - log_debug(LOG_VERB, "gossip_add_node : dc[%.*s] rack[%.*s] address[%.*s] ip[%.*s] port[%.*s]", - dc->len, dc->data, g_rack->name.len, g_rack->name.data, address->len, address->data, ip->len, ip->data, port->len, port->data); - - struct gossip_node *gnode = gossip_add_node_to_rack(sp, dc, g_rack, address, ip, port, token); - if (gnode == NULL) { - return DN_ENOMEM; - } - - node_count++; - gnode->state = state; - - status = gossip_msg_to_core(sp, gnode, dnode_peer_add); - return 
status; +static rstatus_t gossip_add_node(struct server_pool *sp, struct string *dc, + struct gossip_rack *g_rack, + struct string *address, struct string *ip, + struct string *port, struct dyn_token *token, + uint8_t state) { + rstatus_t status; + log_debug( + LOG_VERB, + "gossip_add_node : dc[%.*s] rack[%.*s] address[%.*s] ip[%.*s] port[%.*s]", + dc->len, dc->data, g_rack->name.len, g_rack->name.data, address->len, + address->data, ip->len, ip->data, port->len, port->data); + + struct gossip_node *gnode = + gossip_add_node_to_rack(sp, dc, g_rack, address, ip, port, token); + if (gnode == NULL) { + return DN_ENOMEM; + } + + node_count++; + gnode->state = state; + + status = gossip_msg_to_core(sp, gnode, dnode_peer_add); + return status; } - -static rstatus_t -gossip_replace_node(struct server_pool *sp, struct gossip_node *node, - struct string *new_address, struct string *new_ip, uint8_t state) -{ - rstatus_t status; - log_debug(LOG_WARN, "gossip_replace_node : dc[%.*s] rack[%.*s] oldaddr[%.*s] newaddr[%.*s] newip[%.*s]", - node->dc, node->rack, node->name, new_address->len, new_address->data, new_ip->len, new_ip->data); - - string_deinit(&node->name); - string_deinit(&node->pname); - status = string_copy(&node->name, new_ip->data, new_ip->len); - status = string_copy(&node->pname, new_address->data, new_address->len); - //port is supposed to be the same - - node->state = state; - gossip_msg_to_core(sp, node, dnode_peer_replace); - - //should check for status - return status; +static rstatus_t gossip_replace_node(struct server_pool *sp, + struct gossip_node *node, + struct string *new_address, + struct string *new_ip, uint8_t state) { + rstatus_t status; + log_debug(LOG_WARN, + "gossip_replace_node : dc[%.*s] rack[%.*s] oldaddr[%.*s] " + "newaddr[%.*s] newip[%.*s]", + node->dc, node->rack, node->name, new_address->len, + new_address->data, new_ip->len, new_ip->data); + + string_deinit(&node->name); + string_deinit(&node->pname); + status = string_copy(&node->name, 
new_ip->data, new_ip->len); + status = string_copy(&node->pname, new_address->data, new_address->len); + // port is supposed to be the same + + node->state = state; + gossip_msg_to_core(sp, node, dnode_peer_replace); + + // should check for status + return status; } - - -static rstatus_t -gossip_update_state(struct server_pool *sp, struct gossip_node *node, uint8_t state, const uint64_t timestamp) -{ - rstatus_t status = DN_OK; - log_debug(LOG_VVERB, "gossip_update_state : dc[%.*s] rack[%.*s] name[%.*s] token[%d] state[%d]", +static rstatus_t gossip_update_state(struct server_pool *sp, + struct gossip_node *node, uint8_t state, + const uint64_t timestamp) { + rstatus_t status = DN_OK; + log_debug(LOG_VVERB, + "gossip_update_state : dc[%.*s] rack[%.*s] name[%.*s] token[%d] " + "state[%d]", node->dc, node->rack, node->name, node->token.mag[0], state); - if (node->ts < timestamp) { - node->state = state; - node->ts = timestamp; - } + if (node->ts < timestamp) { + node->state = state; + node->ts = timestamp; + } - //gossip_msg_to_core(sp, node, dnode_peer_update_state); + // gossip_msg_to_core(sp, node, dnode_peer_update_state); - return status; + return status; } - -static rstatus_t -gossip_add_node_if_absent(struct server_pool *sp, - struct string *dc, - struct string *rack, - struct string *address, - struct string *ip, - struct string *port, - struct dyn_token *token, - uint8_t state, - const uint64_t timestamp) -{ - log_debug(LOG_VERB, "gossip_add_node_if_absent : '%.*s'", address->len, address->data); - - struct gossip_dc * g_dc = dictFetchValue(gn_pool.dict_dc, dc); - if (g_dc == NULL) { - log_info("We don't have this datacenter? 
'%.*s' ", dc->len, dc->data); - g_dc = array_push(&gn_pool.datacenters); - gossip_dc_init(g_dc, dc); - dictAdd(gn_pool.dict_dc, &g_dc->name, g_dc); - } else { - log_debug(LOG_VERB, "We got a datacenter in dict for '%.*s' ", dc->len, dc->data); - } - - struct gossip_rack *g_rack = dictFetchValue(g_dc->dict_rack, rack); - if (g_rack == NULL) { - log_info("We don't have this rack? '%.*s' ", rack->len, rack->data); - g_rack = array_push(&g_dc->racks); - gossip_rack_init(g_rack, dc, rack); - dictAdd(g_dc->dict_rack, &g_rack->name, g_rack); - } else { - log_debug(LOG_VERB, "We got a rack for '%.*s' ", rack->len, rack->data); - } - - struct string *token_str = token_to_string(token); - struct gossip_node *g_node = dictFetchValue(g_rack->dict_token_nodes, token_str); - - if (g_node == NULL) { //never existed - log_debug(LOG_NOTICE, "Node not found! We need to add it"); - log_debug(LOG_NOTICE, "adding node : dc[%.*s]", dc->len, dc->data); - log_debug(LOG_NOTICE, "adding node : g_rack[%.*s]", g_rack->name.len, g_rack->name.data); - log_debug(LOG_NOTICE, "adding node : address[%.*s]", address->len, address->data); - log_debug(LOG_NOTICE, "adding node : ip[%.*s]", ip->len, ip->data); - log_debug(LOG_NOTICE, "adding node : port[%.*s]", port->len, port->data); - log_debug(LOG_NOTICE, "suggested state : %d", state); - //print_dyn_token(token, 6); - gossip_add_node(sp, dc, g_rack, address, ip, port, token, state); - } else if (dictFind(g_rack->dict_name_nodes, ip) != NULL) { - log_debug(LOG_VERB, "Node found"); - if (!g_node->is_local) { //don't update myself here - if (string_compare(&g_node->name, ip) != 0) { - log_debug(LOG_WARN, "Replacing an existing token with new info"); - gossip_replace_node(sp, g_node, address, ip, state); - } else { //update state - gossip_update_state(sp, g_node, state, timestamp); - } - } - } else { - log_debug(LOG_WARN, "Replacing an existing token with new IP or address"); +static rstatus_t gossip_add_node_if_absent( + struct server_pool *sp, struct 
string *dc, struct string *rack, + struct string *address, struct string *ip, struct string *port, + struct dyn_token *token, uint8_t state, const uint64_t timestamp) { + log_debug(LOG_VERB, "gossip_add_node_if_absent : '%.*s'", + address->len, address->data); + + struct gossip_dc *g_dc = dictFetchValue(gn_pool.dict_dc, dc); + if (g_dc == NULL) { + log_info("We don't have this datacenter? '%.*s' ", dc->len, dc->data); + g_dc = array_push(&gn_pool.datacenters); + gossip_dc_init(g_dc, dc); + dictAdd(gn_pool.dict_dc, &g_dc->name, g_dc); + } else { + log_debug(LOG_VERB, "We got a datacenter in dict for '%.*s' ", dc->len, + dc->data); + } + + struct gossip_rack *g_rack = dictFetchValue(g_dc->dict_rack, rack); + if (g_rack == NULL) { + log_info("We don't have this rack? '%.*s' ", rack->len, rack->data); + g_rack = array_push(&g_dc->racks); + gossip_rack_init(g_rack, dc, rack); + dictAdd(g_dc->dict_rack, &g_rack->name, g_rack); + } else { + log_debug(LOG_VERB, "We got a rack for '%.*s' ", rack->len, rack->data); + } + + struct string *token_str = token_to_string(token); + struct gossip_node *g_node = + dictFetchValue(g_rack->dict_token_nodes, token_str); + + if (g_node == NULL) { // never existed + log_debug(LOG_NOTICE, "Node not found! 
We need to add it"); + log_debug(LOG_NOTICE, "adding node : dc[%.*s]", dc->len, dc->data); + log_debug(LOG_NOTICE, "adding node : g_rack[%.*s]", g_rack->name.len, + g_rack->name.data); + log_debug(LOG_NOTICE, "adding node : address[%.*s]", address->len, + address->data); + log_debug(LOG_NOTICE, "adding node : ip[%.*s]", ip->len, ip->data); + log_debug(LOG_NOTICE, "adding node : port[%.*s]", port->len, port->data); + log_debug(LOG_NOTICE, "suggested state : %d", state); + // print_dyn_token(token, 6); + gossip_add_node(sp, dc, g_rack, address, ip, port, token, state); + } else if (dictFind(g_rack->dict_name_nodes, ip) != NULL) { + log_debug(LOG_VERB, "Node found"); + if (!g_node->is_local) { // don't update myself here + if (string_compare(&g_node->name, ip) != 0) { + log_debug(LOG_WARN, "Replacing an existing token with new info"); gossip_replace_node(sp, g_node, address, ip, state); - dictAdd(g_rack->dict_name_nodes, &g_node->name, g_node); + } else { // update state + gossip_update_state(sp, g_node, state, timestamp); + } } - - //free token_str - string_deinit(token_str); - dn_free(token_str); - return 0; + } else { + log_debug(LOG_WARN, "Replacing an existing token with new IP or address"); + gossip_replace_node(sp, g_node, address, ip, state); + dictAdd(g_rack->dict_name_nodes, &g_node->name, g_node); + } + + // free token_str + string_deinit(token_str); + dn_free(token_str); + return 0; } - -static rstatus_t -gossip_update_seeds(struct server_pool *sp, struct mbuf *seeds) -{ - struct string rack_name; - struct string dc_name; - struct string port_str; - struct string address; - struct string ip; - //struct array tokens; - struct dyn_token token; - - struct string temp; - - rstatus_t parse_status; - - string_init(&rack_name); - string_init(&dc_name); - string_init(&port_str); - string_init(&address); - string_init(&ip); +static rstatus_t gossip_update_seeds(struct server_pool *sp, + struct mbuf *seeds) { + struct string rack_name; + struct string dc_name; + 
struct string port_str; + struct string address; + struct string ip; + // struct array tokens; + struct dyn_token token; + + struct string temp; + + rstatus_t parse_status; + + string_init(&rack_name); + string_init(&dc_name); + string_init(&port_str); + string_init(&address); + string_init(&ip); + init_dyn_token(&token); + + uint8_t *p, *q, *start; + start = seeds->start; + p = seeds->last - 1; + q = dn_strrchr(p, start, '|'); + + uint8_t *seed_node; + uint32_t seed_node_len; + + while (q != NULL && q > start) { + seed_node = q + 1; + seed_node_len = (uint32_t)(p - seed_node + 1); + string_copy(&temp, seed_node, seed_node_len); + // array_init(&tokens, 1, sizeof(struct dyn_token)); init_dyn_token(&token); - - uint8_t *p, *q, *start; - start = seeds->start; - p = seeds->last - 1; - q = dn_strrchr(p, start, '|'); - - uint8_t *seed_node; - uint32_t seed_node_len; - - while (q != NULL && q > start) { - seed_node = q + 1; - seed_node_len = (uint32_t)(p - seed_node + 1); - string_copy(&temp, seed_node, seed_node_len); - //array_init(&tokens, 1, sizeof(struct dyn_token)); - init_dyn_token(&token); - parse_status = parse_seeds(&temp, &dc_name, &rack_name, &port_str, &address, &ip, &token); - log_debug(LOG_VERB, "address : '%.*s'", address.len, address.data); - log_debug(LOG_VERB, "rack_name : '%.*s'", rack_name.len, rack_name.data); - log_debug(LOG_VERB, "dc_name : '%.*s'", dc_name.len, dc_name.data); - log_debug(LOG_VERB, "ip : '%.*s'", ip.len, ip.data); - log_debug(LOG_VERB, "port : '%.*s'", port_str.len, port_str.data); - - //struct dyn_token *token = array_get(&tokens, 0); - if (parse_status == GOS_OK) { - gossip_add_node_if_absent(sp, &dc_name, &rack_name, &address, &ip, &port_str, &token, NORMAL, (uint64_t) time(NULL)); - } - - p = q - 1; - q = dn_strrchr(p, start, '|'); - string_deinit(&temp); - //array_deinit(&tokens); - deinit_dyn_token(&token); - string_deinit(&rack_name); - string_deinit(&dc_name); - string_deinit(&port_str); - string_deinit(&address); - 
string_deinit(&ip); - } - - if (q == NULL) { - seed_node_len = (uint32_t)(p - start + 1); - seed_node = start; - - string_copy(&temp, seed_node, seed_node_len); - //array_init(&tokens, 1, sizeof(struct dyn_token)); - init_dyn_token(&token); - parse_status = parse_seeds(&temp, &dc_name, &rack_name, &port_str, &address, &ip, &token); - log_debug(LOG_VERB, "address : '%.*s'", address.len, address.data); - log_debug(LOG_VERB, "rack_name : '%.*s'", rack_name.len, rack_name.data); - log_debug(LOG_VERB, "dc_name : '%.*s'", dc_name.len, dc_name.data); - log_debug(LOG_VERB, "ip : '%.*s'", ip.len, ip.data); - log_debug(LOG_VERB, "port : '%.*s'", port_str.len, port_str.data); - - //struct dyn_token *token = array_get(&tokens, 0); - if (parse_status == GOS_OK) { - gossip_add_node_if_absent(sp, &dc_name, &rack_name, &address, &ip, &port_str, &token, NORMAL, (uint64_t) time(NULL)); - } + parse_status = parse_seeds(&temp, &dc_name, &rack_name, &port_str, &address, + &ip, &token); + log_debug(LOG_VERB, "address : '%.*s'", address.len, address.data); + log_debug(LOG_VERB, "rack_name : '%.*s'", rack_name.len, rack_name.data); + log_debug(LOG_VERB, "dc_name : '%.*s'", dc_name.len, dc_name.data); + log_debug(LOG_VERB, "ip : '%.*s'", ip.len, ip.data); + log_debug(LOG_VERB, "port : '%.*s'", port_str.len, port_str.data); + + // struct dyn_token *token = array_get(&tokens, 0); + if (parse_status == GOS_OK) { + gossip_add_node_if_absent(sp, &dc_name, &rack_name, &address, &ip, + &port_str, &token, NORMAL, + (uint64_t)time(NULL)); } + p = q - 1; + q = dn_strrchr(p, start, '|'); string_deinit(&temp); - //array_deinit(&tokens); + // array_deinit(&tokens); deinit_dyn_token(&token); string_deinit(&rack_name); string_deinit(&dc_name); string_deinit(&port_str); string_deinit(&address); string_deinit(&ip); + } - gossip_debug(); - return DN_OK; -} + if (q == NULL) { + seed_node_len = (uint32_t)(p - start + 1); + seed_node = start; + string_copy(&temp, seed_node, seed_node_len); + // 
array_init(&tokens, 1, sizeof(struct dyn_token)); + init_dyn_token(&token); + parse_status = parse_seeds(&temp, &dc_name, &rack_name, &port_str, &address, + &ip, &token); + log_debug(LOG_VERB, "address : '%.*s'", address.len, address.data); + log_debug(LOG_VERB, "rack_name : '%.*s'", rack_name.len, rack_name.data); + log_debug(LOG_VERB, "dc_name : '%.*s'", dc_name.len, dc_name.data); + log_debug(LOG_VERB, "ip : '%.*s'", ip.len, ip.data); + log_debug(LOG_VERB, "port : '%.*s'", port_str.len, port_str.data); + + // struct dyn_token *token = array_get(&tokens, 0); + if (parse_status == GOS_OK) { + gossip_add_node_if_absent(sp, &dc_name, &rack_name, &address, &ip, + &port_str, &token, NORMAL, + (uint64_t)time(NULL)); + } + } + + string_deinit(&temp); + // array_deinit(&tokens); + deinit_dyn_token(&token); + string_deinit(&rack_name); + string_deinit(&dc_name); + string_deinit(&port_str); + string_deinit(&address); + string_deinit(&ip); + + gossip_debug(); + return DN_OK; +} /*static void gossip_metainfo(void) @@ -794,312 +758,309 @@ gossip_metainfo(void) dc_it = dictGetIterator(gn_pool.dict_dc); while ((dc_de = dictNext(dc_it)) != NULL) { struct gossip_dc *g_dc = dictGetVal(dc_de); - log_debug(LOG_VERB, "\tDC name : '%.*s'", g_dc->name.len, g_dc->name.data); - dictIterator *rack_it = dictGetIterator(g_dc->dict_rack); + log_debug(LOG_VERB, "\tDC name : '%.*s'", g_dc->name.len, +g_dc->name.data); dictIterator *rack_it = dictGetIterator(g_dc->dict_rack); dictEntry *rack_de; while ((rack_de = dictNext(rack_it)) != NULL) { struct gossip_rack *g_rack = dictGetVal(rack_de); - log_debug(LOG_VERB, "\tRack name : '%.*s'", g_rack->name.len, g_rack->name.data); + log_debug(LOG_VERB, "\tRack name : '%.*s'", +g_rack->name.len, g_rack->name.data); - dictIterator *node_it = dictGetIterator(g_rack->dict_token_nodes); - dictEntry *node_de; - while ((node_de = dictNext(node_it)) != NULL) { - struct gossip_node *gnode = dictGetVal(node_de); - log_debug(LOG_VERB, "\tNode name : '%.*s'", 
gnode->name.len, gnode->name.data); + dictIterator *node_it = +dictGetIterator(g_rack->dict_token_nodes); dictEntry *node_de; while ((node_de = +dictNext(node_it)) != NULL) { struct gossip_node *gnode = dictGetVal(node_de); + log_debug(LOG_VERB, "\tNode name : '%.*s'", +gnode->name.len, gnode->name.data); struct string *token_key = dictGetKey(node_de); - log_debug(LOG_VERB, "\tNode token : '%.*s'", *token_key); + log_debug(LOG_VERB, "\tNode token : '%.*s'", +*token_key); } } } }*/ -static void * -gossip_loop(void *arg) -{ - struct server_pool *sp = arg; - usec_t gossip_interval = gn_pool.g_interval * 1000; +static void *gossip_loop(void *arg) { + struct server_pool *sp = arg; + usec_t gossip_interval = gn_pool.g_interval * 1000; - seeds_buf = mbuf_alloc(SEED_BUF_SIZE); + seeds_buf = mbuf_alloc(SEED_BUF_SIZE); - log_debug(LOG_VVERB, "gossip_interval : %lu msecs", gn_pool.g_interval); - for(;;) { - usleep((useconds_t)gossip_interval); + log_debug(LOG_VVERB, "gossip_interval : %lu msecs", gn_pool.g_interval); + for (;;) { + usleep((useconds_t)gossip_interval); - log_debug(LOG_VERB, "Gossip is running ..."); + log_debug(LOG_VERB, "Gossip is running ..."); - if (gn_pool.seeds_provider != NULL && gn_pool.seeds_provider(sp->ctx, seeds_buf) == DN_OK) { - log_info("Got seed nodes '%.*s'", mbuf_length(seeds_buf), seeds_buf->pos); - gossip_update_seeds(sp, seeds_buf); - } + if (gn_pool.seeds_provider != NULL && + gn_pool.seeds_provider(sp->ctx, seeds_buf) == DN_OK) { + log_info("Got seed nodes '%.*s'", mbuf_length(seeds_buf), + seeds_buf->pos); + gossip_update_seeds(sp, seeds_buf); + } - current_node->ts = (uint64_t) time(NULL); - gossip_process_msgs(); + current_node->ts = (uint64_t)time(NULL); + gossip_process_msgs(); - if (current_node->state == NORMAL) { - gn_pool.ctx->dyn_state = NORMAL; - } + if (current_node->state == NORMAL) { + gn_pool.ctx->dyn_state = NORMAL; + } - if (!sp->enable_gossip) { - //gossip_debug(); - continue; //no gossiping - } + if (!sp->enable_gossip) 
{ + // gossip_debug(); + continue; // no gossiping + } - if (node_count == 1) { //single node deployment - gn_pool.ctx->dyn_state = NORMAL; - continue; - } + if (node_count == 1) { // single node deployment + gn_pool.ctx->dyn_state = NORMAL; + continue; + } - //STANDBY state for warm bootstrap - if (gn_pool.ctx->dyn_state == STANDBY) - continue; - - if (gn_pool.ctx->dyn_state == JOINING) { - log_debug(LOG_NOTICE, "I am still joining the ring!"); - //aggressively contact all known nodes before changing to state NORMAL - gossip_announce_joining(sp); - usleep((useconds_t)MAX(gn_pool.ctx->timeout, gossip_interval) * 2); - } else if (gn_pool.ctx->dyn_state == NORMAL) { - gossip_forward_state(sp); - } + // STANDBY state for warm bootstrap + if (gn_pool.ctx->dyn_state == STANDBY) continue; - gossip_debug(); + if (gn_pool.ctx->dyn_state == JOINING) { + log_debug(LOG_NOTICE, "I am still joining the ring!"); + // aggressively contact all known nodes before changing to state NORMAL + gossip_announce_joining(sp); + usleep((useconds_t)MAX(gn_pool.ctx->timeout, gossip_interval) * 2); + } else if (gn_pool.ctx->dyn_state == NORMAL) { + gossip_forward_state(sp); + } - } //end for loop + gossip_debug(); - mbuf_dealloc(seeds_buf); - seeds_buf = NULL; + } // end for loop - return NULL; -} + mbuf_dealloc(seeds_buf); + seeds_buf = NULL; + return NULL; +} -rstatus_t -gossip_start(struct server_pool *sp) -{ - rstatus_t status; - pthread_t tid; +rstatus_t gossip_start(struct server_pool *sp) { + rstatus_t status; + pthread_t tid; - status = pthread_create(&tid, NULL, gossip_loop, sp); - if (status < 0) { - log_error("gossip service create failed: %s", strerror(status)); - return DN_ERROR; - } + status = pthread_create(&tid, NULL, gossip_loop, sp); + if (status < 0) { + log_error("gossip service create failed: %s", strerror(status)); + return DN_ERROR; + } - return DN_OK; + return DN_OK; } - -static void -gossip_set_seeds_provider(struct string * seeds_provider_str) -{ - log_debug(LOG_VERB, 
"Seed provider :::::: '%.*s'", - seeds_provider_str->len, seeds_provider_str->data); - - if (dn_strncmp(seeds_provider_str->data, FLORIDA_PROVIDER, 16) == 0) { - gn_pool.seeds_provider = florida_get_seeds; - } else - if (dn_strncmp(seeds_provider_str->data, DNS_PROVIDER, 12) == 0) { - gn_pool.seeds_provider = dns_get_seeds; - } else { - gn_pool.seeds_provider = NULL; - } +static void gossip_set_seeds_provider(struct string *seeds_provider_str) { + log_debug(LOG_VERB, "Seed provider :::::: '%.*s'", seeds_provider_str->len, + seeds_provider_str->data); + + if (dn_strncmp(seeds_provider_str->data, FLORIDA_PROVIDER, 16) == 0) { + gn_pool.seeds_provider = florida_get_seeds; + } else if (dn_strncmp(seeds_provider_str->data, DNS_PROVIDER, 12) == 0) { + gn_pool.seeds_provider = dns_get_seeds; + } else { + gn_pool.seeds_provider = NULL; + } } +rstatus_t gossip_pool_init(struct context *ctx) { + rstatus_t status; + struct server_pool *sp = &ctx->pool; -rstatus_t -gossip_pool_init(struct context *ctx) -{ - rstatus_t status; - struct server_pool *sp = &ctx->pool; - - gn_pool.ctx = sp->ctx; - gn_pool.name = &sp->name; - gn_pool.idx = 0; - gn_pool.g_interval = sp->g_interval; + gn_pool.ctx = sp->ctx; + gn_pool.name = &sp->name; + gn_pool.idx = 0; + gn_pool.g_interval = sp->g_interval; - //dictDisableResize(); - gn_pool.dict_dc = dictCreate(&string_table_dict_type, NULL); + // dictDisableResize(); + gn_pool.dict_dc = dictCreate(&string_table_dict_type, NULL); - gossip_set_seeds_provider(&sp->seed_provider); + gossip_set_seeds_provider(&sp->seed_provider); - uint32_t n_dc = array_n(&sp->datacenters); - if (n_dc == 0) - return DN_OK; + uint32_t n_dc = array_n(&sp->datacenters); + if (n_dc == 0) return DN_OK; - if (n_dc > 0) { - status = array_init(&gn_pool.datacenters, n_dc, sizeof(struct gossip_dc)); - if (status != DN_OK) { - return status; - } + if (n_dc > 0) { + status = array_init(&gn_pool.datacenters, n_dc, sizeof(struct gossip_dc)); + if (status != DN_OK) { + return status; 
} - - //add racks and datacenters - uint32_t dc_cnt = array_n(&sp->datacenters); - uint32_t dc_index; - for(dc_index = 0; dc_index < dc_cnt; dc_index++) { - struct datacenter *dc = array_get(&sp->datacenters, dc_index); - uint32_t rack_cnt = array_n(&dc->racks); - uint32_t rack_index; - for(rack_index = 0; rack_index < rack_cnt; rack_index++) { - struct rack *rack = array_get(&dc->racks, rack_index); - - if (dictFind(gn_pool.dict_dc, rack->dc) == NULL) { - struct gossip_dc *g_dc = array_push(&gn_pool.datacenters); - gossip_dc_init(g_dc, rack->dc); - dictAdd(gn_pool.dict_dc, &g_dc->name, g_dc); - } - - struct gossip_dc *g_dc = dictFetchValue(gn_pool.dict_dc, rack->dc); - if (dictFind(g_dc->dict_rack, rack->name) == NULL) { - log_debug(LOG_VERB, "What?? No rack in Dict for rack : '%.*s'", *(rack->name)); - struct gossip_rack *g_rack = array_push(&g_dc->racks); - gossip_rack_init(g_rack, rack->dc, rack->name); - dictAdd(g_dc->dict_rack, &g_rack->name, g_rack); - } - } + } + + // add racks and datacenters + uint32_t dc_cnt = array_n(&sp->datacenters); + uint32_t dc_index; + for (dc_index = 0; dc_index < dc_cnt; dc_index++) { + struct datacenter *dc = array_get(&sp->datacenters, dc_index); + uint32_t rack_cnt = array_n(&dc->racks); + uint32_t rack_index; + for (rack_index = 0; rack_index < rack_cnt; rack_index++) { + struct rack *rack = array_get(&dc->racks, rack_index); + + if (dictFind(gn_pool.dict_dc, rack->dc) == NULL) { + struct gossip_dc *g_dc = array_push(&gn_pool.datacenters); + gossip_dc_init(g_dc, rack->dc); + dictAdd(gn_pool.dict_dc, &g_dc->name, g_dc); + } + + struct gossip_dc *g_dc = dictFetchValue(gn_pool.dict_dc, rack->dc); + if (dictFind(g_dc->dict_rack, rack->name) == NULL) { + log_debug(LOG_VERB, "What?? 
No rack in Dict for rack : '%.*s'", + *(rack->name)); + struct gossip_rack *g_rack = array_push(&g_dc->racks); + gossip_rack_init(g_rack, rack->dc, rack->name); + dictAdd(g_dc->dict_rack, &g_rack->name, g_rack); + } } + } - uint32_t i, nelem; - for (i = 0, nelem = array_n(&sp->peers); i < nelem; i++) { - struct node *peer = *(struct node **)array_get(&sp->peers, i); - struct gossip_dc *g_dc = dictFetchValue(gn_pool.dict_dc, &peer->dc); - struct gossip_rack *g_rack = dictFetchValue(g_dc->dict_rack, &peer->rack); - struct gossip_node *gnode = array_push(&g_rack->nodes); - - node_init(gnode); - - // pname is the entire hostname:port:rack:dc:tokens for peer node and just 0.0.0.0:8101 for local - string_copy(&gnode->pname, peer->endpoint.pname.data, peer->endpoint.pname.len); - // name or the hostname that gets overridden for local node by its broadcast address - string_copy(&gnode->name, peer->name.data, peer->name.len); - gnode->port = peer->endpoint.port; - string_copy(&gnode->rack, g_rack->name.data, g_rack->name.len); - string_copy(&gnode->dc, peer->dc.data, peer->dc.len); - gnode->is_local = peer->is_local; - - - if (i == 0) { //Don't override its own state - gnode->state = sp->ctx->dyn_state; //likely it is JOINING state - gnode->ts = (uint64_t)time(NULL); - current_node = gnode; - unsigned char *b_address = get_broadcast_address(sp); - string_deinit(&gnode->name); - string_copy(&gnode->name, b_address, dn_strlen(b_address)); - } else { - - unsigned char *local_ip4 = hostname_to_private_ip4( (char *) gnode->name.data); - // Use the local_ipv4 instead of the hostname, that's what we use for - // comparison eventually anyways - if (local_ip4 != NULL) { - string_deinit(&gnode->name); - string_copy_c(&gnode->name, local_ip4); - } - gnode->state = DOWN; - gnode->ts = 1010101; //make this to be a very aged ts - } - - struct dyn_token *ptoken = array_get(&peer->tokens, 0); - copy_dyn_token(ptoken, &gnode->token); - - //copy socket stuffs - - g_rack->nnodes++; - //add 
into dicts - dictAdd(g_rack->dict_name_nodes, &gnode->name, gnode); - dictAdd(g_rack->dict_token_nodes, token_to_string(&gnode->token), gnode); + uint32_t i, nelem; + for (i = 0, nelem = array_n(&sp->peers); i < nelem; i++) { + struct node *peer = *(struct node **)array_get(&sp->peers, i); + struct gossip_dc *g_dc = dictFetchValue(gn_pool.dict_dc, &peer->dc); + struct gossip_rack *g_rack = dictFetchValue(g_dc->dict_rack, &peer->rack); + struct gossip_node *gnode = array_push(&g_rack->nodes); - node_count++; + node_init(gnode); + // pname is the entire hostname:port:rack:dc:tokens for peer node and just + // 0.0.0.0:8101 for local + string_copy(&gnode->pname, peer->endpoint.pname.data, + peer->endpoint.pname.len); + // name or the hostname that gets overridden for local node by its broadcast + // address + string_copy(&gnode->name, peer->name.data, peer->name.len); + gnode->port = peer->endpoint.port; + string_copy(&gnode->rack, g_rack->name.data, g_rack->name.len); + string_copy(&gnode->dc, peer->dc.data, peer->dc.len); + gnode->is_local = peer->is_local; + + if (i == 0) { // Don't override its own state + gnode->state = sp->ctx->dyn_state; // likely it is JOINING state + gnode->ts = (uint64_t)time(NULL); + current_node = gnode; + unsigned char *b_address = get_broadcast_address(sp); + string_deinit(&gnode->name); + string_copy(&gnode->name, b_address, dn_strlen(b_address)); + } else { + unsigned char *local_ip4 = + hostname_to_private_ip4((char *)gnode->name.data); + // Use the local_ipv4 instead of the hostname, that's what we use for + // comparison eventually anyways + if (local_ip4 != NULL) { + string_deinit(&gnode->name); + string_copy_c(&gnode->name, local_ip4); + } + gnode->state = DOWN; + gnode->ts = 1010101; // make this to be a very aged ts } - //gossip_debug(); + struct dyn_token *ptoken = array_get(&peer->tokens, 0); + copy_dyn_token(ptoken, &gnode->token); - status = gossip_start(sp); - if (status != DN_OK) { - goto error; - } + // copy socket stuffs 
- return DN_OK; + g_rack->nnodes++; + // add into dicts + dictAdd(g_rack->dict_name_nodes, &gnode->name, gnode); + dictAdd(g_rack->dict_token_nodes, token_to_string(&gnode->token), gnode); - error: - gossip_destroy(sp); - return DN_OK; + node_count++; + } -} + // gossip_debug(); -rstatus_t -gossip_destroy(struct server_pool *sp) -{ - return DN_OK; -} + status = gossip_start(sp); + if (status != DN_OK) { + goto error; + } -void gossip_debug(void) -{ - uint32_t i, nelem; - for (i = 0, nelem = array_n(&gn_pool.datacenters); i < nelem; i++) { - log_debug(LOG_VERB, "===============Gossip dump==============================="); - struct gossip_dc *g_dc = (struct gossip_dc *) array_get(&gn_pool.datacenters, i); - log_debug(LOG_VERB, "\tDC name : '%.*s'", g_dc->name.len, g_dc->name.data); - log_debug(LOG_VERB, "========================================================="); - - uint32_t k, kelem; - for (k = 0, kelem = array_n(&g_dc->racks); k < kelem; k++) { - struct gossip_rack *g_rack = (struct gossip_rack *) array_get(&g_dc->racks, k); - - log_debug(LOG_VERB, "\tRACK name : '%.*s'", g_rack->name.len, g_rack->name.data); - log_debug(LOG_VERB, "\tNum nodes in RACK : '%d'", array_n(&g_rack->nodes)); - uint32_t jj; - for (jj = 0; jj < array_n(&g_rack->nodes); jj++) { - log_debug(LOG_VERB, "-----------------------------------------"); - struct gossip_node *node = (struct gossip_node *) array_get(&g_rack->nodes, jj); - log_debug(LOG_VERB, "\t\tNode name : '%.*s'", node->name); - log_debug(LOG_VERB, "\t\tNode pname : '%.*s'", node->pname); - log_debug(LOG_VERB, "\t\tNode state : %"PRIu32"", node->state); - log_debug(LOG_VERB, "\t\tNode port : %"PRIu32"", node->port); - log_debug(LOG_VERB, "\t\tNode is_local : %"PRIu32" ", node->is_local); - - print_dyn_token(&node->token, 8); - log_debug(LOG_VERB, "\t\tFinger print : %"PRIu64" ", dictFingerprint(g_rack->dict_token_nodes)); + return DN_OK; - } - } - } - log_debug(LOG_VERB, 
"..........................................................."); +error: + gossip_destroy(sp); + return DN_OK; } - -rstatus_t -gossip_msg_peer_update(void *rmsg) -{ - rstatus_t status; - struct ring_msg *msg = rmsg; - struct server_pool *sp = msg->sp; - - //TODOs: need to fix this as it is breaking warm bootstrap - current_node->state = NORMAL; - sp->ctx->dyn_state = NORMAL; - - uint32_t i=0; - uint32_t n = array_n(&msg->nodes); - for(i=0; inodes, i); - log_debug(LOG_VVERB, "Processing msg gossip_msg_peer_update '%.*s'", node->name.len, node->name.data); - log_debug(LOG_VVERB, "Processing gossip_msg_peer_update : datacenter '%.*s'", node->dc.len, node->dc.data); - log_debug(LOG_VVERB, "Processing gossip_msg_peer_update : rack '%.*s'", node->rack.len, node->rack.data); - log_debug(LOG_VVERB, "Processing gossip_msg_peer_update : name '%.*s'", node->name.len, node->name.data); - log_debug(LOG_VVERB, "State %d", node->state); - print_dyn_token(&node->token, 10); - - status = gossip_add_node_if_absent(sp, &node->dc, &node->rack, - &node->name, &node->name, - (node->port == 8101)? 
&PEER_PORT : &PEER_SSL_PORT, - &node->token, - node->state, - node->ts); +rstatus_t gossip_destroy(struct server_pool *sp) { return DN_OK; } + +void gossip_debug(void) { + uint32_t i, nelem; + for (i = 0, nelem = array_n(&gn_pool.datacenters); i < nelem; i++) { + log_debug(LOG_VERB, + "===============Gossip dump==============================="); + struct gossip_dc *g_dc = + (struct gossip_dc *)array_get(&gn_pool.datacenters, i); + log_debug(LOG_VERB, "\tDC name : '%.*s'", g_dc->name.len, + g_dc->name.data); + log_debug(LOG_VERB, + "========================================================="); + + uint32_t k, kelem; + for (k = 0, kelem = array_n(&g_dc->racks); k < kelem; k++) { + struct gossip_rack *g_rack = + (struct gossip_rack *)array_get(&g_dc->racks, k); + + log_debug(LOG_VERB, "\tRACK name : '%.*s'", g_rack->name.len, + g_rack->name.data); + log_debug(LOG_VERB, "\tNum nodes in RACK : '%d'", + array_n(&g_rack->nodes)); + uint32_t jj; + for (jj = 0; jj < array_n(&g_rack->nodes); jj++) { + log_debug(LOG_VERB, "-----------------------------------------"); + struct gossip_node *node = + (struct gossip_node *)array_get(&g_rack->nodes, jj); + log_debug(LOG_VERB, "\t\tNode name : '%.*s'", node->name); + log_debug(LOG_VERB, "\t\tNode pname : '%.*s'", node->pname); + log_debug(LOG_VERB, "\t\tNode state : %" PRIu32 "", + node->state); + log_debug(LOG_VERB, "\t\tNode port : %" PRIu32 "", node->port); + log_debug(LOG_VERB, "\t\tNode is_local : %" PRIu32 " ", + node->is_local); + + print_dyn_token(&node->token, 8); + log_debug(LOG_VERB, "\t\tFinger print : %" PRIu64 " ", + dictFingerprint(g_rack->dict_token_nodes)); + } } - gossip_debug(); + } + log_debug(LOG_VERB, + "..........................................................."); +} - return status; +rstatus_t gossip_msg_peer_update(void *rmsg) { + rstatus_t status; + struct ring_msg *msg = rmsg; + struct server_pool *sp = msg->sp; + + // TODOs: need to fix this as it is breaking warm bootstrap + current_node->state = 
NORMAL; + sp->ctx->dyn_state = NORMAL; + + uint32_t i = 0; + uint32_t n = array_n(&msg->nodes); + for (i = 0; i < n; i++) { + struct gossip_node *node = array_get(&msg->nodes, i); + log_debug(LOG_VVERB, "Processing msg gossip_msg_peer_update '%.*s'", + node->name.len, node->name.data); + log_debug(LOG_VVERB, + "Processing gossip_msg_peer_update : datacenter '%.*s'", + node->dc.len, node->dc.data); + log_debug(LOG_VVERB, "Processing gossip_msg_peer_update : rack '%.*s'", + node->rack.len, node->rack.data); + log_debug(LOG_VVERB, "Processing gossip_msg_peer_update : name '%.*s'", + node->name.len, node->name.data); + log_debug(LOG_VVERB, "State %d", node->state); + print_dyn_token(&node->token, 10); + + status = gossip_add_node_if_absent( + sp, &node->dc, &node->rack, &node->name, &node->name, + (node->port == 8101) ? &PEER_PORT : &PEER_SSL_PORT, &node->token, + node->state, node->ts); + } + gossip_debug(); + + return status; } diff --git a/src/dyn_gossip.h b/src/dyn_gossip.h index 2ce899446..462afbe41 100644 --- a/src/dyn_gossip.h +++ b/src/dyn_gossip.h @@ -1,85 +1,76 @@ -#include "hashkit/dyn_token.h" -#include "dyn_core.h" -#include "dyn_dict.h" - #ifndef DYN_GOSSIP_H_ #define DYN_GOSSIP_H_ +#include "dyn_array.h" +#include "dyn_dict.h" +#include "dyn_string.h" +#include "hashkit/dyn_token.h" -#define GOS_NOOPS 1 -#define GOS_OK 0 -#define GOS_ERROR -1 - -#define SIMPLE_PROVIDER "simple_provider" -#define FLORIDA_PROVIDER "florida_provider" -#define DNS_PROVIDER "dns_provider" +#define GOS_NOOPS 1 +#define GOS_OK 0 +#define GOS_ERROR -1 -#define SEED_BUF_SIZE (1024*1024) //in bytes +#define SIMPLE_PROVIDER "simple_provider" +#define FLORIDA_PROVIDER "florida_provider" +#define DNS_PROVIDER "dns_provider" +#define SEED_BUF_SIZE (1024 * 1024) // in bytes typedef uint8_t (*seeds_provider_t)(struct context *, struct mbuf *); extern struct gossip_node_pool gn_pool; - // In comparison to conf_server in dyn_conf.h, this structure, // has sockinfo & valid flag missing // 
It has is_local, state, and timestamp extra // Also in conf_server, pname is name:port:weight, // whereas here it is just name:port struct gossip_node { - struct string pname; /* name:port */ - struct string name; /* name */ - int port; /* port */ - // info is missing - struct dyn_token token; /* token for this node */ - struct string rack; - struct string dc; - bool is_secure; /* is a secured conn */ - - bool is_local; /* is this peer the current running node? */ - uint8_t state; /* state of a node that this host knows */ - uint64_t ts; /* timestamp */ - + struct string pname; /* name:port */ + struct string name; /* name */ + int port; /* port */ + // info is missing + struct dyn_token token; /* token for this node */ + struct string rack; + struct string dc; + bool is_secure; /* is a secured conn */ + + bool is_local; /* is this peer the current running node? */ + uint8_t state; /* state of a node that this host knows */ + uint64_t ts; /* timestamp */ }; - struct gossip_rack { - struct string name; - struct string dc; - uint32_t nnodes; /* # total nodes */ - uint32_t nlive_nodes; /* # live nodes */ - struct array nodes; /* nodes */ - dict *dict_token_nodes; - dict *dict_name_nodes; + struct string name; + struct string dc; + uint32_t nnodes; /* # total nodes */ + uint32_t nlive_nodes; /* # live nodes */ + struct array nodes; /* nodes */ + dict *dict_token_nodes; + dict *dict_name_nodes; }; - struct gossip_dc { - struct string name; /* datacenter name */ - struct array racks; /* list of gossip_rack in a datacenter */ - dict *dict_rack; + struct string name; /* datacenter name */ + struct array racks; /* list of gossip_rack in a datacenter */ + dict *dict_rack; }; struct gossip_node_pool { - struct string *name; /* pool name (ref in conf_pool) */ - uint32_t idx; /* pool index */ - struct context *ctx; /* owner context */ - seeds_provider_t seeds_provider; /* seeds provider */ - struct array datacenters; /* gossip datacenters */ - int64_t last_run; /* last time run 
in usec */ - msec_t g_interval; /* gossip interval */ - dict *dict_dc; - + struct string *name; /* pool name (ref in conf_pool) */ + uint32_t idx; /* pool index */ + struct context *ctx; /* owner context */ + seeds_provider_t seeds_provider; /* seeds provider */ + struct array datacenters; /* gossip datacenters */ + int64_t last_run; /* last time run in usec */ + msec_t g_interval; /* gossip interval */ + dict *dict_dc; }; - rstatus_t gossip_pool_init(struct context *ctx); void gossip_pool_deinit(struct context *ctx); rstatus_t gossip_start(struct server_pool *sp); rstatus_t gossip_destroy(struct server_pool *sp); - rstatus_t gossip_msg_peer_update(void *msg); - #endif /* DYN_GOSSIP_H_ */ diff --git a/src/dyn_histogram.c b/src/dyn_histogram.c index abfcfa8a7..7ebed28b6 100644 --- a/src/dyn_histogram.c +++ b/src/dyn_histogram.c @@ -6,285 +6,261 @@ */ #include -#include #include +#include #include -#include "dyn_core.h" #include "dyn_conf.h" +#include "dyn_core.h" #include "dyn_histogram.h" - - /* a port from this java code: * https://github.com/apache/cassandra/blob/cassandra-1.2/src/java/org/apache/cassandra/utils/EstimatedHistogram.java * * Will try to use https://github.com/HdrHistogram/HdrHistogram_c later */ - - static uint64_t bucket_offsets[BUCKET_SIZE]; - -rstatus_t histo_init(volatile struct histogram *histo) -{ - if (histo == NULL) { - return DN_ERROR; - } - - uint64_t *buckets = histo->buckets; - uint64_t last = 1; - bucket_offsets[0] = last; - int i; - for(i = 1; imean = 0; - histo->val_95th = 0; - histo->val_999th = 0; - histo->val_99th = 0; - histo->val_max = 0; - - return DN_OK; +rstatus_t histo_init(volatile struct histogram *histo) { + if (histo == NULL) { + return DN_ERROR; + } + + uint64_t *buckets = histo->buckets; + uint64_t last = 1; + bucket_offsets[0] = last; + int i; + for (i = 1; i < BUCKET_SIZE; i++) { + uint64_t next = (uint64_t)floor((double)last * 1.2); + if (next == last) next++; + + bucket_offsets[i] = next; + last = next; + } + + for 
(i = 0; i < BUCKET_SIZE; i++) { + buckets[i] = 0; + } + + histo->mean = 0; + histo->val_95th = 0; + histo->val_999th = 0; + histo->val_99th = 0; + histo->val_max = 0; + + return DN_OK; } +rstatus_t histo_reset(volatile struct histogram *histo) { + if (histo == NULL) { + return DN_ERROR; + } -rstatus_t histo_reset(volatile struct histogram *histo) -{ - if (histo == NULL) { - return DN_ERROR; - } - - uint64_t *buckets = histo->buckets; - int i; - for(i = 0; imean = 0; - histo->val_95th = 0; - histo->val_999th = 0; - histo->val_99th = 0; - histo->val_max = 0; - - return DN_OK; -} + uint64_t *buckets = histo->buckets; + int i; + for (i = 0; i < BUCKET_SIZE; i++) { + buckets[i] = 0; + } + histo->mean = 0; + histo->val_95th = 0; + histo->val_999th = 0; + histo->val_99th = 0; + histo->val_max = 0; -static uint64_t count(struct histogram *histo) -{ - if (histo == NULL) { - return 0; - } - - uint64_t *buckets = histo->buckets; - uint64_t sum = 0L; - int i; - for (i = 0; i < BUCKET_SIZE; i++) - sum += buckets[i]; - return sum; + return DN_OK; } -void histo_add(volatile struct histogram *histo, uint64_t val) -{ - if (histo == NULL) { - return; - } +static uint64_t count(struct histogram *histo) { + if (histo == NULL) { + return 0; + } + + uint64_t *buckets = histo->buckets; + uint64_t sum = 0L; + int i; + for (i = 0; i < BUCKET_SIZE; i++) sum += buckets[i]; + return sum; +} - uint64_t *buckets = histo->buckets; - int left_index, right_index, middle_index, index = 0; +void histo_add(volatile struct histogram *histo, uint64_t val) { + if (histo == NULL) { + return; + } - left_index = 0; - right_index = BUCKET_SIZE-1; + uint64_t *buckets = histo->buckets; + int left_index, right_index, middle_index, index = 0; + left_index = 0; + right_index = BUCKET_SIZE - 1; - while (left_index < right_index ) { - middle_index = left_index + (right_index - left_index) / 2; + while (left_index < right_index) { + middle_index = left_index + (right_index - left_index) / 2; - if (val == 
bucket_offsets[middle_index]) { - index = middle_index; - break; - } else if (val < bucket_offsets[middle_index]) { - right_index = middle_index; - } else { - left_index = middle_index; - } + if (val == bucket_offsets[middle_index]) { + index = middle_index; + break; + } else if (val < bucket_offsets[middle_index]) { + right_index = middle_index; + } else { + left_index = middle_index; + } - if (left_index == right_index - 1) { - index = left_index; - break; - } - } + if (left_index == right_index - 1) { + index = left_index; + break; + } + } - if (left_index == right_index - 1) - index = left_index; + if (left_index == right_index - 1) index = left_index; - buckets[index]++; + buckets[index]++; - //store max value - histo->val_max = (histo->val_max > val)? histo->val_max : val; + // store max value + histo->val_max = (histo->val_max > val) ? histo->val_max : val; } - /*uint64_t histo_get_bucket(struct histogram *histo, int bucket) { - if (histo == NULL) { - return -1; - } + if (histo == NULL) { + return -1; + } - uint64_t *buckets = histo->buckets; - if (bucket < BUCKET_SIZE) - return buckets[bucket]; + uint64_t *buckets = histo->buckets; + if (bucket < BUCKET_SIZE) + return buckets[bucket]; - return 0; + return 0; } void histo_get_buckets(struct histogram *histo, uint64_t* my_buckets) { - if (histo == NULL) { - return; - } + if (histo == NULL) { + return; + } - uint64_t *buckets = histo->buckets; - int i; - for(i=0; ibuckets; + int i; + for(i=0; ibuckets; - - if (percentile < 0 && percentile > 1.0) { - return 0; - } - - int last_bucket = BUCKET_SIZE - 1; - if (buckets[last_bucket] > 0) { - log_error("histogram overflowed!"); - return -1; - } - - uint64_t pcount = floor(count(histo) * percentile); - if (pcount == 0) - return 0; - - uint64_t elements = 0; - int i; - for (i = 0; i < last_bucket; i++) - { - elements += buckets[i]; - if (elements >= pcount) - return bucket_offsets[i]; - } - - return 0; + if (histo == NULL) { + return -1; + } + + uint64_t *buckets = 
histo->buckets; + + if (percentile < 0 && percentile > 1.0) { + return 0; + } + + int last_bucket = BUCKET_SIZE - 1; + if (buckets[last_bucket] > 0) { + log_error("histogram overflowed!"); + return -1; + } + + uint64_t pcount = floor(count(histo) * percentile); + if (pcount == 0) + return 0; + + uint64_t elements = 0; + int i; + for (i = 0; i < last_bucket; i++) + { + elements += buckets[i]; + if (elements >= pcount) + return bucket_offsets[i]; + } + + return 0; } uint64_t histo_mean(struct histogram *histo) { - if (histo == NULL) { - return -1; - } - - uint64_t *buckets = histo->buckets; - - int last_bucket = BUCKET_SIZE - 1; - if (buckets[last_bucket] > 0) { - log_error("histogram overflowed!"); - return -1; - } - - uint64_t elements = 0; - uint64_t sum = 0; - int i; - for (i = 0; i < last_bucket; i++) - { - elements += buckets[i]; - sum += buckets[i] * bucket_offsets[i]; - } - - return ceil((double) sum / elements); + if (histo == NULL) { + return -1; + } + + uint64_t *buckets = histo->buckets; + + int last_bucket = BUCKET_SIZE - 1; + if (buckets[last_bucket] > 0) { + log_error("histogram overflowed!"); + return -1; + } + + uint64_t elements = 0; + uint64_t sum = 0; + int i; + for (i = 0; i < last_bucket; i++) + { + elements += buckets[i]; + sum += buckets[i] * bucket_offsets[i]; + } + + return ceil((double) sum / elements); } uint64_t histo_max(struct histogram *histo) { - if (histo == NULL) { - return -1; - } + if (histo == NULL) { + return -1; + } - return histo->val_max; + return histo->val_max; } */ -void histo_compute(volatile struct histogram *histo) -{ - if (histo == NULL) { - return; - } - - uint64_t *buckets = histo->buckets; - - int last_bucket = BUCKET_SIZE - 1; - if (buckets[last_bucket] > 0) { - log_error("histogram overflowed!"); - return; - } +void histo_compute(volatile struct histogram *histo) { + if (histo == NULL) { + return; + } - uint64_t p95_count = (uint64_t)floor((double)count(histo) * 0.95); - uint64_t p99_count = 
(uint64_t)floor((double)count(histo) * 0.99); - uint64_t p999_count = (uint64_t)floor((double)count(histo) * 0.999); + uint64_t *buckets = histo->buckets; - uint64_t val_95th = 0; - uint64_t val_99th = 0; - uint64_t val_999th = 0; + int last_bucket = BUCKET_SIZE - 1; + if (buckets[last_bucket] > 0) { + log_error("histogram overflowed!"); + return; + } + uint64_t p95_count = (uint64_t)floor((double)count(histo) * 0.95); + uint64_t p99_count = (uint64_t)floor((double)count(histo) * 0.99); + uint64_t p999_count = (uint64_t)floor((double)count(histo) * 0.999); - uint64_t elements = 0; - uint64_t sum = 0; - int i; - for (i = 0; i < last_bucket; i++) - { - elements += buckets[i]; - if (elements >= p95_count && val_95th == 0) - val_95th = bucket_offsets[i]; + uint64_t val_95th = 0; + uint64_t val_99th = 0; + uint64_t val_999th = 0; - if (elements >= p99_count && val_99th == 0) - val_99th = bucket_offsets[i]; + uint64_t elements = 0; + uint64_t sum = 0; + int i; + for (i = 0; i < last_bucket; i++) { + elements += buckets[i]; + if (elements >= p95_count && val_95th == 0) val_95th = bucket_offsets[i]; - if (elements >= p999_count && val_999th == 0) - val_999th = bucket_offsets[i]; + if (elements >= p99_count && val_99th == 0) val_99th = bucket_offsets[i]; - sum += buckets[i] * bucket_offsets[i]; + if (elements >= p999_count && val_999th == 0) val_999th = bucket_offsets[i]; - } + sum += buckets[i] * bucket_offsets[i]; + } - if (elements != 0) - histo->mean = (uint64_t)ceil((double) sum / (double)elements); + if (elements != 0) + histo->mean = (uint64_t)ceil((double)sum / (double)elements); - histo->val_95th = val_95th; - histo->val_99th = val_99th; - histo->val_999th = val_999th; + histo->val_95th = val_95th; + histo->val_99th = val_99th; + histo->val_999th = val_999th; } diff --git a/src/dyn_histogram.h b/src/dyn_histogram.h index 2a6732e87..047e9946e 100644 --- a/src/dyn_histogram.h +++ b/src/dyn_histogram.h @@ -8,29 +8,25 @@ #ifndef DYN_HISTOGRAM_H_ #define 
DYN_HISTOGRAM_H_ - #define BUCKET_SIZE 94 - struct histogram { - uint64_t buckets[BUCKET_SIZE]; - uint64_t mean; - uint64_t val_95th; - uint64_t val_99th; - uint64_t val_999th; - uint64_t val_max; + uint64_t buckets[BUCKET_SIZE]; + uint64_t mean; + uint64_t val_95th; + uint64_t val_99th; + uint64_t val_999th; + uint64_t val_max; }; - rstatus_t histo_init(volatile struct histogram *histo); rstatus_t histo_reset(volatile struct histogram *histo); void histo_add(volatile struct histogram *histo, uint64_t val); uint64_t histo_get_bucket(volatile struct histogram *histo, int bucket); -void histo_get_buckets(volatile struct histogram *histo, uint64_t* my_buckets); +void histo_get_buckets(volatile struct histogram *histo, uint64_t *my_buckets); uint64_t histo_percentile(volatile struct histogram *histo, double percentile); uint64_t histo_mean(volatile struct histogram *histo); uint64_t histo_max(volatile struct histogram *histo); void histo_compute(volatile struct histogram *histo); - #endif /* DYN_HISTOGRAM_H_ */ diff --git a/src/dyn_log.c b/src/dyn_log.c index 731b3ef6a..476f1ba3b 100644 --- a/src/dyn_log.c +++ b/src/dyn_log.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -20,13 +20,13 @@ * limitations under the License. */ -#include -#include #include -#include +#include +#include +#include #include #include -#include +#include #include "dyn_core.h" @@ -39,239 +39,217 @@ static struct logger logger; * @param[in] name Full path to the log file. 
* @return */ -int -log_init(int level, char *name) -{ - struct logger *l = &logger; - - l->level = MAX(LOG_EMERG, MIN(level, LOG_PVERB)); - l->name = name; - if (name == NULL || !strlen(name)) { - l->fd = STDERR_FILENO; - } else { - l->fd = open(name, O_WRONLY | O_APPEND | O_CREAT, 0644); - if (l->fd < 0) { - log_stderr("opening log file '%s' failed: %s", name, - strerror(errno)); - return -1; - } +int log_init(int level, char *name) { + struct logger *l = &logger; + + l->level = MAX(LOG_EMERG, MIN(level, LOG_PVERB)); + l->name = name; + if (name == NULL || !strlen(name)) { + l->fd = STDERR_FILENO; + } else { + l->fd = open(name, O_WRONLY | O_APPEND | O_CREAT, 0644); + if (l->fd < 0) { + log_stderr("opening log file '%s' failed: %s", name, strerror(errno)); + return -1; } + } - return 0; + return 0; } /** * Close the logging file descriptor. */ -void -log_deinit(void) -{ - struct logger *l = &logger; +void log_deinit(void) { + struct logger *l = &logger; - if (l->fd < 0 || l->fd == STDERR_FILENO) { - return; - } + if (l->fd < 0 || l->fd == STDERR_FILENO) { + return; + } - close(l->fd); + close(l->fd); } -void -log_reopen(void) -{ - struct logger *l = &logger; - - if (l->fd != STDERR_FILENO) { - close(l->fd); - l->fd = open(l->name, O_WRONLY | O_APPEND | O_CREAT, 0644); - if (l->fd < 0) { - log_stderr("reopening log file '%s' failed, ignored: %s", l->name, - strerror(errno)); - } +void log_reopen(void) { + struct logger *l = &logger; + + if (l->fd != STDERR_FILENO) { + close(l->fd); + l->fd = open(l->name, O_WRONLY | O_APPEND | O_CREAT, 0644); + if (l->fd < 0) { + log_stderr("reopening log file '%s' failed, ignored: %s", l->name, + strerror(errno)); } + } } -void -log_level_up(void) -{ - struct logger *l = &logger; +void log_level_up(void) { + struct logger *l = &logger; - if (l->level < LOG_PVERB) { - l->level++; - loga("up log level to %d", l->level); - } + if (l->level < LOG_PVERB) { + l->level++; + loga("up log level to %d", l->level); + } } -void 
-log_level_down(void) -{ - struct logger *l = &logger; +void log_level_down(void) { + struct logger *l = &logger; - if (l->level > LOG_EMERG) { - l->level--; - loga("down log level to %d", l->level); - } + if (l->level > LOG_EMERG) { + l->level--; + loga("down log level to %d", l->level); + } } -void -log_level_set(int level) -{ - struct logger *l = &logger; +void log_level_set(int level) { + struct logger *l = &logger; - l->level = MAX(LOG_EMERG, MIN(level, LOG_PVERB)); - loga("set log level to %d", l->level); + l->level = MAX(LOG_EMERG, MIN(level, LOG_PVERB)); + loga("set log level to %d", l->level); } -int -log_loggable(int level) -{ - struct logger *l = &logger; +int log_loggable(int level) { + struct logger *l = &logger; - if (level > l->level) { - return 0; - } + if (level > l->level) { + return 0; + } - return 1; + return 1; } -void -_log(const char *file, int line, int panic, const char *fmt, ...) -{ - struct logger *l = &logger; - int len, size, errno_save; - char buf[LOG_MAX_LEN]; - va_list args; - ssize_t n; +void _log(const char *file, int line, int panic, const char *fmt, ...) 
{ + struct logger *l = &logger; + int len, size, errno_save; + char buf[LOG_MAX_LEN]; + va_list args; + ssize_t n; - if (l->fd < 0) { - return; - } + if (l->fd < 0) { + return; + } - errno_save = errno; - len = 0; /* length of output buffer */ - size = LOG_MAX_LEN; /* size of output buffer */ + errno_save = errno; + len = 0; /* length of output buffer */ + size = LOG_MAX_LEN; /* size of output buffer */ - struct timeval curTime; - gettimeofday(&curTime, NULL); + struct timeval curTime; + gettimeofday(&curTime, NULL); - char buffer [80]; - strftime(buffer, 80, "%Y-%m-%d %H:%M:%S", localtime(&curTime.tv_sec)); + char buffer[80]; + strftime(buffer, 80, "%Y-%m-%d %H:%M:%S", localtime(&curTime.tv_sec)); - // May be not the perfect place to fix this - len += dn_scnprintf(buf + len, size - len, - "[%.*s.%03d] %s:%d ", - strlen(buffer), buffer, (int64_t)curTime.tv_usec / 1000, - file, line); + // May be not the perfect place to fix this + len += + dn_scnprintf(buf + len, size - len, "[%.*s.%03d] %s:%d ", strlen(buffer), + buffer, (int64_t)curTime.tv_usec / 1000, file, line); - va_start(args, fmt); + va_start(args, fmt); - len += dn_vscnprintf(buf + len, size - len, fmt, args); + len += dn_vscnprintf(buf + len, size - len, fmt, args); - va_end(args); + va_end(args); - buf[len++] = '\n'; + buf[len++] = '\n'; - n = dn_write(l->fd, buf, len); - if (n < 0) { - l->nerror++; - } + n = dn_write(l->fd, buf, len); + if (n < 0) { + l->nerror++; + } - errno = errno_save; + errno = errno_save; - if (panic) { - fsync(l->fd); - close(l->fd); - abort(); - } + if (panic) { + fsync(l->fd); + close(l->fd); + abort(); + } } -void -_log_stderr(const char *fmt, ...) -{ - struct logger *l = &logger; - int len, size, errno_save; - char buf[4 * LOG_MAX_LEN]; - va_list args; - ssize_t n; +void _log_stderr(const char *fmt, ...) 
{ + struct logger *l = &logger; + int len, size, errno_save; + char buf[4 * LOG_MAX_LEN]; + va_list args; + ssize_t n; - errno_save = errno; - len = 0; /* length of output buffer */ - size = 4 * LOG_MAX_LEN; /* size of output buffer */ + errno_save = errno; + len = 0; /* length of output buffer */ + size = 4 * LOG_MAX_LEN; /* size of output buffer */ - va_start(args, fmt); - len += dn_vscnprintf(buf, size, fmt, args); - va_end(args); + va_start(args, fmt); + len += dn_vscnprintf(buf, size, fmt, args); + va_end(args); - buf[len++] = '\n'; + buf[len++] = '\n'; - n = dn_write(STDERR_FILENO, buf, len); - if (n < 0) { - l->nerror++; - } + n = dn_write(STDERR_FILENO, buf, len); + if (n < 0) { + l->nerror++; + } - errno = errno_save; + errno = errno_save; } /* * Hexadecimal dump in the canonical hex + ascii display * See -C option in man hexdump */ -void -_log_hexdump(const char *file, int line, char *data, int datalen, - const char *fmt, ...) -{ - struct logger *l = &logger; - char buf[8 * LOG_MAX_LEN]; - int i, off, len, size, errno_save; - ssize_t n; - - if (l->fd < 0) { - return; +void _log_hexdump(const char *file, int line, char *data, int datalen, + const char *fmt, ...) { + struct logger *l = &logger; + char buf[8 * LOG_MAX_LEN]; + int i, off, len, size, errno_save; + ssize_t n; + + if (l->fd < 0) { + return; + } + + /* log hexdump */ + errno_save = errno; + off = 0; /* data offset */ + len = 0; /* length of output buffer */ + size = 8 * LOG_MAX_LEN; /* size of output buffer */ + + while (datalen != 0 && (len < size - 1)) { + char *save, *str; + unsigned char c; + int savelen; + + len += dn_scnprintf(buf + len, size - len, "%08x ", off); + + save = data; + savelen = datalen; + + for (i = 0; datalen != 0 && i < 16; data++, datalen--, i++) { + c = (unsigned char)(*data); + str = (i == 7) ? " " : " "; + len += dn_scnprintf(buf + len, size - len, "%02x%s", c, str); + } + for (; i < 16; i++) { + str = (i == 7) ? 
" " : " "; + len += dn_scnprintf(buf + len, size - len, " %s", str); } - /* log hexdump */ - errno_save = errno; - off = 0; /* data offset */ - len = 0; /* length of output buffer */ - size = 8 * LOG_MAX_LEN; /* size of output buffer */ - - while (datalen != 0 && (len < size - 1)) { - char *save, *str; - unsigned char c; - int savelen; - - len += dn_scnprintf(buf + len, size - len, "%08x ", off); - - save = data; - savelen = datalen; - - for (i = 0; datalen != 0 && i < 16; data++, datalen--, i++) { - c = (unsigned char)(*data); - str = (i == 7) ? " " : " "; - len += dn_scnprintf(buf + len, size - len, "%02x%s", c, str); - } - for ( ; i < 16; i++) { - str = (i == 7) ? " " : " "; - len += dn_scnprintf(buf + len, size - len, " %s", str); - } - - data = save; - datalen = savelen; - - len += dn_scnprintf(buf + len, size - len, " |"); + data = save; + datalen = savelen; - for (i = 0; datalen != 0 && i < 16; data++, datalen--, i++) { - c = (unsigned char)(isprint(*data) ? *data : '.'); - len += dn_scnprintf(buf + len, size - len, "%c", c); - } - len += dn_scnprintf(buf + len, size - len, "|\n"); + len += dn_scnprintf(buf + len, size - len, " |"); - off += 16; + for (i = 0; datalen != 0 && i < 16; data++, datalen--, i++) { + c = (unsigned char)(isprint(*data) ? *data : '.'); + len += dn_scnprintf(buf + len, size - len, "%c", c); } + len += dn_scnprintf(buf + len, size - len, "|\n"); - n = dn_write(l->fd, buf, len); - if (n < 0) { - l->nerror++; - } + off += 16; + } + + n = dn_write(l->fd, buf, len); + if (n < 0) { + l->nerror++; + } - errno = errno_save; + errno = errno_save; } diff --git a/src/dyn_log.h b/src/dyn_log.h index 8e639895d..c19baf8a6 100644 --- a/src/dyn_log.h +++ b/src/dyn_log.h @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. 
+ */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -24,24 +24,24 @@ #define _DYN_LOG_H_ struct logger { - char *name; /* log file name */ - int level; /* log level */ - int fd; /* log file descriptor */ - int nerror; /* # log error */ + char *name; /* log file name */ + int level; /* log level */ + int fd; /* log file descriptor */ + int nerror; /* # log error */ }; -#define LOG_EMERG 0 /* system in unusable */ -#define LOG_ALERT 1 /* action must be taken immediately */ -#define LOG_CRIT 2 /* critical conditions */ -#define LOG_ERR 3 /* error conditions */ -#define LOG_WARN 4 /* warning conditions */ -#define LOG_NOTICE 5 /* normal but significant condition (default) */ -#define LOG_INFO 6 /* informational */ -#define LOG_DEBUG 7 /* debug messages */ -#define LOG_VERB 8 /* verbose messages */ -#define LOG_VVERB 9 /* verbose messages on crack */ -#define LOG_VVVERB 10 /* verbose messages on ganga */ -#define LOG_PVERB 11 /* periodic verbose messages on crack */ +#define LOG_EMERG 0 /* system in unusable */ +#define LOG_ALERT 1 /* action must be taken immediately */ +#define LOG_CRIT 2 /* critical conditions */ +#define LOG_ERR 3 /* error conditions */ +#define LOG_WARN 4 /* warning conditions */ +#define LOG_NOTICE 5 /* normal but significant condition (default) */ +#define LOG_INFO 6 /* informational */ +#define LOG_DEBUG 7 /* debug messages */ +#define LOG_VERB 8 /* verbose messages */ +#define LOG_VVERB 9 /* verbose messages on crack */ +#define LOG_VVVERB 10 /* verbose messages on ganga */ +#define LOG_PVERB 11 /* periodic verbose messages on crack */ #define LOG_MAX_LEN 512 /* max length of log message */ @@ -58,19 +58,21 @@ struct logger { */ #ifdef DN_DEBUG_LOG -#define log_debug(_level, ...) do { \ - if (log_loggable(_level) != 0) { \ - _log(__FUNCTION__, __LINE__, 0, __VA_ARGS__); \ - } \ -} while (0) - -#define log_hexdump(_level, _data, _datalen, ...) 
do { \ - if (log_loggable(_level) != 0) { \ - _log(__FUNCTION__, __LINE__, 0, __VA_ARGS__); \ - _log_hexdump(__FUNCTION__, __LINE__, (char *)(_data), (int)(_datalen), \ - __VA_ARGS__); \ - } \ -} while (0) +#define log_debug(_level, ...) \ + do { \ + if (log_loggable(_level) != 0) { \ + _log(__FUNCTION__, __LINE__, 0, __VA_ARGS__); \ + } \ + } while (0) + +#define log_hexdump(_level, _data, _datalen, ...) \ + do { \ + if (log_loggable(_level) != 0) { \ + _log(__FUNCTION__, __LINE__, 0, __VA_ARGS__); \ + _log_hexdump(__FUNCTION__, __LINE__, (char *)(_data), (int)(_datalen), \ + __VA_ARGS__); \ + } \ + } while (0) #else @@ -79,42 +81,46 @@ struct logger { #endif -#define log_notice(...) \ - log_debug(LOG_NOTICE, __VA_ARGS__); -#define log_info(...) \ - log_debug(LOG_INFO, __VA_ARGS__); - -#define log_stderr(...) do { \ - _log_stderr(__VA_ARGS__); \ -} while (0) - -#define loga(...) do { \ - _log(__FUNCTION__, __LINE__, 0, __VA_ARGS__); \ -} while (0) - -#define loga_hexdump(_data, _datalen, ...) do { \ - _log(__FUNCTION__, __LINE__, 0, __VA_ARGS__); \ - _log_hexdump(__FUNCTION__, __LINE__, (char *)(_data), (int)(_datalen), \ - __VA_ARGS__); \ -} while (0) \ - -#define log_error(...) do { \ - if (log_loggable(LOG_ALERT) != 0) { \ - _log(__FUNCTION__, __LINE__, 0, __VA_ARGS__); \ - } \ -} while (0) - -#define log_warn(...) do { \ - if (log_loggable(LOG_WARN) != 0) { \ - _log(__FUNCTION__, __LINE__, 0, __VA_ARGS__); \ - } \ -} while (0) - -#define log_panic(...) do { \ - if (log_loggable(LOG_EMERG) != 0) { \ - _log(__FUNCTION__, __LINE__, 1, __VA_ARGS__); \ - } \ -} while (0) +#define log_notice(...) log_debug(LOG_NOTICE, __VA_ARGS__); +#define log_info(...) log_debug(LOG_INFO, __VA_ARGS__); + +#define log_stderr(...) \ + do { \ + _log_stderr(__VA_ARGS__); \ + } while (0) + +#define loga(...) \ + do { \ + _log(__FUNCTION__, __LINE__, 0, __VA_ARGS__); \ + } while (0) + +#define loga_hexdump(_data, _datalen, ...) 
\ + do { \ + _log(__FUNCTION__, __LINE__, 0, __VA_ARGS__); \ + _log_hexdump(__FUNCTION__, __LINE__, (char *)(_data), (int)(_datalen), \ + __VA_ARGS__); \ + } while (0) + +#define log_error(...) \ + do { \ + if (log_loggable(LOG_ALERT) != 0) { \ + _log(__FUNCTION__, __LINE__, 0, __VA_ARGS__); \ + } \ + } while (0) + +#define log_warn(...) \ + do { \ + if (log_loggable(LOG_WARN) != 0) { \ + _log(__FUNCTION__, __LINE__, 0, __VA_ARGS__); \ + } \ + } while (0) + +#define log_panic(...) \ + do { \ + if (log_loggable(LOG_EMERG) != 0) { \ + _log(__FUNCTION__, __LINE__, 1, __VA_ARGS__); \ + } \ + } while (0) int log_init(int level, char *filename); void log_deinit(void); @@ -125,6 +131,7 @@ void log_reopen(void); int log_loggable(int level); void _log(const char *file, int line, int panic, const char *fmt, ...); void _log_stderr(const char *fmt, ...); -void _log_hexdump(const char *file, int line, char *data, int datalen, const char *fmt, ...); +void _log_hexdump(const char *file, int line, char *data, int datalen, + const char *fmt, ...); #endif diff --git a/src/dyn_mbuf.c b/src/dyn_mbuf.c index 0b170c332..24cd2a1c4 100644 --- a/src/dyn_mbuf.c +++ b/src/dyn_mbuf.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. 
@@ -29,224 +29,192 @@ static uint64_t nfree_mbufq; /* # free mbuf */ static struct mhdr free_mbufq; /* free mbuf q */ static size_t mbuf_chunk_size; /* mbuf chunk size - header + data (const) */ -static size_t mbuf_offset; /* mbuf offset in chunk (const) - include the extra space*/ +static size_t + mbuf_offset; /* mbuf offset in chunk (const) - include the extra space*/ static uint64_t mbuf_alloc_count = 0; -uint64_t -mbuf_alloc_get_count(void) -{ - return mbuf_alloc_count; -} +uint64_t mbuf_alloc_get_count(void) { return mbuf_alloc_count; } + +static struct mbuf *_mbuf_get(void) { + struct mbuf *mbuf; + uint8_t *buf; + + // loga("_mbuf_get, nfree_mbufq = %d", nfree_mbufq); + + if (!STAILQ_EMPTY(&free_mbufq)) { + ASSERT(nfree_mbufq > 0); -static struct mbuf * -_mbuf_get(void) -{ - struct mbuf *mbuf; - uint8_t *buf; - - //loga("_mbuf_get, nfree_mbufq = %d", nfree_mbufq); - - if (!STAILQ_EMPTY(&free_mbufq)) { - ASSERT(nfree_mbufq > 0); - - mbuf = STAILQ_FIRST(&free_mbufq); - nfree_mbufq--; - STAILQ_REMOVE_HEAD(&free_mbufq, next); - - ASSERT(mbuf->magic == MBUF_MAGIC); - goto done; - } - - buf = dn_alloc(mbuf_chunk_size); - if (buf == NULL) { - return NULL; - } - mbuf_alloc_count++; - - /* - * mbuf header is at the tail end of the mbuf. 
This enables us to catch - * buffer overrun early by asserting on the magic value during get or - * put operations - * - * <------------- mbuf_chunk_size -------------------------> - * +-------------------------------------------------------+ - * | mbuf data | mbuf header | - * | (mbuf_offset) | (struct mbuf) | - * +-------------------------------------------------------+ - * ^ ^ ^ ^ ^^ - * | | | | || - * | | | | \ \mbuf->end_extra (one byte past valid bound) - * \ | | \ \ - * mbuf->start \ | mbuf->end mbuf - * mbuf->pos | - * \ - * mbuf->last (one byte past valid byte) - * - */ - mbuf = (struct mbuf *)(buf + mbuf_offset); - mbuf->magic = MBUF_MAGIC; - mbuf->chunk_size = mbuf_chunk_size; + mbuf = STAILQ_FIRST(&free_mbufq); + nfree_mbufq--; + STAILQ_REMOVE_HEAD(&free_mbufq, next); + + ASSERT(mbuf->magic == MBUF_MAGIC); + goto done; + } + + buf = dn_alloc(mbuf_chunk_size); + if (buf == NULL) { + return NULL; + } + mbuf_alloc_count++; + + /* + * mbuf header is at the tail end of the mbuf. This enables us to catch + * buffer overrun early by asserting on the magic value during get or + * put operations + * + * <------------- mbuf_chunk_size -------------------------> + * +-------------------------------------------------------+ + * | mbuf data | mbuf header | + * | (mbuf_offset) | (struct mbuf) | + * +-------------------------------------------------------+ + * ^ ^ ^ ^ ^^ + * | | | | || + * | | | | \ \mbuf->end_extra (one byte past + * valid bound) + * \ | | \ \ + * mbuf->start \ | mbuf->end mbuf + * mbuf->pos | + * \ + * mbuf->last (one byte past valid byte) + * + */ + mbuf = (struct mbuf *)(buf + mbuf_offset); + mbuf->magic = MBUF_MAGIC; + mbuf->chunk_size = mbuf_chunk_size; done: - STAILQ_NEXT(mbuf, next) = NULL; - return mbuf; + STAILQ_NEXT(mbuf, next) = NULL; + return mbuf; } +struct mbuf *mbuf_get(void) { + struct mbuf *mbuf; + uint8_t *buf; -struct mbuf * -mbuf_get(void) -{ - struct mbuf *mbuf; - uint8_t *buf; - - mbuf = _mbuf_get(); - if (mbuf == NULL) { - 
loga("mbuf is Null"); - return NULL; - } + mbuf = _mbuf_get(); + if (mbuf == NULL) { + loga("mbuf is Null"); + return NULL; + } - buf = (uint8_t *)mbuf - mbuf_offset; - mbuf->start = buf; - mbuf->end = buf + mbuf_offset - MBUF_ESIZE; - mbuf->end_extra = buf + mbuf_offset; + buf = (uint8_t *)mbuf - mbuf_offset; + mbuf->start = buf; + mbuf->end = buf + mbuf_offset - MBUF_ESIZE; + mbuf->end_extra = buf + mbuf_offset; - //ASSERT(mbuf->end - mbuf->start == (int)mbuf_offset); - ASSERT(mbuf->start < mbuf->end); + // ASSERT(mbuf->end - mbuf->start == (int)mbuf_offset); + ASSERT(mbuf->start < mbuf->end); - mbuf->pos = mbuf->start; - mbuf->last = mbuf->start; + mbuf->pos = mbuf->start; + mbuf->last = mbuf->start; - mbuf->flags = 0; + mbuf->flags = 0; - log_debug(LOG_VVERB, "get mbuf %p", mbuf); + log_debug(LOG_VVERB, "get mbuf %p", mbuf); - return mbuf; + return mbuf; } -static void -mbuf_free(struct mbuf *mbuf) -{ - uint8_t *buf; +static void mbuf_free(struct mbuf *mbuf) { + uint8_t *buf; - log_debug(LOG_VVERB, "put mbuf %p len %d", mbuf, mbuf->last - mbuf->pos); + log_debug(LOG_VVERB, "put mbuf %p len %d", mbuf, mbuf->last - mbuf->pos); - ASSERT(STAILQ_NEXT(mbuf, next) == NULL); - ASSERT(mbuf->magic == MBUF_MAGIC); + ASSERT(STAILQ_NEXT(mbuf, next) == NULL); + ASSERT(mbuf->magic == MBUF_MAGIC); - buf = (uint8_t *)mbuf - mbuf_offset; - dn_free(buf); + buf = (uint8_t *)mbuf - mbuf_offset; + dn_free(buf); } -uint64_t -mbuf_free_queue_size(void) -{ - return nfree_mbufq; -} +uint64_t mbuf_free_queue_size(void) { return nfree_mbufq; } +void mbuf_dump(struct mbuf *mbuf) { + uint8_t *p, *q; + long int len; -void mbuf_dump(struct mbuf *mbuf) -{ - uint8_t *p, *q; - long int len; + p = mbuf->pos; + q = mbuf->last; + len = q - p; - p = mbuf->pos; - q = mbuf->last; - len = q - p; - - loga_hexdump(p, len, "mbuf %p with %ld bytes of data", mbuf, len); + loga_hexdump(p, len, "mbuf %p with %ld bytes of data", mbuf, len); } -void -mbuf_put(struct mbuf *mbuf) -{ - log_debug(LOG_VVERB, "put 
mbuf %p len %d", mbuf, mbuf->last - mbuf->pos); +void mbuf_put(struct mbuf *mbuf) { + log_debug(LOG_VVERB, "put mbuf %p len %d", mbuf, mbuf->last - mbuf->pos); - ASSERT(STAILQ_NEXT(mbuf, next) == NULL); - ASSERT(mbuf->magic == MBUF_MAGIC); + ASSERT(STAILQ_NEXT(mbuf, next) == NULL); + ASSERT(mbuf->magic == MBUF_MAGIC); - nfree_mbufq++; - STAILQ_INSERT_HEAD(&free_mbufq, mbuf, next); + nfree_mbufq++; + STAILQ_INSERT_HEAD(&free_mbufq, mbuf, next); } /* * Rewind the mbuf by discarding any of the read or unread data that it * might hold. */ -void -mbuf_rewind(struct mbuf *mbuf) -{ - mbuf->pos = mbuf->start; - mbuf->last = mbuf->start; +void mbuf_rewind(struct mbuf *mbuf) { + mbuf->pos = mbuf->start; + mbuf->last = mbuf->start; } /* * Return the length of data in mbuf. Mbuf cannot contain more than * 2^32 bytes (4G). */ -uint32_t -mbuf_length(struct mbuf *mbuf) -{ - ASSERT(mbuf->last >= mbuf->pos); +uint32_t mbuf_length(struct mbuf *mbuf) { + ASSERT(mbuf->last >= mbuf->pos); - return (uint32_t)(mbuf->last - mbuf->pos); + return (uint32_t)(mbuf->last - mbuf->pos); } /* * Return the remaining space size for any new data in mbuf. Mbuf cannot * contain more than 2^32 bytes (4G). */ -uint32_t -mbuf_size(struct mbuf *mbuf) -{ - ASSERT(mbuf->end >= mbuf->last); +uint32_t mbuf_size(struct mbuf *mbuf) { + ASSERT(mbuf->end >= mbuf->last); - return (uint32_t)(mbuf->end - mbuf->last); + return (uint32_t)(mbuf->end - mbuf->last); } /* * Return the maximum available space size for data in any mbuf. Mbuf cannot * contain more than 2^32 bytes (4G). 
*/ -size_t -mbuf_data_size(void) -{ - return mbuf_offset; -} +size_t mbuf_data_size(void) { return mbuf_offset; } /* * Insert mbuf at the tail of the mhdr Q */ -void -mbuf_insert(struct mhdr *mhdr, struct mbuf *mbuf) -{ - STAILQ_INSERT_TAIL(mhdr, mbuf, next); - log_debug(LOG_VVERB, "insert mbuf %p len %d", mbuf, mbuf->last - mbuf->pos); +void mbuf_insert(struct mhdr *mhdr, struct mbuf *mbuf) { + STAILQ_INSERT_TAIL(mhdr, mbuf, next); + log_debug(LOG_VVERB, "insert mbuf %p len %d", mbuf, mbuf->last - mbuf->pos); } -void -mbuf_insert_head(struct mhdr *mhdr, struct mbuf *mbuf) -{ - STAILQ_INSERT_HEAD(mhdr, mbuf, next); - log_debug(LOG_VVERB, "insert head mbuf %p len %d", mbuf, mbuf->last - mbuf->pos); +void mbuf_insert_head(struct mhdr *mhdr, struct mbuf *mbuf) { + STAILQ_INSERT_HEAD(mhdr, mbuf, next); + log_debug(LOG_VVERB, "insert head mbuf %p len %d", mbuf, + mbuf->last - mbuf->pos); } -void -mbuf_insert_after(struct mhdr *mhdr, struct mbuf *mbuf, struct mbuf *nbuf) -{ - STAILQ_INSERT_AFTER(mhdr, nbuf, mbuf, next); - log_debug(LOG_VVERB, "insert head mbuf %p len %d", mbuf, mbuf->last - mbuf->pos); +void mbuf_insert_after(struct mhdr *mhdr, struct mbuf *mbuf, + struct mbuf *nbuf) { + STAILQ_INSERT_AFTER(mhdr, nbuf, mbuf, next); + log_debug(LOG_VVERB, "insert head mbuf %p len %d", mbuf, + mbuf->last - mbuf->pos); } /* * Remove mbuf from the mhdr Q */ -void -mbuf_remove(struct mhdr *mhdr, struct mbuf *mbuf) -{ - log_debug(LOG_VVERB, "remove mbuf %p len %d", mbuf, mbuf->last - mbuf->pos); +void mbuf_remove(struct mhdr *mhdr, struct mbuf *mbuf) { + log_debug(LOG_VVERB, "remove mbuf %p len %d", mbuf, mbuf->last - mbuf->pos); - STAILQ_REMOVE(mhdr, mbuf, mbuf, next); - STAILQ_NEXT(mbuf, next) = NULL; + STAILQ_REMOVE(mhdr, mbuf, mbuf, next); + STAILQ_NEXT(mbuf, next) = NULL; } /* @@ -255,21 +223,19 @@ mbuf_remove(struct mhdr *mhdr, struct mbuf *mbuf) * The memory areas should not overlap and the mbuf should have * enough space for n bytes. 
*/ -void -mbuf_copy(struct mbuf *mbuf, uint8_t *pos, size_t n) -{ - if (n == 0) { - return; - } +void mbuf_copy(struct mbuf *mbuf, uint8_t *pos, size_t n) { + if (n == 0) { + return; + } - /* mbuf has space for n bytes */ - ASSERT(!mbuf_full(mbuf) && n <= mbuf_size(mbuf)); + /* mbuf has space for n bytes */ + ASSERT(!mbuf_full(mbuf) && n <= mbuf_size(mbuf)); - /* no overlapping copy */ - ASSERT(pos < mbuf->start || pos >= mbuf->end); + /* no overlapping copy */ + ASSERT(pos < mbuf->start || pos >= mbuf->end); - dn_memcpy(mbuf->last, pos, n); - mbuf->last += n; + dn_memcpy(mbuf->last, pos, n); + mbuf->last += n; } /* @@ -279,177 +245,152 @@ mbuf_copy(struct mbuf *mbuf, uint8_t *pos, size_t n) * * Return new mbuf t, if the split was successful. */ -struct mbuf * -mbuf_split(struct mhdr *h, uint8_t *pos, func_mbuf_copy_t cb, void *cbarg) -{ - struct mbuf *mbuf, *nbuf; - size_t size; +struct mbuf *mbuf_split(struct mhdr *h, uint8_t *pos, func_mbuf_copy_t cb, + void *cbarg) { + struct mbuf *mbuf, *nbuf; + size_t size; - ASSERT(!STAILQ_EMPTY(h)); + ASSERT(!STAILQ_EMPTY(h)); - mbuf = STAILQ_LAST(h, mbuf, next); + mbuf = STAILQ_LAST(h, mbuf, next); - //ASSERT(pos >= mbuf->pos && pos <= mbuf->last); - if (pos < mbuf->pos || pos > mbuf->last) - return NULL; + // ASSERT(pos >= mbuf->pos && pos <= mbuf->last); + if (pos < mbuf->pos || pos > mbuf->last) return NULL; - nbuf = mbuf_get(); - if (nbuf == NULL) { - return NULL; - } + nbuf = mbuf_get(); + if (nbuf == NULL) { + return NULL; + } - if (cb != NULL) { - /* precopy nbuf */ - cb(nbuf, cbarg); - } + if (cb != NULL) { + /* precopy nbuf */ + cb(nbuf, cbarg); + } - /* copy data from mbuf to nbuf */ - size = (size_t)(mbuf->last - pos); - mbuf_copy(nbuf, pos, size); + /* copy data from mbuf to nbuf */ + size = (size_t)(mbuf->last - pos); + mbuf_copy(nbuf, pos, size); - /* adjust mbuf */ - mbuf->last = pos; + /* adjust mbuf */ + mbuf->last = pos; - log_debug(LOG_VVERB, "split into mbuf %p len %"PRIu32" and nbuf %p len " - 
"%"PRIu32" copied %zu bytes", mbuf, mbuf_length(mbuf), nbuf, - mbuf_length(nbuf), size); + log_debug(LOG_VVERB, + "split into mbuf %p len %" PRIu32 + " and nbuf %p len " + "%" PRIu32 " copied %zu bytes", + mbuf, mbuf_length(mbuf), nbuf, mbuf_length(nbuf), size); - return nbuf; + return nbuf; } /** * Initialize memory buffers to store network packets/socket buffers. * @param[in,out] mbuf_size */ -void -mbuf_init(size_t mbuf_size) -{ - nfree_mbufq = 0; - STAILQ_INIT(&free_mbufq); +void mbuf_init(size_t mbuf_size) { + nfree_mbufq = 0; + STAILQ_INIT(&free_mbufq); - mbuf_chunk_size = mbuf_size + MBUF_ESIZE; - mbuf_offset = mbuf_chunk_size - MBUF_HSIZE; + mbuf_chunk_size = mbuf_size + MBUF_ESIZE; + mbuf_offset = mbuf_chunk_size - MBUF_HSIZE; - log_debug(LOG_DEBUG, "mbuf hsize %d chunk size %zu offset %zu length %zu", - MBUF_HSIZE, mbuf_chunk_size, mbuf_offset, mbuf_offset); + log_debug(LOG_DEBUG, "mbuf hsize %d chunk size %zu offset %zu length %zu", + MBUF_HSIZE, mbuf_chunk_size, mbuf_offset, mbuf_offset); } -void -mbuf_deinit(void) -{ - while (!STAILQ_EMPTY(&free_mbufq)) { - struct mbuf *mbuf = STAILQ_FIRST(&free_mbufq); - mbuf_remove(&free_mbufq, mbuf); - mbuf_free(mbuf); - nfree_mbufq--; - } - ASSERT(nfree_mbufq == 0); +void mbuf_deinit(void) { + while (!STAILQ_EMPTY(&free_mbufq)) { + struct mbuf *mbuf = STAILQ_FIRST(&free_mbufq); + mbuf_remove(&free_mbufq, mbuf); + mbuf_free(mbuf); + nfree_mbufq--; + } + ASSERT(nfree_mbufq == 0); } - -void -mbuf_write_char(struct mbuf *mbuf, char ch) -{ - ASSERT(mbuf_size(mbuf) >= 1); - *mbuf->last = ch; - mbuf->last += 1; +void mbuf_write_char(struct mbuf *mbuf, char ch) { + ASSERT(mbuf_size(mbuf) >= 1); + *mbuf->last = ch; + mbuf->last += 1; } - -void -mbuf_write_string(struct mbuf *mbuf, const struct string *s) -{ - ASSERT(s->len < mbuf_size(mbuf)); - mbuf_copy(mbuf, s->data, s->len); +void mbuf_write_string(struct mbuf *mbuf, const struct string *s) { + ASSERT(s->len < mbuf_size(mbuf)); + mbuf_copy(mbuf, s->data, s->len); } -void 
mbuf_write_mbuf(struct mbuf *mbuf, struct mbuf *data) -{ - mbuf_copy(mbuf, data->pos, data->last - data->pos); +void mbuf_write_mbuf(struct mbuf *mbuf, struct mbuf *data) { + mbuf_copy(mbuf, data->pos, data->last - data->pos); } -void mbuf_write_bytes(struct mbuf *mbuf, unsigned char *data, int len) -{ - mbuf_copy(mbuf, data, len); +void mbuf_write_bytes(struct mbuf *mbuf, unsigned char *data, int len) { + mbuf_copy(mbuf, data, len); } -void -mbuf_write_uint8(struct mbuf *mbuf, uint8_t num) -{ - if (num < 10) { - mbuf_write_char(mbuf, '0' + num); - return; - } +void mbuf_write_uint8(struct mbuf *mbuf, uint8_t num) { + if (num < 10) { + mbuf_write_char(mbuf, '0' + num); + return; + } - mbuf_write_uint8(mbuf, num / 10); - mbuf_write_char(mbuf, '0' + (num % 10)); + mbuf_write_uint8(mbuf, num / 10); + mbuf_write_char(mbuf, '0' + (num % 10)); } +void mbuf_write_uint32(struct mbuf *mbuf, uint32_t num) { + if (num < 10) { + mbuf_write_char(mbuf, '0' + num); + return; + } -void -mbuf_write_uint32(struct mbuf *mbuf, uint32_t num) -{ - if (num < 10) { - mbuf_write_char(mbuf, '0' + num); - return; - } - - mbuf_write_uint32(mbuf, num / 10); - mbuf_write_char(mbuf, '0' + (num % 10)); + mbuf_write_uint32(mbuf, num / 10); + mbuf_write_char(mbuf, '0' + (num % 10)); } +void mbuf_write_uint64(struct mbuf *mbuf, uint64_t num) { + if (num < 10) { + mbuf_write_char(mbuf, '0' + num); + return; + } -void -mbuf_write_uint64(struct mbuf *mbuf, uint64_t num) -{ - - if (num < 10) { - mbuf_write_char(mbuf, '0' + num); - return; - } - - mbuf_write_uint64(mbuf, num / 10); - mbuf_write_char(mbuf, '0' + (num % 10)); + mbuf_write_uint64(mbuf, num / 10); + mbuf_write_char(mbuf, '0' + (num % 10)); } +// allocate an arbitrary size mbuf for a general purpose operation +struct mbuf *mbuf_alloc(const size_t size) { + size_t mbuf_chunk_size = size + MBUF_HSIZE; -//allocate an arbitrary size mbuf for a general purpose operation -struct mbuf * -mbuf_alloc(const size_t size) -{ - uint8_t *buf = 
dn_alloc(size + MBUF_HSIZE); - if (buf == NULL) { - return NULL; - } + uint8_t *buf = dn_alloc(mbuf_chunk_size); + if (buf == NULL) { + return NULL; + } - struct mbuf *mbuf = (struct mbuf *)(buf + size); - mbuf->magic = MBUF_MAGIC; - mbuf->chunk_size = size; + struct mbuf *mbuf = (struct mbuf *)(buf + size); + mbuf->magic = MBUF_MAGIC; + mbuf->chunk_size = mbuf_chunk_size; - STAILQ_NEXT(mbuf, next) = NULL; + STAILQ_NEXT(mbuf, next) = NULL; - mbuf->start = buf; - mbuf->end = buf + size - MBUF_ESIZE; - mbuf->end_extra = buf + size; + mbuf->start = buf; + mbuf->end = buf + size - MBUF_ESIZE; + mbuf->end_extra = buf + size; - mbuf->pos = mbuf->start; - mbuf->last = mbuf->start; + mbuf->pos = mbuf->start; + mbuf->last = mbuf->start; - return mbuf; + return mbuf; } +void mbuf_dealloc(struct mbuf *mbuf) { + uint8_t *buf; -void -mbuf_dealloc(struct mbuf *mbuf) -{ - uint8_t *buf; + log_debug(LOG_VVERB, "free mbuf %p len %d", mbuf, mbuf->last - mbuf->pos); - log_debug(LOG_VVERB, "free mbuf %p len %d", mbuf, mbuf->last - mbuf->pos); - - ASSERT(STAILQ_NEXT(mbuf, next) == NULL); - ASSERT(mbuf->magic == MBUF_MAGIC); + ASSERT(STAILQ_NEXT(mbuf, next) == NULL); + ASSERT(mbuf->magic == MBUF_MAGIC); - size_t size = mbuf->chunk_size - MBUF_HSIZE; - buf = (uint8_t *)mbuf - size; - dn_free(buf); + size_t size = mbuf->chunk_size - MBUF_HSIZE; + buf = (uint8_t *)mbuf - size; + dn_free(buf); } diff --git a/src/dyn_mbuf.h b/src/dyn_mbuf.h index 2a30f79d0..5b280785c 100644 --- a/src/dyn_mbuf.h +++ b/src/dyn_mbuf.h @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -19,51 +19,50 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "dyn_core.h" #ifndef _DYN_MBUF_H_ #define _DYN_MBUF_H_ +#include +#include +#include +#include +#include "dyn_queue.h" typedef void (*func_mbuf_copy_t)(struct mbuf *, void *); struct mbuf { - uint32_t magic; /* mbuf magic (const) */ - STAILQ_ENTRY(mbuf) next; /* next mbuf */ - uint8_t *pos; /* read marker */ - uint8_t *last; /* write marker */ - uint8_t *start; /* start of buffer (const) */ - uint8_t *end; /* end of buffer (const) */ - uint8_t *end_extra; /*end of the buffer - including the extra region */ - uint32_t flags; /* flags: readflip, just_decrypted etc */ - uint32_t chunk_size; + uint32_t magic; /* mbuf magic (const) */ + STAILQ_ENTRY(mbuf) next; /* next mbuf */ + uint8_t *pos; /* read marker */ + uint8_t *last; /* write marker */ + uint8_t *start; /* start of buffer (const) */ + uint8_t *end; /* end of buffer (const) */ + uint8_t *end_extra; /*end of the buffer - including the extra region */ + uint32_t flags; /* flags: readflip, just_decrypted etc */ + uint32_t chunk_size; }; STAILQ_HEAD(mhdr, mbuf); -#define MBUF_MAGIC 0xdeadbeef -#define MBUF_MIN_SIZE 512 -#define MBUF_MAX_SIZE 512000 -#define MBUF_SIZE 16384 -#define MBUF_HSIZE sizeof(struct mbuf) -#define MBUF_ESIZE 16 +#define MBUF_MAGIC 0xdeadbeef +#define MBUF_MIN_SIZE 512 +#define MBUF_MAX_SIZE 512000 +#define MBUF_SIZE 16384 +#define MBUF_HSIZE sizeof(struct mbuf) +#define MBUF_ESIZE 16 // FLAGS -#define MBUF_FLAGS_READ_FLIP 0x00000001 -#define MBUF_FLAGS_JUST_DECRYPTED 0x00000002 - +#define MBUF_FLAGS_READ_FLIP 0x00000001 +#define MBUF_FLAGS_JUST_DECRYPTED 0x00000002 -static inline bool -mbuf_empty(struct mbuf *mbuf) -{ - return mbuf->pos == mbuf->last ? true : false; +static inline bool mbuf_empty(struct mbuf *mbuf) { + return mbuf->pos == mbuf->last ? true : false; } -static inline bool -mbuf_full(struct mbuf *mbuf) -{ - return mbuf->last == mbuf->end? true : false; +static inline bool mbuf_full(struct mbuf *mbuf) { + return mbuf->last == mbuf->end ? 
true : false; } void mbuf_init(size_t mbuf_chunk_size); @@ -82,7 +81,8 @@ void mbuf_insert_head(struct mhdr *mhdr, struct mbuf *mbuf); void mbuf_insert_after(struct mhdr *mhdr, struct mbuf *mbuf, struct mbuf *nbuf); void mbuf_remove(struct mhdr *mhdr, struct mbuf *mbuf); void mbuf_copy(struct mbuf *mbuf, uint8_t *pos, size_t n); -struct mbuf *mbuf_split(struct mhdr *h, uint8_t *pos, func_mbuf_copy_t cb, void *cbarg); +struct mbuf *mbuf_split(struct mhdr *h, uint8_t *pos, func_mbuf_copy_t cb, + void *cbarg); void mbuf_write_char(struct mbuf *mbuf, char ch); void mbuf_write_string(struct mbuf *mbuf, const struct string *s); diff --git a/src/dyn_message.c b/src/dyn_message.c index 1c247f7f4..044cdfc87 100644 --- a/src/dyn_message.c +++ b/src/dyn_message.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -26,10 +26,10 @@ #include #include "dyn_core.h" -#include "dyn_server.h" #include "dyn_dnode_peer.h" -#include "proto/dyn_proto.h" +#include "dyn_server.h" #include "hashkit/dyn_hashkit.h" +#include "proto/dyn_proto.h" #if (IOV_MAX > 128) #define DN_IOV_MAX 128 @@ -44,7 +44,8 @@ * | | . * / \ . * Request Response .../ dyn_mbuf.[ch] (message buffers) - * dyn_request.c dyn_response.c .../ dyn_memcache.c; dyn_redis.c (message parser) + * dyn_request.c dyn_response.c .../ dyn_memcache.c; dyn_redis.c (message + * parser) * * Messages in dynomite are manipulated by a chain of processing handlers, * where each handler is responsible for taking the input and producing an @@ -94,7 +95,7 @@ * Req \ . / * ===> req_filter* . *rsp_filter * + . + - * | . | + * | . | * \ . / * req_forward-// (a) . (c) \\-rsp_forward * . 
@@ -117,27 +118,27 @@ */ /* Changes to message for consistency: - * In order to implement consistency, following changes have been made to message - * peer: Previously there was a one to one relation between request and a response - * both of which is struct message unfortunately. And due to the fact that - * some requests are forwarded as is to the underlying server while some - * are copied, the notion of 'peer' gets complicated. hence I changed its - * meaning somewhat. response->peer points to request that this response belongs - * to. Right now request->peer does not have any meaning other than some - * code in redis which does coalescing etc, and some other code just for - * the sake of it. - * awaiting_rsps: This is a counter of the number of responses that a request is - * still expecting. For DC_ONE consistency this is immaterial. For DC_QUORUM, - * this is the total number of responses expected. We wait for them to arrive - * before we free the request. A client connection in turn waits for all the - * requests to finish before freeing itself. (Look for waiting_to_unref). - * selected_rsp : A request->selected_rsp is the response selected for a given - * request. All code related to sending response should look at selected_rsp. - * rsp_sent : Due to consistency DC_QUORUM, we would have sent the response for - * a request even before all the responses arrive. The responses coming after - * rsp_sent are extra and can be swallowed. Also at this time we know that - * the response is sent and the request can be deleted from the client hash - * table outstanding_msgs_dict. + * In order to implement consistency, following changes have been made to + * message peer: Previously there was a one to one relation between request and + * a response both of which is struct message unfortunately. And due to the fact + * that some requests are forwarded as is to the underlying server while + * some are copied, the notion of 'peer' gets complicated. 
hence I changed + * its meaning somewhat. response->peer points to request that this + * response belongs to. Right now request->peer does not have any meaning + * other than some code in redis which does coalescing etc, and some other + * code just for the sake of it. awaiting_rsps: This is a counter of the + * number of responses that a request is still expecting. For DC_ONE consistency + * this is immaterial. For DC_QUORUM, this is the total number of responses + * expected. We wait for them to arrive before we free the request. A + * client connection in turn waits for all the requests to finish before + * freeing itself. (Look for waiting_to_unref). selected_rsp : A + * request->selected_rsp is the response selected for a given request. All code + * related to sending response should look at selected_rsp. rsp_sent : Due + * to consistency DC_QUORUM, we would have sent the response for a request even + * before all the responses arrive. The responses coming after rsp_sent are + * extra and can be swallowed. Also at this time we know that the response + * is sent and the request can be deleted from the client hash table + * outstanding_msgs_dict. * * So generally request->selected_rsp & response->peer is valid. Eventually it * will be good to have different structures for request and response. 
@@ -147,613 +148,565 @@ static uint64_t frag_id; /* fragment id counter */ static struct msg_tqh free_msgq; /* free msg q */ static struct rbtree tmo_rbt; /* timeout rbtree */ static struct rbnode tmo_rbs; /* timeout rbtree sentinel */ -static size_t alloc_msgs_max; /* maximum number of allowed allocated messages */ +static size_t alloc_msgs_max; /* maximum number of allowed allocated messages */ uint8_t g_timeout_factor = 1; -func_msg_coalesce_t g_pre_coalesce; /* message pre-coalesce */ -func_msg_coalesce_t g_post_coalesce; /* message post-coalesce */ -func_msg_fragment_t g_fragment; /* message post-coalesce */ -func_msg_verify_t g_verify_request; /* message post-coalesce */ +func_msg_coalesce_t g_pre_coalesce; /* message pre-coalesce */ +func_msg_coalesce_t g_post_coalesce; /* message post-coalesce */ +func_msg_fragment_t g_fragment; /* message post-coalesce */ +func_msg_verify_t g_verify_request; /* message post-coalesce */ func_is_multikey_request g_is_multikey_request; func_reconcile_responses g_reconcile_responses; -func_msg_rewrite_t g_rewrite_query; /* rewrite query in a msg if necessary */ +func_msg_rewrite_t g_rewrite_query; /* rewrite query in a msg if necessary */ #define DEFINE_ACTION(_name) string(#_name), -static struct string msg_type_strings[] = { - MSG_TYPE_CODEC( DEFINE_ACTION ) - null_string -}; +static struct string msg_type_strings[] = {MSG_TYPE_CODEC(DEFINE_ACTION) + null_string}; #undef DEFINE_ACTION -static char* -print_req(const struct object *obj) -{ - ASSERT(obj->type == OBJ_REQ); - struct msg *req = (struct msg *)obj; - struct string *req_type = msg_type_string(req->type); - snprintf(obj->print_buff, PRINT_BUF_SIZE, "", req, req->id, req->parent_id, - req->frag_id, req_type->len, req_type->data, req->mlen); - return obj->print_buff; +static char *print_req(const struct object *obj) { + ASSERT(obj->type == OBJ_REQ); + struct msg *req = (struct msg *)obj; + struct string *req_type = msg_type_string(req->type); + snprintf(obj->print_buff, 
PRINT_BUF_SIZE, + "", req, req->id, req->parent_id, + req->frag_id, req_type->len, req_type->data, req->mlen); + return obj->print_buff; } -static char* -print_rsp(const struct object *obj) -{ - ASSERT(obj->type == OBJ_RSP); - struct msg *rsp = (struct msg *)obj; - struct string *rsp_type = msg_type_string(rsp->type); - snprintf(obj->print_buff, PRINT_BUF_SIZE, "", rsp, rsp->id, - rsp->parent_id, rsp_type->len, rsp_type->data, rsp->mlen); - return obj->print_buff; +static char *print_rsp(const struct object *obj) { + ASSERT(obj->type == OBJ_RSP); + struct msg *rsp = (struct msg *)obj; + struct string *rsp_type = msg_type_string(rsp->type); + snprintf(obj->print_buff, PRINT_BUF_SIZE, "", rsp, + rsp->id, rsp->parent_id, rsp_type->len, rsp_type->data, rsp->mlen); + return obj->print_buff; } -void -set_datastore_ops(void) -{ - switch(g_data_store) { - case DATA_REDIS: - g_pre_coalesce = redis_pre_coalesce; - g_post_coalesce = redis_post_coalesce; - g_fragment = redis_fragment; - g_verify_request = redis_verify_request; - g_is_multikey_request = redis_is_multikey_request; - g_reconcile_responses = redis_reconcile_responses; - g_rewrite_query = redis_rewrite_query; - break; - case DATA_MEMCACHE: - g_pre_coalesce = memcache_pre_coalesce; - g_post_coalesce = memcache_post_coalesce; - g_fragment = memcache_fragment; - g_verify_request = memcache_verify_request; - g_is_multikey_request = memcache_is_multikey_request; - g_reconcile_responses = memcache_reconcile_responses; - g_rewrite_query = memcache_rewrite_query; - break; - default: - return; - } +void set_datastore_ops(void) { + switch (g_data_store) { + case DATA_REDIS: + g_pre_coalesce = redis_pre_coalesce; + g_post_coalesce = redis_post_coalesce; + g_fragment = redis_fragment; + g_verify_request = redis_verify_request; + g_is_multikey_request = redis_is_multikey_request; + g_reconcile_responses = redis_reconcile_responses; + g_rewrite_query = redis_rewrite_query; + break; + case DATA_MEMCACHE: + g_pre_coalesce = 
memcache_pre_coalesce; + g_post_coalesce = memcache_post_coalesce; + g_fragment = memcache_fragment; + g_verify_request = memcache_verify_request; + g_is_multikey_request = memcache_is_multikey_request; + g_reconcile_responses = memcache_reconcile_responses; + g_rewrite_query = memcache_rewrite_query; + break; + default: + return; + } } -static inline rstatus_t -msg_cant_handle_response(struct msg *req, struct msg *rsp) -{ - return DN_ENO_IMPL; +static inline rstatus_t msg_cant_handle_response(struct msg *req, + struct msg *rsp) { + return DN_ENO_IMPL; } -static struct msg * -msg_from_rbe(struct rbnode *node) -{ - struct msg *req; - int offset; +static struct msg *msg_from_rbe(struct rbnode *node) { + struct msg *req; + int offset; - offset = offsetof(struct msg, tmo_rbe); - req = (struct msg *)((char *)node - offset); + offset = offsetof(struct msg, tmo_rbe); + req = (struct msg *)((char *)node - offset); - return req; + return req; } -struct msg * -msg_tmo_min(void) -{ - struct rbnode *node; +struct msg *msg_tmo_min(void) { + struct rbnode *node; - node = rbtree_min(&tmo_rbt); - if (node == NULL) { - return NULL; - } + node = rbtree_min(&tmo_rbt); + if (node == NULL) { + return NULL; + } - return msg_from_rbe(node); + return msg_from_rbe(node); } -void -msg_tmo_insert(struct msg *req, struct conn *conn) -{ - struct rbnode *node; - msec_t timeout; - - //ASSERT(req->is_request); - ASSERT(!req->quit && req->expect_datastore_reply); - - timeout = conn->dyn_mode? 
dnode_peer_timeout(req, conn) : server_timeout(conn); - if (timeout <= 0) { - return; - } - timeout = timeout * g_timeout_factor; - - node = &req->tmo_rbe; - node->timeout = timeout; - node->key = dn_msec_now() + timeout; - node->data = conn; - - rbtree_insert(&tmo_rbt, node); - - if (log_loggable(LOG_VERB)) { - log_debug(LOG_VERB, "insert req %"PRIu64" into tmo rbt with expiry of " - "%d msec", req->id, timeout); - } +void msg_tmo_insert(struct msg *req, struct conn *conn) { + struct rbnode *node; + msec_t timeout; + + // ASSERT(req->is_request); + ASSERT(!req->quit && req->expect_datastore_reply); + + timeout = + conn->dyn_mode ? dnode_peer_timeout(req, conn) : server_timeout(conn); + if (timeout <= 0) { + return; + } + timeout = timeout * g_timeout_factor; + + node = &req->tmo_rbe; + node->timeout = timeout; + node->key = dn_msec_now() + timeout; + node->data = conn; + + rbtree_insert(&tmo_rbt, node); + + if (log_loggable(LOG_VERB)) { + log_debug(LOG_VERB, + "insert req %" PRIu64 + " into tmo rbt with expiry of " + "%d msec", + req->id, timeout); + } } -void -msg_tmo_delete(struct msg *req) -{ - struct rbnode *node; +void msg_tmo_delete(struct msg *req) { + struct rbnode *node; - node = &req->tmo_rbe; + node = &req->tmo_rbe; - /* already deleted */ + /* already deleted */ - if (node->data == NULL) { - return; - } + if (node->data == NULL) { + return; + } - rbtree_delete(&tmo_rbt, node); + rbtree_delete(&tmo_rbt, node); - if (log_loggable(LOG_VERB)) { - log_debug(LOG_VERB, "delete req %"PRIu64" from tmo rbt", req->id); - } + if (log_loggable(LOG_VERB)) { + log_debug(LOG_VERB, "delete req %" PRIu64 " from tmo rbt", req->id); + } } - static size_t alloc_msg_count = 0; -static struct msg * -_msg_get(struct conn *conn, bool request, const char *const caller) -{ - struct msg *msg; +static struct msg *_msg_get(struct conn *conn, bool request, + const char *const caller) { + struct msg *msg; - if (!TAILQ_EMPTY(&free_msgq)) { - ASSERT(TAILQ_COUNT(&free_msgq)); + if 
(!TAILQ_EMPTY(&free_msgq)) { + ASSERT(TAILQ_COUNT(&free_msgq)); - msg = TAILQ_FIRST(&free_msgq); - TAILQ_REMOVE(&free_msgq, msg, m_tqe); - goto done; - } + msg = TAILQ_FIRST(&free_msgq); + TAILQ_REMOVE(&free_msgq, msg, m_tqe); + goto done; + } - //protect our server in the slow network and high traffics. - //we drop client requests but still honor our peer requests - if (alloc_msg_count >= alloc_msgs_max) { - log_debug(LOG_WARN, "allocated #msgs %lu hit max allowable limit", alloc_msg_count); - return NULL; - } + // protect our server in the slow network and high traffics. + // we drop client requests but still honor our peer requests + if (alloc_msg_count >= alloc_msgs_max) { + log_debug(LOG_WARN, "allocated #msgs %lu hit max allowable limit", + alloc_msg_count); + return NULL; + } - alloc_msg_count++; + alloc_msg_count++; + if (alloc_msg_count % 1000 == 0) + log_warn("alloc_msg_count: %lu caller: %s %s", alloc_msg_count, caller, + print_obj(conn)); + else + log_info("alloc_msg_count: %lu caller: %s %s", alloc_msg_count, caller, + print_obj(conn)); - if (alloc_msg_count % 1000 == 0) - log_warn("alloc_msg_count: %lu caller: %s %s", - alloc_msg_count, caller, print_obj(conn)); - else - log_info("alloc_msg_count: %lu caller: %s %s", - alloc_msg_count, caller, print_obj(conn)); - - msg = dn_alloc(sizeof(*msg)); - if (msg == NULL) { - return NULL; - } + msg = dn_alloc(sizeof(*msg)); + if (msg == NULL) { + return NULL; + } done: - /* c_tqe, s_tqe, and m_tqe are left uninitialized */ - if (request) { - init_object(&msg->object, OBJ_REQ, print_req); - } else { - init_object(&msg->object, OBJ_RSP, print_rsp); - } - - msg->id = ++msg_id; - msg->parent_id = 0; - msg->peer = NULL; - msg->owner = NULL; - msg->stime_in_microsec = 0ULL; - msg->request_send_time = 0L; - msg->request_inqueue_enqueue_time_us = 0L; - msg->awaiting_rsps = 0; - msg->selected_rsp = NULL; - - rbtree_node_init(&msg->tmo_rbe); - - STAILQ_INIT(&msg->mhdr); - msg->mlen = 0; - - msg->state = 0; - msg->pos = 
NULL; - msg->token = NULL; - - msg->parser = NULL; - msg->result = MSG_PARSE_OK; - - msg->type = MSG_UNKNOWN; - - msg->keys = array_create(1, sizeof(struct keypos)); - if (msg->keys == NULL) { - dn_free(msg); - return NULL; - } + /* c_tqe, s_tqe, and m_tqe are left uninitialized */ + if (request) { + init_object(&msg->object, OBJ_REQ, print_req); + } else { + init_object(&msg->object, OBJ_RSP, print_rsp); + } + + msg->id = ++msg_id; + msg->parent_id = 0; + msg->peer = NULL; + msg->owner = NULL; + msg->stime_in_microsec = 0ULL; + msg->request_send_time = 0L; + msg->request_inqueue_enqueue_time_us = 0L; + msg->awaiting_rsps = 0; + msg->selected_rsp = NULL; + + rbtree_node_init(&msg->tmo_rbe); + + STAILQ_INIT(&msg->mhdr); + msg->mlen = 0; + + msg->state = 0; + msg->pos = NULL; + msg->token = NULL; + + msg->parser = NULL; + msg->result = MSG_PARSE_OK; + + msg->type = MSG_UNKNOWN; + + msg->keys = array_create(1, sizeof(struct keypos)); + if (msg->keys == NULL) { + dn_free(msg); + return NULL; + } + + msg->vlen = 0; + msg->end = NULL; + + msg->frag_owner = NULL; + msg->frag_seq = NULL; + msg->nfrag = 0; + msg->nfrag_done = 0; + msg->frag_id = 0; + + msg->narg_start = NULL; + msg->narg_end = NULL; + msg->narg = 0; + msg->rnarg = 0; + msg->nkeys = 0; + msg->rlen = 0; + msg->integer = 0; + + msg->error_code = 0; + msg->is_error = 0; + msg->is_ferror = 0; + msg->is_request = 0; + msg->quit = 0; + msg->expect_datastore_reply = 1; + msg->done = 0; + msg->fdone = 0; + msg->swallow = 0; + msg->dnode_header_prepended = 0; + msg->rsp_sent = 0; + + // dynomite + msg->is_read = 1; + msg->dyn_parse_state = 0; + msg->dmsg = NULL; + msg->msg_routing = ROUTING_NORMAL; + msg->dyn_error_code = 0; + msg->rsp_handler = msg_local_one_rsp_handler; + msg->consistency = DC_ONE; + return msg; +} - msg->vlen = 0; - msg->end = NULL; +size_t msg_alloc_msgs() { return alloc_msg_count; } - msg->frag_owner = NULL; - msg->frag_seq = NULL; - msg->nfrag = 0; - msg->nfrag_done = 0; - msg->frag_id = 0; - - 
msg->narg_start = NULL; - msg->narg_end = NULL; - msg->narg = 0; - msg->rnarg = 0; - msg->nkeys = 0; - msg->rlen = 0; - msg->integer = 0; - - msg->error_code = 0; - msg->is_error = 0; - msg->is_ferror = 0; - msg->is_request = 0; - msg->quit = 0; - msg->expect_datastore_reply = 1; - msg->done = 0; - msg->fdone = 0; - msg->swallow = 0; - msg->dnode_header_prepended = 0; - msg->rsp_sent = 0; - - //dynomite - msg->is_read = 1; - msg->dyn_parse_state = 0; - msg->dmsg = NULL; - msg->msg_routing = ROUTING_NORMAL; - msg->dyn_error_code = 0; - msg->rsp_handler = msg_local_one_rsp_handler; - msg->consistency = DC_ONE; - return msg; -} +size_t msg_free_queue_size(void) { return TAILQ_COUNT(&free_msgq); } -size_t msg_alloc_msgs() -{ - return alloc_msg_count; -} +struct msg *msg_get(struct conn *conn, bool request, const char *const caller) { + struct msg *msg; -size_t msg_free_queue_size(void) -{ - return TAILQ_COUNT(&free_msgq); -} + msg = _msg_get(conn, request, caller); + if (msg == NULL) { + return NULL; + } -struct msg * -msg_get(struct conn *conn, bool request, const char * const caller) -{ - struct msg *msg; + msg->owner = conn; + msg->is_request = request ? 1 : 0; - msg = _msg_get(conn, request, caller); - if (msg == NULL) { - return NULL; + if (g_data_store == DATA_REDIS) { + if (request) { + if (conn->dyn_mode) { + msg->parser = dyn_parse_req; + } else { + msg->parser = redis_parse_req; + } + } else { + if (conn->dyn_mode) { + msg->parser = dyn_parse_rsp; + } else { + msg->parser = redis_parse_rsp; + } } - - msg->owner = conn; - msg->is_request = request ? 
1 : 0; - - if (g_data_store == DATA_REDIS) { - if (request) { - if (conn->dyn_mode) { - msg->parser = dyn_parse_req; - } else { - msg->parser = redis_parse_req; - } - } else { - if (conn->dyn_mode) { - msg->parser = dyn_parse_rsp; - } else { - msg->parser = redis_parse_rsp; - } - } - } else if (g_data_store == DATA_MEMCACHE) { - if (request) { - if (conn->dyn_mode) { - msg->parser = dyn_parse_req; - } else { - msg->parser = memcache_parse_req; - } - } else { - if (conn->dyn_mode) { - msg->parser = dyn_parse_rsp; - } else { - msg->parser = memcache_parse_rsp; - } - } - } else{ - log_debug(LOG_VVERB,"incorrect selection of data store %d", g_data_store); - exit(0); + } else if (g_data_store == DATA_MEMCACHE) { + if (request) { + if (conn->dyn_mode) { + msg->parser = dyn_parse_req; + } else { + msg->parser = memcache_parse_req; + } + } else { + if (conn->dyn_mode) { + msg->parser = dyn_parse_rsp; + } else { + msg->parser = memcache_parse_rsp; + } } + } else { + log_debug(LOG_VVERB, "incorrect selection of data store %d", g_data_store); + exit(0); + } - if (log_loggable(LOG_VVERB)) { - log_debug(LOG_VVERB, "get msg %p id %"PRIu64" request %d owner sd %d", + if (log_loggable(LOG_VVERB)) { + log_debug(LOG_VVERB, "get msg %p id %" PRIu64 " request %d owner sd %d", msg, msg->id, msg->is_request, conn->sd); - } - - return msg; -} - -rstatus_t -msg_clone(struct msg *src, struct mbuf *mbuf_start, struct msg *target) -{ - target->parent_id = src->id; - target->owner = src->owner; - target->is_request = src->is_request; - - target->parser = src->parser; - target->expect_datastore_reply = src->expect_datastore_reply; - target->swallow = src->swallow; - target->type = src->type; - target->mlen = src->mlen; - target->pos = src->pos; - target->vlen = src->vlen; - target->is_read = src->is_read; - target->consistency = src->consistency; - - struct mbuf *mbuf, *nbuf; - bool started = false; - STAILQ_FOREACH(mbuf, &src->mhdr, next) { - if (!started && mbuf != mbuf_start) { - continue; 
- } else { - started = true; - } - nbuf = mbuf_get(); - if (nbuf == NULL) { - return DN_ENOMEM; - } + } - uint32_t len = mbuf_length(mbuf); - mbuf_copy(nbuf, mbuf->pos, len); - mbuf_insert(&target->mhdr, nbuf); - } - - return DN_OK; + return msg; } - -struct msg * -msg_get_error(struct conn *conn, dyn_error_t dyn_error_code, err_t error_code) -{ - struct msg *rsp; - struct mbuf *mbuf; - int n; - char *errstr = dyn_error_code ? dn_strerror(dyn_error_code) : "unknown"; - char *protstr = g_data_store == DATA_REDIS ? "-ERR" : "SERVER_ERROR"; - char *source = dyn_error_source(dyn_error_code); - - rsp = _msg_get(conn, false, __FUNCTION__); - if (rsp == NULL) { - return NULL; +rstatus_t msg_clone(struct msg *src, struct mbuf *mbuf_start, + struct msg *target) { + target->parent_id = src->id; + target->owner = src->owner; + target->is_request = src->is_request; + + target->parser = src->parser; + target->expect_datastore_reply = src->expect_datastore_reply; + target->swallow = src->swallow; + target->type = src->type; + target->mlen = src->mlen; + target->pos = src->pos; + target->vlen = src->vlen; + target->is_read = src->is_read; + target->consistency = src->consistency; + + struct mbuf *mbuf, *nbuf; + bool started = false; + STAILQ_FOREACH(mbuf, &src->mhdr, next) { + if (!started && mbuf != mbuf_start) { + continue; + } else { + started = true; } - - rsp->state = 0; - rsp->is_error = true; - rsp->error_code = error_code; - rsp->dyn_error_code = dyn_error_code; - rsp->type = g_data_store == DATA_REDIS ? 
MSG_RSP_REDIS_ERROR : MSG_RSP_MC_SERVER_ERROR; - - mbuf = mbuf_get(); - if (mbuf == NULL) { - msg_put(rsp); - return NULL; + nbuf = mbuf_get(); + if (nbuf == NULL) { + return DN_ENOMEM; } - mbuf_insert(&rsp->mhdr, mbuf); - n = dn_scnprintf(mbuf->last, mbuf_size(mbuf), "%s %s %s"CRLF, protstr, source, errstr); - mbuf->last += n; - rsp->mlen = (uint32_t)n; + uint32_t len = mbuf_length(mbuf); + mbuf_copy(nbuf, mbuf->pos, len); + mbuf_insert(&target->mhdr, nbuf); + } - if (log_loggable(LOG_VVERB)) { - log_debug(LOG_VVERB, "get rsp %p id %"PRIu64" len %"PRIu32" err %d error '%s'", - rsp, rsp->id, rsp->mlen, error_code, errstr); - } + return DN_OK; +} - return rsp; +struct msg *msg_get_error(struct conn *conn, dyn_error_t dyn_error_code, + err_t error_code) { + struct msg *rsp; + struct mbuf *mbuf; + int n; + char *errstr = dyn_error_code ? dn_strerror(dyn_error_code) : "unknown"; + char *protstr = g_data_store == DATA_REDIS ? "-ERR" : "SERVER_ERROR"; + char *source = dyn_error_source(dyn_error_code); + + rsp = _msg_get(conn, false, __FUNCTION__); + if (rsp == NULL) { + return NULL; + } + + rsp->state = 0; + rsp->is_error = true; + rsp->error_code = error_code; + rsp->dyn_error_code = dyn_error_code; + rsp->type = g_data_store == DATA_REDIS ? 
MSG_RSP_REDIS_ERROR + : MSG_RSP_MC_SERVER_ERROR; + + mbuf = mbuf_get(); + if (mbuf == NULL) { + msg_put(rsp); + return NULL; + } + mbuf_insert(&rsp->mhdr, mbuf); + + n = dn_scnprintf(mbuf->last, mbuf_size(mbuf), "%s %s %s" CRLF, protstr, + source, errstr); + mbuf->last += n; + rsp->mlen = (uint32_t)n; + + if (log_loggable(LOG_VVERB)) { + log_debug(LOG_VVERB, + "get rsp %p id %" PRIu64 " len %" PRIu32 " err %d error '%s'", + rsp, rsp->id, rsp->mlen, error_code, errstr); + } + + return rsp; } +struct msg *msg_get_rsp_integer(struct conn *conn) { + struct msg *rsp; + struct mbuf *mbuf; + int n; -struct msg * -msg_get_rsp_integer(struct conn *conn) -{ - struct msg *rsp; - struct mbuf *mbuf; - int n; + rsp = _msg_get(conn, false, __FUNCTION__); + if (rsp == NULL) { + return NULL; + } - rsp = _msg_get(conn, false, __FUNCTION__); - if (rsp == NULL) { - return NULL; - } + rsp->state = 0; + rsp->type = MSG_RSP_REDIS_INTEGER; - rsp->state = 0; - rsp->type = MSG_RSP_REDIS_INTEGER; + mbuf = mbuf_get(); + if (mbuf == NULL) { + msg_put(rsp); + return NULL; + } + mbuf_insert(&rsp->mhdr, mbuf); - mbuf = mbuf_get(); - if (mbuf == NULL) { - msg_put(rsp); - return NULL; - } - mbuf_insert(&rsp->mhdr, mbuf); + n = dn_scnprintf(mbuf->last, mbuf_size(mbuf), ":0\r\n"); + mbuf->last += n; + rsp->mlen = (uint32_t)n; - n = dn_scnprintf(mbuf->last, mbuf_size(mbuf), ":0\r\n"); - mbuf->last += n; - rsp->mlen = (uint32_t)n; + if (log_loggable(LOG_VVERB)) { + log_debug(LOG_VVERB, "get rsp %p id %" PRIu64 " len %" PRIu32 " ", rsp, + rsp->id, rsp->mlen); + } - if (log_loggable(LOG_VVERB)) { - log_debug(LOG_VVERB, "get rsp %p id %"PRIu64" len %"PRIu32" ", - rsp, rsp->id, rsp->mlen); - } - - return rsp; + return rsp; } -static void -msg_free(struct msg *msg) -{ - ASSERT(STAILQ_EMPTY(&msg->mhdr)); +static void msg_free(struct msg *msg) { + ASSERT(STAILQ_EMPTY(&msg->mhdr)); - if (log_loggable(LOG_VVERB)) { - log_debug(LOG_VVERB, "free msg %p id %"PRIu64"", msg, msg->id); - } - dn_free(msg); + if 
(log_loggable(LOG_VVERB)) { + log_debug(LOG_VVERB, "free msg %p id %" PRIu64 "", msg, msg->id); + } + dn_free(msg); } -void -msg_put(struct msg *msg) -{ - if (msg == NULL) { - log_debug(LOG_ERR, "Unable to put a null msg - probably due to memory hard-set limit"); - return; - } - - if (msg->is_request && msg->awaiting_rsps != 0) { - log_error("Not freeing req %d, awaiting_rsps = %u", - msg->id, msg->awaiting_rsps); - return; - } - - - struct dmsg *dmsg = msg->dmsg; - if (dmsg != NULL) { - dmsg_put(dmsg); - msg->dmsg = NULL; - } +void msg_put(struct msg *msg) { + if (msg == NULL) { + log_debug( + LOG_ERR, + "Unable to put a null msg - probably due to memory hard-set limit"); + return; + } + + if (msg->is_request && msg->awaiting_rsps != 0) { + log_error("Not freeing req %d, awaiting_rsps = %u", msg->id, + msg->awaiting_rsps); + return; + } + + struct dmsg *dmsg = msg->dmsg; + if (dmsg != NULL) { + dmsg_put(dmsg); + msg->dmsg = NULL; + } - while (!STAILQ_EMPTY(&msg->mhdr)) { - struct mbuf *mbuf = STAILQ_FIRST(&msg->mhdr); - mbuf_remove(&msg->mhdr, mbuf); - mbuf_put(mbuf); - } + while (!STAILQ_EMPTY(&msg->mhdr)) { + struct mbuf *mbuf = STAILQ_FIRST(&msg->mhdr); + mbuf_remove(&msg->mhdr, mbuf); + mbuf_put(mbuf); + } - if (msg->frag_seq) { - dn_free(msg->frag_seq); - msg->frag_seq = NULL; - } + if (msg->frag_seq) { + dn_free(msg->frag_seq); + msg->frag_seq = NULL; + } - if (msg->keys) { - array_destroy(msg->keys); - msg->keys = NULL; - } + if (msg->keys) { + array_destroy(msg->keys); + msg->keys = NULL; + } - TAILQ_INSERT_HEAD(&free_msgq, msg, m_tqe); + TAILQ_INSERT_HEAD(&free_msgq, msg, m_tqe); } +uint32_t msg_mbuf_size(struct msg *msg) { + uint32_t count = 0; + struct mbuf *mbuf; -uint32_t msg_mbuf_size(struct msg *msg) -{ - uint32_t count = 0; - struct mbuf *mbuf; - - STAILQ_FOREACH(mbuf, &msg->mhdr, next) { - count++; - } + STAILQ_FOREACH(mbuf, &msg->mhdr, next) { count++; } - return count; + return count; } -uint32_t msg_length(struct msg *msg) -{ - uint32_t count = 
0; - struct mbuf *mbuf; +uint32_t msg_length(struct msg *msg) { + uint32_t count = 0; + struct mbuf *mbuf; - STAILQ_FOREACH(mbuf, &msg->mhdr, next) { - ASSERT(mbuf->last >= mbuf->start); - count += (uint32_t)(mbuf->last - mbuf->start); - } + STAILQ_FOREACH(mbuf, &msg->mhdr, next) { + ASSERT(mbuf->last >= mbuf->start); + count += (uint32_t)(mbuf->last - mbuf->start); + } - return count; + return count; } -void -msg_dump(int level, struct msg *msg) -{ - - if (!log_loggable(level)) { - return; - } - struct mbuf *mbuf; - - if (msg == NULL) { - loga("msg is NULL - cannot display its info"); - return; - } - - loga("msg dump id %"PRIu64" request %d len %"PRIu32" type %d done %d " - "error %d (err %d)", msg->id, msg->is_request, msg->mlen, msg->type, - msg->done, msg->is_error, msg->error_code); - - STAILQ_FOREACH(mbuf, &msg->mhdr, next) { - mbuf_dump(mbuf); - } - loga("================================================="); - +void msg_dump(int level, struct msg *msg) { + if (!log_loggable(level)) { + return; + } + struct mbuf *mbuf; + + if (msg == NULL) { + loga("msg is NULL - cannot display its info"); + return; + } + + loga("msg dump id %" PRIu64 " request %d len %" PRIu32 + " type %d done %d " + "error %d (err %d)", + msg->id, msg->is_request, msg->mlen, msg->type, msg->done, msg->is_error, + msg->error_code); + + STAILQ_FOREACH(mbuf, &msg->mhdr, next) { mbuf_dump(mbuf); } + loga("================================================="); } /** * Initialize the message queue. * @param[in] alloc_msgs_max Dynomite instance. 
*/ -void -msg_init(size_t msgs_max) -{ - log_debug(LOG_DEBUG, "msg size %d", sizeof(struct msg)); - msg_id = 0; - frag_id = 0; - alloc_msgs_max = msgs_max; - TAILQ_INIT(&free_msgq); - rbtree_init(&tmo_rbt, &tmo_rbs); +void msg_init(size_t msgs_max) { + log_debug(LOG_DEBUG, "msg size %d", sizeof(struct msg)); + msg_id = 0; + frag_id = 0; + alloc_msgs_max = msgs_max; + TAILQ_INIT(&free_msgq); + rbtree_init(&tmo_rbt, &tmo_rbs); } -void -msg_deinit(void) -{ - struct msg *msg, *nmsg; +void msg_deinit(void) { + struct msg *msg, *nmsg; - for (msg = TAILQ_FIRST(&free_msgq); msg != NULL; - msg = nmsg) { - ASSERT(TAILQ_COUNT(&free_msgq)); - nmsg = TAILQ_NEXT(msg, m_tqe); - msg_free(msg); - } - ASSERT(TAILQ_COUNT(&free_msgq) == 0); + for (msg = TAILQ_FIRST(&free_msgq); msg != NULL; msg = nmsg) { + ASSERT(TAILQ_COUNT(&free_msgq)); + nmsg = TAILQ_NEXT(msg, m_tqe); + msg_free(msg); + } + ASSERT(TAILQ_COUNT(&free_msgq) == 0); } -struct string * -msg_type_string(msg_type_t type) -{ - return &msg_type_strings[type]; +struct string *msg_type_string(msg_type_t type) { + return &msg_type_strings[type]; } -bool -msg_empty(struct msg *msg) -{ - return msg->mlen == 0 ? true : (msg->dyn_error_code == BAD_FORMAT? true : false); +bool msg_empty(struct msg *msg) { + return msg->mlen == 0 ? true + : (msg->dyn_error_code == BAD_FORMAT ? true : false); } -static uint8_t * -msg_get_key(struct msg *req, uint32_t key_index, uint32_t *keylen, bool tagged_only) -{ - *keylen = 0; - if (array_n(req->keys) == 0) - return NULL; - ASSERT_LOG(key_index < array_n(req->keys), "%s has %u keys", print_obj(req), array_n(req->keys)); - - struct keypos *kpos = array_get(req->keys, key_index); - uint8_t *key_start = tagged_only ? kpos->tag_start : kpos->start; - uint8_t *key_end = tagged_only ? 
kpos->tag_end : kpos->end; - *keylen = (uint32_t)(key_end - key_start); - return key_start; +static uint8_t *msg_get_key(struct msg *req, uint32_t key_index, + uint32_t *keylen, bool tagged_only) { + *keylen = 0; + if (array_n(req->keys) == 0) return NULL; + ASSERT_LOG(key_index < array_n(req->keys), "%s has %u keys", print_obj(req), + array_n(req->keys)); + + struct keypos *kpos = array_get(req->keys, key_index); + uint8_t *key_start = tagged_only ? kpos->tag_start : kpos->start; + uint8_t *key_end = tagged_only ? kpos->tag_end : kpos->end; + *keylen = (uint32_t)(key_end - key_start); + return key_start; } -uint8_t * -msg_get_full_key(struct msg *req, uint32_t key_index, uint32_t *keylen) -{ - return msg_get_key(req, key_index, keylen, false); +uint8_t *msg_get_full_key(struct msg *req, uint32_t key_index, + uint32_t *keylen) { + return msg_get_key(req, key_index, keylen, false); } -uint8_t * -msg_get_tagged_key(struct msg *req, uint32_t key_index, uint32_t *keylen) -{ - return msg_get_key(req, key_index, keylen, true); +uint8_t *msg_get_tagged_key(struct msg *req, uint32_t key_index, + uint32_t *keylen) { + return msg_get_key(req, key_index, keylen, true); } /* @@ -764,592 +717,566 @@ msg_get_tagged_key(struct msg *req, uint32_t key_index, uint32_t *keylen) * * Returns NULL if key does not exist or if we're unable to allocate memory. */ -uint8_t* msg_get_full_key_copy(struct msg* msg, int idx, uint32_t *keylen) { - // Get a pointer to the required key in 'msg'. - uint8_t* key_ptr = msg_get_full_key(msg, idx, keylen); +uint8_t *msg_get_full_key_copy(struct msg *msg, int idx, uint32_t *keylen) { + // Get a pointer to the required key in 'msg'. + uint8_t *key_ptr = msg_get_full_key(msg, idx, keylen); - // Allocate a new buffer for the key. - uint8_t* copied_key = dn_alloc((size_t) (*keylen + 1)); - if (copied_key == NULL) return NULL; + // Allocate a new buffer for the key. 
+ uint8_t *copied_key = dn_alloc((size_t)(*keylen + 1)); + if (copied_key == NULL) return NULL; - // Copy contents of the key from 'msg' to our new buffer. - dn_memcpy(copied_key, key_ptr, *keylen); - copied_key[*keylen] = '\0'; + // Copy contents of the key from 'msg' to our new buffer. + dn_memcpy(copied_key, key_ptr, *keylen); + copied_key[*keylen] = '\0'; - return copied_key; + return copied_key; } -uint32_t -msg_payload_crc32(struct msg *rsp) -{ - ASSERT(rsp != NULL); - // take a continuous buffer crc - uint32_t crc = 0; - struct mbuf *mbuf; - /* Since we want to checksum only the payload, we have to start from the - payload offset. which is somewhere in the mbufs. Skip the mbufs till we - find the start of the payload. If there is no dyno header, we start from - the beginning of the first mbuf */ - bool start_found = rsp->dmsg ? false : true; - - STAILQ_FOREACH(mbuf, &rsp->mhdr, next) { - uint8_t *start = mbuf->start; - uint8_t *end = mbuf->last; - if (!start_found) { - // if payload start is within this mbuf - if ((mbuf->start <= rsp->dmsg->payload) && - (rsp->dmsg->payload < mbuf->last)) { - start = rsp->dmsg->payload; - start_found = true; - } else { - // else skip this mbuf - continue; - } - } - - crc = crc32_sz((char *)start, (size_t)(end - start), crc); +uint32_t msg_payload_crc32(struct msg *rsp) { + ASSERT(rsp != NULL); + // take a continuous buffer crc + uint32_t crc = 0; + struct mbuf *mbuf; + /* Since we want to checksum only the payload, we have to start from the + payload offset. which is somewhere in the mbufs. Skip the mbufs till we + find the start of the payload. If there is no dyno header, we start from + the beginning of the first mbuf */ + bool start_found = rsp->dmsg ? 
false : true; + + STAILQ_FOREACH(mbuf, &rsp->mhdr, next) { + uint8_t *start = mbuf->start; + uint8_t *end = mbuf->last; + if (!start_found) { + // if payload start is within this mbuf + if ((mbuf->start <= rsp->dmsg->payload) && + (rsp->dmsg->payload < mbuf->last)) { + start = rsp->dmsg->payload; + start_found = true; + } else { + // else skip this mbuf + continue; + } } - return crc; - -} -inline uint64_t -msg_gen_frag_id(void) -{ - return ++frag_id; + crc = crc32_sz((char *)start, (size_t)(end - start), crc); + } + return crc; } -static rstatus_t -msg_parsed(struct context *ctx, struct conn *conn, struct msg *msg) -{ - struct msg *nmsg; - struct mbuf *mbuf, *nbuf; - - mbuf = STAILQ_LAST(&msg->mhdr, mbuf, next); - - if (msg->pos == mbuf->last) { - /* no more data to parse */ - conn_recv_done(ctx, conn, msg, NULL); - return DN_OK; - } +inline uint64_t msg_gen_frag_id(void) { return ++frag_id; } +static rstatus_t msg_parsed(struct context *ctx, struct conn *conn, + struct msg *msg) { + struct msg *nmsg; + struct mbuf *mbuf, *nbuf; - /* - * Input mbuf has un-parsed data. Split mbuf of the current message msg - * into (mbuf, nbuf), where mbuf is the portion of the message that has - * been parsed and nbuf is the portion of the message that is un-parsed. - * Parse nbuf as a new message nmsg in the next iteration. - */ - nbuf = mbuf_split(&msg->mhdr, msg->pos, NULL, NULL); - if (nbuf == NULL) { - return DN_ENOMEM; - } - - nmsg = msg_get(msg->owner, msg->is_request, __FUNCTION__); - if (nmsg == NULL) { - mbuf_put(nbuf); - return DN_ENOMEM; - } - mbuf_insert(&nmsg->mhdr, nbuf); - nmsg->pos = nbuf->pos; - - /* update length of current (msg) and new message (nmsg) */ - nmsg->mlen = mbuf_length(nbuf); - msg->mlen -= nmsg->mlen; - - conn_recv_done(ctx, conn, msg, nmsg); + mbuf = STAILQ_LAST(&msg->mhdr, mbuf, next); + if (msg->pos == mbuf->last) { + /* no more data to parse */ + conn_recv_done(ctx, conn, msg, NULL); return DN_OK; + } + + /* + * Input mbuf has un-parsed data. 
Split mbuf of the current message msg + * into (mbuf, nbuf), where mbuf is the portion of the message that has + * been parsed and nbuf is the portion of the message that is un-parsed. + * Parse nbuf as a new message nmsg in the next iteration. + */ + nbuf = mbuf_split(&msg->mhdr, msg->pos, NULL, NULL); + if (nbuf == NULL) { + return DN_ENOMEM; + } + + nmsg = msg_get(msg->owner, msg->is_request, __FUNCTION__); + if (nmsg == NULL) { + mbuf_put(nbuf); + return DN_ENOMEM; + } + mbuf_insert(&nmsg->mhdr, nbuf); + nmsg->pos = nbuf->pos; + + /* update length of current (msg) and new message (nmsg) */ + nmsg->mlen = mbuf_length(nbuf); + msg->mlen -= nmsg->mlen; + + conn_recv_done(ctx, conn, msg, nmsg); + + return DN_OK; } -static rstatus_t -msg_repair(struct context *ctx, struct conn *conn, struct msg *msg) -{ - struct mbuf *nbuf; +static rstatus_t msg_repair(struct context *ctx, struct conn *conn, + struct msg *msg) { + struct mbuf *nbuf; - nbuf = mbuf_split(&msg->mhdr, msg->pos, NULL, NULL); - if (nbuf == NULL) { - return DN_ENOMEM; - } - mbuf_insert(&msg->mhdr, nbuf); - msg->pos = nbuf->pos; + nbuf = mbuf_split(&msg->mhdr, msg->pos, NULL, NULL); + if (nbuf == NULL) { + return DN_ENOMEM; + } + mbuf_insert(&msg->mhdr, nbuf); + msg->pos = nbuf->pos; - return DN_OK; + return DN_OK; } +static rstatus_t msg_parse(struct context *ctx, struct conn *conn, + struct msg *msg) { + rstatus_t status; -static rstatus_t -msg_parse(struct context *ctx, struct conn *conn, struct msg *msg) -{ - rstatus_t status; - - if (msg_empty(msg)) { - /* no data to parse */ - conn_recv_done(ctx, conn, msg, NULL); - return DN_OK; - } + if (msg_empty(msg)) { + /* no data to parse */ + conn_recv_done(ctx, conn, msg, NULL); + return DN_OK; + } - msg->parser(msg, &ctx->pool.hash_tag); + msg->parser(msg, &ctx->pool.hash_tag); - switch (msg->result) { + switch (msg->result) { case MSG_PARSE_OK: - //log_debug(LOG_VVERB, "MSG_PARSE_OK"); - status = msg_parsed(ctx, conn, msg); - break; + // 
log_debug(LOG_VVERB, "MSG_PARSE_OK"); + status = msg_parsed(ctx, conn, msg); + break; case MSG_PARSE_REPAIR: - //log_debug(LOG_VVERB, "MSG_PARSE_REPAIR"); - status = msg_repair(ctx, conn, msg); - break; + // log_debug(LOG_VVERB, "MSG_PARSE_REPAIR"); + status = msg_repair(ctx, conn, msg); + break; case MSG_PARSE_AGAIN: - //log_debug(LOG_VVERB, "MSG_PARSE_AGAIN"); - status = DN_OK; - break; + // log_debug(LOG_VVERB, "MSG_PARSE_AGAIN"); + status = DN_OK; + break; default: - /* - if (!conn->dyn_mode) { - status = DN_ERROR; - conn->err = errno; - } else { - log_debug(LOG_VVERB, "Parsing error in dyn_mode"); - status = DN_OK; - } - */ - status = DN_ERROR; - conn->err = errno; - break; + /* + if (!conn->dyn_mode) { + status = DN_ERROR; + conn->err = errno; + } else { + log_debug(LOG_VVERB, "Parsing error in dyn_mode"); + status = DN_OK; + } + */ + status = DN_ERROR; + conn->err = errno; + break; + } + + return conn->err != 0 ? DN_ERROR : status; +} + +static rstatus_t msg_recv_chain(struct context *ctx, struct conn *conn, + struct msg *msg) { + rstatus_t status; + struct msg *nmsg; + struct mbuf *mbuf; + size_t msize; + ssize_t n; + bool encryption_detected = (msg->dyn_parse_state == DYN_DONE || + msg->dyn_parse_state == DYN_POST_DONE) && + (msg->dmsg->flags & 0x1); + + mbuf = STAILQ_LAST(&msg->mhdr, mbuf, next); + /* This logic is unncessarily complicated. Ideally a connection should read + * the entire payload of an encrypted message before it starts decrypting. + * However the code tries to check if a buffer is full and decrypts it before + * moving to the next buffer. So at any given point, a message large enough + * can have some buffers decrypted and the last one either decrypted or + * encrypted. We start decrypting a buffer if we finish reading the payload + * (dmsg->plen) or we reach till mbuf->end_extra. If a buffer is encrypted, it + * can span till mbuf->end_extra. If this buffer gets decrypted, it can span + * till mbuf_full() i.e mbuf->end. 
which is 16 bytes (one cypher block) less + * than mbuf->end_extra + * + * However there is no way to tell if a buffer that is filled till mbuf->end + * is encrypted or decrypted so we cannot know if we should continue writing + * to that buffer from mbuf->end till mbuf->end_extra (which is 16 bytes) or + * whether we just decrypted a buffer and now it spans till mbuf->end and we + * should create a new buffer to start receiving new encrypted data. Hence, + * I created a new flag MBUF_FLAG_JUST_DECRYPTED which is solely for this + * purpose. One should not write a code like this. We should receive the + * entire payload first and then decrypt it. Its slightly slow but worth the + * simplicity + * + * Create a new buffer if: + * 1) mbuf is NULL + * 2) unencrypted case and mbuf is full + * 3) encrypted case and + * a) mbuf is full till end_extra + * b) mbuf is full till mbuf->end (mbuf_full) and we just decrypted that + * buffer. + */ + if (mbuf == NULL || ((!encryption_detected) && mbuf_full(mbuf)) || + (!encryption_detected && mbuf->last == mbuf->end_extra) || + (!encryption_detected && mbuf_full(mbuf) && + (mbuf->flags & MBUF_FLAGS_JUST_DECRYPTED))) { + mbuf = mbuf_get(); + if (mbuf == NULL) { + return DN_ENOMEM; } + mbuf_insert(&msg->mhdr, mbuf); - return conn->err != 0 ? DN_ERROR : status; -} + msg->pos = mbuf->pos; + } + ASSERT(mbuf->end_extra - mbuf->last > 0); -static rstatus_t -msg_recv_chain(struct context *ctx, struct conn *conn, struct msg *msg) -{ - rstatus_t status; - struct msg *nmsg; - struct mbuf *mbuf; - size_t msize; - ssize_t n; - bool encryption_detected = (msg->dyn_parse_state == DYN_DONE || - msg->dyn_parse_state == DYN_POST_DONE) && - (msg->dmsg->flags & 0x1); + if (!encryption_detected) { + msize = mbuf_size(mbuf); + } else { + msize = (size_t)MIN(msg->dmsg->plen, mbuf->end_extra - mbuf->last); + } - mbuf = STAILQ_LAST(&msg->mhdr, mbuf, next); - /* This logic is unncessarily complicated. 
Ideally a connection should read - * the entire payload of an encrypted message before it starts decrypting. - * However the code tries to check if a buffer is full and decrypts it before - * moving to the next buffer. So at any given point, a message large enough can - * have some buffers decrypted and the last one either decrypted or encrypted. - * We start decrypting a buffer if we finish reading the payload (dmsg->plen) or - * we reach till mbuf->end_extra. - * If a buffer is encrypted, it can span till mbuf->end_extra. - * If this buffer gets decrypted, it can span till mbuf_full() i.e mbuf->end. - * which is 16 bytes (one cypher block) less than mbuf->end_extra - * - * However there is no way to tell if a buffer that is filled till mbuf->end - * is encrypted or decrypted so we cannot know if we should continue writing - * to that buffer from mbuf->end till mbuf->end_extra (which is 16 bytes) or - * whether we just decrypted a buffer and now it spans till mbuf->end and we - * should create a new buffer to start receiving new encrypted data. Hence, - * I created a new flag MBUF_FLAG_JUST_DECRYPTED which is solely for this - * purpose. One should not write a code like this. We should receive the entire - * payload first and then decrypt it. Its slightly slow but worth the simplicity - * - * Create a new buffer if: - * 1) mbuf is NULL - * 2) unencrypted case and mbuf is full - * 3) encrypted case and - * a) mbuf is full till end_extra - * b) mbuf is full till mbuf->end (mbuf_full) and we just decrypted that buffer. 
- */ - if (mbuf == NULL || - ((!encryption_detected) && mbuf_full(mbuf)) || - (!encryption_detected && mbuf->last == mbuf->end_extra) || - (!encryption_detected && mbuf_full(mbuf) && (mbuf->flags & MBUF_FLAGS_JUST_DECRYPTED))) { - mbuf = mbuf_get(); - if (mbuf == NULL) { - return DN_ENOMEM; - } - mbuf_insert(&msg->mhdr, mbuf); + n = conn_recv_data(conn, mbuf->last, msize); - msg->pos = mbuf->pos; + if (n < 0) { + if (n == DN_EAGAIN) { + return DN_OK; } + return DN_ERROR; + } - ASSERT(mbuf->end_extra - mbuf->last > 0); + ASSERT((mbuf->last + n) <= mbuf->end_extra); + mbuf->last += n; + msg->mlen += (uint32_t)n; - if (!encryption_detected) { - msize = mbuf_size(mbuf); - } else { - msize = (size_t)MIN(msg->dmsg->plen, mbuf->end_extra - mbuf->last); - } + // Only used in encryption case + if (encryption_detected) { + if (n >= msg->dmsg->plen || mbuf->end_extra == mbuf->last) { + // log_debug(LOG_VERB, "About to decrypt this mbuf as it is full or + // eligible!"); + struct mbuf *nbuf = NULL; - n = conn_recv_data(conn, mbuf->last, msize); + if (n >= msg->dmsg->plen) { + nbuf = mbuf_get(); - if (n < 0) { - if (n == DN_EAGAIN) { - return DN_OK; + if (nbuf == NULL) { + loga("Not enough memory error!!!"); + return DN_ENOMEM; } - return DN_ERROR; - } - ASSERT((mbuf->last + n) <= mbuf->end_extra); - mbuf->last += n; - msg->mlen += (uint32_t)n; - - //Only used in encryption case - if (encryption_detected) { - if ( n >= msg->dmsg->plen || mbuf->end_extra == mbuf->last) { - //log_debug(LOG_VERB, "About to decrypt this mbuf as it is full or eligible!"); - struct mbuf *nbuf = NULL; - - if (n >= msg->dmsg->plen) { - nbuf = mbuf_get(); - - if (nbuf == NULL) { - loga("Not enough memory error!!!"); - return DN_ENOMEM; - } - - status = dyn_aes_decrypt(mbuf->start, (size_t)(mbuf->last - mbuf->start), - nbuf, msg->owner->aes_key); - if (status >= DN_OK) { - int remain = n - msg->dmsg->plen; - uint8_t *pos = mbuf->last - remain; - mbuf_copy(nbuf, pos, remain); - } - - } else if 
(mbuf->end_extra == mbuf->last) { - nbuf = mbuf_get(); - - if (nbuf == NULL) { - loga("Not enough memory error!!!"); - return DN_ENOMEM; - } - - status = dyn_aes_decrypt(mbuf->start, mbuf->last - mbuf->start, nbuf, msg->owner->aes_key); - } - - if (status >= 0 && nbuf != NULL) { - nbuf->flags |= MBUF_FLAGS_JUST_DECRYPTED; - nbuf->flags |= MBUF_FLAGS_READ_FLIP; - mbuf_remove(&msg->mhdr, mbuf); - mbuf_insert(&msg->mhdr, nbuf); - msg->pos = nbuf->start; - - msg->mlen -= mbuf->last - mbuf->start; - msg->mlen += nbuf->last - nbuf->start; - - mbuf_put(mbuf); - } else { //clean up the mess and recover it - mbuf_insert(&msg->mhdr, nbuf); - msg->pos = nbuf->last; - msg->dyn_error_code = BAD_FORMAT; - } + status = + dyn_aes_decrypt(mbuf->start, (size_t)(mbuf->last - mbuf->start), + nbuf, msg->owner->aes_key); + if (status >= DN_OK) { + int remain = n - msg->dmsg->plen; + uint8_t *pos = mbuf->last - remain; + mbuf_copy(nbuf, pos, remain); } - msg->dmsg->plen -= n; - } + } else if (mbuf->end_extra == mbuf->last) { + nbuf = mbuf_get(); - for (;;) { - status = msg_parse(ctx, conn, msg); - if (status != DN_OK) { - return status; + if (nbuf == NULL) { + loga("Not enough memory error!!!"); + return DN_ENOMEM; } - /* get next message to parse */ - nmsg = conn_recv_next(ctx, conn, false); - if (nmsg == NULL || nmsg == msg) { - /* no more data to parse */ - break; - } + status = dyn_aes_decrypt(mbuf->start, mbuf->last - mbuf->start, nbuf, + msg->owner->aes_key); + } - msg = nmsg; + if (status >= 0 && nbuf != NULL) { + nbuf->flags |= MBUF_FLAGS_JUST_DECRYPTED; + nbuf->flags |= MBUF_FLAGS_READ_FLIP; + mbuf_remove(&msg->mhdr, mbuf); + mbuf_insert(&msg->mhdr, nbuf); + msg->pos = nbuf->start; + + msg->mlen -= mbuf->last - mbuf->start; + msg->mlen += nbuf->last - nbuf->start; + + mbuf_put(mbuf); + } else { // clean up the mess and recover it + mbuf_insert(&msg->mhdr, nbuf); + msg->pos = nbuf->last; + msg->dyn_error_code = BAD_FORMAT; + } } - return DN_OK; + msg->dmsg->plen -= n; + } + + for 
(;;) { + status = msg_parse(ctx, conn, msg); + if (status != DN_OK) { + return status; + } + + /* get next message to parse */ + nmsg = conn_recv_next(ctx, conn, false); + if (nmsg == NULL || nmsg == msg) { + /* no more data to parse */ + break; + } + + msg = nmsg; + } + + return DN_OK; } -rstatus_t -msg_recv(struct context *ctx, struct conn *conn) -{ - rstatus_t status; - struct msg *msg; +rstatus_t msg_recv(struct context *ctx, struct conn *conn) { + rstatus_t status; + struct msg *msg; - ASSERT(conn->recv_active); - conn->recv_ready = 1; + ASSERT(conn->recv_active); + conn->recv_ready = 1; - do { - msg = conn_recv_next(ctx, conn, true); - if (msg == NULL) { - return DN_OK; - } + do { + msg = conn_recv_next(ctx, conn, true); + if (msg == NULL) { + return DN_OK; + } - status = msg_recv_chain(ctx, conn, msg); - if (status != DN_OK) { - return status; - } + status = msg_recv_chain(ctx, conn, msg); + if (status != DN_OK) { + return status; + } - } while (conn->recv_ready); + } while (conn->recv_ready); - return DN_OK; + return DN_OK; } -static rstatus_t -msg_send_chain(struct context *ctx, struct conn *conn, struct msg *msg) -{ - struct msg_tqh send_msgq; /* send msg q */ - struct msg *nmsg; /* next msg */ - struct mbuf *mbuf, *nbuf; /* current and next mbuf */ - size_t mlen; /* current mbuf data length */ - struct iovec *ciov, iov[DN_IOV_MAX]; /* current iovec */ - struct array sendv; /* send iovec */ - size_t nsend, nsent; /* bytes to send; bytes sent */ - size_t limit; /* bytes to send limit */ - ssize_t n = 0; /* bytes sent by sendv */ - - if (log_loggable(LOG_VVERB)) { - loga("About to dump out the content of msg"); - msg_dump(LOG_VVERB, msg); - } +static rstatus_t msg_send_chain(struct context *ctx, struct conn *conn, + struct msg *msg) { + struct msg_tqh send_msgq; /* send msg q */ + struct msg *nmsg; /* next msg */ + struct mbuf *mbuf, *nbuf; /* current and next mbuf */ + size_t mlen; /* current mbuf data length */ + struct iovec *ciov, iov[DN_IOV_MAX]; /* 
current iovec */ + struct array sendv; /* send iovec */ + size_t nsend, nsent; /* bytes to send; bytes sent */ + size_t limit; /* bytes to send limit */ + ssize_t n = 0; /* bytes sent by sendv */ - TAILQ_INIT(&send_msgq); + if (log_loggable(LOG_VVERB)) { + loga("About to dump out the content of msg"); + msg_dump(LOG_VVERB, msg); + } - array_set(&sendv, iov, sizeof(iov[0]), DN_IOV_MAX); + TAILQ_INIT(&send_msgq); - /* preprocess - build iovec */ + array_set(&sendv, iov, sizeof(iov[0]), DN_IOV_MAX); - nsend = 0; - /* - * readv() and writev() returns EINVAL if the sum of the iov_len values - * overflows an ssize_t value Or, the vector count iovcnt is less than - * zero or greater than the permitted maximum. - */ - limit = SSIZE_MAX; + /* preprocess - build iovec */ - for (;;) { - ASSERT(conn->smsg == msg); + nsend = 0; + /* + * readv() and writev() returns EINVAL if the sum of the iov_len values + * overflows an ssize_t value Or, the vector count iovcnt is less than + * zero or greater than the permitted maximum. 
+ */ + limit = SSIZE_MAX; - TAILQ_INSERT_TAIL(&send_msgq, msg, m_tqe); + for (;;) { + ASSERT(conn->smsg == msg); - STAILQ_FOREACH(mbuf, &msg->mhdr, next) { - if (!(array_n(&sendv) < DN_IOV_MAX) && (nsend < limit)) - break; + TAILQ_INSERT_TAIL(&send_msgq, msg, m_tqe); - if (mbuf_empty(mbuf)) { - continue; - } + STAILQ_FOREACH(mbuf, &msg->mhdr, next) { + if (!(array_n(&sendv) < DN_IOV_MAX) && (nsend < limit)) break; - mlen = mbuf_length(mbuf); - if ((nsend + mlen) > limit) { - mlen = limit - nsend; - } + if (mbuf_empty(mbuf)) { + continue; + } - ciov = array_push(&sendv); - ciov->iov_base = mbuf->pos; - ciov->iov_len = mlen; + mlen = mbuf_length(mbuf); + if ((nsend + mlen) > limit) { + mlen = limit - nsend; + } - nsend += mlen; - } + ciov = array_push(&sendv); + ciov->iov_base = mbuf->pos; + ciov->iov_len = mlen; - if (array_n(&sendv) >= DN_IOV_MAX || nsend >= limit) { - break; - } + nsend += mlen; + } - msg = conn_send_next(ctx, conn); - if (msg == NULL) { - break; - } + if (array_n(&sendv) >= DN_IOV_MAX || nsend >= limit) { + break; } - conn->smsg = NULL; + msg = conn_send_next(ctx, conn); + if (msg == NULL) { + break; + } + } - if (nsend != 0) - n = conn_sendv_data(conn, &sendv, nsend); + conn->smsg = NULL; - nsent = n > 0 ? (size_t)n : 0; + if (nsend != 0) n = conn_sendv_data(conn, &sendv, nsend); - /* postprocess - process sent messages in send_msgq */ - TAILQ_FOREACH_SAFE(msg, &send_msgq, m_tqe, nmsg) { + nsent = n > 0 ? 
(size_t)n : 0; - TAILQ_REMOVE(&send_msgq, msg, m_tqe); + /* postprocess - process sent messages in send_msgq */ + TAILQ_FOREACH_SAFE(msg, &send_msgq, m_tqe, nmsg) { + TAILQ_REMOVE(&send_msgq, msg, m_tqe); - if (nsent == 0) { - if (msg->mlen == 0) { - conn_send_done(ctx, conn, msg); - } - continue; - } + if (nsent == 0) { + if (msg->mlen == 0) { + conn_send_done(ctx, conn, msg); + } + continue; + } - /* adjust mbufs of the sent message */ - for (mbuf = STAILQ_FIRST(&msg->mhdr); mbuf != NULL; mbuf = nbuf) { - nbuf = STAILQ_NEXT(mbuf, next); - - if (mbuf_empty(mbuf)) { - continue; - } - - mlen = mbuf_length(mbuf); - if (nsent < mlen) { - /* mbuf was sent partially; process remaining bytes later */ - mbuf->pos += nsent; - ASSERT(mbuf->pos < mbuf->last); - nsent = 0; - break; - } - - /* mbuf was sent completely; mark it empty */ - mbuf->pos = mbuf->last; - nsent -= mlen; - } + /* adjust mbufs of the sent message */ + for (mbuf = STAILQ_FIRST(&msg->mhdr); mbuf != NULL; mbuf = nbuf) { + nbuf = STAILQ_NEXT(mbuf, next); - /* message has been sent completely, finalize it */ - if (mbuf == NULL) { - conn_send_done(ctx, conn, msg); - } - } + if (mbuf_empty(mbuf)) { + continue; + } - ASSERT(TAILQ_EMPTY(&send_msgq)); + mlen = mbuf_length(mbuf); + if (nsent < mlen) { + /* mbuf was sent partially; process remaining bytes later */ + mbuf->pos += nsent; + ASSERT(mbuf->pos < mbuf->last); + nsent = 0; + break; + } - if (n > 0) { - return DN_OK; + /* mbuf was sent completely; mark it empty */ + mbuf->pos = mbuf->last; + nsent -= mlen; } - return (n == DN_EAGAIN) ? DN_OK : DN_ERROR; + /* message has been sent completely, finalize it */ + if (mbuf == NULL) { + conn_send_done(ctx, conn, msg); + } + } + + ASSERT(TAILQ_EMPTY(&send_msgq)); + + if (n > 0) { + return DN_OK; + } + + return (n == DN_EAGAIN) ? 
DN_OK : DN_ERROR; } -rstatus_t -msg_send(struct context *ctx, struct conn *conn) -{ - rstatus_t status; - struct msg *msg; +rstatus_t msg_send(struct context *ctx, struct conn *conn) { + rstatus_t status; + struct msg *msg; - ASSERT_LOG(conn->send_active, "%s is not active", print_obj(conn)); + ASSERT_LOG(conn->send_active, "%s is not active", print_obj(conn)); - conn->send_ready = 1; - do { - msg = conn_send_next(ctx, conn); - if (msg == NULL) { - /* nothing to send */ - return DN_OK; - } + conn->send_ready = 1; + do { + msg = conn_send_next(ctx, conn); + if (msg == NULL) { + /* nothing to send */ + return DN_OK; + } - status = msg_send_chain(ctx, conn, msg); - if (status != DN_OK) { - return status; - } + status = msg_send_chain(ctx, conn, msg); + if (status != DN_OK) { + return status; + } - if (TAILQ_COUNT(&conn->omsg_q) > MAX_CONN_QUEUE_SIZE) { - conn->send_ready = 0; - conn->err = ENOTRECOVERABLE; - log_error("%s Setting ENOTRECOVERABLE happens here!", print_obj(conn)); - } + if (TAILQ_COUNT(&conn->omsg_q) > MAX_CONN_QUEUE_SIZE) { + conn->send_ready = 0; + conn->err = ENOTRECOVERABLE; + log_error("%s Setting ENOTRECOVERABLE happens here!", print_obj(conn)); + } - } while (conn->send_ready); + } while (conn->send_ready); - return DN_OK; + return DN_OK; } -struct mbuf * -msg_ensure_mbuf(struct msg *msg, size_t len) -{ - struct mbuf *mbuf; +struct mbuf *msg_ensure_mbuf(struct msg *msg, size_t len) { + struct mbuf *mbuf; - if (STAILQ_EMPTY(&msg->mhdr) || - mbuf_size(STAILQ_LAST(&msg->mhdr, mbuf, next)) < len) { - mbuf = mbuf_get(); - if (mbuf == NULL) { - return NULL; - } - mbuf_insert(&msg->mhdr, mbuf); - } else { - mbuf = STAILQ_LAST(&msg->mhdr, mbuf, next); + if (STAILQ_EMPTY(&msg->mhdr) || + mbuf_size(STAILQ_LAST(&msg->mhdr, mbuf, next)) < len) { + mbuf = mbuf_get(); + if (mbuf == NULL) { + return NULL; } + mbuf_insert(&msg->mhdr, mbuf); + } else { + mbuf = STAILQ_LAST(&msg->mhdr, mbuf, next); + } - return mbuf; + return mbuf; } - /* * Append n bytes of data, 
with n <= mbuf_size(mbuf) * into mbuf */ -rstatus_t -msg_append(struct msg *msg, uint8_t *pos, size_t n) -{ - struct mbuf *mbuf; +rstatus_t msg_append(struct msg *msg, uint8_t *pos, size_t n) { + struct mbuf *mbuf; - ASSERT(n <= mbuf_data_size()); + ASSERT(n <= mbuf_data_size()); - mbuf = msg_ensure_mbuf(msg, n); - if (mbuf == NULL) { - return DN_ENOMEM; - } + mbuf = msg_ensure_mbuf(msg, n); + if (mbuf == NULL) { + return DN_ENOMEM; + } - ASSERT(n <= mbuf_size(mbuf)); + ASSERT(n <= mbuf_size(mbuf)); - mbuf_copy(mbuf, pos, n); - msg->mlen += (uint32_t)n; + mbuf_copy(mbuf, pos, n); + msg->mlen += (uint32_t)n; - return DN_OK; + return DN_OK; } /* * Prepend n bytes of data, with n <= mbuf_size(mbuf) * into mbuf */ -rstatus_t -msg_prepend(struct msg *msg, uint8_t *pos, size_t n) -{ - struct mbuf *mbuf; +rstatus_t msg_prepend(struct msg *msg, uint8_t *pos, size_t n) { + struct mbuf *mbuf; - mbuf = mbuf_get(); - if (mbuf == NULL) { - return DN_ENOMEM; - } + mbuf = mbuf_get(); + if (mbuf == NULL) { + return DN_ENOMEM; + } - ASSERT(n <= mbuf_size(mbuf)); + ASSERT(n <= mbuf_size(mbuf)); - mbuf_copy(mbuf, pos, n); - msg->mlen += (uint32_t)n; + mbuf_copy(mbuf, pos, n); + msg->mlen += (uint32_t)n; - STAILQ_INSERT_HEAD(&msg->mhdr, mbuf, next); + STAILQ_INSERT_HEAD(&msg->mhdr, mbuf, next); - return DN_OK; + return DN_OK; } /* * Prepend a formatted string into msg. Returns an error if the formatted * string does not fit in a single mbuf. */ -rstatus_t -msg_prepend_format(struct msg *msg, const char *fmt, ...) 
-{ - struct mbuf *mbuf; - int n; - uint32_t size; - va_list args; - - mbuf = mbuf_get(); - if (mbuf == NULL) { - return DN_ENOMEM; - } - - size = mbuf_size(mbuf); - - va_start(args, fmt); - n = dn_vscnprintf(mbuf->last, size, fmt, args); - va_end(args); - if (n <= 0 || n >= (int)size) { - return DN_ERROR; - } - - mbuf->last += n; - msg->mlen += (uint32_t)n; - STAILQ_INSERT_HEAD(&msg->mhdr, mbuf, next); - - return DN_OK; +rstatus_t msg_prepend_format(struct msg *msg, const char *fmt, ...) { + struct mbuf *mbuf; + int n; + uint32_t size; + va_list args; + + mbuf = mbuf_get(); + if (mbuf == NULL) { + return DN_ENOMEM; + } + + size = mbuf_size(mbuf); + + va_start(args, fmt); + n = dn_vscnprintf(mbuf->last, size, fmt, args); + va_end(args); + if (n <= 0 || n >= (int)size) { + return DN_ERROR; + } + + mbuf->last += n; + msg->mlen += (uint32_t)n; + STAILQ_INSERT_HEAD(&msg->mhdr, mbuf, next); + + return DN_OK; } diff --git a/src/dyn_message.h b/src/dyn_message.h index a7e75f688..da8f3e1a7 100644 --- a/src/dyn_message.h +++ b/src/dyn_message.h @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. 
@@ -23,304 +23,295 @@ #ifndef _DYN_MESSAGE_H_ #define _DYN_MESSAGE_H_ -#include "dyn_core.h" +#include + +#include "dyn_dict.h" #include "dyn_dnode_msg.h" +#include "dyn_mbuf.h" +#include "dyn_queue.h" +#include "dyn_rbtree.h" #include "dyn_response_mgr.h" #include "dyn_types.h" -#define ALLOC_MSGS 200000 -#define MIN_ALLOC_MSGS 100000 -#define MAX_ALLOC_MSGS 1000000 +#define ALLOC_MSGS 200000 +#define MIN_ALLOC_MSGS 100000 +#define MAX_ALLOC_MSGS 1000000 -#define MAX_ALLOWABLE_PROCESSED_MSGS 500 +#define MAX_ALLOWABLE_PROCESSED_MSGS 500 typedef void (*func_msg_parse_t)(struct msg *, const struct string *hash_tag); typedef rstatus_t (*func_msg_fragment_t)(struct msg *, struct server_pool *, struct rack *, struct msg_tqh *); typedef rstatus_t (*func_msg_verify_t)(struct msg *, struct server_pool *, - struct rack *); + struct rack *); typedef void (*func_msg_coalesce_t)(struct msg *r); typedef rstatus_t (*msg_response_handler_t)(struct msg *req, struct msg *rsp); typedef bool (*func_msg_failure_t)(struct msg *r); typedef bool (*func_is_multikey_request)(struct msg *r); typedef struct msg *(*func_reconcile_responses)(struct response_mgr *rspmgr); -typedef rstatus_t (*func_msg_rewrite_t)(struct msg *orig_msg, struct context* ctx, - bool* did_rewrite, struct msg** new_msg_ptr); - -extern func_msg_coalesce_t g_pre_coalesce; /* message pre-coalesce */ -extern func_msg_coalesce_t g_post_coalesce; /* message post-coalesce */ -extern func_msg_fragment_t g_fragment; /* message fragment */ -extern func_msg_verify_t g_verify_request; /* message verify */ +typedef rstatus_t (*func_msg_rewrite_t)(struct msg *orig_msg, + struct context *ctx, bool *did_rewrite, + struct msg **new_msg_ptr); + +extern func_msg_coalesce_t g_pre_coalesce; /* message pre-coalesce */ +extern func_msg_coalesce_t g_post_coalesce; /* message post-coalesce */ +extern func_msg_fragment_t g_fragment; /* message fragment */ +extern func_msg_verify_t g_verify_request; /* message verify */ extern 
func_is_multikey_request g_is_multikey_request; extern func_reconcile_responses g_reconcile_responses; -extern func_msg_rewrite_t g_rewrite_query; /* rewrite query in a msg if necessary */ +extern func_msg_rewrite_t + g_rewrite_query; /* rewrite query in a msg if necessary */ void set_datastore_ops(void); typedef enum msg_parse_result { - MSG_PARSE_OK, /* parsing ok */ - MSG_PARSE_ERROR, /* parsing error */ - MSG_PARSE_REPAIR, /* more to parse -> repair parsed & unparsed data */ - MSG_PARSE_FRAGMENT, /* multi-vector request -> fragment */ - MSG_PARSE_AGAIN, /* incomplete -> parse again */ - MSG_OOM_ERROR + MSG_PARSE_OK, /* parsing ok */ + MSG_PARSE_ERROR, /* parsing error */ + MSG_PARSE_REPAIR, /* more to parse -> repair parsed & unparsed data */ + MSG_PARSE_FRAGMENT, /* multi-vector request -> fragment */ + MSG_PARSE_AGAIN, /* incomplete -> parse again */ + MSG_OOM_ERROR } msg_parse_result_t; -#define MSG_TYPE_CODEC(ACTION) \ - ACTION( UNKNOWN ) \ - ACTION( REQ_MC_GET ) /* memcache retrieval requests */ \ - ACTION( REQ_MC_GETS ) \ - ACTION( REQ_MC_DELETE ) /* memcache delete request */ \ - ACTION( REQ_MC_CAS ) /* memcache cas request and storage request */ \ - ACTION( REQ_MC_SET ) /* memcache storage request */ \ - ACTION( REQ_MC_ADD ) \ - ACTION( REQ_MC_REPLACE ) \ - ACTION( REQ_MC_APPEND ) \ - ACTION( REQ_MC_PREPEND ) \ - ACTION( REQ_MC_INCR ) /* memcache arithmetic request */ \ - ACTION( REQ_MC_DECR ) \ - ACTION( REQ_MC_TOUCH ) /* memcache touch request */ \ - ACTION( REQ_MC_QUIT ) /* memcache quit request */ \ - ACTION( RSP_MC_NUM ) /* memcache arithmetic response */ \ - ACTION( RSP_MC_STORED ) /* memcache cas and storage response */ \ - ACTION( RSP_MC_NOT_STORED ) \ - ACTION( RSP_MC_EXISTS ) \ - ACTION( RSP_MC_NOT_FOUND ) \ - ACTION( RSP_MC_END ) \ - ACTION( RSP_MC_VALUE ) \ - ACTION( RSP_MC_DELETED ) /* memcache delete response */ \ - ACTION( RSP_MC_TOUCHED ) /* memcache touch response */ \ - ACTION( RSP_MC_ERROR ) /* memcache error responses */ \ - ACTION( 
RSP_MC_CLIENT_ERROR ) \ - ACTION( RSP_MC_SERVER_ERROR ) \ - ACTION( REQ_REDIS_DEL ) /* redis commands - keys */ \ - ACTION( REQ_REDIS_EXISTS ) \ - ACTION( REQ_REDIS_EXPIRE ) \ - ACTION( REQ_REDIS_EXPIREAT ) \ - ACTION( REQ_REDIS_PEXPIRE ) \ - ACTION( REQ_REDIS_PEXPIREAT ) \ - ACTION( REQ_REDIS_PERSIST ) \ - ACTION( REQ_REDIS_PTTL ) \ - ACTION( REQ_REDIS_SCAN ) \ - ACTION( REQ_REDIS_SORT ) \ - ACTION( REQ_REDIS_TTL ) \ - ACTION( REQ_REDIS_TYPE ) \ - ACTION( REQ_REDIS_APPEND ) /* redis requests - string */ \ - ACTION( REQ_REDIS_BITCOUNT ) \ - ACTION( REQ_REDIS_BITPOS ) \ - ACTION( REQ_REDIS_DECR ) \ - ACTION( REQ_REDIS_DECRBY ) \ - ACTION( REQ_REDIS_DUMP ) \ - ACTION( REQ_REDIS_GET ) \ - ACTION( REQ_REDIS_GETBIT ) \ - ACTION( REQ_REDIS_GETRANGE ) \ - ACTION( REQ_REDIS_GETSET ) \ - ACTION( REQ_REDIS_INCR ) \ - ACTION( REQ_REDIS_INCRBY ) \ - ACTION( REQ_REDIS_INCRBYFLOAT ) \ - ACTION( REQ_REDIS_MSET ) \ - ACTION( REQ_REDIS_MGET ) \ - ACTION( REQ_REDIS_PSETEX ) \ - ACTION( REQ_REDIS_RESTORE ) \ - ACTION( REQ_REDIS_SET ) \ - ACTION( REQ_REDIS_SETBIT ) \ - ACTION( REQ_REDIS_SETEX ) \ - ACTION( REQ_REDIS_SETNX ) \ - ACTION( REQ_REDIS_SETRANGE ) \ - ACTION( REQ_REDIS_STRLEN ) \ - ACTION( REQ_REDIS_HDEL ) /* redis requests - hashes */ \ - ACTION( REQ_REDIS_HEXISTS ) \ - ACTION( REQ_REDIS_HGET ) \ - ACTION( REQ_REDIS_HGETALL ) \ - ACTION( REQ_REDIS_HINCRBY ) \ - ACTION( REQ_REDIS_HINCRBYFLOAT ) \ - ACTION( REQ_REDIS_HKEYS ) \ - ACTION( REQ_REDIS_HLEN ) \ - ACTION( REQ_REDIS_HMGET ) \ - ACTION( REQ_REDIS_HMSET ) \ - ACTION( REQ_REDIS_HSET ) \ - ACTION( REQ_REDIS_HSETNX ) \ - ACTION( REQ_REDIS_HSCAN) \ - ACTION( REQ_REDIS_HVALS ) \ - ACTION( REQ_REDIS_KEYS ) \ - ACTION( REQ_REDIS_INFO ) \ - ACTION( REQ_REDIS_LINDEX ) /* redis requests - lists */ \ - ACTION( REQ_REDIS_LINSERT ) \ - ACTION( REQ_REDIS_LLEN ) \ - ACTION( REQ_REDIS_LPOP ) \ - ACTION( REQ_REDIS_LPUSH ) \ - ACTION( REQ_REDIS_LPUSHX ) \ - ACTION( REQ_REDIS_LRANGE ) \ - ACTION( REQ_REDIS_LREM ) \ - ACTION( 
REQ_REDIS_LSET ) \ - ACTION( REQ_REDIS_LTRIM ) \ - ACTION( REQ_REDIS_PING ) \ - ACTION( REQ_REDIS_QUIT ) \ - ACTION( REQ_REDIS_RPOP ) \ - ACTION( REQ_REDIS_RPOPLPUSH ) \ - ACTION( REQ_REDIS_RPUSH ) \ - ACTION( REQ_REDIS_RPUSHX ) \ - ACTION( REQ_REDIS_SADD ) /* redis requests - sets */ \ - ACTION( REQ_REDIS_SCARD ) \ - ACTION( REQ_REDIS_SDIFF ) \ - ACTION( REQ_REDIS_SDIFFSTORE ) \ - ACTION( REQ_REDIS_SINTER ) \ - ACTION( REQ_REDIS_SINTERSTORE ) \ - ACTION( REQ_REDIS_SISMEMBER ) \ - ACTION( REQ_REDIS_SLAVEOF ) \ - ACTION( REQ_REDIS_SMEMBERS ) \ - ACTION( REQ_REDIS_SMOVE ) \ - ACTION( REQ_REDIS_SPOP ) \ - ACTION( REQ_REDIS_SRANDMEMBER ) \ - ACTION( REQ_REDIS_SREM ) \ - ACTION( REQ_REDIS_SUNION ) \ - ACTION( REQ_REDIS_SUNIONSTORE ) \ - ACTION( REQ_REDIS_SSCAN) \ - ACTION( REQ_REDIS_ZADD ) /* redis requests - sorted sets */ \ - ACTION( REQ_REDIS_ZCARD ) \ - ACTION( REQ_REDIS_ZCOUNT ) \ - ACTION( REQ_REDIS_ZINCRBY ) \ - ACTION( REQ_REDIS_ZINTERSTORE ) \ - ACTION( REQ_REDIS_ZLEXCOUNT ) \ - ACTION( REQ_REDIS_ZRANGE ) \ - ACTION( REQ_REDIS_ZRANGEBYLEX ) \ - ACTION( REQ_REDIS_ZRANGEBYSCORE ) \ - ACTION( REQ_REDIS_ZRANK ) \ - ACTION( REQ_REDIS_ZREM ) \ - ACTION( REQ_REDIS_ZREMRANGEBYRANK ) \ - ACTION( REQ_REDIS_ZREMRANGEBYLEX ) \ - ACTION( REQ_REDIS_ZREMRANGEBYSCORE ) \ - ACTION( REQ_REDIS_ZREVRANGE ) \ - ACTION( REQ_REDIS_ZREVRANGEBYLEX ) \ - ACTION( REQ_REDIS_ZREVRANGEBYSCORE ) \ - ACTION( REQ_REDIS_ZREVRANK ) \ - ACTION( REQ_REDIS_ZSCORE ) \ - ACTION( REQ_REDIS_ZUNIONSTORE ) \ - ACTION( REQ_REDIS_ZSCAN) \ - ACTION( REQ_REDIS_EVAL ) /* redis requests - eval */ \ - ACTION( REQ_REDIS_EVALSHA ) \ - ACTION( REQ_REDIS_GEOADD ) /* redis geo requests */ \ - ACTION( REQ_REDIS_GEORADIUS ) \ - ACTION( REQ_REDIS_GEODIST ) \ - ACTION( REQ_REDIS_GEOHASH ) \ - ACTION( REQ_REDIS_GEOPOS ) \ - ACTION( REQ_REDIS_GEORADIUSBYMEMBER ) \ - /* ACTION( REQ_REDIS_AUTH) */ \ - /* ACTION( REQ_REDIS_SELECT)*/ /* only during init */ \ - ACTION( REQ_REDIS_PFADD ) /* redis requests - hyperloglog */ \ - 
ACTION( REQ_REDIS_PFCOUNT ) \ - ACTION( RSP_REDIS_STATUS ) /* redis response */ \ - ACTION( RSP_REDIS_INTEGER ) \ - ACTION( RSP_REDIS_BULK ) \ - ACTION( RSP_REDIS_MULTIBULK ) \ - ACTION( REQ_REDIS_CONFIG ) \ - ACTION( RSP_REDIS_ERROR ) \ - ACTION( RSP_REDIS_ERROR_ERR ) \ - ACTION( RSP_REDIS_ERROR_OOM ) \ - ACTION( RSP_REDIS_ERROR_BUSY ) \ - ACTION( RSP_REDIS_ERROR_NOAUTH ) \ - ACTION( RSP_REDIS_ERROR_LOADING ) \ - ACTION( RSP_REDIS_ERROR_BUSYKEY ) \ - ACTION( RSP_REDIS_ERROR_MISCONF ) \ - ACTION( RSP_REDIS_ERROR_NOSCRIPT ) \ - ACTION( RSP_REDIS_ERROR_READONLY ) \ - ACTION( RSP_REDIS_ERROR_WRONGTYPE ) \ - ACTION( RSP_REDIS_ERROR_EXECABORT ) \ - ACTION( RSP_REDIS_ERROR_MASTERDOWN ) \ - ACTION( RSP_REDIS_ERROR_NOREPLICAS ) \ - ACTION( SENTINEL ) \ - +#define MSG_TYPE_CODEC(ACTION) \ + ACTION(UNKNOWN) \ + ACTION(REQ_MC_GET) /* memcache retrieval requests */ \ + ACTION(REQ_MC_GETS) \ + ACTION(REQ_MC_DELETE) /* memcache delete request */ \ + ACTION(REQ_MC_CAS) /* memcache cas request and storage request */ \ + ACTION(REQ_MC_SET) /* memcache storage request */ \ + ACTION(REQ_MC_ADD) \ + ACTION(REQ_MC_REPLACE) \ + ACTION(REQ_MC_APPEND) \ + ACTION(REQ_MC_PREPEND) \ + ACTION(REQ_MC_INCR) /* memcache arithmetic request */ \ + ACTION(REQ_MC_DECR) \ + ACTION(REQ_MC_TOUCH) /* memcache touch request */ \ + ACTION(REQ_MC_QUIT) /* memcache quit request */ \ + ACTION(RSP_MC_NUM) /* memcache arithmetic response */ \ + ACTION(RSP_MC_STORED) /* memcache cas and storage response */ \ + ACTION(RSP_MC_NOT_STORED) \ + ACTION(RSP_MC_EXISTS) \ + ACTION(RSP_MC_NOT_FOUND) \ + ACTION(RSP_MC_END) \ + ACTION(RSP_MC_VALUE) \ + ACTION(RSP_MC_DELETED) /* memcache delete response */ \ + ACTION(RSP_MC_TOUCHED) /* memcache touch response */ \ + ACTION(RSP_MC_ERROR) /* memcache error responses */ \ + ACTION(RSP_MC_CLIENT_ERROR) \ + ACTION(RSP_MC_SERVER_ERROR) \ + ACTION(REQ_REDIS_DEL) /* redis commands - keys */ \ + ACTION(REQ_REDIS_EXISTS) \ + ACTION(REQ_REDIS_EXPIRE) \ + ACTION(REQ_REDIS_EXPIREAT) \ + 
ACTION(REQ_REDIS_PEXPIRE) \ + ACTION(REQ_REDIS_PEXPIREAT) \ + ACTION(REQ_REDIS_PERSIST) \ + ACTION(REQ_REDIS_PTTL) \ + ACTION(REQ_REDIS_SCAN) \ + ACTION(REQ_REDIS_SORT) \ + ACTION(REQ_REDIS_TTL) \ + ACTION(REQ_REDIS_TYPE) \ + ACTION(REQ_REDIS_APPEND) /* redis requests - string */ \ + ACTION(REQ_REDIS_BITCOUNT) \ + ACTION(REQ_REDIS_BITPOS) \ + ACTION(REQ_REDIS_DECR) \ + ACTION(REQ_REDIS_DECRBY) \ + ACTION(REQ_REDIS_DUMP) \ + ACTION(REQ_REDIS_GET) \ + ACTION(REQ_REDIS_GETBIT) \ + ACTION(REQ_REDIS_GETRANGE) \ + ACTION(REQ_REDIS_GETSET) \ + ACTION(REQ_REDIS_INCR) \ + ACTION(REQ_REDIS_INCRBY) \ + ACTION(REQ_REDIS_INCRBYFLOAT) \ + ACTION(REQ_REDIS_MSET) \ + ACTION(REQ_REDIS_MGET) \ + ACTION(REQ_REDIS_PSETEX) \ + ACTION(REQ_REDIS_RESTORE) \ + ACTION(REQ_REDIS_SET) \ + ACTION(REQ_REDIS_SETBIT) \ + ACTION(REQ_REDIS_SETEX) \ + ACTION(REQ_REDIS_SETNX) \ + ACTION(REQ_REDIS_SETRANGE) \ + ACTION(REQ_REDIS_STRLEN) \ + ACTION(REQ_REDIS_HDEL) /* redis requests - hashes */ \ + ACTION(REQ_REDIS_HEXISTS) \ + ACTION(REQ_REDIS_HGET) \ + ACTION(REQ_REDIS_HGETALL) \ + ACTION(REQ_REDIS_HINCRBY) \ + ACTION(REQ_REDIS_HINCRBYFLOAT) \ + ACTION(REQ_REDIS_HKEYS) \ + ACTION(REQ_REDIS_HLEN) \ + ACTION(REQ_REDIS_HMGET) \ + ACTION(REQ_REDIS_HMSET) \ + ACTION(REQ_REDIS_HSET) \ + ACTION(REQ_REDIS_HSETNX) \ + ACTION(REQ_REDIS_HSCAN) \ + ACTION(REQ_REDIS_HVALS) \ + ACTION(REQ_REDIS_KEYS) \ + ACTION(REQ_REDIS_INFO) \ + ACTION(REQ_REDIS_LINDEX) /* redis requests - lists */ \ + ACTION(REQ_REDIS_LINSERT) \ + ACTION(REQ_REDIS_LLEN) \ + ACTION(REQ_REDIS_LPOP) \ + ACTION(REQ_REDIS_LPUSH) \ + ACTION(REQ_REDIS_LPUSHX) \ + ACTION(REQ_REDIS_LRANGE) \ + ACTION(REQ_REDIS_LREM) \ + ACTION(REQ_REDIS_LSET) \ + ACTION(REQ_REDIS_LTRIM) \ + ACTION(REQ_REDIS_PING) \ + ACTION(REQ_REDIS_QUIT) \ + ACTION(REQ_REDIS_RPOP) \ + ACTION(REQ_REDIS_RPOPLPUSH) \ + ACTION(REQ_REDIS_RPUSH) \ + ACTION(REQ_REDIS_RPUSHX) \ + ACTION(REQ_REDIS_SADD) /* redis requests - sets */ \ + ACTION(REQ_REDIS_SCARD) \ + ACTION(REQ_REDIS_SDIFF) \ + 
ACTION(REQ_REDIS_SDIFFSTORE) \ + ACTION(REQ_REDIS_SINTER) \ + ACTION(REQ_REDIS_SINTERSTORE) \ + ACTION(REQ_REDIS_SISMEMBER) \ + ACTION(REQ_REDIS_SLAVEOF) \ + ACTION(REQ_REDIS_SMEMBERS) \ + ACTION(REQ_REDIS_SMOVE) \ + ACTION(REQ_REDIS_SPOP) \ + ACTION(REQ_REDIS_SRANDMEMBER) \ + ACTION(REQ_REDIS_SREM) \ + ACTION(REQ_REDIS_SUNION) \ + ACTION(REQ_REDIS_SUNIONSTORE) \ + ACTION(REQ_REDIS_SSCAN) \ + ACTION(REQ_REDIS_ZADD) /* redis requests - sorted sets */ \ + ACTION(REQ_REDIS_ZCARD) \ + ACTION(REQ_REDIS_ZCOUNT) \ + ACTION(REQ_REDIS_ZINCRBY) \ + ACTION(REQ_REDIS_ZINTERSTORE) \ + ACTION(REQ_REDIS_ZLEXCOUNT) \ + ACTION(REQ_REDIS_ZRANGE) \ + ACTION(REQ_REDIS_ZRANGEBYLEX) \ + ACTION(REQ_REDIS_ZRANGEBYSCORE) \ + ACTION(REQ_REDIS_ZRANK) \ + ACTION(REQ_REDIS_ZREM) \ + ACTION(REQ_REDIS_ZREMRANGEBYRANK) \ + ACTION(REQ_REDIS_ZREMRANGEBYLEX) \ + ACTION(REQ_REDIS_ZREMRANGEBYSCORE) \ + ACTION(REQ_REDIS_ZREVRANGE) \ + ACTION(REQ_REDIS_ZREVRANGEBYLEX) \ + ACTION(REQ_REDIS_ZREVRANGEBYSCORE) \ + ACTION(REQ_REDIS_ZREVRANK) \ + ACTION(REQ_REDIS_ZSCORE) \ + ACTION(REQ_REDIS_ZUNIONSTORE) \ + ACTION(REQ_REDIS_ZSCAN) \ + ACTION(REQ_REDIS_EVAL) /* redis requests - eval */ \ + ACTION(REQ_REDIS_EVALSHA) \ + ACTION(REQ_REDIS_GEOADD) /* redis geo requests */ \ + ACTION(REQ_REDIS_GEORADIUS) \ + ACTION(REQ_REDIS_GEODIST) \ + ACTION(REQ_REDIS_GEOHASH) \ + ACTION(REQ_REDIS_GEOPOS) \ + ACTION(REQ_REDIS_GEORADIUSBYMEMBER) \ + /* ACTION( REQ_REDIS_AUTH) */ \ + /* ACTION( REQ_REDIS_SELECT)*/ /* only during init */ \ + ACTION(REQ_REDIS_PFADD) /* redis requests - hyperloglog */ \ + ACTION(REQ_REDIS_PFCOUNT) ACTION(RSP_REDIS_STATUS) /* redis response */ \ + ACTION(RSP_REDIS_INTEGER) ACTION(RSP_REDIS_BULK) \ + ACTION(RSP_REDIS_MULTIBULK) ACTION(REQ_REDIS_CONFIG) ACTION( \ + RSP_REDIS_ERROR) ACTION(RSP_REDIS_ERROR_ERR) \ + ACTION(RSP_REDIS_ERROR_OOM) ACTION(RSP_REDIS_ERROR_BUSY) ACTION( \ + RSP_REDIS_ERROR_NOAUTH) ACTION(RSP_REDIS_ERROR_LOADING) \ + ACTION(RSP_REDIS_ERROR_BUSYKEY) \ + 
ACTION(RSP_REDIS_ERROR_MISCONF) \ + ACTION(RSP_REDIS_ERROR_NOSCRIPT) \ + ACTION(RSP_REDIS_ERROR_READONLY) \ + ACTION(RSP_REDIS_ERROR_WRONGTYPE) ACTION( \ + RSP_REDIS_ERROR_EXECABORT) \ + ACTION(RSP_REDIS_ERROR_MASTERDOWN) \ + ACTION(RSP_REDIS_ERROR_NOREPLICAS) \ + ACTION(SENTINEL) #define DEFINE_ACTION(_name) MSG_##_name, -typedef enum msg_type { - MSG_TYPE_CODEC(DEFINE_ACTION) -} msg_type_t; +typedef enum msg_type { MSG_TYPE_CODEC(DEFINE_ACTION) } msg_type_t; #undef DEFINE_ACTION - typedef enum dyn_error { - DYNOMITE_OK, - DYNOMITE_UNKNOWN_ERROR, - DYNOMITE_INVALID_STATE, - DYNOMITE_INVALID_ADMIN_REQ, - PEER_CONNECTION_REFUSE, - PEER_HOST_DOWN, - PEER_HOST_NOT_CONNECTED, - STORAGE_CONNECTION_REFUSE, - BAD_FORMAT, - DYNOMITE_NO_QUORUM_ACHIEVED, - DYNOMITE_SCRIPT_SPANS_NODES, + DYNOMITE_OK, + DYNOMITE_UNKNOWN_ERROR, + DYNOMITE_INVALID_STATE, + DYNOMITE_INVALID_ADMIN_REQ, + PEER_CONNECTION_REFUSE, + PEER_HOST_DOWN, + PEER_HOST_NOT_CONNECTED, + STORAGE_CONNECTION_REFUSE, + BAD_FORMAT, + DYNOMITE_NO_QUORUM_ACHIEVED, + DYNOMITE_SCRIPT_SPANS_NODES, } dyn_error_t; -static inline char * -dn_strerror(dyn_error_t err) -{ - switch(err) - { - case DYNOMITE_OK: - return "Success"; - case DYNOMITE_UNKNOWN_ERROR: - return "Unknown Error"; - case DYNOMITE_INVALID_STATE: - return "Dynomite's current state does not allow this request"; - case DYNOMITE_INVALID_ADMIN_REQ: - return "Invalid request in Dynomite's admin mode"; - case PEER_CONNECTION_REFUSE: - return "Peer Node refused connection"; - case PEER_HOST_DOWN: - return "Peer Node is down"; - case PEER_HOST_NOT_CONNECTED: - return "Peer Node is not connected"; - case STORAGE_CONNECTION_REFUSE: - return "Datastore refused connection"; - case DYNOMITE_NO_QUORUM_ACHIEVED: - return "Failed to achieve Quorum"; - case DYNOMITE_SCRIPT_SPANS_NODES: - return "Keys in the script cannot span multiple nodes"; - default: - return strerror(err); - } +static inline char *dn_strerror(dyn_error_t err) { + switch (err) { + case DYNOMITE_OK: + 
return "Success"; + case DYNOMITE_UNKNOWN_ERROR: + return "Unknown Error"; + case DYNOMITE_INVALID_STATE: + return "Dynomite's current state does not allow this request"; + case DYNOMITE_INVALID_ADMIN_REQ: + return "Invalid request in Dynomite's admin mode"; + case PEER_CONNECTION_REFUSE: + return "Peer Node refused connection"; + case PEER_HOST_DOWN: + return "Peer Node is down"; + case PEER_HOST_NOT_CONNECTED: + return "Peer Node is not connected"; + case STORAGE_CONNECTION_REFUSE: + return "Datastore refused connection"; + case DYNOMITE_NO_QUORUM_ACHIEVED: + return "Failed to achieve Quorum"; + case DYNOMITE_SCRIPT_SPANS_NODES: + return "Keys in the script cannot span multiple nodes"; + default: + return strerror(err); + } } -static inline char * -dyn_error_source(dyn_error_t err) -{ - switch(err) - { - case DYNOMITE_INVALID_ADMIN_REQ: - case DYNOMITE_INVALID_STATE: - case DYNOMITE_NO_QUORUM_ACHIEVED: - case DYNOMITE_SCRIPT_SPANS_NODES: - return "Dynomite:"; - case PEER_CONNECTION_REFUSE: - case PEER_HOST_DOWN: - case PEER_HOST_NOT_CONNECTED: - return "Peer:"; - case STORAGE_CONNECTION_REFUSE: - return "Storage:"; - default: - return "unknown:"; - } +static inline char *dyn_error_source(dyn_error_t err) { + switch (err) { + case DYNOMITE_INVALID_ADMIN_REQ: + case DYNOMITE_INVALID_STATE: + case DYNOMITE_NO_QUORUM_ACHIEVED: + case DYNOMITE_SCRIPT_SPANS_NODES: + return "Dynomite:"; + case PEER_CONNECTION_REFUSE: + case PEER_HOST_DOWN: + case PEER_HOST_NOT_CONNECTED: + return "Peer:"; + case STORAGE_CONNECTION_REFUSE: + return "Storage:"; + default: + return "unknown:"; + } } /* This is a wrong place for this typedef. 
But adding to core has some * dependency issues - FixIt someother day :( */ typedef enum consistency { - DC_ONE = 0, - DC_QUORUM, - DC_SAFE_QUORUM, + DC_ONE = 0, + DC_QUORUM, + DC_SAFE_QUORUM, } consistency_t; -static inline char* -get_consistency_string(consistency_t cons) -{ - switch(cons) - { - case DC_ONE: return "DC_ONE"; - case DC_QUORUM: return "DC_QUORUM"; - case DC_SAFE_QUORUM: return "DC_SAFE_QUORUM"; - } - return "INVALID CONSISTENCY"; +static inline char *get_consistency_string(consistency_t cons) { + switch (cons) { + case DC_ONE: + return "DC_ONE"; + case DC_QUORUM: + return "DC_QUORUM"; + case DC_SAFE_QUORUM: + return "DC_SAFE_QUORUM"; + } + return "INVALID CONSISTENCY"; } #define DEFAULT_READ_CONSISTENCY DC_ONE @@ -330,131 +321,132 @@ extern consistency_t g_read_consistency; extern uint8_t g_timeout_factor; typedef enum msg_routing { - ROUTING_NORMAL = 0, - ROUTING_LOCAL_NODE_ONLY = 1, /* Ignore the key hashing */ - ROUTING_TOKEN_OWNER_LOCAL_RACK_ONLY = 2, /* apply key hashing, but local rack only */ - ROUTING_ALL_NODES_LOCAL_RACK_ONLY = 3, /* Ignore key hashing, local rack only */ + ROUTING_NORMAL = 0, + ROUTING_LOCAL_NODE_ONLY = 1, /* Ignore the key hashing */ + ROUTING_TOKEN_OWNER_LOCAL_RACK_ONLY = + 2, /* apply key hashing, but local rack only */ + ROUTING_ALL_NODES_LOCAL_RACK_ONLY = + 3, /* Ignore key hashing, local rack only */ } msg_routing_t; -static inline char* -get_msg_routing_string(msg_routing_t route) -{ - switch(route) - { - case ROUTING_NORMAL: return "ROUTING_NORMAL"; - case ROUTING_LOCAL_NODE_ONLY: return "ROUTING_LOCAL_NODE_ONLY"; - case ROUTING_TOKEN_OWNER_LOCAL_RACK_ONLY: return "ROUTING_TOKEN_OWNER_LOCAL_RACK_ONLY"; - case ROUTING_ALL_NODES_LOCAL_RACK_ONLY: return "ROUTING_ALL_NODES_LOCAL_RACK_ONLY"; - } - return "INVALID MSG ROUTING TYPE"; +static inline char *get_msg_routing_string(msg_routing_t route) { + switch (route) { + case ROUTING_NORMAL: + return "ROUTING_NORMAL"; + case ROUTING_LOCAL_NODE_ONLY: + return 
"ROUTING_LOCAL_NODE_ONLY"; + case ROUTING_TOKEN_OWNER_LOCAL_RACK_ONLY: + return "ROUTING_TOKEN_OWNER_LOCAL_RACK_ONLY"; + case ROUTING_ALL_NODES_LOCAL_RACK_ONLY: + return "ROUTING_ALL_NODES_LOCAL_RACK_ONLY"; + } + return "INVALID MSG ROUTING TYPE"; } struct keypos { - uint8_t *start; /* key start pos */ - uint8_t *end; /* key end pos */ - uint8_t *tag_start; /* hashtagged key start pos */ - uint8_t *tag_end; /* hashtagged key end pos */ + uint8_t *start; /* key start pos */ + uint8_t *end; /* key end pos */ + uint8_t *tag_start; /* hashtagged key start pos */ + uint8_t *tag_end; /* hashtagged key end pos */ }; struct msg { - object_t object; - TAILQ_ENTRY(msg) c_tqe; /* link in client q */ - TAILQ_ENTRY(msg) s_tqe; /* link in server q */ - TAILQ_ENTRY(msg) m_tqe; /* link in send q / free q */ - - msgid_t id; /* message id */ - struct msg *peer; /* message peer */ - struct conn *owner; /* message owner - client | server */ - usec_t stime_in_microsec; /* start time in microsec */ - usec_t request_inqueue_enqueue_time_us; /* when message was enqueued in inqueue, either to the data store or remote region or cross rack */ - usec_t request_send_time; /* when message was sent: either to the data store or remote region or cross rack */ - uint8_t awaiting_rsps; - struct msg *selected_rsp; - - struct rbnode tmo_rbe; /* entry in rbtree */ - - struct mhdr mhdr; /* message mbuf header */ - uint32_t mlen; /* message length */ - - int state; /* current parser state */ - uint8_t *pos; /* parser position marker */ - uint8_t *token; /* token marker */ - - func_msg_parse_t parser; /* message parser */ - msg_parse_result_t result; /* message parsing result */ - - - msg_type_t type; /* message type */ - - struct array *keys; /* array of keypos, for req */ - - uint32_t vlen; /* value length (memcache) */ - uint8_t *end; /* end marker (memcache) */ - - uint8_t *narg_start; /* narg start (redis) */ - uint8_t *narg_end; /* narg end (redis) */ - uint32_t narg; /* # arguments (redis) */ - 
uint32_t nkeys; /* # keys in script (redis EVAL/EVALSHA) */ - uint32_t rnarg; /* running # arg used by parsing fsa (redis) */ - uint32_t rlen; /* running length in parsing fsa (redis) */ - uint32_t integer; /* integer reply value (redis) */ - - struct msg *frag_owner; /* owner of fragment message */ - uint32_t nfrag; /* # fragment */ - uint32_t nfrag_done; /* # fragment done */ - uint64_t frag_id; /* id of fragmented message */ - struct msg **frag_seq; /* sequence of fragment message, map from keys to fragments*/ - - err_t error_code; /* errno on error? */ - unsigned is_error:1; /* error? */ - unsigned is_ferror:1; /* one or more fragments are in error? */ - unsigned is_request:1; /* request? or response? */ - unsigned quit:1; /* quit request? */ - unsigned expect_datastore_reply:1; /* expect datastore reply */ - unsigned done:1; /* done? */ - unsigned fdone:1; /* all fragments are done? */ - unsigned swallow:1; /* swallow response? */ - /* We need a way in dnode_rsp_send_next to remember if we already - * did a dmsg_write of a dnode header in this message. 
If we do not remember it, - * then if the same message gets attempted to be sent twice in msg_send_chain, - * (due to lack of space in the previous attempt), we will prepend another header - * and we will have corrupted message at the destination */ - unsigned dnode_header_prepended:1; - unsigned rsp_sent:1; /* is a response sent for this request?*/ - - //dynomite - struct dmsg *dmsg; /* dyn message */ - dyn_parse_state_t dyn_parse_state; - dyn_error_t dyn_error_code; /* error code for dynomite */ - msg_routing_t msg_routing; - unsigned is_read:1; /* 0 : write - 1 : read */ - msg_response_handler_t rsp_handler; - consistency_t consistency; - msgid_t parent_id; /* parent message id */ - struct response_mgr rspmgr; + object_t object; + TAILQ_ENTRY(msg) c_tqe; /* link in client q */ + TAILQ_ENTRY(msg) s_tqe; /* link in server q */ + TAILQ_ENTRY(msg) m_tqe; /* link in send q / free q */ + + msgid_t id; /* message id */ + struct msg *peer; /* message peer */ + struct conn *owner; /* message owner - client | server */ + usec_t stime_in_microsec; /* start time in microsec */ + usec_t request_inqueue_enqueue_time_us; /* when message was enqueued in + inqueue, either to the data store + or remote region or cross rack */ + usec_t request_send_time; /* when message was sent: either to the data store + or remote region or cross rack */ + uint8_t awaiting_rsps; + struct msg *selected_rsp; + + struct rbnode tmo_rbe; /* entry in rbtree */ + + struct mhdr mhdr; /* message mbuf header */ + uint32_t mlen; /* message length */ + + int state; /* current parser state */ + uint8_t *pos; /* parser position marker */ + uint8_t *token; /* token marker */ + + func_msg_parse_t parser; /* message parser */ + msg_parse_result_t result; /* message parsing result */ + + msg_type_t type; /* message type */ + + struct array *keys; /* array of keypos, for req */ + + uint32_t vlen; /* value length (memcache) */ + uint8_t *end; /* end marker (memcache) */ + + uint8_t *narg_start; /* narg start 
(redis) */ + uint8_t *narg_end; /* narg end (redis) */ + uint32_t narg; /* # arguments (redis) */ + uint32_t nkeys; /* # keys in script (redis EVAL/EVALSHA) */ + uint32_t rnarg; /* running # arg used by parsing fsa (redis) */ + uint32_t rlen; /* running length in parsing fsa (redis) */ + uint32_t integer; /* integer reply value (redis) */ + + struct msg *frag_owner; /* owner of fragment message */ + uint32_t nfrag; /* # fragment */ + uint32_t nfrag_done; /* # fragment done */ + uint64_t frag_id; /* id of fragmented message */ + struct msg * + *frag_seq; /* sequence of fragment message, map from keys to fragments*/ + + err_t error_code; /* errno on error? */ + unsigned is_error : 1; /* error? */ + unsigned is_ferror : 1; /* one or more fragments are in error? */ + unsigned is_request : 1; /* request? or response? */ + unsigned quit : 1; /* quit request? */ + unsigned expect_datastore_reply : 1; /* expect datastore reply */ + unsigned done : 1; /* done? */ + unsigned fdone : 1; /* all fragments are done? */ + unsigned swallow : 1; /* swallow response? */ + /* We need a way in dnode_rsp_send_next to remember if we already + * did a dmsg_write of a dnode header in this message. 
If we do not remember + * it, then if the same message gets attempted to be sent twice in + * msg_send_chain, (due to lack of space in the previous attempt), we will + * prepend another header and we will have corrupted message at the + * destination */ + unsigned dnode_header_prepended : 1; + unsigned rsp_sent : 1; /* is a response sent for this request?*/ + + // dynomite + struct dmsg *dmsg; /* dyn message */ + dyn_parse_state_t dyn_parse_state; + dyn_error_t dyn_error_code; /* error code for dynomite */ + msg_routing_t msg_routing; + unsigned is_read : 1; /* 0 : write + 1 : read */ + msg_response_handler_t rsp_handler; + consistency_t consistency; + msgid_t parent_id; /* parent message id */ + struct response_mgr rspmgr; }; TAILQ_HEAD(msg_tqh, msg); -static inline void -msg_incr_awaiting_rsps(struct msg *req) -{ - req->awaiting_rsps++; - return; +static inline void msg_incr_awaiting_rsps(struct msg *req) { + req->awaiting_rsps++; + return; } -static inline void -msg_decr_awaiting_rsps(struct msg *req) -{ - req->awaiting_rsps--; - return; +static inline void msg_decr_awaiting_rsps(struct msg *req) { + req->awaiting_rsps--; + return; } -static inline rstatus_t -msg_handle_response(struct msg *req, struct msg *rsp) -{ - return req->rsp_handler(req, rsp); +static inline rstatus_t msg_handle_response(struct msg *req, struct msg *rsp) { + return req->rsp_handler(req, rsp); } size_t msg_free_queue_size(void); @@ -464,10 +456,11 @@ void msg_tmo_insert(struct msg *msg, struct conn *conn); void msg_tmo_delete(struct msg *msg); void msg_init(size_t alloc_msgs_max); -rstatus_t msg_clone(struct msg *src, struct mbuf *mbuf_start, struct msg *target); +rstatus_t msg_clone(struct msg *src, struct mbuf *mbuf_start, + struct msg *target); void msg_deinit(void); struct string *msg_type_string(msg_type_t type); -struct msg *msg_get(struct conn *conn, bool request, const char* const caller); +struct msg *msg_get(struct conn *conn, bool request, const char *const caller); void 
msg_put(struct msg *msg); uint32_t msg_mbuf_size(struct msg *msg); uint32_t msg_length(struct msg *msg); @@ -485,38 +478,41 @@ rstatus_t msg_append(struct msg *msg, uint8_t *pos, size_t n); rstatus_t msg_prepend(struct msg *msg, uint8_t *pos, size_t n); rstatus_t msg_prepend_format(struct msg *msg, const char *fmt, ...); -uint8_t *msg_get_tagged_key(struct msg *req, uint32_t key_index, uint32_t *keylen); -uint8_t *msg_get_full_key(struct msg *req, uint32_t key_index, uint32_t *keylen); -uint8_t* msg_get_full_key_copy(struct msg* msg, int idx, uint32_t *keylen); +uint8_t *msg_get_tagged_key(struct msg *req, uint32_t key_index, + uint32_t *keylen); +uint8_t *msg_get_full_key(struct msg *req, uint32_t key_index, + uint32_t *keylen); +uint8_t *msg_get_full_key_copy(struct msg *msg, int idx, uint32_t *keylen); struct msg *req_get(struct conn *conn); void req_put(struct msg *msg); bool req_done(struct conn *conn, struct msg *msg); bool req_error(struct conn *conn, struct msg *msg); struct msg *req_recv_next(struct context *ctx, struct conn *conn, bool alloc); -void req_recv_done(struct context *ctx, struct conn *conn, struct msg *msg, struct msg *nmsg); -rstatus_t req_make_reply(struct context *ctx, struct conn *conn, struct msg *req); +void req_recv_done(struct context *ctx, struct conn *conn, struct msg *msg, + struct msg *nmsg); +rstatus_t req_make_reply(struct context *ctx, struct conn *conn, + struct msg *req); struct msg *req_send_next(struct context *ctx, struct conn *conn); void req_send_done(struct context *ctx, struct conn *conn, struct msg *msg); struct msg *rsp_get(struct conn *conn); void rsp_put(struct msg *msg); struct msg *rsp_recv_next(struct context *ctx, struct conn *conn, bool alloc); -void server_rsp_recv_done(struct context *ctx, struct conn *conn, struct msg *msg, struct msg *nmsg); +void server_rsp_recv_done(struct context *ctx, struct conn *conn, + struct msg *msg, struct msg *nmsg); struct msg *rsp_send_next(struct context *ctx, struct conn 
*conn); void rsp_send_done(struct context *ctx, struct conn *conn, struct msg *msg); - /* for dynomite */ -void dnode_rsp_gos_syn(struct context *ctx, struct conn *p_conn, struct msg *msg); - - -void -req_forward_error(struct context *ctx, struct conn *conn, struct msg *req, - err_t error_code, err_t dyn_error_code); -rstatus_t remote_req_forward(struct context *ctx, struct conn *c_conn, struct msg *msg, - struct rack *rack, uint8_t *key, uint32_t keylen, - dyn_error_t *dyn_error_code); +void dnode_rsp_gos_syn(struct context *ctx, struct conn *p_conn, + struct msg *msg); + +void req_forward_error(struct context *ctx, struct conn *conn, struct msg *req, + err_t error_code, err_t dyn_error_code); +rstatus_t remote_req_forward(struct context *ctx, struct conn *c_conn, + struct msg *msg, struct rack *rack, uint8_t *key, + uint32_t keylen, dyn_error_t *dyn_error_code); void req_forward_all_local_racks(struct context *ctx, struct conn *c_conn, struct msg *req, struct mbuf *orig_mbuf, uint8_t *key, uint32_t keylen, @@ -529,6 +525,8 @@ rstatus_t dnode_peer_req_forward(struct context *ctx, struct conn *c_conn, struct rack *rack, uint8_t *key, uint32_t keylen, dyn_error_t *dyn_error_code); -//void peer_gossip_forward(struct context *ctx, struct conn *conn, struct string *data); -void dnode_peer_gossip_forward(struct context *ctx, struct conn *conn, struct mbuf *data); +// void peer_gossip_forward(struct context *ctx, struct conn *conn, struct +// string *data); +void dnode_peer_gossip_forward(struct context *ctx, struct conn *conn, + struct mbuf *data); #endif diff --git a/src/dyn_node_snitch.c b/src/dyn_node_snitch.c index 0ae3ed79a..d566c2570 100644 --- a/src/dyn_node_snitch.c +++ b/src/dyn_node_snitch.c @@ -1,148 +1,119 @@ -#include -#include -#include -#include -#include -#include -#include +#include #include +#include +#include +#include +#include +#include +#include -#include "dyn_node_snitch.h" -#include "dyn_core.h" #include "dyn_conf.h" +#include "dyn_core.h" 
+#include "dyn_node_snitch.h" #include "dyn_string.h" #include "dyn_util.h" - - static unsigned char *broadcast_address = NULL; static char *public_hostname = NULL; static char *public_ip4 = NULL; static char *private_ip4 = NULL; -static bool is_aws_env(struct server_pool *sp) -{ - return dn_strncmp(&sp->env.data, CONF_DEFAULT_ENV, 3); +static bool is_aws_env(struct server_pool *sp) { + return dn_strncmp(&sp->env.data, CONF_DEFAULT_ENV, 3); } -static unsigned char * -hostname_to_ip(char * hostname) -{ - struct hostent *he; - struct in_addr **addr_list; - int i; +static unsigned char *hostname_to_ip(char *hostname) { + struct hostent *he; + struct in_addr **addr_list; + int i; - if ((he = gethostbyname(hostname)) == NULL) - { - return NULL; - } + if ((he = gethostbyname(hostname)) == NULL) { + return NULL; + } - addr_list = (struct in_addr **) he->h_addr_list; - for(i = 0; addr_list[i] != NULL; i++); + addr_list = (struct in_addr **)he->h_addr_list; + for (i = 0; addr_list[i] != NULL; i++) + ; - unsigned char *ip = dn_alloc(i); + unsigned char *ip = dn_alloc(i); - for(i = 0; addr_list[i] != NULL; i++) - { - //Return the first one; - strcpy(ip , inet_ntoa(*(0 + addr_list[i])) ); - return ip; - } + for (i = 0; addr_list[i] != NULL; i++) { + // Return the first one; + strcpy(ip, inet_ntoa(*(0 + addr_list[i]))); + return ip; + } - return NULL; + return NULL; } +unsigned char *get_broadcast_address(struct server_pool *sp) { + if (broadcast_address != NULL) return broadcast_address; -unsigned char * -get_broadcast_address(struct server_pool *sp) -{ - if (broadcast_address != NULL) - return broadcast_address; - - if (is_aws_env(sp)) { - broadcast_address = getenv("EC2_PUBLIC_HOSTNAME"); - if (broadcast_address != NULL) - return broadcast_address; - } else { - broadcast_address = getenv("PUBLIC_HOSTNAME"); - if (broadcast_address != NULL) - return broadcast_address; - } - - struct node *peer = *(struct node **) array_get(&sp->peers, 0); - broadcast_address = (char *) 
peer->name.data; - return broadcast_address; -} + if (is_aws_env(sp)) { + broadcast_address = getenv("EC2_PUBLIC_HOSTNAME"); + if (broadcast_address != NULL) return broadcast_address; + } else { + broadcast_address = getenv("PUBLIC_HOSTNAME"); + if (broadcast_address != NULL) return broadcast_address; + } -char *get_public_hostname(struct server_pool *sp) -{ - if (public_hostname != NULL) - return public_hostname; - - if (is_aws_env(sp)) { - public_hostname = getenv("EC2_PUBLIC_HOSTNAME"); - if (public_hostname != NULL) - return public_hostname; - } else { - public_hostname = getenv("PUBLIC_HOSTNAME"); - if (public_hostname != NULL) - return public_hostname; - } - - struct node *peer = *(struct node **) array_get(&sp->peers, 0); - char c = (char) peer->name.data[0]; - if ((peer != NULL) && (peer->name.data != NULL) && !isdigit(c) ) { - public_hostname = (char *) peer->name.data; - return public_hostname; - } - return NULL; + struct node *peer = *(struct node **)array_get(&sp->peers, 0); + broadcast_address = (char *)peer->name.data; + return broadcast_address; } - -char *get_public_ip4(struct server_pool *sp) -{ - if (public_ip4 != NULL) - return public_ip4; - - if (is_aws_env(sp)) { - public_ip4 = getenv("EC2_PUBLIC_IPV4"); - if (public_ip4 != NULL) - return public_ip4; - } else { - public_ip4 = getenv("PUBLIC_IPV4"); - if (public_ip4 != NULL) - return public_ip4; - } - - struct node *peer = *(struct node **) array_get(&sp->peers, 0); - if ((peer != NULL) && (peer->name.data != NULL)) { - char c = (char) peer->name.data[0]; - if (isdigit(c)) - return (char *) peer->name.data; - } - return NULL; +char *get_public_hostname(struct server_pool *sp) { + if (public_hostname != NULL) return public_hostname; + + if (is_aws_env(sp)) { + public_hostname = getenv("EC2_PUBLIC_HOSTNAME"); + if (public_hostname != NULL) return public_hostname; + } else { + public_hostname = getenv("PUBLIC_HOSTNAME"); + if (public_hostname != NULL) return public_hostname; + } + + struct node 
*peer = *(struct node **)array_get(&sp->peers, 0); + char c = (char)peer->name.data[0]; + if ((peer != NULL) && (peer->name.data != NULL) && !isdigit(c)) { + public_hostname = (char *)peer->name.data; + return public_hostname; + } + return NULL; } +char *get_public_ip4(struct server_pool *sp) { + if (public_ip4 != NULL) return public_ip4; + + if (is_aws_env(sp)) { + public_ip4 = getenv("EC2_PUBLIC_IPV4"); + if (public_ip4 != NULL) return public_ip4; + } else { + public_ip4 = getenv("PUBLIC_IPV4"); + if (public_ip4 != NULL) return public_ip4; + } + + struct node *peer = *(struct node **)array_get(&sp->peers, 0); + if ((peer != NULL) && (peer->name.data != NULL)) { + char c = (char)peer->name.data[0]; + if (isdigit(c)) return (char *)peer->name.data; + } + return NULL; +} -char *get_private_ip4(struct server_pool *sp) -{ - if (private_ip4 != NULL) - return private_ip4; - - if (is_aws_env(sp)) { - private_ip4 = getenv("EC2_LOCAL_IPV4"); - if (private_ip4 != NULL) - return private_ip4; - } else { - private_ip4 = getenv("LOCAL_IPV4"); - if (private_ip4 != NULL) - return private_ip4; - } - return NULL; +char *get_private_ip4(struct server_pool *sp) { + if (private_ip4 != NULL) return private_ip4; + + if (is_aws_env(sp)) { + private_ip4 = getenv("EC2_LOCAL_IPV4"); + if (private_ip4 != NULL) return private_ip4; + } else { + private_ip4 = getenv("LOCAL_IPV4"); + if (private_ip4 != NULL) return private_ip4; + } + return NULL; } -unsigned char * -hostname_to_private_ip4(char *hostname) -{ - return hostname_to_ip(hostname); +unsigned char *hostname_to_private_ip4(char *hostname) { + return hostname_to_ip(hostname); } diff --git a/src/dyn_node_snitch.h b/src/dyn_node_snitch.h index 7c7cd0645..b56e36c77 100644 --- a/src/dyn_node_snitch.h +++ b/src/dyn_node_snitch.h @@ -1,10 +1,9 @@ -#include "dyn_core.h" - - #ifndef _DYN_SNITCH_H_ #define _DYN_SNITCH_H_ +// Forward declarations +struct server_pool; unsigned char *get_broadcast_address(struct server_pool *sp); char 
*get_public_hostname(struct server_pool *sp); diff --git a/src/dyn_proxy.c b/src/dyn_proxy.c index 3b9e87ff3..37220ede1 100644 --- a/src/dyn_proxy.c +++ b/src/dyn_proxy.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -22,241 +22,212 @@ #include -#include "dyn_core.h" -#include "dyn_server.h" #include "dyn_client.h" +#include "dyn_core.h" #include "dyn_proxy.h" +#include "dyn_server.h" -static void -proxy_ref(struct conn *conn, void *owner) -{ - struct server_pool *pool = owner; +static void proxy_ref(struct conn *conn, void *owner) { + struct server_pool *pool = owner; - ASSERT(conn->type == CONN_PROXY); - ASSERT(conn->owner == NULL); + ASSERT(conn->type == CONN_PROXY); + ASSERT(conn->owner == NULL); - conn->family = pool->proxy_endpoint.family; - conn->addrlen = pool->proxy_endpoint.addrlen; - conn->addr = pool->proxy_endpoint.addr; - string_duplicate(&conn->pname, &pool->proxy_endpoint.pname); + conn->family = pool->proxy_endpoint.family; + conn->addrlen = pool->proxy_endpoint.addrlen; + conn->addr = pool->proxy_endpoint.addr; + string_duplicate(&conn->pname, &pool->proxy_endpoint.pname); - pool->p_conn = conn; + pool->p_conn = conn; - /* owner of the proxy connection is the server pool */ - conn->owner = owner; + /* owner of the proxy connection is the server pool */ + conn->owner = owner; - log_debug(LOG_VVERB, "ref conn %p owner %p", conn, pool); + log_debug(LOG_VVERB, "ref conn %p owner %p", conn, pool); } -static void -proxy_unref(struct conn *conn) -{ - struct server_pool *pool; +static void proxy_unref(struct conn *conn) { + struct server_pool *pool; - ASSERT(conn->type == CONN_PROXY); - ASSERT(conn->owner != NULL); + ASSERT(conn->type == CONN_PROXY); + 
ASSERT(conn->owner != NULL); - conn_event_del_conn(conn); - pool = conn->owner; - conn->owner = NULL; + conn_event_del_conn(conn); + pool = conn->owner; + conn->owner = NULL; - pool->p_conn = NULL; + pool->p_conn = NULL; - log_debug(LOG_VVERB, "unref conn %p owner %p", conn, pool); + log_debug(LOG_VVERB, "unref conn %p owner %p", conn, pool); } -static void -proxy_close(struct context *ctx, struct conn *conn) -{ - rstatus_t status; +static void proxy_close(struct context *ctx, struct conn *conn) { + rstatus_t status; - ASSERT(conn->type == CONN_PROXY); + ASSERT(conn->type == CONN_PROXY); - if (conn->sd < 0) { - conn_unref(conn); - conn_put(conn); - return; - } + if (conn->sd < 0) { + conn_unref(conn); + conn_put(conn); + return; + } - ASSERT(conn->rmsg == NULL); - ASSERT(conn->smsg == NULL); - ASSERT(TAILQ_EMPTY(&conn->imsg_q)); - ASSERT(TAILQ_EMPTY(&conn->omsg_q)); + ASSERT(conn->rmsg == NULL); + ASSERT(conn->smsg == NULL); + ASSERT(TAILQ_EMPTY(&conn->imsg_q)); + ASSERT(TAILQ_EMPTY(&conn->omsg_q)); - conn_unref(conn); + conn_unref(conn); - status = close(conn->sd); - if (status < 0) { - log_error("close p %d failed, ignored: %s", conn->sd, strerror(errno)); - } - conn->sd = -1; + status = close(conn->sd); + if (status < 0) { + log_error("close p %d failed, ignored: %s", conn->sd, strerror(errno)); + } + conn->sd = -1; - conn_put(conn); + conn_put(conn); } -rstatus_t -proxy_init(struct context *ctx) -{ - rstatus_t status; - struct server_pool *pool = &ctx->pool; +rstatus_t proxy_init(struct context *ctx) { + rstatus_t status; + struct server_pool *pool = &ctx->pool; - struct conn *p = conn_get(pool, init_proxy_conn); - if (!p) { - return DN_ENOMEM; - } + struct conn *p = conn_get(pool, init_proxy_conn); + if (!p) { + return DN_ENOMEM; + } - status = conn_listen(pool->ctx, p); - if (status != DN_OK) { - conn_close(pool->ctx, p); - return status; - } + status = conn_listen(pool->ctx, p); + if (status != DN_OK) { + conn_close(pool->ctx, p); + return status; + } - char 
* log_datastore = "not selected data store"; - if (g_data_store == DATA_REDIS){ - log_datastore = "redis"; - } - else if (g_data_store == DATA_MEMCACHE){ - log_datastore = "memcache"; - } + char *log_datastore = "not selected data store"; + if (g_data_store == DATA_REDIS) { + log_datastore = "redis"; + } else if (g_data_store == DATA_MEMCACHE) { + log_datastore = "memcache"; + } - log_debug(LOG_NOTICE, "%s inited in %s %s", - print_obj(p), log_datastore, print_obj(pool)); + log_debug(LOG_NOTICE, "%s inited in %s %s", print_obj(p), log_datastore, + print_obj(pool)); - return DN_OK; + return DN_OK; } -void -proxy_deinit(struct context *ctx) -{ - struct server_pool *pool = &ctx->pool; - struct conn *p = pool->p_conn; - if (p != NULL) { - conn_close(pool->ctx, p); - pool->p_conn = NULL; - } +void proxy_deinit(struct context *ctx) { + struct server_pool *pool = &ctx->pool; + struct conn *p = pool->p_conn; + if (p != NULL) { + conn_close(pool->ctx, p); + pool->p_conn = NULL; + } - log_debug(LOG_VVERB, "deinit proxy"); + log_debug(LOG_VVERB, "deinit proxy"); } -static rstatus_t -proxy_accept(struct context *ctx, struct conn *p) -{ - rstatus_t status; - struct conn *c; - int sd; - - ASSERT(p->type == CONN_PROXY); - ASSERT(p->sd > 0); - ASSERT(p->recv_active && p->recv_ready); - - for (;;) { - sd = accept(p->sd, NULL, NULL); - if (sd < 0) { - if (errno == EINTR) { - log_warn("accept on %s not ready - eintr", print_obj(p)); - continue; - } - - if (errno == EAGAIN || errno == EWOULDBLOCK) { - p->recv_ready = 0; - return DN_OK; - } - - /* - * FIXME: On EMFILE or ENFILE mask out IN event on the proxy; mask - * it back in when some existing connection gets closed - */ - - log_error("accept on %s failed: %s", print_obj(p), strerror(errno)); - return DN_ERROR; - } - - break; +static rstatus_t proxy_accept(struct context *ctx, struct conn *p) { + rstatus_t status; + struct conn *c; + int sd; + + ASSERT(p->type == CONN_PROXY); + ASSERT(p->sd > 0); + ASSERT(p->recv_active && 
p->recv_ready); + + for (;;) { + sd = accept(p->sd, NULL, NULL); + if (sd < 0) { + if (errno == EINTR) { + log_warn("accept on %s not ready - eintr", print_obj(p)); + continue; + } + + if (errno == EAGAIN || errno == EWOULDBLOCK) { + p->recv_ready = 0; + return DN_OK; + } + + /* + * FIXME: On EMFILE or ENFILE mask out IN event on the proxy; mask + * it back in when some existing connection gets closed + */ + + log_error("accept on %s failed: %s", print_obj(p), strerror(errno)); + return DN_ERROR; } - c = conn_get(p->owner, init_client_conn); - if (c == NULL) { - log_error("get conn for CLIENT %d from %s failed: %s", sd, print_obj(p), - strerror(errno)); - status = close(sd); - if (status < 0) { - log_error("close c %d failed, ignored: %s", sd, strerror(errno)); - } - return DN_ENOMEM; - } - c->sd = sd; - string_copy_c(&c->pname, (unsigned char *)dn_unresolve_peer_desc(c->sd)); - - stats_pool_incr(ctx, client_connections); + break; + } - status = dn_set_nonblocking(c->sd); + c = conn_get(p->owner, init_client_conn); + if (c == NULL) { + log_error("get conn for CLIENT %d from %s failed: %s", sd, print_obj(p), + strerror(errno)); + status = close(sd); if (status < 0) { - log_error("%s Failed to set nonblock on %s: %s", print_obj(p), print_obj(c), strerror(errno)); - conn_close(ctx, c); - return status; - } - - if (p->family == AF_INET || p->family == AF_INET6) { - status = dn_set_tcpnodelay(c->sd); - if (status < 0) { - log_warn("%s Failed to set tcpnodelay on %s: %s", - print_obj(p), print_obj(c), strerror(errno)); - } + log_error("close c %d failed, ignored: %s", sd, strerror(errno)); } - - status = conn_event_add_conn(c); + return DN_ENOMEM; + } + c->sd = sd; + string_copy_c(&c->pname, (unsigned char *)dn_unresolve_peer_desc(c->sd)); + + stats_pool_incr(ctx, client_connections); + + status = dn_set_nonblocking(c->sd); + if (status < 0) { + log_error("%s Failed to set nonblock on %s: %s", print_obj(p), print_obj(c), + strerror(errno)); + conn_close(ctx, c); + return 
status; + } + + if (p->family == AF_INET || p->family == AF_INET6) { + status = dn_set_tcpnodelay(c->sd); if (status < 0) { - log_error("%s Failed to add %s to event loop: %s", print_obj(p), print_obj(c), strerror(errno)); - conn_close(ctx, c); - return status; + log_warn("%s Failed to set tcpnodelay on %s: %s", print_obj(p), + print_obj(c), strerror(errno)); } + } - log_notice("%s accepted %s", print_obj(p), print_obj(c)); + status = conn_event_add_conn(c); + if (status < 0) { + log_error("%s Failed to add %s to event loop: %s", print_obj(p), + print_obj(c), strerror(errno)); + conn_close(ctx, c); + return status; + } - return DN_OK; -} + log_notice("%s accepted %s", print_obj(p), print_obj(c)); -static rstatus_t -proxy_recv(struct context *ctx, struct conn *conn) -{ - ASSERT(conn->type == CONN_PROXY); - ASSERT(conn->recv_active); - - conn->recv_ready = 1; - do { - if (proxy_accept(ctx, conn) != DN_OK) { - log_error("%s Failed to accept a connection. Continuing", print_obj(conn)); - continue; - } - } while (conn->recv_ready); - - return DN_OK; + return DN_OK; } -struct conn_ops proxy_ops = { - proxy_recv, - NULL, - NULL, - NULL, - NULL, - NULL, - proxy_close, - NULL, - proxy_ref, - proxy_unref, - // enqueue, dequeues - NULL, - NULL, - NULL, - NULL, - conn_cant_handle_response -}; - -void -init_proxy_conn(struct conn *conn) -{ - conn->dyn_mode = 0; - conn->type = CONN_PROXY; - conn->ops = &proxy_ops; +static rstatus_t proxy_recv(struct context *ctx, struct conn *conn) { + ASSERT(conn->type == CONN_PROXY); + ASSERT(conn->recv_active); + + conn->recv_ready = 1; + do { + if (proxy_accept(ctx, conn) != DN_OK) { + log_error("%s Failed to accept a connection. 
Continuing", + print_obj(conn)); + continue; + } + } while (conn->recv_ready); + + return DN_OK; } +struct conn_ops proxy_ops = {proxy_recv, NULL, NULL, NULL, NULL, NULL, + proxy_close, NULL, proxy_ref, proxy_unref, + // enqueue, dequeues + NULL, NULL, NULL, NULL, conn_cant_handle_response}; + +void init_proxy_conn(struct conn *conn) { + conn->dyn_mode = 0; + conn->type = CONN_PROXY; + conn->ops = &proxy_ops; +} diff --git a/src/dyn_proxy.h b/src/dyn_proxy.h index 063ea3612..8ae0b795c 100644 --- a/src/dyn_proxy.h +++ b/src/dyn_proxy.h @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -19,12 +19,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "dyn_core.h" - #ifndef _DYN_PROXY_H_ #define _DYN_PROXY_H_ +#include "dyn_types.h" + +// Forward declarations +struct conn; +struct context; + rstatus_t proxy_init(struct context *ctx); void proxy_deinit(struct context *ctx); void init_proxy_conn(struct conn *conn); diff --git a/src/dyn_queue.h b/src/dyn_queue.h index 323acfb82..a96919bc2 100644 --- a/src/dyn_queue.h +++ b/src/dyn_queue.h @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. 
@@ -54,12 +54,9 @@ #include #include "dyn_log.h" - #ifndef _DYN_QUEUE_H_ #define _DYN_QUEUE_H_ - - #ifndef __offsetof #define __offsetof(type, field) ((size_t)(&((type *)NULL)->field)) #endif @@ -138,17 +135,18 @@ #define QUEUE_MACRO_SCRUB 1 #ifdef DN_ASSERT_PANIC -# define QUEUE_MACRO_TRACE 1 -# define QUEUE_MACRO_ASSERT 1 +#define QUEUE_MACRO_TRACE 1 +#define QUEUE_MACRO_ASSERT 1 #endif #ifdef QUEUE_MACRO_SCRUB -#define QMD_SAVELINK(name, link) void **name = (void *)&(link) +#define QMD_SAVELINK(name, link) void **name = (void *)&(link) -#define TRASHIT(x) do { \ - (x) = (void *) NULL; \ -} while (0) +#define TRASHIT(x) \ + do { \ + (x) = (void *)NULL; \ + } while (0) #else @@ -161,27 +159,29 @@ /* Store the last 2 places the queue element or head was altered */ struct qm_trace { - char *lastfile; - int lastline; - char *prevfile; - int prevline; + char *lastfile; + int lastline; + char *prevfile; + int prevline; }; -#define TRACEBUF struct qm_trace trace; +#define TRACEBUF struct qm_trace trace; -#define QMD_TRACE_HEAD(head) do { \ - (head)->trace.prevline = (head)->trace.lastline; \ - (head)->trace.prevfile = (head)->trace.lastfile; \ - (head)->trace.lastline = __LINE__; \ - (head)->trace.lastfile = __FILE__; \ -} while (0) +#define QMD_TRACE_HEAD(head) \ + do { \ + (head)->trace.prevline = (head)->trace.lastline; \ + (head)->trace.prevfile = (head)->trace.lastfile; \ + (head)->trace.lastline = __LINE__; \ + (head)->trace.lastfile = __FILE__; \ + } while (0) -#define QMD_TRACE_ELEM(elem) do { \ - (elem)->trace.prevline = (elem)->trace.lastline; \ - (elem)->trace.prevfile = (elem)->trace.lastfile; \ - (elem)->trace.lastline = __LINE__; \ - (elem)->trace.lastfile = __FILE__; \ -} while (0) +#define QMD_TRACE_ELEM(elem) \ + do { \ + (elem)->trace.prevline = (elem)->trace.lastline; \ + (elem)->trace.prevfile = (elem)->trace.lastfile; \ + (elem)->trace.lastline = __LINE__; \ + (elem)->trace.lastfile = __FILE__; \ + } while (0) #else @@ -194,214 +194,218 @@ struct 
qm_trace { /* * Singly-linked List declarations. */ -#define SLIST_HEAD(name, type) \ -struct name { \ - struct type *slh_first; /* first element */ \ -} +#define SLIST_HEAD(name, type) \ + struct name { \ + struct type *slh_first; /* first element */ \ + } -#define SLIST_HEAD_INITIALIZER(head) \ - { NULL } +#define SLIST_HEAD_INITIALIZER(head) \ + { NULL } -#define SLIST_ENTRY(type) \ -struct { \ - struct type *sle_next; /* next element */ \ -} +#define SLIST_ENTRY(type) \ + struct { \ + struct type *sle_next; /* next element */ \ + } /* * Singly-linked List functions. */ -#define SLIST_EMPTY(head) ((head)->slh_first == NULL) - -#define SLIST_FIRST(head) ((head)->slh_first) - -#define SLIST_FOREACH(var, head, field) \ - for ((var) = SLIST_FIRST((head)); \ - (var); \ - (var) = SLIST_NEXT((var), field)) - -#define SLIST_FOREACH_SAFE(var, head, field, tvar) \ - for ((var) = SLIST_FIRST((head)); \ - (var) && ((tvar) = SLIST_NEXT((var), field), 1); \ - (var) = (tvar)) - -#define SLIST_FOREACH_PREVPTR(var, varp, head, field) \ - for ((varp) = &SLIST_FIRST((head)); \ - ((var) = *(varp)) != NULL; \ - (varp) = &SLIST_NEXT((var), field)) - -#define SLIST_INIT(head) do { \ - SLIST_FIRST((head)) = NULL; \ -} while (0) - -#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \ - SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \ - SLIST_NEXT((slistelm), field) = (elm); \ -} while (0) - -#define SLIST_INSERT_HEAD(head, elm, field) do { \ - SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \ - SLIST_FIRST((head)) = (elm); \ -} while (0) - -#define SLIST_NEXT(elm, field) ((elm)->field.sle_next) - -#define SLIST_REMOVE(head, elm, type, field) do { \ - if (SLIST_FIRST((head)) == (elm)) { \ - SLIST_REMOVE_HEAD((head), field); \ - } else { \ - struct type *curelm = SLIST_FIRST((head)); \ - while (SLIST_NEXT(curelm, field) != (elm)) { \ - curelm = SLIST_NEXT(curelm, field); \ - } \ - SLIST_REMOVE_AFTER(curelm, field); \ - } \ -} while (0) - -#define SLIST_REMOVE_AFTER(elm, 
field) do { \ +#define SLIST_EMPTY(head) ((head)->slh_first == NULL) + +#define SLIST_FIRST(head) ((head)->slh_first) + +#define SLIST_FOREACH(var, head, field) \ + for ((var) = SLIST_FIRST((head)); (var); (var) = SLIST_NEXT((var), field)) + +#define SLIST_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = SLIST_FIRST((head)); \ + (var) && ((tvar) = SLIST_NEXT((var), field), 1); (var) = (tvar)) + +#define SLIST_FOREACH_PREVPTR(var, varp, head, field) \ + for ((varp) = &SLIST_FIRST((head)); ((var) = *(varp)) != NULL; \ + (varp) = &SLIST_NEXT((var), field)) + +#define SLIST_INIT(head) \ + do { \ + SLIST_FIRST((head)) = NULL; \ + } while (0) + +#define SLIST_INSERT_AFTER(slistelm, elm, field) \ + do { \ + SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \ + SLIST_NEXT((slistelm), field) = (elm); \ + } while (0) + +#define SLIST_INSERT_HEAD(head, elm, field) \ + do { \ + SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \ + SLIST_FIRST((head)) = (elm); \ + } while (0) + +#define SLIST_NEXT(elm, field) ((elm)->field.sle_next) + +#define SLIST_REMOVE(head, elm, type, field) \ + do { \ + if (SLIST_FIRST((head)) == (elm)) { \ + SLIST_REMOVE_HEAD((head), field); \ + } else { \ + struct type *curelm = SLIST_FIRST((head)); \ + while (SLIST_NEXT(curelm, field) != (elm)) { \ + curelm = SLIST_NEXT(curelm, field); \ + } \ + SLIST_REMOVE_AFTER(curelm, field); \ + } \ + } while (0) + +#define SLIST_REMOVE_AFTER(elm, field) \ + do { \ QMD_SAVELINK(oldnext, SLIST_NEXT(SLIST_NEXT(elm, field), field)); \ SLIST_NEXT(elm, field) = SLIST_NEXT(SLIST_NEXT(elm, field), field); \ TRASHIT(*oldnext); \ -} while (0) + } while (0) -#define SLIST_REMOVE_HEAD(head, field) do { \ - QMD_SAVELINK(oldnext, SLIST_NEXT(SLIST_FIRST((head)), field)); \ - SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \ - TRASHIT(*oldnext); \ -} while (0) +#define SLIST_REMOVE_HEAD(head, field) \ + do { \ + QMD_SAVELINK(oldnext, SLIST_NEXT(SLIST_FIRST((head)), field)); \ + SLIST_FIRST((head)) = 
SLIST_NEXT(SLIST_FIRST((head)), field); \ + TRASHIT(*oldnext); \ + } while (0) /* * Singly-linked Tail queue declarations. */ -#define STAILQ_HEAD(name, type) \ -struct name { \ - struct type *stqh_first; /* first element */ \ - struct type **stqh_last; /* addr of last next element */ \ -} +#define STAILQ_HEAD(name, type) \ + struct name { \ + struct type *stqh_first; /* first element */ \ + struct type **stqh_last; /* addr of last next element */ \ + } -#define STAILQ_HEAD_INITIALIZER(head) \ - { NULL, &(head).stqh_first } +#define STAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).stqh_first } -#define STAILQ_ENTRY(type) \ -struct { \ - struct type *stqe_next; /* next element */ \ -} +#define STAILQ_ENTRY(type) \ + struct { \ + struct type *stqe_next; /* next element */ \ + } /* * Singly-linked Tail queue functions. */ -#define STAILQ_CONCAT(head1, head2) do { \ - if (!STAILQ_EMPTY((head2))) { \ - *(head1)->stqh_last = (head2)->stqh_first; \ - (head1)->stqh_last = (head2)->stqh_last; \ - STAILQ_INIT((head2)); \ - } \ -} while (0) - -#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL) - -#define STAILQ_FIRST(head) ((head)->stqh_first) - -#define STAILQ_FOREACH(var, head, field) \ - for ((var) = STAILQ_FIRST((head)); \ - (var); \ - (var) = STAILQ_NEXT((var), field)) - -#define STAILQ_FOREACH_SAFE(var, head, field, tvar) \ - for ((var) = STAILQ_FIRST((head)); \ - (var) && ((tvar) = STAILQ_NEXT((var), field), 1); \ - (var) = (tvar)) - -#define STAILQ_INIT(head) do { \ - STAILQ_FIRST((head)) = NULL; \ - (head)->stqh_last = &STAILQ_FIRST((head)); \ -} while (0) - -#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \ - if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\ - (head)->stqh_last = &STAILQ_NEXT((elm), field); \ - STAILQ_NEXT((tqelm), field) = (elm); \ -} while (0) - -#define STAILQ_INSERT_HEAD(head, elm, field) do { \ - if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \ - (head)->stqh_last = &STAILQ_NEXT((elm), 
field); \ - STAILQ_FIRST((head)) = (elm); \ -} while (0) - -#define STAILQ_INSERT_TAIL(head, elm, field) do { \ - STAILQ_NEXT((elm), field) = NULL; \ - *(head)->stqh_last = (elm); \ - (head)->stqh_last = &STAILQ_NEXT((elm), field); \ -} while (0) - -#define STAILQ_LAST(head, type, field) \ - (STAILQ_EMPTY((head)) ? \ - NULL : \ - ((struct type *)(void *) \ - ((char *)((head)->stqh_last) - __offsetof(struct type, field)))) - -#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next) - -#define STAILQ_REMOVE(head, elm, type, field) do { \ - if (STAILQ_FIRST((head)) == (elm)) { \ - STAILQ_REMOVE_HEAD((head), field); \ - } \ - else { \ - struct type *curelm = STAILQ_FIRST((head)); \ - while (STAILQ_NEXT(curelm, field) != (elm)) \ - curelm = STAILQ_NEXT(curelm, field); \ - STAILQ_REMOVE_AFTER(head, curelm, field); \ - } \ -} while (0) - -#define STAILQ_REMOVE_HEAD(head, field) do { \ - QMD_SAVELINK(oldnext, STAILQ_NEXT(STAILQ_FIRST((head)), field)); \ - if ((STAILQ_FIRST((head)) = \ - STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) { \ - (head)->stqh_last = &STAILQ_FIRST((head)); \ - } \ - TRASHIT(*oldnext); \ -} while (0) - -#define STAILQ_REMOVE_AFTER(head, elm, field) do { \ +#define STAILQ_CONCAT(head1, head2) \ + do { \ + if (!STAILQ_EMPTY((head2))) { \ + *(head1)->stqh_last = (head2)->stqh_first; \ + (head1)->stqh_last = (head2)->stqh_last; \ + STAILQ_INIT((head2)); \ + } \ + } while (0) + +#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL) + +#define STAILQ_FIRST(head) ((head)->stqh_first) + +#define STAILQ_FOREACH(var, head, field) \ + for ((var) = STAILQ_FIRST((head)); (var); (var) = STAILQ_NEXT((var), field)) + +#define STAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = STAILQ_FIRST((head)); \ + (var) && ((tvar) = STAILQ_NEXT((var), field), 1); (var) = (tvar)) + +#define STAILQ_INIT(head) \ + do { \ + STAILQ_FIRST((head)) = NULL; \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ + } while (0) + +#define STAILQ_INSERT_AFTER(head, tqelm, elm, 
field) \ + do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL) \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_NEXT((tqelm), field) = (elm); \ + } while (0) + +#define STAILQ_INSERT_HEAD(head, elm, field) \ + do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_FIRST((head)) = (elm); \ + } while (0) + +#define STAILQ_INSERT_TAIL(head, elm, field) \ + do { \ + STAILQ_NEXT((elm), field) = NULL; \ + *(head)->stqh_last = (elm); \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + } while (0) + +#define STAILQ_LAST(head, type, field) \ + (STAILQ_EMPTY((head)) \ + ? NULL \ + : ((struct type *)(void *)((char *)((head)->stqh_last) - \ + __offsetof(struct type, field)))) + +#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next) + +#define STAILQ_REMOVE(head, elm, type, field) \ + do { \ + if (STAILQ_FIRST((head)) == (elm)) { \ + STAILQ_REMOVE_HEAD((head), field); \ + } else { \ + struct type *curelm = STAILQ_FIRST((head)); \ + while (STAILQ_NEXT(curelm, field) != (elm)) \ + curelm = STAILQ_NEXT(curelm, field); \ + STAILQ_REMOVE_AFTER(head, curelm, field); \ + } \ + } while (0) + +#define STAILQ_REMOVE_HEAD(head, field) \ + do { \ + QMD_SAVELINK(oldnext, STAILQ_NEXT(STAILQ_FIRST((head)), field)); \ + if ((STAILQ_FIRST((head)) = STAILQ_NEXT(STAILQ_FIRST((head)), field)) == \ + NULL) { \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ + } \ + TRASHIT(*oldnext); \ + } while (0) + +#define STAILQ_REMOVE_AFTER(head, elm, field) \ + do { \ QMD_SAVELINK(oldnext, STAILQ_NEXT(STAILQ_NEXT(elm, field), field)); \ if ((STAILQ_NEXT(elm, field) = \ - STAILQ_NEXT(STAILQ_NEXT(elm, field), field)) == NULL) { \ - (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_NEXT(STAILQ_NEXT(elm, field), field)) == NULL) { \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ } \ TRASHIT(*oldnext); \ -} while (0) + } while (0) -#define STAILQ_SWAP(head1, head2, type) 
do { \ +#define STAILQ_SWAP(head1, head2, type) \ + do { \ struct type *swap_first = STAILQ_FIRST(head1); \ struct type **swap_last = (head1)->stqh_last; \ STAILQ_FIRST(head1) = STAILQ_FIRST(head2); \ (head1)->stqh_last = (head2)->stqh_last; \ STAILQ_FIRST(head2) = swap_first; \ (head2)->stqh_last = swap_last; \ - if (STAILQ_EMPTY(head1)) \ - (head1)->stqh_last = &STAILQ_FIRST(head1); \ - if (STAILQ_EMPTY(head2)) \ - (head2)->stqh_last = &STAILQ_FIRST(head2); \ -} while (0) - + if (STAILQ_EMPTY(head1)) (head1)->stqh_last = &STAILQ_FIRST(head1); \ + if (STAILQ_EMPTY(head2)) (head2)->stqh_last = &STAILQ_FIRST(head2); \ + } while (0) /* * List declarations. */ -#define LIST_HEAD(name, type) \ -struct name { \ - struct type *lh_first; /* first element */ \ -} +#define LIST_HEAD(name, type) \ + struct name { \ + struct type *lh_first; /* first element */ \ + } -#define LIST_HEAD_INITIALIZER(head) \ - { NULL } +#define LIST_HEAD_INITIALIZER(head) \ + { NULL } -#define LIST_ENTRY(type) \ -struct { \ - struct type *le_next; /* next element */ \ - struct type **le_prev; /* address of previous next element */ \ -} +#define LIST_ENTRY(type) \ + struct { \ + struct type *le_next; /* next element */ \ + struct type **le_prev; /* address of previous next element */ \ + } /* * List functions. 
@@ -409,25 +413,28 @@ struct { \ #ifdef QUEUE_MACRO_ASSERT -#define QMD_LIST_CHECK_HEAD(head, field) do { \ - if (LIST_FIRST((head)) != NULL && \ - LIST_FIRST((head))->field.le_prev != &LIST_FIRST((head))) { \ - log_panic("Bad list head %p first->prev != head", (void *)(head)); \ - } \ -} while (0) - -#define QMD_LIST_CHECK_NEXT(elm, field) do { \ - if (LIST_NEXT((elm), field) != NULL && \ - LIST_NEXT((elm), field)->field.le_prev != &((elm)->field.le_next)) {\ - log_panic("Bad link elm %p next->prev != elm",(void *)(elm)); \ - } \ -} while (0) - -#define QMD_LIST_CHECK_PREV(elm, field) do { \ - if (*(elm)->field.le_prev != (elm)) { \ - log_panic("Bad link elm %p prev->next != elm",(void *)(elm)); \ - } \ -} while (0) +#define QMD_LIST_CHECK_HEAD(head, field) \ + do { \ + if (LIST_FIRST((head)) != NULL && \ + LIST_FIRST((head))->field.le_prev != &LIST_FIRST((head))) { \ + log_panic("Bad list head %p first->prev != head", (void *)(head)); \ + } \ + } while (0) + +#define QMD_LIST_CHECK_NEXT(elm, field) \ + do { \ + if (LIST_NEXT((elm), field) != NULL && \ + LIST_NEXT((elm), field)->field.le_prev != &((elm)->field.le_next)) { \ + log_panic("Bad link elm %p next->prev != elm", (void *)(elm)); \ + } \ + } while (0) + +#define QMD_LIST_CHECK_PREV(elm, field) \ + do { \ + if (*(elm)->field.le_prev != (elm)) { \ + log_panic("Bad link elm %p prev->next != elm", (void *)(elm)); \ + } \ + } while (0) #else @@ -437,125 +444,131 @@ struct { \ #endif /* QUEUE_MACRO_ASSERT */ -#define LIST_EMPTY(head) ((head)->lh_first == NULL) - -#define LIST_FIRST(head) ((head)->lh_first) - -#define LIST_FOREACH(var, head, field) \ - for ((var) = LIST_FIRST((head)); \ - (var); \ - (var) = LIST_NEXT((var), field)) - -#define LIST_FOREACH_SAFE(var, head, field, tvar) \ - for ((var) = LIST_FIRST((head)); \ - (var) && ((tvar) = LIST_NEXT((var), field), 1); \ - (var) = (tvar)) - -#define LIST_INIT(head) do { \ - LIST_FIRST((head)) = NULL; \ -} while (0) - -#define LIST_INSERT_AFTER(listelm, elm, 
field) do { \ - QMD_LIST_CHECK_NEXT(listelm, field); \ - if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\ - LIST_NEXT((listelm), field)->field.le_prev = \ - &LIST_NEXT((elm), field); \ - LIST_NEXT((listelm), field) = (elm); \ - (elm)->field.le_prev = &LIST_NEXT((listelm), field); \ -} while (0) - -#define LIST_INSERT_BEFORE(listelm, elm, field) do { \ - QMD_LIST_CHECK_PREV(listelm, field); \ - (elm)->field.le_prev = (listelm)->field.le_prev; \ - LIST_NEXT((elm), field) = (listelm); \ - *(listelm)->field.le_prev = (elm); \ - (listelm)->field.le_prev = &LIST_NEXT((elm), field); \ -} while (0) - -#define LIST_INSERT_HEAD(head, elm, field) do { \ - QMD_LIST_CHECK_HEAD((head), field); \ - if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \ - LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field); \ - LIST_FIRST((head)) = (elm); \ - (elm)->field.le_prev = &LIST_FIRST((head)); \ -} while (0) - -#define LIST_NEXT(elm, field) ((elm)->field.le_next) - -#define LIST_REMOVE(elm, field) do { \ - QMD_SAVELINK(oldnext, (elm)->field.le_next); \ - QMD_SAVELINK(oldprev, (elm)->field.le_prev); \ - QMD_LIST_CHECK_NEXT(elm, field); \ - QMD_LIST_CHECK_PREV(elm, field); \ - if (LIST_NEXT((elm), field) != NULL) \ - LIST_NEXT((elm), field)->field.le_prev = \ - (elm)->field.le_prev; \ - *(elm)->field.le_prev = LIST_NEXT((elm), field); \ - TRASHIT(*oldnext); \ - TRASHIT(*oldprev); \ -} while (0) - -#define LIST_SWAP(head1, head2, type, field) do { \ - struct type *swap_tmp = LIST_FIRST((head1)); \ - LIST_FIRST((head1)) = LIST_FIRST((head2)); \ - LIST_FIRST((head2)) = swap_tmp; \ - if ((swap_tmp = LIST_FIRST((head1))) != NULL) \ - swap_tmp->field.le_prev = &LIST_FIRST((head1)); \ - if ((swap_tmp = LIST_FIRST((head2))) != NULL) \ - swap_tmp->field.le_prev = &LIST_FIRST((head2)); \ -} while (0) +#define LIST_EMPTY(head) ((head)->lh_first == NULL) + +#define LIST_FIRST(head) ((head)->lh_first) + +#define LIST_FOREACH(var, head, field) \ + for ((var) = 
LIST_FIRST((head)); (var); (var) = LIST_NEXT((var), field)) + +#define LIST_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = LIST_FIRST((head)); \ + (var) && ((tvar) = LIST_NEXT((var), field), 1); (var) = (tvar)) + +#define LIST_INIT(head) \ + do { \ + LIST_FIRST((head)) = NULL; \ + } while (0) + +#define LIST_INSERT_AFTER(listelm, elm, field) \ + do { \ + QMD_LIST_CHECK_NEXT(listelm, field); \ + if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL) \ + LIST_NEXT((listelm), field)->field.le_prev = &LIST_NEXT((elm), field); \ + LIST_NEXT((listelm), field) = (elm); \ + (elm)->field.le_prev = &LIST_NEXT((listelm), field); \ + } while (0) + +#define LIST_INSERT_BEFORE(listelm, elm, field) \ + do { \ + QMD_LIST_CHECK_PREV(listelm, field); \ + (elm)->field.le_prev = (listelm)->field.le_prev; \ + LIST_NEXT((elm), field) = (listelm); \ + *(listelm)->field.le_prev = (elm); \ + (listelm)->field.le_prev = &LIST_NEXT((elm), field); \ + } while (0) + +#define LIST_INSERT_HEAD(head, elm, field) \ + do { \ + QMD_LIST_CHECK_HEAD((head), field); \ + if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \ + LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field); \ + LIST_FIRST((head)) = (elm); \ + (elm)->field.le_prev = &LIST_FIRST((head)); \ + } while (0) + +#define LIST_NEXT(elm, field) ((elm)->field.le_next) + +#define LIST_REMOVE(elm, field) \ + do { \ + QMD_SAVELINK(oldnext, (elm)->field.le_next); \ + QMD_SAVELINK(oldprev, (elm)->field.le_prev); \ + QMD_LIST_CHECK_NEXT(elm, field); \ + QMD_LIST_CHECK_PREV(elm, field); \ + if (LIST_NEXT((elm), field) != NULL) \ + LIST_NEXT((elm), field)->field.le_prev = (elm)->field.le_prev; \ + *(elm)->field.le_prev = LIST_NEXT((elm), field); \ + TRASHIT(*oldnext); \ + TRASHIT(*oldprev); \ + } while (0) + +#define LIST_SWAP(head1, head2, type, field) \ + do { \ + struct type *swap_tmp = LIST_FIRST((head1)); \ + LIST_FIRST((head1)) = LIST_FIRST((head2)); \ + LIST_FIRST((head2)) = swap_tmp; \ + if ((swap_tmp = 
LIST_FIRST((head1))) != NULL) \ + swap_tmp->field.le_prev = &LIST_FIRST((head1)); \ + if ((swap_tmp = LIST_FIRST((head2))) != NULL) \ + swap_tmp->field.le_prev = &LIST_FIRST((head2)); \ + } while (0) /* * Tail queue declarations. */ -#define TAILQ_HEAD(name, type) \ -struct name { \ - struct type *tqh_first; /* first element */ \ - struct type **tqh_last; /* addr of last next element */ \ - uint64_t count; \ - TRACEBUF \ -} - -#define TAILQ_HEAD_INITIALIZER(head) \ - { NULL, &(head).tqh_first, 0 } - -#define TAILQ_ENTRY(type) \ -struct { \ - struct type *tqe_next; /* next element */ \ - struct type **tqe_prev; /* address of previous next element */ \ - TRACEBUF \ -} +#define TAILQ_HEAD(name, type) \ + struct name { \ + struct type *tqh_first; /* first element */ \ + struct type **tqh_last; /* addr of last next element */ \ + uint64_t count; \ + TRACEBUF \ + } + +#define TAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).tqh_first, 0 } + +#define TAILQ_ENTRY(type) \ + struct { \ + struct type *tqe_next; /* next element */ \ + struct type **tqe_prev; /* address of previous next element */ \ + TRACEBUF \ + } /* * Tail queue functions. 
*/ #ifdef QUEUE_MACRO_ASSERT -#define QMD_TAILQ_CHECK_HEAD(head, field) do { \ - if (!TAILQ_EMPTY(head) && \ - TAILQ_FIRST((head))->field.tqe_prev != &TAILQ_FIRST((head))) { \ - log_panic("Bad tailq head %p first->prev != head", (void *)(head)); \ - } \ -} while (0) - -#define QMD_TAILQ_CHECK_TAIL(head, field) do { \ - if (*(head)->tqh_last != NULL) { \ - log_panic("Bad tailq NEXT(%p->tqh_last) != NULL",(void *)(head)); \ - } \ -} while (0) - -#define QMD_TAILQ_CHECK_NEXT(elm, field) do { \ - if (TAILQ_NEXT((elm), field) != NULL && \ - TAILQ_NEXT((elm), field)->field.tqe_prev != &((elm)->field.tqe_next)) {\ - log_panic("Bad link elm %p next->prev != elm",(void *)(elm)); \ - } \ -} while (0) - -#define QMD_TAILQ_CHECK_PREV(elm, field) do { \ - if (*(elm)->field.tqe_prev != (elm)) { \ - log_panic("Bad link elm %p prev->next != elm",(void *)(elm)); \ - } \ -} while (0) +#define QMD_TAILQ_CHECK_HEAD(head, field) \ + do { \ + if (!TAILQ_EMPTY(head) && \ + TAILQ_FIRST((head))->field.tqe_prev != &TAILQ_FIRST((head))) { \ + log_panic("Bad tailq head %p first->prev != head", (void *)(head)); \ + } \ + } while (0) + +#define QMD_TAILQ_CHECK_TAIL(head, field) \ + do { \ + if (*(head)->tqh_last != NULL) { \ + log_panic("Bad tailq NEXT(%p->tqh_last) != NULL", (void *)(head)); \ + } \ + } while (0) + +#define QMD_TAILQ_CHECK_NEXT(elm, field) \ + do { \ + if (TAILQ_NEXT((elm), field) != NULL && \ + TAILQ_NEXT((elm), field)->field.tqe_prev != \ + &((elm)->field.tqe_next)) { \ + log_panic("Bad link elm %p next->prev != elm", (void *)(elm)); \ + } \ + } while (0) + +#define QMD_TAILQ_CHECK_PREV(elm, field) \ + do { \ + if (*(elm)->field.tqe_prev != (elm)) { \ + log_panic("Bad link elm %p prev->next != elm", (void *)(elm)); \ + } \ + } while (0) #else @@ -566,165 +579,167 @@ struct { \ #endif /* QUEUE_MACRO_ASSERT */ -#define TAILQ_CONCAT(head1, head2, field) do { \ - if (!TAILQ_EMPTY(head2)) { \ - *(head1)->tqh_last = (head2)->tqh_first; \ - (head2)->tqh_first->field.tqe_prev = 
(head1)->tqh_last; \ - (head1)->tqh_last = (head2)->tqh_last; \ - (head1)->count += (head2)->count; \ - TAILQ_INIT((head2)); \ - QMD_TRACE_HEAD(head1); \ - QMD_TRACE_HEAD(head2); \ - } \ -} while (0) - -#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) - -#define TAILQ_COUNT(head) ((head)->count) -#define TAILQ_FIRST(head) ((head)->tqh_first) - -#define TAILQ_FOREACH(var, head, field) \ - for ((var) = TAILQ_FIRST((head)); \ - (var); \ - (var) = TAILQ_NEXT((var), field)) - -#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ - for ((var) = TAILQ_FIRST((head)); \ - (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ - (var) = (tvar)) - -#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \ - for ((var) = TAILQ_LAST((head), headname); \ - (var); \ - (var) = TAILQ_PREV((var), headname, field)) - -#define TAILQ_FOREACH_REVERSE_SAFE(var, head, headname, field, tvar) \ - for ((var) = TAILQ_LAST((head), headname); \ - (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \ - (var) = (tvar)) - -#define TAILQ_INIT(head) do { \ - TAILQ_FIRST((head)) = NULL; \ - (head)->count = 0; \ - (head)->tqh_last = &TAILQ_FIRST((head)); \ - QMD_TRACE_HEAD(head); \ -} while (0) - -#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ - QMD_TAILQ_CHECK_NEXT(listelm, field); \ - if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL) { \ - TAILQ_NEXT((elm), field)->field.tqe_prev = &TAILQ_NEXT((elm), field);\ - } else { \ - (head)->tqh_last = &TAILQ_NEXT((elm), field); \ - QMD_TRACE_HEAD(head); \ - } \ - TAILQ_NEXT((listelm), field) = (elm); \ - (head)->count++; \ - (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \ - QMD_TRACE_ELEM(&(elm)->field); \ - QMD_TRACE_ELEM(&listelm->field); \ -} while (0) - -#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ - QMD_TAILQ_CHECK_PREV(listelm, field); \ - (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ - TAILQ_NEXT((elm), field) = (listelm); \ - (head)->count++; \ - *(listelm)->field.tqe_prev = 
(elm); \ - (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \ - QMD_TRACE_ELEM(&(elm)->field); \ - QMD_TRACE_ELEM(&listelm->field); \ -} while (0) - -#define TAILQ_INSERT_HEAD(head, elm, field) do { \ - QMD_TAILQ_CHECK_HEAD(head, field); \ - if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \ - TAILQ_FIRST((head))->field.tqe_prev = \ - &TAILQ_NEXT((elm), field); \ - else \ - (head)->tqh_last = &TAILQ_NEXT((elm), field); \ - TAILQ_FIRST((head)) = (elm); \ - (head)->count++; \ - (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \ - QMD_TRACE_HEAD(head); \ - QMD_TRACE_ELEM(&(elm)->field); \ -} while (0) - -#define TAILQ_INSERT_TAIL(head, elm, field) do { \ - QMD_TAILQ_CHECK_TAIL(head, field); \ - TAILQ_NEXT((elm), field) = NULL; \ - (elm)->field.tqe_prev = (head)->tqh_last; \ - (head)->count++; \ - *(head)->tqh_last = (elm); \ - (head)->tqh_last = &TAILQ_NEXT((elm), field); \ - QMD_TRACE_HEAD(head); \ - QMD_TRACE_ELEM(&(elm)->field); \ -} while (0) - -#define TAILQ_LAST(head, headname) \ - (*(((struct headname *)((head)->tqh_last))->tqh_last)) +#define TAILQ_CONCAT(head1, head2, field) \ + do { \ + if (!TAILQ_EMPTY(head2)) { \ + *(head1)->tqh_last = (head2)->tqh_first; \ + (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \ + (head1)->tqh_last = (head2)->tqh_last; \ + (head1)->count += (head2)->count; \ + TAILQ_INIT((head2)); \ + QMD_TRACE_HEAD(head1); \ + QMD_TRACE_HEAD(head2); \ + } \ + } while (0) + +#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) + +#define TAILQ_COUNT(head) ((head)->count) +#define TAILQ_FIRST(head) ((head)->tqh_first) + +#define TAILQ_FOREACH(var, head, field) \ + for ((var) = TAILQ_FIRST((head)); (var); (var) = TAILQ_NEXT((var), field)) + +#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = TAILQ_FIRST((head)); \ + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); (var) = (tvar)) + +#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \ + for ((var) = TAILQ_LAST((head), headname); (var); \ + (var) = 
TAILQ_PREV((var), headname, field)) + +#define TAILQ_FOREACH_REVERSE_SAFE(var, head, headname, field, tvar) \ + for ((var) = TAILQ_LAST((head), headname); \ + (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \ + (var) = (tvar)) + +#define TAILQ_INIT(head) \ + do { \ + TAILQ_FIRST((head)) = NULL; \ + (head)->count = 0; \ + (head)->tqh_last = &TAILQ_FIRST((head)); \ + QMD_TRACE_HEAD(head); \ + } while (0) + +#define TAILQ_INSERT_AFTER(head, listelm, elm, field) \ + do { \ + QMD_TAILQ_CHECK_NEXT(listelm, field); \ + if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL) { \ + TAILQ_NEXT((elm), field)->field.tqe_prev = &TAILQ_NEXT((elm), field); \ + } else { \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_HEAD(head); \ + } \ + TAILQ_NEXT((listelm), field) = (elm); \ + (head)->count++; \ + (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \ + QMD_TRACE_ELEM(&(elm)->field); \ + QMD_TRACE_ELEM(&listelm->field); \ + } while (0) + +#define TAILQ_INSERT_BEFORE(listelm, elm, field) \ + do { \ + QMD_TAILQ_CHECK_PREV(listelm, field); \ + (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ + TAILQ_NEXT((elm), field) = (listelm); \ + (head)->count++; \ + *(listelm)->field.tqe_prev = (elm); \ + (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_ELEM(&(elm)->field); \ + QMD_TRACE_ELEM(&listelm->field); \ + } while (0) + +#define TAILQ_INSERT_HEAD(head, elm, field) \ + do { \ + QMD_TAILQ_CHECK_HEAD(head, field); \ + if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \ + TAILQ_FIRST((head))->field.tqe_prev = &TAILQ_NEXT((elm), field); \ + else \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + TAILQ_FIRST((head)) = (elm); \ + (head)->count++; \ + (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_ELEM(&(elm)->field); \ + } while (0) + +#define TAILQ_INSERT_TAIL(head, elm, field) \ + do { \ + QMD_TAILQ_CHECK_TAIL(head, field); \ + TAILQ_NEXT((elm), field) = NULL; \ + 
(elm)->field.tqe_prev = (head)->tqh_last; \ + (head)->count++; \ + *(head)->tqh_last = (elm); \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_ELEM(&(elm)->field); \ + } while (0) + +#define TAILQ_LAST(head, headname) \ + (*(((struct headname *)((head)->tqh_last))->tqh_last)) #define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) -#define TAILQ_PREV(elm, headname, field) \ - (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) +#define TAILQ_PREV(elm, headname, field) \ + (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) -#define TAILQ_REMOVE(head, elm, field) do { \ +#define TAILQ_REMOVE(head, elm, field) \ + do { \ QMD_SAVELINK(oldnext, (elm)->field.tqe_next); \ QMD_SAVELINK(oldprev, (elm)->field.tqe_prev); \ QMD_TAILQ_CHECK_NEXT(elm, field); \ QMD_TAILQ_CHECK_PREV(elm, field); \ if ((TAILQ_NEXT((elm), field)) != NULL) { \ - TAILQ_NEXT((elm), field)->field.tqe_prev = \ - (elm)->field.tqe_prev; \ + TAILQ_NEXT((elm), field)->field.tqe_prev = (elm)->field.tqe_prev; \ } else { \ - (head)->tqh_last = (elm)->field.tqe_prev; \ - QMD_TRACE_HEAD(head); \ + (head)->tqh_last = (elm)->field.tqe_prev; \ + QMD_TRACE_HEAD(head); \ } \ *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \ (head)->count--; \ TRASHIT(*oldnext); \ TRASHIT(*oldprev); \ QMD_TRACE_ELEM(&(elm)->field); \ -} while (0) - -#define TAILQ_SWAP(head1, head2, type, field) do { \ - struct type *swap_first = (head1)->tqh_first; \ - struct type **swap_last = (head1)->tqh_last; \ - (head1)->tqh_first = (head2)->tqh_first; \ - (head1)->tqh_last = (head2)->tqh_last; \ - (head2)->tqh_first = swap_first; \ - (head2)->tqh_last = swap_last; \ - if ((swap_first = (head1)->tqh_first) != NULL) \ - swap_first->field.tqe_prev = &(head1)->tqh_first; \ - else \ - (head1)->tqh_last = &(head1)->tqh_first; \ - if ((swap_first = (head2)->tqh_first) != NULL) \ - swap_first->field.tqe_prev = &(head2)->tqh_first; \ - else \ - (head2)->tqh_last = &(head2)->tqh_first; \ - 
uint64_t temp = (head1)->count; \ - (head1)->count = (head2)->count; \ - (head2)->count = temp; \ -} while (0) + } while (0) + +#define TAILQ_SWAP(head1, head2, type, field) \ + do { \ + struct type *swap_first = (head1)->tqh_first; \ + struct type **swap_last = (head1)->tqh_last; \ + (head1)->tqh_first = (head2)->tqh_first; \ + (head1)->tqh_last = (head2)->tqh_last; \ + (head2)->tqh_first = swap_first; \ + (head2)->tqh_last = swap_last; \ + if ((swap_first = (head1)->tqh_first) != NULL) \ + swap_first->field.tqe_prev = &(head1)->tqh_first; \ + else \ + (head1)->tqh_last = &(head1)->tqh_first; \ + if ((swap_first = (head2)->tqh_first) != NULL) \ + swap_first->field.tqe_prev = &(head2)->tqh_first; \ + else \ + (head2)->tqh_last = &(head2)->tqh_first; \ + uint64_t temp = (head1)->count; \ + (head1)->count = (head2)->count; \ + (head2)->count = temp; \ + } while (0) /* * Circular queue declarations. */ -#define CIRCLEQ_HEAD(name, type) \ -struct name { \ - struct type *cqh_first; /* first element */ \ - struct type *cqh_last; /* last element */ \ -} +#define CIRCLEQ_HEAD(name, type) \ + struct name { \ + struct type *cqh_first; /* first element */ \ + struct type *cqh_last; /* last element */ \ + } -#define CIRCLEQ_HEAD_INITIALIZER(head) \ - { (void *)&(head), (void *)&(head) } +#define CIRCLEQ_HEAD_INITIALIZER(head) \ + { (void *)&(head), (void *)&(head) } -#define CIRCLEQ_ENTRY(type) \ -struct { \ - struct type *cqe_next; /* next element */ \ - struct type *cqe_prev; /* previous element */ \ -} +#define CIRCLEQ_ENTRY(type) \ + struct { \ + struct type *cqe_next; /* next element */ \ + struct type *cqe_prev; /* previous element */ \ + } /* * Circular queue functions. 
@@ -733,78 +748,84 @@ struct { \ #define CIRCLEQ_FIRST(head) ((head)->cqh_first) -#define CIRCLEQ_FOREACH(var, head, field) \ - for ((var) = CIRCLEQ_FIRST((head)); \ - (var) != (void *)(head) || ((var) = NULL); \ - (var) = CIRCLEQ_NEXT((var), field)) - -#define CIRCLEQ_FOREACH_REVERSE(var, head, field) \ - for ((var) = CIRCLEQ_LAST((head)); \ - (var) != (void *)(head) || ((var) = NULL); \ - (var) = CIRCLEQ_PREV((var), field)) - -#define CIRCLEQ_INIT(head) do { \ - CIRCLEQ_FIRST((head)) = (void *)(head); \ - CIRCLEQ_LAST((head)) = (void *)(head); \ -} while (0) - -#define CIRCLEQ_INSERT_AFTER(head, listelm, elm, field) do { \ - CIRCLEQ_NEXT((elm), field) = CIRCLEQ_NEXT((listelm), field); \ - CIRCLEQ_PREV((elm), field) = (listelm); \ - if (CIRCLEQ_NEXT((listelm), field) == (void *)(head)) \ - CIRCLEQ_LAST((head)) = (elm); \ - else \ - CIRCLEQ_PREV(CIRCLEQ_NEXT((listelm), field), field) = (elm); \ - CIRCLEQ_NEXT((listelm), field) = (elm); \ -} while (0) - -#define CIRCLEQ_INSERT_BEFORE(head, listelm, elm, field) do { \ - CIRCLEQ_NEXT((elm), field) = (listelm); \ - CIRCLEQ_PREV((elm), field) = CIRCLEQ_PREV((listelm), field); \ - if (CIRCLEQ_PREV((listelm), field) == (void *)(head)) \ - CIRCLEQ_FIRST((head)) = (elm); \ - else \ - CIRCLEQ_NEXT(CIRCLEQ_PREV((listelm), field), field) = (elm); \ - CIRCLEQ_PREV((listelm), field) = (elm); \ -} while (0) - -#define CIRCLEQ_INSERT_HEAD(head, elm, field) do { \ - CIRCLEQ_NEXT((elm), field) = CIRCLEQ_FIRST((head)); \ - CIRCLEQ_PREV((elm), field) = (void *)(head); \ - if (CIRCLEQ_LAST((head)) == (void *)(head)) \ - CIRCLEQ_LAST((head)) = (elm); \ - else \ - CIRCLEQ_PREV(CIRCLEQ_FIRST((head)), field) = (elm); \ - CIRCLEQ_FIRST((head)) = (elm); \ -} while (0) - -#define CIRCLEQ_INSERT_TAIL(head, elm, field) do { \ - CIRCLEQ_NEXT((elm), field) = (void *)(head); \ - CIRCLEQ_PREV((elm), field) = CIRCLEQ_LAST((head)); \ - if (CIRCLEQ_FIRST((head)) == (void *)(head)) \ - CIRCLEQ_FIRST((head)) = (elm); \ - else \ - 
CIRCLEQ_NEXT(CIRCLEQ_LAST((head)), field) = (elm); \ - CIRCLEQ_LAST((head)) = (elm); \ -} while (0) +#define CIRCLEQ_FOREACH(var, head, field) \ + for ((var) = CIRCLEQ_FIRST((head)); \ + (var) != (void *)(head) || ((var) = NULL); \ + (var) = CIRCLEQ_NEXT((var), field)) + +#define CIRCLEQ_FOREACH_REVERSE(var, head, field) \ + for ((var) = CIRCLEQ_LAST((head)); \ + (var) != (void *)(head) || ((var) = NULL); \ + (var) = CIRCLEQ_PREV((var), field)) + +#define CIRCLEQ_INIT(head) \ + do { \ + CIRCLEQ_FIRST((head)) = (void *)(head); \ + CIRCLEQ_LAST((head)) = (void *)(head); \ + } while (0) + +#define CIRCLEQ_INSERT_AFTER(head, listelm, elm, field) \ + do { \ + CIRCLEQ_NEXT((elm), field) = CIRCLEQ_NEXT((listelm), field); \ + CIRCLEQ_PREV((elm), field) = (listelm); \ + if (CIRCLEQ_NEXT((listelm), field) == (void *)(head)) \ + CIRCLEQ_LAST((head)) = (elm); \ + else \ + CIRCLEQ_PREV(CIRCLEQ_NEXT((listelm), field), field) = (elm); \ + CIRCLEQ_NEXT((listelm), field) = (elm); \ + } while (0) + +#define CIRCLEQ_INSERT_BEFORE(head, listelm, elm, field) \ + do { \ + CIRCLEQ_NEXT((elm), field) = (listelm); \ + CIRCLEQ_PREV((elm), field) = CIRCLEQ_PREV((listelm), field); \ + if (CIRCLEQ_PREV((listelm), field) == (void *)(head)) \ + CIRCLEQ_FIRST((head)) = (elm); \ + else \ + CIRCLEQ_NEXT(CIRCLEQ_PREV((listelm), field), field) = (elm); \ + CIRCLEQ_PREV((listelm), field) = (elm); \ + } while (0) + +#define CIRCLEQ_INSERT_HEAD(head, elm, field) \ + do { \ + CIRCLEQ_NEXT((elm), field) = CIRCLEQ_FIRST((head)); \ + CIRCLEQ_PREV((elm), field) = (void *)(head); \ + if (CIRCLEQ_LAST((head)) == (void *)(head)) \ + CIRCLEQ_LAST((head)) = (elm); \ + else \ + CIRCLEQ_PREV(CIRCLEQ_FIRST((head)), field) = (elm); \ + CIRCLEQ_FIRST((head)) = (elm); \ + } while (0) + +#define CIRCLEQ_INSERT_TAIL(head, elm, field) \ + do { \ + CIRCLEQ_NEXT((elm), field) = (void *)(head); \ + CIRCLEQ_PREV((elm), field) = CIRCLEQ_LAST((head)); \ + if (CIRCLEQ_FIRST((head)) == (void *)(head)) \ + CIRCLEQ_FIRST((head)) = 
(elm); \ + else \ + CIRCLEQ_NEXT(CIRCLEQ_LAST((head)), field) = (elm); \ + CIRCLEQ_LAST((head)) = (elm); \ + } while (0) #define CIRCLEQ_LAST(head) ((head)->cqh_last) -#define CIRCLEQ_NEXT(elm,field) ((elm)->field.cqe_next) - -#define CIRCLEQ_PREV(elm,field) ((elm)->field.cqe_prev) - -#define CIRCLEQ_REMOVE(head, elm, field) do { \ - if (CIRCLEQ_NEXT((elm), field) == (void *)(head)) \ - CIRCLEQ_LAST((head)) = CIRCLEQ_PREV((elm), field); \ - else \ - CIRCLEQ_PREV(CIRCLEQ_NEXT((elm), field), field) = \ - CIRCLEQ_PREV((elm), field); \ - if (CIRCLEQ_PREV((elm), field) == (void *)(head)) \ - CIRCLEQ_FIRST((head)) = CIRCLEQ_NEXT((elm), field); \ - else \ - CIRCLEQ_NEXT(CIRCLEQ_PREV((elm), field), field) = \ - CIRCLEQ_NEXT((elm), field); \ -} while (0) +#define CIRCLEQ_NEXT(elm, field) ((elm)->field.cqe_next) + +#define CIRCLEQ_PREV(elm, field) ((elm)->field.cqe_prev) + +#define CIRCLEQ_REMOVE(head, elm, field) \ + do { \ + if (CIRCLEQ_NEXT((elm), field) == (void *)(head)) \ + CIRCLEQ_LAST((head)) = CIRCLEQ_PREV((elm), field); \ + else \ + CIRCLEQ_PREV(CIRCLEQ_NEXT((elm), field), field) = \ + CIRCLEQ_PREV((elm), field); \ + if (CIRCLEQ_PREV((elm), field) == (void *)(head)) \ + CIRCLEQ_FIRST((head)) = CIRCLEQ_NEXT((elm), field); \ + else \ + CIRCLEQ_NEXT(CIRCLEQ_PREV((elm), field), field) = \ + CIRCLEQ_NEXT((elm), field); \ + } while (0) #endif diff --git a/src/dyn_rbtree.c b/src/dyn_rbtree.c index bf65918bf..5e26128bf 100644 --- a/src/dyn_rbtree.c +++ b/src/dyn_rbtree.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. 
@@ -28,15 +28,13 @@ #include "dyn_core.h" -void -rbtree_node_init(struct rbnode *node) -{ - node->left = NULL; - node->right = NULL; - node->parent = NULL; - node->key = 0ULL; - node->data = NULL; - /* color is left uninitialized */ +void rbtree_node_init(struct rbnode *node) { + node->left = NULL; + node->right = NULL; + node->parent = NULL; + node->key = 0ULL; + node->data = NULL; + /* color is left uninitialized */ } /** @@ -44,321 +42,304 @@ rbtree_node_init(struct rbnode *node) * @param[in,out] tree * @param[in,out] node */ -void -rbtree_init(struct rbtree *tree, struct rbnode *node) -{ - rbtree_node_init(node); - rbtree_black(node); - tree->root = node; - tree->sentinel = node; +void rbtree_init(struct rbtree *tree, struct rbnode *node) { + rbtree_node_init(node); + rbtree_black(node); + tree->root = node; + tree->sentinel = node; } -static struct rbnode * -rbtree_node_min(struct rbnode *node, struct rbnode *sentinel) -{ - /* traverse left links */ +static struct rbnode *rbtree_node_min(struct rbnode *node, + struct rbnode *sentinel) { + /* traverse left links */ - while (node->left != sentinel) { - node = node->left; - } + while (node->left != sentinel) { + node = node->left; + } - return node; + return node; } -struct rbnode * -rbtree_min(struct rbtree *tree) -{ - struct rbnode *node = tree->root; - struct rbnode *sentinel = tree->sentinel; +struct rbnode *rbtree_min(struct rbtree *tree) { + struct rbnode *node = tree->root; + struct rbnode *sentinel = tree->sentinel; - /* empty tree */ + /* empty tree */ - if (node == sentinel) { - return NULL; - } + if (node == sentinel) { + return NULL; + } - return rbtree_node_min(node, sentinel); + return rbtree_node_min(node, sentinel); } -static void -rbtree_left_rotate(struct rbnode **root, struct rbnode *sentinel, - struct rbnode *node) -{ - struct rbnode *temp; +static void rbtree_left_rotate(struct rbnode **root, struct rbnode *sentinel, + struct rbnode *node) { + struct rbnode *temp; - temp = node->right; - 
node->right = temp->left; + temp = node->right; + node->right = temp->left; - if (temp->left != sentinel) { - temp->left->parent = node; - } + if (temp->left != sentinel) { + temp->left->parent = node; + } - temp->parent = node->parent; + temp->parent = node->parent; - if (node == *root) { - *root = temp; - } else if (node == node->parent->left) { - node->parent->left = temp; - } else { - node->parent->right = temp; - } + if (node == *root) { + *root = temp; + } else if (node == node->parent->left) { + node->parent->left = temp; + } else { + node->parent->right = temp; + } - temp->left = node; - node->parent = temp; + temp->left = node; + node->parent = temp; } -static void -rbtree_right_rotate(struct rbnode **root, struct rbnode *sentinel, - struct rbnode *node) -{ - struct rbnode *temp; +static void rbtree_right_rotate(struct rbnode **root, struct rbnode *sentinel, + struct rbnode *node) { + struct rbnode *temp; - temp = node->left; - node->left = temp->right; + temp = node->left; + node->left = temp->right; - if (temp->right != sentinel) { - temp->right->parent = node; - } + if (temp->right != sentinel) { + temp->right->parent = node; + } - temp->parent = node->parent; + temp->parent = node->parent; - if (node == *root) { - *root = temp; - } else if (node == node->parent->right) { - node->parent->right = temp; - } else { - node->parent->left = temp; - } + if (node == *root) { + *root = temp; + } else if (node == node->parent->right) { + node->parent->right = temp; + } else { + node->parent->left = temp; + } - temp->right = node; - node->parent = temp; + temp->right = node; + node->parent = temp; } -void -rbtree_insert(struct rbtree *tree, struct rbnode *node) -{ - struct rbnode **root = &tree->root; - struct rbnode *sentinel = tree->sentinel; - struct rbnode *temp, **p; - - /* empty tree */ - - if (*root == sentinel) { - node->parent = NULL; - node->left = sentinel; - node->right = sentinel; - rbtree_black(node); - *root = node; - return; +void 
rbtree_insert(struct rbtree *tree, struct rbnode *node) { + struct rbnode **root = &tree->root; + struct rbnode *sentinel = tree->sentinel; + struct rbnode *temp, **p; + + /* empty tree */ + + if (*root == sentinel) { + node->parent = NULL; + node->left = sentinel; + node->right = sentinel; + rbtree_black(node); + *root = node; + return; + } + + /* a binary tree insert */ + + temp = *root; + for (;;) { + p = (node->key < temp->key) ? &temp->left : &temp->right; + if (*p == sentinel) { + break; } + temp = *p; + } - /* a binary tree insert */ + *p = node; + node->parent = temp; + node->left = sentinel; + node->right = sentinel; + rbtree_red(node); - temp = *root; - for (;;) { + /* re-balance tree */ - p = (node->key < temp->key) ? &temp->left : &temp->right; - if (*p == sentinel) { - break; + while (node != *root && rbtree_is_red(node->parent)) { + if (node->parent == node->parent->parent->left) { + temp = node->parent->parent->right; + + if (rbtree_is_red(temp)) { + rbtree_black(node->parent); + rbtree_black(temp); + rbtree_red(node->parent->parent); + node = node->parent->parent; + } else { + if (node == node->parent->right) { + node = node->parent; + rbtree_left_rotate(root, sentinel, node); } - temp = *p; - } - *p = node; - node->parent = temp; - node->left = sentinel; - node->right = sentinel; - rbtree_red(node); - - /* re-balance tree */ - - while (node != *root && rbtree_is_red(node->parent)) { - - if (node->parent == node->parent->parent->left) { - temp = node->parent->parent->right; - - if (rbtree_is_red(temp)) { - rbtree_black(node->parent); - rbtree_black(temp); - rbtree_red(node->parent->parent); - node = node->parent->parent; - } else { - if (node == node->parent->right) { - node = node->parent; - rbtree_left_rotate(root, sentinel, node); - } - - rbtree_black(node->parent); - rbtree_red(node->parent->parent); - rbtree_right_rotate(root, sentinel, node->parent->parent); - } - } else { - temp = node->parent->parent->left; - - if (rbtree_is_red(temp)) { - 
rbtree_black(node->parent); - rbtree_black(temp); - rbtree_red(node->parent->parent); - node = node->parent->parent; - } else { - if (node == node->parent->left) { - node = node->parent; - rbtree_right_rotate(root, sentinel, node); - } - - rbtree_black(node->parent); - rbtree_red(node->parent->parent); - rbtree_left_rotate(root, sentinel, node->parent->parent); - } + rbtree_black(node->parent); + rbtree_red(node->parent->parent); + rbtree_right_rotate(root, sentinel, node->parent->parent); + } + } else { + temp = node->parent->parent->left; + + if (rbtree_is_red(temp)) { + rbtree_black(node->parent); + rbtree_black(temp); + rbtree_red(node->parent->parent); + node = node->parent->parent; + } else { + if (node == node->parent->left) { + node = node->parent; + rbtree_right_rotate(root, sentinel, node); } + + rbtree_black(node->parent); + rbtree_red(node->parent->parent); + rbtree_left_rotate(root, sentinel, node->parent->parent); + } } + } - rbtree_black(*root); + rbtree_black(*root); } -void -rbtree_delete(struct rbtree *tree, struct rbnode *node) -{ - struct rbnode **root = &tree->root; - struct rbnode *sentinel = tree->sentinel; - struct rbnode *subst, *temp, *w; - uint8_t red; - - /* a binary tree delete */ - - if (node->left == sentinel) { - temp = node->right; - subst = node; - } else if (node->right == sentinel) { - temp = node->left; - subst = node; +void rbtree_delete(struct rbtree *tree, struct rbnode *node) { + struct rbnode **root = &tree->root; + struct rbnode *sentinel = tree->sentinel; + struct rbnode *subst, *temp, *w; + uint8_t red; + + /* a binary tree delete */ + + if (node->left == sentinel) { + temp = node->right; + subst = node; + } else if (node->right == sentinel) { + temp = node->left; + subst = node; + } else { + subst = rbtree_node_min(node->right, sentinel); + if (subst->left != sentinel) { + temp = subst->left; } else { - subst = rbtree_node_min(node->right, sentinel); - if (subst->left != sentinel) { - temp = subst->left; - } else { - 
temp = subst->right; - } + temp = subst->right; } + } - if (subst == *root) { - *root = temp; - rbtree_black(temp); + if (subst == *root) { + *root = temp; + rbtree_black(temp); - rbtree_node_init(node); + rbtree_node_init(node); - return; - } + return; + } + + red = rbtree_is_red(subst); - red = rbtree_is_red(subst); + if (subst == subst->parent->left) { + subst->parent->left = temp; + } else { + subst->parent->right = temp; + } - if (subst == subst->parent->left) { - subst->parent->left = temp; + if (subst == node) { + temp->parent = subst->parent; + } else { + if (subst->parent == node) { + temp->parent = subst; } else { - subst->parent->right = temp; + temp->parent = subst->parent; } - if (subst == node) { - temp->parent = subst->parent; + subst->left = node->left; + subst->right = node->right; + subst->parent = node->parent; + rbtree_copy_color(subst, node); + + if (node == *root) { + *root = subst; } else { + if (node == node->parent->left) { + node->parent->left = subst; + } else { + node->parent->right = subst; + } + } - if (subst->parent == node) { - temp->parent = subst; - } else { - temp->parent = subst->parent; - } + if (subst->left != sentinel) { + subst->left->parent = subst; + } - subst->left = node->left; - subst->right = node->right; - subst->parent = node->parent; - rbtree_copy_color(subst, node); - - if (node == *root) { - *root = subst; - } else { - if (node == node->parent->left) { - node->parent->left = subst; - } else { - node->parent->right = subst; - } + if (subst->right != sentinel) { + subst->right->parent = subst; + } + } + + rbtree_node_init(node); + + if (red) { + return; + } + + /* a delete fixup */ + + while (temp != *root && rbtree_is_black(temp)) { + if (temp == temp->parent->left) { + w = temp->parent->right; + + if (rbtree_is_red(w)) { + rbtree_black(w); + rbtree_red(temp->parent); + rbtree_left_rotate(root, sentinel, temp->parent); + w = temp->parent->right; + } + + if (rbtree_is_black(w->left) && rbtree_is_black(w->right)) { + 
rbtree_red(w); + temp = temp->parent; + } else { + if (rbtree_is_black(w->right)) { + rbtree_black(w->left); + rbtree_red(w); + rbtree_right_rotate(root, sentinel, w); + w = temp->parent->right; } - if (subst->left != sentinel) { - subst->left->parent = subst; - } + rbtree_copy_color(w, temp->parent); + rbtree_black(temp->parent); + rbtree_black(w->right); + rbtree_left_rotate(root, sentinel, temp->parent); + temp = *root; + } - if (subst->right != sentinel) { - subst->right->parent = subst; + } else { + w = temp->parent->left; + + if (rbtree_is_red(w)) { + rbtree_black(w); + rbtree_red(temp->parent); + rbtree_right_rotate(root, sentinel, temp->parent); + w = temp->parent->left; + } + + if (rbtree_is_black(w->left) && rbtree_is_black(w->right)) { + rbtree_red(w); + temp = temp->parent; + } else { + if (rbtree_is_black(w->left)) { + rbtree_black(w->right); + rbtree_red(w); + rbtree_left_rotate(root, sentinel, w); + w = temp->parent->left; } - } - - rbtree_node_init(node); - if (red) { - return; + rbtree_copy_color(w, temp->parent); + rbtree_black(temp->parent); + rbtree_black(w->left); + rbtree_right_rotate(root, sentinel, temp->parent); + temp = *root; + } } + } - /* a delete fixup */ - - while (temp != *root && rbtree_is_black(temp)) { - - if (temp == temp->parent->left) { - w = temp->parent->right; - - if (rbtree_is_red(w)) { - rbtree_black(w); - rbtree_red(temp->parent); - rbtree_left_rotate(root, sentinel, temp->parent); - w = temp->parent->right; - } - - if (rbtree_is_black(w->left) && rbtree_is_black(w->right)) { - rbtree_red(w); - temp = temp->parent; - } else { - if (rbtree_is_black(w->right)) { - rbtree_black(w->left); - rbtree_red(w); - rbtree_right_rotate(root, sentinel, w); - w = temp->parent->right; - } - - rbtree_copy_color(w, temp->parent); - rbtree_black(temp->parent); - rbtree_black(w->right); - rbtree_left_rotate(root, sentinel, temp->parent); - temp = *root; - } - - } else { - w = temp->parent->left; - - if (rbtree_is_red(w)) { - rbtree_black(w); 
- rbtree_red(temp->parent); - rbtree_right_rotate(root, sentinel, temp->parent); - w = temp->parent->left; - } - - if (rbtree_is_black(w->left) && rbtree_is_black(w->right)) { - rbtree_red(w); - temp = temp->parent; - } else { - if (rbtree_is_black(w->left)) { - rbtree_black(w->right); - rbtree_red(w); - rbtree_left_rotate(root, sentinel, w); - w = temp->parent->left; - } - - rbtree_copy_color(w, temp->parent); - rbtree_black(temp->parent); - rbtree_black(w->left); - rbtree_right_rotate(root, sentinel, temp->parent); - temp = *root; - } - } - } - - rbtree_black(temp); + rbtree_black(temp); } diff --git a/src/dyn_rbtree.h b/src/dyn_rbtree.h index c2b0ea41c..492fbf824 100644 --- a/src/dyn_rbtree.h +++ b/src/dyn_rbtree.h @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. 
@@ -24,25 +24,25 @@ #define _DYN_RBTREE_ #include "dyn_types.h" -#define rbtree_red(_node) ((_node)->color = 1) -#define rbtree_black(_node) ((_node)->color = 0) -#define rbtree_is_red(_node) ((_node)->color) -#define rbtree_is_black(_node) (!rbtree_is_red(_node)) +#define rbtree_red(_node) ((_node)->color = 1) +#define rbtree_black(_node) ((_node)->color = 0) +#define rbtree_is_red(_node) ((_node)->color) +#define rbtree_is_black(_node) (!rbtree_is_red(_node)) #define rbtree_copy_color(_n1, _n2) ((_n1)->color = (_n2)->color) struct rbnode { - struct rbnode *left; /* left link */ - struct rbnode *right; /* right link */ - struct rbnode *parent; /* parent link */ - msec_t key; /* key for ordering */ - msec_t timeout; /* timeout */ - void *data; /* opaque data */ - uint8_t color; /* red | black */ + struct rbnode *left; /* left link */ + struct rbnode *right; /* right link */ + struct rbnode *parent; /* parent link */ + msec_t key; /* key for ordering */ + msec_t timeout; /* timeout */ + void *data; /* opaque data */ + uint8_t color; /* red | black */ }; struct rbtree { - struct rbnode *root; /* root node */ - struct rbnode *sentinel; /* nil node */ + struct rbnode *root; /* root node */ + struct rbnode *sentinel; /* nil node */ }; void rbtree_node_init(struct rbnode *node); diff --git a/src/dyn_request.c b/src/dyn_request.c index 8d25b2705..aeb6ea190 100644 --- a/src/dyn_request.c +++ b/src/dyn_request.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. 
@@ -21,43 +21,38 @@ */ #include "dyn_core.h" -#include "dyn_server.h" #include "dyn_dnode_peer.h" +#include "dyn_server.h" -struct msg * -req_get(struct conn *conn) -{ - struct msg *req; +struct msg *req_get(struct conn *conn) { + struct msg *req; - ASSERT((conn->type == CONN_CLIENT) || - (conn->type == CONN_DNODE_PEER_CLIENT)); + ASSERT((conn->type == CONN_CLIENT) || (conn->type == CONN_DNODE_PEER_CLIENT)); - req = msg_get(conn, true, __FUNCTION__); - if (req == NULL) { - conn->err = errno; - } + req = msg_get(conn, true, __FUNCTION__); + if (req == NULL) { + conn->err = errno; + } - return req; + return req; } -void -req_put(struct msg *req) -{ - struct msg *rsp; /* peer message (response) */ +void req_put(struct msg *req) { + struct msg *rsp; /* peer message (response) */ - ASSERT(req->is_request); + ASSERT(req->is_request); - rsp = req->selected_rsp; - if (rsp != NULL) { - ASSERT(!rsp->is_request && rsp->peer == req); - req->selected_rsp = NULL; - rsp->peer = NULL; - rsp_put(rsp); - } + rsp = req->selected_rsp; + if (rsp != NULL) { + ASSERT(!rsp->is_request && rsp->peer == req); + req->selected_rsp = NULL; + rsp->peer = NULL; + rsp_put(rsp); + } - msg_tmo_delete(req); + msg_tmo_delete(req); - msg_put(req); + msg_put(req); } /* @@ -67,107 +62,97 @@ req_put(struct msg *req) * A request vector is done if we received responses for all its * fragments. 
*/ -bool -req_done(struct conn *conn, struct msg *req) -{ - struct msg *cmsg; /* current and previous message */ - uint64_t id; /* fragment id */ - uint32_t nfragment; /* # fragment */ - - ASSERT((conn->type == CONN_CLIENT) || - (conn->type == CONN_DNODE_PEER_CLIENT)); +bool req_done(struct conn *conn, struct msg *req) { + struct msg *cmsg; /* current and previous message */ + uint64_t id; /* fragment id */ + uint32_t nfragment; /* # fragment */ - if (req == NULL) - return false; + ASSERT((conn->type == CONN_CLIENT) || (conn->type == CONN_DNODE_PEER_CLIENT)); - ASSERT(req->is_request); - - if (!req->selected_rsp) - return false; - - id = req->frag_id; - if (id == 0) { - return true; - } - - if (req->fdone) { - /* request has already been marked as done */ - return true; - } + if (req == NULL) return false; - struct msg *frag_owner = req->frag_owner; - if (frag_owner->nfrag_done < frag_owner->nfrag) - return false; - - // check all fragments of the given request vector are done. - for (cmsg = TAILQ_PREV(req, msg_tqh, c_tqe); - cmsg != NULL && cmsg->frag_id == id; - cmsg = TAILQ_PREV(cmsg, msg_tqh, c_tqe)) { - - if (!cmsg->selected_rsp) - return false; - } - - for (cmsg = TAILQ_NEXT(req, c_tqe); - cmsg != NULL && cmsg->frag_id == id; - cmsg = TAILQ_NEXT(cmsg, c_tqe)) { - - if (!cmsg->selected_rsp) - return false; - } - - /* - * At this point, all the fragments including the last fragment have - * been received. 
- * - * Mark all fragments of the given request vector to be done to speed up - * future req_done calls for any of fragments of this request - */ - - req->fdone = 1; - nfragment = 0; - - for (cmsg = TAILQ_PREV(req, msg_tqh, c_tqe); - cmsg != NULL && cmsg->frag_id == id; - cmsg = TAILQ_PREV(cmsg, msg_tqh, c_tqe)) { - cmsg->fdone = 1; - nfragment++; - } - - for (cmsg = TAILQ_NEXT(req, c_tqe); - cmsg != NULL && cmsg->frag_id == id; - cmsg = TAILQ_NEXT(cmsg, c_tqe)) { - cmsg->fdone = 1; - nfragment++; - } + ASSERT(req->is_request); - ASSERT(req->frag_owner->nfrag == nfragment); + if (!req->selected_rsp) return false; - g_post_coalesce(req->frag_owner); - - log_debug(LOG_DEBUG, "req from c %d with fid %"PRIu64" and %"PRIu32" " - "fragments is done", conn->sd, id, nfragment); + id = req->frag_id; + if (id == 0) { + return true; + } + if (req->fdone) { + /* request has already been marked as done */ return true; + } + + struct msg *frag_owner = req->frag_owner; + if (frag_owner->nfrag_done < frag_owner->nfrag) return false; + + // check all fragments of the given request vector are done. + for (cmsg = TAILQ_PREV(req, msg_tqh, c_tqe); + cmsg != NULL && cmsg->frag_id == id; + cmsg = TAILQ_PREV(cmsg, msg_tqh, c_tqe)) { + if (!cmsg->selected_rsp) return false; + } + + for (cmsg = TAILQ_NEXT(req, c_tqe); cmsg != NULL && cmsg->frag_id == id; + cmsg = TAILQ_NEXT(cmsg, c_tqe)) { + if (!cmsg->selected_rsp) return false; + } + + /* + * At this point, all the fragments including the last fragment have + * been received. 
+ * + * Mark all fragments of the given request vector to be done to speed up + * future req_done calls for any of fragments of this request + */ + + req->fdone = 1; + nfragment = 0; + + for (cmsg = TAILQ_PREV(req, msg_tqh, c_tqe); + cmsg != NULL && cmsg->frag_id == id; + cmsg = TAILQ_PREV(cmsg, msg_tqh, c_tqe)) { + cmsg->fdone = 1; + nfragment++; + } + + for (cmsg = TAILQ_NEXT(req, c_tqe); cmsg != NULL && cmsg->frag_id == id; + cmsg = TAILQ_NEXT(cmsg, c_tqe)) { + cmsg->fdone = 1; + nfragment++; + } + + ASSERT(req->frag_owner->nfrag == nfragment); + + g_post_coalesce(req->frag_owner); + + log_debug(LOG_DEBUG, + "req from c %d with fid %" PRIu64 " and %" PRIu32 + " " + "fragments is done", + conn->sd, id, nfragment); + + return true; } -rstatus_t -req_make_reply(struct context *ctx, struct conn *conn, struct msg *req) -{ - struct msg *rsp = msg_get(conn, false, __FUNCTION__); - if (rsp == NULL) { - conn->err = errno; - return DN_ENOMEM; - } +rstatus_t req_make_reply(struct context *ctx, struct conn *conn, + struct msg *req) { + struct msg *rsp = msg_get(conn, false, __FUNCTION__); + if (rsp == NULL) { + conn->err = errno; + return DN_ENOMEM; + } - req->selected_rsp = rsp; - rsp->peer = req; - rsp->is_request = 0; + req->selected_rsp = rsp; + rsp->peer = req; + rsp->is_request = 0; - req->done = 1; - conn_enqueue_outq(ctx, conn, req); + req->done = 1; + conn_enqueue_outq(ctx, conn, req); - return DN_OK; + return DN_OK; } /* @@ -177,77 +162,74 @@ req_make_reply(struct context *ctx, struct conn *conn, struct msg *req) * given request. A multiget request is in error if there was an error in * receiving response for any its fragments. 
*/ -bool -req_error(struct conn *conn, struct msg *req) -{ - struct msg *cmsg; /* current message */ - uint64_t id; - uint32_t nfragment; +bool req_error(struct conn *conn, struct msg *req) { + struct msg *cmsg; /* current message */ + uint64_t id; + uint32_t nfragment; - ASSERT(req->is_request && req_done(conn, req)); - - if (req->is_error) { - return true; - } + ASSERT(req->is_request && req_done(conn, req)); - id = req->frag_id; - if (id == 0) { - return false; - } + if (req->is_error) { + return true; + } - if (req->is_ferror) { - /* request has already been marked to be in error */ - return true; - } + id = req->frag_id; + if (id == 0) { + return false; + } - /* check if any of the fragments of the given request are in error */ + if (req->is_ferror) { + /* request has already been marked to be in error */ + return true; + } - for (cmsg = TAILQ_PREV(req, msg_tqh, c_tqe); - cmsg != NULL && cmsg->frag_id == id; - cmsg = TAILQ_PREV(cmsg, msg_tqh, c_tqe)) { + /* check if any of the fragments of the given request are in error */ - if (cmsg->is_error) { - goto ferror; - } + for (cmsg = TAILQ_PREV(req, msg_tqh, c_tqe); + cmsg != NULL && cmsg->frag_id == id; + cmsg = TAILQ_PREV(cmsg, msg_tqh, c_tqe)) { + if (cmsg->is_error) { + goto ferror; } + } - for (cmsg = TAILQ_NEXT(req, c_tqe); - cmsg != NULL && cmsg->frag_id == id; - cmsg = TAILQ_NEXT(cmsg, c_tqe)) { - - if (cmsg->is_error) { - goto ferror; - } + for (cmsg = TAILQ_NEXT(req, c_tqe); cmsg != NULL && cmsg->frag_id == id; + cmsg = TAILQ_NEXT(cmsg, c_tqe)) { + if (cmsg->is_error) { + goto ferror; } + } - return false; + return false; ferror: - /* - * Mark all fragments of the given request to be in error to speed up - * future req_error calls for any of fragments of this request - */ - - req->is_ferror = 1; - nfragment = 1; - - for (cmsg = TAILQ_PREV(req, msg_tqh, c_tqe); - cmsg != NULL && cmsg->frag_id == id; - cmsg = TAILQ_PREV(cmsg, msg_tqh, c_tqe)) { - cmsg->is_ferror = 1; - nfragment++; - } - - for (cmsg = 
TAILQ_NEXT(req, c_tqe); - cmsg != NULL && cmsg->frag_id == id; - cmsg = TAILQ_NEXT(cmsg, c_tqe)) { - cmsg->is_ferror = 1; - nfragment++; - } - - log_debug(LOG_DEBUG, "req from c %d with fid %"PRIu64" and %"PRIu32" " - "fragments is in error", conn->sd, id, nfragment); - - return true; + /* + * Mark all fragments of the given request to be in error to speed up + * future req_error calls for any of fragments of this request + */ + + req->is_ferror = 1; + nfragment = 1; + + for (cmsg = TAILQ_PREV(req, msg_tqh, c_tqe); + cmsg != NULL && cmsg->frag_id == id; + cmsg = TAILQ_PREV(cmsg, msg_tqh, c_tqe)) { + cmsg->is_ferror = 1; + nfragment++; + } + + for (cmsg = TAILQ_NEXT(req, c_tqe); cmsg != NULL && cmsg->frag_id == id; + cmsg = TAILQ_NEXT(cmsg, c_tqe)) { + cmsg->is_ferror = 1; + nfragment++; + } + + log_debug(LOG_DEBUG, + "req from c %d with fid %" PRIu64 " and %" PRIu32 + " " + "fragments is in error", + conn->sd, id, nfragment); + + return true; } diff --git a/src/dyn_response.c b/src/dyn_response.c index d63d7d11e..8a3350300 100644 --- a/src/dyn_response.c +++ b/src/dyn_response.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. 
@@ -23,178 +23,166 @@ #include "dyn_core.h" #include "dyn_server.h" -struct msg * -rsp_get(struct conn *conn) -{ - struct msg *rsp; +struct msg *rsp_get(struct conn *conn) { + struct msg *rsp; - ASSERT((conn->type == CONN_DNODE_PEER_SERVER) || - (conn->type == CONN_SERVER) || - (conn->type == CONN_CLIENT)); + ASSERT((conn->type == CONN_DNODE_PEER_SERVER) || + (conn->type == CONN_SERVER) || (conn->type == CONN_CLIENT)); - rsp = msg_get(conn, false, __FUNCTION__); - if (rsp == NULL) { - conn->err = errno; - } + rsp = msg_get(conn, false, __FUNCTION__); + if (rsp == NULL) { + conn->err = errno; + } - return rsp; + return rsp; } -void -rsp_put(struct msg *rsp) -{ - if (!rsp) - return; - ASSERT(!rsp->is_request); - //ASSERT(rsp->peer == NULL); - msg_put(rsp); +void rsp_put(struct msg *rsp) { + if (!rsp) return; + ASSERT(!rsp->is_request); + // ASSERT(rsp->peer == NULL); + msg_put(rsp); } -static struct msg * -rsp_make_error(struct context *ctx, struct conn *conn, struct msg *req) -{ - struct msg *rsp; /* peer message (response) */ - struct msg *cmsg, *nmsg; /* current and next message (request) */ - uint64_t id; - err_t error_code = 0, dyn_error_code = 0; - - ASSERT((conn->type == CONN_CLIENT) || - (conn->type == CONN_DNODE_PEER_CLIENT)); - ASSERT(req->is_request && req_error(conn, req)); - ASSERT(req->owner == conn); - - // first grab the error from the current req - error_code = req->error_code; - dyn_error_code = req->dyn_error_code; - - id = req->frag_id; - if (id != 0) { - for (cmsg = TAILQ_NEXT(req, c_tqe); - cmsg != NULL && cmsg->frag_id == id; - cmsg = nmsg) { - nmsg = TAILQ_NEXT(cmsg, c_tqe); - - /* dequeue request (error fragment) from client outq */ - conn_dequeue_outq(ctx, conn, cmsg); - if (!error_code && cmsg->error_code != 0) { - error_code = cmsg->error_code; - dyn_error_code = cmsg->dyn_error_code; - } - req_put(cmsg); - } +static struct msg *rsp_make_error(struct context *ctx, struct conn *conn, + struct msg *req) { + struct msg *rsp; /* peer message 
(response) */ + struct msg *cmsg, *nmsg; /* current and next message (request) */ + uint64_t id; + err_t error_code = 0, dyn_error_code = 0; + + ASSERT((conn->type == CONN_CLIENT) || (conn->type == CONN_DNODE_PEER_CLIENT)); + ASSERT(req->is_request && req_error(conn, req)); + ASSERT(req->owner == conn); + + // first grab the error from the current req + error_code = req->error_code; + dyn_error_code = req->dyn_error_code; + + id = req->frag_id; + if (id != 0) { + for (cmsg = TAILQ_NEXT(req, c_tqe); cmsg != NULL && cmsg->frag_id == id; + cmsg = nmsg) { + nmsg = TAILQ_NEXT(cmsg, c_tqe); + + /* dequeue request (error fragment) from client outq */ + conn_dequeue_outq(ctx, conn, cmsg); + if (!error_code && cmsg->error_code != 0) { + error_code = cmsg->error_code; + dyn_error_code = cmsg->dyn_error_code; + } + req_put(cmsg); } + } + + rsp = req->selected_rsp; + if (rsp != NULL) { + if (rsp->is_error) return rsp; + ASSERT(!rsp->is_request && rsp->peer == req); + req->selected_rsp = NULL; + rsp->peer = NULL; + rsp_put(rsp); + } + + return msg_get_error(conn, dyn_error_code, error_code); +} - rsp = req->selected_rsp; - if (rsp != NULL) { - if (rsp->is_error) - return rsp; - ASSERT(!rsp->is_request && rsp->peer == req); - req->selected_rsp = NULL; - rsp->peer = NULL; - rsp_put(rsp); - } +struct msg *rsp_send_next(struct context *ctx, struct conn *conn) { + rstatus_t status; + struct msg *rsp, *req; /* response and it's peer request */ - return msg_get_error(conn, dyn_error_code, error_code); -} + ASSERT_LOG( + (conn->type == CONN_DNODE_PEER_CLIENT) || (conn->type = CONN_CLIENT), + "conn %s", print_obj(conn)); -struct msg * -rsp_send_next(struct context *ctx, struct conn *conn) -{ - rstatus_t status; - struct msg *rsp, *req; /* response and it's peer request */ - - ASSERT_LOG((conn->type == CONN_DNODE_PEER_CLIENT) || - (conn->type = CONN_CLIENT), "conn %s", print_obj(conn)); - - req = TAILQ_FIRST(&conn->omsg_q); - if (req == NULL || !req_done(conn, req)) { - /* nothing is 
outstanding, initiate close? */ - if (req == NULL && conn->eof) { - conn->done = 1; - log_debug(LOG_INFO, "c %d is done", conn->sd); - } - - status = conn_event_del_out(conn); - if (status != DN_OK) { - conn->err = errno; - } - - return NULL; + req = TAILQ_FIRST(&conn->omsg_q); + if (req == NULL || !req_done(conn, req)) { + /* nothing is outstanding, initiate close? */ + if (req == NULL && conn->eof) { + conn->done = 1; + log_debug(LOG_INFO, "c %d is done", conn->sd); } - rsp = conn->smsg; - if (rsp != NULL) { - ASSERT(!rsp->is_request); - ASSERT(rsp->peer != NULL); - req = TAILQ_NEXT(rsp->peer, c_tqe); + status = conn_event_del_out(conn); + if (status != DN_OK) { + conn->err = errno; } - if (req == NULL || !req_done(conn, req)) { - conn->smsg = NULL; - return NULL; + return NULL; + } + + rsp = conn->smsg; + if (rsp != NULL) { + ASSERT(!rsp->is_request); + ASSERT(rsp->peer != NULL); + req = TAILQ_NEXT(rsp->peer, c_tqe); + } + + if (req == NULL || !req_done(conn, req)) { + conn->smsg = NULL; + return NULL; + } + ASSERT(req->is_request && !req->swallow); + + if (req_error(conn, req)) { + rsp = rsp_make_error(ctx, conn, req); + if (rsp == NULL) { + conn->err = errno; + return NULL; } - ASSERT(req->is_request && !req->swallow); - - if (req_error(conn, req)) { - rsp = rsp_make_error(ctx, conn, req); - if (rsp == NULL) { - conn->err = errno; - return NULL; - } - rsp->peer = req; - req->selected_rsp = rsp; - log_debug(LOG_VERB, "creating new error rsp %s", print_obj(rsp)); - if (conn->dyn_mode) { - stats_pool_incr(ctx, peer_forward_error); - } else { - stats_pool_incr(ctx, forward_error); - } + rsp->peer = req; + req->selected_rsp = rsp; + log_debug(LOG_VERB, "creating new error rsp %s", print_obj(rsp)); + if (conn->dyn_mode) { + stats_pool_incr(ctx, peer_forward_error); } else { - rsp = req->selected_rsp; + stats_pool_incr(ctx, forward_error); } - ASSERT(!rsp->is_request); + } else { + rsp = req->selected_rsp; + } + ASSERT(!rsp->is_request); - conn->smsg = rsp; + 
conn->smsg = rsp; - if (log_loggable(LOG_VVERB)) { - log_debug(LOG_VVERB, "send next rsp %"PRIu64" on c %d", rsp->id, conn->sd); - } + if (log_loggable(LOG_VVERB)) { + log_debug(LOG_VVERB, "send next rsp %" PRIu64 " on c %d", rsp->id, + conn->sd); + } - return rsp; + return rsp; } -void -rsp_send_done(struct context *ctx, struct conn *conn, struct msg *rsp) -{ - - ASSERT(conn->type == CONN_CLIENT); - ASSERT(conn->smsg == NULL); - - if (log_loggable(LOG_VVERB)) { - log_debug(LOG_VVERB, "send done rsp %"PRIu64" on c %d", rsp->id, conn->sd); - } - - log_debug(LOG_VERB, "conn %p rsp %p done", conn, rsp); - struct msg *req = rsp->peer; - ASSERT_LOG(req, "response %d does not have a corresponding request", rsp->id); - ASSERT_LOG(!req->rsp_sent, "request %d:%d already had a response sent", - req->id, req->parent_id); - - ASSERT(!rsp->is_request && req->is_request); - ASSERT(req->selected_rsp == rsp); - req->rsp_sent = 1; - - /* dequeue request from client outq */ - conn_dequeue_outq(ctx, conn, req); - - // Remove it from the dict - if (!req->awaiting_rsps) { - log_debug(LOG_VERB, "conn %p removing message %d:%d", conn, req->id, req->parent_id); - dictDelete(conn->outstanding_msgs_dict, &req->id); - req_put(req); - } else { - log_info("req %d:%d still awaiting rsps %d", req->id, req->parent_id, - req->awaiting_rsps); - } +void rsp_send_done(struct context *ctx, struct conn *conn, struct msg *rsp) { + ASSERT(conn->type == CONN_CLIENT); + ASSERT(conn->smsg == NULL); + + if (log_loggable(LOG_VVERB)) { + log_debug(LOG_VVERB, "send done rsp %" PRIu64 " on c %d", rsp->id, + conn->sd); + } + + log_debug(LOG_VERB, "conn %p rsp %p done", conn, rsp); + struct msg *req = rsp->peer; + ASSERT_LOG(req, "response %d does not have a corresponding request", rsp->id); + ASSERT_LOG(!req->rsp_sent, "request %d:%d already had a response sent", + req->id, req->parent_id); + + ASSERT(!rsp->is_request && req->is_request); + ASSERT(req->selected_rsp == rsp); + req->rsp_sent = 1; + + /* dequeue 
request from client outq */ + conn_dequeue_outq(ctx, conn, req); + + // Remove it from the dict + if (!req->awaiting_rsps) { + log_debug(LOG_VERB, "conn %p removing message %d:%d", conn, req->id, + req->parent_id); + dictDelete(conn->outstanding_msgs_dict, &req->id); + req_put(req); + } else { + log_info("req %d:%d still awaiting rsps %d", req->id, req->parent_id, + req->awaiting_rsps); + } } - diff --git a/src/dyn_response_mgr.c b/src/dyn_response_mgr.c index 10ef88390..de553495c 100644 --- a/src/dyn_response_mgr.c +++ b/src/dyn_response_mgr.c @@ -1,40 +1,32 @@ #include "dyn_core.h" -#include "dyn_server.h" #include "dyn_dnode_peer.h" +#include "dyn_server.h" -void -init_response_mgr(struct response_mgr *rspmgr, struct msg *req, bool is_read, - uint8_t max_responses, struct conn *conn) -{ - memset(rspmgr, 0, sizeof(struct response_mgr)); - rspmgr->is_read = is_read; - rspmgr->max_responses = max_responses; - rspmgr->quorum_responses = (uint8_t)(max_responses/2 + 1); - rspmgr->conn = conn; - rspmgr->msg = req; - req->awaiting_rsps = max_responses; +void init_response_mgr(struct response_mgr *rspmgr, struct msg *req, + bool is_read, uint8_t max_responses, struct conn *conn) { + memset(rspmgr, 0, sizeof(struct response_mgr)); + rspmgr->is_read = is_read; + rspmgr->max_responses = max_responses; + rspmgr->quorum_responses = (uint8_t)(max_responses / 2 + 1); + rspmgr->conn = conn; + rspmgr->msg = req; + req->awaiting_rsps = max_responses; } -static bool -rspmgr_is_quorum_achieved(struct response_mgr *rspmgr) -{ - if (rspmgr->quorum_responses == 1 && - rspmgr->good_responses == rspmgr->quorum_responses) - return true; - if (rspmgr->good_responses < rspmgr->quorum_responses) - return false; +static bool rspmgr_is_quorum_achieved(struct response_mgr *rspmgr) { + if (rspmgr->quorum_responses == 1 && + rspmgr->good_responses == rspmgr->quorum_responses) + return true; + if (rspmgr->good_responses < rspmgr->quorum_responses) return false; - uint32_t chk0, chk1, chk2; - chk0 
= rspmgr->checksums[0]; - chk1 = rspmgr->checksums[1]; - if (chk0 == chk1) - return true; - if (rspmgr->good_responses < 3) - return false; - chk2 = rspmgr->checksums[2]; - if ((chk1 == chk2) || (chk0 == chk2)) - return true; - return false; + uint32_t chk0, chk1, chk2; + chk0 = rspmgr->checksums[0]; + chk1 = rspmgr->checksums[1]; + if (chk0 == chk1) return true; + if (rspmgr->good_responses < 3) return false; + chk2 = rspmgr->checksums[2]; + if ((chk1 == chk2) || (chk0 == chk2)) return true; + return false; } // Wait for all responses before responding @@ -52,152 +44,135 @@ rspmgr_check_is_done(struct response_mgr *rspmgr) }*/ // Wait for only quorum number of responses before responding -bool -rspmgr_check_is_done(struct response_mgr *rspmgr) -{ - uint8_t pending_responses = (uint8_t)(rspmgr->max_responses - - rspmgr->good_responses - - rspmgr->error_responses); - // do the required calculation and tell if we are done here - if (rspmgr->good_responses >= rspmgr->quorum_responses) { - // We received enough good responses but do their checksum match? - if (rspmgr_is_quorum_achieved(rspmgr)) { - log_info("req %lu quorum achieved", rspmgr->msg->id); - rspmgr->done = true; - } else if (pending_responses) { - // There's a mismatch in checksum. Wait for any pending responses - rspmgr->done = false; - } else { - // no pending responses, and the checksum do not match. 
- rspmgr->done = true; - } - } else if ((pending_responses + rspmgr->good_responses) < - rspmgr->quorum_responses) { - // Even if we receive all the pending responses, still we do not form - // a quorum, So decision is done, no quorum possible - rspmgr->done = true; +bool rspmgr_check_is_done(struct response_mgr *rspmgr) { + uint8_t pending_responses = (uint8_t)( + rspmgr->max_responses - rspmgr->good_responses - rspmgr->error_responses); + // do the required calculation and tell if we are done here + if (rspmgr->good_responses >= rspmgr->quorum_responses) { + // We received enough good responses but do their checksum match? + if (rspmgr_is_quorum_achieved(rspmgr)) { + log_info("req %lu quorum achieved", rspmgr->msg->id); + rspmgr->done = true; + } else if (pending_responses) { + // There's a mismatch in checksum. Wait for any pending responses + rspmgr->done = false; + } else { + // no pending responses, and the checksum do not match. + rspmgr->done = true; } - return rspmgr->done; + } else if ((pending_responses + rspmgr->good_responses) < + rspmgr->quorum_responses) { + // Even if we receive all the pending responses, still we do not form + // a quorum, So decision is done, no quorum possible + rspmgr->done = true; + } + return rspmgr->done; } -static void -rspmgr_incr_non_quorum_responses_stats(struct response_mgr *rspmgr) -{ - if (rspmgr->is_read) - stats_pool_incr(conn_to_ctx(rspmgr->conn), - client_non_quorum_r_responses); - else - stats_pool_incr(conn_to_ctx(rspmgr->conn), - client_non_quorum_w_responses); - +static void rspmgr_incr_non_quorum_responses_stats( + struct response_mgr *rspmgr) { + if (rspmgr->is_read) + stats_pool_incr(conn_to_ctx(rspmgr->conn), client_non_quorum_r_responses); + else + stats_pool_incr(conn_to_ctx(rspmgr->conn), client_non_quorum_w_responses); } -struct msg* -rspmgr_get_response(struct response_mgr *rspmgr) -{ - // no quorum possible - if (rspmgr->good_responses < rspmgr->quorum_responses) { - ASSERT(rspmgr->err_rsp); - 
rspmgr_incr_non_quorum_responses_stats(rspmgr); - log_error("req: %lu return non quorum error rsp %p good rsp:%u quorum: %u", - rspmgr->msg->id, rspmgr->err_rsp, rspmgr->good_responses, - rspmgr->quorum_responses); - msg_dump(LOG_DEBUG, rspmgr->err_rsp); - return rspmgr->err_rsp; - } - - uint32_t chk0, chk1, chk2; - chk0 = rspmgr->checksums[0]; - chk1 = rspmgr->checksums[1]; - if (chk0 == chk1) { - return rspmgr->responses[0]; - } else if (rspmgr->good_responses == 3) { - chk2 = rspmgr->checksums[2]; - if (chk1 == chk2) - return rspmgr->responses[1]; - else if (chk0 == chk2) - return rspmgr->responses[0]; - } +struct msg *rspmgr_get_response(struct response_mgr *rspmgr) { + // no quorum possible + if (rspmgr->good_responses < rspmgr->quorum_responses) { + ASSERT(rspmgr->err_rsp); rspmgr_incr_non_quorum_responses_stats(rspmgr); - if (log_loggable(LOG_DEBUG)) { - log_error("Request: "); - msg_dump(LOG_DEBUG, rspmgr->msg); - } - if (log_loggable(LOG_VVERB)) { - log_error("Respone 0: "); - msg_dump(LOG_VVERB, rspmgr->responses[0]); - log_error("Respone 1: "); - msg_dump(LOG_VVERB, rspmgr->responses[1]); - if (rspmgr->good_responses == 3) { - log_error("Respone 2: "); - msg_dump(LOG_VVERB, rspmgr->responses[2]); - } - } - return g_reconcile_responses(rspmgr); -} + log_error("req: %lu return non quorum error rsp %p good rsp:%u quorum: %u", + rspmgr->msg->id, rspmgr->err_rsp, rspmgr->good_responses, + rspmgr->quorum_responses); + msg_dump(LOG_DEBUG, rspmgr->err_rsp); + return rspmgr->err_rsp; + } -void -rspmgr_free_other_responses(struct response_mgr *rspmgr, struct msg *dont_free) -{ - int i; - for (i = 0; i < rspmgr->good_responses; i++) { - if (dont_free && (rspmgr->responses[i] == dont_free)) - continue; - rsp_put(rspmgr->responses[i]); - } - if (rspmgr->err_rsp) { - if (dont_free && (dont_free == rspmgr->err_rsp)) - return; - rsp_put(rspmgr->err_rsp); + uint32_t chk0, chk1, chk2; + chk0 = rspmgr->checksums[0]; + chk1 = rspmgr->checksums[1]; + if (chk0 == chk1) { + 
return rspmgr->responses[0]; + } else if (rspmgr->good_responses == 3) { + chk2 = rspmgr->checksums[2]; + if (chk1 == chk2) + return rspmgr->responses[1]; + else if (chk0 == chk2) + return rspmgr->responses[0]; + } + rspmgr_incr_non_quorum_responses_stats(rspmgr); + if (log_loggable(LOG_DEBUG)) { + log_error("Request: "); + msg_dump(LOG_DEBUG, rspmgr->msg); + } + if (log_loggable(LOG_VVERB)) { + log_error("Respone 0: "); + msg_dump(LOG_VVERB, rspmgr->responses[0]); + log_error("Respone 1: "); + msg_dump(LOG_VVERB, rspmgr->responses[1]); + if (rspmgr->good_responses == 3) { + log_error("Respone 2: "); + msg_dump(LOG_VVERB, rspmgr->responses[2]); } + } + return g_reconcile_responses(rspmgr); } -rstatus_t -rspmgr_submit_response(struct response_mgr *rspmgr, struct msg*rsp) -{ - log_info("req %d submitting response %d awaiting_rsps %d", - rspmgr->msg->id, rsp->id, rspmgr->msg->awaiting_rsps); - if (rsp->is_error) { - log_debug(LOG_VERB, "Received error response %d:%d for req %d:%d", - rsp->id, rsp->parent_id, rspmgr->msg->id, rspmgr->msg->parent_id); - rspmgr->error_responses++; - if (rspmgr->err_rsp == NULL) - rspmgr->err_rsp = rsp; - else - rsp_put(rsp); - } else { - rspmgr->checksums[rspmgr->good_responses] = msg_payload_crc32(rsp); - log_debug(LOG_VERB, "Good response %d:%d checksum %u", rsp->id, - rsp->parent_id, rspmgr->checksums[rspmgr->good_responses]); - rspmgr->responses[rspmgr->good_responses++] = rsp; - } - msg_decr_awaiting_rsps(rspmgr->msg); - return DN_OK; +void rspmgr_free_other_responses(struct response_mgr *rspmgr, + struct msg *dont_free) { + int i; + for (i = 0; i < rspmgr->good_responses; i++) { + if (dont_free && (rspmgr->responses[i] == dont_free)) continue; + rsp_put(rspmgr->responses[i]); + } + if (rspmgr->err_rsp) { + if (dont_free && (dont_free == rspmgr->err_rsp)) return; + rsp_put(rspmgr->err_rsp); + } } -rstatus_t -rspmgr_clone_responses(struct response_mgr *rspmgr, struct array *responses) -{ - uint8_t iter = 0; - struct msg *dst = NULL; 
- rstatus_t s = DN_OK; - for(iter = 0; iter < rspmgr->good_responses; iter++) - { - struct msg *src = rspmgr->responses[iter]; - dst = rsp_get(rspmgr->conn); - if (!dst) { - s = DN_ENOMEM; - goto error; - } +rstatus_t rspmgr_submit_response(struct response_mgr *rspmgr, struct msg *rsp) { + log_info("req %d submitting response %d awaiting_rsps %d", rspmgr->msg->id, + rsp->id, rspmgr->msg->awaiting_rsps); + if (rsp->is_error) { + log_debug(LOG_VERB, "Received error response %d:%d for req %d:%d", rsp->id, + rsp->parent_id, rspmgr->msg->id, rspmgr->msg->parent_id); + rspmgr->error_responses++; + if (rspmgr->err_rsp == NULL) + rspmgr->err_rsp = rsp; + else + rsp_put(rsp); + } else { + rspmgr->checksums[rspmgr->good_responses] = msg_payload_crc32(rsp); + log_debug(LOG_VERB, "Good response %d:%d checksum %u", rsp->id, + rsp->parent_id, rspmgr->checksums[rspmgr->good_responses]); + rspmgr->responses[rspmgr->good_responses++] = rsp; + } + msg_decr_awaiting_rsps(rspmgr->msg); + return DN_OK; +} - s = msg_clone(src, STAILQ_FIRST(&src->mhdr), dst); - if (s != DN_OK) - goto error; - struct msg **pdst = (struct msg **)array_push(responses); - *pdst = dst; +rstatus_t rspmgr_clone_responses(struct response_mgr *rspmgr, + struct array *responses) { + uint8_t iter = 0; + struct msg *dst = NULL; + rstatus_t s = DN_OK; + for (iter = 0; iter < rspmgr->good_responses; iter++) { + struct msg *src = rspmgr->responses[iter]; + dst = rsp_get(rspmgr->conn); + if (!dst) { + s = DN_ENOMEM; + goto error; } - return DN_OK; + + s = msg_clone(src, STAILQ_FIRST(&src->mhdr), dst); + if (s != DN_OK) goto error; + struct msg **pdst = (struct msg **)array_push(responses); + *pdst = dst; + } + return DN_OK; error: - rsp_put(dst); - return s; + rsp_put(dst); + return s; } diff --git a/src/dyn_response_mgr.h b/src/dyn_response_mgr.h index 396f8e732..249fdf980 100644 --- a/src/dyn_response_mgr.h +++ b/src/dyn_response_mgr.h @@ -1,32 +1,35 @@ #ifndef _DYN_RESPONSE_MGR_H_ #define _DYN_RESPONSE_MGR_H_ 
-#define MAX_REPLICAS_PER_DC 3 +#define MAX_REPLICAS_PER_DC 3 struct response_mgr { - bool is_read; - bool done; - /* we could use the dynamic array - here. But we have only 3 ASGs */ - struct msg *responses[MAX_REPLICAS_PER_DC]; - uint32_t checksums[MAX_REPLICAS_PER_DC]; - uint8_t good_responses; // non-error responses received. (nil) is not an error - uint8_t max_responses; // max responses expected. - uint8_t quorum_responses; // responses expected to form a quorum - uint8_t error_responses; // error responses received - struct msg *err_rsp; // first error response - struct conn *conn; - struct msg *msg; // corresponding request + bool is_read; + bool done; + /* we could use the dynamic array + here. But we have only 3 ASGs */ + struct msg *responses[MAX_REPLICAS_PER_DC]; + uint32_t checksums[MAX_REPLICAS_PER_DC]; + uint8_t + good_responses; // non-error responses received. (nil) is not an error + uint8_t max_responses; // max responses expected. + uint8_t quorum_responses; // responses expected to form a quorum + uint8_t error_responses; // error responses received + struct msg *err_rsp; // first error response + struct conn *conn; + struct msg *msg; // corresponding request }; -void init_response_mgr(struct response_mgr *rspmgr, struct msg*, bool is_read, +void init_response_mgr(struct response_mgr *rspmgr, struct msg *, bool is_read, uint8_t max_responses, struct conn *conn); // DN_OK if response was accepted rstatus_t rspmgr_submit_response(struct response_mgr *rspmgr, struct msg *rsp); bool rspmgr_check_is_done(struct response_mgr *rspmgr); -struct msg* rspmgr_get_response(struct response_mgr *rspmgr); +struct msg *rspmgr_get_response(struct response_mgr *rspmgr); void rspmgr_free_response(struct response_mgr *rspmgr, struct msg *dont_free); -void rspmgr_free_other_responses(struct response_mgr *rspmgr, struct msg *dont_free); +void rspmgr_free_other_responses(struct response_mgr *rspmgr, + struct msg *dont_free); rstatus_t msg_local_one_rsp_handler(struct 
msg *req, struct msg *rsp); -rstatus_t rspmgr_clone_responses(struct response_mgr *src, struct array *responses); +rstatus_t rspmgr_clone_responses(struct response_mgr *src, + struct array *responses); #endif diff --git a/src/dyn_ring_queue.c b/src/dyn_ring_queue.c index 0ddbce86c..a0107feff 100644 --- a/src/dyn_ring_queue.c +++ b/src/dyn_ring_queue.c @@ -5,187 +5,148 @@ * Author: mdo */ +#include "dyn_ring_queue.h" #include "dyn_array.h" #include "dyn_core.h" #include "dyn_gossip.h" -#include "dyn_ring_queue.h" #include "dyn_token.h" +// should use pooling to store struct ring_message so that we can reuse +struct ring_msg *create_ring_msg(void) { + struct ring_msg *msg = dn_alloc(sizeof(*msg)); -//should use pooling to store struct ring_message so that we can reuse -struct ring_msg * -create_ring_msg(void) -{ - struct ring_msg *msg = dn_alloc(sizeof(*msg)); + if (msg == NULL) return NULL; - if (msg == NULL) - return NULL; + ring_msg_init(msg, 1, true); + msg->data = NULL; - ring_msg_init(msg, 1, true); - msg->data = NULL; - - return msg; + return msg; } +struct ring_msg *create_ring_msg_with_data(uint32_t capacity) { + struct ring_msg *msg = dn_alloc(sizeof(*msg)); -struct ring_msg * -create_ring_msg_with_data(uint32_t capacity) -{ - struct ring_msg *msg = dn_alloc(sizeof(*msg)); - - if (msg == NULL) - return NULL; + if (msg == NULL) return NULL; - rstatus_t status = ring_msg_init(msg, 1, true); - if (status != DN_OK) { - dn_free(msg); - return NULL; - } + rstatus_t status = ring_msg_init(msg, 1, true); + if (status != DN_OK) { + dn_free(msg); + return NULL; + } - msg->data = dn_zalloc(sizeof(uint8_t) * capacity); - msg->capacity = capacity; - msg->len = 0; + msg->data = dn_zalloc(sizeof(uint8_t) * capacity); + msg->capacity = capacity; + msg->len = 0; - return msg; + return msg; } +struct ring_msg *create_ring_msg_with_size(uint32_t size, bool init_node) { + struct ring_msg *msg = dn_alloc(sizeof(*msg)); -struct ring_msg * -create_ring_msg_with_size(uint32_t 
size, bool init_node) -{ - struct ring_msg *msg = dn_alloc(sizeof(*msg)); + if (msg == NULL) return NULL; - if (msg == NULL) - return NULL; + rstatus_t status = ring_msg_init(msg, size, init_node); + if (status != DN_OK) { + dn_free(msg); + return NULL; + } - rstatus_t status = ring_msg_init(msg, size, init_node); - if (status != DN_OK) { - dn_free(msg); - return NULL; - } + msg->data = NULL; + msg->capacity = 0; + msg->len = 0; - msg->data = NULL; - msg->capacity = 0; - msg->len = 0; - - return msg; + return msg; } +rstatus_t ring_msg_init(struct ring_msg *msg, uint32_t n, bool init_node) { + if (msg == NULL) return DN_ERROR; -rstatus_t -ring_msg_init(struct ring_msg *msg, uint32_t n, bool init_node) -{ - if (msg == NULL) - return DN_ERROR; - - rstatus_t status = array_init(&msg->nodes, n, sizeof(struct gossip_node)); - if (status != DN_OK) - return status; + rstatus_t status = array_init(&msg->nodes, n, sizeof(struct gossip_node)); + if (status != DN_OK) return status; - if (init_node) { - uint32_t i; - for(i=0; inodes); - node_init(node); - } - } + if (init_node) { + uint32_t i; + for (i = 0; i < n; i++) { + struct gossip_node *node = array_push(&msg->nodes); + node_init(node); + } + } - return DN_OK; + return DN_OK; } +rstatus_t ring_msg_deinit(struct ring_msg *msg) { + if (msg == NULL) return DN_ERROR; -rstatus_t -ring_msg_deinit(struct ring_msg *msg) -{ - if (msg == NULL) - return DN_ERROR; - - uint32_t i; - for(i=0; inodes); i++) { - struct gossip_node *node = array_get(&msg->nodes, i); - node_deinit(node); - } - array_deinit(&msg->nodes); + uint32_t i; + for (i = 0; i < array_n(&msg->nodes); i++) { + struct gossip_node *node = array_get(&msg->nodes, i); + node_deinit(node); + } + array_deinit(&msg->nodes); - if (msg->data != NULL) { - dn_free(msg->data); - } + if (msg->data != NULL) { + dn_free(msg->data); + } + dn_free(msg); - dn_free(msg); - - return DN_OK; + return DN_OK; } +struct gossip_node *create_node() { + struct gossip_node *result = 
dn_alloc(sizeof(*result)); + node_init(result); - -struct gossip_node * -create_node() -{ - struct gossip_node *result = dn_alloc(sizeof(*result)); - node_init(result); - - return result; + return result; } +rstatus_t node_init(struct gossip_node *node) { + if (node == NULL) return DN_ERROR; -rstatus_t -node_init(struct gossip_node *node) -{ - if (node == NULL) - return DN_ERROR; - - init_dyn_token(&node->token); - string_init(&node->dc); - string_init(&node->rack); - string_init(&node->name); - string_init(&node->pname); + init_dyn_token(&node->token); + string_init(&node->dc); + string_init(&node->rack); + string_init(&node->name); + string_init(&node->pname); - node->port = 8101; + node->port = 8101; - node->is_local = false; - node->state = INIT; + node->is_local = false; + node->state = INIT; - return DN_OK; + return DN_OK; } +rstatus_t node_deinit(struct gossip_node *node) { + if (node == NULL) return DN_ERROR; -rstatus_t -node_deinit(struct gossip_node *node) -{ - if (node == NULL) - return DN_ERROR; + // array_deinit(&node->tokens); + string_deinit(&node->dc); + string_deinit(&node->rack); + string_deinit(&node->name); + string_deinit(&node->pname); + deinit_dyn_token(&node->token); - //array_deinit(&node->tokens); - string_deinit(&node->dc); - string_deinit(&node->rack); - string_deinit(&node->name); - string_deinit(&node->pname); - deinit_dyn_token(&node->token); + // dn_free(node); - //dn_free(node); - - return DN_OK; + return DN_OK; } +rstatus_t node_copy(const struct gossip_node *src, struct gossip_node *dst) { + if (src == NULL || dst == NULL) return DN_ERROR; -rstatus_t -node_copy(const struct gossip_node *src, struct gossip_node *dst) -{ - if (src == NULL || dst == NULL) - return DN_ERROR; - - dst->state = src->state; - dst->is_local = src->is_local; - dst->port = src->port; - dst->is_secure = src->is_secure; - - string_copy(&dst->pname, src->pname.data, src->pname.len); - string_copy(&dst->name, src->name.data, src->name.len); - 
string_copy(&dst->rack, src->rack.data, src->rack.len); - string_copy(&dst->dc, src->dc.data, src->dc.len); + dst->state = src->state; + dst->is_local = src->is_local; + dst->port = src->port; + dst->is_secure = src->is_secure; + string_copy(&dst->pname, src->pname.data, src->pname.len); + string_copy(&dst->name, src->name.data, src->name.len); + string_copy(&dst->rack, src->rack.data, src->rack.len); + string_copy(&dst->dc, src->dc.data, src->dc.len); - copy_dyn_token(&src->token, &dst->token); - return DN_OK; + copy_dyn_token(&src->token, &dst->token); + return DN_OK; } diff --git a/src/dyn_ring_queue.h b/src/dyn_ring_queue.h index 5601dcb24..0617402ae 100644 --- a/src/dyn_ring_queue.h +++ b/src/dyn_ring_queue.h @@ -1,46 +1,40 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. */ - #include "dyn_gossip.h" - #ifndef _DYN_RING_QUEUE_ #define _DYN_RING_QUEUE_ - -#define C2G_InQ_SIZE 256 -#define C2G_OutQ_SIZE 256 +#define C2G_InQ_SIZE 256 +#define C2G_OutQ_SIZE 256 struct gossip_node; typedef rstatus_t (*callback_t)(void *msg); typedef void (*data_func_t)(void *); -volatile struct -{ - long m_getIdx; - long m_putIdx; - void* m_entry[C2G_InQ_SIZE]; +volatile struct { + long m_getIdx; + long m_putIdx; + void *m_entry[C2G_InQ_SIZE]; } C2G_InQ; -volatile struct -{ - long m_getIdx; - long m_putIdx; - void* m_entry[C2G_OutQ_SIZE]; +volatile struct { + long m_getIdx; + long m_putIdx; + void *m_entry[C2G_OutQ_SIZE]; } C2G_OutQ; struct ring_msg { - callback_t cb; - uint8_t *data; /* place holder for a msg */ - uint32_t capacity; /* max capacity */ - uint32_t len; /* # of useful bytes in data (len =< capacity) */ - struct array nodes; - struct server_pool *sp; - + callback_t cb; + uint8_t *data; /* place holder for a msg */ + uint32_t capacity; /* max capacity */ + 
uint32_t len; /* # of useful bytes in data (len =< capacity) */ + struct array nodes; + struct server_pool *sp; }; struct ring_msg *create_ring_msg(void); @@ -49,10 +43,9 @@ struct ring_msg *create_ring_msg_with_size(uint32_t size, bool init_node); rstatus_t ring_msg_init(struct ring_msg *msg, uint32_t size, bool init_node); rstatus_t ring_msg_deinit(struct ring_msg *msg); -struct gossip_node * create_node(void); +struct gossip_node *create_node(void); rstatus_t node_init(struct gossip_node *node); rstatus_t node_deinit(struct gossip_node *node); rstatus_t node_copy(const struct gossip_node *src, struct gossip_node *dst); - #endif diff --git a/src/dyn_server.c b/src/dyn_server.c index 4a25e0a35..c1e89e75f 100644 --- a/src/dyn_server.c +++ b/src/dyn_server.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. 
@@ -23,379 +23,352 @@ #include #include +#include "dyn_conf.h" #include "dyn_core.h" +#include "dyn_dnode_peer.h" #include "dyn_server.h" -#include "dyn_conf.h" #include "dyn_token.h" -#include "dyn_dnode_peer.h" - -static char* -_print_datastore(const struct object *obj) -{ - ASSERT(obj->type == OBJ_DATASTORE); - struct datastore *ds = (struct datastore *)obj; - snprintf(obj->print_buff, PRINT_BUF_SIZE, "", ds, ds->endpoint.pname.len, - ds->endpoint.pname.data); - return obj->print_buff; -} -static void -server_ref(struct conn *conn, void *owner) -{ - struct datastore *datastore = owner; - - ASSERT(conn->type == CONN_SERVER); - ASSERT(conn->owner == NULL); - conn->family = datastore->endpoint.family; - conn->addrlen = datastore->endpoint.addrlen; - conn->addr = datastore->endpoint.addr; - string_duplicate(&conn->pname, &datastore->endpoint.pname); - - conn->owner = datastore; - - log_debug(LOG_VVERB, "ref conn %p owner %p into '%.*s", conn, datastore, - datastore->endpoint.pname.len, datastore->endpoint.pname.data); +static char *_print_datastore(const struct object *obj) { + ASSERT(obj->type == OBJ_DATASTORE); + struct datastore *ds = (struct datastore *)obj; + snprintf(obj->print_buff, PRINT_BUF_SIZE, "", ds, + ds->endpoint.pname.len, ds->endpoint.pname.data); + return obj->print_buff; } +static void server_ref(struct conn *conn, void *owner) { + struct datastore *datastore = owner; -static void -server_unref(struct conn *conn) -{ - struct datastore *server; + ASSERT(conn->type == CONN_SERVER); + ASSERT(conn->owner == NULL); - ASSERT(conn->type == CONN_SERVER); - ASSERT(conn->owner != NULL); + conn->family = datastore->endpoint.family; + conn->addrlen = datastore->endpoint.addrlen; + conn->addr = datastore->endpoint.addr; + string_duplicate(&conn->pname, &datastore->endpoint.pname); - conn_event_del_conn(conn); - server = conn->owner; - conn->owner = NULL; + conn->owner = datastore; - log_debug(LOG_VVERB, "unref conn %p owner %p from '%.*s'", conn, server, - 
server->endpoint.pname.len, server->endpoint.pname.data); + log_debug(LOG_VVERB, "ref conn %p owner %p into '%.*s", conn, datastore, + datastore->endpoint.pname.len, datastore->endpoint.pname.data); } -msec_t -server_timeout(struct conn *conn) -{ - struct datastore *server; - struct server_pool *pool; +static void server_unref(struct conn *conn) { + struct datastore *server; - ASSERT(conn->type == CONN_SERVER); + ASSERT(conn->type == CONN_SERVER); + ASSERT(conn->owner != NULL); - server = conn->owner; - pool = server->owner; + conn_event_del_conn(conn); + server = conn->owner; + conn->owner = NULL; - return pool->timeout; + log_debug(LOG_VVERB, "unref conn %p owner %p from '%.*s'", conn, server, + server->endpoint.pname.len, server->endpoint.pname.data); } -static bool -server_active(struct conn *conn) -{ - ASSERT(conn->type == CONN_SERVER); - - if (!TAILQ_EMPTY(&conn->imsg_q)) { - log_debug(LOG_VVERB, "s %d is active", conn->sd); - return true; - } - - if (!TAILQ_EMPTY(&conn->omsg_q)) { - log_debug(LOG_VVERB, "s %d is active", conn->sd); - return true; - } +msec_t server_timeout(struct conn *conn) { + struct datastore *server; + struct server_pool *pool; - if (conn->rmsg != NULL) { - log_debug(LOG_VVERB, "s %d is active", conn->sd); - return true; - } + ASSERT(conn->type == CONN_SERVER); - if (conn->smsg != NULL) { - log_debug(LOG_VVERB, "s %d is active", conn->sd); - return true; - } + server = conn->owner; + pool = server->owner; - log_debug(LOG_VVERB, "s %d is inactive", conn->sd); - - return false; + return pool->timeout; } -static void -server_deinit(struct datastore *pdatastore) -{ - if (!pdatastore) - return; - if (pdatastore->conn_pool) { - conn_pool_destroy(pdatastore->conn_pool); - pdatastore->conn_pool = NULL; - } -} +static bool server_active(struct conn *conn) { + ASSERT(conn->type == CONN_SERVER); -static struct conn * -server_conn(struct datastore *datastore, int tag) -{ - return conn_pool_get(datastore->conn_pool, tag); -} + if 
(!TAILQ_EMPTY(&conn->imsg_q)) { + log_debug(LOG_VVERB, "s %d is active", conn->sd); + return true; + } + + if (!TAILQ_EMPTY(&conn->omsg_q)) { + log_debug(LOG_VVERB, "s %d is active", conn->sd); + return true; + } + + if (conn->rmsg != NULL) { + log_debug(LOG_VVERB, "s %d is active", conn->sd); + return true; + } + + if (conn->smsg != NULL) { + log_debug(LOG_VVERB, "s %d is active", conn->sd); + return true; + } + + log_debug(LOG_VVERB, "s %d is inactive", conn->sd); + + return false; +} + +static void server_deinit(struct datastore *pdatastore) { + if (!pdatastore) return; + if (pdatastore->conn_pool) { + conn_pool_destroy(pdatastore->conn_pool); + pdatastore->conn_pool = NULL; + } +} + +static struct conn *server_conn(struct datastore *datastore, int tag) { + return conn_pool_get(datastore->conn_pool, tag); +} + +static rstatus_t datastore_preconnect(struct datastore *datastore) { + return conn_pool_preconnect(datastore->conn_pool); +} + +static void server_failure(struct context *ctx, struct datastore *server) { + conn_pool_notify_conn_errored(server->conn_pool); + if (ctx->stats) { + stats_server_set_ts(ctx, server_ejected_at, dn_msec_now()); + stats_pool_incr(ctx, server_ejects); + } +} + +static void server_close_stats(struct context *ctx, struct datastore *server, + err_t err, unsigned eof, unsigned connected) { + if (eof) { + stats_server_incr(ctx, server_eof); + return; + } + + switch (err) { + case ETIMEDOUT: + stats_server_incr(ctx, server_timedout); + break; + case EPIPE: + case ECONNRESET: + case ECONNABORTED: + case ECONNREFUSED: + case ENOTCONN: + case ENETDOWN: + case ENETUNREACH: + case EHOSTDOWN: + case EHOSTUNREACH: + default: + stats_server_incr(ctx, server_err); + break; + } +} + +static void server_ack_err(struct context *ctx, struct conn *conn, + struct msg *req) { + // I want to make sure we do not have swallow here. 
+ // ASSERT_LOG(!req->swallow, "req %d:%d has swallow set??", req->id, + // req->parent_id); + if ((req->swallow && !req->expect_datastore_reply) || + (req->swallow && (req->consistency == DC_ONE)) || + (req->swallow && + ((req->consistency == DC_QUORUM) || + (req->consistency == DC_SAFE_QUORUM)) && + (!conn->same_dc))) { + log_info("%s SWALLOW %s len %" PRIu32, print_obj(conn), print_obj(req), + req->mlen); + req_put(req); + return; + } + struct conn *c_conn = req->owner; + // At other connections, these responses would be swallowed. + ASSERT_LOG( + (c_conn->type == CONN_CLIENT) || (c_conn->type == CONN_DNODE_PEER_CLIENT), + "c_conn %s", print_obj(c_conn)); + + // Create an appropriate response for the request so its propagated up; + // This response gets dropped in rsp_make_error anyways. But since this is + // an error path its ok with the overhead. + struct msg *rsp = msg_get_error(conn, STORAGE_CONNECTION_REFUSE, conn->err); + if (rsp == NULL) { + // TODO: It's not clear how the client should behave if we hit this error + // condition. Return an appropriate error instead. + log_warn("Could not allocate msg for notifying an error to the client."); + return; + } + req->done = 1; + rsp->peer = req; + rsp->is_error = req->is_error = 1; + rsp->error_code = req->error_code = conn->err; + rsp->dyn_error_code = req->dyn_error_code = STORAGE_CONNECTION_REFUSE; + rsp->dmsg = NULL; + log_debug(LOG_DEBUG, "%s <-> %s", print_obj(req), print_obj(rsp)); + + log_info("close %s req %s len %" PRIu32 " from %s %c %s", print_obj(conn), + print_obj(req), req->mlen, print_obj(c_conn), conn->err ? ':' : ' ', + conn->err ? strerror(conn->err) : " "); + rstatus_t status = conn_handle_response( + c_conn, req->parent_id ? 
req->parent_id : req->id, rsp); + IGNORE_RET_VAL(status); + if (req->swallow) req_put(req); +} + +static void server_close(struct context *ctx, struct conn *conn) { + struct msg *req, *nmsg; /* current and next message */ + + ASSERT(conn->type == CONN_SERVER); + struct datastore *datastore = conn->owner; + + if (ctx->stats) { + server_close_stats(ctx, datastore, conn->err, conn->eof, conn->connected); + } + + if (conn->sd < 0) { + conn_unref(conn); + conn_put(conn); + server_failure(ctx, datastore); + return; + } + + uint32_t out_counter = 0; + for (req = TAILQ_FIRST(&conn->omsg_q); req != NULL; req = nmsg) { + nmsg = TAILQ_NEXT(req, s_tqe); + + /* dequeue the message (request) from server outq */ + conn_dequeue_outq(ctx, conn, req); + server_ack_err(ctx, conn, req); + out_counter++; + } + ASSERT(TAILQ_EMPTY(&conn->omsg_q)); + + uint32_t in_counter = 0; + for (req = TAILQ_FIRST(&conn->imsg_q); req != NULL; req = nmsg) { + nmsg = TAILQ_NEXT(req, s_tqe); -static rstatus_t -datastore_preconnect(struct datastore *datastore) -{ - return conn_pool_preconnect(datastore->conn_pool); -} - -static void -server_failure(struct context *ctx, struct datastore *server) -{ - conn_pool_notify_conn_errored(server->conn_pool); - if (ctx->stats) { - stats_server_set_ts(ctx, server_ejected_at, dn_msec_now()); - stats_pool_incr(ctx, server_ejects); - } -} - -static void -server_close_stats(struct context *ctx, struct datastore *server, err_t err, - unsigned eof, unsigned connected) -{ - if (eof) { - stats_server_incr(ctx, server_eof); - return; - } - - switch (err) { - case ETIMEDOUT: - stats_server_incr(ctx, server_timedout); - break; - case EPIPE: - case ECONNRESET: - case ECONNABORTED: - case ECONNREFUSED: - case ENOTCONN: - case ENETDOWN: - case ENETUNREACH: - case EHOSTDOWN: - case EHOSTUNREACH: - default: - stats_server_incr(ctx, server_err); - break; - } -} - -static void -server_ack_err(struct context *ctx, struct conn *conn, struct msg *req) -{ - // I want to make sure we do 
not have swallow here. - //ASSERT_LOG(!req->swallow, "req %d:%d has swallow set??", req->id, req->parent_id); - if ((req->swallow && !req->expect_datastore_reply) || - (req->swallow && (req->consistency == DC_ONE)) || - (req->swallow && ((req->consistency == DC_QUORUM) || (req->consistency == DC_SAFE_QUORUM)) - && (!conn->same_dc))) { - log_info("%s SWALLOW %s len %"PRIu32, print_obj(conn), print_obj(req), req->mlen); - req_put(req); - return; - } - struct conn *c_conn = req->owner; - // At other connections, these responses would be swallowed. - ASSERT_LOG((c_conn->type == CONN_CLIENT) || - (c_conn->type == CONN_DNODE_PEER_CLIENT), "c_conn %s", - print_obj(c_conn)); - - // Create an appropriate response for the request so its propagated up; - // This response gets dropped in rsp_make_error anyways. But since this is - // an error path its ok with the overhead. - struct msg *rsp = msg_get(conn, false, __FUNCTION__); - if (rsp == NULL) { - log_warn("Could not allocate msg."); - return; - } - req->done = 1; - rsp->peer = req; - rsp->is_error = req->is_error = 1; - rsp->error_code = req->error_code = conn->err; - rsp->dyn_error_code = req->dyn_error_code = STORAGE_CONNECTION_REFUSE; - rsp->dmsg = NULL; - log_debug(LOG_DEBUG, "%s <-> %s", print_obj(req), print_obj(rsp)); - - log_info("close %s req %s len %"PRIu32" from %s %c %s", - print_obj(conn), print_obj(req), req->mlen, print_obj(c_conn), conn->err ? ':' : ' ', - conn->err ? strerror(conn->err): " "); - rstatus_t status = - conn_handle_response(c_conn, req->parent_id ? 
req->parent_id : req->id, - rsp); - IGNORE_RET_VAL(status); - if (req->swallow) - req_put(req); -} - -static void -server_close(struct context *ctx, struct conn *conn) -{ - struct msg *req, *nmsg; /* current and next message */ - - ASSERT(conn->type == CONN_SERVER); - struct datastore *datastore = conn->owner; + /* dequeue the message (request) from server inq */ + conn_dequeue_inq(ctx, conn, req); + // We should also remove the req from the timeout rbtree. + msg_tmo_delete(req); + server_ack_err(ctx, conn, req); + in_counter++; - if (ctx->stats) { - server_close_stats(ctx, datastore, conn->err, conn->eof, conn->connected); - } + if (ctx->stats) stats_server_incr(ctx, server_dropped_requests); + } + ASSERT(TAILQ_EMPTY(&conn->imsg_q)); - if (conn->sd < 0) { - conn_unref(conn); - conn_put(conn); - server_failure(ctx, datastore); - return; - } + log_warn("close %s Dropped %u outqueue & %u inqueue requests", + print_obj(conn), out_counter, in_counter); - uint32_t out_counter = 0; - for (req = TAILQ_FIRST(&conn->omsg_q); req != NULL; req = nmsg) { - nmsg = TAILQ_NEXT(req, s_tqe); + struct msg *rsp = conn->rmsg; + if (rsp != NULL) { + conn->rmsg = NULL; - /* dequeue the message (request) from server outq */ - conn_dequeue_outq(ctx, conn, req); - server_ack_err(ctx, conn, req); - out_counter++; - } - ASSERT(TAILQ_EMPTY(&conn->omsg_q)); + ASSERT(!rsp->is_request); + ASSERT(rsp->peer == NULL); - uint32_t in_counter = 0; - for (req = TAILQ_FIRST(&conn->imsg_q); req != NULL; req = nmsg) { - nmsg = TAILQ_NEXT(req, s_tqe); + rsp_put(rsp); - /* dequeue the message (request) from server inq */ - conn_dequeue_inq(ctx, conn, req); - // We should also remove the req from the timeout rbtree. 
- msg_tmo_delete(req); - server_ack_err(ctx, conn, req); - in_counter++; + log_info("close %s discarding rsp %s len %" PRIu32 + " " + "in error", + print_obj(conn), print_obj(rsp), rsp->mlen); + } - if (ctx->stats) stats_server_incr(ctx, server_dropped_requests); - } - ASSERT(TAILQ_EMPTY(&conn->imsg_q)); + ASSERT(conn->smsg == NULL); - log_warn("close %s Dropped %u outqueue & %u inqueue requests", - print_obj(conn), out_counter, in_counter); + conn_unref(conn); - struct msg *rsp = conn->rmsg; - if (rsp != NULL) { - conn->rmsg = NULL; + rstatus_t status = close(conn->sd); + if (status < 0) { + log_error("close s %s failed, ignored: %s", print_obj(conn), + strerror(errno)); + } + conn->sd = -1; - ASSERT(!rsp->is_request); - ASSERT(rsp->peer == NULL); + conn_put(conn); - rsp_put(rsp); + server_failure(ctx, datastore); +} - log_info("close %s discarding rsp %s len %"PRIu32" " - "in error", print_obj(conn), print_obj(rsp), rsp->mlen); - } +static void server_connected(struct context *ctx, struct conn *conn) { + ASSERT(conn->type == CONN_SERVER); + ASSERT(conn->connecting && !conn->connected); - ASSERT(conn->smsg == NULL); + conn->connecting = 0; + conn->connected = 1; + conn_pool_connected(conn->conn_pool, conn); - conn_unref(conn); + log_notice("%s connected ", print_obj(conn)); +} - rstatus_t status = close(conn->sd); - if (status < 0) { - log_error("close s %s failed, ignored: %s", print_obj(conn), strerror(errno)); - } - conn->sd = -1; +static void server_ok(struct context *ctx, struct conn *conn) { + struct datastore *server = conn->owner; - conn_put(conn); + ASSERT(conn->type == CONN_SERVER); + ASSERT(conn->connected); - server_failure(ctx, datastore); + if (log_loggable(LOG_VERB)) { + log_debug(LOG_VERB, + "reset server '%.*s' failure count from %" PRIu32 " to 0", + server->endpoint.pname.len, server->endpoint.pname.data, + server->failure_count); + } + server->failure_count = 0; + server->next_retry_ms = 0ULL; } -static void -server_connected(struct context 
*ctx, struct conn *conn) -{ - ASSERT(conn->type == CONN_SERVER); - ASSERT(conn->connecting && !conn->connected); +static rstatus_t datastore_check_autoeject(struct datastore *datastore) { + struct server_pool *pool = datastore->owner; + if (!pool->auto_eject_hosts) { + return DN_OK; + } - conn->connecting = 0; - conn->connected = 1; - conn_pool_connected(conn->conn_pool, conn); + msec_t now_ms = dn_msec_now(); + if (now_ms == 0) { + return DN_ERROR; + } - log_notice("%s connected ", print_obj(conn)); -} + if (now_ms <= datastore->next_retry_ms) { + errno = ECONNREFUSED; + return DN_ERROR; + } -static void -server_ok(struct context *ctx, struct conn *conn) -{ - struct datastore *server = conn->owner; + return DN_OK; +} - ASSERT(conn->type == CONN_SERVER); - ASSERT(conn->connected); +struct conn *get_datastore_conn(struct context *ctx, struct server_pool *pool, + int tag) { + rstatus_t status; + struct datastore *datastore = pool->datastore; + struct conn *conn; - if (log_loggable(LOG_VERB)) { - log_debug(LOG_VERB, "reset server '%.*s' failure count from %"PRIu32 - " to 0", server->endpoint.pname.len, server->endpoint.pname.data, - server->failure_count); - } - server->failure_count = 0; - server->next_retry_ms = 0ULL; -} + ASSERT(datastore); + status = datastore_check_autoeject(datastore); + if (status != DN_OK) { + return NULL; + } -static rstatus_t -datastore_check_autoeject(struct datastore *datastore) -{ - struct server_pool *pool = datastore->owner; - if (!pool->auto_eject_hosts) { - return DN_OK; - } - - msec_t now_ms = dn_msec_now(); - if (now_ms == 0) { - return DN_ERROR; - } - - if (now_ms <= datastore->next_retry_ms) { - errno = ECONNREFUSED; - return DN_ERROR; - } + /* pick a connection to a given server */ + conn = server_conn(datastore, tag); + if (conn == NULL) { + return NULL; + } - return DN_OK; -} + status = conn_connect(ctx, conn); + if (status != DN_OK) { + conn_close(ctx, conn); + return NULL; + } -struct conn * -get_datastore_conn(struct context 
*ctx, struct server_pool *pool, int tag) -{ - rstatus_t status; - struct datastore *datastore = pool->datastore; - struct conn *conn; - - ASSERT(datastore); - status = datastore_check_autoeject(datastore); - if (status != DN_OK) { - return NULL; - } - - /* pick a connection to a given server */ - conn = server_conn(datastore, tag); - if (conn == NULL) { - return NULL; - } - - status = conn_connect(ctx, conn); - if (status != DN_OK) { - conn_close(ctx, conn); - return NULL; - } - - return conn; + return conn; } -rstatus_t -server_pool_preconnect(struct context *ctx) -{ - if (!ctx->pool.preconnect) { - return DN_OK; - } - return datastore_preconnect(ctx->pool.datastore); +rstatus_t server_pool_preconnect(struct context *ctx) { + if (!ctx->pool.preconnect) { + return DN_OK; + } + return datastore_preconnect(ctx->pool.datastore); } -void -server_pool_disconnect(struct context *ctx) -{ - struct datastore *datastore = ctx->pool.datastore; - if (datastore->conn_pool) { - conn_pool_destroy(datastore->conn_pool); - datastore->conn_pool = NULL; - } +void server_pool_disconnect(struct context *ctx) { + struct datastore *datastore = ctx->pool.datastore; + if (datastore->conn_pool) { + conn_pool_destroy(datastore->conn_pool); + datastore->conn_pool = NULL; + } } /** @@ -405,110 +378,107 @@ server_pool_disconnect(struct context *ctx) * @param[in] ctx Context. * @return rstatus_t Return status code. 
*/ -rstatus_t -server_pool_init(struct server_pool *sp, struct conf_pool *cp, struct context *ctx) -{ - ASSERT(cp->valid); - - memset(sp, 0, sizeof(struct server_pool)); - init_object(&sp->object, OBJ_POOL, print_server_pool); - sp->ctx = ctx; - sp->p_conn = NULL; - TAILQ_INIT(&sp->c_conn_q); - TAILQ_INIT(&sp->ready_conn_q); - - array_null(&sp->datacenters); - /* sp->ncontinuum = 0; */ - /* sp->nserver_continuum = 0; */ - /* sp->continuum = NULL; */ - sp->next_rebuild = 0ULL; - - sp->name = cp->name; - sp->proxy_endpoint.pname = cp->listen.pname; - sp->proxy_endpoint.port = (uint16_t)cp->listen.port; - - sp->proxy_endpoint.family = cp->listen.info.family; - sp->proxy_endpoint.addrlen = cp->listen.info.addrlen; - sp->proxy_endpoint.addr = (struct sockaddr *)&cp->listen.info.addr; - - sp->key_hash_type = cp->hash; - sp->key_hash = get_hash_func(cp->hash); - sp->hash_tag = cp->hash_tag; - - g_data_store = cp->data_store; - if ((g_data_store != DATA_REDIS) && - (g_data_store != DATA_MEMCACHE)) { - log_error("Invalid datastore in conf file"); - return DN_ERROR; - } - set_datastore_ops(); - sp->timeout = cp->timeout; - sp->backlog = cp->backlog; - - sp->client_connections = (uint32_t)cp->client_connections; - - sp->server_retry_timeout_ms = cp->server_retry_timeout_ms; - sp->server_failure_limit = (uint8_t)cp->server_failure_limit; - sp->auto_eject_hosts = cp->auto_eject_hosts ? 1 : 0; - sp->preconnect = cp->preconnect ? 
1 : 0; - - sp->datastore = dn_zalloc(sizeof(*sp->datastore)); - init_object(&(sp->datastore->obj), OBJ_DATASTORE, _print_datastore); - THROW_STATUS(conf_datastore_transform(sp->datastore, cp, cp->conf_datastore)); - sp->datastore->owner = sp; - log_debug(LOG_DEBUG, "init datastore in pool '%.*s'", - sp->name.len, sp->name.data); - - /* dynomite init */ - sp->seed_provider = cp->dyn_seed_provider; - sp->dnode_proxy_endpoint.pname = cp->dyn_listen.pname; - sp->dnode_proxy_endpoint.port = (uint16_t)cp->dyn_listen.port; - sp->dnode_proxy_endpoint.family = cp->dyn_listen.info.family; - sp->dnode_proxy_endpoint.addrlen = cp->dyn_listen.info.addrlen; - sp->dnode_proxy_endpoint.addr = (struct sockaddr *)&cp->dyn_listen.info.addr; - sp->max_local_peer_connections = cp->local_peer_connections; - sp->max_remote_peer_connections = cp->remote_peer_connections; - sp->rack = cp->rack; - sp->dc = cp->dc; - sp->tokens = cp->tokens; - sp->env = cp->env; - sp->enable_gossip = cp->enable_gossip; - - /* dynomite stats init */ - sp->stats_endpoint.pname = cp->stats_listen.pname; - sp->stats_endpoint.port = (uint16_t)cp->stats_listen.port; - sp->stats_endpoint.family = cp->stats_listen.info.family; - sp->stats_endpoint.addrlen = cp->stats_listen.info.addrlen; - sp->stats_endpoint.addr = (struct sockaddr *)&cp->stats_listen.info.addr; - sp->stats_interval = cp->stats_interval; - sp->mbuf_size = cp->mbuf_size; - sp->alloc_msgs_max = cp->alloc_msgs_max; - - sp->secure_server_option = get_secure_server_option(&cp->secure_server_option); - sp->pem_key_file = cp->pem_key_file; - sp->recon_key_file = cp->recon_key_file; - sp->recon_iv_file = cp->recon_iv_file; - - array_null(&sp->peers); - array_init(&sp->datacenters, 1, sizeof(struct datacenter)); - sp->conf_pool = cp; - - /* gossip */ - sp->g_interval = cp->gos_interval; - - set_msgs_per_sec(cp->conn_msg_rate); - - log_debug(LOG_VERB, "transform to pool '%.*s'", sp->name.len, sp->name.data); - - - sp->ctx = ctx; - struct datastore *datastore 
= sp->datastore; - datastore->conn_pool = conn_pool_create(ctx, datastore, - datastore->max_connections, - init_server_conn, sp->server_failure_limit, - sp->server_retry_timeout_ms/1000); - log_debug(LOG_DEBUG, "Initialized server pool"); - return DN_OK; +rstatus_t server_pool_init(struct server_pool *sp, struct conf_pool *cp, + struct context *ctx) { + ASSERT(cp->valid); + + memset(sp, 0, sizeof(struct server_pool)); + init_object(&sp->object, OBJ_POOL, print_server_pool); + sp->ctx = ctx; + sp->p_conn = NULL; + TAILQ_INIT(&sp->c_conn_q); + TAILQ_INIT(&sp->ready_conn_q); + + array_null(&sp->datacenters); + /* sp->ncontinuum = 0; */ + /* sp->nserver_continuum = 0; */ + /* sp->continuum = NULL; */ + sp->next_rebuild = 0ULL; + + sp->name = cp->name; + sp->proxy_endpoint.pname = cp->listen.pname; + sp->proxy_endpoint.port = (uint16_t)cp->listen.port; + + sp->proxy_endpoint.family = cp->listen.info.family; + sp->proxy_endpoint.addrlen = cp->listen.info.addrlen; + sp->proxy_endpoint.addr = (struct sockaddr *)&cp->listen.info.addr; + + sp->key_hash_type = cp->hash; + sp->key_hash = get_hash_func(cp->hash); + sp->hash_tag = cp->hash_tag; + + g_data_store = cp->data_store; + if ((g_data_store != DATA_REDIS) && (g_data_store != DATA_MEMCACHE)) { + log_error("Invalid datastore in conf file"); + return DN_ERROR; + } + set_datastore_ops(); + sp->timeout = cp->timeout; + sp->backlog = cp->backlog; + + sp->client_connections = (uint32_t)cp->client_connections; + + sp->server_retry_timeout_ms = cp->server_retry_timeout_ms; + sp->server_failure_limit = (uint8_t)cp->server_failure_limit; + sp->auto_eject_hosts = cp->auto_eject_hosts ? 1 : 0; + sp->preconnect = cp->preconnect ? 
1 : 0; + + sp->datastore = dn_zalloc(sizeof(*sp->datastore)); + init_object(&(sp->datastore->obj), OBJ_DATASTORE, _print_datastore); + THROW_STATUS(conf_datastore_transform(sp->datastore, cp, cp->conf_datastore)); + sp->datastore->owner = sp; + log_debug(LOG_DEBUG, "init datastore in pool '%.*s'", sp->name.len, + sp->name.data); + + /* dynomite init */ + sp->seed_provider = cp->dyn_seed_provider; + sp->dnode_proxy_endpoint.pname = cp->dyn_listen.pname; + sp->dnode_proxy_endpoint.port = (uint16_t)cp->dyn_listen.port; + sp->dnode_proxy_endpoint.family = cp->dyn_listen.info.family; + sp->dnode_proxy_endpoint.addrlen = cp->dyn_listen.info.addrlen; + sp->dnode_proxy_endpoint.addr = (struct sockaddr *)&cp->dyn_listen.info.addr; + sp->max_local_peer_connections = cp->local_peer_connections; + sp->max_remote_peer_connections = cp->remote_peer_connections; + sp->rack = cp->rack; + sp->dc = cp->dc; + sp->tokens = cp->tokens; + sp->env = cp->env; + sp->enable_gossip = cp->enable_gossip; + + /* dynomite stats init */ + sp->stats_endpoint.pname = cp->stats_listen.pname; + sp->stats_endpoint.port = (uint16_t)cp->stats_listen.port; + sp->stats_endpoint.family = cp->stats_listen.info.family; + sp->stats_endpoint.addrlen = cp->stats_listen.info.addrlen; + sp->stats_endpoint.addr = (struct sockaddr *)&cp->stats_listen.info.addr; + sp->stats_interval = cp->stats_interval; + sp->mbuf_size = cp->mbuf_size; + sp->alloc_msgs_max = cp->alloc_msgs_max; + + sp->secure_server_option = + get_secure_server_option(&cp->secure_server_option); + sp->pem_key_file = cp->pem_key_file; + sp->recon_key_file = cp->recon_key_file; + sp->recon_iv_file = cp->recon_iv_file; + + array_null(&sp->peers); + array_init(&sp->datacenters, 1, sizeof(struct datacenter)); + sp->conf_pool = cp; + + /* gossip */ + sp->g_interval = cp->gos_interval; + + set_msgs_per_sec(cp->conn_msg_rate); + + log_debug(LOG_VERB, "transform to pool '%.*s'", sp->name.len, sp->name.data); + + sp->ctx = ctx; + struct datastore *datastore 
= sp->datastore; + datastore->conn_pool = conn_pool_create( + ctx, datastore, datastore->max_connections, init_server_conn, + sp->server_failure_limit, sp->server_retry_timeout_ms / 1000); + log_debug(LOG_DEBUG, "Initialized server pool"); + return DN_OK; } /** @@ -516,529 +486,495 @@ server_pool_init(struct server_pool *sp, struct conf_pool *cp, struct context *c * data store and setting the number of live backend servers to 0. * @param[in,out] sp Server pool. */ -void -server_pool_deinit(struct server_pool *sp) -{ - ASSERT(sp->p_conn == NULL); - ASSERT(TAILQ_EMPTY(&sp->c_conn_q)); - - server_deinit(sp->datastore); - dn_free(sp->datastore); - sp->datastore = NULL; - log_debug(LOG_DEBUG, "deinit pool '%.*s'", sp->name.len, sp->name.data); -} +void server_pool_deinit(struct server_pool *sp) { + ASSERT(sp->p_conn == NULL); + ASSERT(TAILQ_EMPTY(&sp->c_conn_q)); + server_deinit(sp->datastore); + dn_free(sp->datastore); + sp->datastore = NULL; + log_debug(LOG_DEBUG, "deinit pool '%.*s'", sp->name.len, sp->name.data); +} dictType dc_string_dict_type = { - dict_string_hash, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - dict_string_key_compare, /* key compare */ - dict_string_destructor, /* key destructor */ - NULL /* val destructor */ + dict_string_hash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dict_string_key_compare, /* key compare */ + dict_string_destructor, /* key destructor */ + NULL /* val destructor */ }; +static rstatus_t rack_init(struct rack *rack) { + rack->continuum = dn_alloc(sizeof(struct continuum)); + rack->ncontinuum = 0; + rack->nserver_continuum = 0; + rack->name = dn_alloc(sizeof(struct string)); + string_init(rack->name); -static rstatus_t -rack_init(struct rack *rack) -{ - rack->continuum = dn_alloc(sizeof(struct continuum)); - rack->ncontinuum = 0; - rack->nserver_continuum = 0; - rack->name = dn_alloc(sizeof(struct string)); - string_init(rack->name); - - rack->dc = dn_alloc(sizeof(struct string)); - 
string_init(rack->dc); + rack->dc = dn_alloc(sizeof(struct string)); + string_init(rack->dc); - return DN_OK; + return DN_OK; } +static rstatus_t rack_deinit(struct rack *rack) { + if (rack->continuum != NULL) { + dn_free(rack->continuum); + } -static rstatus_t -rack_deinit(struct rack *rack) -{ - if (rack->continuum != NULL) { - dn_free(rack->continuum); - } - - return DN_OK; + return DN_OK; } +static rstatus_t dc_init(struct datacenter *dc) { + rstatus_t status; -static rstatus_t -dc_init(struct datacenter *dc) -{ - rstatus_t status; - - dc->dict_rack = dictCreate(&dc_string_dict_type, NULL); - dc->name = dn_alloc(sizeof(struct string)); - string_init(dc->name); - dc->preselected_rack_for_replication = NULL; + dc->dict_rack = dictCreate(&dc_string_dict_type, NULL); + dc->name = dn_alloc(sizeof(struct string)); + string_init(dc->name); + dc->preselected_rack_for_replication = NULL; - status = array_init(&dc->racks, 3, sizeof(struct rack)); + status = array_init(&dc->racks, 3, sizeof(struct rack)); - return status; + return status; } -static rstatus_t -rack_destroy(void *elem) -{ - struct rack *rack = elem; - return rack_deinit(rack); +static rstatus_t rack_destroy(void *elem) { + struct rack *rack = elem; + return rack_deinit(rack); } -static rstatus_t -dc_deinit(struct datacenter *dc) -{ - array_each(&dc->racks, rack_destroy); - string_deinit(dc->name); - //dictRelease(dc->dict_rack); - return DN_OK; +static rstatus_t dc_deinit(struct datacenter *dc) { + array_each(&dc->racks, rack_destroy); + string_deinit(dc->name); + // dictRelease(dc->dict_rack); + return DN_OK; } -rstatus_t -datacenter_destroy(void *elem, void *data) -{ - struct datacenter *dc = elem; - dc_deinit(dc); +rstatus_t datacenter_destroy(void *elem, void *data) { + struct datacenter *dc = elem; + dc_deinit(dc); - return DN_OK; + return DN_OK; } +struct datacenter *server_get_dc(struct server_pool *pool, + struct string *dcname) { + struct datacenter *dc; + uint32_t i, len; -struct datacenter * 
-server_get_dc(struct server_pool *pool, struct string *dcname) -{ - struct datacenter *dc; - uint32_t i, len; + if (log_loggable(LOG_DEBUG)) { + log_debug(LOG_DEBUG, "server_get_dc dc '%.*s'", dcname->len, dcname->data); + } - if (log_loggable(LOG_DEBUG)) { - log_debug(LOG_DEBUG, "server_get_dc dc '%.*s'", - dcname->len, dcname->data); - } + for (i = 0, len = array_n(&pool->datacenters); i < len; i++) { + dc = (struct datacenter *)array_get(&pool->datacenters, i); + ASSERT(dc != NULL); + ASSERT(dc->name != NULL); - for (i = 0, len = array_n(&pool->datacenters); i < len; i++) { - dc = (struct datacenter *) array_get(&pool->datacenters, i); - ASSERT(dc != NULL); - ASSERT(dc->name != NULL); - - if (string_compare(dc->name, dcname) == 0) { - return dc; - } - } + if (string_compare(dc->name, dcname) == 0) { + return dc; + } + } - dc = array_push(&pool->datacenters); - dc_init(dc); - string_copy(dc->name, dcname->data, dcname->len); + dc = array_push(&pool->datacenters); + dc_init(dc); + string_copy(dc->name, dcname->data, dcname->len); - if (log_loggable(LOG_DEBUG)) { - log_debug(LOG_DEBUG, "server_get_dc about to exit dc '%.*s'", - dc->name->len, dc->name->data); - } + if (log_loggable(LOG_DEBUG)) { + log_debug(LOG_DEBUG, "server_get_dc about to exit dc '%.*s'", dc->name->len, + dc->name->data); + } - return dc; + return dc; } +struct rack *server_get_rack(struct datacenter *dc, struct string *rackname) { + ASSERT(dc != NULL); + ASSERT(dc->dict_rack != NULL); + ASSERT(dc->name != NULL); -struct rack * -server_get_rack(struct datacenter *dc, struct string *rackname) -{ - ASSERT(dc != NULL); - ASSERT(dc->dict_rack != NULL); - ASSERT(dc->name != NULL); - - if (log_loggable(LOG_DEBUG)) { - log_debug(LOG_DEBUG, "server_get_rack '%.*s'", rackname->len, rackname->data); - } - /* - struct rack *rack = dictFetchValue(dc->dict_rack, rackname); - if (rack == NULL) { - rack = array_push(&dc->racks); - rack_init(rack); - string_copy(rack->name, rackname->data, rackname->len); - 
string_copy(rack->dc, dc->name->data, dc->name->len); - rack->continuum = dn_alloc(sizeof(struct continuum)); - - dictAdd(dc->dict_rack, rackname, rack); - } - */ - - struct rack *rack; - uint32_t i, len; - for (i = 0, len = array_n(&dc->racks); i < len; i++) { - rack = (struct rack *) array_get(&dc->racks, i); - - if (string_compare(rack->name, rackname) == 0) { - return rack; - } - } - - rack = array_push(&dc->racks); - rack_init(rack); - string_copy(rack->name, rackname->data, rackname->len); - string_copy(rack->dc, dc->name->data, dc->name->len); - - if (log_loggable(LOG_DEBUG)) { - log_debug(LOG_DEBUG, "server_get_rack exiting '%.*s'", - rack->name->len, rack->name->data); - } - - return rack; -} - + if (log_loggable(LOG_DEBUG)) { + log_debug(LOG_DEBUG, "server_get_rack '%.*s'", rackname->len, + rackname->data); + } + /* +struct rack *rack = dictFetchValue(dc->dict_rack, rackname); +if (rack == NULL) { +rack = array_push(&dc->racks); +rack_init(rack); +string_copy(rack->name, rackname->data, rackname->len); +string_copy(rack->dc, dc->name->data, dc->name->len); +rack->continuum = dn_alloc(sizeof(struct continuum)); -struct rack * -server_get_rack_by_dc_rack(struct server_pool *sp, struct string *rackname, struct string *dcname) -{ - struct datacenter *dc = server_get_dc(sp, dcname); - return server_get_rack(dc, rackname); + dictAdd(dc->dict_rack, rackname, rack); } + */ -struct msg * -rsp_recv_next(struct context *ctx, struct conn *conn, bool alloc) -{ - struct msg *rsp; + struct rack *rack; + uint32_t i, len; + for (i = 0, len = array_n(&dc->racks); i < len; i++) { + rack = (struct rack *)array_get(&dc->racks, i); - ASSERT((conn->type == CONN_DNODE_PEER_SERVER) || - (conn->type == CONN_SERVER)); - - if (conn->eof) { - rsp = conn->rmsg; + if (string_compare(rack->name, rackname) == 0) { + return rack; + } + } - /* server sent eof before sending the entire request */ - if (rsp != NULL) { - conn->rmsg = NULL; + rack = array_push(&dc->racks); + rack_init(rack); + 
string_copy(rack->name, rackname->data, rackname->len); + string_copy(rack->dc, dc->name->data, dc->name->len); - ASSERT(rsp->peer == NULL); - ASSERT(!rsp->is_request); + if (log_loggable(LOG_DEBUG)) { + log_debug(LOG_DEBUG, "server_get_rack exiting '%.*s'", rack->name->len, + rack->name->data); + } - log_error("%s EOF discarding incomplete rsp %s len %"PRIu32, print_obj(conn), - print_obj(rsp), rsp->mlen); + return rack; +} - rsp_put(rsp); - } +struct rack *server_get_rack_by_dc_rack(struct server_pool *sp, + struct string *rackname, + struct string *dcname) { + struct datacenter *dc = server_get_dc(sp, dcname); + return server_get_rack(dc, rackname); +} - /* - * We treat TCP half-close from a server different from how we treat - * those from a client. On a FIN from a server, we close the connection - * immediately by sending the second FIN even if there were outstanding - * or pending requests. This is actually a tricky part in the FA, as - * we don't expect this to happen unless the server is misbehaving or - * it crashes - */ - conn->done = 1; - log_debug(LOG_DEBUG, "s %d active %d is done", conn->sd, conn_active(conn)); +struct msg *rsp_recv_next(struct context *ctx, struct conn *conn, bool alloc) { + struct msg *rsp; - return NULL; - } + ASSERT((conn->type == CONN_DNODE_PEER_SERVER) || (conn->type == CONN_SERVER)); + if (conn->eof) { rsp = conn->rmsg; - if (rsp != NULL) { - ASSERT(!rsp->is_request); - return rsp; - } - - if (!alloc) { - return NULL; - } - rsp = rsp_get(conn); + /* server sent eof before sending the entire request */ if (rsp != NULL) { - conn->rmsg = rsp; - } - - return rsp; -} + conn->rmsg = NULL; -static bool -server_rsp_filter(struct context *ctx, struct conn *conn, struct msg *rsp) -{ - struct msg *req; + ASSERT(rsp->peer == NULL); + ASSERT(!rsp->is_request); - ASSERT(conn->type == CONN_SERVER); + log_error("%s EOF discarding incomplete rsp %s len %" PRIu32, + print_obj(conn), print_obj(rsp), rsp->mlen); - if (msg_empty(rsp)) { - 
ASSERT(conn->rmsg == NULL); - log_debug(LOG_VERB, "filter empty rsp %"PRIu64" on s %d", rsp->id, - conn->sd); - rsp_put(rsp); - return true; + rsp_put(rsp); } - req= TAILQ_FIRST(&conn->omsg_q); - if (req== NULL) { - log_debug(LOG_VERB, "filter stray rsp %"PRIu64" len %"PRIu32" on s %d", - rsp->id, rsp->mlen, conn->sd); - rsp_put(rsp); - return true; - } - - if (!req->expect_datastore_reply) { - conn_dequeue_outq(ctx, conn, req); - req_put(req); - rsp_put(rsp); - return true; - } + /* + * We treat TCP half-close from a server different from how we treat + * those from a client. On a FIN from a server, we close the connection + * immediately by sending the second FIN even if there were outstanding + * or pending requests. This is actually a tricky part in the FA, as + * we don't expect this to happen unless the server is misbehaving or + * it crashes + */ + conn->done = 1; + log_debug(LOG_DEBUG, "s %d active %d is done", conn->sd, conn_active(conn)); - ASSERT(req->is_request); + return NULL; + } - if (req->swallow) { - conn_dequeue_outq(ctx, conn, req); - req->done = 1; + rsp = conn->rmsg; + if (rsp != NULL) { + ASSERT(!rsp->is_request); + return rsp; + } - log_debug(LOG_DEBUG, "swallow rsp %"PRIu64" len %"PRIu32" of req " - "%"PRIu64" on s %d", rsp->id, rsp->mlen, req->id, - conn->sd); + if (!alloc) { + return NULL; + } - rsp_put(rsp); - req_put(req); - return true; - } + rsp = rsp_get(conn); + if (rsp != NULL) { + conn->rmsg = rsp; + } - return false; + return rsp; } -static void -server_rsp_forward_stats(struct context *ctx, struct msg *rsp) -{ - ASSERT(!rsp->is_request); - - if (rsp->is_read) { - stats_server_incr(ctx, read_responses); - stats_server_incr_by(ctx, read_response_bytes, rsp->mlen); - } else { - stats_server_incr(ctx, write_responses); - stats_server_incr_by(ctx, write_response_bytes, rsp->mlen); - } -} +static bool server_rsp_filter(struct context *ctx, struct conn *conn, + struct msg *rsp) { + struct msg *req; -static void 
-server_rsp_forward(struct context *ctx, struct conn *s_conn, struct msg *rsp) -{ - rstatus_t status; - struct msg *req; - struct conn *c_conn; - ASSERT(s_conn->type == CONN_SERVER); - - /* response from server implies that server is ok and heartbeating */ - server_ok(ctx, s_conn); - - /* dequeue peer message (request) from server */ - req = TAILQ_FIRST(&s_conn->omsg_q); - ASSERT(req->is_request); - if (req->request_send_time) { - struct stats *st = ctx->stats; - uint64_t delay = dn_usec_now() - req->request_send_time; - histo_add(&st->server_latency_histo, delay); - } - conn_dequeue_outq(ctx, s_conn, req); - - c_conn = req->owner; - log_info("%s %s RECEIVED %s", print_obj(c_conn), print_obj(req), print_obj(rsp)); - - ASSERT((c_conn->type == CONN_CLIENT) || - (c_conn->type == CONN_DNODE_PEER_CLIENT)); - - server_rsp_forward_stats(ctx, rsp); - // handler owns the response now - status = conn_handle_response(c_conn, req->id, rsp); - IGNORE_RET_VAL(status); -} + ASSERT(conn->type == CONN_SERVER); -static void -rsp_recv_done(struct context *ctx, struct conn *conn, struct msg *rsp, - struct msg *nmsg) -{ - ASSERT(conn->type == CONN_SERVER); - ASSERT(rsp != NULL && conn->rmsg == rsp); - ASSERT(!rsp->is_request); - ASSERT(rsp->owner == conn); - ASSERT(nmsg == NULL || !nmsg->is_request); + if (msg_empty(rsp)) { + ASSERT(conn->rmsg == NULL); + log_debug(LOG_VERB, "filter empty rsp %" PRIu64 " on s %d", rsp->id, + conn->sd); + rsp_put(rsp); + return true; + } - /* enqueue next message (response), if any */ - conn->rmsg = nmsg; + req = TAILQ_FIRST(&conn->omsg_q); + if (req == NULL) { + log_debug(LOG_VERB, "filter stray rsp %" PRIu64 " len %" PRIu32 " on s %d", + rsp->id, rsp->mlen, conn->sd); + rsp_put(rsp); + return true; + } - if (server_rsp_filter(ctx, conn, rsp)) { - return; - } - server_rsp_forward(ctx, conn, rsp); -} - -struct msg * -req_send_next(struct context *ctx, struct conn *conn) -{ - rstatus_t status; - struct msg *req, *nmsg; /* current and next message */ - - 
ASSERT((conn->type == CONN_SERVER) || - (conn->type == CONN_DNODE_PEER_SERVER)); - - if (conn->connecting) { - if (conn->type == CONN_SERVER) { - server_connected(ctx, conn); - } else if (conn->type == CONN_DNODE_PEER_SERVER) { - dnode_peer_connected(ctx, conn); - } - } + if (!req->expect_datastore_reply) { + conn_dequeue_outq(ctx, conn, req); + req_put(req); + rsp_put(rsp); + return true; + } - nmsg = TAILQ_FIRST(&conn->imsg_q); - if (nmsg == NULL) { - /* nothing to send as the server inq is empty */ - status = conn_event_del_out(conn); - if (status != DN_OK) { - conn->err = errno; - } - - return NULL; - } - - req = conn->smsg; - if (req != NULL) { - ASSERT(req->is_request && !req->done); - nmsg = TAILQ_NEXT(req, s_tqe); - } + ASSERT(req->is_request); - conn->smsg = nmsg; + if (req->swallow) { + conn_dequeue_outq(ctx, conn, req); + req->done = 1; - if (nmsg == NULL) { - return NULL; + log_debug(LOG_DEBUG, + "swallow rsp %" PRIu64 " len %" PRIu32 + " of req " + "%" PRIu64 " on s %d", + rsp->id, rsp->mlen, req->id, conn->sd); + + rsp_put(rsp); + req_put(req); + return true; + } + + return false; +} + +static void server_rsp_forward_stats(struct context *ctx, struct msg *rsp) { + ASSERT(!rsp->is_request); + + if (rsp->is_read) { + stats_server_incr(ctx, read_responses); + stats_server_incr_by(ctx, read_response_bytes, rsp->mlen); + } else { + stats_server_incr(ctx, write_responses); + stats_server_incr_by(ctx, write_response_bytes, rsp->mlen); + } +} + +static void server_rsp_forward(struct context *ctx, struct conn *s_conn, + struct msg *rsp) { + rstatus_t status; + struct msg *req; + struct conn *c_conn; + ASSERT(s_conn->type == CONN_SERVER); + + /* response from server implies that server is ok and heartbeating */ + server_ok(ctx, s_conn); + + /* dequeue peer message (request) from server */ + req = TAILQ_FIRST(&s_conn->omsg_q); + ASSERT(req->is_request); + if (req->request_send_time) { + struct stats *st = ctx->stats; + uint64_t delay = dn_usec_now() - 
req->request_send_time; + histo_add(&st->server_latency_histo, delay); + } + conn_dequeue_outq(ctx, s_conn, req); + + c_conn = req->owner; + log_info("%s %s RECEIVED %s", print_obj(c_conn), print_obj(req), + print_obj(rsp)); + + ASSERT((c_conn->type == CONN_CLIENT) || + (c_conn->type == CONN_DNODE_PEER_CLIENT)); + + server_rsp_forward_stats(ctx, rsp); + // handler owns the response now + status = conn_handle_response(c_conn, req->id, rsp); + IGNORE_RET_VAL(status); +} + +static void rsp_recv_done(struct context *ctx, struct conn *conn, + struct msg *rsp, struct msg *nmsg) { + ASSERT(conn->type == CONN_SERVER); + ASSERT(rsp != NULL && conn->rmsg == rsp); + ASSERT(!rsp->is_request); + ASSERT(rsp->owner == conn); + ASSERT(nmsg == NULL || !nmsg->is_request); + + /* enqueue next message (response), if any */ + conn->rmsg = nmsg; + + if (server_rsp_filter(ctx, conn, rsp)) { + return; + } + server_rsp_forward(ctx, conn, rsp); +} + +struct msg *req_send_next(struct context *ctx, struct conn *conn) { + rstatus_t status; + struct msg *req, *nmsg; /* current and next message */ + + ASSERT((conn->type == CONN_SERVER) || (conn->type == CONN_DNODE_PEER_SERVER)); + + if (conn->connecting) { + if (conn->type == CONN_SERVER) { + server_connected(ctx, conn); + } else if (conn->type == CONN_DNODE_PEER_SERVER) { + dnode_peer_connected(ctx, conn); } - - ASSERT(nmsg->is_request && !nmsg->done); - - if (log_loggable(LOG_VVERB)) { - log_debug(LOG_VVERB, "send next req %"PRIu64" len %"PRIu32" type %d on " - "s %d", nmsg->id, nmsg->mlen, nmsg->type, conn->sd); + } + + nmsg = TAILQ_FIRST(&conn->imsg_q); + if (nmsg == NULL) { + /* nothing to send as the server inq is empty */ + status = conn_event_del_out(conn); + if (status != DN_OK) { + conn->err = errno; } - return nmsg; -} + return NULL; + } -void -req_send_done(struct context *ctx, struct conn *conn, struct msg *req) -{ - ASSERT((conn->type == CONN_SERVER) || - (conn->type == CONN_DNODE_PEER_SERVER)); - ASSERT(req != NULL && conn->smsg 
== NULL); + req = conn->smsg; + if (req != NULL) { ASSERT(req->is_request && !req->done); - //ASSERT(req->owner == conn); - - if (log_loggable(LOG_VVERB)) { - log_debug(LOG_VVERB, "send done req %"PRIu64" len %"PRIu32" type %d on " - "s %d", req->id, req->mlen, req->type, conn->sd); - } - - /* dequeue the message (request) from server inq */ - conn_dequeue_inq(ctx, conn, req); - req->request_send_time = dn_usec_now(); - - /* - * expect_datastore_reply request instructs the server to send response. So, - * enqueue message (request) in server outq, if response is expected. - * Otherwise, free the request - */ - if (req->expect_datastore_reply || (conn->type == CONN_SERVER)) - conn_enqueue_outq(ctx, conn, req); - else - req_put(req); -} - -static void -req_server_enqueue_imsgq(struct context *ctx, struct conn *conn, struct msg *req) -{ - ASSERT(req->is_request); - ASSERT(conn->type == CONN_SERVER); - req->request_inqueue_enqueue_time_us = dn_usec_now(); - - /* - * timeout clock starts ticking the instant the message is enqueued into - * the server in_q; the clock continues to tick until it either expires - * or the message is dequeued from the server out_q - * - * expect_datastore_reply request have timeouts because client is expecting - * a response - */ - if (req->expect_datastore_reply) { - msg_tmo_insert(req, conn); - } - - TAILQ_INSERT_TAIL(&conn->imsg_q, req, s_tqe); - log_debug(LOG_VERB, "conn %p enqueue inq %d:%d", conn, req->id, req->parent_id); - - histo_add(&ctx->stats->server_in_queue, TAILQ_COUNT(&conn->imsg_q)); - stats_server_incr(ctx, in_queue); - stats_server_incr_by(ctx, in_queue_bytes, req->mlen); -} - -static void -req_server_dequeue_imsgq(struct context *ctx, struct conn *conn, struct msg *req) -{ - ASSERT(req->is_request); - ASSERT(conn->type == CONN_SERVER); - - TAILQ_REMOVE(&conn->imsg_q, req, s_tqe); - log_debug(LOG_VERB, "conn %p dequeue inq %d:%d", conn, req->id, req->parent_id); - usec_t delay = dn_usec_now() - 
req->request_inqueue_enqueue_time_us; - histo_add(&ctx->stats->server_queue_wait_time_histo, delay); - - histo_add(&ctx->stats->server_in_queue, TAILQ_COUNT(&conn->imsg_q)); - stats_server_decr(ctx, in_queue); - stats_server_decr_by(ctx, in_queue_bytes, req->mlen); -} - -static void -req_server_enqueue_omsgq(struct context *ctx, struct conn *conn, struct msg *req) -{ - ASSERT(req->is_request); - ASSERT(conn->type == CONN_SERVER); - - TAILQ_INSERT_TAIL(&conn->omsg_q, req, s_tqe); - log_debug(LOG_VERB, "conn %p enqueue outq %d:%d", conn, req->id, req->parent_id); - - histo_add(&ctx->stats->server_out_queue, TAILQ_COUNT(&conn->omsg_q)); - stats_server_incr(ctx, out_queue); - stats_server_incr_by(ctx, out_queue_bytes, req->mlen); -} - -static void -req_server_dequeue_omsgq(struct context *ctx, struct conn *conn, struct msg *req) -{ - ASSERT(req->is_request); - ASSERT(conn->type == CONN_SERVER); - - msg_tmo_delete(req); - - TAILQ_REMOVE(&conn->omsg_q, req, s_tqe); - log_debug(LOG_VERB, "conn %p dequeue outq %d:%d", conn, req->id, req->parent_id); - - histo_add(&ctx->stats->server_out_queue, TAILQ_COUNT(&conn->omsg_q)); - stats_server_decr(ctx, out_queue); - stats_server_decr_by(ctx, out_queue_bytes, req->mlen); -} - -struct conn_ops server_ops = { - msg_recv, - rsp_recv_next, - rsp_recv_done, - msg_send, - req_send_next, - req_send_done, - server_close, - server_active, - server_ref, - server_unref, - req_server_enqueue_imsgq, - req_server_dequeue_imsgq, - req_server_enqueue_omsgq, - req_server_dequeue_omsgq, - conn_cant_handle_response -}; - -void -init_server_conn(struct conn *conn) -{ - conn->dyn_mode = 0; - conn->type = CONN_SERVER; - conn->ops = &server_ops; + nmsg = TAILQ_NEXT(req, s_tqe); + } + + conn->smsg = nmsg; + + if (nmsg == NULL) { + return NULL; + } + + ASSERT(nmsg->is_request && !nmsg->done); + + if (log_loggable(LOG_VVERB)) { + log_debug(LOG_VVERB, + "send next req %" PRIu64 " len %" PRIu32 + " type %d on " + "s %d", + nmsg->id, nmsg->mlen, nmsg->type, 
conn->sd); + } + + return nmsg; +} + +void req_send_done(struct context *ctx, struct conn *conn, struct msg *req) { + ASSERT((conn->type == CONN_SERVER) || (conn->type == CONN_DNODE_PEER_SERVER)); + ASSERT(req != NULL && conn->smsg == NULL); + ASSERT(req->is_request && !req->done); + // ASSERT(req->owner == conn); + + if (log_loggable(LOG_VVERB)) { + log_debug(LOG_VVERB, + "send done req %" PRIu64 " len %" PRIu32 + " type %d on " + "s %d", + req->id, req->mlen, req->type, conn->sd); + } + + /* dequeue the message (request) from server inq */ + conn_dequeue_inq(ctx, conn, req); + req->request_send_time = dn_usec_now(); + + /* + * expect_datastore_reply request instructs the server to send response. So, + * enqueue message (request) in server outq, if response is expected. + * Otherwise, free the request + */ + if (req->expect_datastore_reply || (conn->type == CONN_SERVER)) + conn_enqueue_outq(ctx, conn, req); + else + req_put(req); +} + +static void req_server_enqueue_imsgq(struct context *ctx, struct conn *conn, + struct msg *req) { + ASSERT(req->is_request); + ASSERT(conn->type == CONN_SERVER); + req->request_inqueue_enqueue_time_us = dn_usec_now(); + + /* + * timeout clock starts ticking the instant the message is enqueued into + * the server in_q; the clock continues to tick until it either expires + * or the message is dequeued from the server out_q + * + * expect_datastore_reply request have timeouts because client is expecting + * a response + */ + if (req->expect_datastore_reply) { + msg_tmo_insert(req, conn); + } + + TAILQ_INSERT_TAIL(&conn->imsg_q, req, s_tqe); + log_debug(LOG_VERB, "conn %p enqueue inq %d:%d", conn, req->id, + req->parent_id); + + histo_add(&ctx->stats->server_in_queue, TAILQ_COUNT(&conn->imsg_q)); + stats_server_incr(ctx, in_queue); + stats_server_incr_by(ctx, in_queue_bytes, req->mlen); +} + +static void req_server_dequeue_imsgq(struct context *ctx, struct conn *conn, + struct msg *req) { + ASSERT(req->is_request); + ASSERT(conn->type 
== CONN_SERVER); + + TAILQ_REMOVE(&conn->imsg_q, req, s_tqe); + log_debug(LOG_VERB, "conn %p dequeue inq %d:%d", conn, req->id, + req->parent_id); + usec_t delay = dn_usec_now() - req->request_inqueue_enqueue_time_us; + histo_add(&ctx->stats->server_queue_wait_time_histo, delay); + + histo_add(&ctx->stats->server_in_queue, TAILQ_COUNT(&conn->imsg_q)); + stats_server_decr(ctx, in_queue); + stats_server_decr_by(ctx, in_queue_bytes, req->mlen); +} + +static void req_server_enqueue_omsgq(struct context *ctx, struct conn *conn, + struct msg *req) { + ASSERT(req->is_request); + ASSERT(conn->type == CONN_SERVER); + + TAILQ_INSERT_TAIL(&conn->omsg_q, req, s_tqe); + log_debug(LOG_VERB, "conn %p enqueue outq %d:%d", conn, req->id, + req->parent_id); + + histo_add(&ctx->stats->server_out_queue, TAILQ_COUNT(&conn->omsg_q)); + stats_server_incr(ctx, out_queue); + stats_server_incr_by(ctx, out_queue_bytes, req->mlen); +} + +static void req_server_dequeue_omsgq(struct context *ctx, struct conn *conn, + struct msg *req) { + ASSERT(req->is_request); + ASSERT(conn->type == CONN_SERVER); + + msg_tmo_delete(req); + + TAILQ_REMOVE(&conn->omsg_q, req, s_tqe); + log_debug(LOG_VERB, "conn %p dequeue outq %d:%d", conn, req->id, + req->parent_id); + + histo_add(&ctx->stats->server_out_queue, TAILQ_COUNT(&conn->omsg_q)); + stats_server_decr(ctx, out_queue); + stats_server_decr_by(ctx, out_queue_bytes, req->mlen); +} + +struct conn_ops server_ops = {msg_recv, + rsp_recv_next, + rsp_recv_done, + msg_send, + req_send_next, + req_send_done, + server_close, + server_active, + server_ref, + server_unref, + req_server_enqueue_imsgq, + req_server_dequeue_imsgq, + req_server_enqueue_omsgq, + req_server_dequeue_omsgq, + conn_cant_handle_response}; + +void init_server_conn(struct conn *conn) { + conn->dyn_mode = 0; + conn->type = CONN_SERVER; + conn->ops = &server_ops; } diff --git a/src/dyn_server.h b/src/dyn_server.h index 6e3e9c7cb..f2e524b8c 100644 --- a/src/dyn_server.h +++ b/src/dyn_server.h @@ 
-1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -23,8 +23,14 @@ #ifndef _DYN_SERVER_H_ #define _DYN_SERVER_H_ -#include "dyn_core.h" #include "dyn_dict.h" +#include "dyn_types.h" + +// Forward declarations +struct conf_pool; +struct context; +struct datastore; +struct server_pool; /* * server_pool is a collection of servers and their continuum. Each @@ -65,14 +71,17 @@ * // */ - msec_t server_timeout(struct conn *conn); rstatus_t server_init(struct server_pool *sp, struct array *conf_server); -rstatus_t server_connect(struct context *ctx, struct datastore *server, struct conn *conn); +rstatus_t server_connect(struct context *ctx, struct datastore *server, + struct conn *conn); -struct datacenter *server_get_dc(struct server_pool *pool, struct string *dcname); +struct datacenter *server_get_dc(struct server_pool *pool, + struct string *dcname); struct rack *server_get_rack(struct datacenter *dc, struct string *rackname); -struct rack *server_get_rack_by_dc_rack(struct server_pool *sp, struct string *rackname, struct string *dcname); +struct rack *server_get_rack_by_dc_rack(struct server_pool *sp, + struct string *rackname, + struct string *dcname); rstatus_t datacenter_destroy(void *elem, void *data); @@ -80,7 +89,8 @@ struct conn *get_datastore_conn(struct context *ctx, struct server_pool *pool, int tag); rstatus_t server_pool_preconnect(struct context *ctx); void server_pool_disconnect(struct context *ctx); -rstatus_t server_pool_init(struct server_pool *server_pool, struct conf_pool *conf_pool, struct context *ctx); +rstatus_t server_pool_init(struct server_pool *server_pool, + struct conf_pool *conf_pool, struct context *ctx); void server_pool_deinit(struct 
server_pool *server_pool); void init_server_conn(struct conn *conn); diff --git a/src/dyn_setting.c b/src/dyn_setting.c index 3adc02492..93b9cb035 100644 --- a/src/dyn_setting.c +++ b/src/dyn_setting.c @@ -1,9 +1,8 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. Licensed under the Apache License, + * Version 2.0 (the "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * @@ -14,19 +13,14 @@ * limitations under the License. */ - -#include "dyn_core.h" #include "dyn_conf.h" +#include "dyn_core.h" -static uint32_t conn_msg_rate = CONF_DEFAULT_CONN_MSG_RATE; //conn msgs per sec - +static uint32_t conn_msg_rate = CONF_DEFAULT_CONN_MSG_RATE; // conn msgs per + // sec -uint32_t msgs_per_sec(void) -{ - return conn_msg_rate; -} +uint32_t msgs_per_sec(void) { return conn_msg_rate; } -void set_msgs_per_sec(uint32_t tokens_per_sec) -{ - conn_msg_rate = tokens_per_sec; +void set_msgs_per_sec(uint32_t tokens_per_sec) { + conn_msg_rate = tokens_per_sec; } diff --git a/src/dyn_setting.h b/src/dyn_setting.h index a2222b05d..20508dc27 100644 --- a/src/dyn_setting.h +++ b/src/dyn_setting.h @@ -1,9 +1,8 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. Licensed under the Apache License, + * Version 2.0 (the "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * @@ -14,14 +13,10 @@ * limitations under the License. */ - - #ifndef _DYN_SETTING_H_ #define _DYN_SETTING_H_ - uint32_t msgs_per_sec(void); void set_msgs_per_sec(uint32_t tokens_per_sec); - #endif diff --git a/src/dyn_signal.c b/src/dyn_signal.c index 88a26f6c8..53963605e 100644 --- a/src/dyn_signal.c +++ b/src/dyn_signal.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -20,122 +20,113 @@ * limitations under the License. 
*/ -#include #include +#include #include "dyn_core.h" #include "dyn_signal.h" static struct signal signals[] = { - { SIGUSR1, "SIGUSR1", 0, signal_handler }, - { SIGUSR2, "SIGUSR2", 0, signal_handler }, - { SIGTTIN, "SIGTTIN", 0, signal_handler }, - { SIGTTOU, "SIGTTOU", 0, signal_handler }, - { SIGHUP, "SIGHUP", 0, signal_handler }, - { SIGINT, "SIGINT", 0, signal_handler }, - { SIGSEGV, "SIGSEGV", (int)SA_RESETHAND, signal_handler }, - { SIGPIPE, "SIGPIPE", 0, SIG_IGN }, - { 0, NULL, 0, NULL } -}; + {SIGUSR1, "SIGUSR1", 0, signal_handler}, + {SIGUSR2, "SIGUSR2", 0, signal_handler}, + {SIGTTIN, "SIGTTIN", 0, signal_handler}, + {SIGTTOU, "SIGTTOU", 0, signal_handler}, + {SIGHUP, "SIGHUP", 0, signal_handler}, + {SIGINT, "SIGINT", 0, signal_handler}, + {SIGSEGV, "SIGSEGV", (int)SA_RESETHAND, signal_handler}, + {SIGPIPE, "SIGPIPE", 0, SIG_IGN}, + {0, NULL, 0, NULL}}; /** * Initialize the list of POSIX signals that dynomite can respond to and map * each signal to a handler function. * @return status */ -rstatus_t -signal_init(void) -{ - struct signal *sig; - - for (sig = signals; sig->signo != 0; sig++) { - rstatus_t status; - struct sigaction sa; - - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = sig->handler; - sa.sa_flags = sig->flags; - sigemptyset(&sa.sa_mask); - - status = sigaction(sig->signo, &sa, NULL); - if (status < 0) { - log_error("sigaction(%s) failed: %s", sig->signame, - strerror(errno)); - return DN_ERROR; - } +rstatus_t signal_init(void) { + struct signal *sig; + + for (sig = signals; sig->signo != 0; sig++) { + rstatus_t status; + struct sigaction sa; + + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = sig->handler; + sa.sa_flags = sig->flags; + sigemptyset(&sa.sa_mask); + + status = sigaction(sig->signo, &sa, NULL); + if (status < 0) { + log_error("sigaction(%s) failed: %s", sig->signame, strerror(errno)); + return DN_ERROR; } + } - return DN_OK; + return DN_OK; } -void -signal_deinit(void) -{ -} +void signal_deinit(void) {} -void -signal_handler(int 
signo) -{ - struct signal *sig; - void (*action)(void); - char *actionstr; - bool done; - - for (sig = signals; sig->signo != 0; sig++) { - if (sig->signo == signo) { - break; - } +void signal_handler(int signo) { + struct signal *sig; + void (*action)(void); + char *actionstr; + bool done; + + for (sig = signals; sig->signo != 0; sig++) { + if (sig->signo == signo) { + break; } - ASSERT(sig->signo != 0); + } + ASSERT(sig->signo != 0); - actionstr = ""; - action = NULL; - done = false; + actionstr = ""; + action = NULL; + done = false; - switch (signo) { + switch (signo) { case SIGUSR1: - break; + break; case SIGUSR2: - break; + break; case SIGTTIN: - actionstr = ", up logging level"; - action = log_level_up; - break; + actionstr = ", up logging level"; + action = log_level_up; + break; case SIGTTOU: - actionstr = ", down logging level"; - action = log_level_down; - break; + actionstr = ", down logging level"; + action = log_level_down; + break; case SIGHUP: - actionstr = ", reopening log file"; - action = log_reopen; - break; + actionstr = ", reopening log file"; + action = log_reopen; + break; case SIGINT: - done = true; - actionstr = ", exiting"; - break; + done = true; + actionstr = ", exiting"; + break; case SIGSEGV: - dn_stacktrace(1); - actionstr = ", core dumping"; - raise(SIGSEGV); - break; + dn_stacktrace(1); + actionstr = ", core dumping"; + raise(SIGSEGV); + break; default: - NOT_REACHED(); - } + NOT_REACHED(); + } - loga("signal %d (%s) received%s", signo, sig->signame, actionstr); + loga("signal %d (%s) received%s", signo, sig->signame, actionstr); - if (action != NULL) { - action(); - } + if (action != NULL) { + action(); + } - if (done) { - exit(1); - } + if (done) { + exit(1); + } } diff --git a/src/dyn_signal.h b/src/dyn_signal.h index 3667e9210..b023f6245 100644 --- a/src/dyn_signal.h +++ b/src/dyn_signal.h @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. 
- * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -20,21 +20,19 @@ * limitations under the License. */ - -#include "dyn_core.h" - #ifndef _DYN_SIGNAL_H_ #define _DYN_SIGNAL_H_ +#include "dyn_types.h" /** * @brief POSIX signal */ struct signal { - int signo; - char *signame; - int flags; - void (*handler)(int signo); + int signo; + char *signame; + int flags; + void (*handler)(int signo); }; rstatus_t signal_init(void); diff --git a/src/dyn_stats.c b/src/dyn_stats.c index eada3be68..92b91e51e 100644 --- a/src/dyn_stats.c +++ b/src/dyn_stats.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. 
@@ -24,1869 +24,1746 @@ #include #include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include "dyn_conf.h" +#include "dyn_connection.h" #include "dyn_core.h" +#include "dyn_gossip.h" #include "dyn_histogram.h" -#include "dyn_server.h" #include "dyn_node_snitch.h" #include "dyn_ring_queue.h" -#include "dyn_gossip.h" -#include "dyn_connection.h" -#include "dyn_conf.h" +#include "dyn_server.h" struct stats_desc { - char *name; /* stats name */ - char *desc; /* stats description */ + char *name; /* stats name */ + char *desc; /* stats description */ }; -#define DEFINE_ACTION(_name, _type, _desc) { .type = _type, .name = string(#_name) }, +#define DEFINE_ACTION(_name, _type, _desc) \ + {.type = _type, .name = string(#_name)}, static struct stats_metric stats_pool_codec[] = { - STATS_POOL_CODEC( DEFINE_ACTION ) -}; + STATS_POOL_CODEC(DEFINE_ACTION)}; static struct stats_metric stats_server_codec[] = { - STATS_SERVER_CODEC( DEFINE_ACTION ) -}; + STATS_SERVER_CODEC(DEFINE_ACTION)}; #undef DEFINE_ACTION -#define DEFINE_ACTION(_name, _type, _desc) { .name = #_name, .desc = _desc }, -static struct stats_desc stats_pool_desc[] = { - STATS_POOL_CODEC( DEFINE_ACTION ) -}; +#define DEFINE_ACTION(_name, _type, _desc) {.name = #_name, .desc = _desc}, +static struct stats_desc stats_pool_desc[] = {STATS_POOL_CODEC(DEFINE_ACTION)}; static struct stats_desc stats_server_desc[] = { - STATS_SERVER_CODEC( DEFINE_ACTION ) -}; + STATS_SERVER_CODEC(DEFINE_ACTION)}; #undef DEFINE_ACTION -#define MAX_HTTP_HEADER_SIZE 1024 -static struct string header_str = string("HTTP/1.1 200 OK \nContent-Type: application/json; charset=utf-8 \nContent-Length:"); -//static struct string endline = string("\r\n"); +#define MAX_HTTP_HEADER_SIZE 1024 +static struct string header_str = string( + "HTTP/1.1 200 OK \nContent-Type: application/json; charset=utf-8 " + "\nContent-Length:"); +// static struct string endline = string("\r\n"); static struct string ok = 
string("OK\r\n"); static struct string err_resp = string("ERR"); static struct string all = string("all"); -void -stats_describe(void) -{ - uint32_t i; +void stats_describe(void) { + uint32_t i; - log_stderr("pool stats:"); - for (i = 0; i < NELEMS(stats_pool_desc); i++) { - log_stderr(" %-20s\"%s\"", stats_pool_desc[i].name, - stats_pool_desc[i].desc); - } + log_stderr("pool stats:"); + for (i = 0; i < NELEMS(stats_pool_desc); i++) { + log_stderr(" %-20s\"%s\"", stats_pool_desc[i].name, + stats_pool_desc[i].desc); + } - log_stderr(""); + log_stderr(""); - log_stderr("server stats:"); - for (i = 0; i < NELEMS(stats_server_desc); i++) { - log_stderr(" %-20s\"%s\"", stats_server_desc[i].name, - stats_server_desc[i].desc); - } + log_stderr("server stats:"); + for (i = 0; i < NELEMS(stats_server_desc); i++) { + log_stderr(" %-20s\"%s\"", stats_server_desc[i].name, + stats_server_desc[i].desc); + } } -static void -stats_metric_init(struct stats_metric *stm) -{ - switch (stm->type) { +static void stats_metric_init(struct stats_metric *stm) { + switch (stm->type) { case STATS_COUNTER: - stm->value.counter = 0LL; - break; + stm->value.counter = 0LL; + break; case STATS_GAUGE: - stm->value.counter = 0LL; - break; + stm->value.counter = 0LL; + break; case STATS_TIMESTAMP: - stm->value.timestamp = 0LL; - break; + stm->value.timestamp = 0LL; + break; case STATS_STRING: - string_deinit(&stm->value.str); // first free the existing data - string_init(&stm->value.str); - break; + string_deinit(&stm->value.str); // first free the existing data + string_init(&stm->value.str); + break; default: - NOT_REACHED(); - } + NOT_REACHED(); + } } -static void -stats_metric_reset(struct array *stats_metric) -{ - uint32_t i, nmetric; +static void stats_metric_reset(struct array *stats_metric) { + uint32_t i, nmetric; - nmetric = array_n(stats_metric); - ASSERT(nmetric == STATS_POOL_NFIELD || nmetric == STATS_SERVER_NFIELD); + nmetric = array_n(stats_metric); + ASSERT(nmetric == 
STATS_POOL_NFIELD || nmetric == STATS_SERVER_NFIELD); - for (i = 0; i < nmetric; i++) { - struct stats_metric *stm = array_get(stats_metric, i); + for (i = 0; i < nmetric; i++) { + struct stats_metric *stm = array_get(stats_metric, i); - stats_metric_init(stm); - } + stats_metric_init(stm); + } } -static rstatus_t -stats_pool_metric_init(struct array *stats_metric) -{ - uint32_t i, nfield = STATS_POOL_NFIELD; +static rstatus_t stats_pool_metric_init(struct array *stats_metric) { + uint32_t i, nfield = STATS_POOL_NFIELD; - THROW_STATUS(array_init(stats_metric, nfield, sizeof(struct stats_metric))); + THROW_STATUS(array_init(stats_metric, nfield, sizeof(struct stats_metric))); - for (i = 0; i < nfield; i++) { - struct stats_metric *stm = array_push(stats_metric); + for (i = 0; i < nfield; i++) { + struct stats_metric *stm = array_push(stats_metric); - /* initialize from pool codec first */ - *stm = stats_pool_codec[i]; + /* initialize from pool codec first */ + *stm = stats_pool_codec[i]; - /* initialize individual metric */ - stats_metric_init(stm); - } + /* initialize individual metric */ + stats_metric_init(stm); + } - return DN_OK; + return DN_OK; } -static rstatus_t -stats_server_metric_init(struct stats_server *sts) -{ - uint32_t i, nfield = STATS_SERVER_NFIELD; +static rstatus_t stats_server_metric_init(struct stats_server *sts) { + uint32_t i, nfield = STATS_SERVER_NFIELD; - THROW_STATUS(array_init(&sts->metric, nfield, sizeof(struct stats_metric))); + THROW_STATUS(array_init(&sts->metric, nfield, sizeof(struct stats_metric))); - for (i = 0; i < nfield; i++) { - struct stats_metric *stm = array_push(&sts->metric); + for (i = 0; i < nfield; i++) { + struct stats_metric *stm = array_push(&sts->metric); - /* initialize from server codec first */ - *stm = stats_server_codec[i]; + /* initialize from server codec first */ + *stm = stats_server_codec[i]; - /* initialize individual metric */ - stats_metric_init(stm); - } + /* initialize individual metric */ + 
stats_metric_init(stm); + } - return DN_OK; + return DN_OK; } -static void -stats_metric_deinit(struct array *metric) -{ - uint32_t i, nmetric; +static void stats_metric_deinit(struct array *metric) { + uint32_t i, nmetric; - nmetric = array_n(metric); - for (i = 0; i < nmetric; i++) { - array_pop(metric); - } - array_deinit(metric); + nmetric = array_n(metric); + for (i = 0; i < nmetric; i++) { + array_pop(metric); + } + array_deinit(metric); } -static rstatus_t -stats_server_init(struct stats_server *sts, struct datastore *s) -{ - sts->name = s->name; - array_null(&sts->metric); +static rstatus_t stats_server_init(struct stats_server *sts, + struct datastore *s) { + sts->name = s->name; + array_null(&sts->metric); - THROW_STATUS(stats_server_metric_init(sts)); + THROW_STATUS(stats_server_metric_init(sts)); - log_debug(LOG_VVVERB, "init stats server '%.*s' with %"PRIu32" metric", - sts->name.len, sts->name.data, array_n(&sts->metric)); - - return DN_OK; + log_debug(LOG_VVVERB, "init stats server '%.*s' with %" PRIu32 " metric", + sts->name.len, sts->name.data, array_n(&sts->metric)); + return DN_OK; } -static rstatus_t -stats_server_map(struct stats_server *sts, struct datastore *datastore) -{ - ASSERT(datastore != NULL); - THROW_STATUS(stats_server_init(sts, datastore)); +static rstatus_t stats_server_map(struct stats_server *sts, + struct datastore *datastore) { + ASSERT(datastore != NULL); + THROW_STATUS(stats_server_init(sts, datastore)); - log_debug(LOG_VVVERB, "mapped stats servers"); + log_debug(LOG_VVVERB, "mapped stats servers"); - return DN_OK; + return DN_OK; } -static void -stats_server_unmap(struct stats_server *sts) -{ - stats_metric_deinit(&sts->metric); - log_debug(LOG_VVVERB, "unmap stats servers"); +static void stats_server_unmap(struct stats_server *sts) { + stats_metric_deinit(&sts->metric); + log_debug(LOG_VVVERB, "unmap stats servers"); } -static rstatus_t -stats_pool_init(struct stats_pool *stp, struct server_pool *sp) -{ - rstatus_t status; 
+static rstatus_t stats_pool_init(struct stats_pool *stp, + struct server_pool *sp) { + rstatus_t status; - stp->name = sp->name; - array_null(&stp->metric); + stp->name = sp->name; + array_null(&stp->metric); - THROW_STATUS(stats_pool_metric_init(&stp->metric)); + THROW_STATUS(stats_pool_metric_init(&stp->metric)); - status = stats_server_map(&stp->server, sp->datastore); - if (status != DN_OK) { - stats_metric_deinit(&stp->metric); - return status; - } + status = stats_server_map(&stp->server, sp->datastore); + if (status != DN_OK) { + stats_metric_deinit(&stp->metric); + return status; + } - log_debug(LOG_VVVERB, "init stats pool '%.*s' with %"PRIu32" metric", - stp->name.len, stp->name.data, array_n(&stp->metric)); + log_debug(LOG_VVVERB, "init stats pool '%.*s' with %" PRIu32 " metric", + stp->name.len, stp->name.data, array_n(&stp->metric)); - return DN_OK; + return DN_OK; } -static void -stats_pool_reset(struct stats_pool *stp) -{ - stats_metric_reset(&stp->metric); +static void stats_pool_reset(struct stats_pool *stp) { + stats_metric_reset(&stp->metric); - struct stats_server *sts = &stp->server; - stats_metric_reset(&sts->metric); + struct stats_server *sts = &stp->server; + stats_metric_reset(&sts->metric); } -static void -stats_pool_unmap(struct stats_pool *stp) -{ - stats_metric_deinit(&stp->metric); - stats_server_unmap(&stp->server); +static void stats_pool_unmap(struct stats_pool *stp) { + stats_metric_deinit(&stp->metric); + stats_server_unmap(&stp->server); } -static rstatus_t -stats_create_bufs(struct stats *st) -{ - uint32_t int64_max_digits = 20; /* INT64_MAX = 9223372036854775807 */ - uint32_t int32_max_digits = 10; /* INT32_MAX = 4294967294 */ - uint32_t key_value_extra = 8; /* "key": "value", */ - uint32_t pool_extra = 8; /* '"pool_name": { ' + ' }' */ - uint32_t server_extra = 8; /* '"server_name": { ' + ' }' */ - size_t size = 0; - - ASSERT(st->buf.data == NULL && st->buf.size == 0); - - /* header */ - size += 1; - - size += 
st->service_str.len; - size += st->service.len; - size += key_value_extra; - - size += st->source_str.len; - size += st->source.len; - size += key_value_extra; - - size += st->version_str.len; - size += st->version.len; - size += key_value_extra; - - size += st->uptime_str.len; - size += int64_max_digits; - size += key_value_extra; - - size += st->timestamp_str.len; - size += int64_max_digits; - size += key_value_extra; +static rstatus_t stats_create_bufs(struct stats *st) { + uint32_t int64_max_digits = 20; /* INT64_MAX = 9223372036854775807 */ + uint32_t int32_max_digits = 10; /* INT32_MAX = 4294967294 */ + uint32_t key_value_extra = 8; /* "key": "value", */ + uint32_t pool_extra = 8; /* '"pool_name": { ' + ' }' */ + uint32_t server_extra = 8; /* '"server_name": { ' + ' }' */ + size_t size = 0; - size += st->rack_str.len; - size += st->rack.len; - size += key_value_extra; + ASSERT(st->buf.data == NULL && st->buf.size == 0); - size += st->dc_str.len; - size += st->dc.len; - size += key_value_extra; + /* header */ + size += 1; - size += st->latency_999th_str.len; - size += int64_max_digits; - size += key_value_extra; + size += st->service_str.len; + size += st->service.len; + size += key_value_extra; - size += st->latency_99th_str.len; - size += int64_max_digits; - size += key_value_extra; + size += st->source_str.len; + size += st->source.len; + size += key_value_extra; - size += st->latency_95th_str.len; - size += int64_max_digits; - size += key_value_extra; + size += st->version_str.len; + size += st->version.len; + size += key_value_extra; - size += st->latency_mean_str.len; - size += int64_max_digits; - size += key_value_extra; + size += st->uptime_str.len; + size += int64_max_digits; + size += key_value_extra; - size += st->latency_max_str.len; - size += int64_max_digits; - size += key_value_extra; + size += st->timestamp_str.len; + size += int64_max_digits; + size += key_value_extra; + size += st->rack_str.len; + size += st->rack.len; + size += 
key_value_extra; - size += st->payload_size_999th_str.len; - size += int32_max_digits; - size += key_value_extra; + size += st->dc_str.len; + size += st->dc.len; + size += key_value_extra; - size += st->payload_size_99th_str.len; - size += int32_max_digits; - size += key_value_extra; + size += st->latency_999th_str.len; + size += int64_max_digits; + size += key_value_extra; - size += st->payload_size_95th_str.len; - size += int32_max_digits; - size += key_value_extra; + size += st->latency_99th_str.len; + size += int64_max_digits; + size += key_value_extra; - size += st->payload_size_mean_str.len; - size += int32_max_digits; - size += key_value_extra; + size += st->latency_95th_str.len; + size += int64_max_digits; + size += key_value_extra; - size += st->payload_size_max_str.len; - size += int32_max_digits; - size += key_value_extra; + size += st->latency_mean_str.len; + size += int64_max_digits; + size += key_value_extra; - size += st->alloc_msgs_str.len; - size += int64_max_digits; - size += key_value_extra; + size += st->latency_max_str.len; + size += int64_max_digits; + size += key_value_extra; - size += st->free_msgs_str.len; - size += int64_max_digits; - size += key_value_extra; + size += st->payload_size_999th_str.len; + size += int32_max_digits; + size += key_value_extra; - size += st->alloc_mbufs_str.len; - size += int64_max_digits; - size += key_value_extra; + size += st->payload_size_99th_str.len; + size += int32_max_digits; + size += key_value_extra; - size += st->free_mbufs_str.len; - size += int64_max_digits; - size += key_value_extra; + size += st->payload_size_95th_str.len; + size += int32_max_digits; + size += key_value_extra; - size += st->dyn_memory_str.len; - size += int64_max_digits; - size += key_value_extra; + size += st->payload_size_mean_str.len; + size += int32_max_digits; + size += key_value_extra; - struct stats_pool *stp = &st->sum; - uint32_t j; + size += st->payload_size_max_str.len; + size += int32_max_digits; + size += 
key_value_extra; - size += stp->name.len; - size += pool_extra; + size += st->alloc_msgs_str.len; + size += int64_max_digits; + size += key_value_extra; - for (j = 0; j < array_n(&stp->metric); j++) { - struct stats_metric *stm = array_get(&stp->metric, j); + size += st->free_msgs_str.len; + size += int64_max_digits; + size += key_value_extra; - size += stm->name.len; - size += int64_max_digits; - size += key_value_extra; - } + size += st->alloc_mbufs_str.len; + size += int64_max_digits; + size += key_value_extra; - /* servers per pool */ - struct stats_server *sts = &stp->server; - uint32_t k; + size += st->free_mbufs_str.len; + size += int64_max_digits; + size += key_value_extra; - size += sts->name.len; - size += server_extra; + size += st->dyn_memory_str.len; + size += int64_max_digits; + size += key_value_extra; - for (k = 0; k < array_n(&sts->metric); k++) { - struct stats_metric *stm = array_get(&sts->metric, k); + struct stats_pool *stp = &st->sum; + uint32_t j; - size += stm->name.len; - size += int64_max_digits; - size += key_value_extra; - } + size += stp->name.len; + size += pool_extra; - /* footer */ - size += 2; - // Accommodate for new fields that are directly added using stats_add_num_str - size += 1024; - - size = DN_ALIGN(size, DN_ALIGNMENT); - st->buf.data = dn_alloc(size); - if (st->buf.data == NULL) { - log_error("create stats buffer of size %zu failed: %s", size, - strerror(errno)); - return DN_ENOMEM; - } - st->buf.size = size; - st->buf.len = 0; + for (j = 0; j < array_n(&stp->metric); j++) { + struct stats_metric *stm = array_get(&stp->metric, j); - log_debug(LOG_DEBUG, "stats info buffer size %zu", size); + size += stm->name.len; + size += int64_max_digits; + size += key_value_extra; + } - st->clus_desc_buf.len = 0; - st->clus_desc_buf.size = 0; + /* servers per pool */ + struct stats_server *sts = &stp->server; + uint32_t k; - return DN_OK; -} + size += sts->name.len; + size += server_extra; -static void -stats_destroy_buf(struct 
stats_buffer *buf) -{ - if (buf->size != 0) { - ASSERT(buf->data != NULL); - dn_free(buf->data); - buf->size = 0; - } -} + for (k = 0; k < array_n(&sts->metric); k++) { + struct stats_metric *stm = array_get(&sts->metric, k); -static void -stats_reset_buf(struct stats_buffer *buf) -{ - buf->len = 0; -} + size += stm->name.len; + size += int64_max_digits; + size += key_value_extra; + } -static rstatus_t -stats_add_string(struct stats_buffer *buf, struct string *key, struct string *val) -{ - uint8_t *pos; - size_t room; - int n; + /* footer */ + size += 2; + // Accommodate for new fields that are directly added using stats_add_num_str + size += 1024; - pos = buf->data + buf->len; - room = buf->size - buf->len - 1; + size = DN_ALIGN(size, DN_ALIGNMENT); + st->buf.data = dn_alloc(size); + if (st->buf.data == NULL) { + log_error("create stats buffer of size %zu failed: %s", size, + strerror(errno)); + return DN_ENOMEM; + } + st->buf.size = size; + st->buf.len = 0; - n = dn_snprintf(pos, room, "\"%.*s\":\"%.*s\",", key->len, key->data, - val->len, val->data); - if (n < 0 || n >= (int)room) { - log_debug(LOG_ERR, "no room size:%u len %u", buf->size, buf->len); - return DN_ERROR; - } + log_debug(LOG_DEBUG, "stats info buffer size %zu", size); - buf->len += (size_t)n; + st->clus_desc_buf.len = 0; + st->clus_desc_buf.size = 0; - return DN_OK; + return DN_OK; } -static rstatus_t -stats_add_num_last(struct stats_buffer *buf, struct string *key, int64_t val, bool last) -{ - uint8_t *pos; - size_t room; - int n; - - pos = buf->data + buf->len; - room = buf->size - buf->len - 1; - - if (!last) { - n = dn_snprintf(pos, room, "\"%.*s\":%"PRId64",\n", key->len, key->data, - val); - } else { - n = dn_snprintf(pos, room, "\"%.*s\":%"PRId64"\n", key->len, key->data, - val); - } - - if (n < 0 || n >= (int)room) { - log_debug(LOG_ERR, "no room size:%u len %u", buf->size, buf->len); - return DN_ERROR; - } - - buf->len += (size_t)n; - - return DN_OK; +static void stats_destroy_buf(struct 
stats_buffer *buf) { + if (buf->size != 0) { + ASSERT(buf->data != NULL); + dn_free(buf->data); + buf->size = 0; + } } -static rstatus_t -stats_add_num_str(struct stats_buffer *buf, const char *key, int64_t val) -{ - uint8_t *pos; - size_t room; - int n; - - pos = buf->data + buf->len; - room = buf->size - buf->len - 1; +static void stats_reset_buf(struct stats_buffer *buf) { buf->len = 0; } - n = dn_snprintf(pos, room, "\"%s\":%"PRId64",\n", key, val); - if (n < 0 || n >= (int)room) { - log_debug(LOG_ERR, "no room size:%u len %u", buf->size, buf->len); - return DN_ERROR; +static rstatus_t stats_add_string(struct stats_buffer *buf, struct string *key, + struct string *val) { + uint8_t *pos; + size_t room; + int n; + + pos = buf->data + buf->len; + room = buf->size - buf->len - 1; + + n = dn_snprintf(pos, room, "\"%.*s\":\"%.*s\",", key->len, key->data, + val->len, val->data); + if (n < 0 || n >= (int)room) { + log_debug(LOG_ERR, "no room size:%u len %u", buf->size, buf->len); + return DN_ERROR; + } + + buf->len += (size_t)n; + + return DN_OK; +} + +static rstatus_t stats_add_num_last(struct stats_buffer *buf, + struct string *key, int64_t val, + bool last) { + uint8_t *pos; + size_t room; + int n; + + pos = buf->data + buf->len; + room = buf->size - buf->len - 1; + + if (!last) { + n = dn_snprintf(pos, room, "\"%.*s\":%" PRId64 ",\n", key->len, key->data, + val); + } else { + n = dn_snprintf(pos, room, "\"%.*s\":%" PRId64 "\n", key->len, key->data, + val); + } + + if (n < 0 || n >= (int)room) { + log_debug(LOG_ERR, "no room size:%u len %u", buf->size, buf->len); + return DN_ERROR; + } + + buf->len += (size_t)n; + + return DN_OK; +} + +static rstatus_t stats_add_num_str(struct stats_buffer *buf, const char *key, + int64_t val) { + uint8_t *pos; + size_t room; + int n; + + pos = buf->data + buf->len; + room = buf->size - buf->len - 1; + + n = dn_snprintf(pos, room, "\"%s\":%" PRId64 ",\n", key, val); + if (n < 0 || n >= (int)room) { + log_debug(LOG_ERR, "no room 
size:%u len %u", buf->size, buf->len); + return DN_ERROR; + } + buf->len += (size_t)n; + return DN_OK; +} + +static rstatus_t stats_add_num(struct stats_buffer *buf, struct string *key, + int64_t val) { + if (stats_add_num_last(buf, key, val, false) == DN_ERROR) { + return DN_ERROR; + } + + return DN_OK; +} + +static rstatus_t stats_add_header(struct stats *st) { + struct stats_buffer *buf; + int64_t cur_ts, uptime; + + buf = &st->buf; + buf->data[0] = '{'; + buf->len = 1; + + cur_ts = (int64_t)time(NULL); + uptime = cur_ts - st->start_ts; + + THROW_STATUS(stats_add_string(&st->buf, &st->service_str, &st->service)); + THROW_STATUS(stats_add_string(&st->buf, &st->source_str, &st->source)); + THROW_STATUS(stats_add_string(&st->buf, &st->version_str, &st->version)); + THROW_STATUS(stats_add_num(&st->buf, &st->uptime_str, uptime)); + THROW_STATUS(stats_add_num(&st->buf, &st->timestamp_str, cur_ts)); + THROW_STATUS(stats_add_string(&st->buf, &st->rack_str, &st->rack)); + THROW_STATUS(stats_add_string(&st->buf, &st->dc_str, &st->dc)); + // latency histogram + THROW_STATUS(stats_add_num(&st->buf, &st->latency_max_str, + (int64_t)st->latency_histo.val_max)) + THROW_STATUS(stats_add_num(&st->buf, &st->latency_999th_str, + (int64_t)st->latency_histo.val_999th)); + THROW_STATUS(stats_add_num(&st->buf, &st->latency_99th_str, + (int64_t)st->latency_histo.val_99th)); + THROW_STATUS(stats_add_num(&st->buf, &st->latency_95th_str, + (int64_t)st->latency_histo.val_95th)); + THROW_STATUS(stats_add_num(&st->buf, &st->latency_mean_str, + (int64_t)st->latency_histo.mean)); + // payload size histogram + THROW_STATUS(stats_add_num(&st->buf, &st->payload_size_max_str, + (int64_t)st->payload_size_histo.val_max)); + THROW_STATUS(stats_add_num(&st->buf, &st->payload_size_999th_str, + (int64_t)st->payload_size_histo.val_999th)); + THROW_STATUS(stats_add_num(&st->buf, &st->payload_size_99th_str, + (int64_t)st->payload_size_histo.val_99th)); + THROW_STATUS(stats_add_num(&st->buf, 
&st->payload_size_95th_str, + (int64_t)st->payload_size_histo.val_95th)); + THROW_STATUS(stats_add_num(&st->buf, &st->payload_size_mean_str, + (int64_t)st->payload_size_histo.mean)); + + THROW_STATUS(stats_add_num_str(&st->buf, "average_cross_region_rtt", + (int64_t)st->cross_region_latency_histo.mean)); + THROW_STATUS( + stats_add_num_str(&st->buf, "99_cross_region_rtt", + (int64_t)st->cross_region_latency_histo.val_99th)); + THROW_STATUS(stats_add_num_str(&st->buf, "average_cross_zone_latency", + (int64_t)st->cross_zone_latency_histo.mean)); + THROW_STATUS( + stats_add_num_str(&st->buf, "99_cross_zone_latency", + (int64_t)st->cross_zone_latency_histo.val_99th)); + THROW_STATUS(stats_add_num_str(&st->buf, "average_server_latency", + (int64_t)st->server_latency_histo.mean)); + THROW_STATUS(stats_add_num_str(&st->buf, "99_server_latency", + (int64_t)st->server_latency_histo.val_99th)); + + THROW_STATUS( + stats_add_num_str(&st->buf, "average_cross_region_queue_wait", + (int64_t)st->cross_region_queue_wait_time_histo.mean)); + THROW_STATUS(stats_add_num_str( + &st->buf, "99_cross_region_queue_wait", + (int64_t)st->cross_region_queue_wait_time_histo.val_99th)); + THROW_STATUS( + stats_add_num_str(&st->buf, "average_cross_zone_queue_wait", + (int64_t)st->cross_zone_queue_wait_time_histo.mean)); + THROW_STATUS(stats_add_num_str( + &st->buf, "99_cross_zone_queue_wait", + (int64_t)st->cross_zone_queue_wait_time_histo.val_99th)); + THROW_STATUS( + stats_add_num_str(&st->buf, "average_server_queue_wait", + (int64_t)st->server_queue_wait_time_histo.mean)); + THROW_STATUS( + stats_add_num_str(&st->buf, "99_server_queue_wait", + (int64_t)st->server_queue_wait_time_histo.val_99th)); + + THROW_STATUS(stats_add_num(&st->buf, &st->client_out_queue_99, + (int64_t)st->client_out_queue.val_99th)); + THROW_STATUS(stats_add_num(&st->buf, &st->server_in_queue_99, + (int64_t)st->server_in_queue.val_99th)); + THROW_STATUS(stats_add_num(&st->buf, &st->server_out_queue_99, + 
(int64_t)st->server_out_queue.val_99th)); + THROW_STATUS(stats_add_num(&st->buf, &st->dnode_client_out_queue_99, + (int64_t)st->dnode_client_out_queue.val_99th)); + THROW_STATUS(stats_add_num(&st->buf, &st->peer_in_queue_99, + (int64_t)st->peer_in_queue.val_99th)); + THROW_STATUS(stats_add_num(&st->buf, &st->peer_out_queue_99, + (int64_t)st->peer_out_queue.val_99th)); + THROW_STATUS(stats_add_num(&st->buf, &st->remote_peer_out_queue_99, + (int64_t)st->remote_peer_out_queue.val_99th)); + THROW_STATUS(stats_add_num(&st->buf, &st->remote_peer_in_queue_99, + (int64_t)st->remote_peer_in_queue.val_99th)); + THROW_STATUS( + stats_add_num(&st->buf, &st->alloc_msgs_str, (int64_t)st->alloc_msgs)); + THROW_STATUS( + stats_add_num(&st->buf, &st->free_msgs_str, (int64_t)st->free_msgs)); + THROW_STATUS( + stats_add_num(&st->buf, &st->alloc_mbufs_str, (int64_t)st->alloc_mbufs)); + THROW_STATUS( + stats_add_num(&st->buf, &st->free_mbufs_str, (int64_t)st->free_mbufs)); + THROW_STATUS( + stats_add_num(&st->buf, &st->dyn_memory_str, (int64_t)st->dyn_memory)); + + return DN_OK; +} + +static rstatus_t stats_add_footer(struct stats_buffer *buf) { + uint8_t *pos; + + if (buf->len == buf->size) { + return DN_ERROR; + } + + /* overwrite the last byte and add a new byte */ + pos = buf->data + buf->len - 1; + pos[0] = '}'; + pos[1] = '\n'; + buf->len += 1; + + return DN_OK; +} + +static rstatus_t stats_begin_nesting(struct stats_buffer *buf, + struct string *key, bool arr) { + uint8_t *pos; + size_t room; + int n; + + pos = buf->data + buf->len; + room = buf->size - buf->len - 1; + + if (key) + n = dn_snprintf(pos, room, "\"%.*s\": %c", key->len, key->data, + arr ? '[' : '{'); + else + n = dn_snprintf(pos, room, "%c", arr ? 
'[' : '{'); + if (n < 0 || n >= (int)room) { + log_debug(LOG_ERR, "failed, len:%u size %u", buf->len, buf->size); + return DN_ERROR; + } + + buf->len += (size_t)n; + + return DN_OK; +} + +static rstatus_t stats_end_nesting(struct stats_buffer *buf, bool arr) { + uint8_t *pos; + + pos = buf->data + buf->len; + + // if last non-white space character is , remove it + // first count white spaces at end + int space_count = 0; + while (isspace(*(pos - space_count - 1))) { + space_count++; + } + if (*(pos - space_count - 1) == ',') { + // now remove , from end + pos -= (space_count + 1); + buf->len--; + // put white spaces back + while (space_count > 0) { + *pos = *(pos + 1); + pos++; + space_count--; } - buf->len += (size_t)n; - return DN_OK; -} + } + // append "}," + if ((buf->len + 2) > buf->size) { + return DN_ERROR; + } + pos[0] = arr ? ']' : '}'; + pos[1] = ','; + buf->len += 2; -static rstatus_t -stats_add_num(struct stats_buffer *buf, struct string *key, int64_t val) -{ - if (stats_add_num_last(buf, key, val, false) == DN_ERROR) { - return DN_ERROR; - } - - return DN_OK; + return DN_OK; } -static rstatus_t -stats_add_header(struct stats *st) -{ - struct stats_buffer *buf; - int64_t cur_ts, uptime; - - buf = &st->buf; - buf->data[0] = '{'; - buf->len = 1; - - cur_ts = (int64_t)time(NULL); - uptime = cur_ts - st->start_ts; - - THROW_STATUS(stats_add_string(&st->buf, &st->service_str, &st->service)); - THROW_STATUS(stats_add_string(&st->buf, &st->source_str, &st->source)); - THROW_STATUS(stats_add_string(&st->buf, &st->version_str, &st->version)); - THROW_STATUS(stats_add_num(&st->buf, &st->uptime_str, uptime)); - THROW_STATUS(stats_add_num(&st->buf, &st->timestamp_str, cur_ts)); - THROW_STATUS(stats_add_string(&st->buf, &st->rack_str, &st->rack)); - THROW_STATUS(stats_add_string(&st->buf, &st->dc_str, &st->dc)); - //latency histogram - THROW_STATUS(stats_add_num(&st->buf, &st->latency_max_str, - (int64_t)st->latency_histo.val_max)) - 
THROW_STATUS(stats_add_num(&st->buf, &st->latency_999th_str, - (int64_t)st->latency_histo.val_999th)); - THROW_STATUS(stats_add_num(&st->buf, &st->latency_99th_str, - (int64_t)st->latency_histo.val_99th)); - THROW_STATUS(stats_add_num(&st->buf, &st->latency_95th_str, - (int64_t)st->latency_histo.val_95th)); - THROW_STATUS(stats_add_num(&st->buf, &st->latency_mean_str, - (int64_t)st->latency_histo.mean)); - //payload size histogram - THROW_STATUS(stats_add_num(&st->buf, &st->payload_size_max_str, - (int64_t)st->payload_size_histo.val_max)); - THROW_STATUS(stats_add_num(&st->buf, &st->payload_size_999th_str, - (int64_t)st->payload_size_histo.val_999th)); - THROW_STATUS(stats_add_num(&st->buf, &st->payload_size_99th_str, - (int64_t)st->payload_size_histo.val_99th)); - THROW_STATUS(stats_add_num(&st->buf, &st->payload_size_95th_str, - (int64_t)st->payload_size_histo.val_95th)); - THROW_STATUS(stats_add_num(&st->buf, &st->payload_size_mean_str, - (int64_t)st->payload_size_histo.mean)); - - THROW_STATUS(stats_add_num_str(&st->buf, "average_cross_region_rtt", - (int64_t)st->cross_region_latency_histo.mean)); - THROW_STATUS(stats_add_num_str(&st->buf, "99_cross_region_rtt", - (int64_t)st->cross_region_latency_histo.val_99th)); - THROW_STATUS(stats_add_num_str(&st->buf, "average_cross_zone_latency", - (int64_t)st->cross_zone_latency_histo.mean)); - THROW_STATUS(stats_add_num_str(&st->buf, "99_cross_zone_latency", - (int64_t)st->cross_zone_latency_histo.val_99th)); - THROW_STATUS(stats_add_num_str(&st->buf, "average_server_latency", - (int64_t)st->server_latency_histo.mean)); - THROW_STATUS(stats_add_num_str(&st->buf, "99_server_latency", - (int64_t)st->server_latency_histo.val_99th)); - - THROW_STATUS(stats_add_num_str(&st->buf, "average_cross_region_queue_wait", - (int64_t)st->cross_region_queue_wait_time_histo.mean)); - THROW_STATUS(stats_add_num_str(&st->buf, "99_cross_region_queue_wait", - (int64_t)st->cross_region_queue_wait_time_histo.val_99th)); - 
THROW_STATUS(stats_add_num_str(&st->buf, "average_cross_zone_queue_wait", - (int64_t)st->cross_zone_queue_wait_time_histo.mean)); - THROW_STATUS(stats_add_num_str(&st->buf, "99_cross_zone_queue_wait", - (int64_t)st->cross_zone_queue_wait_time_histo.val_99th)); - THROW_STATUS(stats_add_num_str(&st->buf, "average_server_queue_wait", - (int64_t)st->server_queue_wait_time_histo.mean)); - THROW_STATUS(stats_add_num_str(&st->buf, "99_server_queue_wait", - (int64_t)st->server_queue_wait_time_histo.val_99th)); - - THROW_STATUS(stats_add_num(&st->buf, &st->client_out_queue_99, - (int64_t)st->client_out_queue.val_99th)); - THROW_STATUS(stats_add_num(&st->buf, &st->server_in_queue_99, - (int64_t)st->server_in_queue.val_99th)); - THROW_STATUS(stats_add_num(&st->buf, &st->server_out_queue_99, - (int64_t)st->server_out_queue.val_99th)); - THROW_STATUS(stats_add_num(&st->buf, &st->dnode_client_out_queue_99, - (int64_t)st->dnode_client_out_queue.val_99th)); - THROW_STATUS(stats_add_num(&st->buf, &st->peer_in_queue_99, - (int64_t)st->peer_in_queue.val_99th)); - THROW_STATUS(stats_add_num(&st->buf, &st->peer_out_queue_99, - (int64_t)st->peer_out_queue.val_99th)); - THROW_STATUS(stats_add_num(&st->buf, &st->remote_peer_out_queue_99, - (int64_t)st->remote_peer_out_queue.val_99th)); - THROW_STATUS(stats_add_num(&st->buf, &st->remote_peer_in_queue_99, - (int64_t)st->remote_peer_in_queue.val_99th)); - THROW_STATUS(stats_add_num(&st->buf, &st->alloc_msgs_str, - (int64_t)st->alloc_msgs)); - THROW_STATUS(stats_add_num(&st->buf, &st->free_msgs_str, - (int64_t)st->free_msgs)); - THROW_STATUS(stats_add_num(&st->buf, &st->alloc_mbufs_str, - (int64_t)st->alloc_mbufs)); - THROW_STATUS(stats_add_num(&st->buf, &st->free_mbufs_str, - (int64_t)st->free_mbufs)); - THROW_STATUS(stats_add_num(&st->buf, &st->dyn_memory_str, - (int64_t)st->dyn_memory)); +static rstatus_t stats_copy_metric(struct stats *st, struct array *metric, + bool trim_comma) { + uint32_t i; - return DN_OK; -} + // Do not include last 
element in loop as we need to check if it gets a comma + for (i = 0; i < array_n(metric) - 1; i++) { + struct stats_metric *stm = array_get(metric, i); + THROW_STATUS(stats_add_num(&st->buf, &stm->name, stm->value.counter)); + } -static rstatus_t -stats_add_footer(struct stats_buffer *buf) -{ - uint8_t *pos; + // Last metric inside dyn_o_mite:{} does not get a comma + struct stats_metric *stm = array_get(metric, array_n(metric) - 1); + THROW_STATUS( + stats_add_num_last(&st->buf, &stm->name, stm->value.counter, trim_comma)); - if (buf->len == buf->size) { - return DN_ERROR; - } - - /* overwrite the last byte and add a new byte */ - pos = buf->data + buf->len - 1; - pos[0] = '}'; - pos[1] = '\n'; - buf->len += 1; - - return DN_OK; + return DN_OK; } -static rstatus_t -stats_begin_nesting(struct stats_buffer *buf, struct string *key, bool arr) -{ - uint8_t *pos; - size_t room; - int n; +static void stats_aggregate_metric(struct array *dst, struct array *src) { + uint32_t i; - pos = buf->data + buf->len; - room = buf->size - buf->len - 1; + for (i = 0; i < array_n(src); i++) { + struct stats_metric *stm1, *stm2; - if (key) - n = dn_snprintf(pos, room, "\"%.*s\": %c", key->len, key->data, - arr ? '[' : '{'); - else - n = dn_snprintf(pos, room, "%c", arr ? 
'[' : '{'); - if (n < 0 || n >= (int)room) { - log_debug(LOG_ERR, "failed, len:%u size %u", buf->len, buf->size); - return DN_ERROR; - } + stm1 = array_get(src, i); + stm2 = array_get(dst, i); - buf->len += (size_t)n; - - return DN_OK; -} + ASSERT(stm1->type == stm2->type); -static rstatus_t -stats_end_nesting(struct stats_buffer *buf, bool arr) -{ - uint8_t *pos; + switch (stm1->type) { + case STATS_COUNTER: + stm2->value.counter += stm1->value.counter; + break; - pos = buf->data + buf->len; + case STATS_GAUGE: + stm2->value.counter += stm1->value.counter; + break; - // if last non-white space character is , remove it - // first count white spaces at end - int space_count = 0; - while (isspace(*(pos - space_count - 1))) { - space_count++; - } - if (*(pos - space_count - 1) == ',') { - // now remove , from end - pos -= (space_count + 1); - buf->len--; - // put white spaces back - while (space_count > 0) { - *pos = *(pos + 1); - pos++; - space_count--; + case STATS_TIMESTAMP: + if (stm1->value.timestamp) { + stm2->value.timestamp = stm1->value.timestamp; } - } - // append "}," - if ((buf->len + 2) > buf->size) { - return DN_ERROR; - } - pos[0] = arr ? 
']' : '}'; - pos[1] = ','; - buf->len += 2; - - return DN_OK; -} - -static rstatus_t -stats_copy_metric(struct stats *st, struct array *metric, bool trim_comma) -{ - uint32_t i; + break; - // Do not include last element in loop as we need to check if it gets a comma - for (i = 0; i < array_n(metric) - 1; i++) { - struct stats_metric *stm = array_get(metric, i); - THROW_STATUS(stats_add_num(&st->buf, &stm->name, stm->value.counter)); + default: + NOT_REACHED(); } - - // Last metric inside dyn_o_mite:{} does not get a comma - struct stats_metric *stm = array_get(metric, array_n(metric) - 1); - THROW_STATUS(stats_add_num_last(&st->buf, &stm->name, stm->value.counter, trim_comma)); - - return DN_OK; + } } -static void -stats_aggregate_metric(struct array *dst, struct array *src) -{ - uint32_t i; - - for (i = 0; i < array_n(src); i++) { - struct stats_metric *stm1, *stm2; - - stm1 = array_get(src, i); - stm2 = array_get(dst, i); +static void stats_aggregate(struct stats *st) { + if (st->aggregate == 0) { + log_debug(LOG_PVERB, + "skip aggregate of shadow to sum as generator is slow"); + return; + } - ASSERT(stm1->type == stm2->type); + log_debug(LOG_PVERB, "aggregate stats shadow %p to sum %p", &st->shadow, + &st->sum); - switch (stm1->type) { - case STATS_COUNTER: - stm2->value.counter += stm1->value.counter; - break; - - case STATS_GAUGE: - stm2->value.counter += stm1->value.counter; - break; - - case STATS_TIMESTAMP: - if (stm1->value.timestamp) { - stm2->value.timestamp = stm1->value.timestamp; - } - break; - - default: - NOT_REACHED(); - } - } -} + struct stats_pool *stp1 = &st->shadow; + struct stats_pool *stp2 = &st->sum; + stats_aggregate_metric(&st->sum.metric, &st->shadow.metric); -static void -stats_aggregate(struct stats *st) -{ - if (st->aggregate == 0) { - log_debug(LOG_PVERB, "skip aggregate of shadow to sum as generator is slow"); - return; - } + struct stats_server *sts1, *sts2; - log_debug(LOG_PVERB, "aggregate stats shadow %p to sum %p", &st->shadow, 
- &st->sum); + sts1 = &stp1->server; + sts2 = &stp2->server; + stats_aggregate_metric(&sts2->metric, &sts1->metric); - struct stats_pool *stp1 = &st->shadow; - struct stats_pool *stp2 = &st->sum; - stats_aggregate_metric(&st->sum.metric, &st->shadow.metric); + static msec_t last_reset = 0; + if (!last_reset) last_reset = dn_msec_now(); + if ((last_reset + 5 * 60 * 1000) < dn_msec_now()) { + st->reset_histogram = 1; + last_reset = dn_msec_now(); + } + if (st->reset_histogram) { + st->reset_histogram = 0; + histo_reset(&st->latency_histo); + histo_reset(&st->payload_size_histo); - struct stats_server *sts1, *sts2; + histo_reset(&st->server_latency_histo); + histo_reset(&st->cross_zone_latency_histo); + histo_reset(&st->cross_region_latency_histo); - sts1 = &stp1->server; - sts2 = &stp2->server; - stats_aggregate_metric(&sts2->metric, &sts1->metric); + histo_reset(&st->server_queue_wait_time_histo); + histo_reset(&st->cross_zone_queue_wait_time_histo); + histo_reset(&st->cross_region_queue_wait_time_histo); - static msec_t last_reset = 0; - if (!last_reset) - last_reset = dn_msec_now(); - if ((last_reset + 5*60*1000) < dn_msec_now()) { - st->reset_histogram = 1; - last_reset = dn_msec_now(); - } - if (st->reset_histogram) { - st->reset_histogram = 0; - histo_reset(&st->latency_histo); - histo_reset(&st->payload_size_histo); - - histo_reset(&st->server_latency_histo); - histo_reset(&st->cross_zone_latency_histo); - histo_reset(&st->cross_region_latency_histo); - - histo_reset(&st->server_queue_wait_time_histo); - histo_reset(&st->cross_zone_queue_wait_time_histo); - histo_reset(&st->cross_region_queue_wait_time_histo); - - histo_reset(&st->server_in_queue); - histo_reset(&st->server_out_queue); - histo_reset(&st->client_out_queue); - histo_reset(&st->dnode_client_out_queue); - histo_reset(&st->peer_in_queue); - histo_reset(&st->peer_out_queue); - histo_reset(&st->remote_peer_in_queue); - histo_reset(&st->remote_peer_out_queue); - } - st->aggregate = 0; + 
histo_reset(&st->server_in_queue); + histo_reset(&st->server_out_queue); + histo_reset(&st->client_out_queue); + histo_reset(&st->dnode_client_out_queue); + histo_reset(&st->peer_in_queue); + histo_reset(&st->peer_out_queue); + histo_reset(&st->remote_peer_in_queue); + histo_reset(&st->remote_peer_out_queue); + } + st->aggregate = 0; } -static rstatus_t -stats_make_info_rsp(struct stats *st) -{ +static rstatus_t stats_make_info_rsp(struct stats *st) { + THROW_STATUS(stats_add_header(st)); - THROW_STATUS(stats_add_header(st)); + struct stats_pool *stp = &st->sum; - struct stats_pool *stp = &st->sum; + THROW_STATUS(stats_begin_nesting(&st->buf, &stp->name, false)); + /* copy pool metric from sum(c) to buffer */ + THROW_STATUS(stats_copy_metric(st, &stp->metric, false)); - THROW_STATUS(stats_begin_nesting(&st->buf, &stp->name, false)); - /* copy pool metric from sum(c) to buffer */ - THROW_STATUS(stats_copy_metric(st, &stp->metric, false)); + struct stats_server *sts = &stp->server; - struct stats_server *sts = &stp->server; + THROW_STATUS(stats_begin_nesting(&st->buf, &sts->name, false)); + /* copy server metric from sum(c) to buffer */ + THROW_STATUS(stats_copy_metric(st, &sts->metric, true)); + THROW_STATUS(stats_end_nesting(&st->buf, false)); - THROW_STATUS(stats_begin_nesting(&st->buf, &sts->name, false)); - /* copy server metric from sum(c) to buffer */ - THROW_STATUS(stats_copy_metric(st, &sts->metric, true)); - THROW_STATUS(stats_end_nesting(&st->buf, false)); + THROW_STATUS(stats_end_nesting(&st->buf, false)); + THROW_STATUS(stats_add_footer(&st->buf)); - THROW_STATUS(stats_end_nesting(&st->buf, false)); - THROW_STATUS(stats_add_footer(&st->buf)); - - return DN_OK; + return DN_OK; } -static rstatus_t -get_host_from_pname(struct string *host, struct string *pname) -{ - uint8_t *found = dn_strchr(pname->data, - &pname->data[pname->len], ':'); - string_init(host); - if (found) { - size_t hostlen = found - pname->data; - THROW_STATUS(string_copy(host, 
pname->data, hostlen)); - return DN_OK; - } - THROW_STATUS(string_copy(host, pname->data, pname->len)); +static rstatus_t get_host_from_pname(struct string *host, + struct string *pname) { + uint8_t *found = dn_strchr(pname->data, &pname->data[pname->len], ':'); + string_init(host); + if (found) { + size_t hostlen = found - pname->data; + THROW_STATUS(string_copy(host, pname->data, hostlen)); return DN_OK; -} - -static rstatus_t -stats_add_node_host(struct stats *st, struct gossip_node *node) -{ - struct string host_str; - string_set_text(&host_str, "host"); - struct server_pool *sp = &st->ctx->pool; - struct string host; - // pname is host:port. for local its 0.0.0.0:port - // so try to get the hostname if local otherwise use whats in pname - char *hn = NULL; - if (node->is_local && (hn = get_public_hostname(sp))) { - THROW_STATUS(string_copy(&host, hn, dn_strlen(hn))); + } + THROW_STATUS(string_copy(host, pname->data, pname->len)); + return DN_OK; +} + +static rstatus_t stats_add_node_host(struct stats *st, + struct gossip_node *node) { + struct string host_str; + string_set_text(&host_str, "host"); + struct server_pool *sp = &st->ctx->pool; + struct string host; + // pname is host:port. 
for local its 0.0.0.0:port + // so try to get the hostname if local otherwise use whats in pname + char *hn = NULL; + if (node->is_local && (hn = get_public_hostname(sp))) { + THROW_STATUS(string_copy(&host, hn, dn_strlen(hn))); + } else + get_host_from_pname(&host, &node->pname); + + THROW_STATUS(stats_add_string(&st->clus_desc_buf, &host_str, &host)); + string_deinit(&host); + return DN_OK; +} + +static rstatus_t stats_add_node_name(struct stats *st, + struct gossip_node *node) { + struct string name_str; + string_set_text(&name_str, "name"); + struct server_pool *sp = &st->ctx->pool; + // name is the ip address + if (node->is_local) { + // get the ip aka name + struct string ip; + char *ip4 = get_public_ip4(sp); + if (ip4) { + string_set_raw(&ip, ip4); + THROW_STATUS(stats_add_string(&st->clus_desc_buf, &name_str, &ip)); } else - get_host_from_pname(&host, &node->pname); - - THROW_STATUS(stats_add_string(&st->clus_desc_buf, &host_str, - &host)); - string_deinit(&host); - return DN_OK; + THROW_STATUS( + stats_add_string(&st->clus_desc_buf, &name_str, &node->name)); + } else { + THROW_STATUS(stats_add_string(&st->clus_desc_buf, &name_str, &node->name)); + } + return DN_OK; +} + +static rstatus_t stats_add_node_details(struct stats *st, + struct gossip_node *node) { + struct string port_str, token_str; + string_set_text(&port_str, "port"); + string_set_text(&token_str, "token"); + + THROW_STATUS(stats_add_node_name(st, node)); + THROW_STATUS(stats_add_node_host(st, node)); + THROW_STATUS(stats_add_num(&st->clus_desc_buf, &port_str, node->port)); + THROW_STATUS(stats_add_num_last(&st->clus_desc_buf, &token_str, + *(node->token.mag), true)); + return DN_OK; +} + +static rstatus_t stats_add_rack_details(struct stats *st, + struct gossip_rack *rack) { + struct string name_str, servers_str; + string_set_text(&name_str, "name"); + string_set_text(&servers_str, "servers"); + THROW_STATUS(stats_add_string(&st->clus_desc_buf, &name_str, &rack->name)); + // servers : [ + 
THROW_STATUS(stats_begin_nesting(&st->clus_desc_buf, &servers_str, true)); + uint32_t ni; + for (ni = 0; ni < array_n(&rack->nodes); ni++) { + struct gossip_node *node = array_get(&rack->nodes, ni); + THROW_STATUS(stats_begin_nesting(&st->clus_desc_buf, NULL, false)); + THROW_STATUS(stats_add_node_details(st, node)); + THROW_STATUS(stats_end_nesting(&st->clus_desc_buf, false)); + } + THROW_STATUS(stats_end_nesting(&st->clus_desc_buf, true)); + return DN_OK; } -static rstatus_t -stats_add_node_name(struct stats *st, struct gossip_node *node) -{ - struct string name_str; - string_set_text(&name_str, "name"); - struct server_pool *sp = &st->ctx->pool; - // name is the ip address - if (node->is_local) { - // get the ip aka name - struct string ip; - char * ip4 = get_public_ip4(sp); - if (ip4) { - string_set_raw(&ip, ip4); - THROW_STATUS(stats_add_string(&st->clus_desc_buf, &name_str, &ip)); - } else - THROW_STATUS(stats_add_string(&st->clus_desc_buf, &name_str, - &node->name)); - } else { - THROW_STATUS(stats_add_string(&st->clus_desc_buf, &name_str, - &node->name)); - } - return DN_OK; -} +static rstatus_t stats_add_dc_details(struct stats *st, struct gossip_dc *dc) { + struct string name_str, racks_str; + string_set_text(&name_str, "name"); + string_set_text(&racks_str, "racks"); -static rstatus_t -stats_add_node_details(struct stats *st, struct gossip_node *node) -{ - struct string port_str, token_str; - string_set_text(&port_str, "port"); - string_set_text(&token_str, "token"); - - THROW_STATUS(stats_add_node_name(st, node)); - THROW_STATUS(stats_add_node_host(st, node)); - THROW_STATUS(stats_add_num(&st->clus_desc_buf, &port_str, node->port)); - THROW_STATUS(stats_add_num_last(&st->clus_desc_buf, &token_str, *(node->token.mag), true)); - return DN_OK; -} + THROW_STATUS(stats_add_string(&st->clus_desc_buf, &name_str, &dc->name)); + // racks : [ + THROW_STATUS(stats_begin_nesting(&st->clus_desc_buf, &racks_str, true)); + uint32_t ri; + for (ri = 0; ri < 
array_n(&dc->racks); ri++) { + struct gossip_rack *rack = array_get(&dc->racks, ri); -static rstatus_t -stats_add_rack_details(struct stats *st, struct gossip_rack *rack) -{ - struct string name_str, servers_str; - string_set_text(&name_str, "name"); - string_set_text(&servers_str, "servers"); - THROW_STATUS(stats_add_string(&st->clus_desc_buf, &name_str, &rack->name)); - // servers : [ - THROW_STATUS(stats_begin_nesting(&st->clus_desc_buf, &servers_str, true)); - uint32_t ni; - for(ni = 0; ni < array_n(&rack->nodes); ni++) { - struct gossip_node *node = array_get(&rack->nodes, ni); - THROW_STATUS(stats_begin_nesting(&st->clus_desc_buf, NULL, false)); - THROW_STATUS(stats_add_node_details(st, node)); - THROW_STATUS(stats_end_nesting(&st->clus_desc_buf, false)); + THROW_STATUS(stats_begin_nesting(&st->clus_desc_buf, NULL, false)); + THROW_STATUS(stats_add_rack_details(st, rack)); + THROW_STATUS(stats_end_nesting(&st->clus_desc_buf, false)); + } + THROW_STATUS(stats_end_nesting(&st->clus_desc_buf, true)); + return DN_OK; +} + +static rstatus_t stats_resize_clus_desc_buf(struct stats *st) { + struct server_pool *sp = &st->ctx->pool; + ASSERT(sp); + size_t size = 1024 * array_n(&sp->peers); + size = DN_ALIGN(size, DN_ALIGNMENT); + if (st->clus_desc_buf.size < size) { + stats_destroy_buf(&st->clus_desc_buf); + st->clus_desc_buf.data = dn_alloc(size); + if (st->clus_desc_buf.data == NULL) { + log_error("create cluster desc buffer of size %zu failed: %s", size, + strerror(errno)); + return DN_ENOMEM; } - THROW_STATUS(stats_end_nesting(&st->clus_desc_buf, true)); - return DN_OK; -} - -static rstatus_t -stats_add_dc_details(struct stats *st, struct gossip_dc *dc) -{ - struct string name_str, racks_str; - string_set_text(&name_str, "name"); - string_set_text(&racks_str, "racks"); - - THROW_STATUS(stats_add_string(&st->clus_desc_buf, &name_str, &dc->name)); - // racks : [ - THROW_STATUS(stats_begin_nesting(&st->clus_desc_buf, &racks_str, true)); - uint32_t ri; - for(ri = 0; 
ri < array_n(&dc->racks); ri++) { - struct gossip_rack *rack = array_get(&dc->racks, ri); + st->clus_desc_buf.size = size; + } - THROW_STATUS(stats_begin_nesting(&st->clus_desc_buf, NULL, false)); - THROW_STATUS(stats_add_rack_details(st, rack)); - THROW_STATUS(stats_end_nesting(&st->clus_desc_buf, false)); - } - THROW_STATUS(stats_end_nesting(&st->clus_desc_buf, true)); - return DN_OK; + stats_reset_buf(&st->clus_desc_buf); + return DN_OK; } -static rstatus_t -stats_resize_clus_desc_buf(struct stats *st) -{ - struct server_pool *sp = &st->ctx->pool; - ASSERT(sp); - size_t size = 1024 * array_n(&sp->peers); - size = DN_ALIGN(size, DN_ALIGNMENT); - if (st->clus_desc_buf.size < size) { - stats_destroy_buf(&st->clus_desc_buf); - st->clus_desc_buf.data = dn_alloc(size); - if (st->clus_desc_buf.data == NULL) { - log_error("create cluster desc buffer of size %zu failed: %s", - size, strerror(errno)); - return DN_ENOMEM; - } - st->clus_desc_buf.size = size; - } +static rstatus_t stats_make_cl_desc_rsp(struct stats *st) { + THROW_STATUS(stats_resize_clus_desc_buf(st)); + THROW_STATUS(stats_begin_nesting(&st->clus_desc_buf, NULL, false)); - stats_reset_buf(&st->clus_desc_buf); - return DN_OK; -} + struct string dcs_str; + string_set_text(&dcs_str, "dcs"); + THROW_STATUS(stats_begin_nesting(&st->clus_desc_buf, &dcs_str, true)); + uint32_t di; + for (di = 0; di < array_n(&gn_pool.datacenters); di++) { + struct gossip_dc *dc = array_get(&gn_pool.datacenters, di); -static rstatus_t -stats_make_cl_desc_rsp(struct stats *st) -{ - THROW_STATUS(stats_resize_clus_desc_buf(st)); THROW_STATUS(stats_begin_nesting(&st->clus_desc_buf, NULL, false)); + THROW_STATUS(stats_add_dc_details(st, dc)); + THROW_STATUS(stats_end_nesting(&st->clus_desc_buf, false)); + } - struct string dcs_str; - string_set_text(&dcs_str, "dcs"); - THROW_STATUS(stats_begin_nesting(&st->clus_desc_buf, &dcs_str, true)); - uint32_t di; - for(di = 0; di < array_n(&gn_pool.datacenters); di++) { - struct gossip_dc *dc = 
array_get(&gn_pool.datacenters, di); - - THROW_STATUS(stats_begin_nesting(&st->clus_desc_buf, NULL, false)); - THROW_STATUS(stats_add_dc_details(st, dc)); - THROW_STATUS(stats_end_nesting(&st->clus_desc_buf, false)); - - } - - THROW_STATUS(stats_end_nesting(&st->clus_desc_buf, true)); - THROW_STATUS(stats_add_footer(&st->clus_desc_buf)); - return DN_OK; + THROW_STATUS(stats_end_nesting(&st->clus_desc_buf, true)); + THROW_STATUS(stats_add_footer(&st->clus_desc_buf)); + return DN_OK; } +static void parse_request(int sd, struct stats_cmd *st_cmd) { + size_t max_buf_size = 65535; + char mesg[max_buf_size], *reqline[3]; + int rcvd; -static void -parse_request(int sd, struct stats_cmd *st_cmd) -{ - size_t max_buf_size = 99999; - char mesg[max_buf_size], *reqline[3]; - int rcvd; - - memset( (void*)mesg, (int)'\0', max_buf_size ); + memset((void *)mesg, (int)'\0', max_buf_size); - rcvd=recv(sd, mesg, max_buf_size, 0); + rcvd = recv(sd, mesg, max_buf_size, 0); - if (rcvd < 0) { - log_debug(LOG_VERB, "stats recv error"); - } else if (rcvd == 0) { // receive socket closed - log_debug(LOG_VERB, "Client disconnected upexpectedly"); - } else { // message received - log_debug(LOG_VERB, "%s", mesg); - reqline[0] = strtok(mesg, " \t\n"); - if (!reqline[0]) { + if (rcvd < 0) { + log_debug(LOG_VERB, "stats recv error"); + } else if (rcvd == 0) { // receive socket closed + log_debug(LOG_VERB, "Client disconnected upexpectedly"); + } else { // message received + log_debug(LOG_VERB, "%s", mesg); + reqline[0] = strtok(mesg, " \t\n"); + if (!reqline[0]) { + return; + } + if (strncmp(reqline[0], "GET\0", 4) == 0) { + reqline[1] = strtok(NULL, " \t"); + reqline[2] = strtok(NULL, " \t\n"); + log_debug(LOG_VERB, "0: %s\n", reqline[0]); + log_debug(LOG_VERB, "1: %s\n", reqline[1]); + log_debug(LOG_VERB, "2: %s\n", reqline[2]); + + if (!reqline[1] || !reqline[2] || + (strncmp(reqline[2], "HTTP/1.0", 8) != 0 && + strncmp(reqline[2], "HTTP/1.1", 8) != 0)) { + ssize_t wrote = write(sd, "HTTP/1.0 
400 Bad Request\n", 25); + IGNORE_RET_VAL(wrote); + st_cmd->cmd = CMD_UNKNOWN; + return; + } else { + if (strncmp(reqline[1], "/\0", 2) == 0) { + reqline[1] = "/info"; + return; + } else if (strcmp(reqline[1], "/info") == 0) { + st_cmd->cmd = CMD_INFO; + return; + } else if (strcmp(reqline[1], "/help") == 0) { + st_cmd->cmd = CMD_HELP; + return; + } else if (strcmp(reqline[1], "/ping") == 0) { + st_cmd->cmd = CMD_PING; + return; + } else if (strncmp(reqline[1], "/setloglevel", + dn_strlen("/setloglevel")) == 0) { + st_cmd->cmd = CMD_SET_LOG_LEVEL; + log_notice("Setting loglevel: %s", reqline[1]); + char *val = reqline[1] + dn_strlen("/setloglevel"); + if (*val != '/') { + st_cmd->cmd = CMD_UNKNOWN; return; + } else { + val++; + string_init(&st_cmd->req_data); + string_copy_c(&st_cmd->req_data, val); + } + return; + } else if (strcmp(reqline[1], "/loglevelup") == 0) { + st_cmd->cmd = CMD_LOG_LEVEL_UP; + return; + } else if (strcmp(reqline[1], "/logleveldown") == 0) { + st_cmd->cmd = CMD_LOG_LEVEL_DOWN; + return; + } else if (strcmp(reqline[1], "/historeset") == 0) { + st_cmd->cmd = CMD_HISTO_RESET; + return; + } else if (strcmp(reqline[1], "/cluster_describe") == 0) { + st_cmd->cmd = CMD_CL_DESCRIBE; + return; + } else if (strcmp(reqline[1], "/get_consistency") == 0) { + st_cmd->cmd = CMD_GET_CONSISTENCY; + return; + } else if (strncmp(reqline[1], "/set_consistency", 16) == 0) { + st_cmd->cmd = CMD_SET_CONSISTENCY; + log_notice("Setting consistency parameters: %s", reqline[1]); + char *op = reqline[1] + 16; + if (strncmp(op, "/read", 5) == 0) { + char *type = op + 5; + log_notice("op: %s", op); + log_notice("type: %s", type); + if (!dn_strcasecmp(type, "/" CONF_STR_DC_ONE)) + g_read_consistency = DC_ONE; + else if (!dn_strcasecmp(type, "/" CONF_STR_DC_QUORUM)) + g_read_consistency = DC_QUORUM; + else if (!dn_strcasecmp(type, "/" CONF_STR_DC_SAFE_QUORUM)) + g_read_consistency = DC_SAFE_QUORUM; + else + st_cmd->cmd = CMD_UNKNOWN; + } else if (strncmp(op, "/write", 6) 
== 0) { + char *type = op + 6; + if (!dn_strcasecmp(type, "/" CONF_STR_DC_ONE)) + g_write_consistency = DC_ONE; + else if (!dn_strcasecmp(type, "/" CONF_STR_DC_QUORUM)) + g_write_consistency = DC_QUORUM; + else if (!dn_strcasecmp(type, "/" CONF_STR_DC_SAFE_QUORUM)) + g_write_consistency = DC_SAFE_QUORUM; + else + st_cmd->cmd = CMD_UNKNOWN; + } else + st_cmd->cmd = CMD_UNKNOWN; + return; + } else if (strcmp(reqline[1], "/get_timeout_factor") == 0) { + st_cmd->cmd = CMD_GET_TIMEOUT_FACTOR; + return; + } else if (dn_strncmp(reqline[1], "/set_timeout_factor", 19) == 0) { + st_cmd->cmd = CMD_SET_TIMEOUT_FACTOR; + log_notice("Setting timeout factor: %s", reqline[1]); + char *val = reqline[1] + dn_strlen("/set_timeout_factor"); + if (*val != '/') { + st_cmd->cmd = CMD_UNKNOWN; + return; + } else { + val++; + string_init(&st_cmd->req_data); + string_copy_c(&st_cmd->req_data, val); + } + return; + } else if (strncmp(reqline[1], "/peer", 5) == 0) { + log_debug(LOG_VERB, "Setting peer - URL Parameters : %s", reqline[1]); + char *peer_state = reqline[1] + 5; + log_debug(LOG_VERB, "Peer : %s", peer_state); + if (strncmp(peer_state, "/down", 5) == 0) { + log_debug(LOG_VERB, "Peer's state is down!"); + st_cmd->cmd = CMD_PEER_DOWN; + string_init(&st_cmd->req_data); + string_copy_c(&st_cmd->req_data, peer_state + 6); + } else if (strncmp(peer_state, "/up", 3) == 0) { + log_debug(LOG_VERB, "Peer's state is UP!"); + st_cmd->cmd = CMD_PEER_UP; + string_init(&st_cmd->req_data); + string_copy_c(&st_cmd->req_data, peer_state + 4); + } else if (strncmp(peer_state, "/reset", 6) == 0) { + log_debug(LOG_VERB, "Peer's state is RESET!"); + st_cmd->cmd = CMD_PEER_RESET; + string_init(&st_cmd->req_data); + string_copy_c(&st_cmd->req_data, peer_state + 7); + } else { + st_cmd->cmd = CMD_PING; + } + + return; } - if ( strncmp(reqline[0], "GET\0", 4) == 0 ) { - reqline[1] = strtok (NULL, " \t"); - reqline[2] = strtok (NULL, " \t\n"); - log_debug(LOG_VERB, "0: %s\n", reqline[0]); - 
log_debug(LOG_VERB, "1: %s\n", reqline[1]); - log_debug(LOG_VERB, "2: %s\n", reqline[2]); - - if (!reqline[1] || !reqline[2] || - (strncmp( reqline[2], "HTTP/1.0", 8)!=0 && - strncmp( reqline[2], "HTTP/1.1", 8)!=0)) { - ssize_t wrote = write(sd, "HTTP/1.0 400 Bad Request\n", 25); - IGNORE_RET_VAL(wrote); - st_cmd->cmd = CMD_UNKNOWN; - return; - } else { - if (strncmp(reqline[1], "/\0", 2) == 0 ) { - reqline[1] = "/info"; - return; - } else if (strcmp(reqline[1], "/info") == 0) { - st_cmd->cmd = CMD_INFO; - return; - } else if (strcmp(reqline[1], "/help") == 0) { - st_cmd->cmd = CMD_HELP; - return; - } else if (strcmp(reqline[1], "/ping") == 0) { - st_cmd->cmd = CMD_PING; - return; - } else if (strncmp(reqline[1], "/setloglevel", dn_strlen("/setloglevel")) == 0) { - st_cmd->cmd = CMD_SET_LOG_LEVEL; - log_notice("Setting loglevel: %s", reqline[1]); - char* val = reqline[1] + dn_strlen("/setloglevel"); - if (*val != '/') { - st_cmd->cmd = CMD_UNKNOWN; - return; - } else { - val++; - string_init(&st_cmd->req_data); - string_copy_c(&st_cmd->req_data, val); - } - return; - } else if (strcmp(reqline[1], "/loglevelup") == 0) { - st_cmd->cmd = CMD_LOG_LEVEL_UP; - return; - } else if (strcmp(reqline[1], "/logleveldown") == 0) { - st_cmd->cmd = CMD_LOG_LEVEL_DOWN; - return; - } else if (strcmp(reqline[1], "/historeset") == 0) { - st_cmd->cmd = CMD_HISTO_RESET; - return; - } else if (strcmp(reqline[1], "/cluster_describe") == 0) { - st_cmd->cmd = CMD_CL_DESCRIBE; - return; - } else if (strcmp(reqline[1], "/get_consistency") == 0) { - st_cmd->cmd = CMD_GET_CONSISTENCY; - return; - } else if (strncmp(reqline[1], "/set_consistency", 16) == 0) { - st_cmd->cmd = CMD_SET_CONSISTENCY; - log_notice("Setting consistency parameters: %s", reqline[1]); - char* op = reqline[1] + 16; - if (strncmp(op, "/read", 5) == 0) { - char* type = op + 5; - log_notice("op: %s", op); - log_notice("type: %s", type); - if (!dn_strcasecmp(type, "/"CONF_STR_DC_ONE)) - g_read_consistency = DC_ONE; - else if 
(!dn_strcasecmp(type, "/"CONF_STR_DC_QUORUM)) - g_read_consistency = DC_QUORUM; - else if (!dn_strcasecmp(type, "/"CONF_STR_DC_SAFE_QUORUM)) - g_read_consistency = DC_SAFE_QUORUM; - else - st_cmd->cmd = CMD_UNKNOWN; - } else if (strncmp(op, "/write", 6) == 0) { - char* type = op + 6; - if (!dn_strcasecmp(type, "/"CONF_STR_DC_ONE)) - g_write_consistency = DC_ONE; - else if (!dn_strcasecmp(type, "/"CONF_STR_DC_QUORUM)) - g_write_consistency = DC_QUORUM; - else if (!dn_strcasecmp(type, "/"CONF_STR_DC_SAFE_QUORUM)) - g_write_consistency = DC_SAFE_QUORUM; - else - st_cmd->cmd = CMD_UNKNOWN; - } else - st_cmd->cmd = CMD_UNKNOWN; - return; - } else if (strcmp(reqline[1], "/get_timeout_factor") == 0) { - st_cmd->cmd = CMD_GET_TIMEOUT_FACTOR; - return; - } else if (dn_strncmp(reqline[1], "/set_timeout_factor", 19) == 0) { - st_cmd->cmd = CMD_SET_TIMEOUT_FACTOR; - log_notice("Setting timeout factor: %s", reqline[1]); - char* val = reqline[1] + dn_strlen("/set_timeout_factor"); - if (*val != '/') { - st_cmd->cmd = CMD_UNKNOWN; - return; - } else { - val++; - string_init(&st_cmd->req_data); - string_copy_c(&st_cmd->req_data, val); - } - return; - } else if (strncmp(reqline[1], "/peer", 5) == 0) { - log_debug(LOG_VERB, "Setting peer - URL Parameters : %s", reqline[1]); - char* peer_state = reqline[1] + 5; - log_debug(LOG_VERB, "Peer : %s", peer_state); - if (strncmp(peer_state, "/down", 5) == 0) { - log_debug(LOG_VERB, "Peer's state is down!"); - st_cmd->cmd = CMD_PEER_DOWN; - string_init(&st_cmd->req_data); - string_copy_c(&st_cmd->req_data, peer_state + 6); - } else if (strncmp(peer_state, "/up", 3) == 0) { - log_debug(LOG_VERB, "Peer's state is UP!"); - st_cmd->cmd = CMD_PEER_UP; - string_init(&st_cmd->req_data); - string_copy_c(&st_cmd->req_data, peer_state + 4); - } else if (strncmp(peer_state, "/reset", 6) == 0) { - log_debug(LOG_VERB, "Peer's state is RESET!"); - st_cmd->cmd = CMD_PEER_RESET; - string_init(&st_cmd->req_data); - string_copy_c(&st_cmd->req_data, peer_state 
+ 7); - } else { - st_cmd->cmd = CMD_PING; - } - - return; - } - - if (strncmp(reqline[1], "/state", 6) == 0) { - log_debug(LOG_VERB, "Setting/Getting state - URL Parameters : %s", reqline[1]); - char* state = reqline[1] + 7; - log_debug(LOG_VERB, "cmd : %s", state); - if (strcmp(state, "standby") == 0) { - st_cmd->cmd = CMD_STANDBY; - return; - } else if (strcmp(state, "writes_only") == 0) { - st_cmd->cmd = CMD_WRITES_ONLY; - return; - } else if (strcmp(state, "normal") == 0) { - st_cmd->cmd = CMD_NORMAL; - return; - } else if (strcmp(state, "resuming") == 0) { - st_cmd->cmd = CMD_RESUMING; - return; - } else if (strcmp(state, "get_state") == 0) { - st_cmd->cmd = CMD_GET_STATE; - return; - } - } - - st_cmd->cmd = CMD_PING; - return; - } + + if (strncmp(reqline[1], "/state", 6) == 0) { + log_debug(LOG_VERB, "Setting/Getting state - URL Parameters : %s", + reqline[1]); + char *state = reqline[1] + 7; + log_debug(LOG_VERB, "cmd : %s", state); + if (strcmp(state, "standby") == 0) { + st_cmd->cmd = CMD_STANDBY; + return; + } else if (strcmp(state, "writes_only") == 0) { + st_cmd->cmd = CMD_WRITES_ONLY; + return; + } else if (strcmp(state, "normal") == 0) { + st_cmd->cmd = CMD_NORMAL; + return; + } else if (strcmp(state, "resuming") == 0) { + st_cmd->cmd = CMD_RESUMING; + return; + } else if (strcmp(state, "get_state") == 0) { + st_cmd->cmd = CMD_GET_STATE; + return; + } } + + st_cmd->cmd = CMD_PING; + return; + } } + } } +static rstatus_t stats_http_rsp(int sd, uint8_t *content, size_t len) { + ssize_t n; + uint8_t http_header[MAX_HTTP_HEADER_SIZE]; + memset((void *)http_header, (int)'\0', MAX_HTTP_HEADER_SIZE); + n = dn_snprintf(http_header, MAX_HTTP_HEADER_SIZE, "%.*s %lu \r\n\r\n", + header_str.len, header_str.data, len); -static rstatus_t -stats_http_rsp(int sd, uint8_t *content, size_t len) -{ - ssize_t n; - uint8_t http_header[MAX_HTTP_HEADER_SIZE]; - memset( (void*)http_header, (int)'\0', MAX_HTTP_HEADER_SIZE ); - n = dn_snprintf(http_header, 
MAX_HTTP_HEADER_SIZE, "%.*s %lu \r\n\r\n", header_str.len, header_str.data, len); - - if (n < 0 || n >= MAX_HTTP_HEADER_SIZE) { - return DN_ERROR; - } + if (n < 0 || n >= MAX_HTTP_HEADER_SIZE) { + return DN_ERROR; + } - n = dn_sendn(sd, http_header, n); - if (n < 0) { - log_error("send http headers on sd %d failed: %s", sd, strerror(errno)); - close(sd); - return DN_ERROR; - } - - n = dn_sendn(sd, content, len); + n = dn_sendn(sd, http_header, n); + if (n < 0) { + log_error("send http headers on sd %d failed: %s", sd, strerror(errno)); + close(sd); + return DN_ERROR; + } - if (n < 0) { - log_error("send stats on sd %d failed: %s", sd, strerror(errno)); - close(sd); - return DN_ERROR; - } + n = dn_sendn(sd, content, len); + if (n < 0) { + log_error("send stats on sd %d failed: %s", sd, strerror(errno)); close(sd); + return DN_ERROR; + } - return DN_OK; -} + close(sd); + return DN_OK; +} -static rstatus_t -stats_send_rsp(struct stats *st) -{ - int sd; +static rstatus_t stats_send_rsp(struct stats *st) { + int sd; - sd = accept(st->sd, NULL, NULL); - if (sd < 0) { - log_error("accept on m %d failed: %s", st->sd, strerror(errno)); - return DN_ERROR; - } + sd = accept(st->sd, NULL, NULL); + if (sd < 0) { + log_error("accept on m %d failed: %s", st->sd, strerror(errno)); + return DN_ERROR; + } - struct stats_cmd st_cmd; + struct stats_cmd st_cmd; - parse_request(sd, &st_cmd); - stats_cmd_t cmd = st_cmd.cmd; + parse_request(sd, &st_cmd); + stats_cmd_t cmd = st_cmd.cmd; - log_debug(LOG_VERB, "cmd %d", cmd); + log_debug(LOG_VERB, "cmd %d", cmd); - if (cmd == CMD_INFO) { - if (stats_make_info_rsp(st) != DN_OK) - return stats_http_rsp(sd, err_resp.data, err_resp.len); - else { - log_debug(LOG_VERB, "send stats on sd %d %d bytes", sd, st->buf.len); - return stats_http_rsp(sd, st->buf.data, st->buf.len); - } - } else if (cmd == CMD_HELP) { - char rsp[5120]; - dn_sprintf(rsp, "/info\n/help\n/ping\n/cluster_describe\n"\ - 
"/setloglevel/<0-11>\n/loglevelup\n/logleveldown\n/historeset\n"\ - "/get_consistency\n/set_consistency//\n"\ - "/get_timeout_factor\n/set_timeout_factor/<1-10>\n/peer/\n"\ - "/state/\n\n", "resuming"); - return stats_http_rsp(sd, rsp, dn_strlen(rsp)); - } else if (cmd == CMD_NORMAL) { - core_set_local_state(st->ctx, NORMAL); - return stats_http_rsp(sd, ok.data, ok.len); - } else if (cmd == CMD_CL_DESCRIBE) { - if (stats_make_cl_desc_rsp(st) != DN_OK) - return stats_http_rsp(sd, err_resp.data, err_resp.len); - else - return stats_http_rsp(sd, st->clus_desc_buf.data, st->clus_desc_buf.len); - } else if (cmd == CMD_STANDBY) { - core_set_local_state(st->ctx, STANDBY); - return stats_http_rsp(sd, ok.data, ok.len); - } else if (cmd == CMD_WRITES_ONLY) { - core_set_local_state(st->ctx, WRITES_ONLY); - return stats_http_rsp(sd, ok.data, ok.len); - } else if (cmd == CMD_RESUMING) { - core_set_local_state(st->ctx, RESUMING); - return stats_http_rsp(sd, ok.data, ok.len); - } else if (cmd == CMD_GET_STATE) { - char rsp[1024]; - dn_sprintf(rsp, "State: %s\n", get_state(st->ctx->dyn_state)); - return stats_http_rsp(sd, rsp, dn_strlen(rsp)); - } else if (cmd == CMD_SET_LOG_LEVEL) { - int8_t loglevel = 0; - log_warn("st_cmd.req_data '%.*s' ", st_cmd.req_data); - sscanf(st_cmd.req_data.data, "%d", &loglevel); - log_warn("setting log level = %d", loglevel); - log_level_set(loglevel); - return stats_http_rsp(sd, ok.data, ok.len); - } else if (cmd == CMD_LOG_LEVEL_UP) { - log_level_up(); - return stats_http_rsp(sd, ok.data, ok.len); - } else if (cmd == CMD_LOG_LEVEL_DOWN) { - log_level_down(); - return stats_http_rsp(sd, ok.data, ok.len); - } else if (cmd == CMD_HISTO_RESET) { - st->reset_histogram = 1; - st->updated = 1; - return stats_http_rsp(sd, ok.data, ok.len); - } else if (cmd == CMD_GET_CONSISTENCY) { - char cons_rsp[1024]; - dn_sprintf(cons_rsp, "Read Consistency: %s\r\nWrite Consistency: %s\r\n", - get_consistency_string(g_read_consistency), - 
get_consistency_string(g_write_consistency)); - return stats_http_rsp(sd, cons_rsp, dn_strlen(cons_rsp)); - } else if (cmd == CMD_GET_TIMEOUT_FACTOR) { - char rsp[1024]; - dn_sprintf(rsp, "Timeout factor: %d\n", g_timeout_factor); - return stats_http_rsp(sd, rsp, dn_strlen(rsp)); - } else if (cmd == CMD_SET_TIMEOUT_FACTOR) { - int8_t timeout_factor = 0; - log_warn("st_cmd.req_data '%.*s' ", st_cmd.req_data); - sscanf(st_cmd.req_data.data, "%d", &timeout_factor); - log_warn("timeout factor = %d", timeout_factor); - // make sure timeout factor is within a range - if (timeout_factor < 1) - timeout_factor = 1; - if (timeout_factor > 10) - timeout_factor = 10; - g_timeout_factor = timeout_factor; - log_warn("setting timeout_factor to %d", g_timeout_factor); - return stats_http_rsp(sd, ok.data, ok.len); - } else if (cmd == CMD_PEER_DOWN || cmd == CMD_PEER_UP || cmd == CMD_PEER_RESET) { - log_debug(LOG_VERB, "st_cmd.req_data '%.*s' ", st_cmd.req_data); - struct server_pool *sp = &st->ctx->pool; - uint32_t i, len; - - //I think it is ok to keep this simple without a synchronization - for (i = 0, len = array_n(&sp->peers); i < len; i++) { - struct node *peer = *(struct node **)array_get(&sp->peers, i); - log_debug(LOG_VERB, "peer '%.*s' ", peer->name); - - if (string_compare(&st_cmd.req_data, &all) == 0) { - log_debug(LOG_VERB, "\t\tSetting peer '%.*s' to state %d due to RESET/ALL command", st_cmd.req_data, cmd); - peer->state = RESET; - } else if (string_compare(&peer->name, &st_cmd.req_data) == 0) { - log_debug(LOG_VERB, "\t\tSetting peer '%.*s' to a new state due to command %d", st_cmd.req_data, cmd); - switch (cmd) { - case CMD_PEER_UP: - peer->state = NORMAL; - break; - case CMD_PEER_RESET: - peer->state = RESET; - break; - case CMD_PEER_DOWN: - peer->state = DOWN; - break; - default: - peer->state = NORMAL; - } - break; - } + if (cmd == CMD_INFO) { + if (stats_make_info_rsp(st) != DN_OK) + return stats_http_rsp(sd, err_resp.data, err_resp.len); + else { + 
log_debug(LOG_VERB, "send stats on sd %d %d bytes", sd, st->buf.len); + return stats_http_rsp(sd, st->buf.data, st->buf.len); + } + } else if (cmd == CMD_HELP) { + char rsp[5120]; + dn_sprintf(rsp, + "/info\n/help\n/ping\n/cluster_describe\n" + "/setloglevel/<0-11>\n/loglevelup\n/logleveldown\n/historeset\n" + "/get_consistency\n/set_consistency//" + "\n" + "/get_timeout_factor\n/set_timeout_factor/<1-10>\n/peer/" + "\n" + "/state/\n\n", + "resuming"); + return stats_http_rsp(sd, rsp, dn_strlen(rsp)); + } else if (cmd == CMD_NORMAL) { + core_set_local_state(st->ctx, NORMAL); + return stats_http_rsp(sd, ok.data, ok.len); + } else if (cmd == CMD_CL_DESCRIBE) { + if (stats_make_cl_desc_rsp(st) != DN_OK) + return stats_http_rsp(sd, err_resp.data, err_resp.len); + else + return stats_http_rsp(sd, st->clus_desc_buf.data, st->clus_desc_buf.len); + } else if (cmd == CMD_STANDBY) { + core_set_local_state(st->ctx, STANDBY); + return stats_http_rsp(sd, ok.data, ok.len); + } else if (cmd == CMD_WRITES_ONLY) { + core_set_local_state(st->ctx, WRITES_ONLY); + return stats_http_rsp(sd, ok.data, ok.len); + } else if (cmd == CMD_RESUMING) { + core_set_local_state(st->ctx, RESUMING); + return stats_http_rsp(sd, ok.data, ok.len); + } else if (cmd == CMD_GET_STATE) { + char rsp[1024]; + dn_sprintf(rsp, "State: %s\n", get_state(st->ctx->dyn_state)); + return stats_http_rsp(sd, rsp, dn_strlen(rsp)); + } else if (cmd == CMD_SET_LOG_LEVEL) { + int8_t loglevel = 0; + log_warn("st_cmd.req_data '%.*s' ", st_cmd.req_data); + sscanf(st_cmd.req_data.data, "%d", &loglevel); + log_warn("setting log level = %d", loglevel); + log_level_set(loglevel); + return stats_http_rsp(sd, ok.data, ok.len); + } else if (cmd == CMD_LOG_LEVEL_UP) { + log_level_up(); + return stats_http_rsp(sd, ok.data, ok.len); + } else if (cmd == CMD_LOG_LEVEL_DOWN) { + log_level_down(); + return stats_http_rsp(sd, ok.data, ok.len); + } else if (cmd == CMD_HISTO_RESET) { + st->reset_histogram = 1; + st->updated = 1; + return 
stats_http_rsp(sd, ok.data, ok.len); + } else if (cmd == CMD_GET_CONSISTENCY) { + char cons_rsp[1024]; + dn_sprintf(cons_rsp, "Read Consistency: %s\r\nWrite Consistency: %s\r\n", + get_consistency_string(g_read_consistency), + get_consistency_string(g_write_consistency)); + return stats_http_rsp(sd, cons_rsp, dn_strlen(cons_rsp)); + } else if (cmd == CMD_GET_TIMEOUT_FACTOR) { + char rsp[1024]; + dn_sprintf(rsp, "Timeout factor: %d\n", g_timeout_factor); + return stats_http_rsp(sd, rsp, dn_strlen(rsp)); + } else if (cmd == CMD_SET_TIMEOUT_FACTOR) { + int8_t timeout_factor = 0; + log_warn("st_cmd.req_data '%.*s' ", st_cmd.req_data); + sscanf(st_cmd.req_data.data, "%d", &timeout_factor); + log_warn("timeout factor = %d", timeout_factor); + // make sure timeout factor is within a range + if (timeout_factor < 1) timeout_factor = 1; + if (timeout_factor > 10) timeout_factor = 10; + g_timeout_factor = timeout_factor; + log_warn("setting timeout_factor to %d", g_timeout_factor); + return stats_http_rsp(sd, ok.data, ok.len); + } else if (cmd == CMD_PEER_DOWN || cmd == CMD_PEER_UP || + cmd == CMD_PEER_RESET) { + log_debug(LOG_VERB, "st_cmd.req_data '%.*s' ", st_cmd.req_data); + struct server_pool *sp = &st->ctx->pool; + uint32_t i, len; + + // I think it is ok to keep this simple without a synchronization + for (i = 0, len = array_n(&sp->peers); i < len; i++) { + struct node *peer = *(struct node **)array_get(&sp->peers, i); + log_debug(LOG_VERB, "peer '%.*s' ", peer->name); + + if (string_compare(&st_cmd.req_data, &all) == 0) { + log_debug( + LOG_VERB, + "\t\tSetting peer '%.*s' to state %d due to RESET/ALL command", + st_cmd.req_data, cmd); + peer->state = RESET; + } else if (string_compare(&peer->name, &st_cmd.req_data) == 0) { + log_debug(LOG_VERB, + "\t\tSetting peer '%.*s' to a new state due to command %d", + st_cmd.req_data, cmd); + switch (cmd) { + case CMD_PEER_UP: + peer->state = NORMAL; + break; + case CMD_PEER_RESET: + peer->state = RESET; + break; + case 
CMD_PEER_DOWN: + peer->state = DOWN; + break; + default: + peer->state = NORMAL; } - string_deinit(&st_cmd.req_data); - } else { - log_debug(LOG_VERB, "Unsupported cmd"); + break; + } } + string_deinit(&st_cmd.req_data); + } else { + log_debug(LOG_VERB, "Unsupported cmd"); + } - stats_http_rsp(sd, ok.data, ok.len); - close(sd); + stats_http_rsp(sd, ok.data, ok.len); + close(sd); - return DN_OK; + return DN_OK; } -static void -stats_loop_callback(void *arg1, void *arg2) -{ - struct stats *st = arg1; - int n = *((int *)arg2); +static void stats_loop_callback(void *arg1, void *arg2) { + struct stats *st = arg1; + int n = *((int *)arg2); - /* aggregate stats from shadow (b) -> sum (c) */ - stats_aggregate(st); + /* aggregate stats from shadow (b) -> sum (c) */ + stats_aggregate(st); - if (n == 0) { - return; - } + if (n == 0) { + return; + } - /* send aggregate stats sum (c) to collector */ - stats_send_rsp(st); + /* send aggregate stats sum (c) to collector */ + stats_send_rsp(st); } -static void * -stats_loop(void *arg) -{ - event_loop_stats(stats_loop_callback, arg); - return NULL; +static void *stats_loop(void *arg) { + event_loop_stats(stats_loop_callback, arg); + return NULL; } -static rstatus_t -stats_listen(struct stats *st) -{ - rstatus_t status; - struct sockinfo si; +static rstatus_t stats_listen(struct stats *st) { + rstatus_t status; + struct sockinfo si; - status = dn_resolve(&st->addr, st->port, &si); - if (status < 0) { - return status; - } + status = dn_resolve(&st->addr, st->port, &si); + if (status < 0) { + return status; + } - st->sd = socket(si.family, SOCK_STREAM, 0); - if (st->sd < 0) { - log_error("socket failed: %s", strerror(errno)); - return DN_ERROR; - } + st->sd = socket(si.family, SOCK_STREAM, 0); + if (st->sd < 0) { + log_error("socket failed: %s", strerror(errno)); + return DN_ERROR; + } - status = dn_set_reuseaddr(st->sd); - if (status < 0) { - log_error("set reuseaddr on m %d failed: %s", st->sd, strerror(errno)); - return DN_ERROR; - 
} + status = dn_set_reuseaddr(st->sd); + if (status < 0) { + log_error("set reuseaddr on m %d failed: %s", st->sd, strerror(errno)); + return DN_ERROR; + } - status = bind(st->sd, (struct sockaddr *)&si.addr, si.addrlen); - if (status < 0) { - log_error("bind on m %d to addr '%.*s:%u' failed: %s", st->sd, - st->addr.len, st->addr.data, st->port, strerror(errno)); - return DN_ERROR; - } + status = bind(st->sd, (struct sockaddr *)&si.addr, si.addrlen); + if (status < 0) { + log_error("bind on m %d to addr '%.*s:%u' failed: %s", st->sd, st->addr.len, + st->addr.data, st->port, strerror(errno)); + return DN_ERROR; + } - status = listen(st->sd, SOMAXCONN); - if (status < 0) { - log_error("listen on m %d failed: %s", st->sd, strerror(errno)); - return DN_ERROR; - } + status = listen(st->sd, SOMAXCONN); + if (status < 0) { + log_error("listen on m %d failed: %s", st->sd, strerror(errno)); + return DN_ERROR; + } - log_debug(LOG_NOTICE, "m %d listening on '%.*s:%u'", st->sd, - st->addr.len, st->addr.data, st->port); + log_debug(LOG_NOTICE, "m %d listening on '%.*s:%u'", st->sd, st->addr.len, + st->addr.data, st->port); - return DN_OK; + return DN_OK; } -static rstatus_t -stats_start_aggregator(struct stats *st) -{ - rstatus_t status; +static rstatus_t stats_start_aggregator(struct stats *st) { + rstatus_t status; - if (!stats_enabled) { - return DN_OK; - } + if (!stats_enabled) { + return DN_OK; + } - THROW_STATUS(stats_listen(st)); + THROW_STATUS(stats_listen(st)); - status = pthread_create(&st->tid, NULL, stats_loop, st); - if (status < 0) { - log_error("stats aggregator create failed: %s", strerror(status)); - return DN_ERROR; - } + status = pthread_create(&st->tid, NULL, stats_loop, st); + if (status < 0) { + log_error("stats aggregator create failed: %s", strerror(status)); + return DN_ERROR; + } - return DN_OK; + return DN_OK; } -static void -stats_stop_aggregator(struct stats *st) -{ - if (!stats_enabled) { - return; - } +static void stats_stop_aggregator(struct 
stats *st) { + if (!stats_enabled) { + return; + } - close(st->sd); + close(st->sd); } -struct stats * -stats_create(uint16_t stats_port, struct string pname, msec_t stats_interval, - char *source, struct server_pool *sp, struct context *ctx) -{ - rstatus_t status; - struct stats *st; - - struct string stats_ip; - get_host_from_pname(&stats_ip, &pname); +struct stats *stats_create(uint16_t stats_port, struct string pname, + msec_t stats_interval, char *source, + struct server_pool *sp, struct context *ctx) { + rstatus_t status; + struct stats *st; + struct string stats_ip; + get_host_from_pname(&stats_ip, &pname); - st = dn_alloc(sizeof(*st)); - if (st == NULL) { - return NULL; - } - - st->port = stats_port; - st->interval = stats_interval; - string_init(&st->addr); - if (string_duplicate(&st->addr,&stats_ip) != DN_OK) { - goto error; - } - - st->start_ts = (int64_t)time(NULL); - - st->buf.len = 0; - st->buf.data = NULL; - st->buf.size = 0; - - st->tid = (pthread_t) -1; - st->sd = -1; - - string_set_text(&st->service_str, "service"); - string_set_text(&st->service, "dynomite"); - - string_set_text(&st->source_str, "source"); - string_set_raw(&st->source, source); - - string_set_text(&st->version_str, "version"); - string_set_text(&st->version, VERSION); - - string_set_text(&st->uptime_str, "uptime"); - string_set_text(&st->timestamp_str, "timestamp"); - - //for latency histo - string_set_text(&st->latency_999th_str, "latency_999th"); - string_set_text(&st->latency_99th_str, "latency_99th"); - string_set_text(&st->latency_95th_str, "latency_95th"); - string_set_text(&st->latency_mean_str, "latency_mean"); - string_set_text(&st->latency_max_str, "latency_max"); - - //for payload size histo - string_set_text(&st->payload_size_999th_str, "payload_size_999th"); - string_set_text(&st->payload_size_99th_str, "payload_size_99th"); - string_set_text(&st->payload_size_95th_str, "payload_size_95th"); - string_set_text(&st->payload_size_mean_str, "payload_size_mean"); - 
string_set_text(&st->payload_size_max_str, "payload_size_max"); - - // cross region average latency - string_set_text(&st->cross_region_avg_rtt, "average_cross_region_rtt"); - string_set_text(&st->cross_region_99_rtt, "99_cross_region_rtt"); - - string_set_text(&st->client_out_queue_99, "client_out_queue_99"); - string_set_text(&st->server_in_queue_99, "server_in_queue_99"); - string_set_text(&st->server_out_queue_99, "server_out_queue_99"); - string_set_text(&st->dnode_client_out_queue_99, "dnode_client_out_queue_99"); - string_set_text(&st->peer_in_queue_99, "peer_in_queue_99"); - string_set_text(&st->peer_out_queue_99, "peer_out_queue_99"); - string_set_text(&st->remote_peer_in_queue_99, "remote_peer_in_queue_99"); - string_set_text(&st->remote_peer_out_queue_99, "remote_peer_out_queue_99"); - - string_set_text(&st->alloc_msgs_str, "alloc_msgs"); - string_set_text(&st->free_msgs_str, "free_msgs"); - string_set_text(&st->alloc_mbufs_str, "alloc_mbufs"); - string_set_text(&st->free_mbufs_str, "free_mbufs"); - string_set_text(&st->dyn_memory_str, "dyn_memory"); - - string_set_text(&st->rack_str, "rack"); - - string_copy(&st->rack, sp->rack.data, sp->rack.len); - - string_set_text(&st->dc_str, "dc"); - string_copy(&st->dc, sp->dc.data, sp->dc.len); - - st->updated = 0; - st->aggregate = 0; - - histo_init(&st->latency_histo); - histo_init(&st->payload_size_histo); - - histo_init(&st->server_latency_histo); - histo_init(&st->cross_zone_latency_histo); - histo_init(&st->cross_region_latency_histo); - - histo_init(&st->server_queue_wait_time_histo); - histo_init(&st->cross_zone_queue_wait_time_histo); - histo_init(&st->cross_region_queue_wait_time_histo); - - histo_init(&st->client_out_queue); - histo_init(&st->server_in_queue); - histo_init(&st->server_out_queue); - histo_init(&st->dnode_client_out_queue); - histo_init(&st->peer_in_queue); - histo_init(&st->peer_out_queue); - histo_init(&st->remote_peer_in_queue); - histo_init(&st->remote_peer_out_queue); - 
st->reset_histogram = 0; - st->alloc_msgs = 0; - st->free_msgs = 0; - st->alloc_mbufs = 0; - st->free_mbufs = 0; - st->dyn_memory = 0; - - /* map server pool to current (a), shadow (b) and sum (c) */ - - status = stats_pool_init(&st->current, sp); - if (status != DN_OK) { - goto error; - } - - status = stats_pool_init(&st->shadow, sp); - if (status != DN_OK) { - goto error; - } - - status = stats_pool_init(&st->sum, sp); - if (status != DN_OK) { - goto error; - } - - status = stats_create_bufs(st); - if (status != DN_OK) { - goto error; - } - - status = stats_start_aggregator(st); - if (status != DN_OK) { - goto error; - } - - st->ctx = ctx; - return st; - -error: - stats_destroy(st); + st = dn_alloc(sizeof(*st)); + if (st == NULL) { return NULL; -} - -void -stats_destroy(struct stats *st) -{ - stats_stop_aggregator(st); - stats_pool_unmap(&st->sum); - stats_pool_unmap(&st->shadow); - stats_pool_unmap(&st->current); - stats_destroy_buf(&st->buf); - stats_destroy_buf(&st->clus_desc_buf); - dn_free(st); -} - -void -stats_swap(struct stats *st) -{ - struct rusage r_usage; - if (!stats_enabled) { - return; - } + } + + st->port = stats_port; + st->interval = stats_interval; + string_init(&st->addr); + if (string_duplicate(&st->addr, &stats_ip) != DN_OK) { + goto error; + } + + st->start_ts = (int64_t)time(NULL); + + st->buf.len = 0; + st->buf.data = NULL; + st->buf.size = 0; + + st->tid = (pthread_t)-1; + st->sd = -1; + + string_set_text(&st->service_str, "service"); + string_set_text(&st->service, "dynomite"); + + string_set_text(&st->source_str, "source"); + string_set_raw(&st->source, source); + + string_set_text(&st->version_str, "version"); + string_set_text(&st->version, VERSION); + + string_set_text(&st->uptime_str, "uptime"); + string_set_text(&st->timestamp_str, "timestamp"); + + // for latency histo + string_set_text(&st->latency_999th_str, "latency_999th"); + string_set_text(&st->latency_99th_str, "latency_99th"); + string_set_text(&st->latency_95th_str, 
"latency_95th"); + string_set_text(&st->latency_mean_str, "latency_mean"); + string_set_text(&st->latency_max_str, "latency_max"); + + // for payload size histo + string_set_text(&st->payload_size_999th_str, "payload_size_999th"); + string_set_text(&st->payload_size_99th_str, "payload_size_99th"); + string_set_text(&st->payload_size_95th_str, "payload_size_95th"); + string_set_text(&st->payload_size_mean_str, "payload_size_mean"); + string_set_text(&st->payload_size_max_str, "payload_size_max"); + + // cross region average latency + string_set_text(&st->cross_region_avg_rtt, "average_cross_region_rtt"); + string_set_text(&st->cross_region_99_rtt, "99_cross_region_rtt"); + + string_set_text(&st->client_out_queue_99, "client_out_queue_99"); + string_set_text(&st->server_in_queue_99, "server_in_queue_99"); + string_set_text(&st->server_out_queue_99, "server_out_queue_99"); + string_set_text(&st->dnode_client_out_queue_99, "dnode_client_out_queue_99"); + string_set_text(&st->peer_in_queue_99, "peer_in_queue_99"); + string_set_text(&st->peer_out_queue_99, "peer_out_queue_99"); + string_set_text(&st->remote_peer_in_queue_99, "remote_peer_in_queue_99"); + string_set_text(&st->remote_peer_out_queue_99, "remote_peer_out_queue_99"); + + string_set_text(&st->alloc_msgs_str, "alloc_msgs"); + string_set_text(&st->free_msgs_str, "free_msgs"); + string_set_text(&st->alloc_mbufs_str, "alloc_mbufs"); + string_set_text(&st->free_mbufs_str, "free_mbufs"); + string_set_text(&st->dyn_memory_str, "dyn_memory"); + + string_set_text(&st->rack_str, "rack"); + + string_copy(&st->rack, sp->rack.data, sp->rack.len); + + string_set_text(&st->dc_str, "dc"); + string_copy(&st->dc, sp->dc.data, sp->dc.len); + + st->updated = 0; + st->aggregate = 0; + + histo_init(&st->latency_histo); + histo_init(&st->payload_size_histo); + + histo_init(&st->server_latency_histo); + histo_init(&st->cross_zone_latency_histo); + histo_init(&st->cross_region_latency_histo); + + 
histo_init(&st->server_queue_wait_time_histo); + histo_init(&st->cross_zone_queue_wait_time_histo); + histo_init(&st->cross_region_queue_wait_time_histo); + + histo_init(&st->client_out_queue); + histo_init(&st->server_in_queue); + histo_init(&st->server_out_queue); + histo_init(&st->dnode_client_out_queue); + histo_init(&st->peer_in_queue); + histo_init(&st->peer_out_queue); + histo_init(&st->remote_peer_in_queue); + histo_init(&st->remote_peer_out_queue); + st->reset_histogram = 0; + st->alloc_msgs = 0; + st->free_msgs = 0; + st->alloc_mbufs = 0; + st->free_mbufs = 0; + st->dyn_memory = 0; + + /* map server pool to current (a), shadow (b) and sum (c) */ + + status = stats_pool_init(&st->current, sp); + if (status != DN_OK) { + goto error; + } + + status = stats_pool_init(&st->shadow, sp); + if (status != DN_OK) { + goto error; + } + + status = stats_pool_init(&st->sum, sp); + if (status != DN_OK) { + goto error; + } + + status = stats_create_bufs(st); + if (status != DN_OK) { + goto error; + } + + status = stats_start_aggregator(st); + if (status != DN_OK) { + goto error; + } + + st->ctx = ctx; + return st; - if (st->aggregate == 1) { - log_debug(LOG_PVERB, "skip swap of current %p shadow %p as aggregator " - "is busy", &st->current, &st->shadow); - return; - } - - if (st->updated == 0) { - log_debug(LOG_PVERB, "skip swap of current %p shadow %p as there is " - "nothing new", &st->current, &st->shadow); - return; - } +error: + stats_destroy(st); + return NULL; +} + +void stats_destroy(struct stats *st) { + stats_stop_aggregator(st); + stats_pool_unmap(&st->sum); + stats_pool_unmap(&st->shadow); + stats_pool_unmap(&st->current); + stats_destroy_buf(&st->buf); + stats_destroy_buf(&st->clus_desc_buf); + dn_free(st); +} + +void stats_swap(struct stats *st) { + struct rusage r_usage; + if (!stats_enabled) { + return; + } - log_debug(LOG_PVERB, "swap stats current %p shadow %p", &st->current, - &st->shadow); + if (st->aggregate == 1) { + log_debug(LOG_PVERB, + "skip 
swap of current %p shadow %p as aggregator " + "is busy", + &st->current, &st->shadow); + return; + } + if (st->updated == 0) { + log_debug(LOG_PVERB, + "skip swap of current %p shadow %p as there is " + "nothing new", + &st->current, &st->shadow); + return; + } - //set the latencies - histo_compute(&st->latency_histo); + log_debug(LOG_PVERB, "swap stats current %p shadow %p", &st->current, + &st->shadow); - histo_compute(&st->payload_size_histo); + // set the latencies + histo_compute(&st->latency_histo); - histo_compute(&st->server_latency_histo); - histo_compute(&st->cross_zone_latency_histo); - histo_compute(&st->cross_region_latency_histo); + histo_compute(&st->payload_size_histo); - histo_compute(&st->server_queue_wait_time_histo); - histo_compute(&st->cross_zone_queue_wait_time_histo); - histo_compute(&st->cross_region_queue_wait_time_histo); + histo_compute(&st->server_latency_histo); + histo_compute(&st->cross_zone_latency_histo); + histo_compute(&st->cross_region_latency_histo); - histo_compute(&st->client_out_queue); - histo_compute(&st->server_in_queue); - histo_compute(&st->server_out_queue); - histo_compute(&st->dnode_client_out_queue); - histo_compute(&st->peer_in_queue); - histo_compute(&st->peer_out_queue); - histo_compute(&st->remote_peer_in_queue); - histo_compute(&st->remote_peer_out_queue); + histo_compute(&st->server_queue_wait_time_histo); + histo_compute(&st->cross_zone_queue_wait_time_histo); + histo_compute(&st->cross_region_queue_wait_time_histo); - st->alloc_msgs = msg_alloc_msgs(); - st->free_msgs = msg_free_queue_size(); - st->alloc_mbufs = mbuf_alloc_get_count(); - st->free_mbufs = mbuf_free_queue_size(); + histo_compute(&st->client_out_queue); + histo_compute(&st->server_in_queue); + histo_compute(&st->server_out_queue); + histo_compute(&st->dnode_client_out_queue); + histo_compute(&st->peer_in_queue); + histo_compute(&st->peer_out_queue); + histo_compute(&st->remote_peer_in_queue); + histo_compute(&st->remote_peer_out_queue); - 
getrusage(RUSAGE_SELF,&r_usage); - st->dyn_memory = r_usage.ru_maxrss; + st->alloc_msgs = msg_alloc_msgs(); + st->free_msgs = msg_free_queue_size(); + st->alloc_mbufs = mbuf_alloc_get_count(); + st->free_mbufs = mbuf_free_queue_size(); - // swap current and shadow - struct stats_pool temp = st->current; - st->current = st->shadow; - st->shadow = temp; + getrusage(RUSAGE_SELF, &r_usage); + st->dyn_memory = r_usage.ru_maxrss; - /* - * Reset current (a) stats before giving it back to generator to keep - * stats addition idempotent - */ - stats_pool_reset(&st->current); - st->updated = 0; + // swap current and shadow + struct stats_pool temp = st->current; + st->current = st->shadow; + st->shadow = temp; - st->aggregate = 1; + /* + * Reset current (a) stats before giving it back to generator to keep + * stats addition idempotent + */ + stats_pool_reset(&st->current); + st->updated = 0; + st->aggregate = 1; } -uint64_t -_stats_pool_get_ts(struct context *ctx, - stats_pool_field_t fidx) -{ - struct stats *st = ctx->stats; - struct stats_pool *stp = &st->current; - struct stats_metric *stm = array_get(&stp->metric, fidx); - return stm->value.counter; +uint64_t _stats_pool_get_ts(struct context *ctx, stats_pool_field_t fidx) { + struct stats *st = ctx->stats; + struct stats_pool *stp = &st->current; + struct stats_metric *stm = array_get(&stp->metric, fidx); + return stm->value.counter; } -int64_t -_stats_pool_get_val(struct context *ctx, - stats_pool_field_t fidx) -{ - struct stats *st = ctx->stats; - struct stats_pool *stp = &st->current; - struct stats_metric *stm = array_get(&stp->metric, fidx); - return stm->value.counter; +int64_t _stats_pool_get_val(struct context *ctx, stats_pool_field_t fidx) { + struct stats *st = ctx->stats; + struct stats_pool *stp = &st->current; + struct stats_metric *stm = array_get(&stp->metric, fidx); + return stm->value.counter; } - -static struct stats_metric * -stats_pool_to_metric(struct context *ctx, - stats_pool_field_t fidx) -{ - 
struct stats *st = ctx->stats; - struct stats_pool *stp = &st->current; - struct stats_metric *stm = array_get(&stp->metric, fidx); - st->updated = 1; - return stm; +static struct stats_metric *stats_pool_to_metric(struct context *ctx, + stats_pool_field_t fidx) { + struct stats *st = ctx->stats; + struct stats_pool *stp = &st->current; + struct stats_metric *stm = array_get(&stp->metric, fidx); + st->updated = 1; + return stm; } -void -_stats_pool_incr(struct context *ctx, - stats_pool_field_t fidx) -{ - struct stats_metric *stm = stats_pool_to_metric(ctx, fidx); +void _stats_pool_incr(struct context *ctx, stats_pool_field_t fidx) { + struct stats_metric *stm = stats_pool_to_metric(ctx, fidx); - ASSERT(stm->type == STATS_COUNTER || stm->type == STATS_GAUGE); - stm->value.counter++; + ASSERT(stm->type == STATS_COUNTER || stm->type == STATS_GAUGE); + stm->value.counter++; - log_debug(LOG_VVVERB, "incr field '%.*s' to %"PRId64"", stm->name.len, - stm->name.data, stm->value.counter); + log_debug(LOG_VVVERB, "incr field '%.*s' to %" PRId64 "", stm->name.len, + stm->name.data, stm->value.counter); } -void -_stats_pool_decr(struct context *ctx, - stats_pool_field_t fidx) -{ - struct stats_metric *stm = stats_pool_to_metric(ctx, fidx); +void _stats_pool_decr(struct context *ctx, stats_pool_field_t fidx) { + struct stats_metric *stm = stats_pool_to_metric(ctx, fidx); - ASSERT(stm->type == STATS_GAUGE); - stm->value.counter--; + ASSERT(stm->type == STATS_GAUGE); + stm->value.counter--; - log_debug(LOG_VVVERB, "decr field '%.*s' to %"PRId64"", stm->name.len, - stm->name.data, stm->value.counter); + log_debug(LOG_VVVERB, "decr field '%.*s' to %" PRId64 "", stm->name.len, + stm->name.data, stm->value.counter); } -void -_stats_pool_incr_by(struct context *ctx, - stats_pool_field_t fidx, int64_t val) -{ - struct stats_metric *stm = stats_pool_to_metric(ctx, fidx); +void _stats_pool_incr_by(struct context *ctx, stats_pool_field_t fidx, + int64_t val) { + struct stats_metric *stm 
= stats_pool_to_metric(ctx, fidx); - ASSERT(stm->type == STATS_COUNTER || stm->type == STATS_GAUGE); - stm->value.counter += val; + ASSERT(stm->type == STATS_COUNTER || stm->type == STATS_GAUGE); + stm->value.counter += val; - log_debug(LOG_VVVERB, "incr by field '%.*s' to %"PRId64"", stm->name.len, - stm->name.data, stm->value.counter); + log_debug(LOG_VVVERB, "incr by field '%.*s' to %" PRId64 "", stm->name.len, + stm->name.data, stm->value.counter); } -void -_stats_pool_decr_by(struct context *ctx, - stats_pool_field_t fidx, int64_t val) -{ - struct stats_metric *stm = stats_pool_to_metric(ctx, fidx); +void _stats_pool_decr_by(struct context *ctx, stats_pool_field_t fidx, + int64_t val) { + struct stats_metric *stm = stats_pool_to_metric(ctx, fidx); - ASSERT(stm->type == STATS_GAUGE); - stm->value.counter -= val; + ASSERT(stm->type == STATS_GAUGE); + stm->value.counter -= val; - log_debug(LOG_VVVERB, "decr by field '%.*s' to %"PRId64"", stm->name.len, - stm->name.data, stm->value.counter); + log_debug(LOG_VVVERB, "decr by field '%.*s' to %" PRId64 "", stm->name.len, + stm->name.data, stm->value.counter); } -void -_stats_pool_set_ts(struct context *ctx, - stats_pool_field_t fidx, int64_t val) -{ - struct stats_metric *stm = stats_pool_to_metric(ctx, fidx); +void _stats_pool_set_ts(struct context *ctx, stats_pool_field_t fidx, + int64_t val) { + struct stats_metric *stm = stats_pool_to_metric(ctx, fidx); - ASSERT(stm->type == STATS_TIMESTAMP); - stm->value.timestamp = val; + ASSERT(stm->type == STATS_TIMESTAMP); + stm->value.timestamp = val; - log_debug(LOG_VVVERB, "set ts field '%.*s' to %"PRId64"", stm->name.len, - stm->name.data, stm->value.timestamp); + log_debug(LOG_VVVERB, "set ts field '%.*s' to %" PRId64 "", stm->name.len, + stm->name.data, stm->value.timestamp); } -uint64_t -_stats_server_get_ts(struct context *ctx, - stats_server_field_t fidx) -{ - struct stats *st = ctx->stats; - struct stats_pool *stp = &st->current; - struct stats_server *sts = 
&stp->server; - struct stats_metric *stm = array_get(&sts->metric, fidx); +uint64_t _stats_server_get_ts(struct context *ctx, stats_server_field_t fidx) { + struct stats *st = ctx->stats; + struct stats_pool *stp = &st->current; + struct stats_server *sts = &stp->server; + struct stats_metric *stm = array_get(&sts->metric, fidx); - return stm->value.timestamp; + return stm->value.timestamp; } -void -_stats_pool_set_val(struct context *ctx, - stats_pool_field_t fidx, int64_t val) -{ - struct stats_metric *stm = stats_pool_to_metric(ctx, fidx); +void _stats_pool_set_val(struct context *ctx, stats_pool_field_t fidx, + int64_t val) { + struct stats_metric *stm = stats_pool_to_metric(ctx, fidx); - stm->value.counter = val; + stm->value.counter = val; - log_debug(LOG_VVVERB, "set val field '%.*s' to %"PRId64"", stm->name.len, - stm->name.data, stm->value.counter); + log_debug(LOG_VVVERB, "set val field '%.*s' to %" PRId64 "", stm->name.len, + stm->name.data, stm->value.counter); } -int64_t -_stats_server_get_val(struct context *ctx, - stats_server_field_t fidx) -{ - struct stats *st = ctx->stats; - struct stats_pool *stp = &st->current; - struct stats_server *sts = &stp->server; - struct stats_metric *stm = array_get(&sts->metric, fidx); +int64_t _stats_server_get_val(struct context *ctx, stats_server_field_t fidx) { + struct stats *st = ctx->stats; + struct stats_pool *stp = &st->current; + struct stats_server *sts = &stp->server; + struct stats_metric *stm = array_get(&sts->metric, fidx); - return stm->value.counter; + return stm->value.counter; } -static struct stats_metric * -stats_server_to_metric(struct context *ctx, - stats_server_field_t fidx) -{ - struct stats *st = ctx->stats; - ASSERT(st != NULL); +static struct stats_metric *stats_server_to_metric(struct context *ctx, + stats_server_field_t fidx) { + struct stats *st = ctx->stats; + ASSERT(st != NULL); - struct stats_pool *stp = &st->current; - struct stats_server *sts = &stp->server; - struct stats_metric 
*stm = array_get(&sts->metric, fidx); + struct stats_pool *stp = &st->current; + struct stats_server *sts = &stp->server; + struct stats_metric *stm = array_get(&sts->metric, fidx); - st->updated = 1; + st->updated = 1; - log_debug(LOG_VVVERB, "metric '%.*s' for server", - stm->name.len, stm->name.data); + log_debug(LOG_VVVERB, "metric '%.*s' for server", stm->name.len, + stm->name.data); - return stm; + return stm; } -void -_stats_server_incr(struct context *ctx, - stats_server_field_t fidx) -{ - - struct stats_metric *stm = stats_server_to_metric(ctx, fidx); +void _stats_server_incr(struct context *ctx, stats_server_field_t fidx) { + struct stats_metric *stm = stats_server_to_metric(ctx, fidx); - ASSERT(stm->type == STATS_COUNTER || stm->type == STATS_GAUGE); - stm->value.counter++; + ASSERT(stm->type == STATS_COUNTER || stm->type == STATS_GAUGE); + stm->value.counter++; - log_debug(LOG_VVVERB, "incr field '%.*s' to %"PRId64"", stm->name.len, - stm->name.data, stm->value.counter); - + log_debug(LOG_VVVERB, "incr field '%.*s' to %" PRId64 "", stm->name.len, + stm->name.data, stm->value.counter); } -void -_stats_server_decr(struct context *ctx, - stats_server_field_t fidx) -{ - - struct stats_metric *stm = stats_server_to_metric(ctx, fidx); +void _stats_server_decr(struct context *ctx, stats_server_field_t fidx) { + struct stats_metric *stm = stats_server_to_metric(ctx, fidx); - ASSERT(stm->type == STATS_GAUGE); - stm->value.counter--; - - log_debug(LOG_VVVERB, "decr field '%.*s' to %"PRId64"", stm->name.len, - stm->name.data, stm->value.counter); + ASSERT(stm->type == STATS_GAUGE); + stm->value.counter--; + log_debug(LOG_VVVERB, "decr field '%.*s' to %" PRId64 "", stm->name.len, + stm->name.data, stm->value.counter); } -void -_stats_server_incr_by(struct context *ctx, - stats_server_field_t fidx, int64_t val) -{ - - struct stats_metric *stm = stats_server_to_metric(ctx, fidx); +void _stats_server_incr_by(struct context *ctx, stats_server_field_t fidx, + int64_t 
val) { + struct stats_metric *stm = stats_server_to_metric(ctx, fidx); - ASSERT(stm->type == STATS_COUNTER || stm->type == STATS_GAUGE); - stm->value.counter += val; - - log_debug(LOG_VVVERB, "incr by field '%.*s' to %"PRId64"", stm->name.len, - stm->name.data, stm->value.counter); + ASSERT(stm->type == STATS_COUNTER || stm->type == STATS_GAUGE); + stm->value.counter += val; + log_debug(LOG_VVVERB, "incr by field '%.*s' to %" PRId64 "", stm->name.len, + stm->name.data, stm->value.counter); } -void -_stats_server_decr_by(struct context *ctx, - stats_server_field_t fidx, int64_t val) -{ - - struct stats_metric *stm = stats_server_to_metric(ctx, fidx); +void _stats_server_decr_by(struct context *ctx, stats_server_field_t fidx, + int64_t val) { + struct stats_metric *stm = stats_server_to_metric(ctx, fidx); - ASSERT(stm->type == STATS_GAUGE); - stm->value.counter -= val; - - log_debug(LOG_VVVERB, "decr by field '%.*s' to %"PRId64"", stm->name.len, - stm->name.data, stm->value.counter); + ASSERT(stm->type == STATS_GAUGE); + stm->value.counter -= val; + log_debug(LOG_VVVERB, "decr by field '%.*s' to %" PRId64 "", stm->name.len, + stm->name.data, stm->value.counter); } -void -_stats_server_set_ts(struct context *ctx, - stats_server_field_t fidx, uint64_t val) -{ - - struct stats_metric *stm = stats_server_to_metric(ctx, fidx); +void _stats_server_set_ts(struct context *ctx, stats_server_field_t fidx, + uint64_t val) { + struct stats_metric *stm = stats_server_to_metric(ctx, fidx); - ASSERT(stm->type == STATS_TIMESTAMP); - stm->value.timestamp = val; + ASSERT(stm->type == STATS_TIMESTAMP); + stm->value.timestamp = val; - log_debug(LOG_VVVERB, "set ts field '%.*s' to %"PRId64"", stm->name.len, - stm->name.data, stm->value.timestamp); - + log_debug(LOG_VVVERB, "set ts field '%.*s' to %" PRId64 "", stm->name.len, + stm->name.data, stm->value.timestamp); } -//should use macro or something else to make this more elegant -void stats_histo_add_latency(struct context *ctx, 
uint64_t val) -{ - struct stats *st = ctx->stats; - histo_add(&st->latency_histo, val); - ctx->stats->updated = 1; +// should use macro or something else to make this more elegant +void stats_histo_add_latency(struct context *ctx, uint64_t val) { + struct stats *st = ctx->stats; + histo_add(&st->latency_histo, val); + ctx->stats->updated = 1; } -void stats_histo_add_payloadsize(struct context *ctx, uint64_t val) -{ - struct stats *st = ctx->stats; - histo_add(&st->payload_size_histo, val); - ctx->stats->updated = 1; +void stats_histo_add_payloadsize(struct context *ctx, uint64_t val) { + struct stats *st = ctx->stats; + histo_add(&st->payload_size_histo, val); + ctx->stats->updated = 1; } diff --git a/src/dyn_stats.h b/src/dyn_stats.h index ebd93d256..b1e16bb45 100644 --- a/src/dyn_stats.h +++ b/src/dyn_stats.h @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -20,353 +20,391 @@ * limitations under the License. 
*/ -#include "dyn_core.h" -#include "dyn_histogram.h" - - #ifndef _DYN_STATS_H_ #define _DYN_STATS_H_ - - -#define STATS_POOL_CODEC(ACTION) \ - /* client behavior */ \ - ACTION( client_eof, STATS_COUNTER, "# eof on client connections") \ - ACTION( client_err, STATS_COUNTER, "# errors on client connections") \ - ACTION( client_connections, STATS_GAUGE, "# active client connections") \ - ACTION( client_read_requests, STATS_COUNTER, "# client read requests") \ - ACTION( client_write_requests, STATS_COUNTER, "# client write responses") \ - ACTION( client_dropped_requests, STATS_COUNTER, "# client dropped requests") \ - ACTION( client_non_quorum_w_responses,STATS_COUNTER, "# client non quorum write responses") \ - ACTION( client_non_quorum_r_responses,STATS_COUNTER, "# client non quorum read responses") \ - /* pool behavior */ \ - ACTION( server_ejects, STATS_COUNTER, "# times backend server was ejected") \ - /* dnode client behavior */ \ - ACTION( dnode_client_eof, STATS_COUNTER, "# eof on dnode client connections") \ - ACTION( dnode_client_err, STATS_COUNTER, "# errors on dnode client connections") \ - ACTION( dnode_client_connections, STATS_GAUGE, "# active dnode client connections") \ - ACTION( dnode_client_in_queue, STATS_GAUGE, "# dnode client requests in incoming queue") \ - ACTION( dnode_client_in_queue_bytes, STATS_GAUGE, "current dnode client request bytes in incoming queue") \ - ACTION( dnode_client_out_queue, STATS_GAUGE, "# dnode client requests in outgoing queue") \ - ACTION( dnode_client_out_queue_bytes, STATS_GAUGE, "current dnode client request bytes in outgoing queue") \ - /* peer behavior */ \ - ACTION( peer_dropped_requests, STATS_COUNTER, "# local dc peer dropped requests") \ - ACTION( peer_timedout_requests, STATS_COUNTER, "# local dc peer timedout requests") \ - ACTION( remote_peer_dropped_requests, STATS_COUNTER, "# remote dc peer dropped requests") \ - ACTION( remote_peer_timedout_requests,STATS_COUNTER, "# remote dc peer timedout requests") \ - 
ACTION( remote_peer_failover_requests,STATS_COUNTER, "# remote dc peer failover requests") \ - ACTION( peer_eof, STATS_COUNTER, "# eof on peer connections") \ - ACTION( peer_err, STATS_COUNTER, "# errors on peer connections") \ - ACTION( peer_timedout, STATS_COUNTER, "# timeouts on local dc peer connections") \ - ACTION( remote_peer_timedout, STATS_COUNTER, "# timeouts on remote dc peer connections") \ - ACTION( peer_connections, STATS_GAUGE, "# active peer connections") \ - ACTION( peer_forward_error, STATS_GAUGE, "# times we encountered a peer forwarding error") \ - ACTION( peer_requests, STATS_COUNTER, "# peer requests") \ - ACTION( peer_request_bytes, STATS_COUNTER, "total peer request bytes") \ - ACTION( peer_responses, STATS_COUNTER, "# peer respones") \ - ACTION( peer_response_bytes, STATS_COUNTER, "total peer response bytes") \ - ACTION( peer_ejected_at, STATS_TIMESTAMP, "timestamp when peer was ejected") \ - ACTION( peer_ejects, STATS_COUNTER, "# times a peer was ejected") \ - ACTION( peer_in_queue, STATS_GAUGE, "# local dc peer requests in incoming queue") \ - ACTION( remote_peer_in_queue, STATS_GAUGE, "# remote dc peer requests in incoming queue") \ - ACTION( peer_in_queue_bytes, STATS_GAUGE, "current peer request bytes in incoming queue") \ - ACTION( remote_peer_in_queue_bytes, STATS_GAUGE, "current peer request bytes in incoming queue to remote DC") \ - ACTION( peer_out_queue, STATS_GAUGE, "# local dc peer requests in outgoing queue") \ - ACTION( remote_peer_out_queue, STATS_GAUGE, "# remote dc peer requests in outgoing queue") \ - ACTION( peer_out_queue_bytes, STATS_GAUGE, "current peer request bytes in outgoing queue") \ - ACTION( remote_peer_out_queue_bytes, STATS_GAUGE, "current peer request bytes in outgoing queue to remote DC") \ - ACTION( peer_mismatch_requests, STATS_COUNTER, "current dnode peer mismatched messages") \ - /* forwarder behavior */ \ - ACTION( forward_error, STATS_COUNTER, "# times we encountered a forwarding error") \ - ACTION( 
fragments, STATS_COUNTER, "# fragments created from a multi-vector request") \ - ACTION( stats_count, STATS_COUNTER, "# stats request") \ - -#define STATS_SERVER_CODEC(ACTION) \ - /* server behavior */ \ - ACTION( server_eof, STATS_COUNTER, "# eof on server connections") \ - ACTION( server_err, STATS_COUNTER, "# errors on server connections") \ - ACTION( server_timedout, STATS_COUNTER, "# timeouts on server connections") \ - ACTION( server_ejected_at, STATS_TIMESTAMP, "timestamp when server was ejected in usec since epoch") \ - ACTION( server_dropped_requests, STATS_COUNTER, "# server dropped requests") \ - ACTION( server_timedout_requests, STATS_COUNTER, "# server timedout requests") \ - /* data behavior */ \ - ACTION( read_requests, STATS_COUNTER, "# read requests") \ - ACTION( read_request_bytes, STATS_COUNTER, "total read request bytes") \ - ACTION( write_requests, STATS_COUNTER, "# write requests") \ - ACTION( write_request_bytes, STATS_COUNTER, "total write request bytes") \ - ACTION( read_responses, STATS_COUNTER, "# read respones") \ - ACTION( read_response_bytes, STATS_COUNTER, "total read response bytes") \ - ACTION( write_responses, STATS_COUNTER, "# write respones") \ - ACTION( write_response_bytes, STATS_COUNTER, "total write response bytes") \ - ACTION( in_queue, STATS_GAUGE, "# requests in incoming queue") \ - ACTION( in_queue_bytes, STATS_GAUGE, "current request bytes in incoming queue") \ - ACTION( out_queue, STATS_GAUGE, "# requests in outgoing queue") \ - ACTION( out_queue_bytes, STATS_GAUGE, "current request bytes in outgoing queue") \ - /* Redis */ \ - ACTION( redis_req_get, STATS_COUNTER, "# Redis get") \ - ACTION( redis_req_set, STATS_COUNTER, "# Redis set") \ - ACTION( redis_req_del, STATS_COUNTER, "# Redis del") \ - ACTION( redis_req_incr_decr, STATS_COUNTER, "# Redis incr or decr") \ - ACTION( redis_req_keys, STATS_COUNTER, "# Redis keys") \ - ACTION( redis_req_mget, STATS_COUNTER, "# Redis mget") \ - ACTION( redis_req_scan, STATS_COUNTER, 
"# Redis scan") \ - ACTION( redis_req_sort, STATS_COUNTER, "# Redis sort") \ - ACTION( redis_req_lreqm, STATS_COUNTER, "# Redis lreqm") \ - ACTION( redis_req_sunion, STATS_COUNTER, "# Redis sunion") \ - ACTION( redis_req_ping, STATS_COUNTER, "# Redis ping") \ - ACTION( redis_req_lists, STATS_COUNTER, "# Redis lists") \ - ACTION( redis_req_sets, STATS_COUNTER, "# Redis sets") \ - ACTION( redis_req_hashes, STATS_COUNTER, "# Redis hashes") \ - ACTION( redis_req_sortedsets, STATS_COUNTER, "# Redis sortedsets") \ - ACTION( redis_req_other, STATS_COUNTER, "# Redis other") \ - +#include "dyn_array.h" +#include "dyn_histogram.h" +#include "dyn_string.h" + +// Forward declarations +struct context; +struct server_pool; + +#define STATS_POOL_CODEC(ACTION) \ + /* client behavior */ \ + ACTION(client_eof, STATS_COUNTER, "# eof on client connections") \ + ACTION(client_err, STATS_COUNTER, "# errors on client connections") \ + ACTION(client_connections, STATS_GAUGE, "# active client connections") \ + ACTION(client_read_requests, STATS_COUNTER, "# client read requests") \ + ACTION(client_write_requests, STATS_COUNTER, "# client write responses") \ + ACTION(client_dropped_requests, STATS_COUNTER, "# client dropped requests") \ + ACTION(client_non_quorum_w_responses, STATS_COUNTER, \ + "# client non quorum write responses") \ + ACTION(client_non_quorum_r_responses, STATS_COUNTER, \ + "# client non quorum read responses") \ + /* pool behavior */ \ + ACTION(server_ejects, STATS_COUNTER, "# times backend server was ejected") \ + /* dnode client behavior */ \ + ACTION(dnode_client_eof, STATS_COUNTER, "# eof on dnode client connections") \ + ACTION(dnode_client_err, STATS_COUNTER, \ + "# errors on dnode client connections") \ + ACTION(dnode_client_connections, STATS_GAUGE, \ + "# active dnode client connections") \ + ACTION(dnode_client_in_queue, STATS_GAUGE, \ + "# dnode client requests in incoming queue") \ + ACTION(dnode_client_in_queue_bytes, STATS_GAUGE, \ + "current dnode client 
request bytes in incoming queue") \ + ACTION(dnode_client_out_queue, STATS_GAUGE, \ + "# dnode client requests in outgoing queue") \ + ACTION(dnode_client_out_queue_bytes, STATS_GAUGE, \ + "current dnode client request bytes in outgoing queue") \ + /* peer behavior */ \ + ACTION(peer_dropped_requests, STATS_COUNTER, \ + "# local dc peer dropped requests") \ + ACTION(peer_timedout_requests, STATS_COUNTER, \ + "# local dc peer timedout requests") \ + ACTION(remote_peer_dropped_requests, STATS_COUNTER, \ + "# remote dc peer dropped requests") \ + ACTION(remote_peer_timedout_requests, STATS_COUNTER, \ + "# remote dc peer timedout requests") \ + ACTION(remote_peer_failover_requests, STATS_COUNTER, \ + "# remote dc peer failover requests") \ + ACTION(peer_eof, STATS_COUNTER, "# eof on peer connections") \ + ACTION(peer_err, STATS_COUNTER, "# errors on peer connections") \ + ACTION(peer_timedout, STATS_COUNTER, \ + "# timeouts on local dc peer connections") \ + ACTION(remote_peer_timedout, STATS_COUNTER, \ + "# timeouts on remote dc peer connections") \ + ACTION(peer_connections, STATS_GAUGE, "# active peer connections") \ + ACTION(peer_forward_error, STATS_GAUGE, \ + "# times we encountered a peer forwarding error") \ + ACTION(peer_requests, STATS_COUNTER, "# peer requests") \ + ACTION(peer_request_bytes, STATS_COUNTER, "total peer request bytes") \ + ACTION(peer_responses, STATS_COUNTER, "# peer respones") \ + ACTION(peer_response_bytes, STATS_COUNTER, "total peer response bytes") \ + ACTION(peer_ejected_at, STATS_TIMESTAMP, "timestamp when peer was ejected") \ + ACTION(peer_ejects, STATS_COUNTER, "# times a peer was ejected") \ + ACTION(peer_in_queue, STATS_GAUGE, \ + "# local dc peer requests in incoming queue") \ + ACTION(remote_peer_in_queue, STATS_GAUGE, \ + "# remote dc peer requests in incoming queue") \ + ACTION(peer_in_queue_bytes, STATS_GAUGE, \ + "current peer request bytes in incoming queue") \ + ACTION(remote_peer_in_queue_bytes, STATS_GAUGE, \ + "current 
peer request bytes in incoming queue to remote DC") \ + ACTION(peer_out_queue, STATS_GAUGE, \ + "# local dc peer requests in outgoing queue") \ + ACTION(remote_peer_out_queue, STATS_GAUGE, \ + "# remote dc peer requests in outgoing queue") \ + ACTION(peer_out_queue_bytes, STATS_GAUGE, \ + "current peer request bytes in outgoing queue") \ + ACTION(remote_peer_out_queue_bytes, STATS_GAUGE, \ + "current peer request bytes in outgoing queue to remote DC") \ + ACTION(peer_mismatch_requests, STATS_COUNTER, \ + "current dnode peer mismatched messages") \ + /* forwarder behavior */ \ + ACTION(forward_error, STATS_COUNTER, \ + "# times we encountered a forwarding error") \ + ACTION(fragments, STATS_COUNTER, \ + "# fragments created from a multi-vector request") \ + ACTION(stats_count, STATS_COUNTER, "# stats request") + +#define STATS_SERVER_CODEC(ACTION) \ + /* server behavior */ \ + ACTION(server_eof, STATS_COUNTER, "# eof on server connections") \ + ACTION(server_err, STATS_COUNTER, "# errors on server connections") \ + ACTION(server_timedout, STATS_COUNTER, "# timeouts on server connections") \ + ACTION(server_ejected_at, STATS_TIMESTAMP, \ + "timestamp when server was ejected in usec since epoch") \ + ACTION(server_dropped_requests, STATS_COUNTER, "# server dropped requests") \ + ACTION(server_timedout_requests, STATS_COUNTER, \ + "# server timedout requests") \ + /* data behavior */ \ + ACTION(read_requests, STATS_COUNTER, "# read requests") \ + ACTION(read_request_bytes, STATS_COUNTER, "total read request bytes") \ + ACTION(write_requests, STATS_COUNTER, "# write requests") \ + ACTION(write_request_bytes, STATS_COUNTER, "total write request bytes") \ + ACTION(read_responses, STATS_COUNTER, "# read respones") \ + ACTION(read_response_bytes, STATS_COUNTER, "total read response bytes") \ + ACTION(write_responses, STATS_COUNTER, "# write respones") \ + ACTION(write_response_bytes, STATS_COUNTER, "total write response bytes") \ + ACTION(in_queue, STATS_GAUGE, "# requests 
in incoming queue") \ + ACTION(in_queue_bytes, STATS_GAUGE, \ + "current request bytes in incoming queue") \ + ACTION(out_queue, STATS_GAUGE, "# requests in outgoing queue") \ + ACTION(out_queue_bytes, STATS_GAUGE, \ + "current request bytes in outgoing queue") \ + /* Redis */ \ + ACTION(redis_req_get, STATS_COUNTER, "# Redis get") \ + ACTION(redis_req_set, STATS_COUNTER, "# Redis set") \ + ACTION(redis_req_del, STATS_COUNTER, "# Redis del") \ + ACTION(redis_req_incr_decr, STATS_COUNTER, "# Redis incr or decr") \ + ACTION(redis_req_keys, STATS_COUNTER, "# Redis keys") \ + ACTION(redis_req_mget, STATS_COUNTER, "# Redis mget") \ + ACTION(redis_req_scan, STATS_COUNTER, "# Redis scan") \ + ACTION(redis_req_sort, STATS_COUNTER, "# Redis sort") \ + ACTION(redis_req_lreqm, STATS_COUNTER, "# Redis lreqm") \ + ACTION(redis_req_sunion, STATS_COUNTER, "# Redis sunion") \ + ACTION(redis_req_ping, STATS_COUNTER, "# Redis ping") \ + ACTION(redis_req_lists, STATS_COUNTER, "# Redis lists") \ + ACTION(redis_req_sets, STATS_COUNTER, "# Redis sets") \ + ACTION(redis_req_hashes, STATS_COUNTER, "# Redis hashes") \ + ACTION(redis_req_sortedsets, STATS_COUNTER, "# Redis sortedsets") \ + ACTION(redis_req_other, STATS_COUNTER, "# Redis other") typedef enum stats_type { - STATS_INVALID, - STATS_COUNTER, /* monotonic accumulator */ - STATS_GAUGE, /* non-monotonic accumulator */ - STATS_TIMESTAMP, /* monotonic timestamp (in nsec) */ - STATS_STRING, - STATS_SENTINEL + STATS_INVALID, + STATS_COUNTER, /* monotonic accumulator */ + STATS_GAUGE, /* non-monotonic accumulator */ + STATS_TIMESTAMP, /* monotonic timestamp (in nsec) */ + STATS_STRING, + STATS_SENTINEL } stats_type_t; typedef enum { - CMD_UNKNOWN, - CMD_HELP, - CMD_INFO, - CMD_PING, - CMD_DESCRIBE, - CMD_STANDBY, - CMD_WRITES_ONLY, - CMD_RESUMING, - CMD_NORMAL, - CMD_BOOTSTRAPING, - CMD_LEAVING, - CMD_PEER_DOWN, - CMD_PEER_UP, - CMD_PEER_RESET, - CMD_SET_LOG_LEVEL, - CMD_LOG_LEVEL_UP, - CMD_LOG_LEVEL_DOWN, - CMD_HISTO_RESET, - 
CMD_CL_DESCRIBE, /* cluster_describe */ - CMD_SET_CONSISTENCY, - CMD_GET_CONSISTENCY, - CMD_GET_TIMEOUT_FACTOR, - CMD_SET_TIMEOUT_FACTOR, - CMD_GET_STATE, + CMD_UNKNOWN, + CMD_HELP, + CMD_INFO, + CMD_PING, + CMD_DESCRIBE, + CMD_STANDBY, + CMD_WRITES_ONLY, + CMD_RESUMING, + CMD_NORMAL, + CMD_BOOTSTRAPING, + CMD_LEAVING, + CMD_PEER_DOWN, + CMD_PEER_UP, + CMD_PEER_RESET, + CMD_SET_LOG_LEVEL, + CMD_LOG_LEVEL_UP, + CMD_LOG_LEVEL_DOWN, + CMD_HISTO_RESET, + CMD_CL_DESCRIBE, /* cluster_describe */ + CMD_SET_CONSISTENCY, + CMD_GET_CONSISTENCY, + CMD_GET_TIMEOUT_FACTOR, + CMD_SET_TIMEOUT_FACTOR, + CMD_GET_STATE, } stats_cmd_t; struct stats_metric { - stats_type_t type; /* type */ - struct string name; /* name (ref) */ - union { - int64_t counter; /* accumulating counter */ - int64_t timestamp; /* monotonic timestamp */ - struct string str; /* store string value */ - } value; + stats_type_t type; /* type */ + struct string name; /* name (ref) */ + union { + int64_t counter; /* accumulating counter */ + int64_t timestamp; /* monotonic timestamp */ + struct string str; /* store string value */ + } value; }; struct stats_dnode { - struct string name; /* dnode server name (ref) */ - struct array metric; /* stats_metric[] for dnode server codec */ + struct string name; /* dnode server name (ref) */ + struct array metric; /* stats_metric[] for dnode server codec */ }; struct stats_server { - struct string name; /* server name (ref) */ - struct array metric; /* stats_metric[] for server codec */ + struct string name; /* server name (ref) */ + struct array metric; /* stats_metric[] for server codec */ }; struct stats_pool { - struct string name; /* pool name (ref) */ - struct array metric; /* stats_metric[] for pool codec */ - struct stats_server server; /* stats for datastore */ + struct string name; /* pool name (ref) */ + struct array metric; /* stats_metric[] for pool codec */ + struct stats_server server; /* stats for datastore */ }; struct stats_buffer { - size_t len; /* buffer 
length */ - uint8_t *data; /* buffer data */ - size_t size; /* buffer alloc size */ + size_t len; /* buffer length */ + uint8_t *data; /* buffer data */ + size_t size; /* buffer alloc size */ }; /** \struct stats * Dynomite server performance statistics. */ struct stats { - struct context *ctx; - uint16_t port; /* stats monitoring port */ - msec_t interval; /* stats aggregation interval */ - struct string addr; /* stats monitoring address */ - - int64_t start_ts; /* start timestamp of dynomite */ - struct stats_buffer buf; /* info buffer */ - struct stats_buffer clus_desc_buf; /* cluster_describe buffer */ - - struct stats_pool current; /* stats_pool[] (a) */ - struct stats_pool shadow; /* stats_pool[] (b) */ - struct stats_pool sum; /* stats_pool[] (c = a + b) */ - - pthread_t tid; /* stats aggregator thread */ - int sd; /* stats descriptor */ - - struct string service_str; /* service string */ - struct string service; /* service */ - struct string source_str; /* source string */ - struct string source; /* source */ - struct string version_str; /* version string */ - struct string version; /* version */ - struct string uptime_str; /* uptime string */ - struct string timestamp_str; /* timestamp string */ - struct string latency_999th_str; - struct string latency_99th_str; - struct string latency_95th_str; - struct string latency_mean_str; - struct string latency_max_str; - - struct string payload_size_999th_str; - struct string payload_size_99th_str; - struct string payload_size_95th_str; - struct string payload_size_mean_str; - struct string payload_size_max_str; - - struct string cross_region_avg_rtt; - struct string cross_region_99_rtt; - - struct string client_out_queue_99; - struct string server_in_queue_99; - struct string server_out_queue_99; - struct string dnode_client_out_queue_99; - struct string peer_in_queue_99; - struct string peer_out_queue_99; - struct string remote_peer_in_queue_99; - struct string remote_peer_out_queue_99; - - struct string 
alloc_msgs_str; - struct string free_msgs_str; - struct string alloc_mbufs_str; - struct string free_mbufs_str; - struct string dyn_memory_str; - - struct string rack_str; - struct string rack; - - struct string dc_str; - struct string dc; - - volatile int aggregate; /* shadow (b) aggregate? */ - volatile int updated; /* current (a) updated? */ - volatile bool reset_histogram; - volatile struct histogram latency_histo; - volatile struct histogram payload_size_histo; - - volatile struct histogram server_latency_histo; - volatile struct histogram cross_zone_latency_histo; - volatile struct histogram cross_region_latency_histo; - - volatile struct histogram server_queue_wait_time_histo; - volatile struct histogram cross_zone_queue_wait_time_histo; - volatile struct histogram cross_region_queue_wait_time_histo; - - volatile struct histogram client_out_queue; - volatile struct histogram server_in_queue; - volatile struct histogram server_out_queue; - volatile struct histogram dnode_client_out_queue; - volatile struct histogram peer_in_queue; - volatile struct histogram peer_out_queue; - volatile struct histogram remote_peer_in_queue; - volatile struct histogram remote_peer_out_queue; - - size_t alloc_msgs; - size_t free_msgs; - uint64_t alloc_mbufs; - uint64_t free_mbufs; - uint64_t dyn_memory; - + struct context *ctx; + uint16_t port; /* stats monitoring port */ + msec_t interval; /* stats aggregation interval */ + struct string addr; /* stats monitoring address */ + + int64_t start_ts; /* start timestamp of dynomite */ + struct stats_buffer buf; /* info buffer */ + struct stats_buffer clus_desc_buf; /* cluster_describe buffer */ + + struct stats_pool current; /* stats_pool[] (a) */ + struct stats_pool shadow; /* stats_pool[] (b) */ + struct stats_pool sum; /* stats_pool[] (c = a + b) */ + + pthread_t tid; /* stats aggregator thread */ + int sd; /* stats descriptor */ + + struct string service_str; /* service string */ + struct string service; /* service */ + struct 
string source_str; /* source string */ + struct string source; /* source */ + struct string version_str; /* version string */ + struct string version; /* version */ + struct string uptime_str; /* uptime string */ + struct string timestamp_str; /* timestamp string */ + struct string latency_999th_str; + struct string latency_99th_str; + struct string latency_95th_str; + struct string latency_mean_str; + struct string latency_max_str; + + struct string payload_size_999th_str; + struct string payload_size_99th_str; + struct string payload_size_95th_str; + struct string payload_size_mean_str; + struct string payload_size_max_str; + + struct string cross_region_avg_rtt; + struct string cross_region_99_rtt; + + struct string client_out_queue_99; + struct string server_in_queue_99; + struct string server_out_queue_99; + struct string dnode_client_out_queue_99; + struct string peer_in_queue_99; + struct string peer_out_queue_99; + struct string remote_peer_in_queue_99; + struct string remote_peer_out_queue_99; + + struct string alloc_msgs_str; + struct string free_msgs_str; + struct string alloc_mbufs_str; + struct string free_mbufs_str; + struct string dyn_memory_str; + + struct string rack_str; + struct string rack; + + struct string dc_str; + struct string dc; + + volatile int aggregate; /* shadow (b) aggregate? */ + volatile int updated; /* current (a) updated? 
*/ + volatile bool reset_histogram; + volatile struct histogram latency_histo; + volatile struct histogram payload_size_histo; + + volatile struct histogram server_latency_histo; + volatile struct histogram cross_zone_latency_histo; + volatile struct histogram cross_region_latency_histo; + + volatile struct histogram server_queue_wait_time_histo; + volatile struct histogram cross_zone_queue_wait_time_histo; + volatile struct histogram cross_region_queue_wait_time_histo; + + volatile struct histogram client_out_queue; + volatile struct histogram server_in_queue; + volatile struct histogram server_out_queue; + volatile struct histogram dnode_client_out_queue; + volatile struct histogram peer_in_queue; + volatile struct histogram peer_out_queue; + volatile struct histogram remote_peer_in_queue; + volatile struct histogram remote_peer_out_queue; + + size_t alloc_msgs; + size_t free_msgs; + uint64_t alloc_mbufs; + uint64_t free_mbufs; + uint64_t dyn_memory; }; - #define DEFINE_ACTION(_name, _type, _desc) STATS_POOL_##_name, typedef enum stats_pool_field { - STATS_POOL_CODEC(DEFINE_ACTION) - STATS_POOL_NFIELD + STATS_POOL_CODEC(DEFINE_ACTION) STATS_POOL_NFIELD } stats_pool_field_t; #undef DEFINE_ACTION #define DEFINE_ACTION(_name, _type, _desc) STATS_SERVER_##_name, typedef enum stats_server_field { - STATS_SERVER_CODEC(DEFINE_ACTION) - STATS_SERVER_NFIELD + STATS_SERVER_CODEC(DEFINE_ACTION) STATS_SERVER_NFIELD } stats_server_field_t; #undef DEFINE_ACTION struct stats_cmd { - stats_cmd_t cmd; - struct string req_data; + stats_cmd_t cmd; + struct string req_data; }; - #if defined DN_STATS && DN_STATS == 1 -#define stats_pool_incr(_ctx, _name) do { \ - _stats_pool_incr(_ctx, STATS_POOL_##_name); \ -} while (0) - -#define stats_pool_decr(_ctx, _name) do { \ - _stats_pool_decr(_ctx, STATS_POOL_##_name); \ -} while (0) - -#define stats_pool_incr_by(_ctx, _name, _val) do { \ - _stats_pool_incr_by(_ctx, STATS_POOL_##_name, _val); \ -} while (0) - -#define 
stats_pool_decr_by(_ctx, _name, _val) do { \ - _stats_pool_decr_by(_ctx, STATS_POOL_##_name, _val); \ -} while (0) - -#define stats_pool_set_ts(_ctx, _name, _val) do { \ - _stats_pool_set_ts(_ctx, STATS_POOL_##_name, _val); \ -} while (0) - -#define stats_pool_get_ts(_ctx, _name) \ - _stats_pool_get_ts(_ctx, STATS_POOL_##_name) - -#define stats_pool_set_val(_ctx, _name, _val) do { \ - _stats_pool_set_val(_ctx, STATS_POOL_##_name, _val); \ -} while (0) - -#define stats_pool_get_val(_ctx, _name) \ - _stats_pool_get_val(_ctx, STATS_POOL_##_name) - -#define stats_server_incr(_ctx, _name) do { \ - _stats_server_incr(_ctx, STATS_SERVER_##_name); \ -} while (0) - -#define stats_server_decr(_ctx, _name) do { \ - _stats_server_decr(_ctx, STATS_SERVER_##_name); \ -} while (0) - -#define stats_server_incr_by(_ctx, _name, _val) do { \ - _stats_server_incr_by(_ctx, STATS_SERVER_##_name, _val); \ -} while (0) - -#define stats_server_decr_by(_ctx, _name, _val) do { \ - _stats_server_decr_by(_ctx, STATS_SERVER_##_name, _val); \ -} while (0) - -#define stats_server_set_ts(_ctx, _name, _val) do { \ - _stats_server_set_ts(_ctx, STATS_SERVER_##_name, _val); \ -} while (0) - -#define stats_server_get_ts(_ctx, _name) \ - _stats_server_get_ts(_ctx, STATS_SERVER_##_name) - -#define stats_server_set_val(_ctx, _name, _val) do { \ - _stats_server_set_val(_ctx, STATS_SERVER_##_name, _val); \ -} while (0) - -#define stats_server_get_val(_ctx, _name) \ - _stats_server_get_val(_ctx, STATS_SERVER_##_name) - +#define stats_pool_incr(_ctx, _name) \ + do { \ + _stats_pool_incr(_ctx, STATS_POOL_##_name); \ + } while (0) + +#define stats_pool_decr(_ctx, _name) \ + do { \ + _stats_pool_decr(_ctx, STATS_POOL_##_name); \ + } while (0) + +#define stats_pool_incr_by(_ctx, _name, _val) \ + do { \ + _stats_pool_incr_by(_ctx, STATS_POOL_##_name, _val); \ + } while (0) + +#define stats_pool_decr_by(_ctx, _name, _val) \ + do { \ + _stats_pool_decr_by(_ctx, STATS_POOL_##_name, _val); \ + } while (0) + +#define 
stats_pool_set_ts(_ctx, _name, _val) \ + do { \ + _stats_pool_set_ts(_ctx, STATS_POOL_##_name, _val); \ + } while (0) + +#define stats_pool_get_ts(_ctx, _name) \ + _stats_pool_get_ts(_ctx, STATS_POOL_##_name) + +#define stats_pool_set_val(_ctx, _name, _val) \ + do { \ + _stats_pool_set_val(_ctx, STATS_POOL_##_name, _val); \ + } while (0) + +#define stats_pool_get_val(_ctx, _name) \ + _stats_pool_get_val(_ctx, STATS_POOL_##_name) + +#define stats_server_incr(_ctx, _name) \ + do { \ + _stats_server_incr(_ctx, STATS_SERVER_##_name); \ + } while (0) + +#define stats_server_decr(_ctx, _name) \ + do { \ + _stats_server_decr(_ctx, STATS_SERVER_##_name); \ + } while (0) + +#define stats_server_incr_by(_ctx, _name, _val) \ + do { \ + _stats_server_incr_by(_ctx, STATS_SERVER_##_name, _val); \ + } while (0) + +#define stats_server_decr_by(_ctx, _name, _val) \ + do { \ + _stats_server_decr_by(_ctx, STATS_SERVER_##_name, _val); \ + } while (0) + +#define stats_server_set_ts(_ctx, _name, _val) \ + do { \ + _stats_server_set_ts(_ctx, STATS_SERVER_##_name, _val); \ + } while (0) + +#define stats_server_get_ts(_ctx, _name) \ + _stats_server_get_ts(_ctx, STATS_SERVER_##_name) + +#define stats_server_set_val(_ctx, _name, _val) \ + do { \ + _stats_server_set_val(_ctx, STATS_SERVER_##_name, _val); \ + } while (0) + +#define stats_server_get_val(_ctx, _name) \ + _stats_server_get_val(_ctx, STATS_SERVER_##_name) #else @@ -400,39 +438,44 @@ struct stats_cmd { #endif -#define stats_enabled DN_STATS +#define stats_enabled DN_STATS void stats_describe(void); void _stats_pool_incr(struct context *ctx, stats_pool_field_t fidx); void _stats_pool_decr(struct context *ctx, stats_pool_field_t fidx); -void _stats_pool_incr_by(struct context *ctx, stats_pool_field_t fidx, int64_t val); -void _stats_pool_decr_by(struct context *ctx, stats_pool_field_t fidx, int64_t val); -void _stats_pool_set_ts(struct context *ctx, stats_pool_field_t fidx, int64_t val); -uint64_t _stats_pool_get_ts(struct context 
*ctx,stats_pool_field_t fidx); -void _stats_pool_set_val(struct context *ctx,stats_pool_field_t fidx, int64_t val); -int64_t _stats_pool_get_val(struct context *ctx, - stats_pool_field_t fidx); +void _stats_pool_incr_by(struct context *ctx, stats_pool_field_t fidx, + int64_t val); +void _stats_pool_decr_by(struct context *ctx, stats_pool_field_t fidx, + int64_t val); +void _stats_pool_set_ts(struct context *ctx, stats_pool_field_t fidx, + int64_t val); +uint64_t _stats_pool_get_ts(struct context *ctx, stats_pool_field_t fidx); +void _stats_pool_set_val(struct context *ctx, stats_pool_field_t fidx, + int64_t val); +int64_t _stats_pool_get_val(struct context *ctx, stats_pool_field_t fidx); void _stats_server_incr(struct context *ctx, stats_server_field_t fidx); void _stats_server_decr(struct context *ctx, stats_server_field_t fidx); -void _stats_server_incr_by(struct context *ctx, stats_server_field_t fidx, int64_t val); -void _stats_server_decr_by(struct context *ctx, stats_server_field_t fidx, int64_t val); -void _stats_server_set_ts(struct context *ctx, stats_server_field_t fidx, uint64_t val); +void _stats_server_incr_by(struct context *ctx, stats_server_field_t fidx, + int64_t val); +void _stats_server_decr_by(struct context *ctx, stats_server_field_t fidx, + int64_t val); +void _stats_server_set_ts(struct context *ctx, stats_server_field_t fidx, + uint64_t val); uint64_t _stats_server_get_ts(struct context *ctx, stats_server_field_t fidx); -void _stats_server_set_val(struct context *ctx, stats_server_field_t fidx, int64_t val); +void _stats_server_set_val(struct context *ctx, stats_server_field_t fidx, + int64_t val); int64_t _stats_server_get_val(struct context *ctx, stats_server_field_t fidx); -struct stats * stats_create(uint16_t stats_port, struct string pname, msec_t stats_interval, - char *source, struct server_pool *sp, struct context *ctx); +struct stats *stats_create(uint16_t stats_port, struct string pname, + msec_t stats_interval, char *source, + 
struct server_pool *sp, struct context *ctx); void stats_destroy(struct stats *stats); void stats_swap(struct stats *stats); - void stats_histo_add_latency(struct context *ctx, uint64_t val); void stats_histo_add_payloadsize(struct context *ctx, uint64_t val); - #endif - diff --git a/src/dyn_string.c b/src/dyn_string.c index 5cd3cecbb..374eb8d23 100644 --- a/src/dyn_string.c +++ b/src/dyn_string.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -42,78 +42,65 @@ * to be freed. */ -void -string_init(struct string *str) -{ - str->len = 0; - str->data = NULL; +void string_init(struct string *str) { + str->len = 0; + str->data = NULL; } -void -string_deinit(struct string *str) -{ - ASSERT((str->len == 0 && str->data == NULL) || - (str->len != 0 && str->data != NULL)); +void string_deinit(struct string *str) { + ASSERT((str->len == 0 && str->data == NULL) || + (str->len != 0 && str->data != NULL)); - if (str->data != NULL) { - dn_free(str->data); - string_init(str); - } + if (str->data != NULL) { + dn_free(str->data); + string_init(str); + } } -bool -string_empty(const struct string *str) -{ - ASSERT((str->len == 0 && str->data == NULL) || - (str->len != 0 && str->data != NULL)); - return str->len == 0 ? true : false; +bool string_empty(const struct string *str) { + ASSERT((str->len == 0 && str->data == NULL) || + (str->len != 0 && str->data != NULL)); + return str->len == 0 ? 
true : false; } -rstatus_t -string_duplicate(struct string *dst, const struct string *src) -{ - ASSERT(dst->len == 0 && dst->data == NULL); - ASSERT(src->len != 0 && src->data != NULL); +rstatus_t string_duplicate(struct string *dst, const struct string *src) { + ASSERT(dst->len == 0 && dst->data == NULL); + ASSERT(src->len != 0 && src->data != NULL); - dst->data = dn_strndup(src->data, src->len); - if (dst->data == NULL) { - return DN_ENOMEM; - } + dst->data = dn_strndup(src->data, src->len); + if (dst->data == NULL) { + return DN_ENOMEM; + } - dst->len = dn_strlen(dst->data); - return DN_OK; + dst->len = dn_strlen(dst->data); + return DN_OK; } -rstatus_t -string_copy(struct string *dst, const uint8_t *src, uint32_t srclen) -{ - //ASSERT(dst->len == 0 && dst->data == NULL); - ASSERT(src != NULL && srclen != 0); +rstatus_t string_copy(struct string *dst, const uint8_t *src, uint32_t srclen) { + // ASSERT(dst->len == 0 && dst->data == NULL); + ASSERT(src != NULL && srclen != 0); - dst->data = dn_strndup(src, srclen); - if (dst->data == NULL) { - return DN_ENOMEM; - } + dst->data = dn_strndup(src, srclen); + if (dst->data == NULL) { + return DN_ENOMEM; + } - dst->len = dn_strlen(dst->data); + dst->len = dn_strlen(dst->data); - return DN_OK; + return DN_OK; } /* For copying constant string into dst */ -rstatus_t string_copy_c(struct string *dst, const uint8_t *src) -{ - return string_copy(dst, src, (uint32_t) dn_strlen(src)); +rstatus_t string_copy_c(struct string *dst, const uint8_t *src) { + return string_copy(dst, src, (uint32_t)dn_strlen(src)); } -int -string_compare(const struct string *s1, const struct string *s2) -{ - if (s1->len != s2->len) { - return s1->len - s2->len > 0 ? 1 : -1; - } +int string_compare(const struct string *s1, const struct string *s2) { + if (s1->len != s2->len) { + return s1->len - s2->len > 0 ? 
1 : -1; + } - return dn_strncmp(s1->data, s2->data, s1->len); + return dn_strncmp(s1->data, s2->data, s1->len); } /* diff --git a/src/dyn_string.h b/src/dyn_string.h index 27afa9a57..d786789ba 100644 --- a/src/dyn_string.h +++ b/src/dyn_string.h @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -23,27 +23,32 @@ #ifndef _DYN_STRING_H_ #define _DYN_STRING_H_ +#include #include -#include "dyn_core.h" +#include "dyn_types.h" struct string { - uint32_t len; /* string length */ - uint8_t *data; /* string data */ + uint32_t len; /* string length */ + uint8_t *data; /* string data */ }; -#define string(_str) { sizeof(_str) - 1, (uint8_t *)(_str) } -#define null_string { 0, NULL } +#define string(_str) \ + { sizeof(_str) - 1, (uint8_t *)(_str) } +#define null_string \ + { 0, NULL } -#define string_set_text(_str, _text) do { \ - (_str)->len = (uint32_t)(sizeof(_text) - 1);\ - (_str)->data = (uint8_t *)(_text); \ -} while (0); +#define string_set_text(_str, _text) \ + do { \ + (_str)->len = (uint32_t)(sizeof(_text) - 1); \ + (_str)->data = (uint8_t *)(_text); \ + } while (0); -#define string_set_raw(_str, _raw) do { \ - (_str)->len = (uint32_t)(dn_strlen(_raw)); \ - (_str)->data = (uint8_t *)(_raw); \ -} while (0); +#define string_set_raw(_str, _raw) \ + do { \ + (_str)->len = (uint32_t)(dn_strlen(_raw)); \ + (_str)->data = (uint8_t *)(_raw); \ + } while (0); void string_init(struct string *str); void string_deinit(struct string *str); @@ -57,72 +62,62 @@ int string_compare(const struct string *s1, const struct string *s2); * Wrapper around common routines for manipulating C character * strings */ -#define dn_memcpy(_d, _c, _n) \ - memcpy(_d, _c, (size_t)(_n)) +#define 
dn_memcpy(_d, _c, _n) memcpy(_d, _c, (size_t)(_n)) -#define dn_memmove(_d, _c, _n) \ - memmove(_d, _c, (size_t)(_n)) +#define dn_memmove(_d, _c, _n) memmove(_d, _c, (size_t)(_n)) -#define dn_memchr(_d, _c, _n) \ - memchr(_d, _c, (size_t)(_n)) +#define dn_memchr(_d, _c, _n) memchr(_d, _c, (size_t)(_n)) -#define dn_strlen(_s) \ - (uint32_t)strlen((char *)(_s)) +#define dn_strlen(_s) (uint32_t) strlen((char *)(_s)) -#define dn_strncmp(_s1, _s2, _n) \ - strncmp((char *)(_s1), (char *)(_s2), (size_t)(_n)) +#define dn_strncmp(_s1, _s2, _n) \ + strncmp((char *)(_s1), (char *)(_s2), (size_t)(_n)) -#define dn_strcmp(_s1, _cs2) \ - strncmp((char *)(_s1), (char *)(_cs2), strlen((_cs2))) +#define dn_strcmp(_s1, _cs2) \ + strncmp((char *)(_s1), (char *)(_cs2), strlen((_cs2))) -#define dn_strcasecmp(_s1, _cs2) \ - strncasecmp((char *)(_s1), (char *)(_cs2), strlen((_cs2))) +#define dn_strcasecmp(_s1, _cs2) \ + strncasecmp((char *)(_s1), (char *)(_cs2), strlen((_cs2))) -#define dn_strchr(_p, _l, _c) \ - _dn_strchr((uint8_t *)(_p), (uint8_t *)(_l), (uint8_t)(_c)) +#define dn_strchr(_p, _l, _c) \ + _dn_strchr((uint8_t *)(_p), (uint8_t *)(_l), (uint8_t)(_c)) -#define dn_strrchr(_p, _s, _c) \ - _dn_strrchr((uint8_t *)(_p),(uint8_t *)(_s), (uint8_t)(_c)) +#define dn_strrchr(_p, _s, _c) \ + _dn_strrchr((uint8_t *)(_p), (uint8_t *)(_s), (uint8_t)(_c)) -#define dn_strndup(_s, _n) \ - (uint8_t *)strndup((char *)(_s), (size_t)(_n)); +#define dn_strndup(_s, _n) (uint8_t *)strndup((char *)(_s), (size_t)(_n)); -#define dn_snprintf(_s, _n, ...) \ - snprintf((char *)(_s), (size_t)(_n), __VA_ARGS__) +#define dn_snprintf(_s, _n, ...) \ + snprintf((char *)(_s), (size_t)(_n), __VA_ARGS__) -#define dn_sprintf(_s, _f, ...) \ - sprintf((char *) (_s), _f, __VA_ARGS__) +#define dn_sprintf(_s, _f, ...) sprintf((char *)(_s), _f, __VA_ARGS__) -#define dn_scnprintf(_s, _n, ...) \ - _scnprintf((char *)(_s), (size_t)(_n), __VA_ARGS__) +#define dn_scnprintf(_s, _n, ...) 
\ + _scnprintf((char *)(_s), (size_t)(_n), __VA_ARGS__) -#define dn_vscnprintf(_s, _n, _f, _a) \ - _vscnprintf((char *)(_s), (size_t)(_n), _f, _a) +#define dn_vscnprintf(_s, _n, _f, _a) \ + _vscnprintf((char *)(_s), (size_t)(_n), _f, _a) -static inline uint8_t * -_dn_strchr(uint8_t *p, uint8_t *last, uint8_t c) -{ - while (p < last) { - if (*p == c) { - return p; - } - p++; +static inline uint8_t *_dn_strchr(uint8_t *p, uint8_t *last, uint8_t c) { + while (p < last) { + if (*p == c) { + return p; } + p++; + } - return NULL; + return NULL; } -static inline uint8_t * -_dn_strrchr(uint8_t *p, uint8_t *start, uint8_t c) -{ - while (p >= start) { - if (*p == c) { - return p; - } - p--; +static inline uint8_t *_dn_strrchr(uint8_t *p, uint8_t *start, uint8_t c) { + while (p >= start) { + if (*p == c) { + return p; } + p--; + } - return NULL; + return NULL; } #endif diff --git a/src/dyn_task.c b/src/dyn_task.c index 4873b8b4e..52e874fbb 100644 --- a/src/dyn_task.c +++ b/src/dyn_task.c @@ -1,10 +1,14 @@ #include "dyn_task.h" +#include + +#include "dyn_util.h" + /** * This is a generic task manager. There was a increasing demand in Dynomite to * create a module to schedule task a specific times. For example reconnecting - * a disconnected peer, or handling gossip messages, or timeouts etc. This is a module to - * implement such use cases. + * a disconnected peer, or handling gossip messages, or timeouts etc. This is a + * module to implement such use cases. * * Sequence of calls: * @@ -12,111 +16,90 @@ * schedule_task() * execute_expired_tasks() * - * The time complexity for insert/delete in the red black tree is O(log N), where - * N is the number of elements in the tree. + * The time complexity for insert/delete in the red black tree is O(log N), + * where N is the number of elements in the tree. 
* */ - - struct rbtree task_rbt; /* rbtree which holds the tasks */ struct rbnode task_rbs; /* rbtree sentinel */ // Individual task struct task { - struct rbnode rbnode; /* always be the first field */ - task_handler_1 handler; - void *arg1; + struct rbnode rbnode; /* always be the first field */ + task_handler_1 handler; + void *arg1; }; -rstatus_t -task_mgr_init() -{ - rbtree_init(&task_rbt, &task_rbs); - return DN_OK; +rstatus_t task_mgr_init() { + rbtree_init(&task_rbt, &task_rbs); + return DN_OK; } -static struct task * -_create_task(void) -{ - struct task *task = dn_alloc(sizeof(struct task)); - if (!task) - return NULL; - memset(task, 0, sizeof(struct task)); - return task; +static struct task *_create_task(void) { + struct task *task = dn_alloc(sizeof(struct task)); + if (!task) return NULL; + memset(task, 0, sizeof(struct task)); + return task; } -struct task * -schedule_task_1(task_handler_1 handler1, void *arg1, msec_t timeout) -{ - struct task *task = _create_task(); - task->handler = handler1; - task->arg1 = arg1; - - msec_t now_ms = dn_msec_now(); - - struct rbnode *rbnode = (struct rbnode *)task; - rbnode->timeout = timeout; - rbnode->key = now_ms + timeout; - rbnode->data = task; - rbtree_insert(&task_rbt, rbnode); - return task; -} +struct task *schedule_task_1(task_handler_1 handler1, void *arg1, + msec_t timeout) { + struct task *task = _create_task(); + task->handler = handler1; + task->arg1 = arg1; + msec_t now_ms = dn_msec_now(); -msec_t -time_to_next_task(void) -{ - struct rbnode *rbnode = rbtree_min(&task_rbt); - if (!rbnode) - return UINT64_MAX; - msec_t now_ms = dn_msec_now(); - msec_t fire_at_ms = rbnode->key; - if (now_ms > fire_at_ms) - return 0; - return fire_at_ms - now_ms; + struct rbnode *rbnode = (struct rbnode *)task; + rbnode->timeout = timeout; + rbnode->key = now_ms + timeout; + rbnode->data = task; + rbtree_insert(&task_rbt, rbnode); + return task; } -static bool -task_expired(struct task *task) -{ - msec_t now_ms = 
dn_msec_now(); - msec_t fire_at_ms = task->rbnode.key; +msec_t time_to_next_task(void) { + struct rbnode *rbnode = rbtree_min(&task_rbt); + if (!rbnode) return UINT64_MAX; + msec_t now_ms = dn_msec_now(); + msec_t fire_at_ms = rbnode->key; + if (now_ms > fire_at_ms) return 0; + return fire_at_ms - now_ms; +} + +static bool task_expired(struct task *task) { + msec_t now_ms = dn_msec_now(); + msec_t fire_at_ms = task->rbnode.key; - if (now_ms > fire_at_ms) - return true; - return false; + if (now_ms > fire_at_ms) return true; + return false; } -void -execute_expired_tasks(uint32_t limit) -{ - uint32_t executed = 0; - for (;;) { - struct rbnode *rbnode = rbtree_min(&task_rbt); - if (!rbnode) { - return; - } - - struct task *task = rbnode->data; - - if (task_expired(task)) { - rbtree_delete(&task_rbt, rbnode); - task->handler(task->arg1); - dn_free(task); - executed++; - if (!limit && executed == limit) - return; - continue; - } - break; +void execute_expired_tasks(uint32_t limit) { + uint32_t executed = 0; + for (;;) { + struct rbnode *rbnode = rbtree_min(&task_rbt); + if (!rbnode) { + return; + } + + struct task *task = rbnode->data; + + if (task_expired(task)) { + rbtree_delete(&task_rbt, rbnode); + task->handler(task->arg1); + dn_free(task); + executed++; + if (!limit && executed == limit) return; + continue; } + break; + } } -void -cancel_task(struct task *task) -{ - struct rbnode *rbnode = (struct rbnode *)task; - rbtree_delete(&task_rbt, rbnode); - dn_free(task); +void cancel_task(struct task *task) { + struct rbnode *rbnode = (struct rbnode *)task; + rbtree_delete(&task_rbt, rbnode); + dn_free(task); } diff --git a/src/dyn_task.h b/src/dyn_task.h index 935652f2b..848df9073 100644 --- a/src/dyn_task.h +++ b/src/dyn_task.h @@ -1,6 +1,8 @@ +#ifndef _DYN_TASK_H_ +#define _DYN_TASK_H_ + #include "dyn_rbtree.h" #include "dyn_types.h" -#include "dyn_core.h" struct task; @@ -15,7 +17,8 @@ rstatus_t task_mgr_init(void); * arg1 : the argument that will be sent back to 
handler1 * timeout : time in msec after which this task should get fired */ -struct task *schedule_task_1(task_handler_1 handler1, void *arg1, msec_t timeout); +struct task *schedule_task_1(task_handler_1 handler1, void *arg1, + msec_t timeout); /* Returns the time in msec to the next task */ msec_t time_to_next_task(void); @@ -29,3 +32,5 @@ void execute_expired_tasks(uint32_t limit); /* Cancel the provided task. The caller should keep track of the tasks scheduled * and use it to cancel */ void cancel_task(struct task *task); + +#endif /* _DYN_TASK_H_ */ diff --git a/src/dyn_test.c b/src/dyn_test.c index cc6757c7e..24e6b460a 100644 --- a/src/dyn_test.c +++ b/src/dyn_test.c @@ -1,454 +1,430 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2015 Netflix, Inc. - */ - + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2015 Netflix, Inc. + */ +#include +#include +#include #include #include -#include -#include -#include -#include #include #include +#include -#include "dyn_core.h" #include "dyn_conf.h" -#include "dyn_signal.h" +#include "dyn_core.h" #include "dyn_dnode_peer.h" +#include "dyn_signal.h" -#define TEST_CONF_PATH "conf/dynomite.yml" +#define TEST_CONF_PATH "conf/dynomite.yml" -#define TEST_LOG_DEFAULT LOG_NOTICE -#define TEST_LOG_PATH NULL +#define TEST_LOG_DEFAULT LOG_NOTICE +#define TEST_LOG_PATH NULL -#define TEST_MBUF_SIZE 16384 -#define TEST_ALLOC_MSGS_MAX 300000 +#define TEST_MBUF_SIZE 16384 +#define TEST_ALLOC_MSGS_MAX 300000 static int show_help; static int test_conf; - -static char *data = "$2014$ 1 3 0 1 1 *1 d *0\r\n*3\r\n$3\r\nset\r\n$4\r\nfoo1\r\n$4\r\nbar1\r\n" - "$2014$ 2 3 0 1 1 *1 d 
*0\r\n*3\r\n$3\r\nset\r\n$4\r\nfoo2\r\n$413\r\nbar01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567892222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222\r\n" - "$2014$ 3 3 0 1 1 *1 d *0\r\n*3\r\n$3\r\nset\r\n$4\r\nfoo3\r\n$4\r\nbar3\r\n"; +static char *data = + "$2014$ 1 3 0 1 1 *1 d " + "*0\r\n*3\r\n$3\r\nset\r\n$4\r\nfoo1\r\n$4\r\nbar1\r\n" + "$2014$ 2 3 0 1 1 *1 d " + "*0\r\n*3\r\n$3\r\nset\r\n$4\r\nfoo2\r\n$" + "413\r\nbar0123456789012345678901234567890123456789012345678901234567890123" + "45678901234567890123456789012345678922222222222222222222222222222222222222" + "22222222222222222222222222222222222222222222222222222222222222222222222222" + "22222222222222222222222222222222222222222222222222222222222222222222222222" + "22222222222222222222222222222222222222222222222222222222222222222222222222" + "22222222222222222222222222222222222222222222222222\r\n" + "$2014$ 3 3 0 1 1 *1 d " + "*0\r\n*3\r\n$3\r\nset\r\n$4\r\nfoo3\r\n$4\r\nbar3\r\n"; static size_t position = 0; static struct option long_options[] = { - { "help", no_argument, NULL, 'h' }, - { "version", no_argument, NULL, 'V' }, - { "test-conf", no_argument, NULL, 't' }, - { "describe-stats", no_argument, NULL, 'D' }, - { "verbose", required_argument, NULL, 'v' }, - { "output", required_argument, NULL, 'o' }, - { "conf-file", required_argument, NULL, 'c' }, - { "pid-file", required_argument, NULL, 'p' }, - { NULL, 0, NULL, 0 } -}; + {"help", no_argument, NULL, 'h'}, + {"version", no_argument, NULL, 'V'}, + {"test-conf", no_argument, NULL, 't'}, + {"describe-stats", no_argument, NULL, 'D'}, + {"verbose", required_argument, NULL, 'v'}, + {"output", required_argument, NULL, 'o'}, + {"conf-file", 
required_argument, NULL, 'c'}, + {"pid-file", required_argument, NULL, 'p'}, + {NULL, 0, NULL, 0}}; static char short_options[] = "hVtDgv:o:c:s:i:a:p"; - -static void -dn_show_usage(void) -{ - log_stderr( - "Usage: test [-?hVdDt] [-v verbosity level] [-o output file]" CRLF - " [-c conf file] [-m mbuf size] [-M max alloc messages]" CRLF - ""); - log_stderr( - "Options:" CRLF - " -h, --help : this help" CRLF - " -V, --version : show version and exit" CRLF - " -t, --test-conf : test configuration for syntax errors and exit"); - log_stderr( - " -v, --verbosity=N : set logging level (default: %d, min: %d, max: %d)" CRLF - " -o, --output=S : set logging file (default: %s)" CRLF - " -c, --conf-file=S : set configuration file (default: %s)" CRLF - "", - TEST_LOG_DEFAULT, TEST_LOG_DEFAULT, TEST_LOG_DEFAULT, - TEST_LOG_PATH != NULL ? TEST_LOG_PATH : "stderr", - TEST_CONF_PATH); +static void dn_show_usage(void) { + log_stderr("Usage: test [-?hVdDt] [-v verbosity level] [-o output file]" CRLF + " [-c conf file] [-m mbuf size] [-M max alloc " + "messages]" CRLF ""); + log_stderr("Options:" CRLF " -h, --help : this help" CRLF + " -V, --version : show version and exit" CRLF + " -t, --test-conf : test configuration for syntax errors " + "and exit"); + log_stderr( + " -v, --verbosity=N : set logging level (default: %d, min: %d, " + "max: %d)" CRLF + " -o, --output=S : set logging file (default: %s)" CRLF + " -c, --conf-file=S : set configuration file (default: %s)" CRLF "", + TEST_LOG_DEFAULT, TEST_LOG_DEFAULT, TEST_LOG_DEFAULT, + TEST_LOG_PATH != NULL ? 
TEST_LOG_PATH : "stderr", TEST_CONF_PATH); } +static rstatus_t test_pre_run(struct instance *nci) { + rstatus_t status; -static rstatus_t -test_pre_run(struct instance *nci) -{ - rstatus_t status; + status = log_init(nci->log_level, nci->log_filename); + if (status != DN_OK) { + return status; + } - status = log_init(nci->log_level, nci->log_filename); - if (status != DN_OK) { - return status; - } - - status = signal_init(); - if (status != DN_OK) { - return status; - } + status = signal_init(); + if (status != DN_OK) { + return status; + } - return DN_OK; + return DN_OK; } +static void test_set_default_options(struct instance *nci) { + int status; -static void -test_set_default_options(struct instance *nci) -{ - int status; - - nci->ctx = NULL; - nci->log_level = TEST_LOG_DEFAULT; - nci->log_filename = TEST_LOG_PATH; - nci->conf_filename = TEST_CONF_PATH; - - status = dn_gethostname(nci->hostname, DN_MAXHOSTNAMELEN); - if (status < 0) { - log_warn("gethostname failed, ignored: %s", strerror(errno)); - dn_snprintf(nci->hostname, DN_MAXHOSTNAMELEN, "unknown"); - } + nci->ctx = NULL; + nci->log_level = TEST_LOG_DEFAULT; + nci->log_filename = TEST_LOG_PATH; + nci->conf_filename = TEST_CONF_PATH; - nci->hostname[DN_MAXHOSTNAMELEN - 1] = '\0'; + status = dn_gethostname(nci->hostname, DN_MAXHOSTNAMELEN); + if (status < 0) { + log_warn("gethostname failed, ignored: %s", strerror(errno)); + dn_snprintf(nci->hostname, DN_MAXHOSTNAMELEN, "unknown"); + } + nci->hostname[DN_MAXHOSTNAMELEN - 1] = '\0'; } -static rstatus_t -test_get_options(int argc, char **argv, struct instance *nci) -{ - int c, value; +static rstatus_t test_get_options(int argc, char **argv, struct instance *nci) { + int c, value; - opterr = 0; + opterr = 0; - for (;;) { - c = getopt_long(argc, argv, short_options, long_options, NULL); - if (c == -1) { - /* no more options */ - break; - } - - switch (c) { - case 'h': - show_help = 1; - break; + for (;;) { + c = getopt_long(argc, argv, short_options, 
long_options, NULL); + if (c == -1) { + /* no more options */ + break; + } - case 't': - test_conf = 1; - nci->log_level = 11; + switch (c) { + case 'h': + show_help = 1; + break; + + case 't': + test_conf = 1; + nci->log_level = 11; + break; + + case 'v': + value = dn_atoi(optarg, strlen(optarg)); + if (value < 0) { + log_stderr("test: option -v requires a number"); + return DN_ERROR; + } + nci->log_level = value; + break; + + case 'o': + nci->log_filename = optarg; + break; + + case 'c': + nci->conf_filename = optarg; + break; + + case '?': + switch (optopt) { + case 'o': + case 'c': + case 'p': + log_stderr("test: option -%c requires a file name", optopt); break; - case 'v': - value = dn_atoi(optarg, strlen(optarg)); - if (value < 0) { - log_stderr("test: option -v requires a number"); - return DN_ERROR; - } - nci->log_level = value; + case 'v': + case 's': + case 'i': + log_stderr("test: option -%c requires a number", optopt); break; - case 'o': - nci->log_filename = optarg; + case 'a': + log_stderr("test: option -%c requires a string", optopt); break; - case 'c': - nci->conf_filename = optarg; + default: + log_stderr("test: invalid option -- '%c'", optopt); break; - - case '?': - switch (optopt) { - case 'o': - case 'c': - case 'p': - log_stderr("test: option -%c requires a file name", - optopt); - break; - - case 'v': - case 's': - case 'i': - log_stderr("test: option -%c requires a number", optopt); - break; - - case 'a': - log_stderr("test: option -%c requires a string", optopt); - break; - - default: - log_stderr("test: invalid option -- '%c'", optopt); - break; - } - return DN_ERROR; - - default: - log_stderr("dynomite: invalid option -- '%c'", optopt); - return DN_ERROR; - } + return DN_ERROR; + + default: + log_stderr("dynomite: invalid option -- '%c'", optopt); + return DN_ERROR; } + } - return DN_OK; + return DN_OK; } -static void -print_banner(const char*s) -{ - loga("==================================================================="); - loga(" 
Running Test: %s", s); - loga("==================================================================="); +static void print_banner(const char *s) { + loga("==================================================================="); + loga(" Running Test: %s", s); + loga("==================================================================="); } -static rstatus_t -init_peer(struct node *s) -{ - s->idx = 0; - s->owner = NULL; +static rstatus_t init_peer(struct node *s) { + s->idx = 0; + s->owner = NULL; - struct string pname = string("127.0.0.1:8102"); - string_copy(&s->endpoint.pname, pname.data, pname.len); + struct string pname = string("127.0.0.1:8102"); + string_copy(&s->endpoint.pname, pname.data, pname.len); - struct string name = string("127.0.0.1"); - string_copy(&s->name, name.data, name.len); + struct string name = string("127.0.0.1"); + string_copy(&s->name, name.data, name.len); - s->state = UNKNOWN; + s->state = UNKNOWN; - s->endpoint.port = (uint16_t)8102; + s->endpoint.port = (uint16_t)8102; - struct string rack = string("rack1"); - string_copy(&s->rack, rack.data, rack.len); + struct string rack = string("rack1"); + string_copy(&s->rack, rack.data, rack.len); - struct string dc = string("dc1"); - string_copy(&s->dc, dc.data, dc.len); + struct string dc = string("dc1"); + string_copy(&s->dc, dc.data, dc.len); - s->is_local = false; - //TODO-need to init tokens - //s->tokens = cseed->tokens; + s->is_local = false; + // TODO-need to init tokens + // s->tokens = cseed->tokens; - struct sockinfo *info = malloc(sizeof(struct sockinfo)); + struct sockinfo *info = malloc(sizeof(struct sockinfo)); - memset(info, 0, sizeof(*info)); - dn_resolve(&name, s->endpoint.port, info); + memset(info, 0, sizeof(*info)); + dn_resolve(&name, s->endpoint.port, info); - s->endpoint.family = info->family; - s->endpoint.addrlen = info->addrlen; - s->endpoint.addr = (struct sockaddr *)&info->addr; + s->endpoint.family = info->family; + s->endpoint.addrlen = info->addrlen; + 
s->endpoint.addr = (struct sockaddr *)&info->addr; - s->next_retry_ms = 0ULL; - s->failure_count = 0; + s->next_retry_ms = 0ULL; + s->failure_count = 0; - s->processed = 0; - s->is_secure = 0; + s->processed = 0; + s->is_secure = 0; - log_debug(LOG_NOTICE, "Filling up server data"); + log_debug(LOG_NOTICE, "Filling up server data"); - return DN_OK; + return DN_OK; } +static size_t fill_buffer(struct mbuf *mbuf) { + loga("total data size: %d", dn_strlen(data)); + loga("mbuf size: %d", mbuf_size(mbuf)); + size_t data_size = dn_strlen(data) - position; -static size_t fill_buffer(struct mbuf *mbuf) -{ - loga("total data size: %d", dn_strlen(data)); - loga("mbuf size: %d", mbuf_size(mbuf)); - size_t data_size = dn_strlen(data) - position; + loga("data left-over size: %d", data_size); + if (data_size <= 0) { + return 0; + } - loga("data left-over size: %d", data_size); - if (data_size <= 0) { - return 0; - } + size_t min_len = data_size > mbuf_size(mbuf) ? mbuf_size(mbuf) : data_size; + mbuf_copy(mbuf, &data[position], min_len); + position += min_len; - size_t min_len = data_size > mbuf_size(mbuf) ? 
mbuf_size(mbuf) : data_size; - mbuf_copy(mbuf, &data[position], min_len); - position += min_len; - - return min_len; + return min_len; } -static rstatus_t -test_msg_recv_chain(struct conn *conn, struct msg *msg) -{ - struct msg *nmsg; - struct mbuf *mbuf, *nbuf; - print_banner("MSG_RECV_CHAIN"); - - mbuf = STAILQ_LAST(&msg->mhdr, mbuf, next); - - mbuf = mbuf_get(); - mbuf_insert(&msg->mhdr, mbuf); - msg->pos = mbuf->pos; - - ASSERT(mbuf->end - mbuf->last > 0); +static rstatus_t test_msg_recv_chain(struct conn *conn, struct msg *msg) { + struct msg *nmsg; + struct mbuf *mbuf, *nbuf; + print_banner("MSG_RECV_CHAIN"); + mbuf = STAILQ_LAST(&msg->mhdr, mbuf, next); - uint32_t data_n = (uint32_t)fill_buffer(mbuf); - msg->mlen += data_n; + mbuf = mbuf_get(); + mbuf_insert(&msg->mhdr, mbuf); + msg->pos = mbuf->pos; + ASSERT(mbuf->end - mbuf->last > 0); - loga("msg->mlen = %d", + msg->mlen); - loga("mbuf_length = %d", mbuf_length(mbuf)); + uint32_t data_n = (uint32_t)fill_buffer(mbuf); + msg->mlen += data_n; + loga("msg->mlen = %d", +msg->mlen); + loga("mbuf_length = %d", mbuf_length(mbuf)); - bool is_done = false; - struct string hash_tag; - string_init(&hash_tag); + bool is_done = false; + struct string hash_tag; + string_init(&hash_tag); - for(;!is_done;) { - msg->parser(msg, &hash_tag); + for (; !is_done;) { + msg->parser(msg, &hash_tag); - switch (msg->result) { - case MSG_PARSE_OK: - log_debug(LOG_VVERB, "Parsing MSG_PARSE_OK"); - if (msg->pos == mbuf->last) { - log_debug(LOG_VVERB, "Parsing MSG_PARSE_OK - done - no more data to parse!"); - is_done = true; - } + switch (msg->result) { + case MSG_PARSE_OK: + log_debug(LOG_VVERB, "Parsing MSG_PARSE_OK"); + if (msg->pos == mbuf->last) { + log_debug(LOG_VVERB, + "Parsing MSG_PARSE_OK - done - no more data to parse!"); + is_done = true; + } - nbuf = mbuf_split(&msg->mhdr, msg->pos, NULL, NULL); - if (nbuf == NULL) { - log_debug(LOG_VVERB, "Parsing MSG_PARSE_OK - more data but can't split!"); - } + nbuf = 
mbuf_split(&msg->mhdr, msg->pos, NULL, NULL); + if (nbuf == NULL) { + log_debug(LOG_VVERB, + "Parsing MSG_PARSE_OK - more data but can't split!"); + } - nmsg = msg_get(msg->owner, msg->is_request, __FUNCTION__); - mbuf_insert(&nmsg->mhdr, nbuf); - nmsg->pos = nbuf->pos; + nmsg = msg_get(msg->owner, msg->is_request, __FUNCTION__); + mbuf_insert(&nmsg->mhdr, nbuf); + nmsg->pos = nbuf->pos; - /* update length of current (msg) and new message (nmsg) */ - nmsg->mlen = mbuf_length(nbuf); - msg->mlen -= nmsg->mlen; + /* update length of current (msg) and new message (nmsg) */ + nmsg->mlen = mbuf_length(nbuf); + msg->mlen -= nmsg->mlen; - data_n = (uint32_t)fill_buffer(nbuf); - nmsg->mlen += data_n; + data_n = (uint32_t)fill_buffer(nbuf); + nmsg->mlen += data_n; - msg = nmsg; - mbuf = nbuf; + msg = nmsg; + mbuf = nbuf; - break; + break; - case MSG_PARSE_REPAIR: - //status = msg_repair(ctx, conn, msg); - log_debug(LOG_VVERB, "Parsing MSG_PARSE_REPAIR"); - msg = NULL; - break; + case MSG_PARSE_REPAIR: + // status = msg_repair(ctx, conn, msg); + log_debug(LOG_VVERB, "Parsing MSG_PARSE_REPAIR"); + msg = NULL; + break; - case MSG_PARSE_AGAIN: - log_debug(LOG_VVERB, "Parsing MSG_PARSE_AGAIN"); + case MSG_PARSE_AGAIN: + log_debug(LOG_VVERB, "Parsing MSG_PARSE_AGAIN"); - nbuf = mbuf_split(&msg->mhdr, msg->pos, NULL, NULL); - mbuf_insert(&msg->mhdr, nbuf); - msg->pos = nbuf->pos; - data_n = (uint32_t)fill_buffer(nbuf); - msg->mlen += data_n; - mbuf = nbuf; + nbuf = mbuf_split(&msg->mhdr, msg->pos, NULL, NULL); + mbuf_insert(&msg->mhdr, nbuf); + msg->pos = nbuf->pos; + data_n = (uint32_t)fill_buffer(nbuf); + msg->mlen += data_n; + mbuf = nbuf; - break; - - default: - log_debug(LOG_VVERB, "Parsing error in dyn_mode"); - msg = NULL; - break; - } + break; + default: + log_debug(LOG_VVERB, "Parsing error in dyn_mode"); + msg = NULL; + break; } + } - loga("Done parsing .........!"); - return DN_OK; + loga("Done parsing .........!"); + return DN_OK; } +static rstatus_t rsa_test(void) { + 
static unsigned char encrypted_buf[256]; + static unsigned char decrypted_buf[AES_KEYLEN + 1]; + static unsigned char *msg; -static rstatus_t -rsa_test(void) -{ - static unsigned char encrypted_buf[256]; - static unsigned char decrypted_buf[AES_KEYLEN + 1]; - static unsigned char *msg; - - print_banner("RSA"); - int i=0; - for(; i<3; i++) { - msg = generate_aes_key(); - - log_debug(LOG_VERB, "i = %d", i); - SCOPED_CHARPTR(encoded_aes_key) = base64_encode(msg, AES_KEYLEN); - log_debug(LOG_VERB, "AES key : %s \n", encoded_aes_key); + print_banner("RSA"); + int i = 0; + for (; i < 3; i++) { + msg = generate_aes_key(); + log_debug(LOG_VERB, "i = %d", i); + SCOPED_CHARPTR(encoded_aes_key) = base64_encode(msg, AES_KEYLEN); + log_debug(LOG_VERB, "AES key : %s \n", encoded_aes_key); - dyn_rsa_encrypt(msg, encrypted_buf); + dyn_rsa_encrypt(msg, encrypted_buf); - dyn_rsa_decrypt(encrypted_buf, decrypted_buf); + dyn_rsa_decrypt(encrypted_buf, decrypted_buf); - SCOPED_CHARPTR(encoded_decrypted_buf) = base64_encode(decrypted_buf, AES_KEYLEN); - log_debug(LOG_VERB, "Decrypted message : %s \n", encoded_decrypted_buf); - } + SCOPED_CHARPTR(encoded_decrypted_buf) = + base64_encode(decrypted_buf, AES_KEYLEN); + log_debug(LOG_VERB, "Decrypted message : %s \n", encoded_decrypted_buf); + } - return DN_OK; + return DN_OK; } -static void gen_random(unsigned char *s, const int len) -{ - static const unsigned char possible_data[] = - "0123456789" - "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz\r\n"; - int i; - for (i = 0; i < len; ++i) { - s[i] = possible_data[rand() % (sizeof(possible_data) - 1)]; - } - - s[len] = 0; +static void gen_random(unsigned char *s, const int len) { + static const unsigned char possible_data[] = + "0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz\r\n"; + int i; + for (i = 0; i < len; ++i) { + s[i] = possible_data[rand() % (sizeof(possible_data) - 1)]; + } + + s[len] = 0; } #define MAX_MSG_LEN 512 -static rstatus_t 
-aes_test(void) -{ - unsigned char msg[MAX_MSG_LEN+1]; - print_banner("AES"); - unsigned char* aes_key = generate_aes_key(); - SCOPED_CHARPTR(aes_key_print) = base64_encode(aes_key, AES_KEYLEN); - loga("aesKey is '%s'", aes_key_print); - - size_t i=0; - size_t count = 10000000; - loga("Running %lu encryption/decryption messages", count); - for(;idyn_mode = 1; - conn->sd = 0; - conn->ops = &peer_ops; +static void init_peer_conn(struct conn *conn) { + conn->dyn_mode = 1; + conn->sd = 0; + conn->ops = &peer_ops; } /* Inspection test */ -static rstatus_t -aes_msg_test(struct node *server) -{ - print_banner("AES MSG"); - //unsigned char* aes_key = generate_aes_key(); - struct conn *conn = conn_get(server, init_peer_conn); - struct msg *msg = msg_get(conn, true, __FUNCTION__); +static rstatus_t aes_msg_test(struct node *server) { + print_banner("AES MSG"); + // unsigned char* aes_key = generate_aes_key(); + struct conn *conn = conn_get(server, init_peer_conn); + struct msg *msg = msg_get(conn, true, __FUNCTION__); - struct mbuf *mbuf1 = mbuf_get(); - struct string s1 = string("abc"); - mbuf_write_string(mbuf1, &s1); - STAILQ_INSERT_HEAD(&msg->mhdr, mbuf1, next); + struct mbuf *mbuf1 = mbuf_get(); + struct string s1 = string("abc"); + mbuf_write_string(mbuf1, &s1); + STAILQ_INSERT_HEAD(&msg->mhdr, mbuf1, next); - struct mbuf *mbuf2 = mbuf_get(); - struct string s2 = string("abcabc"); - mbuf_write_string(mbuf2, &s2); - STAILQ_INSERT_TAIL(&msg->mhdr, mbuf2, next); + struct mbuf *mbuf2 = mbuf_get(); + struct string s2 = string("abcabc"); + mbuf_write_string(mbuf2, &s2); + STAILQ_INSERT_TAIL(&msg->mhdr, mbuf2, next); - /* - loga("dumping the content of the original msg: "); - msg_dump(msg); + /* + loga("dumping the content of the original msg: "); + msg_dump(msg); - dyn_aes_encrypt_msg(msg, aes_key); + dyn_aes_encrypt_msg(msg, aes_key); - loga("dumping the content of encrypted msg"); - msg_dump(msg); + loga("dumping the content of encrypted msg"); + msg_dump(msg); - 
dyn_aes_decrypt_msg(msg, aes_key); + dyn_aes_decrypt_msg(msg, aes_key); - loga("dumping the content of decrytped msg"); - msg_dump(msg); - */ + loga("dumping the content of decrytped msg"); + msg_dump(msg); + */ - return DN_OK; + return DN_OK; } /* @@ -625,8 +577,8 @@ aes_msg_test2(struct node *server) STAILQ_INSERT_HEAD(&msg->mhdr, mbuf1, next); struct mbuf *mbuf2 = mbuf_get(); - mbuf_write_bytes(mbuf2, data + mbuf_size(mbuf2), strlen(data) - mbuf_size(mbuf2)); - STAILQ_INSERT_TAIL(&msg->mhdr, mbuf2, next); + mbuf_write_bytes(mbuf2, data + mbuf_size(mbuf2), strlen(data) - +mbuf_size(mbuf2)); STAILQ_INSERT_TAIL(&msg->mhdr, mbuf2, next); loga("dumping the content of the original msg: "); msg_dump(msg); @@ -645,116 +597,106 @@ aes_msg_test2(struct node *server) } */ -static void -test_core_ctx_create(struct instance *nci) -{ - struct context *ctx; - - srand((unsigned) time(NULL)); - - ctx = dn_alloc(sizeof(*ctx)); - if (ctx == NULL) { - loga("Failed to create context!!!"); - } - nci->ctx = ctx; - ctx->instance = nci; - ctx->cf = NULL; - ctx->stats = NULL; - ctx->evb = NULL; - ctx->dyn_state = INIT; +static void test_core_ctx_create(struct instance *nci) { + struct context *ctx; + + srand((unsigned)time(NULL)); + + ctx = dn_alloc(sizeof(*ctx)); + if (ctx == NULL) { + loga("Failed to create context!!!"); + } + nci->ctx = ctx; + ctx->instance = nci; + ctx->cf = NULL; + ctx->stats = NULL; + ctx->evb = NULL; + ctx->dyn_state = INIT; } /** * This is very primitive */ -static void -test_server_pool(struct instance *nci) -{ - struct context *ctx = nci->ctx; - struct server_pool *sp = &ctx->pool; - sp->mbuf_size = TEST_MBUF_SIZE; - sp->alloc_msgs_max = TEST_ALLOC_MSGS_MAX; - char *filename = "conf/dynomite.pem"; - string_copy(&sp->pem_key_file, filename, strlen(filename)); - sp->secure_server_option = SECURE_OPTION_DC; - - mbuf_init(sp->mbuf_size); - msg_init(sp->alloc_msgs_max); +static void test_server_pool(struct instance *nci) { + struct context *ctx = nci->ctx; + struct 
server_pool *sp = &ctx->pool; + sp->mbuf_size = TEST_MBUF_SIZE; + sp->alloc_msgs_max = TEST_ALLOC_MSGS_MAX; + char *filename = "conf/dynomite.pem"; + string_copy(&sp->pem_key_file, filename, strlen(filename)); + sp->secure_server_option = SECURE_OPTION_DC; + + mbuf_init(sp->mbuf_size); + msg_init(sp->alloc_msgs_max); } -static void -init_test(int argc, char **argv) -{ - rstatus_t status; - struct instance nci; +static void init_test(int argc, char **argv) { + rstatus_t status; + struct instance nci; + test_set_default_options(&nci); - test_set_default_options(&nci); + status = test_get_options(argc, argv, &nci); + if (status != DN_OK) { + dn_show_usage(); + exit(1); + } - status = test_get_options(argc, argv, &nci); - if (status != DN_OK) { - dn_show_usage(); - exit(1); - } + test_pre_run(&nci); - test_pre_run(&nci); + test_core_ctx_create(&nci); + position = 0; + conn_init(); - test_core_ctx_create(&nci); - position = 0; - conn_init(); + test_server_pool(&nci); - test_server_pool(&nci); - - - crypto_init(&(nci.ctx->pool)); + crypto_init(&(nci.ctx->pool)); } -int -main(int argc, char **argv) -{ - //rstatus_t status; - init_test(argc, argv); - - struct node *peer = malloc(sizeof(struct node)); - memset(peer, 0, sizeof(struct node)); - init_peer(peer); - - struct conn *conn = conn_get(peer, init_peer_conn); - struct msg *msg = msg_get(conn, true, __FUNCTION__); - - //test payload larger than mbuf_size - rstatus_t ret = DN_OK; - ret = test_msg_recv_chain(conn, msg); - if (ret != DN_OK) { - loga("Error in testing msg_recv_chain!!!"); - goto err_out; - } - - ret = test_tailq(); - if (ret != DN_OK) { - loga("Error in testing msg_recv_chain!!!"); - goto err_out; - } - - //ret = rsa_test(); - if (ret != DN_OK) { - loga("Error in testing RSA !!!"); - goto err_out; - } - - ret = aes_test(); - if (ret != DN_OK) { - loga("Error in testing AES !!!"); - goto err_out; - } - - ret = aes_msg_test(peer); - if (ret != DN_OK) { - loga("Error in testing aes_msg_test !!!"); - goto 
err_out; - } - - loga("Testing is done!!!"); +int main(int argc, char **argv) { + // rstatus_t status; + init_test(argc, argv); + + struct node *peer = malloc(sizeof(struct node)); + memset(peer, 0, sizeof(struct node)); + init_peer(peer); + + struct conn *conn = conn_get(peer, init_peer_conn); + struct msg *msg = msg_get(conn, true, __FUNCTION__); + + // test payload larger than mbuf_size + rstatus_t ret = DN_OK; + ret = test_msg_recv_chain(conn, msg); + if (ret != DN_OK) { + loga("Error in testing msg_recv_chain!!!"); + goto err_out; + } + + ret = test_tailq(); + if (ret != DN_OK) { + loga("Error in testing msg_recv_chain!!!"); + goto err_out; + } + + // ret = rsa_test(); + if (ret != DN_OK) { + loga("Error in testing RSA !!!"); + goto err_out; + } + + ret = aes_test(); + if (ret != DN_OK) { + loga("Error in testing AES !!!"); + goto err_out; + } + + ret = aes_msg_test(peer); + if (ret != DN_OK) { + loga("Error in testing aes_msg_test !!!"); + goto err_out; + } + + loga("Testing is done!!!"); err_out: - return ret; + return ret; } diff --git a/src/dyn_types.c b/src/dyn_types.c index ace1c7155..f19ebb3d7 100644 --- a/src/dyn_types.c +++ b/src/dyn_types.c @@ -3,37 +3,33 @@ #define OBJECT_MAGIC 0xdead -void -cleanup_charptr(char **ptr) { - if (*ptr) - free(*ptr); +void cleanup_charptr(char **ptr) { + if (*ptr) free(*ptr); } -void -init_object(struct object *obj, object_type_t type, func_print_t print) -{ - obj->magic = OBJECT_MAGIC; - obj->type = type; - obj->func_print = print; +void init_object(struct object *obj, object_type_t type, func_print_t print) { + obj->magic = OBJECT_MAGIC; + obj->type = type; + obj->func_print = print; } -char* -print_obj(const void *ptr) -{ - const object_t *obj = (const object_t *)ptr; - static char buffer[PRINT_BUF_SIZE]; - if (obj == NULL) { - snprintf(buffer, PRINT_BUF_SIZE, ""); - return buffer; - } - if (obj->magic != OBJECT_MAGIC) { - snprintf(buffer, PRINT_BUF_SIZE, "addr:%p MAGIC NUMBER 0x%x", obj, obj->magic); - return 
buffer; - } - if ((obj->type >= 0) && (obj->type < OBJ_LAST)) { - return obj->func_print(obj); - } else { - snprintf(buffer, PRINT_BUF_SIZE, "addr:%p INVALID TYPE %d", obj, obj->type); - return buffer; - } +char *print_obj(const void *ptr) { + const object_t *obj = (const object_t *)ptr; + static char buffer[PRINT_BUF_SIZE]; + if (obj == NULL) { + snprintf(buffer, PRINT_BUF_SIZE, ""); + return buffer; + } + if (obj->magic != OBJECT_MAGIC) { + snprintf(buffer, PRINT_BUF_SIZE, "addr:%p MAGIC NUMBER 0x%x", + obj, obj->magic); + return buffer; + } + if ((obj->type >= 0) && (obj->type < OBJ_LAST)) { + return obj->func_print(obj); + } else { + snprintf(buffer, PRINT_BUF_SIZE, "addr:%p INVALID TYPE %d", + obj, obj->type); + return buffer; + } } diff --git a/src/dyn_types.h b/src/dyn_types.h index 4a92ef36b..0af87d746 100644 --- a/src/dyn_types.h +++ b/src/dyn_types.h @@ -1,17 +1,39 @@ #pragma once #include #include + +#define DN_NOOPS 1 +#define DN_OK 0 +#define DN_ERROR -1 +#define DN_EAGAIN -2 +#define DN_ENOMEM -3 +#define DN_ENO_IMPL -4 + +#define THROW_STATUS(s) \ + { \ + rstatus_t __ret = (s); \ + if (__ret != DN_OK) { \ + log_debug(LOG_WARN, "failed " #s); \ + return __ret; \ + } \ + } + +#define IGNORE_RET_VAL(x) x; + typedef uint64_t msgid_t; typedef uint64_t msec_t; typedef uint64_t usec_t; typedef uint64_t sec_t; +typedef int rstatus_t; /* return type */ +typedef int err_t; /* error type */ + typedef enum { - SECURE_OPTION_NONE, - SECURE_OPTION_RACK, - SECURE_OPTION_DC, - SECURE_OPTION_ALL, -}secure_server_option_t; + SECURE_OPTION_NONE, + SECURE_OPTION_RACK, + SECURE_OPTION_DC, + SECURE_OPTION_ALL, +} secure_server_option_t; struct array; struct string; @@ -36,30 +58,30 @@ struct dyn_ring; extern void cleanup_charptr(char **ptr); #define SCOPED_CHARPTR(var) \ - char * var __attribute__ ((__cleanup__(cleanup_charptr))) + char *var __attribute__((__cleanup__(cleanup_charptr))) typedef enum { - OBJ_REQ, - OBJ_RSP, - OBJ_CONN, - OBJ_CONN_POOL, - OBJ_POOL, - 
OBJ_DATASTORE, - OBJ_NODE, - OBJ_LAST -}object_type_t; + OBJ_REQ, + OBJ_RSP, + OBJ_CONN, + OBJ_CONN_POOL, + OBJ_POOL, + OBJ_DATASTORE, + OBJ_NODE, + OBJ_LAST +} object_type_t; #define PRINT_BUF_SIZE 255 struct object; -typedef char* (*func_print_t)(const struct object *obj); +typedef char *(*func_print_t)(const struct object *obj); typedef struct object { - uint16_t magic; - object_type_t type; - func_print_t func_print; - char print_buff[PRINT_BUF_SIZE]; -}object_t; + uint16_t magic; + object_type_t type; + func_print_t func_print; + char print_buff[PRINT_BUF_SIZE]; +} object_t; void init_object(object_t *obj, object_type_t type, func_print_t func_print); -char* print_obj(const void *ptr); +char *print_obj(const void *ptr); diff --git a/src/dyn_util.c b/src/dyn_util.c index 672c4b9de..bbb6ce900 100644 --- a/src/dyn_util.c +++ b/src/dyn_util.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -20,18 +20,18 @@ * limitations under the License. 
*/ +#include +#include +#include #include #include -#include #include #include -#include -#include +#include +#include #include #include -#include -#include #include #include @@ -39,51 +39,43 @@ #include "dyn_core.h" #ifdef DN_HAVE_BACKTRACE -# include +#include #endif -int -dn_set_blocking(int sd) -{ - int flags; +int dn_set_blocking(int sd) { + int flags; - flags = fcntl(sd, F_GETFL, 0); - if (flags < 0) { - return flags; - } + flags = fcntl(sd, F_GETFL, 0); + if (flags < 0) { + return flags; + } - return fcntl(sd, F_SETFL, flags & ~O_NONBLOCK); + return fcntl(sd, F_SETFL, flags & ~O_NONBLOCK); } -int -dn_set_nonblocking(int sd) -{ - int flags; +int dn_set_nonblocking(int sd) { + int flags; - flags = fcntl(sd, F_GETFL, 0); - if (flags < 0) { - return flags; - } + flags = fcntl(sd, F_GETFL, 0); + if (flags < 0) { + return flags; + } - return fcntl(sd, F_SETFL, flags | O_NONBLOCK); + return fcntl(sd, F_SETFL, flags | O_NONBLOCK); } -int -dn_set_reuseaddr(int sd) -{ - int reuse; - socklen_t len; +int dn_set_reuseaddr(int sd) { + int reuse; + socklen_t len; - reuse = 1; - len = sizeof(reuse); + reuse = 1; + len = sizeof(reuse); - return setsockopt(sd, SOL_SOCKET, SO_REUSEADDR, &reuse, len); + return setsockopt(sd, SOL_SOCKET, SO_REUSEADDR, &reuse, len); } -int -dn_set_keepalive(int sd, int val) -{ - return setsockopt(sd, SOL_SOCKET, SO_KEEPALIVE, &val, sizeof(val)); +int dn_set_keepalive(int sd, int val) { + return setsockopt(sd, SOL_SOCKET, SO_KEEPALIVE, &val, sizeof(val)); } /* @@ -94,477 +86,426 @@ dn_set_keepalive(int sd, int val) * option must use readv() or writev() to do data transfer in bulk and * hence avoid the overhead of small packets. 
*/ -int -dn_set_tcpnodelay(int sd) -{ - int nodelay; - socklen_t len; +int dn_set_tcpnodelay(int sd) { + int nodelay; + socklen_t len; - nodelay = 1; - len = sizeof(nodelay); + nodelay = 1; + len = sizeof(nodelay); - return setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, &nodelay, len); + return setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, &nodelay, len); } -int -dn_set_linger(int sd, int timeout) -{ - struct linger linger; - socklen_t len; +int dn_set_linger(int sd, int timeout) { + struct linger linger; + socklen_t len; - linger.l_onoff = 1; - linger.l_linger = timeout; + linger.l_onoff = 1; + linger.l_linger = timeout; - len = sizeof(linger); + len = sizeof(linger); - return setsockopt(sd, SOL_SOCKET, SO_LINGER, &linger, len); + return setsockopt(sd, SOL_SOCKET, SO_LINGER, &linger, len); } -int -dn_set_sndbuf(int sd, int size) -{ - socklen_t len; +int dn_set_sndbuf(int sd, int size) { + socklen_t len; - len = sizeof(size); + len = sizeof(size); - return setsockopt(sd, SOL_SOCKET, SO_SNDBUF, &size, len); + return setsockopt(sd, SOL_SOCKET, SO_SNDBUF, &size, len); } -int -dn_set_rcvbuf(int sd, int size) -{ - socklen_t len; +int dn_set_rcvbuf(int sd, int size) { + socklen_t len; - len = sizeof(size); + len = sizeof(size); - return setsockopt(sd, SOL_SOCKET, SO_RCVBUF, &size, len); + return setsockopt(sd, SOL_SOCKET, SO_RCVBUF, &size, len); } -int -dn_get_soerror(int sd) -{ - int status, err; - socklen_t len; +int dn_get_soerror(int sd) { + int status, err; + socklen_t len; - err = 0; - len = sizeof(err); + err = 0; + len = sizeof(err); - status = getsockopt(sd, SOL_SOCKET, SO_ERROR, &err, &len); - if (status == 0) { - errno = err; - } + status = getsockopt(sd, SOL_SOCKET, SO_ERROR, &err, &len); + if (status == 0) { + errno = err; + } - return status; + return status; } -int -dn_get_sndbuf(int sd) -{ - int status, size; - socklen_t len; +int dn_get_sndbuf(int sd) { + int status, size; + socklen_t len; - size = 0; - len = sizeof(size); + size = 0; + len = sizeof(size); - status 
= getsockopt(sd, SOL_SOCKET, SO_SNDBUF, &size, &len); - if (status < 0) { - return status; - } + status = getsockopt(sd, SOL_SOCKET, SO_SNDBUF, &size, &len); + if (status < 0) { + return status; + } - return size; + return size; } -int -dn_get_rcvbuf(int sd) -{ - int status, size; - socklen_t len; +int dn_get_rcvbuf(int sd) { + int status, size; + socklen_t len; - size = 0; - len = sizeof(size); + size = 0; + len = sizeof(size); - status = getsockopt(sd, SOL_SOCKET, SO_RCVBUF, &size, &len); - if (status < 0) { - return status; - } + status = getsockopt(sd, SOL_SOCKET, SO_RCVBUF, &size, &len); + if (status < 0) { + return status; + } - return size; + return size; } -int -_dn_atoi(uint8_t *line, size_t n) -{ - int value; +int _dn_atoi(uint8_t *line, size_t n) { + int value; - if (n == 0) { - return -1; - } + if (n == 0) { + return -1; + } - for (value = 0; n--; line++) { - if (*line < '0' || *line > '9') { - return -1; - } - - value = value * 10 + (*line - '0'); + for (value = 0; n--; line++) { + if (*line < '0' || *line > '9') { + return -1; } - if (value < 0) { - return -1; - } + value = value * 10 + (*line - '0'); + } - return value; -} + if (value < 0) { + return -1; + } -uint32_t -_dn_atoui(uint8_t *line, size_t n) -{ - uint32_t value; + return value; +} - if (n == 0) { - return 0; - } +uint32_t _dn_atoui(uint8_t *line, size_t n) { + uint32_t value; - for (value = 0; n--; line++) { - if (*line < '0' || *line > '9') { - return 0; - } + if (n == 0) { + return 0; + } - value = value * 10 + (uint32_t)(*line - '0'); + for (value = 0; n--; line++) { + if (*line < '0' || *line > '9') { + return 0; } - return value; + value = value * 10 + (uint32_t)(*line - '0'); + } + + return value; } -bool -dn_valid_port(int n) -{ - if (n < 1 || n > UINT16_MAX) { - return false; - } +bool dn_valid_port(int n) { + if (n < 1 || n > UINT16_MAX) { + return false; + } - return true; + return true; } -void * -_dn_alloc(size_t size, const char *name, int line) -{ - void *p; +void 
*_dn_alloc(size_t size, const char *name, int line) { + void *p; - ASSERT(size != 0); + ASSERT(size != 0); - p = malloc(size); - if (p == NULL) { - log_error("malloc(%zu) failed @ %s:%d", size, name, line); - } else { - log_debug(LOG_VVERB, "malloc(%zu) at %p @ %s:%d", size, p, name, line); - } + p = malloc(size); + if (p == NULL) { + log_error("malloc(%zu) failed @ %s:%d", size, name, line); + } else { + log_debug(LOG_VVERB, "malloc(%zu) at %p @ %s:%d", size, p, name, line); + } - return p; + return p; } -void * -_dn_zalloc(size_t size, const char *name, int line) -{ - void *p; +void *_dn_zalloc(size_t size, const char *name, int line) { + void *p; - p = _dn_alloc(size, name, line); - if (p != NULL) { - memset(p, 0, size); - } + p = _dn_alloc(size, name, line); + if (p != NULL) { + memset(p, 0, size); + } - return p; + return p; } -void * -_dn_calloc(size_t nmemb, size_t size, const char *name, int line) -{ - return _dn_zalloc(nmemb * size, name, line); +void *_dn_calloc(size_t nmemb, size_t size, const char *name, int line) { + return _dn_zalloc(nmemb * size, name, line); } -void * -_dn_realloc(void *ptr, size_t size, const char *name, int line) -{ - void *p; +void *_dn_realloc(void *ptr, size_t size, const char *name, int line) { + void *p; - ASSERT(size != 0); + ASSERT(size != 0); - p = realloc(ptr, size); - if (p == NULL) { - log_error("realloc(%zu) failed @ %s:%d", size, name, line); - } else { - log_debug(LOG_VVERB, "realloc(%zu) at %p @ %s:%d", size, p, name, line); - } + p = realloc(ptr, size); + if (p == NULL) { + log_error("realloc(%zu) failed @ %s:%d", size, name, line); + } else { + log_debug(LOG_VVERB, "realloc(%zu) at %p @ %s:%d", size, p, name, line); + } - return p; + return p; } -void -_dn_free(void *ptr, const char *name, int line) -{ - ASSERT(ptr != NULL); - log_debug(LOG_VVERB, "free(%p) @ %s:%d", ptr, name, line); - free(ptr); +void _dn_free(void *ptr, const char *name, int line) { + ASSERT(ptr != NULL); + log_debug(LOG_VVERB, "free(%p) @ 
%s:%d", ptr, name, line); + free(ptr); } -void -dn_stacktrace(int skip_count) -{ +void dn_stacktrace(int skip_count) { #ifdef DN_HAVE_BACKTRACE - void *stack[64]; - char **symbols; - int size, i, j; - - size = backtrace(stack, 64); - symbols = backtrace_symbols(stack, size); - if (symbols == NULL) { - return; - } - - skip_count++; /* skip the current frame also */ - - for (i = skip_count, j = 0; i < size; i++, j++) { - loga("[%d] %s", j, symbols[i]); - - char syscom[256]; - snprintf(syscom, sizeof(syscom), "addr2line %p -e /proc/%d/exe >&2", stack[i], getpid()); - if (system(syscom) < 0) { - loga("system command did not succeed to print filename"); - } + void *stack[64]; + char **symbols; + int size, i, j; + + size = backtrace(stack, 64); + symbols = backtrace_symbols(stack, size); + if (symbols == NULL) { + return; + } + + skip_count++; /* skip the current frame also */ + + for (i = skip_count, j = 0; i < size; i++, j++) { + loga("[%d] %s", j, symbols[i]); + + char syscom[256]; + snprintf(syscom, sizeof(syscom), "addr2line %p -e /proc/%d/exe >&2", + stack[i], getpid()); + if (system(syscom) < 0) { + loga("system command did not succeed to print filename"); } + } - free(symbols); + free(symbols); #endif } -void -dn_assert(const char *cond, const char *file, int line, int panic) -{ - log_error("assert '%s' failed @ (%s, %d)", cond, file, line); - if (panic) { - dn_stacktrace(1); - abort(); - } +void dn_assert(const char *cond, const char *file, int line, int panic) { + log_error("assert '%s' failed @ (%s, %d)", cond, file, line); + if (panic) { + dn_stacktrace(1); + abort(); + } } -int -_vscnprintf(char *buf, size_t size, const char *fmt, va_list args) -{ - int n; +int _vscnprintf(char *buf, size_t size, const char *fmt, va_list args) { + int n; - n = vsnprintf(buf, size, fmt, args); + n = vsnprintf(buf, size, fmt, args); - /* - * The return value is the number of characters which would be written - * into buf not including the trailing '\0'. 
If size is == 0 the - * function returns 0. - * - * On error, the function also returns 0. This is to allow idiom such - * as len += _vscnprintf(...) - * - * See: http://lwn.net/Articles/69419/ - */ - if (n <= 0) { - return 0; - } + /* + * The return value is the number of characters which would be written + * into buf not including the trailing '\0'. If size is == 0 the + * function returns 0. + * + * On error, the function also returns 0. This is to allow idiom such + * as len += _vscnprintf(...) + * + * See: http://lwn.net/Articles/69419/ + */ + if (n <= 0) { + return 0; + } - if (n < (int) size) { - return n; - } + if (n < (int)size) { + return n; + } - return (int)(size - 1); + return (int)(size - 1); } -int -_scnprintf(char *buf, size_t size, const char *fmt, ...) -{ - va_list args; - int n; +int _scnprintf(char *buf, size_t size, const char *fmt, ...) { + va_list args; + int n; - va_start(args, fmt); - n = _vscnprintf(buf, size, fmt, args); - va_end(args); + va_start(args, fmt); + n = _vscnprintf(buf, size, fmt, args); + va_end(args); - return n; + return n; } /* * Send n bytes on a blocking descriptor */ -ssize_t -_dn_sendn(int sd, const void *vptr, size_t n) -{ - size_t nleft; - ssize_t nsend; - const char *ptr; - - ptr = vptr; - nleft = n; - while (nleft > 0) { - nsend = send(sd, ptr, nleft, 0); - if (nsend < 0) { - if (errno == EINTR) { - continue; - } - return nsend; - } - if (nsend == 0) { - return -1; - } - - nleft -= (size_t)nsend; - ptr += nsend; +ssize_t _dn_sendn(int sd, const void *vptr, size_t n) { + size_t nleft; + ssize_t nsend; + const char *ptr; + + ptr = vptr; + nleft = n; + while (nleft > 0) { + nsend = send(sd, ptr, nleft, 0); + if (nsend < 0) { + if (errno == EINTR) { + continue; + } + return nsend; } + if (nsend == 0) { + return -1; + } + + nleft -= (size_t)nsend; + ptr += nsend; + } - return (ssize_t)n; + return (ssize_t)n; } /* * Recv n bytes from a blocking descriptor */ -ssize_t -_dn_recvn(int sd, void *vptr, size_t n) -{ - size_t 
nleft; - ssize_t nrecv; - char *ptr; - - ptr = vptr; - nleft = n; - while (nleft > 0) { - nrecv = recv(sd, ptr, nleft, 0); - if (nrecv < 0) { - if (errno == EINTR) { - continue; - } - return nrecv; - } - if (nrecv == 0) { - break; - } - - nleft -= (size_t)nrecv; - ptr += nrecv; +ssize_t _dn_recvn(int sd, void *vptr, size_t n) { + size_t nleft; + ssize_t nrecv; + char *ptr; + + ptr = vptr; + nleft = n; + while (nleft > 0) { + nrecv = recv(sd, ptr, nleft, 0); + if (nrecv < 0) { + if (errno == EINTR) { + continue; + } + return nrecv; } + if (nrecv == 0) { + break; + } + + nleft -= (size_t)nrecv; + ptr += nrecv; + } - return (ssize_t)(n - nleft); + return (ssize_t)(n - nleft); } /* * Return the current time in microseconds since Epoch */ -usec_t -dn_usec_now(void) -{ - struct timeval now; - uint64_t usec; - int status; - - status = gettimeofday(&now, NULL); - if (status < 0) { - log_error("gettimeofday failed: %s", strerror(errno)); - return 0; - } +usec_t dn_usec_now(void) { + struct timeval now; + uint64_t usec; + int status; + + status = gettimeofday(&now, NULL); + if (status < 0) { + log_error("gettimeofday failed: %s", strerror(errno)); + return 0; + } - usec = (uint64_t)now.tv_sec * 1000000ULL + (uint64_t)now.tv_usec; + usec = (uint64_t)now.tv_sec * 1000000ULL + (uint64_t)now.tv_usec; - return usec; + return usec; } /* * Return the current time in milliseconds since Epoch */ -msec_t -dn_msec_now(void) -{ - return dn_usec_now() / 1000ULL; -} - -static int -dn_resolve_inet(struct string *name, int port, struct sockinfo *si) -{ - int status; - struct addrinfo *ai, *cai; /* head and current addrinfo */ - struct addrinfo hints; - char *node, service[DN_UINTMAX_MAXLEN]; - bool found; - - ASSERT(dn_valid_port(port)); - - memset(&hints, 0, sizeof(hints)); - hints.ai_flags = AI_NUMERICSERV; - hints.ai_family = AF_UNSPEC; /* AF_INET or AF_INET6 */ - hints.ai_socktype = SOCK_STREAM; - hints.ai_protocol = 0; - hints.ai_addrlen = 0; - hints.ai_addr = NULL; - 
hints.ai_canonname = NULL; - - if (name != NULL) { - node = (char *)name->data; - } else { - /* - * If AI_PASSIVE flag is specified in hints.ai_flags, and node is - * NULL, then the returned socket addresses will be suitable for - * bind(2)ing a socket that will accept(2) connections. The returned - * socket address will contain the wildcard IP address. - */ - node = NULL; - hints.ai_flags |= AI_PASSIVE; - } +msec_t dn_msec_now(void) { return dn_usec_now() / 1000ULL; } + +static int dn_resolve_inet(struct string *name, int port, struct sockinfo *si) { + int status; + struct addrinfo *ai, *cai; /* head and current addrinfo */ + struct addrinfo hints; + char *node, service[DN_UINTMAX_MAXLEN]; + bool found; + + ASSERT(dn_valid_port(port)); + + memset(&hints, 0, sizeof(hints)); + hints.ai_flags = AI_NUMERICSERV; + hints.ai_family = AF_UNSPEC; /* AF_INET or AF_INET6 */ + hints.ai_socktype = SOCK_STREAM; + hints.ai_protocol = 0; + hints.ai_addrlen = 0; + hints.ai_addr = NULL; + hints.ai_canonname = NULL; + + if (name != NULL) { + node = (char *)name->data; + } else { + /* + * If AI_PASSIVE flag is specified in hints.ai_flags, and node is + * NULL, then the returned socket addresses will be suitable for + * bind(2)ing a socket that will accept(2) connections. The returned + * socket address will contain the wildcard IP address. 
+ */ + node = NULL; + hints.ai_flags |= AI_PASSIVE; + } - dn_snprintf(service, DN_UINTMAX_MAXLEN, "%d", port); + dn_snprintf(service, DN_UINTMAX_MAXLEN, "%d", port); - status = getaddrinfo(node, service, &hints, &ai); - if (status < 0) { - log_error("address resolution of node '%s' service '%s' failed: %s", - node, service, gai_strerror(status)); - return -1; - } + status = getaddrinfo(node, service, &hints, &ai); + if (status < 0) { + log_error("address resolution of node '%s' service '%s' failed: %s", node, + service, gai_strerror(status)); + return -1; + } - /* - * getaddrinfo() can return a linked list of more than one addrinfo, - * since we requested for both AF_INET and AF_INET6 addresses and the - * host itself can be multi-homed. Since we don't care whether we are - * using ipv4 or ipv6, we just use the first address from this collection - * in the order in which it was returned. - * - * The sorting function used within getaddrinfo() is defined in RFC 3484; - * the order can be tweaked for a particular system by editing - * /etc/gai.conf - */ - for (cai = ai, found = false; cai != NULL; cai = cai->ai_next) { - si->family = cai->ai_family; - si->addrlen = cai->ai_addrlen; - dn_memcpy(&si->addr, cai->ai_addr, si->addrlen); - found = true; - break; - } + /* + * getaddrinfo() can return a linked list of more than one addrinfo, + * since we requested for both AF_INET and AF_INET6 addresses and the + * host itself can be multi-homed. Since we don't care whether we are + * using ipv4 or ipv6, we just use the first address from this collection + * in the order in which it was returned. 
+ * + * The sorting function used within getaddrinfo() is defined in RFC 3484; + * the order can be tweaked for a particular system by editing + * /etc/gai.conf + */ + for (cai = ai, found = false; cai != NULL; cai = cai->ai_next) { + si->family = cai->ai_family; + si->addrlen = cai->ai_addrlen; + dn_memcpy(&si->addr, cai->ai_addr, si->addrlen); + found = true; + break; + } - freeaddrinfo(ai); + freeaddrinfo(ai); - return !found ? -1 : 0; + return !found ? -1 : 0; } -static int -dn_resolve_unix(struct string *name, struct sockinfo *si) -{ - struct sockaddr_un *un; +static int dn_resolve_unix(struct string *name, struct sockinfo *si) { + struct sockaddr_un *un; - if (name->len >= DN_UNIX_ADDRSTRLEN) { - return -1; - } + if (name->len >= DN_UNIX_ADDRSTRLEN) { + return -1; + } - un = &si->addr.un; + un = &si->addr.un; - un->sun_family = AF_UNIX; - dn_memcpy(un->sun_path, name->data, name->len); - un->sun_path[name->len] = '\0'; + un->sun_family = AF_UNIX; + dn_memcpy(un->sun_path, name->data, name->len); + un->sun_path[name->len] = '\0'; - si->family = AF_UNIX; - si->addrlen = sizeof(*un); - /* si->addr is an alias of un */ + si->family = AF_UNIX; + si->addrlen = sizeof(*un); + /* si->addr is an alias of un */ - return 0; + return 0; } /* @@ -573,14 +514,12 @@ dn_resolve_unix(struct string *name, struct sockinfo *si) * * This routine is reentrant */ -int -dn_resolve(struct string *name, int port, struct sockinfo *si) -{ - if (name != NULL && name->data[0] == '/') { - return dn_resolve_unix(name, si); - } +int dn_resolve(struct string *name, int port, struct sockinfo *si) { + if (name != NULL && name->data[0] == '/') { + return dn_resolve_unix(name, si); + } - return dn_resolve_inet(name, port, si); + return dn_resolve_inet(name, port, si); } /* @@ -589,23 +528,20 @@ dn_resolve(struct string *name, int port, struct sockinfo *si) * * This routine is not reentrant */ -char * -dn_unresolve_addr(struct sockaddr *addr, socklen_t addrlen) -{ - static char 
unresolve[NI_MAXHOST + NI_MAXSERV]; - static char host[NI_MAXHOST], service[NI_MAXSERV]; - int status; - - status = getnameinfo(addr, addrlen, host, sizeof(host), - service, sizeof(service), - NI_NUMERICHOST | NI_NUMERICSERV); - if (status < 0) { - return "unknown"; - } +char *dn_unresolve_addr(struct sockaddr *addr, socklen_t addrlen) { + static char unresolve[NI_MAXHOST + NI_MAXSERV]; + static char host[NI_MAXHOST], service[NI_MAXSERV]; + int status; - dn_snprintf(unresolve, sizeof(unresolve), "%s:%s", host, service); + status = getnameinfo(addr, addrlen, host, sizeof(host), service, + sizeof(service), NI_NUMERICHOST | NI_NUMERICSERV); + if (status < 0) { + return "unknown"; + } - return unresolve; + dn_snprintf(unresolve, sizeof(unresolve), "%s:%s", host, service); + + return unresolve; } /* @@ -614,24 +550,22 @@ dn_unresolve_addr(struct sockaddr *addr, socklen_t addrlen) * * This routine is not reentrant */ -char * -dn_unresolve_peer_desc(int sd) -{ - static struct sockinfo si; - struct sockaddr *addr; - socklen_t addrlen; - int status; - - memset(&si, 0, sizeof(si)); - addr = (struct sockaddr *)&si.addr; - addrlen = sizeof(si.addr); - - status = getpeername(sd, addr, &addrlen); - if (status < 0) { - return "unknown"; - } +char *dn_unresolve_peer_desc(int sd) { + static struct sockinfo si; + struct sockaddr *addr; + socklen_t addrlen; + int status; + + memset(&si, 0, sizeof(si)); + addr = (struct sockaddr *)&si.addr; + addrlen = sizeof(si.addr); + + status = getpeername(sd, addr, &addrlen); + if (status < 0) { + return "unknown"; + } - return dn_unresolve_addr(addr, addrlen); + return dn_unresolve_addr(addr, addrlen); } /* @@ -640,23 +574,20 @@ dn_unresolve_peer_desc(int sd) * * This routine is not reentrant */ -char * -dn_unresolve_desc(int sd) -{ - static struct sockinfo si; - struct sockaddr *addr; - socklen_t addrlen; - int status; - - memset(&si, 0, sizeof(si)); - addr = (struct sockaddr *)&si.addr; - addrlen = sizeof(si.addr); - - status = getsockname(sd, 
addr, &addrlen); - if (status < 0) { - return "unknown"; - } +char *dn_unresolve_desc(int sd) { + static struct sockinfo si; + struct sockaddr *addr; + socklen_t addrlen; + int status; - return dn_unresolve_addr(addr, addrlen); -} + memset(&si, 0, sizeof(si)); + addr = (struct sockaddr *)&si.addr; + addrlen = sizeof(si.addr); + status = getsockname(sd, addr, &addrlen); + if (status < 0) { + return "unknown"; + } + + return dn_unresolve_addr(addr, addrlen); +} diff --git a/src/dyn_util.h b/src/dyn_util.h index 112ac3da3..6ef06ff2a 100644 --- a/src/dyn_util.h +++ b/src/dyn_util.h @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -23,30 +23,34 @@ #ifndef _DYN_UTIL_H_ #define _DYN_UTIL_H_ +#include #include +#include +#include +#include -#define LF (uint8_t) 10 -#define CR (uint8_t) 13 -#define CRLF "\x0d\x0a" -#define CRLF_LEN (sizeof("\x0d\x0a") - 1) +#define LF (uint8_t)10 +#define CR (uint8_t)13 +#define CRLF "\x0d\x0a" +#define CRLF_LEN (sizeof("\x0d\x0a") - 1) -#define NELEMS(a) ((sizeof(a)) / sizeof((a)[0])) +#define NELEMS(a) ((sizeof(a)) / sizeof((a)[0])) -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) -#define SQUARE(d) ((d) * (d)) -#define VAR(s, s2, n) (((n) < 2) ? 0.0 : ((s2) - SQUARE(s)/(n)) / ((n) - 1)) -#define STDDEV(s, s2, n) (((n) < 2) ? 0.0 : sqrt(VAR((s), (s2), (n)))) +#define SQUARE(d) ((d) * (d)) +#define VAR(s, s2, n) (((n) < 2) ? 0.0 : ((s2)-SQUARE(s) / (n)) / ((n)-1)) +#define STDDEV(s, s2, n) (((n) < 2) ? 
0.0 : sqrt(VAR((s), (s2), (n)))) #define DN_INET4_ADDRSTRLEN (sizeof("255.255.255.255") - 1) #define DN_INET6_ADDRSTRLEN \ - (sizeof("ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255") - 1) -#define DN_INET_ADDRSTRLEN MAX(DN_INET4_ADDRSTRLEN, DN_INET6_ADDRSTRLEN) -#define DN_UNIX_ADDRSTRLEN \ - (sizeof(struct sockaddr_un) - offsetof(struct sockaddr_un, sun_path)) + (sizeof("ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255") - 1) +#define DN_INET_ADDRSTRLEN MAX(DN_INET4_ADDRSTRLEN, DN_INET6_ADDRSTRLEN) +#define DN_UNIX_ADDRSTRLEN \ + (sizeof(struct sockaddr_un) - offsetof(struct sockaddr_un, sun_path)) -#define DN_MAXHOSTNAMELEN 256 +#define DN_MAXHOSTNAMELEN 256 /* * Length of 1 byte, 2 bytes, 4 bytes, 8 bytes and largest integral @@ -58,32 +62,29 @@ * # define UINT32_MAX (4294967295U) * # define UINT64_MAX (__UINT64_C(18446744073709551615)) */ -#define DN_UINT8_MAXLEN (3 + 1) -#define DN_UINT16_MAXLEN (5 + 1) -#define DN_UINT32_MAXLEN (10 + 1) -#define DN_UINT64_MAXLEN (20 + 1) -#define DN_UINTMAX_MAXLEN DN_UINT64_MAXLEN +#define DN_UINT8_MAXLEN (3 + 1) +#define DN_UINT16_MAXLEN (5 + 1) +#define DN_UINT32_MAXLEN (10 + 1) +#define DN_UINT64_MAXLEN (20 + 1) +#define DN_UINTMAX_MAXLEN DN_UINT64_MAXLEN /* * Make data 'd' or pointer 'p', n-byte aligned, where n is a power of 2 * of 2. */ -#define DN_ALIGNMENT sizeof(unsigned long) /* platform word */ -#define DN_ALIGN(d, n) (((d) + (n - 1)) & ~(n - 1)) -#define DN_ALIGN_PTR(p, n) \ - (void *) (((uintptr_t) (p) + ((uintptr_t) n - 1)) & ~((uintptr_t) n - 1)) +#define DN_ALIGNMENT sizeof(unsigned long) /* platform word */ +#define DN_ALIGN(d, n) (((d) + (n - 1)) & ~(n - 1)) +#define DN_ALIGN_PTR(p, n) \ + (void *)(((uintptr_t)(p) + ((uintptr_t)n - 1)) & ~((uintptr_t)n - 1)) /* * Wrapper to workaround well known, safe, implicit type conversion when * invoking system calls. 
*/ -#define dn_gethostname(_name, _len) \ - gethostname((char *)_name, (size_t)_len) +#define dn_gethostname(_name, _len) gethostname((char *)_name, (size_t)_len) -#define dn_atoi(_line, _n) \ - _dn_atoi((uint8_t *)_line, (size_t)_n) -#define dn_atoui(_line, _n) \ - _dn_atoui((uint8_t *)_line, (size_t)_n) +#define dn_atoi(_line, _n) _dn_atoi((uint8_t *)_line, (size_t)_n) +#define dn_atoui(_line, _n) _dn_atoui((uint8_t *)_line, (size_t)_n) int dn_set_blocking(int sd); int dn_set_nonblocking(int sd); @@ -107,22 +108,20 @@ bool dn_valid_port(int n); * These wrappers enables us to loosely detect double free, dangling * pointer access and zero-byte alloc. */ -#define dn_alloc(_s) \ - _dn_alloc((size_t)(_s), __FILE__, __LINE__) +#define dn_alloc(_s) _dn_alloc((size_t)(_s), __FILE__, __LINE__) -#define dn_zalloc(_s) \ - _dn_zalloc((size_t)(_s), __FILE__, __LINE__) +#define dn_zalloc(_s) _dn_zalloc((size_t)(_s), __FILE__, __LINE__) -#define dn_calloc(_n, _s) \ - _dn_calloc((size_t)(_n), (size_t)(_s), __FILE__, __LINE__) +#define dn_calloc(_n, _s) \ + _dn_calloc((size_t)(_n), (size_t)(_s), __FILE__, __LINE__) -#define dn_realloc(_p, _s) \ - _dn_realloc(_p, (size_t)(_s), __FILE__, __LINE__) +#define dn_realloc(_p, _s) _dn_realloc(_p, (size_t)(_s), __FILE__, __LINE__) -#define dn_free(_p) do { \ - _dn_free(_p, __FILE__, __LINE__); \ - (_p) = NULL; \ -} while (0) +#define dn_free(_p) \ + do { \ + _dn_free(_p, __FILE__, __LINE__); \ + (_p) = NULL; \ + } while (0) void *_dn_alloc(size_t size, const char *name, int line); void *_dn_zalloc(size_t size, const char *name, int line); @@ -134,27 +133,21 @@ void _dn_free(void *ptr, const char *name, int line); * Wrappers to send or receive n byte message on a blocking * socket descriptor. 
*/ -#define dn_sendn(_s, _b, _n) \ - _dn_sendn(_s, _b, (size_t)(_n)) +#define dn_sendn(_s, _b, _n) _dn_sendn(_s, _b, (size_t)(_n)) -#define dn_recvn(_s, _b, _n) \ - _dn_recvn(_s, _b, (size_t)(_n)) +#define dn_recvn(_s, _b, _n) _dn_recvn(_s, _b, (size_t)(_n)) /* * Wrappers to read or write data to/from (multiple) buffers * to a file or socket descriptor. */ -#define dn_read(_d, _b, _n) \ - read(_d, _b, (size_t)(_n)) +#define dn_read(_d, _b, _n) read(_d, _b, (size_t)(_n)) -#define dn_readv(_d, _b, _n) \ - readv(_d, _b, (int)(_n)) +#define dn_readv(_d, _b, _n) readv(_d, _b, (int)(_n)) -#define dn_write(_d, _b, _n) \ - write(_d, _b, (size_t)(_n)) +#define dn_write(_d, _b, _n) write(_d, _b, (size_t)(_n)) -#define dn_writev(_d, _b, _n) \ - writev(_d, _b, (int)(_n)) +#define dn_writev(_d, _b, _n) writev(_d, _b, (int)(_n)) ssize_t _dn_sendn(int sd, const void *vptr, size_t n); ssize_t _dn_recvn(int sd, void *vptr, size_t n); @@ -169,48 +162,54 @@ ssize_t _dn_recvn(int sd, void *vptr, size_t n); #define ASSERT_CONCAT_(a, b) a##b #define ASSERT_CONCAT(a, b) ASSERT_CONCAT_(a, b) #ifdef __COUNTER__ - #define STATIC_ASSERT(e,m) \ - ;enum { ASSERT_CONCAT(static_assert_, __COUNTER__) = 1/(!!(e)) } +#define STATIC_ASSERT(e, m) \ + ; \ + enum { ASSERT_CONCAT(static_assert_, __COUNTER__) = 1 / (!!(e)) } #else - /* This can't be used twice on the same line so ensure if using in headers - * that the headers are not included twice (by wrapping in #ifndef...#endif) - * Note it doesn't cause an issue when used on same line of separate modules - * compiled with gcc -combine -fwhole-program. */ - #define STATIC_ASSERT(e,m) \ - ;enum { ASSERT_CONCAT(assert_line_, __LINE__) = 1/(!!(e)) } +/* This can't be used twice on the same line so ensure if using in headers + * that the headers are not included twice (by wrapping in #ifndef...#endif) + * Note it doesn't cause an issue when used on same line of separate modules + * compiled with gcc -combine -fwhole-program. 
*/ +#define STATIC_ASSERT(e, m) \ + ; \ + enum { ASSERT_CONCAT(assert_line_, __LINE__) = 1 / (!!(e)) } #endif #ifdef DN_ASSERT_PANIC -#define ASSERT(_x) do { \ - if (!(_x)) { \ - dn_assert(#_x, __FILE__, __LINE__, 1); \ - } \ -} while (0) - -#define ASSERT_LOG(_x, _M, ...) do { \ - if (!(_x)) { \ - log_error("Assertion Failed: "_M, ##__VA_ARGS__); \ - dn_assert(#_x, __FILE__, __LINE__, 1); \ - } \ -} while (0) +#define ASSERT(_x) \ + do { \ + if (!(_x)) { \ + dn_assert(#_x, __FILE__, __LINE__, 1); \ + } \ + } while (0) + +#define ASSERT_LOG(_x, _M, ...) \ + do { \ + if (!(_x)) { \ + log_error("Assertion Failed: "_M, ##__VA_ARGS__); \ + dn_assert(#_x, __FILE__, __LINE__, 1); \ + } \ + } while (0) #define NOT_REACHED() ASSERT(0) #elif DN_ASSERT_LOG -#define ASSERT(_x) do { \ - if (!(_x)) { \ - dn_assert(#_x, __FILE__, __LINE__, 0); \ - } \ -} while (0) - -#define ASSERT_LOG(_x, _M, ...) do { \ - if (!(_x)) { \ - log_error("ASSERTION FAILED: "_M, ##__VA_ARGS__); \ - dn_assert(#_x, __FILE__, __LINE__, 0); \ - } \ -} while (0) +#define ASSERT(_x) \ + do { \ + if (!(_x)) { \ + dn_assert(#_x, __FILE__, __LINE__, 0); \ + } \ + } while (0) + +#define ASSERT_LOG(_x, _M, ...) 
\ + do { \ + if (!(_x)) { \ + log_error("ASSERTION FAILED: "_M, ##__VA_ARGS__); \ + dn_assert(#_x, __FILE__, __LINE__, 0); \ + } \ + } while (0) #define NOT_REACHED() ASSERT(0) @@ -223,131 +222,131 @@ ssize_t _dn_recvn(int sd, void *vptr, size_t n); #endif - #ifdef DN_LITTLE_ENDIAN -#define str4cmp(m, c0, c1, c2, c3) \ - (*(uint32_t *) m == ((c3 << 24) | (c2 << 16) | (c1 << 8) | c0)) +#define str4cmp(m, c0, c1, c2, c3) \ + (*(uint32_t *)m == ((c3 << 24) | (c2 << 16) | (c1 << 8) | c0)) -#define str5cmp(m, c0, c1, c2, c3, c4) \ - (str4cmp(m, c0, c1, c2, c3) && (m[4] == c4)) +#define str5cmp(m, c0, c1, c2, c3, c4) \ + (str4cmp(m, c0, c1, c2, c3) && (m[4] == c4)) -#define str6cmp(m, c0, c1, c2, c3, c4, c5) \ - (str4cmp(m, c0, c1, c2, c3) && \ - (((uint32_t *) m)[1] & 0xffff) == ((c5 << 8) | c4)) +#define str6cmp(m, c0, c1, c2, c3, c4, c5) \ + (str4cmp(m, c0, c1, c2, c3) && \ + (((uint32_t *)m)[1] & 0xffff) == ((c5 << 8) | c4)) -#define str7cmp(m, c0, c1, c2, c3, c4, c5, c6) \ - (str6cmp(m, c0, c1, c2, c3, c4, c5) && (m[6] == c6)) +#define str7cmp(m, c0, c1, c2, c3, c4, c5, c6) \ + (str6cmp(m, c0, c1, c2, c3, c4, c5) && (m[6] == c6)) -#define str8cmp(m, c0, c1, c2, c3, c4, c5, c6, c7) \ - (str4cmp(m, c0, c1, c2, c3) && \ - (((uint32_t *) m)[1] == ((c7 << 24) | (c6 << 16) | (c5 << 8) | c4))) +#define str8cmp(m, c0, c1, c2, c3, c4, c5, c6, c7) \ + (str4cmp(m, c0, c1, c2, c3) && \ + (((uint32_t *)m)[1] == ((c7 << 24) | (c6 << 16) | (c5 << 8) | c4))) -#define str9cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8) \ - (str8cmp(m, c0, c1, c2, c3, c4, c5, c6, c7) && m[8] == c8) +#define str9cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8) \ + (str8cmp(m, c0, c1, c2, c3, c4, c5, c6, c7) && m[8] == c8) -#define str10cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9) \ - (str8cmp(m, c0, c1, c2, c3, c4, c5, c6, c7) && \ - (((uint32_t *) m)[2] & 0xffff) == ((c9 << 8) | c8)) +#define str10cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9) \ + (str8cmp(m, c0, c1, c2, c3, c4, c5, c6, c7) && \ + (((uint32_t 
*)m)[2] & 0xffff) == ((c9 << 8) | c8)) -#define str11cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10) \ - (str10cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9) && (m[10] == c10)) +#define str11cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10) \ + (str10cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9) && (m[10] == c10)) -#define str12cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11) \ - (str8cmp(m, c0, c1, c2, c3, c4, c5, c6, c7) && \ - (((uint32_t *) m)[2] == ((c11 << 24) | (c10 << 16) | (c9 << 8) | c8))) +#define str12cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11) \ + (str8cmp(m, c0, c1, c2, c3, c4, c5, c6, c7) && \ + (((uint32_t *)m)[2] == ((c11 << 24) | (c10 << 16) | (c9 << 8) | c8))) #else -#define str4cmp(m, c0, c1, c2, c3) \ - (m[0] == c0 && m[1] == c1 && m[2] == c2 && m[3] == c3) +#define str4cmp(m, c0, c1, c2, c3) \ + (m[0] == c0 && m[1] == c1 && m[2] == c2 && m[3] == c3) -#define str5cmp(m, c0, c1, c2, c3, c4) \ - (str4cmp(m, c0, c1, c2, c3) && (m[4] == c4)) +#define str5cmp(m, c0, c1, c2, c3, c4) \ + (str4cmp(m, c0, c1, c2, c3) && (m[4] == c4)) -#define str6cmp(m, c0, c1, c2, c3, c4, c5) \ - (str5cmp(m, c0, c1, c2, c3, c4) && m[5] == c5) +#define str6cmp(m, c0, c1, c2, c3, c4, c5) \ + (str5cmp(m, c0, c1, c2, c3, c4) && m[5] == c5) -#define str7cmp(m, c0, c1, c2, c3, c4, c5, c6) \ - (str6cmp(m, c0, c1, c2, c3, c4, c5) && m[6] == c6) +#define str7cmp(m, c0, c1, c2, c3, c4, c5, c6) \ + (str6cmp(m, c0, c1, c2, c3, c4, c5) && m[6] == c6) -#define str8cmp(m, c0, c1, c2, c3, c4, c5, c6, c7) \ - (str7cmp(m, c0, c1, c2, c3, c4, c5, c6) && m[7] == c7) +#define str8cmp(m, c0, c1, c2, c3, c4, c5, c6, c7) \ + (str7cmp(m, c0, c1, c2, c3, c4, c5, c6) && m[7] == c7) -#define str9cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8) \ - (str8cmp(m, c0, c1, c2, c3, c4, c5, c6, c7) && m[8] == c8) +#define str9cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8) \ + (str8cmp(m, c0, c1, c2, c3, c4, c5, c6, c7) && m[8] == c8) -#define str10cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, 
c8, c9) \ - (str9cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8) && m[9] == c9) +#define str10cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9) \ + (str9cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8) && m[9] == c9) -#define str11cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10) \ - (str10cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9) && m[10] == c10) +#define str11cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10) \ + (str10cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9) && m[10] == c10) -#define str12cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11) \ - (str11cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10) && m[11] == c11) +#define str12cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11) \ + (str11cmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10) && m[11] == c11) #endif -#define str3icmp(m, c0, c1, c2) \ - ((m[0] == c0 || m[0] == (c0 ^ 0x20)) && \ - (m[1] == c1 || m[1] == (c1 ^ 0x20)) && \ - (m[2] == c2 || m[2] == (c2 ^ 0x20))) +#define str3icmp(m, c0, c1, c2) \ + ((m[0] == c0 || m[0] == (c0 ^ 0x20)) && \ + (m[1] == c1 || m[1] == (c1 ^ 0x20)) && (m[2] == c2 || m[2] == (c2 ^ 0x20))) -#define str4icmp(m, c0, c1, c2, c3) \ - (str3icmp(m, c0, c1, c2) && (m[3] == c3 || m[3] == (c3 ^ 0x20))) +#define str4icmp(m, c0, c1, c2, c3) \ + (str3icmp(m, c0, c1, c2) && (m[3] == c3 || m[3] == (c3 ^ 0x20))) -#define str5icmp(m, c0, c1, c2, c3, c4) \ - (str4icmp(m, c0, c1, c2, c3) && (m[4] == c4 || m[4] == (c4 ^ 0x20))) +#define str5icmp(m, c0, c1, c2, c3, c4) \ + (str4icmp(m, c0, c1, c2, c3) && (m[4] == c4 || m[4] == (c4 ^ 0x20))) -#define str6icmp(m, c0, c1, c2, c3, c4, c5) \ - (str5icmp(m, c0, c1, c2, c3, c4) && (m[5] == c5 || m[5] == (c5 ^ 0x20))) +#define str6icmp(m, c0, c1, c2, c3, c4, c5) \ + (str5icmp(m, c0, c1, c2, c3, c4) && (m[5] == c5 || m[5] == (c5 ^ 0x20))) -#define str7icmp(m, c0, c1, c2, c3, c4, c5, c6) \ - (str6icmp(m, c0, c1, c2, c3, c4, c5) && \ - (m[6] == c6 || m[6] == (c6 ^ 0x20))) +#define str7icmp(m, c0, c1, c2, c3, c4, c5, c6) \ + (str6icmp(m, c0, c1, 
c2, c3, c4, c5) && (m[6] == c6 || m[6] == (c6 ^ 0x20))) -#define str8icmp(m, c0, c1, c2, c3, c4, c5, c6, c7) \ - (str7icmp(m, c0, c1, c2, c3, c4, c5, c6) && \ - (m[7] == c7 || m[7] == (c7 ^ 0x20))) +#define str8icmp(m, c0, c1, c2, c3, c4, c5, c6, c7) \ + (str7icmp(m, c0, c1, c2, c3, c4, c5, c6) && \ + (m[7] == c7 || m[7] == (c7 ^ 0x20))) -#define str9icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8) \ - (str8icmp(m, c0, c1, c2, c3, c4, c5, c6, c7) && \ - (m[8] == c8 || m[8] == (c8 ^ 0x20))) +#define str9icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8) \ + (str8icmp(m, c0, c1, c2, c3, c4, c5, c6, c7) && \ + (m[8] == c8 || m[8] == (c8 ^ 0x20))) -#define str10icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9) \ - (str9icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8) && \ - (m[9] == c9 || m[9] == (c9 ^ 0x20))) +#define str10icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9) \ + (str9icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8) && \ + (m[9] == c9 || m[9] == (c9 ^ 0x20))) -#define str11icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10) \ - (str10icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9) && \ - (m[10] == c10 || m[10] == (c10 ^ 0x20))) +#define str11icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10) \ + (str10icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9) && \ + (m[10] == c10 || m[10] == (c10 ^ 0x20))) -#define str12icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11) \ - (str11icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10) && \ - (m[11] == c11 || m[11] == (c11 ^ 0x20))) +#define str12icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11) \ + (str11icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10) && \ + (m[11] == c11 || m[11] == (c11 ^ 0x20))) -#define str13icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12) \ - (str12icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11) && \ - (m[12] == c12 || m[12] == (c12 ^ 0x20))) +#define str13icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12) \ + (str12icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, 
c10, c11) && \ + (m[12] == c12 || m[12] == (c12 ^ 0x20))) -#define str14icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13) \ - (str13icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12) && \ - (m[13] == c13 || m[13] == (c13 ^ 0x20))) - -#define str15icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14) \ - (str14icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13) && \ - (m[14] == c14 || m[14] == (c14 ^ 0x20))) - -#define str16icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15) \ - (str15icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14) && \ - (m[15] == c15 || m[15] == (c15 ^ 0x20))) - -#define str17icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16) \ - (str16icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15) && \ - (m[16] == c16 || m[16] == (c16 ^ 0x20))) +#define str14icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, \ + c13) \ + (str13icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12) && \ + (m[13] == c13 || m[13] == (c13 ^ 0x20))) +#define str15icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, \ + c13, c14) \ + (str14icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13) && \ + (m[14] == c14 || m[14] == (c14 ^ 0x20))) +#define str16icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, \ + c13, c14, c15) \ + (str15icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, \ + c14) && \ + (m[15] == c15 || m[15] == (c15 ^ 0x20))) +#define str17icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, \ + c13, c14, c15, c16) \ + (str16icmp(m, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, \ + c14, c15) && \ + (m[16] == c16 || m[16] == (c16 ^ 0x20))) void dn_assert(const char *cond, const char *file, int line, int panic); void dn_stacktrace(int skip_count); @@ -363,13 +362,13 @@ msec_t dn_msec_now(void); */ struct 
sockinfo { - int family; /* socket address family */ - socklen_t addrlen; /* socket address length */ - union { - struct sockaddr_in in; /* ipv4 socket address */ - struct sockaddr_in6 in6; /* ipv6 socket address */ - struct sockaddr_un un; /* unix domain address */ - } addr; + int family; /* socket address family */ + socklen_t addrlen; /* socket address length */ + union { + struct sockaddr_in in; /* ipv4 socket address */ + struct sockaddr_in6 in6; /* ipv6 socket address */ + struct sockaddr_un un; /* unix domain address */ + } addr; }; int dn_resolve(struct string *name, int port, struct sockinfo *si); @@ -381,5 +380,4 @@ unsigned int dict_string_hash(const void *key); int dict_string_key_compare(void *privdata, const void *key1, const void *key2); void dict_string_destructor(void *privdata, void *val); - #endif diff --git a/src/dyn_vnode.c b/src/dyn_vnode.c index 0e64116ef..c73b53623 100644 --- a/src/dyn_vnode.c +++ b/src/dyn_vnode.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -20,133 +20,132 @@ * limitations under the License. 
*/ +#include #include #include -#include -#include #include +#include #include #include -static int -vnode_item_cmp(const void *t1, const void *t2) -{ - const struct continuum *ct1 = t1, *ct2 = t2; +static int vnode_item_cmp(const void *t1, const void *t2) { + const struct continuum *ct1 = t1, *ct2 = t2; - return cmp_dyn_token(ct1->token, ct2->token); + return cmp_dyn_token(ct1->token, ct2->token); } -static rstatus_t -vnode_rack_verify_continuum(void *elem) -{ - struct rack *rack = elem; - qsort(rack->continuum, rack->ncontinuum, sizeof(*rack->continuum), - vnode_item_cmp); - - log_debug(LOG_VERB, "**** printing continuums for rack '%.*s'", rack->name->len, rack->name->data); - uint32_t i; - for (i = 0; i < rack->ncontinuum; i++) { - struct continuum *c = &rack->continuum[i]; - log_debug(LOG_VERB, "next c[%d]: idx = %u, token->mag = %u", i, c->index, c->token->mag[0]); +static rstatus_t vnode_rack_verify_continuum(void *elem) { + struct rack *rack = elem; + qsort(rack->continuum, rack->ncontinuum, sizeof(*rack->continuum), + vnode_item_cmp); + + log_debug(LOG_VERB, "**** printing continuums for rack '%.*s'", + rack->name->len, rack->name->data); + uint32_t i; + for (i = 0; i < rack->ncontinuum; i++) { + struct continuum *c = &rack->continuum[i]; + log_debug(LOG_VERB, "next c[%d]: idx = %u, token->mag = %u", i, c->index, + c->token->mag[0]); + } + log_debug(LOG_VERB, "**** end printing continuums for rack '%.*s'", + rack->name->len, rack->name->data); + + return DN_OK; +} + +rstatus_t vnode_update(struct server_pool *sp) { + ASSERT(array_n(&sp->peers) > 0); + + uint32_t i, len; + for (i = 0, len = array_n(&sp->peers); i < len; i++) { + struct node *peer = *(struct node **)array_get(&sp->peers, i); + + log_debug(LOG_VERB, "peer name : '%.*s'", peer->name.len, + peer->name.data); + log_debug(LOG_VERB, "peer rack : '%.*s'", peer->rack.len, + peer->rack.data); + log_debug(LOG_VERB, "peer dc : '%.*s'", peer->dc.len, peer->dc.data); + log_debug(LOG_VERB, "peer->processed 
= %d", peer->processed); + + // update its own state + if (i == 0) { + peer->state = sp->ctx->dyn_state; } - log_debug(LOG_VERB, "**** end printing continuums for rack '%.*s'", rack->name->len, rack->name->data); - return DN_OK; -} + if (peer->processed) { + continue; + } + + peer->processed = 1; + + struct datacenter *dc = server_get_dc(sp, &peer->dc); + struct rack *rack = server_get_rack(dc, &peer->rack); + + ASSERT(rack != NULL); -rstatus_t -vnode_update(struct server_pool *sp) -{ - ASSERT(array_n(&sp->peers) > 0); - - uint32_t i, len; - for (i = 0, len = array_n(&sp->peers); i < len; i++) { - struct node *peer = *(struct node **)array_get(&sp->peers, i); - - log_debug(LOG_VERB, "peer name : '%.*s'", peer->name.len, peer->name.data); - log_debug(LOG_VERB, "peer rack : '%.*s'", peer->rack.len, peer->rack.data); - log_debug(LOG_VERB, "peer dc : '%.*s'", peer->dc.len, peer->dc.data); - log_debug(LOG_VERB, "peer->processed = %d", peer->processed); - - //update its own state - if (i == 0) { - peer->state = sp->ctx->dyn_state; - } - - if (peer->processed) { - continue; - } - - peer->processed = 1; - - struct datacenter *dc = server_get_dc(sp, &peer->dc); - struct rack *rack = server_get_rack(dc, &peer->rack); - - ASSERT(rack != NULL); - - uint32_t token_cnt = array_n(&peer->tokens); - uint32_t orig_cnt = rack->nserver_continuum; - uint32_t new_cnt = orig_cnt + token_cnt; - - if (new_cnt > 1) { - struct continuum *continuum = dn_realloc(rack->continuum, sizeof(struct continuum) * new_cnt); - if (continuum == NULL) { - log_debug(LOG_ERR, "Are we failing? Why???? 
This is a serious issue"); - return DN_ENOMEM; - } - - rack->continuum = continuum; - } - rack->nserver_continuum = new_cnt; - - uint32_t j; - for (j = 0; j < token_cnt; j++) { - struct continuum *c = &rack->continuum[orig_cnt + j]; - c->index = i; - c->value = 0; /* set this to an empty value, only used by ketama */ - c->token = array_get(&peer->tokens, j); - rack->ncontinuum++; - } - - if (array_n(&dc->racks) != 0) { - rstatus_t status = array_each(&dc->racks, vnode_rack_verify_continuum); - if (status != DN_OK) { - return status; - } - } + uint32_t token_cnt = array_n(&peer->tokens); + uint32_t orig_cnt = rack->nserver_continuum; + uint32_t new_cnt = orig_cnt + token_cnt; + + if (new_cnt > 1) { + struct continuum *continuum = + dn_realloc(rack->continuum, sizeof(struct continuum) * new_cnt); + if (continuum == NULL) { + log_debug(LOG_ERR, "Are we failing? Why???? This is a serious issue"); + return DN_ENOMEM; + } + + rack->continuum = continuum; + } + rack->nserver_continuum = new_cnt; + + uint32_t j; + for (j = 0; j < token_cnt; j++) { + struct continuum *c = &rack->continuum[orig_cnt + j]; + c->index = i; + c->value = 0; /* set this to an empty value, only used by ketama */ + c->token = array_get(&peer->tokens, j); + rack->ncontinuum++; } + if (array_n(&dc->racks) != 0) { + rstatus_t status = array_each(&dc->racks, vnode_rack_verify_continuum); + if (status != DN_OK) { + return status; + } + } + } - return DN_OK; + return DN_OK; } -//if token falls into interval (a,b], we return b. 
-uint32_t -vnode_dispatch(struct continuum *continuum, uint32_t ncontinuum, struct dyn_token *token) -{ - struct continuum *left, *right, *middle; - - ASSERT(continuum != NULL); - ASSERT(ncontinuum != 0); - - left = continuum; - right = continuum + ncontinuum - 1; - - if (cmp_dyn_token(right->token, token) < 0 || cmp_dyn_token(left->token, token) >= 0) - return left->index; - - while (left < right) { - middle = left + (right - left) / 2; - int32_t cmp = cmp_dyn_token(middle->token, token); - if (cmp == 0) { - return middle->index; - } else if (cmp < 0) { - left = middle + 1; - } else { - right = middle; - } +// if token falls into interval (a,b], we return b. +uint32_t vnode_dispatch(struct continuum *continuum, uint32_t ncontinuum, + struct dyn_token *token) { + struct continuum *left, *right, *middle; + + ASSERT(continuum != NULL); + ASSERT(ncontinuum != 0); + + left = continuum; + right = continuum + ncontinuum - 1; + + if (cmp_dyn_token(right->token, token) < 0 || + cmp_dyn_token(left->token, token) >= 0) + return left->index; + + while (left < right) { + middle = left + (right - left) / 2; + int32_t cmp = cmp_dyn_token(middle->token, token); + if (cmp == 0) { + return middle->index; + } else if (cmp < 0) { + left = middle + 1; + } else { + right = middle; } + } - return right->index; + return right->index; } diff --git a/src/dyn_vnode.h b/src/dyn_vnode.h index 467118a80..10630b644 100644 --- a/src/dyn_vnode.h +++ b/src/dyn_vnode.h @@ -2,5 +2,5 @@ #include rstatus_t vnode_update(struct server_pool *pool); -uint32_t vnode_dispatch(struct continuum *continuum, uint32_t ncontinuum, struct dyn_token *token); - +uint32_t vnode_dispatch(struct continuum *continuum, uint32_t ncontinuum, + struct dyn_token *token); diff --git a/src/dynomite.c b/src/dynomite.c index 17127c426..58bcc1a63 100644 --- a/src/dynomite.c +++ b/src/dynomite.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. 
- * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -20,40 +20,40 @@ * limitations under the License. */ +#include +#include +#include #include #include -#include -#include -#include -#include #include #include #include +#include -#include "dyn_core.h" +#include "dyn_asciilogo.h" #include "dyn_conf.h" +#include "dyn_core.h" #include "dyn_signal.h" -#include "dyn_asciilogo.h" -#define DN_CONF_PATH "conf/dynomite.yml" +#define DN_CONF_PATH "conf/dynomite.yml" -#define DN_LOG_DEFAULT LOG_NOTICE -#define DN_LOG_MIN LOG_EMERG -#define DN_LOG_MAX LOG_PVERB -#define DN_LOG_PATH NULL +#define DN_LOG_DEFAULT LOG_NOTICE +#define DN_LOG_MIN LOG_EMERG +#define DN_LOG_MAX LOG_PVERB +#define DN_LOG_PATH NULL -#define DN_ENTROPY_PORT ENTROPY_PORT -#define DN_ENTROPY_ADDR ENTROPY_ADDR +#define DN_ENTROPY_PORT ENTROPY_PORT +#define DN_ENTROPY_ADDR ENTROPY_ADDR -#define DN_PID_FILE NULL +#define DN_PID_FILE NULL -#define DN_MBUF_SIZE MBUF_SIZE -#define DN_MBUF_MIN_SIZE MBUF_MIN_SIZE -#define DN_MBUF_MAX_SIZE MBUF_MAX_SIZE +#define DN_MBUF_SIZE MBUF_SIZE +#define DN_MBUF_MIN_SIZE MBUF_MIN_SIZE +#define DN_MBUF_MAX_SIZE MBUF_MAX_SIZE -#define DN_ALLOC_MSGS ALLOC_MSGS -#define DN_MIN_ALLOC_MSGS MIN_ALLOC_MSGS -#define DN_MAX_ALLOC_MSGS MAX_ALLOC_MSGS +#define DN_ALLOC_MSGS ALLOC_MSGS +#define DN_MIN_ALLOC_MSGS MIN_ALLOC_MSGS +#define DN_MAX_ALLOC_MSGS MAX_ALLOC_MSGS static int show_help; static int show_version; @@ -62,21 +62,20 @@ static int daemonize; static int describe_stats; static struct option long_options[] = { - { "help", no_argument, NULL, 'h' }, - { "version", no_argument, NULL, 'V' }, - { "test-conf", no_argument, NULL, 't' }, - { "daemonize", no_argument, NULL, 'd' }, - { "describe-stats", no_argument, NULL, 'D' }, - { "verbosity", required_argument, NULL, 'v' }, - { "output", 
required_argument, NULL, 'o' }, - { "conf-file", required_argument, NULL, 'c' }, - { "pid-file", required_argument, NULL, 'p' }, - { "mbuf-size", required_argument, NULL, 'm' }, - { "max-msgs", required_argument, NULL, 'M' }, - { "admin-operation", required_argument, NULL, 'x' }, - { "admin-param", required_argument, NULL, 'y' }, - { NULL, 0, NULL, 0 } -}; + {"help", no_argument, NULL, 'h'}, + {"version", no_argument, NULL, 'V'}, + {"test-conf", no_argument, NULL, 't'}, + {"daemonize", no_argument, NULL, 'd'}, + {"describe-stats", no_argument, NULL, 'D'}, + {"verbosity", required_argument, NULL, 'v'}, + {"output", required_argument, NULL, 'o'}, + {"conf-file", required_argument, NULL, 'c'}, + {"pid-file", required_argument, NULL, 'p'}, + {"mbuf-size", required_argument, NULL, 'm'}, + {"max-msgs", required_argument, NULL, 'M'}, + {"admin-operation", required_argument, NULL, 'x'}, + {"admin-param", required_argument, NULL, 'y'}, + {NULL, 0, NULL, 0}}; static char short_options[] = "hVtdDv:o:c:s:i:a:p:m:M:x:y"; @@ -85,207 +84,192 @@ static char short_options[] = "hVtdDv:o:c:s:i:a:p:m:M:x:y"; * @param[in] dump_core If set to 0 then dynomite tries to chdir to /. * @return rstatus_t Return status code. 
*/ -static rstatus_t -dn_daemonize(int dump_core) -{ - rstatus_t status; - pid_t pid, sid; - int fd; - - pid = fork(); - switch (pid) { +static rstatus_t dn_daemonize(int dump_core) { + rstatus_t status; + pid_t pid, sid; + int fd; + + pid = fork(); + switch (pid) { case -1: - log_error("fork() failed: %s", strerror(errno)); - return DN_ERROR; + log_error("fork() failed: %s", strerror(errno)); + return DN_ERROR; case 0: - break; + break; default: - /* parent terminates */ - _exit(0); - } + /* parent terminates */ + _exit(0); + } - /* 1st child continues and becomes the session leader */ + /* 1st child continues and becomes the session leader */ - sid = setsid(); - if (sid < 0) { - log_error("setsid() failed: %s", strerror(errno)); - return DN_ERROR; - } + sid = setsid(); + if (sid < 0) { + log_error("setsid() failed: %s", strerror(errno)); + return DN_ERROR; + } - if (signal(SIGHUP, SIG_IGN) == SIG_ERR) { - log_error("signal(SIGHUP, SIG_IGN) failed: %s", strerror(errno)); - return DN_ERROR; - } + if (signal(SIGHUP, SIG_IGN) == SIG_ERR) { + log_error("signal(SIGHUP, SIG_IGN) failed: %s", strerror(errno)); + return DN_ERROR; + } - pid = fork(); - switch (pid) { + pid = fork(); + switch (pid) { case -1: - log_error("fork() failed: %s", strerror(errno)); - return DN_ERROR; + log_error("fork() failed: %s", strerror(errno)); + return DN_ERROR; case 0: - break; + break; default: - /* 1st child terminates */ - _exit(0); - } + /* 1st child terminates */ + _exit(0); + } - /* 2nd child continues */ + /* 2nd child continues */ - /* change working directory */ - if (dump_core == 0) { - status = chdir("/"); - if (status < 0) { - log_error("chdir(\"/\") failed: %s", strerror(errno)); - return DN_ERROR; - } + /* change working directory */ + if (dump_core == 0) { + status = chdir("/"); + if (status < 0) { + log_error("chdir(\"/\") failed: %s", strerror(errno)); + return DN_ERROR; } + } - /* clear file mode creation mask */ - umask(0); + /* clear file mode creation mask */ + 
umask(0); - /* redirect stdin, stdout and stderr to "/dev/null" */ + /* redirect stdin, stdout and stderr to "/dev/null" */ - fd = open("/dev/null", O_RDWR); - if (fd < 0) { - log_error("open(\"/dev/null\") failed: %s", strerror(errno)); - return DN_ERROR; - } + fd = open("/dev/null", O_RDWR); + if (fd < 0) { + log_error("open(\"/dev/null\") failed: %s", strerror(errno)); + return DN_ERROR; + } - status = dup2(fd, STDIN_FILENO); - if (status < 0) { - log_error("dup2(%d, STDIN) failed: %s", fd, strerror(errno)); - close(fd); - return DN_ERROR; - } + status = dup2(fd, STDIN_FILENO); + if (status < 0) { + log_error("dup2(%d, STDIN) failed: %s", fd, strerror(errno)); + close(fd); + return DN_ERROR; + } - status = dup2(fd, STDOUT_FILENO); - if (status < 0) { - log_error("dup2(%d, STDOUT) failed: %s", fd, strerror(errno)); - close(fd); - return DN_ERROR; - } + status = dup2(fd, STDOUT_FILENO); + if (status < 0) { + log_error("dup2(%d, STDOUT) failed: %s", fd, strerror(errno)); + close(fd); + return DN_ERROR; + } - status = dup2(fd, STDERR_FILENO); - if (status < 0) { - log_error("dup2(%d, STDERR) failed: %s", fd, strerror(errno)); - close(fd); - return DN_ERROR; - } + status = dup2(fd, STDERR_FILENO); + if (status < 0) { + log_error("dup2(%d, STDERR) failed: %s", fd, strerror(errno)); + close(fd); + return DN_ERROR; + } - if (fd > STDERR_FILENO) { - status = close(fd); - if (status < 0) { - log_error("close(%d) failed: %s", fd, strerror(errno)); - return DN_ERROR; - } + if (fd > STDERR_FILENO) { + status = close(fd); + if (status < 0) { + log_error("close(%d) failed: %s", fd, strerror(errno)); + return DN_ERROR; } + } - return DN_OK; + return DN_OK; } /** * Print start messages. 
* @param[in] nci Dynomite instance */ -static void -dn_print_run(struct instance *nci) -{ - int status; - struct utsname name; - - status = uname(&name); - if (status < 0) { - loga("dynomite-%s started on pid %d", VERSION, nci->pid); - } else { - loga("dynomite-%s built for %s %s %s started on pid %d", - VERSION, name.sysname, name.release, name.machine, nci->pid); - } - - loga("run, rabbit run / dig that hole, forget the sun / " - "and when at last the work is done / don't sit down / " - "it's time to dig another one"); - - loga("%s",ascii_logo); +static void dn_print_run(struct instance *nci) { + int status; + struct utsname name; + + status = uname(&name); + if (status < 0) { + loga("dynomite-%s started on pid %d", VERSION, nci->pid); + } else { + loga("dynomite-%s built for %s %s %s started on pid %d", VERSION, + name.sysname, name.release, name.machine, nci->pid); + } + + loga( + "run, rabbit run / dig that hole, forget the sun / " + "and when at last the work is done / don't sit down / " + "it's time to dig another one"); + + loga("%s", ascii_logo); } -static void -dn_print_done(void) -{ - loga("done, rabbit done"); +static void dn_print_done(void) { loga("done, rabbit done"); } + +static void dn_show_usage(void) { + log_stderr( + "Usage: dynomite [-?hVdDt] [-v verbosity level] [-o output file]" CRLF + " [-c conf file] [-p pid file] [-m mbuf size " + "(deprecated)]" CRLF + " [-M max alloc messages (deprecated)]" CRLF ""); + log_stderr("Options:" CRLF " -h, --help : this help" CRLF + " -V, --version : show version and exit" CRLF + " -t, --test-conf : test configuration for syntax errors " + "and exit" CRLF " -d, --daemonize : run as a daemon" CRLF + " -D, --describe-stats : print stats description and exit"); + log_stderr( + " -v, --verbosity=N : set logging level (default: %d, min: " + "%d, max: %d)" CRLF + " -o, --output=S : set logging file (default: %s)" CRLF + " -c, --conf-file=S : set configuration file (default: " + "%s)" CRLF + " -p, --pid-file=S : set 
pid file (default: %s)" CRLF + " -m, --mbuf-size=N : set size of mbuf chunk in bytes " + "(default: %d bytes)" CRLF "", + DN_LOG_DEFAULT, DN_LOG_MIN, DN_LOG_MAX, + DN_LOG_PATH != NULL ? DN_LOG_PATH : "stderr", DN_CONF_PATH, + DN_PID_FILE != NULL ? DN_PID_FILE : "off", 0); } -static void -dn_show_usage(void) -{ - log_stderr( - "Usage: dynomite [-?hVdDt] [-v verbosity level] [-o output file]" CRLF - " [-c conf file] [-p pid file] [-m mbuf size (deprecated)]" CRLF - " [-M max alloc messages (deprecated)]" CRLF - ""); - log_stderr( - "Options:" CRLF - " -h, --help : this help" CRLF - " -V, --version : show version and exit" CRLF - " -t, --test-conf : test configuration for syntax errors and exit" CRLF - " -d, --daemonize : run as a daemon" CRLF - " -D, --describe-stats : print stats description and exit"); - log_stderr( - " -v, --verbosity=N : set logging level (default: %d, min: %d, max: %d)" CRLF - " -o, --output=S : set logging file (default: %s)" CRLF - " -c, --conf-file=S : set configuration file (default: %s)" CRLF - " -p, --pid-file=S : set pid file (default: %s)" CRLF - " -m, --mbuf-size=N : set size of mbuf chunk in bytes (default: %d bytes)" CRLF - "", - DN_LOG_DEFAULT, DN_LOG_MIN, DN_LOG_MAX, - DN_LOG_PATH != NULL ? DN_LOG_PATH : "stderr", - DN_CONF_PATH, - DN_PID_FILE != NULL ? 
DN_PID_FILE : "off", - 0); -} +static rstatus_t dn_create_pidfile(struct instance *nci) { + char pid[DN_UINTMAX_MAXLEN]; + int fd, pid_len; + ssize_t n; -static rstatus_t -dn_create_pidfile(struct instance *nci) -{ - char pid[DN_UINTMAX_MAXLEN]; - int fd, pid_len; - ssize_t n; - - fd = open(nci->pid_filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); - if (fd < 0) { - log_error("opening pid file '%s' failed: %s", nci->pid_filename, - strerror(errno)); - return DN_ERROR; - } - nci->pidfile = 1; + fd = open(nci->pid_filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + log_error("opening pid file '%s' failed: %s", nci->pid_filename, + strerror(errno)); + return DN_ERROR; + } + nci->pidfile = 1; - pid_len = dn_snprintf(pid, DN_UINTMAX_MAXLEN, "%d", nci->pid); + pid_len = dn_snprintf(pid, DN_UINTMAX_MAXLEN, "%d", nci->pid); - n = dn_write(fd, pid, pid_len); - if (n < 0) { - log_error("write to pid file '%s' failed: %s", nci->pid_filename, - strerror(errno)); - return DN_ERROR; - } + n = dn_write(fd, pid, pid_len); + if (n < 0) { + log_error("write to pid file '%s' failed: %s", nci->pid_filename, + strerror(errno)); + return DN_ERROR; + } - close(fd); + close(fd); - return DN_OK; + return DN_OK; } -static void -dn_remove_pidfile(struct instance *nci) -{ - int status; +static void dn_remove_pidfile(struct instance *nci) { + int status; - status = unlink(nci->pid_filename); - if (status < 0) { - log_error("unlink of pid file '%s' failed, ignored: %s", - nci->pid_filename, strerror(errno)); - } + status = unlink(nci->pid_filename); + if (status < 0) { + log_error("unlink of pid file '%s' failed, ignored: %s", nci->pid_filename, + strerror(errno)); + } } /** @@ -293,35 +277,33 @@ dn_remove_pidfile(struct instance *nci) * hostname which is set via gethostname(). 
* @param nci dynomite instance */ -static void -dn_set_default_options(struct instance *nci) -{ - int status; +static void dn_set_default_options(struct instance *nci) { + int status; - nci->ctx = NULL; + nci->ctx = NULL; - nci->log_level = DN_LOG_DEFAULT; - nci->log_filename = DN_LOG_PATH; + nci->log_level = DN_LOG_DEFAULT; + nci->log_filename = DN_LOG_PATH; - nci->conf_filename = DN_CONF_PATH; + nci->conf_filename = DN_CONF_PATH; - nci->entropy_port = DN_ENTROPY_PORT; - nci->entropy_addr = DN_ENTROPY_ADDR; + nci->entropy_port = DN_ENTROPY_PORT; + nci->entropy_addr = DN_ENTROPY_ADDR; - status = dn_gethostname(nci->hostname, DN_MAXHOSTNAMELEN); - if (status < 0) { - log_warn("gethostname failed, ignored: %s", strerror(errno)); - dn_snprintf(nci->hostname, DN_MAXHOSTNAMELEN, "unknown"); - } - nci->hostname[DN_MAXHOSTNAMELEN - 1] = '\0'; + status = dn_gethostname(nci->hostname, DN_MAXHOSTNAMELEN); + if (status < 0) { + log_warn("gethostname failed, ignored: %s", strerror(errno)); + dn_snprintf(nci->hostname, DN_MAXHOSTNAMELEN, "unknown"); + } + nci->hostname[DN_MAXHOSTNAMELEN - 1] = '\0'; - nci->mbuf_chunk_size = DN_MBUF_SIZE; + nci->mbuf_chunk_size = DN_MBUF_SIZE; - nci->alloc_msgs_max = DN_ALLOC_MSGS; + nci->alloc_msgs_max = DN_ALLOC_MSGS; - nci->pid = (pid_t)-1; - nci->pid_filename = NULL; - nci->pidfile = 0; + nci->pid = (pid_t)-1; + nci->pid_filename = NULL; + nci->pidfile = 0; } /** @@ -331,146 +313,149 @@ dn_set_default_options(struct instance *nci) * @param nci dynomite instance * @return return status */ -static rstatus_t -dn_get_options(int argc, char **argv, struct instance *nci) -{ - int c, value; +static rstatus_t dn_get_options(int argc, char **argv, struct instance *nci) { + int c, value; - opterr = 0; - - c = getopt_long(argc, argv, short_options, long_options, NULL); - while (c != -1) { - - switch (c) { - case 'h': - show_version = 1; - show_help = 1; - break; + opterr = 0; - case 'V': - show_version = 1; - break; + c = getopt_long(argc, argv, 
short_options, long_options, NULL); + while (c != -1) { + switch (c) { + case 'h': + show_version = 1; + show_help = 1; + break; - case 't': - test_conf = 1; - nci->log_level = 11; - break; + case 'V': + show_version = 1; + break; - case 'd': - daemonize = 1; - break; + case 't': + test_conf = 1; + nci->log_level = 11; + break; - case 'D': - describe_stats = 1; - show_version = 1; - break; + case 'd': + daemonize = 1; + break; - case 'v': - value = dn_atoi(optarg, strlen(optarg)); - if (value < 0) { - log_stderr("dynomite: option -v requires a number"); - return DN_ERROR; - } - nci->log_level = value; - break; + case 'D': + describe_stats = 1; + show_version = 1; + break; - case 'o': - nci->log_filename = optarg; - break; + case 'v': + value = dn_atoi(optarg, strlen(optarg)); + if (value < 0) { + log_stderr("dynomite: option -v requires a number"); + return DN_ERROR; + } + nci->log_level = value; + break; - case 'c': - nci->conf_filename = optarg; - break; + case 'o': + nci->log_filename = optarg; + break; - case 'p': - nci->pid_filename = optarg; - break; + case 'c': + nci->conf_filename = optarg; + break; - case 'm': // deprecated argument - loga("-m or --mbuf-size command line arguments has been deprecated. Use configuration file."); - value = dn_atoi(optarg, strlen(optarg)); - if (value <= 0) { - log_stderr("dynomite: option -m requires a non-zero number"); - return DN_ERROR; - } - - if (value < DN_MBUF_MIN_SIZE || value > DN_MBUF_MAX_SIZE) { - log_stderr("dynomite: mbuf chunk size must be between %zu and" - " %zu bytes", DN_MBUF_MIN_SIZE, DN_MBUF_MAX_SIZE); - return DN_ERROR; - } - - if ((value / 16) * 16 != value) { - log_stderr("dynomite: mbuf chunk size must be a multiple of 16"); - return DN_ERROR; - } - - nci->mbuf_chunk_size = (size_t)value; - break; + case 'p': + nci->pid_filename = optarg; + break; - case 'M': // deprecated argument - loga("-M or max-msgs command line argument has been deprecated. 
Use configuration file."); - value = dn_atoi(optarg, strlen(optarg)); - if (value <= 0) { - log_stderr("dynomite: option -M requires a non-zero number"); - return DN_ERROR; - } + case 'm': // deprecated argument + loga( + "-m or --mbuf-size command line arguments has been deprecated. Use " + "configuration file."); + value = dn_atoi(optarg, strlen(optarg)); + if (value <= 0) { + log_stderr("dynomite: option -m requires a non-zero number"); + return DN_ERROR; + } - if (value < DN_MIN_ALLOC_MSGS || value > DN_MAX_ALLOC_MSGS) { - log_stderr("dynomite: max allocated messages buffer must be between %zu and" - " %zu messages", DN_MIN_ALLOC_MSGS, DN_MAX_ALLOC_MSGS); - return DN_ERROR; - } + if (value < DN_MBUF_MIN_SIZE || value > DN_MBUF_MAX_SIZE) { + log_stderr( + "dynomite: mbuf chunk size must be between %zu and" + " %zu bytes", + DN_MBUF_MIN_SIZE, DN_MBUF_MAX_SIZE); + return DN_ERROR; + } - nci->alloc_msgs_max = (size_t)value; + if ((value / 16) * 16 != value) { + log_stderr("dynomite: mbuf chunk size must be a multiple of 16"); + return DN_ERROR; + } - break; + nci->mbuf_chunk_size = (size_t)value; + break; - case 'x': - value = dn_atoi(optarg, strlen(optarg)); - if (value <= 0) { - log_stderr("dynomite: option -x requires a non-zero positive number"); - return DN_ERROR; - } - admin_opt = (uint32_t)value; + case 'M': // deprecated argument + loga( + "-M or max-msgs command line argument has been deprecated. 
Use " + "configuration file."); + value = dn_atoi(optarg, strlen(optarg)); + if (value <= 0) { + log_stderr("dynomite: option -M requires a non-zero number"); + return DN_ERROR; + } + if (value < DN_MIN_ALLOC_MSGS || value > DN_MAX_ALLOC_MSGS) { + log_stderr( + "dynomite: max allocated messages buffer must be between %zu and" + " %zu messages", + DN_MIN_ALLOC_MSGS, DN_MAX_ALLOC_MSGS); + return DN_ERROR; + } + + nci->alloc_msgs_max = (size_t)value; + + break; + + case 'x': + value = dn_atoi(optarg, strlen(optarg)); + if (value <= 0) { + log_stderr("dynomite: option -x requires a non-zero positive number"); + return DN_ERROR; + } + admin_opt = (uint32_t)value; + + break; + case '?': + switch (optopt) { + case 'o': + case 'c': + case 'p': + log_stderr("dynomite: option -%c requires a file name", optopt); + break; + + case 'm': + case 'M': + case 'v': + case 's': + case 'i': + log_stderr("dynomite: option -%c requires a number", optopt); + break; + + case 'a': + log_stderr("dynomite: option -%c requires a string", optopt); break; - case '?': - switch (optopt) { - case 'o': - case 'c': - case 'p': - log_stderr("dynomite: option -%c requires a file name", - optopt); - break; - - case 'm': - case 'M': - case 'v': - case 's': - case 'i': - log_stderr("dynomite: option -%c requires a number", optopt); - break; - - case 'a': - log_stderr("dynomite: option -%c requires a string", optopt); - break; - - default: - log_stderr("dynomite: invalid option -- '%c'", optopt); - break; - } - return DN_ERROR; - - default: - log_stderr("dynomite: invalid option -- '%c'", optopt); - return DN_ERROR; + default: + log_stderr("dynomite: invalid option -- '%c'", optopt); + break; } - c = getopt_long(argc, argv, short_options, long_options, NULL); + return DN_ERROR; + + default: + log_stderr("dynomite: invalid option -- '%c'", optopt); + return DN_ERROR; } + c = getopt_long(argc, argv, short_options, long_options, NULL); + } - return DN_OK; + return DN_OK; } /** @@ -479,23 +464,21 @@ 
dn_get_options(int argc, char **argv, struct instance *nci) * @return bool true if the configuration file has a valid syntax or false if * syntax is invalid */ -static bool -dn_test_conf(struct instance *nci) -{ - struct conf *cf; - - cf = conf_create(nci->conf_filename); - if (cf == NULL) { - log_stderr("dynomite: configuration file '%s' syntax is invalid", - nci->conf_filename); - return false; - } +static bool dn_test_conf(struct instance *nci) { + struct conf *cf; - conf_destroy(cf); - - log_stderr("dynomite: configuration file '%s' syntax is valid", + cf = conf_create(nci->conf_filename); + if (cf == NULL) { + log_stderr("dynomite: configuration file '%s' syntax is invalid", nci->conf_filename); - return true; + return false; + } + + conf_destroy(cf); + + log_stderr("dynomite: configuration file '%s' syntax is valid", + nci->conf_filename); + return true; } /** @@ -504,40 +487,38 @@ dn_test_conf(struct instance *nci) * @param[in] nci Dynomite instance. * @return rstatus_t Return status code. 
*/ -static rstatus_t -dn_pre_run(struct instance *nci) -{ - rstatus_t status; +static rstatus_t dn_pre_run(struct instance *nci) { + rstatus_t status; + + status = log_init(nci->log_level, nci->log_filename); + if (status != DN_OK) { + return status; + } - status = log_init(nci->log_level, nci->log_filename); + if (daemonize) { + status = dn_daemonize(1); if (status != DN_OK) { - return status; + return status; } + } - if (daemonize) { - status = dn_daemonize(1); - if (status != DN_OK) { - return status; - } - } + nci->pid = getpid(); - nci->pid = getpid(); + status = signal_init(); + if (status != DN_OK) { + return status; + } - status = signal_init(); + if (nci->pid_filename) { + status = dn_create_pidfile(nci); if (status != DN_OK) { - return status; - } - - if (nci->pid_filename) { - status = dn_create_pidfile(nci); - if (status != DN_OK) { - return status; - } + return status; } + } - dn_print_run(nci); + dn_print_run(nci); - return DN_OK; + return DN_OK; } /** @@ -545,18 +526,16 @@ dn_pre_run(struct instance *nci) * and close the logging file descriptor. * @param[in] nci Dynomite instance. */ -static void -dn_post_run(struct instance *nci) -{ - if (nci->pidfile) { - dn_remove_pidfile(nci); - } +static void dn_post_run(struct instance *nci) { + if (nci->pidfile) { + dn_remove_pidfile(nci); + } - signal_deinit(); + signal_deinit(); - dn_print_done(); + dn_print_done(); - log_deinit(); + log_deinit(); } /** @@ -566,87 +545,80 @@ dn_post_run(struct instance *nci) * @param[in] nci Dynomite instance. * @return rstatus_t Return status code. 
*/ -static rstatus_t -dn_run(struct instance *nci) -{ - rstatus_t status; +static rstatus_t dn_run(struct instance *nci) { + rstatus_t status; - THROW_STATUS(core_start(nci)); + THROW_STATUS(core_start(nci)); - struct context *ctx = nci->ctx; + struct context *ctx = nci->ctx; - struct server_pool *sp = &ctx->pool; - if (!sp->enable_gossip) - core_set_local_state(ctx, NORMAL); + struct server_pool *sp = &ctx->pool; + if (!sp->enable_gossip) core_set_local_state(ctx, NORMAL); - /* run rabbit run */ - for (;;) { - status = core_loop(ctx); - if (status != DN_OK) { - break; - } + /* run rabbit run */ + for (;;) { + status = core_loop(ctx); + if (status != DN_OK) { + break; } + } - core_stop(ctx); - return DN_OK; + core_stop(ctx); + return DN_OK; } /** * Set unlimited core dump resource limits. */ -static void -dn_coredump_init(void) -{ - struct rlimit core_limits; - core_limits.rlim_cur = core_limits.rlim_max = RLIM_INFINITY; - setrlimit(RLIMIT_CORE, &core_limits); +static void dn_coredump_init(void) { + struct rlimit core_limits; + core_limits.rlim_cur = core_limits.rlim_max = RLIM_INFINITY; + setrlimit(RLIMIT_CORE, &core_limits); } -int -main(int argc, char **argv) -{ - rstatus_t status; - struct instance nci; - - dn_coredump_init(); - dn_set_default_options(&nci); - - status = dn_get_options(argc, argv, &nci); - if (status != DN_OK) { - dn_show_usage(); - exit(1); - } +int main(int argc, char **argv) { + rstatus_t status; + struct instance nci; - if (show_version) { - log_stderr("This is dynomite-%s" CRLF, VERSION); - if (show_help) { - dn_show_usage(); - } + dn_coredump_init(); + dn_set_default_options(&nci); - if (describe_stats) { - stats_describe(); - } + status = dn_get_options(argc, argv, &nci); + if (status != DN_OK) { + dn_show_usage(); + exit(1); + } - exit(0); + if (show_version) { + log_stderr("This is dynomite-%s" CRLF, VERSION); + if (show_help) { + dn_show_usage(); } - if (test_conf) { - if (!dn_test_conf(&nci)) { - exit(1); - } - exit(0); + if 
(describe_stats) { + stats_describe(); } - status = dn_pre_run(&nci); - if (status != DN_OK) { - dn_post_run(&nci); - exit(1); - } + exit(0); + } - status = dn_run(&nci); - IGNORE_RET_VAL(status); + if (test_conf) { + if (!dn_test_conf(&nci)) { + exit(1); + } + exit(0); + } + status = dn_pre_run(&nci); + if (status != DN_OK) { dn_post_run(&nci); - exit(1); + } + + status = dn_run(&nci); + IGNORE_RET_VAL(status); + + dn_post_run(&nci); + + exit(1); } diff --git a/src/entropy/dyn_entropy.h b/src/entropy/dyn_entropy.h index 7439caa24..055fccc17 100644 --- a/src/entropy/dyn_entropy.h +++ b/src/entropy/dyn_entropy.h @@ -1,6 +1,6 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2015 Netflix, Inc. + * Dynomite - A thin, distributed replication layer for multi non-distributed + *storages. Copyright (C) 2015 Netflix, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,18 +15,21 @@ * limitations under the License. 
*/ +#ifndef _DYN_ENTROPY_H_ +#define _DYN_ENTROPY_H_ +#include "../dyn_string.h" +#include "../dyn_types.h" -#include "dyn_core.h" +#define ENTROPY_ADDR "127.0.0.1" +#define ENTROPY_PORT 8105 -#define ENTROPY_ADDR "127.0.0.1" -#define ENTROPY_PORT 8105 +#define ENCRYPT_FLAG 1 +#define DECRYPT_FLAG 0 -#define ENCRYPT_FLAG 1 -#define DECRYPT_FLAG 0 - -#define BUFFER_SIZE (16 * 1024) // BUFFER_SIZE 16384 -#define CIPHER_SIZE (BUFFER_SIZE + 1024) // CIPHER_SIZE must be larger than BUFFER_SIZE +#define BUFFER_SIZE (16 * 1024) // BUFFER_SIZE 16384 +#define CIPHER_SIZE \ + (BUFFER_SIZE + 1024) // CIPHER_SIZE must be larger than BUFFER_SIZE /** * @brief Structure for sending AOF to Spark Cluster @@ -34,27 +37,32 @@ * Structure for sending AOF to Spark Cluster */ struct entropy { - struct context *ctx; - uint16_t port; /* port */ - struct string addr; /* address */ - int64_t entropy_ts; /* timestamp of dynomite */ - pthread_t tid; /* aggregator thread */ - int interval; /* entropy aggregation interval */ - int sd; /* socket descriptor */ - int redis_sd; /* Redis socket descriptor for AOF */ + struct context *ctx; + uint16_t port; /* port */ + struct string addr; /* address */ + int64_t entropy_ts; /* timestamp of dynomite */ + pthread_t tid; /* aggregator thread */ + int interval; /* entropy aggregation interval */ + int sd; /* socket descriptor */ + int redis_sd; /* Redis socket descriptor for AOF */ }; -struct entropy *entropy_init(struct context *ctx, uint16_t entropy_port, char *entropy_ip); +struct entropy *entropy_init(struct context *ctx, uint16_t entropy_port, + char *entropy_ip); void *entropy_loop(void *arg); rstatus_t entropy_conn_start(struct entropy *cn); void entropy_conn_destroy(struct entropy *cn); rstatus_t entropy_listen(struct entropy *cn); -int entropy_encrypt(unsigned char *plaintext, int plaintext_len, unsigned char *ciphertext); -int entropy_decrypt(unsigned char *plaintext, int plaintext_len, unsigned char *ciphertext); +int entropy_encrypt(unsigned 
char *plaintext, int plaintext_len, + unsigned char *ciphertext); +int entropy_decrypt(unsigned char *plaintext, int plaintext_len, + unsigned char *ciphertext); rstatus_t entropy_key_iv_load(struct context *ctx); -rstatus_t entropy_snd_start(int peer_socket, int header_size, int buffer_size, int cipher_size); -rstatus_t entropy_rcv_start(int peer_socket, int header_size, int buffer_size, int cipher_size); - +rstatus_t entropy_snd_start(int peer_socket, int header_size, int buffer_size, + int cipher_size); +rstatus_t entropy_rcv_start(int peer_socket, int header_size, int buffer_size, + int cipher_size); +#endif /* _DYN_ENTROPY_H_ */ diff --git a/src/entropy/dyn_entropy_rcv.c b/src/entropy/dyn_entropy_rcv.c index 095e75ab4..272207c8c 100644 --- a/src/entropy/dyn_entropy_rcv.c +++ b/src/entropy/dyn_entropy_rcv.c @@ -1,6 +1,6 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2015 Netflix, Inc. + * Dynomite - A thin, distributed replication layer for multi non-distributed + *storages. Copyright (C) 2015 Netflix, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,26 +15,23 @@ * limitations under the License. */ - +#include // for open +#include // to do ceil for number of chunks #include #include -#include // for open -#include //for close -#include // to do ceil for number of chunks +#include //for close #include -#include #include +#include -#include -#include -#include #include +#include +#include +#include #include "dyn_core.h" - - /* * Function: entropy_redis_connector * -------------------- @@ -42,28 +39,27 @@ * returns: rstatus_t for the status of opening of the redis connection. 
*/ -static int -entropy_redis_connector(){ - loga("trying to connect to Redis..."); - - struct sockaddr_in serv_addr; - int sockfd = socket(AF_INET, SOCK_STREAM, 0); - if (sockfd < 0){ - log_error("open socket to Redis failed"); - return -1; - } - bzero((char *) &serv_addr, sizeof(serv_addr)); - serv_addr.sin_family = AF_INET; - serv_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); /* set destination IP number - localhost, 127.0.0.1*/ - serv_addr.sin_port = htons(22122); - if (connect(sockfd,(struct sockaddr *)&serv_addr,sizeof(serv_addr)) < 0){ - log_error("connecting to Redis failed"); - return -1; - } - - loga("redis-server connection established: %d", sockfd); - return sockfd; - +static int entropy_redis_connector() { + loga("trying to connect to Redis..."); + + struct sockaddr_in serv_addr; + int sockfd = socket(AF_INET, SOCK_STREAM, 0); + if (sockfd < 0) { + log_error("open socket to Redis failed"); + return -1; + } + bzero((char *)&serv_addr, sizeof(serv_addr)); + serv_addr.sin_family = AF_INET; + serv_addr.sin_addr.s_addr = htonl( + INADDR_LOOPBACK); /* set destination IP number - localhost, 127.0.0.1*/ + serv_addr.sin_port = htons(22122); + if (connect(sockfd, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) { + log_error("connecting to Redis failed"); + return -1; + } + + loga("redis-server connection established: %d", sockfd); + return sockfd; } /* @@ -73,135 +69,125 @@ entropy_redis_connector(){ * Receives the keys from the entropy engine * and pushes them to Redis. 
*/ -rstatus_t -entropy_rcv_start(int peer_socket, int header_size, int buffer_size, int cipher_size){ - - int redis_socket = 0; - char aof[buffer_size]; - char buff[buffer_size]; - unsigned char ciphertext[cipher_size]; - int32_t keyValueLength; - int32_t tempInt; - int i = 0; - int numberOfKeys; - int redis_written_bytes = 0; - - - - /* Check the encryption flag and initialize the crypto */ - if(DECRYPT_FLAG == 1){ - entropy_crypto_init(); +rstatus_t entropy_rcv_start(int peer_socket, int header_size, int buffer_size, + int cipher_size) { + int redis_socket = 0; + char aof[buffer_size]; + char buff[buffer_size]; + unsigned char ciphertext[cipher_size]; + int32_t keyValueLength; + int32_t tempInt; + int i = 0; + int numberOfKeys; + int redis_written_bytes = 0; + + /* Check the encryption flag and initialize the crypto */ + if (DECRYPT_FLAG == 1) { + entropy_crypto_init(); + } else { + loga("Encryption is disabled for entropy receiver"); + } + + /* Processing header for number of Keys */ + if (DECRYPT_FLAG == 1) { + int bytesRead = read(peer_socket, ciphertext, cipher_size); + if (bytesRead < 1) { + log_error("Error on receiving number of keys --> %s", strerror(errno)); + goto error; } - else{ - loga("Encryption is disabled for entropy receiver"); + loga("Bytes read %d", bytesRead); + if (entropy_decrypt(ciphertext, buffer_size, buff) < 0) { + log_error("Error decrypting the AOF file size"); + goto error; } + numberOfKeys = ntohl(buff); - /* Processing header for number of Keys */ - if(DECRYPT_FLAG == 1) { - int bytesRead = read(peer_socket, ciphertext, cipher_size); - if( bytesRead < 1 ){ - log_error("Error on receiving number of keys --> %s", strerror(errno)); - goto error; - } - loga("Bytes read %d", bytesRead); - if( entropy_decrypt (ciphertext, buffer_size, buff) < 0 ) - { - log_error("Error decrypting the AOF file size"); - goto error; - } - numberOfKeys = ntohl(buff); - + } else { + if (read(peer_socket, &tempInt, sizeof(int32_t)) < 1) { + log_error("Error on 
receiving number of keys --> %s", strerror(errno)); + goto error; } - else{ - if( read(peer_socket, &tempInt, sizeof(int32_t)) < 1 ){ - log_error("Error on receiving number of keys --> %s", strerror(errno)); - goto error; - } - numberOfKeys = ntohl(tempInt); + numberOfKeys = ntohl(tempInt); + } + if (numberOfKeys < 0) { + log_error("receive header not processed properly"); + goto error; + } else if (numberOfKeys == 0) { + log_error("no keys sent"); + goto error; + } + loga("Expected number of keys: %d", numberOfKeys); + + /* Connect to redis-server */ + redis_socket = entropy_redis_connector(); + if (redis_socket == -1) { + goto error; + } + + /* Iterating around the keys */ + for (i = 0; i < numberOfKeys; i++) { + /* + * if the encrypt flag is set then, we need to decrypt the aof size + * and then decrypt the key/OldValue/newValue in Redis serialized format. + */ + if (DECRYPT_FLAG == 1) { + if (read(peer_socket, ciphertext, cipher_size) < 1) { + log_error("Error on receiving aof size --> %s", strerror(errno)); + goto error; + } + if (entropy_decrypt(ciphertext, buffer_size, buff) < 0) { + log_error("Error decrypting the buffer for AOF file size"); + goto error; + } + keyValueLength = ntohl(buff); + log_info("AOF Length: %d", keyValueLength); + memset(&aof[0], 0, sizeof(aof)); + if (read(peer_socket, ciphertext, cipher_size) < 1) { + log_error("Error on receiving aof size --> %s", strerror(errno)); + goto error; + } + if (entropy_decrypt(ciphertext, buffer_size, aof) < + 0) // TODO: I am not sure the buffer_size is correct here. 
+ { + log_error("Error decrypting the buffer for key/oldValue/newValue"); + goto error; + } + } else { + /* Step 1: Read the key/Value size */ + if (read(peer_socket, &keyValueLength, sizeof(int32_t)) < 1) { + log_error("Error on receiving aof size --> %s", strerror(errno)); + goto error; + } + keyValueLength = ntohl(keyValueLength); + log_info("AOF Length: %d", keyValueLength); + memset(&aof[0], 0, sizeof(aof)); + + /* Step 2: Read the key/Value using the keyValueLength */ + if (read(peer_socket, &aof, keyValueLength) < 1) { + log_error("Error on receiving aof file --> %s", strerror(errno)); + goto error; + } } - if (numberOfKeys < 0) { - log_error("receive header not processed properly"); - goto error; - } - else if (numberOfKeys == 0) { - log_error("no keys sent"); - goto error; - } - loga("Expected number of keys: %d", numberOfKeys); - - /* Connect to redis-server */ - redis_socket = entropy_redis_connector(); - if(redis_socket == -1){ - goto error; + loga("Key: %d/%d - Redis serialized form: \n%s", i + 1, numberOfKeys, aof); + redis_written_bytes = write(redis_socket, &aof, keyValueLength); + if (redis_written_bytes < 1) { + log_error("Error on writing to Redis, bytes: %d --> %s", + redis_written_bytes, strerror(errno)); + goto error; } + loga("Bytes written to Redis %d", redis_written_bytes); + } - /* Iterating around the keys */ - for(i=0; i %s", strerror(errno)); - goto error; - } - if( entropy_decrypt (ciphertext, buffer_size, buff) < 0 ) - { - log_error("Error decrypting the buffer for AOF file size"); - goto error; - } - keyValueLength = ntohl(buff); - log_info("AOF Length: %d", keyValueLength); - memset(&aof[0], 0, sizeof(aof)); - if( read(peer_socket, ciphertext, cipher_size) < 1 ){ - log_error("Error on receiving aof size --> %s", strerror(errno)); - goto error; - } - if( entropy_decrypt (ciphertext, buffer_size, aof) < 0 ) //TODO: I am not sure the buffer_size is correct here. 
- { - log_error("Error decrypting the buffer for key/oldValue/newValue"); - goto error; - } - } - else{ - /* Step 1: Read the key/Value size */ - if( read(peer_socket, &keyValueLength, sizeof(int32_t)) < 1 ){ - log_error("Error on receiving aof size --> %s", strerror(errno)); - goto error; - } - keyValueLength = ntohl(keyValueLength); - log_info("AOF Length: %d", keyValueLength); - memset(&aof[0], 0, sizeof(aof)); - - /* Step 2: Read the key/Value using the keyValueLength */ - if( read(peer_socket, &aof, keyValueLength) < 1 ){ - log_error("Error on receiving aof file --> %s", strerror(errno)); - goto error; - } - } - loga("Key: %d/%d - Redis serialized form: \n%s", i+1,numberOfKeys,aof); - redis_written_bytes = write(redis_socket, &aof, keyValueLength); - if( redis_written_bytes < 1 ){ - log_error("Error on writing to Redis, bytes: %d --> %s", redis_written_bytes, strerror(errno)); - goto error; - } - loga("Bytes written to Redis %d", redis_written_bytes); - } - - if(redis_socket > -1) - close(redis_socket); + if (redis_socket > -1) close(redis_socket); - return DN_OK; + return DN_OK; error: - if(redis_socket > -1){ - close(redis_socket); - log_error("entropy rcv closing redis socket because of error."); - } + if (redis_socket > -1) { + close(redis_socket); + log_error("entropy rcv closing redis socket because of error."); + } - return DN_ERROR; + return DN_ERROR; } - diff --git a/src/entropy/dyn_entropy_snd.c b/src/entropy/dyn_entropy_snd.c index 125eece67..aebaf8699 100644 --- a/src/entropy/dyn_entropy_snd.c +++ b/src/entropy/dyn_entropy_snd.c @@ -1,6 +1,6 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2015 Netflix, Inc. + * Dynomite - A thin, distributed replication layer for multi non-distributed + *storages. Copyright (C) 2015 Netflix, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,26 +15,26 @@ * limitations under the License. */ +#include // for open +#include // to do ceil for number of chunks #include #include -#include // for open -#include //for close -#include // to do ceil for number of chunks +#include //for close #include -#include #include +#include -#include -#include -#include #include +#include +#include +#include #include "dyn_core.h" -#define LOG_CHUNK_LEVEL 1000 // every how many chunks to log -#define THROUGHPUT_THROTTLE 10000000 -#define AOF_TO_SEND "/mnt/data/nfredis/appendonly.aof" /* add in .yml */ +#define LOG_CHUNK_LEVEL 1000 // every how many chunks to log +#define THROUGHPUT_THROTTLE 10000000 +#define AOF_TO_SEND "/mnt/data/nfredis/appendonly.aof" /* add in .yml */ /* * Function: entropy_redis_compact_aof @@ -45,30 +45,29 @@ * If the second time fails, the Socket to spark is closed. */ -static rstatus_t -entropy_redis_compact_aof(int buffer_size){ - char command[buffer_size]; - int sys_ret = 0; - - memset(&command[0], 0, sizeof(command)); - sprintf(command, "redis-cli -p 22122 bgrewriteaof"); +static rstatus_t entropy_redis_compact_aof(int buffer_size) { + char command[buffer_size]; + int sys_ret = 0; + + memset(&command[0], 0, sizeof(command)); + sprintf(command, "redis-cli -p 22122 bgrewriteaof"); + sys_ret = system(command); + if (sys_ret < 0) { + log_error("Error on system call --> %s", strerror(errno)); + loga("Thread sleeping 10 seconds and retrying"); + sleep(10); sys_ret = system(command); - if( sys_ret < 0 ){ - log_error("Error on system call --> %s", strerror(errno)); - loga("Thread sleeping 10 seconds and retrying"); - sleep(10); - sys_ret = system(command); - if( sys_ret < 0 ){ - log_error("Error on bgrewriteaof for seconds time --> %s", strerror(errno)); - return DN_ERROR; - } + if (sys_ret < 0) { + log_error("Error on bgrewriteaof for seconds time --> %s", + strerror(errno)); + return DN_ERROR; } - else if( sys_ret > 0 ){ - log_error("Cannot connect to Redis on port 22122: %d", sys_ret); - return 
DN_ERROR; - } - loga("Redis BGREWRITEAOF completed"); - return DN_OK; + } else if (sys_ret > 0) { + log_error("Cannot connect to Redis on port 22122: %d", sys_ret); + return DN_ERROR; + } + loga("Redis BGREWRITEAOF completed"); + return DN_OK; } /* @@ -79,35 +78,34 @@ entropy_redis_compact_aof(int buffer_size){ * Header Format: file size | encryption | data store * */ -static rstatus_t -header_send(struct stat file_stat, int peer_socket, int header_size) -{ - char header_buff[header_size]; - ssize_t transmit_len; - - memset(&header_buff[0], 0, sizeof(header_buff)); - header_buff[0] = (int)((((int)file_stat.st_size) >> 24) & 0xFF); - header_buff[1] = (int)((((int)file_stat.st_size) >> 16) & 0xFF); - header_buff[2] = (int)((((int)file_stat.st_size) >> 8) & 0XFF); - header_buff[3] = (int)((((int)file_stat.st_size) & 0XFF)); - - // TODO: encrypt flag does not have to be int but a single byte. - header_buff[4] = (int)((ENCRYPT_FLAG >> 24) & 0xFF); - header_buff[5] = (int)((ENCRYPT_FLAG >> 16) & 0xFF); - header_buff[6] = (int)((ENCRYPT_FLAG >> 8) & 0XFF); - header_buff[7] = (int)((ENCRYPT_FLAG & 0XFF)); - - //TODO: we can add data store information as well - - transmit_len = send(peer_socket, header_buff, sizeof(header_buff), 0); - if (transmit_len < 0) - { - log_error("Error on sending AOF file size --> %s", strerror(errno)); - return DN_ERROR; - } +static rstatus_t header_send(struct stat file_stat, int peer_socket, + int header_size) { + char header_buff[header_size]; + ssize_t transmit_len; + + memset(&header_buff[0], 0, sizeof(header_buff)); + header_buff[0] = (int)((((int)file_stat.st_size) >> 24) & 0xFF); + header_buff[1] = (int)((((int)file_stat.st_size) >> 16) & 0xFF); + header_buff[2] = (int)((((int)file_stat.st_size) >> 8) & 0XFF); + header_buff[3] = (int)((((int)file_stat.st_size) & 0XFF)); + + // TODO: encrypt flag does not have to be int but a single byte. 
+ header_buff[4] = (int)((ENCRYPT_FLAG >> 24) & 0xFF); + header_buff[5] = (int)((ENCRYPT_FLAG >> 16) & 0xFF); + header_buff[6] = (int)((ENCRYPT_FLAG >> 8) & 0XFF); + header_buff[7] = (int)((ENCRYPT_FLAG & 0XFF)); + + // TODO: we can add data store information as well + + transmit_len = send(peer_socket, header_buff, sizeof(header_buff), 0); + if (transmit_len < 0) { + log_error("Error on sending AOF file size --> %s", strerror(errno)); + return DN_ERROR; + } - loga("The size of header is %d",sizeof(header_buff)); //TODO: this can be moved to log_info - return DN_OK; + loga("The size of header is %d", + sizeof(header_buff)); // TODO: this can be moved to log_info + return DN_OK; } /* @@ -117,13 +115,14 @@ header_send(struct stat file_stat, int peer_socket, int header_size) * Logging statistics about the transfer; * */ -static void -entropy_snd_stats(int current_chunk, time_t elapsed_time, double chunk_thr, double byte_thr){ - - if(elapsed_time > 0 && current_chunk > 0){ - loga("Transferring chunk %d (%.2f chunks/sec" - " -- %.2f MB/sec)", current_chunk, chunk_thr, byte_thr); - } +static void entropy_snd_stats(int current_chunk, time_t elapsed_time, + double chunk_thr, double byte_thr) { + if (elapsed_time > 0 && current_chunk > 0) { + loga( + "Transferring chunk %d (%.2f chunks/sec" + " -- %.2f MB/sec)", + current_chunk, chunk_thr, byte_thr); + } } /* @@ -132,196 +131,188 @@ entropy_snd_stats(int current_chunk, time_t elapsed_time, double chunk_thr, doub * * Processes the AOF and transmits to the entropy engine */ -rstatus_t -entropy_snd_start(int peer_socket, int header_size, int buffer_size, int cipher_size){ - - struct stat file_stat; - ssize_t transmit_len; - ssize_t data_trasmitted = 0; - FILE *fp = NULL; - int fd; - char data_buff[buffer_size]; - unsigned char ciphertext[cipher_size]; - int ciphertext_len = 0; - size_t aof_bytes_read; - int nchunk; - int i; //iterator for chunks - size_t last_chunk_size; - double chunk_thr = 0; - double byte_thr = 0; - time_t 
elapsed_time; - - /* compact AOF in Redis before sending to Spark */ - if(entropy_redis_compact_aof(buffer_size) == DN_ERROR){ - log_error("Redis failed to perform bgrewriteaof"); - goto error; +rstatus_t entropy_snd_start(int peer_socket, int header_size, int buffer_size, + int cipher_size) { + struct stat file_stat; + ssize_t transmit_len; + ssize_t data_trasmitted = 0; + FILE *fp = NULL; + int fd; + char data_buff[buffer_size]; + unsigned char ciphertext[cipher_size]; + int ciphertext_len = 0; + size_t aof_bytes_read; + int nchunk; + int i; // iterator for chunks + size_t last_chunk_size; + double chunk_thr = 0; + double byte_thr = 0; + time_t elapsed_time; + + /* compact AOF in Redis before sending to Spark */ + if (entropy_redis_compact_aof(buffer_size) == DN_ERROR) { + log_error("Redis failed to perform bgrewriteaof"); + goto error; + } + /* short sleep to finish AOF rewriting */ + sleep(1); + + /* create a file pointer for the AOF */ + fp = fopen(AOF_TO_SEND, "r"); + if (fp == NULL) { + log_error("Error opening Redis AOF file: %s", strerror(errno)); + goto error; + } + + /* Get the file descriptor from the file pointer */ + fd = fileno(fp); + + /* Get the file size */ + if (fstat(fd, &file_stat) < 0) { + log_error("Error fstat --> %s", strerror(errno)); + goto error; + } + + /* No file AOF found to send */ + if (file_stat.st_size == 0) { + log_error("Cannot retrieve an AOF file in %s", AOF_TO_SEND); + goto error; + } + loga("Redis appendonly.aof ready to be sent"); + + /* sending header */ + if (header_send(file_stat, peer_socket, header_size) == DN_ERROR) { + goto error; + } + + /* Determine the number of chunks + * if the size of the file is larger than the Buffer size + * then split it, otherwise we need one chunk only. 
+ * */ + if (file_stat.st_size > buffer_size) { + nchunk = (int)(ceil(file_stat.st_size / buffer_size) + 1); + } else { + nchunk = 1; + } + + /* Last chunk size is calculated by subtracting from the total file size + * the size of each chunk excluding the last one. + */ + last_chunk_size = (long)(file_stat.st_size - (nchunk - 1) * buffer_size); + + loga( + "HEADER INFO: file size: %d -- buffer size: %d -- cipher size: %d -- " + "encryption: %d ", + (int)file_stat.st_size, buffer_size, cipher_size, ENCRYPT_FLAG); + loga("CHUNK INFO: number of chunks: %d -- last chunk size: %ld", nchunk, + last_chunk_size); + + time_t stats_start_time = time(NULL); + struct timeval now; + gettimeofday(&now, NULL); + time_t throttle_start_sec = now.tv_sec; + suseconds_t throttle_start_usec = now.tv_usec; + suseconds_t throttle_elapsed_usec; + suseconds_t throttle_current_rate_usec; + + int stat_chunks_in_window = 0; + ssize_t stat_bytes_in_window = 0; + ssize_t throttle_bytes = 0; + + for (i = 0; i < nchunk; i++) { + /* clear buffer before using it */ + memset(data_buff, 0, sizeof(data_buff)); + + /* Read file data in chunks of buffer_size bytes */ + if (i < nchunk - 1) { + aof_bytes_read = fread(data_buff, sizeof(char), buffer_size, fp); + } else { + aof_bytes_read = fread(data_buff, sizeof(char), last_chunk_size, fp); } - /* short sleep to finish AOF rewriting */ - sleep(1); - - /* create a file pointer for the AOF */ - fp = fopen(AOF_TO_SEND, "r"); - if (fp == NULL) - { - log_error("Error opening Redis AOF file: %s", strerror(errno)); - goto error; - } - - /* Get the file descriptor from the file pointer */ - fd = fileno(fp); - /* Get the file size */ - if (fstat(fd, &file_stat) < 0) - { - log_error("Error fstat --> %s", strerror(errno)); - goto error; + /* checking for errors */ + if (aof_bytes_read < 0) { + log_error("Error reading chunk of AOF file --> %s", strerror(errno)); + goto error; } - /* No file AOF found to send */ - if(file_stat.st_size == 0){ - log_error("Cannot 
retrieve an AOF file in %s", AOF_TO_SEND); - goto error; + /***** THROTTLER ******/ + + /* Capture the current time, the elapsed time, and the bytes */ + gettimeofday(&now, NULL); + throttle_elapsed_usec = (now.tv_sec - throttle_start_sec) * 1000000 + + now.tv_usec - throttle_start_usec; + throttle_bytes += aof_bytes_read; + + /* Determine the expected throughput on the usec level */ + throttle_current_rate_usec = + (suseconds_t)1000000 * throttle_bytes / THROUGHPUT_THROTTLE; + + /* if the rate is higher than the expected, then wait for the corresponding + * time to throttle it */ + if (throttle_current_rate_usec > throttle_elapsed_usec) { + usleep(throttle_current_rate_usec - throttle_elapsed_usec); + throttle_bytes = 0; + throttle_start_sec = now.tv_sec; + throttle_start_usec = now.tv_usec; } - loga("Redis appendonly.aof ready to be sent"); - - - /* sending header */ - if(header_send(file_stat, peer_socket, header_size)==DN_ERROR){ - goto error; + /******************/ + + if (ENCRYPT_FLAG == 1) { + if (i < nchunk - 1) { + ciphertext_len = entropy_encrypt(data_buff, buffer_size, ciphertext); + } else { + ciphertext_len = + entropy_encrypt(data_buff, last_chunk_size, ciphertext); + loga("Size of last chunk: %d", sizeof(data_buff)); + } + if (ciphertext_len < 0) { + log_error("Error encrypting the AOF chunk --> %s", strerror(errno)); + goto error; + } + transmit_len = send(peer_socket, ciphertext, sizeof(ciphertext), 0); + } else { + if (i < nchunk - 1) { + transmit_len = send(peer_socket, data_buff, buffer_size, 0); + } else { + transmit_len = send(peer_socket, data_buff, last_chunk_size, 0); + } } - /* Determine the number of chunks - * if the size of the file is larger than the Buffer size - * then split it, otherwise we need one chunk only. 
- * */ - if(file_stat.st_size > buffer_size){ - nchunk = (int)(ceil(file_stat.st_size/buffer_size) + 1); - } - else{ - nchunk = 1; - } - - /* Last chunk size is calculated by subtracting from the total file size - * the size of each chunk excluding the last one. - */ - last_chunk_size = (long)(file_stat.st_size - (nchunk-1) * buffer_size); - - loga("HEADER INFO: file size: %d -- buffer size: %d -- cipher size: %d -- encryption: %d ", - (int)file_stat.st_size, buffer_size, cipher_size, ENCRYPT_FLAG); - loga("CHUNK INFO: number of chunks: %d -- last chunk size: %ld", nchunk, last_chunk_size); - - time_t stats_start_time = time(NULL); - struct timeval now; - gettimeofday(&now, NULL); - time_t throttle_start_sec = now.tv_sec; - suseconds_t throttle_start_usec = now.tv_usec; - suseconds_t throttle_elapsed_usec; - suseconds_t throttle_current_rate_usec; - - int stat_chunks_in_window = 0; - ssize_t stat_bytes_in_window = 0; - ssize_t throttle_bytes = 0; - - for(i=0; i %s", strerror(errno)); - goto error; - } - - /***** THROTTLER ******/ - - /* Capture the current time, the elapsed time, and the bytes */ - gettimeofday(&now, NULL); - throttle_elapsed_usec = (now.tv_sec-throttle_start_sec)*1000000 + now.tv_usec-throttle_start_usec; - throttle_bytes += aof_bytes_read; - - /* Determine the expected throughput on the usec level */ - throttle_current_rate_usec = (suseconds_t) 1000000 * throttle_bytes/THROUGHPUT_THROTTLE; - - /* if the rate is higher than the expected, then wait for the corresponding time to throttle it */ - if (throttle_current_rate_usec > throttle_elapsed_usec ){ - usleep(throttle_current_rate_usec - throttle_elapsed_usec); - throttle_bytes = 0; - throttle_start_sec = now.tv_sec; - throttle_start_usec = now.tv_usec; - } - /******************/ - - if(ENCRYPT_FLAG == 1){ - if (i < nchunk-1){ - ciphertext_len = entropy_encrypt (data_buff, buffer_size, ciphertext); - } - else{ - ciphertext_len = entropy_encrypt (data_buff, last_chunk_size, ciphertext); - 
loga("Size of last chunk: %d", sizeof(data_buff)); - } - if(ciphertext_len < 0){ - log_error("Error encrypting the AOF chunk --> %s", strerror(errno)); - goto error; - } - transmit_len = send(peer_socket, ciphertext, sizeof(ciphertext), 0); - } - else{ - if(i %s", strerror(errno)); - log_error("Data transmitted up to error: %ld and chunks: %d", data_trasmitted, i+1); - goto error; - } - else if ( transmit_len == 0){ - loga("No data in chunk"); - } - else{ - data_trasmitted +=transmit_len; - stat_chunks_in_window++; - stat_bytes_in_window +=transmit_len; - - elapsed_time = time(NULL) - stats_start_time; - - if (elapsed_time > 0 && (i % LOG_CHUNK_LEVEL == 0 || i == nchunk)){ - chunk_thr = (double)(stat_chunks_in_window/elapsed_time); - byte_thr = (double)(stat_bytes_in_window/elapsed_time)/1000000; //Divide by 1M for MB - entropy_snd_stats(i, elapsed_time, chunk_thr, byte_thr); - stat_bytes_in_window = 0; - stat_chunks_in_window = 0; - stats_start_time = time(NULL); - } - } + if (transmit_len < 0) { + log_error("Error sending the AOF chunk --> %s", strerror(errno)); + log_error("Data transmitted up to error: %ld and chunks: %d", + data_trasmitted, i + 1); + goto error; + } else if (transmit_len == 0) { + loga("No data in chunk"); + } else { + data_trasmitted += transmit_len; + stat_chunks_in_window++; + stat_bytes_in_window += transmit_len; + + elapsed_time = time(NULL) - stats_start_time; + + if (elapsed_time > 0 && (i % LOG_CHUNK_LEVEL == 0 || i == nchunk)) { + chunk_thr = (double)(stat_chunks_in_window / elapsed_time); + byte_thr = (double)(stat_bytes_in_window / elapsed_time) / + 1000000; // Divide by 1M for MB + entropy_snd_stats(i, elapsed_time, chunk_thr, byte_thr); + stat_bytes_in_window = 0; + stat_chunks_in_window = 0; + stats_start_time = time(NULL); + } } + } - loga("Chunks transferred: %d ---> AOF transfer completed!", i); - if(fp!=NULL) - fclose(fp); + loga("Chunks transferred: %d ---> AOF transfer completed!", i); + if (fp != NULL) fclose(fp); - return 
DN_OK; + return DN_OK; error: - if(fp!=NULL) - fclose(fp); - - return DN_ERROR; + if (fp != NULL) fclose(fp); + return DN_ERROR; } - - diff --git a/src/entropy/dyn_entropy_util.c b/src/entropy/dyn_entropy_util.c index 6ffffd259..ea5879bb0 100644 --- a/src/entropy/dyn_entropy_util.c +++ b/src/entropy/dyn_entropy_util.c @@ -1,6 +1,6 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2016 Netflix, Inc. + * Dynomite - A thin, distributed replication layer for multi non-distributed + *storages. Copyright (C) 2016 Netflix, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,24 +15,23 @@ * limitations under the License. */ +#include // for open +#include // to do ceil for number of chunks #include #include -#include // for open -#include //for close -#include // to do ceil for number of chunks +#include //for close #include -#include #include +#include -#include -#include -#include #include +#include +#include +#include #include "dyn_core.h" - /** * Anti - Entropy * ------------ @@ -48,15 +47,16 @@ * by the external cluster. Dynomite processes the following header: * * 1. Dynomite entropy receives a header with the following information - * a. 4 Bytes: Magic number which consists of 64640000 + 000X, where X is the version - * b. 4 Bytes: Dynomite to send the snapshot (1) or to receive reconciled data (2) - * c. 4 Bytes: size of the header - * d. 4 Bytes: size of each chunk size (or else referred to as buffer_size) - * e. 4 Bytes: size of the cipher + * a. 4 Bytes: Magic number which consists of 64640000 + 000X, where X is the + * version b. 4 Bytes: Dynomite to send the snapshot (1) or to receive + * reconciled data (2) c. 4 Bytes: size of the header d. 4 Bytes: size of + * each chunk size (or else referred to as buffer_size) e. 
4 Bytes: size of + * the cipher * * //TODO: need to add the IV in the header from Spark ---> Dynomite * - * 2. Based on the fist byte the "dyn_entropy_snd.c" or "dyn_entropy_rcv.c" is invoked. + * 2. Based on the fist byte the "dyn_entropy_snd.c" or "dyn_entropy_rcv.c" is + * invoked. * * Dynomite Sender * --------------- @@ -75,7 +75,8 @@ * * Dynomite Receiver * --------------- - * The receiver first opens a connection with the Redis server to talk through RESP. + * The receiver first opens a connection with the Redis server to talk through + * RESP. * 3. Dynomite receiver receives * a. 4 Bytes: key length * b. key length Bytes : key @@ -87,21 +88,19 @@ * 4. Data are flushed to Redis. */ - /* Magic number for the protocol*/ #define MAGIC_NUMBER 64640001 /* Define max values so that Dynomite operates under limits */ #define MAX_HEADER_SIZE 1024 -#define MAX_BUFFER_SIZE 5120000 //5MB -#define MAX_CIPHER_SIZE 5120000 //5MB +#define MAX_BUFFER_SIZE 5120000 // 5MB +#define MAX_CIPHER_SIZE 5120000 // 5MB /* A 128 bit key */ static unsigned char *theKey = (unsigned char *)"0123456789012345"; /* A 128 bit IV */ -static unsigned char *theIv = (unsigned char*)"0123456789012345"; - +static unsigned char *theIv = (unsigned char *)"0123456789012345"; /* * Function: entropy_crypto_init @@ -109,14 +108,12 @@ static unsigned char *theIv = (unsigned char*)"0123456789012345"; * * Initialize crypto libraries per connection */ -void -entropy_crypto_init() -{ +void entropy_crypto_init() { #if OPENSSL_VERSION_NUMBER < 0x10100000L - ERR_load_crypto_strings(); - OpenSSL_add_all_algorithms(); + ERR_load_crypto_strings(); + OpenSSL_add_all_algorithms(); #endif - OPENSSL_config(NULL); + OPENSSL_config(NULL); } /* @@ -125,17 +122,13 @@ entropy_crypto_init() * * Clean crypto libraries per connection */ -void -entropy_crypto_deinit() -{ +void entropy_crypto_deinit() { #if OPENSSL_VERSION_NUMBER < 0x10100000L - EVP_cleanup(); - ERR_free_strings(); + EVP_cleanup(); + ERR_free_strings(); #endif } 
- - /* * Function: entropy_decrypt * -------------------- @@ -147,9 +140,8 @@ entropy_crypto_deinit() * */ -int -entropy_decrypt(unsigned char *ciphertext, int ciphertext_len, unsigned char *plaintext) -{ +int entropy_decrypt(unsigned char *ciphertext, int ciphertext_len, + unsigned char *plaintext) { EVP_CIPHER_CTX *ctx; int len; @@ -157,26 +149,24 @@ entropy_decrypt(unsigned char *ciphertext, int ciphertext_len, unsigned char *pl int plaintext_len = 0; /* Create and initialize the context */ - if(!(ctx = EVP_CIPHER_CTX_new())) - goto error; + if (!(ctx = EVP_CIPHER_CTX_new())) goto error; /* Initialize the decryption operation with 128 bit AES */ - if(1 != EVP_DecryptInit_ex(ctx, EVP_aes_128_cbc(), NULL, theKey, theIv)) - goto error; + if (1 != EVP_DecryptInit_ex(ctx, EVP_aes_128_cbc(), NULL, theKey, theIv)) + goto error; /* Provide the message to be decrypted, and obtain the encrypted output. * EVP_EncryptUpdate can be called multiple times if necessary */ - if(1 != EVP_DecryptUpdate(ctx, plaintext, &len, ciphertext, ciphertext_len)) - goto error; + if (1 != EVP_DecryptUpdate(ctx, plaintext, &len, ciphertext, ciphertext_len)) + goto error; plaintext_len = len; /* Finalize the decryption. Further ciphertext bytes may be written at * this stage. 
*/ - if(1 != EVP_DecryptFinal_ex(ctx, ciphertext + len, &len)) - goto error; + if (1 != EVP_DecryptFinal_ex(ctx, ciphertext + len, &len)) goto error; plaintext_len += len; @@ -187,14 +177,11 @@ entropy_decrypt(unsigned char *ciphertext, int ciphertext_len, unsigned char *pl error: - if(ctx != NULL) - EVP_CIPHER_CTX_free(ctx); + if (ctx != NULL) EVP_CIPHER_CTX_free(ctx); return DN_ERROR; - } - /* * Function: entropy_encrypt * -------------------- @@ -206,9 +193,8 @@ entropy_decrypt(unsigned char *ciphertext, int ciphertext_len, unsigned char *pl * */ -int -entropy_encrypt(unsigned char *plaintext, int plaintext_len, unsigned char *ciphertext) -{ +int entropy_encrypt(unsigned char *plaintext, int plaintext_len, + unsigned char *ciphertext) { EVP_CIPHER_CTX *ctx; int len; @@ -216,60 +202,50 @@ entropy_encrypt(unsigned char *plaintext, int plaintext_len, unsigned char *ciph int ciphertext_len = 0; /* Create and initialize the context */ - if(!(ctx = EVP_CIPHER_CTX_new())) - return DN_ERROR; + if (!(ctx = EVP_CIPHER_CTX_new())) return DN_ERROR; /* Padding */ - if(1 != EVP_CIPHER_CTX_set_padding(ctx,0)) - goto error; + if (1 != EVP_CIPHER_CTX_set_padding(ctx, 0)) goto error; /* Initialize the encryption operation with 256 bit AES */ - if(1 != EVP_EncryptInit_ex(ctx, EVP_aes_128_cbc(), NULL, theKey, theIv)) - goto error; + if (1 != EVP_EncryptInit_ex(ctx, EVP_aes_128_cbc(), NULL, theKey, theIv)) + goto error; /* Provide the message to be encrypted, and obtain the encrypted output. * EVP_EncryptUpdate can be called multiple times if necessary */ - if(1 != EVP_EncryptUpdate(ctx, ciphertext, &len, plaintext, plaintext_len)) - goto error; + if (1 != EVP_EncryptUpdate(ctx, ciphertext, &len, plaintext, plaintext_len)) + goto error; ciphertext_len = len; /* Finalize the encryption. Further ciphertext bytes may be written at * this stage. 
*/ - if(1 != EVP_EncryptFinal_ex(ctx, ciphertext + len, &len)) - goto error; + if (1 != EVP_EncryptFinal_ex(ctx, ciphertext + len, &len)) goto error; ciphertext_len += len; /* Clean up */ EVP_CIPHER_CTX_free(ctx); - // loga("Block size: %d", EVP_CIPHER_block_size(ctx) ); + // loga("Block size: %d", EVP_CIPHER_block_size(ctx) ); return ciphertext_len; error: - if(ctx != NULL) - EVP_CIPHER_CTX_free(ctx); - - return DN_ERROR; + if (ctx != NULL) EVP_CIPHER_CTX_free(ctx); + return DN_ERROR; } - /* * Function: entropy_conn_stop * -------------------- * closes the socket connection */ -static void -entropy_conn_stop(struct entropy *cn) -{ - close(cn->sd); -} +static void entropy_conn_stop(struct entropy *cn) { close(cn->sd); } /* * Function: entropy_conn_destroy @@ -277,11 +253,9 @@ entropy_conn_stop(struct entropy *cn) * Frees up the memory pointer for the connection */ -void -entropy_conn_destroy(struct entropy *cn) -{ - entropy_conn_stop(cn); - dn_free(cn); +void entropy_conn_destroy(struct entropy *cn) { + entropy_conn_stop(cn); + dn_free(cn); } /* @@ -291,47 +265,46 @@ entropy_conn_destroy(struct entropy *cn) * corresponding phases, e.g. socket, bind, listen etc. 
*/ -rstatus_t -entropy_listen(struct entropy *cn) -{ - rstatus_t status; - struct sockinfo si; +rstatus_t entropy_listen(struct entropy *cn) { + rstatus_t status; + struct sockinfo si; - status = dn_resolve(&cn->addr, cn->port, &si); - if (status < 0) { - return status; - } - - cn->sd = socket(si.family, SOCK_STREAM, 0); - if (cn->sd < 0) { - log_error("anti-entropy socket failed: %s", strerror(errno)); - return DN_ERROR; - } + status = dn_resolve(&cn->addr, cn->port, &si); + if (status < 0) { + return status; + } - status = dn_set_reuseaddr(cn->sd); - if (status < 0) { - log_error("anti-entropy set reuseaddr on m %d failed: %s", cn->sd, strerror(errno)); - return DN_ERROR; - } + cn->sd = socket(si.family, SOCK_STREAM, 0); + if (cn->sd < 0) { + log_error("anti-entropy socket failed: %s", strerror(errno)); + return DN_ERROR; + } - status = bind(cn->sd, (struct sockaddr *)&si.addr, si.addrlen); - if (status < 0) { - log_error(" anti-entropy bind on m %d to addr '%.*s:%u' failed: %s", cn->sd, - cn->addr.len, cn->addr.data, cn->port, strerror(errno)); - return DN_ERROR; - } + status = dn_set_reuseaddr(cn->sd); + if (status < 0) { + log_error("anti-entropy set reuseaddr on m %d failed: %s", cn->sd, + strerror(errno)); + return DN_ERROR; + } - status = listen(cn->sd, SOMAXCONN); - if (status < 0) { - log_error("anti-entropy listen on m %d failed: %s", cn->sd, strerror(errno)); - return DN_ERROR; - } + status = bind(cn->sd, (struct sockaddr *)&si.addr, si.addrlen); + if (status < 0) { + log_error(" anti-entropy bind on m %d to addr '%.*s:%u' failed: %s", cn->sd, + cn->addr.len, cn->addr.data, cn->port, strerror(errno)); + return DN_ERROR; + } + status = listen(cn->sd, SOMAXCONN); + if (status < 0) { + log_error("anti-entropy listen on m %d failed: %s", cn->sd, + strerror(errno)); + return DN_ERROR; + } - log_debug(LOG_NOTICE, "anti-entropy m %d listening on '%.*s:%u'", cn->sd, - cn->addr.len, cn->addr.data, cn->port); + log_debug(LOG_NOTICE, "anti-entropy m %d listening 
on '%.*s:%u'", cn->sd, + cn->addr.len, cn->addr.data, cn->port); - return DN_OK; + return DN_OK; } /* @@ -340,148 +313,146 @@ entropy_listen(struct entropy *cn) * * Loads the send IV from a file */ -rstatus_t -entropy_key_iv_load(struct context *ctx){ +rstatus_t entropy_key_iv_load(struct context *ctx) { + int fd; + struct stat file_stat; + unsigned char buff[BUFFER_SIZE]; - int fd; - struct stat file_stat; - unsigned char buff[BUFFER_SIZE]; + struct server_pool *pool = &ctx->pool; - struct server_pool *pool = &ctx->pool; + /* 1. Check if the String array of the file names has been allocated */ + if (string_empty(&pool->recon_key_file) || + string_empty(&pool->recon_iv_file)) { + log_error("Could NOT read entropy key or iv file"); + return DN_ERROR; + } - /* 1. Check if the String array of the file names has been allocated */ - if (string_empty(&pool->recon_key_file) || string_empty(&pool->recon_iv_file)) { - log_error("Could NOT read entropy key or iv file"); - return DN_ERROR; - } + /* 2. allocate char based on the length in the string arrays */ + char key_file_name[pool->recon_key_file.len + 1]; + char iv_file_name[pool->recon_iv_file.len + 1]; - /* 2. 
allocate char based on the length in the string arrays */ - char key_file_name[pool->recon_key_file.len + 1]; - char iv_file_name[pool->recon_iv_file.len + 1]; + /* copy the content to the allocated array */ + memcpy(key_file_name, pool->recon_key_file.data, pool->recon_key_file.len); + key_file_name[pool->recon_key_file.len] = '\0'; + memcpy(iv_file_name, pool->recon_iv_file.data, pool->recon_iv_file.len); + iv_file_name[pool->recon_iv_file.len] = '\0'; - /* copy the content to the allocated array */ - memcpy(key_file_name, pool->recon_key_file.data, pool->recon_key_file.len); - key_file_name[pool->recon_key_file.len] = '\0'; - memcpy(iv_file_name, pool->recon_iv_file.data, pool->recon_iv_file.len); - iv_file_name[pool->recon_iv_file.len] = '\0'; + loga("Key File name: %s - IV File name: %s", key_file_name, iv_file_name); - loga("Key File name: %s - IV File name: %s", key_file_name, iv_file_name); + /* 3. checking if the key and iv files exist using access */ + if (access(key_file_name, F_OK) < 0) { + log_error("Error: file %s does not exist", key_file_name); + return DN_ERROR; + } else if (access(iv_file_name, F_OK) < 0) { + log_error("Error: file %s does not exist", iv_file_name); + return DN_ERROR; + } - /* 3. checking if the key and iv files exist using access */ - if( access(key_file_name, F_OK ) < 0 ) { - log_error("Error: file %s does not exist", key_file_name); - return DN_ERROR; - } - else if( access(iv_file_name, F_OK ) < 0 ) { - log_error("Error: file %s does not exist", iv_file_name); - return DN_ERROR; - } + /* 4. loading the .pem files */ + FILE *key_file = fopen(key_file_name, "r"); + if (key_file == NULL) { + log_error("opening key.pem file failed %s", pool->recon_key_file); + return DN_ERROR; + } + FILE *iv_file = fopen(iv_file_name, "r"); + if (iv_file == NULL) { + log_error("opening iv.pem file failed %s", pool->recon_iv_file); + return DN_ERROR; + } - /* 4. 
loading the .pem files */ - FILE *key_file = fopen(key_file_name,"r"); - if(key_file == NULL){ - log_error("opening key.pem file failed %s", pool->recon_key_file); - return DN_ERROR; - } - FILE *iv_file = fopen(iv_file_name,"r"); - if(iv_file == NULL){ - log_error("opening iv.pem file failed %s", pool->recon_iv_file); - return DN_ERROR; - } - - /* 5. using the file descriptor to do some checking with the BUFFER_SIZE */ - fd = fileno(key_file); - if (fstat(fd, &file_stat) < 0) /* Get the file size */ - { - log_error("Error fstat --> %s", strerror(errno)); - return DN_ERROR; - } + /* 5. using the file descriptor to do some checking with the BUFFER_SIZE */ + fd = fileno(key_file); + if (fstat(fd, &file_stat) < 0) /* Get the file size */ + { + log_error("Error fstat --> %s", strerror(errno)); + return DN_ERROR; + } - if (file_stat.st_size > BUFFER_SIZE){ /* Compare file size with BUFFER_SIZE */ - log_error("key file size is bigger then the buffer size"); - return DN_ERROR; - } + if (file_stat.st_size > + BUFFER_SIZE) { /* Compare file size with BUFFER_SIZE */ + log_error("key file size is bigger then the buffer size"); + return DN_ERROR; + } - fd = fileno(iv_file); - if (fstat(fd, &file_stat) < 0) - { - log_error("Error fstat --> %s", strerror(errno)); - return DN_ERROR; - } + fd = fileno(iv_file); + if (fstat(fd, &file_stat) < 0) { + log_error("Error fstat --> %s", strerror(errno)); + return DN_ERROR; + } - if (file_stat.st_size > BUFFER_SIZE){ - log_error("IV file size is bigger then the buffer size"); - return DN_ERROR; - } + if (file_stat.st_size > BUFFER_SIZE) { + log_error("IV file size is bigger then the buffer size"); + return DN_ERROR; + } - /* 6. reading the files for the key and iv */ - if (fgets(buff,BUFFER_SIZE-1,key_file) == NULL){ - log_error("Processing Key file error"); - return DN_ERROR; - } + /* 6. 
reading the files for the key and iv */ + if (fgets(buff, BUFFER_SIZE - 1, key_file) == NULL) { + log_error("Processing Key file error"); + return DN_ERROR; + } // theKey = (unsigned char *)buff; - loga("key loaded: %s", theKey); + loga("key loaded: %s", theKey); - memset( buff, '\0', BUFFER_SIZE ); - if (fgets(buff,BUFFER_SIZE-1,iv_file) == NULL){ - log_error("Processing IV file error"); - return DN_ERROR; - } - // theIv = (unsigned char *)buff; - loga("iv loaded: %s", theIv); + memset(buff, '\0', BUFFER_SIZE); + if (fgets(buff, BUFFER_SIZE - 1, iv_file) == NULL) { + log_error("Processing IV file error"); + return DN_ERROR; + } + // theIv = (unsigned char *)buff; + loga("iv loaded: %s", theIv); - return DN_OK; + return DN_OK; } - /* * Function: entropy_snd_init * -------------------- - * Initiates the data for the connection towards another cluster for reconciliation. - * Loading of key/iv happens only once by calling entropy_key_iv_load, which is a util function. - * The same key/iv are reused for both entropy rcv and snd. + * Initiates the data for the connection towards another cluster for + * reconciliation. Loading of key/iv happens only once by calling + * entropy_key_iv_load, which is a util function. The same key/iv are reused for + * both entropy rcv and snd. * * returns: a entropy_conn structure with information about the connection * or NULL if a new thread cannot be picked up. 
*/ -struct entropy * -entropy_init( struct context *ctx, uint16_t entropy_port, char *entropy_ip) -{ +struct entropy *entropy_init(struct context *ctx, uint16_t entropy_port, + char *entropy_ip) { + rstatus_t status; + struct entropy *cn; - rstatus_t status; - struct entropy *cn; + cn = dn_alloc(sizeof(*cn)); + if (cn == NULL) { + log_error("Cannot allocate entropy structure"); + goto error; + } - cn = dn_alloc(sizeof(*cn)); - if (cn == NULL) { - log_error("Cannot allocate entropy structure"); - goto error; - } - - if(entropy_key_iv_load(ctx) == DN_ERROR){ //TODO: we do not need to do that if encryption flag is not set. - log_error("recon_key.pem or recon_iv.pem cannot be loaded properly"); - goto error; - } + if (entropy_key_iv_load(ctx) == DN_ERROR) { // TODO: we do not need to do + // that if encryption flag is not + // set. + log_error("recon_key.pem or recon_iv.pem cannot be loaded properly"); + goto error; + } - cn->port = entropy_port; - string_set_raw(&cn->addr, entropy_ip); + cn->port = entropy_port; + string_set_raw(&cn->addr, entropy_ip); - cn->entropy_ts = (int64_t)time(NULL); - cn->tid = (pthread_t) -1; //Initialize thread id to -1 - cn->sd = -1; // Initialize socket descriptor to -1 - cn->redis_sd = -1; // Initialize redis socket descriptor to -1 + cn->entropy_ts = (int64_t)time(NULL); + cn->tid = (pthread_t)-1; // Initialize thread id to -1 + cn->sd = -1; // Initialize socket descriptor to -1 + cn->redis_sd = -1; // Initialize redis socket descriptor to -1 - status = entropy_conn_start(cn); - if (status != DN_OK) { - goto error; - } + status = entropy_conn_start(cn); + if (status != DN_OK) { + goto error; + } - cn->ctx = ctx; - return cn; + cn->ctx = ctx; + return cn; error: - entropy_conn_destroy(cn); - return NULL; + entropy_conn_destroy(cn); + return NULL; } /* @@ -495,185 +466,177 @@ entropy_init( struct context *ctx, uint16_t entropy_port, char *entropy_ip) * entropy engine). 
*/ -static void -entropy_callback(void *arg1, void *arg2) -{ - - int n = *((int *)arg2); - struct entropy *st = arg1; - - if (n == 0) { - return; - } - - /* Check the encryption flag and initialize the crypto */ - if(ENCRYPT_FLAG == 1 || DECRYPT_FLAG == 1) { - entropy_crypto_init(); - } - else if (ENCRYPT_FLAG == 0) { - loga("Encryption is disabled for entropy sender"); - - } - else if (DECRYPT_FLAG == 0) { - loga("Decryption is disabled for entropy receiver"); - } - - /* accept the connection */ - int peer_socket = accept(st->sd, NULL, NULL); - if(peer_socket < 0){ - log_error("peer socket could not be established"); - goto error; - } - loga("Recon socket connection accepted"); //TODO: print information about the socket IP address. - - /* Read header from Lepton */ - uint32_t magic = 0; - if( read(peer_socket, &magic, sizeof(uint32_t)) < 1) { - log_error("Error on processing header from Lepton --> %s", strerror(errno)); - goto error; - } - magic = ntohl(magic); - - uint32_t sndOrRcv = 0; - if( read(peer_socket, &sndOrRcv, sizeof(uint32_t)) < 1) { - log_error("Error on processing header from Lepton --> %s", strerror(errno)); - goto error; - } - sndOrRcv = ntohl(sndOrRcv); - - uint32_t headerSize; - if( read(peer_socket, &headerSize, sizeof(uint32_t)) < 1) { - log_error("Error on processing header size from Lepton --> %s", strerror(errno)); - goto error; - } - headerSize = ntohl(headerSize); - - uint32_t bufferSize; - if( read(peer_socket, &bufferSize, sizeof(uint32_t)) < 1) { - log_error("Error on processing buffer size from Lepton --> %s", strerror(errno)); - goto error; - } - bufferSize = ntohl(bufferSize); - - uint32_t cipherSize; - if( read(peer_socket, &cipherSize, sizeof(uint32_t)) < 1) { - log_error("Error on processing cipher size from Lepton --> %s", strerror(errno)); - goto error; - } - cipherSize = ntohl(cipherSize); - - if (magic != MAGIC_NUMBER) { - log_error("Magic number not correct or not receiver properly --> %s ----> %d", strerror(errno),magic); - 
log_error("Expected magic number: %d", MAGIC_NUMBER); - goto error; - } - else{ - log_debug("Protocol magic number: %d", magic); - } - - if (sndOrRcv != 1 && sndOrRcv !=2) { - log_error("Error on receiving PULL/PUSH --> %s ----> %d", strerror(errno),sndOrRcv); - goto error; - } - - if (headerSize < 1 || headerSize > MAX_HEADER_SIZE){ - log_error("Header size was not received --> %d", headerSize); - goto error; - } - - if (bufferSize < 1 || bufferSize > MAX_BUFFER_SIZE){ - log_error("Buffer size was not received --> %d", bufferSize); - goto error; - } - - if (cipherSize < 1 || cipherSize > MAX_CIPHER_SIZE){ - log_error("Cipher size was not received --> %d", cipherSize); - goto error; - } - - - loga("Header size: %d Buffer size: %d Cipher size: %d", headerSize, bufferSize, cipherSize); - - if (cipherSize <= bufferSize){ - log_error("AES encryption does not allow cipher size to be smaller than buffer size " - "-- Cipher size: %d buffer size %d", cipherSize, bufferSize); - goto error; - } - if (sndOrRcv == 1) { - loga("PULL: Dynomite to send data to entropy engine"); - if (entropy_snd_start(peer_socket, headerSize, bufferSize, cipherSize) == DN_ERROR){ - log_error("Entropy send faced issue ---> cleaning resources"); - goto error; - } - else{ - loga("Entropy receive completed ---> cleaning resources"); - } - - } - else if (sndOrRcv == 2) { - loga("PUSH: Dynomite to receive data from entropy engine"); - if (entropy_rcv_start(peer_socket, headerSize, bufferSize, cipherSize) == DN_ERROR){ - log_error("Entropy receive faced issue ---> cleaning resources"); - goto error; - } - else{ - loga("Entropy send completed ---> cleaning resources"); - } - } - - - - if(ENCRYPT_FLAG == 1 || DECRYPT_FLAG == 1) - entropy_crypto_deinit(); - - if(peer_socket > -1) - close(peer_socket); +static void entropy_callback(void *arg1, void *arg2) { + int n = *((int *)arg2); + struct entropy *st = arg1; + if (n == 0) { return; + } + + /* Check the encryption flag and initialize the crypto */ + if 
(ENCRYPT_FLAG == 1 || DECRYPT_FLAG == 1) { + entropy_crypto_init(); + } else if (ENCRYPT_FLAG == 0) { + loga("Encryption is disabled for entropy sender"); + + } else if (DECRYPT_FLAG == 0) { + loga("Decryption is disabled for entropy receiver"); + } + + /* accept the connection */ + int peer_socket = accept(st->sd, NULL, NULL); + if (peer_socket < 0) { + log_error("peer socket could not be established"); + goto error; + } + loga("Recon socket connection accepted"); // TODO: print information about + // the socket IP address. + + /* Read header from Lepton */ + uint32_t magic = 0; + if (read(peer_socket, &magic, sizeof(uint32_t)) < 1) { + log_error("Error on processing header from Lepton --> %s", strerror(errno)); + goto error; + } + magic = ntohl(magic); + + uint32_t sndOrRcv = 0; + if (read(peer_socket, &sndOrRcv, sizeof(uint32_t)) < 1) { + log_error("Error on processing header from Lepton --> %s", strerror(errno)); + goto error; + } + sndOrRcv = ntohl(sndOrRcv); + + uint32_t headerSize; + if (read(peer_socket, &headerSize, sizeof(uint32_t)) < 1) { + log_error("Error on processing header size from Lepton --> %s", + strerror(errno)); + goto error; + } + headerSize = ntohl(headerSize); + + uint32_t bufferSize; + if (read(peer_socket, &bufferSize, sizeof(uint32_t)) < 1) { + log_error("Error on processing buffer size from Lepton --> %s", + strerror(errno)); + goto error; + } + bufferSize = ntohl(bufferSize); + + uint32_t cipherSize; + if (read(peer_socket, &cipherSize, sizeof(uint32_t)) < 1) { + log_error("Error on processing cipher size from Lepton --> %s", + strerror(errno)); + goto error; + } + cipherSize = ntohl(cipherSize); + + if (magic != MAGIC_NUMBER) { + log_error( + "Magic number not correct or not receiver properly --> %s ----> %d", + strerror(errno), magic); + log_error("Expected magic number: %d", MAGIC_NUMBER); + goto error; + } else { + log_debug("Protocol magic number: %d", magic); + } + + if (sndOrRcv != 1 && sndOrRcv != 2) { + log_error("Error on 
receiving PULL/PUSH --> %s ----> %d", strerror(errno), + sndOrRcv); + goto error; + } + + if (headerSize < 1 || headerSize > MAX_HEADER_SIZE) { + log_error("Header size was not received --> %d", headerSize); + goto error; + } + + if (bufferSize < 1 || bufferSize > MAX_BUFFER_SIZE) { + log_error("Buffer size was not received --> %d", bufferSize); + goto error; + } + + if (cipherSize < 1 || cipherSize > MAX_CIPHER_SIZE) { + log_error("Cipher size was not received --> %d", cipherSize); + goto error; + } + + loga("Header size: %d Buffer size: %d Cipher size: %d", headerSize, + bufferSize, cipherSize); + + if (cipherSize <= bufferSize) { + log_error( + "AES encryption does not allow cipher size to be smaller than buffer " + "size " + "-- Cipher size: %d buffer size %d", + cipherSize, bufferSize); + goto error; + } + if (sndOrRcv == 1) { + loga("PULL: Dynomite to send data to entropy engine"); + if (entropy_snd_start(peer_socket, headerSize, bufferSize, cipherSize) == + DN_ERROR) { + log_error("Entropy send faced issue ---> cleaning resources"); + goto error; + } else { + loga("Entropy receive completed ---> cleaning resources"); + } + + } else if (sndOrRcv == 2) { + loga("PUSH: Dynomite to receive data from entropy engine"); + if (entropy_rcv_start(peer_socket, headerSize, bufferSize, cipherSize) == + DN_ERROR) { + log_error("Entropy receive faced issue ---> cleaning resources"); + goto error; + } else { + loga("Entropy send completed ---> cleaning resources"); + } + } + + if (ENCRYPT_FLAG == 1 || DECRYPT_FLAG == 1) entropy_crypto_deinit(); + + if (peer_socket > -1) close(peer_socket); + + return; /* resource cleanup */ error: - if(ENCRYPT_FLAG == 1 || DECRYPT_FLAG == 1) - entropy_crypto_deinit(); - - if(peer_socket > -1) - close(peer_socket); + if (ENCRYPT_FLAG == 1 || DECRYPT_FLAG == 1) entropy_crypto_deinit(); - return; + if (peer_socket > -1) close(peer_socket); + return; } -void * -entropy_loop(void *arg) -{ - event_loop_entropy(entropy_callback, arg); - return 
NULL; +void *entropy_loop(void *arg) { + event_loop_entropy(entropy_callback, arg); + return NULL; } - /* * Function: entropy_conn_start * -------------------- - * Checks if resources are available, and initializes the connection information. - * Loads the IV and creates a new thread to loop for the entropy receive. + * Checks if resources are available, and initializes the connection + * information. Loads the IV and creates a new thread to loop for the entropy + * receive. * * returns: rstatus_t for the status of opening of the new connection. */ -rstatus_t -entropy_conn_start(struct entropy *cn) -{ - rstatus_t status; +rstatus_t entropy_conn_start(struct entropy *cn) { + rstatus_t status; - THROW_STATUS(entropy_listen(cn)); + THROW_STATUS(entropy_listen(cn)); - status = pthread_create(&cn->tid, NULL, entropy_loop, cn); - if (status < 0) { - log_error("reconciliation thread for socket create failed: %s", strerror(status)); - return DN_ERROR; - } + status = pthread_create(&cn->tid, NULL, entropy_loop, cn); + if (status < 0) { + log_error("reconciliation thread for socket create failed: %s", + strerror(status)); + return DN_ERROR; + } - return DN_OK; + return DN_OK; } - diff --git a/src/event/dyn_epoll.c b/src/event/dyn_epoll.c index 4cbaf775d..bf3fa188b 100644 --- a/src/event/dyn_epoll.c +++ b/src/event/dyn_epoll.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. 
@@ -24,379 +24,352 @@ #ifdef DN_HAVE_EPOLL -#include #include +#include -struct event_base * -event_base_create(int nevent, event_cb_t cb) -{ - struct event_base *evb; - int status, ep; - struct epoll_event *event; +struct event_base *event_base_create(int nevent, event_cb_t cb) { + struct event_base *evb; + int status, ep; + struct epoll_event *event; - ASSERT(nevent > 0); + ASSERT(nevent > 0); - ep = epoll_create(nevent); - if (ep < 0) { - log_error("epoll create of size %d failed: %s", nevent, strerror(errno)); - return NULL; - } + ep = epoll_create(nevent); + if (ep < 0) { + log_error("epoll create of size %d failed: %s", nevent, strerror(errno)); + return NULL; + } - event = dn_calloc(nevent, sizeof(*event)); - if (event == NULL) { - status = close(ep); - if (status < 0) { - log_error("close e %d failed, ignored: %s", ep, strerror(errno)); - } - return NULL; + event = dn_calloc(nevent, sizeof(*event)); + if (event == NULL) { + status = close(ep); + if (status < 0) { + log_error("close e %d failed, ignored: %s", ep, strerror(errno)); } + return NULL; + } - evb = dn_alloc(sizeof(*evb)); - if (evb == NULL) { - dn_free(event); - status = close(ep); - if (status < 0) { - log_error("close e %d failed, ignored: %s", ep, strerror(errno)); - } - return NULL; + evb = dn_alloc(sizeof(*evb)); + if (evb == NULL) { + dn_free(event); + status = close(ep); + if (status < 0) { + log_error("close e %d failed, ignored: %s", ep, strerror(errno)); } + return NULL; + } - evb->ep = ep; - evb->event = event; - evb->nevent = nevent; - evb->cb = cb; + evb->ep = ep; + evb->event = event; + evb->nevent = nevent; + evb->cb = cb; - log_debug(LOG_INFO, "e %d with nevent %d", evb->ep, evb->nevent); + log_debug(LOG_INFO, "e %d with nevent %d", evb->ep, evb->nevent); - return evb; + return evb; } -void -event_base_destroy(struct event_base *evb) -{ - int status; +void event_base_destroy(struct event_base *evb) { + int status; - if (evb == NULL) { - return; - } + if (evb == NULL) { + return; + 
} - ASSERT(evb->ep >= 0); + ASSERT(evb->ep >= 0); - dn_free(evb->event); + dn_free(evb->event); - status = close(evb->ep); - if (status < 0) { - log_error("close e %d failed, ignored: %s", evb->ep, strerror(errno)); - } - evb->ep = -1; + status = close(evb->ep); + if (status < 0) { + log_error("close e %d failed, ignored: %s", evb->ep, strerror(errno)); + } + evb->ep = -1; - dn_free(evb); + dn_free(evb); } -int -event_add_in(struct event_base *evb, struct conn *c) -{ - int status; - struct epoll_event event; - int ep = evb->ep; +int event_add_in(struct event_base *evb, struct conn *c) { + int status; + struct epoll_event event; + int ep = evb->ep; - ASSERT(ep >= 0); - ASSERT(c != NULL); - ASSERT(c->sd > 0); + ASSERT(ep >= 0); + ASSERT(c != NULL); + ASSERT(c->sd > 0); - if (c->recv_active) { - return 0; - } + if (c->recv_active) { + return 0; + } - event.events = (uint32_t)(EPOLLIN); // | EPOLLET); - event.data.ptr = c; + event.events = (uint32_t)(EPOLLIN); // | EPOLLET); + event.data.ptr = c; - status = epoll_ctl(ep, EPOLL_CTL_MOD, c->sd, &event); - if (status < 0) { - log_error("epoll ctl on e %d sd %d failed: %s", ep, c->sd, - strerror(errno)); - } else { - c->recv_active = 1; - } + status = epoll_ctl(ep, EPOLL_CTL_MOD, c->sd, &event); + if (status < 0) { + log_error("epoll ctl on e %d sd %d failed: %s", ep, c->sd, strerror(errno)); + } else { + c->recv_active = 1; + } - return status; + return status; } -int -event_del_in(struct event_base *evb, struct conn *c) -{ - return 0; -} +int event_del_in(struct event_base *evb, struct conn *c) { return 0; } -int -event_add_out(struct event_base *evb, struct conn *c) -{ - int status; - struct epoll_event event; - int ep = evb->ep; +int event_add_out(struct event_base *evb, struct conn *c) { + int status; + struct epoll_event event; + int ep = evb->ep; - ASSERT(ep >= 0); - ASSERT(c != NULL); - ASSERT(c->sd > 0); - ASSERT(c->recv_active); + ASSERT(ep >= 0); + ASSERT(c != NULL); + ASSERT(c->sd > 0); + 
ASSERT(c->recv_active); - if (c->send_active) { - return 0; - } + if (c->send_active) { + return 0; + } - event.events = (uint32_t)(EPOLLIN | EPOLLOUT); // | EPOLLET); - event.data.ptr = c; + event.events = (uint32_t)(EPOLLIN | EPOLLOUT); // | EPOLLET); + event.data.ptr = c; - log_debug(LOG_DEBUG, "adding conn %s to active", print_obj(c)); - status = epoll_ctl(ep, EPOLL_CTL_MOD, c->sd, &event); - if (status < 0) { - log_error("epoll ctl on e %d sd %d failed: %s", ep, c->sd, - strerror(errno)); - } else { - c->send_active = 1; - } + log_debug(LOG_DEBUG, "adding conn %s to active", print_obj(c)); + status = epoll_ctl(ep, EPOLL_CTL_MOD, c->sd, &event); + if (status < 0) { + log_error("epoll ctl on e %d sd %d failed: %s", ep, c->sd, strerror(errno)); + } else { + c->send_active = 1; + } - return status; + return status; } -int -event_del_out(struct event_base *evb, struct conn *c) -{ - int status; - struct epoll_event event; - int ep = evb->ep; +int event_del_out(struct event_base *evb, struct conn *c) { + int status; + struct epoll_event event; + int ep = evb->ep; - ASSERT(ep >= 0); - ASSERT(c != NULL); - ASSERT(c->sd > 0); - ASSERT(c->recv_active); + ASSERT(ep >= 0); + ASSERT(c != NULL); + ASSERT(c->sd > 0); + ASSERT(c->recv_active); - if (!c->send_active) { - return 0; - } + if (!c->send_active) { + return 0; + } - event.events = (uint32_t)(EPOLLIN | EPOLLET); - event.data.ptr = c; + event.events = (uint32_t)(EPOLLIN | EPOLLET); + event.data.ptr = c; - log_debug(LOG_DEBUG, "removing conn %s from active", print_obj(c)); - status = epoll_ctl(ep, EPOLL_CTL_MOD, c->sd, &event); - if (status < 0) { - log_error("epoll ctl on e %d sd %d failed: %s", ep, c->sd, - strerror(errno)); - } else { - c->send_active = 0; - } + log_debug(LOG_DEBUG, "removing conn %s from active", print_obj(c)); + status = epoll_ctl(ep, EPOLL_CTL_MOD, c->sd, &event); + if (status < 0) { + log_error("epoll ctl on e %d sd %d failed: %s", ep, c->sd, strerror(errno)); + } else { + c->send_active = 0; + } 
- return status; + return status; } -int -event_add_conn(struct event_base *evb, struct conn *c) -{ - int status; - struct epoll_event event; - int ep = evb->ep; +int event_add_conn(struct event_base *evb, struct conn *c) { + int status; + struct epoll_event event; + int ep = evb->ep; - ASSERT(ep >= 0); - ASSERT(c != NULL); - ASSERT(c->sd > 0); + ASSERT(ep >= 0); + ASSERT(c != NULL); + ASSERT(c->sd > 0); - event.events = (uint32_t)(EPOLLIN | EPOLLOUT | EPOLLET); - event.data.ptr = c; + event.events = (uint32_t)(EPOLLIN | EPOLLOUT | EPOLLET); + event.data.ptr = c; - log_debug(LOG_DEBUG, "adding conn %s to active", print_obj(c)); - status = epoll_ctl(ep, EPOLL_CTL_ADD, c->sd, &event); - if (status < 0) { - log_error("epoll ctl on e %d sd %d failed: %s", ep, c->sd, - strerror(errno)); - } else { - c->send_active = 1; - c->recv_active = 1; - } + log_debug(LOG_DEBUG, "adding conn %s to active", print_obj(c)); + status = epoll_ctl(ep, EPOLL_CTL_ADD, c->sd, &event); + if (status < 0) { + log_error("epoll ctl on e %d sd %d failed: %s", ep, c->sd, strerror(errno)); + } else { + c->send_active = 1; + c->recv_active = 1; + } - return status; + return status; } -int -event_del_conn(struct event_base *evb, struct conn *c) -{ - int status; - int ep = evb->ep; +int event_del_conn(struct event_base *evb, struct conn *c) { + int status; + int ep = evb->ep; - ASSERT(ep >= 0); - ASSERT(c != NULL); - ASSERT(c->sd > 0); + ASSERT(ep >= 0); + ASSERT(c != NULL); + ASSERT(c->sd > 0); - log_debug(LOG_DEBUG, "removing conn %s from active", print_obj(c)); - status = epoll_ctl(ep, EPOLL_CTL_DEL, c->sd, NULL); - if (status < 0) { - log_error("epoll ctl on e %d sd %d failed: %s", ep, c->sd, - strerror(errno)); - } else { - c->recv_active = 0; - c->send_active = 0; - } + log_debug(LOG_DEBUG, "removing conn %s from active", print_obj(c)); + status = epoll_ctl(ep, EPOLL_CTL_DEL, c->sd, NULL); + if (status < 0) { + log_error("epoll ctl on e %d sd %d failed: %s", ep, c->sd, strerror(errno)); + } else 
{ + c->recv_active = 0; + c->send_active = 0; + } - return status; + return status; } -int -event_wait(struct event_base *evb, int timeout) -{ - int ep = evb->ep; - struct epoll_event *event = evb->event; - int nevent = evb->nevent; - - ASSERT(ep >= 0); - ASSERT(event != NULL); - ASSERT(nevent > 0); - - for (;;) { - int i, nsd; - - nsd = epoll_wait(ep, event, nevent, timeout); - if (nsd > 0) { - for (i = 0; i < nsd; i++) { - struct epoll_event *ev = &evb->event[i]; - uint32_t events = 0; - - log_debug(LOG_VVVERB, "epoll %04"PRIX32" triggered on conn %p", - ev->events, ev->data.ptr); - - if (ev->events & (EPOLLERR | EPOLLRDHUP)) { - events |= EVENT_ERR; - } - - if (ev->events & (EPOLLIN | EPOLLHUP)) { - events |= EVENT_READ; - } - - if (ev->events & EPOLLOUT) { - events |= EVENT_WRITE; - } - - if (evb->cb != NULL) { - evb->cb(ev->data.ptr, events); - } - } - return nsd; - } +int event_wait(struct event_base *evb, int timeout) { + int ep = evb->ep; + struct epoll_event *event = evb->event; + int nevent = evb->nevent; - if (nsd == 0) { - if (timeout == -1) { - log_error("epoll wait on e %d with %d events and %d timeout " - "returned no events", ep, nevent, timeout); - return -1; - } + ASSERT(ep >= 0); + ASSERT(event != NULL); + ASSERT(nevent > 0); - return 0; - } + for (;;) { + int i, nsd; - if (errno == EINTR) { - continue; - } + nsd = epoll_wait(ep, event, nevent, timeout); + if (nsd > 0) { + for (i = 0; i < nsd; i++) { + struct epoll_event *ev = &evb->event[i]; + uint32_t events = 0; - log_error("epoll wait on e %d with %d events failed: %s", ep, nevent, - strerror(errno)); - return -1; - } + log_debug(LOG_VVVERB, "epoll %04" PRIX32 " triggered on conn %p", + ev->events, ev->data.ptr); - NOT_REACHED(); -} + if (ev->events & (EPOLLERR | EPOLLRDHUP)) { + events |= EVENT_ERR; + } -void -event_loop_stats(event_stats_cb_t cb, void *arg) -{ - struct stats *st = arg; - int status, ep; - struct epoll_event ev; - - ep = epoll_create(1); - if (ep < 0) { - log_error("epoll 
create failed: %s", strerror(errno)); - return; - } + if (ev->events & (EPOLLIN | EPOLLHUP)) { + events |= EVENT_READ; + } - ev.data.fd = st->sd; - ev.events = EPOLLIN; + if (ev->events & EPOLLOUT) { + events |= EVENT_WRITE; + } - status = epoll_ctl(ep, EPOLL_CTL_ADD, st->sd, &ev); - if (status < 0) { - log_error("epoll ctl on e %d sd %d failed: %s", ep, st->sd, - strerror(errno)); - goto error; + if (evb->cb != NULL) { + evb->cb(ev->data.ptr, events); + } + } + return nsd; } - for (;;) { - int n; - - n = epoll_wait(ep, &ev, 1, st->interval); - if (n < 0) { - if (errno == EINTR) { - continue; - } - log_error("epoll wait on e %d with m %d failed: %s", ep, - st->sd, strerror(errno)); - break; - } + if (nsd == 0) { + if (timeout == -1) { + log_error( + "epoll wait on e %d with %d events and %d timeout " + "returned no events", + ep, nevent, timeout); + return -1; + } - cb(st, &n); + return 0; } -error: - status = close(ep); - if (status < 0) { - log_error("close e %d failed, ignored: %s", ep, strerror(errno)); + if (errno == EINTR) { + continue; } - ep = -1; -} -void -event_loop_entropy(event_entropy_cb_t cb, void *arg) -{ - struct entropy *ent = arg; - int status, ep; - struct epoll_event ev; - ent->interval = 30; - - ep = epoll_create(1); - if (ep < 0) { - log_error("entropy epoll create failed: %s", strerror(errno)); - return; - } + log_error("epoll wait on e %d with %d events failed: %s", ep, nevent, + strerror(errno)); + return -1; + } - ev.data.fd = ent->sd; - ev.events = EPOLLIN; + NOT_REACHED(); +} - status = epoll_ctl(ep, EPOLL_CTL_ADD, ent->sd, &ev); - if (status < 0) { - log_error("entropy epoll ctl on e %d sd %d failed: %s", ep, ent->sd, - strerror(errno)); - goto error; +void event_loop_stats(event_stats_cb_t cb, void *arg) { + struct stats *st = arg; + int status, ep; + struct epoll_event ev; + + ep = epoll_create(1); + if (ep < 0) { + log_error("epoll create failed: %s", strerror(errno)); + return; + } + + ev.data.fd = st->sd; + ev.events = EPOLLIN; + + 
status = epoll_ctl(ep, EPOLL_CTL_ADD, st->sd, &ev); + if (status < 0) { + log_error("epoll ctl on e %d sd %d failed: %s", ep, st->sd, + strerror(errno)); + goto error; + } + + for (;;) { + int n; + + n = epoll_wait(ep, &ev, 1, st->interval); + if (n < 0) { + if (errno == EINTR) { + continue; + } + log_error("epoll wait on e %d with m %d failed: %s", ep, st->sd, + strerror(errno)); + break; } - for (;;) { - int n; - - n = epoll_wait(ep, &ev, 1, ent->interval); - if (n < 0) { - if (errno == EINTR) { - continue; - } - log_error("entropy epoll wait on e %d with m %d failed: %s", ep, - ent->sd, strerror(errno)); - break; - } + cb(st, &n); + } - cb(ent, &n); +error: + status = close(ep); + if (status < 0) { + log_error("close e %d failed, ignored: %s", ep, strerror(errno)); + } + ep = -1; +} + +void event_loop_entropy(event_entropy_cb_t cb, void *arg) { + struct entropy *ent = arg; + int status, ep; + struct epoll_event ev; + ent->interval = 30; + + ep = epoll_create(1); + if (ep < 0) { + log_error("entropy epoll create failed: %s", strerror(errno)); + return; + } + + ev.data.fd = ent->sd; + ev.events = EPOLLIN; + + status = epoll_ctl(ep, EPOLL_CTL_ADD, ent->sd, &ev); + if (status < 0) { + log_error("entropy epoll ctl on e %d sd %d failed: %s", ep, ent->sd, + strerror(errno)); + goto error; + } + + for (;;) { + int n; + + n = epoll_wait(ep, &ev, 1, ent->interval); + if (n < 0) { + if (errno == EINTR) { + continue; + } + log_error("entropy epoll wait on e %d with m %d failed: %s", ep, ent->sd, + strerror(errno)); + break; } + cb(ent, &n); + } + error: - status = close(ep); - if (status < 0) { - log_error("close e %d failed, ignored: %s", ep, strerror(errno)); - } - ep = -1; + status = close(ep); + if (status < 0) { + log_error("close e %d failed, ignored: %s", ep, strerror(errno)); + } + ep = -1; } #endif /* DN_HAVE_EPOLL */ diff --git a/src/event/dyn_event.h b/src/event/dyn_event.h index 7e0585f36..780ec7c7c 100644 --- a/src/event/dyn_event.h +++ b/src/event/dyn_event.h 
@@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -20,17 +20,17 @@ * limitations under the License. */ -#include - #ifndef _DN_EVENT_H_ #define _DN_EVENT_H_ +// Forward declarations +struct conn; -#define EVENT_SIZE 1024 +#define EVENT_SIZE 1024 -#define EVENT_READ 0x0000ff +#define EVENT_READ 0x0000ff #define EVENT_WRITE 0x00ff00 -#define EVENT_ERR 0xff0000 +#define EVENT_ERR 0xff0000 typedef int (*event_cb_t)(void *, uint32_t); typedef void (*event_stats_cb_t)(void *, void *); @@ -39,63 +39,51 @@ typedef void (*event_entropy_cb_t)(void *, void *); #ifdef DN_HAVE_KQUEUE struct event_base { - int kq; /* kernel event queue descriptor */ + int kq; /* kernel event queue descriptor */ - struct kevent *change; /* change[] - events we want to monitor */ - int nchange; /* # change */ + struct kevent *change; /* change[] - events we want to monitor */ + int nchange; /* # change */ - struct kevent *event; /* event[] - events that were triggered */ - int nevent; /* # event */ - int nreturned; /* # event placed in event[] */ - int nprocessed; /* # event processed from event[] */ + struct kevent *event; /* event[] - events that were triggered */ + int nevent; /* # event */ + int nreturned; /* # event placed in event[] */ + int nprocessed; /* # event processed from event[] */ - event_cb_t cb; /* event callback */ + event_cb_t cb; /* event callback */ }; -static inline int -event_fd(struct event_base *evb) -{ - return evb->kq; -} +static inline int event_fd(struct event_base *evb) { return evb->kq; } #elif DN_HAVE_EPOLL struct event_base { - int ep; /* epoll descriptor */ + int ep; /* epoll descriptor */ - struct epoll_event *event; /* event[] - events that were triggered */ - int 
nevent; /* # event */ + struct epoll_event *event; /* event[] - events that were triggered */ + int nevent; /* # event */ - event_cb_t cb; /* event callback */ + event_cb_t cb; /* event callback */ }; -static inline int -event_fd(struct event_base *evb) -{ - return evb->ep; -} +static inline int event_fd(struct event_base *evb) { return evb->ep; } #elif DN_HAVE_EVENT_PORTS #include struct event_base { - int evp; /* event port descriptor */ + int evp; /* event port descriptor */ - port_event_t *event; /* event[] - events that were triggered */ - int nevent; /* # event */ + port_event_t *event; /* event[] - events that were triggered */ + int nevent; /* # event */ - event_cb_t cb; /* event callback */ + event_cb_t cb; /* event callback */ }; -static inline int -event_fd(struct event_base *evb) -{ - return evb->evp; -} +static inline int event_fd(struct event_base *evb) { return evb->evp; } #else -# error missing scalable I/O event notification mechanism +#error missing scalable I/O event notification mechanism #endif struct event_base *event_base_create(int size, event_cb_t cb); diff --git a/src/event/dyn_evport.c b/src/event/dyn_evport.c index 2d1d3ac18..74d83ed44 100644 --- a/src/event/dyn_evport.c +++ b/src/event/dyn_evport.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. 
@@ -24,471 +24,445 @@ #ifdef DN_HAVE_EVENT_PORTS -#include #include +#include -struct event_base * -event_base_create(int nevent, event_cb_t cb) -{ - struct event_base *evb; - int status, evp; - port_event_t *event; +struct event_base *event_base_create(int nevent, event_cb_t cb) { + struct event_base *evb; + int status, evp; + port_event_t *event; - ASSERT(nevent > 0); + ASSERT(nevent > 0); - evp = port_create(); - if (evp < 0) { - log_error("port create failed: %s", strerror(errno)); - return NULL; - } + evp = port_create(); + if (evp < 0) { + log_error("port create failed: %s", strerror(errno)); + return NULL; + } - event = dn_calloc(nevent, sizeof(*event)); - if (event == NULL) { - status = close(evp); - if (status < 0) { - log_error("close evp %d failed, ignored: %s", evp, strerror(errno)); - } - return NULL; + event = dn_calloc(nevent, sizeof(*event)); + if (event == NULL) { + status = close(evp); + if (status < 0) { + log_error("close evp %d failed, ignored: %s", evp, strerror(errno)); } + return NULL; + } - evb = dn_alloc(sizeof(*evb)); - if (evb == NULL) { - dn_free(event); - status = close(evp); - if (status < 0) { - log_error("close evp %d failed, ignored: %s", evp, strerror(errno)); - } - return NULL; + evb = dn_alloc(sizeof(*evb)); + if (evb == NULL) { + dn_free(event); + status = close(evp); + if (status < 0) { + log_error("close evp %d failed, ignored: %s", evp, strerror(errno)); } + return NULL; + } - evb->evp = evp; - evb->event = event; - evb->nevent = nevent; - evb->cb = cb; + evb->evp = evp; + evb->event = event; + evb->nevent = nevent; + evb->cb = cb; - log_debug(LOG_INFO, "evp %d with nevent %d", evb->evp, evb->nevent); + log_debug(LOG_INFO, "evp %d with nevent %d", evb->evp, evb->nevent); - return evb; + return evb; } -void -event_base_destroy(struct event_base *evb) -{ - int status; +void event_base_destroy(struct event_base *evb) { + int status; - if (evb == NULL) { - return; - } + if (evb == NULL) { + return; + } - ASSERT(evb->evp >= 0); + 
ASSERT(evb->evp >= 0); - dn_free(evb->event); + dn_free(evb->event); - status = close(evb->evp); - if (status < 0) { - log_error("close evp %d failed, ignored: %s", evb->evp, strerror(errno)); - } - evb->evp = -1; + status = close(evb->evp); + if (status < 0) { + log_error("close evp %d failed, ignored: %s", evb->evp, strerror(errno)); + } + evb->evp = -1; - dn_free(evb); + dn_free(evb); } -int -event_add_in(struct event_base *evb, struct conn *c) -{ - return 0; -} +int event_add_in(struct event_base *evb, struct conn *c) { return 0; } -int -event_del_in(struct event_base *evb, struct conn *c) -{ - return 0; -} +int event_del_in(struct event_base *evb, struct conn *c) { return 0; } -int -event_add_out(struct event_base *evb, struct conn *c) -{ - int status; - int evp = evb->evp; +int event_add_out(struct event_base *evb, struct conn *c) { + int status; + int evp = evb->evp; - ASSERT(evp > 0); - ASSERT(c != NULL); - ASSERT(c->sd > 0); - ASSERT(c->recv_active); + ASSERT(evp > 0); + ASSERT(c != NULL); + ASSERT(c->sd > 0); + ASSERT(c->recv_active); - if (c->send_active) { - return 0; - } + if (c->send_active) { + return 0; + } - status = port_associate(evp, PORT_SOURCE_FD, c->sd, POLLIN | POLLOUT, c); - if (status < 0) { - log_error("port associate on evp %d sd %d failed: %s", evp, c->sd, - strerror(errno)); - } else { - c->send_active = 1; - } + status = port_associate(evp, PORT_SOURCE_FD, c->sd, POLLIN | POLLOUT, c); + if (status < 0) { + log_error("port associate on evp %d sd %d failed: %s", evp, c->sd, + strerror(errno)); + } else { + c->send_active = 1; + } - return status; + return status; } -int -event_del_out(struct event_base *evb, struct conn *c) -{ - int status; - int evp = evb->evp; +int event_del_out(struct event_base *evb, struct conn *c) { + int status; + int evp = evb->evp; - ASSERT(evp > 0); - ASSERT(c != NULL); - ASSERT(c->sd > 0); - ASSERT(c->recv_active); + ASSERT(evp > 0); + ASSERT(c != NULL); + ASSERT(c->sd > 0); + ASSERT(c->recv_active); - if 
(!c->send_active) { - return 0; - } + if (!c->send_active) { + return 0; + } - status = port_associate(evp, PORT_SOURCE_FD, c->sd, POLLIN, c); - if (status < 0) { - log_error("port associate on evp %d sd %d failed: %s", evp, c->sd, - strerror(errno)); - } else { - c->send_active = 0; - } + status = port_associate(evp, PORT_SOURCE_FD, c->sd, POLLIN, c); + if (status < 0) { + log_error("port associate on evp %d sd %d failed: %s", evp, c->sd, + strerror(errno)); + } else { + c->send_active = 0; + } - return status; + return status; } -int -event_add_conn(struct event_base *evb, struct conn *c) -{ - int status; - int evp = evb->evp; +int event_add_conn(struct event_base *evb, struct conn *c) { + int status; + int evp = evb->evp; + + ASSERT(evp > 0); + ASSERT(c != NULL); + ASSERT(c->sd > 0); + ASSERT(!c->recv_active); + ASSERT(!c->send_active); + + status = port_associate(evp, PORT_SOURCE_FD, c->sd, POLLIN | POLLOUT, c); + if (status < 0) { + log_error("port associate on evp %d sd %d failed: %s", evp, c->sd, + strerror(errno)); + } else { + c->send_active = 1; + c->recv_active = 1; + } + + return status; +} - ASSERT(evp > 0); - ASSERT(c != NULL); - ASSERT(c->sd > 0); - ASSERT(!c->recv_active); - ASSERT(!c->send_active); +int event_del_conn(struct event_base *evb, struct conn *c) { + int status; + int evp = evb->evp; - status = port_associate(evp, PORT_SOURCE_FD, c->sd, POLLIN | POLLOUT, c); - if (status < 0) { - log_error("port associate on evp %d sd %d failed: %s", evp, c->sd, - strerror(errno)); - } else { - c->send_active = 1; - c->recv_active = 1; - } + ASSERT(evp > 0); + ASSERT(c != NULL); + ASSERT(c->sd > 0); + if (!c->send_active && !c->recv_active) { + return 0; + } + + /* + * Removes the association of an object with a port. The association + * is also removed if the port gets closed. 
+ * + * On failure, we check for ENOENT errno because it is likely that we + * are deleting this connection after it was returned from the event + * loop and before we had a chance of reactivating it by calling + * port_associate() on it. + */ + status = port_dissociate(evp, PORT_SOURCE_FD, c->sd); + if (status < 0 && errno != ENOENT) { + log_error("port dissociate evp %d sd %d failed: %s", evp, c->sd, + strerror(errno)); return status; -} + } -int -event_del_conn(struct event_base *evb, struct conn *c) -{ - int status; - int evp = evb->evp; + c->recv_active = 0; + c->send_active = 0; - ASSERT(evp > 0); - ASSERT(c != NULL); - ASSERT(c->sd > 0); + return 0; +} - if (!c->send_active && !c->recv_active) { - return 0; - } +static int event_reassociate(struct event_base *evb, struct conn *c) { + int status, events; + int evp = evb->evp; - /* - * Removes the association of an object with a port. The association - * is also removed if the port gets closed. - * - * On failure, we check for ENOENT errno because it is likely that we - * are deleting this connection after it was returned from the event - * loop and before we had a chance of reactivating it by calling - * port_associate() on it. 
- */ - status = port_dissociate(evp, PORT_SOURCE_FD, c->sd); - if (status < 0 && errno != ENOENT) { - log_error("port dissociate evp %d sd %d failed: %s", evp, c->sd, - strerror(errno)); - return status; - } + ASSERT(evp > 0); + ASSERT(c != NULL); + ASSERT(c->sd > 0); + ASSERT(c->recv_active); - c->recv_active = 0; - c->send_active = 0; + if (c->send_active) { + events = POLLIN | POLLOUT; + } else { + events = POLLIN; + } - return 0; + status = port_associate(evp, PORT_SOURCE_FD, c->sd, events, c); + if (status < 0) { + log_error("port associate on evp %d sd %d failed: %s", evp, c->sd, + strerror(errno)); + } + + return status; } -static int -event_reassociate(struct event_base *evb, struct conn *c) -{ - int status, events; - int evp = evb->evp; - - ASSERT(evp > 0); - ASSERT(c != NULL); - ASSERT(c->sd > 0); - ASSERT(c->recv_active); - - if (c->send_active) { - events = POLLIN | POLLOUT; - } else { - events = POLLIN; - } +int event_wait(struct event_base *evb, int timeout) { + int evp = evb->evp; + port_event_t *event = evb->event; + int nevent = evb->nevent; + struct timespec ts, *tsp; + + ASSERT(evp > 0); + ASSERT(event != NULL); + ASSERT(nevent > 0); + + /* port_getn should block indefinitely if timeout < 0 */ + if (timeout < 0) { + tsp = NULL; + } else { + tsp = &ts; + tsp->tv_sec = timeout / 1000LL; + tsp->tv_nsec = (timeout % 1000LL) * 1000000LL; + } + + for (;;) { + int i, status; + unsigned int nreturned = 1; - status = port_associate(evp, PORT_SOURCE_FD, c->sd, events , c); + /* + * port_getn() retrieves multiple events from a port. A port_getn() + * call will block until at least nreturned events is triggered. On + * a successful return event[] is populated with triggered events + * up to the maximum sized allowed by nevent. The number of entries + * actually placed in event[] is saved in nreturned, which may be + * more than what we asked for but less than nevent. 
+ */ + status = port_getn(evp, event, nevent, &nreturned, tsp); if (status < 0) { - log_error("port associate on evp %d sd %d failed: %s", evp, c->sd, + if (errno == EINTR || errno == EAGAIN) { + continue; + } + + /* + * ETIME - The time interval expired before the expected number + * of events have been posted to the port or nreturned is updated + * with the number of returned port_event_t structures in event[] + */ + if (errno != ETIME) { + log_error("port getn on evp %d with %d events failed: %s", evp, nevent, strerror(errno)); + return -1; + } } - return status; -} + if (nreturned > 0) { + for (i = 0; i < nreturned; i++) { + port_event_t *ev = &evb->event[i]; + uint32_t events = 0; -int -event_wait(struct event_base *evb, int timeout) -{ - int evp = evb->evp; - port_event_t *event = evb->event; - int nevent = evb->nevent; - struct timespec ts, *tsp; - - ASSERT(evp > 0); - ASSERT(event != NULL); - ASSERT(nevent > 0); - - /* port_getn should block indefinitely if timeout < 0 */ - if (timeout < 0) { - tsp = NULL; - } else { - tsp = &ts; - tsp->tv_sec = timeout / 1000LL; - tsp->tv_nsec = (timeout % 1000LL) * 1000000LL; - } + log_debug(LOG_VVERB, + "port %04" PRIX32 + " from source %d " + "triggered on conn %p", + ev->portev_events, ev->portev_source, ev->portev_user); - for (;;) { - int i, status; - unsigned int nreturned = 1; - - /* - * port_getn() retrieves multiple events from a port. A port_getn() - * call will block until at least nreturned events is triggered. On - * a successful return event[] is populated with triggered events - * up to the maximum sized allowed by nevent. The number of entries - * actually placed in event[] is saved in nreturned, which may be - * more than what we asked for but less than nevent. 
- */ - status = port_getn(evp, event, nevent, &nreturned, tsp); - if (status < 0) { - if (errno == EINTR || errno == EAGAIN) { - continue; - } - - /* - * ETIME - The time interval expired before the expected number - * of events have been posted to the port or nreturned is updated - * with the number of returned port_event_t structures in event[] - */ - if (errno != ETIME) { - log_error("port getn on evp %d with %d events failed: %s", evp, - nevent, strerror(errno)); - return -1; - } + if (ev->portev_events & POLLERR) { + events |= EVENT_ERR; } - if (nreturned > 0) { - for (i = 0; i < nreturned; i++) { - port_event_t *ev = &evb->event[i]; - uint32_t events = 0; - - log_debug(LOG_VVERB, "port %04"PRIX32" from source %d " - "triggered on conn %p", ev->portev_events, - ev->portev_source, ev->portev_user); - - if (ev->portev_events & POLLERR) { - events |= EVENT_ERR; - } - - if (ev->portev_events & POLLIN) { - events |= EVENT_READ; - } - - if (ev->portev_events & POLLOUT) { - events |= EVENT_WRITE; - } - - if (evb->cb != NULL && events != 0) { - status = evb->cb(ev->portev_user, events); - if (status < 0) { - continue; - } - - /* - * When an event for a PORT_SOURCE_FD object is retrieved, - * the object no longer has an association with the port. - * The event can be processed without the possibility that - * another thread can retrieve a subsequent event for the - * same object. After processing of the file descriptor - * is completed, the port_associate() function can be - * called to reassociate the object with the port. - * - * If the descriptor is still capable of accepting data, - * this reassociation is required for the reactivation of - * the data detection. 
- */ - event_reassociate(evb, ev->portev_user); - } - } - - return nreturned; + if (ev->portev_events & POLLIN) { + events |= EVENT_READ; } - if (timeout == -1) { - log_error("port getn on evp %d with %d events and %d timeout " - "returned no events", evp, nevent, timeout); - return -1; + if (ev->portev_events & POLLOUT) { + events |= EVENT_WRITE; } - return 0; - } - - NOT_REACHED(); -} - -void -event_loop_stats(event_stats_cb_t cb, void *arg) -{ - struct stats *st = arg; - int status, evp; - port_event_t event; - struct timespec ts, *tsp; - - evp = port_create(); - if (evp < 0) { - log_error("port create failed: %s", strerror(errno)); - return; - } + if (evb->cb != NULL && events != 0) { + status = evb->cb(ev->portev_user, events); + if (status < 0) { + continue; + } + + /* + * When an event for a PORT_SOURCE_FD object is retrieved, + * the object no longer has an association with the port. + * The event can be processed without the possibility that + * another thread can retrieve a subsequent event for the + * same object. After processing of the file descriptor + * is completed, the port_associate() function can be + * called to reassociate the object with the port. + * + * If the descriptor is still capable of accepting data, + * this reassociation is required for the reactivation of + * the data detection. 
+ */ + event_reassociate(evb, ev->portev_user); + } + } - status = port_associate(evp, PORT_SOURCE_FD, st->sd, POLLIN, NULL); - if (status < 0) { - log_error("port associate on evp %d sd %d failed: %s", evp, st->sd, - strerror(errno)); - goto error; + return nreturned; } - /* port_getn should block indefinitely if st->interval < 0 */ - if (st->interval < 0) { - tsp = NULL; - } else { - tsp = &ts; - tsp->tv_sec = st->interval / 1000LL; - tsp->tv_nsec = (st->interval % 1000LL) * 1000000LL; + if (timeout == -1) { + log_error( + "port getn on evp %d with %d events and %d timeout " + "returned no events", + evp, nevent, timeout); + return -1; } + return 0; + } - for (;;) { - unsigned int nreturned = 1; - - status = port_getn(evp, &event, 1, &nreturned, tsp); - if (status != DN_OK) { - if (errno == EINTR || errno == EAGAIN) { - continue; - } - - if (errno != ETIME) { - log_error("port getn on evp %d with m %d failed: %s", evp, - st->sd, strerror(errno)); - goto error; - } - } + NOT_REACHED(); +} - ASSERT(nreturned <= 1); +void event_loop_stats(event_stats_cb_t cb, void *arg) { + struct stats *st = arg; + int status, evp; + port_event_t event; + struct timespec ts, *tsp; + + evp = port_create(); + if (evp < 0) { + log_error("port create failed: %s", strerror(errno)); + return; + } + + status = port_associate(evp, PORT_SOURCE_FD, st->sd, POLLIN, NULL); + if (status < 0) { + log_error("port associate on evp %d sd %d failed: %s", evp, st->sd, + strerror(errno)); + goto error; + } + + /* port_getn should block indefinitely if st->interval < 0 */ + if (st->interval < 0) { + tsp = NULL; + } else { + tsp = &ts; + tsp->tv_sec = st->interval / 1000LL; + tsp->tv_nsec = (st->interval % 1000LL) * 1000000LL; + } + + for (;;) { + unsigned int nreturned = 1; + + status = port_getn(evp, &event, 1, &nreturned, tsp); + if (status != DN_OK) { + if (errno == EINTR || errno == EAGAIN) { + continue; + } + + if (errno != ETIME) { + log_error("port getn on evp %d with m %d failed: %s", evp, 
st->sd, + strerror(errno)); + goto error; + } + } - if (nreturned == 1) { - /* re-associate monitoring descriptor with the port */ - status = port_associate(evp, PORT_SOURCE_FD, st->sd, POLLIN, NULL); - if (status < 0) { - log_error("port associate on evp %d sd %d failed: %s", evp, st->sd, - strerror(errno)); - } - } + ASSERT(nreturned <= 1); - cb(st, &nreturned); + if (nreturned == 1) { + /* re-associate monitoring descriptor with the port */ + status = port_associate(evp, PORT_SOURCE_FD, st->sd, POLLIN, NULL); + if (status < 0) { + log_error("port associate on evp %d sd %d failed: %s", evp, st->sd, + strerror(errno)); + } } + cb(st, &nreturned); + } + error: - status = close(evp); - if (status < 0) { - log_error("close evp %d failed, ignored: %s", evp, strerror(errno)); - } - evp = -1; + status = close(evp); + if (status < 0) { + log_error("close evp %d failed, ignored: %s", evp, strerror(errno)); + } + evp = -1; } -void -event_loop_entropy(event_entropy_cb_t cb, void *arg) -{ - struct entropy *ent = arg; - int status, evp; - port_event_t event; - struct timespec ts, *tsp; - - evp = port_create(); - if (evp < 0) { - log_error("port create failed: %s", strerror(errno)); - return; - } - - status = port_associate(evp, PORT_SOURCE_FD, ent->sd, POLLIN, NULL); - if (status < 0) { - log_error("port associate on evp %d sd %d failed: %s", evp, ent->sd, +void event_loop_entropy(event_entropy_cb_t cb, void *arg) { + struct entropy *ent = arg; + int status, evp; + port_event_t event; + struct timespec ts, *tsp; + + evp = port_create(); + if (evp < 0) { + log_error("port create failed: %s", strerror(errno)); + return; + } + + status = port_associate(evp, PORT_SOURCE_FD, ent->sd, POLLIN, NULL); + if (status < 0) { + log_error("port associate on evp %d sd %d failed: %s", evp, ent->sd, + strerror(errno)); + goto error; + } + + /* port_getn should block indefinitely if ent->interval < 0 */ + if (ent->interval < 0) { + tsp = NULL; + } else { + tsp = &ts; + tsp->tv_sec = 
ent->interval / 1000LL; + tsp->tv_nsec = (ent->interval % 1000LL) * 1000000LL; + } + + for (;;) { + unsigned int nreturned = 1; + + status = port_getn(evp, &event, 1, &nreturned, tsp); + if (status != DN_OK) { + if (errno == EINTR || errno == EAGAIN) { + continue; + } + + if (errno != ETIME) { + log_error("port getn on evp %d with m %d failed: %s", evp, ent->sd, strerror(errno)); goto error; + } } - /* port_getn should block indefinitely if ent->interval < 0 */ - if (ent->interval < 0) { - tsp = NULL; - } else { - tsp = &ts; - tsp->tv_sec = ent->interval / 1000LL; - tsp->tv_nsec = (ent->interval % 1000LL) * 1000000LL; - } - - - for (;;) { - unsigned int nreturned = 1; - - status = port_getn(evp, &event, 1, &nreturned, tsp); - if (status != DN_OK) { - if (errno == EINTR || errno == EAGAIN) { - continue; - } + ASSERT(nreturned <= 1); - if (errno != ETIME) { - log_error("port getn on evp %d with m %d failed: %s", evp, - ent->sd, strerror(errno)); - goto error; - } - } - - ASSERT(nreturned <= 1); - - if (nreturned == 1) { - /* re-associate monitoring descriptor with the port */ - status = port_associate(evp, PORT_SOURCE_FD, ent->sd, POLLIN, NULL); - if (status < 0) { - log_error("port associate on evp %d sd %d failed: %s", evp, ent->sd, - strerror(errno)); - } - } - - cb(ent, &nreturned); + if (nreturned == 1) { + /* re-associate monitoring descriptor with the port */ + status = port_associate(evp, PORT_SOURCE_FD, ent->sd, POLLIN, NULL); + if (status < 0) { + log_error("port associate on evp %d sd %d failed: %s", evp, ent->sd, + strerror(errno)); + } } + cb(ent, &nreturned); + } + error: - status = close(evp); - if (status < 0) { - log_error("close evp %d failed, ignored: %s", evp, strerror(errno)); - } - evp = -1; + status = close(evp); + if (status < 0) { + log_error("close evp %d failed, ignored: %s", evp, strerror(errno)); + } + evp = -1; } #endif /* DN_HAVE_EVENT_PORTS */ diff --git a/src/event/dyn_kqueue.c b/src/event/dyn_kqueue.c index 5da680212..91979134e 
100644 --- a/src/event/dyn_kqueue.c +++ b/src/event/dyn_kqueue.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -24,462 +24,441 @@ #ifdef DN_HAVE_KQUEUE -#include #include +#include -struct event_base * -event_base_create(int nevent, event_cb_t cb) -{ - struct event_base *evb; - int status, kq; - struct kevent *change, *event; +struct event_base *event_base_create(int nevent, event_cb_t cb) { + struct event_base *evb; + int status, kq; + struct kevent *change, *event; - ASSERT(nevent > 0); + ASSERT(nevent > 0); - kq = kqueue(); - if (kq < 0) { - log_error("kqueue failed: %s", strerror(errno)); - return NULL; - } + kq = kqueue(); + if (kq < 0) { + log_error("kqueue failed: %s", strerror(errno)); + return NULL; + } - change = dn_calloc(nevent, sizeof(*change)); - if (change == NULL) { - status = close(kq); - if (status < 0) { - log_error("close kq %d failed, ignored: %s", kq, strerror(errno)); - } - return NULL; + change = dn_calloc(nevent, sizeof(*change)); + if (change == NULL) { + status = close(kq); + if (status < 0) { + log_error("close kq %d failed, ignored: %s", kq, strerror(errno)); } + return NULL; + } - event = dn_calloc(nevent, sizeof(*event)); - if (event == NULL) { - dn_free(change); - status = close(kq); - if (status < 0) { - log_error("close kq %d failed, ignored: %s", kq, strerror(errno)); - } - return NULL; + event = dn_calloc(nevent, sizeof(*event)); + if (event == NULL) { + dn_free(change); + status = close(kq); + if (status < 0) { + log_error("close kq %d failed, ignored: %s", kq, strerror(errno)); } + return NULL; + } - evb = dn_alloc(sizeof(*evb)); - if (evb == NULL) { - dn_free(change); - dn_free(event); - status = close(kq); - if (status < 0) 
{ - log_error("close kq %d failed, ignored: %s", kq, strerror(errno)); - } - return NULL; + evb = dn_alloc(sizeof(*evb)); + if (evb == NULL) { + dn_free(change); + dn_free(event); + status = close(kq); + if (status < 0) { + log_error("close kq %d failed, ignored: %s", kq, strerror(errno)); } + return NULL; + } - evb->kq = kq; - evb->change = change; - evb->nchange = 0; - evb->event = event; - evb->nevent = nevent; - evb->nreturned = 0; - evb->nprocessed = 0; - evb->cb = cb; + evb->kq = kq; + evb->change = change; + evb->nchange = 0; + evb->event = event; + evb->nevent = nevent; + evb->nreturned = 0; + evb->nprocessed = 0; + evb->cb = cb; - log_debug(LOG_INFO, "kq %d with nevent %d", evb->kq, evb->nevent); + log_debug(LOG_INFO, "kq %d with nevent %d", evb->kq, evb->nevent); - return evb; + return evb; } -void -event_base_destroy(struct event_base *evb) -{ - int status; +void event_base_destroy(struct event_base *evb) { + int status; - if (evb == NULL) { - return; - } + if (evb == NULL) { + return; + } - ASSERT(evb->kq > 0); + ASSERT(evb->kq > 0); - dn_free(evb->change); - dn_free(evb->event); + dn_free(evb->change); + dn_free(evb->event); - status = close(evb->kq); - if (status < 0) { - log_error("close kq %d failed, ignored: %s", evb->kq, strerror(errno)); - } - evb->kq = -1; + status = close(evb->kq); + if (status < 0) { + log_error("close kq %d failed, ignored: %s", evb->kq, strerror(errno)); + } + evb->kq = -1; - dn_free(evb); + dn_free(evb); } -int -event_add_in(struct event_base *evb, struct conn *c) -{ - struct kevent *event; +int event_add_in(struct event_base *evb, struct conn *c) { + struct kevent *event; - ASSERT(evb->kq > 0); - ASSERT(c != NULL); - ASSERT(c->sd > 0); - ASSERT(evb->nchange < evb->nevent); + ASSERT(evb->kq > 0); + ASSERT(c != NULL); + ASSERT(c->sd > 0); + ASSERT(evb->nchange < evb->nevent); - if (c->recv_active) { - return 0; - } + if (c->recv_active) { + return 0; + } - event = &evb->change[evb->nchange++]; - EV_SET(event, c->sd, 
EVFILT_READ, EV_ADD | EV_CLEAR, 0, 0, c); + event = &evb->change[evb->nchange++]; + EV_SET(event, c->sd, EVFILT_READ, EV_ADD | EV_CLEAR, 0, 0, c); - c->recv_active = 1; + c->recv_active = 1; - return 0; + return 0; } -int -event_del_in(struct event_base *evb, struct conn *c) -{ - struct kevent *event; +int event_del_in(struct event_base *evb, struct conn *c) { + struct kevent *event; - ASSERT(evb->kq > 0); - ASSERT(c != NULL); - ASSERT(c->sd > 0); - ASSERT(evb->nchange < evb->nevent); + ASSERT(evb->kq > 0); + ASSERT(c != NULL); + ASSERT(c->sd > 0); + ASSERT(evb->nchange < evb->nevent); - if (!c->recv_active) { - return 0; - } + if (!c->recv_active) { + return 0; + } - event = &evb->change[evb->nchange++]; - EV_SET(event, c->sd, EVFILT_READ, EV_DELETE, 0, 0, c); + event = &evb->change[evb->nchange++]; + EV_SET(event, c->sd, EVFILT_READ, EV_DELETE, 0, 0, c); - c->recv_active = 0; + c->recv_active = 0; - return 0; + return 0; } -int -event_add_out(struct event_base *evb, struct conn *c) -{ - struct kevent *event; +int event_add_out(struct event_base *evb, struct conn *c) { + struct kevent *event; - ASSERT(evb->kq > 0); - ASSERT(c != NULL); - ASSERT(c->sd > 0); - ASSERT(c->recv_active); - ASSERT(evb->nchange < evb->nevent); + ASSERT(evb->kq > 0); + ASSERT(c != NULL); + ASSERT(c->sd > 0); + ASSERT(c->recv_active); + ASSERT(evb->nchange < evb->nevent); - if (c->send_active) { - return 0; - } + if (c->send_active) { + return 0; + } - event = &evb->change[evb->nchange++]; - EV_SET(event, c->sd, EVFILT_WRITE, EV_ADD | EV_CLEAR, 0, 0, c); + event = &evb->change[evb->nchange++]; + EV_SET(event, c->sd, EVFILT_WRITE, EV_ADD | EV_CLEAR, 0, 0, c); - c->send_active = 1; + c->send_active = 1; - return 0; + return 0; } -int -event_del_out(struct event_base *evb, struct conn *c) -{ - struct kevent *event; +int event_del_out(struct event_base *evb, struct conn *c) { + struct kevent *event; - ASSERT(evb->kq > 0); - ASSERT(c != NULL); - ASSERT(c->sd > 0); - ASSERT(c->recv_active); - 
ASSERT(evb->nchange < evb->nevent); + ASSERT(evb->kq > 0); + ASSERT(c != NULL); + ASSERT(c->sd > 0); + ASSERT(c->recv_active); + ASSERT(evb->nchange < evb->nevent); - if (!c->send_active) { - return 0; - } + if (!c->send_active) { + return 0; + } - event = &evb->change[evb->nchange++]; - EV_SET(event, c->sd, EVFILT_WRITE, EV_DELETE, 0, 0, c); + event = &evb->change[evb->nchange++]; + EV_SET(event, c->sd, EVFILT_WRITE, EV_DELETE, 0, 0, c); - c->send_active = 0; + c->send_active = 0; - return 0; + return 0; } -int -event_add_conn(struct event_base *evb, struct conn *c) -{ - ASSERT(evb->kq > 0); - ASSERT(c != NULL); - ASSERT(c->sd > 0); - ASSERT(!c->recv_active); - ASSERT(!c->send_active); - ASSERT(evb->nchange < evb->nevent); +int event_add_conn(struct event_base *evb, struct conn *c) { + ASSERT(evb->kq > 0); + ASSERT(c != NULL); + ASSERT(c->sd > 0); + ASSERT(!c->recv_active); + ASSERT(!c->send_active); + ASSERT(evb->nchange < evb->nevent); - event_add_in(evb, c); - event_add_out(evb, c); + event_add_in(evb, c); + event_add_out(evb, c); - return 0; + return 0; } -int -event_del_conn(struct event_base *evb, struct conn *c) -{ - int i; +int event_del_conn(struct event_base *evb, struct conn *c) { + int i; + + ASSERT(evb->kq > 0); + ASSERT(c != NULL); + ASSERT(c->sd > 0); + ASSERT(evb->nchange < evb->nevent); + + event_del_out(evb, c); + event_del_in(evb, c); + + /* + * Now, eliminate pending events for c->sd (there should be at most one + * other event). This is important because we will close c->sd and free + * c when we return. 
+ */ + for (i = evb->nprocessed + 1; i < evb->nreturned; i++) { + struct kevent *ev = &evb->event[i]; + if (ev->ident == (uintptr_t)c->sd) { + ev->flags = 0; + ev->filter = 0; + break; + } + } - ASSERT(evb->kq > 0); - ASSERT(c != NULL); - ASSERT(c->sd > 0); - ASSERT(evb->nchange < evb->nevent); + return 0; +} - event_del_out(evb, c); - event_del_in(evb, c); +int event_wait(struct event_base *evb, int timeout) { + int kq = evb->kq; + struct timespec ts, *tsp; - /* - * Now, eliminate pending events for c->sd (there should be at most one - * other event). This is important because we will close c->sd and free - * c when we return. - */ - for (i = evb->nprocessed + 1; i < evb->nreturned; i++) { - struct kevent *ev = &evb->event[i]; - if (ev->ident == (uintptr_t)c->sd) { - ev->flags = 0; - ev->filter = 0; - break; - } - } + ASSERT(kq > 0); - return 0; -} + /* kevent should block indefinitely if timeout < 0 */ + if (timeout < 0) { + tsp = NULL; + } else { + tsp = &ts; + tsp->tv_sec = timeout / 1000LL; + tsp->tv_nsec = (timeout % 1000LL) * 1000000LL; + } -int -event_wait(struct event_base *evb, int timeout) -{ - int kq = evb->kq; - struct timespec ts, *tsp; - - ASSERT(kq > 0); - - /* kevent should block indefinitely if timeout < 0 */ - if (timeout < 0) { - tsp = NULL; - } else { - tsp = &ts; - tsp->tv_sec = timeout / 1000LL; - tsp->tv_nsec = (timeout % 1000LL) * 1000000LL; - } + for (;;) { + /* + * kevent() is used both to register new events with kqueue, and to + * retrieve any pending events. Changes that should be applied to the + * kqueue are given in the change[] and any returned events are placed + * in event[], up to the maximum sized allowed by nevent. The number + * of entries actually placed in event[] is returned by the kevent() + * call and saved in nreturned. + * + * Events are registered with the system by the application via a + * struct kevent, and an event is uniquely identified with the system + * by a (kq, ident, filter) tuple. 
This means that there can be only + * one (ident, filter) pair for a given kqueue. + */ + evb->nreturned = + kevent(kq, evb->change, evb->nchange, evb->event, evb->nevent, tsp); + evb->nchange = 0; + if (evb->nreturned > 0) { + for (evb->nprocessed = 0; evb->nprocessed < evb->nreturned; + evb->nprocessed++) { + struct kevent *ev = &evb->event[evb->nprocessed]; + uint32_t events = 0; + + log_debug(LOG_VVERB, + "kevent %04" PRIX32 + " with filter %d " + "triggered on sd %d", + ev->flags, ev->filter, ev->ident); - for (;;) { /* - * kevent() is used both to register new events with kqueue, and to - * retrieve any pending events. Changes that should be applied to the - * kqueue are given in the change[] and any returned events are placed - * in event[], up to the maximum sized allowed by nevent. The number - * of entries actually placed in event[] is returned by the kevent() - * call and saved in nreturned. - * - * Events are registered with the system by the application via a - * struct kevent, and an event is uniquely identified with the system - * by a (kq, ident, filter) tuple. This means that there can be only - * one (ident, filter) pair for a given kqueue. + * If an error occurs while processing an element of the + * change[] and there is enough room in the event[], then the + * event event will be placed in the eventlist with EV_ERROR + * set in flags and the system error(errno) in data. 
*/ - evb->nreturned = kevent(kq, evb->change, evb->nchange, evb->event, - evb->nevent, tsp); - evb->nchange = 0; - if (evb->nreturned > 0) { - for (evb->nprocessed = 0; evb->nprocessed < evb->nreturned; - evb->nprocessed++) { - struct kevent *ev = &evb->event[evb->nprocessed]; - uint32_t events = 0; - - log_debug(LOG_VVERB, "kevent %04"PRIX32" with filter %d " - "triggered on sd %d", ev->flags, ev->filter, - ev->ident); - - /* - * If an error occurs while processing an element of the - * change[] and there is enough room in the event[], then the - * event event will be placed in the eventlist with EV_ERROR - * set in flags and the system error(errno) in data. - */ - if (ev->flags & EV_ERROR) { - /* - * Error messages that can happen, when a delete fails. - * EBADF happens when the file descriptor has been closed - * ENOENT when the file descriptor was closed and then - * reopened. - * EINVAL for some reasons not understood; EINVAL - * should not be returned ever; but FreeBSD does :-\ - * An error is also indicated when a callback deletes an - * event we are still processing. In that case the data - * field is set to ENOENT. - */ - if (ev->data == EBADF || ev->data == EINVAL || - ev->data == ENOENT || ev->data == EINTR) { - continue; - } - events |= EVENT_ERR; - } - - if (ev->filter == EVFILT_READ) { - events |= EVENT_READ; - } - - if (ev->filter == EVFILT_WRITE) { - events |= EVENT_WRITE; - } - - if (evb->cb != NULL && events != 0) { - evb->cb(ev->udata, events); - } - } - return evb->nreturned; + if (ev->flags & EV_ERROR) { + /* + * Error messages that can happen, when a delete fails. + * EBADF happens when the file descriptor has been closed + * ENOENT when the file descriptor was closed and then + * reopened. + * EINVAL for some reasons not understood; EINVAL + * should not be returned ever; but FreeBSD does :-\ + * An error is also indicated when a callback deletes an + * event we are still processing. In that case the data + * field is set to ENOENT. 
+ */ + if (ev->data == EBADF || ev->data == EINVAL || ev->data == ENOENT || + ev->data == EINTR) { + continue; + } + events |= EVENT_ERR; } - if (evb->nreturned == 0) { - if (timeout == -1) { - log_error("kevent on kq %d with %d events and %d timeout " - "returned no events", kq, evb->nevent, timeout); - return -1; - } + if (ev->filter == EVFILT_READ) { + events |= EVENT_READ; + } - return 0; + if (ev->filter == EVFILT_WRITE) { + events |= EVENT_WRITE; } - if (errno == EINTR) { - continue; + if (evb->cb != NULL && events != 0) { + evb->cb(ev->udata, events); } + } + return evb->nreturned; + } - log_error("kevent on kq %d with %d events failed: %s", kq, evb->nevent, - strerror(errno)); + if (evb->nreturned == 0) { + if (timeout == -1) { + log_error( + "kevent on kq %d with %d events and %d timeout " + "returned no events", + kq, evb->nevent, timeout); return -1; - } + } - NOT_REACHED(); -} + return 0; + } -void -event_loop_stats(event_stats_cb_t cb, void *arg) -{ - struct stats *st = arg; - int status, kq; - struct kevent change, event; - struct timespec ts, *tsp; - - kq = kqueue(); - if (kq < 0) { - log_error("kqueue failed: %s", strerror(errno)); - return; + if (errno == EINTR) { + continue; } - EV_SET(&change, st->sd, EVFILT_READ, EV_ADD | EV_CLEAR, 0, 0, NULL); + log_error("kevent on kq %d with %d events failed: %s", kq, evb->nevent, + strerror(errno)); + return -1; + } - /* kevent should block indefinitely if st->interval < 0 */ - if (st->interval < 0) { - tsp = NULL; - } else { - tsp = &ts; - tsp->tv_sec = st->interval / 1000LL; - tsp->tv_nsec = (st->interval % 1000LL) * 1000000LL; - } + NOT_REACHED(); +} - for (;;) { - int nreturned; - - nreturned = kevent(kq, &change, 1, &event, 1, tsp); - if (nreturned < 0) { - if (errno == EINTR) { - continue; - } - log_error("kevent on kq %d with m %d failed: %s", kq, st->sd, - strerror(errno)); - goto error; - } +void event_loop_stats(event_stats_cb_t cb, void *arg) { + struct stats *st = arg; + int status, kq; + struct 
kevent change, event; + struct timespec ts, *tsp; + + kq = kqueue(); + if (kq < 0) { + log_error("kqueue failed: %s", strerror(errno)); + return; + } + + EV_SET(&change, st->sd, EVFILT_READ, EV_ADD | EV_CLEAR, 0, 0, NULL); + + /* kevent should block indefinitely if st->interval < 0 */ + if (st->interval < 0) { + tsp = NULL; + } else { + tsp = &ts; + tsp->tv_sec = st->interval / 1000LL; + tsp->tv_nsec = (st->interval % 1000LL) * 1000000LL; + } + + for (;;) { + int nreturned; + + nreturned = kevent(kq, &change, 1, &event, 1, tsp); + if (nreturned < 0) { + if (errno == EINTR) { + continue; + } + log_error("kevent on kq %d with m %d failed: %s", kq, st->sd, + strerror(errno)); + goto error; + } - ASSERT(nreturned <= 1); + ASSERT(nreturned <= 1); - if (nreturned == 1) { - struct kevent *ev = &event; + if (nreturned == 1) { + struct kevent *ev = &event; - if (ev->flags & EV_ERROR) { - if (ev->data == EINTR) { - continue; - } - log_error("kevent on kq %d with m %d failed: %s", kq, st->sd, - strerror(ev->data)); - goto error; - } + if (ev->flags & EV_ERROR) { + if (ev->data == EINTR) { + continue; } - - cb(st, &nreturned); + log_error("kevent on kq %d with m %d failed: %s", kq, st->sd, + strerror(ev->data)); + goto error; + } } + cb(st, &nreturned); + } + error: - status = close(kq); - if (status < 0) { - log_error("close kq %d failed, ignored: %s", kq, strerror(errno)); - } - kq = -1; + status = close(kq); + if (status < 0) { + log_error("close kq %d failed, ignored: %s", kq, strerror(errno)); + } + kq = -1; } - -void -event_loop_entropy(event_entropy_cb_t cb, void *arg) -{ - struct entropy *ent = arg; - int status, kq; - struct kevent change, event; - struct timespec ts, *tsp; - - kq = kqueue(); - if (kq < 0) { - log_error("entropy kqueue failed: %s", strerror(errno)); - return; +void event_loop_entropy(event_entropy_cb_t cb, void *arg) { + struct entropy *ent = arg; + int status, kq; + struct kevent change, event; + struct timespec ts, *tsp; + + kq = kqueue(); + if (kq 
< 0) { + log_error("entropy kqueue failed: %s", strerror(errno)); + return; + } + + EV_SET(&change, ent->sd, EVFILT_READ, EV_ADD | EV_CLEAR, 0, 0, NULL); + + /* kevent should block indefinitely if ent->interval < 0 */ + if (ent->interval < 0) { + tsp = NULL; + } else { + tsp = &ts; + tsp->tv_sec = ent->interval / 1000LL; + tsp->tv_nsec = (ent->interval % 1000LL) * 1000000LL; + } + + for (;;) { + int nreturned; + + nreturned = kevent(kq, &change, 1, &event, 1, tsp); + if (nreturned < 0) { + if (errno == EINTR) { + continue; + } + log_error("kevent on kq %d with m %d failed: %s", kq, ent->sd, + strerror(errno)); + goto error; } - EV_SET(&change, ent->sd, EVFILT_READ, EV_ADD | EV_CLEAR, 0, 0, NULL); - - /* kevent should block indefinitely if ent->interval < 0 */ - if (ent->interval < 0) { - tsp = NULL; - } else { - tsp = &ts; - tsp->tv_sec = ent->interval / 1000LL; - tsp->tv_nsec = (ent->interval % 1000LL) * 1000000LL; - } - - - for (;;) { - int nreturned; + ASSERT(nreturned <= 1); - nreturned = kevent(kq, &change, 1, &event, 1, tsp); - if (nreturned < 0) { - if (errno == EINTR) { - continue; - } - log_error("kevent on kq %d with m %d failed: %s", kq, ent->sd, - strerror(errno)); - goto error; - } - - ASSERT(nreturned <= 1); - - if (nreturned == 1) { - struct kevent *ev = &event; + if (nreturned == 1) { + struct kevent *ev = &event; - if (ev->flags & EV_ERROR) { - if (ev->data == EINTR) { - continue; - } - log_error("kevent on kq %d with m %d failed: %s", kq, ent->sd, - strerror(ev->data)); - goto error; - } + if (ev->flags & EV_ERROR) { + if (ev->data == EINTR) { + continue; } - - cb(ent, &nreturned); + log_error("kevent on kq %d with m %d failed: %s", kq, ent->sd, + strerror(ev->data)); + goto error; + } } -error: - status = close(kq); - if (status < 0) { - log_error("close kq %d failed, ignored: %s", kq, strerror(errno)); - } - kq = -1; + cb(ent, &nreturned); + } +error: + status = close(kq); + if (status < 0) { + log_error("close kq %d failed, ignored: %s", kq, 
strerror(errno)); + } + kq = -1; } #endif /* DN_HAVE_KQUEUE */ diff --git a/src/hashkit/dyn_crc16.c b/src/hashkit/dyn_crc16.c index 63955de91..915913465 100644 --- a/src/hashkit/dyn_crc16.c +++ b/src/hashkit/dyn_crc16.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -20,56 +20,52 @@ * limitations under the License. */ -#include #include +#include static const uint16_t crc16tab[256] = { - 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7, - 0x8108, 0x9129, 0xa14a, 0xb16b, 0xc18c, 0xd1ad, 0xe1ce, 0xf1ef, - 0x1231, 0x0210, 0x3273, 0x2252, 0x52b5, 0x4294, 0x72f7, 0x62d6, - 0x9339, 0x8318, 0xb37b, 0xa35a, 0xd3bd, 0xc39c, 0xf3ff, 0xe3de, - 0x2462, 0x3443, 0x0420, 0x1401, 0x64e6, 0x74c7, 0x44a4, 0x5485, - 0xa56a, 0xb54b, 0x8528, 0x9509, 0xe5ee, 0xf5cf, 0xc5ac, 0xd58d, - 0x3653, 0x2672, 0x1611, 0x0630, 0x76d7, 0x66f6, 0x5695, 0x46b4, - 0xb75b, 0xa77a, 0x9719, 0x8738, 0xf7df, 0xe7fe, 0xd79d, 0xc7bc, - 0x48c4, 0x58e5, 0x6886, 0x78a7, 0x0840, 0x1861, 0x2802, 0x3823, - 0xc9cc, 0xd9ed, 0xe98e, 0xf9af, 0x8948, 0x9969, 0xa90a, 0xb92b, - 0x5af5, 0x4ad4, 0x7ab7, 0x6a96, 0x1a71, 0x0a50, 0x3a33, 0x2a12, - 0xdbfd, 0xcbdc, 0xfbbf, 0xeb9e, 0x9b79, 0x8b58, 0xbb3b, 0xab1a, - 0x6ca6, 0x7c87, 0x4ce4, 0x5cc5, 0x2c22, 0x3c03, 0x0c60, 0x1c41, - 0xedae, 0xfd8f, 0xcdec, 0xddcd, 0xad2a, 0xbd0b, 0x8d68, 0x9d49, - 0x7e97, 0x6eb6, 0x5ed5, 0x4ef4, 0x3e13, 0x2e32, 0x1e51, 0x0e70, - 0xff9f, 0xefbe, 0xdfdd, 0xcffc, 0xbf1b, 0xaf3a, 0x9f59, 0x8f78, - 0x9188, 0x81a9, 0xb1ca, 0xa1eb, 0xd10c, 0xc12d, 0xf14e, 0xe16f, - 0x1080, 0x00a1, 0x30c2, 0x20e3, 0x5004, 0x4025, 0x7046, 0x6067, - 0x83b9, 0x9398, 0xa3fb, 0xb3da, 0xc33d, 0xd31c, 0xe37f, 0xf35e, - 0x02b1, 0x1290, 0x22f3, 0x32d2, 0x4235, 0x5214, 
0x6277, 0x7256, - 0xb5ea, 0xa5cb, 0x95a8, 0x8589, 0xf56e, 0xe54f, 0xd52c, 0xc50d, - 0x34e2, 0x24c3, 0x14a0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405, - 0xa7db, 0xb7fa, 0x8799, 0x97b8, 0xe75f, 0xf77e, 0xc71d, 0xd73c, - 0x26d3, 0x36f2, 0x0691, 0x16b0, 0x6657, 0x7676, 0x4615, 0x5634, - 0xd94c, 0xc96d, 0xf90e, 0xe92f, 0x99c8, 0x89e9, 0xb98a, 0xa9ab, - 0x5844, 0x4865, 0x7806, 0x6827, 0x18c0, 0x08e1, 0x3882, 0x28a3, - 0xcb7d, 0xdb5c, 0xeb3f, 0xfb1e, 0x8bf9, 0x9bd8, 0xabbb, 0xbb9a, - 0x4a75, 0x5a54, 0x6a37, 0x7a16, 0x0af1, 0x1ad0, 0x2ab3, 0x3a92, - 0xfd2e, 0xed0f, 0xdd6c, 0xcd4d, 0xbdaa, 0xad8b, 0x9de8, 0x8dc9, - 0x7c26, 0x6c07, 0x5c64, 0x4c45, 0x3ca2, 0x2c83, 0x1ce0, 0x0cc1, - 0xef1f, 0xff3e, 0xcf5d, 0xdf7c, 0xaf9b, 0xbfba, 0x8fd9, 0x9ff8, - 0x6e17, 0x7e36, 0x4e55, 0x5e74, 0x2e93, 0x3eb2, 0x0ed1, 0x1ef0, + 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7, 0x8108, + 0x9129, 0xa14a, 0xb16b, 0xc18c, 0xd1ad, 0xe1ce, 0xf1ef, 0x1231, 0x0210, + 0x3273, 0x2252, 0x52b5, 0x4294, 0x72f7, 0x62d6, 0x9339, 0x8318, 0xb37b, + 0xa35a, 0xd3bd, 0xc39c, 0xf3ff, 0xe3de, 0x2462, 0x3443, 0x0420, 0x1401, + 0x64e6, 0x74c7, 0x44a4, 0x5485, 0xa56a, 0xb54b, 0x8528, 0x9509, 0xe5ee, + 0xf5cf, 0xc5ac, 0xd58d, 0x3653, 0x2672, 0x1611, 0x0630, 0x76d7, 0x66f6, + 0x5695, 0x46b4, 0xb75b, 0xa77a, 0x9719, 0x8738, 0xf7df, 0xe7fe, 0xd79d, + 0xc7bc, 0x48c4, 0x58e5, 0x6886, 0x78a7, 0x0840, 0x1861, 0x2802, 0x3823, + 0xc9cc, 0xd9ed, 0xe98e, 0xf9af, 0x8948, 0x9969, 0xa90a, 0xb92b, 0x5af5, + 0x4ad4, 0x7ab7, 0x6a96, 0x1a71, 0x0a50, 0x3a33, 0x2a12, 0xdbfd, 0xcbdc, + 0xfbbf, 0xeb9e, 0x9b79, 0x8b58, 0xbb3b, 0xab1a, 0x6ca6, 0x7c87, 0x4ce4, + 0x5cc5, 0x2c22, 0x3c03, 0x0c60, 0x1c41, 0xedae, 0xfd8f, 0xcdec, 0xddcd, + 0xad2a, 0xbd0b, 0x8d68, 0x9d49, 0x7e97, 0x6eb6, 0x5ed5, 0x4ef4, 0x3e13, + 0x2e32, 0x1e51, 0x0e70, 0xff9f, 0xefbe, 0xdfdd, 0xcffc, 0xbf1b, 0xaf3a, + 0x9f59, 0x8f78, 0x9188, 0x81a9, 0xb1ca, 0xa1eb, 0xd10c, 0xc12d, 0xf14e, + 0xe16f, 0x1080, 0x00a1, 0x30c2, 0x20e3, 0x5004, 0x4025, 0x7046, 0x6067, + 
0x83b9, 0x9398, 0xa3fb, 0xb3da, 0xc33d, 0xd31c, 0xe37f, 0xf35e, 0x02b1, + 0x1290, 0x22f3, 0x32d2, 0x4235, 0x5214, 0x6277, 0x7256, 0xb5ea, 0xa5cb, + 0x95a8, 0x8589, 0xf56e, 0xe54f, 0xd52c, 0xc50d, 0x34e2, 0x24c3, 0x14a0, + 0x0481, 0x7466, 0x6447, 0x5424, 0x4405, 0xa7db, 0xb7fa, 0x8799, 0x97b8, + 0xe75f, 0xf77e, 0xc71d, 0xd73c, 0x26d3, 0x36f2, 0x0691, 0x16b0, 0x6657, + 0x7676, 0x4615, 0x5634, 0xd94c, 0xc96d, 0xf90e, 0xe92f, 0x99c8, 0x89e9, + 0xb98a, 0xa9ab, 0x5844, 0x4865, 0x7806, 0x6827, 0x18c0, 0x08e1, 0x3882, + 0x28a3, 0xcb7d, 0xdb5c, 0xeb3f, 0xfb1e, 0x8bf9, 0x9bd8, 0xabbb, 0xbb9a, + 0x4a75, 0x5a54, 0x6a37, 0x7a16, 0x0af1, 0x1ad0, 0x2ab3, 0x3a92, 0xfd2e, + 0xed0f, 0xdd6c, 0xcd4d, 0xbdaa, 0xad8b, 0x9de8, 0x8dc9, 0x7c26, 0x6c07, + 0x5c64, 0x4c45, 0x3ca2, 0x2c83, 0x1ce0, 0x0cc1, 0xef1f, 0xff3e, 0xcf5d, + 0xdf7c, 0xaf9b, 0xbfba, 0x8fd9, 0x9ff8, 0x6e17, 0x7e36, 0x4e55, 0x5e74, + 0x2e93, 0x3eb2, 0x0ed1, 0x1ef0, }; -rstatus_t -hash_crc16(const unsigned char *key, size_t key_length, struct dyn_token *token) -{ - uint64_t x; - uint32_t crc = 0; +rstatus_t hash_crc16(const unsigned char *key, size_t key_length, + struct dyn_token *token) { + uint64_t x; + uint32_t crc = 0; - for (x=0; x < key_length; x++) { - crc = (crc << 8) ^ crc16tab[((crc >> 8) ^ (uint32_t)*key++) & 0x00ff]; - } + for (x = 0; x < key_length; x++) { + crc = (crc << 8) ^ crc16tab[((crc >> 8) ^ (uint32_t)*key++) & 0x00ff]; + } - size_dyn_token(token, 1); - set_int_dyn_token(token, crc); + size_dyn_token(token, 1); + set_int_dyn_token(token, crc); - return DN_OK; + return DN_OK; } diff --git a/src/hashkit/dyn_crc32.c b/src/hashkit/dyn_crc32.c index 1e50a9908..cc0b7ea4b 100644 --- a/src/hashkit/dyn_crc32.c +++ b/src/hashkit/dyn_crc32.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. 
+ */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -27,74 +27,54 @@ * src/usr.bin/cksum/crc32.c. */ -#include -#include #include +#include "../dyn_types.h" +#include "dyn_token.h" + static const uint32_t crc32tab[256] = { - 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, - 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, - 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, - 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, - 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, - 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, - 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, - 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, - 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, - 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, - 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, - 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, - 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, - 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, - 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, - 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, - 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, - 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, - 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, - 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, - 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, - 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, - 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, - 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, - 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, - 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, - 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, - 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, - 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, - 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, - 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, - 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, - 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, - 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, - 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 
- 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, - 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, - 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, - 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, - 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, - 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, - 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, - 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, - 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79, - 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, - 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, - 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, - 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, - 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, - 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, - 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, - 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, - 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, - 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, - 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, - 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, - 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, - 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, - 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, - 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, - 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, - 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf, - 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, + 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, + 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, + 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, + 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, + 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, + 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, + 
0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, + 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, + 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, + 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, + 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, + 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, + 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, + 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, + 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, + 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, + 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, + 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, + 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, + 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, + 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, + 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, + 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, + 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, + 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, + 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, + 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, + 
0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, + 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, + 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, + 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, + 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d, }; @@ -102,56 +82,51 @@ static const uint32_t crc32tab[256] = { * CRC-32 implementation compatible with libmemcached library. Unfortunately * this implementation does not return CRC-32 as per spec. */ -rstatus_t -hash_crc32(const unsigned char *key, size_t key_length, struct dyn_token *token) -{ - uint64_t x; - uint32_t crc = UINT32_MAX; +rstatus_t hash_crc32(const unsigned char *key, size_t key_length, + struct dyn_token *token) { + uint64_t x; + uint32_t crc = UINT32_MAX; - for (x = 0; x < key_length; x++) { - crc = (crc >> 8) ^ crc32tab[(crc ^ (uint64_t)key[x]) & 0xff]; - } + for (x = 0; x < key_length; x++) { + crc = (crc >> 8) ^ crc32tab[(crc ^ (uint64_t)key[x]) & 0xff]; + } - uint32_t val = ((~crc) >> 16) & 0x7fff; - size_dyn_token(token, 1); - set_int_dyn_token(token, val); + uint32_t val = ((~crc) >> 16) & 0x7fff; + size_dyn_token(token, 1); + set_int_dyn_token(token, val); - return DN_OK; + return DN_OK; } -uint32_t -hash_crc32a(const unsigned char *key, size_t key_length, struct dyn_token *token) -{ - const uint8_t *p = key; - uint32_t crc; +uint32_t hash_crc32a(const unsigned char *key, size_t key_length, + struct dyn_token *token) { + const uint8_t *p = key; + uint32_t crc; - crc = ~0U; - while (key_length--) { - crc = crc32tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8); - } + crc = ~0U; + while (key_length--) { + crc = crc32tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8); + } - uint32_t val = crc ^ ~0U; - 
size_dyn_token(token, 1); - set_int_dyn_token(token, val); + uint32_t val = crc ^ ~0U; + size_dyn_token(token, 1); + set_int_dyn_token(token, val); - return DN_OK; + return DN_OK; } -// crc32 for sequential buffers. - -#define _CRC32_(crc, ch) ((crc) = ((crc) >> 8) ^ crc32tab[((crc) ^ (ch)) &\ - 0xff]) -uint32_t -crc32_sz(const unsigned char *buf, size_t buf_length, uint32_t in_crc32) -{ - uint32_t crc = ~in_crc32; - const unsigned char *p; - int len, - nr; - - len = 0; - nr = buf_length; - for (len += nr, p = buf; nr--; ++p) - _CRC32_(crc, tolower((unsigned int) *p)); - return ~crc; + // crc32 for sequential buffers. + +#define _CRC32_(crc, ch) \ + ((crc) = ((crc) >> 8) ^ crc32tab[((crc) ^ (ch)) & 0xff]) +uint32_t crc32_sz(const unsigned char *buf, size_t buf_length, + uint32_t in_crc32) { + uint32_t crc = ~in_crc32; + const unsigned char *p; + int len, nr; + + len = 0; + nr = buf_length; + for (len += nr, p = buf; nr--; ++p) _CRC32_(crc, tolower((unsigned int)*p)); + return ~crc; } diff --git a/src/hashkit/dyn_fnv.c b/src/hashkit/dyn_fnv.c index 27f829eb4..a467331e3 100644 --- a/src/hashkit/dyn_fnv.c +++ b/src/hashkit/dyn_fnv.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -20,83 +20,79 @@ * limitations under the License. 
*/ -#include #include +#include static uint64_t FNV_64_INIT = UINT64_C(0xcbf29ce484222325); static uint64_t FNV_64_PRIME = UINT64_C(0x100000001b3); static uint32_t FNV_32_INIT = 2166136261UL; static uint32_t FNV_32_PRIME = 16777619; -rstatus_t -hash_fnv1_64(const unsigned char *key, size_t key_length, struct dyn_token *token) -{ - uint64_t hash = FNV_64_INIT; - size_t x; +rstatus_t hash_fnv1_64(const unsigned char *key, size_t key_length, + struct dyn_token *token) { + uint64_t hash = FNV_64_INIT; + size_t x; - for (x = 0; x < key_length; x++) { - hash *= FNV_64_PRIME; - hash ^= (uint64_t)key[x]; - } + for (x = 0; x < key_length; x++) { + hash *= FNV_64_PRIME; + hash ^= (uint64_t)key[x]; + } - //note: original version simply downcast the uint64_t to uint32_t - uint32_t val = (uint32_t)hash; - size_dyn_token(token, 1); - set_int_dyn_token(token, val); + // note: original version simply downcast the uint64_t to uint32_t + uint32_t val = (uint32_t)hash; + size_dyn_token(token, 1); + set_int_dyn_token(token, val); - return DN_OK; + return DN_OK; } -rstatus_t -hash_fnv1a_64(const unsigned char *key, size_t key_length, struct dyn_token *token) -{ - uint32_t hash = (uint32_t) FNV_64_INIT; - size_t x; +rstatus_t hash_fnv1a_64(const unsigned char *key, size_t key_length, + struct dyn_token *token) { + uint32_t hash = (uint32_t)FNV_64_INIT; + size_t x; - for (x = 0; x < key_length; x++) { - uint32_t val = (uint32_t)key[x]; - hash ^= val; - hash *= (uint32_t) FNV_64_PRIME; - } + for (x = 0; x < key_length; x++) { + uint32_t val = (uint32_t)key[x]; + hash ^= val; + hash *= (uint32_t)FNV_64_PRIME; + } - size_dyn_token(token, 1); - set_int_dyn_token(token, hash); + size_dyn_token(token, 1); + set_int_dyn_token(token, hash); - return DN_OK; + return DN_OK; } -rstatus_t -hash_fnv1_32(const unsigned char *key, size_t key_length, struct dyn_token *token) -{ - uint32_t hash = FNV_32_INIT; - size_t x; +rstatus_t hash_fnv1_32(const unsigned char *key, size_t key_length, + struct 
dyn_token *token) { + uint32_t hash = FNV_32_INIT; + size_t x; - for (x = 0; x < key_length; x++) { - uint32_t val = (uint32_t)key[x]; - hash *= FNV_32_PRIME; - hash ^= val; - } + for (x = 0; x < key_length; x++) { + uint32_t val = (uint32_t)key[x]; + hash *= FNV_32_PRIME; + hash ^= val; + } - size_dyn_token(token, 1); - set_int_dyn_token(token, hash); + size_dyn_token(token, 1); + set_int_dyn_token(token, hash); - return DN_OK; + return DN_OK; } -rstatus_t -hash_fnv1a_32(const unsigned char *key, size_t key_length, struct dyn_token *token) -{ - uint32_t hash = FNV_32_INIT; - size_t x; +rstatus_t hash_fnv1a_32(const unsigned char *key, size_t key_length, + struct dyn_token *token) { + uint32_t hash = FNV_32_INIT; + size_t x; - for (x= 0; x < key_length; x++) { - uint32_t val = (uint32_t)key[x]; - hash ^= val; - hash *= FNV_32_PRIME; - } + for (x = 0; x < key_length; x++) { + uint32_t val = (uint32_t)key[x]; + hash ^= val; + hash *= FNV_32_PRIME; + } - size_dyn_token(token, 1); - set_int_dyn_token(token, hash); + size_dyn_token(token, 1); + set_int_dyn_token(token, hash); - return DN_OK; + return DN_OK; } diff --git a/src/hashkit/dyn_hashkit.c b/src/hashkit/dyn_hashkit.c index a03506160..a01e3f713 100644 --- a/src/hashkit/dyn_hashkit.c +++ b/src/hashkit/dyn_hashkit.c @@ -1,44 +1,38 @@ #include "dyn_hashkit.h" +#include "../dyn_string.h" + #define DEFINE_ACTION(_hash, _name) string(#_name), -struct string hash_strings[] = { - HASH_CODEC( DEFINE_ACTION ) - null_string -}; +struct string hash_strings[] = {HASH_CODEC(DEFINE_ACTION) null_string}; #undef DEFINE_ACTION // Defines all the hashing functions -#define DEFINE_ACTION(_hash, _name) rstatus_t hash_##_name(const unsigned char *key, size_t length, struct dyn_token *token); -HASH_CODEC( DEFINE_ACTION ) \ +#define DEFINE_ACTION(_hash, _name) \ + rstatus_t hash_##_name(const unsigned char *key, size_t length, \ + struct dyn_token *token); +HASH_CODEC(DEFINE_ACTION) #undef DEFINE_ACTION // Creates an array of hash 
functions #define DEFINE_ACTION(_hash, _name) hash_##_name, -static hash_func_t hash_algos[] = { - HASH_CODEC( DEFINE_ACTION ) - NULL -}; +static hash_func_t hash_algos[] = {HASH_CODEC(DEFINE_ACTION) NULL}; #undef DEFINE_ACTION -hash_func_t -get_hash_func(hash_type_t hash_type) -{ - if ((hash_type >= 0) && (hash_type < HASH_INVALID)) - return hash_algos[hash_type]; - return NULL; +hash_func_t get_hash_func(hash_type_t hash_type) { + if ((hash_type >= 0) && (hash_type < HASH_INVALID)) + return hash_algos[hash_type]; + return NULL; } -hash_type_t -get_hash_type(struct string *hash_name) -{ - struct string *hash_iter; - for (hash_iter = hash_strings; hash_iter->len != 0; hash_iter++) { - if (string_compare(hash_name, hash_iter) != 0) { - continue; - } - - return hash_iter - hash_strings; +hash_type_t get_hash_type(struct string *hash_name) { + struct string *hash_iter; + for (hash_iter = hash_strings; hash_iter->len != 0; hash_iter++) { + if (string_compare(hash_name, hash_iter) != 0) { + continue; } - return HASH_INVALID; + + return hash_iter - hash_strings; + } + return HASH_INVALID; } diff --git a/src/hashkit/dyn_hashkit.h b/src/hashkit/dyn_hashkit.h index 126ce8cc5..0a2f59783 100644 --- a/src/hashkit/dyn_hashkit.h +++ b/src/hashkit/dyn_hashkit.h @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -19,37 +19,41 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include -#include - #ifndef _DYN_HASHKIT_H_ #define _DYN_HASHKIT_H_ -void md5_signature(const unsigned char *key, unsigned int length, unsigned char *result); +#include "../dyn_types.h" + +// Forward declarations +struct dyn_token; +struct string; + +typedef rstatus_t (*hash_func_t)(const unsigned char *, size_t, + struct dyn_token *); + +void md5_signature(const unsigned char *key, unsigned int length, + unsigned char *result); uint32_t crc32_sz(const char *buf, size_t length, uint32_t in_crc32); -#define HASH_CODEC(ACTION) \ - ACTION( HASH_ONE_AT_A_TIME, one_at_a_time ) \ - ACTION( HASH_MD5, md5 ) \ - ACTION( HASH_CRC16, crc16 ) \ - ACTION( HASH_CRC32, crc32 ) \ - ACTION( HASH_CRC32A, crc32a ) \ - ACTION( HASH_FNV1_64, fnv1_64 ) \ - ACTION( HASH_FNV1A_64, fnv1a_64 ) \ - ACTION( HASH_FNV1_32, fnv1_32 ) \ - ACTION( HASH_FNV1A_32, fnv1a_32 ) \ - ACTION( HASH_HSIEH, hsieh ) \ - ACTION( HASH_MURMUR, murmur ) \ - ACTION( HASH_JENKINS, jenkins ) \ - ACTION( HASH_MURMUR3, murmur3 ) \ +#define HASH_CODEC(ACTION) \ + ACTION(HASH_ONE_AT_A_TIME, one_at_a_time) \ + ACTION(HASH_MD5, md5) \ + ACTION(HASH_CRC16, crc16) \ + ACTION(HASH_CRC32, crc32) \ + ACTION(HASH_CRC32A, crc32a) \ + ACTION(HASH_FNV1_64, fnv1_64) \ + ACTION(HASH_FNV1A_64, fnv1a_64) \ + ACTION(HASH_FNV1_32, fnv1_32) \ + ACTION(HASH_FNV1A_32, fnv1a_32) \ + ACTION(HASH_HSIEH, hsieh) \ + ACTION(HASH_MURMUR, murmur) \ + ACTION(HASH_JENKINS, jenkins) \ + ACTION(HASH_MURMUR3, murmur3) #define DEFINE_ACTION(_hash, _name) _hash, -typedef enum hash_type { - HASH_CODEC( DEFINE_ACTION ) - HASH_INVALID -} hash_type_t; +typedef enum hash_type { HASH_CODEC(DEFINE_ACTION) HASH_INVALID } hash_type_t; #undef DEFINE_ACTION hash_func_t get_hash_func(hash_type_t hash_type); diff --git a/src/hashkit/dyn_hsieh.c b/src/hashkit/dyn_hsieh.c index b516a471a..e9bff67be 100644 --- a/src/hashkit/dyn_hsieh.c +++ b/src/hashkit/dyn_hsieh.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed 
storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -26,78 +26,77 @@ * See: http://www.azillionmonkeys.com/qed/weblicense.html for license * details. * http://www.azillionmonkeys.com/qed/hash.html -*/ + */ -#include #include +#include #undef get16bits #if (defined(__GNUC__) && defined(__i386__)) -#define get16bits(d) (*((const uint16_t *) (d))) +#define get16bits(d) (*((const uint16_t *)(d))) #endif -#if !defined (get16bits) -#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\ - +(uint32_t)(((const uint8_t *)(d))[0]) ) +#if !defined(get16bits) +#define get16bits(d) \ + ((((uint32_t)(((const uint8_t *)(d))[1])) << 8) + \ + (uint32_t)(((const uint8_t *)(d))[0])) #endif -rstatus_t -hash_hsieh(const unsigned char *key, size_t key_length, struct dyn_token *token) -{ - uint32_t hash = 0, tmp; - int rem; - - if (key_length <= 0 || key == NULL) { - return 0; - } - - rem = key_length & 3; - key_length >>= 2; - - /* Main loop */ - for (;key_length > 0; key_length--) { - hash += get16bits (key); - tmp = (get16bits (key+2) << 11) ^ hash; - hash = (hash << 16) ^ tmp; - key += 2*sizeof (uint16_t); - hash += hash >> 11; - } - - /* Handle end cases */ - switch (rem) { +rstatus_t hash_hsieh(const unsigned char *key, size_t key_length, + struct dyn_token *token) { + uint32_t hash = 0, tmp; + int rem; + + if (key_length <= 0 || key == NULL) { + return 0; + } + + rem = key_length & 3; + key_length >>= 2; + + /* Main loop */ + for (; key_length > 0; key_length--) { + hash += get16bits(key); + tmp = (get16bits(key + 2) << 11) ^ hash; + hash = (hash << 16) ^ tmp; + key += 2 * sizeof(uint16_t); + hash += hash >> 11; + } + + /* Handle end cases */ + switch (rem) { case 3: - hash += get16bits (key); - hash ^= hash << 16; - hash ^= (uint32_t)key[sizeof (uint16_t)] << 18; - 
hash += hash >> 11; - break; + hash += get16bits(key); + hash ^= hash << 16; + hash ^= (uint32_t)key[sizeof(uint16_t)] << 18; + hash += hash >> 11; + break; case 2: - hash += get16bits (key); - hash ^= hash << 11; - hash += hash >> 17; - break; + hash += get16bits(key); + hash ^= hash << 11; + hash += hash >> 17; + break; case 1: - hash += (unsigned char)(*key); - hash ^= hash << 10; - hash += hash >> 1; + hash += (unsigned char)(*key); + hash ^= hash << 10; + hash += hash >> 1; default: - break; - } - - /* Force "avalanching" of final 127 bits */ - hash ^= hash << 3; - hash += hash >> 5; - hash ^= hash << 4; - hash += hash >> 17; - hash ^= hash << 25; - hash += hash >> 6; + break; + } - size_dyn_token(token, 1); - set_int_dyn_token(token, hash); + /* Force "avalanching" of final 127 bits */ + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; - return DN_OK; + size_dyn_token(token, 1); + set_int_dyn_token(token, hash); + return DN_OK; } diff --git a/src/hashkit/dyn_jenkins.c b/src/hashkit/dyn_jenkins.c index 876dada15..e0308da0e 100644 --- a/src/hashkit/dyn_jenkins.c +++ b/src/hashkit/dyn_jenkins.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. 
@@ -32,33 +32,52 @@ * Add big endian support */ -#include #include +#include -#define hashsize(n) ((uint32_t)1<<(n)) -#define hashmask(n) (hashsize(n)-1) -#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k)))) - -#define mix(a,b,c) \ -{ \ - a -= c; a ^= rot(c, 4); c += b; \ - b -= a; b ^= rot(a, 6); a += c; \ - c -= b; c ^= rot(b, 8); b += a; \ - a -= c; a ^= rot(c,16); c += b; \ - b -= a; b ^= rot(a,19); a += c; \ - c -= b; c ^= rot(b, 4); b += a; \ -} +#define hashsize(n) ((uint32_t)1 << (n)) +#define hashmask(n) (hashsize(n) - 1) +#define rot(x, k) (((x) << (k)) | ((x) >> (32 - (k)))) -#define final(a,b,c) \ -{ \ - c ^= b; c -= rot(b,14); \ - a ^= c; a -= rot(c,11); \ - b ^= a; b -= rot(a,25); \ - c ^= b; c -= rot(b,16); \ - a ^= c; a -= rot(c,4); \ - b ^= a; b -= rot(a,14); \ - c ^= b; c -= rot(b,24); \ -} +#define mix(a, b, c) \ + { \ + a -= c; \ + a ^= rot(c, 4); \ + c += b; \ + b -= a; \ + b ^= rot(a, 6); \ + a += c; \ + c -= b; \ + c ^= rot(b, 8); \ + b += a; \ + a -= c; \ + a ^= rot(c, 16); \ + c += b; \ + b -= a; \ + b ^= rot(a, 19); \ + a += c; \ + c -= b; \ + c ^= rot(b, 4); \ + b += a; \ + } + +#define final(a, b, c) \ + { \ + c ^= b; \ + c -= rot(b, 14); \ + a ^= c; \ + a -= rot(c, 11); \ + b ^= a; \ + b -= rot(a, 25); \ + c ^= b; \ + c -= rot(b, 16); \ + a ^= c; \ + a -= rot(c, 4); \ + b ^= a; \ + b -= rot(a, 14); \ + c ^= b; \ + c -= rot(b, 24); \ + } #define JENKINS_INITVAL 13 @@ -78,28 +97,28 @@ * In which case, the hash table should have hashsize(10) elements. 
*/ -rstatus_t -hash_jenkins(const unsigned char *key, size_t length, struct dyn_token *token) -{ - uint32_t a,b,c; /* internal state */ - union { const void *ptr; size_t i; } u; /* needed for Mac Powerbook G4 */ +rstatus_t hash_jenkins(const unsigned char *key, size_t length, + struct dyn_token *token) { + uint32_t a, b, c; /* internal state */ + union { + const void *ptr; + size_t i; + } u; /* needed for Mac Powerbook G4 */ /* Set up the internal state */ a = b = c = 0xdeadbeef + ((uint32_t)length) + JENKINS_INITVAL; u.ptr = key; #ifndef WORDS_BIGENDIAN - if ((u.i & 0x3) == 0) - { - const uint32_t *k = (const uint32_t *)key; /* read 32-bit chunks */ + if ((u.i & 0x3) == 0) { + const uint32_t *k = (const uint32_t *)key; /* read 32-bit chunks */ /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */ - while (length > 12) - { + while (length > 12) { a += k[0]; b += k[1]; c += k[2]; - mix(a,b,c); + mix(a, b, c); length -= 12; k += 3; } @@ -114,127 +133,185 @@ hash_jenkins(const unsigned char *key, size_t length, struct dyn_token *token) * still catch it and complain. The masking trick does make the hash * noticeably faster for short strings (like English words). 
*/ - switch(length) - { - case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; - case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break; - case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break; - case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break; - case 8 : b+=k[1]; a+=k[0]; break; - case 7 : b+=k[1]&0xffffff; a+=k[0]; break; - case 6 : b+=k[1]&0xffff; a+=k[0]; break; - case 5 : b+=k[1]&0xff; a+=k[0]; break; - case 4 : a+=k[0]; break; - case 3 : a+=k[0]&0xffffff; break; - case 2 : a+=k[0]&0xffff; break; - case 1 : a+=k[0]&0xff; break; - case 0 : return c; /* zero length strings require no mixing */ - default: return c; + switch (length) { + case 12: + c += k[2]; + b += k[1]; + a += k[0]; + break; + case 11: + c += k[2] & 0xffffff; + b += k[1]; + a += k[0]; + break; + case 10: + c += k[2] & 0xffff; + b += k[1]; + a += k[0]; + break; + case 9: + c += k[2] & 0xff; + b += k[1]; + a += k[0]; + break; + case 8: + b += k[1]; + a += k[0]; + break; + case 7: + b += k[1] & 0xffffff; + a += k[0]; + break; + case 6: + b += k[1] & 0xffff; + a += k[0]; + break; + case 5: + b += k[1] & 0xff; + a += k[0]; + break; + case 4: + a += k[0]; + break; + case 3: + a += k[0] & 0xffffff; + break; + case 2: + a += k[0] & 0xffff; + break; + case 1: + a += k[0] & 0xff; + break; + case 0: + return c; /* zero length strings require no mixing */ + default: + return c; } - } - else if ((u.i & 0x1) == 0) - { - const uint16_t *k = (const uint16_t *)key; /* read 16-bit chunks */ - const uint8_t *k8; + } else if ((u.i & 0x1) == 0) { + const uint16_t *k = (const uint16_t *)key; /* read 16-bit chunks */ + const uint8_t *k8; /*--------------- all but last block: aligned reads and different mixing */ - while (length > 12) - { - a += k[0] + (((uint32_t)k[1])<<16); - b += k[2] + (((uint32_t)k[3])<<16); - c += k[4] + (((uint32_t)k[5])<<16); - mix(a,b,c); + while (length > 12) { + a += k[0] + (((uint32_t)k[1]) << 16); + b += k[2] + (((uint32_t)k[3]) << 16); + c += k[4] + (((uint32_t)k[5]) << 16); + mix(a, b, c); length -= 12; k 
+= 6; } /*----------------------------- handle the last (probably partial) block */ k8 = (const uint8_t *)k; - switch(length) - { - case 12: c+=k[4]+(((uint32_t)k[5])<<16); - b+=k[2]+(((uint32_t)k[3])<<16); - a+=k[0]+(((uint32_t)k[1])<<16); - break; - case 11: c+=((uint32_t)k8[10])<<16; /* fall through */ - case 10: c+=k[4]; - b+=k[2]+(((uint32_t)k[3])<<16); - a+=k[0]+(((uint32_t)k[1])<<16); - break; - case 9 : c+=k8[8]; /* fall through */ - case 8 : b+=k[2]+(((uint32_t)k[3])<<16); - a+=k[0]+(((uint32_t)k[1])<<16); - break; - case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */ - case 6 : b+=k[2]; - a+=k[0]+(((uint32_t)k[1])<<16); - break; - case 5 : b+=k8[4]; /* fall through */ - case 4 : a+=k[0]+(((uint32_t)k[1])<<16); - break; - case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */ - case 2 : a+=k[0]; - break; - case 1 : a+=k8[0]; - break; - case 0 : return c; /* zero length requires no mixing */ - default: return c; + switch (length) { + case 12: + c += k[4] + (((uint32_t)k[5]) << 16); + b += k[2] + (((uint32_t)k[3]) << 16); + a += k[0] + (((uint32_t)k[1]) << 16); + break; + case 11: + c += ((uint32_t)k8[10]) << 16; /* fall through */ + case 10: + c += k[4]; + b += k[2] + (((uint32_t)k[3]) << 16); + a += k[0] + (((uint32_t)k[1]) << 16); + break; + case 9: + c += k8[8]; /* fall through */ + case 8: + b += k[2] + (((uint32_t)k[3]) << 16); + a += k[0] + (((uint32_t)k[1]) << 16); + break; + case 7: + b += ((uint32_t)k8[6]) << 16; /* fall through */ + case 6: + b += k[2]; + a += k[0] + (((uint32_t)k[1]) << 16); + break; + case 5: + b += k8[4]; /* fall through */ + case 4: + a += k[0] + (((uint32_t)k[1]) << 16); + break; + case 3: + a += ((uint32_t)k8[2]) << 16; /* fall through */ + case 2: + a += k[0]; + break; + case 1: + a += k8[0]; + break; + case 0: + return c; /* zero length requires no mixing */ + default: + return c; } - } - else - { /* need to read the key one byte at a time */ -#endif /* little endian */ + } else { /* need to read the key one byte at a 
time */ +#endif /* little endian */ const uint8_t *k = (const uint8_t *)key; /*--------------- all but the last block: affect some 32 bits of (a,b,c) */ - while (length > 12) - { + while (length > 12) { a += k[0]; - a += ((uint32_t)k[1])<<8; - a += ((uint32_t)k[2])<<16; - a += ((uint32_t)k[3])<<24; + a += ((uint32_t)k[1]) << 8; + a += ((uint32_t)k[2]) << 16; + a += ((uint32_t)k[3]) << 24; b += k[4]; - b += ((uint32_t)k[5])<<8; - b += ((uint32_t)k[6])<<16; - b += ((uint32_t)k[7])<<24; + b += ((uint32_t)k[5]) << 8; + b += ((uint32_t)k[6]) << 16; + b += ((uint32_t)k[7]) << 24; c += k[8]; - c += ((uint32_t)k[9])<<8; - c += ((uint32_t)k[10])<<16; - c += ((uint32_t)k[11])<<24; - mix(a,b,c); + c += ((uint32_t)k[9]) << 8; + c += ((uint32_t)k[10]) << 16; + c += ((uint32_t)k[11]) << 24; + mix(a, b, c); length -= 12; k += 12; } /*-------------------------------- last block: affect all 32 bits of (c) */ - switch(length) /* all the case statements fall through */ + switch (length) /* all the case statements fall through */ { - case 12: c+=((uint32_t)k[11])<<24; - case 11: c+=((uint32_t)k[10])<<16; - case 10: c+=((uint32_t)k[9])<<8; - case 9 : c+=k[8]; - case 8 : b+=((uint32_t)k[7])<<24; - case 7 : b+=((uint32_t)k[6])<<16; - case 6 : b+=((uint32_t)k[5])<<8; - case 5 : b+=k[4]; - case 4 : a+=((uint32_t)k[3])<<24; - case 3 : a+=((uint32_t)k[2])<<16; - case 2 : a+=((uint32_t)k[1])<<8; - case 1 : a+=k[0]; - break; - case 0 : return c; - default : return c; + case 12: + c += ((uint32_t)k[11]) << 24; + case 11: + c += ((uint32_t)k[10]) << 16; + case 10: + c += ((uint32_t)k[9]) << 8; + case 9: + c += k[8]; + case 8: + b += ((uint32_t)k[7]) << 24; + case 7: + b += ((uint32_t)k[6]) << 16; + case 6: + b += ((uint32_t)k[5]) << 8; + case 5: + b += k[4]; + case 4: + a += ((uint32_t)k[3]) << 24; + case 3: + a += ((uint32_t)k[2]) << 16; + case 2: + a += ((uint32_t)k[1]) << 8; + case 1: + a += k[0]; + break; + case 0: + return c; + default: + return c; } #ifndef WORDS_BIGENDIAN } #endif - 
final(a,b,c); + final(a, b, c); - size_dyn_token(token, 1); - set_int_dyn_token(token, c); + size_dyn_token(token, 1); + set_int_dyn_token(token, c); - return DN_OK; + return DN_OK; } diff --git a/src/hashkit/dyn_ketama.c b/src/hashkit/dyn_ketama.c index f121675b6..a63ac7d8a 100644 --- a/src/hashkit/dyn_ketama.c +++ b/src/hashkit/dyn_ketama.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -20,17 +20,18 @@ * limitations under the License. */ +#include #include #include -#include #include -#include #include +#include -#define KETAMA_CONTINUUM_ADDITION 10 /* # extra slots to build into continuum */ -#define KETAMA_POINTS_PER_SERVER 160 /* 40 points per hash */ -#define KETAMA_MAX_HOSTLEN 86 +#define KETAMA_CONTINUUM_ADDITION 10 /* # extra slots to build into continuum \ + */ +#define KETAMA_POINTS_PER_SERVER 160 /* 40 points per hash */ +#define KETAMA_MAX_HOSTLEN 86 /*static uint32_t ketama_hash(const char *key, size_t key_length, uint32_t alignment) @@ -59,187 +60,191 @@ ketama_item_cmp(const void *t1, const void *t2) } }*/ -rstatus_t -ketama_update(struct server_pool *pool) -{ - /* uint32_t nserver; /\* # server - live and dead *\/ */ - /* uint32_t nlive_server; /\* # live server *\/ */ - /* uint32_t pointer_per_server; /\* pointers per server proportional to weight *\/ */ - /* uint32_t pointer_per_hash; /\* pointers per hash *\/ */ - /* uint32_t pointer_counter; /\* # pointers on continuum *\/ */ - /* uint32_t pointer_index; /\* pointer index *\/ */ - /* uint32_t points_per_server; /\* points per server *\/ */ - /* uint32_t continuum_index; /\* continuum index *\/ */ - /* uint32_t continuum_addition; /\* extra space in the continuum *\/ */ - /* uint32_t 
server_index; /\* server index *\/ */ - /* uint32_t value; /\* continuum value *\/ */ - /* uint32_t total_weight; /\* total live server weight *\/ */ - /* int64_t now; /\* current timestamp in usec *\/ */ - - /* ASSERT(array_n(&pool->server) > 0); */ - - /* now = dn_usec_now(); */ - /* if (now < 0) { */ - /* return DN_ERROR; */ - /* } */ - - /* /\* */ - /* * Count live servers and total weight, and also update the next time to */ - /* * rebuild the distribution */ - /* *\/ */ - /* nserver = array_n(&pool->server); */ - /* nlive_server = 0; */ - /* total_weight = 0; */ - /* pool->next_rebuild = 0LL; */ - /* for (server_index = 0; server_index < nserver; server_index++) { */ - /* struct server *server = array_get(&pool->server, server_index); */ - - /* if (pool->auto_eject_hosts) { */ - /* if (server->next_retry <= now) { */ - /* server->next_retry = 0LL; */ - /* nlive_server++; */ - /* } else if (pool->next_rebuild == 0LL || */ - /* server->next_retry < pool->next_rebuild) { */ - /* pool->next_rebuild = server->next_retry; */ - /* } */ - /* } else { */ - /* nlive_server++; */ - /* } */ - - /* ASSERT(server->weight > 0); */ - - /* /\* count weight only for live servers *\/ */ - /* if (!pool->auto_eject_hosts || server->next_retry <= now) { */ - /* total_weight += server->weight; */ - /* } */ - /* } */ - - /* pool->nlive_server = nlive_server; */ - - /* if (nlive_server == 0) { */ - /* log_debug(LOG_DEBUG, "no live servers for pool %"PRIu32" '%.*s'", */ - /* pool->idx, pool->name.len, pool->name.data); */ - - /* return DN_OK; */ - /* } */ - /* log_debug(LOG_DEBUG, "%"PRIu32" of %"PRIu32" servers are live for pool " */ - /* "%"PRIu32" '%.*s'", nlive_server, nserver, pool->idx, */ - /* pool->name.len, pool->name.data); */ - - /* continuum_addition = KETAMA_CONTINUUM_ADDITION; */ - /* points_per_server = KETAMA_POINTS_PER_SERVER; */ - /* /\* */ - /* * Allocate the continuum for the pool, the first time, and every time we */ - /* * add a new server to the pool */ - /* *\/ 
*/ - /* if (nlive_server > pool->nserver_continuum) { */ - /* struct continuum *continuum; */ - /* uint32_t nserver_continuum = nlive_server + continuum_addition; */ - /* uint32_t ncontinuum = nserver_continuum * points_per_server; */ - - /* continuum = dn_realloc(pool->continuum, sizeof(*continuum) * ncontinuum); */ - /* if (continuum == NULL) { */ - /* return DN_ENOMEM; */ - /* } */ - - /* pool->continuum = continuum; */ - /* pool->nserver_continuum = nserver_continuum; */ - /* /\* pool->ncontinuum is initialized later as it could be <= ncontinuum *\/ */ - /* } */ - - /* /\* */ - /* * Build a continuum with the servers that are live and points from */ - /* * these servers that are proportial to their weight */ - /* *\/ */ - /* continuum_index = 0; */ - /* pointer_counter = 0; */ - /* for (server_index = 0; server_index < nserver; server_index++) { */ - /* struct server *server; */ - /* float pct; */ - - /* server = array_get(&pool->server, server_index); */ - - /* if (pool->auto_eject_hosts && server->next_retry > now) { */ - /* continue; */ - /* } */ - - /* pct = (float)server->weight / (float)total_weight; */ - /* pointer_per_server = (uint32_t) ((floorf((float) (pct * KETAMA_POINTS_PER_SERVER / 4 * (float)nlive_server + 0.0000000001))) * 4); */ - /* pointer_per_hash = 4; */ - - /* log_debug(LOG_VERB, "%.*s:%"PRIu16" weight %"PRIu32" of %"PRIu32" " */ - /* "pct %0.5f points per server %"PRIu32"", */ - /* server->name.len, server->name.data, server->port, */ - /* server->weight, total_weight, pct, pointer_per_server); */ - - /* for (pointer_index = 1; */ - /* pointer_index <= pointer_per_server / pointer_per_hash; */ - /* pointer_index++) { */ - - /* char host[KETAMA_MAX_HOSTLEN]= ""; */ - /* size_t hostlen; */ - /* uint32_t x; */ - - /* hostlen = snprintf(host, KETAMA_MAX_HOSTLEN, "%.*s-%u", */ - /* server->name.len, server->name.data, */ - /* pointer_index - 1); */ - - /* for (x = 0; x < pointer_per_hash; x++) { */ - /* value = ketama_hash(host, hostlen, x); 
*/ - /* pool->continuum[continuum_index].index = server_index; */ - /* pool->continuum[continuum_index++].value = value; */ - /* } */ - /* } */ - /* pointer_counter += pointer_per_server; */ - /* } */ - - /* pool->ncontinuum = pointer_counter; */ - /* qsort(pool->continuum, pool->ncontinuum, sizeof(*pool->continuum), */ - /* ketama_item_cmp); */ - - /* for (pointer_index = 0; */ - /* pointer_index < ((nlive_server * KETAMA_POINTS_PER_SERVER) - 1); */ - /* pointer_index++) { */ - /* if (pointer_index + 1 >= pointer_counter) { */ - /* break; */ - /* } */ - /* ASSERT(pool->continuum[pointer_index].value <= */ - /* pool->continuum[pointer_index + 1].value); */ - /* } */ - - /* log_debug(LOG_VERB, "updated pool %"PRIu32" '%.*s' with %"PRIu32" of " */ - /* "%"PRIu32" servers live in %"PRIu32" slots and %"PRIu32" " */ - /* "active points in %"PRIu32" slots", pool->idx, */ - /* pool->name.len, pool->name.data, nlive_server, nserver, */ - /* pool->nserver_continuum, pool->ncontinuum, */ - /* (pool->nserver_continuum + continuum_addition) * points_per_server); */ - - return DN_OK; +rstatus_t ketama_update(struct server_pool *pool) { + /* uint32_t nserver; /\* # server - live and dead *\/ */ + /* uint32_t nlive_server; /\* # live server *\/ */ + /* uint32_t pointer_per_server; /\* pointers per server proportional to + * weight *\/ */ + /* uint32_t pointer_per_hash; /\* pointers per hash *\/ */ + /* uint32_t pointer_counter; /\* # pointers on continuum *\/ */ + /* uint32_t pointer_index; /\* pointer index *\/ */ + /* uint32_t points_per_server; /\* points per server *\/ */ + /* uint32_t continuum_index; /\* continuum index *\/ */ + /* uint32_t continuum_addition; /\* extra space in the continuum *\/ */ + /* uint32_t server_index; /\* server index *\/ */ + /* uint32_t value; /\* continuum value *\/ */ + /* uint32_t total_weight; /\* total live server weight *\/ */ + /* int64_t now; /\* current timestamp in usec *\/ */ + + /* ASSERT(array_n(&pool->server) > 0); */ + + /* now = 
dn_usec_now(); */ + /* if (now < 0) { */ + /* return DN_ERROR; */ + /* } */ + + /* /\* */ + /* * Count live servers and total weight, and also update the next time to */ + /* * rebuild the distribution */ + /* *\/ */ + /* nserver = array_n(&pool->server); */ + /* nlive_server = 0; */ + /* total_weight = 0; */ + /* pool->next_rebuild = 0LL; */ + /* for (server_index = 0; server_index < nserver; server_index++) { */ + /* struct server *server = array_get(&pool->server, server_index); */ + + /* if (pool->auto_eject_hosts) { */ + /* if (server->next_retry <= now) { */ + /* server->next_retry = 0LL; */ + /* nlive_server++; */ + /* } else if (pool->next_rebuild == 0LL || */ + /* server->next_retry < pool->next_rebuild) { */ + /* pool->next_rebuild = server->next_retry; */ + /* } */ + /* } else { */ + /* nlive_server++; */ + /* } */ + + /* ASSERT(server->weight > 0); */ + + /* /\* count weight only for live servers *\/ */ + /* if (!pool->auto_eject_hosts || server->next_retry <= now) { */ + /* total_weight += server->weight; */ + /* } */ + /* } */ + + /* pool->nlive_server = nlive_server; */ + + /* if (nlive_server == 0) { */ + /* log_debug(LOG_DEBUG, "no live servers for pool %"PRIu32" '%.*s'", */ + /* pool->idx, pool->name.len, pool->name.data); */ + + /* return DN_OK; */ + /* } */ + /* log_debug(LOG_DEBUG, "%"PRIu32" of %"PRIu32" servers are live for pool " */ + /* "%"PRIu32" '%.*s'", nlive_server, nserver, pool->idx, */ + /* pool->name.len, pool->name.data); */ + + /* continuum_addition = KETAMA_CONTINUUM_ADDITION; */ + /* points_per_server = KETAMA_POINTS_PER_SERVER; */ + /* /\* */ + /* * Allocate the continuum for the pool, the first time, and every time we + */ + /* * add a new server to the pool */ + /* *\/ */ + /* if (nlive_server > pool->nserver_continuum) { */ + /* struct continuum *continuum; */ + /* uint32_t nserver_continuum = nlive_server + continuum_addition; */ + /* uint32_t ncontinuum = nserver_continuum * points_per_server; */ + + /* continuum = 
dn_realloc(pool->continuum, sizeof(*continuum) * + * ncontinuum); */ + /* if (continuum == NULL) { */ + /* return DN_ENOMEM; */ + /* } */ + + /* pool->continuum = continuum; */ + /* pool->nserver_continuum = nserver_continuum; */ + /* /\* pool->ncontinuum is initialized later as it could be <= ncontinuum + * *\/ */ + /* } */ + + /* /\* */ + /* * Build a continuum with the servers that are live and points from */ + /* * these servers that are proportial to their weight */ + /* *\/ */ + /* continuum_index = 0; */ + /* pointer_counter = 0; */ + /* for (server_index = 0; server_index < nserver; server_index++) { */ + /* struct server *server; */ + /* float pct; */ + + /* server = array_get(&pool->server, server_index); */ + + /* if (pool->auto_eject_hosts && server->next_retry > now) { */ + /* continue; */ + /* } */ + + /* pct = (float)server->weight / (float)total_weight; */ + /* pointer_per_server = (uint32_t) ((floorf((float) (pct * + * KETAMA_POINTS_PER_SERVER / 4 * (float)nlive_server + 0.0000000001))) * 4); + */ + /* pointer_per_hash = 4; */ + + /* log_debug(LOG_VERB, "%.*s:%"PRIu16" weight %"PRIu32" of %"PRIu32" " */ + /* "pct %0.5f points per server %"PRIu32"", */ + /* server->name.len, server->name.data, server->port, */ + /* server->weight, total_weight, pct, pointer_per_server); */ + + /* for (pointer_index = 1; */ + /* pointer_index <= pointer_per_server / pointer_per_hash; */ + /* pointer_index++) { */ + + /* char host[KETAMA_MAX_HOSTLEN]= ""; */ + /* size_t hostlen; */ + /* uint32_t x; */ + + /* hostlen = snprintf(host, KETAMA_MAX_HOSTLEN, "%.*s-%u", */ + /* server->name.len, server->name.data, */ + /* pointer_index - 1); */ + + /* for (x = 0; x < pointer_per_hash; x++) { */ + /* value = ketama_hash(host, hostlen, x); */ + /* pool->continuum[continuum_index].index = server_index; */ + /* pool->continuum[continuum_index++].value = value; */ + /* } */ + /* } */ + /* pointer_counter += pointer_per_server; */ + /* } */ + + /* pool->ncontinuum = 
pointer_counter; */ + /* qsort(pool->continuum, pool->ncontinuum, sizeof(*pool->continuum), */ + /* ketama_item_cmp); */ + + /* for (pointer_index = 0; */ + /* pointer_index < ((nlive_server * KETAMA_POINTS_PER_SERVER) - 1); */ + /* pointer_index++) { */ + /* if (pointer_index + 1 >= pointer_counter) { */ + /* break; */ + /* } */ + /* ASSERT(pool->continuum[pointer_index].value <= */ + /* pool->continuum[pointer_index + 1].value); */ + /* } */ + + /* log_debug(LOG_VERB, "updated pool %"PRIu32" '%.*s' with %"PRIu32" of " */ + /* "%"PRIu32" servers live in %"PRIu32" slots and %"PRIu32" " */ + /* "active points in %"PRIu32" slots", pool->idx, */ + /* pool->name.len, pool->name.data, nlive_server, nserver, */ + /* pool->nserver_continuum, pool->ncontinuum, */ + /* (pool->nserver_continuum + continuum_addition) * + * points_per_server); */ + + return DN_OK; } -uint32_t -ketama_dispatch(struct continuum *continuum, uint32_t ncontinuum, uint32_t hash) -{ - struct continuum *begin, *end, *left, *right, *middle; +uint32_t ketama_dispatch(struct continuum *continuum, uint32_t ncontinuum, + uint32_t hash) { + struct continuum *begin, *end, *left, *right, *middle; - ASSERT(continuum != NULL); - ASSERT(ncontinuum != 0); + ASSERT(continuum != NULL); + ASSERT(ncontinuum != 0); - begin = left = continuum; - end = right = continuum + ncontinuum; + begin = left = continuum; + end = right = continuum + ncontinuum; - while (left < right) { - middle = left + (right - left) / 2; - if (middle->value < hash) { - left = middle + 1; - } else { - right = middle; - } + while (left < right) { + middle = left + (right - left) / 2; + if (middle->value < hash) { + left = middle + 1; + } else { + right = middle; } + } - if (right == end) { - right = begin; - } + if (right == end) { + right = begin; + } - return right->index; + return right->index; } diff --git a/src/hashkit/dyn_md5.c b/src/hashkit/dyn_md5.c index 649c7161e..265236476 100644 --- a/src/hashkit/dyn_md5.c +++ b/src/hashkit/dyn_md5.c 
@@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -20,16 +20,17 @@ * limitations under the License. */ -#include -#include +#include "dyn_token.h" /* * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc. * MD5 Message-Digest Algorithm (RFC 1321). * - * Homepage: http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5 + * Homepage: + * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5 * - * Author: Alexander Peslyak, better known as Solar Designer + * Author: Alexander Peslyak, better known as Solar Designer */ #include @@ -37,10 +38,10 @@ typedef unsigned int MD5_u32plus; typedef struct { - MD5_u32plus lo, hi; - MD5_u32plus a, b, c, d; - unsigned char buffer[64]; - MD5_u32plus block[16]; + MD5_u32plus lo, hi; + MD5_u32plus a, b, c, d; + unsigned char buffer[64]; + MD5_u32plus block[16]; } MD5_CTX; /* @@ -50,18 +51,18 @@ typedef struct { * architectures that lack an AND-NOT instruction, just like in Colin Plumb's * implementation. */ -#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) -#define G(x, y, z) ((y) ^ ((z) & ((x) ^ (y)))) -#define H(x, y, z) ((x) ^ (y) ^ (z)) -#define I(x, y, z) ((y) ^ ((x) | ~(z))) +#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define G(x, y, z) ((y) ^ ((z) & ((x) ^ (y)))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | ~(z))) /* * The MD5 transformation for all four rounds. 
*/ -#define STEP(f, a, b, c, d, x, t, s) \ - (a) += f((b), (c), (d)) + (x) + (t); \ - (a) = (((a) << (s)) | (((a) & 0xffffffff) >> (32 - (s)))); \ - (a) += (b); +#define STEP(f, a, b, c, d, x, t, s) \ + (a) += f((b), (c), (d)) + (x) + (t); \ + (a) = (((a) << (s)) | (((a)&0xffffffff) >> (32 - (s)))); \ + (a) += (b); /* * SET reads 4 input bytes in little-endian byte order and stores them @@ -72,261 +73,246 @@ typedef struct { * doesn't work. */ #if defined(__i386__) || defined(__x86_64__) || defined(__vax__) -#define SET(n) \ - (*(MD5_u32plus *)&ptr[(n) * 4]) -#define GET(n) \ - SET(n) +#define SET(n) (*(MD5_u32plus *)&ptr[(n)*4]) +#define GET(n) SET(n) #else -#define SET(n) \ - (ctx->block[(n)] = \ - (MD5_u32plus)ptr[(n) * 4] | \ - ((MD5_u32plus)ptr[(n) * 4 + 1] << 8) | \ - ((MD5_u32plus)ptr[(n) * 4 + 2] << 16) | \ - ((MD5_u32plus)ptr[(n) * 4 + 3] << 24)) -#define GET(n) \ - (ctx->block[(n)]) +#define SET(n) \ + (ctx->block[(n)] = (MD5_u32plus)ptr[(n)*4] | \ + ((MD5_u32plus)ptr[(n)*4 + 1] << 8) | \ + ((MD5_u32plus)ptr[(n)*4 + 2] << 16) | \ + ((MD5_u32plus)ptr[(n)*4 + 3] << 24)) +#define GET(n) (ctx->block[(n)]) #endif /* * This processes one or more 64-byte data blocks, but does NOT update * the bit counters. There are no alignment requirements. 
*/ -static void * -body(MD5_CTX *ctx, void *data, unsigned long size) -{ - unsigned char *ptr; - MD5_u32plus a, b, c, d; - MD5_u32plus saved_a, saved_b, saved_c, saved_d; - - ptr = data; - - a = ctx->a; - b = ctx->b; - c = ctx->c; - d = ctx->d; - - do { - saved_a = a; - saved_b = b; - saved_c = c; - saved_d = d; - - /* Round 1 */ - STEP(F, a, b, c, d, SET(0), 0xd76aa478, 7) - STEP(F, d, a, b, c, SET(1), 0xe8c7b756, 12) - STEP(F, c, d, a, b, SET(2), 0x242070db, 17) - STEP(F, b, c, d, a, SET(3), 0xc1bdceee, 22) - STEP(F, a, b, c, d, SET(4), 0xf57c0faf, 7) - STEP(F, d, a, b, c, SET(5), 0x4787c62a, 12) - STEP(F, c, d, a, b, SET(6), 0xa8304613, 17) - STEP(F, b, c, d, a, SET(7), 0xfd469501, 22) - STEP(F, a, b, c, d, SET(8), 0x698098d8, 7) - STEP(F, d, a, b, c, SET(9), 0x8b44f7af, 12) - STEP(F, c, d, a, b, SET(10), 0xffff5bb1, 17) - STEP(F, b, c, d, a, SET(11), 0x895cd7be, 22) - STEP(F, a, b, c, d, SET(12), 0x6b901122, 7) - STEP(F, d, a, b, c, SET(13), 0xfd987193, 12) - STEP(F, c, d, a, b, SET(14), 0xa679438e, 17) - STEP(F, b, c, d, a, SET(15), 0x49b40821, 22) - - /* Round 2 */ - STEP(G, a, b, c, d, GET(1), 0xf61e2562, 5) - STEP(G, d, a, b, c, GET(6), 0xc040b340, 9) - STEP(G, c, d, a, b, GET(11), 0x265e5a51, 14) - STEP(G, b, c, d, a, GET(0), 0xe9b6c7aa, 20) - STEP(G, a, b, c, d, GET(5), 0xd62f105d, 5) - STEP(G, d, a, b, c, GET(10), 0x02441453, 9) - STEP(G, c, d, a, b, GET(15), 0xd8a1e681, 14) - STEP(G, b, c, d, a, GET(4), 0xe7d3fbc8, 20) - STEP(G, a, b, c, d, GET(9), 0x21e1cde6, 5) - STEP(G, d, a, b, c, GET(14), 0xc33707d6, 9) - STEP(G, c, d, a, b, GET(3), 0xf4d50d87, 14) - STEP(G, b, c, d, a, GET(8), 0x455a14ed, 20) - STEP(G, a, b, c, d, GET(13), 0xa9e3e905, 5) - STEP(G, d, a, b, c, GET(2), 0xfcefa3f8, 9) - STEP(G, c, d, a, b, GET(7), 0x676f02d9, 14) - STEP(G, b, c, d, a, GET(12), 0x8d2a4c8a, 20) - - /* Round 3 */ - STEP(H, a, b, c, d, GET(5), 0xfffa3942, 4) - STEP(H, d, a, b, c, GET(8), 0x8771f681, 11) - STEP(H, c, d, a, b, GET(11), 0x6d9d6122, 16) - STEP(H, b, c, d, a, 
GET(14), 0xfde5380c, 23) - STEP(H, a, b, c, d, GET(1), 0xa4beea44, 4) - STEP(H, d, a, b, c, GET(4), 0x4bdecfa9, 11) - STEP(H, c, d, a, b, GET(7), 0xf6bb4b60, 16) - STEP(H, b, c, d, a, GET(10), 0xbebfbc70, 23) - STEP(H, a, b, c, d, GET(13), 0x289b7ec6, 4) - STEP(H, d, a, b, c, GET(0), 0xeaa127fa, 11) - STEP(H, c, d, a, b, GET(3), 0xd4ef3085, 16) - STEP(H, b, c, d, a, GET(6), 0x04881d05, 23) - STEP(H, a, b, c, d, GET(9), 0xd9d4d039, 4) - STEP(H, d, a, b, c, GET(12), 0xe6db99e5, 11) - STEP(H, c, d, a, b, GET(15), 0x1fa27cf8, 16) - STEP(H, b, c, d, a, GET(2), 0xc4ac5665, 23) - - /* Round 4 */ - STEP(I, a, b, c, d, GET(0), 0xf4292244, 6) - STEP(I, d, a, b, c, GET(7), 0x432aff97, 10) - STEP(I, c, d, a, b, GET(14), 0xab9423a7, 15) - STEP(I, b, c, d, a, GET(5), 0xfc93a039, 21) - STEP(I, a, b, c, d, GET(12), 0x655b59c3, 6) - STEP(I, d, a, b, c, GET(3), 0x8f0ccc92, 10) - STEP(I, c, d, a, b, GET(10), 0xffeff47d, 15) - STEP(I, b, c, d, a, GET(1), 0x85845dd1, 21) - STEP(I, a, b, c, d, GET(8), 0x6fa87e4f, 6) - STEP(I, d, a, b, c, GET(15), 0xfe2ce6e0, 10) - STEP(I, c, d, a, b, GET(6), 0xa3014314, 15) - STEP(I, b, c, d, a, GET(13), 0x4e0811a1, 21) - STEP(I, a, b, c, d, GET(4), 0xf7537e82, 6) - STEP(I, d, a, b, c, GET(11), 0xbd3af235, 10) - STEP(I, c, d, a, b, GET(2), 0x2ad7d2bb, 15) - STEP(I, b, c, d, a, GET(9), 0xeb86d391, 21) - - a += saved_a; - b += saved_b; - c += saved_c; - d += saved_d; - - ptr += 64; - } while (size -= 64); - - ctx->a = a; - ctx->b = b; - ctx->c = c; - ctx->d = d; - - return ptr; +static void *body(MD5_CTX *ctx, void *data, unsigned long size) { + unsigned char *ptr; + MD5_u32plus a, b, c, d; + MD5_u32plus saved_a, saved_b, saved_c, saved_d; + + ptr = data; + + a = ctx->a; + b = ctx->b; + c = ctx->c; + d = ctx->d; + + do { + saved_a = a; + saved_b = b; + saved_c = c; + saved_d = d; + + /* Round 1 */ + STEP(F, a, b, c, d, SET(0), 0xd76aa478, 7) + STEP(F, d, a, b, c, SET(1), 0xe8c7b756, 12) + STEP(F, c, d, a, b, SET(2), 0x242070db, 17) + STEP(F, b, c, d, a, 
SET(3), 0xc1bdceee, 22) + STEP(F, a, b, c, d, SET(4), 0xf57c0faf, 7) + STEP(F, d, a, b, c, SET(5), 0x4787c62a, 12) + STEP(F, c, d, a, b, SET(6), 0xa8304613, 17) + STEP(F, b, c, d, a, SET(7), 0xfd469501, 22) + STEP(F, a, b, c, d, SET(8), 0x698098d8, 7) + STEP(F, d, a, b, c, SET(9), 0x8b44f7af, 12) + STEP(F, c, d, a, b, SET(10), 0xffff5bb1, 17) + STEP(F, b, c, d, a, SET(11), 0x895cd7be, 22) + STEP(F, a, b, c, d, SET(12), 0x6b901122, 7) + STEP(F, d, a, b, c, SET(13), 0xfd987193, 12) + STEP(F, c, d, a, b, SET(14), 0xa679438e, 17) + STEP(F, b, c, d, a, SET(15), 0x49b40821, 22) + + /* Round 2 */ + STEP(G, a, b, c, d, GET(1), 0xf61e2562, 5) + STEP(G, d, a, b, c, GET(6), 0xc040b340, 9) + STEP(G, c, d, a, b, GET(11), 0x265e5a51, 14) + STEP(G, b, c, d, a, GET(0), 0xe9b6c7aa, 20) + STEP(G, a, b, c, d, GET(5), 0xd62f105d, 5) + STEP(G, d, a, b, c, GET(10), 0x02441453, 9) + STEP(G, c, d, a, b, GET(15), 0xd8a1e681, 14) + STEP(G, b, c, d, a, GET(4), 0xe7d3fbc8, 20) + STEP(G, a, b, c, d, GET(9), 0x21e1cde6, 5) + STEP(G, d, a, b, c, GET(14), 0xc33707d6, 9) + STEP(G, c, d, a, b, GET(3), 0xf4d50d87, 14) + STEP(G, b, c, d, a, GET(8), 0x455a14ed, 20) + STEP(G, a, b, c, d, GET(13), 0xa9e3e905, 5) + STEP(G, d, a, b, c, GET(2), 0xfcefa3f8, 9) + STEP(G, c, d, a, b, GET(7), 0x676f02d9, 14) + STEP(G, b, c, d, a, GET(12), 0x8d2a4c8a, 20) + + /* Round 3 */ + STEP(H, a, b, c, d, GET(5), 0xfffa3942, 4) + STEP(H, d, a, b, c, GET(8), 0x8771f681, 11) + STEP(H, c, d, a, b, GET(11), 0x6d9d6122, 16) + STEP(H, b, c, d, a, GET(14), 0xfde5380c, 23) + STEP(H, a, b, c, d, GET(1), 0xa4beea44, 4) + STEP(H, d, a, b, c, GET(4), 0x4bdecfa9, 11) + STEP(H, c, d, a, b, GET(7), 0xf6bb4b60, 16) + STEP(H, b, c, d, a, GET(10), 0xbebfbc70, 23) + STEP(H, a, b, c, d, GET(13), 0x289b7ec6, 4) + STEP(H, d, a, b, c, GET(0), 0xeaa127fa, 11) + STEP(H, c, d, a, b, GET(3), 0xd4ef3085, 16) + STEP(H, b, c, d, a, GET(6), 0x04881d05, 23) + STEP(H, a, b, c, d, GET(9), 0xd9d4d039, 4) + STEP(H, d, a, b, c, GET(12), 0xe6db99e5, 11) + 
STEP(H, c, d, a, b, GET(15), 0x1fa27cf8, 16) + STEP(H, b, c, d, a, GET(2), 0xc4ac5665, 23) + + /* Round 4 */ + STEP(I, a, b, c, d, GET(0), 0xf4292244, 6) + STEP(I, d, a, b, c, GET(7), 0x432aff97, 10) + STEP(I, c, d, a, b, GET(14), 0xab9423a7, 15) + STEP(I, b, c, d, a, GET(5), 0xfc93a039, 21) + STEP(I, a, b, c, d, GET(12), 0x655b59c3, 6) + STEP(I, d, a, b, c, GET(3), 0x8f0ccc92, 10) + STEP(I, c, d, a, b, GET(10), 0xffeff47d, 15) + STEP(I, b, c, d, a, GET(1), 0x85845dd1, 21) + STEP(I, a, b, c, d, GET(8), 0x6fa87e4f, 6) + STEP(I, d, a, b, c, GET(15), 0xfe2ce6e0, 10) + STEP(I, c, d, a, b, GET(6), 0xa3014314, 15) + STEP(I, b, c, d, a, GET(13), 0x4e0811a1, 21) + STEP(I, a, b, c, d, GET(4), 0xf7537e82, 6) + STEP(I, d, a, b, c, GET(11), 0xbd3af235, 10) + STEP(I, c, d, a, b, GET(2), 0x2ad7d2bb, 15) + STEP(I, b, c, d, a, GET(9), 0xeb86d391, 21) + + a += saved_a; + b += saved_b; + c += saved_c; + d += saved_d; + + ptr += 64; + } while (size -= 64); + + ctx->a = a; + ctx->b = b; + ctx->c = c; + ctx->d = d; + + return ptr; } -void -MD5_Init(MD5_CTX *ctx) -{ - ctx->a = 0x67452301; - ctx->b = 0xefcdab89; - ctx->c = 0x98badcfe; - ctx->d = 0x10325476; +void MD5_Init(MD5_CTX *ctx) { + ctx->a = 0x67452301; + ctx->b = 0xefcdab89; + ctx->c = 0x98badcfe; + ctx->d = 0x10325476; - ctx->lo = 0; - ctx->hi = 0; + ctx->lo = 0; + ctx->hi = 0; } -void -MD5_Update(MD5_CTX *ctx, void *data, unsigned long size) -{ - MD5_u32plus saved_lo; - unsigned long used, free; - - saved_lo = ctx->lo; - if ((ctx->lo = (saved_lo + size) & 0x1fffffff) < saved_lo) { - ctx->hi++; - } - ctx->hi += size >> 29; - - used = saved_lo & 0x3f; +void MD5_Update(MD5_CTX *ctx, void *data, unsigned long size) { + MD5_u32plus saved_lo; + unsigned long used, free; - if (used) { - free = 64 - used; + saved_lo = ctx->lo; + if ((ctx->lo = (saved_lo + size) & 0x1fffffff) < saved_lo) { + ctx->hi++; + } + ctx->hi += size >> 29; - if (size < free) { - memcpy(&ctx->buffer[used], data, size); - return; - } + used = saved_lo & 0x3f; - 
memcpy(&ctx->buffer[used], data, free); - data = (unsigned char *)data + free; - size -= free; - body(ctx, ctx->buffer, 64); - } + if (used) { + free = 64 - used; - if (size >= 64) { - data = body(ctx, data, size & ~(unsigned long)0x3f); - size &= 0x3f; + if (size < free) { + memcpy(&ctx->buffer[used], data, size); + return; } - memcpy(ctx->buffer, data, size); -} - -void -MD5_Final(unsigned char *result, MD5_CTX *ctx) -{ - unsigned long used, free; + memcpy(&ctx->buffer[used], data, free); + data = (unsigned char *)data + free; + size -= free; + body(ctx, ctx->buffer, 64); + } - used = ctx->lo & 0x3f; + if (size >= 64) { + data = body(ctx, data, size & ~(unsigned long)0x3f); + size &= 0x3f; + } - ctx->buffer[used++] = 0x80; + memcpy(ctx->buffer, data, size); +} - free = 64 - used; +void MD5_Final(unsigned char *result, MD5_CTX *ctx) { + unsigned long used, free; - if (free < 8) { - memset(&ctx->buffer[used], 0, free); - body(ctx, ctx->buffer, 64); - used = 0; - free = 64; - } + used = ctx->lo & 0x3f; - memset(&ctx->buffer[used], 0, free - 8); + ctx->buffer[used++] = 0x80; - ctx->lo <<= 3; - ctx->buffer[56] = ctx->lo; - ctx->buffer[57] = ctx->lo >> 8; - ctx->buffer[58] = ctx->lo >> 16; - ctx->buffer[59] = ctx->lo >> 24; - ctx->buffer[60] = ctx->hi; - ctx->buffer[61] = ctx->hi >> 8; - ctx->buffer[62] = ctx->hi >> 16; - ctx->buffer[63] = ctx->hi >> 24; + free = 64 - used; + if (free < 8) { + memset(&ctx->buffer[used], 0, free); body(ctx, ctx->buffer, 64); - - result[0] = ctx->a; - result[1] = ctx->a >> 8; - result[2] = ctx->a >> 16; - result[3] = ctx->a >> 24; - result[4] = ctx->b; - result[5] = ctx->b >> 8; - result[6] = ctx->b >> 16; - result[7] = ctx->b >> 24; - result[8] = ctx->c; - result[9] = ctx->c >> 8; - result[10] = ctx->c >> 16; - result[11] = ctx->c >> 24; - result[12] = ctx->d; - result[13] = ctx->d >> 8; - result[14] = ctx->d >> 16; - result[15] = ctx->d >> 24; - - memset(ctx, 0, sizeof(*ctx)); + used = 0; + free = 64; + } + + memset(&ctx->buffer[used], 
0, free - 8); + + ctx->lo <<= 3; + ctx->buffer[56] = ctx->lo; + ctx->buffer[57] = ctx->lo >> 8; + ctx->buffer[58] = ctx->lo >> 16; + ctx->buffer[59] = ctx->lo >> 24; + ctx->buffer[60] = ctx->hi; + ctx->buffer[61] = ctx->hi >> 8; + ctx->buffer[62] = ctx->hi >> 16; + ctx->buffer[63] = ctx->hi >> 24; + + body(ctx, ctx->buffer, 64); + + result[0] = ctx->a; + result[1] = ctx->a >> 8; + result[2] = ctx->a >> 16; + result[3] = ctx->a >> 24; + result[4] = ctx->b; + result[5] = ctx->b >> 8; + result[6] = ctx->b >> 16; + result[7] = ctx->b >> 24; + result[8] = ctx->c; + result[9] = ctx->c >> 8; + result[10] = ctx->c >> 16; + result[11] = ctx->c >> 24; + result[12] = ctx->d; + result[13] = ctx->d >> 8; + result[14] = ctx->d >> 16; + result[15] = ctx->d >> 24; + + memset(ctx, 0, sizeof(*ctx)); } /* * Just a simple method for getting the signature * result must be == 16 */ -void -md5_signature(unsigned char *key, unsigned long length, unsigned char *result) -{ - MD5_CTX my_md5; - - MD5_Init(&my_md5); - (void)MD5_Update(&my_md5, key, length); - MD5_Final(result, &my_md5); +void md5_signature(unsigned char *key, unsigned long length, + unsigned char *result) { + MD5_CTX my_md5; + + MD5_Init(&my_md5); + (void)MD5_Update(&my_md5, key, length); + MD5_Final(result, &my_md5); } -rstatus_t -hash_md5(const unsigned char *key, size_t key_length, struct dyn_token *token) -{ - unsigned char results[16]; +rstatus_t hash_md5(const unsigned char *key, size_t key_length, + struct dyn_token *token) { + unsigned char results[16]; + + md5_signature((unsigned char *)key, (unsigned long)key_length, results); - md5_signature((unsigned char*)key, (unsigned long)key_length, results); + uint32_t val = ((uint32_t)(results[3] & 0xFF) << 24) | + ((uint32_t)(results[2] & 0xFF) << 16) | + ((uint32_t)(results[1] & 0xFF) << 8) | (results[0] & 0xFF); - uint32_t val = ((uint32_t) (results[3] & 0xFF) << 24) | - ((uint32_t) (results[2] & 0xFF) << 16) | - ((uint32_t) (results[1] & 0xFF) << 8) | - (results[0] & 
0xFF); + size_dyn_token(token, 1); + set_int_dyn_token(token, val); - size_dyn_token(token, 1); - set_int_dyn_token(token, val); - - return DN_OK; + return DN_OK; } diff --git a/src/hashkit/dyn_modula.c b/src/hashkit/dyn_modula.c index 1a207c7f1..a7d36fcec 100644 --- a/src/hashkit/dyn_modula.c +++ b/src/hashkit/dyn_modula.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -24,138 +24,142 @@ #include #include -#include #include +#include -#define MODULA_CONTINUUM_ADDITION 10 /* # extra slots to build into continuum */ -#define MODULA_POINTS_PER_SERVER 1 - -rstatus_t -modula_update(struct server_pool *pool) -{ - /* uint32_t nserver; /\* # server - live and dead *\/ */ - /* uint32_t nlive_server; /\* # live server *\/ */ - /* uint32_t pointer_per_server; /\* pointers per server proportional to weight *\/ */ - /* uint32_t pointer_counter; /\* # pointers on continuum *\/ */ - /* uint32_t points_per_server; /\* points per server *\/ */ - /* uint32_t continuum_index; /\* continuum index *\/ */ - /* uint32_t continuum_addition; /\* extra space in the continuum *\/ */ - /* uint32_t server_index; /\* server index *\/ */ - /* uint32_t weight_index; /\* weight index *\/ */ - /* uint32_t total_weight; /\* total live server weight *\/ */ - /* int64_t now; /\* current timestamp in usec *\/ */ - - /* now = dn_usec_now(); */ - /* if (now < 0) { */ - /* return DN_ERROR; */ - /* } */ - - /* nserver = array_n(&pool->server); */ - /* nlive_server = 0; */ - /* total_weight = 0; */ - /* pool->next_rebuild = 0LL; */ - - /* for (server_index = 0; server_index < nserver; server_index++) { */ - /* struct server *server = array_get(&pool->server, server_index); */ - - /* if 
(pool->auto_eject_hosts) { */ - /* if (server->next_retry <= now) { */ - /* server->next_retry = 0LL; */ - /* nlive_server++; */ - /* } else if (pool->next_rebuild == 0LL || */ - /* server->next_retry < pool->next_rebuild) { */ - /* pool->next_rebuild = server->next_retry; */ - /* } */ - /* } else { */ - /* nlive_server++; */ - /* } */ - - /* ASSERT(server->weight > 0); */ - - /* /\* count weight only for live servers *\/ */ - /* if (!pool->auto_eject_hosts || server->next_retry <= now) { */ - /* total_weight += server->weight; */ - /* } */ - /* } */ - - /* pool->nlive_server = nlive_server; */ - - /* if (nlive_server == 0) { */ - /* ASSERT(pool->continuum != NULL); */ - /* ASSERT(pool->ncontinuum != 0); */ - - /* log_debug(LOG_DEBUG, "no live servers for pool %"PRIu32" '%.*s'", */ - /* pool->idx, pool->name.len, pool->name.data); */ - - /* return DN_OK; */ - /* } */ - /* log_debug(LOG_DEBUG, "%"PRIu32" of %"PRIu32" servers are live for pool " */ - /* "%"PRIu32" '%.*s'", nlive_server, nserver, pool->idx, */ - /* pool->name.len, pool->name.data); */ - - /* continuum_addition = MODULA_CONTINUUM_ADDITION; */ - /* points_per_server = MODULA_POINTS_PER_SERVER; */ - - /* /\* */ - /* * Allocate the continuum for the pool, the first time, and every time we */ - /* * add a new server to the pool */ - /* *\/ */ - /* if (total_weight > pool->nserver_continuum) { */ - /* struct continuum *continuum; */ - /* uint32_t nserver_continuum = total_weight + MODULA_CONTINUUM_ADDITION; */ - /* uint32_t ncontinuum = nserver_continuum * MODULA_POINTS_PER_SERVER; */ - - /* continuum = dn_realloc(pool->continuum, sizeof(*continuum) * ncontinuum); */ - /* if (continuum == NULL) { */ - /* return DN_ENOMEM; */ - /* } */ - - /* pool->continuum = continuum; */ - /* pool->nserver_continuum = nserver_continuum; */ - /* /\* pool->ncontinuum is initialized later as it could be <= ncontinuum *\/ */ - /* } */ - - /* /\* update the continuum with the servers that are live *\/ */ - /* continuum_index = 
0; */ - /* pointer_counter = 0; */ - /* for (server_index = 0; server_index < nserver; server_index++) { */ - /* struct server *server = array_get(&pool->server, server_index); */ - - /* if (pool->auto_eject_hosts && server->next_retry > now) { */ - /* continue; */ - /* } */ - - /* for (weight_index = 0; weight_index < server->weight; weight_index++) { */ - /* pointer_per_server = 1; */ - - /* pool->continuum[continuum_index].index = server_index; */ - /* pool->continuum[continuum_index++].value = 0; */ - - /* pointer_counter += pointer_per_server; */ - /* } */ - /* } */ - /* pool->ncontinuum = pointer_counter; */ - - /* log_debug(LOG_VERB, "updated pool %"PRIu32" '%.*s' with %"PRIu32" of " */ - /* "%"PRIu32" servers live in %"PRIu32" slots and %"PRIu32" " */ - /* "active points in %"PRIu32" slots", pool->idx, */ - /* pool->name.len, pool->name.data, nlive_server, nserver, */ - /* pool->nserver_continuum, pool->ncontinuum, */ - /* (pool->nserver_continuum + continuum_addition) * points_per_server); */ - - return DN_OK; - +#define MODULA_CONTINUUM_ADDITION 10 /* # extra slots to build into continuum \ + */ +#define MODULA_POINTS_PER_SERVER 1 + +rstatus_t modula_update(struct server_pool *pool) { + /* uint32_t nserver; /\* # server - live and dead *\/ */ + /* uint32_t nlive_server; /\* # live server *\/ */ + /* uint32_t pointer_per_server; /\* pointers per server proportional to + * weight *\/ */ + /* uint32_t pointer_counter; /\* # pointers on continuum *\/ */ + /* uint32_t points_per_server; /\* points per server *\/ */ + /* uint32_t continuum_index; /\* continuum index *\/ */ + /* uint32_t continuum_addition; /\* extra space in the continuum *\/ */ + /* uint32_t server_index; /\* server index *\/ */ + /* uint32_t weight_index; /\* weight index *\/ */ + /* uint32_t total_weight; /\* total live server weight *\/ */ + /* int64_t now; /\* current timestamp in usec *\/ */ + + /* now = dn_usec_now(); */ + /* if (now < 0) { */ + /* return DN_ERROR; */ + /* } */ + + /* 
nserver = array_n(&pool->server); */ + /* nlive_server = 0; */ + /* total_weight = 0; */ + /* pool->next_rebuild = 0LL; */ + + /* for (server_index = 0; server_index < nserver; server_index++) { */ + /* struct server *server = array_get(&pool->server, server_index); */ + + /* if (pool->auto_eject_hosts) { */ + /* if (server->next_retry <= now) { */ + /* server->next_retry = 0LL; */ + /* nlive_server++; */ + /* } else if (pool->next_rebuild == 0LL || */ + /* server->next_retry < pool->next_rebuild) { */ + /* pool->next_rebuild = server->next_retry; */ + /* } */ + /* } else { */ + /* nlive_server++; */ + /* } */ + + /* ASSERT(server->weight > 0); */ + + /* /\* count weight only for live servers *\/ */ + /* if (!pool->auto_eject_hosts || server->next_retry <= now) { */ + /* total_weight += server->weight; */ + /* } */ + /* } */ + + /* pool->nlive_server = nlive_server; */ + + /* if (nlive_server == 0) { */ + /* ASSERT(pool->continuum != NULL); */ + /* ASSERT(pool->ncontinuum != 0); */ + + /* log_debug(LOG_DEBUG, "no live servers for pool %"PRIu32" '%.*s'", */ + /* pool->idx, pool->name.len, pool->name.data); */ + + /* return DN_OK; */ + /* } */ + /* log_debug(LOG_DEBUG, "%"PRIu32" of %"PRIu32" servers are live for pool " */ + /* "%"PRIu32" '%.*s'", nlive_server, nserver, pool->idx, */ + /* pool->name.len, pool->name.data); */ + + /* continuum_addition = MODULA_CONTINUUM_ADDITION; */ + /* points_per_server = MODULA_POINTS_PER_SERVER; */ + + /* /\* */ + /* * Allocate the continuum for the pool, the first time, and every time we + */ + /* * add a new server to the pool */ + /* *\/ */ + /* if (total_weight > pool->nserver_continuum) { */ + /* struct continuum *continuum; */ + /* uint32_t nserver_continuum = total_weight + MODULA_CONTINUUM_ADDITION; + */ + /* uint32_t ncontinuum = nserver_continuum * MODULA_POINTS_PER_SERVER; */ + + /* continuum = dn_realloc(pool->continuum, sizeof(*continuum) * + * ncontinuum); */ + /* if (continuum == NULL) { */ + /* return DN_ENOMEM; */ 
+ /* } */ + + /* pool->continuum = continuum; */ + /* pool->nserver_continuum = nserver_continuum; */ + /* /\* pool->ncontinuum is initialized later as it could be <= ncontinuum + * *\/ */ + /* } */ + + /* /\* update the continuum with the servers that are live *\/ */ + /* continuum_index = 0; */ + /* pointer_counter = 0; */ + /* for (server_index = 0; server_index < nserver; server_index++) { */ + /* struct server *server = array_get(&pool->server, server_index); */ + + /* if (pool->auto_eject_hosts && server->next_retry > now) { */ + /* continue; */ + /* } */ + + /* for (weight_index = 0; weight_index < server->weight; weight_index++) { + */ + /* pointer_per_server = 1; */ + + /* pool->continuum[continuum_index].index = server_index; */ + /* pool->continuum[continuum_index++].value = 0; */ + + /* pointer_counter += pointer_per_server; */ + /* } */ + /* } */ + /* pool->ncontinuum = pointer_counter; */ + + /* log_debug(LOG_VERB, "updated pool %"PRIu32" '%.*s' with %"PRIu32" of " */ + /* "%"PRIu32" servers live in %"PRIu32" slots and %"PRIu32" " */ + /* "active points in %"PRIu32" slots", pool->idx, */ + /* pool->name.len, pool->name.data, nlive_server, nserver, */ + /* pool->nserver_continuum, pool->ncontinuum, */ + /* (pool->nserver_continuum + continuum_addition) * + * points_per_server); */ + + return DN_OK; } -uint32_t -modula_dispatch(struct continuum *continuum, uint32_t ncontinuum, uint32_t hash) -{ - struct continuum *c; +uint32_t modula_dispatch(struct continuum *continuum, uint32_t ncontinuum, + uint32_t hash) { + struct continuum *c; - ASSERT(continuum != NULL); - ASSERT(ncontinuum != 0); + ASSERT(continuum != NULL); + ASSERT(ncontinuum != 0); - c = continuum + hash % ncontinuum; + c = continuum + hash % ncontinuum; - return c->index; + return c->index; } diff --git a/src/hashkit/dyn_murmur.c b/src/hashkit/dyn_murmur.c index ca0a5d929..c12fa8879 100644 --- a/src/hashkit/dyn_murmur.c +++ b/src/hashkit/dyn_murmur.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, 
distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -40,69 +40,67 @@ #include #include -rstatus_t -hash_murmur(const unsigned char *key, size_t length, struct dyn_token *token) -{ - /* - * 'm' and 'r' are mixing constants generated offline. They're not - * really 'magic', they just happen to work well. - */ - - const unsigned int m = 0x5bd1e995; - const uint32_t seed = (0xdeadbeef * (uint32_t)length); - const int r = 24; +rstatus_t hash_murmur(const unsigned char *key, size_t length, + struct dyn_token *token) { + /* + * 'm' and 'r' are mixing constants generated offline. They're not + * really 'magic', they just happen to work well. + */ + const unsigned int m = 0x5bd1e995; + const uint32_t seed = (0xdeadbeef * (uint32_t)length); + const int r = 24; - /* Initialize the hash to a 'random' value */ + /* Initialize the hash to a 'random' value */ - uint32_t h = seed ^ (uint32_t)length; + uint32_t h = seed ^ (uint32_t)length; - /* Mix 4 bytes at a time into the hash */ + /* Mix 4 bytes at a time into the hash */ - const unsigned char * data = (const unsigned char *)key; + const unsigned char *data = (const unsigned char *)key; - while (length >= 4) { - unsigned int k = *(unsigned int *)data; + while (length >= 4) { + unsigned int k = *(unsigned int *)data; - k *= m; - k ^= k >> r; - k *= m; + k *= m; + k ^= k >> r; + k *= m; - h *= m; - h ^= k; + h *= m; + h ^= k; - data += 4; - length -= 4; - } + data += 4; + length -= 4; + } - /* Handle the last few bytes of the input array */ + /* Handle the last few bytes of the input array */ - switch(length) { + switch (length) { case 3: - h ^= ((uint32_t)data[2]) << 16; + h ^= ((uint32_t)data[2]) << 16; case 2: - h ^= ((uint32_t)data[1]) << 8; + h ^= ((uint32_t)data[1]) << 
8; case 1: - h ^= data[0]; - h *= m; + h ^= data[0]; + h *= m; default: - break; - }; + break; + }; - /* - * Do a few final mixes of the hash to ensure the last few bytes are - * well-incorporated. - */ + /* + * Do a few final mixes of the hash to ensure the last few bytes are + * well-incorporated. + */ - h ^= h >> 13; - h *= m; - h ^= h >> 15; + h ^= h >> 13; + h *= m; + h ^= h >> 15; - size_dyn_token(token, 1); - set_int_dyn_token(token, h); + size_dyn_token(token, 1); + set_int_dyn_token(token, h); - return DN_OK; + return DN_OK; } diff --git a/src/hashkit/dyn_murmur3.c b/src/hashkit/dyn_murmur3.c index 963e0fe0e..23289feca 100644 --- a/src/hashkit/dyn_murmur3.c +++ b/src/hashkit/dyn_murmur3.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -20,21 +20,20 @@ * limitations under the License. 
*/ -#include #include +#include #include #define MURMUR3_SEED 0xc0a1e5ce -rstatus_t -hash_murmur3(const unsigned char *key, size_t length, struct dyn_token *token) -{ - rstatus_t status = size_dyn_token(token, 4); - if (status != DN_OK) { - return status; - } +rstatus_t hash_murmur3(const unsigned char *key, size_t length, + struct dyn_token *token) { + rstatus_t status = size_dyn_token(token, 4); + if (status != DN_OK) { + return status; + } -// MurmurHash3_x86_128(key, length, MURMUR3_SEED, token->mag); + // MurmurHash3_x86_128(key, length, MURMUR3_SEED, token->mag); - return DN_OK; + return DN_OK; } diff --git a/src/hashkit/dyn_one_at_a_time.c b/src/hashkit/dyn_one_at_a_time.c index d8d2d0a03..3b598bf1e 100644 --- a/src/hashkit/dyn_one_at_a_time.c +++ b/src/hashkit/dyn_one_at_a_time.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. 
@@ -37,24 +37,23 @@ #include #include -rstatus_t -hash_one_at_a_time(const unsigned char *key, size_t key_length, struct dyn_token *token) -{ - const char *ptr = key; - uint32_t value = 0; - - while (key_length--) { - uint32_t val = (uint32_t) *ptr++; - value += val; - value += (value << 10); - value ^= (value >> 6); - } - value += (value << 3); - value ^= (value >> 11); - value += (value << 15); - - size_dyn_token(token, 1); - set_int_dyn_token(token, value); - - return DN_OK; +rstatus_t hash_one_at_a_time(const unsigned char *key, size_t key_length, + struct dyn_token *token) { + const char *ptr = key; + uint32_t value = 0; + + while (key_length--) { + uint32_t val = (uint32_t)*ptr++; + value += val; + value += (value << 10); + value ^= (value >> 6); + } + value += (value << 3); + value ^= (value >> 11); + value += (value << 15); + + size_dyn_token(token, 1); + set_int_dyn_token(token, value); + + return DN_OK; } diff --git a/src/hashkit/dyn_random.c b/src/hashkit/dyn_random.c index e899035b0..0658792d7 100644 --- a/src/hashkit/dyn_random.c +++ b/src/hashkit/dyn_random.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. 
@@ -24,128 +24,131 @@ #include #include -#include #include +#include -#define RANDOM_CONTINUUM_ADDITION 10 /* # extra slots to build into continuum */ -#define RANDOM_POINTS_PER_SERVER 1 - -rstatus_t -random_update(struct server_pool *pool) -{ - /* uint32_t nserver; /\* # server - live and dead *\/ */ - /* uint32_t nlive_server; /\* # live server *\/ */ - /* uint32_t pointer_per_server; /\* pointers per server proportional to weight *\/ */ - /* uint32_t pointer_counter; /\* # pointers on continuum *\/ */ - /* uint32_t points_per_server; /\* points per server *\/ */ - /* uint32_t continuum_index; /\* continuum index *\/ */ - /* uint32_t continuum_addition; /\* extra space in the continuum *\/ */ - /* uint32_t server_index; /\* server index *\/ */ - /* int64_t now; /\* current timestamp in usec *\/ */ - - /* now = dn_usec_now(); */ - /* if (now < 0) { */ - /* return DN_ERROR; */ - /* } */ - - /* nserver = array_n(&pool->server); */ - /* nlive_server = 0; */ - /* pool->next_rebuild = 0LL; */ - - /* for (server_index = 0; server_index < nserver; server_index++) { */ - /* struct server *server = array_get(&pool->server, server_index); */ - - /* if (pool->auto_eject_hosts) { */ - /* if (server->next_retry <= now) { */ - /* server->next_retry = 0LL; */ - /* nlive_server++; */ - /* } else if (pool->next_rebuild == 0LL || */ - /* server->next_retry < pool->next_rebuild) { */ - /* pool->next_rebuild = server->next_retry; */ - /* } */ - /* } else { */ - /* nlive_server++; */ - /* } */ - /* } */ - - /* pool->nlive_server = nlive_server; */ - - /* if (nlive_server == 0) { */ - /* ASSERT(pool->continuum != NULL); */ - /* ASSERT(pool->ncontinuum != 0); */ - - /* log_debug(LOG_DEBUG, "no live servers for pool %"PRIu32" '%.*s'", */ - /* pool->idx, pool->name.len, pool->name.data); */ - - /* return DN_OK; */ - /* } */ - /* log_debug(LOG_DEBUG, "%"PRIu32" of %"PRIu32" servers are live for pool " */ - /* "%"PRIu32" '%.*s'", nlive_server, nserver, pool->idx, */ - /* pool->name.len, 
pool->name.data); */ - - /* continuum_addition = RANDOM_CONTINUUM_ADDITION; */ - /* points_per_server = RANDOM_POINTS_PER_SERVER; */ - - /* /\* */ - /* * Allocate the continuum for the pool, the first time, and every time we */ - /* * add a new server to the pool */ - /* *\/ */ - /* if (nlive_server > pool->nserver_continuum) { */ - /* struct continuum *continuum; */ - /* uint32_t nserver_continuum = nlive_server + RANDOM_CONTINUUM_ADDITION; */ - /* uint32_t ncontinuum = nserver_continuum * RANDOM_POINTS_PER_SERVER; */ - - /* continuum = dn_realloc(pool->continuum, sizeof(*continuum) * ncontinuum); */ - /* if (continuum == NULL) { */ - /* return DN_ENOMEM; */ - /* } */ - - /* srandom((uint32_t)time(NULL)); */ - - /* pool->continuum = continuum; */ - /* pool->nserver_continuum = nserver_continuum; */ - /* /\* pool->ncontinuum is initialized later as it could be <= ncontinuum *\/ */ - /* } */ - - /* /\* update the continuum with the servers that are live *\/ */ - /* continuum_index = 0; */ - /* pointer_counter = 0; */ - /* for (server_index = 0; server_index < nserver; server_index++) { */ - /* struct server *server = array_get(&pool->server, server_index); */ - - /* if (pool->auto_eject_hosts && server->next_retry > now) { */ - /* continue; */ - /* } */ - - /* pointer_per_server = 1; */ - - /* pool->continuum[continuum_index].index = server_index; */ - /* pool->continuum[continuum_index++].value = 0; */ - - /* pointer_counter += pointer_per_server; */ - /* } */ - /* pool->ncontinuum = pointer_counter; */ - - /* log_debug(LOG_VERB, "updated pool %"PRIu32" '%.*s' with %"PRIu32" of " */ - /* "%"PRIu32" servers live in %"PRIu32" slots and %"PRIu32" " */ - /* "active points in %"PRIu32" slots", pool->idx, */ - /* pool->name.len, pool->name.data, nlive_server, nserver, */ - /* pool->nserver_continuum, pool->ncontinuum, */ - /* (pool->nserver_continuum + continuum_addition) * points_per_server); */ - - return DN_OK; - +#define RANDOM_CONTINUUM_ADDITION 10 /* # extra slots 
to build into continuum \ + */ +#define RANDOM_POINTS_PER_SERVER 1 + +rstatus_t random_update(struct server_pool *pool) { + /* uint32_t nserver; /\* # server - live and dead *\/ */ + /* uint32_t nlive_server; /\* # live server *\/ */ + /* uint32_t pointer_per_server; /\* pointers per server proportional to + * weight *\/ */ + /* uint32_t pointer_counter; /\* # pointers on continuum *\/ */ + /* uint32_t points_per_server; /\* points per server *\/ */ + /* uint32_t continuum_index; /\* continuum index *\/ */ + /* uint32_t continuum_addition; /\* extra space in the continuum *\/ */ + /* uint32_t server_index; /\* server index *\/ */ + /* int64_t now; /\* current timestamp in usec *\/ */ + + /* now = dn_usec_now(); */ + /* if (now < 0) { */ + /* return DN_ERROR; */ + /* } */ + + /* nserver = array_n(&pool->server); */ + /* nlive_server = 0; */ + /* pool->next_rebuild = 0LL; */ + + /* for (server_index = 0; server_index < nserver; server_index++) { */ + /* struct server *server = array_get(&pool->server, server_index); */ + + /* if (pool->auto_eject_hosts) { */ + /* if (server->next_retry <= now) { */ + /* server->next_retry = 0LL; */ + /* nlive_server++; */ + /* } else if (pool->next_rebuild == 0LL || */ + /* server->next_retry < pool->next_rebuild) { */ + /* pool->next_rebuild = server->next_retry; */ + /* } */ + /* } else { */ + /* nlive_server++; */ + /* } */ + /* } */ + + /* pool->nlive_server = nlive_server; */ + + /* if (nlive_server == 0) { */ + /* ASSERT(pool->continuum != NULL); */ + /* ASSERT(pool->ncontinuum != 0); */ + + /* log_debug(LOG_DEBUG, "no live servers for pool %"PRIu32" '%.*s'", */ + /* pool->idx, pool->name.len, pool->name.data); */ + + /* return DN_OK; */ + /* } */ + /* log_debug(LOG_DEBUG, "%"PRIu32" of %"PRIu32" servers are live for pool " */ + /* "%"PRIu32" '%.*s'", nlive_server, nserver, pool->idx, */ + /* pool->name.len, pool->name.data); */ + + /* continuum_addition = RANDOM_CONTINUUM_ADDITION; */ + /* points_per_server = 
RANDOM_POINTS_PER_SERVER; */ + + /* /\* */ + /* * Allocate the continuum for the pool, the first time, and every time we + */ + /* * add a new server to the pool */ + /* *\/ */ + /* if (nlive_server > pool->nserver_continuum) { */ + /* struct continuum *continuum; */ + /* uint32_t nserver_continuum = nlive_server + RANDOM_CONTINUUM_ADDITION; + */ + /* uint32_t ncontinuum = nserver_continuum * RANDOM_POINTS_PER_SERVER; */ + + /* continuum = dn_realloc(pool->continuum, sizeof(*continuum) * + * ncontinuum); */ + /* if (continuum == NULL) { */ + /* return DN_ENOMEM; */ + /* } */ + + /* srandom((uint32_t)time(NULL)); */ + + /* pool->continuum = continuum; */ + /* pool->nserver_continuum = nserver_continuum; */ + /* /\* pool->ncontinuum is initialized later as it could be <= ncontinuum + * *\/ */ + /* } */ + + /* /\* update the continuum with the servers that are live *\/ */ + /* continuum_index = 0; */ + /* pointer_counter = 0; */ + /* for (server_index = 0; server_index < nserver; server_index++) { */ + /* struct server *server = array_get(&pool->server, server_index); */ + + /* if (pool->auto_eject_hosts && server->next_retry > now) { */ + /* continue; */ + /* } */ + + /* pointer_per_server = 1; */ + + /* pool->continuum[continuum_index].index = server_index; */ + /* pool->continuum[continuum_index++].value = 0; */ + + /* pointer_counter += pointer_per_server; */ + /* } */ + /* pool->ncontinuum = pointer_counter; */ + + /* log_debug(LOG_VERB, "updated pool %"PRIu32" '%.*s' with %"PRIu32" of " */ + /* "%"PRIu32" servers live in %"PRIu32" slots and %"PRIu32" " */ + /* "active points in %"PRIu32" slots", pool->idx, */ + /* pool->name.len, pool->name.data, nlive_server, nserver, */ + /* pool->nserver_continuum, pool->ncontinuum, */ + /* (pool->nserver_continuum + continuum_addition) * + * points_per_server); */ + + return DN_OK; } -uint32_t -random_dispatch(struct continuum *continuum, uint32_t ncontinuum, uint32_t hash) -{ - struct continuum *c; +uint32_t 
random_dispatch(struct continuum *continuum, uint32_t ncontinuum, + uint32_t hash) { + struct continuum *c; - ASSERT(continuum != NULL); - ASSERT(ncontinuum != 0); + ASSERT(continuum != NULL); + ASSERT(ncontinuum != 0); - c = continuum + random() % ncontinuum; + c = continuum + random() % ncontinuum; - return c->index; + return c->index; } diff --git a/src/hashkit/dyn_token.c b/src/hashkit/dyn_token.c index cc4934f98..961595588 100644 --- a/src/hashkit/dyn_token.c +++ b/src/hashkit/dyn_token.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -20,251 +20,231 @@ * limitations under the License. */ -#include -#include #include "dyn_token.h" +#include +#include +#include "../dyn_array.h" +#include "../dyn_log.h" +#include "../dyn_string.h" +#include "../dyn_util.h" -/* bitsPerDigit radix of 10 multiplied by 1024, rounded up to avoid underallocation */ +/* bitsPerDigit radix of 10 multiplied by 1024, rounded up to avoid + * underallocation */ #define BITS_PER_DIGIT 3402 /* TODO: jeb - this should be 9, but hacked for POC */ #define DIGITS_PER_INT 10 -void -init_dyn_token(struct dyn_token *token) -{ - token->signum = 0; - token->len = 0; +void init_dyn_token(struct dyn_token *token) { + token->signum = 0; + token->len = 0; } -void -deinit_dyn_token(struct dyn_token *token) -{ - token->signum = 0; - token->len = 0; +void deinit_dyn_token(struct dyn_token *token) { + token->signum = 0; + token->len = 0; } -//This implementation does not take into account that token's val can be very large, larger than an uint32 number -rstatus_t -size_dyn_token(struct dyn_token *token, uint32_t token_len) -{ - ASSERT(token_len <= 4); - token->len = token_len; - token->signum = 0; +// This 
implementation does not take into account that token's val can be very +// large, larger than an uint32 number +rstatus_t size_dyn_token(struct dyn_token *token, uint32_t token_len) { + ASSERT(token_len <= 4); + token->len = token_len; + token->signum = 0; - return DN_OK; + return DN_OK; } - -//This implementation does not take into account that token's val can be very large, larger than an uint32 number -rstatus_t -copy_dyn_token(const struct dyn_token * src, struct dyn_token * dst) -{ - rstatus_t status = size_dyn_token(dst, 1); - set_int_dyn_token(dst, src->mag[0]); - return status; +// This implementation does not take into account that token's val can be very +// large, larger than an uint32 number +rstatus_t copy_dyn_token(const struct dyn_token *src, struct dyn_token *dst) { + rstatus_t status = size_dyn_token(dst, 1); + set_int_dyn_token(dst, src->mag[0]); + return status; } - -//This implementation does not take into account that token's val can be very large, larger than an uint32 number -void -set_int_dyn_token(struct dyn_token *token, uint32_t val) -{ - token->mag[0] = val; - token->len = 1; - token->signum = val > 0 ? 1 : 0; +// This implementation does not take into account that token's val can be very +// large, larger than an uint32 number +void set_int_dyn_token(struct dyn_token *token, uint32_t val) { + token->mag[0] = val; + token->len = 1; + token->signum = val > 0 ? 
1 : 0; } -void print_dyn_token(struct dyn_token *token, int num_tabs) -{ - if (token == NULL) - log_debug(LOG_VERB, "Token is null!!!!!"); - - if (num_tabs < 0) - num_tabs = 0; +void print_dyn_token(struct dyn_token *token, int num_tabs) { + if (token == NULL) log_debug(LOG_VERB, "Token is null!!!!!"); - log_debug(LOG_VERB, "%*cToken : %"PRIu32" %"PRIu32" %"PRIu32" ", num_tabs, '\t', token->signum, token->mag[0], token->len); + if (num_tabs < 0) num_tabs = 0; + log_debug(LOG_VERB, "%*cToken : %" PRIu32 " %" PRIu32 " %" PRIu32 " ", + num_tabs, '\t', token->signum, token->mag[0], token->len); } -static void -add_next_word(uint32_t *buf, uint32_t len, uint32_t next_int) -{ - uint64_t product = 0; - uint64_t carry = 0; - - /* magick! */ - uint32_t radix_val = 0x17179149; - uint32_t i; - for (i = len - 1; i >= 0; i--) { - product = radix_val * buf[i] + carry; - buf[i] = (uint32_t)product; - carry = product >> 32; - } - - uint64_t sum = buf[len-1] + next_int; - buf[len-1] = (uint32_t)sum; - carry = sum >> 32; - for (i = len-2; i >= 0; i--) { - sum = buf[i] + carry; - buf[i] = (uint32_t)sum; - carry = sum >> 32; - } +static void add_next_word(uint32_t *buf, uint32_t len, uint32_t next_int) { + uint64_t product = 0; + uint64_t carry = 0; + + /* magick! 
*/ + uint32_t radix_val = 0x17179149; + uint32_t i; + for (i = len - 1; i >= 0; i--) { + product = radix_val * buf[i] + carry; + buf[i] = (uint32_t)product; + carry = product >> 32; + } + + uint64_t sum = buf[len - 1] + next_int; + buf[len - 1] = (uint32_t)sum; + carry = sum >> 32; + for (i = len - 2; i >= 0; i--) { + sum = buf[i] + carry; + buf[i] = (uint32_t)sum; + carry = sum >> 32; + } } -rstatus_t -parse_dyn_token(uint8_t *start, uint32_t len, struct dyn_token *token) -{ - ASSERT(len > 0); - ASSERT(token != NULL); - - /* TODO-jeb: check for whitespace */ - char sign = '-'; - uint8_t *p = start; - uint8_t *q = p + len; - uint32_t digits = len; - if (p[0] == sign) { - token->signum = -1; - p++; - digits--; - ASSERT(digits > 0); - } else if (digits == 1 && p[0] == '0') { - token->signum = 0; - } else { - token->signum = 1; - } - - uint32_t nwords; - /* if (digits < 10) { */ - nwords = 1; - /* } else { */ - /* uint32_t nbits = ((digits * BITS_PER_DIGIT) >> 10) + 1; */ - /* nwords = (nbits + 32) >> 5; */ - /* } */ - - uint32_t *buf = token->mag; - token->len = nwords; - - // Process first (potentially short) digit group - uint32_t first_group_len = digits % DIGITS_PER_INT; - if (first_group_len == 0) - first_group_len = DIGITS_PER_INT; - buf[nwords - 1] = dn_atoui(p, first_group_len); - p += first_group_len; - - // Process remaining digit groups - while (p < q) { - uint32_t local_int = dn_atoui(p, DIGITS_PER_INT); - add_next_word(buf, nwords, local_int); - p += DIGITS_PER_INT; - } - - return DN_OK; +rstatus_t parse_dyn_token(uint8_t *start, uint32_t len, + struct dyn_token *token) { + ASSERT(len > 0); + ASSERT(token != NULL); + + /* TODO-jeb: check for whitespace */ + char sign = '-'; + uint8_t *p = start; + uint8_t *q = p + len; + uint32_t digits = len; + if (p[0] == sign) { + token->signum = -1; + p++; + digits--; + ASSERT(digits > 0); + } else if (digits == 1 && p[0] == '0') { + token->signum = 0; + } else { + token->signum = 1; + } + + uint32_t nwords; + /* if 
(digits < 10) { */ + nwords = 1; + /* } else { */ + /* uint32_t nbits = ((digits * BITS_PER_DIGIT) >> 10) + 1; */ + /* nwords = (nbits + 32) >> 5; */ + /* } */ + + uint32_t *buf = token->mag; + token->len = nwords; + + // Process first (potentially short) digit group + uint32_t first_group_len = digits % DIGITS_PER_INT; + if (first_group_len == 0) first_group_len = DIGITS_PER_INT; + buf[nwords - 1] = dn_atoui(p, first_group_len); + p += first_group_len; + + // Process remaining digit groups + while (p < q) { + uint32_t local_int = dn_atoui(p, DIGITS_PER_INT); + add_next_word(buf, nwords, local_int); + p += DIGITS_PER_INT; + } + + return DN_OK; } -int32_t -cmp_dyn_token(const struct dyn_token *t1, const struct dyn_token *t2) -{ - ASSERT(t1 != NULL); - ASSERT(t2 != NULL); - - if (t1->signum == t2->signum) { - if (t1->signum == 0) { - return 0; - } - - if (t1->len == t2->len) { - uint32_t i; - for (i = 0; i < t1->len; i++) { - uint32_t a = t1->mag[i]; - uint32_t b = t2->mag[i]; - if (a != b) { - return a > b ? 1 : -1; - } - } - return 0; - } - - return t1->len > t2->len ? 1 : -1; - } - - return t1->signum > t2->signum ? 1 : -1; +int32_t cmp_dyn_token(const struct dyn_token *t1, const struct dyn_token *t2) { + ASSERT(t1 != NULL); + ASSERT(t2 != NULL); + + if (t1->signum == t2->signum) { + if (t1->signum == 0) { + return 0; + } + + if (t1->len == t2->len) { + uint32_t i; + for (i = 0; i < t1->len; i++) { + uint32_t a = t1->mag[i]; + uint32_t b = t2->mag[i]; + if (a != b) { + return a > b ? 1 : -1; + } + } + return 0; + } + + return t1->len > t2->len ? 1 : -1; + } + + return t1->signum > t2->signum ? 1 : -1; } - /* * Does the work of reading an array of chars, and constructing the tokens * for the array. 
*/ -rstatus_t -derive_tokens(struct array *tokens, uint8_t *start, uint8_t *end) -{ - ASSERT (end > start); - uint8_t *p = end; - uint8_t *q; - while (p >= start) { - struct dyn_token *token = array_push(tokens); - ASSERT (token != NULL); - init_dyn_token(token); - - q = dn_strrchr(p, start, ','); - if (q == NULL) { - q = start; /* we're at the beginning of the list */ - } else { - q++; - } - - uint32_t len = 0; - if (p == end) { - len = (uint32_t)(p - q); - } else { - len = (uint32_t)(p - q + 1); - } - - rstatus_t status = parse_dyn_token(q, len, token); - if (status != DN_OK) { - return DN_ERROR; - } - - p = q - 2; - } - - return DN_OK; +rstatus_t derive_tokens(struct array *tokens, uint8_t *start, uint8_t *end) { + ASSERT(end > start); + uint8_t *p = end; + uint8_t *q; + while (p >= start) { + struct dyn_token *token = array_push(tokens); + ASSERT(token != NULL); + init_dyn_token(token); + + q = dn_strrchr(p, start, ','); + if (q == NULL) { + q = start; /* we're at the beginning of the list */ + } else { + q++; + } + + uint32_t len = 0; + if (p == end) { + len = (uint32_t)(p - q); + } else { + len = (uint32_t)(p - q + 1); + } + + rstatus_t status = parse_dyn_token(q, len, token); + if (status != DN_OK) { + return DN_ERROR; + } + + p = q - 2; + } + + return DN_OK; } - -rstatus_t -derive_token(struct dyn_token *token, uint8_t *start, uint8_t *end) -{ - ASSERT (end > start); - uint8_t *p = end; - uint8_t *q; - ASSERT (token != NULL); - init_dyn_token(token); - - if (p >= start) { - q = dn_strrchr(p, start, ','); - if (q == NULL) { - q = start; /* we're at the beginning of the list */ - } else { - q++; - } - - uint32_t len = 0; - if (p == end) { - len = (uint32_t)(p - q); - } else { - len = (uint32_t)(p - q + 1); - } - - rstatus_t status = parse_dyn_token(q, len, token); - if (status != DN_OK) { - return DN_ERROR; - } - - p = q - 2; - } - - return DN_OK; +rstatus_t derive_token(struct dyn_token *token, uint8_t *start, uint8_t *end) { + ASSERT(end > start); + uint8_t 
*p = end; + uint8_t *q; + ASSERT(token != NULL); + init_dyn_token(token); + + if (p >= start) { + q = dn_strrchr(p, start, ','); + if (q == NULL) { + q = start; /* we're at the beginning of the list */ + } else { + q++; + } + + uint32_t len = 0; + if (p == end) { + len = (uint32_t)(p - q); + } else { + len = (uint32_t)(p - q + 1); + } + + rstatus_t status = parse_dyn_token(q, len, token); + if (status != DN_OK) { + return DN_ERROR; + } + + p = q - 2; + } + + return DN_OK; } - diff --git a/src/hashkit/dyn_token.h b/src/hashkit/dyn_token.h index 08dec88ab..6b9c06d04 100644 --- a/src/hashkit/dyn_token.h +++ b/src/hashkit/dyn_token.h @@ -1,33 +1,34 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ - - -#include "dyn_core.h" + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ #ifndef _DYN_TOKEN_H_ #define _DYN_TOKEN_H_ +#include "../dyn_types.h" + +// Forward declarations +struct array; struct dyn_token { - uint32_t signum; - uint32_t mag[4]; - uint32_t len; + uint32_t signum; + uint32_t mag[4]; + uint32_t len; }; - void init_dyn_token(struct dyn_token *token); void deinit_dyn_token(struct dyn_token *token); rstatus_t size_dyn_token(struct dyn_token *token, uint32_t size); -rstatus_t copy_dyn_token(const struct dyn_token * src, struct dyn_token * dst); +rstatus_t copy_dyn_token(const struct dyn_token *src, struct dyn_token *dst); /** * convenience function for setting a token whose value is just an int */ void set_int_dyn_token(struct dyn_token *token, uint32_t val); -rstatus_t parse_dyn_token(uint8_t *start, uint32_t len, struct dyn_token *token); +rstatus_t parse_dyn_token(uint8_t *start, uint32_t len, + struct dyn_token *token); int32_t cmp_dyn_token(const struct dyn_token *t1, const struct dyn_token *t2); rstatus_t derive_tokens(struct array *tokens, uint8_t *start, uint8_t *end); rstatus_t 
derive_token(struct dyn_token *token, uint8_t *start, uint8_t *end); diff --git a/src/proto/dyn_memcache.c b/src/proto/dyn_memcache.c index 3518f2d7b..27303e3ff 100644 --- a/src/proto/dyn_memcache.c +++ b/src/proto/dyn_memcache.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -42,1491 +42,1490 @@ * Return true, if the memcache command is a storage command, otherwise * return false */ -static bool -memcache_storage(struct msg *r) -{ - switch (r->type) { +static bool memcache_storage(struct msg *r) { + switch (r->type) { case MSG_REQ_MC_SET: case MSG_REQ_MC_CAS: case MSG_REQ_MC_ADD: case MSG_REQ_MC_REPLACE: case MSG_REQ_MC_APPEND: case MSG_REQ_MC_PREPEND: - return true; + return true; default: - break; - } + break; + } - return false; + return false; } /* * Return true, if the memcache command is a cas command, otherwise * return false */ -static bool -memcache_cas(struct msg *r) -{ - if (r->type == MSG_REQ_MC_CAS) { - return true; - } +static bool memcache_cas(struct msg *r) { + if (r->type == MSG_REQ_MC_CAS) { + return true; + } - return false; + return false; } /* * Return true, if the memcache command is a retrieval command, otherwise * return false */ -static bool -memcache_retrieval(struct msg *r) -{ - switch (r->type) { +static bool memcache_retrieval(struct msg *r) { + switch (r->type) { case MSG_REQ_MC_GET: case MSG_REQ_MC_GETS: - return true; + return true; default: - break; - } + break; + } - return false; + return false; } /* * Return true, if the memcache command is a arithmetic command, otherwise * return false */ -static bool -memcache_arithmetic(struct msg *r) -{ - switch (r->type) { +static bool memcache_arithmetic(struct msg *r) { + switch 
(r->type) { case MSG_REQ_MC_INCR: case MSG_REQ_MC_DECR: - return true; + return true; default: - break; - } + break; + } - return false; + return false; } /* * Return true, if the memcache command is a delete command, otherwise * return false */ -static bool -memcache_delete(struct msg *r) -{ - if (r->type == MSG_REQ_MC_DELETE) { - return true; - } +static bool memcache_delete(struct msg *r) { + if (r->type == MSG_REQ_MC_DELETE) { + return true; + } - return false; + return false; } /* * Return true if the memcache command is a touch command, otherwise * return false */ -static bool -memcache_touch(struct msg *r) -{ - if (r->type == MSG_REQ_MC_TOUCH) { - return true; - } +static bool memcache_touch(struct msg *r) { + if (r->type == MSG_REQ_MC_TOUCH) { + return true; + } - return false; + return false; } -void -memcache_parse_req(struct msg *r, const struct string *hash_tag) -{ - struct mbuf *b; - uint8_t *p, *m; - uint8_t ch; - enum { - SW_START, - SW_REQ_TYPE, - SW_SPACES_BEFORE_KEY, - SW_KEY, - SW_SPACES_BEFORE_KEYS, - SW_SPACES_BEFORE_FLAGS, - SW_FLAGS, - SW_SPACES_BEFORE_EXPIRY, - SW_EXPIRY, - SW_SPACES_BEFORE_VLEN, - SW_VLEN, - SW_SPACES_BEFORE_CAS, - SW_CAS, - SW_RUNTO_VAL, - SW_VAL, - SW_SPACES_BEFORE_NUM, - SW_NUM, - SW_RUNTO_CRLF, - SW_CRLF, - SW_NOREPLY, - SW_AFTER_NOREPLY, - SW_ALMOST_DONE, - SW_SENTINEL - } state; - - state = r->state; - b = STAILQ_LAST(&r->mhdr, mbuf, next); - - ASSERT(r->is_request); - ASSERT(state >= SW_START && state < SW_SENTINEL); - ASSERT(b != NULL); - ASSERT(b->pos <= b->last); - - /* validate the parsing maker */ - ASSERT(r->pos != NULL); - ASSERT(r->pos >= b->pos && r->pos <= b->last); - - for (p = r->pos; p < b->last; p++) { - ch = *p; - - switch (state) { - case SW_START: - if (ch == ' ') { +void memcache_parse_req(struct msg *r, const struct string *hash_tag) { + struct mbuf *b; + uint8_t *p, *m; + uint8_t ch; + enum { + SW_START, + SW_REQ_TYPE, + SW_SPACES_BEFORE_KEY, + SW_KEY, + SW_SPACES_BEFORE_KEYS, + 
SW_SPACES_BEFORE_FLAGS, + SW_FLAGS, + SW_SPACES_BEFORE_EXPIRY, + SW_EXPIRY, + SW_SPACES_BEFORE_VLEN, + SW_VLEN, + SW_SPACES_BEFORE_CAS, + SW_CAS, + SW_RUNTO_VAL, + SW_VAL, + SW_SPACES_BEFORE_NUM, + SW_NUM, + SW_RUNTO_CRLF, + SW_CRLF, + SW_NOREPLY, + SW_AFTER_NOREPLY, + SW_ALMOST_DONE, + SW_SENTINEL + } state; + + state = r->state; + b = STAILQ_LAST(&r->mhdr, mbuf, next); + + ASSERT(r->is_request); + ASSERT(state >= SW_START && state < SW_SENTINEL); + ASSERT(b != NULL); + ASSERT(b->pos <= b->last); + + /* validate the parsing maker */ + ASSERT(r->pos != NULL); + ASSERT(r->pos >= b->pos && r->pos <= b->last); + + for (p = r->pos; p < b->last; p++) { + ch = *p; + + switch (state) { + case SW_START: + if (ch == ' ') { + break; + } + + if (!islower(ch)) { + goto error; + } + + /* req_start <- p; type_start <- p */ + r->token = p; + state = SW_REQ_TYPE; + + break; + + case SW_REQ_TYPE: + if (ch == ' ' || ch == CR) { + /* type_end = p - 1 */ + m = r->token; + r->token = NULL; + r->type = MSG_UNKNOWN; + r->narg++; + + switch (p - m) { + case 3: + if (str4cmp(m, 'g', 'e', 't', ' ')) { + r->type = MSG_REQ_MC_GET; + r->is_read = 1; break; - } + } - if (!islower(ch)) { - goto error; - } + if (str4cmp(m, 's', 'e', 't', ' ')) { + r->type = MSG_REQ_MC_SET; + r->is_read = 0; + break; + } - /* req_start <- p; type_start <- p */ - r->token = p; - state = SW_REQ_TYPE; + if (str4cmp(m, 'a', 'd', 'd', ' ')) { + r->type = MSG_REQ_MC_ADD; + r->is_read = 0; + break; + } - break; + if (str4cmp(m, 'c', 'a', 's', ' ')) { + r->type = MSG_REQ_MC_CAS; + r->is_read = 0; + break; + } - case SW_REQ_TYPE: - if (ch == ' ' || ch == CR) { - /* type_end = p - 1 */ - m = r->token; - r->token = NULL; - r->type = MSG_UNKNOWN; - r->narg++; - - switch (p - m) { - - case 3: - if (str4cmp(m, 'g', 'e', 't', ' ')) { - r->type = MSG_REQ_MC_GET; - r->is_read = 1; - break; - } - - if (str4cmp(m, 's', 'e', 't', ' ')) { - r->type = MSG_REQ_MC_SET; - r->is_read = 0; - break; - } - - if (str4cmp(m, 'a', 'd', 'd', ' 
')) { - r->type = MSG_REQ_MC_ADD; - r->is_read = 0; - break; - } - - if (str4cmp(m, 'c', 'a', 's', ' ')) { - r->type = MSG_REQ_MC_CAS; - r->is_read = 0; - break; - } - - break; - - case 4: - if (str4cmp(m, 'g', 'e', 't', 's')) { - r->type = MSG_REQ_MC_GETS; - r->is_read = 1; - break; - } - - if (str4cmp(m, 'i', 'n', 'c', 'r')) { - r->type = MSG_REQ_MC_INCR; - r->is_read = 0; - break; - } - - if (str4cmp(m, 'd', 'e', 'c', 'r')) { - r->type = MSG_REQ_MC_DECR; - r->is_read = 0; - break; - } - - if (str4cmp(m, 'q', 'u', 'i', 't')) { - r->type = MSG_REQ_MC_QUIT; - r->quit = 1; - r->is_read = 1; - break; - } - - break; - - case 5: - if (str5cmp(m, 't', 'o', 'u', 'c', 'h')) { - r->type = MSG_REQ_MC_TOUCH; - r->is_read = 0; - break; - } - - break; - - case 6: - if (str6cmp(m, 'a', 'p', 'p', 'e', 'n', 'd')) { - r->type = MSG_REQ_MC_APPEND; - r->is_read = 0; - break; - } - - if (str6cmp(m, 'd', 'e', 'l', 'e', 't', 'e')) { - r->type = MSG_REQ_MC_DELETE; - r->is_read = 0; - break; - } - - break; - - case 7: - if (str7cmp(m, 'p', 'r', 'e', 'p', 'e', 'n', 'd')) { - r->type = MSG_REQ_MC_PREPEND; - r->is_read = 0; - break; - } - - if (str7cmp(m, 'r', 'e', 'p', 'l', 'a', 'c', 'e')) { - r->type = MSG_REQ_MC_REPLACE; - r->is_read = 0; - break; - } - - break; - } - - switch (r->type) { - case MSG_REQ_MC_GET: - case MSG_REQ_MC_GETS: - case MSG_REQ_MC_DELETE: - case MSG_REQ_MC_CAS: - case MSG_REQ_MC_SET: - case MSG_REQ_MC_ADD: - case MSG_REQ_MC_REPLACE: - case MSG_REQ_MC_APPEND: - case MSG_REQ_MC_PREPEND: - case MSG_REQ_MC_INCR: - case MSG_REQ_MC_DECR: - case MSG_REQ_MC_TOUCH: - if (ch == CR) { - goto error; - } - state = SW_SPACES_BEFORE_KEY; - break; - - case MSG_REQ_MC_QUIT: - p = p - 1; /* go back by 1 byte */ - state = SW_CRLF; - break; - - case MSG_UNKNOWN: - goto error; - - default: - NOT_REACHED(); - } - - } else if (!islower(ch)) { - goto error; - } + break; - break; + case 4: + if (str4cmp(m, 'g', 'e', 't', 's')) { + r->type = MSG_REQ_MC_GETS; + r->is_read = 1; + break; + } - 
case SW_SPACES_BEFORE_KEY: - if (ch != ' ') { - p = p - 1; /* go back by 1 byte */ - r->token = NULL; - state = SW_KEY; - } - break; + if (str4cmp(m, 'i', 'n', 'c', 'r')) { + r->type = MSG_REQ_MC_INCR; + r->is_read = 0; + break; + } - case SW_KEY: - if (r->token == NULL) { - r->token = p; - } - if (ch == ' ' || ch == CR) { - struct keypos *kpos; - int keylen = p - r->token; - if (keylen > MEMCACHE_MAX_KEY_LENGTH) { - log_error("parsed bad req %"PRIu64" of type %d with key " - "prefix '%.*s...' and length %d that exceeds " - "maximum key length", r->id, r->type, 16, - r->token, p - r->token); - goto error; - } else if (keylen == 0) { - log_error("parsed bad req %"PRIu64" of type %d with an " - "empty key", r->id, r->type); - goto error; - } - - kpos = array_push(r->keys); - if (kpos == NULL) { - goto enomem; - } - kpos->start = kpos->tag_start = r->token; - kpos->end = kpos->tag_end = p; - if (!string_empty(hash_tag)) { - uint8_t *tag_start, *tag_end; - - tag_start = dn_strchr(kpos->start, kpos->end, hash_tag->data[0]); - if (tag_start != NULL) { - tag_end = dn_strchr(tag_start + 1, kpos->end, hash_tag->data[1]); - if (tag_end != NULL) { - kpos->tag_start = tag_start + 1; - kpos->tag_end = tag_end; - } - } - } - - kpos->start = r->token; - kpos->end = p; - - r->narg++; - r->token = NULL; - - /* get next state */ - if (memcache_storage(r)) { - state = SW_SPACES_BEFORE_FLAGS; - } else if (memcache_arithmetic(r) || memcache_touch(r) ) { - state = SW_SPACES_BEFORE_NUM; - } else if (memcache_delete(r)) { - state = SW_RUNTO_CRLF; - } else if (memcache_retrieval(r)) { - state = SW_SPACES_BEFORE_KEYS; - } else { - state = SW_RUNTO_CRLF; - } - - if (ch == CR) { - if (memcache_storage(r) || memcache_arithmetic(r)) { - goto error; - } - p = p - 1; /* go back by 1 byte */ - } - } + if (str4cmp(m, 'd', 'e', 'c', 'r')) { + r->type = MSG_REQ_MC_DECR; + r->is_read = 0; + break; + } - break; + if (str4cmp(m, 'q', 'u', 'i', 't')) { + r->type = MSG_REQ_MC_QUIT; + r->quit = 1; + 
r->is_read = 1; + break; + } - case SW_SPACES_BEFORE_KEYS: - ASSERT(memcache_retrieval(r)); - switch (ch) { - case ' ': + break; + + case 5: + if (str5cmp(m, 't', 'o', 'u', 'c', 'h')) { + r->type = MSG_REQ_MC_TOUCH; + r->is_read = 0; break; + } - case CR: - state = SW_ALMOST_DONE; + break; + + case 6: + if (str6cmp(m, 'a', 'p', 'p', 'e', 'n', 'd')) { + r->type = MSG_REQ_MC_APPEND; + r->is_read = 0; + break; + } + + if (str6cmp(m, 'd', 'e', 'l', 'e', 't', 'e')) { + r->type = MSG_REQ_MC_DELETE; + r->is_read = 0; + break; + } + + break; + + case 7: + if (str7cmp(m, 'p', 'r', 'e', 'p', 'e', 'n', 'd')) { + r->type = MSG_REQ_MC_PREPEND; + r->is_read = 0; + break; + } + + if (str7cmp(m, 'r', 'e', 'p', 'l', 'a', 'c', 'e')) { + r->type = MSG_REQ_MC_REPLACE; + r->is_read = 0; break; + } + + break; + } + + switch (r->type) { + case MSG_REQ_MC_GET: + case MSG_REQ_MC_GETS: + case MSG_REQ_MC_DELETE: + case MSG_REQ_MC_CAS: + case MSG_REQ_MC_SET: + case MSG_REQ_MC_ADD: + case MSG_REQ_MC_REPLACE: + case MSG_REQ_MC_APPEND: + case MSG_REQ_MC_PREPEND: + case MSG_REQ_MC_INCR: + case MSG_REQ_MC_DECR: + case MSG_REQ_MC_TOUCH: + if (ch == CR) { + goto error; + } + state = SW_SPACES_BEFORE_KEY; + break; + + case MSG_REQ_MC_QUIT: + p = p - 1; /* go back by 1 byte */ + state = SW_CRLF; + break; + + case MSG_UNKNOWN: + goto error; default: - r->token = NULL; - p = p - 1; /* go back by 1 byte */ - state = SW_KEY; - } + NOT_REACHED(); + } - break; + } else if (!islower(ch)) { + goto error; + } - case SW_SPACES_BEFORE_FLAGS: - if (ch != ' ') { - if (!isdigit(ch)) { - goto error; - } - /* flags_start <- p; flags <- ch - '0' */ - r->token = p; - state = SW_FLAGS; - } + break; - break; + case SW_SPACES_BEFORE_KEY: + if (ch != ' ') { + p = p - 1; /* go back by 1 byte */ + r->token = NULL; + state = SW_KEY; + } + break; - case SW_FLAGS: - if (isdigit(ch)) { - /* flags <- flags * 10 + (ch - '0') */ - ; - } else if (ch == ' ') { - /* flags_end <- p - 1 */ - r->token = NULL; - state = 
SW_SPACES_BEFORE_EXPIRY; - } else { - goto error; + case SW_KEY: + if (r->token == NULL) { + r->token = p; + } + if (ch == ' ' || ch == CR) { + struct keypos *kpos; + int keylen = p - r->token; + if (keylen > MEMCACHE_MAX_KEY_LENGTH) { + log_error("parsed bad req %" PRIu64 + " of type %d with key " + "prefix '%.*s...' and length %d that exceeds " + "maximum key length", + r->id, r->type, 16, r->token, p - r->token); + goto error; + } else if (keylen == 0) { + log_error("parsed bad req %" PRIu64 + " of type %d with an " + "empty key", + r->id, r->type); + goto error; + } + + kpos = array_push(r->keys); + if (kpos == NULL) { + goto enomem; + } + kpos->start = kpos->tag_start = r->token; + kpos->end = kpos->tag_end = p; + if (!string_empty(hash_tag)) { + uint8_t *tag_start, *tag_end; + + tag_start = dn_strchr(kpos->start, kpos->end, hash_tag->data[0]); + if (tag_start != NULL) { + tag_end = dn_strchr(tag_start + 1, kpos->end, hash_tag->data[1]); + if (tag_end != NULL) { + kpos->tag_start = tag_start + 1; + kpos->tag_end = tag_end; + } } + } + + kpos->start = r->token; + kpos->end = p; + + r->narg++; + r->token = NULL; + + /* get next state */ + if (memcache_storage(r)) { + state = SW_SPACES_BEFORE_FLAGS; + } else if (memcache_arithmetic(r) || memcache_touch(r)) { + state = SW_SPACES_BEFORE_NUM; + } else if (memcache_delete(r)) { + state = SW_RUNTO_CRLF; + } else if (memcache_retrieval(r)) { + state = SW_SPACES_BEFORE_KEYS; + } else { + state = SW_RUNTO_CRLF; + } + + if (ch == CR) { + if (memcache_storage(r) || memcache_arithmetic(r)) { + goto error; + } + p = p - 1; /* go back by 1 byte */ + } + } - break; + break; - case SW_SPACES_BEFORE_EXPIRY: - if (ch != ' ') { - if (!isdigit(ch)) { - goto error; - } - /* expiry_start <- p; expiry <- ch - '0' */ - r->token = p; - state = SW_EXPIRY; - } + case SW_SPACES_BEFORE_KEYS: + ASSERT(memcache_retrieval(r)); + switch (ch) { + case ' ': + break; + case CR: + state = SW_ALMOST_DONE; break; - case SW_EXPIRY: - if (isdigit(ch)) 
{ - /* expiry <- expiry * 10 + (ch - '0') */ - ; - } else if (ch == ' ') { - /* expiry_end <- p - 1 */ - r->token = NULL; - state = SW_SPACES_BEFORE_VLEN; - } else { - goto error; - } + default: + r->token = NULL; + p = p - 1; /* go back by 1 byte */ + state = SW_KEY; + } - break; + break; - case SW_SPACES_BEFORE_VLEN: - if (ch != ' ') { - if (!isdigit(ch)) { - goto error; - } - /* vlen_start <- p */ - r->vlen = (uint32_t)(ch - '0'); - state = SW_VLEN; - } + case SW_SPACES_BEFORE_FLAGS: + if (ch != ' ') { + if (!isdigit(ch)) { + goto error; + } + /* flags_start <- p; flags <- ch - '0' */ + r->token = p; + state = SW_FLAGS; + } - break; + break; - case SW_VLEN: - if (isdigit(ch)) { - r->vlen = r->vlen * 10 + (uint32_t)(ch - '0'); - } else if (memcache_cas(r)) { - if (ch != ' ') { - goto error; - } - /* vlen_end <- p - 1 */ - p = p - 1; /* go back by 1 byte */ - r->token = NULL; - state = SW_SPACES_BEFORE_CAS; - } else if (ch == ' ' || ch == CR) { - /* vlen_end <- p - 1 */ - p = p - 1; /* go back by 1 byte */ - r->token = NULL; - state = SW_RUNTO_CRLF; - } else { - goto error; - } + case SW_FLAGS: + if (isdigit(ch)) { + /* flags <- flags * 10 + (ch - '0') */ + ; + } else if (ch == ' ') { + /* flags_end <- p - 1 */ + r->token = NULL; + state = SW_SPACES_BEFORE_EXPIRY; + } else { + goto error; + } - break; + break; - case SW_SPACES_BEFORE_CAS: - if (ch != ' ') { - if (!isdigit(ch)) { - goto error; - } - /* cas_start <- p; cas <- ch - '0' */ - r->token = p; - state = SW_CAS; - } + case SW_SPACES_BEFORE_EXPIRY: + if (ch != ' ') { + if (!isdigit(ch)) { + goto error; + } + /* expiry_start <- p; expiry <- ch - '0' */ + r->token = p; + state = SW_EXPIRY; + } - break; + break; - case SW_CAS: - if (isdigit(ch)) { - /* cas <- cas * 10 + (ch - '0') */ - ; - } else if (ch == ' ' || ch == CR) { - /* cas_end <- p - 1 */ - p = p - 1; /* go back by 1 byte */ - r->token = NULL; - state = SW_RUNTO_CRLF; - } else { - goto error; - } + case SW_EXPIRY: + if (isdigit(ch)) { + /* expiry <- 
expiry * 10 + (ch - '0') */ + ; + } else if (ch == ' ') { + /* expiry_end <- p - 1 */ + r->token = NULL; + state = SW_SPACES_BEFORE_VLEN; + } else { + goto error; + } - break; + break; + case SW_SPACES_BEFORE_VLEN: + if (ch != ' ') { + if (!isdigit(ch)) { + goto error; + } + /* vlen_start <- p */ + r->vlen = (uint32_t)(ch - '0'); + state = SW_VLEN; + } - case SW_RUNTO_VAL: - switch (ch) { - case LF: - /* val_start <- p + 1 */ - state = SW_VAL; - break; + break; - default: - goto error; - } + case SW_VLEN: + if (isdigit(ch)) { + r->vlen = r->vlen * 10 + (uint32_t)(ch - '0'); + } else if (memcache_cas(r)) { + if (ch != ' ') { + goto error; + } + /* vlen_end <- p - 1 */ + p = p - 1; /* go back by 1 byte */ + r->token = NULL; + state = SW_SPACES_BEFORE_CAS; + } else if (ch == ' ' || ch == CR) { + /* vlen_end <- p - 1 */ + p = p - 1; /* go back by 1 byte */ + r->token = NULL; + state = SW_RUNTO_CRLF; + } else { + goto error; + } - break; + break; - case SW_VAL: - m = p + r->vlen; - if (m >= b->last) { - ASSERT(r->vlen >= (uint32_t)(b->last - p)); - r->vlen -= (uint32_t)(b->last - p); - m = b->last - 1; - p = m; /* move forward by vlen bytes */ - break; - } - switch (*m) { - case CR: - /* val_end <- p - 1 */ - p = m; /* move forward by vlen bytes */ - state = SW_ALMOST_DONE; - break; + case SW_SPACES_BEFORE_CAS: + if (ch != ' ') { + if (!isdigit(ch)) { + goto error; + } + /* cas_start <- p; cas <- ch - '0' */ + r->token = p; + state = SW_CAS; + } - default: - goto error; - } + break; - break; + case SW_CAS: + if (isdigit(ch)) { + /* cas <- cas * 10 + (ch - '0') */ + ; + } else if (ch == ' ' || ch == CR) { + /* cas_end <- p - 1 */ + p = p - 1; /* go back by 1 byte */ + r->token = NULL; + state = SW_RUNTO_CRLF; + } else { + goto error; + } - case SW_SPACES_BEFORE_NUM: - if (ch != ' ') { - if (!(isdigit(ch) || ch == '-')) { - goto error; - } - /* num_start <- p; num <- ch - '0' */ - r->token = p; - state = SW_NUM; - } + break; + case SW_RUNTO_VAL: + switch (ch) { + case LF: 
+ /* val_start <- p + 1 */ + state = SW_VAL; break; - case SW_NUM: - if (isdigit(ch)) { - /* num <- num * 10 + (ch - '0') */ - ; - } else if (ch == ' ' || ch == CR) { - r->token = NULL; - /* num_end <- p - 1 */ - p = p - 1; /* go back by 1 byte */ - state = SW_RUNTO_CRLF; - } else { - goto error; - } + default: + goto error; + } + break; + + case SW_VAL: + m = p + r->vlen; + if (m >= b->last) { + ASSERT(r->vlen >= (uint32_t)(b->last - p)); + r->vlen -= (uint32_t)(b->last - p); + m = b->last - 1; + p = m; /* move forward by vlen bytes */ + break; + } + switch (*m) { + case CR: + /* val_end <- p - 1 */ + p = m; /* move forward by vlen bytes */ + state = SW_ALMOST_DONE; break; - case SW_RUNTO_CRLF: - switch (ch) { - case ' ': - break; + default: + goto error; + } - case 'n': - if (memcache_storage(r) || memcache_arithmetic(r) || memcache_delete(r) || memcache_touch(r)) { - /* noreply_start <- p */ - r->token = p; - state = SW_NOREPLY; - } else { - goto error; - } + break; - break; + case SW_SPACES_BEFORE_NUM: + if (ch != ' ') { + if (!(isdigit(ch) || ch == '-')) { + goto error; + } + /* num_start <- p; num <- ch - '0' */ + r->token = p; + state = SW_NUM; + } - case CR: - if (memcache_storage(r)) { - state = SW_RUNTO_VAL; - } else { - state = SW_ALMOST_DONE; - } + break; - break; + case SW_NUM: + if (isdigit(ch)) { + /* num <- num * 10 + (ch - '0') */ + ; + } else if (ch == ' ' || ch == CR) { + r->token = NULL; + /* num_end <- p - 1 */ + p = p - 1; /* go back by 1 byte */ + state = SW_RUNTO_CRLF; + } else { + goto error; + } - default: - goto error; - } + break; + case SW_RUNTO_CRLF: + switch (ch) { + case ' ': break; - case SW_NOREPLY: - switch (ch) { - case ' ': - case CR: - m = r->token; - if (((p - m) == 7) && str7cmp(m, 'n', 'o', 'r', 'e', 'p', 'l', 'y')) { - ASSERT(memcache_storage(r) || memcache_arithmetic(r) || memcache_delete(r) || memcache_touch(r)); - r->token = NULL; - /* noreply_end <- p - 1 */ - r->expect_datastore_reply = 0; - state = SW_AFTER_NOREPLY; - 
p = p - 1; /* go back by 1 byte */ - } else { - goto error; - } + case 'n': + if (memcache_storage(r) || memcache_arithmetic(r) || + memcache_delete(r) || memcache_touch(r)) { + /* noreply_start <- p */ + r->token = p; + state = SW_NOREPLY; + } else { + goto error; } break; - case SW_AFTER_NOREPLY: - switch (ch) { - case ' ': - break; - - case CR: - if (memcache_storage(r)) { - state = SW_RUNTO_VAL; - } else { - state = SW_ALMOST_DONE; - } - break; - - default: - goto error; + case CR: + if (memcache_storage(r)) { + state = SW_RUNTO_VAL; + } else { + state = SW_ALMOST_DONE; } break; - case SW_CRLF: - switch (ch) { - case ' ': - break; + default: + goto error; + } - case CR: - state = SW_ALMOST_DONE; - break; + break; - default: - goto error; + case SW_NOREPLY: + switch (ch) { + case ' ': + case CR: + m = r->token; + if (((p - m) == 7) && + str7cmp(m, 'n', 'o', 'r', 'e', 'p', 'l', 'y')) { + ASSERT(memcache_storage(r) || memcache_arithmetic(r) || + memcache_delete(r) || memcache_touch(r)); + r->token = NULL; + /* noreply_end <- p - 1 */ + r->expect_datastore_reply = 0; + state = SW_AFTER_NOREPLY; + p = p - 1; /* go back by 1 byte */ + } else { + goto error; } + } - break; + break; - case SW_ALMOST_DONE: - switch (ch) { - case LF: - /* req_end <- p */ - goto done; + case SW_AFTER_NOREPLY: + switch (ch) { + case ' ': + break; - default: - goto error; + case CR: + if (memcache_storage(r)) { + state = SW_RUNTO_VAL; + } else { + state = SW_ALMOST_DONE; } + break; + + default: + goto error; + } + + break; + case SW_CRLF: + switch (ch) { + case ' ': break; - case SW_SENTINEL: - default: - NOT_REACHED(); + case CR: + state = SW_ALMOST_DONE; break; + default: + goto error; } - } - /* - * At this point, buffer from b->pos to b->last has been parsed completely - * but we haven't been able to reach to any conclusion. Normally, this - * means that we have to parse again starting from the state we are in - * after more data has been read. 
The newly read data is either read into - * a new mbuf, if existing mbuf is full (b->last == b->end) or into the - * existing mbuf. - * - * The only exception to this is when the existing mbuf is full (b->last - * is at b->end) and token marker is set, which means that we have to - * copy the partial token into a new mbuf and parse again with more data - * read into new mbuf. - */ - ASSERT(p == b->last); - r->pos = p; - r->state = state; - - if (b->last == b->end && r->token != NULL) { - r->pos = r->token; - r->token = NULL; - r->result = MSG_PARSE_REPAIR; - } else { - r->result = MSG_PARSE_AGAIN; - } + break; - log_hexdump(LOG_VERB, b->pos, mbuf_length(b), "parsed req %"PRIu64" res %d " - "type %d state %d rpos %d of %d", r->id, r->result, r->type, - r->state, r->pos - b->pos, b->last - b->pos); - return; + case SW_ALMOST_DONE: + switch (ch) { + case LF: + /* req_end <- p */ + goto done; + + default: + goto error; + } + + break; + + case SW_SENTINEL: + default: + NOT_REACHED(); + break; + } + } + + /* + * At this point, buffer from b->pos to b->last has been parsed completely + * but we haven't been able to reach to any conclusion. Normally, this + * means that we have to parse again starting from the state we are in + * after more data has been read. The newly read data is either read into + * a new mbuf, if existing mbuf is full (b->last == b->end) or into the + * existing mbuf. + * + * The only exception to this is when the existing mbuf is full (b->last + * is at b->end) and token marker is set, which means that we have to + * copy the partial token into a new mbuf and parse again with more data + * read into new mbuf. 
+ */ + ASSERT(p == b->last); + r->pos = p; + r->state = state; + + if (b->last == b->end && r->token != NULL) { + r->pos = r->token; + r->token = NULL; + r->result = MSG_PARSE_REPAIR; + } else { + r->result = MSG_PARSE_AGAIN; + } + + log_hexdump(LOG_VERB, b->pos, mbuf_length(b), + "parsed req %" PRIu64 + " res %d " + "type %d state %d rpos %d of %d", + r->id, r->result, r->type, r->state, r->pos - b->pos, + b->last - b->pos); + return; done: - ASSERT(r->type > MSG_UNKNOWN && r->type < MSG_SENTINEL); - r->pos = p + 1; - ASSERT(r->pos <= b->last); - r->state = SW_START; - r->result = MSG_PARSE_OK; - - log_hexdump(LOG_VERB, b->pos, mbuf_length(b), "parsed req %"PRIu64" res %d " - "type %d state %d rpos %d of %d", r->id, r->result, r->type, - r->state, r->pos - b->pos, b->last - b->pos); - return; + ASSERT(r->type > MSG_UNKNOWN && r->type < MSG_SENTINEL); + r->pos = p + 1; + ASSERT(r->pos <= b->last); + r->state = SW_START; + r->result = MSG_PARSE_OK; + + log_hexdump(LOG_VERB, b->pos, mbuf_length(b), + "parsed req %" PRIu64 + " res %d " + "type %d state %d rpos %d of %d", + r->id, r->result, r->type, r->state, r->pos - b->pos, + b->last - b->pos); + return; enomem: - r->result = MSG_PARSE_ERROR; - r->state = state; + r->result = MSG_PARSE_ERROR; + r->state = state; - log_hexdump(LOG_INFO, b->pos, mbuf_length(b), "out of memory on parse req %"PRIu64" " - "res %d type %d state %d", r->id, r->result, r->type, r->state); + log_hexdump(LOG_INFO, b->pos, mbuf_length(b), + "out of memory on parse req %" PRIu64 + " " + "res %d type %d state %d", + r->id, r->result, r->type, r->state); - return; + return; error: - r->result = MSG_PARSE_ERROR; - r->state = state; - errno = EINVAL; - - log_hexdump(LOG_INFO, b->pos, mbuf_length(b), "parsed bad req %"PRIu64" " - "res %d type %d state %d", r->id, r->result, r->type, - r->state); + r->result = MSG_PARSE_ERROR; + r->state = state; + errno = EINVAL; + + log_hexdump(LOG_INFO, b->pos, mbuf_length(b), + "parsed bad req %" PRIu64 + " " + 
"res %d type %d state %d", + r->id, r->result, r->type, r->state); } -void -memcache_parse_rsp(struct msg *r, const struct string *UNUSED) -{ - struct mbuf *b; - uint8_t *p, *m; - uint8_t ch; - enum { - SW_START, - SW_RSP_NUM, - SW_RSP_STR, - SW_SPACES_BEFORE_KEY, - SW_KEY, - SW_SPACES_BEFORE_FLAGS, /* 5 */ - SW_FLAGS, - SW_SPACES_BEFORE_VLEN, - SW_VLEN, - SW_RUNTO_VAL, - SW_VAL, /* 10 */ - SW_VAL_LF, - SW_END, - SW_RUNTO_CRLF, - SW_CRLF, - SW_ALMOST_DONE, /* 15 */ - SW_SENTINEL - } state; - - state = r->state; - b = STAILQ_LAST(&r->mhdr, mbuf, next); - - ASSERT(!r->is_request); - ASSERT(state >= SW_START && state < SW_SENTINEL); - ASSERT(b != NULL); - ASSERT(b->pos <= b->last); - - /* validate the parsing marker */ - ASSERT(r->pos != NULL); - ASSERT(r->pos >= b->pos && r->pos <= b->last); - - for (p = r->pos; p < b->last; p++) { - ch = *p; - - switch (state) { - case SW_START: - if (isdigit(ch)) { - state = SW_RSP_NUM; - } else { - state = SW_RSP_STR; - } - p = p - 1; /* go back by 1 byte */ +void memcache_parse_rsp(struct msg *r, const struct string *UNUSED) { + struct mbuf *b; + uint8_t *p, *m; + uint8_t ch; + enum { + SW_START, + SW_RSP_NUM, + SW_RSP_STR, + SW_SPACES_BEFORE_KEY, + SW_KEY, + SW_SPACES_BEFORE_FLAGS, /* 5 */ + SW_FLAGS, + SW_SPACES_BEFORE_VLEN, + SW_VLEN, + SW_RUNTO_VAL, + SW_VAL, /* 10 */ + SW_VAL_LF, + SW_END, + SW_RUNTO_CRLF, + SW_CRLF, + SW_ALMOST_DONE, /* 15 */ + SW_SENTINEL + } state; + + state = r->state; + b = STAILQ_LAST(&r->mhdr, mbuf, next); + + ASSERT(!r->is_request); + ASSERT(state >= SW_START && state < SW_SENTINEL); + ASSERT(b != NULL); + ASSERT(b->pos <= b->last); + + /* validate the parsing marker */ + ASSERT(r->pos != NULL); + ASSERT(r->pos >= b->pos && r->pos <= b->last); + + for (p = r->pos; p < b->last; p++) { + ch = *p; + + switch (state) { + case SW_START: + if (isdigit(ch)) { + state = SW_RSP_NUM; + } else { + state = SW_RSP_STR; + } + p = p - 1; /* go back by 1 byte */ - break; + break; - case SW_RSP_NUM: - if (r->token == 
NULL) { - /* rsp_start <- p; type_start <- p */ - r->token = p; - } + case SW_RSP_NUM: + if (r->token == NULL) { + /* rsp_start <- p; type_start <- p */ + r->token = p; + } - if (isdigit(ch)) { - /* num <- num * 10 + (ch - '0') */ - ; - } else if (ch == ' ' || ch == CR) { - /* type_end <- p - 1 */ - r->token = NULL; - r->type = MSG_RSP_MC_NUM; - p = p - 1; /* go back by 1 byte */ - state = SW_CRLF; - } else { - goto error; - } + if (isdigit(ch)) { + /* num <- num * 10 + (ch - '0') */ + ; + } else if (ch == ' ' || ch == CR) { + /* type_end <- p - 1 */ + r->token = NULL; + r->type = MSG_RSP_MC_NUM; + p = p - 1; /* go back by 1 byte */ + state = SW_CRLF; + } else { + goto error; + } - break; + break; - case SW_RSP_STR: - if (r->token == NULL) { - /* rsp_start <- p; type_start <- p */ - r->token = p; - } + case SW_RSP_STR: + if (r->token == NULL) { + /* rsp_start <- p; type_start <- p */ + r->token = p; + } - if (ch == ' ' || ch == CR) { - /* type_end <- p - 1 */ - m = r->token; - /* r->token = NULL; */ - r->type = MSG_UNKNOWN; - - switch (p - m) { - case 3: - if (str4cmp(m, 'E', 'N', 'D', '\r')) { - r->type = MSG_RSP_MC_END; - /* end_start <- m; end_end <- p - 1 */ - r->end = m; - break; - } - - break; - - case 5: - if (str5cmp(m, 'V', 'A', 'L', 'U', 'E')) { - /* - * Encompasses responses for 'get', 'gets' and - * 'cas' command. 
- */ - r->type = MSG_RSP_MC_VALUE; - break; - } - - if (str5cmp(m, 'E', 'R', 'R', 'O', 'R')) { - r->type = MSG_RSP_MC_ERROR; - break; - } - - break; - - case 6: - if (str6cmp(m, 'S', 'T', 'O', 'R', 'E', 'D')) { - r->type = MSG_RSP_MC_STORED; - break; - } - - if (str6cmp(m, 'E', 'X', 'I', 'S', 'T', 'S')) { - r->type = MSG_RSP_MC_EXISTS; - break; - } - - break; - - case 7: - if (str7cmp(m, 'D', 'E', 'L', 'E', 'T', 'E', 'D')) { - r->type = MSG_RSP_MC_DELETED; - break; - } - - if (str7cmp(m, 'T', 'O', 'U', 'C', 'H', 'E', 'D')) { - r->type = MSG_RSP_MC_TOUCHED; - break; - } - - break; - - case 9: - if (str9cmp(m, 'N', 'O', 'T', '_', 'F', 'O', 'U', 'N', 'D')) { - r->type = MSG_RSP_MC_NOT_FOUND; - break; - } - - break; - - case 10: - if (str10cmp(m, 'N', 'O', 'T', '_', 'S', 'T', 'O', 'R', 'E', 'D')) { - r->type = MSG_RSP_MC_NOT_STORED; - break; - } - - break; - - case 12: - if (str12cmp(m, 'C', 'L', 'I', 'E', 'N', 'T', '_', 'E', 'R', 'R', 'O', 'R')) { - r->type = MSG_RSP_MC_CLIENT_ERROR; - break; - } - - if (str12cmp(m, 'S', 'E', 'R', 'V', 'E', 'R', '_', 'E', 'R', 'R', 'O', 'R')) { - r->type = MSG_RSP_MC_SERVER_ERROR; - break; - } - - break; - } - - switch (r->type) { - case MSG_UNKNOWN: - goto error; - - case MSG_RSP_MC_STORED: - case MSG_RSP_MC_NOT_STORED: - case MSG_RSP_MC_EXISTS: - case MSG_RSP_MC_NOT_FOUND: - case MSG_RSP_MC_DELETED: - case MSG_RSP_MC_TOUCHED: - state = SW_CRLF; - break; - - case MSG_RSP_MC_END: - state = SW_CRLF; - break; - - case MSG_RSP_MC_VALUE: - state = SW_SPACES_BEFORE_KEY; - break; - - case MSG_RSP_MC_ERROR: - state = SW_CRLF; - break; - - case MSG_RSP_MC_CLIENT_ERROR: - case MSG_RSP_MC_SERVER_ERROR: - state = SW_RUNTO_CRLF; - break; - - default: - NOT_REACHED(); - } - - p = p - 1; /* go back by 1 byte */ - } + if (ch == ' ' || ch == CR) { + /* type_end <- p - 1 */ + m = r->token; + /* r->token = NULL; */ + r->type = MSG_UNKNOWN; + + switch (p - m) { + case 3: + if (str4cmp(m, 'E', 'N', 'D', '\r')) { + r->type = MSG_RSP_MC_END; + /* end_start 
<- m; end_end <- p - 1 */ + r->end = m; + break; + } - break; + break; - case SW_SPACES_BEFORE_KEY: - if (ch != ' ') { - state = SW_KEY; - p = p - 1; /* go back by 1 byte */ - } + case 5: + if (str5cmp(m, 'V', 'A', 'L', 'U', 'E')) { + /* + * Encompasses responses for 'get', 'gets' and + * 'cas' command. + */ + r->type = MSG_RSP_MC_VALUE; + break; + } - break; + if (str5cmp(m, 'E', 'R', 'R', 'O', 'R')) { + r->type = MSG_RSP_MC_ERROR; + break; + } - case SW_KEY: - if (ch == ' ') { - /* r->token = NULL; */ - state = SW_SPACES_BEFORE_FLAGS; - } + break; - break; + case 6: + if (str6cmp(m, 'S', 'T', 'O', 'R', 'E', 'D')) { + r->type = MSG_RSP_MC_STORED; + break; + } - case SW_SPACES_BEFORE_FLAGS: - if (ch != ' ') { - if (!isdigit(ch)) { - goto error; - } - state = SW_FLAGS; - p = p - 1; /* go back by 1 byte */ - } + if (str6cmp(m, 'E', 'X', 'I', 'S', 'T', 'S')) { + r->type = MSG_RSP_MC_EXISTS; + break; + } - break; + break; - case SW_FLAGS: - if (r->token == NULL) { - /* flags_start <- p */ - /* r->token = p; */ - } + case 7: + if (str7cmp(m, 'D', 'E', 'L', 'E', 'T', 'E', 'D')) { + r->type = MSG_RSP_MC_DELETED; + break; + } - if (isdigit(ch)) { - /* flags <- flags * 10 + (ch - '0') */ - ; - } else if (ch == ' ') { - /* flags_end <- p - 1 */ - /* r->token = NULL; */ - state = SW_SPACES_BEFORE_VLEN; - } else { - goto error; - } + if (str7cmp(m, 'T', 'O', 'U', 'C', 'H', 'E', 'D')) { + r->type = MSG_RSP_MC_TOUCHED; + break; + } - break; + break; - case SW_SPACES_BEFORE_VLEN: - if (ch != ' ') { - if (!isdigit(ch)) { - goto error; - } - p = p - 1; /* go back by 1 byte */ - state = SW_VLEN; - r->vlen = 0; - } + case 9: + if (str9cmp(m, 'N', 'O', 'T', '_', 'F', 'O', 'U', 'N', 'D')) { + r->type = MSG_RSP_MC_NOT_FOUND; + break; + } - break; + break; - case SW_VLEN: - if (isdigit(ch)) { - r->vlen = r->vlen * 10 + (uint32_t)(ch - '0'); - } else if (ch == ' ' || ch == CR) { - /* vlen_end <- p - 1 */ - p = p - 1; /* go back by 1 byte */ - /* r->token = NULL; */ - state = 
SW_RUNTO_CRLF; - } else { - goto error; - } + case 10: + if (str10cmp(m, 'N', 'O', 'T', '_', 'S', 'T', 'O', 'R', 'E', + 'D')) { + r->type = MSG_RSP_MC_NOT_STORED; + break; + } - break; + break; - case SW_RUNTO_VAL: - switch (ch) { - case LF: - /* val_start <- p + 1 */ - state = SW_VAL; - r->token = NULL; + case 12: + if (str12cmp(m, 'C', 'L', 'I', 'E', 'N', 'T', '_', 'E', 'R', 'R', + 'O', 'R')) { + r->type = MSG_RSP_MC_CLIENT_ERROR; break; + } - default: - goto error; - } + if (str12cmp(m, 'S', 'E', 'R', 'V', 'E', 'R', '_', 'E', 'R', 'R', + 'O', 'R')) { + r->type = MSG_RSP_MC_SERVER_ERROR; + break; + } - break; + break; + } - case SW_VAL: - m = p + r->vlen; - if (m >= b->last) { - ASSERT(r->vlen >= (uint32_t)(b->last - p)); - r->vlen -= (uint32_t)(b->last - p); - m = b->last - 1; - p = m; /* move forward by vlen bytes */ - break; - } - switch (*m) { - case CR: - /* val_end <- p - 1 */ - p = m; /* move forward by vlen bytes */ - state = SW_VAL_LF; - break; + switch (r->type) { + case MSG_UNKNOWN: + goto error; - default: - goto error; - } + case MSG_RSP_MC_STORED: + case MSG_RSP_MC_NOT_STORED: + case MSG_RSP_MC_EXISTS: + case MSG_RSP_MC_NOT_FOUND: + case MSG_RSP_MC_DELETED: + case MSG_RSP_MC_TOUCHED: + state = SW_CRLF; + break; - break; + case MSG_RSP_MC_END: + state = SW_CRLF; + break; - case SW_VAL_LF: - switch (ch) { - case LF: - /* state = SW_END; */ - state = SW_RSP_STR; - break; + case MSG_RSP_MC_VALUE: + state = SW_SPACES_BEFORE_KEY; + break; + + case MSG_RSP_MC_ERROR: + state = SW_CRLF; + break; + + case MSG_RSP_MC_CLIENT_ERROR: + case MSG_RSP_MC_SERVER_ERROR: + state = SW_RUNTO_CRLF; + break; default: - goto error; - } + NOT_REACHED(); + } - break; + p = p - 1; /* go back by 1 byte */ + } - case SW_END: - if (r->token == NULL) { - if (ch != 'E') { - goto error; - } - /* end_start <- p */ - r->token = p; - } else if (ch == CR) { - /* end_end <- p */ - m = r->token; - r->token = NULL; - - switch (p - m) { - case 3: - if (str4cmp(m, 'E', 'N', 'D', '\r')) { - 
r->end = m; - state = SW_ALMOST_DONE; - } - break; - - default: - goto error; - } - } + break; + + case SW_SPACES_BEFORE_KEY: + if (ch != ' ') { + state = SW_KEY; + p = p - 1; /* go back by 1 byte */ + } + + break; + + case SW_KEY: + if (ch == ' ') { + /* r->token = NULL; */ + state = SW_SPACES_BEFORE_FLAGS; + } + + break; + + case SW_SPACES_BEFORE_FLAGS: + if (ch != ' ') { + if (!isdigit(ch)) { + goto error; + } + state = SW_FLAGS; + p = p - 1; /* go back by 1 byte */ + } + + break; + + case SW_FLAGS: + if (r->token == NULL) { + /* flags_start <- p */ + /* r->token = p; */ + } + + if (isdigit(ch)) { + /* flags <- flags * 10 + (ch - '0') */ + ; + } else if (ch == ' ') { + /* flags_end <- p - 1 */ + /* r->token = NULL; */ + state = SW_SPACES_BEFORE_VLEN; + } else { + goto error; + } + + break; + + case SW_SPACES_BEFORE_VLEN: + if (ch != ' ') { + if (!isdigit(ch)) { + goto error; + } + p = p - 1; /* go back by 1 byte */ + state = SW_VLEN; + r->vlen = 0; + } + + break; + + case SW_VLEN: + if (isdigit(ch)) { + r->vlen = r->vlen * 10 + (uint32_t)(ch - '0'); + } else if (ch == ' ' || ch == CR) { + /* vlen_end <- p - 1 */ + p = p - 1; /* go back by 1 byte */ + /* r->token = NULL; */ + state = SW_RUNTO_CRLF; + } else { + goto error; + } + break; + + case SW_RUNTO_VAL: + switch (ch) { + case LF: + /* val_start <- p + 1 */ + state = SW_VAL; + r->token = NULL; break; - case SW_RUNTO_CRLF: - switch (ch) { - case CR: - if (r->type == MSG_RSP_MC_VALUE) { - state = SW_RUNTO_VAL; - } else { - state = SW_ALMOST_DONE; - } + default: + goto error; + } - break; + break; - default: - break; - } + case SW_VAL: + m = p + r->vlen; + if (m >= b->last) { + ASSERT(r->vlen >= (uint32_t)(b->last - p)); + r->vlen -= (uint32_t)(b->last - p); + m = b->last - 1; + p = m; /* move forward by vlen bytes */ + break; + } + switch (*m) { + case CR: + /* val_end <- p - 1 */ + p = m; /* move forward by vlen bytes */ + state = SW_VAL_LF; + break; + + default: + goto error; + } + break; + + case SW_VAL_LF: 
+ switch (ch) { + case LF: + /* state = SW_END; */ + state = SW_RSP_STR; break; - case SW_CRLF: - switch (ch) { - case ' ': - break; + default: + goto error; + } + + break; - case CR: + case SW_END: + if (r->token == NULL) { + if (ch != 'E') { + goto error; + } + /* end_start <- p */ + r->token = p; + } else if (ch == CR) { + /* end_end <- p */ + m = r->token; + r->token = NULL; + + switch (p - m) { + case 3: + if (str4cmp(m, 'E', 'N', 'D', '\r')) { + r->end = m; state = SW_ALMOST_DONE; - break; + } + break; default: - goto error; + goto error; + } + } + + break; + + case SW_RUNTO_CRLF: + switch (ch) { + case CR: + if (r->type == MSG_RSP_MC_VALUE) { + state = SW_RUNTO_VAL; + } else { + state = SW_ALMOST_DONE; } break; - case SW_ALMOST_DONE: - switch (ch) { - case LF: - /* rsp_end <- p */ - goto done; + default: + break; + } - default: - goto error; - } + break; + case SW_CRLF: + switch (ch) { + case ' ': break; - case SW_SENTINEL: - default: - NOT_REACHED(); + case CR: + state = SW_ALMOST_DONE; break; + default: + goto error; } - } - ASSERT(p == b->last); - r->pos = p; - r->state = state; + break; + + case SW_ALMOST_DONE: + switch (ch) { + case LF: + /* rsp_end <- p */ + goto done; - if (b->last == b->end && r->token != NULL) { - if (state <= SW_RUNTO_VAL || state == SW_CRLF || state == SW_ALMOST_DONE) { - r->state = SW_START; + default: + goto error; } - r->pos = r->token; - r->token = NULL; - r->result = MSG_PARSE_REPAIR; - } else { - r->result = MSG_PARSE_AGAIN; + + break; + + case SW_SENTINEL: + default: + NOT_REACHED(); + break; } + } - log_hexdump(LOG_VERB, b->pos, mbuf_length(b), "parsed rsp %"PRIu64" res %d " - "type %d state %d rpos %d of %d", r->id, r->result, r->type, - r->state, r->pos - b->pos, b->last - b->pos); - return; + ASSERT(p == b->last); + r->pos = p; + r->state = state; -done: - ASSERT(r->type > MSG_UNKNOWN && r->type < MSG_SENTINEL); - r->pos = p + 1; - ASSERT(r->pos <= b->last); - r->state = SW_START; + if (b->last == b->end && r->token != 
NULL) { + if (state <= SW_RUNTO_VAL || state == SW_CRLF || state == SW_ALMOST_DONE) { + r->state = SW_START; + } + r->pos = r->token; r->token = NULL; - r->result = MSG_PARSE_OK; + r->result = MSG_PARSE_REPAIR; + } else { + r->result = MSG_PARSE_AGAIN; + } + + log_hexdump(LOG_VERB, b->pos, mbuf_length(b), + "parsed rsp %" PRIu64 + " res %d " + "type %d state %d rpos %d of %d", + r->id, r->result, r->type, r->state, r->pos - b->pos, + b->last - b->pos); + return; - log_hexdump(LOG_VERB, b->pos, mbuf_length(b), "parsed rsp %"PRIu64" res %d " - "type %d state %d rpos %d of %d", r->id, r->result, r->type, - r->state, r->pos - b->pos, b->last - b->pos); - return; +done: + ASSERT(r->type > MSG_UNKNOWN && r->type < MSG_SENTINEL); + r->pos = p + 1; + ASSERT(r->pos <= b->last); + r->state = SW_START; + r->token = NULL; + r->result = MSG_PARSE_OK; + + log_hexdump(LOG_VERB, b->pos, mbuf_length(b), + "parsed rsp %" PRIu64 + " res %d " + "type %d state %d rpos %d of %d", + r->id, r->result, r->type, r->state, r->pos - b->pos, + b->last - b->pos); + return; error: - r->result = MSG_PARSE_ERROR; - r->state = state; - errno = EINVAL; - - log_hexdump(LOG_INFO, b->pos, mbuf_length(b), "parsed bad rsp %"PRIu64" " - "res %d type %d state %d", r->id, r->result, r->type, - r->state); + r->result = MSG_PARSE_ERROR; + r->state = state; + errno = EINVAL; + + log_hexdump(LOG_INFO, b->pos, mbuf_length(b), + "parsed bad rsp %" PRIu64 + " " + "res %d type %d state %d", + r->id, r->result, r->type, r->state); } -bool -memcache_failure(struct msg *r) -{ - return false; -} +bool memcache_failure(struct msg *r) { return false; } -static rstatus_t -memcache_append_key(struct msg *r, uint8_t *key, uint32_t keylen) -{ - struct mbuf *mbuf; - struct keypos *kpos; +static rstatus_t memcache_append_key(struct msg *r, uint8_t *key, + uint32_t keylen) { + struct mbuf *mbuf; + struct keypos *kpos; - mbuf = msg_ensure_mbuf(r, keylen + 2); - if (mbuf == NULL) { - return DN_ENOMEM; - } + mbuf = 
msg_ensure_mbuf(r, keylen + 2); + if (mbuf == NULL) { + return DN_ENOMEM; + } - kpos = array_push(r->keys); - if (kpos == NULL) { - return DN_ENOMEM; - } + kpos = array_push(r->keys); + if (kpos == NULL) { + return DN_ENOMEM; + } - kpos->start = mbuf->last; - kpos->end = mbuf->last + keylen; - mbuf_copy(mbuf, key, keylen); - r->mlen += keylen; + kpos->start = mbuf->last; + kpos->end = mbuf->last + keylen; + mbuf_copy(mbuf, key, keylen); + r->mlen += keylen; - mbuf_copy(mbuf, (uint8_t *)" ", 1); - r->mlen += 1; - return DN_OK; + mbuf_copy(mbuf, (uint8_t *)" ", 1); + r->mlen += 1; + return DN_OK; } /* * read the comment in proto/nc_redis.c */ -static rstatus_t -memcache_fragment_retrieval(struct msg *r, struct server_pool *pool, struct rack *rack, - struct msg_tqh *frag_msgq, uint32_t key_step) -{ - struct mbuf *mbuf; - struct msg **sub_msgs; - uint32_t i; - rstatus_t status; - - sub_msgs = dn_zalloc(rack->ncontinuum * sizeof(*sub_msgs)); - if (sub_msgs == NULL) { - return DN_ENOMEM; - } +static rstatus_t memcache_fragment_retrieval(struct msg *r, + struct server_pool *pool, + struct rack *rack, + struct msg_tqh *frag_msgq, + uint32_t key_step) { + struct mbuf *mbuf; + struct msg **sub_msgs; + uint32_t i; + rstatus_t status; + + sub_msgs = dn_zalloc(rack->ncontinuum * sizeof(*sub_msgs)); + if (sub_msgs == NULL) { + return DN_ENOMEM; + } + + ASSERT(r->frag_seq == NULL); + r->frag_seq = dn_alloc(array_n(r->keys) * sizeof(*r->frag_seq)); + if (r->frag_seq == NULL) { + dn_free(sub_msgs); + return DN_ENOMEM; + } + + mbuf = STAILQ_FIRST(&r->mhdr); + mbuf->pos = mbuf->start; + + /* + * This code is based on the assumption that 'gets ' is located + * in a contiguous location. 
+ * This is always true because we have capped our MBUF_MIN_SIZE at 512 and + * whenever we have multiple messages, we copy the tail message into a new + * mbuf + */ + for (; *(mbuf->pos) != ' ';) { /* eat get/gets */ + mbuf->pos++; + } + mbuf->pos++; + + r->frag_id = msg_gen_frag_id(); + r->nfrag = 0; + r->frag_owner = r; - ASSERT(r->frag_seq == NULL); - r->frag_seq = dn_alloc(array_n(r->keys) * sizeof(*r->frag_seq)); - if (r->frag_seq == NULL) { + for (i = 0; i < array_n(r->keys); i++) { /* for each key */ + struct msg *sub_msg; + struct keypos *kpos = array_get(r->keys, i); + uint32_t idx = dnode_peer_idx_for_key_on_rack(pool, rack, kpos->start, + kpos->end - kpos->start); + + if (sub_msgs[idx] == NULL) { + sub_msgs[idx] = msg_get(r->owner, r->is_request, __FUNCTION__); + if (sub_msgs[idx] == NULL) { dn_free(sub_msgs); return DN_ENOMEM; + } } + r->frag_seq[i] = sub_msg = sub_msgs[idx]; - mbuf = STAILQ_FIRST(&r->mhdr); - mbuf->pos = mbuf->start; - - /* - * This code is based on the assumption that 'gets ' is located - * in a contiguous location. 
- * This is always true because we have capped our MBUF_MIN_SIZE at 512 and - * whenever we have multiple messages, we copy the tail message into a new mbuf - */ - for (; *(mbuf->pos) != ' ';) { /* eat get/gets */ - mbuf->pos++; + sub_msg->narg++; + status = memcache_append_key(sub_msg, kpos->start, kpos->end - kpos->start); + if (status != DN_OK) { + dn_free(sub_msgs); + return status; } - mbuf->pos++; - - r->frag_id = msg_gen_frag_id(); - r->nfrag = 0; - r->frag_owner = r; - - for (i = 0; i < array_n(r->keys); i++) { /* for each key */ - struct msg *sub_msg; - struct keypos *kpos = array_get(r->keys, i); - uint32_t idx = dnode_peer_idx_for_key_on_rack(pool, rack, kpos->start, - kpos->end - kpos->start); - - if (sub_msgs[idx] == NULL) { - sub_msgs[idx] = msg_get(r->owner, r->is_request, __FUNCTION__); - if (sub_msgs[idx] == NULL) { - dn_free(sub_msgs); - return DN_ENOMEM; - } - } - r->frag_seq[i] = sub_msg = sub_msgs[idx]; + } - sub_msg->narg++; - status = memcache_append_key(sub_msg, kpos->start, kpos->end - kpos->start); - if (status != DN_OK) { - dn_free(sub_msgs); - return status; - } + for (i = 0; i < rack->ncontinuum; + i++) { /* prepend mget header, and forward it */ + struct msg *sub_msg = sub_msgs[i]; + if (sub_msg == NULL) { + continue; } - for (i = 0; i < rack->ncontinuum; i++) { /* prepend mget header, and forward it */ - struct msg *sub_msg = sub_msgs[i]; - if (sub_msg == NULL) { - continue; - } - - /* prepend get/gets */ - if (r->type == MSG_REQ_MC_GET) { - status = msg_prepend(sub_msg, (uint8_t *)"get ", 4); - } else if (r->type == MSG_REQ_MC_GETS) { - status = msg_prepend(sub_msg, (uint8_t *)"gets ", 5); - } - if (status != DN_OK) { - dn_free(sub_msgs); - return status; - } + /* prepend get/gets */ + if (r->type == MSG_REQ_MC_GET) { + status = msg_prepend(sub_msg, (uint8_t *)"get ", 4); + } else if (r->type == MSG_REQ_MC_GETS) { + status = msg_prepend(sub_msg, (uint8_t *)"gets ", 5); + } + if (status != DN_OK) { + dn_free(sub_msgs); + return 
status; + } - /* append \r\n */ - status = msg_append(sub_msg, (uint8_t *)CRLF, CRLF_LEN); - if (status != DN_OK) { - dn_free(sub_msgs); - return status; - } + /* append \r\n */ + status = msg_append(sub_msg, (uint8_t *)CRLF, CRLF_LEN); + if (status != DN_OK) { + dn_free(sub_msgs); + return status; + } - sub_msg->type = r->type; - sub_msg->frag_id = r->frag_id; - sub_msg->frag_owner = r->frag_owner; + sub_msg->type = r->type; + sub_msg->frag_id = r->frag_id; + sub_msg->frag_owner = r->frag_owner; - TAILQ_INSERT_TAIL(frag_msgq, sub_msg, m_tqe); - r->nfrag++; - } + TAILQ_INSERT_TAIL(frag_msgq, sub_msg, m_tqe); + r->nfrag++; + } - dn_free(sub_msgs); - return DN_OK; + dn_free(sub_msgs); + return DN_OK; } -rstatus_t -memcache_fragment(struct msg *r, struct server_pool *pool, struct rack *rack, struct msg_tqh *frag_msgq) -{ - if (memcache_retrieval(r)) { - return memcache_fragment_retrieval(r, pool, rack, frag_msgq, 1); - } - return DN_OK; +rstatus_t memcache_fragment(struct msg *r, struct server_pool *pool, + struct rack *rack, struct msg_tqh *frag_msgq) { + if (memcache_retrieval(r)) { + return memcache_fragment_retrieval(r, pool, rack, frag_msgq, 1); + } + return DN_OK; } -rstatus_t -memcache_verify_request(struct msg *r, struct server_pool *pool, struct rack *rack) -{ - return DN_OK; +rstatus_t memcache_verify_request(struct msg *r, struct server_pool *pool, + struct rack *rack) { + return DN_OK; } /* * Pre-coalesce handler is invoked when the message is a response to * the fragmented multi vector request - 'get' or 'gets' and all the * responses to the fragmented request vector hasn't been received */ -void -memcache_pre_coalesce(struct msg *r) -{ - struct msg *pr = r->peer; /* peer request */ - struct mbuf *mbuf; - - ASSERT(!r->is_request); - ASSERT(pr->is_request); - - if (pr->frag_id == 0) { - /* do nothing, if not a response to a fragmented request */ - return; - } +void memcache_pre_coalesce(struct msg *r) { + struct msg *pr = r->peer; /* peer request */ + 
struct mbuf *mbuf; + + ASSERT(!r->is_request); + ASSERT(pr->is_request); - pr->frag_owner->nfrag_done++; - switch (r->type) { + if (pr->frag_id == 0) { + /* do nothing, if not a response to a fragmented request */ + return; + } + pr->frag_owner->nfrag_done++; + switch (r->type) { case MSG_RSP_MC_VALUE: case MSG_RSP_MC_END: - /* - * Readjust responses of the fragmented message vector by not - * including the end marker for all - */ - - ASSERT(r->end != NULL); + /* + * Readjust responses of the fragmented message vector by not + * including the end marker for all + */ - for (;;) { - mbuf = STAILQ_LAST(&r->mhdr, mbuf, next); - ASSERT(mbuf != NULL); + ASSERT(r->end != NULL); - /* - * We cannot assert that end marker points to the last mbuf - * Consider a scenario where end marker points to the - * penultimate mbuf and the last mbuf only contains spaces - * and CRLF: mhdr -> [...END] -> [\r\n] - */ + for (;;) { + mbuf = STAILQ_LAST(&r->mhdr, mbuf, next); + ASSERT(mbuf != NULL); - if (r->end >= mbuf->pos && r->end < mbuf->last) { - /* end marker is within this mbuf */ - r->mlen -= (uint32_t)(mbuf->last - r->end); - mbuf->last = r->end; - break; - } + /* + * We cannot assert that end marker points to the last mbuf + * Consider a scenario where end marker points to the + * penultimate mbuf and the last mbuf only contains spaces + * and CRLF: mhdr -> [...END] -> [\r\n] + */ - /* end marker is not in this mbuf */ - r->mlen -= mbuf_length(mbuf); - mbuf_remove(&r->mhdr, mbuf); - mbuf_put(mbuf); + if (r->end >= mbuf->pos && r->end < mbuf->last) { + /* end marker is within this mbuf */ + r->mlen -= (uint32_t)(mbuf->last - r->end); + mbuf->last = r->end; + break; } - break; + /* end marker is not in this mbuf */ + r->mlen -= mbuf_length(mbuf); + mbuf_remove(&r->mhdr, mbuf); + mbuf_put(mbuf); + } + + break; default: - /* - * Valid responses for a fragmented requests are MSG_RSP_MC_VALUE or, - * MSG_RSP_MC_END. 
For an invalid response, we send out SERVER_ERRROR - * with EINVAL errno - */ - mbuf = STAILQ_FIRST(&r->mhdr); - log_hexdump(LOG_ERR, mbuf->pos, mbuf_length(mbuf), "rsp fragment " - "with unknown type %d", r->type); - pr->is_error = 1; - pr->error_code = EINVAL; - break; - } + /* + * Valid responses for a fragmented requests are MSG_RSP_MC_VALUE or, + * MSG_RSP_MC_END. For an invalid response, we send out SERVER_ERRROR + * with EINVAL errno + */ + mbuf = STAILQ_FIRST(&r->mhdr); + log_hexdump(LOG_ERR, mbuf->pos, mbuf_length(mbuf), + "rsp fragment " + "with unknown type %d", + r->type); + pr->is_error = 1; + pr->error_code = EINVAL; + break; + } } /* * Copy one response from src to dst and return bytes copied */ -static rstatus_t -memcache_copy_bulk(struct msg *dst, struct msg *src) -{ - struct mbuf *mbuf, *nbuf; - uint8_t *p; - uint32_t len = 0; - uint32_t bytes = 0; - uint32_t i = 0; - - for (mbuf = STAILQ_FIRST(&src->mhdr); - mbuf && mbuf_empty(mbuf); - mbuf = STAILQ_FIRST(&src->mhdr)) { - - mbuf_remove(&src->mhdr, mbuf); - mbuf_put(mbuf); +static rstatus_t memcache_copy_bulk(struct msg *dst, struct msg *src) { + struct mbuf *mbuf, *nbuf; + uint8_t *p; + uint32_t len = 0; + uint32_t bytes = 0; + uint32_t i = 0; + + for (mbuf = STAILQ_FIRST(&src->mhdr); mbuf && mbuf_empty(mbuf); + mbuf = STAILQ_FIRST(&src->mhdr)) { + mbuf_remove(&src->mhdr, mbuf); + mbuf_put(mbuf); + } + + mbuf = STAILQ_FIRST(&src->mhdr); + if (mbuf == NULL) { + return DN_OK; /* key not exists */ + } + p = mbuf->pos; + + /* + * get : VALUE key 0 len\r\nval\r\n + * gets: VALUE key 0 len cas\r\nval\r\n + */ + ASSERT(*p == 'V'); + for (i = 0; i < 3; i++) { /* eat 'VALUE key 0 ' */ + for (; *p != ' ';) { + p++; } - - mbuf = STAILQ_FIRST(&src->mhdr); - if (mbuf == NULL) { - return DN_OK; /* key not exists */ - } - p = mbuf->pos; - - /* - * get : VALUE key 0 len\r\nval\r\n - * gets: VALUE key 0 len cas\r\nval\r\n - */ - ASSERT(*p == 'V'); - for (i = 0; i < 3; i++) { /* eat 'VALUE key 0 ' */ - for (; *p 
!= ' ';) { - p++; - } - p++; - } - - len = 0; - for (; p < mbuf->last && isdigit(*p); p++) { - len = len * 10 + (uint32_t)(*p - '0'); - } - - for (; p < mbuf->last && ('\r' != *p); p++) { /* eat cas for gets */ - ; - } - - len += CRLF_LEN * 2; - len += (p - mbuf->pos); - - bytes = len; - - /* copy len bytes to dst */ - for (; mbuf;) { - if (mbuf_length(mbuf) <= len) { /* steal this mbuf from src to dst */ - nbuf = STAILQ_NEXT(mbuf, next); - mbuf_remove(&src->mhdr, mbuf); - mbuf_insert(&dst->mhdr, mbuf); - len -= mbuf_length(mbuf); - mbuf = nbuf; - } else { /* split it */ - nbuf = mbuf_get(); - if (nbuf == NULL) { - return DN_ENOMEM; - } - mbuf_copy(nbuf, mbuf->pos, len); - mbuf_insert(&dst->mhdr, nbuf); - mbuf->pos += len; - break; - } + p++; + } + + len = 0; + for (; p < mbuf->last && isdigit(*p); p++) { + len = len * 10 + (uint32_t)(*p - '0'); + } + + for (; p < mbuf->last && ('\r' != *p); p++) { /* eat cas for gets */ + ; + } + + len += CRLF_LEN * 2; + len += (p - mbuf->pos); + + bytes = len; + + /* copy len bytes to dst */ + for (; mbuf;) { + if (mbuf_length(mbuf) <= len) { /* steal this mbuf from src to dst */ + nbuf = STAILQ_NEXT(mbuf, next); + mbuf_remove(&src->mhdr, mbuf); + mbuf_insert(&dst->mhdr, mbuf); + len -= mbuf_length(mbuf); + mbuf = nbuf; + } else { /* split it */ + nbuf = mbuf_get(); + if (nbuf == NULL) { + return DN_ENOMEM; + } + mbuf_copy(nbuf, mbuf->pos, len); + mbuf_insert(&dst->mhdr, nbuf); + mbuf->pos += len; + break; } + } - dst->mlen += bytes; - src->mlen -= bytes; - log_debug(LOG_VVERB, "memcache_copy_bulk copy bytes: %d", bytes); - return DN_OK; + dst->mlen += bytes; + src->mlen -= bytes; + log_debug(LOG_VVERB, "memcache_copy_bulk copy bytes: %d", bytes); + return DN_OK; } /* @@ -1535,100 +1534,85 @@ memcache_copy_bulk(struct msg *dst, struct msg *src) * responses to the fragmented request vector has been received and * the fragmented request is consider to be done */ -void -memcache_post_coalesce(struct msg *request) -{ - struct msg 
*response = request->peer; - struct msg *sub_msg; - uint32_t i; - rstatus_t status; - - ASSERT(!response->is_request); - ASSERT(request->is_request && (request->frag_owner == request)); - if (request->is_error || request->is_ferror) { - response->owner->err = 1; - return; - } +void memcache_post_coalesce(struct msg *request) { + struct msg *response = request->peer; + struct msg *sub_msg; + uint32_t i; + rstatus_t status; + + ASSERT(!response->is_request); + ASSERT(request->is_request && (request->frag_owner == request)); + if (request->is_error || request->is_ferror) { + response->owner->err = 1; + return; + } - for (i = 0; i < array_n(request->keys); i++) { /* for each key */ - sub_msg = request->frag_seq[i]->peer; /* get it's peer response */ - if (sub_msg == NULL) { - response->owner->err = 1; - return; - } - status = memcache_copy_bulk(response, sub_msg); - if (status != DN_OK) { - response->owner->err = 1; - return; - } + for (i = 0; i < array_n(request->keys); i++) { /* for each key */ + sub_msg = request->frag_seq[i]->peer; /* get it's peer response */ + if (sub_msg == NULL) { + response->owner->err = 1; + return; } - - /* append END\r\n */ - status = msg_append(response, (uint8_t *)"END\r\n", 5); + status = memcache_copy_bulk(response, sub_msg); if (status != DN_OK) { - response->owner->err = 1; - return; + response->owner->err = 1; + return; } -} + } -void -memcache_post_connect(struct context *ctx, struct conn *conn, struct server *server) -{ + /* append END\r\n */ + status = msg_append(response, (uint8_t *)"END\r\n", 5); + if (status != DN_OK) { + response->owner->err = 1; + return; + } } -void -memcache_swallow_msg(struct conn *conn, struct msg *pmsg, struct msg *msg) -{ -} +void memcache_post_connect(struct context *ctx, struct conn *conn, + struct server *server) {} -rstatus_t -memcache_add_auth(struct context *ctx, struct conn *c_conn, struct conn *s_conn) -{ - NOT_REACHED(); - return DN_OK; -} +void memcache_swallow_msg(struct conn *conn, struct 
msg *pmsg, + struct msg *msg) {} -rstatus_t -memcache_reply(struct msg *r) -{ - NOT_REACHED(); - return DN_OK; +rstatus_t memcache_add_auth(struct context *ctx, struct conn *c_conn, + struct conn *s_conn) { + NOT_REACHED(); + return DN_OK; } -bool -memcache_is_multikey_request(struct msg *r) -{ - return false; +rstatus_t memcache_reply(struct msg *r) { + NOT_REACHED(); + return DN_OK; } -struct msg * -memcache_reconcile_responses(struct response_mgr *rspmgr) -{ - if (rspmgr->msg->consistency == DC_QUORUM) { - log_info("none of the responses match, returning first"); - return rspmgr->responses[0]; - } else { - log_info("none of the responses match, returning error"); - struct msg *rsp = msg_get_error(NULL, DYNOMITE_NO_QUORUM_ACHIEVED, 0); - // There is a case that when 1 out of three nodes are down, the - // response manager has 1 error response and 2 good responses. - // We reach here when the two responses differ and we want to return - // failed to achieve quorum. In this case, free the existing error - // response - if (rspmgr->err_rsp) { - rsp_put(rspmgr->err_rsp); - } - rspmgr->err_rsp = rsp; - rspmgr->error_responses++; - return rsp; +bool memcache_is_multikey_request(struct msg *r) { return false; } + +struct msg *memcache_reconcile_responses(struct response_mgr *rspmgr) { + if (rspmgr->msg->consistency == DC_QUORUM) { + log_info("none of the responses match, returning first"); + return rspmgr->responses[0]; + } else { + log_info("none of the responses match, returning error"); + struct msg *rsp = msg_get_error(NULL, DYNOMITE_NO_QUORUM_ACHIEVED, 0); + // There is a case that when 1 out of three nodes are down, the + // response manager has 1 error response and 2 good responses. + // We reach here when the two responses differ and we want to return + // failed to achieve quorum. 
In this case, free the existing error + // response + if (rspmgr->err_rsp) { + rsp_put(rspmgr->err_rsp); } + rspmgr->err_rsp = rsp; + rspmgr->error_responses++; + return rsp; + } } /* * Placeholder function for memcache query rewrites. * No rewrites implemented toady. */ -rstatus_t memcache_rewrite_query(struct msg* orig_msg, struct context* ctx, bool* did_rewrite, - struct msg** new_msg_ptr) { - return DN_OK; +rstatus_t memcache_rewrite_query(struct msg *orig_msg, struct context *ctx, + bool *did_rewrite, struct msg **new_msg_ptr) { + return DN_OK; } diff --git a/src/proto/dyn_proto.h b/src/proto/dyn_proto.h index 7e983b662..77040d19a 100644 --- a/src/proto/dyn_proto.h +++ b/src/proto/dyn_proto.h @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -20,14 +20,21 @@ * limitations under the License. 
*/ -#include - - #ifndef _DN_PROTO_H_ #define _DN_PROTO_H_ +#include +#include "../dyn_types.h" +// Forward declarations +struct context; +struct msg; +struct msg_tqh; +struct rack; +struct response_mgr; +struct server_pool; +struct string; void memcache_parse_req(struct msg *r, const struct string *hash_tag); void memcache_parse_rsp(struct msg *r, const struct string *UNUSED); @@ -35,12 +42,12 @@ void memcache_pre_coalesce(struct msg *r); void memcache_post_coalesce(struct msg *r); bool memcache_is_multikey_request(struct msg *r); struct msg *memcache_reconcile_responses(struct response_mgr *rspmgr); -rstatus_t memcache_fragment(struct msg *r, struct server_pool *pool, struct rack *rack, - struct msg_tqh *frag_msgq); +rstatus_t memcache_fragment(struct msg *r, struct server_pool *pool, + struct rack *rack, struct msg_tqh *frag_msgq); rstatus_t memcache_verify_request(struct msg *r, struct server_pool *pool, struct rack *rack); -rstatus_t memcache_rewrite_query(struct msg* orig_msg, struct context* ctx, bool* did_rewrite, - struct msg** new_msg_ptr); +rstatus_t memcache_rewrite_query(struct msg *orig_msg, struct context *ctx, + bool *did_rewrite, struct msg **new_msg_ptr); void redis_parse_req(struct msg *r, const struct string *hash_tag); void redis_parse_rsp(struct msg *r, const struct string *UNUSED); @@ -48,11 +55,11 @@ void redis_pre_coalesce(struct msg *r); void redis_post_coalesce(struct msg *r); bool redis_is_multikey_request(struct msg *r); struct msg *redis_reconcile_responses(struct response_mgr *rspmgr); -rstatus_t redis_fragment(struct msg *r, struct server_pool *pool, struct rack *rack, - struct msg_tqh *frag_msgq); +rstatus_t redis_fragment(struct msg *r, struct server_pool *pool, + struct rack *rack, struct msg_tqh *frag_msgq); rstatus_t redis_verify_request(struct msg *r, struct server_pool *pool, struct rack *rack); -rstatus_t redis_rewrite_query(struct msg* orig_msg, struct context* ctx, bool* did_rewrite, - struct msg** new_msg_ptr); +rstatus_t 
redis_rewrite_query(struct msg *orig_msg, struct context *ctx, + bool *did_rewrite, struct msg **new_msg_ptr); #endif diff --git a/src/proto/dyn_redis.c b/src/proto/dyn_redis.c index c8d9f3b14..074a87073 100644 --- a/src/proto/dyn_redis.c +++ b/src/proto/dyn_redis.c @@ -1,7 +1,7 @@ /* - * Dynomite - A thin, distributed replication layer for multi non-distributed storages. - * Copyright (C) 2014 Netflix, Inc. - */ + * Dynomite - A thin, distributed replication layer for multi non-distributed + * storages. Copyright (C) 2014 Netflix, Inc. + */ /* * twemproxy - A fast and lightweight proxy for memcached protocol. @@ -20,49 +20,45 @@ * limitations under the License. */ -#include #include +#include #include "../dyn_core.h" #include "../dyn_dnode_peer.h" #include "dyn_proto.h" -#define RSP_STRING(ACTION) \ - ACTION( ok, "+OK\r\n" ) +#define RSP_STRING(ACTION) ACTION(ok, "+OK\r\n") - /*ACTION( pong, "+PONG\r\n" ) \ */ +/*ACTION( pong, "+PONG\r\n" ) \ */ -#define DEFINE_ACTION(_var, _str) static struct string rsp_##_var = string(_str); - RSP_STRING( DEFINE_ACTION ) +#define DEFINE_ACTION(_var, _str) \ + static struct string rsp_##_var = string(_str); +RSP_STRING(DEFINE_ACTION) #undef DEFINE_ACTION /* * Return true, if the redis command take no key, otherwise * return false */ -static bool -redis_argz(struct msg *r) -{ - switch (r->type) { +static bool redis_argz(struct msg *r) { + switch (r->type) { case MSG_REQ_REDIS_PING: case MSG_REQ_REDIS_QUIT: - return true; + return true; default: - break; - } + break; + } - return false; + return false; } /* * Return true, if the redis command accepts no arguments, otherwise * return false */ -static bool -redis_arg0(struct msg *r) -{ - switch (r->type) { +static bool redis_arg0(struct msg *r) { + switch (r->type) { case MSG_REQ_REDIS_PERSIST: case MSG_REQ_REDIS_PTTL: case MSG_REQ_REDIS_TTL: @@ -91,23 +87,21 @@ redis_arg0(struct msg *r) case MSG_REQ_REDIS_KEYS: case MSG_REQ_REDIS_PFCOUNT: - return true; + return true; default: - 
break; - } + break; + } - return false; + return false; } /* * Return true, if the redis command accepts exactly 1 argument, otherwise * return false */ -static bool -redis_arg1(struct msg *r) -{ - switch (r->type) { +static bool redis_arg1(struct msg *r) { + switch (r->type) { case MSG_REQ_REDIS_EXPIRE: case MSG_REQ_REDIS_EXPIREAT: case MSG_REQ_REDIS_PEXPIRE: @@ -137,34 +131,30 @@ redis_arg1(struct msg *r) case MSG_REQ_REDIS_SLAVEOF: case MSG_REQ_REDIS_CONFIG: - return true; + return true; default: - break; - } + break; + } - return false; + return false; } -static bool -redis_arg_upto1(struct msg *r) -{ - switch (r->type) { - case MSG_REQ_REDIS_INFO: - return true; - default: - break; - } - return false; +static bool redis_arg_upto1(struct msg *r) { + switch (r->type) { + case MSG_REQ_REDIS_INFO: + return true; + default: + break; + } + return false; } /* * Return true, if the redis command accepts exactly 2 arguments, otherwise * return false */ -static bool -redis_arg2(struct msg *r) -{ - switch (r->type) { +static bool redis_arg2(struct msg *r) { + switch (r->type) { case MSG_REQ_REDIS_GETRANGE: case MSG_REQ_REDIS_PSETEX: case MSG_REQ_REDIS_SETBIT: @@ -192,41 +182,37 @@ redis_arg2(struct msg *r) case MSG_REQ_REDIS_RESTORE: - return true; + return true; default: - break; - } + break; + } - return false; + return false; } /* * Return true, if the redis command accepts exactly 3 arguments, otherwise * return false */ -static bool -redis_arg3(struct msg *r) -{ - switch (r->type) { +static bool redis_arg3(struct msg *r) { + switch (r->type) { case MSG_REQ_REDIS_LINSERT: - return true; + return true; default: - break; - } + break; + } - return false; + return false; } /* * Return true, if the redis command accepts 0 or more arguments, otherwise * return false */ -static bool -redis_argn(struct msg *r) -{ - switch (r->type) { +static bool redis_argn(struct msg *r) { + switch (r->type) { case MSG_REQ_REDIS_SORT: case MSG_REQ_REDIS_BITCOUNT: @@ -265,56 +251,52 @@ 
redis_argn(struct msg *r) case MSG_REQ_REDIS_ZSCAN: case MSG_REQ_REDIS_PFADD: case MSG_REQ_REDIS_GEOADD: - case MSG_REQ_REDIS_GEORADIUS: + case MSG_REQ_REDIS_GEORADIUS: case MSG_REQ_REDIS_GEODIST: case MSG_REQ_REDIS_GEOHASH: case MSG_REQ_REDIS_GEOPOS: case MSG_REQ_REDIS_GEORADIUSBYMEMBER: - return true; + return true; default: - break; - } + break; + } - return false; + return false; } /* * Return true, if the redis command is a vector command accepting one or * more keys, otherwise return false */ -static bool -redis_argx(struct msg *r) -{ - switch (r->type) { +static bool redis_argx(struct msg *r) { + switch (r->type) { case MSG_REQ_REDIS_MGET: case MSG_REQ_REDIS_DEL: case MSG_REQ_REDIS_EXISTS: - return true; + return true; default: - break; - } + break; + } - return false; + return false; } /* * Return true, if the redis command is a vector command accepting one or * more key-value pairs, otherwise return false */ -static bool -redis_argkvx(struct msg *r) -{ - switch (r->type) { +static bool redis_argkvx(struct msg *r) { + switch (r->type) { case MSG_REQ_REDIS_MSET: - return true; + return true; default: - break; - } + break; + } - return false; + return false; } /* @@ -323,29 +305,25 @@ redis_argkvx(struct msg *r) * followed by zero or more arguments (the documentation online seems to suggest * that at least one argument is required, but that shouldn't be the case). */ -static bool -redis_argeval(struct msg *r) -{ - switch (r->type) { +static bool redis_argeval(struct msg *r) { + switch (r->type) { case MSG_REQ_REDIS_EVAL: case MSG_REQ_REDIS_EVALSHA: - return true; + return true; default: - break; - } + break; + } - return false; + return false; } /* * Return true, if the redis response is an error response i.e. a simple * string whose first character is '-', otherwise return false. 
*/ -static bool -redis_error(struct msg *r) -{ - switch (r->type) { +static bool redis_error(struct msg *r) { + switch (r->type) { case MSG_RSP_REDIS_ERROR: case MSG_RSP_REDIS_ERROR_ERR: case MSG_RSP_REDIS_ERROR_OOM: @@ -360,12 +338,12 @@ redis_error(struct msg *r) case MSG_RSP_REDIS_ERROR_EXECABORT: case MSG_RSP_REDIS_ERROR_MASTERDOWN: case MSG_RSP_REDIS_ERROR_NOREPLICAS: - return true; + return true; default: - break; - } - return false; + break; + } + return false; } /* @@ -382,83 +360,86 @@ redis_error(struct msg *r) * ensures that the checksum comparison succeeds. * * * Sets *did_rewrite='true' if a rewrite occured and 'false' if not. - * * Does not modify 'orig_msg' and sets 'new_msg_ptr' to point to the new 'msg' struct with the - * rewritten query if 'did_rewrite' is true. + * * Does not modify 'orig_msg' and sets 'new_msg_ptr' to point to the new 'msg' + * struct with the rewritten query if 'did_rewrite' is true. * * Caller must take ownership of the newly allocated msg '*new_msg_ptr'. */ -rstatus_t redis_rewrite_query(struct msg* orig_msg, struct context* ctx, bool* did_rewrite, - struct msg** new_msg_ptr) { - const char* SMEMBERS_REWRITE_FMT_STRING = "*3\r\n$4\r\nsort\r\n$%d\r\n%s\r\n$5\r\nalpha\r\n"; - - ASSERT(orig_msg != NULL); - ASSERT(orig_msg->is_request); - ASSERT(did_rewrite != NULL); - - *did_rewrite = false; - - struct msg* new_msg = NULL; - uint8_t* key = NULL; - rstatus_t ret_status = DN_OK; - switch (orig_msg->type) { - case MSG_REQ_REDIS_SMEMBERS: - - if (orig_msg->owner->read_consistency == DC_SAFE_QUORUM) { - // SMEMBERS should have only one key. - ASSERT(orig_msg->nkeys == 1); - - // Get a new 'msg' structure. - new_msg = msg_get(orig_msg->owner, true, __FUNCTION__); - if (new_msg == NULL) { - ret_status = DN_ENOMEM; - goto error; - } - - uint32_t keylen; - // Get a copy of the key from 'orig_msg'. 
- key = msg_get_full_key_copy(orig_msg, 0, &keylen); - if (key == NULL) { - ret_status = DN_ENOMEM; - goto error; - } - - // Write the new command into 'new_msg' - rstatus_t prepend_status = msg_prepend_format(new_msg, SMEMBERS_REWRITE_FMT_STRING, keylen, key); - if (prepend_status != DN_OK) { - ret_status = prepend_status; - goto error; - } - - { - // Point the 'pos' pointer in 'new_msg' to the mbuf we've added. - struct mbuf* new_mbuf = STAILQ_LAST(&new_msg->mhdr, mbuf, next); - new_msg->pos = new_mbuf->pos; - } - // Parse the message 'new_msg' to populate all of its appropriate fields. - new_msg->parser(new_msg, &ctx->pool.hash_tag); - // Check if 'new_msg' was parsed successfully. - if (new_msg->result != MSG_PARSE_OK) { - ret_status = DN_ERROR; - goto error; - } - - *new_msg_ptr = new_msg; - *did_rewrite = true; - goto done; - } - break; - default: - return DN_OK; - } +rstatus_t redis_rewrite_query(struct msg *orig_msg, struct context *ctx, + bool *did_rewrite, struct msg **new_msg_ptr) { + const char *SMEMBERS_REWRITE_FMT_STRING = + "*3\r\n$4\r\nsort\r\n$%d\r\n%s\r\n$5\r\nalpha\r\n"; + + ASSERT(orig_msg != NULL); + ASSERT(orig_msg->is_request); + ASSERT(did_rewrite != NULL); + + *did_rewrite = false; + + struct msg *new_msg = NULL; + uint8_t *key = NULL; + rstatus_t ret_status = DN_OK; + switch (orig_msg->type) { + case MSG_REQ_REDIS_SMEMBERS: + + if (orig_msg->owner->read_consistency == DC_SAFE_QUORUM) { + // SMEMBERS should have only one key. + ASSERT(orig_msg->nkeys == 1); + + // Get a new 'msg' structure. + new_msg = msg_get(orig_msg->owner, true, __FUNCTION__); + if (new_msg == NULL) { + ret_status = DN_ENOMEM; + goto error; + } + + uint32_t keylen; + // Get a copy of the key from 'orig_msg'. 
+ key = msg_get_full_key_copy(orig_msg, 0, &keylen); + if (key == NULL) { + ret_status = DN_ENOMEM; + goto error; + } + + // Write the new command into 'new_msg' + rstatus_t prepend_status = msg_prepend_format( + new_msg, SMEMBERS_REWRITE_FMT_STRING, keylen, key); + if (prepend_status != DN_OK) { + ret_status = prepend_status; + goto error; + } + + { + // Point the 'pos' pointer in 'new_msg' to the mbuf we've added. + struct mbuf *new_mbuf = STAILQ_LAST(&new_msg->mhdr, mbuf, next); + new_msg->pos = new_mbuf->pos; + } + // Parse the message 'new_msg' to populate all of its appropriate + // fields. + new_msg->parser(new_msg, &ctx->pool.hash_tag); + // Check if 'new_msg' was parsed successfully. + if (new_msg->result != MSG_PARSE_OK) { + ret_status = DN_ERROR; + goto error; + } + + *new_msg_ptr = new_msg; + *did_rewrite = true; + goto done; + } + break; + default: + return DN_OK; + } error: - if (key != NULL) dn_free(key); - // Return the newly allocated message back to the free message queue. - if (new_msg != NULL) msg_put(new_msg); - return ret_status; + if (key != NULL) dn_free(key); + // Return the newly allocated message back to the free message queue. + if (new_msg != NULL) msg_put(new_msg); + return ret_status; done: - if (key != NULL) dn_free(key); - return DN_OK; + if (key != NULL) dn_free(key); + return DN_OK; } /* @@ -486,1552 +467,1576 @@ rstatus_t redis_rewrite_query(struct msg* orig_msg, struct context* ctx, bool* d * Dynomite supports the Redis unified protocol for requests and inline ping. 
* The inline ping is being utilized by redis-benchmark */ -void -redis_parse_req(struct msg *r, const struct string *hash_tag) -{ - struct mbuf *b; - uint8_t *p, *m = 0; - uint8_t ch; - enum { - SW_START, - SW_NARG, - SW_NARG_LF, - SW_REQ_TYPE_LEN, - SW_REQ_TYPE_LEN_LF, - SW_REQ_TYPE, - SW_REQ_TYPE_LF, - SW_KEY_LEN, - SW_KEY_LEN_LF, - SW_KEY, - SW_KEY_LF, - SW_ARG1_LEN, - SW_ARG1_LEN_LF, - SW_ARG1, - SW_ARG1_LF, - SW_ARG2_LEN, - SW_ARG2_LEN_LF, - SW_ARG2, - SW_ARG2_LF, - SW_ARG3_LEN, - SW_ARG3_LEN_LF, - SW_ARG3, - SW_ARG3_LF, - SW_ARGN_LEN, - SW_ARGN_LEN_LF, - SW_ARGN, - SW_ARGN_LF, - SW_FRAGMENT, - SW_INLINE_PING, - SW_SENTINEL - } state; - - state = r->state; - b = STAILQ_LAST(&r->mhdr, mbuf, next); - - ASSERT(r->is_request); - ASSERT(state >= SW_START && state < SW_SENTINEL); - ASSERT(b != NULL); - ASSERT(b->pos <= b->last); - - /* validate the parsing maker */ - ASSERT(r->pos != NULL); - //ASSERT(r->pos >= b->pos && r->pos <= b->last); - - for (p = r->pos; p < b->last; p++) { - ch = *p; - - switch (state) { - - case SW_START: - - case SW_NARG: - if (r->token == NULL) { - if (ch == 'p' || ch == 'P' ){ /* inline ping */ - state = SW_INLINE_PING; - log_hexdump(LOG_VERB, b->pos, mbuf_length(b),"INLINE PING"); - break; - } - else if (ch != '*') { - goto error; - } - r->token = p; - /* req_start <- p */ - r->narg_start = p; - r->rnarg = 0; - state = SW_NARG; - } else if (isdigit(ch)) { - r->rnarg = r->rnarg * 10 + (uint32_t)(ch - '0'); - } else if (ch == CR) { - if (r->rnarg == 0) { - goto error; - } - r->narg = r->rnarg; - r->narg_end = p; - r->token = NULL; - state = SW_NARG_LF; - } else { - goto error; - } - +void redis_parse_req(struct msg *r, const struct string *hash_tag) { + struct mbuf *b; + uint8_t *p, *m = 0; + uint8_t ch; + enum { + SW_START, + SW_NARG, + SW_NARG_LF, + SW_REQ_TYPE_LEN, + SW_REQ_TYPE_LEN_LF, + SW_REQ_TYPE, + SW_REQ_TYPE_LF, + SW_KEY_LEN, + SW_KEY_LEN_LF, + SW_KEY, + SW_KEY_LF, + SW_ARG1_LEN, + SW_ARG1_LEN_LF, + SW_ARG1, + SW_ARG1_LF, + 
SW_ARG2_LEN, + SW_ARG2_LEN_LF, + SW_ARG2, + SW_ARG2_LF, + SW_ARG3_LEN, + SW_ARG3_LEN_LF, + SW_ARG3, + SW_ARG3_LF, + SW_ARGN_LEN, + SW_ARGN_LEN_LF, + SW_ARGN, + SW_ARGN_LF, + SW_FRAGMENT, + SW_INLINE_PING, + SW_SENTINEL + } state; + + state = r->state; + b = STAILQ_LAST(&r->mhdr, mbuf, next); + + ASSERT(r->is_request); + ASSERT(state >= SW_START && state < SW_SENTINEL); + ASSERT(b != NULL); + ASSERT(b->pos <= b->last); + + /* validate the parsing maker */ + ASSERT(r->pos != NULL); + // ASSERT(r->pos >= b->pos && r->pos <= b->last); + + for (p = r->pos; p < b->last; p++) { + ch = *p; + + switch (state) { + case SW_START: + + case SW_NARG: + if (r->token == NULL) { + if (ch == 'p' || ch == 'P') { /* inline ping */ + state = SW_INLINE_PING; + log_hexdump(LOG_VERB, b->pos, mbuf_length(b), "INLINE PING"); break; - - case SW_INLINE_PING: - if (str3icmp(p, 'i', 'n', 'g') && p + 4 < b->last) { - p = p + 4; - log_hexdump(LOG_VERB, b->pos, mbuf_length(b),"PING"); - r->type = MSG_REQ_REDIS_PING; - r->is_read = 1; - state = SW_REQ_TYPE_LF; - goto done; - } - else{ - log_hexdump(LOG_VERB, b->pos, mbuf_length(b),"PING ERROR %d, %s",p-m,p); - goto error; - } + } else if (ch != '*') { + goto error; + } + r->token = p; + /* req_start <- p */ + r->narg_start = p; + r->rnarg = 0; + state = SW_NARG; + } else if (isdigit(ch)) { + r->rnarg = r->rnarg * 10 + (uint32_t)(ch - '0'); + } else if (ch == CR) { + if (r->rnarg == 0) { + goto error; + } + r->narg = r->rnarg; + r->narg_end = p; + r->token = NULL; + state = SW_NARG_LF; + } else { + goto error; + } break; + case SW_INLINE_PING: + if (str3icmp(p, 'i', 'n', 'g') && p + 4 < b->last) { + p = p + 4; + log_hexdump(LOG_VERB, b->pos, mbuf_length(b), "PING"); + r->type = MSG_REQ_REDIS_PING; + r->is_read = 1; + state = SW_REQ_TYPE_LF; + goto done; + } else { + log_hexdump(LOG_VERB, b->pos, mbuf_length(b), "PING ERROR %d, %s", + p - m, p); + goto error; + } - case SW_NARG_LF: - switch (ch) { - case LF: - state = SW_REQ_TYPE_LEN; - break; - - 
default: - goto error; - } + break; + case SW_NARG_LF: + switch (ch) { + case LF: + state = SW_REQ_TYPE_LEN; break; - case SW_REQ_TYPE_LEN: - if (r->token == NULL) { - if (ch != '$') { - goto error; - } - r->token = p; - r->rlen = 0; - } else if (isdigit(ch)) { - r->rlen = r->rlen * 10 + (uint32_t)(ch - '0'); - } else if (ch == CR) { - if (r->rlen == 0 || r->rnarg == 0) { - goto error; - } - r->rnarg--; - r->token = NULL; - state = SW_REQ_TYPE_LEN_LF; - } else { - goto error; - } + default: + goto error; + } - break; + break; - case SW_REQ_TYPE_LEN_LF: - switch (ch) { - case LF: - state = SW_REQ_TYPE; - break; + case SW_REQ_TYPE_LEN: + if (r->token == NULL) { + if (ch != '$') { + goto error; + } + r->token = p; + r->rlen = 0; + } else if (isdigit(ch)) { + r->rlen = r->rlen * 10 + (uint32_t)(ch - '0'); + } else if (ch == CR) { + if (r->rlen == 0 || r->rnarg == 0) { + goto error; + } + r->rnarg--; + r->token = NULL; + state = SW_REQ_TYPE_LEN_LF; + } else { + goto error; + } - default: - goto error; - } + break; + case SW_REQ_TYPE_LEN_LF: + switch (ch) { + case LF: + state = SW_REQ_TYPE; break; - case SW_REQ_TYPE: - if (r->token == NULL) { - r->token = p; - } - - m = r->token + r->rlen; - if (m >= b->last) { - m = b->last - 1; - p = m; - break; - } - - if (*m != CR) { - goto error; - } - - p = m; /* move forward by rlen bytes */ - r->rlen = 0; - m = r->token; - r->token = NULL; - r->type = MSG_UNKNOWN; - - switch (p - m) { - - case 3: - if (str3icmp(m, 'g', 'e', 't')) { - r->type = MSG_REQ_REDIS_GET; - r->is_read = 1; - break; - } - - if (str3icmp(m, 's', 'e', 't')) { - r->type = MSG_REQ_REDIS_SET; - r->is_read = 0; - break; - } - - if (str3icmp(m, 't', 't', 'l')) { - r->type = MSG_REQ_REDIS_TTL; - r->is_read = 0; - break; - } - - if (str3icmp(m, 'd', 'e', 'l')) { - r->type = MSG_REQ_REDIS_DEL; - r->is_read = 0; - break; - } - - break; - - case 4: - if (str4icmp(m, 'p', 't', 't', 'l')) { - r->type = MSG_REQ_REDIS_PTTL; - r->is_read = 1; - break; - } - - if 
(str4icmp(m, 'd', 'e', 'c', 'r')) { - r->type = MSG_REQ_REDIS_DECR; - r->is_read = 0; - break; - } - - if (str4icmp(m, 'd', 'u', 'm', 'p')) { - r->type = MSG_REQ_REDIS_DUMP; - r->is_read = 1; - break; - } - - if (str4icmp(m, 'h', 'd', 'e', 'l')) { - r->type = MSG_REQ_REDIS_HDEL; - r->is_read = 0; - break; - } - - if (str4icmp(m, 'h', 'g', 'e', 't')) { - r->type = MSG_REQ_REDIS_HGET; - r->is_read = 1; - break; - } - - if (str4icmp(m, 'h', 'l', 'e', 'n')) { - r->type = MSG_REQ_REDIS_HLEN; - r->is_read = 1; - break; - } - - if (str4icmp(m, 'h', 's', 'e', 't')) { - r->type = MSG_REQ_REDIS_HSET; - r->is_read = 0; - break; - } - - if (str4icmp(m, 'i', 'n', 'c', 'r')) { - r->type = MSG_REQ_REDIS_INCR; - r->is_read = 0; - break; - } - - if (str4icmp(m, 'k', 'e', 'y', 's')) { /* Yannis: Need to identify how this is defined in Redis protocol */ - r->type = MSG_REQ_REDIS_KEYS; - r->msg_routing = ROUTING_LOCAL_NODE_ONLY; - r->is_read = 1; - break; - } - - if (str4icmp(m, 'i', 'n', 'f', 'o')) { - r->type = MSG_REQ_REDIS_INFO; - r->msg_routing = ROUTING_LOCAL_NODE_ONLY; - r->is_read = 1; - break; - } - - if (str4icmp(m, 'l', 'l', 'e', 'n')) { - r->type = MSG_REQ_REDIS_LLEN; - r->is_read = 1; - break; - } - - if (str4icmp(m, 'l', 'p', 'o', 'p')) { - r->type = MSG_REQ_REDIS_LPOP; - r->is_read = 0; - break; - } - - if (str4icmp(m, 'l', 'r', 'e', 'm')) { - r->type = MSG_REQ_REDIS_LREM; - r->is_read = 0; - break; - } - - if (str4icmp(m, 'l', 's', 'e', 't')) { - r->type = MSG_REQ_REDIS_LSET; - r->is_read = 0; - break; - } - - if (str4icmp(m, 'm', 'g', 'e', 't')) { - r->type = MSG_REQ_REDIS_MGET; - r->is_read = 1; - break; - } - - if (str4icmp(m, 'm', 's', 'e', 't')) { /* Yannis: need to investigate the fan out of data to multiple nodes */ - r->type = MSG_REQ_REDIS_MSET; - r->is_read = 0; - break; - } - - if (str4icmp(m, 'p', 'i', 'n', 'g')) { - r->type = MSG_REQ_REDIS_PING; - r->msg_routing = ROUTING_LOCAL_NODE_ONLY; - p = p + 1; - r->is_read = 1; - goto done; - } - - if (str4icmp(m, 
'r', 'p', 'o', 'p')) { - r->type = MSG_REQ_REDIS_RPOP; - r->is_read = 0; - break; - } - - if (str4icmp(m, 's', 'a', 'd', 'd')) { - r->type = MSG_REQ_REDIS_SADD; - r->is_read = 0; - break; - } - - if (str4icmp(m, 's', 'c', 'a', 'n')) { - r->type = MSG_REQ_REDIS_SCAN; - r->msg_routing = ROUTING_LOCAL_NODE_ONLY; - r->is_read = 1; - break; - } - - if (str4icmp(m, 's', 'p', 'o', 'p')) { - r->type = MSG_REQ_REDIS_SPOP; - r->is_read = 0; - break; - } - - if (str4icmp(m, 's', 'r', 'e', 'm')) { - r->type = MSG_REQ_REDIS_SREM; - r->is_read = 0; - break; - } - - if (str4icmp(m, 't', 'y', 'p', 'e')) { - r->type = MSG_REQ_REDIS_TYPE; - r->is_read = 1; - break; - } - - if (str4icmp(m, 'z', 'a', 'd', 'd')) { - r->type = MSG_REQ_REDIS_ZADD; - r->is_read = 0; - break; - } - - if (str4icmp(m, 'z', 'r', 'e', 'm')) { - r->type = MSG_REQ_REDIS_ZREM; - r->is_read = 0; - break; - } - - if (str4icmp(m, 'e', 'v', 'a', 'l')) { - r->type = MSG_REQ_REDIS_EVAL; - r->is_read = 0; - break; - } - - if (str4icmp(m, 's', 'o', 'r', 't')) { - r->type = MSG_REQ_REDIS_SORT; - r->is_read = 1; - break; - } - - if (str4icmp(m, 'q', 'u', 'i', 't')) { - r->type = MSG_REQ_REDIS_QUIT; - r->quit = 1; - break; - } - - break; - - case 5: - if (str5icmp(m, 'h', 'k', 'e', 'y', 's')) { - r->type = MSG_REQ_REDIS_HKEYS; - r->msg_routing = ROUTING_TOKEN_OWNER_LOCAL_RACK_ONLY; - r->is_read = 1; - break; - } - - if (str5icmp(m, 'h', 'm', 'g', 'e', 't')) { - r->type = MSG_REQ_REDIS_HMGET; - r->is_read = 1; - break; - } - - if (str5icmp(m, 'h', 'm', 's', 'e', 't')) { - r->type = MSG_REQ_REDIS_HMSET; - r->is_read = 0; - break; - } - - if (str5icmp(m, 'h', 'v', 'a', 'l', 's')) { - r->type = MSG_REQ_REDIS_HVALS; - r->msg_routing = ROUTING_TOKEN_OWNER_LOCAL_RACK_ONLY; - r->is_read = 1; - break; - } - - if (str5icmp(m, 'h', 's', 'c', 'a', 'n')) { - r->type = MSG_REQ_REDIS_HSCAN; - r->msg_routing = ROUTING_TOKEN_OWNER_LOCAL_RACK_ONLY; - r->is_read = 1; - break; - } - - if (str5icmp(m, 'l', 'p', 'u', 's', 'h')) { - r->type = 
MSG_REQ_REDIS_LPUSH; - r->is_read = 0; - break; - } - - if (str5icmp(m, 'l', 't', 'r', 'i', 'm')) { - r->type = MSG_REQ_REDIS_LTRIM; - r->is_read = 0; - break; - } - - if (str5icmp(m, 'r', 'p', 'u', 's', 'h')) { - r->type = MSG_REQ_REDIS_RPUSH; - r->is_read = 0; - break; - } - - if (str5icmp(m, 's', 'c', 'a', 'r', 'd')) { - r->type = MSG_REQ_REDIS_SCARD; - r->is_read = 1; - break; - } - - if (str5icmp(m, 's', 'd', 'i', 'f', 'f')) { - r->type = MSG_REQ_REDIS_SDIFF; - r->is_read = 1; - break; - } - - if (str5icmp(m, 's', 'e', 't', 'e', 'x')) { - r->type = MSG_REQ_REDIS_SETEX; - r->is_read = 0; - break; - } - - if (str5icmp(m, 's', 'e', 't', 'n', 'x')) { - r->type = MSG_REQ_REDIS_SETNX; - r->is_read = 0; - break; - } - - if (str5icmp(m, 's', 'm', 'o', 'v', 'e')) { - r->type = MSG_REQ_REDIS_SMOVE; - r->is_read = 0; - break; - } - - if (str5icmp(m, 's', 's', 'c', 'a', 'n')) { - r->type = MSG_REQ_REDIS_SSCAN; - r->msg_routing = ROUTING_TOKEN_OWNER_LOCAL_RACK_ONLY; - r->is_read = 1; - break; - } - - if (str5icmp(m, 'z', 'c', 'a', 'r', 'd')) { - r->type = MSG_REQ_REDIS_ZCARD; - r->is_read = 1; - break; - } - - if (str5icmp(m, 'z', 'r', 'a', 'n', 'k')) { - r->type = MSG_REQ_REDIS_ZRANK; - r->is_read = 1; - break; - } - - if (str5icmp(m, 'z', 's', 'c', 'a', 'n')) { - r->type = MSG_REQ_REDIS_ZSCAN; - r->msg_routing = ROUTING_TOKEN_OWNER_LOCAL_RACK_ONLY; - r->is_read = 1; - break; - } - if (str5icmp(m, 'p', 'f', 'a', 'd', 'd')) { - r->type = MSG_REQ_REDIS_PFADD; - r->is_read = 0; - break; - } - - break; - - case 6: - if (str6icmp(m, 'a', 'p', 'p', 'e', 'n', 'd')) { - r->type = MSG_REQ_REDIS_APPEND; - r->is_read = 0; - break; - } - - if (str6icmp(m, 'd', 'e', 'c', 'r', 'b', 'y')) { - r->type = MSG_REQ_REDIS_DECRBY; - r->is_read = 0; - break; - } - - if (str6icmp(m, 'e', 'x', 'i', 's', 't', 's')) { - r->type = MSG_REQ_REDIS_EXISTS; - r->is_read = 1; - break; - } - - if (str6icmp(m, 'e', 'x', 'p', 'i', 'r', 'e')) { - r->type = MSG_REQ_REDIS_EXPIRE; - r->is_read = 0; - break; - } 
- - if (str6icmp(m, 'g', 'e', 't', 'b', 'i', 't')) { - r->type = MSG_REQ_REDIS_GETBIT; - r->is_read = 1; - break; - } - - if (str6icmp(m, 'g', 'e', 't', 's', 'e', 't')) { - r->type = MSG_REQ_REDIS_GETSET; - r->is_read = 0; - break; - } - - if (str6icmp(m, 'p', 's', 'e', 't', 'e', 'x')) { - r->type = MSG_REQ_REDIS_PSETEX; - r->is_read = 0; - break; - } - - if (str6icmp(m, 'h', 's', 'e', 't', 'n', 'x')) { - r->type = MSG_REQ_REDIS_HSETNX; - r->is_read = 0; - break; - } - - if (str6icmp(m, 'i', 'n', 'c', 'r', 'b', 'y')) { - r->type = MSG_REQ_REDIS_INCRBY; - r->is_read = 0; - break; - } - - if (str6icmp(m, 'l', 'i', 'n', 'd', 'e', 'x')) { - r->type = MSG_REQ_REDIS_LINDEX; - r->is_read = 1; - break; - } - - if (str6icmp(m, 'l', 'p', 'u', 's', 'h', 'x')) { - r->type = MSG_REQ_REDIS_LPUSHX; - r->is_read = 0; - break; - } - - if (str6icmp(m, 'l', 'r', 'a', 'n', 'g', 'e')) { - r->type = MSG_REQ_REDIS_LRANGE; - r->is_read = 1; - break; - } - - if (str6icmp(m, 'r', 'p', 'u', 's', 'h', 'x')) { - r->type = MSG_REQ_REDIS_RPUSHX; - r->is_read = 0; - break; - } - - if (str6icmp(m, 's', 'e', 't', 'b', 'i', 't')) { - r->type = MSG_REQ_REDIS_SETBIT; - r->is_read = 0; - break; - } - - if (str6icmp(m, 's', 'i', 'n', 't', 'e', 'r')) { - r->type = MSG_REQ_REDIS_SINTER; - r->is_read = 1; - break; - } - - if (str6icmp(m, 's', 't', 'r', 'l', 'e', 'n')) { - r->type = MSG_REQ_REDIS_STRLEN; - r->is_read = 1; - break; - } - - if (str6icmp(m, 's', 'u', 'n', 'i', 'o', 'n')) { - r->type = MSG_REQ_REDIS_SUNION; - r->is_read = 1; - break; - } - - if (str6icmp(m, 'z', 'c', 'o', 'u', 'n', 't')) { - r->type = MSG_REQ_REDIS_ZCOUNT; - r->is_read = 1; - break; - } - - if (str6icmp(m, 'z', 'r', 'a', 'n', 'g', 'e')) { - r->type = MSG_REQ_REDIS_ZRANGE; - r->is_read = 1; - break; - } - - if (str6icmp(m, 'z', 's', 'c', 'o', 'r', 'e')) { - r->type = MSG_REQ_REDIS_ZSCORE; - r->is_read = 1; - break; - } - - if (str6icmp(m, 'c', 'o', 'n', 'f', 'i', 'g')) { - r->type = MSG_REQ_REDIS_CONFIG; - r->msg_routing = 
ROUTING_LOCAL_NODE_ONLY; - r->is_read = 1; - break; - } - if (str6icmp(m, 'g', 'e', 'o', 'a', 'd', 'd')) { - r->type = MSG_REQ_REDIS_GEOADD; - r->is_read = 0; - break; - } - if (str6icmp(m, 'g', 'e', 'o', 'p', 'o', 's')) { - r->type = MSG_REQ_REDIS_GEOPOS; - r->is_read = 1; - break; - } + default: + goto error; + } - break; + break; - case 7: - if (str7icmp(m, 'p', 'e', 'r', 's', 'i', 's', 't')) { - r->type = MSG_REQ_REDIS_PERSIST; - r->is_read = 0; - break; - } - - if (str7icmp(m, 'p', 'e', 'x', 'p', 'i', 'r', 'e')) { - r->type = MSG_REQ_REDIS_PEXPIRE; - r->is_read = 0; - break; - } - - if (str7icmp(m, 'h', 'e', 'x', 'i', 's', 't', 's')) { - r->type = MSG_REQ_REDIS_HEXISTS; - r->is_read = 1; - break; - } - - if (str7icmp(m, 'h', 'g', 'e', 't', 'a', 'l', 'l')) { - r->type = MSG_REQ_REDIS_HGETALL; - r->msg_routing = ROUTING_TOKEN_OWNER_LOCAL_RACK_ONLY; - r->is_read = 1; - break; - } - - if (str7icmp(m, 'h', 'i', 'n', 'c', 'r', 'b', 'y')) { - r->type = MSG_REQ_REDIS_HINCRBY; - r->is_read = 0; - break; - } - - if (str7icmp(m, 'l', 'i', 'n', 's', 'e', 'r', 't')) { - r->type = MSG_REQ_REDIS_LINSERT; - r->is_read = 0; - break; - } - - if (str7icmp(m, 'z', 'i', 'n', 'c', 'r', 'b', 'y')) { - r->type = MSG_REQ_REDIS_ZINCRBY; - r->is_read = 0; - break; - } - - if (str7icmp(m, 'e', 'v', 'a', 'l', 's', 'h', 'a')) { - r->type = MSG_REQ_REDIS_EVALSHA; - r->is_read = 0; - break; - } - - if (str7icmp(m, 'r', 'e', 's', 't', 'o', 'r', 'e')) { - r->type = MSG_REQ_REDIS_RESTORE; - r->is_read = 0; - break; - } - - if (str7icmp(m, 's', 'l', 'a', 'v', 'e', 'o', 'f')) { - r->type = MSG_REQ_REDIS_SLAVEOF; - r->msg_routing = ROUTING_LOCAL_NODE_ONLY; - r->is_read = 0; - break; - } - if (str7icmp(m, 'p', 'f', 'c', 'o', 'u', 'n', 't')) { - r->type = MSG_REQ_REDIS_PFCOUNT; - r->is_read = 0; - break; - } - if (str7icmp(m, 'g', 'e', 'o', 'h', 'a', 's', 'h')) { - r->type = MSG_REQ_REDIS_GEOHASH; - r->is_read = 1; - break; - } - if (str7icmp(m, 'g', 'e', 'o', 'd', 'i', 's', 't')) { - r->type = 
MSG_REQ_REDIS_GEODIST; - r->is_read = 1; - break; - } - - break; + case SW_REQ_TYPE: + if (r->token == NULL) { + r->token = p; + } - case 8: - if (str8icmp(m, 'e', 'x', 'p', 'i', 'r', 'e', 'a', 't')) { - r->type = MSG_REQ_REDIS_EXPIREAT; - r->is_read = 0; - break; - } - - if (str8icmp(m, 'b', 'i', 't', 'c', 'o', 'u', 'n', 't')) { - r->type = MSG_REQ_REDIS_BITCOUNT; - r->is_read = 1; - break; - } - - if (str8icmp(m, 'g', 'e', 't', 'r', 'a', 'n', 'g', 'e')) { - r->type = MSG_REQ_REDIS_GETRANGE; - r->is_read = 1; - break; - } - - if (str8icmp(m, 's', 'e', 't', 'r', 'a', 'n', 'g', 'e')) { - r->type = MSG_REQ_REDIS_SETRANGE; - r->is_read = 0; - break; - } - - if (str8icmp(m, 's', 'm', 'e', 'm', 'b', 'e', 'r', 's')) { - r->type = MSG_REQ_REDIS_SMEMBERS; - r->is_read = 1; - break; - } - - if (str8icmp(m, 'z', 'r', 'e', 'v', 'r', 'a', 'n', 'k')) { - r->type = MSG_REQ_REDIS_ZREVRANK; - r->is_read = 1; - break; - } + m = r->token + r->rlen; + if (m >= b->last) { + m = b->last - 1; + p = m; + break; + } - break; + if (*m != CR) { + goto error; + } - case 9: - if (str9icmp(m, 'p', 'e', 'x', 'p', 'i', 'r', 'e', 'a', 't')) { - r->type = MSG_REQ_REDIS_PEXPIREAT; - r->is_read = 0; - break; - } - - if (str9icmp(m, 'r', 'p', 'o', 'p', 'l', 'p', 'u', 's', 'h')) { - r->type = MSG_REQ_REDIS_RPOPLPUSH; - r->is_read = 0; - break; - } - - if (str9icmp(m, 's', 'i', 's', 'm', 'e', 'm', 'b', 'e', 'r')) { - r->type = MSG_REQ_REDIS_SISMEMBER; - r->is_read = 1; - break; - } - - if (str9icmp(m, 'z', 'l', 'e', 'x', 'c', 'o', 'u', 'n', 't')) { - r->type = MSG_REQ_REDIS_ZLEXCOUNT; - r->is_read = 1; - break; - } - - if (str9icmp(m, 'z', 'r', 'e', 'v', 'r', 'a', 'n', 'g', 'e')) { - r->type = MSG_REQ_REDIS_ZREVRANGE; - r->is_read = 1; - break; - } - - if (str9icmp(m, 'g', 'e', 'o', 'r', 'a', 'd', 'i', 'u', 's')) { - r->type = MSG_REQ_REDIS_GEORADIUS; - r->is_read = 1; - break; - } + p = m; /* move forward by rlen bytes */ + r->rlen = 0; + m = r->token; + r->token = NULL; + r->type = MSG_UNKNOWN; - 
break; + switch (p - m) { + case 3: + if (str3icmp(m, 'g', 'e', 't')) { + r->type = MSG_REQ_REDIS_GET; + r->is_read = 1; + break; + } - case 10: - if (str10icmp(m, 's', 'd', 'i', 'f', 'f', 's', 't', 'o', 'r', 'e')) { - r->type = MSG_REQ_REDIS_SDIFFSTORE; - r->is_read = 0; - break; - } + if (str3icmp(m, 's', 'e', 't')) { + r->type = MSG_REQ_REDIS_SET; + r->is_read = 0; + break; + } - break; + if (str3icmp(m, 't', 't', 'l')) { + r->type = MSG_REQ_REDIS_TTL; + r->is_read = 0; + break; + } - case 11: - if (str11icmp(m, 'i', 'n', 'c', 'r', 'b', 'y', 'f', 'l', 'o', 'a', 't')) { - r->type = MSG_REQ_REDIS_INCRBYFLOAT; - r->is_read = 0; - break; - } - - if (str11icmp(m, 's', 'i', 'n', 't', 'e', 'r', 's', 't', 'o', 'r', 'e')) { - r->type = MSG_REQ_REDIS_SINTERSTORE; - r->is_read = 0; - break; - } - - if (str11icmp(m, 's', 'r', 'a', 'n', 'd', 'm', 'e', 'm', 'b', 'e', 'r')) { - r->type = MSG_REQ_REDIS_SRANDMEMBER; - r->is_read = 1; - break; - } - - if (str11icmp(m, 's', 'u', 'n', 'i', 'o', 'n', 's', 't', 'o', 'r', 'e')) { - r->type = MSG_REQ_REDIS_SUNIONSTORE; - r->is_read = 1; - break; - } - - if (str11icmp(m, 'z', 'i', 'n', 't', 'e', 'r', 's', 't', 'o', 'r', 'e')) { - r->type = MSG_REQ_REDIS_ZINTERSTORE; - r->is_read = 1; - break; - } - - if (str11icmp(m, 'z', 'r', 'a', 'n', 'g', 'e', 'b', 'y', 'l', 'e', 'x')) { - r->type = MSG_REQ_REDIS_ZRANGEBYLEX; - r->is_read = 1; - break; - } - - if (str11icmp(m, 'z', 'u', 'n', 'i', 'o', 'n', 's', 't', 'o', 'r', 'e')) { - r->type = MSG_REQ_REDIS_ZUNIONSTORE; - r->is_read = 1; - break; - } + if (str3icmp(m, 'd', 'e', 'l')) { + r->type = MSG_REQ_REDIS_DEL; + r->is_read = 0; + break; + } - break; + break; - case 12: - if (str12icmp(m, 'h', 'i', 'n', 'c', 'r', 'b', 'y', 'f', 'l', 'o', 'a', 't')) { - r->type = MSG_REQ_REDIS_HINCRBYFLOAT; - r->is_read = 0; - break; - } + case 4: + if (str4icmp(m, 'p', 't', 't', 'l')) { + r->type = MSG_REQ_REDIS_PTTL; + r->is_read = 1; + break; + } - break; + if (str4icmp(m, 'd', 'e', 'c', 'r')) { + r->type = 
MSG_REQ_REDIS_DECR; + r->is_read = 0; + break; + } - case 13: - if (str13icmp(m, 'z', 'r', 'a', 'n', 'g', 'e', 'b', 'y', 's', 'c', 'o', 'r', 'e')) { - r->type = MSG_REQ_REDIS_ZRANGEBYSCORE; - r->is_read = 1; - break; - } + if (str4icmp(m, 'd', 'u', 'm', 'p')) { + r->type = MSG_REQ_REDIS_DUMP; + r->is_read = 1; + break; + } - break; + if (str4icmp(m, 'h', 'd', 'e', 'l')) { + r->type = MSG_REQ_REDIS_HDEL; + r->is_read = 0; + break; + } - case 14: - if (str14icmp(m, 'z', 'r', 'e', 'm', 'r', 'a', 'n', 'g', 'e', 'b', 'y', 'l', 'e', 'x')) { - r->type = MSG_REQ_REDIS_ZREMRANGEBYLEX; - r->is_read = 0; - break; - } + if (str4icmp(m, 'h', 'g', 'e', 't')) { + r->type = MSG_REQ_REDIS_HGET; + r->is_read = 1; + break; + } - if (str14icmp(m, 'z', 'r', 'e', 'v', 'r', 'a', 'n', 'g', 'e', 'b', 'y', 'l', 'e', 'x')) { - r->type = MSG_REQ_REDIS_ZREVRANGEBYLEX; - r->is_read = 1; - break; - } + if (str4icmp(m, 'h', 'l', 'e', 'n')) { + r->type = MSG_REQ_REDIS_HLEN; + r->is_read = 1; + break; + } - break; + if (str4icmp(m, 'h', 's', 'e', 't')) { + r->type = MSG_REQ_REDIS_HSET; + r->is_read = 0; + break; + } - case 15: - if (str15icmp(m, 'z', 'r', 'e', 'm', 'r', 'a', 'n', 'g', 'e', 'b', 'y', 'r', 'a', 'n', 'k')) { - r->type = MSG_REQ_REDIS_ZREMRANGEBYRANK; - r->is_read = 0; - break; - } + if (str4icmp(m, 'i', 'n', 'c', 'r')) { + r->type = MSG_REQ_REDIS_INCR; + r->is_read = 0; + break; + } - break; + if (str4icmp(m, 'k', 'e', 'y', 's')) { /* Yannis: Need to identify + how this is defined in + Redis protocol */ + r->type = MSG_REQ_REDIS_KEYS; + r->msg_routing = ROUTING_LOCAL_NODE_ONLY; + r->is_read = 1; + break; + } - case 16: - if (str16icmp(m, 'z', 'r', 'e', 'm', 'r', 'a', 'n', 'g', 'e', 'b', 'y', 's', 'c', 'o', 'r', 'e')) { - r->type = MSG_REQ_REDIS_ZREMRANGEBYSCORE; - r->is_read = 0; - break; - } + if (str4icmp(m, 'i', 'n', 'f', 'o')) { + r->type = MSG_REQ_REDIS_INFO; + r->msg_routing = ROUTING_LOCAL_NODE_ONLY; + r->is_read = 1; + break; + } - if (str16icmp(m, 'z', 'r', 'e', 'v', 'r', 
'a', 'n', 'g', 'e', 'b', 'y', 's', 'c', 'o', 'r', 'e')) { - r->type = MSG_REQ_REDIS_ZREVRANGEBYSCORE; - r->is_read = 1; - break; - } + if (str4icmp(m, 'l', 'l', 'e', 'n')) { + r->type = MSG_REQ_REDIS_LLEN; + r->is_read = 1; + break; + } - break; + if (str4icmp(m, 'l', 'p', 'o', 'p')) { + r->type = MSG_REQ_REDIS_LPOP; + r->is_read = 0; + break; + } - case 17: - if (str17icmp(m, 'g', 'e', 'o', 'r', 'a', 'd', 'i', 'u', 's', 'b', 'y', 'm', 'e', 'm', 'b','e','r')) { - r->type = MSG_REQ_REDIS_GEORADIUSBYMEMBER; - r->is_read = 1; - break; - } + if (str4icmp(m, 'l', 'r', 'e', 'm')) { + r->type = MSG_REQ_REDIS_LREM; + r->is_read = 0; + break; + } - break; - default: - r->is_read = 1; - break; + if (str4icmp(m, 'l', 's', 'e', 't')) { + r->type = MSG_REQ_REDIS_LSET; + r->is_read = 0; + break; } - if (r->type == MSG_UNKNOWN) { - log_error("parsed unsupported command '%.*s'", p - m, m); - goto error; + if (str4icmp(m, 'm', 'g', 'e', 't')) { + r->type = MSG_REQ_REDIS_MGET; + r->is_read = 1; + break; } - log_debug(LOG_VERB, "parsed command '%.*s'", p - m, m); + if (str4icmp(m, 'm', 's', 'e', + 't')) { /* Yannis: need to investigate the fan out of + data to multiple nodes */ + r->type = MSG_REQ_REDIS_MSET; + r->is_read = 0; + break; + } - state = SW_REQ_TYPE_LF; - break; + if (str4icmp(m, 'p', 'i', 'n', 'g')) { + r->type = MSG_REQ_REDIS_PING; + r->msg_routing = ROUTING_LOCAL_NODE_ONLY; + p = p + 1; + r->is_read = 1; + goto done; + } + if (str4icmp(m, 'r', 'p', 'o', 'p')) { + r->type = MSG_REQ_REDIS_RPOP; + r->is_read = 0; + break; + } - case SW_REQ_TYPE_LF: - switch (ch) { - case LF: - if (redis_argz(r) && (r->rnarg == 0)) { - goto done; - } else if (redis_arg_upto1(r) && r->rnarg == 0) { - goto done; - } else if (redis_arg_upto1(r) && r->rnarg == 1) { - state = SW_ARG1_LEN; - } else if (redis_argeval(r)) { - state = SW_ARG1_LEN; - } else { - state = SW_KEY_LEN; - } - break; - - default: - goto error; - } + if (str4icmp(m, 's', 'a', 'd', 'd')) { + r->type = MSG_REQ_REDIS_SADD; + 
r->is_read = 0; + break; + } - break; + if (str4icmp(m, 's', 'c', 'a', 'n')) { + r->type = MSG_REQ_REDIS_SCAN; + r->msg_routing = ROUTING_LOCAL_NODE_ONLY; + r->is_read = 1; + break; + } - case SW_KEY_LEN: - if (r->token == NULL) { - if (ch != '$') { - goto error; - } - r->token = p; - r->rlen = 0; - } else if (isdigit(ch)) { - r->rlen = r->rlen * 10 + (uint32_t)(ch - '0'); - } else if (ch == CR) { - if (r->rlen == 0) { - log_error("parsed bad req %"PRIu64" of type %d with empty " - "key", r->id, r->type); - goto error; - } - if (r->rlen >= mbuf_data_size()) { - log_error("parsed bad req %"PRIu64" of type %d with key " - "length %d that greater than or equal to maximum" - " redis key length of %d", r->id, r->type, - r->rlen, mbuf_data_size()); - goto error; - } - if (r->rnarg == 0) { - goto error; - } - r->rnarg--; - r->token = NULL; - state = SW_KEY_LEN_LF; - } else { - goto error; + if (str4icmp(m, 's', 'p', 'o', 'p')) { + r->type = MSG_REQ_REDIS_SPOP; + r->is_read = 0; + break; } - break; + if (str4icmp(m, 's', 'r', 'e', 'm')) { + r->type = MSG_REQ_REDIS_SREM; + r->is_read = 0; + break; + } - case SW_KEY_LEN_LF: - switch (ch) { - case LF: - state = SW_KEY; - break; + if (str4icmp(m, 't', 'y', 'p', 'e')) { + r->type = MSG_REQ_REDIS_TYPE; + r->is_read = 1; + break; + } - default: - goto error; + if (str4icmp(m, 'z', 'a', 'd', 'd')) { + r->type = MSG_REQ_REDIS_ZADD; + r->is_read = 0; + break; } - break; + if (str4icmp(m, 'z', 'r', 'e', 'm')) { + r->type = MSG_REQ_REDIS_ZREM; + r->is_read = 0; + break; + } - case SW_KEY: - if (r->token == NULL) { - r->token = p; + if (str4icmp(m, 'e', 'v', 'a', 'l')) { + r->type = MSG_REQ_REDIS_EVAL; + r->is_read = 0; + break; } - m = r->token + r->rlen; - if (m >= b->last) { - m = b->last - 1; - p = m; - break; + if (str4icmp(m, 's', 'o', 'r', 't')) { + r->type = MSG_REQ_REDIS_SORT; + r->is_read = 1; + break; } - if (*m != CR) { - goto error; - } else { - struct keypos *kpos; - - p = m; /* move forward by rlen bytes */ - r->rlen = 
0; - m = r->token; - r->token = NULL; - - kpos = array_push(r->keys); - if (kpos == NULL) { - goto enomem; - } - kpos->start = kpos->tag_start = m; - kpos->end = kpos->tag_end = p; - if (!string_empty(hash_tag)) { - uint8_t *tag_start, *tag_end; - - tag_start = dn_strchr(kpos->start, kpos->end, hash_tag->data[0]); - if (tag_start != NULL) { - tag_end = dn_strchr(tag_start + 1, kpos->end, hash_tag->data[1]); - if (tag_end != NULL) { - kpos->tag_start = tag_start + 1; - kpos->tag_end = tag_end; - } - } - } - state = SW_KEY_LF; + if (str4icmp(m, 'q', 'u', 'i', 't')) { + r->type = MSG_REQ_REDIS_QUIT; + r->quit = 1; + break; } break; - case SW_KEY_LF: - switch (ch) { - case LF: - if (redis_arg0(r)) { - if (r->rnarg != 0) { - goto error; - } - goto done; - } else if (redis_arg1(r)) { - if (r->rnarg != 1) { - goto error; - } - state = SW_ARG1_LEN; - } else if (redis_arg2(r)) { - if (r->rnarg != 2) { - goto error; - } - state = SW_ARG1_LEN; - } else if (redis_arg3(r)) { - if (r->rnarg != 3) { - goto error; - } - state = SW_ARG1_LEN; - } else if (redis_argn(r)) { - if (r->rnarg == 0) { - goto done; - } - state = SW_ARG1_LEN; - } else if (redis_argx(r)) { - if (r->rnarg == 0) { - goto done; - } - state = SW_KEY_LEN; - } else if (redis_argkvx(r)) { - if (r->narg % 2 == 0) { - goto error; - } - state = SW_ARG1_LEN; - } else if (redis_argeval(r)) { - r->nkeys--; - if (r->nkeys > 0) { - // if there are more keys pending, parse them - state = SW_KEY_LEN; - } else if (r->rnarg > 0) { - // we finished parsing keys, now start with args - state = SW_ARGN_LEN; - } else { - // no more args left, we are done - goto done; - } - } else { - goto error; - } + case 5: + if (str5icmp(m, 'h', 'k', 'e', 'y', 's')) { + r->type = MSG_REQ_REDIS_HKEYS; + r->msg_routing = ROUTING_TOKEN_OWNER_LOCAL_RACK_ONLY; + r->is_read = 1; + break; + } - break; + if (str5icmp(m, 'h', 'm', 'g', 'e', 't')) { + r->type = MSG_REQ_REDIS_HMGET; + r->is_read = 1; + break; + } - default: - goto error; + if (str5icmp(m, 
'h', 'm', 's', 'e', 't')) { + r->type = MSG_REQ_REDIS_HMSET; + r->is_read = 0; + break; } - break; + if (str5icmp(m, 'h', 'v', 'a', 'l', 's')) { + r->type = MSG_REQ_REDIS_HVALS; + r->msg_routing = ROUTING_TOKEN_OWNER_LOCAL_RACK_ONLY; + r->is_read = 1; + break; + } - case SW_ARG1_LEN: - if (r->token == NULL) { - if (ch != '$') { - goto error; - } - r->rlen = 0; - r->token = p; - } else if (isdigit(ch)) { - r->rlen = r->rlen * 10 + (uint32_t)(ch - '0'); - } else if (ch == CR) { - if ((p - r->token) <= 1 || r->rnarg == 0) { - goto error; - } - r->rnarg--; - r->token = NULL; - state = SW_ARG1_LEN_LF; - } else { - goto error; + if (str5icmp(m, 'h', 's', 'c', 'a', 'n')) { + r->type = MSG_REQ_REDIS_HSCAN; + r->msg_routing = ROUTING_TOKEN_OWNER_LOCAL_RACK_ONLY; + r->is_read = 1; + break; } - break; + if (str5icmp(m, 'l', 'p', 'u', 's', 'h')) { + r->type = MSG_REQ_REDIS_LPUSH; + r->is_read = 0; + break; + } - case SW_ARG1_LEN_LF: - switch (ch) { - case LF: - state = SW_ARG1; - break; + if (str5icmp(m, 'l', 't', 'r', 'i', 'm')) { + r->type = MSG_REQ_REDIS_LTRIM; + r->is_read = 0; + break; + } - default: - goto error; + if (str5icmp(m, 'r', 'p', 'u', 's', 'h')) { + r->type = MSG_REQ_REDIS_RPUSH; + r->is_read = 0; + break; } - break; + if (str5icmp(m, 's', 'c', 'a', 'r', 'd')) { + r->type = MSG_REQ_REDIS_SCARD; + r->is_read = 1; + break; + } - case SW_ARG1: + if (str5icmp(m, 's', 'd', 'i', 'f', 'f')) { + r->type = MSG_REQ_REDIS_SDIFF; + r->is_read = 1; + break; + } + if (str5icmp(m, 's', 'e', 't', 'e', 'x')) { + r->type = MSG_REQ_REDIS_SETEX; + r->is_read = 0; + break; + } - if (r->type == MSG_REQ_REDIS_CONFIG && !str3icmp(m, 'g', 'e', 't')) { - log_error("Redis CONFIG command not supported '%.*s'", p - m, m); - goto error; + if (str5icmp(m, 's', 'e', 't', 'n', 'x')) { + r->type = MSG_REQ_REDIS_SETNX; + r->is_read = 0; + break; } - m = p + r->rlen; - if (m >= b->last) { - r->rlen -= (uint32_t)(b->last - p); - m = b->last - 1; - p = m; - break; + if (str5icmp(m, 's', 'm', 'o', 
'v', 'e')) { + r->type = MSG_REQ_REDIS_SMOVE; + r->is_read = 0; + break; } - if (*m != CR) { - goto error; + if (str5icmp(m, 's', 's', 'c', 'a', 'n')) { + r->type = MSG_REQ_REDIS_SSCAN; + r->msg_routing = ROUTING_TOKEN_OWNER_LOCAL_RACK_ONLY; + r->is_read = 1; + break; } - p = m; /* move forward by rlen bytes */ - r->rlen = 0; + if (str5icmp(m, 'z', 'c', 'a', 'r', 'd')) { + r->type = MSG_REQ_REDIS_ZCARD; + r->is_read = 1; + break; + } + if (str5icmp(m, 'z', 'r', 'a', 'n', 'k')) { + r->type = MSG_REQ_REDIS_ZRANK; + r->is_read = 1; + break; + } - state = SW_ARG1_LF; + if (str5icmp(m, 'z', 's', 'c', 'a', 'n')) { + r->type = MSG_REQ_REDIS_ZSCAN; + r->msg_routing = ROUTING_TOKEN_OWNER_LOCAL_RACK_ONLY; + r->is_read = 1; + break; + } + if (str5icmp(m, 'p', 'f', 'a', 'd', 'd')) { + r->type = MSG_REQ_REDIS_PFADD; + r->is_read = 0; + break; + } break; - case SW_ARG1_LF: - switch (ch) { - case LF: - if (redis_arg_upto1(r) || redis_arg1(r)) { - if (r->rnarg != 0) { - goto error; - } - goto done; - } else if (redis_arg2(r)) { - if (r->rnarg != 1) { - goto error; - } - state = SW_ARG2_LEN; - } else if (redis_arg3(r)) { - if (r->rnarg != 2) { - goto error; - } - state = SW_ARG2_LEN; - } else if (redis_argn(r)) { - if (r->rnarg == 0) { - goto done; - } - state = SW_ARGN_LEN; - } else if (redis_argeval(r)) { - if (r->rnarg < 2) { - log_error("Dynomite EVAL/EVALSHA requires at least 1 key"); - goto error; - } - state = SW_ARG2_LEN; - } else if (redis_argkvx(r)) { - if (r->rnarg == 0) { - goto done; - } - state = SW_KEY_LEN; - } else { - goto error; - } - - break; + case 6: + if (str6icmp(m, 'a', 'p', 'p', 'e', 'n', 'd')) { + r->type = MSG_REQ_REDIS_APPEND; + r->is_read = 0; + break; + } - default: - goto error; + if (str6icmp(m, 'd', 'e', 'c', 'r', 'b', 'y')) { + r->type = MSG_REQ_REDIS_DECRBY; + r->is_read = 0; + break; } - break; + if (str6icmp(m, 'e', 'x', 'i', 's', 't', 's')) { + r->type = MSG_REQ_REDIS_EXISTS; + r->is_read = 1; + break; + } - case SW_ARG2_LEN: - if (r->token == 
NULL) { - if (ch != '$') { - goto error; - } - r->rlen = 0; - r->token = p; - } else if (isdigit(ch)) { - r->rlen = r->rlen * 10 + (uint32_t)(ch - '0'); - } else if (ch == CR) { - if ((p - r->token) <= 1 || r->rnarg == 0) { - goto error; - } - r->rnarg--; - r->token = NULL; - state = SW_ARG2_LEN_LF; - } else { - goto error; + if (str6icmp(m, 'e', 'x', 'p', 'i', 'r', 'e')) { + r->type = MSG_REQ_REDIS_EXPIRE; + r->is_read = 0; + break; } - break; + if (str6icmp(m, 'g', 'e', 't', 'b', 'i', 't')) { + r->type = MSG_REQ_REDIS_GETBIT; + r->is_read = 1; + break; + } - case SW_ARG2_LEN_LF: - switch (ch) { - case LF: - state = SW_ARG2; - break; + if (str6icmp(m, 'g', 'e', 't', 's', 'e', 't')) { + r->type = MSG_REQ_REDIS_GETSET; + r->is_read = 0; + break; + } - default: - goto error; + if (str6icmp(m, 'p', 's', 'e', 't', 'e', 'x')) { + r->type = MSG_REQ_REDIS_PSETEX; + r->is_read = 0; + break; } - break; + if (str6icmp(m, 'h', 's', 'e', 't', 'n', 'x')) { + r->type = MSG_REQ_REDIS_HSETNX; + r->is_read = 0; + break; + } - case SW_ARG2: - if (r->token == NULL && redis_argeval(r)) { - /* - * For EVAL/EVALSHA, ARG2 represents the # key/arg pairs which must - * be tokenized and stored in contiguous memory. 
- */ - r->token = p; + if (str6icmp(m, 'i', 'n', 'c', 'r', 'b', 'y')) { + r->type = MSG_REQ_REDIS_INCRBY; + r->is_read = 0; + break; } - m = p + r->rlen; - if (m >= b->last) { - r->rlen -= (uint32_t)(b->last - p); - m = b->last - 1; - p = m; - break; + if (str6icmp(m, 'l', 'i', 'n', 'd', 'e', 'x')) { + r->type = MSG_REQ_REDIS_LINDEX; + r->is_read = 1; + break; } - if (*m != CR) { - goto error; + if (str6icmp(m, 'l', 'p', 'u', 's', 'h', 'x')) { + r->type = MSG_REQ_REDIS_LPUSHX; + r->is_read = 0; + break; } - p = m; /* move forward by rlen bytes */ - r->rlen = 0; + if (str6icmp(m, 'l', 'r', 'a', 'n', 'g', 'e')) { + r->type = MSG_REQ_REDIS_LRANGE; + r->is_read = 1; + break; + } - if (redis_argeval(r)) { - uint32_t nkey; - uint8_t *chp; - /* - * For EVAL/EVALSHA, we need to find the integer value of this - * argument. It tells us the number of keys in the script, and - * we need to error out if number of keys is 0. At this point, - * both p and m point to the end of the argument and r->token - * points to the start. 
- */ - if (p - r->token < 1) { - goto error; - } - - for (nkey = 0, chp = r->token; chp < p; chp++) { - if (isdigit(*chp)) { - nkey = nkey * 10 + (uint32_t)(*chp - '0'); - } else { - goto error; - } - } - - if (nkey == 0) { - log_error("EVAL/EVALSHA requires atleast 1 key"); - goto error; - } - if (r->rnarg < nkey) { - log_error("EVAL/EVALSHA Not all keys provided: expecting %u", nkey); - goto error; - } - r->nkeys = nkey; - r->token = NULL; - } - - state = SW_ARG2_LF; + if (str6icmp(m, 'r', 'p', 'u', 's', 'h', 'x')) { + r->type = MSG_REQ_REDIS_RPUSHX; + r->is_read = 0; + break; + } + + if (str6icmp(m, 's', 'e', 't', 'b', 'i', 't')) { + r->type = MSG_REQ_REDIS_SETBIT; + r->is_read = 0; + break; + } + + if (str6icmp(m, 's', 'i', 'n', 't', 'e', 'r')) { + r->type = MSG_REQ_REDIS_SINTER; + r->is_read = 1; + break; + } + + if (str6icmp(m, 's', 't', 'r', 'l', 'e', 'n')) { + r->type = MSG_REQ_REDIS_STRLEN; + r->is_read = 1; + break; + } + + if (str6icmp(m, 's', 'u', 'n', 'i', 'o', 'n')) { + r->type = MSG_REQ_REDIS_SUNION; + r->is_read = 1; + break; + } + + if (str6icmp(m, 'z', 'c', 'o', 'u', 'n', 't')) { + r->type = MSG_REQ_REDIS_ZCOUNT; + r->is_read = 1; + break; + } + + if (str6icmp(m, 'z', 'r', 'a', 'n', 'g', 'e')) { + r->type = MSG_REQ_REDIS_ZRANGE; + r->is_read = 1; + break; + } + + if (str6icmp(m, 'z', 's', 'c', 'o', 'r', 'e')) { + r->type = MSG_REQ_REDIS_ZSCORE; + r->is_read = 1; + break; + } + + if (str6icmp(m, 'c', 'o', 'n', 'f', 'i', 'g')) { + r->type = MSG_REQ_REDIS_CONFIG; + r->msg_routing = ROUTING_LOCAL_NODE_ONLY; + r->is_read = 1; + break; + } + if (str6icmp(m, 'g', 'e', 'o', 'a', 'd', 'd')) { + r->type = MSG_REQ_REDIS_GEOADD; + r->is_read = 0; + break; + } + if (str6icmp(m, 'g', 'e', 'o', 'p', 'o', 's')) { + r->type = MSG_REQ_REDIS_GEOPOS; + r->is_read = 1; + break; + } break; - case SW_ARG2_LF: - switch (ch) { - case LF: - if (redis_arg2(r)) { - if (r->rnarg != 0) { - goto error; - } - goto done; - } else if (redis_arg3(r)) { - if (r->rnarg != 1) { - goto 
error; - } - state = SW_ARG3_LEN; - } else if (redis_argn(r)) { - if (r->rnarg == 0) { - goto done; - } - state = SW_ARGN_LEN; - } else if (redis_argeval(r)) { - if (r->rnarg < 1) { - goto error; - } - state = SW_KEY_LEN; - } else { - goto error; - } + case 7: + if (str7icmp(m, 'p', 'e', 'r', 's', 'i', 's', 't')) { + r->type = MSG_REQ_REDIS_PERSIST; + r->is_read = 0; + break; + } - break; + if (str7icmp(m, 'p', 'e', 'x', 'p', 'i', 'r', 'e')) { + r->type = MSG_REQ_REDIS_PEXPIRE; + r->is_read = 0; + break; + } - default: - goto error; + if (str7icmp(m, 'h', 'e', 'x', 'i', 's', 't', 's')) { + r->type = MSG_REQ_REDIS_HEXISTS; + r->is_read = 1; + break; + } + + if (str7icmp(m, 'h', 'g', 'e', 't', 'a', 'l', 'l')) { + r->type = MSG_REQ_REDIS_HGETALL; + r->msg_routing = ROUTING_TOKEN_OWNER_LOCAL_RACK_ONLY; + r->is_read = 1; + break; + } + + if (str7icmp(m, 'h', 'i', 'n', 'c', 'r', 'b', 'y')) { + r->type = MSG_REQ_REDIS_HINCRBY; + r->is_read = 0; + break; + } + + if (str7icmp(m, 'l', 'i', 'n', 's', 'e', 'r', 't')) { + r->type = MSG_REQ_REDIS_LINSERT; + r->is_read = 0; + break; + } + + if (str7icmp(m, 'z', 'i', 'n', 'c', 'r', 'b', 'y')) { + r->type = MSG_REQ_REDIS_ZINCRBY; + r->is_read = 0; + break; + } + + if (str7icmp(m, 'e', 'v', 'a', 'l', 's', 'h', 'a')) { + r->type = MSG_REQ_REDIS_EVALSHA; + r->is_read = 0; + break; + } + + if (str7icmp(m, 'r', 'e', 's', 't', 'o', 'r', 'e')) { + r->type = MSG_REQ_REDIS_RESTORE; + r->is_read = 0; + break; + } + + if (str7icmp(m, 's', 'l', 'a', 'v', 'e', 'o', 'f')) { + r->type = MSG_REQ_REDIS_SLAVEOF; + r->msg_routing = ROUTING_LOCAL_NODE_ONLY; + r->is_read = 0; + break; + } + if (str7icmp(m, 'p', 'f', 'c', 'o', 'u', 'n', 't')) { + r->type = MSG_REQ_REDIS_PFCOUNT; + r->is_read = 0; + break; + } + if (str7icmp(m, 'g', 'e', 'o', 'h', 'a', 's', 'h')) { + r->type = MSG_REQ_REDIS_GEOHASH; + r->is_read = 1; + break; + } + if (str7icmp(m, 'g', 'e', 'o', 'd', 'i', 's', 't')) { + r->type = MSG_REQ_REDIS_GEODIST; + r->is_read = 1; + break; } break; 
- case SW_ARG3_LEN: - if (r->token == NULL) { - if (ch != '$') { - goto error; - } - r->rlen = 0; - r->token = p; - } else if (isdigit(ch)) { - r->rlen = r->rlen * 10 + (uint32_t)(ch - '0'); - } else if (ch == CR) { - if ((p - r->token) <= 1 || r->rnarg == 0) { - goto error; - } - r->rnarg--; - r->token = NULL; - state = SW_ARG3_LEN_LF; - } else { - goto error; + case 8: + if (str8icmp(m, 'e', 'x', 'p', 'i', 'r', 'e', 'a', 't')) { + r->type = MSG_REQ_REDIS_EXPIREAT; + r->is_read = 0; + break; + } + + if (str8icmp(m, 'b', 'i', 't', 'c', 'o', 'u', 'n', 't')) { + r->type = MSG_REQ_REDIS_BITCOUNT; + r->is_read = 1; + break; + } + + if (str8icmp(m, 'g', 'e', 't', 'r', 'a', 'n', 'g', 'e')) { + r->type = MSG_REQ_REDIS_GETRANGE; + r->is_read = 1; + break; + } + + if (str8icmp(m, 's', 'e', 't', 'r', 'a', 'n', 'g', 'e')) { + r->type = MSG_REQ_REDIS_SETRANGE; + r->is_read = 0; + break; + } + + if (str8icmp(m, 's', 'm', 'e', 'm', 'b', 'e', 'r', 's')) { + r->type = MSG_REQ_REDIS_SMEMBERS; + r->is_read = 1; + break; + } + + if (str8icmp(m, 'z', 'r', 'e', 'v', 'r', 'a', 'n', 'k')) { + r->type = MSG_REQ_REDIS_ZREVRANK; + r->is_read = 1; + break; } break; - case SW_ARG3_LEN_LF: - switch (ch) { - case LF: - state = SW_ARG3; - break; + case 9: + if (str9icmp(m, 'p', 'e', 'x', 'p', 'i', 'r', 'e', 'a', 't')) { + r->type = MSG_REQ_REDIS_PEXPIREAT; + r->is_read = 0; + break; + } - default: - goto error; + if (str9icmp(m, 'r', 'p', 'o', 'p', 'l', 'p', 'u', 's', 'h')) { + r->type = MSG_REQ_REDIS_RPOPLPUSH; + r->is_read = 0; + break; + } + + if (str9icmp(m, 's', 'i', 's', 'm', 'e', 'm', 'b', 'e', 'r')) { + r->type = MSG_REQ_REDIS_SISMEMBER; + r->is_read = 1; + break; + } + + if (str9icmp(m, 'z', 'l', 'e', 'x', 'c', 'o', 'u', 'n', 't')) { + r->type = MSG_REQ_REDIS_ZLEXCOUNT; + r->is_read = 1; + break; + } + + if (str9icmp(m, 'z', 'r', 'e', 'v', 'r', 'a', 'n', 'g', 'e')) { + r->type = MSG_REQ_REDIS_ZREVRANGE; + r->is_read = 1; + break; + } + + if (str9icmp(m, 'g', 'e', 'o', 'r', 'a', 'd', 
'i', 'u', 's')) { + r->type = MSG_REQ_REDIS_GEORADIUS; + r->is_read = 1; + break; } break; - case SW_ARG3: - m = p + r->rlen; - if (m >= b->last) { - r->rlen -= (uint32_t)(b->last - p); - m = b->last - 1; - p = m; - break; + case 10: + if (str10icmp(m, 's', 'd', 'i', 'f', 'f', 's', 't', 'o', 'r', + 'e')) { + r->type = MSG_REQ_REDIS_SDIFFSTORE; + r->is_read = 0; + break; } - if (*m != CR) { - goto error; + break; + + case 11: + if (str11icmp(m, 'i', 'n', 'c', 'r', 'b', 'y', 'f', 'l', 'o', 'a', + 't')) { + r->type = MSG_REQ_REDIS_INCRBYFLOAT; + r->is_read = 0; + break; } - p = m; /* move forward by rlen bytes */ - r->rlen = 0; - state = SW_ARG3_LF; + if (str11icmp(m, 's', 'i', 'n', 't', 'e', 'r', 's', 't', 'o', 'r', + 'e')) { + r->type = MSG_REQ_REDIS_SINTERSTORE; + r->is_read = 0; + break; + } + + if (str11icmp(m, 's', 'r', 'a', 'n', 'd', 'm', 'e', 'm', 'b', 'e', + 'r')) { + r->type = MSG_REQ_REDIS_SRANDMEMBER; + r->is_read = 1; + break; + } + + if (str11icmp(m, 's', 'u', 'n', 'i', 'o', 'n', 's', 't', 'o', 'r', + 'e')) { + r->type = MSG_REQ_REDIS_SUNIONSTORE; + r->is_read = 1; + break; + } + + if (str11icmp(m, 'z', 'i', 'n', 't', 'e', 'r', 's', 't', 'o', 'r', + 'e')) { + r->type = MSG_REQ_REDIS_ZINTERSTORE; + r->is_read = 1; + break; + } + + if (str11icmp(m, 'z', 'r', 'a', 'n', 'g', 'e', 'b', 'y', 'l', 'e', + 'x')) { + r->type = MSG_REQ_REDIS_ZRANGEBYLEX; + r->is_read = 1; + break; + } + + if (str11icmp(m, 'z', 'u', 'n', 'i', 'o', 'n', 's', 't', 'o', 'r', + 'e')) { + r->type = MSG_REQ_REDIS_ZUNIONSTORE; + r->is_read = 1; + break; + } break; - case SW_ARG3_LF: - switch (ch) { - case LF: - if (redis_arg3(r)) { - if (r->rnarg != 0) { - goto error; - } - goto done; - } else if (redis_argn(r)) { - if (r->rnarg == 0) { - goto done; - } - state = SW_ARGN_LEN; - } else { - goto error; - } + case 12: + if (str12icmp(m, 'h', 'i', 'n', 'c', 'r', 'b', 'y', 'f', 'l', 'o', + 'a', 't')) { + r->type = MSG_REQ_REDIS_HINCRBYFLOAT; + r->is_read = 0; + break; + } - break; + break; - 
default: - goto error; + case 13: + if (str13icmp(m, 'z', 'r', 'a', 'n', 'g', 'e', 'b', 'y', 's', 'c', + 'o', 'r', 'e')) { + r->type = MSG_REQ_REDIS_ZRANGEBYSCORE; + r->is_read = 1; + break; } break; - case SW_ARGN_LEN: - if (r->token == NULL) { - if (ch != '$') { - goto error; - } - r->rlen = 0; - r->token = p; - } else if (isdigit(ch)) { - r->rlen = r->rlen * 10 + (uint32_t)(ch - '0'); - } else if (ch == CR) { - if ((p - r->token) <= 1 || r->rnarg == 0) { - goto error; - } - r->rnarg--; - r->token = NULL; - state = SW_ARGN_LEN_LF; - } else { - goto error; + case 14: + if (str14icmp(m, 'z', 'r', 'e', 'm', 'r', 'a', 'n', 'g', 'e', 'b', + 'y', 'l', 'e', 'x')) { + r->type = MSG_REQ_REDIS_ZREMRANGEBYLEX; + r->is_read = 0; + break; + } + + if (str14icmp(m, 'z', 'r', 'e', 'v', 'r', 'a', 'n', 'g', 'e', 'b', + 'y', 'l', 'e', 'x')) { + r->type = MSG_REQ_REDIS_ZREVRANGEBYLEX; + r->is_read = 1; + break; } break; - case SW_ARGN_LEN_LF: - switch (ch) { - case LF: - state = SW_ARGN; - break; + case 15: + if (str15icmp(m, 'z', 'r', 'e', 'm', 'r', 'a', 'n', 'g', 'e', 'b', + 'y', 'r', 'a', 'n', 'k')) { + r->type = MSG_REQ_REDIS_ZREMRANGEBYRANK; + r->is_read = 0; + break; + } + + break; + + case 16: + if (str16icmp(m, 'z', 'r', 'e', 'm', 'r', 'a', 'n', 'g', 'e', 'b', + 'y', 's', 'c', 'o', 'r', 'e')) { + r->type = MSG_REQ_REDIS_ZREMRANGEBYSCORE; + r->is_read = 0; + break; + } + + if (str16icmp(m, 'z', 'r', 'e', 'v', 'r', 'a', 'n', 'g', 'e', 'b', + 'y', 's', 'c', 'o', 'r', 'e')) { + r->type = MSG_REQ_REDIS_ZREVRANGEBYSCORE; + r->is_read = 1; + break; + } + + break; + + case 17: + if (str17icmp(m, 'g', 'e', 'o', 'r', 'a', 'd', 'i', 'u', 's', 'b', + 'y', 'm', 'e', 'm', 'b', 'e', 'r')) { + r->type = MSG_REQ_REDIS_GEORADIUSBYMEMBER; + r->is_read = 1; + break; + } + + break; + default: + r->is_read = 1; + break; + } + + if (r->type == MSG_UNKNOWN) { + log_error("parsed unsupported command '%.*s'", p - m, m); + goto error; + } + + log_debug(LOG_VERB, "parsed command '%.*s'", p - m, m); - 
default: + state = SW_REQ_TYPE_LF; + break; + + case SW_REQ_TYPE_LF: + switch (ch) { + case LF: + if (redis_argz(r) && (r->rnarg == 0)) { + goto done; + } else if (redis_arg_upto1(r) && r->rnarg == 0) { + goto done; + } else if (redis_arg_upto1(r) && r->rnarg == 1) { + state = SW_ARG1_LEN; + } else if (redis_argeval(r)) { + state = SW_ARG1_LEN; + } else { + state = SW_KEY_LEN; + } + break; + + default: + goto error; + } + + break; + + case SW_KEY_LEN: + if (r->token == NULL) { + if (ch != '$') { + goto error; + } + r->token = p; + r->rlen = 0; + } else if (isdigit(ch)) { + r->rlen = r->rlen * 10 + (uint32_t)(ch - '0'); + } else if (ch == CR) { + if (r->rlen == 0) { + log_error("parsed bad req %" PRIu64 + " of type %d with empty " + "key", + r->id, r->type); + goto error; + } + if (r->rlen >= mbuf_data_size()) { + log_error("parsed bad req %" PRIu64 + " of type %d with key " + "length %d that greater than or equal to maximum" + " redis key length of %d", + r->id, r->type, r->rlen, mbuf_data_size()); + goto error; + } + if (r->rnarg == 0) { + goto error; + } + r->rnarg--; + r->token = NULL; + state = SW_KEY_LEN_LF; + } else { + goto error; + } + + break; + + case SW_KEY_LEN_LF: + switch (ch) { + case LF: + state = SW_KEY; + break; + + default: + goto error; + } + + break; + + case SW_KEY: + if (r->token == NULL) { + r->token = p; + } + + m = r->token + r->rlen; + if (m >= b->last) { + m = b->last - 1; + p = m; + break; + } + + if (*m != CR) { + goto error; + } else { + struct keypos *kpos; + + p = m; /* move forward by rlen bytes */ + r->rlen = 0; + m = r->token; + r->token = NULL; + + kpos = array_push(r->keys); + if (kpos == NULL) { + goto enomem; + } + kpos->start = kpos->tag_start = m; + kpos->end = kpos->tag_end = p; + if (!string_empty(hash_tag)) { + uint8_t *tag_start, *tag_end; + + tag_start = dn_strchr(kpos->start, kpos->end, hash_tag->data[0]); + if (tag_start != NULL) { + tag_end = dn_strchr(tag_start + 1, kpos->end, hash_tag->data[1]); + if (tag_end != 
NULL) { + kpos->tag_start = tag_start + 1; + kpos->tag_end = tag_end; + } + } + } + state = SW_KEY_LF; + } + + break; + + case SW_KEY_LF: + switch (ch) { + case LF: + if (redis_arg0(r)) { + if (r->rnarg != 0) { + goto error; + } + goto done; + } else if (redis_arg1(r)) { + if (r->rnarg != 1) { + goto error; + } + state = SW_ARG1_LEN; + } else if (redis_arg2(r)) { + if (r->rnarg != 2) { + goto error; + } + state = SW_ARG1_LEN; + } else if (redis_arg3(r)) { + if (r->rnarg != 3) { + goto error; + } + state = SW_ARG1_LEN; + } else if (redis_argn(r)) { + if (r->rnarg == 0) { + goto done; + } + state = SW_ARG1_LEN; + } else if (redis_argx(r)) { + if (r->rnarg == 0) { + goto done; + } + state = SW_KEY_LEN; + } else if (redis_argkvx(r)) { + if (r->narg % 2 == 0) { goto error; + } + state = SW_ARG1_LEN; + } else if (redis_argeval(r)) { + r->nkeys--; + if (r->nkeys > 0) { + // if there are more keys pending, parse them + state = SW_KEY_LEN; + } else if (r->rnarg > 0) { + // we finished parsing keys, now start with args + state = SW_ARGN_LEN; + } else { + // no more args left, we are done + goto done; + } + } else { + goto error; } break; - case SW_ARGN: - m = p + r->rlen; - if (m >= b->last) { - r->rlen -= (uint32_t)(b->last - p); - m = b->last - 1; - p = m; - break; + default: + goto error; + } + + break; + + case SW_ARG1_LEN: + if (r->token == NULL) { + if (ch != '$') { + goto error; + } + r->rlen = 0; + r->token = p; + } else if (isdigit(ch)) { + r->rlen = r->rlen * 10 + (uint32_t)(ch - '0'); + } else if (ch == CR) { + if ((p - r->token) <= 1 || r->rnarg == 0) { + goto error; + } + r->rnarg--; + r->token = NULL; + state = SW_ARG1_LEN_LF; + } else { + goto error; + } + + break; + + case SW_ARG1_LEN_LF: + switch (ch) { + case LF: + state = SW_ARG1; + break; + + default: + goto error; + } + + break; + + case SW_ARG1: + + if (r->type == MSG_REQ_REDIS_CONFIG && !str3icmp(m, 'g', 'e', 't')) { + log_error("Redis CONFIG command not supported '%.*s'", p - m, m); + goto error; + } 
+ m = p + r->rlen; + + if (m >= b->last) { + r->rlen -= (uint32_t)(b->last - p); + m = b->last - 1; + p = m; + break; + } + + if (*m != CR) { + goto error; + } + + p = m; /* move forward by rlen bytes */ + r->rlen = 0; + + state = SW_ARG1_LF; + + break; + + case SW_ARG1_LF: + switch (ch) { + case LF: + if (redis_arg_upto1(r) || redis_arg1(r)) { + if (r->rnarg != 0) { + goto error; + } + goto done; + } else if (redis_arg2(r)) { + if (r->rnarg != 1) { + goto error; + } + state = SW_ARG2_LEN; + } else if (redis_arg3(r)) { + if (r->rnarg != 2) { + goto error; + } + state = SW_ARG2_LEN; + } else if (redis_argn(r)) { + if (r->rnarg == 0) { + goto done; + } + state = SW_ARGN_LEN; + } else if (redis_argeval(r)) { + if (r->rnarg < 2) { + log_error("Dynomite EVAL/EVALSHA requires at least 1 key"); + goto error; + } + state = SW_ARG2_LEN; + } else if (redis_argkvx(r)) { + if (r->rnarg == 0) { + goto done; + } + state = SW_KEY_LEN; + } else { + goto error; } - if (*m != CR) { + break; + + default: + goto error; + } + + break; + + case SW_ARG2_LEN: + if (r->token == NULL) { + if (ch != '$') { + goto error; + } + r->rlen = 0; + r->token = p; + } else if (isdigit(ch)) { + r->rlen = r->rlen * 10 + (uint32_t)(ch - '0'); + } else if (ch == CR) { + if ((p - r->token) <= 1 || r->rnarg == 0) { + goto error; + } + r->rnarg--; + r->token = NULL; + state = SW_ARG2_LEN_LF; + } else { + goto error; + } + + break; + + case SW_ARG2_LEN_LF: + switch (ch) { + case LF: + state = SW_ARG2; + break; + + default: + goto error; + } + + break; + + case SW_ARG2: + if (r->token == NULL && redis_argeval(r)) { + /* + * For EVAL/EVALSHA, ARG2 represents the # key/arg pairs which must + * be tokenized and stored in contiguous memory. 
+ */ + r->token = p; + } + + m = p + r->rlen; + if (m >= b->last) { + r->rlen -= (uint32_t)(b->last - p); + m = b->last - 1; + p = m; + break; + } + + if (*m != CR) { + goto error; + } + + p = m; /* move forward by rlen bytes */ + r->rlen = 0; + + if (redis_argeval(r)) { + uint32_t nkey; + uint8_t *chp; + /* + * For EVAL/EVALSHA, we need to find the integer value of this + * argument. It tells us the number of keys in the script, and + * we need to error out if number of keys is 0. At this point, + * both p and m point to the end of the argument and r->token + * points to the start. + */ + if (p - r->token < 1) { + goto error; + } + + for (nkey = 0, chp = r->token; chp < p; chp++) { + if (isdigit(*chp)) { + nkey = nkey * 10 + (uint32_t)(*chp - '0'); + } else { + goto error; + } + } + + if (nkey == 0) { + log_error("EVAL/EVALSHA requires atleast 1 key"); + goto error; + } + if (r->rnarg < nkey) { + log_error("EVAL/EVALSHA Not all keys provided: expecting %u", nkey); + goto error; + } + r->nkeys = nkey; + r->token = NULL; + } + + state = SW_ARG2_LF; + + break; + + case SW_ARG2_LF: + switch (ch) { + case LF: + if (redis_arg2(r)) { + if (r->rnarg != 0) { + goto error; + } + goto done; + } else if (redis_arg3(r)) { + if (r->rnarg != 1) { + goto error; + } + state = SW_ARG3_LEN; + } else if (redis_argn(r)) { + if (r->rnarg == 0) { + goto done; + } + state = SW_ARGN_LEN; + } else if (redis_argeval(r)) { + if (r->rnarg < 1) { goto error; + } + state = SW_KEY_LEN; + } else { + goto error; } - p = m; /* move forward by rlen bytes */ - r->rlen = 0; - state = SW_ARGN_LF; + break; + default: + goto error; + } + + break; + + case SW_ARG3_LEN: + if (r->token == NULL) { + if (ch != '$') { + goto error; + } + r->rlen = 0; + r->token = p; + } else if (isdigit(ch)) { + r->rlen = r->rlen * 10 + (uint32_t)(ch - '0'); + } else if (ch == CR) { + if ((p - r->token) <= 1 || r->rnarg == 0) { + goto error; + } + r->rnarg--; + r->token = NULL; + state = SW_ARG3_LEN_LF; + } else { + goto 
error; + } + + break; + + case SW_ARG3_LEN_LF: + switch (ch) { + case LF: + state = SW_ARG3; break; - case SW_ARGN_LF: - switch (ch) { - case LF: - if (redis_argn(r) || redis_argeval(r)) { - if (r->rnarg == 0) { - goto done; - } - state = SW_ARGN_LEN; - } else { - goto error; - } + default: + goto error; + } - break; + break; + + case SW_ARG3: + m = p + r->rlen; + if (m >= b->last) { + r->rlen -= (uint32_t)(b->last - p); + m = b->last - 1; + p = m; + break; + } + + if (*m != CR) { + goto error; + } + + p = m; /* move forward by rlen bytes */ + r->rlen = 0; + state = SW_ARG3_LF; + + break; - default: + case SW_ARG3_LF: + switch (ch) { + case LF: + if (redis_arg3(r)) { + if (r->rnarg != 0) { goto error; + } + goto done; + } else if (redis_argn(r)) { + if (r->rnarg == 0) { + goto done; + } + state = SW_ARGN_LEN; + } else { + goto error; } break; - case SW_SENTINEL: - default: - NOT_REACHED(); + default: + goto error; + } + + break; + + case SW_ARGN_LEN: + if (r->token == NULL) { + if (ch != '$') { + goto error; + } + r->rlen = 0; + r->token = p; + } else if (isdigit(ch)) { + r->rlen = r->rlen * 10 + (uint32_t)(ch - '0'); + } else if (ch == CR) { + if ((p - r->token) <= 1 || r->rnarg == 0) { + goto error; + } + r->rnarg--; + r->token = NULL; + state = SW_ARGN_LEN_LF; + } else { + goto error; + } + + break; + + case SW_ARGN_LEN_LF: + switch (ch) { + case LF: + state = SW_ARGN; break; + + default: + goto error; } - } - //ASSERT(p == b->last); - r->pos = p; - r->state = state; + break; - if (b->last == b->end && r->token != NULL) { - r->pos = r->token; - r->token = NULL; - r->result = MSG_PARSE_REPAIR; - } else { - r->result = MSG_PARSE_AGAIN; + case SW_ARGN: + m = p + r->rlen; + if (m >= b->last) { + r->rlen -= (uint32_t)(b->last - p); + m = b->last - 1; + p = m; + break; + } + + if (*m != CR) { + goto error; + } + + p = m; /* move forward by rlen bytes */ + r->rlen = 0; + state = SW_ARGN_LF; + + break; + + case SW_ARGN_LF: + switch (ch) { + case LF: + if (redis_argn(r) 
|| redis_argeval(r)) { + if (r->rnarg == 0) { + goto done; + } + state = SW_ARGN_LEN; + } else { + goto error; + } + + break; + + default: + goto error; + } + + break; + + case SW_SENTINEL: + default: + NOT_REACHED(); + break; } + } - log_hexdump(LOG_VERB, b->pos, mbuf_length(b), "parsed req %"PRIu64" res %d " - "type %d state %d rpos %d of %d", r->id, r->result, r->type, - r->state, r->pos - b->pos, b->last - b->pos); - return; + // ASSERT(p == b->last); + r->pos = p; + r->state = state; -done: - ASSERT(r->type > MSG_UNKNOWN && r->type < MSG_SENTINEL); - r->pos = p + 1; - ASSERT(r->pos <= b->last); - r->state = SW_START; + if (b->last == b->end && r->token != NULL) { + r->pos = r->token; r->token = NULL; - r->result = MSG_PARSE_OK; + r->result = MSG_PARSE_REPAIR; + } else { + r->result = MSG_PARSE_AGAIN; + } + + log_hexdump(LOG_VERB, b->pos, mbuf_length(b), + "parsed req %" PRIu64 + " res %d " + "type %d state %d rpos %d of %d", + r->id, r->result, r->type, r->state, r->pos - b->pos, + b->last - b->pos); + return; - log_hexdump(LOG_VERB, b->pos, mbuf_length(b), "parsed req %"PRIu64" res %d " - "type %d state %d rpos %d of %d", r->id, r->result, r->type, - r->state, r->pos - b->pos, b->last - b->pos); - return; +done: + ASSERT(r->type > MSG_UNKNOWN && r->type < MSG_SENTINEL); + r->pos = p + 1; + ASSERT(r->pos <= b->last); + r->state = SW_START; + r->token = NULL; + r->result = MSG_PARSE_OK; + + log_hexdump(LOG_VERB, b->pos, mbuf_length(b), + "parsed req %" PRIu64 + " res %d " + "type %d state %d rpos %d of %d", + r->id, r->result, r->type, r->state, r->pos - b->pos, + b->last - b->pos); + return; enomem: - r->result = MSG_PARSE_ERROR; - r->state = state; - log_hexdump(LOG_ERR, b->pos, mbuf_length(b), "out of memory on parse req %"PRIu64" " - "res %d type %d state %d", r->id, r->result, r->type, r->state); + r->result = MSG_PARSE_ERROR; + r->state = state; + log_hexdump(LOG_ERR, b->pos, mbuf_length(b), + "out of memory on parse req %" PRIu64 + " " + "res %d type %d 
state %d", + r->id, r->result, r->type, r->state); - return; + return; error: - r->result = MSG_PARSE_ERROR; - r->state = state; - errno = EINVAL; - - log_hexdump(LOG_WARN, b->pos, mbuf_length(b), "parsed bad req %"PRIu64" " - "res %d type %d state %d", r->id, r->result, r->type, - r->state); - + r->result = MSG_PARSE_ERROR; + r->state = state; + errno = EINVAL; + + log_hexdump(LOG_WARN, b->pos, mbuf_length(b), + "parsed bad req %" PRIu64 + " " + "res %d type %d state %d", + r->id, r->result, r->type, r->state); } /* @@ -2062,541 +2067,557 @@ redis_parse_req(struct msg *r, const struct string *hash_tag) * strings (bulks) with the initial line indicating how many bulks that * will follow. The first byte of a multi bulk reply is always *. */ -void -redis_parse_rsp(struct msg *r, const struct string *UNUSED) -{ - struct mbuf *b; - uint8_t *p, *m; - uint8_t ch; - - enum { - SW_START, - SW_STATUS, - SW_ERROR, - SW_INTEGER, - SW_INTEGER_START, - SW_SIMPLE, - SW_BULK, - SW_BULK_LF, - SW_BULK_ARG, - SW_BULK_ARG_LF, - SW_MULTIBULK, - SW_MULTIBULK_NARG_LF, - SW_MULTIBULK_ARGN_LEN, - SW_MULTIBULK_ARGN_LEN_LF, - SW_MULTIBULK_ARGN, - SW_MULTIBULK_ARGN_LF, - SW_RUNTO_CRLF, - SW_ALMOST_DONE, - SW_SENTINEL - } state; - - state = r->state; - b = STAILQ_LAST(&r->mhdr, mbuf, next); - - ASSERT(!r->is_request); - ASSERT(state >= SW_START && state < SW_SENTINEL); - ASSERT(b != NULL); - ASSERT(b->pos <= b->last); - - /* validate the parsing marker */ - ASSERT(r->pos != NULL); - ASSERT(r->pos >= b->pos && r->pos <= b->last); - - for (p = r->pos; p < b->last; p++) { - ch = *p; - - switch (state) { - case SW_START: - r->type = MSG_UNKNOWN; - switch (ch) { - case '+': - p = p - 1; /* go back by 1 byte */ - r->type = MSG_RSP_REDIS_STATUS; - state = SW_STATUS; +void redis_parse_rsp(struct msg *r, const struct string *UNUSED) { + struct mbuf *b; + uint8_t *p, *m; + uint8_t ch; + + enum { + SW_START, + SW_STATUS, + SW_ERROR, + SW_INTEGER, + SW_INTEGER_START, + SW_SIMPLE, + SW_BULK, + SW_BULK_LF, 
+ SW_BULK_ARG, + SW_BULK_ARG_LF, + SW_MULTIBULK, + SW_MULTIBULK_NARG_LF, + SW_MULTIBULK_ARGN_LEN, + SW_MULTIBULK_ARGN_LEN_LF, + SW_MULTIBULK_ARGN, + SW_MULTIBULK_ARGN_LF, + SW_RUNTO_CRLF, + SW_ALMOST_DONE, + SW_SENTINEL + } state; + + state = r->state; + b = STAILQ_LAST(&r->mhdr, mbuf, next); + + ASSERT(!r->is_request); + ASSERT(state >= SW_START && state < SW_SENTINEL); + ASSERT(b != NULL); + ASSERT(b->pos <= b->last); + + /* validate the parsing marker */ + ASSERT(r->pos != NULL); + ASSERT(r->pos >= b->pos && r->pos <= b->last); + + for (p = r->pos; p < b->last; p++) { + ch = *p; + + switch (state) { + case SW_START: + r->type = MSG_UNKNOWN; + switch (ch) { + case '+': + p = p - 1; /* go back by 1 byte */ + r->type = MSG_RSP_REDIS_STATUS; + state = SW_STATUS; + break; + + case '-': + r->type = MSG_RSP_REDIS_ERROR; + p = p - 1; /* go back by 1 byte */ + state = SW_ERROR; + break; + + case ':': + r->type = MSG_RSP_REDIS_INTEGER; + p = p - 1; /* go back by 1 byte */ + state = SW_INTEGER; + break; + + case '$': + r->type = MSG_RSP_REDIS_BULK; + p = p - 1; /* go back by 1 byte */ + state = SW_BULK; + break; + + case '*': + r->type = MSG_RSP_REDIS_MULTIBULK; + p = p - 1; /* go back by 1 byte */ + state = SW_MULTIBULK; + break; + + default: + goto error; + } + + break; + + case SW_STATUS: + /* rsp_start <- p */ + state = SW_RUNTO_CRLF; + break; + + case SW_ERROR: + if (r->token == NULL) { + if (ch != '-') { + goto error; + } + /* rsp_start <- p */ + r->token = p; + } + if (ch == ' ' || ch == CR) { + m = r->token; + r->token = NULL; + switch (p - m) { + case 4: + /* + * -ERR no such key\r\n + * -ERR syntax error\r\n + * -ERR source and destination objects are the same\r\n + * -ERR index out of range\r\n + */ + if (str4cmp(m, '-', 'E', 'R', 'R')) { + r->type = MSG_RSP_REDIS_ERROR_ERR; break; + } + + /* -OOM command not allowed when used memory > 'maxmemory'.\r\n */ + if (str4cmp(m, '-', 'O', 'O', 'M')) { + r->type = MSG_RSP_REDIS_ERROR_OOM; + break; + } + + break; - case 
'-': - r->type = MSG_RSP_REDIS_ERROR; - p = p - 1; /* go back by 1 byte */ - state = SW_ERROR; + case 5: + /* -BUSY Redis is busy running a script. You can only call SCRIPT + * KILL or SHUTDOWN NOSAVE.\r\n" */ + if (str5cmp(m, '-', 'B', 'U', 'S', 'Y')) { + r->type = MSG_RSP_REDIS_ERROR_BUSY; break; + } + + break; - case ':': - r->type = MSG_RSP_REDIS_INTEGER; - p = p - 1; /* go back by 1 byte */ - state = SW_INTEGER; + case 7: + /* -NOAUTH Authentication required.\r\n */ + if (str7cmp(m, '-', 'N', 'O', 'A', 'U', 'T', 'H')) { + r->type = MSG_RSP_REDIS_ERROR_NOAUTH; break; + } + + break; - case '$': - r->type = MSG_RSP_REDIS_BULK; - p = p - 1; /* go back by 1 byte */ - state = SW_BULK; + case 8: + /* rsp: "-LOADING Redis is loading the dataset in memory\r\n" */ + if (str8cmp(m, '-', 'L', 'O', 'A', 'D', 'I', 'N', 'G')) { + r->type = MSG_RSP_REDIS_ERROR_LOADING; break; + } - case '*': - r->type = MSG_RSP_REDIS_MULTIBULK; - p = p - 1; /* go back by 1 byte */ - state = SW_MULTIBULK; + /* -BUSYKEY Target key name already exists.\r\n */ + if (str8cmp(m, '-', 'B', 'U', 'S', 'Y', 'K', 'E', 'Y')) { + r->type = MSG_RSP_REDIS_ERROR_BUSYKEY; + break; + } + + /* "-MISCONF Redis is configured to save RDB snapshots, but is + * currently not able to persist on disk. Commands that may modify + * the data set are disabled. Please check Redis logs for details + * about the error.\r\n" */ + if (str8cmp(m, '-', 'M', 'I', 'S', 'C', 'O', 'N', 'F')) { + r->type = MSG_RSP_REDIS_ERROR_MISCONF; break; + } - default: - goto error; - } + break; - break; + case 9: + /* -NOSCRIPT No matching script. 
Please use EVAL.\r\n */ + if (str9cmp(m, '-', 'N', 'O', 'S', 'C', 'R', 'I', 'P', 'T')) { + r->type = MSG_RSP_REDIS_ERROR_NOSCRIPT; + break; + } - case SW_STATUS: - /* rsp_start <- p */ - state = SW_RUNTO_CRLF; - break; + /* -READONLY You can't write against a read only slave.\r\n */ + if (str9cmp(m, '-', 'R', 'E', 'A', 'D', 'O', 'N', 'L', 'Y')) { + r->type = MSG_RSP_REDIS_ERROR_READONLY; + break; + } + + break; - case SW_ERROR: - if (r->token == NULL) { - if (ch != '-') { - goto error; - } - /* rsp_start <- p */ - r->token = p; - } - if (ch == ' ' || ch == CR) { - m = r->token; - r->token = NULL; - switch (p - m) { - - case 4: - /* - * -ERR no such key\r\n - * -ERR syntax error\r\n - * -ERR source and destination objects are the same\r\n - * -ERR index out of range\r\n - */ - if (str4cmp(m, '-', 'E', 'R', 'R')) { - r->type = MSG_RSP_REDIS_ERROR_ERR; - break; - } - - /* -OOM command not allowed when used memory > 'maxmemory'.\r\n */ - if (str4cmp(m, '-', 'O', 'O', 'M')) { - r->type = MSG_RSP_REDIS_ERROR_OOM; - break; - } - - break; - - case 5: - /* -BUSY Redis is busy running a script. You can only call SCRIPT KILL or SHUTDOWN NOSAVE.\r\n" */ - if (str5cmp(m, '-', 'B', 'U', 'S', 'Y')) { - r->type = MSG_RSP_REDIS_ERROR_BUSY; - break; - } - - break; - - case 7: - /* -NOAUTH Authentication required.\r\n */ - if (str7cmp(m, '-', 'N', 'O', 'A', 'U', 'T', 'H')) { - r->type = MSG_RSP_REDIS_ERROR_NOAUTH; - break; - } - - break; - - case 8: - /* rsp: "-LOADING Redis is loading the dataset in memory\r\n" */ - if (str8cmp(m, '-', 'L', 'O', 'A', 'D', 'I', 'N', 'G')) { - r->type = MSG_RSP_REDIS_ERROR_LOADING; - break; - } - - /* -BUSYKEY Target key name already exists.\r\n */ - if (str8cmp(m, '-', 'B', 'U', 'S', 'Y', 'K', 'E', 'Y')) { - r->type = MSG_RSP_REDIS_ERROR_BUSYKEY; - break; - } - - /* "-MISCONF Redis is configured to save RDB snapshots, but is currently not able to persist on disk. Commands that may modify the data set are disabled. 
Please check Redis logs for details about the error.\r\n" */ - if (str8cmp(m, '-', 'M', 'I', 'S', 'C', 'O', 'N', 'F')) { - r->type = MSG_RSP_REDIS_ERROR_MISCONF; - break; - } - - break; - - case 9: - /* -NOSCRIPT No matching script. Please use EVAL.\r\n */ - if (str9cmp(m, '-', 'N', 'O', 'S', 'C', 'R', 'I', 'P', 'T')) { - r->type = MSG_RSP_REDIS_ERROR_NOSCRIPT; - break; - } - - /* -READONLY You can't write against a read only slave.\r\n */ - if (str9cmp(m, '-', 'R', 'E', 'A', 'D', 'O', 'N', 'L', 'Y')) { - r->type = MSG_RSP_REDIS_ERROR_READONLY; - break; - } - - break; - - case 10: - /* -WRONGTYPE Operation against a key holding the wrong kind of value\r\n */ - if (str10cmp(m, '-', 'W', 'R', 'O', 'N', 'G', 'T', 'Y', 'P', 'E')) { - r->type = MSG_RSP_REDIS_ERROR_WRONGTYPE; - break; - } - - /* -EXECABORT Transaction discarded because of previous errors.\r\n" */ - if (str10cmp(m, '-', 'E', 'X', 'E', 'C', 'A', 'B', 'O', 'R', 'T')) { - r->type = MSG_RSP_REDIS_ERROR_EXECABORT; - break; - } + case 10: + /* -WRONGTYPE Operation against a key holding the wrong kind of + * value\r\n */ + if (str10cmp(m, '-', 'W', 'R', 'O', 'N', 'G', 'T', 'Y', 'P', + 'E')) { + r->type = MSG_RSP_REDIS_ERROR_WRONGTYPE; + break; + } + /* -EXECABORT Transaction discarded because of previous + * errors.\r\n" */ + if (str10cmp(m, '-', 'E', 'X', 'E', 'C', 'A', 'B', 'O', 'R', + 'T')) { + r->type = MSG_RSP_REDIS_ERROR_EXECABORT; break; + } - case 11: - /* -MASTERDOWN Link with MASTER is down and slave-serve-stale-data is set to 'no'.\r\n */ - if (str11cmp(m, '-', 'M', 'A', 'S', 'T', 'E', 'R', 'D', 'O', 'W', 'N')) { - r->type = MSG_RSP_REDIS_ERROR_MASTERDOWN; - break; - } + break; - /* -NOREPLICAS Not enough good slaves to write.\r\n */ - if (str11cmp(m, '-', 'N', 'O', 'R', 'E', 'P', 'L', 'I', 'C', 'A', 'S')) { - r->type = MSG_RSP_REDIS_ERROR_NOREPLICAS; - break; - } + case 11: + /* -MASTERDOWN Link with MASTER is down and slave-serve-stale-data + * is set to 'no'.\r\n */ + if (str11cmp(m, '-', 'M', 'A', 
'S', 'T', 'E', 'R', 'D', 'O', 'W', + 'N')) { + r->type = MSG_RSP_REDIS_ERROR_MASTERDOWN; + break; + } - break; - } - state = SW_RUNTO_CRLF; - } - break; + /* -NOREPLICAS Not enough good slaves to write.\r\n */ + if (str11cmp(m, '-', 'N', 'O', 'R', 'E', 'P', 'L', 'I', 'C', 'A', + 'S')) { + r->type = MSG_RSP_REDIS_ERROR_NOREPLICAS; + break; + } - case SW_INTEGER: - /* rsp_start <- p */ - state = SW_INTEGER_START; - r->integer = 0; - break; + break; + } + state = SW_RUNTO_CRLF; + } + break; - case SW_SIMPLE: - if (ch == CR) { - state = SW_MULTIBULK_ARGN_LF; - r->rnarg--; - } - break; - - case SW_INTEGER_START: - if (ch == CR) { - state = SW_ALMOST_DONE; - } else if (ch == '-') { - ; - } else if (isdigit(ch)) { - r->integer = r->integer * 10 + (uint32_t)(ch - '0'); - } else { - goto error; - } + case SW_INTEGER: + /* rsp_start <- p */ + state = SW_INTEGER_START; + r->integer = 0; + break; + + case SW_SIMPLE: + if (ch == CR) { + state = SW_MULTIBULK_ARGN_LF; + r->rnarg--; + } + break; + + case SW_INTEGER_START: + if (ch == CR) { + state = SW_ALMOST_DONE; + } else if (ch == '-') { + ; + } else if (isdigit(ch)) { + r->integer = r->integer * 10 + (uint32_t)(ch - '0'); + } else { + goto error; + } + break; + + case SW_RUNTO_CRLF: + switch (ch) { + case CR: + state = SW_ALMOST_DONE; break; - case SW_RUNTO_CRLF: - switch (ch) { - case CR: - state = SW_ALMOST_DONE; - break; + default: + break; + } - default: - break; - } + break; - break; + case SW_ALMOST_DONE: + switch (ch) { + case LF: + /* rsp_end <- p */ + goto done; - case SW_ALMOST_DONE: - switch (ch) { - case LF: - /* rsp_end <- p */ - goto done; + default: + goto error; + } - default: - goto error; - } + break; - break; + case SW_BULK: + if (r->token == NULL) { + if (ch != '$') { + goto error; + } + /* rsp_start <- p */ + r->token = p; + r->rlen = 0; + } else if (ch == '-') { + /* handles null bulk reply = '$-1' */ + state = SW_RUNTO_CRLF; + } else if (isdigit(ch)) { + r->rlen = r->rlen * 10 + (uint32_t)(ch - '0'); + } 
else if (ch == CR) { + if ((p - r->token) <= 1) { + goto error; + } + r->token = NULL; + state = SW_BULK_LF; + } else { + goto error; + } - case SW_BULK: - if (r->token == NULL) { - if (ch != '$') { - goto error; - } - /* rsp_start <- p */ - r->token = p; - r->rlen = 0; - } else if (ch == '-') { - /* handles null bulk reply = '$-1' */ - state = SW_RUNTO_CRLF; - } else if (isdigit(ch)) { - r->rlen = r->rlen * 10 + (uint32_t)(ch - '0'); - } else if (ch == CR) { - if ((p - r->token) <= 1) { - goto error; - } - r->token = NULL; - state = SW_BULK_LF; - } else { - goto error; - } + break; + case SW_BULK_LF: + switch (ch) { + case LF: + state = SW_BULK_ARG; break; - case SW_BULK_LF: - switch (ch) { - case LF: - state = SW_BULK_ARG; - break; + default: + goto error; + } - default: - goto error; - } + break; - break; + case SW_BULK_ARG: + m = p + r->rlen; + if (m >= b->last) { + r->rlen -= (uint32_t)(b->last - p); + m = b->last - 1; + p = m; + break; + } - case SW_BULK_ARG: - m = p + r->rlen; - if (m >= b->last) { - r->rlen -= (uint32_t)(b->last - p); - m = b->last - 1; - p = m; - break; - } + if (*m != CR) { + goto error; + } - if (*m != CR) { - goto error; - } + p = m; /* move forward by rlen bytes */ + r->rlen = 0; - p = m; /* move forward by rlen bytes */ - r->rlen = 0; + state = SW_BULK_ARG_LF; - state = SW_BULK_ARG_LF; + break; - break; + case SW_BULK_ARG_LF: + switch (ch) { + case LF: + goto done; - case SW_BULK_ARG_LF: - switch (ch) { - case LF: - goto done; + default: + goto error; + } - default: - goto error; - } + break; - break; + case SW_MULTIBULK: + if (r->token == NULL) { + if (ch != '*') { + goto error; + } + r->token = p; + /* rsp_start <- p */ + r->narg_start = p; + r->rnarg = 0; + } else if (ch == '-') { + state = SW_RUNTO_CRLF; + } else if (isdigit(ch)) { + r->rnarg = r->rnarg * 10 + (uint32_t)(ch - '0'); + } else if (ch == CR) { + if ((p - r->token) <= 1) { + goto error; + } + + r->narg = r->rnarg; + r->narg_end = p; + r->token = NULL; + state = 
SW_MULTIBULK_NARG_LF; + } else { + goto error; + } - case SW_MULTIBULK: - if (r->token == NULL) { - if (ch != '*') { - goto error; - } - r->token = p; - /* rsp_start <- p */ - r->narg_start = p; - r->rnarg = 0; - } else if (ch == '-') { - state = SW_RUNTO_CRLF; - } else if (isdigit(ch)) { - r->rnarg = r->rnarg * 10 + (uint32_t)(ch - '0'); - } else if (ch == CR) { - if ((p - r->token) <= 1) { - goto error; - } - - r->narg = r->rnarg; - r->narg_end = p; - r->token = NULL; - state = SW_MULTIBULK_NARG_LF; - } else { - goto error; - } + break; + case SW_MULTIBULK_NARG_LF: + switch (ch) { + case LF: + if (r->rnarg == 0) { + /* response is '*0\r\n' */ + goto done; + } + state = SW_MULTIBULK_ARGN_LEN; break; - case SW_MULTIBULK_NARG_LF: - switch (ch) { - case LF: - if (r->rnarg == 0) { - /* response is '*0\r\n' */ - goto done; - } - state = SW_MULTIBULK_ARGN_LEN; - break; + default: + goto error; + } - default: - goto error; - } + break; + case SW_MULTIBULK_ARGN_LEN: + if (r->token == NULL) { + /* + * From: http://redis.io/topics/protocol, a multi bulk reply + * is used to return an array of other replies. Every element + * of a multi bulk reply can be of any kind, including a + * nested multi bulk reply. + * + * Here, we only handle a multi bulk reply element that + * are either integer reply or bulk reply. + * + * there is a special case for sscan/hscan/zscan, these command + * replay a nested multi-bulk with a number and a multi bulk like + * this: + * + * - mulit-bulk + * - cursor + * - mulit-bulk + * - val1 + * - val2 + * - val3 + * + * in this case, there is only one sub-multi-bulk, + * and it's the last element of parent, + * we can handle it like tail-recursive. + * + */ + if (ch == '*') { /* for sscan/hscan/zscan only */ + p = p - 1; /* go back by 1 byte */ + state = SW_MULTIBULK; break; + } - case SW_MULTIBULK_ARGN_LEN: - if (r->token == NULL) { - /* - * From: http://redis.io/topics/protocol, a multi bulk reply - * is used to return an array of other replies. 
Every element - * of a multi bulk reply can be of any kind, including a - * nested multi bulk reply. - * - * Here, we only handle a multi bulk reply element that - * are either integer reply or bulk reply. - * - * there is a special case for sscan/hscan/zscan, these command - * replay a nested multi-bulk with a number and a multi bulk like this: - * - * - mulit-bulk - * - cursor - * - mulit-bulk - * - val1 - * - val2 - * - val3 - * - * in this case, there is only one sub-multi-bulk, - * and it's the last element of parent, - * we can handle it like tail-recursive. - * - */ - if (ch == '*') { /* for sscan/hscan/zscan only */ - p = p - 1; /* go back by 1 byte */ - state = SW_MULTIBULK; - break; - } - - if (ch == ':' || ch == '+' || ch == '-') { - /* handles not-found reply = '$-1' or integer reply = ':' */ - /* and *2\r\n$2\r\nr0\r\n+OK\r\n or *1\r\n+OK\r\n */ - state = SW_SIMPLE; - break; - } - - if (ch != '$') { - goto error; - } - - r->token = p; - r->rlen = 0; - } else if (isdigit(ch)) { - r->rlen = r->rlen * 10 + (uint32_t)(ch - '0'); - } else if (ch == '-') { - ; - } else if (ch == CR) { - if ((p - r->token) <= 1 || r->rnarg == 0) { - goto error; - } - - if ((r->rlen == 1 && (p - r->token) == 3)) { - r->rlen = 0; - state = SW_MULTIBULK_ARGN_LF; - } else { - state = SW_MULTIBULK_ARGN_LEN_LF; - } - r->rnarg--; - r->token = NULL; - } else { - goto error; - } - - break; - - case SW_MULTIBULK_ARGN_LEN_LF: - switch (ch) { - case LF: - state = SW_MULTIBULK_ARGN; - break; + if (ch == ':' || ch == '+' || ch == '-') { + /* handles not-found reply = '$-1' or integer reply = ':' */ + /* and *2\r\n$2\r\nr0\r\n+OK\r\n or *1\r\n+OK\r\n */ + state = SW_SIMPLE; + break; + } + + if (ch != '$') { + goto error; + } + + r->token = p; + r->rlen = 0; + } else if (isdigit(ch)) { + r->rlen = r->rlen * 10 + (uint32_t)(ch - '0'); + } else if (ch == '-') { + ; + } else if (ch == CR) { + if ((p - r->token) <= 1 || r->rnarg == 0) { + goto error; + } + + if ((r->rlen == 1 && (p - r->token) 
== 3)) { + r->rlen = 0; + state = SW_MULTIBULK_ARGN_LF; + } else { + state = SW_MULTIBULK_ARGN_LEN_LF; + } + r->rnarg--; + r->token = NULL; + } else { + goto error; + } - default: - goto error; - } + break; + case SW_MULTIBULK_ARGN_LEN_LF: + switch (ch) { + case LF: + state = SW_MULTIBULK_ARGN; break; - case SW_MULTIBULK_ARGN: - m = p + r->rlen; - if (m >= b->last) { - r->rlen -= (uint32_t)(b->last - p); - m = b->last - 1; - p = m; - break; - } + default: + goto error; + } - if (*m != CR) { - goto error; - } + break; - p += r->rlen; /* move forward by rlen bytes */ - r->rlen = 0; + case SW_MULTIBULK_ARGN: + m = p + r->rlen; + if (m >= b->last) { + r->rlen -= (uint32_t)(b->last - p); + m = b->last - 1; + p = m; + break; + } - state = SW_MULTIBULK_ARGN_LF; + if (*m != CR) { + goto error; + } - break; + p += r->rlen; /* move forward by rlen bytes */ + r->rlen = 0; - case SW_MULTIBULK_ARGN_LF: - switch (ch) { - case LF: - if (r->rnarg == 0) { - goto done; - } + state = SW_MULTIBULK_ARGN_LF; - state = SW_MULTIBULK_ARGN_LEN; - break; + break; - default: - goto error; + case SW_MULTIBULK_ARGN_LF: + switch (ch) { + case LF: + if (r->rnarg == 0) { + goto done; } + state = SW_MULTIBULK_ARGN_LEN; break; - case SW_SENTINEL: - default: - NOT_REACHED(); - break; + default: + goto error; } - } - ASSERT(p == b->last); - r->pos = p; - r->state = state; - r->is_error = redis_error(r); + break; - if (b->last == b->end && r->token != NULL) { - r->pos = r->token; - r->token = NULL; - r->result = MSG_PARSE_REPAIR; - } else { - r->result = MSG_PARSE_AGAIN; + case SW_SENTINEL: + default: + NOT_REACHED(); + break; } + } - log_hexdump(LOG_VERB, b->pos, mbuf_length(b), "parsed rsp %"PRIu64" res %d " - "type %d state %d rpos %d of %d", r->id, r->result, r->type, - r->state, r->pos - b->pos, b->last - b->pos); - return; + ASSERT(p == b->last); + r->pos = p; + r->state = state; + r->is_error = redis_error(r); -done: - ASSERT(r->type > MSG_UNKNOWN && r->type < MSG_SENTINEL); - r->pos = p + 1; - 
ASSERT(r->pos <= b->last); - r->state = SW_START; + if (b->last == b->end && r->token != NULL) { + r->pos = r->token; r->token = NULL; - r->result = MSG_PARSE_OK; - r->is_error = redis_error(r); + r->result = MSG_PARSE_REPAIR; + } else { + r->result = MSG_PARSE_AGAIN; + } + + log_hexdump(LOG_VERB, b->pos, mbuf_length(b), + "parsed rsp %" PRIu64 + " res %d " + "type %d state %d rpos %d of %d", + r->id, r->result, r->type, r->state, r->pos - b->pos, + b->last - b->pos); + return; - log_hexdump(LOG_VERB, b->pos, mbuf_length(b), "parsed rsp %"PRIu64" res %d " - "type %d state %d rpos %d of %d", r->id, r->result, r->type, - r->state, r->pos - b->pos, b->last - b->pos); - return; +done: + ASSERT(r->type > MSG_UNKNOWN && r->type < MSG_SENTINEL); + r->pos = p + 1; + ASSERT(r->pos <= b->last); + r->state = SW_START; + r->token = NULL; + r->result = MSG_PARSE_OK; + r->is_error = redis_error(r); + + log_hexdump(LOG_VERB, b->pos, mbuf_length(b), + "parsed rsp %" PRIu64 + " res %d " + "type %d state %d rpos %d of %d", + r->id, r->result, r->type, r->state, r->pos - b->pos, + b->last - b->pos); + return; error: - r->result = MSG_PARSE_ERROR; - r->state = state; - errno = EINVAL; - - log_hexdump(LOG_INFO, b->pos, mbuf_length(b), "parsed bad rsp %"PRIu64" " - "res %d type %d state %d", r->id, r->result, r->type, - r->state); - + r->result = MSG_PARSE_ERROR; + r->state = state; + errno = EINVAL; + + log_hexdump(LOG_INFO, b->pos, mbuf_length(b), + "parsed bad rsp %" PRIu64 + " " + "res %d type %d state %d", + r->id, r->result, r->type, r->state); } /* @@ -2605,85 +2626,78 @@ redis_parse_rsp(struct msg *r, const struct string *UNUSED) * if dst == NULL, we just eat the bulk * * */ -static rstatus_t -redis_copy_bulk(struct msg *dst, struct msg *src, bool log) -{ - struct mbuf *mbuf, *nbuf; - uint8_t *p; - uint32_t len = 0; - uint32_t bytes = 0; - rstatus_t status; - - for (mbuf = STAILQ_FIRST(&src->mhdr); - mbuf && mbuf_empty(mbuf); - mbuf = STAILQ_FIRST(&src->mhdr)) { - - 
mbuf_remove(&src->mhdr, mbuf); - mbuf_put(mbuf); - } - - mbuf = STAILQ_FIRST(&src->mhdr); - if (mbuf == NULL) { - return DN_ERROR; +static rstatus_t redis_copy_bulk(struct msg *dst, struct msg *src, bool log) { + struct mbuf *mbuf, *nbuf; + uint8_t *p; + uint32_t len = 0; + uint32_t bytes = 0; + rstatus_t status; + + for (mbuf = STAILQ_FIRST(&src->mhdr); mbuf && mbuf_empty(mbuf); + mbuf = STAILQ_FIRST(&src->mhdr)) { + mbuf_remove(&src->mhdr, mbuf); + mbuf_put(mbuf); + } + + mbuf = STAILQ_FIRST(&src->mhdr); + if (mbuf == NULL) { + return DN_ERROR; + } + + p = mbuf->pos; + ASSERT(*p == '$'); + p++; + + if (p[0] == '-' && p[1] == '1') { + len = 1 + 2 + CRLF_LEN; /* $-1\r\n */ + p = mbuf->pos + len; + if (log) log_notice("here"); + } else { + len = 0; + for (; p < mbuf->last && isdigit(*p); p++) { + len = len * 10 + (uint32_t)(*p - '0'); } - - p = mbuf->pos; - ASSERT(*p == '$'); - p++; - - if (p[0] == '-' && p[1] == '1') { - len = 1 + 2 + CRLF_LEN; /* $-1\r\n */ - p = mbuf->pos + len; - if (log) - log_notice("here"); - } else { - len = 0; - for (; p < mbuf->last && isdigit(*p); p++) { - len = len * 10 + (uint32_t)(*p - '0'); - } - len += CRLF_LEN * 2; - len += (p - mbuf->pos); + len += CRLF_LEN * 2; + len += (p - mbuf->pos); + } + bytes = len; + + /* copy len bytes to dst */ + for (; mbuf;) { + if (log) { + log_notice("dumping mbuf"); + mbuf_dump(mbuf); } - bytes = len; - - /* copy len bytes to dst */ - for (; mbuf;) { - if (log) { - log_notice("dumping mbuf"); - mbuf_dump(mbuf); - } - if (mbuf_length(mbuf) <= len) { /* steal this buf from src to dst */ - nbuf = STAILQ_NEXT(mbuf, next); - mbuf_remove(&src->mhdr, mbuf); - if (dst != NULL) { - mbuf_insert(&dst->mhdr, mbuf); - } else { - mbuf_put(mbuf); - } - len -= mbuf_length(mbuf); - mbuf = nbuf; - if (log) - log_notice("stealing mbuf"); - } else { /* split it */ - if (dst != NULL) { - if (log) - log_notice("appending mbuf"); - status = msg_append(dst, mbuf->pos, len); - if (status != DN_OK) { - return status; - } - } 
- mbuf->pos += len; - break; + if (mbuf_length(mbuf) <= len) { /* steal this buf from src to dst */ + nbuf = STAILQ_NEXT(mbuf, next); + mbuf_remove(&src->mhdr, mbuf); + if (dst != NULL) { + mbuf_insert(&dst->mhdr, mbuf); + } else { + mbuf_put(mbuf); + } + len -= mbuf_length(mbuf); + mbuf = nbuf; + if (log) log_notice("stealing mbuf"); + } else { /* split it */ + if (dst != NULL) { + if (log) log_notice("appending mbuf"); + status = msg_append(dst, mbuf->pos, len); + if (status != DN_OK) { + return status; } + } + mbuf->pos += len; + break; } - - if (dst != NULL) { - dst->mlen += bytes; - } - src->mlen -= bytes; - log_debug(LOG_VVERB, "redis_copy_bulk copy bytes: %d", bytes); - return DN_OK; + } + + if (dst != NULL) { + dst->mlen += bytes; + } + src->mlen -= bytes; + log_debug(LOG_VVERB, "redis_copy_bulk copy bytes: %d", bytes); + return DN_OK; } /* @@ -2691,91 +2705,89 @@ redis_copy_bulk(struct msg *dst, struct msg *src, bool log) * the fragmented multi vector request - 'mget' or 'del' and all the * responses to the fragmented request vector hasn't been received */ -void -redis_pre_coalesce(struct msg *rsp) -{ - struct msg *req = rsp->peer; /* peer request */ - struct mbuf *mbuf; - - ASSERT(!rsp->is_request); - ASSERT(req->is_request); - - if (req->frag_id == 0) { - /* do nothing, if not a response to a fragmented request */ - return; - } - - req->frag_owner->nfrag_done++; - switch (rsp->type) { - case MSG_RSP_REDIS_INTEGER: - /* only redis 'del' fragmented request sends back integer reply */ - ASSERT((req->type == MSG_REQ_REDIS_DEL) || - (req->type == MSG_REQ_REDIS_EXISTS)); +void redis_pre_coalesce(struct msg *rsp) { + struct msg *req = rsp->peer; /* peer request */ + struct mbuf *mbuf; - mbuf = STAILQ_FIRST(&rsp->mhdr); - /* - * Our response parser guarantees that the integer reply will be - * completely encapsulated in a single mbuf and we should skip over - * all the mbuf contents and discard it as the parser has already - * parsed the integer reply and stored 
it in msg->integer - */ - ASSERT(mbuf == STAILQ_LAST(&rsp->mhdr, mbuf, next)); - ASSERT(rsp->mlen == mbuf_length(mbuf)); + ASSERT(!rsp->is_request); + ASSERT(req->is_request); - rsp->mlen -= mbuf_length(mbuf); - mbuf_rewind(mbuf); + if (req->frag_id == 0) { + /* do nothing, if not a response to a fragmented request */ + return; + } - /* accumulate the integer value in frag_owner of peer request */ - req->frag_owner->integer += rsp->integer; - break; + req->frag_owner->nfrag_done++; + switch (rsp->type) { + case MSG_RSP_REDIS_INTEGER: + /* only redis 'del' fragmented request sends back integer reply */ + ASSERT((req->type == MSG_REQ_REDIS_DEL) || + (req->type == MSG_REQ_REDIS_EXISTS)); + + mbuf = STAILQ_FIRST(&rsp->mhdr); + /* + * Our response parser guarantees that the integer reply will be + * completely encapsulated in a single mbuf and we should skip over + * all the mbuf contents and discard it as the parser has already + * parsed the integer reply and stored it in msg->integer + */ + ASSERT(mbuf == STAILQ_LAST(&rsp->mhdr, mbuf, next)); + ASSERT(rsp->mlen == mbuf_length(mbuf)); + + rsp->mlen -= mbuf_length(mbuf); + mbuf_rewind(mbuf); + + /* accumulate the integer value in frag_owner of peer request */ + req->frag_owner->integer += rsp->integer; + break; case MSG_RSP_REDIS_MULTIBULK: - /* only redis 'mget' fragmented request sends back multi-bulk reply */ - ASSERT(req->type == MSG_REQ_REDIS_MGET); + /* only redis 'mget' fragmented request sends back multi-bulk reply */ + ASSERT(req->type == MSG_REQ_REDIS_MGET); - mbuf = STAILQ_FIRST(&rsp->mhdr); - /* - * Muti-bulk reply can span over multiple mbufs and in each reply - * we should skip over the narg token. 
Our response parser - * guarantees thaat the narg token and the immediately following - * '\r\n' will exist in a contiguous region in the first mbuf - */ - ASSERT(rsp->narg_start == mbuf->pos); - ASSERT(rsp->narg_start < rsp->narg_end); - - rsp->narg_end += CRLF_LEN; - rsp->mlen -= (uint32_t)(rsp->narg_end - rsp->narg_start); - mbuf->pos = rsp->narg_end; + mbuf = STAILQ_FIRST(&rsp->mhdr); + /* + * Muti-bulk reply can span over multiple mbufs and in each reply + * we should skip over the narg token. Our response parser + * guarantees thaat the narg token and the immediately following + * '\r\n' will exist in a contiguous region in the first mbuf + */ + ASSERT(rsp->narg_start == mbuf->pos); + ASSERT(rsp->narg_start < rsp->narg_end); - break; + rsp->narg_end += CRLF_LEN; + rsp->mlen -= (uint32_t)(rsp->narg_end - rsp->narg_start); + mbuf->pos = rsp->narg_end; + + break; case MSG_RSP_REDIS_STATUS: - if (req->type == MSG_REQ_REDIS_MSET) { /* MSET segments */ - mbuf = STAILQ_FIRST(&rsp->mhdr); - rsp->mlen -= mbuf_length(mbuf); - mbuf_rewind(mbuf); - } - break; + if (req->type == MSG_REQ_REDIS_MSET) { /* MSET segments */ + mbuf = STAILQ_FIRST(&rsp->mhdr); + rsp->mlen -= mbuf_length(mbuf); + mbuf_rewind(mbuf); + } + break; case MSG_RSP_REDIS_ERROR: - req->is_error = rsp->is_error; - req->error_code = rsp->error_code; - req->dyn_error_code = rsp->dyn_error_code; - break; + req->is_error = rsp->is_error; + req->error_code = rsp->error_code; + req->dyn_error_code = rsp->dyn_error_code; + break; default: - /* - * Valid responses for a fragmented request are MSG_RSP_REDIS_INTEGER or, - * MSG_RSP_REDIS_MULTIBULK. For an invalid response, we send out -ERR - * with EINVAL errno - */ - log_warn("Invalid Response type"); - msg_dump(LOG_WARN, rsp); - msg_dump(LOG_WARN, req); - req->is_error = 1; - req->error_code = EINVAL; - break; - } + /* + * Valid responses for a fragmented request are MSG_RSP_REDIS_INTEGER or, + * MSG_RSP_REDIS_MULTIBULK. 
For an invalid response, we send out -ERR + * with EINVAL errno + */ + log_warn("Invalid Response type"); + msg_dump(LOG_WARN, rsp); + msg_dump(LOG_WARN, req); + req->is_error = 1; + req->error_code = EINVAL; + break; + } } /* @@ -2784,68 +2796,64 @@ redis_pre_coalesce(struct msg *rsp) * responses to the fragmented request vector has been received and * the fragmented request is consider to be done */ -void -redis_post_coalesce_mset(struct msg *request) -{ - rstatus_t status; - struct msg *response = request->selected_rsp; - - status = msg_append(response, rsp_ok.data, rsp_ok.len); - if (status != DN_OK) { - response->is_error = 1; /* mark this msg as err */ - response->error_code = errno; - } +void redis_post_coalesce_mset(struct msg *request) { + rstatus_t status; + struct msg *response = request->selected_rsp; + + status = msg_append(response, rsp_ok.data, rsp_ok.len); + if (status != DN_OK) { + response->is_error = 1; /* mark this msg as err */ + response->error_code = errno; + } } -void -redis_post_coalesce_num(struct msg *request) -{ - struct msg *response = request->selected_rsp; - rstatus_t status; +void redis_post_coalesce_num(struct msg *request) { + struct msg *response = request->selected_rsp; + rstatus_t status; - status = msg_prepend_format(response, ":%d\r\n", request->integer); - if (status != DN_OK) { - response->is_error = 1; - response->error_code = errno; - } + status = msg_prepend_format(response, ":%d\r\n", request->integer); + if (status != DN_OK) { + response->is_error = 1; + response->error_code = errno; + } } -static void -redis_post_coalesce_mget(struct msg *request) -{ - struct msg *response = request->selected_rsp; - struct msg *sub_msg; - rstatus_t status; - uint32_t i; +static void redis_post_coalesce_mget(struct msg *request) { + struct msg *response = request->selected_rsp; + struct msg *sub_msg; + rstatus_t status; + uint32_t i; - // -1 is because mget is also counted in narg. 
So the response will be 1 less - status = msg_prepend_format(response, "*%d\r\n", request->narg - 1); - if (status != DN_OK) { - /* - * the fragments is still in c_conn->omsg_q, we have to discard all of them, - * we just close the conn here - */ - log_warn("marking %s as error", print_obj(response->owner)); - response->owner->err = 1; - return; + // -1 is because mget is also counted in narg. So the response will be 1 less + status = msg_prepend_format(response, "*%d\r\n", request->narg - 1); + if (status != DN_OK) { + /* + * the fragments is still in c_conn->omsg_q, we have to discard all of them, + * we just close the conn here + */ + log_warn("marking %s as error", print_obj(response->owner)); + response->owner->err = 1; + return; + } + + for (i = 0; i < array_n(request->keys); i++) { /* for each key */ + sub_msg = request->frag_seq[i]->selected_rsp; /* get it's peer response */ + if (sub_msg == NULL) { + struct keypos *kpos = array_get(request->keys, i); + log_warn("Response missing for key %.*s, %s marking %s as error", + kpos->tag_end - kpos->tag_start, kpos->tag_start, + print_obj(request), print_obj(response->owner)); + response->owner->err = 1; + return; } - - for (i = 0; i < array_n(request->keys); i++) { /* for each key */ - sub_msg = request->frag_seq[i]->selected_rsp; /* get it's peer response */ - if (sub_msg == NULL) { - struct keypos *kpos = array_get(request->keys, i); - log_warn("Response missing for key %.*s, %s marking %s as error", - kpos->tag_end - kpos->tag_start, kpos->tag_start, print_obj(request), print_obj(response->owner)); - response->owner->err = 1; - return; - } - if ((sub_msg->is_error) || redis_copy_bulk(response, sub_msg, false)) { - log_warn("marking %s as error, %s %s", print_obj(response->owner), print_obj(request), print_obj(response)); - msg_dump(LOG_INFO, sub_msg); - response->owner->err = 1; - return; - } + if ((sub_msg->is_error) || redis_copy_bulk(response, sub_msg, false)) { + log_warn("marking %s as error, %s %s", 
print_obj(response->owner), + print_obj(request), print_obj(response)); + msg_dump(LOG_INFO, sub_msg); + response->owner->err = 1; + return; } + } } /* @@ -2854,84 +2862,79 @@ redis_post_coalesce_mget(struct msg *request) * responses to the fragmented request vector has been received and * the fragmented request is consider to be done */ -void -redis_post_coalesce(struct msg *req) -{ - struct msg *rsp = req->selected_rsp; /* peer response */ - - ASSERT(!rsp->is_request); - ASSERT(req->is_request && (req->frag_owner == req)); - if (req->is_error || req->is_ferror) { - /* do nothing, if msg is in error */ - return; - } +void redis_post_coalesce(struct msg *req) { + struct msg *rsp = req->selected_rsp; /* peer response */ + + ASSERT(!rsp->is_request); + ASSERT(req->is_request && (req->frag_owner == req)); + if (req->is_error || req->is_ferror) { + /* do nothing, if msg is in error */ + return; + } - //log_notice("Post coalesce %s", print_obj(req)); - switch (req->type) { + // log_notice("Post coalesce %s", print_obj(req)); + switch (req->type) { case MSG_REQ_REDIS_MGET: - return redis_post_coalesce_mget(req); + return redis_post_coalesce_mget(req); case MSG_REQ_REDIS_DEL: case MSG_REQ_REDIS_EXISTS: - return redis_post_coalesce_num(req); + return redis_post_coalesce_num(req); case MSG_REQ_REDIS_MSET: - return redis_post_coalesce_mset(req); + return redis_post_coalesce_mset(req); default: - NOT_REACHED(); - } + NOT_REACHED(); + } } - -static rstatus_t -redis_append_key(struct msg *r, struct keypos *kpos_src) -{ - uint32_t len; - struct mbuf *mbuf; - uint8_t printbuf[32]; - struct keypos *kpos; - - /* 1. keylen */ - uint32_t keylen = kpos_src->end - kpos_src->start; - uint32_t taglen = kpos_src->tag_end - kpos_src->tag_start; - len = (uint32_t)dn_snprintf(printbuf, sizeof(printbuf), "$%d\r\n", keylen); - mbuf = msg_ensure_mbuf(r, len); - if (mbuf == NULL) { - return DN_ENOMEM; - } - mbuf_copy(mbuf, printbuf, len); - r->mlen += len; - - /* 2. 
key */ - mbuf = msg_ensure_mbuf(r, keylen); - if (mbuf == NULL) { - return DN_ENOMEM; - } - - kpos = array_push(r->keys); - if (kpos == NULL) { - return DN_ENOMEM; - } - - kpos->start = mbuf->last; - kpos->tag_start = kpos->start + (kpos_src->tag_start - kpos_src->start); - - kpos->end = kpos->start + keylen; - kpos->tag_end = kpos->tag_start + taglen; - - mbuf_copy(mbuf, kpos_src->start, keylen); - r->mlen += keylen; - - /* 3. CRLF */ - mbuf = msg_ensure_mbuf(r, CRLF_LEN); - if (mbuf == NULL) { - return DN_ENOMEM; - } - mbuf_copy(mbuf, (uint8_t *)CRLF, CRLF_LEN); - r->mlen += (uint32_t)CRLF_LEN; - - return DN_OK; +static rstatus_t redis_append_key(struct msg *r, struct keypos *kpos_src) { + uint32_t len; + struct mbuf *mbuf; + uint8_t printbuf[32]; + struct keypos *kpos; + + /* 1. keylen */ + uint32_t keylen = kpos_src->end - kpos_src->start; + uint32_t taglen = kpos_src->tag_end - kpos_src->tag_start; + len = (uint32_t)dn_snprintf(printbuf, sizeof(printbuf), "$%d\r\n", keylen); + mbuf = msg_ensure_mbuf(r, len); + if (mbuf == NULL) { + return DN_ENOMEM; + } + mbuf_copy(mbuf, printbuf, len); + r->mlen += len; + + /* 2. key */ + mbuf = msg_ensure_mbuf(r, keylen); + if (mbuf == NULL) { + return DN_ENOMEM; + } + + kpos = array_push(r->keys); + if (kpos == NULL) { + return DN_ENOMEM; + } + + kpos->start = mbuf->last; + kpos->tag_start = kpos->start + (kpos_src->tag_start - kpos_src->start); + + kpos->end = kpos->start + keylen; + kpos->tag_end = kpos->tag_start + taglen; + + mbuf_copy(mbuf, kpos_src->start, keylen); + r->mlen += keylen; + + /* 3. CRLF */ + mbuf = msg_ensure_mbuf(r, CRLF_LEN); + if (mbuf == NULL) { + return DN_ENOMEM; + } + mbuf_copy(mbuf, (uint8_t *)CRLF, CRLF_LEN); + r->mlen += (uint32_t)CRLF_LEN; + + return DN_OK; } /* @@ -2942,7 +2945,8 @@ redis_append_key(struct msg *r, struct keypos *kpos_src) * all the keys map to the same backend will group into one fragment. * * frag_id: - * a unique fragment id for all fragments of the message vector. 
including the orig msg. + * a unique fragment id for all fragments of the message vector. including the + * orig msg. * * frag_owner: * All fragments of the message use frag_owner point to the orig msg @@ -2974,11 +2978,11 @@ redis_append_key(struct msg *r, struct keypos *kpos_src) * /-----------+ | /------------+ frag_owner | * | | | | | | * | v v v | | - * +--------------------+ +---------------------+ +----+----------------+ - * | frag_id = 10 | | frag_id = 10 | | frag_id = 10 | - * | nfrag = 3 | | nfrag = 0 | | nfrag = 0 | - * | frag_seq = x x x | | key1, key3 | | key2 | - * +------------|-|-|---+ +---------------------+ +---------------------+ + * +--------------------+ +---------------------+ +----+----------------+ + * | frag_id = 10 | | frag_id = 10 | | frag_id = 10 | + * | nfrag = 3 | | nfrag = 0 | | nfrag = 0 | + * | frag_seq = x x x | | key1, key3 | | key2 | + * +------------|-|-|---+ +---------------------+ +---------------------+ * | | | ^ ^ ^ * | \ \ | | | * | \ ----------+ | | @@ -2986,474 +2990,456 @@ redis_append_key(struct msg *r, struct keypos *kpos_src) * ------------------------------------------+ * */ -static rstatus_t -redis_fragment_argx(struct msg *r, struct server_pool *pool, struct rack *rack, - struct msg_tqh *frag_msgq, uint32_t key_step) -{ - struct mbuf *mbuf; - struct msg **sub_msgs; - uint32_t i; - rstatus_t status; - - ASSERT(array_n(r->keys) == (r->narg - 1) / key_step); - - uint32_t total_peers = array_n(&pool->peers); - sub_msgs = dn_zalloc(total_peers * sizeof(*sub_msgs)); - if (sub_msgs == NULL) { - return DN_ENOMEM; +static rstatus_t redis_fragment_argx(struct msg *r, struct server_pool *pool, + struct rack *rack, + struct msg_tqh *frag_msgq, + uint32_t key_step) { + struct mbuf *mbuf; + struct msg **sub_msgs; + uint32_t i; + rstatus_t status; + + ASSERT(array_n(r->keys) == (r->narg - 1) / key_step); + + uint32_t total_peers = array_n(&pool->peers); + sub_msgs = dn_zalloc(total_peers * sizeof(*sub_msgs)); + if (sub_msgs == 
NULL) { + return DN_ENOMEM; + } + + ASSERT(r->frag_seq == NULL); + r->frag_seq = dn_alloc(array_n(r->keys) * sizeof(*r->frag_seq)); + if (r->frag_seq == NULL) { + dn_free(sub_msgs); + return DN_ENOMEM; + } + + mbuf = STAILQ_FIRST(&r->mhdr); + mbuf->pos = mbuf->start; + + /* + * This code is based on the assumption that '*narg\r\n$4\r\nMGET\r\n' is + * located in a contiguous location. This is always true because we have + * capped our MBUF_MIN_SIZE at 512 and whenever we have multiple messages, we + * copy the tail message into a new mbuf + */ + for (i = 0; i < 3; i++) { /* eat *narg\r\n$4\r\nMGET\r\n */ + for (; *(mbuf->pos) != '\n';) { + mbuf->pos++; } + mbuf->pos++; + } - ASSERT(r->frag_seq == NULL); - r->frag_seq = dn_alloc(array_n(r->keys) * sizeof(*r->frag_seq)); - if (r->frag_seq == NULL) { + r->frag_id = msg_gen_frag_id(); + r->nfrag = 0; + r->frag_owner = r; + + for (i = 0; i < array_n(r->keys); i++) { /* for each key */ + struct msg *sub_msg; + struct keypos *kpos = array_get(r->keys, i); + // use hash-tagged start and end for forwarding. + uint32_t idx = dnode_peer_idx_for_key_on_rack( + pool, rack, kpos->tag_start, kpos->tag_end - kpos->tag_start); + + if (sub_msgs[idx] == NULL) { + sub_msgs[idx] = msg_get(r->owner, r->is_request, __FUNCTION__); + if (sub_msgs[idx] == NULL) { dn_free(sub_msgs); return DN_ENOMEM; + } } + r->frag_seq[i] = sub_msg = sub_msgs[idx]; - mbuf = STAILQ_FIRST(&r->mhdr); - mbuf->pos = mbuf->start; - - /* - * This code is based on the assumption that '*narg\r\n$4\r\nMGET\r\n' is located - * in a contiguous location. 
- * This is always true because we have capped our MBUF_MIN_SIZE at 512 and - * whenever we have multiple messages, we copy the tail message into a new mbuf - */ - for (i = 0; i < 3; i++) { /* eat *narg\r\n$4\r\nMGET\r\n */ - for (; *(mbuf->pos) != '\n';) { - mbuf->pos++; - } - mbuf->pos++; + sub_msg->narg++; + status = redis_append_key(sub_msg, kpos); + if (status != DN_OK) { + dn_free(sub_msgs); + return status; } - r->frag_id = msg_gen_frag_id(); - r->nfrag = 0; - r->frag_owner = r; - - for (i = 0; i < array_n(r->keys); i++) { /* for each key */ - struct msg *sub_msg; - struct keypos *kpos = array_get(r->keys, i); - // use hash-tagged start and end for forwarding. - uint32_t idx = dnode_peer_idx_for_key_on_rack(pool, rack, kpos->tag_start, - kpos->tag_end - kpos->tag_start); - - if (sub_msgs[idx] == NULL) { - sub_msgs[idx] = msg_get(r->owner, r->is_request, __FUNCTION__); - if (sub_msgs[idx] == NULL) { - dn_free(sub_msgs); - return DN_ENOMEM; - } - } - r->frag_seq[i] = sub_msg = sub_msgs[idx]; - - sub_msg->narg++; - status = redis_append_key(sub_msg, kpos); - if (status != DN_OK) { - dn_free(sub_msgs); - return status; - } + if (key_step == 1) { /* mget,del */ + continue; + } else { /* mset */ + status = redis_copy_bulk(NULL, r, false); /* eat key */ + if (status != DN_OK) { + dn_free(sub_msgs); + return status; + } - if (key_step == 1) { /* mget,del */ - continue; - } else { /* mset */ - status = redis_copy_bulk(NULL, r, false); /* eat key */ - if (status != DN_OK) { - dn_free(sub_msgs); - return status; - } + status = redis_copy_bulk(sub_msg, r, false); + if (status != DN_OK) { + dn_free(sub_msgs); + return status; + } - status = redis_copy_bulk(sub_msg, r, false); - if (status != DN_OK) { - dn_free(sub_msgs); - return status; - } + sub_msg->narg++; + } + } - sub_msg->narg++; - } + log_info("Fragmenting %s", print_obj(r)); + for (i = 0; i < total_peers; i++) { /* prepend mget header, and forward it */ + struct msg *sub_msg = sub_msgs[i]; + if (sub_msg == NULL) 
{ + continue; } - log_info("Fragmenting %s", print_obj(r)); - for (i = 0; i < total_peers; i++) { /* prepend mget header, and forward it */ - struct msg *sub_msg = sub_msgs[i]; - if (sub_msg == NULL) { - continue; - } - - if (r->type == MSG_REQ_REDIS_MGET) { - status = msg_prepend_format(sub_msg, "*%d\r\n$4\r\nmget\r\n", - sub_msg->narg + 1); - } else if (r->type == MSG_REQ_REDIS_DEL) { - status = msg_prepend_format(sub_msg, "*%d\r\n$3\r\ndel\r\n", - sub_msg->narg + 1); - } else if (r->type == MSG_REQ_REDIS_EXISTS) { - status = msg_prepend_format(sub_msg, "*%d\r\n$6\r\nexists\r\n", - sub_msg->narg + 1); - } else if (r->type == MSG_REQ_REDIS_MSET) { - status = msg_prepend_format(sub_msg, "*%d\r\n$4\r\nmset\r\n", - sub_msg->narg + 1); - } else { - NOT_REACHED(); - } - if (status != DN_OK) { - dn_free(sub_msgs); - return status; - } + if (r->type == MSG_REQ_REDIS_MGET) { + status = msg_prepend_format(sub_msg, "*%d\r\n$4\r\nmget\r\n", + sub_msg->narg + 1); + } else if (r->type == MSG_REQ_REDIS_DEL) { + status = msg_prepend_format(sub_msg, "*%d\r\n$3\r\ndel\r\n", + sub_msg->narg + 1); + } else if (r->type == MSG_REQ_REDIS_EXISTS) { + status = msg_prepend_format(sub_msg, "*%d\r\n$6\r\nexists\r\n", + sub_msg->narg + 1); + } else if (r->type == MSG_REQ_REDIS_MSET) { + status = msg_prepend_format(sub_msg, "*%d\r\n$4\r\nmset\r\n", + sub_msg->narg + 1); + } else { + NOT_REACHED(); + } + if (status != DN_OK) { + dn_free(sub_msgs); + return status; + } - sub_msg->type = r->type; - sub_msg->frag_id = r->frag_id; - sub_msg->frag_owner = r->frag_owner; + sub_msg->type = r->type; + sub_msg->frag_id = r->frag_id; + sub_msg->frag_owner = r->frag_owner; - log_info("Fragment %d) %s", i, print_obj(sub_msg)); - TAILQ_INSERT_TAIL(frag_msgq, sub_msg, m_tqe); - r->nfrag++; - } + log_info("Fragment %d) %s", i, print_obj(sub_msg)); + TAILQ_INSERT_TAIL(frag_msgq, sub_msg, m_tqe); + r->nfrag++; + } - dn_free(sub_msgs); - return DN_OK; + dn_free(sub_msgs); + return DN_OK; } -rstatus_t 
-redis_fragment(struct msg *r, struct server_pool *pool, struct rack *rack, struct msg_tqh *frag_msgq) -{ - if (1 == array_n(r->keys)){ - return DN_OK; - } +rstatus_t redis_fragment(struct msg *r, struct server_pool *pool, + struct rack *rack, struct msg_tqh *frag_msgq) { + if (1 == array_n(r->keys)) { + return DN_OK; + } - switch (r->type) { + switch (r->type) { case MSG_REQ_REDIS_MGET: case MSG_REQ_REDIS_DEL: case MSG_REQ_REDIS_EXISTS: - return redis_fragment_argx(r, pool, rack, frag_msgq, 1); + return redis_fragment_argx(r, pool, rack, frag_msgq, 1); case MSG_REQ_REDIS_MSET: - return redis_fragment_argx(r, pool, rack, frag_msgq, 2); + return redis_fragment_argx(r, pool, rack, frag_msgq, 2); default: - return DN_OK; - } + return DN_OK; + } } -rstatus_t -redis_verify_request(struct msg *r, struct server_pool *pool, struct rack *rack) -{ - if (r->type != MSG_REQ_REDIS_EVAL) - return DN_OK; +rstatus_t redis_verify_request(struct msg *r, struct server_pool *pool, + struct rack *rack) { + if (r->type != MSG_REQ_REDIS_EVAL) return DN_OK; - // For EVAL based commands, Dynomite wants to restrict all keys used by the - // script belong to same node - if (1 >= array_n(r->keys)){ - return DN_OK; - } - uint32_t prev_idx = 0, i; - for (i = 0; i < array_n(r->keys); i++) { /* for each key */ - struct keypos *kpos = array_get(r->keys, i); - uint32_t idx = dnode_peer_idx_for_key_on_rack(pool, rack, kpos->tag_start, - kpos->tag_end - kpos->tag_start); - if (i == 0) - prev_idx = idx; - if (prev_idx != idx) { - return DYNOMITE_SCRIPT_SPANS_NODES; - } - } + // For EVAL based commands, Dynomite wants to restrict all keys used by the + // script belong to same node + if (1 >= array_n(r->keys)) { return DN_OK; + } + uint32_t prev_idx = 0, i; + for (i = 0; i < array_n(r->keys); i++) { /* for each key */ + struct keypos *kpos = array_get(r->keys, i); + uint32_t idx = dnode_peer_idx_for_key_on_rack( + pool, rack, kpos->tag_start, kpos->tag_end - kpos->tag_start); + if (i == 0) prev_idx = 
idx; + if (prev_idx != idx) { + return DYNOMITE_SCRIPT_SPANS_NODES; + } + } + return DN_OK; } -bool -redis_is_multikey_request(struct msg *req) -{ - ASSERT(req->is_request); - switch (req->type) { +bool redis_is_multikey_request(struct msg *req) { + ASSERT(req->is_request); + switch (req->type) { case MSG_REQ_REDIS_MGET: case MSG_REQ_REDIS_DEL: case MSG_REQ_REDIS_EXISTS: case MSG_REQ_REDIS_MSET: - return true; + return true; default: - return false; - } + return false; + } } -static int -consume_numargs_from_response(struct msg *rsp) -{ - enum { - SW_START, - SW_NARG, - SW_NARG_LF, - SW_DONE - } state; - state = SW_START; - - int narg = 0; - struct mbuf *b = STAILQ_FIRST(&rsp->mhdr); - //struct mbuf *b = STAILQ_LAST(&rsp->mhdr, mbuf, next); - uint8_t *p; - uint8_t ch; - for (p = b->pos; p < b->last; p++) { - ch = *p; - switch (state) { - case SW_START: - if (ch != '*') { - goto error; - } - log_debug(LOG_VVVERB, "SW_START -> SW_NARG"); - state = SW_NARG; - break; +static int consume_numargs_from_response(struct msg *rsp) { + enum { SW_START, SW_NARG, SW_NARG_LF, SW_DONE } state; + state = SW_START; + + int narg = 0; + struct mbuf *b = STAILQ_FIRST(&rsp->mhdr); + // struct mbuf *b = STAILQ_LAST(&rsp->mhdr, mbuf, next); + uint8_t *p; + uint8_t ch; + for (p = b->pos; p < b->last; p++) { + ch = *p; + switch (state) { + case SW_START: + if (ch != '*') { + goto error; + } + log_debug(LOG_VVVERB, "SW_START -> SW_NARG"); + state = SW_NARG; + break; - case SW_NARG: - if (isdigit(ch)) { - narg = narg * 10 + (ch - '0'); - } else if (ch == CR) { - log_debug(LOG_VVVERB, "SW_START -> SW_NARG_LF %d", narg); - state = SW_NARG_LF; - } else { - goto error; - } - break; + case SW_NARG: + if (isdigit(ch)) { + narg = narg * 10 + (ch - '0'); + } else if (ch == CR) { + log_debug(LOG_VVVERB, "SW_START -> SW_NARG_LF %d", narg); + state = SW_NARG_LF; + } else { + goto error; + } + break; - case SW_NARG_LF: - if (ch == LF) { - log_debug(LOG_VVVERB, "SW_NARG_LF -> SW_DONE %d", narg); - state 
= SW_DONE; - } else { - goto error; - } - break; - case SW_DONE: - log_debug(LOG_VVERB, "SW_DONE %d", narg); - b->pos = p; - return narg; + case SW_NARG_LF: + if (ch == LF) { + log_debug(LOG_VVVERB, "SW_NARG_LF -> SW_DONE %d", narg); + state = SW_DONE; + } else { + goto error; } + break; + case SW_DONE: + log_debug(LOG_VVERB, "SW_DONE %d", narg); + b->pos = p; + return narg; } + } error: - return -1; + return -1; } -static rstatus_t -consume_numargs_from_responses(struct array *responses, int *narg) -{ - uint32_t iter = 0; - *narg = -2; // some invalid value - - while (iter < array_n(responses)) { - // get numargs - if (*narg == -2) { - struct msg *rsp = *(struct msg **)array_get(responses, iter); - *narg = consume_numargs_from_response(rsp); - } else { - if (*narg != consume_numargs_from_response(*(struct msg **)array_get(responses, iter))) - return DN_ERROR; - } - iter++; +static rstatus_t consume_numargs_from_responses(struct array *responses, + int *narg) { + uint32_t iter = 0; + *narg = -2; // some invalid value + + while (iter < array_n(responses)) { + // get numargs + if (*narg == -2) { + struct msg *rsp = *(struct msg **)array_get(responses, iter); + *narg = consume_numargs_from_response(rsp); + } else { + if (*narg != consume_numargs_from_response( + *(struct msg **)array_get(responses, iter))) + return DN_ERROR; } - return DN_OK; + iter++; + } + return DN_OK; } -static rstatus_t -redis_append_nargs(struct msg *rsp, int nargs) -{ - size_t len = 1 + 10 + CRLF_LEN; // len(*CRLF) - struct mbuf *mbuf = msg_ensure_mbuf(rsp, len); - if (!mbuf) - return DN_ENOMEM; - rsp->narg_start = mbuf->last; - int n = dn_scnprintf(mbuf->last, mbuf_size(mbuf), "*%d\r\n", nargs); - mbuf->last += n; - rsp->narg_end = (rsp->narg_start + n - CRLF_LEN); - rsp->mlen += (uint32_t)n; - return DN_OK; +static rstatus_t redis_append_nargs(struct msg *rsp, int nargs) { + size_t len = 1 + 10 + CRLF_LEN; // len(*CRLF) + struct mbuf *mbuf = msg_ensure_mbuf(rsp, len); + if (!mbuf) return 
DN_ENOMEM; + rsp->narg_start = mbuf->last; + int n = dn_scnprintf(mbuf->last, mbuf_size(mbuf), "*%d\r\n", nargs); + mbuf->last += n; + rsp->narg_end = (rsp->narg_start + n - CRLF_LEN); + rsp->mlen += (uint32_t)n; + return DN_OK; } -static rstatus_t -get_next_response_fragment(struct msg *rsp, struct msg **fragment) -{ - ASSERT(*fragment == NULL); - *fragment = rsp_get(rsp->owner); - if (*fragment == NULL) { - return DN_ENOMEM; - } - redis_copy_bulk(*fragment, rsp, false); - return DN_OK; +static rstatus_t get_next_response_fragment(struct msg *rsp, + struct msg **fragment) { + ASSERT(*fragment == NULL); + *fragment = rsp_get(rsp->owner); + if (*fragment == NULL) { + return DN_ENOMEM; + } + redis_copy_bulk(*fragment, rsp, false); + return DN_OK; } // Returns a quorum response. -static struct msg * -redis_get_fragment_quorum(struct array *fragment_from_responses) -{ - uint32_t total = array_n(fragment_from_responses); - ASSERT(total <= 3); - uint32_t checksums[MAX_REPLICAS_PER_DC]; - uint32_t fragment_iter; - for (fragment_iter = 0; fragment_iter < total; fragment_iter++) { - checksums[fragment_iter] = msg_payload_crc32(*(struct msg **)array_get(fragment_from_responses, fragment_iter)); - } - switch(total) { - case 2: - if (checksums[0] == checksums[1]) - return *(struct msg **)array_get(fragment_from_responses, 0); - else - return NULL; - case 3: - if (checksums[0] == checksums[1]) - return *(struct msg **)array_get(fragment_from_responses, 0); - if (checksums[0] == checksums[2]) - return *(struct msg **)array_get(fragment_from_responses, 0); - if (checksums[1] == checksums[2]) - return *(struct msg **)array_get(fragment_from_responses, 1); - return NULL; - default: - return NULL; - } +static struct msg *redis_get_fragment_quorum( + struct array *fragment_from_responses) { + uint32_t total = array_n(fragment_from_responses); + ASSERT(total <= 3); + uint32_t checksums[MAX_REPLICAS_PER_DC]; + uint32_t fragment_iter; + for (fragment_iter = 0; fragment_iter < total; 
fragment_iter++) { + checksums[fragment_iter] = msg_payload_crc32( + *(struct msg **)array_get(fragment_from_responses, fragment_iter)); + } + switch (total) { + case 2: + if (checksums[0] == checksums[1]) + return *(struct msg **)array_get(fragment_from_responses, 0); + else + return NULL; + case 3: + if (checksums[0] == checksums[1]) + return *(struct msg **)array_get(fragment_from_responses, 0); + if (checksums[0] == checksums[2]) + return *(struct msg **)array_get(fragment_from_responses, 0); + if (checksums[1] == checksums[2]) + return *(struct msg **)array_get(fragment_from_responses, 1); + return NULL; + default: + return NULL; + } } -rstatus_t -free_rsp_each(void *elem) -{ - struct msg *rsp = *(struct msg **)elem; - ASSERT(rsp->object.type == OBJ_RSP); - rsp_put(rsp); - return DN_OK; +rstatus_t free_rsp_each(void *elem) { + struct msg *rsp = *(struct msg **)elem; + ASSERT(rsp->object.type == OBJ_RSP); + rsp_put(rsp); + return DN_OK; } // if no quorum could be achieved, return NULL -static struct msg * -redis_reconcile_multikey_responses(struct response_mgr *rspmgr) -{ - // take the responses. get each value, and compare and return the common one - // create a copy of the responses; - - struct array cloned_responses; - struct array cloned_rsp_fragment_array; - struct msg *selected_rsp = NULL; - - rstatus_t s = array_init(&cloned_responses, rspmgr->good_responses, sizeof(struct msg *)); - if (s != DN_OK) - goto cleanup; - - s = rspmgr_clone_responses(rspmgr, &cloned_responses); - if (s != DN_OK) +static struct msg *redis_reconcile_multikey_responses( + struct response_mgr *rspmgr) { + // take the responses. 
get each value, and compare and return the common one + // create a copy of the responses; + + struct array cloned_responses; + struct array cloned_rsp_fragment_array; + struct msg *selected_rsp = NULL; + + rstatus_t s = array_init(&cloned_responses, rspmgr->good_responses, + sizeof(struct msg *)); + if (s != DN_OK) goto cleanup; + + s = rspmgr_clone_responses(rspmgr, &cloned_responses); + if (s != DN_OK) goto cleanup; + + log_info("%s cloned %d good responses", print_obj(rspmgr->msg), + array_n(&cloned_responses)); + + // if number of arguments do not match, return NULL; + int nargs; + s = consume_numargs_from_responses(&cloned_responses, &nargs); + if (s != DN_OK) goto cleanup; + + log_info("numargs matched = %d", nargs); + + // create the result response + selected_rsp = rsp_get(rspmgr->conn); + if (!selected_rsp) { + s = DN_ENOMEM; + goto cleanup; + } + selected_rsp->expect_datastore_reply = + rspmgr->responses[0]->expect_datastore_reply; + selected_rsp->swallow = rspmgr->responses[0]->swallow; + selected_rsp->type = rspmgr->responses[0]->type; + + s = redis_append_nargs(selected_rsp, nargs); + if (s != DN_OK) goto cleanup; + + log_debug(LOG_DEBUG, "%s after appending nargs", print_obj(selected_rsp)); + msg_dump(LOG_DEBUG, selected_rsp); + + // array to hold 1 fragment from each response + s = array_init(&cloned_rsp_fragment_array, rspmgr->good_responses, + sizeof(struct msg *)); + if (s != DN_OK) goto cleanup; + + // for every response fragment, try to achieve a quorum + int arg_iter; + for (arg_iter = 0; arg_iter < nargs; arg_iter++) { + // carve out one fragment from each response + uint8_t response_iter; + for (response_iter = 0; response_iter < rspmgr->good_responses; + response_iter++) { + struct msg *cloned_rsp = + *(struct msg **)array_get(&cloned_responses, response_iter); + struct msg *cloned_rsp_fragment = NULL; + s = get_next_response_fragment(cloned_rsp, &cloned_rsp_fragment); + if (s != DN_OK) { goto cleanup; + } + log_debug(LOG_DEBUG, "Fragment 
%d of %d, from response(%d of %d) %s", + arg_iter + 1, nargs, response_iter + 1, rspmgr->good_responses, + print_obj(cloned_rsp_fragment)); + msg_dump(LOG_DEBUG, cloned_rsp_fragment); + + struct msg **pdst = (struct msg **)array_push(&cloned_rsp_fragment_array); + *pdst = cloned_rsp_fragment; + } - log_info("%s cloned %d good responses", print_obj(rspmgr->msg), array_n(&cloned_responses)); - - // if number of arguments do not match, return NULL; - int nargs; - s = consume_numargs_from_responses(&cloned_responses, &nargs); - if (s != DN_OK) - goto cleanup; - - log_info("numargs matched = %d", nargs); - - // create the result response - selected_rsp = rsp_get(rspmgr->conn); - if (!selected_rsp) { - s = DN_ENOMEM; + // Now that we have 1 fragment from each good response, try to get a quorum + // on them + struct msg *quorum_fragment = + redis_get_fragment_quorum(&cloned_rsp_fragment_array); + if (quorum_fragment == NULL) { + if (rspmgr->msg->consistency == DC_QUORUM) { + log_info( + "Fragment %d of %d, none of them match, selecting first fragment", + arg_iter + 1, nargs); + quorum_fragment = + *(struct msg **)array_get(&cloned_rsp_fragment_array, 0); + } else { + s = DN_ERROR; goto cleanup; + } } - selected_rsp->expect_datastore_reply = rspmgr->responses[0]->expect_datastore_reply; - selected_rsp->swallow = rspmgr->responses[0]->swallow; - selected_rsp->type = rspmgr->responses[0]->type; - s = redis_append_nargs(selected_rsp, nargs); - if (s != DN_OK) - goto cleanup; + log_debug(LOG_DEBUG, "quorum fragment %s", print_obj(quorum_fragment)); + msg_dump(LOG_DEBUG, quorum_fragment); + // Copy that fragment to the resulting response + s = redis_copy_bulk(selected_rsp, quorum_fragment, false); + if (s != DN_OK) { + goto cleanup; + } - log_debug(LOG_DEBUG, "%s after appending nargs", print_obj(selected_rsp)); + log_debug(LOG_DEBUG, "response now is %s", print_obj(selected_rsp)); msg_dump(LOG_DEBUG, selected_rsp); - - // array to hold 1 fragment from each response - s = 
array_init(&cloned_rsp_fragment_array, rspmgr->good_responses, - sizeof(struct msg *)); - if (s != DN_OK) - goto cleanup; - - // for every response fragment, try to achieve a quorum - int arg_iter; - for (arg_iter =0; arg_iter < nargs; arg_iter++) { - // carve out one fragment from each response - uint8_t response_iter; - for (response_iter = 0; response_iter < rspmgr->good_responses; response_iter++) { - struct msg *cloned_rsp = *(struct msg **)array_get(&cloned_responses, response_iter); - struct msg *cloned_rsp_fragment = NULL; - s = get_next_response_fragment(cloned_rsp, &cloned_rsp_fragment); - if (s != DN_OK) { - goto cleanup; - } - log_debug(LOG_DEBUG, "Fragment %d of %d, from response(%d of %d) %s", - arg_iter+1, nargs, response_iter+1, rspmgr->good_responses, print_obj(cloned_rsp_fragment)); - msg_dump(LOG_DEBUG, cloned_rsp_fragment); - - struct msg **pdst = (struct msg **)array_push(&cloned_rsp_fragment_array); - *pdst = cloned_rsp_fragment; - } - - // Now that we have 1 fragment from each good response, try to get a quorum on them - struct msg *quorum_fragment = redis_get_fragment_quorum(&cloned_rsp_fragment_array); - if (quorum_fragment == NULL) { - if (rspmgr->msg->consistency == DC_QUORUM) { - log_info("Fragment %d of %d, none of them match, selecting first fragment", - arg_iter+1, nargs); - quorum_fragment = *(struct msg**)array_get(&cloned_rsp_fragment_array, 0); - } else { - s = DN_ERROR; - goto cleanup; - } - } - - log_debug(LOG_DEBUG, "quorum fragment %s", print_obj(quorum_fragment)); - msg_dump(LOG_DEBUG, quorum_fragment); - // Copy that fragment to the resulting response - s = redis_copy_bulk(selected_rsp, quorum_fragment, false); - if (s != DN_OK) { - goto cleanup; - } - - log_debug(LOG_DEBUG, "response now is %s", print_obj(selected_rsp)); - msg_dump(LOG_DEBUG, selected_rsp); - // free the responses in the array - array_each(&cloned_rsp_fragment_array, free_rsp_each); - array_reset(&cloned_rsp_fragment_array); - } -cleanup: - 
array_each(&cloned_responses, free_rsp_each); - array_deinit(&cloned_responses); + // free the responses in the array array_each(&cloned_rsp_fragment_array, free_rsp_each); - array_deinit(&cloned_rsp_fragment_array); - if (s != DN_OK) { - rsp_put(selected_rsp); - selected_rsp = NULL; - } - return selected_rsp; + array_reset(&cloned_rsp_fragment_array); + } +cleanup: + array_each(&cloned_responses, free_rsp_each); + array_deinit(&cloned_responses); + array_each(&cloned_rsp_fragment_array, free_rsp_each); + array_deinit(&cloned_rsp_fragment_array); + if (s != DN_OK) { + rsp_put(selected_rsp); + selected_rsp = NULL; + } + return selected_rsp; } -struct msg * -redis_reconcile_responses(struct response_mgr *rspmgr) -{ - struct msg *selected_rsp = NULL; - if (redis_is_multikey_request(rspmgr->msg)) { - selected_rsp = redis_reconcile_multikey_responses(rspmgr); - } - // if a quorum response was achieved, good, return that. - if (selected_rsp != NULL) - return selected_rsp; - - // No quorum was achieved. - if (rspmgr->msg->consistency == DC_QUORUM) { - log_info("none of the responses match, returning first"); - return rspmgr->responses[0]; - } else { - log_info("none of the responses match, returning error"); - struct msg *rsp = msg_get_error(NULL, DYNOMITE_NO_QUORUM_ACHIEVED, 0); - // There is a case that when 1 out of three nodes are down, the - // response manager has 1 error response and 2 good responses. - // We reach here when the two responses differ and we want to return - // failed to achieve quorum. In this case, free the existing error - // response - if (rspmgr->err_rsp) { - rsp_put(rspmgr->err_rsp); - } - rspmgr->err_rsp = rsp; - rspmgr->error_responses++; - return rsp; +struct msg *redis_reconcile_responses(struct response_mgr *rspmgr) { + struct msg *selected_rsp = NULL; + if (redis_is_multikey_request(rspmgr->msg)) { + selected_rsp = redis_reconcile_multikey_responses(rspmgr); + } + // if a quorum response was achieved, good, return that. 
+ if (selected_rsp != NULL) return selected_rsp; + + // No quorum was achieved. + if (rspmgr->msg->consistency == DC_QUORUM) { + log_info("none of the responses match, returning first"); + return rspmgr->responses[0]; + } else { + log_info("none of the responses match, returning error"); + struct msg *rsp = msg_get_error(NULL, DYNOMITE_NO_QUORUM_ACHIEVED, 0); + // There is a case that when 1 out of three nodes are down, the + // response manager has 1 error response and 2 good responses. + // We reach here when the two responses differ and we want to return + // failed to achieve quorum. In this case, free the existing error + // response + if (rspmgr->err_rsp) { + rsp_put(rspmgr->err_rsp); } + rspmgr->err_rsp = rsp; + rspmgr->error_responses++; + return rsp; + } } diff --git a/src/seedsprovider/dyn_dns.c b/src/seedsprovider/dyn_dns.c index 8e7dbc8bb..bc2ba9269 100644 --- a/src/seedsprovider/dyn_dns.c +++ b/src/seedsprovider/dyn_dns.c @@ -1,140 +1,139 @@ -#include -#include #include -#include -#include #include +#include #include +#include +#include +#include #ifdef __APPLE__ #include #endif -#include "dyn_seeds_provider.h" #include "dyn_core.h" +#include "dyn_seeds_provider.h" #include "dyn_string.h" -// Keep poling DNS server for the TXT record with seeds, same format as for Florida seeds +// Keep poling DNS server for the TXT record with seeds, same format as for +// Florida seeds // // -// DYNOMITE_DNS_TXT_NAME=_dynomite.yourhost.com src/dynomite -c conf/dynomite_dns_single.yml -v 11 +// DYNOMITE_DNS_TXT_NAME=_dynomite.yourhost.com src/dynomite -c +// conf/dynomite_dns_single.yml -v 11 // // To compile the domain use make CFLAGS="-DDNS_TXT_NAME=_dynomite.yourhost.com" // // - #ifndef DNS_TXT_NAME #define DNS_TXT_NAME "_dynomite.ec2-internal" #endif -static char * dnsName = NULL; -static char * dnsType = NULL; +static char *dnsName = NULL; +static char *dnsType = NULL; static int queryType = T_TXT; -static int64_t last = 0; //storing last time for seeds check 
+static int64_t last = 0; // storing last time for seeds check static uint32_t last_seeds_hash = 0; -static bool seeds_check() -{ - int64_t now = dn_msec_now(); +static bool seeds_check() { + int64_t now = dn_msec_now(); - int64_t delta = (int64_t)(now - last); - log_debug(LOG_VERB, "Delta or elapsed time : %lu", delta); - log_debug(LOG_VERB, "Seeds check internal %d", SEEDS_CHECK_INTERVAL); + int64_t delta = (int64_t)(now - last); + log_debug(LOG_VERB, "Delta or elapsed time : %lu", delta); + log_debug(LOG_VERB, "Seeds check internal %d", SEEDS_CHECK_INTERVAL); - if (delta > SEEDS_CHECK_INTERVAL) { - last = now; - return true; - } + if (delta > SEEDS_CHECK_INTERVAL) { + last = now; + return true; + } - return false; + return false; } - -static uint32_t -hash_seeds(uint8_t *seeds, size_t length) -{ - const uint8_t *ptr = seeds; - uint32_t value = 0; - - while (length--) { - uint32_t val = (uint32_t) *ptr++; - value += val; - value += (value << 10); - value ^= (value >> 6); - } - value += (value << 3); - value ^= (value >> 11); - value += (value << 15); - - return value; +static uint32_t hash_seeds(uint8_t *seeds, size_t length) { + const uint8_t *ptr = seeds; + uint32_t value = 0; + + while (length--) { + uint32_t val = (uint32_t)*ptr++; + value += val; + value += (value << 10); + value ^= (value >> 6); + } + value += (value << 3); + value ^= (value >> 11); + value += (value << 15); + + return value; } -uint8_t -dns_get_seeds(struct context * ctx, struct mbuf *seeds_buf) -{ - static int _env_checked = 0; - - if (!_env_checked) { - _env_checked = 1; - dnsName = getenv("DYNOMITE_DNS_NAME"); - if (dnsName == NULL) dnsName = DNS_TXT_NAME; - dnsType = getenv("DYNOMITE_DNS_TYPE"); - if (dnsType != NULL) { if (strcmp(dnsType, "A") == 0) queryType = T_A; } - } - - log_debug(LOG_VVERB, "checking for %s", dnsName); - - if (!seeds_check()) { - return DN_NOOPS; - } - - unsigned char buf[BUFSIZ]; +uint8_t dns_get_seeds(struct context *ctx, struct mbuf *seeds_buf) { + static int 
_env_checked = 0; - int r = res_query(dnsName, C_IN, queryType, buf, sizeof(buf)); - if (r == -1) { - log_debug(LOG_DEBUG, "DNS response for %s: %s", dnsName, hstrerror(h_errno)); - return DN_NOOPS; + if (!_env_checked) { + _env_checked = 1; + dnsName = getenv("DYNOMITE_DNS_NAME"); + if (dnsName == NULL) dnsName = DNS_TXT_NAME; + dnsType = getenv("DYNOMITE_DNS_TYPE"); + if (dnsType != NULL) { + if (strcmp(dnsType, "A") == 0) queryType = T_A; } - if (r >= sizeof(buf)) { - log_debug(LOG_DEBUG, "DNS reply is too large for %s: %d, bufsize: %d", dnsName, r, sizeof(buf)); - return DN_NOOPS; + } + + log_debug(LOG_VVERB, "checking for %s", dnsName); + + if (!seeds_check()) { + return DN_NOOPS; + } + + unsigned char buf[BUFSIZ]; + + int r = res_query(dnsName, C_IN, queryType, buf, sizeof(buf)); + if (r == -1) { + log_debug(LOG_DEBUG, "DNS response for %s: %s", dnsName, + hstrerror(h_errno)); + return DN_NOOPS; + } + if (r >= sizeof(buf)) { + log_debug(LOG_DEBUG, "DNS reply is too large for %s: %d, bufsize: %d", + dnsName, r, sizeof(buf)); + return DN_NOOPS; + } + HEADER *hdr = (HEADER *)buf; + if (hdr->rcode != NOERROR) { + log_debug(LOG_DEBUG, "DNS reply code for %s: %d", dnsName, hdr->rcode); + return DN_NOOPS; + } + int na = ntohs(hdr->ancount); + + ns_msg m; + if (ns_initparse(buf, r, &m) == -1) { + log_debug(LOG_DEBUG, "ns_initparse error for %s: %s", dnsName, + strerror(errno)); + return DN_NOOPS; + } + int i; + ns_rr rr; + for (i = 0; i < na; ++i) { + if (ns_parserr(&m, ns_s_an, i, &rr) == -1) { + log_debug(LOG_DEBUG, "ns_parserr for %s: %s", dnsName, strerror(errno)); + return DN_NOOPS; } - HEADER *hdr = (HEADER*)buf; - if (hdr->rcode != NOERROR) { - log_debug(LOG_DEBUG, "DNS reply code for %s: %d", dnsName, hdr->rcode); - return DN_NOOPS; + mbuf_rewind(seeds_buf); + unsigned char *s = ns_rr_rdata(rr); + if (s[0] >= ns_rr_rdlen(rr)) { + log_debug(LOG_DEBUG, "invalid length for %s: %d < %d", dnsName, s[0], + ns_rr_rdlen(rr)); + return DN_NOOPS; } - int na = 
ntohs(hdr->ancount); - - ns_msg m; - if (ns_initparse(buf, r, &m) == -1) { - log_debug(LOG_DEBUG, "ns_initparse error for %s: %s", dnsName, strerror(errno)); - return DN_NOOPS; - } - int i; - ns_rr rr; - for (i = 0; i < na; ++i) { - if (ns_parserr(&m, ns_s_an, i, &rr) == -1) { - log_debug(LOG_DEBUG, "ns_parserr for %s: %s", dnsName, strerror (errno)); - return DN_NOOPS; - } - mbuf_rewind(seeds_buf); - unsigned char *s = ns_rr_rdata(rr); - if (s[0] >= ns_rr_rdlen(rr)) { - log_debug(LOG_DEBUG, "invalid length for %s: %d < %d", dnsName, s[0], ns_rr_rdlen(rr)); - return DN_NOOPS; - } - log_debug(LOG_VERB, "seeds for %s: %.*s", dnsName, s[0], s +1); - mbuf_copy(seeds_buf, s + 1, s[0]); - } - - uint32_t seeds_hash = hash_seeds(seeds_buf->pos, mbuf_length(seeds_buf)); - if (last_seeds_hash != seeds_hash) { - last_seeds_hash = seeds_hash; - } else { - return DN_NOOPS; - } - return DN_OK; + log_debug(LOG_VERB, "seeds for %s: %.*s", dnsName, s[0], s + 1); + mbuf_copy(seeds_buf, s + 1, s[0]); + } + + uint32_t seeds_hash = hash_seeds(seeds_buf->pos, mbuf_length(seeds_buf)); + if (last_seeds_hash != seeds_hash) { + last_seeds_hash = seeds_hash; + } else { + return DN_NOOPS; + } + return DN_OK; } - - diff --git a/src/seedsprovider/dyn_florida.c b/src/seedsprovider/dyn_florida.c index 088d7202c..3fa1509ee 100644 --- a/src/seedsprovider/dyn_florida.c +++ b/src/seedsprovider/dyn_florida.c @@ -1,11 +1,11 @@ -#include -#include #include -#include #include +#include +#include +#include -#include "dyn_seeds_provider.h" #include "dyn_core.h" +#include "dyn_seeds_provider.h" #include "dyn_string.h" /*************************************************************************** @@ -26,205 +26,206 @@ #endif #ifndef FLORIDA_REQUEST -#define FLORIDA_REQUEST "GET /REST/v1/admin/get_seeds HTTP/1.0\r\nHost: 127.0.0.1\r\nUser-Agent: HTMLGET 1.0\r\n\r\n"; +#define FLORIDA_REQUEST \ + "GET /REST/v1/admin/get_seeds HTTP/1.0\r\nHost: 127.0.0.1\r\nUser-Agent: " \ + "HTMLGET 1.0\r\n\r\n"; #endif -static 
char * floridaIp = NULL; -static int floridaPort = NULL; -static char * request = NULL; -static int isOsVarEval = 0; +static char *floridaIp = NULL; +static int floridaPort = NULL; +static char *request = NULL; +static int isOsVarEval = 0; static void evalOSVar(); static uint32_t create_tcp_socket(); -static int64_t last = 0; //storing last time for seeds check +static int64_t last = 0; // storing last time for seeds check static uint32_t last_seeds_hash = 0; -static void evalOSVar(){ - if (isOsVarEval==0){ - request = (getenv("DYNOMITE_FLORIDA_REQUEST")!=NULL) ? getenv("DYNOMITE_FLORIDA_REQUEST") : FLORIDA_REQUEST; - floridaPort = (getenv("DYNOMITE_FLORIDA_PORT")!=NULL) ? atoi(getenv("DYNOMITE_FLORIDA_PORT")) : FLORIDA_PORT; - floridaIp = (getenv("DYNOMITE_FLORIDA_IP")!=NULL) ? getenv("DYNOMITE_FLORIDA_IP") : FLORIDA_IP; - isOsVarEval = 1; +static void evalOSVar() { + if (isOsVarEval == 0) { + request = (getenv("DYNOMITE_FLORIDA_REQUEST") != NULL) + ? getenv("DYNOMITE_FLORIDA_REQUEST") + : FLORIDA_REQUEST; + floridaPort = (getenv("DYNOMITE_FLORIDA_PORT") != NULL) + ? atoi(getenv("DYNOMITE_FLORIDA_PORT")) + : FLORIDA_PORT; + floridaIp = (getenv("DYNOMITE_FLORIDA_IP") != NULL) + ? 
getenv("DYNOMITE_FLORIDA_IP") + : FLORIDA_IP; + isOsVarEval = 1; } } -static bool seeds_check() -{ - msec_t now = dn_msec_now(); +static bool seeds_check() { + msec_t now = dn_msec_now(); - int64_t delta = (int64_t)(now - last); - log_debug(LOG_VERB, "Delta or elapsed time : %lu", delta); - log_debug(LOG_VERB, "Seeds check internal %d", SEEDS_CHECK_INTERVAL); + int64_t delta = (int64_t)(now - last); + log_debug(LOG_VERB, "Delta or elapsed time : %lu", delta); + log_debug(LOG_VERB, "Seeds check internal %d", SEEDS_CHECK_INTERVAL); - if (delta > SEEDS_CHECK_INTERVAL) { - last = now; - return true; - } + if (delta > SEEDS_CHECK_INTERVAL) { + last = now; + return true; + } - return false; + return false; } +static uint32_t hash_seeds(uint8_t *seeds, size_t length) { + const uint8_t *ptr = seeds; + uint32_t value = 0; -static uint32_t -hash_seeds(uint8_t *seeds, size_t length) -{ - const uint8_t *ptr = seeds; - uint32_t value = 0; - - while (length--) { - uint32_t val = (uint32_t) *ptr++; - value += val; - value += (value << 10); - value ^= (value >> 6); - } - value += (value << 3); - value ^= (value >> 11); - value += (value << 15); + while (length--) { + uint32_t val = (uint32_t)*ptr++; + value += val; + value += (value << 10); + value ^= (value >> 6); + } + value += (value << 3); + value ^= (value >> 11); + value += (value << 15); - return value; + return value; } -uint8_t -florida_get_seeds(struct context * ctx, struct mbuf *seeds_buf) { - - evalOSVar(); +uint8_t florida_get_seeds(struct context *ctx, struct mbuf *seeds_buf) { + evalOSVar(); - struct sockaddr_in *remote; - uint32_t sock; - int32_t tmpres; - uint8_t buf[BUFSIZ + 1]; + struct sockaddr_in *remote; + uint32_t sock; + int32_t tmpres; + uint8_t buf[BUFSIZ + 1]; - log_debug(LOG_VVERB, "Running florida_get_seeds!"); + log_debug(LOG_VVERB, "Running florida_get_seeds!"); - if (!seeds_check()) { - return DN_NOOPS; - } + if (!seeds_check()) { + return DN_NOOPS; + } - sock = create_tcp_socket(); - if (sock == 
-1) { - log_debug(LOG_VVERB, "Unable to create a socket"); - return DN_ERROR; - } + sock = create_tcp_socket(); + if (sock == -1) { + log_debug(LOG_VVERB, "Unable to create a socket"); + return DN_ERROR; + } - remote = (struct sockaddr_in *) dn_alloc(sizeof(struct sockaddr_in *)); - remote->sin_family = AF_INET; - tmpres = inet_pton(AF_INET, floridaIp, (void *)(&(remote->sin_addr.s_addr))); - remote->sin_port = htons(floridaPort); + remote = (struct sockaddr_in *)dn_alloc(sizeof(struct sockaddr_in *)); + remote->sin_family = AF_INET; + tmpres = inet_pton(AF_INET, floridaIp, (void *)(&(remote->sin_addr.s_addr))); + remote->sin_port = htons(floridaPort); - if(connect(sock, (struct sockaddr *)remote, sizeof(struct sockaddr)) < 0) { - log_debug(LOG_VVERB, "Unable to connect the destination"); - return DN_ERROR; - } + if (connect(sock, (struct sockaddr *)remote, sizeof(struct sockaddr)) < 0) { + log_debug(LOG_VVERB, "Unable to connect the destination"); + return DN_ERROR; + } - uint32_t sent = 0; - while(sent < dn_strlen(request)) - { - tmpres = send(sock, request+sent, dn_strlen(request)-sent, 0); - if(tmpres == -1){ - log_debug(LOG_VVERB, "Unable to send query"); - close(sock); - dn_free(remote); - return DN_ERROR; - } - sent += tmpres; + uint32_t sent = 0; + while (sent < dn_strlen(request)) { + tmpres = send(sock, request + sent, dn_strlen(request) - sent, 0); + if (tmpres == -1) { + log_debug(LOG_VVERB, "Unable to send query"); + close(sock); + dn_free(remote); + return DN_ERROR; } + sent += tmpres; + } - mbuf_rewind(seeds_buf); - - memset(buf, 0, sizeof(buf)); - uint32_t htmlstart = 0; - uint8_t * htmlcontent; - uint8_t *ok = NULL; - - bool socket_has_data = true; - uint32_t rx_total = 0; - - while (socket_has_data) { - // Read socket data until we get them all or RX buffer becomes full - while ((rx_total < BUFSIZ) && (tmpres = recv(sock, buf + rx_total, BUFSIZ - rx_total, 0)) > 0) { - rx_total += tmpres; - } - - // Look for a OK response in the first buffer 
output. - if (!ok) - ok = (uint8_t *) strstr((char *)buf, "200 OK\r\n"); - if (ok == NULL) { - log_error("Received Error from Florida while getting seeds"); - loga_hexdump(buf, rx_total, "Florida Response with %ld bytes of data", rx_total); - close(sock); - dn_free(remote); - return DN_ERROR; - } - - if (htmlstart == 0) { - htmlcontent = (uint8_t *) strstr((char *)buf, "\r\n\r\n"); - if(htmlcontent != NULL) { - htmlstart = 1; - htmlcontent += 4; - } - } else { - htmlcontent = buf; - } - - if (htmlstart) { - mbuf_copy(seeds_buf, htmlcontent, rx_total - (htmlcontent - buf)); - } - - // If socket still has data for reading - if (tmpres > 0) { - if ((htmlstart == 0) && (rx_total >= 3)) { - /* In some corner cases (eg. when the read buffer size is near to the - * response header size) we can get into a situations when 4-bytes html - * content start sequence '\r\n\r\n' splits between two read iterations. - * To deal with this case the easiest way to restore splitted sequence - * before the next read iteration by move 3 last bytes (3 is enough to - * cover all split variants) from the current read iteration to the buffer - * head. - * Please notice, we repeat this step until html content is found. - */ - memcpy(buf, buf + (rx_total - 3) , 3); - memset(buf + 3, 0, rx_total - 3); - rx_total = 3; - } else { - memset(buf, 0, rx_total); - rx_total = 0; - } - } else { - socket_has_data = false; - } - } + mbuf_rewind(seeds_buf); + + memset(buf, 0, sizeof(buf)); + uint32_t htmlstart = 0; + uint8_t *htmlcontent; + uint8_t *ok = NULL; - if(tmpres < 0) { - log_debug(LOG_VVERB, "Error receiving data"); + bool socket_has_data = true; + uint32_t rx_total = 0; + + while (socket_has_data) { + // Read socket data until we get them all or RX buffer becomes full + while ((rx_total < BUFSIZ) && + (tmpres = recv(sock, buf + rx_total, BUFSIZ - rx_total, 0)) > 0) { + rx_total += tmpres; } - close(sock); - dn_free(remote); + // Look for a OK response in the first buffer output. 
+ if (!ok) ok = (uint8_t *)strstr((char *)buf, "200 OK\r\n"); + if (ok == NULL) { + log_error("Received Error from Florida while getting seeds"); + loga_hexdump(buf, rx_total, "Florida Response with %ld bytes of data", + rx_total); + close(sock); + dn_free(remote); + return DN_ERROR; + } - if (mbuf_length(seeds_buf) == 0) { - log_error("No seeds were found in Florida response (htmlstart %u)", htmlstart); - return DN_ERROR; + if (htmlstart == 0) { + htmlcontent = (uint8_t *)strstr((char *)buf, "\r\n\r\n"); + if (htmlcontent != NULL) { + htmlstart = 1; + htmlcontent += 4; + } + } else { + htmlcontent = buf; } - uint32_t seeds_hash = hash_seeds(seeds_buf->pos, mbuf_length(seeds_buf)); + if (htmlstart) { + mbuf_copy(seeds_buf, htmlcontent, rx_total - (htmlcontent - buf)); + } - if (last_seeds_hash != seeds_hash) { - last_seeds_hash = seeds_hash; + // If socket still has data for reading + if (tmpres > 0) { + if ((htmlstart == 0) && (rx_total >= 3)) { + /* In some corner cases (eg. when the read buffer size is near to the + * response header size) we can get into a situations when 4-bytes html + * content start sequence '\r\n\r\n' splits between two read iterations. + * To deal with this case the easiest way to restore splitted sequence + * before the next read iteration by move 3 last bytes (3 is enough to + * cover all split variants) from the current read iteration to the + * buffer head. Please notice, we repeat this step until html content is + * found. 
+ */ + memcpy(buf, buf + (rx_total - 3), 3); + memset(buf + 3, 0, rx_total - 3); + rx_total = 3; + } else { + memset(buf, 0, rx_total); + rx_total = 0; + } } else { - return DN_NOOPS; + socket_has_data = false; } + } - return DN_OK; -} + if (tmpres < 0) { + log_debug(LOG_VVERB, "Error receiving data"); + } + close(sock); + dn_free(remote); -uint32_t create_tcp_socket() -{ - uint32_t sock; - if((sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) { - log_debug(LOG_VVERB, "Unable to create TCP socket"); - return DN_ERROR; - } - return sock; + if (mbuf_length(seeds_buf) == 0) { + log_error("No seeds were found in Florida response (htmlstart %u)", + htmlstart); + return DN_ERROR; + } + + uint32_t seeds_hash = hash_seeds(seeds_buf->pos, mbuf_length(seeds_buf)); + + if (last_seeds_hash != seeds_hash) { + last_seeds_hash = seeds_hash; + } else { + return DN_NOOPS; + } + + return DN_OK; +} + +uint32_t create_tcp_socket() { + uint32_t sock; + if ((sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) { + log_debug(LOG_VVERB, "Unable to create TCP socket"); + return DN_ERROR; + } + return sock; } diff --git a/src/seedsprovider/dyn_seeds_provider.h b/src/seedsprovider/dyn_seeds_provider.h index 182bf2f3d..ae589d233 100644 --- a/src/seedsprovider/dyn_seeds_provider.h +++ b/src/seedsprovider/dyn_seeds_provider.h @@ -1,16 +1,14 @@ -#include "dyn_core.h" - - #ifndef _DYN_SEEDS_PROVIDER_H_ #define _DYN_SEEDS_PROVIDER_H_ +#define SEEDS_CHECK_INTERVAL (30 * 1000) /* in msec */ -#define SEEDS_CHECK_INTERVAL (30 * 1000) /* in msec */ - - -uint8_t florida_get_seeds(struct context * ctx, struct mbuf *seeds_buf); -uint8_t dns_get_seeds(struct context * ctx, struct mbuf *seeds_buf); +// Forward declarations +struct context; +struct mbuf; +uint8_t florida_get_seeds(struct context *ctx, struct mbuf *seeds_buf); +uint8_t dns_get_seeds(struct context *ctx, struct mbuf *seeds_buf); #endif /* DYN_SEEDS_PROVIDER_H_ */ diff --git a/src/tools/dyn_hash_tool.c b/src/tools/dyn_hash_tool.c index 
917d66fbc..920e95eda 100644 --- a/src/tools/dyn_hash_tool.c +++ b/src/tools/dyn_hash_tool.c @@ -1,135 +1,138 @@ -#include +#include #include -#include +#include +#include + +#include "../dyn_log.h" +#include "../hashkit/dyn_token.h" + static struct option long_options[] = { - { "help", no_argument, NULL, 'h' }, - { "outputkey", no_argument, NULL, 'k' }, - { "tokenfile", required_argument, NULL, 'o' }, - { "keyfile", required_argument, NULL, 'i' }, - { NULL, 0, NULL, 0 } -}; + {"help", no_argument, NULL, 'h'}, + {"outputkey", no_argument, NULL, 'k'}, + {"tokenfile", required_argument, NULL, 'o'}, + {"keyfile", required_argument, NULL, 'i'}, + {NULL, 0, NULL, 0}}; static char short_options[] = "hki:o:"; bool outputkey = false; -char * key_filename = NULL; -char * token_filename = NULL; -static void -print_usage() -{ - printf("Usage: dyno-hash-tool [-hk] -i -o \n"); - printf("Read a key from input, hash key to create token, then output token to output.\n"); - printf(" -i or '-' for stdin (default)\n"); - printf(" -o or '-' for stdout (default)\n\n"); - printf("Options:\n"); - printf(" -h, --help : this help\n"); - printf(" -k : include key in output\n\n"); - printf("Read a key from the input file and output the token to the output file. Default\n"); - printf("input and output are stdin and stdout, respectively. Input file must list one\n"); - printf("key per line. Output file will list one token per line that matches the line\n"); - printf("number of the input key.\n\n"); - printf("If the '-k' option is specified, then keys and tokens will output on alternating\n"); - printf("rows. 
Each key has 'KEY:' appended to the beginning of the line\n\n"); - printf("WARNING: dyno-hash-tool ONLY SUPPORTS MURMUR HASH (CURRENTLY)\n\n"); +char *key_filename = NULL; +char *token_filename = NULL; +static void print_usage() { + printf("Usage: dyno-hash-tool [-hk] -i -o \n"); + printf( + "Read a key from input, hash key to create token, then output token to " + "output.\n"); + printf(" -i or '-' for stdin (default)\n"); + printf(" -o or '-' for stdout (default)\n\n"); + printf("Options:\n"); + printf(" -h, --help : this help\n"); + printf(" -k : include key in output\n\n"); + printf( + "Read a key from the input file and output the token to the output file. " + "Default\n"); + printf( + "input and output are stdin and stdout, respectively. Input file must " + "list one\n"); + printf( + "key per line. Output file will list one token per line that matches the " + "line\n"); + printf("number of the input key.\n\n"); + printf( + "If the '-k' option is specified, then keys and tokens will output on " + "alternating\n"); + printf("rows. 
Each key has 'KEY:' appended to the beginning of the line\n\n"); + printf("WARNING: dyno-hash-tool ONLY SUPPORTS MURMUR HASH (CURRENTLY)\n\n"); } -static int -dn_get_options(int argc, char **argv) -{ - int c, value; - - opterr = 0; - - for (;;) { - c = getopt_long(argc, argv, short_options, long_options, NULL); - if (c == -1) { - /* no more options */ - break; - } - - switch (c) { - case 'h': - print_usage(); - return 1; - - case 'k': - outputkey = true; - break; - - case 'i': - key_filename = optarg; - break; +static int dn_get_options(int argc, char **argv) { + int c, value; - case 'o': - token_filename = optarg; - break; + opterr = 0; - default: - printf("dynomite: invalid option -- '%c'", optopt); - return 1; - - } + for (;;) { + c = getopt_long(argc, argv, short_options, long_options, NULL); + if (c == -1) { + /* no more options */ + break; } - return 0; -} + switch (c) { + case 'h': + print_usage(); + return 1; + case 'k': + outputkey = true; + break; -int main(int argc, char **argv) -{ - log_init(5, NULL); - int ret = dn_get_options(argc, argv); - if (ret) - exit(EINVAL); - if (!key_filename) - key_filename = "-"; - if (!token_filename) - token_filename = "-"; - FILE *ifp, *ofp; - char *line = NULL; - size_t len = 0; - ssize_t read; - - if (!strcmp(key_filename, "-")) - ifp = stdin; - else { - log_debug(LOG_VERB, "opening input stream %s", key_filename); - ifp = fopen(key_filename, "r"); - } + case 'i': + key_filename = optarg; + break; - if (ifp == NULL) { - log_error("could not open input stream"); - exit(EXIT_FAILURE); - } + case 'o': + token_filename = optarg; + break; - if (!strcmp(token_filename, "-")) - ofp = stdout; - else { - log_debug(LOG_VERB, "opening output stream %s", key_filename); - ofp = fopen(token_filename, "w"); + default: + printf("dynomite: invalid option -- '%c'", optopt); + return 1; } + } - if (ofp == NULL) { - log_error("could not open input stream"); - exit(EXIT_FAILURE); - } + return 0; +} - while ((read = getline(&line, &len, 
ifp)) != -1) { - if (line[read-1] == '\n') { - line[read-1] = '\0'; - read--; - } - struct dyn_token d; - init_dyn_token(&d); - hash_murmur(line, read, &d); - log_debug(LOG_VERB, "KEY (%s) Token: %lu", line, *d.mag); - if (outputkey) - fprintf(ofp, "KEY:%s\n", line); - fprintf(ofp, "%lu\n", *d.mag); +int main(int argc, char **argv) { + log_init(5, NULL); + int ret = dn_get_options(argc, argv); + if (ret) exit(EINVAL); + if (!key_filename) key_filename = "-"; + if (!token_filename) token_filename = "-"; + FILE *ifp, *ofp; + char *line = NULL; + size_t len = 0; + ssize_t read; + + if (!strcmp(key_filename, "-")) + ifp = stdin; + else { + log_debug(LOG_VERB, "opening input stream %s", key_filename); + ifp = fopen(key_filename, "r"); + } + + if (ifp == NULL) { + log_error("could not open input stream"); + exit(EXIT_FAILURE); + } + + if (!strcmp(token_filename, "-")) + ofp = stdout; + else { + log_debug(LOG_VERB, "opening output stream %s", key_filename); + ofp = fopen(token_filename, "w"); + } + + if (ofp == NULL) { + log_error("could not open input stream"); + exit(EXIT_FAILURE); + } + + while ((read = getline(&line, &len, ifp)) != -1) { + if (line[read - 1] == '\n') { + line[read - 1] = '\0'; + read--; } - - fclose(ofp); - fclose(ifp); - free(line); - exit(EXIT_SUCCESS); + struct dyn_token d; + init_dyn_token(&d); + hash_murmur(line, read, &d); + log_debug(LOG_VERB, "KEY (%s) Token: %lu", line, *d.mag); + if (outputkey) fprintf(ofp, "KEY:%s\n", line); + fprintf(ofp, "%lu\n", *d.mag); + } + + fclose(ofp); + fclose(ifp); + free(line); + exit(EXIT_SUCCESS); }