Optimize attention kernel #1228

Merged (46 commits, Nov 15, 2023)

Commits
f4ca263  change layout (xinhaoc, Oct 11, 2023)
c868e8f  main change (xinhaoc, Oct 12, 2023)
38be2c0  fix (xinhaoc, Oct 12, 2023)
c076719  change spec&tree kernel (xinhaoc, Oct 12, 2023)
d71cf29  fix tp (xinhaoc, Oct 12, 2023)
5ee8587  fix (xinhaoc, Oct 12, 2023)
ffa5168  fix multi requests (xinhaoc, Oct 12, 2023)
552d49f  replicate key&value (xinhaoc, Oct 13, 2023)
23f5891  ci (xinhaoc, Oct 14, 2023)
e31249a  cleanup&hip (xinhaoc, Oct 14, 2023)
c9b4ed3  more fix. (xinhaoc, Oct 14, 2023)
11edf85  ci (xinhaoc, Oct 14, 2023)
6e70280  Merge branch 'flexflow:inference' into optimize_attn_v2 (xinhaoc, Oct 15, 2023)
8ceaf41  new kernel (xinhaoc, Oct 19, 2023)
90ebe10  draft (xinhaoc, Oct 19, 2023)
64710ed  fix (xinhaoc, Oct 20, 2023)
2613ffe  align inc (xinhaoc, Oct 22, 2023)
509a86e  Merge branch 'inference' into optimize_attn_v2 (xinhaoc, Oct 23, 2023)
804c580  fix (xinhaoc, Oct 23, 2023)
c6011f9  . (xinhaoc, Oct 23, 2023)
add285b  multi batch (xinhaoc, Oct 23, 2023)
f572737  fix (xinhaoc, Oct 24, 2023)
0aec1b6  fix (xinhaoc, Oct 24, 2023)
5d2dbbd  fix different thread per key case (xinhaoc, Oct 24, 2023)
20b2b2b  Merge branch 'inference' into optimize_attn_v2 (xinhaoc, Oct 24, 2023)
4dbd31b  fix (xinhaoc, Oct 24, 2023)
a53ff87  . (xinhaoc, Oct 24, 2023)
02e4fad  . (xinhaoc, Oct 29, 2023)
90caa1a  . (xinhaoc, Oct 29, 2023)
23bf953  fix. (xinhaoc, Oct 30, 2023)
09a4fb2  fix. (xinhaoc, Oct 30, 2023)
abbc8eb  . (xinhaoc, Oct 31, 2023)
7b05643  . (xinhaoc, Oct 31, 2023)
305b681  .. (xinhaoc, Nov 1, 2023)
871df6e  opt (xinhaoc, Nov 2, 2023)
87a294e  fix half (xinhaoc, Nov 5, 2023)
a9e75e5  fix. (xinhaoc, Nov 5, 2023)
dd66b85  Merge branch 'inference' into optimize_attn_v2 (xinhaoc, Nov 5, 2023)
59d3dba  Merge branch 'inference' into optimize_attn_v2 (xinhaoc, Nov 6, 2023)
f8ec408  . (xinhaoc, Nov 7, 2023)
251a99c  Merge branch 'optimize_attn_v2' of https://github.com/xinhaoc/FlexFlo… (xinhaoc, Nov 7, 2023)
e112b25  hip (xinhaoc, Nov 8, 2023)
0aab1e3  clean (xinhaoc, Nov 8, 2023)
e6a6b0e  Merge branch 'inference' into optimize_attn_v2 (jiazhihao, Nov 10, 2023)
99bc9b5  Merge branch 'inference' into optimize_attn_v2 (xinhaoc, Nov 10, 2023)
1780965  Merge pull request #1227 from xinhaoc/optimize_attn_v2 (xinhaoc, Nov 11, 2023)

Changes from all commits
include/flexflow/batch_config.h (3 additions, 0 deletions)

@@ -59,6 +59,9 @@ class BatchConfig {

   // Set by update
   int num_tokens;
+  // Number of generation-phase (incremental-decoding) tokens in this batch;
+  // num_tokens - num_prompt_tokens = num_generation_tokens.
+  int num_generation_tokens;

   struct PerRequestInfo {
     int first_token_depth_in_request;
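The new counter records how many tokens in the batch are in the generation (incremental-decoding) phase rather than the prompt phase. A minimal sketch of how a batch builder could populate it, assuming a hypothetical helper and a local num_prompt_tokens count, neither of which is part of this PR:

// Sketch only: fill in the new BatchConfig field when assembling a batch.
// num_prompt_tokens is a hypothetical count of prompt-phase tokens.
void set_generation_token_count(BatchConfig &bc, int num_prompt_tokens) {
  // Everything after the prompt-phase tokens belongs to the generation phase,
  // matching the header comment: num_tokens - num_prompt_tokens = num_generation_tokens.
  bc.num_generation_tokens = bc.num_tokens - num_prompt_tokens;
}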
include/flexflow/ops/inc_multihead_self_attention.h (6 additions, 4 deletions)

@@ -29,7 +29,7 @@ class IncMultiHeadSelfAttention : public Op {

   IncMultiHeadSelfAttention(FFModel &model,
                             LayerID const &layer_guid,
-                            const ParallelTensor _input,
+                            ParallelTensor const _input,
                             int _embed_dim,
                             int _num_q_heads,
                             int _num_kv_heads,
@@ -50,8 +50,8 @@ class IncMultiHeadSelfAttention : public Op {
                             int _tensor_parallelism_degree,
                             char const *name);
   IncMultiHeadSelfAttention(FFModel &model,
-                            const ParallelTensor _input,
-                            const ParallelTensor _weight,
+                            ParallelTensor const _input,
+                            ParallelTensor const _weight,
                             int _embed_dim,
                             int _num_q_heads,
                             int _num_kv_heads,
@@ -73,7 +73,7 @@ class IncMultiHeadSelfAttention : public Op {
                             char const *name);
   IncMultiHeadSelfAttention(FFModel &model,
                             IncMultiHeadSelfAttention const &other,
-                            const ParallelTensor input,
+                            ParallelTensor const input,
                             bool allocate_weights);
   IncMultiHeadSelfAttention(FFModel &model,
                             Params const &params,
@@ -192,9 +192,11 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
   void *attn_heads;
   char *quantized_weight_ptr;
   BatchConfig::PerTokenInfo *token_infos;
+  BatchConfig::PerRequestInfo *request_infos;
   DataType quantization_type;
   bool offload;
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
+  // cudaStream_t task_local_stream;
   cudnnTensorDescriptor_t qk_tensor;
   cuFloatComplex *complex_input;
 #elif defined(FF_USE_HIP_ROCM)
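Besides the const-placement cleanup on the constructor signatures, the substantive change here is the new device-side request_infos buffer, which gives the attention kernels per-request metadata alongside the existing per-token metadata. A hedged sketch of how that buffer might be refreshed each step, mirroring how token_infos is typically uploaded; the requestsInfo host array, the MAX_NUM_REQUESTS constant, and the helper name are assumptions, not code from this PR:

// Sketch only: copy host-side per-request metadata to the device buffer that the
// attention kernels read. Names other than request_infos are illustrative.
void upload_request_infos(IncMultiHeadSelfAttentionMeta *m,
                          BatchConfig const *bc,
                          cudaStream_t stream) {
  cudaMemcpyAsync(m->request_infos,
                  bc->requestsInfo, // assumed host-side PerRequestInfo array
                  sizeof(BatchConfig::PerRequestInfo) * BatchConfig::MAX_NUM_REQUESTS,
                  cudaMemcpyHostToDevice,
                  stream);
}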
@@ -14,6 +14,22 @@ namespace FlexFlow {
 namespace Kernels {
 namespace IncMultiHeadAttention {

+template <typename DT>
+void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m,
+                                         BatchConfig const *bc,
+                                         DT *output_ptr,
+                                         ffStream_t stream);
+
+template <typename DT>
+void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
+                         BatchConfig const *bc,
+                         int shard_id,
+                         DT *output_ptr,
+                         DT const *weight_ptr,
+                         DT const *bias_ptr,
+                         int num_tokens,
+                         ffStream_t stream);
+
 template <typename DT>
 __global__ void apply_position_bias_qkprd(DT *input_ptr,
                                           int num_tokens,
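These two declarations split the decoding forward pass into a fused attention kernel for generation-phase tokens and a separate output projection. A hedged sketch of how a caller might dispatch to them, assuming a hypothetical inference_kernel wrapper around the BatchConfig fields shown earlier; this is not the PR's actual control flow:

// Sketch only: possible dispatch around the new declarations.
template <typename DT>
void inference_kernel(IncMultiHeadSelfAttentionMeta const *m,
                      BatchConfig const *bc,
                      int shard_id,
                      DT const *weight_ptr,
                      DT const *bias_ptr,
                      DT *output_ptr,
                      ffStream_t stream) {
  // ... QKV projection and KV-cache update for prompt-phase tokens omitted ...
  if (bc->num_generation_tokens > 0) {
    // Decode-phase tokens (one query position per request) take the fused
    // generation kernel added by this PR.
    compute_attention_kernel_generation<DT>(m, bc, output_ptr, stream);
  }
  // Project attention heads back to the model dimension and add the bias.
  compute_o_prod_bias<DT>(
      m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, bc->num_tokens, stream);
}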