
Commit

update
goliaro committed Dec 8, 2024
1 parent 2d7910b commit ab0b209
Showing 26 changed files with 360,070 additions and 329,380 deletions.
220,451 changes: 113,680 additions & 106,771 deletions ...sts_per_batch_8_max_tokens_per_batch_128_arrival_rate_0.000000_num_warmup_requests_10.csv

Large diffs are not rendered by default.

193,416 changes: 105,280 additions & 88,136 deletions ...sts_per_batch_8_max_tokens_per_batch_256_arrival_rate_0.000000_num_warmup_requests_10.csv

Large diffs are not rendered by default.

181,603 changes: 101,040 additions & 80,563 deletions ...sts_per_batch_8_max_tokens_per_batch_512_arrival_rate_0.000000_num_warmup_requests_10.csv

Large diffs are not rendered by default.


149 changes: 130 additions & 19 deletions benchmarking/get_sharegpt_trace.py
@@ -3,10 +3,15 @@
import os
import random
import requests
import argparse
from tqdm.asyncio import tqdm
from typing import List, Optional
from collections import OrderedDict
from transformers import AutoTokenizer
import pandas as pd
from math import ceil
from random import uniform
import numpy as np

SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"

@@ -16,6 +21,7 @@ class TraceEntry:
response: str
prompt_length: int
response_length: int
arrival_time: float

@dataclass
class TraceMetadata:
@@ -28,11 +34,87 @@ class TraceMetadata:
min_response_length: int
avg_response_length: float
max_total_length: int
trace_type: str
arrival_rate: float

@dataclass
class Trace:
entries: List[TraceEntry] = field(default_factory=list)
metadata: TraceMetadata = field(default_factory=lambda: TraceMetadata(0, 0, 0, 0, 0, 0, 0, 0,0))
metadata: TraceMetadata = field(default_factory=lambda: TraceMetadata(0, 0, 0, 0, 0, 0, 0, 0, 0, "offline", 0.0))
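A `TraceEntry` now carries each request's arrival time as an extra field. For illustration only, a tiny construction with made-up values, following the positional order used by the `TraceEntry(...)` calls later in `build_trace`:

```python
# Hypothetical entry: the token counts and arrival time are made-up values,
# and the positional order mirrors the TraceEntry(...) calls in build_trace.
entry = TraceEntry(
    "What is the capital of France?",   # prompt
    "The capital of France is Paris.",  # response
    8,                                   # prompt_length (tokens)
    9,                                   # response_length (tokens)
    0.5,                                 # arrival_time (seconds since trace start)
)
```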

def generate_arrival_rates_splitwise(n, target_arrival_rate_sec, seed):
    def get_splitwise_trace(trace_type="conv"):
        # Import Microsoft LLM 1 hour trace
        df_trace = pd.read_csv("https://raw.githubusercontent.com/Azure/AzurePublicDataset/master/data/AzureLLMInferenceTrace_"+trace_type+".csv", parse_dates=["TIMESTAMP"])
        req_times = (pd.to_datetime(df_trace["TIMESTAMP"]).astype(int)//1000) # Timestamps are in microseconds
        req_times = req_times - req_times.min()
        req_times = req_times.tolist()
        return req_times

    debug_verbose = True
    req_times = get_splitwise_trace()

    np.random.seed(seed)
    random.seed(seed)

    microsec = 1000000
    # Requests per second. Computed this way to keep the numbers at a reasonable order of magnitude.
    avg_arrival_rate = len(req_times) / (req_times[-1]/float(microsec))
    if debug_verbose:
        print("Avg arrival rate of original trace (req/s): ", avg_arrival_rate)
    scale_factor = float(target_arrival_rate_sec) / avg_arrival_rate
    if debug_verbose:
        print("Scale factor to obtain target arrival rate: ", scale_factor)

    # Buckets are 1-second timeframes
    nb_buckets = ceil(req_times[-1] / microsec)
    j = 0
    # print("Number of buckets: ", nb_buckets)
    bucket_sizes = []
    for i in range(nb_buckets):
        bucket_size = 0
        while j < len(req_times) and req_times[j] >= i*microsec and req_times[j] < (i+1)*microsec:
            bucket_size += 1
            j += 1
        bucket_size = bucket_size*scale_factor
        prob = bucket_size - int(bucket_size)
        bucket_size = int(bucket_size) + int(uniform(0, 1) <= prob)
        bucket_sizes.append(bucket_size)

    arrival_times = []
    for arrival_time, num_requests in enumerate(bucket_sizes):
        for i in range(num_requests):
            arrival_times.append(arrival_time)
    if len(arrival_times) > n:
        arrival_times = arrival_times[:n]
    elif len(arrival_times) < n:
        raise RuntimeError(f"Error: not enough buckets ({nb_buckets}) in scaled trace to generate arrival times for all requests ({n})")
    return arrival_times
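The fractional bucket sizes above are rounded stochastically so that the expected number of requests per bucket matches the scaled value. A minimal standalone sketch with made-up numbers:

```python
from random import uniform

# Made-up example: a 1-second bucket held 3 requests in the original trace,
# and the scale factor is 0.6, so the scaled size is about 1.8 requests.
scaled = 3 * 0.6                 # ~1.8
prob = scaled - int(scaled)      # ~0.8 -> probability of rounding up
bucket_size = int(scaled) + int(uniform(0, 1) <= prob)
print(bucket_size)               # 1 or 2; expected value is 1*0.2 + 2*0.8 = 1.8
```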

def generate_poisson_arrivals(n, target_arrival_rate_sec, seed):
    """
    Generate arrival times for n requests following a Poisson process.

    Parameters:
        n (int): Number of requests to generate
        target_arrival_rate_sec (float): Average arrival rate (requests per second)
        seed (int): Random seed for reproducibility

    Returns:
        numpy.ndarray: Array of arrival times in seconds
    """
    np.random.seed(seed)
    random.seed(seed)

    # Generate n exponentially distributed inter-arrival times:
    # for a Poisson process, inter-arrival times follow an exponential distribution
    inter_arrival_times = np.random.exponential(scale=1/target_arrival_rate_sec, size=n)

    # Calculate cumulative sum to get arrival times
    arrival_times = np.cumsum(inter_arrival_times)

    # Round to 6 decimal places for practical purposes (microsecond precision)
    arrival_times = np.round(arrival_times, decimals=6)

    return arrival_times
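As a quick sanity check, the mean inter-arrival time produced by this function should be close to 1/rate. A small sketch, assuming it runs in the same module after the definition above:

```python
import numpy as np

# Sketch: 1000 requests at a target rate of 2 req/s, fixed seed for reproducibility.
times = generate_poisson_arrivals(n=1000, target_arrival_rate_sec=2.0, seed=42)
inter_arrival = np.diff(times)
print(times[:5])              # first few cumulative arrival times, in seconds
print(inter_arrival.mean())   # should be close to 1 / 2.0 = 0.5 s
```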

def download_and_cache_file(url: str, filename: Optional[str] = None):
"""Read and cache a file from a url."""
@@ -90,10 +172,16 @@ def get_warmup_entries(model_name: str, num_warmup_requests: int) -> List[TraceEntry]:
response = "I'm doing well, thank you for asking."
prompt_length = len(tokenizer(prompt)["input_ids"])
response_length = len(tokenizer(response)["input_ids"])
warmup_entries.append(TraceEntry(prompt, response, prompt_length, response_length))
warmup_entries.append(TraceEntry(prompt, response, prompt_length, response_length, 0))
return warmup_entries

def build_trace(model_name: str, num_entries: int, num_warmup_requests: int, seed: int):
def build_trace(model_name: str,
num_entries: int,
max_length: int,
seed: int,
trace_type: str = "offline",
arrival_rate: float = 0.0,
apply_chat_template: bool = False):
# Download sharegpt if necessary
dataset_path = download_and_cache_file(SHAREGPT_URL)

@@ -117,7 +205,7 @@ def build_trace(model_name: str, num_entries: int, num_warmup_requests: int, seed: int):

trace = Trace()
trace_metadata = TraceMetadata(
num_warmup_requests=num_warmup_requests,
num_warmup_requests=0,
avg_entries_per_partition=0,
max_prompt_length=0,
min_prompt_length=float("inf"),
@@ -126,25 +214,36 @@ def build_trace(model_name: str, num_entries: int, num_warmup_requests: int, seed: int):
min_response_length=float("inf"),
avg_response_length=0,
max_total_length=0,
trace_type=trace_type,
arrival_rate=arrival_rate
)

trace.entries += get_warmup_entries(model_name, num_warmup_requests)
arrival_times = num_entries*[0.0]
if trace_type == "poisson":
arrival_times = generate_poisson_arrivals(num_entries, arrival_rate, seed)
elif trace_type == "splitwise":
arrival_times = generate_arrival_rates_splitwise(num_entries, arrival_rate, seed)
assert(len(arrival_times) == num_entries)


for i in tqdm(range(len(dataset))):
if len(trace.entries)-num_warmup_requests == num_entries:
if len(trace.entries) == num_entries:
break

# Tokenize the prompts and completions.
prompt = dataset[i][0]
prompt = tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
if apply_chat_template:
prompt = tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
response = dataset[i][1]
prompt_length = len(tokenizer(prompt)["input_ids"])
response_length = len(tokenizer(response)["input_ids"])
new_entry = TraceEntry(prompt, response, prompt_length, response_length)
if prompt_length + response_length > max_length:
continue
new_entry = TraceEntry(prompt, response, prompt_length, response_length, arrival_times[len(trace.entries)])
trace.entries.append(new_entry)
trace_metadata.max_prompt_length = max(trace_metadata.max_prompt_length, prompt_length)
trace_metadata.min_prompt_length = min(trace_metadata.min_prompt_length, prompt_length)
@@ -156,6 +255,7 @@ def build_trace(model_name: str, num_entries: int, num_warmup_requests: int, seed: int):
trace_metadata.avg_prompt_length /= len(trace.entries)
trace_metadata.avg_response_length /= len(trace.entries)
trace_metadata.avg_entries_per_partition = len(trace.entries)
trace_metadata.arrival_rate = arrival_rate

trace.metadata = trace_metadata

@@ -179,20 +279,31 @@ def save_trace(trace: Trace, output_path: str):
print(f"Trace saved to {output_path}")

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Build WildChat trace")
parser.add_argument("--model_name", type=str, default="meta-llama/Llama-3.1-70B-Instruct", help="Model name")
parser.add_argument("-m", "--max-length", type=int, default=5000, help="Maximum prompt + response length")
parser.add_argument("-n", "--num_entries", type=int, default=250, help="Number of entries")
parser.add_argument("-s", "--seed", type=int, default=12345, help="Random seed")
parser.add_argument("-o", "--output_file", type=str, default="./traces/sharegpt.json", help="Output file name")
parser.add_argument("-t", "--trace-type", type=str, choices=["offline", "poisson", "splitwise"], default="offline", help="Arrival Times Trace Type")
parser.add_argument("-a", "--arrival-rate", type=float, default=0.0, help="Arrival Rate")
args = parser.parse_args()

# Change directory to that holding this script
os.chdir(os.path.dirname(os.path.abspath(__file__)))

num_entries=2048
num_warmup_requests=10
max_length=2048
seed=123456

trace = build_trace("meta-llama/Llama-3.1-70B-Instruct", num_entries, num_warmup_requests, seed)
trace = build_trace(args.model_name,
args.num_entries,
args.max_length,
args.seed,
trace_type=args.trace_type,
arrival_rate=args.arrival_rate,
apply_chat_template=False)
print(trace.metadata)
# Save prompts list to a json file
num_above_2048 = 0
for entry in trace.entries:
if entry.prompt_length + entry.response_length > 2048:
num_above_2048 += 1
print(f"Number of entries above 2048 tokens: {num_above_2048}")
# save_trace(trace, "sharegpt.json")
save_trace(trace, args.output_file)
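With the new CLI in place, the same trace can also be built programmatically. A sketch that mirrors the argparse defaults above; the `poisson` trace type and the 1.0 req/s arrival rate are illustrative choices, not script defaults:

```python
# Sketch: assumes this module's build_trace and save_trace are in scope.
trace = build_trace(
    "meta-llama/Llama-3.1-70B-Instruct",  # model_name
    250,                                   # num_entries
    5000,                                  # max_length
    12345,                                 # seed
    trace_type="poisson",
    arrival_rate=1.0,
    apply_chat_template=False,
)
print(trace.metadata)
save_trace(trace, "./traces/sharegpt_poisson.json")
```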
4 changes: 2 additions & 2 deletions benchmarking/get_wildchat_trace.py
@@ -121,10 +121,10 @@ def save_trace(trace: Trace, output_path: str):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Build WildChat trace")
parser.add_argument("--model_name", type=str, default="meta-llama/Llama-3.1-70B-Instruct", help="Model name")
parser.add_argument("-m", "--max-length", type=int, default=8190, help="Maximum prompt + response length")
parser.add_argument("-m", "--max-length", type=int, default=5000, help="Maximum prompt + response length")
parser.add_argument("-n", "--num_entries", type=int, default=250, help="Number of entries")
parser.add_argument("-s", "--seed", type=int, default=12345, help="Random seed")
parser.add_argument("-o", "--output_file", type=str, default="./wildchat.json", help="Output file name")
parser.add_argument("-o", "--output_file", type=str, default="./traces/wildchat.json", help="Output file name")
args = parser.parse_args()

# Change directory to that holding this script
4 changes: 2 additions & 2 deletions benchmarking/overhead_test.sh
@@ -32,8 +32,8 @@ ZSIZE=200000
# FSIZE=30000
# ZSIZE=20000

OUTPUT_FOLDER="../inference/output/overhead_test"
MAX_SEQ_LEN=8192
OUTPUT_FOLDER="../benchmarking/data/overhead_test"
MAX_SEQ_LEN=5000
BATCH_SIZE=8

max_tokens_per_batch_values=(
17 changes: 10 additions & 7 deletions benchmarking/plot_finetuning_overheads.py
@@ -5,7 +5,7 @@
import os

# Read the CSV file
def plot_fwd_overhead(filepath, num_tokens_per_batch):
def plot_fwd_overhead(filepath, model_name, tp_degree, bz, num_tokens_per_batch, ft_bwd_tokens):
# Load the CSV file
df = pd.read_csv(filepath)

@@ -60,7 +60,7 @@ def plot_fwd_overhead(filepath, num_tokens_per_batch):
y='step_time',
alpha=0.6)

plt.title('Step Time vs Number of Finetuning Forward Tokens\nMax Tokens per Batch: ' + str(num_tokens_per_batch))
plt.title(f'Step Time vs Number of Finetuning Forward Tokens\nModel: {model_name} (TP={tp_degree})\nBatch Size: {bz} - Max Tokens per Batch: {num_tokens_per_batch}\nBWD finetuning tokens: {ft_bwd_tokens}')
plt.xlabel('Number of Finetuning Forward Tokens')
plt.ylabel('Step Time (milliseconds)')

@@ -86,7 +86,7 @@ def plot_fwd_overhead(filepath, num_tokens_per_batch):

# plt.show()

def plot_bwd_overhead(filepath, num_tokens_per_batch):
def plot_bwd_overhead(filepath, model_name, tp_degree, bz, num_tokens_per_batch, ft_bwd_tokens):
# Load the CSV file
df = pd.read_csv(filepath)

@@ -132,7 +132,7 @@ def plot_bwd_overhead(filepath, num_tokens_per_batch):
y='step_time',
alpha=0.6)

plt.title('Step Time vs Number of BWD Finetuning Layers\nMax Tokens per Batch: ' + str(num_tokens_per_batch))
plt.title(f'Step Time vs Number of BWD Finetuning Layers\nModel: {model_name} (TP={tp_degree})\nBatch Size: {bz} - Max Tokens per Batch: {num_tokens_per_batch}\nBWD finetuning tokens: {ft_bwd_tokens}')
plt.xlabel('Number of BWD Finetuning Layers')
plt.ylabel('Step Time (milliseconds)')

@@ -169,10 +169,13 @@ def plot_bwd_overhead(filepath, num_tokens_per_batch):
if not os.path.exists('./plots'):
os.makedirs('./plots')

model_name="meta-llama/Llama-3.1-70B"
tp_degree=4
ft_bwd_tokens=1024
bz=8

for tokens_per_batch in [128, 256, 512]:
fp=f"../inference/output/overhead_test/step_profiling_meta-llama_llama-3.1-70b_tensor_parallelism_{tp_degree}_max_requests_per_batch_8_max_tokens_per_batch_{tokens_per_batch}_arrival_rate_0.000000_num_warmup_requests_10.csv"
fp=f"../benchmarking/data/overhead_test/step_profiling_meta-llama_llama-3.1-70b_tensor_parallelism_{tp_degree}_max_requests_per_batch_8_max_tokens_per_batch_{tokens_per_batch}_arrival_rate_0.000000_num_warmup_requests_10.csv"

plot_fwd_overhead(fp, tokens_per_batch)
plot_bwd_overhead(fp, tokens_per_batch)
plot_fwd_overhead(fp, model_name, tp_degree, bz, tokens_per_batch, ft_bwd_tokens)
plot_bwd_overhead(fp, model_name, tp_degree, bz, tokens_per_batch, ft_bwd_tokens)
Binary file modified benchmarking/plots/bwd_overhead_128.pdf
Binary file not shown.
Binary file modified benchmarking/plots/bwd_overhead_256.pdf
Binary file not shown.
Binary file modified benchmarking/plots/bwd_overhead_512.pdf
Binary file not shown.
Binary file modified benchmarking/plots/fwd_overhead_128.pdf
Binary file not shown.
Binary file modified benchmarking/plots/fwd_overhead_256.pdf
Binary file not shown.
Binary file modified benchmarking/plots/fwd_overhead_512.pdf
Binary file not shown.
59 changes: 49 additions & 10 deletions benchmarking/run_incr_dec.sh
@@ -5,15 +5,54 @@ set -e
# Cd into directory holding this script
cd "${BASH_SOURCE[0]%/*}/../build"

reset
# ../config/config.linux
# make -j
source ./set_python_envs.sh

./inference/incr_decoding/incr_decoding \
-ll:cpu 16 -ll:gpu 8 -ll:util 16 \
-ll:fsize 20000 -ll:zsize 30000 \
--fusion \
-llm-model meta-llama/Llama-3.1-8B-Instruct \
-prompt ../benchmarking/test.json \
-tensor-parallelism-degree 8 \
-log-file ../inference/output/test.out \
-output-file ../inference/output/test.json \
--max-requests-per-batch 1 --max-tokens-per-batch 3000 --max-sequence-length 3000
MODEL_NAME="meta-llama/Llama-3.1-70B"
NGPUS=4
NCPUS=16
FSIZE=76000
ZSIZE=200000

OUTPUT_FOLDER="../benchmarking/data/incr_decoding"
TRACES_FOLDER="../benchmarking/traces"
MAX_SEQ_LEN=5000
BATCH_SIZE=8

trace_files=(
sharegpt
wildchat
)

max_tokens_per_batch_values=(
512
256
128
)

mkdir -p $OUTPUT_FOLDER

for j in "${!max_tokens_per_batch_values[@]}"; do
for i in "${!trace_files[@]}"; do
TRACE_FILE="${TRACES_FOLDER}/${trace_files[$i]}.json"
test -f $TRACE_FILE || { echo "File $TRACE_FILE not found"; exit 1; }
MAX_TOKENS_PER_BATCH=${max_tokens_per_batch_values[$j]}
echo "Running $TRACE_FILE with $MAX_TOKENS_PER_BATCH tokens per batch"
LOG_FILE="${OUTPUT_FOLDER}/incr_dec_${trace_files[$i]}_${MAX_TOKENS_PER_BATCH}_tokens_per_batch.log"
rm $LOG_FILE || true
./inference/incr_decoding/incr_decoding \
-ll:cpu $NCPUS -ll:gpu $NGPUS -ll:util $NCPUS \
-ll:fsize $FSIZE -ll:zsize $ZSIZE \
-llm-model $MODEL_NAME --fusion \
-tensor-parallelism-degree $NGPUS \
-prompt $TRACE_FILE \
-output-folder $OUTPUT_FOLDER \
--max-requests-per-batch $BATCH_SIZE \
--max-tokens-per-batch $MAX_TOKENS_PER_BATCH \
--max-sequence-length $MAX_SEQ_LEN \
2>&1 | tee $LOG_FILE
done
done
