
Commit

update
goliaro committed Dec 8, 2024
1 parent 2d7910b commit ab0b209
Showing 26 changed files with 360,070 additions and 329,380 deletions.
220,451 changes: 113,680 additions & 106,771 deletions ...sts_per_batch_8_max_tokens_per_batch_128_arrival_rate_0.000000_num_warmup_requests_10.csv

Large diffs are not rendered by default.

193,416 changes: 105,280 additions & 88,136 deletions ...sts_per_batch_8_max_tokens_per_batch_256_arrival_rate_0.000000_num_warmup_requests_10.csv

Large diffs are not rendered by default.

181,603 changes: 101,040 additions & 80,563 deletions ...sts_per_batch_8_max_tokens_per_batch_512_arrival_rate_0.000000_num_warmup_requests_10.csv

Large diffs are not rendered by default.


149 changes: 130 additions & 19 deletions benchmarking/get_sharegpt_trace.py
@@ -3,10 +3,15 @@
import os
import random
import requests
import argparse
from tqdm.asyncio import tqdm
from typing import List, Optional
from collections import OrderedDict
from transformers import AutoTokenizer
import pandas as pd
from math import ceil
from random import uniform
import numpy as np

SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"

@@ -16,6 +21,7 @@ class TraceEntry:
response: str
prompt_length: int
response_length: int
arrival_time: float

@dataclass
class TraceMetadata:
@@ -28,11 +34,87 @@ class TraceMetadata:
min_response_length: int
avg_response_length: float
max_total_length: int
trace_type: str
arrival_rate: float

@dataclass
class Trace:
entries: List[TraceEntry] = field(default_factory=list)
metadata: TraceMetadata = field(default_factory=lambda: TraceMetadata(0, 0, 0, 0, 0, 0, 0, 0,0))
metadata: TraceMetadata = field(default_factory=lambda: TraceMetadata(0, 0, 0, 0, 0, 0, 0, 0, 0, "offline", 0.0))
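A `TraceEntry` now carries each request's arrival time as an extra field. For illustration only, a tiny construction with made-up values, following the positional order used by the `TraceEntry(...)` calls later in `build_trace`:

```python
# Hypothetical entry: the token counts and arrival time are made-up values,
# and the positional order mirrors the TraceEntry(...) calls in build_trace.
entry = TraceEntry(
    "What is the capital of France?",   # prompt
    "The capital of France is Paris.",  # response
    8,                                   # prompt_length (tokens)
    9,                                   # response_length (tokens)
    0.5,                                 # arrival_time (seconds since trace start)
)
```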

def generate_arrival_rates_splitwise(n, target_arrival_rate_sec, seed):
    def get_splitwise_trace(trace_type="conv"):
        # Import Microsoft LLM 1 hour trace
        df_trace = pd.read_csv("https://raw.githubusercontent.com/Azure/AzurePublicDataset/master/data/AzureLLMInferenceTrace_"+trace_type+".csv", parse_dates=["TIMESTAMP"])
        req_times = (pd.to_datetime(df_trace["TIMESTAMP"]).astype(int)//1000) # Timestamps are in microseconds
        req_times = req_times - req_times.min()
        req_times = req_times.tolist()
        return req_times

    debug_verbose = True
    req_times = get_splitwise_trace()

    np.random.seed(seed)
    random.seed(seed)

    microsec = 1000000
    # Requests per second. Computed this way to keep the numbers at a reasonable order of magnitude.
    avg_arrival_rate = len(req_times) / (req_times[-1]/float(microsec))
    if debug_verbose:
        print("Avg arrival rate of original trace (req/s): ", avg_arrival_rate)
    scale_factor = float(target_arrival_rate_sec) / avg_arrival_rate
    if debug_verbose:
        print("Scale factor to obtain target arrival rate: ", scale_factor)

    # Buckets are 1-second timeframes
    nb_buckets = ceil(req_times[-1] / microsec)
    j = 0
    # print("Number of buckets: ", nb_buckets)
    bucket_sizes = []
    for i in range(nb_buckets):
        bucket_size = 0
        while j < len(req_times) and req_times[j] >= i*microsec and req_times[j] < (i+1)*microsec:
            bucket_size += 1
            j += 1
        bucket_size = bucket_size*scale_factor
        prob = bucket_size - int(bucket_size)
        bucket_size = int(bucket_size) + int(uniform(0, 1) <= prob)
        bucket_sizes.append(bucket_size)

    arrival_times = []
    for arrival_time, num_requests in enumerate(bucket_sizes):
        for i in range(num_requests):
            arrival_times.append(arrival_time)
    if len(arrival_times) > n:
        arrival_times = arrival_times[:n]
    elif len(arrival_times) < n:
        raise RuntimeError(f"Error: not enough buckets ({nb_buckets}) in scaled trace to generate arrival times for all requests ({n})")
    return arrival_times
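The fractional bucket sizes above are rounded stochastically so that the expected number of requests per bucket matches the scaled value. A minimal standalone sketch with made-up numbers:

```python
from random import uniform

# Made-up example: a 1-second bucket held 3 requests in the original trace,
# and the scale factor is 0.6, so the scaled size is about 1.8 requests.
scaled = 3 * 0.6                 # ~1.8
prob = scaled - int(scaled)      # ~0.8 -> probability of rounding up
bucket_size = int(scaled) + int(uniform(0, 1) <= prob)
print(bucket_size)               # 1 or 2; expected value is 1*0.2 + 2*0.8 = 1.8
```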

def generate_poisson_arrivals(n, target_arrival_rate_sec, seed):
    """
    Generate arrival times for n requests following a Poisson process.

    Parameters:
        n (int): Number of requests to generate
        target_arrival_rate_sec (float): Average arrival rate (requests per second)
        seed (int): Random seed for reproducibility

    Returns:
        numpy.ndarray: Array of arrival times in seconds
    """
    np.random.seed(seed)
    random.seed(seed)

    # Generate n exponentially distributed inter-arrival times:
    # for a Poisson process, inter-arrival times follow an exponential distribution
    inter_arrival_times = np.random.exponential(scale=1/target_arrival_rate_sec, size=n)

    # Calculate cumulative sum to get arrival times
    arrival_times = np.cumsum(inter_arrival_times)

    # Round to 6 decimal places for practical purposes (microsecond precision)
    arrival_times = np.round(arrival_times, decimals=6)

    return arrival_times
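As a quick sanity check, the mean inter-arrival time produced by this function should be close to 1/rate. A small sketch, assuming it runs in the same module after the definition above:

```python
import numpy as np

# Sketch: 1000 requests at a target rate of 2 req/s, fixed seed for reproducibility.
times = generate_poisson_arrivals(n=1000, target_arrival_rate_sec=2.0, seed=42)
inter_arrival = np.diff(times)
print(times[:5])              # first few cumulative arrival times, in seconds
print(inter_arrival.mean())   # should be close to 1 / 2.0 = 0.5 s
```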

def download_and_cache_file(url: str, filename: Optional[str] = None):
"""Read and cache a file from a url."""
@@ -90,10 +172,16 @@ def get_warmup_entries(model_name: str, num_warmup_requests: int) -> List[TraceEntry]:
response = "I'm doing well, thank you for asking."
prompt_length = len(tokenizer(prompt)["input_ids"])
response_length = len(tokenizer(response)["input_ids"])
warmup_entries.append(TraceEntry(prompt, response, prompt_length, response_length))
warmup_entries.append(TraceEntry(prompt, response, prompt_length, response_length, 0))
return warmup_entries

def build_trace(model_name: str, num_entries: int, num_warmup_requests: int, seed: int):
def build_trace(model_name: str,
num_entries: int,
max_length: int,
seed: int,
trace_type: str = "offline",
arrival_rate: float = 0.0,
apply_chat_template: bool = False):
# Download sharegpt if necessary
dataset_path = download_and_cache_file(SHAREGPT_URL)

@@ -117,7 +205,7 @@ def build_trace(model_name: str, num_entries: int, num_warmup_requests: int, seed: int):

trace = Trace()
trace_metadata = TraceMetadata(
num_warmup_requests=num_warmup_requests,
num_warmup_requests=0,
avg_entries_per_partition=0,
max_prompt_length=0,
min_prompt_length=float("inf"),
@@ -126,25 +214,36 @@ def build_trace(model_name: str, num_entries: int, num_warmup_requests: int, seed: int):
min_response_length=float("inf"),
avg_response_length=0,
max_total_length=0,
trace_type=trace_type,
arrival_rate=arrival_rate
)

trace.entries += get_warmup_entries(model_name, num_warmup_requests)
arrival_times = num_entries*[0.0]
if trace_type == "poisson":
arrival_times = generate_poisson_arrivals(num_entries, arrival_rate, seed)
elif trace_type == "splitwise":
arrival_times = generate_arrival_rates_splitwise(num_entries, arrival_rate, seed)
assert(len(arrival_times) == num_entries)


for i in tqdm(range(len(dataset))):
if len(trace.entries)-num_warmup_requests == num_entries:
if len(trace.entries) == num_entries:
break

# Tokenize the prompts and completions.
prompt = dataset[i][0]
prompt = tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
if apply_chat_template:
prompt = tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
response = dataset[i][1]
prompt_length = len(tokenizer(prompt)["input_ids"])
response_length = len(tokenizer(response)["input_ids"])
new_entry = TraceEntry(prompt, response, prompt_length, response_length)
if prompt_length + response_length > max_length:
continue
new_entry = TraceEntry(prompt, response, prompt_length, response_length, arrival_times[len(trace.entries)])
trace.entries.append(new_entry)
trace_metadata.max_prompt_length = max(trace_metadata.max_prompt_length, prompt_length)
trace_metadata.min_prompt_length = min(trace_metadata.min_prompt_length, prompt_length)
@@ -156,6 +255,7 @@ def build_trace(model_name: str, num_entries: int, num_warmup_requests: int, seed: int):
trace_metadata.avg_prompt_length /= len(trace.entries)
trace_metadata.avg_response_length /= len(trace.entries)
trace_metadata.avg_entries_per_partition = len(trace.entries)
trace_metadata.arrival_rate = arrival_rate

trace.metadata = trace_metadata

@@ -179,20 +279,31 @@ def save_trace(trace: Trace, output_path: str):
print(f"Trace saved to {output_path}")

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Build WildChat trace")
parser.add_argument("--model_name", type=str, default="meta-llama/Llama-3.1-70B-Instruct", help="Model name")
parser.add_argument("-m", "--max-length", type=int, default=5000, help="Maximum prompt + response length")
parser.add_argument("-n", "--num_entries", type=int, default=250, help="Number of entries")
parser.add_argument("-s", "--seed", type=int, default=12345, help="Random seed")
parser.add_argument("-o", "--output_file", type=str, default="./traces/sharegpt.json", help="Output file name")
parser.add_argument("-t", "--trace-type", type=str, choices=["offline", "poisson", "splitwise"], default="offline", help="Arrival Times Trace Type")
parser.add_argument("-a", "--arrival-rate", type=float, default=0.0, help="Arrival Rate")
args = parser.parse_args()

# Change directory to that holding this script
os.chdir(os.path.dirname(os.path.abspath(__file__)))

num_entries=2048
num_warmup_requests=10
max_length=2048
seed=123456

trace = build_trace("meta-llama/Llama-3.1-70B-Instruct", num_entries, num_warmup_requests, seed)
trace = build_trace(args.model_name,
args.num_entries,
args.max_length,
args.seed,
trace_type=args.trace_type,
arrival_rate=args.arrival_rate,
apply_chat_template=False)
print(trace.metadata)
# Save prompts list to a json file
num_above_2048 = 0
for entry in trace.entries:
if entry.prompt_length + entry.response_length > 2048:
num_above_2048 += 1
print(f"Number of entries above 2048 tokens: {num_above_2048}")
# save_trace(trace, "sharegpt.json")
save_trace(trace, args.output_file)
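With the new CLI in place, the same trace can also be built programmatically. A sketch that mirrors the argparse defaults above; the `poisson` trace type and the 1.0 req/s arrival rate are illustrative choices, not script defaults:

```python
# Sketch: assumes this module's build_trace and save_trace are in scope.
trace = build_trace(
    "meta-llama/Llama-3.1-70B-Instruct",  # model_name
    250,                                   # num_entries
    5000,                                  # max_length
    12345,                                 # seed
    trace_type="poisson",
    arrival_rate=1.0,
    apply_chat_template=False,
)
print(trace.metadata)
save_trace(trace, "./traces/sharegpt_poisson.json")
```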
4 changes: 2 additions & 2 deletions benchmarking/get_wildchat_trace.py
@@ -121,10 +121,10 @@ def save_trace(trace: Trace, output_path: str):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Build WildChat trace")
parser.add_argument("--model_name", type=str, default="meta-llama/Llama-3.1-70B-Instruct", help="Model name")
parser.add_argument("-m", "--max-length", type=int, default=8190, help="Maximum prompt + response length")
parser.add_argument("-m", "--max-length", type=int, default=5000, help="Maximum prompt + response length")
parser.add_argument("-n", "--num_entries", type=int, default=250, help="Number of entries")
parser.add_argument("-s", "--seed", type=int, default=12345, help="Random seed")
parser.add_argument("-o", "--output_file", type=str, default="./wildchat.json", help="Output file name")
parser.add_argument("-o", "--output_file", type=str, default="./traces/wildchat.json", help="Output file name")
args = parser.parse_args()

# Change directory to that holding this script
4 changes: 2 additions & 2 deletions benchmarking/overhead_test.sh
@@ -32,8 +32,8 @@ ZSIZE=200000
# FSIZE=30000
# ZSIZE=20000

OUTPUT_FOLDER="../inference/output/overhead_test"
MAX_SEQ_LEN=8192
OUTPUT_FOLDER="../benchmarking/data/overhead_test"
MAX_SEQ_LEN=5000
BATCH_SIZE=8

max_tokens_per_batch_values=(
17 changes: 10 additions & 7 deletions benchmarking/plot_finetuning_overheads.py
@@ -5,7 +5,7 @@
import os

# Read the CSV file
def plot_fwd_overhead(filepath, num_tokens_per_batch):
def plot_fwd_overhead(filepath, model_name, tp_degree, bz, num_tokens_per_batch, ft_bwd_tokens):
# Load the CSV file
df = pd.read_csv(filepath)

@@ -60,7 +60,7 @@ def plot_fwd_overhead(filepath, num_tokens_per_batch):
y='step_time',
alpha=0.6)

plt.title('Step Time vs Number of Finetuning Forward Tokens\nMax Tokens per Batch: ' + str(num_tokens_per_batch))
plt.title(f'Step Time vs Number of Finetuning Forward Tokens\nModel: {model_name} (TP={tp_degree})\nBatch Size: {bz} - Max Tokens per Batch: {num_tokens_per_batch}\nBWD finetuning tokens: {ft_bwd_tokens}')
plt.xlabel('Number of Finetuning Forward Tokens')
plt.ylabel('Step Time (milliseconds)')

@@ -86,7 +86,7 @@ def plot_fwd_overhead(filepath, num_tokens_per_batch):

# plt.show()

def plot_bwd_overhead(filepath, num_tokens_per_batch):
def plot_bwd_overhead(filepath, model_name, tp_degree, bz, num_tokens_per_batch, ft_bwd_tokens):
# Load the CSV file
df = pd.read_csv(filepath)

@@ -132,7 +132,7 @@ def plot_bwd_overhead(filepath, num_tokens_per_batch):
y='step_time',
alpha=0.6)

plt.title('Step Time vs Number of BWD Finetuning Layers\nMax Tokens per Batch: ' + str(num_tokens_per_batch))
plt.title(f'Step Time vs Number of BWD Finetuning Layers\nModel: {model_name} (TP={tp_degree})\nBatch Size: {bz} - Max Tokens per Batch: {num_tokens_per_batch}\nBWD finetuning tokens: {ft_bwd_tokens}')
plt.xlabel('Number of BWD Finetuning Layers')
plt.ylabel('Step Time (milliseconds)')

@@ -169,10 +169,13 @@ def plot_bwd_overhead(filepath, num_tokens_per_batch):
if not os.path.exists('./plots'):
os.makedirs('./plots')

model_name="meta-llama/Llama-3.1-70B"
tp_degree=4
ft_bwd_tokens=1024
bz=8

for tokens_per_batch in [128, 256, 512]:
fp=f"../inference/output/overhead_test/step_profiling_meta-llama_llama-3.1-70b_tensor_parallelism_{tp_degree}_max_requests_per_batch_8_max_tokens_per_batch_{tokens_per_batch}_arrival_rate_0.000000_num_warmup_requests_10.csv"
fp=f"../benchmarking/data/overhead_test/step_profiling_meta-llama_llama-3.1-70b_tensor_parallelism_{tp_degree}_max_requests_per_batch_8_max_tokens_per_batch_{tokens_per_batch}_arrival_rate_0.000000_num_warmup_requests_10.csv"

plot_fwd_overhead(fp, tokens_per_batch)
plot_bwd_overhead(fp, tokens_per_batch)
plot_fwd_overhead(fp, model_name, tp_degree, bz, tokens_per_batch, ft_bwd_tokens)
plot_bwd_overhead(fp, model_name, tp_degree, bz, tokens_per_batch, ft_bwd_tokens)
Binary file modified benchmarking/plots/bwd_overhead_128.pdf
Binary file not shown.
Binary file modified benchmarking/plots/bwd_overhead_256.pdf
Binary file not shown.
Binary file modified benchmarking/plots/bwd_overhead_512.pdf
Binary file not shown.
Binary file modified benchmarking/plots/fwd_overhead_128.pdf
Binary file not shown.
Binary file modified benchmarking/plots/fwd_overhead_256.pdf
Binary file not shown.
Binary file modified benchmarking/plots/fwd_overhead_512.pdf
Binary file not shown.
59 changes: 49 additions & 10 deletions benchmarking/run_incr_dec.sh
@@ -5,15 +5,54 @@ set -e
# Cd into directory holding this script
cd "${BASH_SOURCE[0]%/*}/../build"

reset
# ../config/config.linux
# make -j
source ./set_python_envs.sh

./inference/incr_decoding/incr_decoding \
-ll:cpu 16 -ll:gpu 8 -ll:util 16 \
-ll:fsize 20000 -ll:zsize 30000 \
--fusion \
-llm-model meta-llama/Llama-3.1-8B-Instruct \
-prompt ../benchmarking/test.json \
-tensor-parallelism-degree 8 \
-log-file ../inference/output/test.out \
-output-file ../inference/output/test.json \
--max-requests-per-batch 1 --max-tokens-per-batch 3000 --max-sequence-length 3000
MODEL_NAME="meta-llama/Llama-3.1-70B"
NGPUS=4
NCPUS=16
FSIZE=76000
ZSIZE=200000

OUTPUT_FOLDER="../benchmarking/data/incr_decoding"
TRACES_FOLDER="../benchmarking/traces"
MAX_SEQ_LEN=5000
BATCH_SIZE=8

trace_files=(
sharegpt
wildchat
)

max_tokens_per_batch_values=(
512
256
128
)

mkdir -p $OUTPUT_FOLDER

for j in "${!max_tokens_per_batch_values[@]}"; do
for i in "${!trace_files[@]}"; do
TRACE_FILE="${TRACES_FOLDER}/${trace_files[$i]}.json"
test -f $TRACE_FILE || { echo "File $TRACE_FILE not found"; exit 1; }
MAX_TOKENS_PER_BATCH=${max_tokens_per_batch_values[$j]}
echo "Running $TRACE_FILE with $MAX_TOKENS_PER_BATCH tokens per batch"
LOG_FILE="${OUTPUT_FOLDER}/incr_dec_${trace_files[$i]}_${MAX_TOKENS_PER_BATCH}_tokens_per_batch.log"
rm $LOG_FILE || true
./inference/incr_decoding/incr_decoding \
-ll:cpu $NCPUS -ll:gpu $NGPUS -ll:util $NCPUS \
-ll:fsize $FSIZE -ll:zsize $ZSIZE \
-llm-model $MODEL_NAME --fusion \
-tensor-parallelism-degree $NGPUS \
-prompt $TRACE_FILE \
-output-folder $OUTPUT_FOLDER \
--max-requests-per-batch $BATCH_SIZE \
--max-tokens-per-batch $MAX_TOKENS_PER_BATCH \
--max-sequence-length $MAX_SEQ_LEN \
2>&1 | tee $LOG_FILE
done
done
