run_HiRED_sys_report.py
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
import torch
from PIL import Image
import requests
import time
import numpy as np
from helper import *
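# `helper` is the repository's utility module (not shown here). The only helper used
# below is get_random_image; a hypothetical stand-in with the assumed signature and
# behavior (a random RGB PIL image of the given size) is provided as a fallback:
if "get_random_image" not in globals():
    def get_random_image(h: int = 600, w: int = 600) -> Image.Image:
        rng = np.random.default_rng(0)
        return Image.fromarray(rng.integers(0, 256, size=(h, w, 3), dtype=np.uint8), "RGB")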
# Model details
model_id = "llava-hf/llava-v1.6-vicuna-7b-hf"
commit_hash = "0524afe4453163103dcefe78eb0a58b3f6424eac"
# Quantization flag
quantization = False
# Load the model from the specified commit
model = LlavaNextForConditionalGeneration.from_pretrained(
    model_id,
    revision=commit_hash,  # pin to a specific commit hash
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    load_in_4bit=quantization,  # handles quantization if enabled
)
# Load the processor from the specified commit
processor = LlavaNextProcessor.from_pretrained(
    model_id,
    revision=commit_hash,
)
if not quantization:
    # Hot fix: `.to` is not supported for 4-bit/8-bit bitsandbytes models, since they
    # are already placed on the correct devices and cast to the correct dtype.
    model = model.to("cuda:0")
# ================ HiRED config ================
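# (Assumed semantics, per the HiRED paper: token_budget_rate caps the fraction of
# visual tokens retained after early dropping, and alpha controls how that budget is
# split across the image partitions; the patched model reads this dict at generation time.)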
model.config.hired_config = {
    "token_budget_rate": 0.2,  # 1.0 for full execution
    "alpha": 0.5,
}
print(f"Running inference on model: {model_id} for HiRED config: {model.config.hired_config}")
num_runs = 3
throughputs = []
latencies = []
time_to_first_tokens = []
print(f"Running {num_runs} inferences...")
for _ in range(num_runs):
    image = get_random_image(h=600, w=600)
    prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image> \n What is this image about? \n ASSISTANT:"
    # Reset GPU memory statistics
    torch.cuda.reset_peak_memory_stats("cuda:0")
    # Start timing
    start_time = time.time()
    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda:0")
    with torch.inference_mode():
        output = model.generate(
            **inputs,
            do_sample=False,
            max_new_tokens=15,
            use_cache=True,
            return_dict_in_generate=True,
        )
    output_text = processor.decode(output["sequences"][0], skip_special_tokens=False)
    end_time = time.time()
    # prefill_time is recorded on the model by the HiRED-patched generation code
    time_to_first_token = model.prefill_time - start_time
    total_generation_time = end_time - start_time
    # Number of newly generated tokens (total sequence length minus the prompt length)
    num_tokens_generated = len(output["sequences"][0]) - len(inputs["input_ids"][0])
    throughput = num_tokens_generated / total_generation_time  # tokens per second
    throughputs.append(throughput)
    latencies.append(total_generation_time)
    time_to_first_tokens.append(time_to_first_token)
avg_gen_throughput = np.mean(throughputs)
avg_latency = np.mean(latencies)
avg_time_to_first_token = np.mean(time_to_first_tokens)
# Note: peak-memory stats are reset at the start of each iteration, so this reflects the last run.
peak_memory_used = torch.cuda.max_memory_allocated("cuda:0") / (1024 * 1024 * 1024)  # convert bytes to GB
# Print results
print(f"Throughput: {avg_gen_throughput} tokens/s")
print(f"Latency: {avg_latency} s")
print(f"Time to first token: {avg_time_to_first_token} s")
print(f"Peak GPU memory used: {peak_memory_used} GB")