import os
os.environ["HF_HOME"] = "/dataset/huggingface"

import math
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform


def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio


def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images


def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values


def split_model(model_name):
    device_map = {}
    world_size = torch.cuda.device_count()
    num_layers = {
        'InternVL2_5-1B': 24, 'InternVL2_5-2B': 24, 'InternVL2_5-4B': 36, 'InternVL2_5-8B': 32,
        'InternVL2_5-26B': 48, 'InternVL2_5-38B': 64, 'InternVL2_5-78B': 80}[model_name]
    # Since the first GPU will be used for ViT, treat it as half a GPU.
    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
    num_layers_per_gpu = [num_layers_per_gpu] * world_size
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
    layer_cnt = 0
    for i, num_layer in enumerate(num_layers_per_gpu):
        for j in range(num_layer):
            device_map[f'language_model.model.layers.{layer_cnt}'] = i
            layer_cnt += 1
    device_map['vision_model'] = 0
    device_map['mlp1'] = 0
    device_map['language_model.model.tok_embeddings'] = 0
    device_map['language_model.model.embed_tokens'] = 0
    device_map['language_model.output'] = 0
    device_map['language_model.model.norm'] = 0
    device_map['language_model.lm_head'] = 0
    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
    return device_map


path = "OpenGVLab/InternVL2_5-78B"
device_map = split_model('InternVL2_5-78B')
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
    device_map=device_map).eval()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

generation_config = dict(max_new_tokens=1024, do_sample=True)

# single-image single-round conversation
question = '<image>\nPlease describe the image shortly.'
# set the max number of tiles in `max_num`
pixel_values = load_image('sample.jpg', max_num=12).to(torch.bfloat16).cuda()
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')
'''This image was misclassified by the fire and smoke classifier. Please provide a detailed explanation of the reasons for this misclassification, considering the visual features, the object's position within the image, the overall context, and what the object actually represents.'''
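A minimal sanity check on the split, added here as a sketch rather than as part of the original demo: assuming the script above has run, model.hf_device_map (populated whenever from_pretrained is given a device_map) can be compared against the map split_model requested, to see how many modules ended up on each GPU (the first GPU gets roughly half as many decoder layers because it also hosts the ViT).

# Sketch (not part of the original demo): compare the requested layout with the
# placement that was actually applied.
from collections import Counter

requested = split_model('InternVL2_5-78B')
print('requested per device:', dict(Counter(requested.values())))
print('applied per device  :', dict(Counter(str(d) for d in model.hf_device_map.values())))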
Error
Traceback (most recent call last):
File "/workspace/internvl.py", line 132, in <module>
response = model.chat(tokenizer, pixel_values, question, generation_config)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dataset/huggingface/modules/transformers_modules/OpenGVLab/InternVL2_5-78B/ea891f50e952a1bdf9dd44df66a932bc5a4f40ec/modeling_internvl_chat.py", line 290, in chat
generation_output = self.generate(
^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/dataset/huggingface/modules/transformers_modules/OpenGVLab/InternVL2_5-78B/ea891f50e952a1bdf9dd44df66a932bc5a4f40ec/modeling_internvl_chat.py", line 339, in generate
outputs = self.language_model.generate(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/transformers/generation/utils.py", line 2252, in generate
result = self._sample(
^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/transformers/generation/utils.py", line 3251, in _sample
outputs = self(**model_inputs, return_dict=True)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 1169, in forward
outputs = self.model(
^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 875, in forward
position_embeddings = self.rotary_emb(hidden_states, position_ids)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 167, in forward
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)
I printed the devices of the tensors involved and got these results.
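For a cross-device error like the one above, a small helper along the following lines (a sketch added for illustration, not taken from the original report) shows which part of the language model kept tensors on CPU. The failing line is inv_freq_expanded.float() @ position_ids_expanded.float() inside Qwen2RotaryEmbedding.forward, so the rotary embedding's inv_freq buffer and the position ids are the first things to check.

# Sketch (illustrative): for each direct child of the Qwen2 decoder, print the
# set of devices its parameters and buffers live on, to spot anything on CPU.
def summarize_devices(module):
    for name, child in module.named_children():
        devices = {str(t.device) for t in child.parameters()}
        devices |= {str(t.device) for t in child.buffers()}
        print(name, sorted(devices) if devices else '(no tensors)')

# The traceback points at the rotary embedding inside the language model, so
# inspect model.language_model.model (the Qwen2 decoder), including rotary_emb.
summarize_devices(model.language_model.model)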
System Info
transformers version: 4.47.1

Who can help?
I faced this bug while running the VLM InternVL2.5-78B.
text models: @ArthurZucker
vision models: @amyeroberts, @qubvel
Reproduction
Code sample
I tried to run the demo code of InternVL2_5-78B (the script above), which produced the error shown above.
Expected behavior
No errors; the image caption should be printed.
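Not a confirmed fix, but one experiment suggested by the trace: split_model assigns the individual decoder layers plus a fixed list of named modules, so a module missing from that list, such as language_model.model.rotary_emb (which in this transformers version sits directly under the Qwen2 model rather than inside each layer), can be left on CPU. Pinning it to GPU 0 before loading is cheap to try; the module name is inferred from the traceback, not verified against the checkpoint.

# Experimental sketch (unverified workaround), reusing path and split_model from
# the script above: explicitly place the model-level rotary embedding on GPU 0
# so its inv_freq buffer is dispatched along with the other shared modules.
device_map = split_model('InternVL2_5-78B')
device_map['language_model.model.rotary_emb'] = 0
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
    device_map=device_map).eval()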