forked from salesforce/BLIP
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
root
committed
Jan 27, 2022
0 parents
commit 97a975f
Showing
54 changed files
with
5,573 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# BLIP |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
{ | ||
"architectures": [ | ||
"BertForMaskedLM" | ||
], | ||
"attention_probs_dropout_prob": 0.1, | ||
"hidden_act": "gelu", | ||
"hidden_dropout_prob": 0.1, | ||
"hidden_size": 768, | ||
"initializer_range": 0.02, | ||
"intermediate_size": 3072, | ||
"layer_norm_eps": 1e-12, | ||
"max_position_embeddings": 512, | ||
"model_type": "bert", | ||
"num_attention_heads": 12, | ||
"num_hidden_layers": 12, | ||
"pad_token_id": 0, | ||
"type_vocab_size": 2, | ||
"vocab_size": 30522, | ||
"encoder_width": 768, | ||
"add_cross_attention": true | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
image_root: '/export/share/datasets/vision/coco/images/' | ||
ann_root: 'annotation' | ||
coco_gt_root: 'annotation/coco_gt' | ||
|
||
# set pretrained to a file path or a URL | ||
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_base_caption.pth' | ||
|
||
# size of vit model; base or large | ||
vit: 'base' | ||
vit_grad_ckpt: False | ||
vit_ckpt_layer: 0 | ||
batch_size: 32 | ||
init_lr: 1e-5 | ||
|
||
# vit: 'large' | ||
# vit_grad_ckpt: True | ||
# vit_ckpt_layer: 5 | ||
# batch_size: 16 | ||
# init_lr: 2e-6 | ||
|
||
image_size: 384 | ||
|
||
# generation configs | ||
max_length: 20 | ||
min_length: 5 | ||
num_beams: 3 | ||
prompt: 'a picture of ' | ||
|
||
# optimizer | ||
weight_decay: 0.05 | ||
min_lr: 0 | ||
max_epoch: 5 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
{ | ||
"architectures": [ | ||
"BertModel" | ||
], | ||
"attention_probs_dropout_prob": 0.1, | ||
"hidden_act": "gelu", | ||
"hidden_dropout_prob": 0.1, | ||
"hidden_size": 768, | ||
"initializer_range": 0.02, | ||
"intermediate_size": 3072, | ||
"layer_norm_eps": 1e-12, | ||
"max_position_embeddings": 512, | ||
"model_type": "bert", | ||
"num_attention_heads": 12, | ||
"num_hidden_layers": 12, | ||
"pad_token_id": 0, | ||
"type_vocab_size": 2, | ||
"vocab_size": 30524, | ||
"encoder_width": 768, | ||
"add_cross_attention": true | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
image_root: '/export/share/datasets/vision/NLVR2/' | ||
ann_root: 'annotation' | ||
|
||
# set pretrained to a file path or a URL | ||
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth' | ||
|
||
# size of vit model; base or large | ||
vit: 'base' | ||
batch_size_train: 16 | ||
batch_size_test: 64 | ||
vit_grad_ckpt: False | ||
vit_ckpt_layer: 0 | ||
max_epoch: 15 | ||
|
||
image_size: 384 | ||
|
||
# optimizer | ||
weight_decay: 0.05 | ||
init_lr: 3e-5 | ||
min_lr: 0 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
image_root: '/export/share/datasets/vision/nocaps/' | ||
ann_root: 'annotation' | ||
|
||
# set pretrained to a file path or a URL | ||
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_base_caption.pth' | ||
|
||
vit: 'base' | ||
batch_size: 32 | ||
|
||
image_size: 384 | ||
|
||
max_length: 20 | ||
min_length: 5 | ||
num_beams: 3 | ||
prompt: 'a picture of ' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
train_file: ['/export/share/junnan-li/VL_pretrain/annotation/coco_karpathy_train.json', | ||
'/export/share/junnan-li/VL_pretrain/annotation/vg_caption.json', | ||
] | ||
laion_path: '' | ||
|
||
# size of vit model; base or large | ||
vit: 'base' | ||
vit_grad_ckpt: False | ||
vit_ckpt_layer: 0 | ||
|
||
image_size: 224 | ||
batch_size: 75 | ||
|
||
queue_size: 57600 | ||
alpha: 0.4 | ||
|
||
# optimizer | ||
weight_decay: 0.05 | ||
init_lr: 3e-4 | ||
min_lr: 1e-6 | ||
warmup_lr: 1e-6 | ||
lr_decay_rate: 0.9 | ||
max_epoch: 20 | ||
warmup_steps: 3000 | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
image_root: '/export/share/datasets/vision/coco/images/' | ||
ann_root: 'annotation' | ||
dataset: 'coco' | ||
|
||
# set pretrained to a file path or a URL | ||
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth' | ||
|
||
# size of vit model; base or large | ||
|
||
vit: 'base' | ||
batch_size_train: 32 | ||
batch_size_test: 64 | ||
vit_grad_ckpt: True | ||
vit_ckpt_layer: 4 | ||
init_lr: 1e-5 | ||
|
||
# vit: 'large' | ||
# batch_size_train: 16 | ||
# batch_size_test: 32 | ||
# vit_grad_ckpt: True | ||
# vit_ckpt_layer: 12 | ||
# init_lr: 5e-6 | ||
|
||
image_size: 384 | ||
queue_size: 57600 | ||
alpha: 0.4 | ||
k_test: 256 | ||
negative_all_rank: True | ||
|
||
# optimizer | ||
weight_decay: 0.05 | ||
min_lr: 0 | ||
max_epoch: 6 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
image_root: '/export/share/datasets/vision/flickr30k/' | ||
ann_root: 'annotation' | ||
dataset: 'flickr' | ||
|
||
# set pretrained to a file path or a URL | ||
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_flickr.pth' | ||
|
||
# size of vit model; base or large | ||
|
||
vit: 'base' | ||
batch_size_train: 32 | ||
batch_size_test: 64 | ||
vit_grad_ckpt: True | ||
vit_ckpt_layer: 4 | ||
init_lr: 1e-5 | ||
|
||
# vit: 'large' | ||
# batch_size_train: 16 | ||
# batch_size_test: 32 | ||
# vit_grad_ckpt: True | ||
# vit_ckpt_layer: 10 | ||
# init_lr: 5e-6 | ||
|
||
image_size: 384 | ||
queue_size: 57600 | ||
alpha: 0.4 | ||
k_test: 128 | ||
negative_all_rank: False | ||
|
||
# optimizer | ||
weight_decay: 0.05 | ||
min_lr: 0 | ||
max_epoch: 6 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
vqa_root: '/export/share/datasets/vision/VQA/Images/mscoco/' #followed by train2014/ | ||
vg_root: '/export/share/datasets/vision/visual-genome/' #followed by image/ | ||
train_files: ['vqa_train','vqa_val','vg_qa'] | ||
ann_root: 'annotation' | ||
|
||
# set pretrained to a file path or a URL | ||
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_vqa.pth' | ||
|
||
# size of vit model; base or large | ||
vit: 'base' | ||
batch_size_train: 16 | ||
batch_size_test: 32 | ||
vit_grad_ckpt: False | ||
vit_ckpt_layer: 0 | ||
init_lr: 2e-5 | ||
|
||
image_size: 480 | ||
|
||
k_test: 128 | ||
inference: 'rank' | ||
|
||
# optimizer | ||
weight_decay: 0.05 | ||
min_lr: 0 | ||
max_epoch: 10 |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
''' | ||
* Copyright (c) 2022, salesforce.com, inc. | ||
* All rights reserved. | ||
* SPDX-License-Identifier: BSD-3-Clause | ||
* For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause | ||
* By Junnan Li | ||
''' | ||
import argparse | ||
import os | ||
import ruamel_yaml as yaml | ||
import numpy as np | ||
import random | ||
import time | ||
import datetime | ||
import json | ||
from pathlib import Path | ||
|
||
import torch | ||
import torch.nn as nn | ||
import torch.nn.functional as F | ||
import torch.backends.cudnn as cudnn | ||
import torch.distributed as dist | ||
from torch.utils.data import DataLoader | ||
|
||
from models.blip import blip_decoder | ||
import utils | ||
from data import create_dataset, create_sampler, create_loader | ||
from data.utils import save_result | ||
|
||
@torch.no_grad()
def evaluate(model, data_loader, device, config):
    """Caption every image produced by ``data_loader`` and collect the results.

    Args:
        model: Captioning model; it is switched to eval mode and must expose
            ``generate(image, sample=..., num_beams=..., max_length=...,
            min_length=..., repetition_penalty=...)``.
        data_loader: Iterable yielding ``(image, image_id)`` batches.
        device: Device the image batch is moved to before generation.
        config: Mapping providing ``'num_beams'``, ``'max_length'`` and
            ``'min_length'``.

    Returns:
        A list of ``{"image_id": ..., "caption": ...}`` dicts, one per
        generated caption, in iteration order.
    """
    model.eval()

    logger = utils.MetricLogger(delimiter=" ")

    predictions = []
    # log_every wraps the loader and reports progress every 10 batches.
    for images, image_ids in logger.log_every(data_loader, 10, 'Evaluation:'):
        images = images.to(device)

        generated = model.generate(
            images,
            sample=False,
            num_beams=config['num_beams'],
            max_length=config['max_length'],
            min_length=config['min_length'],
            repetition_penalty=1.1,
        )

        # .item() turns each id into a plain Python scalar
        # (presumably so the records are JSON-serialisable; confirm in save_result).
        predictions.extend(
            {"image_id": sample_id.item(), "caption": text}
            for text, sample_id in zip(generated, image_ids)
        )

    return predictions
|
||
|
||
def main(args, config):
    """Run captioning inference on the 'nocaps' val and test splits.

    Initialises distributed mode, seeds the RNGs, builds the datasets and
    loaders, creates the BLIP decoder from ``config`` and writes the generated
    captions of each split through ``save_result``.

    Args:
        args: Parsed command-line namespace; reads ``device``, ``seed``,
            ``distributed``, ``gpu`` and ``result_dir``.
        config: Mapping providing ``'batch_size'``, ``'pretrained'``,
            ``'image_size'``, ``'vit'`` and ``'prompt'`` (plus the generation
            keys read by ``evaluate``).
    """
    utils.init_distributed_mode(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility; the rank offset gives every process
    # its own seed.
    seed = args.seed + utils.get_rank()
    for seed_rng in (torch.manual_seed, np.random.seed, random.seed):
        seed_rng(seed)
    cudnn.benchmark = True

    #### Dataset ####
    print("Creating captioning dataset")
    val_dataset, test_dataset = create_dataset('nocaps', config)

    samplers = [None, None]
    if args.distributed:
        num_tasks = utils.get_world_size()
        global_rank = utils.get_rank()
        samplers = create_sampler(
            [val_dataset, test_dataset], [False, False], num_tasks, global_rank
        )

    val_loader, test_loader = create_loader(
        [val_dataset, test_dataset],
        samplers,
        batch_size=[config['batch_size']] * 2,
        num_workers=[4, 4],
        is_trains=[False, False],
        collate_fns=[None, None],
    )

    #### Model ####
    print("Creating model")
    model = blip_decoder(
        pretrained=config['pretrained'],
        image_size=config['image_size'],
        vit=config['vit'],
        prompt=config['prompt'],
    )
    model = model.to(device)

    # Generation below always goes through the unwrapped module, whether or
    # not the model is wrapped in DistributedDataParallel.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    # Evaluate and save 'val' first, then 'test'.
    for split, loader in (('val', val_loader), ('test', test_loader)):
        split_result = evaluate(model_without_ddp, loader, device, config)
        save_result(split_result, args.result_dir, split, remove_duplicate='image_id')
|
||
|
||
def str2bool(value):
    """Convert a command-line string such as 'true'/'False'/'0' to a bool.

    ``argparse`` with ``type=bool`` treats every non-empty string (including
    'False') as True; this converter parses the text instead.

    Args:
        value: The raw argument string, or an actual bool.

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string stays False, as it was with type=bool.
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % (value,))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', default='./configs/nocaps.yaml')
    parser.add_argument('--output_dir', default='output/NoCaps')
    parser.add_argument('--device', default='cuda')
    parser.add_argument('--seed', default=42, type=int)
    parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes')
    parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training')
    # str2bool instead of type=bool so that "--distributed False" is honoured.
    parser.add_argument('--distributed', default=True, type=str2bool)
    args = parser.parse_args()

    # NOTE(review): yaml.Loader can construct arbitrary Python objects; this is
    # only acceptable because the config is a trusted local file.
    with open(args.config, 'r') as config_file:
        config = yaml.load(config_file, Loader=yaml.Loader)

    args.result_dir = os.path.join(args.output_dir, 'result')

    Path(args.output_dir).mkdir(parents=True, exist_ok=True)
    Path(args.result_dir).mkdir(parents=True, exist_ok=True)

    # Keep a copy of the effective config next to the results.
    with open(os.path.join(args.output_dir, 'config.yaml'), 'w') as config_copy:
        yaml.dump(config, config_copy)

    main(args, config)
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Oops, something went wrong.