
Commit 97a975f: init
root committed Jan 27, 2022 · 0 parents
Showing 54 changed files with 5,573 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -0,0 +1 @@
# BLIP
21 changes: 21 additions & 0 deletions configs/bert_config.json
@@ -0,0 +1,21 @@
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 30522,
"encoder_width": 768,
"add_cross_attention": true
}
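
These JSON files follow the Hugging Face BertConfig schema, extended with repo-specific keys such as encoder_width that the repo's own BERT variant presumably reads. A minimal sketch of loading one, assuming the transformers package is available; unrecognized keys simply become attributes on the config object:

from transformers import BertConfig

config = BertConfig.from_json_file("configs/bert_config.json")
# Standard fields are regular attributes; the repo-specific encoder_width is
# also kept on the object, so a customized BertModel can read it later.
print(config.hidden_size, config.encoder_width, config.add_cross_attention)
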
33 changes: 33 additions & 0 deletions configs/caption_coco.yaml
@@ -0,0 +1,33 @@
image_root: '/export/share/datasets/vision/coco/images/'
ann_root: 'annotation'
coco_gt_root: 'annotation/coco_gt'

# set pretrained as a file path or an url
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_base_caption.pth'

# size of vit model; base or large
vit: 'base'
vit_grad_ckpt: False
vit_ckpt_layer: 0
batch_size: 32
init_lr: 1e-5

# vit: 'large'
# vit_grad_ckpt: True
# vit_ckpt_layer: 5
# batch_size: 16
# init_lr: 2e-6

image_size: 384

# generation configs
max_length: 20
min_length: 5
num_beams: 3
prompt: 'a picture of '

# optimizer
weight_decay: 0.05
min_lr: 0
max_epoch: 5

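The generation keys above map directly onto the decoder's beam-search call used elsewhere in this commit; a minimal sketch, assuming model and image are already built the way eval_nocaps.py below builds them:

import ruamel_yaml as yaml

config = yaml.load(open("configs/caption_coco.yaml", "r"), Loader=yaml.Loader)
# sample=False selects beam search; the prompt string is handed to the model
# at construction time (see blip_decoder in eval_nocaps.py) rather than here.
captions = model.generate(image, sample=False,
                          num_beams=config['num_beams'],
                          max_length=config['max_length'],
                          min_length=config['min_length'])
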
21 changes: 21 additions & 0 deletions configs/med_config.json
@@ -0,0 +1,21 @@
{
"architectures": [
"BertModel"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 30524,
"encoder_width": 768,
"add_cross_attention": true
}
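
Note the vocab_size of 30524 here versus 30522 in bert_config.json: two extra vocabulary slots, presumably for added special tokens. A hedged sketch of how such tokens are typically added with the transformers tokenizer; the token names below are an assumption, not taken from this commit:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")   # 30522 tokens
# Two assumed extra special tokens would account for the 30524 above.
tokenizer.add_special_tokens({"additional_special_tokens": ["[DEC]", "[ENC]"]})
print(len(tokenizer))  # 30524
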
21 changes: 21 additions & 0 deletions configs/nlvr.yaml
@@ -0,0 +1,21 @@
image_root: '/export/share/datasets/vision/NLVR2/'
ann_root: 'annotation'

# set pretrained as a file path or an url
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth'

# size of vit model; base or large
vit: 'base'
batch_size_train: 16
batch_size_test: 64
vit_grad_ckpt: False
vit_ckpt_layer: 0
max_epoch: 15

image_size: 384

# optimizer
weight_decay: 0.05
init_lr: 3e-5
min_lr: 0

15 changes: 15 additions & 0 deletions configs/nocaps.yaml
@@ -0,0 +1,15 @@
image_root: '/export/share/datasets/vision/nocaps/'
ann_root: 'annotation'

# set pretrained as a file path or an url
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_base_caption.pth'

vit: 'base'
batch_size: 32

image_size: 384

max_length: 20
min_length: 5
num_beams: 3
prompt: 'a picture of '
27 changes: 27 additions & 0 deletions configs/pretrain.yaml
@@ -0,0 +1,27 @@
train_file: ['/export/share/junnan-li/VL_pretrain/annotation/coco_karpathy_train.json',
'/export/share/junnan-li/VL_pretrain/annotation/vg_caption.json',
]
laion_path: ''

# size of vit model; base or large
vit: 'base'
vit_grad_ckpt: False
vit_ckpt_layer: 0

image_size: 224
batch_size: 75

queue_size: 57600
alpha: 0.4

# optimizer
weight_decay: 0.05
init_lr: 3e-4
min_lr: 1e-6
warmup_lr: 1e-6
lr_decay_rate: 0.9
max_epoch: 20
warmup_steps: 3000



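The optimizer block reads as a warmup followed by per-epoch decay: the learning rate presumably ramps from warmup_lr to init_lr over warmup_steps, then shrinks by lr_decay_rate each epoch with min_lr as a floor. A rough self-contained sketch of that schedule (the repo's actual schedule utilities are not part of this excerpt):

def warmup_schedule(step, warmup_steps=3000, warmup_lr=1e-6, init_lr=3e-4):
    # linear ramp from warmup_lr to init_lr over the first warmup_steps steps
    return warmup_lr + (init_lr - warmup_lr) * min(step, warmup_steps) / warmup_steps

def epoch_schedule(epoch, init_lr=3e-4, lr_decay_rate=0.9, min_lr=1e-6):
    # multiplicative decay per epoch, clamped from below at min_lr
    return max(init_lr * lr_decay_rate ** epoch, min_lr)
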
34 changes: 34 additions & 0 deletions configs/retrieval_coco.yaml
@@ -0,0 +1,34 @@
image_root: '/export/share/datasets/vision/coco/images/'
ann_root: 'annotation'
dataset: 'coco'

# set pretrained as a file path or an url
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth'

# size of vit model; base or large

vit: 'base'
batch_size_train: 32
batch_size_test: 64
vit_grad_ckpt: True
vit_ckpt_layer: 4
init_lr: 1e-5

# vit: 'large'
# batch_size_train: 16
# batch_size_test: 32
# vit_grad_ckpt: True
# vit_ckpt_layer: 12
# init_lr: 5e-6

image_size: 384
queue_size: 57600
alpha: 0.4
k_test: 256
negative_all_rank: True

# optimizer
weight_decay: 0.05
min_lr: 0
max_epoch: 6

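k_test is presumably the size of the candidate pool in a two-stage retrieval evaluation: a cheap feature-similarity search keeps the top k_test candidates per query, and only those are re-scored by the heavier cross-attention matching head. A toy, self-contained illustration of the candidate-selection step (feature sizes are made up; the real evaluation code is not in this excerpt):

import torch

image_feats = torch.randn(100, 256)            # stand-in image features
text_feats = torch.randn(500, 256)             # stand-in text features
sims = image_feats @ text_feats.t()            # coarse similarity matrix
topk_sim, topk_idx = sims[0].topk(k=256)       # keep k_test candidates for image 0
# In the full pipeline these k_test candidates would then be re-ranked by the
# image-text matching head before computing recall metrics.
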
34 changes: 34 additions & 0 deletions configs/retrieval_flickr.yaml
@@ -0,0 +1,34 @@
image_root: '/export/share/datasets/vision/flickr30k/'
ann_root: 'annotation'
dataset: 'flickr'

# set pretrained as a file path or an url
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_flickr.pth'

# size of vit model; base or large

vit: 'base'
batch_size_train: 32
batch_size_test: 64
vit_grad_ckpt: True
vit_ckpt_layer: 4
init_lr: 1e-5

# vit: 'large'
# batch_size_train: 16
# batch_size_test: 32
# vit_grad_ckpt: True
# vit_ckpt_layer: 10
# init_lr: 5e-6

image_size: 384
queue_size: 57600
alpha: 0.4
k_test: 128
negative_all_rank: False

# optimizer
weight_decay: 0.05
min_lr: 0
max_epoch: 6

25 changes: 25 additions & 0 deletions configs/vqa.yaml
@@ -0,0 +1,25 @@
vqa_root: '/export/share/datasets/vision/VQA/Images/mscoco/' # followed by train2014/
vg_root: '/export/share/datasets/vision/visual-genome/' # followed by image/
train_files: ['vqa_train','vqa_val','vg_qa']
ann_root: 'annotation'

# set pretrained as a file path or an url
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_vqa.pth'

# size of vit model; base or large
vit: 'base'
batch_size_train: 16
batch_size_test: 32
vit_grad_ckpt: False
vit_ckpt_layer: 0
init_lr: 2e-5

image_size: 480

k_test: 128
inference: 'rank'

# optimizer
weight_decay: 0.05
min_lr: 0
max_epoch: 10
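
inference: 'rank' suggests that each question is answered by scoring a fixed pool of candidate answers and keeping the best one, rather than by free-form decoding, with k_test bounding how many candidates get re-scored. A toy illustration of the final selection step only (the scores here are made up; the real ones would come from the VQA model, which is not in this excerpt):

import torch

candidate_answers = ["yes", "no", "blue", "two"]        # illustrative answer pool
scores = torch.tensor([0.1, 0.2, 1.4, 0.3])             # stand-in for model scores
prediction = candidate_answers[scores.argmax().item()]  # -> "blue"
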
173 changes: 173 additions & 0 deletions demo.ipynb

Large diffs are not rendered by default.

118 changes: 118 additions & 0 deletions eval_nocaps.py
@@ -0,0 +1,118 @@
'''
* Copyright (c) 2022, salesforce.com, inc.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
* For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
* By Junnan Li
'''
import argparse
import os
import ruamel_yaml as yaml
import numpy as np
import random
import time
import datetime
import json
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torch.distributed as dist
from torch.utils.data import DataLoader

from models.blip import blip_decoder
import utils
from data import create_dataset, create_sampler, create_loader
from data.utils import save_result

@torch.no_grad()
def evaluate(model, data_loader, device, config):
# evaluate
model.eval()

metric_logger = utils.MetricLogger(delimiter=" ")
header = 'Evaluation:'
print_freq = 10

result = []
for image, image_id in metric_logger.log_every(data_loader, print_freq, header):

image = image.to(device)

captions = model.generate(image, sample=False, num_beams=config['num_beams'], max_length=config['max_length'],
min_length=config['min_length'], repetition_penalty=1.1)

for caption, img_id in zip(captions, image_id):
result.append({"image_id": img_id.item(), "caption": caption})

return result


def main(args, config):
utils.init_distributed_mode(args)

device = torch.device(args.device)

# fix the seed for reproducibility
seed = args.seed + utils.get_rank()
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
cudnn.benchmark = True

#### Dataset ####
print("Creating captioning dataset")
val_dataset, test_dataset = create_dataset('nocaps', config)

if args.distributed:
num_tasks = utils.get_world_size()
global_rank = utils.get_rank()
samplers = create_sampler([val_dataset,test_dataset], [False,False], num_tasks, global_rank)
else:
samplers = [None,None]

val_loader, test_loader = create_loader([val_dataset, test_dataset],samplers,
batch_size=[config['batch_size']]*2,num_workers=[4,4],
is_trains=[False, False], collate_fns=[None,None])

#### Model ####
print("Creating model")
model = blip_decoder(pretrained=config['pretrained'], image_size=config['image_size'], vit=config['vit'],
prompt=config['prompt'])

model = model.to(device)

model_without_ddp = model
if args.distributed:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
model_without_ddp = model.module

val_result = evaluate(model_without_ddp, val_loader, device, config)
val_result_file = save_result(val_result, args.result_dir, 'val', remove_duplicate='image_id')
test_result = evaluate(model_without_ddp, test_loader, device, config)
test_result_file = save_result(test_result, args.result_dir, 'test', remove_duplicate='image_id')


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--config', default='./configs/nocaps.yaml')
parser.add_argument('--output_dir', default='output/NoCaps')
parser.add_argument('--device', default='cuda')
parser.add_argument('--seed', default=42, type=int)
parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes')
parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training')
parser.add_argument('--distributed', default=True, type=bool)
args = parser.parse_args()

config = yaml.load(open(args.config, 'r'), Loader=yaml.Loader)

args.result_dir = os.path.join(args.output_dir, 'result')

Path(args.output_dir).mkdir(parents=True, exist_ok=True)
Path(args.result_dir).mkdir(parents=True, exist_ok=True)

yaml.dump(config, open(os.path.join(args.output_dir, 'config.yaml'), 'w'))

main(args, config)
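
Since the script calls utils.init_distributed_mode and defaults --distributed to True, it is presumably meant to be launched with PyTorch's distributed launcher; the exact launcher and process count below are assumptions, not taken from this commit:

python -m torch.distributed.run --nproc_per_node=8 eval_nocaps.py --config ./configs/nocaps.yaml --output_dir output/NoCaps
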
Empty file added models/__init__.py
Empty file.
Binary file added models/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file added models/__pycache__/__init__.cpython-38.pyc
Binary file not shown.
Binary file added models/__pycache__/blip.cpython-38.pyc
Binary file not shown.
Binary file added models/__pycache__/blip_nlvr.cpython-38.pyc
Binary file not shown.
Binary file added models/__pycache__/blip_retrieval.cpython-38.pyc
Binary file not shown.
Binary file added models/__pycache__/blip_vqa.cpython-38.pyc
Binary file not shown.
Binary file added models/__pycache__/booster.cpython-38.pyc
Binary file not shown.
Binary file added models/__pycache__/booster_nlvr.cpython-38.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added models/__pycache__/booster_vqa.cpython-38.pyc
Binary file not shown.
Binary file added models/__pycache__/med.cpython-36.pyc
Binary file not shown.
Binary file added models/__pycache__/med.cpython-38.pyc
Binary file not shown.
Binary file added models/__pycache__/nlvr_encoder.cpython-38.pyc
Binary file not shown.
Binary file added models/__pycache__/univlm.cpython-36.pyc
Binary file not shown.
Binary file added models/__pycache__/univlm.cpython-38.pyc
Binary file not shown.
Binary file added models/__pycache__/univlm_pretrain.cpython-38.pyc
Binary file not shown.
Binary file not shown.
Binary file added models/__pycache__/univlm_vqa.cpython-38.pyc
Binary file not shown.
Binary file added models/__pycache__/vit.cpython-36.pyc
Binary file not shown.
Binary file added models/__pycache__/vit.cpython-38.pyc
Binary file not shown.
Binary file added models/__pycache__/vl_model.cpython-38.pyc
Binary file not shown.
Binary file added models/__pycache__/xbert.cpython-38.pyc
Binary file not shown.
