# mmss_v07.yaml (forked from alirezazareian/ovr-cnn)

MODEL:
  # This is what indicates we want image-caption training, not object detection
  META_ARCHITECTURE: "MMSS-GCNN"
  # Catalog path to the initial weights, pretrained for ImageNet classification
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 2048
  BACKBONE:
    # A full ResNet, including the stem and all 4 residual blocks
    CONV_BODY: "R-50-C5"
    # Don't freeze any layer; train everything
    FREEZE_CONV_BODY_AT: 0
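    # (For context: in maskrcnn-benchmark the detection default of 2 freezes the stem
    #  and the first residual stage, while 0 leaves the whole backbone trainable.)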
  LANGUAGE_BACKBONE:
    # Make a BERT model to process captions
    TYPE: "BERT-Base"
    # and freeze it (loaded from the original pretrained BERT from Huggingface)
    FREEZE: True
  MMSS_HEAD:
    # We want both a grounding head and a transformer head on top of image and caption,
    # each of which defines its own objective functions.
    TYPES: ("GroundingHead", "TransformerHead")
    DEFAULT_HEAD: "GroundingHead"
    # Share the weights of the vision-to-language projection between the two heads.
    # Use the one on the grounding head because that is the default (see above).
    TIE_VL_PROJECTION_WEIGHTS: True
    # Randomly keep up to 100 visual regions from each image. This saves memory.
    SPATIAL_DROPOUT: 100
    GROUNDING:
      # Use dot product for grounding. This could be cosine or euclidean too.
      LOCAL_METRIC: "dot"
      # After aligning words to regions, sum the local distances to compute the global distance.
      GLOBAL_METRIC: "aligned_local"
      # Use softmax to softly align each word to regions, and vice versa.
      # This could instead be hardmax, which aligns each word to the most similar region.
      ALIGNMENT: "softmax"
      # Typical good values are 100.0 for euclidean, 10.0 for dot, 0.01 for cosine
      ALIGNMENT_TEMPERATURE: 10.0
      # This loss is for choosing the right caption out of all captions in the batch,
      # and similarly for choosing the right image. It could be a triplet loss instead.
      LOSS: "cross_entropy"
      # Whether to find a region for each word
      ALIGN_WORDS_TO_REGIONS: True
      # Whether to find a word for each region.
      # At least one of these two should be True.
      ALIGN_REGIONS_TO_WORDS: True
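      # (Roughly how these options combine: LOCAL_METRIC scores every word-region pair;
      #  ALIGNMENT turns those scores, scaled by ALIGNMENT_TEMPERATURE, into a soft
      #  attention of words over regions and/or regions over words; the attention-weighted
      #  local scores are pooled into a global image-caption score per GLOBAL_METRIC; and
      #  LOSS contrasts each image's true caption against the other captions in the batch.)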
    TRANSFORMER:
      # Whether to perform masked language modeling (randomly mask words from captions
      # and have the model reconstruct them)
      MASKED_LANGUAGE_MODELING: True
      # Whether to do that during validation as well. That is not good if you want to
      # measure image-caption matching scores.
      MASKED_LANGUAGE_MODELING_VALIDATION: False
      # Masked visual modeling is not implemented for now, so keep these False and ''
      MASKED_VISUAL_MODELING: False
      MVM_LOSS: ''
      # For the multimedia matching loss, cross-entropy works just like in the grounding head
      MMM_LOSS: 'cross_entropy'
      # Typical BERT config keys, as in Huggingface
      BERT_CONFIG:
        num_hidden_layers: 6
        num_attention_heads: 8
        intermediate_size: 768
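        # (For comparison, Huggingface's BERT-Base defaults are 12 hidden layers,
        #  12 attention heads, and an intermediate size of 3072, so this transformer
        #  head is smaller than a full BERT-Base.)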
DATASETS:
  TRAIN: ("coco_captions_train",)
  TEST: ("coco_captions_val",)
  DATASET_CLASS: "COCOCaptionsDataset"
SOLVER:
  BASE_LR: 0.01
  WEIGHT_DECAY: 0.0001
  STEPS: (20000, 35000)
  MAX_ITER: 40000
  IMS_PER_BATCH: 64
  TEST_PERIOD: 1000
  CHECKPOINT_PERIOD: 1000
  LOG_PERIOD: 100
  CLIP_GRAD_NORM_AT: 5.0
  # A value greater than one means gradients are accumulated over several batches before each update
  GRADIENT_ACCUMULATION_STEPS: 1
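  # (The effective batch size is IMS_PER_BATCH times this value; with the settings above that is 64.)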
  # If true, it calls model.train() before computing the validation loss. Needed for some models.
  USE_TRAIN_MODE_FOR_VALIDATION_LOSS: False
TEST:
  DO_EVAL: False
  IMS_PER_BATCH: 64