Commit e896645 (parent a637cde), committed by yang on Feb 29, 2024. 27 changed files with 2,309 additions and 0 deletions.
# CSPNeXt ImageNet Pre-training

In this folder, we provide the ImageNet pre-training configs of RTMDet's backbone CSPNeXt.

## Requirements

To train with these configs, please install [MMClassification 1.x](https://github.com/open-mmlab/mmclassification/tree/1.x) first.

Install with MIM:

```shell
mim install "mmcls>=1.0.0rc0"
```

or with pip:

```shell
pip install "mmcls>=1.0.0rc0"
```
## Prepare Dataset

To pre-train on ImageNet, you need to prepare the dataset first. Please refer to the [guide](https://mmclassification.readthedocs.io/en/1.x/user_guides/dataset_prepare.html#imagenet).
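As a rough orientation only (the guide above is authoritative), a common ImageNet-1k layout for MMClassification looks like the tree below; the `meta/*.txt` annotation files can be omitted when images are already grouped into per-class folders:

```text
data/imagenet
├── meta
│   ├── train.txt
│   └── val.txt
├── train
│   ├── n01440764
│   │   ├── n01440764_10026.JPEG
│   │   └── ...
│   └── ...
└── val
    ├── ILSVRC2012_val_00000001.JPEG
    └── ...
```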
## How to Train

You can use the classification config in the same way as the detection config.

For single-GPU training, run:

```shell
python tools/train.py \
    ${CONFIG_FILE} \
    [optional arguments]
```

For multi-GPU training, run:

```shell
bash ./tools/dist_train.sh \
    ${CONFIG_FILE} \
    ${GPU_NUM} \
    [optional arguments]
```
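For example, to launch the CSPNeXt-tiny pre-training from this commit on 8 GPUs (the `8xb256` in the config name follows the usual OpenMMLab convention of 8 GPUs with a batch size of 256 each; paths are assumed relative to the repository root):

```shell
bash ./tools/dist_train.sh \
    configs/rtmdet/cspnext_imagenet_pretrain/cspnext-tiny_8xb256-rsb-a1-600e_in1k.py \
    8
```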
More details can be found in the [user guides](https://mmdetection.readthedocs.io/en/3.x/user_guides/train.html).
## Results and Models

|    Model     | Resolution | Params (M) | FLOPs (G) | Top-1 (%) | Top-5 (%) |                                                       Download                                                       |
| :----------: | :--------: | :--------: | :-------: | :-------: | :-------: | :-------------------------------------------------------------------------------------------------------------------: |
| CSPNeXt-tiny |  224x224   |    2.73    |   0.339   |   69.44   |   89.45   | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth)  |
|  CSPNeXt-s   |  224x224   |    4.89    |   0.664   |   74.41   |   92.23   |  [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth)    |
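As a hedged sketch of how such a checkpoint is typically consumed downstream (this follows the usual MMEngine `Pretrained` init_cfg convention; check the RTMDet detection configs in this repository for the exact fields):

```python
# Hypothetical excerpt of an RTMDet detection config: load the CSPNeXt-tiny
# ImageNet checkpoint from the table above into the detector's backbone.
checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth'  # noqa: E501

model = dict(
    backbone=dict(
        init_cfg=dict(
            type='Pretrained', prefix='backbone.', checkpoint=checkpoint)))
```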
configs/rtmdet/cspnext_imagenet_pretrain/cspnext-s_8xb256-rsb-a1-600e_in1k.py (67 additions)

```python
_base_ = [
    'mmcls::_base_/datasets/imagenet_bs256_rsb_a12.py',
    'mmcls::_base_/schedules/imagenet_bs2048_rsb.py',
    'mmcls::_base_/default_runtime.py'
]

custom_imports = dict(
    imports=['mmdet.models', 'mmyolo.models'], allow_failed_imports=False)

model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='mmyolo.CSPNeXt',
        arch='P5',
        out_indices=(4, ),
        expand_ratio=0.5,
        deepen_factor=0.33,
        widen_factor=0.5,
        channel_attention=True,
        norm_cfg=dict(type='BN'),
        act_cfg=dict(type='mmyolo.SiLU')),
    neck=dict(type='GlobalAveragePooling'),
    head=dict(
        type='LinearClsHead',
        num_classes=1000,
        in_channels=512,
        loss=dict(
            type='LabelSmoothLoss',
            label_smooth_val=0.1,
            mode='original',
            loss_weight=1.0),
        topk=(1, 5)),
    train_cfg=dict(augments=[
        dict(type='Mixup', alpha=0.2, num_classes=1000),
        dict(type='CutMix', alpha=1.0, num_classes=1000)
    ]))

# dataset settings
train_dataloader = dict(sampler=dict(type='RepeatAugSampler', shuffle=True))

# schedule settings
optim_wrapper = dict(
    optimizer=dict(weight_decay=0.01),
    paramwise_cfg=dict(bias_decay_mult=0., norm_decay_mult=0.),
)

param_scheduler = [
    # warm up learning rate scheduler
    dict(
        type='LinearLR',
        start_factor=0.0001,
        by_epoch=True,
        begin=0,
        end=5,
        # update by iter
        convert_to_iter_based=True),
    # main learning rate scheduler
    dict(
        type='CosineAnnealingLR',
        T_max=595,
        eta_min=1.0e-6,
        by_epoch=True,
        begin=5,
        end=600)
]

train_cfg = dict(by_epoch=True, max_epochs=600)
```
configs/rtmdet/cspnext_imagenet_pretrain/cspnext-tiny_8xb256-rsb-a1-600e_in1k.py (5 additions)

```python
_base_ = './cspnext-s_8xb256-rsb-a1-600e_in1k.py'

model = dict(
    backbone=dict(deepen_factor=0.167, widen_factor=0.375),
    head=dict(in_channels=384))
```
# Distill RTM Detectors Based on MMRazor

## Description

To further improve model accuracy while introducing little additional
computation cost, we apply feature-based distillation to the training phase
of these RTM detectors. In summary, our distillation strategy is threefold:

(1) Inspired by [PKD](https://arxiv.org/abs/2207.02039), we first normalize
the intermediate feature maps to have zero mean and unit variance before calculating
the distillation loss.

(2) Inspired by [CWD](https://arxiv.org/abs/2011.13256), we adopt the channel-wise
distillation paradigm, which pays more attention to the most salient regions
of each channel.

(3) Inspired by [DAMO-YOLO](https://arxiv.org/abs/2211.15444), the distillation
process is split into two stages: 1) the teacher distills the student during the
first stage (280 epochs) in the strong mosaic domain; 2) the student fine-tunes itself
in the no-mosaic domain during the second stage (20 epochs).
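For intuition, here is a minimal PyTorch sketch of steps (1) and (2), i.e. normalizing features before a channel-wise (spatial softmax) KL divergence. This is an illustrative re-implementation, not MMRazor's actual `ChannelWiseDivergence`/PKD code:

```python
import torch
import torch.nn.functional as F


def normalize_feature(feat: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    """PKD-style normalization: zero mean, unit variance per channel (NCHW)."""
    n, c, h, w = feat.shape
    flat = feat.reshape(n, c, -1)
    mean = flat.mean(dim=-1, keepdim=True)
    std = flat.std(dim=-1, keepdim=True)
    return ((flat - mean) / (std + eps)).reshape(n, c, h, w)


def channel_wise_divergence(feat_s: torch.Tensor,
                            feat_t: torch.Tensor,
                            tau: float = 1.0) -> torch.Tensor:
    """CWD-style loss: KL divergence between per-channel spatial distributions."""
    n, c, h, w = feat_s.shape
    log_p_s = F.log_softmax(feat_s.reshape(n * c, -1) / tau, dim=-1)
    log_p_t = F.log_softmax(feat_t.reshape(n * c, -1) / tau, dim=-1)
    p_t = log_p_t.exp()
    # KL(teacher || student), averaged over batch and channels.
    return (tau ** 2) * (p_t * (log_p_t - log_p_s)).sum(dim=-1).mean()


# Toy usage: distill one FPN level of the student towards the teacher.
student_fpn0 = torch.randn(2, 128, 40, 40)
teacher_fpn0 = torch.randn(2, 128, 40, 40)
loss = channel_wise_divergence(
    normalize_feature(student_fpn0), normalize_feature(teacher_fpn0))
```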
## Results and Models

| Location | Dataset | Teacher | Student | mAP | mAP(T) | mAP(S) | Config | Download |
| :------: | :-----: | :-----: | :-----: | :-: | :----: | :----: | :----: | :------: |
| FPN | COCO | [RTMDet-s](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py) | [RTMDet-tiny](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py) | 41.8 (+0.8) | 44.6 | 41.0 | [config](kd_tiny_rtmdet_s_neck_300e_coco.py) | [teacher](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco/rtmdet_s_syncbn_fast_8xb32-300e_coco_20221230_182329-0a8c901a.pth) \| [model](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_tiny_rtmdet_s_neck_300e_coco/kd_tiny_rtmdet_s_neck_300e_coco_20230213_104240-e1e4197c.pth) \| [log](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_tiny_rtmdet_s_neck_300e_coco/kd_tiny_rtmdet_s_neck_300e_coco_20230213_104240-176901d8.json) |
| FPN | COCO | [RTMDet-m](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco.py) | [RTMDet-s](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py) | 45.7 (+1.1) | 49.3 | 44.6 | [config](kd_s_rtmdet_m_neck_300e_coco.py) | [teacher](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco/rtmdet_m_syncbn_fast_8xb32-300e_coco_20230102_135952-40af4fe8.pth) \| [model](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_s_rtmdet_m_neck_300e_coco/kd_s_rtmdet_m_neck_300e_coco_20230220_140647-446ff003.pth) \| [log](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_s_rtmdet_m_neck_300e_coco/kd_s_rtmdet_m_neck_300e_coco_20230220_140647-89862269.json) |
| FPN | COCO | [RTMDet-l](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py) | [RTMDet-m](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco.py) | 50.2 (+0.9) | 51.4 | 49.3 | [config](kd_m_rtmdet_l_neck_300e_coco.py) | [teacher](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco/rtmdet_l_syncbn_fast_8xb32-300e_coco_20230102_135928-ee3abdc4.pth) \| [model](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_m_rtmdet_l_neck_300e_coco/kd_m_rtmdet_l_neck_300e_coco_20230220_141313-b806f503.pth) \| [log](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_m_rtmdet_l_neck_300e_coco/kd_m_rtmdet_l_neck_300e_coco_20230220_141313-bd028fd3.json) |
| FPN | COCO | [RTMDet-x](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco.py) | [RTMDet-l](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py) | 52.3 (+0.9) | 52.8 | 51.4 | [config](kd_l_rtmdet_x_neck_300e_coco.py) | [teacher](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco/rtmdet_x_syncbn_fast_8xb32-300e_coco_20221231_100345-b85cd476.pth) \| [model](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_l_rtmdet_x_neck_300e_coco/kd_l_rtmdet_x_neck_300e_coco_20230220_141912-c9979722.pth) \| [log](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_l_rtmdet_x_neck_300e_coco/kd_l_rtmdet_x_neck_300e_coco_20230220_141912-c5c4e17b.json) |
## Usage

### Prerequisites

- [MMRazor dev-1.x](https://github.com/open-mmlab/mmrazor/tree/dev-1.x)

Install MMRazor from source:

```shell
git clone -b dev-1.x https://github.com/open-mmlab/mmrazor.git
cd mmrazor
# Install MMRazor
mim install -v -e .
```
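An optional sanity check that the editable install is visible to Python (the exact version string depends on the dev-1.x state you cloned):

```shell
python -c "import mmrazor; print(mmrazor.__version__)"
```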
### Training commands

In MMYOLO's root directory, run the following command to train the RTMDet-tiny
student with 8 GPUs, using RTMDet-s as the teacher:

```bash
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 PORT=29500 ./tools/dist_train.sh configs/rtmdet/distillation/kd_tiny_rtmdet_s_neck_300e_coco.py 8
```
### Testing commands

In MMYOLO's root directory, run the following command to test the model:

```bash
CUDA_VISIBLE_DEVICES=0 PORT=29500 ./tools/dist_test.sh configs/rtmdet/distillation/kd_tiny_rtmdet_s_neck_300e_coco.py ${CHECKPOINT_PATH} 1
```
### Getting the student-only checkpoint

After training, the checkpoint contains parameters for both the student and the teacher models.
Run the following command to convert it to a student-only checkpoint:

```bash
python ./tools/model_converters/convert_kd_ckpt_to_student.py ${CHECKPOINT_PATH} --out-path ${OUTPUT_CHECKPOINT_PATH}
```
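The converted checkpoint can then be evaluated with the plain student config; a hedged usage sketch for the RTMDet-tiny student from the table above:

```bash
python tools/test.py \
    configs/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py \
    ${OUTPUT_CHECKPOINT_PATH}
```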
## Configs

Here we provide detection configs and models for MMRazor in MMYOLO. For clarity,
we take `./kd_tiny_rtmdet_s_neck_300e_coco.py` as an example to show how to
distill an RTM detector based on MMRazor.

Here is the main part of `./kd_tiny_rtmdet_s_neck_300e_coco.py`:
```python
norm_cfg = dict(type='BN', affine=False, track_running_stats=False)

distiller=dict(
    type='ConfigurableDistiller',
    student_recorders=dict(
        fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'),
        fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'),
        fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv'),
    ),
    teacher_recorders=dict(
        fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'),
        fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'),
        fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv')),
    connectors=dict(
        fpn0_s=dict(type='ConvModuleConnector', in_channel=96,
                    out_channel=128, bias=False, norm_cfg=norm_cfg,
                    act_cfg=None),
        fpn0_t=dict(
            type='NormConnector', in_channels=128, norm_cfg=norm_cfg),
        fpn1_s=dict(
            type='ConvModuleConnector', in_channel=96,
            out_channel=128, bias=False, norm_cfg=norm_cfg,
            act_cfg=None),
        fpn1_t=dict(
            type='NormConnector', in_channels=128, norm_cfg=norm_cfg),
        fpn2_s=dict(
            type='ConvModuleConnector', in_channel=96,
            out_channel=128, bias=False, norm_cfg=norm_cfg,
            act_cfg=None),
        fpn2_t=dict(
            type='NormConnector', in_channels=128, norm_cfg=norm_cfg)),
    distill_losses=dict(
        loss_fpn0=dict(type='ChannelWiseDivergence', loss_weight=1),
        loss_fpn1=dict(type='ChannelWiseDivergence', loss_weight=1),
        loss_fpn2=dict(type='ChannelWiseDivergence', loss_weight=1)),
    loss_forward_mappings=dict(
        loss_fpn0=dict(
            preds_S=dict(from_student=True, recorder='fpn0', connector='fpn0_s'),
            preds_T=dict(from_student=False, recorder='fpn0', connector='fpn0_t')),
        loss_fpn1=dict(
            preds_S=dict(from_student=True, recorder='fpn1', connector='fpn1_s'),
            preds_T=dict(from_student=False, recorder='fpn1', connector='fpn1_t')),
        loss_fpn2=dict(
            preds_S=dict(from_student=True, recorder='fpn2', connector='fpn2_s'),
            preds_T=dict(from_student=False, recorder='fpn2', connector='fpn2_t'))))
```
`recorders` are used to record various intermediate results during the model forward pass.
In this example, they record the outputs of three `nn.Module`s of the teacher
and the student. Details are listed in [Recorder](https://github.com/open-mmlab/mmrazor/blob/dev-1.x/docs/en/advanced_guides/recorder.md) and [MMRazor Distillation](https://zhuanlan.zhihu.com/p/596582609) (in Chinese).

`connectors` are adaptive layers that usually map the teacher's and student's features
to the same dimension.
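For intuition, with the settings above a student-side `ConvModuleConnector` is roughly a 1x1 convolution followed by the configured norm, and a teacher-side `NormConnector` is just the norm. A plain-PyTorch sketch (illustrative stand-ins, not MMRazor's actual classes):

```python
import torch
import torch.nn as nn

# Student FPN features have 96 channels, teacher FPN features have 128.
student_connector = nn.Sequential(
    # stands in for ConvModuleConnector(in_channel=96, out_channel=128, bias=False)
    nn.Conv2d(96, 128, kernel_size=1, bias=False),
    # stands in for norm_cfg = dict(type='BN', affine=False, track_running_stats=False)
    nn.BatchNorm2d(128, affine=False, track_running_stats=False),
)
teacher_connector = nn.BatchNorm2d(128, affine=False, track_running_stats=False)

aligned_s = student_connector(torch.randn(2, 96, 40, 40))   # -> (2, 128, 40, 40)
aligned_t = teacher_connector(torch.randn(2, 128, 40, 40))  # -> (2, 128, 40, 40)
```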
`distill_losses` are the configs for the distillation losses.

`loss_forward_mappings` are mappings between the distillation losses' forward arguments and the recorded features.
In addition, since the student fine-tunes itself in the no-mosaic domain for the last 20 epochs,
we add a new hook named `StopDistillHook` to stop distillation at the right epoch.
Add it to the `custom_hooks` list like this:

```python
custom_hooks = [..., dict(type='mmrazor.StopDistillHook', stop_epoch=280)]
```
configs/rtmdet/distillation/kd_l_rtmdet_x_neck_300e_coco.py (99 additions)

```python
_base_ = '../rtmdet_l_syncbn_fast_8xb32-300e_coco.py'

teacher_ckpt = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco/rtmdet_x_syncbn_fast_8xb32-300e_coco_20221231_100345-b85cd476.pth'  # noqa: E501

norm_cfg = dict(type='BN', affine=False, track_running_stats=False)

model = dict(
    _delete_=True,
    _scope_='mmrazor',
    type='FpnTeacherDistill',
    architecture=dict(
        cfg_path='mmyolo::rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py'),
    teacher=dict(
        cfg_path='mmyolo::rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco.py'),
    teacher_ckpt=teacher_ckpt,
    distiller=dict(
        type='ConfigurableDistiller',
        # `recorders` are used to record various intermediate results during
        # the model forward.
        student_recorders=dict(
            fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'),
            fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'),
            fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv'),
        ),
        teacher_recorders=dict(
            fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'),
            fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'),
            fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv')),
        # `connectors` are adaptive layers which usually map the teacher's and
        # student's features to the same dimension.
        connectors=dict(
            fpn0_s=dict(
                type='ConvModuleConnector',
                in_channel=256,
                out_channel=320,
                bias=False,
                norm_cfg=norm_cfg,
                act_cfg=None),
            fpn0_t=dict(
                type='NormConnector', in_channels=320, norm_cfg=norm_cfg),
            fpn1_s=dict(
                type='ConvModuleConnector',
                in_channel=256,
                out_channel=320,
                bias=False,
                norm_cfg=norm_cfg,
                act_cfg=None),
            fpn1_t=dict(
                type='NormConnector', in_channels=320, norm_cfg=norm_cfg),
            fpn2_s=dict(
                type='ConvModuleConnector',
                in_channel=256,
                out_channel=320,
                bias=False,
                norm_cfg=norm_cfg,
                act_cfg=None),
            fpn2_t=dict(
                type='NormConnector', in_channels=320, norm_cfg=norm_cfg)),
        distill_losses=dict(
            loss_fpn0=dict(type='ChannelWiseDivergence', loss_weight=1),
            loss_fpn1=dict(type='ChannelWiseDivergence', loss_weight=1),
            loss_fpn2=dict(type='ChannelWiseDivergence', loss_weight=1)),
        # `loss_forward_mappings` are mappings between distill loss forward
        # arguments and records.
        loss_forward_mappings=dict(
            loss_fpn0=dict(
                preds_S=dict(
                    from_student=True, recorder='fpn0', connector='fpn0_s'),
                preds_T=dict(
                    from_student=False, recorder='fpn0', connector='fpn0_t')),
            loss_fpn1=dict(
                preds_S=dict(
                    from_student=True, recorder='fpn1', connector='fpn1_s'),
                preds_T=dict(
                    from_student=False, recorder='fpn1', connector='fpn1_t')),
            loss_fpn2=dict(
                preds_S=dict(
                    from_student=True, recorder='fpn2', connector='fpn2_s'),
                preds_T=dict(
                    from_student=False, recorder='fpn2',
                    connector='fpn2_t')))))

find_unused_parameters = True

custom_hooks = [
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0002,
        update_buffers=True,
        strict_load=False,
        priority=49),
    dict(
        type='mmdet.PipelineSwitchHook',
        switch_epoch=_base_.max_epochs - _base_.num_epochs_stage2,
        switch_pipeline=_base_.train_pipeline_stage2),
    # stop distillation after the 280th epoch
    dict(type='mmrazor.StopDistillHook', stop_epoch=280)
]
```