diff --git a/.circleci/docker/Dockerfile b/.circleci/docker/Dockerfile index d9cf8cc77..b5efe06a4 100644 --- a/.circleci/docker/Dockerfile +++ b/.circleci/docker/Dockerfile @@ -1,6 +1,7 @@ ARG PYTORCH="1.8.1" ARG CUDA="10.2" ARG CUDNN="7" +ARG DEBIAN_FRONTEND=noninteractive FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel diff --git a/.circleci/test.yml b/.circleci/test.yml index c24bebcb5..51d9770ad 100644 --- a/.circleci/test.yml +++ b/.circleci/test.yml @@ -80,16 +80,22 @@ jobs: type: string cuda: type: enum - enum: ["10.1", "10.2", "11.1", "11.7"] + enum: ["10.1", "10.2", "11.1", "11.7", "11.8"] cudnn: type: integer default: 7 machine: - image: ubuntu-2004-cuda-11.4:202110-01 + image: linux-cuda-11:default # docker_layer_caching: true - resource_class: gpu.nvidia.small + resource_class: gpu.nvidia.small.multi steps: - checkout + - run: + name: Install nvidia-container-toolkit and Restart Docker + command: | + sudo apt-get update + sudo apt-get install -y nvidia-container-toolkit + sudo systemctl restart docker - run: # Cloning repos in VM since Docker doesn't have access to the private key name: Clone Repos @@ -152,8 +158,8 @@ workflows: - lint - build_cpu: name: maximum_version_cpu - torch: 2.0.0 - torchvision: 0.15.1 + torch: 2.1.0 + torchvision: 0.16.0 python: 3.9.0 requires: - minimum_version_cpu @@ -171,10 +177,10 @@ workflows: - hold - build_cuda: name: mainstream_version_gpu - torch: 2.0.0 + torch: 2.1.0 # Use double quotation mark to explicitly specify its type # as string instead of number - cuda: "11.7" + cuda: "11.8" cudnn: 8 requires: - hold diff --git a/.codespellrc b/.codespellrc index d9a0a76c5..72be50e00 100644 --- a/.codespellrc +++ b/.codespellrc @@ -2,4 +2,4 @@ skip = *.ipynb count = quiet-level = 3 -ignore-words-list = convertor,convertors,formating,nin,wan,datas,hist,ned +ignore-words-list = convertor,convertors,formating,nin,wan,datas,hist,ned,ser diff --git a/.github/workflows/merge_stage_test.yml b/.github/workflows/merge_stage_test.yml index 856ede833..44be34746 100644 --- a/.github/workflows/merge_stage_test.yml +++ b/.github/workflows/merge_stage_test.yml @@ -60,7 +60,7 @@ jobs: strategy: matrix: python-version: [3.7] - torch: [1.6.0, 1.7.1, 1.8.1, 1.9.1, 1.10.1, 1.11.0, 1.12.1, 1.13.0] + torch: [1.6.0, 1.7.1, 1.8.1, 1.9.1, 1.10.1, 1.11.0, 1.12.1, 1.13.0, 2.0.0, 2.1.0] include: - torch: 1.6.0 torchvision: 0.7.0 @@ -81,6 +81,9 @@ jobs: - torch: 2.0.0 torchvision: 0.15.1 python-version: 3.8 + - torch: 2.1.0 + torchvision: 0.16.0 + python-version: 3.8 steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} diff --git a/configs/re/_base_/datasets/xfund_zh.py b/configs/re/_base_/datasets/xfund_zh.py new file mode 100644 index 000000000..06fb11c09 --- /dev/null +++ b/configs/re/_base_/datasets/xfund_zh.py @@ -0,0 +1,14 @@ +xfund_zh_re_data_root = 'data/xfund/zh' + +xfund_zh_re_train = dict( + type='XFUNDDataset', + data_root=xfund_zh_re_data_root, + ann_file='re_train.json', + pipeline=None) + +xfund_zh_re_test = dict( + type='XFUNDDataset', + data_root=xfund_zh_re_data_root, + ann_file='re_test.json', + test_mode=True, + pipeline=None) diff --git a/configs/ser/_base_/datasets/xfund_zh.py b/configs/ser/_base_/datasets/xfund_zh.py new file mode 100644 index 000000000..e790a7bf6 --- /dev/null +++ b/configs/ser/_base_/datasets/xfund_zh.py @@ -0,0 +1,14 @@ +xfund_zh_ser_data_root = 'data/xfund/zh' + +xfund_zh_ser_train = dict( + type='XFUNDDataset', + data_root=xfund_zh_ser_data_root, + ann_file='ser_train.json', + 
pipeline=None) + +xfund_zh_ser_test = dict( + type='XFUNDDataset', + data_root=xfund_zh_ser_data_root, + ann_file='ser_test.json', + test_mode=True, + pipeline=None) diff --git a/dataset_zoo/xfund/de/metafile.yml b/dataset_zoo/xfund/de/metafile.yml new file mode 100644 index 000000000..86dfff885 --- /dev/null +++ b/dataset_zoo/xfund/de/metafile.yml @@ -0,0 +1,41 @@ +Name: 'XFUND' +Paper: + Title: 'XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding' + URL: https://aclanthology.org/2022.findings-acl.253 + Venue: ACL + Year: '2022' + BibTeX: '@inproceedings{xu-etal-2022-xfund, + title = "{XFUND}: A Benchmark Dataset for Multilingual Visually Rich Form Understanding", + author = "Xu, Yiheng and + Lv, Tengchao and + Cui, Lei and + Wang, Guoxin and + Lu, Yijuan and + Florencio, Dinei and + Zhang, Cha and + Wei, Furu", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-acl.253", + doi = "10.18653/v1/2022.findings-acl.253", + pages = "3214--3224", + abstract = "Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually rich document understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. However, the existed research work has focused only on the English domain while neglecting the importance of multilingual generalization. In this paper, we introduce a human-annotated multilingual form understanding benchmark dataset named XFUND, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). Meanwhile, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually rich document understanding. Experimental results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUND dataset. 
The XFUND dataset and the pre-trained LayoutXLM model have been publicly available at https://aka.ms/layoutxlm.", +}' +Data: + Website: https://github.com/doc-analysis/XFUND + Language: + - Chinese, Japanese, Spanish, French, Italian, German, Portuguese + Scene: + - Document + Granularity: + - Word + Tasks: + - ser + - re + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .json diff --git a/dataset_zoo/xfund/de/re.py b/dataset_zoo/xfund/de/re.py new file mode 100644 index 000000000..e0419d026 --- /dev/null +++ b/dataset_zoo/xfund/de/re.py @@ -0,0 +1,6 @@ +_base_ = ['ser.py'] + +_base_.train_preparer.packer.type = 'REPacker' +_base_.test_preparer.packer.type = 'REPacker' + +config_generator = dict(type='XFUNDREConfigGenerator') diff --git a/dataset_zoo/xfund/de/sample_anno.md b/dataset_zoo/xfund/de/sample_anno.md new file mode 100644 index 000000000..6f41a5e92 --- /dev/null +++ b/dataset_zoo/xfund/de/sample_anno.md @@ -0,0 +1,70 @@ +**Semantic Entity Recognition / Relation Extraction** + +```json +{ + "lang": "zh", + "version": "0.1", + "split": "val", + "documents": [ + { + "id": "zh_val_0", + "uid": "0ac15750a098682aa02b51555f7c49ff43adc0436c325548ba8dba560cde4e7e", + "document": [ + { + "box": [ + 410, + 541, + 535, + 590 + ], + "text": "夏艳辰", + "label": "answer", + "words": [ + { + "box": [ + 413, + 541, + 447, + 587 + ], + "text": "夏" + }, + { + "box": [ + 458, + 542, + 489, + 588 + ], + "text": "艳" + }, + { + "box": [ + 497, + 544, + 531, + 590 + ], + "text": "辰" + } + ], + "linking": [ + [ + 30, + 26 + ] + ], + "id": 26 + }, + // ... + ], + "img": { + "fname": "zh_val_0.jpg", + "width": 2480, + "height": 3508 + } + }, + // ... + ] +} +``` diff --git a/dataset_zoo/xfund/de/ser.py b/dataset_zoo/xfund/de/ser.py new file mode 100644 index 000000000..5e9769eb0 --- /dev/null +++ b/dataset_zoo/xfund/de/ser.py @@ -0,0 +1,60 @@ +lang = 'de' +data_root = f'data/xfund/{lang}' +cache_path = 'data/cache' + +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.zip', + save_name=f'{lang}_train.zip', + md5='8c9f949952d227290e22f736cdbe4d29', + content=['image'], + mapping=[[f'{lang}_train/*.jpg', 'imgs/train']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.json', + save_name=f'{lang}_train.json', + md5='3e4b95c7da893bf5a91018445c83ccdd', + content=['annotation'], + mapping=[[f'{lang}_train.json', 'annotations/train.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='train.json', img_dir='imgs/train'), + parser=dict(type='XFUNDAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.zip', + save_name=f'{lang}_val.zip', + md5='d13d12278d585214183c3cfb949b0e59', + content=['image'], + mapping=[[f'{lang}_val/*.jpg', 'imgs/test']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.json', + save_name=f'{lang}_val.json', + md5='8eaf742f2d19b17f5c0e72da5c7761ef', + content=['annotation'], + mapping=[[f'{lang}_val.json', 'annotations/test.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='test.json', img_dir='imgs/test'), + parser=dict(type='XFUNDAnnParser'), + 
packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] +config_generator = dict(type='XFUNDSERConfigGenerator') diff --git a/dataset_zoo/xfund/es/metafile.yml b/dataset_zoo/xfund/es/metafile.yml new file mode 100644 index 000000000..86dfff885 --- /dev/null +++ b/dataset_zoo/xfund/es/metafile.yml @@ -0,0 +1,41 @@ +Name: 'XFUND' +Paper: + Title: 'XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding' + URL: https://aclanthology.org/2022.findings-acl.253 + Venue: ACL + Year: '2022' + BibTeX: '@inproceedings{xu-etal-2022-xfund, + title = "{XFUND}: A Benchmark Dataset for Multilingual Visually Rich Form Understanding", + author = "Xu, Yiheng and + Lv, Tengchao and + Cui, Lei and + Wang, Guoxin and + Lu, Yijuan and + Florencio, Dinei and + Zhang, Cha and + Wei, Furu", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-acl.253", + doi = "10.18653/v1/2022.findings-acl.253", + pages = "3214--3224", + abstract = "Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually rich document understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. However, the existed research work has focused only on the English domain while neglecting the importance of multilingual generalization. In this paper, we introduce a human-annotated multilingual form understanding benchmark dataset named XFUND, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). Meanwhile, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually rich document understanding. Experimental results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUND dataset. 
The XFUND dataset and the pre-trained LayoutXLM model have been publicly available at https://aka.ms/layoutxlm.", +}' +Data: + Website: https://github.com/doc-analysis/XFUND + Language: + - Chinese, Japanese, Spanish, French, Italian, German, Portuguese + Scene: + - Document + Granularity: + - Word + Tasks: + - ser + - re + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .json diff --git a/dataset_zoo/xfund/es/re.py b/dataset_zoo/xfund/es/re.py new file mode 100644 index 000000000..e0419d026 --- /dev/null +++ b/dataset_zoo/xfund/es/re.py @@ -0,0 +1,6 @@ +_base_ = ['ser.py'] + +_base_.train_preparer.packer.type = 'REPacker' +_base_.test_preparer.packer.type = 'REPacker' + +config_generator = dict(type='XFUNDREConfigGenerator') diff --git a/dataset_zoo/xfund/es/sample_anno.md b/dataset_zoo/xfund/es/sample_anno.md new file mode 100644 index 000000000..6f41a5e92 --- /dev/null +++ b/dataset_zoo/xfund/es/sample_anno.md @@ -0,0 +1,70 @@ +**Semantic Entity Recognition / Relation Extraction** + +```json +{ + "lang": "zh", + "version": "0.1", + "split": "val", + "documents": [ + { + "id": "zh_val_0", + "uid": "0ac15750a098682aa02b51555f7c49ff43adc0436c325548ba8dba560cde4e7e", + "document": [ + { + "box": [ + 410, + 541, + 535, + 590 + ], + "text": "夏艳辰", + "label": "answer", + "words": [ + { + "box": [ + 413, + 541, + 447, + 587 + ], + "text": "夏" + }, + { + "box": [ + 458, + 542, + 489, + 588 + ], + "text": "艳" + }, + { + "box": [ + 497, + 544, + 531, + 590 + ], + "text": "辰" + } + ], + "linking": [ + [ + 30, + 26 + ] + ], + "id": 26 + }, + // ... + ], + "img": { + "fname": "zh_val_0.jpg", + "width": 2480, + "height": 3508 + } + }, + // ... + ] +} +``` diff --git a/dataset_zoo/xfund/es/ser.py b/dataset_zoo/xfund/es/ser.py new file mode 100644 index 000000000..da8900980 --- /dev/null +++ b/dataset_zoo/xfund/es/ser.py @@ -0,0 +1,60 @@ +lang = 'es' +data_root = f'data/xfund/{lang}' +cache_path = 'data/cache' + +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.zip', + save_name=f'{lang}_train.zip', + md5='0ff89032bc6cb2e7ccba062c71944d03', + content=['image'], + mapping=[[f'{lang}_train/*.jpg', 'imgs/train']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.json', + save_name=f'{lang}_train.json', + md5='b40b43f276c7deaaaa5923d035da2820', + content=['annotation'], + mapping=[[f'{lang}_train.json', 'annotations/train.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='train.json', img_dir='imgs/train'), + parser=dict(type='XFUNDAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.zip', + save_name=f'{lang}_val.zip', + md5='efad9fb11ee3036bef003b6364a79ac0', + content=['image'], + mapping=[[f'{lang}_val/*.jpg', 'imgs/test']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.json', + save_name=f'{lang}_val.json', + md5='96ffc2057049ba2826a005825b3e7f0d', + content=['annotation'], + mapping=[[f'{lang}_val.json', 'annotations/test.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='test.json', img_dir='imgs/test'), + parser=dict(type='XFUNDAnnParser'), + 
packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] +config_generator = dict(type='XFUNDSERConfigGenerator') diff --git a/dataset_zoo/xfund/fr/metafile.yml b/dataset_zoo/xfund/fr/metafile.yml new file mode 100644 index 000000000..86dfff885 --- /dev/null +++ b/dataset_zoo/xfund/fr/metafile.yml @@ -0,0 +1,41 @@ +Name: 'XFUND' +Paper: + Title: 'XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding' + URL: https://aclanthology.org/2022.findings-acl.253 + Venue: ACL + Year: '2022' + BibTeX: '@inproceedings{xu-etal-2022-xfund, + title = "{XFUND}: A Benchmark Dataset for Multilingual Visually Rich Form Understanding", + author = "Xu, Yiheng and + Lv, Tengchao and + Cui, Lei and + Wang, Guoxin and + Lu, Yijuan and + Florencio, Dinei and + Zhang, Cha and + Wei, Furu", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-acl.253", + doi = "10.18653/v1/2022.findings-acl.253", + pages = "3214--3224", + abstract = "Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually rich document understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. However, the existed research work has focused only on the English domain while neglecting the importance of multilingual generalization. In this paper, we introduce a human-annotated multilingual form understanding benchmark dataset named XFUND, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). Meanwhile, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually rich document understanding. Experimental results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUND dataset. 
The XFUND dataset and the pre-trained LayoutXLM model have been publicly available at https://aka.ms/layoutxlm.", +}' +Data: + Website: https://github.com/doc-analysis/XFUND + Language: + - Chinese, Japanese, Spanish, French, Italian, German, Portuguese + Scene: + - Document + Granularity: + - Word + Tasks: + - ser + - re + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .json diff --git a/dataset_zoo/xfund/fr/re.py b/dataset_zoo/xfund/fr/re.py new file mode 100644 index 000000000..e0419d026 --- /dev/null +++ b/dataset_zoo/xfund/fr/re.py @@ -0,0 +1,6 @@ +_base_ = ['ser.py'] + +_base_.train_preparer.packer.type = 'REPacker' +_base_.test_preparer.packer.type = 'REPacker' + +config_generator = dict(type='XFUNDREConfigGenerator') diff --git a/dataset_zoo/xfund/fr/sample_anno.md b/dataset_zoo/xfund/fr/sample_anno.md new file mode 100644 index 000000000..6f41a5e92 --- /dev/null +++ b/dataset_zoo/xfund/fr/sample_anno.md @@ -0,0 +1,70 @@ +**Semantic Entity Recognition / Relation Extraction** + +```json +{ + "lang": "zh", + "version": "0.1", + "split": "val", + "documents": [ + { + "id": "zh_val_0", + "uid": "0ac15750a098682aa02b51555f7c49ff43adc0436c325548ba8dba560cde4e7e", + "document": [ + { + "box": [ + 410, + 541, + 535, + 590 + ], + "text": "夏艳辰", + "label": "answer", + "words": [ + { + "box": [ + 413, + 541, + 447, + 587 + ], + "text": "夏" + }, + { + "box": [ + 458, + 542, + 489, + 588 + ], + "text": "艳" + }, + { + "box": [ + 497, + 544, + 531, + 590 + ], + "text": "辰" + } + ], + "linking": [ + [ + 30, + 26 + ] + ], + "id": 26 + }, + // ... + ], + "img": { + "fname": "zh_val_0.jpg", + "width": 2480, + "height": 3508 + } + }, + // ... + ] +} +``` diff --git a/dataset_zoo/xfund/fr/ser.py b/dataset_zoo/xfund/fr/ser.py new file mode 100644 index 000000000..aad6b7cf3 --- /dev/null +++ b/dataset_zoo/xfund/fr/ser.py @@ -0,0 +1,60 @@ +lang = 'fr' +data_root = f'data/xfund/{lang}' +cache_path = 'data/cache' + +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.zip', + save_name=f'{lang}_train.zip', + md5='d821ca50f37cc39ff1715632f4068ea1', + content=['image'], + mapping=[[f'{lang}_train/*.jpg', 'imgs/train']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.json', + save_name=f'{lang}_train.json', + md5='349e7f824225bc7cc53f0c0eb8c87d3e', + content=['annotation'], + mapping=[[f'{lang}_train.json', 'annotations/train.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='train.json', img_dir='imgs/train'), + parser=dict(type='XFUNDAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.zip', + save_name=f'{lang}_val.zip', + md5='9ccbf15816ca05e50229885b75e57e49', + content=['image'], + mapping=[[f'{lang}_val/*.jpg', 'imgs/test']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.json', + save_name=f'{lang}_val.json', + md5='15d8a52a4eb20ea029a4aa3eaa25ef8d', + content=['annotation'], + mapping=[[f'{lang}_val.json', 'annotations/test.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='test.json', img_dir='imgs/test'), + parser=dict(type='XFUNDAnnParser'), + 
packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] +config_generator = dict(type='XFUNDSERConfigGenerator') diff --git a/dataset_zoo/xfund/it/metafile.yml b/dataset_zoo/xfund/it/metafile.yml new file mode 100644 index 000000000..86dfff885 --- /dev/null +++ b/dataset_zoo/xfund/it/metafile.yml @@ -0,0 +1,41 @@ +Name: 'XFUND' +Paper: + Title: 'XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding' + URL: https://aclanthology.org/2022.findings-acl.253 + Venue: ACL + Year: '2022' + BibTeX: '@inproceedings{xu-etal-2022-xfund, + title = "{XFUND}: A Benchmark Dataset for Multilingual Visually Rich Form Understanding", + author = "Xu, Yiheng and + Lv, Tengchao and + Cui, Lei and + Wang, Guoxin and + Lu, Yijuan and + Florencio, Dinei and + Zhang, Cha and + Wei, Furu", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-acl.253", + doi = "10.18653/v1/2022.findings-acl.253", + pages = "3214--3224", + abstract = "Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually rich document understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. However, the existed research work has focused only on the English domain while neglecting the importance of multilingual generalization. In this paper, we introduce a human-annotated multilingual form understanding benchmark dataset named XFUND, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). Meanwhile, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually rich document understanding. Experimental results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUND dataset. 
The XFUND dataset and the pre-trained LayoutXLM model have been publicly available at https://aka.ms/layoutxlm.", +}' +Data: + Website: https://github.com/doc-analysis/XFUND + Language: + - Chinese, Japanese, Spanish, French, Italian, German, Portuguese + Scene: + - Document + Granularity: + - Word + Tasks: + - ser + - re + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .json diff --git a/dataset_zoo/xfund/it/re.py b/dataset_zoo/xfund/it/re.py new file mode 100644 index 000000000..e0419d026 --- /dev/null +++ b/dataset_zoo/xfund/it/re.py @@ -0,0 +1,6 @@ +_base_ = ['ser.py'] + +_base_.train_preparer.packer.type = 'REPacker' +_base_.test_preparer.packer.type = 'REPacker' + +config_generator = dict(type='XFUNDREConfigGenerator') diff --git a/dataset_zoo/xfund/it/sample_anno.md b/dataset_zoo/xfund/it/sample_anno.md new file mode 100644 index 000000000..6f41a5e92 --- /dev/null +++ b/dataset_zoo/xfund/it/sample_anno.md @@ -0,0 +1,70 @@ +**Semantic Entity Recognition / Relation Extraction** + +```json +{ + "lang": "zh", + "version": "0.1", + "split": "val", + "documents": [ + { + "id": "zh_val_0", + "uid": "0ac15750a098682aa02b51555f7c49ff43adc0436c325548ba8dba560cde4e7e", + "document": [ + { + "box": [ + 410, + 541, + 535, + 590 + ], + "text": "夏艳辰", + "label": "answer", + "words": [ + { + "box": [ + 413, + 541, + 447, + 587 + ], + "text": "夏" + }, + { + "box": [ + 458, + 542, + 489, + 588 + ], + "text": "艳" + }, + { + "box": [ + 497, + 544, + 531, + 590 + ], + "text": "辰" + } + ], + "linking": [ + [ + 30, + 26 + ] + ], + "id": 26 + }, + // ... + ], + "img": { + "fname": "zh_val_0.jpg", + "width": 2480, + "height": 3508 + } + }, + // ... + ] +} +``` diff --git a/dataset_zoo/xfund/it/ser.py b/dataset_zoo/xfund/it/ser.py new file mode 100644 index 000000000..fc9fc8b70 --- /dev/null +++ b/dataset_zoo/xfund/it/ser.py @@ -0,0 +1,60 @@ +lang = 'it' +data_root = f'data/xfund/{lang}' +cache_path = 'data/cache' + +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.zip', + save_name=f'{lang}_train.zip', + md5='c531e39f0cbc1dc74caa320ffafe5de9', + content=['image'], + mapping=[[f'{lang}_train/*.jpg', 'imgs/train']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.json', + save_name=f'{lang}_train.json', + md5='fa6afe204a6af57152627e76fe2de005', + content=['annotation'], + mapping=[[f'{lang}_train.json', 'annotations/train.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='train.json', img_dir='imgs/train'), + parser=dict(type='XFUNDAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.zip', + save_name=f'{lang}_val.zip', + md5='35446a115561d0773b7f2a0c2f32fe5c', + content=['image'], + mapping=[[f'{lang}_val/*.jpg', 'imgs/test']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.json', + save_name=f'{lang}_val.json', + md5='260d4ea447636cbca1ce1ca5fc5846d9', + content=['annotation'], + mapping=[[f'{lang}_val.json', 'annotations/test.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='test.json', img_dir='imgs/test'), + parser=dict(type='XFUNDAnnParser'), + 
packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] +config_generator = dict(type='XFUNDSERConfigGenerator') diff --git a/dataset_zoo/xfund/ja/metafile.yml b/dataset_zoo/xfund/ja/metafile.yml new file mode 100644 index 000000000..86dfff885 --- /dev/null +++ b/dataset_zoo/xfund/ja/metafile.yml @@ -0,0 +1,41 @@ +Name: 'XFUND' +Paper: + Title: 'XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding' + URL: https://aclanthology.org/2022.findings-acl.253 + Venue: ACL + Year: '2022' + BibTeX: '@inproceedings{xu-etal-2022-xfund, + title = "{XFUND}: A Benchmark Dataset for Multilingual Visually Rich Form Understanding", + author = "Xu, Yiheng and + Lv, Tengchao and + Cui, Lei and + Wang, Guoxin and + Lu, Yijuan and + Florencio, Dinei and + Zhang, Cha and + Wei, Furu", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-acl.253", + doi = "10.18653/v1/2022.findings-acl.253", + pages = "3214--3224", + abstract = "Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually rich document understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. However, the existed research work has focused only on the English domain while neglecting the importance of multilingual generalization. In this paper, we introduce a human-annotated multilingual form understanding benchmark dataset named XFUND, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). Meanwhile, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually rich document understanding. Experimental results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUND dataset. 
The XFUND dataset and the pre-trained LayoutXLM model have been publicly available at https://aka.ms/layoutxlm.", +}' +Data: + Website: https://github.com/doc-analysis/XFUND + Language: + - Chinese, Japanese, Spanish, French, Italian, German, Portuguese + Scene: + - Document + Granularity: + - Word + Tasks: + - ser + - re + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .json diff --git a/dataset_zoo/xfund/ja/re.py b/dataset_zoo/xfund/ja/re.py new file mode 100644 index 000000000..e0419d026 --- /dev/null +++ b/dataset_zoo/xfund/ja/re.py @@ -0,0 +1,6 @@ +_base_ = ['ser.py'] + +_base_.train_preparer.packer.type = 'REPacker' +_base_.test_preparer.packer.type = 'REPacker' + +config_generator = dict(type='XFUNDREConfigGenerator') diff --git a/dataset_zoo/xfund/ja/sample_anno.md b/dataset_zoo/xfund/ja/sample_anno.md new file mode 100644 index 000000000..6f41a5e92 --- /dev/null +++ b/dataset_zoo/xfund/ja/sample_anno.md @@ -0,0 +1,70 @@ +**Semantic Entity Recognition / Relation Extraction** + +```json +{ + "lang": "zh", + "version": "0.1", + "split": "val", + "documents": [ + { + "id": "zh_val_0", + "uid": "0ac15750a098682aa02b51555f7c49ff43adc0436c325548ba8dba560cde4e7e", + "document": [ + { + "box": [ + 410, + 541, + 535, + 590 + ], + "text": "夏艳辰", + "label": "answer", + "words": [ + { + "box": [ + 413, + 541, + 447, + 587 + ], + "text": "夏" + }, + { + "box": [ + 458, + 542, + 489, + 588 + ], + "text": "艳" + }, + { + "box": [ + 497, + 544, + 531, + 590 + ], + "text": "辰" + } + ], + "linking": [ + [ + 30, + 26 + ] + ], + "id": 26 + }, + // ... + ], + "img": { + "fname": "zh_val_0.jpg", + "width": 2480, + "height": 3508 + } + }, + // ... + ] +} +``` diff --git a/dataset_zoo/xfund/ja/ser.py b/dataset_zoo/xfund/ja/ser.py new file mode 100644 index 000000000..856b4f96d --- /dev/null +++ b/dataset_zoo/xfund/ja/ser.py @@ -0,0 +1,60 @@ +lang = 'ja' +data_root = f'data/xfund/{lang}' +cache_path = 'data/cache' + +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.zip', + save_name=f'{lang}_train.zip', + md5='50c22c6774706494080a73f8eabcf45d', + content=['image'], + mapping=[[f'{lang}_train/*.jpg', 'imgs/train']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.json', + save_name=f'{lang}_train.json', + md5='46cd53deab3b8fbd69278da56d1778c4', + content=['annotation'], + mapping=[[f'{lang}_train.json', 'annotations/train.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='train.json', img_dir='imgs/train'), + parser=dict(type='XFUNDAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.zip', + save_name=f'{lang}_val.zip', + md5='93a22fea044894264bfa3c9f9c84dd37', + content=['image'], + mapping=[[f'{lang}_val/*.jpg', 'imgs/test']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.json', + save_name=f'{lang}_val.json', + md5='f576b6dc6c08fd98cf877fb04bc4c8c3', + content=['annotation'], + mapping=[[f'{lang}_val.json', 'annotations/test.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='test.json', img_dir='imgs/test'), + parser=dict(type='XFUNDAnnParser'), + 
packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] +config_generator = dict(type='XFUNDSERConfigGenerator') diff --git a/dataset_zoo/xfund/pt/metafile.yml b/dataset_zoo/xfund/pt/metafile.yml new file mode 100644 index 000000000..86dfff885 --- /dev/null +++ b/dataset_zoo/xfund/pt/metafile.yml @@ -0,0 +1,41 @@ +Name: 'XFUND' +Paper: + Title: 'XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding' + URL: https://aclanthology.org/2022.findings-acl.253 + Venue: ACL + Year: '2022' + BibTeX: '@inproceedings{xu-etal-2022-xfund, + title = "{XFUND}: A Benchmark Dataset for Multilingual Visually Rich Form Understanding", + author = "Xu, Yiheng and + Lv, Tengchao and + Cui, Lei and + Wang, Guoxin and + Lu, Yijuan and + Florencio, Dinei and + Zhang, Cha and + Wei, Furu", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-acl.253", + doi = "10.18653/v1/2022.findings-acl.253", + pages = "3214--3224", + abstract = "Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually rich document understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. However, the existed research work has focused only on the English domain while neglecting the importance of multilingual generalization. In this paper, we introduce a human-annotated multilingual form understanding benchmark dataset named XFUND, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). Meanwhile, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually rich document understanding. Experimental results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUND dataset. 
The XFUND dataset and the pre-trained LayoutXLM model have been publicly available at https://aka.ms/layoutxlm.", +}' +Data: + Website: https://github.com/doc-analysis/XFUND + Language: + - Chinese, Japanese, Spanish, French, Italian, German, Portuguese + Scene: + - Document + Granularity: + - Word + Tasks: + - ser + - re + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .json diff --git a/dataset_zoo/xfund/pt/re.py b/dataset_zoo/xfund/pt/re.py new file mode 100644 index 000000000..e0419d026 --- /dev/null +++ b/dataset_zoo/xfund/pt/re.py @@ -0,0 +1,6 @@ +_base_ = ['ser.py'] + +_base_.train_preparer.packer.type = 'REPacker' +_base_.test_preparer.packer.type = 'REPacker' + +config_generator = dict(type='XFUNDREConfigGenerator') diff --git a/dataset_zoo/xfund/pt/sample_anno.md b/dataset_zoo/xfund/pt/sample_anno.md new file mode 100644 index 000000000..6f41a5e92 --- /dev/null +++ b/dataset_zoo/xfund/pt/sample_anno.md @@ -0,0 +1,70 @@ +**Semantic Entity Recognition / Relation Extraction** + +```json +{ + "lang": "zh", + "version": "0.1", + "split": "val", + "documents": [ + { + "id": "zh_val_0", + "uid": "0ac15750a098682aa02b51555f7c49ff43adc0436c325548ba8dba560cde4e7e", + "document": [ + { + "box": [ + 410, + 541, + 535, + 590 + ], + "text": "夏艳辰", + "label": "answer", + "words": [ + { + "box": [ + 413, + 541, + 447, + 587 + ], + "text": "夏" + }, + { + "box": [ + 458, + 542, + 489, + 588 + ], + "text": "艳" + }, + { + "box": [ + 497, + 544, + 531, + 590 + ], + "text": "辰" + } + ], + "linking": [ + [ + 30, + 26 + ] + ], + "id": 26 + }, + // ... + ], + "img": { + "fname": "zh_val_0.jpg", + "width": 2480, + "height": 3508 + } + }, + // ... + ] +} +``` diff --git a/dataset_zoo/xfund/pt/ser.py b/dataset_zoo/xfund/pt/ser.py new file mode 100644 index 000000000..ff147ba4c --- /dev/null +++ b/dataset_zoo/xfund/pt/ser.py @@ -0,0 +1,60 @@ +lang = 'pt' +data_root = f'data/xfund/{lang}' +cache_path = 'data/cache' + +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.zip', + save_name=f'{lang}_train.zip', + md5='783ba0aba419235bc81cf547e7c5011b', + content=['image'], + mapping=[[f'{lang}_train/*.jpg', 'imgs/train']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.json', + save_name=f'{lang}_train.json', + md5='3fe0fb93e631fcbc391216d2d7b0510d', + content=['annotation'], + mapping=[[f'{lang}_train.json', 'annotations/train.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='train.json', img_dir='imgs/train'), + parser=dict(type='XFUNDAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.zip', + save_name=f'{lang}_val.zip', + md5='5f0189d29c5a0e6764757457f54ba14f', + content=['image'], + mapping=[[f'{lang}_val/*.jpg', 'imgs/test']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.json', + save_name=f'{lang}_val.json', + md5='82a93addffdd7ac7fd978972adf1a348', + content=['annotation'], + mapping=[[f'{lang}_val.json', 'annotations/test.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='test.json', img_dir='imgs/test'), + parser=dict(type='XFUNDAnnParser'), + 
packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] +config_generator = dict(type='XFUNDSERConfigGenerator') diff --git a/dataset_zoo/xfund/zh/metafile.yml b/dataset_zoo/xfund/zh/metafile.yml new file mode 100644 index 000000000..86dfff885 --- /dev/null +++ b/dataset_zoo/xfund/zh/metafile.yml @@ -0,0 +1,41 @@ +Name: 'XFUND' +Paper: + Title: 'XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding' + URL: https://aclanthology.org/2022.findings-acl.253 + Venue: ACL + Year: '2022' + BibTeX: '@inproceedings{xu-etal-2022-xfund, + title = "{XFUND}: A Benchmark Dataset for Multilingual Visually Rich Form Understanding", + author = "Xu, Yiheng and + Lv, Tengchao and + Cui, Lei and + Wang, Guoxin and + Lu, Yijuan and + Florencio, Dinei and + Zhang, Cha and + Wei, Furu", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-acl.253", + doi = "10.18653/v1/2022.findings-acl.253", + pages = "3214--3224", + abstract = "Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually rich document understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. However, the existed research work has focused only on the English domain while neglecting the importance of multilingual generalization. In this paper, we introduce a human-annotated multilingual form understanding benchmark dataset named XFUND, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). Meanwhile, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually rich document understanding. Experimental results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUND dataset. 
The XFUND dataset and the pre-trained LayoutXLM model have been publicly available at https://aka.ms/layoutxlm.", +}' +Data: + Website: https://github.com/doc-analysis/XFUND + Language: + - Chinese, Japanese, Spanish, French, Italian, German, Portuguese + Scene: + - Document + Granularity: + - Word + Tasks: + - ser + - re + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .json diff --git a/dataset_zoo/xfund/zh/re.py b/dataset_zoo/xfund/zh/re.py new file mode 100644 index 000000000..e0419d026 --- /dev/null +++ b/dataset_zoo/xfund/zh/re.py @@ -0,0 +1,6 @@ +_base_ = ['ser.py'] + +_base_.train_preparer.packer.type = 'REPacker' +_base_.test_preparer.packer.type = 'REPacker' + +config_generator = dict(type='XFUNDREConfigGenerator') diff --git a/dataset_zoo/xfund/zh/sample_anno.md b/dataset_zoo/xfund/zh/sample_anno.md new file mode 100644 index 000000000..6f41a5e92 --- /dev/null +++ b/dataset_zoo/xfund/zh/sample_anno.md @@ -0,0 +1,70 @@ +**Semantic Entity Recognition / Relation Extraction** + +```json +{ + "lang": "zh", + "version": "0.1", + "split": "val", + "documents": [ + { + "id": "zh_val_0", + "uid": "0ac15750a098682aa02b51555f7c49ff43adc0436c325548ba8dba560cde4e7e", + "document": [ + { + "box": [ + 410, + 541, + 535, + 590 + ], + "text": "夏艳辰", + "label": "answer", + "words": [ + { + "box": [ + 413, + 541, + 447, + 587 + ], + "text": "夏" + }, + { + "box": [ + 458, + 542, + 489, + 588 + ], + "text": "艳" + }, + { + "box": [ + 497, + 544, + 531, + 590 + ], + "text": "辰" + } + ], + "linking": [ + [ + 30, + 26 + ] + ], + "id": 26 + }, + // ... + ], + "img": { + "fname": "zh_val_0.jpg", + "width": 2480, + "height": 3508 + } + }, + // ... + ] +} +``` diff --git a/dataset_zoo/xfund/zh/ser.py b/dataset_zoo/xfund/zh/ser.py new file mode 100644 index 000000000..20a3d1150 --- /dev/null +++ b/dataset_zoo/xfund/zh/ser.py @@ -0,0 +1,60 @@ +lang = 'zh' +data_root = f'data/xfund/{lang}' +cache_path = 'data/cache' + +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.zip', + save_name=f'{lang}_train.zip', + md5='a4ce16d1c1a8554a8b1e00907cff3b4b', + content=['image'], + mapping=[[f'{lang}_train/*.jpg', 'imgs/train']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.json', + save_name=f'{lang}_train.json', + md5='af1afd5e935cccd3a105de6c12eb4c31', + content=['annotation'], + mapping=[[f'{lang}_train.json', 'annotations/train.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='train.json', img_dir='imgs/train'), + parser=dict(type='XFUNDAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.zip', + save_name=f'{lang}_val.zip', + md5='f84c2651e350f5b394585207a43d06e4', + content=['image'], + mapping=[[f'{lang}_val/*.jpg', 'imgs/test']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.json', + save_name=f'{lang}_val.json', + md5='c243c35d1685a16435c8b281a445005c', + content=['annotation'], + mapping=[[f'{lang}_val.json', 'annotations/test.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='test.json', img_dir='imgs/test'), + parser=dict(type='XFUNDAnnParser'), + 
packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] +config_generator = dict(type='XFUNDSERConfigGenerator') diff --git a/mmocr/__init__.py b/mmocr/__init__.py index faf1ae81e..4524c4c3c 100644 --- a/mmocr/__init__.py +++ b/mmocr/__init__.py @@ -43,7 +43,7 @@ f'<{mmengine_maximum_version}.' mmdet_minimum_version = '3.0.0rc5' -mmdet_maximum_version = '3.2.0' +mmdet_maximum_version = '3.4.0' mmdet_version = digit_version(mmdet.__version__) assert (mmdet_version >= digit_version(mmdet_minimum_version) diff --git a/mmocr/datasets/preparers/config_generators/__init__.py b/mmocr/datasets/preparers/config_generators/__init__.py index 8e884c6d9..69e3b5157 100644 --- a/mmocr/datasets/preparers/config_generators/__init__.py +++ b/mmocr/datasets/preparers/config_generators/__init__.py @@ -3,8 +3,11 @@ from .textdet_config_generator import TextDetConfigGenerator from .textrecog_config_generator import TextRecogConfigGenerator from .textspotting_config_generator import TextSpottingConfigGenerator +from .xfund_config_generator import (XFUNDREConfigGenerator, + XFUNDSERConfigGenerator) __all__ = [ 'BaseDatasetConfigGenerator', 'TextDetConfigGenerator', - 'TextRecogConfigGenerator', 'TextSpottingConfigGenerator' + 'TextRecogConfigGenerator', 'TextSpottingConfigGenerator', + 'XFUNDSERConfigGenerator', 'XFUNDREConfigGenerator' ] diff --git a/mmocr/datasets/preparers/config_generators/xfund_config_generator.py b/mmocr/datasets/preparers/config_generators/xfund_config_generator.py new file mode 100644 index 000000000..0bc243505 --- /dev/null +++ b/mmocr/datasets/preparers/config_generators/xfund_config_generator.py @@ -0,0 +1,147 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional + +from mmocr.registry import CFG_GENERATORS +from .base import BaseDatasetConfigGenerator + + +@CFG_GENERATORS.register_module() +class XFUNDSERConfigGenerator(BaseDatasetConfigGenerator): + """XFUND dataset Semantic Entity Recognition task config generator. + + Args: + data_root (str): The root path of the dataset. + dataset_name (str): The name of the dataset. + overwrite_cfg (bool): Whether to overwrite the dataset config file if + it already exists. If False, config generator will not generate new + config for datasets whose configs are already in base. + train_anns (List[Dict], optional): A list of train annotation files + to appear in the base configs. Defaults to + ``[dict(file='ser_train.json', dataset_postfix='')]``. + Each element is typically a dict with the following fields: + - ann_file (str): The path to the annotation file relative to + data_root. + - dataset_postfix (str, optional): Affects the postfix of the + resulting variable in the generated config. If specified, the + dataset variable will be named in the form of + ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to + None. + val_anns (List[Dict], optional): A list of val annotation files + to appear in the base configs, similar to ``train_anns``. Defaults + to []. + test_anns (List[Dict], optional): A list of test annotation files + to appear in the base configs, similar to ``train_anns``. Defaults + to ``[dict(file='ser_test.json')]``. + config_path (str): Path to the configs. Defaults to 'configs/'. 
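+
+    Example:
+        A sketch of the base config this generator emits for
+        ``dataset_name='xfund/zh'``; it corresponds to
+        ``configs/ser/_base_/datasets/xfund_zh.py`` added in this change::
+
+            xfund_zh_ser_data_root = 'data/xfund/zh'
+
+            xfund_zh_ser_train = dict(
+                type='XFUNDDataset',
+                data_root=xfund_zh_ser_data_root,
+                ann_file='ser_train.json',
+                pipeline=None)
+
+            xfund_zh_ser_test = dict(
+                type='XFUNDDataset',
+                data_root=xfund_zh_ser_data_root,
+                ann_file='ser_test.json',
+                test_mode=True,
+                pipeline=None)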
+ """ + + def __init__(self, + data_root: str, + dataset_name: str, + overwrite_cfg: bool = False, + train_anns: Optional[List[Dict]] = [ + dict(ann_file='ser_train.json', dataset_postfix='') + ], + val_anns: Optional[List[Dict]] = [], + test_anns: Optional[List[Dict]] = [ + dict(ann_file='ser_test.json', dataset_postfix='') + ], + config_path: str = 'configs/') -> None: + + if '/' in dataset_name: + dataset_name = '_'.join(dataset_name.split('/')) + + super().__init__( + data_root=data_root, + task='ser', + overwrite_cfg=overwrite_cfg, + dataset_name=dataset_name, + train_anns=train_anns, + val_anns=val_anns, + test_anns=test_anns, + config_path=config_path, + ) + + def _gen_dataset_config(self) -> str: + """Generate a full dataset config based on the annotation file + dictionary. + + Args: + ann_dict (dict[str, dict(str, str)]): A nested dictionary that maps + a config variable name (such as icdar2015_textrecog_train) to + its corresponding annotation information dict. Each dict + contains following keys: + - ann_file (str): The path to the annotation file relative to + data_root. + - dataset_postfix (str, optional): Affects the postfix of the + resulting variable in the generated config. If specified, the + dataset variable will be named in the form of + ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults + to None. + - split (str): The split the annotation belongs to. Usually + it can be 'train', 'val' and 'test'. + + Returns: + str: The generated dataset config. + """ + cfg = '' + for key_name, ann_dict in self.anns.items(): + cfg += f'\n{key_name} = dict(\n' + cfg += ' type=\'XFUNDDataset\',\n' + cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501 + cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n' + if ann_dict['split'] in ['test', 'val']: + cfg += ' test_mode=True,\n' + cfg += ' pipeline=None)\n' + return cfg + + +@CFG_GENERATORS.register_module() +class XFUNDREConfigGenerator(BaseDatasetConfigGenerator): + """XFUND dataset Relation Extraction task config generator. 
+ + The main difference with `XFUNDSERConfigGenerator` is: + - train_anns/val_anns/test_anns default file name: + f'{ser or re}_{train or test}.json' + - the value of self.task: 'ser' or 're' + """ + + def __init__(self, + data_root: str, + dataset_name: str, + overwrite_cfg: bool = False, + train_anns: Optional[List[Dict]] = [ + dict(ann_file='re_train.json', dataset_postfix='') + ], + val_anns: Optional[List[Dict]] = [], + test_anns: Optional[List[Dict]] = [ + dict(ann_file='re_test.json', dataset_postfix='') + ], + config_path: str = 'configs/') -> None: + + if '/' in dataset_name: + dataset_name = '_'.join(dataset_name.split('/')) + + super().__init__( + data_root=data_root, + task='re', + overwrite_cfg=overwrite_cfg, + dataset_name=dataset_name, + train_anns=train_anns, + val_anns=val_anns, + test_anns=test_anns, + config_path=config_path, + ) + + def _gen_dataset_config(self) -> str: + """Same as `XFUNDSERConfigGenerator._gen_dataset_config()`""" + cfg = '' + for key_name, ann_dict in self.anns.items(): + cfg += f'\n{key_name} = dict(\n' + cfg += ' type=\'XFUNDDataset\',\n' + cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501 + cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n' + if ann_dict['split'] in ['test', 'val']: + cfg += ' test_mode=True,\n' + cfg += ' pipeline=None)\n' + return cfg diff --git a/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py b/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py index 51b0d266c..c743a4859 100644 --- a/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py +++ b/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py @@ -186,8 +186,7 @@ def move(self, mapping: List[Tuple[str, str]]) -> None: if '*' in src: mkdir_or_exist(dst) for f in glob.glob(src): - if not osp.exists( - osp.join(dst, osp.relpath(f, self.data_root))): + if not osp.exists(osp.join(dst, osp.basename(f))): shutil.move(f, dst) elif osp.exists(src) and not osp.exists(dst): diff --git a/mmocr/datasets/preparers/packers/__init__.py b/mmocr/datasets/preparers/packers/__init__.py index 78eb55dc4..a271a3ce5 100644 --- a/mmocr/datasets/preparers/packers/__init__.py +++ b/mmocr/datasets/preparers/packers/__init__.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from .base import BasePacker +from .re_packer import REPacker +from .ser_packer import SERPacker from .textdet_packer import TextDetPacker from .textrecog_packer import TextRecogCropPacker, TextRecogPacker from .textspotting_packer import TextSpottingPacker @@ -7,5 +9,5 @@ __all__ = [ 'BasePacker', 'TextDetPacker', 'TextRecogPacker', 'TextRecogCropPacker', - 'TextSpottingPacker', 'WildReceiptPacker' + 'TextSpottingPacker', 'WildReceiptPacker', 'SERPacker', 'REPacker' ] diff --git a/mmocr/datasets/preparers/packers/re_packer.py b/mmocr/datasets/preparers/packers/re_packer.py new file mode 100644 index 000000000..62dca972c --- /dev/null +++ b/mmocr/datasets/preparers/packers/re_packer.py @@ -0,0 +1,125 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import warnings +from typing import Dict, Tuple + +import mmcv + +from mmocr.registry import DATA_PACKERS +from .ser_packer import SERPacker + + +@DATA_PACKERS.register_module() +class REPacker(SERPacker): + """Relation Extraction packer. It is used to pack the parsed annotation + info to MMOCR format. + + .. 
code-block:: python + + { + "metainfo": {}, + "data_list": + [ + { + "img_path": "imgs\\test\\zh_val_0.jpg", + "height": 3508, + "width": 2480, + "instances": + { + "texts": ["绩效目标申报表(一级项目)", "项目名称", ...], + "boxes": [[906,195,1478,259], + [357,325,467,357], ...], + "labels": ["header", "question", ...], + "linkings": [[0, 1], [2, 3], ...], + "ids": [0, 1, ...], + "words": [[{ + "box": [ + 904, + 192, + 942, + 253 + ], + "text": "绩" + }, + { + "box": [ + 953, + 192, + 987, + 253 + ], + "text": "效" + }, ...], ...] + } + } + ] + } + """ + + def pack_instance(self, sample: Tuple) -> Dict: + """Pack the parsed annotation info to an MMOCR format instance. + + Args: + sample (Tuple): A tuple of (img_file, instances). + - img_path (str): Path to the image file. + - instances (Sequence[Dict]): A list of converted annos. Each + element should be a dict with the following keys: + + - 'text' + - 'box' + - 'label' + - 'linking' + - 'id' + - 'words' (optional) + + Returns: + Dict: An MMOCR format instance. + """ + + img_path, instances = sample + + img = mmcv.imread(img_path) + h, w = img.shape[:2] + + texts_per_doc = [] + boxes_per_doc = [] + labels_per_doc = [] + linking_per_doc = [] + id_per_doc = [] + has_words = all(['words' in ins for ins in instances]) + if has_words: + words_per_doc = [] + else: + warnings.warn( + 'Not all instance has `words` key,' + 'so final MMOCR format SER instance will not have `words` key') + + for instance in instances: + text = instance.get('text', None) + box = instance.get('box', None) + label = instance.get('label', None) + linking = instance.get('linking', None) + ins_id = instance.get('id', None) + assert text or box or label or linking or ins_id + texts_per_doc.append(text) + boxes_per_doc.append(box) + labels_per_doc.append(label) + linking_per_doc.append(linking) + id_per_doc.append(ins_id) + if has_words: + words = instance.get('words', None) + words_per_doc.append(words) + packed_instances = dict( + instances=dict( + texts=texts_per_doc, + boxes=boxes_per_doc, + labels=labels_per_doc, + linkings=linking_per_doc, + ids=id_per_doc), + img_path=osp.relpath(img_path, self.data_root), + height=h, + width=w) + if has_words: + packed_instances['instances'].update({'words': words_per_doc}) + + return packed_instances diff --git a/mmocr/datasets/preparers/packers/ser_packer.py b/mmocr/datasets/preparers/packers/ser_packer.py new file mode 100644 index 000000000..798cfc4a2 --- /dev/null +++ b/mmocr/datasets/preparers/packers/ser_packer.py @@ -0,0 +1,125 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import warnings +from typing import Dict, List, Tuple + +import mmcv + +from mmocr.registry import DATA_PACKERS +from .base import BasePacker + + +@DATA_PACKERS.register_module() +class SERPacker(BasePacker): + """Semantic Entity Recognition packer. It is used to pack the parsed + annotation info to MMOCR format. + + .. code-block:: python + + { + "metainfo": {}, + "data_list": + [ + { + "img_path": "imgs\\test\\zh_val_0.jpg", + "height": 3508, + "width": 2480, + "instances": + { + "texts": ["绩效目标申报表(一级项目)", "项目名称", ...], + "boxes": [[906,195,1478,259], + [357,325,467,357], ...], + "labels": ["header", "question", ...], + "words": [[{ + "box": [ + 904, + 192, + 942, + 253 + ], + "text": "绩" + }, + { + "box": [ + 953, + 192, + 987, + 253 + ], + "text": "效" + }, ...], ...] + } + } + ] + } + """ + + def pack_instance(self, sample: Tuple) -> Dict: + """Pack the parsed annotation info to an MMOCR format instance. 
+ + Args: + sample (Tuple): A tuple of (img_file, instances). + - img_path (str): Path to the image file. + - instances (Sequence[Dict]): A list of converted annos. Each + element should be a dict with the following keys: + + - 'text' + - 'box' + - 'label' + - 'words' (optional) + + Returns: + Dict: An MMOCR format instance. + """ + + img_path, instances = sample + + img = mmcv.imread(img_path) + h, w = img.shape[:2] + + texts_per_doc = [] + boxes_per_doc = [] + labels_per_doc = [] + has_words = all(['words' in ins for ins in instances]) + if has_words: + words_per_doc = [] + else: + warnings.warn( + 'Not all instance has `words` key,' + 'so final MMOCR format SER instance will not have `words` key') + + for instance in instances: + text = instance.get('text', None) + box = instance.get('box', None) + label = instance.get('label', None) + assert text or box or label + texts_per_doc.append(text) + boxes_per_doc.append(box) + labels_per_doc.append(label) + if has_words: + words = instance.get('words', None) + words_per_doc.append(words) + packed_instances = dict( + instances=dict( + texts=texts_per_doc, + boxes=boxes_per_doc, + labels=labels_per_doc), + img_path=osp.relpath(img_path, self.data_root), + height=h, + width=w) + if has_words: + packed_instances['instances'].update({'words': words_per_doc}) + + return packed_instances + + def add_meta(self, sample: List) -> Dict: + """Add meta information to the sample. + + Args: + sample (List): A list of samples of the dataset. + + Returns: + Dict: A dict contains the meta information and samples. + """ + meta = {'metainfo': {}, 'data_list': sample} + return meta diff --git a/mmocr/datasets/preparers/parsers/__init__.py b/mmocr/datasets/preparers/parsers/__init__.py index fd3794710..620797ae5 100644 --- a/mmocr/datasets/preparers/parsers/__init__.py +++ b/mmocr/datasets/preparers/parsers/__init__.py @@ -12,11 +12,12 @@ from .synthtext_parser import SynthTextAnnParser from .totaltext_parser import TotaltextTextDetAnnParser from .wildreceipt_parser import WildreceiptKIEAnnParser +from .xfund_parser import XFUNDAnnParser __all__ = [ 'BaseParser', 'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser', 'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser', 'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser', 'SROIETextDetAnnParser', 'NAFAnnParser', 'CTW1500AnnParser', - 'SynthTextAnnParser', 'MJSynthAnnParser' + 'SynthTextAnnParser', 'MJSynthAnnParser', 'XFUNDAnnParser' ] diff --git a/mmocr/datasets/preparers/parsers/xfund_parser.py b/mmocr/datasets/preparers/parsers/xfund_parser.py new file mode 100644 index 000000000..e776b0fee --- /dev/null +++ b/mmocr/datasets/preparers/parsers/xfund_parser.py @@ -0,0 +1,44 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os.path as osp +from typing import List + +from mmocr.registry import DATA_PARSERS +from .base import BaseParser + + +@DATA_PARSERS.register_module() +class XFUNDAnnParser(BaseParser): + """XFUND Semantic Entity Recognition and Relation Extraction Annotation + Parser. See dataset_zoo/xfund/xx/sample_anno.md for annotation example. + + Args: + nproc (int): The number of processes to parse the annotation. Defaults + to 1. 
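+
+    Example:
+        A sketch of one parsed instance, with values taken from the
+        ``sample_anno.md`` entry referenced above. ``parse_files`` returns a
+        list of ``(img_path, instances)`` tuples, and each element of
+        ``instances`` is a dict yielded by ``loader``::
+
+            dict(
+                text='夏艳辰',
+                box=[410, 541, 535, 590],
+                label='answer',
+                words=[dict(box=[413, 541, 447, 587], text='夏'), ...],
+                linking=[[30, 26]],
+                id=26)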
+ """ + + def parse_files(self, img_dir: str, ann_path: str) -> List: + """Parse annotations.""" + assert isinstance(ann_path, str) + samples = list() + for img_fname, instance in self.loader(ann_path): + samples.append((osp.join(img_dir, img_fname), instance)) + return samples + + def loader(self, file_path: str): + with open(file_path, 'r', encoding='utf-8') as f: + data = json.load(f) + for i in range(len(data['documents'])): + img_fname = data['documents'][i]['img']['fname'] + instances = list() + for j in range(len(data['documents'][i]['document'])): + cur_item = data['documents'][i]['document'][j] + instance = dict( + text=cur_item['text'], + box=cur_item['box'], + label=cur_item['label'], + words=cur_item['words'], + linking=cur_item['linking'], + id=cur_item['id']) + instances.append(instance) + yield img_fname, instances diff --git a/projects/LayoutLMv3/README.md b/projects/LayoutLMv3/README.md new file mode 100644 index 000000000..41561997d --- /dev/null +++ b/projects/LayoutLMv3/README.md @@ -0,0 +1,149 @@ +# LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking + +
+[arXiv paper] +
+ +## Description + +This is an implementation of [LayoutLMv3](https://github.com/microsoft/unilm/tree/master/layoutlmv3) based on [MMOCR](https://github.com/open-mmlab/mmocr/tree/dev-1.x), [MMCV](https://github.com/open-mmlab/mmcv), [MMEngine](https://github.com/open-mmlab/mmengine) and [Transformers](https://github.com/huggingface/transformers). + +**LayoutLMv3** Self-supervised pre-training techniques have achieved remarkable progress in Document AI. Most multimodal pre-trained models use a masked language modeling objective to learn bidirectional representations on the text modality, but they differ in pre-training objectives for the image modality. This discrepancy adds difficulty to multimodal representation learning. In this paper, we propose LayoutLMv3 to pre-train multimodal Transformers for Document AI with unified text and image masking. Additionally, LayoutLMv3 is pre-trained with a word-patch alignment objective to learn cross-modal alignment by predicting whether the corresponding image patch of a text word is masked. The simple unified architecture and training objectives make LayoutLMv3 a general-purpose pre-trained model for both text-centric and image-centric Document AI tasks. Experimental results show that LayoutLMv3 achieves state-of-the-art performance not only in text-centric tasks, including form understanding, receipt understanding, and document visual question answering, but also in image-centric tasks such as document image classification and document layout analysis.The code and models are publicly available at https://aka.ms/layoutlmv3. + +
+ +
+ +## Usage + + + +### Prerequisites + +- Python 3.7 +- PyTorch 1.6 or higher +- [Transformers](https://github.com/huggingface/transformers) 4.31.0.dev0 or higher +- [MIM](https://github.com/open-mmlab/mim) +- [MMOCR](https://github.com/open-mmlab/mmocr) + +### Preparing xfund dataset + +In MMOCR's root directory, run the following command to prepare xfund dataset: + +```shell +sh projects/LayoutLMv3/scripts/prepare_dataset.sh +``` + +### Downloading Pre-training LayoutLMv3 model + +Download the [LayoutLMv3 Chinese pre-trained model](https://huggingface.co/microsoft/layoutlmv3-base-chinese) from huggingface. + +### Training commands + +Modify the path of the parameter `hf_pretrained_model` in the config file(`projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1xbs8.py`) + +In MMOCR's root directory, run the following command to train the model: + +```bash +export TOKENIZERS_PARALLELISM=false +export OMP_NUM_THREADS=1 +mim train mmocr projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1xbs8.py --work-dir work_dirs/ +``` + + + +### Testing commands + +In MMOCR's root directory, run the following command to test the model: + +```bash +mim test mmocr projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1xbs8.py --work-dir work_dirs/ --checkpoint ${CHECKPOINT_PATH} +``` + +## Results + + + +## Citation + +If you find LayoutLMv3 useful in your research or applications, please cite LayoutLMv3 with the following BibTeX entry. + +```bibtex +@inproceedings{huang2022layoutlmv3, + title={Layoutlmv3: Pre-training for document ai with unified text and image masking}, + author={Huang, Yupan and Lv, Tengchao and Cui, Lei and Lu, Yutong and Wei, Furu}, + booktitle={Proceedings of the 30th ACM International Conference on Multimedia}, + pages={4083--4091}, + year={2022} +} +``` + +## Checklist + +Here is a checklist illustrating a usual development workflow of a successful project, and also serves as an overview of this project's progress. + +> The PIC (person in charge) or contributors of this project should check all the items that they believe have been finished, which will further be verified by codebase maintainers via a PR. +> +> OpenMMLab's maintainer will review the code to ensure the project's quality. Reaching the first milestone means that this project suffices the minimum requirement of being merged into 'projects/'. But this project is only eligible to become a part of the core package upon attaining the last milestone. +> +> Note that keeping this section up-to-date is crucial not only for this project's developers but the entire community, since there might be some other contributors joining this project and deciding their starting point from this list. It also helps maintainers accurately estimate time and effort on further code polishing, if needed. +> +> A project does not necessarily have to be finished in a single PR, but it's essential for the project to at least reach the first milestone in its very first PR. + +- [ ] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + + > The code's design shall follow existing interfaces and convention. For example, each model component should be registered into `mmocr.registry.MODELS` and configurable via a config file. + + - [ ] Basic docstrings & proper citation + + > Each major object should contain a docstring, describing its functionality and arguments. 
If you have adapted the code from other open-source projects, don't forget to cite the source project in docstring and make sure your behavior is not against its license. Typically, we do not accept any code snippet under GPL license. [A Short Guide to Open Source Licenses](https://medium.com/nationwide-technology/a-short-guide-to-open-source-licenses-cf5b1c329edd) + + - [ ] Test-time correctness + + > If you are reproducing the result from a paper, make sure your model's inference-time performance matches that in the original paper. The weights usually could be obtained by simply renaming the keys in the official pre-trained weights. This test could be skipped though, if you are able to prove the training-time correctness and check the second milestone. + + - [ ] A full README + + > As this template does. + +- [ ] Milestone 2: Indicates a successful model implementation. + + - [ ] Training-time correctness + + > If you are reproducing the result from a paper, checking this item means that you should have trained your model from scratch based on the original paper's specification and verified that the final result matches the report within a minor error range. + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + + > Ideally *all* the methods should have [type hints](https://www.pythontutorial.net/python-basics/python-type-hints/) and [docstrings](https://google.github.io/styleguide/pyguide.html#381-docstrings). [Example](https://github.com/open-mmlab/mmocr/blob/76637a290507f151215d299707c57cea5120976e/mmocr/utils/polygon_utils.py#L80-L96) + + - [ ] Unit tests + + > Unit tests for each module are required. [Example](https://github.com/open-mmlab/mmocr/blob/76637a290507f151215d299707c57cea5120976e/tests/test_utils/test_polygon_utils.py#L97-L106) + + - [ ] Code polishing + + > Refactor your code according to reviewer's comment. + + - [ ] Metafile.yml + + > It will be parsed by MIM and Inferencer. [Example](https://github.com/open-mmlab/mmocr/blob/1.x/configs/textdet/dbnet/metafile.yml) + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + + > In particular, you may have to refactor this README into a standard one. [Example](/configs/textdet/dbnet/README.md) + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. 
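Relatedly, after running the dataset preparation step from the Usage section, the generated SER annotation file can be sanity-checked directly. A minimal sketch, assuming the default `data/xfund/zh` location used by the dataset configs in this PR:

```python
import json

# ser_train.json is produced by the SER parser/packer added in this PR.
with open('data/xfund/zh/ser_train.json', 'r', encoding='utf-8') as f:
    anno = json.load(f)

print(list(anno.keys()))                 # ['metainfo', 'data_list']
sample = anno['data_list'][0]
print(sample['img_path'], sample['height'], sample['width'])
print(list(sample['instances'].keys()))  # texts, boxes, labels (and words, if annotated)
```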
diff --git a/projects/LayoutLMv3/configs/_base_/datasets/xfund_zh.py b/projects/LayoutLMv3/configs/_base_/datasets/xfund_zh.py new file mode 100644 index 000000000..e790a7bf6 --- /dev/null +++ b/projects/LayoutLMv3/configs/_base_/datasets/xfund_zh.py @@ -0,0 +1,14 @@ +xfund_zh_ser_data_root = 'data/xfund/zh' + +xfund_zh_ser_train = dict( + type='XFUNDDataset', + data_root=xfund_zh_ser_data_root, + ann_file='ser_train.json', + pipeline=None) + +xfund_zh_ser_test = dict( + type='XFUNDDataset', + data_root=xfund_zh_ser_data_root, + ann_file='ser_test.json', + test_mode=True, + pipeline=None) diff --git a/projects/LayoutLMv3/configs/_base_/default_runtime.py b/projects/LayoutLMv3/configs/_base_/default_runtime.py new file mode 100644 index 000000000..d080a0015 --- /dev/null +++ b/projects/LayoutLMv3/configs/_base_/default_runtime.py @@ -0,0 +1,39 @@ +default_scope = 'mmocr' +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl'), +) +randomness = dict(seed=None) + +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=5, log_metric_by_epoch=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=20, by_epoch=False), + sampler_seed=dict(type='DistSamplerSeedHook'), + sync_buffer=dict(type='SyncBuffersHook'), + visualization=dict( + type='VisualizationHook', + interval=1, + enable=False, + show=False, + draw_gt=False, + draw_pred=False), +) + +# Logging +log_level = 'INFO' +log_processor = dict(type='LogProcessor', window_size=10, by_epoch=False) + +load_from = None +resume = False + +# Evaluation +val_evaluator = dict(type='SeqevalMetric') +test_evaluator = val_evaluator + +# Visualization +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='SERLocalVisualizer', name='visualizer', vis_backends=vis_backends) diff --git a/projects/LayoutLMv3/configs/_base_/schedules/schedule_adamw_1k.py b/projects/LayoutLMv3/configs/_base_/schedules/schedule_adamw_1k.py new file mode 100644 index 000000000..33bc695c4 --- /dev/null +++ b/projects/LayoutLMv3/configs/_base_/schedules/schedule_adamw_1k.py @@ -0,0 +1,11 @@ +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=7e-5, weight_decay=0.01)) +train_cfg = dict(type='IterBasedTrainLoop', max_iters=1000, val_interval=100) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +# learning policy +param_scheduler = [ + dict(type='OneCycleLR', eta_max=7e-5, by_epoch=False, total_steps=1000), +] diff --git a/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1xbs8.py b/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1xbs8.py new file mode 100644 index 000000000..a4f9fcf0a --- /dev/null +++ b/projects/LayoutLMv3/configs/ser/layoutlmv3_1k_xfund_zh_1xbs8.py @@ -0,0 +1,148 @@ +_base_ = [ + '../_base_/datasets/xfund_zh.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_adamw_1k.py' +] + +# ================== Frequently modified parameters ================== +hf_pretrained_model = 'data/layoutlmv3-base-chinese' +dataset_name = 'xfund_zh' +class_name = ('answer', 'header', 'question', 'other') +max_iters = 1000 +val_interval = 100 +lr = 7e-5 +train_batch_size_per_gpu = 2 +train_num_workers = 8 +test_batch_size_per_gpu = 1 # can't batch infer now +test_num_workers = 8 +only_label_first_subword = True # select label process strategy +# 
==================================================================== +# =========================== schedule =============================== +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=max_iters, val_interval=val_interval) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=lr, weight_decay=0.01)) +param_scheduler = [ + dict( + type='OneCycleLR', + eta_max=lr, + by_epoch=False, + total_steps=max_iters, + three_phase=True, + final_div_factor=4), +] +# ==================================================================== +# =========================== Dataset ================================ +train_dataset = _base_.xfund_zh_ser_train +test_dataset = _base_.xfund_zh_ser_test +train_pipeline = [ + dict(type='LoadImageFromFile', color_type='color'), + dict( + type='LoadProcessorFromPretrainedModel', + pretrained_model_name_or_path=hf_pretrained_model, + image_processor=dict(size=(224, 224), apply_ocr=False)), + dict(type='ProcessImageForLayoutLMv3'), + dict( + type='ProcessTokenForLayoutLMv3', + padding='max_length', + max_length=512, + truncation=True), + dict( + type='ConvertBIOLabelForSER', + classes=class_name, + only_label_first_subword=only_label_first_subword), + dict( + type='PackSERInputs', + meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor')) +] +test_pipeline = [ + dict(type='LoadImageFromFile', color_type='color'), + dict( + type='LoadProcessorFromPretrainedModel', + pretrained_model_name_or_path=hf_pretrained_model, + image_processor=dict(size=(224, 224), apply_ocr=False)), + dict(type='ProcessImageForLayoutLMv3'), + dict( + type='ProcessTokenForLayoutLMv3', + padding='max_length', + max_length=512, + truncation=True), + dict( + type='ConvertBIOLabelForSER', + classes=class_name, + only_label_first_subword=only_label_first_subword), + dict( + type='PackSERInputs', + meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor', + 'truncation_word_ids', 'instances')) +] +train_dataset.pipeline = train_pipeline +test_dataset.pipeline = test_pipeline +# ==================================================================== +# ========================= Dataloader =============================== +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + pin_memory=True, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + collate_fn=dict(type='ser_collate', training=True), + dataset=train_dataset) +val_dataloader = dict( + batch_size=test_batch_size_per_gpu, + num_workers=test_num_workers, + pin_memory=True, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + collate_fn=dict(type='ser_collate', training=False), + dataset=test_dataset) +test_dataloader = val_dataloader +# ==================================================================== +# ============================ Model ================================= +model = dict( + type='HFLayoutLMv3ForTokenClassificationWrapper', + layoutlmv3_token_classifier=dict( + pretrained_model_name_or_path=hf_pretrained_model, + num_labels=len(class_name) * 2 - 1), + loss_processor=dict(type='ComputeLossAfterLabelSmooth'), + postprocessor=dict( + type='SERPostprocessor', + classes=class_name, + only_label_first_subword=only_label_first_subword)) +# ==================================================================== +# ========================= Evaluation =============================== +val_evaluator = dict(type='SeqevalMetric', 
prefix=dataset_name) +test_evaluator = val_evaluator +# ==================================================================== +# ======================= Visualization ============================== +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='SERLocalVisualizer', name='visualizer', vis_backends=vis_backends) +# ==================================================================== +# ============================= Hook ================================= +default_hooks = dict( + logger=dict(type='LoggerHook', interval=10), + checkpoint=dict( + type='CheckpointHook', + interval=500, + save_best=f'{dataset_name}/f1', + rule='greater'), + visualization=dict( + type='VisualizationHook', + interval=10, + enable=True, + show=False, + draw_gt=True, + draw_pred=True), +) +# ==================================================================== +# ========================= Custom imports =========================== +custom_imports = dict( + imports=[ + 'projects.LayoutLMv3.datasets', 'projects.LayoutLMv3.evaluation', + 'projects.LayoutLMv3.models', 'projects.LayoutLMv3.visualization' + ], + allow_failed_imports=False) +# ==================================================================== diff --git a/projects/LayoutLMv3/datasets/__init__.py b/projects/LayoutLMv3/datasets/__init__.py new file mode 100644 index 000000000..b218a79fc --- /dev/null +++ b/projects/LayoutLMv3/datasets/__init__.py @@ -0,0 +1,5 @@ +from .transforms import * # NOQA +from .utils import ser_collate +from .xfund_dataset import XFUNDDataset + +__all__ = ['XFUNDDataset', 'ser_collate'] diff --git a/projects/LayoutLMv3/datasets/transforms/__init__.py b/projects/LayoutLMv3/datasets/transforms/__init__.py new file mode 100644 index 000000000..c837267e0 --- /dev/null +++ b/projects/LayoutLMv3/datasets/transforms/__init__.py @@ -0,0 +1,10 @@ +from .formatting import PackSERInputs +from .layoutlmv3_transforms import (ConvertBIOLabelForSER, + LoadProcessorFromPretrainedModel, + ProcessImageForLayoutLMv3, + ProcessTokenForLayoutLMv3) + +__all__ = [ + 'LoadProcessorFromPretrainedModel', 'ProcessImageForLayoutLMv3', + 'ProcessTokenForLayoutLMv3', 'ConvertBIOLabelForSER', 'PackSERInputs' +] diff --git a/projects/LayoutLMv3/datasets/transforms/formatting.py b/projects/LayoutLMv3/datasets/transforms/formatting.py new file mode 100644 index 000000000..7dd4911f6 --- /dev/null +++ b/projects/LayoutLMv3/datasets/transforms/formatting.py @@ -0,0 +1,127 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from mmcv.transforms import to_tensor +from mmcv.transforms.base import BaseTransform +from mmengine.structures import LabelData + +from mmocr.registry import TRANSFORMS +from projects.LayoutLMv3.structures import SERDataSample + + +@TRANSFORMS.register_module() +class PackSERInputs(BaseTransform): + """Pack the inputs data for LayoutLMv3ForTokenClassification model. + + The type of outputs is `dict`: + + - inputs: Data for model forwarding. Five components will be included: + + - input_ids, whose shape is (truncation_number, 512). + - bbox, whose shape is (truncation_number, 512, 4). + - attention_mask, whose shape is (truncation_number, 512). + - pixel_values, whose shape is (truncation_number, 3, 224, 224). + - labels, whose shape is (truncation_number, 512). 
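To make the `num_labels=len(class_name) * 2 - 1` setting in the model section of the config above concrete, the sketch below reproduces the BIO label space that `ConvertBIOLabelForSER._generate_biolabel2id_map` (shown further below) builds for the four XFUND classes: the 'other' class collapses to a single `O` tag, and every remaining class gets a `B-`/`I-` pair:

```python
classes = ('answer', 'header', 'question', 'other')

# 'other' -> 'O'; every other class -> 'B-<cls>' and 'I-<cls>'.
bio_labels = ['O']
for c in sorted(classes):
    if c != 'other':
        bio_labels += [f'B-{c}', f'I-{c}']

print(bio_labels)
# ['O', 'B-answer', 'I-answer', 'B-header', 'I-header', 'B-question', 'I-question']
assert len(bio_labels) == len(classes) * 2 - 1 == 7
```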
+ + - data_samples: Two components of ``SERDataSample`` will be updated: + + - gt_instances (InstanceData): Depending on annotations, a subset of the + following keys will be updated: + + - bboxes (torch.Tensor((N, 4), dtype=torch.float32)): The groundtruth + of bounding boxes in the form of [x1, y1, x2, y2]. Renamed from + 'gt_bboxes'. + - labels (torch.LongTensor(N)): The labels of instances. + Renamed from 'gt_bboxes_labels'. + - texts (list[str]): The groundtruth texts. Renamed from 'gt_texts'. + + - metainfo (dict): 'metainfo' is always populated. The contents of the + 'metainfo' depends on ``meta_keys``. By default it includes: + + - "img_path": Path to the image file. + - "img_shape": Shape of the image input to the network as a tuple + (h, w). Note that the image may be zero-padded afterward on the + bottom/right if the batch tensor is larger than this shape. + - "scale_factor": A tuple indicating the ratio of width and height + of the preprocessed image to the original one. + - "ori_shape": Shape of the preprocessed image as a tuple (h, w). + + Args: + meta_keys (Sequence[str], optional): Meta keys to be converted to + the metainfo of ``SERDataSample``. Defaults to ``('img_path', + 'ori_shape', 'img_shape', 'scale_factor')``. + """ + # HF LayoutLMv3ForTokenClassification model input params. + ser_keys = [ + 'input_ids', 'bbox', 'attention_mask', 'pixel_values', 'labels' + ] + + def __init__(self, meta_keys=()): + self.meta_keys = meta_keys + + def transform(self, results: dict) -> dict: + """Method to pack SER input data. + + Args: + results (dict): Result dict from the data pipeline. + + Returns: + dict: + + - 'inputs' (obj:`dict`): Data for model forwarding. + - 'data_samples' (obj:`SERDataSample`): The annotation info of the + sample. + """ + + packed_results = dict() + truncation_number = results['truncation_number'] + + if 'pixel_values' in results: + img = results['pixel_values'] + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + # A simple trick to speedup formatting by 3-5 times when + # OMP_NUM_THREADS != 1 + # Refer to https://github.com/open-mmlab/mmdetection/pull/9533 + # for more details + if img.flags.c_contiguous: + img = to_tensor(img) + img = img.permute(2, 0, 1).contiguous() + else: + img = np.ascontiguousarray(img.transpose(2, 0, 1)) + img = to_tensor(img) + results['pixel_values'] = torch.cat( + [img.unsqueeze(0)] * truncation_number, dim=0) + + # pack `inputs` + inputs = {} + for key in self.ser_keys: + if key not in results: + continue + inputs[key] = to_tensor(results[key]) + packed_results['inputs'] = inputs + + # pack `data_samples` + data_samples = [] + for truncation_idx in range(truncation_number): + data_sample = SERDataSample() + gt_label = LabelData() + if results.get('labels', None): + gt_label.item = to_tensor(results['labels'][truncation_idx]) + data_sample.gt_label = gt_label + meta = {} + for key in self.meta_keys: + if key == 'truncation_word_ids': + meta[key] = results[key][truncation_idx] + else: + meta[key] = results[key] + data_sample.set_metainfo(meta) + data_samples.append(data_sample) + packed_results['data_samples'] = data_samples + + return packed_results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(meta_keys={self.meta_keys})' + return repr_str diff --git a/projects/LayoutLMv3/datasets/transforms/layoutlmv3_transforms.py b/projects/LayoutLMv3/datasets/transforms/layoutlmv3_transforms.py new file mode 100644 index 000000000..ed545d5f6 --- /dev/null +++ 
b/projects/LayoutLMv3/datasets/transforms/layoutlmv3_transforms.py @@ -0,0 +1,279 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Union + +from mmcv.transforms.base import BaseTransform +from transformers import LayoutLMv3ImageProcessor, LayoutXLMTokenizerFast +from transformers.file_utils import PaddingStrategy +from transformers.image_processing_utils import BatchFeature +from transformers.image_utils import ChannelDimension +from transformers.tokenization_utils_base import (BatchEncoding, + TruncationStrategy) + +from mmocr.registry import TRANSFORMS +from projects.LayoutLMv3.utils.bio_label_utils import \ + find_other_label_name_of_biolabel + + +@TRANSFORMS.register_module() +class LoadProcessorFromPretrainedModel(BaseTransform): + """A transform to load image_processor/text_tokenizer from pretrained + model, which will use HuggingFace `LayoutLMv3ImageProcessor` and + `LayoutXLMTokenizerFast` + + Added Keys: + + - image_processor + - tokeinzer + + Args: + pretrained_model_name_or_path (str): The name or path of huggingface + pretrained model, which must be specified. + image_processor (dict): The specific parameters for image_processor. + tokenizer (dict): The specific parameters for tokenizer. + """ + + def __init__( + self, + pretrained_model_name_or_path: str, + image_processor: dict = dict(), + tokenizer: dict = dict() + ) -> None: + super().__init__() + assert pretrained_model_name_or_path != '' + self.image_processor = LayoutLMv3ImageProcessor.from_pretrained( + pretrained_model_name_or_path=pretrained_model_name_or_path, + **image_processor) + # TODO: support apply_ocr + if self.image_processor.apply_ocr: + raise ValueError( + 'Now only support initialized the image processor ' + 'with apply_ocr set to False.') + + # https://huggingface.co/microsoft/layoutlmv3-base-chinese/discussions/3 + # use LayoutXLMTokenizerFast instead of LayoutLMv3TokenizerFast + self.tokenizer = LayoutXLMTokenizerFast.from_pretrained( + pretrained_model_name_or_path=pretrained_model_name_or_path, + **tokenizer) + + def transform(self, results: dict) -> Dict: + results['image_processor'] = self.image_processor + results['tokenizer'] = self.tokenizer + return results + + +@TRANSFORMS.register_module() +class ProcessImageForLayoutLMv3(BaseTransform): + """A transform to process image for LayoutLMv3. 
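As a standalone illustration of what `LoadProcessorFromPretrainedModel` above does, the sketch below loads the two HuggingFace components directly. The local path is the default `hf_pretrained_model` location from the config; `apply_ocr=False` because XFUND already provides words and boxes:

```python
from transformers import LayoutLMv3ImageProcessor, LayoutXLMTokenizerFast

pretrained = 'data/layoutlmv3-base-chinese'  # downloaded from the HuggingFace hub

image_processor = LayoutLMv3ImageProcessor.from_pretrained(
    pretrained, apply_ocr=False)
# LayoutXLMTokenizerFast (not LayoutLMv3TokenizerFast) is used for the Chinese
# checkpoint, following the discussion linked in the transform above.
tokenizer = LayoutXLMTokenizerFast.from_pretrained(pretrained)

print(type(image_processor).__name__, type(tokenizer).__name__)
```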
+ + Required Keys: + + - img + - img_shape + - image_processor + + Modified Keys: + + - img_shape + + Added Keys: + + - scale_factor + - pixel_values + """ + + def __init__(self) -> None: + super().__init__() + + def _resize_rescale_norm(self, results: dict) -> None: + """apply the image_processor to img.""" + img = results['img'] + h, w = results['img_shape'] + + image_processor = results['image_processor'] + features: BatchFeature = image_processor( + images=img, return_tensors='np', data_format=ChannelDimension.LAST) + + # output default dims NHWC and here N=1 + pixel_values = features['pixel_values'][0] + new_h, new_w = pixel_values.shape[:2] + w_scale = new_w / w + h_scale = new_h / h + results['pixel_values'] = pixel_values + results['img_shape'] = (new_h, new_w) + results['scale_factor'] = (w_scale, h_scale) + + def transform(self, results: dict) -> Dict: + self._resize_rescale_norm(results) + return results + + +@TRANSFORMS.register_module() +class ProcessTokenForLayoutLMv3(BaseTransform): + """A transform to process texts for LayoutLMv3, + + Required Keys: + + - tokenizer + - width + - height + - instances + - texts + - boxes + + Added Keys: + + - input_ids + - attention_mask + - bbox + - truncation_number + - truncation_word_ids + + Args: + Refer to the parameters of the corresponding tokenizer + """ + + def __init__(self, + padding: Union[bool, str, PaddingStrategy] = False, + max_length: Optional[int] = None, + truncation: Union[bool, str, TruncationStrategy] = None, + pad_to_multiple_of: Optional[int] = None) -> None: + super().__init__() + self.padding = padding + self.max_length = max_length + self.truncation = truncation + self.pad_to_multiple_of = pad_to_multiple_of + + def box_norm(self, box, width, height) -> List: + + def clip(min_num, num, max_num): + return min(max(num, min_num), max_num) + + x0, y0, x1, y1 = box + x0 = clip(0, int((x0 / width) * 1000), 1000) + y0 = clip(0, int((y0 / height) * 1000), 1000) + x1 = clip(0, int((x1 / width) * 1000), 1000) + y1 = clip(0, int((y1 / height) * 1000), 1000) + assert x1 >= x0 + assert y1 >= y0 + return [x0, y0, x1, y1] + + def _tokenize(self, results: dict) -> None: + tokenizer = results['tokenizer'] + + instances = results['instances'] + texts = instances['texts'] + boxes = instances['boxes'] + + # norm boxes + width = results['width'] + height = results['height'] + norm_boxes = [self.box_norm(box, width, height) for box in boxes] + + tokenized_inputs: BatchEncoding = tokenizer( + text=texts, + boxes=norm_boxes, + padding=self.padding, + max_length=self.max_length, + truncation=self.truncation, + pad_to_multiple_of=self.pad_to_multiple_of, + add_special_tokens=True, + return_tensors='np', + return_attention_mask=True, + return_overflowing_tokens=True) + + truncation_number = tokenized_inputs['input_ids'].shape[0] + results['truncation_number'] = truncation_number + # record input_ids/attention_mask/bbox + for k in ['input_ids', 'attention_mask', 'bbox']: + results[k] = tokenized_inputs[k] + # record truncation_word_ids + results['truncation_word_ids'] = [ + tokenized_inputs.encodings[batch_index].word_ids + for batch_index in range(truncation_number) + ] + + def transform(self, results: dict) -> Dict: + self._tokenize(results) + return results + + +@TRANSFORMS.register_module() +class ConvertBIOLabelForSER(BaseTransform): + """A transform to convert BIO format labels for SER task, + + Required Keys: + + - tokenizer + - truncation_word_ids + - instances + - labels + + Added Keys: + + - labels + + Args: + classes (Union[tuple, 
list]): dataset classes + only_label_first_subword (bool): Whether or not to only label + the first subword, in case word labels are provided. + """ + + def __init__(self, + classes: Union[tuple, list], + only_label_first_subword: bool = True) -> None: + super().__init__() + self.other_label_name = find_other_label_name_of_biolabel(classes) + self.biolabel2id = self._generate_biolabel2id_map(classes) + assert only_label_first_subword is True, \ + 'Only support `only_label_first_subword=True` now.' + self.only_label_first_subword = only_label_first_subword + + def _generate_biolabel2id_map(self, classes: Union[tuple, list]) -> Dict: + bio_label_list = [] + for c in sorted(classes): + if c == self.other_label_name: + bio_label_list.insert(0, 'O') + else: + bio_label_list.append(f'B-{c}') + bio_label_list.append(f'I-{c}') + biolabel2id_map = { + bio_label: idx + for idx, bio_label in enumerate(bio_label_list) + } + return biolabel2id_map + + def _convert(self, results: dict) -> None: + tokenizer = results['tokenizer'] + + instances = results['instances'] + labels = [label for label in instances['labels']] + + batch_biolabel_ids = [] + for truncation_word_ids in results['truncation_word_ids']: + biolabel_ids = [] + pre_word_id = None + for cur_word_id in truncation_word_ids: + if cur_word_id is not None: + if cur_word_id != pre_word_id: + biolabel_name = f'B-{labels[cur_word_id]}' \ + if labels[cur_word_id] != \ + self.other_label_name else 'O' + elif self.only_label_first_subword: + biolabel_name = 'O' + else: + biolabel_name = f'I-{labels[cur_word_id]}' \ + if labels[cur_word_id] != \ + self.other_label_name else 'O' + # convert biolabel to id + biolabel_ids.append(self.biolabel2id[biolabel_name]) + else: + biolabel_ids.append(tokenizer.pad_token_label) + pre_word_id = cur_word_id + batch_biolabel_ids.append(biolabel_ids) + + # record batch_biolabel_ids + results['labels'] = batch_biolabel_ids + + def transform(self, results: dict) -> Dict: + self._convert(results) + return results diff --git a/projects/LayoutLMv3/datasets/utils.py b/projects/LayoutLMv3/datasets/utils.py new file mode 100644 index 000000000..9526a1994 --- /dev/null +++ b/projects/LayoutLMv3/datasets/utils.py @@ -0,0 +1,64 @@ +from typing import Dict, Sequence + +import torch +from mmengine.dataset.utils import COLLATE_FUNCTIONS + + +@COLLATE_FUNCTIONS.register_module() +def ser_collate(data_batch: Sequence, training: bool = True) -> Dict: + """A collate function designed for SER. + + Args: + data_batch (Sequence): Data sampled from dataset. + Like: + [ + { + 'inputs': {'input_ids': ..., 'bbox': ..., ...}, + 'data_samples': ['SERDataSample_1'] + }, + { + 'inputs': {'input_ids': ..., 'bbox': ..., ...}, + 'data_samples': ['SERDataSample_1', 'SERDataSample_2', ...] + }, + ... + ] + training (bool): whether training process or not. + + Note: + Different from ``default_collate`` in pytorch or in mmengine, + ``ser_collate`` can accept `inputs` tensor and `data_samples` + list with the different shape. + + Returns: + transposed (Dict): A dict have two elements, + the first element `inputs` is a dict + the second element `data_samples` is a list + """ + batch_size = len(data_batch) + # transpose `inputs`, which is a dict. 
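As a worked example of the `box_norm` helper defined earlier in `ProcessTokenForLayoutLMv3`: normalizing the first box from the XFUND sample annotation (page size 2480x3508) into the 0-1000 coordinate range that LayoutLMv3 expects:

```python
def box_norm(box, width, height):
    """Scale pixel coordinates to the 0-1000 range and clip, mirroring
    ProcessTokenForLayoutLMv3.box_norm."""
    def clip(min_num, num, max_num):
        return min(max(num, min_num), max_num)

    x0, y0, x1, y1 = box
    return [clip(0, int(x0 / width * 1000), 1000),
            clip(0, int(y0 / height * 1000), 1000),
            clip(0, int(x1 / width * 1000), 1000),
            clip(0, int(y1 / height * 1000), 1000)]

print(box_norm([906, 195, 1478, 259], width=2480, height=3508))
# [365, 55, 595, 73]
```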
+ batch_inputs = [data_item['inputs'] for data_item in data_batch] + batch_inputs_item = batch_inputs[0] + transposed_batch_inputs = {} + for key in batch_inputs_item: + concat_value = torch.concat([d[key] for d in batch_inputs], dim=0) + # TODO: because long text will be truncated, the concat_value + # cannot be sliced directly when training=False. + # How to support batch inference? + transposed_batch_inputs[key] = concat_value[:batch_size] \ + if training else concat_value + # transpose `data_samples`, which is a list. + batch_data_samples = [ + data_item['data_samples'] for data_item in data_batch + ] + flattened = [sub_item for item in batch_data_samples for sub_item in item] + # TODO: because long text will be truncated, the concat_value + # cannot be sliced directly when training=False. + # How to support batch inference? + transposed_batch_data_samples = flattened[:batch_size] \ + if training else flattened + + transposed = { + 'inputs': transposed_batch_inputs, + 'data_samples': transposed_batch_data_samples + } + return transposed diff --git a/projects/LayoutLMv3/datasets/xfund_dataset.py b/projects/LayoutLMv3/datasets/xfund_dataset.py new file mode 100644 index 000000000..5a7fda46b --- /dev/null +++ b/projects/LayoutLMv3/datasets/xfund_dataset.py @@ -0,0 +1,57 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.dataset import BaseDataset + +from mmocr.registry import DATASETS + + +@DATASETS.register_module() +class XFUNDDataset(BaseDataset): + """XFUND Dataset for Semantic Entity Recognition and Relation Extraction + task. + + The annotation format is shown as follows. + + .. code-block:: none + + { + "metainfo":{}, + "data_list": + [ + { + "img_path": "data/xfund/zh/imgs/train/zh_train_0.jpg", + "height": 3508, + "width": 2480, + "instances": + { + "texts": ["绩效目标申报表(一级项目)", "项目名称", ...], + "boxes": [[906,195,1478,259], + [357,325,467,357], ...], + "labels": ["header", "question", ...], + "linkings": [[0, 1], [2, 3], ...], (RE task will have) + "ids": [0, 1, ...], (RE task will have) + "words": [[{ + "box": [ + 904, + 192, + 942, + 253 + ], + "text": "绩" + }, + { + "box": [ + 953, + 192, + 987, + 253 + ], + "text": "效" + }, ...], ...] 
+ } + }, + ] + } + + Args: + The same as OCRDataset + """ diff --git a/projects/LayoutLMv3/evaluation/__init__.py b/projects/LayoutLMv3/evaluation/__init__.py new file mode 100644 index 000000000..e9f2df5e3 --- /dev/null +++ b/projects/LayoutLMv3/evaluation/__init__.py @@ -0,0 +1 @@ +from .metrics import * # NOQA diff --git a/projects/LayoutLMv3/evaluation/metrics/__init__.py b/projects/LayoutLMv3/evaluation/metrics/__init__.py new file mode 100644 index 000000000..d3d029f46 --- /dev/null +++ b/projects/LayoutLMv3/evaluation/metrics/__init__.py @@ -0,0 +1,3 @@ +from .seqeval_metric import SeqevalMetric + +__all__ = ['SeqevalMetric'] diff --git a/projects/LayoutLMv3/evaluation/metrics/seqeval_metric.py b/projects/LayoutLMv3/evaluation/metrics/seqeval_metric.py new file mode 100644 index 000000000..7ee774266 --- /dev/null +++ b/projects/LayoutLMv3/evaluation/metrics/seqeval_metric.py @@ -0,0 +1,41 @@ +from typing import Any, Optional, Sequence + +from mmengine.evaluator import BaseMetric +from seqeval.metrics import (accuracy_score, f1_score, precision_score, + recall_score) + +from mmocr.registry import METRICS + + +@METRICS.register_module() +class SeqevalMetric(BaseMetric): + + default_prefix: Optional[str] = 'ser' + + def __init__(self, + collect_device: str = 'cpu', + prefix: Optional[str] = None) -> None: + super().__init__(collect_device, prefix) + + def process(self, data_batch: Any, data_samples: Sequence[dict]) -> None: + for data_sample in data_samples: + pred_labels = data_sample.get('pred_label').get('item') + gt_labels = data_sample.get('gt_label').get('item') + + result = dict(pred_labels=pred_labels, gt_labels=gt_labels) + self.results.append(result) + + def compute_metrics(self, results: list) -> dict: + preds = [] + gts = [] + for result in results: + preds.append(result['pred_labels']) + gts.append(result['gt_labels']) + + result = { + 'precision': precision_score(gts, preds), + 'recall': recall_score(gts, preds), + 'f1': f1_score(gts, preds), + 'accuracy': accuracy_score(gts, preds) + } + return result diff --git a/projects/LayoutLMv3/models/__init__.py b/projects/LayoutLMv3/models/__init__.py new file mode 100644 index 000000000..e84b0b2ac --- /dev/null +++ b/projects/LayoutLMv3/models/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .hf_layoutlmv3_wrapper import HFLayoutLMv3ForTokenClassificationWrapper +from .loss_processor import ComputeLossAfterLabelSmooth +from .ser_postprocessor import SERPostprocessor + +__all__ = [ + 'HFLayoutLMv3ForTokenClassificationWrapper', 'SERPostprocessor', + 'ComputeLossAfterLabelSmooth' +] diff --git a/projects/LayoutLMv3/models/hf_layoutlmv3_wrapper.py b/projects/LayoutLMv3/models/hf_layoutlmv3_wrapper.py new file mode 100644 index 000000000..dcb786602 --- /dev/null +++ b/projects/LayoutLMv3/models/hf_layoutlmv3_wrapper.py @@ -0,0 +1,153 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
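For a quick sense of what the seqeval-based metric above reports, here is a tiny hypothetical example. seqeval scores entities (contiguous B-/I- spans) rather than individual tokens, so a completely missed 'answer' span lowers recall but not precision:

```python
from seqeval.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)

gts = [['B-question', 'I-question', 'O', 'B-answer', 'I-answer']]
preds = [['B-question', 'I-question', 'O', 'O', 'O']]  # 'answer' span missed

print(precision_score(gts, preds))  # 1.0   (the only predicted entity is correct)
print(recall_score(gts, preds))     # 0.5   (1 of 2 ground-truth entities found)
print(f1_score(gts, preds))         # ~0.667
print(accuracy_score(gts, preds))   # 0.6   (token-level accuracy, 3 of 5 tokens)
```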
+from typing import Dict, Optional, Tuple, Union + +import torch +from mmengine.model import BaseModel +from transformers import LayoutLMv3ForTokenClassification +from transformers.modeling_outputs import TokenClassifierOutput + +from mmocr.registry import MODELS +from projects.LayoutLMv3.utils.typing_utils import (OptSERSampleList, + SERSampleList) + +ForwardResults = Union[Dict[str, torch.Tensor], SERSampleList, + Tuple[torch.Tensor], torch.Tensor] + + +@MODELS.register_module() +class HFLayoutLMv3ForTokenClassificationWrapper(BaseModel): + + def __init__(self, + layoutlmv3_token_classifier: dict = dict( + pretrained_model_name_or_path=None), + loss_processor: Optional[Dict] = None, + data_preprocessor: Optional[Dict] = None, + postprocessor: Optional[Dict] = None, + init_cfg: Optional[Dict] = None): + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + if isinstance(layoutlmv3_token_classifier, dict) and \ + layoutlmv3_token_classifier.get( + 'pretrained_model_name_or_path', None): + self.model = LayoutLMv3ForTokenClassification.from_pretrained( + **layoutlmv3_token_classifier) + else: + raise TypeError( + 'layoutlmv3_token_classifier cfg should be a `dict` and a key ' + '`pretrained_model_name_or_path` must be specified') + + if loss_processor is not None: + assert isinstance(loss_processor, dict) + self.loss_processor = MODELS.build(loss_processor) + + if postprocessor is not None: + assert isinstance(postprocessor, dict) + self.postprocessor = MODELS.build(postprocessor) + + def forward(self, + inputs: torch.Tensor, + data_samples: OptSERSampleList = None, + mode: str = 'tensor') -> ForwardResults: + """The unified entry for a forward process in both training and test. + + The method should accept three modes: "tensor", "predict" and "loss": + + - "tensor": Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - "predict": Forward and return the predictions, which are fully + processed to a list of :obj:`SERDataSample`. + - "loss": Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle either back propagation or + parameter update, which are supposed to be done in :meth:`train_step`. + + Args: + inputs (torch.Tensor): The input tensor with shape + (N, C, ...) in general. + data_samples (list[:obj:`SERDataSample`], optional): A batch of + data samples that contain annotations and predictions. + Defaults to None. + mode (str): Return what kind of value. Defaults to 'tensor'. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of :obj:`SERDataSample`. + - If ``mode="loss"``, return a dict of tensor. + """ + if mode == 'loss': + return self.loss(inputs, data_samples) + elif mode == 'predict': + return self.predict(inputs, data_samples) + elif mode == 'tensor': + return self._forward(inputs, data_samples) + else: + raise RuntimeError(f'Invalid mode "{mode}". ' + 'Only supports loss, predict and tensor mode') + + def loss(self, inputs: torch.Tensor, data_samples: SERSampleList) -> Dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + inputs (torch.Tensor): Input images of shape (N, C, H, W). + Typically these should be mean centered and std scaled. + data_samples (list[SERDataSample]): A list of N + datasamples, containing meta information and gold annotations + for each of the images. 
+ Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + labels = inputs.pop('labels') + outputs: TokenClassifierOutput = self.model(**inputs) + return self.loss_processor(outputs, labels) + + def predict(self, inputs: torch.Tensor, + data_samples: SERSampleList) -> SERSampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + inputs (torch.Tensor): Images of shape (N, C, H, W). + data_samples (list[SERDataSample]): A list of N + datasamples, containing meta information and gold annotations + for each of the images. + + Returns: + list[SERDataSample]: A list of N datasamples of prediction + results. Each DetDataSample usually contain + 'pred_instances'. And the ``pred_instances`` usually + contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - polygons (list[np.ndarray]): The length is num_instances. + Each element represents the polygon of the + instance, in (xn, yn) order. + """ + outputs: TokenClassifierOutput = self.model(**inputs) + return self.postprocessor(outputs['logits'], data_samples) + + def _forward(self, + inputs: torch.Tensor, + data_samples: OptSERSampleList = None, + **kwargs) -> torch.Tensor: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + inputs (Tensor): Inputs with shape (N, C, H, W). + data_samples (list[SERDataSample]): A list of N + datasamples, containing meta information and gold annotations + for each of the images. + + Returns: + Tensor or tuple[Tensor]: A tuple of features from ``det_head`` + forward. + """ + return self.model(**inputs) diff --git a/projects/LayoutLMv3/models/loss_processor.py b/projects/LayoutLMv3/models/loss_processor.py new file mode 100644 index 000000000..54154ffa2 --- /dev/null +++ b/projects/LayoutLMv3/models/loss_processor.py @@ -0,0 +1,19 @@ +from transformers.trainer_pt_utils import LabelSmoother + +from mmocr.registry import MODELS + + +@MODELS.register_module() +class ComputeLossAfterLabelSmooth(LabelSmoother): + """Compute loss after label-smoothing. + + Args: + epsilon (`float`, *optional*, defaults to 0.1): + The label smoothing factor. + ignore_index (`int`, *optional*, defaults to -100): + The index in the labels to ignore when computing the loss. + """ + + def __call__(self, model_output, labels, shift_labels=False): + loss = super().__call__(model_output, labels, shift_labels) + return {'loss': loss} diff --git a/projects/LayoutLMv3/models/ser_postprocessor.py b/projects/LayoutLMv3/models/ser_postprocessor.py new file mode 100644 index 000000000..d8be05433 --- /dev/null +++ b/projects/LayoutLMv3/models/ser_postprocessor.py @@ -0,0 +1,145 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
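A minimal sketch of what `ComputeLossAfterLabelSmooth` above delegates to: HuggingFace's `LabelSmoother` reads the logits from the model output and skips positions labelled with its `ignore_index` (-100 by default). The shapes and label ids below are hypothetical:

```python
import torch
from transformers.trainer_pt_utils import LabelSmoother

# Hypothetical logits for one sequence of 3 tokens over 7 BIO classes.
logits = torch.randn(1, 3, 7)
# Target label ids; -100 is LabelSmoother's default ignore_index.
labels = torch.tensor([[0, 3, -100]])

smoother = LabelSmoother(epsilon=0.1)
loss = smoother({'logits': logits}, labels)
print(loss)  # scalar tensor; ComputeLossAfterLabelSmooth returns it as {'loss': loss}
```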
+import copy +from typing import Dict, Sequence, Union + +import torch +import torch.nn as nn +from mmengine.structures import LabelData + +from mmocr.registry import MODELS +from projects.LayoutLMv3.structures import SERDataSample +from projects.LayoutLMv3.utils.bio_label_utils import \ + find_other_label_name_of_biolabel + + +@MODELS.register_module() +class SERPostprocessor(nn.Module): + """PostProcessor for SER.""" + + def __init__(self, + classes: Union[tuple, list], + only_label_first_subword: bool = True) -> None: + super().__init__() + self.other_label_name = find_other_label_name_of_biolabel(classes) + self.id2biolabel = self._generate_id2biolabel_map(classes) + assert only_label_first_subword is True, \ + 'Only support `only_label_first_subword=True` now.' + self.only_label_first_subword = only_label_first_subword + self.softmax = nn.Softmax(dim=-1) + + def _generate_id2biolabel_map(self, classes: Union[tuple, list]) -> Dict: + bio_label_list = [] + classes = sorted([c for c in classes]) + for c in classes: + if c == self.other_label_name: + bio_label_list.insert(0, 'O') + else: + bio_label_list.append(f'B-{c}') + bio_label_list.append(f'I-{c}') + id2biolabel_map = { + idx: bio_label + for idx, bio_label in enumerate(bio_label_list) + } + return id2biolabel_map + + def __call__(self, outputs: torch.Tensor, + data_samples: Sequence[SERDataSample] + ) -> Sequence[SERDataSample]: + assert all('truncation_word_ids' in d for d in data_samples), \ + 'The key `truncation_word_ids` should be specified' \ + 'in PackSERInputs.' + truncation_word_ids = [ + data_sample.pop('truncation_word_ids') + for data_sample in data_samples + ] + word_ids = [ + word_id for word_ids in truncation_word_ids + for word_id in word_ids[1:-1] + ] + + # merge several truncation data_sample to one data_sample + merged_data_sample = copy.deepcopy(data_samples[0]) + + # convert outputs dim from (truncation_num, max_length, label_num) + # to (truncation_num * max_length, label_num) + outputs = outputs.cpu().detach() + outputs = torch.reshape(outputs[:, 1:-1, :], (-1, outputs.size(-1))) + # get pred label ids/scores from outputs + probs = self.softmax(outputs) + max_value, max_idx = torch.max(probs, -1) + pred_label_ids = max_idx.numpy().tolist() + pred_label_scores = max_value.numpy().tolist() + + # inference process do not have item in gt_label, + # so select valid token with word_ids rather than + # with gt_label_ids like official code. 
+ pred_words_biolabels = [] + word_biolabels = [] + pre_word_id = None + for idx, cur_word_id in enumerate(word_ids): + if cur_word_id is not None: + if cur_word_id != pre_word_id: + if word_biolabels: + pred_words_biolabels.append(word_biolabels) + word_biolabels = [] + word_biolabels.append((self.id2biolabel[pred_label_ids[idx]], + pred_label_scores[idx])) + else: + pred_words_biolabels.append(word_biolabels) + word_biolabels = [] + break + pre_word_id = cur_word_id + if word_biolabels: + pred_words_biolabels.append(word_biolabels) + # record pred_label + if self.only_label_first_subword: + pred_label = LabelData() + pred_label.item = [ + pred_word_biolabels[0][0] + for pred_word_biolabels in pred_words_biolabels + ] + pred_label.score = [ + pred_word_biolabels[0][1] + for pred_word_biolabels in pred_words_biolabels + ] + merged_data_sample.pred_label = pred_label + else: + raise NotImplementedError( + 'The `only_label_first_subword=False` is not support yet.') + + # determine whether it is an inference process + if 'item' in data_samples[0].gt_label: + # merge gt label ids from data_samples + gt_label_ids = [ + data_sample.gt_label.item[1:-1] for data_sample in data_samples + ] + gt_label_ids = torch.cat( + gt_label_ids, dim=0).cpu().detach().numpy().tolist() + gt_words_biolabels = [] + word_biolabels = [] + pre_word_id = None + for idx, cur_word_id in enumerate(word_ids): + if cur_word_id is not None: + if cur_word_id != pre_word_id: + if word_biolabels: + gt_words_biolabels.append(word_biolabels) + word_biolabels = [] + word_biolabels.append(self.id2biolabel[gt_label_ids[idx]]) + else: + gt_words_biolabels.append(word_biolabels) + word_biolabels = [] + break + pre_word_id = cur_word_id + if word_biolabels: + gt_words_biolabels.append(word_biolabels) + # update merged gt_label + if self.only_label_first_subword: + merged_data_sample.gt_label.item = [ + gt_word_biolabels[0] + for gt_word_biolabels in gt_words_biolabels + ] + else: + raise NotImplementedError( + 'The `only_label_first_subword=False` is not support yet.') + + return [merged_data_sample] diff --git a/projects/LayoutLMv3/scripts/prepare_dataset.sh b/projects/LayoutLMv3/scripts/prepare_dataset.sh new file mode 100644 index 000000000..6c74a9e45 --- /dev/null +++ b/projects/LayoutLMv3/scripts/prepare_dataset.sh @@ -0,0 +1,18 @@ +PROJ_ROOT=$(pwd) +DATASET_ZOO_PATH=${PROJ_ROOT}/dataset_zoo +NPROC=8 +TASKS=('ser' 're') +SPLITS=('train' 'test') +# DATASET_NAME=('xfund/de' 'xfund/es' 'xfund/fr' 'xfund/jt' 'xfund/ja' 'xfund/pt' 'xfund/zh') +DATASET_NAME=('xfund/zh') + +for TASK in ${TASKS[@]} +do + python tools/dataset_converters/prepare_dataset.py \ + ${DATASET_NAME[@]} \ + --nproc ${NPROC} \ + --task ${TASK} \ + --splits ${SPLITS[@]} \ + --dataset-zoo-path ${DATASET_ZOO_PATH} \ + --overwrite-cfg +done diff --git a/projects/LayoutLMv3/structures/__init__.py b/projects/LayoutLMv3/structures/__init__.py new file mode 100644 index 000000000..729b26f57 --- /dev/null +++ b/projects/LayoutLMv3/structures/__init__.py @@ -0,0 +1,3 @@ +from .ser_data_sample import SERDataSample + +__all__ = ['SERDataSample'] diff --git a/projects/LayoutLMv3/structures/ser_data_sample.py b/projects/LayoutLMv3/structures/ser_data_sample.py new file mode 100644 index 000000000..7f2318e7b --- /dev/null +++ b/projects/LayoutLMv3/structures/ser_data_sample.py @@ -0,0 +1,82 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.structures import BaseDataElement, LabelData + + +class SERDataSample(BaseDataElement): + """A data structure interface of MMOCR for Semantic Entity Recognition. + They are used as interfaces between different components. + + The attributes in ``SERDataSample`` are divided into two parts: + + - ``gt_label``(LabelData): Ground truth label. + - ``pred_label``(LabelData): predictions label. + + Examples: + >>> import torch + >>> import numpy as np + >>> from mmengine.structures import LabelData + >>> from mmocr.data import SERDataSample + >>> # gt_label + >>> data_sample = SERDataSample() + >>> img_meta = dict(img_shape=(800, 1196, 3), + ... pad_shape=(800, 1216, 3)) + >>> gt_label = LabelData(metainfo=img_meta) + >>> gt_label.item = 'mmocr' + >>> data_sample.gt_label = gt_label + >>> assert 'img_shape' in data_sample.gt_label.metainfo_keys() + >>> print(data_sample) + + ) at 0x7f21fb1b9880> + >>> # pred_label + >>> pred_label = LabelData(metainfo=img_meta) + >>> pred_label.item = 'mmocr' + >>> data_sample = SERDataSample(pred_label=pred_label) + >>> assert 'pred_label' in data_sample + >>> data_sample = SERDataSample() + >>> gt_label_data = dict(item='mmocr') + >>> gt_label = LabelData(**gt_label_data) + >>> data_sample.gt_label = gt_label + >>> assert 'gt_label' in data_sample + >>> assert 'item' in data_sample.gt_label + """ + + @property + def gt_label(self) -> LabelData: + """LabelData: ground truth label. + """ + return self._gt_label + + @gt_label.setter + def gt_label(self, value: LabelData) -> None: + """gt_label setter.""" + self.set_field(value, '_gt_label', dtype=LabelData) + + @gt_label.deleter + def gt_label(self) -> None: + """gt_label deleter.""" + del self._gt_label + + @property + def pred_label(self) -> LabelData: + """LabelData: prediction label. + """ + return self._pred_label + + @pred_label.setter + def pred_label(self, value: LabelData) -> None: + """pred_label setter.""" + self.set_field(value, '_pred_label', dtype=LabelData) + + @pred_label.deleter + def pred_label(self) -> None: + """pred_label deleter.""" + del self._pred_label diff --git a/projects/LayoutLMv3/utils/bio_label_utils.py b/projects/LayoutLMv3/utils/bio_label_utils.py new file mode 100644 index 000000000..0e5c94e53 --- /dev/null +++ b/projects/LayoutLMv3/utils/bio_label_utils.py @@ -0,0 +1,14 @@ +from typing import List, Tuple, Union + + +def find_other_label_name_of_biolabel(classes: Union[List[str], Tuple[str]]): + """Find the original name of BIO label `O` + + Args: + classes (List[str]): The list or tuple of class_names. 
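A small usage sketch of the `SERDataSample` structure defined above (the import path follows this project's layout; the label ids are arbitrary example values):

```python
import torch
from mmengine.structures import LabelData

from projects.LayoutLMv3.structures import SERDataSample

data_sample = SERDataSample()
gt_label = LabelData()
gt_label.item = torch.tensor([0, 3, 4])  # BIO label ids for three tokens
data_sample.gt_label = gt_label

assert 'gt_label' in data_sample
print(data_sample.gt_label.item)
```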
+ """ + valid_other_label_names = ('other', 'Other', 'OTHER') + for c in classes: + if c in valid_other_label_names: + return c + return None diff --git a/projects/LayoutLMv3/utils/typing_utils.py b/projects/LayoutLMv3/utils/typing_utils.py new file mode 100644 index 000000000..fa555e74c --- /dev/null +++ b/projects/LayoutLMv3/utils/typing_utils.py @@ -0,0 +1,6 @@ +from typing import List, Optional + +from projects.LayoutLMv3.structures import SERDataSample + +SERSampleList = List[SERDataSample] +OptSERSampleList = Optional[SERSampleList] diff --git a/projects/LayoutLMv3/visualization/__init__.py b/projects/LayoutLMv3/visualization/__init__.py new file mode 100644 index 000000000..fa9a62c1d --- /dev/null +++ b/projects/LayoutLMv3/visualization/__init__.py @@ -0,0 +1,3 @@ +from .ser_visualizer import SERLocalVisualizer + +__all__ = ['SERLocalVisualizer'] diff --git a/projects/LayoutLMv3/visualization/ser_visualizer.py b/projects/LayoutLMv3/visualization/ser_visualizer.py new file mode 100644 index 000000000..0df89db0b --- /dev/null +++ b/projects/LayoutLMv3/visualization/ser_visualizer.py @@ -0,0 +1,224 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Tuple, Union + +import mmcv +import numpy as np +import torch +from mmdet.visualization.palette import _get_adaptive_scales +from mmengine.structures import LabelData + +from mmocr.registry import VISUALIZERS +from mmocr.visualization.base_visualizer import BaseLocalVisualizer +from projects.LayoutLMv3.structures import SERDataSample + + +@VISUALIZERS.register_module() +class SERLocalVisualizer(BaseLocalVisualizer): + """The MMOCR Semantic Entity Recognition Local Visualizer. + + Args: + name (str): Name of the instance. Defaults to 'visualizer'. + image (np.ndarray, optional): The origin image to draw. The format + should be RGB. Defaults to None. + with_poly (bool): Whether to draw polygons. Defaults to False. + with_bbox (bool): Whether to draw bboxes. Defaults to True. + vis_backends (list, optional): Visual backend config list. + Defaults to None. + save_dir (str, optional): Save file dir for all storage backends. + If it is None, the backend storage will not save any data. + bbox_color (Union[str, tuple, list[str], list[tuple]]): The + colors of bboxes. ``colors`` can have the same + length with lines or just single value. If ``colors`` is single + value, all the lines will have the same colors. Refer to + `matplotlib.colors` for full list of formats that are accepted. + Defaults to 'b'. + label_color (Union[str, tuple, list[str], list[tuple]]): The + colors of gt/pred label. ``colors`` can have + the same length with lines or just single value. If ``colors`` + is single value, all the lines will have the same colors. Refer + to `matplotlib.colors` for full list of formats that are accepted. + Defaults to 'g'. + line_width (int, float): The linewidth of lines. Defaults to 2. + alpha (float): The transparency of bboxes or polygons. Defaults to 0.8. 
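The label texts drawn by the visualizer below strip the BIO prefix before display; a one-line illustration of that mapping on hypothetical token labels:

```python
bio = ['B-question', 'I-question', 'O', 'B-answer']
print([b[2:] if b != 'O' else 'other' for b in bio])
# ['question', 'question', 'other', 'answer']
```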
+ """ + + def __init__(self, + name: str = 'visualizer', + image: Optional[np.ndarray] = None, + with_poly: bool = False, + with_bbox: bool = True, + vis_backends: Optional[Dict] = None, + save_dir: Optional[str] = None, + bbox_color: Union[str, Tuple, List[str], List[Tuple]] = 'b', + label_color: Union[str, Tuple, List[str], List[Tuple]] = 'g', + line_width: Union[int, float] = 2, + alpha: float = 0.8) -> None: + super().__init__( + name=name, + image=image, + vis_backends=vis_backends, + save_dir=save_dir) + self.with_poly = with_poly + self.with_bbox = with_bbox + self.bbox_color = bbox_color + self.label_color = label_color + self.line_width = line_width + self.alpha = alpha + + def _draw_instances(self, + image: np.ndarray, + bboxes: Union[np.ndarray, torch.Tensor], + gt_labels: Optional[LabelData] = None, + pred_labels: Optional[LabelData] = None) -> np.ndarray: + """Draw bboxes and polygons on image. + + Args: + image (np.ndarray): The origin image to draw. + bboxes (Union[np.ndarray, torch.Tensor]): The bboxes to draw. + word_ids (Optional[List[int]]): The word id of tokens. + gt_labels (Optional[LabelData]): The gt LabelData. + pred_labels (Optional[LabelData]): The pred LabelData. + Returns: + np.ndarray: The image with bboxes and gt/pred labels drawn. + """ + self.set_image(image) + # draw bboxes + if bboxes is not None and self.with_bbox: + image = self.get_bboxes_image( + image, + bboxes, + colors=self.bbox_color, + line_width=self.line_width, + alpha=self.alpha) + + areas = (bboxes[:, 3] - bboxes[:, 1]) * (bboxes[:, 2] - bboxes[:, 0]) + scales = _get_adaptive_scales(areas) + positions = (bboxes[:, :2] + bboxes[:, 2:]) // 2 + + if gt_labels is not None: + gt_tokens_biolabel = gt_labels.item + gt_words_label = [ + token_biolabel[2:] if token_biolabel != 'O' else 'other' + for token_biolabel in gt_tokens_biolabel + ] + assert len(gt_words_label) == len(bboxes) + + if pred_labels is not None: + pred_tokens_biolabel = pred_labels.item + pred_words_label = [ + token_biolabel[2:] if token_biolabel != 'O' else 'other' + for token_biolabel in pred_tokens_biolabel + ] + pred_words_label_score = pred_labels.score + assert len(pred_words_label) == len(bboxes) + + # draw gt or pred labels + if gt_labels is not None and pred_labels is not None: + for i, (pos, gt, pred) in enumerate( + zip(positions, gt_words_label, pred_words_label)): + score = round(float(pred_words_label_score[i]) * 100, 1) + label_text = f'{gt} | {pred}({score})' + self.draw_texts( + label_text, + pos, + colors=self.label_color if gt == pred else 'r', + font_sizes=int(13 * scales[i]), + vertical_alignments='center', + horizontal_alignments='center') + elif pred_labels is not None: + for i, (pos, pred) in enumerate(zip(positions, pred_words_label)): + score = round(float(pred_words_label_score[i]) * 100, 1) + label_text = f'Pred: {pred}({score})' + self.draw_texts( + label_text, + pos, + colors=self.label_color, + font_sizes=int(13 * scales[i]), + vertical_alignments='center', + horizontal_alignments='center') + elif gt_labels is not None: + for i, (pos, gt) in enumerate(zip(positions, gt_words_label)): + label_text = f'GT: {gt}' + self.draw_texts( + label_text, + pos, + colors=self.label_color, + font_sizes=int(13 * scales[i]), + vertical_alignments='center', + horizontal_alignments='center') + + return self.get_image() + + def add_datasample(self, + name: str, + image: np.ndarray, + data_sample: Optional[SERDataSample] = None, + draw_gt: bool = True, + draw_pred: bool = True, + show: bool = False, + wait_time: int = 0, 
+ out_file: Optional[str] = None, + pred_score_thr: float = 0.3, + step: int = 0) -> None: + """Draw datasample and save to all backends. + + - If GT and prediction are plotted at the same time, they are + displayed in a stitched image where the left image is the + ground truth and the right image is the prediction. + - If ``show`` is True, all storage backends are ignored, and + the images will be displayed in a local window. + - If ``out_file`` is specified, the drawn image will be + saved to ``out_file``. This is usually used when the display + is not available. + + Args: + name (str): The image identifier. + image (np.ndarray): The image to draw. + data_sample (:obj:`TextDetDataSample`, optional): + TextDetDataSample which contains gt and prediction. Defaults + to None. + draw_gt (bool): Whether to draw GT TextDetDataSample. + Defaults to True. + draw_pred (bool): Whether to draw Predicted TextDetDataSample. + Defaults to True. + show (bool): Whether to display the drawn image. Default to False. + wait_time (float): The interval of show (s). Defaults to 0. + out_file (str): Path to output file. Defaults to None. + pred_score_thr (float): The threshold to visualize the bboxes + and masks. Defaults to 0.3. + step (int): Global step value to record. Defaults to 0. + """ + cat_images = [] + if data_sample is not None: + bboxes = np.array(data_sample.instances.get('boxes', None)) + gt_label = data_sample.gt_label if \ + draw_gt and 'gt_label' in data_sample else None + pred_label = data_sample.pred_label if \ + draw_pred and 'pred_label' in data_sample else None + # draw original image with bboxes + orig_img_with_bboxes = self._draw_instances( + image=image.copy(), + bboxes=bboxes, + gt_labels=None, + pred_labels=None) + cat_images.append(orig_img_with_bboxes) + empty_img = np.full_like(image, 255) + empty_img_with_label = self._draw_instances( + image=empty_img, + bboxes=bboxes, + gt_labels=gt_label, + pred_labels=pred_label) + cat_images.append(empty_img_with_label) + cat_images = self._cat_image(cat_images, axis=1) + if cat_images is None: + cat_images = image + if show: + self.show(cat_images, win_name=name, wait_time=wait_time) + else: + self.add_image(name, cat_images, step) + + if out_file is not None: + mmcv.imwrite(cat_images[..., ::-1], out_file) + + self.set_image(cat_images) + return self.get_image() diff --git a/tools/dataset_converters/prepare_dataset.py b/tools/dataset_converters/prepare_dataset.py index 84b8a0353..1d2e74c06 100644 --- a/tools/dataset_converters/prepare_dataset.py +++ b/tools/dataset_converters/prepare_dataset.py @@ -21,9 +21,9 @@ def parse_args(): parser.add_argument( '--task', default='textdet', - choices=['textdet', 'textrecog', 'textspotting', 'kie'], + choices=['textdet', 'textrecog', 'textspotting', 'kie', 'ser', 're'], help='Task type. Options are "textdet", "textrecog", "textspotting"' - ' and "kie".') + ' "kie", "ser" and "re".') parser.add_argument( '--splits', default=['train', 'test', 'val'],