diff --git a/.circleci/docker/Dockerfile b/.circleci/docker/Dockerfile index d9cf8cc77..b5efe06a4 100644 --- a/.circleci/docker/Dockerfile +++ b/.circleci/docker/Dockerfile @@ -1,6 +1,7 @@ ARG PYTORCH="1.8.1" ARG CUDA="10.2" ARG CUDNN="7" +ARG DEBIAN_FRONTEND=noninteractive FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel diff --git a/.circleci/test.yml b/.circleci/test.yml index c24bebcb5..51d9770ad 100644 --- a/.circleci/test.yml +++ b/.circleci/test.yml @@ -80,16 +80,22 @@ jobs: type: string cuda: type: enum - enum: ["10.1", "10.2", "11.1", "11.7"] + enum: ["10.1", "10.2", "11.1", "11.7", "11.8"] cudnn: type: integer default: 7 machine: - image: ubuntu-2004-cuda-11.4:202110-01 + image: linux-cuda-11:default # docker_layer_caching: true - resource_class: gpu.nvidia.small + resource_class: gpu.nvidia.small.multi steps: - checkout + - run: + name: Install nvidia-container-toolkit and Restart Docker + command: | + sudo apt-get update + sudo apt-get install -y nvidia-container-toolkit + sudo systemctl restart docker - run: # Cloning repos in VM since Docker doesn't have access to the private key name: Clone Repos @@ -152,8 +158,8 @@ workflows: - lint - build_cpu: name: maximum_version_cpu - torch: 2.0.0 - torchvision: 0.15.1 + torch: 2.1.0 + torchvision: 0.16.0 python: 3.9.0 requires: - minimum_version_cpu @@ -171,10 +177,10 @@ workflows: - hold - build_cuda: name: mainstream_version_gpu - torch: 2.0.0 + torch: 2.1.0 # Use double quotation mark to explicitly specify its type # as string instead of number - cuda: "11.7" + cuda: "11.8" cudnn: 8 requires: - hold diff --git a/.codespellrc b/.codespellrc index d9a0a76c5..72be50e00 100644 --- a/.codespellrc +++ b/.codespellrc @@ -2,4 +2,4 @@ skip = *.ipynb count = quiet-level = 3 -ignore-words-list = convertor,convertors,formating,nin,wan,datas,hist,ned +ignore-words-list = convertor,convertors,formating,nin,wan,datas,hist,ned,ser diff --git a/.github/workflows/merge_stage_test.yml b/.github/workflows/merge_stage_test.yml index 856ede833..44be34746 100644 --- a/.github/workflows/merge_stage_test.yml +++ b/.github/workflows/merge_stage_test.yml @@ -60,7 +60,7 @@ jobs: strategy: matrix: python-version: [3.7] - torch: [1.6.0, 1.7.1, 1.8.1, 1.9.1, 1.10.1, 1.11.0, 1.12.1, 1.13.0] + torch: [1.6.0, 1.7.1, 1.8.1, 1.9.1, 1.10.1, 1.11.0, 1.12.1, 1.13.0, 2.0.0, 2.1.0] include: - torch: 1.6.0 torchvision: 0.7.0 @@ -81,6 +81,9 @@ jobs: - torch: 2.0.0 torchvision: 0.15.1 python-version: 3.8 + - torch: 2.1.0 + torchvision: 0.16.0 + python-version: 3.8 steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} diff --git a/configs/re/_base_/datasets/xfund_zh.py b/configs/re/_base_/datasets/xfund_zh.py new file mode 100644 index 000000000..06fb11c09 --- /dev/null +++ b/configs/re/_base_/datasets/xfund_zh.py @@ -0,0 +1,14 @@ +xfund_zh_re_data_root = 'data/xfund/zh' + +xfund_zh_re_train = dict( + type='XFUNDDataset', + data_root=xfund_zh_re_data_root, + ann_file='re_train.json', + pipeline=None) + +xfund_zh_re_test = dict( + type='XFUNDDataset', + data_root=xfund_zh_re_data_root, + ann_file='re_test.json', + test_mode=True, + pipeline=None) diff --git a/configs/ser/_base_/datasets/xfund_zh.py b/configs/ser/_base_/datasets/xfund_zh.py new file mode 100644 index 000000000..e790a7bf6 --- /dev/null +++ b/configs/ser/_base_/datasets/xfund_zh.py @@ -0,0 +1,14 @@ +xfund_zh_ser_data_root = 'data/xfund/zh' + +xfund_zh_ser_train = dict( + type='XFUNDDataset', + data_root=xfund_zh_ser_data_root, + ann_file='ser_train.json', + 
pipeline=None) + +xfund_zh_ser_test = dict( + type='XFUNDDataset', + data_root=xfund_zh_ser_data_root, + ann_file='ser_test.json', + test_mode=True, + pipeline=None) diff --git a/dataset_zoo/xfund/de/metafile.yml b/dataset_zoo/xfund/de/metafile.yml new file mode 100644 index 000000000..86dfff885 --- /dev/null +++ b/dataset_zoo/xfund/de/metafile.yml @@ -0,0 +1,41 @@ +Name: 'XFUND' +Paper: + Title: 'XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding' + URL: https://aclanthology.org/2022.findings-acl.253 + Venue: ACL + Year: '2022' + BibTeX: '@inproceedings{xu-etal-2022-xfund, + title = "{XFUND}: A Benchmark Dataset for Multilingual Visually Rich Form Understanding", + author = "Xu, Yiheng and + Lv, Tengchao and + Cui, Lei and + Wang, Guoxin and + Lu, Yijuan and + Florencio, Dinei and + Zhang, Cha and + Wei, Furu", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-acl.253", + doi = "10.18653/v1/2022.findings-acl.253", + pages = "3214--3224", + abstract = "Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually rich document understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. However, the existed research work has focused only on the English domain while neglecting the importance of multilingual generalization. In this paper, we introduce a human-annotated multilingual form understanding benchmark dataset named XFUND, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). Meanwhile, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually rich document understanding. Experimental results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUND dataset. 
The XFUND dataset and the pre-trained LayoutXLM model have been publicly available at https://aka.ms/layoutxlm.", +}' +Data: + Website: https://github.com/doc-analysis/XFUND + Language: + - Chinese, Japanese, Spanish, French, Italian, German, Portuguese + Scene: + - Document + Granularity: + - Word + Tasks: + - ser + - re + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .json diff --git a/dataset_zoo/xfund/de/re.py b/dataset_zoo/xfund/de/re.py new file mode 100644 index 000000000..e0419d026 --- /dev/null +++ b/dataset_zoo/xfund/de/re.py @@ -0,0 +1,6 @@ +_base_ = ['ser.py'] + +_base_.train_preparer.packer.type = 'REPacker' +_base_.test_preparer.packer.type = 'REPacker' + +config_generator = dict(type='XFUNDREConfigGenerator') diff --git a/dataset_zoo/xfund/de/sample_anno.md b/dataset_zoo/xfund/de/sample_anno.md new file mode 100644 index 000000000..6f41a5e92 --- /dev/null +++ b/dataset_zoo/xfund/de/sample_anno.md @@ -0,0 +1,70 @@ +**Semantic Entity Recognition / Relation Extraction** + +```json +{ + "lang": "zh", + "version": "0.1", + "split": "val", + "documents": [ + { + "id": "zh_val_0", + "uid": "0ac15750a098682aa02b51555f7c49ff43adc0436c325548ba8dba560cde4e7e", + "document": [ + { + "box": [ + 410, + 541, + 535, + 590 + ], + "text": "夏艳辰", + "label": "answer", + "words": [ + { + "box": [ + 413, + 541, + 447, + 587 + ], + "text": "夏" + }, + { + "box": [ + 458, + 542, + 489, + 588 + ], + "text": "艳" + }, + { + "box": [ + 497, + 544, + 531, + 590 + ], + "text": "辰" + } + ], + "linking": [ + [ + 30, + 26 + ] + ], + "id": 26 + }, + // ... + ], + "img": { + "fname": "zh_val_0.jpg", + "width": 2480, + "height": 3508 + } + }, + // ... + ] +} +``` diff --git a/dataset_zoo/xfund/de/ser.py b/dataset_zoo/xfund/de/ser.py new file mode 100644 index 000000000..5e9769eb0 --- /dev/null +++ b/dataset_zoo/xfund/de/ser.py @@ -0,0 +1,60 @@ +lang = 'de' +data_root = f'data/xfund/{lang}' +cache_path = 'data/cache' + +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.zip', + save_name=f'{lang}_train.zip', + md5='8c9f949952d227290e22f736cdbe4d29', + content=['image'], + mapping=[[f'{lang}_train/*.jpg', 'imgs/train']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.json', + save_name=f'{lang}_train.json', + md5='3e4b95c7da893bf5a91018445c83ccdd', + content=['annotation'], + mapping=[[f'{lang}_train.json', 'annotations/train.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='train.json', img_dir='imgs/train'), + parser=dict(type='XFUNDAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.zip', + save_name=f'{lang}_val.zip', + md5='d13d12278d585214183c3cfb949b0e59', + content=['image'], + mapping=[[f'{lang}_val/*.jpg', 'imgs/test']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.json', + save_name=f'{lang}_val.json', + md5='8eaf742f2d19b17f5c0e72da5c7761ef', + content=['annotation'], + mapping=[[f'{lang}_val.json', 'annotations/test.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='test.json', img_dir='imgs/test'), + parser=dict(type='XFUNDAnnParser'), + 
packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] +config_generator = dict(type='XFUNDSERConfigGenerator') diff --git a/dataset_zoo/xfund/es/metafile.yml b/dataset_zoo/xfund/es/metafile.yml new file mode 100644 index 000000000..86dfff885 --- /dev/null +++ b/dataset_zoo/xfund/es/metafile.yml @@ -0,0 +1,41 @@ +Name: 'XFUND' +Paper: + Title: 'XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding' + URL: https://aclanthology.org/2022.findings-acl.253 + Venue: ACL + Year: '2022' + BibTeX: '@inproceedings{xu-etal-2022-xfund, + title = "{XFUND}: A Benchmark Dataset for Multilingual Visually Rich Form Understanding", + author = "Xu, Yiheng and + Lv, Tengchao and + Cui, Lei and + Wang, Guoxin and + Lu, Yijuan and + Florencio, Dinei and + Zhang, Cha and + Wei, Furu", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-acl.253", + doi = "10.18653/v1/2022.findings-acl.253", + pages = "3214--3224", + abstract = "Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually rich document understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. However, the existed research work has focused only on the English domain while neglecting the importance of multilingual generalization. In this paper, we introduce a human-annotated multilingual form understanding benchmark dataset named XFUND, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). Meanwhile, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually rich document understanding. Experimental results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUND dataset. 
The XFUND dataset and the pre-trained LayoutXLM model have been publicly available at https://aka.ms/layoutxlm.", +}' +Data: + Website: https://github.com/doc-analysis/XFUND + Language: + - Chinese, Japanese, Spanish, French, Italian, German, Portuguese + Scene: + - Document + Granularity: + - Word + Tasks: + - ser + - re + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .json diff --git a/dataset_zoo/xfund/es/re.py b/dataset_zoo/xfund/es/re.py new file mode 100644 index 000000000..e0419d026 --- /dev/null +++ b/dataset_zoo/xfund/es/re.py @@ -0,0 +1,6 @@ +_base_ = ['ser.py'] + +_base_.train_preparer.packer.type = 'REPacker' +_base_.test_preparer.packer.type = 'REPacker' + +config_generator = dict(type='XFUNDREConfigGenerator') diff --git a/dataset_zoo/xfund/es/sample_anno.md b/dataset_zoo/xfund/es/sample_anno.md new file mode 100644 index 000000000..6f41a5e92 --- /dev/null +++ b/dataset_zoo/xfund/es/sample_anno.md @@ -0,0 +1,70 @@ +**Semantic Entity Recognition / Relation Extraction** + +```json +{ + "lang": "zh", + "version": "0.1", + "split": "val", + "documents": [ + { + "id": "zh_val_0", + "uid": "0ac15750a098682aa02b51555f7c49ff43adc0436c325548ba8dba560cde4e7e", + "document": [ + { + "box": [ + 410, + 541, + 535, + 590 + ], + "text": "夏艳辰", + "label": "answer", + "words": [ + { + "box": [ + 413, + 541, + 447, + 587 + ], + "text": "夏" + }, + { + "box": [ + 458, + 542, + 489, + 588 + ], + "text": "艳" + }, + { + "box": [ + 497, + 544, + 531, + 590 + ], + "text": "辰" + } + ], + "linking": [ + [ + 30, + 26 + ] + ], + "id": 26 + }, + // ... + ], + "img": { + "fname": "zh_val_0.jpg", + "width": 2480, + "height": 3508 + } + }, + // ... + ] +} +``` diff --git a/dataset_zoo/xfund/es/ser.py b/dataset_zoo/xfund/es/ser.py new file mode 100644 index 000000000..da8900980 --- /dev/null +++ b/dataset_zoo/xfund/es/ser.py @@ -0,0 +1,60 @@ +lang = 'es' +data_root = f'data/xfund/{lang}' +cache_path = 'data/cache' + +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.zip', + save_name=f'{lang}_train.zip', + md5='0ff89032bc6cb2e7ccba062c71944d03', + content=['image'], + mapping=[[f'{lang}_train/*.jpg', 'imgs/train']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.json', + save_name=f'{lang}_train.json', + md5='b40b43f276c7deaaaa5923d035da2820', + content=['annotation'], + mapping=[[f'{lang}_train.json', 'annotations/train.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='train.json', img_dir='imgs/train'), + parser=dict(type='XFUNDAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.zip', + save_name=f'{lang}_val.zip', + md5='efad9fb11ee3036bef003b6364a79ac0', + content=['image'], + mapping=[[f'{lang}_val/*.jpg', 'imgs/test']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.json', + save_name=f'{lang}_val.json', + md5='96ffc2057049ba2826a005825b3e7f0d', + content=['annotation'], + mapping=[[f'{lang}_val.json', 'annotations/test.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='test.json', img_dir='imgs/test'), + parser=dict(type='XFUNDAnnParser'), + 
packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] +config_generator = dict(type='XFUNDSERConfigGenerator') diff --git a/dataset_zoo/xfund/fr/metafile.yml b/dataset_zoo/xfund/fr/metafile.yml new file mode 100644 index 000000000..86dfff885 --- /dev/null +++ b/dataset_zoo/xfund/fr/metafile.yml @@ -0,0 +1,41 @@ +Name: 'XFUND' +Paper: + Title: 'XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding' + URL: https://aclanthology.org/2022.findings-acl.253 + Venue: ACL + Year: '2022' + BibTeX: '@inproceedings{xu-etal-2022-xfund, + title = "{XFUND}: A Benchmark Dataset for Multilingual Visually Rich Form Understanding", + author = "Xu, Yiheng and + Lv, Tengchao and + Cui, Lei and + Wang, Guoxin and + Lu, Yijuan and + Florencio, Dinei and + Zhang, Cha and + Wei, Furu", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-acl.253", + doi = "10.18653/v1/2022.findings-acl.253", + pages = "3214--3224", + abstract = "Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually rich document understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. However, the existed research work has focused only on the English domain while neglecting the importance of multilingual generalization. In this paper, we introduce a human-annotated multilingual form understanding benchmark dataset named XFUND, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). Meanwhile, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually rich document understanding. Experimental results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUND dataset. 
The XFUND dataset and the pre-trained LayoutXLM model have been publicly available at https://aka.ms/layoutxlm.", +}' +Data: + Website: https://github.com/doc-analysis/XFUND + Language: + - Chinese, Japanese, Spanish, French, Italian, German, Portuguese + Scene: + - Document + Granularity: + - Word + Tasks: + - ser + - re + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .json diff --git a/dataset_zoo/xfund/fr/re.py b/dataset_zoo/xfund/fr/re.py new file mode 100644 index 000000000..e0419d026 --- /dev/null +++ b/dataset_zoo/xfund/fr/re.py @@ -0,0 +1,6 @@ +_base_ = ['ser.py'] + +_base_.train_preparer.packer.type = 'REPacker' +_base_.test_preparer.packer.type = 'REPacker' + +config_generator = dict(type='XFUNDREConfigGenerator') diff --git a/dataset_zoo/xfund/fr/sample_anno.md b/dataset_zoo/xfund/fr/sample_anno.md new file mode 100644 index 000000000..6f41a5e92 --- /dev/null +++ b/dataset_zoo/xfund/fr/sample_anno.md @@ -0,0 +1,70 @@ +**Semantic Entity Recognition / Relation Extraction** + +```json +{ + "lang": "zh", + "version": "0.1", + "split": "val", + "documents": [ + { + "id": "zh_val_0", + "uid": "0ac15750a098682aa02b51555f7c49ff43adc0436c325548ba8dba560cde4e7e", + "document": [ + { + "box": [ + 410, + 541, + 535, + 590 + ], + "text": "夏艳辰", + "label": "answer", + "words": [ + { + "box": [ + 413, + 541, + 447, + 587 + ], + "text": "夏" + }, + { + "box": [ + 458, + 542, + 489, + 588 + ], + "text": "艳" + }, + { + "box": [ + 497, + 544, + 531, + 590 + ], + "text": "辰" + } + ], + "linking": [ + [ + 30, + 26 + ] + ], + "id": 26 + }, + // ... + ], + "img": { + "fname": "zh_val_0.jpg", + "width": 2480, + "height": 3508 + } + }, + // ... + ] +} +``` diff --git a/dataset_zoo/xfund/fr/ser.py b/dataset_zoo/xfund/fr/ser.py new file mode 100644 index 000000000..aad6b7cf3 --- /dev/null +++ b/dataset_zoo/xfund/fr/ser.py @@ -0,0 +1,60 @@ +lang = 'fr' +data_root = f'data/xfund/{lang}' +cache_path = 'data/cache' + +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.zip', + save_name=f'{lang}_train.zip', + md5='d821ca50f37cc39ff1715632f4068ea1', + content=['image'], + mapping=[[f'{lang}_train/*.jpg', 'imgs/train']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.json', + save_name=f'{lang}_train.json', + md5='349e7f824225bc7cc53f0c0eb8c87d3e', + content=['annotation'], + mapping=[[f'{lang}_train.json', 'annotations/train.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='train.json', img_dir='imgs/train'), + parser=dict(type='XFUNDAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.zip', + save_name=f'{lang}_val.zip', + md5='9ccbf15816ca05e50229885b75e57e49', + content=['image'], + mapping=[[f'{lang}_val/*.jpg', 'imgs/test']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.json', + save_name=f'{lang}_val.json', + md5='15d8a52a4eb20ea029a4aa3eaa25ef8d', + content=['annotation'], + mapping=[[f'{lang}_val.json', 'annotations/test.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='test.json', img_dir='imgs/test'), + parser=dict(type='XFUNDAnnParser'), + 
packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] +config_generator = dict(type='XFUNDSERConfigGenerator') diff --git a/dataset_zoo/xfund/it/metafile.yml b/dataset_zoo/xfund/it/metafile.yml new file mode 100644 index 000000000..86dfff885 --- /dev/null +++ b/dataset_zoo/xfund/it/metafile.yml @@ -0,0 +1,41 @@ +Name: 'XFUND' +Paper: + Title: 'XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding' + URL: https://aclanthology.org/2022.findings-acl.253 + Venue: ACL + Year: '2022' + BibTeX: '@inproceedings{xu-etal-2022-xfund, + title = "{XFUND}: A Benchmark Dataset for Multilingual Visually Rich Form Understanding", + author = "Xu, Yiheng and + Lv, Tengchao and + Cui, Lei and + Wang, Guoxin and + Lu, Yijuan and + Florencio, Dinei and + Zhang, Cha and + Wei, Furu", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-acl.253", + doi = "10.18653/v1/2022.findings-acl.253", + pages = "3214--3224", + abstract = "Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually rich document understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. However, the existed research work has focused only on the English domain while neglecting the importance of multilingual generalization. In this paper, we introduce a human-annotated multilingual form understanding benchmark dataset named XFUND, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). Meanwhile, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually rich document understanding. Experimental results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUND dataset. 
The XFUND dataset and the pre-trained LayoutXLM model have been publicly available at https://aka.ms/layoutxlm.", +}' +Data: + Website: https://github.com/doc-analysis/XFUND + Language: + - Chinese, Japanese, Spanish, French, Italian, German, Portuguese + Scene: + - Document + Granularity: + - Word + Tasks: + - ser + - re + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .json diff --git a/dataset_zoo/xfund/it/re.py b/dataset_zoo/xfund/it/re.py new file mode 100644 index 000000000..e0419d026 --- /dev/null +++ b/dataset_zoo/xfund/it/re.py @@ -0,0 +1,6 @@ +_base_ = ['ser.py'] + +_base_.train_preparer.packer.type = 'REPacker' +_base_.test_preparer.packer.type = 'REPacker' + +config_generator = dict(type='XFUNDREConfigGenerator') diff --git a/dataset_zoo/xfund/it/sample_anno.md b/dataset_zoo/xfund/it/sample_anno.md new file mode 100644 index 000000000..6f41a5e92 --- /dev/null +++ b/dataset_zoo/xfund/it/sample_anno.md @@ -0,0 +1,70 @@ +**Semantic Entity Recognition / Relation Extraction** + +```json +{ + "lang": "zh", + "version": "0.1", + "split": "val", + "documents": [ + { + "id": "zh_val_0", + "uid": "0ac15750a098682aa02b51555f7c49ff43adc0436c325548ba8dba560cde4e7e", + "document": [ + { + "box": [ + 410, + 541, + 535, + 590 + ], + "text": "夏艳辰", + "label": "answer", + "words": [ + { + "box": [ + 413, + 541, + 447, + 587 + ], + "text": "夏" + }, + { + "box": [ + 458, + 542, + 489, + 588 + ], + "text": "艳" + }, + { + "box": [ + 497, + 544, + 531, + 590 + ], + "text": "辰" + } + ], + "linking": [ + [ + 30, + 26 + ] + ], + "id": 26 + }, + // ... + ], + "img": { + "fname": "zh_val_0.jpg", + "width": 2480, + "height": 3508 + } + }, + // ... + ] +} +``` diff --git a/dataset_zoo/xfund/it/ser.py b/dataset_zoo/xfund/it/ser.py new file mode 100644 index 000000000..fc9fc8b70 --- /dev/null +++ b/dataset_zoo/xfund/it/ser.py @@ -0,0 +1,60 @@ +lang = 'it' +data_root = f'data/xfund/{lang}' +cache_path = 'data/cache' + +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.zip', + save_name=f'{lang}_train.zip', + md5='c531e39f0cbc1dc74caa320ffafe5de9', + content=['image'], + mapping=[[f'{lang}_train/*.jpg', 'imgs/train']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.json', + save_name=f'{lang}_train.json', + md5='fa6afe204a6af57152627e76fe2de005', + content=['annotation'], + mapping=[[f'{lang}_train.json', 'annotations/train.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='train.json', img_dir='imgs/train'), + parser=dict(type='XFUNDAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.zip', + save_name=f'{lang}_val.zip', + md5='35446a115561d0773b7f2a0c2f32fe5c', + content=['image'], + mapping=[[f'{lang}_val/*.jpg', 'imgs/test']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.json', + save_name=f'{lang}_val.json', + md5='260d4ea447636cbca1ce1ca5fc5846d9', + content=['annotation'], + mapping=[[f'{lang}_val.json', 'annotations/test.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='test.json', img_dir='imgs/test'), + parser=dict(type='XFUNDAnnParser'), + 
packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] +config_generator = dict(type='XFUNDSERConfigGenerator') diff --git a/dataset_zoo/xfund/ja/metafile.yml b/dataset_zoo/xfund/ja/metafile.yml new file mode 100644 index 000000000..86dfff885 --- /dev/null +++ b/dataset_zoo/xfund/ja/metafile.yml @@ -0,0 +1,41 @@ +Name: 'XFUND' +Paper: + Title: 'XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding' + URL: https://aclanthology.org/2022.findings-acl.253 + Venue: ACL + Year: '2022' + BibTeX: '@inproceedings{xu-etal-2022-xfund, + title = "{XFUND}: A Benchmark Dataset for Multilingual Visually Rich Form Understanding", + author = "Xu, Yiheng and + Lv, Tengchao and + Cui, Lei and + Wang, Guoxin and + Lu, Yijuan and + Florencio, Dinei and + Zhang, Cha and + Wei, Furu", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-acl.253", + doi = "10.18653/v1/2022.findings-acl.253", + pages = "3214--3224", + abstract = "Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually rich document understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. However, the existed research work has focused only on the English domain while neglecting the importance of multilingual generalization. In this paper, we introduce a human-annotated multilingual form understanding benchmark dataset named XFUND, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). Meanwhile, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually rich document understanding. Experimental results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUND dataset. 
The XFUND dataset and the pre-trained LayoutXLM model have been publicly available at https://aka.ms/layoutxlm.", +}' +Data: + Website: https://github.com/doc-analysis/XFUND + Language: + - Chinese, Japanese, Spanish, French, Italian, German, Portuguese + Scene: + - Document + Granularity: + - Word + Tasks: + - ser + - re + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .json diff --git a/dataset_zoo/xfund/ja/re.py b/dataset_zoo/xfund/ja/re.py new file mode 100644 index 000000000..e0419d026 --- /dev/null +++ b/dataset_zoo/xfund/ja/re.py @@ -0,0 +1,6 @@ +_base_ = ['ser.py'] + +_base_.train_preparer.packer.type = 'REPacker' +_base_.test_preparer.packer.type = 'REPacker' + +config_generator = dict(type='XFUNDREConfigGenerator') diff --git a/dataset_zoo/xfund/ja/sample_anno.md b/dataset_zoo/xfund/ja/sample_anno.md new file mode 100644 index 000000000..6f41a5e92 --- /dev/null +++ b/dataset_zoo/xfund/ja/sample_anno.md @@ -0,0 +1,70 @@ +**Semantic Entity Recognition / Relation Extraction** + +```json +{ + "lang": "zh", + "version": "0.1", + "split": "val", + "documents": [ + { + "id": "zh_val_0", + "uid": "0ac15750a098682aa02b51555f7c49ff43adc0436c325548ba8dba560cde4e7e", + "document": [ + { + "box": [ + 410, + 541, + 535, + 590 + ], + "text": "夏艳辰", + "label": "answer", + "words": [ + { + "box": [ + 413, + 541, + 447, + 587 + ], + "text": "夏" + }, + { + "box": [ + 458, + 542, + 489, + 588 + ], + "text": "艳" + }, + { + "box": [ + 497, + 544, + 531, + 590 + ], + "text": "辰" + } + ], + "linking": [ + [ + 30, + 26 + ] + ], + "id": 26 + }, + // ... + ], + "img": { + "fname": "zh_val_0.jpg", + "width": 2480, + "height": 3508 + } + }, + // ... + ] +} +``` diff --git a/dataset_zoo/xfund/ja/ser.py b/dataset_zoo/xfund/ja/ser.py new file mode 100644 index 000000000..856b4f96d --- /dev/null +++ b/dataset_zoo/xfund/ja/ser.py @@ -0,0 +1,60 @@ +lang = 'ja' +data_root = f'data/xfund/{lang}' +cache_path = 'data/cache' + +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.zip', + save_name=f'{lang}_train.zip', + md5='50c22c6774706494080a73f8eabcf45d', + content=['image'], + mapping=[[f'{lang}_train/*.jpg', 'imgs/train']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.json', + save_name=f'{lang}_train.json', + md5='46cd53deab3b8fbd69278da56d1778c4', + content=['annotation'], + mapping=[[f'{lang}_train.json', 'annotations/train.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='train.json', img_dir='imgs/train'), + parser=dict(type='XFUNDAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.zip', + save_name=f'{lang}_val.zip', + md5='93a22fea044894264bfa3c9f9c84dd37', + content=['image'], + mapping=[[f'{lang}_val/*.jpg', 'imgs/test']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.json', + save_name=f'{lang}_val.json', + md5='f576b6dc6c08fd98cf877fb04bc4c8c3', + content=['annotation'], + mapping=[[f'{lang}_val.json', 'annotations/test.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='test.json', img_dir='imgs/test'), + parser=dict(type='XFUNDAnnParser'), + 
packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] +config_generator = dict(type='XFUNDSERConfigGenerator') diff --git a/dataset_zoo/xfund/pt/metafile.yml b/dataset_zoo/xfund/pt/metafile.yml new file mode 100644 index 000000000..86dfff885 --- /dev/null +++ b/dataset_zoo/xfund/pt/metafile.yml @@ -0,0 +1,41 @@ +Name: 'XFUND' +Paper: + Title: 'XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding' + URL: https://aclanthology.org/2022.findings-acl.253 + Venue: ACL + Year: '2022' + BibTeX: '@inproceedings{xu-etal-2022-xfund, + title = "{XFUND}: A Benchmark Dataset for Multilingual Visually Rich Form Understanding", + author = "Xu, Yiheng and + Lv, Tengchao and + Cui, Lei and + Wang, Guoxin and + Lu, Yijuan and + Florencio, Dinei and + Zhang, Cha and + Wei, Furu", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-acl.253", + doi = "10.18653/v1/2022.findings-acl.253", + pages = "3214--3224", + abstract = "Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually rich document understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. However, the existed research work has focused only on the English domain while neglecting the importance of multilingual generalization. In this paper, we introduce a human-annotated multilingual form understanding benchmark dataset named XFUND, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). Meanwhile, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually rich document understanding. Experimental results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUND dataset. 
The XFUND dataset and the pre-trained LayoutXLM model have been publicly available at https://aka.ms/layoutxlm.", +}' +Data: + Website: https://github.com/doc-analysis/XFUND + Language: + - Chinese, Japanese, Spanish, French, Italian, German, Portuguese + Scene: + - Document + Granularity: + - Word + Tasks: + - ser + - re + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .json diff --git a/dataset_zoo/xfund/pt/re.py b/dataset_zoo/xfund/pt/re.py new file mode 100644 index 000000000..e0419d026 --- /dev/null +++ b/dataset_zoo/xfund/pt/re.py @@ -0,0 +1,6 @@ +_base_ = ['ser.py'] + +_base_.train_preparer.packer.type = 'REPacker' +_base_.test_preparer.packer.type = 'REPacker' + +config_generator = dict(type='XFUNDREConfigGenerator') diff --git a/dataset_zoo/xfund/pt/sample_anno.md b/dataset_zoo/xfund/pt/sample_anno.md new file mode 100644 index 000000000..6f41a5e92 --- /dev/null +++ b/dataset_zoo/xfund/pt/sample_anno.md @@ -0,0 +1,70 @@ +**Semantic Entity Recognition / Relation Extraction** + +```json +{ + "lang": "zh", + "version": "0.1", + "split": "val", + "documents": [ + { + "id": "zh_val_0", + "uid": "0ac15750a098682aa02b51555f7c49ff43adc0436c325548ba8dba560cde4e7e", + "document": [ + { + "box": [ + 410, + 541, + 535, + 590 + ], + "text": "夏艳辰", + "label": "answer", + "words": [ + { + "box": [ + 413, + 541, + 447, + 587 + ], + "text": "夏" + }, + { + "box": [ + 458, + 542, + 489, + 588 + ], + "text": "艳" + }, + { + "box": [ + 497, + 544, + 531, + 590 + ], + "text": "辰" + } + ], + "linking": [ + [ + 30, + 26 + ] + ], + "id": 26 + }, + // ... + ], + "img": { + "fname": "zh_val_0.jpg", + "width": 2480, + "height": 3508 + } + }, + // ... + ] +} +``` diff --git a/dataset_zoo/xfund/pt/ser.py b/dataset_zoo/xfund/pt/ser.py new file mode 100644 index 000000000..ff147ba4c --- /dev/null +++ b/dataset_zoo/xfund/pt/ser.py @@ -0,0 +1,60 @@ +lang = 'pt' +data_root = f'data/xfund/{lang}' +cache_path = 'data/cache' + +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.zip', + save_name=f'{lang}_train.zip', + md5='783ba0aba419235bc81cf547e7c5011b', + content=['image'], + mapping=[[f'{lang}_train/*.jpg', 'imgs/train']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.json', + save_name=f'{lang}_train.json', + md5='3fe0fb93e631fcbc391216d2d7b0510d', + content=['annotation'], + mapping=[[f'{lang}_train.json', 'annotations/train.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='train.json', img_dir='imgs/train'), + parser=dict(type='XFUNDAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.zip', + save_name=f'{lang}_val.zip', + md5='5f0189d29c5a0e6764757457f54ba14f', + content=['image'], + mapping=[[f'{lang}_val/*.jpg', 'imgs/test']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.json', + save_name=f'{lang}_val.json', + md5='82a93addffdd7ac7fd978972adf1a348', + content=['annotation'], + mapping=[[f'{lang}_val.json', 'annotations/test.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='test.json', img_dir='imgs/test'), + parser=dict(type='XFUNDAnnParser'), + 
packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] +config_generator = dict(type='XFUNDSERConfigGenerator') diff --git a/dataset_zoo/xfund/zh/metafile.yml b/dataset_zoo/xfund/zh/metafile.yml new file mode 100644 index 000000000..86dfff885 --- /dev/null +++ b/dataset_zoo/xfund/zh/metafile.yml @@ -0,0 +1,41 @@ +Name: 'XFUND' +Paper: + Title: 'XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding' + URL: https://aclanthology.org/2022.findings-acl.253 + Venue: ACL + Year: '2022' + BibTeX: '@inproceedings{xu-etal-2022-xfund, + title = "{XFUND}: A Benchmark Dataset for Multilingual Visually Rich Form Understanding", + author = "Xu, Yiheng and + Lv, Tengchao and + Cui, Lei and + Wang, Guoxin and + Lu, Yijuan and + Florencio, Dinei and + Zhang, Cha and + Wei, Furu", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-acl.253", + doi = "10.18653/v1/2022.findings-acl.253", + pages = "3214--3224", + abstract = "Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually rich document understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. However, the existed research work has focused only on the English domain while neglecting the importance of multilingual generalization. In this paper, we introduce a human-annotated multilingual form understanding benchmark dataset named XFUND, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). Meanwhile, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually rich document understanding. Experimental results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUND dataset. 
The XFUND dataset and the pre-trained LayoutXLM model have been publicly available at https://aka.ms/layoutxlm.", +}' +Data: + Website: https://github.com/doc-analysis/XFUND + Language: + - Chinese, Japanese, Spanish, French, Italian, German, Portuguese + Scene: + - Document + Granularity: + - Word + Tasks: + - ser + - re + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .json diff --git a/dataset_zoo/xfund/zh/re.py b/dataset_zoo/xfund/zh/re.py new file mode 100644 index 000000000..e0419d026 --- /dev/null +++ b/dataset_zoo/xfund/zh/re.py @@ -0,0 +1,6 @@ +_base_ = ['ser.py'] + +_base_.train_preparer.packer.type = 'REPacker' +_base_.test_preparer.packer.type = 'REPacker' + +config_generator = dict(type='XFUNDREConfigGenerator') diff --git a/dataset_zoo/xfund/zh/sample_anno.md b/dataset_zoo/xfund/zh/sample_anno.md new file mode 100644 index 000000000..6f41a5e92 --- /dev/null +++ b/dataset_zoo/xfund/zh/sample_anno.md @@ -0,0 +1,70 @@ +**Semantic Entity Recognition / Relation Extraction** + +```json +{ + "lang": "zh", + "version": "0.1", + "split": "val", + "documents": [ + { + "id": "zh_val_0", + "uid": "0ac15750a098682aa02b51555f7c49ff43adc0436c325548ba8dba560cde4e7e", + "document": [ + { + "box": [ + 410, + 541, + 535, + 590 + ], + "text": "夏艳辰", + "label": "answer", + "words": [ + { + "box": [ + 413, + 541, + 447, + 587 + ], + "text": "夏" + }, + { + "box": [ + 458, + 542, + 489, + 588 + ], + "text": "艳" + }, + { + "box": [ + 497, + 544, + 531, + 590 + ], + "text": "辰" + } + ], + "linking": [ + [ + 30, + 26 + ] + ], + "id": 26 + }, + // ... + ], + "img": { + "fname": "zh_val_0.jpg", + "width": 2480, + "height": 3508 + } + }, + // ... + ] +} +``` diff --git a/dataset_zoo/xfund/zh/ser.py b/dataset_zoo/xfund/zh/ser.py new file mode 100644 index 000000000..20a3d1150 --- /dev/null +++ b/dataset_zoo/xfund/zh/ser.py @@ -0,0 +1,60 @@ +lang = 'zh' +data_root = f'data/xfund/{lang}' +cache_path = 'data/cache' + +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.zip', + save_name=f'{lang}_train.zip', + md5='a4ce16d1c1a8554a8b1e00907cff3b4b', + content=['image'], + mapping=[[f'{lang}_train/*.jpg', 'imgs/train']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.train.json', + save_name=f'{lang}_train.json', + md5='af1afd5e935cccd3a105de6c12eb4c31', + content=['annotation'], + mapping=[[f'{lang}_train.json', 'annotations/train.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='train.json', img_dir='imgs/train'), + parser=dict(type='XFUNDAnnParser'), + packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.zip', + save_name=f'{lang}_val.zip', + md5='f84c2651e350f5b394585207a43d06e4', + content=['image'], + mapping=[[f'{lang}_val/*.jpg', 'imgs/test']]), + dict( + url='https://github.com/doc-analysis/XFUND/' + f'releases/download/v1.0/{lang}.val.json', + save_name=f'{lang}_val.json', + md5='c243c35d1685a16435c8b281a445005c', + content=['annotation'], + mapping=[[f'{lang}_val.json', 'annotations/test.json']]) + ]), + gatherer=dict( + type='MonoGatherer', ann_name='test.json', img_dir='imgs/test'), + parser=dict(type='XFUNDAnnParser'), + 
packer=dict(type='SERPacker'), + dumper=dict(type='JsonDumper'), +) + +delete = ['annotations'] + [f'{lang}_{split}' for split in ['train', 'val']] +config_generator = dict(type='XFUNDSERConfigGenerator') diff --git a/mmocr/__init__.py b/mmocr/__init__.py index faf1ae81e..4524c4c3c 100644 --- a/mmocr/__init__.py +++ b/mmocr/__init__.py @@ -43,7 +43,7 @@ f'<{mmengine_maximum_version}.' mmdet_minimum_version = '3.0.0rc5' -mmdet_maximum_version = '3.2.0' +mmdet_maximum_version = '3.4.0' mmdet_version = digit_version(mmdet.__version__) assert (mmdet_version >= digit_version(mmdet_minimum_version) diff --git a/mmocr/datasets/preparers/config_generators/__init__.py b/mmocr/datasets/preparers/config_generators/__init__.py index 8e884c6d9..69e3b5157 100644 --- a/mmocr/datasets/preparers/config_generators/__init__.py +++ b/mmocr/datasets/preparers/config_generators/__init__.py @@ -3,8 +3,11 @@ from .textdet_config_generator import TextDetConfigGenerator from .textrecog_config_generator import TextRecogConfigGenerator from .textspotting_config_generator import TextSpottingConfigGenerator +from .xfund_config_generator import (XFUNDREConfigGenerator, + XFUNDSERConfigGenerator) __all__ = [ 'BaseDatasetConfigGenerator', 'TextDetConfigGenerator', - 'TextRecogConfigGenerator', 'TextSpottingConfigGenerator' + 'TextRecogConfigGenerator', 'TextSpottingConfigGenerator', + 'XFUNDSERConfigGenerator', 'XFUNDREConfigGenerator' ] diff --git a/mmocr/datasets/preparers/config_generators/xfund_config_generator.py b/mmocr/datasets/preparers/config_generators/xfund_config_generator.py new file mode 100644 index 000000000..0bc243505 --- /dev/null +++ b/mmocr/datasets/preparers/config_generators/xfund_config_generator.py @@ -0,0 +1,147 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional + +from mmocr.registry import CFG_GENERATORS +from .base import BaseDatasetConfigGenerator + + +@CFG_GENERATORS.register_module() +class XFUNDSERConfigGenerator(BaseDatasetConfigGenerator): + """XFUND dataset Semantic Entity Recognition task config generator. + + Args: + data_root (str): The root path of the dataset. + dataset_name (str): The name of the dataset. + overwrite_cfg (bool): Whether to overwrite the dataset config file if + it already exists. If False, config generator will not generate new + config for datasets whose configs are already in base. + train_anns (List[Dict], optional): A list of train annotation files + to appear in the base configs. Defaults to + ``[dict(file='ser_train.json', dataset_postfix='')]``. + Each element is typically a dict with the following fields: + - ann_file (str): The path to the annotation file relative to + data_root. + - dataset_postfix (str, optional): Affects the postfix of the + resulting variable in the generated config. If specified, the + dataset variable will be named in the form of + ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to + None. + val_anns (List[Dict], optional): A list of val annotation files + to appear in the base configs, similar to ``train_anns``. Defaults + to []. + test_anns (List[Dict], optional): A list of test annotation files + to appear in the base configs, similar to ``train_anns``. Defaults + to ``[dict(file='ser_test.json')]``. + config_path (str): Path to the configs. Defaults to 'configs/'. 
+ """ + + def __init__(self, + data_root: str, + dataset_name: str, + overwrite_cfg: bool = False, + train_anns: Optional[List[Dict]] = [ + dict(ann_file='ser_train.json', dataset_postfix='') + ], + val_anns: Optional[List[Dict]] = [], + test_anns: Optional[List[Dict]] = [ + dict(ann_file='ser_test.json', dataset_postfix='') + ], + config_path: str = 'configs/') -> None: + + if '/' in dataset_name: + dataset_name = '_'.join(dataset_name.split('/')) + + super().__init__( + data_root=data_root, + task='ser', + overwrite_cfg=overwrite_cfg, + dataset_name=dataset_name, + train_anns=train_anns, + val_anns=val_anns, + test_anns=test_anns, + config_path=config_path, + ) + + def _gen_dataset_config(self) -> str: + """Generate a full dataset config based on the annotation file + dictionary. + + Args: + ann_dict (dict[str, dict(str, str)]): A nested dictionary that maps + a config variable name (such as icdar2015_textrecog_train) to + its corresponding annotation information dict. Each dict + contains following keys: + - ann_file (str): The path to the annotation file relative to + data_root. + - dataset_postfix (str, optional): Affects the postfix of the + resulting variable in the generated config. If specified, the + dataset variable will be named in the form of + ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults + to None. + - split (str): The split the annotation belongs to. Usually + it can be 'train', 'val' and 'test'. + + Returns: + str: The generated dataset config. + """ + cfg = '' + for key_name, ann_dict in self.anns.items(): + cfg += f'\n{key_name} = dict(\n' + cfg += ' type=\'XFUNDDataset\',\n' + cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501 + cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n' + if ann_dict['split'] in ['test', 'val']: + cfg += ' test_mode=True,\n' + cfg += ' pipeline=None)\n' + return cfg + + +@CFG_GENERATORS.register_module() +class XFUNDREConfigGenerator(BaseDatasetConfigGenerator): + """XFUND dataset Relation Extraction task config generator. 
+ + The main difference with `XFUNDSERConfigGenerator` is: + - train_anns/val_anns/test_anns default file name: + f'{ser or re}_{train or test}.json' + - the value of self.task: 'ser' or 're' + """ + + def __init__(self, + data_root: str, + dataset_name: str, + overwrite_cfg: bool = False, + train_anns: Optional[List[Dict]] = [ + dict(ann_file='re_train.json', dataset_postfix='') + ], + val_anns: Optional[List[Dict]] = [], + test_anns: Optional[List[Dict]] = [ + dict(ann_file='re_test.json', dataset_postfix='') + ], + config_path: str = 'configs/') -> None: + + if '/' in dataset_name: + dataset_name = '_'.join(dataset_name.split('/')) + + super().__init__( + data_root=data_root, + task='re', + overwrite_cfg=overwrite_cfg, + dataset_name=dataset_name, + train_anns=train_anns, + val_anns=val_anns, + test_anns=test_anns, + config_path=config_path, + ) + + def _gen_dataset_config(self) -> str: + """Same as `XFUNDSERConfigGenerator._gen_dataset_config()`""" + cfg = '' + for key_name, ann_dict in self.anns.items(): + cfg += f'\n{key_name} = dict(\n' + cfg += ' type=\'XFUNDDataset\',\n' + cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501 + cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n' + if ann_dict['split'] in ['test', 'val']: + cfg += ' test_mode=True,\n' + cfg += ' pipeline=None)\n' + return cfg diff --git a/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py b/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py index 51b0d266c..c743a4859 100644 --- a/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py +++ b/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py @@ -186,8 +186,7 @@ def move(self, mapping: List[Tuple[str, str]]) -> None: if '*' in src: mkdir_or_exist(dst) for f in glob.glob(src): - if not osp.exists( - osp.join(dst, osp.relpath(f, self.data_root))): + if not osp.exists(osp.join(dst, osp.basename(f))): shutil.move(f, dst) elif osp.exists(src) and not osp.exists(dst): diff --git a/mmocr/datasets/preparers/packers/__init__.py b/mmocr/datasets/preparers/packers/__init__.py index 78eb55dc4..a271a3ce5 100644 --- a/mmocr/datasets/preparers/packers/__init__.py +++ b/mmocr/datasets/preparers/packers/__init__.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from .base import BasePacker +from .re_packer import REPacker +from .ser_packer import SERPacker from .textdet_packer import TextDetPacker from .textrecog_packer import TextRecogCropPacker, TextRecogPacker from .textspotting_packer import TextSpottingPacker @@ -7,5 +9,5 @@ __all__ = [ 'BasePacker', 'TextDetPacker', 'TextRecogPacker', 'TextRecogCropPacker', - 'TextSpottingPacker', 'WildReceiptPacker' + 'TextSpottingPacker', 'WildReceiptPacker', 'SERPacker', 'REPacker' ] diff --git a/mmocr/datasets/preparers/packers/re_packer.py b/mmocr/datasets/preparers/packers/re_packer.py new file mode 100644 index 000000000..62dca972c --- /dev/null +++ b/mmocr/datasets/preparers/packers/re_packer.py @@ -0,0 +1,125 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import warnings +from typing import Dict, Tuple + +import mmcv + +from mmocr.registry import DATA_PACKERS +from .ser_packer import SERPacker + + +@DATA_PACKERS.register_module() +class REPacker(SERPacker): + """Relation Extraction packer. It is used to pack the parsed annotation + info to MMOCR format. + + .. 
code-block:: python + + { + "metainfo": {}, + "data_list": + [ + { + "img_path": "imgs\\test\\zh_val_0.jpg", + "height": 3508, + "width": 2480, + "instances": + { + "texts": ["绩效目标申报表(一级项目)", "项目名称", ...], + "boxes": [[906,195,1478,259], + [357,325,467,357], ...], + "labels": ["header", "question", ...], + "linkings": [[0, 1], [2, 3], ...], + "ids": [0, 1, ...], + "words": [[{ + "box": [ + 904, + 192, + 942, + 253 + ], + "text": "绩" + }, + { + "box": [ + 953, + 192, + 987, + 253 + ], + "text": "效" + }, ...], ...] + } + } + ] + } + """ + + def pack_instance(self, sample: Tuple) -> Dict: + """Pack the parsed annotation info to an MMOCR format instance. + + Args: + sample (Tuple): A tuple of (img_file, instances). + - img_path (str): Path to the image file. + - instances (Sequence[Dict]): A list of converted annos. Each + element should be a dict with the following keys: + + - 'text' + - 'box' + - 'label' + - 'linking' + - 'id' + - 'words' (optional) + + Returns: + Dict: An MMOCR format instance. + """ + + img_path, instances = sample + + img = mmcv.imread(img_path) + h, w = img.shape[:2] + + texts_per_doc = [] + boxes_per_doc = [] + labels_per_doc = [] + linking_per_doc = [] + id_per_doc = [] + has_words = all(['words' in ins for ins in instances]) + if has_words: + words_per_doc = [] + else: + warnings.warn( + 'Not all instance has `words` key,' + 'so final MMOCR format SER instance will not have `words` key') + + for instance in instances: + text = instance.get('text', None) + box = instance.get('box', None) + label = instance.get('label', None) + linking = instance.get('linking', None) + ins_id = instance.get('id', None) + assert text or box or label or linking or ins_id + texts_per_doc.append(text) + boxes_per_doc.append(box) + labels_per_doc.append(label) + linking_per_doc.append(linking) + id_per_doc.append(ins_id) + if has_words: + words = instance.get('words', None) + words_per_doc.append(words) + packed_instances = dict( + instances=dict( + texts=texts_per_doc, + boxes=boxes_per_doc, + labels=labels_per_doc, + linkings=linking_per_doc, + ids=id_per_doc), + img_path=osp.relpath(img_path, self.data_root), + height=h, + width=w) + if has_words: + packed_instances['instances'].update({'words': words_per_doc}) + + return packed_instances diff --git a/mmocr/datasets/preparers/packers/ser_packer.py b/mmocr/datasets/preparers/packers/ser_packer.py new file mode 100644 index 000000000..798cfc4a2 --- /dev/null +++ b/mmocr/datasets/preparers/packers/ser_packer.py @@ -0,0 +1,125 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import warnings +from typing import Dict, List, Tuple + +import mmcv + +from mmocr.registry import DATA_PACKERS +from .base import BasePacker + + +@DATA_PACKERS.register_module() +class SERPacker(BasePacker): + """Semantic Entity Recognition packer. It is used to pack the parsed + annotation info to MMOCR format. + + .. code-block:: python + + { + "metainfo": {}, + "data_list": + [ + { + "img_path": "imgs\\test\\zh_val_0.jpg", + "height": 3508, + "width": 2480, + "instances": + { + "texts": ["绩效目标申报表(一级项目)", "项目名称", ...], + "boxes": [[906,195,1478,259], + [357,325,467,357], ...], + "labels": ["header", "question", ...], + "words": [[{ + "box": [ + 904, + 192, + 942, + 253 + ], + "text": "绩" + }, + { + "box": [ + 953, + 192, + 987, + 253 + ], + "text": "效" + }, ...], ...] + } + } + ] + } + """ + + def pack_instance(self, sample: Tuple) -> Dict: + """Pack the parsed annotation info to an MMOCR format instance. 
+ + Args: + sample (Tuple): A tuple of (img_file, instances). + - img_path (str): Path to the image file. + - instances (Sequence[Dict]): A list of converted annos. Each + element should be a dict with the following keys: + + - 'text' + - 'box' + - 'label' + - 'words' (optional) + + Returns: + Dict: An MMOCR format instance. + """ + + img_path, instances = sample + + img = mmcv.imread(img_path) + h, w = img.shape[:2] + + texts_per_doc = [] + boxes_per_doc = [] + labels_per_doc = [] + has_words = all(['words' in ins for ins in instances]) + if has_words: + words_per_doc = [] + else: + warnings.warn( + 'Not all instance has `words` key,' + 'so final MMOCR format SER instance will not have `words` key') + + for instance in instances: + text = instance.get('text', None) + box = instance.get('box', None) + label = instance.get('label', None) + assert text or box or label + texts_per_doc.append(text) + boxes_per_doc.append(box) + labels_per_doc.append(label) + if has_words: + words = instance.get('words', None) + words_per_doc.append(words) + packed_instances = dict( + instances=dict( + texts=texts_per_doc, + boxes=boxes_per_doc, + labels=labels_per_doc), + img_path=osp.relpath(img_path, self.data_root), + height=h, + width=w) + if has_words: + packed_instances['instances'].update({'words': words_per_doc}) + + return packed_instances + + def add_meta(self, sample: List) -> Dict: + """Add meta information to the sample. + + Args: + sample (List): A list of samples of the dataset. + + Returns: + Dict: A dict contains the meta information and samples. + """ + meta = {'metainfo': {}, 'data_list': sample} + return meta diff --git a/mmocr/datasets/preparers/parsers/__init__.py b/mmocr/datasets/preparers/parsers/__init__.py index fd3794710..620797ae5 100644 --- a/mmocr/datasets/preparers/parsers/__init__.py +++ b/mmocr/datasets/preparers/parsers/__init__.py @@ -12,11 +12,12 @@ from .synthtext_parser import SynthTextAnnParser from .totaltext_parser import TotaltextTextDetAnnParser from .wildreceipt_parser import WildreceiptKIEAnnParser +from .xfund_parser import XFUNDAnnParser __all__ = [ 'BaseParser', 'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser', 'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser', 'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser', 'SROIETextDetAnnParser', 'NAFAnnParser', 'CTW1500AnnParser', - 'SynthTextAnnParser', 'MJSynthAnnParser' + 'SynthTextAnnParser', 'MJSynthAnnParser', 'XFUNDAnnParser' ] diff --git a/mmocr/datasets/preparers/parsers/xfund_parser.py b/mmocr/datasets/preparers/parsers/xfund_parser.py new file mode 100644 index 000000000..e776b0fee --- /dev/null +++ b/mmocr/datasets/preparers/parsers/xfund_parser.py @@ -0,0 +1,44 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os.path as osp +from typing import List + +from mmocr.registry import DATA_PARSERS +from .base import BaseParser + + +@DATA_PARSERS.register_module() +class XFUNDAnnParser(BaseParser): + """XFUND Semantic Entity Recognition and Relation Extraction Annotation + Parser. See dataset_zoo/xfund/xx/sample_anno.md for annotation example. + + Args: + nproc (int): The number of processes to parse the annotation. Defaults + to 1. 
+ """ + + def parse_files(self, img_dir: str, ann_path: str) -> List: + """Parse annotations.""" + assert isinstance(ann_path, str) + samples = list() + for img_fname, instance in self.loader(ann_path): + samples.append((osp.join(img_dir, img_fname), instance)) + return samples + + def loader(self, file_path: str): + with open(file_path, 'r', encoding='utf-8') as f: + data = json.load(f) + for i in range(len(data['documents'])): + img_fname = data['documents'][i]['img']['fname'] + instances = list() + for j in range(len(data['documents'][i]['document'])): + cur_item = data['documents'][i]['document'][j] + instance = dict( + text=cur_item['text'], + box=cur_item['box'], + label=cur_item['label'], + words=cur_item['words'], + linking=cur_item['linking'], + id=cur_item['id']) + instances.append(instance) + yield img_fname, instances diff --git a/projects/LayoutLMv3/README.md b/projects/LayoutLMv3/README.md new file mode 100644 index 000000000..41561997d --- /dev/null +++ b/projects/LayoutLMv3/README.md @@ -0,0 +1,149 @@ +# LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking + +
+
+## Description
+
+This is an implementation of [LayoutLMv3](https://github.com/microsoft/unilm/tree/master/layoutlmv3) based on [MMOCR](https://github.com/open-mmlab/mmocr/tree/dev-1.x), [MMCV](https://github.com/open-mmlab/mmcv), [MMEngine](https://github.com/open-mmlab/mmengine) and [Transformers](https://github.com/huggingface/transformers).
+
+**LayoutLMv3**: Self-supervised pre-training techniques have achieved remarkable progress in Document AI. Most multimodal pre-trained models use a masked language modeling objective to learn bidirectional representations on the text modality, but they differ in pre-training objectives for the image modality. This discrepancy adds difficulty to multimodal representation learning. In this paper, we propose LayoutLMv3 to pre-train multimodal Transformers for Document AI with unified text and image masking. Additionally, LayoutLMv3 is pre-trained with a word-patch alignment objective to learn cross-modal alignment by predicting whether the corresponding image patch of a text word is masked. The simple unified architecture and training objectives make LayoutLMv3 a general-purpose pre-trained model for both text-centric and image-centric Document AI tasks. Experimental results show that LayoutLMv3 achieves state-of-the-art performance not only in text-centric tasks, including form understanding, receipt understanding, and document visual question answering, but also in image-centric tasks such as document image classification and document layout analysis. The code and models are publicly available at https://aka.ms/layoutlmv3.
+
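As a quick orientation to the data this project consumes, the following is a minimal, hypothetical sketch (not part of this PR) of reading the MMOCR-format SER annotation file produced by the new `SERPacker`/`JsonDumper` preparation pipeline for XFUND. The file path and key names are assumptions taken from the `SERPacker` docstring and the `configs/ser/_base_/datasets/xfund_zh.py` config added in this change, not an official API.

```python
# A minimal sketch, assuming the packed-file layout documented in SERPacker's
# docstring ({"metainfo": {...}, "data_list": [...]}) and the default output
# location data/xfund/zh/ser_train.json used by the xfund_zh SER base config.
import json

with open('data/xfund/zh/ser_train.json', 'r', encoding='utf-8') as f:
    packed = json.load(f)

print('number of images:', len(packed['data_list']))
sample = packed['data_list'][0]
print(sample['img_path'], sample['width'], sample['height'])

# Per-image fields packed by SERPacker: parallel lists of texts/boxes/labels
# (plus an optional per-character `words` list when the source provides it).
instances = sample['instances']
for text, box, label in zip(instances['texts'][:3],
                            instances['boxes'][:3],
                            instances['labels'][:3]):
    print(label, box, text)
```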