add scripts about pre-training and fine-tuning
Ryuto10 committed Nov 2, 2020
1 parent 1d7dbb7 commit b3cac6b
Showing 7 changed files with 540 additions and 0 deletions.
233 changes: 233 additions & 0 deletions .gitignore
@@ -0,0 +1,233 @@
.idea/
.pyenv/
jaen/data/
jaen/work/

### macOS
# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon


# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

### Python
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

### JetBrains
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser

# Local clone of fairseq (pinned to a fixed version)
fairseq
43 changes: 43 additions & 0 deletions src/preprocess_fairseq.sh
@@ -0,0 +1,43 @@
#!/bin/bash

# Usage: bash preprocess_fairseq.sh [TRAIN PREFIX] [DEV PREFIX] [OUT PATH] [SPM VOCAB PATH]

TRAIN=$(readlink -f "$1")
DEV=$(readlink -f "$2")
PRE_PROCESSED_DIR=$(readlink -f "$3")
SPM_VOCAB=$(readlink -f "$4")

SRC_LANG="context"
TRG_LANG="response"
FAIRSEQ_VOCAB=${PRE_PROCESSED_DIR}/fairseq_vocab.txt
N_WORKER=12

echo "Train-context:" ${TRAIN}.${SRC_LANG}
echo "Train-response:" ${TRAIN}.${TRG_LANG}
echo "Dev-context:" ${DEV}.${SRC_LANG}
echo "Dev-response:" ${DEV}.${TRG_LANG}
echo "Output:" ${PRE_PROCESSED_DIR}
echo
echo "sentencepiece vocab:" ${SPM_VOCAB}
echo

mkdir -p ${PRE_PROCESSED_DIR}
# Build a fairseq dictionary from the SentencePiece vocab: drop the first three
# lines (the SentencePiece special tokens) and append a dummy frequency of 100.
cut -f1 ${SPM_VOCAB} | tail -n +4 | sed "s/$/ 100/g" > ${FAIRSEQ_VOCAB}

echo "Create:" ${FAIRSEQ_VOCAB}
echo
echo "Your fairseq version:"
pip list | grep fairseq
echo

set -x

fairseq-preprocess \
--source-lang ${SRC_LANG} \
--target-lang ${TRG_LANG} \
--trainpref ${TRAIN} \
--validpref ${DEV} \
--destdir ${PRE_PROCESSED_DIR} \
--srcdict ${FAIRSEQ_VOCAB} \
--joined-dictionary \
--workers ${N_WORKER}
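
For reference, a hypothetical invocation of this script (the paths below are placeholders, not files from this commit). The first two arguments are prefixes that fairseq-preprocess expands to <prefix>.context and <prefix>.response:

bash preprocess_fairseq.sh data/train data/dev work/preprocessed spm/sp.vocab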
42 changes: 42 additions & 0 deletions src/setting_fine_tuning.sh
@@ -0,0 +1,42 @@
#! /bin/sh

# your own paths (change these)
WORK_DIR="./fine_tuning"
DATA_DIR="/path/to/fine_tuning_data"
PRETRAINED_MODEL="/path/to/pre_trained_model_checkpoint.pt"

# path & extension
MODEL_DIR="${WORK_DIR}/fine_tuned_models"
TENSORBOARD_DIR="${WORK_DIR}/fine_tuned_tensorboard_log"
SRC_LANG="context"
TRG_LANG="response"

# model parameters
ENC_EMB=1024
ENC_FFN=8192
ENC_LAYER=2
ENC_HEAD=32
DEC_EMB=${ENC_EMB}
DEC_FFN=${ENC_FFN}
DEC_LAYER=16
DEC_HEAD=${ENC_HEAD}

# optimizer setting
GPU=0
MAX_TOKEN=2000
MAX_UPDATE=$((10000 + 400000)) # fine-tuning updates + pre-training updates
UFREQ=16
WARMUP_STEP=5000 # {100, 500, 1000, 5000}
INIT_LR=1e-07
LR=1e-04 # {1e-04, 5e-05, 1e-05, 5e-06}
MIN_LR=1e-09

# save & log
KEEP_LAST_EPOCH=1
KEEP_LAST_UPD=5
SAVE_UPD=2000
LOG_UPD=2000

# misc
NUM_WORKERS=15
SEED=2020
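
The training script that consumes these variables is not shown in this excerpt. As a rough sketch of how they might be passed to fairseq-train (flag names are from the fairseq CLI; the transformer arch, adam optimizer, and inverse_sqrt scheduler are assumptions, and the commit's actual train script may differ):

# Sketch only, assuming the settings file above has been sourced.
CUDA_VISIBLE_DEVICES=${GPU} fairseq-train ${DATA_DIR} \
  --source-lang ${SRC_LANG} --target-lang ${TRG_LANG} \
  --arch transformer \
  --encoder-embed-dim ${ENC_EMB} --encoder-ffn-embed-dim ${ENC_FFN} \
  --encoder-layers ${ENC_LAYER} --encoder-attention-heads ${ENC_HEAD} \
  --decoder-embed-dim ${DEC_EMB} --decoder-ffn-embed-dim ${DEC_FFN} \
  --decoder-layers ${DEC_LAYER} --decoder-attention-heads ${DEC_HEAD} \
  --optimizer adam --lr-scheduler inverse_sqrt \
  --warmup-updates ${WARMUP_STEP} --warmup-init-lr ${INIT_LR} \
  --lr ${LR} --min-lr ${MIN_LR} \
  --max-tokens ${MAX_TOKEN} --max-update ${MAX_UPDATE} --update-freq ${UFREQ} \
  --restore-file ${PRETRAINED_MODEL} \
  --save-dir ${MODEL_DIR} --tensorboard-logdir ${TENSORBOARD_DIR} \
  --keep-last-epochs ${KEEP_LAST_EPOCH} --keep-interval-updates ${KEEP_LAST_UPD} \
  --save-interval-updates ${SAVE_UPD} --log-interval ${LOG_UPD} \
  --num-workers ${NUM_WORKERS} --seed ${SEED}

Restoring from the pre-trained checkpoint keeps its update counter, which is presumably why MAX_UPDATE adds the 400,000 pre-training updates to the 10,000 fine-tuning updates.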
41 changes: 41 additions & 0 deletions src/setting_pre_training.sh
@@ -0,0 +1,41 @@
#! /bin/sh

# your own paths (change these)
WORK_DIR="./pre_training"
DATA_DIR="/path/to/pre_training_data"

# path & extension
MODEL_DIR="${WORK_DIR}/pre_trained_models"
TENSORBOARD_DIR="${WORK_DIR}/pre_trained_tensorboard_logs"
SRC_LANG="context"
TRG_LANG="response"

# model parameters
ENC_EMB=1024
ENC_FFN=8192
ENC_LAYER=2
ENC_HEAD=32
DEC_EMB=${ENC_EMB}
DEC_FFN=${ENC_FFN}
DEC_LAYER=16
DEC_HEAD=${ENC_HEAD}

# optimizer setting
GPU=0
MAX_TOKEN=2000
MAX_UPDATE=400000
UFREQ=16
WARMUP_STEP=3125 # {500, 1000, 2000, 3125, 5000}
INIT_LR=1e-07
LR=1e-03 # {1e-03, 5e-04, 2e-04, 1e-04, 5e-05, 2e-05}
MIN_LR=1e-09

# save & log
KEEP_LAST_EPOCH=1
KEEP_LAST_UPD=10
SAVE_UPD=20000
LOG_UPD=2500

# misc
NUM_WORKERS=15
SEED=2020
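
Pre-training presumably uses the same kind of fairseq-train invocation as sketched after the fine-tuning settings, just without --restore-file; a train script would source this file first, e.g.:

. ./setting_pre_training.sh   # hypothetical usage; the actual train script is not shown in this excerpt
echo "pre-training for ${MAX_UPDATE} updates, peak lr ${LR}, warmup ${WARMUP_STEP}"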