From 0a9c8b114c282bbe79ff2e14c990e61aa2d79018 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Fri, 22 May 2020 15:02:48 +0800 Subject: [PATCH] Fine README.md (#97) --- README.md | 53 ++++++++++++++----- doc/edl_distill_design_doc.md | 2 +- docker/Dockerfile | 1 - example/collective/README.md | 38 ------------- example/collective/README_cn.md | 37 ------------- example/demo/collective/env.sh | 4 ++ .../demo/collective/resnet50/package.sh | 2 +- .../demo/collective/start_job_client.sh | 2 +- .../demo/collective/start_job_server.sh | 2 +- python/paddle_edl/tests/unittests/test_edl.sh | 2 + 10 files changed, 51 insertions(+), 92 deletions(-) delete mode 100644 example/collective/README.md delete mode 100644 example/collective/README_cn.md create mode 100644 example/demo/collective/env.sh rename {python/paddle_edl => example}/demo/collective/resnet50/package.sh (94%) rename {python/paddle_edl => example}/demo/collective/start_job_client.sh (90%) rename {python/paddle_edl => example}/demo/collective/start_job_server.sh (84%) diff --git a/README.md b/README.md index 3deed38d..4ada122b 100644 --- a/README.md +++ b/README.md @@ -21,14 +21,41 @@ improving the running time of deep learning jobs, EDL optimizes 1. the global utilization of the cluster, and 1. the waiting time of job submitters. -For more about the project EDL, please refer to this [invited blog -post](https://kubernetes.io/blog/2017/12/paddle-paddle-fluid-elastic-learning/) -on the Kubernetes official blog. +## Key Features: +- Efficiency: Provides parallelism strategies to minimize adjustment overheads. +- Consistency: Accuracy verification on multiple models compared to those without scaling. +- Flexibility: Any components can be killed or joined at any time. +- Easy to use: Few lines of code need to be added to support EDL. + +## Quick start demo: EDL Resnet50 experiments on a single machine: +We highly **recommend** you run it in our docker: + +1. Start a Jobserver on one node. 
+ +``` +docker pull hub.baidubce.com/paddle-edl/paddle_edl:latest-cuda10.0-cudnn7 +cd example/demo/collective +./start_job_server.sh +``` + +2. Start a Jobclient which controls the worker process. + +``` +#Set the ImageNet data path +export PADDLE_EDL_IMAGENET_PATH= +#Set the checkpoint path +export PADDLE_EDL_FLEET_CHECKPOINT_PATH= + +mkdir -p resnet50_pod +./start_job_client.sh +``` + +3. Experiments result + +| total batch size | acc1 | acc5 | +| :-----: | ----: | ----: | +| 1024 | 76.0 | 75.8 | -## Tutorials -- [Run CTR Training and Deployment on Baidu Cloud](./example/ctr/deploy_ctr_on_baidu_cloud_cn.rst) -- [Run EDL distill training demo on Kubernetes or a single node](./example/distill/README.md) -- [Run Elastic Deep Learning Demo on a sinle node](./example/collective/README.md) ## Design Docs - A scheduler on Kubernetes: @@ -36,12 +63,14 @@ on the Kubernetes official blog. - EDL framework on PaddlePaddle: - [Fault-Tolerant Training in PaddlePaddle](./doc/fault_tolerance.md) - [EDL framework](./doc/edl_collective_design_doc.md) - - [EDL Distillation](./doc/edl_distill_design_doc.md) -## Experiments: - -- [Auto-scaling Experiment](https://github.com/PaddlePaddle/cloud/blob/develop/doc/edl/experiment/README.md) -- [Distill training on Resnet50](./doc/experiment/distill_resnet50.md) +## Applications: +- EDL Distillation: + - [EDL Distillation design](./doc/edl_distill_design_doc.md) + - [Run EDL distillation training demo on Kubernetes or a single node](./example/distill/README.md) + - [EDL Distillation performance: Resnet50](./doc/experiment/distill_resnet50.md) +- EDL CTR + - [EDL CTR training and deployment on Baidu Cloud](./example/ctr/deploy_ctr_on_baidu_cloud_cn.rst) ## FAQ diff --git a/doc/edl_distill_design_doc.md b/doc/edl_distill_design_doc.md index cf3ec9db..b65493ed 100644 --- a/doc/edl_distill_design_doc.md +++ b/doc/edl_distill_design_doc.md @@ -21,4 +21,4 @@ EDL Distillation is a large scale and universal solution for knowledge distillat ## 
Balancer ## Reference -1.
[Distilling the Knowledge in a Neural Network](https://arxiv.org/pdf/1503.02531.pdf)
+ 1.[Distilling the Knowledge in a Neural Network](https://arxiv.org/pdf/1503.02531.pdf) diff --git a/docker/Dockerfile b/docker/Dockerfile index 66db94f6..212a2d41 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -13,7 +13,6 @@ ENV PATH=$PATH:{GOROOT}/bin:${GOPATH}/bin # python ADD ./docker/requirements.txt /root/paddle_edl/requirements.txt RUN python -m pip install -r /root/paddle_edl/requirements.txt -#RUN python -m pip install paddlepaddle-gpu -i http://mirrors.aliyun.com/pypi/simple/ #etcd ENV HOME /root diff --git a/example/collective/README.md b/example/collective/README.md deleted file mode 100644 index 959f844c..00000000 --- a/example/collective/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# Introduction -This demo is for developers of EDL: you can test Paddle EDL function without a Kubernetes cluster. And it's simple to test it on a none or multiple nodes. -Of course, this is also a toy. You can play with it! -Have fun! - -# Install -1. Install EDL from source - -``` -git clone https://github.com/PaddlePaddle/edl -cd edl -mkdir build & cd build -cmake .. -pip install ./python/dist/paddle_edl-0.0.0-py2-none-any.whl -``` - -2. Install EDL using `pip install paddle_edl`. - -# Run the demo on a single node -1. Start a Jobserver on one node. - -``` -git clone https://github.com/PaddlePaddle/edl -cd python/edl/demo/collective -./start_job_server.sh -``` - -2. Start a Jobclient on every node. Jobclient controls the POD process. - -``` -#Set the ImageNet data path -export PADDLE_EDL_IMAGENET_PATH= -#Set the checkpoint path -export PADDLE_EDL_FLEET_CHECKPOINT_PATH= - -mkdir -p resnet50_pod -./start_job_client.sh -``` diff --git a/example/collective/README_cn.md b/example/collective/README_cn.md deleted file mode 100644 index 6bb22cbb..00000000 --- a/example/collective/README_cn.md +++ /dev/null @@ -1,37 +0,0 @@ -# 前言 -在单节点或者多个节点(物理机器或者虚拟机或者Docker之类的)搭建EDL主要是为开发者准备的:没有集群的情况下也可以对Paddle(计算引擎)模拟进行EDL的测试。 -当然,这个过程也有点意思,看着训练进程起起伏伏而且不影响最后的结果,还是蛮有意思的。 -Have fun! 
- -# 安装EDL -1. 你可以从源代码编译安装 - -``` -git clone https://github.com/PaddlePaddle/edl -cd edl -mkdir build & cd build -cmake .. -pip install ./python/dist/paddle_edl-0.0.0-py2-none-any.whl -``` - -2. 也可以直接使用`pip`安装我们发布的版本`pip install paddle_edl` - -# demo搭建步骤:以单节点为例 -1. 我们需要在一个节点上启动JobServer的demo,用来记录训练任务的Pod信息。 - -``` -git clone https://github.com/PaddlePaddle/edl -cd python/paddle_edl/demo/collective -./start_job_server.sh -``` -2. 我们需要在(各个)节点上启动一个JobClient的demo,用来管理训练的Pod进程。 - -``` -#指定ImageNet的数据目录路径 -export PADDLE_EDL_IMAGENET_PATH= -#指定`checkpoint`的目录,用来保存checkpoint -export PADDLE_EDL_FLEET_CHECKPOINT_PATH= - -mkdir -p resnet50_pod -./start_job_client.sh -``` diff --git a/example/demo/collective/env.sh b/example/demo/collective/env.sh new file mode 100644 index 00000000..568c1893 --- /dev/null +++ b/example/demo/collective/env.sh @@ -0,0 +1,4 @@ +#指定ImageNet的数据目录路径 +export PADDLE_EDL_IMAGENET_PATH=/root/go/dataset/ImageNet +#指定`checkpoint`的目录,用来保存checkpoint +export PADDLE_EDL_FLEET_CHECKPOINT_PATH=/root/go/checkpoints/resnet50_1 diff --git a/python/paddle_edl/demo/collective/resnet50/package.sh b/example/demo/collective/resnet50/package.sh similarity index 94% rename from python/paddle_edl/demo/collective/resnet50/package.sh rename to example/demo/collective/resnet50/package.sh index f0762961..48ea7204 100755 --- a/python/paddle_edl/demo/collective/resnet50/package.sh +++ b/example/demo/collective/resnet50/package.sh @@ -15,7 +15,7 @@ while true ; do done -src_dir=../../../../example/collective/resnet50 +src_dir=../../../collective/resnet50 dst_dir=resnet50_pod/${pod_id} echo "mkdir resnet50_pod/${pod_id}" diff --git a/python/paddle_edl/demo/collective/start_job_client.sh b/example/demo/collective/start_job_client.sh similarity index 90% rename from python/paddle_edl/demo/collective/start_job_client.sh rename to example/demo/collective/start_job_client.sh index 5d586720..8fcbcdea 100755 --- a/python/paddle_edl/demo/collective/start_job_client.sh +++ 
b/example/demo/collective/start_job_client.sh @@ -15,7 +15,7 @@ export PADDLE_POD_ID="not set" BASEDIR=$(dirname $(readlink -f $0)) echo $BASEDIR -nohup python -u ${BASEDIR}/job_client_demo.py \ +nohup python -u paddle_edl.demo.collective.job_client_demo \ --log_level 20 \ --package_sh ./resnet50/package.sh \ --pod_path ./resnet50_pod \ diff --git a/python/paddle_edl/demo/collective/start_job_server.sh b/example/demo/collective/start_job_server.sh similarity index 84% rename from python/paddle_edl/demo/collective/start_job_server.sh rename to example/demo/collective/start_job_server.sh index 35d322d5..6b7f7ae4 100755 --- a/python/paddle_edl/demo/collective/start_job_server.sh +++ b/example/demo/collective/start_job_server.sh @@ -8,7 +8,7 @@ echo "node_ips:${node_ips}" BASEDIR=$(dirname $(readlink -f $0)) echo "${BASEDIR}" -nohup python -u ${BASEDIR}/job_server_demo.py \ +nohup python -u paddle_edl.demo.collective.job_server_demo \ --node_ips ${node_ips} \ --pod_num_of_node 8 \ --time_interval_to_change 900 \ diff --git a/python/paddle_edl/tests/unittests/test_edl.sh b/python/paddle_edl/tests/unittests/test_edl.sh index 3268bb47..3ce56e67 100755 --- a/python/paddle_edl/tests/unittests/test_edl.sh +++ b/python/paddle_edl/tests/unittests/test_edl.sh @@ -10,6 +10,8 @@ echo "${BASEDIR}" rm -rf job_server.log job_client.log ./edl_demo_log +echo "python path:" $PYTHONPATH + nohup python -m paddle_edl.demo.collective.job_server_demo --pod_num_of_node 2 \ --time_interval_to_change 900 \ --gpu_num_of_node 2 \