Commit

Fine README.md (#97)

gongweibao authored May 22, 2020
1 parent 21a0974 commit 0a9c8b1
Showing 10 changed files with 51 additions and 92 deletions.
53 changes: 41 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
@@ -21,27 +21,56 @@ improving the running time of deep learning jobs, EDL optimizes
1. the global utilization of the cluster, and
1. the waiting time of job submitters.

For more about the project EDL, please refer to this [invited blog
post](https://kubernetes.io/blog/2017/12/paddle-paddle-fluid-elastic-learning/)
on the Kubernetes official blog.

## Key Features:
- Efficiency: Provides parallelism strategies to minimize adjustment overheads.
- Consistency: Accuracy is verified on multiple models against runs without scaling.
- Flexibility: Any component can be killed or can join the job at any time.
- Ease of use: Only a few lines of code are needed to support EDL.

## Quick start demo: EDL Resnet50 experiments on a single machine:
We highly **recommend** running it in our Docker container:

1. Start a Jobserver on one node.

```
docker pull hub.baidubce.com/paddle-edl/paddle_edl:latest-cuda10.0-cudnn7
cd example/demo/collective
./start_job_server.sh
```
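The demo scripts assume you are working inside the pulled image. A minimal sketch of entering it interactively (the volume mount, working directory, and `--gpus` flag are assumptions about a local setup, not commands from this repo):

```shell
# Hypothetical helper (not part of the EDL repo) that assembles the
# docker invocation used to enter the demo image interactively.
IMAGE=hub.baidubce.com/paddle-edl/paddle_edl:latest-cuda10.0-cudnn7

edl_docker_cmd() {
    # --gpus all assumes Docker >= 19.03 with the NVIDIA runtime installed.
    echo "docker run --rm -it --gpus all -v $PWD:/workspace -w /workspace $IMAGE /bin/bash"
}

edl_docker_cmd
```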

2. Start a Jobclient which controls the worker process.

```
# Set the ImageNet data path
export PADDLE_EDL_IMAGENET_PATH=<your path>
# Set the checkpoint path
export PADDLE_EDL_FLEET_CHECKPOINT_PATH=<your path>
mkdir -p resnet50_pod
./start_job_client.sh
```
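Both variables must be set before `./start_job_client.sh` is launched; a small pre-flight check can catch a missing one early. The `check_edl_env` helper below is a hypothetical sketch, not part of the repo:

```shell
# Hypothetical pre-flight check: fail fast if either required
# environment variable is unset or empty.
check_edl_env() {
    missing=0
    for var in PADDLE_EDL_IMAGENET_PATH PADDLE_EDL_FLEET_CHECKPOINT_PATH; do
        eval "val=\${$var}"
        if [ -z "$val" ]; then
            echo "error: $var is not set" >&2
            missing=1
        fi
    done
    return $missing
}
```

Running `check_edl_env && ./start_job_client.sh` then launches the client only when both paths are configured.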

3. Experiment results

| total batch size | acc1 | acc5 |
| :-----: | ----: | ----: |
| 1024 | 76.0 | 75.8 |

## Tutorials
- [Run CTR Training and Deployment on Baidu Cloud](./example/ctr/deploy_ctr_on_baidu_cloud_cn.rst)
- [Run EDL distill training demo on Kubernetes or a single node](./example/distill/README.md)
- [Run Elastic Deep Learning Demo on a single node](./example/collective/README.md)

## Design Docs
- A scheduler on Kubernetes:
- [Scheduler](./doc/edl_design_doc.md)
- EDL framework on PaddlePaddle:
- [Fault-Tolerant Training in PaddlePaddle](./doc/fault_tolerance.md)
- [EDL framework](./doc/edl_collective_design_doc.md)
- [EDL Distillation](./doc/edl_distill_design_doc.md)

## Experiments:

- [Auto-scaling Experiment](https://github.com/PaddlePaddle/cloud/blob/develop/doc/edl/experiment/README.md)
- [Distill training on Resnet50](./doc/experiment/distill_resnet50.md)

## Applications:
- EDL Distillation:
- [EDL Distillation design](./doc/edl_distill_design_doc.md)
- [Run EDL distillation training demo on Kubernetes or a single node](./example/distill/README.md)
- [EDL Distillation performance: Resnet50](./doc/experiment/distill_resnet50.md)
- EDL CTR:
- [EDL CTR training and deployment on Baidu Cloud](./example/ctr/deploy_ctr_on_baidu_cloud_cn.rst)

## FAQ

2 changes: 1 addition & 1 deletion doc/edl_distill_design_doc.md
@@ -21,4 +21,4 @@ EDL Distillation is a large scale and universal solution for knowledge distillat
## Balancer

## Reference
1. <div id="r_1">[Distilling the Knowledge in a Neural Network](https://arxiv.org/pdf/1503.02531.pdf)</div>
<a name="r_1"></a> 1.[Distilling the Knowledge in a Neural Network](https://arxiv.org/pdf/1503.02531.pdf)
1 change: 0 additions & 1 deletion docker/Dockerfile
@@ -13,7 +13,6 @@ ENV PATH=$PATH:{GOROOT}/bin:${GOPATH}/bin
# python
ADD ./docker/requirements.txt /root/paddle_edl/requirements.txt
RUN python -m pip install -r /root/paddle_edl/requirements.txt
#RUN python -m pip install paddlepaddle-gpu -i http://mirrors.aliyun.com/pypi/simple/

#etcd
ENV HOME /root
38 changes: 0 additions & 38 deletions example/collective/README.md

This file was deleted.

37 changes: 0 additions & 37 deletions example/collective/README_cn.md

This file was deleted.

4 changes: 4 additions & 0 deletions example/demo/collective/env.sh
@@ -0,0 +1,4 @@
# Specify the ImageNet data directory path
export PADDLE_EDL_IMAGENET_PATH=/root/go/dataset/ImageNet
# Specify the `checkpoint` directory used to save checkpoints
export PADDLE_EDL_FLEET_CHECKPOINT_PATH=/root/go/checkpoints/resnet50_1
@@ -15,7 +15,7 @@ while true ; do
done


src_dir=../../../../example/collective/resnet50
src_dir=../../../collective/resnet50
dst_dir=resnet50_pod/${pod_id}

echo "mkdir resnet50_pod/${pod_id}"
@@ -15,7 +15,7 @@ export PADDLE_POD_ID="not set"
BASEDIR=$(dirname $(readlink -f $0))
echo $BASEDIR

nohup python -u ${BASEDIR}/job_client_demo.py \
nohup python -u paddle_edl.demo.collective.job_client_demo \
--log_level 20 \
--package_sh ./resnet50/package.sh \
--pod_path ./resnet50_pod \
@@ -8,7 +8,7 @@ echo "node_ips:${node_ips}"
BASEDIR=$(dirname $(readlink -f $0))
echo "${BASEDIR}"

nohup python -u ${BASEDIR}/job_server_demo.py \
nohup python -u paddle_edl.demo.collective.job_server_demo \
--node_ips ${node_ips} \
--pod_num_of_node 8 \
--time_interval_to_change 900 \
2 changes: 2 additions & 0 deletions python/paddle_edl/tests/unittests/test_edl.sh
@@ -10,6 +10,8 @@ echo "${BASEDIR}"

rm -rf job_server.log job_client.log ./edl_demo_log

echo "python path:" $PYTHONPATH

nohup python -m paddle_edl.demo.collective.job_server_demo --pod_num_of_node 2 \
--time_interval_to_change 900 \
--gpu_num_of_node 2 \
