Skip to content

Commit

Permalink
chore: update neuron unit tests to Neuron SDK 2.21.0
Browse files Browse the repository at this point in the history
  • Loading branch information
mselim00 committed Jan 13, 2025
1 parent eed33db commit da28bc4
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 54 deletions.
2 changes: 1 addition & 1 deletion test/cases/neuron/manifests/k8s-neuron-device-plugin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ spec:
- trn2.48xlarge
containers:
# Find all neuron-device-plugin images at https://gallery.ecr.aws/neuron/neuron-device-plugin
- image: public.ecr.aws/neuron/neuron-device-plugin:2.19.16.0
- image: public.ecr.aws/neuron/neuron-device-plugin:2.23.30.0
imagePullPolicy: Always
name: neuron-device-plugin
env:
Expand Down
106 changes: 53 additions & 53 deletions test/images/neuron/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# Base image for the Neuron test image.
# Rendered-diff pair: the first FROM is the line removed by this commit (ubuntu:20.04),
# the second is its replacement (ubuntu:22.04).
# NOTE(review): the tag is not pinned by digest, so rebuilds may pick up a different
# image over time — confirm whether that is acceptable for this test image.
FROM public.ecr.aws/docker/library/ubuntu:20.04
FROM public.ecr.aws/docker/library/ubuntu:22.04

# Neuron SDK components version numbers
# Rendered-diff pair: the first six ARG lines are the pins removed by this commit,
# the following six are the pins that replace them.
# The COLLECTIVES_LIB, RUNTIME_LIB and TOOLS values are used as apt package versions
# (`aws-neuronx-*=<version>`); the DISTRIBUTED, CC and FRAMEWORK values are used as
# pip versions (`<package>==<version>`).
ARG NEURONX_DISTRIBUTED_VERSION=0.8.0
ARG NEURONX_CC_VERSION=2.15.128.0
ARG NEURONX_FRAMEWORK_VERSION=2.1.2.2.3.0
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.22.26.0-17a033bc8
ARG NEURONX_RUNTIME_LIB_VERSION=2.22.14.0-6e27b8d5b
ARG NEURONX_TOOLS_VERSION=2.19.0.0
ARG NEURONX_DISTRIBUTED_VERSION=0.10.0
ARG NEURONX_CC_VERSION=2.16.345.0
ARG NEURONX_FRAMEWORK_VERSION=2.5.1.2.4.0
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.23.133.0-3e70920f2
ARG NEURONX_RUNTIME_LIB_VERSION=2.23.110.0-9b5179492
ARG NEURONX_TOOLS_VERSION=2.20.204.0

# PYTHON is the interpreter name that gets symlinked to /usr/local/bin/python;
# PYTHON_VERSION selects the CPython source tarball that is downloaded and built.
ARG PYTHON=python3.10
ARG PYTHON_VERSION=3.10.12
Expand All @@ -33,8 +33,8 @@ ENV PATH /opt/aws/neuron/bin/:$PATH
# Persist DGLBACKEND=pytorch in the image environment (ENV, so it is also set at runtime).
# presumably selects the PyTorch backend for DGL — confirm against the test scripts
ENV DGLBACKEND=pytorch

RUN apt-get update \
&& apt-get upgrade -y \
&& apt-get install -y --no-install-recommends \
&& apt-get upgrade -y \
&& apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
cmake \
Expand All @@ -60,8 +60,8 @@ RUN apt-get update \
libcap-dev \
gnupg2 \
gpg-agent \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean

# Create an unprivileged "ubuntu" account (home directory created, bash as login shell)
# so mpirun does not need --allow-run-as-root.
RUN useradd --create-home --shell /bin/bash ubuntu
Expand All @@ -77,36 +77,36 @@ RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sour
# Trust the AWS Neuron apt repository signing key.
# apt-key is deprecated, so the armored key is downloaded to a temporary file,
# converted to a binary keyring under /etc/apt/trusted.gpg.d (which apt reads without
# any change to the sources entry), and the temporary file is removed in the same layer.
# Downloading to a file instead of piping also makes a failed wget stop the build here.
RUN wget -qO /tmp/aws-neuron-apt-key.pub https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB \
    && gpg --batch --yes --output /etc/apt/trusted.gpg.d/aws-neuron.gpg --dearmor /tmp/aws-neuron-apt-key.pub \
    && chmod 0644 /etc/apt/trusted.gpg.d/aws-neuron.gpg \
    && rm -f /tmp/aws-neuron-apt-key.pub

# Install the Neuron tools, collectives and runtime-lib apt packages at the exact
# versions given by the NEURONX_*_VERSION build args, then remove the apt lists,
# /tmp/tmp* and the apt cache in the same layer.
# Rendered diff: the `apt-get install -y` line and the three cleanup lines each appear
# twice (removed, then re-added); the visible text is identical, so this is presumably
# a whitespace-only change.
# NOTE(review): the hunk header above shows the Neuron apt source registered for the
# `focal` suite while this commit moves the base image to ubuntu:22.04 — confirm
# whether the suite should be updated to match.
RUN apt-get update \
&& apt-get install -y \
&& apt-get install -y \
aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean
# (re-added copy of the three cleanup lines above)
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean

# Install Open MPI
# Builds Open MPI ${OMPI_VERSION} (v4.1 release series) from the upstream source
# tarball in /tmp/openmpi, installs it, refreshes the linker cache with ldconfig and
# deletes the build directory in the same layer.
# OMPI_VERSION is declared outside the lines shown here.
# Rendered diff: the continuation lines appear twice (removed, then re-added); the
# visible text is identical, so this is presumably a whitespace-only change.
# NOTE(review): the tarball is downloaded without a checksum or signature check —
# confirm whether verification is wanted.
RUN mkdir -p /tmp/openmpi \
&& cd /tmp/openmpi \
&& wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \
&& tar zxf openmpi-${OMPI_VERSION}.tar.gz \
&& cd openmpi-${OMPI_VERSION} \
&& ./configure --enable-orterun-prefix-by-default \
&& make -j $(nproc) all \
&& make install \
&& ldconfig \
&& rm -rf /tmp/openmpi
# (re-added copy of the continuation lines above)
&& cd /tmp/openmpi \
&& wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \
&& tar zxf openmpi-${OMPI_VERSION}.tar.gz \
&& cd openmpi-${OMPI_VERSION} \
&& ./configure --enable-orterun-prefix-by-default \
&& make -j $(nproc) all \
&& make install \
&& ldconfig \
&& rm -rf /tmp/openmpi

# install Python
# Builds CPython $PYTHON_VERSION from the python.org source tarball with a shared
# libpython under /usr/local, links /usr/bin/pip -> /usr/local/bin/pip3 and
# /usr/local/bin/python -> /usr/local/bin/$PYTHON, then upgrades pip and setuptools.
# ${PIP} is defined outside the lines shown here.
# Rendered diff: the continuation lines appear twice (removed, then re-added); the
# visible text is identical, so this is presumably a whitespace-only change.
# NOTE(review): after `cd ..` the tarball and source tree are in the current
# directory, but the cleanup targets `../Python-$PYTHON_VERSION*`. That only removes
# them when the working directory is `/` (where `..` is `/` again) — confirm the
# WORKDIR, otherwise the sources stay in the layer.
RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
&& tar -xzf Python-$PYTHON_VERSION.tgz \
&& cd Python-$PYTHON_VERSION \
&& ./configure --enable-shared --prefix=/usr/local \
&& make -j $(nproc) && make install \
&& cd .. && rm -rf ../Python-$PYTHON_VERSION* \
&& ln -s /usr/local/bin/pip3 /usr/bin/pip \
&& ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
&& ${PIP} --no-cache-dir install --upgrade \
# (re-added copy of the continuation lines above)
&& tar -xzf Python-$PYTHON_VERSION.tgz \
&& cd Python-$PYTHON_VERSION \
&& ./configure --enable-shared --prefix=/usr/local \
&& make -j $(nproc) && make install \
&& cd .. && rm -rf ../Python-$PYTHON_VERSION* \
&& ln -s /usr/local/bin/pip3 /usr/bin/pip \
&& ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
&& ${PIP} --no-cache-dir install --upgrade \
pip \
setuptools

Expand All @@ -132,9 +132,9 @@ RUN ${PIP} install --no-cache-dir -U \

# Make the system CA bundle also available at /etc/pki/tls/certs/ca-bundle.crt,
# creating the target directory first.
RUN mkdir -p /etc/pki/tls/certs \
    && cp /etc/ssl/certs/ca-certificates.crt \
          /etc/pki/tls/certs/ca-bundle.crt
# Register the Neuron pip repository as a global extra index, then force-reinstall
# torch-neuronx, neuronx-cc and neuronx_distributed at the versions pinned by the
# NEURONX_*_VERSION build args. neuronx_distributed is installed with --no-deps.
# Rendered diff: the three install lines appear twice (removed, then re-added); the
# visible text is identical, so this is presumably a whitespace-only change.
# NOTE(review): --extra-index-url is passed on every install although the same URL was
# just written to the global pip config — looks redundant, confirm.
# NOTE(review): these installs do not use --no-cache-dir, unlike the other pip
# invocations visible in this diff — confirm whether the pip cache should be kept.
RUN ${PIP} config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \
&& ${PIP} install --force-reinstall torch-neuronx==$NEURONX_FRAMEWORK_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
&& ${PIP} install --force-reinstall neuronx-cc==$NEURONX_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
&& ${PIP} install --force-reinstall --no-deps neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
# (re-added copy of the three install lines above)
&& ${PIP} install --force-reinstall torch-neuronx==$NEURONX_FRAMEWORK_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
&& ${PIP} install --force-reinstall neuronx-cc==$NEURONX_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
&& ${PIP} install --force-reinstall --no-deps neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com

# attrs, neuronx-cc required: >=19.2.0, sagemaker <24,>=23.1.0
# protobuf neuronx-cc<4, sagemaker-training >=3.9.2,<=3.20.3
Expand All @@ -152,20 +152,20 @@ RUN ${PIP} install --no-cache-dir -U \
# The EFA installer runs apt-get itself, so the package index is refreshed first.
# NOTE(review): `apt-get update` sits in its own layer here; a cached copy of that
# layer can leave the installer working from a stale index — consider whether it
# should be part of the installer RUN.
RUN apt-get update
# Download the "latest" EFA installer tarball into $HOME, import the EFA public key,
# print the keyring fingerprints, verify the tarball against its detached signature,
# unpack it and run efa_installer.sh.
# Rendered diff: the continuation lines appear twice (removed, then re-added); the
# visible text is identical, so this is presumably a whitespace-only change.
# NOTE(review): `-latest` is unpinned, so the installed EFA version depends on build time.
# NOTE(review): `gpg --fingerprint` only prints fingerprints; nothing here compares
# them with an expected value — confirm whether the key should be pinned.
# NOTE(review): the tarball, key, signature and unpacked directory are not removed
# from $HOME in this layer.
# NOTE(review): the meaning of the efa_installer.sh flags (-y -g -d --skip-kmod
# --skip-limit-conf --no-verify) is not visible here — check the EFA installer docs.
RUN cd $HOME \
&& curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
&& wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
&& cat aws-efa-installer.key | gpg --fingerprint \
&& wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
&& tar -xf aws-efa-installer-latest.tar.gz \
&& cd aws-efa-installer \
&& ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
&& cd $HOME
# (re-added copy of the continuation lines above)
&& curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
&& wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
&& cat aws-efa-installer.key | gpg --fingerprint \
&& wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
&& tar -xf aws-efa-installer-latest.tar.gz \
&& cd aws-efa-installer \
&& ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
&& cd $HOME


# Clean up after apt update
# Removes the apt package lists, /tmp/tmp* and the apt cache.
# Rendered diff: the two continuation lines appear twice (removed, then re-added); the
# visible text is identical, so this is presumably a whitespace-only change.
# NOTE(review): this runs in its own layer, so files created by earlier RUN
# instructions are hidden in the final filesystem but still stored in those earlier
# layers; the image does not get smaller.
RUN rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean
# (re-added copy of the two continuation lines above)
&& rm -rf /tmp/tmp* \
&& apt-get clean

# Install some common packages used by training scripts
# torchvision needed for MLP. since it depends on torch and torch neuron/torch
Expand All @@ -175,14 +175,14 @@ RUN pip3 install --no-cache-dir --no-deps -U \


# OSS compliance step: download oss_compliance.zip into /root, unpack it, install the
# bundled testOSSCompliance script to /usr/local/bin, run generate_oss_compliance.sh
# with the home directory and the ${PYTHON} interpreter name as arguments, then delete
# the archive, the unpacked directory and /tmp/tmp* in the same layer.
# Rendered diff: the continuation lines appear twice (removed, then re-added); the
# visible text is identical, so this is presumably a whitespace-only change.
# NOTE(review): the archive is fetched without a checksum and curl is not given
# --fail; a failed download would only surface when unzip rejects the file.
RUN HOME_DIR=/root \
&& curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
&& unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
&& cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
&& chmod +x /usr/local/bin/testOSSCompliance \
&& chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
&& ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
&& rm -rf ${HOME_DIR}/oss_compliance* \
&& rm -rf /tmp/tmp*
# (re-added copy of the continuation lines above)
&& curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
&& unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
&& cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
&& chmod +x /usr/local/bin/testOSSCompliance \
&& chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
&& ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
&& rm -rf ${HOME_DIR}/oss_compliance* \
&& rm -rf /tmp/tmp*

# Store the license text at /license.txt.
# --fail makes curl exit non-zero on an HTTP error, so a missing or forbidden object
# fails the build instead of silently saving the error response as the license file.
# NOTE(review): the URL path says pytorch-2.1 while NEURONX_FRAMEWORK_VERSION is now
# 2.5.1.x — confirm whether a newer license file should be referenced.
RUN curl --fail -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.1/license.txt

Expand Down

0 comments on commit da28bc4

Please sign in to comment.