🏗️ Building PyTorch 2.8 from Source on Aurora
Sam Foreman 2025-06-14
📦 GitHub Project
See:
- argonne-lcf/frameworks-standalone
- Aurora/pytorch/README.md for additional details.
🏖️ Shell Environment
-
Helper function to get timestamp:
tstamp() { date +"%Y-%m-%d-%H%M%S" } -
Load
frameworksmodule:module load frameworks -
Deactivate
condaenvironment:conda deactivate -
Create new
condaenvironment:ENV_PATH="/flare/datascience/foremans/miniconda/2025-06-15" conda create --prefix "${ENV_PATH}" --y --solver=libmamba --verbose python=3.12 conda activate "${ENV_PATH}"
🔨 Build Libraries
PyTorch
-
Clone pytorch/
pytorchgit clone https://github.com/pytorch/pytorch cd pytorch git submodule sync git submodule update --init --recursive -
Install dependencies:
python3 -m pip install cmake ninja python3 -m pip install -r requirements.txt python3 -m pip install mkl-static mkl-include -
Make triton:
export USE_XPU=1 # for Intel GPU support make triton -
Set environment variables for PyTorch build:
CC=$(which gcc); export CC CXX=$(which g++); export CXX export REL_WITH_DEB_INFO=1 export USE_CUDA=0 export USE_ROCM=0 export USE_MKLDNN=1 export USE_MKL=1 export USE_ROCM=0 export USE_CUDNN=0 export USE_FBGEMM=1 export USE_NNPACK=1 export USE_QNNPACK=1 export USE_NCCL=0 export USE_CUDA=0 export BUILD_CAFFE2_OPS=0 export BUILD_TEST=0 export USE_DISTRIBUTED=1 export USE_NUMA=0 export USE_MPI=1 export USE_XPU=1 export USE_XCCL=1 export INTEL_MKL_DIR=$MKLROOT export USE_AOT_DEVLIST='pvc' export TORCH_XPU_ARCH_LIST='pvc' export OCLOC_VERSION=24.39.1 which -a gcc which -a g++ -
Build PyTorch (takes ~ 30 mins):
python3 setup.py bdist_wheel 2>&1 | tee "torch-build-whl-$(tstamp).log" python3 -m pip install dist/*.whl cd .. -
[Optional] Install {
torchvision,torchaudio,torchdata} with no dependencies:python3 -m pip install torchvision torchaudio --no-deps --index-url https://download.pytorch.org/whl/xpu python3 -m pip install torchdata --no-deps
Intel Libraries
-
intel/
intel-extension-for-pytorchgit clone https://github.com/intel/intel-extension-for-pytorch cd intel-extension-for-pytorch git checkout xpu-main git submodule sync git submodule update --init --recursive python3 -m pip install -r requirements.txt python3 -m pip install --upgrade pip setuptools wheel build black flake8 MAX_JOBS=48 CC=$(which gcc) CXX=$(which g++) INTELONEAPIROOT="${ONEAPI_ROOT}" python3 setup.py bdist_wheel 2>&1 | tee "ipex-build-whl-$(tstamp).log" python3 -m pip install dist/*.whl cd .. -
git clone https://github.com/intel/torch-ccl cd torch-ccl git checkout c27ded5 git submodule sync git submodule update --init --recursive python3 -m pip install -r requirements.txt # see: # https://github.com/intel/torch-ccl/blob/c27ded5190a6b115ec68c7a8c28f40cfe7f0a32a/version.txt ONECCL_BINDINGS_FOR_PYTORCH_BACKEND=xpu INTELONEAPIROOT="${ONEAPI_ROOT}" USE_SYSTEM_ONECCL=ON COMPUTE_BACKEND=dpcpp python3 setup.py bdist_wheel 2>&1 | tee "torch-ccl-build-whl-$(tstamp).log" python3 -m pip install dist/*.whl cd ..
mpi4py
git clone https://github.com/mpi4py/mpi4py
cd mpi4py
CC=mpicc CXX=mpicxx python3 setup.py build |& tee build.log
CC=mpicc CXX=mpicxx python3 setup.py install |& tee install.log
cd ..
h5py
module load hdf5
git clone https://github.com/h5py/h5py
cd h5py
CC=mpicc CXX=mpicxx HDF5_MPI="ON" python3 -m pip install --no-binary=h5py .
h5cc -showconfig
torch / ao
git clone https://github.com/pytorch/ao
cd ao
USE_CUDA=0 USE_XPU=1 USE_XCCL=1 python3 setup.py bdist_wheel 2>&1 | tee "torchao-build-whl-$(tstamp).log"
python3 -m pip install dist/*.whl
cd ../
TorchTune
git clone https://github.com/pytorch/torchtune
cd torchtune
python3 -m pip install -e "." --require-virtualenv --verbose
tune download meta-llama/Meta-Llama-3.1-8B-Instruct --output-dir ~/torchtune_anl2/out_dir --ignore-patterns "original/consolidated.00.pth" --hf-token <hf-token>
cd ../
✅ Verify Installation
-
Command:
python3 -c 'import torch; print(torch.__file__); print(*torch.__config__.show().split("\n"), sep="\n") ; print(f"{torch.__version__=}"); print(f"{torch.xpu.is_available()=}"); print(f"{torch.xpu.device_count()=}") ; import torch.distributed; print(f"{torch.distributed.is_xccl_available()=}"); import torch; import intel_extension_for_pytorch as ipex; print(f"{torch.__version__=}"); print(f"{ipex.__version__=}"); import oneccl_bindings_for_pytorch as oneccl_bpt; print(f"{oneccl_bpt.__version__=}") ; [print(f"[{i}]: {torch.xpu.get_device_properties(i)}") for i in range(torch.xpu.device_count())]' -
Output:
$ python3 -c 'import torch; print(torch.__file__); print(*torch.__config__.show().split("\n"), sep="\n") ; print(f"{torch.__version__=}"); print(f"{torch.xpu.is_available()=}"); print(f"{torch.xpu.device_count()=}") ; import torch.distributed; print(f"{torch.distributed.is_xccl_available()=}"); import torch; import intel_extension_for_pytorch as ipex; print(f"{torch.__version__=}"); print(f"{ipex.__version__=}"); import oneccl_bindings_for_pytorch as oneccl_bpt; print(f"{oneccl_bpt.__version__=}") ; [print(f"[{i}]: {torch.xpu.get_device_properties(i)}") for i in range(torch.xpu.device_count())]' /flare/datascience/foremans/miniconda/2025-06-15/lib/python3.12/site-packages/torch/__init__.py PyTorch built with: - GCC 13.3 - C++ Version: 201703 - Intel(R) oneAPI Math Kernel Library Version 2025.0.1-Product Build 20241031 for Intel(R) 64 architecture applications - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d) - OpenMP 201511 (a.k.a. OpenMP 4.5) - LAPACK is enabled (usually provided by MKL) - NNPACK is enabled - CPU capability usage: AVX512 XPU backend - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Debug, COMMIT_SHA=655b3b14ffba4ae73e26a63b4289329e8d160a6f, CXX_COMPILER=/opt/aurora/24.347.0/spack/unified/0.9.2/install/linux-sles15-x86_64/gcc-13.3.0/gcc-13.3.0-4enwbrb/bin/g++, CXX_FLAGS= -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DUSE_KINETO -DLIBKINETO_NOCUPTI -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=OFF -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -DC10_NODEPRECATED -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-dangling-reference -Wno-error=dangling-reference -DUSE_XPU -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.8.0, USE_CUDA=0, USE_CUDNN=OFF, USE_CUSPARSELT=OFF, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=1, USE_MPI=1, USE_NCCL=OFF, USE_NNPACK=1,USE_OPENMP=ON, USE_ROCM=0, USE_ROCM_KERNEL_ASSERT=OFF, USE_XCCL=1, USE_XPU=1, torch.__version__='2.8.0a0+git655b3b1' torch.xpu.is_available()=True torch.xpu.device_count()=12 torch.distributed.is_xccl_available()=True [W615 14:52:10.420018164 OperatorEntry.cpp:217] Warning: Warning only once for all operators, other operators may also be overridden. Overriding a previously registered kernel for the same operator and the same dispatch key operator: aten::geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!) registered at /lus/flare/projects/datascience/foremans/AuroraBuilds/2025-06-15/pytorch/build/aten/src/ATen/RegisterSchema.cpp:6 dispatch key: XPU previous kernel: registered at /lus/flare/projects/datascience/foremans/AuroraBuilds/2025-06-15/pytorch/aten/src/ATen/VmapModeRegistrations.cpp:37 new kernel: registered at /lus/flare/projects/datascience/foremans/AuroraBuilds/2025-06-15/intel-extension-for-pytorch/build/Release/csrc/gpu/csrc/gpu/xpu/ATen/RegisterXPU_0.cpp:186 (function operator()) torch.__version__='2.8.0a0+git655b3b1' ipex.__version__='2.8.10+git57bb68a' oneccl_bpt.__version__='2.8.0+xpu' [0]: _XpuDeviceProperties(name='Intel(R) Data Center GPU Max 1550', platform_name='Intel(R) oneAPI Unified Runtime over Level-Zero', type='gpu', driver_version='1.6.32567+18', total_memory=65536MB, max_compute_units=448, gpu_eu_count=448, gpu_subslice_count=56, max_work_group_size=1024, max_num_sub_groups=64, sub_group_sizes=[16 32], has_fp16=1, has_fp64=1, has_atomic64=1) [1]: _XpuDeviceProperties(name='Intel(R) Data Center GPU Max 1550', platform_name='Intel(R) oneAPI Unified Runtime over Level-Zero', type='gpu', driver_version='1.6.32567+18', total_memory=65536MB, max_compute_units=448, gpu_eu_count=448, gpu_subslice_count=56, max_work_group_size=1024, max_num_sub_groups=64, sub_group_sizes=[16 32], has_fp16=1, has_fp64=1, has_atomic64=1) [2]: _XpuDeviceProperties(name='Intel(R) Data Center GPU Max 1550', platform_name='Intel(R) oneAPI Unified Runtime over Level-Zero', type='gpu', driver_version='1.6.32567+18', total_memory=65536MB, max_compute_units=448, gpu_eu_count=448, gpu_subslice_count=56, max_work_group_size=1024, max_num_sub_groups=64, sub_group_sizes=[16 32], has_fp16=1, has_fp64=1, has_atomic64=1) [3]: _XpuDeviceProperties(name='Intel(R) Data Center GPU Max 1550', platform_name='Intel(R) oneAPI Unified Runtime over Level-Zero', type='gpu', driver_version='1.6.32567+18', total_memory=65536MB, max_compute_units=448, gpu_eu_count=448, gpu_subslice_count=56, max_work_group_size=1024, max_num_sub_groups=64, sub_group_sizes=[16 32], has_fp16=1, has_fp64=1, has_atomic64=1) [4]: _XpuDeviceProperties(name='Intel(R) Data Center GPU Max 1550', platform_name='Intel(R) oneAPI Unified Runtime over Level-Zero', type='gpu', driver_version='1.6.32567+18', total_memory=65536MB, max_compute_units=448, gpu_eu_count=448, gpu_subslice_count=56, max_work_group_size=1024, max_num_sub_groups=64, sub_group_sizes=[16 32], has_fp16=1, has_fp64=1, has_atomic64=1) [5]: _XpuDeviceProperties(name='Intel(R) Data Center GPU Max 1550', platform_name='Intel(R) oneAPI Unified Runtime over Level-Zero', type='gpu', driver_version='1.6.32567+18', total_memory=65536MB, max_compute_units=448, gpu_eu_count=448, gpu_subslice_count=56, max_work_group_size=1024, max_num_sub_groups=64, sub_group_sizes=[16 32], has_fp16=1, has_fp64=1, has_atomic64=1) [6]: _XpuDeviceProperties(name='Intel(R) Data Center GPU Max 1550', platform_name='Intel(R) oneAPI Unified Runtime over Level-Zero', type='gpu', driver_version='1.6.32567+18', total_memory=65536MB, max_compute_units=448, gpu_eu_count=448, gpu_subslice_count=56, max_work_group_size=1024, max_num_sub_groups=64, sub_group_sizes=[16 32], has_fp16=1, has_fp64=1, has_atomic64=1) [7]: _XpuDeviceProperties(name='Intel(R) Data Center GPU Max 1550', platform_name='Intel(R) oneAPI Unified Runtime over Level-Zero', type='gpu', driver_version='1.6.32567+18', total_memory=65536MB, max_compute_units=448, gpu_eu_count=448, gpu_subslice_count=56, max_work_group_size=1024, max_num_sub_groups=64, sub_group_sizes=[16 32], has_fp16=1, has_fp64=1, has_atomic64=1) [8]: _XpuDeviceProperties(name='Intel(R) Data Center GPU Max 1550', platform_name='Intel(R) oneAPI Unified Runtime over Level-Zero', type='gpu', driver_version='1.6.32567+18', total_memory=65536MB, max_compute_units=448, gpu_eu_count=448, gpu_subslice_count=56, max_work_group_size=1024, max_num_sub_groups=64, sub_group_sizes=[16 32], has_fp16=1, has_fp64=1, has_atomic64=1) [9]: _XpuDeviceProperties(name='Intel(R) Data Center GPU Max 1550', platform_name='Intel(R) oneAPI Unified Runtime over Level-Zero', type='gpu', driver_version='1.6.32567+18', total_memory=65536MB, max_compute_units=448, gpu_eu_count=448, gpu_subslice_count=56, max_work_group_size=1024, max_num_sub_groups=64, sub_group_sizes=[16 32], has_fp16=1, has_fp64=1, has_atomic64=1) [10]: _XpuDeviceProperties(name='Intel(R) Data Center GPU Max 1550', platform_name='Intel(R) oneAPI Unified Runtime over Level-Zero', type='gpu', driver_version='1.6.32567+18', total_memory=65536MB, max_compute_units=448, gpu_eu_count=448, gpu_subslice_count=56, max_work_group_size=1024, max_num_sub_groups=64, sub_group_sizes=[16 32], has_fp16=1, has_fp64=1, has_atomic64=1) [11]: _XpuDeviceProperties(name='Intel(R) Data Center GPU Max 1550', platform_name='Intel(R) oneAPI Unified Runtime over Level-Zero', type='gpu', driver_version='1.6.32567+18', total_memory=65536MB, max_compute_units=448, gpu_eu_count=448, gpu_subslice_count=56, max_work_group_size=1024, max_num_sub_groups=64, sub_group_sizes=[16 32], has_fp16=1, has_fp64=1, has_atomic64=1) took: 0h:00m:21s