{ stdenv, lib, fetchFromGitHub, fetchFromGitLab, git-unroll, buildPythonPackage, python, runCommand, writeShellScript, config, cudaSupport ? config.cudaSupport, cudaPackages, autoAddDriverRunpath, effectiveMagma ? if cudaSupport then magma-cuda-static else if rocmSupport then magma-hip else magma, magma, magma-hip, magma-cuda-static, # Use the system NCCL as long as we're targeting CUDA on a supported platform. useSystemNccl ? (cudaSupport && !cudaPackages.nccl.meta.unsupported || rocmSupport), MPISupport ? false, mpi, buildDocs ? false, # tests.cudaAvailable: callPackage, # Native build inputs cmake, symlinkJoin, which, pybind11, pkg-config, removeReferencesTo, # Build inputs apple-sdk_13, numactl, # dependencies astunparse, expecttest, filelock, fsspec, hypothesis, jinja2, networkx, packaging, psutil, pyyaml, requests, sympy, types-dataclasses, typing-extensions, # ROCm build and `torch.compile` requires `triton` tritonSupport ? (!stdenv.hostPlatform.isDarwin), triton, # TODO: 1. callPackage needs to learn to distinguish between the task # of "asking for an attribute from the parent scope" and # the task of "exposing a formal parameter in .override". # TODO: 2. We should probably abandon attributes such as `torchWithCuda` (etc.) # as they routinely end up consuming the wrong arguments\ # (dependencies without cuda support). # Instead we should rely on overlays and nixpkgsFun. # (@SomeoneSerge) _tritonEffective ? if cudaSupport then triton-cuda else if rocmSupport then rocmPackages.triton else triton, triton-cuda, # Disable MKLDNN on aarch64-darwin, it negatively impacts performance, # this is also what official pytorch build does mklDnnSupport ? !(stdenv.hostPlatform.isDarwin && stdenv.hostPlatform.isAarch64), # virtual pkg that consistently instantiates blas across nixpkgs # See https://github.com/NixOS/nixpkgs/pull/83888 blas, # ninja (https://ninja-build.org) must be available to run C++ extensions tests, ninja, # dependencies for torch.utils.tensorboard pillow, six, tensorboard, protobuf, # ROCm dependencies rocmSupport ? config.rocmSupport, rocmPackages, gpuTargets ? [ ], vulkanSupport ? false, vulkan-headers, vulkan-loader, shaderc, }: let inherit (lib) attrsets lists strings trivial ; inherit (cudaPackages) cudaFlags cudnn nccl; triton = throw "python3Packages.torch: use _tritonEffective instead of triton to avoid divergence"; setBool = v: if v then "1" else "0"; # https://github.com/pytorch/pytorch/blob/v2.6.0/torch/utils/cpp_extension.py#L2046-L2048 supportedTorchCudaCapabilities = let real = [ "3.5" "3.7" "5.0" "5.2" "5.3" "6.0" "6.1" "6.2" "7.0" "7.2" "7.5" "8.0" "8.6" "8.7" "8.9" "9.0" "9.0a" "10.0" ]; ptx = lists.map (x: "${x}+PTX") real; in real ++ ptx; # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements # of the first list *from* the second list. That means: # lists.subtractLists a b = b - a # For CUDA supportedCudaCapabilities = lists.intersectLists cudaFlags.cudaCapabilities supportedTorchCudaCapabilities; unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities cudaFlags.cudaCapabilities; isCudaJetson = cudaSupport && cudaPackages.cudaFlags.isJetsonBuild; # Use trivial.warnIf to print a warning if any unsupported GPU targets are specified. gpuArchWarner = supported: unsupported: trivial.throwIf (supported == [ ]) ( "No supported GPU targets specified. Requested GPU targets: " + strings.concatStringsSep ", " unsupported ) supported; # Create the gpuTargetString. gpuTargetString = strings.concatStringsSep ";" ( if gpuTargets != [ ] then # If gpuTargets is specified, it always takes priority. gpuTargets else if cudaSupport then gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities else if rocmSupport then rocmPackages.clr.gpuTargets else throw "No GPU targets specified" ); rocmtoolkit_joined = symlinkJoin { name = "rocm-merged"; paths = with rocmPackages; [ rocm-core clr rccl miopen aotriton rocrand rocblas rocsparse hipsparse rocthrust rocprim hipcub roctracer rocfft rocsolver hipfft hiprand hipsolver hipblas-common hipblas hipblaslt rocminfo rocm-comgr rocm-device-libs rocm-runtime clr.icd hipify ]; # Fix `setuptools` not being found postBuild = '' rm -rf $out/nix-support ''; }; brokenConditions = attrsets.filterAttrs (_: cond: cond) { "CUDA and ROCm are mutually exclusive" = cudaSupport && rocmSupport; "CUDA is not targeting Linux" = cudaSupport && !stdenv.hostPlatform.isLinux; "Unsupported CUDA version" = cudaSupport && !(builtins.elem cudaPackages.cudaMajorVersion [ "11" "12" ]); "MPI cudatoolkit does not match cudaPackages.cudatoolkit" = MPISupport && cudaSupport && (mpi.cudatoolkit != cudaPackages.cudatoolkit); # This used to be a deep package set comparison between cudaPackages and # effectiveMagma.cudaPackages, making torch too strict in cudaPackages. # In particular, this triggered warnings from cuda's `aliases.nix` "Magma cudaPackages does not match cudaPackages" = cudaSupport && (effectiveMagma.cudaPackages.cudaMajorMinorVersion != cudaPackages.cudaMajorMinorVersion); }; unroll-src = writeShellScript "unroll-src" '' echo "{ version, fetchFromGitLab, fetchFromGitHub, runCommand, }: assert version == "'"'$1'"'";" ${lib.getExe git-unroll} https://github.com/pytorch/pytorch v$1 echo echo "# Update using: unroll-src [version]" ''; stdenv' = if cudaSupport then cudaPackages.backendStdenv else stdenv; in buildPythonPackage rec { pname = "torch"; # Don't forget to update torch-bin to the same version. version = "2.6.0"; pyproject = true; stdenv = stdenv'; outputs = [ "out" # output standard python package "dev" # output libtorch headers "lib" # output libtorch libraries "cxxdev" # propagated deps for the cmake consumers of torch ]; cudaPropagateToOutput = "cxxdev"; src = callPackage ./src.nix { inherit version fetchFromGitHub fetchFromGitLab runCommand ; }; patches = [ ./clang19-template-warning.patch # fix invalid static cast in XNNPACK # https://github.com/google/XNNPACK/issues/7489 ./xnnpack-bfloat16.patch ] ++ lib.optionals cudaSupport [ ./fix-cmake-cuda-toolkit.patch ] ++ lib.optionals stdenv.hostPlatform.isLinux [ # Propagate CUPTI to Kineto by overriding the search path with environment variables. # https://github.com/pytorch/pytorch/pull/108847 ./pytorch-pr-108847.patch ] ++ lib.optionals (lib.getName blas.provider == "mkl") [ # The CMake install tries to add some hardcoded rpaths, incompatible # with the Nix store, which fails. Simply remove this step to get # rpaths that point to the Nix store. ./disable-cmake-mkl-rpath.patch ]; postPatch = '' substituteInPlace cmake/public/cuda.cmake \ --replace-fail \ 'message(FATAL_ERROR "Found two conflicting CUDA' \ 'message(WARNING "Found two conflicting CUDA' \ --replace-warn \ "set(CUDAToolkit_ROOT" \ "# Upstream: set(CUDAToolkit_ROOT" substituteInPlace third_party/gloo/cmake/Cuda.cmake \ --replace-warn "find_package(CUDAToolkit 7.0" "find_package(CUDAToolkit" # annotations (3.7), print_function (3.0), with_statement (2.6) are all supported sed -i -e "/from __future__ import/d" **.py substituteInPlace third_party/NNPACK/CMakeLists.txt \ --replace-fail "PYTHONPATH=" 'PYTHONPATH=$ENV{PYTHONPATH}:' # flag from cmakeFlags doesn't work, not clear why # setting it at the top of NNPACK's own CMakeLists does sed -i '2s;^;set(PYTHON_SIX_SOURCE_DIR ${six.src})\n;' third_party/NNPACK/CMakeLists.txt '' + lib.optionalString rocmSupport '' # https://github.com/facebookincubator/gloo/pull/297 substituteInPlace third_party/gloo/cmake/Hipify.cmake \ --replace-fail "\''${HIPIFY_COMMAND}" "python \''${HIPIFY_COMMAND}" # Replace hard-coded rocm paths substituteInPlace caffe2/CMakeLists.txt \ --replace-fail "/opt/rocm" "${rocmtoolkit_joined}" \ --replace-fail "hcc/include" "hip/include" \ --replace-fail "rocblas/include" "include/rocblas" \ --replace-fail "hipsparse/include" "include/hipsparse" # Doesn't pick up the environment variable? substituteInPlace third_party/kineto/libkineto/CMakeLists.txt \ --replace-fail "\''$ENV{ROCM_SOURCE_DIR}" "${rocmtoolkit_joined}" \ --replace-fail "/opt/rocm" "${rocmtoolkit_joined}" # Strangely, this is never set in cmake substituteInPlace cmake/public/LoadHIP.cmake \ --replace "set(ROCM_PATH \$ENV{ROCM_PATH})" \ "set(ROCM_PATH \$ENV{ROCM_PATH})''\nset(ROCM_VERSION ${lib.concatStrings (lib.intersperse "0" (lib.splitVersion rocmPackages.clr.version))})" '' # Detection of NCCL version doesn't work particularly well when using the static binary. + lib.optionalString cudaSupport '' substituteInPlace cmake/Modules/FindNCCL.cmake \ --replace-fail \ 'message(FATAL_ERROR "Found NCCL header version and library version' \ 'message(WARNING "Found NCCL header version and library version' '' # Remove PyTorch's FindCUDAToolkit.cmake and use CMake's default. # NOTE: Parts of pytorch rely on unmaintained FindCUDA.cmake with custom patches to support e.g. # newer architectures (sm_90a). We do want to delete vendored patches, but have to keep them # until https://github.com/pytorch/pytorch/issues/76082 is addressed + lib.optionalString cudaSupport '' rm cmake/Modules/FindCUDAToolkit.cmake ''; # NOTE(@connorbaker): Though we do not disable Gloo or MPI when building with CUDA support, caution should be taken # when using the different backends. Gloo's GPU support isn't great, and MPI and CUDA can't be used at the same time # without extreme care to ensure they don't lock each other out of shared resources. # For more, see https://github.com/open-mpi/ompi/issues/7733#issuecomment-629806195. preConfigure = lib.optionalString cudaSupport '' export TORCH_CUDA_ARCH_LIST="${gpuTargetString}" export CUPTI_INCLUDE_DIR=${lib.getDev cudaPackages.cuda_cupti}/include export CUPTI_LIBRARY_DIR=${lib.getLib cudaPackages.cuda_cupti}/lib '' + lib.optionalString (cudaSupport && cudaPackages ? cudnn) '' export CUDNN_INCLUDE_DIR=${lib.getLib cudnn}/include export CUDNN_LIB_DIR=${lib.getLib cudnn}/lib '' + lib.optionalString rocmSupport '' export ROCM_PATH=${rocmtoolkit_joined} export ROCM_SOURCE_DIR=${rocmtoolkit_joined} export PYTORCH_ROCM_ARCH="${gpuTargetString}" export CMAKE_CXX_FLAGS="-I${rocmtoolkit_joined}/include -I${rocmtoolkit_joined}/include/rocblas" python tools/amd_build/build_amd.py ''; # Use pytorch's custom configurations dontUseCmakeConfigure = true; # causes possible redefinition of _FORTIFY_SOURCE hardeningDisable = [ "fortify3" ]; BUILD_NAMEDTENSOR = setBool true; BUILD_DOCS = setBool buildDocs; # We only do an imports check, so do not build tests either. BUILD_TEST = setBool false; # ninja hook doesn't automatically turn on ninja # because pytorch setup.py is responsible for this CMAKE_GENERATOR = "Ninja"; # Unlike MKL, oneDNN (née MKLDNN) is FOSS, so we enable support for # it by default. PyTorch currently uses its own vendored version # of oneDNN through Intel iDeep. USE_MKLDNN = setBool mklDnnSupport; USE_MKLDNN_CBLAS = setBool mklDnnSupport; # Avoid using pybind11 from git submodule # Also avoids pytorch exporting the headers of pybind11 USE_SYSTEM_PYBIND11 = true; # Multicore CPU convnet support USE_NNPACK = 1; # Explicitly enable MPS for Darwin USE_MPS = setBool stdenv.hostPlatform.isDarwin; # building torch.distributed on Darwin is disabled by default # https://pytorch.org/docs/stable/distributed.html#torch.distributed.is_available USE_DISTRIBUTED = setBool true; cmakeFlags = [ (lib.cmakeFeature "PYTHON_SIX_SOURCE_DIR" "${six.src}") # (lib.cmakeBool "CMAKE_FIND_DEBUG_MODE" true) (lib.cmakeFeature "CUDAToolkit_VERSION" cudaPackages.cudaMajorMinorVersion) ] ++ lib.optionals cudaSupport [ # Unbreaks version discovery in enable_language(CUDA) when wrapping nvcc with ccache # Cf. https://gitlab.kitware.com/cmake/cmake/-/issues/26363 (lib.cmakeFeature "CMAKE_CUDA_COMPILER_TOOLKIT_VERSION" cudaPackages.cudaMajorMinorVersion) ]; preBuild = '' export MAX_JOBS=$NIX_BUILD_CORES ${python.pythonOnBuildForHost.interpreter} setup.py build --cmake-only ${cmake}/bin/cmake build ''; preFixup = '' function join_by { local IFS="$1"; shift; echo "$*"; } function strip2 { IFS=':' read -ra RP <<< $(patchelf --print-rpath $1) IFS=' ' RP_NEW=$(join_by : ''${RP[@]:2}) patchelf --set-rpath \$ORIGIN:''${RP_NEW} "$1" } for f in $(find ''${out} -name 'libcaffe2*.so') do strip2 $f done ''; # Override the (weirdly) wrong version set by default. See # https://github.com/NixOS/nixpkgs/pull/52437#issuecomment-449718038 # https://github.com/pytorch/pytorch/blob/v1.0.0/setup.py#L267 PYTORCH_BUILD_VERSION = version; PYTORCH_BUILD_NUMBER = 0; # In-tree builds of NCCL are not supported. # Use NCCL when cudaSupport is enabled and nccl is available. USE_NCCL = setBool useSystemNccl; USE_SYSTEM_NCCL = USE_NCCL; USE_STATIC_NCCL = USE_NCCL; # Set the correct Python library path, broken since # https://github.com/pytorch/pytorch/commit/3d617333e PYTHON_LIB_REL_PATH = "${placeholder "out"}/${python.sitePackages}"; env = { # disable warnings as errors as they break the build on every compiler # bump, among other things. # Also of interest: pytorch ignores CXXFLAGS uses CFLAGS for both C and C++: # https://github.com/pytorch/pytorch/blob/v1.11.0/setup.py#L17 NIX_CFLAGS_COMPILE = toString ( [ "-Wno-error" ] # fix build aarch64-linux build failure with GCC14 ++ lib.optionals (stdenv.hostPlatform.isLinux && stdenv.hostPlatform.isAarch64) [ "-Wno-error=incompatible-pointer-types" ] ); USE_VULKAN = setBool vulkanSupport; } // lib.optionalAttrs vulkanSupport { VULKAN_SDK = shaderc.bin; } // lib.optionalAttrs rocmSupport { AOTRITON_INSTALLED_PREFIX = "${rocmPackages.aotriton}"; }; nativeBuildInputs = [ cmake which ninja pybind11 pkg-config removeReferencesTo ] ++ lib.optionals cudaSupport ( with cudaPackages; [ autoAddDriverRunpath cuda_nvcc ] ) ++ lib.optionals isCudaJetson [ cudaPackages.autoAddCudaCompatRunpath ] ++ lib.optionals rocmSupport [ rocmtoolkit_joined ]; buildInputs = [ blas blas.provider ] ++ lib.optionals cudaSupport ( with cudaPackages; [ cuda_cccl # cuda_cudart # cuda_runtime.h and libraries cuda_cupti # For kineto cuda_nvcc # crt/host_config.h; even though we include this in nativeBuildInputs, it's needed here too cuda_nvml_dev # cuda_nvrtc cuda_nvtx # -llibNVToolsExt cusparselt libcublas libcufft libcurand libcusolver libcusparse ] ++ lists.optionals (cudaPackages ? cudnn) [ cudnn ] ++ lists.optionals useSystemNccl [ # Some platforms do not support NCCL (i.e., Jetson) nccl # Provides nccl.h AND a static copy of NCCL! ] ++ lists.optionals (cudaOlder "11.8") [ cuda_nvprof # ] ++ lists.optionals (cudaAtLeast "11.8") [ cuda_profiler_api # ] ) ++ lib.optionals rocmSupport [ rocmPackages.llvm.openmp ] ++ lib.optionals (cudaSupport || rocmSupport) [ effectiveMagma ] ++ lib.optionals stdenv.hostPlatform.isLinux [ numactl ] ++ lib.optionals stdenv.hostPlatform.isDarwin [ apple-sdk_13 ] ++ lib.optionals tritonSupport [ _tritonEffective ] ++ lib.optionals MPISupport [ mpi ] ++ lib.optionals rocmSupport [ rocmtoolkit_joined rocmPackages.clr # Added separately so setup hook applies ]; pythonRelaxDeps = [ "sympy" ]; dependencies = [ astunparse expecttest filelock fsspec hypothesis jinja2 networkx ninja packaging psutil pyyaml requests sympy types-dataclasses typing-extensions # the following are required for tensorboard support pillow six tensorboard protobuf # torch/csrc requires `pybind11` at runtime pybind11 ] ++ lib.optionals tritonSupport [ _tritonEffective ] ++ lib.optionals vulkanSupport [ vulkan-headers vulkan-loader ]; propagatedCxxBuildInputs = [ ] ++ lib.optionals MPISupport [ mpi ] ++ lib.optionals rocmSupport [ rocmtoolkit_joined ]; # Tests take a long time and may be flaky, so just sanity-check imports doCheck = false; pythonImportsCheck = [ "torch" ]; nativeCheckInputs = [ hypothesis ninja psutil ]; checkPhase = with lib.versions; with lib.strings; concatStringsSep " " [ "runHook preCheck" "${python.interpreter} test/run_test.py" "--exclude" (concatStringsSep " " [ "utils" # utils requires git, which is not allowed in the check phase # "dataloader" # psutils correctly finds and triggers multiprocessing, but is too sandboxed to run -- resulting in numerous errors # ^^^^^^^^^^^^ NOTE: while test_dataloader does return errors, these are acceptable errors and do not interfere with the build # tensorboard has acceptable failures for pytorch 1.3.x due to dependencies on tensorboard-plugins (optionalString (majorMinor version == "1.3") "tensorboard") ]) "runHook postCheck" ]; pythonRemoveDeps = [ # In our dist-info the name is just "triton" "pytorch-triton-rocm" ]; postInstall = '' find "$out/${python.sitePackages}/torch/include" "$out/${python.sitePackages}/torch/lib" -type f -exec remove-references-to -t ${stdenv.cc} '{}' + mkdir $dev # CppExtension requires that include files are packaged with the main # python library output; which is why they are copied here. cp -r $out/${python.sitePackages}/torch/include $dev/include # Cmake files under /share are different and can be safely moved. This # avoids unnecessary closure blow-up due to apple sdk references when # USE_DISTRIBUTED is enabled. mv $out/${python.sitePackages}/torch/share $dev/share # Fix up library paths for split outputs substituteInPlace \ $dev/share/cmake/Torch/TorchConfig.cmake \ --replace-fail \''${TORCH_INSTALL_PREFIX}/lib "$lib/lib" substituteInPlace \ $dev/share/cmake/Caffe2/Caffe2Targets-release.cmake \ --replace-fail \''${_IMPORT_PREFIX}/lib "$lib/lib" mkdir $lib mv $out/${python.sitePackages}/torch/lib $lib/lib ln -s $lib/lib $out/${python.sitePackages}/torch/lib '' + lib.optionalString rocmSupport '' substituteInPlace $dev/share/cmake/Tensorpipe/TensorpipeTargets-release.cmake \ --replace-fail "\''${_IMPORT_PREFIX}/lib64" "$lib/lib" substituteInPlace $dev/share/cmake/ATen/ATenConfig.cmake \ --replace-fail "/build/source/torch/include" "$dev/include" ''; postFixup = '' mkdir -p "$cxxdev/nix-support" printWords "''${propagatedCxxBuildInputs[@]}" >> "$cxxdev/nix-support/propagated-build-inputs" '' + lib.optionalString stdenv.hostPlatform.isDarwin '' for f in $(ls $lib/lib/*.dylib); do install_name_tool -id $lib/lib/$(basename $f) $f || true done install_name_tool -change @rpath/libshm.dylib $lib/lib/libshm.dylib $lib/lib/libtorch_python.dylib install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libtorch_python.dylib install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libtorch_python.dylib install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libtorch.dylib install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libshm.dylib install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libshm.dylib ''; # See https://github.com/NixOS/nixpkgs/issues/296179 # # This is a quick hack to add `libnvrtc` to the runpath so that torch can find # it when it is needed at runtime. extraRunpaths = lib.optionals cudaSupport [ "${lib.getLib cudaPackages.cuda_nvrtc}/lib" ]; postPhases = lib.optionals stdenv.hostPlatform.isLinux [ "postPatchelfPhase" ]; postPatchelfPhase = '' while IFS= read -r -d $'\0' elf ; do for extra in $extraRunpaths ; do echo patchelf "$elf" --add-rpath "$extra" >&2 patchelf "$elf" --add-rpath "$extra" done done < <( find "''${!outputLib}" "$out" -type f -iname '*.so' -print0 ) ''; # Builds in 2+h with 2 cores, and ~15m with a big-parallel builder. requiredSystemFeatures = [ "big-parallel" ]; passthru = { inherit cudaSupport cudaPackages rocmSupport rocmPackages unroll-src ; cudaCapabilities = if cudaSupport then supportedCudaCapabilities else [ ]; # At least for 1.10.2 `torch.fft` is unavailable unless BLAS provider is MKL. This attribute allows for easy detection of its availability. blasProvider = blas.provider; # To help debug when a package is broken due to CUDA support inherit brokenConditions; tests = callPackage ../tests { }; }; meta = { changelog = "https://github.com/pytorch/pytorch/releases/tag/v${version}"; # keep PyTorch in the description so the package can be found under that name on search.nixos.org description = "PyTorch: Tensors and Dynamic neural networks in Python with strong GPU acceleration"; homepage = "https://pytorch.org/"; license = lib.licenses.bsd3; maintainers = with lib.maintainers; [ teh thoughtpolice tscholak ]; # tscholak esp. for darwin-related builds platforms = lib.platforms.linux ++ lib.optionals (!cudaSupport && !rocmSupport) lib.platforms.darwin; broken = builtins.any trivial.id (builtins.attrValues brokenConditions); }; }