nixpkgs/pkgs/development/python-modules/flashinfer/default.nix

# NOTE: At runtime, FlashInfer will fall back to PyTorch's JIT compilation if a
# requested kernel wasn't pre-compiled in AOT mode, and JIT compilation always
# requires the CUDA toolkit (via nvcc) to be available.
#
# This means that if you plan to use flashinfer, you will need to set the
# environment variable `CUDA_HOME` to `cudatoolkit`.
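#
# A minimal sketch of how a consumer might do that from a development shell
# (illustrative only; the surrounding `pkgs` attribute paths are assumptions,
# not part of this derivation):
#
#   pkgs.mkShell {
#     packages = [ pkgs.python3Packages.flashinfer ];
#     CUDA_HOME = "${pkgs.cudaPackages.cudatoolkit}";
#   }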
{
  lib,
  config,
  buildPythonPackage,
  fetchFromGitHub,
  setuptools,
  cudaPackages,
  cmake,
  ninja,
  numpy,
  torch,
}:

let
  pname = "flashinfer";
  version = "0.2.5";

  src_cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    # Use the revision pinned by the `3rdparty/cutlass` submodule in flashinfer.
    rev = "df8a550d3917b0e97f416b2ed8c2d786f7f686a3";
    hash = "sha256-d4czDoEv0Focf1bJHOVGX4BDS/h5O7RPoM/RrujhgFQ=";
  };
in
buildPythonPackage {
  inherit pname version;

  src = fetchFromGitHub {
    owner = "flashinfer-ai";
    repo = "flashinfer";
    tag = "v${version}";
    hash = "sha256-YrYfatkI9DQkFEEGiF8CK/bTafaNga4Ufyt+882C0bQ=";
  };

  build-system = [ setuptools ];

  nativeBuildInputs = [
    cmake
    ninja
    (lib.getBin cudaPackages.cuda_nvcc)
  ];

  dontUseCmakeConfigure = true;
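
  # CUDA libraries that the pre-compiled (AOT) kernels are built and linked
  # against.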
  buildInputs = [
    cudaPackages.cuda_cudart
    cudaPackages.libcublas
    cudaPackages.cuda_cccl
    cudaPackages.libcurand
  ];

  postPatch = ''
    rmdir 3rdparty/cutlass
    ln -s ${src_cutlass} 3rdparty/cutlass
  '';

  # FlashInfer offers two installation modes:
  #
  # JIT mode: CUDA kernels are compiled at runtime using PyTorch's JIT, with
  # compiled kernels cached for future use. JIT mode allows fast installation,
  # as no CUDA kernels are pre-compiled, making it ideal for development and
  # testing. The JIT version is also available as an sdist on PyPI.
  #
  # AOT mode: Core CUDA kernels are pre-compiled and included in the library,
  # reducing runtime compilation overhead. If a required kernel is not
  # pre-compiled, it will be compiled at runtime using JIT. AOT mode is
  # recommended for production environments.
  #
  # Here we opt for the AOT version.
  preConfigure = ''
    export FLASHINFER_ENABLE_AOT=1
    export TORCH_NVCC_FLAGS="--maxrregcount=64"
  '';
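
  # A minimal sketch (illustrative only, not used by this derivation) of how a
  # consumer who prefers the lighter JIT-only build could drop the AOT flag
  # with an override:
  #
  #   flashinfer.overridePythonAttrs (old: { preConfigure = ""; })

  # Compile the AOT kernels only for the CUDA capabilities that torch itself
  # targets, e.g. "8.0;8.6;9.0".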
  TORCH_CUDA_ARCH_LIST = lib.concatStringsSep ";" torch.cudaCapabilities;

  dependencies = [
    numpy
    torch
  ];

  meta = with lib; {
    broken = !torch.cudaSupport || !config.cudaSupport;
    homepage = "https://flashinfer.ai/";
    description = "Library and kernel generator for Large Language Models";
    longDescription = ''
      FlashInfer is a library and kernel generator for Large Language Models
      that provides high-performance implementations of LLM GPU kernels such
      as FlashAttention, PageAttention and LoRA. FlashInfer focuses on LLM
      serving and inference, and delivers state-of-the-art performance across
      diverse scenarios.
    '';
    license = licenses.asl20;
    maintainers = with maintainers; [ breakds ];
  };
}