nixpkgs/pkgs/development/python-modules/flashinfer/default.nix
# NOTE: At runtime, FlashInfer will fall back to PyTorch's JIT compilation if a
# requested kernel wasn't pre-compiled in AOT mode, and JIT compilation always
# requires the CUDA toolkit (via nvcc) to be available.
#
# This means that if you plan to use flashinfer, you will need to point the
# environment variable `CUDA_HOME` at the CUDA toolkit (`cudatoolkit`).
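#
# A minimal downstream sketch of such a setup (illustrative only; `pkgs` is
# assumed to be a nixpkgs instantiated with `cudaSupport = true`, and the
# exact attribute names depend on the caller's package set):
#
#   pkgs.mkShell {
#     packages = [
#       (pkgs.python3.withPackages (ps: [ ps.flashinfer ps.torch ]))
#       pkgs.cudaPackages.cudatoolkit
#     ];
#     shellHook = ''
#       export CUDA_HOME=${pkgs.cudaPackages.cudatoolkit}
#     '';
#   }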
{
  lib,
  config,
  buildPythonPackage,
  fetchFromGitHub,
  setuptools,
  cudaPackages,
  cmake,
  ninja,
  numpy,
  torch,
}:
let
  pname = "flashinfer";
  version = "0.2.5";
  src_cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    # Pin the revision that flashinfer's `3rdparty/cutlass` submodule points to.
    rev = "df8a550d3917b0e97f416b2ed8c2d786f7f686a3";
    hash = "sha256-d4czDoEv0Focf1bJHOVGX4BDS/h5O7RPoM/RrujhgFQ=";
  };
in
buildPythonPackage {
  inherit pname version;
  src = fetchFromGitHub {
    owner = "flashinfer-ai";
    repo = "flashinfer";
    tag = "v${version}";
    hash = "sha256-YrYfatkI9DQkFEEGiF8CK/bTafaNga4Ufyt+882C0bQ=";
  };
  build-system = [ setuptools ];
  nativeBuildInputs = [
    cmake
    ninja
    (lib.getBin cudaPackages.cuda_nvcc)
  ];
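  # The cmake setup hook would otherwise run its own configure phase; here
  # cmake/ninja are (presumably) invoked by FlashInfer's setuptools build
  # instead, so that automatic step is disabled.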
  dontUseCmakeConfigure = true;
  buildInputs = [
    cudaPackages.cuda_cudart
    cudaPackages.libcublas
    cudaPackages.cuda_cccl
    cudaPackages.libcurand
  ];
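  # fetchFromGitHub does not fetch git submodules by default, so
  # 3rdparty/cutlass arrives as an empty directory; replace it with a symlink
  # to the separately pinned cutlass source above.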
  postPatch = ''
    rmdir 3rdparty/cutlass
    ln -s ${src_cutlass} 3rdparty/cutlass
  '';
  # FlashInfer offers two installation modes:
  #
  # JIT mode: CUDA kernels are compiled at runtime using PyTorch's JIT, with
  # compiled kernels cached for future use. JIT mode allows fast installation,
  # as no CUDA kernels are pre-compiled, making it ideal for development and
  # testing. The JIT version is also available as an sdist on PyPI.
  #
  # AOT mode: Core CUDA kernels are pre-compiled and included in the library,
  # reducing runtime compilation overhead. If a required kernel is not
  # pre-compiled, it will be compiled at runtime using JIT. AOT mode is
  # recommended for production environments.
  #
  # Here we opt for the AOT version.
  preConfigure = ''
    export FLASHINFER_ENABLE_AOT=1
    export TORCH_NVCC_FLAGS="--maxrregcount=64"
  '';
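  # torch.cudaCapabilities is a list of compute capabilities such as
  # [ "8.6" "9.0" ], so this becomes e.g. "8.6;9.0", the semicolon-separated
  # format PyTorch's extension build expects for TORCH_CUDA_ARCH_LIST.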
  TORCH_CUDA_ARCH_LIST = lib.concatStringsSep ";" torch.cudaCapabilities;
  dependencies = [
    numpy
    torch
  ];
  meta = with lib; {
    broken = !torch.cudaSupport || !config.cudaSupport;
    homepage = "https://flashinfer.ai/";
    description = "Library and kernel generator for Large Language Models";
    longDescription = ''
      FlashInfer is a library and kernel generator for Large Language Models
      that provides high-performance implementations of LLM GPU kernels such
      as FlashAttention, PageAttention and LoRA. FlashInfer focuses on LLM
      serving and inference, and delivers state-of-the-art performance across
      diverse scenarios.
    '';
    license = licenses.asl20;
    maintainers = with maintainers; [ breakds ];
  };
}
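
# Usage note (a sketch, not part of the derivation): because the package is
# marked broken without CUDA support, nixpkgs has to be instantiated with
# CUDA enabled (and unfree packages allowed for the CUDA toolkit), e.g.:
#
#   (import <nixpkgs> {
#     config = {
#       cudaSupport = true;
#       allowUnfree = true;
#     };
#   }).python3Packages.flashinfer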