# NOTE: At runtime, FlashInfer will fall back to PyTorch’s JIT compilation if a
# requested kernel wasn’t pre-compiled in AOT mode, and JIT compilation always
# requires the CUDA toolkit (via nvcc) to be available.
#
# This means that if you plan to use flashinfer, you will need to set the
# environment variable `CUDA_HOME` to `cudatoolkit`.
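#
# For illustration only (a minimal sketch; the attribute path and shell setup
# here are assumptions, not part of this package), a downstream development
# shell could provide that along these lines:
#
#   pkgs.mkShell {
#     packages = [ pkgs.python3Packages.flashinfer ];
#     CUDA_HOME = "${pkgs.cudaPackages.cudatoolkit}";
#   }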
{ lib,
  buildPythonPackage,
  symlinkJoin,
  fetchFromGitHub,
  setuptools,
  cmake,
  ninja,
  numpy,
  torch
}:

assert torch.cudaSupport;

let
  pname = "flashinfer";
  version = "0.2.5";

  inherit (torch) cudaPackages;
  inherit (cudaPackages) cudaMajorMinorVersion;

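  # e.g. a cudaMajorMinorVersion of "12.4" becomes "124"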
  cudaMajorMinorVersionString = lib.replaceStrings [ "." ] [ "" ] cudaMajorMinorVersion;

  src_cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    # Using the revision of the `cutlass` submodule inside flashinfer's `3rdparty`.
    rev = "df8a550d3917b0e97f416b2ed8c2d786f7f686a3";
    hash = "sha256-d4czDoEv0Focf1bJHOVGX4BDS/h5O7RPoM/RrujhgFQ=";
  };

in buildPythonPackage {
  inherit pname version;

  src = fetchFromGitHub {
    owner = "flashinfer-ai";
    repo = "flashinfer";
    tag = "v${version}";
    hash = "sha256-YrYfatkI9DQkFEEGiF8CK/bTafaNga4Ufyt+882C0bQ=";
  };

  build-system = [ setuptools ];

  nativeBuildInputs = [
    cmake
    ninja
    cudaPackages.cudatoolkit
  ];
  dontUseCmakeConfigure = true;

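  # fetchFromGitHub does not pull in git submodules here, so the empty
  # 3rdparty/cutlass directory is replaced with a symlink to the pinned
  # cutlass source fetched above.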
  postPatch = ''
    rmdir 3rdparty/cutlass
    ln -s ${src_cutlass} 3rdparty/cutlass
  '';

  # FlashInfer offers two installation modes:
  #
  # JIT mode: CUDA kernels are compiled at runtime using PyTorch’s JIT, with
  # compiled kernels cached for future use. JIT mode allows fast installation,
  # as no CUDA kernels are pre-compiled, making it ideal for development and
  # testing. The JIT version is also available as an sdist on PyPI.
  #
  # AOT mode: Core CUDA kernels are pre-compiled and included in the library,
  # reducing runtime compilation overhead. If a required kernel is not
  # pre-compiled, it will be compiled at runtime using JIT. AOT mode is
  # recommended for production environments.
  #
  # Here we opt for the AOT version.
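  #
  # A JIT-only variant could in principle be obtained by overriding the
  # preConfigure phase below so that FLASHINFER_ENABLE_AOT is not set, for
  # example (an untested sketch, not a supported variant):
  #
  #   flashinfer.overridePythonAttrs (old: { preConfigure = ""; })
  #
  # Note that the runtime `CUDA_HOME` requirement described at the top of
  # this file still applies in that case.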
  preConfigure = ''
    export FLASHINFER_ENABLE_AOT=1
    export TORCH_NVCC_FLAGS="--maxrregcount=64"
  '';

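  # CUDA_HOME points the build at nvcc and the rest of the toolkit, while
  # TORCH_CUDA_ARCH_LIST limits kernel compilation to the GPU architectures
  # this torch build targets (for example, capabilities [ "8.6" "8.9" ] would
  # yield "8.6;8.9").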
  CUDA_HOME = "${cudaPackages.cudatoolkit}";
  TORCH_CUDA_ARCH_LIST = "${lib.concatStringsSep ";" torch.cudaCapabilities}";

  dependencies = [
    numpy
    torch
  ];

  meta = with lib; {
    homepage = "https://flashinfer.ai/";
    description = ''
      FlashInfer is a library and kernel generator for Large Language Models
      that provides high-performance implementations of LLM GPU kernels such as
      FlashAttention, PageAttention and LoRA. FlashInfer focuses on LLM serving
      and inference, and delivers state-of-the-art performance across diverse
      scenarios.
    '';
    license = licenses.asl20;
    maintainers = with maintainers; [ breakds ];
  };
}