# NOTE: At runtime, FlashInfer will fall back to PyTorch’s JIT compilation if a
# requested kernel wasn’t pre-compiled in AOT mode, and JIT compilation always
# requires the CUDA toolkit (via nvcc) to be available.
#
# This means that if you plan to use flashinfer, you will need to set the
# environment variable `CUDA_HOME` to `cudatoolkit`.
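#
# For illustration only (a minimal sketch; the attribute path and shell setup
# here are assumptions, not part of this package), a downstream development
# shell could provide that along these lines:
#
#   pkgs.mkShell {
#     packages = [ pkgs.python3Packages.flashinfer ];
#     CUDA_HOME = "${pkgs.cudaPackages.cudatoolkit}";
#   }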
{ lib,
  buildPythonPackage,
  symlinkJoin,
  fetchFromGitHub,
  setuptools,
  cmake,
  ninja,
  numpy,
  torch
}:

assert torch.cudaSupport;

let
  pname = "flashinfer";
  version = "0.2.5";

  inherit (torch) cudaPackages;
  inherit (cudaPackages) cudaMajorMinorVersion;

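  # e.g. a cudaMajorMinorVersion of "12.4" becomes "124"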
  cudaMajorMinorVersionString = lib.replaceStrings [ "." ] [ "" ] cudaMajorMinorVersion;

  src_cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    # Using the revision of the `cutlass` submodule inside flashinfer's `3rdparty`.
    rev = "df8a550d3917b0e97f416b2ed8c2d786f7f686a3";
    hash = "sha256-d4czDoEv0Focf1bJHOVGX4BDS/h5O7RPoM/RrujhgFQ=";
  };

in buildPythonPackage {
  inherit pname version;

  src = fetchFromGitHub {
    owner = "flashinfer-ai";
    repo = "flashinfer";
    tag = "v${version}";
    hash = "sha256-YrYfatkI9DQkFEEGiF8CK/bTafaNga4Ufyt+882C0bQ=";
  };

  build-system = [ setuptools ];

  nativeBuildInputs = [
    cmake
    ninja
    cudaPackages.cudatoolkit
  ];
  dontUseCmakeConfigure = true;

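  # fetchFromGitHub does not pull in git submodules here, so the empty
  # 3rdparty/cutlass directory is replaced with a symlink to the pinned
  # cutlass source fetched above.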
  postPatch = ''
    rmdir 3rdparty/cutlass
    ln -s ${src_cutlass} 3rdparty/cutlass
  '';

  # FlashInfer offers two installation modes:
  #
  # JIT mode: CUDA kernels are compiled at runtime using PyTorch’s JIT, with
  # compiled kernels cached for future use. JIT mode allows fast installation,
  # as no CUDA kernels are pre-compiled, making it ideal for development and
  # testing. The JIT version is also available as an sdist on PyPI.
  #
  # AOT mode: Core CUDA kernels are pre-compiled and included in the library,
  # reducing runtime compilation overhead. If a required kernel is not
  # pre-compiled, it will be compiled at runtime using JIT. AOT mode is
  # recommended for production environments.
  #
  # Here we opt for the AOT version.
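  #
  # A JIT-only variant could in principle be obtained by overriding the
  # preConfigure phase below so that FLASHINFER_ENABLE_AOT is not set, for
  # example (an untested sketch, not a supported variant):
  #
  #   flashinfer.overridePythonAttrs (old: { preConfigure = ""; })
  #
  # Note that the runtime `CUDA_HOME` requirement described at the top of
  # this file still applies in that case.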
  preConfigure = ''
    export FLASHINFER_ENABLE_AOT=1
    export TORCH_NVCC_FLAGS="--maxrregcount=64"
  '';

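  # CUDA_HOME points the build at nvcc and the rest of the toolkit, while
  # TORCH_CUDA_ARCH_LIST limits kernel compilation to the GPU architectures
  # this torch build targets (for example, capabilities [ "8.6" "8.9" ] would
  # yield "8.6;8.9").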
  CUDA_HOME = "${cudaPackages.cudatoolkit}";
  TORCH_CUDA_ARCH_LIST = "${lib.concatStringsSep ";" torch.cudaCapabilities}";

  dependencies = [
    numpy
    torch
  ];

  meta = with lib; {
    homepage = "https://flashinfer.ai/";
    description = ''
      FlashInfer is a library and kernel generator for Large Language Models
      that provides high-performance implementations of LLM GPU kernels such as
      FlashAttention, PageAttention and LoRA. FlashInfer focuses on LLM serving
      and inference, and delivers state-of-the-art performance across diverse
      scenarios.
    '';
    license = licenses.asl20;
    maintainers = with maintainers; [ breakds ];
  };
}