diff --git a/nixos/tests/slurm.nix b/nixos/tests/slurm.nix
index d0e62d15437c..a54c5d9db482 100644
--- a/nixos/tests/slurm.nix
+++ b/nixos/tests/slurm.nix
@@ -1,16 +1,52 @@
-import ./make-test-python.nix ({ lib, ... }:
+import ./make-test-python.nix ({ lib, pkgs, ... }:
 let
-    mungekey = "mungeverryweakkeybuteasytointegratoinatest";
-
     slurmconfig = {
-      controlMachine = "control";
-      nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];
-      partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];
-      extraConfig = ''
-        AccountingStorageHost=dbd
-        AccountingStorageType=accounting_storage/slurmdbd
-      '';
+      services.slurm = {
+        controlMachine = "control";
+        nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];
+        partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];
+        extraConfig = ''
+          AccountingStorageHost=dbd
+          AccountingStorageType=accounting_storage/slurmdbd
+        '';
+      };
+      environment.systemPackages = [ mpitest ];
+      networking.firewall.enable = false;
+      systemd.tmpfiles.rules = [
+        "f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
+      ];
     };
+
+    mpitest = let
+      mpitestC = pkgs.writeText "mpitest.c" ''
+        #include <stdio.h>
+        #include <stdlib.h>
+        #include <mpi.h>
+
+        int
+        main (int argc, char *argv[])
+        {
+          int rank, size, length;
+          char name[512];
+
+          MPI_Init (&argc, &argv);
+          MPI_Comm_rank (MPI_COMM_WORLD, &rank);
+          MPI_Comm_size (MPI_COMM_WORLD, &size);
+          MPI_Get_processor_name (name, &length);
+
+          if ( rank == 0 ) printf("size=%d\n", size);
+
+          printf ("%s: hello world from process %d of %d\n", name, rank, size);
+
+          MPI_Finalize ();
+
+          return EXIT_SUCCESS;
+        }
+      '';
+    in pkgs.runCommandNoCC "mpitest" {} ''
+      mkdir -p $out/bin
+      ${pkgs.openmpi}/bin/mpicc ${mpitestC} -o $out/bin/mpitest
+    '';
 in {
   name = "slurm";
 
@@ -21,37 +57,40 @@ in {
     computeNode =
       { ...}:
       {
+        imports = [ slurmconfig ];
         # TODO slurmd port and slurmctld port should be configurations and
         # automatically allowed by the firewall.
-        networking.firewall.enable = false;
         services.slurm = {
           client.enable = true;
-        } // slurmconfig;
+        };
       };
     in {
 
     control =
       { ...}:
       {
-        networking.firewall.enable = false;
+        imports = [ slurmconfig ];
         services.slurm = {
           server.enable = true;
-        } // slurmconfig;
+        };
       };
 
     submit =
       { ...}:
       {
-        networking.firewall.enable = false;
+        imports = [ slurmconfig ];
         services.slurm = {
           enableStools = true;
-        } // slurmconfig;
+        };
       };
 
     dbd =
      { pkgs, ... } :
      {
        networking.firewall.enable = false;
+        systemd.tmpfiles.rules = [
+          "f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
+        ];
        services.slurm.dbdserver = {
          enable = true;
          storagePass = "password123";
@@ -87,24 +126,7 @@
   ''
   start_all()
 
-  # Set up authentification across the cluster
-  for node in [submit, control, dbd, node1, node2, node3]:
-
-      node.wait_for_unit("default.target")
-
-      node.succeed("mkdir /etc/munge")
-      node.succeed(
-          "echo '${mungekey}' > /etc/munge/munge.key"
-      )
-      node.succeed("chmod 0400 /etc/munge/munge.key")
-      node.succeed("chown munge:munge /etc/munge/munge.key")
-      node.succeed("systemctl restart munged")
-
-      node.wait_for_unit("munged")
-
-
-  # Restart the services since they have probably failed due to the munge init
-  # failure
+  # Make sure DBD is up after DB initialization
   with subtest("can_start_slurmdbd"):
       dbd.succeed("systemctl restart slurmdbd")
       dbd.wait_for_unit("slurmdbd.service")
@@ -137,5 +159,8 @@ in {
       # find the srun job from above in the database
       control.succeed("sleep 5")
       control.succeed("sacct | grep hostname")
+
+  with subtest("run_PMIx_mpitest"):
+      submit.succeed("srun -N 3 --mpi=pmix mpitest | grep size=3")
   '';
 })
diff --git a/pkgs/development/libraries/openmpi/default.nix b/pkgs/development/libraries/openmpi/default.nix
index 5c185f630d98..13f633ac0df0 100644
--- a/pkgs/development/libraries/openmpi/default.nix
+++ b/pkgs/development/libraries/openmpi/default.nix
@@ -1,6 +1,6 @@
 { stdenv, fetchurl, fetchpatch, gfortran, perl, libnl
 , rdma-core, zlib, numactl, libevent, hwloc, targetPackages, symlinkJoin
-, libpsm2, libfabric
+, libpsm2, libfabric, pmix
 
 # Enable CUDA support
 , cudaSupport ? false, cudatoolkit ? null
@@ -46,7 +46,7 @@ in stdenv.mkDerivation rec {
   '';
 
   buildInputs = with stdenv; [ gfortran zlib ]
-    ++ lib.optionals isLinux [ libnl numactl ]
+    ++ lib.optionals isLinux [ libnl numactl pmix ]
    ++ lib.optionals cudaSupport [ cudatoolkit ]
    ++ [ libevent hwloc ]
    ++ lib.optional (isLinux || isFreeBSD) rdma-core
@@ -55,8 +55,11 @@ in stdenv.mkDerivation rec {
   nativeBuildInputs = [ perl ];
 
   configureFlags = with stdenv; lib.optional (!cudaSupport) "--disable-mca-dso"
-    ++ lib.optional isLinux "--with-libnl=${libnl.dev}"
-    ++ lib.optional enableSGE "--with-sge"
+    ++ lib.optionals isLinux [
+      "--with-libnl=${libnl.dev}"
+      "--with-pmix=${pmix}"
+      "--with-pmix-libdir=${pmix}/lib"
+    ] ++ lib.optional enableSGE "--with-sge"
    ++ lib.optional enablePrefix "--enable-mpirun-prefix-by-default"
    # TODO: add UCX support, which is recommended to use with cuda for the most robust OpenMPI build
    # https://github.com/openucx/ucx
diff --git a/pkgs/development/libraries/pmix/default.nix b/pkgs/development/libraries/pmix/default.nix
new file mode 100644
index 000000000000..c8d8be4e8cbf
--- /dev/null
+++ b/pkgs/development/libraries/pmix/default.nix
@@ -0,0 +1,48 @@
+{ stdenv, fetchFromGitHub, perl, autoconf, automake
+, libtool, flex, libevent, hwloc, munge, zlib
+} :
+
+let
+  version = "3.1.5";
+
+in stdenv.mkDerivation {
+  pname = "pmix";
+  inherit version;
+
+  src = fetchFromGitHub {
+    repo = "openpmix";
+    owner = "openpmix";
+    rev = "v${version}";
+    sha256 = "0fvfsig20amcigyn4v3gcdxc0jif44vqg37b8zzh0s8jqqj7jz5w";
+  };
+
+  postPatch = ''
+    patchShebangs ./autogen.pl
+    patchShebangs ./config
+  '';
+
+  nativeBuildInputs = [ perl autoconf automake libtool flex ];
+
+  buildInputs = [ libevent hwloc munge zlib ];
+
+  configureFlags = [
+    "--with-libevent=${libevent.dev}"
+    "--with-munge=${munge}"
+    "--with-hwloc=${hwloc.dev}"
+  ];
+
+  preConfigure = ''
+    ./autogen.pl
+  '';
+
+  enableParallelBuilding = true;
+
+  meta = with stdenv.lib; {
+    description = "Process Management Interface for HPC environments";
+    homepage = "https://openpmix.github.io/";
+    license = licenses.bsd3;
+    maintainers = [ maintainers.markuskowa ];
+    platforms = platforms.linux;
+  };
+}
+
diff --git a/pkgs/servers/computing/slurm/default.nix b/pkgs/servers/computing/slurm/default.nix
index 1ec807f0bf82..d93c13719597 100644
--- a/pkgs/servers/computing/slurm/default.nix
+++ b/pkgs/servers/computing/slurm/default.nix
@@ -2,6 +2,7 @@
 , python, munge, perl, pam, zlib, shadow, coreutils
 , ncurses, libmysqlclient, gtk2, lua, hwloc, numactl
 , readline, freeipmi, xorg, lz4, rdma-core, nixosTests
+, pmix
 # enable internal X11 support via libssh2
 , enableX11 ? true
 }:
@@ -26,6 +27,8 @@ stdenv.mkDerivation rec {
    # increase string length to allow for full
    # path of 'echo' in nix store
    ./common-env-echo.patch
+    # Required for configure to pick up the right dlopen path
+    ./pmix-configure.patch
  ];
 
  prePatch = ''
@@ -46,6 +49,7 @@ stdenv.mkDerivation rec {
    curl python munge perl pam zlib
    libmysqlclient ncurses gtk2 lz4 rdma-core
    lua hwloc numactl readline freeipmi shadow.su
+    pmix
  ] ++ stdenv.lib.optionals enableX11 [ xorg.xauth ];
 
  configureFlags = with stdenv.lib;
@@ -56,6 +60,7 @@ stdenv.mkDerivation rec {
    "--with-zlib=${zlib}"
    "--with-ofed=${rdma-core}"
    "--sysconfdir=/etc/slurm"
+    "--with-pmix=${pmix}"
  ] ++ (optional (gtk2 == null) "--disable-gtktest")
    ++ (optional (!enableX11) "--disable-x11");
 
diff --git a/pkgs/servers/computing/slurm/pmix-configure.patch b/pkgs/servers/computing/slurm/pmix-configure.patch
new file mode 100644
index 000000000000..21c2197c3ff1
--- /dev/null
+++ b/pkgs/servers/computing/slurm/pmix-configure.patch
@@ -0,0 +1,13 @@
+diff --git a/configure b/configure
+index 1cf53bc..ab68441 100755
+--- a/configure
++++ b/configure
+@@ -21207,7 +21207,7 @@ rm -f conftest.err conftest.i conftest.$ac_ext
+      as_fn_error $? "error processing $x_ac_cv_pmix_libdir: PMIx v3.x was already found in one of the previous paths" "$LINENO" 5
+    fi
+    _x_ac_pmix_v3_found="1"
+-    PMIX_V3_CPPFLAGS="-I$x_ac_cv_pmix_dir/include"
++    PMIX_V3_CPPFLAGS="-I$x_ac_cv_pmix_dir/include -DPMIXP_V3_LIBPATH=\\\"$x_ac_cv_pmix_libdir\\\""
+    if test "$ac_with_rpath" = "yes"; then
+      PMIX_V3_LDFLAGS="-Wl,-rpath -Wl,$x_ac_cv_pmix_libdir -L$x_ac_cv_pmix_libdir"
+    else
diff --git a/pkgs/top-level/all-packages.nix b/pkgs/top-level/all-packages.nix
index 5dfddd472871..928669e3ea7d 100644
--- a/pkgs/top-level/all-packages.nix
+++ b/pkgs/top-level/all-packages.nix
@@ -6190,6 +6190,8 @@ in
 
   pmacct = callPackage ../tools/networking/pmacct { };
 
+  pmix = callPackage ../development/libraries/pmix { };
+
   polygraph = callPackage ../tools/networking/polygraph { };
 
   progress = callPackage ../tools/misc/progress { };
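Not part of the patch: a minimal sketch of how these changes could be exercised from the root of a nixpkgs checkout, assuming the attribute names introduced above (pmix in all-packages.nix, the test reachable as nixosTests.slurm) and a machine able to run the NixOS VM tests.

  # build the new PMIx library plus the PMIx-enabled OpenMPI and Slurm
  nix-build -A pmix -A openmpi -A slurm

  # run the NixOS VM test, which now includes the run_PMIx_mpitest subtest
  nix-build -A nixosTests.slurm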