# Invocation command line:
# /home/HPC2021v1.1.7/bin/harness/runhpc --reportable --configfile nv2.cfg --tune base --define ucx --define model=acc --pmodel ACC --threads 1 --ranks 2 --size ref --iterations 3 --tune base,peak --size ref --nopower --runmode speed --tune base:peak --size ref tiny
# output_root was not used for this run
############################################################################
######################################################################
# The header section of the config file. Must appear
# before any instances of "section markers" (see below)
#
# ext = how the binaries you generated will be identified
# tune = specify "base" or "peak" or "all"
label                = %{label}_%{model}
tune                 = all
output_format        = all
use_submit_for_speed = 1

# Compiler Settings
default:
CC  = mpicc
CXX = mpicxx
FC  = mpif90

system_class   = Homogeneous Cluster
sw_compiler    = Nvidia HPC SDK 22.11
sw_mpi_library = Open MPI 4.0.5
sw_mpi_other   = None
test_sponsor   = Lenovo Global Technology
license_num    = 28
tester         = Lenovo Global Technology
hw_avail       = Feb-2023
sw_avail       = Feb-2023
prepared_by    = Lenovo Global Technology
system_vendor  = Lenovo Global Technology
system_name    = ThinkSystem SR655 V3 (AMD EPYC 9654P, Nvidia H100-PCIE-80G)
node_compute_syslbl             = ThinkSystem SR655 V3
node_compute_sw_state           = Multi-user, run level 3
node_compute_sw_sharedfile      = XFS
node_compute_sw_other           = None
node_compute_sw_localfile       = xfs
node_compute_sw_accel_driver    = 525.60.13
node_compute_purpose            = compute
node_compute_order              = 1
node_compute_hw_vendor          = Lenovo Global Technology
node_compute_hw_tcache          = 384 MB I+D on chip per chip
node_compute_hw_scache          = 1 MB I+D on chip per core
node_compute_hw_pcache          = 32 KB I + 32 KB D on chip per core
node_compute_hw_other           = None
node_compute_hw_ocache          = None
node_compute_hw_nthreadspercore = 1
node_compute_hw_ncpuorder       = 1 chips
node_compute_hw_ncoresperchip   = 96
node_compute_hw_ncores          = 96
node_compute_hw_nchips          = 1
node_compute_hw_model           = ThinkSystem SR655 V3
node_compute_hw_memory          = 384 GB (12 x 16 GB 2Rx4 PC5-4800B-R)
node_compute_hw_disk            = 1x ThinkSystem 2.5" 5300 480GB SSD
node_compute_hw_cpu_name        = AMD EPYC 9654P
node_compute_hw_cpu_mhz         = 2400
node_compute_hw_cpu_char        = Max Boost Clock up to 3.7 GHz
node_compute_hw_adapter_fs_slot_type    = PCI-Express 5.0 x16
node_compute_hw_adapter_fs_ports_used   = 1
node_compute_hw_adapter_fs_model        = Mellanox ConnectX-6 HDR
node_compute_hw_adapter_fs_interconnect = Nvidia Mellanox ConnectX-6 HDR
node_compute_hw_adapter_fs_firmware     = 20.28.1002
node_compute_hw_adapter_fs_driver       = 5.2-1.0.4
node_compute_hw_adapter_fs_data_rate    = 200 Gb/s
node_compute_hw_adapter_fs_count        = 1
node_compute_hw_accel_vendor    = Nvidia Corporation
node_compute_hw_accel_type      = GPU
node_compute_hw_accel_model     = Tesla H100 PCIe 80GB
node_compute_hw_accel_ecc       = Yes
node_compute_hw_accel_desc      = Nvidia Tesla H100 PCIe 80GB
node_compute_hw_accel_count     = 8
node_compute_hw_accel_connect   = PCIe Gen5 x16
node_compute_count              = 1

# Compiler Version Flags
CC_VERSION_OPTION  = -V
CXX_VERSION_OPTION = -V
FC_VERSION_OPTION  = -V

%ifdef %{ucx}
# If using Open MPI with UCX support, these settings are needed for CUDA-aware MPI.
# Without them, LBM is known to hang when offloading to GPUs with OpenACC or OpenMP target.
preENV_UCX_MEMTYPE_CACHE=n
preENV_UCX_TLS=self,shm,cuda_copy
%endif

MPIRUN_OPTS = --allow-run-as-root --bind-to none
submit = mpirun --allow-run-as-root -x UCX_MEMTYPE_CACHE=n -np $ranks perl $[top]/bind2.pl $command
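# For reference, below is a consolidated sketch of the bind2.pl helper invoked by the
# 'submit' line above, reconstructed from the fragments quoted in the notes at the end
# of this file. The core ranges are copied from those notes; the shebang, pragmas,
# comments, and the shifted exit status are editorial additions, so treat this as an
# illustration rather than the exact script used for the run.
#
#   #!/usr/bin/perl
#   use strict;
#   use warnings;
#
#   # Core ranges chosen so that each local rank runs on cores in the same
#   # NUMA domain as the GPU it drives (values taken from the notes below).
#   my %bind;
#   $bind{0} = "1-3";
#   $bind{1} = "144-146";
#   $bind{2} = "8-10";
#   $bind{3} = "11-14";
#   $bind{4} = "41-43";
#   $bind{5} = "44-47";
#   $bind{6} = "61-63";
#   $bind{7} = "64-67";
#
#   # Open MPI exports the rank's index on this node in OMPI_COMM_WORLD_LOCAL_RANK.
#   my $rank = $ENV{OMPI_COMM_WORLD_LOCAL_RANK};
#
#   # Prefix the benchmark command (passed as arguments) with 'taskset -c <cores>'
#   # so the rank is pinned to its assigned cores, then run it.
#   my $cmd = "taskset -c $bind{$rank} ";
#   while (my $arg = shift @ARGV) {
#       $cmd .= "$arg ";
#   }
#   my $rc = system($cmd);
#
#   # The notes quote 'exit($rc)'; shifting by 8 propagates the child's real exit code.
#   exit($rc >> 8);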
# Optimization
default:
pmodel=ACC

default=base=default:
   ranks          = %{RANKS}
   OPTIMIZE       = -w -fast -acc=gpu -Mfprelaxed -Mnouniform -Mstack_arrays -DSPEC_ACCEL_AWARE_MPI
   CXXPORTABILITY = --c++17

505.lbm_t,605.lbm_s,705.lbm_m,805.lbm_l:
   PORTABILITY += -DSPEC_OPENACC_NO_SELF

505.lbm_t=peak=default:
   ranks    = %{RANKS}
   OPTIMIZE = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -DSPEC_ACCEL_AWARE_MPI
   PORTABILITY += -DSPEC_OPENACC_NO_SELF

513.soma_t=peak=default:
   basepeak=1

518.tealeaf_t=peak=default:
   ranks    = %{RANKS}
   OPTIMIZE = -w -fast -acc=gpu -Msafeptr -DSPEC_ACCEL_AWARE_MPI

519.clvleaf_t=peak=default:
   basepeak=1

521.miniswp_t=peak=default:
   #ranks    = %{RANKS}
   #OPTIMIZE = -w -fast -acc=gpu -gpu=pinned # -DSPEC_ACCEL_AWARE_MPI
   basepeak=1

528.pot3d_t=peak=default:
   basepeak=1

532.sph_exa_t=peak=default:
   ranks    = %{RANKS}
   OPTIMIZE = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -Mstack_arrays -static-nvidia -DSPEC_ACCEL_AWARE_MPI

534.hpgmgfv_t=peak=default:
   ranks    = %{RANKS}
   OPTIMIZE = -w -fast -acc=gpu -static-nvidia -DSPEC_ACCEL_AWARE_MPI

535.weather_t=peak=default:
   ranks    = %{RANKS}
   OPTIMIZE = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -Mstack_arrays -static-nvidia -DSPEC_ACCEL_AWARE_MPI

605.lbm_s=peak=default:
   ranks    = %{RANKS}
   OPTIMIZE = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -DSPEC_ACCEL_AWARE_MPI

613.soma_s=peak=default:
   basepeak=1

618.tealeaf_s=peak=default:
   ranks    = %{RANKS}
   OPTIMIZE = -w -fast -acc=gpu -Msafeptr -DSPEC_ACCEL_AWARE_MPI

619.clvleaf_s=peak=default:
   basepeak=1

621.miniswp_s=peak=default:
   ranks    = %{RANKS}
   OPTIMIZE = -w -fast -acc=gpu -gpu=pinned # -DSPEC_ACCEL_AWARE_MPI

628.pot3d_s=peak=default:
   basepeak=1

632.sph_exa_s=peak=default:
   ranks    = %{RANKS}
   OPTIMIZE = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -Mstack_arrays -static-nvidia -DSPEC_ACCEL_AWARE_MPI

634.hpgmgfv_s=peak=default:
   ranks    = %{RANKS}
   OPTIMIZE = -w -fast -acc=gpu -static-nvidia -DSPEC_ACCEL_AWARE_MPI

635.weather_s=peak=default:
   ranks    = %{RANKS}
   OPTIMIZE = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -Mstack_arrays -static-nvidia -DSPEC_ACCEL_AWARE_MPI

# The following section was added automatically, and contains settings that
# did not appear in the original configuration file, but were added to the
# raw file after the run.
default:
notes_000 =Environment variables set by runhpc before the start of the run:
notes_005 =UCX_MEMTYPE_CACHE = "n"
notes_010 =UCX_TLS = "self,shm,cuda_copy"
notes_015 =
flagsurl000 = http://www.spec.org/hpc2021/flags/nv2021_flags_v1.0.3.xml
node_compute_sw_os000 = Red Hat Enterprise Linux Server release 9,
node_compute_sw_os001 = Kernel 5.14.0-70.22.1.el9_0.x86_64
notes_submit_000 =Individual ranks were bound to the CPU cores on the same NUMA node as
notes_submit_005 =the GPU using 'taskset' within the following "bind2.pl" perl script:
notes_submit_010 =---- Start bind2.pl ------
notes_submit_015 =my %bind;
notes_submit_020 =$bind{0} = "1-3";
notes_submit_025 =$bind{1} = "144-146";
notes_submit_030 =$bind{2} = "8-10";
notes_submit_035 =$bind{3} = "11-14";
notes_submit_040 =$bind{4} = "41-43";
notes_submit_045 =$bind{5} = "44-47";
notes_submit_050 =$bind{6} = "61-63";
notes_submit_055 =$bind{7} = "64-67";
notes_submit_060 =my $rank = $ENV{OMPI_COMM_WORLD_LOCAL_RANK};
notes_submit_065 =my $cmd = "taskset -c $bind{$rank} ";
notes_submit_070 =while (my $arg = shift) {
notes_submit_075 =   $cmd .= "$arg ";
notes_submit_080 =}
notes_submit_085 =my $rc = system($cmd);
notes_submit_090 =exit($rc);
notes_submit_095 =---- End bind2.pl ------
notes_submit_100 =The config file option 'submit' was used.
notes_submit_105 =submit = mpirun --allow-run-as-root -x UCX_MEMTYPE_CACHE=n
notes_submit_110 =-host localhost:2 -np $ranks perl $[top]/bind2.pl $command