# Invocation command line:
# /home/HPC2021v1.1.7/bin/harness/runhpc --reportable --configfile nv2.cfg --tune base --define ucx --define model=acc --pmodel ACC --threads 1 --ranks 2 --size ref --iterations 3 --tune base,peak --size ref --nopower --runmode speed --tune base:peak --size ref tiny
# output_root was not used for this run
############################################################################
######################################################################
# The header section of the config file. Must appear
# before any instances of "section markers" (see below)
#
# ext = how the binaries you generated will be identified
# tune = specify "base" or "peak" or "all"
label                = %{label}_%{model}
tune                 = all
output_format        = all
use_submit_for_speed = 1

# Compiler Settings
default:
CC  = mpicc
CXX = mpicxx
FC  = mpif90

system_class   = Homogeneous Cluster
sw_compiler    = Nvidia HPC SDK 22.11
sw_mpi_library = Open MPI 4.0.5
sw_mpi_other   = None
test_sponsor   = Lenovo Global Technology
license_num    = 28
tester         = Lenovo Global Technology
hw_avail       = Feb-2023
sw_avail       = Feb-2023
prepared_by    = Lenovo Global Technology
system_vendor  = Lenovo Global Technology
system_name    = ThinkSystem SR655 V3 (AMD EPYC 9654P, Nvidia H100-PCIE-80G)
node_compute_syslbl             = ThinkSystem SR655 V3
node_compute_sw_state           = Multi-user, run level 3
node_compute_sw_sharedfile      = XFS
node_compute_sw_other           = None
node_compute_sw_localfile       = xfs
node_compute_sw_accel_driver    = 525.60.13
node_compute_purpose            = compute
node_compute_order              = 1
node_compute_hw_vendor          = Lenovo Global Technology
node_compute_hw_tcache          = 384 MB I+D on chip per chip
node_compute_hw_scache          = 1 MB I+D on chip per core
node_compute_hw_pcache          = 32 KB I + 32 KB D on chip per core
node_compute_hw_other           = None
node_compute_hw_ocache          = None
node_compute_hw_nthreadspercore = 1
node_compute_hw_ncpuorder       = 1 chips
node_compute_hw_ncoresperchip   = 96
node_compute_hw_ncores          = 96
node_compute_hw_nchips          = 1
node_compute_hw_model           = ThinkSystem SR655 V3
node_compute_hw_memory          = 384 GB (12 x 16 GB 2Rx4 PC5-4800B-R)
node_compute_hw_disk            = 1x ThinkSystem 2.5" 5300 480GB SSD
node_compute_hw_cpu_name        = AMD EPYC 9654P
node_compute_hw_cpu_mhz         = 2400
node_compute_hw_cpu_char        = Max Boost Clock up to 3.7 GHz
node_compute_hw_adapter_fs_slot_type    = PCI-Express 5.0 x16
node_compute_hw_adapter_fs_ports_used   = 1
node_compute_hw_adapter_fs_model        = Mellanox ConnectX-6 HDR
node_compute_hw_adapter_fs_interconnect = Nvidia Mellanox ConnectX-6 HDR
node_compute_hw_adapter_fs_firmware     = 20.28.1002
node_compute_hw_adapter_fs_driver       = 5.2-1.0.4
node_compute_hw_adapter_fs_data_rate    = 200 Gb/s
node_compute_hw_adapter_fs_count        = 1
node_compute_hw_accel_vendor    = Nvidia Corporation
node_compute_hw_accel_type      = GPU
node_compute_hw_accel_model     = Tesla H100 PCIe 80GB
node_compute_hw_accel_ecc       = Yes
node_compute_hw_accel_desc      = Nvidia Tesla H100 PCIe 80GB
node_compute_hw_accel_count     = 8
node_compute_hw_accel_connect   = PCIe Gen5 x16
node_compute_count              = 1

# Compiler Version Flags
CC_VERSION_OPTION  = -V
CXX_VERSION_OPTION = -V
FC_VERSION_OPTION  = -V

%ifdef %{ucx}
# If using Open MPI with UCX support, these settings are needed for CUDA-aware MPI.
# Without them, LBM is known to hang when offloading to GPUs with OpenACC or OpenMP target.
preENV_UCX_MEMTYPE_CACHE=n
preENV_UCX_TLS=self,shm,cuda_copy
%endif

MPIRUN_OPTS = --allow-run-as-root --bind-to none
submit = mpirun --allow-run-as-root -x UCX_MEMTYPE_CACHE=n -np $ranks perl $[top]/bind2.pl $command
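# For reference, below is a consolidated sketch of the bind2.pl helper invoked by the
# 'submit' line above, reconstructed from the fragments quoted in the notes at the end
# of this file. The core ranges are copied from those notes; the shebang, pragmas,
# comments, and the shifted exit status are editorial additions, so treat this as an
# illustration rather than the exact script used for the run.
#
#   #!/usr/bin/perl
#   use strict;
#   use warnings;
#
#   # Core ranges chosen so that each local rank runs on cores in the same
#   # NUMA domain as the GPU it drives (values taken from the notes below).
#   my %bind;
#   $bind{0} = "1-3";
#   $bind{1} = "144-146";
#   $bind{2} = "8-10";
#   $bind{3} = "11-14";
#   $bind{4} = "41-43";
#   $bind{5} = "44-47";
#   $bind{6} = "61-63";
#   $bind{7} = "64-67";
#
#   # Open MPI exports the rank's index on this node in OMPI_COMM_WORLD_LOCAL_RANK.
#   my $rank = $ENV{OMPI_COMM_WORLD_LOCAL_RANK};
#
#   # Prefix the benchmark command (passed as arguments) with 'taskset -c <cores>'
#   # so the rank is pinned to its assigned cores, then run it.
#   my $cmd = "taskset -c $bind{$rank} ";
#   while (my $arg = shift @ARGV) {
#       $cmd .= "$arg ";
#   }
#   my $rc = system($cmd);
#
#   # The notes quote 'exit($rc)'; shifting by 8 propagates the child's real exit code.
#   exit($rc >> 8);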
# Optimization
default:
pmodel=ACC

default=base=default:
   ranks          = %{RANKS}
   OPTIMIZE       = -w -fast -acc=gpu -Mfprelaxed -Mnouniform -Mstack_arrays -DSPEC_ACCEL_AWARE_MPI
   CXXPORTABILITY = --c++17

505.lbm_t,605.lbm_s,705.lbm_m,805.lbm_l:
   PORTABILITY += -DSPEC_OPENACC_NO_SELF

505.lbm_t=peak=default:
   ranks    = %{RANKS}
   OPTIMIZE = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -DSPEC_ACCEL_AWARE_MPI
   PORTABILITY += -DSPEC_OPENACC_NO_SELF

513.soma_t=peak=default:
   basepeak=1

518.tealeaf_t=peak=default:
   ranks    = %{RANKS}
   OPTIMIZE = -w -fast -acc=gpu -Msafeptr -DSPEC_ACCEL_AWARE_MPI

519.clvleaf_t=peak=default:
   basepeak=1

521.miniswp_t=peak=default:
   #ranks    = %{RANKS}
   #OPTIMIZE = -w -fast -acc=gpu -gpu=pinned # -DSPEC_ACCEL_AWARE_MPI
   basepeak=1

528.pot3d_t=peak=default:
   basepeak=1

532.sph_exa_t=peak=default:
   ranks    = %{RANKS}
   OPTIMIZE = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -Mstack_arrays -static-nvidia -DSPEC_ACCEL_AWARE_MPI

534.hpgmgfv_t=peak=default:
   ranks    = %{RANKS}
   OPTIMIZE = -w -fast -acc=gpu -static-nvidia -DSPEC_ACCEL_AWARE_MPI

535.weather_t=peak=default:
   ranks    = %{RANKS}
   OPTIMIZE = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -Mstack_arrays -static-nvidia -DSPEC_ACCEL_AWARE_MPI

605.lbm_s=peak=default:
   ranks    = %{RANKS}
   OPTIMIZE = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -DSPEC_ACCEL_AWARE_MPI

613.soma_s=peak=default:
   basepeak=1

618.tealeaf_s=peak=default:
   ranks    = %{RANKS}
   OPTIMIZE = -w -fast -acc=gpu -Msafeptr -DSPEC_ACCEL_AWARE_MPI

619.clvleaf_s=peak=default:
   basepeak=1

621.miniswp_s=peak=default:
   ranks    = %{RANKS}
   OPTIMIZE = -w -fast -acc=gpu -gpu=pinned # -DSPEC_ACCEL_AWARE_MPI

628.pot3d_s=peak=default:
   basepeak=1

632.sph_exa_s=peak=default:
   ranks    = %{RANKS}
   OPTIMIZE = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -Mstack_arrays -static-nvidia -DSPEC_ACCEL_AWARE_MPI

634.hpgmgfv_s=peak=default:
   ranks    = %{RANKS}
   OPTIMIZE = -w -fast -acc=gpu -static-nvidia -DSPEC_ACCEL_AWARE_MPI

635.weather_s=peak=default:
   ranks    = %{RANKS}
   OPTIMIZE = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -Mstack_arrays -static-nvidia -DSPEC_ACCEL_AWARE_MPI

# The following section was added automatically, and contains settings that
# did not appear in the original configuration file, but were added to the
# raw file after the run.
default:
notes_000 =Environment variables set by runhpc before the start of the run:
notes_005 =UCX_MEMTYPE_CACHE = "n"
notes_010 =UCX_TLS = "self,shm,cuda_copy"
notes_015 =
flagsurl000 = http://www.spec.org/hpc2021/flags/nv2021_flags_v1.0.3.xml
node_compute_sw_os000 = Red Hat Enterprise Linux Server release 9,
node_compute_sw_os001 = Kernel 5.14.0-70.22.1.el9_0.x86_64
notes_submit_000 =Individual ranks were bound to the CPU cores on the same NUMA node as
notes_submit_005 =the GPU using 'taskset' within the following "bind2.pl" perl script:
notes_submit_010 =---- Start bind2.pl ------
notes_submit_015 =my %bind;
notes_submit_020 =$bind{0} = "1-3";
notes_submit_025 =$bind{1} = "144-146";
notes_submit_030 =$bind{2} = "8-10";
notes_submit_035 =$bind{3} = "11-14";
notes_submit_040 =$bind{4} = "41-43";
notes_submit_045 =$bind{5} = "44-47";
notes_submit_050 =$bind{6} = "61-63";
notes_submit_055 =$bind{7} = "64-67";
notes_submit_060 =my $rank = $ENV{OMPI_COMM_WORLD_LOCAL_RANK};
notes_submit_065 =my $cmd = "taskset -c $bind{$rank} ";
notes_submit_070 =while (my $arg = shift) {
notes_submit_075 =   $cmd .= "$arg ";
notes_submit_080 =}
notes_submit_085 =my $rc = system($cmd);
notes_submit_090 =exit($rc);
notes_submit_095 =---- End bind2.pl ------
notes_submit_100 =The config file option 'submit' was used.
notes_submit_105 =submit = mpirun --allow-run-as-root -x UCX_MEMTYPE_CACHE=n
notes_submit_110 =-host localhost:2 -np $ranks perl $[top]/bind2.pl $command