# Invocation command line:
# /local/home/mcolgrove/HPC2021/bin/harness/runhpc -c nvhpc_acc --reportable tiny
# output_root was not used for this run
############################################################################
teeout = yes
makeflags = -j 40
flagsurl000 = http://www.spec.org/hpc2021/flags/nv2021_flags_v1.0.3.xml
envars = 1
license_num = 019
test_sponsor = NVIDIA Corporation
tester = NVIDIA Corporation

######################################################
# SUT Section
######################################################
#include: snow.inc
# ----- Begin inclusion of 'snow.inc'
############################################################################
######################################################
# Example configuration information for a
# system under test (SUT) Section
######################################################

# General SUT info
system_vendor = GIGA-BYTE TECHNOLOGY CO., LTD
system_name = GIGA-BYTE G242-P31 (Ampere Altra Q80-33, Tesla A100-PCIE-40GB)
node_compute_sw_accel_driver = NVIDIA UNIX aarch64 Kernel Module 460.32.03
hw_avail = Jun-2021
sw_avail = Sep-2021
prepared_by = Mathew Colgrove (mcolgrove@nvidia.com)

# Computation node info
# [Node_Description: Hardware]
node_compute_syslbl = Ampere Altra
node_compute_order = 1
node_compute_count = 1
node_compute_purpose = compute
node_compute_hw_vendor = GIGA-BYTE TECHNOLOGY CO., LTD
node_compute_hw_model = G242-P31
node_compute_hw_cpu_name = Ampere Altra Q80-33
node_compute_hw_ncpuorder = 1 chip
node_compute_hw_nchips = 1
node_compute_hw_ncores = 80
node_compute_hw_ncoresperchip = 80
node_compute_hw_nthreadspercore = 1
node_compute_hw_cpu_char = Max Frequency 3300 MHz
node_compute_hw_cpu_mhz = 3000
node_compute_hw_pcache = 64 KB I + 64 KB D on chip per core
node_compute_hw_scache = 1 MB I+D on chip per core
node_compute_hw_tcache = 32 MB I+D on chip per chip
node_compute_hw_ocache = None
node_compute_hw_memory = 256 GB (16 x 16 GB 2Rx8 PC4-3200AA-R)
node_compute_hw_disk = 1 x 960 GB, NVMe, M.2, PCIe Gen3
node_compute_hw_other = None

# [Node_Description: Accelerator]
node_compute_hw_accel_model = Tesla A100-PCIE-40GB
node_compute_hw_accel_count = 2
node_compute_hw_accel_vendor = NVIDIA Corporation
node_compute_hw_accel_type = GPU
node_compute_hw_accel_connect = PCIe 3.0 16x
node_compute_hw_accel_ecc = Yes
node_compute_hw_accel_desc = See Notes

# [Node_Description: Software]
node_compute_hw_adapter_fs_model = None
node_compute_hw_adapter_fs_count = 0
node_compute_hw_adapter_fs_slot_type = None
node_compute_hw_adapter_fs_data_rate = None
node_compute_hw_adapter_fs_ports_used = 0
node_compute_hw_adapter_fs_interconnect = None
node_compute_hw_adapter_fs_driver = None
node_compute_hw_adapter_fs_firmware = None
node_compute_sw_os = CentOS 8.3-2011
node_compute_sw_localfile = xfs
node_compute_sw_sharedfile = None
node_compute_sw_state = Multi-user, run level 3
node_compute_sw_other = None

# [Interconnect]
interconnect_fs_syslbl = None
interconnect_fs_order = 0
interconnect_fs_purpose = N/A
interconnect_fs_hw_vendor = N/A
interconnect_fs_hw_model = N/A
interconnect_fs_hw_switch_fs_model = N/A
interconnect_fs_hw_switch_fs_count = 0
interconnect_fs_hw_switch_fs_ports = 0
interconnect_fs_hw_topo = N/A
interconnect_fs_hw_switch_fs_data_rate = 0
interconnect_fs_hw_switch_fs_firmware = 0

#######################################################################
# End of SUT section
# If this config file were to be applied to several SUTs, edits would
# be needed only ABOVE this point.
######################################################################
# ---- End inclusion of '/local/home/mcolgrove/HPC2021/config/snow.inc'

# [Software]
sw_compiler000 = C/C++/Fortran: Version 21.9 of
sw_compiler001 = NVIDIA HPC SDK for Linux
sw_mpi_library = OpenMPI Version 4.0.5, included with NVHPC SDK
sw_mpi_other = None
system_class = SMP
sw_other = None

# [General notes]

#######################################################################
# End of SUT section
######################################################################

label = nv_acc
tune = base,peak
output_format = text,html,pdf
use_submit_for_speed = 1
reportable = 1

# Compiler Settings
default:
CC = mpicc
CXX = mpicxx
FC = mpif90

# Compiler Version Flags
CC_VERSION_OPTION = -V
CXX_VERSION_OPTION = -V
FC_VERSION_OPTION = -V

MPIRUN_OPTS = --bind-to none
submit = mpirun ${MPIRUN_OPTS} -np $ranks $command

#######################################################################
# Optimization
#######################################################################
default:
pmodel = ACC

default=base=default:
ranks = 2
OPTIMIZE = -w -Mfprelaxed -Mnouniform -Mstack_arrays -fast -acc=gpu
CXXPORTABILITY = --c++17

505.lbm_t=peak=default:
basepeak = 1

513.soma_t=peak=default:
ranks = 2
OPTIMIZE = -w -fast -O3 -acc=gpu -gpu=pinned

518.tealeaf_t=peak=default:
ranks = 2
OPTIMIZE = -w -fast -Msafeptr -acc=gpu

519.clvleaf_t=peak=default:
ranks = 2
OPTIMIZE = -w -Mfprelaxed -fast -acc=gpu -gpu=pinned

521.miniswp_t=peak=default:
ranks = 2
OPTIMIZE = -w -Mfprelaxed -Mnouniform -Mstack_arrays -fast -acc=gpu -gpu=pinned

528.pot3d_t=peak=default:
ranks = 2
OPTIMIZE = -w -Mstack_arrays -fast -acc=gpu

532.sph_exa_t=peak=default:
ranks = 16
OPTIMIZE = -w -Mfprelaxed -Mnouniform -Mstack_arrays -fast -acc=gpu

534.hpgmgfv_t=peak=default:
ranks = 2
OPTIMIZE = -w -fast -acc=gpu -gpu=pinned -static-nvidia

535.weather_t=peak=default:
ranks = 2
OPTIMIZE = -w -Mfprelaxed -Mnouniform -Mstack_arrays -fast -acc=gpu

# The following section was added automatically, and contains settings that
# did not appear in the original configuration file, but were added to the
# raw file after the run.
default:
notes_submit_000 = MPI startup command:
notes_submit_005 =   mpirun command was used to start MPI jobs.
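# For illustration only (not part of the original run): because
# use_submit_for_speed = 1, the harness launches every benchmark through the
# submit line defined above, which expands to approximately
#
#   mpirun --bind-to none -np <ranks> <benchmark command>
#
# where <ranks> is 2 for base and for most peak runs (16 for the
# 532.sph_exa_t peak run), and <benchmark command> is the per-benchmark
# executable invocation generated by runhpc.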
notes_plat_000 =
notes_plat_005 = Information from nvaccelinfo
notes_plat_010 = CUDA Driver Version: 11020
notes_plat_015 = NVRM version: NVIDIA UNIX aarch64 Kernel Module 460.32.03
notes_plat_020 = Device Number: 0
notes_plat_025 = Device Name: A100-PCIE-40GB
notes_plat_030 = Device Revision Number: 8.0
notes_plat_035 = Global Memory Size: 42505273344
notes_plat_040 = Number of Multiprocessors: 108
notes_plat_045 = Concurrent Copy and Execution: Yes
notes_plat_050 = Total Constant Memory: 65536
notes_plat_055 = Total Shared Memory per Block: 49152
notes_plat_060 = Registers per Block: 65536
notes_plat_065 = Warp Size: 32
notes_plat_070 = Maximum Threads per Block: 1024
notes_plat_075 = Maximum Block Dimensions: 1024, 1024, 64
notes_plat_080 = Maximum Grid Dimensions: 2147483647 x 65535 x 65535
notes_plat_085 = Maximum Memory Pitch: 2147483647B
notes_plat_090 = Texture Alignment: 512B
notes_plat_095 = Clock Rate: 1410 MHz
notes_plat_100 = Execution Timeout: No
notes_plat_105 = Integrated Device: No
notes_plat_110 = Can Map Host Memory: Yes
notes_plat_115 = Compute Mode: default
notes_plat_120 = Concurrent Kernels: Yes
notes_plat_125 = ECC Enabled: Yes
notes_plat_130 = Memory Clock Rate: 1215 MHz
notes_plat_135 = Memory Bus Width: 5120 bits
notes_plat_140 = L2 Cache Size: 41943040 bytes
notes_plat_145 = Max Threads Per SMP: 2048
notes_plat_150 = Async Engines: 3
notes_plat_155 = Unified Addressing: Yes
notes_plat_160 = Managed Memory: Yes
notes_plat_165 = Concurrent Managed Memory: Yes
notes_plat_170 = Preemption Supported: Yes
notes_plat_175 = Cooperative Launch: Yes
notes_plat_180 = Multi-Device: Yes
notes_plat_185 = Default Target: cc80
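# For illustration only (not part of the original run): the platform notes
# above were captured with the NVHPC SDK's nvaccelinfo utility. On the SUT,
# the same per-device data can be regenerated, and the driver version and
# ECC state cross-checked against the NVIDIA driver's own query output, with
#
#   nvaccelinfo
#   nvidia-smi -q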