55
66import numpy as np
77from common import common
8- from common .helper_cuda import checkCudaErrors , findCudaDevice
8+ from common .helper_cuda import check_cuda_errors , find_cuda_device
99
1010from cuda .bindings import driver as cuda
1111
5050}
5151"""
5252
53- NUM_BLOCKS = 64
54- NUM_THREADS = 256
53+ num_blocks = 64
54+ num_threads = 256
5555
5656
5757def elems_to_bytes (nelems , dt ):
@@ -64,52 +64,52 @@ def main():
6464 if platform .machine () == "armv7l" :
6565 pytest .skip ("clock_nvrtc is not supported on ARMv7" )
6666
67- timer = np .empty (NUM_BLOCKS * 2 , dtype = "int64" )
68- hinput = np .empty (NUM_THREADS * 2 , dtype = "float32" )
67+ timer = np .empty (num_blocks * 2 , dtype = "int64" )
68+ hinput = np .empty (num_threads * 2 , dtype = "float32" )
6969
70- for i in range (NUM_THREADS * 2 ):
70+ for i in range (num_threads * 2 ):
7171 hinput [i ] = i
7272
73- devID = findCudaDevice ()
74- with common .KernelHelper (clock_nvrtc , devID ) as kernelHelper :
75- kernel_addr = kernelHelper . getFunction (b"timedReduction" )
76-
77- dinput = checkCudaErrors (cuda .cuMemAlloc (hinput .nbytes ))
78- doutput = checkCudaErrors (cuda .cuMemAlloc (elems_to_bytes (NUM_BLOCKS , np .float32 )))
79- dtimer = checkCudaErrors (cuda .cuMemAlloc (timer .nbytes ))
80- checkCudaErrors (cuda .cuMemcpyHtoD (dinput , hinput , hinput .nbytes ))
81-
82- args = ((dinput , doutput , dtimer ), (None , None , None ))
83- shared_memory_nbytes = elems_to_bytes (2 * NUM_THREADS , np .float32 )
84-
85- grid_dims = (NUM_BLOCKS , 1 , 1 )
86- block_dims = (NUM_THREADS , 1 , 1 )
87-
88- checkCudaErrors (
89- cuda .cuLaunchKernel (
90- kernel_addr ,
91- * grid_dims , # grid dim
92- * block_dims , # block dim
93- shared_memory_nbytes ,
94- 0 , # shared mem, stream
95- args ,
96- 0 ,
97- )
98- ) # arguments
99-
100- checkCudaErrors (cuda .cuCtxSynchronize ())
101- checkCudaErrors (cuda .cuMemcpyDtoH (timer , dtimer , timer .nbytes ))
102- checkCudaErrors (cuda .cuMemFree (dinput ))
103- checkCudaErrors (cuda .cuMemFree (doutput ))
104- checkCudaErrors (cuda .cuMemFree (dtimer ))
105-
106- avgElapsedClocks = 0.0
107-
108- for i in range (NUM_BLOCKS ):
109- avgElapsedClocks += timer [i + NUM_BLOCKS ] - timer [i ]
110-
111- avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS
112- print (f"Average clocks/block = { avgElapsedClocks } " )
73+ dev_id = find_cuda_device ()
74+ kernel_helper = common .KernelHelper (clock_nvrtc , dev_id )
75+ kernel_addr = kernel_helper . get_function (b"timedReduction" )
76+
77+ dinput = check_cuda_errors (cuda .cuMemAlloc (hinput .nbytes ))
78+ doutput = check_cuda_errors (cuda .cuMemAlloc (elems_to_bytes (num_blocks , np .float32 )))
79+ dtimer = check_cuda_errors (cuda .cuMemAlloc (timer .nbytes ))
80+ check_cuda_errors (cuda .cuMemcpyHtoD (dinput , hinput , hinput .nbytes ))
81+
82+ args = ((dinput , doutput , dtimer ), (None , None , None ))
83+ shared_memory_nbytes = elems_to_bytes (2 * num_threads , np .float32 )
84+
85+ grid_dims = (num_blocks , 1 , 1 )
86+ block_dims = (num_threads , 1 , 1 )
87+
88+ check_cuda_errors (
89+ cuda .cuLaunchKernel (
90+ kernel_addr ,
91+ * grid_dims , # grid dim
92+ * block_dims , # block dim
93+ shared_memory_nbytes ,
94+ 0 , # shared mem, stream
95+ args ,
96+ 0 ,
97+ )
98+ ) # arguments
99+
100+ check_cuda_errors (cuda .cuCtxSynchronize ())
101+ check_cuda_errors (cuda .cuMemcpyDtoH (timer , dtimer , timer .nbytes ))
102+ check_cuda_errors (cuda .cuMemFree (dinput ))
103+ check_cuda_errors (cuda .cuMemFree (doutput ))
104+ check_cuda_errors (cuda .cuMemFree (dtimer ))
105+
106+ avg_elapsed_clocks = 0.0
107+
108+ for i in range (num_blocks ):
109+ avg_elapsed_clocks += timer [i + num_blocks ] - timer [i ]
110+
111+ avg_elapsed_clocks = avg_elapsed_clocks / num_blocks
112+ print (f"Average clocks/block = { avg_elapsed_clocks } " )
113113
114114
115115if __name__ == "__main__" :
0 commit comments