// Shared-memory to shared-memory copy bandwidth micro-benchmark.
//
// A single block of BLOCK_SIZE threads repeatedly copies NUM_ELEMENTS ints
// from one half of the dynamic shared-memory buffer to the other half, and
// the elapsed SM clock cycles are reported back to the host.

// NOTE(review): the original header names were lost (the source read
// "#include #include"); these two are what the code below requires
// (printf/fprintf, malloc/free/exit). The CUDA runtime header is pulled in
// implicitly by nvcc for .cu files.
#include <stdio.h>
#include <stdlib.h>

#define BLOCK_SIZE 256
// Parenthesized so the macro expands safely inside larger expressions.
#define NUM_ELEMENTS (BLOCK_SIZE * 7)

// Abort with a readable message if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

extern __shared__ int smem[];

// Times `test_iterations` shared-to-shared copies of NUM_ELEMENTS ints.
//
// Launch requirements:
//   - one block of exactly BLOCK_SIZE threads (the manual unroll below
//     indexes with tid + k * BLOCK_SIZE);
//   - at least 2 * NUM_ELEMENTS * sizeof(int) bytes of dynamic shared memory.
//
// output[0] receives the elapsed clock() cycles measured by thread 0.
__global__ void A2_kernel(int *output, int test_iterations)
{
    unsigned int start, stop;
    int i;
    int tid = threadIdx.x;

    // volatile keeps the compiler from eliding the repeated copies.
    volatile int *smem1 = (int *)smem;
    volatile int *smem2 = (int *)&smem1[NUM_ELEMENTS];

    // Give the source region defined contents before it is read; this is
    // done outside the timed section.
    for (i = tid; i < NUM_ELEMENTS; i += blockDim.x) {
        smem1[i] = i;
    }
    __syncthreads();

    // Unsigned arithmetic makes the stop - start difference well defined
    // even if the counter wraps between the two reads.
    start = (unsigned int)clock();
    __syncthreads();

#pragma unroll 75
    for (i = 0; i < test_iterations; i++) {
        // transfer smem to smem (manual unroll: 7 x BLOCK_SIZE elements)
        smem2[tid]                  = smem1[tid];
        smem2[tid + 1 * BLOCK_SIZE] = smem1[tid + 1 * BLOCK_SIZE];
        smem2[tid + 2 * BLOCK_SIZE] = smem1[tid + 2 * BLOCK_SIZE];
        smem2[tid + 3 * BLOCK_SIZE] = smem1[tid + 3 * BLOCK_SIZE];
        smem2[tid + 4 * BLOCK_SIZE] = smem1[tid + 4 * BLOCK_SIZE];
        smem2[tid + 5 * BLOCK_SIZE] = smem1[tid + 5 * BLOCK_SIZE];
        smem2[tid + 6 * BLOCK_SIZE] = smem1[tid + 6 * BLOCK_SIZE];
        __syncthreads();
    }

    stop = (unsigned int)clock();
    __syncthreads();

    // A single writer: previously every thread stored its own measurement
    // to output[0], so the reported value depended on which store landed last.
    if (tid == 0) {
        output[0] = (int)(stop - start);
    }
}

// Runs the benchmark on device 0 and prints cycle counts and bandwidth.
// Returns 0 on success; exits with EXIT_FAILURE on any CUDA or allocation
// error.
int main()
{
    int num_elements, data_size;
    int grid_size, block_size;
    int test_iterations;
    int total_clk_cycles, ave_clk_cycles;
    int *h_output, *d_output;
    float bandwidth, bandwidth2, gpu_clk_rate;
    float total_time, ave_time;
    float ave_clk_cycles_exact;

    // get GPU clk rate (the value is scaled by 1e-6 and reported as GHz)
    cudaDeviceProp deviceProp;
    CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, 0));
    gpu_clk_rate = deviceProp.clockRate * 1e-6;
    printf("\nYour GPU clock rate = %f GHz\n", gpu_clk_rate);

    grid_size = 1;
    block_size = BLOCK_SIZE;
    test_iterations = 100000;
    num_elements = NUM_ELEMENTS;
    data_size = num_elements * sizeof(int);

    // allocate host array (only element 0 is written by the kernel)
    h_output = (int *)malloc(data_size);
    if (h_output == NULL) {
        fprintf(stderr, "Failed to allocate %d bytes of host memory\n", data_size);
        return EXIT_FAILURE;
    }

    // allocate CUDA array
    CUDA_CHECK(cudaMalloc((void **)&d_output, data_size));

    // kernel invocation: the dynamic shared memory holds the source and
    // destination regions, hence data_size * 2
    A2_kernel<<<grid_size, block_size, data_size * 2>>>(d_output, test_iterations);
    CUDA_CHECK(cudaGetLastError());

    // transfer output from gpu to cpu (blocking copy; also surfaces any
    // asynchronous kernel execution error)
    CUDA_CHECK(cudaMemcpy(h_output, d_output, data_size, cudaMemcpyDeviceToHost));

    // calculate bandwidth
    total_clk_cycles = h_output[0];
    // Truncated average, kept only for the "%d clk cycles" display.
    ave_clk_cycles = total_clk_cycles / test_iterations;
    // Untruncated average used for the time and bandwidth figures, so that
    // the fractional cycles per iteration are not discarded.
    ave_clk_cycles_exact = (float)total_clk_cycles / test_iterations;
    total_time = total_clk_cycles / gpu_clk_rate / 1e9;   // (seconds)
    ave_time = ave_clk_cycles_exact / gpu_clk_rate / 1e9; // (seconds)
    bandwidth = data_size / ave_time;                     // (byte/second)
    bandwidth2 = float(data_size) / ave_clk_cycles_exact; // (byte/clock)

    // display results
    printf("\nSmem test used %d test iterations\n", test_iterations);
    printf("Total time of %f ms (%d clk cycles)\n", (float)total_time*1e3, total_clk_cycles);
    printf("Average time of %f us (%d clk cycles)\n", (float)ave_time*1e6, ave_clk_cycles);
    printf("Transfered data = %d bytes\n\n", data_size);
    // NOTE(review): 16 is the hard-coded theoretical bytes/clock figure from
    // the original code; confirm it matches the target GPU.
    printf("%f Bytes/clock (%f%% of theoretical)\n", bandwidth2, bandwidth2*100/16);
    printf("---> Bandwidth = %f GB/s <---\n\n", bandwidth*1e-9);

    // free memory
    free(h_output);
    CUDA_CHECK(cudaFree(d_output));

    return 0;
}