Constant Memory Bandwidth Program

Welcome everyone to my constant memory bandwidth thread!

The purpose of this exercise is to compare the effective read bandwidth of the constant memory cache vs. global memory.

I’ve created a simple program to measure the memory bandwidth from the constant memory cache to shared memory, and a second one that measures the bandwidth from global memory to shared memory. In every test case I transfer ~5 KB of data, loop that transfer 10000 times, and run 1 block of 128 threads. (Note that shared memory is the destination in both programs, so it serves as my common reference.)

I assume that I’m actually reading from the constant memory cache because I do one initial, untimed transfer from off-chip cmem to smem. My assumption is that this first pass also causes the GPU to load the data into the on-chip cmem cache, so every timed pass afterwards is served from the cache. Is this assumption correct? (That’s why the timing starts only after this initial transfer.)

I ran 4 test cases & got the following bandwidths:

  1. cmem sequential access = 1.423931 GB/s

  2. cmem same addr access = 5.589218 GB/s

  3. gmem sequential access = 1.368713 GB/s

  4. gmem same addr access = 1.075658 GB/s

As expected, the same-address access for cmem is remarkably better.
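For reference, here is the arithmetic I’m using, recomputed on the host for Test 2 (the constants below are simply copied from the output further down; nothing in this snippet is measured):

[codebox]#include <stdio.h>

int main( void )
{
	// Numbers copied from Test 2 (cmem, same-address access)
	double data_size   = 5120.0;   // bytes moved per timed pass
	double ave_cycles  = 1484.0;   // average clock cycles per pass
	double gpu_clk_ghz = 1.62;     // shader clock in GHz

	// time per pass (seconds) = cycles / (cycles per second)
	double ave_time = ave_cycles / ( gpu_clk_ghz * 1e9 );

	// bandwidth = bytes per pass / time per pass
	double bandwidth = data_size / ave_time;

	printf("Average time = %f us\n",   ave_time  * 1e6);  // ~0.916 us
	printf("Bandwidth    = %f GB/s\n", bandwidth * 1e-9); // ~5.589 GB/s
	return 0;
}[/codebox]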

So here’s my question: I thought that a cached memory access would be significantly faster than global memory, yet my sequential-access numbers (1.42 GB/s vs. 1.37 GB/s) are nearly identical. Is my approach correct?
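In case the in-kernel clock() timing itself is suspect, I’m also thinking of cross-checking the numbers by timing the whole kernel from the host with CUDA events. This is only a rough sketch of what I mean, intended to drop into the existing main() of the cmem test in place of the plain kernel invocation (the variable names are the ones already declared there):

[codebox]// Cross-check sketch: time the same launch with CUDA events instead of clock()
cudaEvent_t ev_start, ev_stop;
float elapsed_ms;

cudaEventCreate(&ev_start);
cudaEventCreate(&ev_stop);

cudaEventRecord(ev_start, 0);
cmem_test <<< grid_size, block_size, data_size >>> ( d_output, num_elements, test_iterations );
cudaEventRecord(ev_stop, 0);
cudaEventSynchronize(ev_stop);

cudaEventElapsedTime(&elapsed_ms, ev_start, ev_stop);

// total bytes moved by the timed loop = data_size bytes per pass * test_iterations passes
float host_bandwidth = (float)data_size * test_iterations / (elapsed_ms * 1e-3f);
printf("Host-timed bandwidth = %f GB/s\n", host_bandwidth * 1e-9f);

cudaEventDestroy(ev_start);
cudaEventDestroy(ev_stop);[/codebox]

The host timing would also include the untimed warm-up pass and the final smem-to-gmem writeback, so I’d expect it to read a little lower, but it should catch any gross error in the clock()-based numbers.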

thanks for the read!

(below you’ll find the full output for the 4 test cases along w/ my 2 programs)

Output

[codebox]Test 1

Your GPU clock rate = 1.620000 GHz

Cmem test used 10000 test iterations

Total time of 35.957091 ms (58250488 clk cycles)

Average time of 3.595679 us (5825 clk cycles)

Transferred data = 5120 bytes

---> Bandwidth = 1.423931 GB/s <---

Test 2

Your GPU clock rate = 1.620000 GHz

Cmem test used 10000 test iterations

Total time of 9.160802 ms (14840500 clk cycles)

Average time of 0.916049 us (1484 clk cycles)

Transferred data = 5120 bytes

---> Bandwidth = 5.589218 GB/s <---

Test 3

Your GPU clock rate = 1.620000 GHz

Gmem test used 10000 test iterations

Total time of 37.412941 ms (60608962 clk cycles)

Average time of 3.740741 us (6060 clk cycles)

Transferred data = 5120 bytes

---> Bandwidth = 1.368713 GB/s <---

Test 4

Your GPU clock rate = 1.620000 GHz

Gmem test used 10000 test iterations

Total time of 47.603682 ms (77117958 clk cycles)

Average time of 4.759876 us (7711 clk cycles)

Transferred data = 5120 bytes

---> Bandwidth = 1.075658 GB/s <---

[/codebox]

Program 1: Cmem Test

[codebox]#include <stdio.h>

#include <cuda.h>

#include <cuda_runtime_api.h>

__constant__ int cmem[16000];

__global__ void cmem_test( int *output, int num_elements, int test_iterations )

{

int start, stop, i, s;

int tid = threadIdx.x;



extern __shared__ int smem[];



	

// INITIAL transfer cmem(off-chip) to smem

for ( s = 0 ; s < num_elements ; s += blockDim.x )

		smem[tid + s] = cmem[tid + s];

__syncthreads();



if ( tid == 0 )

	start = clock();





for ( i = 0 ; i < test_iterations ; i ++ )

{

	// transfer cmem(cache) to smem

	for ( s = 0 ; s < num_elements ; s += blockDim.x )

		smem[tid + s] = cmem[tid + s]; // change to cmem[0] for the same-address-access test

	__syncthreads();

}	

__syncthreads();





if  (tid == 0 )

	stop = clock();

	

// transfer smem to gmem

for ( s = 0 ; s < num_elements ; s += blockDim.x )

	output[tid + s] = smem[tid + s];

__syncthreads();

if ( tid == 0  )

	output[0] = stop - start;

}

int main ()

{

int num_elements, data_size;

int grid_size, block_size;

int test_iterations, i;

int total_clk_cycles, ave_clk_cycles;

int *h_data, *h_output, *d_output;	



float bandwidth, gpu_clk_rate;

float total_time, ave_time;





// get GPU clk rate

cudaDeviceProp deviceProp;

cudaGetDeviceProperties(&deviceProp, 0);

gpu_clk_rate = deviceProp.clockRate * 1e-6;

printf("\nYour GPU clock rate = %f GHz\n", gpu_clk_rate);

grid_size  = 1;

block_size = 128;

test_iterations = 10000;

num_elements = block_size*10; // 1280 ints (5.12 KB)

data_size    = num_elements*sizeof(int);

// allocate arrays

h_data   = (int*)malloc(data_size);

h_output = (int*)malloc(data_size);



// allocate CUDA arrays

cudaMalloc((void **) &d_output, data_size);



	

// fill data

for ( i = 0 ; i < num_elements ; i ++ )

	h_data[i] = i+1;



// transfer data from cpu to gpu CONSTANT MEMORY

cudaMemcpyToSymbol(cmem, h_data, data_size, 0);





// kernel invocation

cmem_test <<< grid_size, block_size, data_size >>> ( d_output, num_elements, test_iterations );	



// transfer output from gpu to cpu

cudaMemcpy(h_output,  d_output, data_size, cudaMemcpyDeviceToHost);





// calculate bandwidth

total_clk_cycles = h_output[0];

ave_clk_cycles   = total_clk_cycles / test_iterations;



total_time = total_clk_cycles / gpu_clk_rate / 1e9; // (seconds)

ave_time   = ave_clk_cycles   / gpu_clk_rate / 1e9; // (seconds)



bandwidth  = data_size / ave_time; // (byte/second)  





// display results

printf("\nCmem test used %d test iterations\n", test_iterations);

printf("Total time of %f ms (%d clk cycles)\n", (float)total_time*1e3, total_clk_cycles);

printf("Average time of %f us (%d clk cycles)\n", (float)ave_time*1e6, ave_clk_cycles);

printf("Transfered data = %d bytes\n", data_size);

printf("---> Bandwidth = %f GB/s <---\n\n", bandwidth*1e-9);

	

// free memory

free(h_data);

free(h_output);

cudaFree(d_output);

}

[/codebox]

Program 2: Gmem Test

[codebox]#include <stdio.h>

#include <cuda.h>

#include <cuda_runtime_api.h>

__global__ void gmem_test( int *data, int *output, int num_elements, int test_iterations )

{

int start, stop, i, s;

int tid = threadIdx.x;



extern __shared__ int smem[];



if ( tid == 0 )

	start = clock();





for ( i = 0 ; i < test_iterations ; i ++ )

{

	// transfer gmem to smem

	for ( s = 0 ; s < num_elements ; s += blockDim.x )

		smem[tid + s] = data[tid + s]; // change to data[0] for the same-address-access test

	__syncthreads();

}	

__syncthreads();





if  (tid == 0 )

	stop = clock();

	

// transfer smem to gmem

for ( s = 0 ; s < num_elements ; s += blockDim.x )

	output[tid + s] = smem[tid + s];

__syncthreads();

if ( tid == 0  )

	output[0] = stop - start;

}

int main ()

{

int num_elements, data_size;

int grid_size, block_size;

int test_iterations, i;

int total_clk_cycles, ave_clk_cycles;

int *h_data, *d_data, *h_output, *d_output;	



float bandwidth, gpu_clk_rate;

float total_time, ave_time;





// get GPU clk rate

cudaDeviceProp deviceProp;

cudaGetDeviceProperties(&deviceProp, 0);

gpu_clk_rate = deviceProp.clockRate * 1e-6;

printf("\nYour GPU clock rate = %f GHz\n", gpu_clk_rate);

grid_size  = 1;

block_size = 128;

test_iterations = 10000;

num_elements = block_size*10; // 1280 ints (5.12 KB)

data_size    = num_elements*sizeof(int);

// allocate arrays

h_data   = (int*)malloc(data_size);

h_output = (int*)malloc(data_size);



// allocate CUDA arrays

cudaMalloc((void **) &d_data,   data_size);

cudaMalloc((void **) &d_output, data_size);



	

// fill data

for ( i = 0 ; i < num_elements ; i ++ )

	h_data[i] = i+1;



// transfer data from cpu to gpu

cudaMemcpy(d_data, h_data, data_size, cudaMemcpyHostToDevice);





// kernel invocation

gmem_test <<< grid_size, block_size, data_size >>> ( d_data, d_output, num_elements, test_iterations );	



// transfer output from gpu to cpu

cudaMemcpy(h_output, d_output, data_size, cudaMemcpyDeviceToHost);





// calculate bandwidth

total_clk_cycles = h_output[0];

ave_clk_cycles   = total_clk_cycles / test_iterations;



total_time = total_clk_cycles / gpu_clk_rate / 1e9; // (seconds)

ave_time   = ave_clk_cycles   / gpu_clk_rate / 1e9; // (seconds)



bandwidth  = data_size / ave_time; // (byte/second)  





// display results

printf("\nGmem test used %d test iterations\n", test_iterations);

printf("Total time of %f ms (%d clk cycles)\n", (float)total_time*1e3, total_clk_cycles);

printf("Average time of %f us (%d clk cycles)\n", (float)ave_time*1e6, ave_clk_cycles);

printf("Transfered data = %d bytes\n", data_size);

printf("---> Bandwidth = %f GB/s <---\n\n", bandwidth*1e-9);

	

// free memory

free(h_data);

free(h_output);

cudaFree(d_data);

cudaFree(d_output);

}[/codebox]
