Welcome everyone to my constant memory bandwidth thread!
The purpose of this exercise is to compare the effective bandwidth of reads served by the constant memory cache against reads from global memory.
I’ve written a simple program that measures the bandwidth of transfers from the constant memory cache into shared memory, and a second one that measures the bandwidth of transfers from global memory into shared memory. In every test case I transfer ~5 KB of data, loop that transfer 10000 times, and run 1 block of 128 threads. (Shared memory is the destination in both tests, so it serves as my common reference.)
I assume that I’m hitting the constant memory cache because I do one initial (untimed) transfer from off-chip cmem into smem. My assumption is that this first pass also causes the GPU to load the data into the on-chip cmem cache, so the timed loop that follows is served from the cache. Is this assumption correct? (That’s why the timing starts only after the initial transfer.)
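One way I could check that assumption, I think, would be to also time the very first (cold) pass separately from the steady-state passes; if the cold pass is noticeably slower, that would suggest it is the one actually filling the cache. A rough, untested sketch of what I mean, reusing the variables from the cmem_test kernel in Program 1 below:
[codebox]// Untested sketch: goes at the top of cmem_test, reusing tid, s,
// num_elements, smem and cmem exactly as declared in Program 1.
int cold_start, cold_stop;
if ( tid == 0 )
    cold_start = clock();
// first (cold) pass over the constant array
for ( s = 0 ; s < num_elements ; s += blockDim.x )
    smem[tid + s] = cmem[tid + s];
__syncthreads();
if ( tid == 0 )
    cold_stop = clock();
// ...then run the timed loop exactly as in Program 1 and compare
// (cold_stop - cold_start) against the per-iteration average.
[/codebox]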
I ran 4 test cases and got the following bandwidths:
- cmem sequential access = 1.423931 GB/s
- cmem same addr access = 5.589218 GB/s
- gmem sequential access = 1.368713 GB/s
- gmem same addr access = 1.075658 GB/s
As expected, the same-address access for cmem is remarkably faster.
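(Quick sanity check on the arithmetic: for Test 1, 5825 clock cycles at 1.62 GHz is 5825 / 1.62e9 ≈ 3.596 us per iteration, and 5120 bytes / 3.596 us ≈ 1.42 GB/s, so the reported bandwidths are at least consistent with the cycle counts.)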
So here’s my question: I thought that access through the constant cache would be significantly faster than global memory, yet the sequential cmem case is barely ahead of the sequential gmem case. Is my approach correct?
thanks for the read!
(Below you’ll find the full output for the 4 test cases along with my 2 programs.)
Output
[codebox]Test 1
Your GPU clock rate = 1.620000 GHz
Cmem test used 10000 test iterations
Total time of 35.957091 ms (58250488 clk cycles)
Average time of 3.595679 us (5825 clk cycles)
Transferred data = 5120 bytes
---> Bandwidth = 1.423931 GB/s <---
Test 2
Your GPU clock rate = 1.620000 GHz
Cmem test used 10000 test iterations
Total time of 9.160802 ms (14840500 clk cycles)
Average time of 0.916049 us (1484 clk cycles)
Transferred data = 5120 bytes
---> Bandwidth = 5.589218 GB/s <---
Test 3
Your GPU clock rate = 1.620000 GHz
Gmem test used 10000 test iterations
Total time of 37.412941 ms (60608962 clk cycles)
Average time of 3.740741 us (6060 clk cycles)
Transferred data = 5120 bytes
---> Bandwidth = 1.368713 GB/s <---
Test 4
Your GPU clock rate = 1.620000 GHz
Gmem test used 10000 test iterations
Total time of 47.603682 ms (77117958 clk cycles)
Average time of 4.759876 us (7711 clk cycles)
Transferred data = 5120 bytes
---> Bandwidth = 1.075658 GB/s <---
[/codebox]
Program 1: Cmem Test
[codebox]#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime_api.h>

__constant__ int cmem[16000];

__global__ void cmem_test( int *output, int num_elements, int test_iterations )
{
    int start, stop, i, s;
    int tid = threadIdx.x;
    extern __shared__ int smem[];

    // INITIAL transfer cmem(off-chip) to smem
    for ( s = 0 ; s < num_elements ; s += blockDim.x )
        smem[tid + s] = cmem[tid + s];
    __syncthreads();

    if ( tid == 0 )
        start = clock();

    for ( i = 0 ; i < test_iterations ; i ++ )
    {
        // transfer cmem(cache) to smem
        for ( s = 0 ; s < num_elements ; s += blockDim.x )
            smem[tid + s] = cmem[tid + s]; // change to cmem[0] for same-address access
        __syncthreads();
    }
    __syncthreads();

    if ( tid == 0 )
        stop = clock();

    // transfer smem to gmem
    for ( s = 0 ; s < num_elements ; s += blockDim.x )
        output[tid + s] = smem[tid + s];
    __syncthreads();

    if ( tid == 0 )
        output[0] = stop - start;
}
int main ()
{
    int num_elements, data_size;
    int grid_size, block_size;
    int test_iterations, i;
    int total_clk_cycles, ave_clk_cycles;
    int *h_data, *h_output, *d_output;
    float bandwidth, gpu_clk_rate;
    float total_time, ave_time;

    // get GPU clk rate
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    gpu_clk_rate = deviceProp.clockRate * 1e-6; // clockRate is in kHz -> GHz
    printf("\nYour GPU clock rate = %f GHz\n", gpu_clk_rate);

    grid_size = 1;
    block_size = 128;
    test_iterations = 10000;
    num_elements = block_size*10; // 1280 ints (5.12 KB)
    data_size = num_elements*sizeof(int);

    // allocate host arrays
    h_data = (int*)malloc(data_size);
    h_output = (int*)malloc(data_size);

    // allocate device array
    cudaMalloc((void **) &d_output, data_size);

    // fill data
    for ( i = 0 ; i < num_elements ; i ++ )
        h_data[i] = i+1;

    // transfer data from cpu to gpu CONSTANT MEMORY
    cudaMemcpyToSymbol(cmem, h_data, data_size, 0);

    // kernel invocation
    cmem_test <<< grid_size, block_size, data_size >>> ( d_output, num_elements, test_iterations );

    // transfer output from gpu to cpu
    cudaMemcpy(h_output, d_output, data_size, cudaMemcpyDeviceToHost);

    // calculate bandwidth
    total_clk_cycles = h_output[0];
    ave_clk_cycles = total_clk_cycles / test_iterations;
    total_time = total_clk_cycles / gpu_clk_rate / 1e9; // (seconds)
    ave_time = ave_clk_cycles / gpu_clk_rate / 1e9; // (seconds)
    bandwidth = data_size / ave_time; // (bytes/second)

    // display results
    printf("\nCmem test used %d test iterations\n", test_iterations);
    printf("Total time of %f ms (%d clk cycles)\n", (float)total_time*1e3, total_clk_cycles);
    printf("Average time of %f us (%d clk cycles)\n", (float)ave_time*1e6, ave_clk_cycles);
    printf("Transferred data = %d bytes\n", data_size);
    printf("---> Bandwidth = %f GB/s <---\n\n", bandwidth*1e-9);

    // free memory
    free(h_data);
    free(h_output);
    cudaFree(d_output);

    return 0;
}
[/codebox]
Program 2: Gmem Test
[codebox]#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime_api.h>

__global__ void gmem_test( int *data, int *output, int num_elements, int test_iterations )
{
    int start, stop, i, s;
    int tid = threadIdx.x;
    extern __shared__ int smem[];

    if ( tid == 0 )
        start = clock();

    for ( i = 0 ; i < test_iterations ; i ++ )
    {
        // transfer gmem to smem
        for ( s = 0 ; s < num_elements ; s += blockDim.x )
            smem[tid + s] = data[tid + s]; // change to data[0] for same-address access
        __syncthreads();
    }
    __syncthreads();

    if ( tid == 0 )
        stop = clock();

    // transfer smem to gmem
    for ( s = 0 ; s < num_elements ; s += blockDim.x )
        output[tid + s] = smem[tid + s];
    __syncthreads();

    if ( tid == 0 )
        output[0] = stop - start;
}
int main ()
{
    int num_elements, data_size;
    int grid_size, block_size;
    int test_iterations, i;
    int total_clk_cycles, ave_clk_cycles;
    int *h_data, *d_data, *h_output, *d_output;
    float bandwidth, gpu_clk_rate;
    float total_time, ave_time;

    // get GPU clk rate
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    gpu_clk_rate = deviceProp.clockRate * 1e-6; // clockRate is in kHz -> GHz
    printf("\nYour GPU clock rate = %f GHz\n", gpu_clk_rate);

    grid_size = 1;
    block_size = 128;
    test_iterations = 10000;
    num_elements = block_size*10; // 1280 ints (5.12 KB)
    data_size = num_elements*sizeof(int);

    // allocate host arrays
    h_data = (int*)malloc(data_size);
    h_output = (int*)malloc(data_size);

    // allocate device arrays
    cudaMalloc((void **) &d_data, data_size);
    cudaMalloc((void **) &d_output, data_size);

    // fill data
    for ( i = 0 ; i < num_elements ; i ++ )
        h_data[i] = i+1;

    // transfer data from cpu to gpu GLOBAL MEMORY
    cudaMemcpy(d_data, h_data, data_size, cudaMemcpyHostToDevice);

    // kernel invocation
    gmem_test <<< grid_size, block_size, data_size >>> ( d_data, d_output, num_elements, test_iterations );

    // transfer output from gpu to cpu
    cudaMemcpy(h_output, d_output, data_size, cudaMemcpyDeviceToHost);

    // calculate bandwidth
    total_clk_cycles = h_output[0];
    ave_clk_cycles = total_clk_cycles / test_iterations;
    total_time = total_clk_cycles / gpu_clk_rate / 1e9; // (seconds)
    ave_time = ave_clk_cycles / gpu_clk_rate / 1e9; // (seconds)
    bandwidth = data_size / ave_time; // (bytes/second)

    // display results
    printf("\nGmem test used %d test iterations\n", test_iterations);
    printf("Total time of %f ms (%d clk cycles)\n", (float)total_time*1e3, total_clk_cycles);
    printf("Average time of %f us (%d clk cycles)\n", (float)ave_time*1e6, ave_clk_cycles);
    printf("Transferred data = %d bytes\n", data_size);
    printf("---> Bandwidth = %f GB/s <---\n\n", bandwidth*1e-9);

    // free memory
    free(h_data);
    free(h_output);
    cudaFree(d_data);
    cudaFree(d_output);

    return 0;
}[/codebox]
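In case it helps, here is a minimal host-side timing sketch using CUDA events instead of the in-kernel clock() counter, as a cross-check. It assumes the cmem_test kernel and the variables from Program 1 are already in scope, and the numbers won't match exactly because the event window also covers the initial untimed pass and the final smem-to-gmem writeback. I haven't posted results from this variant.
[codebox]// Cross-check sketch: time the whole kernel from the host with CUDA events.
// Assumes cmem_test and the Program 1 setup (grid_size, block_size, data_size,
// d_output, num_elements, test_iterations) are already in scope.
cudaEvent_t start_ev, stop_ev;
float elapsed_ms;

cudaEventCreate(&start_ev);
cudaEventCreate(&stop_ev);

cudaEventRecord(start_ev, 0);
cmem_test <<< grid_size, block_size, data_size >>> ( d_output, num_elements, test_iterations );
cudaEventRecord(stop_ev, 0);
cudaEventSynchronize(stop_ev); // wait until the kernel has finished
cudaEventElapsedTime(&elapsed_ms, start_ev, stop_ev); // milliseconds

// elapsed_ms covers all test_iterations, so average it before converting
float ave_time_s = (elapsed_ms * 1e-3f) / test_iterations; // seconds per iteration
float ev_bandwidth = data_size / ave_time_s; // bytes per second
printf("Event-timed bandwidth = %f GB/s\n", ev_bandwidth * 1e-9f);

cudaEventDestroy(start_ev);
cudaEventDestroy(stop_ev);
[/codebox]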