I am trying to check the performance gain of using constant memory. I wrote the following two kernels (one reading from global memory, one from constant memory) in Visual Studio 2010.
__global__ void test_GlobalMemorykernel(float* darray, float* dangle)
{
    // calculate each thread's global index
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    #pragma unroll 10
    for (int loop = 0; loop < 360; loop++)
    {
        float a = dangle[loop];
        float b = darray[index];
        darray[index] = a + b;
    }
}
// declare constant memory
__constant__ float cangle[360];

__global__ void test_ConstantMemorykernel(float* darray)
{
    // calculate each thread's global index
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    #pragma unroll 10
    for (int loop = 0; loop < 360; loop++)
    {
        float a = cangle[loop];
        float b = darray[index];
        darray[index] = a + b;
    }
}
int main(int argc, char** argv)
{
    int size = 3200;
    float* darray;
    float* dhangle;
    float hangle[360];
    int kernel = 2;   // 1 = global memory kernel, 2 = constant memory kernel
    // allocate and zero the device output array
    cudaMalloc((void**)&darray, sizeof(float)*size);
    cudaMemset(darray, 0, sizeof(float)*size);
    // initialize angle array on host: hangle[loop] = pi * loop / 180
    for (int loop = 0; loop < 360; loop++)
        hangle[loop] = acos(-1.0f) * loop / 180.0f;
    if (kernel == 1)  // global memory
    {
        // copy host angle data to global memory
        cudaMalloc((void**)&dhangle, sizeof(float)*360);
        cudaMemcpy(dhangle, hangle, sizeof(float)*360, cudaMemcpyHostToDevice);
        test_GlobalMemorykernel<<< size/128, 128 >>>(darray, dhangle);
        cudaFree(dhangle);
    }
    if (kernel == 2)  // constant memory
    {
        // copy host angle data to constant memory
        cudaMemcpyToSymbol(cangle, hangle, sizeof(float)*360);
        test_ConstantMemorykernel<<< size/128, 128 >>>(darray);
    }
    cudaFree(darray);
    return 0;
}
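(For brevity I omitted error checking above. A minimal checking pattern I could add, shown here as a self-contained sketch, would look like the following; CUDA_CHECK is just an illustrative helper name, not part of the CUDA API.)

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// illustrative helper macro (my own name, not a CUDA API macro)
#define CUDA_CHECK(call) do {                                  \
    cudaError_t err_ = (call);                                 \
    if (err_ != cudaSuccess) {                                 \
        fprintf(stderr, "CUDA error: %s at %s:%d\n",           \
                cudaGetErrorString(err_), __FILE__, __LINE__); \
        exit(EXIT_FAILURE);                                    \
    } } while (0)

__global__ void dummy(float* p) { p[threadIdx.x] += 1.0f; }

int main()
{
    float* d = 0;
    CUDA_CHECK(cudaMalloc((void**)&d, 32 * sizeof(float)));
    CUDA_CHECK(cudaMemset(d, 0, 32 * sizeof(float)));
    dummy<<<1, 32>>>(d);
    CUDA_CHECK(cudaGetLastError());       // catches launch errors
    CUDA_CHECK(cudaDeviceSynchronize());  // catches execution errors
    CUDA_CHECK(cudaFree(d));
    return 0;
}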
Nsight reported a duration of 259 microseconds for test_ConstantMemorykernel and 253 microseconds for test_GlobalMemorykernel. When I consulted the memory overview for load operations in both kernels, I found that 3.66 KB was transferred from device memory to the L2 cache in the constant memory case. The memory overview statistics reported by Nsight are given below.
kernel (test_GlobalMemorykernel) -> Global (72K requests) -> L1 cache 0% (8.79 MB) -> L2 cache 100% (5.49 MB) -> device memory (0 bytes)
kernel (test_ConstantMemorykernel) -> Global (72K requests) -> L1 cache 0% (8.79 MB) -> L2 cache 100% (5.49 MB) -> device memory (3.66 KB)
I think the 3.66 KB transfer is causing the extra time in the constant memory kernel, but I can't understand why there is no data transfer (device memory to L2 cache) in the case of the global memory kernel.
Can somebody explain why test_ConstantMemorykernel took longer to complete than test_GlobalMemorykernel, and why there is no data transfer from device memory to the L2 cache in the case of test_GlobalMemorykernel? My GPU card is a GeForce GT 640. Thanks
The difference between 259 us and 253 us is quite small (about 2%). That much variation can arise from run to run on a GPU, especially in a Windows environment. I'm not sure much can be inferred from it.
In some cases it may be that Windows is impacting the results somehow. In your cross-posting here:
http://stackoverflow.com/questions/33446028/global-vs-constant-memory-performance-issue
(which is slightly different in that you seem to be launching 64 threads per block instead of 128) there is a comment with a link to the code being run on Linux (on a GT 640) instead of Windows. On Linux the behavior is as expected: the constant memory kernel is not slower (in fact it is substantially faster, 22.7 us vs. 131 us in the traces below). That is the expected outcome, because in any given loop iteration every thread reads the same cangle[loop] element, which is exactly the uniform access pattern the constant cache can broadcast efficiently.
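If you want more stable numbers, one option is to average over many launches using CUDA event timing. Here's a minimal sketch of that idea; the warm-up launch and the repetition count of 100 are arbitrary choices, and the fragment would replace the single kernel launch inside main():

// warm-up launch; the first launch often carries one-time overhead
test_ConstantMemorykernel<<< size/64, 64 >>>(darray);
cudaDeviceSynchronize();

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

const int reps = 100;
cudaEventRecord(start);
for (int i = 0; i < reps; i++)
    test_ConstantMemorykernel<<< size/64, 64 >>>(darray);
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);  // elapsed time in milliseconds
// requires #include <cstdio> for printf
printf("average kernel duration: %f us\n", (ms * 1000.0f) / reps);

cudaEventDestroy(start);
cudaEventDestroy(stop);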
Here's the full Linux test case:
$ cat t962.cu
__global__ void test_GlobalMemorykernel(float* darray, float* cangle)
{
    int index;
    //calculate each thread global index
    index = blockIdx.x * blockDim.x + threadIdx.x;
    #pragma unroll 10
    for (int loop = 0; loop < 360; loop++)
        darray[index] = darray[index] + cangle[loop];
    return;
}

//declare constant memory
__constant__ float cangle[360];

__global__ void test_ConstantMemorykernel(float* darray)
{
    int index;
    //calculate each thread global index
    index = blockIdx.x * blockDim.x + threadIdx.x;
    #pragma unroll 10
    for (int loop = 0; loop < 360; loop++)
        darray[index] = darray[index] + cangle[loop];
    return;
}
int main(int argc, char** argv)
{
    int size = 3200;
    float* darray;
    float* dhangle;
    float hangle[360];
    int kernel = 2;
    if (argc > 1) kernel = atoi(argv[1]);
    if (kernel > 2) kernel = 2;
    if (kernel < 1) kernel = 1;
    //initialize angle array on host
    for (int loop = 0; loop < 360; loop++)
        hangle[loop] = acos(-1.0f) * loop / 180.0f;
    if (kernel == 1) //global memory
    {
        //allocate device memory
        cudaMalloc((void**)&darray, sizeof(float)*size);
        //initialize allocated memory
        cudaMemset(darray, 0, sizeof(float)*size);
        cudaMalloc((void**)&dhangle, sizeof(float)*360);
        //copy host angle data to global memory
        cudaMemcpy(dhangle, hangle, sizeof(float)*360, cudaMemcpyHostToDevice);
        test_GlobalMemorykernel<<< size/64, 64 >>>(darray, dhangle);
        //free device memory
        cudaFree(darray);
        cudaFree(dhangle);
    }
    if (kernel == 2)
    {
        //allocate device memory
        cudaMalloc((void**)&darray, sizeof(float)*size);
        //initialize allocated memory
        cudaMemset(darray, 0, sizeof(float)*size);
        //copy host angle data to constant memory
        cudaMemcpyToSymbol(cangle, hangle, sizeof(float)*360);
        test_ConstantMemorykernel<<< size/64, 64 >>>(darray);
        //free device memory
        cudaFree(darray);
    }
    return 0;
}
$ nvcc -arch=sm_35 -o t962 t962.cu
$ nvprof --print-gpu-trace ./t962 1
==1778== NVPROF is profiling process 1778, command: ./t962 1
==1778== Profiling application: ./t962 1
==1778== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput Device Context Stream Name
825.12ms 3.9680us - - - - - 12.500KB 3.0043GB/s GeForce GT 640 1 7 [CUDA memset]
825.26ms 1.2160us - - - - - 1.4063KB 1.1029GB/s GeForce GT 640 1 7 [CUDA memcpy HtoD]
825.29ms 131.04us (50 1 1) (64 1 1) 8 0B 0B - - GeForce GT 640 1 7 test_GlobalMemorykernel(float*, float*) [97]
Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
$ nvprof --print-gpu-trace ./t962 2
==1788== NVPROF is profiling process 1788, command: ./t962 2
==1788== Profiling application: ./t962 2
==1788== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput Device Context Stream Name
825.32ms 4.0640us - - - - - 12.500KB 2.9333GB/s GeForce GT 640 1 7 [CUDA memset]
825.34ms 1.1840us - - - - - 1.4063KB 1.1327GB/s GeForce GT 640 1 7 [CUDA memcpy HtoD]
825.37ms 22.752us (50 1 1) (64 1 1) 14 0B 0B - - GeForce GT 640 1 7 test_ConstantMemorykernel(float*) [95]
Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
$
Thanks @txbob. Let me check it on a Linux machine.