Constant vs Global Memory Performance

I am trying to measure the performance gain of using constant memory. I wrote the following two kernels (one for global memory and one for constant memory) in Visual Studio 2010.

__global__ void test_GlobalMemorykernel(float* darray, float* dangle)
{
    // calculate each thread's global index
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    #pragma unroll 10
    for (int loop = 0; loop < 360; loop++)
    {
        float a = dangle[loop];    // angle value read from global memory
        float b = darray[index];
        darray[index] = a + b;
    }
    return;
}


__constant__ float cangle[360];
__global__ void test_ConstantMemorykernel(float* darray)
{
    // calculate each thread's global index
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    #pragma unroll 10
    for (int loop = 0; loop < 360; loop++)
    {
        float a = cangle[loop];    // angle value read from constant memory
        float b = darray[index];
        darray[index] = a + b;
    }
    return;
}

int main(int argc, char** argv)
{
    int size = 3200;
    float* darray;
    float* dhangle;
    float hangle[360];
    int kernel = 2;

    // allocate and zero the output array on the device
    cudaMalloc((void**)&darray, sizeof(float) * size);
    cudaMemset(darray, 0, sizeof(float) * size);

    // initialize angle array on the host (degrees converted to radians)
    for (int loop = 0; loop < 360; loop++)
        hangle[loop] = acos(-1.0f) * loop / 180.0f;

    if (kernel == 1) // global memory
    {
        cudaMalloc((void**)&dhangle, sizeof(float) * 360);
        cudaMemcpy(dhangle, hangle, sizeof(float) * 360, cudaMemcpyHostToDevice);
        test_GlobalMemorykernel<<<size / 128, 128>>>(darray, dhangle);
        cudaFree(dhangle);
    }
    if (kernel == 2) // constant memory
    {
        cudaMemcpyToSymbol(cangle, hangle, sizeof(float) * 360);
        test_ConstantMemorykernel<<<size / 128, 128>>>(darray);
    }
    cudaFree(darray);
    return 0;
}

Nsight reported a duration of 259 microseconds for test_ConstantMemorykernel and 253 microseconds for test_GlobalMemorykernel. When I consulted the memory overview for load operations in both kernels, I found that 3.66 KB is transferred from device memory to the L2 cache. The memory overview statistics reported by Nsight are given below.

kernel (test_GlobalMemorykernel) -> Global (72K req) -> L1 cache 0% (8.79 MB) -> L2 cache 100% (5.49 MB) -> Device (0 bytes)

kernel (test_ConstantMemorykernel) -> Global (72K req) -> L1 cache 0% (8.79 MB) -> L2 cache 100% (5.49 MB) -> Device (3.66 KB)

I think the 3.66 KB transfer is causing the extra time in the constant memory kernel, but I can't understand why there is no data transfer (device memory to L2 cache) in the case of the global memory kernel.

Can somebody explain why test_ConstantMemorykernel took longer to complete than test_GlobalMemorykernel, and why there is no data transfer from device memory to the L2 cache in the case of test_GlobalMemorykernel? My GPU is a GeForce GT 640. Thanks.

The difference between 259 us and 253 us is quite small (about 2%). Such variation can arise from run to run on a GPU, especially in a Windows environment. I'm not sure much can be inferred from it.
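One way to reduce the influence of that kind of run-to-run noise is to time many repeated launches with CUDA events and average the result. Here is a minimal sketch (not part of the original programs) that assumes darray has already been allocated and initialized and cangle has been filled with cudaMemcpyToSymbol, exactly as in the code above, and that reuses the 128-thread block configuration from the question:

#include <cuda_runtime.h>

// returns the average time per launch, in milliseconds
float timeConstantKernel(float* darray, int size, int reps)
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // warm-up launch so one-time startup cost is not measured
    test_ConstantMemorykernel<<<size / 128, 128>>>(darray);
    cudaDeviceSynchronize();

    cudaEventRecord(start);
    for (int i = 0; i < reps; i++)
        test_ConstantMemorykernel<<<size / 128, 128>>>(darray);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms / reps;
}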

In some cases it may be that Windows is impacting the results somehow. In your cross-posting here:

http://stackoverflow.com/questions/33446028/global-vs-constant-memory-performance-issue

(which is slightly different in that you seem to be launching 64 threads per block instead of 128) there is a comment with a link to the code being run on Linux (on a GT 640) instead of Windows. On Linux the behavior is as expected: the constant memory kernel is not slower.

Here’s the full Linux test case:

$ cat t962.cu
__global__ void test_GlobalMemorykernel(float* darray,float *cangle)
{
 int index;
 //calculate each thread global index
 index = blockIdx.x * blockDim.x + threadIdx.x;
 #pragma unroll 10
 for(int loop=0;loop<360;loop++)
      darray[index]= darray [index] + cangle [loop] ;
 return;
}
 
 
//declare constant memory
__constant__ float cangle[360];
__global__ void test_ConstantMemorykernel(float* darray)
{
int index;
//calculate each thread global index
index = blockIdx.x * blockDim.x + threadIdx.x;
#pragma unroll 10
for(int loop=0;loop<360;loop++)
      darray[index]= darray [index] + cangle [loop] ;
return;
}
 
 
int main(int argc,char** argv)
{
     int size=3200;
     float* darray;
     float *dhangle;
     float hangle[360];
     int kernel=2;
     if (argc > 1) (kernel = atoi(argv[1]));
     if (kernel > 2) kernel = 2;
     if (kernel < 1) kernel = 1;
 
     //initialize angle array on host
    for(int loop=0;loop<360;loop++)
       hangle[loop] = acos( -1.0f )* loop/ 180.0f;
 
    if (kernel==1) //global memory
    {
 
        //allocate device memory
        cudaMalloc ((void**)&darray,sizeof(float)*size);
 
        //initialize allocated memory
        cudaMemset (darray,0,sizeof(float)*size);
 
        cudaMalloc ((void**)&dhangle,sizeof(float)*360);
 
        //copy host angle data to global memory
        cudaMemcpy(dhangle, hangle, sizeof(float)*360,cudaMemcpyHostToDevice);
 
        test_GlobalMemorykernel <<<  size/64  ,64  >>>  (darray,dhangle);
 
        //free device memory
        cudaFree(darray);
        cudaFree(dhangle);
    }
    if (kernel==2)
    {
 
        //allocate device memory
        cudaMalloc ((void**)&darray,sizeof(float)*size);
 
        //initialize allocated memory
        cudaMemset (darray,0,sizeof(float)*size);
 
        //copy host angle data to constant memory
        cudaMemcpyToSymbol(  cangle,  hangle,   sizeof(float)*360  );
 
        test_ConstantMemorykernel <<<  size/64  ,64  >>>  (darray);
 
        //free device memory
        cudaFree(darray);
   }
 
  return 0;
}
$ nvcc -arch=sm_35 -o t962 t962.cu
$ nvprof --print-gpu-trace ./t962 1
==1778== NVPROF is profiling process 1778, command: ./t962 1
==1778== Profiling application: ./t962 1
==1778== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput           Device   Context    Stream  Name
825.12ms  3.9680us                    -               -         -         -         -  12.500KB  3.0043GB/s  GeForce GT 640          1         7  [CUDA memset]
825.26ms  1.2160us                    -               -         -         -         -  1.4063KB  1.1029GB/s  GeForce GT 640          1         7  [CUDA memcpy HtoD]
825.29ms  131.04us             (50 1 1)        (64 1 1)         8        0B        0B         -           -  GeForce GT 640          1         7  test_GlobalMemorykernel(float*, float*) [97]
 
Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
$ nvprof --print-gpu-trace ./t962 2
==1788== NVPROF is profiling process 1788, command: ./t962 2
==1788== Profiling application: ./t962 2
==1788== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput           Device   Context    Stream  Name
825.32ms  4.0640us                    -               -         -         -         -  12.500KB  2.9333GB/s  GeForce GT 640          1         7  [CUDA memset]
825.34ms  1.1840us                    -               -         -         -         -  1.4063KB  1.1327GB/s  GeForce GT 640          1         7  [CUDA memcpy HtoD]
825.37ms  22.752us             (50 1 1)        (64 1 1)        14        0B        0B         -           -  GeForce GT 640          1         7  test_ConstantMemorykernel(float*) [95]
 
Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
$

Thanks @txbob.
Let me check it on a Linux machine.