Constant cache no faster than global memory? (constant memory access)

Hi: the two versions below give the same performance. The first, I believe, uses constant memory; the second uses global memory. What is that 8 KB constant cache per multiprocessor doing? Thanks!

Version 1:

__constant__ int* c_idata;
__constant__ int* c_odata;

__global__ void testKernel(int* c_idata, int* c_odata, const int nR);  // forward declaration

void runTest(int argc, char** argv)
{
    int nR = 1024 * 1024 * 16;
    int* h_idata = (int*)malloc(sizeof(int) * nR);
    CUDA_SAFE_CALL(cudaMalloc((void**)&c_idata, sizeof(int) * nR));
    CUDA_SAFE_CALL(cudaMemcpy(c_idata, h_idata, sizeof(int) * nR, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMalloc((void**)&c_odata, sizeof(int) * nR));

    dim3 Dg(128, 1, 1);
    dim3 Db(256, 1, 1);
    testKernel<<<Dg, Db>>>(c_idata, c_odata, nR);
}

__global__ void testKernel(int* c_idata, int* c_odata, const int nR)
{
    int bx = blockIdx.x;
    int tx = threadIdx.x;
    int blockSize = nR / gridDim.x;          // elements handled per block
    int loopTimes = blockSize / blockDim.x;  // iterations per thread
    int dataIdx;
    for (int i = 0; i < loopTimes; i++)
    {
        dataIdx = blockSize * bx + i * blockDim.x + tx;
        for (int j = 0; j < 1023; j++)
        {
            c_idata[dataIdx + 1] = c_idata[dataIdx];
        }
        c_odata[dataIdx] = c_idata[dataIdx];
    }
}

Version 2:

//__constant__ int* c_idata;
//__constant__ int* c_odata;

void runTest(int argc, char** argv)
{
    int* c_idata;
    int* c_odata;
    ... // below all the same as version 1

Both versions use global memory. In the first version there is a symbol overload: the parameters c_idata and c_odata in the kernel signature shadow the file-scope __constant__ symbols of the same names, so inside the kernel you are dereferencing ordinary pointers into global memory.
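Stripped down, your version 1 amounts roughly to this:

__constant__ int* c_idata;   // file-scope __constant__ symbols
__constant__ int* c_odata;

__global__ void testKernel(int* c_idata, int* c_odata, const int nR)
{
    // In here the parameters named c_idata/c_odata hide the __constant__
    // symbols above -- ordinary C++ name shadowing. Every access goes
    // through the plain pointers passed at launch, i.e. straight to
    // global memory; the __constant__ qualifiers never come into play.
    if (blockIdx.x == 0 && threadIdx.x == 0)
        c_odata[0] = c_idata[0];   // a global-memory read and write
}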

Peter

Thanks, I'm afraid I can't quite follow what you mean by "symbol overload" and "signature". Could you also point me to some examples of how to use constant memory? Thanks!
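The usual pattern is to give the __constant__ variable a size known at compile time, fill it from the host with cudaMemcpyToSymbol, and read it inside the kernel without passing it as a parameter. A minimal sketch (the filter kernel, its names, and FILTER_SIZE are only illustrative):

#define FILTER_SIZE 64

__constant__ int c_filter[FILTER_SIZE];   // the data itself lives in constant memory

__global__ void filterKernel(int* d_out, const int* d_in, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx > n - FILTER_SIZE) return;
    int sum = 0;
    for (int k = 0; k < FILTER_SIZE; k++)
        // All threads of a half-warp read the same c_filter[k] in each
        // iteration, so the constant cache can broadcast the value.
        sum += d_in[idx + k] * c_filter[k];
    d_out[idx] = sum;
}

void setupFilter(const int* h_filter)
{
    // Note there is no cudaMalloc for c_filter: the storage already exists
    // as part of the module. cudaMemcpyToSymbol fills it from the host.
    CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_filter, h_filter, sizeof(int) * FILTER_SIZE));
}

The kernel never receives c_filter as an argument; it refers to the symbol directly, which is what makes the compiler emit constant-memory loads.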

Thanks, I think I catch your meaning now. I changed the code as below, and the timing dropped from 50 ms to 8 ms:

__constant__ int* c_idata;
__constant__ int* c_odata;

__global__ void testKernel(const int nR);  // forward declaration

void runTest(int argc, char** argv)
{
    //int* c_idata;
    //int* c_odata;
    int nR = 1024 * 1024 * 16;
    int* h_idata = (int*)malloc(sizeof(int) * nR);
    CUDA_SAFE_CALL(cudaMalloc((void**)&c_idata, sizeof(int) * nR));
    CUDA_SAFE_CALL(cudaMemcpy(c_idata, h_idata, sizeof(int) * nR, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMalloc((void**)&c_odata, sizeof(int) * nR));

    dim3 Dg(128, 1, 1);
    dim3 Db(256, 1, 1);
    testKernel<<<Dg, Db>>>(/* c_idata, c_odata, */ nR);
}

extern __constant__ int* c_idata;
extern __constant__ int* c_odata;

__global__ void testKernel(/*__constant__ int* c_idata, __constant__ int* c_odata, */ const int nR)
{
    int bx = blockIdx.x;
    int tx = threadIdx.x;
    int blockSize = nR / gridDim.x;
    int loopTimes = blockSize / blockDim.x;
    int dataIdx;
    for (int i = 0; i < loopTimes; i++)
    {
        dataIdx = blockSize * bx + i * blockDim.x + tx;
        for (int j = 0; j < 1023; j++)
        {
            c_idata[dataIdx + 1] = c_idata[dataIdx];
        }
        c_odata[dataIdx] = c_idata[dataIdx];
    }
}

Is this a correct way of using constant memory?

Thanks.

No, it is not.
Constant memory is read-only on the GPU, so data that a kernel writes, the way yours writes c_idata and c_odata, can never live there.
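There is a second problem: only the two pointer values themselves sit in constant memory in your revision; the 16M ints they point at are still in ordinary global memory, so the actual data traffic never touches the constant cache. And a __constant__ symbol cannot be initialized by calling cudaMalloc on its address from the host; the pointer value has to be copied in with cudaMemcpyToSymbol. A minimal sketch, assuming you really did want the pointer itself in constant memory (d_idata is an illustrative name):

__constant__ int* c_idata;   // only this pointer lives in constant memory

void setup(int nR)
{
    int* d_idata;             // ordinary device pointer held on the host
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_idata, sizeof(int) * nR));
    // Copy the pointer *value* into the __constant__ symbol. The buffer it
    // points to stays in global memory; only loads of the pointer itself
    // are served by the constant cache.
    CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_idata, &d_idata, sizeof(int*)));
}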