Section 5.1.2.3 of the programming guide says:
" For all threads of a half-warp, reading from the constant cache is as fast as reading
from a register as long as all threads read the same address. The cost scales linearly
with the number of different addresses read by all threads."
I wrote a simple kernel to test "warp serialization" of constant memory:
// 16x16 table of float test values (1 KB) placed in the device constant
// memory space; read by the fotone test kernels below. Constant-cache
// reads are broadcast at register speed only when all threads of a
// half-warp read the same address; different addresses serialize.
__constant__ float PMCenterPosition[16][16] =
{
1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f,
2.8f, 2.7f, 3.6f, 4.5f, 5.4f, 6.3f, 7.2f, 8.1f, 9.2f, 10.3f, 11.4f, 12.5f, 13.6f, 14.7f, 15.8f, 16.1f,
3.0f, 2.0f, 3.0f, 4.1f, 5.0f, 9.0f, 17.0f, 8.0f, 9.1f, 11.0f, 11.0f, 12.0f, 14.0f, 15.0f, 25.0f, 16.0f,
4.0f, 3.0f, 4.0f, 5.0f, 6.0f, 6.1f, 8.0f, 8.2f, 9.5f, 10.2f, 11.2f, 12.2f, 13.2f, 14.3f, 15.01f, 16.6f,
1.7f, 2.0f, 3.1f, 4.1f, 5.1f, 6.3f, 7.5f, 8.7f, 9.6f, 41.0f, 11.0f, 12.1f, 23.0f, 14.0f, 11.0f, 16.0f,
2.9f, 2.1f, 3.6f, 4.2f, 5.2f, 6.2f, 7.1f, 8.1f, 9.2f, 10.3f, 21.4f, 22.5f, 13.6f, 15.7f, 19.8f, 16.0f,
3.6f, 2.2f, 3.3f, 4.3f, 5.3f, 9.1f, 1.6f, 8.7f, 9.1f, 51.1f, 81.0f, 12.0f, 14.1f, 15.0f, 25.0f, 16.0f,
4.5f, 3.3f, 4.4f, 5.4f, 6.4f, 6.4f, 8.7f, 8.2f, 9.4f, 10.2f, 91.2f, 13.2f, 13.2f, 14.3f, 17.01f, 16.7f,
3.0f, 2.0f, 3.0f, 4.1f, 5.0f, 9.0f, 17.0f, 8.0f, 9.1f, 11.0f, 11.0f, 12.0f, 14.0f, 15.0f, 25.0f, 16.0f,
4.0f, 3.0f, 4.0f, 5.0f, 6.0f, 6.1f, 8.0f, 8.2f, 9.5f, 10.2f, 11.2f, 12.2f, 13.2f, 14.3f, 15.01f, 16.5f,
2.9f, 2.1f, 3.6f, 4.2f, 5.2f, 6.2f, 7.1f, 8.1f, 9.2f, 10.3f, 21.4f, 22.5f, 13.6f, 15.7f, 19.8f, 16.0f,
3.6f, 2.2f, 3.3f, 4.3f, 5.3f, 9.1f, 1.6f, 8.7f, 9.1f, 51.1f, 81.0f, 12.0f, 14.1f, 15.0f, 25.0f, 16.0f,
2.8f, 2.7f, 3.6f, 4.5f, 5.4f, 6.3f, 7.2f, 8.1f, 9.2f, 10.3f, 11.4f, 12.5f, 13.6f, 14.7f, 15.8f, 16.0f,
4.5f, 3.3f, 4.4f, 5.4f, 6.4f, 6.4f, 8.7f, 8.2f, 9.4f, 10.2f, 91.2f, 13.2f, 13.2f, 14.3f, 17.01f, 16.1f,
3.8f, 2.7f, 5.6f, 4.5f, 5.4f, 6.3f, 9.2f, 8.1f, 9.2f, 20.3f, 11.4f, 12.5f, 13.6f, 14.7f, 15.8f, 16.0f,
4.5f, 3.3f, 4.4f, 5.4f, 6.4f, 6.4f, 8.7f, 8.2f, 9.4f, 10.2f, 91.2f, 13.2f, 13.2f, 14.3f, 17.01f, 16.3f
};
// case 1: warp serialization = 589
// Each half-warp (16 threads) shares one row index k1, but k2 differs
// per thread, so the 16 threads of a half-warp read 16 different
// constant-memory addresses -> partial serialization of the constant
// cache access.
// Launch layout assumed: 1-D grid of 1-D blocks (see cmem_device);
// one output element is written per thread. A and B are unused but kept
// so the launch signature matches the other test cases.
static __global__ void fotone( float *A, float* B, float *C )
{
    int k1 = threadIdx.x / 16;   // row: identical for all 16 threads of a half-warp
    int k2 = threadIdx.x % 16;   // column: differs for every thread of a half-warp
    // Use blockDim.x rather than the hard-coded 128 so the index stays
    // correct if the block size changes (blockDim.x == 128 in this test).
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    C[index] = PMCenterPosition[k1][k2];
}
// Host wrapper: launches the fotone test kernel with 30 blocks of
// 128 threads (one output float per thread, 3840 elements total).
// A and B are forwarded unused; C receives the constant-memory reads.
void cmem_device( float *A, float* B, float *C )
{
    dim3 threads( 128, 1, 1 );
    dim3 grid( 30, 1 );
    fotone<<<grid, threads>>>( A, B, C );
    // A kernel launch does not return an error code itself; catch
    // bad-configuration / launch failures explicitly.
    CUDA_SAFE_CALL( cudaGetLastError() );
}
and the driver:
// Driver for the constant-memory profiling test: allocates host/device
// buffers, fills A with random values and B with random bin indices,
// runs the test kernel via cmem_device, and copies the result back.
// A and B are not read by the test kernels; they are kept so the call
// signature mirrors the real application kernel being simulated.
void profile_cmem( void )
{
    float *h_A, *h_B, *h_C;
    int num_bin = 30;       // blocks in the grid (matches cmem_device)
    int num_thread = 128;   // threads per block (matches cmem_device)
    // size_t avoids any signed/implicit-conversion issues in the
    // allocation and copy sizes.
    size_t size_A = num_bin * num_thread * sizeof( float );
    size_t size_B = size_A;
    size_t size_C = size_A;

    h_A = (float*) malloc( size_A ); assert(h_A);
    h_B = (float*) malloc( size_B ); assert(h_B);
    h_C = (float*) malloc( size_C ); assert(h_C);

    for( int i = 0; i < num_bin * num_thread; i++){
        // Explicit casts: rand() returns int.
        h_A[i] = (float) rand();
        h_B[i] = (float)( rand() % num_bin );
    }

    float *d_A, *d_B, *d_C;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_A, size_A));
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_B, size_B));
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_C, size_C));
    CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, size_A, cudaMemcpyHostToDevice) );
    CUDA_SAFE_CALL(cudaMemcpy(d_B, h_B, size_B, cudaMemcpyHostToDevice) );

    cmem_device( d_A, d_B, d_C );

    // Blocking device-to-host copy also synchronizes with the kernel
    // launched above. Unified on CUDA_SAFE_CALL (the function previously
    // mixed CUDA_SAFE_CALL and cutilSafeCall).
    CUDA_SAFE_CALL( cudaMemcpy( h_C, d_C, size_C, cudaMemcpyDeviceToHost) );

    free( h_A ); free( h_B ); free( h_C );
    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_B));
    CUDA_SAFE_CALL(cudaFree(d_C));
}
The kernel of "case 1" simulates the indexing expression of your kernel:
int Idx_centerpmx = ceilf(Idx_rel/16)*PM_X_THREAD*16+threadIdx.x%16+16*ipm;
However, it does not satisfy "all threads of a half-warp read the same address",
so it shows "warp serialization = 589".
Similarly, if we use
// case 2: warp serialization = 1566
// k2 differs for every thread of a half-warp, so the 16 threads read
// 16 different rows of PMCenterPosition, i.e. 16 distinct
// constant-memory addresses -> the constant-cache access is fully
// serialized. A and B are unused; kept to match the other cases.
static __global__ void fotone( float *A, float* B, float *C )
{
    int k2 = threadIdx.x % 16;   // differs per thread within a half-warp
    // blockDim.x instead of the hard-coded 128 (blockDim.x == 128 here).
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    C[index] = PMCenterPosition[k2][0];
}
then it shows "warp serialization = 1566", since each thread of a half-warp accesses a different address.
However if we use
// case 3: warp serialization = 0
// k2 is identical for all 16 threads of a half-warp, so every thread
// reads the same constant-memory address and the constant cache
// broadcasts the value -> no serialization. A and B are unused; kept to
// match the other cases.
static __global__ void fotone( float *A, float* B, float *C )
{
    int k2 = threadIdx.x / 16;   // identical across each half-warp
    // blockDim.x instead of the hard-coded 128 (blockDim.x == 128 here).
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    C[index] = PMCenterPosition[k2][0];
}
then it shows zero warp serialization, since all threads of a half-warp access the same address.