The global memory reads in the code below are aligned and coalesced, yet gld_efficiency drops to only 50% when the code is built with the nvcc option -Xptxas -dlcm=ca to enable the L1/texture cache.
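As I understand it, nvprof computes gld_efficiency as the ratio of the global load throughput the kernel requested to the throughput the memory system actually had to deliver, i.e. roughly

    gld_efficiency = gld_requested_throughput / gld_throughput * 100%

(gld_requested_throughput and gld_throughput being the corresponding nvprof metrics). Please correct me if that reading of the metric is wrong.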
The code is copied from the book Professional CUDA C Programming (readSegment.cu):
#include "../common/common.h"
#include <cuda_runtime.h>
#include <stdio.h>
/*
* This example demonstrates the impact of misaligned reads on performance by
* forcing misaligned reads to occur on a float*.
*/
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = true;

    for (int i = 0; i < N; i++)
    {
        // fabs avoids accidentally picking up the integer abs() overload
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            match = false;
            printf("different on %dth element: host %f gpu %f\n", i, hostRef[i],
                   gpuRef[i]);
            break;
        }
    }

    if (!match) printf("Arrays do not match.\n\n");
}
void initialData(float *ip, int size)
{
    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)(rand() & 0xFF) / 100.0f;
    }

    return;
}
void sumArraysOnHost(float *A, float *B, float *C, const int n, int offset)
{
    for (int idx = offset, k = 0; idx < n; idx++, k++)
    {
        C[k] = A[idx] + B[idx];
    }
}
__global__ void warmup(float *A, float *B, float *C, const int n, int offset)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int k = i + offset;

    if (k < n) C[i] = A[k] + B[k];
}

__global__ void readOffset(float *A, float *B, float *C, const int n,
                           int offset)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int k = i + offset;

    if (k < n) C[i] = A[k] + B[k];
}

__global__ void readTest(float *A, float *C, const int n, int offset)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int k = i + offset;

    if (k < n) C[i] = A[k];
}
int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("%s starting at ", argv[0]);
    printf("device %d: %s ", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up array size
    int nElem = 1 << 20; // total number of elements
    printf(" with array size %d\n", nElem);
    size_t nBytes = nElem * sizeof(float);

    // set up offset and block size from the command line
    int blocksize = 512;
    int offset = 0;

    if (argc > 1) offset = atoi(argv[1]);
    if (argc > 2) blocksize = atoi(argv[2]);

    // execution configuration
    dim3 block(blocksize, 1);
    dim3 grid((nElem + block.x - 1) / block.x, 1);

    // allocate host memory
    float *h_A = (float *)malloc(nBytes);
    float *h_B = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef = (float *)malloc(nBytes);

    // initialize host arrays
    initialData(h_A, nElem);
    memcpy(h_B, h_A, nBytes);

    // compute the reference result on the host
    sumArraysOnHost(h_A, h_B, hostRef, nElem, offset);

    // allocate device memory
    float *d_A, *d_B, *d_C, *d_test;
    CHECK(cudaMalloc((float **)&d_A, nBytes));
    CHECK(cudaMalloc((float **)&d_B, nBytes));
    CHECK(cudaMalloc((float **)&d_C, nBytes));
    CHECK(cudaMalloc((float **)&d_test, nBytes));

    // copy data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_A, nBytes, cudaMemcpyHostToDevice));

    // kernel 1: warmup
    double iStart = seconds();
    warmup<<<grid, block>>>(d_A, d_B, d_C, nElem, offset);
    CHECK(cudaDeviceSynchronize());
    double iElaps = seconds() - iStart;
    printf("warmup <<< %4d, %4d >>> offset %4d elapsed %f sec\n", grid.x,
           block.x, offset, iElaps);
    CHECK(cudaGetLastError());

    // kernel 2: readOffset
    iStart = seconds();
    readOffset<<<grid, block>>>(d_A, d_B, d_C, nElem, offset);
    CHECK(cudaDeviceSynchronize());
    iElaps = seconds() - iStart;
    printf("readOffset <<< %4d, %4d >>> offset %4d elapsed %f sec\n", grid.x,
           block.x, offset, iElaps);
    CHECK(cudaGetLastError());

    // kernel 3: readTest (single input array)
    iStart = seconds();
    readTest<<<grid, block>>>(d_A, d_test, nElem, offset);
    CHECK(cudaDeviceSynchronize());
    iElaps = seconds() - iStart;
    printf("readTest <<<%4d, %4d>>> offset %4d elapsed %f sec\n", grid.x,
           block.x, offset, iElaps);
    CHECK(cudaGetLastError());

    // copy kernel result back to host side and check device results
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkResult(hostRef, gpuRef, nElem - offset);

    // free host and device memory
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));
    CHECK(cudaFree(d_test));
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}
I compile the code with the following command:
nvcc -arch=sm_61 readSegment.cu -o readSegment
For different offset values:
nvprof --metrics gld_efficiency ./readSegment 0
Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "TITAN Xp (0)"
Kernel: readOffset(float*, float*, float*, int, int)
1 gld_efficiency Global Memory Load Efficiency 100.00% 100.00% 100.00%
Kernel: readTest(float*, float*, int, int)
1 gld_efficiency Global Memory Load Efficiency 100.00% 100.00% 100.00%
Kernel: warmup(float*, float*, float*, int, int)
1 gld_efficiency Global Memory Load Efficiency 100.00% 100.00% 100.00%
nvprof --metrics gld_efficiency ./readSegment 11
Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "TITAN Xp (0)"
Kernel: readOffset(float*, float*, float*, int, int)
1 gld_efficiency Global Memory Load Efficiency 80.00% 80.00% 80.00%
Kernel: readTest(float*, float*, int, int)
1 gld_efficiency Global Memory Load Efficiency 80.00% 80.00% 80.00%
Kernel: warmup(float*, float*, float*, int, int)
1 gld_efficiency Global Memory Load Efficiency 80.00% 80.00% 80.00%
nvprof --metrics gld_efficiency ./readSegment 128
Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "TITAN Xp (0)"
Kernel: readOffset(float*, float*, float*, int, int)
1 gld_efficiency Global Memory Load Efficiency 100.00% 100.00% 100.00%
Kernel: readTest(float*, float*, int, int)
1 gld_efficiency Global Memory Load Efficiency 100.00% 100.00% 100.00%
Kernel: warmup(float*, float*, float*, int, int)
1 gld_efficiency Global Memory Load Efficiency 100.00% 100.00% 100.00%
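These numbers without -dlcm=ca look consistent to me with uncached loads being serviced in 32-byte segments. A rough back-of-the-envelope check (one 4-byte float per thread, 32 threads per warp, so 128 bytes requested per warp):

    offset 0:   the 128-byte span is 128-byte aligned, touches 4 x 32-byte segments = 128 bytes -> 128/128 = 100%
    offset 11:  the span starts 44 bytes in and straddles 5 segments = 160 bytes                -> 128/160 = 80%
    offset 128: the start address is 512 bytes in, aligned again, 4 segments = 128 bytes        -> 100%

which matches the reported 100% / 80% / 100%.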
Next, I compile the code with the following command to enable the L1/texture cache:
nvcc -arch=sm_61 -Xptxas -dlcm=ca readSegment.cu -o enableL1ReadSegment
For different offset values:
nvprof --metrics gld_efficiency ./enableL1ReadSegment 0
Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "TITAN Xp (0)"
Kernel: readOffset(float*, float*, float*, int, int)
1 gld_efficiency Global Memory Load Efficiency 50.00% 50.00% 50.00%
Kernel: readTest(float*, float*, int, int)
1 gld_efficiency Global Memory Load Efficiency 50.00% 50.00% 50.00%
Kernel: warmup(float*, float*, float*, int, int)
1 gld_efficiency Global Memory Load Efficiency 50.00% 50.00% 50.00%
nvprof --metrics gld_efficiency ./enableL1ReadSegment 11
Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "TITAN Xp (0)"
Kernel: readOffset(float*, float*, float*, int, int)
1 gld_efficiency Global Memory Load Efficiency 40.00% 40.00% 40.00%
Kernel: readTest(float*, float*, int, int)
1 gld_efficiency Global Memory Load Efficiency 40.00% 40.00% 40.00%
Kernel: warmup(float*, float*, float*, int, int)
1 gld_efficiency Global Memory Load Efficiency 40.00% 40.00% 40.00%
nvprof --metrics gld_efficiency ./enableL1ReadSegment 128
Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "TITAN Xp (0)"
Kernel: readOffset(float*, float*, float*, int, int)
1 gld_efficiency Global Memory Load Efficiency 50.00% 50.00% 50.00%
Kernel: readTest(float*, float*, int, int)
1 gld_efficiency Global Memory Load Efficiency 50.00% 50.00% 50.00%
Kernel: warmup(float*, float*, float*, int, int)
1 gld_efficiency Global Memory Load Efficiency 50.00% 50.00% 50.00%
Why does enabling the L1 cache cut gld_efficiency to half of the value measured without it, in every case above (100% -> 50%, 80% -> 40%, 100% -> 50%)?
I know an L1 cache line is 128 bytes, but that alone does not explain these numbers very well, as the arithmetic below shows.
I hope to get a more thorough explanation.
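To make the question concrete, here is the naive arithmetic I tried for the -dlcm=ca case, assuming the load granularity becomes a 128-byte L1 line (again 128 bytes requested per warp):

    offset 0:   the 128-byte span fits exactly one 128-byte line -> expected 128/128 = 100%, measured 50%
    offset 11:  the span straddles two lines = 256 bytes         -> expected 128/256 = 50%,  measured 40%
    offset 128: aligned again, one line = 128 bytes               -> expected 100%,           measured 50%

None of the measured values match this reasoning, which is why the 128-byte-line argument does not satisfy me.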
My GPU is a TITAN Xp.
Thanks