Hi all,
I’m trying to understand what limits the performance of kernels that do non-coalesced memory reads. Here is the code I use to test things:
#include <cstdlib>
#include <cstdio>
using namespace std;
// Total float elements per buffer: 512 MB / sizeof(float) = 128M elements.
const int kElements = 1024*1024*512/4;
// Threads per block (maximum for this architecture).
const int kBlockSize = 1024;
// Number of ping-pong round trips in main(); each element ends at kIterations * 2.
const int kIterations = 20;
// Elements processed per thread (loop unroll factor inside each kernel).
const int kUnroll = 1;
// Total threads launched; assumes kElements is divisible by kUnroll.
const int kThreads = kElements / kUnroll;
// Grid size; assumes kThreads is divisible by kBlockSize.
const int kBlocks = kThreads / kBlockSize;
// Total warps in the grid ("Wraps" looks like a typo for "Warps"; kept
// as-is because the uncoalesced kernel references this name).
const int kWraps = kThreads / 32;
__global__
// Fully coalesced read-modify-write: dst[j] = src[j] + 1 for every element.
// Each thread handles kUnroll elements spaced kThreads apart, so within any
// one iteration the 32 lanes of a warp touch 32 consecutive floats — one
// contiguous 128-byte segment per warp access.
// Launch: <<<kBlocks, kBlockSize>>>. Assumes kElements == kThreads * kUnroll
// exactly, so no tail bounds check is needed.
void IncrementCopyCoalesced(const float* __restrict__ src,
                            float* __restrict__ dst) {
  int i = threadIdx.x + kBlockSize * blockIdx.x;
#pragma unroll
  for (int k = 0; k < kUnroll; k++) {
    // 1.0f keeps the arithmetic in single precision (a bare "1" promotes
    // through int but is harmless here; the float literal is explicit).
    dst[i + k * kThreads] = src[i + k * kThreads] + 1.0f;
  }
}
__global__
// Same computation as IncrementCopyCoalesced, but with a transposed index:
// lane l of warp w accesses element w + l * kWraps, so the 32 lanes of a
// warp hit addresses kWraps * sizeof(float) bytes apart — 32 separate
// memory transactions per warp instead of the coalesced kernel's minimum.
// Launch: <<<kBlocks, kBlockSize>>>. Assumes kElements == kThreads * kUnroll.
void IncrementCopyUncoalesced(const float* __restrict__ src,
                              float* __restrict__ dst) {
  int thread_idx = threadIdx.x + blockIdx.x * kBlockSize;
  int warp_idx = thread_idx / 32;   // which warp this thread belongs to
  int warp_lane = thread_idx % 32;  // lane index within the warp
  // Transposed base index: adjacent lanes land kWraps elements apart.
  int i0 = warp_idx + warp_lane * kWraps;
  // Stage all loads into registers before any store, separating the read
  // and write phases.
  float data[kUnroll];
#pragma unroll
  for (int k = 0; k < kUnroll; ++k) {
    data[k] = src[i0 + kThreads * k];
  }
  /* __syncthreads(); */  // intentionally disabled in the original experiment
#pragma unroll
  for (int k = 0; k < kUnroll; ++k) {
    dst[i0 + kThreads * k] = data[k] + 1.0f;
  }
}
// Copies `sz` bytes from device buffer `src` to the host and verifies that
// all kElements floats equal kIterations * 2 (each of main()'s kIterations
// loop passes increments every element twice). Aborts with a diagnostic on
// the first mismatch or on any allocation/copy error.
// Note: the blocking cudaMemcpy also synchronizes with all kernels
// previously launched on the default stream, so results are final here.
void check(float* src, int sz) {
  float* cpu_data = (float*)malloc(sz);
  if (cpu_data == NULL) {
    printf("Host allocation of %i bytes failed\n", sz);
    abort();
  }
  cudaError_t err = cudaMemcpy(cpu_data, src, sz, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) {
    // A failure here also surfaces sticky errors from earlier kernel runs.
    printf("cudaMemcpy failed: %s\n", cudaGetErrorString(err));
    abort();
  }
  printf("Checking...\n");
  for (int i = 0; i < kElements; ++i) {
    // Values are small exact integers in float, so == comparison is safe.
    if (cpu_data[i] != kIterations * 2) {
      printf("Bad result at index %i: %f\n", i, cpu_data[i]);
      abort();
    }
  }
  printf("Done.\n");
  free(cpu_data);
}
// Benchmark driver: for each kernel variant, ping-pongs src -> dst -> src
// kIterations times (so every element ends at kIterations * 2) and then
// verifies the result on the host. Time the kernels externally (nvprof).
int main() {
  const int sz = sizeof(float) * kElements;
  float *src, *dst;
  if (cudaMalloc((void**)&src, sz) != cudaSuccess ||
      cudaMalloc((void**)&dst, sz) != cudaSuccess) {
    printf("cudaMalloc of %i bytes failed\n", sz);
    abort();
  }

  printf("Running coalesced...\n");
  cudaMemset(src, 0, sz);
  for (int i = 0; i < kIterations; ++i) {
    IncrementCopyCoalesced<<<kBlocks, kBlockSize>>>(src, dst);
    IncrementCopyCoalesced<<<kBlocks, kBlockSize>>>(dst, src);
  }
  // Surface launch-configuration errors immediately; in-kernel faults are
  // caught by the synchronizing cudaMemcpy inside check().
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    printf("Coalesced launch failed: %s\n", cudaGetErrorString(err));
    abort();
  }
  check(src, sz);

  printf("Running uncoalesced...\n");
  cudaMemset(src, 0, sz);
  for (int i = 0; i < kIterations; ++i) {
    IncrementCopyUncoalesced<<<kBlocks, kBlockSize>>>(src, dst);
    IncrementCopyUncoalesced<<<kBlocks, kBlockSize>>>(dst, src);
  }
  err = cudaGetLastError();
  if (err != cudaSuccess) {
    printf("Uncoalesced launch failed: %s\n", cudaGetErrorString(err));
    abort();
  }
  check(src, sz);

  cudaFree(src);
  cudaFree(dst);
  return 0;
}
IncrementCopyCoalesced is memory throughput bound. However the profiler tells me that IncrementCopyUncoalesced is latency bound and I don’t understand why. The coalesced copy issues 4 L2 transactions per warp (I’m running on a GTX 1070). The uncoalesced copy issues 32 transactions per warp, and I would expect that kernel to be bandwidth bound as well (at the L2 cache level in this case, since the hit rate is quite high). However the profiler tells me that the utilization of all memory and compute units is low. So where does the latency come from? Surely 32 transactions per warp should hide memory latency better than 4.
The only idea I have is that a memory transaction queue is limited somewhere. I saw a mention somewhere that there is a per-warp queue for outstanding global loads that is about 6 requests long. In that case, 32 transactions would incur roughly 5x the L2/DRAM latency. This roughly matches the timings I get (5 ms for the coalesced copy vs 30 ms for the uncoalesced one), but how do I confirm it in the profiler?
Here are some numbers from nvprof:
Device "GeForce GTX 1070 (0)"
Kernel: IncrementCopyCoalesced(float*, float*)
40 l2_read_throughput L2 Throughput (Reads) 93.779GB/s 94.967GB/s 94.398GB/s
40 l2_write_throughput L2 Throughput (Writes) 93.104GB/s 93.945GB/s 93.492GB/s
40 dram_read_throughput Device Memory Read Throughput 93.774GB/s 94.966GB/s 94.395GB/s
40 tex_utilization Unified Cache Utilization Low (1) Low (1) Low (1)
40 issue_slot_utilization Issue Slot Utilization 9.26% 9.43% 9.33%
Kernel: IncrementCopyUncoalesced(float*, float*)
40 l2_read_throughput L2 Throughput (Reads) 111.23GB/s 111.53GB/s 111.41GB/s
40 l2_write_throughput L2 Throughput (Writes) 110.35GB/s 110.56GB/s 110.46GB/s
40 dram_read_throughput Device Memory Read Throughput 14.673GB/s 14.830GB/s 14.756GB/s
40 tex_utilization Unified Cache Utilization Low (1) Low (1) Low (1)
40 issue_slot_utilization Issue Slot Utilization 1.96% 1.96% 1.96%
Thanks!