I tried to create a minimalistic reproduction of the behavior I am seeing. Consider the code given below:
#include <stdlib.h>
#include <cuda_runtime.h>

__global__ void vectorAdd(const float *a, const float *b, float *c, int n) {
    // Calculate the index for this thread
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    float d = 10.0f;
    // Perform the vector addition if within bounds
    if (idx < n) {
        c[idx] = a[idx] + b[idx] + d;
    }
}

__global__ void vectorAdd1(const float *a, const float *b, float *c, float *data, int n) {
    // Calculate the index for this thread
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    // Every thread reads the single-element data vector from global memory
    float d = data[0];
    // Perform the vector addition if within bounds
    if (idx < n) {
        c[idx] = a[idx] + b[idx] + d;
    }
}

__global__ void vectorAdd2(const float *a, const float *b, float *c, float *data, int n) {
    // Calculate the index for this thread
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    float d = 10.0f;
    // Only the first thread of each warp reads the data vector
    if (threadIdx.x % warpSize == 0) {
        d = data[0];
    }
    // Perform the vector addition if within bounds
    if (idx < n) {
        c[idx] = a[idx] + b[idx] + d;
    }
}

int main() {
    int n = 1 << 19; // Size of vectors
    size_t bytes = n * sizeof(float);

    // Allocate host memory
    float *h_a = (float*)malloc(bytes);
    float *h_b = (float*)malloc(bytes);
    float *h_c = (float*)malloc(bytes);
    float *h_data = (float*)malloc(sizeof(float));

    // Initialize input vectors on host
    for (int i = 0; i < n; i++) {
        h_a[i] = i * 1.0f;
        h_b[i] = i * 2.0f;
    }
    h_data[0] = 10.0f;

    // Allocate device memory
    float *d_a, *d_b, *d_c, *d_data;
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);
    cudaMalloc(&d_c, bytes);
    cudaMalloc(&d_data, sizeof(float));

    // Copy data from host to device
    cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_data, h_data, sizeof(float), cudaMemcpyHostToDevice);

    // Launch the three kernels
    int threads = 256;
    int blocks = (n + threads - 1) / threads;
    vectorAdd<<<blocks, threads>>>(d_a, d_b, d_c, n);
    vectorAdd1<<<blocks, threads>>>(d_a, d_b, d_c, d_data, n);
    vectorAdd2<<<blocks, threads>>>(d_a, d_b, d_c, d_data, n);

    // Copy result back to host
    cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);

    // Free memory
    free(h_a);
    free(h_b);
    free(h_c);
    free(h_data);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaFree(d_data);
    return 0;
}
The first kernel, vectorAdd, is a basic vector-add kernel: I add two vectors a and b element-wise and store the result in a third vector c. So, per thread, we access global memory twice for reads and once for a write.
From the ncu timing diagram, there are 32.77K read requests to global memory (I guess 32.77K / 2 = 16.38K requests each for the reads of a and b).
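Spelling out my arithmetic (I am assuming here that ncu counts one request per warp per global load instruction):

    total threads            = 1 << 19       = 524,288
    warps                    = 524,288 / 32  = 16,384
    global loads per thread  = 2 (a[idx], b[idx])
    expected read requests   = 16,384 * 2    = 32,768 ≈ 32.77K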
In the second kernel, vectorAdd1, every thread additionally reads the contents of a vector containing just one element. So, based on the previous logic, there are now 3 * 16.38K = 49.15K read requests to global memory (one set for the data vector and one each for vectors a and b).
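Again, spelling out the expected count under the same assumption:

    global loads per thread  = 3 (a[idx], b[idx], data[0])
    expected read requests   = 16,384 * 3 = 49,152 ≈ 49.15K

which matches the number I see for vectorAdd1.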
The third kernel, vectorAdd2, accesses data only from the first thread of each warp, so I assumed that for the access of the data vector alone there should not be 16.38K requests but rather 16.38K / 32, given that the memory is accessed only once for every 32 threads. But contrary to my assumption, I find that the number of memory requests is the same as for vectorAdd1.
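Written out, the total I was expecting for vectorAdd2 was:

    requests for a and b = 16,384 * 2  = 32,768
    requests for data    = 16,384 / 32 = 512   (my expectation: one access per warp leader)
    expected total       = 33,280 ≈ 33.28K

yet ncu reports roughly the same ≈ 49.15K as for vectorAdd1.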
Why is it happening like this?
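In case it is relevant: the counter I am looking at is, as far as I understand, the global load request count, so I believe a command along these lines reproduces the numbers (the exact metric name and the binary name vector_add are my assumptions):

    ncu --metrics l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum ./vector_add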