Hi, I use a GTX470 and CUDA 4. When I profile the following simple program, the profiler finds a “Global Memory Replay Overhead” of 31% and a “Global Cache Replay Overhead” of 12.9%.
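The profiler suggests that this is caused by non-coalesced global memory accesses. However, I believe that the access pattern should be OK, since consecutive threads access consecutive float2 elements. What do these replay overheads mean, and what causes them here?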
#include <stdio.h>
#include <stdlib.h>

#include <cuda.h>
/*
 * Element-wise "sum of two products" over float2 arrays.
 *
 * For the element handled by each thread, the kernel computes
 *     (a.x*b.x - a.y*b.y,  a.x*b.y + a.y*b.x)
 * for the pair (lhs1, rhs1) and again for the pair (lhs2, rhs2), adds the
 * two results component-wise and stores the sum in `result`. This is the
 * complex product formula when x / y are read as real / imaginary parts.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element.
 * There is no bounds check, so every array must hold at least
 * gridDim.x * blockDim.x elements.
 */
__global__ void kernel_vectorfield_times_vectorfield(float2 *lhs1, float2 *rhs1,
                                                     float2 *lhs2, float2 *rhs2,
                                                     float2 *result)
{
    /* Flat global thread index; also the element index. */
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    /* First product: lhs1[gid] * rhs1[gid]. */
    const float2 a = lhs1[gid];
    const float2 b = rhs1[gid];
    float2 acc = make_float2(a.x * b.x - a.y * b.y,
                             a.x * b.y + a.y * b.x);

    /* Second product: lhs2[gid] * rhs2[gid], accumulated onto the first. */
    const float2 c = lhs2[gid];
    const float2 d = rhs2[gid];
    acc.x += c.x * d.x - c.y * d.y;
    acc.y += c.x * d.y + c.y * d.x;

    result[gid] = acc;
}
/* Print a diagnostic and terminate if a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n",
                what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Allocates five device arrays of 1024*1024 float2 elements, launches
 * kernel_vectorfield_times_vectorfield once over all of them and frees the
 * arrays again. The input arrays are never initialised and the result is
 * never copied back to the host.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main(){
    const size_t numElements = 1024 * 1024;
    const size_t numBytes = sizeof(float2) * numElements;
    const unsigned int threadsPerBlock = 128;

    float2 *l1 = NULL, *l2 = NULL, *r1 = NULL, *r2 = NULL, *data = NULL;

    checkCuda(cudaMalloc((void**)&l1, numBytes), "cudaMalloc(l1)");
    checkCuda(cudaMalloc((void**)&l2, numBytes), "cudaMalloc(l2)");
    checkCuda(cudaMalloc((void**)&r1, numBytes), "cudaMalloc(r1)");
    checkCuda(cudaMalloc((void**)&r2, numBytes), "cudaMalloc(r2)");
    checkCuda(cudaMalloc((void**)&data, numBytes), "cudaMalloc(data)");

    /* The kernel has no bounds check, so the grid must cover the arrays
       exactly: numElements is a multiple of threadsPerBlock. */
    dim3 dimGrid((unsigned int)(numElements / threadsPerBlock));
    dim3 dimBlock(threadsPerBlock);

    kernel_vectorfield_times_vectorfield<<<dimGrid, dimBlock>>>(l1, r1, l2, r2, data);

    /* A launch returns no status itself: query configuration errors first,
       then wait for the kernel so execution errors are reported here. */
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    checkCuda(cudaFree(l1), "cudaFree(l1)");
    checkCuda(cudaFree(l2), "cudaFree(l2)");
    checkCuda(cudaFree(r1), "cudaFree(r1)");
    checkCuda(cudaFree(r2), "cudaFree(r2)");
    checkCuda(cudaFree(data), "cudaFree(data)");

    return 0;
}