Where does the "Replay Overhead" come from?

Hi, I use a GTX470 and CUDA 4. When I profile the following simple program, the profiler finds a “Global Memory Replay Overhead” of 31% and a “Global Cache Replay Overhead” of 12.9%.
The profiler suggests that this is caused by non-coalesced global memory accesses. However, I believe that the access pattern should be fine. Where does the replay overhead come from, then?

#include <cuda.h>

#include <cstdio>

/*
 * Element-wise result[i] = lhs1[i] * rhs1[i] + lhs2[i] * rhs2[i], where the
 * product of two float2 values follows the complex-multiplication formula
 * (x acts as the real part, y as the imaginary part).
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element.
 *
 * Precondition: there is no bounds guard, so gridDim.x * blockDim.x must not
 * exceed the number of elements in any of the five arrays.
 */
__global__ void kernel_vectorfield_times_vectorfield(float2 *lhs1, float2 *rhs1,
                                                     float2 *lhs2, float2 *rhs2,
                                                     float2 *result){
  // Flat global element index: consecutive threads touch consecutive float2s.
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  float2 temp, l1, l2, r1, r2;

  // First product: lhs1[index] * rhs1[index].
  l1 = lhs1[index];
  r1 = rhs1[index];
  temp.x = l1.x * r1.x - l1.y * r1.y;
  temp.y = l1.x * r1.y + l1.y * r1.x;

  // Second product, accumulated onto the first.
  l2 = lhs2[index];
  r2 = rhs2[index];
  temp.x += l2.x * r2.x - l2.y * r2.y;
  temp.y += l2.x * r2.y + l2.y * r2.x;

  result[index] = temp;
}
/*
 * Reports a failed CUDA runtime call on stderr and makes main() return 1.
 * Only usable inside a function returning int (it expands to `return 1;`).
 */
#define CUDA_CHECK(call)                                                  \
  do {                                                                    \
    cudaError_t err_ = (call);                                            \
    if (err_ != cudaSuccess) {                                            \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,       \
              cudaGetErrorString(err_));                                  \
      return 1;                                                           \
    }                                                                     \
  } while (0)

/*
 * Allocates five device arrays of 1024*1024 float2 elements, launches
 * kernel_vectorfield_times_vectorfield once over all of them with 128 threads
 * per block, waits for completion, and releases the arrays.
 *
 * Returns 0 on success, 1 if any CUDA call or the kernel launch fails.
 */
int main(){
  const unsigned int numElements = 1024 * 1024;
  const unsigned int threadsPerBlock = 128;
  const size_t bytes = sizeof(float2) * numElements;

  float2 *l1 = NULL, *l2 = NULL, *r1 = NULL, *r2 = NULL, *data = NULL;
  CUDA_CHECK(cudaMalloc((void**)&l1, bytes));
  CUDA_CHECK(cudaMalloc((void**)&l2, bytes));
  CUDA_CHECK(cudaMalloc((void**)&r1, bytes));
  CUDA_CHECK(cudaMalloc((void**)&r2, bytes));
  CUDA_CHECK(cudaMalloc((void**)&data, bytes));

  // cudaMalloc does not initialise memory; zero the inputs so the kernel
  // does not read indeterminate values.
  CUDA_CHECK(cudaMemset(l1, 0, bytes));
  CUDA_CHECK(cudaMemset(l2, 0, bytes));
  CUDA_CHECK(cudaMemset(r1, 0, bytes));
  CUDA_CHECK(cudaMemset(r2, 0, bytes));

  // numElements is an exact multiple of threadsPerBlock, so the grid covers
  // the arrays exactly; the kernel has no bounds guard and relies on this.
  dim3 dimGrid(numElements / threadsPerBlock);
  dim3 dimBlock(threadsPerBlock);
  kernel_vectorfield_times_vectorfield<<<dimGrid, dimBlock>>>(l1, r1, l2, r2, data);
  CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
  CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

  CUDA_CHECK(cudaFree(l1));
  CUDA_CHECK(cudaFree(l2));
  CUDA_CHECK(cudaFree(r1));
  CUDA_CHECK(cudaFree(r2));
  CUDA_CHECK(cudaFree(data));
  return 0;
}