My profiler says gld_coherent is 1 and gst_coherent is 8?
But I think coalesced store should also be 1.
I run this code on GeForce9500GT driver is 190.18
nvcc version 2.3 Gcc 4.1.2
However, both gld_128b and gst_128b are 1 on GTX280.
Did I do something wrong with coalesced strore, or it is just a profiler error.
Here is the simple code:
#include <cuda.h>
#include <stdio.h>
#define N 16
global void kernel1( float2 *i_d, float2 *o_d);
int main (void) {
float2 * in, *out, *i_d, o_d;
in=(float2) malloc(sizeof(float2)*N);
cudaMalloc((void**)&i_d, sizeof(float2)N);
cudaMalloc((void*)&o_d, sizeof(float2)*N);
cudaMemcpy(i_d, in, sizeof(float2)*N, cudaMemcpyHostToDevice);
dim3 dB1(16,1);
dim3 dG1(1,1);
kernel1<<<dG1, dB1>>>( i_d, o_d);
return 0;
}
global void kernel1( float2* i_d, float2* o_d)
{
int tx = threadIdx.x;
(o_d+tx)->x=(i_d+tx)->x;
(o_d+tx)->y=(i_d+tx)->y;
}