I have been investigating poor performance in my application by analyzing it with the CUDA Profiler, and I get odd results for two (rather simple) functions. The first one is quite fast and shows only coherent reads and writes. The second one shows a lot of incoherent reads and a large gap between GPU time and CPU time. All pointers are allocated with cudaMalloc.

```
/**
 * Per-thread update: res[i] = alpha * res[i] + a[i].
 *
 * The `*` and `+` applied to float2 here rely on user-provided operator
 * overloads being in scope; the arithmetic is whatever those overloads define.
 *
 * Indexing uses only the x dimension: i = blockIdx.x * blockDim.x + threadIdx.x.
 * There is no bounds guard, so every launched thread reads and writes its
 * element; the launch configuration must not produce indices past the end of
 * either array.
 *
 * @param res   array that is read, scaled by alpha, and overwritten
 * @param a     array whose element is added to the scaled value
 * @param alpha factor multiplied with each element of res
 */
__global__ void Add1(float2* res,float2* a,float2 alpha)
{
    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Scale the current result first, then add the matching element of a.
    const float2 scaled = alpha * res[idx];
    res[idx] = scaled + a[idx];
}
/**
 * Per-thread update: res[i] = res[i] + alpha * a[i].
 *
 * The `*` and `+` applied to float2 here rely on user-provided operator
 * overloads being in scope; the arithmetic is whatever those overloads define.
 *
 * Indexing uses only the x dimension: i = blockIdx.x * blockDim.x + threadIdx.x.
 * There is no bounds guard, so every launched thread reads and writes its
 * element; the launch configuration must not produce indices past the end of
 * either array.
 *
 * @param res   array that is read, incremented, and overwritten
 * @param a     array whose element is scaled by alpha before being added
 * @param alpha factor multiplied with each element of a
 */
__global__ void Add2(float2* res,float2* a,float2 alpha)
{
    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Scale the element of a first, then accumulate it onto the result.
    const float2 scaled = alpha * a[idx];
    res[idx] = res[idx] + scaled;
}
```

The overloaded addition and multiplication operators for float2 (and the split helper they use) are defined as follows:

```
/**
 * Adds two values stored as a pair of floats, where .x is the leading
 * component and .y is a trailing low-order correction.
 *
 * The leading components are added first and the rounding error of that
 * addition is recovered; the error is then combined with the trailing
 * components of BOTH operands and folded back into a (sum, remainder) pair.
 *
 * @param a first operand  (.x leading part, .y trailing part)
 * @param b second operand (.x leading part, .y trailing part)
 * @return  pair whose .x is the rounded sum and whose .y is the remainder left
 *          after that rounding
 */
inline __device__ __host__ float2 operator+(const float2& a,const float2& b){
    // Sum of the leading components; e is the part of b.x that made it into t1.
    float t1 = a.x + b.x;
    float e = t1 - a.x;

    // Rounding error of t1, plus the trailing components of both operands.
    // Both a.y and b.y must be added here: leaving out a.y would silently
    // discard the low-order part of a.
    float t2 = ((b.x - e) + (a.x - (t1 - e))) + a.y + b.y;

    // Renormalize so that .x carries the rounded sum and .y what is left over.
    float2 s;
    s.x = t1 + t2;
    s.y = t2 - (s.x - t1);
    return s;
}
/**
 * Breaks a float into two floats returned as a float2.
 *
 * With t = a * splitvalue, the result is
 *   .x = t - (t - a)
 *   .y = a - .x
 *
 * `splitvalue` is defined outside this function.
 * NOTE(review): this looks like a Veltkamp-style split, in which case
 * splitvalue would be the usual splitting constant for float - confirm.
 *
 * The grouping of the subtractions is significant for the floating-point
 * result and must not be algebraically simplified.
 *
 * @param a value to split
 * @return  float2 holding the two parts (.x computed first, .y = a - .x)
 */
inline __device__ __host__ float2 split(float a)
{
    const float scaled = a * splitvalue;
    const float excess = scaled - a;

    float2 parts;
    parts.x = scaled - excess;
    parts.y = a - parts.x;
    return parts;
}
/**
 * Multiplies two values stored as a pair of floats (.x leading component,
 * .y trailing component).
 *
 * Steps:
 *   1. split() both leading components into two parts each.
 *   2. head = a.x * b.x (the rounded product of the leading components).
 *   3. tail = correction built from the partial products of the split parts
 *      minus head, followed by the cross terms a.x*b.y and a.y*b.x.
 *      The a.y*b.y term is not included.
 *   4. Combine head and tail into a (sum, remainder) pair.
 *
 * The correction is accumulated strictly left to right; the order of the
 * additions affects the floating-point result and must be kept.
 *
 * @param a first factor
 * @param b second factor
 * @return  pair whose .x is head + tail and whose .y is (head - .x) + tail
 */
inline __device__ __host__ float2 operator*(const float2& a,const float2& b){
    const float2 aParts = split(a.x);
    const float2 bParts = split(b.x);

    // Rounded product of the leading components.
    const float head = a.x * b.x;

    // Correction term, accumulated one partial product at a time.
    float tail = aParts.x * bParts.x - head;
    tail = tail + aParts.x * bParts.y;
    tail = tail + aParts.y * bParts.x;
    tail = tail + aParts.y * bParts.y;
    // Cross terms involving the trailing components of the operands.
    tail = tail + a.x * b.y;
    tail = tail + a.y * b.x;

    float2 result;
    result.x = head + tail;
    result.y = (head - result.x) + tail;
    return result;
}
```