Performance problems (incoherent loads)

I was investigating poor performance in my application with the CUDA Profiler, and I got odd results for two (rather simple) kernels. The first one is quite fast and has only coherent reads and writes. The second one has a lot of incoherent reads and a large gap between GPU time and CPU time. All pointers are allocated with cudaMalloc.

// Element-wise update res[i] = alpha * res[i] + a[i], using the float2
// operator* and operator+ overloads that are in scope.
//
// Launch layout: 1D grid of 1D blocks, one thread per element; the element
// index is blockIdx.x * blockDim.x + threadIdx.x.
// There is no bounds check, so every index produced by the launch
// configuration must be valid for both res and a.
__global__ void Add1(float2* res,float2* a,float2 alpha)
{
	const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

	res[idx] = alpha * res[idx] + a[idx];
}

// Element-wise update res[i] = res[i] + alpha * a[i], using the float2
// operator* and operator+ overloads that are in scope.
//
// Launch layout: 1D grid of 1D blocks, one thread per element; the element
// index is blockIdx.x * blockDim.x + threadIdx.x.
// There is no bounds check, so every index produced by the launch
// configuration must be valid for both res and a.
__global__ void Add2(float2* res,float2* a,float2 alpha)
{
	const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

	res[idx] = res[idx] + alpha * a[idx];
}

Overloaded addition and multiplication operators:

// Adds two float2 values treated as (leading part, correction part) pairs:
// .x holds the leading float and .y a small correction term, in the style of
// float-float ("double-single") arithmetic.
//
// Steps:
//   1. headSum = a.x + b.x, and the rounding error of that addition is
//      recovered by the (b.x - e) + (a.x - (headSum - e)) expression.
//   2. The correction terms of BOTH operands, a.y and b.y, are added to that
//      error.
//   3. The pair is renormalised: s.x is the rounded total and s.y is what is
//      left over.
//
// Fix: the tail previously added b.y*b.y, which squared the right operand's
// correction term and never read a.y at all, so the left operand's low part
// was silently dropped from every sum.
inline __device__ __host__ float2 operator+(const float2& a,const float2& b){
	const float headSum = a.x + b.x;
	const float e = headSum - a.x;

	// Rounding error of a.x + b.x, followed by both low-order parts.
	const float tail = ((b.x - e) + (a.x - (headSum - e))) + a.y + b.y;

	float2 s;
	s.x = headSum + tail;
	s.y = tail - (s.x - headSum);
	return s;
}

// Breaks a single float into two floats returned as a float2:
//   .x = t - (t - a)  with t = a * splitvalue
//   .y = a - .x
// splitvalue is defined outside this block; presumably it is the usual
// splitting constant of the form 2^k + 1 -- TODO confirm at its definition.
inline __device__ __host__ float2 split(float a)
{
	const float scaled = a * splitvalue;
	const float high = scaled - (scaled - a);

	float2 parts;
	parts.x = high;
	parts.y = a - high;
	return parts;
}

// Multiplies two float2 values treated as (leading part, correction part)
// pairs.
//
// head is the plain float product a.x * b.x. tail accumulates, in this exact
// order, the partial products of the split halves of a.x and b.x minus head,
// followed by the cross terms a.x*b.y and a.y*b.x. The a.y*b.y term is not
// included. The pair (head, tail) is then renormalised into the result.
//
// The order of the additions below is deliberate and must not be rearranged:
// floating-point addition is not associative.
inline __device__ __host__ float2 operator*(const float2& a,const float2& b){
	const float2 aParts = split(a.x);
	const float2 bParts = split(b.x);

	const float head = a.x * b.x;

	float tail = aParts.x * bParts.x - head;
	tail = tail + aParts.x * bParts.y;
	tail = tail + aParts.y * bParts.x;
	tail = tail + aParts.y * bParts.y;
	tail = tail + a.x * b.y;
	tail = tail + a.y * b.x;

	float2 result;
	result.x = head + tail;
	result.y = (head - result.x) + tail;
	return result;
}