In a typical sparse/banded implementation on the GPU, you would launch 1 thread per row to compute the dot product, which is completely memory-bandwidth bound. That probably means 700 threads for your 700x700 case, which is very small. The sort of sparse systems I solve using Krylov subspace methods on GPUs have several million rows. I suspect your problems are just far too small for the GPU to perform well.

Do something like this:

volatile unsigned int tidx = threadIdx.x + threadIdx.y*blockDim.x;

volatile unsigned int gridsize = gridDim.x * blockDim.x;

float sum = 0.;

for (int i=tidx; i<N; i+=gridsize){

	rvhat = v_hat[i];

	svold = v[i];

	v[i] = rvhat/beta;

	rvhat = Av[i] - alpha*v[i] - beta* svold;

	v_hat[i]=rvhat; //COMPUTE V_HAT

	sum += rvhat *rvhat;


svhat[tid] = sum; __syncthreads();

// Reduction follows.....

You need to add the shared memory size to the kernel launch statement. So a kernel launch looks something like this (using the Linfty norm example again), where shmsize is the dynamic shared memory per block:

template <typename ValueType> ValueType linfty_norm(const thrust::device_vector<ValueType> & d_x)


	dim3 nblocks(112); // GTX 470 has 14 MP

	dim3 blocksize(96); // 3 warps per block

	size_t shmsize = blocksize.x * sizeof(ValueType);

	thrust::device_vector<ValueType> d_pnorms(nblocks.x);

	linfty_norm_kernel<ValueType> <<< nblocks, blocksize, shmsize>>> 

				 ( (unsigned int)d_x.size(), thrust::raw_pointer_cast(&d_x[0]), thrust::raw_pointer_cast(&d_pnorms[0]) ); 

	thrust::host_vector<ValueType> pnorms = d_pnorms;

	ValueType norm = pnorms[0];

	for(int i=1; i<blocksize.x; i++) { norm = fmax( pnorms[i], norm ); }

	return norm;