Hi,

I implemted an CG Solver. The Solver is runing well, but the compute time isn’t so well. So I looked in the profiler an I saw that the matrix vector multiplication takes 94,5% of the hole time. So im trying to reduce this. My idea is to precache only these elements from x (Ax=y) witch I need to compute. But at the moment I have some problems to get the elemts in cache(in right position). Perphaps somebody has done something same and can help me. Here is my code:

```
//Matrix vector multiplication
__global__ void spmv_jd(const int num_rows,
const int* ptr,
const int* indices,
const computetype* data,
const computetype* x,
computetype* y
)
{
//Level 2 multiplication in shared memory and parallel reduction
__shared__ computetype cache_sum[ 256 ]; //cache with size of threads per block
//Level 3 precaching of elments from x (Ax=y)
__shared__ computetype cache_x [ 256 ]; //cache for elemts of input vector x
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int thread_id = idx + blockIdx.y * gridDim.x * blockDim.x; // global thread index
int warp_id = thread_id / 64; // global warp index
int lane = thread_id & (64 - 1); // thread index within the warp
//one warp per row
int row = warp_id ;
if ( row < num_rows )
{
int row_start = ptr [row ];
int row_end = ptr [ row +1];
//----added-------------------------------------------------
//here I want to fill the cache only with these elements from x, witch I need to compute later
cache_x [ threadIdx.x ] = 0;
if(row_start + threadIdx.x < row_end )
cache_x [ threadIdx.x ] = x[ indices [threadIdx.x] ];
__syncthreads();
//----added--------------------------------------------------
//compute running sum per thread
cache_sum [ threadIdx.x ] = 0;
for ( int j = row_start + lane ; j < row_end ; j += 64)
cache_sum [ threadIdx.x ] += data [j] * cache_x[j]; //cache [ threadIdx.x ] += data [j] * x[indices [j]];
__syncthreads();
//parallel reduction in shared memory
if ( lane < 32) cache_sum [ threadIdx.x ] += cache_sum [ threadIdx.x + 32];
if ( lane < 16) cache_sum [ threadIdx.x ] += cache_sum [ threadIdx.x + 16];
if ( lane < 8) cache_sum [ threadIdx.x ] += cache_sum [ threadIdx.x + 8];
if ( lane < 4) cache_sum [ threadIdx.x ] += cache_sum [ threadIdx.x + 4];
if ( lane < 2) cache_sum [ threadIdx.x ] += cache_sum [ threadIdx.x + 2];
if ( lane < 1) cache_sum [ threadIdx.x ] += cache_sum [ threadIdx.x + 1];
__syncthreads();
//first thread writes the result
if ( lane == 0)
y[ row ] = cache_sum [ threadIdx.x ];
}
}
```