Problem: I wanna implement my own sparse matrix-vector multiplication, the sparse matrix is stored in CSR format.

Here’s the CPU version code: (A is the value vector, IA is the row pointer vector and JA is the column index vector)

void cpu_AX(double *A,int *IA,int *JA,double *x,double *w,int N)

{

int i,j,k1,k2;

for (i=0;i<N;i++) {

w[i] = 0.0;

k1 = IA[i];

k2 = IA[i+1];

for (j=k1;j<k2;j++) w[i] += A[j]*x[JA[j]];

}

}

I wrote a piece of code trying to adopt shared memory for faster calculation, here’s my code:

**device** double AX_device(double *Ak,int *JA,double *xk, int startpos, int endpos)

{

extern **shared** double sA;

extern **shared** double sX;

**shared** double res;

**shared** int i;

res = 0;

for (i = startpos; i < endpos; i++)

{

sA[i - startpos] = Ak[i];

sX[i - startpos] = xk[JA[i]];

```
res += sA[i]*sX[i];
```

}

return res;

}

**global** void AX_kernel(double *A,int *IA,int *JA,double *xk,double wk)
{
shared int sp,ep;
sp = IA[blockIdx.xblockDim.x+threadIdx.x];
ep = IA[blockIdx.x*blockDim.x+threadIdx.x + 1];

```
wk[blockIdx.x*blockDim.x+threadIdx.x] = AX_device(A,JA,xk,sp,ep);
__syncthreads();
```

}

void cuda_AX(double *A,int *IA,int *JA,double *x,double *w)
{
AX_kernel<<<grid_dim,block_dim,block_dim.x*sizeof(double)>>>(A,IA,JA,x,w);

}

However my results are wrong, any suggestions about where did I do wrong and how to correct it?