matrix-vector mult in CSR format

Problem: I wanna implement my own sparse matrix-vector multiplication, the sparse matrix is stored in CSR format.

Here’s the CPU version code: (A is the value vector, IA is the row pointer vector and JA is the column index vector)

void cpu_AX(double *A,int *IA,int *JA,double *x,double *w,int N)
{
int i,j,k1,k2;

for (i=0;i<N;i++) {
w[i] = 0.0;
k1 = IA[i];
k2 = IA[i+1];
for (j=k1;j<k2;j++) w[i] += A[j]*x[JA[j]];
}
}


I wrote a piece of code trying to adopt shared memory for faster calculation, here’s my code:
device double AX_device(double *Ak,int *JA,double *xk, int startpos, int endpos)
{

extern shared double sA;
extern shared double sX;
shared double res;
shared int i;

res = 0;
for (i = startpos; i < endpos; i++)
{
sA[i - startpos] = Ak[i];
sX[i - startpos] = xk[JA[i]];

res += sA[i]*sX[i];

}

return res;

}

global void AX_kernel(double *A,int *IA,int *JA,double xk,double wk)
{
shared int sp,ep;
sp = IA[blockIdx.x
blockDim.x+threadIdx.x];
ep = IA[blockIdx.x
blockDim.x+threadIdx.x + 1];

wk[blockIdx.x*blockDim.x+threadIdx.x] = AX_device(A,JA,xk,sp,ep);
__syncthreads();

}

void cuda_AX(double *A,int *IA,int *JA,double *x,double w)
{
AX_kernel<<<grid_dim,block_dim,block_dim.x
sizeof(double)>>>(A,IA,JA,x,w);

}

However my results are wrong, any suggestions about where did I do wrong and how to correct it?

Problem: I wanna implement my own sparse matrix-vector multiplication, the sparse matrix is stored in CSR format.

Here’s the CPU version code: (A is the value vector, IA is the row pointer vector and JA is the column index vector)

void cpu_AX(double *A,int *IA,int *JA,double *x,double *w,int N)
{
int i,j,k1,k2;

for (i=0;i<N;i++) {
w[i] = 0.0;
k1 = IA[i];
k2 = IA[i+1];
for (j=k1;j<k2;j++) w[i] += A[j]*x[JA[j]];
}
}


I wrote a piece of code trying to adopt shared memory for faster calculation, here’s my code:
device double AX_device(double *Ak,int *JA,double *xk, int startpos, int endpos)
{

extern shared double sA;
extern shared double sX;
shared double res;
shared int i;

res = 0;
for (i = startpos; i < endpos; i++)
{
sA[i - startpos] = Ak[i];
sX[i - startpos] = xk[JA[i]];

res += sA[i]*sX[i];

}

return res;

}

global void AX_kernel(double *A,int *IA,int *JA,double xk,double wk)
{
shared int sp,ep;
sp = IA[blockIdx.x
blockDim.x+threadIdx.x];
ep = IA[blockIdx.x
blockDim.x+threadIdx.x + 1];

wk[blockIdx.x*blockDim.x+threadIdx.x] = AX_device(A,JA,xk,sp,ep);
__syncthreads();

}

void cuda_AX(double *A,int *IA,int *JA,double *x,double w)
{
AX_kernel<<<grid_dim,block_dim,block_dim.x
sizeof(double)>>>(A,IA,JA,x,w);

}

However my results are wrong, any suggestions about where did I do wrong and how to correct it?