Reorganizing Memory

Hi everybody

Im pretty new in programming using CUDA, and right now I want to create a program to multiply two sparse diagonals matrices.
The first aproach Im following is to reorganice the sparse matrices, and the do a normal matrix, matrix multiplication. Im using the matrix were the result will be stored like a temporal Matrix, and the save that Matrix in the original one with the new indexes. The first Matrix is copied without problem, but when trying to copy the second I just wrong values :wacko:

I hope you can help me, and sorry because of my bad english

global void
SparseMMul( float* C, float* A, float* dA, int hA, int wA, int ndA,
float* B, float* dB, int hB, int wB, int ndB)
{

int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;

int rowa = (row * wB + col) / ndA;
int cola = (row * wB + col) % ndA;

int rowaf;
int colaf;
int rowbf;
int colbf;

if (hA >= wA){
	rowaf = rowa - dA[cola];
	colaf = rowa;
}
else{
	colaf = rowaf + dA[cola];
	rowaf = rowa;
}

int rowb = (row * wB + col) / ndB;
int colb = (row * wB + col) % ndB;

if (hB >= wB){
	rowbf = rowb - dB[colb];
	colbf = rowb;
}
else{
	colbf = rowbf + dB[colb];
	rowbf = rowb;
}


if (rowaf < hA && colaf < wA && rowaf >= 0 && colaf >=0 )// &&
{
	C[rowaf * wB + colaf] =  A[row * wB + col];
}

	A[row * wB + col] = 0;
	A[row * wB + col] = C[row * wB + col];

if (rowbf < hB && colbf < wB && rowbf >= 0 && colbf >=0 )// &&
{
	C[rowbf * wB + colbf] = B[row * wB + col];
}

	B[row * wA + col] = 0;
    B[row * wB + col] = C[row * wB + col];

float Cvalue = 0;
for (int e = 0; e < wA; e++){

  Cvalue += A[row * wA + e]
			*B[e * wB + col];
}
	C[row * wB + col] = Cvalue;

}