Matrix-vector multiply problem using scalarProd

I want to multiply an NxN matrix by a vector. As a beginner, I modified the example scalarProdGPU kernel from the CUDA SDK as follows (the comments have been removed for clarity):

d_C is used to store the result of the multiplication.

d_D is used to store the result of the transpose of d_A multiplied by the vector d_B (d_D is initialized to zero).

d_A is the matrix.

d_B is the vector.
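To make the expected results explicit, here is a minimal host-side reference of what I mean. This is only a sketch: it assumes d_A is stored row-major as an N x N matrix, the function name is just for illustration, and I am reading "transpose of d_A multiplied by d_B" literally as D = A^T * B:

// Host reference for the intended results (sketch only; assumes row-major
// storage of the N x N matrix A).
void matVecReference(const float *A, const float *B, float *C, float *D, int N)
{
    for (int col = 0; col < N; ++col)
        D[col] = 0.0f;

    for (int row = 0; row < N; ++row) {
        float sum = 0.0f;
        for (int col = 0; col < N; ++col) {
            sum    += A[row * N + col] * B[col];   // C = A * B
            D[col] += A[row * N + col] * B[row];   // D = A^T * B (my reading of the intent)
        }
        C[row] = sum;
    }
}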

The result shows that d_C is OK, but d_D is not correct. Are there any suggestions?

Thanks a lot.

#define IMUL(a, b) __mul24(a, b)
#define ACCUM_N 1024

__global__ void scalarProdGPU(
    float *d_C,
    float *d_A,
    float *d_B,
    float *d_D,
    int vectorN,
    int elementN
){
    //Accumulators cache
    __shared__ float accumResult[ACCUM_N];

    for(int vec = blockIdx.x; vec < vectorN; vec += gridDim.x){
        int vectorBase = IMUL(elementN, vec);
        int vectorEnd  = vectorBase + elementN;

        for(int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x){
            float sum = 0;

            for(int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N){
                ////////////////////////////////////////////////////////////////
                // modified code:
                float tmp = d_A[pos] * d_B[pos - vectorBase];
                d_D[pos - vectorBase] += tmp; // add every row's tmp result
                sum += tmp;
                ////////////////////////////////////////////////////////////////
            }

            accumResult[iAccum] = sum;
        }

        for(int stride = ACCUM_N / 2; stride > 0; stride >>= 1){
            __syncthreads();

            for(int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
                accumResult[iAccum] += accumResult[stride + iAccum];
        }

        if(threadIdx.x == 0) d_C[vec] = accumResult[0];
    }
}
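For context, this is roughly how I call the kernel from the host. It is modelled on the SDK sample; the 128/256 launch configuration and the host variable names are only assumptions for illustration, and here vectorN = elementN = N:

// Host-side call (sketch; launch configuration and names are assumed).
cudaMemset(d_D, 0, N * sizeof(float));                  // d_D must start at zero
scalarProdGPU<<<128, 256>>>(d_C, d_A, d_B, d_D, N, N);
cudaDeviceSynchronize();
cudaMemcpy(h_C, d_C, N * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(h_D, d_D, N * sizeof(float), cudaMemcpyDeviceToHost);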

I find that

d_D[pos-vectorBase] += tmp;

is executed only once for each element of d_D, not vectorN times.

But I don’t know why.
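Could it be that different blocks (working on different vec values) update the same d_D elements at the same time, so most of the additions are lost? If that is the cause, would an atomic update be the right fix? A sketch of what I mean, assuming the card supports atomicAdd on float (compute capability 2.0 or newer):

// Same inner-loop body, but with the plain += replaced by an atomic update
// so that concurrent additions from different blocks are not lost.
float tmp = d_A[pos] * d_B[pos - vectorBase];
atomicAdd(&d_D[pos - vectorBase], tmp);
sum += tmp;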