Y = M*X <==> q = A*p

//>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>//
__global__ void spars_matrix( float* A , int* ia , int* ja , float* p , float* q , 
                               int* hband , int* nn , int* size , int* module)
                                           //nn = dimension of matrix
                                           //size = (nn-module)/32 <=decomposed nn  
                                 to all multiprocessors to caculate "size" elements of q
                                                      
{
   __shared__ float sq[16];
   __shared__ int ssize;
   
   const int tid = threadIdx.x;
   const int bid = blockIdx.y;
   const int num_threads = blockDim.x;
   const int num_blocks = gridDim.y;

   ssize = *size;

   for(int i=tid ; i<ssize ; i+=num_threads)
   {
 	q[ i+ bid*ssize ] = A[ ia[i+bid*ssize] ] * p[i+bid*ssize]; // diagonal element
   } 

    __syncthreads();

         for(int i=bid*ssize ; i<(bid*ssize)+ssize ; i++)
	 { 
	 	sq[tid]=0.0;
		for(int j=ia[i]+tid+1 ; j<ia[i+1] ; j+=num_threads)
		{
			sq[tid] += A[j] * p[ja[j]]; //upper elements
			q[ja[j]] += A[j] * p[i]; //lower elements
		}

		__syncthreads();
		
		for(int j=(num_threads*0.5) ; j>=1 ; j*=0.5)
		{
		    if(tid<j)
		    {
			sq[tid] += sq[tid+j];
		    }
		 __syncthreads();
		}

		if(tid==0) 
		{
		    q[i] += sq[tid];
		}
	 }

	 if(bid==0 && tid==0)
	 {
		for(int i=(num_threads*ssize) ; i<num_blocks*ssize+(*module) ; i++)
		{ 
			for(int j=ia[i]+1 ; j<ia[i+1] ; j++)
			{
			    q[i] += A[j] * p[ja[j]];
			    q[ja[j]] += A[j] * p[i];
			}
		}
	 } //last elements
}
//<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<//

this code run on en8800 GTX gets 7.72ms(0.26Gflops).

compared the code run on the CPU gets 5.213ms(0.38 Gflops).

Why CPU runs fast than en8800?

In .cu file , I configured 32 multiprocessors to run on this multiplication 

and each multiprocessors had 16 threads.

In .cpp file , only 1 thread to run on this multiplication. 

I'm very confusing and can't to figure out why.

Anybody help me , very thanks.