Y = M*X <==> q = A*p //>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>// __global__ void spars_matrix( float* A , int* ia , int* ja , float* p , float* q , int* hband , int* nn , int* size , int* module) //nn = dimension of matrix //size = (nn-module)/32 <=decomposed nn to all multiprocessors to caculate "size" elements of q { __shared__ float sq[16]; __shared__ int ssize; const int tid = threadIdx.x; const int bid = blockIdx.y; const int num_threads = blockDim.x; const int num_blocks = gridDim.y; ssize = *size; for(int i=tid ; i=1 ; j*=0.5) { if(tid