I am implementing matrix-vector multiplication. After the products of the matrix and vector elements are stored in shared memory, I need to reduce them to get the sum. Here is the reduction code, which uses a warp-level (warp-synchronous) reduction for the last steps:
// mv_kernel: for each i in [0, Num_Basis), every thread forms the product
// A[tx + i*Num_Basis] * share[tx] in shared memory, the block reduces those
// products with a tree reduction, and thread 0 stores the sum in b[i].
//
// Both shared arrays hold 300 doubles and are indexed by threadIdx.x, so the
// block must not have more than 300 threads. The reduction reads entries up to
// temp[255] unconditionally, so it presumably expects at least 256 threads
// (300 in the original post? -- TODO confirm against the launch configuration).
//
// NOTE(review): the <u>...</u> tags below are HTML highlight markup from the
// original post, not CUDA; they mark the lines that differ from the second
// version shown later in the post.
__global__ void mv_kernel(double *A, double *x, double *b,int Num_Basis)
{
__shared__ double share[300];
__shared__ double temp[300];
unsigned int tx = threadIdx.x;
unsigned int bx = blockIdx.x;
share[tx] = 0.0;
......//data of vector stored into shared memory
for (int i = 0;i < Num_Basis;++i) {
// This zero store is immediately overwritten by the next statement.
temp[tx] = 0.0;
temp[tx] = A[tx + i*Num_Basis] * share[tx];
// Make every thread's product visible before the reduction reads it.
__syncthreads();
// Warp reduction
// In each of the next three lines the __syncthreads() is a separate statement
// after the `if`, so every thread in the block executes the barrier.
// Fold the tail entries (index >= 256) onto the front of the array.
if (tx >= 256) temp[tx-256] += temp[tx]; __syncthreads();
if (tx <128) temp[tx] += temp[tx + 128]; __syncthreads();
if (tx <64) temp[tx] += temp[tx + 64]; __syncthreads();
// NOTE(review): likely cause of the unstable results. Threads tx < 32 read
// temp[tx + 32], i.e. entries 32..63. Those entries are rewritten by threads
// 32..63 at the top of the next loop iteration, and there is no barrier
// between this read and that write. Threads 32..63 skip this `if`, so
// (assuming 32-thread warps) they belong to a different warp and may already
// have started iteration i+1 and overwritten temp[32..63] before the read
// below happens. Whether that occurs depends on warp scheduling, which would
// explain results that change from run to run. A __syncthreads() at the end
// of the loop body (outside this `if`) would close the race.
<u>if (tx < 32)</u> {
// `volatile` forces each access to go to shared memory, but it is not a
// barrier: the steps below additionally assume that all 32 threads execute in
// lockstep. On GPUs with independent thread scheduling (Volta and later) that
// assumption does not hold; __syncwarp() between steps or *_sync shuffle
// intrinsics would be needed there.
volatile double *wsSum = temp;
<u>wsSum[tx] += wsSum[tx + 32];</u>
wsSum[tx] += wsSum[tx + 16];
wsSum[tx] += wsSum[tx + 8];
wsSum[tx] += wsSum[tx + 4];
wsSum[tx] += wsSum[tx + 2];
wsSum[tx] += wsSum[tx + 1];
// Thread 0 now holds the block-wide sum for row i.
if (tx == 0) {
b[i] = wsSum[tx];
}
}
}
}
I get correct and stable results when I run this code in a simple matrix-vector multiplication project. But when I run the same code in the actual project I need it for, the results are wrong and unstable; that is, every run produces a different result. However, after I changed the code like this:
// Modified reduction (replaces the reduction steps inside the kernel's loop).
// Each __syncthreads() below is a separate statement after its `if`, so every
// thread in the block executes it.
if (tx >= 256) temp[tx-256] += temp[tx]; __syncthreads();
if (tx <128) temp[tx] += temp[tx + 128]; __syncthreads();
if (tx <64) temp[tx] += temp[tx + 64]; __syncthreads();
// The step that reads temp[tx + 32] (entries 32..63, which threads 32..63
// rewrite at the top of the next loop iteration) is now followed by a
// block-wide barrier, so no thread can start the next iteration until these
// reads have completed.
<u>if (tx <32) temp[tx] += temp[tx + 32]; __syncthreads();</u>
// From here on only temp[0..31] is read. In the surrounding kernel, threads
// outside 0..31 touch those entries only after the next iteration's first
// __syncthreads(), which threads 0..15 reach only after finishing this block.
// That is why this variant no longer races with the next iteration.
<u>if (tx < 16)</u> {
// NOTE(review): these steps still rely on the active threads running in
// lockstep with only `volatile` for visibility. On GPUs with independent
// thread scheduling (Volta and later) that is not guaranteed; use
// __syncwarp() between steps or *_sync shuffle intrinsics there.
volatile double *wsSum = temp;
wsSum[tx] += wsSum[tx + 16];
wsSum[tx] += wsSum[tx + 8];
wsSum[tx] += wsSum[tx + 4];
wsSum[tx] += wsSum[tx + 2];
wsSum[tx] += wsSum[tx + 1];
// Thread 0 holds the final sum for row i.
if (tx == 0) {
b[i] = wsSum[tx];
}
}
I got the correct and stable result.
Why?
According to Wilt, N., The CUDA Handbook: A Comprehensive Guide to GPU Programming, 2013, 44(6):147-153, we can rely on warp synchronization to omit __syncthreads() once the number of active threads is <= 32.
But in the actual project I need it for, the result is correct only when the warp-synchronous part is restricted to num_thread <= 16.