Hello,
I’m going absolutely crazy with this problem. First of all, here is my code:
Kernel :
/**
 * Kernel entry point: builds a per-thread QNN from the topology description,
 * runs the forward pass, then runs the output-error step.
 *
 * Launch requirements visible from the code this kernel reaches:
 *  - output_error() ends in a block reduction that uses
 *    `extern __shared__ Quaternion x_shared[]` indexed by threadIdx.x, so the
 *    launch must supply at least blockDim.x * sizeof(Quaternion) bytes of
 *    dynamic shared memory (third <<<...>>> argument).
 *  - That reduction writes one partial result per block and a second pass then
 *    reads those partials. __syncthreads() does not order different blocks, so
 *    a multi-block launch needs a grid-wide barrier (or a second kernel launch)
 *    between the two passes to give deterministic results.
 *
 * @param dW, dX, dB   device buffers forwarded to QNN::feed_forward
 * @param dOutput      device buffer written by feed_forward and read by output_error
 * @param dGradient    device buffer receiving the per-block partial sums
 * @param dTarget      device buffer forwarded to QNN::output_error
 * @param dSum         device buffer receiving the result of the second reduction
 * @param topology, size, weightSize  forwarded unchanged to the QNN constructor
 */
__global__ void train_network(Quaternion * __restrict__ dW, Quaternion * __restrict__ dX, Quaternion *dB, Quaternion *dOutput, Quaternion *dGradient, Quaternion *dTarget, Quaternion *dSum, int *topology, int size, int weightSize){

    // Each thread constructs its own QNN object (a local variable).
    QNN qnn(topology, size, weightSize);
    qnn.feed_forward(dW,dX,dB,dOutput);
    qnn.output_error(dOutput,dTarget,dGradient, dSum);
}
output_error :
#include <cooperative_groups.h>  // grid_group: barrier across all blocks of the launch

/**
 * Two-pass sum reduction: dOutput -> per-block partials in dGradient -> dSum.
 *
 * Pass 1 makes every block write its partial sum to dGradient[blockIdx.x].
 * Pass 2 reads dGradient again, including entries produced by OTHER blocks.
 * __syncthreads() only synchronizes the threads of one block, so using it as
 * the barrier between the passes lets a block start pass 2 before the other
 * blocks have stored their partials; the result then depends on scheduling
 * (and adding a printf changes the timing enough to hide the race).
 *
 * The barrier between the passes is therefore grid-wide when the kernel was
 * started with cudaLaunchCooperativeKernel. For an ordinary <<<...>>> launch
 * the grid group cannot synchronize, and the function falls back to the
 * block-level barrier, which is only sufficient for a single-block launch.
 * NOTE(review): older toolkits may need relocatable device code (-rdc=true)
 * to use grid groups - confirm for the toolkit in use.
 *
 * NOTE(review): pass 2 loads dGradient[gid] for every gid < topology, but
 * pass 1 only writes the first gridDim.x entries; the remaining entries must
 * hold zeros (or the bound must be gridDim.x) for dSum to be the plain total -
 * confirm against the host-side initialisation.
 *
 * @param dOutput   values to be summed (input of pass 1)
 * @param dTarget   currently unused by this function
 * @param dGradient receives one partial sum per block; input of pass 2
 * @param dSum      receives the per-block results of pass 2
 */
__device__ void QNN::output_error(Quaternion * dOutput, Quaternion * dTarget, Quaternion *dGradient, Quaternion *dSum){

    //extern __shared__ Quaternion test[];

    //exponential(dOutput);

    // Pass 1: each block reduces its slice into dGradient[blockIdx.x].
    exponential_sum_reduction(dOutput,dGradient);

    // Wait until ALL blocks have stored their partials. is_valid() is the
    // same for every thread of the launch, so this branch is not divergent.
    cooperative_groups::grid_group grid = cooperative_groups::this_grid();
    if (grid.is_valid()){
        grid.sync();
    } else {
        __syncthreads();   // legacy behaviour: correct only when gridDim.x == 1
    }

    // Pass 2: reduce the partials.
    exponential_sum_reduction(dGradient,dSum);
}
and the reduction :
/**
 * Block-level sum reduction in shared memory.
 *
 * Every thread loads dIn[gid] (or a zero quaternion when gid is not below
 * `topology`) into its own shared-memory slot, the block then sums the slots
 * with a halving tree, and thread 0 stores the block total in
 * dOut[blockIdx.x].
 *
 * Requirements:
 *  - must be called by all threads of the block (it contains __syncthreads());
 *  - the launch must provide at least blockDim.x * sizeof(Quaternion) bytes of
 *    dynamic shared memory for x_shared;
 *  - `topology` is not declared in this function; presumably a QNN member
 *    holding the number of valid input elements - confirm.
 *
 * Works for any blockDim.x: the tree starts from the next power of two and
 * skips partners that fall outside the block, so no element is dropped when
 * the block size is not a power of two.
 *
 * @param dIn  values to reduce, indexed by global thread id
 * @param dOut receives one partial sum per block at index blockIdx.x
 */
__device__ void QNN::exponential_sum_reduction(Quaternion * dIn, Quaternion * dOut){

    extern __shared__ Quaternion x_shared[];

    unsigned int gid = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int tid = threadIdx.x;
    const Quaternion empty(0.0f,0.0f,0.0f,0.0f);

    // Threads past the end of the input contribute the additive identity.
    x_shared[tid] = empty;
    if (gid < topology){
        x_shared[tid] = dIn[gid];

        //x_shared[tid].exponential_equal();
    }

    // All slots must be filled before any thread reads a neighbour's slot.
    __syncthreads();

    // Smallest power of two >= blockDim.x, so the halving tree covers every slot.
    unsigned int span = 1;
    while (span < blockDim.x){
        span <<= 1;
    }

    for (unsigned int i = span >> 1; i > 0; i >>= 1){

        // The second test skips partners beyond the block when blockDim.x is
        // not a power of two; it is always true for power-of-two block sizes.
        if (tid < i && tid + i < blockDim.x){

            //printf("Thread %d, got %d\n", i, blockIdx.x);
            x_shared[tid] += x_shared[tid + i];
        }
        // Outside the branch: every thread of the block must reach the barrier.
        __syncthreads();
    }

    if (tid == 0){

        //printf("bef Thread %d, got %f\n", blockIdx.x, dOut[blockIdx.x].q.x);
        dOut[blockIdx.x] = x_shared[0];
        //printf("af Thread %d, got %f\n", blockIdx.x, dOut[blockIdx.x].q.x);
    }
}
My problem is that when I use printf, everything is OK. But when I just run the app without memcheck or printf to get the result, I get inconsistent results. Sometimes it’s the correct result, sometimes not… I suspect a shared-memory problem, but how? And why? If I put only ONE printf in the code, it works… Interesting thing: if I only do one reduction, the result is OK and consistent… The problem only occurs with the second reduction.
I really don’t know HOW to track down my mistake, because I can’t check my variables when the error happens…
Thanks a lot