It is definitely not the case that exp() is never working (also in my code no problems in other places), it is a combination with something else.
This is the whole kernel routine that still fails, when I take the maximum amount of other code out. I noticed that if I take the condition out of the atomicAdd (if (isword0)), it no longer fails. Since d_probs is initialized with zero, it would suggest that the condition is not satisfied and this causes the zero output and not the exp. However, I can leave the condition in and replace prob1=1.0/exp(0.0f) by prob1=1.0f and numbers do show up in d_probs. This means the condition is met. Failure requires both the condition to be there and the exp function.
__global__ void perplexity_lbl(int loops, float* d_predRw, size_t d_predRwp, float *d_qk, size_t d_qkp, unsigned int* d_qki, float* d_probs, unsigned int* d_wordnumbermatrix, unsigned int mspaceplus1, unsigned int num_leaf_cells, unsigned int* d_leaf_cell_table, unsigned int num_words)
{ unsigned int index;
unsigned int qvector;
unsigned int word0;
unsigned int word1;
bool one_branche;
unsigned int prev_qvec;
unsigned int right_word;
float prob0, prob1, prob;
bool isword0, isword1;
const unsigned int word_num = blockIdx.y;
isword0=false;
isword1=false;
right_word=d_wordnumbermatrix[word_num];
__syncthreads();
for (int i=0;i<loops;++i) {
index=(blockIdx.x * loops + i) * blockDim.x + threadIdx.x; //0..num_leaf_cells-1
if (index<num_leaf_cells) {//for all q nodes with leafs in tree
qvector=d_leaf_cell_table[index];
prev_qvec=d_qki[3*qvector];
word0=d_qki[3*qvector+1];
word1=d_qki[3*qvector+2];
if ((word0>0x80000000) && ((word0 & 0x7fffffff)==right_word)) {
isword0=true;
}
if ((word1>0x80000000) && ((word1 & 0x7fffffff)==right_word)) {
isword1=true;
}
if (isword0 || isword1) {
prob1=1.0f/(1.0f+exp(0.0f));
if (isword0) {
atomicAdd(&d_probs[word_num], prob1);
}
if (isword1) {
atomicAdd(&d_probs[word_num], prob1);
}
}
}
}
}
The kernel routine is called like this
unsigned int lpw;
cudaError_t err;
blcks=min((int)ceil((num_leaf_cells)/(float)(THREADS)), MAXBLOCKS);
dim3 dimBlock(THREADS,1,1);
dim3 dimGrid(blcks,1024,1);
lpw=(int)ceil(num_leaf_cells/float(THREADS*blcks));
printf("num_leaf_cells=%d\n", num_leaf_cells);
cudaMemset(d_probs, 0, 1024*sizeof(d_probs[0]));
err=cudaGetLastError();
if (err!=0) printf("cuda error memset %s\n", cudaGetErrorString(err));
perplexity_lbl<<<dimGrid, dimBlock>>>(lpw, d_predRw, d_predRwp, d_qk, d_qkp, d_qki, d_probs, d_wordnumbermatrix, mspaceplus1, num_leaf_cells, d_leaf_cell_table, last_word-first_word);
err=cudaGetLastError();
if (err!=0) printf("cuda error perplexity %s\n", cudaGetErrorString(err));
cutilSafeCall(cudaThreadSynchronize());
err=cudaGetLastError();
if (err!=0) printf("cuda error perplexity%s\n", cudaGetErrorString(err));
cutilSafeCall(cudaMemcpy(h_probs, d_probs, 1024*sizeof(float), cudaMemcpyDeviceToHost));
unfortunately to really run it independently, you need a lot of arrays to be intialized.