Understanding CUDA Kernel execution

I am learning CUDA and I tried the following kernel code.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuPrintf.cu"
#include <stdio.h>

__device__ void func(float &temp,float* dum)
{
    float a=5;
dum[threadIdx.x]=temp+a;
cuPrintf("%f\n",dum[threadIdx.x]);
return;
}
__global__ void kernel(float* d_in, float* d_out)
{
 int tid=(blockIdx.x*blockDim.x)+threadIdx.x;
 float temp=d_in[tid];
 float dum[9];
 func(temp,dum);
 cuPrintf("dum %f\n",dum[threadIdx.x]);
 atomicAdd(&d_out[tid],dum[tid]);
 //d_out[tid]+=dum[tid];
 cuPrintf("d_out %f\n",d_out[threadIdx.x]);
 }

 int main()
 {
    int i;
    cudaError_t cudastatus;
    float in[9]={1,2,3,4,5,6,7,8,9};
    float* h_in=in;
    float* d_in={0};
    cudastatus=cudaMalloc((void**)&d_in,9*sizeof(float));
    if (cudastatus != cudaSuccess) {
         fprintf(stderr, "cm0 fail %s\n", cudaGetErrorString(cudastatus));
    }
    cudastatus=cudaMemcpy(d_in,h_in,9*sizeof(float),cudaMemcpyHostToDevice);
    if (cudastatus != cudaSuccess) {
         fprintf(stderr, "cm1 fail %s\n", cudaGetErrorString(cudastatus));
    }
    float* d_out={0};
    cudastatus=cudaMalloc((void**)&d_out,9*sizeof(float));
    if (cudastatus != cudaSuccess) {
         fprintf(stderr, "cm2 fail %s\n", cudaGetErrorString(cudastatus));
    }
    cudaMemset(d_out, 0, 9*sizeof(float));
    float out[9]={0};
    cudaPrintfInit();
    kernel<<<3,3>>>(d_in,d_out);
    cudaDeviceSynchronize();
    cudaPrintfDisplay(stdout,true);
    cudaPrintfEnd();
    cudastatus = cudaGetLastError();
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudastatus));
    }
    cudastatus=cudaMemcpy(out,d_out,9*sizeof(float),cudaMemcpyDeviceToHost);
    if (cudastatus != cudaSuccess) {
         fprintf(stderr, "cm3 fail %s\n", cudaGetErrorString(cudastatus));
    }
    for(i=0;i<9;i++)
    {
        printf("%f\n",out[i]);
    }
    getchar();
    return 0;
}

While I don’t get any error in cuda-memcheck and through the cudaError_t checks, the printf gives output inside the device function and the one in the global function are different. (I have tried both simple addition and atomic add operation).

What is the reason for this difference and how to correct this?

Also, how do we make sure that the device function is called for each value of temp?

Please help me sort this out. Thanks in advance.