Why Do I Always Get Zero From My Code

As the title mentioned, I always get “0.000000” from the following code.
Any idea?

include <stdlib.h>

include <stdio.h>

include <time.h>

include <float.h>

include <cuda.h>

include <cuda_runtime.h>

int main ( void );

global void complexCal(float *d_temp, float num1, float num2, float num3, float num4);

int main ( void )
{
float num1 = 1.0;
float num2 = 1.0;
float num3 = 1.0;
float num4 = 1.0;

float *h_temp = NULL;
float *d_temp = NULL;

int numThreads = 3;

h_temp = (float*)malloc(numThreads);
cudaMalloc(((void**)&d_temp), numThreads);

cudaMemcpy(d_temp, h_temp, numThreads, cudaMemcpyHostToDevice);

printf(“CPU result: %12.6f \n”, num1 * num2 + num3 * num4);

complexCal<<<1, numThreads>>>(d_temp, num1, num2, num3, num4);

cudaMemcpy(h_temp, d_temp, numThreads, cudaMemcpyDeviceToHost);

for(int i = 0; i < numThreads; i++)
{
printf(“GPU result %d : % 12.6f \n”, i, h_temp[i]);
}
free(h_temp);
cudaFree(d_temp);

return 0;
}

global void complexCal(float *d_temp, float num1, float num2, float num3, float num4)
{
int i = threadIdx.x;

d_temp[i] = num1 * num2 + num3 * num4;
}

The first problem I see is that you’re allocating and copying only 3 bytes instead of “3 * sizeof(float)”:

int numThreads = 3;
h_temp = (float*)malloc(numThreads);
cudaMalloc(((void**)&d_temp), numThreads);
cudaMemcpy(d_temp, h_temp, numThreads, cudaMemcpyHostToDevice);
cudaMemcpy(h_temp, d_temp, numThreads, cudaMemcpyDeviceToHost);

Thanks a lot. I got the problem.