Hi there,
Suppose I have the following example code:
// Variant A: stage the data in a page-locked (pinned) host buffer, then copy
// that buffer to the device.
float *u_d_array_host;
float *u_d_array_device;
// cudaHostAlloc returns a cudaError_t status and hands the buffer back through
// its first argument; its parameters are (void **pHost, size_t size,
// unsigned int flags). It must not be used like malloc (casting its return
// value to a pointer), which is what the original call did.
cudaHostAlloc((void **) &u_d_array_host,(u_d_array_length)*sizeof(float),cudaHostAllocDefault);
cudaMalloc((void **) &u_d_array_device,(u_d_array_length)*sizeof(float));
// Fill the pinned staging buffer element by element from u_d_array.
for(int j = 0;j<u_d_array_length;j++)
{
    u_d_array_host[j] = u_d_array[j];
}
// Blocking host-to-device copy out of the pinned buffer.
cudaMemcpy(u_d_array_device,u_d_array_host,(u_d_array_length)*sizeof(float),cudaMemcpyHostToDevice);
// NOTE(review): every call above returns a cudaError_t that is ignored here;
// check them in real code. The pinned buffer has to be released with
// cudaFreeHost (not free) and the device buffer with cudaFree.
Do I need the intermediate copy through u_d_array_host, or can I just do something like this:
// Variant B: allocate the device buffer and copy u_d_array into it directly,
// with no intermediate host staging buffer.
float *u_d_array_device;
// Size of the transfer in bytes (u_d_array_length float elements).
const size_t u_d_array_bytes = sizeof(float) * (u_d_array_length);
cudaMalloc(reinterpret_cast<void **>(&u_d_array_device), u_d_array_bytes);
// Blocking host-to-device copy straight from the caller's array.
// NOTE(review): this is a raw byte copy, so it presumably expects u_d_array
// to already hold float elements — confirm against its declaration.
cudaMemcpy(u_d_array_device, u_d_array, u_d_array_bytes, cudaMemcpyHostToDevice);
I know that my u_d_array holds, e.g., 10 values stored contiguously in memory. Does the intermediate copy through u_d_array_host offer any speed benefit?
Thanks in advance.