To transport the data, I allocate memory in the CUT_THREADPROC routine using cudaMalloc and then copy the data to
the GPUs using cudaMemcpy. So each thread does the same thing for its own device and returns the data to a different matrix, which is
declared as a pointer in a structure.
I do that with the following code:
[i]
// Bind this host thread to its assigned GPU; all subsequent CUDA calls
// in this thread target that device.
CUDA_SAFE_CALL(cudaSetDevice(str->real_device));
// Read the device id back to confirm the binding took effect.
CUDA_SAFE_CALL(cudaGetDevice(&dev));
// FIX: the format string previously used typographic curly quotes, which is not valid C.
printf("The device running now is %d\n", dev);
// Per-device buffers: input frame and kernel output, sized by this device's share of the work.
CUDA_SAFE_CALL( cudaMalloc((void**)&d_Pad_Frame, str->PAD_FRAME_SIZE_PER_DEVICE));
CUDA_SAFE_CALL( cudaMalloc((void**)&d_OutcomeGPU, str->OUTCOME_SIZE_PER_DEVICE));
// Time the host-to-device transfers (both copies below are synchronous,
// so the wall-clock interval covers the full transfer).
gettimeofday (&str->start_in, NULL);
// Upload the template into __constant__ (or __device__) symbol memory on this GPU.
CUDA_SAFE_CALL( cudaMemcpyToSymbol(d_Template, h_Template, TEMPLATE_SIZE));
// Upload this device's slice of the padded frame.
CUDA_SAFE_CALL( cudaMemcpy(d_Pad_Frame, str->h_Pad_Frame, str->PAD_FRAME_SIZE_PER_DEVICE, cudaMemcpyHostToDevice));
gettimeofday (&str->end_in, NULL);
// Sanity check: copy the frame back out and print it to verify the upload.
CUDA_SAFE_CALL( cudaMemcpy(str->test_Pad_Frame, d_Pad_Frame, str->PAD_FRAME_SIZE_PER_DEVICE, cudaMemcpyDeviceToHost) );
// Rows = PAD_FRAME_M; columns derived from the per-device byte size (assumes float elements — TODO confirm).
Printing_Matrix(PAD_FRAME_M, str->PAD_FRAME_SIZE_PER_DEVICE/(PAD_FRAME_M * sizeof(float)), str->test_Pad_Frame);
[/i]
So I actually print the matrices to verify the transfer.
As for your second question: to launch the threads, I follow the same steps as the MonteCarloMultiGPU example from the NVIDIA CUDA SDK.