Hi. I’m trying to parallelize my C++ code, and it’s running very slowly once I add the CUDA part.
I’ve got a list of objects, and for every object I have to perform some operations. Instead of using a “for” loop to go through the list sequentially, my idea was to parallelize the processing of this list.
The problem is that this block of code (the “for” structure with the operations) is called many times during the execution of the program, and the call to the CUDA function adds a large delay to the final result (not perceptible in a single iteration, but noticeable in the program’s overall execution time).
I don’t know whether the problem is communication overhead between the C++ code and the CUDA code, or whether my CUDA code is not written properly. Here is a sample of what I want to do with CUDA.
My cpp function
// Periodic environment check: when the population is below the configured
// maximum size, runs the CUDA pass and then a per-object host-side pass.
// NOTE(review): the guard reads `pop->size()` but the loop iterates
// `population` -- confirm these refer to the same container.
void Env::check ( void ) {
if ( pop->size() < get_param ( "max_size" ) ) {
std::list<BAC *>::iterator j;
//CALL TO CUDA FUNCTION
// NOTE(review): CUDA_envCheck is defined in the .cu file as taking an int*
// and returning int*, but it is called here with no arguments and the
// result is discarded -- confirm the intended signature. Also note this
// runs on every check() call; per-call CUDA context setup/teardown in the
// .cu side is the likely source of the cumulative slowdown.
CUDA_envCheck();
for ( j=population->begin(); j!=population->end(); j++ ) {
/*Set of operations*/
}
}
}
And this is the .cu file:
// CUDA-C includes
#include <cuda.h>
#include "device_launch_parameters.h"
#include <cuda_runtime.h>
#include <stdio.h>
#include *******
#include *******
// Forward declaration of the host-side launcher defined below.
// (Was `_envCheckCuda()` -- the name and parameter list must match both the
// definition and the call site in CUDA_envCheck, otherwise the call at
// `envCheckCuda(c)` does not compile.)
cudaError_t envCheckCuda(int *c);
// Placeholder kernel: one thread per list element. The per-object operations
// will eventually read/write c[threadIdx.x]; for now the body is empty so the
// launch overhead can be measured in isolation.
// Expected launch: <<<1, N>>> where N == number of elements in c (the single
// block keeps threadIdx.x a valid index into c without extra arithmetic).
__global__ void _envCheckKernel(int *c)
{
    int i = threadIdx.x;
    // Silence the unused-variable warning until real work lands; the
    // parameter is kept so the launch site and kernel agree on signature.
    (void)i;
    (void)c;
}
// Host entry point called from the C++ side (Env::check).
// `c` must point to at least 16 ints (the buffer size used by envCheckCuda);
// the buffer is round-tripped through the GPU and returned to the caller.
// Returns the caller's own buffer pointer `c`.
extern "C" int* CUDA_envCheck(int *c){
    // BUG FIXED: the original declared `int c[arraySize] = { 0 };` here,
    // which redefines the parameter `c` (compile error) and, even if renamed,
    // would have returned a pointer to a stack array (dangling after return).
    // Operate directly on the caller-supplied buffer instead.
    cudaError_t cudaStatus = envCheckCuda(c);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "envCheckCuda failed!");
        //return 1;
    }
    // PERF FIX: cudaDeviceReset() was called here on *every* invocation.
    // That destroys the whole CUDA context, which must then be rebuilt on the
    // next call -- this is the dominant cause of the cumulative slowdown when
    // this function is called in a loop. Call cudaDeviceReset() exactly once,
    // at program shutdown, if complete Nsight/Visual Profiler traces are
    // needed; never per iteration.
    return c;
}
// Runs _envCheckKernel over a 16-int buffer: host-to-device copy, kernel
// launch, device-to-host copy.
// `c` must point to at least 16 ints; the kernel's results are written back
// into it. Returns cudaSuccess, or the first error encountered.
cudaError_t envCheckCuda(int *c)
{
    int size = 16;   // element count; keep in sync with callers' buffer size
    cudaError_t cudaStatus;
    int *dev_c = 0;

    // PERF NOTE: cudaSetDevice + cudaMalloc + cudaFree on every call are
    // expensive. If this function is hot, select the device once at program
    // start and allocate dev_c once, reusing it across calls.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate the device buffer for the output vector.
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy the input from host memory to the GPU buffer.
    cudaStatus = cudaMemcpy(dev_c, c, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy (host-to-device) failed!");
        goto Error;
    }

    // Launch one thread per element (a single block is fine for 16 elements).
    // BUG FIXED: the original launched `worldUpdKernel`, which is not defined
    // in this file -- the kernel defined above is `_envCheckKernel`.
    _envCheckKernel<<<1, size>>>(dev_c);

    // Kernel launches return no status directly; launch-configuration errors
    // surface through cudaGetLastError().
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "_envCheckKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Wait for the kernel to finish and surface any asynchronous execution
    // errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching _envCheckKernel!\n", cudaStatus);
        goto Error;
    }

    // BUG FIXED: the original announced this copy in a comment but the call
    // itself was missing, so callers never saw the kernel's output.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy (device-to-host) failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    return cudaStatus;
}
Thank you