MultiGPU Pinned and pageable memory

Hi Everybody:

I am trying to modify my code so that, instead of using just one GPU for the calculations, it uses two of them. My program has the option to run with pageable or pinned memory.

For one GPU it runs OK, with no problems, but with the new code I always have problems if I try to run on 2 GPUs.

For pinned memory I always get the following problem.

Waiting for GPUs results …

set device

set device

device 0 test 0

Allocate memory in device 0

device 1 test 0

Allocate memory in device 1 : cudaSafeCall() Runtime API error : no CUDA-capable device is available.

and for pageable memory

Waiting for GPUs results …

set device

device 0 test 1

Allocate memory in device 0

set device

device 1 test 1

Allocate memory in device 1

Copy memory to device 0

Copy memory to device 1

Device = 0 Gx = 65535 Gy = 1

Copy results to Host 0

Device = 1 Gx = 65535 Gy = 1

Copy results to Host 1 : cudaSafeCall() Runtime API error : unspecified launch failure.

This is the part of my code where I'm doing all the GPU settings and launching the kernel:


static CUT_THREADPROC solverThread(TGPUplan *plan)


short int* DEV_vec_struct;

float* DEV_weight;

int* DEV_index;

int* DEV_vec_entry;

short int* DEV_lookup;

float* DEV_proj_space;

float* DEV_image_space;

int* DEV_vec_check;

printf("set device \n");

//Set device

cutilSafeCall( cudaSetDevice(plan->device) );

//cudaStream_t stream1;

// cudaStreamCreate(&stream1);

printf("device %d test %d\n",plan->device, plan->test);

int nsize_s=plan->nlength_shortint*sizeof(short int);

int nsize_w=plan->nmatrix_elements*sizeof(float);

int nsize_i=plan->nmatrix_elements*sizeof(int);

int nsize_ve=2*plan->nvecs*sizeof(int);

int nsize_lk=plan->nsymmetries*plan->nsymmetries*sizeof(short int);

int nsize_proj=plan->nsymmetries*plan->nvecs*sizeof(float);

int nsize_img=plan->nfields*plan->nlength*sizeof(float);

int nsize_ck=10*plan->nvecs*sizeof(int);

//Initialization of results



// Allocate memory

printf("Allocate memory in device %d \n",plan->device);

cutilSafeCall(cudaMalloc((void**) &DEV_vec_struct, nsize_s));

cutilSafeCall(cudaMalloc((void**) &DEV_weight, nsize_w));

cutilSafeCall(cudaMalloc((void**) &DEV_index, nsize_i));

cutilSafeCall(cudaMalloc((void**) &DEV_vec_entry, nsize_ve));

cutilSafeCall(cudaMalloc((void**) &DEV_lookup, nsize_lk));

cutilSafeCall(cudaMalloc((void**) &DEV_proj_space,nsize_proj));

cutilSafeCall(cudaMalloc((void**) &DEV_image_space, nsize_img));

cutilSafeCall(cudaMalloc((void**) &DEV_vec_check, nsize_ck));

//Copy input data from CPU

printf("Copy memory to device %d \n",plan->device);

if (plan->test){

cutilSafeCall(cudaMemcpy(DEV_vec_struct, plan->vec_struct, nsize_s,

			  cudaMemcpyHostToDevice) );

cutilSafeCall(cudaMemcpy(DEV_weight, plan->weight, nsize_w,

			  cudaMemcpyHostToDevice) );

cutilSafeCall(cudaMemcpy(DEV_index, plan->index, nsize_i,

			  cudaMemcpyHostToDevice) ); 


cutilSafeCall(cudaMemcpy(DEV_vec_entry, plan->vec_entry, nsize_ve,

			  cudaMemcpyHostToDevice) ); 

cutilSafeCall(cudaMemcpy(DEV_lookup, plan->symm_lookup, nsize_lk,

			  cudaMemcpyHostToDevice) );


cutilSafeCall(cudaMemcpy(DEV_proj_space, plan->proj_space, nsize_proj,

			  cudaMemcpyHostToDevice) );

cutilSafeCall(cudaMemcpy(DEV_image_space, plan->flatimage, nsize_img,

			  cudaMemcpyHostToDevice) );

cutilSafeCall(cudaMemcpy(DEV_vec_check, plan->vec_check, nsize_ck,

			  cudaMemcpyHostToDevice) );



cutilSafeCall(cudaMemcpyAsync(DEV_vec_struct, plan->vec_struct, nsize_s,

			  cudaMemcpyHostToDevice,0) );

cutilSafeCall(cudaMemcpyAsync(DEV_weight, plan->weight, nsize_w,

			  cudaMemcpyHostToDevice,0) );

cutilSafeCall(cudaMemcpyAsync(DEV_index, plan->index, nsize_i,

			  cudaMemcpyHostToDevice,0) ); 


cutilSafeCall(cudaMemcpyAsync(DEV_vec_entry, plan->vec_entry, nsize_ve,

			  cudaMemcpyHostToDevice,0) ); 

cutilSafeCall(cudaMemcpyAsync(DEV_lookup, plan->symm_lookup, nsize_lk,

			  cudaMemcpyHostToDevice,0) );


cutilSafeCall(cudaMemcpyAsync(DEV_proj_space, plan->proj_space, nsize_proj,

			  cudaMemcpyHostToDevice,0) );

cutilSafeCall(cudaMemcpyAsync(DEV_image_space, plan->flatimage, nsize_img,

			  cudaMemcpyHostToDevice,0) );

cutilSafeCall(cudaMemcpyAsync(DEV_vec_check, plan->vec_check, nsize_ck,

			  cudaMemcpyHostToDevice,0) );


//Perform GPU computations

//—setup execution parameters

int tX=320;

int tY=1;

dim3 threads(tX, tY);

int gY=(plan->nvecs + 65535)/65535;

int gX=65535;

printf("Device = %d Gx = %d Gy = %d \n",plan->device,gX,gY);

dim3 grid(gX, gY);

int nv = plan->nvecs;

//—execute the kernel

matrixMul<<< grid, threads >>>(DEV_vec_struct, DEV_weight, DEV_index,

			 DEV_lookup, DEV_proj_space, DEV_image_space, DEV_vec_entry,

			 DEV_vec_check, nv);


//—check if kernel execution generated and error

cutilCheckMsg("Kernel execution failed");

// if (plan->test == false)

cutilSafeCall(cudaThreadSynchronize());    /// Add to check correct time

printf("Copy results to Host %d\n",plan->device);

if (plan->test)


  cutilSafeCall(cudaMemcpy (plan->proj_space, DEV_proj_space, nsize_proj,

  			     cudaMemcpyDeviceToHost) );




  cutilSafeCall(cudaMemcpyAsync(plan->proj_space, DEV_proj_space, nsize_proj,

		   cudaMemcpyDeviceToHost,0) );


//if (plan->test == false)

cutilSafeCall(cudaThreadSynchronize()); /// Add to check correct time

//Blocks until the device has performed all task requested


// Shutdown this GPU

printf("Free memory from device %d\n",plan->device);













Line 1661 is cudaSafeCall() Runtime API error : no CUDA-capable device is available.

and Line 1748 is cutilSafeCall(cudaMemcpy (plan->proj_space, DEV_proj_space, nsize_proj,

  			     cudaMemcpyDeviceToHost) );

Basically this program is based on MonteCarloMultiGPU.cpp, which runs successfully with no problems.

Any ideas, why the program crashes?


Luis Garcia