multi GPUs programming error

CUDA error "the launch timed out and was terminated" (reported from cudaMemcpy())
I am programming matrix multiplication (matMul) with OpenMP. When executed on a server with 2 GPUs it works fine, but when executed on another server with 4 GPUs it produces this error. (The error only occurs when the data scale is large; the same scale runs fine on the 2-GPU server.) I don't know why — I would appreciate your solutions.
The 2-GPU server runs CentOS 5.7 (64-bit, Tesla C2050, 16 GB memory).
The 4-GPU server runs Ubuntu 12.04 (64-bit, Tesla C2050, 48 GB memory).

Does the computer with 4 GPUs have one or two CPU sockets?

4 threads and one thread controls one GPU card.

Actually, I’m asking about the hardware in the 4 GPU server. Does the motherboard have 1 or 2 CPUs installed?

Oh, just 2 CPU sockets. The source code is as follows:
The error occurs in `CUDA_SAFE_CALL(cudaMemcpy(..., cudaMemcpyDeviceToHost))` at about line 147...

#include <omp.h>
#include <stdio.h> // stdio functions are used since C++ streams aren’t necessarily thread safe
#include <cutil_inline.h>
//#include <shrQATest.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cutil_inline.h>
#include <cuda_runtime_api.h>
//#include “matrixMul.h”
#include<sys/time.h>
// Fill data[0..n) with uniform pseudo-random values in [0, 1].
// Uses the C library rand(); seed with srand() beforehand for reproducibility.
void randomInit(float * data, int n)
{
    for (int idx = 0; idx < n; ++idx) {
        data[idx] = (float)rand() / (float)RAND_MAX;
    }
}
// Naive matrix-multiply kernel: computes one element of C = A * B per thread.
//
// A is (hA x wA), B is (hB x wB) with hB == wA, and C is (hA x wB),
// all stored row-major in global memory.
//
// Expected launch: 2-D grid of 2-D blocks; thread (bx,by,tx,ty) computes
// C[row][col] with row = by*blockDim.y+ty, col = bx*blockDim.x+tx.
//
// Fixes vs. the pasted original (which was mangled by forum formatting):
//  - restored the stripped `__global__` qualifier, pointer stars and `*` operators;
//  - added a bounds guard so a grid that over-covers the matrix (e.g. when
//    dimensions are not exact multiples of the block size) cannot write
//    out of bounds — the original wrote C[row*wB+col] unconditionally.
__global__ void matrixMul(float *C, float *A, float *B,
                          int wA, int hA, int wB, int hB)
{
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int row = by * blockDim.y + ty;
    int col = bx * blockDim.x + tx;

    // Guard: threads outside the C matrix do nothing.
    if (row >= hA || col >= wB)
        return;

    float Cvalue = 0.0f;
    for (int e = 0; e < wA; ++e)
        Cvalue += A[row * wA + e] * B[e * wB + col];

    C[row * wB + col] = Cvalue;
}
void run();
// Program entry point: delegates all work to run().
// Fix: the original declared `char *argv` — the second parameter of main
// must be `char **argv` (or `char *argv[]`) per the C/C++ standards.
int main(int argc, char **argv)
{
    (void)argc;  // command-line arguments are unused
    (void)argv;
    run();
    return 0;
}

// Host driver: computes C = A * B by splitting the rows of A (and C) evenly
// across all CUDA devices, one OpenMP thread per device. B is replicated on
// every device. Prints the detected devices and the total wall-clock time.
//
// Fixes vs. the pasted original (which was mangled by forum formatting):
//  - restored stripped `*` characters (pointer declarations, multiplications)
//    and smart-quoted string literals;
//  - the work split was hard-coded to 4 (`HC/(4*threads.y)`, `HA/4`) — correct
//    only on the 4-GPU machine and wrong on the 2-GPU one. It is now derived
//    from num_cpu_threads, so the same code runs on any device count;
//  - malloc results are checked before use;
//  - the kernel launch is error-checked after synchronization (a kernel fault
//    otherwise surfaces only at the next cudaMemcpy, which is exactly the
//    confusing symptom reported in the thread);
//  - host buffers are freed before returning (they leaked in the original).
//
// NOTE(review): "launch timed out and was terminated" on the 4-GPU box may
// also be the display watchdog — confirm no X server is attached to the
// compute GPUs, or that the per-thread slice keeps kernel runtime bounded.
void run()
{
    printf("[ matrixMul ]\n");

    int num_gpus = 0;
    cudaGetDeviceCount(&num_gpus);
    printf("number of host CPUs:\t%d\n", omp_get_num_procs());
    printf("number of CUDA devices:\t%d\n", num_gpus);
    for (int i = 0; i < num_gpus; i++)
    {
        cudaDeviceProp dprop;
        cudaGetDeviceProperties(&dprop, i);
        printf("   %d: %s\n", i, dprop.name);
    }
    printf("---------------------------\n");

    struct timeval t1, t2;
    double tg1, tg2;
    gettimeofday(&t1, NULL);
    tg1 = t1.tv_usec / 1000000.0 + t1.tv_sec;
    srand((int)time(0));

    // Matrix dimensions: C (HC x WC) = A (HA x WA) * B (HB x WB).
    unsigned int WA, HA, WB, HB, WC, HC;
    WA = 600 * 16;
    HA = 2000 * 16;
    WB = 600 * 16;
    HB = WA;
    HC = HA;
    WC = WB;

    unsigned int size_A = WA * HA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = (float *)malloc(mem_size_A);
    unsigned int size_B = WB * HB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = (float *)malloc(mem_size_B);
    unsigned int size_C = WC * HC;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float *h_C = (float *)malloc(mem_size_C);
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "run(): host allocation failed\n");
        free(h_A);
        free(h_B);
        free(h_C);
        return;
    }

    randomInit(h_A, size_A);
    randomInit(h_B, size_B);

    omp_set_num_threads(num_gpus);  // one CPU thread per CUDA device

#pragma omp parallel
    {
        unsigned int cpu_thread_id = omp_get_thread_num();
        unsigned int num_cpu_threads = omp_get_num_threads();

        int gpu_id = -1;
        CUDA_SAFE_CALL(cudaSetDevice(cpu_thread_id % num_gpus));
        CUDA_SAFE_CALL(cudaGetDevice(&gpu_id));
        printf("CPU thread %d (of %d) uses CUDA device %d\n",
               cpu_thread_id, num_cpu_threads, gpu_id);

        // Each thread owns a contiguous horizontal slice of A and of C;
        // all threads use the whole of B.
        // (Assumes HA is divisible by num_cpu_threads, as it is for the
        //  dimensions above with 2 or 4 devices.)
        unsigned int rows_per_thread    = HA / num_cpu_threads;
        unsigned int A_nbytes_per_kernel = mem_size_A / num_cpu_threads;
        unsigned int C_nbytes_per_kernel = mem_size_C / num_cpu_threads;

        float *d_A = 0;
        CUDA_SAFE_CALL(cudaMalloc((void **)&d_A, A_nbytes_per_kernel));
        float *d_B = 0;
        CUDA_SAFE_CALL(cudaMalloc((void **)&d_B, mem_size_B));
        float *d_C = 0;
        CUDA_SAFE_CALL(cudaMalloc((void **)&d_C, C_nbytes_per_kernel));

        float *sub_A = h_A + cpu_thread_id * (size_A / num_cpu_threads);
        float *sub_B = h_B;
        float *sub_C = h_C + cpu_thread_id * (size_C / num_cpu_threads);

        CUDA_SAFE_CALL(cudaMemset(d_A, 0, A_nbytes_per_kernel));
        CUDA_SAFE_CALL(cudaMemset(d_B, 0, mem_size_B));
        CUDA_SAFE_CALL(cudaMemset(d_C, 0, C_nbytes_per_kernel));

        CUDA_SAFE_CALL(cudaMemcpy(d_A, sub_A, A_nbytes_per_kernel,
                                  cudaMemcpyHostToDevice));
        CUDA_SAFE_CALL(cudaMemcpy(d_B, sub_B, mem_size_B,
                                  cudaMemcpyHostToDevice));

        dim3 threads(16, 16);
        // BUG FIX: the original hard-coded a 4-way split here
        // (grid.y = HC/(4*threads.y), kernel hA = HA/4), which only matches
        // a machine with exactly 4 CPU threads/GPUs. Derive it from the
        // actual thread count instead.
        dim3 grid(WC / threads.x, rows_per_thread / threads.y);

        matrixMul<<<grid, threads>>>(d_C, d_A, d_B,
                                     WA, rows_per_thread, WB, HB);

        // Synchronize, then surface any launch/execution error here rather
        // than letting it appear later inside cudaMemcpy.
        cudaThreadSynchronize();
        CUDA_SAFE_CALL(cudaGetLastError());

        CUDA_SAFE_CALL(cudaMemcpy(sub_C, d_C, C_nbytes_per_kernel,
                                  cudaMemcpyDeviceToHost));

        CUDA_SAFE_CALL(cudaFree(d_A));
        CUDA_SAFE_CALL(cudaFree(d_B));
        CUDA_SAFE_CALL(cudaFree(d_C));
    }

    // Host buffers were leaked in the original.
    free(h_A);
    free(h_B);
    free(h_C);

    gettimeofday(&t2, NULL);
    tg2 = t2.tv_usec / 1000000.0 + t2.tv_sec;
    printf("matrixMul compute time of GPU: %f s\n", tg2 - tg1);
}