Multi-GPU, simple kernel

Hi,

I recently bought a second GTX 470 and set the two cards up in SLI.
I am trying to use them with CUDA, so I started with something simple. It works when DATA_N is less than 33554432, but above that I get unexpected results.

I am working on a 64-bit Linux Mint 14 machine with CUDA 5.0 installed.

Can somebody explain why this works for data sizes below about 33M elements but not for larger ones?

#include <stdio.h>
#include <stdlib.h>

#define NUMTHREADS 256
#define DATA_N 33554432

__global__ void fill_data(int *tab,int len){
	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	if(idx >= len)
		idx = len-1;
	tab[idx] = 1;
}

__global__ static void reduceKernel(int *d_Result, int *d_Input, int N)
{

    int i;
    int sum = 0;
    if(threadIdx.x == 0){
    	for(i=0;i<blockDim.x;i++)
    		sum+=d_Input[blockIdx.x*blockDim.x+i];
    	d_Result[blockIdx.x] = sum;
    }
}

int main(){

	int GPU_N = 0;
	int **p_data,**p_res;
	int dev;
	int **res_data;
	cudaStream_t *streams;

	cudaGetDeviceCount(&GPU_N);

	p_data = (int **)malloc(GPU_N*sizeof(int*));
	p_res = (int **)malloc(GPU_N*sizeof(int*));
	res_data = (int **)malloc(GPU_N*sizeof(int*));
	streams = (cudaStream_t *)malloc(sizeof(cudaStream_t)*GPU_N);

	for(dev=0;dev<GPU_N;dev++){
		cudaSetDevice(dev);
		cudaStreamCreate(&streams[dev]);
		cudaMalloc((void **)&p_data[dev],(DATA_N/GPU_N)*sizeof(int));
		cudaMalloc((void **)&p_res[dev],(DATA_N/GPU_N)*sizeof(int));
		res_data[dev] = (int *)calloc(sizeof(int),DATA_N/GPU_N/NUMTHREADS);
	}

	for(dev=0;dev<GPU_N;dev++){
		cudaSetDevice(dev);
		cudaMemset(p_data[dev],1,(DATA_N/GPU_N)*sizeof(int));
		//fill_data<<<DATA_N/GPU_N/NUMTHREADS,NUMTHREADS,0,streams[dev]>>>(p_data[dev],DATA_N/GPU_N);
	}

	for(dev=0;dev<GPU_N;dev++){
		cudaSetDevice(dev);
		reduceKernel<<<DATA_N/GPU_N/NUMTHREADS,NUMTHREADS,0,streams[dev]>>>(p_res[dev],p_data[dev],DATA_N/GPU_N);
	}

	for(dev=0;dev<GPU_N;dev++){
		cudaSetDevice(dev);
		cudaMemcpy(res_data[dev],p_res[dev],sizeof(int)*(DATA_N/GPU_N/NUMTHREADS),cudaMemcpyDeviceToHost);
	}

	for(dev=0;dev<GPU_N;dev++){
		int mysum = 0;
		int i;
		for(i=0;i<DATA_N/GPU_N/NUMTHREADS;i++)
			mysum+=res_data[dev][i];
		printf("Dev[%d] : %d\n",dev,mysum);
	}

	return 0;
}

The most important thing to do is modify this code to check the return values from all the CUDA functions. That will help you pinpoint what is going wrong, and where it is happening.
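
Note that kernel launches do not return an error code themselves, so wrapping only the API calls will miss launch failures. Something along these lines after each launch catches them too (a minimal sketch; checkKernel is just an illustrative name, and it assumes <stdio.h>/<stdlib.h> are included):

static void checkKernel(const char *msg){
	// Launch/configuration errors are reported immediately...
	cudaError_t err = cudaGetLastError();
	if(err != cudaSuccess){
		printf("%s launch failed: %s\n", msg, cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
	// ...execution errors only surface once the kernel has actually run.
	err = cudaDeviceSynchronize();
	if(err != cudaSuccess){
		printf("%s execution failed: %s\n", msg, cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
}

// Usage, right after a launch:
//   reduceKernel<<<blocks, threads, 0, stream>>>(...);
//   checkKernel("reduceKernel");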

I did it; no error occurred.

#include <stdio.h>
#include <stdlib.h>

#define NUMTHREADS 256
#define DATA_N 33554432

static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

__global__ void fill_data(int *tab,int len){
	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	if(idx >= len)
		idx = len-1;
	tab[idx] = 1;
}

__global__ static void reduceKernel(int *d_Result, int *d_Input, int N)
{

    int i;
    int sum = 0;
    if(threadIdx.x == 0){
    	for(i=0;i<blockDim.x;i++)
    		sum+=d_Input[blockIdx.x*blockDim.x+i];
    	d_Result[blockIdx.x] = sum;
    }
}

int main(){

	int GPU_N = 0;
	int **p_data,**p_res;
	int dev;
	int **res_data;
	cudaStream_t *streams;

	HANDLE_ERROR(cudaGetDeviceCount(&GPU_N));

	p_data = (int **)malloc(GPU_N*sizeof(int*));
	p_res = (int **)malloc(GPU_N*sizeof(int*));
	res_data = (int **)malloc(GPU_N*sizeof(int*));
	streams = (cudaStream_t *)malloc(sizeof(cudaStream_t)*GPU_N);

	for(dev=0;dev<GPU_N;dev++){
		HANDLE_ERROR(cudaSetDevice(dev));
		HANDLE_ERROR(cudaStreamCreate(&streams[dev]));
		HANDLE_ERROR(cudaMalloc((void **)&p_data[dev],(DATA_N/GPU_N)*sizeof(int)));
		HANDLE_ERROR(cudaMalloc((void **)&p_res[dev],(DATA_N/GPU_N)*sizeof(int)));
		res_data[dev] = (int *)calloc(sizeof(int),DATA_N/GPU_N/NUMTHREADS);
	}

	for(dev=0;dev<GPU_N;dev++){
		HANDLE_ERROR(cudaSetDevice(dev));
		HANDLE_ERROR(cudaMemset(p_data[dev],1,(DATA_N/GPU_N)*sizeof(int)));
		//fill_data<<<DATA_N/GPU_N/NUMTHREADS,NUMTHREADS,0,streams[dev]>>>(p_data[dev],DATA_N/GPU_N);
	}

	for(dev=0;dev<GPU_N;dev++){
		HANDLE_ERROR(cudaSetDevice(dev));
		reduceKernel<<<DATA_N/GPU_N/NUMTHREADS,NUMTHREADS,0,streams[dev]>>>(p_res[dev],p_data[dev],DATA_N/GPU_N);
	}

	for(dev=0;dev<GPU_N;dev++){
		HANDLE_ERROR(cudaSetDevice(dev));
		HANDLE_ERROR(cudaMemcpy(res_data[dev],p_res[dev],sizeof(int)*(DATA_N/GPU_N/NUMTHREADS),cudaMemcpyDeviceToHost));
	}

	for(dev=0;dev<GPU_N;dev++){
		int mysum = 0;
		int i;
		for(i=0;i<DATA_N/GPU_N/NUMTHREADS;i++)
			mysum+=res_data[dev][i];
		printf("Dev[%d] : %d\n",dev,mysum);
	}

	return 0;
}

OK, that is weird. What is the unexpected result, exactly?

If I launch it with DATA_N = 33554432, I should get 16777216 on each device (33554432/2, since I have 2 GPUs), but instead I get:

Dev[0] : 2076674393
Dev[1] : 1333383326

Do you see anything obvious?

I am afraid only dual-GPU users can test it.

I also synchronized before retrieving my data from both cards, but it does not change the results.
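
For reference, what I mean by synchronizing is essentially this, inserted between the launch loop and the copy loop:

	for(dev=0;dev<GPU_N;dev++){
		cudaSetDevice(dev);
		cudaDeviceSynchronize(); // wait for all work queued on this device
	}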

Does anyone have an idea?

OK, the answer is pretty simple; maybe the "…" from Gert-Jan was hinting at that.
The maximum grid size in the x-dimension is 65535 blocks on these cards, and my launch uses 33554432/2/256 = 65536 blocks, so the kernel never runs with that DATA_N.
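
For the record, one way to stay under that limit is to launch a fixed number of blocks and let each block walk its part of the input with a grid-sized stride, roughly like this (a sketch that keeps the one-partial-sum-per-block structure; the kernel name and the 1024-block count are just examples):

__global__ static void reduceKernelStrided(int *d_Result, int *d_Input, int N)
{
	// One partial sum per block; each block handles several blockDim.x-sized
	// chunks, stepping forward by the whole grid each iteration.
	if(threadIdx.x == 0){
		int sum = 0;
		int stride = gridDim.x * blockDim.x;
		for(int base = blockIdx.x * blockDim.x; base < N; base += stride){
			for(int i = 0; i < blockDim.x && base + i < N; i++)
				sum += d_Input[base + i];
		}
		d_Result[blockIdx.x] = sum;
	}
}

// Launched with a grid that always fits, e.g.:
//   reduceKernelStrided<<<1024,NUMTHREADS,0,streams[dev]>>>(p_res[dev],p_data[dev],DATA_N/GPU_N);
// The host then allocates and sums only 1024 partial results per device.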

Sorry!