I recently bought an extra 470 GTX and set up a SLI.
I am trying to use them with CUDA and I started with something simple. This works when DATA_N is less than 33554432 but above I have unexpected results.
I am working on a Linux Mint 14 64 bits machine and with CUDA 5.0 installed.
Can somebody explain why this works with data of size inferior to 33Mo but not with greater data ?
#include <stdio.h>
#define NUMTHREADS 256
#define DATA_N 33554432
__global__ void fill_data(int *tab,int len){
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx >= len)
idx = len-1;
tab[idx] = 1;
}
__global__ static void reduceKernel(int *d_Result, int *d_Input, int N)
{
int i;
int sum = 0;
if(threadIdx.x == 0){
for(i=0;i<blockDim.x;i++)
sum+=d_Input[blockIdx.x*blockDim.x+i];
d_Result[blockIdx.x] = sum;
}
}
int main(){
int GPU_N = 0;
int **p_data,**p_res;
int dev;
int **res_data;
cudaStream_t *streams;
cudaGetDeviceCount(&GPU_N);
p_data = (int **)malloc(GPU_N*sizeof(int*));
p_res = (int **)malloc(GPU_N*sizeof(int*));
res_data = (int **)malloc(GPU_N*sizeof(int*));
streams = (cudaStream_t *)malloc(sizeof(cudaStream_t)*GPU_N);
for(dev=0;dev<GPU_N;dev++){
cudaSetDevice(dev);
cudaStreamCreate(&streams[dev]);
cudaMalloc((void **)&p_data[dev],(DATA_N/GPU_N)*sizeof(int));
cudaMalloc((void **)&p_res[dev],(DATA_N/GPU_N)*sizeof(int));
res_data[dev] = (int *)calloc(sizeof(int),DATA_N/GPU_N/NUMTHREADS);
}
for(dev=0;dev<GPU_N;dev++){
cudaSetDevice(dev);
cudaMemset(p_data[dev],1,(DATA_N/GPU_N)*sizeof(int));
//fill_data<<<DATA_N/GPU_N/NUMTHREADS,NUMTHREADS,0,streams[dev]>>>(p_data[dev],DATA_N/GPU_N);
}
for(dev=0;dev<GPU_N;dev++){
cudaSetDevice(dev);
reduceKernel<<<DATA_N/GPU_N/NUMTHREADS,NUMTHREADS,0,streams[dev]>>>(p_res[dev],p_data[dev],DATA_N/GPU_N);
}
for(dev=0;dev<GPU_N;dev++){
cudaSetDevice(dev);
cudaMemcpy(res_data[dev],p_res[dev],sizeof(int)*(DATA_N/GPU_N/NUMTHREADS),cudaMemcpyDeviceToHost);
}
for(dev=0;dev<GPU_N;dev++){
int mysum = 0;
int i;
for(i=0;i<DATA_N/GPU_N/NUMTHREADS;i++)
mysum+=res_data[dev][i];
printf("Dev[%d] : %d\n",dev,mysum);
}
return 0;
}
The most important thing to do is modify this code to check the return values from all the CUDA functions. That will help you pinpoint what is going wrong, and where it is happening.
Ok the answer is pretty simple, maybe the “…” from Gert-Jan was for that.
The maximum number of blocks is 65536 so my program does not work with 33554432/2/256 blocks.