I am working on simplemultiGPU code. I removed CPU parts to shorten the code. I have a server with 4 graphic card and I want to see if I only use one graphic card it is slower than using 4 graphic card. But when I check running time, for 1 GPU card it is faster than 4 gpu card. Could you please help me to find my mistake or misunderstanding? I should mention that I use “for (int k = 0; k < 100000; k++){” to increase the computational time and can see the impact of computational cost.
// System includes
#include <stdio.h>
#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include <ctime>
// CUDA runtime
#include <cuda_runtime.h>
#ifndef MAX
#define MAX(a,b) (a > b ? a : b)
#endif
#include "Header.h"
////////////////////////////////////////////////////////////////////////////////
// Data configuration
////////////////////////////////////////////////////////////////////////////////
const int MAX_GPU_COUNT = 32;
const int DATA_N = 128000003;
////////////////////////////////////////////////////////////////////////////////
// Simple reduction kernel.
// Refer to the 'reduction' CUDA Sample describing
// reduction optimization strategies
////////////////////////////////////////////////////////////////////////////////
__global__ static void reduceKernel(float *d_Result, float *d_Input, float *d_Inputt, int N)
{
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int threadN = gridDim.x * blockDim.x;
float sum = 0;
for (int pos = tid; pos < N; pos += threadN)
sum += d_Input[pos] + d_Inputt[pos];
d_Result[tid] = sum;
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
clock_t time_req;
//Solver config
TGPUplan plan[MAX_GPU_COUNT];
//GPU reduction results
float h_SumGPU[MAX_GPU_COUNT];
float sumGPU;
double sumCPU, diff;
int i, j, gpuBase, GPU_N;
//more accurate slow
const int BLOCK_N =4;
const int THREAD_N = 16;
//low accurate fast
//const int BLOCK_N = 128;
//const int THREAD_N = 512;
const int ACCUM_N = BLOCK_N * THREAD_N;
printf("Starting simpleMultiGPU\n");
cudaGetDeviceCount(&GPU_N);
if (GPU_N > MAX_GPU_COUNT)
{
GPU_N = MAX_GPU_COUNT;
}
//GPU_N = 1;
printf("CUDA-capable device count: %i\n", GPU_N);
printf("Generating input data...\n\n");
//Subdividing input data across GPUs
//Get data sizes for each GPU
for (i = 0; i < GPU_N; i++)
{
plan[i].dataN = DATA_N / GPU_N;
}
//Take into account "odd" data sizes
// from i=0 to i' we add one number for each data to total data for the GPUs from 0 to i'
// maybe DATA_N is like 11 and we have 2 GPU_N. so first we divide it to 5 data for each gpu and then we give 1 more data to the first GPU.
for (i = 0; i < DATA_N % GPU_N; i++)
{
plan[i].dataN++;
}
//Assign data ranges to GPUs
//gpuBase = 0;
for (i = 0; i < GPU_N; i++)
{
plan[i].h_Sum = h_SumGPU + i;
//gpuBase += plan[i].dataN;
}
//Create streams for issuing GPU command asynchronously and allocate memory (GPU and System page-locked)
for (i = 0; i < GPU_N; i++)
{
cudaSetDevice(i);
cudaStreamCreate(&plan[i].stream);
//Allocate memory
//sample: cudaMalloc(&d_v_x, (DS)* sizeof(float));
//plan[i].d_Data is our variable in GPU with total number of variables plan[i].dataN for GPU_N=i:
//in face we have:plan[i].d_Data[0 to plan[i].dataN]
cudaMalloc((void **)&plan[i].d_Data, plan[i].dataN * sizeof(float));
cudaMalloc((void **)&plan[i].d_Dataa, plan[i].dataN * sizeof(float));
cudaMalloc((void **)&plan[i].d_Sum, ACCUM_N * sizeof(float));
cudaMallocHost((void **)&plan[i].h_Sum_from_device, ACCUM_N * sizeof(float));
cudaMallocHost((void **)&plan[i].h_Data, plan[i].dataN * sizeof(float));
cudaMallocHost((void **)&plan[i].h_Dataa, plan[i].dataN * sizeof(float));
for (j = 0; j < plan[i].dataN; j++)
{
plan[i].h_Data[j] = 1.0;
plan[i].h_Dataa[j] = 1.0;
}
}
//Start timing and compute on GPU(s)
printf("Computing with %d GPUs...\n", GPU_N);
// Using pow function
time_req = clock();
//Copy data to GPU, launch the kernel and copy data back. All asynchronously
for (i = 0; i < GPU_N; i++)
{
//Set device
cudaSetDevice(i);
//Copy input data from CPU
cudaMemcpyAsync(plan[i].d_Data, plan[i].h_Data, plan[i].dataN * sizeof(float), cudaMemcpyHostToDevice, plan[i].stream);
cudaMemcpyAsync(plan[i].d_Dataa, plan[i].h_Dataa, plan[i].dataN * sizeof(float), cudaMemcpyHostToDevice, plan[i].stream);
//Perform GPU computations
for (int k = 0; k < 100000; k++){
reduceKernel << <BLOCK_N, THREAD_N, 0, plan[i].stream >> >(plan[i].d_Sum, plan[i].d_Data, plan[i].d_Dataa, plan[i].dataN);
}
//Read back GPU results
cudaMemcpyAsync(plan[i].h_Sum_from_device, plan[i].d_Sum, ACCUM_N *sizeof(float), cudaMemcpyDeviceToHost, plan[i].stream);
}
printf("************GPU time:%f \n", (float)(clock() - time_req) / CLOCKS_PER_SEC);
system("pause");
exit((diff < 1e-5) ? EXIT_SUCCESS : EXIT_FAILURE);
}