Why does the following multi-GPU code run faster when I set GPU_N=1 than when GPU_N=4?

I am working from the simpleMultiGPU sample code. I removed the CPU parts to shorten the listing. I have a server with 4 graphics cards, and I want to verify that using only one card is slower than using all 4. But when I measure the running time, 1 GPU comes out faster than 4 GPUs. Could you please help me find my mistake or misunderstanding? I should mention that I use "for (int k = 0; k < 100000; k++){" to increase the computational work so that the computational cost dominates and its impact is visible.

// System includes
#include <stdio.h>
#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include <ctime>

// CUDA runtime
#include <cuda_runtime.h>

#ifndef MAX
// Parenthesize arguments and result so the macro expands safely inside
// larger expressions (e.g. MAX(a, b) * 2).
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#endif  // BUG FIX: the #ifndef was never terminated, breaking compilation

#include "Header.h"

// Data configuration
const int MAX_GPU_COUNT = 32;     // upper bound on devices we will use
const int DATA_N = 128000003;     // deliberately not divisible by GPU_N, to exercise remainder handling

// Simple reduction kernel.
// Refer to the 'reduction' CUDA Sample describing
// reduction optimization strategies

// Partial sum-reduction of two input arrays added elementwise.
// Each thread walks the input with a grid-stride loop (stride = total thread
// count), accumulates a private partial sum, and writes it to d_Result[tid].
// d_Result must hold gridDim.x * blockDim.x (= ACCUM_N) floats; a final
// reduction of those partials is presumably done elsewhere (not shown here).
// Refer to the 'reduction' CUDA Sample describing reduction optimization
// strategies.
__global__ static void reduceKernel(float *d_Result,
                                    const float *__restrict__ d_Input,
                                    const float *__restrict__ d_Inputt,
                                    int N)
{   // BUG FIX: the pasted code had lost the braces around the kernel body
    const int     tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int threadN = gridDim.x * blockDim.x;
    float sum = 0;

    // Coalesced traversal: on every pass, consecutive threads read
    // consecutive elements of both inputs.
    for (int pos = tid; pos < N; pos += threadN)
        sum += d_Input[pos] + d_Inputt[pos];

    d_Result[tid] = sum;
}

// Program main
// Program main
//
// Splits DATA_N elements across all visible GPUs, asynchronously copies the
// inputs to each device on its own stream, launches reduceKernel 100000
// times per GPU (to make the compute cost dominate), copies the partial
// sums back, synchronizes, and reports the elapsed wall-clock time.
int main(int argc, char **argv)
{   // BUG FIX: the pasted code had lost the braces around main and its loops
    clock_t time_req;

    // Solver config
    TGPUplan      plan[MAX_GPU_COUNT];

    // GPU reduction results
    float     h_SumGPU[MAX_GPU_COUNT];

    float sumGPU;             // unused here: the CPU-side reduction was removed from this excerpt
    double sumCPU, diff;      // NOTE(review): diff is never computed in this excerpt
    int i, j, gpuBase, GPU_N;

    // more accurate, slow
    const int  BLOCK_N = 4;
    const int THREAD_N = 16;

    // low accuracy, fast
    //const int  BLOCK_N = 128;
    //const int THREAD_N = 512;

    const int  ACCUM_N = BLOCK_N * THREAD_N;  // total partial sums per GPU

    printf("Starting simpleMultiGPU\n");

    // BUG FIX: GPU_N was read uninitialized; query the runtime for the
    // actual number of CUDA-capable devices.
    cudaGetDeviceCount(&GPU_N);

    if (GPU_N > MAX_GPU_COUNT)
        GPU_N = MAX_GPU_COUNT;

    //GPU_N = 1;

    printf("CUDA-capable device count: %i\n", GPU_N);
    printf("Generating input data...\n\n");

    // Subdivide input data across GPUs: even share first...
    for (i = 0; i < GPU_N; i++)
        plan[i].dataN = DATA_N / GPU_N;

    // ...then hand the remainder out one element at a time to the leading
    // GPUs (e.g. DATA_N=11, GPU_N=2 -> 6 and 5).
    // BUG FIX: the loop body (plan[i].dataN++) was missing in the paste.
    for (i = 0; i < DATA_N % GPU_N; i++)
        plan[i].dataN++;

    // Give each GPU a slot in the host-side result array.
    for (i = 0; i < GPU_N; i++)
        plan[i].h_Sum = h_SumGPU + i;

    // Create a stream and allocate memory (device + page-locked host) per GPU.
    // BUG FIX: cudaSetDevice() was missing, so every allocation and launch
    // went to device 0 regardless of i; cudaStreamCreate() was missing, so
    // plan[i].stream was used uninitialized.
    for (i = 0; i < GPU_N; i++)
    {
        cudaSetDevice(i);
        cudaStreamCreate(&plan[i].stream);

        // plan[i].d_Data is device storage for GPU i, holding
        // plan[i].dataN floats: plan[i].d_Data[0 .. plan[i].dataN-1].
        cudaMalloc((void **)&plan[i].d_Data, plan[i].dataN * sizeof(float));
        cudaMalloc((void **)&plan[i].d_Dataa, plan[i].dataN * sizeof(float));
        cudaMalloc((void **)&plan[i].d_Sum, ACCUM_N * sizeof(float));
        // Pinned host buffers: required for truly asynchronous
        // cudaMemcpyAsync transfers.
        cudaMallocHost((void **)&plan[i].h_Sum_from_device, ACCUM_N * sizeof(float));
        cudaMallocHost((void **)&plan[i].h_Data, plan[i].dataN * sizeof(float));
        cudaMallocHost((void **)&plan[i].h_Dataa, plan[i].dataN * sizeof(float));

        for (j = 0; j < plan[i].dataN; j++)
        {
            plan[i].h_Data[j]  = 1.0f;  // float literals: avoid double->float conversions
            plan[i].h_Dataa[j] = 1.0f;
        }
    }

    // Start timing and compute on GPU(s)
    printf("Computing with %d GPUs...\n", GPU_N);
    time_req = clock();

    // Copy data to each GPU, launch the kernels and copy results back --
    // all asynchronously on that GPU's own stream, so the devices overlap.
    for (i = 0; i < GPU_N; i++)
    {
        // BUG FIX: select the target device before issuing its work.
        cudaSetDevice(i);

        // Copy input data from CPU
        cudaMemcpyAsync(plan[i].d_Data, plan[i].h_Data, plan[i].dataN * sizeof(float), cudaMemcpyHostToDevice, plan[i].stream);
        cudaMemcpyAsync(plan[i].d_Dataa, plan[i].h_Dataa, plan[i].dataN * sizeof(float), cudaMemcpyHostToDevice, plan[i].stream);

        // Perform GPU computations (repeated to inflate the compute cost)
        for (int k = 0; k < 100000; k++)
        {
            reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(plan[i].d_Sum, plan[i].d_Data, plan[i].d_Dataa, plan[i].dataN);
        }

        // Read back GPU results
        cudaMemcpyAsync(plan[i].h_Sum_from_device, plan[i].d_Sum, ACCUM_N * sizeof(float), cudaMemcpyDeviceToHost, plan[i].stream);
    }

    // BUG FIX (this answers the question in the post): everything above is
    // asynchronous, so without synchronizing here, clock() measures only the
    // host-side launch/enqueue overhead -- which grows with GPU_N -- rather
    // than the actual GPU execution time. That is why GPU_N=1 looked faster.
    for (i = 0; i < GPU_N; i++)
    {
        cudaSetDevice(i);
        cudaStreamSynchronize(plan[i].stream);
    }

    printf("************GPU time:%f \n", (float)(clock() - time_req) / CLOCKS_PER_SEC);

    // Release per-GPU resources (streams, device memory, pinned host memory).
    for (i = 0; i < GPU_N; i++)
    {
        cudaSetDevice(i);
        cudaFreeHost(plan[i].h_Dataa);
        cudaFreeHost(plan[i].h_Data);
        cudaFreeHost(plan[i].h_Sum_from_device);
        cudaFree(plan[i].d_Sum);
        cudaFree(plan[i].d_Dataa);
        cudaFree(plan[i].d_Data);
        cudaStreamDestroy(plan[i].stream);
    }

    // NOTE(review): diff is never assigned in this excerpt (the CPU reference
    // computation was removed), so this exit status is not meaningful as-is.
    exit((diff < 1e-5) ? EXIT_SUCCESS : EXIT_FAILURE);
}

Have you profiled your application using NVVP?

Do the results show parallel execution of your kernels across the 4 GPUs?

IME, even if the profiler shows parallel execution, it can be tricky to get parallel execution outside of the profiler, but you certainly won’t get it if the profiler is showing serial execution.

Did you forget to show where you're doing cudaDeviceSynchronize or cudaStreamSynchronize before recording the end time?