Why does the following multi-GPU code run faster when I set GPU_N=1 than when GPU_N=4?

I am working on the simpleMultiGPU CUDA sample. I removed the CPU parts to shorten the code. I have a server with 4 graphics cards, and I want to verify that using only one card is slower than using all 4. But when I measure the running time, the run with 1 GPU is faster than the run with 4 GPUs. Could you please help me find my mistake or misunderstanding? I should mention that I use the loop "for (int k = 0; k < 100000; k++)" to increase the computation time, so that the impact of the computational cost is visible.

// System includes
#include <stdio.h>
#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include <ctime>

// CUDA runtime
#include <cuda_runtime.h>


#ifndef MAX
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
#endif

#include "Header.h"

////////////////////////////////////////////////////////////////////////////////
// Data configuration
////////////////////////////////////////////////////////////////////////////////
const int MAX_GPU_COUNT = 32;
const int DATA_N = 128000003;

////////////////////////////////////////////////////////////////////////////////
// Simple reduction kernel.
// Refer to the 'reduction' CUDA Sample describing
// reduction optimization strategies
////////////////////////////////////////////////////////////////////////////////

__global__ static void reduceKernel(float *d_Result, float *d_Input, float *d_Inputt, int N)
{
    const int     tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int threadN = gridDim.x * blockDim.x;
    float sum = 0;

    //Grid-stride loop: each of the threadN threads accumulates a partial
    //sum of the element-wise sums d_Input[pos] + d_Inputt[pos]
    for (int pos = tid; pos < N; pos += threadN)
        sum += d_Input[pos] + d_Inputt[pos];

    //One partial sum per thread (ACCUM_N values in total)
    d_Result[tid] = sum;
}


////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    clock_t time_req;

    //Solver config
    TGPUplan      plan[MAX_GPU_COUNT];

    //GPU reduction results
    float     h_SumGPU[MAX_GPU_COUNT];

    int i, j, GPU_N;

    //more accurate, slow
    const int  BLOCK_N = 4;
    const int THREAD_N = 16;

    //less accurate, fast
    //const int  BLOCK_N = 128;
    //const int THREAD_N = 512;

    const int  ACCUM_N = BLOCK_N * THREAD_N;

    printf("Starting simpleMultiGPU\n");
    cudaGetDeviceCount(&GPU_N);

    if (GPU_N > MAX_GPU_COUNT)
    {
        GPU_N = MAX_GPU_COUNT;
    }

    //GPU_N = 1;   //uncomment to force a single GPU

    printf("CUDA-capable device count: %i\n", GPU_N);

    printf("Generating input data...\n\n");

    //Subdividing input data across GPUs
    //Get data sizes for each GPU
    for (i = 0; i < GPU_N; i++)
    {
        plan[i].dataN = DATA_N / GPU_N;
    }

    //Take into account "odd" data sizes: distribute the remainder by giving
    //one extra element to each of the first DATA_N % GPU_N GPUs.
    //For example, if DATA_N is 11 and GPU_N is 2, each GPU first gets 5
    //elements and then GPU 0 gets 1 more.
    for (i = 0; i < DATA_N % GPU_N; i++)
    {
        plan[i].dataN++;
    }

    //Assign each GPU a slot for its final result
    for (i = 0; i < GPU_N; i++)
    {
        plan[i].h_Sum = h_SumGPU + i;
    }

    //Create streams for issuing GPU commands asynchronously and allocate memory (GPU and system page-locked)
    for (i = 0; i < GPU_N; i++)
    {
        cudaSetDevice(i);
        cudaStreamCreate(&plan[i].stream);
        //Allocate memory

        //plan[i].d_Data is GPU i's input array with plan[i].dataN elements;
        //in fact we have plan[i].d_Data[0 .. plan[i].dataN - 1]
        cudaMalloc((void **)&plan[i].d_Data, plan[i].dataN * sizeof(float));
        cudaMalloc((void **)&plan[i].d_Dataa, plan[i].dataN * sizeof(float));
        cudaMalloc((void **)&plan[i].d_Sum, ACCUM_N * sizeof(float));
        cudaMallocHost((void **)&plan[i].h_Sum_from_device, ACCUM_N * sizeof(float));
        cudaMallocHost((void **)&plan[i].h_Data, plan[i].dataN * sizeof(float));
        cudaMallocHost((void **)&plan[i].h_Dataa, plan[i].dataN * sizeof(float));

        //Fill both inputs with ones, so each element contributes 1.0 + 1.0
        for (j = 0; j < plan[i].dataN; j++)
        {
            plan[i].h_Data[j] = 1.0;
            plan[i].h_Dataa[j] = 1.0;
        }
    }

    //Start timing and compute on GPU(s)
    printf("Computing with %d GPUs...\n", GPU_N);

    time_req = clock();

    //Copy data to the GPUs, launch the kernels, and read the results back, all asynchronously
    for (i = 0; i < GPU_N; i++)
    {
        //Set device
        cudaSetDevice(i);

        //Copy input data from CPU
        cudaMemcpyAsync(plan[i].d_Data, plan[i].h_Data, plan[i].dataN * sizeof(float), cudaMemcpyHostToDevice, plan[i].stream);
        cudaMemcpyAsync(plan[i].d_Dataa, plan[i].h_Dataa, plan[i].dataN * sizeof(float), cudaMemcpyHostToDevice, plan[i].stream);

        //Perform GPU computations: launch the kernel 100000 times
        //to increase the computational load
        for (int k = 0; k < 100000; k++)
        {
            reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(plan[i].d_Sum, plan[i].d_Data, plan[i].d_Dataa, plan[i].dataN);
        }


        //Read back GPU results
        cudaMemcpyAsync(plan[i].h_Sum_from_device, plan[i].d_Sum, ACCUM_N *sizeof(float), cudaMemcpyDeviceToHost, plan[i].stream);
    }
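
    //Note: in the original simpleMultiGPU sample the partial sums are
    //combined on the host after waiting for each stream, roughly like
    //this (a sketch of the part I removed together with the CPU code):
    //for (i = 0; i < GPU_N; i++)
    //{
    //    cudaSetDevice(i);
    //    cudaStreamSynchronize(plan[i].stream);
    //    float sum = 0;
    //    for (j = 0; j < ACCUM_N; j++)
    //        sum += plan[i].h_Sum_from_device[j];
    //    *(plan[i].h_Sum) = sum;
    //}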

    printf("************GPU time:%f \n", (float)(clock() - time_req) / CLOCKS_PER_SEC);


    system("pause");
    //the CPU reference computation that set "diff" was removed, so just exit
    exit(EXIT_SUCCESS);
}
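
I should also mention that there is no CUDA error checking in the listing above (the original sample wraps every call in checkCudaErrors from helper_cuda.h). In case it matters, a minimal check I could wrap the calls in would be something like this (a sketch, not part of the original sample):

#define CHECK_CUDA(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

//usage: CHECK_CUDA(cudaSetDevice(i));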