Multiple host threads over multiple GPUs

Hi, I have a problem.
In my case, I have two GPU cards, gpuA and gpuB. In the host program, I create two threads, threadA and threadB; threadA does its work on gpuA and threadB does its work on gpuB.
If I start the two threads at the same time, each thread runs extremely slowly. This is the slow code.

#include <stdio.h>
#include <thread>
#include <unistd.h>
#include <iostream>
#include <cuda_runtime.h>
#include <vector>
#include <chrono>   // std::chrono is used for the timing below

std::vector<void*> mem_lists;

// Kernel: each thread repeatedly does a read-modify-write on one of the managed allocations.
__global__ void proc(int64_t* ptr[],int len){
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    idx = idx % len;
    for(int i=0;i<1024;++i){
        for(int j=0;j<1024;++j){
            *ptr[idx] += *ptr[idx]; 
        }
    }
}

// Each host thread builds a managed array of pointers into mem_lists,
// then launches the kernel on its GPU and times it.
void proc_thread1(int gpu_id){
    void *ptr;
    auto err = cudaMallocManaged(&ptr,sizeof(void*)*mem_lists.size()/2);
    if(err){
        std::cout << "proc_thread0 " << err << std::endl;
    }
    int64_t** data_ptr = (int64_t**)(ptr);
    for(int i=0; i < mem_lists.size()/2; ++i){
        auto idx = i + 2*gpu_id%2;
        data_ptr[i] = (int64_t*)(mem_lists[idx]);
    }

    cudaSetDevice(gpu_id);
    auto start = std::chrono::high_resolution_clock::now();
    
    proc<<<128,128>>>(data_ptr,mem_lists.size()/2);
    cudaDeviceSynchronize();

    auto stop = std::chrono::high_resolution_clock::now();
    auto span = (std::chrono::duration<double, std::milli>(stop - start)).count();
    std::cout << "gpu id = " << gpu_id;
    std::cout << ", cost :" << span << "(ms)" << std::endl;
}


int main(){
    for(int i=0; i<128; ++i){
        void* ptr=nullptr;
        auto err = cudaMallocManaged(&ptr,8);
        if(err){
            std::cout << "cudaMallocManaged " << err << std::endl;
        }
        mem_lists.push_back(ptr);
    }
    
    
    std::thread th0(&proc_thread1,0);
    std::thread th1(&proc_thread1,1);
    th0.join();
    th1.join();
    return 0;
}

But if I just run one thread, it is extremely fast.
This is the code that runs fast.

#include <stdio.h>
#include <thread>
#include <unistd.h>
#include <iostream>
#include <cuda_runtime.h>
#include <vector>
#include <chrono>   // std::chrono is used for the timing below

std::vector<void*> mem_lists;

__global__ void proc(int64_t* ptr[],int len){
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    idx = idx % len;
    for(int i=0;i<1024;++i){
        for(int j=0;j<1024;++j){
            *ptr[idx] += *ptr[idx]; 
        }
    }
}

void proc_thread1(int gpu_id){
    void *ptr;
    auto err = cudaMallocManaged(&ptr,sizeof(void*)*mem_lists.size()/2);
    if(err){
        std::cout << "proc_thread0 " << err << std::endl;
    }
    int64_t** data_ptr = (int64_t**)(ptr);
    for(int i=0; i < mem_lists.size()/2; ++i){
        auto idx = i + 2*gpu_id%2;
        data_ptr[i] = (int64_t*)(mem_lists[idx]);
    }

    cudaSetDevice(gpu_id);
    auto start = std::chrono::high_resolution_clock::now();
    
    proc<<<128,128>>>(data_ptr,mem_lists.size()/2);
    cudaDeviceSynchronize();

    auto stop = std::chrono::high_resolution_clock::now();
    auto span = (std::chrono::duration<double, std::milli>(stop - start)).count();
    std::cout << "gpu id = " << gpu_id;
    std::cout << ", cost :" << span << "(ms)" << std::endl;
}


int main(){
    for(int i=0; i<128; ++i){
        void* ptr=nullptr;
        auto err = cudaMallocManaged(&ptr,8);
        if(err){
            std::cout << "cudaMallocManaged " << err << std::endl;
        }
        mem_lists.push_back(ptr);
    }
    
    
    std::thread th0(&proc_thread1,0);
    // std::thread th1(&proc_thread1,1);
    th0.join();
    // th1.join();
    return 0;
}

Looking forward to any reply, thanks.

Profiling is often a useful activity to get some ideas about problems like this.

In order to look at this, I would want to know the exact UM setup:

What are the two GPUs?
What is the operating system?
What is the CUDA version?

All of the above information matters in order to accurately predict the behavior of UM (managed memory).
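
As a side note, the relevant UM properties can also be queried directly from a small program. Here is a minimal sketch (my assumption: the attribute queries below, which I believe exist from CUDA 8 onward, are available on your setup):

#include <cstdio>
#include <cuda_runtime.h>

// Print the UM-related attributes that largely determine how managed memory
// behaves on each device (concurrent access / demand paging support).
int main(){
    int ndev = 0;
    cudaGetDeviceCount(&ndev);
    for(int d = 0; d < ndev; ++d){
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, d);
        int concurrent = 0, pageable = 0;
        cudaDeviceGetAttribute(&concurrent, cudaDevAttrConcurrentManagedAccess, d);
        cudaDeviceGetAttribute(&pageable, cudaDevAttrPageableMemoryAccess, d);
        printf("device %d: %s, concurrentManagedAccess=%d, pageableMemoryAccess=%d\n",
               d, prop.name, concurrent, pageable);
    }
    return 0;
}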

I haven’t tried to sort out your very complex indexing. Is it necessary to have arrays of interleaved pointers to demonstrate the issue? Can you simplify this? In the 2-thread/slow case, do you have a situation where two different GPUs are using the same allocation?

Thanks for your reply.
The two GPUs are GTX 1060s,
the operating system is Ubuntu 16.04,
and the CUDA version is 9.0.

I have simplified the sample code and let each thread allocate its unified memory by itself.

#include <stdio.h>
#include <thread>
#include <unistd.h>
#include <iostream>
#include <cuda_runtime.h>
#include <vector>
#include <chrono>   // std::chrono is used for the timing below


__global__ void proc(int64_t* ptr[],int len){
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    idx = idx % len;
    for(int i=0;i<128;++i){
        for(int j=0;j<128;++j){
            *ptr[idx] += *ptr[idx]; 
        }
    }
}

// Each thread makes its own managed allocations, then repeatedly launches the
// kernel on its GPU and reports the elapsed time.
void proc_thread1(int gpu_id){
    void *array_ptr;
    auto err = cudaMallocManaged(&array_ptr,sizeof(void*)*128);
    if(err){
        std::cout << "proc_thread0 " << err << std::endl;
    }
    int64_t** data_ptr = (int64_t**)(array_ptr);
    for(int i=0; i<128; ++i){
        void* ptr=nullptr;
        auto err = cudaMallocManaged(&ptr,8);
        if(err){
            std::cout << "cudaMallocManaged " << err << std::endl;
        }
        data_ptr[i] = (int64_t*)ptr;
    }

    while(true){
        cudaSetDevice(gpu_id);
        auto start = std::chrono::high_resolution_clock::now();
        
        proc<<<128,128>>>(data_ptr,128);
        cudaDeviceSynchronize();

        auto stop = std::chrono::high_resolution_clock::now();
        auto span = (std::chrono::duration<double, std::milli>(stop - start)).count();
        std::cout << "gpu id = " << gpu_id;
        std::cout << ", cost :" << span << "(ms)" << std::endl;
    }
}


int main(){

//    std::thread th0(&proc_thread1,0);
    std::thread th1(&proc_thread1,1);
//    th0.join();
    th1.join();
    return 0;
}

It runs extremely slowly if I start both th0 and th1 in main.

Looking forward to any reply, thanks.

What is the result of running the simpleP2P CUDA sample code on that system?
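
(For reference, the core of what that sample checks is the peer-access query; a stripped-down sketch of just that check might look like this:)

#include <cstdio>
#include <cuda_runtime.h>

// Report whether each pair of GPUs can directly access each other's memory (P2P).
int main(){
    int ndev = 0;
    cudaGetDeviceCount(&ndev);
    for(int a = 0; a < ndev; ++a){
        for(int b = 0; b < ndev; ++b){
            if(a == b) continue;
            int can = 0;
            cudaDeviceCanAccessPeer(&can, a, b);
            printf("peer access GPU%d -> GPU%d : %s\n", a, b, can ? "Yes" : "No");
        }
    }
    return 0;
}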

[./simpleP2P] - Starting...
Checking for multiple GPUs...
CUDA-capable device count: 2

GPU0 = "GeForce GTX 1060 6GB" IS capable of Peer-to-Peer (P2P)
GPU1 = "GeForce GTX 1060 6GB" IS capable of Peer-to-Peer (P2P)

Checking GPU(s) for support of peer to peer memory access...

Peer access from GeForce GTX 1060 6GB (GPU0) -> GeForce GTX 1060 6GB (GPU1) : No
Peer access from GeForce GTX 1060 6GB (GPU1) -> GeForce GTX 1060 6GB (GPU0) : No
Two or more GPUs with SM 2.0 or higher capability are required for ./simpleP2P.
Peer to Peer access is not available amongst GPUs in the system, waiving test.

In my example, I allocate the unified memory inside each thread, so I assumed thread0 would not touch thread1's unified memory, and therefore that no P2P access should be involved.
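
For what it's worth, here is a sketch of how I could make that per-GPU ownership explicit in each thread, by selecting the device before allocating and prefetching the ranges to it (my assumption: the GPUs report concurrentManagedAccess, which cudaMemPrefetchAsync needs):

void proc_thread1(int gpu_id){
    // Select the GPU first so the managed allocations are associated with it.
    cudaSetDevice(gpu_id);

    void* array_ptr = nullptr;
    cudaMallocManaged(&array_ptr, sizeof(void*) * 128);
    int64_t** data_ptr = (int64_t**)array_ptr;
    for(int i = 0; i < 128; ++i){
        void* p = nullptr;
        cudaMallocManaged(&p, 8);
        data_ptr[i] = (int64_t*)p;
        // Hint that this range should live on the GPU that will use it.
        cudaMemPrefetchAsync(p, 8, gpu_id);
    }
    cudaMemPrefetchAsync(array_ptr, sizeof(void*) * 128, gpu_id);
    cudaDeviceSynchronize();

    proc<<<128,128>>>(data_ptr, 128);
    cudaDeviceSynchronize();
}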