Multiple host threads over multiple GPUs

Hi, I have a problem.
In my case, I have two GPU cards, gpuA and gpuB. In the host program, I create two threads, threadA and threadB; threadA does its work on gpuA and threadB does its work on gpuB.
If I start the two threads at the same time, each thread runs extremely slowly. This is the slow code.

#include <stdio.h>
#include <thread>
#include <unistd.h>
#include <iostream>
#include <cuda_runtime.h>
#include <vector>
#include <chrono>   // std::chrono is used for the timing below

std::vector<void*> mem_lists;

// Kernel: each thread repeatedly does a read-modify-write on one of the managed allocations.
__global__ void proc(int64_t* ptr[],int len){
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    idx = idx % len;
    for(int i=0;i<1024;++i){
        for(int j=0;j<1024;++j){
            *ptr[idx] += *ptr[idx]; 
        }
    }
}

// Each host thread builds a managed array of pointers into mem_lists,
// then launches the kernel on its GPU and times it.
void proc_thread1(int gpu_id){
    void *ptr;
    auto err = cudaMallocManaged(&ptr,sizeof(void*)*mem_lists.size()/2);
    if(err){
        std::cout << "proc_thread0 " << err << std::endl;
    }
    int64_t** data_ptr = (int64_t**)(ptr);
    for(int i=0; i < mem_lists.size()/2; ++i){
        auto idx = i + 2*gpu_id%2;
        data_ptr[i] = (int64_t*)(mem_lists[idx]);
    }

    cudaSetDevice(gpu_id);
    auto start = std::chrono::high_resolution_clock::now();
    
    proc<<<128,128>>>(data_ptr,mem_lists.size()/2);
    cudaDeviceSynchronize();

    auto stop = std::chrono::high_resolution_clock::now();
    auto span = (std::chrono::duration<double, std::milli>(stop - start)).count();
    std::cout << "gpu id = " << gpu_id;
    std::cout << ", cost :" << span << "(ms)" << std::endl;
}


int main(){
    for(int i=0; i<128; ++i){
        void* ptr=nullptr;
        auto err = cudaMallocManaged(&ptr,8);
        if(err){
            std::cout << "cudaMallocManaged " << err << std::endl;
        }
        mem_lists.push_back(ptr);
    }
    
    
    std::thread th0(&proc_thread1,0);
    std::thread th1(&proc_thread1,1);
    th0.join();
    th1.join();
    return 0;
}

But if I just run one thread, it is extremely fast.
This is the code that runs fast.

#include <stdio.h>
#include <thread>
#include <unistd.h>
#include <iostream>
#include <cuda_runtime.h>
#include <vector>
#include <chrono>   // std::chrono is used for the timing below

std::vector<void*> mem_lists;

__global__ void proc(int64_t* ptr[],int len){
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    idx = idx % len;
    for(int i=0;i<1024;++i){
        for(int j=0;j<1024;++j){
            *ptr[idx] += *ptr[idx]; 
        }
    }
}

void proc_thread1(int gpu_id){
    void *ptr;
    auto err = cudaMallocManaged(&ptr,sizeof(void*)*mem_lists.size()/2);
    if(err){
        std::cout << "proc_thread0 " << err << std::endl;
    }
    int64_t** data_ptr = (int64_t**)(ptr);
    for(int i=0; i < mem_lists.size()/2; ++i){
        auto idx = i + 2*gpu_id%2;
        data_ptr[i] = (int64_t*)(mem_lists[idx]);
    }

    cudaSetDevice(gpu_id);
    auto start = std::chrono::high_resolution_clock::now();
    
    proc<<<128,128>>>(data_ptr,mem_lists.size()/2);
    cudaDeviceSynchronize();

    auto stop = std::chrono::high_resolution_clock::now();
    auto span = (std::chrono::duration<double, std::milli>(stop - start)).count();
    std::cout << "gpu id = " << gpu_id;
    std::cout << ", cost :" << span << "(ms)" << std::endl;
}


int main(){
    for(int i=0; i<128; ++i){
        void* ptr=nullptr;
        auto err = cudaMallocManaged(&ptr,8);
        if(err){
            std::cout << "cudaMallocManaged " << err << std::endl;
        }
        mem_lists.push_back(ptr);
    }
    
    
    std::thread th0(&proc_thread1,0);
    // std::thread th1(&proc_thread1,1);
    th0.join();
    // th1.join();
    return 0;
}

Looking forward to any reply, thanks.

Profiling is often a useful activity to get some ideas about problems like this.

In order to look at this, I would want to know the exact UM setup:

What are the two GPUs?
What is the operating system?
What is the CUDA version?

All of the above information matters in order to accurately predict the behavior of UM (managed memory).
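
As a side note, the relevant UM properties can also be queried directly from a small program. Here is a minimal sketch (my assumption: the attribute queries below, which I believe exist from CUDA 8 onward, are available on your setup):

#include <cstdio>
#include <cuda_runtime.h>

// Print the UM-related attributes that largely determine how managed memory
// behaves on each device (concurrent access / demand paging support).
int main(){
    int ndev = 0;
    cudaGetDeviceCount(&ndev);
    for(int d = 0; d < ndev; ++d){
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, d);
        int concurrent = 0, pageable = 0;
        cudaDeviceGetAttribute(&concurrent, cudaDevAttrConcurrentManagedAccess, d);
        cudaDeviceGetAttribute(&pageable, cudaDevAttrPageableMemoryAccess, d);
        printf("device %d: %s, concurrentManagedAccess=%d, pageableMemoryAccess=%d\n",
               d, prop.name, concurrent, pageable);
    }
    return 0;
}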

I haven’t tried to sort out your very complex indexing. Is it necessary to have arrays of interleaved pointers to demonstrate the issue? Can you simplify this? In the 2-thread/slow case, do you have a situation where two different GPUs are using the same allocation?

Thanks for your reply.
The two GPUs are GTX 1060s,
the operating system is Ubuntu 16.04,
and the CUDA version is 9.0.

I have simplified the sample code and let each thread allocate its unified memory by itself.

#include <stdio.h>
#include <thread>
#include <unistd.h>
#include <iostream>
#include <cuda_runtime.h>
#include <vector>
#include <chrono>   // std::chrono is used for the timing below


__global__ void proc(int64_t* ptr[],int len){
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    idx = idx % len;
    for(int i=0;i<128;++i){
        for(int j=0;j<128;++j){
            *ptr[idx] += *ptr[idx]; 
        }
    }
}

// Each thread makes its own managed allocations, then repeatedly launches the
// kernel on its GPU and reports the elapsed time.
void proc_thread1(int gpu_id){
    void *array_ptr;
    auto err = cudaMallocManaged(&array_ptr,sizeof(void*)*128);
    if(err){
        std::cout << "proc_thread0 " << err << std::endl;
    }
    int64_t** data_ptr = (int64_t**)(array_ptr);
    for(int i=0; i<128; ++i){
        void* ptr=nullptr;
        auto err = cudaMallocManaged(&ptr,8);
        if(err){
            std::cout << "cudaMallocManaged " << err << std::endl;
        }
        data_ptr[i] = (int64_t*)ptr;
    }

    while(true){
        cudaSetDevice(gpu_id);
        auto start = std::chrono::high_resolution_clock::now();
        
        proc<<<128,128>>>(data_ptr,128);
        cudaDeviceSynchronize();

        auto stop = std::chrono::high_resolution_clock::now();
        auto span = (std::chrono::duration<double, std::milli>(stop - start)).count();
        std::cout << "gpu id = " << gpu_id;
        std::cout << ", cost :" << span << "(ms)" << std::endl;
    }
}


int main(){

//    std::thread th0(&proc_thread1,0);
    std::thread th1(&proc_thread1,1);
//    th0.join();
    th1.join();
    return 0;
}

It runs extremely slowly if I start both th0 and th1 in main.

Looking forward to any reply, thanks.

What is the result of running the simpleP2P CUDA sample code on that system?
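
(For reference, the core of what that sample checks is the peer-access query; a stripped-down sketch of just that check might look like this:)

#include <cstdio>
#include <cuda_runtime.h>

// Report whether each pair of GPUs can directly access each other's memory (P2P).
int main(){
    int ndev = 0;
    cudaGetDeviceCount(&ndev);
    for(int a = 0; a < ndev; ++a){
        for(int b = 0; b < ndev; ++b){
            if(a == b) continue;
            int can = 0;
            cudaDeviceCanAccessPeer(&can, a, b);
            printf("peer access GPU%d -> GPU%d : %s\n", a, b, can ? "Yes" : "No");
        }
    }
    return 0;
}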

[./simpleP2P] - Starting...
Checking for multiple GPUs...
CUDA-capable device count: 2

GPU0 = "GeForce GTX 1060 6GB" IS capable of Peer-to-Peer (P2P)
GPU1 = "GeForce GTX 1060 6GB" IS capable of Peer-to-Peer (P2P)

Checking GPU(s) for support of peer to peer memory access...

Peer access from GeForce GTX 1060 6GB (GPU0) -> GeForce GTX 1060 6GB (GPU1) : No
Peer access from GeForce GTX 1060 6GB (GPU1) -> GeForce GTX 1060 6GB (GPU0) : No
Two or more GPUs with SM 2.0 or higher capability are required for ./simpleP2P.
Peer to Peer access is not available amongst GPUs in the system, waiving test.

In my example, I allocate the unified memory inside each thread, so I assumed thread0 would not touch thread1's unified memory, and therefore that no P2P access should be involved.
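
For what it's worth, here is a sketch of how I could make that per-GPU ownership explicit in each thread, by selecting the device before allocating and prefetching the ranges to it (my assumption: the GPUs report concurrentManagedAccess, which cudaMemPrefetchAsync needs):

void proc_thread1(int gpu_id){
    // Select the GPU first so the managed allocations are associated with it.
    cudaSetDevice(gpu_id);

    void* array_ptr = nullptr;
    cudaMallocManaged(&array_ptr, sizeof(void*) * 128);
    int64_t** data_ptr = (int64_t**)array_ptr;
    for(int i = 0; i < 128; ++i){
        void* p = nullptr;
        cudaMallocManaged(&p, 8);
        data_ptr[i] = (int64_t*)p;
        // Hint that this range should live on the GPU that will use it.
        cudaMemPrefetchAsync(p, 8, gpu_id);
    }
    cudaMemPrefetchAsync(array_ptr, sizeof(void*) * 128, gpu_id);
    cudaDeviceSynchronize();

    proc<<<128,128>>>(data_ptr, 128);
    cudaDeviceSynchronize();
}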