Hi, I have a problem.
In my case, I have two gpu cards, gpuA and gpuB, in the host program, I create two threads, threadA, and threadB, I let thread A do the job on gpuA and threadB do the job on gpuB.
If I start the two threads at the same time, each thread runs extremely slow. this is the slowly code.
#include <stdio.h>
#include <thread>
#include <unistd.h>
#include <iostream>
#include <cuda_runtime.h>
#include <vector>
std::vector<void*> mem_lists;
__global__ void proc(int64_t* ptr[],int len){
int idx = blockDim.x * blockIdx.x + threadIdx.x;
idx = idx % len;
for(int i=0;i<1024;++i){
for(int j=0;j<1024;++j){
*ptr[idx] += *ptr[idx];
}
}
}
void proc_thread1(int gpu_id){
void *ptr;
auto err = cudaMallocManaged(&ptr,sizeof(void*)*mem_lists.size()/2);
if(err){
std::cout << "proc_thread0 " << err << std::endl;
}
int64_t** data_ptr = (int64_t**)(ptr);
for(int i=0; i < mem_lists.size()/2; ++i){
auto idx = i + 2*gpu_id%2;
data_ptr[i] = (int64_t*)(mem_lists[idx]);
}
cudaSetDevice(gpu_id);
auto start = std::chrono::high_resolution_clock::now();
proc<<<128,128>>>(data_ptr,mem_lists.size()/2);
cudaDeviceSynchronize();
auto stop = std::chrono::high_resolution_clock::now();
auto span = (std::chrono::duration<double, std::milli>(stop - start)).count();
std::cout << "gpu id = " << gpu_id;
std::cout << ", cost :" << span << "(ms)" << std::endl;
}
int main(){
for(int i=0; i<128; ++i){
void* ptr=nullptr;
auto err = cudaMallocManaged(&ptr,8);
if(err){
std::cout << "cudaMallocManaged " << err << std::endl;
}
mem_lists.push_back(ptr);
}
std::thread th0(&proc_thread1,0);
std::thread th1(&proc_thread1,0);
th0.join();
th1.join();
return 0;
}
but, if I just run one thread, then, It was extremely fast.
this is the code which runs fast.
#include <stdio.h>
#include <thread>
#include <unistd.h>
#include <iostream>
#include <cuda_runtime.h>
#include <vector>
std::vector<void*> mem_lists;
__global__ void proc(int64_t* ptr[],int len){
int idx = blockDim.x * blockIdx.x + threadIdx.x;
idx = idx % len;
for(int i=0;i<1024;++i){
for(int j=0;j<1024;++j){
*ptr[idx] += *ptr[idx];
}
}
}
void proc_thread1(int gpu_id){
void *ptr;
auto err = cudaMallocManaged(&ptr,sizeof(void*)*mem_lists.size()/2);
if(err){
std::cout << "proc_thread0 " << err << std::endl;
}
int64_t** data_ptr = (int64_t**)(ptr);
for(int i=0; i < mem_lists.size()/2; ++i){
auto idx = i + 2*gpu_id%2;
data_ptr[i] = (int64_t*)(mem_lists[idx]);
}
cudaSetDevice(gpu_id);
auto start = std::chrono::high_resolution_clock::now();
proc<<<128,128>>>(data_ptr,mem_lists.size()/2);
cudaDeviceSynchronize();
auto stop = std::chrono::high_resolution_clock::now();
auto span = (std::chrono::duration<double, std::milli>(stop - start)).count();
std::cout << "gpu id = " << gpu_id;
std::cout << ", cost :" << span << "(ms)" << std::endl;
}
int main(){
for(int i=0; i<128; ++i){
void* ptr=nullptr;
auto err = cudaMallocManaged(&ptr,8);
if(err){
std::cout << "cudaMallocManaged " << err << std::endl;
}
mem_lists.push_back(ptr);
}
std::thread th0(&proc_thread1,0);
// std::thread th1(&proc_thread1,0);
th0.join();
// th1.join();
return 0;
}
looking forwards to any reply, thanks.