Hi,
I am trying implement a kernel for one of our applications in CUDA, for that I am attempting to use CUDA cooperative groups for grid synchronization. The code below re-creates the problem that I am facing. When I run the code below with cuda-memcheck, it gives the misaligned memory address error. Can anyone please shed some light on what I am doing wrong here?
Also, is it possible to use cooperative groups for grid synchronization without unified memory? If I run the same code without unified memory, it crashes. I could not find anything related to this in the programming guide.
Also if anyone could point me towards a tutorial about using cooperative groups for grid synchronization that would be great!
For compilation please use: nvcc --std=c++11 -rdc=true -arch=sm_61 -g -G coop.cu
I am using CUDA 10.2 on Titan X pascal GPU
#include <iostream>
#include <cooperative_groups.h>
#define matSize 2048
#define cudaErrchk(ans) \
{ \
gpuAssert((ans), __FILE__, __LINE__); \
}
inline void
gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
if(code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if(abort)
exit(code);
}
}
__global__ void matSum(int *a, int*b, int*sum){
auto g = cooperative_groups::this_grid();
if(g.thread_rank() < matSize)
sum[g.thread_rank()] = a[g.thread_rank()] + b[g.thread_rank()];
}
int main(){
int *h_test;
int *d_a, *d_b, *d_sum;
cudaDeviceProp deviceProp;
int devID = 0;
cudaGetDeviceProperties(&deviceProp, devID);
if (deviceProp.cooperativeLaunch) {
// if coop groups supported
std::cout<<"supports coop groups"<<std::endl;
}
h_test = new int[matSize];
cudaErrchk(cudaMallocManaged(&d_a, matSize*sizeof(int)));
cudaErrchk(cudaMallocManaged(&d_b, matSize*sizeof(int)));
cudaErrchk(cudaMallocManaged(&d_sum, matSize*sizeof(int)));
cudaErrchk(cudaDeviceSynchronize());
for(int i = 0; i < matSize; i++){
d_a[i] = 10;
d_b[i] = 20;
h_test[i] = d_a[i]+d_b[i];
}
void *kernelArgs[] = {d_a, d_b, d_sum};
cudaErrchk(cudaLaunchCooperativeKernel(
(void*)matSum,
2,
1024,
kernelArgs
));
cudaErrchk(cudaDeviceSynchronize());
//h_sum = *d_sum;
int error = 0;
for(int i = 0; i < matSize; i++){
if(d_sum[i] != h_test[i]){
error++;
}
}
if(error > 0){
std::cout<<"errors:"<<error<<std::endl;
}else{
std::cout<<"no errors"<<std::endl;
}
cudaErrchk(cudaFree(d_a));
cudaErrchk(cudaFree(d_b));
cudaErrchk(cudaFree(d_sum));
}
Thanks.