address is misaligned error when using cooperative groups for grid synch

I am trying implement a kernel for one of our applications in CUDA, for that I am attempting to use CUDA cooperative groups for grid synchronization. The code below re-creates the problem that I am facing. When I run the code below with cuda-memcheck, it gives the misaligned memory address error. Can anyone please shed some light on what I am doing wrong here?

Also, is it possible to use cooperative groups for grid synchronization without unified memory? If I run the same code without unified memory, it crashes. I could not find anything related to this in the programming guide.

Also if anyone could point me towards a tutorial about using cooperative groups for grid synchronization that would be great!

For compilation please use: nvcc --std=c++11 -rdc=true -arch=sm_61 -g -G

I am using CUDA 10.2 on Titan X pascal GPU

#include <iostream>
#include <cooperative_groups.h>

#define matSize 2048
#define cudaErrchk(ans)                                                                  \
    {                                                                                    \
        gpuAssert((ans), __FILE__, __LINE__);                                            \
inline void
gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
    if(code != cudaSuccess)
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);

__global__ void matSum(int *a, int*b, int*sum){
auto g = cooperative_groups::this_grid();
if(g.thread_rank() < matSize)
  sum[g.thread_rank()] = a[g.thread_rank()] + b[g.thread_rank()];

int main(){

int *h_test;
int *d_a, *d_b, *d_sum;

cudaDeviceProp deviceProp;
int devID = 0;
cudaGetDeviceProperties(&deviceProp, devID);
if (deviceProp.cooperativeLaunch) {
  // if coop groups supported
  std::cout<<"supports coop groups"<<std::endl;

h_test = new int[matSize];
cudaErrchk(cudaMallocManaged(&d_a, matSize*sizeof(int)));
cudaErrchk(cudaMallocManaged(&d_b, matSize*sizeof(int)));
cudaErrchk(cudaMallocManaged(&d_sum, matSize*sizeof(int)));

for(int i = 0; i < matSize; i++){
  d_a[i] = 10;
  d_b[i] = 20;
  h_test[i] = d_a[i]+d_b[i];

void *kernelArgs[] = {d_a, d_b, d_sum};
//h_sum = *d_sum;

int error = 0;
for(int i = 0; i < matSize; i++){
  if(d_sum[i] != h_test[i]){

if(error > 0){
  std::cout<<"no errors"<<std::endl;


Your kernel args are wrong. Study any of the CUDA sample codes that do a CG grid group.

You should change this:

void *kernelArgs[] = {d_a, d_b, d_sum};

to this:

void *kernelArgs[] = {&d_a, &d_b, &d_sum};

Yes, it’s possible to use cooperative groups, with grid synchronization, without using unified memory.

Thank you. That was a dumb mistake on my end.