warp aggregated atomics result

Hi,

I tried the following example to compute the sum of an array using warp-aggregated atomics, but it does not give the correct sum. Could anyone please point out the mistake?

Thanks.

#include <stdio.h>
#include <cuda.h>
#include <cuda_device_runtime_api.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <math.h>
#include <string.h>
#include <cooperative_groups.h>
#include <time.h>

const int SUB_IMAGE_COUNT = 1;
const int NUM_ELEMENTS = 10;

namespace cg = cooperative_groups;

// warp-aggregated atomic increment
__device__ int atomicAggInc(int *counter)
{
    cg::coalesced_group active = cg::coalesced_threads();

    int mask = active.ballot(1);
    // select the leader
    int leader = __ffs(mask) - 1;

    // leader does the update
    int res = 0;
    if (active.thread_rank() == leader)
    {
        res = atomicAdd(counter, __popc(mask));
    }

    // broadcast result
    res = active.shfl(res, leader);

    // each thread computes its own value
    return res + __popc(mask & ((1 << active.thread_rank()) - 1));
}

__global__ void sum_approach2(double *dst, int *nres, const double *src)
{
    int id = threadIdx.x + blockIdx.x * blockDim.x;

    for (int i = id; i < NUM_ELEMENTS; i += gridDim.x * blockDim.x)
    {
        if (src[i] > 0)
            dst[atomicAggInc(nres)] = src[i];
    }
}

int main()
{
    FILE *A_f = fopen("sample.txt", "r");
    int i;
    double *h_A = (double *)malloc(NUM_ELEMENTS * sizeof(double));
    if (A_f == NULL) { return 1; }
    for (i = 0; i < NUM_ELEMENTS; ++i) {
        fscanf(A_f, "%lf", &h_A[i]);
    }

    for (i = 0; i < NUM_ELEMENTS; ++i) {
        printf("\n h_A[%d]=%0.15f \n", i, h_A[i]);
    }

    double *h_sum2 = (double *)malloc(SUB_IMAGE_COUNT * sizeof(double));

    for (i = 0; i < SUB_IMAGE_COUNT; ++i) {
        h_sum2[i] = 0.0;
    }

    int h_nres = 0;
    int *d_nres;
    cudaMalloc(&d_nres, sizeof(int));
    cudaMemset(d_nres, 0, sizeof(int));

    double *d_A, *d_sum2;
    cudaMalloc(&d_A, NUM_ELEMENTS * sizeof(double));
    cudaMalloc(&d_sum2, SUB_IMAGE_COUNT * sizeof(double));

    cudaMemcpy(d_A, h_A, NUM_ELEMENTS * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_sum2, h_sum2, SUB_IMAGE_COUNT * sizeof(double), cudaMemcpyHostToDevice);

    sum_approach2<<<1, 10>>>(d_sum2, d_nres, d_A);
    cudaDeviceSynchronize();

    cudaMemcpy(&h_nres, d_nres, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_sum2, d_sum2, SUB_IMAGE_COUNT * sizeof(double), cudaMemcpyDeviceToHost);

    // Generate host output with host filtering code.
    int host_flt_count = 0;
    for (int i = 0; i < NUM_ELEMENTS; i++)
    {
        if (h_A[i] > 0)
        {
            h_A[host_flt_count++] = h_A[i];
        }
    }

    printf("\nWarp Aggregated Atomics %s \n", host_flt_count == h_nres ? "PASSED" : "FAILED");

    printf("\n Approach 2: Final Sum = %0.15f \n", h_sum2[0]);

    return 0;
}

==================

Sample input:
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000

===================

The output I get is:

Approach 2: Final Sum = 1.00000

Any time you’re having trouble with a CUDA code, it’s good practice to use proper CUDA error checking, and also to run your code with cuda-memcheck.

When I run your code with cuda-memcheck, I get a variety of invalid global write errors, indicating that your kernel is attempting to write data out-of-bounds.

I would start by debugging that. You can localize these types of errors to a specific line of code by following the methodology here:

https://stackoverflow.com/questions/27277365/unspecified-launch-failure-on-memcpy/27278218#27278218
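For reference, here is a minimal error-checking sketch (the checkCuda macro name is just an illustration, not something from your code). Wrapping every runtime API call and checking the kernel launch will usually point straight at the failing call:

#define checkCuda(call) \
    do { \
        cudaError_t err = (call); \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA error %s at %s:%d\n", \
                    cudaGetErrorString(err), __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)

// wrap the runtime calls and check the kernel launch:
checkCuda(cudaMalloc(&d_nres, sizeof(int)));
sum_approach2<<<1, 10>>>(d_sum2, d_nres, d_A);
checkCuda(cudaGetLastError());        // reports launch configuration errors
checkCuda(cudaDeviceSynchronize());   // reports errors that occur during kernel execution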

Hi sreeram,

The example you show seems right, but it is not computing a sum. Instead, the code copies positive elements from src to dst in undefined order, and ignores negative and zero elements. It is essentially a parallel version of this loop:

int nres = 0;
for (int i = 0; i < NUM_ELEMENTS; i++)
{
  if(src[i] > 0)
    dst[nres++] = src[i];
}

To compute a sum using warp-aggregated / warp-synchronous primitives, you would have to perform a parallel reduction with shuffles. You can find an example under the ‘Warp-level collectives’ section of this blog post: https://devblogs.nvidia.com/parallelforall/cooperative-groups/
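As a rough illustration, here is a minimal sketch of such a reduction with cooperative groups (the kernel name warp_sum is just illustrative, and atomicAdd on double assumes a device of compute capability 6.0 or newer):

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void warp_sum(double *out, const double *src, int n)
{
    cg::thread_block block = cg::this_thread_block();
    cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);

    // each thread accumulates a partial sum over a grid-stride range
    double v = 0.0;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
        v += src[i];

    // tree reduction with shuffles: after log2(32) steps, lane 0 holds the warp total
    for (int offset = warp.size() / 2; offset > 0; offset /= 2)
        v += warp.shfl_down(v, offset);

    // one atomic per warp combines the warp totals into the final sum
    if (warp.thread_rank() == 0)
        atomicAdd(out, v);
}

// usage (assuming the buffers from your code):
//   cudaMemset(d_sum2, 0, sizeof(double));
//   warp_sum<<<1, 32>>>(d_sum2, d_A, NUM_ELEMENTS);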