Hi,
I tried the following example to compute sum of an array using warp aggregated atomics. But it does not give the correct sum. Could anyone please suggest what is the mistake?
Thanks.
#include
#include <cuda.h>
#include <cuda_device_runtime_api.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <math.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cooperative_groups.h>
#include <time.h>
const int SUB_IMAGE_COUNT = 1;
const int NUM_ELEMENTS = 10;
namespace cg = cooperative_groups;
// warp-aggregated atomic increment
device int atomicAggInc(int *counter)
{
cg::coalesced_group active = cg::coalesced_threads();
int mask = active.ballot(1);
// select the leader
int leader = __ffs(mask) - 1;
// leader does the update
int res = 0;
if (active.thread_rank() == leader)
{
res = atomicAdd(counter, __popc(mask));
}
// broadcast result
res = active.shfl(res, leader);
// each thread computes its own value
return res + __popc(mask & ((1 << active.thread_rank()) - 1));
}
global void sum_approach2(double *dst, int *nres, const double *src)
{
int id = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = id; i < NUM_ELEMENTS; i += gridDim.x*blockDim.x)
{
if(src[i] > 0)
dst[atomicAggInc(nres)] = src[i];
}
}
int main()
{
FILE A_f = fopen(“sample.txt”, “r”);
int i;
double h_A = (double)malloc(NUM_ELEMENTS sizeof(double));
if (A_f == NULL) { return 1; }
for (i = 0;i<NUM_ELEMENTS;++i) {
fscanf(A_f, “%lf”, &h_A[i]);
}
for (i = 0;i<NUM_ELEMENTS;++i) {
printf("\n h_A[%d]=%0.15f \n",i,h_A[i]);
}
double h_sum2 = (double)malloc(SUB_IMAGE_COUNT* sizeof(double));
for (i = 0;i<SUB_IMAGE_COUNT;++i) {
h_sum2[i]=0.0f;
}
int h_nres=0;
int *d_nres;
cudaMalloc(&d_nres, sizeof(int));
cudaMemset(d_nres, 0, sizeof(int));
double d_A,d_sum2;
cudaMalloc(&d_A, NUM_ELEMENTS sizeof(double));
cudaMalloc(&d_sum2, SUB_IMAGE_COUNT sizeof(double));
cudaMemcpy(d_A, h_A, NUM_ELEMENTS* sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_sum2, h_sum2, SUB_IMAGE_COUNT* sizeof(double), cudaMemcpyHostToDevice);
sum_approach2<<<1, 10>>>(d_sum2, d_nres, d_A);
cudaDeviceSynchronize();
cudaMemcpy(&h_nres, d_nres, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(h_sum2,d_sum2, SUB_IMAGE_COUNT* sizeof(double), cudaMemcpyDeviceToHost);
// Generate host output with host filtering code.
int host_flt_count=0;
for (int i=0; i < NUM_ELEMENTS; i++)
{
if (h_A[i] > 0)
{
h_A[host_flt_count++] = h_A[i];
}
}
printf("\nWarp Aggregated Atomics %s \n", host_flt_count == h_nres ? “PASSED” : “FAILED”);
printf("\n Approach 2: Final Sum = %0.15f \n", h_sum2[0]);
return 0;
}
==================
Sample input:
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
===================
output got is:
Approach 2: Final Sum = 1.00000