As an exercise I wrote a CUDA version of the standard subset problem in which a bit mask is used to brute force enumerate all the different subsets out of the 2^n possibilities. The kernel takes in the set size (in this case 30) and the subset size (say 15). The kernel counts the bits in the current index, and if it is equal to the target subset size, then it calculates the sum of values associated with that specific subset

(using a global **constant** array[30] with the values associated with each element index).

Rather than the usual shared memory method I like to use, I instead attempted to used AtomicMax() to return the best answer with AtomicExch() to store the bitmask pattern associated with that optimal answer.

The overall answer for the optimal value has always been correct(when compared to the CPU result), but the mask answer is sometimes off by 1-3 bits.

I really have not used Atomics much before, so I probably am missing some obvious error probably with the AtomicExch() call.

here are the relevant parts of the code;

```
#define THREADS 256
__constant__ int d_cost[30]={344,12,55,3,2,6,5,11,2232,44,77,12,31,41,51,66,777,888,1,22,3,71,12,333,44,66,77,98,70,200};//global device cost array
__device__ int d_cntbt(int n){//count the bits of an int on the device
n=n-((n>>1)&0x55555555);
n=(n&0x33333333)+((n>>2)&0x33333333);
return (((n+(n>>4))&0x0F0F0F0F)*0x01010101)>>24;
}
__global__ void _gpu_subset_test(const int set_size, const int subset_size,int *local_max,int *global_max,int *result_mask,int local_sz){
int idx=threadIdx.x+blockDim.x*blockIdx.x;
int i2=idx;
if(idx>=(1<<set_size))return;//out of range
if(d_cntbt(i2)!=subset_size)return;
int d_temp=0,local_idx=threadIdx.x;
for(int j=0;j<set_size;j++){
if(idx&(1<<j)){
d_temp+=d_cost[j];
}
}
int old_best=atomicMax(&local_max[local_idx],d_temp);//
if(old_best<d_temp){
atomicMax(global_max,d_temp);//update the value
atomicExch(result_mask,idx);//update the mask
}
}
int main(){...
int num_blocks=(1<<setSize)/THREADS;
_gpu_subset_test<<<num_blocks,THREADS>>>(setSize,subsetSize,d_loc,d_mx,d_rez_mask,THREADS);//launch kernel
err=cudaThreadSynchronize();
if(err!=cudaSuccess){printf("%s in %s at line %d
",cudaGetErrorString(err),__FILE__,__LINE__);}
..then copy back results to host
```

the local_max array is of size THREADS. I do use cudaMemset(0) for all the pointers/variables before the call.

Thanks!