Hi! I have task to find all non-zero indexes in vector oldrow {0,0,4,0,5,6,0,1}
n=8
oldrowsparse must have answer{2,4,5,7}
num non-zero nzr = 4
but my solution give {0,0,0,0,18} i.e their sum in nzr+1 position
nzr = 4 is correct
global void find_nonzero_oldrow_sparse_fast(double* oldrow, int* n, int* oldrowsparse, int* nzr)
{
int tid = threadIdx.x; // Local thread index
int idx = blockIdx.x * blockDim.x + threadIdx.x; // Global thread index
shared double values[256];
shared int indexes[256];
shared int result[256];
shared int cnt[1];
cnt[0] = 0;
values[tid] = (idx < (*n)) ? (oldrow[idx]) : 0;
indexes[tid] = (idx < (*n)) ? idx : 0;
result[tid] = 0;
__syncthreads();
int offset = blockDim.x * gridDim.x;
while (idx < *n)
{
if (values[tid] != 0)
{
atomicAdd(&cnt[0], 1);
atomicAdd(&result[cnt[0]], indexes[tid]);
}
idx += offset;
}
__syncthreads();
atomicAdd(&(oldrowsparse[tid]), result[tid]);
if (tid == 0)
{
*nzr = cnt[0];
}
}