All non-zero element indexes

Hi! My task is to find the indexes of all non-zero elements in the vector oldrow = {0,0,4,0,5,6,0,1}.
n = 8
The expected answer is oldrowsparse = {2,4,5,7},
with the number of non-zero elements nzr = 4.
But my solution gives {0,0,0,0,18}, i.e. the sum of the indexes ends up in the (nzr+1)-th position.
The value nzr = 4 is correct.

// Kernel from the question. Its intent, per the surrounding text, is to gather
// the indexes of the non-zero entries of oldrow[0 .. *n) into oldrowsparse and
// to store how many there are in *nzr. As written it does not produce that
// list; the NOTE(review) comments below mark the statements responsible.
//
// oldrow       - input values
// n            - pointer to the number of elements in oldrow
// oldrowsparse - output; the kernel ADDS into it, it never plainly stores
// nzr          - output; receives the number of non-zero elements counted
//
// NOTE(review): `global` / `shared` are presumably CUDA's `__global__` /
// `__shared__` qualifiers whose underscores were lost in formatting - confirm.
global void find_nonzero_oldrow_sparse_fast(double* oldrow, int* n, int* oldrowsparse, int* nzr)
{
int tid = threadIdx.x; // Local thread index
int idx = blockIdx.x * blockDim.x + threadIdx.x; // Global thread index
// Scratch arrays indexed by tid.
// NOTE(review): the fixed size assumes blockDim.x <= 256 - confirm launch config.
shared double values[256];
shared int indexes[256];
shared int result[256];
shared int cnt[1];
// Every thread stores 0 here; the __syncthreads() below comes before the first atomicAdd.
cnt[0] = 0;
// Stage this thread's element and its global index (both 0 when idx is past the end).
values[tid] = (idx < (*n)) ? (oldrow[idx]) : 0;
indexes[tid] = (idx < (*n)) ? idx : 0;
result[tid] = 0;
__syncthreads();
// Stride equal to the total number of launched threads.
int offset = blockDim.x * gridDim.x;
// NOTE(review): idx advances by offset, but values[tid] / indexes[tid] were
// loaded once above and are not refreshed inside the loop, so every iteration
// re-tests the same element.
while (idx < *n)
{
if (values[tid] != 0)
{
// Count this non-zero element; the value returned by atomicAdd is discarded.
atomicAdd(&cnt[0], 1);
// NOTE(review): the slot is taken from a fresh read of cnt[0], not from the
// value returned by the atomicAdd above, so it depends on how many other
// threads have already incremented the counter. This is consistent with the
// reported output {0,0,0,0,18}: 2+4+5+7 = 18 accumulated in result[4].
// Presumably the returned (old) counter value was meant to be the slot - confirm.
atomicAdd(&result[cnt[0]], indexes[tid]);
}
idx += offset;
}
// Barrier before result[] and cnt[0] are read back.
__syncthreads();
// Adds result[tid] into oldrowsparse[tid]; executed by every thread, with no
// check of tid against *n.
// NOTE(review): assumes oldrowsparse is zero-initialised and has at least
// blockDim.x elements - confirm with the caller.
atomicAdd(&(oldrowsparse[tid]), result[tid]);
// The thread with tid == 0 publishes the count.
// NOTE(review): with several blocks, each block's thread 0 writes *nzr.
if (tid == 0)
{
*nzr = cnt[0];
}
}

You should look at stream compaction.
Thrust can do what you need; here is a simple example:

#include <iostream>
#include <iterator>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/copy.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>

int main()
{
// this example computes indices for all the nonzero values in a sequence

// sequence of zero and nonzero values
thrust::device_vector stencil(8);
stencil[0] = 0;
stencil[1] = 0;
stencil[2] = 4;
stencil[3] = 0;
stencil[4] = 5;
stencil[5] = 6;
stencil[6] = 0;
stencil[7] = 1;

thrust::copy(stencil.begin(), stencil.end(), std::ostream_iterator(std::cout, " "));
std::cout<< "\n ";

// storage for the nonzero indices
thrust::device_vector indices(8);

// compute indices of nonzero elements
typedef thrust::device_vector::iterator IndexIterator;

// use make_counting_iterator to define the sequence [0, 8)
IndexIterator indices_end = thrust::copy_if(thrust::make_counting_iterator(0),
thrust::make_counting_iterator(8),
stencil.begin(),
indices.begin(),
thrust::identity());
// indices now contains [2,4,5,7]

thrust::copy(indices.begin(), indices.end(), std::ostream_iterator(std::cout, " "));
std::cout<< "\n ";

return 0
}

nvcc -std=c++14 ex.cu
./a.out

0 0 4 0 5 6 0 1
2 4 5 7 0 0 0 0