How to put specific elements from one array to another array use CUDA?

void Filter(int *array,int *part_array,int array_size)
{
    int index=0;
    for(int i=0;i<array_size;i++)
    {
        if(array[i]>M)//M is a rule
        {
            part_array[index] = array[i];
            index++; 
        }
    }
}

This is my host function. I want to change it to a cuda kernel function.
I don’t know how to create the kernel function and i don’t want to use thrust. thanks for reply!

This problem is known as parallel stream compaction. You can find many resources about it on the internet.
The general approach is to create a flag array indicating which elements to copy. Then create a prefix sum of the flag array to find the output positions of elements to copy. Last, perform the copy.

Do you want to avoid external libraries in general, or only Thrust? What about cub?

Thanks for your reply! I don’t want to use any external libraries, just write a kernel function.

If the ordering of the results in part_array are not important, you may find this helpful:

The canonical method for stream compaction uses a prefix sum followed by an indexed copy. For the overall stream compaction operation, the general idea is mapped out here.

You can get started learning how to write a prefix sum here. An indexed copy is a fairly easy operation to write.

In practice, writing a prefix sum is an operation I wouldn’t want to write myself, although I probably could with enough effort. For this reason I suspect many folks would prefer to use a library implementation. If you want to write your own prefix sum, a typical approach will have multiple steps. You’ll do a block-level prefix sum on tiles of data in the first step, and in the second step you’ll do a “global” prefix-sum (perhaps using a single block) to provide the “offsets” for each of the block-level sums. Finally, you will “fix up” the block level prefix sums using the global offsets.

Good luck!

Here is a code example that follows the steps I indicated, using a prefix sum algorithm derived from the example given in GPU gems that I previously linked. I strongly encourage anyone to use thrust or cub instead.


#include <iostream>
#include <cstdio>

// a block-level exclusive prefix scan/sum
template <typename T>
__global__ void prescan(T *g_odata, T *g_idata, T *g_bdata, int n)
{
  extern __shared__ T temp[];  // allocated on invocation
  int thid = threadIdx.x;
  size_t idx = blockIdx.x*blockDim.x + thid;
  int offset = 1;
  temp[2*thid] = g_idata[2*idx]; // load input into shared memory
  temp[2*thid+1] = g_idata[2*idx+1];
  for (int d = n>>1; d > 0; d >>= 1)                    // build sum in place up the tree
  {
    __syncthreads();
    if (thid < d)
    {
      int ai = offset*(2*thid+1)-1;
      int bi = offset*(2*thid+2)-1;
      temp[bi] += temp[ai];
    }
    offset *= 2;
  }
  if (thid == 0) { temp[n - 1] = 0; } // clear the last element
  for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
    {
      offset >>= 1;
      __syncthreads();
      if (thid < d)
      {
         int ai = offset*(2*thid+1)-1;
         int bi = offset*(2*thid+2)-1;
         T t = temp[ai];
         temp[ai] = temp[bi];
         temp[bi] += t;
      }
    }
  __syncthreads();
  g_odata[2*idx] = temp[2*thid]; // write results to device memory
  g_odata[2*idx+1] = temp[2*thid+1];
  if (thid == (blockDim.x-1)) g_bdata[blockIdx.x] = temp[2*thid+1] + g_idata[2*idx+1]; // scan value for block
}

template <typename IT, typename DT>
__global__ void icopy(IT *g_i1data, IT *g_i2data,  IT *g_bdata, DT *g_iddata, DT *g_oddata)
{
  size_t idx = (blockIdx.x*blockDim.x + threadIdx.x)*2;
  size_t cidx;
  for (int offset = 0; offset < 2; offset++)
    if (g_i1data[idx+offset]){
      cidx = g_i2data[idx+offset] + g_bdata[blockIdx.x];
      g_oddata[cidx] = g_iddata[idx+offset];}
}

const int BSIZE = 1024;
const int ESIZE = 2*BSIZE;
using itype = int;
using dtype = float;
const int nb = 2; // max 1024;, must be even
const int ds = nb*ESIZE;  // two elements processed per thread
int main(){

// The input data must consist of a  data array as well as a flag array.
// The flag array must take the value of 1 for elements to be copied, and 0 for no copy
// The data set size must be a whole number multiple of the block size, up to 1024 blocks max
// The data set must be "padded" up to the data set size if needed, with padded elements in the flag array as zero for no copy

  // data setup
  itype *h_f, *d_f, *d_fo, *d_b, *d_bo; // flag array
  dtype *h_d, *d_d, *h_o, *d_o;   // data to copy
  size_t dsi = sizeof(itype)*ds;
  size_t dsd = sizeof(dtype)*ds;

  h_f = new itype[ds];
  h_d = new dtype[ds];
  cudaMalloc(&d_f, dsi);
  cudaMalloc(&d_fo, dsi);
  cudaMalloc(&d_d, dsd);
  cudaMalloc(&d_b, nb*sizeof(itype));
  cudaMalloc(&d_bo, nb*sizeof(itype));
  for (int i = 0 ; i < ds; i++){
    h_f[i] = i&1; // copy every other element
    h_d[i] = sqrtf((float)i);}
  cudaMemcpy(d_f, h_f, dsi, cudaMemcpyHostToDevice);
  cudaMemcpy(d_d, h_d, dsd, cudaMemcpyHostToDevice);
  // step 1 - block level prefix sum
  prescan<<<nb,BSIZE, ESIZE*sizeof(itype)>>>(d_fo, d_f, d_b, ESIZE);
  // step 2 - compute scan over blocks
  prescan<<<1,nb/2, nb*sizeof(itype)>>>(d_bo, d_b, d_b, nb);
  // step 3 - create output buffer
  itype buf_size;
  cudaMemcpy(&buf_size, d_b, sizeof(itype), cudaMemcpyDeviceToHost);
  cudaMalloc(&d_o, buf_size*sizeof(dtype));
  // step 4 - fixup index and indexed copy
  icopy<<<nb,BSIZE>>>(d_f, d_fo, d_bo, d_d, d_o);
  // step 5 - print results
  h_o = new dtype[buf_size];
  cudaMemcpy(h_o, d_o, buf_size*sizeof(dtype), cudaMemcpyDeviceToHost);
  for (int i = 0; i < buf_size; i++) std::cout << h_o[i] << " ";
  std::cout << std::endl;
  return 0;
}

This code can be made to work for about 4M elements. Beyond that, you would need to add an additional scan layer/operation.

Thank you for your help! It’s really help me solve my problem! I will consider about thrust or cub in my program.