Help converting OpenCL to CUDA

The following code is a modified version of BICG by a colleague of mine, written in OpenCL. I need to convert it to CUDA to do some benchmark comparisons, but I’m not completely sure how everything here translates into CUDA. Can someone help out with some of this translation? Thanks.

/// Bundle size (can be 2, 4, 8, 16): number of floats packed into one
/// vector load/store.
#define BUNDLE_SIZE 2

/// Select the OpenCL vector type matching the configured bundle size.
#if   BUNDLE_SIZE == 2
  typedef float2 BUNDLE_T;
#elif BUNDLE_SIZE == 4
  typedef float4 BUNDLE_T;
#elif BUNDLE_SIZE == 8
  typedef float8 BUNDLE_T;
#elif BUNDLE_SIZE == 16
  typedef float16 BUNDLE_T;
#else
  #error "Invalid ba size"
#endif

/// Scalar element type of the computation.
typedef float DATA_T;

/// GDS "fetchfront" size for kernel 2: number of work-items that together
/// fetch one 256-float row from global memory.
/// (GDS/LDS look like AMD terms for global/local data share — confirm.)
#define GDS_FETCH_FRONT_SIZE (256 / BUNDLE_SIZE)

/// LDS "fetchfront" size for kernel 2: number of work-items that together
/// fetch one 32-float row from local memory.
#define LDS_FETCH_FRONT_SIZE (32 / BUNDLE_SIZE)

/// For bundle size of 16 the "fetchfront" size is quarter-wavefront. It follows
/// that half-wavefront takes up 2 rows in the "transformation" space.
/// Therefore shift amounts are different for bundle size of 16.

/// LDS fetch address for memory transformation (even row).
/// NOTE: expands `wglid`, so it must only be used where a local-id variable
/// named `wglid` is in scope (see bicgKernel2 below).
#define LDS_FA_E ((wglid - wglid % BUNDLE_SIZE) +        \
                 (wglid + ((wglid / BUNDLE_SIZE) % 32) / \
                 LDS_FETCH_FRONT_SIZE) % BUNDLE_SIZE)

/// LDS fetch address for memory transformation (odd row).
/// Same as LDS_FA_E except for BUNDLE_SIZE == 16, where the lane is rotated
/// by 8 (see the half-wavefront note above).
#if BUNDLE_SIZE == 16
#define LDS_FA_O ((wglid - wglid % BUNDLE_SIZE) +                        \
                 ((wglid + ((wglid / BUNDLE_SIZE) % 32) /                \
                 LDS_FETCH_FRONT_SIZE) % BUNDLE_SIZE + 8) % BUNDLE_SIZE)
#else
#define LDS_FA_O ((wglid - wglid % BUNDLE_SIZE) +        \
                 (wglid + ((wglid / BUNDLE_SIZE) % 32) / \
                 LDS_FETCH_FRONT_SIZE) % BUNDLE_SIZE)
#endif // BUNDLE_SIZE == 16

/// LDS store address for memory transformation; `sa` is the lane offset of
/// the vector component being stored. Expands `gds_fflid` and `lds_ffwgid`,
/// so those must be in scope at the use site.
#define LDS_SA(sa) (BUNDLE_SIZE * gds_fflid + (lds_ffwgid + sa) % BUNDLE_SIZE)

/// BICG kernel 2: s = A^T * r (one output element of s per work-item),
/// using a bundled (vectorized) layout of A and an LDS transformation of
/// each fetched tile.
///
/// A  : matrix in bundled layout (BUNDLE_T elements).
/// r  : input vector in bundled layout.
/// s  : output vector (scalar elements), length ny.
/// nx : number of columns (assumed a multiple of BUNDLE_SIZE).
/// ny : number of rows / output elements.
kernel void bicgKernel2(global BUNDLE_T *A, global BUNDLE_T *r,
                        global DATA_T *s, int nx, int ny) {
  /// Global id.
  int gid = get_global_id(0);

  /// Local id in the workgroup.
  int wglid = get_local_id(0);

  /// Workgroup id.
  int wgid = get_group_id(0);

  /// Local id in the GDS "fetchfront".
  int gds_fflid = get_local_id(0) % GDS_FETCH_FRONT_SIZE;

  /// GDS "fetchfront" group id.
  int gds_ffwgid = get_global_id(0) / GDS_FETCH_FRONT_SIZE % BUNDLE_SIZE;

  /// LDS "fetchfront" group id.
  int lds_ffwgid = get_global_id(0) % 32 / LDS_FETCH_FRONT_SIZE;

  /// Workgroup shared data.
  local DATA_T shared[BUNDLE_SIZE][256];

  /// Whether this work-item produces an output element.
  int active = gid < ny;

  /// Accumulate in a register instead of read-modify-writing s[gid]
  /// (global memory) once per term; written back once at the end.
  DATA_T sum = 0.0f;

  /// BUGFIX: the original guarded this whole loop with `if (gid < ny)`,
  /// which makes both barriers divergent whenever ny is not a multiple of
  /// the workgroup size -- undefined behavior per the OpenCL spec (all
  /// work-items of a workgroup must reach every barrier). All work-items
  /// now execute the loop and the barriers; only the memory accesses are
  /// guarded by `active`.
  for (int i = 0; i < nx / BUNDLE_SIZE; i++) {
    if (active) {
      BUNDLE_T ba = A[i * nx / BUNDLE_SIZE * BUNDLE_SIZE +
                      gds_ffwgid * nx / BUNDLE_SIZE +
                      gds_fflid + GDS_FETCH_FRONT_SIZE * wgid];

      /// Memory transformation: scatter the bundle's components into LDS
      /// in a bank-conflict-avoiding layout (see LDS_SA).
      shared[gds_ffwgid][LDS_SA(0)] = ba.s0;
      shared[gds_ffwgid][LDS_SA(1)] = ba.s1;
      #if BUNDLE_SIZE > 2
        shared[gds_ffwgid][LDS_SA(2)] = ba.s2;
        shared[gds_ffwgid][LDS_SA(3)] = ba.s3;
      #endif // BUNDLE_SIZE > 2
      #if BUNDLE_SIZE > 4
        shared[gds_ffwgid][LDS_SA(4)] = ba.s4;
        shared[gds_ffwgid][LDS_SA(5)] = ba.s5;
        shared[gds_ffwgid][LDS_SA(6)] = ba.s6;
        shared[gds_ffwgid][LDS_SA(7)] = ba.s7;
      #endif // BUNDLE_SIZE > 4
      #if BUNDLE_SIZE > 8
        shared[gds_ffwgid][LDS_SA(8)] = ba.s8;
        shared[gds_ffwgid][LDS_SA(9)] = ba.s9;
        shared[gds_ffwgid][LDS_SA(10)] = ba.sa;
        shared[gds_ffwgid][LDS_SA(11)] = ba.sb;
        shared[gds_ffwgid][LDS_SA(12)] = ba.sc;
        shared[gds_ffwgid][LDS_SA(13)] = ba.sd;
        shared[gds_ffwgid][LDS_SA(14)] = ba.se;
        shared[gds_ffwgid][LDS_SA(15)] = ba.sf;
      #endif // BUNDLE_SIZE > 8
    }

    /// Wait for transformation to complete (uniform: reached by all
    /// work-items in the workgroup).
    barrier(CLK_LOCAL_MEM_FENCE);

    if (active) {
      BUNDLE_T br = r[i];

      /// Perform operations: dot the transformed LDS rows against the
      /// components of the r bundle.
      sum += shared[0][LDS_FA_E] * br.s0;
      sum += shared[1][LDS_FA_O] * br.s1;
      #if BUNDLE_SIZE > 2
        sum += shared[2][LDS_FA_E] * br.s2;
        sum += shared[3][LDS_FA_O] * br.s3;
      #endif // BUNDLE_SIZE > 2
      #if BUNDLE_SIZE > 4
        sum += shared[4][LDS_FA_E] * br.s4;
        sum += shared[5][LDS_FA_O] * br.s5;
        sum += shared[6][LDS_FA_E] * br.s6;
        sum += shared[7][LDS_FA_O] * br.s7;
      #endif // BUNDLE_SIZE > 4
      #if BUNDLE_SIZE > 8
        sum += shared[8][LDS_FA_E] * br.s8;
        sum += shared[9][LDS_FA_O] * br.s9;
        sum += shared[10][LDS_FA_E] * br.sa;
        sum += shared[11][LDS_FA_O] * br.sb;
        sum += shared[12][LDS_FA_E] * br.sc;
        sum += shared[13][LDS_FA_O] * br.sd;
        sum += shared[14][LDS_FA_E] * br.se;
        sum += shared[15][LDS_FA_O] * br.sf;
      #endif // BUNDLE_SIZE > 8
    }

    /// Wait for operations to complete before the next tile overwrites LDS
    /// (uniform: reached by all work-items in the workgroup).
    barrier(CLK_LOCAL_MEM_FENCE);
  } // end for

  /// Single write-back of the accumulated result.
  if (active) {
    s[gid] = sum;
  }
} // end bicgKernel2

AMD publishes a CUDA-to-OpenCL porting guide (the mapping works in both directions); a web search for "AMD porting CUDA to OpenCL" will find it.

I think the find-replace rules below will be a good start:

kernel --> __global__
local  --> __shared__

get_local_id(0)  --> threadIdx.x
get_group_id(0)  --> blockIdx.x
get_global_id(0) --> blockIdx.x*blockDim.x+threadIdx.x

barrier(CLK_LOCAL_MEM_FENCE) --> __syncthreads()

See also http://www.netlib.org/utk/people/JackDongarra/PAPERS/parcocudaopencl.pdf

In CUDA there is only float, float2 and float4 (no float8 or float16, unless you create them of course). The elements in a float4 are called x, y, z and w. So ba.s0 --> ba.x, ba.s1 --> ba.y, etc.