/* Device code for the interlacing / splitting operations of multiple arrays */

#ifndef _DEINTERLACE_KERNEL_H_
#define _DEINTERLACE_KERNEL_H_

/* Not referenced by the kernels in this header; kept because other
 * translation units may rely on them. */
#define tile_x 8
#define tile_y 8

/* Block width the kernels were originally hard-wired to. The kernels below
 * now derive every stride from blockDim.x, which is identical when the launch
 * uses 64 threads per block. Kept for code outside this header. */
#define thd 64

/*
 * Shared implementation of the deinterlace<2> / deinterlace<3> kernels.
 *
 * Each block stages NUM_ARRAYS * blockDim.x consecutive input floats in
 * `tile`, then every thread writes NUM_ARRAYS consecutive tile entries to
 * NUM_ARRAYS output regions that are N elements apart:
 *
 *     odata[k * N + i] = idata[NUM_ARRAYS * i + k]   for 0 <= i < N,
 *                                                        0 <= k < NUM_ARRAYS
 *
 * odata : destination, must hold NUM_ARRAYS * N floats.
 * idata : source, must hold NUM_ARRAYS * N floats.
 * N     : number of elements per output region; N <= 0 does nothing.
 * tile  : block-shared scratch of at least NUM_ARRAYS * blockDim.x floats.
 *
 * Must be called by every thread of the block (it contains a block barrier).
 * NOTE(review): assumes idata and odata do not overlap -- not checked here.
 */
template <int NUM_ARRAYS>
__device__ __forceinline__ void deinterlace_block(float* odata, float* idata,
                                                  int N, float* tile)
{
    /* Indices are computed in size_t: plain multiplies instead of __mul24
     * (which only multiplies the low 24 bits) and no overflow in k * N. */
    const size_t count   = (N > 0) ? static_cast<size_t>(N) : 0;
    const size_t total   = count * NUM_ARRAYS;
    const size_t bid     = static_cast<size_t>(blockIdx.y) * gridDim.x + blockIdx.x;
    const size_t in_base = bid * NUM_ARRAYS * blockDim.x;

    /* Stage the block's slice of the input; consecutive threads read
     * consecutive addresses. Elements past the end of the input are skipped. */
    #pragma unroll
    for (int k = 0; k < NUM_ARRAYS; ++k) {
        const unsigned int slot = threadIdx.x + k * blockDim.x;
        const size_t src = in_base + slot;
        if (src < total) {
            tile[slot] = idata[src];
        }
    }

    /* Kept outside every guard so all threads of the block reach it. */
    __syncthreads();

    /* out < count implies NUM_ARRAYS * out + k < total, so every tile entry
     * read below was written by the guarded loads above. */
    const size_t out = bid * blockDim.x + threadIdx.x;
    if (out < count) {
        const unsigned int first = NUM_ARRAYS * threadIdx.x;
        #pragma unroll
        for (int k = 0; k < NUM_ARRAYS; ++k) {
            odata[out + k * count] = tile[first + k];
        }
    }
}

/*
 * Primary template: intentionally empty; only the explicit specializations
 * below do any work.
 * NOTE(review): the template parameter list was missing from the source as
 * received; a non-type int parameter is inferred from the <2> and <3>
 * specializations -- confirm against the original declaration.
 */
template <int NUM_ARRAYS>
__global__ void deinterlace (float* odata, float* idata, int N)
{
}

/*
 * Splits 2 interleaved arrays:
 *     odata[i] = idata[2*i],  odata[N + i] = idata[2*i + 1],  0 <= i < N.
 *
 * Launch: 1-D blocks; blocks are linearised as blockIdx.y * gridDim.x +
 * blockIdx.x and must number at least ceil(N / blockDim.x). Surplus threads
 * and blocks do nothing.
 * Dynamic shared memory: 2 * blockDim.x * sizeof(float) bytes.
 */
template <>
__global__ void deinterlace <2> (float* odata, float* idata, int N)
{
    extern __shared__ float tile [];
    deinterlace_block<2>(odata, idata, N, tile);
}

/*
 * Splits 3 interleaved arrays:
 *     odata[k*N + i] = idata[3*i + k],  0 <= i < N,  k = 0, 1, 2.
 *
 * Launch: 1-D blocks; blocks are linearised as blockIdx.y * gridDim.x +
 * blockIdx.x and must number at least ceil(N / blockDim.x). Surplus threads
 * and blocks do nothing.
 * Dynamic shared memory: 3 * blockDim.x * sizeof(float) bytes.
 */
template <>
__global__ void deinterlace <3> (float* odata, float* idata, int N)
{
    extern __shared__ float tile [];
    deinterlace_block<3>(odata, idata, N, tile);
}

#endif