#ifndef _C_KERNEL_H_
#define _C_KERNEL_H_

// Compile-time configuration used by the device code in this header.
#define num_loops 1024            // presumably the trip count of the loop in ReturnOut() -- not visible here, confirm
#define MATRIX_COLS 1000          // row stride used when indexing the output buffer
#define num_threads_per_block 20  // block edge length baked into CKernel's index math

__device__ float ReturnOut();

/**
 * Writes the constant 1.1f into one element of `o` per thread.
 *
 * Each thread derives a 2D position (tx, ty) from its block and thread
 * indices and stores to o[tx * MATRIX_COLS + ty].
 *
 * Launch expectations:
 *   - The index math multiplies blockIdx by num_threads_per_block rather than
 *     blockDim, so the kernel must be launched with
 *     blockDim.x == blockDim.y == num_threads_per_block; any other block
 *     shape leaves gaps or makes blocks overlap.
 *   - `o` must be large enough for every tx the grid produces; the number of
 *     rows is not passed in, so it cannot be checked here.
 *
 * @param c   Not read or written by this kernel.
 * @param c2  Not read or written by this kernel.
 * @param o   Output buffer, indexed with a row stride of MATRIX_COLS floats.
 */
__global__ void CKernel(float *c, float *c2, float *o)
{
    const int tx = blockIdx.x * num_threads_per_block + threadIdx.x;
    const int ty = blockIdx.y * num_threads_per_block + threadIdx.y;

    // A ty at or beyond the row stride would land in the following row (or
    // past the end of the buffer), so such threads must not store anything.
    if (ty >= MATRIX_COLS) {
        return;
    }

    // NOTE(review): tx comes from threadIdx.x and is scaled by MATRIX_COLS, so
    // threads adjacent in x store MATRIX_COLS floats apart. Swapping the roles
    // of tx and ty would make those stores contiguous, but that depends on the
    // launch configuration, which is not visible here.

    // Disabled alternative kept from the original source:
    // o[tx * MATRIX_COLS + ty] = ReturnOut();
    o[tx * MATRIX_COLS + ty] = 1.1f;  // float literal: no double constant in device code
}

// NOTE(review): the definition below is cut off in the reviewed text; it is
// reproduced verbatim and intentionally left unmodified.
__device__ float ReturnOut() { float out = 0.0; for (int ii=0; ii