Need help with parallel programing this filter use CUDA to parallel this program

7 #define BLOCKSIZE 500
8
9
10 struct sepia_cpu {
11
12 unsigned char *src;
13 unsigned char *destg4;
14 unsigned char *destgpu;
15 }
16
17
18
19 global void cuda_call_kernel(unsigned char *dm_src, unsigned char dm_destgpu)
20 {
21 int bx = blockIdx.x;
22 int tx = threadIdx.x;
23 int o = bx * BLOCKSIZE + tx;
24
25 // load from global memory
26 unsigned char temp_src = dm_src[o];
27 unsigned char temp_destgpu = dm_destgpu[o];
28
29 //sepia_pixel(&temp_src[(tx+bx
width)3],&temp_destcpu[(tx+bxwidth)*3]);
30
31 float B = temp_src[0] * .101 + temp_src[1] * .531 + temp_src[2] * .291;
32 float G = temp_src[0] * .161 + temp_src[1] * .691 + temp_src[2] * .361;
33 float R = temp_src[0] * .201 + temp_src[1] * .741 + temp_src[2] * .411;
34
35 if(R > 255.0) R = 255.0;
36 if(G > 255.0) G = 255.0;
37 if(B > 255.0) B = 255.0;
38
39 temp_destgpu[0] = (unsigned char)B;
40 temp_destgpu[1] = (unsigned char)G;
41 temp_destgpu[2] = (unsigned char)R;

This function above I need to somehow use the tx and bx variables to describe location and I’m not sure how? The commented out section above is what the function looks like in the original program and I need to somehow incorporate that above. This is my first parallel program using CUDA so I’m really lost. This is also a piece from the cuda program not the original.