Hello.
I have an array that all threads read simultaneously.
As I recall, constant memory is most efficient when all threads access the same address,
so I put the array in constant memory, as shown below.
// Small read-only table kept in constant memory: 12 floats.
// NOTE(review): presumably the transform coefficients consumed by move();
// verify that the kernel refers to this symbol by the same name.
__constant__ float array[12];
/**
 * Applies one affine transform to every point of a 2D-indexed point buffer.
 *
 * Each thread handles the point at column gid_x, row gid_y, flattened as
 * gid_x + gid_y * a_src_pts_w. The coefficients are read from the
 * __constant__ table `array`: elements 0-8 are used as a row-major 3x3
 * matrix and elements 9-11 as the offsets added to x, y and z.
 *
 * @param ap_src_pts   input points, read at the flattened thread index
 * @param a_src_pts_w  row stride (in points) used to flatten the 2D index
 * @param ap_dst_pts   output points, written at the same flattened index
 *
 * NOTE(review): there is no bounds check and no height parameter, so the
 * launch grid has to match the buffer extent exactly - confirm at the
 * call sites.
 */
__global__ void move(
    float3 *ap_src_pts, int a_src_pts_w,
    float3 *ap_dst_pts
)
{
    int gid_x = blockIdx.x * blockDim.x + threadIdx.x;
    int gid_y = blockIdx.y * blockDim.y + threadIdx.y;
    int l_index = gid_x + gid_y * a_src_pts_w;

    float3 l_from, l_to;
    l_from = ap_src_pts[l_index];

    // Every thread reads the same coefficient at the same time; the indices
    // are compile-time constants into the __constant__ table `array`.
    l_to.x = array[0] * l_from.x
           + array[1] * l_from.y
           + array[2] * l_from.z
           + array[9];
    l_to.y = array[3] * l_from.x
           + array[4] * l_from.y
           + array[5] * l_from.z
           + array[10];
    l_to.z = array[6] * l_from.x
           + array[7] * l_from.y
           + array[8] * l_from.z
           + array[11];

    ap_dst_pts[l_index] = l_to;
}
However, the array is very short: it has only 12 elements…
I'm wondering whether it would be better to pass the array as an argument to the kernel
instead of storing it in constant memory, as shown below.
/**
 * Applies one affine transform to every point of a 2D-indexed point buffer,
 * taking the 12 transform coefficients through a kernel argument.
 *
 * Each thread handles the point at the flattened index
 * gid_x + gid_y * a_src_pts_w. Coefficients 0-8 are used as a row-major
 * 3x3 matrix and coefficients 9-11 as the offsets added to x, y and z.
 *
 * @param ap_src_pts      input points, read at the flattened thread index
 * @param a_src_pts_w     row stride (in points) used to flatten the 2D index
 * @param ap_dst_pts      output points, written at the same flattened index
 * @param p_dc_trans_mat  the 12 coefficients. An array parameter is adjusted
 *                        to `float *` by the language, so the kernel receives
 *                        a pointer (not 12 floats by value) and dereferences
 *                        it on the device.
 *
 * NOTE(review): there is no bounds check and no height parameter, so the
 * launch grid has to match the buffer extent exactly - confirm at the
 * call sites.
 */
__global__ void move(
    float3 *ap_src_pts, int a_src_pts_w,
    float3 *ap_dst_pts,
    float p_dc_trans_mat[12]
)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int idx = col + row * a_src_pts_w;

    const float3 src = ap_src_pts[idx];
    const float *m = p_dc_trans_mat;

    // One matrix row per output component, then the matching offset.
    float3 dst;
    dst.x = m[0] * src.x + m[1] * src.y + m[2] * src.z + m[9];
    dst.y = m[3] * src.x + m[4] * src.y + m[5] * src.z + m[10];
    dst.z = m[6] * src.x + m[7] * src.y + m[8] * src.z + m[11];

    ap_dst_pts[idx] = dst;
}
Which approach do you think is better?