Hi Experts,
I am struggling with synchronization in a complicated kernel.
Initially, I have 128
threads in a block, and I use 64
of them to copy data from global memory to shared memory.
After this step, I perform the first synchronization, and it works fine.
Next, I use 96
of the 128 threads to perform another computation and update a variable in shared memory. However, there is a branch condition within this step.
After completing these steps, I attempt to synchronize all 128
threads in the block, but the process stalls at this point.
I feel frustrated with the complex synchronization conditions in my kernel.
Could someone advise me on how to properly handle synchronization in such a scenario?
Or should I avoid making my kernel so complex in the first place?
Thank you!
This is a simplified structure:
__global__ void kernel() {
__shared__ float sheme[3][4];
int tid = threadIdx.x;
if (tid < 64) {
// do something
}
__syncthreads();
if (tid < 96) {
if (condition) {
sheme[][] = val;
} else {
sheme[][] = -FLT_MAX;
}
}
__syncthreads(); // dead in this synchronization
}
Source Code:
#define PIX 2
template <int PIX>
__global__ void MyKernel(float *__restrict__ spectrum_out, const cuFloatComplex *__restrict__ eigen_vec, int obj_num, DOAConfig cfg, int AZI_BEGIN, int AZI_END, int ELE_BEGIN, int ELE_END, int threshold) {
__shared__ cuFloatComplex shared_coarse_steering_v[3 * (PIX + 2)][8];
__shared__ cuFloatComplex shared_eigen_vec[8][8];
__shared__ cuFloatComplex shared_dot_prod[3 * (PIX + 2) * 8][8];
__shared__ float shared_coarse_spectrum[3][PIX + 2];
float d = cfg.d;
float lambda = cfg.lambda;
int N_NOISE = 8 - obj_num;
int tid = threadIdx.x;
int tid_pixel = tid >> 6; // tid / (blockDim.x/pixelPerBlock)
int pid = tid & 63; // tid % (blockDim.x/pixelPerBlock)
if (tid < 64) {
// eigenvectors
int row = tid >> 3; // tid / 8;
int col = tid & 7; // tid % 8
shared_eigen_vec[row][col] = eigen_vec[tid];
}
__syncthreads();
if (tid < 8 * 3 * (PIX + 2)) {
int pad_pixel_id = tid / 8;
int pad_ant_id = tid % 8;
int cx = blockIdx.x + AZI_BEGIN;
int cy = blockIdx.y * PIX + ELE_BEGIN;
int pad_row = pad_pixel_id / (PIX + 2);
int pad_col = pad_pixel_id % (PIX + 2);
int pad_azimuth, pad_elevation = pad_col - 1 + cy;
if (pad_pixel_id < 4) {
pad_azimuth = cx - 1;
} else if (pad_pixel_id < 8) {
pad_azimuth = cx;
} else {
pad_azimuth = cx + 1;
}
if (pad_azimuth >= AZI_BEGIN && pad_azimuth <= AZI_END && pad_elevation >= ELE_BEGIN && pad_elevation <= ELE_END) {
// MUSIC
float rad_azimuth = __fmul_rn(pad_azimuth, RAD);
float rad_elevation = __fmul_rn(pad_elevation, RAD);
float sin_phi, cos_phi;
sincosf(rad_azimuth, &sin_phi, &cos_phi);
float sin_theta, cos_theta;
sincosf(rad_elevation, &sin_theta, &cos_theta);
float phaseShift_pi = __fdividef(d, lambda) * 2 * __fmaf_rn(cfg.antenna[pad_ant_id][0], __fmul_rn(cos_phi, sin_theta), __fmaf_rn(cfg.antenna[pad_ant_id][1], __fmul_rn(sin_phi, sin_theta), __fmul_rn(cfg.antenna[pad_ant_id][2], cos_theta)));
float sin_phaseShift, cos_phaseShift;
sincospif(phaseShift_pi, &sin_phaseShift, &cos_phaseShift);
shared_coarse_steering_v[pad_pixel_id][pad_ant_id] = make_cuFloatComplex(cos_phaseShift, sin_phaseShift);
for (int k = 0; k < N_NOISE; ++k) {
shared_dot_prod[pad_pixel_id * N_NOISE + k][pad_ant_id] = cuConjf(cuCmulf(shared_coarse_steering_v[pad_pixel_id][pad_ant_id], shared_eigen_vec[k][pad_ant_id]));
}
// __syncwarp();
// reduce
for (int noise_row_offset = 0; noise_row_offset < N_NOISE; ++noise_row_offset) {
#pragma unroll
for (int stride = 4; stride > 0; stride >>= 1) {
if (pad_ant_id < stride) {
shared_dot_prod[noise_row_offset + pad_pixel_id * N_NOISE][pad_ant_id] = cuCaddf(shared_dot_prod[noise_row_offset + pad_pixel_id * N_NOISE][pad_ant_id], shared_dot_prod[noise_row_offset + pad_pixel_id * N_NOISE][pad_ant_id + stride]);
}
__syncthreads();
}
}
// accumulate
if (pad_ant_id == 0) {
float accuSum_ = 0;
for (int noise_row_offset = 0; noise_row_offset < N_NOISE; ++noise_row_offset) {
cuFloatComplex val_ = shared_dot_prod[noise_row_offset + pad_pixel_id * N_NOISE][0];
accuSum_ = __fadd_rn(accuSum_, __fadd_rn(__fmul_rn(val_.x, val_.x), __fmul_rn(val_.y, val_.y)));
}
shared_coarse_spectrum[pad_row][pad_col] = 10 * __log10f(__fdividef(1.0, accuSum_));
}
} else {
shared_coarse_spectrum[pad_row][pad_col] = -FLT_MAX;
}
}
__syncthreads(); // dead in this synchronization
}