Usage of __syncthreads() in a complicated branch

Hi Experts,

I am struggling with synchronization in a complicated kernel.

Initially, I have 128 threads in a block, and I use 64 of them to copy data from global memory to shared memory.

After this step, I perform the first synchronization, and it works fine.

Next, I use 96 of the 128 threads to perform another computation and update a variable in shared memory. However, there is a branch condition within this step.

After completing these steps, I attempt to synchronize all 128 threads in the block, but the kernel hangs at this point.

I feel frustrated with the complex synchronization conditions in my kernel.

Could someone advise me on how to properly handle synchronization in such a scenario?

Or should I avoid making my kernel so complex in the first place?

Thank you!

This is a simplified structure:

__global__ void kernel() {
    __shared__ float shmem[3][4];

    int tid = threadIdx.x;

    if (tid < 64) {
        // 64 threads copy data from global memory to shared memory
    }

    __syncthreads();                      // this one works fine

    if (tid < 96) {
        if (condition) {
            shmem[/*...*/][/*...*/] = val;
        } else {
            shmem[/*...*/][/*...*/] = -FLT_MAX;
        }
    }

    __syncthreads();                      // the kernel hangs at this synchronization
}

Source Code:

// MyKernel is instantiated with PIX = 2

template <int PIX>
__global__ void MyKernel(float *__restrict__ spectrum_out, const cuFloatComplex *__restrict__ eigen_vec,
                         int obj_num, DOAConfig cfg, int AZI_BEGIN, int AZI_END, int ELE_BEGIN, int ELE_END,
                         int threshold) {
    __shared__ cuFloatComplex shared_coarse_steering_v[3 * (PIX + 2)][8];
    __shared__ cuFloatComplex shared_eigen_vec[8][8];
    __shared__ cuFloatComplex shared_dot_prod[3 * (PIX + 2) * 8][8];
    __shared__ float shared_coarse_spectrum[3][PIX + 2];
    
    float d      = cfg.d;
    float lambda = cfg.lambda;

    int N_NOISE = 8 - obj_num;

    int tid  = threadIdx.x;
    int tid_pixel = tid >> 6; // tid / (blockDim.x/pixelPerBlock)
    int pid = tid & 63;       // tid % (blockDim.x/pixelPerBlock)

    if (tid < 64) {
        // eigenvectors
        int row = tid >> 3; // tid / 8;
        int col = tid & 7;  // tid % 8

        shared_eigen_vec[row][col] = eigen_vec[tid];
    }

    __syncthreads();

    if (tid < 8 * 3 * (PIX + 2)) {   // 96 of the 128 threads when PIX == 2
        int pad_pixel_id = tid / 8;
        int pad_ant_id   = tid % 8;

        int cx = blockIdx.x + AZI_BEGIN;
        int cy = blockIdx.y * PIX + ELE_BEGIN;
        
        int pad_row = pad_pixel_id / (PIX + 2);
        int pad_col = pad_pixel_id % (PIX + 2);

        int pad_azimuth;
        int pad_elevation = pad_col - 1 + cy;
        if (pad_pixel_id < 4) {
            pad_azimuth = cx - 1;
        } else if (pad_pixel_id < 8) {
            pad_azimuth = cx;
        } else {
            pad_azimuth = cx + 1;
        }

        if (pad_azimuth >= AZI_BEGIN && pad_azimuth <= AZI_END && pad_elevation >= ELE_BEGIN && pad_elevation <= ELE_END) {
            // MUSIC
            float rad_azimuth   = __fmul_rn(pad_azimuth, RAD);
            float rad_elevation = __fmul_rn(pad_elevation, RAD);

            float sin_phi, cos_phi;
            sincosf(rad_azimuth, &sin_phi, &cos_phi);

            float sin_theta, cos_theta;
            sincosf(rad_elevation, &sin_theta, &cos_theta);

            float phaseShift_pi = __fdividef(d, lambda) * 2 *
                                  __fmaf_rn(cfg.antenna[pad_ant_id][0], __fmul_rn(cos_phi, sin_theta),
                                            __fmaf_rn(cfg.antenna[pad_ant_id][1], __fmul_rn(sin_phi, sin_theta),
                                                      __fmul_rn(cfg.antenna[pad_ant_id][2], cos_theta)));
            float sin_phaseShift, cos_phaseShift;
            sincospif(phaseShift_pi, &sin_phaseShift, &cos_phaseShift);

            shared_coarse_steering_v[pad_pixel_id][pad_ant_id] = make_cuFloatComplex(cos_phaseShift, sin_phaseShift);

            for (int k = 0; k < N_NOISE; ++k) {
                shared_dot_prod[pad_pixel_id * N_NOISE + k][pad_ant_id] =
                    cuConjf(cuCmulf(shared_coarse_steering_v[pad_pixel_id][pad_ant_id],
                                    shared_eigen_vec[k][pad_ant_id]));
            }
            // __syncwarp();

            // reduce
            for (int noise_row_offset = 0; noise_row_offset < N_NOISE; ++noise_row_offset) {

                #pragma unroll
                for (int stride = 4; stride > 0; stride >>= 1) {
                    if (pad_ant_id < stride) {
                        int row = noise_row_offset + pad_pixel_id * N_NOISE;
                        shared_dot_prod[row][pad_ant_id] = cuCaddf(shared_dot_prod[row][pad_ant_id],
                                                                   shared_dot_prod[row][pad_ant_id + stride]);
                    }
                    __syncthreads();
                }
            }

            // accumulate
            if (pad_ant_id == 0) {
                float accuSum_ = 0;
                for (int noise_row_offset = 0; noise_row_offset < N_NOISE; ++noise_row_offset) {
                    cuFloatComplex val_ = shared_dot_prod[noise_row_offset + pad_pixel_id * N_NOISE][0];
                    accuSum_ = __fadd_rn(accuSum_, __fadd_rn(__fmul_rn(val_.x, val_.x), __fmul_rn(val_.y, val_.y)));
                }

                shared_coarse_spectrum[pad_row][pad_col] = 10 * __log10f(__fdividef(1.0, accuSum_));

            }

        } else {
            shared_coarse_spectrum[pad_row][pad_col] = -FLT_MAX;
        }
        
        
    }
    
    __syncthreads();           // the kernel hangs at this synchronization
}

Synchronization within an if-statement is only allowed if all of the threads that are supposed to synchronize reach the instruction.
However, you call __syncthreads() within a branch that not every thread of the block executes.
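
For example, a minimal sketch of the difference (assuming a 128-thread block like yours):

__global__ void sketch() {
    // Fine: blockIdx.x is uniform within a block, so either all 128 threads
    // of the block reach this barrier or none of them do.
    if (blockIdx.x < 10) {
        __syncthreads();
    }

    // Hangs / undefined behaviour: only threads 0..95 reach the barrier,
    // while the remaining 32 threads of the block never arrive.
    if (threadIdx.x < 96) {
        __syncthreads();
    }
}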


The reason is the one given in @striker159's answer.

You can either repeat your if condition inside the block, splitting the work so that every __syncthreads() sits outside the condition and all threads reach it (see the first sketch below),

or you can synchronize only the 96 threads, e.g. with cooperative groups or with named barriers via inline PTX assembly (see the second sketch below).
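
A sketch of the first option, applied to the reduction in your kernel. The variable names are taken from your posted code, but this only shows the structure and is not a tested drop-in fix; note that the azimuth/elevation bounds check also diverges among the 96 threads, so it would have to be folded into the active flag as well:

    bool active = (tid < 8 * 3 * (PIX + 2));   // 96 of the 128 threads do the work
    // pad_pixel_id, pad_ant_id and the bounds check are assumed to be computed
    // for every thread here (harmless for the inactive ones).

    for (int noise_row_offset = 0; noise_row_offset < N_NOISE; ++noise_row_offset) {
        #pragma unroll
        for (int stride = 4; stride > 0; stride >>= 1) {
            if (active && pad_ant_id < stride) {
                int row = noise_row_offset + pad_pixel_id * N_NOISE;
                shared_dot_prod[row][pad_ant_id] =
                    cuCaddf(shared_dot_prod[row][pad_ant_id],
                            shared_dot_prod[row][pad_ant_id + stride]);
            }
            __syncthreads();   // outside every divergent branch: all 128 threads reach it
        }
    }

And a sketch of the second option, a named PTX barrier that only the first 96 threads participate in. bar.sync id, count requires the thread count to be a multiple of the warp size (96 = 3 full warps), and all 96 threads must actually execute the instruction, so it still has to sit outside the bounds check:

    // Hypothetical helper: block-scoped named barrier for threads 0..95 only.
    // Barrier id 1 is used here because __syncthreads() uses barrier 0.
    __device__ __forceinline__ void sync_first_96_threads() {
        asm volatile("bar.sync 1, 96;" ::: "memory");
    }

Inside the if (tid < 8 * 3 * (PIX + 2)) block you would then call sync_first_96_threads() where the reduction currently calls __syncthreads(), and keep the final block-wide __syncthreads() at the end of the kernel for all 128 threads.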
