Shared memory vs. local memory; exponential calculations

Hello, I have a kernel like this:

template <typename T1, typename T2>
__global__ void time_advance_single_step(Vector_field<T1, T2> *U, fft_var<T1, T2> *spectral_setup, universal_arrays<T1, T2> *univ, T1 a, T1 b, T1 c, bool final_copy)
{
    int64 i = threadIdx.x + (blockDim.x * blockIdx.x);

    if (i >= Elements_per_process_fourier_gpu)
        return;

    // Local (register) variables for ukx and ukz; uky is loaded in the 3-D branch below
    T2 data_ukx{U->ukx[i]};
    T2 data_ukz{U->ukz[i]};

    T2 U_nlinx{U->nlinx[i]};
    T2 U_nlinz{U->nlinz[i]};

    T1 ksqr{spectral_setup->ksqr[i]};

    T1 exponential_term_1 = exp(-a * ((nu_gpu<T1> * ksqr) + (nu_hypo_gpu<T1> * pow(ksqr, 0.5 * nu_hypo_power_factor_gpu<T1>)) + (nu_hyper_gpu<T1> * pow(ksqr, 0.5 * nu_hyper_power_factor_gpu<T1>))) * dt_gpu<T1>);
    T1 exponential_term_2 = exp(-b * ((nu_gpu<T1> * ksqr) + (nu_hypo_gpu<T1> * pow(ksqr, 0.5 * nu_hypo_power_factor_gpu<T1>)) + (nu_hyper_gpu<T1> * pow(ksqr, 0.5 * nu_hyper_power_factor_gpu<T1>))) * dt_gpu<T1>);

    if ((i == 0) && (hypo_viscosity_flag_gpu) && (rank_gpu == 0))
    {
        exponential_term_1 = 1;
        exponential_term_2 = 1;
    }

    if (dimension_gpu == 2)
    {
        // Updating the x-velocity for the next step
        data_ukx.x = (data_ukx.x * exponential_term_1) + (c * dt_gpu<T1> * U_nlinx.x * exponential_term_2);
        data_ukx.y = (data_ukx.y * exponential_term_1) + (c * dt_gpu<T1> * U_nlinx.y * exponential_term_2);

        // Updating the z-velocity for the next step
        data_ukz.x = (data_ukz.x * exponential_term_1) + (c * dt_gpu<T1> * U_nlinz.x * exponential_term_2);
        data_ukz.y = (data_ukz.y * exponential_term_1) + (c * dt_gpu<T1> * U_nlinz.y * exponential_term_2);

        // Copying data for better timings
        if (final_copy)
        {
            U->ukx[i] = data_ukx;
            U->ukz[i] = data_ukz;
        }

        // Copying data into variables
        univ->tempF1[i] = data_ukx;
        univ->tempF3[i] = data_ukz;

        if (TIME_SCHEME_NUMBER_GPU != 3)
        {
            // Dealiasing the data for FFT
            dealias(ksqr, data_ukx, data_ukz);

            // Copying the data into nlin variables
            U->nlinx[i] = data_ukx;
            U->nlinz[i] = data_ukz;
        }
    }
    if (dimension_gpu == 3)
    {
        T2 data_uky{U->uky[i]};
        T2 U_nliny{U->nliny[i]};

        // Updating the x-velocity for the next step
        data_ukx.x = (data_ukx.x * exponential_term_1) + (c * dt_gpu<T1> * U_nlinx.x * exponential_term_2);
        data_ukx.y = (data_ukx.y * exponential_term_1) + (c * dt_gpu<T1> * U_nlinx.y * exponential_term_2);

        // Updating the y-velocity for the next step
        data_uky.x = (data_uky.x * exponential_term_1) + (c * dt_gpu<T1> * U_nliny.x * exponential_term_2);
        data_uky.y = (data_uky.y * exponential_term_1) + (c * dt_gpu<T1> * U_nliny.y * exponential_term_2);

        // Updating the z-velocity for the next step
        data_ukz.x = (data_ukz.x * exponential_term_1) + (c * dt_gpu<T1> * U_nlinz.x * exponential_term_2);
        data_ukz.y = (data_ukz.y * exponential_term_1) + (c * dt_gpu<T1> * U_nlinz.y * exponential_term_2);

        // Copying data for better timings
        if (final_copy)
        {
            U->ukx[i] = data_ukx;
            U->uky[i] = data_uky;
            U->ukz[i] = data_ukz;
        }

        // Copying data into variables
        univ->tempF1[i] = data_ukx;
        univ->tempF2[i] = data_uky;
        univ->tempF3[i] = data_ukz;

        if (TIME_SCHEME_NUMBER_GPU != 3)
        {
            // Dealiasing the data for FFT
            dealias(ksqr, data_ukx, data_uky, data_ukz);

            // Copying the data into nlin variables
            U->nlinx[i] = data_ukx;
            U->nliny[i] = data_uky;
            U->nlinz[i] = data_ukz;
        }
    }
}

Here I have performed some mathematical operations on complex-valued data using local memory (thread-local variables). I want to know whether shared memory would be better than local memory here, given that only element-wise operations are performed.
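
For concreteness, a shared-memory variant of the element-wise update might look roughly like the sketch below (x component only; Vector_field, fft_var, int64, dt_gpu and Elements_per_process_fourier_gpu are the same symbols as in the kernel above, and the exponential factors are abbreviated):

template <typename T1, typename T2>
__global__ void time_advance_shared_sketch(Vector_field<T1, T2> *U, fft_var<T1, T2> *spectral_setup, T1 a, T1 b, T1 c)
{
    // One shared-memory slot per thread for the velocity and the nonlinear term.
    extern __shared__ unsigned char smem[];
    T2 *s_ukx = reinterpret_cast<T2 *>(smem);
    T2 *s_nlinx = s_ukx + blockDim.x;

    int64 i = threadIdx.x + (blockDim.x * blockIdx.x);
    if (i >= Elements_per_process_fourier_gpu)
        return;

    // Stage the global loads through shared memory instead of registers.
    s_ukx[threadIdx.x] = U->ukx[i];
    s_nlinx[threadIdx.x] = U->nlinx[i];
    // No __syncthreads() needed: each thread only reads the slot it wrote,
    // i.e. no data is exchanged between threads in this element-wise update.

    T1 ksqr{spectral_setup->ksqr[i]};
    T1 exponential_term_1 = exp(-a * nu_gpu<T1> * ksqr * dt_gpu<T1>); // abbreviated
    T1 exponential_term_2 = exp(-b * nu_gpu<T1> * ksqr * dt_gpu<T1>); // abbreviated

    T2 v = s_ukx[threadIdx.x];
    T2 n = s_nlinx[threadIdx.x];
    v.x = (v.x * exponential_term_1) + (c * dt_gpu<T1> * n.x * exponential_term_2);
    v.y = (v.y * exponential_term_1) + (c * dt_gpu<T1> * n.y * exponential_term_2);
    U->ukx[i] = v;
}

It would be launched with 2 * blockDim.x * sizeof(T2) bytes of dynamic shared memory.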

My second question: is there any way to optimize the exponential calculations?
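
For reference, the damping sum inside the two exp() calls is identical, so one restructuring is to evaluate it once. A minimal sketch, using the same nu_* and dt_gpu device constants as above (the compiler may already perform this common-subexpression elimination):

    // Evaluate the viscous damping sum once and reuse it for both exponential factors.
    // Note: a plain 0.5 literal is double; writing T1(0.5) keeps pow() in single precision when T1 is float.
    T1 damping = (nu_gpu<T1> * ksqr)
               + (nu_hypo_gpu<T1> * pow(ksqr, T1(0.5) * nu_hypo_power_factor_gpu<T1>))
               + (nu_hyper_gpu<T1> * pow(ksqr, T1(0.5) * nu_hyper_power_factor_gpu<T1>));

    T1 exponential_term_1 = exp(-a * damping * dt_gpu<T1>);
    T1 exponential_term_2 = exp(-b * damping * dt_gpu<T1>);

If T1 is float, the single-precision intrinsics __expf() and __powf() (or compiling with --use_fast_math) are also worth trying, at some cost in accuracy.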

One option would be to code up both versions and test/benchmark them.
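
For example, a minimal host-side timing fragment with CUDA events (assuming grid, block and the kernel arguments are whatever you already launch with, and <cstdio> for printf) might look like:

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

// Warm-up launch so one-time overheads are excluded from the measurement.
time_advance_single_step<<<grid, block>>>(U, spectral_setup, univ, a, b, c, final_copy);
cudaDeviceSynchronize();

const int iters = 100;
cudaEventRecord(start);
for (int it = 0; it < iters; ++it)
    time_advance_single_step<<<grid, block>>>(U, spectral_setup, univ, a, b, c, final_copy);
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
printf("average kernel time: %.4f ms\n", ms / iters);

cudaEventDestroy(start);
cudaEventDestroy(stop);

Running the same loop over both variants (and over a range of block sizes) should give a direct comparison.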