In my work, I’m calculating eigenvectors\eigenvalues for multiple specific 6x6 complex-valued matrices. In my case, I’m able to use semianalitical approach to solve this problem. I had wrote a kernel to perform this work and found that grid launch\ run of grid blocks fails when amount of blocks reaches specific number. In this example, 10000 blocks in release mode is enought for GTX1080, but when I use only one block, and printf values everything is ok. If I left in my code only the part that works with evaluation of eigennumbers, excluding the part for vectors calculation, this problem disapears. Can somebody explain what is wrong and how I can fix this.
Windows 10
Visual Studio 2022
CUDA 11.7
Driver 516.59
GTX 1080
how I launching
blocks = 1;
//blocks = 10000; // an illegal memory access will encountered in release version
host_config.EVEN_Db = host_config.EVEN_Db;
knx9_EVEN_LIST_optV5d21_solver_v3d1 << <blocks, 9, 0, host_config.EVEN_Stream >> > (pdevice_config);
cudaStreamSynchronize(host_config.EVEN_Stream);
CUDA_Check(cudaGetLastError());
my kernel
__global__ void knx9_EVEN_LIST_optV5d21_solver_v3d1(DGPU_PROBLEM_CONFIG_t* device_config)
{
//WARNING 9 THREADS FIXED VERSION
//one in layerXproblem = one block
__shared__ cuComplex sh_A2inv[9];
__shared__ cuComplex sh_Abuilt[18];
__shared__ cuComplex sh_temp_det_matrices[36];
__shared__ cuComplex sh_part_of_DET[12];
__shared__ cuComplex sh_interp_points_dets_coefs[8];
__shared__ cuComplex sh_refined_sor_roots_EV_cache[42];
__shared__ cuComplex sh_problem4[4];
__shared__ bool only_real;
//======sys solver
__shared__ int sh_diffrent_en;
__shared__ int sh_sysbuild_worksize;
__shared__ int sh_en_order[6];
__shared__ int sh_store_offsets[6];
__shared__ cuComplex sh_BC_storage[18];
__shared__ cuComplex sh_EN_for_sys[6];
__shared__ cuComplex sh_systems[216];
//============test
__shared__ cuComplex sh_EN_backup[6];
int tid = threadIdx.x;
if (tid == 0)
only_real = device_config->flag_only_real;
int problems_ = device_config->problems_in_use;
int layers_ = device_config->layers_in_use;
int total_ = problems_ * layers_;
//TEST
int bid = blockIdx.x;
bid = 0;
while (bid < total_)
{
//define problem and layer
int problem_ = bid / layers_;
int layer_ = bid % layers_;
if (tid < 3) //3 el*4 matr
{
sh_problem4[tid] = device_config->problems_ptr[problem_ + tid];
}
// printf("check point 1\n");
__syncthreads();
tid = threadIdx.x;
cuComplex temp_gamma = sh_problem4[2];
cuComplex temp_alpha = sh_problem4[0];
cuComplex temp_omega = sh_problem4[1];
//cuComplex temp_z = sh_problem4[3];
float t1f_ = __expf(temp_gamma.y);
float t1f = __fdividef(1.0f, t1f_);
float t2f = __fmul_rn(__cosf(temp_gamma.x), 0.5f);
float t3f = __fmul_rn(__sinf(temp_gamma.x), 0.5f);
float t2mt1 = __fmul_rn(t1f, t2f);
float t3mt1 = __fmul_rn(t1f, t3f);
float t2dt1 = __fmul_rn(t2f, t1f_);
float t3dt1 = __fmul_rn(t3f, t1f_);
cuComplex alpha1 = __cuCmulf(temp_alpha, make_cuComplex(__fadd_rn(t2mt1, t2dt1), __fsub_rn(t3mt1, t3dt1)));
cuComplex alpha2 = __cuCmulf(temp_alpha, make_cuComplex(-__fadd_rn(t3mt1, t3dt1), __fsub_rn(t2mt1, t2dt1)));
cuComplex alpha11 = __cuCmulf(alpha1, alpha1);
cuComplex alpha12 = __cuCmulf(alpha1, alpha2);
cuComplex alpha22 = __cuCmulf(alpha2, alpha2);
//step 1 build lower part of temp matrix
cuComplex t1 = __cuCmulf(device_config->Axx_Storage[tid], alpha11);//alpha_1^2 *A_01
cuComplex t2 = __cuCmulf(device_config->Axx_Storage[tid + 9], alpha22);//alpha_2^2*A_02
cuComplex t3 = __cuCmulf(device_config->Axx_Storage[tid + 18], alpha12);// alpha_1*alpha_2*A_03
cuComplex t4 = __cuCaddf(__cuCaddf(t1, t2), t3);//alpha_1^2 *A_01 + alpha_2^2*A_02+ alpha_1*alpha_2*A_03
cuComplex t5 = __cuCmulf(__cuCmulf(device_config->Axx_Storage[tid + 27], temp_omega), temp_omega);
sh_Abuilt[tid] = __cuCsubf(t4, t5);
t1 = __cuCmulf(device_config->Axx_Storage[tid + 36], alpha1);
t2 = __cuCmulf(device_config->Axx_Storage[tid + 45], alpha2);
sh_Abuilt[tid + 9] = __cuCaddf(t1, t2);
sh_A2inv[tid] = device_config->Axx_Storage[tid + 54];
//step 2 mult lower part of temp matrix with inverse
__syncthreads; //need all work to be done before calculation
int i = tid % 3;
int j = tid / 3;
cuComplex cache_i0 = sh_A2inv[i + 0];
cuComplex cache_i3 = sh_A2inv[i + 3];
cuComplex cache_i6 = sh_A2inv[i + 6];
cuComplex temp = __cuCmulf(cache_i0, sh_Abuilt[0 + j * 3]);
temp = __cuCfmaf(cache_i3, sh_Abuilt[1 + j * 3], temp);
temp = __cuCfmaf(cache_i6, sh_Abuilt[2 + j * 3], temp);
cuComplex cache_sh_Abuilt18 = temp;//B
temp = __cuCmulf(cache_i0, sh_Abuilt[9 + 0 + j * 3]);
temp = __cuCfmaf(cache_i3, sh_Abuilt[9 + 1 + j * 3], temp);
temp = __cuCfmaf(cache_i6, sh_Abuilt[9 + 2 + j * 3], temp);
float diag = __fsub_rn(__saturatef((float)abs(i - j)), 1.0f);
//step 3 calculate determenants
// first is calculate summ of matrix
cuComplex lambda0 = make_cuComplex(1.0f, 0.0f);
cuComplex lambda1 = make_cuComplex(0.0f, 1.0f);
cuComplex lambda2 = make_cuComplex(0.70710677f, 0.70710677f);
cuComplex lambda3 = make_cuComplex(-0.70710677f, 0.70710677f);
cuComplex cache_sh_Abuilt27 = make_cuComplex(-temp.y, temp.x);//C
//caching for system
sh_BC_storage[tid] = cache_sh_Abuilt18;
sh_BC_storage[tid + 9] = cache_sh_Abuilt27;
//printf("lambda_sqrt[0] %f %f\n", lambda.x, lambda.y);
//temp = __cuCfmaf(cuCmulf(lambda, lambda), diag, cache_sh_Abuilt18);
cuComplex temp0 = __cuCsqc_mulr_addc_f(lambda0, diag, cache_sh_Abuilt18);
cuComplex temp1 = __cuCsqc_mulr_addc_f(lambda1, diag, cache_sh_Abuilt18);
cuComplex temp2 = __cuCsqc_mulr_addc_f(lambda2, diag, cache_sh_Abuilt18);
cuComplex temp3 = __cuCsqc_mulr_addc_f(lambda3, diag, cache_sh_Abuilt18);
temp0 = __cuCfmaf(lambda0, cache_sh_Abuilt27, temp0);
temp1 = __cuCfmaf(lambda1, cache_sh_Abuilt27, temp1);
temp2 = __cuCfmaf(lambda2, cache_sh_Abuilt27, temp2);
temp3 = __cuCfmaf(lambda3, cache_sh_Abuilt27, temp3);
sh_temp_det_matrices[tid] = temp0;
sh_temp_det_matrices[tid + 9] = temp1;
sh_temp_det_matrices[tid + 18] = temp2;
sh_temp_det_matrices[tid + 27] = temp3;
//second step is to calculate is determenant exactly
int det_offset = 9 * j;
float sign = (float)(1 - 2 * i % 2);
int index1 = (i + 1) % 3;
int index2 = (i + 2) % 3;
//printf("check point 2\n");
//need all work to be done before calculation
__syncthreads;
cuComplex a = __cuCmulf(sh_temp_det_matrices[det_offset + 1 + 3 * index1], sh_temp_det_matrices[det_offset + 2 + 3 * index2]);
cuComplex b = __cuCmulf(sh_temp_det_matrices[det_offset + 2 + 3 * index1], sh_temp_det_matrices[det_offset + 1 + 3 * index2]);
cuComplex t = __cuCmulf(sh_temp_det_matrices[det_offset + 0 + 3 * i], sign);
sh_part_of_DET[tid] = __cuCmulf(t, __cuCsubf(a, b));
if (tid < 3) //3 el*4 matr
{
det_offset = det_offset + 27;
a = __cuCmulf(sh_temp_det_matrices[det_offset + 1 + 3 * index1], sh_temp_det_matrices[det_offset + 2 + 3 * index2]);
b = __cuCmulf(sh_temp_det_matrices[det_offset + 2 + 3 * index1], sh_temp_det_matrices[det_offset + 1 + 3 * index2]);
t = __cuCmulf(sh_temp_det_matrices[det_offset + 0 + 3 * i], sign);
sh_part_of_DET[9 + tid] = __cuCmulf(t, __cuCsubf(a, b));
}
__syncthreads;
if (tid < 4) //3 el*4 matr
{
sh_interp_points_dets_coefs[tid] = __cuCaddf(__cuCaddf(sh_part_of_DET[0 + 3 * tid], sh_part_of_DET[1 + 3 * tid]), sh_part_of_DET[2 + 3 * tid]);
}
__syncthreads;
//step 4 calculate interpolation polynom + normalize
if (tid < 4) //3 el*4 matr
{
cuComplex temp = __cuCmulf(const_lagrange_matr[tid + 0], sh_interp_points_dets_coefs[0]);
temp = __cuCfmaf(const_lagrange_matr[tid + 4], sh_interp_points_dets_coefs[1], temp);
temp = __cuCfmaf(const_lagrange_matr[tid + 8], sh_interp_points_dets_coefs[2], temp);
temp = __cuCfmaf(const_lagrange_matr[tid + 12], sh_interp_points_dets_coefs[3], temp);
sh_interp_points_dets_coefs[tid + 4] = temp;
}
//printf("check point 3\n");
__syncthreads;
/* if (tid == 0)
{
printf("sh_interp_points_dets_coefs \n ");
for (int i = 0; i < 8; i++)
{
printf(" (%f %f)\n ", sh_interp_points_dets_coefs[i].x, sh_interp_points_dets_coefs[i].y);
}
}
__syncthreads;*/
if (tid == 0)
{
cuComplex a = sh_interp_points_dets_coefs[3 + 4];
cuComplex a_ = __cuCinversef(a);
cuComplex b = __cuCmulf(sh_interp_points_dets_coefs[2 + 4], a_);
cuComplex c = __cuCmulf(sh_interp_points_dets_coefs[1 + 4], a_);
cuComplex d = __cuCmulf(sh_interp_points_dets_coefs[0 + 4], a_);
//a = make_cuComplex(1.0f, 0.0f);
cuComplex b2 = __cuCsquaref(b);
cuComplex delta_cardano = __cuCmulf(b, -0.33333334f);
cuComplex t2_ = __cuCmulf(__cuCmulf(c, b), 0.33333334f);
cuComplex p = __cuCsubf(c, __cuCmulf(b2, 0.33333334f));
cuComplex t1_ = __cuCmulf(__cuCmulf(b, b2), 0.074074075f);
cuComplex q = __cuCaddf(__cuCsubf(t1_, t2_), d);
t1_ = __cuCmulf(p, 0.33333334f);
t2_ = __cuCmulf(q, 0.5f);
cuComplex pd3q = __cuCcubef(t1_);
cuComplex qd2k = __cuCsquaref(t2_);
cuComplex Q = __cuCaddf(pd3q, qd2k);
cuComplex Qrt = __cuCsqrtf(Q);
cuComplex alpha3 = __cuCsubf(Qrt, t2_);
cuComplex alpha = __cuCcbrtf(alpha3);
cuComplex beta = __cuCdivf(t1_, alpha);
cuComplex root0 = __cuCsubf(alpha, beta);
t1_ = __cuCmulf(root0, -0.5f);
t2_ = make_cuComplex(0.0f, 0.8660254f);
cuComplex t3_ = __cuCaddf(alpha, beta);
t3_ = __cuCmulf(t2_, t3_);
t2_ = __cuCaddf(t1_, delta_cardano);
cuComplex root1 = __cuCaddf(t2_, t3_);
cuComplex root2 = __cuCsubf(t2_, t3_);
root0 = __cuCaddf(delta_cardano, root0);
root1 = __cuCsqrtf(root1);
root2 = __cuCsqrtf(root2);
root0 = __cuCsqrtf(root0);
bool isotropic = device_config->ptr_isisotropic[bid]; // cache value before access, memory access is very slow
float sign = (float)(1 - 2 * (root0.x < 0));
root0.x *= sign;
root0.y *= sign;
sign = (float)(1 - 2 * (root1.x < 0));
root1.x *= sign;
root1.y *= sign;
sign = (float)(1 - 2 * (root2.x < 0));
root2.x *= sign;
root2.y *= sign;
if (root0.x < root1.x)
{
cuComplex t = root0;
root0 = root1;
root1 = t;
}
if (root0.x < root2.x)
{
cuComplex t = root0;
root0 = root2;
root2 = t;
}
if (root1.x < root2.x)
{
cuComplex t = root1;
root1 = root2;
root2 = t;
}
//=========================================================================
// FIND ROOTS ORDER AND PACK
// FIND AMOUNT OF DIFFRENT ROOTS
//=========================================================================
float max0 = cuCL1f(root0); //IN sh_refined_sor_roots_EV_cache
float max1 = cuCL1f(root1); //IN is isotropic
float max2 = cuCL1f(root2); //IN EN_diff_threshold
cuComplex d1 = root0 - root1; //OUT list of diffrent EN
cuComplex d2 = root1 - root2; //OUT orders of EN
float d1_abs = cuCL1f(d1) / max(max0, max1);//OUT refined sh_refined_sor_roots_EV_cache
float d2_abs = cuCL1f(d2) / max(max1, max2);
float min_dif = min(d1_abs, d2_abs);
int ind = (int)(d1_abs > d2_abs);
float EN_diff_threshold = 1.0e-5;
bool l_flag_2nd_orderEN = (min_dif < EN_diff_threshold) || isotropic;
// printf("check point 4\n");
//=========================================================================
// PACKING ROOTS
//=========================================================================
int counter = 0;
int no_of_en = 0;
if (l_flag_2nd_orderEN)
{
// printf("ind %d\n", ind);
int i0 = 2 - 1 * ind;
int i1 = 2 - 1 * (1 - ind);
int i2 = 2 - 1 * (1 - ind);
int i3 = 2 - 1 * ind;
sh_en_order[0] = i0;
sh_en_order[1] = i1;
sh_en_order[2] = i2;
sh_en_order[3] = i3;
sh_store_offsets[0] = 6;
int s = 6 + 6 * i0;
sh_store_offsets[1] = s;
s += 6 * i1;
sh_store_offsets[2] = s;
s += 6 * i2;
sh_store_offsets[3] = s;
sh_diffrent_en = 4;
sh_EN_backup[0] = root0; // printf("root 0 %f %f\n", root0.x, root0.y);
sh_EN_backup[1] = root1; // printf("root 1 %f %f\n", root1.x, root1.y);
sh_EN_backup[2] = root2; // printf("root 2 %f %f\n", root2.x, root2.y);
sh_EN_backup[3] = make_cuComplex(-root2.x, -root2.y);
sh_EN_backup[4] = make_cuComplex(-root1.x, -root1.y);
sh_EN_backup[5] = make_cuComplex(-root0.x, -root0.y);
//4numbers are located in temp array
if (ind == 0)
{
cuComplex root_ = __cuCmulf(__cuCaddf(root0, root1), 0.5f);
sh_EN_for_sys[0] = root_;
sh_EN_for_sys[1] = root2;
sh_EN_for_sys[2] = make_cuComplex(-root2.x, -root2.y);
sh_EN_for_sys[3] = make_cuComplex(-root_.x, -root_.y);
}
else
{
cuComplex root_ = __cuCmulf(__cuCaddf(root1, root2), 0.5f);
sh_EN_for_sys[0] = root0;//printf("lroot_ %f %f\n", root_.x, root_.y);
sh_EN_for_sys[1] = root_;
sh_EN_for_sys[2] = make_cuComplex(-root_.x, -root_.y);
sh_EN_for_sys[3] = make_cuComplex(-root0.x, -root0.y);
}
}
else
{// don't need to change anything
sh_diffrent_en = 6;
sh_en_order[0] = 1;
sh_en_order[1] = 1;
sh_en_order[2] = 1;
sh_en_order[3] = 1;
sh_en_order[4] = 1;
sh_en_order[5] = 1;
sh_EN_for_sys[0] = root0;
sh_EN_for_sys[1] = root1;
sh_EN_for_sys[2] = root2;
sh_EN_for_sys[3] = make_cuComplex(-root2.x, -root2.y);
sh_EN_for_sys[4] = make_cuComplex(-root1.x, -root1.y);
sh_EN_for_sys[5] = make_cuComplex(-root0.x, -root0.y);
sh_store_offsets[0] = 6;
sh_store_offsets[1] = 12;
sh_store_offsets[2] = 18;
sh_store_offsets[3] = 24;
sh_store_offsets[4] = 30;
sh_store_offsets[5] = 36;
}
sh_sysbuild_worksize = sh_diffrent_en * 9; //
// printf("bid %d sh_sysbuild_worksize %d\n", bid, sh_sysbuild_worksize);
// upper and botton part is building in parralel, so x9 not x18
}
//printf("check point 5\n");
tid = threadIdx.x;
cuComplex cache1 = sh_BC_storage[tid];
cuComplex cache2 = sh_BC_storage[tid + 9];
for (int ev_id = 0; ev_id < sh_diffrent_en; ev_id++)
{// i, j found at start of "while"
cuComplex ev = sh_EN_for_sys[ev_id];
int fast_index = ev_id * 36 + i + j * 6;
cuComplex t = make_cuComplex(-diag, 0.0f) * ev;
float diag = 1.0f - __saturatef((float)abs(i - j));
sh_systems[fast_index] = t;
sh_systems[fast_index + 18] = make_cuComplex(diag, 0.0f);
sh_systems[fast_index + 3] = cache1;// [B |C-lE ]
sh_systems[fast_index + 21] = cache2 + t;
}
__syncthreads();
/*
if (tid == 0)
{
printf("bid %d sh_systems start\n", bid);
for (int i = 0; i < 6; i++)
{
for (int j_ = 0; j_ < 6; j_++)
{
cuComplex x = sh_systems[i + j_ * 6];
printf("(%- .5ff, %- .5ff) ", x.x, x.y);
}
printf("\n");
}
printf("\n");
}
__syncthreads();
*/
//====================================================
// SYSTEM REDUCTION -FIRST THREE COLS/ROWS
//=====================================================
//======================================================
// PROBLEM HERE
//=======================================================
while (tid < sh_diffrent_en * 3)
{
int tid_ = tid % 3;
int mid_ = tid / 3;
for (int i = 0; i < 3; i++)
{
cuComplex lead = sh_systems[36 * mid_ + (3 + tid_) + 3 * 6];
for (int ii = 0; ii < 6; ii++)
{
sh_systems[36 * mid_ + (3 + tid_) + ii * 6] = sh_systems[36 * mid_ + (3 + tid_) + ii * 6] - (lead * sh_systems[36 * mid_ + 0 + ii * 6]);
}
lead = sh_systems[36 * mid_ + (3 + tid_) + 4 * 6];
for (int ii = 0; ii < 6; ii++)
{
sh_systems[36 * mid_ + (3 + tid_) + ii * 6] = sh_systems[36 * mid_ + (3 + tid_) + ii * 6] - (lead * sh_systems[36 * mid_ + 1 + ii * 6]);
}
lead = sh_systems[36 * mid_ + (3 + tid_) + 5 * 6];
for (int ii = 0; ii < 6; ii++)
{
sh_systems[36 * mid_ + (3 + tid_) + ii * 6] = sh_systems[36 * mid_ + (3 + tid_) + ii * 6] - (lead * sh_systems[36 * mid_ + 2 + ii * 6]);
}
}
tid += blockDim.x;
}
tid = threadIdx.x;
/*
if (tid == 0)
{
printf("bid %d sh_systems red 3\n", bid);
for (int i = 0; i < 6; i++)
{
for (int j_ = 0; j_ < 6; j_++)
{
cuComplex x = sh_systems[i + j_ * 6];
printf("(%- .5ff, %- .5ff) ", x.x, x.y);
}
printf("\n");
}
printf("\n");
}
*/
__syncthreads();
//====================================================
// SYSTEM REDUCTION -SECOND THREE COLS/ROWS - NOT IMPORTANT FOR PROBLEM DETECTION
//=====================================================
/*
__syncthreads();
int target_tid = 0;
if (tid < sh_diffrent_en)
{
cuComplex r0 = sh_systems[36 * tid + (3 + 0) + 0 * 6];
cuComplex r1 = sh_systems[36 * tid + (3 + 0) + 1 * 6];
cuComplex r2 = sh_systems[36 * tid + (3 + 0) + 2 * 6];
float L1_0 = cuCL1f(r0) + cuCL1f(r1) + cuCL1f(r2);
float max = L1_0;
r0 = sh_systems[36 * tid + (3 + 1) + 0 * 6];
r1 = sh_systems[36 * tid + (3 + 1) + 1 * 6];
r2 = sh_systems[36 * tid + (3 + 1) + 2 * 6];
float L1_1 = cuCL1f(r0) + cuCL1f(r1) + cuCL1f(r2);
int index_max = L1_1 > max;
max = (float)(1 - index_max) * L1_0 + L1_1 * (float)index_max;
r0 = sh_systems[36 * tid + (3 + 2) + 0 * 6];
r1 = sh_systems[36 * tid + (3 + 2) + 1 * 6];
r2 = sh_systems[36 * tid + (3 + 2) + 2 * 6];
float L1_2 = cuCL1f(r0) + cuCL1f(r1) + cuCL1f(r2);
int classifier = L1_2 > max;
index_max = index_max * (1 - classifier) + 2 * classifier;
max = max * (float)(1 - classifier) + L1_2 * (float)(classifier);
if (target_tid == tid)
{
printf("L1_0 %f \n", L1_0);
printf("L1_1 %f \n", L1_1);
printf("L1_2 %f \n", L1_2);
printf("max %f \n", max);
printf("index_max %d \n", index_max);
}
cuComplex t0 = sh_systems[36 * tid + (index_max + 3) + 0 * 6];
cuComplex t1 = sh_systems[36 * tid + (index_max + 3) + 1 * 6];
cuComplex t2 = sh_systems[36 * tid + (index_max + 3) + 2 * 6];
sh_systems[36 * tid + (index_max + 3) + 0 * 6] = sh_systems[36 * tid + (0 + 3) + 0 * 6];
sh_systems[36 * tid + (index_max + 3) + 1 * 6] = sh_systems[36 * tid + (0 + 3) + 1 * 6];
sh_systems[36 * tid + (index_max + 3) + 2 * 6] = sh_systems[36 * tid + (0 + 3) + 2 * 6];
cuComplex L22 = (__cuCsqabsf(t0) + __cuCsqabsf(t1)) + __cuCsqabsf(t2);
cuComplex Linv =make_cuComplex( - 1.0 / sqrtf(L22.x),0.0f);
t0 = t0 * Linv;
t1 = t1 * Linv;
t2 = t2 * Linv;
L22 = (__cuCsqabsf(t0) + __cuCsqabsf(t1)) + __cuCsqabsf(t2);
Linv = make_cuComplex(-1.0 / sqrtf(L22.x), 0.0f);
if (target_tid == tid)
{
printf("L22 %f %f \n", L22.x, L22.y);
printf("Linv %f %f \n", Linv.x, Linv.y);
printf("t0 %f %f \n", t0.x, t0.y);
printf("t1 %f %f \n", t1.x, t1.y);
printf("t2 %f %f \n", t2.x, t2.y);
}
sh_systems[36 * tid + (0 + 3) + 0 * 6] = t0;
sh_systems[36 * tid + (0 + 3) + 1 * 6] = t1;
sh_systems[36 * tid + (0 + 3) + 2 * 6] = t2;
if (target_tid == tid)
{
printf("target_tid == %d sh_systems\n", target_tid);
for (int i = 0; i < 6; i++)
{
for (int j_ = 0; j_ < 6; j_++)
{
cuComplex x = sh_systems[i + j_ * 6];
printf("(%- .5ff, %- .5ff) ", x.x, x.y);
}
printf("\n");
}
printf("\n");
}
__syncthreads();
for (int ii = 1; ii < 6; ii++)
{
int index_i = (3 + ii) % 6;
cuComplex b0 = sh_systems[36 * tid + index_i + 0 * 6];
cuComplex b1 = sh_systems[36 * tid + index_i + 1 * 6];
cuComplex b2 = sh_systems[36 * tid + index_i + 2 * 6];
if (target_tid == tid)
{
printf("b0 %f %f \n", b0.x, b0.y); //some math mistakes here is not important now
printf("b1 %f %f \n", b1.x, b1.y);
printf("b2 %f %f \n", b2.x, b2.y);
}
cuComplex scalp = __cuCmul_conjf(b0, t0);
scalp = scalp + __cuCmul_conjf(b1, t1);
scalp = scalp + __cuCmul_conjf(b2, t2);
cuComplex k = scalp * Linv;
if (target_tid == tid)
{
printf("scalp %f %f \n", scalp.x, scalp.y);
printf("k %f %f \n", k.x, k.y);
}
for (int jj = 0; jj < 6; jj++)
{
sh_systems[36 * tid + index_i + jj * 6] = __cuCfmaf(k, sh_systems[36 * tid + 3 + jj * 6],sh_systems[36 * tid + index_i + jj * 6]);
}
}
if (target_tid == tid)
{
printf("target_tid == %d sh_systems red pre 6\n", target_tid);
for (int i = 0; i < 6; i++)
{
for (int j_ = 0; j_ < 6; j_++)
{
cuComplex x = sh_systems[i + j_ * 6];
printf("(%- .5ff, %- .5ff) ", x.x, x.y);
}
printf("\n");
}
printf("\n");
}
__syncthreads();
int ev_order = sh_en_order[tid];
if (ev_order == 1)
{
r0 = sh_systems[36 * tid + (3 + 1) + 0 * 6];
r1 = sh_systems[36 * tid + (3 + 1) + 1 * 6];
r2 = sh_systems[36 * tid + (3 + 1) + 2 * 6];
float L1_1 = cuCL1f(r0) + cuCL1f(r1) + cuCL1f(r2);
int index_max = 1;
max = L1_1;
r0 = sh_systems[36 * tid + (3 + 2) + 0 * 6];
r1 = sh_systems[36 * tid + (3 + 2) + 1 * 6];
r2 = sh_systems[36 * tid + (3 + 2) + 2 * 6];
float L1_2 = cuCL1f(r0) + cuCL1f(r1) + cuCL1f(r2);
int classifier = L1_2 > max;
index_max = index_max * (1 - classifier) + 2 * classifier;
max = max * (float)(1 - classifier) + L1_2 * (float)(classifier);
t0 = sh_systems[36 * tid + (index_max + 3) + 0 * 6];
t1 = sh_systems[36 * tid + (index_max + 3) + 1 * 6];
t2 = sh_systems[36 * tid + (index_max + 3) + 2 * 6];
sh_systems[36 * tid + (index_max + 3) + 0 * 6] = sh_systems[36 * tid + (1 + 3) + 0 * 6];
sh_systems[36 * tid + (index_max + 3) + 1 * 6] = sh_systems[36 * tid + (1 + 3) + 1 * 6];
sh_systems[36 * tid + (index_max + 3) + 2 * 6] = sh_systems[36 * tid + (1 + 3) + 2 * 6];
cuComplex L22 = (__cuCsqabsf(t0) + __cuCsqabsf(t1)) + __cuCsqabsf(t2);
cuComplex Linv = make_cuComplex(-1.0 / sqrtf(L22.x), 0.0f);
t0 = t0 * Linv;
t1 = t1 * Linv;
t2 = t2 * Linv;
L22 = (__cuCsqabsf(t0) + __cuCsqabsf(t1)) + __cuCsqabsf(t2);
Linv = make_cuComplex(-1.0 / sqrtf(L22.x), 0.0f);
sh_systems[36 * tid + (1 + 3) + 0 * 6] = t0;
sh_systems[36 * tid + (1 + 3) + 1 * 6] = t1;
sh_systems[36 * tid + (1 + 3) + 2 * 6] = t2;
for (int ii = 1; ii < 6; ii++)
{
int index_i = (4 + ii) % 6;
cuComplex b0 = sh_systems[36 * tid + index_i + 0 * 6];
cuComplex b1 = sh_systems[36 * tid + index_i + 1 * 6];
cuComplex b2 = sh_systems[36 * tid + index_i + 2 * 6];
if (target_tid == tid)
{
printf("b0 %f %f \n", b0.x, b0.y);
printf("b1 %f %f \n", b1.x, b1.y);
printf("b2 %f %f \n", b2.x, b2.y);
}
cuComplex scalp = __cuCmul_conjf(b0, t0);
scalp = scalp + __cuCmul_conjf(b1, t1);
scalp = scalp + __cuCmul_conjf(b2, t2);
cuComplex k = scalp * Linv;
if (target_tid == tid)
{
printf("scalp %f %f \n", scalp.x, scalp.y);
printf("k %f %f \n", k.x, k.y);
}
for (int jj = 0; jj < 6; jj++)
{
sh_systems[36 * tid + index_i + jj * 6] = __cuCfmaf(k, sh_systems[36 * tid + 3 + jj * 6], sh_systems[36 * tid + index_i + jj * 6]);
}
}
}
}
tid = threadIdx.x;
__syncthreads();
if (target_tid == tid )
{
printf("target_tid == %d sh_systems red final 6\n", target_tid );
for (int i = 0; i < 6; i++)
{
for (int j_ = 0; j_ < 6; j_++)
{
cuComplex x = sh_systems[i + j_ * 6];
printf("(%- .5ff, %- .5ff) ", x.x, x.y);
}
printf("\n");
}
printf("\n");
}
__syncthreads()
*/
//if (tid == 0)
//{
// printf("bid %d EV EN \n", bid);
// printf("bid %d ", bid);
// for (int i = 0; i < 6; i++)
// {
// for (int j_ = 0; j_ < 7; j_++)
// {
// cuComplex x = sh_refined_sor_roots_EV_cache[i + j_ * 6];
// printf("(%- .5ff, %- .5ff) ", x.x, x.y);
// }
// printf("\n");
// }
// printf("\n");
//}
//__syncthreads();
//==================================================================
//SAVE TO GDDR
//==================================================================
tid = threadIdx.x;
//while (tid < 42)
//{
// device_config->EVEN_Storage[bid * 42 + tid] = sh_refined_sor_roots_EV_cache[tid];
// tid = tid + blockDim.x;
//}
bid += gridDim.x;
//printf("check point 9\n");
bid = total_ + 1;
}
};