I implemented a multiple-precision (MP) float and integer library for CUDA and use its MP functions in the kernel listed below. However, the kernel computes a wrong value for element k of vector c (it remains 0) whenever the thread block dimension is greater than 11x11. In device emulation mode it works, and if the code is compiled for the device with the -g -G flags, it also works correctly.
Note that element k of vector c is written only once, at line 122 (the last instruction), and it depends only on vectors b2 and s, both of which are computed correctly. It seems that the kernel is simply too large for thread block dimensions greater than 11x11: after I split the kernel into two, it worked.
Consequently, it looks as if the maximum number of instructions per kernel depends on the thread block dimension. Is there an explanation for this behavior?
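For reference, this is the error check I would place right after the kernel call (a minimal sketch; the kernel arguments are elided as in the launch code below). A launch that requests more resources per block than the device provides fails without any visible effect, which would leave c untouched:

[codebox]
mpf_k_gso<<<dimGrid, dimBlock>>>(…); // arguments elided, as below

// Launch-time error, e.g. "too many resources requested for launch"
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
    printf("launch failed: %s\n", cudaGetErrorString(err));

// Kernel launches are asynchronous; synchronize to get execution errors
err = cudaThreadSynchronize();
if (err != cudaSuccess)
    printf("kernel failed: %s\n", cudaGetErrorString(err));
[/codebox]

The launch configuration and the kernel itself: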
[codebox]
dim3 dimBlock(block_size, block_size);
dim3 dimGrid(ceil(COLUMNS(a) / (float)dimBlock.x), ceil(ROWS(a) / (float)dimBlock.y));
…
mpf_k_gso<<<dimGrid, dimBlock>>>(…);
…
// Kernel
55 template <mp_d_size_t Z, mp_d_size_t F>
56 __global__ void
57 mpf_k_gso(struct mp_matrix_d<mpz_d > b,
58 struct mp_matrix_d<mpf_d > _b,
59 struct mp_matrix_d<mpf_d > mu,
60 struct mp_matrix_d<mpf_d > b2,
61 struct mp_matrix_d<mpf_d > c,
62 long k,
63 mpf_d *bound,
64 long st,
65 struct mp_matrix_d<mpf_d > buf,
66 struct mp_matrix_d<mpf_d > s,
67 mpf_d *bound2,
68 struct mp_matrix_d<mpf_d > tmpf) {
69
70 long i, j;
71 mpz_d t1z;
72
73 mp_d_size_t row = blockIdx.y * blockDim.y + threadIdx.y + 1;
74 mp_d_size_t col = blockIdx.x * blockDim.x + threadIdx.x + 1;
75
76 mpz_d_init(&t1z);
77
78 #warning Only one CUDA thread is enabled for mpf_k_gso
79 if (row == 1 && col == 1) {
80 if(st < k) {
81 for (i = 1; i < st; i++)
82 mpf_d_mul(&VECTOR_IDX1(buf, i), &CIDX1(mu, k, i), &VECTOR_IDX1(c, i));
83 }
84
85 // Compute new s_j with st <= j < k
86 for (j = st; j < k; j++) {
87 mpf_d_set_si(&VECTOR_IDX1(s,j), 0);
88 mp_vector_d_inner(&VECTOR_IDX1(s,j), _b, _b, k-1, j-1);
89 mpf_d_pow_ui(&TMPF2(j), &VECTOR_IDX1(s,j), 2);
90
91 mpf_d_set(&TMPF3(j), &TMPF2(j));
92 mpf_d_mul(&TMPF2(j), &TMPF3(j), bound); // Store bound in constant memory
93 mpf_d_mul(&TMPF1(j), &VECTOR_IDX1(b2,k), &VECTOR_IDX1(b2,j));
94
95 if (mpf_d_cmp(&TMPF1(j), bound2) >= 0 && mpf_d_cmp(&TMPF1(j), &TMPF2(j)) >= 0) {
96 mpz_d_set_si(&t1z, 0);
97 mp_vector_d_inner(&t1z, b, b, k-1, j-1);
98 mpf_d_set_z(&VECTOR_IDX1(s,j), &t1z);
99 }
100 }
101
102 // Compute new mu_{k,j} and buf_j
103 for (j = st; j < k; j++) {
104 mpf_d_set_si(&TMPF2(j), 0);
105 for (i = 1; i < j; i++) {
106 mpf_d_mul(&TMPF1(j), &CIDX1(mu,j,i), &VECTOR_IDX1(buf,i));
107 mpf_d_set(&TMPF3(j), &TMPF2(j));
108 mpf_d_add(&TMPF2(j), &TMPF3(j), &TMPF1(j)); //t1 += mu_{j,i} * buf_i
109 }
110
111 mpf_d_sub(&VECTOR_IDX1(buf,j), &VECTOR_IDX1(s,j), &TMPF2(j));
112 mpf_d_div_d(&CIDX1(mu,k,j), &VECTOR_IDX1(buf,j), &VECTOR_IDX1(c,j));
113 }
114 // Compute c_k
115 mpf_d_set_si(&VECTOR_IDX(s, 0), 0);
116 for (i = 1; i < k; i++) {
117 mpf_d_mul(&TMPF1(1), &CIDX1(mu, k, i), &VECTOR_IDX1(buf, i));
118 mpf_d_set(&TMPF2(1), &VECTOR_IDX(s, 0));
119 mpf_d_add(&VECTOR_IDX(s, 0), &TMPF2(1), &TMPF1(1)); // s += mu_{k,i} * buf_i
120 }
121
122 mpf_d_sub(&VECTOR_IDX1(c,k), &VECTOR_IDX1(b2,k), &VECTOR_IDX(s,0));
123 }
124 }
[/codebox]
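For completeness, the resource footprint of the compiled kernel can be inspected at compile time with nvcc --ptxas-options=-v (which prints the registers per thread), or queried at run time. A sketch, assuming the same mpf_k_gso<Z, F> instantiation that is launched above:

[codebox]
// Sketch: read back the kernel's compiled resource usage. A block of
// block_size * block_size threads needs numRegs registers per thread,
// all of which must fit into the multiprocessor's register file, so a
// 12x12 block can fail to launch where an 11x11 block still fits.
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr, (const void *)mpf_k_gso<Z, F>);
printf("registers/thread: %d\n", attr.numRegs);
printf("local mem/thread: %lu bytes\n", (unsigned long)attr.localSizeBytes);
printf("max threads/block: %d\n", attr.maxThreadsPerBlock);
[/codebox]

If registers are the limit, numRegs multiplied by the threads per block would exceed the register file for a 12x12 block but not for an 11x11 one; device emulation and -g -G both change code generation, which would be consistent with the symptoms I see.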