Does the maximum number of instructions per kernel depend on the thread block dimension?

I implemented a multiple-precision (MP) float and integer library for CUDA and use its MP functions in the kernel listed below. However, the kernel computes a wrong value for element k of vector c (it remains 0) whenever the thread block dimension is greater than 11x11. In device emulation mode it works, and if the code is compiled for the device with the -g -G flags, it also works correctly.
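For reference, the three build configurations boil down to something like this (illustrative only; the file name is a placeholder and the real invocations in my makefile carry more flags):

[codebox]

nvcc mpf.cu # device build: c[k] stays 0 for blocks > 11x11

nvcc -deviceemu mpf.cu # device emulation: works

nvcc -g -G mpf.cu # device build with debug info: works

[/codebox]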

Note that element k of vector c is computed only once, at Line 122 (the last instruction); it depends on vectors b2 and s, which are both computed correctly. It seems that the kernel is too large for thread block dimensions greater than 11x11: after I split the kernel into two (see the sketch below the code box), it worked.

Consequently, it seems that the maximum number of instructions per kernel depends on the thread block dimension. Is there an explanation for this behavior?

[codebox]

dim3 dimBlock(block_size, block_size);
dim3 dimGrid(ceil(COLUMNS(a) / (float)dimBlock.x), ceil(ROWS(a) / (float)dimBlock.y));

mpf_k_gso<<<dimGrid, dimBlock>>>(…);
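
// Not in my original code: a minimal launch-failure check, added here for
// illustration (cudaGetLastError() and cudaThreadSynchronize() are standard
// CUDA runtime calls; the error handling itself is only a sketch).
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
    printf("mpf_k_gso launch failed: %s\n", cudaGetErrorString(err));
cudaThreadSynchronize();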

// Kernel

 55 template <mp_d_size_t Z, mp_d_size_t F>
 56 __global__ void
 57 mpf_k_gso(struct mp_matrix_d<mpz_d> b,
 58           struct mp_matrix_d<mpf_d> _b,
 59           struct mp_matrix_d<mpf_d> mu,
 60           struct mp_matrix_d<mpf_d> b2,
 61           struct mp_matrix_d<mpf_d> c,
 62           long k,
 63           mpf_d *bound,
 64           long st,
 65           struct mp_matrix_d<mpf_d> buf,
 66           struct mp_matrix_d<mpf_d> s,
 67           mpf_d *bound2,
 68           struct mp_matrix_d<mpf_d> tmpf) {
 69
 70     long i, j;
 71     mpz_d t1z;
 72
 73     mp_d_size_t row = blockIdx.y * blockDim.y + threadIdx.y + 1;
 74     mp_d_size_t col = blockIdx.x * blockDim.x + threadIdx.x + 1;
 75
 76     mpz_d_init(&t1z);
 77
 78 #warning Only one CUDA thread is enabled for mpf_k_gso
 79     if (row == 1 && col == 1) {
 80         if (st < k) {
 81             for (i = 1; i < st; i++)
 82                 mpf_d_mul(&VECTOR_IDX1(buf, i), &CIDX1(mu, k, i), &VECTOR_IDX1(c, i));
 83         }
 84
 85         // Compute new s_j with st <= j < k
 86         for (j = st; j < k; j++) {
 87             mpf_d_set_si(&VECTOR_IDX1(s, j), 0);
 88             mp_vector_d_inner(&VECTOR_IDX1(s, j), _b, _b, k-1, j-1);
 89             mpf_d_pow_ui(&TMPF2(j), &VECTOR_IDX1(s, j), 2);
 90
 91             mpf_d_set(&TMPF3(j), &TMPF2(j));
 92             mpf_d_mul(&TMPF2(j), &TMPF3(j), bound); // Store bound in constant memory
 93             mpf_d_mul(&TMPF1(j), &VECTOR_IDX1(b2, k), &VECTOR_IDX1(b2, j));
 94
 95             if (mpf_d_cmp(&TMPF1(j), bound2) >= 0 && mpf_d_cmp(&TMPF1(j), &TMPF2(j)) >= 0) {
 96                 mpz_d_set_si(&t1z, 0);
 97                 mp_vector_d_inner(&t1z, b, b, k-1, j-1);
 98                 mpf_d_set_z(&VECTOR_IDX1(s, j), &t1z);
 99             }
100         }
101
102         // Compute new mu_{k,j} and buf_j
103         for (j = st; j < k; j++) {
104             mpf_d_set_si(&TMPF2(j), 0);
105             for (i = 1; i < j; i++) {
106                 mpf_d_mul(&TMPF1(j), &CIDX1(mu, j, i), &VECTOR_IDX1(buf, i));
107                 mpf_d_set(&TMPF3(j), &TMPF2(j));
108                 mpf_d_add(&TMPF2(j), &TMPF3(j), &TMPF1(j)); // t1 += mu_{j,i} * buf_i
109             }
110
111             mpf_d_sub(&VECTOR_IDX1(buf, j), &VECTOR_IDX1(s, j), &TMPF2(j));
112             mpf_d_div_d(&CIDX1(mu, k, j), &VECTOR_IDX1(buf, j), &VECTOR_IDX1(c, j));
113         }
114         // Compute c_k
115         mpf_d_set_si(&VECTOR_IDX(s, 0), 0);
116         for (i = 1; i < k; i++) {
117             mpf_d_mul(&TMPF1(1), &CIDX1(mu, k, i), &VECTOR_IDX1(buf, i));
118             mpf_d_set(&TMPF2(1), &VECTOR_IDX(s, 0));
119             mpf_d_add(&VECTOR_IDX(s, 0), &TMPF2(1), &TMPF1(1)); // s = s + mu_{k,j} * buf_j
120         }
121
122         mpf_d_sub(&VECTOR_IDX1(c, k), &VECTOR_IDX1(b2, k), &VECTOR_IDX(s, 0));
123     }
124 }

[/codebox]
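
This is roughly how the split into two kernels looks. It is only a sketch: the names mpf_k_gso_part1/mpf_k_gso_part2 are placeholders, the parameter lists are copied from mpf_k_gso, and the bodies are the statements from the listing above (the comments name the corresponding lines).

[codebox]

template <mp_d_size_t Z, mp_d_size_t F>
__global__ void
mpf_k_gso_part1(struct mp_matrix_d<mpz_d> b, struct mp_matrix_d<mpf_d> _b,
                struct mp_matrix_d<mpf_d> mu, struct mp_matrix_d<mpf_d> b2,
                struct mp_matrix_d<mpf_d> c, long k, mpf_d *bound, long st,
                struct mp_matrix_d<mpf_d> buf, struct mp_matrix_d<mpf_d> s,
                mpf_d *bound2, struct mp_matrix_d<mpf_d> tmpf) {
    // Body of Lines 70-113: buf_i, the new s_j with st <= j < k,
    // and the new mu_{k,j} and buf_j
}

template <mp_d_size_t Z, mp_d_size_t F>
__global__ void
mpf_k_gso_part2(struct mp_matrix_d<mpz_d> b, struct mp_matrix_d<mpf_d> _b,
                struct mp_matrix_d<mpf_d> mu, struct mp_matrix_d<mpf_d> b2,
                struct mp_matrix_d<mpf_d> c, long k, mpf_d *bound, long st,
                struct mp_matrix_d<mpf_d> buf, struct mp_matrix_d<mpf_d> s,
                mpf_d *bound2, struct mp_matrix_d<mpf_d> tmpf) {
    // Body of Lines 114-122: accumulate s and compute c_k
}

// Host side: two launches on the same (default) stream execute in order,
// so part2 sees the results written by part1.
mpf_k_gso_part1<<<dimGrid, dimBlock>>>(…);
mpf_k_gso_part2<<<dimGrid, dimBlock>>>(…);

[/codebox]

With this split, each kernel is roughly half the size, and both run correctly for thread block dimensions beyond 11x11.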