Compiler bug?

frmky · July 23, 2009, 11:52pm

Hi,

I’m trying to use a multiprecision modular multiplication routine, but I’m getting the wrong answer on the device. The host running the same code gives the correct answer. The code below was compiled and run using the 190.16 Linux 64-bit driver, 2.3 64-bit toolkit, and the 2.3 SDK.

Output:

Calculating 1100946504954263218833^2 mod 1766730974551267606529

Device: 127327064766967149491

Host: 1662573381987388132102

Does anyone have any hints where the device may be going wrong, and ways to work around it?

Thanks!

[codebox]#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <math.h>

#include <cutil_inline.h>

/* Multiprecision routines adapted from those of Jason Papadopoulos */

/* as distributed in msieve */

#define MAX_MP_WORDS 6

#define MP_RADIX 4294967296.0

typedef unsigned int uint32;

typedef int int32;

typedef unsigned long long uint64;

typedef struct {

uint32 nwords;		/* number of nonzero words in val[] */

uint32 val[MAX_MP_WORDS];

} mp_t;

typedef struct {

uint32 nwords;

uint32 val[2 * MAX_MP_WORDS];

} big_mp_t;

#define MIN(a,b) ((a) < (b)? (a) : (b))

#define MAX(a,b) ((a) > (b)? (a) : (b))

#define mp_is_zero(a) ((a)->nwords == 0)

host device static void mp_clear(mp_t *a);

host device static void mp_copy(mp_t *a, mp_t *b);

host device static int32 mp_cmp(const mp_t *a, const mp_t *b);

host device static uint32 num_nonzero_words(uint32 *x, uint32 max_words);

host device static void mp_addmul_1(mp_t *a, uint32 b, uint32 *x);

host device static uint32 mp_submul_1(uint32 *a, uint32 b, uint32 words, uint32 *x);

host device static uint32 mp_divrem_1(mp_t *num, uint32 denom, mp_t *quot);

host device static void mp_divrem_core(big_mp_t *num, mp_t *denom, mp_t *quot, mp_t *rem);

host device static void mp_modmul(mp_t *a, mp_t *b, mp_t *n, mp_t *res);

host device static uint32 mp_mod_1(mp_t *num, uint32 denom);

host device static uint32 mp_mod_1_core(uint32 *num, uint32 nwords, uint32 denom);

char * mp_print(mp_t *a, uint32 base, FILE *f, char *scratch);

#define mp_printf(a, base, scratch) mp_print(a, base, stdout, scratch)

#define mp_fprintf(a, base, f, scratch) mp_print(a, base, f, scratch)

#define mp_sprintf(a, base, scratch) mp_print(a, base, NULL, scratch)

void TestHost(mp_t *k);

global void Test(mp_t *k);

int main(void) {

mp_t k, k2;

mp_t *d_k;

char scratch[32*MAX_MP_WORDS+1];

CUDA_SAFE_CALL(cudaSetDevice(0));

CUDA_SAFE_CALL(cudaMalloc((void**) &d_k, sizeof(mp_t)));

Test<<<1, 32>>>(d_k);

	TestHost(&k2);

CUDA_SAFE_CALL(cudaThreadSynchronize());

	

CUDA_SAFE_CALL(cudaMemcpy(&k, d_k, sizeof(mp_t), cudaMemcpyDeviceToHost));

printf("Calculating 1100946504954263218833^2 mod 1766730974551267606529\n");

printf("Device: "); mp_printf(&k, 10, scratch); printf("\n");

printf("Host:   "); mp_printf(&k2, 10, scratch); printf("\n");

cudaFree(d_k);

}

void TestHost(mp_t *k) {

mp_t b,fac;

int i;

mp_clear(&b);

mp_clear(&fac);

b.nwords=3;

b.val[0]=4205947537;

b.val[1]=2931012912;

b.val[2]=59;

fac.nwords=3;

fac.val[0]=1;

fac.val[1]=3327216848;

fac.val[2]=95;

mp_modmul(&b,&b,&fac,&b);

for (i=0;i<MAX_MP_WORDS;i++) k->val[i]=b.val[i];

k->nwords=b.nwords;

}

global void Test(mp_t *k) {

mp_t b,fac;

int i;

mp_clear(&b);

mp_clear(&fac);

b.nwords=3;

b.val[0]=4205947537;

b.val[1]=2931012912;

b.val[2]=59;

fac.nwords=3;

fac.val[0]=1;

fac.val[1]=3327216848;

fac.val[2]=95;

mp_modmul(&b,&b,&fac,&b);

if (blockIdx.x * blockDim.x + threadIdx.x == 0) {

	for (i=0;i<MAX_MP_WORDS;i++) k->val[i]=b.val[i];

	k->nwords=b.nwords;

}

}

/* ----------------- */

/* MP Routines */

/* ----------------- */

host device static void mp_clear(mp_t *a) {

int i;

a->nwords=0;

#pragma unroll

for (i=0;i<MAX_MP_WORDS;i++) a->val[i]=0;

}

host device static void mp_copy(mp_t *a, mp_t *b) {

*b = *a;

}

host device static int32 mp_cmp(const mp_t *a, const mp_t *b) {

int32 i;

if (a->nwords > b->nwords)

	return 1;

if (a->nwords < b->nwords)

	return -1;

for (i = a->nwords - 1; i >= 0; i--) {

	if (a->val[i] > b->val[i])

		return 1;

	if (a->val[i] < b->val[i])

		return -1;

}

return 0;

}

host device static uint32 num_nonzero_words(uint32 *x, uint32 max_words) {

uint32 i;

for (i = max_words; i && !x[i-1]; i--);

return i;

}

host device static void mp_addmul_1(mp_t *a, uint32 b, uint32 *x) {

uint32 words = a->nwords;

uint32 carry = 0;

uint32 i;

uint64 acc;

for (i = 0; i < words; i++) {

	acc = (uint64)a->val[i] * (uint64)b + 

	      (uint64)carry +

	      (uint64)x[i];

	x[i] = (uint32)acc;

	carry = (uint32)(acc >> 32);

}

/* avoid writing the carry if zero. This is needed to

   avoid a buffer overrun when callers are multiplying

   integers whose product requires MP_MAX_WORDS words 

   of storage. A side effect of this is that callers 

   must guarantee that x[words] is initialized to zero 

   when mp_addmul_1 is called */

if (carry) x[words] = carry;

}

host device static uint32 mp_submul_1(uint32 *a, uint32 b,

		uint32 words, uint32 *x) {

uint32 carry = 0;

uint32 i, j, k;

for (i = 0; i < words; i++) {

	uint64 acc = (uint64)a[i] * (uint64)b + (uint64)carry;

	k = x[i];

	j = (uint32)acc;

	carry = (uint32)(acc >> 32);

	x[i] = k - j;

	carry += (x[i] > k);

}

return carry;

}

host device static uint32 mp_divrem_1(mp_t *num, uint32 denom, mp_t *quot) {

int32 i;

uint32 rem = 0;

if (quot == NULL)

	return mp_mod_1(num, denom);

for (i = (int32)num->nwords; i < MAX_MP_WORDS; i++)

	quot->val[i] = 0;



i = num->nwords - 1;

if (num->val[i] < denom) {

	rem = num->val[i];

	quot->val[i--] = 0;

}

while (i >= 0) {

	uint64 acc = (uint64)rem << 32 | (uint64)num->val[i];

	quot->val[i] = (uint32)(acc / (uint64)denom);

	rem = (uint32)(acc % (uint64)denom);

	i--;

}

i = num->nwords;

quot->nwords = i;

if (i && quot->val[i-1] == 0)

	quot->nwords--;

return rem;

}

host device static void mp_divrem_core(big_mp_t *num, mp_t *denom, mp_t *quot, mp_t *rem) {

int32 i, j, k;

uint32 shift, comp_shift;

uint32 high_denom, low_denom;

uint32 nacc[2*MAX_MP_WORDS+1];

uint32 dacc[MAX_MP_WORDS];

mp_clear(quot);

mp_clear(rem);

i = num->nwords;

j = denom->nwords;

if (mp_cmp((mp_t *)num, denom) < 0) {

	mp_copy((mp_t *)num, rem);  /* copy low half of num only! */

	return;

}

if (j <= 1) { 		

	/* quotient and remainder had better fit in an mp_t */

	rem->val[0] = mp_divrem_1((mp_t *)num, denom->val[0], quot);

	if (rem->val[0] > 0)

		rem->nwords = 1;

	return;

}

high_denom = denom->val[j-1];

comp_shift = 0x80000000;

shift = 0;

if ((high_denom >> 16) == 0) {

	comp_shift = 0x8000;

	shift = 16;

}

while ( !(high_denom & comp_shift) ) {

	shift++;

	comp_shift >>= 1;

}

comp_shift = 32 - shift;

if (shift) {

	for (k = j; k > 1; k--) {

		dacc[k-1] = denom->val[k-1] << shift |

				denom->val[k-2] >> comp_shift;

	}

	dacc[0] = denom->val[0] << shift;

	for (k = i; k > 1; k--) {

		nacc[k-1] = num->val[k-1] << shift |

				num->val[k-2] >> comp_shift;

	}

	nacc[0] = num->val[0] << shift;

	nacc[i] = num->val[i-1] >> comp_shift;

}

else {

	for (int ii=0;ii<j;ii++) dacc[ii]= denom->val[ii];

	for (int ii=0;ii<i;ii++) nacc[ii]= num->val[ii];

	nacc[i] = 0;

}

high_denom = dacc[j-1];

low_denom = dacc[j-2];

while (i + 1 > j) {

	uint32 q, r;

	uint32 high_num = nacc[i];

	uint32 low_num = nacc[i-1];

	uint32 check = nacc[i-2];

	if (high_num == high_denom) {

		/* the quotient will be 2^32, and must

		   always get corrected */

		q = 0xffffffff;

	}

	else {

		uint64 acc = ((uint64)high_num << 32) | ((uint64)low_num);

		q = (uint32)(acc / (uint64)high_denom);

		r = (uint32)(acc % (uint64)high_denom);

		while ((uint64)low_denom * (uint64)q >

			((uint64)r << 32) + check) {

			q--;

			r += high_denom;

			if (r < high_denom)

				break;

		}

	}

	if (mp_submul_1(dacc, q, (uint32)j, nacc + i - j) 

						!= high_num) {

		uint32 carry = 0;

		for (q--, k = 0; k < j; k++) {

			uint32 sum = nacc[i-j+k] + carry;

			carry = (sum < nacc[i-j+k]);

			nacc[i-j+k] = sum + dacc[k];

			carry += (nacc[i-j+k] < sum);

		}

	}

	if (i - j < MAX_MP_WORDS)

		quot->val[i - j] = q;

	nacc[i--] = 0;

}

if (shift) {

	for (k = 0; k < j-1; k++) {

		rem->val[k] = nacc[k] >> shift |

				nacc[k+1] << comp_shift;

	}

	rem->val[k] = nacc[k] >> shift;

}

else {

	for (int ii=0;ii<j;ii++) rem->val[ii]= nacc[ii];

}

j = MIN(num->nwords - denom->nwords + 1, MAX_MP_WORDS);

quot->nwords = num_nonzero_words(quot->val, (uint32)j);

rem->nwords = num_nonzero_words(rem->val, denom->nwords);

}

host device static void mp_modmul(mp_t *a, mp_t *b, mp_t *n, mp_t *res) {

uint32 i;

mp_t *small = a;

mp_t *large = b;

big_mp_t prod;

mp_t tmp_quot;

if (small->nwords > large->nwords) {

	small = b;

	large = a;

}

if (small->nwords == 0) {

	mp_clear(res);

	return;

}

#pragma unroll

for (i=0;i<2 * MAX_MP_WORDS;i++) prod.val[i]=0;

for (i = 0; i < small->nwords; i++) 

	mp_addmul_1(large, small->val[i], prod.val + i);



prod.nwords = num_nonzero_words(prod.val, a->nwords + b->nwords);

mp_divrem_core(&prod, n, &tmp_quot, res);

}

host device static uint32 mp_mod_1(mp_t *num, uint32 denom) {

return mp_mod_1_core(num->val, num->nwords, denom);

}

host device static uint32 mp_mod_1_core(uint32 *num,

			uint32 nwords, uint32 denom) {

int32 i = nwords - 1;

uint32 rem = 0;

if (num[i] < denom)

	rem = num[i--];

while (i >= 0) {

	uint64 acc = ((uint64)rem << 32) | ((uint64)num[i]);

	rem = (uint32)(acc % (uint64)denom);

	i--;

}

return rem;

}

char * mp_print(mp_t *a, uint32 base, FILE *f, char *scratch) {

mp_t tmp;

char *bufptr;

uint32 next_letter;

mp_copy(a, &tmp);

bufptr = scratch + 32 * MAX_MP_WORDS;

*bufptr = 0;

do {

	next_letter = mp_divrem_1(&tmp, base, &tmp);

	if (next_letter < 10)

		*(--bufptr) = '0' + next_letter;

	else

		*(--bufptr) = 'a' + (next_letter - 10);

} while ( !mp_is_zero(&tmp) );



if (f)

	fprintf(f, "%s", bufptr);

return bufptr;

}

[/codebox]

Nico · July 24, 2009, 7:52am

It’s a known bug when using modulo operator on 64bit values. See this thread.

N.

frmky · July 24, 2009, 8:20am

Thanks! The code uses mods with unsigned 64-bit ints. I replaced all three instances of A%B with A-B*(A/B), and it works great! I hope this bug is fixed soon, but in the meantime the workaround is effective.