Hi all, I wrote a cuda program. The results (carry) are correct if the last sentence (c_2[tid] = c_2[tid] +carry[tid] External Image is not been executed. But the last sentence does not change the output(carry). If the last sentence is replaced with (c_2[tid] =1;), the results are still wrong. It’s very wired. Could any one can help me to find the problem? Thanks.
#include "cuda.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <gmp.h>
uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t dLength, uint32_t words);
__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, uint32_t dLength, uint32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2);
int main()
{
int samples, words, index;
uint32_t *x, *y, *d, *z, *cpy_z;
uint32_t *dev_a, *dev_b, *dev_d, *dev_c, *carry, *c_2;
samples=512;
words=samples/4;
cudaMalloc( (void**)&dev_a, words * sizeof(uint32_t) );
cudaMalloc( (void**)&dev_b, words * sizeof(uint32_t) );
cudaMalloc( (void**)&dev_d, words * sizeof(uint32_t) );
cudaMalloc( (void**)&dev_c, words * sizeof(uint32_t) );
cudaMalloc( (void**)&carry, words * sizeof(uint32_t) );
cudaMalloc( (void**)&c_2, words * sizeof(uint32_t) );
cudaMemset(dev_a, 0, sizeof(uint32_t)*words);
cudaMemset(dev_b, 0, sizeof(uint32_t)*words);
cudaMemset(dev_d, 0, sizeof(uint32_t)*words);
cudaMemset(dev_c, 0, sizeof(uint32_t)*words);
cudaMemset(carry, 0, sizeof(uint32_t)*words);
cudaMemset(c_2, 0, sizeof(uint32_t)*words);
x=(uint32_t *)malloc(sizeof(uint32_t)*words);
y=(uint32_t *)malloc(sizeof(uint32_t)*words);
d=(uint32_t *)malloc(sizeof(uint32_t)*words);
z=(uint32_t *)malloc(sizeof(uint32_t)*words);
cpy_z=(uint32_t *)malloc(sizeof(uint32_t)*words);
for(index=0;index<words-2;index++) {
x[index]=(rand()<<16)^rand();
y[index]=(rand()<<16)^rand();
}
for(index=words-2;index<words;index++) {
x[index]=0;
y[index]=0;
}
for(index=0;index<words-1;index++) {
d[index]=(rand()<<16)^rand();
}
d[words-1]=0;
cudaMemcpy( dev_a, x, words * sizeof(uint32_t),cudaMemcpyHostToDevice );
cudaMemcpy( dev_b, y, words * sizeof(uint32_t),cudaMemcpyHostToDevice );
cudaMemcpy( dev_d, d, words * sizeof(uint32_t),cudaMemcpyHostToDevice );
z = AddMod(x, y, d, words-1, words);
cudaEvent_t start, stop;
cudaEventCreate( &start );
cudaEventCreate( &stop );
cudaEventRecord( start, 0);
CUDA_AddMod<<<1024,256>>>(dev_a,dev_b,dev_d,words-1, words,dev_c,carry,c_2);
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
float elapsed;
cudaEventElapsedTime( &elapsed,start, stop );
elapsed=elapsed/1000;
printf( "Time to generate: %3.5f s\n", elapsed );
cudaEventDestroy( start );
cudaEventDestroy( stop );
cudaMemcpy( cpy_z, carry, words*sizeof(uint32_t), cudaMemcpyDeviceToHost );
int flag=0;
int nof=0;
int fi;
printf("cpy_z[0]=%d\n", cpy_z[0]);
printf("z[0]=%d\n", z[0]);
printf("cpy_z[1]=%d\n", cpy_z[1]);
printf("z[1]=%d\n", z[1]);
printf("cpy_z[2]=%d\n", cpy_z[2]);
printf("z[2]=%d\n", z[2]);
printf("cpy_z[3]=%d\n", cpy_z[3]);
printf("z[3]=%d\n", z[3]);
printf("cpy_z[4]=%d\n", cpy_z[4]);
printf("z[4]=%d\n", z[4]);
for(int i=0;i<words;i++)
{
if(cpy_z[i]!=z[i])
{
flag=1;
nof++;
fi=i;
break;
}
}
if (flag==1)
printf("fail %d times at %d\n",nof,fi);
else
printf("all succeed!\n");
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_d);
cudaFree(dev_c);
cudaFree(carry);
cudaFree(c_2);
free(x);
free(y);
free(d);
free(z);
free(cpy_z);
}
uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t dLength, uint32_t words){
int i;
unsigned long long int SumWord;
uint32_t *carry, *z_1, *z_2, *z_3;
z_1=(uint32_t *)malloc(sizeof(uint32_t)*(dLength+1));
z_2=(uint32_t *)malloc(sizeof(uint32_t)*words);
z_3=(uint32_t *)malloc(sizeof(uint32_t)*words);
carry=(uint32_t *)malloc(sizeof(uint32_t)*words);
carry[0]=0;
for(i=0; i<words; i++){
SumWord = (unsigned long long int)x[i]+(unsigned long long int)y[i];
//printf("SumWord %l = %llx\n", i, SumWord);
if(SumWord >= 0x100000000){
z_1[i] = SumWord - 0x100000000;
carry[i+1] = 1;
}
else {
z_1[i] = SumWord;
carry[i+1] = 0;
}
}
for(i=0; i<dLength+1; i++){
z_1[i] = z_1[i] + carry[i];
}
return carry;
}
__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, uint32_t dLength, uint32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2){
unsigned long long int SumWord;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid<words){
SumWord = (unsigned long long int)a[tid]+(unsigned long long int)b[tid];
if(SumWord >= 0x100000000){
c[tid] = SumWord - 0x100000000; //bit operation 0x100000000 100000000
carry[tid+1] = 1;
}
else {
c[tid] = SumWord;
carry[tid+1] = 0;
}
__syncthreads();
}
c_2[tid] = c_2[tid] +carry[tid] ; //c_2[tid] =1;
}