Hi all. I wrote the codes to implement the parallel addition. But there are always some errors in my final results. The program runs well until it run the last sentence. So there may be something wrong with the last sentence. Could anyone can help me find the wired problem. Thanks.
#include "cuda.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <gmp.h>
uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t dLength, uint32_t words);
__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, uint32_t dLength, uint32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2);
int main()
{
int samples, words, index;
uint32_t *x, *y, *d, *z, *cpy_z;
uint32_t *dev_a, *dev_b, *dev_d, *dev_c, *carry, *c_2;
samples=1024*512;
words=samples/4;
cudaMalloc( (void**)&dev_a, words * sizeof(uint32_t) );
cudaMalloc( (void**)&dev_b, words * sizeof(uint32_t) );
cudaMalloc( (void**)&dev_d, words * sizeof(uint32_t) );
cudaMalloc( (void**)&dev_c, words * sizeof(uint32_t) );
cudaMalloc( (void**)&carry, words * sizeof(uint32_t) );
cudaMalloc( (void**)&c_2, words * sizeof(uint32_t) );
cudaMemset(dev_a, 0, sizeof(uint32_t)*words);
cudaMemset(dev_b, 0, sizeof(uint32_t)*words);
cudaMemset(dev_d, 0, sizeof(uint32_t)*words);
cudaMemset(dev_c, 0, sizeof(uint32_t)*words);
cudaMemset(carry, 0, sizeof(uint32_t)*words);
cudaMemset(c_2, 0, sizeof(uint32_t)*words);
x=(uint32_t *)malloc(sizeof(uint32_t)*words);
y=(uint32_t *)malloc(sizeof(uint32_t)*words);
d=(uint32_t *)malloc(sizeof(uint32_t)*words);
z=(uint32_t *)malloc(sizeof(uint32_t)*words);
cpy_z=(uint32_t *)malloc(sizeof(uint32_t)*words);
for(index=0;index<words-2;index++) { //generate the test data
x[index]=(rand()<<16)^rand();
y[index]=(rand()<<16)^rand();
}
for(index=words-2;index<words;index++) {
x[index]=0;
y[index]=0;
}
for(index=0;index<words-1;index++) {
d[index]=(rand()<<16)^rand();
}
d[words-1]=0;
cudaMemcpy( dev_a, x, words * sizeof(uint32_t),cudaMemcpyHostToDevice ); // copy data from cpu to gpu
cudaMemcpy( dev_b, y, words * sizeof(uint32_t),cudaMemcpyHostToDevice );
cudaMemcpy( dev_d, d, words * sizeof(uint32_t),cudaMemcpyHostToDevice );
z = AddMod(x, y, d, words-1, words); //C function used to test the results.
CUDA_AddMod<<<4096,256>>>(dev_a,dev_b,dev_d,words-1, words,dev_c,carry,c_2); // Cuda function
cudaMemcpy( cpy_z, c_2, words*sizeof(uint32_t), cudaMemcpyDeviceToHost ); //copy results back to cpu
int flag=0; //testing is as follows.
int nof=0;
int fi;
printf("cpy_z[0]=%d\n", cpy_z[0]);
printf("z[0]=%d\n", z[0]);
printf("cpy_z[1]=%d\n", cpy_z[1]);
printf("z[1]=%d\n", z[1]);
printf("cpy_z[2]=%d\n", cpy_z[2]);
printf("z[2]=%d\n", z[2]);
printf("cpy_z[3]=%d\n", cpy_z[3]);
printf("z[3]=%d\n", z[3]);
printf("cpy_z[4]=%d\n", cpy_z[4]);
printf("z[4]=%d\n", z[4]);
for(int i=0;i<words;i++){
if(cpy_z[i]!=z[i]){
flag=1;
nof++;
fi=i;
break;
}
}
if (flag==1)
printf("fail %d times at %d\n",nof,fi);
else
printf("all succeed!\n");
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_d);
cudaFree(dev_c);
cudaFree(carry);
cudaFree(c_2);
free(x);
free(y);
free(d);
free(z);
free(cpy_z);
}
uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t dLength, uint32_t words){ //C function
int i;
unsigned long long int SumWord;
uint32_t *carry, *z_1;
z_1=(uint32_t *)malloc(sizeof(uint32_t)*(dLength+1));
carry=(uint32_t *)malloc(sizeof(uint32_t)*words);
carry[0]=0;
for(i=0; i<words; i++){
SumWord = (unsigned long long int)x[i]+(unsigned long long int)y[i];
if(SumWord >= 0x100000000){
z_1[i] = SumWord - 0x100000000;
carry[i+1] = 1;
}
else {
z_1[i] = SumWord;
carry[i+1] = 0;
}
}
for(i=0; i<dLength+1; i++){
z_1[i] = z_1[i] + carry[i];
}
return z_1;
}
//Cuda function is as follows.
__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, uint32_t dLength, uint32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2){
unsigned long long int SumWord;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid<words){
SumWord = (unsigned long long int)a[tid]+(unsigned long long int)b[tid];
if(SumWord >= 0x100000000){
c[tid] = SumWord - 0x100000000;
carry[tid+1] = 1;
}
else {
c[tid] = SumWord;
carry[tid+1] = 0;
}
__syncthreads();
c_2[0]=c[0];
c_2[tid] = c[tid] + carry[tid] ; // The problem is this sentence.
}
}