Hi, I wrote a CUDA codes to do a parallel subtration. But the first calculation is always wrong. But the second, the third, … are all right. I do not why. Anyone can help me solve the problem? Thanks.
#include “cuda.h”
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
//typedef uint32_t uint32_t;
void SubParallel(uint32_t *x, uint32_t xLength, uint32_t *y, uint32_t Length, uint32_t *z, int samples);
global void CUDA_SUB(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t words, uint32_t *carry);
int main()
{
int words, index, samples;
uint32_t *x, *y, *z, *cpy_z;
uint32_t *dev_a, *dev_b, *dev_c, *carry;
samples=1024*128*2; //dim*dim;
words=samples/4;
cudaMalloc( (void**)&dev_a, words * sizeof(uint32_t) );
cudaMalloc( (void**)&dev_b, words * sizeof(uint32_t) );
cudaMalloc( (void**)&dev_c, words * sizeof(uint32_t) );
cudaMalloc( (void**)&carry, words * sizeof(uint32_t) );
x=(uint32_t *)malloc(sizeof(uint32_t)*words);
y=(uint32_t *)malloc(sizeof(uint32_t)*words);
z=(uint32_t *)malloc(sizeof(uint32_t)*words);
cpy_z=(uint32_t *)malloc(sizeof(uint32_t)*words);
for(index=0;index<words-1;index++)
{
x[index]=(rand()<<16)^rand();
y[index]=(rand()<<16)^rand();
}
x[words-1]=1;
y[words-1]=0;
cudaMemcpy( dev_a, x, words * sizeof(uint32_t),cudaMemcpyHostToDevice );
cudaMemcpy( dev_b, y, words * sizeof(uint32_t),cudaMemcpyHostToDevice );
SubParallel(x, words, y, words, z, samples);
cudaEvent_t start, stop;
cudaEventCreate( &start );
cudaEventCreate( &stop );
cudaEventRecord( start, 0);
CUDA_SUB<<<1024,256>>>(dev_a,dev_b,dev_c,words,carry);
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
float elapsed;
cudaEventElapsedTime( &elapsed,start, stop );
elapsed=elapsed/1000;
printf( "Time to generate: %3.5f s\n", elapsed );
cudaEventDestroy( start );
cudaEventDestroy( stop );
cudaMemcpy( cpy_z, dev_c, words*sizeof(uint32_t), cudaMemcpyDeviceToHost );
int flag=0;
int nof=0;
int fi;
for(int i=0;i<words;i++)
{
if(cpy_z[i]!=z[i])
{
flag=1;
nof++;
fi=i;
break;
}
}
if (flag==1)
printf("fail %d times at %d\n",nof,fi);
else
printf("all succeed!\n");
}
void SubParallel(uint32_t *x, uint32_t xLength, uint32_t *y, uint32_t Length, uint32_t *z, int samples){
int words, i;
uint32_t *carry;
words=xLength;
carry=(uint32_t *)malloc(sizeof(uint32_t)*words);
for(i=0; i<words; i++){
if(x[i] >= y[i]){
z[i] = x[i] - y[i]; //bit operation
carry[i] = 0;
}
else {
z[i] = (unsigned long long int)x[i] - (unsigned long long int)y[i] + 0x100000000;
carry[i] = 1;
}
}
for(i=1; i<words; i++){
z[i] = z[i] - carry[i-1];
}
//printf("Size: %5dK bits subtraction complete\n", samples/128);
}
global void CUDA_SUB(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t words, uint32_t *carry){
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid < words){
if(a[tid] >= b[tid]){
c[tid] = a[tid] - b[tid]; //bit operation 0x100000000 100000000
carry[tid] = 0;
}
else {
c[tid] = a[tid] - b[tid] + 0x100000000;
carry[tid] = 1;
}
//__syncthreads();
if(tid!=0)
c[tid] = c[tid] - carry[tid-1];
__syncthreads();
}
}