The first calculation is always wrong

Hi, I wrote a CUDA codes to do a parallel subtration. But the first calculation is always wrong. But the second, the third, … are all right. I do not why. Anyone can help me solve the problem? Thanks.

#include “cuda.h”

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>

//typedef uint32_t uint32_t;

void SubParallel(uint32_t *x, uint32_t xLength, uint32_t *y, uint32_t Length, uint32_t *z, int samples);

global void CUDA_SUB(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t words, uint32_t *carry);

int main()
{

int          words, index, samples;
uint32_t    *x, *y, *z, *cpy_z;

uint32_t *dev_a, *dev_b, *dev_c, *carry;

samples=1024*128*2; //dim*dim;
words=samples/4;

	cudaMalloc( (void**)&dev_a, words * sizeof(uint32_t) );
	cudaMalloc( (void**)&dev_b, words * sizeof(uint32_t) );
	cudaMalloc( (void**)&dev_c, words * sizeof(uint32_t) );
cudaMalloc( (void**)&carry, words * sizeof(uint32_t) );

x=(uint32_t *)malloc(sizeof(uint32_t)*words);
y=(uint32_t *)malloc(sizeof(uint32_t)*words);
z=(uint32_t *)malloc(sizeof(uint32_t)*words);
cpy_z=(uint32_t *)malloc(sizeof(uint32_t)*words);

for(index=0;index<words-1;index++)
{
	x[index]=(rand()<<16)^rand();
	y[index]=(rand()<<16)^rand();
}
    x[words-1]=1;
    y[words-1]=0;

 	cudaMemcpy( dev_a, x, words * sizeof(uint32_t),cudaMemcpyHostToDevice );

 	cudaMemcpy( dev_b, y, words * sizeof(uint32_t),cudaMemcpyHostToDevice );

SubParallel(x, words, y, words, z, samples);

   	cudaEvent_t     start, stop;
cudaEventCreate( &start );
cudaEventCreate( &stop );
cudaEventRecord( start, 0);

    CUDA_SUB<<<1024,256>>>(dev_a,dev_b,dev_c,words,carry);

cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
float   elapsed;
cudaEventElapsedTime( &elapsed,start, stop );
elapsed=elapsed/1000;
printf( "Time to generate:  %3.5f s\n", elapsed ); 
    
    cudaEventDestroy( start );
    cudaEventDestroy( stop ); 

cudaMemcpy( cpy_z, dev_c, words*sizeof(uint32_t), cudaMemcpyDeviceToHost );

int flag=0;

int nof=0;
int fi;
for(int i=0;i<words;i++)
{
	if(cpy_z[i]!=z[i])
	{
		flag=1;
		nof++;
		fi=i;
		break;
	}

}

if (flag==1)
	printf("fail %d times at %d\n",nof,fi);
else
	printf("all succeed!\n");

}

void SubParallel(uint32_t *x, uint32_t xLength, uint32_t *y, uint32_t Length, uint32_t *z, int samples){
int words, i;

    uint32_t *carry;

words=xLength;

carry=(uint32_t *)malloc(sizeof(uint32_t)*words);

for(i=0; i<words; i++){
	if(x[i] >= y[i]){
		z[i] = x[i] - y[i]; //bit operation
		carry[i] = 0;
	}
	else {
		z[i] = (unsigned long long int)x[i] - (unsigned long long int)y[i] + 0x100000000;
		carry[i] = 1;
	}
}

	for(i=1; i<words; i++){
		z[i] = z[i] - carry[i-1];
	}
    
    //printf("Size: %5dK bits subtraction complete\n", samples/128);

}

global void CUDA_SUB(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t words, uint32_t *carry){

    int tid = threadIdx.x + blockIdx.x * blockDim.x;

if(tid < words){

if(a[tid] >= b[tid]){
	c[tid] = a[tid] - b[tid]; //bit operation 0x100000000 100000000
	carry[tid] = 0;
}
else {
	c[tid] = a[tid] - b[tid] + 0x100000000;
	carry[tid] = 1;
}

//__syncthreads();

if(tid!=0)
       c[tid] = c[tid] - carry[tid-1];
    __syncthreads();

}

}

I am nott sure if will solve your problem, but the line c[tid] = c[tid] - carry[tid-1]; can different results. You should create a new array and use d[tid] = c[tid] - carry[tid-1];