help with my cuda program

Hi all, I wrote a cuda program. The results (carry) are correct if the last sentence (c_2[tid] = c_2[tid] +carry[tid] ;) is not been executed. But the last sentence does not change the output(carry). If the last sentence is replaced with (c_2[tid] =1;), the results are still wrong. It’s very wired. Could any one can help me to find the problem? Thanks.

#include "cuda.h"

#include <stdio.h>

#include <stdlib.h>

#include <stdint.h>

#include <gmp.h>

uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t dLength, uint32_t words);

__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, uint32_t dLength, uint32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2);

int main()

{

int          samples, words, index;

uint32_t    *x, *y, *d, *z, *cpy_z;

uint32_t *dev_a, *dev_b, *dev_d, *dev_c, *carry, *c_2;

samples=512;

words=samples/4;

cudaMalloc( (void**)&dev_a, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&dev_b, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&dev_d, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&dev_c, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&carry, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&c_2, words * sizeof(uint32_t) );

cudaMemset(dev_a, 0, sizeof(uint32_t)*words);

   cudaMemset(dev_b, 0, sizeof(uint32_t)*words);

   cudaMemset(dev_d, 0, sizeof(uint32_t)*words);

   cudaMemset(dev_c, 0, sizeof(uint32_t)*words);

   cudaMemset(carry, 0, sizeof(uint32_t)*words);

   cudaMemset(c_2, 0, sizeof(uint32_t)*words);

x=(uint32_t *)malloc(sizeof(uint32_t)*words);

y=(uint32_t *)malloc(sizeof(uint32_t)*words);

   d=(uint32_t *)malloc(sizeof(uint32_t)*words);

z=(uint32_t *)malloc(sizeof(uint32_t)*words);

   cpy_z=(uint32_t *)malloc(sizeof(uint32_t)*words);

for(index=0;index<words-2;index++) {

	x[index]=(rand()<<16)^rand();

	y[index]=(rand()<<16)^rand();

}

   for(index=words-2;index<words;index++) {

	x[index]=0;

	y[index]=0;

}

for(index=0;index<words-1;index++) {

	d[index]=(rand()<<16)^rand();

}

   d[words-1]=0;

cudaMemcpy( dev_a, x, words * sizeof(uint32_t),cudaMemcpyHostToDevice );

   cudaMemcpy( dev_b, y, words * sizeof(uint32_t),cudaMemcpyHostToDevice );

   cudaMemcpy( dev_d, d, words * sizeof(uint32_t),cudaMemcpyHostToDevice );

z = AddMod(x, y, d, words-1, words);

cudaEvent_t     start, stop;

   cudaEventCreate( &start );

   cudaEventCreate( &stop );

   cudaEventRecord( start, 0);

CUDA_AddMod<<<1024,256>>>(dev_a,dev_b,dev_d,words-1, words,dev_c,carry,c_2);

cudaEventRecord( stop, 0 );

   cudaEventSynchronize( stop );

   float   elapsed;

   cudaEventElapsedTime( &elapsed,start, stop );

   elapsed=elapsed/1000;

   printf( "Time to generate:  %3.5f s\n", elapsed ); 

cudaEventDestroy( start );

   cudaEventDestroy( stop ); 

cudaMemcpy( cpy_z, carry, words*sizeof(uint32_t), cudaMemcpyDeviceToHost );

	int flag=0;

	

	int nof=0;

	int fi;

printf("cpy_z[0]=%d\n",  cpy_z[0]);

                        printf("z[0]=%d\n",  z[0]);

                        printf("cpy_z[1]=%d\n",  cpy_z[1]);

                        printf("z[1]=%d\n",  z[1]);

                        printf("cpy_z[2]=%d\n",  cpy_z[2]);

                        printf("z[2]=%d\n",  z[2]);

                        printf("cpy_z[3]=%d\n",  cpy_z[3]);

                        printf("z[3]=%d\n",  z[3]);

                        printf("cpy_z[4]=%d\n",  cpy_z[4]);

                        printf("z[4]=%d\n",  z[4]);

	for(int i=0;i<words;i++)

	{

		if(cpy_z[i]!=z[i])

		{

			flag=1;

			nof++;

			fi=i;

			break;

		}

	}

	if (flag==1)

		printf("fail %d times at %d\n",nof,fi);

	else

		printf("all succeed!\n");

cudaFree(dev_a);

   cudaFree(dev_b);

   cudaFree(dev_d);

   cudaFree(dev_c);

   cudaFree(carry);

   cudaFree(c_2);

free(x);

   free(y);

   free(d);

   free(z);

   free(cpy_z);

}

uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t dLength, uint32_t words){

	int    i;

	unsigned long long int   SumWord;

	uint32_t *carry, *z_1, *z_2, *z_3;

z_1=(uint32_t *)malloc(sizeof(uint32_t)*(dLength+1));

        z_2=(uint32_t *)malloc(sizeof(uint32_t)*words);

        z_3=(uint32_t *)malloc(sizeof(uint32_t)*words);

        carry=(uint32_t *)malloc(sizeof(uint32_t)*words);

carry[0]=0;

	for(i=0; i<words; i++){

		SumWord = (unsigned long long int)x[i]+(unsigned long long int)y[i];

                //printf("SumWord %l = %llx\n", i, SumWord);

		if(SumWord >= 0x100000000){

			z_1[i] = SumWord - 0x100000000; 

			carry[i+1] = 1;

		}

		else {

			z_1[i] = SumWord;

			carry[i+1] = 0;

		}

	}

for(i=0; i<dLength+1; i++){

              z_1[i] = z_1[i] + carry[i];

        }

return carry;

}

__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, uint32_t dLength, uint32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2){

	unsigned long long  int SumWord;

int tid = threadIdx.x + blockIdx.x * blockDim.x;

if(tid<words){

	    SumWord = (unsigned long long int)a[tid]+(unsigned long long int)b[tid];

	    if(SumWord >= 0x100000000){

		c[tid] = SumWord - 0x100000000; //bit operation 0x100000000 100000000

		carry[tid+1] = 1;

	    }

	    else {

		c[tid] = SumWord;

		carry[tid+1] = 0;

	    }

	    __syncthreads();

}

c_2[tid] = c_2[tid] +carry[tid] ; //c_2[tid] =1;

}

Please put the code between these line, makes it more readable:

[ code ]

[ / code ]

(without the spaces)

#include "cuda.h"

#include <stdio.h>

#include <stdlib.h>

#include <stdint.h>

#include <gmp.h>

uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t dLength, uint32_t words);

__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, uint32_t dLength, uint32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2);

int main()

{

int samples, words, index;

uint32_t *x, *y, *d, *z, *cpy_z;

uint32_t *dev_a, *dev_b, *dev_d, *dev_c, *carry, *c_2;

samples=512;

words=samples/4;

cudaMalloc( (void**)&dev_a, words * sizeof(uint32_t) );

cudaMalloc( (void**)&dev_b, words * sizeof(uint32_t) );

cudaMalloc( (void**)&dev_d, words * sizeof(uint32_t) );

cudaMalloc( (void**)&dev_c, words * sizeof(uint32_t) );

cudaMalloc( (void**)&carry, words * sizeof(uint32_t) );

cudaMalloc( (void**)&c_2, words * sizeof(uint32_t) );

cudaMemset(dev_a, 0, sizeof(uint32_t)*words);

cudaMemset(dev_b, 0, sizeof(uint32_t)*words);

cudaMemset(dev_d, 0, sizeof(uint32_t)*words);

cudaMemset(dev_c, 0, sizeof(uint32_t)*words);

cudaMemset(carry, 0, sizeof(uint32_t)*words);

cudaMemset(c_2, 0, sizeof(uint32_t)*words);

x=(uint32_t *)malloc(sizeof(uint32_t)*words);

y=(uint32_t *)malloc(sizeof(uint32_t)*words);

d=(uint32_t *)malloc(sizeof(uint32_t)*words);

z=(uint32_t *)malloc(sizeof(uint32_t)*words);

cpy_z=(uint32_t *)malloc(sizeof(uint32_t)*words);

for(index=0;index<words-2;index++) { //generate the test data

x[index]=(rand()<<16)^rand();

y[index]=(rand()<<16)^rand();

}

for(index=words-2;index<words;index++) {

x[index]=0;

y[index]=0;

}

for(index=0;index<words-1;index++) {

d[index]=(rand()<<16)^rand();

}

d[words-1]=0;

cudaMemcpy( dev_a, x, words * sizeof(uint32_t),cudaMemcpyHostToDevice ); // copy data from cpu to gpu

cudaMemcpy( dev_b, y, words * sizeof(uint32_t),cudaMemcpyHostToDevice );

cudaMemcpy( dev_d, d, words * sizeof(uint32_t),cudaMemcpyHostToDevice );

z = AddMod(x, y, d, words-1, words); //C function used to test the results.

CUDA_AddMod<<<1024,256>>>(dev_a,dev_b,dev_d, words-1, words,dev_c,carry,c_2); // Cuda function

cudaMemcpy( cpy_z, carry, words*sizeof(uint32_t), cudaMemcpyDeviceToHost ); //copy results back to cpu

int flag=0; //testing is as follows.

int nof=0;

int fi;

printf("cpy_z[0]=%d\n", cpy_z[0]);

printf("z[0]=%d\n", z[0]);

printf("cpy_z[1]=%d\n", cpy_z[1]);

printf("z[1]=%d\n", z[1]);

printf("cpy_z[2]=%d\n", cpy_z[2]);

printf("z[2]=%d\n", z[2]);

printf("cpy_z[3]=%d\n", cpy_z[3]);

printf("z[3]=%d\n", z[3]);

printf("cpy_z[4]=%d\n", cpy_z[4]);

printf("z[4]=%d\n", z[4]);

for(int i=0;i<words;i++){

if(cpy_z[i]!=z[i]){

flag=1;

nof++;

fi=i;

break;

}

}

if (flag==1)

printf("fail %d times at %d\n",nof,fi);

else

printf("all succeed!\n");

cudaFree(dev_a);

cudaFree(dev_b);

cudaFree(dev_d);

cudaFree(dev_c);

cudaFree(carry);

cudaFree(c_2);

free(x);

free(y);

free(d);

free(z);

free(cpy_z);

}

uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t dLength, uint32_t words){ //C function

int i;

unsigned long long int SumWord;

uint32_t *carry, *z_1;

z_1=(uint32_t *)malloc(sizeof(uint32_t)*(dLength+1));

carry=(uint32_t *)malloc(sizeof(uint32_t)*words);

carry[0]=0;

for(i=0; i<words; i++){

SumWord = (unsigned long long int)x[i]+(unsigned long long int)y[i];

if(SumWord >= 0x100000000){

z_1[i] = SumWord - 0x100000000;

carry[i+1] = 1;

}

else {

z_1[i] = SumWord;

carry[i+1] = 0;

}

}

for(i=0; i<dLength+1; i++){

z_1[i] = z_1[i] + carry[i];

}

return carry;

}

//Cuda function is as follows.

__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, uint32_t dLength, uint32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2){

unsigned long long int SumWord;

int tid = threadIdx.x + blockIdx.x * blockDim.x;

if(tid<words){

SumWord = (unsigned long long int)a[tid]+(unsigned long long int)b[tid];

if(SumWord >= 0x100000000){

c[tid] = SumWord - 0x100000000;

carry[tid+1] = 1;

}

else {

c[tid] = SumWord;

carry[tid+1] = 0;

}

__syncthreads();

}

c_2[tid] = c_2[tid] +carry[tid] ; //c_2[tid] =1; The problem is this sentence.

}

I am sorry about that. I make some changes to my codes. I hope it will bring some help.

#include "cuda.h"

#include <stdio.h>

#include <stdlib.h>

#include <stdint.h>

#include <gmp.h>

uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t dLength, uint32_t words);

__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, uint32_t dLength, uint32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2);

int main()            

{

int          samples, words, index;

uint32_t    *x, *y, *d, *z, *cpy_z;

   uint32_t *dev_a, *dev_b, *dev_d, *dev_c, *carry, *c_2;

samples=512;

   words=samples/4;

cudaMalloc( (void**)&dev_a, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&dev_b, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&dev_d, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&dev_c, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&carry, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&c_2, words * sizeof(uint32_t) );

cudaMemset(dev_a, 0, sizeof(uint32_t)*words);

   cudaMemset(dev_b, 0, sizeof(uint32_t)*words);

   cudaMemset(dev_d, 0, sizeof(uint32_t)*words);

   cudaMemset(dev_c, 0, sizeof(uint32_t)*words);

   cudaMemset(carry, 0, sizeof(uint32_t)*words);

   cudaMemset(c_2, 0, sizeof(uint32_t)*words);

x=(uint32_t *)malloc(sizeof(uint32_t)*words);

y=(uint32_t *)malloc(sizeof(uint32_t)*words);

   d=(uint32_t *)malloc(sizeof(uint32_t)*words);

z=(uint32_t *)malloc(sizeof(uint32_t)*words);

   cpy_z=(uint32_t *)malloc(sizeof(uint32_t)*words);

for(index=0;index<words-2;index++) {           //generate the test data

x[index]=(rand()<<16)^rand();

y[index]=(rand()<<16)^rand();

}

   for(index=words-2;index<words;index++) {

x[index]=0;

y[index]=0;

}

   for(index=0;index<words-1;index++) {

d[index]=(rand()<<16)^rand();

}

   d[words-1]=0;

cudaMemcpy( dev_a, x, words * sizeof(uint32_t),cudaMemcpyHostToDevice );  // copy data from cpu to gpu

   cudaMemcpy( dev_b, y, words * sizeof(uint32_t),cudaMemcpyHostToDevice );

   cudaMemcpy( dev_d, d, words * sizeof(uint32_t),cudaMemcpyHostToDevice );

z = AddMod(x, y, d, words-1, words);     //C function used to test the results.

CUDA_AddMod<<<1024,256>>>(dev_a,dev_b,dev_d,words-1, words,dev_c,carry,c_2);  // Cuda function

cudaMemcpy( cpy_z, carry, words*sizeof(uint32_t), cudaMemcpyDeviceToHost );  //copy results back to cpu

int flag=0;                             //testing is as follows.

   int nof=0;

   int fi;

   printf("cpy_z[0]=%d\n",  cpy_z[0]);

   printf("z[0]=%d\n",  z[0]);

   printf("cpy_z[1]=%d\n",  cpy_z[1]);

   printf("z[1]=%d\n",  z[1]);

   printf("cpy_z[2]=%d\n",  cpy_z[2]);

   printf("z[2]=%d\n",  z[2]);

   printf("cpy_z[3]=%d\n",  cpy_z[3]);

   printf("z[3]=%d\n",  z[3]);

   printf("cpy_z[4]=%d\n",  cpy_z[4]);

   printf("z[4]=%d\n",  z[4]);

for(int i=0;i<words;i++){

       if(cpy_z[i]!=z[i]){

             flag=1;

             nof++;

             fi=i;

             break;

       }

   }

if (flag==1)

     printf("fail %d times at %d\n",nof,fi);

  else

     printf("all succeed!\n");

cudaFree(dev_a);

   cudaFree(dev_b);

   cudaFree(dev_d);

   cudaFree(dev_c);

   cudaFree(carry);

   cudaFree(c_2);

free(x);

   free(y);

   free(d);

   free(z);

   free(cpy_z);

}

uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t dLength, uint32_t words){  //C function

    int    i;

    unsigned long long int   SumWord;

    uint32_t *carry, *z_1;

z_1=(uint32_t *)malloc(sizeof(uint32_t)*(dLength+1));

    carry=(uint32_t *)malloc(sizeof(uint32_t)*words);

carry[0]=0;

    for(i=0; i<words; i++){

        SumWord = (unsigned long long int)x[i]+(unsigned long long int)y[i];

            if(SumWord >= 0x100000000){

                z_1[i] = SumWord - 0x100000000; 

                carry[i+1] = 1;

            }

           else {

                z_1[i] = SumWord;

                carry[i+1] = 0;

           }

    }

for(i=0; i<dLength+1; i++){

        z_1[i] = z_1[i] + carry[i];

    }

    return carry;

}

//Cuda function is as follows.

__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, uint32_t dLength, uint32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2){

unsigned long long  int SumWord;

    int tid = threadIdx.x + blockIdx.x * blockDim.x;

if(tid<words){

        SumWord = (unsigned long long int)a[tid]+(unsigned long long int)b[tid];

        if(SumWord >= 0x100000000){

            c[tid] = SumWord - 0x100000000; 

            carry[tid+1] = 1;

	}

	else {

	    c[tid] = SumWord;

            carry[tid+1] = 0;

	}

        __syncthreads();

    }

c_2[tid] = c_2[tid] +carry[tid] ; //c_2[tid] =1; The problem is this sentence.

}

The problem has been solved. Thanks.