Help me to find the weird problem

Hi all. I wrote the code below to implement a parallel addition, but there are always some errors in my final results. The program runs fine until it reaches the last statement in the kernel, so there may be something wrong with that statement. Could anyone help me find this weird problem? Thanks.

#include "cuda.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <gmp.h>

uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t dLength, uint32_t words);
__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, uint32_t dLength, uint32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2);

int main()
{
    int       samples, words, index;
    uint32_t *x, *y, *d, *z, *cpy_z;
    uint32_t *dev_a, *dev_b, *dev_d, *dev_c, *carry, *c_2;

    samples = 1024*512;
    words   = samples/4;

    cudaMalloc( (void**)&dev_a, words * sizeof(uint32_t) );
    cudaMalloc( (void**)&dev_b, words * sizeof(uint32_t) );
    cudaMalloc( (void**)&dev_d, words * sizeof(uint32_t) );
    cudaMalloc( (void**)&dev_c, words * sizeof(uint32_t) );
    cudaMalloc( (void**)&carry, words * sizeof(uint32_t) );
    cudaMalloc( (void**)&c_2,   words * sizeof(uint32_t) );

    cudaMemset(dev_a, 0, sizeof(uint32_t)*words);
    cudaMemset(dev_b, 0, sizeof(uint32_t)*words);
    cudaMemset(dev_d, 0, sizeof(uint32_t)*words);
    cudaMemset(dev_c, 0, sizeof(uint32_t)*words);
    cudaMemset(carry, 0, sizeof(uint32_t)*words);
    cudaMemset(c_2,   0, sizeof(uint32_t)*words);

    x     = (uint32_t *)malloc(sizeof(uint32_t)*words);
    y     = (uint32_t *)malloc(sizeof(uint32_t)*words);
    d     = (uint32_t *)malloc(sizeof(uint32_t)*words);
    z     = (uint32_t *)malloc(sizeof(uint32_t)*words);
    cpy_z = (uint32_t *)malloc(sizeof(uint32_t)*words);

    for(index=0; index<words-2; index++) {          // generate the test data
        x[index]=(rand()<<16)^rand();
        y[index]=(rand()<<16)^rand();
    }
    for(index=words-2; index<words; index++) {
        x[index]=0;
        y[index]=0;
    }
    for(index=0; index<words-1; index++) {
        d[index]=(rand()<<16)^rand();
    }
    d[words-1]=0;

    cudaMemcpy( dev_a, x, words * sizeof(uint32_t), cudaMemcpyHostToDevice );  // copy data from cpu to gpu
    cudaMemcpy( dev_b, y, words * sizeof(uint32_t), cudaMemcpyHostToDevice );
    cudaMemcpy( dev_d, d, words * sizeof(uint32_t), cudaMemcpyHostToDevice );

    z = AddMod(x, y, d, words-1, words);     // C function used to test the results.

    CUDA_AddMod<<<4096,256>>>(dev_a, dev_b, dev_d, words-1, words, dev_c, carry, c_2);  // Cuda function

    cudaMemcpy( cpy_z, c_2, words*sizeof(uint32_t), cudaMemcpyDeviceToHost );  // copy results back to cpu

    int flag=0;                             // testing is as follows.
    int nof=0;
    int fi;
    printf("cpy_z[0]=%d\n", cpy_z[0]);
    printf("z[0]=%d\n",     z[0]);
    printf("cpy_z[1]=%d\n", cpy_z[1]);
    printf("z[1]=%d\n",     z[1]);
    printf("cpy_z[2]=%d\n", cpy_z[2]);
    printf("z[2]=%d\n",     z[2]);
    printf("cpy_z[3]=%d\n", cpy_z[3]);
    printf("z[3]=%d\n",     z[3]);
    printf("cpy_z[4]=%d\n", cpy_z[4]);
    printf("z[4]=%d\n",     z[4]);

    for(int i=0; i<words; i++){
        if(cpy_z[i]!=z[i]){
            flag=1;
            nof++;
            fi=i;
            break;
        }
    }
    if (flag==1)
        printf("fail %d times at %d\n", nof, fi);
    else
        printf("all succeed!\n");

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_d);
    cudaFree(dev_c);
    cudaFree(carry);
    cudaFree(c_2);

    free(x);
    free(y);
    free(d);
    free(z);
    free(cpy_z);
}

uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t dLength, uint32_t words){  // C function
    int i;
    unsigned long long int SumWord;
    uint32_t *carry, *z_1;

    z_1   = (uint32_t *)malloc(sizeof(uint32_t)*(dLength+1));
    carry = (uint32_t *)malloc(sizeof(uint32_t)*words);

    carry[0]=0;
    for(i=0; i<words; i++){
        SumWord = (unsigned long long int)x[i]+(unsigned long long int)y[i];
        if(SumWord >= 0x100000000){
            z_1[i] = SumWord - 0x100000000;
            carry[i+1] = 1;
        }
        else {
            z_1[i] = SumWord;
            carry[i+1] = 0;
        }
    }
    for(i=0; i<dLength+1; i++){
        z_1[i] = z_1[i] + carry[i];
    }
    return z_1;
}

// Cuda function is as follows.
__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, uint32_t dLength, uint32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2){
    unsigned long long int SumWord;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if(tid<words){
        SumWord = (unsigned long long int)a[tid]+(unsigned long long int)b[tid];
        if(SumWord >= 0x100000000){
            c[tid] = SumWord - 0x100000000;
            carry[tid+1] = 1;
        }
        else {
            c[tid] = SumWord;
            carry[tid+1] = 0;
        }
        __syncthreads();

        c_2[0]   = c[0];
        c_2[tid] = c[tid] + carry[tid]; // The problem is this sentence.
    }
}

Hi,

I guess that your problem comes from a mix-up between "dLength" and "words" in the kernel (as well as in the C function, BTW):

    if(tid<words){         // tid can go up to words-1 here
        ...
        carry[tid+1] = 0;  // but carry is allocated like this: cudaMalloc( (void**)&carry, words * sizeof(uint32_t) );
        ...
    }

So you have an "off by one" problem here: for tid == words-1, carry[tid+1] writes one element past the end of the buffer.
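
If you want to keep the carry[tid+1] indexing, one quick workaround (just a sketch, untested against your code) is to give carry one extra element so the last thread stays in bounds:

    cudaMalloc( (void**)&carry, (words + 1) * sizeof(uint32_t) );
    cudaMemset( carry, 0, (words + 1) * sizeof(uint32_t) );

and the same +1 for the host-side carry buffer malloc'ed inside AddMod.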

HTH

I made some changes to my code, but the problem still exists. I do not know how to find it.

#include "cuda.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <gmp.h>

uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t words);
__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, int32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2);

int main()
{
    int       samples, words, index;
    uint32_t *x, *y, *d, *z, *cpy_z;
    uint32_t *dev_a, *dev_b, *dev_d, *dev_c, *carry, *c_2;

    samples = 1024*512;
    words   = samples/4;

    cudaMalloc( (void**)&dev_a, words * sizeof(uint32_t) );
    cudaMalloc( (void**)&dev_b, words * sizeof(uint32_t) );
    cudaMalloc( (void**)&dev_d, words * sizeof(uint32_t) );
    cudaMalloc( (void**)&dev_c, words * sizeof(uint32_t) );
    cudaMalloc( (void**)&carry, words * sizeof(uint32_t) );
    cudaMalloc( (void**)&c_2,   words * sizeof(uint32_t) );

    cudaMemset(dev_a, 0, sizeof(uint32_t)*words);
    cudaMemset(dev_b, 0, sizeof(uint32_t)*words);
    cudaMemset(dev_d, 0, sizeof(uint32_t)*words);
    cudaMemset(dev_c, 0, sizeof(uint32_t)*words);
    cudaMemset(carry, 0, sizeof(uint32_t)*words);
    cudaMemset(c_2,   0, sizeof(uint32_t)*words);

    x     = (uint32_t *)malloc(sizeof(uint32_t)*words);
    y     = (uint32_t *)malloc(sizeof(uint32_t)*words);
    d     = (uint32_t *)malloc(sizeof(uint32_t)*words);
    z     = (uint32_t *)malloc(sizeof(uint32_t)*words);
    cpy_z = (uint32_t *)malloc(sizeof(uint32_t)*words);

    for(index=0; index<words-2; index++) {          // generate the test data
        x[index]=(rand()<<16)^rand();
        y[index]=(rand()<<16)^rand();
    }
    for(index=words-2; index<words; index++) {
        x[index]=0;
        y[index]=0;
    }
    for(index=0; index<words-1; index++) {
        d[index]=(rand()<<16)^rand();
    }
    d[words-1]=0;

    cudaMemcpy( dev_a, x, words * sizeof(uint32_t), cudaMemcpyHostToDevice );  // copy data from cpu to gpu
    cudaMemcpy( dev_b, y, words * sizeof(uint32_t), cudaMemcpyHostToDevice );
    cudaMemcpy( dev_d, d, words * sizeof(uint32_t), cudaMemcpyHostToDevice );

    z = AddMod(x, y, d, words);     // C function used to test the results.

    CUDA_AddMod<<<4096,256>>>(dev_a, dev_b, dev_d, words, dev_c, carry, c_2);  // Cuda function

    cudaMemcpy( cpy_z, c_2, words*sizeof(uint32_t), cudaMemcpyDeviceToHost );  // copy results back to cpu

    int flag=0;                             // testing is as follows.
    int nof=0;
    int fi;
    printf("cpy_z[0]=%d\n", cpy_z[0]);
    printf("z[0]=%d\n",     z[0]);
    printf("cpy_z[1]=%d\n", cpy_z[1]);
    printf("z[1]=%d\n",     z[1]);
    printf("cpy_z[2]=%d\n", cpy_z[2]);
    printf("z[2]=%d\n",     z[2]);
    printf("cpy_z[3]=%d\n", cpy_z[3]);
    printf("z[3]=%d\n",     z[3]);
    printf("cpy_z[4]=%d\n", cpy_z[4]);
    printf("z[4]=%d\n",     z[4]);

    for(int i=0; i<words; i++){
        if(cpy_z[i]!=z[i]){
            flag=1;
            nof++;
            fi=i;
            break;
        }
    }
    if (flag==1)
        printf("fail %d times at %d\n", nof, fi);
    else
        printf("all succeed!\n");

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_d);
    cudaFree(dev_c);
    cudaFree(carry);
    cudaFree(c_2);

    free(x);
    free(y);
    free(d);
    free(z);
    free(cpy_z);
}

uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t words){  // C function
    int i;
    unsigned long long int SumWord;
    uint32_t *carry, *z_1;

    z_1   = (uint32_t *)malloc(sizeof(uint32_t)*words);
    carry = (uint32_t *)malloc(sizeof(uint32_t)*words);

    carry[0]=0;
    for(i=0; i<words; i++){
        SumWord = (unsigned long long int)x[i]+(unsigned long long int)y[i];
        if(SumWord >= 0x100000000){
            z_1[i] = SumWord - 0x100000000;
            carry[i] = 1;
        }
        else {
            z_1[i] = SumWord;
            carry[i] = 0;
        }
    }
    for(i=1; i<words; i++){
        z_1[i] = z_1[i] + carry[i-1];
    }
    return z_1;
}

// Cuda function is as follows.
__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, int32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2){
    unsigned long long int SumWord;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if(tid<words){
        SumWord = (unsigned long long int)a[tid]+(unsigned long long int)b[tid];
        if(SumWord >= 0x100000000){
            c[tid] = SumWord - 0x100000000;
            carry[tid] = 1;
        }
        else {
            c[tid] = SumWord;
            carry[tid] = 0;
        }
        __syncthreads();

        if(tid>0)
            c_2[tid] = c[tid] + carry[tid-1]; // The problem is this sentence.
        c_2[0] = c[0];
        __syncthreads();
    }
}

Here again your code is wrong, since tid and tid-1 do not always belong to the same thread block. __syncthreads() only synchronizes the threads inside a single block, not across blocks, so reading carry[tid-1] races with the thread that writes it. For example, with 256 threads per block (your launch configuration), thread tid = 256 reads carry[255], which is written by a thread in a different block that may not have run yet.
Can it be your problem?

Yes, this is the problem. I am just beginning to learn CUDA programming. Could you give me some advice on how to make my CUDA program correct? Thanks for your help.

Hello,

The threadfence functions might help. They do not perform a synchronization; rather, __threadfence() makes the calling thread wait until its earlier writes are guaranteed to be observed by every thread on the device before any of its later writes. The programming guide has an example of how to use it in a reduction algorithm.
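
As a minimal illustration of what the fence guarantees (this is essentially the small example from the programming guide, not specific to your code):

    __device__ int X = 1, Y = 2;

    __device__ void writeXY()
    {
        X = 10;
        __threadfence();   // any other thread that sees Y == 20 is also guaranteed to see X == 10
        Y = 20;
    }

Without the fence, another thread could observe Y == 20 while still reading the old value X == 1.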

For this specific problem, ending your kernel just before the faulty line and then launching a second kernel that does only that operation should do it; splitting work into separate kernel launches is the usual way of enforcing a global synchronization across all threads. See the sketch below.
I'm sure there are optimisations to think about, but first make it right, then think about making it faster if needed.
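
Roughly, something like this (hypothetical kernel names, reusing your buffers, untested):

    __global__ void AddWords(const uint32_t *a, const uint32_t *b, int32_t words,
                             uint32_t *c, uint32_t *carry)
    {
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        if (tid < words) {
            unsigned long long SumWord = (unsigned long long)a[tid] + (unsigned long long)b[tid];
            c[tid]     = (uint32_t)SumWord;          // low 32 bits of the sum
            carry[tid] = (uint32_t)(SumWord >> 32);  // 1 if the addition overflowed, else 0
        }
    }

    __global__ void PropagateCarry(const uint32_t *c, const uint32_t *carry, int32_t words,
                                   uint32_t *c_2)
    {
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        if (tid < words)
            c_2[tid] = (tid == 0) ? c[0] : c[tid] + carry[tid - 1];
    }

On the host you would then launch AddWords<<<4096,256>>>(dev_a, dev_b, words, dev_c, carry); followed by PropagateCarry<<<4096,256>>>(dev_c, carry, words, c_2);. Since the second kernel cannot start before the first one has finished (same stream), every carry[] value is complete when it is read, which is exactly the global synchronization you were missing.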

EDIT: just saw pasoleatis’ answer and I must say I didn’t remember about __threadfence

Hello, I made the changes below; the new kernel uses __threadfence, but the program still does not run properly. Am I using __threadfence correctly, or are there still other problems in the CUDA code?

__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, int32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2){
    unsigned long long int SumWord;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if(tid<words){
        SumWord = (unsigned long long int)a[tid]+(unsigned long long int)b[tid];
        if(SumWord >= 0x100000000){
            c[tid] = SumWord - 0x100000000;
            carry[tid] = 1;
        }
        else {
            c[tid] = SumWord;
            carry[tid] = 0;
        }

        if(tid>0)
            c_2[tid] = c[tid] + carry[tid-1]; // The problem is this sentence.
        c_2[0] = c[0];
        __threadfence();
    }
}

No, you are not using threadfence properly at all.
Just look at page 102 (printed 90) of the CUDA_C_Programming_Guide.pdf for a usage example.

I looked through the example in the programming guide, but I was confused and do not know how to apply it to my CUDA code. Any suggestions? Thanks.

Just launch another kernel. It’s the proper solution anyway.