Help me to find the weird problem

Hi all. I wrote the code below to implement a parallel addition, but there are always some errors in my final results. The program runs fine until it reaches the last statement in the kernel, so there may be something wrong with that statement. Could anyone help me find this weird problem? Thanks.

#include "cuda.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <gmp.h>

uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t dLength, uint32_t words);
__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, uint32_t dLength, uint32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2);

int main()
{
    int       samples, words, index;
    uint32_t *x, *y, *d, *z, *cpy_z;
    uint32_t *dev_a, *dev_b, *dev_d, *dev_c, *carry, *c_2;

    samples = 1024*512;
    words   = samples/4;

    cudaMalloc( (void**)&dev_a, words * sizeof(uint32_t) );
    cudaMalloc( (void**)&dev_b, words * sizeof(uint32_t) );
    cudaMalloc( (void**)&dev_d, words * sizeof(uint32_t) );
    cudaMalloc( (void**)&dev_c, words * sizeof(uint32_t) );
    cudaMalloc( (void**)&carry, words * sizeof(uint32_t) );
    cudaMalloc( (void**)&c_2,   words * sizeof(uint32_t) );

    cudaMemset(dev_a, 0, sizeof(uint32_t)*words);
    cudaMemset(dev_b, 0, sizeof(uint32_t)*words);
    cudaMemset(dev_d, 0, sizeof(uint32_t)*words);
    cudaMemset(dev_c, 0, sizeof(uint32_t)*words);
    cudaMemset(carry, 0, sizeof(uint32_t)*words);
    cudaMemset(c_2,   0, sizeof(uint32_t)*words);

    x     = (uint32_t *)malloc(sizeof(uint32_t)*words);
    y     = (uint32_t *)malloc(sizeof(uint32_t)*words);
    d     = (uint32_t *)malloc(sizeof(uint32_t)*words);
    z     = (uint32_t *)malloc(sizeof(uint32_t)*words);
    cpy_z = (uint32_t *)malloc(sizeof(uint32_t)*words);

    for(index=0; index<words-2; index++) {          // generate the test data
        x[index]=(rand()<<16)^rand();
        y[index]=(rand()<<16)^rand();
    }
    for(index=words-2; index<words; index++) {
        x[index]=0;
        y[index]=0;
    }
    for(index=0; index<words-1; index++) {
        d[index]=(rand()<<16)^rand();
    }
    d[words-1]=0;

    cudaMemcpy( dev_a, x, words * sizeof(uint32_t), cudaMemcpyHostToDevice );  // copy data from cpu to gpu
    cudaMemcpy( dev_b, y, words * sizeof(uint32_t), cudaMemcpyHostToDevice );
    cudaMemcpy( dev_d, d, words * sizeof(uint32_t), cudaMemcpyHostToDevice );

    z = AddMod(x, y, d, words-1, words);     // C function used to test the results.

    CUDA_AddMod<<<4096,256>>>(dev_a, dev_b, dev_d, words-1, words, dev_c, carry, c_2);  // Cuda function

    cudaMemcpy( cpy_z, c_2, words*sizeof(uint32_t), cudaMemcpyDeviceToHost );  // copy results back to cpu

    int flag=0;                             // testing is as follows.
    int nof=0;
    int fi;
    printf("cpy_z[0]=%d\n", cpy_z[0]);
    printf("z[0]=%d\n",     z[0]);
    printf("cpy_z[1]=%d\n", cpy_z[1]);
    printf("z[1]=%d\n",     z[1]);
    printf("cpy_z[2]=%d\n", cpy_z[2]);
    printf("z[2]=%d\n",     z[2]);
    printf("cpy_z[3]=%d\n", cpy_z[3]);
    printf("z[3]=%d\n",     z[3]);
    printf("cpy_z[4]=%d\n", cpy_z[4]);
    printf("z[4]=%d\n",     z[4]);

    for(int i=0; i<words; i++){
        if(cpy_z[i]!=z[i]){
            flag=1;
            nof++;
            fi=i;
            break;
        }
    }
    if (flag==1)
        printf("fail %d times at %d\n", nof, fi);
    else
        printf("all succeed!\n");

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_d);
    cudaFree(dev_c);
    cudaFree(carry);
    cudaFree(c_2);

    free(x);
    free(y);
    free(d);
    free(z);
    free(cpy_z);
}

uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t dLength, uint32_t words){  // C function
    int i;
    unsigned long long int SumWord;
    uint32_t *carry, *z_1;

    z_1   = (uint32_t *)malloc(sizeof(uint32_t)*(dLength+1));
    carry = (uint32_t *)malloc(sizeof(uint32_t)*words);

    carry[0]=0;
    for(i=0; i<words; i++){
        SumWord = (unsigned long long int)x[i]+(unsigned long long int)y[i];
        if(SumWord >= 0x100000000){
            z_1[i] = SumWord - 0x100000000;
            carry[i+1] = 1;
        }
        else {
            z_1[i] = SumWord;
            carry[i+1] = 0;
        }
    }
    for(i=0; i<dLength+1; i++){
        z_1[i] = z_1[i] + carry[i];
    }
    return z_1;
}

// Cuda function is as follows.
__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, uint32_t dLength, uint32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2){
    unsigned long long int SumWord;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if(tid<words){
        SumWord = (unsigned long long int)a[tid]+(unsigned long long int)b[tid];
        if(SumWord >= 0x100000000){
            c[tid] = SumWord - 0x100000000;
            carry[tid+1] = 1;
        }
        else {
            c[tid] = SumWord;
            carry[tid+1] = 0;
        }
        __syncthreads();

        c_2[0]   = c[0];
        c_2[tid] = c[tid] + carry[tid]; // The problem is this sentence.
    }
}

Hi,

I guess that your problem comes from a mix-up between "dLength" and "words" in the kernel (as well as in the C function, BTW):

    if(tid<words){         // tid can go up to words-1 here
        ...
        carry[tid+1] = 0;  // but carry is allocated like this: cudaMalloc( (void**)&carry, words * sizeof(uint32_t) );
        ...
    }

So you have an "off by one" problem here: for tid == words-1, carry[tid+1] writes one element past the end of the buffer.
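
If you want to keep the carry[tid+1] indexing, one quick workaround (just a sketch, untested against your code) is to give carry one extra element so the last thread stays in bounds:

    cudaMalloc( (void**)&carry, (words + 1) * sizeof(uint32_t) );
    cudaMemset( carry, 0, (words + 1) * sizeof(uint32_t) );

and the same +1 for the host-side carry buffer malloc'ed inside AddMod.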

HTH

I made some changes to my code, but the problem still exists. I do not know how to find it.

#include "cuda.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <gmp.h>

uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t words);
__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, int32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2);

int main()
{
    int       samples, words, index;
    uint32_t *x, *y, *d, *z, *cpy_z;
    uint32_t *dev_a, *dev_b, *dev_d, *dev_c, *carry, *c_2;

    samples = 1024*512;
    words   = samples/4;

    cudaMalloc( (void**)&dev_a, words * sizeof(uint32_t) );
    cudaMalloc( (void**)&dev_b, words * sizeof(uint32_t) );
    cudaMalloc( (void**)&dev_d, words * sizeof(uint32_t) );
    cudaMalloc( (void**)&dev_c, words * sizeof(uint32_t) );
    cudaMalloc( (void**)&carry, words * sizeof(uint32_t) );
    cudaMalloc( (void**)&c_2,   words * sizeof(uint32_t) );

    cudaMemset(dev_a, 0, sizeof(uint32_t)*words);
    cudaMemset(dev_b, 0, sizeof(uint32_t)*words);
    cudaMemset(dev_d, 0, sizeof(uint32_t)*words);
    cudaMemset(dev_c, 0, sizeof(uint32_t)*words);
    cudaMemset(carry, 0, sizeof(uint32_t)*words);
    cudaMemset(c_2,   0, sizeof(uint32_t)*words);

    x     = (uint32_t *)malloc(sizeof(uint32_t)*words);
    y     = (uint32_t *)malloc(sizeof(uint32_t)*words);
    d     = (uint32_t *)malloc(sizeof(uint32_t)*words);
    z     = (uint32_t *)malloc(sizeof(uint32_t)*words);
    cpy_z = (uint32_t *)malloc(sizeof(uint32_t)*words);

    for(index=0; index<words-2; index++) {          // generate the test data
        x[index]=(rand()<<16)^rand();
        y[index]=(rand()<<16)^rand();
    }
    for(index=words-2; index<words; index++) {
        x[index]=0;
        y[index]=0;
    }
    for(index=0; index<words-1; index++) {
        d[index]=(rand()<<16)^rand();
    }
    d[words-1]=0;

    cudaMemcpy( dev_a, x, words * sizeof(uint32_t), cudaMemcpyHostToDevice );  // copy data from cpu to gpu
    cudaMemcpy( dev_b, y, words * sizeof(uint32_t), cudaMemcpyHostToDevice );
    cudaMemcpy( dev_d, d, words * sizeof(uint32_t), cudaMemcpyHostToDevice );

    z = AddMod(x, y, d, words);     // C function used to test the results.

    CUDA_AddMod<<<4096,256>>>(dev_a, dev_b, dev_d, words, dev_c, carry, c_2);  // Cuda function

    cudaMemcpy( cpy_z, c_2, words*sizeof(uint32_t), cudaMemcpyDeviceToHost );  // copy results back to cpu

    int flag=0;                             // testing is as follows.
    int nof=0;
    int fi;
    printf("cpy_z[0]=%d\n", cpy_z[0]);
    printf("z[0]=%d\n",     z[0]);
    printf("cpy_z[1]=%d\n", cpy_z[1]);
    printf("z[1]=%d\n",     z[1]);
    printf("cpy_z[2]=%d\n", cpy_z[2]);
    printf("z[2]=%d\n",     z[2]);
    printf("cpy_z[3]=%d\n", cpy_z[3]);
    printf("z[3]=%d\n",     z[3]);
    printf("cpy_z[4]=%d\n", cpy_z[4]);
    printf("z[4]=%d\n",     z[4]);

    for(int i=0; i<words; i++){
        if(cpy_z[i]!=z[i]){
            flag=1;
            nof++;
            fi=i;
            break;
        }
    }
    if (flag==1)
        printf("fail %d times at %d\n", nof, fi);
    else
        printf("all succeed!\n");

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_d);
    cudaFree(dev_c);
    cudaFree(carry);
    cudaFree(c_2);

    free(x);
    free(y);
    free(d);
    free(z);
    free(cpy_z);
}

uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t words){  // C function
    int i;
    unsigned long long int SumWord;
    uint32_t *carry, *z_1;

    z_1   = (uint32_t *)malloc(sizeof(uint32_t)*words);
    carry = (uint32_t *)malloc(sizeof(uint32_t)*words);

    carry[0]=0;
    for(i=0; i<words; i++){
        SumWord = (unsigned long long int)x[i]+(unsigned long long int)y[i];
        if(SumWord >= 0x100000000){
            z_1[i] = SumWord - 0x100000000;
            carry[i] = 1;
        }
        else {
            z_1[i] = SumWord;
            carry[i] = 0;
        }
    }
    for(i=1; i<words; i++){
        z_1[i] = z_1[i] + carry[i-1];
    }
    return z_1;
}

// Cuda function is as follows.
__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, int32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2){
    unsigned long long int SumWord;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if(tid<words){
        SumWord = (unsigned long long int)a[tid]+(unsigned long long int)b[tid];
        if(SumWord >= 0x100000000){
            c[tid] = SumWord - 0x100000000;
            carry[tid] = 1;
        }
        else {
            c[tid] = SumWord;
            carry[tid] = 0;
        }
        __syncthreads();

        if(tid>0)
            c_2[tid] = c[tid] + carry[tid-1]; // The problem is this sentence.
        c_2[0] = c[0];
        __syncthreads();
    }
}

Here again your code is wrong, since tid and tid-1 do not always belong to the same thread block. __syncthreads() only synchronizes the threads inside a single block, not across blocks, so reading carry[tid-1] races with the thread that writes it. For example, with 256 threads per block (your launch configuration), thread tid = 256 reads carry[255], which is written by a thread in a different block that may not have run yet.
Can it be your problem?

Yes, this is the problem. I am just beginning to learn CUDA programming. Could you give me some advice on how to make my CUDA program correct? Thanks for your help.

Hello,

The threadfence functions might help. They do not perform a synchronization; rather, __threadfence() makes the calling thread wait until its earlier writes are guaranteed to be observed by every thread on the device before any of its later writes. The programming guide has an example of how to use it in a reduction algorithm.
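
As a minimal illustration of what the fence guarantees (this is essentially the small example from the programming guide, not specific to your code):

    __device__ int X = 1, Y = 2;

    __device__ void writeXY()
    {
        X = 10;
        __threadfence();   // any other thread that sees Y == 20 is also guaranteed to see X == 10
        Y = 20;
    }

Without the fence, another thread could observe Y == 20 while still reading the old value X == 1.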

For this specific problem, ending your kernel just before the faulty line and then launching a second kernel that does only that operation should do it; splitting work into separate kernel launches is the usual way of enforcing a global synchronization across all threads. See the sketch below.
I'm sure there are optimisations to think about, but first make it right, then think about making it faster if needed.
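
Roughly, something like this (hypothetical kernel names, reusing your buffers, untested):

    __global__ void AddWords(const uint32_t *a, const uint32_t *b, int32_t words,
                             uint32_t *c, uint32_t *carry)
    {
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        if (tid < words) {
            unsigned long long SumWord = (unsigned long long)a[tid] + (unsigned long long)b[tid];
            c[tid]     = (uint32_t)SumWord;          // low 32 bits of the sum
            carry[tid] = (uint32_t)(SumWord >> 32);  // 1 if the addition overflowed, else 0
        }
    }

    __global__ void PropagateCarry(const uint32_t *c, const uint32_t *carry, int32_t words,
                                   uint32_t *c_2)
    {
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        if (tid < words)
            c_2[tid] = (tid == 0) ? c[0] : c[tid] + carry[tid - 1];
    }

On the host you would then launch AddWords<<<4096,256>>>(dev_a, dev_b, words, dev_c, carry); followed by PropagateCarry<<<4096,256>>>(dev_c, carry, words, c_2);. Since the second kernel cannot start before the first one has finished (same stream), every carry[] value is complete when it is read, which is exactly the global synchronization you were missing.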

EDIT: just saw pasoleatis’ answer and I must say I didn’t remember about __threadfence

Hello, I made the changes below; the new kernel uses __threadfence, but the program still does not run properly. Am I using __threadfence correctly, or are there still other problems in the CUDA code?

__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, int32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2){
    unsigned long long int SumWord;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if(tid<words){
        SumWord = (unsigned long long int)a[tid]+(unsigned long long int)b[tid];
        if(SumWord >= 0x100000000){
            c[tid] = SumWord - 0x100000000;
            carry[tid] = 1;
        }
        else {
            c[tid] = SumWord;
            carry[tid] = 0;
        }

        if(tid>0)
            c_2[tid] = c[tid] + carry[tid-1]; // The problem is this sentence.
        c_2[0] = c[0];
        __threadfence();
    }
}

No, you are not using threadfence properly at all.
Just look at page 102 (printed 90) of the CUDA_C_Programming_Guide.pdf for a usage example.

I looked through the example in the programming guide, but I was confused and do not know how to apply it to my CUDA code. Any suggestions? Thanks.

Just launch another kernel. It’s the proper solution anyway.