help with my cuda program

weiwang217 · March 5, 2012, 8:19pm

Hi all, I wrote a cuda program. The results (carry) are correct if the last sentence (c_2[tid] = c_2[tid] +carry[tid] External Image is not been executed. But the last sentence does not change the output(carry). If the last sentence is replaced with (c_2[tid] =1;), the results are still wrong. It’s very wired. Could any one can help me to find the problem? Thanks.

#include "cuda.h"

#include <stdio.h>

#include <stdlib.h>

#include <stdint.h>

#include <gmp.h>

uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t dLength, uint32_t words);

__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, uint32_t dLength, uint32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2);

int main()

{

int          samples, words, index;

uint32_t    *x, *y, *d, *z, *cpy_z;

uint32_t *dev_a, *dev_b, *dev_d, *dev_c, *carry, *c_2;

samples=512;

words=samples/4;

cudaMalloc( (void**)&dev_a, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&dev_b, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&dev_d, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&dev_c, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&carry, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&c_2, words * sizeof(uint32_t) );

cudaMemset(dev_a, 0, sizeof(uint32_t)*words);

   cudaMemset(dev_b, 0, sizeof(uint32_t)*words);

   cudaMemset(dev_d, 0, sizeof(uint32_t)*words);

   cudaMemset(dev_c, 0, sizeof(uint32_t)*words);

   cudaMemset(carry, 0, sizeof(uint32_t)*words);

   cudaMemset(c_2, 0, sizeof(uint32_t)*words);

x=(uint32_t *)malloc(sizeof(uint32_t)*words);

y=(uint32_t *)malloc(sizeof(uint32_t)*words);

   d=(uint32_t *)malloc(sizeof(uint32_t)*words);

z=(uint32_t *)malloc(sizeof(uint32_t)*words);

   cpy_z=(uint32_t *)malloc(sizeof(uint32_t)*words);

for(index=0;index<words-2;index++) {

	x[index]=(rand()<<16)^rand();

	y[index]=(rand()<<16)^rand();

}

   for(index=words-2;index<words;index++) {

	x[index]=0;

	y[index]=0;

}

for(index=0;index<words-1;index++) {

	d[index]=(rand()<<16)^rand();

}

   d[words-1]=0;

cudaMemcpy( dev_a, x, words * sizeof(uint32_t),cudaMemcpyHostToDevice );

   cudaMemcpy( dev_b, y, words * sizeof(uint32_t),cudaMemcpyHostToDevice );

   cudaMemcpy( dev_d, d, words * sizeof(uint32_t),cudaMemcpyHostToDevice );

z = AddMod(x, y, d, words-1, words);

cudaEvent_t     start, stop;

   cudaEventCreate( &start );

   cudaEventCreate( &stop );

   cudaEventRecord( start, 0);

CUDA_AddMod<<<1024,256>>>(dev_a,dev_b,dev_d,words-1, words,dev_c,carry,c_2);

cudaEventRecord( stop, 0 );

   cudaEventSynchronize( stop );

   float   elapsed;

   cudaEventElapsedTime( &elapsed,start, stop );

   elapsed=elapsed/1000;

   printf( "Time to generate:  %3.5f s\n", elapsed ); 

cudaEventDestroy( start );

   cudaEventDestroy( stop ); 

cudaMemcpy( cpy_z, carry, words*sizeof(uint32_t), cudaMemcpyDeviceToHost );

	int flag=0;

	

	int nof=0;

	int fi;

printf("cpy_z[0]=%d\n",  cpy_z[0]);

                        printf("z[0]=%d\n",  z[0]);

                        printf("cpy_z[1]=%d\n",  cpy_z[1]);

                        printf("z[1]=%d\n",  z[1]);

                        printf("cpy_z[2]=%d\n",  cpy_z[2]);

                        printf("z[2]=%d\n",  z[2]);

                        printf("cpy_z[3]=%d\n",  cpy_z[3]);

                        printf("z[3]=%d\n",  z[3]);

                        printf("cpy_z[4]=%d\n",  cpy_z[4]);

                        printf("z[4]=%d\n",  z[4]);

	for(int i=0;i<words;i++)

	{

		if(cpy_z[i]!=z[i])

		{

			flag=1;

			nof++;

			fi=i;

			break;

		}

	}

	if (flag==1)

		printf("fail %d times at %d\n",nof,fi);

	else

		printf("all succeed!\n");

cudaFree(dev_a);

   cudaFree(dev_b);

   cudaFree(dev_d);

   cudaFree(dev_c);

   cudaFree(carry);

   cudaFree(c_2);

free(x);

   free(y);

   free(d);

   free(z);

   free(cpy_z);

}

uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t dLength, uint32_t words){

	int    i;

	unsigned long long int   SumWord;

	uint32_t *carry, *z_1, *z_2, *z_3;

z_1=(uint32_t *)malloc(sizeof(uint32_t)*(dLength+1));

        z_2=(uint32_t *)malloc(sizeof(uint32_t)*words);

        z_3=(uint32_t *)malloc(sizeof(uint32_t)*words);

        carry=(uint32_t *)malloc(sizeof(uint32_t)*words);

carry[0]=0;

	for(i=0; i<words; i++){

		SumWord = (unsigned long long int)x[i]+(unsigned long long int)y[i];

                //printf("SumWord %l = %llx\n", i, SumWord);

		if(SumWord >= 0x100000000){

			z_1[i] = SumWord - 0x100000000; 

			carry[i+1] = 1;

		}

		else {

			z_1[i] = SumWord;

			carry[i+1] = 0;

		}

	}

for(i=0; i<dLength+1; i++){

              z_1[i] = z_1[i] + carry[i];

        }

return carry;

}

__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, uint32_t dLength, uint32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2){

	unsigned long long  int SumWord;

int tid = threadIdx.x + blockIdx.x * blockDim.x;

if(tid<words){

	    SumWord = (unsigned long long int)a[tid]+(unsigned long long int)b[tid];

	    if(SumWord >= 0x100000000){

		c[tid] = SumWord - 0x100000000; //bit operation 0x100000000 100000000

		carry[tid+1] = 1;

	    }

	    else {

		c[tid] = SumWord;

		carry[tid+1] = 0;

	    }

	    __syncthreads();

}

c_2[tid] = c_2[tid] +carry[tid] ; //c_2[tid] =1;

}

pasoleatis · March 5, 2012, 8:21pm

Please put the code between these line, makes it more readable:

[ code ]

[ / code ]

(without the spaces)

#include "cuda.h"

#include <stdio.h>

#include <stdlib.h>

#include <stdint.h>

#include <gmp.h>

uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t dLength, uint32_t words);

__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, uint32_t dLength, uint32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2);

int main()

{

int samples, words, index;

uint32_t *x, *y, *d, *z, *cpy_z;

uint32_t *dev_a, *dev_b, *dev_d, *dev_c, *carry, *c_2;

samples=512;

words=samples/4;

cudaMalloc( (void**)&dev_a, words * sizeof(uint32_t) );

cudaMalloc( (void**)&dev_b, words * sizeof(uint32_t) );

cudaMalloc( (void**)&dev_d, words * sizeof(uint32_t) );

cudaMalloc( (void**)&dev_c, words * sizeof(uint32_t) );

cudaMalloc( (void**)&carry, words * sizeof(uint32_t) );

cudaMalloc( (void**)&c_2, words * sizeof(uint32_t) );

cudaMemset(dev_a, 0, sizeof(uint32_t)*words);

cudaMemset(dev_b, 0, sizeof(uint32_t)*words);

cudaMemset(dev_d, 0, sizeof(uint32_t)*words);

cudaMemset(dev_c, 0, sizeof(uint32_t)*words);

cudaMemset(carry, 0, sizeof(uint32_t)*words);

cudaMemset(c_2, 0, sizeof(uint32_t)*words);

x=(uint32_t *)malloc(sizeof(uint32_t)*words);

y=(uint32_t *)malloc(sizeof(uint32_t)*words);

d=(uint32_t *)malloc(sizeof(uint32_t)*words);

z=(uint32_t *)malloc(sizeof(uint32_t)*words);

cpy_z=(uint32_t *)malloc(sizeof(uint32_t)*words);

for(index=0;index<words-2;index++) { //generate the test data

x[index]=(rand()<<16)^rand();

y[index]=(rand()<<16)^rand();

}

for(index=words-2;index<words;index++) {

x[index]=0;

y[index]=0;

}

for(index=0;index<words-1;index++) {

d[index]=(rand()<<16)^rand();

}

d[words-1]=0;

cudaMemcpy( dev_a, x, words * sizeof(uint32_t),cudaMemcpyHostToDevice ); // copy data from cpu to gpu

cudaMemcpy( dev_b, y, words * sizeof(uint32_t),cudaMemcpyHostToDevice );

cudaMemcpy( dev_d, d, words * sizeof(uint32_t),cudaMemcpyHostToDevice );

z = AddMod(x, y, d, words-1, words); //C function used to test the results.

CUDA_AddMod<<<1024,256>>>(dev_a,dev_b,dev_d, words-1, words,dev_c,carry,c_2); // Cuda function

cudaMemcpy( cpy_z, carry, words*sizeof(uint32_t), cudaMemcpyDeviceToHost ); //copy results back to cpu

int flag=0; //testing is as follows.

int nof=0;

int fi;

printf("cpy_z[0]=%d\n", cpy_z[0]);

printf("z[0]=%d\n", z[0]);

printf("cpy_z[1]=%d\n", cpy_z[1]);

printf("z[1]=%d\n", z[1]);

printf("cpy_z[2]=%d\n", cpy_z[2]);

printf("z[2]=%d\n", z[2]);

printf("cpy_z[3]=%d\n", cpy_z[3]);

printf("z[3]=%d\n", z[3]);

printf("cpy_z[4]=%d\n", cpy_z[4]);

printf("z[4]=%d\n", z[4]);

for(int i=0;i<words;i++){

if(cpy_z[i]!=z[i]){

flag=1;

nof++;

fi=i;

break;

}

}

if (flag==1)

printf("fail %d times at %d\n",nof,fi);

else

printf("all succeed!\n");

cudaFree(dev_a);

cudaFree(dev_b);

cudaFree(dev_d);

cudaFree(dev_c);

cudaFree(carry);

cudaFree(c_2);

free(x);

free(y);

free(d);

free(z);

free(cpy_z);

}

uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t dLength, uint32_t words){ //C function

int i;

unsigned long long int SumWord;

uint32_t *carry, *z_1;

z_1=(uint32_t *)malloc(sizeof(uint32_t)*(dLength+1));

carry=(uint32_t *)malloc(sizeof(uint32_t)*words);

carry[0]=0;

for(i=0; i<words; i++){

SumWord = (unsigned long long int)x[i]+(unsigned long long int)y[i];

if(SumWord >= 0x100000000){

z_1[i] = SumWord - 0x100000000;

carry[i+1] = 1;

}

else {

z_1[i] = SumWord;

carry[i+1] = 0;

}

}

for(i=0; i<dLength+1; i++){

z_1[i] = z_1[i] + carry[i];

}

return carry;

}

//Cuda function is as follows.

__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, uint32_t dLength, uint32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2){

unsigned long long int SumWord;

int tid = threadIdx.x + blockIdx.x * blockDim.x;

if(tid<words){

SumWord = (unsigned long long int)a[tid]+(unsigned long long int)b[tid];

if(SumWord >= 0x100000000){

c[tid] = SumWord - 0x100000000;

carry[tid+1] = 1;

}

else {

c[tid] = SumWord;

carry[tid+1] = 0;

}

__syncthreads();

}

c_2[tid] = c_2[tid] +carry[tid] ; //c_2[tid] =1; The problem is this sentence.

}

weiwang217 · March 5, 2012, 8:40pm

I am sorry about that. I make some changes to my codes. I hope it will bring some help.

#include "cuda.h"

#include <stdio.h>

#include <stdlib.h>

#include <stdint.h>

#include <gmp.h>

uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t dLength, uint32_t words);

__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, uint32_t dLength, uint32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2);

int main()            

{

int          samples, words, index;

uint32_t    *x, *y, *d, *z, *cpy_z;

   uint32_t *dev_a, *dev_b, *dev_d, *dev_c, *carry, *c_2;

samples=512;

   words=samples/4;

cudaMalloc( (void**)&dev_a, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&dev_b, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&dev_d, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&dev_c, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&carry, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&c_2, words * sizeof(uint32_t) );

cudaMemset(dev_a, 0, sizeof(uint32_t)*words);

   cudaMemset(dev_b, 0, sizeof(uint32_t)*words);

   cudaMemset(dev_d, 0, sizeof(uint32_t)*words);

   cudaMemset(dev_c, 0, sizeof(uint32_t)*words);

   cudaMemset(carry, 0, sizeof(uint32_t)*words);

   cudaMemset(c_2, 0, sizeof(uint32_t)*words);

x=(uint32_t *)malloc(sizeof(uint32_t)*words);

y=(uint32_t *)malloc(sizeof(uint32_t)*words);

   d=(uint32_t *)malloc(sizeof(uint32_t)*words);

z=(uint32_t *)malloc(sizeof(uint32_t)*words);

   cpy_z=(uint32_t *)malloc(sizeof(uint32_t)*words);

for(index=0;index<words-2;index++) {           //generate the test data

x[index]=(rand()<<16)^rand();

y[index]=(rand()<<16)^rand();

}

   for(index=words-2;index<words;index++) {

x[index]=0;

y[index]=0;

}

   for(index=0;index<words-1;index++) {

d[index]=(rand()<<16)^rand();

}

   d[words-1]=0;

cudaMemcpy( dev_a, x, words * sizeof(uint32_t),cudaMemcpyHostToDevice );  // copy data from cpu to gpu

   cudaMemcpy( dev_b, y, words * sizeof(uint32_t),cudaMemcpyHostToDevice );

   cudaMemcpy( dev_d, d, words * sizeof(uint32_t),cudaMemcpyHostToDevice );

z = AddMod(x, y, d, words-1, words);     //C function used to test the results.

CUDA_AddMod<<<1024,256>>>(dev_a,dev_b,dev_d,words-1, words,dev_c,carry,c_2);  // Cuda function

cudaMemcpy( cpy_z, carry, words*sizeof(uint32_t), cudaMemcpyDeviceToHost );  //copy results back to cpu

int flag=0;                             //testing is as follows.

   int nof=0;

   int fi;

   printf("cpy_z[0]=%d\n",  cpy_z[0]);

   printf("z[0]=%d\n",  z[0]);

   printf("cpy_z[1]=%d\n",  cpy_z[1]);

   printf("z[1]=%d\n",  z[1]);

   printf("cpy_z[2]=%d\n",  cpy_z[2]);

   printf("z[2]=%d\n",  z[2]);

   printf("cpy_z[3]=%d\n",  cpy_z[3]);

   printf("z[3]=%d\n",  z[3]);

   printf("cpy_z[4]=%d\n",  cpy_z[4]);

   printf("z[4]=%d\n",  z[4]);

for(int i=0;i<words;i++){

       if(cpy_z[i]!=z[i]){

             flag=1;

             nof++;

             fi=i;

             break;

       }

   }

if (flag==1)

     printf("fail %d times at %d\n",nof,fi);

  else

     printf("all succeed!\n");

cudaFree(dev_a);

   cudaFree(dev_b);

   cudaFree(dev_d);

   cudaFree(dev_c);

   cudaFree(carry);

   cudaFree(c_2);

free(x);

   free(y);

   free(d);

   free(z);

   free(cpy_z);

}

uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t dLength, uint32_t words){  //C function

    int    i;

    unsigned long long int   SumWord;

    uint32_t *carry, *z_1;

z_1=(uint32_t *)malloc(sizeof(uint32_t)*(dLength+1));

    carry=(uint32_t *)malloc(sizeof(uint32_t)*words);

carry[0]=0;

    for(i=0; i<words; i++){

        SumWord = (unsigned long long int)x[i]+(unsigned long long int)y[i];

            if(SumWord >= 0x100000000){

                z_1[i] = SumWord - 0x100000000; 

                carry[i+1] = 1;

            }

           else {

                z_1[i] = SumWord;

                carry[i+1] = 0;

           }

    }

for(i=0; i<dLength+1; i++){

        z_1[i] = z_1[i] + carry[i];

    }

    return carry;

}

//Cuda function is as follows.

__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, uint32_t dLength, uint32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2){

unsigned long long  int SumWord;

    int tid = threadIdx.x + blockIdx.x * blockDim.x;

if(tid<words){

        SumWord = (unsigned long long int)a[tid]+(unsigned long long int)b[tid];

        if(SumWord >= 0x100000000){

            c[tid] = SumWord - 0x100000000; 

            carry[tid+1] = 1;

	}

	else {

	    c[tid] = SumWord;

            carry[tid+1] = 0;

	}

        __syncthreads();

    }

c_2[tid] = c_2[tid] +carry[tid] ; //c_2[tid] =1; The problem is this sentence.

}

weiwang217 · March 5, 2012, 10:11pm

The problem has been solved. Thanks.

I am sorry about that. I make some changes to my codes. I hope it will bring some help.

#include "cuda.h"

#include <stdio.h>

#include <stdlib.h>

#include <stdint.h>

#include <gmp.h>

uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t dLength, uint32_t words);

__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, uint32_t dLength, uint32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2);

int main()            

{

int          samples, words, index;

uint32_t    *x, *y, *d, *z, *cpy_z;

   uint32_t *dev_a, *dev_b, *dev_d, *dev_c, *carry, *c_2;

samples=512;

   words=samples/4;

cudaMalloc( (void**)&dev_a, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&dev_b, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&dev_d, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&dev_c, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&carry, words * sizeof(uint32_t) );

   cudaMalloc( (void**)&c_2, words * sizeof(uint32_t) );

cudaMemset(dev_a, 0, sizeof(uint32_t)*words);

   cudaMemset(dev_b, 0, sizeof(uint32_t)*words);

   cudaMemset(dev_d, 0, sizeof(uint32_t)*words);

   cudaMemset(dev_c, 0, sizeof(uint32_t)*words);

   cudaMemset(carry, 0, sizeof(uint32_t)*words);

   cudaMemset(c_2, 0, sizeof(uint32_t)*words);

x=(uint32_t *)malloc(sizeof(uint32_t)*words);

y=(uint32_t *)malloc(sizeof(uint32_t)*words);

   d=(uint32_t *)malloc(sizeof(uint32_t)*words);

z=(uint32_t *)malloc(sizeof(uint32_t)*words);

   cpy_z=(uint32_t *)malloc(sizeof(uint32_t)*words);

for(index=0;index<words-2;index++) {           //generate the test data

x[index]=(rand()<<16)^rand();

y[index]=(rand()<<16)^rand();

}

   for(index=words-2;index<words;index++) {

x[index]=0;

y[index]=0;

}

   for(index=0;index<words-1;index++) {

d[index]=(rand()<<16)^rand();

}

   d[words-1]=0;

cudaMemcpy( dev_a, x, words * sizeof(uint32_t),cudaMemcpyHostToDevice );  // copy data from cpu to gpu

   cudaMemcpy( dev_b, y, words * sizeof(uint32_t),cudaMemcpyHostToDevice );

   cudaMemcpy( dev_d, d, words * sizeof(uint32_t),cudaMemcpyHostToDevice );

z = AddMod(x, y, d, words-1, words);     //C function used to test the results.

CUDA_AddMod<<<1024,256>>>(dev_a,dev_b,dev_d,words-1, words,dev_c,carry,c_2);  // Cuda function

cudaMemcpy( cpy_z, carry, words*sizeof(uint32_t), cudaMemcpyDeviceToHost );  //copy results back to cpu

int flag=0;                             //testing is as follows.

   int nof=0;

   int fi;

   printf("cpy_z[0]=%d\n",  cpy_z[0]);

   printf("z[0]=%d\n",  z[0]);

   printf("cpy_z[1]=%d\n",  cpy_z[1]);

   printf("z[1]=%d\n",  z[1]);

   printf("cpy_z[2]=%d\n",  cpy_z[2]);

   printf("z[2]=%d\n",  z[2]);

   printf("cpy_z[3]=%d\n",  cpy_z[3]);

   printf("z[3]=%d\n",  z[3]);

   printf("cpy_z[4]=%d\n",  cpy_z[4]);

   printf("z[4]=%d\n",  z[4]);

for(int i=0;i<words;i++){

       if(cpy_z[i]!=z[i]){

             flag=1;

             nof++;

             fi=i;

             break;

       }

   }

if (flag==1)

     printf("fail %d times at %d\n",nof,fi);

  else

     printf("all succeed!\n");

cudaFree(dev_a);

   cudaFree(dev_b);

   cudaFree(dev_d);

   cudaFree(dev_c);

   cudaFree(carry);

   cudaFree(c_2);

free(x);

   free(y);

   free(d);

   free(z);

   free(cpy_z);

}

uint32_t* AddMod(uint32_t *x, uint32_t *y, uint32_t *d, uint32_t dLength, uint32_t words){  //C function

    int    i;

    unsigned long long int   SumWord;

    uint32_t *carry, *z_1;

z_1=(uint32_t *)malloc(sizeof(uint32_t)*(dLength+1));

    carry=(uint32_t *)malloc(sizeof(uint32_t)*words);

carry[0]=0;

    for(i=0; i<words; i++){

        SumWord = (unsigned long long int)x[i]+(unsigned long long int)y[i];

            if(SumWord >= 0x100000000){

                z_1[i] = SumWord - 0x100000000; 

                carry[i+1] = 1;

            }

           else {

                z_1[i] = SumWord;

                carry[i+1] = 0;

           }

    }

for(i=0; i<dLength+1; i++){

        z_1[i] = z_1[i] + carry[i];

    }

    return carry;

}

//Cuda function is as follows.

__global__ void CUDA_AddMod(uint32_t *a, uint32_t *b, uint32_t *d, uint32_t dLength, uint32_t words, uint32_t *c, uint32_t *carry, uint32_t *c_2){

unsigned long long  int SumWord;

    int tid = threadIdx.x + blockIdx.x * blockDim.x;

if(tid<words){

        SumWord = (unsigned long long int)a[tid]+(unsigned long long int)b[tid];

        if(SumWord >= 0x100000000){

            c[tid] = SumWord - 0x100000000; 

            carry[tid+1] = 1;

	}

	else {

	    c[tid] = SumWord;

            carry[tid+1] = 0;

	}

        __syncthreads();

    }

c_2[tid] = c_2[tid] +carry[tid] ; //c_2[tid] =1; The problem is this sentence.

}

Topic		Replies	Views
help me to find the wired problem CUDA Programming and Performance	10	2796	March 6, 2012
Big numbers multiplication CUDA Programming and Performance	3	2212	June 13, 2008
errors in modulo incorrect results when % is used for a ciculair buffer CUDA Programming and Performance	4	712	July 21, 2011
Error when calculating modulo 2+512=2 for CUDA CUDA Programming and Performance	2	2614	December 27, 2009
Compiler bug? CUDA Programming and Performance	2	1176	July 24, 2009
The first calculation is always wrong CUDA Programming and Performance	1	1618	February 27, 2012
Odd code exits kernel CUDA Programming and Performance	10	4141	July 16, 2008
CUDA & modulos CUDA Programming and Performance	1	1614	February 28, 2010
Cant modify data on the GPU CUDA Programming and Performance	16	10264	December 20, 2008
error in modulo operation CUDA Programming and Performance	12	16152	September 20, 2009

help with my cuda program

Related topics