how to accelerate this cuda codes

Hello all, I wrote a cude codes. But the speed is slow. Could anyone give me some advices about how to accelerate the codes? Thanks,

#include "cuda.h"

#include <stdio.h>

#include <stdlib.h>

void AddParallel(unsigned int *x, unsigned int xLength, unsigned int *y, unsigned int Length,  unsigned int *z, int samplec);

__global__ void CUDA_ADD1(unsigned int *a, unsigned int *b, unsigned int *c, int words, unsigned int *carry);

__global__ void CUDA_ADD2(unsigned int *a, unsigned int *b, unsigned int *c, int words, unsigned int *carry);

int main()

{

	int          words, index, samples;

	unsigned int    *x, *y, *z, *cpy_z;

	unsigned int *dev_a, *dev_b, *dev_c, *carry;

	//int dim=1024*128;

	samples=1024*128; //dim*dim;

	//words=samples/4;

        words=24533;

//const int threadsPerBlock = 256;

	//dim3 block(dim/16,dim/16);

	//dim3 thread(16,16);

	cudaMalloc( (void**)&dev_a, words * sizeof(unsigned int) );

    	cudaMalloc( (void**)&dev_b, words * sizeof(unsigned int) );

    	cudaMalloc( (void**)&dev_c, words * sizeof(unsigned int) );

	cudaMalloc( (void**)&carry, words * sizeof(unsigned int) );

	x=(unsigned int *)malloc(sizeof(unsigned int)*words);

	y=(unsigned int *)malloc(sizeof(unsigned int)*words);

	z=(unsigned int *)malloc(sizeof(unsigned int)*words);

	cpy_z=(unsigned int *)malloc(sizeof(unsigned int)*words);

	for(index=0;index<words-1;index++)

	{

		x[index]=(rand()<<16)^rand();

		y[index]=(rand()<<16)^rand();

        //printf("x[%d] = %x \n", index, x[index]);

        //printf("y[%d] = %x \n", index, y[index]);

 	}

        x[words-1]=0;

        y[words-1]=0;

	cudaMemcpy( dev_a, x, words * sizeof(unsigned int),cudaMemcpyHostToDevice );

	cudaMemcpy( dev_b, y, words * sizeof(unsigned int),cudaMemcpyHostToDevice );

	AddParallel(x, words, y, words, z, samples);

	cudaEvent_t     start, stop;

	cudaEventCreate( &start );

	cudaEventCreate( &stop );

	cudaEventRecord( start, 0);

	

        CUDA_ADD1<<<256,128>>>(dev_a,dev_b,dev_c,words,carry);

        CUDA_ADD2<<<256,128>>>(dev_a,dev_b,dev_c,words,carry);

	cudaEventRecord( stop, 0 );

	cudaEventSynchronize( stop );

	float   elapsed;

	cudaEventElapsedTime( &elapsed,start, stop );

	elapsed=elapsed/1000;

	printf( "Time to generate:  %3.5f s\n", elapsed ); 

cudaEventDestroy( start );

        cudaEventDestroy( stop ); 

	cudaMemcpy( cpy_z, dev_c, words*sizeof(unsigned int), cudaMemcpyDeviceToHost );

	int flag=0;

	

	int nof=0;

	int fi;

	for(int i=0;i<words;i++)

	{

		if(cpy_z[i]!=z[i])

		{

			flag=1;

			nof++;

			fi=i;

			break;

		}

	}

	if (flag==1)

		printf("fail %d times at %d\n",nof,fi);

	else

		printf("all succeed!\n");

}

void AddParallel(unsigned int *x, unsigned int xLength, unsigned int *y, unsigned int Length,  unsigned int *z, int samples){

	int   words, i;

	unsigned long long int   SumWord;

	unsigned int *carry;

	words=samples/4;

	carry=(unsigned int *)malloc(sizeof(unsigned int)*words);

	for(i=0; i<words; i++){

		SumWord = (unsigned long long int)x[i]+(unsigned long long int)y[i];

                //printf("SumWord %l = %llx\n", i, SumWord);

		if(SumWord >= 0x100000000){

			z[i] = SumWord - 0x100000000; //bit operation 0x100000000 100000000

			carry[i] = 1;

		}

		else {

			z[i] = SumWord;

			carry[i] = 0;

		}

	}

	z[words-1] = 0;

	for(i=1; i<words; i++){

		z[i] = z[i] + carry[i-1];

	}

//printf("Size: %5dK bits addition complete\n", samples/128);

}

__global__ void CUDA_ADD1(unsigned int *a, unsigned int *b, unsigned int *c, int words, unsigned int *carry){

	unsigned long long  int SumWord;

int tid = threadIdx.x + blockIdx.x * blockDim.x;

if(tid < words) {

	    SumWord = (unsigned long long int)a[tid]+(unsigned long long int)b[tid];

            c[tid] = SumWord & 0xFFFFFFFF;

            carry[tid] = SumWord >> 32;

}

}

__global__ void CUDA_ADD2(unsigned int *a, unsigned int *b, unsigned int *c, int words, unsigned int *carry){

int tid = threadIdx.x + blockIdx.x * blockDim.x;

if(tid < words) {

	   if(tid!=0)

               c[tid] = c[tid] + carry[tid-1];

       }

}

I am not an expert but you can try to use shared or local memory instead of the global one. Have a look at the chapter 6 of the Best Practices guide.

By the way you also may read the entire guide. It can be very useful to understand some important concepts.

It would help a little bit you could describe what your code is supposed to do. I can see one way to improve. This lines

if(tid!=0)

               c[tid] = c[tid] + carry[tid-1];

instead of having c[tid] = c[tid] + carry[tid-1]; you couls try somehting like ca[tid] = cb[tid] + carry[tid-1];

I do not think it is good in general (both gpu or cpu) to read and then write the same address.

You could also check if you can combine the kernels. If you can combine the kernels then you will need to maybe shared memory because each thread would process b[tid] and b[tid-1] and there is no point to fetch the same data 2 times from the global memory.

The numbers from the calls <<<256,128>>> can be adjusted to have maximum efficiency.