Hello all, I wrote some CUDA code, but it runs slowly. Could anyone give me some advice on how to speed it up? Thanks.
#include "cuda.h"
#include <stdio.h>
#include <stdlib.h>
// Forward declarations so that main() can appear before the definitions.

// Host (CPU) reference addition of the word arrays x and y into z.
// Note: the definition names the last parameter `samples`; the word count it
// processes is derived from that value (samples / 4).
void AddParallel(unsigned int *x, unsigned int xLength, unsigned int *y, unsigned int Length, unsigned int *z, int samplec);
// Kernel, pass 1: per-word 32-bit sum of a and b into c, carry-out of each word into carry.
__global__ void CUDA_ADD1(unsigned int *a, unsigned int *b, unsigned int *c, int words, unsigned int *carry);
// Kernel, pass 2: adds carry[i-1] into c[i] for every word except word 0.
__global__ void CUDA_ADD2(unsigned int *a, unsigned int *b, unsigned int *c, int words, unsigned int *carry);
/* Report a failed CUDA runtime call and terminate; does nothing on cudaSuccess. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA failure (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Test driver for the two-pass multi-word addition.
 *
 * Fills two operands of `words` 32-bit words with random data (the top word
 * of each is zero so it can receive the final carry), computes a CPU
 * reference with AddParallel(), runs CUDA_ADD1 followed by CUDA_ADD2 on the
 * GPU, prints the elapsed kernel time in seconds and compares both results
 * word by word.
 */
int main()
{
    const int words = 24533;                 /* 32-bit words per operand */
    /* AddParallel() derives its word count as samples / 4, so samples must
     * describe exactly `words` words; a larger value makes the reference
     * run past the end of the host buffers. */
    const int samples = words * 4;
    const int threadsPerBlock = 128;
    /* Ceil-div so that every word is covered whatever `words` is. */
    const int blocks = (words + threadsPerBlock - 1) / threadsPerBlock;
    const size_t bytes = (size_t)words * sizeof(unsigned int);

    unsigned int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL, *carry = NULL;
    unsigned int *x = (unsigned int *)malloc(bytes);
    unsigned int *y = (unsigned int *)malloc(bytes);
    unsigned int *z = (unsigned int *)malloc(bytes);
    unsigned int *cpy_z = (unsigned int *)malloc(bytes);

    if (x == NULL || y == NULL || z == NULL || cpy_z == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)bytes);
        free(x);
        free(y);
        free(z);
        free(cpy_z);
        return EXIT_FAILURE;
    }

    checkCuda(cudaMalloc((void **)&dev_a, bytes), "cudaMalloc dev_a");
    checkCuda(cudaMalloc((void **)&dev_b, bytes), "cudaMalloc dev_b");
    checkCuda(cudaMalloc((void **)&dev_c, bytes), "cudaMalloc dev_c");
    checkCuda(cudaMalloc((void **)&carry, bytes), "cudaMalloc carry");

    for (int index = 0; index < words - 1; index++) {
        /* Shift in unsigned arithmetic: rand() << 16 can overflow a signed int. */
        x[index] = ((unsigned int)rand() << 16) ^ (unsigned int)rand();
        y[index] = ((unsigned int)rand() << 16) ^ (unsigned int)rand();
    }
    /* Top word stays zero; it only ever holds the carry out of the word below. */
    x[words - 1] = 0;
    y[words - 1] = 0;

    checkCuda(cudaMemcpy(dev_a, x, bytes, cudaMemcpyHostToDevice), "copy x to device");
    checkCuda(cudaMemcpy(dev_b, y, bytes, cudaMemcpyHostToDevice), "copy y to device");

    /* CPU reference result. */
    AddParallel(x, words, y, words, z, samples);

    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate start");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate stop");

    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord start");
    CUDA_ADD1<<<blocks, threadsPerBlock>>>(dev_a, dev_b, dev_c, words, carry);
    checkCuda(cudaGetLastError(), "launch CUDA_ADD1");
    /* Same stream, so CUDA_ADD2 starts only after CUDA_ADD1 has finished. */
    CUDA_ADD2<<<blocks, threadsPerBlock>>>(dev_a, dev_b, dev_c, words, carry);
    checkCuda(cudaGetLastError(), "launch CUDA_ADD2");
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord stop");
    /* Also surfaces any fault raised while the kernels were executing. */
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize stop");

    float elapsed;
    checkCuda(cudaEventElapsedTime(&elapsed, start, stop), "cudaEventElapsedTime");
    elapsed = elapsed / 1000;                /* milliseconds -> seconds */
    printf( "Time to generate: %3.5f s\n", elapsed );
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy start");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy stop");

    checkCuda(cudaMemcpy(cpy_z, dev_c, bytes, cudaMemcpyDeviceToHost), "copy result to host");

    /* Count every mismatch and remember where the first one occurred. */
    int nof = 0;
    int fi = -1;
    for (int i = 0; i < words; i++) {
        if (cpy_z[i] != z[i]) {
            if (nof == 0)
                fi = i;
            nof++;
        }
    }
    if (nof != 0)
        printf("fail %d times at %d\n",nof,fi);
    else
        printf("all succeed!\n");

    checkCuda(cudaFree(dev_a), "cudaFree dev_a");
    checkCuda(cudaFree(dev_b), "cudaFree dev_b");
    checkCuda(cudaFree(dev_c), "cudaFree dev_c");
    checkCuda(cudaFree(carry), "cudaFree carry");
    free(x);
    free(y);
    free(z);
    free(cpy_z);
    return 0;
}
/*
 * AddParallel - host reference for the two-pass word-wise addition.
 *
 * Parameters:
 *   x, xLength - first operand and the number of words it holds
 *   y, Length  - second operand and the number of words it holds
 *   z          - output; must have room for the processed word count
 *   samples    - requested size; the word count is samples / 4
 *
 * The number of words processed is samples / 4, clamped to xLength and
 * Length so that neither operand is read past its end.  Nothing is written
 * when that count is zero.
 *
 * Result, for n processed words:
 *   z[0]   = low 32 bits of x[0] + y[0]
 *   z[i]   = low 32 bits of x[i] + y[i], plus the carry-out of word i-1
 *            (for 0 < i < n-1)
 *   z[n-1] = carry-out of word n-2 only (0 when n == 1); x[n-1] and y[n-1]
 *            do not contribute.
 *
 * Carries are taken from the raw word sums only: a wrap-around caused by
 * adding a carry into z[i] is not propagated further.
 */
void AddParallel(unsigned int *x, unsigned int xLength, unsigned int *y, unsigned int Length, unsigned int *z, int samples){
    unsigned int words, i, prevCarry;
    unsigned long long int SumWord;

    /* samples / 4 would be zero (or negative): nothing to add. */
    if (samples < 4)
        return;

    words = (unsigned int)(samples / 4);
    /* Never walk past the end of either operand. */
    if (words > xLength)
        words = xLength;
    if (words > Length)
        words = Length;
    if (words == 0)
        return;

    /* Single pass: the carry of word i is kept in a scalar until word i+1
     * consumes it, so no scratch array has to be allocated or freed. */
    prevCarry = 0;
    for (i = 0; i + 1 < words; i++) {
        SumWord = (unsigned long long int)x[i] + (unsigned long long int)y[i];
        z[i] = (unsigned int)SumWord + prevCarry;
        prevCarry = (unsigned int)(SumWord >> 32);
    }
    /* The top word receives only the carry out of the word below it. */
    z[words - 1] = prevCarry;
}
/*
 * CUDA_ADD1 - first pass of the multi-word addition.
 *
 * For every index i in [0, words):
 *   c[i]     = low 32 bits of a[i] + b[i]
 *   carry[i] = carry-out of that sum (0 or 1)
 *
 * Launch layout: 1-D grid of 1-D blocks.  Each thread starts at its global
 * index and advances by the total number of launched threads (grid-stride
 * loop), so every word is processed for any grid size, including grids with
 * fewer threads than words.  No shared memory is used.
 *
 * a, b, c and carry must each be device-accessible and hold at least
 * `words` elements.
 */
__global__ void CUDA_ADD1(unsigned int *a, unsigned int *b, unsigned int *c, int words, unsigned int *carry){
    /* 64-bit index and stride: i += stride cannot overflow near INT_MAX. */
    const long long stride = (long long)blockDim.x * gridDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x; i < words; i += stride) {
        const unsigned long long int SumWord =
            (unsigned long long int)a[i] + (unsigned long long int)b[i];
        c[i] = (unsigned int)(SumWord & 0xFFFFFFFFull);
        carry[i] = (unsigned int)(SumWord >> 32);
    }
}
/*
 * CUDA_ADD2 - second pass of the multi-word addition.
 *
 * For every index i in [1, words): c[i] += carry[i-1].  Word 0 is left
 * unchanged because it has no lower neighbour.  The operands a and b are not
 * accessed; they are kept so the signature matches CUDA_ADD1.
 *
 * c and carry are expected to hold the per-word sums and carry-outs written
 * by CUDA_ADD1, so this kernel must run after CUDA_ADD1 has completed.
 *
 * Launch layout: 1-D grid of 1-D blocks.  Each thread starts at its global
 * index and advances by the total number of launched threads (grid-stride
 * loop), so any grid size covers all words.  No shared memory is used.
 *
 * NOTE(review): the addition is 32-bit and wraps; when c[i] is 0xFFFFFFFF
 * and carry[i-1] is 1 the resulting overflow is not propagated to c[i+1].
 */
__global__ void CUDA_ADD2(unsigned int *a, unsigned int *b, unsigned int *c, int words, unsigned int *carry){
    /* 64-bit index and stride: i += stride cannot overflow near INT_MAX. */
    const long long stride = (long long)blockDim.x * gridDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x; i < words; i += stride) {
        if (i != 0)
            c[i] = c[i] + carry[i - 1];
    }
}