[cuda] How to make it faster

[codebox]#include <math.h>

#include <stdio.h>

#include <cutil_inline.h>

#include <cudpp/cudpp.h>

// Value of the first candidate; array index x corresponds to firstNumber + 2*x.
const int firstNumber = 3;

// Number of candidates, and the element count of every host and device array.
const int arraySide = 15;

// Forwarded to the replace kernel, which never reads it.
int count= 0;

/**
 * Flags which members of the sequence firstNumber, firstNumber+2, ... have no
 * smaller divisor inside that same sequence.
 *
 * Thread x handles the candidate firstNumber + 2*x and stores 1 in tab[x]
 * when no earlier sequence member (other than 0 or 1) divides it, and 0
 * otherwise.
 *
 * Launch layout: 1-D grid of 1-D blocks providing at least n threads in
 * total; surplus threads return without touching memory. No shared memory.
 *
 * @param tab          device array of at least n ints; only written
 * @param n            number of candidates to classify
 * @param firstNumber  value of candidate 0
 */
__global__ void firstkernel(int* tab, int n, int firstNumber) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    if (x >= n) {
        return;
    }

    int temp = firstNumber + (x * 2);
    int flag = 1;

    // A trial divisor firstNumber + 2*i is smaller than temp exactly when
    // i < x, so indices from x upwards can never match and are not visited.
    for (int i = 0; i < x; i++) {
        int divisor = firstNumber + (i * 2);

        // 1 divides everything and is ignored; 0 is skipped so the modulo
        // is never evaluated with a zero divisor.
        if (divisor != 1 && divisor != 0 && temp % divisor == 0) {
            flag = 0;
            break;  // a single divisor settles the answer
        }
    }

    // One global store per thread instead of a store per matching divisor.
    tab[x] = flag;
}

/**
 * Scatters the flagged candidates into a dense output array.
 *
 * Thread x looks at flag c[x]; when it equals 1 the candidate value
 * firstNumber + 2*x is stored at e[d[x]]. Elements whose flag is not 1 leave
 * e untouched.
 *
 * Launch layout: 1-D grid of 1-D blocks providing at least n threads in
 * total; surplus threads do nothing. No shared memory.
 *
 * @param e            output array; must be large enough for every index
 *                     that appears in d for a flagged element
 * @param d            destination index in e for each element (the caller in
 *                     this file supplies an exclusive prefix sum of c)
 * @param c            per-element flags; only the value 1 selects an element
 * @param n            number of elements in c and d
 * @param firstNumber  value of candidate 0
 * @param count        not read by the kernel; kept so existing launches
 *                     keep matching the signature
 */
__global__ void replace(int* e, int* d, int* c, int n, int firstNumber, int count)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;

    if (x < n && c[x] == 1) {
        e[d[x]] = firstNumber + (x * 2);
    }
}

/**
 * Runs the filter-and-compact pipeline on the GPU and prints the result.
 *
 *   1. firstkernel writes one flag per candidate (1 = kept, 0 = dropped).
 *   2. A CUDPP forward exclusive add-scan of the flags gives every kept
 *      candidate its slot in the output.
 *   3. replace scatters the kept candidates into a dense array.
 *
 * The printed time covers those three steps plus the copy of the dense
 * array back to the host. Printing stops at the first non-positive entry.
 */
int main(int argc, char** argv) {
    if (cutCheckCmdLineFlag(argc, (const char**)argv, "device"))
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice(cutGetMaxGflopsDeviceId());

    const size_t bytes = sizeof(int) * arraySide;

    // Only the compacted result is ever read on the host, so the flag and
    // scan arrays need no host-side buffer.
    int* e = (int*)malloc(bytes);
    if (e == NULL) {
        printf("Error allocating host memory\n");
        exit(-1);
    }

    int* c_t;
    int* d_t;
    int* e_t;
    cutilSafeCall(cudaMalloc((void**)&c_t, bytes));
    cutilSafeCall(cudaMalloc((void**)&d_t, bytes));
    cutilSafeCall(cudaMalloc((void**)&e_t, bytes));

    // replace only writes the slots of kept candidates; the rest must be
    // zero because the print loop below uses the first non-positive value
    // as its end marker.
    cutilSafeCall(cudaMemset(e_t, 0, bytes));

    int width = arraySide / 512 + (((arraySide % 512) != 0) ? 1 : 0);

    // Plan creation is one-time setup, so it is done before the timer
    // starts rather than being measured together with the scan itself.
    CUDPPConfiguration config;
    config.op = CUDPP_ADD;
    config.datatype = CUDPP_INT;
    config.algorithm = CUDPP_SCAN;
    config.options = CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE;

    CUDPPHandle scanplan = 0;
    CUDPPResult resPlan = cudppPlan(&scanplan, config, arraySide, 1, 0);
    if (CUDPP_SUCCESS != resPlan) {
        printf("Error creating CUDPPPlan\n");
        exit(-1);
    }

    unsigned int timer = 0;
    cutilCheckError(cutCreateTimer(&timer));
    cutilCheckError(cutStartTimer(timer));

    firstkernel<<<width, 512, 0>>>(c_t, arraySide, firstNumber);
    cutilSafeCall(cudaGetLastError());  // surfaces a bad launch configuration

    // The flags stay on the device: the scan and replace both read c_t
    // directly, so no device-to-host copy is needed in between.
    CUDPPResult resScan = cudppScan(scanplan, d_t, c_t, arraySide);
    if (CUDPP_SUCCESS != resScan) {
        printf("Error running cudppScan\n");
        exit(-1);
    }

    replace<<<width, 512, 0>>>(e_t, d_t, c_t, arraySide, firstNumber, count);
    cutilSafeCall(cudaGetLastError());

    cutilSafeCall(cudaMemcpy(e, e_t, bytes, cudaMemcpyDeviceToHost));

    cutilSafeCall(cudaThreadSynchronize());

    cutilCheckError(cutStopTimer(timer));

    printf("\nSpeed: %f (ms) \n\n", cutGetTimerValue(timer));

    for (int i = 0; (i < arraySide && e[i] > 0); i++) {
        printf(" %d \n", e[i]);
    }

    printf("\n");

    // Release everything acquired above.
    cudppDestroyPlan(scanplan);
    cutilCheckError(cutDeleteTimer(timer));
    cutilSafeCall(cudaFree(c_t));
    cutilSafeCall(cudaFree(d_t));
    cutilSafeCall(cudaFree(e_t));
    free(e);

    cudaThreadExit();

    return 0;
}

[/codebox]