_
Hi,
I am here for a suggestion on how to improve the timing of execution for the below given code .I had a code to modify for obvious performance improvemance through cuda.I thought it would be better by converting to integer array instead of character array(which was previously used)
which was previously used all over the REST of the code).The code takes 125000 character array which has only characters A,C,G,T,$ and replaces by 1,2,3,4,0 in another integers array of same number of characters.
For this i am taking some part of T(character array)into shared memory and then convert those respective values .I intened that converting to integer array would improve my performance but instead it degraded .I wanna vary the number of characters array size and measure the times of execution of both cuda and non cuda version along with the number the of characters varying.
Please suggest me ways to optimize the below code in terms of times of execution when RUN.Please suggest me the number of threads per block and number of blocks guideline for this problem.I am novice to CUDA .please guide me .
global_ void toint(char *Td,int *Ind)
{
extern shared char sc_data;
int inOffset = blockDim.x * blockIdx.x;
int in = inOffset + threadIdx.x;
sc_data[threadIdx.x] = Td[in];
__syncthreads();
if(sc_data[threadIdx.x]=='A')Ind[in]=1;
if(sc_data[threadIdx.x]=='C')Ind[in]=2;
if(sc_data[threadIdx.x]=='G')Ind[in]=3;
if(sc_data[threadIdx.x]=='T')Ind[in]=4;
if(sc_data[threadIdx.x]=='$')Ind[in]=0;
}
int main()
{
struct timeval start, stop, echodelay;//for time
if((gettimeofday(&start, NULL)) == -1) {perror(“gettimeofday”); exit(1);}//getting start time
char *Td;//character array on device
int *Ind;//integer array on device
char T[125000]=…125000 character;//character array on CPU
int In[125000];//integer array on CPU
int numThreadsPerBlock = 5;//number of threads per block
int numofblocks=25000;
int sharedMemSize = numThreadsPerBlock * sizeof(char);//shared memory size
cudaMalloc( (void **) &Td, 125000sizeof(char) );//allocating memory on device for character array
cudaMalloc( (void **) &Ind, 125000sizeof(int) );//allocating memory on device for integer array
cudaMemcpy(Td,T,125000sizeof(char), cudaMemcpyHostToDevice);
toint<<<numofblocks,numThreadsPerBlock,sharedMemSize>>>(Td,Ind);
cudaMemcpy(In,Ind,125000sizeof(int), cudaMemcpyDeviceToHost);
if((gettimeofday(&stop, NULL)) == -1){perror(“gettimeofday”);exit(1);}//getting end time
timeval_subtract(&echodelay, &stop, &start);//difference of time
printf("\n The time of execution is %d \n ",echodelay.tv_usec);
cudaFree(Td);
cudaFree(Ind);
return 0;
}