I’m trying to retrieve each column of a 2D array
into a 1D shared-memory array.
The program works correctly, but profiling shows
that it takes too long to get the work done.
I would appreciate any help solving this problem, because
it slows down the overall execution of my application.
Here is what I did:
/**
 * For every column j of a row-major `row` x `col` matrix, each thread of the
 * block accumulates a partial sum of that column and publishes it in the
 * block-shared array tmpCol.
 *
 * Thread `tid` sums the elements at indices tid*col + j + k, where k walks in
 * steps of blockDim.x*col while the index stays below row*col, i.e. the
 * entries of column j in rows tid, tid + blockDim.x, tid + 2*blockDim.x, ...
 *
 * Preconditions:
 *  - every thread of the block must call this function (it contains
 *    __syncthreads());
 *  - blockDim.x must not exceed BLOCK_SIZE, otherwise tmpCol[tid] is written
 *    out of bounds;
 *  - all index arithmetic is done in int, so row*col (plus one stride) must
 *    fit in an int.
 *
 * Performance note: consecutive threads read addresses that are `col` floats
 * apart (tid*col + ...), so the loads of neighbouring threads are not
 * contiguous in global memory. Making neighbouring threads read neighbouring
 * columns would change what each tmpCol slot holds, so it is left to the
 * caller's redesign rather than done here.
 *
 * @param _iarray  device pointer to the row*col input matrix (row-major)
 * @param row      number of rows
 * @param col      number of columns
 */
__device__ void getCols(float *_iarray, int row, int col)
{
    __shared__ float tmpCol[BLOCK_SIZE];

    const int tid    = threadIdx.x;
    const int stride = blockDim.x * col;
    const int size   = row * col;

    for (int j = 0; j < col; j++)
    {
        // Accumulate in a register: same additions in the same order as
        // before, but without a shared-memory read-modify-write per element.
        float acc = 0.0f;
        for (int idx = tid * col + j; idx < size; idx += stride)
        {
            acc += _iarray[idx];
        }

        // Publish the partial sum, then wait until every thread has done so
        // before any thread consumes tmpCol.
        tmpCol[tid] = acc;
        __syncthreads();

        //do something with tmpCol;

        // Make sure all threads are finished with this column's values
        // before the next iteration overwrites them.
        __syncthreads();
    }
}
I’m trying to retrieve each column of a 2D array
into a 1D shared-memory array.
The program works correctly, but profiling shows
that it takes too long to get the work done.
I would appreciate any help solving this problem, because
it slows down the overall execution of my application.
Here is what I did:
/**
 * For every column j of a row-major `row` x `col` matrix, each thread of the
 * block accumulates a partial sum of that column and publishes it in the
 * block-shared array tmpCol.
 *
 * Thread `tid` sums the elements at indices tid*col + j + k, where k walks in
 * steps of blockDim.x*col while the index stays below row*col, i.e. the
 * entries of column j in rows tid, tid + blockDim.x, tid + 2*blockDim.x, ...
 *
 * Preconditions:
 *  - every thread of the block must call this function (it contains
 *    __syncthreads());
 *  - blockDim.x must not exceed BLOCK_SIZE, otherwise tmpCol[tid] is written
 *    out of bounds;
 *  - all index arithmetic is done in int, so row*col (plus one stride) must
 *    fit in an int.
 *
 * Performance note: consecutive threads read addresses that are `col` floats
 * apart (tid*col + ...), so the loads of neighbouring threads are not
 * contiguous in global memory. Making neighbouring threads read neighbouring
 * columns would change what each tmpCol slot holds, so it is left to the
 * caller's redesign rather than done here.
 *
 * @param _iarray  device pointer to the row*col input matrix (row-major)
 * @param row      number of rows
 * @param col      number of columns
 */
__device__ void getCols(float *_iarray, int row, int col)
{
    __shared__ float tmpCol[BLOCK_SIZE];

    const int tid    = threadIdx.x;
    const int stride = blockDim.x * col;
    const int size   = row * col;

    for (int j = 0; j < col; j++)
    {
        // Accumulate in a register: same additions in the same order as
        // before, but without a shared-memory read-modify-write per element.
        float acc = 0.0f;
        for (int idx = tid * col + j; idx < size; idx += stride)
        {
            acc += _iarray[idx];
        }

        // Publish the partial sum, then wait until every thread has done so
        // before any thread consumes tmpCol.
        tmpCol[tid] = acc;
        __syncthreads();

        //do something with tmpCol;

        // Make sure all threads are finished with this column's values
        // before the next iteration overwrites them.
        __syncthreads();
    }
}