Retrieve array columns

I’m trying to retrieve each column from a 2D array

into a shared memory 1D array.

The program works correctly, but after analysis it seems

that it takes too long to get the work done.

I would appreciate any help solving this problem, because

it slows down the overall execution of my application.

Here is what I did:

/**
 * Walks the columns of a row-major `row` x `col` matrix and, for each column j,
 * fills the per-block shared buffer tmpCol from that column.
 *
 * Thread t handles rows t, t + blockDim.x, t + 2*blockDim.x, ... of column j
 * and stores their sum in tmpCol[t]. When row <= blockDim.x each thread touches
 * at most one row, so tmpCol[t] is simply element (t, j).
 *
 * Preconditions:
 *   - every thread of the block must call this function (it contains
 *     __syncthreads());
 *   - blockDim.x must not exceed BLOCK_SIZE, otherwise tmpCol[tid] is written
 *     out of bounds.
 *
 * _iarray  matrix data, read as _iarray[r * col + c]
 * row      number of rows
 * col      number of columns
 *
 * NOTE(review): threads with consecutive tid read addresses `col` floats apart,
 * so the global loads of a warp are not contiguous. If this routine is the
 * bottleneck, storing the matrix transposed (or loading row tiles and
 * transposing them through shared memory) is the likely remedy.
 */
__device__ void getCols(float *_iarray, int row, int col)
{
	__shared__ float tmpCol[BLOCK_SIZE];

	const int tid = threadIdx.x;
	const int nThreads = blockDim.x;

	for (int j = 0; j < col; j++)
	{
		// Accumulate in a register instead of doing a shared-memory
		// read-modify-write per element; the summation order is unchanged.
		float acc = 0.0f;

		// Rows tid, tid + nThreads, ... of column j. The r < row bound replaces
		// the original flat-index check (tid * col + j + k < row * col).
		for (int r = tid; r < row; r += nThreads)
		{
			// size_t keeps the flat index from overflowing int on large matrices.
			acc += _iarray[(size_t)r * col + j];
		}

		tmpCol[tid] = acc;

		// Publish every thread's slot before any thread reads tmpCol.
		__syncthreads();

		//do something with tmpCol;

		// Keep fast threads from overwriting tmpCol for column j + 1 while
		// slower threads are still reading column j.
		__syncthreads();
	}
}

I’m trying to retrieve each column from a 2D array

into a shared memory 1D array.

The program works correctly, but after analysis it seems

that it takes too long to get the work done.

I would appreciate any help solving this problem, because

it slows down the overall execution of my application.

Here is what I did:

/**
 * Walks the columns of a row-major `row` x `col` matrix and, for each column j,
 * fills the per-block shared buffer tmpCol from that column.
 *
 * Thread t handles rows t, t + blockDim.x, t + 2*blockDim.x, ... of column j
 * and stores their sum in tmpCol[t]. When row <= blockDim.x each thread touches
 * at most one row, so tmpCol[t] is simply element (t, j).
 *
 * Preconditions:
 *   - every thread of the block must call this function (it contains
 *     __syncthreads());
 *   - blockDim.x must not exceed BLOCK_SIZE, otherwise tmpCol[tid] is written
 *     out of bounds.
 *
 * _iarray  matrix data, read as _iarray[r * col + c]
 * row      number of rows
 * col      number of columns
 *
 * NOTE(review): threads with consecutive tid read addresses `col` floats apart,
 * so the global loads of a warp are not contiguous. If this routine is the
 * bottleneck, storing the matrix transposed (or loading row tiles and
 * transposing them through shared memory) is the likely remedy.
 */
__device__ void getCols(float *_iarray, int row, int col)
{
	__shared__ float tmpCol[BLOCK_SIZE];

	const int tid = threadIdx.x;
	const int nThreads = blockDim.x;

	for (int j = 0; j < col; j++)
	{
		// Accumulate in a register instead of doing a shared-memory
		// read-modify-write per element; the summation order is unchanged.
		float acc = 0.0f;

		// Rows tid, tid + nThreads, ... of column j. The r < row bound replaces
		// the original flat-index check (tid * col + j + k < row * col).
		for (int r = tid; r < row; r += nThreads)
		{
			// size_t keeps the flat index from overflowing int on large matrices.
			acc += _iarray[(size_t)r * col + j];
		}

		tmpCol[tid] = acc;

		// Publish every thread's slot before any thread reads tmpCol.
		__syncthreads();

		//do something with tmpCol;

		// Keep fast threads from overwriting tmpCol for column j + 1 while
		// slower threads are still reading column j.
		__syncthreads();
	}
}

tmpCol[tid] += _iarray[tmpV];

Is that supposed to be += and not just an assignment?

+= is much slower than =

tmpCol[tid] += _iarray[tmpV];

Is that supposed to be += and not just an assignment?

+= is much slower than =

Is this what you’re trying to do?

/**
 * For each column x of a row-major `rows` x `cols` matrix, copies that column
 * into the block's shared buffer so that tmpCol[y] holds element (y, x).
 *
 * The buffer is dynamic shared memory: its size cannot come from the runtime
 * argument `rows` in a static __shared__ declaration, so the kernel launch
 * must supply at least rows * sizeof(float) bytes as the third <<<...>>>
 * argument.
 *
 * Every thread of the block must call this function (it contains
 * __syncthreads()).
 *
 * _iarray  matrix data, read as _iarray[x + y * cols]
 * rows     number of rows (length of one column)
 * cols     number of columns
 */
__device__ void getCols(float *_iarray, int rows, int cols) {
	// Sized at launch time; see the shared-memory requirement above.
	extern __shared__ float tmpCol[];

	const int tid = threadIdx.x;
	const int stride = blockDim.x;

	for (int x = 0; x < cols; x++) {
		// Get the x-th column: the block's threads cooperatively cover rows
		// tid, tid + stride, tid + 2*stride, ...
		for (int y = tid; y < rows; y += stride)
			tmpCol[y] = _iarray[x + (size_t)y * cols];

		// Make the whole column visible to every thread before it is used.
		__syncthreads();

		//do something with tmpCol;

		// Do not let any thread start overwriting tmpCol with column x + 1
		// while other threads are still reading column x.
		__syncthreads();
	}
}

Is this what you’re trying to do?

/**
 * For each column x of a row-major `rows` x `cols` matrix, copies that column
 * into the block's shared buffer so that tmpCol[y] holds element (y, x).
 *
 * The buffer is dynamic shared memory: its size cannot come from the runtime
 * argument `rows` in a static __shared__ declaration, so the kernel launch
 * must supply at least rows * sizeof(float) bytes as the third <<<...>>>
 * argument.
 *
 * Every thread of the block must call this function (it contains
 * __syncthreads()).
 *
 * _iarray  matrix data, read as _iarray[x + y * cols]
 * rows     number of rows (length of one column)
 * cols     number of columns
 */
__device__ void getCols(float *_iarray, int rows, int cols) {
	// Sized at launch time; see the shared-memory requirement above.
	extern __shared__ float tmpCol[];

	const int tid = threadIdx.x;
	const int stride = blockDim.x;

	for (int x = 0; x < cols; x++) {
		// Get the x-th column: the block's threads cooperatively cover rows
		// tid, tid + stride, tid + 2*stride, ...
		for (int y = tid; y < rows; y += stride)
			tmpCol[y] = _iarray[x + (size_t)y * cols];

		// Make the whole column visible to every thread before it is used.
		__syncthreads();

		//do something with tmpCol;

		// Do not let any thread start overwriting tmpCol with column x + 1
		// while other threads are still reading column x.
		__syncthreads();
	}
}