copy from 1D array to shared memory matrix in cuda

how can I modify this code so that it copies the elements of an array to a shared memory matrix efficiently?

#include "cuda.h"
#include "cuda_runtime.h"

#include "device_launch_parameters.h"
#include <stdio.h>
#include <math.h>

#define N 64
#define shared_size 32 
#define BLOCKS (N/shared_size)
#define THREADS_PER_BLOCK shared_size
// Define this to turn on error checking
#define CUDA_ERROR_CHECK
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError()    __cudaCheckError( __FILE__, __LINE__ )

//****************************************************************************************
// functions for cuda error checking
inline void __cudaSafeCall(cudaError err, const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
	if (cudaSuccess != err)
	{
		fprintf(stderr, "cudaSafeCall() failed at %s:%i : %s\n",
			file, line, cudaGetErrorString(err));
		exit(-1);
	}
#endif

	return;
}
inline void __cudaCheckError(const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
	cudaError err = cudaGetLastError();
	if (cudaSuccess != err)
	{
		fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n",
			file, line, cudaGetErrorString(err));
		exit(-1);
	}

	// More careful checking. However, this will affect performance.
	// Comment away if needed.
	err = cudaDeviceSynchronize();
	if (cudaSuccess != err)
	{
		fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
			file, line, cudaGetErrorString(err));
		exit(-1);
	}
#endif

	return;
}
//****************************************************************************************

__global__ void calculate_ratios(float *a)
{
	//int i = threadIdx.x+blockDim.x*blockIdx.x;
	int tx = threadIdx.x;
	int ty = threadIdx.y;

	int txx = threadIdx.x + blockDim.x*blockIdx.x;
	int tyy = threadIdx.y + blockDim.y*blockIdx.y;

	__shared__ float temp[shared_size][shared_size];

	//  copy from global memory to shared memory
	temp[ty][tx] = a[tyy*(shared_size)+tx];
	__syncthreads();
	temp[ty][tx] = temp[ty][tx] * 2;
	__syncthreads();
	a[tyy*(shared_size)+tx] = temp[ty][tx];
	

	
}

int main()
{

	float *a_h;
	a_h = (float *)malloc(N*N*sizeof(float)); //allocate memory on host

	for (int i = 0; i < N*N; i++){

			int num = rand() % 10;
			 a_h[i]=1;

		
	
	}

	
	float *a_d;
	float *b_d;
	cudaMalloc((void **)&a_d, N*N*sizeof(float));
	cudaMalloc((void **)&b_d, N*N*sizeof(float));

	 cudaMemcpy(a_d, a_h, N*N*sizeof(float), cudaMemcpyHostToDevice);
	 dim3 dimBlock(THREADS_PER_BLOCK, THREADS_PER_BLOCK, 1);
	 dim3 dimGrid(BLOCKS, BLOCKS, 1);
	 calculate_ratios << <dimGrid, dimBlock >> >(a_d);
	cudaDeviceSynchronize();
	cudaMemcpy(a_h, a_d, N*N*sizeof(float), cudaMemcpyDeviceToHost);

	for (int i = 0; i < N; i++){

		for (int j = 0; j < N; j++){
			printf("%.1f ", a_h[i *N + j]);

		}
		printf("\n");
	}
	// cuda error checking
	cudaError_t error = cudaGetLastError();
	if (error != cudaSuccess)
	{
		printf("CUDA Error: %s\n", cudaGetErrorString(error));

		// we can't recover from the error -- exit the program
		return 1;
	}
	
	cudaFree(a_d);
	cudaFree(b_d);
}

why would you think that it is currently not efficient?
it seems aligned, without bank conflicts - it does not get better than that

seemingly, you have a one-to-one mapping between the addresses threads read from, and write to - threads work in on the array independently of other threads
thus, you could drop the __syncthreads(), as there is really nothing to synchronize on

the results are not correct

Your global memory indexing is not correct. The compiler warning that txx was not used is a clue.

Change your kernel lines to look like this:

//  copy from global memory to shared memory
        temp[ty][tx] = a[tyy*(gridDim.x*shared_size)+txx];
        __syncthreads();
        temp[ty][tx] = temp[ty][tx] * 2;
        __syncthreads();
        a[tyy*(gridDim.x*shared_size)+txx] = temp[ty][tx];

thanks all

I guess I should point out that the __syncthreads() are not necessary for this particular code sequence. Threads do not access any location in temp other than “their own”. In fact, there is actually little point in using shared memory here, except that you seem to be using it to learn and explore, which is fine. As little_jimmy has already pointed out, your basic shared access mechanisms appear to be un-bank-conflicted and “efficient”.

thank txbob
I modified the code, does it seems efficient?

__global__ void calculate_ratios(float *a,float *b)
{
	
	int tx = threadIdx.x;
	int ty = threadIdx.y;

	int txx = threadIdx.x + blockDim.x*blockIdx.x;
	int tyy = threadIdx.y + blockDim.y*blockIdx.y;

	__shared__ float temp[shared_size][shared_size];
	__shared__ float digonal_elem[shared_size*BLOCKS];
    __shared__ float transposed[shared_size][shared_size];
	 
	//  copy from global memory to shared memory
	temp[ty][tx] = a[(blockIdx.y*BLOCKS*shared_size*shared_size) + (ty*BLOCKS*shared_size) + (tx + blockIdx.x*shared_size)];
	


	
		digonal_elem[txx] = a[txx*(shared_size*BLOCKS + 1)];
	
	
	
	
	__syncthreads();
	
	//do the divsion in the shared memory
	if ( (blockIdx.x== blockIdx.y)){
		temp[tx + ty][ty] = temp[tx + ty][ty] / digonal_elem[ty + blockIdx.x*shared_size];
	}
	else if (blockIdx.x < blockIdx.y) {

		temp[ty][tx] = temp[ty][tx] / digonal_elem[tx];
	}

	__syncthreads();

	


	//transpose the matrix
		transposed[tx][ty]=temp[ty][tx];
	 __syncthreads();


	// copy the result of division to gloabal memory
	a[(blockIdx.y*BLOCKS*shared_size*shared_size) + (ty*BLOCKS*shared_size) + (tx + blockIdx.x*shared_size)] = temp[ty][tx];
	b[(blockIdx.y*BLOCKS*shared_size*shared_size) + (ty*BLOCKS*shared_size) + (tx + blockIdx.x*shared_size)] = transposed[ty][tx];

	//printf("(%d %d)\n", txx,tyy);
}

does the “__syncthreads();” stills unnecessary ?

These lines seem to me to have various issues:

temp[ty][tx] = a[(blockIdx.y*BLOCKS*shared_size*shared_size) + (ty*BLOCKS*shared_size) + (tx + blockIdx.x*shared_size)];
	


	
		digonal_elem[txx] = a[txx*(shared_size*BLOCKS + 1)];

First, this will not coalesce nicely on a global load, I don’t think:

= a[(blockIdx.y*BLOCKS*shared_size*shared_size) + (ty*BLOCKS*shared_size) + (tx + blockIdx.x*shared_size)];

nor will this:

= a[txx*(shared_size*BLOCKS + 1)];

In addition, this is likely not a correct way to index into shared memory:

digonal_elem[txx] =

Your txx variable is a globally unique index. For indexing into shared memory, you would want to construct some kind of “local” i.e. intra-threadblock index, such as using tx, for example.

There may be other issues with the code as well. Those were just a couple items I noticed.