how can I modify this code so that it copies the elements of an array to a shared memory matrix efficiently?
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <math.h>
#define N 64
#define shared_size 32
#define BLOCKS (N/shared_size)
#define THREADS_PER_BLOCK shared_size
// Define this to turn on error checking
#define CUDA_ERROR_CHECK
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
//****************************************************************************************
// functions for cuda error checking
inline void __cudaSafeCall(cudaError err, const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
if (cudaSuccess != err)
{
fprintf(stderr, "cudaSafeCall() failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
#endif
return;
}
inline void __cudaCheckError(const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
cudaError err = cudaGetLastError();
if (cudaSuccess != err)
{
fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
// More careful checking. However, this will affect performance.
// Comment away if needed.
err = cudaDeviceSynchronize();
if (cudaSuccess != err)
{
fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
#endif
return;
}
//****************************************************************************************
__global__ void calculate_ratios(float *a)
{
//int i = threadIdx.x+blockDim.x*blockIdx.x;
int tx = threadIdx.x;
int ty = threadIdx.y;
int txx = threadIdx.x + blockDim.x*blockIdx.x;
int tyy = threadIdx.y + blockDim.y*blockIdx.y;
__shared__ float temp[shared_size][shared_size];
// copy from global memory to shared memory
temp[ty][tx] = a[tyy*(shared_size)+tx];
__syncthreads();
temp[ty][tx] = temp[ty][tx] * 2;
__syncthreads();
a[tyy*(shared_size)+tx] = temp[ty][tx];
}
int main()
{
float *a_h;
a_h = (float *)malloc(N*N*sizeof(float)); //allocate memory on host
for (int i = 0; i < N*N; i++){
int num = rand() % 10;
a_h[i]=1;
}
float *a_d;
float *b_d;
cudaMalloc((void **)&a_d, N*N*sizeof(float));
cudaMalloc((void **)&b_d, N*N*sizeof(float));
cudaMemcpy(a_d, a_h, N*N*sizeof(float), cudaMemcpyHostToDevice);
dim3 dimBlock(THREADS_PER_BLOCK, THREADS_PER_BLOCK, 1);
dim3 dimGrid(BLOCKS, BLOCKS, 1);
calculate_ratios << <dimGrid, dimBlock >> >(a_d);
cudaDeviceSynchronize();
cudaMemcpy(a_h, a_d, N*N*sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
printf("%.1f ", a_h[i *N + j]);
}
printf("\n");
}
// cuda error checking
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess)
{
printf("CUDA Error: %s\n", cudaGetErrorString(error));
// we can't recover from the error -- exit the program
return 1;
}
cudaFree(a_d);
cudaFree(b_d);
}
why would you think that it is currently not efficient?
it seems aligned, without bank conflicts - it does not get better than that
seemingly, you have a one-to-one mapping between the addresses threads read from, and write to - threads work in on the array independently of other threads
thus, you could drop the __syncthreads(), as there is really nothing to synchronize on
the results are not correct
Your global memory indexing is not correct. The compiler warning that txx was not used is a clue.
Change your kernel lines to look like this:
// copy from global memory to shared memory
temp[ty][tx] = a[tyy*(gridDim.x*shared_size)+txx];
__syncthreads();
temp[ty][tx] = temp[ty][tx] * 2;
__syncthreads();
a[tyy*(gridDim.x*shared_size)+txx] = temp[ty][tx];
I guess I should point out that the __syncthreads() are not necessary for this particular code sequence. Threads do not access any location in temp other than “their own”. In fact, there is actually little point in using shared memory here, except that you seem to be using it to learn and explore, which is fine. As little_jimmy has already pointed out, your basic shared access mechanisms appear to be un-bank-conflicted and “efficient”.
thank txbob
I modified the code, does it seems efficient?
__global__ void calculate_ratios(float *a,float *b)
{
int tx = threadIdx.x;
int ty = threadIdx.y;
int txx = threadIdx.x + blockDim.x*blockIdx.x;
int tyy = threadIdx.y + blockDim.y*blockIdx.y;
__shared__ float temp[shared_size][shared_size];
__shared__ float digonal_elem[shared_size*BLOCKS];
__shared__ float transposed[shared_size][shared_size];
// copy from global memory to shared memory
temp[ty][tx] = a[(blockIdx.y*BLOCKS*shared_size*shared_size) + (ty*BLOCKS*shared_size) + (tx + blockIdx.x*shared_size)];
digonal_elem[txx] = a[txx*(shared_size*BLOCKS + 1)];
__syncthreads();
//do the divsion in the shared memory
if ( (blockIdx.x== blockIdx.y)){
temp[tx + ty][ty] = temp[tx + ty][ty] / digonal_elem[ty + blockIdx.x*shared_size];
}
else if (blockIdx.x < blockIdx.y) {
temp[ty][tx] = temp[ty][tx] / digonal_elem[tx];
}
__syncthreads();
//transpose the matrix
transposed[tx][ty]=temp[ty][tx];
__syncthreads();
// copy the result of division to gloabal memory
a[(blockIdx.y*BLOCKS*shared_size*shared_size) + (ty*BLOCKS*shared_size) + (tx + blockIdx.x*shared_size)] = temp[ty][tx];
b[(blockIdx.y*BLOCKS*shared_size*shared_size) + (ty*BLOCKS*shared_size) + (tx + blockIdx.x*shared_size)] = transposed[ty][tx];
//printf("(%d %d)\n", txx,tyy);
}
does the “__syncthreads();” stills unnecessary ?
These lines seem to me to have various issues:
temp[ty][tx] = a[(blockIdx.y*BLOCKS*shared_size*shared_size) + (ty*BLOCKS*shared_size) + (tx + blockIdx.x*shared_size)];
digonal_elem[txx] = a[txx*(shared_size*BLOCKS + 1)];
First, this will not coalesce nicely on a global load, I don’t think:
= a[(blockIdx.y*BLOCKS*shared_size*shared_size) + (ty*BLOCKS*shared_size) + (tx + blockIdx.x*shared_size)];
nor will this:
= a[txx*(shared_size*BLOCKS + 1)];
In addition, this is likely not a correct way to index into shared memory:
digonal_elem[txx] =
Your txx variable is a globally unique index. For indexing into shared memory, you would want to construct some kind of “local” i.e. intra-threadblock index, such as using tx, for example.
There may be other issues with the code as well. Those were just a couple items I noticed.