Is there an error in the CUDA manual matrix multiplication example?

I have been playing around with CUDA, and tried to use the matrix multiplication example from the CUDA manual from within R (the statistical language). Everything appears to work “almost” fine, except that the answers are wrong. If I multiply matrices A and B, I get what is in fact B * A for square matrices; if I multiply non-square matrices I get complete nonsense answers. Since what I did differs so little from the CUDA 2.1 manual, I’m getting the impression that there might be something wrong in there…

Thanks in advance, I would really appreciate a little help.

The code is included below:

#include <stdio.h>
#include <cutil.h>
#include <math.h>
#include <stdlib.h>

#define BLOCK_SIZE 16

// kernel module

__global__ void Muld(float *A, float *B, int wA, int wB, float *C)
{

// Block index
int bx = blockIdx.x;
int by = blockIdx.y;

// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;

// Index of the first sub-matrix of A processed by the block
int aBegin = wA * BLOCK_SIZE * by;

// Index of the last sub-matrix of A processed by the block
int aEnd = aBegin + wA - 1;

// Step size used to iterate through the sub-matrices of A
int aStep = BLOCK_SIZE;

// Index of the first sub-matrix of B processed by the block
int bBegin = BLOCK_SIZE * bx;

// Step size used to iterate through the sub-matrices of B
int bStep = BLOCK_SIZE * wB;

// The element of the block sub-matrix that is computed by the thread
float Csub = 0;

// Loop over all the sub-matrices of A and B required to compute the block sub-matrix
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
{

// Shared memory for the sub-matrices of A and B
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

// Load the matrices from global memory to shared memory, each thread loads one element of each matrix
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];

// Synchronize to make sure the matrices are loaded
__syncthreads();

// Multiply the two matrices together, each thread computes one element of the block sub-matrix
for (int k = 0; k < BLOCK_SIZE; ++k) Csub += As[ty][k] * Bs[k][tx];

// Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
__syncthreads();

}

// Write the block sub-matrix to global memory, each thread writes one element
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
}

// main routine

int main(double *AA, double *BB, long *hAA, long *wAA, long *wBB, double *CC)
{

int hA = *hAA;
int wA = *wAA;
int wB = *wBB;

// pointer to host & device array
float *Ah, *Bh, *Ch, *Ad, *Bd, *Cd;

// allocate host and device memory for matrices A and B
int size_A = hA * wA;
int mem_size_A = size_A * sizeof(float);
Ah = (float *)malloc(mem_size_A);

int size_B = wA * wB;
int mem_size_B = size_B * sizeof(float);
Bh = (float *)malloc(mem_size_B);

// initialize host memory
for (int i=0; i<size_A; i++) Ah[i] = (float)AA[i];
for (int i=0; i<size_B; i++) Bh[i] = (float)BB[i];

// allocate device memory
cudaMalloc((void**)&Ad, mem_size_A);
cudaMalloc((void**)&Bd, mem_size_B);

// copy host memory to device
cudaMemcpy(Ad, Ah, mem_size_A, cudaMemcpyHostToDevice);
cudaMemcpy(Bd, Bh, mem_size_B, cudaMemcpyHostToDevice);

// Allocate device memory for C
int size_C = hA * wB;
int mem_size_C = size_C * sizeof(float);
cudaMalloc((void**)&Cd, mem_size_C);

// Allocate host memory for C
Ch = (float *)malloc(mem_size_C);

// Compute the execution configuration assuming the matrix dimensions are multiples of BLOCK_SIZE
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(hA / dimBlock.x, wB / dimBlock.y);

// Launch the device computation
Muld<<<dimGrid, dimBlock>>>(Ad, Bd, wA, wB, Cd);

// Read C from the device
cudaMemcpy(Ch, Cd, mem_size_C, cudaMemcpyDeviceToHost);
for (int i=0; i<size_C; i++) CC[i] = (double)Ch[i];

// Clean up the memory
free(Ah);
free(Bh);
free(Ch);
cudaFree(Ad);
cudaFree(Bd);
cudaFree(Cd);

cudaThreadExit();

}

I’m not sure this matters, but when I read the manual I noticed that the matrix dimensions are given the other way around from the usual mathematical convention: instead of (number of rows, number of columns), a matrix is described by (number of columns, number of rows).

giovanni

I looked at it a bit more: there is indeed an error in the CUDA manual.

Instead of:
// Multiply the two matrices together, each thread computes one element of the block sub-matrix
for (int k = 0; k < BLOCK_SIZE; ++k) Csub += As[ty][k] * Bs[k][tx];

It should have been:
// Multiply the two matrices together, each thread computes one element of the block sub-matrix
for (int k = 0; k < BLOCK_SIZE; ++k) Csub += As[k][tx] * Bs[ty][k];

Splitting everything up into blocks makes the code complex enough that it becomes difficult to get a grip on what’s going on; that’s why it took some time to figure out.

It is not very careful of the NVIDIA people, though, that they didn’t check whether the largest example in the manual actually worked…

Is this correct? I downloaded the latest example; it uses #defines instead of plain array indexing. I’ve deleted the comments and ended up with the code below:

#define BLOCK_SIZE 1

#define CHECK_BANK_CONFLICTS 0
#if CHECK_BANK_CONFLICTS
#define AS(i, j) cutilBankChecker(((float*)&As[0][0]), (BLOCK_SIZE * i + j))
#define BS(i, j) cutilBankChecker(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j))
#else
#define AS(i, j) As[i][j]
#define BS(i, j) Bs[i][j]
#endif

__global__ void __prod(float* A, float* B, float* C, long wA, long hA, long wB)
{
	long bx = blockIdx.x;
	long by = blockIdx.y;
	long tx = threadIdx.x;
	long ty = threadIdx.y;

	long aBegin = wA * BLOCK_SIZE * by;
	long aEnd   = aBegin + wA - 1;
	long aStep  = BLOCK_SIZE;
	long bBegin = BLOCK_SIZE * bx;
	long bStep  = BLOCK_SIZE * wB;

	float Csub = 0;

	for (long a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
	{
		__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
		__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

		AS(ty, tx) = A[a + wA * ty + tx];
		BS(ty, tx) = B[b + wB * ty + tx];

		__syncthreads();

		//for (long k = 0; k < BLOCK_SIZE; ++k)
		//	Csub += AS(ty, k) * BS(k, tx);

		for (long k = 0; k < BLOCK_SIZE; ++k)
			Csub += AS(k, tx) * BS(ty, k);

		__syncthreads();
	}

	long c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
	C[c + wB * ty + tx] = Csub;
}

void prod(float *d_A, float *d_B, float *d_C, long wA, long hA, long wB)
{
	long WC = wB;  // Matrix C width
	long HC = hA;  // Matrix C height

	dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
	dim3 grid(WC / threads.x, HC / threads.y);

	__prod<<< grid, threads >>>(d_A, d_B, d_C, wA, hA, wB);
}

I need to multiply an image matrix in the format [3 channels X ImageHeight*ImageWidth] by a small transformation matrix to convert RGB to YIQ. That’s why I set BLOCK_SIZE to 1, to avoid dimension/BLOCK_SIZE = 0. But this matrix multiply is not working: no errors, but no result either. Also, I don’t know how to set a constant matrix in CUDA, but I tried to multiply my image by a zero matrix (I scalar-multiplied the small matrix by 0), and that also had no effect. This is my calling code:

float *RGBYIQ;//={{0.299 , 0.587 ,0.114},{ 0.596, -0.274, -0.322},{ 0.212, -0.523, 0.311}};
cudaMalloc((void**) &RGBYIQ, 3*3*sizeof(float));
//*(RGBYIQ + 0) = 0.299; *(RGBYIQ + 1) = 0.587; *(RGBYIQ + 2) = 0.114;
//*(RGBYIQ + 3) = 0.596; *(RGBYIQ + 4) = -0.274; *(RGBYIQ + 5) = -0.322;
//*(RGBYIQ + 6) = 0.212; *(RGBYIQ + 7) = -0.523; *(RGBYIQ + 8) = 0.311;
prod(RGBYIQ, img_in, img_out, 3, 3, imageHeight*ImageWidth);
cudaFree(RGBYIQ);

Thanks for any advice.

Could anyone help, or does anyone have a link to another implementation of matrix multiplication?

The kernel “Muld” you posted is correct; I have verified it. However, this kernel assumes that the matrices A, B, C are row-major and that the dimensions are multiples of BLOCK_SIZE.
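To make the row-major assumption concrete, here is a minimal host-side reference multiply using the same indexing (just a sketch to check the kernel’s output against; the function name is only illustrative):

// C = A * B with row-major storage: element (row, col) of a width-w matrix
// sits at index row * w + col -- the same indexing Muld uses on the GPU.
void matmul_ref(const float *A, const float *B, float *C, int hA, int wA, int wB)
{
	for (int i = 0; i < hA; ++i)
		for (int j = 0; j < wB; ++j) {
			float sum = 0.0f;
			for (int k = 0; k < wA; ++k)
				sum += A[i * wA + k] * B[k * wB + j];
			C[i * wB + j] = sum;
		}
}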

1. You made a mistake in the execution configuration:

dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(hA / dimBlock.x, wB / dimBlock.y);

// Launch the device computation
Muld<<<dimGrid, dimBlock>>>(Ad, Bd, wA, wB, Cd);

You need to modify it to:

dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(wB / dimBlock.x, hA / dimBlock.y);

// Launch the device computation
Muld<<<dimGrid, dimBlock>>>(Ad, Bd, wA, wB, Cd);

This wrong setting is why “if I multiply non-square matrices I get complete nonsense answers”.

2. Please check your parameters

double *AA, double *BB, double *CC

I guess that they are column-major, which is why you obtain B * A rather than A * B: if A is row-major, then for a square matrix, reading the same data as column-major gives transpose(A).

Suppose AA, BB, CC are column-major. Then, according to

for (int i=0; i<size_A; i++) Ah[i] = (float)AA[i];
for (int i=0; i<size_B; i++) Bh[i] = (float)BB[i];

you have

Ah = transpose(AA)
Bh = transpose(BB)

After

Muld<<<dimGrid, dimBlock>>>(Ad, Bd, wA, wB, Cd);

you get Cd = Ad * Bd, and after

cudaMemcpy(Ch, Cd, mem_size_C, cudaMemcpyDeviceToHost);

you get Ch = Ah * Bh. Finally, after

for (int i=0; i<size_C; i++) CC[i] = (double)Ch[i];

you get CC = transpose(Ch). Hence

CC = transpose(Ch) = transpose(Ah * Bh) = transpose(Bh) * transpose(Ah) = BB * AA,

which is what you obtain.
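If you want CC = AA * BB directly while keeping the column-major layout on the R side, one option is to transpose while converting between double and float. A minimal sketch, assuming an hA x wA matrix AA stored column-major (the helper name is only illustrative):

// Repack a (rows x cols) column-major double matrix (R layout) into a
// row-major float buffer, so the row-major kernel computes A * B.
void col_to_row_major(const double *src, float *dst, int rows, int cols)
{
	for (int r = 0; r < rows; ++r)
		for (int c = 0; c < cols; ++c)
			dst[r * cols + c] = (float)src[c * rows + r];
}

// In main() this would replace the plain element-wise copies:
//   col_to_row_major(AA, Ah, hA, wA);
//   col_to_row_major(BB, Bh, wA, wB);
// and the row-major result Ch (hA x wB) would have to be transposed back
// into CC the same way before returning to R.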

I have made the correction you suggested, and my function’s code is:

#define BLOCK_SIZE 1

__global__ void __prod(float *A, float *B, long wA, long wB, float *C)
{
	long bx = blockIdx.x;
	long by = blockIdx.y;
	long tx = threadIdx.x;
	long ty = threadIdx.y;

	long aBegin = wA * BLOCK_SIZE * by;
	long aEnd = aBegin + wA - 1;
	long aStep = BLOCK_SIZE;
	long bBegin = BLOCK_SIZE * bx;
	long bStep = BLOCK_SIZE * wB;

	float Csub = 0;

	for (long a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
	{
		__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
		__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

		As[ty][tx] = A[a + wA * ty + tx];
		Bs[ty][tx] = B[b + wB * ty + tx];

		__syncthreads();

		for (long k = 0; k < BLOCK_SIZE; ++k) Csub += As[ty][k] * Bs[k][tx];

		__syncthreads();
	}

	long c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
	C[c + wB * ty + tx] = Csub;
}

void prod(float *d_A, float *d_B, float *d_C, long wA, long hA, long wB)
{
	dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
	dim3 dimGrid(wB / dimBlock.x, hA / dimBlock.y);

	__prod<<<dimGrid, dimBlock>>>(d_A, d_B, wA, wB, d_C);
}

I need to multiply an image matrix laid out as [3 RGB channels X ImageHeight*ImageWidth] by a [3 X 3] transform matrix to convert RGB <-> YIQ. This is my code for it:

void rgb_yiq(float *&img_in, float *&img_out, long iH, long iW, bool dir)
{
	if (dir)
	{
		float RGBYIQH[3][3] = {{0.299, 0.587, 0.114},{0.596, -0.274, -0.322},{0.212, -0.523, 0.311}};
		float *RGBYIQ;
		cudaMalloc((void**) &RGBYIQ, 3*3*sizeof(float));
		cudaMemcpy(RGBYIQ, RGBYIQH, 3*3*sizeof(float), cudaMemcpyHostToDevice);
		prod(RGBYIQ, img_in, img_out, 3, 3, iW);
		cudaFree(RGBYIQ);
	}
	else
	{
		float YIQRGBH[3][3] = {{1.0, 0.956, 0.621},{1.0, -0.272, -0.647},{1.0, -1.105, 1.702}};
		float *YIQRGB;
		cudaMalloc((void**) &YIQRGB, 3*3*sizeof(float));
		cudaMemcpy(YIQRGB, YIQRGBH, 3*3*sizeof(float), cudaMemcpyHostToDevice);
		prod(YIQRGB, img_in, img_out, 3, 3, iW);
		cudaFree(YIQRGB);
	}
}

iH is 3, iW is ImageHeight*ImageWidth. In emulation mode I see that if iH is 3, the __prod function is not entered (I use printf as a signal), even when I set BLOCK_SIZE = 1. If I set iH = 3*ImageHeight and iW = ImageWidth, the function is entered, but the result is incorrect. Something is wrong with the dimensions, but I can’t figure out what.

Your code works; I used the following driver:

int main(int argc, char* argv[])
{
	int i, j, index;
	long iH = 3, iW = 5;
	unsigned int size = iH * iW * sizeof(float);

	float *img_in = (float*) malloc( size );
	float *img_out = (float*) malloc( size );
	for (int i = 0; i < iH * iW; ++i){ img_in[i] = (float)i; }

	float *img_in_d, *img_out_d;
	CUDA_SAFE_CALL(cudaMalloc((void**) &img_in_d, size));
	CUDA_SAFE_CALL(cudaMalloc((void**) &img_out_d, size));
	CUDA_SAFE_CALL(cudaMemcpy(img_in_d, img_in, size, cudaMemcpyHostToDevice));

	rgb_yiq(img_in_d, img_out_d, iH, iW, true);

	cutilSafeCall(cudaMemcpy(img_out, img_out_d, size, cudaMemcpyDeviceToHost));

	printf("img_in = \n");
	index = 0;
	for (i = 1; i <= iH; i++){
		for (j = 1; j <= iW; j++){
			printf("%12.3f  ", img_in[index]);
			index++;
		}
		printf("\n");
	}

	printf("img_out = \n");
	index = 0;
	for (i = 1; i <= iH; i++){
		for (j = 1; j <= iW; j++){
			printf("%12.3f  ", img_out[index]);
			index++;
		}
		printf("\n");
	}

	return 0;
}

I guess that you may be passing the host pointers img_in and img_out when calling the function “rgb_yiq”.

Yes, it works, but only for small images. I’ve run tests, and for images of 400X400 and larger the function is not called, but for small images (100X100, 200X200, 250X250) it works well. I can’t understand the difference.

Remember that the maximum size of each dimension of a grid of thread blocks is 65535.

Your execution configuration is

dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(wB / dimBlock.x, hA / dimBlock.y);

and BLOCK_SIZE = 1.

So if iW = 400 * 400 (a 400X400 image), then dimGrid.x = 160000 > 65535, and the kernel __prod fails.

However, if iW = 100X100, 200X200, or 250X250, then dimGrid.x < 65535, so __prod succeeds.
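If you keep BLOCK_SIZE = 1, one way to at least catch this case (a sketch, assuming device 0 and the corrected __prod signature from above; the function name is only illustrative) is to check the grid against the device limits and the launch status:

#include <stdio.h>

// Guarded version of prod(): refuse to launch if the grid exceeds the
// device's maximum grid dimensions, and report launch errors.
void prod_checked(float *d_A, float *d_B, float *d_C, long wA, long hA, long wB)
{
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, 0);

	dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
	dim3 dimGrid(wB / dimBlock.x, hA / dimBlock.y);

	if (dimGrid.x > (unsigned)prop.maxGridSize[0] ||
	    dimGrid.y > (unsigned)prop.maxGridSize[1]) {
		fprintf(stderr, "grid %u x %u exceeds device limit %d x %d\n",
		        dimGrid.x, dimGrid.y, prop.maxGridSize[0], prop.maxGridSize[1]);
		return;
	}

	__prod<<<dimGrid, dimBlock>>>(d_A, d_B, wA, wB, d_C);

	// An oversized grid would otherwise show up here as
	// "invalid configuration argument".
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess)
		fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(err));
}

Using a larger BLOCK_SIZE (e.g. 16) with padded matrices would also bring dimGrid.x back under the limit.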

Hi,

Can anyone help me? I need CUDA code for matrix subtraction.

You want C[i][j] = A[i][j] - B[i][j] for each i and j?

So… that’s very, very easy. What is the problem?
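Something along these lines should do it; a minimal sketch treating the matrices as flat row-major arrays in device memory (the names SUB_BLOCK and matSub are only illustrative):

#define SUB_BLOCK 256

// C[i] = A[i] - B[i] over n elements; each thread handles one element.
__global__ void matSub(const float *A, const float *B, float *C, int n)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < n)  // guard so n need not be a multiple of SUB_BLOCK
		C[i] = A[i] - B[i];
}

// Host-side launch for an hA x wA matrix stored contiguously in device memory:
//   int n = hA * wA;
//   matSub<<<(n + SUB_BLOCK - 1) / SUB_BLOCK, SUB_BLOCK>>>(d_A, d_B, d_C, n);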