CUDA Matrix Example

dooky84 · September 24, 2009, 6:01pm

hallo,

i am a CUDA rookie and i try to reproduce the CudaExample from the CUDA_ProgrammingGuide_2.3 page 20.

but i get the Error “CXX0030” (expression not evaluatable) in the Line where i allocated the device memory ?

Can anyone help me? thanks

/************************************************************

********

* MatrixMulti CUDA program.

************************************************************

*********/

#define BLOCK_SIZE 32

#define WIDTH BLOCK_SIZE

#define HEIGHT BLOCK_SIZE

#include <stdio.h>

#include <stdlib.h>

#include <cuda_runtime.h>

#include <cutil.h>

typedef struct {

int width;

int height;

float* elements;

} Matrix;

/************************************************************

************/

/* Init CUDA															*/

/************************************************************

************/

#if __DEVICE_EMULATION__

bool InitCUDA(void){return true;}

#else

bool InitCUDA(void)

{

	int count = 0;

	int i = 0;

	cudaGetDeviceCount(&count);

	if(count == 0) {

		fprintf(stderr, "There is no device.\n");

		return false;

	}

	for(i = 0; i < count; i++) {

		cudaDeviceProp prop;

		if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {

			if(prop.major >= 1) {

				break;

			}

		}

	}

	if(i == count) {

		fprintf(stderr, "There is no device supporting CUDA.\n");

		return false;

	}

	cudaSetDevice(i);

	printf("CUDA initialized.\n");

	return true;

}

#endif

// Allocates a matrix with random float entries.

void randomInit(float* data, int size)

{

	for (int i = 0; i < size; ++i)

		data[i] = rand() / (float)RAND_MAX;

	

}

/************************************************************

************/

//Kernel

//Matrix multiplication kernel called by MatMul()

__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)

{

// Each thread computes one element of C

// by accumulating results into Cvalue

float Cvalue = 0;

int row = blockIdx.y * blockDim.y + threadIdx.y;

int col = blockIdx.x * blockDim.x + threadIdx.x;

for (int e = 0; e < A.width; ++e)

Cvalue += A.elements[row * A.width + e]

* B.elements[e * B.width + col];

C.elements[row * C.width + col] = Cvalue;

}

/************************************************************

************/

// Matrix multiplication - Host code

// Matrix dimensions are assumed to be multiples of BLOCK_SIZE

void MatMul(const Matrix A, const Matrix B, Matrix C)

{

// Load A and B to device memory

Matrix d_A,d_B,d_C;

size_t size = A.width * A.height * sizeof(float);

d_A.width =A.width; d_A.height = A.width;

CUDA_SAFE_CALL(cudaMalloc((void**)&d_A.elements, size));

CUDA_SAFE_CALL(cudaMemcpy(d_A.elements,A.elements, size,

cudaMemcpyHostToDevice));

d_B.width = B.width; d_B.height = B.height;

CUDA_SAFE_CALL(cudaMalloc((void**)&d_B.elements, size));

CUDA_SAFE_CALL(cudaMemcpy(d_B.elements, B.elements, size,

cudaMemcpyHostToDevice));

// Allocate C in device memory

d_C.width = C.width; d_C.height = C.height;

size = C.width * C.height * sizeof(float);

CUDA_SAFE_CALL(cudaMalloc((void**)&d_C.elements, size));

// Invoke kernel

dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);

unsigned int timer = 0;

CUT_SAFE_CALL( cutCreateTimer( &timer));

CUT_SAFE_CALL( cutStartTimer( timer));

MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

CUT_CHECK_ERROR("Kernel execution failed\n");

CUDA_SAFE_CALL( cudaThreadSynchronize() );

CUT_SAFE_CALL( cutStopTimer( timer));

printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));

CUT_SAFE_CALL( cutDeleteTimer( timer));

// Read C from device memory

cudaMemcpy(C.elements, d_C.elements, size,cudaMemcpyDeviceToHost);

// Free device memory

cudaFree(d_A.elements);

cudaFree(d_B.elements);

cudaFree(d_C.elements);

}

/************************************************************

************/

/*MAIN														  */

/************************************************************

************/

int main(int argc, char* argv[])

{

	if(!InitCUDA()) {

		return 0;

	}

	   // allocate host memory for matrices A and B

	Matrix h_A,h_B,h_C;

	h_A.width=WIDTH;

	h_A.height=HEIGHT;

	h_B.width=WIDTH;

	h_B.height=HEIGHT;

	h_C.width=WIDTH;

	h_C.height=HEIGHT;

	unsigned int size = WIDTH*HEIGHT;

	unsigned int mem_size = sizeof(float) * size;

	

	h_A.elements= (float*) malloc(mem_size);

	h_B.elements= (float*) malloc(mem_size);

	h_C.elements= (float*) malloc(mem_size);

	// set seed for rand()

	srand(2006);

	// initialize host memory

   randomInit(h_A.elements, size);

   randomInit(h_B.elements, size);

		

//invoke MatMul

MatMul(h_A,h_B,h_C);

CUT_EXIT(argc, argv);

	return 0;

}

LSChien · September 25, 2009, 12:34am

I compile your code in my machine, it is O.K. without errors.

my platform: winxp pro x64, vc2005, driver 190.38, cuda 2.3

what’s your platform?

are you store it as .cu file? you must use nvcc to compile it.

dooky84 · September 25, 2009, 9:11am

Thanks for you Answer.
I make a mistake yesterday because i publish a wrong Error. Sorry, i was a little bit confused.I can compile the Code but if i start the Program, i get a
“cudaError on memoryposition 0x0012fe60…”.

my platform: winxp pro , vc2005 Express, driver 190.38, cuda 2.3, GeForce 8800 GTS 512

LSChien · September 25, 2009, 3:13pm

sorry, I just compile your code but never execute it.

now if I execute your code, I got error message “inalid configuration argument” on

“MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);”

this is because “BLOCK_SIZE =32” and “dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);” is

impossible since maximum threads per block is 512.

change BLOCK_SIZE to 16, then program works.

I have no idea with error “cudaError on memoryposition 0x0012fe60…”.

but try to modify value of BLOCK_SIZE

dooky84 · September 28, 2009, 7:52am

It works!..Thank you External Image

sushilmunot · March 4, 2010, 7:10am

hi … i am new to cuda .

can any one tell me how the above code runs in parallel(the kernel code).

thanx in advance for your response.

Topic		Replies	Views
cuda error out of memory how to increase the size of matrix in multiplication CUDA Programming and Performance	10	11733	October 1, 2009
matrix multiplication with its transpose in cuda(cudamemcpy from device to host not working) . CUDA Programming and Performance	6	1769	October 5, 2018
32 x 32 Matrix Multiplication CUDA Programming and Performance	2	2871	March 5, 2010
Matrix multiplication ERRORS & few thoughts on CUDA Basic programming errors need correction CUDA Programming and Performance	14	13293	January 24, 2009
Intro CUDA - Matrix Multiplication Returning Odd Values CUDA Programming and Performance	1	5709	June 25, 2009
Cuda matrix multiplication too slow CUDA Programming and Performance	5	13334	February 17, 2010
MatrixMul sample CUDA Programming and Performance	15	3657	February 9, 2018
CUDA 2.1 on VS 2008 Queries regarding compiler settings CUDA Programming and Performance	0	2187	May 20, 2009
Why different shape matrix multiplication have different performance? CUDA Programming and Performance	2	766	August 26, 2018
bug in CUDA initialization? simple code cant see the device after xxx runs CUDA Programming and Performance	10	7761	June 23, 2009

CUDA Matrix Example

Related topics