CUDA Matrix Example


I am a CUDA rookie and I am trying to reproduce the matrix multiplication example from the CUDA Programming Guide 2.3, page 20.

But I get the error “CXX0030” (expression cannot be evaluated) on the line where I allocate the device memory.

Can anyone help me? thanks



* MatrixMulti CUDA program.



// Edge length of the square thread block used for the kernel launch.
// 16 x 16 = 256 threads per block stays within the 512-threads-per-block
// limit of the GPU discussed in this thread. The previous value of 32
// requested 32 x 32 = 1024 threads per block, which made the kernel launch
// fail with "invalid configuration argument".
#define BLOCK_SIZE 16



#include <stdio.h>

#include <stdlib.h>

#include <cuda_runtime.h>

#include <cutil.h>

// Dense matrix descriptor. Values are stored row-major, so element
// (row, col) lives at elements[row * width + col].
typedef struct {
    int width, height;   // number of columns, number of rows
    float* elements;     // storage for width * height floats (host or device pointer)
} Matrix;



/* Init CUDA															*/




// Stub variant of InitCUDA: always reports success.
// NOTE(review): a second definition of InitCUDA follows directly below;
// presumably the two were selected by preprocessor conditionals that are
// missing from this listing -- confirm against the original file.
bool InitCUDA(void){return true;}


// Checks whether a usable CUDA device is present. Prints a diagnostic to
// stderr and returns false when none is found; otherwise prints
// "CUDA initialized." and returns true.
// NOTE(review): the braces of this function and several of its statements
// are not present in the listing as shown.
bool InitCUDA(void)


	// NOTE(review): count is never assigned after this initialization in the
	// visible code, so the check below always fails as shown; presumably a
	// device-count query belongs here -- TODO confirm.
	int count = 0;

	int i = 0;


	if(count == 0) {

		fprintf(stderr, "There is no device.\n");

		return false;


	// Query the properties of each device index in turn.
	for(i = 0; i < count; i++) {

		cudaDeviceProp prop;

		if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {

			// Device reports a major version of at least 1.
			// NOTE(review): the body of this branch is not visible;
			// presumably it leaves the loop early -- confirm.
			if(prop.major >= 1) {





	// i reaching count means the loop ran over every device index.
	if(i == count) {

		fprintf(stderr, "There is no device supporting CUDA.\n");

		return false;



	printf("CUDA initialized.\n");

	return true;



// Fills data[0 .. size-1] with pseudo-random floats in [0, 1] drawn from rand().
// The buffer is owned and allocated by the caller; this function only writes to it.
// A size of zero or less leaves the buffer untouched.
void randomInit(float* data, int size)
{
    for (int i = 0; i < size; ++i) {
        data[i] = rand() / (float)RAND_MAX;
    }
}






// Matrix multiplication kernel called by MatMul(): computes C = A * B.
//
// Launch layout: a 2D grid of 2D blocks with one thread per element of C;
// the thread with global index (x, y) produces C(row = y, col = x).
// All three matrices are row-major and their `elements` must be device pointers.
// Preconditions: A.width == B.height, C.height == A.height, C.width == B.width.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads that fall outside C (grid larger than the matrix) do nothing.
    if (row >= C.height || col >= C.width)
        return;

    // Each thread computes one element of C
    // by accumulating results into Cvalue
    float Cvalue = 0.0f;
    for (int e = 0; e < A.width; ++e)
        Cvalue += A.elements[row * A.width + e]
                * B.elements[e * B.width + col];

    C.elements[row * C.width + col] = Cvalue;
}




// Matrix multiplication - Host code: computes C = A * B on the GPU.
//
// A, B and C describe row-major host matrices. C.elements must already point
// to C.width * C.height floats; it receives the result.
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE, with
// A.width == B.height, C.height == A.height and C.width == B.width.
// Prints the kernel execution time measured with the cutil timer.
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    Matrix d_A, d_B, d_C;

    // Load A to device memory
    size_t size = (size_t)A.width * A.height * sizeof(float);
    d_A.width = A.width; d_A.height = A.height;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_A.elements, size));
    CUDA_SAFE_CALL(cudaMemcpy(d_A.elements, A.elements, size,
                              cudaMemcpyHostToDevice));

    // Load B to device memory (sized from B itself, not from A)
    size = (size_t)B.width * B.height * sizeof(float);
    d_B.width = B.width; d_B.height = B.height;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_B.elements, size));
    CUDA_SAFE_CALL(cudaMemcpy(d_B.elements, B.elements, size,
                              cudaMemcpyHostToDevice));

    // Allocate C in device memory
    d_C.width = C.width; d_C.height = C.height;
    size = (size_t)C.width * C.height * sizeof(float);
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_C.elements, size));

    // Invoke kernel: one thread per element of C
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);

    unsigned int timer = 0;
    CUT_SAFE_CALL( cutCreateTimer( &timer));
    CUT_SAFE_CALL( cutStartTimer( timer));

    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    CUT_CHECK_ERROR("Kernel execution failed\n");

    // The launch is asynchronous; wait for completion before stopping the timer.
    CUDA_SAFE_CALL( cudaThreadSynchronize() );
    CUT_SAFE_CALL( cutStopTimer( timer));
    printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
    CUT_SAFE_CALL( cutDeleteTimer( timer));

    // Read C from device memory (size still holds the byte count of C)
    CUDA_SAFE_CALL(cudaMemcpy(C.elements, d_C.elements, size,
                              cudaMemcpyDeviceToHost));

    // Free device memory
    CUDA_SAFE_CALL(cudaFree(d_A.elements));
    CUDA_SAFE_CALL(cudaFree(d_B.elements));
    CUDA_SAFE_CALL(cudaFree(d_C.elements));
}







/*MAIN														  */



// Program entry point: initializes CUDA, allocates three host matrices and
// fills the two inputs with random values.
// NOTE(review): the braces of this function and several statements are not
// present in the listing as shown (see the notes below).
int main(int argc, char* argv[])


	// Stop right away when no usable CUDA device was found.
	if(!InitCUDA()) {

		return 0;


	   // allocate host memory for matrices A and B

	Matrix h_A,h_B,h_C;







	// NOTE(review): WIDTH and HEIGHT are not defined anywhere in the visible
	// listing, and the width/height fields of h_A, h_B and h_C are never set
	// here -- presumably those lines are missing; confirm.
	unsigned int size = WIDTH*HEIGHT;

	unsigned int mem_size = sizeof(float) * size;


	// One buffer of mem_size bytes per matrix; the results of malloc are not checked.
	h_A.elements= (float*) malloc(mem_size);

	h_B.elements= (float*) malloc(mem_size);

	h_C.elements= (float*) malloc(mem_size);

	// set seed for rand()
	// NOTE(review): no seeding call is visible below this comment.


	// initialize host memory

   randomInit(h_A.elements, size);

   randomInit(h_B.elements, size);


//invoke MatMul
// NOTE(review): the call to MatMul itself is not visible in the listing, and
// the three host buffers are never freed in the visible code.


CUT_EXIT(argc, argv);

	return 0;


I compiled your code on my machine and it builds without errors.

my platform: winxp pro x64, vc2005, driver 190.38, cuda 2.3

What’s your platform?

Did you save it as a .cu file? You must use nvcc to compile it.

Thanks for your answer.
I made a mistake yesterday and posted the wrong error. Sorry, I was a little bit confused. I can compile the code, but when I start the program I get a
“cudaError on memoryposition 0x0012fe60…”.

my platform: winxp pro , vc2005 Express, driver 190.38, cuda 2.3, GeForce 8800 GTS 512

Sorry, I only compiled your code and never ran it.

Now when I run your code, I get the error message “invalid configuration argument” at

“MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);”

This is because with “BLOCK_SIZE = 32”, “dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);” requests 32 × 32 = 1024 threads per block, which is

impossible since the maximum number of threads per block is 512.

Change BLOCK_SIZE to 16 (16 × 16 = 256 threads per block) and the program works.

I have no idea about the error “cudaError on memoryposition 0x0012fe60…”,

but try changing the value of BLOCK_SIZE.

It works!..Thank you :thumbup:

Hi, I am new to CUDA.

Can anyone tell me how the above code (the kernel code) runs in parallel?

Thanks in advance for your response.