/************************************************************
*********/
include <stdio.h>
include <stdlib.h>
include <string.h>
include <cuda.h>
include <cutil.h> /* includes project */
include “cuda_runtime.h”
include “cuda_runtime_api.h”
define WIDTH 4
/* ------------------------------- declaration of functions -------------------------------- */
bool InitCUDA(void);
/************************************************************
************/
/* Init CUDA */
/************************************************************
************/
#if DEVICE_EMULATION
bool InitCUDA(void){return true;}
else
bool InitCUDA(void)
{
int count = 0;
int i = 0;
cudaGetDeviceCount(&count);
if(count == 0) {
fprintf(stderr, "There is no device.\n");
return false;
}
for(i = 0; i < count; i++) {
cudaDeviceProp prop;
if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
if(prop.major >= 1) {
break;
}
}
}
if(i == count) {
fprintf(stderr, "There is no device supporting CUDA.\n");
return false;
}
cudaSetDevice(i);
printf("CUDA initialized.\n");
return true;
}
endif
/************************************************************
************/
/* My First CUDA Code */
/************************************************************
************/
// Code for Multiplication in GPU vs CPU.
// A suffix ‘d’ suggests operation on the Device.
// A suffix ‘h’ suggests operation on the Host.
// Matrix Multiplication required: P = M x N.
// One thread handles one element of P. Each thread:
// * Loads a row of Matrix M.
// * Loads a column of Matrix N.
// * For each pair of elements (Mij and Nji), it performs a multiplication and then addition.
// However, here the matrices have been used as one-dimensional arrays.
// Shared memory usage not employed now. Only one block of thread will compute the matrix P.
// So the size of the matrix P (also M & N) is limited by the number of threads allowed in a block.
/* ----------------------------------------- global variables ------------------------------------- */
typedef struct {
int* elements;
} Matrix;
/* ----------------------------------------- global Functions ------------------------------------- */
// extern “C”
global static void MatrixMul_DeviceKernel(int* Md, int* Nd, int* Pd)
{
// Performs Matrix Multiplication on the device.
// Set-up configuration (grid, block, etc.) details available from main().
// Temporary variables
int i;
int tx;
int ty;
int M_element = 0;
int N_element = 0;
int P_element = 0;
// 2D Thread ID
tx = threadIdx.x;
ty = threadIdx.y;
// Perform Multiplication
// Each thread is supposed to pick a row in Md and a column in Nd, multiply corresponding elements and add them.
__syncthreads();
for (i = 0; i < WIDTH; i++)
{
M_element = Md.elements[tx*WIDTH + i];
N_element = Nd.elements[i*WIDTH + ty];
P_element += M_element* N_element;
}
Pd.elements[tx*WIDTH + ty] = P_element;
__syncthreads();
}
/* --------------------------------------------- Host’s (CPU) Main( ) Code ----------------------------------- */
int main(void) {
if(!InitCUDA()) {
return 0;
}
int i;
struct Matrix *Mh, *Nh, *Ph, *Md, *Nd, *Pd;
int matrixsize = WIDTHWIDTH sizeof(int);
cudaError_t err;
// Allocate and initialize the matrices on the CPU
CUDA_SAFE_CALL(cudaMallocHost((void**)&Mh,sizeof(Matrix)));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "Mh malloc error: %s.\n",cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Nh,sizeof(Matrix)));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "Nh malloc error: %s.\n",cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Ph,sizeof(Matrix)));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "Ph malloc error: %s.\n",cudaGetErrorString(err));
memset(Mh,1,matrixsize);
memset(Nh,1,matrixsize);
memset(Ph,0,matrixsize);
// Allocate and initialize the elements array in the Matrices on the CPU
CUDA_SAFE_CALL(cudaMallocHost((void**)&Mh.elements,matrixsize));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "Mh.elements malloc error: %s.\n",cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Nh.elements,matrixsize));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "Nh.elements malloc error: %s.\n",cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Ph.elements,matrixsize));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "Ph.elements malloc error: %s.\n",cudaGetErrorString(err));
memset(Mh.elements,1,matrixsize);
memset(Nh.elements,1,matrixsize);
memset(Ph.elements,0,matrixsize);
// Assign/Fetch values of matrices M and N
for(i=0;i<WIDTH*WIDTH;i++)
{
Mh.elements[i]=5;
Nh.elements[i]=1;
Ph.elements[i]=0;
}
// Allocates enough memory for matrices Md, Nd and Pd on the Device.
CUDA_SAFE_CALL(cudaMalloc((void**)&Md,sizeof(Matrix)));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "Md malloc error: %s.\n",cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaMalloc((void**)&Nd,sizeof(Matrix)));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "Nd malloc error: %s.\n",cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaMalloc((void**)&Pd,sizeof(Matrix)));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "Pd malloc error: %s.\n",cudaGetErrorString(err));
// Initializes matrices Md, Nd and Pd on the Device.
CUDA_SAFE_CALL(cudaMemset(Md,1,sizeof(Matrix)));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "Md memset error: %s.\n",cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaMemset(Nd,1,sizeof(Matrix)));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "Nd memset error: %s.\n",cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaMemset(Pd,0,sizeof(Matrix)));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "Pd memset error: %s.\n",cudaGetErrorString(err));
// Allocate the elements array in matrices Md, Nd and Pd on the GPU
CUDA_SAFE_CALL(cudaMalloc((void**)&Md.elements,matrixsize));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "Md.elements malloc error: %s.\n",cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaMalloc((void**)&Nd.elements,matrixsize));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "Nd.elements malloc error: %s.\n",cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaMalloc((void**)&Pd.elements,matrixsize));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "Pd.elements malloc error: %s.\n",cudaGetErrorString(err));
// Initializes the elements array in matrices Md, Nd and Pd on the GPU
CUDA_SAFE_CALL(cudaMemset(Md.elements,1,matrixsize));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "Md.elements memset error: %s.\n",cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaMemset(Nd.elements,1,matrixsize));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "Nd.elements memset error: %s.\n",cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaMemset(Pd.elements,0,matrixsize));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "Pd.elements memset error: %s.\n",cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaMemcpy(Md, Mh, sizeof(Matrix), cudaMemcpyHostToDevice));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "cudaMemcpyHostToDevice error: %s.\n",cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaMemcpy(Nd, Nh, sizeof(Matrix), cudaMemcpyHostToDevice));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "cudaMemcpyHostToDevice error: %s.\n",cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaMemcpy(Pd, Ph, sizeof(Matrix), cudaMemcpyHostToDevice));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "cudaMemcpyHostToDevice error: %s.\n",cudaGetErrorString(err));
// Perform Multiplication
// Set-up the execution configuration
dim3 dimGrid(1, 1); /* the grid has only 1 block in this code */
dim3 dimBlock(WIDTH, WIDTH); /* # elements in the matrix = # threads in the block */
// Launch a kernel of threads to perform Matrix Multiplication on the Device
// The function (MatrixMul_DeviceKernel) performs the Matrix Multiplication
MatrixMul_DeviceKernel<<<dimGrid,dimBlock>>>(Md,Nd,Pd);
// This launches a kernel of threads in the “Block†in the “Gridâ€, all of whose threads perform the function defined in the global function MatrixMul_DeviceKernel and need arguments Md, Nd and Pd to do that.
// Multiplication Over
// Read and copy output matrix Pd from the device to the output matrix P on the host
CUDA_SAFE_CALL(cudaMemcpy(Ph, Pd, sizeof(Matrix), cudaMemcpyDeviceToHost));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "cudaMemcpyDeviceToHost for result of Multiplication error: %s.\n",cudaGetErrorString(err));
// Print the output matrix
for (i=0;i<WIDTH*WIDTH;i++){
printf("Ph.elements[%d] = %d\n",i,Ph.elements[i]);
}
// Free device memory
CUDA_SAFE_CALL(cudaFree(Md.elements));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "cudaFree(Md.elements) error: %s.\n",i,cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaFree(Nd.elements));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "cudaFree(Nd.elements) error: %s.\n",i,cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaFree(Pd.elements));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "cudaFree(Pd.elements) error: %s.\n",i,cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaFree(Md));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "cudaFree(Md) error: %s.\n",i,cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaFree(Nd));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "cudaFree(Nd) error: %s.\n",i,cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaFree(Pd));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "cudaFree(Pd) error: %s.\n",i,cudaGetErrorString(err));
// Free matrices allocated on the CPU
CUDA_SAFE_CALL(cudaFreeHost(Mh.elements));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "cudaFreeHost(Mh.elements) error: %s.\n",i,cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaFreeHost(Nh.elements));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "cudaFreeHost(Nh.elements) error: %s.\n",i,cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaFreeHost(Ph.elements));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "cudaFreeHost(Ph.elements) error: %s.\n",i,cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaFreeHost(Mh));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "cudaFreeHost(Mh) error: %s.\n",i,cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaFreeHost(Nh));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "cudaFreeHost(Nh) error: %s.\n",i,cudaGetErrorString(err));
CUDA_SAFE_CALL(cudaFreeHost(Ph));
err = cudaGetLastError();
if( cudaSuccess != err)
fprintf(stderr, "cudaFreeHost(Ph) error: %s.\n",i,cudaGetErrorString(err));
return 0;
}
/* ------------------------------------------------------------------------------------------------------------------------- */
ERROR LOG:
1>------ Build started: Project: CUDAWinApp1_MatrixMul, Configuration: EmuDebug Win32 ------
1>Compiling…
1>sample.cu
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(114): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(115): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(119): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(142): error: incomplete type is not allowed
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(147): error: incomplete type is not allowed
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(152): error: incomplete type is not allowed
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(163): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(168): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(173): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(178): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(179): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(180): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(186): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(187): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(188): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(193): error: incomplete type is not allowed
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(198): error: incomplete type is not allowed
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(203): error: incomplete type is not allowed
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(210): error: incomplete type is not allowed
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(215): error: incomplete type is not allowed
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(220): error: incomplete type is not allowed
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(227): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(232): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(237): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(244): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(249): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(254): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(261): error: incomplete type is not allowed
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(266): error: incomplete type is not allowed
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(271): error: incomplete type is not allowed
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(285): error: argument of type “Matrix *” is incompatible with parameter of type “int *”
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(285): error: argument of type “Matrix *” is incompatible with parameter of type “int *”
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(285): error: argument of type “Matrix *” is incompatible with parameter of type “int *”
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(292): error: incomplete type is not allowed
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(299): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(304): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(308): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(312): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(332): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(336): error: expression must have class type
1>c:/Documents and Settings/fantom/Desktop/CPU_Project/cuda_00/CUDAWinApp1_MatrixMul//sample.cu(340): error: expression must have class type
1>41 errors detected in the compilation of “C:\DOCUME~1\fantom\LOCALS~1\Temp/tmpxft_00000c28_00000000-6_sample.cpp1.ii”.
1>Build log was saved at “file://c:\Documents and Settings\fantom\Desktop\CPU_Project\cuda_00\CUDAWinApp1_MatrixMul\EmuDebug\BuildLog.htm”
1>CUDAWinApp1_MatrixMul - 41 error(s), 0 warning(s)
========== Build: 0 succeeded, 1 failed, 0 up-to-date, 0 skipped ==========