CUDA 2.1 on VS 2008 Queries regarding compiler settings

Hello All,

I have been following posts on CUDA setup on VS2005. I am using CUDA 2.1 on VS2008. I am this much sure that CUDA2.1 runs on VS2008 because I am successfully able to run NVIDIA’s SDK examples on my machine. My machine has Vista 32 and NVidia GeForce 8800S, that supports CUDA 2.1.(Driver version 181.2).

Problem is I wrote a simple function for matrix multiplication. Precisely I am referring to Ch 6 from Programming Guide , CUDA example.

As per the guide, I created two files , one containing device code and other for host.

But when I compile, it gives me following error:

1>Project : error PRJ0019: A tool returned an error code from “Compiling with CUDA Build Rule…”

and I am using following compiler settings:

1>“C:\CUDA\bin\nvcc.exe” -arch sm_10 -ccbin “C:\Program Files\Microsoft Visual Studio 9.0\VC\bin” -Xcompiler "/EHsc /W3 /nologo /O2 /Zi /MT " -maxrregcount=32 --compile -o Debug\Add.cu.obj Add.cu

I am not able to figure out what is going wrong.

My source code for device : Add.cu

#include <stdio.h>

#include <stdlib.h>

#include <cutil_inline.h>

// Device multiplication function called by Mul()

// Compute C = A * B

// wA is the width of A

// wB is the width of B

__global__ void Muld(float* A, float* B, int wA, int wB, float* C)

{

		// Block index

		int bx = blockIdx.x;

		int by = blockIdx.y;

		

		// Thread index

		int tx = threadIdx.x;

		int ty = threadIdx.y;

		

		// Index of the first sub-matrix of A processed by the block

		int aBegin = wA * BLOCK_SIZE * by;

		

		// Index of the last sub-matrix of A processed by the block

		int aEnd = aBegin + wA - 1;

		

		// Step size used to iterate through the sub-matrices of A

		int aStep = BLOCK_SIZE;

		

		// Index of the first sub-matrix of B processed by the block

		int bBegin = BLOCK_SIZE * bx;

		

		// Step size used to iterate through the sub-matrices of B

		int bStep = BLOCK_SIZE * wB;

		

		// The element of the block sub-matrix that is computed

		// by the thread

		float Csub = 0;

		

		// Loop over all the sub-matrices of A and B required to

		// compute the block sub-matrix

		

		for (int a = aBegin, b = bBegin;a <= aEnd; a += aStep, b += bStep) 

		{

			

			// Shared memory for the sub-matrix of A

			__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

			

			// Shared memory for the sub-matrix of B

			__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

			

			// Load the matrices from global memory to shared memory;

			// each thread loads one element of each matrix

			As[ty][tx] = A[a + wA * ty + tx];

			Bs[ty][tx] = B[b + wB * ty + tx];

			

			// Synchronize to make sure the matrices are loaded

			__syncthreads();

			

			// Multiply the two matrices together;

			// each thread computes one element

			// of the block sub-matrix

			for (int k = 0; k < BLOCK_SIZE; ++k)

			Csub += As[ty][k] * Bs[k][tx];

			

			// Synchronize to make sure that the preceding

			// computation is done before loading two new

			// sub-matrices of A and B in the next iteration

			__syncthreads();

			

		}//end for loop

		// Write the block sub-matrix to global memory;

		// each thread writes one element

		int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;

		C[c + wB * ty + tx] = Csub;

		

}//end of Muld() on GPU

and Host code, Source1.cpp

#include<windows.h>

//CUDA specific header files.

#include <cuda_runtime.h>

#include <cutil_inline.h>

#include <cuda_gl_interop.h>

//extern "C" void* add(int* A, int* B, int* C, int N);

//Taking example from NVIDIA Programming Guide, Ch6.

// Thread block size

#define BLOCK_SIZE 16

// Forward declaration of the device multiplication function

__global__ void Muld(float*, float*, int, int, float*);

// Host multiplication function

// Compute C = A * B

// hA is the height of A

// wA is the width of A

// wB is the width of B

void Mul(const float* A, const float* B, int hA, int wA, int wB,float* C)

{

	int size;

	// Load A and B to the device

	float* Ad;

	size = hA * wA * sizeof(float);

	cudaMalloc((void**)&Ad, size);

	cudaMemcpy(Ad, A, size, cudaMemcpyHostToDevice);

	float* Bd;

	size = wA * wB * sizeof(float);

	cudaMalloc((void**)&Bd, size);

	cudaMemcpy(Bd, B, size, cudaMemcpyHostToDevice);

	// Allocate C on the device

	float* Cd;

	size = hA * wB * sizeof(float);

	cudaMalloc((void**)&Cd, size);

	// Compute the execution configuration assuming

	// the matrix dimensions are multiples of BLOCK_SIZE

	dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

	dim3 dimGrid(wB / dimBlock.x, hA / dimBlock.y);

	// Launch the device computation

	Muld<<<dimGrid, dimBlock>>>(Ad, Bd, wA, wB, Cd);

	

	// Read C from the device

	cudaMemcpy(C, Cd, size, cudaMemcpyDeviceToHost);

	

	// Free device memory

	cudaFree(Ad);

	cudaFree(Bd);

	cudaFree(Cd);

}//end of Mul() on the host.

/////////////////////////////////////////////////////////////////////////////////////////

/////////////////////////////////////////////////////////////////////////////////////////

//Main function.

void main(int argc , char* argv[])

{

	

  //Call Mul() that in turn calls Muld().

	float A[3][4] = { {2,3,4,8}, {1,3,5,1}, {2,8,7,5}};

	float B[3][4] = { {2,3,4,8}, {1,3,5,1}, {2,8,7,5}};

	

	float **C = NULL;

	C = (float**)malloc(sizeof(float*) * 3);

	for(int i = 0; i < 3; i++);

	{

		C[i] = (float*)malloc(4 * sizeof(float));

		memset(C[i], 0, 4 * sizeof(float));

	}

	//Call CPU version

	Mul(A,B,3,4,4,C);

	for(int i = 0; i < 3; i++);

		printf("C[%d] = %f", i, C[i]);

	//free memory from C 

	for(int i = 0; i < 3; i++);

	{

		free(C[i])

		C[i] = NULL;

	}

}//end of main().

Kindly help on this. Thank you.