fatal error LNK1104 Help

I have modified an example program that was provided with the tools required to compile CUDA programs. I’ll admit that I am new and have much to learn, but I have been fixing errors for weeks, and here is one that I cannot seem to get rid of:

LINK : fatal error LNK1104: cannot open file ‘.\Debug\matrixMul_gold.obj’

Here is the code that I modified. Please help.

// Utilities and system includes
#include <shrUtils.h>
#include <shrQATest.h>
#include "cutil_inline.h"
#include "matrixMul.h"

// includes, kernels
#include "matrixMul_kernel.cu"

// Name of this sample, printed in the start-up banner.
// Declared const because it points at a string literal, which must not be
// modified (binding a literal to a non-const char* is ill-formed in C++11).
static const char *sSDKsample = "matrixMul";

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char** argv);
void randomInit(float*, int);
void printDiff(float*, float*, int, int, int, float);

//extern "C" // to execute in cpu 
//void computeGold(float*, const float*, const float*, unsigned int, unsigned int, unsigned int);


////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
    // Register the run with the shr QA helpers, then print the sample banner.
    shrQAStart(argc, argv);
    printf("[ %s ]\n", sSDKsample);

    // Log-file redirection is intentionally left disabled:
    //   shrSetLogFileName ("matrixMul.txt");
    const char* exeName = argv[0];
    shrLog("%s Starting (CUDA tests)...\n\n", exeName);

    // All of the actual work (device setup, kernel runs, verification)
    // happens inside runTest().
    runTest(argc, argv);

    return 0;
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
// Allocates `bytes` of host memory for a float matrix.
// Terminates the program with a diagnostic if the allocation fails, so that
// callers never dereference a NULL buffer.
static float* allocHostMatrix(size_t bytes, const char* label)
{
    float* buffer = (float*)malloc(bytes);
    // malloc(0) may legitimately return NULL, so only treat NULL as a
    // failure when memory was actually requested.
    if (buffer == NULL && bytes != 0)
    {
        fprintf(stderr, "Host allocation of %lu bytes for %s failed\n",
                (unsigned long)bytes, label);
        exit(EXIT_FAILURE);
    }
    return buffer;
}

// Launches the matrixMul kernel once, choosing the template instantiation
// that matches the selected block size (16 or 32).
static void launchMatrixMul(int block_size, dim3 grid, dim3 threads,
                            float* d_C, float* d_A, float* d_B,
                            unsigned int uiWA, unsigned int uiWB)
{
    if (block_size == 16) {
        matrixMul<16><<< grid, threads >>>(d_C, d_A, d_B, uiWA, uiWB);
    } else {
        matrixMul<32><<< grid, threads >>>(d_C, d_A, d_B, uiWA, uiWB);
    }
}

// Runs the matrix-multiplication test:
//   1. selects a CUDA device (from the "device" command-line flag if present,
//      otherwise the one returned by cutGetMaxGflopsDeviceId()),
//   2. fills host matrices A and B with random values and copies them to the
//      device,
//   3. runs one warm-up launch followed by nIter timed launches of matrixMul,
//   4. logs the measured throughput,
//   5. compares the device result against computeGold() on the host,
//   6. releases all memory and reports pass/fail through shrQAFinishExit().
//
// argc/argv are the program's command-line arguments; the optional
// "sizemult" argument scales the matrix dimensions (clamped to 1..10).
//
// NOTE(review): computeGold() is called below, but its forward declaration
// near the top of this file is commented out. It must be declared (here or in
// matrixMul.h) and matrixMul_gold.cpp must be part of the build, otherwise
// this file will not compile/link -- confirm against the project settings.
void runTest(int argc, char** argv)
{
    if (shrCheckCmdLineFlag(argc, (const char**)argv, "device"))
    {
        cutilDeviceInit(argc, argv);
    }
    else
    {
        cutilSafeCall( cudaSetDevice(cutGetMaxGflopsDeviceId()) );
    }

    int devID;
    cudaDeviceProp props; // device information

    // query the properties of the device that was selected above
    cutilSafeCall(cudaGetDevice(&devID));
    cutilSafeCall(cudaGetDeviceProperties(&props, devID));

    // use a larger block size for Fermi and above
    // NOTE(review): WA/HA/WB/HB/WC/HC come from matrixMul.h; they may expand
    // in terms of a local variable named block_size, so keep this name.
    int block_size = (props.major < 2) ? 16 : 32;

    printf("Device %d: \"%s\" with Compute %d.%d capability\n", devID, props.name, props.major, props.minor);

    // set seed for rand()
    srand(2006);

    // Optional Command-line multiplier for matrix sizes
    unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
    int iSizeMultiple = 5;
    shrGetCmdLineArgumenti(argc, (const char**)argv, "sizemult", &iSizeMultiple);
    iSizeMultiple = CLAMP(iSizeMultiple, 1, 10);

    // For GPUs with fewer # of SM's, we limit the maximum size of the matrix
    if (props.multiProcessorCount <= 4) {
        uiWA = 2 * block_size * iSizeMultiple;
        uiHA = 4 * block_size * iSizeMultiple;
        uiWB = 2 * block_size * iSizeMultiple;
        uiHB = 4 * block_size * iSizeMultiple;
        uiWC = 2 * block_size * iSizeMultiple;
        uiHC = 4 * block_size * iSizeMultiple;
    } else {
        uiWA = WA * iSizeMultiple;
        uiHA = HA * iSizeMultiple;
        uiWB = WB * iSizeMultiple;
        uiHB = HB * iSizeMultiple;
        uiWC = WC * iSizeMultiple;
        uiHC = HC * iSizeMultiple;
    }
    shrLog("\nUsing Matrix Sizes: A(%u x %u), B(%u x %u), C(%u x %u)\n\n",
            uiWA, uiHA, uiWB, uiHB, uiWC, uiHC);

    // element counts and byte sizes of the three matrices
    unsigned int size_A = uiWA * uiHA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    unsigned int size_B = uiWB * uiHB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    unsigned int size_C = uiWC * uiHC;
    unsigned int mem_size_C = sizeof(float) * size_C;

    // allocate host memory for inputs A and B and for the result C
    float* h_A = allocHostMatrix(mem_size_A, "matrix A");
    float* h_B = allocHostMatrix(mem_size_B, "matrix B");
    float* h_C = allocHostMatrix(mem_size_C, "matrix C");

    // initialize host memory (A first, then B, to keep the rand() sequence)
    randomInit(h_A, size_A);
    randomInit(h_B, size_B);

    // allocate device memory
    float* d_A, *d_B, *d_C;
    cutilSafeCall(cudaMalloc((void**) &d_A, mem_size_A));
    cutilSafeCall(cudaMalloc((void**) &d_B, mem_size_B));

    // copy host memory to device
    cutilSafeCall(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice) );
    cutilSafeCall(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice) );

    cutilSafeCall(cudaMalloc((void**) &d_C, mem_size_C));

    // setup execution parameters: one thread per element of C.
    // The integer division assumes the C dimensions are multiples of
    // block_size, which holds for the sizes computed in the small-GPU branch
    // above (TODO confirm for the WC/HC macros).
    dim3 threads(block_size, block_size);
    dim3 grid(uiWC / threads.x, uiHC / threads.y);

    unsigned int timer_matrixMul = 0;

    // number of timed kernel launches
    int nIter = 30;

    // Warm-up launch (not timed); check it so that a bad launch
    // configuration is reported before any timing is done.
    launchMatrixMul(block_size, grid, threads, d_C, d_A, d_B, uiWA, uiWB);
    cutilCheckMsg("CUDA matrixMul Kernel execution failed");
    cutilDeviceSynchronize();

    // Start Timing
    cutilCheckError(cutCreateTimer(&timer_matrixMul));
    cutilCheckError(cutStartTimer(timer_matrixMul));
    for (int j = 0; j < nIter; j++) {
        launchMatrixMul(block_size, grid, threads, d_C, d_A, d_B, uiWA, uiWB);
    }
    // check if kernel execution generated an error
    cutilCheckMsg("CUDA matrixMul Kernel execution failed");

    // wait for all launches to finish before stopping the timer
    cutilDeviceSynchronize();
    cutilCheckError(cutStopTimer(timer_matrixMul));

    // timer value is divided by 1000 * nIter to get seconds per launch
    double dSeconds = cutGetTimerValue(timer_matrixMul)/((double)nIter * 1000.0);
    // one multiply and one add per (row of A, column of B, inner index)
    double dNumOps = 2.0 * (double)uiWA * (double)uiHA * (double)uiWB;
    double gflops = 1.0e-9 * dNumOps/dSeconds; // converts to gflops

    // Log throughput, etc
    shrLogEx(LOGBOTH | MASTER, 0, "> CUDA matrixMul Throughput = %.4f GFlop/s, Time = %.5f s, Size = %.0f Ops, ",
            gflops, dSeconds, dNumOps);
    shrLogEx(LOGBOTH | MASTER, 0, "NumDevsUsed = %d, Workgroup = %u\n", 1, threads.x * threads.y);

    cutilCheckError(cutDeleteTimer(timer_matrixMul));

    // copy result from device to host
    cutilSafeCall(cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost) );

    // compute reference solution on the host
    shrLog("\nComparing GPU results with Host computation...\n\n");
    float* reference = allocHostMatrix(mem_size_C, "reference matrix");
    computeGold(reference, h_A, h_B, uiHA, uiWA, uiWB);

    // check result (matrixMul) and compare them
    printf("Comparing CUDA matrixMul & Host results\n");
    shrBOOL resCUDA = shrCompareL2fe(reference, h_C, size_C, 1.0e-6f);
    if (resCUDA != shrTRUE)
    {
        printDiff(reference, h_C, uiWC, uiHC, 100, 1.0e-5f);
    }
    shrLog("CUDA matrixMul compares %s\n\n", (shrTRUE == resCUDA) ? "OK" : "FAIL");

    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    free(reference);
    cutilSafeCall(cudaFree(d_A));
    cutilSafeCall(cudaFree(d_B));
    cutilSafeCall(cudaFree(d_C));

    cutilDeviceReset();
    shrQAFinishExit(argc, (const char **)argv, (resCUDA == shrTRUE) ? QA_PASSED : QA_FAILED);
}

// Allocates a matrix with random float entries.
// Fills the first `size` entries of `data` with pseudo-random floats in
// [0, 1], drawn from rand(). Does nothing when `size` is zero or negative.
void randomInit(float* data, int size)
{
    const float divisor = (float)RAND_MAX;

    int idx = 0;
    while (idx < size)
    {
        data[idx] = rand() / divisor;
        ++idx;
    }
}

// Logs the element-wise differences between two width x height row-major
// float arrays.
//   data1, data2 : the arrays to compare (logged as "CPU" and "GPU" values)
//   width/height : dimensions of both arrays
//   iListLength  : maximum number of differing elements to list
//   fListTol     : absolute difference above which an element counts as an error
// Every element exceeding the tolerance is counted; only the first
// iListLength of them are listed. The total count is logged at the end.
void printDiff(float *data1, float *data2, int width, int height, int iListLength, float fListTol)
{
    shrLog("Listing first %d Differences > %.6f...\n", iListLength, fListTol);

    int mismatches = 0;
    for (int row = 0; row < height; ++row)
    {
        // Row headers are printed only while the listing limit is not reached.
        if (mismatches < iListLength)
            shrLog("\n  Row %d:\n", row);

        const int rowBase = row * width;
        for (int col = 0; col < width; ++col)
        {
            const int idx = rowBase + col;
            float fDiff = fabs(data1[idx] - data2[idx]);

            // Skip elements that are within tolerance.
            if (!(fDiff > fListTol))
                continue;

            if (mismatches < iListLength)
                shrLog("    Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", col, row, data1[idx], data2[idx], fDiff);
            ++mismatches;
        }
    }

    shrLog(" \n  Total Errors = %d\n\n", mismatches);
}

You probably don’t have permission to write or overwrite that file.