CUDA code cannot run with an older toolkit version

My CUDA code cannot run because of a problem with “cublas.lib”.

My teacher's classroom uses Microsoft Visual Studio 2013 (an old version).

Please help me solve this problem; it is important for my studies.

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <conio.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <time.h>
#include <windows.h>  

// Tile edge length: matMultCUDA declares BLOCK_SIZE x BLOCK_SIZE shared-memory
// tiles and walks the matrices in BLOCK_SIZE-wide steps.
#define BLOCK_SIZE 16
// Not referenced by any of the code shown in this listing.
#define NUM_THREADS 256
// Expands to nothing, so a cublascall(...) statement would discard both
// arguments; no use of it appears in this listing.
#define cublascall(res, str) 

// Event pair shared by d_CUDATimerStart() and d_CUDATimerStop(): the former
// creates both and records the start event, the latter records the stop event,
// reads the elapsed time and destroys both.
static cudaEvent_t cu_TimerStart;
static cudaEvent_t cu_TimerStop;

// Begin a GPU timing interval: create the two file-level timer events and
// record the start event. Return codes of the CUDA calls are not inspected.
void d_CUDATimerStart(void)
{
	cudaEvent_t* const timerEvents[2] = { &cu_TimerStart, &cu_TimerStop };

	for (int k = 0; k < 2; ++k)
	{
		cudaEventCreate(timerEvents[k]);
	}

	// Stream 0 is what cudaEventRecord uses when no stream is given.
	cudaEventRecord(cu_TimerStart, 0);
}

// End the GPU timing interval opened by d_CUDATimerStart().
//
// Records the stop event, waits for it, and returns the time between the start
// and stop events in milliseconds. Both timer events are destroyed before
// returning, whether or not the measurement succeeded.
//
// Returns 0.0f if recording, waiting on, or querying the events fails, so the
// caller never receives an uninitialised value.
float d_CUDATimerStop(void)
{
	float ms = 0.0f;

	if (cudaEventRecord(cu_TimerStop) != cudaSuccess ||
	    cudaEventSynchronize(cu_TimerStop) != cudaSuccess ||
	    cudaEventElapsedTime(&ms, cu_TimerStart, cu_TimerStop) != cudaSuccess)
	{
		// Do not trust whatever a failed query may have left in ms.
		ms = 0.0f;
	}

	cudaEventDestroy(cu_TimerStart);
	cudaEventDestroy(cu_TimerStop);
	return ms;
}

// Invert an n x n single-precision matrix on the GPU using the cuBLAS batched
// LU routines with a batch size of 1.
//
// L : host pointer to n*n floats; only read (copied to the device).
// n : matrix dimension, must be positive.
//
// Returns a malloc()'d host buffer of n*n floats holding the inverse, in the
// same element order as L. The caller owns the buffer and must free() it.
// Returns NULL after printing a diagnostic to stderr when an argument is
// invalid, an allocation or CUDA/cuBLAS call fails, or the matrix is singular.
//
// Layout note: cuBLAS treats the buffer as column-major, i.e. as the transpose
// of a row-major matrix. Because inv(A^T) == inv(A)^T, the result is the
// inverse in whichever of the two conventions the caller used for L.
//
// The factorisation and inversion are bracketed by d_CUDATimerStart() and
// d_CUDATimerStop(); the measured time is not reported.
float* d_GetInv(float* L, int n)
{
	cublasHandle_t cu_cublasHandle;
	int haveHandle = 0;
	int timerRunning = 0;
	int ok = 0;
	int info = 0;
	size_t szA = 0;
	cudaError_t cerr = cudaSuccess;
	cublasStatus_t berr = CUBLAS_STATUS_SUCCESS;
	const char* what = "";

	float** adL = NULL;   // device array holding the single pointer dL
	float** adC = NULL;   // device array holding the single pointer dC
	float* dL = NULL;     // device copy of L, overwritten by its LU factors
	float* dC = NULL;     // device buffer receiving the inverse
	int* dLUPivots = NULL;
	int* dLUInfo = NULL;
	float* res = NULL;

	if (L == NULL || n <= 0)
	{
		fprintf(stderr, "d_GetInv: invalid argument (n = %d)\n", n);
		return NULL;
	}

	// Widen before multiplying so that n * n cannot overflow int.
	szA = (size_t)n * (size_t)n * sizeof(float);

	what = "host allocation";
	res = (float*)malloc(szA);
	if (res == NULL) goto cleanup;

	what = "cublasCreate";
	berr = cublasCreate(&cu_cublasHandle);
	if (berr != CUBLAS_STATUS_SUCCESS) goto cleanup;
	haveHandle = 1;

	what = "cudaMalloc";
	if ((cerr = cudaMalloc(&adL, sizeof(float*))) != cudaSuccess) goto cleanup;
	if ((cerr = cudaMalloc(&adC, sizeof(float*))) != cudaSuccess) goto cleanup;
	if ((cerr = cudaMalloc(&dL, szA)) != cudaSuccess) goto cleanup;
	if ((cerr = cudaMalloc(&dC, szA)) != cudaSuccess) goto cleanup;
	if ((cerr = cudaMalloc(&dLUPivots, (size_t)n * sizeof(int))) != cudaSuccess) goto cleanup;
	if ((cerr = cudaMalloc(&dLUInfo, sizeof(int))) != cudaSuccess) goto cleanup;

	what = "cudaMemcpy (host to device)";
	if ((cerr = cudaMemcpy(dL, L, szA, cudaMemcpyHostToDevice)) != cudaSuccess) goto cleanup;
	// The batched API takes device arrays of matrix pointers, hence the
	// one-element pointer arrays adL / adC.
	if ((cerr = cudaMemcpy(adL, &dL, sizeof(float*), cudaMemcpyHostToDevice)) != cudaSuccess) goto cleanup;
	if ((cerr = cudaMemcpy(adC, &dC, sizeof(float*), cudaMemcpyHostToDevice)) != cudaSuccess) goto cleanup;

	d_CUDATimerStart();
	timerRunning = 1;

	what = "cublasSgetrfBatched";
	berr = cublasSgetrfBatched(cu_cublasHandle, n, adL, n, dLUPivots, dLUInfo, 1);
	if (berr != CUBLAS_STATUS_SUCCESS) goto cleanup;

	// This blocking copy also waits for the factorisation to finish.
	what = "cudaMemcpy (LU info)";
	cerr = cudaMemcpy(&info, dLUInfo, sizeof(int), cudaMemcpyDeviceToHost);
	if (cerr != cudaSuccess) goto cleanup;
	if (info != 0)
	{
		what = "LU factorisation (matrix is singular or an argument was rejected)";
		goto cleanup;
	}

	what = "cublasSgetriBatched";
	berr = cublasSgetriBatched(cu_cublasHandle, n, (const float **)adL, n, dLUPivots, adC, n, dLUInfo, 1);
	if (berr != CUBLAS_STATUS_SUCCESS) goto cleanup;

	// Stopping waits on the stop event and destroys the timer events; the
	// elapsed time itself is not used.
	(void)d_CUDATimerStop();
	timerRunning = 0;

	what = "cudaMemcpy (inversion info)";
	cerr = cudaMemcpy(&info, dLUInfo, sizeof(int), cudaMemcpyDeviceToHost);
	if (cerr != cudaSuccess) goto cleanup;
	if (info != 0)
	{
		what = "inversion (matrix is singular)";
		goto cleanup;
	}

	what = "cudaMemcpy (device to host)";
	cerr = cudaMemcpy(res, dC, szA, cudaMemcpyDeviceToHost);
	if (cerr != cudaSuccess) goto cleanup;

	ok = 1;

cleanup:
	if (timerRunning)
	{
		// Release the timer events on the error path as well.
		(void)d_CUDATimerStop();
	}

	if (!ok)
	{
		if (cerr != cudaSuccess)
		{
			fprintf(stderr, "d_GetInv: %s failed: %s\n", what, cudaGetErrorString(cerr));
		}
		else if (berr != CUBLAS_STATUS_SUCCESS)
		{
			fprintf(stderr, "d_GetInv: %s failed with cuBLAS status %d\n", what, (int)berr);
		}
		else
		{
			fprintf(stderr, "d_GetInv: %s failed (info = %d)\n", what, info);
		}
		free(res);
		res = NULL;
	}

	// cudaFree accepts a null pointer, so buffers that were never allocated
	// need no special handling.
	cudaFree(adL);
	cudaFree(adC);
	cudaFree(dL);
	cudaFree(dC);
	cudaFree(dLUPivots);
	cudaFree(dLUInfo);

	if (haveHandle)
	{
		cublasDestroy(cu_cublasHandle);
	}

	return res;
}

// Tiled single-precision matrix product c = a * b for n x n matrices.
//
// a, b, c       : device pointers; element (r, k) of a is read as a[r * lda + k],
//                 and likewise for b and c, so the leading dimensions are in
//                 elements.
// lda, ldb, ldc : leading dimensions of a, b and c.
// n             : matrix dimension.
//
// Launch requirements: blockDim must be (BLOCK_SIZE, BLOCK_SIZE), because each
// thread loads exactly one element of each shared tile; the grid must cover
// ceil(n / BLOCK_SIZE) blocks in both x and y. Uses 2 * BLOCK_SIZE * BLOCK_SIZE
// floats of static shared memory.
//
// Each dot product is accumulated with Kahan compensated summation. Do not
// build this kernel with fast-math style reassociation, which could cancel the
// compensation term.
//
// No launch of this kernel appears in this listing.
__global__ static void matMultCUDA(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
	__shared__ float matA[BLOCK_SIZE][BLOCK_SIZE];
	__shared__ float matB[BLOCK_SIZE][BLOCK_SIZE];

	const int tidc = threadIdx.x;
	const int tidr = threadIdx.y;
	const int bidc = blockIdx.x * BLOCK_SIZE;
	const int bidr = blockIdx.y * BLOCK_SIZE;
	const int row = tidr + bidr;   // output row handled by this thread
	const int col = tidc + bidc;   // output column handled by this thread

	int i, j;

	float results = 0.0f;   // running sum of the dot product
	float comp = 0.0f;      // Kahan compensation: low-order bits lost so far

	for (j = 0; j < n; j += BLOCK_SIZE)
	{
		// Stage one tile of a and one tile of b; positions outside the matrix
		// are zero-filled so they contribute nothing to the sum.
		if (row < n && tidc + j < n)
		{
			matA[tidr][tidc] = a[(size_t)row * lda + tidc + j];
		}
		else
		{
			matA[tidr][tidc] = 0.0f;
		}

		if (tidr + j < n && col < n)
		{
			matB[tidr][tidc] = b[(size_t)(tidr + j) * ldb + col];
		}
		else
		{
			matB[tidr][tidc] = 0.0f;
		}

		// Every thread reaches this barrier: tiles must be complete before use.
		__syncthreads();

		for (i = 0; i < BLOCK_SIZE; i++)
		{
			const float y = matA[tidr][i] * matB[i][tidc] - comp;
			const float t = results + y;
			comp = (t - results) - y;
			results = t;
		}

		// Do not overwrite the tiles while other threads are still reading them.
		__syncthreads();
	}

	if (row < n && col < n)
	{
		c[(size_t)row * ldc + col] = results;
	}
}

// Host reference matrix product c = a * b for n x n matrices.
//
// Element (r, k) of a is a[r * lda + k], and likewise for b (ldb) and c (ldc).
// Each product is formed in float and summed in a double accumulator; the sum
// is converted back to float when stored.
void matmult(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
	for (int row = 0; row < n; ++row)
	{
		const float* aRow = a + row * lda;
		float* cRow = c + row * ldc;

		for (int col = 0; col < n; ++col)
		{
			const float* bCol = b + col;
			double acc = 0;

			for (int k = 0; k < n; ++k)
			{
				acc += aRow[k] * bCol[k * ldb];
			}

			cRow[col] = acc;
		}
	}
}

// Print an n x n matrix stored row by row: tab-separated entries, one matrix
// row per output line.
static void printSquareMatrix(const float* m, int n)
{
	for (int i = 0; i < n; i++)
	{
		for (int j = 0; j < n; j++)
		{
			// Four decimals: the values handled here are mostly fractions, so
			// a precision of zero would print nothing but 0 and 1.
			printf("%.4f\t", m[(size_t)i * n + j]);
		}
		printf("\n");
	}
}

// Demo driver: reads the matrix size, fills a random n x n matrix, inverts it
// on the GPU with d_GetInv(), and prints the matrix, its inverse and their
// product (computed on the host by matmult) so the result can be checked by eye.
//
// Returns 0 on success and 1 when the input is invalid, memory cannot be
// allocated, or the inversion fails. Waits for a key press before exiting in
// either case so the console output stays visible.
int main()
{
	int n = 0;
	int status = 1;
	float* L = NULL;
	float* c = NULL;
	float* inv = NULL;

	printf("Please input matrix number:");
	if (scanf("%d", &n) != 1 || n <= 0)
	{
		fprintf(stderr, "\nMatrix size must be a positive integer.\n");
	}
	else
	{
		// Widen before multiplying so that n * n cannot overflow int.
		const size_t count = (size_t)n * (size_t)n;

		printf("\n");

		L = (float*)malloc(count * sizeof(float));
		c = (float*)malloc(count * sizeof(float));

		if (L == NULL || c == NULL)
		{
			fprintf(stderr, "Out of host memory for a %d x %d matrix.\n", n, n);
		}
		else
		{
			for (size_t k = 0; k < count; k++)
			{
				// Two draws: a coarse value in [0, 1] plus a much smaller
				// fraction. The divisor is formed in float because
				// RAND_MAX * RAND_MAX overflows int when RAND_MAX is INT_MAX.
				const float coarse = (float)rand() / (float)RAND_MAX;
				const float fine = (float)rand() / ((float)RAND_MAX * (float)RAND_MAX);
				L[k] = coarse + fine;
			}
			printSquareMatrix(L, n);

			inv = d_GetInv(L, n);
			if (inv == NULL)
			{
				fprintf(stderr, "Matrix inversion failed.\n");
			}
			else
			{
				printf("\n");
				printSquareMatrix(inv, n);

				// L * inv should be close to the identity matrix.
				matmult(L, n, inv, n, c, n, n);
				printf("\n");
				printSquareMatrix(c, n);

				printf("\n");
				printf("verify");
				status = 0;
			}
		}
	}

	free(inv);
	free(c);
	free(L);

	_getch();

	return status;
}

https://stackoverflow.com/questions/13570285/how-to-link-library-e-g-cublas-cusparse-for-cuda-on-windows/13588857#13588857

My CUDA code runs in Visual Studio 2013 with CUDA 9.1 after adding cublas.lib,
but it does not run in Visual Studio 2013 with CUDA 8, even after adding cublas.lib.

Please let me know how to solve this problem

You can try my CUDA code below:

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <conio.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <time.h>
#include <windows.h>  

// Tile edge length: matMultCUDA declares BLOCK_SIZE x BLOCK_SIZE shared-memory
// tiles and walks the matrices in BLOCK_SIZE-wide steps.
#define BLOCK_SIZE 16
// Not referenced by any of the code shown in this listing.
#define NUM_THREADS 256
// Expands to nothing, so a cublascall(...) statement would discard both
// arguments; no use of it appears in this listing.
#define cublascall(res, str) 

// Event pair shared by d_CUDATimerStart() and d_CUDATimerStop(): the former
// creates both and records the start event, the latter records the stop event,
// reads the elapsed time and destroys both.
static cudaEvent_t cu_TimerStart;
static cudaEvent_t cu_TimerStop;

// Begin a GPU timing interval: create the two file-level timer events and
// record the start event. Return codes of the CUDA calls are not inspected.
void d_CUDATimerStart(void)
{
	cudaEvent_t* const timerEvents[2] = { &cu_TimerStart, &cu_TimerStop };

	for (int k = 0; k < 2; ++k)
	{
		cudaEventCreate(timerEvents[k]);
	}

	// Stream 0 is what cudaEventRecord uses when no stream is given.
	cudaEventRecord(cu_TimerStart, 0);
}

// End the GPU timing interval opened by d_CUDATimerStart().
//
// Records the stop event, waits for it, and returns the time between the start
// and stop events in milliseconds. Both timer events are destroyed before
// returning, whether or not the measurement succeeded.
//
// Returns 0.0f if recording, waiting on, or querying the events fails, so the
// caller never receives an uninitialised value.
float d_CUDATimerStop(void)
{
	float ms = 0.0f;

	if (cudaEventRecord(cu_TimerStop) != cudaSuccess ||
	    cudaEventSynchronize(cu_TimerStop) != cudaSuccess ||
	    cudaEventElapsedTime(&ms, cu_TimerStart, cu_TimerStop) != cudaSuccess)
	{
		// Do not trust whatever a failed query may have left in ms.
		ms = 0.0f;
	}

	cudaEventDestroy(cu_TimerStart);
	cudaEventDestroy(cu_TimerStop);
	return ms;
}

// Invert an n x n single-precision matrix on the GPU using the cuBLAS batched
// LU routines with a batch size of 1.
//
// L : host pointer to n*n floats; only read (copied to the device).
// n : matrix dimension, must be positive.
//
// Returns a malloc()'d host buffer of n*n floats holding the inverse, in the
// same element order as L. The caller owns the buffer and must free() it.
// Returns NULL after printing a diagnostic to stderr when an argument is
// invalid, an allocation or CUDA/cuBLAS call fails, or the matrix is singular.
//
// Layout note: cuBLAS treats the buffer as column-major, i.e. as the transpose
// of a row-major matrix. Because inv(A^T) == inv(A)^T, the result is the
// inverse in whichever of the two conventions the caller used for L.
//
// The factorisation and inversion are bracketed by d_CUDATimerStart() and
// d_CUDATimerStop(); the measured time is not reported.
float* d_GetInv(float* L, int n)
{
	cublasHandle_t cu_cublasHandle;
	int haveHandle = 0;
	int timerRunning = 0;
	int ok = 0;
	int info = 0;
	size_t szA = 0;
	cudaError_t cerr = cudaSuccess;
	cublasStatus_t berr = CUBLAS_STATUS_SUCCESS;
	const char* what = "";

	float** adL = NULL;   // device array holding the single pointer dL
	float** adC = NULL;   // device array holding the single pointer dC
	float* dL = NULL;     // device copy of L, overwritten by its LU factors
	float* dC = NULL;     // device buffer receiving the inverse
	int* dLUPivots = NULL;
	int* dLUInfo = NULL;
	float* res = NULL;

	if (L == NULL || n <= 0)
	{
		fprintf(stderr, "d_GetInv: invalid argument (n = %d)\n", n);
		return NULL;
	}

	// Widen before multiplying so that n * n cannot overflow int.
	szA = (size_t)n * (size_t)n * sizeof(float);

	what = "host allocation";
	res = (float*)malloc(szA);
	if (res == NULL) goto cleanup;

	what = "cublasCreate";
	berr = cublasCreate(&cu_cublasHandle);
	if (berr != CUBLAS_STATUS_SUCCESS) goto cleanup;
	haveHandle = 1;

	what = "cudaMalloc";
	if ((cerr = cudaMalloc(&adL, sizeof(float*))) != cudaSuccess) goto cleanup;
	if ((cerr = cudaMalloc(&adC, sizeof(float*))) != cudaSuccess) goto cleanup;
	if ((cerr = cudaMalloc(&dL, szA)) != cudaSuccess) goto cleanup;
	if ((cerr = cudaMalloc(&dC, szA)) != cudaSuccess) goto cleanup;
	if ((cerr = cudaMalloc(&dLUPivots, (size_t)n * sizeof(int))) != cudaSuccess) goto cleanup;
	if ((cerr = cudaMalloc(&dLUInfo, sizeof(int))) != cudaSuccess) goto cleanup;

	what = "cudaMemcpy (host to device)";
	if ((cerr = cudaMemcpy(dL, L, szA, cudaMemcpyHostToDevice)) != cudaSuccess) goto cleanup;
	// The batched API takes device arrays of matrix pointers, hence the
	// one-element pointer arrays adL / adC.
	if ((cerr = cudaMemcpy(adL, &dL, sizeof(float*), cudaMemcpyHostToDevice)) != cudaSuccess) goto cleanup;
	if ((cerr = cudaMemcpy(adC, &dC, sizeof(float*), cudaMemcpyHostToDevice)) != cudaSuccess) goto cleanup;

	d_CUDATimerStart();
	timerRunning = 1;

	what = "cublasSgetrfBatched";
	berr = cublasSgetrfBatched(cu_cublasHandle, n, adL, n, dLUPivots, dLUInfo, 1);
	if (berr != CUBLAS_STATUS_SUCCESS) goto cleanup;

	// This blocking copy also waits for the factorisation to finish.
	what = "cudaMemcpy (LU info)";
	cerr = cudaMemcpy(&info, dLUInfo, sizeof(int), cudaMemcpyDeviceToHost);
	if (cerr != cudaSuccess) goto cleanup;
	if (info != 0)
	{
		what = "LU factorisation (matrix is singular or an argument was rejected)";
		goto cleanup;
	}

	what = "cublasSgetriBatched";
	berr = cublasSgetriBatched(cu_cublasHandle, n, (const float **)adL, n, dLUPivots, adC, n, dLUInfo, 1);
	if (berr != CUBLAS_STATUS_SUCCESS) goto cleanup;

	// Stopping waits on the stop event and destroys the timer events; the
	// elapsed time itself is not used.
	(void)d_CUDATimerStop();
	timerRunning = 0;

	what = "cudaMemcpy (inversion info)";
	cerr = cudaMemcpy(&info, dLUInfo, sizeof(int), cudaMemcpyDeviceToHost);
	if (cerr != cudaSuccess) goto cleanup;
	if (info != 0)
	{
		what = "inversion (matrix is singular)";
		goto cleanup;
	}

	what = "cudaMemcpy (device to host)";
	cerr = cudaMemcpy(res, dC, szA, cudaMemcpyDeviceToHost);
	if (cerr != cudaSuccess) goto cleanup;

	ok = 1;

cleanup:
	if (timerRunning)
	{
		// Release the timer events on the error path as well.
		(void)d_CUDATimerStop();
	}

	if (!ok)
	{
		if (cerr != cudaSuccess)
		{
			fprintf(stderr, "d_GetInv: %s failed: %s\n", what, cudaGetErrorString(cerr));
		}
		else if (berr != CUBLAS_STATUS_SUCCESS)
		{
			fprintf(stderr, "d_GetInv: %s failed with cuBLAS status %d\n", what, (int)berr);
		}
		else
		{
			fprintf(stderr, "d_GetInv: %s failed (info = %d)\n", what, info);
		}
		free(res);
		res = NULL;
	}

	// cudaFree accepts a null pointer, so buffers that were never allocated
	// need no special handling.
	cudaFree(adL);
	cudaFree(adC);
	cudaFree(dL);
	cudaFree(dC);
	cudaFree(dLUPivots);
	cudaFree(dLUInfo);

	if (haveHandle)
	{
		cublasDestroy(cu_cublasHandle);
	}

	return res;
}

// Tiled single-precision matrix product c = a * b for n x n matrices.
//
// a, b, c       : device pointers; element (r, k) of a is read as a[r * lda + k],
//                 and likewise for b and c, so the leading dimensions are in
//                 elements.
// lda, ldb, ldc : leading dimensions of a, b and c.
// n             : matrix dimension.
//
// Launch requirements: blockDim must be (BLOCK_SIZE, BLOCK_SIZE), because each
// thread loads exactly one element of each shared tile; the grid must cover
// ceil(n / BLOCK_SIZE) blocks in both x and y. Uses 2 * BLOCK_SIZE * BLOCK_SIZE
// floats of static shared memory.
//
// Each dot product is accumulated with Kahan compensated summation. Do not
// build this kernel with fast-math style reassociation, which could cancel the
// compensation term.
//
// No launch of this kernel appears in this listing.
__global__ static void matMultCUDA(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
	__shared__ float matA[BLOCK_SIZE][BLOCK_SIZE];
	__shared__ float matB[BLOCK_SIZE][BLOCK_SIZE];

	const int tidc = threadIdx.x;
	const int tidr = threadIdx.y;
	const int bidc = blockIdx.x * BLOCK_SIZE;
	const int bidr = blockIdx.y * BLOCK_SIZE;
	const int row = tidr + bidr;   // output row handled by this thread
	const int col = tidc + bidc;   // output column handled by this thread

	int i, j;

	float results = 0.0f;   // running sum of the dot product
	float comp = 0.0f;      // Kahan compensation: low-order bits lost so far

	for (j = 0; j < n; j += BLOCK_SIZE)
	{
		// Stage one tile of a and one tile of b; positions outside the matrix
		// are zero-filled so they contribute nothing to the sum.
		if (row < n && tidc + j < n)
		{
			matA[tidr][tidc] = a[(size_t)row * lda + tidc + j];
		}
		else
		{
			matA[tidr][tidc] = 0.0f;
		}

		if (tidr + j < n && col < n)
		{
			matB[tidr][tidc] = b[(size_t)(tidr + j) * ldb + col];
		}
		else
		{
			matB[tidr][tidc] = 0.0f;
		}

		// Every thread reaches this barrier: tiles must be complete before use.
		__syncthreads();

		for (i = 0; i < BLOCK_SIZE; i++)
		{
			const float y = matA[tidr][i] * matB[i][tidc] - comp;
			const float t = results + y;
			comp = (t - results) - y;
			results = t;
		}

		// Do not overwrite the tiles while other threads are still reading them.
		__syncthreads();
	}

	if (row < n && col < n)
	{
		c[(size_t)row * ldc + col] = results;
	}
}

// Host reference matrix product c = a * b for n x n matrices.
//
// Element (r, k) of a is a[r * lda + k], and likewise for b (ldb) and c (ldc).
// Each product is formed in float and summed in a double accumulator; the sum
// is converted back to float when stored.
void matmult(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
	for (int row = 0; row < n; ++row)
	{
		const float* aRow = a + row * lda;
		float* cRow = c + row * ldc;

		for (int col = 0; col < n; ++col)
		{
			const float* bCol = b + col;
			double acc = 0;

			for (int k = 0; k < n; ++k)
			{
				acc += aRow[k] * bCol[k * ldb];
			}

			cRow[col] = acc;
		}
	}
}

// Print an n x n matrix stored row by row: tab-separated entries, one matrix
// row per output line.
static void printSquareMatrix(const float* m, int n)
{
	for (int i = 0; i < n; i++)
	{
		for (int j = 0; j < n; j++)
		{
			// Four decimals: the values handled here are mostly fractions, so
			// a precision of zero would print nothing but 0 and 1.
			printf("%.4f\t", m[(size_t)i * n + j]);
		}
		printf("\n");
	}
}

// Demo driver: reads the matrix size, fills a random n x n matrix, inverts it
// on the GPU with d_GetInv(), and prints the matrix, its inverse and their
// product (computed on the host by matmult) so the result can be checked by eye.
//
// Returns 0 on success and 1 when the input is invalid, memory cannot be
// allocated, or the inversion fails. Waits for a key press before exiting in
// either case so the console output stays visible.
int main()
{
	int n = 0;
	int status = 1;
	float* L = NULL;
	float* c = NULL;
	float* inv = NULL;

	printf("Please input matrix number:");
	if (scanf("%d", &n) != 1 || n <= 0)
	{
		fprintf(stderr, "\nMatrix size must be a positive integer.\n");
	}
	else
	{
		// Widen before multiplying so that n * n cannot overflow int.
		const size_t count = (size_t)n * (size_t)n;

		printf("\n");

		L = (float*)malloc(count * sizeof(float));
		c = (float*)malloc(count * sizeof(float));

		if (L == NULL || c == NULL)
		{
			fprintf(stderr, "Out of host memory for a %d x %d matrix.\n", n, n);
		}
		else
		{
			for (size_t k = 0; k < count; k++)
			{
				// Two draws: a coarse value in [0, 1] plus a much smaller
				// fraction. The divisor is formed in float because
				// RAND_MAX * RAND_MAX overflows int when RAND_MAX is INT_MAX.
				const float coarse = (float)rand() / (float)RAND_MAX;
				const float fine = (float)rand() / ((float)RAND_MAX * (float)RAND_MAX);
				L[k] = coarse + fine;
			}
			printSquareMatrix(L, n);

			inv = d_GetInv(L, n);
			if (inv == NULL)
			{
				fprintf(stderr, "Matrix inversion failed.\n");
			}
			else
			{
				printf("\n");
				printSquareMatrix(inv, n);

				// L * inv should be close to the identity matrix.
				matmult(L, n, inv, n, c, n, n);
				printf("\n");
				printSquareMatrix(c, n);

				printf("\n");
				printf("verify");
				status = 0;
			}
		}
	}

	free(inv);
	free(c);
	free(L);

	_getch();

	return status;
}

Without any error messages posted, it is not obvious what the cause may be. If you have it running under 9.1, just work off that unless there’s a specific reason you need the previous version of the toolkit.

You might need to expand the error verbosity in Visual Studio to figure out why it’s failing:

https://blogs.msdn.microsoft.com/msbuild/2005/09/28/cranking-up-the-build-verbosity-in-the-ide/

.

That is because my teacher's classroom uses Visual Studio 2013 with CUDA 8 (an old version).

.