Cuda programming code cannot run in old version

tronorinoyeong · June 18, 2018, 3:48pm

My CUDA program cannot run because of “cublas.lib”.

My teacher's class uses Microsoft Visual Studio 2013 (an old version).

Please help me solve this problem; it is important for my studies.

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <conio.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <time.h>
#include <windows.h>  

// Edge length of the square shared-memory tiles declared in matMultCUDA.
#define BLOCK_SIZE 16
// Not referenced by any of the code shown in this listing.
#define NUM_THREADS 256
// Has an empty replacement list, so it expands to nothing; not used in this listing.
#define cublascall(res, str) 

// CUDA events shared by d_CUDATimerStart() and d_CUDATimerStop().
static cudaEvent_t cu_TimerStart;
static cudaEvent_t cu_TimerStop;

/*
 * Starts the GPU timer: creates the cu_TimerStart / cu_TimerStop events and
 * records the start event. Pair every call with one d_CUDATimerStop() call,
 * which is where the events are destroyed.
 *
 * Failures of the CUDA runtime calls are reported on stderr instead of being
 * silently dropped; the call sequence itself is unchanged.
 */
void d_CUDATimerStart(void)
{
	cudaError_t err;

	err = cudaEventCreate(&cu_TimerStart);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "d_CUDATimerStart: cudaEventCreate(start) failed: %s\n", cudaGetErrorString(err));
	}

	err = cudaEventCreate(&cu_TimerStop);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "d_CUDATimerStart: cudaEventCreate(stop) failed: %s\n", cudaGetErrorString(err));
	}

	err = cudaEventRecord(cu_TimerStart);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "d_CUDATimerStart: cudaEventRecord failed: %s\n", cudaGetErrorString(err));
	}
}

/*
 * Stops the GPU timer started by d_CUDATimerStart().
 *
 * Records the stop event, waits for it to complete, and destroys both events.
 *
 * Returns the elapsed time in milliseconds between the two events, or 0.0f
 * (with a message on stderr) when any of the CUDA calls fails. The original
 * returned an uninitialised float in that case.
 */
float d_CUDATimerStop(void)
{
	float ms = 0.0f;
	cudaError_t err;

	err = cudaEventRecord(cu_TimerStop);
	if (err == cudaSuccess)
	{
		// The elapsed time is only defined once the stop event has completed.
		err = cudaEventSynchronize(cu_TimerStop);
	}
	if (err == cudaSuccess)
	{
		err = cudaEventElapsedTime(&ms, cu_TimerStart, cu_TimerStop);
	}
	if (err != cudaSuccess)
	{
		fprintf(stderr, "d_CUDATimerStop: timing failed: %s\n", cudaGetErrorString(err));
		ms = 0.0f;
	}

	// Always release the events, even after a failure, so repeated use does not leak them.
	cudaEventDestroy(cu_TimerStart);
	cudaEventDestroy(cu_TimerStop);
	return ms;
}

/* Reports a failed CUDA runtime call on stderr; returns true when err is cudaSuccess. */
static bool d_CudaOk(cudaError_t err, const char* what)
{
	if (err == cudaSuccess)
	{
		return true;
	}
	fprintf(stderr, "d_GetInv: %s failed: %s\n", what, cudaGetErrorString(err));
	return false;
}

/* Reports a failed cuBLAS call on stderr; returns true when st is CUBLAS_STATUS_SUCCESS. */
static bool d_CublasOk(cublasStatus_t st, const char* what)
{
	if (st == CUBLAS_STATUS_SUCCESS)
	{
		return true;
	}
	fprintf(stderr, "d_GetInv: %s failed (cuBLAS status %d)\n", what, (int)st);
	return false;
}

/*
 * Inverts an n x n single-precision matrix on the GPU with the batched cuBLAS
 * LU routines (cublasSgetrfBatched followed by cublasSgetriBatched, batch size 1).
 *
 * Parameters:
 *   L - host pointer to n * n floats (densely packed, leading dimension n).
 *   n - matrix dimension; must be positive.
 *
 * Returns a malloc()'ed host buffer of n * n floats holding the inverse, which
 * the caller must free(). Returns NULL, after printing a diagnostic to stderr,
 * when the arguments are invalid, any CUDA / cuBLAS call fails, the host
 * allocation fails, or cuBLAS reports a non-zero info value (for example, a
 * singular matrix).
 *
 * Layout note: cuBLAS interprets the buffer as column-major, so a row-major
 * input is seen as its transpose. Since inv(A^T) == inv(A)^T, the buffer
 * returned here is in the same layout convention as the input.
 */
float* d_GetInv(float* L, int n)
{
	cublasHandle_t cu_cublasHandle = NULL;
	float** adL = NULL;        // device array holding the single pointer dL
	float** adC = NULL;        // device array holding the single pointer dC
	float* dL = NULL;          // device copy of the input, overwritten by its LU factors
	float* dC = NULL;          // device buffer receiving the inverse
	int* dLUPivots = NULL;     // device pivot indices (n entries for the one matrix)
	int* dLUInfo = NULL;       // device status word written by the batched routines
	float* res = NULL;
	int hInfo = 0;
	bool haveHandle = false;
	bool timerRunning = false;
	size_t szA = 0;

	if (L == NULL || n <= 0)
	{
		fprintf(stderr, "d_GetInv: invalid arguments (L=%p, n=%d)\n", (void*)L, n);
		return NULL;
	}

	// Widen before multiplying so n * n cannot overflow int.
	szA = (size_t)n * (size_t)n * sizeof(float);

	if (!d_CublasOk(cublasCreate(&cu_cublasHandle), "cublasCreate")) goto cleanup;
	haveHandle = true;

	if (!d_CudaOk(cudaMalloc((void**)&adL, sizeof(float*)), "cudaMalloc(adL)")) goto cleanup;
	if (!d_CudaOk(cudaMalloc((void**)&adC, sizeof(float*)), "cudaMalloc(adC)")) goto cleanup;
	if (!d_CudaOk(cudaMalloc((void**)&dL, szA), "cudaMalloc(dL)")) goto cleanup;
	if (!d_CudaOk(cudaMalloc((void**)&dC, szA), "cudaMalloc(dC)")) goto cleanup;
	if (!d_CudaOk(cudaMalloc((void**)&dLUPivots, (size_t)n * sizeof(int)), "cudaMalloc(dLUPivots)")) goto cleanup;
	if (!d_CudaOk(cudaMalloc((void**)&dLUInfo, sizeof(int)), "cudaMalloc(dLUInfo)")) goto cleanup;

	if (!d_CudaOk(cudaMemcpy(dL, L, szA, cudaMemcpyHostToDevice), "cudaMemcpy(L -> dL)")) goto cleanup;
	// The batched API takes device arrays of matrix pointers, hence the copies of &dL and &dC.
	if (!d_CudaOk(cudaMemcpy(adL, &dL, sizeof(float*), cudaMemcpyHostToDevice), "cudaMemcpy(adL)")) goto cleanup;
	if (!d_CudaOk(cudaMemcpy(adC, &dC, sizeof(float*), cudaMemcpyHostToDevice), "cudaMemcpy(adC)")) goto cleanup;

	d_CUDATimerStart();
	timerRunning = true;

	// Step 1: in-place LU factorisation with partial pivoting.
	if (!d_CublasOk(cublasSgetrfBatched(cu_cublasHandle, n, adL, n, dLUPivots, dLUInfo, 1), "cublasSgetrfBatched")) goto cleanup;
	if (!d_CudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize(getrf)")) goto cleanup;
	if (!d_CudaOk(cudaMemcpy(&hInfo, dLUInfo, sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy(getrf info)")) goto cleanup;
	if (hInfo != 0)
	{
		fprintf(stderr, "d_GetInv: LU factorisation failed (info=%d); the matrix is singular or the input is invalid\n", hInfo);
		goto cleanup;
	}

	// Step 2: build the inverse from the LU factors (out of place, into dC).
	if (!d_CublasOk(cublasSgetriBatched(cu_cublasHandle, n, (const float **)adL, n, dLUPivots, adC, n, dLUInfo, 1), "cublasSgetriBatched")) goto cleanup;
	if (!d_CudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize(getri)")) goto cleanup;
	if (!d_CudaOk(cudaMemcpy(&hInfo, dLUInfo, sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy(getri info)")) goto cleanup;
	if (hInfo != 0)
	{
		fprintf(stderr, "d_GetInv: inversion failed (info=%d)\n", hInfo);
		goto cleanup;
	}

	// The elapsed time is not reported; the call still runs to release the timer events.
	(void)d_CUDATimerStop();
	timerRunning = false;

	res = (float*)malloc(szA);
	if (res == NULL)
	{
		fprintf(stderr, "d_GetInv: out of host memory (%lu bytes)\n", (unsigned long)szA);
		goto cleanup;
	}

	if (!d_CudaOk(cudaMemcpy(res, dC, szA, cudaMemcpyDeviceToHost), "cudaMemcpy(dC -> res)"))
	{
		free(res);
		res = NULL;
	}

cleanup:
	if (timerRunning)
	{
		// An early exit between start and stop would otherwise leak the two events.
		(void)d_CUDATimerStop();
	}

	// cudaFree(NULL) is a no-op, so pointers that were never allocated are safe here.
	cudaFree(adL);
	cudaFree(adC);
	cudaFree(dL);
	cudaFree(dC);
	cudaFree(dLUPivots);
	cudaFree(dLUInfo);

	if (haveHandle)
	{
		cublasDestroy(cu_cublasHandle);
	}

	return res;
}

/*
 * Tiled matrix product c = a * b for n x n row-major matrices.
 *
 * Launch layout: each block stages one BLOCK_SIZE x BLOCK_SIZE tile of a and
 * of b in shared memory and indexes those tiles with threadIdx.y / threadIdx.x,
 * so the kernel expects blockDim == (BLOCK_SIZE, BLOCK_SIZE) and a grid of
 * ceil(n / BLOCK_SIZE) blocks in both x and y.
 * Shared memory: 2 * BLOCK_SIZE * BLOCK_SIZE floats, statically allocated.
 *
 * Parameters:
 *   a, b          - device pointers to the input matrices.
 *   c             - device pointer to the output matrix.
 *   lda, ldb, ldc - row strides of a, b and c, counted in float elements.
 *   n             - matrix dimension.
 *
 * The original inner loop was empty, so every output element stayed 0. The
 * accumulation is restored here as a Kahan (compensated) sum, which is what
 * the leftover `comp` and `t` locals indicate was intended. Do not compile
 * with fast-math options that reassociate floating-point arithmetic, or the
 * compensation term is optimised away.
 */
__global__ static void matMultCUDA(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
	__shared__ float matA[BLOCK_SIZE][BLOCK_SIZE];
	__shared__ float matB[BLOCK_SIZE][BLOCK_SIZE];
	const int tidc = threadIdx.x;
	const int tidr = threadIdx.y;
	const int bidc = blockIdx.x * BLOCK_SIZE;
	const int bidr = blockIdx.y * BLOCK_SIZE;

	int i, j;

	float results = 0.0f;   // running sum for this thread's output element
	float comp = 0.0f;      // Kahan compensation: low-order bits lost so far

	for (j = 0; j < n; j += BLOCK_SIZE)
	{
		// Stage one tile of each operand; out-of-range entries are zero-padded so
		// the inner product below needs no bounds checks.
		if (tidr + bidr < n && tidc + j < n)
		{
			matA[tidr][tidc] = a[(tidr + bidr) * lda + tidc + j];
		}
		else
		{
			matA[tidr][tidc] = 0.0f;
		}

		if (tidr + j < n && tidc + bidc < n)
		{
			matB[tidr][tidc] = b[(tidr + j) * ldb + tidc + bidc];
		}
		else
		{
			matB[tidr][tidc] = 0.0f;
		}

		// Every thread must have finished writing the tiles before any thread reads them.
		__syncthreads();

		for (i = 0; i < BLOCK_SIZE; i++)
		{
			const float y = matA[tidr][i] * matB[i][tidc] - comp;
			const float t = results + y;
			comp = (t - results) - y;
			results = t;
		}

		// Do not overwrite the tiles for the next step while another thread still reads them.
		__syncthreads();
	}

	if (tidr + bidr < n && tidc + bidc < n)
	{
		c[(tidr + bidr) * ldc + tidc + bidc] = results;
	}
}

/*
 * Host reference implementation of c = a * b for n x n row-major matrices.
 *
 * a, b, c       - input matrices a and b, output matrix c.
 * lda, ldb, ldc - row strides of a, b and c, counted in elements.
 * n             - matrix dimension.
 *
 * Each dot product is accumulated in double and narrowed to float on store.
 */
void matmult(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
	for (int row = 0; row < n; ++row)
	{
		const float* aRow = a + row * lda;
		float* cRow = c + row * ldc;

		for (int col = 0; col < n; ++col)
		{
			double acc = 0;
			int k = 0;

			while (k < n)
			{
				acc += aRow[k] * b[k * ldb + col];
				++k;
			}

			cRow[col] = (float)acc;
		}
	}
}

/*
 * Interactive driver: reads a matrix size n, fills an n x n matrix with
 * pseudo-random values, inverts it on the GPU with d_GetInv(), and prints the
 * matrix, its inverse and their host-computed product for visual verification.
 *
 * Returns 0 on success, or 1 when the input is invalid, a host allocation
 * fails, or the GPU inversion fails.
 *
 * NOTE(review): the "%.f" format prints with zero decimals, so the random
 * entries and the inverse are shown rounded to whole numbers. The format is
 * left unchanged here; confirm whether more digits are wanted.
 */
int main()
{
	int n = 0;
	int i, j;
	size_t count;
	float* L = NULL;
	float* c = NULL;
	float* inv = NULL;

	printf("Please input matrix number:");
	// Reject non-numeric and non-positive sizes rather than using an uninitialised n.
	if (scanf("%d", &n) != 1 || n <= 0)
	{
		fprintf(stderr, "\nInvalid matrix size: expected a positive integer.\n");
		return 1;
	}
	printf("\n");

	// Widen before multiplying so n * n cannot overflow int.
	count = (size_t)n * (size_t)n;
	L = (float*)malloc(count * sizeof(float));
	c = (float*)malloc(count * sizeof(float));
	if (L == NULL || c == NULL)
	{
		fprintf(stderr, "Out of host memory for a %d x %d matrix.\n", n, n);
		free(L);
		free(c);
		return 1;
	}

	for (i = 0; i < n; i++)
	{
		for (j = 0; j < n; j++)
		{
			// The square of RAND_MAX is formed in float: as an int product it
			// overflows on platforms where RAND_MAX is 2^31 - 1.
			L[i * n + j] = (float)rand() / RAND_MAX + (float)rand() / ((float)RAND_MAX * (float)RAND_MAX);
			printf("%.f\t", L[i * n + j]);
		}
		printf("\n");
	}

	inv = d_GetInv(L, n);
	if (inv == NULL)
	{
		fprintf(stderr, "Matrix inversion failed.\n");
		free(L);
		free(c);
		return 1;
	}
	printf("\n");

	for (i = 0; i < n; i++)
	{
		for (j = 0; j < n; j++)
		{
			printf("%.f\t", inv[i * n + j]);
		}
		printf("\n");
	}

	// L * inv(L) should come out close to the identity matrix.
	matmult(L, n, inv, n, c, n, n);
	printf("\n");

	for (i = 0; i < n; i++)
	{
		for (j = 0; j < n; j++)
		{
			printf("%.f\t", c[i * n + j]);
		}
		printf("\n");
	}

	printf("\n");
	printf("verify");
	// Wait for a key press before exiting.
	_getch();

	free(inv);
	free(c);
	free(L);

	return 0;
}

Robert_Crovella · June 21, 2018, 2:10am

See the Stack Overflow question “visual studio 2010 - how to link library (e.g. CUBLAS, CUSPARSE) for CUDA on windows”.

tronorinoyeong · June 22, 2018, 12:41pm

My CUDA code runs in Visual Studio 2013 with CUDA 9.1 after adding cublas.lib,
but it does not run in Visual Studio 2013 with CUDA 8, even after adding cublas.lib.

Please let me know how to solve this problem

You can try my CUDA code below:

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <conio.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <time.h>
#include <windows.h>  

// Edge length of the square shared-memory tiles declared in matMultCUDA.
#define BLOCK_SIZE 16
// Not referenced by any of the code shown in this listing.
#define NUM_THREADS 256
// Has an empty replacement list, so it expands to nothing; not used in this listing.
#define cublascall(res, str) 

// CUDA events shared by d_CUDATimerStart() and d_CUDATimerStop().
static cudaEvent_t cu_TimerStart;
static cudaEvent_t cu_TimerStop;

/*
 * Starts the GPU timer: creates the cu_TimerStart / cu_TimerStop events and
 * records the start event. Pair every call with one d_CUDATimerStop() call,
 * which is where the events are destroyed.
 *
 * Failures of the CUDA runtime calls are reported on stderr instead of being
 * silently dropped; the call sequence itself is unchanged.
 */
void d_CUDATimerStart(void)
{
	cudaError_t err;

	err = cudaEventCreate(&cu_TimerStart);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "d_CUDATimerStart: cudaEventCreate(start) failed: %s\n", cudaGetErrorString(err));
	}

	err = cudaEventCreate(&cu_TimerStop);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "d_CUDATimerStart: cudaEventCreate(stop) failed: %s\n", cudaGetErrorString(err));
	}

	err = cudaEventRecord(cu_TimerStart);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "d_CUDATimerStart: cudaEventRecord failed: %s\n", cudaGetErrorString(err));
	}
}

/*
 * Stops the GPU timer started by d_CUDATimerStart().
 *
 * Records the stop event, waits for it to complete, and destroys both events.
 *
 * Returns the elapsed time in milliseconds between the two events, or 0.0f
 * (with a message on stderr) when any of the CUDA calls fails. The original
 * returned an uninitialised float in that case.
 */
float d_CUDATimerStop(void)
{
	float ms = 0.0f;
	cudaError_t err;

	err = cudaEventRecord(cu_TimerStop);
	if (err == cudaSuccess)
	{
		// The elapsed time is only defined once the stop event has completed.
		err = cudaEventSynchronize(cu_TimerStop);
	}
	if (err == cudaSuccess)
	{
		err = cudaEventElapsedTime(&ms, cu_TimerStart, cu_TimerStop);
	}
	if (err != cudaSuccess)
	{
		fprintf(stderr, "d_CUDATimerStop: timing failed: %s\n", cudaGetErrorString(err));
		ms = 0.0f;
	}

	// Always release the events, even after a failure, so repeated use does not leak them.
	cudaEventDestroy(cu_TimerStart);
	cudaEventDestroy(cu_TimerStop);
	return ms;
}

/* Reports a failed CUDA runtime call on stderr; returns true when err is cudaSuccess. */
static bool d_CudaOk(cudaError_t err, const char* what)
{
	if (err == cudaSuccess)
	{
		return true;
	}
	fprintf(stderr, "d_GetInv: %s failed: %s\n", what, cudaGetErrorString(err));
	return false;
}

/* Reports a failed cuBLAS call on stderr; returns true when st is CUBLAS_STATUS_SUCCESS. */
static bool d_CublasOk(cublasStatus_t st, const char* what)
{
	if (st == CUBLAS_STATUS_SUCCESS)
	{
		return true;
	}
	fprintf(stderr, "d_GetInv: %s failed (cuBLAS status %d)\n", what, (int)st);
	return false;
}

/*
 * Inverts an n x n single-precision matrix on the GPU with the batched cuBLAS
 * LU routines (cublasSgetrfBatched followed by cublasSgetriBatched, batch size 1).
 *
 * Parameters:
 *   L - host pointer to n * n floats (densely packed, leading dimension n).
 *   n - matrix dimension; must be positive.
 *
 * Returns a malloc()'ed host buffer of n * n floats holding the inverse, which
 * the caller must free(). Returns NULL, after printing a diagnostic to stderr,
 * when the arguments are invalid, any CUDA / cuBLAS call fails, the host
 * allocation fails, or cuBLAS reports a non-zero info value (for example, a
 * singular matrix).
 *
 * Layout note: cuBLAS interprets the buffer as column-major, so a row-major
 * input is seen as its transpose. Since inv(A^T) == inv(A)^T, the buffer
 * returned here is in the same layout convention as the input.
 */
float* d_GetInv(float* L, int n)
{
	cublasHandle_t cu_cublasHandle = NULL;
	float** adL = NULL;        // device array holding the single pointer dL
	float** adC = NULL;        // device array holding the single pointer dC
	float* dL = NULL;          // device copy of the input, overwritten by its LU factors
	float* dC = NULL;          // device buffer receiving the inverse
	int* dLUPivots = NULL;     // device pivot indices (n entries for the one matrix)
	int* dLUInfo = NULL;       // device status word written by the batched routines
	float* res = NULL;
	int hInfo = 0;
	bool haveHandle = false;
	bool timerRunning = false;
	size_t szA = 0;

	if (L == NULL || n <= 0)
	{
		fprintf(stderr, "d_GetInv: invalid arguments (L=%p, n=%d)\n", (void*)L, n);
		return NULL;
	}

	// Widen before multiplying so n * n cannot overflow int.
	szA = (size_t)n * (size_t)n * sizeof(float);

	if (!d_CublasOk(cublasCreate(&cu_cublasHandle), "cublasCreate")) goto cleanup;
	haveHandle = true;

	if (!d_CudaOk(cudaMalloc((void**)&adL, sizeof(float*)), "cudaMalloc(adL)")) goto cleanup;
	if (!d_CudaOk(cudaMalloc((void**)&adC, sizeof(float*)), "cudaMalloc(adC)")) goto cleanup;
	if (!d_CudaOk(cudaMalloc((void**)&dL, szA), "cudaMalloc(dL)")) goto cleanup;
	if (!d_CudaOk(cudaMalloc((void**)&dC, szA), "cudaMalloc(dC)")) goto cleanup;
	if (!d_CudaOk(cudaMalloc((void**)&dLUPivots, (size_t)n * sizeof(int)), "cudaMalloc(dLUPivots)")) goto cleanup;
	if (!d_CudaOk(cudaMalloc((void**)&dLUInfo, sizeof(int)), "cudaMalloc(dLUInfo)")) goto cleanup;

	if (!d_CudaOk(cudaMemcpy(dL, L, szA, cudaMemcpyHostToDevice), "cudaMemcpy(L -> dL)")) goto cleanup;
	// The batched API takes device arrays of matrix pointers, hence the copies of &dL and &dC.
	if (!d_CudaOk(cudaMemcpy(adL, &dL, sizeof(float*), cudaMemcpyHostToDevice), "cudaMemcpy(adL)")) goto cleanup;
	if (!d_CudaOk(cudaMemcpy(adC, &dC, sizeof(float*), cudaMemcpyHostToDevice), "cudaMemcpy(adC)")) goto cleanup;

	d_CUDATimerStart();
	timerRunning = true;

	// Step 1: in-place LU factorisation with partial pivoting.
	if (!d_CublasOk(cublasSgetrfBatched(cu_cublasHandle, n, adL, n, dLUPivots, dLUInfo, 1), "cublasSgetrfBatched")) goto cleanup;
	if (!d_CudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize(getrf)")) goto cleanup;
	if (!d_CudaOk(cudaMemcpy(&hInfo, dLUInfo, sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy(getrf info)")) goto cleanup;
	if (hInfo != 0)
	{
		fprintf(stderr, "d_GetInv: LU factorisation failed (info=%d); the matrix is singular or the input is invalid\n", hInfo);
		goto cleanup;
	}

	// Step 2: build the inverse from the LU factors (out of place, into dC).
	if (!d_CublasOk(cublasSgetriBatched(cu_cublasHandle, n, (const float **)adL, n, dLUPivots, adC, n, dLUInfo, 1), "cublasSgetriBatched")) goto cleanup;
	if (!d_CudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize(getri)")) goto cleanup;
	if (!d_CudaOk(cudaMemcpy(&hInfo, dLUInfo, sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy(getri info)")) goto cleanup;
	if (hInfo != 0)
	{
		fprintf(stderr, "d_GetInv: inversion failed (info=%d)\n", hInfo);
		goto cleanup;
	}

	// The elapsed time is not reported; the call still runs to release the timer events.
	(void)d_CUDATimerStop();
	timerRunning = false;

	res = (float*)malloc(szA);
	if (res == NULL)
	{
		fprintf(stderr, "d_GetInv: out of host memory (%lu bytes)\n", (unsigned long)szA);
		goto cleanup;
	}

	if (!d_CudaOk(cudaMemcpy(res, dC, szA, cudaMemcpyDeviceToHost), "cudaMemcpy(dC -> res)"))
	{
		free(res);
		res = NULL;
	}

cleanup:
	if (timerRunning)
	{
		// An early exit between start and stop would otherwise leak the two events.
		(void)d_CUDATimerStop();
	}

	// cudaFree(NULL) is a no-op, so pointers that were never allocated are safe here.
	cudaFree(adL);
	cudaFree(adC);
	cudaFree(dL);
	cudaFree(dC);
	cudaFree(dLUPivots);
	cudaFree(dLUInfo);

	if (haveHandle)
	{
		cublasDestroy(cu_cublasHandle);
	}

	return res;
}

/*
 * Tiled matrix product c = a * b for n x n row-major matrices.
 *
 * Launch layout: each block stages one BLOCK_SIZE x BLOCK_SIZE tile of a and
 * of b in shared memory and indexes those tiles with threadIdx.y / threadIdx.x,
 * so the kernel expects blockDim == (BLOCK_SIZE, BLOCK_SIZE) and a grid of
 * ceil(n / BLOCK_SIZE) blocks in both x and y.
 * Shared memory: 2 * BLOCK_SIZE * BLOCK_SIZE floats, statically allocated.
 *
 * Parameters:
 *   a, b          - device pointers to the input matrices.
 *   c             - device pointer to the output matrix.
 *   lda, ldb, ldc - row strides of a, b and c, counted in float elements.
 *   n             - matrix dimension.
 *
 * The original inner loop was empty, so every output element stayed 0. The
 * accumulation is restored here as a Kahan (compensated) sum, which is what
 * the leftover `comp` and `t` locals indicate was intended. Do not compile
 * with fast-math options that reassociate floating-point arithmetic, or the
 * compensation term is optimised away.
 */
__global__ static void matMultCUDA(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
	__shared__ float matA[BLOCK_SIZE][BLOCK_SIZE];
	__shared__ float matB[BLOCK_SIZE][BLOCK_SIZE];
	const int tidc = threadIdx.x;
	const int tidr = threadIdx.y;
	const int bidc = blockIdx.x * BLOCK_SIZE;
	const int bidr = blockIdx.y * BLOCK_SIZE;

	int i, j;

	float results = 0.0f;   // running sum for this thread's output element
	float comp = 0.0f;      // Kahan compensation: low-order bits lost so far

	for (j = 0; j < n; j += BLOCK_SIZE)
	{
		// Stage one tile of each operand; out-of-range entries are zero-padded so
		// the inner product below needs no bounds checks.
		if (tidr + bidr < n && tidc + j < n)
		{
			matA[tidr][tidc] = a[(tidr + bidr) * lda + tidc + j];
		}
		else
		{
			matA[tidr][tidc] = 0.0f;
		}

		if (tidr + j < n && tidc + bidc < n)
		{
			matB[tidr][tidc] = b[(tidr + j) * ldb + tidc + bidc];
		}
		else
		{
			matB[tidr][tidc] = 0.0f;
		}

		// Every thread must have finished writing the tiles before any thread reads them.
		__syncthreads();

		for (i = 0; i < BLOCK_SIZE; i++)
		{
			const float y = matA[tidr][i] * matB[i][tidc] - comp;
			const float t = results + y;
			comp = (t - results) - y;
			results = t;
		}

		// Do not overwrite the tiles for the next step while another thread still reads them.
		__syncthreads();
	}

	if (tidr + bidr < n && tidc + bidc < n)
	{
		c[(tidr + bidr) * ldc + tidc + bidc] = results;
	}
}

/*
 * Host reference implementation of c = a * b for n x n row-major matrices.
 *
 * a, b, c       - input matrices a and b, output matrix c.
 * lda, ldb, ldc - row strides of a, b and c, counted in elements.
 * n             - matrix dimension.
 *
 * Each dot product is accumulated in double and narrowed to float on store.
 */
void matmult(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
	for (int row = 0; row < n; ++row)
	{
		const float* aRow = a + row * lda;
		float* cRow = c + row * ldc;

		for (int col = 0; col < n; ++col)
		{
			double acc = 0;
			int k = 0;

			while (k < n)
			{
				acc += aRow[k] * b[k * ldb + col];
				++k;
			}

			cRow[col] = (float)acc;
		}
	}
}

/*
 * Interactive driver: reads a matrix size n, fills an n x n matrix with
 * pseudo-random values, inverts it on the GPU with d_GetInv(), and prints the
 * matrix, its inverse and their host-computed product for visual verification.
 *
 * Returns 0 on success, or 1 when the input is invalid, a host allocation
 * fails, or the GPU inversion fails.
 *
 * NOTE(review): the "%.f" format prints with zero decimals, so the random
 * entries and the inverse are shown rounded to whole numbers. The format is
 * left unchanged here; confirm whether more digits are wanted.
 */
int main()
{
	int n = 0;
	int i, j;
	size_t count;
	float* L = NULL;
	float* c = NULL;
	float* inv = NULL;

	printf("Please input matrix number:");
	// Reject non-numeric and non-positive sizes rather than using an uninitialised n.
	if (scanf("%d", &n) != 1 || n <= 0)
	{
		fprintf(stderr, "\nInvalid matrix size: expected a positive integer.\n");
		return 1;
	}
	printf("\n");

	// Widen before multiplying so n * n cannot overflow int.
	count = (size_t)n * (size_t)n;
	L = (float*)malloc(count * sizeof(float));
	c = (float*)malloc(count * sizeof(float));
	if (L == NULL || c == NULL)
	{
		fprintf(stderr, "Out of host memory for a %d x %d matrix.\n", n, n);
		free(L);
		free(c);
		return 1;
	}

	for (i = 0; i < n; i++)
	{
		for (j = 0; j < n; j++)
		{
			// The square of RAND_MAX is formed in float: as an int product it
			// overflows on platforms where RAND_MAX is 2^31 - 1.
			L[i * n + j] = (float)rand() / RAND_MAX + (float)rand() / ((float)RAND_MAX * (float)RAND_MAX);
			printf("%.f\t", L[i * n + j]);
		}
		printf("\n");
	}

	inv = d_GetInv(L, n);
	if (inv == NULL)
	{
		fprintf(stderr, "Matrix inversion failed.\n");
		free(L);
		free(c);
		return 1;
	}
	printf("\n");

	for (i = 0; i < n; i++)
	{
		for (j = 0; j < n; j++)
		{
			printf("%.f\t", inv[i * n + j]);
		}
		printf("\n");
	}

	// L * inv(L) should come out close to the identity matrix.
	matmult(L, n, inv, n, c, n, n);
	printf("\n");

	for (i = 0; i < n; i++)
	{
		for (j = 0; j < n; j++)
		{
			printf("%.f\t", c[i * n + j]);
		}
		printf("\n");
	}

	printf("\n");
	printf("verify");
	// Wait for a key press before exiting.
	_getch();

	free(inv);
	free(c);
	free(L);

	return 0;
}

vacaloca · June 22, 2018, 1:16pm

Without any error messages posted, it is not obvious why that is the case. If you have it running under 9.1, just work off that unless there’s a specific reason you need the previous toolkit version.

You might need to expand the error verbosity in Visual Studio to figure out why it’s failing:

tronorinoyeong · June 22, 2018, 7:16pm

.

That is because my teacher's classroom uses Visual Studio 2013 with CUDA 8 (an old version).

.

Topic		Replies	Views
Some errors in CUDA matrix programming CUDA Programming and Performance	6	1034	May 21, 2018
error : "too many resources requested for launch" CUDA Programming and Performance	18	3063	January 16, 2014
CUDA & VS C++ 2008 EXPRESS EDITION CUDA Programming and Performance	7	14614	August 8, 2010
problem compiling cuda code HELP CUDA Programming and Performance	5	3811	July 21, 2009
CUDA 6.5 building problem CUDA Setup and Installation	21	10446	March 22, 2021
Parallel Reduction CUDA Programming and Performance	10	3715	June 26, 2011
CUDA VS Wizard 2.2 beta CUDA Programming and Performance	33	108729	May 29, 2011
can not execute external cuda process CUDA Programming and Performance	7	13997	February 22, 2010
Cuda compilation error: class template has already been defined and invalid records warnings CUDA Programming and Performance	5	3115	July 7, 2018
cuda on microsoft visual 2005 problem... CUDA Programming and Performance	1	4817	January 20, 2008

Cuda programming code cannot run in old version

Related topics