[b]Please find the inverse matrix of A.
Please verify the inverse by multiplying A by its inverse.
The product should be a 1000*1000 identity matrix. [/b]
My CUDA code below does not build, and I am not sure how to fix it:
#include <conio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <time.h>
#include <windows.h>
// The cuBLAS entry points (cublasCreate_v2, cublasSgetrfBatched, ...) live in
// cublas.lib; without it the linker reports LNK2019 for each of them. Request
// the library here so the file links without editing the project settings.
#ifdef _MSC_VER
#pragma comment(lib, "cublas.lib")
#endif
// Edge length of the square shared-memory tiles declared in matMultCUDA.
#define BLOCK_SIZE 16
// NOTE(review): not referenced anywhere in the visible source - confirm whether it is still needed.
#define NUM_THREADS 256
// Events created and recorded by d_CUDATimerStart() and consumed/destroyed by d_CUDATimerStop().
static cudaEvent_t cu_TimerStart;
static cudaEvent_t cu_TimerStop;
// Creates the two file-level timing events and records the start event on the
// default stream. Must be paired with exactly one d_CUDATimerStop(), which
// destroys the events again.
//
// CUDA errors are reported on stderr instead of being silently dropped; the
// function still returns normally so existing callers keep working.
void d_CUDATimerStart(void)
{
    cudaError_t err = cudaEventCreate(&cu_TimerStart);
    if (err == cudaSuccess)
    {
        err = cudaEventCreate(&cu_TimerStop);
    }
    if (err == cudaSuccess)
    {
        // Stream 0 is the default stream, which is what the one-argument
        // overload of cudaEventRecord uses as well.
        err = cudaEventRecord(cu_TimerStart, 0);
    }
    if (err != cudaSuccess)
    {
        fprintf(stderr, "d_CUDATimerStart failed: %s\n", cudaGetErrorString(err));
    }
}
// Records the stop event, waits for it, and returns the elapsed time in
// milliseconds since the matching d_CUDATimerStart(). Both timing events are
// destroyed before returning.
//
// Returns 0.0f (and prints a diagnostic to stderr) when any CUDA call fails,
// instead of returning an uninitialised value.
float d_CUDATimerStop(void)
{
    float ms = 0.0f;

    cudaError_t err = cudaEventRecord(cu_TimerStop, 0);
    if (err == cudaSuccess)
    {
        // Block the host until the stop event has actually been reached.
        err = cudaEventSynchronize(cu_TimerStop);
    }
    if (err == cudaSuccess)
    {
        err = cudaEventElapsedTime(&ms, cu_TimerStart, cu_TimerStop);
    }
    if (err != cudaSuccess)
    {
        fprintf(stderr, "d_CUDATimerStop failed: %s\n", cudaGetErrorString(err));
        ms = 0.0f;
    }

    cudaEventDestroy(cu_TimerStart);
    cudaEventDestroy(cu_TimerStop);
    return ms;
}
// Reports a failed CUDA runtime call on stderr; returns true on success.
static bool d_GetInvCudaOk(cudaError_t err, const char* what)
{
    if (err == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "d_GetInv: %s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Reports a failed cuBLAS call on stderr; returns true on success.
static bool d_GetInvCublasOk(cublasStatus_t status, const char* what)
{
    if (status == CUBLAS_STATUS_SUCCESS)
    {
        return true;
    }
    fprintf(stderr, "d_GetInv: %s failed with cuBLAS status %d\n", what, (int)status);
    return false;
}

// Inverts the n x n single-precision matrix L on the GPU using cuBLAS
// (batched LU factorisation followed by batched inversion, batch size 1).
//
// Parameters:
//   L  host pointer to n*n floats, row-major, leading dimension n. Not modified.
//   n  matrix order; must be > 0.
//
// Returns a malloc()-allocated host buffer of n*n floats holding the inverse
// in the same row-major layout (the caller must free() it), or NULL when the
// arguments are invalid, a CUDA/cuBLAS call fails, host memory is exhausted,
// or the matrix is reported singular.
//
// Layout note: cuBLAS is column-major, so it sees the row-major buffer as
// L^T and produces (L^T)^-1 = (L^-1)^T in column-major order, which is exactly
// L^-1 when read back as row-major. No explicit transpose is needed.
//
// The elapsed GPU time of the two cuBLAS calls is printed to stdout.
float* d_GetInv(float* L, int n)
{
    if (L == NULL || n <= 0)
    {
        fprintf(stderr, "d_GetInv: invalid argument (n = %d)\n", n);
        return NULL;
    }

    // Widen before multiplying: n * n in int overflows for n > 46340.
    const size_t szA = (size_t)n * (size_t)n * sizeof(float);

    cublasHandle_t cu_cublasHandle = NULL;
    bool haveHandle = false;
    bool timerRunning = false;
    bool ok = false;

    float** adL = NULL;     // device array holding the single pointer dL
    float** adC = NULL;     // device array holding the single pointer dC
    float* dL = NULL;       // device copy of L, overwritten by its LU factors
    float* dC = NULL;       // device buffer receiving the inverse
    int* dLUPivots = NULL;  // device pivot indices produced by the factorisation
    int* dLUInfo = NULL;    // device status words: [0] = getrf, [1] = getri
    int hInfo[2] = { 0, 0 };
    float* res = NULL;

    // Single-pass loop used only so that 'break' jumps to the common cleanup.
    do
    {
        if (!d_GetInvCublasOk(cublasCreate(&cu_cublasHandle), "cublasCreate")) break;
        haveHandle = true;

        if (!d_GetInvCudaOk(cudaMalloc(&adL, sizeof(float*)), "cudaMalloc(adL)")) break;
        if (!d_GetInvCudaOk(cudaMalloc(&adC, sizeof(float*)), "cudaMalloc(adC)")) break;
        if (!d_GetInvCudaOk(cudaMalloc(&dL, szA), "cudaMalloc(dL)")) break;
        if (!d_GetInvCudaOk(cudaMalloc(&dC, szA), "cudaMalloc(dC)")) break;
        if (!d_GetInvCudaOk(cudaMalloc(&dLUPivots, (size_t)n * sizeof(int)), "cudaMalloc(dLUPivots)")) break;
        if (!d_GetInvCudaOk(cudaMalloc(&dLUInfo, 2 * sizeof(int)), "cudaMalloc(dLUInfo)")) break;

        if (!d_GetInvCudaOk(cudaMemcpy(dL, L, szA, cudaMemcpyHostToDevice), "cudaMemcpy(L)")) break;
        // The batched API takes device arrays of matrix pointers; upload the
        // one-element pointer arrays.
        if (!d_GetInvCudaOk(cudaMemcpy(adL, &dL, sizeof(float*), cudaMemcpyHostToDevice), "cudaMemcpy(adL)")) break;
        if (!d_GetInvCudaOk(cudaMemcpy(adC, &dC, sizeof(float*), cudaMemcpyHostToDevice), "cudaMemcpy(adC)")) break;

        d_CUDATimerStart();
        timerRunning = true;

        if (!d_GetInvCublasOk(cublasSgetrfBatched(cu_cublasHandle, n, adL, n, dLUPivots, dLUInfo, 1),
                              "cublasSgetrfBatched")) break;
        if (!d_GetInvCudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize(getrf)")) break;

        // Separate info slot so the factorisation status is not overwritten.
        if (!d_GetInvCublasOk(cublasSgetriBatched(cu_cublasHandle, n, (const float **)adL, n, dLUPivots,
                                                  adC, n, dLUInfo + 1, 1),
                              "cublasSgetriBatched")) break;
        if (!d_GetInvCudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize(getri)")) break;

        const float timed = d_CUDATimerStop();
        timerRunning = false;
        printf("\ncublas inverse in: %.5f ms.\n", timed);

        if (!d_GetInvCudaOk(cudaMemcpy(hInfo, dLUInfo, sizeof(hInfo), cudaMemcpyDeviceToHost),
                            "cudaMemcpy(info)")) break;
        if (hInfo[0] != 0 || hInfo[1] != 0)
        {
            // A positive value k means U(k,k) is exactly zero, i.e. the matrix
            // is singular; a negative value flags an illegal argument.
            fprintf(stderr, "d_GetInv: inversion failed (getrf info = %d, getri info = %d)\n",
                    hInfo[0], hInfo[1]);
            break;
        }

        res = (float*)malloc(szA);
        if (res == NULL)
        {
            fprintf(stderr, "d_GetInv: out of host memory\n");
            break;
        }
        if (!d_GetInvCudaOk(cudaMemcpy(res, dC, szA, cudaMemcpyDeviceToHost), "cudaMemcpy(result)")) break;

        ok = true;
    } while (0);

    if (timerRunning)
    {
        // Release the timing events if we bailed out between start and stop.
        d_CUDATimerStop();
    }

    // cudaFree(NULL) is a no-op, so unconditional frees are safe here.
    cudaFree(adL);
    cudaFree(adC);
    cudaFree(dL);
    cudaFree(dC);
    cudaFree(dLUPivots);
    cudaFree(dLUInfo);
    if (haveHandle)
    {
        cublasDestroy(cu_cublasHandle);
    }

    if (!ok)
    {
        free(res);
        res = NULL;
    }
    return res;
}
// Tiled matrix product c = a * b for n x n row-major matrices.
//
// Launch layout: a 2D block of BLOCK_SIZE x BLOCK_SIZE threads (threadIdx.x is
// the column, threadIdx.y the row inside a tile) and a 2D grid with at least
// ceil(n / BLOCK_SIZE) blocks in x and y. Each thread produces one element of c.
// Uses 2 * BLOCK_SIZE * BLOCK_SIZE floats of static shared memory.
//
// Parameters:
//   a, b           device pointers to the input matrices
//   lda, ldb, ldc  leading dimensions (row strides) in elements
//   c              device pointer to the output matrix
//   n              matrix order
//
// The dot product is accumulated with Kahan (compensated) summation to limit
// single-precision round-off.
__global__ static void matMultCUDA(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
    __shared__ float matA[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float matB[BLOCK_SIZE][BLOCK_SIZE];

    const int tidc = threadIdx.x;
    const int tidr = threadIdx.y;
    const int bidc = blockIdx.x * BLOCK_SIZE;
    const int bidr = blockIdx.y * BLOCK_SIZE;

    float results = 0.0f;  // running sum
    float comp = 0.0f;     // Kahan compensation term

    for (int j = 0; j < n; j += BLOCK_SIZE)
    {
        // Stage one tile of a and one tile of b; pad with zeros past the edge
        // so the inner loop needs no bounds checks.
        if (tidr + bidr < n && tidc + j < n)
        {
            matA[tidr][tidc] = a[(tidr + bidr) * lda + tidc + j];
        }
        else
        {
            matA[tidr][tidc] = 0.0f;
        }
        if (tidr + j < n && tidc + bidc < n)
        {
            matB[tidr][tidc] = b[(tidr + j) * ldb + tidc + bidc];
        }
        else
        {
            matB[tidr][tidc] = 0.0f;
        }

        // All loads must be visible before any thread reads the tiles.
        __syncthreads();

        for (int i = 0; i < BLOCK_SIZE; i++)
        {
            // Kahan step. The original followed this with
            // "results = matA[tidr][i] * matB[i][tidc];", which discarded the
            // accumulated sum and left only the last product in results.
            comp -= matA[tidr][i] * matB[i][tidc];
            const float t = results - comp;
            comp = (t - results) + comp;
            results = t;
        }

        // Do not overwrite the tiles while other threads may still read them.
        __syncthreads();
    }

    if (tidr + bidr < n && tidc + bidc < n)
    {
        c[(tidr + bidr) * ldc + tidc + bidc] = results;
    }
}
// CPU reference product c = a * b for n x n row-major matrices.
//
// Parameters:
//   a, b           input matrices
//   lda, ldb, ldc  leading dimensions (row strides) in elements
//   c              output matrix; every element of its n x n region is overwritten
//   n              matrix order; nothing is written when n <= 0
//
// Each product is formed in double precision before it is accumulated, so the
// double accumulator is not fed values already rounded to float. Offsets are
// computed in size_t so large matrices do not overflow int arithmetic.
void matmult(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
    for (int i = 0; i < n; i++)
    {
        const float* aRow = a + (size_t)i * (size_t)lda;
        float* cRow = c + (size_t)i * (size_t)ldc;
        for (int j = 0; j < n; j++)
        {
            double t = 0.0;
            for (int k = 0; k < n; k++)
            {
                t += (double)aRow[k] * (double)b[(size_t)k * (size_t)ldb + j];
            }
            cRow[j] = (float)t;
        }
    }
}
// Reads the matrix order n from stdin, fills an n x n matrix with pseudo-random
// values, inverts it with d_GetInv(), and prints the matrix, its inverse, and
// their product (computed on the CPU by matmult()), which should be close to
// the identity. Returns 0 on success and 1 on invalid input or any failure.
int main()
{
    int n = 0;
    printf("Please input matrix number :");
    if (scanf("%d", &n) != 1 || n <= 0)
    {
        fprintf(stderr, "Invalid matrix size.\n");
        return 1;
    }

    // Widen before multiplying so large n cannot overflow int.
    const size_t count = (size_t)n * (size_t)n;
    float* L = (float*)malloc(count * sizeof(float));
    float* c = (float*)malloc(count * sizeof(float));
    if (L == NULL || c == NULL)
    {
        fprintf(stderr, "Out of host memory.\n");
        free(L);
        free(c);
        return 1;
    }

    // Computed in float: RAND_MAX * RAND_MAX overflows int where RAND_MAX is 2^31 - 1.
    const float randMaxSq = (float)RAND_MAX * (float)RAND_MAX;

    int i, j;
    for (i = 0; i < n; i++)
    {
        for (j = 0; j < n; j++)
        {
            // Two separate statements so the order of the rand() calls is defined.
            const float coarse = (float)rand() / RAND_MAX;
            const float fine = (float)rand() / randMaxSq;
            L[(size_t)i * n + j] = coarse + fine;
            printf("%.1f\t", L[(size_t)i * n + j]);
        }
        printf("\n");
    }

    float* inv = d_GetInv(L, n);
    if (inv == NULL)
    {
        fprintf(stderr, "Matrix inversion failed.\n");
        free(L);
        free(c);
        return 1;
    }

    printf("\n");
    for (i = 0; i < n; i++)
    {
        for (j = 0; j < n; j++)
        {
            printf("%.1f\t", inv[(size_t)i * n + j]);
        }
        printf("\n");
    }

    // Verification: L * inv should be (approximately) the identity matrix.
    matmult(L, n, inv, n, c, n, n);
    printf("\n");
    for (i = 0; i < n; i++)
    {
        for (j = 0; j < n; j++)
        {
            printf("%f\t", c[(size_t)i * n + j]);
        }
        printf("\n");
    }
    printf("\n");
    printf("Done.");

    free(inv);
    free(c);
    free(L);

    // Keep the console window open until a key is pressed.
    _getch();
    return 0;
}
Error 58 error LNK1120: 4 unresolved externals
Error 54 error LNK2019: unresolved external symbol cublasCreate_v2 referenced in function “float * __cdecl d_GetInv(float *,int)” (?d_GetInv@@YAPEAMPEAMH@Z)
Error 55 error LNK2019: unresolved external symbol cublasDestroy_v2 referenced in function “float * __cdecl d_GetInv(float *,int)” (?d_GetInv@@YAPEAMPEAMH@Z)
Error 56 error LNK2019: unresolved external symbol cublasSgetrfBatched referenced in function “float * __cdecl d_GetInv(float *,int)” (?d_GetInv@@YAPEAMPEAMH@Z)
Error 57 error LNK2019: unresolved external symbol cublasSgetriBatched referenced in function “float * __cdecl d_GetInv(float *,int)” (?d_GetInv@@YAPEAMPEAMH@Z)
61 IntelliSense: identifier “__syncthreads” is undefined
60 IntelliSense: identifier “blockIdx” is undefined
59 IntelliSense: identifier “threadIdx” is undefined
Please share your opinions and suggestions
so that I can improve my programming skills.