Hello.
Firstly, I wrapped some CUDA 8.0 code (kernel1.cu) into a DLL and LIB file for later invocation. The following
is the complete code in kernel1.cu. N.B. To distinguish the time used for the vector-add operation from the time used for memory transfers, I constructed two nearly identical functions; the only difference between them is whether or not the GPU kernel function (addKernel in this context) is launched.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cudadll1.h"
// Element-wise vector add on the device: c[i] = a[i] + b[i] for i in [0, size).
// Uses a grid-stride loop, so any <<<numBlocks, blockSize>>> configuration
// (including a single block) covers the whole array correctly.
__global__ void addKernel(int *c, const int *a, const int *b, int size)
{
    const int firstElem  = blockIdx.x * blockDim.x + threadIdx.x;
    const int gridStride = gridDim.x * blockDim.x;
    for (int elem = firstElem; elem < size; elem += gridStride) {
        c[elem] = a[elem] + b[elem];
    }
}
// Measures host<->device transfer cost only: identical to vectorAddCABsize
// except that the kernel launch is deliberately commented out. Consequently
// the device-to-host copy returns whatever (uninitialized) data is in dev_c;
// the caller must not rely on the contents of c[].
// Returns 0 on success, or a step-specific error code (1..9) on failure.
CUDADLL1_API int vectorCopyConsuming(int c[], int a[], int b[], int size)
{
    int result = -1;
    cudaError_t cudaStatus;
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    // Compute the byte count in size_t to avoid 32-bit int overflow for large sizes.
    const size_t bytes = (size_t)size * sizeof(int);

    // Select the GPU to run on.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        result = 1;
        goto Error;
    }
    // Allocate device buffers for dev_c, dev_a, dev_b.
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        result = 2;
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        result = 3;
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        result = 4;
        goto Error;
    }
    // Copy inputs host -> device.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        result = 5;
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        result = 6;
        goto Error;
    }
    // Kernel launch intentionally omitted (see vectorAddCABsize) so that the
    // timing difference between the two functions isolates the kernel cost.
    //int blockSize = 256;
    //int numBlocks = (size + blockSize - 1) / blockSize;
    //addKernel <<<numBlocks, blockSize >>>(dev_c, dev_a, dev_b, size);

    // Wait for all pending device work before touching results on the host.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        result = 7;
        goto Error;
    }
    // Copy (uninitialized) output device -> host, to mirror the add variant.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        result = 8;
        goto Error;
    }
    result = 0;

Error:
    // BUG FIX: the original called cudaFree on the HOST arrays (c, a, b),
    // leaking the device allocations; free the device pointers instead.
    // cudaFree(0) is a harmless no-op for buffers never allocated.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    // Reset AFTER freeing (the original reset first, invalidating the context
    // the frees would have needed) and only on the success path, preserving
    // the original return codes. NOTE(review): resetting the device on every
    // call destroys the CUDA context and forces expensive re-initialization
    // on the next call — consider removing this from a DLL entry point.
    if (result == 0) {
        cudaStatus = cudaDeviceReset();
        if (cudaStatus != cudaSuccess) {
            result = 9;
        }
    }
    return result;
}
// GPU vector add: computes c[i] = a[i] + b[i] for i in [0, size), including
// host<->device transfers. Returns 0 on success, or a step-specific error
// code (1..9) identifying which CUDA call failed.
CUDADLL1_API int vectorAddCABsize(int c[], int a[], int b[], int size)
{
    int result = -1;
    cudaError_t cudaStatus;
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    // Compute the byte count in size_t to avoid 32-bit int overflow for large sizes.
    const size_t bytes = (size_t)size * sizeof(int);
    // Launch configuration: 256 threads per block, ceil-div grid size.
    // (Declared up front so no goto jumps over an initialization.)
    const int blockSize = 256;
    const int numBlocks = (size + blockSize - 1) / blockSize;

    // Select the GPU to run on.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        result = 1;
        goto Error;
    }
    // Allocate device buffers for dev_c, dev_a, dev_b.
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        result = 2;
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        result = 3;
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        result = 4;
        goto Error;
    }
    // Copy inputs host -> device.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        result = 5;
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        result = 6;
        goto Error;
    }
    // Launch the kernel; its grid-stride loop handles any size.
    addKernel<<<numBlocks, blockSize>>>(dev_c, dev_a, dev_b, size);
    // Kernel launches do not return errors directly: check explicitly for
    // launch-configuration failures (the original silently ignored these).
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        result = 7;
        goto Error;
    }
    // Wait for the GPU to finish; surfaces any in-kernel execution error.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        result = 7;
        goto Error;
    }
    // Copy the result device -> host.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        result = 8;
        goto Error;
    }
    result = 0;

Error:
    // BUG FIX: the original called cudaFree on the HOST arrays (c, a, b),
    // leaking the device allocations; free the device pointers instead.
    // cudaFree(0) is a harmless no-op for buffers never allocated.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    // Reset AFTER freeing (the original reset first, invalidating the context
    // the frees would have needed) and only on the success path, preserving
    // the original return codes. NOTE(review): resetting the device on every
    // call destroys the CUDA context and forces expensive re-initialization
    // on the next call — consider removing this from a DLL entry point.
    if (result == 0) {
        cudaStatus = cudaDeviceReset();
        if (cudaStatus != cudaSuccess) {
            result = 9;
        }
    }
    return result;
}
Secondly, a new VS2015 project was created to test the CUDA functionality and the efficiency of parallel computing. The following is the main code, which shows how I invoked the functions in the DLL file.
// Benchmark harness: compares (1) GPU add including copies, (2) GPU copies
// only, and (3) a plain CPU loop, over 2^20 ints.
//
// NOTE(review): the FIRST CUDA call in a process pays a large one-time cost
// (driver load, context creation — and here also the cudaDeviceReset done
// inside the DLL, which forces the NEXT call to re-create the context).
// That is why the first timed GPU call looks far slower than the copy-only
// call; issue an untimed warm-up call before timing to measure steady-state.
int main()
{
    const int arraySize = 1 << 20;
    int *a = new int[arraySize];
    int *b = new int[arraySize];
    int *c = new int[arraySize];
    int *d = new int[arraySize];
    int *e = new int[arraySize];
    int *f = new int[arraySize];
    int *g = new int[arraySize];
    int *h = new int[arraySize];
    int *k = new int[arraySize];
    // Three independent (input, input, output) triples so each timed run
    // touches fresh host memory.
    for (int i = 0; i < arraySize; i++) {
        a[i] = 1;
        b[i] = 2;
        c[i] = 0;
        d[i] = 1;
        e[i] = 2;
        f[i] = 0;
        g[i] = 1;
        h[i] = 2;
        k[i] = 0;
    }
    printf("Data length : %d \n Vector Add operation ... \n \n", arraySize);

    // Time GPU add + transfers (includes one-time CUDA context creation!).
    LARGE_INTEGER t1, t2, tc;
    QueryPerformanceFrequency(&tc);
    QueryPerformanceCounter(&t1);
    int number1 = vectorAddCABsize(c, a, b, arraySize);
    QueryPerformanceCounter(&t2);
    printf(" CUDA GPU calculation+memory copy consuming Time:%f s\n", (t2.QuadPart - t1.QuadPart)*1.0 / tc.QuadPart);
    printf("invoke CUDA dll status code = %d\n\n", number1);

    // Time GPU transfers only (no kernel launch inside the DLL call).
    LARGE_INTEGER t5, t6, te;
    QueryPerformanceFrequency(&te);
    QueryPerformanceCounter(&t5);
    int number2 = vectorCopyConsuming(k, g, h, arraySize);
    QueryPerformanceCounter(&t6);
    printf(" CUDA GPU memory copy consuming Time:%f s\n", (t6.QuadPart - t5.QuadPart)*1.0 / te.QuadPart);
    printf("invoke CUDA dll status code = %d\n\n", number2);

    // Time the plain CPU loop for comparison.
    LARGE_INTEGER t3, t4, td;
    QueryPerformanceFrequency(&td);
    QueryPerformanceCounter(&t3);
    for (int i = 0; i != arraySize; i++) {
        f[i] = d[i] + e[i];
    }
    QueryPerformanceCounter(&t4);
    printf(" C++ calculation consuming Time:%f s\n", (t4.QuadPart - t3.QuadPart)*1.0 / td.QuadPart);

    // BUG FIX: the original leaked all nine heap arrays.
    delete[] a;
    delete[] b;
    delete[] c;
    delete[] d;
    delete[] e;
    delete[] f;
    delete[] g;
    delete[] h;
    delete[] k;

    system("Pause");
    return 0;
}
The console output text is below.
Data length : 1048576
Vector Add operation …
CUDA GPU calculation+memory copy consuming Time:0.386232 s
invoke CUDA dll status code = 0
CUDA GPU memory copy consuming Time:0.065450 s
invoke CUDA dll status code = 0
C++ calculation consuming Time:0.003581 s
Therefore, one issue has confused me for a long time.
I would really appreciate it if you could help me. Thank you in advance.