Hello everybody.
Because there are so few examples and materials concerning the CUDA Driver API and the PTX ISA on the web, I decided to share my working code with you. Maybe it’ll save someone’s time and nerves. Before we start, we have to do some setup work:
extern "C" __global__ void addKernel(int *devInArr1, int *devInArr2, int *devOutArr1) {
}
and change the generated *.ptx file name to “addKernel.ptx”
addKernel.ptx:
.version 1.4
.target sm_11
.entry addKernel (
.param .u32 devInArr1,
.param .u32 devInArr2,
.param .u32 devOutArr1
) {
.reg .b32 r1;
.reg .b32 r2;
.reg .b32 r3;
.reg .b32 r4;
$LDWbegin_addKernel:
mov.b32 r1, %tid.x;
shl.b32 r1, r1, 2;
ld.param.b32 r2, [devInArr1];
add.u32 r2, r2, r1;
ld.global.b32 r3, [r2];
ld.param.b32 r2, [devInArr2];
add.u32 r2, r2, r1;
ld.global.b32 r4, [r2];
add.s32 r3, r3, r4;
ld.param.b32 r2, [devOutArr1];
add.u32 r2, r2, r1;
st.global.b32 [r2], r3;
exit;
$LDWend_addKernel:
}
kernel.cu:
#define __CUDACC__
#include "builtin_types.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <conio.h>
#include <stdio.h>
#include <stdlib.h>
// -*- -*- -*-
CUcontext context;
CUdevice device;
CUfunction function;
CUmodule module;
// -*- -*- -*-
CUresult addWithCuda(int *inArr1, int *inArr2, int *outArr1, int size) {
CUdeviceptr devInArr1, devInArr2, devOutArr1;
CUresult err;
void *args[3] = { &devInArr1, &devInArr2, &devOutArr1 };
err = cuMemAlloc(&devInArr1, sizeof(int) * size);
if (err != CUDA_SUCCESS) goto Exit;
err = cuMemAlloc(&devInArr2, sizeof(int) * size);
if (err != CUDA_SUCCESS) goto Exit;
err = cuMemAlloc(&devOutArr1, sizeof(int) * size);
if (err != CUDA_SUCCESS) goto Exit;
err = cuMemcpyHtoD(devInArr1, inArr1, sizeof(int) * size);
if (err != CUDA_SUCCESS) goto Exit;
err = cuMemcpyHtoD(devInArr2, inArr2, sizeof(int) * size);
if (err != CUDA_SUCCESS) goto Exit;
err = cuLaunchKernel(function, 1, 1, 1, size, 1, 1, 0, 0, args, 0);
if (err != CUDA_SUCCESS) goto Exit;
err = cuMemcpyDtoH(outArr1, devOutArr1, sizeof(int) * size);
if (err != CUDA_SUCCESS) goto Exit;
Exit:
cuMemFree(devInArr1);
cuMemFree(devInArr2);
cuMemFree(devOutArr1);
return err;
};
int main() {
CUresult err;
int deviceCount = 0;
int inArr1[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
int inArr2[10] = { 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
int outArr1[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
err = cuInit(0);
if (err != CUDA_SUCCESS) { printf("cuInit error... .\n"); goto Exit; }
err = cuDeviceGetCount(&deviceCount);
if (err != CUDA_SUCCESS) { printf("cuDeviceGetCount error... .\n"); goto Exit; }
if (deviceCount == 0) { printf("No CUDA-capable devices... .\n"); goto Exit; }
err = cuDeviceGet(&device, 0);
if (err != CUDA_SUCCESS) { printf("cuDeviceGet error... .\n"); goto Exit; }
err = cuCtxCreate(&context, 0, device);
if (err != CUDA_SUCCESS) { printf("cuCtxCreate error... .\n"); goto Exit; }
err = cuModuleLoad(&module, "addKernel.ptx");
if (err != CUDA_SUCCESS) { printf("cuModuleLoad error... .\n"); goto Exit; }
err = cuModuleGetFunction(&function, module, "addKernel");
if (err != CUDA_SUCCESS) { printf("cuModuleGetFunction error... .\n"); goto Exit; }
err = addWithCuda(inArr1, inArr2, outArr1, 10);
if (err != CUDA_SUCCESS) { printf("Kernel invocation failed... .\n"); goto Exit; }
for (int i = 0; i < 10; ++i) printf("%d + %d = %d\n", inArr1[i], inArr2[i], outArr1[i]);
Exit:
cuCtxDetach(context);
getch();
return 0;
}