Ptx CUDA Driver API "Hello world" in VS 2010

Hello everybody.

Because there are so few examples and materials concerning the CUDA Driver API and the PTX ISA on the web, I decided to share my working code with you. Maybe it’ll save someone’s time and nerves. Before we start, we have to do some setup work:

  • In the "Linker\Input" section of the project's properties we add the additional dependencies: "cuda.lib" and "cudart.lib". Otherwise the compiler'll be spitting out there are some unresolved symbols.
  • We'll be editing generated *.ptx file, so we mark Yes "Keep Preprocessed Files" in the "CUDA C/C++\Common"
  • We compile the kernel.cu with
    extern "C" __global__ void addKernel(int *devInArr1, int *devInArr2, int *devOutArr1) {
    }
    

    and change the generated *.ptx file name to “addKernel.ptx”

  • We edit "addKernel.ptx" and "kernel.cu":

    addKernel.ptx:

    .version 1.4
    .target sm_11
    
    .entry addKernel (
        .param .u32 devInArr1,
        .param .u32 devInArr2,
        .param .u32 devOutArr1
    ) {
        .reg .b32 r1;
        .reg .b32 r2;
        .reg .b32 r3;
        .reg .b32 r4;
    $LDWbegin_addKernel:
        mov.b32 r1, %tid.x;
        shl.b32 r1, r1, 2;
        ld.param.b32 r2, [devInArr1];
        add.u32 r2, r2, r1;
        ld.global.b32 r3, [r2];
        ld.param.b32 r2, [devInArr2];
        add.u32 r2, r2, r1;
        ld.global.b32 r4, [r2];
        add.s32 r3, r3, r4;
        ld.param.b32 r2, [devOutArr1];
        add.u32 r2, r2, r1;
        st.global.b32 [r2], r3;
        exit;
    $LDWend_addKernel:
    }
    

    kernel.cu:

    #define __CUDACC__
    
    #include "builtin_types.h"
    #include "cuda.h"
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    
    #include <conio.h>
    #include <stdio.h>
    #include <stdlib.h>
    
    //   -*-   -*-   -*-
    
    CUcontext context;
    CUdevice device;
    CUfunction function;
    CUmodule module;
    
    //   -*-   -*-   -*-
    
    CUresult addWithCuda(int *inArr1, int *inArr2, int *outArr1, int size) {
        CUdeviceptr devInArr1, devInArr2, devOutArr1;
        CUresult err;
        void *args[3] = { &devInArr1, &devInArr2, &devOutArr1 };
    
        err = cuMemAlloc(&devInArr1, sizeof(int) * size);
        if (err != CUDA_SUCCESS) goto Exit;
        err = cuMemAlloc(&devInArr2, sizeof(int) * size);
        if (err != CUDA_SUCCESS) goto Exit;
        err = cuMemAlloc(&devOutArr1, sizeof(int) * size);
        if (err != CUDA_SUCCESS) goto Exit;
        err = cuMemcpyHtoD(devInArr1, inArr1, sizeof(int) * size);
        if (err != CUDA_SUCCESS) goto Exit;
        err = cuMemcpyHtoD(devInArr2, inArr2, sizeof(int) * size);
        if (err != CUDA_SUCCESS) goto Exit;
        err = cuLaunchKernel(function, 1, 1, 1, size, 1, 1, 0, 0, args, 0);
        if (err != CUDA_SUCCESS) goto Exit;
        err = cuMemcpyDtoH(outArr1, devOutArr1, sizeof(int) * size);
        if (err != CUDA_SUCCESS) goto Exit;
    Exit:
        cuMemFree(devInArr1);
        cuMemFree(devInArr2);
        cuMemFree(devOutArr1);
        return err;
    };
    
    int main() {
        CUresult err;
        int deviceCount = 0;
        int inArr1[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
        int inArr2[10] = { 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
        int outArr1[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    
        err = cuInit(0);
        if (err != CUDA_SUCCESS) { printf("cuInit error... .\n"); goto Exit; }
        err = cuDeviceGetCount(&deviceCount);
        if (err != CUDA_SUCCESS) { printf("cuDeviceGetCount error... .\n"); goto Exit; }
        if (deviceCount == 0) { printf("No CUDA-capable devices... .\n"); goto Exit; } 
        err = cuDeviceGet(&device, 0);
        if (err != CUDA_SUCCESS) { printf("cuDeviceGet error... .\n"); goto Exit; }
        err = cuCtxCreate(&context, 0, device);
        if (err != CUDA_SUCCESS) { printf("cuCtxCreate error... .\n"); goto Exit; }
        err = cuModuleLoad(&module, "addKernel.ptx");
        if (err != CUDA_SUCCESS) { printf("cuModuleLoad error... .\n"); goto Exit; }
        err = cuModuleGetFunction(&function, module, "addKernel");
        if (err != CUDA_SUCCESS) { printf("cuModuleGetFunction error... .\n"); goto Exit; }
        err = addWithCuda(inArr1, inArr2, outArr1, 10);
        if (err != CUDA_SUCCESS) { printf("Kernel invocation failed... .\n"); goto Exit; }
        for (int i = 0; i < 10; ++i) printf("%d + %d = %d\n", inArr1[i], inArr2[i], outArr1[i]);
    Exit:  
        cuCtxDetach(context);
        getch();
        return 0;
    }
    
  • Very nice example!

    I’ve never tried, but I wonder if you can leverage VS2012 VS to compile and link the .ptx file for you without having to use the Runtime API at all?

    Maybe I’ll try, but I’ve to upgrade my VS first.

    Sorry, I meant VS in general and not 2012 specifically. I’m wondering if .ptx files are properly handled by the VS build.

    The CUDA Driver API with PTX JIT does not requires the application to link against the CUDART library. In fact you can go so far as to avoid linking against cuda.lib if you wish. The VS rules and props files are setup for CUDART by default.

    You can simplify your application using these steps:

    1. Move all of your CUDA Driver API code into a .cpp file to avoid calling through nvcc. In fact you have no need to use NVCC at all as the programming is JITing PTX.
    2. Remove the kernel.cu file that defines global addKernel. The global function is provided by the .ptx file you are passing to cuModuleLoad which will JIT the PTX file.

    The matrixMulDrv sample is a reasonable reference; however, it supports both PTX and cuobj so you will see a .cu in the project.

    Thanks to all for Your valuable suggestions.

    As for the global addKernel, I know, it serves only as the “template” - once compiler generates the *.ptx, you simply comment it out and edit the *.ptx file (btw. I tried to write it in my post, but it was too long so there were errors).