As the title says: can OpenCL call tensor cores?
If OpenCL can call tensor cores, do I need to install an extension library?
There is no direct support for tensor cores in NVIDIA's OpenCL implementation. Furthermore, AFAIK, NVIDIA does not support the Khronos fp16 extension (cl_khr_fp16).
However, the lack of an fp16 extension can be worked around manually, and it is possible to access tensor cores from OpenCL code via inline PTX. Using that methodology, no extension libraries are needed.
Here is a full example:
$ cat t15.cpp
#include <stdlib.h>
#include <iostream>
#include <string>
#include <cstring>
#include <CL/cl.h>
using namespace std;
#define testpp(...) #__VA_ARGS__ // stringify the kernel source at compile time
const char* pp = testpp(
__kernel void mma_8x8x4_fp16_acc_fp32(__global float *out) {
float c[8] = {0., 0., 0., 0., 0., 0., 0., 0.}; // accumulator input fragment (all zeros)
float d[8] = {0., 0., 0., 0., 0., 0., 0., 0.}; // result fragment
unsigned short a[4] = {15360, 15360, 15360, 15360}; // 15360 == 0x3C00, the fp16 bit pattern for 1.0
unsigned short b[4] = {15360, 15360, 15360, 15360};
const unsigned *A = (const unsigned *)&a;
const unsigned *B = (const unsigned *)&b;
const float *C = (const float *)&c;
float *D = (float *)&d;
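// warp-cooperative tensor core multiply-accumulate via inline PTX: D = A*B + C with fp16 inputs and fp32 accumulation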
asm(
"mma.sync.aligned.row.col.m8n8k4.f32.f16.f16.f32"
"{%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, {%12,%13,%14,%15,%16,%17,%18,%19};\n"
: "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]), "=f"(D[4]), "=f"(D[5]), "=f"(D[6]), "=f"(D[7])
:
"r"(A[0]), "r"(A[1]),
"r"(B[0]), "r"(B[1]),
"f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "f"(C[4]), "f"(C[5]), "f"(C[6]), "f"(C[7])
);
int tidx = get_global_id(0);
for (int i = 0; i < 8; i++) out[tidx*8+i] = D[i];
}
);
float out[256];
int main()
{
cl_int err = 0;
cl_uint numPlatforms;
cl_platform_id platform = NULL;
int ans;
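// enumerate available OpenCL platforms and let the user pick one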
err = clGetPlatformIDs(0, NULL, &numPlatforms);
if (err != CL_SUCCESS)
{
cout << "Error: Getting Platforms\n";
return EXIT_FAILURE;
}
if (numPlatforms > 0)
{
cl_platform_id* platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
err = clGetPlatformIDs(numPlatforms, platforms, NULL);
if (err != CL_SUCCESS)
{
cout << "Error: Getting Platform Ids.(clGetPlatformIDs)\n";
return EXIT_FAILURE;
}
cout << "available platforms: " << endl;
for (unsigned int i = 0; i < numPlatforms; ++i)
{
char pbuff[100];
err = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL);
cout << i << ":" << pbuff << "\n";
}
cout << "select platform: ";
cin >> ans;
if (ans < numPlatforms) platform = platforms[ans];
else {cout << "invalid platform choice" << endl; return EXIT_FAILURE;}
free(platforms);
}
else
{
cout << "no platforms found\n";
return EXIT_FAILURE;
}
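// enumerate GPU devices on the selected platform and let the user pick one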
cl_uint num_devices = 0;
cl_device_id* devices = NULL;
cl_device_id device = NULL;
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices); // query the number of GPU devices
if (num_devices == 0) //no GPU available.
{
cout << "No GPU device available." << endl;
return EXIT_FAILURE;
}
else
{
devices = (cl_device_id*)malloc(num_devices * sizeof(cl_device_id));
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices, NULL);
cout << "available devices: " << endl;
for (unsigned int i = 0; i < num_devices; ++i)
{
char pbuff[100];
err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);
cout << i << ":" << pbuff << "\n";
}
cout << "select device: ";
cin >> ans;
if (ans < num_devices) device = devices[ans];
else {cout << "invalid device choice" << endl; return EXIT_FAILURE;}
free(devices);
}
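// create a context and command queue on the selected device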
cl_context context = nullptr;
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
cl_command_queue commandQueue = nullptr;
commandQueue = clCreateCommandQueue(context, device, 0, &err);
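// create and build the program from the kernel source string (including the inline PTX)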
size_t ppsize[] = { strlen(pp) };
cl_program pprog = clCreateProgramWithSource(context, 1, &pp, ppsize, &err);
if (err != CL_SUCCESS)
{
cout << "Error: Loading Binary into cl_program (clCreateProgramWithBinary)\n";
return EXIT_FAILURE;
}
err = clBuildProgram(pprog, 1, &device, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
if (err == CL_BUILD_PROGRAM_FAILURE) {
// Determine the size of the log
size_t log_size;
clGetProgramBuildInfo(pprog, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
// Allocate memory for the log
char* log = (char*)malloc(log_size);
// Get the log
clGetProgramBuildInfo(pprog, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
// Print the log
cout << log << endl;
}
cout << err << endl;
cout << "Error: Building Program (clBuildProgram)\n";
return EXIT_FAILURE;
}
cl_kernel testkernel = clCreateKernel(pprog, "mma_8x8x4_fp16_acc_fp32", &err);
if (err != CL_SUCCESS)
{
size_t log_size;
clGetProgramBuildInfo(pprog, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
// Allocate memory for the log
char* log = (char*)malloc(log_size);
// Get the log
clGetProgramBuildInfo(pprog, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
// Print the log
cout << log << endl;
cout << "Error: Creating Kernel from program.(clCreateKernel)\n";
return EXIT_FAILURE;
}
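// allocate the result buffer and launch a single warp (32 work-items); each work-item writes 8 floats, 256 in total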
cl_mem outbuff = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * 256, &out, &err);
err = clSetKernelArg(testkernel, 0, sizeof(cl_mem), (void*)&outbuff);
size_t globalThreads = 32;
size_t localThreads = 32;
err = clEnqueueNDRangeKernel(commandQueue, testkernel, 1, NULL, &globalThreads, &localThreads, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
cout << "Error: Enqueueing kernel\n";
return EXIT_FAILURE;
}
err = clFinish(commandQueue);
if (err != CL_SUCCESS)
{
cout << "Error: Finish command queue\n";
return EXIT_FAILURE;
}
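// read back the 256 result elements and print them as a 16x16 grid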
err = clEnqueueReadBuffer(commandQueue, outbuff, CL_TRUE, 0, sizeof(float) * 256, &out, 0, NULL, NULL);
for (int i = 0; i < 16; i++){
for (int j = 0; j < 16; j++)
{
cout << out[i*16+j] << " ";
}
cout << endl;
}
}
$ nvcc -o t15 t15.cpp -lOpenCL
$ ./t15
0:NVIDIA Corporation
select platform: 0
0:Tesla V100-PCIE-32GB
1:Tesla K20Xm
2:Tesla K20Xm
3:Tesla K20Xm
select device: 0
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
$
(CUDA 11.4, driver 470.57.02)
A few notes:
- the constant 15360 in the above code is the unsigned short representation of the fp16 value 1.0
- the lack of an fp16 extension can be worked around “manually”. As a “roadmap”, you could do something like what njuffa did here. I’m not suggesting it is drop-in ready-to-go, just a general sketch of how to do it; a minimal conversion sketch is also shown after these notes.
- the matrix result of all 4s above is expected. The selected mma operation does four m=8, n=8, k=4 matrix-multiply operations per warp, so there are 256 result elements in total. Since k=4 and both input matrices are all 1.0, 4 is the expected result for each element (a small host-side check of that arithmetic is included at the end).
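For reference, here is a minimal host-side sketch of such a manual conversion. The helper name float_to_half_bits is hypothetical (it is not part of the example above), and the conversion is deliberately simplified: denormals flush to zero, NaN is not handled, and rounding is round-to-nearest (ties up).

#include <cstdint>
#include <cstring>

// hypothetical helper: produce the IEEE fp16 bit pattern for a float,
// so constants like 15360 (0x3C00 == 1.0) need not be hard-coded
static unsigned short float_to_half_bits(float f)
{
    uint32_t x;
    std::memcpy(&x, &f, sizeof(x));                          // reinterpret the float bits
    uint32_t sign = (x >> 16) & 0x8000u;                     // sign bit moved to fp16 position
    int32_t  e    = (int32_t)((x >> 23) & 0xFFu) - 127 + 15; // rebias exponent (fp32 bias 127 -> fp16 bias 15)
    uint32_t mant = x & 0x007FFFFFu;
    if (e <= 0)  return (unsigned short)sign;                // underflow: signed zero
    if (e >= 31) return (unsigned short)(sign | 0x7C00u);    // overflow: infinity
    unsigned short h = (unsigned short)(sign | ((uint32_t)e << 10) | (mant >> 13));
    if (mant & 0x1000u) h += 1;                              // round up on the dropped bits' MSB
    return h;
}
// float_to_half_bits(1.0f) == 15360 (0x3C00), matching the constant used in the kernel

With a helper like this (or an equivalent routine written in OpenCL C), arbitrary float data can be packed into the unsigned short fragments the mma instruction consumes, rather than hard-coding bit patterns.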
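To make the arithmetic in the last note concrete, here is a tiny standalone reference computation (plain C++, separate from the example above) of one m=8, n=8, k=4 product with all-ones inputs and a zero accumulator; every element comes out to 4.0, matching the printed output:

#include <iostream>

int main()
{
    float A[8][4], B[4][8], C[8][8] = {}, D[8][8];   // C is zero-initialized
    for (auto &row : A) for (auto &v : row) v = 1.0f;
    for (auto &row : B) for (auto &v : row) v = 1.0f;
    for (int i = 0; i < 8; i++)
        for (int j = 0; j < 8; j++)
        {
            float acc = C[i][j];
            for (int k = 0; k < 4; k++) acc += A[i][k] * B[k][j]; // four 1*1 terms
            D[i][j] = acc;                                        // 4.0 for every element
        }
    std::cout << D[0][0] << std::endl;  // prints 4
    return 0;
}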