How can I link the CUDA driver API library and the CUPTI library on Google Colab?

I use Google Colab for CUDA programming. How can I solve the following error?



/*
 * Copyright 2011-2017 NVIDIA Corporation. All rights reserved
 *
 * Sample app to demonstrate use of CUPTI library to obtain profiler
 * event values by sampling.
 */


#ifdef _WIN32
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#endif

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cupti_events.h>

#ifdef _WIN32
#include <windows.h>
#else
#include <unistd.h>
#include <pthread.h>
#endif


/*
 * CHECK_CU_ERROR(err, cufunc)
 *
 * If `err` is not CUDA_SUCCESS, prints the numeric error code together with
 * the driver API function name `cufunc`, then executes `return 0;` in the
 * enclosing function.
 *
 * The body is wrapped in do { ... } while (0) so the macro expands to a
 * single statement, and every line of the definition is joined with a
 * backslash continuation (a blank line after a backslash ends the macro).
 *
 * NOTE(review): `return 0` makes main() report success after a failure;
 * the stock NVIDIA sample is believed to call exit(-1) here -- confirm
 * which behaviour is wanted.
 */
#define CHECK_CU_ERROR(err, cufunc)                                     \
  do {                                                                  \
    if ((err) != CUDA_SUCCESS)                                          \
      {                                                                 \
        printf ("Error %d for CUDA Driver API function '%s'.\n",        \
                (err), (cufunc));                                       \
        return 0;                                                       \
      }                                                                 \
  } while (0)

/*
 * CHECK_CUPTI_ERROR(err, cuptifunc)
 *
 * If `err` is not CUPTI_SUCCESS, looks up the readable error string with
 * cuptiGetResultString(), prints it with the current file and line and the
 * CUPTI function name `cuptifunc`, then executes `return 0;` in the
 * enclosing function (which yields NULL when that function returns a
 * pointer).
 *
 * Wrapped in do { ... } while (0) so it behaves as one statement; all lines
 * of the definition are joined with backslash continuations.
 */
#define CHECK_CUPTI_ERROR(err, cuptifunc)                               \
  do {                                                                  \
    if ((err) != CUPTI_SUCCESS)                                         \
      {                                                                 \
        const char *errstr;                                             \
        cuptiGetResultString((err), &errstr);                           \
        printf ("%s:%d:Error %s for CUPTI API function '%s'.\n",        \
                __FILE__, __LINE__, errstr, (cuptifunc));               \
        return 0;                                                       \
      }                                                                 \
  } while (0)

// Name of the CUPTI event sampled when none is given on the command line.
// (Must use plain ASCII double quotes; typographic quotes do not compile.)
#define EVENT_NAME "inst_executed"

// Number of int elements in each vector.
#define N 100000

// Number of kernel launches performed by compute().
#define ITERATIONS 2000

// Delay between two event reads in the sampling thread, in milliseconds.
// NOTE(review): this definition was missing from the pasted code although
// SAMPLE_PERIOD_MS is used by sampling_func(); 50 is the value believed to
// be used by the stock sample -- confirm.
#ifndef SAMPLE_PERIOD_MS
#define SAMPLE_PERIOD_MS 50
#endif

// used to signal from the compute thread to the sampling thread
static volatile int testComplete = 0;

// Driver API context and device selected in main() and used by the
// sampling thread.
static CUcontext context;

static CUdevice device;

// Event name actually sampled (command-line argument or EVENT_NAME).
static const char *eventName;

// Device code

/*
 * Kernel: element-wise integer vector addition, C[i] = A[i] + B[i].
 *
 * Launch layout: 1D grid of 1D blocks; each thread handles the single
 * element at index blockDim.x * blockIdx.x + threadIdx.x. Threads whose
 * index is >= size do nothing, so the grid may overshoot the data.
 *
 * The addition is repeated 100 times per thread; the result is the same
 * after every repetition (presumably the repetition only lengthens the
 * kernel so the sampling thread has activity to observe).
 */
__global__ void VecAdd(const int* A, const int* B, int* C, int size)
{
  int i = blockDim.x * blockIdx.x + threadIdx.x;

  for (int n = 0; n < 100; n++) {
    if (i < size)
      C[i] = A[i] + B[i];
  }
}

/*
 * Fill the first n elements of vec with their own index: vec[i] = i.
 * Does nothing when n <= 0.
 */
static void
initVec(int *vec, int n)
{
  for (int i = 0; i < n; i++)
    vec[i] = i;
}

/* Fallback so this function builds even if the sampling period is not
 * defined elsewhere in the file. NOTE(review): 50 ms is the value believed
 * to be used by the stock sample -- confirm. */
#ifndef SAMPLE_PERIOD_MS
#define SAMPLE_PERIOD_MS 50
#endif

/*
 * Thread entry point: periodically reads one CUPTI event and prints it.
 *
 * Uses the file-level `context`, `device` and `eventName`, which must be
 * set before the thread starts. Creates an event group containing the
 * event named `eventName`, enables it, and then, until `testComplete`
 * becomes non-zero, reads the per-instance values, adds them to a running
 * total and prints the total every SAMPLE_PERIOD_MS milliseconds.
 *
 * arg is unused. Returns NULL in all cases; on any CUPTI failure the
 * CHECK_CUPTI_ERROR macro prints a message and returns early.
 *
 * NOTE(review): the enumerator arguments
 * CUPTI_EVENT_COLLECTION_MODE_CONTINUOUS,
 * CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES,
 * CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT and CUPTI_EVENT_READ_FLAG_NONE were
 * missing from the pasted code and are restored from the stock CUPTI
 * event_sampling sample -- confirm against your toolkit's copy.
 */
void *
sampling_func(void *arg)
{
  CUptiResult cuptiErr;
  CUpti_EventGroup eventGroup;
  CUpti_EventID eventId;
  size_t bytesRead, valueSize;
  uint32_t numInstances = 0, j = 0;
  uint64_t *eventValues = NULL, eventVal = 0;
  uint32_t profile_all = 1;

  (void)arg;

  cuptiErr = cuptiSetEventCollectionMode(context,
                                         CUPTI_EVENT_COLLECTION_MODE_CONTINUOUS);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiSetEventCollectionMode");

  cuptiErr = cuptiEventGroupCreate(context, &eventGroup, 0);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupCreate");

  cuptiErr = cuptiEventGetIdFromName(device, eventName, &eventId);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGetIdFromName");

  cuptiErr = cuptiEventGroupAddEvent(eventGroup, eventId);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupAddEvent");

  cuptiErr = cuptiEventGroupSetAttribute(eventGroup,
                                         CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES,
                                         sizeof(profile_all), &profile_all);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupSetAttribute");

  cuptiErr = cuptiEventGroupEnable(eventGroup);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupEnable");

  // Ask how many instances the group has; one uint64_t is read per instance.
  valueSize = sizeof(numInstances);
  cuptiErr = cuptiEventGroupGetAttribute(eventGroup,
                                         CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT,
                                         &valueSize, &numInstances);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupGetAttribute");

  bytesRead = sizeof(uint64_t) * numInstances;
  eventValues = (uint64_t *) malloc(bytesRead);
  if (eventValues == NULL) {
    printf("%s:%d: Failed to allocate memory.\n", __FILE__, __LINE__);
    return NULL;
  }

  while (!testComplete) {
    cuptiErr = cuptiEventGroupReadEvent(eventGroup,
                                        CUPTI_EVENT_READ_FLAG_NONE,
                                        eventId, &bytesRead, eventValues);
    CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupReadEvent");

    if (bytesRead != (sizeof(uint64_t) * numInstances)) {
      printf("Failed to read value for \"%s\"\n", eventName);
      free(eventValues);
      return NULL;
    }

    // eventVal is never reset, so the printed number is a running total
    // over all samples taken so far.
    for (j = 0; j < numInstances; j++) {
      eventVal += eventValues[j];
    }
    printf("%s: %llu\n", eventName, (unsigned long long)eventVal);

#ifdef _WIN32
    Sleep(SAMPLE_PERIOD_MS);
#else
    usleep(SAMPLE_PERIOD_MS * 1000);
#endif
  }

  cuptiErr = cuptiEventGroupDisable(eventGroup);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupDisable");

  cuptiErr = cuptiEventGroupDestroy(eventGroup);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupDestroy");

  free(eventValues);
  return NULL;
}

/*
 * Abort with a message if a CUDA runtime call failed.
 * `what` names the call for the error message.
 */
static void
checkRuntime(cudaError_t status, const char *what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "%s:%d: %s failed: %s\n", __FILE__, __LINE__, what,
            cudaGetErrorString(status));
    exit(-1);
  }
}

/*
 * Run the vector-add workload: allocate three N-element int vectors on the
 * host and on the device, initialise the two inputs, launch VecAdd `iters`
 * times with 256 threads per block, copy the result back and compare every
 * element with the host-computed sum.
 *
 * Prints "kernel execution FAILED" and exits on a mismatch; exits with a
 * message if an allocation, copy or launch fails. All host and device
 * buffers are released before returning.
 */
static void
compute(int iters)
{
  size_t size = N * sizeof(int);
  int threadsPerBlock = 0;
  int blocksPerGrid = 0;
  int sum, i;
  int *h_A, *h_B, *h_C;
  int *d_A, *d_B, *d_C;

  // Allocate input vectors h_A and h_B (and result h_C) in host memory
  h_A = (int*)malloc(size);
  h_B = (int*)malloc(size);
  h_C = (int*)malloc(size);
  if (h_A == NULL || h_B == NULL || h_C == NULL) {
    printf("%s:%d: Failed to allocate memory.\n", __FILE__, __LINE__);
    exit(-1);
  }

  // Initialize input vectors
  initVec(h_A, N);
  initVec(h_B, N);
  memset(h_C, 0, size);

  // Allocate vectors in device memory
  checkRuntime(cudaMalloc((void**)&d_A, size), "cudaMalloc(d_A)");
  checkRuntime(cudaMalloc((void**)&d_B, size), "cudaMalloc(d_B)");
  checkRuntime(cudaMalloc((void**)&d_C, size), "cudaMalloc(d_C)");

  // Copy vectors from host memory to device memory
  checkRuntime(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice),
               "cudaMemcpy(d_A)");
  checkRuntime(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice),
               "cudaMemcpy(d_B)");

  // Invoke kernel (multiple times to make sure we have time for
  // sampling)
  threadsPerBlock = 256;
  blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
  for (i = 0; i < iters; i++) {
    VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    // A launch does not return a status; query it explicitly.
    checkRuntime(cudaGetLastError(), "VecAdd launch");
  }

  // Copy result from device memory to host memory
  // h_C contains the result in host memory
  checkRuntime(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost),
               "cudaMemcpy(h_C)");

  // Verify result
  for (i = 0; i < N; ++i) {
    sum = h_A[i] + h_B[i];
    if (h_C[i] != sum) {
      printf("kernel execution FAILED\n");
      exit(-1);
    }
  }

  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);
  free(h_A);
  free(h_B);
  free(h_C);
}

/*
 * Program entry point.
 *
 * Usage: <program> [device_num] [event_name]
 *
 * Initialises the CUDA driver API, selects device `device_num` (default 0),
 * prints its name and compute capability, creates a context on it, starts
 * sampling_func() on a separate thread, runs compute(ITERATIONS) while that
 * thread samples `event_name` (default EVENT_NAME), then sets testComplete
 * and waits for the sampling thread to finish.
 *
 * Returns 0 on success, -2 when the device's compute capability is above
 * 7.2, and -1 when no CUDA device is present or the thread cannot be
 * created. Driver API failures are handled by CHECK_CU_ERROR.
 */
int
main(int argc, char *argv[])
{
#ifdef _WIN32
  HANDLE hThread;
#else
  int status;
  pthread_t pThread;
#endif
  CUresult err;
  int deviceNum;
  int deviceCount;
  char deviceName[32];
  int major;
  int minor;

  printf("Usage: %s [device_num] [event_name]\n", argv[0]);

  err = cuInit(0);
  CHECK_CU_ERROR(err, "cuInit");

  err = cuDeviceGetCount(&deviceCount);
  CHECK_CU_ERROR(err, "cuDeviceGetCount");

  if (deviceCount == 0) {
    printf("There is no device supporting CUDA.\n");
    return -1;
  }

  if (argc > 1)
    deviceNum = atoi(argv[1]);
  else
    deviceNum = 0;
  printf("CUDA Device Number: %d\n", deviceNum);

  err = cuDeviceGet(&device, deviceNum);
  CHECK_CU_ERROR(err, "cuDeviceGet");

  err = cuDeviceGetName(deviceName, 32, device);
  CHECK_CU_ERROR(err, "cuDeviceGetName");

  printf("CUDA Device Name: %s\n", deviceName);

  err = cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
  CHECK_CU_ERROR(err, "cuDeviceGetAttribute");

  err = cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
  CHECK_CU_ERROR(err, "cuDeviceGetAttribute");

  printf("Compute Capability of Device: %d.%d\n", major, minor);

  // Encode major.minor as a two-digit number (e.g. 7.2 -> 72) to compare.
  int deviceComputeCapability = 10 * major + minor;
  if (deviceComputeCapability > 72) {
    printf("Sample unsupported on Device with compute capability > 7.2\n");
    return -2;
  }

  if (argc > 2) {
    eventName = argv[2];
  }
  else {
    eventName = EVENT_NAME;
  }

  err = cuCtxCreate(&context, 0, device);
  CHECK_CU_ERROR(err, "cuCtxCreate");

  testComplete = 0;

  printf("Creating sampling thread\n");
#ifdef _WIN32
  hThread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) sampling_func,
                         NULL, 0, NULL );
  if (!hThread) {
    printf("CreateThread failed\n");
    return -1;
  }
#else
  status = pthread_create(&pThread, NULL, sampling_func, NULL);
  if (status != 0) {
    // pthread_create reports failure through its return value, not errno.
    fprintf(stderr, "pthread_create: %s\n", strerror(status));
    return -1;
  }
#endif

  // run kernel while sampling
  compute(ITERATIONS);

  // "signal" the sampling thread to exit and wait for it
  testComplete = 1;
#ifdef _WIN32
  WaitForSingleObject(hThread, INFINITE);
#else
  pthread_join(pThread, NULL);
#endif

  cudaDeviceSynchronize();
  return 0;
}

/tmp/tmpxft_000000d2_00000000-11_44f7df52-5495-4009-885a-d6d185b5b232.o: In function sampling_func(void*)': tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0xa2): undefined reference to cuptiSetEventCollectionMode’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0xbc): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x10c): undefined reference to cuptiEventGroupCreate’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x126): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x176): undefined reference to cuptiEventGetIdFromName’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x190): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x1d6): undefined reference to cuptiEventGroupAddEvent’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x1f0): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x242): undefined reference to cuptiEventGroupSetAttribute’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x25c): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x29d): undefined reference to cuptiEventGroupEnable’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x2b7): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x30d): undefined reference to cuptiEventGroupGetAttribute’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x327): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x3d9): undefined reference to cuptiEventGroupReadEvent’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x3f3): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x4cb): undefined reference to cuptiEventGroupDisable’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x4e5): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x523): undefined reference to cuptiEventGroupDestroy’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x53d): undefined reference to cuptiGetResultString' /tmp/tmpxft_000000d2_00000000-11_44f7df52-5495-4009-885a-d6d185b5b232.o: In function main’:
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x87e): undefined reference to cuInit' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x8ba): undefined reference to cuDeviceGetCount’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x953): undefined reference to cuDeviceGet' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x99a): undefined reference to cuDeviceGetName’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x9f9): undefined reference to cuDeviceGetAttribute' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0xa40): undefined reference to cuDeviceGetAttribute’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0xaf9): undefined reference to `cuCtxCreate_v2’
collect2: error: ld returned 1 exit status


The error log shows that there are undefined references to CUPTI APIs. Do you link your program with the CUPTI library (libcupti, e.g. by passing `-lcupti` to the linker)?

No, I do not link my program with the CUPTI library.
In fact, I do not know how to do that on Google Colab.