How can i linking the CUDA driver API library and CUPTI library on google colab?

I use google colab on cuda programing. how can i solve the following error?

%%cu

/*

  • Copyright 2011-2017 NVIDIA Corporation. All rights reserved

  • Sample app to demonstrate use of CUPTI library to obtain profiler

  • event values by sampling.

*/

#ifdef _WIN32

#ifndef WIN32_LEAN_AND_MEAN

    #define WIN32_LEAN_AND_MEAN

#endif

#endif

#include <stdio.h>

#include <cuda_runtime_api.h>

#include <cupti_events.h>

#ifdef _WIN32

#include <windows.h>

#else

#include <unistd.h>

#include <pthread.h>

#endif

#define CHECK_CU_ERROR(err, cufunc) \

if (err != CUDA_SUCCESS) \

{                                                                   \

  printf ("Error %d for CUDA Driver API function '%s'.\n",          \

          err, cufunc);                                             \

  return 0;                                                         \

}

#define CHECK_CUPTI_ERROR(err, cuptifunc) \

if (err != CUPTI_SUCCESS) \

{                                                           \

  const char *errstr;                                       \

  cuptiGetResultString(err, &errstr);                       \

  printf ("%s:%d:Error %s for CUPTI API function '%s'.\n",  \

          __FILE__, __LINE__, errstr, cuptifunc);           \

  return 0;                                                 \

}

#define EVENT_NAME “inst_executed”

#define N 100000

#define ITERATIONS 2000

#define SAMPLE_PERIOD_MS 50

// used to signal from the compute thread to the sampling thread

static volatile int testComplete = 0;

static CUcontext context;

static CUdevice device;

static const char *eventName;

// Device code

global void VecAdd(const int* A, const int* B, int* C, int size)

{

int i = blockDim.x * blockIdx.x + threadIdx.x;

for(int n = 0 ; n < 100; n++) {

if (i < size)

  C[i] = A[i] + B[i];

}

}

static void

initVec(int *vec, int n)

{

for (int i=0; i< n; i++)

vec[i] = i;

}

void *

sampling_func(void *arg)

{

CUptiResult cuptiErr;

CUpti_EventGroup eventGroup;

CUpti_EventID eventId;

size_t bytesRead, valueSize;

uint32_t numInstances = 0, j = 0;

uint64_t *eventValues = NULL, eventVal = 0;

uint32_t profile_all = 1;

cuptiErr = cuptiSetEventCollectionMode(context,

                                     CUPTI_EVENT_COLLECTION_MODE_CONTINUOUS);

CHECK_CUPTI_ERROR(cuptiErr, “cuptiSetEventCollectionMode”);

cuptiErr = cuptiEventGroupCreate(context, &eventGroup, 0);

CHECK_CUPTI_ERROR(cuptiErr, “cuptiEventGroupCreate”);

cuptiErr = cuptiEventGetIdFromName(device, eventName, &eventId);

CHECK_CUPTI_ERROR(cuptiErr, “cuptiEventGetIdFromName”);

cuptiErr = cuptiEventGroupAddEvent(eventGroup, eventId);

CHECK_CUPTI_ERROR(cuptiErr, “cuptiEventGroupAddEvent”);

cuptiErr = cuptiEventGroupSetAttribute(eventGroup,

                                     CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES,

                                     sizeof(profile_all), &profile_all);

CHECK_CUPTI_ERROR(cuptiErr, “cuptiEventGroupSetAttribute”);

cuptiErr = cuptiEventGroupEnable(eventGroup);

CHECK_CUPTI_ERROR(cuptiErr, “cuptiEventGroupEnable”);

valueSize = sizeof(numInstances);

cuptiErr = cuptiEventGroupGetAttribute(eventGroup,

                                     CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT,

                                     &valueSize, &numInstances);

CHECK_CUPTI_ERROR(cuptiErr, “cuptiEventGroupGetAttribute”);

bytesRead = sizeof(uint64_t) * numInstances;

eventValues = (uint64_t *) malloc(bytesRead);

if (eventValues == NULL) {

  printf("%s:%d: Failed to allocate memory.\n", __FILE__, __LINE__);

  exit(-1);

}

while (!testComplete) {

cuptiErr = cuptiEventGroupReadEvent(eventGroup,

                                    CUPTI_EVENT_READ_FLAG_NONE,

                                    eventId, &bytesRead, eventValues);

CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupReadEvent");

if (bytesRead != (sizeof(uint64_t) * numInstances)) {

  printf("Failed to read value for \"%s\"\n", eventName);

  exit(-1);

}

for (j = 0; j < numInstances; j++) {

  eventVal += eventValues[j];

}

printf("%s: %llu\n", eventName, (unsigned long long)eventVal);

#ifdef _WIN32

Sleep(SAMPLE_PERIOD_MS);

#else

usleep(SAMPLE_PERIOD_MS * 1000);

#endif

}

cuptiErr = cuptiEventGroupDisable(eventGroup);

CHECK_CUPTI_ERROR(cuptiErr, “cuptiEventGroupDisable”);

cuptiErr = cuptiEventGroupDestroy(eventGroup);

CHECK_CUPTI_ERROR(cuptiErr, “cuptiEventGroupDestroy”);

free(eventValues);

return NULL;

}

static void

compute(int iters)

{

size_t size = N * sizeof(int);

int threadsPerBlock = 0;

int blocksPerGrid = 0;

int sum, i;

int *h_A, *h_B, *h_C;

int *d_A, *d_B, *d_C;

// Allocate input vectors h_A and h_B in host memory

h_A = (int*)malloc(size);

h_B = (int*)malloc(size);

h_C = (int*)malloc(size);

// Initialize input vectors

initVec(h_A, N);

initVec(h_B, N);

memset(h_C, 0, size);

// Allocate vectors in device memory

cudaMalloc((void**)&d_A, size);

cudaMalloc((void**)&d_B, size);

cudaMalloc((void**)&d_C, size);

// Copy vectors from host memory to device memory

cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

// Invoke kernel (multiple times to make sure we have time for

// sampling)

threadsPerBlock = 256;

blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

for (i = 0; i < iters; i++) {

VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);

}

// Copy result from device memory to host memory

// h_C contains the result in host memory

cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

// Verify result

for (i = 0; i < N; ++i) {

sum = h_A[i] + h_B[i];

if (h_C[i] != sum) {

  printf("kernel execution FAILED\n");

  exit(-1);

}

}

cudaFree(d_A);

cudaFree(d_B);

cudaFree(d_C);

free(h_A);

free(h_B);

free(h_C);

}

int

main(int argc, char *argv)

{

#ifdef _WIN32

HANDLE hThread;

#else

int status;

pthread_t pThread;

#endif

CUresult err;

int deviceNum;

int deviceCount;

char deviceName[32];

int major;

int minor;

printf(“Usage: %s [device_num] [event_name]\n”, argv[0]);

err = cuInit(0);

CHECK_CU_ERROR(err, “cuInit”);

err = cuDeviceGetCount(&deviceCount);

CHECK_CU_ERROR(err, “cuDeviceGetCount”);

if (deviceCount == 0) {

printf("There is no device supporting CUDA.\n");

exit(-1);

}

if (argc > 1)

deviceNum = atoi(argv[1]);

else

deviceNum = 0;

printf(“CUDA Device Number: %d\n”, deviceNum);

err = cuDeviceGet(&device, deviceNum);

CHECK_CU_ERROR(err, “cuDeviceGet”);

err = cuDeviceGetName(deviceName, 32, device);

CHECK_CU_ERROR(err, “cuDeviceGetName”);

printf(“CUDA Device Name: %s\n”, deviceName);

err = cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);

CHECK_CU_ERROR(err, “cuDeviceGetAttribute”);

err = cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);

CHECK_CU_ERROR(err, “cuDeviceGetAttribute”);

printf(“Compute Capability of Device: %d.%d\n”, major,minor);

int deviceComputeCapability = 10 * major + minor;

if(deviceComputeCapability > 72) {

printf("Sample unsupported on Device with compute capability > 7.2\n");

return -2;

}

if (argc > 2) {

eventName = argv[2];

}

else {

eventName = EVENT_NAME;

}

err = cuCtxCreate(&context, 0, device);

CHECK_CU_ERROR(err, “cuCtxCreate”);

testComplete = 0;

printf(“Creating sampling thread\n”);

#ifdef _WIN32

hThread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) sampling_func,

                     NULL, 0, NULL );

if (!hThread) {

printf("CreateThread failed\n");

exit(-1);

}

#else

status = pthread_create(&pThread, NULL, sampling_func, NULL);

if (status != 0) {

perror("pthread_create");

exit(-1);

}

#endif

// run kernel while sampling

compute(ITERATIONS);

// “signal” the sampling thread to exit and wait for it

testComplete = 1;

#ifdef _WIN32

WaitForSingleObject(hThread, INFINITE);

#else

pthread_join(pThread, NULL);

#endif

cudaDeviceSynchronize();

return 0;

}

ERROR:
/tmp/tmpxft_000000d2_00000000-11_44f7df52-5495-4009-885a-d6d185b5b232.o: In function sampling_func(void*)': tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0xa2): undefined reference to cuptiSetEventCollectionMode’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0xbc): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x10c): undefined reference to cuptiEventGroupCreate’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x126): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x176): undefined reference to cuptiEventGetIdFromName’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x190): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x1d6): undefined reference to cuptiEventGroupAddEvent’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x1f0): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x242): undefined reference to cuptiEventGroupSetAttribute’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x25c): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x29d): undefined reference to cuptiEventGroupEnable’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x2b7): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x30d): undefined reference to cuptiEventGroupGetAttribute’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x327): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x3d9): undefined reference to cuptiEventGroupReadEvent’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x3f3): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x4cb): undefined reference to cuptiEventGroupDisable’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x4e5): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x523): undefined reference to cuptiEventGroupDestroy’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x53d): undefined reference to cuptiGetResultString' /tmp/tmpxft_000000d2_00000000-11_44f7df52-5495-4009-885a-d6d185b5b232.o: In function main’:
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x87e): undefined reference to cuInit' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x8ba): undefined reference to cuDeviceGetCount’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x953): undefined reference to cuDeviceGet' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x99a): undefined reference to cuDeviceGetName’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x9f9): undefined reference to cuDeviceGetAttribute' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0xa40): undefined reference to cuDeviceGetAttribute’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0xaf9): undefined reference to `cuCtxCreate_v2’
collect2: error: ld returned 1 exit status

Hi,

Error log shows that there are undefined references to CUPTI APIs. Do you link your program with the CUPTI library (libcupti.so)?