How can I link the CUDA driver API library and the CUPTI library on Google Colab?

mehran0mehrani · May 27, 2021, 8:55am

I use Google Colab for CUDA programming. How can I solve the following error?

%%cu

/*

Copyright 2011-2017 NVIDIA Corporation. All rights reserved
Sample app to demonstrate use of CUPTI library to obtain profiler
event values by sampling.

*/

// Define WIN32_LEAN_AND_MEAN (unless the build already did) before
// <windows.h> is included further down. Only relevant on Windows, so the
// whole guard is wrapped in _WIN32; every #endif below now has a matching
// opening directive.
#ifdef _WIN32
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#endif

#include <stdio.h>

#include <cuda_runtime_api.h>

#include <cupti_events.h>

#ifdef _WIN32

#include <windows.h>

#else

#include <unistd.h>

#include <pthread.h>

#endif

// Checks the result code of a CUDA Driver API call.
//   err    - result code to test against CUDA_SUCCESS
//   cufunc - name of the API function, used only in the message
// On failure it prints the numeric code and the function name, then executes
// `return 0;` in the *enclosing* function, so it can only be used inside
// functions whose return type accepts 0.
// Each line of the definition must end in a backslash and be followed
// directly by the next line; a blank line in between ends the macro early.
// The do/while(0) wrapper makes the expansion a single statement.
#define CHECK_CU_ERROR(err, cufunc)                                     \
  do {                                                                  \
    if ((err) != CUDA_SUCCESS) {                                        \
      printf("Error %d for CUDA Driver API function '%s'.\n",           \
             (err), (cufunc));                                          \
      return 0;                                                         \
    }                                                                   \
  } while (0)

// Checks the result code of a CUPTI API call.
//   err       - result code to test against CUPTI_SUCCESS
//   cuptifunc - name of the API function, used only in the message
// On failure it looks up a readable description with cuptiGetResultString,
// prints it with the file and line of the call site, then executes
// `return 0;` in the *enclosing* function, so it can only be used inside
// functions whose return type accepts 0 (including pointer types).
// The description starts out as a fallback text so the message stays valid
// if the lookup itself does not fill it in.
// Each line of the definition must end in a backslash and be followed
// directly by the next line; a blank line in between ends the macro early.
#define CHECK_CUPTI_ERROR(err, cuptifunc)                               \
  do {                                                                  \
    if ((err) != CUPTI_SUCCESS) {                                       \
      const char *errstr_ = "<unknown CUPTI error>";                    \
      cuptiGetResultString((err), &errstr_);                            \
      printf("%s:%d:Error %s for CUPTI API function '%s'.\n",           \
             __FILE__, __LINE__, errstr_, (cuptifunc));                 \
      return 0;                                                         \
    }                                                                   \
  } while (0)

// Event sampled when no event name is passed on the command line.
// Must be an ordinary string literal with ASCII double quotes.
#define EVENT_NAME "inst_executed"

// Number of int elements in each of the vectors used by compute().
#define N 100000

// Number of kernel launches requested by main() via compute(ITERATIONS).
#define ITERATIONS 2000

// Pause between two event reads in the sampling thread, in milliseconds.
#define SAMPLE_PERIOD_MS 50

// used to signal from the compute thread to the sampling thread

// Completion flag: main() resets it to 0 before starting the sampling thread
// and sets it to 1 after compute() returns; sampling_func() loops until it
// becomes non-zero.
static volatile int testComplete = 0;

// Driver API context created in main() with cuCtxCreate and passed to the
// CUPTI calls in sampling_func().
static CUcontext context;

// Device handle obtained in main() with cuDeviceGet; also used by
// sampling_func() to resolve the event name.
static CUdevice device;

// Name of the event to sample: argv[2] when given, EVENT_NAME otherwise.
static const char *eventName;

// Device code

// Kernel: element-wise vector addition, C[i] = A[i] + B[i].
//   A, B - input vectors
//   C    - output vector
//   size - number of elements; threads whose flat index is >= size do nothing
// Each thread handles the single element at index
// blockDim.x * blockIdx.x + threadIdx.x, so the launch must supply at least
// `size` threads in total.
// The same addition is repeated 100 times; presumably this only lengthens
// the kernel so the sampling thread has something to observe (the result is
// identical after the first pass).
__global__ void VecAdd(const int* A, const int* B, int* C, int size)
{
  int i = blockDim.x * blockIdx.x + threadIdx.x;

  for (int n = 0; n < 100; n++) {
    if (i < size)
      C[i] = A[i] + B[i];
  }
}

// Fills vec[0 .. n-1] with the sequence 0, 1, ..., n-1.
//   vec - destination array with room for at least n ints
//   n   - number of elements to write; nothing is written when n <= 0
static void
initVec(int *vec, int n)
{
  int value = 0;

  while (value < n) {
    vec[value] = value;
    ++value;
  }
}

// Thread entry point that periodically samples one CUPTI event.
//
// Uses the file-scope `context`, `device` and `eventName` set up by main():
// switches the context to continuous event collection, creates an event
// group holding the single event named by `eventName`, enables it for all
// domain instances, and then, until `testComplete` becomes non-zero, reads
// the per-instance values every SAMPLE_PERIOD_MS milliseconds, adds them to a
// running total and prints that total.
//
//   arg - unused
// Returns NULL. If a CUPTI call fails, CHECK_CUPTI_ERROR prints the error and
// returns 0 (i.e. NULL) from this function right away, skipping the cleanup
// at the bottom. Allocation failure or a short read terminates the process
// with exit(-1).
//
// All API-name arguments are plain string literals with ASCII double quotes.
void *
sampling_func(void *arg)
{
  CUptiResult cuptiErr;
  CUpti_EventGroup eventGroup;
  CUpti_EventID eventId;
  size_t bytesRead, valueSize;
  uint32_t numInstances = 0, j = 0;
  uint64_t *eventValues = NULL, eventVal = 0;
  uint32_t profile_all = 1;

  cuptiErr = cuptiSetEventCollectionMode(context,
                                         CUPTI_EVENT_COLLECTION_MODE_CONTINUOUS);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiSetEventCollectionMode");

  cuptiErr = cuptiEventGroupCreate(context, &eventGroup, 0);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupCreate");

  cuptiErr = cuptiEventGetIdFromName(device, eventName, &eventId);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGetIdFromName");

  cuptiErr = cuptiEventGroupAddEvent(eventGroup, eventId);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupAddEvent");

  cuptiErr = cuptiEventGroupSetAttribute(eventGroup,
                                         CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES,
                                         sizeof(profile_all), &profile_all);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupSetAttribute");

  cuptiErr = cuptiEventGroupEnable(eventGroup);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupEnable");

  // Ask how many instances will report a value so the read buffer can hold
  // one uint64_t per instance.
  valueSize = sizeof(numInstances);
  cuptiErr = cuptiEventGroupGetAttribute(eventGroup,
                                         CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT,
                                         &valueSize, &numInstances);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupGetAttribute");

  bytesRead = sizeof(uint64_t) * numInstances;
  eventValues = (uint64_t *) malloc(bytesRead);
  if (eventValues == NULL) {
    printf("%s:%d: Failed to allocate memory.\n", __FILE__, __LINE__);
    exit(-1);
  }

  while (!testComplete) {
    // bytesRead goes in as the buffer size and comes back as the number of
    // bytes written; the check below guarantees it still equals the full
    // buffer size on the next iteration.
    cuptiErr = cuptiEventGroupReadEvent(eventGroup,
                                        CUPTI_EVENT_READ_FLAG_NONE,
                                        eventId, &bytesRead, eventValues);
    CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupReadEvent");

    if (bytesRead != (sizeof(uint64_t) * numInstances)) {
      printf("Failed to read value for \"%s\"\n", eventName);
      exit(-1);
    }

    // eventVal is never reset, so the printed number is a running total over
    // all samples taken so far.
    for (j = 0; j < numInstances; j++) {
      eventVal += eventValues[j];
    }
    printf("%s: %llu\n", eventName, (unsigned long long)eventVal);

#ifdef _WIN32
    Sleep(SAMPLE_PERIOD_MS);
#else
    usleep(SAMPLE_PERIOD_MS * 1000);
#endif
  }

  cuptiErr = cuptiEventGroupDisable(eventGroup);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupDisable");

  cuptiErr = cuptiEventGroupDestroy(eventGroup);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupDestroy");

  free(eventValues);
  return NULL;
}

// Terminates the process with a diagnostic when a CUDA runtime call failed.
//   status - return value of the runtime call
//   what   - short description of the call, used only in the message
static void
checkCudaRuntime(cudaError_t status, const char *what)
{
  if (status != cudaSuccess) {
    printf("CUDA runtime error in %s: %s\n", what, cudaGetErrorString(status));
    exit(-1);
  }
}

// Runs the vector-add workload that the sampling thread observes.
//
// Allocates three host and three device vectors of N ints, fills the two
// inputs with 0..N-1, copies them to the device, launches VecAdd `iters`
// times with 256 threads per block, copies the result back and verifies it
// against a host-side sum.
//
//   iters - number of kernel launches
// Terminates the process with exit(-1) when a host allocation fails, when a
// CUDA runtime call or kernel launch reports an error, or when the result
// does not match.
static void
compute(int iters)
{
  size_t size = N * sizeof(int);
  int threadsPerBlock = 0;
  int blocksPerGrid = 0;
  int sum, i;
  int *h_A, *h_B, *h_C;
  int *d_A, *d_B, *d_C;

  // Allocate input vectors h_A and h_B (and output h_C) in host memory
  h_A = (int*)malloc(size);
  h_B = (int*)malloc(size);
  h_C = (int*)malloc(size);
  if (h_A == NULL || h_B == NULL || h_C == NULL) {
    printf("%s:%d: Failed to allocate memory.\n", __FILE__, __LINE__);
    exit(-1);
  }

  // Initialize input vectors
  initVec(h_A, N);
  initVec(h_B, N);
  memset(h_C, 0, size);

  // Allocate vectors in device memory
  checkCudaRuntime(cudaMalloc((void**)&d_A, size), "cudaMalloc(d_A)");
  checkCudaRuntime(cudaMalloc((void**)&d_B, size), "cudaMalloc(d_B)");
  checkCudaRuntime(cudaMalloc((void**)&d_C, size), "cudaMalloc(d_C)");

  // Copy vectors from host memory to device memory
  checkCudaRuntime(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice),
                   "cudaMemcpy(d_A)");
  checkCudaRuntime(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice),
                   "cudaMemcpy(d_B)");

  // Invoke kernel (multiple times to make sure we have time for
  // sampling)
  threadsPerBlock = 256;
  blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;  // ceil-div

  for (i = 0; i < iters; i++) {
    VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    // A launch does not return a status itself; launch errors are reported
    // through cudaGetLastError().
    checkCudaRuntime(cudaGetLastError(), "VecAdd launch");
  }

  // Copy result from device memory to host memory
  // h_C contains the result in host memory
  // An error raised while the kernels were executing is reported here.
  checkCudaRuntime(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost),
                   "cudaMemcpy(h_C)");

  // Verify result
  for (i = 0; i < N; ++i) {
    sum = h_A[i] + h_B[i];
    if (h_C[i] != sum) {
      printf("kernel execution FAILED\n");
      exit(-1);
    }
  }

  checkCudaRuntime(cudaFree(d_A), "cudaFree(d_A)");
  checkCudaRuntime(cudaFree(d_B), "cudaFree(d_B)");
  checkCudaRuntime(cudaFree(d_C), "cudaFree(d_C)");

  free(h_A);
  free(h_B);
  free(h_C);
}

// Program entry point.
//
// Usage: <program> [device_num] [event_name]
//   device_num - index of the CUDA device to use (default 0)
//   event_name - CUPTI event to sample (default EVENT_NAME)
//
// Initializes the driver API, selects the device, prints its name and
// compute capability, creates a context, starts sampling_func() on a second
// thread, runs compute(ITERATIONS) on this thread, then sets `testComplete`
// and waits for the sampling thread to finish.
//
// Returns 0 on normal completion, -2 when the device's compute capability is
// above 7.2. Note that CHECK_CU_ERROR also makes main return 0 when a driver
// API call fails. Thread-creation failure or a missing device terminates the
// process with exit(-1).
int
main(int argc, char *argv[])
{
#ifdef _WIN32
  HANDLE hThread;
#else
  int status;
  pthread_t pThread;
#endif
  CUresult err;
  int deviceNum;
  int deviceCount;
  char deviceName[32];
  int major;
  int minor;

  printf("Usage: %s [device_num] [event_name]\n", argv[0]);

  err = cuInit(0);
  CHECK_CU_ERROR(err, "cuInit");

  err = cuDeviceGetCount(&deviceCount);
  CHECK_CU_ERROR(err, "cuDeviceGetCount");

  if (deviceCount == 0) {
    printf("There is no device supporting CUDA.\n");
    exit(-1);
  }

  if (argc > 1)
    deviceNum = atoi(argv[1]);
  else
    deviceNum = 0;
  printf("CUDA Device Number: %d\n", deviceNum);

  err = cuDeviceGet(&device, deviceNum);
  CHECK_CU_ERROR(err, "cuDeviceGet");

  err = cuDeviceGetName(deviceName, 32, device);
  CHECK_CU_ERROR(err, "cuDeviceGetName");

  printf("CUDA Device Name: %s\n", deviceName);

  err = cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
  CHECK_CU_ERROR(err, "cuDeviceGetAttribute");

  err = cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
  CHECK_CU_ERROR(err, "cuDeviceGetAttribute");

  printf("Compute Capability of Device: %d.%d\n", major, minor);

  // Encode major.minor as a two-digit number (7.2 -> 72) for the range check.
  int deviceComputeCapability = 10 * major + minor;
  if (deviceComputeCapability > 72) {
    printf("Sample unsupported on Device with compute capability > 7.2\n");
    return -2;
  }

  if (argc > 2) {
    eventName = argv[2];
  }
  else {
    eventName = EVENT_NAME;
  }

  err = cuCtxCreate(&context, 0, device);
  CHECK_CU_ERROR(err, "cuCtxCreate");

  testComplete = 0;

  printf("Creating sampling thread\n");
#ifdef _WIN32
  hThread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) sampling_func,
                         NULL, 0, NULL );
  if (!hThread) {
    printf("CreateThread failed\n");
    exit(-1);
  }
#else
  status = pthread_create(&pThread, NULL, sampling_func, NULL);
  if (status != 0) {
    perror("pthread_create");
    exit(-1);
  }
#endif

  // run kernel while sampling
  compute(ITERATIONS);

  // "signal" the sampling thread to exit and wait for it
  testComplete = 1;
#ifdef _WIN32
  WaitForSingleObject(hThread, INFINITE);
#else
  pthread_join(pThread, NULL);
#endif

  cudaDeviceSynchronize();
  return 0;
}

ERROR:
/tmp/tmpxft_000000d2_00000000-11_44f7df52-5495-4009-885a-d6d185b5b232.o: In function sampling_func(void*)': tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0xa2): undefined reference to cuptiSetEventCollectionMode’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0xbc): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x10c): undefined reference to cuptiEventGroupCreate’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x126): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x176): undefined reference to cuptiEventGetIdFromName’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x190): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x1d6): undefined reference to cuptiEventGroupAddEvent’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x1f0): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x242): undefined reference to cuptiEventGroupSetAttribute’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x25c): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x29d): undefined reference to cuptiEventGroupEnable’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x2b7): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x30d): undefined reference to cuptiEventGroupGetAttribute’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x327): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x3d9): undefined reference to cuptiEventGroupReadEvent’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x3f3): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x4cb): undefined reference to cuptiEventGroupDisable’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x4e5): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x523): undefined reference to cuptiEventGroupDestroy’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x53d): undefined reference to cuptiGetResultString' /tmp/tmpxft_000000d2_00000000-11_44f7df52-5495-4009-885a-d6d185b5b232.o: In function main’:
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x87e): undefined reference to cuInit' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x8ba): undefined reference to cuDeviceGetCount’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x953): undefined reference to cuDeviceGet' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x99a): undefined reference to cuDeviceGetName’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x9f9): undefined reference to cuDeviceGetAttribute' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0xa40): undefined reference to cuDeviceGetAttribute’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0xaf9): undefined reference to `cuCtxCreate_v2’
collect2: error: ld returned 1 exit status

mjain · June 1, 2021, 7:08am

Hi,

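The error log shows undefined references to CUPTI APIs (the `cupti*` symbols) as well as to CUDA driver APIs (`cuInit`, `cuDeviceGet`, `cuCtxCreate_v2`, ...). Do you link your program with the CUPTI library (libcupti.so) and the CUDA driver library (libcuda.so), e.g. by passing `-lcupti -lcuda` to nvcc?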

Topic		Replies	Views
CUPTI Sample Code problem CUPTI – CUDA Profiler Tools Interface cuda , kernel	12	303	July 28, 2025
CUPTI Samples FAIL CUPTI – CUDA Profiler Tools Interface cuda	4	713	August 31, 2020
NVIDIA® CUDA Profiler Tools Interface (CUPTI) 2019.1 is now available CUPTI – CUDA Profiler Tools Interface	0	1940	March 1, 2019
CUDA Profiler Tools Interface (CUPTI) for CUDA Toolkit 12.5 is now available CUPTI – CUDA Profiler Tools Interface	6	493	October 23, 2024
NVIDIA® CUDA Profiler Tools Interface (CUPTI) for CUDA Toolkit 10.1 Update 2 is now available CUPTI – CUDA Profiler Tools Interface	1	1201	May 27, 2020
CUDA Profiler Tools Interface (CUPTI) for CUDA Toolkit 12.4 is now available CUPTI – CUDA Profiler Tools Interface	7	431	October 23, 2024
Profiling cuda graph with CUPTI Profiling API CUPTI – CUDA Profiler Tools Interface tensorrt	4	1408	September 29, 2025
NVIDIA® CUDA Profiler Tools Interface (CUPTI) for CUDA Toolkit 11.3 is now available CUPTI – CUDA Profiler Tools Interface	4	992	June 29, 2021
CUPTI Sample tutorial wrong CUPTI – CUDA Profiler Tools Interface cuda , kernel	2	170	July 28, 2025
CUDA Profiler Tools Interface (CUPTI) for CUDA Toolkit 12.1 is now available CUPTI – CUDA Profiler Tools Interface	5	1546	October 23, 2024

How can I link the CUDA driver API library and the CUPTI library on Google Colab?

Related topics