I am using Google Colab for CUDA programming. How can I solve the following error?
%%cu
/*
 * Copyright 2011-2017 NVIDIA Corporation. All rights reserved
 *
 * Sample app to demonstrate use of CUPTI library to obtain profiler
 * event values by sampling.
 */
#ifdef _WIN32
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#endif
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cupti_events.h>
#ifdef _WIN32
#include <windows.h>
#else
#include <unistd.h>
#include <pthread.h>
#endif
/*
 * CHECK_CU_ERROR(err, cufunc)
 *
 * If `err` is not CUDA_SUCCESS, print the numeric error code together with
 * the name of the driver API function (`cufunc`, a C string) and make the
 * *enclosing function* `return 0`.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement (safe inside an unbraced if/else), and `err` is evaluated once.
 */
#define CHECK_CU_ERROR(err, cufunc)                                     \
  do {                                                                  \
    const CUresult cuStatus_ = (err);                                   \
    if (cuStatus_ != CUDA_SUCCESS) {                                    \
      printf("Error %d for CUDA Driver API function '%s'.\n",           \
             cuStatus_, (cufunc));                                      \
      return 0;                                                         \
    }                                                                   \
  } while (0)
/*
 * CHECK_CUPTI_ERROR(err, cuptifunc)
 *
 * If `err` is not CUPTI_SUCCESS, print file, line, the CUPTI error string and
 * the name of the CUPTI function (`cuptifunc`, a C string), then make the
 * *enclosing function* `return 0`.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement, `err` is evaluated once, and a placeholder message is used when
 * cuptiGetResultString() itself fails or yields no string (previously the
 * pointer would have been printed uninitialized in that case).
 */
#define CHECK_CUPTI_ERROR(err, cuptifunc)                               \
  do {                                                                  \
    const CUptiResult cuptiStatus_ = (err);                             \
    if (cuptiStatus_ != CUPTI_SUCCESS) {                                \
      const char *errstr = NULL;                                        \
      if (cuptiGetResultString(cuptiStatus_, &errstr) != CUPTI_SUCCESS  \
          || errstr == NULL) {                                          \
        errstr = "<unknown CUPTI error>";                               \
      }                                                                 \
      printf("%s:%d:Error %s for CUPTI API function '%s'.\n",           \
             __FILE__, __LINE__, errstr, (cuptifunc));                  \
      return 0;                                                         \
    }                                                                   \
  } while (0)
// Event name used when no event is supplied by the caller.
// (Plain ASCII double quotes are required; typographic quotes do not form a
// string literal.)
#define EVENT_NAME "inst_executed"
// Number of int elements in each vector.
#define N 100000
// Number of kernel launches performed while sampling.
#define ITERATIONS 2000
// Delay between two consecutive event reads, in milliseconds.
#define SAMPLE_PERIOD_MS 50
// used to signal from the compute thread to the sampling thread
static volatile int testComplete = 0;
// Driver-API context and device shared by main() and the sampling thread.
static CUcontext context;
static CUdevice device;
// Name of the event being sampled.
static const char *eventName;
// Device code
//
// Element-wise vector add: C[i] = A[i] + B[i] for every i < size.
// Expects a 1-D grid of 1-D blocks; each thread handles the single element at
// blockDim.x * blockIdx.x + threadIdx.x, and threads past `size` do nothing.
// The same add is repeated 100 times per thread (presumably to lengthen the
// kernel so the sampler has something to observe -- confirm).
__global__ void VecAdd(const int* A, const int* B, int* C, int size)
{
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  for (int n = 0; n < 100; n++) {
    if (i < size)
      C[i] = A[i] + B[i];
  }
}
// Fill the first n entries of vec with their own index: 0, 1, ..., n-1.
// Nothing is written when n <= 0.
static void
initVec(int *vec, int n)
{
  int idx = 0;
  while (idx < n) {
    vec[idx] = idx;
    ++idx;
  }
}
/*
 * Thread entry point that periodically samples one CUPTI event.
 *
 * Uses the file-level `context`, `device` and `eventName`: switches the
 * context to continuous event collection, creates an event group holding the
 * named event, enables profiling of all domain instances, and then, until
 * `testComplete` becomes non-zero, reads the per-instance values every
 * SAMPLE_PERIOD_MS milliseconds, adds them to a running total and prints that
 * total.
 *
 * arg: unused.
 * Returns NULL. On a CUPTI failure CHECK_CUPTI_ERROR prints a message and
 * returns 0 (a null pointer) immediately.
 * NOTE(review): that early return skips the group disable/destroy and the
 * free(eventValues) at the end of this function.
 * Calls exit(-1) if the value buffer cannot be allocated or a read returns an
 * unexpected number of bytes.
 */
void *
sampling_func(void *arg)
{
  CUptiResult cuptiErr;
  CUpti_EventGroup eventGroup;
  CUpti_EventID eventId;
  size_t bytesRead, valueSize;
  uint32_t numInstances = 0, j = 0;
  uint64_t *eventValues = NULL, eventVal = 0;
  uint32_t profile_all = 1;

  (void)arg;  // the thread takes no parameters

  cuptiErr = cuptiSetEventCollectionMode(context,
                                         CUPTI_EVENT_COLLECTION_MODE_CONTINUOUS);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiSetEventCollectionMode");

  cuptiErr = cuptiEventGroupCreate(context, &eventGroup, 0);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupCreate");

  cuptiErr = cuptiEventGetIdFromName(device, eventName, &eventId);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGetIdFromName");

  cuptiErr = cuptiEventGroupAddEvent(eventGroup, eventId);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupAddEvent");

  cuptiErr = cuptiEventGroupSetAttribute(eventGroup,
                                         CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES,
                                         sizeof(profile_all), &profile_all);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupSetAttribute");

  cuptiErr = cuptiEventGroupEnable(eventGroup);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupEnable");

  // One uint64_t value is read per domain instance, so query the instance
  // count to size the read buffer.
  valueSize = sizeof(numInstances);
  cuptiErr = cuptiEventGroupGetAttribute(eventGroup,
                                         CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT,
                                         &valueSize, &numInstances);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupGetAttribute");

  bytesRead = sizeof(uint64_t) * numInstances;
  eventValues = (uint64_t *) malloc(bytesRead);
  if (eventValues == NULL) {
    printf("%s:%d: Failed to allocate memory.\n", __FILE__, __LINE__);
    exit(-1);
  }

  while (!testComplete) {
    cuptiErr = cuptiEventGroupReadEvent(eventGroup,
                                        CUPTI_EVENT_READ_FLAG_NONE,
                                        eventId, &bytesRead, eventValues);
    CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupReadEvent");
    if (bytesRead != (sizeof(uint64_t) * numInstances)) {
      printf("Failed to read value for \"%s\"\n", eventName);
      exit(-1);
    }

    // eventVal is never reset, so the printed number is a running total over
    // all samples taken so far.
    for (j = 0; j < numInstances; j++) {
      eventVal += eventValues[j];
    }
    printf("%s: %llu\n", eventName, (unsigned long long)eventVal);

#ifdef _WIN32
    Sleep(SAMPLE_PERIOD_MS);
#else
    usleep(SAMPLE_PERIOD_MS * 1000);
#endif
  }

  cuptiErr = cuptiEventGroupDisable(eventGroup);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupDisable");

  cuptiErr = cuptiEventGroupDestroy(eventGroup);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupDestroy");

  free(eventValues);
  return NULL;
}
// Print a diagnostic and terminate the process if a CUDA runtime call failed.
// `what` names the call for the message.
static void
computeCheckCuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess) {
    printf("CUDA runtime error in %s: %s\n", what, cudaGetErrorString(status));
    exit(-1);
  }
}

/*
 * Run the VecAdd kernel `iters` times on two N-element vectors and verify the
 * result on the host.
 *
 * Host vectors A and B are filled with 0..N-1, copied to the device, the
 * kernel is launched `iters` times with 256 threads per block and enough
 * blocks to cover N elements, and C is copied back (after all launches have
 * been issued) and compared element by element with A[i] + B[i].
 *
 * Calls exit(-1) when a host allocation, a CUDA runtime call or a kernel
 * launch fails, or when the verification finds a mismatch. Previously the
 * allocation and CUDA results were ignored.
 */
static void
compute(int iters)
{
  const size_t size = N * sizeof(int);
  const int threadsPerBlock = 256;
  // Ceiling division so the last, partially filled block is included.
  const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
  int sum, i;
  int *h_A, *h_B, *h_C;
  int *d_A = NULL, *d_B = NULL, *d_C = NULL;

  // Allocate input vectors h_A and h_B (and output h_C) in host memory
  h_A = (int*)malloc(size);
  h_B = (int*)malloc(size);
  h_C = (int*)malloc(size);
  if (h_A == NULL || h_B == NULL || h_C == NULL) {
    printf("%s:%d: Failed to allocate memory.\n", __FILE__, __LINE__);
    exit(-1);
  }

  // Initialize input vectors
  initVec(h_A, N);
  initVec(h_B, N);
  memset(h_C, 0, size);

  // Allocate vectors in device memory
  computeCheckCuda(cudaMalloc((void**)&d_A, size), "cudaMalloc(d_A)");
  computeCheckCuda(cudaMalloc((void**)&d_B, size), "cudaMalloc(d_B)");
  computeCheckCuda(cudaMalloc((void**)&d_C, size), "cudaMalloc(d_C)");

  // Copy vectors from host memory to device memory
  computeCheckCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice),
                   "cudaMemcpy(d_A)");
  computeCheckCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice),
                   "cudaMemcpy(d_B)");

  // Invoke kernel (multiple times to make sure we have time for
  // sampling)
  for (i = 0; i < iters; i++) {
    VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    // A launch does not return a status itself; fetch it explicitly.
    computeCheckCuda(cudaGetLastError(), "VecAdd launch");
  }

  // Copy result from device memory to host memory
  // h_C contains the result in host memory
  computeCheckCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost),
                   "cudaMemcpy(h_C)");

  // Verify result
  for (i = 0; i < N; ++i) {
    sum = h_A[i] + h_B[i];
    if (h_C[i] != sum) {
      printf("kernel execution FAILED\n");
      exit(-1);
    }
  }

  computeCheckCuda(cudaFree(d_A), "cudaFree(d_A)");
  computeCheckCuda(cudaFree(d_B), "cudaFree(d_B)");
  computeCheckCuda(cudaFree(d_C), "cudaFree(d_C)");
  free(h_A);
  free(h_B);
  free(h_C);
}
/*
 * Program entry point.
 *
 * Usage: <program> [device_num] [event_name]
 *   device_num  CUDA device ordinal (default 0)
 *   event_name  CUPTI event to sample (default EVENT_NAME)
 *
 * Initializes the CUDA driver API, selects the device, creates a context,
 * starts sampling_func on a second thread, runs compute(ITERATIONS) on this
 * thread, then sets `testComplete` and waits for the sampling thread.
 *
 * Returns 0 on completion, and also when a driver call fails
 * (CHECK_CU_ERROR returns 0 after printing); returns -2 when the device's
 * compute capability is above 7.2; calls exit(-1) when no CUDA device exists
 * or the sampling thread cannot be created.
 *
 * Build note: this file calls the CUDA driver API (cu*) and CUPTI (cupti*)
 * directly, so the link step must name both libraries, e.g.
 *   nvcc file.cu -I<cuda>/extras/CUPTI/include -L<cuda>/extras/CUPTI/lib64 \
 *        -lcuda -lcupti
 * Without them every cu*/cupti* symbol is reported as an undefined reference
 * by the linker.
 */
int
main(int argc, char *argv[])
{
#ifdef _WIN32
  HANDLE hThread;
#else
  int status;
  pthread_t pThread;
#endif
  CUresult err;
  int deviceNum;
  int deviceCount;
  char deviceName[32];
  int major;
  int minor;

  printf("Usage: %s [device_num] [event_name]\n", argv[0]);

  err = cuInit(0);
  CHECK_CU_ERROR(err, "cuInit");

  err = cuDeviceGetCount(&deviceCount);
  CHECK_CU_ERROR(err, "cuDeviceGetCount");

  if (deviceCount == 0) {
    printf("There is no device supporting CUDA.\n");
    exit(-1);
  }

  if (argc > 1)
    deviceNum = atoi(argv[1]);
  else
    deviceNum = 0;
  printf("CUDA Device Number: %d\n", deviceNum);

  err = cuDeviceGet(&device, deviceNum);
  CHECK_CU_ERROR(err, "cuDeviceGet");

  err = cuDeviceGetName(deviceName, 32, device);
  CHECK_CU_ERROR(err, "cuDeviceGetName");

  printf("CUDA Device Name: %s\n", deviceName);

  err = cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
  CHECK_CU_ERROR(err, "cuDeviceGetAttribute");

  err = cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
  CHECK_CU_ERROR(err, "cuDeviceGetAttribute");

  printf("Compute Capability of Device: %d.%d\n", major, minor);
  // Encode major.minor as a two-digit number (e.g. 7.5 -> 75) for comparison.
  int deviceComputeCapability = 10 * major + minor;
  if (deviceComputeCapability > 72) {
    printf("Sample unsupported on Device with compute capability > 7.2\n");
    return -2;
  }

  if (argc > 2) {
    eventName = argv[2];
  }
  else {
    eventName = EVENT_NAME;
  }

  err = cuCtxCreate(&context, 0, device);
  CHECK_CU_ERROR(err, "cuCtxCreate");

  testComplete = 0;

  printf("Creating sampling thread\n");
#ifdef _WIN32
  hThread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) sampling_func,
                         NULL, 0, NULL );
  if (!hThread) {
    printf("CreateThread failed\n");
    exit(-1);
  }
#else
  status = pthread_create(&pThread, NULL, sampling_func, NULL);
  if (status != 0) {
    perror("pthread_create");
    exit(-1);
  }
#endif

  // run kernel while sampling
  compute(ITERATIONS);

  // "signal" the sampling thread to exit and wait for it
  testComplete = 1;
#ifdef _WIN32
  WaitForSingleObject(hThread, INFINITE);
#else
  pthread_join(pThread, NULL);
#endif

  cudaDeviceSynchronize();
  return 0;
}
ERROR:
/tmp/tmpxft_000000d2_00000000-11_44f7df52-5495-4009-885a-d6d185b5b232.o: In function sampling_func(void*)': tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0xa2): undefined reference to
cuptiSetEventCollectionMode’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0xbc): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x10c): undefined reference to
cuptiEventGroupCreate’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x126): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x176): undefined reference to
cuptiEventGetIdFromName’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x190): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x1d6): undefined reference to
cuptiEventGroupAddEvent’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x1f0): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x242): undefined reference to
cuptiEventGroupSetAttribute’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x25c): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x29d): undefined reference to
cuptiEventGroupEnable’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x2b7): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x30d): undefined reference to
cuptiEventGroupGetAttribute’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x327): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x3d9): undefined reference to
cuptiEventGroupReadEvent’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x3f3): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x4cb): undefined reference to
cuptiEventGroupDisable’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x4e5): undefined reference to cuptiGetResultString' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x523): undefined reference to
cuptiEventGroupDestroy’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x53d): undefined reference to cuptiGetResultString' /tmp/tmpxft_000000d2_00000000-11_44f7df52-5495-4009-885a-d6d185b5b232.o: In function
main’:
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x87e): undefined reference to cuInit' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x8ba): undefined reference to
cuDeviceGetCount’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x953): undefined reference to cuDeviceGet' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x99a): undefined reference to
cuDeviceGetName’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0x9f9): undefined reference to cuDeviceGetAttribute' tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0xa40): undefined reference to
cuDeviceGetAttribute’
tmpxft_000000d2_00000000-6_44f7df52-5495-4009-885a-d6d185b5b232.cudafe1.cpp:(.text+0xaf9): undefined reference to `cuCtxCreate_v2’
collect2: error: ld returned 1 exit status